Distributed MPI runs unable to save variables using jldsave #647

@taimoorsohail

Description

Hi,

I am trying to save some variables of interest from my simulation using jldsave. Note that I can't use JLD2Writer (or any other output writer), because I need to save variables from the ocean, atmosphere, and sea ice in the same file for checkpointing, and JLD2Writer is not yet supported for all components of a coupled OceanSeaIceModel (?).
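To make the goal concrete, the kind of checkpoint I am ultimately after looks roughly like the sketch below. The component and field accessors (coupled_model.ocean.model, coupled_model.sea_ice.model.ice_thickness, coupled_model.atmosphere) and the function name are purely illustrative placeholders, not the actual OceanSeaIceModel API:

using Oceananigans
using Oceananigans.Architectures: on_architecture
using JLD2

# Illustrative sketch only: gather one field from each component onto the CPU
# and write them all into a single rank-local JLD2 file.
function save_coupled_checkpoint(coupled_model, rank, iteration, output_path)
    filename = joinpath(output_path, "checkpoint_iter$(iteration)_rank$(rank).jld2")
    jldsave(filename;
            ocean_u    = on_architecture(CPU(), coupled_model.ocean.model.velocities.u),
            sea_ice_h  = on_architecture(CPU(), coupled_model.sea_ice.model.ice_thickness),
            atmosphere = coupled_model.atmosphere)
end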

I have the below MWE:

using MPI
using CUDA

MPI.Init()
atexit(MPI.Finalize)  

using Oceananigans
using Oceananigans.Units
using Oceananigans.DistributedComputations
using Printf
using Dates
using Oceananigans.Architectures: on_architecture
using JLD2 # provides jldsave

output_path = expanduser("/g/data/v46/txs156/ocean-ensembles/outputs/")

arch = Distributed(GPU(); partition = Partition(y = DistributedComputations.Equal()), synchronized_communication=false)

function immersed_latlon_grid(underlying_grid::LatitudeLongitudeGrid;
                                           radius = 5,  # controls width of Gaussian (in degrees)
                                           height = 2000, # hill height in meters
                                           λc = 0, φc = 45, # hill center (lon, lat)
                                           active_cells_map = false)

    Lz = underlying_grid.Lz

    # Convert degrees to radians if the grid expects radians, but leaving in degrees here
    bottom_height(λ, φ) = begin
        # Gaussian hill centered at (λc, φc)
        r² = (λ - λc)^2 + (φ - φc)^2
        z = -Lz + height * exp(-r² / (2 * radius^2))
        return z
    end

    grid = ImmersedBoundaryGrid(underlying_grid,
                                GridFittedBottom(bottom_height);
                                active_cells_map)

    return grid
end

Nx, Ny, Nz = 100, 100, 50
Lx, Ly = 100, 100

@info "Defining vertical z faces"

depth = -6000.0 # Depth of the ocean in meters
z_faces = ExponentialDiscretization(Nz, depth, 0) 

@info "Creating grid"

underlying_grid = LatitudeLongitudeGrid(arch,
                                        size = (Nx, Ny, Nz),
                                        z = z_faces,
                                        halo = (6, 6, 3),
                                        longitude = (0, 360),
                                        latitude = (-70, 70))




@info "Defining immersed grid"

grid = immersed_latlon_grid(underlying_grid; active_cells_map=true)

@info "Creating free surface"
free_surface = SplitExplicitFreeSurface(grid; substeps = 70)

@info "Creating model"

ocean_model = HydrostaticFreeSurfaceModel(; grid, free_surface, timestepper = :SplitRungeKutta3)

@info "Creating simulation"

simulation = Simulation(ocean_model; Δt=10, verbose=false, stop_time=2hours)

function save_restart(sim)
    @info "Saving checkpoint file"
    localrank = Integer(sim.model.architecture.local_rank)
    @info "Local rank: " * string(localrank)
    @info "Saving filename: " * output_path * "mwe_jldsave_distributed" * string(sim.model.clock.iteration) * "_rank$(localrank).jld2"

    jldsave(output_path * "mwe_jldsave_distributed" * string(sim.model.clock.iteration) * "_rank$(localrank).jld2";
    u = on_architecture(CPU(), (sim.model.velocities.u)))
end

# Nice progress messaging is helpful:

## Print a progress message
progress_message(sim) = @printf("Iteration: %04d, time: %s, Δt: %s, max(|w|) = %.1e ms⁻¹, wall time: %s\n",
                                iteration(sim), prettytime(sim), prettytime(sim.Δt),
                                maximum(abs, sim.model.velocities.w), prettytime(sim.run_wall_time))

add_callback!(simulation, progress_message, IterationInterval(40))
add_callback!(simulation, save_restart, IterationInterval(40))

@info "Running simulation"
run!(simulation)

But running the simulation leads to the following error at the jldsave(output_path * "mwe_jldsave_distributed" * string(sim.model.clock.iteration) * "_rank$(localrank).jld2"; ...) call:

ERROR: LoadError: type Nothing has no field rank
Stacktrace:
  [1] getproperty(x::Nothing, f::Symbol)
    @ Base ./Base.jl:37
  [2] (::Oceananigans.BoundaryConditions.MultiRegionFillHalo{Oceananigans.BoundaryConditions.SouthAndNorth})(c::OffsetArrays.OffsetArray{Float64, 3, Array{Float64, 3}}, southbc::BoundaryCondition{Oceananigans.BoundaryConditions.MultiRegionCommunication, Nothing}, northbc::BoundaryCondition{Oceananigans.BoundaryConditions.MultiRegionCommunication, Nothing}, loc::Tuple{Center, Center, Nothing}, grid::LatitudeLongitudeGrid{Float64, Periodic, Oceananigans.Grids.FullyConnected, Bounded, Oceananigans.Grids.StaticVerticalDiscretization{OffsetArrays.OffsetVector{Float64, Vector{Float64}}, OffsetArrays.OffsetVector{Float64, Vector{Float64}}, OffsetArrays.OffsetVector{Float64, Vector{Float64}}, OffsetArrays.OffsetVector{Float64, Vector{Float64}}}, Float64, Float64, OffsetArrays.OffsetVector{Float64, StepRangeLen{Float64, Base.TwicePrecision{Float64}, Base.TwicePrecision{Float64}, Int64}}, OffsetArrays.OffsetVector{Float64, StepRangeLen{Float64, Base.TwicePrecision{Float64}, Base.TwicePrecision{Float64}, Int64}}, Float64, Float64, OffsetArrays.OffsetVector{Float64, StepRangeLen{Float64, Base.TwicePrecision{Float64}, Base.TwicePrecision{Float64}, Int64}}, OffsetArrays.OffsetVector{Float64, StepRangeLen{Float64, Base.TwicePrecision{Float64}, Base.TwicePrecision{Float64}, Int64}}, OffsetArrays.OffsetVector{Float64, Vector{Float64}}, OffsetArrays.OffsetVector{Float64, Vector{Float64}}, OffsetArrays.OffsetVector{Float64, Vector{Float64}}, OffsetArrays.OffsetVector{Float64, Vector{Float64}}, Float64, Float64, CPU, Int64}, buffers::Tuple{})
    @ Oceananigans.MultiRegion /g/data/v46/txs156/Oceananigans.jl/src/MultiRegion/multi_region_boundary_conditions.jl:113
  [3] fill_halo_event!
    @ /g/data/v46/txs156/Oceananigans.jl/src/BoundaryConditions/fill_halo_regions.jl:40 [inlined]
  [4] #fill_halo_regions!#26
    @ /g/data/v46/txs156/Oceananigans.jl/src/BoundaryConditions/fill_halo_regions.jl:32 [inlined]
  [5] fill_halo_regions!
    @ /g/data/v46/txs156/Oceananigans.jl/src/BoundaryConditions/fill_halo_regions.jl:25 [inlined]
  [6] #fill_halo_regions!#63
    @ /g/data/v46/txs156/Oceananigans.jl/src/Fields/field.jl:857 [inlined]
    @ Oceananigans.Fields /g/data/v46/txs156/Oceananigans.jl/src/Fields/field.jl:843
    @ Oceananigans.ImmersedBoundaries /g/data/v46/txs156/Oceananigans.jl/src/ImmersedBoundaries/grid_fitted_bottom.jl:85
    @ Oceananigans.ImmersedBoundaries /g/data/v46/txs156/Oceananigans.jl/src/ImmersedBoundaries/immersed_boundary_grid.jl:129
    @ Oceananigans.Fields /g/data/v46/txs156/Oceananigans.jl/src/Fields/field.jl:468
    @ Main /g/data/v46/txs156/ocean-ensembles/mwes/JLDSave_distributed.jl:82
    @ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/callback.jl:15
    @ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:238
    @ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:136
    @ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:105
    @ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:92
 [17] top-level scope
    @ /g/data/v46/txs156/ocean-ensembles/mwes/JLDSave_distributed.jl:97
in expression starting at /g/data/v46/txs156/ocean-ensembles/mwes/JLDSave_distributed.jl:97

The other rank(s) report a second, interleaved error:

UndefVarError: `jldsave` not defined
Stacktrace:
    @ Main /g/data/v46/txs156/ocean-ensembles/mwes/JLDSave_distributed.jl:82
    @ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/callback.jl:15
    @ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:238
    @ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:136
    @ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:105
    @ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:92
 [7] top-level scope
    @ /g/data/v46/txs156/ocean-ensembles/mwes/JLDSave_distributed.jl:97
in expression starting at /g/data/v46/txs156/ocean-ensembles/mwes/JLDSave_distributed.jl:97
--------------------------------------------------------------------------
Primary job  terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:

  Process name: [[17346,1],0]
  Exit code:    1
--------------------------------------------------------------------------

Note that I had to delete most of the stack trace because of character limits on GitHub...

But if you run the MWE with MPI (on more than one GPU), you will see the issue I'm describing!
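(For reference, the script has to be launched under MPI, e.g. something like mpirun -np 2 julia --project JLDSave_distributed.jl; the exact launcher and flags will depend on your system.)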

cc @navidcy @simone-silvestri @xkykai
