-
Notifications
You must be signed in to change notification settings - Fork 21
Open
Labels
Description
Hi,
I am trying to save some variables of interest in my simulation using jldsave. Note that I can't use JLD2Writer or outputwriter because I need to save variables from the ocean, atmosphere and sea ice in the same file, for checkpointing, and JLD2Writer is not supported for all components of a coupled OceanSeaIceModel yet (?).
I have the below MWE:
using MPI
using CUDA
MPI.Init()
atexit(MPI.Finalize)
using Oceananigans
using Oceananigans.Units
using Oceananigans.DistributedComputations
using Printf
using Dates
using Oceananigans.Architectures: on_architecture
output_path = expanduser("/g/data/v46/txs156/ocean-ensembles/outputs/")
arch = Distributed(GPU(); partition = Partition(y = DistributedComputations.Equal()), synchronized_communication=false)
function immersed_latlon_grid(underlying_grid::LatitudeLongitudeGrid;
radius = 5, # controls width of Gaussian (in degrees)
height = 2000, # hill height in meters
λc = 0, φc = 45, # hill center (lon, lat)
active_cells_map = false)
Lz = underlying_grid.Lz
# Convert degrees to radians if grid expects radians, but leaving in degrees here
bottom_height(λ, φ) = begin
# Gaussian hill centered at (λc, φc)
r² = (λ - λc)^2 + (φ - φc)^2
z = -Lz + height * exp(-r² / (2 * radius^2))
return z
end
grid = ImmersedBoundaryGrid(underlying_grid,
GridFittedBottom(bottom_height);
active_cells_map)
return grid
end
Nx, Ny, Nz = 100, 100, 50
Lx, Ly = 100, 100
@info "Defining vertical z faces"
depth = -6000.0 # Depth of the ocean in meters
z_faces = ExponentialDiscretization(Nz, depth, 0)
@info "Creating grid"
underlying_grid = LatitudeLongitudeGrid(arch,
size = (Nx, Ny, Nz),
z = z_faces,
halo = (6, 6, 3),
longitude = (0, 360),
latitude = (-70, 70))
@info "Defining grid"
grid = immersed_latlon_grid(underlying_grid; active_cells_map=true)
@info "Creating free surface"
free_surface = SplitExplicitFreeSurface(grid; substeps = 70)
@info "Creating model"
ocean_model = HydrostaticFreeSurfaceModel(; grid, free_surface, timestepper = :SplitRungeKutta3)
@info "Creating simulation"
simulation = Simulation(ocean_model; Δt=10, verbose=false, stop_time=2hours)
function save_restart(sim)
@info @sprintf("Saving checkpoint file")
localrank = Integer(sim.model.architecture.local_rank)
@info "Local rank: " * string(localrank)
@info "Saving filename" * output_path * "mwe_jldsave_distributed" * string(sim.model.clock.iteration) * "_rank$(localrank).jld2"
jldsave(output_path * "mwe_jldsave_distributed" * string(sim.model.clock.iteration) * "_rank$(localrank).jld2";
u = on_architecture(CPU(), (sim.model.velocities.u)))
end
# Nice progress messaging is helpful:
## Print a progress message
progress_message(sim) = @printf("Iteration: %04d, time: %s, Δt: %s, max(|w|) = %.1e ms⁻¹, wall time: %s\n",
iteration(sim), prettytime(sim), prettytime(sim.Δt),
maximum(abs, sim.model.velocities.w), prettytime(sim.run_wall_time))
add_callback!(simulation, progress_message, IterationInterval(40))
add_callback!(simulation, save_restart, IterationInterval(40))
@info "Running simulation"
run!(simulation)But running the simulation leads to the error at jldsave(output_path * "mwe_jldsave_distributed" * string(sim.model.clock.iteration) * "_rank$(localrank).jld2"; :
ERROR: LoadError: type Nothing has no field rank
Stacktrace:
[1] getproperty(x::Nothing, f::Symbol)
@ Base ./Base.jl:37
[2] (::Oceananigans.BoundaryConditions.MultiRegionFillHalo{Oceananigans.BoundaryConditions.SouthAndNorth})(c::OffsetArrays.OffsetArray{Float64, 3, Array{Float64, 3}}, southbc::BoundaryCondition{Oceananigans.BoundaryConditions.MultiRegionCommunication, Nothing}, northbc::BoundaryCondition{Oceananigans.BoundaryConditions.MultiRegionCommunication, Nothing}, loc::Tuple{Center, Center, Nothing}, grid::LatitudeLongitudeGrid{Float64, Periodic, Oceananigans.Grids.FullyConnected, Bounded, Oceananigans.Grids.StaticVerticalDiscretization{OffsetArrays.OffsetVector{Float64, Vector{Float64}}, OffsetArrays.OffsetVector{Float64, Vector{Float64}}, OffsetArrays.OffsetVector{Float64, Vector{Float64}}, OffsetArrays.OffsetVector{Float64, Vector{Float64}}}, Float64, Float64, OffsetArrays.OffsetVector{Float64, StepRangeLen{Float64, Base.TwicePrecision{Float64}, Base.TwicePrecision{Float64}, Int64}}, OffsetArrays.OffsetVector{Float64, StepRangeLen{Float64, Base.TwicePrecision{Float64}, Base.TwicePrecision{Float64}, Int64}}, Float64, Float64, OffsetArrays.OffsetVector{Float64, StepRangeLen{Float64, Base.TwicePrecision{Float64}, Base.TwicePrecision{Float64}, Int64}}, OffsetArrays.OffsetVector{Float64, StepRangeLen{Float64, Base.TwicePrecision{Float64}, Base.TwicePrecision{Float64}, Int64}}, OffsetArrays.OffsetVector{Float64, Vector{Float64}}, OffsetArrays.OffsetVector{Float64, Vector{Float64}}, OffsetArrays.OffsetVector{Float64, Vector{Float64}}, OffsetArrays.OffsetVector{Float64, Vector{Float64}}, Float64, Float64, CPU, Int64}, buffers::Tuple{})
@ Oceananigans.MultiRegion /g/data/v46/txs156/Oceananigans.jl/src/MultiRegion/multi_region_boundary_conditions.jl:113
[3] fill_halo_event!
@ /g/data/v46/txs156/Oceananigans.jl/src/BoundaryConditions/fill_halo_regions.jl:40 [inlined]
[4] #fill_halo_regions!#26
@ /g/data/v46/txs156/Oceananigans.jl/src/BoundaryConditions/fill_halo_regions.jl:32 [inlined]
[5] fill_halo_regions!
@ /g/data/v46/txs156/Oceananigans.jl/src/BoundaryConditions/fill_halo_regions.jl:25 [inlined]
[6] #fill_halo_regions!#63
@ /g/data/v46/txs156/Oceananigans.jl/src/Fields/field.jl:857 [inlined]
@ Oceananigans.Fields /g/data/v46/txs156/Oceananigans.jl/src/Fields/field.jl:843
@ Oceananigans.ImmersedBoundaries /g/data/v46/txs156/Oceananigans.jl/src/ImmersedBoundaries/grid_fitted_bottom.jl:85
@ Oceananigans.ImmersedBoundaries /g/data/v46/txs156/Oceananigans.jl/src/ImmersedBoundaries/immersed_boundary_grid.jl:129
@ Oceananigans.Fields /g/data/v46/txs156/Oceananigans.jl/src/Fields/field.jl:468
@ Main /g/data/v46/txs156/ocean-ensembles/mwes/JLDSave_distributed.jl:82
[12] UndefVarError: `jldsave` not definedUndefVarError: `jldsave` not defined
Stacktrace:
[1]
Stacktrace:
@ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/callback.jl:15
@ Main /g/data/v46/txs156/ocean-ensembles/mwes/JLDSave_distributed.jl:82
@ Main /g/data/v46/txs156/ocean-ensembles/mwes/JLDSave_distributed.jl:82
@ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:238
@ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/callback.jl:15
@ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/callback.jl:15
@ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:136
@ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:238
@ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:238
@ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:105
@ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:136
@ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:136
@ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:92
[17] top-level scope
@ /g/data/v46/txs156/ocean-ensembles/mwes/JLDSave_distributed.jl:97
in expression starting at /g/data/v46/txs156/ocean-ensembles/mwes/JLDSave_distributed.jl:97
@ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:105
@ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:105
@ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:92
[7] top-level scope
@ /g/data/v46/txs156/ocean-ensembles/mwes/JLDSave_distributed.jl:97
in expression starting at /g/data/v46/txs156/ocean-ensembles/mwes/JLDSave_distributed.jl:97
@ Oceananigans.Simulations /g/data/v46/txs156/Oceananigans.jl/src/Simulations/run.jl:92
[7] top-level scope
@ /g/data/v46/txs156/ocean-ensembles/mwes/JLDSave_distributed.jl:97
in expression starting at /g/data/v46/txs156/ocean-ensembles/mwes/JLDSave_distributed.jl:97
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:
Process name: [[17346,1],0]
Exit code: 1
--------------------------------------------------------------------------Note I had to delete most of the stack trace because of character limits on Github...
But if you run the MWE with MPI (more than 1 GPU) you will see the issue I have!