Skip to content
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
573a366
add SplitExplicit tests for checkpointer
navidcy Mar 20, 2025
41cb115
add SplitExplicit tests for checkpointer
navidcy Mar 20, 2025
0a52d4e
expose some checkpointer functionality; don't store properties as a C…
navidcy Mar 20, 2025
a776730
use julia v1.10.9
navidcy Mar 20, 2025
b66ae1b
pass properties to write_output!
navidcy Mar 20, 2025
be22062
bump patch release
navidcy Mar 20, 2025
0b67377
validate_properties -> validate_checkpointed_properties
navidcy Mar 20, 2025
fe8e70a
add default properties kwarg to write_output!
navidcy Mar 20, 2025
83f4ea2
Merge branch 'main' into ncc/checkopointer-shenanigans-2
navidcy Mar 22, 2025
cbf323d
remove stray spaces and add backticks
navidcy Mar 25, 2025
40dccba
update docstring
navidcy Mar 25, 2025
38ef5b6
code alignment
navidcy Mar 25, 2025
e526910
merge main
navidcy Mar 25, 2025
43a644f
Merge branch 'ncc/checkopointer-shenanigans-2' of github.com:CliMA/Oc…
navidcy Mar 25, 2025
3aeeed7
set_clock! + clock.last_stage_Δt, clock.last_Δt in tick!(clock, Δt)
navidcy Mar 25, 2025
c0eb07b
Merge branch 'main' into ncc/checkopointer-shenanigans-2
navidcy Mar 25, 2025
c3c007f
add set_clock! for OceananigansModels
navidcy Mar 25, 2025
a21ff8a
clock.last_Δt = Δt is part of tick!(clock, Δt)
navidcy Mar 25, 2025
3329188
add set_clock!(::Simulation, clock)
navidcy Mar 25, 2025
2f6ef7d
add docs for align_time_step
navidcy Mar 25, 2025
96286b1
wip
navidcy Apr 24, 2025
2686f8c
merge main
navidcy Apr 24, 2025
d5420c7
Merge branch 'main' into ncc/checkopointer-shenanigans-2
navidcy May 23, 2025
7607706
Merge branch 'main' into ncc/checkopointer-shenanigans-2
navidcy Jun 26, 2025
847c953
Update Project.toml
navidcy Jun 26, 2025
635550b
Update src/OutputWriters/checkpointer.jl
navidcy Jun 26, 2025
0ca81dd
Apply suggestions from code review
navidcy Jun 26, 2025
666dfcc
Update checkpointer.jl
navidcy Jun 26, 2025
97e3ede
Update simulation.jl
navidcy Jun 26, 2025
0992038
Update clock.jl
navidcy Jun 26, 2025
a9e6657
Update clock.jl
navidcy Jun 26, 2025
e4e07e0
clock from main
navidcy Jun 26, 2025
3da6865
updates in clock
navidcy Jun 26, 2025
b91aa9b
clock from main
navidcy Jun 26, 2025
0ac4427
updates in clock
navidcy Jun 26, 2025
cb01cf4
updates in clock
navidcy Jun 26, 2025
2ac952f
clock from main
navidcy Jun 26, 2025
a673011
Update clock.jl
navidcy Jun 26, 2025
28f6e39
import AbstractModel
navidcy Jun 27, 2025
9cface9
Merge branch 'main' into ncc/checkopointer-shenanigans-2
navidcy Jul 6, 2025
023df35
Merge branch 'main' into ncc/checkopointer-shenanigans-2
navidcy Jul 9, 2025
db85355
Update runge_kutta_3.jl
navidcy Jul 9, 2025
8c9f32d
Merge branch 'main' into ncc/checkopointer-shenanigans-2
navidcy Jul 9, 2025
ac6d76f
Merge branch 'main' into ncc/checkopointer-shenanigans-2
navidcy Jul 10, 2025
4319b16
Merge branch 'main' into ncc/checkopointer-shenanigans-2
navidcy Jul 16, 2025
05dc781
Merge branch 'main' into ncc/checkopointer-shenanigans-2
navidcy Jul 26, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
env:
JULIA_VERSION: "1.10.8"
JULIA_VERSION: "1.10.9"
JULIA_MINOR_VERSION: "1.10"
TARTARUS_HOME: "/storage5/buildkite-agent"
JULIA_PKG_SERVER_REGISTRY_PREFERENCE: eager
Expand Down Expand Up @@ -209,7 +209,7 @@ steps:
architecture: CPU
retry:
automatic:
- exit_status: 1
- exit_status: 1
limit: 1
depends_on: "init"

Expand All @@ -224,7 +224,7 @@ steps:
architecture: CPU
retry:
automatic:
- exit_status: 1
- exit_status: 1
limit: 1
depends_on: "init"

Expand Down Expand Up @@ -337,7 +337,7 @@ steps:
##### Turbulence Closures
#####


- label: "🎣 gpu turbulence closures"
env:
JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
Expand Down Expand Up @@ -683,7 +683,7 @@ steps:
architecture: CPU
retry:
automatic:
- exit_status: 1
- exit_status: 1
limit: 1
depends_on: "init"

Expand All @@ -699,7 +699,7 @@ steps:
architecture: CPU
retry:
automatic:
- exit_status: 1
- exit_status: 1
limit: 1
depends_on: "init"

Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Oceananigans"
uuid = "9e8cae18-63c1-5223-a75c-80ca9d6e9a09"
authors = ["Climate Modeling Alliance and contributors"]
version = "0.96.2"
version = "0.96.3"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
Expand Down
116 changes: 64 additions & 52 deletions src/OutputWriters/checkpointer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,32 +5,60 @@ using Oceananigans: fields, prognostic_fields
using Oceananigans.Fields: offset_data
using Oceananigans.TimeSteppers: QuasiAdamsBashforth2TimeStepper

import Oceananigans.Fields: set!
import Oceananigans.Fields: set!

mutable struct Checkpointer{T, P} <: AbstractOutputWriter
mutable struct Checkpointer{T} <: AbstractOutputWriter
schedule :: T
dir :: String
prefix :: String
properties :: P
overwrite_existing :: Bool
verbose :: Bool
cleanup :: Bool
end

function default_checkpointed_properties(model)
properties = [:grid, :particles, :clock, :timestepper]
#if has_ab2_timestepper(model)
# push!(properties, :timestepper)
#end
return properties
end
default_checkpointed_properties(model) = [:grid, :particles, :clock, :timestepper]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would add the timestepper only if we use AB2, in the end RK3 does not require it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should the fallback be nothing, and each model add defaults that are specific to the model? The models have to figure this out.


"""
    has_ab2_timestepper(model)

Return `true` if `model.timestepper` is a `QuasiAdamsBashforth2TimeStepper`.
Return `false` otherwise, including when evaluating that check throws
(for example because `model` has no `timestepper` property).
"""
function has_ab2_timestepper(model)
    try
        return model.timestepper isa QuasiAdamsBashforth2TimeStepper
    catch
        # Any error raised while inspecting the model is treated as "not AB2".
        return false
    end
end

"""
    required_checkpointed_properties(model)

Return the vector of property names that must be checkpointed so that `set!`
can pick up from a checkpoint: always `:grid`, `:particles`, and `:clock`,
plus `:timestepper` when `has_ab2_timestepper(model)` is true.
"""
function required_checkpointed_properties(model)
    always_required = [:grid, :particles, :clock]

    # The timestepper is only added when the model reports an AB2 timestepper.
    return has_ab2_timestepper(model) ? push!(always_required, :timestepper) : always_required
end

"""
    validate_checkpointed_properties(model, properties)

Return a validated vector of the `model` properties to checkpoint.

Starting from a copy of `properties`:

- every name in `required_checkpointed_properties(model)` that is missing is
  appended, with a warning;
- an error is thrown if an entry is not a `Symbol` or is not one of
  `propertynames(model)`;
- every non-required property for which `has_reference(Function, getproperty(model, p))`
  is true is dropped, with a warning.

The `properties` collection passed in is not mutated; use the returned vector.
"""
function validate_checkpointed_properties(model, properties)
    required_properties = required_checkpointed_properties(model)

    # Work on a copy so the caller's collection is left untouched. `collect`
    # also lets immutable collections such as tuples be passed in.
    validated = collect(properties)

    for rp in required_properties
        if rp ∉ validated
            @warn "$rp is required for checkpointing. It will be added to checkpointed properties"
            push!(validated, rp)
        end
    end

    # Names to drop are gathered here and removed after the loop. Removing
    # them from `validated` while iterating over it would skip the entry that
    # follows each removed one, leaving it unvalidated.
    dropped = Symbol[]

    for p in validated
        p isa Symbol || error("Property $p to be checkpointed must be a Symbol.")
        p ∉ propertynames(model) && error("Cannot checkpoint $p, it is not a model property!")

        if (p ∉ required_properties) && (p ∉ dropped) && has_reference(Function, getproperty(model, p))
            @warn "model.$p contains a function somewhere in its hierarchy and will not be checkpointed."
            push!(dropped, p)
        end
    end

    return filter(p -> p ∉ dropped, validated)
end


"""
Checkpointer(model;
schedule,
Expand Down Expand Up @@ -79,36 +107,11 @@ function Checkpointer(model; schedule,
prefix = "checkpoint",
overwrite_existing = false,
verbose = false,
cleanup = false,
properties = default_checkpointed_properties(model))

# Certain properties are required for `set!` to pickup from a checkpoint.
required_properties = [:grid, :particles, :clock]

if has_ab2_timestepper(model)
push!(required_properties, :timestepper)
end

for rp in required_properties
if rp ∉ properties
@warn "$rp is required for checkpointing. It will be added to checkpointed properties"
push!(properties, rp)
end
end

for p in properties
p isa Symbol || error("Property $p to be checkpointed must be a Symbol.")
p ∉ propertynames(model) && error("Cannot checkpoint $p, it is not a model property!")

if (p ∉ required_properties) && has_reference(Function, getproperty(model, p))
@warn "model.$p contains a function somewhere in its hierarchy and will not be checkpointed."
filter!(e -> e != p, properties)
end
end
cleanup = false)

mkpath(dir)

return Checkpointer(schedule, dir, prefix, properties, overwrite_existing, verbose, cleanup)
return Checkpointer(schedule, dir, prefix, overwrite_existing, verbose, cleanup)
end

#####
Expand Down Expand Up @@ -177,29 +180,40 @@ end
function write_output!(c::Checkpointer, model)
filepath = checkpoint_path(model.clock.iteration, c)
c.verbose && @info "Checkpointing to file $filepath..."
addr = checkpointer_address(model)

t1 = time_ns()

jldopen(filepath, "w") do file
file["$addr/checkpointed_properties"] = c.properties
serializeproperties!(file, model, c.properties, addr)
model_fields = prognostic_fields(model)
field_names = keys(model_fields)
for name in field_names
full_address = "$addr/$name"
serializeproperty!(file, full_address, model_fields[name])
end
end
write_output!(c, model, filepath, "w")

t2, sz = time_ns(), filesize(filepath)

c.verbose && @info "Checkpointing done: time=$(prettytime((t2 - t1) * 1e-9)), size=$(pretty_filesize(sz))"

c.cleanup && cleanup_checkpoints(c)

return nothing
end

"""
    write_output!(c, model, filepath::AbstractString, mode::AbstractString;
                  properties = default_checkpointed_properties(model))

Write a checkpoint of `model` to the file at `filepath`, opened via `jldopen`
with the given `mode`.

The `properties` are first passed through `validate_checkpointed_properties`.
The validated list is stored under the `checkpointed_properties` key at
`checkpointer_address(model)`, the properties themselves are serialized with
`serializeproperties!`, and every field in `prognostic_fields(model)` is
serialized under its own name at the same address.

The first argument `c` is not used by this method. Returns `nothing`.
"""
function write_output!(c, model, filepath::AbstractString, mode::AbstractString;
                       properties = default_checkpointed_properties(model))

    properties = validate_checkpointed_properties(model, properties)
    addr = checkpointer_address(model)

    jldopen(filepath, mode) do file
        # Store the list of checkpointed property names itself. The assignment
        # must have its right-hand side on the same line, otherwise the return
        # value of `serializeproperties!` is what gets stored.
        file["$addr/checkpointed_properties"] = properties
        serializeproperties!(file, model, properties, addr)

        # Prognostic fields are written unconditionally, one entry per field.
        model_fields = prognostic_fields(model)
        for name in keys(model_fields)
            serializeproperty!(file, "$addr/$name", model_fields[name])
        end
    end

    return nothing
end

function cleanup_checkpoints(checkpointer)
filepaths = glob(checkpoint_superprefix(checkpointer.prefix) * "*.jld2", checkpointer.dir)
latest_checkpoint_filepath = latest_checkpoint(checkpointer, filepaths)
Expand All @@ -212,13 +226,12 @@ end
#####

"""
set!(model, filepath::AbstractString)
set!(model::AbstractModel, filepath::AbstractString)

Set data in `model.velocities`, `model.tracers`, `model.timestepper.Gⁿ`, and
`model.timestepper.G⁻` to checkpointed data stored at `filepath`.
"""
function set!(model::AbstractModel, filepath::AbstractString)

addr = checkpointer_address(model)

jldopen(filepath, "r") do file
Expand Down Expand Up @@ -280,9 +293,8 @@ function set_time_stepper_tendencies!(timestepper, file, model_fields, addr)
return nothing
end

# For self-starting timesteppers like RK3 we do nothing
# For self-starting timesteppers like RK3 we do nothing
# Fallback method: nothing to restore for timesteppers without a specialized method.
function set_time_stepper!(timestepper, args...)
    return nothing
end

# The AB2 timestepper forwards to `set_time_stepper_tendencies!` with the same arguments.
function set_time_stepper!(timestepper::QuasiAdamsBashforth2TimeStepper, args...)
    return set_time_stepper_tendencies!(timestepper, args...)
end

25 changes: 12 additions & 13 deletions src/Simulations/run.jl
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ end
"""
aligned_time_step(sim, Δt)

Return a time step 'aligned' with `sim.stop_time`, output writer schedules,
Return a time step 'aligned' with `sim.stop_time`, output writer schedules,
and callback schedules. Alignment with `sim.stop_time` takes precedence.
"""
function aligned_time_step(sim::Simulation, Δt)
Expand All @@ -45,7 +45,7 @@ function aligned_time_step(sim::Simulation, Δt)

# Align time step with output writing and callback execution
aligned_Δt = schedule_aligned_time_step(sim, aligned_Δt)

# Align time step with simulation stop time
time_left = unit_time(sim.stop_time - clock.time)
aligned_Δt = min(aligned_Δt, time_left)
Expand Down Expand Up @@ -105,7 +105,7 @@ function run!(sim; pickup=false)
time_step!(sim)
end

for callback in values(sim.callbacks)
for callback in values(sim.callbacks)
finalize!(callback, sim)
end

Expand Down Expand Up @@ -137,7 +137,7 @@ function time_step!(sim::Simulation)
initial_time_step = !(sim.initialized)
initial_time_step && initialize!(sim)

if initial_time_step && sim.verbose
if initial_time_step && sim.verbose
@info "Executing initial time step..."
start_time = time_ns()
end
Expand All @@ -153,7 +153,7 @@ function time_step!(sim::Simulation)

# Callbacks and callback-like things
for diag in values(sim.diagnostics)
diag.schedule(sim.model) && run_diagnostic!(diag, sim.model)
diag.schedule(sim.model) && run_diagnostic!(diag, sim.model)
end

for callback in values(sim.callbacks)
Expand All @@ -162,7 +162,7 @@ function time_step!(sim::Simulation)
end

for writer in values(sim.output_writers)
writer.schedule(sim.model) && write_output!(writer, sim.model)
writer.schedule(sim.model) && write_output!(writer, sim.model)
end

if initial_time_step && sim.verbose
Expand All @@ -188,14 +188,14 @@ we_want_to_pickup(pickup::Integer) = true
we_want_to_pickup(pickup::String) = true
we_want_to_pickup(pickup) = throw(ArgumentError("Cannot run! with pickup=$pickup"))

"""
initialize!(sim::Simulation, pickup=false)
"""
initialize!(sim::Simulation)

Initialize a simulation:

- Update the auxiliary state of the simulation (filling halo regions, computing auxiliary fields)
- Evaluate all diagnostics, callbacks, and output writers if sim.model.clock.iteration == 0
- Add diagnostics that "depend" on output writers
- Update the auxiliary state of the simulation (filling halo regions, computing auxiliary fields).
- Evaluate all diagnostics, callbacks, and output writers if `sim.model.clock.iteration == 0`.
- Add diagnostics that "depend" on output writers.
"""
function initialize!(sim::Simulation)
if sim.verbose
Expand Down Expand Up @@ -228,7 +228,7 @@ function initialize!(sim::Simulation)
run_diagnostic!(diag, model)
end

for callback in values(sim.callbacks)
for callback in values(sim.callbacks)
callback.callsite isa TimeStepCallsite && callback(sim)
end

Expand All @@ -247,4 +247,3 @@ function initialize!(sim::Simulation)

return nothing
end

6 changes: 2 additions & 4 deletions src/TimeSteppers/clock.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ import Base: show
import Oceananigans.Units: Time

"""
mutable struct Clock{T, FT}
mutable struct Clock{TT, DT, IT, S}

Keeps track of the current `time`, `last_Δt`, `iteration` number, and time-stepping `stage`.
Keep track of the current `time`, `last_Δt`, `iteration` number, and time-stepping `stage`.
The `stage` is updated only for multi-stage time-stepping methods. The `time::TT` is
either a number or a `DateTime` object.
"""
Expand Down Expand Up @@ -121,5 +121,3 @@ Adapt.adapt_structure(to, clock::Clock) = (time = clock.time,
last_stage_Δt = clock.last_stage_Δt,
iteration = clock.iteration,
stage = clock.stage)


20 changes: 12 additions & 8 deletions test/test_checkpointer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,8 @@ function test_thermal_bubble_checkpointer_output(arch)
return run_checkpointer_tests(true_model, test_model, Δt)
end

function test_hydrostatic_splash_checkpointer(arch, free_surface)
function test_hydrostatic_splash_checkpointer(grid, free_surface)
# Create and run "true model"
Nx, Ny, Nz = 16, 16, 4
Lx, Ly, Lz = 1, 1, 1

grid = RectilinearGrid(arch, size=(Nx, Ny, Nz), x=(-10, 10), y=(-10, 10), z=(-1, 0))
closure = ScalarDiffusivity(ν=1e-2, κ=1e-2)
true_model = HydrostaticFreeSurfaceModel(; grid, free_surface, closure, buoyancy=nothing, tracers=())
test_model = deepcopy(true_model)
Expand Down Expand Up @@ -196,12 +192,20 @@ end
for arch in archs
@testset "Checkpointer [$(typeof(arch))]" begin
@info " Testing Checkpointer [$(typeof(arch))]..."

test_thermal_bubble_checkpointer_output(arch)


# create a grid to test hydrostatic model
Nx, Ny, Nz = 16, 16, 4
Lx, Ly, Lz = 1, 1, 1
grid = RectilinearGrid(arch, size=(Nx, Ny, Nz), x=(-10, 10), y=(-10, 10), z=(-1, 0))

for free_surface in [ExplicitFreeSurface(gravitational_acceleration=1),
ImplicitFreeSurface(gravitational_acceleration=1)]
ImplicitFreeSurface(gravitational_acceleration=1),
SplitExplicitFreeSurface(gravitational_acceleration=1, substeps=5),
SplitExplicitFreeSurface(grid; cfl = 0.7, gravitational_acceleration=1)]

test_hydrostatic_splash_checkpointer(arch, free_surface)
test_hydrostatic_splash_checkpointer(grid, free_surface)
end

run_checkpointer_cleanup_tests(arch)
Expand Down
Loading