Skip to content

Commit 0cf269f

Browse files
authored
Merge pull request #2 from alan-turing-institute/dawn
Add code for running on Dawn
2 parents 4f95095 + 9e259a3 commit 0cf269f

13 files changed

+761
-1
lines changed

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ venv/
135135
ENV/
136136
env.bak/
137137
venv.bak/
138+
venv_*/
138139

139140
# Spyder project settings
140141
.spyderproject
@@ -175,3 +176,7 @@ cython_debug/
175176

176177
# Aurora-specific things
177178
cdsapi.config
179+
era5
180+
181+
# slurm outputs
182+
slurm-*.out

.isort.cfg

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[settings]
2+
profile=black

.pre-commit-config.yaml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# See https://pre-commit.com for more information
2+
# See https://pre-commit.com/hooks.html for more hooks
3+
repos:
4+
- repo: https://github.com/pre-commit/pre-commit-hooks
5+
rev: v5.0.0
6+
hooks:
7+
- id: trailing-whitespace
8+
- id: end-of-file-fixer
9+
- id: check-yaml
10+
- id: check-added-large-files
11+
- id: check-merge-conflict
12+
- id: check-symlinks
13+
- id: mixed-line-ending
14+
- repo: https://github.com/psf/black
15+
rev: 25.1.0
16+
hooks:
17+
- id: black
18+
exclude: baskerville
19+
- repo: https://github.com/pycqa/isort
20+
rev: 6.0.1
21+
hooks:
22+
- id: isort
23+
name: isort (python)
24+
exclude: baskerville
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
url: https://cds.climate.copernicus.eu/api
2-
key:
2+
key:

dawn/README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Running Aurora on Dawn
2+
3+
## Set up (from scratch)
4+
5+
1. Create venv with python=3.11
6+
2. Activate venv
7+
3. Install aurora (`pip install microsoft-aurora`)
8+
4. Install Intel Extension for PyTorch as per the [docs](https://pytorch-extension.intel.com/installation)
9+
10+
## Set up (quick)
11+
12+
1. Create venv with python=3.11
13+
2. Activate venv
14+
3. Install from the requirements file: `pip install -r environments/requirements.txt`

dawn/batch/fine_tune.sh

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash -l
# Slurm batch script: resets the module environment, activates the project
# virtualenv and runs scripts/fine_tune.py.
# NOTE(review): every path below is relative, so this presumably has to be
# submitted from the dawn/batch directory — confirm.
2+
#SBATCH --job-name=fine-tuning
3+
#SBATCH --account=airr-p8-rcpp-dawn-gpu
4+
#SBATCH --partition=pvc9 # Dawn PVC partition
5+
#SBATCH -n 1 # Number of tasks (usually number of MPI ranks)
6+
#SBATCH -c 96 # Number of cores per task
7+
#SBATCH --gres=gpu:4 # Number of requested GPUs per node
8+
9+
# Echo each command before it runs so the job log shows what was executed.
set -o xtrace
10+
# Stop the job at the first command that exits with a non-zero status.
set -o errexit
11+
12+
# Start from a clean module environment, then load the default-dawn module.
module purge
13+
module load default-dawn
14+
15+
# Activate the virtualenv kept under dawn/environments (matched by the
# venv_*/ entry in .gitignore).
source ../environments/venv_3_11_9/bin/activate
16+
17+
# NOTE(review): Level Zero device-hierarchy setting; presumably COMPOSITE
# exposes each GPU card as one device instead of one device per tile —
# confirm against the Intel documentation.
export ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE
18+
19+
# Run from the scripts directory so fine_tune.py is found by its bare name.
cd ../scripts/
20+
21+
ipython fine_tune.py

dawn/environments/requirements.txt

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
# Note that Intel's patched version of torch and intel-extension-for-pytorch
2+
# need to be installed from Intel's package repo and after Aurora is installed.
3+
annotated-types==0.7.0
4+
anyio==4.9.0
5+
argon2-cffi==23.1.0
6+
argon2-cffi-bindings==21.2.0
7+
arrow==1.3.0
8+
asttokens==3.0.0
9+
async-lru==2.0.5
10+
attrs==25.3.0
11+
azure-core==1.34.0
12+
azure-storage-blob==12.25.1
13+
babel==2.17.0
14+
beautifulsoup4==4.13.4
15+
black==25.1.0
16+
bleach==6.2.0
17+
cdsapi==0.7.5
18+
certifi==2025.4.26
19+
cffi==1.17.1
20+
cfgv==3.4.0
21+
cftime==1.6.4.post1
22+
charset-normalizer==3.4.2
23+
click==8.2.1
24+
comm==0.2.2
25+
contourpy==1.3.2
26+
cryptography==44.0.3
27+
cycler==0.12.1
28+
datapi==0.4.0
29+
debugpy==1.8.14
30+
decorator==5.2.1
31+
defusedxml==0.7.1
32+
distlib==0.3.9
33+
dpcpp-cpp-rt==2025.0.4
34+
einops==0.8.1
35+
executing==2.2.0
36+
fastjsonschema==2.21.1
37+
filelock==3.13.1
38+
fonttools==4.58.0
39+
fqdn==1.5.1
40+
fsspec==2024.6.1
41+
h11==0.16.0
42+
httpcore==1.0.9
43+
httpx==0.28.1
44+
huggingface-hub==0.30.2
45+
identify==2.6.10
46+
idna==3.10
47+
impi-devel==2021.14.1
48+
impi-rt==2021.14.1
49+
intel-cmplr-lib-rt==2025.0.4
50+
intel-cmplr-lib-ur==2025.0.4
51+
intel-cmplr-lic-rt==2025.0.4
52+
intel-opencl-rt==2025.0.4
53+
intel-openmp==2025.0.4
54+
intel-pti==0.10.1
55+
intel-sycl-rt==2025.0.4
56+
intel_extension_for_pytorch==2.7.10+xpu
57+
ipykernel==6.29.5
58+
ipython==9.2.0
59+
ipython_pygments_lexers==1.1.1
60+
ipywidgets==8.1.7
61+
isodate==0.7.2
62+
isoduration==20.11.0
63+
isort==6.0.1
64+
jedi==0.19.2
65+
Jinja2==3.1.4
66+
json5==0.12.0
67+
jsonpointer==3.0.0
68+
jsonschema==4.23.0
69+
jsonschema-specifications==2025.4.1
70+
jupyter==1.1.1
71+
jupyter-console==6.6.3
72+
jupyter-events==0.12.0
73+
jupyter-lsp==2.2.5
74+
jupyter_client==8.6.3
75+
jupyter_core==5.7.2
76+
jupyter_server==2.16.0
77+
jupyter_server_terminals==0.5.3
78+
jupyterlab==4.4.2
79+
jupyterlab_pygments==0.3.0
80+
jupyterlab_server==2.27.3
81+
jupyterlab_widgets==3.0.15
82+
kiwisolver==1.4.8
83+
MarkupSafe==2.1.5
84+
matplotlib==3.10.3
85+
matplotlib-inline==0.1.7
86+
-e git+ssh://git@github.com/alan-turing-institute/aurora.git@3b1b9934dfe4c310bd40d83ea787b3a1694d2478#egg=microsoft_aurora
87+
mistune==3.1.3
88+
mkl==2025.0.1
89+
mkl-dpcpp==2025.0.1
90+
mpmath==1.3.0
91+
multiurl==0.3.5
92+
mypy_extensions==1.1.0
93+
nbclient==0.10.2
94+
nbconvert==7.16.6
95+
nbformat==5.10.4
96+
nest-asyncio==1.6.0
97+
netCDF4==1.7.2
98+
networkx==3.3
99+
nodeenv==1.9.1
100+
notebook==7.4.2
101+
notebook_shim==0.2.4
102+
numpy==2.1.2
103+
nvidia-cublas-cu12==12.6.4.1
104+
nvidia-cuda-cupti-cu12==12.6.80
105+
nvidia-cuda-nvrtc-cu12==12.6.77
106+
nvidia-cuda-runtime-cu12==12.6.77
107+
nvidia-cudnn-cu12==9.5.1.17
108+
nvidia-cufft-cu12==11.3.0.4
109+
nvidia-cufile-cu12==1.11.1.6
110+
nvidia-curand-cu12==10.3.7.77
111+
nvidia-cusolver-cu12==11.7.1.2
112+
nvidia-cusparse-cu12==12.5.4.2
113+
nvidia-cusparselt-cu12==0.6.3
114+
nvidia-nccl-cu12==2.26.2
115+
nvidia-nvjitlink-cu12==12.6.85
116+
nvidia-nvtx-cu12==12.6.77
117+
oneccl==2021.14.1
118+
oneccl-bind-pt==2.7.0+xpu
119+
oneccl-devel==2021.14.1
120+
onemkl-sycl-blas==2025.0.1
121+
onemkl-sycl-datafitting==2025.0.1
122+
onemkl-sycl-dft==2025.0.1
123+
onemkl-sycl-lapack==2025.0.1
124+
onemkl-sycl-rng==2025.0.1
125+
onemkl-sycl-sparse==2025.0.1
126+
onemkl-sycl-stats==2025.0.1
127+
onemkl-sycl-vm==2025.0.1
128+
overrides==7.7.0
129+
packaging==25.0
130+
pandas==2.2.3
131+
pandocfilters==1.5.1
132+
parso==0.8.4
133+
pathspec==0.12.1
134+
pexpect==4.9.0
135+
pillow==11.0.0
136+
platformdirs==4.3.7
137+
pre_commit==4.2.0
138+
prometheus_client==0.22.0
139+
prompt_toolkit==3.0.51
140+
psutil==7.0.0
141+
ptyprocess==0.7.0
142+
pure_eval==0.2.3
143+
pycparser==2.22
144+
pydantic==2.11.4
145+
pydantic_core==2.33.2
146+
Pygments==2.19.1
147+
pyparsing==3.2.3
148+
python-dateutil==2.9.0.post0
149+
python-json-logger==3.3.0
150+
pytorch-triton-xpu==3.3.0
151+
pytz==2025.2
152+
PyYAML==6.0.2
153+
pyzmq==26.4.0
154+
referencing==0.36.2
155+
requests==2.32.3
156+
rfc3339-validator==0.1.4
157+
rfc3986-validator==0.1.1
158+
rpds-py==0.25.1
159+
ruamel.yaml==0.18.10
160+
ruamel.yaml.clib==0.2.12
161+
scipy==1.15.2
162+
Send2Trash==1.8.3
163+
six==1.17.0
164+
sniffio==1.3.1
165+
soupsieve==2.7
166+
stack-data==0.6.3
167+
sympy==1.13.3
168+
tbb==2022.1.0
169+
tcmlib==1.2.0
170+
terminado==0.18.1
171+
timm==0.6.13
172+
tinycss2==1.4.0
173+
torch==2.7.0+xpu
174+
torchaudio==2.7.0+xpu
175+
torchvision==0.22.0+xpu
176+
tornado==6.5
177+
tqdm==4.67.1
178+
traitlets==5.14.3
179+
triton==3.3.0
180+
types-python-dateutil==2.9.0.20250516
181+
typing-inspection==0.4.0
182+
typing_extensions==4.12.2
183+
tzdata==2025.2
184+
umf==0.9.1
185+
uri-template==1.3.0
186+
urllib3==2.4.0
187+
virtualenv==20.31.1
188+
wcwidth==0.2.13
189+
webcolors==24.11.1
190+
webencodings==0.5.1
191+
websocket-client==1.8.0
192+
widgetsnbextension==4.0.14
193+
xarray==2025.4.0

dawn/scripts/aurora_loss.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
"""Loss functions for Aurora model training."""
2+
3+
import torch
4+
5+
6+
def mae(x_hat_t, x_t):
    """Return the weighted mean-absolute-error loss of a prediction.

    For each weighted variable the absolute error is summed over every
    element of the tensor, multiplied by the variable's weight and divided
    by a fixed grid size.  The surface total is scaled by ``alpha`` and
    added to the atmospheric total, and the result is multiplied by
    ``lamb / n_variables``.

    Args:
        x_hat_t: Prediction. Must expose ``surf_vars`` and ``atmos_vars``
            mappings from variable name to tensor.
        x_t: Target with the same two mappings. Surface targets are indexed
            as 4-D tensors and atmospheric targets as 5-D tensors; both are
            cropped to at most the first 720 entries of their
            second-to-last axis before being compared to the prediction.

    Returns:
        A zero-dimensional tensor holding the loss.

    Raises:
        KeyError: If a weighted variable is missing from either input.
    """
    # Per-variable weights; only the variables listed here contribute.
    surface_weights = {
        "2t": 3.0,
        "msl": 1.5,
        "10u": 0.77,
        "10v": 0.66,
    }
    atmos_weights = {
        "z": 2.8,
        "q": 0.78,
        "t": 1.7,
        "u": 0.87,
        "v": 0.6,
    }

    # Fixed normalisation sizes.
    # NOTE(review): presumably latitude x longitude (x pressure levels) of a
    # 0.25-degree grid — confirm. They are constants, not read from the
    # tensors, so other grid sizes are still divided by these values.
    n_lat, n_lon, n_levels = 720, 1440, 13
    surface_norm = n_lat * n_lon
    atmos_norm = n_lat * n_lon * n_levels

    # Generator expressions avoid building a throwaway list for sum().
    surface_term = sum(
        (weight / surface_norm)
        * torch.sum(
            torch.abs(x_hat_t.surf_vars[name] - x_t.surf_vars[name][:, :, :n_lat, :])
        )
        for name, weight in surface_weights.items()
    )
    atmos_term = sum(
        (weight / atmos_norm)
        * torch.sum(
            torch.abs(
                x_hat_t.atmos_vars[name] - x_t.atmos_vars[name][:, :, :, :n_lat, :]
            )
        )
        for name, weight in atmos_weights.items()
    )

    alpha = 0.25  # relative weight of the surface term
    lamb = 2  # overall scale of the loss
    # Number of weighted variables: 4 surface + 5 atmospheric = 9.
    n_variables = len(surface_weights) + len(atmos_weights)

    return (lamb / n_variables) * ((alpha * surface_term) + atmos_term)

0 commit comments

Comments
 (0)