From 4a24e17384f270d7894e670a3f407f973d0237b1 Mon Sep 17 00:00:00 2001
From: Mariah Pope
Date: Wed, 29 Oct 2025 14:45:09 -0400
Subject: [PATCH] updates so AML works and has better documentation throughout
 the notebook

---
 .../replay/azure_ml/anemoi_pipeline.ipynb     | 51 +++++++++++++------
 .../azure_ml/conf/conda/ufs2arco-conda.yaml   |  6 +++
 .../replay/azure_ml/conf/data/replay.yaml     |  2 +-
 .../azure_ml/conf/data/submit_ufs2arco.sh     |  7 +--
 .../azure_ml/conf/training/submit_training.sh |  2 +-
 5 files changed, 45 insertions(+), 23 deletions(-)
 create mode 100644 examples/getting_started/replay/azure_ml/conf/conda/ufs2arco-conda.yaml

diff --git a/examples/getting_started/replay/azure_ml/anemoi_pipeline.ipynb b/examples/getting_started/replay/azure_ml/anemoi_pipeline.ipynb
index 4aea1fd..821191c 100644
--- a/examples/getting_started/replay/azure_ml/anemoi_pipeline.ipynb
+++ b/examples/getting_started/replay/azure_ml/anemoi_pipeline.ipynb
@@ -51,7 +51,7 @@
    "metadata": {},
    "source": [
     "## Step 1: Create conda environments\n",
-    "You only need to do this once! Therefore, check if these environments were already created by someone else before running. After creation, you should be able to find them within the \"environments\" tab in AzureML. Anytime you re-run this, it will create a new version (e.g. myenv:1 would then become myenv:2). This is helpful if there is anything you wish to update in your environment (package versions, add an additional package, etc.)\n",
+    "You only need to do this once (additionally, check if these environments were already created by someone else before running). After creation, you should be able to find them within the \"environments\" tab in AzureML. Anytime you re-run this, it will create a new version (e.g. myenv:1 would then become myenv:2).
This is helpful if there is anything you wish to update in your environment (package versions, add an additional package, etc.)\n",
     "\n",
     "Environments we will create:\n",
     "1) ufs2arco for data processing\n",
@@ -122,7 +122,20 @@
    "metadata": {},
    "source": [
     "## Step 2: Create replay dataset with ufs2arco\n",
-    "Saves a dataset to your default datastore that will include training and validation data together in one dataset. The zarr is saved to your default datastore."
+    "This step saves a single dataset containing both training and validation data. The zarr is written to your default datastore.\n",
+    "\n",
+    "\n",
+    "The default setup here assumes you have access to MPI. If you do not, make the following changes to create the data without MPI. Note that this will make the process take much longer (feel free to shrink the date range if you are simply running tests):\n",
+    "\n",
+    "- conf/data/replay.yaml: at the top of the yaml, change mover from `mpidatamover` to `datamover`, and add a line beneath it that says `batch_size: 2`\n",
+    "- conf/data/submit_ufs2arco.sh: change `mpirun --allow-run-as-root -np 8 ufs2arco replay.yaml` to simply `ufs2arco replay.yaml`\n",
+    "\n",
+    "Otherwise, you should not need to change anything and can simply run this cell.\n",
+    "\n",
+    "There are a few additional things to note:\n",
+    "- If you have executed this cell before and the job saved any output (for example after a job failure, or because you changed some configurations and re-ran), the new job will likely fail because a zarr already exists. Either delete the existing zarr, or rename the new one by changing `output_zarr`.\n",
+    "- This job assumes you have access to a `Standard-D13-v2` instance.
If you do not, change this to a CPU instance that you do have access to.\n",
+    "- The current format of this cell also assumes that you are using version 1 (e.g. `environment=\"ufs2arco:1\"`) of your ufs2arco environment. If you have recreated this environment for any reason, you will need to update the version number to the most recent version."
   ]
  },
  {
@@ -140,9 +153,9 @@
     ")\n",
     "\n",
     "command_job = command(\n",
-    "    code=\"./data\",\n",
+    "    code=\"conf/data\",\n",
     "    command=f\"bash submit_ufs2arco.sh ${{outputs.output_blob}} {output_path} {output_zarr}\",\n",
-    "    environment=\"ufs2arco:2\",\n",
+    "    environment=\"ufs2arco:1\",\n",
     "    compute=\"Standard-D13-v2\",\n",
     "    experiment_name=\"ufs2arco\",\n",
     "    display_name=\"training_dataset\",\n",
@@ -161,7 +174,13 @@
    "source": [
     "## Step 3: Submit a training job with anemoi-core\n",
     "\n",
-    "Checkpoints and plots all saved to the default datastore."
+    "After your dataset job has completed, run this cell to train the model. Checkpoints and plots are all saved to the default datastore.\n",
+    "\n",
+    "A few notes to check before submission:\n",
+    "- As noted in the data step, check your environment version and compute to make sure they match what you intend to use.\n",
+    "- A `Standard-NC4as-T4-v3` is a great option for this task if it is available to you.\n",
+    "\n",
+    "In future work we will make a version of this that can successfully run on a CPU, so that users can run it with free resources."
   ]
  },
  {
@@ -182,13 +201,13 @@
     "outputs = Output(\n",
     "    type=\"uri_folder\",\n",
     "    mode=\"upload\",\n",
-    "    path=f\"azureml://datastores/{default_ds.name}/paths/training_output/\",\n",
+    "    path=f\"azureml://datastores/{default_ds.name}/paths/training_output\",\n",
     ")\n",
     "\n",
     "command_job = command(\n",
-    "    code=\"./training\",\n",
+    "    code=\"conf/training\",\n",
     "    command=f\"bash submit_training.sh ${{inputs.data}} ${{outputs.output_dir}}\",\n",
-    "    environment=\"anemoi:7\",\n",
+    "    environment=\"anemoi:1\",\n",
     "    compute=\"Standard-NC4as-T4-v3\",\n",
     "    experiment_name=\"anemoi-training\",\n",
     "    display_name=\"anemoi-training\",\n",
@@ -208,9 +227,11 @@
    "source": [
     "## Step 4: Submit inference job with anemoi-inference\n",
     "\n",
-    "Load the model checkpoint from the default datastore and create one 240-hr forecast. The output is saved to the default datastore. We currently use a python script to submit inference instead of simply using \"anemoi-inference run\" command because it makes it a tiny bit easier to load everything. We can still use the other way, but I found this to be a little easier.\n",
+    "Run this cell after you have successfully completed training. This will load the model checkpoint from the default datastore and create one 240-hr forecast. The output is saved to the default datastore.\n",
     "\n",
-    "- TODO: create options for this step, and build out this python script to be a lot more robust."
+    "A few notes to check before submission:\n",
+    "- As always, check your compute and environment.\n",
+    "- In the first line (`input_path`), you will need to go to your default datastore and find the run_id (e.g. `3f476fd7-65ca-4d98-b3d5-b622d88a0d7d`) that is unique to you. This ID differs with each model run."
   ]
  },
  {
@@ -242,9 +263,9 @@
     ")\n",
     "\n",
     "command_job = command(\n",
-    "    code=\"./inference\",\n",
+    "    code=\"conf/inference\",\n",
     "    command=f\"python inference.py ${{inputs.ckpt}} ${{inputs.zarr}} ${{outputs.output_dir}}\",\n",
-    "    environment=\"anemoi:7\",\n",
+    "    environment=\"anemoi:1\",\n",
     "    compute=\"Standard-NC4as-T4-v3\",\n",
     "    experiment_name=\"anemoi-inference\",\n",
     "    display_name=\"anemoi-inference\",\n",
@@ -267,7 +288,7 @@
    "source": [
     "## Step 5: Submit verification job with wxvx\n",
     "\n",
-    "Load inference, post-process the output so that will work with wxvx, and run verification against the GGFS for a handful of variables."
+    "Load inference, post-process the output so that it will work with wxvx, and run verification against the GFS for a handful of variables."
   ]
  },
  {
@@ -291,9 +312,9 @@
     ")\n",
     "\n",
     "command_job = command(\n",
-    "    code=\"./verification\",\n",
+    "    code=\"conf/verification\",\n",
     "    command=f\"bash submit_wxvx.sh ${{inputs.zarr}} ${{outputs.output_dir}}\",\n",
-    "    environment=\"wxvx:4\",\n",
+    "    environment=\"wxvx:1\",\n",
     "    compute=\"Standard-D13-v2\",\n",
     "    experiment_name=\"wxvx\",\n",
     "    display_name=\"wxvx\",\n",
diff --git a/examples/getting_started/replay/azure_ml/conf/conda/ufs2arco-conda.yaml b/examples/getting_started/replay/azure_ml/conf/conda/ufs2arco-conda.yaml
new file mode 100644
index 0000000..2253476
--- /dev/null
+++ b/examples/getting_started/replay/azure_ml/conf/conda/ufs2arco-conda.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+dependencies:
+  - python=3.11
+  - ufs2arco=0.17.0
+name: ufs2arco
diff --git a/examples/getting_started/replay/azure_ml/conf/data/replay.yaml b/examples/getting_started/replay/azure_ml/conf/data/replay.yaml
index 61f89e1..0ec52c6 100644
--- a/examples/getting_started/replay/azure_ml/conf/data/replay.yaml
+++ b/examples/getting_started/replay/azure_ml/conf/data/replay.yaml
@@ -66,7 +66,7 @@ target:
   sort_channels_by_levels: True
   statistics_period:
     start: 2022-01-01T00
-    end: 2023-12-31T18
+
end: 2022-12-31T18
 
 forcings:
   - cos_latitude
diff --git a/examples/getting_started/replay/azure_ml/conf/data/submit_ufs2arco.sh b/examples/getting_started/replay/azure_ml/conf/data/submit_ufs2arco.sh
index 87143f4..20387d5 100644
--- a/examples/getting_started/replay/azure_ml/conf/data/submit_ufs2arco.sh
+++ b/examples/getting_started/replay/azure_ml/conf/data/submit_ufs2arco.sh
@@ -9,9 +9,4 @@ echo "Output zarr: $OUTPUT_ZARR"
 mkdir -p $OUTPUT_PATH
 ln -s $OUTPUT_BLOB "$OUTPUT_PATH/$OUTPUT_ZARR"
 
-export OMPI_COMM_WORLD_SIZE=8
-export OMPI_COMM_WORLD_RANK=0
-export RANK=0
-export WORLD_SIZE=8
-
-mpirun -np 8 ufs2arco replay.yaml
+mpirun --allow-run-as-root -np 8 ufs2arco replay.yaml
diff --git a/examples/getting_started/replay/azure_ml/conf/training/submit_training.sh b/examples/getting_started/replay/azure_ml/conf/training/submit_training.sh
index c87cdd5..9e5abe3 100644
--- a/examples/getting_started/replay/azure_ml/conf/training/submit_training.sh
+++ b/examples/getting_started/replay/azure_ml/conf/training/submit_training.sh
@@ -22,5 +22,5 @@ anemoi-training train --config-name=config.yaml
 cp -R /workdir/training_output/* $OUTPUT_DIR
 echo "Model output saved to: $OUTPUT_DIR"
 
-# TODO: This should formally log within azureml somehow.
+# TODO: This should formally log within azureml.
 # As of right now, we are just saving everything to blob storage and loading it from there later.
\ No newline at end of file
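A note on the `${{...}}` escaping that appears in every `command(...)` cell touched by this patch: the job commands are Python f-strings, so AzureML binding placeholders must be written with doubled braces (`${{outputs.output_blob}}`) to survive interpolation, while single-brace fields like `{output_path}` are filled in by Python before the job is submitted. A minimal sketch (the variable values below are hypothetical, for illustration only):

```python
# Sketch of how the f-string command in the ufs2arco cell is rendered.
# These values are placeholders, not the notebook's real settings.
output_path = "training_data"
output_zarr = "replay.zarr"

# Doubled braces escape Python interpolation, leaving ${outputs.output_blob}
# for AzureML to bind at job runtime; single braces are substituted by Python now.
cmd = f"bash submit_ufs2arco.sh ${{outputs.output_blob}} {output_path} {output_zarr}"
print(cmd)  # bash submit_ufs2arco.sh ${outputs.output_blob} training_data replay.zarr
```

The same pattern explains `${{inputs.data}}`, `${{inputs.ckpt}}`, and `${{outputs.output_dir}}` in the training, inference, and wxvx cells.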