From 8d7a603821d665ba929e9540dd973a3af43be972 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 00:39:36 +0100
Subject: [PATCH 01/38] WIP: add test_layer_norm_consistent; fix elif in 3D dims

---
 tests/test_tensor_parallel.py |  1 -
 tests/test_training.py        | 25 ++++++++++++++++++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py
index 25921c12a..ed383e17a 100644
--- a/tests/test_tensor_parallel.py
+++ b/tests/test_tensor_parallel.py
@@ -293,6 +293,5 @@ def test_tokenizer_raise_error_make_vocab_size_divisible_by(self):
 
         self.assertEqual(str(exc_info.value), "5121 is not divisible by 128")
 
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/test_training.py b/tests/test_training.py
index c77cb9af2..7115ef86d 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -50,7 +50,7 @@ def get_3d_dimensions():
         dp_size = 2
         pp_size = 2
         tp_size = 2
-    if num_gpus >= 4:
+    elif num_gpus >= 4:
         dp_size = 1
         pp_size = 2
         tp_size = 2
@@ -592,3 +592,26 @@ def test_skip_train_iteration(self):
         train_iterations = range(1,10)
         for i in train_iterations:
             self.assertTrue(f"iteration {i:8d}/" in cs.out)
+
+    def test_layer_norm_consistent(self):
+        # skip iterations setup
+        extra_args = f"""
+            --skip-train-iteration-range 2-2 4-7
+        """.split()
+
+        src_dir = self.src_dir
+        output_dir = self.get_auto_remove_tmp_dir()
+        args, ds_args, num_gpus = self.get_variation_config("base", output_dir, n_samples=200)
+        args.extend(extra_args)
+        script = [f"{src_dir}/pretrain_gpt.py"]
+        launcher = get_launcher(num_gpus)
+        cmd = launcher + script + args + ds_args
+        # keep for quick debug
+        # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
+
+        with CaptureStdout() as cs:
+            execute_subprocess_async(cmd, env=self.get_env())
+
+        checkpoint_path = os.path.join(output_dir, "checkpoints")
+        print(os.listdir(checkpoint_path))
+        assert False

From 240f673e298945f592b8fbe92661930b57897c50 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 11:25:11 +0100
Subject: [PATCH 02/38] WIP: compare input_layernorm.weight across model shards

---
 megatron/mpu/mappings.py |  2 ++
 tests/test_training.py   | 14 +++++++-------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/megatron/mpu/mappings.py b/megatron/mpu/mappings.py
index 821d9acfe..6056f94f6 100644
--- a/megatron/mpu/mappings.py
+++ b/megatron/mpu/mappings.py
@@ -82,6 +82,7 @@ def symbolic(graph, input_):
     
     @staticmethod
     def forward(ctx, input_):
+        # TODO: we need to assert that the input_ are all the same within a group
         return input_
 
     @staticmethod
@@ -102,6 +103,7 @@ def forward(ctx, input_):
 
     @staticmethod
     def backward(ctx, grad_output):
+        # TODO: we need to assert that the grad_output are all the same within a group
         return grad_output
 
 
diff --git a/tests/test_training.py b/tests/test_training.py
index 7115ef86d..eb6ce9b72 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -20,6 +20,8 @@
 import re
 import unittest
 from pathlib import Path
+
+import torch
 from parameterized import parameterized
 
 from megatron.testing_utils import (
@@ -594,15 +596,9 @@ def test_skip_train_iteration(self):
             self.assertTrue(f"iteration {i:8d}/" in cs.out)
 
     def test_layer_norm_consistent(self):
-        # skip iterations setup
-        extra_args = f"""
-            --skip-train-iteration-range 2-2 4-7
-        """.split()
-
         src_dir = self.src_dir
         output_dir = self.get_auto_remove_tmp_dir()
         args, ds_args, num_gpus = self.get_variation_config("base", output_dir, n_samples=200)
-        args.extend(extra_args)
         script = [f"{src_dir}/pretrain_gpt.py"]
         launcher = get_launcher(num_gpus)
         cmd = launcher + script + args + ds_args
@@ -612,6 +608,10 @@ def test_layer_norm_consistent(self):
         with CaptureStdout() as cs:
             execute_subprocess_async(cmd, env=self.get_env())
 
-        checkpoint_path = os.path.join(output_dir, "checkpoints")
+        checkpoint_path = os.path.join(output_dir, "checkpoints", "global_step_10")
         print(os.listdir(checkpoint_path))
+        key="input_layernorm.weight"
+        files_to_test=["layer_03-model_00-model_states.pt", "layer_03-model_01-model_states.pt"]
+        weights = [torch.load(os.path.join(checkpoint_path,file))[key] for file in files_to_test]
+        torch.testing.assert_close(weights[0], weights[1], rtol=0.0, atol=0.0, check_device=False)
         assert False

From 1cdcd7de2ece875988c7922c42e21c7af2ae1c99 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 11:27:35 +0100
Subject: [PATCH 03/38] Fix checkpoint directory name (global_step10)

---
 tests/test_training.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_training.py b/tests/test_training.py
index eb6ce9b72..99ff13b84 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -608,7 +608,7 @@ def test_layer_norm_consistent(self):
         with CaptureStdout() as cs:
             execute_subprocess_async(cmd, env=self.get_env())
 
-        checkpoint_path = os.path.join(output_dir, "checkpoints", "global_step_10")
+        checkpoint_path = os.path.join(output_dir, "checkpoints", "global_step10")
         print(os.listdir(checkpoint_path))
         key="input_layernorm.weight"
         files_to_test=["layer_03-model_00-model_states.pt", "layer_03-model_01-model_states.pt"]

From 29372806533f5ac09749729620bda958a28d6f6b Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 11:39:58 +0100
Subject: [PATCH 04/38] WIP: inline explicit TP=2 args in layer norm test

---
 tests/test_training.py | 66 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 65 insertions(+), 1 deletion(-)

diff --git a/tests/test_training.py b/tests/test_training.py
index 99ff13b84..943649e32 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -598,7 +598,71 @@ def test_skip_train_iteration(self):
     def test_layer_norm_consistent(self):
         src_dir = self.src_dir
         output_dir = self.get_auto_remove_tmp_dir()
-        args, ds_args, num_gpus = self.get_variation_config("base", output_dir, n_samples=200)
+        num_gpus = 2
+        args = f"""
+                --tensor-model-parallel-size {2}
+                --pipeline-model-parallel-size {1}
+                --distributed-backend nccl
+
+                --log-interval 1
+                --save-interval 10
+                --eval-interval 10
+                --eval-iters 5
+                --checkpoint-activations
+                --partition-activations
+                --exit-interval {20}
+
+                --merge-file {data_dir}/gpt2-tiny-merges.txt
+                --vocab-file {data_dir}/gpt2-tiny-vocab.json
+                --save {output_dir}/checkpoints
+                --load {output_dir}/checkpoints
+                --data-path {data_dir}/meg-gpt2-openwebtext_text_document
+                --tensorboard-dir {output_dir}/tensorboard
+                --tensorboard-queue-size 5
+                --log-timers-to-tensorboard
+                --log-batch-size-to-tensorboard
+                --log-validation-ppl-to-tensorboard
+
+                --num-layers 2
+                --hidden-size 64
+                --num-attention-heads 2
+                --seq-length {seq_len}
+                --max-position-embeddings 1024
+                --micro-batch-size 1
+                --global-batch-size 16
+
+                --optimizer adam
+                --adam-beta1 0.9
+                --adam-beta2 0.95
+                --adam-eps 1e-8
+                --lr 1e-4
+                --lr-warmup-samples 5
+                --clip-grad 1.0
+                --weight-decay 1e-1
+                --embed-layernorm
+                --fp16
+
+                --log-level debug
+                --log-level-replica info
+
+                --rampup-batch-size 2 2 200
+                --train-samples 200
+
+                --lr-decay-samples 6
+
+        """.split()
+
+        ds_args = f"""
+                --deepspeed
+                --deepspeed_config {self.test_file_dir_str}/ds_config.json
+                --zero-stage 1
+                --deepspeed-activation-checkpointing
+                --deepspeed_config {self.test_file_dir_str}/ds_config.json
+
+        """.split()
+
+        # args, ds_args, num_gpus = self.get_variation_config("base", output_dir, n_samples=200)
+
         script = [f"{src_dir}/pretrain_gpt.py"]
         launcher = get_launcher(num_gpus)
         cmd = launcher + script + args + ds_args

From 7fcff06bd3215082aeaeb935129b6866cd58019a Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 11:41:27 +0100
Subject: [PATCH 05/38] Define data_dir in test_layer_norm_consistent

---
 tests/test_training.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_training.py b/tests/test_training.py
index 943649e32..c34521963 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -599,6 +599,7 @@ def test_layer_norm_consistent(self):
         src_dir = self.src_dir
         output_dir = self.get_auto_remove_tmp_dir()
         num_gpus = 2
+        data_dir = f"{self.data_dir}/gpt2"
         args = f"""
                 --tensor-model-parallel-size {2}
                 --pipeline-model-parallel-size {1}

From 1f2f80072a5a2989461c9dbba5b55068d0f11da5 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 11:42:05 +0100
Subject: [PATCH 06/38] Define seq_len in test_layer_norm_consistent

---
 tests/test_training.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_training.py b/tests/test_training.py
index c34521963..70f161eba 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -599,6 +599,7 @@ def test_layer_norm_consistent(self):
         src_dir = self.src_dir
         output_dir = self.get_auto_remove_tmp_dir()
         num_gpus = 2
+        seq_len = 128
         data_dir = f"{self.data_dir}/gpt2"
         args = f"""
                 --tensor-model-parallel-size {2}

From f152e487d8c6dcb9caa95414aa2dc644b5f50984 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 11:50:19 +0100
Subject: [PATCH 07/38] Check layer norm weights in both checkpoints and layers

---
 tests/test_training.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/tests/test_training.py b/tests/test_training.py
index 70f161eba..5a29a432e 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -674,10 +674,17 @@ def test_layer_norm_consistent(self):
         with CaptureStdout() as cs:
             execute_subprocess_async(cmd, env=self.get_env())
 
-        checkpoint_path = os.path.join(output_dir, "checkpoints", "global_step10")
-        print(os.listdir(checkpoint_path))
-        key="input_layernorm.weight"
-        files_to_test=["layer_03-model_00-model_states.pt", "layer_03-model_01-model_states.pt"]
-        weights = [torch.load(os.path.join(checkpoint_path,file))[key] for file in files_to_test]
-        torch.testing.assert_close(weights[0], weights[1], rtol=0.0, atol=0.0, check_device=False)
+        checkpoints = ["global_step10", "global_step20"]
+        files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [3,4]]
+        for checkpoint in checkpoints:
+            print(checkpoint)
+            checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint)
+            print(os.listdir(checkpoint_path))
+            key = "input_layernorm.weight"
+            for files in files_to_compare:
+                print(files)
+                weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
+                ref = weights[0]
+                for weight in weights[1:]:
+                    torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False)
         assert False

From ce02dd16faee505ed7dc3beb1f49f9d5a0ba477f Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 11:52:14 +0100
Subject: [PATCH 08/38] Test with alibi

---
 tests/test_training.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_training.py b/tests/test_training.py
index 5a29a432e..0f3133d64 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -652,6 +652,7 @@ def test_layer_norm_consistent(self):
 
                 --lr-decay-samples 6
 
+                --position-embedding-type alibi
         """.split()
 
         ds_args = f"""

From 02365d145bd5887385abd359ebdc9e12ac9064cc Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 11:53:56 +0100
Subject: [PATCH 09/38] Still trying to reproduce

---
 tests/test_training.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_training.py b/tests/test_training.py
index 0f3133d64..d6eeba220 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -688,4 +688,5 @@ def test_layer_norm_consistent(self):
                 ref = weights[0]
                 for weight in weights[1:]:
                     torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False)
+                print(ref)
         assert False

From 42d6b4e3be2c8bc7b496ac7ada5ce258c4d190a6 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 11:57:23 +0100
Subject: [PATCH 10/38] Also compare layer norm biases and post-attention norm

---
 tests/test_training.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/tests/test_training.py b/tests/test_training.py
index d6eeba220..98ad9117c 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -676,17 +676,19 @@ def test_layer_norm_consistent(self):
             execute_subprocess_async(cmd, env=self.get_env())
 
         checkpoints = ["global_step10", "global_step20"]
+        keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", "post_attention_layernorm.bias"]
         files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [3,4]]
         for checkpoint in checkpoints:
             print(checkpoint)
             checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint)
             print(os.listdir(checkpoint_path))
-            key = "input_layernorm.weight"
-            for files in files_to_compare:
-                print(files)
-                weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
-                ref = weights[0]
-                for weight in weights[1:]:
-                    torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False)
-                print(ref)
+            for key in keys_to_compare:
+                print(key)
+                for files in files_to_compare:
+                    print(files)
+                    weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
+                    ref = weights[0]
+                    for weight in weights[1:]:
+                        torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False)
+                    print(key, ref)
         assert False

From c20c8ba466ae129ba3b81e2248b4e475494cc261 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 12:10:38 +0100
Subject: [PATCH 11/38] Have high LR to see weights actually change

---
 tests/test_training.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tests/test_training.py b/tests/test_training.py
index 98ad9117c..cb258147a 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -630,15 +630,14 @@ def test_layer_norm_consistent(self):
                 --num-attention-heads 2
                 --seq-length {seq_len}
                 --max-position-embeddings 1024
-                --micro-batch-size 1
+                --micro-batch-size 2
                 --global-batch-size 16
 
                 --optimizer adam
                 --adam-beta1 0.9
                 --adam-beta2 0.95
                 --adam-eps 1e-8
-                --lr 1e-4
-                --lr-warmup-samples 5
+                --lr 1e-1
                 --clip-grad 1.0
                 --weight-decay 1e-1
                 --embed-layernorm
@@ -650,8 +649,6 @@ def test_layer_norm_consistent(self):
                 --rampup-batch-size 2 2 200
                 --train-samples 200
 
-                --lr-decay-samples 6
-
                 --position-embedding-type alibi
         """.split()
 

From 7f2441edb6e23b6f923ac88e4f1c2e4bce19a028 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 14:08:48 +0100
Subject: [PATCH 12/38] Launch bf16

---
 tests/ds_config_bf16.json | 13 +++++++++++++
 tests/test_training.py    |  6 +++---
 2 files changed, 16 insertions(+), 3 deletions(-)
 create mode 100644 tests/ds_config_bf16.json

diff --git a/tests/ds_config_bf16.json b/tests/ds_config_bf16.json
new file mode 100644
index 000000000..7a07551f8
--- /dev/null
+++ b/tests/ds_config_bf16.json
@@ -0,0 +1,13 @@
+{
+  "train_micro_batch_size_per_gpu": 1,
+  "train_batch_size": 16,
+  "gradient_clipping": 1.0,
+  "zero_optimization": {
+    "stage": 1
+  },
+  "bf16": {
+    "enabled": true
+  },
+  "steps_per_print": 2000,
+  "wall_clock_breakdown": false
+}
\ No newline at end of file
diff --git a/tests/test_training.py b/tests/test_training.py
index cb258147a..b7daf9c7b 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -641,7 +641,7 @@ def test_layer_norm_consistent(self):
                 --clip-grad 1.0
                 --weight-decay 1e-1
                 --embed-layernorm
-                --fp16
+                --bf16
 
                 --log-level debug
                 --log-level-replica info
@@ -654,10 +654,10 @@ def test_layer_norm_consistent(self):
 
         ds_args = f"""
                 --deepspeed
-                --deepspeed_config {self.test_file_dir_str}/ds_config.json
+                --deepspeed_config {self.test_file_dir_str}/ds_config_bf16.json
                 --zero-stage 1
                 --deepspeed-activation-checkpointing
-                --deepspeed_config {self.test_file_dir_str}/ds_config.json
+                --deepspeed_config {self.test_file_dir_str}/ds_config_bf16.json
 
         """.split()
 

From a4172bf9c9ca6f4dc769d0e7cafc527ad0f8fe39 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 14:12:10 +0100
Subject: [PATCH 13/38] Use ZeRO stage 0 in ds_config_bf16.json

---
 tests/ds_config_bf16.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ds_config_bf16.json b/tests/ds_config_bf16.json
index 7a07551f8..1f02566c9 100644
--- a/tests/ds_config_bf16.json
+++ b/tests/ds_config_bf16.json
@@ -3,7 +3,7 @@
   "train_batch_size": 16,
   "gradient_clipping": 1.0,
   "zero_optimization": {
-    "stage": 1
+    "stage": 0
   },
   "bf16": {
     "enabled": true

From 5fbe1072df81b60b8a0987c8bd6e63cf50d0afb8 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 14:22:28 +0100
Subject: [PATCH 14/38] Make test work with both bf16 and fp16 to see which
 fails

---
 tests/test_training.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/tests/test_training.py b/tests/test_training.py
index b7daf9c7b..ea5c7700c 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -595,7 +595,8 @@ def test_skip_train_iteration(self):
         for i in train_iterations:
             self.assertTrue(f"iteration {i:8d}/" in cs.out)
 
-    def test_layer_norm_consistent(self):
+    @parameterized.expand(["bf16", "fp16"])
+    def test_layer_norm_consistent(self, variation):
         src_dir = self.src_dir
         output_dir = self.get_auto_remove_tmp_dir()
         num_gpus = 2
@@ -641,7 +642,6 @@ def test_layer_norm_consistent(self):
                 --clip-grad 1.0
                 --weight-decay 1e-1
                 --embed-layernorm
-                --bf16
 
                 --log-level debug
                 --log-level-replica info
@@ -654,13 +654,22 @@ def test_layer_norm_consistent(self):
 
         ds_args = f"""
                 --deepspeed
-                --deepspeed_config {self.test_file_dir_str}/ds_config_bf16.json
-                --zero-stage 1
                 --deepspeed-activation-checkpointing
-                --deepspeed_config {self.test_file_dir_str}/ds_config_bf16.json
-
         """.split()
 
+        if variation == "bf16":
+            args.append("--bf16")
+            ds_args += [
+                "--zero-stage", "0"
+                "--deepspeed_config", f"{self.test_file_dir_str}/ds_config_bf16.json"
+            ]
+        elif variation == "fp16":
+            args.append("--fp16")
+            ds_args += [
+                "--zero-stage", "1"
+                "--deepspeed_config", f"{self.test_file_dir_str}/ds_config.json"
+            ]
+
         # args, ds_args, num_gpus = self.get_variation_config("base", output_dir, n_samples=200)
 
         script = [f"{src_dir}/pretrain_gpt.py"]

From a0c09132ea1f6728c5165c49a997c9e743f53014 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 14:23:47 +0100
Subject: [PATCH 15/38] Add missing commas in ds_args lists

---
 tests/test_training.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_training.py b/tests/test_training.py
index ea5c7700c..2b842fbbd 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -660,13 +660,13 @@ def test_layer_norm_consistent(self, variation):
         if variation == "bf16":
             args.append("--bf16")
             ds_args += [
-                "--zero-stage", "0"
+                "--zero-stage", "0",
                 "--deepspeed_config", f"{self.test_file_dir_str}/ds_config_bf16.json"
             ]
         elif variation == "fp16":
             args.append("--fp16")
             ds_args += [
-                "--zero-stage", "1"
+                "--zero-stage", "1",
                 "--deepspeed_config", f"{self.test_file_dir_str}/ds_config.json"
             ]
 

From 6b19339caa07e80fb21a2e5698ab4a60b2506f43 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 14:25:54 +0100
Subject: [PATCH 16/38] Remove assert

---
 tests/test_training.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_training.py b/tests/test_training.py
index 2b842fbbd..d86d08074 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -697,4 +697,3 @@ def test_layer_norm_consistent(self, variation):
                     for weight in weights[1:]:
                         torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False)
                     print(key, ref)
-        assert False

From a5e329580e5cc57e4b34d62842f701ba498cbdce Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 15:05:22 +0100
Subject: [PATCH 17/38] Try to figure out how the divergence happens

---
 megatron/model/fused_layer_norm.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py
index 78645c236..3b3048a9f 100644
--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
@@ -19,6 +19,7 @@
 
 import numbers
 import torch
+from megatron import mpu
 from torch.nn.parameter import Parameter
 from torch.nn import init
 import importlib
@@ -31,7 +32,6 @@ class FusedLayerNormAffineFunction(torch.autograd.Function):
 
   @staticmethod
   def forward(ctx, input, weight, bias, normalized_shape, eps):
-
     ctx.normalized_shape = normalized_shape
     ctx.eps = eps
     input_ = input.contiguous()
@@ -84,7 +84,12 @@ def reset_parameters(self):
 
 
   def forward(self, input):
-
+    print(
+        mpu.get_tensor_model_parallel_group(),
+        mpu.get_tensor_model_parallel_rank(),
+        self.weight,
+        self.bias
+    )
     return FusedLayerNormAffineFunction.apply(
       input, self.weight, self.bias, self.normalized_shape,self.eps)
 

From 7145f6dfc4f3136fd78f8e24e9d8c6637a6afc79 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 15:28:55 +0100
Subject: [PATCH 18/38] I think bias starts to diverge first

---
 megatron/model/fused_layer_norm.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py
index 3b3048a9f..2e19451a5 100644
--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
@@ -19,7 +19,7 @@
 
 import numbers
 import torch
-from megatron import mpu
+from megatron import mpu, print_rank_0
 from torch.nn.parameter import Parameter
 from torch.nn import init
 import importlib
@@ -84,12 +84,17 @@ def reset_parameters(self):
 
 
   def forward(self, input):
-    print(
-        mpu.get_tensor_model_parallel_group(),
-        mpu.get_tensor_model_parallel_rank(),
-        self.weight,
-        self.bias
-    )
+    weights = [torch.empty_like(self.weight) for tp in mpu.get_tensor_model_parallel_world_size()]
+    torch.distributed.all_gather(weights, self.weight, group=mpu.get_tensor_model_parallel_group())
+    biases = [torch.empty_like(self.bias) for tp in mpu.get_tensor_model_parallel_world_size()]
+    torch.distributed.all_gather(biases, self.bias, group=mpu.get_tensor_model_parallel_group())
+    if any(torch.any(weight != self.weight) for weight in weights):
+        print_rank_0("Weight sync failed")
+        print_rank_0(weights)
+    if any(torch.any(bias != self.bias) for bias in biases):
+        print_rank_0("Bias sync failed")
+        print_rank_0(biases)
+
     return FusedLayerNormAffineFunction.apply(
       input, self.weight, self.bias, self.normalized_shape,self.eps)
 

From 311e53175909895dab82e5472b1e704017840af0 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 15:31:36 +0100
Subject: [PATCH 19/38] Print sync failures from TP rank 0, not print_rank_0

---
 megatron/model/fused_layer_norm.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py
index 2e19451a5..6a5e58b3d 100644
--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
@@ -89,11 +89,13 @@ def forward(self, input):
     biases = [torch.empty_like(self.bias) for tp in mpu.get_tensor_model_parallel_world_size()]
     torch.distributed.all_gather(biases, self.bias, group=mpu.get_tensor_model_parallel_group())
     if any(torch.any(weight != self.weight) for weight in weights):
-        print_rank_0("Weight sync failed")
-        print_rank_0(weights)
+        if mpu.get_tensor_model_parallel_rank() == 0:
+            print("Weight sync failed")
+            print(weights)
     if any(torch.any(bias != self.bias) for bias in biases):
-        print_rank_0("Bias sync failed")
-        print_rank_0(biases)
+        if mpu.get_tensor_model_parallel_rank() == 0:
+            print("Bias sync failed")
+            print(biases)
 
     return FusedLayerNormAffineFunction.apply(
       input, self.weight, self.bias, self.normalized_shape,self.eps)

From 39d4b8f9faf218130c38d101916eb6b83a70931b Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 15:33:28 +0100
Subject: [PATCH 20/38] Remove unused print_rank_0 import

---
 megatron/model/fused_layer_norm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py
index 6a5e58b3d..c416d332b 100644
--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
@@ -19,7 +19,7 @@
 
 import numbers
 import torch
-from megatron import mpu, print_rank_0
+from megatron import mpu
 from torch.nn.parameter import Parameter
 from torch.nn import init
 import importlib

From 8ffb278f63bb192bb4a4c3ce726bd26e605dd8ba Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 15:34:42 +0100
Subject: [PATCH 21/38] Iterate over range(TP world size) for gather buffers

---
 megatron/model/fused_layer_norm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py
index c416d332b..8430f528c 100644
--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
@@ -84,9 +84,9 @@ def reset_parameters(self):
 
 
   def forward(self, input):
-    weights = [torch.empty_like(self.weight) for tp in mpu.get_tensor_model_parallel_world_size()]
+    weights = [torch.empty_like(self.weight) for tp in range(mpu.get_tensor_model_parallel_world_size())]
     torch.distributed.all_gather(weights, self.weight, group=mpu.get_tensor_model_parallel_group())
-    biases = [torch.empty_like(self.bias) for tp in mpu.get_tensor_model_parallel_world_size()]
+    biases = [torch.empty_like(self.bias) for tp in range(mpu.get_tensor_model_parallel_world_size())]
     torch.distributed.all_gather(biases, self.bias, group=mpu.get_tensor_model_parallel_group())
     if any(torch.any(weight != self.weight) for weight in weights):
         if mpu.get_tensor_model_parallel_rank() == 0:

From 2389bfdfad878624cfa228d5295b6d9ed35bd45d Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 19:00:07 +0100
Subject: [PATCH 22/38] Add embed layer norm

---
 tests/test_training.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/tests/test_training.py b/tests/test_training.py
index d86d08074..6b140306c 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -685,15 +685,21 @@ def test_layer_norm_consistent(self, variation):
         keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", "post_attention_layernorm.bias"]
         files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [3,4]]
         for checkpoint in checkpoints:
-            print(checkpoint)
             checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint)
-            print(os.listdir(checkpoint_path))
             for key in keys_to_compare:
-                print(key)
                 for files in files_to_compare:
-                    print(files)
                     weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
                     ref = weights[0]
                     for weight in weights[1:]:
                         torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False)
-                    print(key, ref)
+
+        keys_to_compare = ["word_embeddings.norm.weight"]
+        files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [0]]
+        for checkpoint in checkpoints:
+            checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint)
+            for key in keys_to_compare:
+                for files in files_to_compare:
+                    weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
+                    ref = weights[0]
+                    for weight in weights[1:]:
+                        torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False)
\ No newline at end of file

From 0cf35ee3679898bab1524f0860d88e733867a12c Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 19:05:01 +0100
Subject: [PATCH 23/38] Read embedding layer norm from layer_01 checkpoint file

---
 tests/test_training.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_training.py b/tests/test_training.py
index 6b140306c..7ef6f958d 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -694,7 +694,7 @@ def test_layer_norm_consistent(self, variation):
                         torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False)
 
         keys_to_compare = ["word_embeddings.norm.weight"]
-        files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [0]]
+        files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [1]]
         for checkpoint in checkpoints:
             checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint)
             for key in keys_to_compare:

From f0d6d179fbaf5a413f928bddd66361e1f7488ec3 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 23:15:58 +0100
Subject: [PATCH 24/38] Backward compatibility on torch

---
 tests/test_training.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_training.py b/tests/test_training.py
index 7ef6f958d..fb72e59c6 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -33,7 +33,7 @@
     require_bnb_non_decorator,
     require_deepspeed,
     require_torch_gpu,
-    set_seed
+    set_seed, torch_assert_equal
 )
 
 set_seed(42)
@@ -691,7 +691,7 @@ def test_layer_norm_consistent(self, variation):
                     weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
                     ref = weights[0]
                     for weight in weights[1:]:
-                        torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False)
+                        torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False)
 
         keys_to_compare = ["word_embeddings.norm.weight"]
         files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [1]]
@@ -702,4 +702,4 @@ def test_layer_norm_consistent(self, variation):
                     weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
                     ref = weights[0]
                     for weight in weights[1:]:
-                        torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False)
\ No newline at end of file
+                        torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False)
\ No newline at end of file

From 07ccb3db1717f300fd09afaaf9eac57678ca0e5d Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 23:23:54 +0100
Subject: [PATCH 25/38] Better

---
 tests/test_training.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/test_training.py b/tests/test_training.py
index fb72e59c6..bc64ffb73 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -682,6 +682,8 @@ def test_layer_norm_consistent(self, variation):
             execute_subprocess_async(cmd, env=self.get_env())
 
         checkpoints = ["global_step10", "global_step20"]
+
+        # Check transformer layer norm
         keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", "post_attention_layernorm.bias"]
         files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [3,4]]
         for checkpoint in checkpoints:
@@ -693,6 +695,7 @@ def test_layer_norm_consistent(self, variation):
                     for weight in weights[1:]:
                         torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False)
 
+        # Check embed layer norm
         keys_to_compare = ["word_embeddings.norm.weight"]
         files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [1]]
         for checkpoint in checkpoints:
@@ -702,4 +705,4 @@ def test_layer_norm_consistent(self, variation):
                     weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
                     ref = weights[0]
                     for weight in weights[1:]:
-                        torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False)
\ No newline at end of file
+                        torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False)

From a5b5edc02b1251b4a72d0edb416ea06c35ff53b3 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas@stason.org>
Date: Fri, 25 Mar 2022 18:28:44 -0700
Subject: [PATCH 26/38] fix

---
 tests/test_training.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_training.py b/tests/test_training.py
index bc64ffb73..65067982e 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -693,7 +693,7 @@ def test_layer_norm_consistent(self, variation):
                     weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
                     ref = weights[0]
                     for weight in weights[1:]:
-                        torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False)
+                        torch_assert_equal(ref, weight, check_device=False)
 
         # Check embed layer norm
         keys_to_compare = ["word_embeddings.norm.weight"]
@@ -705,4 +705,4 @@ def test_layer_norm_consistent(self, variation):
                     weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
                     ref = weights[0]
                     for weight in weights[1:]:
-                        torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False)
+                        torch_assert_equal(ref, weight, check_device=False)

From c7f20066dcf0b7ec37dbaad40e3d9991505f4a28 Mon Sep 17 00:00:00 2001
From: Olatunji Ruwase <olruwase@microsoft.com>
Date: Tue, 29 Mar 2022 00:09:46 +0500
Subject: [PATCH 27/38] Sync lp/hp/optim for layer norms

---
 megatron/model/fused_layer_norm.py | 24 ++++-----
 megatron/training.py               | 22 ++++++++
 run_bf16.sh                        | 82 +++++++++++++++++++-----------
 tests/test_training.py             |  2 +-
 4 files changed, 88 insertions(+), 42 deletions(-)

diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py
index 8430f528c..c344a5cba 100644
--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
@@ -84,18 +84,18 @@ def reset_parameters(self):
 
 
   def forward(self, input):
-    weights = [torch.empty_like(self.weight) for tp in range(mpu.get_tensor_model_parallel_world_size())]
-    torch.distributed.all_gather(weights, self.weight, group=mpu.get_tensor_model_parallel_group())
-    biases = [torch.empty_like(self.bias) for tp in range(mpu.get_tensor_model_parallel_world_size())]
-    torch.distributed.all_gather(biases, self.bias, group=mpu.get_tensor_model_parallel_group())
-    if any(torch.any(weight != self.weight) for weight in weights):
-        if mpu.get_tensor_model_parallel_rank() == 0:
-            print("Weight sync failed")
-            print(weights)
-    if any(torch.any(bias != self.bias) for bias in biases):
-        if mpu.get_tensor_model_parallel_rank() == 0:
-            print("Bias sync failed")
-            print(biases)
+#    weights = [torch.empty_like(self.weight) for tp in range(mpu.get_tensor_model_parallel_world_size())]
+#    torch.distributed.all_gather(weights, self.weight, group=mpu.get_tensor_model_parallel_group())
+#    biases = [torch.empty_like(self.bias) for tp in range(mpu.get_tensor_model_parallel_world_size())]
+#    torch.distributed.all_gather(biases, self.bias, group=mpu.get_tensor_model_parallel_group())
+#    if any(torch.any(weight != self.weight) for weight in weights):
+#        if mpu.get_tensor_model_parallel_rank() == 0:
+#            print("Weight sync failed")
+#            print(weights)
+#    if any(torch.any(bias != self.bias) for bias in biases):
+#        if mpu.get_tensor_model_parallel_rank() == 0:
+#            print("Bias sync failed")
+#            print(biases)
 
     return FusedLayerNormAffineFunction.apply(
       input, self.weight, self.bias, self.normalized_shape,self.eps)
diff --git a/megatron/training.py b/megatron/training.py
index 84fd4eb9d..96426b401 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -412,8 +412,30 @@ def setup_model_and_optimizer(model_provider_func):
         torch.distributed.barrier()
         timers('load-checkpoint').stop()
         timers.log(['load-checkpoint'])
+        print_rank_0(f'module = {model[0]}')
+        for layer_id in ['3', '4']:
+            if hasattr(model[0].module._modules[layer_id], 'input_layernorm'):
+                weight = model[0].module._modules[layer_id].input_layernorm.weight
+                print(f'rank {torch.distributed.get_rank()} before reduce weight = {weight}') 
+                torch.distributed.all_reduce(weight, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
+                print(f'rank {torch.distributed.get_rank()} after reduce weight = {weight}') 
+
+                if weight._hp_mapping is not None:
+                    print(f'rank {torch.distributed.get_rank()} fixing hp for input_layernorm')
+                    #weight._hp_mapping.update_hp()
+                    hp = weight._hp_mapping.hp_fragment
+                    torch.distributed.all_reduce(hp, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
+                
+                    for key in ['exp_avg', 'exp_avg_sq']:
+                        optim_state = weight._hp_mapping.get_optim_state(key)
+                        print(f'rank {torch.distributed.get_rank()} before reduce optim state {key} = {optim_state}') 
+                        torch.distributed.all_reduce(optim_state, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
+                        print(f'rank {torch.distributed.get_rank()} after reduce optim state {key} = {optim_state}') 
+
     else:
         args.iteration = 0
+    
+    torch.distributed.barrier()
 
     # We only support local DDP with multiple micro-batches.
     if len(model) > 1 or mpu.get_pipeline_model_parallel_world_size() > 1:
diff --git a/run_bf16.sh b/run_bf16.sh
index fd3a48398..f4295cc30 100755
--- a/run_bf16.sh
+++ b/run_bf16.sh
@@ -36,26 +36,6 @@ ZERO_STAGE=0
 #GLOBAL_BATCH=128
 #WORKER_STR="-i worker-0"
 
-
-TP=1
-PP=1
-DP=2
-WORLD_SIZE=$((TP*PP*DP))
-HIDDEN=1024
-LAYERS=24
-SEQ=1024
-GLOBAL_BATCH=1
-WORKER_STR=""
-
-MICRO_BATCH=1
-
-LR=6.0e-4
-MIN_LR=6.0e-5
-DTYPE="bf16"
-EXP_DIR=${HOME}/experiments/results/bf16
-LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_fix3"
-mkdir -p $LOG_DIR
-
 while [[ $# -gt 0 ]]
 do
 key="$1"
@@ -66,30 +46,69 @@ case $key in
     ;;
     -z|--zero-stage)
     ZERO_STAGE=$2;
+    shift    
     shift
     ;;
     *)
-    echo "Unknown argument(s)"
-    usage
+    echo "Unknown argument(s): $key"
     exit 1
     shift
     ;;
 esac
 done
 
+TP=4
+PP=1
+DP=2
+WORLD_SIZE=$((TP*PP*DP))
+
+HIDDEN=1024
+LAYERS=24
+NHEADS=32
+SEQ=1024
+
+#LAYERS=2
+#HIDDEN=8
+#NHEADS=2
+#SEQ=8
+
+GLOBAL_BATCH=64
+WORKER_STR=""
+EXIT_ITERS=10
+TRAIN_SAMPLES=1000000
 
+MICRO_BATCH=32
+LR=1.0e-1
+MIN_LR=1.0e-1
+DTYPE="bf16"
+RUN_VERSION=1
+EXP_DIR=${HOME}/experiments/results/bf16
+RUN_TAG="tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_train_${EXIT_ITERS}_v${RUN_VERSION}"
+LOG_DIR="${EXP_DIR}/tensorboard/${RUN_TAG}"
+mkdir -p $LOG_DIR
+export BIT16_DUMP_FILE="${EXP_DIR}/${RUN_TAG}.txt"
+CHECKPOINT_DIR="./checkpoints/${DTYPE}_z${ZERO_STAGE}_tp${TP}_pp${PP}_dp${DP}_nl${LAYERS}_exit_${EXIT_ITERS}_v${RUN_VERSION}"
 options=" \
 	--tensor-model-parallel-size $TP \
 	--pipeline-model-parallel-size $PP \
         --num-layers $LAYERS \
         --hidden-size $HIDDEN \
-        --num-attention-heads 32 \
+        --num-attention-heads ${NHEADS} \
         --seq-length $SEQ \
-        --loss-scale 12 \
         --max-position-embeddings $SEQ \
 	--micro-batch-size $MICRO_BATCH \
 	--global-batch-size $GLOBAL_BATCH \
-	--train-iters 1000 \
+        --optimizer adam \
+        --adam-eps 1e-8 \
+        --lr-warmup-samples 5 \
+        --min-lr 1e-6 \
+        --lr-decay-style cosine \
+        --lr-decay-samples 12 \
+        --override-lr-scheduler \
+        --clip-grad 1.0 \
+        --weight-decay 1e-1 \
+        --embed-layernorm \
+        --partition-activations \
         --lr $LR \
 	--min-lr $MIN_LR \
         --lr-decay-style cosine \
@@ -100,17 +119,21 @@ options=" \
 	--vocab-file ${VOCAB_PATH} \
 	--merge-file ${MERGE_PATH} \
 	--save-interval 10000 \
-        --split 98,2,0 \
-        --clip-grad 1.0 \
 	--weight-decay 0.1 \
 	--adam-beta1 0.9 \
 	--adam-beta2 0.95 \
 	--init-method-std 0.006 \
         --${DTYPE} \
 	--checkpoint-activations \
-	--exit-interval 10000 \
+        --train-samples ${TRAIN_SAMPLES} \
+	--exit-interval ${EXIT_ITERS} \
+        --seed 42 \
+        --load ${CHECKPOINT_DIR} \
+        --save ${CHECKPOINT_DIR} \
 	--tensorboard-dir $LOG_DIR
         "
+#        --split 10,0,0 \
+#         --rampup-batch-size 2 2 1_000 \
 
 
 if [[ ${USE_DEEPSPEED} -eq 1 ]]; then
@@ -155,7 +178,8 @@ WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE"
 #WORKER_STR="-i worker-0:0,1,2,3"
 #run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}"
 #run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}"
-run_cmd="deepspeed --master_port 29700 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}"
+
+run_cmd="deepspeed --master_port 29600 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}"
 
 
 echo ${run_cmd}
diff --git a/tests/test_training.py b/tests/test_training.py
index 65067982e..b65a051e5 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -676,7 +676,7 @@ def test_layer_norm_consistent(self, variation):
         launcher = get_launcher(num_gpus)
         cmd = launcher + script + args + ds_args
         # keep for quick debug
-        # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
+        print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
 
         with CaptureStdout() as cs:
             execute_subprocess_async(cmd, env=self.get_env())

From 8f2ea60b7f44a81aa5655ae6d819c8c3f5389405 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas@stason.org>
Date: Mon, 28 Mar 2022 13:13:17 -0700
Subject: [PATCH 28/38] fix requirements

---
 requirements.txt | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index da76b5e44..47e11bf04 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,9 +6,11 @@ pybind11
 regex
 six
 tensorboard
-torch>=1.7
+torch>=1.11
 transformers
-DeepSpeed @ git+https://github.com/microsoft/DeepSpeed.git
+# for now using this branch for bf16 work
+DeepSpeed @ git+https://github.com/microsoft/DeepSpeed.git@olruwase/bf16-updates
+#DeepSpeed @ git+https://github.com/microsoft/DeepSpeed.git
 # versions from HF transformers
 black==21.4b0
 isort>=5.5.4

From fc8f813df1cef6a90472fab504bb7a5509237099 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas@stason.org>
Date: Mon, 28 Mar 2022 18:07:22 -0700
Subject: [PATCH 29/38] dynamically discovered layer norm weights / refactor

---
 megatron/training.py | 80 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 61 insertions(+), 19 deletions(-)

diff --git a/megatron/training.py b/megatron/training.py
index 96426b401..bf500ae1d 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -367,6 +367,63 @@ def get_learning_rate_scheduler(optimizer):
     return lr_scheduler
 
 
+def sync_layer_norm(n, p):
+
+    rank = torch.distributed.get_rank()
+
+    print(f'rank {rank} processing {n}')
+
+    #return
+
+    # 1. bf16
+    #print(f'rank {rank} before reduce p = {p}')
+    torch.distributed.all_reduce(p, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
+    #print(f'rank {rank} after reduce p = {p}')
+
+    if p._hp_mapping is not None:
+        #print(f'rank {rank} fixing hp for input_layernorm')
+        #p._hp_mapping.update_hp()
+
+        # 2. fp32
+        hp = p._hp_mapping.hp_fragment
+        torch.distributed.all_reduce(hp, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
+
+        # 3. optim states
+        for key in ['exp_avg', 'exp_avg_sq']:
+            optim_state = p._hp_mapping.get_optim_state(key)
+            #print(f'rank {rank} before reduce optim state {key} = {optim_state}')
+            torch.distributed.all_reduce(optim_state, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
+            #print(f'rank {rank} after reduce optim state {key} = {optim_state}')
+
+
+def sync_all_layer_norms(model):
+    # syncs weight+bias for each of the following layer norms (via averaging across TP ranks)
+    # 1. word embedding front word_embeddings.norm
+    # 2. transformer block input_layernorm x 70
+    # 3. transformer block post_attention_layernorm x 70
+    # 4. word embedding head - I think it's just weight + bias w/o a proper name in the last layer file layer_0X-model_0X-model_states.pt, see: https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/affff3d2927864c6948075700c672971782441f4/megatron/model/gpt_model.py#L267
+
+    import re
+    layer_norms_params_end_with = [
+        "word_embeddings.norm.weight", "word_embeddings.norm.bias",
+        "input_layernorm.weight", "input_layernorm.bias",
+        "post_attention_layernorm.weight", "post_attention_layernorm.bias"]
+
+    for n,p in model.named_parameters():
+        #print(n)
+        # XXX: would be much simpler to re-do this logic to traverse children modules and act on isinstance of MixedFusedLayerNorm instead
+        # 1. first easy to identify layer norm params as they have a unique prefix each
+        for end in layer_norms_params_end_with:
+            if n.endswith(end):
+                sync_layer_norm(n, p)
+
+        # 2. now the last layer norm that has no prefix
+        # hack: (\d\d): MixedFusedLayerNorm() is hanging there w/o any prefix name, so need to match something like:
+        # /^6.weight$/ or /^6.bias$/
+        if mpu.is_pipeline_last_stage() and re.match('^\d+\.(weight|bias)$', n):
+            sync_layer_norm(n, p)
+
+
 def setup_model_and_optimizer(model_provider_func):
     """Setup model and optimizer."""
     args = get_args()
@@ -413,28 +470,13 @@ def setup_model_and_optimizer(model_provider_func):
         timers('load-checkpoint').stop()
         timers.log(['load-checkpoint'])
         print_rank_0(f'module = {model[0]}')
-        for layer_id in ['3', '4']:
-            if hasattr(model[0].module._modules[layer_id], 'input_layernorm'):
-                weight = model[0].module._modules[layer_id].input_layernorm.weight
-                print(f'rank {torch.distributed.get_rank()} before reduce weight = {weight}') 
-                torch.distributed.all_reduce(weight, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
-                print(f'rank {torch.distributed.get_rank()} after reduce weight = {weight}') 
-
-                if weight._hp_mapping is not None:
-                    print(f'rank {torch.distributed.get_rank()} fixing hp for input_layernorm')
-                    #weight._hp_mapping.update_hp()
-                    hp = weight._hp_mapping.hp_fragment
-                    torch.distributed.all_reduce(hp, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
-                
-                    for key in ['exp_avg', 'exp_avg_sq']:
-                        optim_state = weight._hp_mapping.get_optim_state(key)
-                        print(f'rank {torch.distributed.get_rank()} before reduce optim state {key} = {optim_state}') 
-                        torch.distributed.all_reduce(optim_state, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
-                        print(f'rank {torch.distributed.get_rank()} after reduce optim state {key} = {optim_state}') 
 
+        # turn on to enable layer norm syncing
+        if 1:
+            sync_all_layer_norms(model[0].module)
     else:
         args.iteration = 0
-    
+
     torch.distributed.barrier()
 
     # We only support local DDP with multiple micro-batches.

From 4443e6d2ff7a88399ba055995eb79eb329d79a7c Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas@stason.org>
Date: Mon, 28 Mar 2022 18:18:28 -0700
Subject: [PATCH 30/38] fix regex

---
 megatron/training.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/training.py b/megatron/training.py
index bf500ae1d..a3ece5388 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -420,7 +420,7 @@ def sync_all_layer_norms(model):
         # 2. now the last layer norm that has no prefix
         # hack: (\d\d): MixedFusedLayerNorm() is hanging there w/o any prefix name, so need to match something like:
         # /^6.weight$/ or /^6.bias$/
-        if mpu.is_pipeline_last_stage() and re.match('^\d+\.(weight|bias)$', n):
+        if mpu.is_pipeline_last_stage() and re.match(r'^\d+\.(weight|bias)$', n):
             sync_layer_norm(n, p)
 
 

From d2aa4f18c6bac3c2b101e5b19c3ad509cbc1cf04 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas@stason.org>
Date: Mon, 28 Mar 2022 18:30:06 -0700
Subject: [PATCH 31/38] add the test script

---
 compare_tp_weights.py | 83 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)
 create mode 100644 compare_tp_weights.py

diff --git a/compare_tp_weights.py b/compare_tp_weights.py
new file mode 100644
index 000000000..c7e5779bf
--- /dev/null
+++ b/compare_tp_weights.py
@@ -0,0 +1,83 @@
+
+# usage:
+# python compare_tp_weights.py input_layernorm.weight 40 2 .
+
+# input_layernorm.weight
+# input_layernorm.bias
+# post_attention_layernorm.weight
+# post_attention_layernorm.bias
+
+# one liner for just 2 weights comparison
+# python -c 'import torch, sys; k=sys.argv[1]; a,b = map(torch.load, sys.argv[2:4]); print("Exact match" if torch.testing.assert_close(a[k], b[k], rtol=0.0, atol=0.0, check_device=False) is None else "Mismatch")' input_layernorm.weight layer_03-model_00-model_states.pt layer_03-model_01-model_states.pt
+
+# 13B
+# cd /gpfsdsstore/projects/rech/six/commun/checkpoints/tr1-13B/tr1-13B-with-optim/global_step168000
+# python ~/compare_tp_weights.py input_layernorm.weight 40 2 .
+
+# 104B
+# cd /gpfsssd/scratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/emb-norm/global_step16800
+#
+# python ~/compare_tp_weights.py input_layernorm.weight 64 4 .          > ~/104B.input_layernorm.weight.txt
+# python ~/compare_tp_weights.py post_attention_layernorm.weight 64 4 . > ~/104B.post_attention_layernorm.weight.txt
+# python ~/compare_tp_weights.py input_layernorm.bias 64 4 .            > ~/104B.input_layernorm.bias.txt
+# python ~/compare_tp_weights.py post_attention_layernorm.bias 64 4 .   > ~/104B.post_attention_layernorm.bias.txt
+
+# other 104B checkpoints:
+
+# cd /gpfsssd/scratch/rech/six/commun/checkpoints/tr8b-104B/to-back-up/tr8b-104B/checkpoints/cl-exp-02/global_step10500
+# mismatched 68
+#
+# cd /gpfsssd/scratch/rech/six/commun/checkpoints/tr8-104B-wide/experiment11/global_step15660
+# mismatched
+#
+# cd /gpfsssd/scratch/rech/six/commun/checkpoints/tr8-104B-wide/experiment06/global_step5100
+# python ~/compare_tp_weights.py input_layernorm.weight 32 4
+# **all matched**
+#
+# python ~/compare_tp_weights.py post_attention_layernorm.weight 32 4
+# not matched
+
+
+
+# # 104B/176B embed-norm check
+# python -c 'import torch, sys; k=sys.argv[1]; a,b = map(torch.load, sys.argv[2:4]); print("Exact match" if torch.testing.assert_close(a[k], b[k], rtol=0.0, atol=0.0, check_device=False) is None else "Mismatch")' word_embeddings.norm.weight layer_01-model_00-model_states.pt layer_01-model_01-model_states.pt
+# python -c 'import torch, sys; k=sys.argv[1]; a,b = map(torch.load, sys.argv[2:4]); print("Exact match" if torch.testing.assert_close(a[k], b[k], rtol=0.0, atol=0.0, check_device=False) is None else "Mismatch")' word_embeddings.norm.weight layer_01-model_01-model_states.pt layer_01-model_02-model_states.pt
+# python -c 'import torch, sys; k=sys.argv[1]; a,b = map(torch.load, sys.argv[2:4]); print("Exact match" if torch.testing.assert_close(a[k], b[k], rtol=0.0, atol=0.0, check_device=False) is None else "Mismatch")' word_embeddings.norm.weight layer_01-model_02-model_states.pt layer_01-model_03-model_states.pt
+
+
+# # 176B
+# cd /gpfsssd/scratch/rech/six/commun/checkpoints/tr11-176B-ml/checkpoints/main/global_step16400
+# python ~/compare_tp_weights.py input_layernorm.weight 70 4 .          > ~/176B.input_layernorm.weight.txt
+# python ~/compare_tp_weights.py post_attention_layernorm.weight 70 4 . > ~/176B.post_attention_layernorm.weight.txt
+# python ~/compare_tp_weights.py input_layernorm.bias 70 4 .            > ~/176B.input_layernorm.bias.txt
+# python ~/compare_tp_weights.py post_attention_layernorm.bias 70 4 .   > ~/176B.post_attention_layernorm.bias.txt
+
+
+import torch, sys
+
+
+
+key, nlayers, tp_size, checkpoint_dir = sys.argv[1:5]
+
+print(f"checking key={key}")
+matched, mismatched = 0, 0
+for layer_id in range(int(nlayers)):
+    for tp in range(int(tp_size)-1):
+        f1 = f"{checkpoint_dir}/layer_{3+layer_id:02d}-model_{tp:02d}-model_states.pt"
+        f2 = f"{checkpoint_dir}/layer_{3+layer_id:02d}-model_{tp+1:02d}-model_states.pt"
+        c1 = torch.load(f1)
+        c2 = torch.load(f2)
+        # print(f1)
+        # print(f2)
+        header = f"layer_id={layer_id}: {tp}-{tp+1}"
+        try:
+            torch.testing.assert_close(c1[key], c2[key], rtol=0.0, atol=0.0, check_device=False)
+            print(f"✓ {header}")
+            matched += 1
+        except:
+            print(f"✗ {header}")
+            mismatched += 1
+            #raise
+
+print(f"Matched   : {matched}")
+print(f"Mismatched: {mismatched}")

From d64a947e6a31a7d968d493b2347461f15ef99f15 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas@stason.org>
Date: Tue, 29 Mar 2022 08:16:16 -0700
Subject: [PATCH 32/38] compare on cpu

---
 compare_tp_weights.py              |  2 ++
 compare_tp_weights_cpu.py          | 42 ++++++++++++++++++++++++++++++
 megatron/model/fused_layer_norm.py | 10 ++++++-
 3 files changed, 53 insertions(+), 1 deletion(-)
 create mode 100644 compare_tp_weights_cpu.py

diff --git a/compare_tp_weights.py b/compare_tp_weights.py
index c7e5779bf..69675e64c 100644
--- a/compare_tp_weights.py
+++ b/compare_tp_weights.py
@@ -44,6 +44,8 @@
 # python -c 'import torch, sys; k=sys.argv[1]; a,b = map(torch.load, sys.argv[2:4]); print("Exact match" if torch.testing.assert_close(a[k], b[k], rtol=0.0, atol=0.0, check_device=False) is None else "Mismatch")' word_embeddings.norm.weight layer_01-model_01-model_states.pt layer_01-model_02-model_states.pt
 # python -c 'import torch, sys; k=sys.argv[1]; a,b = map(torch.load, sys.argv[2:4]); print("Exact match" if torch.testing.assert_close(a[k], b[k], rtol=0.0, atol=0.0, check_device=False) is None else "Mismatch")' word_embeddings.norm.weight layer_01-model_02-model_states.pt layer_01-model_03-model_states.pt
 
+# same on cpu
+# python -c 'import torch, sys; k=sys.argv[1]; a=torch.load(sys.argv[2], map_location=torch.device("cpu"));b=torch.load(sys.argv[3], map_location=torch.device("cpu")); print("Exact match" if torch.testing.assert_close(a[k], b[k], rtol=0.0, atol=0.0, check_device=False) is None else "Mismatch")' word_embeddings.norm.weight layer_01-model_00-model_states.pt layer_01-model_01-model_states.pt
 
 # # 176B
 # cd /gpfsssd/scratch/rech/six/commun/checkpoints/tr11-176B-ml/checkpoints/main/global_step16400
diff --git a/compare_tp_weights_cpu.py b/compare_tp_weights_cpu.py
new file mode 100644
index 000000000..7c07bcabb
--- /dev/null
+++ b/compare_tp_weights_cpu.py
@@ -0,0 +1,42 @@
+
+# usage:
+# python compare_tp_weights_cpu.py input_layernorm.weight 40 2 .
+
+
+# 13B
+# cd /gpfsdsstore/projects/rech/six/commun/checkpoints/tr1-13B/tr1-13B-with-optim/global_step168000
+# python ~/compare_tp_weights_cpu.py input_layernorm.weight 40 2 .
+
+# 104B
+# cd /gpfsssd/scratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/emb-norm/global_step16800
+# python ~/compare_tp_weights_cpu.py input_layernorm.weight 64 4 .
+
+
+import torch, sys
+
+
+
+key, nlayers, tp_size, checkpoint_dir = sys.argv[1:5]
+
+print(f"checking key={key}")
+matched, mismatched = 0, 0
+for layer_id in range(int(nlayers)):
+    for tp in range(int(tp_size)-1):
+        f1 = f"{checkpoint_dir}/layer_{3+layer_id:02d}-model_{tp:02d}-model_states.pt"
+        f2 = f"{checkpoint_dir}/layer_{3+layer_id:02d}-model_{tp+1:02d}-model_states.pt"
+        c1 = torch.load(f1, map_location=torch.device('cpu'))
+        c2 = torch.load(f2, map_location=torch.device('cpu'))
+        # print(f1)
+        # print(f2)
+        header = f"layer_id={layer_id}: {tp}-{tp+1}"
+        try:
+            torch.testing.assert_close(c1[key], c2[key], rtol=0.0, atol=0.0, check_device=False)
+            print(f"✓ {header}")
+            matched += 1
+        except:
+            print(f"✗ {header}")
+            mismatched += 1
+            #raise
+
+print(f"Matched   : {matched}")
+print(f"Mismatched: {mismatched}")
diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py
index c344a5cba..613734100 100644
--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
@@ -83,7 +83,7 @@ def reset_parameters(self):
     init.zeros_(self.bias)
 
 
-  def forward(self, input):
+  def forward_old(self, input):
 #    weights = [torch.empty_like(self.weight) for tp in range(mpu.get_tensor_model_parallel_world_size())]
 #    torch.distributed.all_gather(weights, self.weight, group=mpu.get_tensor_model_parallel_group())
 #    biases = [torch.empty_like(self.bias) for tp in range(mpu.get_tensor_model_parallel_world_size())]
@@ -100,3 +100,11 @@ def forward(self, input):
     return FusedLayerNormAffineFunction.apply(
       input, self.weight, self.bias, self.normalized_shape,self.eps)
 
+
+  def forward(self, input):
+
+    torch.distributed.all_reduce(self.weight, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
+    torch.distributed.all_reduce(self.bias, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
+
+    return FusedLayerNormAffineFunction.apply(
+      input, self.weight, self.bias, self.normalized_shape, self.eps)

From bf7eeb3a8bef1913153b5cd5fa2bc1bf667663c3 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas@stason.org>
Date: Tue, 29 Mar 2022 09:45:58 -0700
Subject: [PATCH 33/38] add 2 more weights to sync

---
 megatron/training.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/megatron/training.py b/megatron/training.py
index a3ece5388..b1d2b9b60 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -407,7 +407,9 @@ def sync_all_layer_norms(model):
     layer_norms_params_end_with = [
         "word_embeddings.norm.weight", "word_embeddings.norm.bias",
         "input_layernorm.weight", "input_layernorm.bias",
-        "post_attention_layernorm.weight", "post_attention_layernorm.bias"]
+        "post_attention_layernorm.weight", "post_attention_layernorm.bias",
+        "self_attention.dense.bias", "mlp.dense_4h_to_h.bias",
+    ]
 
     for n,p in model.named_parameters():
         #print(n)

From 8482595623b4d0ba8b548cea06ecaf9b0a3e39c0 Mon Sep 17 00:00:00 2001
From: Olatunji Ruwase <olruwase@microsoft.com>
Date: Wed, 30 Mar 2022 06:37:01 +0500
Subject: [PATCH 34/38] fp32 accessors

---
 megatron/training.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/megatron/training.py b/megatron/training.py
index b1d2b9b60..a5e64eaec 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -375,11 +375,22 @@ def sync_layer_norm(n, p):
 
     #return
 
+    fp32_param = p.get_full_hp_param()
+    torch.set_printoptions(sci_mode=False, precision=6)    
+    print(f'rank {rank} bf16 = {p}')
+    print(f'rank {rank} fp32 = {fp32_param}')
+    torch.testing.assert_close(p, fp32_param, rtol=4e-3, atol=0, check_dtype=False)
+
+    for key in ['exp_avg', 'exp_avg_sq']:
+        full_optim_state = p.get_full_hp_param(optim_state_key=key)
+        print(f'rank {rank} full optim state {key} = {full_optim_state}')
+
     # 1. bf16
     #print(f'rank {rank} before reduce p = {p}')
     torch.distributed.all_reduce(p, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
     #print(f'rank {rank} after reduce p = {p}')
 
+
     if p._hp_mapping is not None:
         #print(f'rank {rank} fixing hp for input_layernorm')
         #p._hp_mapping.update_hp()
@@ -390,10 +401,10 @@ def sync_layer_norm(n, p):
 
         # 3. optim states
         for key in ['exp_avg', 'exp_avg_sq']:
-            optim_state = p._hp_mapping.get_optim_state(key)
-            #print(f'rank {rank} before reduce optim state {key} = {optim_state}')
-            torch.distributed.all_reduce(optim_state, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
-            #print(f'rank {rank} after reduce optim state {key} = {optim_state}')
+            optim_state_fragment = p._hp_mapping.get_optim_state_fragment(key)
+            #print(f'rank {rank} before reduce optim state fragment {key} = {optim_state_fragment}')
+            torch.distributed.all_reduce(optim_state_fragment, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
+            #print(f'rank {rank} after reduce optim state fragment {key} = {optim_state_fragment}')
 
 
 def sync_all_layer_norms(model):

From 86b726cbd48e8c73f955fffe244d31c58932748c Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas@stason.org>
Date: Tue, 29 Mar 2022 18:57:27 -0700
Subject: [PATCH 35/38] improve the doc, and comment out the demo

---
 megatron/training.py | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/megatron/training.py b/megatron/training.py
index a5e64eaec..d40415797 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -375,15 +375,22 @@ def sync_layer_norm(n, p):
 
     #return
 
-    fp32_param = p.get_full_hp_param()
-    torch.set_printoptions(sci_mode=False, precision=6)    
-    print(f'rank {rank} bf16 = {p}')
-    print(f'rank {rank} fp32 = {fp32_param}')
-    torch.testing.assert_close(p, fp32_param, rtol=4e-3, atol=0, check_dtype=False)
-
-    for key in ['exp_avg', 'exp_avg_sq']:
-        full_optim_state = p.get_full_hp_param(optim_state_key=key)
-        print(f'rank {rank} full optim state {key} = {full_optim_state}')
+    # # Here is how you can access fp32 version of the bf16 param and fp32 optim states
+    # #
+    # # Note that there is an all_gather called on all dp ranks when `get_full_hp_param` is called -
+    # # so it's not free
+    # #
+    # # a. fp32 param
+    # fp32_param = p.get_full_hp_param()
+    # torch.set_printoptions(sci_mode=False, precision=6)
+    # print(f'rank {rank} bf16 = {p}')
+    # print(f'rank {rank} fp32 = {fp32_param}')
+    # torch.testing.assert_close(p, fp32_param, rtol=4e-3, atol=0, check_dtype=False)
+
+    # # b. fp32 optim states
+    # for key in ['exp_avg', 'exp_avg_sq']:
+    #     full_optim_state = p.get_full_hp_param(optim_state_key=key)
+    #     print(f'rank {rank} full optim state {key} = {full_optim_state}')
 
     # 1. bf16
     #print(f'rank {rank} before reduce p = {p}')

From 2ac141b1617b1bb5c175cd8fb34c0ce760b7c22b Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas@stason.org>
Date: Tue, 29 Mar 2022 19:14:08 -0700
Subject: [PATCH 36/38] typo

---
 megatron/training.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/training.py b/megatron/training.py
index d40415797..211ebbda4 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -377,7 +377,7 @@ def sync_layer_norm(n, p):
 
     # # Here is how you can access fp32 version of the bf16 param and fp32 optim states
     # #
-    # # Note that there is an all_gather called on all dp ranks when `get_full_hp_param` is called -
+    # # Note that there is an all_reduce called on all dp ranks when `get_full_hp_param` is called -
     # # so it's not free
     # #
     # # a. fp32 param

From d576775c34d7680b8c5a9b685de03a91b9a66448 Mon Sep 17 00:00:00 2001
From: Thomas Wang <24695242+thomasw21@users.noreply.github.com>
Date: Wed, 6 Apr 2022 18:42:36 +0200
Subject: [PATCH 37/38] Sync torch_rng_state (#277)

---
 megatron/data/data_samplers.py |  1 +
 megatron/training.py           | 11 +++++++++++
 2 files changed, 12 insertions(+)

diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py
index 1cbeac312..b933ff34e 100644
--- a/megatron/data/data_samplers.py
+++ b/megatron/data/data_samplers.py
@@ -52,6 +52,7 @@ def build_pretraining_data_loader(dataset, consumed_samples):
     return torch.utils.data.DataLoader(dataset,
                                        batch_sampler=batch_sampler,
                                        num_workers=args.num_workers,
+                                       generator=torch.Generator().manual_seed(args.seed),
                                        pin_memory=True)
 
 class MegatronPretrainingSampler:
diff --git a/megatron/training.py b/megatron/training.py
index 211ebbda4..cf148b2d7 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -443,6 +443,16 @@ def sync_all_layer_norms(model):
         if mpu.is_pipeline_last_stage() and re.match(r'^\d+\.(weight|bias)$', n):
             sync_layer_norm(n, p)
 
+def sync_all_torch_random_state():
+    torch_rng_state = torch.get_rng_state()
+    # We use rank 1 as source of truth and send the new state to the other ranks
+    torch.distributed.broadcast(
+        torch_rng_state,
+        src=mpu.get_tensor_model_parallel_src_rank() + 1,
+        group=mpu.get_tensor_model_parallel_group()
+    )
+    torch.set_rng_state(torch_rng_state)
+
 
 def setup_model_and_optimizer(model_provider_func):
     """Setup model and optimizer."""
@@ -494,6 +504,7 @@ def setup_model_and_optimizer(model_provider_func):
         # turn on to enable layer norm syncing
         if 1:
             sync_all_layer_norms(model[0].module)
+            sync_all_torch_random_state()
     else:
         args.iteration = 0
 

From 475f3730200261449085deea28330b861e7820fe Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Wed, 6 Apr 2022 19:10:36 +0200
Subject: [PATCH 38/38] Fix device issue when using torch.broadcast

---
 megatron/training.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/megatron/training.py b/megatron/training.py
index cf148b2d7..bd9fa8e99 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -444,14 +444,14 @@ def sync_all_layer_norms(model):
             sync_layer_norm(n, p)
 
 def sync_all_torch_random_state():
-    torch_rng_state = torch.get_rng_state()
+    torch_rng_state = torch.get_rng_state().cuda()
     # We use rank 1 as source of truth and send the new state to the other ranks
     torch.distributed.broadcast(
         torch_rng_state,
         src=mpu.get_tensor_model_parallel_src_rank() + 1,
         group=mpu.get_tensor_model_parallel_group()
     )
-    torch.set_rng_state(torch_rng_state)
+    torch.set_rng_state(torch_rng_state.cpu())
 
 
 def setup_model_and_optimizer(model_provider_func):