From 8d7a603821d665ba929e9540dd973a3af43be972 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 00:39:36 +0100 Subject: [PATCH 01/38] WIP --- tests/test_tensor_parallel.py | 1 - tests/test_training.py | 25 ++++++++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 25921c12a..ed383e17a 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -293,6 +293,5 @@ def test_tokenizer_raise_error_make_vocab_size_divisible_by(self): self.assertEqual(str(exc_info.value), "5121 is not divisible by 128") - if __name__ == '__main__': unittest.main() diff --git a/tests/test_training.py b/tests/test_training.py index c77cb9af2..7115ef86d 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -50,7 +50,7 @@ def get_3d_dimensions(): dp_size = 2 pp_size = 2 tp_size = 2 - if num_gpus >= 4: + elif num_gpus >= 4: dp_size = 1 pp_size = 2 tp_size = 2 @@ -592,3 +592,26 @@ def test_skip_train_iteration(self): train_iterations = range(1,10) for i in train_iterations: self.assertTrue(f"iteration {i:8d}/" in cs.out) + + def test_layer_norm_consistent(self): + # skip iterations setup + extra_args = f""" + --skip-train-iteration-range 2-2 4-7 + """.split() + + src_dir = self.src_dir + output_dir = self.get_auto_remove_tmp_dir() + args, ds_args, num_gpus = self.get_variation_config("base", output_dir, n_samples=200) + args.extend(extra_args) + script = [f"{src_dir}/pretrain_gpt.py"] + launcher = get_launcher(num_gpus) + cmd = launcher + script + args + ds_args + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + + with CaptureStdout() as cs: + execute_subprocess_async(cmd, env=self.get_env()) + + checkpoint_path = os.path.join(output_dir, "checkpoints") + print(os.listdir(checkpoint_path)) + assert False From 240f673e298945f592b8fbe92661930b57897c50 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 11:25:11 +0100 Subject: [PATCH 02/38] Wip --- megatron/mpu/mappings.py | 2 ++ tests/test_training.py | 14 +++++++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/megatron/mpu/mappings.py b/megatron/mpu/mappings.py index 821d9acfe..6056f94f6 100644 --- a/megatron/mpu/mappings.py +++ b/megatron/mpu/mappings.py @@ -82,6 +82,7 @@ def symbolic(graph, input_): @staticmethod def forward(ctx, input_): + # TODO: we need to assert that the input_ are all the same within a group return input_ @staticmethod @@ -102,6 +103,7 @@ def forward(ctx, input_): @staticmethod def backward(ctx, grad_output): + # TODO: we need to assert that the grad_output are all the same within a group return grad_output diff --git a/tests/test_training.py b/tests/test_training.py index 7115ef86d..eb6ce9b72 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -20,6 +20,8 @@ import re import unittest from pathlib import Path + +import torch from parameterized import parameterized from megatron.testing_utils import ( @@ -594,15 +596,9 @@ def test_skip_train_iteration(self): self.assertTrue(f"iteration {i:8d}/" in cs.out) def test_layer_norm_consistent(self): - # skip iterations setup - extra_args = f""" - --skip-train-iteration-range 2-2 4-7 - """.split() - src_dir = self.src_dir output_dir = self.get_auto_remove_tmp_dir() args, ds_args, num_gpus = self.get_variation_config("base", output_dir, n_samples=200) - args.extend(extra_args) script = [f"{src_dir}/pretrain_gpt.py"] launcher = get_launcher(num_gpus) cmd = launcher + script + args + ds_args @@ -612,6 +608,10 @@ def test_layer_norm_consistent(self): with CaptureStdout() as cs: execute_subprocess_async(cmd, env=self.get_env()) - checkpoint_path = os.path.join(output_dir, "checkpoints") + checkpoint_path = os.path.join(output_dir, "checkpoints", "global_step_10") print(os.listdir(checkpoint_path)) + key="input_layernorm.weight" + files_to_test=["layer_03-model_00-model_states.pt", "layer_03-model_01-model_states.pt"] + weights = [torch.load(os.path.join(checkpoint_path,file))[key] for file in files_to_test] + torch.testing.assert_close(weights[0], weights[1], rtol=0.0, atol=0.0, check_device=False) assert False From 1cdcd7de2ece875988c7922c42e21c7af2ae1c99 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 11:27:35 +0100 Subject: [PATCH 03/38] Woops --- tests/test_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_training.py b/tests/test_training.py index eb6ce9b72..99ff13b84 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -608,7 +608,7 @@ def test_layer_norm_consistent(self): with CaptureStdout() as cs: execute_subprocess_async(cmd, env=self.get_env()) - checkpoint_path = os.path.join(output_dir, "checkpoints", "global_step_10") + checkpoint_path = os.path.join(output_dir, "checkpoints", "global_step10") print(os.listdir(checkpoint_path)) key="input_layernorm.weight" files_to_test=["layer_03-model_00-model_states.pt", "layer_03-model_01-model_states.pt"] From 29372806533f5ac09749729620bda958a28d6f6b Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 11:39:58 +0100 Subject: [PATCH 04/38] WIP --- tests/test_training.py | 66 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/tests/test_training.py b/tests/test_training.py index 99ff13b84..943649e32 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -598,7 +598,71 @@ def test_skip_train_iteration(self): def test_layer_norm_consistent(self): src_dir = self.src_dir output_dir = self.get_auto_remove_tmp_dir() - args, ds_args, num_gpus = self.get_variation_config("base", output_dir, n_samples=200) + num_gpus = 2 + args = f""" + --tensor-model-parallel-size {2} + --pipeline-model-parallel-size {1} + --distributed-backend nccl + + --log-interval 1 + --save-interval 10 + --eval-interval 10 + --eval-iters 5 + --checkpoint-activations + --partition-activations + --exit-interval {20} + + --merge-file {data_dir}/gpt2-tiny-merges.txt + --vocab-file {data_dir}/gpt2-tiny-vocab.json + --save {output_dir}/checkpoints + --load {output_dir}/checkpoints + --data-path {data_dir}/meg-gpt2-openwebtext_text_document + --tensorboard-dir {output_dir}/tensorboard + --tensorboard-queue-size 5 + --log-timers-to-tensorboard + --log-batch-size-to-tensorboard + --log-validation-ppl-to-tensorboard + + --num-layers 2 + --hidden-size 64 + --num-attention-heads 2 + --seq-length {seq_len} + --max-position-embeddings 1024 + --micro-batch-size 1 + --global-batch-size 16 + + --optimizer adam + --adam-beta1 0.9 + --adam-beta2 0.95 + --adam-eps 1e-8 + --lr 1e-4 + --lr-warmup-samples 5 + --clip-grad 1.0 + --weight-decay 1e-1 + --embed-layernorm + --fp16 + + --log-level debug + --log-level-replica info + + --rampup-batch-size 2 2 200 + --train-samples 200 + + --lr-decay-samples 6 + + """.split() + + ds_args = f""" + --deepspeed + --deepspeed_config {self.test_file_dir_str}/ds_config.json + --zero-stage 1 + --deepspeed-activation-checkpointing + --deepspeed_config {self.test_file_dir_str}/ds_config.json + + """.split() + + # args, ds_args, num_gpus = self.get_variation_config("base", output_dir, n_samples=200) + script = [f"{src_dir}/pretrain_gpt.py"] launcher = get_launcher(num_gpus) cmd = launcher + script + args + ds_args From 7fcff06bd3215082aeaeb935129b6866cd58019a Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 11:41:27 +0100 Subject: [PATCH 05/38] Woops --- tests/test_training.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_training.py b/tests/test_training.py index 943649e32..c34521963 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -599,6 +599,7 @@ def test_layer_norm_consistent(self): src_dir = self.src_dir output_dir = self.get_auto_remove_tmp_dir() num_gpus = 2 + data_dir = f"{self.data_dir}/gpt2" args = f""" --tensor-model-parallel-size {2} --pipeline-model-parallel-size {1} From 1f2f80072a5a2989461c9dbba5b55068d0f11da5 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 11:42:05 +0100 Subject: [PATCH 06/38] Woops --- tests/test_training.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_training.py b/tests/test_training.py index c34521963..70f161eba 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -599,6 +599,7 @@ def test_layer_norm_consistent(self): src_dir = self.src_dir output_dir = self.get_auto_remove_tmp_dir() num_gpus = 2 + seq_len = 128 data_dir = f"{self.data_dir}/gpt2" args = f""" --tensor-model-parallel-size {2} From f152e487d8c6dcb9caa95414aa2dc644b5f50984 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 11:50:19 +0100 Subject: [PATCH 07/38] Woops --- tests/test_training.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tests/test_training.py b/tests/test_training.py index 70f161eba..5a29a432e 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -674,10 +674,17 @@ def test_layer_norm_consistent(self): with CaptureStdout() as cs: execute_subprocess_async(cmd, env=self.get_env()) - checkpoint_path = os.path.join(output_dir, "checkpoints", "global_step10") - print(os.listdir(checkpoint_path)) - key="input_layernorm.weight" - files_to_test=["layer_03-model_00-model_states.pt", "layer_03-model_01-model_states.pt"] - weights = [torch.load(os.path.join(checkpoint_path,file))[key] for file in files_to_test] - torch.testing.assert_close(weights[0], weights[1], rtol=0.0, atol=0.0, check_device=False) + checkpoints = ["global_step10", "global_step20"] + files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [3,4]] + for checkpoint in checkpoints: + print(checkpoint) + checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) + print(os.listdir(checkpoint_path)) + key = "input_layernorm.weight" + for files in files_to_compare: + print(files) + weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] + ref = weights[0] + for weight in weights[1:]: + torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False) assert False From ce02dd16faee505ed7dc3beb1f49f9d5a0ba477f Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 11:52:14 +0100 Subject: [PATCH 08/38] Test with alibi --- tests/test_training.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_training.py b/tests/test_training.py index 5a29a432e..0f3133d64 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -652,6 +652,7 @@ def test_layer_norm_consistent(self): --lr-decay-samples 6 + --position-embedding-type alibi """.split() ds_args = f""" From 02365d145bd5887385abd359ebdc9e12ac9064cc Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 11:53:56 +0100 Subject: [PATCH 09/38] Still trying to reproduce --- tests/test_training.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_training.py b/tests/test_training.py index 0f3133d64..d6eeba220 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -688,4 +688,5 @@ def test_layer_norm_consistent(self): ref = weights[0] for weight in weights[1:]: torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False) + print(ref) assert False From 42d6b4e3be2c8bc7b496ac7ada5ce258c4d190a6 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 11:57:23 +0100 Subject: [PATCH 10/38] Huh --- tests/test_training.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tests/test_training.py b/tests/test_training.py index d6eeba220..98ad9117c 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -676,17 +676,19 @@ def test_layer_norm_consistent(self): execute_subprocess_async(cmd, env=self.get_env()) checkpoints = ["global_step10", "global_step20"] + keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", "post_attention_layernorm.bias"] files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [3,4]] for checkpoint in checkpoints: print(checkpoint) checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) print(os.listdir(checkpoint_path)) - key = "input_layernorm.weight" - for files in files_to_compare: - print(files) - weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] - ref = weights[0] - for weight in weights[1:]: - torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False) - print(ref) + for key in keys_to_compare: + print(key) + for files in files_to_compare: + print(files) + weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] + ref = weights[0] + for weight in weights[1:]: + torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False) + print(key, ref) assert False From c20c8ba466ae129ba3b81e2248b4e475494cc261 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 12:10:38 +0100 Subject: [PATCH 11/38] Have high LR to see weights actually change --- tests/test_training.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/test_training.py b/tests/test_training.py index 98ad9117c..cb258147a 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -630,15 +630,14 @@ def test_layer_norm_consistent(self): --num-attention-heads 2 --seq-length {seq_len} --max-position-embeddings 1024 - --micro-batch-size 1 + --micro-batch-size 2 --global-batch-size 16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.95 --adam-eps 1e-8 - --lr 1e-4 - --lr-warmup-samples 5 + --lr 1e-1 --clip-grad 1.0 --weight-decay 1e-1 --embed-layernorm @@ -650,8 +649,6 @@ def test_layer_norm_consistent(self): --rampup-batch-size 2 2 200 --train-samples 200 - --lr-decay-samples 6 - --position-embedding-type alibi """.split() From 7f2441edb6e23b6f923ac88e4f1c2e4bce19a028 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 14:08:48 +0100 Subject: [PATCH 12/38] Launch bf16 --- tests/ds_config_bf16.json | 13 +++++++++++++ tests/test_training.py | 6 +++--- 2 files changed, 16 insertions(+), 3 deletions(-) create mode 100644 tests/ds_config_bf16.json diff --git a/tests/ds_config_bf16.json b/tests/ds_config_bf16.json new file mode 100644 index 000000000..7a07551f8 --- /dev/null +++ b/tests/ds_config_bf16.json @@ -0,0 +1,13 @@ +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 16, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 1 + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} \ No newline at end of file diff --git a/tests/test_training.py b/tests/test_training.py index cb258147a..b7daf9c7b 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -641,7 +641,7 @@ def test_layer_norm_consistent(self): --clip-grad 1.0 --weight-decay 1e-1 --embed-layernorm - --fp16 + --bf16 --log-level debug --log-level-replica info @@ -654,10 +654,10 @@ def test_layer_norm_consistent(self): ds_args = f""" --deepspeed - --deepspeed_config {self.test_file_dir_str}/ds_config.json + --deepspeed_config {self.test_file_dir_str}/ds_config_bf16.json --zero-stage 1 --deepspeed-activation-checkpointing - --deepspeed_config {self.test_file_dir_str}/ds_config.json + --deepspeed_config {self.test_file_dir_str}/ds_config_bf16.json """.split() From a4172bf9c9ca6f4dc769d0e7cafc527ad0f8fe39 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 14:12:10 +0100 Subject: [PATCH 13/38] Woops --- tests/ds_config_bf16.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ds_config_bf16.json b/tests/ds_config_bf16.json index 7a07551f8..1f02566c9 100644 --- a/tests/ds_config_bf16.json +++ b/tests/ds_config_bf16.json @@ -3,7 +3,7 @@ "train_batch_size": 16, "gradient_clipping": 1.0, "zero_optimization": { - "stage": 1 + "stage": 0 }, "bf16": { "enabled": true From 5fbe1072df81b60b8a0987c8bd6e63cf50d0afb8 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 14:22:28 +0100 Subject: [PATCH 14/38] Make test to work with both bf16 and fp16 to see who fails --- tests/test_training.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/tests/test_training.py b/tests/test_training.py index b7daf9c7b..ea5c7700c 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -595,7 +595,8 @@ def test_skip_train_iteration(self): for i in train_iterations: self.assertTrue(f"iteration {i:8d}/" in cs.out) - def test_layer_norm_consistent(self): + @parameterized.expand(["bf16", "fp16"]) + def test_layer_norm_consistent(self, variation): src_dir = self.src_dir output_dir = self.get_auto_remove_tmp_dir() num_gpus = 2 @@ -641,7 +642,6 @@ def test_layer_norm_consistent(self): --clip-grad 1.0 --weight-decay 1e-1 --embed-layernorm - --bf16 --log-level debug --log-level-replica info @@ -654,13 +654,22 @@ def test_layer_norm_consistent(self): ds_args = f""" --deepspeed - --deepspeed_config {self.test_file_dir_str}/ds_config_bf16.json - --zero-stage 1 --deepspeed-activation-checkpointing - --deepspeed_config {self.test_file_dir_str}/ds_config_bf16.json - """.split() + if variation == "bf16": + args.append("--bf16") + ds_args += [ + "--zero-stage", "0" + "--deepspeed_config", f"{self.test_file_dir_str}/ds_config_bf16.json" + ] + elif variation == "fp16": + args.append("--fp16") + ds_args += [ + "--zero-stage", "1" + "--deepspeed_config", f"{self.test_file_dir_str}/ds_config.json" + ] + # args, ds_args, num_gpus = self.get_variation_config("base", output_dir, n_samples=200) script = [f"{src_dir}/pretrain_gpt.py"] From a0c09132ea1f6728c5165c49a997c9e743f53014 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 14:23:47 +0100 Subject: [PATCH 15/38] Woops --- tests/test_training.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_training.py b/tests/test_training.py index ea5c7700c..2b842fbbd 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -660,13 +660,13 @@ def test_layer_norm_consistent(self, variation): if variation == "bf16": args.append("--bf16") ds_args += [ - "--zero-stage", "0" + "--zero-stage", "0", "--deepspeed_config", f"{self.test_file_dir_str}/ds_config_bf16.json" ] elif variation == "fp16": args.append("--fp16") ds_args += [ - "--zero-stage", "1" + "--zero-stage", "1", "--deepspeed_config", f"{self.test_file_dir_str}/ds_config.json" ] From 6b19339caa07e80fb21a2e5698ab4a60b2506f43 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 14:25:54 +0100 Subject: [PATCH 16/38] Remove assert --- tests/test_training.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_training.py b/tests/test_training.py index 2b842fbbd..d86d08074 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -697,4 +697,3 @@ def test_layer_norm_consistent(self, variation): for weight in weights[1:]: torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False) print(key, ref) - assert False From a5e329580e5cc57e4b34d62842f701ba498cbdce Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 15:05:22 +0100 Subject: [PATCH 17/38] Try to figure out how the divergence happens --- megatron/model/fused_layer_norm.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index 78645c236..3b3048a9f 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -19,6 +19,7 @@ import numbers import torch +from megatron import mpu from torch.nn.parameter import Parameter from torch.nn import init import importlib @@ -31,7 +32,6 @@ class FusedLayerNormAffineFunction(torch.autograd.Function): @staticmethod def forward(ctx, input, weight, bias, normalized_shape, eps): - ctx.normalized_shape = normalized_shape ctx.eps = eps input_ = input.contiguous() @@ -84,7 +84,12 @@ def reset_parameters(self): def forward(self, input): - + print( + mpu.get_tensor_model_parallel_group(), + mpu.get_tensor_model_parallel_rank(), + self.weight, + self.bias + ) return FusedLayerNormAffineFunction.apply( input, self.weight, self.bias, self.normalized_shape,self.eps) From 7145f6dfc4f3136fd78f8e24e9d8c6637a6afc79 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 15:28:55 +0100 Subject: [PATCH 18/38] I think bias starts to diverge first --- megatron/model/fused_layer_norm.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index 3b3048a9f..2e19451a5 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -19,7 +19,7 @@ import numbers import torch -from megatron import mpu +from megatron import mpu, print_rank_0 from torch.nn.parameter import Parameter from torch.nn import init import importlib @@ -84,12 +84,17 @@ def reset_parameters(self): def forward(self, input): - print( - mpu.get_tensor_model_parallel_group(), - mpu.get_tensor_model_parallel_rank(), - self.weight, - self.bias - ) + weights = [torch.empty_like(self.weight) for tp in mpu.get_tensor_model_parallel_world_size()] + torch.distributed.all_gather(weights, self.weight, group=mpu.get_tensor_model_parallel_group()) + biases = [torch.empty_like(self.bias) for tp in mpu.get_tensor_model_parallel_world_size()] + torch.distributed.all_gather(biases, self.bias, group=mpu.get_tensor_model_parallel_group()) + if any(torch.any(weight != self.weight) for weight in weights): + print_rank_0("Weight sync failed") + print_rank_0(weights) + if any(torch.any(bias != self.bias) for bias in biases): + print_rank_0("Bias sync failed") + print_rank_0(biases) + return FusedLayerNormAffineFunction.apply( input, self.weight, self.bias, self.normalized_shape,self.eps) From 311e53175909895dab82e5472b1e704017840af0 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 15:31:36 +0100 Subject: [PATCH 19/38] Woops --- megatron/model/fused_layer_norm.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index 2e19451a5..6a5e58b3d 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -89,11 +89,13 @@ def forward(self, input): biases = [torch.empty_like(self.bias) for tp in mpu.get_tensor_model_parallel_world_size()] torch.distributed.all_gather(biases, self.bias, group=mpu.get_tensor_model_parallel_group()) if any(torch.any(weight != self.weight) for weight in weights): - print_rank_0("Weight sync failed") - print_rank_0(weights) + if mpu.get_tensor_model_parallel_rank() == 0: + print("Weight sync failed") + print(weights) if any(torch.any(bias != self.bias) for bias in biases): - print_rank_0("Bias sync failed") - print_rank_0(biases) + if mpu.get_tensor_model_parallel_rank() == 0: + print("Bias sync failed") + print(biases) return FusedLayerNormAffineFunction.apply( input, self.weight, self.bias, self.normalized_shape,self.eps) From 39d4b8f9faf218130c38d101916eb6b83a70931b Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 15:33:28 +0100 Subject: [PATCH 20/38] Woops --- megatron/model/fused_layer_norm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index 6a5e58b3d..c416d332b 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -19,7 +19,7 @@ import numbers import torch -from megatron import mpu, print_rank_0 +from megatron import mpu from torch.nn.parameter import Parameter from torch.nn import init import importlib From 8ffb278f63bb192bb4a4c3ce726bd26e605dd8ba Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 15:34:42 +0100 Subject: [PATCH 21/38] Woops --- megatron/model/fused_layer_norm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index c416d332b..8430f528c 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -84,9 +84,9 @@ def reset_parameters(self): def forward(self, input): - weights = [torch.empty_like(self.weight) for tp in mpu.get_tensor_model_parallel_world_size()] + weights = [torch.empty_like(self.weight) for tp in range(mpu.get_tensor_model_parallel_world_size())] torch.distributed.all_gather(weights, self.weight, group=mpu.get_tensor_model_parallel_group()) - biases = [torch.empty_like(self.bias) for tp in mpu.get_tensor_model_parallel_world_size()] + biases = [torch.empty_like(self.bias) for tp in range(mpu.get_tensor_model_parallel_world_size())] torch.distributed.all_gather(biases, self.bias, group=mpu.get_tensor_model_parallel_group()) if any(torch.any(weight != self.weight) for weight in weights): if mpu.get_tensor_model_parallel_rank() == 0: From 2389bfdfad878624cfa228d5295b6d9ed35bd45d Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 19:00:07 +0100 Subject: [PATCH 22/38] Add embed layer norm --- tests/test_training.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/test_training.py b/tests/test_training.py index d86d08074..6b140306c 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -685,15 +685,21 @@ def test_layer_norm_consistent(self, variation): keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", "post_attention_layernorm.bias"] files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [3,4]] for checkpoint in checkpoints: - print(checkpoint) checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) - print(os.listdir(checkpoint_path)) for key in keys_to_compare: - print(key) for files in files_to_compare: - print(files) weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] ref = weights[0] for weight in weights[1:]: torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False) - print(key, ref) + + keys_to_compare = ["word_embeddings.norm.weight"] + files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [0]] + for checkpoint in checkpoints: + checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) + for key in keys_to_compare: + for files in files_to_compare: + weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] + ref = weights[0] + for weight in weights[1:]: + torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False) \ No newline at end of file From 0cf35ee3679898bab1524f0860d88e733867a12c Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 19:05:01 +0100 Subject: [PATCH 23/38] Woops --- tests/test_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_training.py b/tests/test_training.py index 6b140306c..7ef6f958d 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -694,7 +694,7 @@ def test_layer_norm_consistent(self, variation): torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False) keys_to_compare = ["word_embeddings.norm.weight"] - files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [0]] + files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [1]] for checkpoint in checkpoints: checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) for key in keys_to_compare: From f0d6d179fbaf5a413f928bddd66361e1f7488ec3 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 23:15:58 +0100 Subject: [PATCH 24/38] Backward compatibility on torch --- tests/test_training.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_training.py b/tests/test_training.py index 7ef6f958d..fb72e59c6 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -33,7 +33,7 @@ require_bnb_non_decorator, require_deepspeed, require_torch_gpu, - set_seed + set_seed, torch_assert_equal ) set_seed(42) @@ -691,7 +691,7 @@ def test_layer_norm_consistent(self, variation): weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] ref = weights[0] for weight in weights[1:]: - torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False) + torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False) keys_to_compare = ["word_embeddings.norm.weight"] files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [1]] @@ -702,4 +702,4 @@ def test_layer_norm_consistent(self, variation): weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] ref = weights[0] for weight in weights[1:]: - torch.testing.assert_close(ref, weight, rtol=0.0, atol=0.0, check_device=False) \ No newline at end of file + torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False) \ No newline at end of file From 07ccb3db1717f300fd09afaaf9eac57678ca0e5d Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 24 Mar 2022 23:23:54 +0100 Subject: [PATCH 25/38] Better --- tests/test_training.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_training.py b/tests/test_training.py index fb72e59c6..bc64ffb73 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -682,6 +682,8 @@ def test_layer_norm_consistent(self, variation): execute_subprocess_async(cmd, env=self.get_env()) checkpoints = ["global_step10", "global_step20"] + + # Check transformer layer norm keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", "post_attention_layernorm.bias"] files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [3,4]] for checkpoint in checkpoints: @@ -693,6 +695,7 @@ def test_layer_norm_consistent(self, variation): for weight in weights[1:]: torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False) + # Check embed layer norm keys_to_compare = ["word_embeddings.norm.weight"] files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [1]] for checkpoint in checkpoints: @@ -702,4 +705,4 @@ def test_layer_norm_consistent(self, variation): weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] ref = weights[0] for weight in weights[1:]: - torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False) \ No newline at end of file + torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False) From a5b5edc02b1251b4a72d0edb416ea06c35ff53b3 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 25 Mar 2022 18:28:44 -0700 Subject: [PATCH 26/38] fix --- tests/test_training.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_training.py b/tests/test_training.py index bc64ffb73..65067982e 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -693,7 +693,7 @@ def test_layer_norm_consistent(self, variation): weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] ref = weights[0] for weight in weights[1:]: - torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False) + torch_assert_equal(ref, weight, check_device=False) # Check embed layer norm keys_to_compare = ["word_embeddings.norm.weight"] @@ -705,4 +705,4 @@ def test_layer_norm_consistent(self, variation): weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] ref = weights[0] for weight in weights[1:]: - torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False) + torch_assert_equal(ref, weight, check_device=False) From c7f20066dcf0b7ec37dbaad40e3d9991505f4a28 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 29 Mar 2022 00:09:46 +0500 Subject: [PATCH 27/38] Sync lp/hp/optim for layer norms --- megatron/model/fused_layer_norm.py | 24 ++++----- megatron/training.py | 22 ++++++++ run_bf16.sh | 82 +++++++++++++++++++----------- tests/test_training.py | 2 +- 4 files changed, 88 insertions(+), 42 deletions(-) diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index 8430f528c..c344a5cba 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -84,18 +84,18 @@ def reset_parameters(self): def forward(self, input): - weights = [torch.empty_like(self.weight) for tp in range(mpu.get_tensor_model_parallel_world_size())] - torch.distributed.all_gather(weights, self.weight, group=mpu.get_tensor_model_parallel_group()) - biases = [torch.empty_like(self.bias) for tp in range(mpu.get_tensor_model_parallel_world_size())] - torch.distributed.all_gather(biases, self.bias, group=mpu.get_tensor_model_parallel_group()) - if any(torch.any(weight != self.weight) for weight in weights): - if mpu.get_tensor_model_parallel_rank() == 0: - print("Weight sync failed") - print(weights) - if any(torch.any(bias != self.bias) for bias in biases): - if mpu.get_tensor_model_parallel_rank() == 0: - print("Bias sync failed") - print(biases) +# weights = [torch.empty_like(self.weight) for tp in range(mpu.get_tensor_model_parallel_world_size())] +# torch.distributed.all_gather(weights, self.weight, group=mpu.get_tensor_model_parallel_group()) +# biases = [torch.empty_like(self.bias) for tp in range(mpu.get_tensor_model_parallel_world_size())] +# torch.distributed.all_gather(biases, self.bias, group=mpu.get_tensor_model_parallel_group()) +# if any(torch.any(weight != self.weight) for weight in weights): +# if mpu.get_tensor_model_parallel_rank() == 0: +# print("Weight sync failed") +# print(weights) +# if any(torch.any(bias != self.bias) for bias in biases): +# if mpu.get_tensor_model_parallel_rank() == 0: +# print("Bias sync failed") +# print(biases) return FusedLayerNormAffineFunction.apply( input, self.weight, self.bias, self.normalized_shape,self.eps) diff --git a/megatron/training.py b/megatron/training.py index 84fd4eb9d..96426b401 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -412,8 +412,30 @@ def setup_model_and_optimizer(model_provider_func): torch.distributed.barrier() timers('load-checkpoint').stop() timers.log(['load-checkpoint']) + print_rank_0(f'module = {model[0]}') + for layer_id in ['3', '4']: + if hasattr(model[0].module._modules[layer_id], 'input_layernorm'): + weight = model[0].module._modules[layer_id].input_layernorm.weight + print(f'rank {torch.distributed.get_rank()} before reduce weight = {weight}') + torch.distributed.all_reduce(weight, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) + print(f'rank {torch.distributed.get_rank()} after reduce weight = {weight}') + + if weight._hp_mapping is not None: + print(f'rank {torch.distributed.get_rank()} fixing hp for input_layernorm') + #weight._hp_mapping.update_hp() + hp = weight._hp_mapping.hp_fragment + torch.distributed.all_reduce(hp, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) + + for key in ['exp_avg', 'exp_avg_sq']: + optim_state = weight._hp_mapping.get_optim_state(key) + print(f'rank {torch.distributed.get_rank()} before reduce optim state {key} = {optim_state}') + torch.distributed.all_reduce(optim_state, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) + print(f'rank {torch.distributed.get_rank()} after reduce optim state {key} = {optim_state}') + else: args.iteration = 0 + + torch.distributed.barrier() # We only support local DDP with multiple micro-batches. if len(model) > 1 or mpu.get_pipeline_model_parallel_world_size() > 1: diff --git a/run_bf16.sh b/run_bf16.sh index fd3a48398..f4295cc30 100755 --- a/run_bf16.sh +++ b/run_bf16.sh @@ -36,26 +36,6 @@ ZERO_STAGE=0 #GLOBAL_BATCH=128 #WORKER_STR="-i worker-0" - -TP=1 -PP=1 -DP=2 -WORLD_SIZE=$((TP*PP*DP)) -HIDDEN=1024 -LAYERS=24 -SEQ=1024 -GLOBAL_BATCH=1 -WORKER_STR="" - -MICRO_BATCH=1 - -LR=6.0e-4 -MIN_LR=6.0e-5 -DTYPE="bf16" -EXP_DIR=${HOME}/experiments/results/bf16 -LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_fix3" -mkdir -p $LOG_DIR - while [[ $# -gt 0 ]] do key="$1" @@ -66,30 +46,69 @@ case $key in ;; -z|--zero-stage) ZERO_STAGE=$2; + shift shift ;; *) - echo "Unknown argument(s)" - usage + echo "Unknown argument(s): $key" exit 1 shift ;; esac done +TP=4 +PP=1 +DP=2 +WORLD_SIZE=$((TP*PP*DP)) + +HIDDEN=1024 +LAYERS=24 +NHEADS=32 +SEQ=1024 + +#LAYERS=2 +#HIDDEN=8 +#NHEADS=2 +#SEQ=8 + +GLOBAL_BATCH=64 +WORKER_STR="" +EXIT_ITERS=10 +TRAIN_SAMPLES=1000000 +MICRO_BATCH=32 +LR=1.0e-1 +MIN_LR=1.0e-1 +DTYPE="bf16" +RUN_VERSION=1 +EXP_DIR=${HOME}/experiments/results/bf16 +RUN_TAG="tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_train_${EXIT_ITERS}_v${RUN_VERSION}" +LOG_DIR="${EXP_DIR}/tensorboard/${RUN_TAG}" +mkdir -p $LOG_DIR +export BIT16_DUMP_FILE="${EXP_DIR}/${RUN_TAG}.txt" +CHECKPOINT_DIR="./checkpoints/${DTYPE}_z${ZERO_STAGE}_tp${TP}_pp${PP}_dp${DP}_nl${LAYERS}_exit_${EXIT_ITERS}_v${RUN_VERSION}" options=" \ --tensor-model-parallel-size $TP \ --pipeline-model-parallel-size $PP \ --num-layers $LAYERS \ --hidden-size $HIDDEN \ - --num-attention-heads 32 \ + --num-attention-heads ${NHEADS} \ --seq-length $SEQ \ - --loss-scale 12 \ --max-position-embeddings $SEQ \ --micro-batch-size $MICRO_BATCH \ --global-batch-size $GLOBAL_BATCH \ - --train-iters 1000 \ + --optimizer adam \ + --adam-eps 1e-8 \ + --lr-warmup-samples 5 \ + --min-lr 1e-6 \ + --lr-decay-style cosine \ + --lr-decay-samples 12 \ + --override-lr-scheduler \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + --embed-layernorm \ + --partition-activations \ --lr $LR \ --min-lr $MIN_LR \ --lr-decay-style cosine \ @@ -100,17 +119,21 @@ options=" \ --vocab-file ${VOCAB_PATH} \ --merge-file ${MERGE_PATH} \ --save-interval 10000 \ - --split 98,2,0 \ - --clip-grad 1.0 \ --weight-decay 0.1 \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ --init-method-std 0.006 \ --${DTYPE} \ --checkpoint-activations \ - --exit-interval 10000 \ + --train-samples ${TRAIN_SAMPLES} \ + --exit-interval ${EXIT_ITERS} \ + --seed 42 \ + --load ${CHECKPOINT_DIR} \ + --save ${CHECKPOINT_DIR} \ --tensorboard-dir $LOG_DIR " +# --split 10,0,0 \ +# --rampup-batch-size 2 2 1_000 \ if [[ ${USE_DEEPSPEED} -eq 1 ]]; then @@ -155,7 +178,8 @@ WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE" #WORKER_STR="-i worker-0:0,1,2,3" #run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}" #run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}" -run_cmd="deepspeed --master_port 29700 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}" + +run_cmd="deepspeed --master_port 29600 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}" echo ${run_cmd} diff --git a/tests/test_training.py b/tests/test_training.py index 65067982e..b65a051e5 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -676,7 +676,7 @@ def test_layer_norm_consistent(self, variation): launcher = get_launcher(num_gpus) cmd = launcher + script + args + ds_args # keep for quick debug - # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die with CaptureStdout() as cs: execute_subprocess_async(cmd, env=self.get_env()) From 8f2ea60b7f44a81aa5655ae6d819c8c3f5389405 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 28 Mar 2022 13:13:17 -0700 Subject: [PATCH 28/38] fix requirements --- requirements.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index da76b5e44..47e11bf04 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,9 +6,11 @@ pybind11 regex six tensorboard -torch>=1.7 +torch>=1.11 transformers -DeepSpeed @ git+https://github.com/microsoft/DeepSpeed.git +# for now using this branch for bf16 work +DeepSpeed @ git+https://github.com/microsoft/DeepSpeed.git@olruwase/bf16-updates +#DeepSpeed @ git+https://github.com/microsoft/DeepSpeed.git # versions from HF transformers black==21.4b0 isort>=5.5.4 From fc8f813df1cef6a90472fab504bb7a5509237099 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 28 Mar 2022 18:07:22 -0700 Subject: [PATCH 29/38] dynamically discovered layer norm weights / refactor --- megatron/training.py | 80 +++++++++++++++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 19 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 96426b401..bf500ae1d 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -367,6 +367,63 @@ def get_learning_rate_scheduler(optimizer): return lr_scheduler +def sync_layer_norm(n, p): + + rank = torch.distributed.get_rank() + + print(f'rank {rank} processing {n}') + + #return + + # 1. bf16 + #print(f'rank {rank} before reduce p = {p}') + torch.distributed.all_reduce(p, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) + #print(f'rank {rank} after reduce p = {p}') + + if p._hp_mapping is not None: + #print(f'rank {rank} fixing hp for input_layernorm') + #p._hp_mapping.update_hp() + + # 2. fp32 + hp = p._hp_mapping.hp_fragment + torch.distributed.all_reduce(hp, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) + + # 3. optim states + for key in ['exp_avg', 'exp_avg_sq']: + optim_state = p._hp_mapping.get_optim_state(key) + #print(f'rank {rank} before reduce optim state {key} = {optim_state}') + torch.distributed.all_reduce(optim_state, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) + #print(f'rank {rank} after reduce optim state {key} = {optim_state}') + + +def sync_all_layer_norms(model): + # syncs weight+bias for each of the following layer norms (via averaging across TP ranks) + # 1. word embedding front word_embeddings.norm + # 2. transformer block input_layernorm x 70 + # 3. transformer block post_attention_layernorm x 70 + # 4. word embedding head - I think it's just weight + bias w/o a proper name in the last layer file layer_0X-model_0X-model_states.pt, see: https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/affff3d2927864c6948075700c672971782441f4/megatron/model/gpt_model.py#L267 + + import re + layer_norms_params_end_with = [ + "word_embeddings.norm.weight", "word_embeddings.norm.bias", + "input_layernorm.weight", "input_layernorm.bias", + "post_attention_layernorm.weight", "post_attention_layernorm.bias"] + + for n,p in model.named_parameters(): + #print(n) + # XXX: would be much simpler to re-do this logic to traverse children modules and act on isinstance of MixedFusedLayerNorm instead + # 1. first easy to identify layer norm params as they have a unique prefix each + for end in layer_norms_params_end_with: + if n.endswith(end): + sync_layer_norm(n, p) + + # 2. now the last layer norm that has no prefix + # hack: (\d\d): MixedFusedLayerNorm() is hanging there w/o any prefix name, so need to match something like: + # /^6.weight$/ or /^6.bias$/ + if mpu.is_pipeline_last_stage() and re.match('^\d+\.(weight|bias)$', n): + sync_layer_norm(n, p) + + def setup_model_and_optimizer(model_provider_func): """Setup model and optimizer.""" args = get_args() @@ -413,28 +470,13 @@ def setup_model_and_optimizer(model_provider_func): timers('load-checkpoint').stop() timers.log(['load-checkpoint']) print_rank_0(f'module = {model[0]}') - for layer_id in ['3', '4']: - if hasattr(model[0].module._modules[layer_id], 'input_layernorm'): - weight = model[0].module._modules[layer_id].input_layernorm.weight - print(f'rank {torch.distributed.get_rank()} before reduce weight = {weight}') - torch.distributed.all_reduce(weight, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) - print(f'rank {torch.distributed.get_rank()} after reduce weight = {weight}') - - if weight._hp_mapping is not None: - print(f'rank {torch.distributed.get_rank()} fixing hp for input_layernorm') - #weight._hp_mapping.update_hp() - hp = weight._hp_mapping.hp_fragment - torch.distributed.all_reduce(hp, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) - - for key in ['exp_avg', 'exp_avg_sq']: - optim_state = weight._hp_mapping.get_optim_state(key) - print(f'rank {torch.distributed.get_rank()} before reduce optim state {key} = {optim_state}') - torch.distributed.all_reduce(optim_state, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) - print(f'rank {torch.distributed.get_rank()} after reduce optim state {key} = {optim_state}') + # turn on to enable layer norm syncing + if 1: + sync_all_layer_norms(model[0].module) else: args.iteration = 0 - + torch.distributed.barrier() # We only support local DDP with multiple micro-batches. From 4443e6d2ff7a88399ba055995eb79eb329d79a7c Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 28 Mar 2022 18:18:28 -0700 Subject: [PATCH 30/38] fix regex --- megatron/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/training.py b/megatron/training.py index bf500ae1d..a3ece5388 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -420,7 +420,7 @@ def sync_all_layer_norms(model): # 2. now the last layer norm that has no prefix # hack: (\d\d): MixedFusedLayerNorm() is hanging there w/o any prefix name, so need to match something like: # /^6.weight$/ or /^6.bias$/ - if mpu.is_pipeline_last_stage() and re.match('^\d+\.(weight|bias)$', n): + if mpu.is_pipeline_last_stage() and re.match(r'^\d+\.(weight|bias)$', n): sync_layer_norm(n, p) From d2aa4f18c6bac3c2b101e5b19c3ad509cbc1cf04 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 28 Mar 2022 18:30:06 -0700 Subject: [PATCH 31/38] add the test script --- compare_tp_weights.py | 83 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 compare_tp_weights.py diff --git a/compare_tp_weights.py b/compare_tp_weights.py new file mode 100644 index 000000000..c7e5779bf --- /dev/null +++ b/compare_tp_weights.py @@ -0,0 +1,83 @@ + +# usage: +# python compare_tp_weights.py input_layernorm.weight 40 2 . + +# input_layernorm.weight +# input_layernorm.bias +# post_attention_layernorm.weight +# post_attention_layernorm.bias + +# one liner for just 2 weights comparison +# python -c 'import torch, sys; k=sys.argv[1]; a,b = map(torch.load, sys.argv[2:4]); print("Exact match" if torch.testing.assert_close(a[k], b[k], rtol=0.0, atol=0.0, check_device=False) is None else "Mismatch")' input_layernorm.weight layer_03-model_00-model_states.pt layer_03-model_01-model_states.pt + +# 13B +# cd /gpfsdsstore/projects/rech/six/commun/checkpoints/tr1-13B/tr1-13B-with-optim/global_step168000 +# python ~/compare_tp_weights.py input_layernorm.weight 40 2 . + +# 104B +# cd /gpfsssd/scratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/emb-norm/global_step16800 +# +# python ~/compare_tp_weights.py input_layernorm.weight 64 4 . > ~/104B.input_layernorm.weight.txt +# python ~/compare_tp_weights.py post_attention_layernorm.weight 64 4 . > ~/104B.post_attention_layernorm.weight.txt +# python ~/compare_tp_weights.py input_layernorm.bias 64 4 . > ~/104B.input_layernorm.bias.txt +# python ~/compare_tp_weights.py post_attention_layernorm.bias 64 4 . > ~/104B.post_attention_layernorm.bias.txt + +# other 104B checkpoints: + +# cd /gpfsssd/scratch/rech/six/commun/checkpoints/tr8b-104B/to-back-up/tr8b-104B/checkpoints/cl-exp-02/global_step10500 +# mismatched 68 +# +# cd /gpfsssd/scratch/rech/six/commun/checkpoints/tr8-104B-wide/experiment11/global_step15660 +# mismatched +# +# cd /gpfsssd/scratch/rech/six/commun/checkpoints/tr8-104B-wide/experiment06/global_step5100 +# python ~/compare_tp_weights.py input_layernorm.weight 32 4 +# **all matched** +# +# python ~/compare_tp_weights.py post_attention_layernorm.weight 32 4 +# not matched + + + +# # 104B/176B embed-norm check +# python -c 'import torch, sys; k=sys.argv[1]; a,b = map(torch.load, sys.argv[2:4]); print("Exact match" if torch.testing.assert_close(a[k], b[k], rtol=0.0, atol=0.0, check_device=False) is None else "Mismatch")' word_embeddings.norm.weight layer_01-model_00-model_states.pt layer_01-model_01-model_states.pt +# python -c 'import torch, sys; k=sys.argv[1]; a,b = map(torch.load, sys.argv[2:4]); print("Exact match" if torch.testing.assert_close(a[k], b[k], rtol=0.0, atol=0.0, check_device=False) is None else "Mismatch")' word_embeddings.norm.weight layer_01-model_01-model_states.pt layer_01-model_02-model_states.pt +# python -c 'import torch, sys; k=sys.argv[1]; a,b = map(torch.load, sys.argv[2:4]); print("Exact match" if torch.testing.assert_close(a[k], b[k], rtol=0.0, atol=0.0, check_device=False) is None else "Mismatch")' word_embeddings.norm.weight layer_01-model_02-model_states.pt layer_01-model_03-model_states.pt + + +# # 176B +# cd /gpfsssd/scratch/rech/six/commun/checkpoints/tr11-176B-ml/checkpoints/main/global_step16400 +# python ~/compare_tp_weights.py input_layernorm.weight 70 4 . > ~/176B.input_layernorm.weight.txt +# python ~/compare_tp_weights.py post_attention_layernorm.weight 70 4 . > ~/176B.post_attention_layernorm.weight.txt +# python ~/compare_tp_weights.py input_layernorm.bias 70 4 . > ~/176B.input_layernorm.bias.txt +# python ~/compare_tp_weights.py post_attention_layernorm.bias 70 4 . > ~/176B.post_attention_layernorm.bias.txt + + +import torch, sys + + + +key, nlayers, tp_size, checkpoint_dir = sys.argv[1:5] + +print(f"checking key={key}") +matched, mismatched = 0, 0 +for layer_id in range(int(nlayers)): + for tp in range(int(tp_size)-1): + f1 = f"{checkpoint_dir}/layer_{3+layer_id:02d}-model_{tp:02d}-model_states.pt" + f2 = f"{checkpoint_dir}/layer_{3+layer_id:02d}-model_{tp+1:02d}-model_states.pt" + c1 = torch.load(f1) + c2 = torch.load(f2) + # print(f1) + # print(f2) + header = f"layer_id={layer_id}: {tp}-{tp+1}" + try: + torch.testing.assert_close(c1[key], c2[key], rtol=0.0, atol=0.0, check_device=False) + print(f"✓ {header}") + matched += 1 + except: + print(f"✗ {header}") + mismatched += 1 + #raise + +print(f"Matched : {matched}") +print(f"Mismatched: {mismatched}") From d64a947e6a31a7d968d493b2347461f15ef99f15 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 29 Mar 2022 08:16:16 -0700 Subject: [PATCH 32/38] compare on cpu --- compare_tp_weights.py | 2 ++ compare_tp_weights_cpu.py | 42 ++++++++++++++++++++++++++++++ megatron/model/fused_layer_norm.py | 10 ++++++- 3 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 compare_tp_weights_cpu.py diff --git a/compare_tp_weights.py b/compare_tp_weights.py index c7e5779bf..69675e64c 100644 --- a/compare_tp_weights.py +++ b/compare_tp_weights.py @@ -44,6 +44,8 @@ # python -c 'import torch, sys; k=sys.argv[1]; a,b = map(torch.load, sys.argv[2:4]); print("Exact match" if torch.testing.assert_close(a[k], b[k], rtol=0.0, atol=0.0, check_device=False) is None else "Mismatch")' word_embeddings.norm.weight layer_01-model_01-model_states.pt layer_01-model_02-model_states.pt # python -c 'import torch, sys; k=sys.argv[1]; a,b = map(torch.load, sys.argv[2:4]); print("Exact match" if torch.testing.assert_close(a[k], b[k], rtol=0.0, atol=0.0, check_device=False) is None else "Mismatch")' word_embeddings.norm.weight layer_01-model_02-model_states.pt layer_01-model_03-model_states.pt +# same on cpu +python -c 'import torch, sys; k=sys.argv[1]; a=torch.load(sys.argv[2], map_location=torch.device("cpu"));b=torch.load(sys.argv[3], map_location=torch.device("cpu")); print("Exact match" if torch.testing.assert_close(a[k], b[k], rtol=0.0, atol=0.0, check_device=False) is None else "Mismatch")' word_embeddings.norm.weight layer_01-model_00-model_states.pt layer_01-model_01-model_states.pt # # 176B # cd /gpfsssd/scratch/rech/six/commun/checkpoints/tr11-176B-ml/checkpoints/main/global_step16400 diff --git a/compare_tp_weights_cpu.py b/compare_tp_weights_cpu.py new file mode 100644 index 000000000..7c07bcabb --- /dev/null +++ b/compare_tp_weights_cpu.py @@ -0,0 +1,42 @@ + +# usage: +# python compare_tp_weights.py input_layernorm.weight 40 2 . + + +# 13B +# cd /gpfsdsstore/projects/rech/six/commun/checkpoints/tr1-13B/tr1-13B-with-optim/global_step168000 +# python ~/compare_tp_weights.py input_layernorm.weight 40 2 . + +# 104B +# cd /gpfsssd/scratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints/emb-norm/global_step16800 +# python ~/compare_tp_weights.py input_layernorm.weight 64 4 . + + +import torch, sys + + + +key, nlayers, tp_size, checkpoint_dir = sys.argv[1:5] + +print(f"checking key={key}") +matched, mismatched = 0, 0 +for layer_id in range(int(nlayers)): + for tp in range(int(tp_size)-1): + f1 = f"{checkpoint_dir}/layer_{3+layer_id:02d}-model_{tp:02d}-model_states.pt" + f2 = f"{checkpoint_dir}/layer_{3+layer_id:02d}-model_{tp+1:02d}-model_states.pt" + c1 = torch.load(f1, map_location=torch.device('cpu')) + c2 = torch.load(f2, map_location=torch.device('cpu')) + # print(f1) + # print(f2) + header = f"layer_id={layer_id}: {tp}-{tp+1}" + try: + torch.testing.assert_close(c1[key], c2[key], rtol=0.0, atol=0.0, check_device=False) + print(f"✓ {header}") + matched += 1 + except: + print(f"✗ {header}") + mismatched += 1 + #raise + +print(f"Matched : {matched}") +print(f"Mismatched: {mismatched}") diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index c344a5cba..613734100 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -83,7 +83,7 @@ def reset_parameters(self): init.zeros_(self.bias) - def forward(self, input): + def forward_old(self, input): # weights = [torch.empty_like(self.weight) for tp in range(mpu.get_tensor_model_parallel_world_size())] # torch.distributed.all_gather(weights, self.weight, group=mpu.get_tensor_model_parallel_group()) # biases = [torch.empty_like(self.bias) for tp in range(mpu.get_tensor_model_parallel_world_size())] @@ -100,3 +100,11 @@ def forward(self, input): return FusedLayerNormAffineFunction.apply( input, self.weight, self.bias, self.normalized_shape,self.eps) + + def forward(self, input): + + torch.distributed.all_reduce(self.weight, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) + torch.distributed.all_reduce(self.bias, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) + + return FusedLayerNormAffineFunction.apply( + input, self.weight, self.bias, self.normalized_shape, self.eps) From bf7eeb3a8bef1913153b5cd5fa2bc1bf667663c3 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 29 Mar 2022 09:45:58 -0700 Subject: [PATCH 33/38] add 2 more weights to sync --- megatron/training.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/megatron/training.py b/megatron/training.py index a3ece5388..b1d2b9b60 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -407,7 +407,9 @@ def sync_all_layer_norms(model): layer_norms_params_end_with = [ "word_embeddings.norm.weight", "word_embeddings.norm.bias", "input_layernorm.weight", "input_layernorm.bias", - "post_attention_layernorm.weight", "post_attention_layernorm.bias"] + "post_attention_layernorm.weight", "post_attention_layernorm.bias", + "self_attention.dense.bias", "mlp.dense_4h_to_h.bias", + ] for n,p in model.named_parameters(): #print(n) From 8482595623b4d0ba8b548cea06ecaf9b0a3e39c0 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 30 Mar 2022 06:37:01 +0500 Subject: [PATCH 34/38] fp32 accessors --- megatron/training.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index b1d2b9b60..a5e64eaec 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -375,11 +375,22 @@ def sync_layer_norm(n, p): #return + fp32_param = p.get_full_hp_param() + torch.set_printoptions(sci_mode=False, precision=6) + print(f'rank {rank} bf16 = {p}') + print(f'rank {rank} fp32 = {fp32_param}') + torch.testing.assert_close(p, fp32_param, rtol=4e-3, atol=0, check_dtype=False) + + for key in ['exp_avg', 'exp_avg_sq']: + full_optim_state = p.get_full_hp_param(optim_state_key=key) + print(f'rank {rank} full optim state {key} = {full_optim_state}') + # 1. bf16 #print(f'rank {rank} before reduce p = {p}') torch.distributed.all_reduce(p, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) #print(f'rank {rank} after reduce p = {p}') + if p._hp_mapping is not None: #print(f'rank {rank} fixing hp for input_layernorm') #p._hp_mapping.update_hp() @@ -390,10 +401,10 @@ def sync_layer_norm(n, p): # 3. optim states for key in ['exp_avg', 'exp_avg_sq']: - optim_state = p._hp_mapping.get_optim_state(key) - #print(f'rank {rank} before reduce optim state {key} = {optim_state}') - torch.distributed.all_reduce(optim_state, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) - #print(f'rank {rank} after reduce optim state {key} = {optim_state}') + optim_state_fragment = p._hp_mapping.get_optim_state_fragment(key) + #print(f'rank {rank} before reduce optim state fragment {key} = {optim_state_fragment}') + torch.distributed.all_reduce(optim_state_fragment, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) + #print(f'rank {rank} after reduce optim state fragment {key} = {optim_state_fragment}') def sync_all_layer_norms(model): From 86b726cbd48e8c73f955fffe244d31c58932748c Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 29 Mar 2022 18:57:27 -0700 Subject: [PATCH 35/38] improve the doc, and comment out the demo --- megatron/training.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index a5e64eaec..d40415797 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -375,15 +375,22 @@ def sync_layer_norm(n, p): #return - fp32_param = p.get_full_hp_param() - torch.set_printoptions(sci_mode=False, precision=6) - print(f'rank {rank} bf16 = {p}') - print(f'rank {rank} fp32 = {fp32_param}') - torch.testing.assert_close(p, fp32_param, rtol=4e-3, atol=0, check_dtype=False) - - for key in ['exp_avg', 'exp_avg_sq']: - full_optim_state = p.get_full_hp_param(optim_state_key=key) - print(f'rank {rank} full optim state {key} = {full_optim_state}') + # # Here is how you can access fp32 version of the bf16 param and fp32 optim states + # # + # # Note that there is an all_gather called on all dp ranks when `get_full_hp_param` is called - + # # so it's not free + # # + # # a. fp32 param + # fp32_param = p.get_full_hp_param() + # torch.set_printoptions(sci_mode=False, precision=6) + # print(f'rank {rank} bf16 = {p}') + # print(f'rank {rank} fp32 = {fp32_param}') + # torch.testing.assert_close(p, fp32_param, rtol=4e-3, atol=0, check_dtype=False) + + # # b. fp32 optim states + # for key in ['exp_avg', 'exp_avg_sq']: + # full_optim_state = p.get_full_hp_param(optim_state_key=key) + # print(f'rank {rank} full optim state {key} = {full_optim_state}') # 1. bf16 #print(f'rank {rank} before reduce p = {p}') From 2ac141b1617b1bb5c175cd8fb34c0ce760b7c22b Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 29 Mar 2022 19:14:08 -0700 Subject: [PATCH 36/38] typo --- megatron/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/training.py b/megatron/training.py index d40415797..211ebbda4 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -377,7 +377,7 @@ def sync_layer_norm(n, p): # # Here is how you can access fp32 version of the bf16 param and fp32 optim states # # - # # Note that there is an all_gather called on all dp ranks when `get_full_hp_param` is called - + # # Note that there is an all_reduce called on all dp ranks when `get_full_hp_param` is called - # # so it's not free # # # # a. fp32 param From d576775c34d7680b8c5a9b685de03a91b9a66448 Mon Sep 17 00:00:00 2001 From: Thomas Wang <24695242+thomasw21@users.noreply.github.com> Date: Wed, 6 Apr 2022 18:42:36 +0200 Subject: [PATCH 37/38] Sync torch_rng_state (#277) --- megatron/data/data_samplers.py | 1 + megatron/training.py | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 1cbeac312..b933ff34e 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -52,6 +52,7 @@ def build_pretraining_data_loader(dataset, consumed_samples): return torch.utils.data.DataLoader(dataset, batch_sampler=batch_sampler, num_workers=args.num_workers, + generator=torch.Generator().manual_seed(args.seed), pin_memory=True) class MegatronPretrainingSampler: diff --git a/megatron/training.py b/megatron/training.py index 211ebbda4..cf148b2d7 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -443,6 +443,16 @@ def sync_all_layer_norms(model): if mpu.is_pipeline_last_stage() and re.match(r'^\d+\.(weight|bias)$', n): sync_layer_norm(n, p) +def sync_all_torch_random_state(): + torch_rng_state = torch.get_rng_state() + # We use rank 1 as source of truth and sed the new + torch.distributed.broadcast( + torch_rng_state, + src=mpu.get_tensor_model_parallel_src_rank() + 1, + group=mpu.get_tensor_model_parallel_group() + ) + torch.set_rng_state(torch_rng_state) + def setup_model_and_optimizer(model_provider_func): """Setup model and optimizer.""" @@ -494,6 +504,7 @@ def setup_model_and_optimizer(model_provider_func): # turn on to enable layer norm syncing if 1: sync_all_layer_norms(model[0].module) + sync_all_torch_random_state() else: args.iteration = 0 From 475f3730200261449085deea28330b861e7820fe Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Wed, 6 Apr 2022 19:10:36 +0200 Subject: [PATCH 38/38] Fix device issue when using torch.broadcast --- megatron/training.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index cf148b2d7..bd9fa8e99 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -444,14 +444,14 @@ def sync_all_layer_norms(model): sync_layer_norm(n, p) def sync_all_torch_random_state(): - torch_rng_state = torch.get_rng_state() + torch_rng_state = torch.get_rng_state().cuda() # We use rank 1 as source of truth and sed the new torch.distributed.broadcast( torch_rng_state, src=mpu.get_tensor_model_parallel_src_rank() + 1, group=mpu.get_tensor_model_parallel_group() ) - torch.set_rng_state(torch_rng_state) + torch.set_rng_state(torch_rng_state.cpu()) def setup_model_and_optimizer(model_provider_func):