From d02bb32a36eecab15e2782839eb5b6838df5cb88 Mon Sep 17 00:00:00 2001
From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com>
Date: Wed, 5 Jun 2024 19:06:16 +0300
Subject: [PATCH] fix fp16 precision issue (#9376)

* fix fp16 precision issue by disabling enable_autocast

Signed-off-by: dimapihtar

* revert config

Signed-off-by: dimapihtar

* add fp16 precision test

Signed-off-by: dimapihtar

---------

Signed-off-by: dimapihtar
---
 .github/workflows/cicd-main.yml                          | 4 ++++
 .../nlp/models/language_modeling/megatron_base_model.py  | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 83ef8a8b4339..b7e1eda5912d 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -3322,8 +3322,10 @@ jobs:
         trainer.limit_val_batches=2 \
         trainer.accumulate_grad_batches=1 \
         trainer.max_steps=3 \
+        trainer.precision=16 \
         trainer.gradient_clip_val=1.0 \
         exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
+        model.megatron_amp_O2=False \
         model.tensor_model_parallel_size=2 \
         model.optim.name=fused_adam \
         model.optim.lr=2e-4 \
@@ -3355,9 +3357,11 @@ jobs:
         trainer.limit_val_batches=2 \
         trainer.accumulate_grad_batches=1 \
         trainer.max_steps=6 \
+        trainer.precision=16 \
         trainer.gradient_clip_val=1.0 \
         exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
         exp_manager.resume_if_exists=True \
+        model.megatron_amp_O2=False \
         model.tensor_model_parallel_size=2 \
         model.optim.name=fused_adam \
         model.optim.lr=2e-4 \
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
index a27f9fd5e5e4..96b7fd2cbf15 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
@@ -1140,6 +1140,10 @@ def build_model_parallel_config(self) -> ModelParallelConfig:
             "tp_comm_overlap": self.cfg.get('ub_tp_comm_overlap', False),
         }
 
+        # Set enable_autocast to False when precision is fp16 and megatron_amp_O2 is not used
+        if not megatron_amp_O2 and self.torch_dtype == torch.float16:
+            config_mapping["enable_autocast"] = False
+
         # instantitate ModelParallelConfig from this dict
         mp_config_dict = {}
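
Reviewer note: the sketch below is not NeMo code. `build_config_mapping`, its arguments, the inline `cfg` dict, and the pre-patch default for `enable_autocast` are illustrative assumptions standing in for the mapping assembled in `build_model_parallel_config()`. It only shows the configuration effect of the patch: with the CI overrides added above (`trainer.precision=16`, `model.megatron_amp_O2=False`), `enable_autocast` ends up False in the dict later used to build Megatron-core's `ModelParallelConfig`.

```python
import torch


def build_config_mapping(cfg: dict, megatron_amp_O2: bool, torch_dtype: torch.dtype) -> dict:
    """Illustrative stand-in for the config_mapping built in build_model_parallel_config()."""
    config_mapping = {
        # Assumed pre-patch default: autocast on for fp16/bf16 runs without megatron_amp_O2.
        "enable_autocast": not megatron_amp_O2 and torch_dtype in (torch.float16, torch.bfloat16),
        "fp16": torch_dtype == torch.float16 and megatron_amp_O2,
        "bf16": torch_dtype == torch.bfloat16 and megatron_amp_O2,
        "tp_comm_overlap": cfg.get("ub_tp_comm_overlap", False),
    }

    # Patched behavior: force autocast off when precision is fp16 and megatron_amp_O2 is not used.
    if not megatron_amp_O2 and torch_dtype == torch.float16:
        config_mapping["enable_autocast"] = False

    return config_mapping


# Mirrors the CI overrides added above: trainer.precision=16, model.megatron_amp_O2=False.
mapping = build_config_mapping(cfg={}, megatron_amp_O2=False, torch_dtype=torch.float16)
assert mapping["enable_autocast"] is False
```

The commit message only states that the fp16 precision issue is fixed by disabling `enable_autocast`; the sketch does not try to reproduce the underlying numerics, only the configuration logic.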