diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 08af7c8c0617..b7e1eda5912d 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -100,7 +100,7 @@ jobs:
           # Transformer Engine installation
           git clone https://github.com/NVIDIA/TransformerEngine.git && \
           pushd TransformerEngine && \
-          git fetch origin bfe21c3d68b0a9951e5716fb520045db53419c5e && \
+          git fetch origin a51ff542dcb1f605aa54f9b0e1aaadb132acd53d && \
           git checkout FETCH_HEAD && \
           git submodule init && git submodule update && \
           NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install . && \
@@ -118,7 +118,7 @@ jobs:
           # Megatron Core installation
           git clone https://github.com/NVIDIA/Megatron-LM.git && \
           pushd Megatron-LM && \
-          git checkout c90aa1671fc0b97f80fa6c3bb892ce6f8e88e7c9 && \
+          git checkout dace0330ac8b5bbc3a21ff14f7ef7544abb16334 && \
           pip install . && \
           pushd megatron/core/datasets && \
           make && \
@@ -1414,717 +1414,10 @@ jobs:
         if: "failure()"
 
-  # L2: Dialogue Classification
-
-  # TODO: pleasefixme
-  # L2_Dialogue_Classification_Dialogue_Intent_and_slot_classification_using_GPT:
-  #   needs: [cicd-test-container-setup]
-  #   runs-on: self-hosted-azure-gpus-1
-  #   container:
-  #     image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-  #     options:
-  #       # --user 0:128
-  #       --device=/dev/nvidia0
-  #       --gpus all
-  #       --shm-size=8g
-  #       --env TRANSFORMERS_OFFLINE=0
-  #       --env HYDRA_FULL_ERROR=1
-  #       --volume /mnt/datadrive/TestData:/home/TestData
-  #   steps:
-  #     - name: Checkout repository
-  #       uses: actions/checkout@v4
-  #     - run: |
-  #         cd examples/nlp/dialogue && \
-  #         python dialogue.py \
-  #         model.dataset.data_dir=/home/TestData/nlp/sgd_small \
-  #         model.language_model.lm_checkpoint=/home/TestData/nlp/gpt2/pytorch_model.bin \
-  #         model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json \
-  #         model.dataset.dialogues_example_dir=sgd_gen_outputs \
-  #         model.dataset.task_name=debug_sample \
-  #         trainer.max_steps=1 \
-  #         trainer.max_epochs=1 \
-  #         model.train_ds.batch_size=2 \
-  #         model.validation_ds.batch_size=2 \
-  #         model.test_ds.batch_size=2 \
-  #         model.nemo_path=null \
-  #         trainer.val_check_interval=0.0 \
-  #         trainer.devices=1 \
-  #         model.dataset.use_cache=false \
-  #         model.tokenizer.special_tokens={pad_token:"endoftext"} \
-  #         model.tokenizer.tokenizer_name=gpt2 \
-  #         model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json \
-  #         model.language_model.pretrained_model_name=/home/TestData/nlp/gpt2 \
-  #         trainer.accelerator=gpu \
-  #         exp_manager=null && \
-  #         rm -rf sgd_gen_outputs
-
-  L2_Dialogue_Classification_Intent_and_slot_classification_using_SGDQA:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          cd examples/nlp/dialogue && \
-          python dialogue.py \
-          model.dataset.data_dir=/home/TestData/nlp/sgd_small \
-          model.dataset.dialogues_example_dir=sgd_gen_bert_outputs \
-          model.dataset.task_name=debug_sample \
-          trainer.max_steps=1 \
-          trainer.max_epochs=1 \
-          model.train_ds.batch_size=2 \
-          model.validation_ds.batch_size=2 \
-          model.test_ds.batch_size=2 \
-          model.dataset.num_tasks=6 \
-          model.nemo_path=null \
-          trainer.val_check_interval=0.0 \
-          trainer.devices=1 \
-          model.dataset.use_cache=false \
-          model.language_model.pretrained_model_name=bert-base-cased \
-          trainer.accelerator=gpu \
-          exp_manager=null && \
-          rm -rf sgd_gen_bert_outputs
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
-  L2_Dialogue_Classification_Intent_and_slot_classification_using_IntentSlotClassificationModel:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          cd examples/nlp/dialogue && \
-          python dialogue.py \
-          model.dataset.data_dir=/home/TestData/nlp/processed_assistant \
-          model.dataset.dialogues_example_dir=sgd_gen_bert_intent_classification_outputs \
-          model.dataset.task=assistant \
-          trainer.max_steps=1 \
-          trainer.max_epochs=1 \
-          model.train_ds.batch_size=2 \
-          model.validation_ds.batch_size=2 \
-          model.test_ds.batch_size=2 \
-          model.nemo_path=null \
-          trainer.val_check_interval=0.0 \
-          trainer.devices=1 \
-          model.dataset.use_cache=false \
-          model.language_model.pretrained_model_name=bert-base-uncased \
-          trainer.accelerator=gpu \
-          exp_manager=null && \
-          rm -rf sgd_gen_bert_intent_classification_outputs
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
-  L2_Dialogue_Classification_Intent_classification_using_ZeroShotIntentModel:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          cd examples/nlp/dialogue && \
-          python dialogue.py \
-          do_training=False \
-          model.dataset.data_dir=/home/TestData/nlp/drive_thru_revised \
-          model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \
-          model.dataset.dialogues_example_dir=sgd_gen_zero_shot_intent_classification_outputs \
-          model.dataset.task=zero_shot \
-          model.dataset.prompt_template="This example is" \
-          trainer.max_steps=1 \
-          trainer.max_epochs=1 \
-          model.train_ds.batch_size=2 \
-          model.validation_ds.batch_size=2 \
-          model.test_ds.batch_size=2 \
-          model.nemo_path=null \
-          trainer.val_check_interval=0.0 \
-          trainer.devices=1 \
-          model.dataset.use_cache=false \
-          model.language_model.pretrained_model_name=bert-base-uncased \
-          trainer.accelerator=gpu \
-          exp_manager=null && \
-          rm -rf sgd_gen_zero_shot_intent_classification_outputs
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
-  L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          cd examples/nlp/dialogue && \
-          python dialogue.py \
-          do_training=False \
-          model.dataset.data_dir=/home/TestData/nlp/design_dataset \
-          model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \
-          model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_outputs \
-          model.dataset.task=design \
-          model.dataset.prompt_template="This example is related to" \
-          model.library=megatron \
-          trainer.max_steps=1 \
-          trainer.max_epochs=1 \
-          model.train_ds.batch_size=2 \
-          model.validation_ds.batch_size=2 \
-          model.test_ds.batch_size=2 \
-          model.nemo_path=null \
-          trainer.val_check_interval=0.0 \
-          trainer.devices=1 \
-          model.dataset.use_cache=false \
-          model.language_model.pretrained_model_name=bert-base-uncased \
-          trainer.accelerator=gpu \
-          exp_manager=null && \
-          rm -rf design_zero_shot_intent_classification_outputs
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
-  L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel_BART_Classifier:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          cd examples/nlp/dialogue && \
-          python dialogue.py \
-          do_training=False \
-          model.dataset.data_dir=/home/TestData/nlp/design_dataset \
-          model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \
-          model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_bart_outputs \
-          model.dataset.task=design \
-          model.dataset.prompt_template="This example is related to" \
-          model.library=huggingface \
-          trainer.devices=1 \
-          model.dataset.use_cache=false \
-          model.language_model.pretrained_model_name=bert-base-uncased \
-          trainer.accelerator=gpu \
-          exp_manager=null && \
-          rm -rf design_zero_shot_intent_classification_bart_outputs
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
-  L2_Dialogue_Classification_Design_Intent_classification_using_DialogueNearestNeighbourModel:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          cd examples/nlp/dialogue && \
-          python dialogue.py \
-          do_training=False \
-          model.dataset.data_dir=/home/TestData/nlp/design_dataset \
-          model.dataset.dialogues_example_dir=design_dialogue_nearest_neighbour_classification_outputs \
-          model.dataset.task=design \
-          model.dataset.prompt_template="" \
-          model.library=huggingface \
-          trainer.devices=1 \
-          model.dataset.use_cache=false \
-          model.language_model.pretrained_model_name=sentence-transformers/all-MiniLM-L6-v2 \
-          trainer.accelerator=gpu \
-          exp_manager=null && \
-          rm -rf design_dialogue_nearest_neighbour_classification_outputs
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
-  # L2: Dialogue Generation
-
-  L2_Dialogue_Generation_Dialogue_Answer_Extender_using_DialogueS2SGenerationModel:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          cd examples/nlp/dialogue && \
-          python dialogue.py \
-          do_training=False \
-          model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \
-          model.dataset.dialogues_example_dir=answer_extender_s2s \
-          model.dataset.task=ms_marco \
-          model.library=huggingface \
-          model.dataset.debug_mode=True \
-          trainer.max_steps=1 \
-          trainer.max_epochs=1 \
-          model.train_ds.batch_size=2 \
-          model.validation_ds.batch_size=2 \
-          model.test_ds.batch_size=2 \
-          model.nemo_path=null \
-          trainer.val_check_interval=0.0 \
-          trainer.devices=1 \
-          model.dataset.use_cache=false \
-          model.language_model.pretrained_model_name=facebook/bart-large \
-          trainer.accelerator=gpu \
-          exp_manager=null && \
-          rm -rf answer_extender_s2s
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
-  L2_Dialogue_Generation_Dialogue_SGD_Based_Answer_Extender_using_DialogueS2SGenerationModel:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          cd examples/nlp/dialogue && \
-          python dialogue.py \
-          do_training=False \
-          model.dataset.data_dir=/home/TestData/nlp/sgd_small \
-          model.dataset.dialogues_example_dir=sgd_answer_extender_s2s \
-          model.dataset.task_name=debug_sample \
-          model.dataset.task=sgd_generation \
-          model.dataset.input_field=utterance+system_actions \
-          model.dataset.output_field=system_utterance \
-          model.dataset.use_cache=false \
-          model.dataset.system_utterance=next_turn \
-          model.dataset.debug_mode=True \
-          model.dataset.prompt_template=slots_values \
-          model.library=huggingface \
-          trainer.max_steps=1 \
-          trainer.max_epochs=1 \
-          model.train_ds.batch_size=2 \
-          model.validation_ds.batch_size=2 \
-          model.test_ds.batch_size=2 \
-          model.nemo_path=null \
-          trainer.val_check_interval=0.0 \
-          trainer.devices=1 \
-          model.language_model.pretrained_model_name=facebook/bart-large \
-          trainer.accelerator=gpu \
-          exp_manager=null && \
-          rm -rf sgd_answer_extender_s2s
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
-# - name: L2: Dialogue Generation Part 2
-#   when {
-#     anyOf {
-#       branch main
-#       changeRequest target: main
-#     }
-#   }
-#   failFast true
-#   parallel {
-#     - name: Dialogue: Answer Extender using DialogueGPTGenerationModel
-#       - run: |
-#           cd examples/nlp/dialogue && \
-#           python dialogue.py \
-#           do_training=False \
-#           model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \
-#           model.dataset.dialogues_example_dir=answer_extender \
-#           model.library=huggingface \
-#           model.dataset.task=ms_marco \
-#           model.dataset.debug_mode=True \
-#           trainer.val_check_interval=0.0 \
-#           trainer.devices=1 \
-#           model.dataset.use_cache=false \
-#           model.language_model.pretrained_model_name=gpt2 \
-#           trainer.accelerator=gpu \
-#           exp_manager=null && \
-#           rm -rf answer_extender
-#     }
-#   }
-# }
-# }
-
-  # L2: COPY
-  L2_COPY_Dialogue_Answer_Extender_using_DialogueGPTGenerationModel:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          cd examples/nlp/dialogue && \
-          python dialogue.py \
-          do_training=False \
-          model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \
-          model.dataset.dialogues_example_dir=answer_extender \
-          model.library=huggingface \
-          model.dataset.task=ms_marco \
-          model.dataset.debug_mode=True \
-          trainer.val_check_interval=0.0 \
-          trainer.devices=1 \
-          model.dataset.use_cache=false \
-          model.language_model.pretrained_model_name=gpt2 \
-          trainer.accelerator=gpu \
-          exp_manager=null && \
-          rm -rf answer_extender
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
-  # L2: Duplex Text Normalization
-  L2_Duplex_Text_Normalization_with_Tarred_dataset:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          cd examples/nlp/duplex_text_normalization && \
-          python duplex_text_normalization_train.py \
-          data.validation_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv \
-          mode=tn \
-          lang=en \
-          tagger_model.do_training=false \
-          decoder_model.transformer=t5-small \
-          data.validation_ds.batch_size=2 \
-          data.train_ds.use_cache=false \
-          data.validation_ds.use_cache=false \
-          data.test_ds.batch_size=2 \
-          data.train_ds.decoder_data_augmentation=false \
-          data.train_ds.num_workers=2 \
-          decoder_trainer.devices=[0,1] \
-          decoder_trainer.accelerator="gpu" \
-          data.train_ds.use_tarred_dataset=true \
-          +decoder_trainer.fast_dev_run=true \
-          decoder_exp_manager.create_checkpoint_callback=false \
-          data.train_ds.tar_metadata_file=/home/TestData/nlp/duplex_text_norm/tarred_small/metadata.json \
-          data.test_ds.use_cache=false \
-          data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
-# Runs out of memory on the 12G TITAN V (GPU 0 on main CI)
-# TODO: add when megatron bert is supported again in NeMo
-# - name: L2: MegaBERT Token Classification
-#   when {
-#     anyOf {
-#       branch main
-#       changeRequest target: main
-#     }
-#   }
-#   failFast true
-#   - run: |
-#       cd examples/nlp/token_classification && \
-#       python token_classification_train.py \
-#       model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \
-#       model.language_model.pretrained_model_name=megatron-bert-345m-uncased \
-#       model.train_ds.batch_size=10 \
-#       model.dataset.max_seq_length=50 \
-#       model.dataset.use_cache=false \
-#       trainer.accelerator=gpu \
-#       trainer.strategy=ddp \
-#       trainer.precision=16 \
-#       trainer.devices=1 \
-#       trainer.accelerator="gpu" \
-#       +trainer.fast_dev_run=true \
-#       exp_manager=null
-#   }
-# }
-
-  # L2: BERT Text Classification
-  L2_BERT_Text_Classification_with_BERT_Test:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          cd examples/nlp/text_classification && \
-          python text_classification_with_bert.py \
-          model.dataset.num_classes=6 \
-          model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \
-          model.validation_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \
-          model.language_model.pretrained_model_name=distilbert-base-uncased \
-          model.train_ds.batch_size=10 \
-          model.dataset.max_seq_length=50 \
-          model.dataset.use_cache=false \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          +trainer.fast_dev_run=true \
-          exp_manager=null
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
-  # L2: Parallel BERT Question-Answering SQUAD v1.1 & v2.0
-  L2_Parallel_BERT_Question-Answering_SQUAD_v1_1:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          # Cannot do fast_dev_run because squad needs whole dev dataset
-          cd examples/nlp/question_answering && \
-          python question_answering.py \
-          model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \
-          model.dataset.use_cache=false \
-          model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \
-          model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \
-          model.train_ds.batch_size=2 \
-          model.train_ds.num_samples=2 \
-          model.validation_ds.batch_size=2 \
-          model.validation_ds.num_samples=2 \
-          model.test_ds.num_samples=2 \
-          model.test_ds.batch_size=2 \
-          trainer.max_epochs=1 \
-          trainer.max_steps=1 \
-          model.language_model.pretrained_model_name=bert-base-uncased \
-          model.dataset.version_2_with_negative=false \
-          trainer.precision=16 \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          exp_manager=null
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
-  L2_Parallel_BERT_Question-Answering_SQUAD_v2_0:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          # Cannot do fast_dev_run because squad needs whole dev dataset
-          cd examples/nlp/question_answering && \
-          python question_answering.py \
-          model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \
-          model.dataset.use_cache=false \
-          model.train_ds.batch_size=2 \
-          model.train_ds.num_samples=2 \
-          model.validation_ds.batch_size=2 \
-          model.validation_ds.num_samples=2 \
-          trainer.max_epochs=1 \
-          trainer.max_steps=1 \
-          model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \
-          model.language_model.pretrained_model_name=bert-base-uncased \
-          model.dataset.version_2_with_negative=true \
-          trainer.precision=16 \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          exp_manager=null
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
-  # L2: Parallel BART Question-Answering SQUAD v1.1 & v2.0
-  L2_Parallel_BART_Question-Answering_SQUAD_v1_1:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          cd examples/nlp/question_answering && \
-          python question_answering.py \
-          model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \
-          model.dataset.use_cache=false \
-          model.dataset.check_if_answer_in_context=false \
-          model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \
-          model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \
-          model.train_ds.batch_size=2 \
-          model.train_ds.num_samples=2 \
-          model.validation_ds.batch_size=2 \
-          model.validation_ds.num_samples=2 \
-          model.test_ds.num_samples=2 \
-          model.test_ds.batch_size=2 \
-          trainer.max_epochs=1 \
-          trainer.max_steps=1 \
-          model.language_model.pretrained_model_name=facebook/bart-base \
-          model.dataset.version_2_with_negative=false \
-          trainer.precision=16 \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          exp_manager=null
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
-  L2_Parallel_BART_Question-Answering_SQUAD_v2_0:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          cd examples/nlp/question_answering && \
-          python question_answering.py \
-          model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \
-          model.dataset.use_cache=false \
-          model.dataset.check_if_answer_in_context=false \
-          model.train_ds.batch_size=2 \
-          model.train_ds.num_samples=2 \
-          model.validation_ds.batch_size=2 \
-          model.validation_ds.num_samples=2 \
-          trainer.max_epochs=1 \
-          trainer.max_steps=1 \
-          model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \
-          model.language_model.pretrained_model_name=facebook/bart-base \
-          model.dataset.version_2_with_negative=true \
-          trainer.precision=16 \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          exp_manager=null
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
-  # L2: Parallel GPT2 Question-Answering SQUAD v1.1 & v2.0
-  L2_Parallel_GPT2_Question-Answering_SQUAD_v1_1:
+  # L2: Duplex Text Normalization
+  L2_Duplex_Text_Normalization_with_Tarred_dataset:
     needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
+    runs-on: self-hosted-azure
     timeout-minutes: 10
     container:
       image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
@@ -2140,68 +1433,30 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v4
       - run: |
-          cd examples/nlp/question_answering && \
-          python question_answering.py \
-          model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \
-          model.dataset.use_cache=false \
-          model.dataset.check_if_answer_in_context=false \
-          model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \
-          model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \
-          model.train_ds.batch_size=2 \
-          model.train_ds.num_samples=2 \
-          model.validation_ds.batch_size=2 \
-          model.validation_ds.num_samples=2 \
-          model.test_ds.num_samples=2 \
-          model.test_ds.batch_size=2 \
-          trainer.max_epochs=1 \
-          trainer.max_steps=1 \
-          model.language_model.pretrained_model_name=gpt2 \
-          model.dataset.version_2_with_negative=false \
-          trainer.precision=16 \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          exp_manager=null
+          cd examples/nlp/duplex_text_normalization && \
+          python duplex_text_normalization_train.py \
+          data.validation_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv \
+          mode=tn \
+          lang=en \
+          tagger_model.do_training=false \
+          decoder_model.transformer=t5-small \
+          data.validation_ds.batch_size=2 \
+          data.train_ds.use_cache=false \
+          data.validation_ds.use_cache=false \
+          data.test_ds.batch_size=2 \
+          data.train_ds.decoder_data_augmentation=false \
+          data.train_ds.num_workers=2 \
+          decoder_trainer.devices=[0,1] \
+          decoder_trainer.accelerator="gpu" \
+          data.train_ds.use_tarred_dataset=true \
+          +decoder_trainer.fast_dev_run=true \
+          decoder_exp_manager.create_checkpoint_callback=false \
+          data.train_ds.tar_metadata_file=/home/TestData/nlp/duplex_text_norm/tarred_small/metadata.json \
+          data.test_ds.use_cache=false \
+          data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv
       - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
         if: "failure()"
 
-  L2_Parallel_GPT2_Question-Answering_SQUAD_v2_0:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          cd examples/nlp/question_answering && \
-          python question_answering.py \
-          model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \
-          model.dataset.use_cache=false \
-          model.dataset.check_if_answer_in_context=false \
-          model.train_ds.batch_size=2 \
-          model.train_ds.num_samples=2 \
-          model.validation_ds.batch_size=2 \
-          model.validation_ds.num_samples=2 \
-          trainer.max_epochs=1 \
-          trainer.max_steps=1 \
-          model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \
-          model.language_model.pretrained_model_name=gpt2 \
-          model.dataset.version_2_with_negative=true \
-          trainer.precision=16 \
-          trainer.devices=1 \
-          trainer.accelerator="gpu" \
-          exp_manager=null
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
 
   # L2: Intent and Slot Classification Tasks
   L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification:
@@ -2528,132 +1783,6 @@ jobs:
       - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
         if: "failure()"
 
-  L2_Parallel_NLP_Examples2_Punctuation_Capitalization_2GPUs_with_DistilBERT_Finetuning_on_other_data:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          cd examples/nlp/token_classification && \
-          output_dir="$(mktemp -d -p "$(pwd)")" && \
-          tmp_data_dir="$(mktemp -d -p "$(pwd)")" && \
-          cp /home/TestData/nlp/token_classification_punctuation/*.txt "${tmp_data_dir}"/ && \
-          python punctuation_capitalization_train_evaluate.py \
-          model.train_ds.use_tarred_dataset=false \
-          model.train_ds.ds_item="${tmp_data_dir}" \
-          model.validation_ds.ds_item="${tmp_data_dir}" \
-          model.test_ds.ds_item="${tmp_data_dir}" \
-          model.language_model.pretrained_model_name=distilbert-base-uncased \
-          +model.train_ds.use_cache=false \
-          +model.validation_ds.use_cache=false \
-          +model.test_ds.use_cache=false \
-          trainer.devices=[0,1] \
-          trainer.accelerator="gpu" \
-          trainer.strategy=ddp \
-          trainer.max_epochs=1 \
-          +exp_manager.explicit_log_dir="${output_dir}" \
-          +do_testing=true && \
-          tmp_data_dir_2="$(mktemp -d -p "$(pwd)")" && \
-          mv "${tmp_data_dir}"/* "${tmp_data_dir_2}" && \
-          rm -rf "${tmp_data_dir}" && \
-          python punctuation_capitalization_train_evaluate.py \
-          model.train_ds.use_tarred_dataset=false \
-          model.train_ds.ds_item="${tmp_data_dir_2}" \
-          model.validation_ds.ds_item="${tmp_data_dir_2}" \
-          model.test_ds.ds_item="${tmp_data_dir_2}" \
-          pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \
-          +model.train_ds.use_cache=false \
-          +model.validation_ds.use_cache=false \
-          +model.test_ds.use_cache=false \
-          trainer.devices=[0,1] \
-          trainer.accelerator="gpu" \
-          trainer.strategy=ddp \
-          trainer.max_epochs=1 \
-          exp_manager=null && \
-          rm -rf /workspace/NeMo/examples/nlp/token_classification/nemo_experiments \
-            "${tmp_data_dir_2}" \
-            "${output_dir}"
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
-  # Punctuation & Capitalization tarred dataset:
-  Punctuation_Capitalization_tarred_dataset_create_and_use_tarred_dataset:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          data_dir="$(mktemp -d -p "$(pwd)")" && \
-          cp -r /home/TestData/nlp/token_classification_punctuation/*.txt \
-            /home/TestData/nlp/token_classification_punctuation/wmt_wiki_10000 \
-            "${data_dir}"/ && \
-          usual_data=${data_dir}/wmt_wiki_10000 && \
-          output_dir="$(mktemp -d -p "$(pwd)")" && \
-          tarred_data=${output_dir}/train_tarred && \
-          tokens_in_batch=2000 && \
-          max_seq_length=512 && \
-          lm_model=distilbert-base-uncased && \
-          python examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py \
-            --text ${usual_data}/input.txt \
-            --labels ${usual_data}/labels.txt \
-            --output_dir ${tarred_data} \
-            --tokens_in_batch ${tokens_in_batch} \
-            --max_seq_length 512 \
-            --lines_per_dataset_fragment 2000 \
-            --num_batches_per_tarfile 5 \
-            --tar_file_prefix punctuation_capitalization \
-            --tokenizer_name ${lm_model} \
-            --use_fast_tokenizer \
-            --pad_label O \
-            --n_jobs 3 && \
-          echo "Number of tarred files in dataset:" && \
-          ls ${tarred_data}/*.tar | wc -l && \
-          echo "Label id files in dataset:" && \
-          ls ${tarred_data}/*.csv && \
-          metadata_file=${tarred_data}/metadata.punctuation_capitalization.tokens${tokens_in_batch}.max_seq_length${max_seq_length}.${lm_model}.json && \
-          python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \
-          model.validation_ds.ds_item="${data_dir}" \
-          model.test_ds.ds_item="${data_dir}" \
-          model.train_ds.ds_item=${tarred_data} \
-          model.language_model.pretrained_model_name=${lm_model} \
-          model.train_ds.use_tarred_dataset=true \
-          model.train_ds.tar_metadata_file=${metadata_file} \
-          +model.train_ds.use_cache=false \
-          +model.validation_ds.use_cache=false \
-          +model.test_ds.use_cache=false \
-          trainer.devices=[0,1] \
-          trainer.accelerator="gpu" \
-          trainer.strategy=ddp \
-          trainer.max_epochs=1 \
-          +exp_manager.explicit_log_dir=${output_dir}/output && \
-          rm -rf "${output_dir}" "${data_dir}"
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
   # Punctuation_Capitalization_Different_ways_of_passing_labels_to_model
   Punctuation_Capitalization_Using_model-common_datasets_parameters-label_vocab_dir:
     needs: [cicd-test-container-setup]
     runs-on: self-hosted-azure
@@ -2899,38 +2028,6 @@ jobs:
       - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
         if: "failure()"
 
-  # L2: Entity Linking
-  L2_Entity_Linking_Self_Alignment_Pretraining_BERT:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          cd examples/nlp/entity_linking && \
-          python self_alignment_pretraining.py \
-          project_dir=. \
-          trainer.val_check_interval=3 \
-          model.raw_data=None \
-          model.train_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_train_pairs.tsv \
-          model.validation_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_validation_pairs.tsv \
-          model.train_ds.batch_size=8 \
-          model.validation_ds.batch_size=8 \
-          exp_manager.exp_dir=null
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
 
   # TODO: remove +model.optim.capturable=True when Pytorch fix: https://github.com/pytorch/pytorch/pull/81858
   # is in the release container
@@ -3689,7 +2786,7 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v4
       - run: |
-          python examples/nlp/language_modeling/megatron_bert_pretraining.py \
+          NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
           trainer.devices=2 \
           trainer.accelerator=gpu \
           trainer.log_every_n_steps=1 \
@@ -3697,7 +2794,8 @@
           trainer.limit_val_batches=2 \
           trainer.accumulate_grad_batches=1 \
           trainer.max_steps=10 \
-          trainer.precision=16 \
+          trainer.precision=bf16 \
+          model.megatron_amp_O2=True \
           trainer.gradient_clip_val=1.0 \
           exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
           model.pipeline_model_parallel_size=2 \
@@ -3718,7 +2816,7 @@
           model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
           model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
 
-          python examples/nlp/language_modeling/megatron_bert_pretraining.py \
+          NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
           trainer.devices=2 \
           trainer.accelerator=gpu \
           trainer.log_every_n_steps=1 \
@@ -3726,7 +2824,8 @@
           trainer.limit_val_batches=2 \
           trainer.accumulate_grad_batches=1 \
           trainer.max_steps=20 \
-          trainer.precision=16 \
+          trainer.precision=bf16 \
+          model.megatron_amp_O2=True \
          trainer.gradient_clip_val=1.0 \
           exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
           exp_manager.resume_if_exists=True \
@@ -3771,7 +2870,7 @@ jobs:
      - name: Checkout repository
        uses: actions/checkout@v4
      - run: |
-          python examples/nlp/language_modeling/megatron_bert_pretraining.py \
+          NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
           trainer.devices=2 \
           trainer.accelerator=gpu \
           trainer.log_every_n_steps=1 \
@@ -3779,7 +2878,8 @@
           trainer.limit_val_batches=2 \
           trainer.accumulate_grad_batches=1 \
           trainer.max_steps=10 \
-          trainer.precision=16 \
+          trainer.precision=bf16 \
+          model.megatron_amp_O2=True \
           trainer.gradient_clip_val=1.0 \
           exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
           model.tensor_model_parallel_size=2 \
@@ -3801,7 +2901,7 @@
           model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
           model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
 
-          python examples/nlp/language_modeling/megatron_bert_pretraining.py \
+          NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
           trainer.devices=2 \
           trainer.accelerator=gpu \
           trainer.log_every_n_steps=1 \
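Note on the `NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0` prefixes added in the hunks above: these environment variables gate Transformer Engine's fused and flash attention kernels, and pinning them off keeps the Megatron-BERT CI runs on the same attention path across container updates. A minimal sketch of the same invocation outside the workflow (script path and variable names are taken from this diff; the Hydra overrides shown are illustrative, not the full CI argument list):

```python
# Sketch: run the Megatron-BERT pretraining example with Transformer Engine's
# fused and flash attention kernels disabled, mirroring the CI jobs above.
import os
import subprocess

env = dict(os.environ)
env["NVTE_FUSED_ATTN"] = "0"   # disable TE fused attention kernels
env["NVTE_FLASH_ATTN"] = "0"   # disable TE flash attention kernels

subprocess.run(
    [
        "python",
        "examples/nlp/language_modeling/megatron_bert_pretraining.py",
        "trainer.devices=2",
        "trainer.precision=bf16",
        "model.megatron_amp_O2=True",
    ],
    env=env,
    check=True,
)
```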
@@ -3809,7 +2909,8 @@
           trainer.limit_val_batches=2 \
           trainer.accumulate_grad_batches=1 \
           trainer.max_steps=20 \
-          trainer.precision=16 \
+          trainer.precision=bf16 \
+          model.megatron_amp_O2=True \
           trainer.gradient_clip_val=1.0 \
           exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
           exp_manager.resume_if_exists=True \
@@ -3854,7 +2955,7 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v4
       - run: |
-          NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
+          NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
           trainer.devices=2 \
           trainer.accelerator=gpu \
           trainer.log_every_n_steps=1 \
@@ -3862,7 +2963,6 @@
           trainer.limit_val_batches=2 \
           trainer.accumulate_grad_batches=1 \
           trainer.max_steps=10 \
-          trainer.precision=32 \
           trainer.gradient_clip_val=1.0 \
           exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
           model.mcore_bert=True \
@@ -3885,7 +2985,7 @@
           model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
           model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
 
-          NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
+          NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
           trainer.devices=2 \
           trainer.accelerator=gpu \
           trainer.log_every_n_steps=1 \
@@ -3893,7 +2993,6 @@
           trainer.limit_val_batches=2 \
           trainer.accumulate_grad_batches=1 \
           trainer.max_steps=20 \
-          trainer.precision=32 \
           trainer.gradient_clip_val=1.0 \
           exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
           exp_manager.resume_if_exists=True \
@@ -4226,6 +3325,7 @@
           trainer.precision=16 \
           trainer.gradient_clip_val=1.0 \
           exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
+          model.megatron_amp_O2=False \
           model.tensor_model_parallel_size=2 \
           model.optim.name=fused_adam \
           model.optim.lr=2e-4 \
@@ -4235,7 +3335,6 @@
           model.max_position_embeddings=128 \
           model.encoder_seq_length=128 \
           model.data.seq_length=128 \
-          model.normalization=rmsnorm \
           model.bias=False \
           model.bias_activation_fusion=False \
           model.bias_dropout_add_fusion=False \
@@ -4262,6 +3361,7 @@
           trainer.gradient_clip_val=1.0 \
           exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
           exp_manager.resume_if_exists=True \
+          model.megatron_amp_O2=False \
           model.tensor_model_parallel_size=2 \
           model.optim.name=fused_adam \
           model.optim.lr=2e-4 \
@@ -4271,7 +3371,6 @@
           model.max_position_embeddings=128 \
           model.encoder_seq_length=128 \
           model.data.seq_length=128 \
-          model.normalization=rmsnorm \
           model.bias=False \
           model.bias_activation_fusion=False \
           model.bias_dropout_add_fusion=False \
@@ -4317,7 +3416,6 @@
           trainer.limit_val_batches=2 \
           trainer.accumulate_grad_batches=1 \
           trainer.max_steps=3 \
-          trainer.precision=16 \
           trainer.gradient_clip_val=1.0 \
           exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
           model.tensor_model_parallel_size=2 \
@@ -4331,7 +3429,6 @@
           model.data.seq_length=128 \
           model.position_embedding_type=rope \
           model.rotary_percentage=0.5 \
-          model.normalization=rmsnorm \
           model.bias=False \
           model.bias_activation_fusion=False \
           model.bias_dropout_add_fusion=False \
@@ -4355,7 +3452,6 @@
           # trainer.limit_val_batches=1 \
           # trainer.accumulate_grad_batches=1 \
           # trainer.max_steps=6 \
-          # trainer.precision=16 \
           # trainer.gradient_clip_val=1.0 \
           # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
           # exp_manager.resume_if_exists=True \
@@ -4508,7 +3604,6 @@
           trainer.limit_val_batches=2 \
           trainer.accumulate_grad_batches=1 \
           trainer.max_steps=3 \
-          trainer.precision=16 \
           trainer.gradient_clip_val=1.0 \
           exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
           model.tensor_model_parallel_size=2 \
@@ -4521,7 +3616,6 @@
           model.encoder_seq_length=128 \
           model.data.seq_length=128 \
           model.position_embedding_type=alibi \
-          model.normalization=rmsnorm \
           model.bias=False \
           model.bias_activation_fusion=False \
           model.bias_dropout_add_fusion=False \
@@ -4545,7 +3639,6 @@
           #trainer.limit_val_batches=1 \
           #trainer.accumulate_grad_batches=1 \
           #trainer.max_steps=6 \
-          #trainer.precision=16 \
           #trainer.gradient_clip_val=1.0 \
           #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
           #exp_manager.resume_if_exists=True \
@@ -4605,7 +3698,6 @@
           trainer.limit_val_batches=2 \
           trainer.accumulate_grad_batches=1 \
           trainer.max_steps=3 \
-          trainer.precision=16 \
           trainer.gradient_clip_val=1.0 \
           exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
           model.tensor_model_parallel_size=2 \
@@ -4618,7 +3710,6 @@
           model.encoder_seq_length=128 \
           model.data.seq_length=128 \
           model.position_embedding_type=kerple \
-          model.normalization=rmsnorm \
           model.bias=False \
           model.bias_activation_fusion=False \
           model.bias_dropout_add_fusion=False \
@@ -4880,7 +3971,7 @@
           python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
           trainer.devices=1 \
           trainer.num_nodes=1 \
-          trainer.precision=32 \
+          trainer.precision=bf16 \
           trainer.max_steps=4 \
           trainer.val_check_interval=4 \
           trainer.enable_checkpointing=False \
@@ -5994,102 +5085,6 @@ jobs:
       - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
         if: "failure()"
 
-  # L2: Megatron T5 GLUE/XNLI Finetuning
-  # TODO(Oktai15): update it in 1.8.0 version
-  L2_Megatron_T5_GLUE_RTE:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \
-          trainer.devices=1 \
-          trainer.accelerator=gpu \
-          trainer.log_every_n_steps=1 \
-          trainer.val_check_interval=1 \
-          +trainer.limit_val_batches=2 \
-          +trainer.limit_test_batches=2 \
-          trainer.accumulate_grad_batches=1 \
-          trainer.max_steps=2 \
-          trainer.precision=16 \
-          exp_manager.exp_dir=examples/nlp/language_modeling/t5_glue_results \
-          model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \
-          model.pipeline_model_parallel_size=1 \
-          model.pipeline_model_parallel_split_rank=0 \
-          model.data.train_ds.task_name=rte \
-          model.data.train_ds.global_batch_size=4 \
-          model.data.train_ds.micro_batch_size=2 \
-          model.data.validation_ds.global_batch_size=2 \
-          model.data.validation_ds.micro_batch_size=2 \
-          model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \
-          model.data.validation_ds.task_name=rte \
-          model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/dev_ci.tsv
-
-          rm -rf examples/nlp/language_modeling/t5_glue_results
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
-  L2_Megatron_T5_GLUE_XNLI:
-    needs: [cicd-test-container-setup]
-    runs-on: self-hosted-azure-gpus-1
-    timeout-minutes: 10
-    container:
-      image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-      options:
-        # --user 0:128
-        --device=/dev/nvidia0
-        --gpus all
-        --shm-size=8g
-        --env TRANSFORMERS_OFFLINE=0
-        --env HYDRA_FULL_ERROR=1
-        --volume /mnt/datadrive/TestData:/home/TestData
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - run: |
-          python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \
-          -cn megatron_t5_config_finetune_glue_xnli \
-          trainer.devices=1 \
-          trainer.accelerator=gpu \
-          trainer.log_every_n_steps=1 \
-          trainer.val_check_interval=1 \
-          +trainer.limit_val_batches=2 \
-          +trainer.limit_test_batches=2 \
-          trainer.accumulate_grad_batches=1 \
-          trainer.max_steps=2 \
-          trainer.precision=16 \
-          exp_manager.exp_dir=examples/nlp/language_modeling/t5_xnli_results \
-          model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \
-          model.pipeline_model_parallel_size=1 \
-          model.pipeline_model_parallel_split_rank=0 \
-          model.data.train_ds.global_batch_size=4 \
-          model.data.train_ds.micro_batch_size=2 \
-          model.data.validation_ds.global_batch_size=2 \
-          model.data.validation_ds.micro_batch_size=2 \
-          model.data.test_ds.global_batch_size=2 \
-          model.data.test_ds.micro_batch_size=2 \
-          model.data.train_ds.task_name=rte \
-          model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \
-          model.data.validation_ds.task_name=xnli \
-          model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv \
-          model.data.test_ds.task_name=xnli \
-          model.data.test_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv
-
-          rm -rf examples/nlp/language_modeling/t5_xnli_results
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
 
   L2_Megatron_T5_PEFT_Lora_TP2:
     needs: [cicd-test-container-setup]
@@ -6179,8 +5174,10 @@
           trainer.max_steps=10 \
           trainer.limit_val_batches=7 \
           trainer.val_check_interval=10 \
+          trainer.precision=16 \
           exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
           model.mcore_gpt=True \
+          model.megatron_amp_O2=False \
           model.data.data_impl=mock \
           model.data.data_prefix=[]
       - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
         if: "failure()"
@@ -6543,23 +5540,7 @@
       - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3
       - L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference
      - L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference
-      - L2_Dialogue_Classification_Intent_and_slot_classification_using_SGDQA
-      - L2_Dialogue_Classification_Intent_and_slot_classification_using_IntentSlotClassificationModel
-      - L2_Dialogue_Classification_Intent_classification_using_ZeroShotIntentModel
-      - L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel
-      - L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel_BART_Classifier
-      - L2_Dialogue_Classification_Design_Intent_classification_using_DialogueNearestNeighbourModel
-      - L2_Dialogue_Generation_Dialogue_Answer_Extender_using_DialogueS2SGenerationModel
-      - L2_Dialogue_Generation_Dialogue_SGD_Based_Answer_Extender_using_DialogueS2SGenerationModel
-      - L2_COPY_Dialogue_Answer_Extender_using_DialogueGPTGenerationModel
       - L2_Duplex_Text_Normalization_with_Tarred_dataset
-      - L2_BERT_Text_Classification_with_BERT_Test
-      - L2_Parallel_BERT_Question-Answering_SQUAD_v1_1
-      - L2_Parallel_BERT_Question-Answering_SQUAD_v2_0
-      - L2_Parallel_BART_Question-Answering_SQUAD_v1_1
-      - L2_Parallel_BART_Question-Answering_SQUAD_v2_0
-      - L2_Parallel_GPT2_Question-Answering_SQUAD_v1_1
-      - L2_Parallel_GPT2_Question-Answering_SQUAD_v2_0
       - L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification
       - L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification
       - L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test
@@ -6567,13 +5548,10 @@
       - L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1
       - L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification
       - L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation
-      - L2_Parallel_NLP_Examples2_Punctuation_Capitalization_2GPUs_with_DistilBERT_Finetuning_on_other_data
-      - Punctuation_Capitalization_tarred_dataset_create_and_use_tarred_dataset
       - Punctuation_Capitalization_Using_model-common_datasets_parameters-label_vocab_dir
       - Punctuation_Capitalization_inference_Restore_punctuation_and_capitalization_in_long_text
       - L2_Pretraining_BERT_pretraining_from_Text
       - L2_Pretraining_BERT_from_Preprocessed
-      - L2_Entity_Linking_Self_Alignment_Pretraining_BERT
       - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN
       - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN
       - L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation
@@ -6613,8 +5591,6 @@
       - L2_Megatron_T5_Eval
       - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2
       - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2
-      - L2_Megatron_T5_GLUE_RTE
-      - L2_Megatron_T5_GLUE_XNLI
      - L2_Megatron_T5_PEFT_Lora_TP2
       - L2_Megatron_Mock_Data_Generation_MockGPTDataset
       - L2_Megatron_Mock_Data_Generation_MockT5Dataset
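Most of the precision churn in the workflow above follows one rule: tests exercising the mcore path move from `trainer.precision=16`/`32` to `bf16` paired with `model.megatron_amp_O2=True`, while non-mcore GPT tests pin `megatron_amp_O2=False` explicitly. Since all of these flags are plain Hydra/OmegaConf dot-list overrides, the pairing can be sanity-checked in isolation; a small sketch (config keys come from the hunks above, the base config is a stand-in, not NeMo's real schema):

```python
# Sketch: the trainer.precision / model.megatron_amp_O2 pairing used by the
# CI jobs above, expressed as OmegaConf dot-list overrides.
from omegaconf import OmegaConf

base = OmegaConf.create(
    {"trainer": {"precision": 16}, "model": {"megatron_amp_O2": False}}
)
overrides = OmegaConf.from_dotlist(
    ["trainer.precision=bf16", "model.megatron_amp_O2=True"]
)
cfg = OmegaConf.merge(base, overrides)
assert cfg.trainer.precision == "bf16" and cfg.model.megatron_amp_O2
```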
diff --git a/README.rst b/README.rst
index 0b05bd0390f8..6008e71f6a75 100644
--- a/README.rst
+++ b/README.rst
@@ -247,7 +247,7 @@ Use this installation mode if you want the latest released version.
 .. code-block:: bash
 
     apt-get update && apt-get install -y libsndfile1 ffmpeg
-    pip install Cython
+    pip install Cython packaging
     pip install nemo_toolkit['all']
 
 Depending on the shell used, you may need to use ``"nemo_toolkit[all]"`` instead in the above command.
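The `packaging` pin added next to Cython here (and in the two hunks below) presumably covers source builds whose setup scripts need `packaging` at build time; without it, the `nemo_toolkit` install can fail before any wheel is built. A quick preflight check, as a sketch (the two module names are the ones from this hunk):

```python
# Sketch: verify the build-time prerequisites from the install snippet above
# are importable before attempting the nemo_toolkit install.
import importlib.util
import sys

missing = [m for m in ("Cython", "packaging") if importlib.util.find_spec(m) is None]
if missing:
    sys.exit(f"missing build prerequisites: {', '.join(missing)} (pip install them first)")
print("build prerequisites present")
```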
@@ -272,7 +272,7 @@ Use this installation mode if you want the version from a particular GitHub bran
 .. code-block:: bash
 
     apt-get update && apt-get install -y libsndfile1 ffmpeg
-    pip install Cython
+    pip install Cython packaging
     python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[all]
 
@@ -310,7 +310,7 @@ To install NeMo on Mac with Apple M-Series GPU:
     conda install -c conda-forge pynini
 
     # install Cython manually
-    pip install cython
+    pip install cython packaging
 
     # clone the repo and install in development mode
     git clone https://github.com/NVIDIA/NeMo
diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml
index a59a2628cd2f..acb499f18ffb 100644
--- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml
+++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml
@@ -80,6 +80,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling parameters
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml
index 8f8f7e40e39a..8dd978bb00e4 100644
--- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml
+++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml
@@ -78,6 +78,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling params
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml
index 69b21b496ddd..9f199c2dd488 100644
--- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml
+++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml
@@ -85,6 +85,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling parameters
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml
index 8fd096525e74..c7f83216aa0b 100644
--- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml
+++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml
@@ -84,6 +84,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling params
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
diff --git a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml
index b0965b580d5b..6f356ce91caa 100644
--- a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml
+++ b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml
@@ -90,6 +90,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling parameters
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
diff --git a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml
index 9c144d22edec..870bb0190c03 100644
--- a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml
+++ b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml
@@ -88,6 +88,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling params
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
diff --git a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml
index 69e4546b77a7..3fc91cc1e436 100644
--- a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml
+++ b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml
@@ -87,6 +87,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling parameters
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
diff --git a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml
index ea98d13e62da..e99ba69df57a 100644
--- a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml
+++ b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml
@@ -85,6 +85,7 @@ model:
     feat_out: -1 # you may set it if you need different output size other than the default d_model
     n_layers: 17
     d_model: 512
+    use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules
 
     # Sub-sampling params
     subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
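Each FastConformer config in this and the neighboring files gains the same `use_bias: True` line under the encoder section, which spells out the previously implicit default for the feed-forward, MHA, and convolution modules. As a sketch of how such a YAML knob is read (loading via OmegaConf is the standard pattern for these example configs; the path below is one of the files patched above):

```python
# Sketch: load one of the patched FastConformer configs and inspect the new
# encoder.use_bias flag. Set it to False to drop the bias terms.
from omegaconf import OmegaConf

cfg = OmegaConf.load(
    "examples/asr/conf/fastconformer/hybrid_transducer_ctc/"
    "fastconformer_hybrid_transducer_ctc_bpe.yaml"
)
print("encoder use_bias:", cfg.model.encoder.use_bias)  # True after this change
```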
a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml index 2fab24fa6373..3e3d2bf6788e 100644 --- a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml +++ b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml @@ -88,6 +88,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 18 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling params subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml index 4d5f4dbcbd06..5f6c37288ae9 100644 --- a/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml +++ b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml @@ -90,6 +90,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling parameters subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/conf/ssl/fastconformer/fast-conformer.yaml b/examples/asr/conf/ssl/fastconformer/fast-conformer.yaml index 47ad5aa458ca..6e7b5e107629 100644 --- a/examples/asr/conf/ssl/fastconformer/fast-conformer.yaml +++ b/examples/asr/conf/ssl/fastconformer/fast-conformer.yaml @@ -79,6 +79,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling params subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index 1763c2035805..902677425f2a 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -163,9 +163,7 @@ class TranscriptionConfig: # Decoding strategy for RNNT models # enable CUDA graphs for transcription - rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig( - fused_batch_size=-1, greedy=GreedyBatchedRNNTInferConfig(use_cuda_graph_decoder=True) - ) + rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig(fused_batch_size=-1) # Decoding strategy for AED models multitask_decoding: MultiTaskDecodingConfig = MultiTaskDecodingConfig() diff --git a/examples/asr/transcribe_speech_parallel.py b/examples/asr/transcribe_speech_parallel.py index df2f31072851..446e40714460 100644 --- a/examples/asr/transcribe_speech_parallel.py +++ b/examples/asr/transcribe_speech_parallel.py @@ -101,10 +101,8 @@ class ParallelTranscriptionConfig: use_cer: bool = False # decoding strategy for RNNT models - # enable CUDA graphs for transcription - rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig( - fused_batch_size=-1, greedy=GreedyBatchedRNNTInferConfig(use_cuda_graph_decoder=True) - ) + # Double check whether fused_batch_size=-1 is right + rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig(fused_batch_size=-1) # decoder type: ctc or rnnt, can be used to switch between CTC and RNNT 
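With CUDA-graph greedy decoding now on by default (see the rnnt_decoding.py and rnnt_greedy_decoding.py hunks further down), the transcription scripts no longer opt in explicitly. A minimal sketch of opting back out through the same dataclasses, for environments where graph capture is unavailable:

```python
from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig
from nemo.collections.asr.parts.submodules.rnnt_greedy_decoding import GreedyBatchedRNNTInferConfig

# Mirrors the removed default with the flag inverted; useful on older drivers
# or stream setups where CUDA graph capture fails.
rnnt_decoding = RNNTDecodingConfig(
    fused_batch_size=-1,
    greedy=GreedyBatchedRNNTInferConfig(use_cuda_graph_decoder=False),
)
```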
decoder for Hybrid RNNT/CTC models decoder_type: Optional[str] = None diff --git a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml index bc66ae717ebb..4eef38e715d4 100644 --- a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml @@ -5,7 +5,7 @@ trainer: devices: 1 num_nodes: 1 accelerator: gpu - precision: 16 + precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False @@ -41,7 +41,7 @@ exp_manager: model: # model parallelism - mcore_bert: False + mcore_bert: True micro_batch_size: 4 global_batch_size: 8 tensor_model_parallel_size: 1 @@ -85,7 +85,7 @@ model: fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 grad_div_ar_fusion: False @@ -158,4 +158,4 @@ model: name: CosineAnnealing warmup_steps: 500 constant_steps: 50000 - min_lr: 2e-5 \ No newline at end of file + min_lr: 2e-5 diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 0295f96db838..5118b4171ef2 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -9,7 +9,7 @@ trainer: devices: 1 num_nodes: 1 accelerator: gpu - precision: 16 + precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False @@ -55,7 +55,7 @@ exp_manager: model: # use GPTModel from megatron.core - mcore_gpt: False + mcore_gpt: True # specify micro_batch_size, global_batch_size, and model parallelism # gradient accumulation will be done automatically based on data_parallel_size @@ -120,7 +120,7 @@ model: fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 # Fusion diff --git a/nemo/collections/asr/losses/ssl_losses/contrastive.py b/nemo/collections/asr/losses/ssl_losses/contrastive.py index bab691913c0a..16a70925ac9b 100644 --- a/nemo/collections/asr/losses/ssl_losses/contrastive.py +++ b/nemo/collections/asr/losses/ssl_losses/contrastive.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from math import ceil + import torch import torch.nn.functional as F from torch import nn @@ -25,8 +27,7 @@ class ContrastiveLoss(Loss): @property def input_types(self): - """Input types definitions for Contrastive. 
- """ + """Input types definitions for Contrastive.""" return { "spectrograms": NeuralType(("B", "D", "T"), SpectrogramType()), "spec_masks": NeuralType(("B", "D", "T"), SpectrogramType()), @@ -147,13 +148,17 @@ def sample_negatives(self, y, num): @typecheck() def forward(self, spectrograms, spec_masks, decoder_outputs, decoder_lengths=None): - spec_in = spectrograms.transpose(-2, -1) + targets = spectrograms.transpose(-2, -1) masks = spec_masks.transpose(-2, -1) - targets = spec_in # BxTxC + diff = int(ceil(targets.shape[1] / decoder_outputs.shape[1]) * decoder_outputs.shape[1]) - targets.shape[1] + + if diff > 0: + targets = F.pad(targets, (0, 0, 0, diff)) + masks = F.pad(masks, (0, 0, 0, diff)) - targets = targets.reshape(targets.shape[0], targets.shape[1] // self.combine_time_steps, -1) - masks = masks.reshape(targets.shape[0], targets.shape[1], -1) + targets = targets.reshape(targets.shape[0], decoder_outputs.shape[1], -1) + masks = masks.reshape(targets.shape[0], decoder_outputs.shape[1], -1) if self.quantized_targets: if self.store_ids: @@ -198,7 +203,8 @@ def forward(self, spectrograms, spec_masks, decoder_outputs, decoder_lengths=Non if self.sample_from_non_masked: # sample from all steps in utterance negatives, _ = self.sample_negatives( - targets.transpose(0, 1), targets_masked_only.size(0), # TxBxC # T' + targets.transpose(0, 1), + targets_masked_only.size(0), # TxBxC # T' ) else: # only sample from masked steps in utterance @@ -239,7 +245,8 @@ def forward(self, spectrograms, spec_masks, decoder_outputs, decoder_lengths=Non elif self.sample_from_non_masked: # sample from all steps in batch negatives, _ = self.sample_negatives( - targets.reshape(targets.shape[0] * targets.shape[1], -1), targets_masked_only.size(0), # BTxC + targets.reshape(targets.shape[0] * targets.shape[1], -1), + targets_masked_only.size(0), # BTxC ) # T' else: # only sample from masked steps diff --git a/nemo/collections/asr/models/clustering_diarizer.py b/nemo/collections/asr/models/clustering_diarizer.py index 533f276c0018..93913a43c1b5 100644 --- a/nemo/collections/asr/models/clustering_diarizer.py +++ b/nemo/collections/asr/models/clustering_diarizer.py @@ -74,10 +74,10 @@ def get_available_model_names(class_name): class ClusteringDiarizer(torch.nn.Module, Model, DiarizationMixin): """ - Inference model Class for offline speaker diarization. - This class handles required functionality for diarization : Speech Activity Detection, Segmentation, - Extract Embeddings, Clustering, Resegmentation and Scoring. - All the parameters are passed through config file + Inference model Class for offline speaker diarization. + This class handles required functionality for diarization : Speech Activity Detection, Segmentation, + Extract Embeddings, Clustering, Resegmentation and Scoring. 
+ All the parameters are passed through config file """ def __init__(self, cfg: Union[DictConfig, Any], speaker_model=None): @@ -137,7 +137,10 @@ def _init_speaker_model(self, speaker_model=None): Initialize speaker embedding model with model name or path passed through config """ if speaker_model is not None: - self._speaker_model = speaker_model + if self._cfg.device is None and torch.cuda.is_available(): + self._speaker_model = speaker_model.to(torch.device('cuda')) + else: + self._speaker_model = speaker_model else: model_path = self._cfg.diarizer.speaker_embeddings.model_path if model_path is not None and model_path.endswith('.nemo'): @@ -158,7 +161,6 @@ def _init_speaker_model(self, speaker_model=None): self._speaker_model = EncDecSpeakerLabelModel.from_pretrained( model_name=model_path, map_location=self._cfg.device ) - self.multiscale_args_dict = parse_scale_configs( self._diarizer_params.speaker_embeddings.parameters.window_length_in_sec, self._diarizer_params.speaker_embeddings.parameters.shift_length_in_sec, @@ -171,7 +173,9 @@ def _setup_vad_test_data(self, manifest_vad_input): 'sample_rate': self._cfg.sample_rate, 'batch_size': self._cfg.get('batch_size'), 'vad_stream': True, - 'labels': ['infer',], + 'labels': [ + 'infer', + ], 'window_length_in_sec': self._vad_window_length_in_sec, 'shift_length_in_sec': self._vad_shift_length_in_sec, 'trim_silence': False, @@ -192,8 +196,8 @@ def _setup_spkr_test_data(self, manifest_file): def _run_vad(self, manifest_file): """ - Run voice activity detection. - Get log probability of voice activity detection and smoothes using the post processing parameters. + Run voice activity detection. + Get log probability of voice activity detection and smoothes using the post processing parameters. Using generated frame level predictions generated manifest file for later speaker embedding extraction. input: manifest_file (str) : Manifest file containing path to audio file and label as infer @@ -338,7 +342,7 @@ def _perform_speech_activity_detection(self): def _extract_embeddings(self, manifest_file: str, scale_idx: int, num_scales: int): """ This method extracts speaker embeddings from segments passed through manifest_file - Optionally you may save the intermediate speaker embeddings for debugging or any use. + Optionally you may save the intermediate speaker embeddings for debugging or any use. 
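The hunk above moves an externally supplied speaker model onto the GPU when the config leaves the device unset; the same fallback in isolation (helper name hypothetical):

```python
import torch

def place_speaker_model(model: torch.nn.Module, cfg_device):
    # Move to CUDA only when no device was pinned in the config and a GPU
    # exists; otherwise trust the caller's placement.
    if cfg_device is None and torch.cuda.is_available():
        return model.to(torch.device('cuda'))
    return model
```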
""" logging.info("Extracting embeddings for Diarization") self._setup_spkr_test_data(manifest_file) diff --git a/nemo/collections/asr/models/label_models.py b/nemo/collections/asr/models/label_models.py index 23ab5469e60c..071c53417ae2 100644 --- a/nemo/collections/asr/models/label_models.py +++ b/nemo/collections/asr/models/label_models.py @@ -136,7 +136,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): if 'loss' in cfg: cfg_eval_loss = copy.deepcopy(cfg.loss) - if 'angular' in cfg.loss._target_: + if '_target_' in cfg.loss and 'angular' in cfg.loss._target_: OmegaConf.set_struct(cfg, True) with open_dict(cfg): cfg.decoder.angular = True @@ -341,7 +341,8 @@ def forward_for_export(self, processed_signal, processed_signal_len): @typecheck() def forward(self, input_signal, input_signal_length): processed_signal, processed_signal_len = self.preprocessor( - input_signal=input_signal, length=input_signal_length, + input_signal=input_signal, + length=input_signal_length, ) if self.spec_augmentation is not None and self.training: @@ -627,7 +628,9 @@ def batch_inference(self, manifest_filepath, batch_size=32, sample_rate=16000, d dataset = AudioToSpeechLabelDataset(manifest_filepath=manifest_filepath, labels=None, featurizer=featurizer) dataloader = torch.utils.data.DataLoader( - dataset=dataset, batch_size=batch_size, collate_fn=dataset.fixed_seq_collate_fn, + dataset=dataset, + batch_size=batch_size, + collate_fn=dataset.fixed_seq_collate_fn, ) logits = [] diff --git a/nemo/collections/asr/modules/conformer_encoder.py b/nemo/collections/asr/modules/conformer_encoder.py index b9642b3ea5dc..22c0300e8b82 100644 --- a/nemo/collections/asr/modules/conformer_encoder.py +++ b/nemo/collections/asr/modules/conformer_encoder.py @@ -118,6 +118,8 @@ class ConformerEncoder(NeuralModule, StreamingEncoder, Exportable, AccessMixin): Defaults to None. conv_dual_mode (bool): specifies if convolution should be dual mode when dual_offline mode is being used. When enables, the left half of the convolution kernel would get masked in streaming cases. Defaults to False + use_bias (bool): Use bias in all Linear and Conv1d layers from each ConformerLayer to improve activation flow and stabilize training of huge models. + Defaults to True. dropout (float): the dropout rate used in all layers except the attention layers Defaults to 0.1. dropout_pre_encoder (float): the dropout rate used before the encoder @@ -282,6 +284,7 @@ def __init__( conv_kernel_size=31, conv_norm_type='batch_norm', conv_context_size=None, + use_bias=True, dropout=0.1, dropout_pre_encoder=0.1, dropout_emb=0.1, @@ -356,7 +359,9 @@ def __init__( if reduction and reduction_factor > 1: assert reduction_position >= -1 and reduction_position < n_layers self.reduction_subsampling = SubsamplingReductionModule( - reduction=reduction, d_model=d_model, reduction_factor=reduction_factor, + reduction=reduction, + d_model=d_model, + reduction_factor=reduction_factor, ) self.reduction_position = reduction_position else: @@ -424,6 +429,7 @@ def __init__( pos_bias_u=pos_bias_u, pos_bias_v=pos_bias_v, att_context_size=self.att_context_size, + use_bias=use_bias, ) self.layers.append(layer) @@ -804,15 +810,15 @@ def setup_streaming_params( max_context: int = 10000, ): """ - This function sets the needed values and parameters to perform streaming. The configuration would be stored in self.streaming_cfg. - The streaming configuration is needed to simulate streaming inference. 
- - Args: - chunk_size (int): overrides the chunk size - shift_size (int): overrides the shift size for chunks - left_chunks (int): overrides the number of left chunks visible to each chunk - max_context (int): the value used for the cache size of last_channel layers if left context is set to infinity (-1) - Defaults to -1 (means feat_out is d_model) + This function sets the needed values and parameters to perform streaming. The configuration would be stored in self.streaming_cfg. + The streaming configuration is needed to simulate streaming inference. + + Args: + chunk_size (int): overrides the chunk size + shift_size (int): overrides the shift size for chunks + left_chunks (int): overrides the number of left chunks visible to each chunk + max_context (int): the value used for the cache size of last_channel layers if left context is set to infinity (-1) + Defaults to -1 (means feat_out is d_model) """ streaming_cfg = CacheAwareStreamingConfig() @@ -903,12 +909,19 @@ def get_initial_cache_state(self, batch_size=1, dtype=torch.float32, device=None create_tensor = torch.zeros last_time_cache_size = self.conv_context_size[0] cache_last_channel = create_tensor( - (len(self.layers), batch_size, self.streaming_cfg.last_channel_cache_size, self.d_model,), + ( + len(self.layers), + batch_size, + self.streaming_cfg.last_channel_cache_size, + self.d_model, + ), device=device, dtype=dtype, ) cache_last_time = create_tensor( - (len(self.layers), batch_size, self.d_model, last_time_cache_size), device=device, dtype=dtype, + (len(self.layers), batch_size, self.d_model, last_time_cache_size), + device=device, + dtype=dtype, ) if max_dim > 0: cache_last_channel_len = torch.randint( @@ -934,7 +947,6 @@ def change_attention_model( update_config: bool = True, device: torch.device = None, ): - """ Update the self_attention_model which changes the positional encoding and attention layers. @@ -1053,7 +1065,7 @@ def change_attention_model( def change_subsampling_conv_chunking_factor(self, subsampling_conv_chunking_factor: int): """ - Update the conv_chunking_factor (int) + Update the conv_chunking_factor (int) Default is 1 (auto) Set it to -1 (disabled) or to a specific value (power of 2) if you OOM in the conv subsampling layers @@ -1098,7 +1110,9 @@ def _update_adapter_cfg_input_dim(self, cfg: DictConfig): cfg = adapter_utils.update_adapter_cfg_input_dim(self, cfg, module_dim=self.d_model) return cfg - def get_accepted_adapter_types(self,) -> Set[type]: + def get_accepted_adapter_types( + self, + ) -> Set[type]: types = super().get_accepted_adapter_types() if len(types) == 0: diff --git a/nemo/collections/asr/parts/submodules/conformer_modules.py b/nemo/collections/asr/parts/submodules/conformer_modules.py index aed6cc16245c..efd23ef44628 100644 --- a/nemo/collections/asr/parts/submodules/conformer_modules.py +++ b/nemo/collections/asr/parts/submodules/conformer_modules.py @@ -56,6 +56,8 @@ class ConformerLayer(torch.nn.Module, AdapterModuleMixin, AccessMixin): conv_kernel_size (int): kernel size for depthwise convolution in convolution module dropout (float): dropout probabilities for linear layers dropout_att (float): dropout probabilities for attention distributions + use_bias (bool): Apply bias to all Linear and Conv1d layers from each ConformerLayer to improve activation flow and stabilize training of huge models. + Defaults to True. 
""" def __init__( @@ -75,6 +77,7 @@ def __init__( pos_bias_u=None, pos_bias_v=None, att_context_size=[-1, -1], + use_bias=True, ): super(ConformerLayer, self).__init__() @@ -84,7 +87,7 @@ def __init__( # first feed forward module self.norm_feed_forward1 = LayerNorm(d_model) - self.feed_forward1 = ConformerFeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout) + self.feed_forward1 = ConformerFeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout, use_bias=use_bias) # convolution module self.norm_conv = LayerNorm(d_model) @@ -93,6 +96,7 @@ def __init__( kernel_size=conv_kernel_size, norm_type=conv_norm_type, conv_context_size=conv_context_size, + use_bias=use_bias, ) # multi-headed self-attention module @@ -107,6 +111,7 @@ def __init__( pos_bias_u=pos_bias_u, pos_bias_v=pos_bias_v, max_cache_len=MHA_max_cache_len, + use_bias=use_bias, ) elif self_attention_model == 'rel_pos_local_attn': self.self_attn = RelPositionMultiHeadAttentionLongformer( @@ -120,10 +125,15 @@ def __init__( global_tokens=global_tokens, global_tokens_spacing=global_tokens_spacing, global_attn_separate=global_attn_separate, + use_bias=use_bias, ) elif self_attention_model == 'abs_pos': self.self_attn = MultiHeadAttention( - n_head=n_heads, n_feat=d_model, dropout_rate=dropout_att, max_cache_len=MHA_max_cache_len + n_head=n_heads, + n_feat=d_model, + dropout_rate=dropout_att, + max_cache_len=MHA_max_cache_len, + use_bias=use_bias, ) else: raise ValueError( @@ -133,7 +143,7 @@ def __init__( # second feed forward module self.norm_feed_forward2 = LayerNorm(d_model) - self.feed_forward2 = ConformerFeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout) + self.feed_forward2 = ConformerFeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout, use_bias=use_bias) self.dropout = nn.Dropout(dropout) self.norm_out = LayerNorm(d_model) @@ -280,16 +290,25 @@ class ConformerConvolution(nn.Module): pointwise_activation (str): name of the activation function to be used for the pointwise conv. Note that Conformer uses a special key `glu_` which is treated as the original default from the paper. + use_bias (bool): Use bias in all Linear and Conv1d layers improve activation flow and stabilize training of huge models. 
+ Defaults to True """ def __init__( - self, d_model, kernel_size, norm_type='batch_norm', conv_context_size=None, pointwise_activation='glu_' + self, + d_model, + kernel_size, + norm_type='batch_norm', + conv_context_size=None, + pointwise_activation='glu_', + use_bias=True, ): super(ConformerConvolution, self).__init__() assert (kernel_size - 1) % 2 == 0 self.d_model = d_model self.kernel_size = kernel_size self.norm_type = norm_type + self.use_bias = use_bias if conv_context_size is None: conv_context_size = (kernel_size - 1) // 2 @@ -305,7 +324,12 @@ def __init__( dw_conv_input_dim = d_model self.pointwise_conv1 = nn.Conv1d( - in_channels=d_model, out_channels=d_model * 2, kernel_size=1, stride=1, padding=0, bias=True + in_channels=d_model, + out_channels=d_model * 2, + kernel_size=1, + stride=1, + padding=0, + bias=self.use_bias, ) self.depthwise_conv = CausalConv1D( @@ -315,7 +339,7 @@ def __init__( stride=1, padding=conv_context_size, groups=dw_conv_input_dim, - bias=True, + bias=self.use_bias, ) if norm_type == 'batch_norm': @@ -334,7 +358,12 @@ def __init__( self.activation = Swish() self.pointwise_conv2 = nn.Conv1d( - in_channels=dw_conv_input_dim, out_channels=d_model, kernel_size=1, stride=1, padding=0, bias=True + in_channels=dw_conv_input_dim, + out_channels=d_model, + kernel_size=1, + stride=1, + padding=0, + bias=self.use_bias, ) def forward(self, x, pad_mask=None, cache=None): @@ -370,31 +399,34 @@ def forward(self, x, pad_mask=None, cache=None): return x, cache def reset_parameters_conv(self): - pw1_max = pw2_max = self.d_model ** -0.5 - dw_max = self.kernel_size ** -0.5 + pw1_max = pw2_max = self.d_model**-0.5 + dw_max = self.kernel_size**-0.5 with torch.no_grad(): nn.init.uniform_(self.pointwise_conv1.weight, -pw1_max, pw1_max) - nn.init.uniform_(self.pointwise_conv1.bias, -pw1_max, pw1_max) nn.init.uniform_(self.pointwise_conv2.weight, -pw2_max, pw2_max) - nn.init.uniform_(self.pointwise_conv2.bias, -pw2_max, pw2_max) nn.init.uniform_(self.depthwise_conv.weight, -dw_max, dw_max) - nn.init.uniform_(self.depthwise_conv.bias, -dw_max, dw_max) + if self.use_bias: + nn.init.uniform_(self.pointwise_conv1.bias, -pw1_max, pw1_max) + nn.init.uniform_(self.pointwise_conv2.bias, -pw2_max, pw2_max) + nn.init.uniform_(self.depthwise_conv.bias, -dw_max, dw_max) class ConformerFeedForward(nn.Module): """ feed-forward module of Conformer model. + use_bias (bool): Apply bias to all Linear and Conv1d layers to improve activation flow and stabilize training of huge models.
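Dropping biases removes one vector per linear or conv layer, which matters mainly at large scale. A quick parameter count for a single feed-forward linear, assuming d_model=512 and d_ff=4*d_model as in the configs above:

```python
import torch.nn as nn

with_bias = nn.Linear(512, 2048, bias=True)
without_bias = nn.Linear(512, 2048, bias=False)

print(sum(p.numel() for p in with_bias.parameters()))     # 1050624 = 512*2048 + 2048
print(sum(p.numel() for p in without_bias.parameters()))  # 1048576 = 512*2048
```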
""" - def __init__(self, d_model, d_ff, dropout, activation=Swish()): + def __init__(self, d_model, d_ff, dropout, activation=Swish(), use_bias=True): super(ConformerFeedForward, self).__init__() self.d_model = d_model self.d_ff = d_ff - self.linear1 = nn.Linear(d_model, d_ff) + self.use_bias = use_bias + self.linear1 = nn.Linear(d_model, d_ff, bias=self.use_bias) self.activation = activation self.dropout = nn.Dropout(p=dropout) - self.linear2 = nn.Linear(d_ff, d_model) + self.linear2 = nn.Linear(d_ff, d_model, bias=self.use_bias) def forward(self, x): x = self.linear1(x) @@ -404,10 +436,11 @@ def forward(self, x): return x def reset_parameters_ff(self): - ffn1_max = self.d_model ** -0.5 - ffn2_max = self.d_ff ** -0.5 + ffn1_max = self.d_model**-0.5 + ffn2_max = self.d_ff**-0.5 with torch.no_grad(): nn.init.uniform_(self.linear1.weight, -ffn1_max, ffn1_max) - nn.init.uniform_(self.linear1.bias, -ffn1_max, ffn1_max) nn.init.uniform_(self.linear2.weight, -ffn2_max, ffn2_max) - nn.init.uniform_(self.linear2.bias, -ffn2_max, ffn2_max) + if self.use_bias: + nn.init.uniform_(self.linear1.bias, -ffn1_max, ffn1_max) + nn.init.uniform_(self.linear2.bias, -ffn2_max, ffn2_max) diff --git a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py index a7f57c82279a..74204cf73d8e 100644 --- a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py @@ -394,7 +394,17 @@ def forward( if decoder_lengths is None: logging.warning(_DECODER_LENGTHS_NONE_WARNING, mode=logging_mode.ONCE) - decoder_lengths = torch.tensor([decoder_output.shape[1]], dtype=torch.long).expand(decoder_output.shape[0]) + decoder_lengths = torch.tensor( + [decoder_output.shape[1]], dtype=torch.long, device=decoder_output.device + ).expand(decoder_output.shape[0]) + + # GreedyCTCInfer::forward(), by accident, works with + # decoder_lengths on either CPU or GPU when decoder_output is + # on GPU. For the sake of backwards compatibility, we also + # allow decoder_lengths to be on the CPU device. In this case, + # we simply copy the decoder_lengths from CPU to GPU. If both + # tensors are already on the same device, this is a no-op. + decoder_lengths = decoder_lengths.to(decoder_output.device) if decoder_output.ndim == 2: hypotheses = self._greedy_decode_labels_batched(decoder_output, decoder_lengths) diff --git a/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py index 93cef4d4138e..aa49435ded16 100644 --- a/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py @@ -37,7 +37,7 @@ def create_outer_for_loop_kernel(): """ - Creates a kernel that evaluates whether or not to enter the for loop body. + Creates a kernel that evaluates whether or not to enter the for loop body. Effectively substitutes for `for time_idx in range(trip_count)` such that that for loop can run on a GPU. """ @@ -171,8 +171,10 @@ def _reinitialize(self, max_time, batch_size, encoder_output, encoder_output_len # Always create a new stream, because the per-thread default stream disallows stream capture to a graph. 
stream_for_graph = torch.cuda.Stream(self.device) - with torch.cuda.stream(stream_for_graph), torch.inference_mode(), torch.cuda.graph( - self.graph, stream=stream_for_graph + with ( + torch.cuda.stream(stream_for_graph), + torch.inference_mode(), + torch.cuda.graph(self.graph, stream=stream_for_graph, capture_error_mode="thread_local"), ): # This is failing... self.f = torch.zeros( diff --git a/nemo/collections/asr/parts/submodules/multi_head_attention.py b/nemo/collections/asr/parts/submodules/multi_head_attention.py index 6a866a617f35..19d713405953 100644 --- a/nemo/collections/asr/parts/submodules/multi_head_attention.py +++ b/nemo/collections/asr/parts/submodules/multi_head_attention.py @@ -55,21 +55,23 @@ class MultiHeadAttention(nn.Module): n_head (int): number of heads n_feat (int): size of the features dropout_rate (float): dropout rate + use_bias (bool): whether to apply bias in linear and conv layers """ - def __init__(self, n_head, n_feat, dropout_rate, max_cache_len=0): + def __init__(self, n_head, n_feat, dropout_rate, max_cache_len=0, use_bias=True): """Construct an MultiHeadedAttention object.""" super(MultiHeadAttention, self).__init__() self.cache_drop_size = None + self.use_bias = use_bias assert n_feat % n_head == 0 # We assume d_v always equals d_k self.d_k = n_feat // n_head self.s_d_k = math.sqrt(self.d_k) self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) + self.linear_q = nn.Linear(n_feat, n_feat, bias=use_bias) + self.linear_k = nn.Linear(n_feat, n_feat, bias=use_bias) + self.linear_v = nn.Linear(n_feat, n_feat, bias=use_bias) + self.linear_out = nn.Linear(n_feat, n_feat, bias=use_bias) self.dropout = nn.Dropout(p=dropout_rate) self._max_cache_len = max_cache_len @@ -161,11 +163,18 @@ class RelPositionMultiHeadAttention(MultiHeadAttention): n_head (int): number of heads n_feat (int): size of the features dropout_rate (float): dropout rate + use_bias (bool): whether to apply bias in linear and conv layers of MultiHeadAttention """ - def __init__(self, n_head, n_feat, dropout_rate, pos_bias_u, pos_bias_v, max_cache_len=0): + def __init__(self, n_head, n_feat, dropout_rate, pos_bias_u, pos_bias_v, max_cache_len=0, use_bias=True): """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head=n_head, n_feat=n_feat, dropout_rate=dropout_rate, max_cache_len=max_cache_len) + super().__init__( + n_head=n_head, + n_feat=n_feat, + dropout_rate=dropout_rate, + max_cache_len=max_cache_len, + use_bias=use_bias, + ) # linear transformation for positional encoding self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) # these two learnable biases are used in matrix c and matrix d @@ -253,7 +262,7 @@ def forward(self, query, key, value, mask, pos_emb, cache=None): class RelPositionMultiHeadAttentionLongformer(RelPositionMultiHeadAttention): """Multi-Head Attention layer of Transformer-XL with sliding window local+global attention from Longformer.
Partially adapted from allenai (https://github.com/allenai/longformer/blob/master/longformer/sliding_chunks.py) - and huggingface (https://github.com/huggingface/transformers/blob/main/src/transformers/models/longformer/modeling_longformer.py) + and huggingface (https://github.com/huggingface/transformers/blob/main/src/transformers/models/longformer/modeling_longformer.py) Paper: https://arxiv.org/abs/1901.02860 (Transformer-XL), https://arxiv.org/abs/2004.05150 (Longformer) Args: @@ -267,6 +276,7 @@ class RelPositionMultiHeadAttentionLongformer(RelPositionMultiHeadAttention): global_tokens (int): number of tokens to be used for global attention global_tokens_spacing (int): how far apart the global tokens are global_attn_separate (bool): whether the q, k, v layers used for global tokens should be separate + use_bias (bool): whether to apply bias in linear and conv layers of MultiHeadAttention """ def __init__( @@ -281,6 +291,7 @@ def __init__( global_tokens=0, global_tokens_spacing=1, global_attn_separate=False, + use_bias=True, ): """Construct an RelPositionMultiHeadAttentionLongformer object.""" super().__init__( @@ -290,6 +301,7 @@ def __init__( pos_bias_u=pos_bias_u, pos_bias_v=pos_bias_v, max_cache_len=max_cache_len, + use_bias=use_bias, ) self.att_context_size = att_context_size self.global_tokens = global_tokens @@ -297,9 +309,9 @@ def __init__( self.global_attn_separate = global_attn_separate if self.global_attn_separate: - self.global_q = nn.Linear(n_feat, n_feat) - self.global_k = nn.Linear(n_feat, n_feat) - self.global_v = nn.Linear(n_feat, n_feat) + self.global_q = nn.Linear(n_feat, n_feat, bias=use_bias) + self.global_k = nn.Linear(n_feat, n_feat, bias=use_bias) + self.global_v = nn.Linear(n_feat, n_feat, bias=use_bias) def forward(self, query, key, value, pad_mask, pos_emb, cache=None): """Compute Scaled Dot Product Local Attention with rel. positional encoding. 
using overlapping chunks @@ -650,7 +662,8 @@ def _compute_out_global_to_all( global_attn_scores = global_attn_scores.transpose(1, 2) global_attn_scores = global_attn_scores.masked_fill( - is_index_masked.transpose(2, 3), torch.finfo(global_attn_scores.dtype).min, + is_index_masked.transpose(2, 3), + torch.finfo(global_attn_scores.dtype).min, ) global_attn_scores = global_attn_scores.view(batch_size * self.h, max_num_global_attn_indices, seq_len) @@ -747,7 +760,9 @@ def _get_invalid_locations_mask(self, w: int, device: str): return mask.bool().to(device), ending_mask def mask_invalid_locations( - self, input_tensor: torch.Tensor, w: int, + self, + input_tensor: torch.Tensor, + w: int, ): """ Mask locations invalid for the sliding window attention diff --git a/nemo/collections/asr/parts/submodules/rnnt_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_decoding.py index 2416d916ac13..eb4088f84cae 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_decoding.py @@ -331,7 +331,7 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int): preserve_frame_confidence=self.preserve_frame_confidence, confidence_method_cfg=self.confidence_method_cfg, loop_labels=self.cfg.greedy.get('loop_labels', True), - use_cuda_graph_decoder=self.cfg.greedy.get('use_cuda_graph_decoder', False), + use_cuda_graph_decoder=self.cfg.greedy.get('use_cuda_graph_decoder', True), ) else: self.decoding = rnnt_greedy_decoding.GreedyBatchedTDTInfer( @@ -347,7 +347,7 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int): preserve_frame_confidence=self.preserve_frame_confidence, include_duration_confidence=self.tdt_include_duration_confidence, confidence_method_cfg=self.confidence_method_cfg, - use_cuda_graph_decoder=self.cfg.greedy.get('use_cuda_graph_decoder', False), + use_cuda_graph_decoder=self.cfg.greedy.get('use_cuda_graph_decoder', True), ) else: diff --git a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py index fa7a5cc95fec..420e49c96142 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py @@ -592,7 +592,7 @@ def __init__( preserve_frame_confidence: bool = False, confidence_method_cfg: Optional[DictConfig] = None, loop_labels: bool = True, - use_cuda_graph_decoder: bool = False, + use_cuda_graph_decoder: bool = True, ): super().__init__( decoder_model=decoder_model, @@ -2360,7 +2360,7 @@ class GreedyBatchedRNNTInferConfig: tdt_include_duration_confidence: bool = False confidence_method_cfg: Optional[ConfidenceMethodConfig] = field(default_factory=lambda: ConfidenceMethodConfig()) loop_labels: bool = True - use_cuda_graph_decoder: bool = False + use_cuda_graph_decoder: bool = True def __post_init__(self): # OmegaConf.structured ensures that post_init check is always executed @@ -2712,7 +2712,7 @@ def __init__( preserve_frame_confidence: bool = False, include_duration_confidence: bool = False, confidence_method_cfg: Optional[DictConfig] = None, - use_cuda_graph_decoder: bool = False, + use_cuda_graph_decoder: bool = True, ): super().__init__( decoder_model=decoder_model, diff --git a/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py b/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py index 718deb7a409c..c0783c301c44 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py +++ 
b/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py @@ -630,14 +630,18 @@ def _partial_graphs_compile(self): with ( torch.cuda.stream(stream_for_graph), torch.inference_mode(), - torch.cuda.graph(self.separate_graphs.before_outer_loop, stream=stream_for_graph), + torch.cuda.graph( + self.separate_graphs.before_outer_loop, stream=stream_for_graph, capture_error_mode="thread_local" + ), ): self._before_outer_loop() with ( torch.cuda.stream(stream_for_graph), torch.inference_mode(), - torch.cuda.graph(self.separate_graphs.before_inner_loop, stream=stream_for_graph), + torch.cuda.graph( + self.separate_graphs.before_inner_loop, stream=stream_for_graph, capture_error_mode="thread_local" + ), ): self._before_inner_loop_get_decoder_output() self._before_inner_loop_get_joint_output() @@ -645,14 +649,18 @@ def _partial_graphs_compile(self): with ( torch.cuda.stream(stream_for_graph), torch.inference_mode(), - torch.cuda.graph(self.separate_graphs.inner_loop_code, stream=stream_for_graph), + torch.cuda.graph( + self.separate_graphs.inner_loop_code, stream=stream_for_graph, capture_error_mode="thread_local" + ), ): self._inner_loop_code() with ( torch.cuda.stream(stream_for_graph), torch.inference_mode(), - torch.cuda.graph(self.separate_graphs.after_inner_loop, stream=stream_for_graph), + torch.cuda.graph( + self.separate_graphs.after_inner_loop, stream=stream_for_graph, capture_error_mode="thread_local" + ), ): self._after_inner_loop() @@ -660,12 +668,11 @@ def _full_graph_compile(self): """Compile full graph for decoding""" # Always create a new stream, because the per-thread default stream disallows stream capture to a graph. stream_for_graph = torch.cuda.Stream(self.state.device) - stream_for_graph.wait_stream(torch.cuda.default_stream(self.state.device)) self.full_graph = torch.cuda.CUDAGraph() with ( torch.cuda.stream(stream_for_graph), torch.inference_mode(), - torch.cuda.graph(self.full_graph, stream=stream_for_graph), + torch.cuda.graph(self.full_graph, stream=stream_for_graph, capture_error_mode="thread_local"), ): self._before_outer_loop() diff --git a/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py b/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py index 7ad7065e019c..4132c453d570 100644 --- a/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py +++ b/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py @@ -691,14 +691,18 @@ def _partial_graphs_compile(self): with ( torch.cuda.stream(stream_for_graph), torch.inference_mode(), - torch.cuda.graph(self.separate_graphs.before_outer_loop, stream=stream_for_graph), + torch.cuda.graph( + self.separate_graphs.before_outer_loop, stream=stream_for_graph, capture_error_mode="thread_local" + ), ): self._before_outer_loop() with ( torch.cuda.stream(stream_for_graph), torch.inference_mode(), - torch.cuda.graph(self.separate_graphs.before_inner_loop, stream=stream_for_graph), + torch.cuda.graph( + self.separate_graphs.before_inner_loop, stream=stream_for_graph, capture_error_mode="thread_local" + ), ): self._before_inner_loop_get_decoder_output() self._before_inner_loop_get_joint_output() @@ -706,14 +710,18 @@ def _partial_graphs_compile(self): with ( torch.cuda.stream(stream_for_graph), torch.inference_mode(), - torch.cuda.graph(self.separate_graphs.inner_loop_code, stream=stream_for_graph), + torch.cuda.graph( + self.separate_graphs.inner_loop_code, stream=stream_for_graph, capture_error_mode="thread_local" + ), ): self._inner_loop_code() with ( 
torch.cuda.stream(stream_for_graph), torch.inference_mode(), - torch.cuda.graph(self.separate_graphs.after_inner_loop, stream=stream_for_graph), + torch.cuda.graph( + self.separate_graphs.after_inner_loop, stream=stream_for_graph, capture_error_mode="thread_local" + ), ): self._after_inner_loop() @@ -726,7 +734,7 @@ def _full_graph_compile(self): with ( torch.cuda.stream(stream_for_graph), torch.inference_mode(), - torch.cuda.graph(self.full_graph, stream=stream_for_graph), + torch.cuda.graph(self.full_graph, stream=stream_for_graph, capture_error_mode="thread_local"), ): self._before_outer_loop() diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index 9efd6444aecd..978cf023bd81 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -95,7 +95,9 @@ class LhotseDataLoadingConfig: # 4. Optional Lhotse data augmentation. # a. On-the-fly noise/audio mixing. - noise_path: Any | None = None # str | dict where dict can have any of keys: manifest_filepath, tarred_audio_filepaths, cuts_path, shar_path + noise_path: Any | None = ( + None # str | dict where dict can have any of keys: manifest_filepath, tarred_audio_filepaths, cuts_path, shar_path + ) noise_snr: tuple[float, float] = (10.0, 20.0) noise_mix_prob: float = 0.5 # b. On-the-fly 3-way speed perturbation. @@ -114,7 +116,9 @@ class LhotseDataLoadingConfig: cut_into_windows_duration: Optional[float] = None # set this to enable cut_into_windows_hop: Optional[float] = None # III) common options - keep_excessive_supervisions: bool = True # when a cut is truncated in the middle of a supervision, should we keep them. + keep_excessive_supervisions: bool = ( + True # when a cut is truncated in the middle of a supervision, should we keep them. + ) # e. RIR augmentation (synthetic RIR if rir_path is None) # at the moment supports only Lhotse recording manifests, e.g. https://github.com/lhotse-speech/lhotse/blob/master/lhotse/recipes/rir_noise.py rir_enabled: bool = False @@ -130,7 +134,11 @@ class LhotseDataLoadingConfig: def get_lhotse_dataloader_from_config( - config: DictConfig, global_rank: int, world_size: int, dataset: torch.utils.data.Dataset, tokenizer=None, + config: DictConfig, + global_rank: int, + world_size: int, + dataset: torch.utils.data.Dataset, + tokenizer=None, ) -> torch.utils.data.DataLoader: """ Set up a Lhotse training dataloder. @@ -205,7 +213,11 @@ def get_lhotse_dataloader_from_config( # and applying it here (before sampler/dataset) ensures optimal # bucket allocation. if config.perturb_speed: - cuts = CutSet.mux(cuts, cuts.perturb_speed(0.9), cuts.perturb_speed(1.1),) + cuts = CutSet.mux( + cuts, + cuts.perturb_speed(0.9), + cuts.perturb_speed(1.1), + ) # 2.d: truncation/slicing if config.truncate_duration is not None: @@ -291,7 +303,10 @@ def get_lhotse_dataloader_from_config( # object with texts joined by a whitespace so that "regular" dataset classes don't # have to add a special support for multi-supervision cuts. 
sampler = sampler.map( - CutConcatenate(gap=config.concatenate_gap_seconds, duration_factor=config.concatenate_duration_factor,) + CutConcatenate( + gap=config.concatenate_gap_seconds, + duration_factor=config.concatenate_duration_factor, + ) ) if config.db_norm is not None: sampler = sampler.map(partial(_normalize_loudness, db_norm=config.db_norm)) @@ -326,7 +341,10 @@ def get_lhotse_dataloader_from_config( # the meta-data to Dataset, which performs the actual I/O inside its __getitem__ method. dloader_kwargs = dict(dataset=dataset, sampler=sampler) dloader = torch.utils.data.DataLoader( - **dloader_kwargs, batch_size=None, num_workers=config.num_workers, pin_memory=config.pin_memory, + **dloader_kwargs, + batch_size=None, + num_workers=config.num_workers, + pin_memory=config.pin_memory, ) return dloader @@ -377,7 +395,9 @@ class MultimodalSamplingConstraint(SamplingConstraint): def __post_init__(self): self._internal = TokenConstraint( - max_tokens=self.batch_tokens, max_examples=self.batch_size, quadratic_length=self.quadratic_factor, + max_tokens=self.batch_tokens, + max_examples=self.batch_size, + quadratic_length=self.quadratic_factor, ) def add(self, example: Any) -> None: @@ -487,7 +507,13 @@ def maybe_set_cuda_expandable_segments(enabled: bool): warnings.warn( "You have set PYTORCH_CUDA_ALLOC_CONF without expandable_segments:True option. We're setting that option anyway. To disable it, set cuda_expandable_segments=False in NeMo dataloader configuration." ) - torch.cuda.memory._set_allocator_settings("expandable_segments:True") + + try: + torch.cuda.memory._set_allocator_settings("expandable_segments:True") + except RuntimeError: + logging.info( + "Failed to set expandable_segments:True for PyTorch CUDA allocator. You may get training speed improvements if you enable this" + ) def _select_channel(cut, channel_selector: int | str) -> list: diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index caaab2c5d67e..d346a8b14f3b 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -188,7 +188,10 @@ def flatten_frames(self, cap): def tokenize( - texts: Union[str, List[str]], tokenizer: Any, context_length: int, add_extra_token: int, + texts: Union[str, List[str]], + tokenizer: Any, + context_length: int, + add_extra_token: int, ) -> torch.LongTensor: """ Returns the tokenized representation of given input string(s). If the list of tokens exceeds the context @@ -295,7 +298,11 @@ def preprocess_multimodal(sources: dict, multimodal_cfg: dict, cur_token_len: in return sources -def preprocess_llama_2(sources: dict, tokenizer, cfg,) -> Dict: +def preprocess_llama_2( + sources: dict, + tokenizer, + cfg, +) -> Dict: """ Preprocesses sources for the LLaMA 2 model configuration. @@ -379,10 +386,17 @@ def preprocess_llama_2(sources: dict, tokenizer, cfg,) -> Dict: labels = torch.roll(labels, shifts=-1, dims=-1) labels[:, -1] = IGNORE_INDEX - return dict(tokens=tokens, labels=labels,) + return dict( + tokens=tokens, + labels=labels, + ) -def preprocess_v1(sources: dict, tokenizer, cfg,) -> Dict: +def preprocess_v1( + sources: dict, + tokenizer, + cfg, +) -> Dict: """ Preprocesses sources for the Vicuna V1 model configuration. 
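The dataloader hunk above wraps the allocator tweak in try/except because _set_allocator_settings is a private PyTorch API that raises RuntimeError on builds without expandable-segments support; the guard in isolation:

```python
import torch

# Private allocator API, as used in maybe_set_cuda_expandable_segments above;
# failure is non-fatal so older PyTorch builds keep working.
try:
    torch.cuda.memory._set_allocator_settings('expandable_segments:True')
except RuntimeError:
    print('expandable_segments:True is not supported by this PyTorch build')
```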
@@ -462,10 +476,17 @@ def preprocess_v1(sources: dict, tokenizer, cfg,) -> Dict: labels = torch.roll(labels, shifts=-1, dims=-1) labels[:, -1] = IGNORE_INDEX - return dict(tokens=tokens, labels=labels,) + return dict( + tokens=tokens, + labels=labels, + ) -def preprocess_nvgpt(sources: dict, tokenizer, cfg,) -> Dict: +def preprocess_nvgpt( + sources: dict, + tokenizer, + cfg, +) -> Dict: """ Preprocess a given set of conversational sources using nvgpt conversation template @@ -503,9 +524,9 @@ def preprocess_nvgpt(sources: dict, tokenizer, cfg,) -> Dict: if i % 2 == 1: turn['from'] = conv.roles[1] if 'label' not in turn: - turn[ - 'label' - ] = "quality:4,toxicity:0,humor:0,creativity:0,helpfulness:4,correctness:4,coherence:4,complexity:4,verbosity:4" + turn['label'] = ( + "quality:4,toxicity:0,humor:0,creativity:0,helpfulness:4,correctness:4,coherence:4,complexity:4,verbosity:4" + ) value = DEFAULT_LABELS_TOKEN + turn['label'] + '\n' + turn['value'] conv.append_message(turn['from'], value) if not turn["value"]: @@ -567,10 +588,17 @@ def preprocess_nvgpt(sources: dict, tokenizer, cfg,) -> Dict: labels = torch.roll(labels, shifts=-1, dims=-1) labels[:, -1] = IGNORE_INDEX - return dict(tokens=tokens, labels=labels,) + return dict( + tokens=tokens, + labels=labels, + ) -def preprocess_nv_dpo(sources: dict, tokenizer, cfg,) -> Dict: +def preprocess_nv_dpo( + sources: dict, + tokenizer, + cfg, +) -> Dict: """ Preprocess a given set of conversational sources using nvgpt conversation template @@ -666,10 +694,17 @@ def preprocess_nv_dpo(sources: dict, tokenizer, cfg,) -> Dict: labels = torch.roll(labels, shifts=-1, dims=-1) labels[:, -1] = IGNORE_INDEX - return dict(tokens=tokens, labels=labels,) + return dict( + tokens=tokens, + labels=labels, + ) -def preprocess_plain(sources, tokenizer, cfg,) -> Dict: +def preprocess_plain( + sources, + tokenizer, + cfg, +) -> Dict: """ Preprocesses plain text sources (no template) for tokenization and label generation. 
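Each preprocess_* variant above ends with the same next-token shift before returning dict(tokens=..., labels=...). A tiny worked sketch (the IGNORE_INDEX value is assumed):

```python
import torch

IGNORE_INDEX = -100  # assumed sentinel used to mask positions out of the loss

tokens = torch.tensor([[11, 12, 13, 14]])
labels = tokens.clone()
labels = torch.roll(labels, shifts=-1, dims=-1)  # labels[t] = tokens[t + 1]
labels[:, -1] = IGNORE_INDEX                     # last position has no next token

print(tokens.tolist(), labels.tolist())
# [[11, 12, 13, 14]] [[12, 13, 14, -100]]
```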
@@ -717,7 +752,10 @@ def preprocess_plain(sources, tokenizer, cfg,) -> Dict: labels = torch.roll(labels, shifts=-1, dims=-1) labels[:, -1] = IGNORE_INDEX - return dict(tokens=tokens, labels=labels,) + return dict( + tokens=tokens, + labels=labels, + ) class LazySupervisedDataset(Dataset): @@ -870,15 +908,35 @@ def expand2square(pil_img, background_color): sources = copy.deepcopy(sources) if self.conv_template in ["nvgpt", "nv_steerlm"]: - data_dict = preprocess_nvgpt(sources, self.tokenizer, self.multimodal_cfg,) + data_dict = preprocess_nvgpt( + sources, + self.tokenizer, + self.multimodal_cfg, + ) elif self.conv_template == "nv_dpo": - data_dict = preprocess_nv_dpo(sources, self.tokenizer, self.multimodal_cfg,) + data_dict = preprocess_nv_dpo( + sources, + self.tokenizer, + self.multimodal_cfg, + ) elif self.conv_template == "v1": - data_dict = preprocess_v1(sources, self.tokenizer, self.multimodal_cfg,) + data_dict = preprocess_v1( + sources, + self.tokenizer, + self.multimodal_cfg, + ) elif self.conv_template == "llama_2": - data_dict = preprocess_llama_2(sources, self.tokenizer, self.multimodal_cfg,) + data_dict = preprocess_llama_2( + sources, + self.tokenizer, + self.multimodal_cfg, + ) elif self.conv_template == "plain": - data_dict = preprocess_plain(sources, self.tokenizer, self.multimodal_cfg,) + data_dict = preprocess_plain( + sources, + self.tokenizer, + self.multimodal_cfg, + ) else: raise ValueError(f"Conversation template `{self.conv_template}` is not supported in Neva now.") @@ -981,7 +1039,7 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: tokens = batch['tokens'] labels = batch['labels'] - media_type = model_cfg.data.get('media_type') + media_type = model_cfg.data.get('media_type', 'image') if media_type == 'image': media = batch.get('image') elif media_type == 'video': @@ -1048,7 +1106,12 @@ def make_supervised_data_module(tokenizer, model_cfg) -> Dict: ) else: # TODO(yuya): Fix this hard-code for our own CLIP - image_processor = image_transform(crop_size, is_train=False, mean=None, std=None,) + image_processor = image_transform( + crop_size, + is_train=False, + mean=None, + std=None, + ) train_dataset = NevaDataset( tokenizer=tokenizer, diff --git a/nemo/collections/multimodal/modules/stable_diffusion/attention.py b/nemo/collections/multimodal/modules/stable_diffusion/attention.py index f5689c706e2c..f82ede61a02d 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/attention.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/attention.py @@ -111,7 +111,11 @@ def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0, use_te=Fal if use_te: activation = 'gelu' if not glu else 'geglu' # TODO: more parameters to be confirmed, dropout, seq_length - self.net = LayerNormMLP(hidden_size=dim, ffn_hidden_size=inner_dim, activation=activation,) + self.net = LayerNormMLP( + hidden_size=dim, + ffn_hidden_size=inner_dim, + activation=activation, + ) else: norm = nn.LayerNorm(dim) project_in = nn.Sequential(LinearWrapper(dim, inner_dim), nn.GELU()) if not glu else GEGLU(dim, inner_dim) @@ -253,7 +257,7 @@ def __init__( self.query_dim = query_dim self.dim_head = dim_head - self.scale = dim_head ** -0.5 + self.scale = dim_head**-0.5 self.heads = heads self.to_k = LinearWrapper(context_dim, self.inner_dim, bias=False, lora_network_alpha=lora_network_alpha) diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/model.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/model.py 
index 7fc5c208004f..de425e8dba13 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/model.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/model.py @@ -226,7 +226,10 @@ def __init__( # timestep embedding self.temb = nn.Module() self.temb.dense = nn.ModuleList( - [torch.nn.Linear(self.ch, self.temb_ch), torch.nn.Linear(self.temb_ch, self.temb_ch),] + [ + torch.nn.Linear(self.ch, self.temb_ch), + torch.nn.Linear(self.temb_ch, self.temb_ch), + ] ) # downsampling @@ -662,7 +665,11 @@ def __init__(self, factor, in_channels, mid_channels, out_channels, depth=2): ] ) - self.conv_out = nn.Conv2d(mid_channels, out_channels, kernel_size=1,) + self.conv_out = nn.Conv2d( + mid_channels, + out_channels, + kernel_size=1, + ) def forward(self, x): x = self.conv_in(x) diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py index b610f921a22a..b99c692b41a2 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py @@ -102,10 +102,14 @@ class AttentionPool2d(nn.Module): """ def __init__( - self, spacial_dim: int, embed_dim: int, num_heads_channels: int, output_dim: int = None, + self, + spacial_dim: int, + embed_dim: int, + num_heads_channels: int, + output_dim: int = None, ): super().__init__() - self.positional_embedding = nn.Parameter(th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5) + self.positional_embedding = nn.Parameter(th.randn(embed_dim, spacial_dim**2 + 1) / embed_dim**0.5) self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1) self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1) self.num_heads = embed_dim // num_heads_channels @@ -319,7 +323,10 @@ def __init__( self.emb_layers = None self.exchange_temb_dims = False else: - self.emb_layers = nn.Sequential(nn.SiLU(), linear(emb_channels, self.emb_out_channels),) + self.emb_layers = nn.Sequential( + nn.SiLU(), + linear(emb_channels, self.emb_out_channels), + ) self.out_layers = nn.Sequential( normalization(self.out_channels, act="silu", gn_groups=resblock_gn_groups), nn.Dropout(p=dropout), @@ -387,7 +394,12 @@ class AttentionBlock(nn.Module): """ def __init__( - self, channels, num_heads=1, num_head_channels=-1, use_checkpoint=False, use_new_attention_order=False, + self, + channels, + num_heads=1, + num_head_channels=-1, + use_checkpoint=False, + use_new_attention_order=False, ): super().__init__() self.channels = channels @@ -438,7 +450,7 @@ def count_flops_attn(model, _x, y): # We perform two matmuls with the same number of ops. # The first computes the weight matrix, the second computes # the combination of the value vectors. - matmul_ops = 2 * b * (num_spatial ** 2) * c + matmul_ops = 2 * b * (num_spatial**2) * c model.total_ops += th.DoubleTensor([matmul_ops]) @@ -640,7 +652,10 @@ def __init__( if num_attention_blocks is not None: assert len(num_attention_blocks) == len(self.num_res_blocks) assert all( - map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks)),) + map( + lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], + range(len(num_attention_blocks)), + ) ) logging.info( f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. 
" @@ -661,7 +676,9 @@ def __init__( self.predict_codebook_ids = n_embed is not None time_embed_dim = model_channels * 4 self.time_embed = nn.Sequential( - linear(model_channels, time_embed_dim), nn.SiLU(), linear(time_embed_dim, time_embed_dim), + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), ) self.time_embeddings = torch.Tensor(build_timestep_embedding(model_channels, timesteps)) @@ -678,7 +695,9 @@ def __init__( self.label_emb = nn.Sequential( Timestep(model_channels), nn.Sequential( - linear(model_channels, time_embed_dim), nn.SiLU(), linear(time_embed_dim, time_embed_dim), + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), ), ) elif self.num_classes == "sequential": @@ -686,7 +705,9 @@ def __init__( self.adm_in_channels = adm_in_channels self.label_emb = nn.Sequential( nn.Sequential( - linear(adm_in_channels, time_embed_dim), nn.SiLU(), linear(time_embed_dim, time_embed_dim), + linear(adm_in_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), ) ) else: @@ -797,26 +818,28 @@ def __init__( use_scale_shift_norm=use_scale_shift_norm, resblock_gn_groups=resblock_gn_groups, ), - AttentionBlock( - ch, - use_checkpoint=use_checkpoint, - num_heads=num_heads, - num_head_channels=dim_head, - use_new_attention_order=use_new_attention_order, - ) - if not use_spatial_transformer - else SpatialTransformer( - ch, - num_heads, - dim_head, - depth=transformer_depth_middle, - context_dim=context_dim, - disable_self_attn=disable_middle_self_attn, - use_linear=use_linear_in_transformer, - use_checkpoint=use_checkpoint, - use_flash_attention=use_flash_attention, - use_te=self.use_te_fp8, - lora_network_alpha=lora_network_alpha, + ( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) + if not use_spatial_transformer + else SpatialTransformer( + ch, + num_heads, + dim_head, + depth=transformer_depth_middle, + context_dim=context_dim, + disable_self_attn=disable_middle_self_attn, + use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint, + use_flash_attention=use_flash_attention, + use_te=self.use_te_fp8, + lora_network_alpha=lora_network_alpha, + ) ), ResBlock( ch, @@ -1110,9 +1133,15 @@ def te_fp8_key_mapping(self, unet_dict): # norm_to_q.layer_norm_{weight|bias} -> norm.{weight|bias} # norm_to_q.weight -> to_q.weight new_key = key.replace('attn1.norm.', 'attn1.norm_to_q.layer_norm_') - new_key = new_key.replace('attn1.to_q.weight', 'attn1.norm_to_q.weight',) + new_key = new_key.replace( + 'attn1.to_q.weight', + 'attn1.norm_to_q.weight', + ) new_key = new_key.replace('attn2.norm.', 'attn2.norm_to_q.layer_norm_') - new_key = new_key.replace('attn2.to_q.weight', 'attn2.norm_to_q.weight',) + new_key = new_key.replace( + 'attn2.to_q.weight', + 'attn2.norm_to_q.weight', + ) ### LayerNormMLP # ff.net.layer_norm_{weight|bias} -> ff.net.0.{weight|bias} @@ -1201,7 +1230,10 @@ def _load_pretrained_model(self, state_dict, ignore_mismatched_sizes=False, from unexpected_keys = list(set(loaded_keys) - set(expected_keys)) def _find_mismatched_keys( - state_dict, model_state_dict, loaded_keys, ignore_mismatched_sizes, + state_dict, + model_state_dict, + loaded_keys, + ignore_mismatched_sizes, ): mismatched_keys = [] if ignore_mismatched_sizes: @@ -1221,7 +1253,10 @@ def _find_mismatched_keys( if state_dict is not None: # Whole checkpoint mismatched_keys = 
_find_mismatched_keys( - state_dict, model_state_dict, original_loaded_keys, ignore_mismatched_sizes, + state_dict, + model_state_dict, + original_loaded_keys, + ignore_mismatched_sizes, ) error_msgs = self._load_state_dict_into_model(state_dict) return missing_keys, unexpected_keys, mismatched_keys, error_msgs @@ -1316,9 +1351,14 @@ def _forward(self, x, timesteps=None, context=None, y=None, **kwargs): return self.out(h) def forward(self, x, timesteps=None, context=None, y=None, **kwargs): - with transformer_engine.pytorch.fp8_autocast( - enabled=self.use_te_fp8, fp8_recipe=self.fp8_recipe, - ) if self.use_te_fp8 else nullcontext(): + with ( + transformer_engine.pytorch.fp8_autocast( + enabled=self.use_te_fp8, + fp8_recipe=self.fp8_recipe, + ) + if self.use_te_fp8 + else nullcontext() + ): out = self._forward(x, timesteps, context, y, **kwargs) return out @@ -1374,7 +1414,9 @@ def __init__( time_embed_dim = model_channels * 4 self.time_embed = nn.Sequential( - linear(model_channels, time_embed_dim), nn.SiLU(), linear(time_embed_dim, time_embed_dim), + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), ) self.input_blocks = nn.ModuleList( @@ -1476,11 +1518,15 @@ def __init__( elif pool == "attention": assert num_head_channels != -1 self.out = nn.Sequential( - normalization(ch), nn.SiLU(), AttentionPool2d((image_size // ds), ch, num_head_channels, out_channels), + normalization(ch), + nn.SiLU(), + AttentionPool2d((image_size // ds), ch, num_head_channels, out_channels), ) elif pool == "spatial": self.out = nn.Sequential( - nn.Linear(self._feature_size, 2048), nn.ReLU(), nn.Linear(2048, self.out_channels), + nn.Linear(self._feature_size, 2048), + nn.ReLU(), + nn.Linear(2048, self.out_channels), ) elif pool == "spatial_v2": self.out = nn.Sequential( diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/util.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/util.py index 3b446f4a42c3..4ad5b6023c0b 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/util.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/util.py @@ -37,7 +37,7 @@ def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3): if schedule == "linear": - betas = torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2 + betas = torch.linspace(linear_start**0.5, linear_end**0.5, n_timestep, dtype=torch.float64) ** 2 elif schedule == "cosine": timesteps = torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s @@ -162,7 +162,10 @@ def backward(ctx, *output_grads): shallow_copies = [x.view_as(x) for x in ctx.input_tensors] output_tensors = ctx.run_function(*shallow_copies) input_grads = torch.autograd.grad( - output_tensors, ctx.input_tensors + ctx.input_params, output_grads, allow_unused=True, + output_tensors, + ctx.input_tensors + ctx.input_params, + output_grads, + allow_unused=True, ) del ctx.input_tensors del ctx.input_params @@ -312,7 +315,11 @@ def interpolate_fn(x, xp, yp): start_idx = torch.where( torch.eq(x_idx, 0), torch.tensor(1, device=x.device), - torch.where(torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,), + torch.where( + torch.eq(x_idx, K), + torch.tensor(K - 2, device=x.device), + cand_start_idx, + ), ) end_idx = torch.where(torch.eq(start_idx, cand_start_idx), start_idx + 2, start_idx + 1) start_x = torch.gather(sorted_all_x, dim=2, 
index=start_idx.unsqueeze(2)).squeeze(2) @@ -320,7 +327,11 @@ def interpolate_fn(x, xp, yp): start_idx2 = torch.where( torch.eq(x_idx, 0), torch.tensor(0, device=x.device), - torch.where(torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,), + torch.where( + torch.eq(x_idx, K), + torch.tensor(K - 2, device=x.device), + cand_start_idx, + ), ) y_positions_expanded = yp.unsqueeze(0).expand(N, -1, -1) start_y = torch.gather(y_positions_expanded, dim=2, index=start_idx2.unsqueeze(2)).squeeze(2) diff --git a/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py index 98d24802189e..92c56a4c20df 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py @@ -17,6 +17,7 @@ from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueAssistantDataProcessor'] @@ -31,6 +32,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg): data_dir: path to data directory tokenizer: tokenizer object """ + # deprecation warning + deprecated_warning("DialogueAssistantDataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -69,16 +73,15 @@ def open_file(self, filename): @staticmethod def get_continuous_slots(slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids): - """ Extract continuous spans of slot_ids - To accomodate slots with distinct labels for B-label1 and I-label1, + To accommodate slots with distinct labels for B-label1 and I-label1, slot_id = self.bio_slot_ids_to_unified_slot_ids[slot_id] is called to map them both to label1 - + Args: Slot: list of int representing slot of each word token - For instance, 54 54 54 54 54 54 54 54 18 54 44 44 54 46 46 54 12 + For instance, 54 54 54 54 54 54 54 54 18 54 44 44 54 46 46 54 12 Corresponds to "please set an alarm clock for my next meeting with the team at three pm next friday" Except for the empty_slot_id (54 in this case), we hope to extract the continuous spans of tokens, each containing a start position and an exclusive end position @@ -124,7 +127,7 @@ def map_bio_format_slots_to_unified_slots(slots): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} For the assistant dataset, there is no explicit dev set (instead uses the test set as the dev set) Therefore, this function creates a dev set and a new train set from the train set.
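The `get_continuous_slots` docstring above describes collapsing per-word slot ids into spans. A minimal, self-contained sketch of that idea (simplified signature; the real method additionally maps B-/I- ids through `bio_slot_ids_to_unified_slot_ids`, which is omitted here):

```python
from typing import List, Tuple

def continuous_slot_spans(slot_ids: List[int], empty_slot_id: int) -> List[Tuple[int, int, int]]:
    """Collapse per-word slot ids into (slot_id, start, exclusive_end) spans,
    skipping the empty slot. E.g. [54, 18, 18, 54, 44] with empty_slot_id=54
    yields [(18, 1, 3), (44, 4, 5)]."""
    spans = []
    start = None
    for i, sid in enumerate(slot_ids + [empty_slot_id]):  # sentinel flushes the last span
        if start is not None and sid != slot_ids[start]:
            spans.append((slot_ids[start], start, i))
            start = None
        if sid != empty_slot_id and start is None:
            start = i
    return spans
```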
@@ -177,7 +180,11 @@ def get_dialog_examples(self, dataset_split: str): "labels": {"service": intent.split('_')[0], "intent": intent, "slots": slot_to_words}, "label_positions": { "slots": { - slot: {"start": position[0], "exclusive_end": position[1], "slot": slot,} + slot: { + "start": position[0], + "exclusive_end": position[1], + "slot": slot, + } for slot, position in slot_to_start_and_exclusive_end.items() } }, diff --git a/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py index 2a4b21c70535..c41c1f5e04ca 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py @@ -17,6 +17,7 @@ import random from nemo.collections.nlp.data.data_utils.data_preprocessing import DataProcessor +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueDataProcessor'] @@ -40,6 +41,9 @@ class DialogueDataProcessor(DataProcessor): """ def __init__(self): + # deprecation warning + deprecated_warning("DialogueDataProcessor") + raise NotImplementedError() def get_train_examples(self): @@ -58,8 +62,8 @@ def get_test_examples(self): def get_relevant_idxs(dataset_split, n_samples, dev_proportion): """ Obtain indexes for each dataset_split, when train and dev sets are not in separate files - - Args: + + Args: dataset_split: train, dev or test n_samples: total number of samples dev_proportion: value from 1 to 99 that represent proportion of data in dev set diff --git a/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py index 5e58919b7652..56e99c4bcfe9 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py @@ -19,6 +19,7 @@ from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueDesignDataProcessor'] @@ -34,6 +35,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg=None): tokenizer: tokenizer object cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueDesignDataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -50,7 +54,7 @@ def open_csv(self, filename): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} Dev set contains self.cfg.dev_proportion % of samples with the rest going into the train set Test set contains the whole dataset (Dev + Train) as this dataset is small (~100) and primarily used in a zero shot setting diff --git a/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py index 58814a8eee90..67d58ff5d21e 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py @@ -19,13 +19,13 @@ from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from 
nemo.utils.decorators import deprecated_warning __all__ = ['DialogueMellonQADataProcessor'] class DialogueMellonQADataProcessor(DialogueDataProcessor): - """Data Processor for Mellon QA dialogues. - """ + """Data Processor for Mellon QA dialogues.""" def __init__(self, data_dir: str, tokenizer: object, cfg=None): """ @@ -35,6 +35,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg=None): tokenizer: tokenizer object cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueMellonQADataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -51,7 +54,7 @@ def open_csv(self, filename): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} For the Mellon QA dataset, there is no explicit dev set (instead uses the test set as the dev set) Therefore, this function creates a dev set and a new train set from the train set. @@ -82,7 +85,11 @@ def get_dialog_examples(self, dataset_split: str): input_example = { "utterance": utterance, "example_id": i, - "labels": {"response": answer, "fluent_response": well_formed_answer, "passage": passage,}, + "labels": { + "response": answer, + "fluent_response": well_formed_answer, + "passage": passage, + }, } example = DialogueInputExample(input_example) examples.append(example) diff --git a/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py index 78f434c1d5dd..d09960a35d69 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py @@ -19,15 +19,16 @@ from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueMSMarcoDataProcessor'] class DialogueMSMarcoDataProcessor(DialogueDataProcessor): """Data Processor for MS Marco dialogues. (https://github.com/microsoft/MSMARCO-Question-Answering) - Please agree to the Terms of Use before downloading data at - https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz - https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz + Please agree to the Terms of Use before downloading data at + https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz + https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz """ def __init__(self, data_dir: str, tokenizer: object, cfg=None): @@ -39,6 +40,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg=None): debug_mode: reduce number of samples to load in order to increase speed of processing cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueMSMarcoDataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -55,7 +59,7 @@ def open_json(self, filename): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} For the MS Marco dataset, there is no explicit dev set (instead uses the test set as the dev set) Therefore, this function creates a dev set and a new train set from the train set. 
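Several processors touched by this patch (Mellon QA, MS Marco, the assistant set) share the same convention: no explicit dev file, so a dev set is carved out of the train file via `get_relevant_idxs`. A sketch of that split logic under the documented contract (the fixed seed is an assumption made here so train and dev stay disjoint across calls; the real helper may differ):

```python
import random
from typing import List

def relevant_idxs(dataset_split: str, n_samples: int, dev_proportion: int) -> List[int]:
    """Return sample indexes for a split; dev_proportion is a percentage (1-99).
    'test' keeps everything, matching the test-set-doubles-as-dev convention."""
    if dataset_split == "test":
        return list(range(n_samples))
    idxs = list(range(n_samples))
    random.Random(1234).shuffle(idxs)  # deterministic, so train/dev never overlap
    n_dev = n_samples * dev_proportion // 100
    return idxs[:n_dev] if dataset_split == "dev" else idxs[n_dev:]
```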
diff --git a/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py index a78e1973e55f..1d37c26f1c45 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py @@ -28,6 +28,7 @@ from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample from nemo.collections.nlp.data.dialogue.sgd.schema import Schema from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning from nemo.utils.get_rank import is_global_rank_zero __all__ = ['DialogueSGDDataProcessor'] @@ -51,7 +52,7 @@ class DialogueSGDDataProcessor(DialogueDataProcessor): # git clone https://github.com/google-research-datasets/dstc8-schema-guided-dialogue.git ***Data format*** - SGD data comes with a JSON schema file and dialogue files for each dataset split. + SGD data comes with a JSON schema file and dialogue files for each dataset split. In the following we will show an example for a service entry in the schema file. * service_name @@ -70,7 +71,7 @@ class DialogueSGDDataProcessor(DialogueDataProcessor): * result_slots (not used) - In the following we will show an example for a dialogue. + In the following we will show an example for a dialogue. * dialogue_id * services * turns @@ -87,14 +88,18 @@ class DialogueSGDDataProcessor(DialogueDataProcessor): * state * active_intent * requeste_slots - * slot_values + * slot_values * speaker - [USER, SYSTEM] * utterance """ def __init__( - self, data_dir: str, dialogues_example_dir: str, tokenizer: object, cfg=None, + self, + data_dir: str, + dialogues_example_dir: str, + tokenizer: object, + cfg=None, ): """ Constructs DialogueSGDDataProcessor @@ -104,6 +109,9 @@ def __init__( tokenizer: tokenizer object cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueSGDDataProcessor") + self.data_dir = data_dir self.cfg = cfg @@ -213,7 +221,7 @@ def get_labels(self): def get_dialog_examples(self, dataset_split: str) -> List[object]: """ - Loads preprocessed dialogue examples from disk. + Loads preprocessed dialogue examples from disk. Args: dataset_split: dataset split Returns: @@ -260,7 +268,7 @@ def _generate_dialog_examples(self, dataset_split: str, schemas: object, subsamp Returns a list of `InputExample`s of the data splits' dialogues. Args: dataset_split: data split, can be "train", "dev", or "test". - schemas: schema for all services of all datasets + schemas: schema for all services of all datasets subsample: whether to balance postive and negative samples in the dataset Returns: examples: a list of `InputExample`s. 
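The recurring two-line addition throughout this patch calls a small helper from `nemo.utils.decorators`. Its body is not part of this diff; the following is only a plausible minimal sketch of what the call sites rely on:

```python
import warnings

def deprecated_warning(name: str) -> None:
    # Emit a heads-up that `name` is slated for removal; stacklevel=3 points
    # the warning at user code rather than at the constructor that calls this.
    warnings.warn(
        f"{name} is deprecated and will be removed in a future release.",
        DeprecationWarning,
        stacklevel=3,
    )
```

Each deprecated class then opens its `__init__` with `deprecated_warning("ClassName")` before any other setup, exactly as the hunks above and below show.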
@@ -447,9 +455,9 @@ def _create_examples_from_turn( "example_id_num": example_id_num, "utterance": user_utterance, "system_utterance": system_utterance, - "system_slots": {slot["slot"]: slot for slot in system_frame["slots"]} - if system_frame is not None - else None, + "system_slots": ( + {slot["slot"]: slot for slot in system_frame["slots"]} if system_frame is not None else None + ), "system_actions": system_frame["actions"] if system_frame is not None else None, "labels": { "service": service, @@ -464,9 +472,11 @@ def _create_examples_from_turn( for intent in schemas.get_service_schema(service).intents ], "slots": { - slot: schemas.get_service_schema(service).get_categorical_slot_values(slot) - if slot in categorical_slots - else [] + slot: ( + schemas.get_service_schema(service).get_categorical_slot_values(slot) + if slot in categorical_slots + else [] + ) for slot in all_possible_slots }, }, diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py index 0931fe383f94..33d46c308e81 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py @@ -21,12 +21,12 @@ from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset from nemo.core.neural_types import ChannelType, LabelsType, MaskType, NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueBERTDataset', 'DialogueIntentSlotInferenceDataset'] class DialogueBERTDataset(DialogueDataset): - """ Creates a dataset to use for the task of joint intent and slot classification with pretrained model. @@ -37,8 +37,7 @@ class DialogueBERTDataset(DialogueDataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ + """Returns definitions of module output ports.""" return { 'input_ids': NeuralType(('B', 'T'), ChannelType()), 'segment_ids': NeuralType(('B', 'T'), ChannelType()), @@ -57,6 +56,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c tokenizer: tokenizer cfg: config container for dataset """ + # deprecation warning + deprecated_warning("DialogueBERTDataset") + self.cfg = cfg self.all_possible_labels = dialogues_processor.intents self.label_to_label_id = {self.all_possible_labels[i]: i for i in range(len(self.all_possible_labels))} @@ -183,7 +185,7 @@ def get_features( ignore_start_end=False, ): """ - Convert queries (utterance, intent label and slot labels) to BERT input format + Convert queries (utterance, intent label and slot labels) to BERT input format """ all_subtokens = [] @@ -297,7 +299,7 @@ class DialogueIntentSlotInferenceDataset(DialogueBERTDataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: """ - Returns definitions of module output ports. + Returns definitions of module output ports. 
""" return { 'input_ids': NeuralType(('B', 'T'), ChannelType()), @@ -308,6 +310,9 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: } def __init__(self, queries, max_seq_length, tokenizer, do_lower_case): + # deprecation warning + deprecated_warning("DialogueIntentSlotInferenceDataset") + if do_lower_case: queries = [query.lower() for query in queries] diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py index 1ac04a856a89..f89a5013c2ae 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py @@ -21,27 +21,31 @@ from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class DialogueGPTClassificationDataset(DialogueDataset): ''' Designed for classification tasks such as intent/domain classification as well as slot tagging - Dataset Class + Dataset Class 1. Performs Model-dependent (but Data-independent) operations (tokenization etc) 2. This can allow the same model preprocessing for multiple datasources - 3. Users can configurate which labels to use for modelling + 3. Users can configurate which labels to use for modelling (e.g. intent classification, slot filling or both together etc) ''' def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ Constructor + """Constructor Args: dataset_split: dataset split dialogues_processor: Data generator for SGD dialogues tokenizer: tokenizer cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueGPTClassificationDataset") + self.cfg = cfg if self.cfg.target_template == "with_slots" and self.cfg.eval_mode != "generation": @@ -229,19 +233,18 @@ def collate_fn(self, batch): return all_items def __getitem__(self, idx: int): - ''' State how the input and output samples look like This template can be changed - Training example: + Training example: e.g. service: restaurant e.g. service: restaurant e.g. \nintent: set alarm\nslots: (), () Generation example: - e.g. service: + e.g. 
service: ''' ex = self.features[idx].data diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py index 7de02d75c574..8ddbc2e3925e 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py @@ -18,12 +18,13 @@ import torch from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset +from nemo.utils.decorators import deprecated_warning class DialogueGPTGenerationDataset(DialogueDataset): def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ Constructor - Designed for free form generation tasks such as Dialogue Response Generation + """Constructor + Designed for free form generation tasks such as Dialogue Response Generation Args: dataset_split: dataset split @@ -31,6 +32,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c tokenizer: tokenizer cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueGPTGenerationDataset") + self.cfg = cfg self.input_label_type = self.cfg.input_field self.output_label_type = self.cfg.output_field @@ -80,7 +84,7 @@ def format_prompt(self, ex): ''' Formats training prompt based on self.input_field_type - Training example: + Training example: e.g. response: # input_label_type = response e.g. utterance: # input_label_type = utterance e.g. passage: utterance: # input_label_type = passage+utterance @@ -91,7 +95,6 @@ def format_prompt(self, ex): return input_sentence def __getitem__(self, idx: int): - ''' For each example, this function determines the format of input and output sequences based on user-specified conguration. 
This is controlled by model.dataset.input_field and model.dataset.output_field @@ -99,9 +102,9 @@ def __getitem__(self, idx: int): If model.dataset.input_field == response and model.dataset.output_field == fluent_response: Input = "response: " and output = "response: fluent_response: " (with loss calculated from only) If model.dataset.input_field == utterance and model.dataset.output_field == response: - Input = "utterance: " and output = "utterance: response: " (with loss calculated from only) + Input = "utterance: " and output = "utterance: response: " (with loss calculated from only) If model.dataset.input_field == passage+utterance and model.dataset.output_field == response: - Input = "passage: utterance: " and output="passage: utterance: response: " (with loss calculated from only) + Input = "passage: utterance: " and output="passage: utterance: response: " (with loss calculated from only) ''' ex = self.features[idx].data diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py index 8618f2f8c7b4..dc123ca0e3d7 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py @@ -17,6 +17,7 @@ import torch from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueNearestNeighbourDataset'] @@ -33,6 +34,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c dialogues_processor: Data generator for dialogues tokenizer: tokenizer to split text into sub-word tokens """ + # deprecation warning + deprecated_warning("DialogueNearestNeighbourDataset") + self.cfg = cfg self.tokenizer = tokenizer self.raw_features = dialogues_processor.get_dialog_examples(dataset_split) diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py index 78fda55edd2e..df522b74e861 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py @@ -16,12 +16,13 @@ import torch from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset +from nemo.utils.decorators import deprecated_warning class DialogueS2SGenerationDataset(DialogueDataset): def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ Constructor - Designed for free form generation tasks such as Dialogue Response Generation + """Constructor + Designed for free form generation tasks such as Dialogue Response Generation Args: dataset_split: dataset split @@ -29,6 +30,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c tokenizer: tokenizer cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueS2SGenerationDataset") + self.cfg = cfg self.input_label_type = self.cfg.input_field self.output_label_type = self.cfg.output_field @@ -45,7 +49,7 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c @staticmethod def format_actions(prompt_template, actions): """ - Formats actions based on prompt_template + Formats actions based on prompt_template Args: prompt_template: determines whether acts, slot-names, slot-values are necessary 
in formatted actions @@ -118,7 +122,7 @@ def format_prompt(self, ex): ''' Formats training prompt based on self.input_field_type - Training example: + Training example: e.g. response: # input_label_type = response e.g. utterance: # input_label_type = utterance e.g. passage: utterance: # input_label_type = passage+utterance @@ -128,13 +132,12 @@ def format_prompt(self, ex): return input_sentence def __getitem__(self, idx: int): - ''' State how the input and output samples look like This template can be changed - Training example: + Training example: e.g. INPUT - "response: " OUTPUT - "" # input_label_type = response, output_label_type = fluent_response e.g. INPUT - "utterance: " OUTPUT - "" # input_label_type = utterance, output_label_type = response e.g. INPUT - "passage: utterance: " OUTPUT - "" # input_label_type = passage+utterance, output_label_type = response diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py index f2a0f58bcfac..c1308238bea1 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py @@ -23,6 +23,7 @@ from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import GLUEDataset from nemo.core.neural_types import CategoricalValuesType, ChannelType, MaskType, NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueZeroShotIntentDataset'] @@ -36,8 +37,7 @@ class DialogueZeroShotIntentDataset(GLUEDataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ + """Returns definitions of module output ports.""" return { 'input_ids': NeuralType(('B', 'T'), ChannelType()), 'segment_ids': NeuralType(('B', 'T'), ChannelType()), @@ -55,6 +55,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c num_classes: number of classes in the data (should be either 2 or 3, corresponding to labels ['entailment', 'not_entailment'] or ["contradiction", "entailment", "neutral"]) """ + # deprecation warning + deprecated_warning("DialogueZeroShotIntentDataset") + self.cfg = cfg self.tokenizer = tokenizer if self.cfg.num_classes not in [2, 3]: @@ -69,9 +72,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c 'eos_token': tokenizer.eos_token, 'pad_token': tokenizer.pad_token, 'cls_token': tokenizer.cls_token, - 'sep_token_extra': tokenizer.eos_token - if hasattr(tokenizer, 'name') and 'roberta' in tokenizer.name.lower() - else None, + 'sep_token_extra': ( + tokenizer.eos_token if hasattr(tokenizer, 'name') and 'roberta' in tokenizer.name.lower() else None + ), } self.raw_features = dialogues_processor.get_dialog_examples(dataset_split) @@ -128,9 +131,9 @@ def convert_examples_to_features( * True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] The `cls_token_segment_id` defines the segment id associated to the CLS token (0 for BERT, 2 for XLNet) - + The convention in BERT is: - + a. For sequence pairs: * tokens: [CLS] is this jack ##ville ? [SEP] no it is not . [SEP] * type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 @@ -148,9 +151,9 @@ def convert_examples_to_features( For classification tasks, the first vector (corresponding to [CLS]) is used as as the "sentence vector". Note that this only makes sense because the entire model is fine-tuned. 
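The reflowed docstring above walks through the BERT sequence-pair convention. A toy, tokenizer-free sketch of the same layout, reproducing the docstring's worked example:

```python
from typing import List, Tuple

def bert_pair_inputs(tokens_a: List[str], tokens_b: List[str]) -> Tuple[List[str], List[int]]:
    """[CLS] A [SEP] B [SEP], with type_ids 0 over the first segment and 1 over the second."""
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    type_ids = [0] * len(tokens)
    tokens += tokens_b + ["[SEP]"]
    type_ids += [1] * (len(tokens_b) + 1)
    return tokens, type_ids

toks, types = bert_pair_inputs(["is", "this", "jack", "##ville", "?"], ["no", "it", "is", "not", "."])
# types == [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
```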
- + The convention for NMT is: - + a. For sequence pairs: * tokens: is this jack ##ville ? no it is not . * type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 diff --git a/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py index 5d985466ff6c..bbd14f47a651 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py @@ -17,6 +17,7 @@ from nemo.collections.nlp.modules.common import VirtualPromptSource from nemo.core import Dataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['BasePromptLearningDataset'] @@ -41,6 +42,9 @@ def __init__( add_eos: bool = True, for_train: bool = True, ): + # deprecation warning + deprecated_warning("BasePromptLearningDataset") + self.tokenizer = tokenizer self.virtual_prompt_source = virtual_prompt_source self.task_templates = task_templates @@ -72,7 +76,7 @@ def __init__( raise ValueError("Datasets must be a list of dicts or a list of filepath strings") def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits): - """ Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers """ + """Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers""" total_inserted_tokens = 0 for idx in range(len(virtual_token_splits)): @@ -85,7 +89,7 @@ def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits return input_example def _truncate_input(self, truncation_field, input_ids, taskname, doc, total_virtual_tokens=0): - """ Try to truncate input text to fit into the max sequence length """ + """Try to truncate input text to fit into the max sequence length""" logging.info( f"Input greater than max sequence length. Attempting to truncate: '{truncation_field}' in task: '{taskname}'" ) @@ -115,7 +119,7 @@ def _truncate_input(self, truncation_field, input_ids, taskname, doc, total_virt return input_ids def _add_leading_space(self, taskname, field_name, field_text): - """ Add leading space to text if there is a space before it in the template """ + """Add leading space to text if there is a space before it in the template""" prompt_template = self.task_templates[taskname]["prompt_template"] field_text_start = prompt_template.find("{" + field_name + "}") if field_text_start != 0 and prompt_template[field_text_start - 1] == " ": @@ -187,11 +191,11 @@ def pad_taskname_ids(self, taskname_ids): def find_subsequence_location(sequence, subsequence): - """ Finds the start and end index of the first occurance - of a given subsequence within a larger list. Returns - the two indices corresponding to the postition of - the first and last token of the subseqeunce. - Assumes subsequence is known to be in sequence. + """Finds the start and end index of the first occurrence + of a given subsequence within a larger list. Returns + the two indices corresponding to the position of + the first and last token of the subsequence. + Assumes subsequence is known to be in sequence.
""" assert len(sequence) >= len(subsequence), "subsequence too long" diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py index 4b1b4f61d439..11795bd150f1 100755 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py @@ -23,6 +23,7 @@ from nemo.collections.nlp.modules.common.megatron.utils import build_position_ids from nemo.core import Dataset from nemo.utils import AppState, logging +from nemo.utils.decorators import deprecated_warning __all__ = ['GPTPromptLearningDataset'] @@ -30,7 +31,7 @@ class GPTPromptLearningDataset(Dataset): """ The dataset class for prompt-tuning or p-tuning pretrained GPT models. - + Args: data (list[strings], list[dicts]): (1) paths to .jsonl or .json files, (2) dict objects corresponding to each input example tokenizer (tokenizer): Tokenizer from frozen language model @@ -39,7 +40,7 @@ class GPTPromptLearningDataset(Dataset): pseudo_tokens (list[strings]): A list of virtual prompt token placeholders e.g [, , ...] up to max num virtual tokens pad_token_id (int): ID of pad token from tokenizer max_seq_length (int): maximum sequence length for each dataset examples. Examples will either be truncated to fit this length or dropped if they cannot be truncated. - min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. + min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. add_bos (bool): Whether to add a beginning of sentence token to each data example add_eos (bool): Whether to add an end of sentence token to each data example for_train (bool): Whether you're creating a dataset for training or inference @@ -63,6 +64,9 @@ def __init__( cache_data_path: str = None, # the cache file load_cache: bool = True, # whether to load from the cache if it is available ): + # deprecation warning + deprecated_warning("GPTPromptLearningDataset") + self.tokenizer = tokenizer self.virtual_prompt_source = virtual_prompt_source self.task_templates = task_templates @@ -112,9 +116,9 @@ def __init__( def load_data(self, dataset): """ Loads a dataset by filling in the task templates specified in the config file - with the information from each training/inference example. Converts all input - text into token ids. Also replaces the <|VIRTUAL_PROMPT_#|> placeholders in - the task templates with the actual virtual prompt token ids. + with the information from each training/inference example. Converts all input + text into token ids. Also replaces the <|VIRTUAL_PROMPT_#|> placeholders in + the task templates with the actual virtual prompt token ids. 
params: dataset: A list of json objects or a dictionary objects each @@ -241,7 +245,7 @@ def _input_sanity_checks( assert prompt_template[placeholder_start:] == answer_placeholder, "Answer field must be at prompt end" def _insert_text_in_template(self, input_example, prompt_template_fields, doc): - """ Format the input example according to the template """ + """Format the input example according to the template""" for field in prompt_template_fields: if field in doc.keys(): field_text = doc[field] @@ -255,7 +259,7 @@ def _insert_text_in_template(self, input_example, prompt_template_fields, doc): return input_example.strip(" ") def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits): - """ Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers """ + """Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers""" total_inserted_tokens = 0 for idx in range(len(virtual_token_splits)): @@ -270,7 +274,7 @@ def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits def _truncate_input( self, truncation_field, input_ids, taskname, doc, prompt_template, prompt_template_fields, virtual_token_splits ): - """ Try to truncate input text to fit into the max sequence length """ + """Try to truncate input text to fit into the max sequence length""" logging.info( f"Input greater than max sequence length. Attempting to truncate: '{truncation_field}' in task: '{taskname}'" ) @@ -297,8 +301,8 @@ def _truncate_input( return input_ids def _find_answer_start(self, taskname, input_ids, answer_field, doc): - """ Find the token ids corresponding to the answer start, for loss masking purposes. - Assumes the answer is always at the end of the prompt. + """Find the token ids corresponding to the answer start, for loss masking purposes. + Assumes the answer is always at the end of the prompt. 
""" answer_text = doc[answer_field] answer_text = self._add_leading_space(taskname, answer_field, answer_text) @@ -313,7 +317,7 @@ def _find_answer_start(self, taskname, input_ids, answer_field, doc): return answer_start_idx def _add_leading_space(self, taskname, field_name, field_text): - """ Add leading space to text if there is a space before it in the template """ + """Add leading space to text if there is a space before it in the template""" prompt_template = self.task_templates[taskname]["prompt_template"] field_text_start = prompt_template.find("{" + field_name + "}") if field_text_start != 0 and prompt_template[field_text_start - 1] == " ": @@ -331,7 +335,7 @@ def _ceil_to_nearest(self, n, m): return (n + m - 1) // m * m def collate_fn(self, batch, tp_workers=0): - """ Prepares input_ids, labels, loss mask, attention_mask, and position ids for global batch """ + """Prepares input_ids, labels, loss mask, attention_mask, and position ids for global batch""" taskname_ids, input_ids, answer_starts = zip(*batch) # Pad taskname_ids to be the same length for the prompt encoder @@ -380,7 +384,7 @@ def collate_fn(self, batch, tp_workers=0): return input_ids, labels, loss_mask, position_ids, attention_mask, taskname_ids def pad_batch_and_build_loss_mask(self, input_ids, batch_max, answer_starts): - """ Pad input_ids in batch to max batch length while building loss mask """ + """Pad input_ids in batch to max batch length while building loss mask""" batch_loss_masks = [] padded_input_ids = [] for ids, answer_start_idx in zip(input_ids, answer_starts): @@ -410,7 +414,7 @@ def pad_batch_and_build_loss_mask(self, input_ids, batch_max, answer_starts): def inference_collate_fn(self, batch): """ - Used for loading inference data. + Used for loading inference data. 
""" task_id_nums, input_ids, answer_starts = zip(*batch) input_lengths = torch.cuda.LongTensor([len(inputs) for inputs in input_ids]) diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py index 4070098b5e67..87174b69ffc2 100644 --- a/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py @@ -22,10 +22,11 @@ from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset from nemo.collections.nlp.data.question_answering.input_example.qa_bert_input_example import BERTQAInputExample from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class BERTQADataset(QADataset): - """ Creates a Dataset for BERT architecture based Exractive QA """ + """Creates a Dataset for BERT architecture based Exractive QA""" def __init__( self, @@ -41,6 +42,9 @@ def __init__( mode: str = TRAINING_MODE, use_cache: bool = False, ): + # deprecation warning + deprecated_warning("BERTQADataset") + super().__init__( data_file=data_file, processor=processor, tokenizer=tokenizer, mode=mode, num_samples=num_samples ) @@ -92,7 +96,7 @@ def __init__( self.features[i] = BERTQAInputExample(**self.features[i]) def _set_cached_features_filename(self): - """ Creates cache filename using dataset config parameters """ + """Creates cache filename using dataset config parameters""" vocab_size = getattr(self.tokenizer, "vocab_size", 0) self.cached_features_file = ( @@ -110,7 +114,7 @@ def _set_cached_features_filename(self): ) def _convert_examples_to_features(self): - """ Converts loaded examples to features """ + """Converts loaded examples to features""" logging.info(f"Preprocessing data into features.") @@ -161,7 +165,7 @@ def _convert_examples_to_features(self): example.doc_tokens = doc_tokens # the text to tokens step is the slowest step - for (i, token) in enumerate(doc_tokens): + for i, token in enumerate(doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) if token not in text_to_tokens_dict: text_to_tokens_dict[token] = self.tokenizer.text_to_tokens(token) @@ -199,7 +203,7 @@ def _convert_examples_to_features(self): # make compatible for hashing doc_spans = tuple(doc_spans) - for (doc_span_index, doc_span) in enumerate(doc_spans): + for doc_span_index, doc_span in enumerate(doc_spans): tokens = [self.tokenizer.cls_token] + query_tokens + [self.tokenizer.sep_token] segment_ids = [0 for i in range(len(tokens))] diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py index 783b2dd33f31..553f5984952c 100644 --- a/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py @@ -28,14 +28,24 @@ ) from nemo.core.classes import Dataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class QADataset(Dataset): - ''' Abstract base class for QA Datasets with common utility methods ''' + '''Abstract base class for QA Datasets with common utility methods''' def __init__( - self, data_file: str, processor: object, tokenizer: object, mode: str, num_samples: int, **kwargs, + self, + data_file: str, + processor: object, + tokenizer: object, + mode: str, + num_samples: int, + **kwargs, ): + # deprecation warning + deprecated_warning("QADataset") + self.mode = mode self.data_file = 
data_file self.processor = processor @@ -100,7 +110,7 @@ def get_best_span_index(doc_spans, position): best_score = None best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): + for span_index, doc_span in enumerate(doc_spans): end = doc_span.start + doc_span.length - 1 if position < doc_span.start: continue @@ -150,7 +160,7 @@ def get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride): all_doc_tokens: list of all tokens in document max_tokens_for_doc: maximum number of tokens in each doc span doc_stride: stride size which sliding window moves with - + Returns: doc_spans: all possible doc_spans from document """ @@ -179,7 +189,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ doc_span tok_start_position: start position of answer in document tok_end_position: end position of answer in document - + Returns: average distance of doc_span to answer """ @@ -193,7 +203,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ @staticmethod def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode): """ - Filters out doc_spans, which might not be relevant to answering question, + Filters out doc_spans, which might not be relevant to answering question, which can be helpful when document is extremely long leading to many doc_spans with no answers Args: @@ -204,7 +214,7 @@ def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode all: do not filter only_positive: only keep doc_spans containing the answer limited_negative: only keep 10 doc_spans that are nearest to answer - + Returns: doc_spans: doc_spans after filtering """ @@ -282,9 +292,13 @@ def get_doc_tokens_and_offset_from_context_id( @staticmethod def improve_answer_span( - doc_tokens: List[str], input_start: int, input_end: int, tokenizer: object, orig_answer_text: str, + doc_tokens: List[str], + input_start: int, + input_end: int, + tokenizer: object, + orig_answer_text: str, ): - """ Returns tokenized answer spans that better match the annotated answer """ + """Returns tokenized answer spans that better match the annotated answer""" tok_answer_text = " ".join(tokenizer.text_to_tokens(orig_answer_text)) diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py index d6484b33e202..1eeb312a62a9 100644 --- a/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py @@ -24,10 +24,11 @@ from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset from nemo.collections.nlp.data.question_answering.input_example.qa_gpt_input_example import GPTQAInputExample from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class GPTQADataset(QADataset): - """ Creates a Dataset for GPT architecture based Generative QA """ + """Creates a Dataset for GPT architecture based Generative QA""" def __init__( self, @@ -44,6 +45,9 @@ def __init__( mode: str = TRAINING_MODE, use_cache: bool = False, ): + # deprecation warning + deprecated_warning("GPTQADataset") + super().__init__( data_file=data_file, processor=processor, tokenizer=tokenizer, mode=mode, num_samples=num_samples ) @@ -76,7 +80,7 @@ def __init__( self.features[i] = GPTQAInputExample(**self.features[i]) def _set_cached_features_filename(self): - """ Creates cache filename using dataset config parameters """ + """Creates cache 
filename using dataset config parameters""" vocab_size = getattr(self.tokenizer, "vocab_size", 0) self.cached_features_file = ( @@ -120,7 +124,11 @@ def _convert_examples_to_features(self): formatted_query, query_tokens_length = self._prep_query(query_prefix, example) formatted_answer, answer_tokens_length = self._prep_answer(example) context_tokens, context_spans = self._prep_context( - example, query_tokens_length, answer_tokens_length, context_prefix_tokens, answer_prefix_tokens, + example, + query_tokens_length, + answer_tokens_length, + context_prefix_tokens, + answer_prefix_tokens, ) unique_id = self._encode_all_context_spans( @@ -170,7 +178,12 @@ def _prep_answer(self, example): return self._get_truncated_sentence_and_len(target, self.max_answer_length) def _prep_context( - self, example, query_tokens_length, answer_tokens_length, context_prefix_tokens, answer_prefix_tokens, + self, + example, + query_tokens_length, + answer_tokens_length, + context_prefix_tokens, + answer_prefix_tokens, ): """ Calculates the maximum possible length for a given context given a question diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py index 1f9a8ef615a9..c65c8a43c440 100644 --- a/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py @@ -23,10 +23,11 @@ from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset from nemo.collections.nlp.data.question_answering.input_example.qa_s2s_input_example import S2SQAInputExample from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class S2SQADataset(QADataset): - """ Creates a Dataset for T5/BART architecture based Generative QA """ + """Creates a Dataset for T5/BART architecture based Generative QA""" def __init__( self, @@ -43,6 +44,9 @@ def __init__( mode: str = TRAINING_MODE, use_cache: bool = False, ): + # deprecation warning + deprecated_warning("S2SQADataset") + super().__init__( data_file=data_file, processor=processor, tokenizer=tokenizer, mode=mode, num_samples=num_samples ) @@ -75,7 +79,7 @@ def __init__( self.features[i] = S2SQAInputExample(**self.features[i]) def _set_cached_features_filename(self): - """ Creates cache filename using dataset config parameters """ + """Creates cache filename using dataset config parameters""" vocab_size = getattr(self.tokenizer, "vocab_size", 0) self.cached_features_file = ( @@ -117,7 +121,12 @@ def _convert_examples_to_features(self): context_tokens, context_spans = self._prep_context(example, query_tokens, context_prefix_tokens) unique_id = self._encode_all_context_spans( - unique_id, context_spans, context_tokens, formatted_query, example, example_index, + unique_id, + context_spans, + context_tokens, + formatted_query, + example, + example_index, ) # delete self.examples during training mode to save memory @@ -155,7 +164,13 @@ def _prep_context(self, example, query_tokens, context_prefix_tokens): return context_tokens, context_spans def _encode_all_context_spans( - self, unique_id, context_spans, context_tokens, formatted_query, example, example_index, + self, + unique_id, + context_spans, + context_tokens, + formatted_query, + example, + example_index, ): """ Fromats all spans extracted from a single context as: @@ -173,7 +188,11 @@ def _encode_all_context_spans( # encode input encoded_input_dict = self.tokenizer.tokenizer( - source, truncation=True, 
max_length=self.max_seq_length, padding="max_length", return_tensors="pt", + source, + truncation=True, + max_length=self.max_seq_length, + padding="max_length", + return_tensors="pt", ) input_ids = torch.squeeze(encoded_input_dict["input_ids"]) input_attn_mask = torch.squeeze(encoded_input_dict["attention_mask"]) @@ -223,7 +242,11 @@ def _encode_answer(self, example, context_span_text): target = example.answer_text encoded_output_dict = self.tokenizer.tokenizer( - target, truncation=True, max_length=self.max_answer_length, padding="max_length", return_tensors="pt", + target, + truncation=True, + max_length=self.max_answer_length, + padding="max_length", + return_tensors="pt", ) labels = torch.squeeze(encoded_output_dict["input_ids"]) labels[labels == self.tokenizer.tokenizer.pad_token_id] = -100 diff --git a/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py b/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py index ee1a0957dbbb..2abe9b7c0aaa 100644 --- a/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py +++ b/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py @@ -46,6 +46,7 @@ ) from nemo.core.classes import Dataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['SquadDataset', 'InputFeatures', '_check_is_max_context'] @@ -114,7 +115,7 @@ def get_best_span_index(doc_spans, position): """ best_score = None best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): + for span_index, doc_span in enumerate(doc_spans): end = doc_span.start + doc_span.length - 1 if position < doc_span.start: continue @@ -165,6 +166,9 @@ def __init__( mode: str, use_cache: bool, ): + # deprecation warning + deprecated_warning("SquadDataset") + self.tokenizer = tokenizer self.version_2_with_negative = version_2_with_negative self.processor = SquadProcessor(data_file=data_file, mode=mode) @@ -337,7 +341,7 @@ def get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride): all_doc_tokens: list of all tokens in document max_tokens_for_doc: maximum number of tokens in each doc span doc_stride: stride size which sliding window moves with - + Returns: doc_spans: all possible doc_spans from document """ @@ -375,7 +379,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ doc_span tok_start_position: start position of answer in document tok_end_position: end position of answer in document - + Returns: average distance of doc_span to answer """ @@ -387,7 +391,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ @staticmethod def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode): """ - Filters out doc_spans, which might not be relevant to answering question, + Filters out doc_spans, which might not be relevant to answering question, which can be helpful when document is extremely long leading to many doc_spans with no answers Args: @@ -398,7 +402,7 @@ def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode all: do not filter only_positive: only keep doc_spans containing the answer limited_negative: only keep 10 doc_spans that are nearest to answer - + Returns: doc_spans: doc_spans after filtering """ @@ -481,7 +485,7 @@ def convert_examples_to_features( if self.mode != TRAINING_MODE: example.doc_tokens = doc_tokens # the text to tokens step is the slowest step - for (i, token) in enumerate(doc_tokens): + for i, token in enumerate(doc_tokens): 
orig_to_tok_index.append(len(all_doc_tokens)) if token not in text_to_tokens_dict: text_to_tokens_dict[token] = tokenizer.text_to_tokens(token) @@ -521,7 +525,7 @@ def convert_examples_to_features( # make compatible for hashing doc_spans = tuple(doc_spans) - for (doc_span_index, doc_span) in enumerate(doc_spans): + for doc_span_index, doc_span in enumerate(doc_spans): tokens = [tokenizer.cls_token] + query_tokens + [tokenizer.sep_token] segment_ids = [0 for i in range(len(tokens))] @@ -681,7 +685,7 @@ def get_predictions( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() - for (example_index, example) in enumerate(self.examples): + for example_index, example in enumerate(self.examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -706,7 +710,7 @@ def get_predictions( null_start_logit = 0 # end logit at the slice with min null score null_end_logit = 0 - for (feature_index, feature) in enumerate(features): + for feature_index, feature in enumerate(features): pos = unique_id_to_pos[feature.unique_id] start_indexes = get_best_indexes(start_logits[pos], n_best_size) end_indexes = get_best_indexes(end_logits[pos], n_best_size) @@ -825,7 +829,7 @@ def get_predictions( probs = _compute_softmax(total_scores) nbest_json = [] - for (i, entry) in enumerate(nbest): + for i, entry in enumerate(nbest): output = collections.OrderedDict() output["question"] = example.question_text output["text"] = entry.text diff --git a/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py b/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py index 803d0eaf8aed..c98abb300c64 100644 --- a/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py +++ b/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py @@ -20,6 +20,8 @@ from transformers import PreTrainedTokenizerBase +from nemo.utils.decorators import deprecated_warning + """Build BERT Examples from asr hypothesis, customization candidates, target labels, span info. """ @@ -52,7 +54,7 @@ def __init__( input_ids: indices of single characters (treated as subwords) input_mask: list of bools with 0s in place of input_ids to be masked segment_ids: list of ints from 0 to 10 to denote the text segment type ( - 0 - for tokens of ASR hypothesis, + 0 - for tokens of ASR hypothesis, 1 - for tokens of the first candidate ... 
10 - for tokens of the tenth candidate @@ -60,7 +62,7 @@ def __init__( input_ids_for_subwords: indices of real subwords (as tokenized by bert tokenizer) input_mask_for_subwords: list of bools with 0s in place of input_ids_for_subwords to be masked segment_ids_for_subwords: same as segment_ids but for input_ids_for_subwords - character_pos_to_subword_pos: list of size=len(input_ids), value=(position of corresponding subword in input_ids_for_subwords) + character_pos_to_subword_pos: list of size=len(input_ids), value=(position of corresponding subword in input_ids_for_subwords) fragment_indices: list of tuples (start_position, end_position, candidate_id), end is exclusive, candidate_id can be -1 if not set labels_mask: bool tensor with 0s in place of label tokens to be masked labels: indices of semiotic classes which should be predicted from each of the @@ -68,6 +70,9 @@ def __init__( spans: list of tuples (class_id, start_position, end_position), end is exclusive, class is always 1(CUSTOM) default_label: The default label """ + # deprecation warning + deprecated_warning("BertExample") + input_len = len(input_ids) if not ( input_len == len(input_mask) @@ -123,6 +128,9 @@ def __init__( tokenizer: Tokenizer object. max_seq_length: Maximum sequence length. """ + # deprecation warning + deprecated_warning("BertExampleBuilder") + self._label_map = label_map self._semiotic_classes = semiotic_classes self._tokenizer = tokenizer @@ -183,9 +191,15 @@ def build_bert_example( tags[start:end] = [t for i in range(end - start)] # get input features for characters - (input_ids, input_mask, segment_ids, labels_mask, labels, _, _,) = self._get_input_features( - hyp=hyp, ref=ref, tags=tags - ) + ( + input_ids, + input_mask, + segment_ids, + labels_mask, + labels, + _, + _, + ) = self._get_input_features(hyp=hyp, ref=ref, tags=tags) # get input features for words hyp_with_words = hyp.replace(" ", "").replace("_", " ") @@ -243,11 +257,11 @@ def build_bert_example( return example def _get_spans(self, span_info_parts: List[str]) -> List[Tuple[int, int, int]]: - """ Converts span_info string into a list of (class_id, start, end) where start, end are coordinates of starting and ending(exclusive) tokens in input_ids of BertExample - - Example: - span_info_parts: ["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"] - result: [(1, 38, 42), (1, 48, 53), (1, 43, 47), (1, 1, 8)] + """Converts span_info string into a list of (class_id, start, end) where start, end are coordinates of starting and ending(exclusive) tokens in input_ids of BertExample + + Example: + span_info_parts: ["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"] + result: [(1, 38, 42), (1, 48, 53), (1, 43, 47), (1, 1, 8)] """ result_spans = [] @@ -267,26 +281,26 @@ def _get_spans(self, span_info_parts: List[str]) -> List[Tuple[int, int, int]]: def _get_fragment_indices( self, hyp: str, targets: List[int], span_info_parts: List[str] ) -> Tuple[List[Tuple[int, int, int]]]: - """ Build fragment indices for real candidates. - This is used only at inference. - After external candidate retrieval we know approximately, where the candidate is located in the text (from the positions of matched n-grams). - In this function we - 1) adjust start/end positions to match word borders (possibly in multiple ways). - 2) generate content for fragment_indices tensor (it will be used during inference to average all predictions inside each fragment). 
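The `_get_spans` hunk above carries its own worked example. A runnable sketch of that contract (standalone; the +1 offset accounts for the [CLS] token prepended to `input_ids`):

```python
from typing import List, Tuple

def parse_spans(span_info_parts: List[str], class_id: int = 1) -> List[Tuple[int, int, int]]:
    """Parse "CUSTOM <start> <end>" strings into (class_id, start + 1, end + 1) tuples."""
    spans = []
    for part in span_info_parts:
        if not part:  # skip empty entries
            continue
        _, start, end = part.split(" ")
        spans.append((class_id, int(start) + 1, int(end) + 1))
    return spans

print(parse_spans(["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"]))
# [(1, 38, 42), (1, 48, 53), (1, 43, 47), (1, 1, 8)]
```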
-
-        Args:
-            hyp: ASR-hypothesis where space separates single characters (real space is replaced to underscore).
-            targets: list of candidate ids (only for real candidates, not dummy)
-            span_info_parts: list of strings of format like "CUSTOM 12 25", corresponding to each of targets, with start/end coordinates in text.
-        Returns:
-            List of tuples (start, end, target) where start and end are positions in ASR-hypothesis, target is candidate_id.
-            Note that returned fragments can be unsorted and can overlap, it's ok.
-        Example:
-            hyp: "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o"
-            targets: [1 2 3 4 6 7 9]
-            span_info_parts: ["CUSTOM 12 25", "CUSTOM 0 10", "CUSTOM 27 42", ...], where numbers are EXPECTED start/end positions of corresponding target candidates in the text. These positions will be adjusted in this functuion.
-            fragment_indices: [(1, 12, 2), (13, 24, 1), (13, 28, 1), ..., (29, 42, 3)]
-        """
+        """Build fragment indices for real candidates.
+        This is used only at inference.
+        After external candidate retrieval we know approximately where the candidate is located in the text (from the positions of matched n-grams).
+        In this function we
+        1) adjust start/end positions to match word borders (possibly in multiple ways).
+        2) generate content for fragment_indices tensor (it will be used during inference to average all predictions inside each fragment).
+
+        Args:
+            hyp: ASR-hypothesis where space separates single characters (real space is replaced to underscore).
+            targets: list of candidate ids (only for real candidates, not dummy)
+            span_info_parts: list of strings of format like "CUSTOM 12 25", corresponding to each of targets, with start/end coordinates in text.
+        Returns:
+            List of tuples (start, end, target) where start and end are positions in ASR-hypothesis, target is candidate_id.
+            Note that returned fragments can be unsorted and can overlap, it's ok.
+        Example:
+            hyp: "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o"
+            targets: [1 2 3 4 6 7 9]
+            span_info_parts: ["CUSTOM 12 25", "CUSTOM 0 10", "CUSTOM 27 42", ...], where numbers are EXPECTED start/end positions of corresponding target candidates in the text. These positions will be adjusted in this function.
+            fragment_indices: [(1, 12, 2), (13, 24, 1), (13, 28, 1), ..., (29, 42, 3)]
+        """

        fragment_indices = []
@@ -337,18 +351,18 @@ def _get_fragment_indices(
        return fragment_indices

    def _map_characters_to_subwords(self, input_ids: List[int], input_ids_for_subwords: List[int]) -> List[int]:
-        """ Maps each single character to the position of its corresponding subword.
-
-        Args:
-            input_ids: List of character token ids.
-            input_ids_for_subwords: List of subword token ids.
-        Returns:
-            List of subword positions in input_ids_for_subwords. Its length is equal to len(input_ids)
-
-        Example:
-            input_ids: [101, 1037, 1055, 1056, 1054, 1051, 1050, ..., 1051, 102, 1040, ..., 1050, 102, 1037, ..., 1041, 102, ..., 102]
-            input_ids_for_subwords: [101, 26357, 2106, 2666, 2061, 8202, 1998, 13012, 16643, 2319, 1043, 7174, 102, 2106, 3771, 7842, 2819, 2239, 102, ..., 102]
-            result: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, ... , 45, 46, 46, 46, 46, 46, 47]
+        """Maps each single character to the position of its corresponding subword.
+
+        Args:
+            input_ids: List of character token ids.
+            input_ids_for_subwords: List of subword token ids.
+        Returns:
+            List of subword positions in input_ids_for_subwords.
Its length is equal to len(input_ids) + + Example: + input_ids: [101, 1037, 1055, 1056, 1054, 1051, 1050, ..., 1051, 102, 1040, ..., 1050, 102, 1037, ..., 1041, 102, ..., 102] + input_ids_for_subwords: [101, 26357, 2106, 2666, 2061, 8202, 1998, 13012, 16643, 2319, 1043, 7174, 102, 2106, 3771, 7842, 2819, 2239, 102, ..., 102] + result: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, ... , 45, 46, 46, 46, 46, 46, 47] """ character_pos_to_subword_pos = [0 for _ in input_ids] @@ -453,7 +467,7 @@ def _get_input_features( ref: "didier saumon;astronomie;tristan guillot;tristesse;monade;christian;astronomer;solomon;dididididi;mercy" tags: None (not used for word-based case) - resulting token sequence: + resulting token sequence: '[CLS]', 'astronomers', 'did', '##ie', 'so', '##mon', 'and', 'tri', '##sti', '##an', 'g', '##llo', '[SEP]', 'did', '##ier', 'sa', '##um', '##on', '[SEP]', 'astro', '##no', '##mie', '[SEP]', 'tristan', 'gui', '##llo', '##t', '[SEP]', ..., '[SEP]', 'mercy', '[SEP]'] """ @@ -542,9 +556,9 @@ def read_input_file( infer: If true, input examples do not contain target info. Returns: - examples: List of converted examples (BertExample). + examples: List of converted examples (BertExample). or - (examples, hyps_refs): If infer==true, returns h + (examples, hyps_refs): If infer==true, returns h """ if not path.exists(input_filename): diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py index 7737bfa67f00..07ca790866c7 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py @@ -45,14 +45,19 @@ from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueGPTClassificationModel'] class DialogueGPTClassificationModel(NLPModel): def __init__( - self, cfg: DictConfig, trainer: Trainer = None, + self, + cfg: DictConfig, + trainer: Trainer = None, ): + # deprecation warning + deprecated_warning("DialogueGPTClassificationModel") self.cfg = cfg self.eval_mode = cfg.dataset.eval_mode @@ -101,14 +106,14 @@ def __init__( def setup_optimizer_param_groups(self): """ - ModelPT override for prompt learning. - Optimizer will get self._optimizer_param_groups. + ModelPT override for prompt learning. + Optimizer will get self._optimizer_param_groups. Makes two optimizer param groups, one for the frozen model params - and one for the prompt-table/prompt-encoder params. The learning + and one for the prompt-table/prompt-encoder params. The learning rate for the frozen model's params will always be zero effectively freezing the model's params but still allowing for the needed gradients - to be passed around in pipeline parallel models. The prompt-encoder - and/or prompt table will use the learning rate set by the user. + to be passed around in pipeline parallel models. The prompt-encoder + and/or prompt table will use the learning rate set by the user. 
""" if not self.prompt_learning: super().setup_optimizer_param_groups() @@ -328,7 +333,10 @@ def forward(self, input_ids, attention_mask, labels, inference=True): len(self.language_model.pseudo_token_ids) if hasattr(self.language_model, 'pseudo_token_ids') else 0 ) position_ids = torch.arange( - start=0, end=num_prompt_tokens + input_ids.size(1), dtype=torch.long, device=input_ids.device, + start=0, + end=num_prompt_tokens + input_ids.size(1), + dtype=torch.long, + device=input_ids.device, ) prompt_ids = self.get_virtual_prompt_ids_for_megatron_gpt(input_ids) @@ -708,7 +716,9 @@ def prepare_data(self): ) elif self._cfg.dataset.task == 'design': self.dialogues_processor = DialogueDesignDataProcessor( - data_dir=self._cfg.dataset.data_dir, tokenizer=self.tokenizer, cfg=self._cfg.dataset, + data_dir=self._cfg.dataset.data_dir, + tokenizer=self.tokenizer, + cfg=self._cfg.dataset, ) else: raise ValueError("Only sgd, assistant, zero_shot, design supported for Dialogue GPT Classification Model") diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py index 602c15a50c76..116605b65d52 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py @@ -35,6 +35,7 @@ from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueGPTGenerationModel'] @@ -43,8 +44,12 @@ class DialogueGPTGenerationModel(NLPModel): def __init__( - self, cfg: DictConfig, trainer: Trainer = None, + self, + cfg: DictConfig, + trainer: Trainer = None, ): + # deprecation warning + deprecated_warning("DialogueGPTGenerationModel") self.cfg = cfg self.data_prepared = False @@ -108,7 +113,10 @@ def eval_epoch_end(self, outputs, mode='val'): ) DialogueGenerationMetrics.save_predictions( - filename, generated_field, ground_truth_field, inputs, + filename, + generated_field, + ground_truth_field, + inputs, ) label_acc = np.mean([int(generated_field[i] == ground_truth_field[i]) for i in range(len(generated_field))]) @@ -155,7 +163,10 @@ def forward(self, input_ids, attention_mask, labels, inference=True): ) position_ids = torch.arange( - start=0, end=num_prompt_tokens + input_ids.size(1), dtype=torch.long, device=input_ids.device, + start=0, + end=num_prompt_tokens + input_ids.size(1), + dtype=torch.long, + device=input_ids.device, ) position_ids = position_ids.unsqueeze(0).repeat(input_ids.size(0), 1) @@ -228,7 +239,7 @@ def setup(self, stage=None): def prepare_megatron_generation(self, labels, input_ids, template_length): """ - # adapted from MegatronGPTModel._bucketize_gpt_inference + # adapted from MegatronGPTModel._bucketize_gpt_inference """ batch_size = labels.size(0) prompt_tags = [self.prompt_tags[0]] * batch_size if self.prompt_learning else None diff --git a/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py b/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py index 455b0fa17a85..29e2627fa038 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py @@ -34,14 +34,18 @@ from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from 
nemo.utils.decorators import deprecated_warning __all__ = ['DialogueNearestNeighbourModel'] class DialogueNearestNeighbourModel(NLPModel): - """Dialogue Nearest Neighbour Model identifies the intent of an utterance using the cosine similarity between sentence embeddings of the utterance and various label descriptions """ + """Dialogue Nearest Neighbour Model identifies the intent of an utterance using the cosine similarity between sentence embeddings of the utterance and various label descriptions""" def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("DialogueNearestNeighbourModel") + self.cfg = cfg super().__init__(cfg=cfg, trainer=trainer) if self.cfg.library == "huggingface": @@ -155,7 +159,10 @@ def on_validation_epoch_end(self): filename = os.path.join(self.cfg.dataset.dialogues_example_dir, "test_predictions.jsonl") DialogueGenerationMetrics.save_predictions( - filename, predicted_labels, ground_truth_labels, decoded_inputs, + filename, + predicted_labels, + ground_truth_labels, + decoded_inputs, ) label_to_ids = {label: idx for idx, label in enumerate(list(set(predicted_labels + ground_truth_labels)))} diff --git a/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py b/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py index 9655fbea2722..73f09f62b1d5 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py @@ -32,6 +32,7 @@ from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator @@ -46,8 +47,12 @@ class DialogueS2SGenerationModel(NLPModel): def __init__( - self, cfg: DictConfig, trainer: Trainer = None, + self, + cfg: DictConfig, + trainer: Trainer = None, ): + # deprecation warning + deprecated_warning("DialogueS2SGenerationModel") self.cfg = cfg self.data_prepared = False @@ -120,7 +125,10 @@ def eval_epoch_end(self, outputs, mode='val'): ) DialogueGenerationMetrics.save_predictions( - filename, generated_field, ground_truth_field, inputs, + filename, + generated_field, + ground_truth_field, + inputs, ) label_acc = np.mean([int(generated_field[i] == ground_truth_field[i]) for i in range(len(generated_field))]) @@ -172,7 +180,7 @@ def forward(self, input_ids, attention_masks, labels): def prepare_megatron_generation(self, labels, input_ids, template_length): """ - # adapted from MegatronGPTModel._bucketize_gpt_inference + # adapted from MegatronGPTModel._bucketize_gpt_inference """ batch_size = labels.size(0) prompt_tags = [self.prompt_tags[0]] * batch_size if self.prompt_tags else None diff --git a/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py b/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py index 0e007a7bcdd1..5298c060df08 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py @@ -36,6 +36,7 @@ from nemo.collections.nlp.models import TextClassificationModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueZeroShotIntentModel'] @@ -44,6 +45,9 @@ class 
DialogueZeroShotIntentModel(TextClassificationModel): """TextClassificationModel to be trained on two- or three-class textual entailment data, to be used for zero shot intent recognition.""" def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("DialogueZeroShotIntentModel") + self.cfg = cfg super().__init__(cfg=cfg, trainer=trainer) @@ -275,7 +279,10 @@ def on_validation_epoch_end(self, split="val"): filename = os.path.join(self.cfg.dataset.dialogues_example_dir, "test_predictions.jsonl") DialogueGenerationMetrics.save_predictions( - filename, predicted_labels, ground_truth_labels, utterances, + filename, + predicted_labels, + ground_truth_labels, + utterances, ) label_to_ids = {label: idx for idx, label in enumerate(list(set(predicted_labels + ground_truth_labels)))} @@ -316,7 +323,6 @@ def predict( entailment_idx=1, contradiction_idx=0, ) -> List[Dict]: - """ Given a list of queries and a list of candidate labels, return a ranked list of labels and scores for each query. diff --git a/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py b/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py index a34afa64674d..777d468084e2 100644 --- a/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py +++ b/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py @@ -35,12 +35,15 @@ from nemo.core.classes import typecheck from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class IntentSlotClassificationModel(NLPModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None): - """ Initializes BERT Joint Intent and Slot model. - """ + """Initializes BERT Joint Intent and Slot model.""" + # deprecation warning + deprecated_warning("IntentSlotClassificationModel") + self.max_seq_length = cfg.dataset.max_seq_length self.cfg = cfg # Check the presence of data_dir. @@ -78,7 +81,7 @@ def _set_defaults_data_desc(self, cfg): OmegaConf.set_struct(cfg, True) def _set_data_desc_to_cfg(self, cfg, data_dir, train_ds, validation_ds): - """ Method creates IntentSlotDataDesc and copies generated values to cfg.data_desc. """ + """Method creates IntentSlotDataDesc and copies generated values to cfg.data_desc.""" # Save data from data desc to config - so it can be reused later, e.g. in inference. 
data_desc = IntentSlotDataDesc(data_dir=data_dir, modes=[train_ds.prefix, validation_ds.prefix]) OmegaConf.set_struct(cfg, False) @@ -112,7 +115,7 @@ def _set_data_desc_to_cfg(self, cfg, data_dir, train_ds, validation_ds): OmegaConf.set_struct(cfg, True) def _save_label_ids(self, label_ids: Dict[str, int], filename: str) -> None: - """ Saves label ids map to a file """ + """Saves label ids map to a file""" with open(filename, 'w') as out: labels, _ = zip(*sorted(label_ids.items(), key=lambda x: x[1])) out.write('\n'.join(labels)) @@ -120,7 +123,7 @@ def _save_label_ids(self, label_ids: Dict[str, int], filename: str) -> None: logging.info(f'Labels mapping saved to : {out.name}') def _reconfigure_classifier(self): - """ Method reconfigures the classifier depending on the settings of model cfg.data_desc """ + """Method reconfigures the classifier depending on the settings of model cfg.data_desc""" self.classifier = SequenceTokenClassifier( hidden_size=self.hidden_size, @@ -310,7 +313,7 @@ def get_utterance_tokens(self, token_ids, token_masks): Args: token_ids: IntTensor of size (max_seq_len, ) token_masks: BoolTensor of size (max_seq_len, ) - + Returns token_list: List of Str (list of tokens with len <= max_seq_len) """ diff --git a/nemo/collections/nlp/models/dialogue/sgdqa_model.py b/nemo/collections/nlp/models/dialogue/sgdqa_model.py index b350fd01fa09..3b30dfccd9ce 100644 --- a/nemo/collections/nlp/models/dialogue/sgdqa_model.py +++ b/nemo/collections/nlp/models/dialogue/sgdqa_model.py @@ -35,6 +35,7 @@ from nemo.collections.nlp.parts.utils_funcs import tensor2list from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['SGDQAModel'] @@ -44,7 +45,7 @@ class SGDQAModel(NLPModel): Dialogue State Tracking Model SGD-QA (https://arxiv.org/abs/2105.08049) The SGD-QA model is a fast multi-pass schema-guided state-tracking model, that is trained on the Google schema-guided state tracking dataset (https://arxiv.org/abs/1909.05855). - The model takes dialogue as input and outputs the dialogue state, which includes slot-value pairs. + The model takes dialogue as input and outputs the dialogue state, which includes slot-value pairs. The model consists of two components: a neural natural language understanding model (NLU), and a rule-based state tracker. The NLU takes in a dialogue turn and different schema (entity) information options and outputs their match score. The state tracker takes the highest rated entities and composes the dialogue state across turns. 
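
Note on the recurring pattern in this PR: every deprecated model constructor gains a `deprecated_warning(...)` call, invoked either with just the deprecated class name or with a suggested replacement as a second argument. The helper itself is not shown in this diff; below is a minimal sketch consistent with those call sites. It is an assumption for illustration only; the real implementation lives in nemo/utils/decorators and may differ.

    # Hypothetical sketch, not the actual NeMo source.
    from nemo.utils import logging

    def deprecated_warning(old_method: str, new_method: str = None):
        """Log that `old_method` is deprecated, optionally pointing at `new_method`."""
        if new_method is None:
            logging.warning(f"{old_method} is deprecated and will be removed in a future release.")
        else:
            logging.warning(f"{old_method} is deprecated. Please use {new_method} instead.")
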
@@ -55,6 +56,9 @@ def output_module(self): return self.decoder def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("SGDQAModel") + self.data_prepared = False super().__init__(cfg=cfg, trainer=trainer) self.encoder = SGDEncoder(hidden_size=self.bert_model.config.hidden_size, dropout=self._cfg.encoder.dropout) @@ -146,7 +150,7 @@ def validation_step(self, batch: List[torch.Tensor], batch_idx: int, dataloader_ Called at every validation step to aggregate and postprocess outputs on each GPU Args: batch: input batch at validation step - batch_idx: batch index + batch_idx: batch index dataloader_idx: dataloader index """ loss, tensors = self.eval_step_helper(batch=batch) @@ -163,7 +167,7 @@ def test_step(self, batch: List[torch.Tensor], batch_idx: int, dataloader_idx: i Called at every test step to aggregate and postprocess outputs on each GPU Args: batch: input batch at test step - batch_idx: batch index + batch_idx: batch index dataloader_idx: dataloader index """ loss, tensors = self.eval_step_helper(batch=batch) @@ -318,8 +322,8 @@ def eval_step_helper(self, batch: List[torch.Tensor]): torch.zeros(total_scores.size(), device=total_scores.get_device(), dtype=total_scores.dtype), total_scores, ) - max_span_index = torch.argmax(total_scores.view(-1, max_num_tokens ** 2), axis=-1) - max_span_p = torch.max(total_scores.view(-1, max_num_tokens ** 2), axis=-1)[0] + max_span_index = torch.argmax(total_scores.view(-1, max_num_tokens**2), axis=-1) + max_span_p = torch.max(total_scores.view(-1, max_num_tokens**2), axis=-1)[0] span_start_index = torch.floor_divide(max_span_index, max_num_tokens) span_end_index = torch.fmod(max_span_index, max_num_tokens) @@ -415,7 +419,7 @@ def format_turn_id(ex_id_num): def combine_predictions_in_example(predictions: dict, batch_size: int): ''' - Combines predicted values to a single example. + Combines predicted values to a single example. Args: predictions: predictions ordered by keys then batch batch_size: batch size diff --git a/nemo/collections/nlp/models/entity_linking/entity_linking_model.py b/nemo/collections/nlp/models/entity_linking/entity_linking_model.py index f3ef3ccb87f9..4afae81e3893 100644 --- a/nemo/collections/nlp/models/entity_linking/entity_linking_model.py +++ b/nemo/collections/nlp/models/entity_linking/entity_linking_model.py @@ -26,6 +26,7 @@ from nemo.core.classes.exportable import Exportable from nemo.core.neural_types import LogitsType, NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['EntityLinkingModel'] @@ -44,6 +45,9 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: def __init__(self, cfg: DictConfig, trainer: Trainer = None): """Initializes the SAP-BERT model for entity linking.""" + # deprecation warning + deprecated_warning("EntityLinkingModel") + # tokenizer needed before super().__init__() so dataset and loader can process data self._setup_tokenizer(cfg.tokenizer) @@ -123,7 +127,7 @@ def on_validation_epoch_end(self): Args: outputs: list of individual outputs of each validation step. 
Returns: - + """ if self.validation_step_outputs: avg_loss = torch.stack( diff --git a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py b/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py index 4a073e2ada1c..4447ebb89386 100644 --- a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py +++ b/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py @@ -31,6 +31,7 @@ from nemo.core.classes import typecheck from nemo.core.neural_types import NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['GLUEModel'] @@ -78,6 +79,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): """ Initializes model to use BERT model for GLUE tasks. """ + # deprecation warning + deprecated_warning("GLUEModel") if cfg.task_name not in cfg.supported_tasks: raise ValueError(f'{cfg.task_name} not in supported task. Choose from {cfg.supported_tasks}') diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py index e7ae529fe4e2..2ff6a2ae0a85 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py @@ -14,7 +14,6 @@ """BERT model.""" -import warnings from dataclasses import dataclass import torch @@ -33,6 +32,7 @@ parallel_lm_logits, scaled_init_method_normal, ) +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.enums import AttnMaskType @@ -142,7 +142,13 @@ def forward(self, hidden_states, word_embeddings_weight): def post_language_model_processing( - lm_output, pooled_output, lm_head, binary_head, lm_labels, logit_weights, fp16_lm_cross_entropy, + lm_output, + pooled_output, + lm_head, + binary_head, + lm_labels, + logit_weights, + fp16_lm_cross_entropy, ): # lm_logits: [s, b, vocab_size] lm_logits = lm_head(lm_output, logit_weights) @@ -339,16 +345,14 @@ def __init__(self, transformer_block_type='pre-ln', add_pooler=True, *args, **kw transformer_block_type=self.transformer_block_type, ) - if self.add_pooler: - self.pooler = Pooler( - self.config.hidden_size, self.config.init_method, self.config, self.config.sequence_parallel - ) - # Output if self.post_process: # TODO: Make sure you are passing in the mpu_vocab_size properly - self.lm_head = MCoreBertLMHead(self.config.hidden_size, self.config,) + self.lm_head = MCoreBertLMHead( + self.config.hidden_size, + self.config, + ) self.output_layer = tensor_parallel.ColumnParallelLinear( self.config.hidden_size, @@ -361,6 +365,11 @@ def __init__(self, transformer_block_type='pre-ln', add_pooler=True, *args, **kw skip_weight_param_allocation=self.pre_process and self.share_embeddings_and_output_weights, ) + if self.add_pooler: + self.pooler = Pooler( + self.config.hidden_size, self.config.init_method, self.config, self.config.sequence_parallel + ) + self.binary_head = None if self.add_binary_head: # TODO: Shoudl switch this to TE ? @@ -476,10 +485,9 @@ def __init__( sequence_parallel=False, position_embedding_type='learned_absolute', ): - warnings.warn( - "NeMoBertModel will be deprecated mid 2024. 
Use MCoreBertModelWrapperWithPostLNSupport instead.", - DeprecationWarning, - ) + # deprecation warning + deprecated_warning("NeMoBertModel", "MCoreBertModelWrapperWithPostLNSupport") + super(NeMoBertModel, self).__init__(config=config) self.fp16_lm_cross_entropy = fp16_lm_cross_entropy self.add_binary_head = add_binary_head diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py index 19fafb796fd7..c572d94acd11 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py @@ -24,6 +24,7 @@ parallel_lm_logits, scaled_init_method_normal, ) +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.enums import AttnMaskType @@ -167,6 +168,9 @@ def __init__( seq_len_interpolation_factor=None, rotary_base=10000, ): + # deprecation warning + deprecated_warning("GPTModel", "McoreGPTModel") + super(GPTModel, self).__init__(config=config, share_token_embeddings=share_embeddings_and_output_weights) self.parallel_output = parallel_output @@ -250,7 +254,9 @@ def __init__( if self.share_embeddings_and_output_weights: self.initialize_word_embeddings( - init_method=init_method_normal(init_method_std), vocab_size=vocab_size, hidden_size=hidden_size, + init_method=init_method_normal(init_method_std), + vocab_size=vocab_size, + hidden_size=hidden_size, ) def set_input_tensor(self, input_tensor): @@ -299,9 +305,11 @@ def forward( post_process_result = post_language_model_processing( loss_lm_output, loss_labels, - self.language_model.output_layer.weight - if not self.share_embeddings_and_output_weights - else self.word_embeddings_weight(), + ( + self.language_model.output_layer.weight + if not self.share_embeddings_and_output_weights + else self.word_embeddings_weight() + ), get_key_value, self.parallel_output, forward_method_parallel_output, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index a27f9fd5e5e4..96b7fd2cbf15 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -1140,6 +1140,10 @@ def build_model_parallel_config(self) -> ModelParallelConfig: "tp_comm_overlap": self.cfg.get('ub_tp_comm_overlap', False), } + # Set enable_autocast to False when precision is fp16 and not using bias + if not megatron_amp_O2 and not self.cfg.get('bias', True): + config_mapping["enable_autocast"] = False + # instantitate ModelParallelConfig from this dict mp_config_dict = {} diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py index 4d4cc09d0751..2652d91710ff 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py @@ -37,6 +37,7 @@ from nemo.collections.nlp.modules.common.transformer.text_generation import TextGeneration from nemo.collections.nlp.parts.nlp_overrides import GradScaler from nemo.utils import AppState, logging +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator @@ -63,25 +64,28 @@ class MegatronBasePromptLearningModel(MegatronBaseModel, 
TextGeneration):
    """
-    Model class for prompt-tuning or p-tuning a pretrained Megatron model.
+    Model class for prompt-tuning or p-tuning a pretrained Megatron model.
    Prompt Tuning initalizes virtual prompt embeddings directly from a copy of
    certain token embeddings from the the pretrained model's vocabulary
-    and directly tunes these embedding weights. The token embeddings used in
-    initalization are specified by the user in the config file. The model can
-    be prompt-tuned for multiple tasks at once. virtual prompts are stored in a
-    prompt table and can be added or deleted without disrupting virtual prompts
-    for other tasks.
+    and directly tunes these embedding weights. The token embeddings used in
+    initialization are specified by the user in the config file. The model can
+    be prompt-tuned for multiple tasks at once. virtual prompts are stored in a
+    prompt table and can be added or deleted without disrupting virtual prompts
+    for other tasks.

    P-tuning initializes an LSTM encoder model that generates virtual prompt
    embeddings for every task. Each task shares the same encoder. After ptuning
    is compelete, the learned virtual prompts can be saved to the prompt table
-    using add_ptuned_prompts_to_prompt_table(). Thus, if a user wants to add a
-    new virtual prompt via p-tuning, they do not need to retrain on all previous
+    using add_ptuned_prompts_to_prompt_table(). Thus, if a user wants to add a
+    new virtual prompt via p-tuning, they do not need to retrain on all previous
    tasks. This gives p-tuning the same task flexiblity as prompt-tuning.
    """

    def __init__(self, cfg: DictConfig, trainer: Trainer):
+        # deprecation warning
+        deprecated_warning("MegatronBasePromptLearningModel")
+
        super().__init__(cfg, trainer)

        self.config: ModelParallelConfig = self.model_parallel_config
@@ -156,10 +160,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):

    def load_task_templates(self, task_templates):
        """
-        Takes in the task template portion of the config and turns
-        it into a table where each task's prompt template and
-        the number of virtual tokens to insert in a given part of
-        the prompt template are specified.
+        Takes in the task template portion of the config and turns
+        it into a table where each task's prompt template and
+        the number of virtual tokens to insert in a given part of
+        the prompt template are specified.
        """
        self.task_templates = {}
        self.task_id_num_to_name = {}
@@ -215,18 +219,17 @@ def init_prompt_encoder(self):
        )

    def freeze_existing_word_embeddings(self):
-        """Freeze params of existing virtual prompts that should not be tuned further
-        """
+        """Freeze params of existing virtual prompts that should not be tuned further"""
        # Make sure word embeddings are frozen
        for params in self.word_embeddings.parameters():
            params.requires_grad = False

    def state_dict(self):
        """
-        Custom state dict that only contains prompt table and prompt encoder parameters.
-        No frozen model parameters are stored in the state dict. Prompt encoder parameters
+        Custom state dict that only contains prompt table and prompt encoder parameters.
+        No frozen model parameters are stored in the state dict. Prompt encoder parameters
        are only in state dict for intermediate checkpoints saved during training. Final
-        nemo checkpoints at the end of training will contain prompt table parameters only.
+        nemo checkpoints at the end of training will contain prompt table parameters only.
        """
        state_dict_ = {}

@@ -241,7 +244,7 @@ def state_dict(self):
    def load_state_dict(self, state_dict, strict: bool = True):
        """
        Custom load state dict method that only loads prompt table and prompt encoder
-        parameters. Matching load method for this class' custom state dict method.
+        parameters. Matching load method for this class' custom state dict method.
        """
        if self.first_stage_of_pipeline():
            if self.virtual_prompt_source == VirtualPromptSource.PROMPT_ENCODER:
@@ -253,7 +256,7 @@ def load_state_dict(self, state_dict, strict: bool = True):

    def setup_optimizer_param_groups(self):
        """
-        ModelPT override. Optimizer will get self._optimizer_param_groups.
+        ModelPT override. Optimizer will get self._optimizer_param_groups.
        Only want virtual prompt params to be passed to the optimizer.
        """
        ## Freeze frozen model
@@ -272,8 +275,8 @@ def setup_optimizer_param_groups(self):

    def embed_input(self, input_ids: Tensor, taskname_ids: Tensor, use_cached_reps: bool):
        """
-        Replaces the virtual tokens in the input_ids with embeddings
-        calculated from either the 'prompt_table' or 'prompt_encoder'.
+        Replaces the virtual tokens in the input_ids with embeddings
+        calculated from either the 'prompt_table' or 'prompt_encoder'.
        The virtual token placeholders have token_ids listed in
        `self.pseudo_token_ids`.

@@ -422,7 +425,7 @@ def load_frozen_model(self, cfg, trainer):

def get_pseudo_tokens(num_virtual_tokens):
    """
    Takes in an integer and returns a list of strings where each string
-    is a numbered virtual token placeholder. If
+    is a numbered virtual token placeholder. If
    num_virtual_tokens = 3, then this function returns:

    ["<prompt_0>", "<prompt_1>", "<prompt_2>"]

@@ -430,7 +433,7 @@ def get_pseudo_tokens(num_virtual_tokens):
    Args:
        num_virtual_tokens: (int) Number of virtual token strings you want to make

-    returns a list of string.
+    returns a list of strings.
    """
    pseudo_tokens = [
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index 1278040b32dd..0cea03fd544b 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -174,7 +174,7 @@ def forward(self, **kwargs):
        the superclass by the square root of the hidden size specified in the configuration.
""" embeddings = super().forward(**kwargs) - return embeddings * torch.tensor(self.config.hidden_size ** 0.5, dtype=embeddings.dtype) + return embeddings * torch.tensor(self.config.hidden_size**0.5, dtype=embeddings.dtype) class MegatronGPTExportableModel(torch.nn.Module, Exportable): @@ -196,11 +196,14 @@ def __init__(self, model): def forward(self, tokens, position_ids, attention_mask): if self.fp8_enabled and HAVE_TE: - with transformer_engine.pytorch.onnx_export(self.fp8_enabled), transformer_engine.pytorch.fp8_autocast( - enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe - ), torch.no_grad(), torch.inference_mode(), torch.autocast( - 'cuda', dtype=self.dtype - ), warnings.catch_warnings(): + with ( + transformer_engine.pytorch.onnx_export(self.fp8_enabled), + transformer_engine.pytorch.fp8_autocast(enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe), + torch.no_grad(), + torch.inference_mode(), + torch.autocast('cuda', dtype=self.dtype), + warnings.catch_warnings(), + ): warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*') assert tokens.shape == position_ids.shape assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] @@ -211,9 +214,12 @@ def forward(self, tokens, position_ids, attention_mask): labels=None, ) else: - with torch.no_grad(), torch.inference_mode(), torch.autocast( - 'cuda', dtype=self.dtype - ), warnings.catch_warnings(): + with ( + torch.no_grad(), + torch.inference_mode(), + torch.autocast('cuda', dtype=self.dtype), + warnings.catch_warnings(), + ): warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*') assert tokens.shape == position_ids.shape assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] @@ -509,7 +515,7 @@ def setup_optimizer_param_groups(self): self._optimizer_param_groups = get_params_for_weight_decay_optimization(self.model) def setup_mcore_distributed_parallel(self): - """Set up mcore distributed data parallel """ + """Set up mcore distributed data parallel""" if self.with_distributed_adam and self.use_mcore_dist_optim: config = get_model_config(self.model[0]) ddp_config = DistributedDataParallelConfig( @@ -641,7 +647,10 @@ def fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): if self.validation_param_sync_overlap: param_sync_func = self.sync_overlap_parameters elif not self.use_mcore_dist_optim: - no_sync_func = partial(self._optimizer.no_sync, greedy_grad_copy=self.megatron_amp_O2,) + no_sync_func = partial( + self._optimizer.no_sync, + greedy_grad_copy=self.megatron_amp_O2, + ) grad_sync_func = self.reduce_overlap_gradients param_sync_func = self.sync_overlap_parameters else: @@ -744,9 +753,9 @@ def training_step_fwd_bwd_step_call(self, dataloader_iter, forward_only): def training_step(self, dataloader_iter): """ - We pass the dataloader iterator function to the micro-batch scheduler. - The input batch to each micro-batch is fetched using the dataloader function - in the micro-batch fwd function. + We pass the dataloader iterator function to the micro-batch scheduler. + The input batch to each micro-batch is fetched using the dataloader function + in the micro-batch fwd function. """ # Initialize userbuffer communicators. 
if self.initialize_ub: @@ -877,7 +886,11 @@ def training_step(self, dataloader_iter): if self.log_memory_usage: mem_reserved = torch.cuda.max_memory_reserved() self.log( - 'peak_memory_usage', mem_reserved, prog_bar=True, rank_zero_only=True, batch_size=1, + 'peak_memory_usage', + mem_reserved, + prog_bar=True, + rank_zero_only=True, + batch_size=1, ) ## logging @@ -901,20 +914,29 @@ def training_step(self, dataloader_iter): lr = self._optimizer.param_groups[0]['lr'] self.log('lr', lr, rank_zero_only=True, batch_size=1) self.log( - 'global_step', self.trainer.global_step, prog_bar=True, rank_zero_only=True, batch_size=1, + 'global_step', + self.trainer.global_step, + prog_bar=True, + rank_zero_only=True, + batch_size=1, ) consumed_samples = self._compute_consumed_samples_after_training_step() # TODO: make sure compute_consumed_samples works for pipeline parallelism self.log( - 'consumed_samples', consumed_samples, prog_bar=True, rank_zero_only=True, batch_size=1, + 'consumed_samples', + consumed_samples, + prog_bar=True, + rank_zero_only=True, + batch_size=1, ) if self.rampup_batch_size: self.prev_global_batch_size = current_global_batch_size self.prev_consumed_samples = consumed_samples num_microbatch_calculator.update( - consumed_samples=consumed_samples, consistency_check=False, + consumed_samples=consumed_samples, + consistency_check=False, ) current_global_batch_size = num_microbatch_calculator.current_global_batch_size self.log('global_batch_size', current_global_batch_size, prog_bar=True, rank_zero_only=True, batch_size=1) @@ -923,20 +945,20 @@ def training_step(self, dataloader_iter): return loss_mean def backward(self, *args, **kwargs): - """ LightningModule hook to do backward. - We want this to do nothing since we run backward in the fwd/bwd functions from megatron-core. - No need to call it here. + """LightningModule hook to do backward. + We want this to do nothing since we run backward in the fwd/bwd functions from megatron-core. + No need to call it here. """ return def optimizer_zero_grad(self, *args, **kwargs): - """ LightningModule hook to zero grad. - We want this to do nothing as we are zeroing grads during the training_step. + """LightningModule hook to zero grad. + We want this to do nothing as we are zeroing grads during the training_step. """ return def _append_sequence_parallel_module_grads(self, module, grads): - """ Helper method for allreduce_sequence_parallel_gradients""" + """Helper method for allreduce_sequence_parallel_gradients""" for param in module.parameters(): sequence_parallel_param = getattr(param, 'sequence_parallel', False) or getattr( @@ -954,9 +976,9 @@ def _append_sequence_parallel_module_grads(self, module, grads): grads.append(grad.data) def allreduce_sequence_parallel_gradients(self): - """ All-reduce layernorm parameters across model parallel nodes when sequence parallelism is used. - Modified from megatron-lm: - https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/blob/3f91f09bb2ab32f9904b47f46f19d2fc3f518ed8/megatron/training.py#L425 + """All-reduce layernorm parameters across model parallel nodes when sequence parallelism is used. + Modified from megatron-lm: + https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/blob/3f91f09bb2ab32f9904b47f46f19d2fc3f518ed8/megatron/training.py#L425 """ grads = [] @@ -974,8 +996,7 @@ def allreduce_sequence_parallel_gradients(self): buf.copy_(synced) def allreduce_fsdp_sharding_omitted_gradients(self): - """ All-reduce gradients of FSDP-sharding-omitted parameters in sharding domain (data-parallel domain). 
- """ + """All-reduce gradients of FSDP-sharding-omitted parameters in sharding domain (data-parallel domain).""" assert isinstance(self.model, torch.nn.Module) grads = [] for param in self.model.parameters(): @@ -1022,16 +1043,16 @@ def allreduce_first_last_embeddings(self): torch.distributed.all_reduce(grad, group=parallel_state.get_embedding_group()) def _make_data_iterator_list(self, data_iterator: Iterator) -> List[Iterator]: - """ Convert data iterator into form expected by Megatron - - With interleaved pipeline parallelism, Megatron expects a - list of one data iterator per model chunk. Each model - chunk independently gets data from its data iterator, so - we need to interact with the data iterator multiple times - for each microbatch step. Instead of incorporating this - logic into the data loader, we cache the iterator's output - to the first model chunk and reuse it in the other model - chunks. + """Convert data iterator into form expected by Megatron + + With interleaved pipeline parallelism, Megatron expects a + list of one data iterator per model chunk. Each model + chunk independently gets data from its data iterator, so + we need to interact with the data iterator multiple times + for each microbatch step. Instead of incorporating this + logic into the data loader, we cache the iterator's output + to the first model chunk and reuse it in the other model + chunks. """ if not isinstance(self.model, list) or len(self.model) == 1: @@ -1323,10 +1344,10 @@ def id_func(output_tensor): def validation_step(self, dataloader_iter, dataloader_idx=0): """ - Our dataloaders produce a micro-batch and then we fetch - a number of microbatches depending on the global batch size and model parallel size - from the dataloader to produce a list of microbatches. - The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions. + Our dataloaders produce a micro-batch and then we fetch + a number of microbatches depending on the global batch size and model parallel size + from the dataloader to produce a list of microbatches. + The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions. """ mode = 'test' if self.trainer.testing else 'val' # Initialize userbuffer communicators. 
@@ -1387,7 +1408,9 @@ def on_validation_epoch_end(self):
            if self.loss_broadcast_src_rank is None:
                self.loss_broadcast_src_rank = parallel_state.get_pipeline_model_parallel_last_rank()
            torch.distributed.broadcast(
-                averaged_loss, self.loss_broadcast_src_rank, group=parallel_state.get_pipeline_model_parallel_group(),
+                averaged_loss,
+                self.loss_broadcast_src_rank,
+                group=parallel_state.get_pipeline_model_parallel_group(),
            )

        self.log('val_loss', averaged_loss, prog_bar=True, rank_zero_only=True, batch_size=1)
@@ -1492,7 +1515,10 @@ def build_train_valid_test_datasets(self):
        dataset_type = MockGPTDataset if mock_dataset else GPTDataset

        self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder(
-            dataset_type, train_valid_test_num_samples, is_dataset_built_on_rank, dataset_config,
+            dataset_type,
+            train_valid_test_num_samples,
+            is_dataset_built_on_rank,
+            dataset_config,
        ).build()

        if self._train_ds is not None:
@@ -1702,16 +1728,16 @@ def list_available_models(self):
        return None

    def transfer_batch_to_device(self, batch: Any, device: torch.device, dataloader_idx: int) -> Any:
-        """ PTL hook: https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#transfer-batch-to-device
-            When using pipeline parallelism, we need the global batch to remain on the CPU,
-            since the memory overhead will be too high when using a large number of microbatches.
-            Microbatches are transferred from CPU to GPU inside the pipeline.
+        """PTL hook: https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#transfer-batch-to-device
+        When using pipeline parallelism, we need the global batch to remain on the CPU,
+        since the memory overhead will be too high when using a large number of microbatches.
+        Microbatches are transferred from CPU to GPU inside the pipeline.
        """
        return batch

    def _validate_trainer(self):
-        """ Certain trainer configurations can break training.
-            Here we try to catch them and raise an error.
+        """Certain trainer configurations can break training.
+        Here we try to catch them and raise an error.
        """
        if self.trainer.accumulate_grad_batches > 1:
            raise ValueError(
@@ -1788,9 +1814,9 @@ def on_load_checkpoint(self, checkpoint) -> None:

    def on_validation_model_zero_grad(self) -> None:
        """
-         Skip gradient zeroing at the beginning of validation routine.
-         This is needed when overlapping the AllGather of the updated parameters with the following valdation step.
-         """
+        Skip gradient zeroing at the beginning of validation routine.
+        This is needed when overlapping the AllGather of the updated parameters with the following validation step.
+        """
        if not self.validation_param_sync_overlap:
            super().on_validation_model_zero_grad()
@@ -1859,9 +1885,9 @@ def initialize_last_rank_embeddings(self):
            parallel_state.set_virtual_pipeline_model_parallel_rank(0)

    def _reset_activation_checkpointing_args(self):
-        """ Disables activation checkpointing completely and saves the values so that
-            _restore_activation_checkpointing_args can restore them later. This function must always be
-            called before _restore_activation_checkpointing_args.
+        """Disables activation checkpointing completely and saves the values so that
+        _restore_activation_checkpointing_args can restore them later. This function must always be
+        called before _restore_activation_checkpointing_args.
        """
        # Store values to restore them later.
self.last_activations_checkpoint_granularity = self.cfg.activations_checkpoint_granularity @@ -1888,9 +1914,9 @@ def _reset_activation_checkpointing_args(self): module.language_model.encoder.activations_checkpoint_layers_per_pipeline = None def _restore_activation_checkpointing_args(self): - """ Restores the activation checkpointing parameters using the values saved by - _reset_activation_checkpointing_args. This function must never be called before - _reset_activation_checkpointing_args. + """Restores the activation checkpointing parameters using the values saved by + _reset_activation_checkpointing_args. This function must never be called before + _reset_activation_checkpointing_args. """ # Restore config values. self.cfg.activations_checkpoint_granularity = self.last_activations_checkpoint_granularity @@ -1917,9 +1943,9 @@ def _restore_activation_checkpointing_args(self): ) def _reset_sequence_parallelism_args(self): - """ Disables sequence parallelism completely and saves the values so that - _restore_sequence_parallelism_args can restore them later. This function must always be - called before _restore_sequence_parallelism_args. + """Disables sequence parallelism completely and saves the values so that + _restore_sequence_parallelism_args can restore them later. This function must always be + called before _restore_sequence_parallelism_args. """ # Store values to restore them later. self.last_sequence_parallel = self.cfg.sequence_parallel @@ -1936,9 +1962,9 @@ def _reset_sequence_parallelism_args(self): mod.sequence_parallel = False def _restore_sequence_parallelism_args(self): - """ Restores the sequence parallelism parameters using the values saved by - _reset_sequence_parallelism_args. This function must never be called before - _reset_sequence_parallelism_args. + """Restores the sequence parallelism parameters using the values saved by + _reset_sequence_parallelism_args. This function must never be called before + _reset_sequence_parallelism_args. """ # Restore config values. self.cfg.sequence_parallel = self.last_sequence_parallel @@ -1952,12 +1978,18 @@ def _restore_sequence_parallelism_args(self): mod.sequence_parallel = self.last_sequence_parallel def build_transformer_config(self) -> TransformerConfig: - """ Builds the megatron core gpt transformer config for the model. - For attributes in the nemo model config that are the same - as the megatron core TransformerConfig, we will use the value from the nemo model config. - For attributes in TransformerConfig that are not in the nemo model config, we add custom logic. + """Builds the megatron core gpt transformer config for the model. + For attributes in the nemo model config that are the same + as the megatron core TransformerConfig, we will use the value from the nemo model config. + For attributes in TransformerConfig that are not in the nemo model config, we add custom logic. 
""" + if self.cfg.num_layers % self.cfg.get('pipeline_model_parallel_size', 1) != 0: + raise ValueError( + f"num_layers ({self.cfg.num_layers}) should be divisible by " + f"pipeline_model_parallel_size ({self.cfg.get('pipeline_model_parallel_size', 1)})" + ) + normalization = self.cfg.get('normalization', 'layernorm').lower() layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p' if normalization == 'layernorm': diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index 5ee7a3fcf480..acfc22439a7d 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -44,6 +44,7 @@ from nemo.collections.nlp.parts.nlp_overrides import GradScaler, NLPSaveRestoreConnector from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.utils import AppState, logging +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.pipeline_parallel.utils import get_micro_batch_size, get_num_microbatches @@ -72,25 +73,28 @@ class MegatronGPTPromptLearningModel(MegatronBasePromptLearningModel): """ - Model class for prompt-tuning or p-tuning a pretrained Megatron GPT model. + Model class for prompt-tuning or p-tuning a pretrained Megatron GPT model. Prompt Tuning initalizes virtual prompt embeddings directly from a copy of certain token embeddings from the the pretrained GPT model's vocabulary - and directly tunes these embedding weights. The token embeddings used in - initalization are specified by the user in the config file. The model can - be prompt-tuned for multiple tasks at once. virtual prompts are stored in a - prompt table and can be added or deleted without disrupting virtual prompts - for other tasks. + and directly tunes these embedding weights. The token embeddings used in + initalization are specified by the user in the config file. The model can + be prompt-tuned for multiple tasks at once. virtual prompts are stored in a + prompt table and can be added or deleted without disrupting virtual prompts + for other tasks. P-tuning initializes an LSTM encoder model that generates virtual prompt embeddings for every task. Each task shares the same encoder. After ptuning is compelete, the learned virtual prompts can be saved to the prompt table - using add_ptuned_prompts_to_prompt_table(). Thus, if a user wants to add a - new virtual prompt via p-tuning, they do not need to retrain on all previous + using add_ptuned_prompts_to_prompt_table(). Thus, if a user wants to add a + new virtual prompt via p-tuning, they do not need to retrain on all previous tasks. This gives p-tuning the same task flexiblity as prompt-tuning. """ def __init__(self, cfg: DictConfig, trainer: Trainer): + # deprecation warning + deprecated_warning("MegatronGPTPromptLearningModel") + super().__init__(cfg, trainer) self.inference_params = None @@ -305,8 +309,8 @@ def forward( def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): """ - Dataloader produces a global batch which is turned into an iterator of microbatches. - The iterator of microbatches is then piped through the pipeline using Core's fwd/bwd functions. + Dataloader produces a global batch which is turned into an iterator of microbatches. + The iterator of microbatches is then piped through the pipeline using Core's fwd/bwd functions. 
""" # Get seq length of batch batch, _, _ = next(dataloader_iter) @@ -361,15 +365,15 @@ def training_step(self, dataloader_iter): return loss_mean def backward(self, *args, **kwargs): - """ LightningModule hook to do backward. - We want this to do nothing since we run backward in the fwd/bwd functions from megatron-core. - No need to call it here. + """LightningModule hook to do backward. + We want this to do nothing since we run backward in the fwd/bwd functions from megatron-core. + No need to call it here. """ return def optimizer_zero_grad(self, *args, **kwargs): - """ LightningModule hook to zero grad. - We want this to do nothing as we are zeroing grads during the training_step. + """LightningModule hook to zero grad. + We want this to do nothing as we are zeroing grads during the training_step. """ return @@ -415,11 +419,19 @@ def validation_step(self, dataloader_iter): labels_text.append(label) if mode == 'val': self.validation_step_outputs.append( - {'loss': loss_mean, 'preds': preds_text, 'labels': labels_text,} + { + 'loss': loss_mean, + 'preds': preds_text, + 'labels': labels_text, + } ) else: self.test_step_outputs.append( - {'loss': loss_mean, 'preds': preds_text, 'labels': labels_text,} + { + 'loss': loss_mean, + 'preds': preds_text, + 'labels': labels_text, + } ) return { 'loss': loss_mean, @@ -427,8 +439,10 @@ def validation_step(self, dataloader_iter): 'labels': labels_text, } - self.validation_step_outputs.append({'loss': loss_mean}) if mode == 'val' else self.test_step_outputs.append( - {'loss': loss_mean} + ( + self.validation_step_outputs.append({'loss': loss_mean}) + if mode == 'val' + else self.test_step_outputs.append({'loss': loss_mean}) ) return {'loss': loss_mean} @@ -481,7 +495,8 @@ def on_validation_epoch_end(self): gather_results_dedup = list(set(itertools.chain(*gather_results))) val_metric_dict = self.validation_metric.get_score( - [i[1] for i in gather_results_dedup], [i[0] for i in gather_results_dedup], + [i[1] for i in gather_results_dedup], + [i[0] for i in gather_results_dedup], ) for metric, val in val_metric_dict.items(): @@ -638,9 +653,9 @@ def build_virtual_prompt_dataset( drop_last=drop_last, num_workers=num_workers, pin_memory=pin_memory, - persistent_workers=True - if num_workers > 0 - else False, # (@adithyare and @eharper) We need this to make spawn=True to work. + persistent_workers=( + True if num_workers > 0 else False + ), # (@adithyare and @eharper) We need this to make spawn=True to work. ) return dataset, dataloader @@ -815,7 +830,7 @@ def list_available_models(cls): def get_pseudo_tokens(num_virtual_tokens): """ Takes in an integer and returns a list of strings where each string - is a numbered virtual token placeholder. If + is a numbered virtual token placeholder. If num_virtual_tokens = 3, then this function returns: ["", "", ""] @@ -823,7 +838,7 @@ def get_pseudo_tokens(num_virtual_tokens): Args: num_virtual_tokens: (int) Number of virtual token strings you want to make - returns a list of string. + returns a list of string. 
""" pseudo_tokens = [ diff --git a/nemo/collections/nlp/models/machine_translation/mt_enc_dec_bottleneck_model.py b/nemo/collections/nlp/models/machine_translation/mt_enc_dec_bottleneck_model.py index 41c6125ba05f..fd27c7ba1582 100644 --- a/nemo/collections/nlp/models/machine_translation/mt_enc_dec_bottleneck_model.py +++ b/nemo/collections/nlp/models/machine_translation/mt_enc_dec_bottleneck_model.py @@ -23,6 +23,7 @@ from nemo.collections.nlp.models.machine_translation.mt_enc_dec_model import MTEncDecModel from nemo.core.classes.common import typecheck from nemo.utils import timers +from nemo.utils.decorators import deprecated_warning __all__ = ['MTBottleneckModel'] @@ -57,6 +58,8 @@ class MTBottleneckModel(MTEncDecModel): """ def __init__(self, cfg: MTBottleneckModelConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("MTBottleneckModel") super().__init__(cfg=cfg, trainer=trainer) self.model_type: str = cfg.get("model_type", "nll") @@ -184,7 +187,11 @@ def loss( output_mask = (tgt_labels != self.decoder_tokenizer.pad_id).type_as(tgt_log_probs) log_p_x_given_z_per_token = ( - -recon_loss_fn(log_probs=tgt_log_probs, labels=tgt_labels,).view(tgt_log_probs.shape[:2]) * output_mask + -recon_loss_fn( + log_probs=tgt_log_probs, + labels=tgt_labels, + ).view(tgt_log_probs.shape[:2]) + * output_mask ) # probability per sample @@ -216,7 +223,10 @@ def loss( if self.model_type in ["mim", "vae"]: # tokens = tgt_mask.sum() - q_z_given_x = torch.distributions.Normal(loc=z_mean, scale=torch.exp(0.5 * z_logv),) + q_z_given_x = torch.distributions.Normal( + loc=z_mean, + scale=torch.exp(0.5 * z_logv), + ) # average latent distribution to match averaging of observations if self.recon_per_token: # average latent per dimension - to heuristically match per-token reconstruction @@ -225,7 +235,10 @@ def loss( log_q_z_given_x = q_z_given_x.log_prob(z).sum(-1).sum(-1).mean() # build prior distribution - p_z = torch.distributions.Normal(loc=torch.zeros_like(z), scale=torch.ones_like(z),) + p_z = torch.distributions.Normal( + loc=torch.zeros_like(z), + scale=torch.ones_like(z), + ) if self.recon_per_token: # average latent distribution similar to averaging of observations log_p_z = p_z.log_prob(z).mean(-1).mean(-1).mean() @@ -267,7 +280,11 @@ def forward(self, src, src_mask, tgt, tgt_mask, timer=None): if timer is not None: timer.start("encoder") - enc_hiddens, enc_mask = self.encoder(input_ids=src, encoder_mask=src_mask, return_mask=True,) + enc_hiddens, enc_mask = self.encoder( + input_ids=src, + encoder_mask=src_mask, + return_mask=True, + ) # build posterior distribution q(x|z) z, z_mean, z_logv = self.encode_latent(hidden=enc_hiddens) @@ -283,7 +300,10 @@ def forward(self, src, src_mask, tgt, tgt_mask, timer=None): context_hiddens = self.latent2hidden(z) tgt_hiddens = self.decoder( - input_ids=tgt, decoder_mask=tgt_mask, encoder_embeddings=context_hiddens, encoder_mask=enc_mask, + input_ids=tgt, + decoder_mask=tgt_mask, + encoder_embeddings=context_hiddens, + encoder_mask=enc_mask, ) # build decoding distribution @@ -426,18 +446,25 @@ def eval_step(self, batch, batch_idx, mode, dataloader_idx=0): return_info=True, ) # pass cache to sampler in order to reuse encoder's output - cache = dict(z=z, z_mean=z_mean, z_mask=z_mask, timer=timer,) + cache = dict( + z=z, + z_mean=z_mean, + z_mask=z_mask, + timer=timer, + ) inputs, translations = self.batch_translate(src=src_ids, src_mask=src_mask, cache=cache) num_measurements = labels.shape[0] * labels.shape[1] if dataloader_idx == 0: 
getattr(self, f'{mode}_loss')( - loss=eval_loss, num_measurements=num_measurements, + loss=eval_loss, + num_measurements=num_measurements, ) else: getattr(self, f'{mode}_loss_{dataloader_idx}')( - loss=eval_loss, num_measurements=num_measurements, + loss=eval_loss, + num_measurements=num_measurements, ) np_tgt = tgt_ids.detach().cpu().numpy() ground_truths = [self.decoder_tokenizer.ids_to_text(tgt) for tgt in np_tgt] diff --git a/nemo/collections/nlp/models/question_answering/qa_base_model.py b/nemo/collections/nlp/models/question_answering/qa_base_model.py index bfb45f51b6ac..7ca78f2e136e 100644 --- a/nemo/collections/nlp/models/question_answering/qa_base_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_base_model.py @@ -25,10 +25,14 @@ ) from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class BaseQAModel(NLPModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None, no_lm_init=True): + # deprecation warning + deprecated_warning("BaseQAModel") + self.cfg = cfg super().__init__(cfg=cfg, trainer=trainer, no_lm_init=no_lm_init) @@ -82,10 +86,13 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str): @torch.no_grad() def _get_per_sample_perplexity(self, logits, labels): - """ Returns average perplexity for each sample in the batch """ + """Returns average perplexity for each sample in the batch""" loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='none') - unreduced_loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1),) + unreduced_loss = loss_fct( + logits.view(-1, logits.size(-1)), + labels.view(-1), + ) unreduced_loss = unreduced_loss.reshape(labels.shape) mask_0 = unreduced_loss != 0 per_sample_perplexity = torch.exp((unreduced_loss * mask_0).sum(axis=1) / mask_0.sum(axis=1)) diff --git a/nemo/collections/nlp/models/question_answering/qa_bert_model.py b/nemo/collections/nlp/models/question_answering/qa_bert_model.py index 196fab4e3a04..d4bdef6d871d 100644 --- a/nemo/collections/nlp/models/question_answering/qa_bert_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_bert_model.py @@ -31,12 +31,15 @@ from nemo.collections.nlp.parts.utils_funcs import tensor2list from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class BERTQAModel(BaseQAModel): - """ BERT model with a QA (token classification) head """ + """BERT model with a QA (token classification) head""" def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("BERTQAModel") super().__init__(cfg=cfg, trainer=trainer, no_lm_init=False) self.classifier = TokenClassifier( @@ -190,7 +193,7 @@ def inference( num_samples: number of samples to use of inference data. Default: -1 if all data should be used. 
output_nbest_file: optional output file for writing out nbest list output_prediction_file: optional output file for writing out predictions - + Returns: model predictions, model nbest list """ @@ -209,7 +212,10 @@ def inference( logging.set_verbosity(logging.WARNING) infer_datalayer = self.setup_inference_data( - file, batch_size=batch_size, num_samples=num_samples, num_workers=2, + file, + batch_size=batch_size, + num_samples=num_samples, + num_workers=2, ) all_logits = [] @@ -244,7 +250,9 @@ def inference( if output_prediction_file: QAMetrics.dump_predicted_answers_to_file( - output_prediction_file, infer_datalayer.dataset.examples, all_predictions, + output_prediction_file, + infer_datalayer.dataset.examples, + all_predictions, ) if output_nbest_file: @@ -324,7 +332,7 @@ def get_predictions( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() - for (example_index, example) in enumerate(examples): + for example_index, example in enumerate(examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -349,7 +357,7 @@ def get_predictions( null_start_logit = 0 # end logit at the slice with min null score null_end_logit = 0 - for (feature_index, feature) in enumerate(curr_features): + for feature_index, feature in enumerate(curr_features): pos = unique_id_to_pos[feature.unique_id] start_indexes = self._get_best_indexes(start_logits[pos], n_best_size) end_indexes = self._get_best_indexes(end_logits[pos], n_best_size) @@ -468,7 +476,7 @@ def get_predictions( probs = _compute_softmax(total_scores) nbest_json = [] - for (i, entry) in enumerate(nbest): + for i, entry in enumerate(nbest): output = collections.OrderedDict() output["question"] = example.question_text output["text"] = entry.text @@ -531,7 +539,7 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str): return data_loader def _get_best_indexes(self, logits, n_best_size): - """ Get the n-best logits from a list """ + """Get the n-best logits from a list""" best_indices = np.argsort(logits)[::-1] @@ -570,7 +578,7 @@ def _get_final_text(self, pred_text: str, orig_text: str, do_lower_case: bool, v def _strip_spaces(text): ns_chars = [] ns_to_s_map = collections.OrderedDict() - for (i, c) in enumerate(text): + for i, c in enumerate(text): if c == " ": continue ns_to_s_map[len(ns_chars)] = i @@ -599,14 +607,16 @@ def _strip_spaces(text): if len(orig_ns_text) != len(tok_ns_text): if verbose_logging: logging.warning( - "Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text, + "Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, + tok_ns_text, ) return orig_text # We then project the characters in `pred_text` back to `orig_text` using # the character-to-character alignment. 
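# [Editor's note] Illustrative aside, not part of the patch: the inversion built
# just below turns the stripped-to-original index map produced by `_strip_spaces`
# into an original-to-stripped lookup, which is what lets the predicted span be
# projected back onto `orig_text`. A minimal standalone rendition of the idea
# (the helper mirrors `_strip_spaces` as shown in the hunk above):
import collections

def _strip_spaces_demo(text):
    ns_chars = []
    ns_to_s_map = collections.OrderedDict()
    for i, c in enumerate(text):
        if c == " ":
            continue
        ns_to_s_map[len(ns_chars)] = i  # stripped index -> original index
        ns_chars.append(c)
    return "".join(ns_chars), ns_to_s_map

_ns_text, _ns_map = _strip_spaces_demo("a b")   # -> "ab", {0: 0, 1: 2}
_inverse = {v: k for k, v in _ns_map.items()}   # original index -> stripped index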
tok_s_to_ns_map = {} - for (i, tok_index) in tok_ns_to_s_map.items(): + for i, tok_index in tok_ns_to_s_map.items(): tok_s_to_ns_map[tok_index] = i orig_start_position = None diff --git a/nemo/collections/nlp/models/question_answering/qa_gpt_model.py b/nemo/collections/nlp/models/question_answering/qa_gpt_model.py index 405b9a1e05ad..059cf5625f15 100644 --- a/nemo/collections/nlp/models/question_answering/qa_gpt_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_gpt_model.py @@ -27,10 +27,14 @@ from nemo.collections.nlp.models.question_answering.qa_base_model import BaseQAModel from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class GPTQAModel(BaseQAModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("GPTQAModel") + self.cfg = cfg self.setup_tokenizer(cfg.tokenizer) @@ -102,7 +106,11 @@ def on_validation_epoch_end(self): eval_dataset = self._test_dl.dataset if self.trainer.testing else self._validation_dl.dataset eval_results, _, _ = self.evaluate( - eval_dataset.features, eval_dataset.examples, unique_ids, per_sample_perplexity, generated_answers, + eval_dataset.features, + eval_dataset.examples, + unique_ids, + per_sample_perplexity, + generated_answers, ) self.log(f'{prefix}_loss', avg_loss) @@ -185,10 +193,19 @@ def inference( return all_predictions, all_nbest_perdictions def evaluate( - self, features, examples, unique_ids, per_sample_perplexity, generated_texts, + self, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ): all_predictions, all_nbest_predictions = self._get_predictions( - features, examples, unique_ids, per_sample_perplexity, generated_texts, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ) eval_results = QAMetrics.evaluate_predictions(examples, all_predictions) @@ -226,7 +243,12 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str): return data_loader def _get_predictions( - self, features, examples: List, unique_ids: List[int], per_sample_perplexity: List, generated_texts: List, + self, + features, + examples: List, + unique_ids: List[int], + per_sample_perplexity: List, + generated_texts: List, ): unique_id_to_pos = {} for index, unique_id in enumerate(unique_ids): @@ -242,7 +264,7 @@ def _get_predictions( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() - for (example_index, example) in enumerate(examples): + for example_index, example in enumerate(examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -250,7 +272,7 @@ def _get_predictions( curr_features = example_index_to_features[example_index] prelim_predictions = [] - for (feature_index, feature) in enumerate(curr_features): + for feature_index, feature in enumerate(curr_features): pos = unique_id_to_pos[feature.unique_id] curr_perplexity = per_sample_perplexity[pos] curr_generated_text = generated_texts[pos] diff --git a/nemo/collections/nlp/models/question_answering/qa_model.py b/nemo/collections/nlp/models/question_answering/qa_model.py index 6fb2054a2237..2147d7d6a5bf 100644 --- a/nemo/collections/nlp/models/question_answering/qa_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_model.py @@ -32,6 +32,7 @@ from nemo.collections.nlp.parts.utils_funcs import tensor2list from nemo.core.classes.common import PretrainedModelInfo, typecheck from 
nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['QAModel'] @@ -42,6 +43,9 @@ class QAModel(NLPModel): """ def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("QAModel") + super().__init__(cfg=cfg, trainer=trainer) self.classifier = TokenClassifier( hidden_size=self.hidden_size, @@ -186,7 +190,7 @@ def inference( num_samples: number of samples to use of inference data. Default: -1 if all data should be used. output_nbest_file: optional output file for writing out nbest list output_prediction_file: optional output file for writing out predictions - + Returns: model predictions, model nbest list """ diff --git a/nemo/collections/nlp/models/question_answering/qa_s2s_model.py b/nemo/collections/nlp/models/question_answering/qa_s2s_model.py index 81001fb66da7..5ad959fd1b6f 100644 --- a/nemo/collections/nlp/models/question_answering/qa_s2s_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_s2s_model.py @@ -28,10 +28,13 @@ from nemo.collections.nlp.models.question_answering.qa_base_model import BaseQAModel from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class S2SQAModel(BaseQAModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("S2SQAModel") self.cfg = cfg @@ -120,7 +123,11 @@ def on_validation_epoch_end(self): eval_dataset = self._test_dl.dataset if self.trainer.testing else self._validation_dl.dataset eval_results, _, _ = self.evaluate( - eval_dataset.features, eval_dataset.examples, unique_ids, per_sample_perplexity, generated_answers, + eval_dataset.features, + eval_dataset.examples, + unique_ids, + per_sample_perplexity, + generated_answers, ) self.log(f'{prefix}_loss', avg_loss) @@ -145,7 +152,11 @@ def forward(self, input_ids, input_attn_mask, labels): labels = torch.where(labels != -100, labels, torch.zeros_like(labels)) output_attn_masks = torch.where(labels > 0, torch.ones_like(labels), torch.zeros_like(labels)) unmasked_unreduced_loss = self.language_model( - input_ids, labels[:, :-1], input_attn_mask, output_attn_masks[:, :-1], lm_labels=labels[:, 1:], + input_ids, + labels[:, :-1], + input_attn_mask, + output_attn_masks[:, :-1], + lm_labels=labels[:, 1:], ) loss = self.language_model.loss_func(output_attn_masks[:, 1:], unmasked_unreduced_loss) per_sample_perplexity = torch.exp(unmasked_unreduced_loss) @@ -210,10 +221,19 @@ def inference( return all_predictions, all_nbest_predictions def evaluate( - self, features, examples, unique_ids, per_sample_perplexity, generated_texts, + self, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ): all_predictions, all_nbest_json = self._get_predictions( - features, examples, unique_ids, per_sample_perplexity, generated_texts, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ) eval_results = QAMetrics.evaluate_predictions(examples, all_predictions) @@ -251,7 +271,12 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str): return data_loader def _get_predictions( - self, features, examples: List, unique_ids: List[int], per_sample_perplexity: List, generated_texts: List, + self, + features, + examples: List, + unique_ids: List[int], + per_sample_perplexity: List, + generated_texts: List, ): unique_id_to_pos = {} @@ -268,7 +293,7 @@ def _get_predictions( all_predictions = collections.OrderedDict() 
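# [Editor's note] Illustrative aside, not part of the patch: `unique_id_to_pos`
# built above maps a feature's unique id back to its position in the batch, so the
# per-sample perplexities and generated texts (which arrive in batch order) can be
# looked up per feature. A tiny standalone example with made-up ids:
_unique_ids = [101, 102, 103]
_per_sample_perplexity = [1.8, 2.4, 1.1]
_unique_id_to_pos = {uid: i for i, uid in enumerate(_unique_ids)}
assert _per_sample_perplexity[_unique_id_to_pos[102]] == 2.4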
all_nbest_json = collections.OrderedDict() - for (example_index, example) in enumerate(examples): + for example_index, example in enumerate(examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -276,7 +301,7 @@ def _get_predictions( curr_features = example_index_to_features[example_index] prelim_predictions = [] - for (feature_index, feature) in enumerate(curr_features): + for feature_index, feature in enumerate(curr_features): pos = unique_id_to_pos[feature.unique_id] curr_perplexity = per_sample_perplexity[pos] curr_generated_text = generated_texts[pos] @@ -339,7 +364,10 @@ def _generate_candidates(self, input_ids, input_attn_mask): "max_length": num_tokens_to_generate, } generated_tokens = self.language_model.generate(**param_dict) - generated_answers = self.tokenizer.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True,) + generated_answers = self.tokenizer.tokenizer.batch_decode( + generated_tokens, + skip_special_tokens=True, + ) generated_answers = [ans.strip() for ans in generated_answers] elif self.cfg.library == 'megatron': diff --git a/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py b/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py index eed94f2e1e31..d9e08f6764fc 100644 --- a/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py +++ b/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py @@ -35,7 +35,7 @@ from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.core.neural_types import LogitsType, NeuralType from nemo.utils import logging -from nemo.utils.decorators import experimental +from nemo.utils.decorators import deprecated_warning, experimental __all__ = ["SpellcheckingAsrCustomizationModel"] @@ -48,7 +48,7 @@ class SpellcheckingAsrCustomizationModel(NLPModel): It takes as input ASR hypothesis and candidate customization entries. It labels the hypothesis with correct entry index or 0. Example input: [CLS] a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o [SEP] d i d i e r _ s a u m o n [SEP] a s t r o n o m i e [SEP] t r i s t a n _ g u i l l o t [SEP] ... - Input segments: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 + Input segments: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 Example output: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3 0 ... """ @@ -67,6 +67,9 @@ def output_module(self): return self def __init__(self, cfg: DictConfig, trainer: Trainer = None) -> None: + # deprecation warning + deprecated_warning("SpellcheckingAsrCustomizationModel") + super().__init__(cfg=cfg, trainer=trainer) # Label map contains 11 labels: 0 for nothing, 1..10 for target candidate ids @@ -321,7 +324,7 @@ def on_test_epoch_end(self): @torch.no_grad() def infer(self, dataloader_cfg: DictConfig, input_name: str, output_name: str) -> None: - """ Main function for Inference + """Main function for Inference Args: dataloader_cfg: config for dataloader @@ -517,7 +520,7 @@ def _setup_infer_dataloader(self, cfg: DictConfig, input_name: str) -> 'torch.ut Setup function for an inference data loader.
Args: cfg: config dictionary containing data loader params like batch_size, num_workers and pin_memory - input_name: path to input file. + input_name: path to input file. Returns: A pytorch DataLoader. """ diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index fd32ac844274..0ec9664dbca5 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -69,7 +69,11 @@ def forward_step(self, batch, tensor_shape): fwd_bwd_function = get_forward_backward_func() output_tensor = fwd_bwd_function( forward_step_func=self.model.get_forward_output_only_func(), - data_iterator=iter([batch,]), + data_iterator=iter( + [ + batch, + ] + ), model=[self.forward_model], num_microbatches=get_num_microbatches(), forward_only=True, @@ -104,7 +108,7 @@ def tokenize_batch(self, sentences, max_len, add_BOS): @abc.abstractclassmethod def clip_max_len(self, maxlen: int) -> int: - """ clip the max len based on the LM model max sequence length + """clip the max len based on the LM model max sequence length Args: maxlen (int): the max len computed from the context and number of tokens to generate returns (int): @@ -119,7 +123,7 @@ def init_batch(self, context_tokens: torch.Tensor, context_length: int, compute_ context_length (int): the context token length compute_attention_mask: bool: set to True to compute attention mask (not needed for FA) Args: - context_tokens (torch.Tensor): The padded context tokens including the space for tokens to be generated + context_tokens (torch.Tensor): The padded context tokens including the space for tokens to be generated """ pass @@ -262,7 +266,7 @@ def __init__(self, model): self.forward_model = self.model.model def clip_max_len(self, maxlen: int) -> int: - """ clip the max len based on the LM model max sequence length""" + """clip the max len based on the LM model max sequence length""" # for positional embedding types that allow length extrapolation, don't clip the max length if self.model.cfg.get("position_embedding_type", "learned_absolute") == "learned_absolute": @@ -336,7 +340,7 @@ def __init__(self, model): self.forward_model = self.model.model def clip_max_len(self, maxlen: int) -> int: - """ clip the max len based on the LM model max sequence length""" + """clip the max len based on the LM model max sequence length""" # for positional embedding types that allow length extrapolation, don't clip the max length if self.model.cfg.get("position_embedding_type", "learned_absolute") == "learned_absolute": @@ -390,7 +394,11 @@ def forward_step(self, batch, tensor_shape_and_context_length): output_tensor = fwd_bwd_function( forward_step_func=self.model.get_forward_output_only_func(), - data_iterator=iter([batch,]), + data_iterator=iter( + [ + batch, + ] + ), model=[self.forward_model], num_microbatches=get_num_microbatches(), forward_only=True, @@ -415,10 +423,18 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c list_data_dict = [] if multimodal_cfg["conv_template"] in ["nvgpt", "nv_steerlm", "nv_dpo"]: record = { - 'system': '\n' - if multimodal_cfg["conv_template"] == 'nv_dpo' - else 'A chat between a curious user and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the user\'s questions.\n\n', - 'conversations': [{'from': 'User', 'value': prompt}, {'from': 'Assistant', 'value': '',},], + 'system': ( + '\n' + if multimodal_cfg["conv_template"] == 'nv_dpo' + else 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\'s questions.\n\n' + ), + 'conversations': [ + {'from': 'User', 'value': prompt}, + { + 'from': 'Assistant', + 'value': '', + }, + ], } for turn in record['conversations']: @@ -441,7 +457,16 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c elif multimodal_cfg["conv_template"] == "llama_2": record = { - 'conversations': [{'from': 'human', 'value': prompt,}, {'from': 'gpt', 'value': '',},], + 'conversations': [ + { + 'from': 'human', + 'value': prompt, + }, + { + 'from': 'gpt', + 'value': '', + }, + ], } for turn in record['conversations']: @@ -455,7 +480,16 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c data_dict = preprocess_llama_2(sources, tokenizer, multimodal_cfg) elif multimodal_cfg["conv_template"] == "v1": record = { - 'conversations': [{'from': 'human', 'value': prompt,}, {'from': 'gpt', 'value': '',},], + 'conversations': [ + { + 'from': 'human', + 'value': prompt, + }, + { + 'from': 'gpt', + 'value': '', + }, + ], } for turn in record['conversations']: @@ -488,7 +522,8 @@ def __init__(self, model): sep_image_conv_front=self.data_cfg.sep_image_conv_front, conv_template=self.data_cfg.get("conv_template", "nvgpt"), image_token_len=self.data_cfg.image_token_len, - image_folder=self.data_cfg.image_folder, + image_folder=self.data_cfg.get('image_folder', None), + video_folder=self.data_cfg.get('video_folder', None), image_aspect_ratio=self.data_cfg.image_aspect_ratio, use_im_start_end=getattr(self.cfg.mm_cfg, 'use_im_start_end', False), image_processor=None, @@ -499,7 +534,7 @@ def __init__(self, model): ) def clip_max_len(self, maxlen: int) -> int: - """ clip the max len based on the LM model max sequence length""" + """clip the max len based on the LM model max sequence length""" if maxlen > self.model.cfg.encoder_seq_length + 1: maxlen = self.model.cfg.encoder_seq_length + 1 return maxlen @@ -616,7 +651,7 @@ def init_batch(self, context_tokens: torch.Tensor, context_length: int, compute_ ) def clip_max_len(self, maxlen: int) -> int: - """ clip the max len based on the LM model max sequence length""" + """clip the max len based on the LM model max sequence length""" if maxlen > self.model.frozen_model.cfg.encoder_seq_length + 1: maxlen = self.model.frozen_model.cfg.encoder_seq_length + 1 return maxlen @@ -681,7 +716,7 @@ def __init__(self, model): self.forward_model = self.model.model def clip_max_len(self, maxlen: int) -> int: - """ clip the max len based on the LM model max sequence length""" + """clip the max len based on the LM model max sequence length""" # for positional embedding types that allow length extrapolation, don't clip the max length if self.model.cfg.get("position_embedding_type", "learned_absolute") == "learned_absolute": @@ -830,21 +865,21 @@ def init_batch(self, context_tokens: torch.Tensor, context_length: int, compute_ # updating RetroEncoder (RetroEncoderCrossAttention, RetroEncoderBiasDropoutAdd, RetroEncoderLayerNorm) if contain_encoder: # the first cross-attention decoder layer contain encoder - layer.cross_attention.encoder.layers[ - 0 - ].cross_attention.retro_num_neighbors = 
inference_retro_num_neighbors - layer.cross_attention.encoder.layers[ - 0 - ].cross_attention.retro_chunk_length = inference_retro_chunk_length - layer.cross_attention.encoder.layers[ - 0 - ].cross_attention.retro_retrieved_length = inference_retro_retrieved_length - layer.cross_attention.encoder.layers[ - 0 - ].cross_attn_bda.retro_num_neighbors = inference_retro_num_neighbors - layer.cross_attention.encoder.layers[ - 0 - ].pre_mlp_layernorm.retro_num_neighbors = inference_retro_num_neighbors + layer.cross_attention.encoder.layers[0].cross_attention.retro_num_neighbors = ( + inference_retro_num_neighbors + ) + layer.cross_attention.encoder.layers[0].cross_attention.retro_chunk_length = ( + inference_retro_chunk_length + ) + layer.cross_attention.encoder.layers[0].cross_attention.retro_retrieved_length = ( + inference_retro_retrieved_length + ) + layer.cross_attention.encoder.layers[0].cross_attn_bda.retro_num_neighbors = ( + inference_retro_num_neighbors + ) + layer.cross_attention.encoder.layers[0].pre_mlp_layernorm.retro_num_neighbors = ( + inference_retro_num_neighbors + ) contain_encoder = False return context_tokens diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 1c68ebff8121..053385b3d8e6 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -63,7 +63,6 @@ # since PyTorch 2.3 the path has changed from torch.amp.grad_scaler import _refresh_per_optimizer_state -from nemo.collections.multimodal.modules.stable_diffusion.attention import BasicTransformerBlock from nemo.collections.nlp.modules.common.megatron.module import Float16Module from nemo.collections.nlp.modules.common.megatron.transformer import AutocastTransformerLayer, ParallelTransformerLayer from nemo.collections.nlp.parts import utils_funcs @@ -120,7 +119,7 @@ def init_model_parallel( sharp: bool, nccl_communicator_config_path: str = None, distributed_timeout_minutes: int = 30 ) -> None: - """ Initializes Megatron-LM model parallel if using model parallelism. + """Initializes Megatron-LM model parallel if using model parallelism. Args: sharp: Apply SHARP to NCCL data-parallel communication. @@ -164,7 +163,7 @@ def init_model_parallel( class NLPDDPStrategy(DDPStrategy): - """ DDP plugin for Pytorch Lightning. Needed to customize DDP for model parallel models. + """DDP plugin for Pytorch Lightning. Needed to customize DDP for model parallel models. Args: no_ddp_communication_hook: Disable DDP communication hook when using AMP-O2 @@ -231,8 +230,8 @@ def setup_distributed(self, global_rank: int = None, world_size: int = None) -> ) def configure_ddp(self): - """ Override LightningModule ddp if using model parallel. - Sets find_unused_parameters to False to use activation-checkpoint-recomputation. + """Override LightningModule ddp if using model parallel. + Sets find_unused_parameters to False to use activation-checkpoint-recomputation. 
""" if (hasattr(self.model, 'megatron_amp_O2') and self.model.megatron_amp_O2) or ( @@ -406,7 +405,7 @@ def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = Tr self.lightning_module.load_state_dict(checkpoint["state_dict"], strict=strict) def _fix_tensors_device(self, ckpt: Dict) -> Dict: - """ Ensure checkpoint tensors are on the correct device.""" + """Ensure checkpoint tensors are on the correct device.""" assert torch.cuda.is_initialized(), (torch.cuda.is_available(), torch.cuda.is_initialized()) cur_dev = torch.device("cuda", index=torch.cuda.current_device()) @@ -418,10 +417,10 @@ def _fix_device(t): return dict_list_map_outplace(_fix_device, ckpt) def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: - """ PTL method which we override to integrate distributed checkpoints for model parallel models. - In order to load distributed checkpoints we need to provide the sharded_state_dict to - the distributed load function. We get the sharded_state_dict from self.lightning_module - which makes it convenient to have the loading logic happen at the strategy level. + """PTL method which we override to integrate distributed checkpoints for model parallel models. + In order to load distributed checkpoints we need to provide the sharded_state_dict to + the distributed load function. We get the sharded_state_dict from self.lightning_module + which makes it convenient to have the loading logic happen at the strategy level. """ fs = get_filesystem(checkpoint_path) @@ -452,8 +451,9 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: def remove_checkpoint(self, filepath: Union[str, Path]) -> None: # check if filepath is a distributed checkpoint - if self.use_distributed_checkpointing and self.is_global_zero: - self.checkpoint_io.remove_checkpoint(ckpt_to_dir(filepath)) + if self.use_distributed_checkpointing: + if self.is_global_zero: + self.checkpoint_io.remove_checkpoint(ckpt_to_dir(filepath)) # legacy checkpoint logic, does not use megatron core else: @@ -500,15 +500,15 @@ def distributed_sampler_kwargs(self): @property def restore_checkpoint_after_setup(self) -> bool: - """ This needs to be True for distributed checkpointing because - we require the model to have configured the optimizer before - deserializing the checkpoint. + """This needs to be True for distributed checkpointing because + we require the model to have configured the optimizer before + deserializing the checkpoint. """ return True class NLPDDPStrategyNotebook(NLPDDPStrategy): - """ Version of NLPDDPStrategy to be used in a Jupyter Notebook + """Version of NLPDDPStrategy to be used in a Jupyter Notebook A large portion of Megatron code has DDP dependency, so it has been necessary to use NLPDDPStrategy even for single-GPU training (e.g. in a Jupyter notebook) A PTL 2.0 changes has prevented DDPStrategy to be used in a notebook. @@ -546,7 +546,7 @@ def _get_full_state_dict_context(module: torch.nn.Module, rank0_only: bool = Fal class NLPFSDPStrategy(FSDPStrategy): - """ FSDP plugin for Pytorch Lightning with the support for tensor-parallelism. + """FSDP plugin for Pytorch Lightning with the support for tensor-parallelism. Args: sharding_strategy: FSDP parameter sharding strategy. @@ -583,6 +583,9 @@ def __init__( # Use the default FSDP backward-prefetch policy for proper communication overlap. 
kwargs['backward_prefetch'] = BackwardPrefetch.BACKWARD_PRE + # import here to prevent circular imports + from nemo.collections.multimodal.modules.stable_diffusion.attention import BasicTransformerBlock + # Set FSDP wrapping policy: use Transformer layer module as the FSDP sharding granularity. self.fsdp_wrap_module = { MCoreTransformerLayer, @@ -639,7 +642,11 @@ def _set_mixed_precision_recipe( reduce_dtype = utils_funcs.torch_dtype_from_precision(grad_reduce_dtype, None) if set_buffer_dtype is not None: buffer_dtype = utils_funcs.torch_dtype_from_precision(buffer_dtype, None) - return MixedPrecision(param_dtype=param_dtype, reduce_dtype=reduce_dtype, buffer_dtype=buffer_dtype,) + return MixedPrecision( + param_dtype=param_dtype, + reduce_dtype=reduce_dtype, + buffer_dtype=buffer_dtype, + ) def setup_environment(self) -> None: """ @@ -750,7 +757,9 @@ def _get_osd(opt_state): with FSDP.summon_full_params(self.model, writeback=True, rank0_only=False): # rekey the osd stored from non-FSDP model rekeyed_osd = FSDP.rekey_optim_state_dict( - temp_osd, OptimStateKeyType.PARAM_NAME, self.model, + temp_osd, + OptimStateKeyType.PARAM_NAME, + self.model, ) temp_osd = FSDP.shard_full_optim_state_dict(rekeyed_osd, self.model) except Exception as e: @@ -758,7 +767,9 @@ def _get_osd(opt_state): exit(1) # Shard optimizer state dict sharded_osd = FSDP.optim_state_dict_to_load( - optim_state_dict=temp_osd, model=self.model, optim=optimizer, + optim_state_dict=temp_osd, + model=self.model, + optim=optimizer, ) optimizer.load_state_dict(sharded_osd) @@ -767,9 +778,9 @@ def _get_osd(opt_state): def save_checkpoint( self, checkpoint: Dict[str, Any], filepath: Union[str, Path], storage_options: Optional[Any] = None ) -> None: - """ Store checkpoints - 1. In case of sharded checkpoint, all ranks store unique checkpoints. - 2. In case of non-sharded checkpoint, all data-parallel rank 0 store checkpoints. + """Store checkpoints + 1. In case of sharded checkpoint, all ranks store unique checkpoints. + 2. In case of non-sharded checkpoint, all data-parallel rank 0 processes store checkpoints. """ app_state = AppState() filepath = inject_model_parallel_rank(filepath, fsdp_sharded_ckpt=self.sharded_checkpoint) @@ -780,8 +791,7 @@ def save_checkpoint( self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options) def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: - """ Load checkpoints - """ + """Load checkpoints""" # 1. Load normal or FSDP-sharded checkpoints. fs = get_filesystem(checkpoint_path) @@ -798,8 +808,7 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: return checkpoint def remove_checkpoint(self, filepath: Union[str, Path]) -> None: - """ Remove checkpoints - """ + """Remove checkpoints""" # legacy checkpoint logic, does not use megatron core app_state = AppState() # PTL override to accommodate model parallel checkpoints filepath = inject_model_parallel_rank(filepath, fsdp_sharded_ckpt=self.sharded_checkpoint) @@ -814,9 +823,9 @@ def remove_checkpoint(self, filepath: Union[str, Path]) -> None: @property def restore_checkpoint_after_setup(self) -> bool: - """ When loading FSDP-sharded checkpoint, need to restore checkpoint after configuring - FSDP sharding to match FSDP-sharded format between the checkpoint and the current - model and optimizer. + """When loading FSDP-sharded checkpoint, need to restore checkpoint after configuring + FSDP sharding to match FSDP-sharded format between the checkpoint and the current + model and optimizer.
""" return True @@ -915,7 +924,8 @@ def dummy(): else: # move weights to the tmpdir for tp_rank, pp_rank in itertools.product( - range(app_state.tensor_model_parallel_size), range(app_state.pipeline_model_parallel_size), + range(app_state.tensor_model_parallel_size), + range(app_state.pipeline_model_parallel_size), ): os.makedirs(os.path.join(tmpdir, f'tp_rank_{tp_rank:02d}_pp_rank_{pp_rank:03d}')) mp_model_weights = os.path.join( @@ -1000,6 +1010,7 @@ def modify_state_dict(self, conf, state_dict): loaded_keys = state_dict.keys() if 'model.model.diffusion_model.input_blocks.1.0.in_layers.2.weight' in loaded_keys: new_state_dict = {} + # GroupNormOpt fuses activation function to one layer, thus the indexing of weights are shifted for following def should_process(key): base_str = "model.model.diffusion_model." @@ -1110,7 +1121,13 @@ def restore_from( # Get path where the command is executed - the artifacts will be "retrieved" there # (original .nemo behavior) loaded_params = super().load_config_and_state_dict( - calling_cls, restore_path, override_config_path, map_location, strict, return_config, trainer, + calling_cls, + restore_path, + override_config_path, + map_location, + strict, + return_config, + trainer, ) if not isinstance(loaded_params, tuple) or return_config is True: return loaded_params @@ -1165,12 +1182,12 @@ def dummy(): class PipelineMixedPrecisionPlugin(MixedPrecisionPlugin): - """ Overrides PTL autocasting to not wrap training/val/test_step. - We do this because we have the megatron-core fwd/bwd functions in training_step. - This means .backward is being called in training_step so we do not want the whole - step wrapped in autocast. + """Overrides PTL autocasting to not wrap training/val/test_step. + We do this because we have the megatron-core fwd/bwd functions in training_step. + This means .backward is being called in training_step so we do not want the whole + step wrapped in autocast. - We instead wrap the fwd_output_and_loss_func that is passed to the megatron-core fwd/bwd functions. + We instead wrap the fwd_output_and_loss_func that is passed to the megatron-core fwd/bwd functions. """ def __init__( @@ -1206,12 +1223,12 @@ def forward_context(self) -> Generator[None, None, None]: class FSDPMixedPrecisionPlugin(FSDPPrecision): - """ Overrides PTL autocasting to not wrap training/val/test_step. - We do this because we have the megatron-core fwd/bwd functions in training_step. - This means .backward is being called in training_step so we do not want the whole - step wrapped in autocast. + """Overrides PTL autocasting to not wrap training/val/test_step. + We do this because we have the megatron-core fwd/bwd functions in training_step. + This means .backward is being called in training_step so we do not want the whole + step wrapped in autocast. - We instead wrap the fwd_output_and_loss_func that is passed to the megatron-core fwd/bwd functions. + We instead wrap the fwd_output_and_loss_func that is passed to the megatron-core fwd/bwd functions. """ def __init__( @@ -1246,7 +1263,7 @@ class GradScaler(torch.cuda.amp.GradScaler): def __init__( self, - init_scale=2.0 ** 16, + init_scale=2.0**16, growth_factor=2.0, backoff_factor=0.5, growth_interval=2000, @@ -1500,7 +1517,7 @@ def optimizer_step( @contextmanager def forward_context(self) -> Generator[None, None, None]: - """ No explicit precision casting. Inputs are supposed to be manually casted """ + """No explicit precision casting. 
Inputs are supposed to be manually casted""" try: yield finally: @@ -1508,7 +1525,7 @@ def forward_context(self) -> Generator[None, None, None]: class GlobalBatchDataFetcher(_DataFetcher): - """ Overrides PTL DataFetcher. Used to fetch global batches.""" + """Overrides PTL DataFetcher. Used to fetch global batches.""" def __init__(self, prefetch_batches: int = 0, store_on_device: bool = False) -> None: diff --git a/nemo/deploy/deploy_pytriton.py b/nemo/deploy/deploy_pytriton.py index 22dea8ac47cd..25e09cf3eacc 100644 --- a/nemo/deploy/deploy_pytriton.py +++ b/nemo/deploy/deploy_pytriton.py @@ -24,7 +24,6 @@ class DeployPyTriton(DeployBase): - """ Deploys any models to Triton Inference Server that implements ITritonDeployable interface in nemo.deploy. @@ -102,7 +101,6 @@ def __init__( ) def deploy(self): - """ Deploys any models to Triton Inference Server. """ @@ -148,7 +146,6 @@ def deploy(self): print(e) def serve(self): - """ Starts serving the model and waits for the requests """ @@ -163,7 +160,6 @@ def serve(self): print(e) def run(self): - """ Starts serving the model asynchronously. """ diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index 6a4337024eeb..c8387914c2e9 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -71,7 +71,8 @@ class NemoQueryLLM(NemoQueryLLMBase): def __init__(self, url, model_name): super().__init__( - url=url, model_name=model_name, + url=url, + model_name=model_name, ) def query_llm( diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index cad7b821b3b4..b030165a3d45 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -82,15 +82,24 @@ class TensorRTLLM(ITritonDeployable): """ - def __init__(self, model_dir: str, lora_ckpt_list: List[str] = None, load_model: bool = True): + def __init__( + self, + model_dir: str, + lora_ckpt_list: List[str] = None, + load_model: bool = True, + use_python_runtime: bool = True, + ): """ Args: model_dir (str): path for storing the TensorRT-LLM model files. + lora_ckpt_list (List[str]): lora checkpoint paths. load_model (bool): load TensorRT-LLM model if the engine files exist in the model_dir. + use_python_runtime (bool): whether to use python or c++ runtime. 
""" self.model_dir = model_dir self.lora_ckpt_list = lora_ckpt_list + self.use_python_runtime = use_python_runtime self.model = None self.tokenizer = None self.n_gpus = None @@ -623,7 +632,7 @@ def _prep_ptuning_table(self): if len(vtokens_embeddings) > 0: self.p_table = torch.stack(vtokens_embeddings, dim=0).view(-1, self.get_hidden_size) - max_prompt_embedding_table_size = self.config['builder_config']['max_prompt_embedding_table_size'] + max_prompt_embedding_table_size = self.config['build_config']['max_prompt_embedding_table_size'] actual_prompt_table_size = self.p_table.shape[0] if actual_prompt_table_size > max_prompt_embedding_table_size: @@ -754,7 +763,10 @@ def _load(self): self._load_config_file() self.tokenizer = get_tokenzier(Path(os.path.join(self.model_dir))) self.model = load( - tokenizer=self.tokenizer, engine_dir=self.model_dir, lora_ckpt_list=self.lora_ckpt_list + tokenizer=self.tokenizer, + engine_dir=self.model_dir, + lora_ckpt_list=self.lora_ckpt_list, + use_python_runtime=self.use_python_runtime, ) self._load_prompt_tables() except Exception as error: diff --git a/nemo/export/trt_llm/decoder/decoder.py b/nemo/export/trt_llm/decoder/decoder.py index b3c0e2257e9f..2d1993fd74c0 100644 --- a/nemo/export/trt_llm/decoder/decoder.py +++ b/nemo/export/trt_llm/decoder/decoder.py @@ -90,7 +90,11 @@ def build_post_layernorm(self, layer) -> Optional[LayernormConfig]: pass def __init__( - self, decoder_type: str, dtype: trt.DataType = trt.float16, rank: int = 0, tensor_parallel: int = 1, + self, + decoder_type: str, + dtype: trt.DataType = trt.float16, + rank: int = 0, + tensor_parallel: int = 1, ): """Initializes the DecoderLayerConfigBuilder.""" self.decoder_type = decoder_type diff --git a/nemo/export/trt_llm/decoder/falcon.py b/nemo/export/trt_llm/decoder/falcon.py index 91edc7794607..e05979fa75a0 100644 --- a/nemo/export/trt_llm/decoder/falcon.py +++ b/nemo/export/trt_llm/decoder/falcon.py @@ -69,7 +69,11 @@ def build_attention(self, layer) -> AttentionConfig: ) config.dense = LinearConfig.from_nn_module( - layer.self_attn.o_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.self_attn.o_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -78,13 +82,25 @@ def build_attention(self, layer) -> AttentionConfig: def build_mlp(self, layer) -> MLPConfig: config = MLPConfig() config.fc = LinearConfig.from_nn_module( - layer.mlp.gate_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.gate_proj, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.proj = LinearConfig.from_nn_module( - layer.mlp.down_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.down_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.gate = LinearConfig.from_nn_module( - layer.mlp.up_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.up_proj, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -130,4 +146,7 @@ def build_decoder(self, layer): config.set_if_not_exist('bias', False) config.set_if_not_exist('moe_num_experts', 0) - return FalconDecoderLayer(config=config, layer_idx=self.layer_id,) + return FalconDecoderLayer( + config=config, + layer_idx=self.layer_id, + ) diff --git 
a/nemo/export/trt_llm/decoder/gemma.py b/nemo/export/trt_llm/decoder/gemma.py index 10301c7a47d7..37f843dcf0ca 100644 --- a/nemo/export/trt_llm/decoder/gemma.py +++ b/nemo/export/trt_llm/decoder/gemma.py @@ -64,7 +64,11 @@ def build_attention(self, layer) -> AttentionConfig: ) config.dense = LinearConfig.from_nn_module( - layer.self_attn.o_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.self_attn.o_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -73,13 +77,25 @@ def build_attention(self, layer) -> AttentionConfig: def build_mlp(self, layer) -> MLPConfig: config = MLPConfig() config.fc = LinearConfig.from_nn_module( - layer.mlp.gate_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.gate_proj, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.proj = LinearConfig.from_nn_module( - layer.mlp.down_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.down_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.gate = LinearConfig.from_nn_module( - layer.mlp.up_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.up_proj, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -128,4 +144,7 @@ def build_decoder(self, layer): config.set_if_not_exist('dense_context_fmha', False) config.set_if_not_exist('moe_num_experts', 0) - return GemmaDecoderLayer(config=config, layer_idx=self.layer_id,) + return GemmaDecoderLayer( + config=config, + layer_idx=self.layer_id, + ) diff --git a/nemo/export/trt_llm/decoder/gpt.py b/nemo/export/trt_llm/decoder/gpt.py index 8af4e4ef01e4..a405aabbbd48 100644 --- a/nemo/export/trt_llm/decoder/gpt.py +++ b/nemo/export/trt_llm/decoder/gpt.py @@ -54,11 +54,18 @@ def build_input_layernorm(self, layer) -> LayernormConfig: def build_attention(self, layer) -> AttentionConfig: config = AttentionConfig() config.qkv = LinearConfig.from_qkv_nn_modules( - [layer.attn.c_attn], rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + [layer.attn.c_attn], + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.dense = LinearConfig.from_nn_module( - layer.attn.c_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.attn.c_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -67,10 +74,18 @@ def build_attention(self, layer) -> AttentionConfig: def build_mlp(self, layer) -> MLPConfig: config = MLPConfig() config.fc = LinearConfig.from_nn_module( - layer.mlp.c_fc, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.c_fc, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.proj = LinearConfig.from_nn_module( - layer.mlp.c_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.c_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -126,4 +141,7 @@ def build_decoder(self, layer): config.set_if_not_exist('rotary_pct', rotary_pct) config.set_if_not_exist('moe_num_experts', 0) - return 
GPTDecoderLayer(config=config, layer_idx=self.layer_id,) + return GPTDecoderLayer( + config=config, + layer_idx=self.layer_id, + ) diff --git a/nemo/export/trt_llm/decoder/gptj.py b/nemo/export/trt_llm/decoder/gptj.py index aa65ca385a47..327a31fdd35c 100644 --- a/nemo/export/trt_llm/decoder/gptj.py +++ b/nemo/export/trt_llm/decoder/gptj.py @@ -60,7 +60,11 @@ def build_attention(self, layer) -> AttentionConfig: ) config.dense = LinearConfig.from_nn_module( - layer.attn.out_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.attn.out_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.rotary_dim = layer.attn.rotary_dim @@ -71,10 +75,18 @@ def build_attention(self, layer) -> AttentionConfig: def build_mlp(self, layer) -> MLPConfig: config = MLPConfig() config.fc = LinearConfig.from_nn_module( - layer.mlp.fc_in, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.fc_in, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.proj = LinearConfig.from_nn_module( - layer.mlp.fc_out, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.fc_out, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config diff --git a/nemo/export/trt_llm/decoder/llama.py b/nemo/export/trt_llm/decoder/llama.py index 873c0306375b..b37d62e214de 100644 --- a/nemo/export/trt_llm/decoder/llama.py +++ b/nemo/export/trt_llm/decoder/llama.py @@ -66,7 +66,11 @@ def build_attention(self, layer) -> AttentionConfig: ) config.dense = LinearConfig.from_nn_module( - layer.self_attn.o_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.self_attn.o_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -75,13 +79,25 @@ def build_attention(self, layer) -> AttentionConfig: def build_mlp(self, layer) -> MLPConfig: config = MLPConfig() config.fc = LinearConfig.from_nn_module( - layer.mlp.gate_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.gate_proj, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.proj = LinearConfig.from_nn_module( - layer.mlp.down_proj, LINEAR_ROW, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.down_proj, + LINEAR_ROW, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) config.gate = LinearConfig.from_nn_module( - layer.mlp.up_proj, LINEAR_COLUMN, rank=self.rank, tensor_parallel=self.tensor_parallel, dtype=self.dtype, + layer.mlp.up_proj, + LINEAR_COLUMN, + rank=self.rank, + tensor_parallel=self.tensor_parallel, + dtype=self.dtype, ) return config @@ -147,4 +163,7 @@ def build_decoder(self, layer): config.moe_tp_mode = layer.moe_tp_mode config.moe_normalization_mode = layer.moe_renorm_mode - return LLaMADecoderLayer(config=config, layer_idx=self.layer_id,) + return LLaMADecoderLayer( + config=config, + layer_idx=self.layer_id, + ) diff --git a/nemo/export/trt_llm/model_config.py b/nemo/export/trt_llm/model_config.py index dd360afd6b8a..0f120dc56153 100644 --- a/nemo/export/trt_llm/model_config.py +++ b/nemo/export/trt_llm/model_config.py @@ -122,7 +122,11 @@ def from_nn_module(module: nn.Module, linear_type: str, rank=0, tensor_parallel= if hasattr(module, 
"bias") and module.bias is not None: if linear_type == LINEAR_COLUMN: config.bias = np.ascontiguousarray( - split(torch_to_numpy_with_dtype(module.bias, dtype), tensor_parallel, rank,) + split( + torch_to_numpy_with_dtype(module.bias, dtype), + tensor_parallel, + rank, + ) ) else: config.bias = torch_to_numpy_with_dtype(module.bias, dtype) @@ -234,7 +238,9 @@ class AttentionConfig: @staticmethod def from_nemo( - weights_dict: Dict[str, np.ndarray], layer_id: int, rank: int = 0, + weights_dict: Dict[str, np.ndarray], + layer_id: int, + rank: int = 0, ): """Converts the nemo weights and config to `AttentionConfig`.""" attention = AttentionConfig() @@ -243,12 +249,16 @@ def from_nemo( weights_dict, f"layers.{layer_id}.attention.query_key_value.weight.{rank}" ) attention.qkv.bias = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.attention.query_key_value.bias.{rank}", + weights_dict, + f"layers.{layer_id}.attention.query_key_value.bias.{rank}", ) attention.dense = LinearConfig(linear_type=LINEAR_ROW) attention.dense.weight = get_tensor_from_dict(weights_dict, f"layers.{layer_id}.attention.dense.weight.{rank}") - attention.dense.bias = get_tensor_from_dict(weights_dict, f"layers.{layer_id}.attention.dense.bias",) + attention.dense.bias = get_tensor_from_dict( + weights_dict, + f"layers.{layer_id}.attention.dense.bias", + ) return attention @@ -276,7 +286,10 @@ def from_nemo( # print("********** mlp.fc.weight : ", mlp.fc.weight ) - mlp.fc.bias = get_tensor_from_dict(weights_dict, f"layers.{layer_id}.mlp.dense_h_to_4h.bias.{rank}",) + mlp.fc.bias = get_tensor_from_dict( + weights_dict, + f"layers.{layer_id}.mlp.dense_h_to_4h.bias.{rank}", + ) gated = is_gated_activation(mlp.hidden_act) is_fast_glu = mlp.hidden_act in ['fast-geglu', 'fast-swiglu', 'fast-reglu'] @@ -287,9 +300,13 @@ def from_nemo( if isinstance(llm_config, LlamaConfig) and not is_mcore and not is_fast_glu else f"layers.{layer_id}.mlp.dense_h_to_4h.gate.weight.{rank}" ) - mlp.gate.weight = get_tensor_from_dict(weights_dict, layer_name,) + mlp.gate.weight = get_tensor_from_dict( + weights_dict, + layer_name, + ) mlp.gate.bias = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.mlp.dense_h_to_4h.gate.bias.{rank}", + weights_dict, + f"layers.{layer_id}.mlp.dense_h_to_4h.gate.bias.{rank}", ) mlp.proj = LinearConfig(linear_type=LINEAR_ROW) @@ -382,19 +399,23 @@ def from_nemo( LAYERNORM_RMS if isinstance(llm_config, LlamaConfig) else LAYERNORM_DEFAULT ) layer_config.input_layernorm.weight = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.input_layernorm.weight", + weights_dict, + f"layers.{layer_id}.input_layernorm.weight", ) layer_config.input_layernorm.bias = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.input_layernorm.bias", + weights_dict, + f"layers.{layer_id}.input_layernorm.bias", ) layer_config.mlp_layernorm = LayernormConfig() layer_config.mlp_layernorm.layernorm_type = LAYERNORM_DEFAULT # Falcon uses default layernorm layer_config.mlp_layernorm.weight = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.pre_mlp_layernorm.weight", + weights_dict, + f"layers.{layer_id}.pre_mlp_layernorm.weight", ) layer_config.mlp_layernorm.bias = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.pre_mlp_layernorm.bias", + weights_dict, + f"layers.{layer_id}.pre_mlp_layernorm.bias", ) layer_config.post_layernorm = LayernormConfig() @@ -403,10 +424,12 @@ def from_nemo( ) layer_config.post_layernorm.weight = get_tensor_from_dict( - weights_dict, 
f"layers.{layer_id}.post_attention_layernorm.weight", + weights_dict, + f"layers.{layer_id}.post_attention_layernorm.weight", ) layer_config.post_layernorm.bias = get_tensor_from_dict( - weights_dict, f"layers.{layer_id}.post_attention_layernorm.bias", + weights_dict, + f"layers.{layer_id}.post_attention_layernorm.bias", ) if layer_config.post_layernorm.weight is None: # Falcon doesn't have post layernorm @@ -415,7 +438,11 @@ def from_nemo( if layer_config.mlp_layernorm.weight is None: layer_config.mlp_layernorm = None - layer_config.attention = AttentionConfig.from_nemo(weights_dict, layer_id, rank,) + layer_config.attention = AttentionConfig.from_nemo( + weights_dict, + layer_id, + rank, + ) moe = False if llm_config.moe_num_experts is not None: diff --git a/nemo/export/trt_llm/nemo/nemo.py b/nemo/export/trt_llm/nemo/nemo.py index 9026cd9cfba9..c3564f1c4e8e 100644 --- a/nemo/export/trt_llm/nemo/nemo.py +++ b/nemo/export/trt_llm/nemo/nemo.py @@ -106,7 +106,9 @@ def extract_layers_with_prefix(model_, prefix): class UnpackedNemoCheckpointDir: def __init__( - self, checkpoints_dir: typing.Union[pathlib.Path, TarPath], load_checkpoints_to_cpu: bool = False, + self, + checkpoints_dir: typing.Union[pathlib.Path, TarPath], + load_checkpoints_to_cpu: bool = False, ): assert isinstance(checkpoints_dir, (pathlib.Path, TarPath)) self._checkpoints_dir = checkpoints_dir @@ -121,11 +123,7 @@ def model_config(self): model_configs_paths = list(self._checkpoints_dir.rglob(model_config_filename)) if model_configs_paths: if len(model_configs_paths) > 1: - raise RuntimeError( - f"There are more than single {model_config_filename} in" - f" {self._checkpoints_dir}:" - f" {', '.join(map(lambda p: p.as_posix(), model_configs_paths))}" - ) + LOGGER.debug(f"There are more than single {model_config_filename} in" f" {self._checkpoints_dir}") model_config_path = model_configs_paths[0] LOGGER.debug("Loading model config from %s", model_config_path) with model_config_path.open("r") as model_config_file: diff --git a/nemo/export/trt_llm/tensorrt_llm_model.py b/nemo/export/trt_llm/tensorrt_llm_model.py index 736d6180807e..f4b44552af63 100644 --- a/nemo/export/trt_llm/tensorrt_llm_model.py +++ b/nemo/export/trt_llm/tensorrt_llm_model.py @@ -144,7 +144,12 @@ def forward( if attention_mask is not None: attention_mask = expand_mask(attention_mask, shape(input_ids, -1)) - for layer_idx, (layer, past) in enumerate(zip(self.layers, kv_cache_params.past_key_value,)): + for layer_idx, (layer, past) in enumerate( + zip( + self.layers, + kv_cache_params.past_key_value, + ) + ): decoder_params = { "hidden_states": hidden_states, diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 92fc36272f7c..1bdfd5237caf 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -16,17 +16,19 @@ import json import logging import os +import tempfile from dataclasses import dataclass from pathlib import Path from typing import List, Optional +import numpy as np import tensorrt_llm import torch from mpi4py.futures import MPIPoolExecutor from tensorrt_llm.logger import logger from tensorrt_llm.lora_manager import LoraManager from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.runtime import ModelConfig, ModelRunnerCpp, SamplingConfig +from tensorrt_llm.runtime import ModelConfig, ModelRunner, ModelRunnerCpp, SamplingConfig from transformers import PreTrainedTokenizer from nemo.export.trt_llm.tensor_utils import get_tensor_parallel_group @@ -55,7 
+57,7 @@ class TensorrtLLMHostContext: class TensorrtLLMWorkerContext: """The MPI worker side context for TRT LLM inference.""" - decoder: ModelRunnerCpp = None + decoder: ModelRunner = None sampling_config: SamplingConfig = None lora_manager: LoraManager = None max_batch_size: int = 0 @@ -128,7 +130,13 @@ def _read_config(config_path: Path): return model_config, world_size, tensor_parallel_size, pipeline_parallel_size, dtype, max_input_len, max_batch_size -def _load(tokenizer: PreTrainedTokenizer, engine_dir, lora_ckpt_list=None, num_beams=1): +def _load( + tokenizer: PreTrainedTokenizer, + engine_dir, + lora_ckpt_list=None, + num_beams=1, + use_python_runtime: bool = True, +): """The impl of `load` API for on a single GPU worker.""" try: tensorrt_llm.logger.set_level("info") @@ -147,17 +155,26 @@ def _load(tokenizer: PreTrainedTokenizer, engine_dir, lora_ckpt_list=None, num_b runtime_rank = tensorrt_llm.mpi_rank() - decoder = ModelRunnerCpp.from_dir( - engine_dir=engine_dir, - lora_dir=lora_ckpt_list, - lora_ckpt_source="nemo", - rank=runtime_rank, - max_batch_size=max_batch_size, - max_input_len=max_input_len, - max_output_len=max_output_len, - max_beam_width=max_beam_width, - debug_mode=False, - ) + if use_python_runtime: + decoder = ModelRunner.from_dir( + engine_dir=engine_dir, + lora_dir=lora_ckpt_list, + lora_ckpt_source="nemo", + rank=runtime_rank, + debug_mode=False, + ) + else: + decoder = ModelRunnerCpp.from_dir( + engine_dir=engine_dir, + lora_dir=lora_ckpt_list, + lora_ckpt_source="nemo", + rank=runtime_rank, + max_batch_size=max_batch_size, + max_input_len=max_input_len, + max_output_len=max_output_len, + max_beam_width=max_beam_width, + debug_mode=False, + ) sampling_config = SamplingConfig( end_id=tokenizer.eos_token_id, pad_id=tokenizer.eos_token_id, num_beams=num_beams @@ -218,6 +235,13 @@ def _forward( with torch.no_grad(): prompt_tasks = None if task_ids is None else ",".join(str(task) for task in task_ids) + if prompt_table is not None: + prompt_table = prompt_table.reshape(1, *prompt_table.shape) + tmp_dir = tempfile.TemporaryDirectory() + prompt_table_path = os.path.join(tmp_dir.name, 'prompt_table.npy') + np.save(prompt_table_path, prompt_table.cpu().float().numpy()) + prompt_table = prompt_table_path + outputs = decoder.generate( input_tensors, max_new_tokens=max_output_len, @@ -230,6 +254,7 @@ def _forward( stop_words_list=stop_words_list, bad_words_list=bad_words_list, lora_uids=lora_uids, + prompt_table_path=prompt_table, prompt_table=prompt_table, prompt_tasks=prompt_tasks, streaming=streaming, @@ -239,6 +264,9 @@ def _forward( torch.cuda.synchronize() + if prompt_table is not None: + tmp_dir.cleanup() + runtime_rank = tensorrt_llm.mpi_rank() if runtime_rank == 0 or multiprocessed_env: return outputs @@ -251,7 +279,11 @@ def _forward( def load( - tokenizer: PreTrainedTokenizer, engine_dir: str, lora_ckpt_list: List[str] = None, num_beams: int = 1 + tokenizer: PreTrainedTokenizer, + engine_dir: str, + lora_ckpt_list: List[str] = None, + num_beams: int = 1, + use_python_runtime: bool = True, ) -> TensorrtLLMHostContext: """Loaded the compiled LLM model and run it. 
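# [Editor's note] Illustrative aside, not part of the patch: the new
# `use_python_runtime` flag threaded through `load`/`_load` above simply selects
# between TensorRT-LLM's Python `ModelRunner` and the C++ `ModelRunnerCpp`; only
# the C++ runner is handed the explicit capacity limits. A hypothetical condensed
# view of that dispatch (argument handling simplified):
def _pick_runner_cls(use_python_runtime: bool):
    from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp

    return ModelRunner if use_python_runtime else ModelRunnerCpp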
@@ -263,24 +295,30 @@ def load( config = json.load(f) world_size = config["pretrained_config"]["mapping"]["world_size"] if world_size == 1: - _load(tokenizer, engine_dir, lora_ckpt_list, num_beams) + _load(tokenizer, engine_dir, lora_ckpt_list, num_beams, use_python_runtime) executor = None elif tensorrt_llm.mpi_world_size() > 1: - _load(tokenizer, engine_dir, lora_ckpt_list, num_beams) + _load(tokenizer, engine_dir, lora_ckpt_list, num_beams, use_python_runtime) executor = None tensorrt_llm.mpi_barrier() else: executor = MPIPoolExecutor(max_workers=world_size) futures = [] for _ in range(world_size): - future = executor.submit(_load, tokenizer, engine_dir, lora_ckpt_list, num_beams) + future = executor.submit(_load, tokenizer, engine_dir, lora_ckpt_list, num_beams, use_python_runtime) futures.append(future) for future in futures: future.result() max_batch_size = config["build_config"]["max_batch_size"] max_input_len = config["build_config"]["max_input_len"] - add_bos = True if config["pretrained_config"]["architecture"] == "GemmaForCausalLM" else False + architectures_that_need_bos_token = [ + "GemmaForCausalLM", + "LLaMAForCausalLM", + "MistralForCausalLM", + "MixtralForCausalLM", + ] + add_bos = config["pretrained_config"]["architecture"] in architectures_that_need_bos_token return TensorrtLLMHostContext( executor=executor, diff --git a/nemo/utils/callbacks/nemo_model_checkpoint.py b/nemo/utils/callbacks/nemo_model_checkpoint.py index f8bdb9d9b294..3bbd7cbfce14 100644 --- a/nemo/utils/callbacks/nemo_model_checkpoint.py +++ b/nemo/utils/callbacks/nemo_model_checkpoint.py @@ -21,6 +21,9 @@ import pytorch_lightning import torch +from _weakref import proxy + +from lightning_fabric.utilities.cloud_io import get_filesystem from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint, _is_local_file_protocol from pytorch_lightning.utilities import rank_zero_info @@ -32,8 +35,8 @@ class NeMoModelCheckpoint(ModelCheckpoint): - """ Light wrapper around Lightning's ModelCheckpoint to force a saved checkpoint on train_end. - Extends Lightning's on_save_checkpoint func to save the .nemo file. Saves the .nemo file based + """Light wrapper around Lightning's ModelCheckpoint to force a saved checkpoint on train_end. + Extends Lightning's on_save_checkpoint func to save the .nemo file. Saves the .nemo file based on the best checkpoint saved (according to the monitor value). Also contains func to save the EMA copy of the model. 
""" @@ -188,7 +191,6 @@ def on_save_checkpoint(self, trainer, pl_module, checkpoint): if app_state.model_parallel_size is not None and app_state.model_parallel_size > 1: logging.warning(f'always_save_nemo will slow down training for model_parallel > 1.') # since we are creating tarfile artifacts we need to update .nemo path - self._backup_existing_nemo_ckpt(trainer) app_state.model_restore_path = self._format_nemo_checkpoint_name() if app_state.model_parallel_size is not None and app_state.model_parallel_size > 1: maybe_injected_best_model_path = inject_model_parallel_rank(self.best_model_path) @@ -212,14 +214,19 @@ def on_save_checkpoint(self, trainer, pl_module, checkpoint): pl_module.load_state_dict(checkpoint, strict=True) if torch.distributed.is_initialized(): torch.distributed.barrier() + backup_path = self._backup_existing_nemo_ckpt(trainer) pl_module.save_to(save_path=app_state.model_restore_path) logging.info(f"New best .nemo model saved to: {app_state.model_restore_path}") pl_module.load_state_dict(old_state_dict, strict=True) else: if torch.distributed.is_initialized(): torch.distributed.barrier() + backup_path = self._backup_existing_nemo_ckpt(trainer) pl_module.save_to(save_path=app_state.model_restore_path) logging.info(f"New .nemo model saved to: {app_state.model_restore_path}") + if backup_path is not None and is_global_rank_zero(): + logging.info(f'Removing old .nemo backup {backup_path}') + get_filesystem(backup_path).rm(backup_path) return output def on_train_end(self, trainer, pl_module): @@ -258,16 +265,25 @@ def on_train_end(self, trainer, pl_module): trainer._checkpoint_connector.restore(self.best_model_path) if self.save_nemo_on_train_end: - self._backup_existing_nemo_ckpt(trainer) + backup_path = self._backup_existing_nemo_ckpt(trainer) pl_module.save_to(save_path=self._format_nemo_checkpoint_name()) + if backup_path is not None and is_global_rank_zero(): + logging.info(f'Removing old .nemo backup {backup_path}') + get_filesystem(backup_path).rm(backup_path) - def _backup_existing_nemo_ckpt(self, trainer) -> str: - """ Search for an available name with version infix and rename existing checkpoint. + def _backup_existing_nemo_ckpt(self, trainer) -> Optional[str]: + """Search for an available name with version infix and rename existing checkpoint. NOTE: this behavior is slightly different from regular checkpoints. PTL creates new regular checkpoint with the first available name. Here, for backward compatibility, we create .nemo checkpoint as before and create a backup under the first available name. + + Args: + trainer (Trainer): trainer instance. 
+ + Returns: + Path to the backup checkpoint or None, if no backup was created """ base_path = self._format_nemo_checkpoint_name() available_path = base_path @@ -276,11 +292,13 @@ def _backup_existing_nemo_ckpt(self, trainer) -> str: while self.file_exists(available_path, trainer, check_dist_ckpt=False): available_path = self._format_nemo_checkpoint_name(version_cnt) version_cnt += 1 - if available_path != base_path: - if trainer.is_global_zero: - logging.info(f'{base_path} already exists, moving existing checkpoint to {available_path}') - shutil.move(base_path, available_path) - trainer.strategy.barrier() + if available_path == base_path: + # no existing ckpt, no need to backup + return None + if trainer.is_global_zero: + logging.info(f'{base_path} already exists, moving existing checkpoint to {available_path}') + shutil.move(base_path, available_path) + trainer.strategy.barrier() return available_path def _format_nemo_checkpoint_name(self, ver: Optional[int] = None) -> str: @@ -330,15 +348,15 @@ def _ema_callback(self, trainer: 'pytorch_lightning.Trainer') -> Optional[EMA]: @staticmethod def format_checkpoint_unfinished_marker_path(checkpoint_path: Union[Path, str]) -> Path: - """ Format the path to the unfinished checkpoint marker file. - + """Format the path to the unfinished checkpoint marker file. + If the marker file exists, corresponding checkpoint is considered unfinished/incomplete. NOTE: Marker path for the EMA checkpoint part is the same as for the original checkpoint. - + Args: checkpoint_path: Path to the checkpoint file or dir. Does not need to exist. - + Returns: Path to the unfinished checkpoint marker file. """ @@ -350,7 +368,7 @@ def format_checkpoint_unfinished_marker_path(checkpoint_path: Union[Path, str]) @staticmethod def is_checkpoint_unfinished(checkpoint_path: Union[Path, str]) -> bool: - """ Check if the checkpoint is unfinished. + """Check if the checkpoint is unfinished. Args: checkpoint_path: Path to the checkpoint file or dir. @@ -363,7 +381,7 @@ def is_checkpoint_unfinished(checkpoint_path: Union[Path, str]) -> bool: @staticmethod def set_checkpoint_unfinished_marker(checkpoint_path: Union[Path, str], barrier_after=False) -> None: - """ Marks given checkpoint as unfinished. + """Marks given checkpoint as unfinished. Args: checkpoint_filepath: Path to the checkpoint file or dir. @@ -499,7 +517,7 @@ def _should_remove_checkpoint(self, trainer: "pl.Trainer", previous: str, curren A checkpoint won't be deleted if any of the cases apply: - The previous checkpoint is the same as the current checkpoint (means the old was already overwritten by new) - The previous checkpoint is not in the current checkpoint directory and the filesystem is local - - The previous checkpoint is the checkpoint the Trainer resumed from and the filesystem is local + - The previous checkpoint is the checkpoint the Trainer resumed from and the filesystem is local and the resumed from checkpoint is not the last checkpoint """ if previous == current: diff --git a/nemo/utils/decorators/__init__.py b/nemo/utils/decorators/__init__.py index 4468a3bc09b5..2cfec9e40d64 100644 --- a/nemo/utils/decorators/__init__.py +++ b/nemo/utils/decorators/__init__.py @@ -13,6 +13,6 @@ # limitations under the License. 
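The `NeMoModelCheckpoint` hunks above make `_backup_existing_nemo_ckpt` return the backup path (or `None`) and move the backup step to just before `save_to`, deleting the backup only once the new `.nemo` file has been written. The net effect is a safe rotation: an interrupted save can no longer leave the run with neither the old nor the new checkpoint. A self-contained sketch of that flow, assuming a single writer; the `-v<N>` naming and the `save_fn` callable are illustrative, not the actual NeMo API:

```python
import shutil
from pathlib import Path
from typing import Callable, Optional


def backup_existing(base: Path) -> Optional[Path]:
    # Find the first free versioned name (model.nemo -> model-v0.nemo, ...),
    # move the current checkpoint there, and report what was moved.
    if not base.exists():
        return None  # nothing to back up
    version = 0
    candidate = base.with_name(f"{base.stem}-v{version}{base.suffix}")
    while candidate.exists():
        version += 1
        candidate = base.with_name(f"{base.stem}-v{version}{base.suffix}")
    shutil.move(str(base), str(candidate))
    return candidate


def save_with_rotation(base: Path, save_fn: Callable[[Path], None]) -> None:
    backup = backup_existing(base)  # old checkpoint is kept until the save succeeds
    save_fn(base)                   # if this raises, the backup still exists
    if backup is not None:
        backup.unlink()             # drop the backup only after a successful save
```

In the callback itself, the `is_global_rank_zero()` guard plays the role of the single-writer assumption made here.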
-from nemo.utils.decorators.deprecated import deprecated
+from nemo.utils.decorators.deprecated import deprecated, deprecated_warning
 from nemo.utils.decorators.experimental import experimental
 from nemo.utils.decorators.port_docs import add_port_docs
diff --git a/nemo/utils/decorators/deprecated.py b/nemo/utils/decorators/deprecated.py
index 65f92e62563e..40957bb343d4 100644
--- a/nemo/utils/decorators/deprecated.py
+++ b/nemo/utils/decorators/deprecated.py
@@ -30,14 +30,14 @@

 def deprecated(wrapped=None, version=None, explanation=None, wait_seconds=0):
     """
-        Decorator which can be used for indicating that a function/class is deprecated and going to be removed.
-        Tracks down which function/class printed the warning and will print it only once per call.
-
-        Args:
-            version: Version in which the function/class will be removed (optional).
-            explanation: Additional explanation, e.g. "Please, ``use another_function`` instead." (optional).
-            wait_seconds: Sleep for a few seconds after the deprecation message appears in case it gets drowned
-                with subsequent logging messages.
+    Decorator which can be used for indicating that a function/class is deprecated and going to be removed.
+    Tracks down which function/class printed the warning and will print it only once per call.
+
+    Args:
+        version: Version in which the function/class will be removed (optional).
+        explanation: Additional explanation, e.g. "Please, ``use another_function`` instead." (optional).
+        wait_seconds: Sleep for a few seconds after the deprecation message appears in case it gets drowned
+            with subsequent logging messages.
     """

     if wrapped is None:
@@ -71,3 +71,26 @@ def wrapper(wrapped, instance, args, kwargs):
         return wrapped(*args, **kwargs)

     return wrapper(wrapped)
+
+
+def deprecated_warning(old_method=None, new_method=None, wait_seconds=2):
+    """
+    Function which can be used for indicating that a function/class is deprecated and going to be removed.
+
+    Args:
+        old_method: Name of deprecated class/function.
+        new_method: Name of new class/function to use.
+        wait_seconds: Sleep for a few seconds after the deprecation message appears in case it gets drowned
+            with subsequent logging messages.
+    """
+
+    # Create a banner
+    if new_method is not None:
+        msg = f"***** {old_method} is deprecated. Please, use {new_method} instead. *****"
+    else:
+        msg = f"***** {old_method} is deprecated and will be removed soon. *****"
+    banner = '\n'.join(['*' * len(msg)] * 2 + [msg] + ['*' * len(msg)] * 2)
+
+    logging.warning(f"\n\n{banner}\n")
+    logging.warning(f"Waiting for {wait_seconds} seconds before this message disappears.")
+    time.sleep(wait_seconds)
diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt
index 9fd75ad8a95a..494a9ab6d672 100644
--- a/requirements/requirements_nlp.txt
+++ b/requirements/requirements_nlp.txt
@@ -1,6 +1,6 @@
 accelerated-scan
 boto3
-causal-conv1d>=1.2.0
+causal-conv1d==1.2.0.post2
 einops
 faiss-cpu
 fasttext
diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py
index 7370731ec996..5a2440b0fa2f 100755
--- a/scripts/deploy/nlp/deploy_triton.py
+++ b/scripts/deploy/nlp/deploy_triton.py
@@ -80,7 +80,7 @@ def get_args(argv):
         "-mpet", "--max_prompt_embedding_table_size", default=None, type=int, help="Max prompt embedding table size"
     )
     parser.add_argument(
-        "-upkc", "--use_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache."
+        "-npkc", "--no_paged_kv_cache", default=False, action='store_true', help="Disable paged kv cache."
     )
     parser.add_argument(
         "-drip",
@@ -133,6 +133,13 @@ def get_args(argv):
     parser.add_argument(
         "-lc", "--lora_ckpt", default=None, type=str, nargs="+", help="The checkpoint list of LoRA weights"
     )
+    parser.add_argument(
+        "-ucr",
+        '--use_cpp_runtime',
+        default=False,
+        action='store_true',
+        help='Use TensorRT LLM C++ runtime',
+    )
     parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode")
     args = parser.parse_args(argv)

@@ -206,32 +213,14 @@ def nemo_deploy(argv):
         )
         return

-    trt_llm_exporter = TensorRTLLM(model_dir=trt_llm_path, lora_ckpt_list=args.lora_ckpt)
+    trt_llm_exporter = TensorRTLLM(
+        model_dir=trt_llm_path,
+        lora_ckpt_list=args.lora_ckpt,
+        load_model=(args.nemo_checkpoint is None),
+        use_python_runtime=(not args.use_cpp_runtime),
+    )

     if args.nemo_checkpoint is not None:
-
-        trt_llm_exporter.export(
-            nemo_checkpoint_path=args.nemo_checkpoint,
-            model_type=args.model_type,
-            n_gpus=args.num_gpus,
-            tensor_parallel_size=args.num_gpus,
-            pipeline_parallel_size=1,
-            max_input_token=args.max_input_len,
-            max_output_token=args.max_output_len,
-            max_batch_size=args.max_batch_size,
-            max_num_tokens=args.max_num_tokens,
-            opt_num_tokens=args.opt_num_tokens,
-            max_prompt_embedding_table_size=args.max_prompt_embedding_table_size,
-            paged_kv_cache=args.use_paged_kv_cache,
-            remove_input_padding=(not args.disable_remove_input_padding),
-            dtype=args.dtype,
-            enable_multi_block_mode=args.multi_block_mode,
-            use_lora_plugin=args.use_lora_plugin,
-            lora_target_modules=args.lora_target_modules,
-            max_lora_rank=args.max_lora_rank,
-            save_nemo_model_config=True,
-        )
-
         try:
             LOGGER.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.")
             trt_llm_exporter.export(
@@ -246,7 +235,7 @@ def nemo_deploy(argv):
                 max_num_tokens=args.max_num_tokens,
                 opt_num_tokens=args.opt_num_tokens,
                 max_prompt_embedding_table_size=args.max_prompt_embedding_table_size,
-                paged_kv_cache=args.use_paged_kv_cache,
+                paged_kv_cache=(not args.no_paged_kv_cache),
                 remove_input_padding=(not args.disable_remove_input_padding),
                 dtype=args.dtype,
                 enable_multi_block_mode=args.multi_block_mode,
diff --git a/scripts/export/export_to_trt_llm.py b/scripts/export/export_to_trt_llm.py
index e9741516cf00..a9c16bf8cff6 100644
--- a/scripts/export/export_to_trt_llm.py
+++ b/scripts/export/export_to_trt_llm.py
@@ -45,8 +45,8 @@ def get_args(argv):
     parser.add_argument(
         "-dt",
         "--dtype",
-        choices=["bf16", "fp16", "fp8", "int8"],
-        default="bf16",
+        choices=["bfloat16", "float16", "fp8", "int8"],
+        default="bfloat16",
         type=str,
         help="dtype of the model on TensorRT-LLM",
     )
@@ -59,7 +59,7 @@ def get_args(argv):
         "-mpet", "--max_prompt_embedding_table_size", default=None, type=int, help="Max prompt embedding table size"
     )
     parser.add_argument(
-        "-upkc", "--use_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache."
+        "-npkc", "--no_paged_kv_cache", default=False, action='store_true', help="Disable paged kv cache."
     )
     parser.add_argument(
         "-drip",
@@ -123,7 +123,7 @@ def nemo_export_trt_llm(argv):
     LOGGER.info("Logging level set to {}".format(loglevel))
     LOGGER.info(args)

-    if args.dtype != "bf16":
+    if args.dtype != "bfloat16":
         LOGGER.error(
             "Only bf16 is currently supported for the optimized deployment with TensorRT-LLM. "
             "Support for the other precisions will be added in the coming releases."
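Two notes on the hunks above. With `-upkc`/`--use_paged_kv_cache` replaced by `-npkc`/`--no_paged_kv_cache`, paged KV cache is now on by default and the flag opts out, hence `paged_kv_cache=(not args.no_paged_kv_cache)` at both `export` call sites. And unlike the `@deprecated` decorator, the new `deprecated_warning` helper is a plain function meant to be called from inside a legacy code path; a minimal sketch, where `old_collate`/`new_collate` are made-up names:

```python
from nemo.utils.decorators import deprecated_warning


def new_collate(batch):
    return batch


def old_collate(batch):
    # Logs the starred banner and sleeps wait_seconds (2 by default) so the
    # warning is not immediately scrolled away by subsequent log lines.
    deprecated_warning("old_collate", "new_collate")
    return new_collate(batch)
```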
@@ -131,7 +131,7 @@ def nemo_export_trt_llm(argv): return try: - trt_llm_exporter = TensorRTLLM(model_dir=args.model_repository) + trt_llm_exporter = TensorRTLLM(model_dir=args.model_repository, load_model=False) LOGGER.info("Export to TensorRT-LLM function is called.") trt_llm_exporter.export( @@ -146,7 +146,7 @@ def nemo_export_trt_llm(argv): max_num_tokens=args.max_num_tokens, opt_num_tokens=args.opt_num_tokens, max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, - paged_kv_cache=args.use_paged_kv_cache, + paged_kv_cache=(not args.no_paged_kv_cache), remove_input_padding=(not args.disable_remove_input_padding), dtype=args.dtype, enable_multi_block_mode=args.multi_block_mode, diff --git a/tests/collections/asr/confidence/test_asr_confidence.py b/tests/collections/asr/confidence/test_asr_confidence.py index edf35bb17b0b..015264a9debe 100644 --- a/tests/collections/asr/confidence/test_asr_confidence.py +++ b/tests/collections/asr/confidence/test_asr_confidence.py @@ -72,6 +72,7 @@ def audio_and_texts(test_data_dir): class TestASRConfidenceBenchmark: + @pytest.mark.pleasefixme @pytest.mark.integration @pytest.mark.with_downloads @pytest.mark.parametrize('model_name', ("ctc", "rnnt")) @@ -103,6 +104,7 @@ def test_run_confidence_benchmark( atol=TOL, ) + @pytest.mark.pleasefixme @pytest.mark.integration @pytest.mark.with_downloads @pytest.mark.parametrize('model_name', ("ctc", "rnnt")) diff --git a/tests/collections/asr/decoding/test_ctc_decoding.py b/tests/collections/asr/decoding/test_ctc_decoding.py index dd8871c329fc..02332f170759 100644 --- a/tests/collections/asr/decoding/test_ctc_decoding.py +++ b/tests/collections/asr/decoding/test_ctc_decoding.py @@ -200,8 +200,41 @@ def test_subword_decoding_greedy_forward_hypotheses(self, tmp_tokenizer, alignme @pytest.mark.parametrize('timestamps', [False, True]) @pytest.mark.parametrize('preserve_frame_confidence', [False, True]) @pytest.mark.parametrize('length_is_none', [False, True]) + @pytest.mark.parametrize( + "logprobs_device", + [ + torch.device("cpu"), + pytest.param( + torch.device("cuda"), + marks=pytest.mark.skipif( + not torch.cuda.is_available(), + reason='CUDA required for test.', + ), + ), + ], + ) + @pytest.mark.parametrize( + "length_device", + [ + torch.device("cpu"), + pytest.param( + torch.device("cuda"), + marks=pytest.mark.skipif( + not torch.cuda.is_available(), + reason='CUDA required for test.', + ), + ), + ], + ) def test_batched_decoding_logprobs( - self, tmp_tokenizer, alignments, timestamps, preserve_frame_confidence, length_is_none + self, + tmp_tokenizer, + alignments, + timestamps, + preserve_frame_confidence, + length_is_none, + logprobs_device, + length_device, ): cfg = CTCBPEDecodingConfig( strategy='greedy', @@ -217,7 +250,7 @@ def test_batched_decoding_logprobs( torch.manual_seed(1) B, T = 4, 20 V = unbatched_decoding.tokenizer.tokenizer.vocab_size + 1 - input_signal = torch.randn(size=(B, T, V)) + input_signal = torch.randn(size=(B, T, V), device=logprobs_device) # Set the blank index to a very high probability to make sure # that we always handle at least a few blanks. 
input_signal[:, 0, unbatched_decoding.tokenizer.tokenizer.vocab_size] = 1000 @@ -225,7 +258,7 @@ def test_batched_decoding_logprobs( if length_is_none: length = None else: - length = torch.randint(low=1, high=T, size=[B]) + length = torch.randint(low=1, high=T, size=[B], device=length_device) with torch.inference_mode(): hyps, _ = unbatched_decoding.ctc_decoder_predictions_tensor( @@ -249,7 +282,33 @@ def test_batched_decoding_logprobs( @pytest.mark.unit @pytest.mark.parametrize('timestamps', [False, True]) @pytest.mark.parametrize('length_is_none', [False, True]) - def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none): + @pytest.mark.parametrize( + "labels_device", + [ + torch.device("cpu"), + pytest.param( + torch.device("cuda"), + marks=pytest.mark.skipif( + not torch.cuda.is_available(), + reason='CUDA required for test.', + ), + ), + ], + ) + @pytest.mark.parametrize( + "length_device", + [ + torch.device("cpu"), + pytest.param( + torch.device("cuda"), + marks=pytest.mark.skipif( + not torch.cuda.is_available(), + reason='CUDA required for test.', + ), + ), + ], + ) + def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none, labels_device, length_device): cfg = CTCBPEDecodingConfig(strategy='greedy', compute_timestamps=timestamps) unbatched_decoding = CTCBPEDecoding(decoding_cfg=cfg, tokenizer=tmp_tokenizer) cfg.strategy = 'greedy_batch' @@ -258,7 +317,7 @@ def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none torch.manual_seed(1) B, T = 4, 20 V = unbatched_decoding.tokenizer.tokenizer.vocab_size + 1 - input_labels = torch.randint(V, size=(B, T)) + input_labels = torch.randint(V, size=(B, T), device=labels_device) # Set some indices to blank to make sure that we always handle # at least a few blanks. input_labels[:, 0] = unbatched_decoding.tokenizer.tokenizer.vocab_size @@ -266,7 +325,7 @@ def test_batched_decoding_labels(self, tmp_tokenizer, timestamps, length_is_none if length_is_none: length = None else: - length = torch.randint(low=1, high=T, size=[B]) + length = torch.randint(low=1, high=T, size=[B], device=length_device) with torch.inference_mode(): hyps, _ = unbatched_decoding.ctc_decoder_predictions_tensor( diff --git a/tests/collections/nlp/test_dialogue.py b/tests/collections/nlp/test_dialogue.py deleted file mode 100644 index 9c227f737d98..000000000000 --- a/tests/collections/nlp/test_dialogue.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest -import torch - -from nemo.collections.nlp.data.dialogue.data_processor.assistant_data_processor import DialogueAssistantDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.sgd_data_processor import DialogueSGDDataProcessor -from nemo.collections.nlp.data.dialogue.dataset.dialogue_gpt_classification_dataset import ( - DialogueGPTClassificationDataset, -) -from nemo.collections.nlp.data.dialogue.dataset.dialogue_s2s_generation_dataset import DialogueS2SGenerationDataset -from nemo.collections.nlp.data.dialogue.dataset.dialogue_sgd_bert_dataset import DialogueSGDBERTDataset -from nemo.collections.nlp.metrics.dialogue_metrics import DialogueClassificationMetrics, DialogueGenerationMetrics -from nemo.collections.nlp.models.dialogue.dialogue_nearest_neighbour_model import DialogueNearestNeighbourModel - - -@pytest.mark.unit -def test_dialogue_metric_generation_f1(): - - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - precision, recall, f1 = DialogueGenerationMetrics._get_one_f1(generated_field, ground_truth_field) - assert precision == 75 - assert recall == 75 - assert f1 == 75 - - -@pytest.mark.unit -def test_dialogue_metric_split_label_and_slots(): - fields = ["reserve_restaurant\nslots: time_of_day(7pm), number_of_people(3)", "time_of_day(7pm)"] - labels, slots_list = DialogueClassificationMetrics.split_label_and_slots(fields, with_slots=True) - assert labels == ["reserve_restaurant", 'none'] - assert slots_list == [["time_of_day(7pm)", "number_of_people(3)"], ["time_of_day(7pm)"]] - - -@pytest.mark.unit -def test_dialogue_metric_slot_filling_metrics(): - generated_slots = [["time_of_day(7pm)", "number_of_people(3)"], ["time_of_day(7pm)"]] - ground_truth_slots = [["time_of_day(7pm)"], ["time_of_day(7pm)", "number_of_people(3)"]] - - ( - avg_precision, - avg_recall, - avg_f1, - avg_joint_goal_accuracy, - ) = DialogueClassificationMetrics.get_slot_filling_metrics(generated_slots, ground_truth_slots) - - assert avg_precision == 75 - assert avg_recall == 75 - assert avg_f1 == 75 - assert avg_joint_goal_accuracy == 0 - - -@pytest.mark.unit -def test_dialogue_assistant_data_processor_normalize_zero_shot_intent(): - label0 = 'food_ordering.contextual_query' - normalized_label0 = 'contextual query' - - label1 = 'food_ordering.nomatch' - normalized_label1 = 'no match' - - label2 = 'food_ordering.no' - normalized_label2 = 'no' - - assert normalized_label0 == DialogueAssistantDataProcessor.normalize_zero_shot_intent(label0) - assert normalized_label1 == DialogueAssistantDataProcessor.normalize_zero_shot_intent(label1) - assert normalized_label2 == DialogueAssistantDataProcessor.normalize_zero_shot_intent(label2) - - -@pytest.mark.unit -def test_dialogue_assistant_data_processor_get_continuous_slots(): - slot_ids = [54, 54, 54, 19, 19, 18, 54, 54, 54] - empty_slot_id = 54 - bio_slot_ids_to_unified_slot_ids = {18: 18, 19: 19, 54: 54} - continuous_slots = DialogueAssistantDataProcessor.get_continuous_slots( - slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids - ) - assert continuous_slots == {19: [3, 5], 18: [5, 6]} - - # here 18 and 19 maps to the same slot (originally variants of B-slot and I-slot) - slot_ids = [54, 54, 54, 19, 19, 18, 54, 54, 54] - empty_slot_id = 54 - bio_slot_ids_to_unified_slot_ids = {18: 18, 19: 18, 54: 54} - continuous_slots = DialogueAssistantDataProcessor.get_continuous_slots( - slot_ids, empty_slot_id, 
bio_slot_ids_to_unified_slot_ids - ) - assert continuous_slots == {18: [3, 6]} - - # test if function works when non-empty slots are at boundary - slot_ids = [18, 54, 54, 19, 19] - empty_slot_id = 54 - bio_slot_ids_to_unified_slot_ids = {18: 18, 19: 19, 54: 54} - continuous_slots = DialogueAssistantDataProcessor.get_continuous_slots( - slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids - ) - assert continuous_slots == {18: [0, 1], 19: [3, 5]} - - -@pytest.mark.unit -def test_dialogue_assistant_map_bio_format_slots_to_unified_slots(): - - slots = ['B-time', 'I-time', 'B-alarm', 'I-alarm', 'O'] - gt_bio_slot_ids_to_unified_slot_ids = {'0': '0', '1': '0', '2': '1', '3': '1', '4': '2'} - gt_unified_slots = ['time', 'alarm', 'O'] - ( - bio_slot_ids_to_unified_slot_ids, - unified_slots, - ) = DialogueAssistantDataProcessor.map_bio_format_slots_to_unified_slots(slots) - assert gt_bio_slot_ids_to_unified_slot_ids == bio_slot_ids_to_unified_slot_ids - assert gt_unified_slots == unified_slots - - # case in which BIOS scheme was not used in annotation - slots = ['time', 'alarm', 'O'] - gt_bio_slot_ids_to_unified_slot_ids = {'0': '0', '1': '1', '2': '2'} - gt_unified_slots = ['time', 'alarm', 'O'] - ( - bio_slot_ids_to_unified_slot_ids, - unified_slots, - ) = DialogueAssistantDataProcessor.map_bio_format_slots_to_unified_slots(slots) - - assert gt_bio_slot_ids_to_unified_slot_ids == bio_slot_ids_to_unified_slot_ids - assert gt_unified_slots == unified_slots - - -@pytest.mark.unit -def test_dialogue_data_processor_get_relevant_idxs(): - - dataset_split = 'train' - dev_proportion = 10 - n_samples = 1000 - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, dev_proportion) - - assert len(idxs) == 900 - assert idxs != list(range(900)) - - dataset_split = 'dev' - dev_proportion = 40 - n_samples = 1000 - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, dev_proportion) - - assert len(idxs) == 400 - assert idxs != list(range(400)) - - dataset_split = 'test' - dev_proportion = 40 - n_samples = 1000 - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, dev_proportion) - - assert len(idxs) == 1000 - assert idxs == list(range(1000)) - - -@pytest.mark.unit -def test_dialogue_sgd_data_processor_convert_camelcase_to_lower(): - label = 'none' - gt_converted_label = 'none' - - assert gt_converted_label == DialogueSGDDataProcessor.convert_camelcase_to_lower(label) - - label = 'ReserveRestaurant' - gt_converted_label = 'reserve restaurant' - - assert gt_converted_label == DialogueSGDDataProcessor.convert_camelcase_to_lower(label) - - label = 'Alarm' - gt_converted_label = 'alarm' - - assert gt_converted_label == DialogueSGDDataProcessor.convert_camelcase_to_lower(label) - - -@pytest.mark.unit -def test_dialogue_gpt_classification_dataset_linearize_slots(): - - slots = [] - linearized_slots = 'None' - assert linearized_slots == DialogueGPTClassificationDataset.linearize_slots(slots) - - slots = {'time': '7pm', 'place': 'field'} - linearized_slots = 'time(7pm), place(field)' - assert linearized_slots == DialogueGPTClassificationDataset.linearize_slots(slots) - - slots = {'time': ['7pm', '1900'], 'place': 'field'} - linearized_slots = 'time(7pm), place(field)' - assert linearized_slots == DialogueGPTClassificationDataset.linearize_slots(slots) - - -@pytest.mark.unit -def test_dialogue_gpt_classification_dataset_linearize_slots(): - - actions = [ - {'act': 'inform', 'slot': 'time', 'values': ['7pm', '1900']}, - {'act': 'confirm', 'slot': 'place', 
'values': ['hall']}, - ] - - prompt_template = 'values' - formatted_actions = '7pm hall' - assert formatted_actions == DialogueS2SGenerationDataset.format_actions(prompt_template, actions) - - prompt_template = 'slots_values' - formatted_actions = 'time (7pm) place (hall)' - assert formatted_actions == DialogueS2SGenerationDataset.format_actions(prompt_template, actions) - - prompt_template = 'acts_slots_values' - formatted_actions = 'inform time (7pm) confirm place (hall)' - assert formatted_actions == DialogueS2SGenerationDataset.format_actions(prompt_template, actions) - - -@pytest.mark.unit -def test_dialogue_sgd_dataset_naive_tokenize(): - - utterance = 'I am feeling hungry so I would like to find a place to eat.' - tokens = [ - 'I', - ' ', - 'am', - ' ', - 'feeling', - ' ', - 'hungry', - ' ', - 'so', - ' ', - 'I', - ' ', - 'would', - ' ', - 'like', - ' ', - 'to', - ' ', - 'find', - ' ', - 'a', - ' ', - 'place', - ' ', - 'to', - ' ', - 'eat', - '.', - ] - assert tokens == DialogueSGDBERTDataset._naive_tokenize(utterance) - - -@pytest.mark.unit -def test_dialogue_nearest_neighbour_mean_pooling(): - - model_output = [torch.ones(8, 512, 768)] - attention_mask = torch.ones(8, 512) - assert torch.equal( - torch.ones(8, 768).float(), DialogueNearestNeighbourModel.mean_pooling(model_output, attention_mask) - ) - - model_output = [torch.zeros(8, 512, 768)] - attention_mask = torch.ones(8, 512) - assert torch.equal( - torch.zeros(8, 768).float(), DialogueNearestNeighbourModel.mean_pooling(model_output, attention_mask) - ) - - model_output = [torch.cat([torch.zeros(8, 256, 768), torch.ones(8, 256, 768)], axis=1)] - attention_mask = torch.ones(8, 512) - assert torch.equal( - torch.ones(8, 768).float() * 0.5, DialogueNearestNeighbourModel.mean_pooling(model_output, attention_mask) - ) diff --git a/tests/collections/nlp/test_entity_linking_model.py b/tests/collections/nlp/test_entity_linking_model.py deleted file mode 100644 index 16b768184296..000000000000 --- a/tests/collections/nlp/test_entity_linking_model.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import shutil -import tempfile - -import pytest -import wget -from omegaconf import OmegaConf - -from nemo.collections.nlp.models import EntityLinkingModel - - -def get_cfg(): - - language_model = OmegaConf.create( - {"pretrained_model_name": "bert-base-uncased", "config_file": None, "config": None, "lm_checkpoint": None} - ) - - tokenizer = OmegaConf.create( - {"tokenizer_name": "bert-base-uncased", "vocab_file": None, "tokenizer_model": None, "do_lower_case": True} - ) - - model = OmegaConf.create( - { - "nemo_path": "sap_entity_linking.nemo", - "max_seq_length": 128, - "language_model": language_model, - "tokenizer": tokenizer, - "train_ds": None, - "validation_ds": None, - } - ) - - cfg = OmegaConf.create({"model": model}) - - return cfg - - -class TestEntityLinkingModel: - @pytest.mark.with_downloads() - @pytest.mark.unit - def test_creation_saving_restoring(self): - # Create a new temporary directory - with tempfile.TemporaryDirectory() as restore_dir: - with tempfile.TemporaryDirectory() as save_dir: - model = EntityLinkingModel(cfg=get_cfg().model) - assert isinstance(model, EntityLinkingModel) - - save_dir_path = save_dir - - # Where model will be saved - model_save_path = os.path.join(save_dir, f"{model.__class__.__name__}.nemo") - model.save_to(save_path=model_save_path) - - # Where model will be restored from - model_restore_path = os.path.join(restore_dir, f"{model.__class__.__name__}.nemo") - shutil.copy(model_save_path, model_restore_path) - - # at this point save_dir should not exist - assert save_dir_path is not None and not os.path.exists(save_dir_path) - assert not os.path.exists(model_save_path) - assert os.path.exists(model_restore_path) - - # attempt to restore - model_copy = model.__class__.restore_from(restore_path=model_restore_path) - assert model.num_weights == model_copy.num_weights - - -if __name__ == "__main__": - t = TestEntityLinkingModel() - t.test_creation_saving_restoring() diff --git a/tests/collections/nlp/test_megatron.py b/tests/collections/nlp/test_megatron.py deleted file mode 100644 index 8206457ec6ee..000000000000 --- a/tests/collections/nlp/test_megatron.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -try: - import apex - - apex_available = True -except Exception: - apex_available = False - -import os -import tempfile - -import onnx -import pytest -import torch -from omegaconf import OmegaConf - -import nemo.collections.nlp as nemo_nlp -from nemo.core.classes import typecheck - - -def get_pretrained_bert_345m_uncased_model(): - model_name = "megatron-bert-345m-uncased" - config = {"language_model": {"pretrained_model_name": model_name}, "tokenizer": {}} - omega_conf = OmegaConf.create(config) - model = nemo_nlp.modules.get_lm_model(cfg=omega_conf) - if torch.cuda.is_available(): - model = model.cuda() - return model - - -class TestMegatron: - @pytest.mark.skip("This test was written for megatron-lm") - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - def test_list_pretrained_models(self): - pretrained_lm_models = nemo_nlp.modules.get_pretrained_lm_models_list() - assert len(pretrained_lm_models) > 0 - - @pytest.mark.with_downloads() - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - @pytest.mark.skip("Only one Megatron model is allowed") - def test_get_model(self): - model = get_pretrained_bert_345m_uncased_model() - assert isinstance(model, nemo_nlp.modules.MegatronBertEncoder) - - typecheck.set_typecheck_enabled(enabled=False) - inp = model.input_example() - out = model.forward(*inp) - typecheck.set_typecheck_enabled(enabled=True) - - @pytest.mark.skipif(not os.path.exists('/home/TestData'), reason='Not a Jenkins machine') - @pytest.mark.with_downloads() - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - @pytest.mark.skip("Megatron-LM BERT support deprecated. Supported in NeMo < 1.5") - def test_onnx_export(self): - model = get_pretrained_bert_345m_uncased_model() - assert model - with tempfile.TemporaryDirectory() as tmpdir: - # Generate filename in the temporary directory. - # Test export. - model.export(os.path.join(".", "megatron.onnx")) - - -if __name__ == "__main__": - t = TestMegatron() - t.test_onnx_export() diff --git a/tests/collections/nlp/test_mem_map_dataset.py b/tests/collections/nlp/test_mem_map_dataset.py deleted file mode 100644 index 20932b6c4e0d..000000000000 --- a/tests/collections/nlp/test_mem_map_dataset.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import csv -import json -import os - -import pytest - -from nemo.collections.nlp.data.language_modeling import text_memmap_dataset - - -@pytest.fixture -def jsonl_file(tmp_path): - # Create a temporary file path - file_path = tmp_path / "data.jsonl" - - # Generate data to write to the JSONL file - data = [ - {"name": "John", "age": 30}, - {"name": "Jane", "age": 25}, - {"name": "Bob", "age": 35}, - ] - - # Write data to the JSONL file - with open(file_path, mode="w") as file: - for item in data: - json.dump(item, file) - file.write("\n") - - # Provide the file path to the test function - yield str(file_path) - - # Optional: Clean up the temporary file after the test - file_path.unlink() - - -@pytest.fixture -def csv_file(tmp_path): - # Create a temporary file path - file_path = tmp_path / "data.csv" - - # Generate data to write to the CSV file - data = [["ID", "Name"], [1, "John"], [2, "Jane"], [3, "Bob"]] - - # Write data to the CSV file - with open(file_path, mode="w", newline="") as file: - writer = csv.writer(file) - writer.writerows(data) - - # Provide the file path to the test function - yield str(file_path) - - # Optional: Clean up the temporary file after the test - file_path.unlink() - - -def test_jsonl_mem_map_dataset(jsonl_file): - """Test for JSONL memory-mapped datasets.""" - - indexed_dataset = text_memmap_dataset.JSONLMemMapDataset(dataset_paths=[jsonl_file], header_lines=0) - assert indexed_dataset[0] == {"name": "John", "age": 30} - assert indexed_dataset[1] == {"name": "Jane", "age": 25} - assert indexed_dataset[2] == {"name": "Bob", "age": 35} - - -def test_csv_mem_map_dataset(csv_file): - """Test for CSV memory-mapped datasets.""" - - indexed_dataset = text_memmap_dataset.CSVMemMapDataset(dataset_paths=[csv_file], data_col=1, header_lines=1) - assert indexed_dataset[0].strip() == "John" - assert indexed_dataset[1].strip() == "Jane" - assert indexed_dataset[2].strip() == "Bob" - - -def test_csv_fields_mem_map_dataset(csv_file): - """Test for CSV memory-mapped datasets.""" - - indexed_dataset = text_memmap_dataset.CSVFieldsMemmapDataset( - dataset_paths=[csv_file], data_fields={"ID": 0, "Name": 1}, header_lines=1 - ) - assert isinstance(indexed_dataset[0], dict) - assert sorted(indexed_dataset[0].keys()) == ["ID", "Name"] - assert indexed_dataset[0]["ID"] == "1" and indexed_dataset[1]["ID"] == "2" and indexed_dataset[2]["ID"] == "3" - assert ( - indexed_dataset[0]["Name"].strip() == "John" - and indexed_dataset[1]["Name"].strip() == "Jane" - and indexed_dataset[2]["Name"].strip() == "Bob" - ) - - -@pytest.mark.parametrize( - "dataset_class", [text_memmap_dataset.JSONLMemMapDataset, text_memmap_dataset.CSVMemMapDataset], -) -@pytest.mark.parametrize("use_alternative_index_mapping_dir", [True, False]) -@pytest.mark.parametrize("relative_index_fn", [True, False]) -def test_mem_map_dataset_index_mapping_dir( - tmp_path, dataset_class, jsonl_file, use_alternative_index_mapping_dir, relative_index_fn, -): - """Test for index_mapping_dir.""" - if relative_index_fn: - jsonl_file = os.path.relpath(jsonl_file) - else: - jsonl_file = os.path.abspath(jsonl_file) - - if use_alternative_index_mapping_dir: - index_mapping_dir = tmp_path / "subdir" - dataset_class(dataset_paths=[jsonl_file], header_lines=0, index_mapping_dir=str(index_mapping_dir)) - # Index files should not be created in default location. - assert not os.path.isfile(f"{jsonl_file}.idx.npy") - assert not os.path.isfile(f"{jsonl_file}.idx.info") - if relative_index_fn: - # Remove leading ".." sequences. 
- while jsonl_file.startswith(("../")): - jsonl_file = jsonl_file.lstrip("../") - idx_fn = f"{str(index_mapping_dir)}/{jsonl_file}.idx" - assert os.path.isfile(f"{idx_fn}.npy") - assert os.path.isfile(f"{idx_fn}.info") - else: - text_memmap_dataset.JSONLMemMapDataset(dataset_paths=[jsonl_file], header_lines=0) - assert os.path.isfile(f"{jsonl_file}.idx.npy") - assert os.path.isfile(f"{jsonl_file}.idx.info") diff --git a/tests/collections/nlp/test_prompt_learning.py b/tests/collections/nlp/test_prompt_learning.py deleted file mode 100644 index 4597fe9ecef0..000000000000 --- a/tests/collections/nlp/test_prompt_learning.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os - -import pytest -import torch - -from nemo.collections.nlp.data.language_modeling.megatron.gpt_prompt_learning_dataset import GPTPromptLearningDataset -from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import get_pseudo_tokens -from nemo.collections.nlp.modules.common import VirtualPromptSource -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer -from nemo.core import Dataset - - -def get_prompt_tuning_dataset( - dataset_path, tokenizer, virtual_prompt_source, task_templates, pseudo_tokens, -): - dataset = GPTPromptLearningDataset( - data=[dataset_path], - tokenizer=tokenizer, - virtual_prompt_source=virtual_prompt_source, - task_templates=task_templates, - pseudo_tokens=pseudo_tokens, - pad_token_id=tokenizer.unk_id, - max_seq_length=512, - min_seq_length=1, - ) - - return dataset - - -def create_temp_dataset(): - example_dataset_a = [ - {'taskname': 'task name A', 'text': 'Test sentence one, Answer: ', 'answer': 'test'} for i in range(24) - ] - example_dataset_b = [ - {'taskname': 'task name B', 'question': 'This is a question', 'answer': 'test'} for i in range(13) - ] - example_dataset = example_dataset_a + example_dataset_b - temp_file_name = 'temp_dataset_file.jsonl' - - with open(temp_file_name, 'w') as temp: - for example in example_dataset: - temp.write(json.dumps(example) + '\n') - - return temp_file_name - - -def get_task_templates(): - task_templates = {} - task_templates['task name A'] = { - "prompt_template": "<|VIRTUAL_PROMPT_0|>{text}{answer}", - "prompt_template_fields": ['text', 'answer'], - "total_virtual_tokens": 5, - "virtual_token_splits": [5], - "truncate_field": None, - "answer_only_loss": True, - "answer_field": "answer", - "task_id_num": 0, - } - task_templates['task name B'] = { - "prompt_template": "<|VIRTUAL_PROMPT_0|>{question}<|VIRTUAL_PROMPT_1|>{answer}{extra}", - "prompt_template_fields": ['question', 'answer', 'extra'], - "total_virtual_tokens": 10, - "virtual_token_splits": [7, 3], - "truncate_field": None, - "answer_only_loss": False, - "answer_field": None, - "task_id_num": 1, - } - return task_templates - - -class TestMegatronGPTPromptLearningDataset: - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - 
def test_init_prompt_learning_dataset(self): - tokenizer = get_nmt_tokenizer(library='megatron', model_name='GPT2BPETokenizer') - task_templates = get_task_templates() - dataset_path = create_temp_dataset() - - # Setup virtual token place holders - total_virtual_tokens = 10 - pseudo_tokens = get_pseudo_tokens(total_virtual_tokens) - tokenizer.add_special_tokens({'additional_special_tokens': pseudo_tokens}) - - dataset = get_prompt_tuning_dataset( - dataset_path, tokenizer, VirtualPromptSource.PROMPT_ENCODER, task_templates, pseudo_tokens, - ) - - print(type(dataset)) - - assert isinstance(dataset, Dataset) - - os.remove(dataset_path) - - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - def test_prompt_learning_dataset_collate_fn_prompt_encoder(self): - tokenizer = get_nmt_tokenizer(library='megatron', model_name='GPT2BPETokenizer') - task_templates = get_task_templates() - dataset_path = create_temp_dataset() - - # Setup virtual token place holders - total_virtual_tokens = 10 - pseudo_tokens = get_pseudo_tokens(total_virtual_tokens) - tokenizer.add_special_tokens({'additional_special_tokens': pseudo_tokens}) - - dataset = get_prompt_tuning_dataset( - dataset_path, tokenizer, VirtualPromptSource.PROMPT_ENCODER, task_templates, pseudo_tokens, - ) - - batch = [dataset[i] for i in range(8)] - batch = dataset.collate_fn(batch) - - assert len(batch) == 6 - - _, _, _, _, _, taskname_ids = batch - - assert list(taskname_ids[0].numpy()) == tokenizer.text_to_ids("task name A") - - os.remove(dataset_path) - - -if __name__ == "__main__": - t = TestMegatronGPTPromptLearningDataset() - t.test_init_prompt_learning_dataset() - t.test_prompt_learning_dataset_collate_fn_prompt_encoder() - print('-' * 50 + '\nALL PROMPT TUNING UNIT TESTS PASS!\n' + '-' * 50) diff --git a/tests/collections/nlp/test_qna.py b/tests/collections/nlp/test_qna.py deleted file mode 100644 index 4a470cacb711..000000000000 --- a/tests/collections/nlp/test_qna.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import collections - -import pytest -import torch - -from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset -from nemo.collections.nlp.data.question_answering.dataset.qa_gpt_dataset import GPTQADataset -from nemo.collections.nlp.metrics.qa_metrics import QAMetrics - - -@pytest.mark.unit -def test_remove_articles(): - sentences = [ - "this is an apple", - "this is the apple", - "this is a fruit", - ] - - expected_article_removed_sents = ["this is apple", "this is apple", "this is fruit"] - - article_removed_sents = [QAMetrics.remove_articles(sent) for sent in sentences] - - assert article_removed_sents == expected_article_removed_sents - - -@pytest.mark.unit -def test_white_space_fix(): - sentences = [ - "sentence with a space", - "sentence with multiple spaces", - ] - - expected_white_space_fixed_sents = [ - "sentence with a space", - "sentence with multiple spaces", - ] - - white_space_fixed_sents = [QAMetrics.white_space_fix(sent) for sent in sentences] - - assert white_space_fixed_sents == expected_white_space_fixed_sents - - -@pytest.mark.unit -def test_remove_punc(): - sentence = "this, is. a! sentence: with; punctuations?" - expected_punc_removed_sent = "this is a sentence with punctuations" - - punc_removed_sent = QAMetrics.remove_punc(sentence) - - assert punc_removed_sent == expected_punc_removed_sent - - -@pytest.mark.unit -def test_get_normalized_tokens(): - sentence = 'I am happy' - tokens = ['i', 'am', 'happy'] - assert tokens == QAMetrics._get_normalized_tokens(sentence) - - sentence = 'I am a person' - tokens = ['i', 'am', 'person'] - assert tokens == QAMetrics._get_normalized_tokens(sentence) - - sentence = 'I am a person.' - tokens = ['i', 'am', 'person'] - assert tokens == QAMetrics._get_normalized_tokens(sentence) - - -@pytest.mark.unit -def test_get_one_f1(): - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - f1 = QAMetrics.get_one_f1(generated_field, ground_truth_field) - assert f1 == 0.75 - - generated_field = '' - ground_truth_field = 'That' - - f1 = QAMetrics.get_one_f1(generated_field, ground_truth_field) - assert f1 == 0 - - -@pytest.mark.unit -def test_get_one_exact_match(): - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - em = QAMetrics.get_one_exact_match(generated_field, ground_truth_field) - assert em == 0 - - generated_field = 'That is so good!' - ground_truth_field = 'That is so good.' 
- - em = QAMetrics.get_one_exact_match(generated_field, ground_truth_field) - assert em == 1 - - generated_field = 'That is so good' - ground_truth_field = 'that is so good' - - em = QAMetrics.get_one_exact_match(generated_field, ground_truth_field) - assert em == 1 - - -@pytest.mark.unit -def test_split_into_words(): - text = 'hi yo' - char_to_word_offset = [0, 0, 0, 1, 1] - doc_tokens = ["hi", "yo"] - output = QADataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - text = 'i am good' - char_to_word_offset = [0, 0, 1, 1, 1, 2, 2, 2, 2] - doc_tokens = ["i", "am", 'good'] - output = QADataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - -@pytest.mark.unit -def test_get_doc_spans(): - all_doc_tokens = ['a'] * 15 - max_tokens_for_doc = 10 - doc_stride = 5 - doc_spans = QADataset.get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride) - - assert len(doc_spans) == 2 - assert doc_spans[0].start == 0 - assert doc_spans[0].length == 10 - assert doc_spans[1].start == 5 - assert doc_spans[1].length == 10 - - -@pytest.mark.unit -def test_get_average_dist_to_tok_start_and_end(): - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - - doc_span = _DocSpan(start=0, length=5) - - tok_start_position = 1 - tok_end_position = 3 - - assert 2 == QADataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=5) - - tok_start_position = 1 - tok_end_position = 2 - - assert 6 == QADataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=4) - - tok_start_position = 1 - tok_end_position = 2 - - assert 5 == QADataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - -@pytest.mark.unit -def test_keep_relevant_docspans(): - - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'all' - assert doc_spans == QADataset.keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = -1 - tok_end_position = -1 - - mode = 'only_positive' - - expected_doc_spans = [] - assert expected_doc_spans == QADataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'only_positive' - - expected_doc_spans = [_DocSpan(start=0, length=5), _DocSpan(start=1, length=5)] - assert expected_doc_spans == QADataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'limited_negative' - - expected_doc_spans = [_DocSpan(start=start, length=5) for start in range(10)] - assert expected_doc_spans == QADataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - -@pytest.mark.unit -def test_gpt_no_pad_loss_masking(): - input_ids = [1] * 15 + [50257] * 15 - input_ids = torch.tensor(input_ids) - - input_attn_mask = [1] * 16 + [0] * 14 - input_attn_mask = torch.Tensor(input_attn_mask) - - training_mask_end = 10 - - 
expected_labels = [-100] * 10 + [1] * 5 + [50257] + [-100] * 14 - expected_labels = torch.tensor(expected_labels) - - labels = GPTQADataset.update_labels_for_no_pad_loss(input_ids, training_mask_end, input_attn_mask) - - assert torch.all(labels.eq(expected_labels)) diff --git a/tests/collections/nlp/test_question_answering.py b/tests/collections/nlp/test_question_answering.py deleted file mode 100644 index c4aacf449c50..000000000000 --- a/tests/collections/nlp/test_question_answering.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import collections -from pydoc import doc - -import pytest - -from nemo.collections.nlp.data.question_answering_squad.qa_dataset import SquadDataset -from nemo.collections.nlp.data.question_answering_squad.qa_squad_processing import ( - _get_tokens, - exact_match_score, - f1_score, -) - - -@pytest.mark.unit -def test_get_tokens(): - sentence = 'I am happy' - tokens = ['i', 'am', 'happy'] - assert tokens == _get_tokens(sentence) - - sentence = 'I am a person' - tokens = ['i', 'am', 'person'] - assert tokens == _get_tokens(sentence) - - sentence = 'I am a person.' - tokens = ['i', 'am', 'person'] - assert tokens == _get_tokens(sentence) - - -@pytest.mark.unit -def test_f1_score(): - - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - f1 = f1_score(generated_field, ground_truth_field) - assert f1 == 0.75 - - generated_field = '' - ground_truth_field = 'That' - - f1 = f1_score(generated_field, ground_truth_field) - assert f1 == 0 - - -@pytest.mark.unit -def test_exact_match_score(): - - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - em = exact_match_score(generated_field, ground_truth_field) - assert em == 0 - - generated_field = 'That is so good!' - ground_truth_field = 'That is so good.' 
- - em = exact_match_score(generated_field, ground_truth_field) - assert em == 1 - - generated_field = 'That is so good' - ground_truth_field = 'that is so good' - - em = exact_match_score(generated_field, ground_truth_field) - assert em == 1 - - -@pytest.mark.unit -def test_split_into_words(): - text = 'hi yo' - char_to_word_offset = [0, 0, 0, 1, 1] - doc_tokens = ["hi", "yo"] - output = SquadDataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - text = 'i am good' - char_to_word_offset = [0, 0, 1, 1, 1, 2, 2, 2, 2] - doc_tokens = ["i", "am", 'good'] - output = SquadDataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - -@pytest.mark.unit -def test_get_doc_spans(): - all_doc_tokens = ['a'] * 15 - max_tokens_for_doc = 10 - doc_stride = 5 - doc_spans = SquadDataset.get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride) - - assert len(doc_spans) == 2 - assert doc_spans[0].start == 0 - assert doc_spans[0].length == 10 - assert doc_spans[1].start == 5 - assert doc_spans[1].length == 10 - - -@pytest.mark.unit -def test_get_average_dist_to_tok_start_and_end(): - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - - doc_span = _DocSpan(start=0, length=5) - - tok_start_position = 1 - tok_end_position = 3 - - assert 2 == SquadDataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=5) - - tok_start_position = 1 - tok_end_position = 2 - - assert 6 == SquadDataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=4) - - tok_start_position = 1 - tok_end_position = 2 - - assert 5 == SquadDataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - -@pytest.mark.unit -def test_keep_relevant_docspans(): - - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'all' - assert doc_spans == SquadDataset.keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = -1 - tok_end_position = -1 - - mode = 'only_positive' - - expected_doc_spans = [] - assert expected_doc_spans == SquadDataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'only_positive' - - expected_doc_spans = [_DocSpan(start=0, length=5), _DocSpan(start=1, length=5)] - assert expected_doc_spans == SquadDataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'limited_negative' - - expected_doc_spans = [_DocSpan(start=start, length=5) for start in range(10)] - assert expected_doc_spans == SquadDataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) diff --git a/tests/collections/nlp/test_spellchecking_asr_customization.py b/tests/collections/nlp/test_spellchecking_asr_customization.py deleted file mode 100644 index 8e4d6e9a7b8f..000000000000 --- 
a/tests/collections/nlp/test_spellchecking_asr_customization.py +++ /dev/null @@ -1,1102 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -from transformers import AutoTokenizer - -from nemo.collections.nlp.data.spellchecking_asr_customization.bert_example import BertExampleBuilder -from nemo.collections.nlp.data.spellchecking_asr_customization.utils import ( - apply_replacements_to_text, - substitute_replacements_in_text, -) - - -@pytest.mark.unit -def test_substitute_replacements_in_text(): - text = "we began the further diversification of our revenue base with the protterra supply agreement and the navastar joint development agreement" - replacements = [(66, 75, 'pro-terra', 0.99986), (101, 109, 'navistar', 0.996)] - gold_text = "we began the further diversification of our revenue base with the pro-terra supply agreement and the navistar joint development agreement" - corrected_text = substitute_replacements_in_text(text, replacements, replace_hyphen_to_space=False) - assert corrected_text == gold_text - - gold_text_no_hyphen = "we began the further diversification of our revenue base with the pro terra supply agreement and the navistar joint development agreement" - corrected_text = substitute_replacements_in_text(text, replacements, replace_hyphen_to_space=True) - assert corrected_text == gold_text_no_hyphen - - -@pytest.mark.unit -def test_apply_replacements_to_text(): - - # min_prob = 0.5 - # dp_data = None, - # min_dp_score_per_symbol: float = -99.9 - - # test more than one fragment to replace, test multiple same replacements - text = "we began the further diversification of our revenue base with the protterra supply agreement and the navastar joint development agreement" - replacements = [ - (66, 75, 'proterra', 0.99986), - (66, 75, 'proterra', 0.9956), - (101, 109, 'navistar', 0.93), - (101, 109, 'navistar', 0.91), - (101, 109, 'navistar', 0.92), - ] - gold_text = "we began the further diversification of our revenue base with the proterra supply agreement and the navistar joint development agreement" - corrected_text = apply_replacements_to_text( - text, replacements, min_prob=0.5, replace_hyphen_to_space=False, dp_data=None - ) - assert corrected_text == gold_text - - # test that min_prob works - gold_text = "we began the further diversification of our revenue base with the proterra supply agreement and the navastar joint development agreement" - corrected_text = apply_replacements_to_text( - text, replacements, min_prob=0.95, replace_hyphen_to_space=False, dp_data=None - ) - assert corrected_text == gold_text - - -@pytest.fixture() -def bert_example_builder(): - tokenizer = AutoTokenizer.from_pretrained("huawei-noah/TinyBERT_General_6L_768D") - label_map = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "10": 10} - semiotic_classes = {"PLAIN": 0, "CUSTOM": 1} - max_seq_len = 256 - builder = BertExampleBuilder(label_map, semiotic_classes, tokenizer, 
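# The replacement utilities tested above can be sketched like this -- assumed
# behavior, not the NeMo implementation: keep the best-scoring candidate per
# character span that clears `min_prob`, then splice the winners in from right
# to left so earlier offsets stay valid:

def apply_replacements_sketch(text, replacements, min_prob=0.5):
    # replacements: iterable of (start, end, candidate, probability) tuples
    best = {}
    for start, end, candidate, prob in replacements:
        if prob >= min_prob and prob > best.get((start, end), ("", -1.0))[1]:
            best[(start, end)] = (candidate, prob)
    for (start, end), (candidate, _) in sorted(best.items(), reverse=True):
        text = text[:start] + candidate + text[end:]
    return text


# With min_prob=0.95 only the 0.99986-probability candidate survives, so the
# lower-confidence span is left as-is -- mirroring the deleted test's second
# assertion, where "navastar" stays unchanged.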
max_seq_len) - return builder - - -@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_creation(bert_example_builder): - assert bert_example_builder._tokenizer is not None - - -@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_builder_get_spans(bert_example_builder): - span_info_parts = ["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"] - gold_sorted_spans = [(1, 1, 8), (1, 38, 42), (1, 43, 47), (1, 48, 53)] - spans = bert_example_builder._get_spans(span_info_parts) - spans.sort() - assert spans == gold_sorted_spans - - -@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_builder_get_fragment_indices(bert_example_builder): - hyp = "a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w" - targets = [1] - # a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w - # 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 - span_info_parts = ["CUSTOM 8 17"] - gold_sorted_fragment_indices = [(7, 18, 1), (11, 18, 1)] - fragment_indices = bert_example_builder._get_fragment_indices(hyp, targets, span_info_parts) - fragment_indices.sort() - assert fragment_indices == gold_sorted_fragment_indices - - # a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w - # 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - span_info_parts = ["CUSTOM 10 16"] - gold_sorted_fragment_indices = [(11, 18, 1)] - fragment_indices = bert_example_builder._get_fragment_indices(hyp, targets, span_info_parts) - fragment_indices.sort() - assert fragment_indices == gold_sorted_fragment_indices - - -@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_builder_get_input_features(bert_example_builder): - hyp = "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" - ref = "d i d i e r _ s a u m o n;a s t r o n o m i e;t r i s t a n _ g u i l l o t;t r i s t e s s e;m o n a d e;c h r i s t i a n;a s t r o n o m e r;s o l o m o n;d i d i d i d i d i;m e r c y" - targets = [1, 3] - span_info_parts = ["CUSTOM 12 23", "CUSTOM 28 41"] - - gold_tags = [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 0, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - ] - gold_input_ids = [ - 101, - 1037, - 1055, - 1056, - 1054, - 1051, - 1050, - 1051, - 1049, - 1041, - 1054, - 1055, - 1035, - 1040, - 1045, - 1040, - 1045, - 1041, - 1035, - 1055, - 1051, - 1049, - 1051, - 1050, - 1035, - 1037, - 1050, - 1040, - 1035, - 1056, - 1054, - 1045, - 1055, - 1056, - 1045, - 1037, - 1050, - 1035, - 1043, - 1048, - 1048, - 1051, - 102, - 1040, - 1045, - 1040, - 1045, - 1041, - 1054, - 1035, - 1055, - 1037, - 1057, - 1049, - 1051, - 1050, - 102, - 1037, - 1055, - 1056, - 1054, - 1051, - 1050, - 1051, - 1049, - 1045, - 1041, - 102, - 1056, - 1054, - 1045, - 1055, - 1056, - 1037, - 1050, - 1035, - 1043, - 1057, - 1045, - 1048, - 1048, - 1051, - 1056, - 102, - 1056, - 1054, - 1045, - 1055, - 1056, - 1041, - 1055, - 1055, - 1041, - 102, - 1049, - 1051, - 1050, - 1037, - 1040, - 1041, - 102, - 1039, - 1044, - 1054, - 1045, - 1055, - 1056, - 1045, - 1037, - 1050, - 102, - 1037, - 1055, - 1056, - 1054, - 1051, - 1050, - 1051, - 1049, 
- 1041, - 1054, - 102, - 1055, - 1051, - 1048, - 1051, - 1049, - 1051, - 1050, - 102, - 1040, - 1045, - 1040, - 1045, - 1040, - 1045, - 1040, - 1045, - 1040, - 1045, - 102, - 1049, - 1041, - 1054, - 1039, - 1061, - 102, - ] - gold_input_mask = [ - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - ] - gold_segment_ids = [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 5, - 5, - 5, - 5, - 5, - 5, - 5, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 10, - 10, - 10, - 10, - 10, - 10, - ] - gold_labels_mask = [ - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - ] - gold_input_ids_for_subwords = [ - 101, - 26357, - 2106, - 2666, - 2061, - 8202, - 1998, - 13012, - 16643, - 2319, - 1043, - 7174, - 102, - 2106, - 3771, - 7842, - 2819, - 2239, - 102, - 28625, - 3630, - 9856, - 102, - 9822, - 26458, - 7174, - 2102, - 102, - 13012, - 13473, - 11393, - 102, - 13813, - 3207, - 102, - 3017, - 102, - 15211, - 102, - 9168, - 102, - 2106, - 28173, - 4305, - 4305, - 102, - 8673, - 102, - ] - gold_input_mask_for_subwords = [ - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - ] - gold_segment_ids_for_subwords = [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 2, - 2, - 2, - 2, - 3, - 3, - 3, - 3, - 3, - 4, - 4, - 4, - 4, - 5, - 5, - 5, - 6, - 6, - 7, - 7, - 8, - 8, - 9, - 9, - 9, - 9, - 9, - 10, - 10, - ] - gold_character_pos_to_subword_pos = [ - 0, - 1, - 1, - 
1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 2, - 2, - 2, - 3, - 3, - 3, - 4, - 4, - 5, - 5, - 5, - 5, - 6, - 6, - 6, - 6, - 7, - 7, - 7, - 8, - 8, - 8, - 9, - 9, - 9, - 10, - 11, - 11, - 11, - 12, - 13, - 13, - 13, - 14, - 14, - 14, - 14, - 15, - 15, - 16, - 16, - 17, - 17, - 18, - 19, - 19, - 19, - 19, - 19, - 20, - 20, - 21, - 21, - 21, - 22, - 23, - 23, - 23, - 23, - 23, - 23, - 23, - 23, - 24, - 24, - 24, - 25, - 25, - 25, - 26, - 27, - 28, - 28, - 28, - 29, - 29, - 29, - 30, - 30, - 30, - 31, - 32, - 32, - 32, - 32, - 33, - 33, - 34, - 35, - 35, - 35, - 35, - 35, - 35, - 35, - 35, - 35, - 36, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 38, - 39, - 39, - 39, - 39, - 39, - 39, - 39, - 40, - 41, - 41, - 41, - 42, - 42, - 42, - 43, - 43, - 44, - 44, - 45, - 46, - 46, - 46, - 46, - 46, - 47, - ] - - tags = [0 for _ in hyp.split()] - for p, t in zip(span_info_parts, targets): - c, start, end = p.split(" ") - start = int(start) - end = int(end) - tags[start:end] = [t for i in range(end - start)] - - # get input features for characters - (input_ids, input_mask, segment_ids, labels_mask, labels, _, _,) = bert_example_builder._get_input_features( - hyp=hyp, ref=ref, tags=tags - ) - - # get input features for words - hyp_with_words = hyp.replace(" ", "").replace("_", " ") - ref_with_words = ref.replace(" ", "").replace("_", " ") - ( - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - _, - _, - _, - _, - ) = bert_example_builder._get_input_features(hyp=hyp_with_words, ref=ref_with_words, tags=None) - - character_pos_to_subword_pos = bert_example_builder._map_characters_to_subwords(input_ids, input_ids_for_subwords) - - assert tags == gold_tags - assert input_ids == gold_input_ids - assert input_mask == gold_input_mask - assert segment_ids == gold_segment_ids - assert labels_mask == gold_labels_mask - assert input_ids_for_subwords == gold_input_ids_for_subwords - assert input_mask_for_subwords == gold_input_mask_for_subwords - assert segment_ids_for_subwords == gold_segment_ids_for_subwords - assert character_pos_to_subword_pos == gold_character_pos_to_subword_pos diff --git a/tests/core/test_exp_manager.py b/tests/core/test_exp_manager.py index 8c6b33022dac..2d9bd03f0203 100644 --- a/tests/core/test_exp_manager.py +++ b/tests/core/test_exp_manager.py @@ -151,7 +151,7 @@ def test_omegaconf(self): @pytest.mark.unit def test_trainer_loggers(self, tmp_path): - """ Test that a trainer with logger errors out with a number of arguments. Test that it works with + """Test that a trainer with logger errors out with a number of arguments. Test that it works with create_tensorboard_logger set to False """ test_trainer = pl.Trainer(accelerator='cpu') # Should create logger and modelcheckpoint @@ -235,7 +235,7 @@ def test_trainer_neptune_logger(self, tmp_path): @pytest.mark.unit def test_checkpoint_configurations(self): - """ Test that trainer creating modelcheckpoint and asking exp_manager to do it too results in errors, but + """Test that trainer creating modelcheckpoint and asking exp_manager to do it too results in errors, but is error free if only one is asked to do so. 
""" disable_tb_logger = {"create_tensorboard_logger": False} @@ -297,7 +297,7 @@ def test_log_dir_overrides(self, monkeypatch, tmp_path): @pytest.mark.unit def test_resume(self, tmp_path): - """ Tests the resume capabilities of exp_manager""" + """Tests the resume capabilities of exp_manager""" test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False) # Error because explicit_log_dir does not exist @@ -428,7 +428,8 @@ def test_nemo_checkpoint_save_best_model_1(self, tmp_path): def test_nemo_checkpoint_save_best_model_2(self, tmp_path): test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=4) exp_manager( - test_trainer, {"explicit_log_dir": str(tmp_path / "test")}, + test_trainer, + {"explicit_log_dir": str(tmp_path / "test")}, ) model = ExampleModel() test_trainer.fit(model) @@ -456,6 +457,27 @@ def test_nemo_checkpoint_always_save_nemo(self, tmp_path): model = ExampleModel.restore_from(str(tmp_path / "test" / "checkpoints" / "default.nemo")) assert float(model(torch.tensor([1.0, 1.0], device=model.device))) == 0.0 + @pytest.mark.unit + def test_nemo_checkpoint_doesnt_produce_too_many_nemo_ckpts(self, tmp_path): + test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=4) + exp_manager( + test_trainer, + { + "checkpoint_callback_params": {"save_best_model": True, "always_save_nemo": True, "save_top_k": 2}, + "explicit_log_dir": str(tmp_path / "test"), + }, + ) + model = ExampleModel() + test_trainer.fit(model) + + assert Path(str(tmp_path / "test" / "checkpoints" / "default.nemo")).exists() + assert ( + len(list((tmp_path / "test" / "checkpoints").glob("default*.nemo"))) == 1 + ) # check number of `.nemo` checkpoints + + model = ExampleModel.restore_from(str(tmp_path / "test" / "checkpoints" / "default.nemo")) + assert float(model(torch.tensor([1.0, 1.0], device=model.device))) == 0.0 + @pytest.mark.unit def test_nemo_checkpoint_make_checkpoint_dir(self, tmp_path): test_trainer = pl.Trainer( @@ -511,8 +533,8 @@ def test_nemo_checkpoint_restore_model(self, tmp_path): @pytest.mark.run_only_on('GPU') @pytest.mark.parametrize('test_dist_ckpt', [False, True]) - def test_checkpoints_are_not_overwritten(self, tmp_path, test_dist_ckpt): - """ Simulates already existing checkpoints in the ckpt directory and tests ckpt versioning """ + def test_base_checkpoints_are_not_overwritten(self, tmp_path, test_dist_ckpt): + """Simulates already existing checkpoints in the ckpt directory and tests non-nemo ckpt versioning""" strategy = NLPDDPStrategy() if test_dist_ckpt else 'auto' test_trainer = pl.Trainer( accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=4, strategy=strategy @@ -563,7 +585,8 @@ def _get_versioned_name(ckpt_name: Path, nemo: bool = False): assert _get_versioned_name(ckpt_1).exists(), all_checkpoints assert not _get_versioned_name(ckpt_2).exists(), all_checkpoints # ckpt2 didn't exist before - assert _get_versioned_name(ckpt_nemo, nemo=True).exists(), all_checkpoints + # .nemo checkpoints are not versioned: + assert not _get_versioned_name(ckpt_nemo, nemo=True).exists(), all_checkpoints @pytest.mark.unit def test_last_checkpoint_saved(self, tmp_path): @@ -592,6 +615,7 @@ def train_dataloader(self): model_path = checkpoint_dir / "val_loss=0.0300-epoch=1-step=64-last.ckpt" last_saved_checkpoint = torch.load(model_path) assert max_steps == last_saved_checkpoint['global_step'] + # restart training, ensure global step starts correctly class 
AssertCallback(Callback): def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: @@ -681,8 +705,7 @@ def test_warning_validation_skipping_when_custom_epoch_loop(self, tmp_path): """ tmp_path = tmp_path / "test_3" - class CustomLoop(_TrainingEpochLoop): - ... + class CustomLoop(_TrainingEpochLoop): ... trainer = pl.Trainer( accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=1, val_check_interval=0.33 @@ -759,7 +782,8 @@ def test_skipped_unfinished_checkpoints_when_restoring(self, tmp_path): restored_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False) exp_manager( - restored_trainer, {"resume_if_exists": True, "explicit_log_dir": str(test_dir)}, + restored_trainer, + {"resume_if_exists": True, "explicit_log_dir": str(test_dir)}, ) # Check that last complete (w/o unifinished marker) checkpoint was found @@ -803,7 +827,8 @@ def test_skipped_unfinished_dist_checkpoints_when_restoring(self, tmp_path): restored_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False) exp_manager( - restored_trainer, {"resume_if_exists": True, "explicit_log_dir": str(test_dir)}, + restored_trainer, + {"resume_if_exists": True, "explicit_log_dir": str(test_dir)}, ) # Check that last complete (w/o unifinished marker) checkpoint was found @@ -850,13 +875,17 @@ def test_incomplete_checkpoints_cleanup(self, tmp_path): # unfinished checkpoint with EMA part, both parts should be removed self._write_fake_checkpoint( - checkpoints_dir / "incomplete01-EMA.ckpt", isdir=False, add_unfinished_marker=False, + checkpoints_dir / "incomplete01-EMA.ckpt", + isdir=False, + add_unfinished_marker=False, ) self._write_fake_checkpoint(checkpoints_dir / "incomplete01.ckpt", isdir=False, add_unfinished_marker=True) # just EMA part - should be removed. 
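# For orientation, the cleanup tests in this hunk pin down a marker convention
# that can be sketched as follows (the suffix value and helper name are
# assumptions, not NeMo's exact API): an "-unfinished" sentinel file flags a
# checkpoint whose save did not complete, and one marker covers both the base
# checkpoint and its "-EMA" counterpart.

import shutil
from pathlib import Path

UNFINISHED_SUFFIX = "-unfinished"  # assumed UNFINISHED_CHECKPOINT_SUFFIX value


def remove_unfinished_checkpoints_sketch(checkpoints_dir: Path) -> None:
    for marker in checkpoints_dir.glob(f"*{UNFINISHED_SUFFIX}"):
        stem = marker.name[: -len(UNFINISHED_SUFFIX)]
        # Remove every artifact the marker covers: base and EMA, file or
        # directory (distributed checkpoints are directories).
        for ckpt in checkpoints_dir.glob(f"{stem}*.ckpt"):
            if ckpt.is_dir():
                shutil.rmtree(ckpt)
            else:
                ckpt.unlink()
        marker.unlink()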
NOTE marker path is the same for base part and for EMA part self._write_fake_checkpoint( - checkpoints_dir / "incomplete02-EMA.ckpt", isdir=False, add_unfinished_marker=False, + checkpoints_dir / "incomplete02-EMA.ckpt", + isdir=False, + add_unfinished_marker=False, ) (checkpoints_dir / f"incomplete02{NeMoModelCheckpoint.UNFINISHED_CHECKPOINT_SUFFIX}").touch() @@ -864,7 +893,10 @@ def test_incomplete_checkpoints_cleanup(self, tmp_path): exp_manager( test_trainer, - {"checkpoint_callback_params": {"save_top_k": 0, "save_last": False}, "explicit_log_dir": str(test_dir),}, + { + "checkpoint_callback_params": {"save_top_k": 0, "save_last": False}, + "explicit_log_dir": str(test_dir), + }, ) model = ExampleModel() @@ -909,7 +941,10 @@ def test_incomplete_dist_checkpoints_cleanup(self, tmp_path): exp_manager( test_trainer, - {"checkpoint_callback_params": {"save_top_k": 0, "save_last": False}, "explicit_log_dir": str(test_dir),}, + { + "checkpoint_callback_params": {"save_top_k": 0, "save_last": False}, + "explicit_log_dir": str(test_dir), + }, ) model = ExampleModel() diff --git a/tests/export/test_nemo_export.py b/tests/export/test_nemo_export.py index d27b3079a25e..ff1c9eae20ec 100644 --- a/tests/export/test_nemo_export.py +++ b/tests/export/test_nemo_export.py @@ -81,7 +81,12 @@ def get_accuracy_with_lambada(model, nq, task_ids, lora_uids, test_data_path=Non if nq is not None: trtllm_deployed_output = nq.query_llm( - prompts=[prompt], max_output_token=1, top_k=1, top_p=0, temperature=0.1, task_id=task_ids, + prompts=[prompt], + max_output_token=1, + top_k=1, + top_p=0, + temperature=0.1, + task_id=task_ids, ) trtllm_deployed_output = trtllm_deployed_output[0][0].strip().lower() @@ -194,7 +199,7 @@ def run_trt_llm_inference( print("---- LoRA could not be enabled and skipping the test.") return None, None, None, None, None - trt_llm_exporter = TensorRTLLM(trt_llm_model_dir, lora_ckpt_list) + trt_llm_exporter = TensorRTLLM(trt_llm_model_dir, lora_ckpt_list, load_model=False) trt_llm_exporter.export( nemo_checkpoint_path=checkpoint_path, @@ -215,7 +220,8 @@ def run_trt_llm_inference( if ptuning: trt_llm_exporter.add_prompt_table( - task_name="0", prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path, + task_name="0", + prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path, ) output = trt_llm_exporter.forward( @@ -234,7 +240,11 @@ def run_trt_llm_inference( nm = None output_deployed = "" if test_deployment: - nm = DeployPyTriton(model=trt_llm_exporter, triton_model_name=model_name, port=8000,) + nm = DeployPyTriton( + model=trt_llm_exporter, + triton_model_name=model_name, + port=8000, + ) nm.deploy() nm.run() nq = NemoQueryLLM(url="localhost:8000", model_name=model_name) @@ -350,77 +360,121 @@ def get_args(): ) parser.add_argument( - "--model_name", type=str, required=True, + "--model_name", + type=str, + required=True, ) parser.add_argument( - "--existing_test_models", default=False, action='store_true', + "--existing_test_models", + default=False, + action='store_true', ) parser.add_argument( - "--model_type", type=str, required=False, + "--model_type", + type=str, + required=False, ) parser.add_argument( - "--min_gpus", type=int, default=1, required=True, + "--min_gpus", + type=int, + default=1, + required=True, ) parser.add_argument( - "--max_gpus", type=int, + "--max_gpus", + type=int, ) parser.add_argument( - "--checkpoint_dir", type=str, default="/tmp/nemo_checkpoint/", required=False, + "--checkpoint_dir", + type=str, + default="/tmp/nemo_checkpoint/", + 
required=False, ) parser.add_argument( - "--trt_llm_model_dir", type=str, + "--trt_llm_model_dir", + type=str, ) parser.add_argument( - "--max_batch_size", type=int, default=8, + "--max_batch_size", + type=int, + default=8, ) parser.add_argument( - "--max_input_token", type=int, default=256, + "--max_input_token", + type=int, + default=256, ) parser.add_argument( - "--max_output_token", type=int, default=128, + "--max_output_token", + type=int, + default=128, ) parser.add_argument( - "--p_tuning_checkpoint", type=str, + "--p_tuning_checkpoint", + type=str, ) parser.add_argument( - "--ptuning", default=False, action='store_true', + "--ptuning", + default=False, + action='store_true', ) parser.add_argument( - "--lora_checkpoint", type=str, + "--lora_checkpoint", + type=str, ) parser.add_argument( - "--lora", default=False, action='store_true', + "--lora", + default=False, + action='store_true', ) parser.add_argument( - "--tp_size", type=int, + "--tp_size", + type=int, ) parser.add_argument( - "--pp_size", type=int, + "--pp_size", + type=int, ) parser.add_argument( - "--top_k", type=int, default=1, + "--top_k", + type=int, + default=1, ) parser.add_argument( - "--top_p", type=float, default=0.0, + "--top_p", + type=float, + default=0.0, ) parser.add_argument( - "--temperature", type=float, default=1.0, + "--temperature", + type=float, + default=1.0, ) parser.add_argument( - "--run_accuracy", default=False, action='store_true', + "--run_accuracy", + default=False, + action='store_true', ) parser.add_argument("--streaming", default=False, action="store_true") parser.add_argument( - "--test_deployment", type=str, default="False", + "--test_deployment", + type=str, + default="False", ) parser.add_argument( - "--debug", default=False, action='store_true', + "--debug", + default=False, + action='store_true', ) parser.add_argument( - "--ci_upload_test_results_to_cloud", default=False, action='store_true', + "--ci_upload_test_results_to_cloud", + default=False, + action='store_true', ) parser.add_argument( - "--test_data_path", type=str, default=None, + "--test_data_path", + type=str, + default=None, ) return parser.parse_args() diff --git a/tutorials/00_NeMo_Primer.ipynb b/tutorials/00_NeMo_Primer.ipynb index c21696702a39..18b8652fa10a 100644 --- a/tutorials/00_NeMo_Primer.ipynb +++ b/tutorials/00_NeMo_Primer.ipynb @@ -588,7 +588,7 @@ "id": "U7Eezf_sAVS0" }, "source": [ - "You might wonder why we didnt explicitly set `citrinet.cfg.optim = cfg.optim`. \n", + "You might wonder why we didn't explicitly set `citrinet.cfg.optim = cfg.optim`. \n", "\n", "This is because the `setup_optimization()` method does it for you! You can still update the config manually." 
] diff --git a/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb b/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb index 8105aa3db69c..4d2d7148b8b7 100644 --- a/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb +++ b/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb @@ -916,7 +916,7 @@ { "cell_type": "code", "source": [ - "hf_model2 = nemo_asr.models.ASRModel.from_pretrained(hf_model_name + \"v2\")" + "hf_model2 = nemo_asr.models.ASRModel.from_pretrained(hf_model_name + \"_v2\")" ], "metadata": { "id": "WDgwrr2aQyUS" diff --git a/tutorials/asr/ASR_Confidence_Estimation.ipynb b/tutorials/asr/ASR_Confidence_Estimation.ipynb index e177a5132b26..06bb75f8f237 100644 --- a/tutorials/asr/ASR_Confidence_Estimation.ipynb +++ b/tutorials/asr/ASR_Confidence_Estimation.ipynb @@ -284,7 +284,7 @@ " eps_padded_hyp, labels, padded_labels, fill_confidence_deletions(confidence_scores, labels)\n", " ):\n", " word_len = len(word)\n", - " # shield angle brakets for \n", + " # shield angle brackets for \n", " if html and word == \"\":\n", " word = \"<eps>\"\n", " if current_line_len + word_len + 1 <= terminal_width:\n", @@ -307,7 +307,7 @@ " current_word_line = \"\"\n", " for word, score in zip(transcript_list, confidence_scores):\n", " word_len = len(word)\n", - " # shield angle brakets for \n", + " # shield angle brackets for \n", " if html and word == \"\":\n", " word = \"<eps>\"\n", " if current_line_len + word_len + 1 <= terminal_width:\n", diff --git a/tutorials/asr/ASR_Context_Biasing.ipynb b/tutorials/asr/ASR_Context_Biasing.ipynb index 75385234ce29..ec8c0c1b78c6 100644 --- a/tutorials/asr/ASR_Context_Biasing.ipynb +++ b/tutorials/asr/ASR_Context_Biasing.ipynb @@ -361,7 +361,7 @@ "source": [ "## Create a context-biasing list\n", "\n", - "Now, we need to select the words, recognition of wich we want to improve by CTC-WS context-biasing.\n", + "Now, we need to select the words, recognition of which we want to improve by CTC-WS context-biasing.\n", "Usually, we select only nontrivial words with the lowest recognition accuracy.\n", "Such words should have a character length >= 3 because short words in a context-biasing list may produce high false-positive recognition.\n", "In this toy example, we will select all the words that look like names with a recognition accuracy less than 1.0.\n", diff --git a/tutorials/asr/Speech_Commands.ipynb b/tutorials/asr/Speech_Commands.ipynb index f0671763b984..e50e8d1f283e 100644 --- a/tutorials/asr/Speech_Commands.ipynb +++ b/tutorials/asr/Speech_Commands.ipynb @@ -1431,10 +1431,10 @@ "# Lets change the scheduler\n", "optim_sched_cfg.sched.name = \"CosineAnnealing\"\n", "\n", - "# \"power\" isnt applicable to CosineAnnealing so let's remove it\n", + "# \"power\" isn't applicable to CosineAnnealing so let's remove it\n", "optim_sched_cfg.sched.pop('power')\n", "\n", - "# \"hold_ratio\" isnt applicable to CosineAnnealing, so let's remove it\n", + "# \"hold_ratio\" isn't applicable to CosineAnnealing, so let's remove it\n", "optim_sched_cfg.sched.pop('hold_ratio')\n", "\n", "# Set \"min_lr\" to lower value\n", diff --git a/tutorials/multimodal/Multimodal Data Preparation.ipynb b/tutorials/multimodal/Multimodal Data Preparation.ipynb index b817aef798cb..47e3c2c2ed0f 100644 --- a/tutorials/multimodal/Multimodal Data Preparation.ipynb +++ b/tutorials/multimodal/Multimodal Data Preparation.ipynb @@ -67,7 +67,7 @@ "\n", "This notebook will show you how to prepare an image-text dataset into the [WebDataset](https://github.com/webdataset/webdataset) format. 
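# Context for the WebDataset format referenced in the notebook text above: a
# "shard" is just a tar archive in which each sample's files share a key and
# differ only by extension. A stdlib-only sketch (illustrative; real pipelines
# would read shards back with the `webdataset` package):

import io
import tarfile


def write_shard_sketch(samples, shard_path):
    # samples: iterable of (key, jpeg_bytes, caption) image-text pairs
    with tarfile.open(shard_path, "w") as tar:
        for key, jpeg_bytes, caption in samples:
            for ext, payload in ((".jpg", jpeg_bytes), (".txt", caption.encode("utf-8"))):
                info = tarfile.TarInfo(name=f"{key}{ext}")
                info.size = len(payload)
                tar.addfile(info, io.BytesIO(payload))


write_shard_sketch([("000000001", b"<jpeg bytes>", "a photo of a cat")], "shard-00000.tar")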
The Webdataset format is required to train all multimodal models in NeMo, such as Stable Diffusion and Imagen. \n", "\n", - "This notebook is designed to demonstrate the different stages of multimodal dataset preparation. It is not meant to be used to process large-scale datasets since many stages are too time-consuming to run without parallelism. For large workloads, we recommend running the multimodal dataset preparation pipeline with the NeMo-Megatron-Launcher on multiple processors/GPUs. NeMo-Megatron-Launcher packs the same 5 scripts in this notebook into one runnable command and one config file to enable a smooth and a streamlined workflow.\n", + "This notebook is designed to demonstrate the different stages of multimodal dataset preparation. It is not meant to be used to process large-scale datasets since many stages are too time-consuming to run without parallelism. For large workloads, we recommend running the multimodal dataset preparation pipeline with the NeMo-Framework-Launcher on multiple processors/GPUs. NeMo-Framework-Launcher packs the same 5 scripts in this notebook into one runnable command and one config file to enable a smooth and a streamlined workflow.\n", "\n", "Depending on your use case, not all 5 stages need to be run. Please go to [NeMo Multimodal Documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/multimodal/text2img/datasets.html) for an overview of the 5 stages.\n", " \n", @@ -85,7 +85,7 @@ "source": [ "import os\n", "\n", - "LAUNCHER_DIR = \"/opt/NeMo-Megatron-Launcher\"\n", + "LAUNCHER_DIR = \"/opt/NeMo-Framework-Launcher\" # formerly NeMo-Megatron-Launcher\n", "SCRIPT_DIR = os.path.join(LAUNCHER_DIR, \"launcher_scripts/nemo_launcher/collections/dataprep_scripts/multimodal_dataprep\")\n", "CONF_DIR = \"conf\"\n", "DATA_DIR = \"dummy_data\"\n", @@ -168,7 +168,7 @@ "\n", "Script: download_images.py\n", "\n", - "Environment variables (automatically set by SLURM if running with NeMo-Megatron-Launcher):\n", + "Environment variables (automatically set by SLURM if running with NeMo-Framework-Launcher):\n", "- `SLURM_ARRAY_TASK_COUNT`: total number of tasks, should be set to the number of parquet files in `$DATA_DIR/parquet/dummy_dataset50000.parquet_parts`. (i.e. `parquet_subpartitions` x `num_parquets_downloaded`)\n", "- `SLURM_ARRAY_TASK_ID`: id of the current task (0 <= SLURM_ARRAY_TASK_ID < SLURM_ARRAY_TASK_COUNT)\n", "\n", @@ -266,7 +266,7 @@ "\n", "Script: reorganize_tar.py\n", "\n", - "Environment variables (automatically set by SLURM if running with NeMo-Megatron-Launcher):\n", + "Environment variables (automatically set by SLURM if running with NeMo-Framework-Launcher):\n", "- `SLURM_ARRAY_TASK_COUNT`: total number of tasks, should be set to parquet_subpartitions x num_parquets_downloaded\n", "- `SLURM_ARRAY_TASK_ID`: id of the current task (0 <= `SLURM_ARRAY_TASK_ID` < `SLURM_ARRAY_TASK_COUNT`)\n", "\n", @@ -430,7 +430,7 @@ }, "outputs": [], "source": [ - "! wget https://raw.githubusercontent.com/NVIDIA/NeMo-Megatron-Launcher/master/launcher_scripts/conf/data_preparation/multimodal/precache_sd.yaml -P $CONF_DIR/" + "! 
wget https://raw.githubusercontent.com/NVIDIA/NeMo-Framework-Launcher/master/launcher_scripts/conf/data_preparation/multimodal/precache_sd.yaml -P $CONF_DIR/" ] }, { @@ -506,7 +506,7 @@ "\n", "Script: precache_encodings.py\n", "\n", - "Environment variables (automatically set by SLURM if running with NeMo-Megatron-Launcher):\n", + "Environment variables (automatically set by SLURM if running with NeMo-Framework-Launcher):\n", "- `SLURM_ARRAY_TASK_COUNT`: total number of tasks, should be set to parquet_subpartitions x num_parquets_downloaded\n", "- `SLURM_ARRAY_TASK_ID`: id of the current task (0 <= `SLURM_ARRAY_TASK_ID` < `SLURM_ARRAY_TASK_COUNT`)\n", "\n", @@ -533,15 +533,6 @@ " precache_config_path=$CONF_DIR/precache_sd_example.yaml" ] }, - { - "cell_type": "markdown", - "source": [ - "If you encounter a nemo import problem with the cell above, please also running it in the terminal directly." - ], - "metadata": { - "collapsed": false - } - }, { "attachments": {}, "cell_type": "markdown", diff --git a/tutorials/nlp/Dialogue.ipynb b/tutorials/nlp/Dialogue.ipynb deleted file mode 100644 index e0e2129e4dda..000000000000 --- a/tutorials/nlp/Dialogue.ipynb +++ /dev/null @@ -1,717 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "jaosjY4rGRNH" - }, - "source": [ - "# Installing NeMo from source\n", - "\n", - "\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run the cell below to set up dependencies.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "goQzOSflEq27" - }, - "outputs": [], - "source": [ - "import os \n", - "BRANCH = 'r2.0.0rc0'\n", - "!apt-get update && apt-get install -y libsndfile1 ffmpeg\n", - "!git clone https://github.com/NVIDIA/NeMo --branch $BRANCH\n", - "os.chdir('NeMo')\n", - "!./reinstall.sh\n", - "os.chdir('..')\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GjQ_z_xQMDIb" - }, - "source": [ - "# Overview\n", - "\n", - "There are three tasks as part of this tutorial\n", - "\n", - "1. Intent and Slot Classification using Assistant Dataset and a BERT model\n", - "2. Intent Classification using Schema Guided Dialogue Dataset and a GPT2 model\n", - "3. Answer Extender using MS Marco NLGen Dataset and a BART model\n", - "\n", - "Feel free to skip to the task that interests you most after installing NeMo from source." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AS-zwy8tEq2_" - }, - "source": [ - "# 1. 
Intent and Slot Classification using Assistant Dataset\n", - "\n", - "## 1.1 Task Description\n", - "\n", - "**Joint Intent and Slot classification** - is a task of classifying an Intent and detecting all relevant Slots (Entities)\n", - "for this Intent in a query.\n", - "For example, in the query: `What is the weather in Santa Clara tomorrow morning?`, we would like to classify the query\n", - "as a `weather` Intent, and detect `Santa Clara` as a `location` slot and `tomorrow morning` as a `date_time` slot.\n", - "Intents and Slots names are usually task specific and defined as labels in the training data.\n", - "This is a fundamental step that is executed in any task-driven Conversational Assistant.\n", - "\n", - "Our model enables to train and then detect both of these tasks together.\n", - "\n", - "Note: There is a similar model available at [Joint Intent Slot Classification Colab](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb). However, this model only support BERT style models while the model in this tutorial supports other types of models such as GPT2. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FJk_UAyeEq3B" - }, - "source": [ - "\n", - "## 1.2 Download Assistant dataset and convert to NeMo format\n", - "\n", - "This is a virtual assistant interaction data set that can be downloaded from here: https://github.com/xliuhw/NLU-Evaluation-Data.\n", - "There are about 10K training and 1K testing queries which cover 64 various Intents and 55 Slots. \n", - "\n", - "An example is:\n", - "\n", - "* utterance: what alarms have i set for tomorrow \n", - "* intent: alarm_query\n", - "* slots: date(tomorrow)\n", - "\n", - "\n", - "Note: While only the assistant dataset is used here, import_dataset.py is also compatible with ATIS and SNIPS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jjOVdGX2Eq3D" - }, - "outputs": [], - "source": [ - "# download and unzip the example dataset from github\n", - "!wget https://github.com/xliuhw/NLU-Evaluation-Data/archive/master.zip\n", - "!unzip master.zip\n", - "# convert the dataset to the NeMo format\n", - "!python NeMo/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py --dataset_name=assistant --source_data_dir=./NLU-Evaluation-Data-master --target_data_dir=./assistant" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5n81deZsEq3G" - }, - "source": [ - "## 1.3 Training and/or Testing the model\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eoYc_8jhEq3G" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.data_dir='./assistant' \\\n", - " model.dataset.dialogues_example_dir='./assistant_bert_examples' \\\n", - " model.dataset.task='assistant' \\\n", - " model.language_model.pretrained_model_name='bert-base-uncased' \\\n", - " exp_manager.create_wandb_logger=False)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GaPmHjayEbg8" - }, - "source": [ - "**Results after 3 epochs**\n", - "\n", - "Intent report: \n", - "```\n", - " label precision recall f1 support \n", - " alarm_query (label_id: 0) 100.00 94.44 97.14 18\n", - " alarm_remove (label_id: 1) 100.00 90.91 95.24 11\n", - " 
alarm_set (label_id: 2) 94.12 94.12 94.12 17\n", - " audio_volume_down (label_id: 3) 75.00 42.86 54.55 7\n", - " audio_volume_mute (label_id: 4) 100.00 92.86 96.30 14\n", - " audio_volume_up (label_id: 5) 72.22 100.00 83.87 13\n", - " calendar_query (label_id: 6) 87.50 77.78 82.35 18\n", - " calendar_remove (label_id: 7) 94.44 100.00 97.14 17\n", - " calendar_set (label_id: 8) 94.44 94.44 94.44 18\n", - " cooking_recipe (label_id: 9) 85.71 70.59 77.42 17\n", - " datetime_convert (label_id: 10) 88.89 100.00 94.12 8\n", - " datetime_query (label_id: 11) 89.47 100.00 94.44 17\n", - " email_addcontact (label_id: 12) 80.00 100.00 88.89 8\n", - " email_query (label_id: 13) 100.00 83.33 90.91 18\n", - " email_querycontact (label_id: 14) 78.95 88.24 83.33 17\n", - " email_sendemail (label_id: 15) 94.44 94.44 94.44 18\n", - " general_affirm (label_id: 16) 100.00 100.00 100.00 17\n", - " general_commandstop (label_id: 17) 100.00 100.00 100.00 18\n", - " general_confirm (label_id: 18) 100.00 100.00 100.00 17\n", - " general_dontcare (label_id: 19) 100.00 100.00 100.00 18\n", - " general_explain (label_id: 20) 100.00 100.00 100.00 17\n", - " general_joke (label_id: 21) 91.67 100.00 95.65 11\n", - " general_negate (label_id: 22) 100.00 100.00 100.00 18\n", - " general_praise (label_id: 23) 100.00 100.00 100.00 17\n", - " general_quirky (label_id: 24) 60.00 50.00 54.55 18\n", - " general_repeat (label_id: 25) 100.00 100.00 100.00 17\n", - " iot_cleaning (label_id: 26) 100.00 100.00 100.00 15\n", - " iot_coffee (label_id: 27) 85.71 100.00 92.31 18\n", - " iot_hue_lightchange (label_id: 28) 100.00 94.12 96.97 17\n", - " iot_hue_lightdim (label_id: 29) 100.00 100.00 100.00 12\n", - " iot_hue_lightoff (label_id: 30) 100.00 100.00 100.00 17\n", - " iot_hue_lighton (label_id: 31) 100.00 50.00 66.67 4\n", - " iot_hue_lightup (label_id: 32) 84.62 91.67 88.00 12\n", - " iot_wemo_off (label_id: 33) 100.00 100.00 100.00 9\n", - " iot_wemo_on (label_id: 34) 100.00 85.71 92.31 7\n", - " lists_createoradd (label_id: 35) 90.00 100.00 94.74 18\n", - " lists_query (label_id: 36) 100.00 94.12 96.97 17\n", - " lists_remove (label_id: 37) 88.89 88.89 88.89 18\n", - " music_likeness (label_id: 38) 100.00 93.75 96.77 16\n", - " music_query (label_id: 39) 100.00 100.00 100.00 17\n", - " music_settings (label_id: 40) 77.78 100.00 87.50 7\n", - " news_query (label_id: 41) 72.73 88.89 80.00 18\n", - " play_audiobook (label_id: 42) 100.00 100.00 100.00 17\n", - " play_game (label_id: 43) 93.75 83.33 88.24 18\n", - " play_music (label_id: 44) 85.00 100.00 91.89 17\n", - " play_podcasts (label_id: 45) 100.00 88.89 94.12 18\n", - " play_radio (label_id: 46) 84.21 94.12 88.89 17\n", - " qa_currency (label_id: 47) 85.00 94.44 89.47 18\n", - " qa_definition (label_id: 48) 89.47 100.00 94.44 17\n", - " qa_factoid (label_id: 49) 64.00 88.89 74.42 18\n", - " qa_maths (label_id: 50) 84.62 84.62 84.62 13\n", - " qa_stock (label_id: 51) 87.50 77.78 82.35 18\n", - " recommendation_events (label_id: 52) 87.50 82.35 84.85 17\n", - " recommendation_locations (label_id: 53) 83.33 83.33 83.33 18\n", - " recommendation_movies (label_id: 54) 100.00 60.00 75.00 10\n", - " social_post (label_id: 55) 100.00 94.12 96.97 17\n", - " social_query (label_id: 56) 100.00 82.35 90.32 17\n", - " takeaway_order (label_id: 57) 92.31 70.59 80.00 17\n", - " takeaway_query (label_id: 58) 93.75 83.33 88.24 18\n", - " transport_query (label_id: 59) 81.25 76.47 78.79 17\n", - " transport_taxi (label_id: 60) 100.00 100.00 100.00 16\n", - " transport_ticket (label_id: 
61) 85.00 94.44 89.47 18\n", - " transport_traffic (label_id: 62) 93.75 88.24 90.91 17\n", - " weather_query (label_id: 63) 89.47 100.00 94.44 17\n", - " -------------------\n", - " micro avg 91.16 91.16 91.16 996\n", - " macro avg 91.66 90.44 90.48 996\n", - " weighted avg 91.72 91.16 91.04 996\n", - "```\n", - "Slot report: \n", - "```\n", - " label precision recall f1 support \n", - " alarm_type (label_id: 0) 0.00 0.00 0.00 2\n", - " app_name (label_id: 1) 0.00 0.00 0.00 1\n", - " artist_name (label_id: 2) 17.39 80.00 28.57 5\n", - " audiobook_author (label_id: 3) 0.00 0.00 0.00 0\n", - " audiobook_name (label_id: 4) 64.52 74.07 68.97 27\n", - " business_name (label_id: 5) 81.48 84.62 83.02 52\n", - " business_type (label_id: 6) 80.00 80.00 80.00 20\n", - " change_amount (label_id: 7) 57.14 66.67 61.54 6\n", - " coffee_type (label_id: 8) 100.00 33.33 50.00 3\n", - " color_type (label_id: 9) 75.00 92.31 82.76 13\n", - " cooking_type (label_id: 10) 0.00 0.00 0.00 1\n", - " currency_name (label_id: 11) 100.00 96.43 98.18 28\n", - " date (label_id: 12) 87.88 87.22 87.55 133\n", - " definition_word (label_id: 13) 85.00 85.00 85.00 20\n", - " device_type (label_id: 14) 84.75 76.92 80.65 65\n", - " drink_type (label_id: 15) 0.00 0.00 0.00 0\n", - " email_address (label_id: 16) 64.29 100.00 78.26 9\n", - " email_folder (label_id: 17) 100.00 50.00 66.67 2\n", - " event_name (label_id: 18) 80.00 75.00 77.42 64\n", - " food_type (label_id: 19) 84.38 77.14 80.60 35\n", - " game_name (label_id: 20) 93.55 78.38 85.29 37\n", - " game_type (label_id: 21) 0.00 0.00 0.00 0\n", - " general_frequency (label_id: 22) 0.00 0.00 0.00 9\n", - " house_place (label_id: 23) 80.95 91.89 86.08 37\n", - " ingredient (label_id: 24) 0.00 0.00 0.00 1\n", - " joke_type (label_id: 25) 100.00 100.00 100.00 5\n", - " list_name (label_id: 26) 89.29 69.44 78.12 36\n", - " meal_type (label_id: 27) 0.00 0.00 0.00 3\n", - " media_type (label_id: 28) 78.95 83.33 81.08 36\n", - " movie_name (label_id: 29) 0.00 0.00 0.00 1\n", - " movie_type (label_id: 30) 0.00 0.00 0.00 0\n", - " music_album (label_id: 31) 0.00 0.00 0.00 0\n", - " music_descriptor (label_id: 32) 0.00 0.00 0.00 2\n", - " music_genre (label_id: 33) 81.82 90.00 85.71 10\n", - " news_topic (label_id: 34) 80.00 30.77 44.44 13\n", - " order_type (label_id: 35) 100.00 42.11 59.26 19\n", - " person (label_id: 36) 70.79 100.00 82.89 63\n", - " personal_info (label_id: 37) 76.19 94.12 84.21 17\n", - " place_name (label_id: 38) 82.86 84.47 83.65 103\n", - " player_setting (label_id: 39) 75.00 42.86 54.55 7\n", - " playlist_name (label_id: 40) 0.00 0.00 0.00 3\n", - " podcast_descriptor (label_id: 41) 92.31 54.55 68.57 22\n", - " podcast_name (label_id: 42) 66.67 16.67 26.67 12\n", - " radio_name (label_id: 43) 94.87 94.87 94.87 39\n", - " relation (label_id: 44) 90.91 90.91 90.91 11\n", - " song_name (label_id: 45) 100.00 6.67 12.50 15\n", - " time (label_id: 46) 77.57 84.69 80.98 98\n", - " time_zone (label_id: 47) 44.44 100.00 61.54 4\n", - " timeofday (label_id: 48) 86.96 80.00 83.33 25\n", - " transport_agency (label_id: 49) 80.00 57.14 66.67 7\n", - " transport_descriptor (label_id: 50) 0.00 0.00 0.00 5\n", - " transport_name (label_id: 51) 0.00 0.00 0.00 0\n", - " transport_type (label_id: 52) 88.89 100.00 94.12 40\n", - " weather_descriptor (label_id: 53) 87.50 87.50 87.50 8\n", - " O (label_id: 54) 97.07 97.52 97.30 5408\n", - " -------------------\n", - " micro avg 94.24 94.24 94.24 6582\n", - " macro avg 64.87 59.93 59.17 6582\n", - " weighted avg 94.23 94.24 93.95 
6582\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-44x5PqyrOeQ" - }, - "source": [ - "## 1.4 (Optional) To train/ test a GPT2 model on the assistant dataset, run the cell below " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QyqQbpR4rNHT" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "# model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\": gpt2 doesn't specify a pad token, therefore using its EOS token as the pad token\n", - "# model.dataset.target_template=with_slots: this perform slot filling with intent classification\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.data_dir='./assistant' \\\n", - " model.dataset.dialogues_example_dir='./assistant_gpt2_examples' \\\n", - " model.dataset.task='assistant' \\\n", - " model.language_model.pretrained_model_name='gpt2' \\\n", - " trainer.max_epochs=1 \\\n", - " model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\" \\\n", - " model.dataset.target_template=with_slots \\\n", - " model.dataset.eval_mode=generation \\\n", - " exp_manager.create_wandb_logger=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FbQ-6TVM1yQg" - }, - "source": [ - "**After 1 epoch:**\n", - "\n", - "More epochs would be helpful\n", - "\n", - "Intent report:\n", - "\n", - " ```\n", - " label precision recall f1 support \n", - " transport query (label_id: 0) 72.73 84.21 78.05 19\n", - " weather query (label_id: 1) 94.74 94.74 94.74 19\n", - " play game (label_id: 2) 92.86 68.42 78.79 19\n", - " qa currency (label_id: 3) 100.00 100.00 100.00 19\n", - " qa maths (label_id: 4) 100.00 100.00 100.00 14\n", - " iot wemo off (label_id: 5) 75.00 100.00 85.71 9\n", - " datetime convert (label_id: 6) 46.67 87.50 60.87 8\n", - " email addcontact (label_id: 7) 70.00 87.50 77.78 8\n", - " music likeness (label_id: 8) 57.89 61.11 59.46 18\n", - " music query (label_id: 9) 78.57 57.89 66.67 19\n", - " general negate (label_id: 10) 95.00 100.00 97.44 19\n", - " email sendemail (label_id: 11) 92.86 68.42 78.79 19\n", - " general affirm (label_id: 12) 95.00 100.00 97.44 19\n", - " play audiobook (label_id: 13) 57.69 78.95 66.67 19\n", - " general praise (label_id: 14) 100.00 94.74 97.30 19\n", - " alarm set (label_id: 15) 85.71 94.74 90.00 19\n", - " general explain (label_id: 16) 100.00 89.47 94.44 19\n", - " iot wemo on (label_id: 17) 83.33 71.43 76.92 7\n", - " cooking recipe (label_id: 18) 90.00 94.74 92.31 19\n", - " music settings (label_id: 19) 60.00 42.86 50.00 7\n", - " social post (label_id: 20) 84.21 84.21 84.21 19\n", - " recommendation events (label_id: 21) 72.73 84.21 78.05 19\n", - " audio volume up (label_id: 22) 76.47 100.00 86.67 13\n", - " lists remove (label_id: 23) 73.08 100.00 84.44 19\n", - " transport ticket (label_id: 24) 94.74 94.74 94.74 19\n", - " general joke (label_id: 25) 100.00 100.00 100.00 12\n", - " play podcasts (label_id: 26) 94.12 84.21 88.89 19\n", - " iot hue lightchange (label_id: 27) 85.71 63.16 72.73 19\n", - " audio volume mute (label_id: 28) 84.62 73.33 78.57 15\n", - " general dontcare (label_id: 29) 95.00 100.00 97.44 19\n", - " qa definition (label_id: 30) 77.27 89.47 82.93 19\n", - " email querycontact (label_id: 31) 58.33 73.68 65.12 19\n", - " general commandstop (label_id: 32) 100.00 100.00 100.00 19\n", - " calendar remove 
(label_id: 33) 94.44 89.47 91.89 19\n", - " news query (label_id: 34) 100.00 57.89 73.33 19\n", - " calendar query (label_id: 35) 63.16 63.16 63.16 19\n", - " social query (label_id: 36) 88.24 83.33 85.71 18\n", - " transport traffic (label_id: 37) 90.48 100.00 95.00 19\n", - " transport taxi (label_id: 38) 100.00 94.44 97.14 18\n", - " alarm query (label_id: 39) 100.00 94.74 97.30 19\n", - " iot hue lightoff (label_id: 40) 88.89 84.21 86.49 19\n", - " takeaway order (label_id: 41) 81.25 68.42 74.29 19\n", - " iot coffee (label_id: 42) 100.00 94.74 97.30 19\n", - " recommendation movies (label_id: 43) 75.00 90.00 81.82 10\n", - " iot hue lightup (label_id: 44) 78.57 78.57 78.57 14\n", - " email query (label_id: 45) 85.71 94.74 90.00 19\n", - " lists createoradd (label_id: 46) 82.35 73.68 77.78 19\n", - " play radio (label_id: 47) 84.21 84.21 84.21 19\n", - " audio volume down (label_id: 48) 100.00 87.50 93.33 8\n", - " general quirky (label_id: 49) 30.00 15.79 20.69 19\n", - " play music (label_id: 50) 71.43 52.63 60.61 19\n", - " qa stock (label_id: 51) 90.48 100.00 95.00 19\n", - " iot cleaning (label_id: 52) 93.33 87.50 90.32 16\n", - " iot hue lightdim (label_id: 53) 100.00 100.00 100.00 12\n", - " recommendation locations (label_id: 54) 100.00 89.47 94.44 19\n", - " general repeat (label_id: 55) 100.00 100.00 100.00 19\n", - " takeaway query (label_id: 56) 77.27 89.47 82.93 19\n", - " alarm remove (label_id: 57) 100.00 100.00 100.00 11\n", - " datetime query (label_id: 58) 75.00 63.16 68.57 19\n", - " iot hue lighton (label_id: 59) 60.00 100.00 75.00 3\n", - " qa factoid (label_id: 60) 50.00 57.89 53.66 19\n", - " calendar set (label_id: 61) 75.00 78.95 76.92 19\n", - " general confirm (label_id: 62) 100.00 100.00 100.00 19\n", - " lists query (label_id: 63) 66.67 73.68 70.00 19\n", - " label_id: 64 0.00 0.00 0.00 0\n", - " -------------------\n", - " micro avg 83.55 83.55 83.55 1076\n", - " macro avg 83.53 83.93 83.01 1076\n", - " weighted avg 84.26 83.55 83.30 1076\n", - " \n", - "```\n", - "\n", - "```\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " Test metric DataLoader 0\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " intent_f1 83.55018615722656\n", - " intent_precision 83.55018615722656\n", - " intent_recall 83.55018615722656\n", - " slot_f1 73.99985919756773\n", - "slot_joint_goal_accuracy 65.89219330855019\n", - " slot_precision 73.85223048327137\n", - " slot_recall 74.14807930607186\n", - " test_intent_accuracy 83.55018587360595\n", - " test_loss_epoch 0.019178826361894608\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Gd42arYoEq3J" - }, - "source": [ - "# 2. 
Schema Guided Dialogue (SGD)\n", - "\n", - "## 2.1 Task Description\n", - "---\n", - "\n", - "SGD is a multi-domain intent classification dataset from Google with close to 100k examples.\n", - "\n", - "An example is:\n", - "\n", - "* utterance: I will be eating there at 11:30 am so make the reservation for then.\n", - "* intent: ReserveRestaurant\n", - "* slots: {\"time\": \"11:30 am\"}\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "neH8rXwjEq3J" - }, - "source": [ - "## 2.2 Download the dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "IgD8eavfJ5pi" - }, - "outputs": [], - "source": [ - "!git clone https://github.com/google-research-datasets/dstc8-schema-guided-dialogue.git" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7G7uPrUpEq3J" - }, - "source": [ - "## 2.3 Training and/or Testing the model\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gqo-rwQlEq3K" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "# model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\": gpt2 doesn't specify a pad token, therefore using its EOS token as the pad token\n", - "\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.data_dir='./dstc8-schema-guided-dialogue' \\\n", - " model.dataset.dialogues_example_dir='./sgd_gpt2_predictions' \\\n", - " model.dataset.task='sgd' \\\n", - " model.language_model.pretrained_model_name='gpt2' \\\n", - " trainer.max_epochs=1 \\\n", - " model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\" \\\n", - " exp_manager.create_wandb_logger=False)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kGDlV5HvI2PQ" - }, - "outputs": [], - "source": [ - "!ls sgd_gpt2_predictions" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "p8g0f5KDTu9K" - }, - "source": [ - "**After 1 epoch:**\n", - "\n", - "More epochs would needed to reach convergence.\n", - "\n", - "\n", - "```\n", - " label precision recall f1 support \n", - " check balance (label_id: 0) 0.00 0.00 0.00 0\n", - " find trains (label_id: 1) 80.20 91.95 85.68 348\n", - " make payment (label_id: 2) 83.12 28.07 41.97 228\n", - " book appointment (label_id: 3) 86.93 87.15 87.04 397\n", - " get cars available (label_id: 4) 96.88 90.51 93.58 274\n", - " get event dates (label_id: 5) 0.00 0.00 0.00 0\n", - " buy bus ticket (label_id: 6) 78.61 91.33 84.49 173\n", - " add event (label_id: 7) 0.00 0.00 0.00 0\n", - " get alarms (label_id: 8) 58.33 77.78 66.67 45\n", - " reserve car (label_id: 9) 83.75 72.43 77.68 185\n", - " get events (label_id: 10) 0.00 0.00 0.00 0\n", - " reserve roundtrip flights (label_id: 11) 0.00 0.00 0.00 0\n", - " lookup music (label_id: 12) 89.83 86.89 88.33 61\n", - " book house (label_id: 13) 91.13 92.50 91.81 200\n", - " search oneway flight (label_id: 14) 74.77 47.70 58.25 174\n", - " buy event tickets (label_id: 15) 72.19 95.31 82.15 128\n", - " find apartment (label_id: 16) 0.00 0.00 0.00 0\n", - " schedule visit (label_id: 17) 77.27 66.06 71.23 386\n", - " play media (label_id: 18) 92.94 86.81 89.77 91\n", - " get ride (label_id: 19) 99.41 98.82 99.12 170\n", - " reserve oneway flight (label_id: 20) 0.00 0.00 0.00 0\n", - " find bus (label_id: 21) 96.64 87.53 91.86 361\n", - " find restaurants 
(label_id: 22) 77.14 91.22 83.59 148\n", - " get times for movie (label_id: 23) 0.00 0.00 0.00 0\n", - " transfer money (label_id: 24) 0.00 0.00 0.00 0\n", - " request payment (label_id: 25) 46.71 63.39 53.79 112\n", - " play movie (label_id: 26) 100.00 65.11 78.87 321\n", - " search house (label_id: 27) 97.91 91.83 94.77 306\n", - " search roundtrip flights (label_id: 28) 67.49 82.41 74.21 199\n", - " find provider (label_id: 29) 95.11 90.53 92.77 602\n", - " find attractions (label_id: 30) 100.00 89.01 94.19 91\n", - " reserve hotel (label_id: 31) 56.75 97.04 71.62 169\n", - " lookup song (label_id: 32) 0.00 0.00 0.00 0\n", - " add alarm (label_id: 33) 95.68 60.18 73.89 221\n", - " find home by area (label_id: 34) 48.95 59.79 53.83 194\n", - " get available time (label_id: 35) 0.00 0.00 0.00 0\n", - " buy movie tickets (label_id: 36) 100.00 29.39 45.42 473\n", - " reserve restaurant (label_id: 37) 95.71 84.80 89.92 342\n", - " find movies (label_id: 38) 62.40 97.61 76.14 335\n", - " get weather (label_id: 39) 100.00 87.69 93.44 195\n", - " search hotel (label_id: 40) 99.35 52.60 68.78 289\n", - " find events (label_id: 41) 99.57 82.56 90.27 281\n", - " play song (label_id: 42) 0.00 0.00 0.00 0\n", - " rent movie (label_id: 43) 0.00 0.00 0.00 0\n", - " get train tickets (label_id: 44) 45.83 5.56 9.91 198\n", - " none (label_id: 45) 55.77 98.90 71.32 728\n", - " label_id: 46 0.00 0.00 0.00 0\n", - " -------------------\n", - " micro avg 77.23 77.23 77.23 8425\n", - " macro avg 82.01 76.68 76.56 8425\n", - " weighted avg 83.23 77.23 76.86 8425\n", - "\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jUJb-9VLLBXo" - }, - "source": [ - "# 3. MS Marco\n", - "\n", - "## 3.1 Task Description\n", - "\n", - "MS Marco NLGen is a dataset from Microsoft that takes extracted answers and questions and outputs fluent answers.\n", - "\n", - "An example is:\n", - "\n", - "* question: What county is Nine Mile in?\n", - "* extracted_answer: Onondaga\n", - "* fluent_answer: Nine Mile is in Onondaga county.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VtXEKG_UQU9u" - }, - "source": [ - "## 3.2 Download and unzip files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "b9avsZ1CEq3K" - }, - "outputs": [], - "source": [ - "!mkdir ms_marco\n", - "os.chdir('ms_marco')\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz\n", - "\n", - "!gunzip train_v2.1.json.gz\n", - "!gunzip dev_v2.1.json.gz\n", - "\n", - "!python ../NeMo/examples/nlp/dialogue/remove_ms_marco_samples_without_wellFormedAnswers.py --filename train_v2.1.json \n", - "!python ../NeMo/examples/nlp/dialogue/remove_ms_marco_samples_without_wellFormedAnswers.py --filename dev_v2.1.json \n", - "\n", - "os.chdir('..')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "h7UZ9R8gQTFo" - }, - "source": [ - "## 3.3 Training and/or Testing the model\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fwGQCwbvRf2m" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.dialogues_example_dir='./marco_bart_predictions' \\\n", - " model.dataset.data_dir='./ms_marco' \\
\n", - " model.save_model=True \\\n", - " model.dataset.task='ms_marco' \\\n", - " model.language_model.pretrained_model_name='facebook/bart-base' \\\n", - " trainer.max_epochs=1 \\\n", - " model.dataset.debug_mode=False \\\n", - " exp_manager.create_wandb_logger=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UL7ekAOZ2abi" - }, - "source": [ - "**After 1 epoch:**\n", - "\n", - "Train for more epochs for optimal performance.\n", - "\n", - "```\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " Test metric DataLoader 0\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " bleu 65.46179962158203\n", - " f1 78.24439835896995\n", - " precision 81.92473076099847\n", - " recall 76.72508929408436\n", - " test_accuracy 25.563487607283225\n", - " test_loss 0.4419259166606655\n", - " test_loss_epoch 0.4420809745788574\n", - " test_ppl 1.5557004846779854\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - "```" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "Dialogue.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/tutorials/nlp/Entity_Linking_Medical.ipynb b/tutorials/nlp/Entity_Linking_Medical.ipynb deleted file mode 100644 index a53f66540e84..000000000000 --- a/tutorials/nlp/Entity_Linking_Medical.ipynb +++ /dev/null @@ -1,632 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. 
Run this cell to set up dependencies.\n", - "\"\"\"\n", - "\n", - "## Install NeMo if using Google Colab or if it's not installed locally\n", - "BRANCH = 'r2.0.0rc0'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## Install dependencies\n", - "!pip install wget\n", - "!pip install faiss-gpu" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import faiss\n", - "import torch\n", - "import wget\n", - "import os\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from omegaconf import OmegaConf\n", - "from pytorch_lightning import Trainer\n", - "from IPython.display import display\n", - "from tqdm import tqdm\n", - "\n", - "from nemo.collections import nlp as nemo_nlp\n", - "from nemo.utils.exp_manager import exp_manager" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Entity Linking" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Task Description\n", - "[Entity linking](https://en.wikipedia.org/wiki/Entity_linking) is the process of connecting concepts mentioned in natural language to their canonical forms stored in a knowledge base. For example, say a knowledge base contained the entity 'ID3452 influenza' and we wanted to process some natural language containing the sentence \"The patient has flu like symptoms\". An entity linking model would match the word 'flu' to the knowledge base entity 'ID3452 influenza', allowing for disambiguation and normalization of concepts referenced in text. Entity linking applications range from helping automate data ingestion to assisting in real-time dialogue concept normalization. We will be focusing on entity linking in the medical domain for this demo, but the entity linking model, dataset, and training code within NVIDIA NeMo can be applied to other domains like finance and retail.\n", - "\n", - "Within NeMo and this tutorial we use the entity linking approach described in Liu et al.'s NAACL 2021 paper \"[Self-alignment Pre-training for Biomedical Entity Representations](https://arxiv.org/abs/2010.11784v2)\". The main idea behind this approach is to reshape an initial concept embedding space such that synonyms of the same concept are pulled closer together and unrelated concepts are pushed further apart. The concept embeddings from this reshaped space can then be used to build a knowledge base embedding index. This index stores concept IDs mapped to their respective concept embeddings in a format conducive to efficient nearest neighbor search. We can link query concepts to their canonical forms in the knowledge base by performing a nearest neighbor search, matching concept query embeddings to the most similar concept embeddings in the knowledge base index. \n", - "\n", - "In this tutorial we will be using the [faiss](https://github.com/facebookresearch/faiss) library to build our concept index." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Self Alignment Pretraining\n", - "Self-Alignment pretraining is a second stage pretraining of an existing encoder (called second stage because the encoder model can be further finetuned after this more general pretraining step). The dataset used during training consists of pairs of concept synonyms that map to the same ID. 
At each training iteration, we only select *hard* examples present in the mini batch to calculate the loss and update the model weights. In this context, a hard example is an example where a concept is closer to an unrelated concept in the mini batch than it is to the synonym concept it is paired with by some margin. I encourage you to take a look at [section 2 of the paper](https://arxiv.org/pdf/2010.11784.pdf) for a more formal and in-depth description of how hard examples are selected.\n", - "\n", - "We then use a [metric learning loss](https://openaccess.thecvf.com/content_CVPR_2019/papers/Wang_Multi-Similarity_Loss_With_General_Pair_Weighting_for_Deep_Metric_Learning_CVPR_2019_paper.pdf) calculated from the hard examples selected. This loss helps reshape the embedding space. The concept representation space is rearranged to be more suitable for entity matching via embedding cosine similarity. \n", - "\n", - "Now that we have an idea of what's going on, let's get started!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dataset Preprocessing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Download data into project directory\n", - "PROJECT_DIR = \".\" # Change if you don't want the current directory to be the project dir\n", - "DATA_DIR = os.path.join(PROJECT_DIR, \"tiny_example_data\")\n", - "\n", - "if not os.path.isdir(os.path.join(DATA_DIR)):\n", - " wget.download('https://dldata-public.s3.us-east-2.amazonaws.com/tiny_example_data.zip',\n", - " os.path.join(PROJECT_DIR, \"tiny_example_data.zip\"))\n", - "\n", - " !unzip {PROJECT_DIR}/tiny_example_data.zip -d {PROJECT_DIR}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this tutorial we will be using a tiny toy dataset to demonstrate how to use NeMo's entity linking model functionality. The dataset includes synonyms for 12 medical concepts. Entity phrases with the same ID are synonyms for the same concept. For example, \"*chronic kidney failure*\", \"*gradual loss of kidney function*\", and \"*CKD*\" are all synonyms of concept ID 5. Here's the dataset before preprocessing:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_data = pd.read_csv(os.path.join(DATA_DIR, \"tiny_example_dev_data.csv\"), names=[\"ID\", \"CONCEPT\"], index_col=False)\n", - "print(raw_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We've already paired off the concepts for this dataset with the format `ID concept_synonym1 concept_synonym2`. Here are the first ten rows:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "training_data = pd.read_table(os.path.join(DATA_DIR, \"tiny_example_train_pairs.tsv\"), names=[\"ID\", \"CONCEPT_SYN1\", \"CONCEPT_SYN2\"], delimiter='\\t')\n", - "print(training_data.head(10))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use the [Unified Medical Language System (UMLS)](https://www.nlm.nih.gov/research/umls/index.html) dataset for full medical domain entity linking training. The data contains over 9 million entities and is a table of medical concepts with their corresponding concept IDs (CUI). 
After [requesting a free license and making a UMLS Terminology Services (UTS) account](https://www.nlm.nih.gov/research/umls/index.html), the [entire UMLS dataset](https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html) can be downloaded from the NIH's website. If you've cloned the NeMo repo you can run the data processing script located in `examples/nlp/entity_linking/data/umls_dataset_processing.py` on the full dataset. This script will take in the initial table of UMLS concepts and produce a .tsv file with each row formatted as `CUI\\tconcept_synonym1\\tconcept_synonym2`. Once the UMLS dataset .RRF file is downloaded, the script can be run from the `examples/nlp/entity_linking` directory like so: \n", - "```\n", - "python data/umls_dataset_processing.py\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Model Training" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Second stage pretrain a BERT Base encoder on the self-alignment pretraining task (SAP) for improved entity linking. Using a GPU, the model should take 5 minutes or less to train on this example dataset and training progress will be output below the cell." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Download config\n", - "wget.download(f\"https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml\",\n", - " os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_config.yaml\"))\n", - "\n", - "# Load in config file\n", - "cfg = OmegaConf.load(os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_config.yaml\"))\n", - "\n", - "# Set config file variables\n", - "cfg.project_dir = PROJECT_DIR\n", - "cfg.model.nemo_path = os.path.join(PROJECT_DIR, \"tiny_example_sap_bert_model.nemo\")\n", - "cfg.model.train_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_train_pairs.tsv\")\n", - "cfg.model.validation_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_validation_pairs.tsv\")\n", - "\n", - "# remove distributed training flags\n", - "cfg.trainer.strategy = 'auto'\n", - "cfg.trainer.accelerator = 'auto'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize the trainer and model\n", - "trainer = Trainer(**cfg.trainer)\n", - "exp_manager(trainer, cfg.get(\"exp_manager\", None))\n", - "model = nemo_nlp.models.EntityLinkingModel(cfg=cfg.model, trainer=trainer)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Train and save the model\n", - "trainer.fit(model)\n", - "model.save_to(cfg.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can run the script at `examples/nlp/entity_linking/self_alignment_pretraining.py` to train a model on a larger dataset. Run\n", - "\n", - "```\n", - "python self_alignment_pretraining.py project_dir=.\n", - "```\n", - "from the `examples/nlp/entity_linking` directory." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Model Evaluation\n", - "\n", - "Let's evaluate our freshly trained model and compare its performance with a BERT Base encoder that hasn't undergone self-alignment pretraining. We first need to restore our trained model and load our BERT Base Baseline model." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", - "\n", - "# Restore second stage pretrained model\n", - "sap_model_cfg = cfg\n", - "sap_model_cfg.index.index_save_name = os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_index\")\n", - "sap_model_cfg.index.index_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_index_data.tsv\")\n", - "sap_model = nemo_nlp.models.EntityLinkingModel.restore_from(sap_model_cfg.model.nemo_path).to(device)\n", - "\n", - "# Load original model\n", - "base_model_cfg = OmegaConf.load(os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_config.yaml\"))\n", - "\n", - "# Set train/val datasets to None to avoid loading datasets associated with training\n", - "base_model_cfg.model.train_ds = None\n", - "base_model_cfg.model.validation_ds = None\n", - "base_model_cfg.index.index_save_name = os.path.join(PROJECT_DIR, \"base_model_index\")\n", - "base_model_cfg.index.index_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_index_data.tsv\")\n", - "base_model = nemo_nlp.models.EntityLinkingModel(base_model_cfg.model).to(device)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We are going to evaluate our model on a nearest neighbor task using top 1 and top 5 accuracies as our metrics. We will be using a tiny example test knowledge base and test queries. For this evaluation we are going to be comparing every test query with every concept vector in our test set knowledge base. We will rank each item in the knowledge base by its cosine similarity with the test query. We'll then compare the IDs of the predicted most similar test knowledge base concepts with our ground truth query IDs to calculate top 1 and top 5 accuracies. For these metrics, higher is better." 
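The evaluation code that follows scores every query against every knowledge base concept with a plain matrix product. As a quick aside, such a product equals cosine similarity only when both sets of embeddings are L2-normalized, which is an assumption in the minimal NumPy sketch below; the toy array shapes and values are made up for illustration and are not part of the tutorial's dataset.

```python
import numpy as np

# Toy embeddings: 3 knowledge base concepts and 2 queries, dimension 4.
kb_embs = np.random.rand(3, 4).astype(np.float32)
query_embs = np.random.rand(2, 4).astype(np.float32)

# L2-normalize so that a dot product equals cosine similarity.
kb_embs /= np.linalg.norm(kb_embs, axis=1, keepdims=True)
query_embs /= np.linalg.norm(query_embs, axis=1, keepdims=True)

# Entry [i, j] is the cosine similarity between query i and concept j.
score_matrix = np.matmul(query_embs, kb_embs.T)

# Indices of the top-2 most similar knowledge base concepts per query.
top2 = np.argpartition(score_matrix, -2, axis=1)[:, -2:]
print(top2)
```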
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Helper function to get data embeddings\n", - "def get_embeddings(model, dataloader):\n", - " embeddings, cids = [], []\n", - "\n", - " with torch.no_grad():\n", - " for batch in tqdm(dataloader):\n", - " input_ids, token_type_ids, attention_mask, batch_cids = batch\n", - " batch_embeddings = model.forward(input_ids=input_ids.to(device), \n", - " token_type_ids=token_type_ids.to(device), \n", - " attention_mask=attention_mask.to(device))\n", - "\n", - " # Accumulate index embeddings and their corresponding IDs\n", - " embeddings.extend(batch_embeddings.cpu().detach().numpy())\n", - " cids.extend(batch_cids)\n", - " \n", - " return embeddings, cids" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def evaluate(model, test_kb, test_queries, ks):\n", - " # Initialize knowledge base and query data loaders\n", - " test_kb_dataloader = model.setup_dataloader(test_kb, is_index_data=True)\n", - " test_query_dataloader = model.setup_dataloader(test_queries, is_index_data=True)\n", - " \n", - " # Get knowledge base and query embeddings\n", - " test_kb_embs, test_kb_cids = get_embeddings(model, test_kb_dataloader)\n", - " test_query_embs, test_query_cids = get_embeddings(model, test_query_dataloader)\n", - "\n", - " # Calculate the cosine distance between each query and knowledge base concept\n", - " score_matrix = np.matmul(np.array(test_query_embs), np.array(test_kb_embs).T)\n", - " accs = {k : 0 for k in ks}\n", - " \n", - " # Compare the knowledge base IDs of the knowledge base entities with \n", - " # the smallest cosine distance from the query \n", - " for query_idx in tqdm(range(len(test_query_cids))):\n", - " query_emb = test_query_embs[query_idx]\n", - " query_cid = test_query_cids[query_idx]\n", - " query_scores = score_matrix[query_idx]\n", - "\n", - " for k in ks:\n", - " topk_idxs = np.argpartition(query_scores, -k)[-k:]\n", - " topk_cids = [test_kb_cids[idx] for idx in topk_idxs]\n", - " \n", - " # If the correct query ID is among the top k closest kb IDs\n", - " # the model correctly linked the entity\n", - " match = int(query_cid in topk_cids)\n", - " accs[k] += match\n", - "\n", - " for k in ks:\n", - " accs[k] /= len(test_query_cids)\n", - " \n", - " return accs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create configs for our test data\n", - "test_kb = OmegaConf.create({\n", - " \"data_file\": os.path.join(DATA_DIR, \"tiny_example_test_kb.tsv\"),\n", - " \"max_seq_length\": 128,\n", - " \"batch_size\": 10,\n", - " \"shuffle\": False,\n", - "})\n", - "\n", - "test_queries = OmegaConf.create({\n", - " \"data_file\": os.path.join(DATA_DIR, \"tiny_example_test_queries.tsv\"),\n", - " \"max_seq_length\": 128,\n", - " \"batch_size\": 10,\n", - " \"shuffle\": False,\n", - "})\n", - "\n", - "ks = [1, 5]\n", - "\n", - "# Evaluate both models on our test data\n", - "base_accs = evaluate(base_model, test_kb, test_queries, ks)\n", - "base_accs[\"Model\"] = \"BERT Base Baseline\"\n", - "\n", - "sap_accs = evaluate(sap_model, test_kb, test_queries, ks)\n", - "sap_accs[\"Model\"] = \"BERT + SAP\"\n", - "\n", - "print(\"Top 1 and Top 5 Accuracy Comparison:\")\n", - "results_df = pd.DataFrame([base_accs, sap_accs], columns=[\"Model\", 1, 5])\n", - "results_df = results_df.style.set_properties(**{'text-align': 'left', 
}).set_table_styles([dict(selector='th', props=[('text-align', 'left')])])\n", - "display(results_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The purpose of this section was to show an example of evaluating your entity linking model. This evaluation set contains very little data, and no serious conclusions should be drawn about model performance. Top 1 accuracy should be between 0.7 and 1.0 for both models and top 5 accuracy should be between 0.8 and 1.0. When evaluating a model trained on a larger dataset, you can use a nearest neighbors index to speed up the evaluation time." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Building an Index" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To qualitatively observe the improvement we gain from the second stage pretraining, let's build two indices. One will be built with BERT base embeddings before self-alignment pretraining and one will be built with the model we just trained. Our knowledge base in this tutorial will be in the same domain and have some overlapping concepts as the training set. This data file is formatted as `ID\\tconcept`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `EntityLinkingDataset` class can load the data used for training the entity linking encoder as well as for building the index if the `is_index_data` flag is set to true. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def build_index(cfg, model):\n", - " # Setup index dataset loader\n", - " index_dataloader = model.setup_dataloader(cfg.index.index_ds, is_index_data=True)\n", - " \n", - " # Get index dataset embeddings\n", - " embeddings, _ = get_embeddings(model, index_dataloader)\n", - " \n", - " # Train IVFFlat index using faiss\n", - " embeddings = np.array(embeddings)\n", - " quantizer = faiss.IndexFlatL2(cfg.index.dims)\n", - " index = faiss.IndexIVFFlat(quantizer, cfg.index.dims, cfg.index.nlist)\n", - " index = faiss.index_cpu_to_all_gpus(index)\n", - " index.train(embeddings)\n", - " \n", - " # Add concept embeddings to index\n", - " for i in tqdm(range(0, embeddings.shape[0], cfg.index.index_batch_size)):\n", - " index.add(embeddings[i:i+cfg.index.index_batch_size])\n", - "\n", - " # Save index\n", - " faiss.write_index(faiss.index_gpu_to_cpu(index), cfg.index.index_save_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "build_index(sap_model_cfg, sap_model.to(device))\n", - "build_index(base_model_cfg, base_model.to(device))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Entity Linking via Nearest Neighbor Search" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now it's time to query our indices! We are going to query both our index built with embeddings from BERT Base, and our index with embeddings built from the SAP BERT model we trained. Our sample query phrases will be \"*high blood sugar*\" and \"*head pain*\". \n", - "\n", - "To query our indices, we first need to get the embedding of each query from the corresponding encoder model. We can then pass these query embeddings into the faiss index which will perform a nearest neighbor search, using cosine distance to compare the query embedding with embeddings present in the index. 
Once we get a list of knowledge base index concept IDs most closely matching our query, all that is left to do is map the IDs to a representative string describing the concept. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def query_index(cfg, model, index, queries, id2string):\n", - " # Get query embeddings from our entity linking encoder model\n", - " query_embs = get_query_embedding(queries, model).cpu().detach().numpy()\n", - " \n", - " # Use query embedding to find closest concept embedding in knowledge base\n", - " distances, neighbors = index.search(query_embs, cfg.index.top_n)\n", - " \n", - " # Get the canonical strings corresponding to the IDs of the query's nearest neighbors in the kb \n", - " neighbor_concepts = [[id2string[concept_id] for concept_id in query_neighbor] \\\n", - " for query_neighbor in neighbors]\n", - " \n", - " # Display most similar concepts in the knowledge base. \n", - " for query_idx in range(len(queries)):\n", - " print(f\"\\nThe most similar concepts to {queries[query_idx]} are:\")\n", - " for cid, concept, dist in zip(neighbors[query_idx], neighbor_concepts[query_idx], distances[query_idx]):\n", - " print(cid, concept, 1 - dist)\n", - "\n", - " \n", - "def get_query_embedding(queries, model):\n", - " # Tokenize our queries\n", - " model_input = model.tokenizer(queries,\n", - " add_special_tokens = True,\n", - " padding = True,\n", - " truncation = True,\n", - " max_length = 512,\n", - " return_token_type_ids = True,\n", - " return_attention_mask = True)\n", - " \n", - " # Pass tokenized input into model\n", - " query_emb = model.forward(input_ids=torch.LongTensor(model_input[\"input_ids\"]).to(device),\n", - " token_type_ids=torch.LongTensor(model_input[\"token_type_ids\"]).to(device),\n", - " attention_mask=torch.LongTensor(model_input[\"attention_mask\"]).to(device))\n", - " \n", - " return query_emb" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load indices\n", - "sap_index = faiss.read_index(sap_model_cfg.index.index_save_name)\n", - "base_index = faiss.read_index(base_model_cfg.index.index_save_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Map concept IDs to one canonical string\n", - "index_data = open(sap_model_cfg.index.index_ds.data_file, \"r\", encoding='utf-8-sig')\n", - "id2string = {}\n", - "\n", - "for line in index_data:\n", - " cid, concept = line.split(\"\\t\")\n", - " id2string[int(cid) - 1] = concept.strip()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "id2string" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Some sample queries\n", - "queries = [\"high blood sugar\", \"head pain\"]\n", - "\n", - "# Query BERT Base\n", - "print(\"BERT Base output before Self Alignment Pretraining:\")\n", - "query_index(base_model_cfg, base_model, base_index, queries, id2string)\n", - "print(\"\\n\" + \"-\" * 50 + \"\\n\")\n", - "\n", - "# Query SAP BERT\n", - "print(\"SAP BERT output after Self Alignment Pretraining:\")\n", - "query_index(sap_model_cfg, sap_model, sap_index, queries, id2string)\n", - "print(\"\\n\" + \"-\" * 50 + \"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Even after only training on this tiny amount of data, the qualitative performance 
boost from self-alignment pretraining is visible. The baseline model links \"*high blood sugar*\" to the entity \"*6 diabetes*\" while our SAP BERT model accurately links \"*high blood sugar*\" to \"*Hyperinsulinemia*\". Similarly, \"*head pain*\" and \"*Myocardial infraction*\" are not the same concept, but \"*head pain*\" and \"*Headache*\" are." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For larger knowledge bases, keeping the default embedding size might use too much memory and cause out-of-memory issues. You can apply PCA or some other dimensionality reduction method to your data to reduce its memory footprint (a short sketch of this appears at the end of this notebook). Code for creating a text file of all the UMLS entities in the correct format needed to build an index, and for creating a dictionary mapping concept IDs to canonical concept strings, can be found in `examples/nlp/entity_linking/data/umls_dataset_processing.py`. \n", - "\n", - "The code for extracting knowledge base concept embeddings, training and applying a PCA transformation to the embeddings, building a faiss index and querying the index from the command line is located at `examples/nlp/entity_linking/build_index.py` and `examples/nlp/entity_linking/query_index.py`. \n", - "\n", - "If you've cloned the NeMo repo, both of these steps can be run as follows on the command line from the `examples/nlp/entity_linking/` directory.\n", - "\n", - "```\n", - "python data/umls_dataset_processing.py --index\n", - "python build_index.py --restore\n", - "python query_index.py --restore\n", - "```\n", - "By default the project directory will be \".\" but can be changed by adding the flag `--project_dir=` after each of the above commands. Intermediate steps of the index building process are saved. If an error occurs, previously completed steps do not need to be rerun. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Command Recap" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here is a recap of the commands and steps to repeat this process on the full UMLS dataset. \n", - "\n", - "1) Download the UMLS dataset file `MRCONSO.RRF` from the NIH website and place it in the `examples/nlp/entity_linking/data` directory.\n", - "\n", - "2) Run the following commands from the `examples/nlp/entity_linking` directory\n", - "```\n", - "python data/umls_dataset_processing.py\n", - "python self_alignment_pretraining.py project_dir=. \n", - "python data/umls_dataset_processing.py --index\n", - "python build_index.py --restore\n", - "python query_index.py --restore\n", - "```\n", - "The model will take ~24hrs to train on two GPUs and ~48hrs to train on one GPU. By default the project directory will be \".\" but can be changed by adding the flag `--project_dir=` after each of the above commands and changing `project_dir=` in the `self_alignment_pretraining.py` command. If you change the project directory, you should also move the `MRCONSO.RRF` file to a `data` subdirectory within the one you've specified. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As mentioned in the introduction, entity linking within NVIDIA NeMo is not limited to the medical domain. The same data processing and training steps can be applied to a variety of domains and use cases. You can edit the datasets used as well as training and loss function hyperparameters within your config file to better suit your domain." 
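To make the dimensionality reduction suggestion above concrete, here is a minimal sketch of shrinking concept embeddings with PCA before indexing them with faiss. The 768-to-256 reduction, the array sizes, and the use of scikit-learn are illustrative assumptions; the bundled `build_index.py` script may implement this step differently.

```python
import faiss
import numpy as np
from sklearn.decomposition import PCA

# Illustrative data: 10k concept embeddings of dimension 768, reduced to 256.
embeddings = np.random.rand(10000, 768).astype(np.float32)

pca = PCA(n_components=256)
reduced = pca.fit_transform(embeddings).astype(np.float32)

# Build a flat L2 index over the reduced embeddings.
index = faiss.IndexFlatL2(reduced.shape[1])
index.add(reduced)

# Queries must be projected with the same fitted PCA before searching.
query = np.random.rand(1, 768).astype(np.float32)
distances, neighbors = index.search(pca.transform(query).astype(np.float32), 5)
print(neighbors)
```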
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/tutorials/nlp/GLUE_Benchmark.ipynb b/tutorials/nlp/GLUE_Benchmark.ipynb deleted file mode 100644 index 0905d232a9c1..000000000000 --- a/tutorials/nlp/GLUE_Benchmark.ipynb +++ /dev/null @@ -1,566 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "GLUE_Benchmark.ipynb", - "provenance": [], - "private_outputs": true, - "collapsed_sections": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU", - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "source": [], - "metadata": { - "collapsed": false - } - } - } - }, - "cells": [ - { - "cell_type": "code", - "metadata": { - "id": "o_0K1lsW1dj9", - "colab_type": "code", - "colab": {} - }, - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell\n", - "\n", - "# install NeMo\n", - "BRANCH = 'r2.0.0rc0'\n!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "id": "JFWG-jYCfvD7", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# If you're not using Colab, you might need to upgrade jupyter notebook to avoid the following error:\n", - "# 'ImportError: IProgress not found. Please update jupyter and ipywidgets.'\n", - "\n", - "! pip install ipywidgets\n", - "! jupyter nbextension enable --py widgetsnbextension\n", - "\n", - "# Please restart the kernel after running this cell" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "dzqD2WDFOIN-", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from nemo.collections import nlp as nemo_nlp\n", - "from nemo.utils.exp_manager import exp_manager\n", - "\n", - "import os\n", - "import wget \n", - "import torch\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "daYw_Xll2ZR9", - "colab_type": "text" - }, - "source": [ - "In this tutorial, we are going to describe how to finetune a BERT-like model based on [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) on [GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding](https://openreview.net/pdf?id=rJ4km2R5t7). 
\n", - "\n", - "# GLUE tasks\n", - "GLUE Benchmark includes 9 natural language understanding tasks:\n", - "\n", - "## Single-Sentence Tasks\n", - "\n", - "* CoLA - [The Corpus of Linguistic Acceptability](https://arxiv.org/abs/1805.12471) is a set of English sentences from published linguistics literature. The task is to predict whether a given sentence is grammatically correct or not.\n", - "* SST-2 - [The Stanford Sentiment Treebank](https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf) consists of sentences from movie reviews and human annotations of their sentiment. The task is to predict the sentiment of a given sentence: positive or negative.\n", - "\n", - "## Similarity and Paraphrase tasks\n", - "\n", - "* MRPC - [The Microsoft Research Paraphrase Corpus](https://www.aclweb.org/anthology/I05-5002.pdf) is a corpus of sentence pairs automatically extracted from online news sources, with human annotations for whether the sentences in the pair are semantically equivalent.\n", - "* QQP - [The Quora Question Pairs](https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs) dataset is a collection of question pairs from the community question-answering website Quora. The task is to determine whether a pair of questions are semantically equivalent.\n", - "* STS-B - [The Semantic Textual Similarity Benchmark](https://arxiv.org/abs/1708.00055) is a collection of sentence pairs drawn from news headlines, video, and image captions, and natural language inference data. The task is to determine how similar two sentences are.\n", - "\n", - "## Inference Tasks\n", - "\n", - "* MNLI - [The Multi-Genre Natural Language Inference Corpus](https://cims.nyu.edu/~sbowman/multinli/multinli_0.9.pdf) is a crowdsourced collection of sentence pairs with textual entailment annotations. Given a premise sentence and a hypothesis sentence, the task is to predict whether the premise entails the hypothesis (entailment), contradicts the hypothesis (contradiction), or neither (neutral). The task has the matched (in-domain) and mismatched (cross-domain) sections.\n", - "* QNLI - [The Stanford Question Answering Dataset](https://nlp.stanford.edu/pubs/rajpurkar2016squad.pdf) is a question-answering dataset consisting of question-paragraph pairs, where one of the sentences in the paragraph (drawn from Wikipedia) contains the answer to the corresponding question. The task is to determine whether the context sentence contains the answer to the question.\n", - "* RTE The Recognizing Textual Entailment (RTE) datasets come from a series of annual [textual entailment challenges](https://aclweb.org/aclwiki/Recognizing_Textual_Entailment). The task is to determine whether the second sentence is the entailment of the first one or not.\n", - "* WNLI - The Winograd Schema Challenge is a reading comprehension task in which a system must read a sentence with a pronoun and select the referent of that pronoun from a list of choices (Hector Levesque, Ernest Davis, and Leora Morgenstern. The winograd schema challenge. In Thirteenth International Conference on the Principles of Knowledge Representation and Reasoning. 2012).\n", - "\n", - "All tasks are classification tasks, except for the STS-B task which is a regression task. All classification tasks are 2-class problems, except for the MNLI task which has 3-classes.\n", - "\n", - "More details about GLUE benchmark could be found [here](https://gluebenchmark.com/)." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZnuziSwJ1yEB", - "colab_type": "text" - }, - "source": [ - "# Datasets\n", - "\n", - "**To proceed further, you need to download the GLUE data.** For example, you can download [this script](https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py) using `wget` and then execute it by running:\n", - "\n", - "`python download_glue_data.py`\n", - "\n", - "Use `--tasks TASK` if you only need datasets for selected GLUE tasks.\n", - "\n", - "After running the above commands, you will have a folder `glue_data` with data folders for every GLUE task. For example, data for the MRPC task would be under `glue_data/MRPC`.\n", - "\n", - "This tutorial and [examples/nlp/glue_benchmark/glue_benchmark.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/nlp/glue_benchmark/glue_benchmark.py) work with all GLUE tasks without any modifications. For this tutorial, we are going to use the MRPC task.\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "--wJ2891aIIE", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# supported task names: [\"cola\", \"sst-2\", \"mrpc\", \"sts-b\", \"qqp\", \"mnli\", \"qnli\", \"rte\", \"wnli\"]\n", - "TASK = 'mrpc'\n", - "DATA_DIR = 'glue_data/MRPC'\n", - "WORK_DIR = \"WORK_DIR\"\n", - "MODEL_CONFIG = 'glue_benchmark_config.yaml'" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "qB0oLE4R9EhJ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "! ls -l $DATA_DIR" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gMWuU69pbUDe", - "colab_type": "text" - }, - "source": [ - "For each task, there are 3 files: `train.tsv, dev.tsv, and test.tsv`. Note, MNLI has 2 dev sets: matched and mismatched; evaluation on both dev sets will be done automatically." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "6UDPgadLN6SG", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# let's take a look at the training data \n", - "! head -n 5 {DATA_DIR}/train.tsv" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_whKCxfTMo6Y", - "colab_type": "text" - }, - "source": [ - "# Model configuration\n", - "\n", - "Now, let's take a closer look at the model's configuration and learn to train the model.\n", - "\n", - "The GLUE model is composed of the pretrained [BERT](https://arxiv.org/pdf/1810.04805.pdf) model followed by a Sequence Regression module (for the STS-B task) or a Sequence Classifier module (for the rest of the tasks).\n", - "\n", - "The model is defined in a config file which declares multiple important sections. 
They are:\n", - "- **model**: All arguments that are related to the Model - language model, a classifier, optimizer and schedulers, datasets and any other related information\n", - "\n", - "- **trainer**: Any argument to be passed to PyTorch Lightning" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "T1gA8PsJ13MJ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# download the model's configuration file \n", - "config_dir = WORK_DIR + '/configs/'\n", - "os.makedirs(config_dir, exist_ok=True)\n", - "if not os.path.exists(config_dir + MODEL_CONFIG):\n", - " print('Downloading config file...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/glue_benchmark/' + MODEL_CONFIG, config_dir)\n", - "else:\n", - " print ('config file is already exists')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "mX3KmWMvSUQw", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# this line will print the entire config of the model\n", - "config_path = f'{WORK_DIR}/configs/{MODEL_CONFIG}'\n", - "print(config_path)\n", - "config = OmegaConf.load(config_path)\n", - "print(OmegaConf.to_yaml(config))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZCgWzNBkaQLZ", - "colab_type": "text" - }, - "source": [ - "# Model Training\n", - "## Setting up Data within the config\n", - "\n", - "Among other things, the config file contains dictionaries called **dataset**, **train_ds** and **validation_ds**. These are configurations used to setup the Dataset and DataLoaders of the corresponding config.\n", - "\n", - "We assume that both training and evaluation files are located in the same directory, and use the default names mentioned during the data download step. \n", - "So, to start model training, we simply need to specify `model.dataset.data_dir`, like we are going to do below.\n", - "\n", - "Also notice that some config lines, including `model.dataset.data_dir`, have `???` in place of paths, this means that values for these fields are required to be specified by the user.\n", - "\n", - "Let's now add the data directory path, task name and output directory for saving predictions to the config." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "LQHCJN-ZaoLp", - "colab_type": "code", - "colab": {} - }, - "source": [ - "config.model.task_name = TASK\n", - "config.model.output_dir = WORK_DIR\n", - "config.model.dataset.data_dir = DATA_DIR" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nB96-3sTc3yk", - "colab_type": "text" - }, - "source": [ - "## Building the PyTorch Lightning Trainer\n", - "\n", - "NeMo models are primarily PyTorch Lightning modules - and therefore are entirely compatible with the PyTorch Lightning ecosystem.\n", - "\n", - "Let's first instantiate a Trainer object:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "1tG4FzZ4Ui60", - "colab_type": "code", - "colab": {} - }, - "source": [ - "print(\"Trainer config - \\n\")\n", - "print(OmegaConf.to_yaml(config.trainer))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "knF6QeQQdMrH", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# let's modify some trainer configs\n", - "# checks if we have GPU available and uses it\n", - "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", - "config.trainer.devices = 1\n", - "config.trainer.accelerator = accelerator\n", - "\n", - "config.trainer.precision = 16 if torch.cuda.is_available() else 32\n", - "\n", - "# for mixed precision training, uncomment the line below (precision should be set to 16 and amp_level to 'O1'):\n", - "# config.trainer.amp_level = 'O1'\n", - "\n", - "# remove distributed training flags\n", - "config.trainer.strategy = 'auto'\n", - "\n", - "# set up the max number of steps to reduce training time for demonstration purposes of this tutorial\n", - "config.trainer.max_steps = 128\n", - "\n", - "trainer = pl.Trainer(**config.trainer)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8IlEMdVxdr6p", - "colab_type": "text" - }, - "source": [ - "## Setting up a NeMo Experiment\n", - "\n", - "NeMo has an experiment manager that handles logging and checkpointing for us, so let's use it:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "8uztqGAmdrYt", - "colab_type": "code", - "colab": {} - }, - "source": [ - "exp_dir = exp_manager(trainer, config.get(\"exp_manager\", None))\n", - "\n", - "# the exp_dir provides a path to the current experiment for easy access\n", - "exp_dir = str(exp_dir)\n", - "exp_dir" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8tjLhUvL_o7_", - "colab_type": "text" - }, - "source": [ - "Before initializing the model, we might want to modify some of the model configs. 
For example, we might want to modify the pretrained BERT model and use [Megatron-LM BERT](https://arxiv.org/abs/1909.08053) or the [ALBERT model](https://arxiv.org/abs/1909.11942):" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Xeuc2i7Y_nP5", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# get the list of supported BERT-like models; for the complete list of Hugging Face models, see https://huggingface.co/models\n", - "print(nemo_nlp.modules.get_pretrained_lm_models_list(include_external=True))\n", - "\n", - "# specify the BERT-like model you want to use, for example, \"megatron-bert-345m-uncased\" or 'bert-base-uncased'\n", - "PRETRAINED_BERT_MODEL = \"albert-base-v1\"" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "RK2xglXyAUOO", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# add the model parameters specified above to the config\n", - "config.model.language_model.pretrained_model_name = PRETRAINED_BERT_MODEL" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fzNZNAVRjDD-", - "colab_type": "text" - }, - "source": [ - "Now, we are ready to initialize our model. During the model initialization call, the dataset and data loaders will be prepared for training and evaluation.\n", - "Also, the pretrained BERT model will be downloaded; note that it can take up to a few minutes depending on the size of the chosen BERT model." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "NgsGLydWo-6-", - "colab_type": "code", - "colab": {} - }, - "source": [ - "model = nemo_nlp.models.GLUEModel(cfg=config.model, trainer=trainer)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kQ592Tx4pzyB", - "colab_type": "text" - }, - "source": [ - "## Monitoring training progress\n", - "Optionally, you can create a Tensorboard visualization to monitor training progress." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "mTJr16_pp0aS", - "colab_type": "code", - "colab": {} - }, - "source": [ - "try:\n", - " from google import colab\n", - " COLAB_ENV = True\n", - "except (ImportError, ModuleNotFoundError):\n", - " COLAB_ENV = False\n", - "\n", - "# Load the TensorBoard notebook extension\n", - "if COLAB_ENV:\n", - " %load_ext tensorboard\n", - " %tensorboard --logdir {exp_dir}\n", - "else:\n", - " print(\"To use tensorboard, please use this notebook in a Google Colab environment.\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CFgAlaIdndjW", - "colab_type": "text" - }, - "source": [ - "Note, it’s recommended to finetune the model on each task separately. Also, based on [GLUE Benchmark FAQ#12](https://gluebenchmark.com/faq), there might be some differences in dev/test distributions for the QQP task and in train/dev for the WNLI task." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "hUvnSpyjp0Dh", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# start model training\n", - "trainer.fit(model)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ref1qSonGNhP", - "colab_type": "text" - }, - "source": [ - "## Training Script\n", - "\n", - "If you have NeMo installed locally, you can also train the model with [examples/nlp/glue_benchmark/glue_benchmark.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/nlp/glue_benchmark/glue_benchmark.py).\n", - "\n", - "To run the training script, use:\n", - "\n", - "`python glue_benchmark.py \\\n", - " model.dataset.data_dir=PATH_TO_DATA_DIR \\\n", - " model.task_name=TASK`\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KVPFofXaoKNE", - "colab_type": "text" - }, - "source": [ - "Average results after 3 runs:\n", - "\n", - "| Task | Metric | ALBERT-large | ALBERT-xlarge | Megatron-345m | BERT base paper | BERT large paper |\n", - "|-------|--------------------------|--------------|---------------|---------------|-----------------|------------------|\n", - "| CoLA | Matthew's correlation | 54.94 | 61.72 | 64.56 | 52.1 | 60.5 |\n", - "| SST-2 | Accuracy | 92.74 | 91.86 | 95.87 | 93.5 | 94.9 |\n", - "| MRPC | F1/Accuracy | 92.05/88.97 | 91.87/88.61 | 92.36/89.46 | 88.9/- | 89.3/- |\n", - "| STS-B | Pearson/Spearman corr. | 90.41/90.21 | 90.07/90.10 | 91.51/91.61 | -/85.8 | -/86.5 |\n", - "| QQP | F1/Accuracy | 88.26/91.26 | 88.80/91.65 | 89.18/91.91 | 71.2/- | 72.1/- |\n", - "| MNLI | Matched/Mismatched acc. | 86.69/86.81 | 88.66/88.73 | 89.86/89.81 | 84.6/83.4 | 86.7/85.9 |\n", - "| QNLI | Accuracy | 92.68 | 93.66 | 94.33 | 90.5 | 92.7 |\n", - "| RTE | Accuracy | 80.87 | 82.86 | 83.39 | 66.4 | 70.1 |\n", - "\n", - "The WNLI task was excluded from the experiments due to the problematic WNLI set.\n", - "The dev sets were used for evaluation for the ALBERT and Megatron models, and the test set results are from [the BERT paper](https://arxiv.org/abs/1810.04805).\n", - "\n", - "Hyperparameters used to get the results in the above table can be found in the table below. Some tasks could be further finetuned to improve performance numbers; the tables are for baseline reference only.\n", - "Each cell in the table represents the following parameters:\n", - "Number of GPUs used / Batch Size / Learning Rate / Number of Epochs. 
For not specified parameters, please refer to the default parameters in the training script.\n", - "\n", - "| Task | ALBERT-large | ALBERT-xlarge | Megatron-345m |\n", - "|-------|--------------|---------------|---------------|\n", - "| CoLA | 1 / 32 / 1e-5 / 3 | 1 / 32 / 1e-5 / 10 | 4 / 16 / 2e-5 / 12 |\n", - "| SST-2 | 4 / 16 / 2e-5 / 5 | 4 / 16 / 2e-5 /12 | 4 / 16 / 2e-5 / 12 |\n", - "| MRPC | 1 / 32 / 1e-5 / 5 | 1 / 16 / 2e-5 / 5 | 1 / 16 / 2e-5 / 10 |\n", - "| STS-B | 1 / 16 / 2e-5 / 5 | 1 / 16 / 4e-5 / 12 | 4 / 16 / 3e-5 / 12 |\n", - "| QQP | 1 / 16 / 2e-5 / 5 | 4 / 16 / 1e-5 / 12 | 4 / 16 / 1e-5 / 12 |\n", - "| MNLI | 4 / 64 / 1e-5 / 5 | 4 / 32 / 1e-5 / 5 | 4 / 32 / 1e-5 / 5 | \n", - "| QNLI | 4 / 16 / 1e-5 / 5 | 4 / 16 / 1e-5 / 5 | 4 / 16 / 2e-5 / 5 | \n", - "| RTE | 1 / 16 / 1e-5 / 5 | 1 / 16 / 1e-5 / 12 | 4 / 16 / 3e-5 / 12 |\n" - ] - } - ] -} diff --git a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb index a0ff0faf511b..b21fdfe36020 100644 --- a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb +++ b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb @@ -749,7 +749,7 @@ "source": [ "### Optimizing Threshold\n", "\n", - "As mentioned above, when classifiying a given query such as `show all flights and fares from denver to san francisco`, our model checks whether each individual intent would be suitable. Before assigning the final labels for a query, the model assigns a probability an intent matches the query. For example, if our `dict.intents.csv` had 5 different intents, then the model could output for a given query \\[0.52, 0.38, 0.21, 0.67. 0.80\\] where each value represents the probability that query matches that particular intent. \n", + "As mentioned above, when classifying a given query such as `show all flights and fares from denver to san francisco`, our model checks whether each individual intent would be suitable. Before assigning the final labels for a query, the model assigns a probability an intent matches the query. For example, if our `dict.intents.csv` had 5 different intents, then the model could output for a given query \\[0.52, 0.38, 0.21, 0.67. 0.80\\] where each value represents the probability that query matches that particular intent. \n", "\n", "We need to use these probabilities to generate final label predictions of 0 or 1 for each label. While we can use 0.5 as the probability threshold, it is usually the case that there is a better threshold to use depending on the metric we want to optimize. For this tutorial, we will be finding the threshold that gives us the best micro-F1 score on the validation set. After running the `optimize_threshold` method, the threshold attribute for our model will be updated." ] diff --git a/tutorials/nlp/MegatronBert_export.ipynb b/tutorials/nlp/MegatronBert_export.ipynb deleted file mode 100644 index b2f34f7e3141..000000000000 --- a/tutorials/nlp/MegatronBert_export.ipynb +++ /dev/null @@ -1,280 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "8046e96a", - "metadata": {}, - "outputs": [], - "source": [ - "BRANCH='r2.0.0rc0'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "38bfe8ea", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. 
Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell\n", - "\n", - "# install NeMo\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98c00a93", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import wget \n", - "import torch\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf" - ] - }, - { - "cell_type": "markdown", - "id": "e9fb1a66", - "metadata": {}, - "source": [ - "### Deprecation Notice\n", - "\n", - "This tutorial is deprecated as of r1.23.0 and will be removed in the next release.\n", - "\n", - "---\n", - "\n", - "# Task Description\n", - "In this tutorial, we are going to describe how to export NeMo NLP models with BERT based models as the pre-trained model." - ] - }, - { - "cell_type": "markdown", - "id": "dd0fb016", - "metadata": {}, - "source": [ - "## Convert the Megatron-LM Weights to Nemo file\n", - "\n", - "If you prefer to use the Huggingface BERT models, please skip this section and refer to `Setting up a NeMo Experiment` section to load a model from `nemo_nlp.modules.get_pretrained_lm_models_list()`\n", - "\n", - "NeMo Megatron BERT can [load from a pretrained model](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/core/core.html?highlight=nemo%20file#restore) using `.nemo` file. We can convert the Megatron-LM checkpoint to the `.nemo` file. Let's first download the pretrained model weights and vocabulary file." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e451f219", - "metadata": {}, - "outputs": [], - "source": [ - "from nemo.collections.nlp.modules.common.megatron.megatron_utils import MEGATRON_CONFIG_MAP\n", - "import pathlib\n", - "\n", - "PRETRAINED_BERT_MODEL = \"megatron-bert-345m-uncased\" # specify BERT-like model from MEGATRON_CONFIG_MAP.keys()\n", - "nemo_out_path = \"qa_pretrained.nemo\" # the nemo output file name\n", - "\n", - "checkpoint_url = MEGATRON_CONFIG_MAP[PRETRAINED_BERT_MODEL]['checkpoint']\n", - "vocab_url = MEGATRON_CONFIG_MAP[PRETRAINED_BERT_MODEL]['vocab']\n", - "checkpoint_filename = pathlib.Path(checkpoint_url).name\n", - "vocab_filename = pathlib.Path(vocab_url).name\n", - "if not pathlib.Path(checkpoint_filename).exists():\n", - " print('downloading from checkpoint url', checkpoint_url)\n", - " !wget $checkpoint_url\n", - "if not pathlib.Path(vocab_filename).exists():\n", - " print('downloading from vocab url', vocab_url)\n", - " !wget $vocab_url" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7586b5c0", - "metadata": {}, - "outputs": [], - "source": [ - "WORK_DIR = \"WORK_DIR\"\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "\n", - "# Prepare the model parameters \n", - "# download the model's configuration file \n", - "config_dir = WORK_DIR + '/configs/'\n", - "MODEL_CONFIG = \"megatron_bert_config.yaml\"\n", - "os.makedirs(config_dir, exist_ok=True)\n", - "if not os.path.exists(config_dir + MODEL_CONFIG):\n", - " print('Downloading config file...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/language_modeling/conf/' + MODEL_CONFIG, config_dir)\n", - "else:\n", - " print('config file already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e0dd3124", - "metadata": {}, - "outputs": [], - "source": [ - "# this line will print the entire config of the model\n", - "config_path = f'{WORK_DIR}/configs/{MODEL_CONFIG}'\n", - "print(config_path)\n", - "config = OmegaConf.load(config_path)\n", - "\n", - "config.model.megatron_legacy = True # set to true if you trained the NLP model on NeMo < 1.5.0\n", - "config.model.bias_gelu_fusion = False # set to true if you want the MegatronLM to NeMo conversion for training; and set to false to use the converted model at time of export \n", - "config.model.masked_softmax_fusion = False # set to true if you want the MegatronLM to NeMo conversion for training; and set to false to use the converted model at time of export\n", - "\n", - "config.model.num_layers = 24\n", - "config.model.hidden_size = 1024\n", - "config.model.ffn_hidden_size = 4096\n", - "config.model.num_attention_heads = 16\n", - "config.model.tokenizer.vocab_file = vocab_filename\n", - "config.model.tokenizer.type = 'BertWordPieceLowerCase' # change this to BertWordPieceCase if you are using a cased pretrained model\n", - "config.model.tensor_model_parallel_size = 1\n", - "config.model.data.data_prefix = ''\n", - "config.model.max_position_embeddings = 512\n", - "config.model.data.seq_length = 512\n", - "config.cfg = {}\n", - "config.cfg.cfg = config.model\n", - "with open('hparams.yaml', 'w') as f:\n", - " f.write(OmegaConf.to_yaml(config.cfg))\n", - "if config.model.megatron_legacy:\n", - " checkpoint_filename = \"model_optim_rng_ca.pt\" # provide the path to the pretrained .pt file you used during training on NeMo < 1.5.0; not needed for NeMo >= 1.5.0\n", - "print(checkpoint_filename)" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "id": "47dca6de", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "PWD = os.getcwd()\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py')\n", - "!python -m torch.distributed.run --nproc_per_node=1 megatron_lm_ckpt_to_nemo.py --checkpoint_folder=$PWD --checkpoint_name=$checkpoint_filename --hparams_file=$PWD/hparams.yaml --nemo_file_path=$PWD/$nemo_out_path --model_type=bert --tensor_model_parallel_size=1" - ] - }, - { - "cell_type": "markdown", - "id": "1ae8d31b", - "metadata": {}, - "source": [ - "# Legacy NLP Bert based model conversion\n", - "\n", - "Step 1: Convert legacy nemo checkpoint to a checkpoint which is currently supported by nemo\n", - "\n", - "Step 2: Use the converted model from step 1 to export the nemo file to the required format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "86639a3d", - "metadata": {}, - "outputs": [], - "source": [ - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/nemo_legacy_import/nlp_checkpoint_port.py')\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/export.py')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "48820d57", - "metadata": {}, - "outputs": [], - "source": [ - "legacy_nemo_file_path = \"/NeMo/megatron_multiqa.nemo\" #path to you model trained on NeMo < 1.5\n", - "nemo_converted_out_path = \"converted_megatron_multiqa.nemo\"\n", - "megatron_absolute_language_model_path = \"/NeMo/tutorials/nlp/qa_pretrained.nemo\" # Give the absolute path of the model you obtained using megatron_lm_ckpt_to_nemo\n", - "onnx_export_out_path = \"onnx_megatron_multiqa.onnx\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7191e0cb", - "metadata": {}, - "outputs": [], - "source": [ - "os.system(f\"python nlp_checkpoint_port.py {legacy_nemo_file_path} {nemo_converted_out_path} --megatron-legacy=True --megatron-checkpoint {megatron_absolute_language_model_path}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ccc720ef", - "metadata": {}, - "outputs": [], - "source": [ - "os.system(f\"python export.py {nemo_converted_out_path} {onnx_export_out_path} --autocast --runtime-check\")" - ] - }, - { - "cell_type": "markdown", - "id": "f10461f2", - "metadata": {}, - "source": [ - "# Convert a NLP model with BERT based pre-trained model trained on NeMo >= 1.5.0\n", - "\n", - "For models trained on NeMo >= 1.5.0, you just run the export script and skip the legacy conversion part" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0514ab37", - "metadata": {}, - "outputs": [], - "source": [ - "nemo_file_path = \"\"\n", - "onnx_export_out_path = " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1d6b5db4", - "metadata": {}, - "outputs": [], - "source": [ - "python export.py $nemo_converted_out_path $onnx_export_out_path --autocast --runtime-check" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tutorials/nlp/Question_Answering.ipynb 
b/tutorials/nlp/Question_Answering.ipynb deleted file mode 100644 index 9502ba177867..000000000000 --- a/tutorials/nlp/Question_Answering.ipynb +++ /dev/null @@ -1,1163 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "tiIOhb7iVC3J" - }, - "source": [ - "# Overview" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PucJwfbhVC3L" - }, - "source": [ - "### Deprecation Notice\n", - "\n", - "This tutorial is deprecated as of r1.23.0 and will be removed in the next release.\n", - "\n", - "---\n", - "\n", - "This tutorial will demonstrate how to train, evaluate, and test three types of models for Question-Answering -\n", - "1. BERT-like models for Extractive Question-Answering\n", - "2. Sequence-to-Sequence (S2S) models for Generative Question-Answering (ex. T5/BART-like)\n", - "3. GPT-like models for Generative Question-Answering\n", - "\n", - "## Task Description\n", - "\n", - "- Given a context and a natural language query, we want to generate an answer for the query\n", - "- Depending on how the answer is generated, the task can be broadly divided into two types:\n", - " 1. Extractive Question Answering\n", - " 2. Generative Question Answering\n", - "\n", - "\n", - "### Extractive Question-Answering with BERT-like models\n", - "\n", - "Given a question and a context, both in natural language, predict the span within the context with a start and end position which indicates the answer to the question.\n", - "For every word in our training dataset we’re going to predict:\n", - "- likelihood this word is the start of the span \n", - "- likelihood this word is the end of the span\n", - "\n", - "We are using a BERT encoder with 2 span prediction heads for predicting start and end position of the answer. The span predictions are token classifiers consisting of a single linear layer.\n", - "\n", - "### Generative Question-Answering with S2S and GPT-like models\n", - "\n", - "Given a question and a context, both in natural language, generate an answer for the question. Unlike the BERT-like models, there is no constraint that the answer should be a span within the context." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IpX0w2PtVC3M" - }, - "source": [ - "# Installing NeMo" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "72XWYFQYVC3M" - }, - "source": [ - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run the cell below to set up dependencies." 
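As a brief aside before the setup cells: the extractive-QA span-prediction heads described in the overview above amount to a single linear layer over the encoder's token states. A minimal illustrative sketch, assuming a generic BERT-style encoder output; this is not NeMo's actual `BERTQAModel` implementation used later in this notebook:

```python
import torch
import torch.nn as nn

class SpanPredictionHead(nn.Module):
    """One linear layer producing start/end logits for every token."""

    def __init__(self, hidden_size: int):
        super().__init__()
        self.qa_outputs = nn.Linear(hidden_size, 2)  # 2 logits per token: start, end

    def forward(self, hidden_states: torch.Tensor):
        # hidden_states: [batch, seq_len, hidden_size] from a BERT encoder
        logits = self.qa_outputs(hidden_states)
        start_logits, end_logits = logits.split(1, dim=-1)
        # [batch, seq_len] scores for "this token starts/ends the answer span"
        return start_logits.squeeze(-1), end_logits.squeeze(-1)

head = SpanPredictionHead(hidden_size=768)
start, end = head(torch.randn(2, 16, 768))  # toy batch of encoder states
print(start.shape, end.shape)  # torch.Size([2, 16]) torch.Size([2, 16])
```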
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_xQBtr0KVC3M" - }, - "outputs": [], - "source": [ - "BRANCH = 'r2.0.0rc0'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9R1D6W58VC3N" - }, - "outputs": [], - "source": [ - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fof5-57iVC3N" - }, - "source": [ - "# Imports and constants" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KqKD-wReVC3O" - }, - "outputs": [], - "source": [ - "import os\n", - "import wget\n", - "import gc\n", - "\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf\n", - "\n", - "from nemo.collections.nlp.models.question_answering.qa_bert_model import BERTQAModel\n", - "from nemo.collections.nlp.models.question_answering.qa_gpt_model import GPTQAModel\n", - "from nemo.collections.nlp.models.question_answering.qa_s2s_model import S2SQAModel\n", - "from nemo.utils.exp_manager import exp_manager\n", - "\n", - "pl.seed_everything(42)\n", - "gc.disable()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xhPr9Jf_VC3O" - }, - "outputs": [], - "source": [ - "# set the following paths\n", - "DATA_DIR = \"data_dir\" # directory for storing datasets\n", - "WORK_DIR = \"work_dir\" # directory for storing trained models, logs, additionally downloaded scripts\n", - "\n", - "os.makedirs(DATA_DIR, exist_ok=True)\n", - "os.makedirs(WORK_DIR, exist_ok=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dWymW8e0VC3O" - }, - "source": [ - "# Configuration" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0YhKTkuXVC3P" - }, - "source": [ - "The model is defined in a config file which declares multiple important sections:\n", - "- **model**: All arguments that will relate to the Model - language model, span prediction, optimizer and schedulers, datasets and any other related information\n", - "- **trainer**: Any argument to be passed to PyTorch Lightning\n", - "- **exp_manager**: All arguments used for setting up the experiment manager - target directory, name, logger information\n", - "\n", - "We will download the default config file provided at `NeMo/examples/nlp/question_answering/conf/qa_conf.yaml` and edit necessary values for training different models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WOIWJqQ0VC3P" - }, - "outputs": [], - "source": [ - "# download the model's default configuration file \n", - "config_dir = WORK_DIR + '/conf/'\n", - "os.makedirs(config_dir, exist_ok=True)\n", - "if not os.path.exists(config_dir + \"qa_conf.yaml\"):\n", - " print('Downloading config file...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/question_answering/conf/qa_conf.yaml', config_dir)\n", - "else:\n", - " print ('config file already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cvD-gv-FVC3P" - }, - "outputs": [], - "source": [ - "# this will print the entire default config of the model\n", - "config_path = f'{WORK_DIR}/conf/qa_conf.yaml'\n", - "print(config_path)\n", - "config = OmegaConf.load(config_path)\n", - "print(\"Default Config - \\n\")\n", - "print(OmegaConf.to_yaml(config))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "E08e-ItPVC3P" - }, - "source": 
[ - "# Training and testing models on SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xn022MsKVC3Q" - }, - "source": [ - "## Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "c356CGL1VC3Q" - }, - "source": [ - "For this example, we are going to download the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset to showcase how to do training and inference. There are two datasets, SQuAD1.0 and SQuAD2.0. SQuAD 1.1, the previous version of the SQuAD dataset, contains 100,000+ question-answer pairs on 500+ articles. SQuAD2.0 dataset combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Gaju1h_bVC3Q" - }, - "source": [ - "To download both datasets, we use `NeMo/examples/nlp/question_answering/get_squad.py`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nb840_bZVC3Q" - }, - "outputs": [], - "source": [ - "# download get_squad.py script to download and preprocess the SQuAD data\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "if not os.path.exists(WORK_DIR + '/get_squad.py'):\n", - " print('Downloading get_squad.py...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/question_answering/get_squad.py', WORK_DIR)\n", - "else:\n", - " print ('get_squad.py already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sOgY0tRzVC3Q" - }, - "outputs": [], - "source": [ - "# download and preprocess the data\n", - "!python $WORK_DIR/get_squad.py --destDir $DATA_DIR" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nprGkyvRVC3Q" - }, - "source": [ - "After execution of the above cell, your data folder will contain a subfolder \"squad\" the following four files for training and evaluation\n", - "\n", - "```\n", - "squad \n", - "│\n", - "└───v1.1\n", - "│ │ - train-v1.1.json\n", - "│ │ - dev-v1.1.json\n", - "│\n", - "└───v2.0\n", - " │ - train-v2.0.json\n", - " │ - dev-v2.0.json\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GX0KWQXKVC3Q" - }, - "outputs": [], - "source": [ - "!ls -LR {DATA_DIR}/squad" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RFVcvseOVC3R" - }, - "source": [ - "## Set dataset config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Grb0EeRqVC3R" - }, - "outputs": [], - "source": [ - "# if True, model will load features from cache if file is present, or\n", - "# create features and dump to cache file if not already present\n", - "config.model.dataset.use_cache = False\n", - "\n", - "# indicates whether the dataset has unanswerable questions\n", - "config.model.dataset.version_2_with_negative = True\n", - "\n", - "# indicates whether the dataset is of extractive nature or not\n", - "# if True, context spans/chunks that do not contain answer are treated as unanswerable \n", - "config.model.dataset.check_if_answer_in_context = True\n", - "\n", - "# set file paths for train, validation, and test datasets\n", - "config.model.train_ds.file = f\"{DATA_DIR}/squad/v2.0/train-v2.0.json\"\n", - "config.model.validation_ds.file = f\"{DATA_DIR}/squad/v2.0/dev-v2.0.json\"\n", - "config.model.test_ds.file = f\"{DATA_DIR}/squad/v2.0/dev-v2.0.json\"\n", - "\n", - "# set batch sizes for train, validation, and test 
datasets\n", - "config.model.train_ds.batch_size = 8\n", - "config.model.validation_ds.batch_size = 8\n", - "config.model.test_ds.batch_size = 8\n", - "\n", - "# set number of samples to be used from dataset. setting to -1 uses entire dataset\n", - "config.model.train_ds.num_samples = 5000\n", - "config.model.validation_ds.num_samples = 1000\n", - "config.model.test_ds.num_samples = 100" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rFWF41VwVC3R" - }, - "source": [ - "## Set trainer config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "42yif-GIVC3R" - }, - "outputs": [], - "source": [ - "config.trainer.max_epochs = 1\n", - "config.trainer.max_steps = -1 # takes precedence over max_epochs\n", - "config.trainer.precision = 16\n", - "config.trainer.devices = [0] # 0 for CPU, or list of the GPUs to use [0] this tutorial does not support multiple GPUs. If needed please use NeMo/examples/nlp/question_answering/question_answering.py\n", - "config.trainer.accelerator = \"gpu\"\n", - "config.trainer.strategy=\"auto\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EDQzMBlbVC3R" - }, - "source": [ - "## Set experiment manager config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pxY4rnJBVC3R" - }, - "outputs": [], - "source": [ - "config.exp_manager.exp_dir = WORK_DIR\n", - "config.exp_manager.name = \"QA-SQuAD2\"\n", - "config.exp_manager.create_wandb_logger=False" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "N2_C8reNVC3R" - }, - "source": [ - "## BERT model for SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4Mf-_rioVC3R" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gtlGHzVJVC3R" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"bert-base-uncased\"\n", - "config.model.tokenizer.tokenizer_name = \"bert-base-uncased\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/bert_squad_v2_0.nemo\"\n", - "\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 3e-5" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RaM7fe8rVC3R" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ukLzGmy9VC3R" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(**config.trainer)\n", - "model = BERTQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qZIA69rlVC3R" - }, - "source": [ - "### Train, test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "asutB9ZzVC3R" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "n5AIv0SEVC3S" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7k5kD6tvVC3S" - }, - "outputs": [], - "source": [ - "model = BERTQAModel.restore_from(config.model.nemo_path)\n", - "\n", - 
"eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - "model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - "config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zyh0SNiyVC3S" - }, - "source": [ - "## S2S BART model for SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Sy9IYgVYVC3S" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PKNmHKV5VC3S" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"facebook/bart-base\"\n", - "config.model.tokenizer.tokenizer_name = \"facebook/bart-base\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/bart_squad_v2_0.nemo\"\n", - "\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 5e-5\n", - "\n", - "#remove vocab_file from gpt model\n", - "config.model.tokenizer.vocab_file = None" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "S_0glS4yVC3S" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8jWyHY1oVC3S" - }, - "outputs": [], - "source": [ - "# uncomment below line and run if you get an error while initializing tokenizer on Colab (reference: https://github.com/huggingface/transformers/issues/8690)\n", - "# !rm -r /root/.cache/huggingface/\n", - "\n", - "trainer = pl.Trainer(**config.trainer)\n", - "model = S2SQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xg-j39b4VC3S" - }, - "source": [ - "### Train, test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ocsf0EBDVC3S" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Vs3pl0VMVC3S" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NoW6_GO_VC3S" - }, - "outputs": [], - "source": [ - "model = S2SQAModel.restore_from(config.model.nemo_path)\n", - "\n", - "eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - "model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - 
"config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a7-iInbPVC3S" - }, - "source": [ - "## GPT2 model for SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VaIC0l2aVC3S" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5j6SVk6fVC3S" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"gpt2\"\n", - "config.model.tokenizer.tokenizer_name = \"gpt2\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/gpt2_squad_v2_0.nemo\"\n", - "\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 1e-4" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rWhhEuvzVC3S" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vBtP3ukDVC3S" - }, - "outputs": [], - "source": [ - "# uncomment below line and run if you get an error while initializing tokenizer on Colab (reference: https://github.com/huggingface/transformers/issues/8690)\n", - "# !rm -r /root/.cache/huggingface/\n", - "\n", - "trainer = pl.Trainer(**config.trainer)\n", - "model = GPTQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EApFrJh8VC3T" - }, - "source": [ - "### Train, test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zYo2JDdOVC3T" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6aNEt06fVC3T" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ioLT4DVbVC3T" - }, - "outputs": [], - "source": [ - "model = GPTQAModel.restore_from(config.model.nemo_path)\n", - "\n", - "eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - "model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - "config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " 
output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hTWOlD9AVC3T" - }, - "source": [ - "# Training and testing models on MS-MARCO" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lZWsMwnGVC3T" - }, - "source": [ - "## Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pRUAwgAbVC3T" - }, - "source": [ - "### Downloading the data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qz3DO9JGVC3T" - }, - "source": [ - "MS-MARCO (Microsoft Machine Reading Comprehension) is a large-scale dataset focused on machine reading comprehension, question answering, and passage ranking. MS-MARCO consists of 1,010,916 questions generated from real, anonymized Bing user queries. The contexts are extracted from real web documents and the answers are generated by humans.\n", - "\n", - "Please agree to the Terms of Use at https://microsoft.github.io/msmarco/ before downloading the data.\n", - "\n", - "The data can be downloaded at:\n", - "- https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz\n", - "- https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Fm5MzZ91inP5" - }, - "outputs": [], - "source": [ - "os.makedirs(os.path.join(DATA_DIR, \"msmarco\"), exist_ok=True)\n", - "\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz -P $DATA_DIR/msmarco\n", - "!gunzip $DATA_DIR/msmarco/train_v2.1.json.gz\n", - "\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz -P $DATA_DIR/msmarco\n", - "!gunzip $DATA_DIR/msmarco/dev_v2.1.json.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nDmFHzBtVC3T" - }, - "source": [ - "### Converting to SQuAD format\n", - "\n", - "The script for converting the MS-MARCO dataset to SQuAD format can be found at `NeMo/examples/nlp/question_answering/convert_msmarco_to_squad_format.py`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tJtNIzZQVC3T" - }, - "outputs": [], - "source": [ - "# download convert_msmarco_to_squad_format.py script to format the MS-MARCO data\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "if not os.path.exists(WORK_DIR + '/convert_msmarco_to_squad_format.py'):\n", - " print('Downloading convert_msmarco_to_squad_format.py...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/question_answering/convert_msmarco_to_squad_format.py', WORK_DIR)\n", - "else:\n", - " print('convert_msmarco_to_squad_format.py already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Io_esJPSuBcW" - }, - "outputs": [], - "source": [ - "# we will exclude examples from the MS-MARCO dataset that do not have a wellFormedAnswer, using a utility script\n", - "# download remove_ms_marco_samples_without_wellFormedAnswers.py script to format the MS-MARCO data\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "if not os.path.exists(WORK_DIR + '/remove_ms_marco_samples_without_wellFormedAnswers.py'):\n", - " print('Downloading remove_ms_marco_samples_without_wellFormedAnswers.py...')\n", - " 
wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/dialogue/remove_ms_marco_samples_without_wellFormedAnswers.py', WORK_DIR)\n", - "else:\n", - " print ('remove_ms_marco_samples_without_wellFormedAnswers.py already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cs_CXkfXuYVQ" - }, - "outputs": [], - "source": [ - "!python $WORK_DIR/remove_ms_marco_samples_without_wellFormedAnswers.py --filename $DATA_DIR/msmarco/train_v2.1.json\n", - "!python $WORK_DIR/remove_ms_marco_samples_without_wellFormedAnswers.py --filename $DATA_DIR/msmarco/dev_v2.1.json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "AUAKI086VC3T" - }, - "outputs": [], - "source": [ - "!(python $WORK_DIR/convert_msmarco_to_squad_format.py \\\n", - " --msmarco_train_input_filepath=$DATA_DIR/msmarco/train_v2.1.json \\\n", - " --msmarco_dev_input_filepath=$DATA_DIR/msmarco/dev_v2.1.json \\\n", - " --converted_train_save_path=$DATA_DIR/msmarco/msmarco-squad-format-train-v2.1.json \\\n", - " --converted_dev_save_path=$DATA_DIR/msmarco/msmarco-squad-format-dev-v2.1.json \\\n", - " --exclude_negative_samples=False \\\n", - " --keep_only_relevant_passages=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AeHesaFcVC3T" - }, - "source": [ - "## Set dataset config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rhx-_1X3VC3T" - }, - "outputs": [], - "source": [ - "# if True, model will load features from cache if file is present, or\n", - "# create features and dump to cache file if not already present\n", - "config.model.dataset.use_cache = False\n", - "\n", - "# indicates whether the dataset has unanswerable questions\n", - "config.model.dataset.version_2_with_negative = True\n", - "\n", - "# if True, context spans/chunks that do not contain answer are treated as unanswerable \n", - "# should be False for MS-MARCO dataset, or other datasets of generative nature\n", - "config.model.dataset.check_if_answer_in_context = False\n", - "\n", - "# set file paths for train, validation, and test datasets\n", - "config.model.train_ds.file = f\"{DATA_DIR}/msmarco/msmarco-squad-format-train-v2.1.json\"\n", - "config.model.validation_ds.file = f\"{DATA_DIR}/msmarco/msmarco-squad-format-dev-v2.1.json\"\n", - "config.model.test_ds.file = f\"{DATA_DIR}/msmarco/msmarco-squad-format-dev-v2.1.json\"\n", - "\n", - "# set batch sizes for train, validation, and test datasets\n", - "config.model.train_ds.batch_size = 16\n", - "config.model.validation_ds.batch_size = 16\n", - "config.model.test_ds.batch_size = 16\n", - "\n", - "# set number of samples to be used from dataset. setting to -1 uses entire dataset\n", - "config.model.train_ds.num_samples = 5000\n", - "config.model.validation_ds.num_samples = 1000\n", - "config.model.test_ds.num_samples = 100" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "X43k_EeqVC3T" - }, - "source": [ - "## Set trainer config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HavpkQLPVC3U" - }, - "outputs": [], - "source": [ - "config.trainer.max_epochs = 1\n", - "config.trainer.max_steps = -1 # takes precedence over max_epochs\n", - "config.trainer.precision = 16\n", - "config.trainer.devices = [0] # 0 for CPU, or list of the GPUs to use e.g. 
[0, 1] or [0]\n", - "config.trainer.accelerator = \"gpu\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R-_FIZE2VC3U" - }, - "source": [ - "## Set experiment manager config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "10TT3okiVC3U" - }, - "outputs": [], - "source": [ - "config.exp_manager.exp_dir = WORK_DIR\n", - "config.exp_manager.name = \"QA-MSMARCO\"\n", - "config.exp_manager.create_wandb_logger=False" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MKIq6YT-VC3U" - }, - "source": [ - "## S2S BART model for MS-MARCO" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvf-QpYLVC3U" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DDVZ1a5fVC3U" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"facebook/bart-base\"\n", - "config.model.tokenizer.tokenizer_name = \"facebook/bart-base\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/bart_msmarco_v2_0.nemo\"\n", - "\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 5e-5" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3N75cdLRVC3U" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Bv9UMkfxVC3U" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(**config.trainer)\n", - "model = S2SQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BhVuV9sWVC3U" - }, - "source": [ - "### Train, test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1JeaJ_OgVC3U" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yj0dGexaVC3U" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "l1elN-WDVC3U" - }, - "outputs": [], - "source": [ - "model = S2SQAModel.restore_from(config.model.nemo_path)\n", - "\n", - "eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - "model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - "config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": 
"Question_Answering.ipynb", - "provenance": [] - }, - "gpuClass": "standard", - "kernelspec": { - "display_name": "Python 3.8.0 ('test_ptl_1.7')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.0" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "e987a19b1bc60996a600adb5d563aa4a4c022e7b31abb2e65c324714934e8ea9" - } - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb b/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb deleted file mode 100644 index 57e49cdcf255..000000000000 --- a/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb +++ /dev/null @@ -1,1412 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "PiRuohn_FQco" - }, - "source": [ - "# Overview\n", - "This tutorial demonstrates how to run inference with [SpellMapper](https://arxiv.org/abs/2306.02317) - a model for Spellchecking ASR (Automatic Speech Recognition) Customization.\n", - "\n", - "Estimated time: 10-15 min.\n", - "\n", - "SpellMapper is a non-autoregressive (NAR) model based on transformer architecture ([BERT](https://arxiv.org/pdf/1810.04805.pdf) with multiple separators).\n", - "It gets as input a single ASR hypothesis (text) and a **custom vocabulary** and predicts which fragments in the ASR hypothesis should be replaced by which custom words/phrases if any.\n", - "\n", - "This model is an alternative to word boosting/shallow fusion approaches:\n", - " - does not require retraining ASR model;\n", - " - does not require beam-search/language model(LM);\n", - " - can be applied on top of any English ASR model output;" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qm5wmxVEGXgH" - }, - "source": [ - "## What is custom vocabulary?\n", - "**Custom vocabulary** is a list of words/phrases that are important for a particular user. For example, user's contact names, playlist, selected terminology and so on. The size of the custom vocabulary can vary from several hundreds to **several thousand entries** - but this is not an equivalent to ngram language model.\n", - "\n", - "![Scope of customization with user vocabulary](images/spellmapper_customization_vocabulary.png)\n", - "\n", - "Note that unlike traditional spellchecking approaches, which aim to correct known words using language models, the goal of contextual spelling correction is to correct highly specific user terms, most of which can be 1) out-of-vocabulary (OOV) words, 2) spelling variations (e.g., \"John Koehn\", \"Jon Cohen\") and language models cannot help much with that." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "D5_XwuXDOKho" - }, - "source": [ - "## Tutorial Plan\n", - "\n", - "1. Create a sample custom vocabulary using some medical terminology.\n", - "2. Study what customization does - a detailed analysis of a small example.\n", - "3. 
Run a bigger example:\n", - " * Create sample ASR results by running TTS (text-to-speech synthesis) + ASR on some medical paper abstracts.\n", - " * Run SpellMapper inference and show how it can improve ASR results using custom vocabulary.\n", - "\n", - "TL;DR We reduce WER from `14.3%` to `11.4%` by correcting medical terms, e.g.\n", - "* `puramesin` => `puromycin`\n", - "* `parromsin` => `puromycin`\n", - "* `and hydrod` => `anhydride`\n", - "* `lesh night and` => `lesch-nyhan`\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "agz8B2CxXBBG" - }, - "source": [ - "# Preparation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "koRPpYISNPuH" - }, - "source": [ - "## Installing NeMo" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HCnnz3cgVc4Q" - }, - "outputs": [], - "source": [ - "# Install NeMo library. If you are running locally (rather than on Google Colab), comment out the below lines\n", - "# and instead follow the instructions at https://github.com/NVIDIA/NeMo#Installation\n", - "GITHUB_ACCOUNT = \"NVIDIA\"\n", - "BRANCH = 'r2.0.0rc0'\n", - "!python -m pip install git+https://github.com/{GITHUB_ACCOUNT}/NeMo.git@{BRANCH}#egg=nemo_toolkit[all]\n", - "\n", - "# Download local version of NeMo scripts. If you are running locally and want to use your own local NeMo code,\n", - "# comment out the below lines and set NEMO_DIR to your local path.\n", - "NEMO_DIR = 'nemo'\n", - "!git clone -b {BRANCH} https://github.com/{GITHUB_ACCOUNT}/NeMo.git $NEMO_DIR" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_M92gCn_NW1_" - }, - "source": [ - "## Additional installs\n", - "We will use `sentence_splitter` to split abstracts to sentences." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ddyJA3NtGl9C" - }, - "outputs": [], - "source": [ - "!pip install sentence_splitter" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qVa91rGkeFje" - }, - "source": [ - "Clone the SpellMapper model from HuggingFace.\n", - "Note that we will need not only the checkpoint itself, but also the ngram mapping vocabulary `replacement_vocab_filt.txt` from the same folder." 
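Aside (not part of the original notebook): the WER figures quoted in the tutorial plan above are word error rates, i.e. word-level edit distance divided by reference length. A minimal self-contained computation for intuition, using one of the correction pairs listed above:

```python
def wer(reference: str, hypothesis: str) -> float:
    """Word error rate: Levenshtein distance over words / reference length."""
    r, h = reference.split(), hypothesis.split()
    d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    for i in range(len(r) + 1):
        d[i][0] = i  # cost of deleting all reference words up to i
    for j in range(len(h) + 1):
        d[0][j] = j  # cost of inserting all hypothesis words up to j
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            sub = 0 if r[i - 1] == h[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,        # deletion
                          d[i][j - 1] + 1,        # insertion
                          d[i - 1][j - 1] + sub)  # substitution / match
    return d[len(r)][len(h)] / len(r)

print(wer("treated with puromycin", "treated with puramesin"))  # ~0.333
```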
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JiI9dkEm5cpW" - }, - "outputs": [], - "source": [ - "!git clone https://huggingface.co/bene-ges/spellmapper_asr_customization_en" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8saqFOePVfFf" - }, - "source": [ - "## Imports\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tAJyiYn_VnrF" - }, - "outputs": [], - "source": [ - "import IPython.display as ipd\n", - "import json\n", - "import random\n", - "import re\n", - "import soundfile as sf\n", - "import torch\n", - "\n", - "from collections import Counter, defaultdict\n", - "from difflib import SequenceMatcher\n", - "from matplotlib.pyplot import imshow\n", - "from matplotlib import pyplot as plt\n", - "from sentence_splitter import SentenceSplitter\n", - "from typing import List, Set, Tuple\n", - "\n", - "from nemo.collections.tts.models import FastPitchModel\n", - "from nemo.collections.tts.models import HifiGanModel\n", - "\n", - "from nemo.collections.asr.parts.utils.manifest_utils import read_manifest\n", - "\n", - "from nemo.collections.nlp.data.spellchecking_asr_customization.utils import (\n", - " get_all_candidates_coverage,\n", - " get_index,\n", - " load_ngram_mappings,\n", - " search_in_index,\n", - " get_candidates,\n", - " read_spellmapper_predictions,\n", - " apply_replacements_to_text,\n", - " load_ngram_mappings_for_dp,\n", - " get_alignment_by_dp,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mfAaOdAWUGUV" - }, - "source": [ - "Use seed to get a reproducible behaviour." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UlGnNKTuT_6A" - }, - "outputs": [], - "source": [ - "random.seed(0)\n", - "torch.manual_seed(0)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RPPHI7Zd_fDz" - }, - "source": [ - "## Download data\n", - "\n", - "File `pubmed24n0009.xml` taken from public ftp server of https://www.ncbi.nlm.nih.gov/pmc/ contains information about 5593 medical papers, from which we extract only their abstracts. We will feed sentences from there to TTS + ASR to get initial ASR results.\n", - "\n", - "File `wordlist.txt` contains 100k **single-word** medical terms.\n", - "\n", - "File `valid_adam.txt` contains 24k medical abbreviations with their full forms. We will use those full forms as examples of **multi-word** medical terms.\n", - "\n", - "File `count_1w.txt` contains 330k single words with their frequencies from Google Ngrams corpus. 
We will use this file to filter out frequent words from our custom vocabulary.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mX6cvE8xw2n1" - }, - "outputs": [], - "source": [ - "!wget https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed24n0009.xml.gz\n", - "!gunzip pubmed24n0009.xml.gz\n", - "!grep \"AbstractText\" pubmed24n0009.xml > abstract.txt\n", - "\n", - "!wget https://raw.githubusercontent.com/McGill-NLP/medal/master/toy_data/valid_adam.txt\n", - "!wget https://raw.githubusercontent.com/glutanimate/wordlist-medicalterms-en/master/wordlist.txt\n", - "!wget https://norvig.com/ngrams/count_1w.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mBm9BeqNaRlC" - }, - "source": [ - "## Auxiliary functions\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kVUKhSh48Ypi" - }, - "outputs": [], - "source": [ - "CHARS_TO_IGNORE_REGEX = re.compile(r\"[\\.\\,\\?\\:!;()«»…\\]\\[/\\*–‽+&_\\\\½√>€™$•¼}{~—=“\\\"”″‟„]\")\n", - "\n", - "\n", - "def get_medical_vocabulary() -> Tuple[Set[str], Set[str]]:\n", - " \"\"\"This function builds a vocabulary of medical terms using downloaded sources:\n", - " wordlist.txt - 100k single-word medical terms.\n", - " valid_adam.txt - 24k medical abbreviations with their full forms. We use those full forms as examples of multi-word medical terms.\n", - " count_1w.txt - 330k single words with their frequencies from Google Ngrams corpus. We will use this file to filter out frequent words from our custom vocabulary.\n", - " \"\"\"\n", - " common_words = set()\n", - " with open(\"count_1w.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " word, freq = line.strip().casefold().split(\"\\t\")\n", - " if int(freq) < 500000:\n", - " break\n", - " common_words.add(word)\n", - " print(\"Size of common words vocabulary:\", len(common_words))\n", - "\n", - " abbreviations = defaultdict(set)\n", - " medical_vocabulary = set()\n", - " with open(\"valid_adam.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " lines = f.readlines()\n", - " # first line is header\n", - " for line in lines[1:]:\n", - " abbrev, _, phrase = line.strip().split(\"\\t\")\n", - " # skip phrases longer than 3 words because some of them are long explanations\n", - " if phrase.count(\" \") > 2:\n", - " continue\n", - " if phrase in common_words:\n", - " continue\n", - " medical_vocabulary.add(phrase)\n", - " abbrev = abbrev.lower()\n", - " abbreviations[abbrev].add(phrase)\n", - "\n", - " with open(\"wordlist.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " word = line.strip().casefold()\n", - " # skip words containing digits\n", - " if re.match(r\".*\\d.*\", word):\n", - " continue\n", - " if re.match(r\".*[\\[\\]\\(\\)\\+\\,\\.].*\", word):\n", - " continue\n", - " if word in common_words:\n", - " continue\n", - " medical_vocabulary.add(word)\n", - "\n", - " print(\"Size of medical vocabulary:\", len(medical_vocabulary))\n", - " print(\"Size of abbreviation vocabulary:\", len(abbreviations))\n", - " return medical_vocabulary, abbreviations\n", - "\n", - "\n", - "def read_abstracts(medical_vocabulary: Set[str]) -> Tuple[List[str], Set[str], Set[str]]:\n", - " \"\"\"This function reads the downloaded medical abstracts, and extracts sentences containing any word/phrase from the medical vocabulary.\n", - " Args:\n", - " medical_vocabulary: set of known medical words or phrases\n", - " Returns:\n", - " sentences: list of extracted sentences\n", - " 
all_found_singleword: set of single words from medical vocabulary that occurred in at least one sentence\n", - " all_found_multiword: set of multi-word phrases from medical vocabulary that occurred in at least one sentence\n", - " \"\"\"\n", - " splitter = SentenceSplitter(language='en')\n", - "\n", - " all_sentences = []\n", - " all_found_singleword = set()\n", - " all_found_multiword = set()\n", - " with open(\"abstract.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " # strip the XML tags kept by the grep extraction above\n", - " text = line.strip().replace(\"<AbstractText>\", \"\").replace(\"</AbstractText>\", \"\")\n", - " sents = splitter.split(text)\n", - " found_singleword = set()\n", - " found_multiword = set()\n", - " for sent in sents:\n", - " # remove anything in brackets from text\n", - " sent = re.sub(r\"\\(.+\\)\", r\"\", sent)\n", - " # remove quotes from text\n", - " sent = sent.replace(\"\\\"\", \"\")\n", - " # skip sentences containing digits because normalization is out of scope of this tutorial\n", - " if re.match(r\".*\\d.*\", sent):\n", - " continue\n", - " # skip sentences containing abbreviations with period inside the sentence (for the same reason)\n", - " if \". \" in sent:\n", - " continue\n", - " # skip long sentences as they may cause OOM issues\n", - " if len(sent) > 150:\n", - " continue\n", - " # replace all punctuation with spaces and convert to lowercase\n", - " sent_clean = CHARS_TO_IGNORE_REGEX.sub(\" \", sent).lower()\n", - " # collapse repeated spaces, then split into words\n", - " sent_clean = \" \".join(sent_clean.split())\n", - " words = sent_clean.split(\" \")\n", - "\n", - " found_phrases = set()\n", - " for begin in range(len(words)):\n", - " for end in range(begin + 1, min(begin + 4, len(words) + 1)): # phrases of 1-3 words, including those ending at the last word\n", - " phrase = \" \".join(words[begin:end])\n", - " if phrase in medical_vocabulary:\n", - " found_phrases.add(phrase)\n", - " if end - begin == 1:\n", - " found_singleword.add(phrase)\n", - " else:\n", - " found_multiword.add(phrase)\n", - " if len(found_phrases) > 0:\n", - " all_sentences.append((sent, \";\".join(found_phrases)))\n", - " all_found_singleword = all_found_singleword.union(found_singleword)\n", - " all_found_multiword = all_found_multiword.union(found_multiword)\n", - "\n", - " print(\"Sentences:\", len(all_sentences))\n", - " print(\"Unique single-word terms found:\", len(all_found_singleword))\n", - " print(\"Unique multi-word terms found:\", len(all_found_multiword))\n", - " print(\"Examples of multi-word terms:\", str(list(all_found_multiword)[0:10]))\n", - "\n", - " return all_sentences, all_found_singleword, all_found_multiword" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XU3xeCBVpWOL" - }, - "outputs": [], - "source": [ - "def get_fragments(i_words: List[str], j_words: List[str]) -> List[Tuple[str, str, str, int, int, int, int]]:\n", - " \"\"\"This function is used to compare two word sequences to find minimal fragments that differ.\n", - " Args:\n", - " i_words: list of words in first sequence\n", - " j_words: list of words in second sequence\n", - " Returns:\n", - " list of tuples (difference_type, fragment1, fragment2, begin_of_fragment1, end_of_fragment1, begin_of_fragment2, end_of_fragment2)\n", - " \"\"\"\n", - " s = SequenceMatcher(None, i_words, j_words)\n", - " result = []\n", - " for tag, i1, i2, j1, j2 in s.get_opcodes():\n", - " result.append((tag, \" \".join(i_words[i1:i2]), \" \".join(j_words[j1:j2]), i1, i2, j1, j2))\n", - " result = sorted(result, key=lambda x: x[3])\n", - " return result" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2ydXp_pFYmYu" - }, - "source": [ - "## 
Read medical data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WAeauax0SV1-" - }, - "outputs": [], - "source": [ - "medical_vocabulary, _ = get_medical_vocabulary()\n", - "sentences, found_singleword, found_multiword = read_abstracts(medical_vocabulary)\n", - "# in case if we need random candidates from a big sample - we will use full medical vocabulary for that purpose.\n", - "big_sample = list(medical_vocabulary)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FRli7-Kx7sOO" - }, - "outputs": [], - "source": [ - "for sent, phrases in sentences[0:10]:\n", - " print(sent, \"\\t\", phrases)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rL1VqH2_dk93" - }, - "source": [ - "# SpellMapper ASR Customization\n", - "\n", - "SpellMapper model relies on two offline preparation steps:\n", - "1. Collecting n-gram mappings from a large corpus (this mappings vocabulary had been collected once on a large corpus and is supplied with the model).\n", - "2. Indexing of user vocabulary by n-grams.\n", - "\n", - "![Offline data preparation](images/spellmapper_data_preparation.png)\n", - "\n", - "At inference time we take as input an ASR hypothesis and an n-gram-indexed user vocabulary and perform following steps:\n", - "1. Retrieve the top 10 candidate phrases from the user vocabulary that are likely to be contained in the given ASR-hypothesis, possibly in a misspelled form.\n", - "2. Run the neural model that tags the input characters with correct candidate labels or 0 if no match is found.\n", - "3. Do post-processing to combine results.\n", - "\n", - "![Inference pipeline](images/spellmapper_inference_pipeline.png)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OeJpsMwslmrd" - }, - "source": [ - "## N-gram mappings\n", - "Note that n-gram mappings vocabulary had been collected from a large corpus and is supplied with the model. It is supposed to be \"universal\" for English language.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uH6p0mOd12pi" - }, - "source": [ - "Let's see what n-gram mappings are like, for example, for an n-gram `l u c`.\n", - "Note that n-grams in `replacement_vocab_filt.txt` preserve one-to-one correspondence between original letters and misspelled fragments (this additional markup is handled during loading). \n", - "* `+` means that adjacent letters are concatenated and correspond to a single source letter. \n", - "* `` means that the original letter is deleted. \n", - "This auxiliary markup will be removed automatically during loading.\n", - "\n", - "`_` is used instead of real space symbol.\n", - "\n", - "Last three columns are:\n", - "* joint frequency\n", - "* frequency of original n-gram\n", - "* frequency of misspelled n-gram\n", - "\n", - "$$\\frac{JointFrequency}{SourceFrequency}=TranslationProbability$$\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qul163dB1sKp" - }, - "outputs": [], - "source": [ - "!awk 'BEGIN {FS=\"\\t\"} ($1==\"l u c\"){print $0}' < spellmapper_asr_customization_en/replacement_vocab_filt.txt | sort -t$'\\t' -k3nr" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eWxcrVWZ3Pfq" - }, - "source": [ - "Now we read n-gram mappings from the file. Parameter `max_misspelled_freq` controls maximum frequency of misspelled n-grams. N-grams more frequent than that are put in the list of banned n-grams and won't be used in indexing." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WHKhE945-N7o" - }, - "outputs": [], - "source": [ - "print(\"load n-gram mappings...\")\n", - "ngram_mapping_vocab, ban_ngram = load_ngram_mappings(\"spellmapper_asr_customization_en/replacement_vocab_filt.txt\", max_misspelled_freq=125000)\n", - "# CAUTION: entries in ban_ngram end with a space and can contain \"+\" \"=\"\n", - "print(\"Size of ngram mapping vocabulary:\", len(ngram_mapping_vocab))\n", - "print(\"Size of banned ngrams:\", len(ban_ngram))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "49IcMBfllvXN" - }, - "source": [ - "## Indexing of custom vocabulary" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "b1K6paeee2Iu" - }, - "source": [ - "As we mentioned earlier, this model pipeline is intended to work with custom vocabularies of up to several thousand entries. Since the whole medical vocabulary contains 110k entries, we restrict our custom vocabulary to the 5000+ terms that occurred in the given corpus of abstracts.\n", - "\n", - "The goal of indexing our custom vocabulary is to build an index where the key is a letter n-gram and the value is the whole phrase. The keys are n-grams in the given user phrase and their misspelled variants taken from our collection of n-gram mappings (see Index of custom vocabulary in Fig. 1).\n", - "\n", - "*Though it is possible to index and search the whole 110k vocabulary, it would require additional optimizations and is beyond the scope of this tutorial.*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xWb0jGqw6Woi" - }, - "outputs": [], - "source": [ - "custom_phrases = []\n", - "for phrase in medical_vocabulary:\n", - " if phrase not in found_singleword and phrase not in found_multiword:\n", - " continue\n", - " custom_phrases.append(\" \".join(list(phrase.replace(\" \", \"_\"))))\n", - "print(\"Size of customization vocabulary:\", len(custom_phrases))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UHWor5pD2Eyb" - }, - "source": [ - "Now we build the index for our custom phrases.\n", - "\n", - "The parameter `min_log_prob` controls the minimum log probability, below which we stop growing the n-gram.\n", - "\n", - "The parameter `max_phrases_per_ngram` controls the maximum number of phrases that can be indexed by one n-gram. 
N-grams exceeding this limit are also banned and not used in indexing.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "hs4RDXj0-xW9" - }, - "outputs": [], - "source": [ - "phrases, ngram2phrases = get_index(custom_phrases, ngram_mapping_vocab, ban_ngram, min_log_prob=-4.0, max_phrases_per_ngram=600)\n", - "print(\"Size of phrases:\", len(phrases))\n", - "print(\"Size of ngram2phrases:\", len(ngram2phrases))\n", - "\n", - "# Save the index to a file - later we will use it in another script\n", - "with open(\"index.txt\", \"w\", encoding=\"utf-8\") as out:\n", - " for ngram in ngram2phrases:\n", - " for phrase_id, begin, size, logprob in ngram2phrases[ngram]:\n", - " phrase = phrases[phrase_id]\n", - " out.write(ngram + \"\\t\" + phrase + \"\\t\" + str(begin) + \"\\t\" + str(size) + \"\\t\" + str(logprob) + \"\\n\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RV1sdQ9rvar8" - }, - "source": [ - "## Small detailed example\n", - "\n", - "Let's consider, for example, one custom phrase `thoracic aorta` and an incorrect ASR-hypothesis `the tarasic oorda is a part of the aorta located in the thorax`, containing the misspelled phrase `tarasic_oorda`. \n", - "\n", - "We will see \n", - "1. How this custom phrase is indexed.\n", - "2. How candidate retrieval works, given an ASR-hypothesis.\n", - "3. How inference and post-processing work.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kGBTTJXixnrG" - }, - "source": [ - "### N-grams in index" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ryfUlqNMl4vQ" - }, - "source": [ - "Let's look, for example, at the n-grams by which the custom phrase `thoracic aorta` is indexed. \n", - "Columns: \n", - "1. n-gram\n", - "2. beginning position in the phrase\n", - "3. length\n", - "4. log probability\n", - "\n", - "Note that many n-grams are not from the n-gram mappings file. Those are derived by growing previous n-grams with new replacements; in this case the log probabilities are summed up. Growing stops when the minimum log probability is exceeded.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "x0ZVsXGBo8pt" - }, - "outputs": [], - "source": [ - "for ngram in ngram2phrases:\n", - " for phrase_id, b, length, lprob in ngram2phrases[ngram]:\n", - " if phrases[phrase_id] == \"t h o r a c i c _ a o r t a\":\n", - " print(ngram.ljust(16) + \"\\t\" + str(b).rjust(4) + \"\\t\" + str(length).rjust(4) + \"\\t\" + str(lprob))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "20ov23ze4xeQ" - }, - "source": [ - "### Candidate retrieval\n", - "The candidate retrieval tasks are:\n", - " - Given an input sentence and an index of the custom vocabulary, find all n-grams from the index matching the sentence. \n", - " - Find which sentence fragments and which custom phrases have the most \"hits\" - these are the potential candidates.\n", - " - Find the approximate starting position for each candidate phrase. \n", - "\n", - "\n", - "Let's look at the hits that the phrase \"thoracic aorta\" gets by searching all n-grams in the input text. We can see some hits in different parts of the sentence, but a moving window can find the fragment with the most hits."
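A minimal sketch of that moving-window idea (illustrative only; the real logic, including smoothing, lives in `get_all_candidates_coverage()`): given per-letter hit indicators for one phrase, a sliding window of the phrase's length locates the densest fragment.

```python
import numpy as np

hits = np.array([0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0])  # toy per-letter hits
phrase_len = 5  # length of the candidate phrase in letters

# Sum of hits in every window of phrase_len consecutive positions.
window_sums = np.convolve(hits, np.ones(phrase_len, dtype=int), mode="valid")
best_start = int(window_sums.argmax())           # approximate start position
coverage = window_sums[best_start] / phrase_len  # unsmoothed hit percentage
print(best_start, coverage)  # 2 0.8
```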
- { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "t_rhKQ3Xqa8A" - }, - "outputs": [], - "source": [ - "sent = \"the_tarasic_oorda_is_a_part_of_the_aorta_located_in_the_thorax\"\n", - "phrases2positions, position2ngrams = search_in_index(ngram2phrases, phrases, sent)\n", - "print(\" \".join(list(sent)))\n", - "print(\" \".join(list(map(str, phrases2positions[phrases.index(\"t h o r a c i c _ a o r t a\")].astype(int)))))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "orkRapbjF4aZ" - }, - "source": [ - "`phrases2positions` is a matrix of size (len(phrases), len(ASR_hypothesis)).\n", - "It is filled with 1.0 (a hit) at each intersection of a letter n-gram and a phrase indexed by that n-gram, and with 0.0 elsewhere.\n", - "It is used to find phrases with many hits within a contiguous window - potential matching candidates.\n", - "\n", - "`position2ngrams` is a list of sets of n-grams. The list index is the starting position in the ASR-hypothesis.\n", - "It is used later to check how well each found candidate is covered by n-grams (to avoid cases where some repeating n-gram gives many hits to a phrase, but the phrase itself is not well covered)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JF7u4_iiHLyI" - }, - "outputs": [], - "source": [ - "candidate2coverage, candidate2position = get_all_candidates_coverage(phrases, phrases2positions)\n", - "print(\"Coverage=\", candidate2coverage[phrases.index(\"t h o r a c i c _ a o r t a\")])\n", - "print(\"Starting position=\", candidate2position[phrases.index(\"t h o r a c i c _ a o r t a\")])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "45mvKg8ZyNbr" - }, - "source": [ - "`candidate2coverage` is a list of size len(phrases) containing the coverage (0.0 to 1.0) of the best window.\n", - "Coverage is a smoothed percentage of hits in a window whose size equals the length of the given phrase.\n", - "\n", - "`candidate2position` is a list of size len(phrases) containing the starting position of the best window.\n", - "\n", - "The starting position is approximate, which is fine: if it does not fall at the beginning of a word, SpellMapper will try to adjust it later. In this particular example we get 5 as the starting position instead of 4, missing the first letter." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Sjyn9I98udL9" - }, - "source": [ - "### Inference\n", - "\n", - "Now let's generate input for SpellMapper inference. \n", - "An input line should consist of 4 tab-separated columns:\n", - " - text of the ASR-hypothesis\n", - " - texts of 10 candidates separated by semicolons\n", - " - 1-based ids of non-dummy candidates\n", - " - approximate start/end coordinates of non-dummy candidates (corresponding to the ids)\n", - "Note that candidate retrieval is done inside the function `get_candidates`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cJnusVfBRhRX" - }, - "outputs": [], - "source": [ - "out = open(\"spellmapper_input.txt\", \"w\", encoding=\"utf-8\")\n", - "letters = list(sent)\n", - "candidates = get_candidates(ngram2phrases, phrases, letters, big_sample)\n", - "# We add two columns with targets and span_info.
\n", - "# They have same format as during training, but start and end positions are APPROXIMATE, they will be adjusted when constructing BertExample.\n", - "targets = []\n", - "span_info = []\n", - "for idx, c in enumerate(candidates):\n", - " if c[1] == -1:\n", - " continue\n", - " targets.append(str(idx + 1)) # targets are 1-based\n", - " start = c[1]\n", - " end = min(c[1] + c[2], len(letters)) # ensure that end is not outside sentence length (it can happen because c[2] is candidate length used as approximation)\n", - " span_info.append(\"CUSTOM \" + str(start) + \" \" + str(end))\n", - "\n", - "out.write(\" \".join(letters) + \"\\t\" + \";\".join([x[0] for x in candidates]) + \"\\t\" + \" \".join(targets) + \"\\t\" + \";\".join(span_info) + \"\\n\")\n", - "out.close()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Qpei5o89SmaU" - }, - "outputs": [], - "source": [ - "!cat spellmapper_input.txt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9rAmO15SS6go" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \\\n", - " pretrained_model=spellmapper_asr_customization_en/training_10m_5ep.nemo \\\n", - " model.max_sequence_len=512 \\\n", - " inference.from_file=spellmapper_input.txt \\\n", - " inference.out_file=spellmapper_output.txt \\\n", - " inference.batch_size=16 \\\n", - " lang=en\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wd2aq4T1N5cs" - }, - "source": [ - "Each line in SpellMapper output is tab-separated and consists of 4 columns:\n", - "1. ASR-hypothesis (same as in input)\n", - "2. 10 candidates separated with semicolon (same as in input)\n", - "3. fragment predictions, separated with semicolon, each prediction is a tuple (start, end, candidate_id, probability)\n", - "4. letter predictions - candidate_id predicted for each letter (this is only for debug purposes)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ravgEX8cTFty" - }, - "outputs": [], - "source": [ - "!cat spellmapper_output.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "az26364-PHb2" - }, - "source": [ - "We can use some utility functions to apply found replacements and get actual corrected text." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lPtFa_EhK8pb" - }, - "outputs": [], - "source": [ - "spellmapper_results = read_spellmapper_predictions(\"spellmapper_output.txt\")\n", - "text, replacements, _ = spellmapper_results[0]\n", - "corrected_text = apply_replacements_to_text(text, replacements, replace_hyphen_to_space=False)\n", - "print(\"Text before correction:\\n\", text)\n", - "print(\"Text after correction:\\n\", corrected_text)\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "efF7O-D91FLX" - }, - "source": [ - "# Bigger customization example\n", - "\n", - "Let's test customization on more data. 
The plan is:\n", - " * Get baseline ASR transcriptions by running TTS + ASR on some medical paper abstracts.\n", - " * Run SpellMapper inference and show how it can improve ASR results using the custom vocabulary.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "r_EFPnyDcXZt" - }, - "source": [ - "## Run TTS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "i9F5SBhmr8rk" - }, - "outputs": [], - "source": [ - "# create a fresh folder for wav files (TTS output); -f avoids an error if the folder does not exist yet\n", - "!rm -rf audio\n", - "!mkdir audio" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JMbkNVt7YBAO" - }, - "outputs": [], - "source": [ - "if torch.cuda.is_available():\n", - " device = \"cuda\"\n", - "else:\n", - " device = \"cpu\"\n", - "\n", - "# Load FastPitch from HuggingFace\n", - "spectrogram_generator = FastPitchModel.from_pretrained(\"nvidia/tts_en_fastpitch\").eval().to(device)\n", - "# Load HifiGan vocoder from HuggingFace\n", - "vocoder = HifiGanModel.from_pretrained(model_name=\"nvidia/tts_hifigan\").eval().to(device)\n", - "\n", - "# Write sentences that we want to feed to TTS\n", - "with open(\"tts_input.txt\", \"w\", encoding=\"utf-8\") as out:\n", - " for sent, _ in sentences[0:100]:\n", - " out.write(sent + \"\\n\")\n", - "\n", - "out_manifest = open(\"manifest.json\", \"w\", encoding=\"utf-8\")\n", - "i = 0\n", - "with open(\"tts_input.txt\", \"r\", encoding=\"utf-8\") as inp:\n", - " for line in inp:\n", - " text = line.strip()\n", - " text_clean = CHARS_TO_IGNORE_REGEX.sub(\" \", text).lower() # replace all punctuation with spaces and convert to lowercase\n", - " text_clean = \" \".join(text_clean.split())\n", - "\n", - " parsed = spectrogram_generator.parse(text, normalize=True)\n", - "\n", - " spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)\n", - " audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)\n", - "\n", - " # Note that the vocoder returns a batch of audio. In this example, we just take the first and only sample.\n", - " # These TTS models generate audio at 22050 Hz, so we save the wav file with that sample rate.\n", - " filename = \"audio/\" + str(i) + \".wav\"\n", - " sf.write(filename, audio.to('cpu').detach().numpy()[0], 22050)\n", - " out_manifest.write(\n", - " \"{\\\"audio_filepath\\\": \\\"\" + filename + \"\\\", \\\"text\\\": \\\"\" + text_clean + \"\\\", \\\"orig_text\\\": \\\"\" + text + \"\\\"}\\n\"\n", - " )\n", - " i += 1\n", - "\n", - " # display some examples\n", - " if i < 10:\n", - " print(f'\"{text}\"\\n')\n", - " ipd.display(ipd.Audio(audio.to('cpu').detach(), rate=22050))\n", - "\n", - "out_manifest.close()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9T3CZcCAmxCz" - }, - "source": [ - "Now we have a folder with generated audio files `audio/*.wav` and a NeMo manifest with json records like `{\"audio_filepath\": \"audio/0.wav\", \"text\": \"no renal auditory or vestibular toxicity was observed\", \"orig_text\": \"No renal, auditory, or vestibular toxicity was observed.\"}`.", - "\n", - "Note that the TTS model may mispronounce some unknown words, for example abbreviations like `tRNAs`."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pR_T1HnttVjm" - }, - "outputs": [], - "source": [ - "# sanity check: make sure that each manifest line is valid JSON\n", - "lines = []\n", - "with open(\"manifest.json\", \"r\", encoding=\"utf-8\") as f:\n", - " lines = f.readlines()\n", - "\n", - "for line in lines:\n", - " try:\n", - " data = json.loads(line.strip())\n", - " except json.JSONDecodeError:\n", - " print(line)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bt2TMLLvdUHm" - }, - "source": [ - "Free GPU memory to avoid OOM." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZwEpAOCaRH7s" - }, - "outputs": [], - "source": [ - "del spectrogram_generator\n", - "del vocoder\n", - "torch.cuda.empty_cache()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HrensakWdLkt" - }, - "source": [ - "## Run baseline ASR" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IQNIo2M_mqJc" - }, - "source": [ - "Next we transcribe our .wav files with a general domain [ASR model](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_conformer_ctc_large). It will generate an output file `ctc_baseline_transcript.json` where the predicted transcriptions are stored in the field `pred_text` of each record.\n", - "\n", - "Note that this ASR model was not trained or fine-tuned on the medical domain, so we expect it to make mistakes on medical terms." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NMN63ux1mJiG" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/transcribe_speech.py \\\n", - " pretrained_name=\"stt_en_conformer_ctc_large\" \\\n", - " dataset_manifest=manifest.json \\\n", - " output_filename=ctc_baseline_transcript_tmp.json \\\n", - " batch_size=2" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L3swQ8uqqgnp" - }, - "source": [ - "ATTENTION: SpellMapper relies on words being separated by a _single_ space.\n", - "\n", - "There is a known issue with multiple consecutive spaces in ASR results produced by Conformer-CTC, probably connected to this issue: https://github.com/NVIDIA/NeMo/issues/4034.\n", - "\n", - "So we need to correct the manifests to ensure that all spaces are single." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "z17sxkmXrXpJ" - }, - "outputs": [], - "source": [ - "test_data = read_manifest(\"ctc_baseline_transcript_tmp.json\")\n", - "\n", - "for i in range(len(test_data)):\n", - " # if there are multiple spaces in the string they will be merged into one\n", - " test_data[i][\"pred_text\"] = \" \".join(test_data[i][\"pred_text\"].split())\n", - "\n", - "with open(\"ctc_baseline_transcript.json\", \"w\", encoding=\"utf-8\") as out:\n", - " for d in test_data:\n", - " line = json.dumps(d)\n", - " out.write(line + \"\\n\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PuKtfhbVkVJY" - }, - "outputs": [], - "source": [ - "!head -n 4 ctc_baseline_transcript.json" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aCJw9NEXqRg8" - }, - "source": [ - "### Calculating WER of the baseline transcript\n", - "We use the standard script from NeMo to calculate the WER and CER of our baseline transcript. Internally it compares the text in `pred_text` (predicted transcript) to `text` (reference transcript)."
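- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Before running the scoring script, here is a minimal sketch of the metric itself, for intuition only: WER is the word-level edit distance (substitutions, insertions, and deletions) divided by the number of reference words. The helper name `word_error_rate_sketch` is made up for this tutorial; the NeMo script below is what we actually rely on." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Minimal word-level WER sketch: Levenshtein distance between word sequences,\n", - "# divided by the length of the reference.\n", - "def word_error_rate_sketch(ref, hyp):\n", - "    r, h = ref.split(), hyp.split()\n", - "    # dp[i][j] = edit distance between r[:i] and h[:j]\n", - "    dp = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]\n", - "    for i in range(len(r) + 1):\n", - "        dp[i][0] = i\n", - "    for j in range(len(h) + 1):\n", - "        dp[0][j] = j\n", - "    for i in range(1, len(r) + 1):\n", - "        for j in range(1, len(h) + 1):\n", - "            cost = 0 if r[i - 1] == h[j - 1] else 1\n", - "            dp[i][j] = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + cost)\n", - "    return dp[-1][-1] / max(len(r), 1)\n", - "\n", - "print(word_error_rate_sketch(\"the thoracic aorta is large\", \"the tarasic oorda is large\"))  # 0.4: two of five words differ" - ] - },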
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZmNEGVWQsGo2" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/speech_to_text_eval.py \\\n", - " dataset_manifest=ctc_baseline_transcript.json \\\n", - " only_score_manifest=True\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AvPwJr0ZqdkN" - }, - "source": [ - "### See fragments that differ\n", - "We use SequenceMatcher to see fragments that differ. (Another option is to use a more powerful analytics tool [Speech Data Explorer](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/tools/speech_data_explorer.html))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "RAeaVCpMv78y" - }, - "outputs": [], - "source": [ - "test_data = read_manifest(\"ctc_baseline_transcript.json\")\n", - "pred_text = [data['pred_text'] for data in test_data]\n", - "ref_text = [data['text'] for data in test_data]\n", - "audio_filepath = [data['audio_filepath'] for data in test_data]\n", - "\n", - "diff_vocab = Counter()\n", - "\n", - "for i in range(len(test_data)):\n", - " ref_sent = \" \" + ref_text[i] + \" \"\n", - " pred_sent = \" \" + pred_text[i] + \" \"\n", - "\n", - " pred_words = pred_sent.strip().split()\n", - " ref_words = ref_sent.strip().split()\n", - "\n", - " for tag, hyp_fragment, ref_fragment, i1, i2, j1, j2 in get_fragments(pred_words, ref_words):\n", - " if tag != \"equal\":\n", - " diff_vocab[(tag, hyp_fragment, ref_fragment)] += 1\n", - "\n", - "sum_ = 0\n", - "print(\"PRED vs REF\")\n", - "for k, v in diff_vocab.most_common(1000000):\n", - " sum_ += v\n", - " print(k, v, \"sum=\", sum_)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dUSOF7iD1w_9" - }, - "source": [ - "## Run SpellMapper" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "x39BQhYB6_Fr" - }, - "source": [ - "Now we run retrieval on our input manifest and prepare input for SpellMapper inference. Note that we use index of custom vocabulary (file `index.txt` that we saved earlier)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y8x-yT5WqfFz" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py \\\n", - " --manifest ctc_baseline_transcript.json \\\n", - " --custom_vocab_index index.txt \\\n", - " --big_sample spellmapper_asr_customization_en/big_sample.txt \\\n", - " --short2full_name short2full.txt \\\n", - " --output_name spellmapper_input.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ueq_JAPWGs_Y" - }, - "source": [ - "Run the inference." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zgkqiiZtJjcB" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \\\n", - " pretrained_model=spellmapper_asr_customization_en/training_10m_5ep.nemo \\\n", - " model.max_sequence_len=512 \\\n", - " inference.from_file=spellmapper_input.txt \\\n", - " inference.out_file=spellmapper_output.txt \\\n", - " inference.batch_size=16 \\\n", - " lang=en\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RPQWJX8dFLfX" - }, - "source": [ - "Now we postprocess SpellMapper output and create output corrected manifest." 
- { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3eFU515yKvXP" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py \\\n", - " --input_manifest ctc_baseline_transcript.json \\\n", - " --short2full_name short2full.txt \\\n", - " --output_manifest ctc_corrected_transcript.json \\\n", - " --spellmapper_result spellmapper_output.txt \\\n", - " --replace_hyphen_to_space \\\n", - " --field_name pred_text \\\n", - " --ngram_mappings \"\"\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hRoIhhGh17tp" - }, - "source": [ - "### Calculating WER of the corrected transcript" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qIT957bGo9AY" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/speech_to_text_eval.py \\\n", - " dataset_manifest=ctc_corrected_transcript.json \\\n", - " only_score_manifest=True\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NYXIPusupqOQ" - }, - "outputs": [], - "source": [ - "test_data = read_manifest(\"ctc_corrected_transcript.json\")\n", - "pred_text = [data['pred_text'] for data in test_data]\n", - "ref_text = [data['pred_text_before_correction'] for data in test_data]\n", - "\n", - "diff_vocab = Counter()\n", - "\n", - "for i in range(len(test_data)):\n", - " ref_sent = \" \" + ref_text[i] + \" \"\n", - " pred_sent = \" \" + pred_text[i] + \" \"\n", - "\n", - " pred_words = pred_sent.strip().split()\n", - " ref_words = ref_sent.strip().split()\n", - "\n", - " for tag, hyp_fragment, ref_fragment, i1, i2, j1, j2 in get_fragments(pred_words, ref_words):\n", - " if tag != \"equal\":\n", - " diff_vocab[(tag, hyp_fragment, ref_fragment)] += 1\n", - "\n", - "sum_ = 0\n", - "print(\"Corrected vs baseline\")\n", - "for k, v in diff_vocab.most_common(1000000):\n", - " sum_ += v\n", - " print(k, v, \"sum=\", sum_)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DJtXlqXbTD6M" - }, - "source": [ - "### Filtering by Dynamic Programming (DP) score\n", - "\n", - "What else can be done?\n", - "Given a fragment and its potential replacement, we can apply **dynamic programming** to find the most probable \"translation\" path between them. We use the same n-gram mapping vocabulary, because its frequencies give us the \"translation probability\" of each n-gram pair. The final path score is calculated as the maximum sum of log probabilities of matching n-grams along this path.\n", - "Let's look at an example. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "05Qf9wgHU_UR" - }, - "outputs": [], - "source": [ - "joint_vocab, orig_vocab, misspelled_vocab, max_len = load_ngram_mappings_for_dp(\"spellmapper_asr_customization_en/replacement_vocab_filt.txt\")\n", - "\n", - "fragment = \"and hydrod\"\n", - "replacement = \"anhydride\"\n", - "fragment_spaced = \" \".join(list(fragment.replace(\" \", \"_\")))\n", - "replacement_spaced = \" \".join(list(replacement.replace(\" \", \"_\")))\n", - "path = get_alignment_by_dp(\n", - " replacement_spaced,\n", - " fragment_spaced,\n", - " dp_data=(joint_vocab, orig_vocab, misspelled_vocab, max_len)\n", - ")\n", - "print(\"Dynamic Programming path:\")\n", - "for fragment_ngram, replacement_ngram, score, sum_score, joint_freq, orig_freq, misspelled_freq in path:\n", - " print(\n", - " \"\\t\",\n", - " \"frag=\",\n", - " fragment_ngram,\n", - " \"; repl=\",\n", - " replacement_ngram,\n", - " \"; score=\",\n", - " score,\n", - " \"; sum_score=\",\n", - " sum_score,\n", - " \"; joint_freq=\",\n", - " joint_freq,\n", - " \"; orig_freq=\",\n", - " orig_freq,\n", - " \"; misspelled_freq=\",\n", - " misspelled_freq,\n", - " )\n", - "\n", - "print(\"Final path score is in path[-1][3]: \", path[-1][3])\n", - "print(\"Dynamic programming(DP) score per symbol is final score divided by len(fragment): \", path[-1][3] / (len(fragment)))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hgfKPKckaLnc" - }, - "source": [ - "The idea is that we can skip replacements whose average DP score per symbol is below some predefined minimum, say -1.5.\n", - "Note that dynamic programming works slow because of quadratic complexity, but it allows to get rid of some false positives. Let's apply it on the same test set." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UhSXh7ht_JRn" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py \\\n", - " --input_manifest ctc_baseline_transcript.json \\\n", - " --short2full_name short2full.txt \\\n", - " --output_manifest ctc_corrected_transcript_dp.json \\\n", - " --spellmapper_result spellmapper_output.txt \\\n", - " --replace_hyphen_to_space \\\n", - " --field_name pred_text \\\n", - " --use_dp \\\n", - " --ngram_mappings spellmapper_asr_customization_en/replacement_vocab_filt.txt \\\n", - " --min_dp_score_per_symbol -1.5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "u8R5YHB3vPC8" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/speech_to_text_eval.py \\\n", - " dataset_manifest=ctc_corrected_transcript_dp.json \\\n", - " only_score_manifest=True" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "upvTbkFAeYtR" - }, - "source": [ - "# Final notes\n", - "1. Bash-script with example of inference pipeline [run_infer.sh](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/spellchecking_asr_customization/run_infer.sh)\n", - "\n", - "2. Check our paper: [SpellMapper: A non-autoregressive neural spellchecker for ASR customization with candidate retrieval based on n-gram mappings](https://arxiv.org/abs/2306.02317)\n", - "\n", - "3. 
To reproduce the evaluation experiments from this paper, see these scripts:\n", - " - [test_on_kensho.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", - " - [test_on_userlibri.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_userlibri.sh)\n", - " - [test_on_spoken_wikipedia.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_spoken_wikipedia.sh)\n", - "\n", - "4. To reproduce the creation of the training data, see [README.md](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/README.md)\n", - "\n", - "5. To run training, see [run_training.sh](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/spellchecking_asr_customization/run_training.sh)\n", - "\n", - "6. Promising future research directions include:\n", - " - add a simple trainable classifier on top of SpellMapper predictions instead of using multiple thresholds\n", - " - retrain after adding more diverse false positives to the training data" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -}