@@ -2,7 +2,6 @@
 import math
 import os
 from contextlib import nullcontext
-from functools import reduce
 from typing import Optional
 
 import torch
@@ -13,7 +12,6 @@
 from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
 from internlm.core.parallel.comm.cpu_offload import get_cpu_offload_context
-from internlm.core.parallel.shard import partition_uniform
 from internlm.model.model_implementations.transformers.base_model import (
     BaseTransformerModel,
 )
@@ -31,7 +29,6 @@
 from internlm.model.model_ops.utils import (
     convert_attn_args_to_kwargs,
     convert_attn_kwargs_to_args,
-    get_parallel_size_from_file,
 )
 from internlm.solver.activation_checkpoint import activation_checkpoint
 from internlm.utils.logger import get_logger
@@ -636,196 +633,6 @@ def load_hf_weights(folder: str, model: nn.Module) -> None:
 
     internlm_accelerator.empty_cache()
 
-    @staticmethod
-    def load_internlm2_with_dynamic_parallel_size(folder, model):
-        """Load InternLM2 with dynamic parallel size."""
-        assert folder is not None, "Please specify the folder of the pretrained model"
-        assert gpc.config.model_type in ["INTERNLM2"], "dynamic_parallel is only for INTERNLM2"
-
-        fns = get_fns(folder)
-        if gpc.is_rank_for_log():
-            logger.info(f"Loading pretrained model from {folder}")
-        model_fns, old_tp, old_pp = get_parallel_size_from_file(fns)  # pylint: disable=W0612
-
-        tp = gpc.get_world_size(ParallelMode.TENSOR)
-        tp_rank = gpc.get_local_rank(ParallelMode.TENSOR)
-        assert old_tp % tp == 0 or tp % old_tp == 0, (
-            f"Expected TP size in loaded checkpoint to be fit with TP size in current config, but got {old_tp} in "
-            f"checkpoint and {tp} in current config"
-        )
-
-        correspond_tps = []
-
-        if old_tp <= tp:
-            correspond_tps.append(tp_rank // (tp // old_tp))
-            ratio = tp // old_tp
-            rank = tp_rank % ratio
-        else:
-            for i in range(old_tp // tp):
-                correspond_tps.append(tp_rank * (old_tp // tp) + i)
-            rank = 0
-            ratio = 1
-
-        current_states = {}
-
-        pp = gpc.get_world_size(ParallelMode.PIPELINE)  # noqa: F841 # pylint: disable=W0612
-
-        assert gpc.config.model.num_chunks == 1, "May cause future collisions, ignore this if necessary"
-
-        old_pp_partition = partition_uniform(gpc.config.model.num_layers, old_pp, 1)
-
-        for idx, parts in enumerate(old_pp_partition):
-            start, end = parts[0]
-            if model.last_layer <= start or model.first_layer >= end:
-                continue
-            tmp_states = {}
-
-            for correspond_tp in correspond_tps:
-                model_name = f"model_tp{correspond_tp}_pp{idx}.pt"
-                states = llm_load(os.path.join(folder, model_name), map_location="cpu")
-                states = {k.replace("model.", ""): v for k, v in states.items()}
-                for i in range(start, end):
-                    if i >= model.last_layer:
-                        break
-                    if i < model.first_layer:
-                        continue
-
-                    for name in list(states.keys()):
-                        if f".{i-start}." in name:
-                            to_name = name.replace(f".{i-start}.", f".{i-model.first_layer}.")
-
-                            if gpc.config.model_type == "INTERNLM2":
-                                if "norm" in name:
-                                    tmp_states[to_name] = [states.pop(name)]
-                                elif any(x in name for x in ("wo", "w2")):
-                                    tmp_states[to_name] = tmp_states.get(to_name, [])
-                                    tmp_states[to_name].append(states.pop(name).chunk(ratio, dim=1)[rank])
-                                elif any(x in name for x in ("w1", "w3")):
-                                    tmp_states[to_name] = tmp_states.get(to_name, [])
-                                    tmp_states[to_name].append(states.pop(name).chunk(ratio, dim=0)[rank])
-                                elif any(x in name for x in ("wqkv",)):
-                                    tmp_states[to_name] = tmp_states.get(to_name, [])
-                                    if tp > gpc.config.model.num_kv_attention_heads:
-                                        assert old_tp <= gpc.config.model.num_kv_attention_heads, (
-                                            f"`old_tp ({old_tp}) => tp ({tp})` is not supported. "
-                                            "At least one of `tp` and `old_tp` should be less than or "
-                                            "equal to `num_kv_attention_heads`"
-                                        )
-                                        # Suitable for cases where the num_kv_attention_head is small,
-                                        # but you want to have a large TP Size
-                                        q_per_kv = (
-                                            gpc.config.model.num_attention_heads
-                                            // gpc.config.model.num_kv_attention_heads
-                                        )
-                                        head_dim = gpc.config.model.hidden_size // gpc.config.model.num_attention_heads
-                                        index = torch.concat(
-                                            (
-                                                torch.arange(q_per_kv).chunk(ratio, dim=0)[tp_rank % ratio],
-                                                torch.tensor([q_per_kv, q_per_kv + 1]),
-                                            )
-                                        )
-                                        index = index + (q_per_kv + 2) * (tp_rank // ratio)
-                                        index = index % (
-                                            (q_per_kv + 2) * (gpc.config.model.num_kv_attention_heads / old_tp)
-                                        )
-                                        index = index * head_dim
-                                        index = index.repeat_interleave(head_dim) + torch.arange(head_dim).repeat(
-                                            index.shape[0]
-                                        )
-                                        tmp_states[to_name].append(
-                                            torch.index_select(states.pop(name), 0, index.to(torch.int32))
-                                        )
-                                    else:
-                                        tmp_states[to_name].append(states.pop(name).chunk(ratio, dim=0)[rank])
-                                else:
-                                    raise KeyError(f"Unknown key {name}.")
-
-                            else:
-                                assert False, "unsupported model type"
-
-                if "tok_embeddings.weight" in states and model.first_layer == 0:
-                    tmp_states["tok_embeddings.weight"] = tmp_states.get("tok_embeddings.weight", [])
-                    tmp_states["tok_embeddings.weight"].append(
-                        states["tok_embeddings.weight"].chunk(ratio, dim=1)[rank]
-                    )
-                if "output.weight" in states and model.last_layer == gpc.config.model.num_layers:
-                    tmp_states["norm.weight"] = [states["norm.weight"]]
-                    tmp_states["output.weight"] = tmp_states.get("output.weight", [])
-                    tmp_states["output.weight"].append(states["output.weight"].chunk(ratio, dim=0)[rank])
-
-                states = {}
-
-            for name in list(tmp_states.keys()):
-                data = tmp_states.pop(name)
-                if len(data) == 1:
-                    current_states[name] = data[0]
-                else:
-                    current_states[name] = torch.concat(
-                        data, dim=1 if name == "tok_embeddings.weight" or any(x in name for x in ("wo", "w2")) else 0
-                    )
-                # Merge copied kv heads
-                if "wqkv" in name and old_tp > gpc.config.model.num_kv_attention_heads:
-                    assert (
-                        tp <= gpc.config.model.num_kv_attention_heads
-                    ), "new_tp should be less than or equal to num_kv_attention_heads"
-                    head_dim = gpc.config.model.hidden_size // gpc.config.model.num_attention_heads
-                    q_per_kv = gpc.config.model.num_attention_heads // gpc.config.model.num_kv_attention_heads
-                    copied_times = old_tp // gpc.config.model.num_kv_attention_heads
-                    cur_q_per_kv = q_per_kv // copied_times
-
-                    # pylint: disable=all
-                    def duplicate_kv_index(i):
-                        if i % (cur_q_per_kv + 2) >= cur_q_per_kv:
-                            return i
-                        else:
-                            return -100
-
-                    def unique_kv_index(i):
-                        if i // (cur_q_per_kv + 2) == copied_times - 1 or i % (cur_q_per_kv + 2) < cur_q_per_kv:
-                            return i
-                        else:
-                            return -100
-
-                    # pylint: enable=all
-
-                    # Verify
-                    duplicate_index = [duplicate_kv_index(i) for i in range((cur_q_per_kv + 2) * copied_times)]
-                    duplicate_index = [i for i in duplicate_index if i != -100]
-                    duplicate_index = _duplicate_index = torch.tensor(duplicate_index)
-                    for i in range(gpc.config.model.num_kv_attention_heads // tp - 1):
-                        duplicate_index = torch.concat(
-                            (duplicate_index, _duplicate_index + duplicate_index.max() + 1), dim=0
-                        )
-                    duplicate_kv = []
-                    for index in duplicate_index.reshape(-1, copied_times * 2).chunk(copied_times, dim=-1):
-                        index = index.reshape(-1) * head_dim
-                        index = index.repeat_interleave(head_dim) + torch.arange(head_dim).repeat(index.shape[0])
-                        duplicate_kv.append(torch.index_select(current_states[name], 0, index))
-                    assert reduce(
-                        lambda x, y: x and y,
-                        [torch.allclose(duplicate_kv[0], x, atol=1e-5) for x in duplicate_kv[1:]],
-                    ), "Copied kv heads are not equal after training!"
-
-                    # Merge
-                    unique_index = [unique_kv_index(i) for i in range((cur_q_per_kv + 2) * copied_times)]
-                    unique_index = [i for i in unique_index if i != -100]
-                    unique_index = _unique_index = torch.tensor(unique_index)
-                    for i in range(gpc.config.model.num_kv_attention_heads // tp - 1):
-                        unique_index = torch.concat((unique_index, _unique_index + unique_index.max() + 1), dim=0)
-                    unique_index = unique_index * head_dim
-                    unique_index = unique_index.repeat_interleave(head_dim) + torch.arange(head_dim).repeat(
-                        unique_index.shape[0]
-                    )
-                    current_states[name] = torch.index_select(current_states[name], 0, unique_index)
-        missing_keys, unexpected_keys = model.load_state_dict(current_states, strict=False)
-
-        if gpc.get_local_rank(ParallelMode.DATA) == 0:
-            pp_rank = 0 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_local_rank(ParallelMode.PIPELINE)
-            logger.info(
-                f"Missing keys:{missing_keys}, unexpected keys:{unexpected_keys} in "
-                f"tp:{gpc.get_local_rank(ParallelMode.TENSOR)}, pp:{pp_rank}"
-            )
-
     @staticmethod
     def convert_internevo2hf_weights(src: str, tgt: str) -> None:
         model_config = gpc.config.model
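
Note on the deleted helper: when the new TP world size exceeds `num_kv_attention_heads`, `load_internlm2_with_dynamic_parallel_size` re-sharded the fused `wqkv` weight by row selection rather than a plain `chunk`, since a kv group's k/v rows must be copied to every rank that holds a slice of its query heads. The sketch below reproduces that index arithmetic in isolation so it can be checked by eye; the model sizes are made up for the example, and it uses integer division where the deleted branch used `/` before casting the index back to int.

```python
import torch

# Hypothetical sizes for illustration only (not from a real config).
num_attention_heads = 8
num_kv_attention_heads = 2   # GQA: q_per_kv = 4 query heads share one (k, v)
head_dim = 4                 # hidden_size // num_attention_heads
old_tp, tp = 2, 4            # checkpoint saved with TP=2, loaded with TP=4

ratio = tp // old_tp         # number of new shards carved from each old shard
q_per_kv = num_attention_heads // num_kv_attention_heads

for tp_rank in range(tp):
    # Rows of one kv group in the fused weight: [q_0 .. q_{q_per_kv-1}, k, v].
    # Take this rank's slice of the queries plus the shared k and v heads.
    index = torch.concat(
        (
            torch.arange(q_per_kv).chunk(ratio, dim=0)[tp_rank % ratio],
            torch.tensor([q_per_kv, q_per_kv + 1]),
        )
    )
    # Shift into the kv group this rank owns, then wrap back into the old
    # shard, which stores num_kv_attention_heads // old_tp groups.
    index = index + (q_per_kv + 2) * (tp_rank // ratio)
    index = index % ((q_per_kv + 2) * (num_kv_attention_heads // old_tp))
    # Expand head indices into row indices of the weight matrix.
    rows = index * head_dim
    rows = rows.repeat_interleave(head_dim) + torch.arange(head_dim).repeat(rows.shape[0])
    print(f"tp_rank={tp_rank}: heads {index.tolist()} -> rows {rows.tolist()}")
```

Each new rank ends up with a contiguous slice of one kv group's query heads plus a copy of that group's k and v rows, which is what the `torch.index_select` in the deleted branch materialized from the old shard.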
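The reverse direction (checkpoint saved with `old_tp > num_kv_attention_heads`) had to deduplicate those copied k/v rows before loading. Below is a condensed sketch of the two head-level filters (`duplicate_kv_index` / `unique_kv_index`) from the deleted merge step, again with hypothetical sizes:

```python
# Hypothetical sizes: each (k, v) pair was stored twice in the checkpoint.
q_per_kv = 4                             # query heads per kv pair in the model
copied_times = 2                         # old_tp // num_kv_attention_heads
cur_q_per_kv = q_per_kv // copied_times  # query heads per stored copy
heads_per_copy = cur_q_per_kv + 2        # layout per copy: [q .. q, k, v]

# Heads that are duplicates across copies (the k/v rows) -- used to verify
# that the copies are still numerically identical after training.
duplicate_index = [
    i for i in range(heads_per_copy * copied_times)
    if i % heads_per_copy >= cur_q_per_kv
]
# Heads kept after the merge: every query head, plus k/v from the last copy.
unique_index = [
    i for i in range(heads_per_copy * copied_times)
    if i // heads_per_copy == copied_times - 1 or i % heads_per_copy < cur_q_per_kv
]
print(duplicate_index)  # [2, 3, 6, 7]       -> the two stored copies of (k, v)
print(unique_index)     # [0, 1, 4, 5, 6, 7] -> q0, q1, q2, q3, k, v
```

The deleted code expanded these head indices by `head_dim` into row indices, asserted with `torch.allclose` that all stored k/v copies still matched, and then kept only the rows selected by `unique_index`.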