
Commit 9f34e05

fix ci
1 parent 150153f commit 9f34e05

14 files changed: +5213 -2 lines changed
@@ -0,0 +1,4 @@
+from .configuration_internlm2 import InternLM2Config
+from .modeling_internlm2 import InternLM2ForCausalLM
+from .tokenization_internlm2 import InternLM2Tokenizer
+from .tokenization_internlm2_fast import InternLM2TokenizerFast
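
This first hunk is evidently the package `__init__.py`: it re-exports the configuration, model, and tokenizer classes so they can be imported from the package root. A minimal usage sketch, assuming the package directory is importable as `internlm2` (a hypothetical name; the real import path depends on where the repo installs these files) and that the `modeling_internlm2` module added in this commit follows the usual Hugging Face pattern of building a model from a config object:

from internlm2 import InternLM2Config, InternLM2ForCausalLM  # hypothetical package name

# Deliberately tiny configuration so the randomly initialized model is cheap to build.
config = InternLM2Config(
    vocab_size=1000,
    hidden_size=256,
    intermediate_size=512,
    num_hidden_layers=2,
    num_attention_heads=8,
    num_key_value_heads=2,
)
model = InternLM2ForCausalLM(config)  # random weights; no checkpoint is downloaded
print(f"parameters: {sum(p.numel() for p in model.parameters()):,}")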
@@ -0,0 +1,151 @@
+# coding=utf-8
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on transformers/src/transformers/models/llama/configuration_llama.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" InternLM2 model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+INTERNLM2_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+# Modified from transformers.model.llama.configuration_llama.LlamaConfig
+class InternLM2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`InternLM2Model`]. It is used to instantiate
+    an InternLM2 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the InternLM2-7B.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 103168):
+            Vocabulary size of the InternLM2 model. Defines the number of different tokens that can be represented
+            by the `input_ids` passed when calling [`InternLM2Model`].
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings.
+    Example:
+
+    """
+    model_type = "internlm2"
+    _auto_class = "AutoConfig"
+
+    def __init__(  # pylint: disable=W0102
+        self,
+        vocab_size=103168,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        bias=True,
+        rope_theta=10000,
+        rope_scaling=None,
+        attn_implementation="eager",
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.bias = bias
+
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+
+        self.attn_implementation = attn_implementation
+        if self.attn_implementation is None:
+            self.attn_implementation = "eager"
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0:
+            raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")
