Skip to content

Commit fc1a605

Browse files
committed
remove light monitoring address
1 parent 03d6472 commit fc1a605

31 files changed

+92
-197
lines changed

configs/1.8B_MoE16_sft.py

-1
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,6 @@
213213
alert=dict(
214214
enable_feishu_alert=DO_ALERT,
215215
feishu_alert_address=None, # feishu webhook to send alert message
216-
light_monitor_address=None, # light_monitor address to send heartbeat
217216
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
218217
),
219218
tensorboard=dict(

configs/57B_qwen2_MoE.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -220,10 +220,9 @@
220220
alert=dict(
221221
enable_feishu_alert=DO_ALERT,
222222
feishu_alert_address=None, # feishu webhook to send alert message
223-
light_monitor_address=None, # light_monitor address to send heartbeat
224223
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
225224
),
226225
tensorboard=dict(
227226
queue_max_length=10,
228227
),
229-
)
228+
)

configs/7B_MoE4_sft.py

-1
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,6 @@
211211
alert=dict(
212212
enable_feishu_alert=DO_ALERT,
213213
feishu_alert_address=None, # feishu webhook to send alert message
214-
light_monitor_address=None, # light_monitor address to send heartbeat
215214
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
216215
),
217216
tensorboard=dict(

configs/7B_baichuan2.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
CHECKPOINT_EVERY = 50
2323
ckpt = dict(
2424
enable_save_ckpt=False, # enable ckpt save.
25-
enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format.
25+
enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format.
2626
save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
2727
# 'load_ckpt_info' setting guide:
2828
# 1. the 'path' indicates the ckpt path,
@@ -196,7 +196,6 @@
196196
alert=dict(
197197
enable_feishu_alert=DO_ALERT,
198198
feishu_alert_address=None, # feishu webhook to send alert message
199-
light_monitor_address=None, # light_monitor address to send heartbeat
200199
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
201200
),
202201
tensorboard=dict(

configs/7B_gemma.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
CHECKPOINT_EVERY = 50
2525
ckpt = dict(
2626
enable_save_ckpt=False, # enable ckpt save.
27-
enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format.
27+
enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format.
2828
save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
2929
# 'load_ckpt_info' setting guide:
3030
# 1. the 'path' indicates the ckpt path,
@@ -203,7 +203,6 @@
203203
alert=dict(
204204
enable_feishu_alert=DO_ALERT,
205205
feishu_alert_address=None, # feishu webhook to send alert message
206-
light_monitor_address=None, # light_monitor address to send heartbeat
207206
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
208207
),
209208
tensorboard=dict(

configs/7B_internlm2.py

-1
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,6 @@
194194
alert=dict(
195195
enable_feishu_alert=DO_ALERT,
196196
feishu_alert_address=None, # feishu webhook to send alert message
197-
light_monitor_address=None, # light_monitor address to send heartbeat
198197
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
199198
),
200199
tensorboard=dict(

configs/7B_isp_sft.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
# 'load_ckpt_info' setting guide:
3232
# 1. the 'path' indicates the ckpt path,
3333
# 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
34-
# 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined
34+
# 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined
3535
# load function such as "llama"
3636
load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internevo"),
3737
# 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
@@ -188,17 +188,17 @@
188188
2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
189189
sequence_2D (dict):
190190
1. enable: bool, whether enable the 2D sequence parallel or not.
191-
2. head_size: int, the parallel degree of head parallelism (DeepSpeed Ulysses).
191+
2. head_size: int, the parallel degree of head parallelism (DeepSpeed Ulysses).
192192
head_size * context_size should be equal to the tensor size.
193193
3. context_size: int, the parallel degree of context parallelism.
194194
head_size * context_size should be equal to the tensor size.
195195
4. window_size: int, the sliding window size in context parallelism.
196196
5. device_placement_strategy: dict,
197-
head_first: bool, if `True`, ranks of the same head parallel group are
197+
head_first: bool, if `True`, ranks of the same head parallel group are
198198
given high priority for colocation on the same node;
199199
if `False`, ranks of the same context parallel group are
200200
given high priority for colocation on the same node;
201-
interleaved: bool, if `head_first` is `False` and `window_size` > 1, this config could
201+
interleaved: bool, if `head_first` is `False` and `window_size` > 1, this config could
202202
interleave the ranks in the same window to make full use of the NIC as much as possible.
203203
"""
204204
parallel = dict(
@@ -223,7 +223,6 @@
223223
alert=dict(
224224
enable_feishu_alert=DO_ALERT,
225225
feishu_alert_address=None, # feishu webhook to send alert message
226-
light_monitor_address=None, # light_monitor address to send heartbeat
227226
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
228227
),
229228
tensorboard=dict(

configs/7B_llama2.py

-1
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,6 @@
195195
alert=dict(
196196
enable_feishu_alert=DO_ALERT,
197197
feishu_alert_address=None, # feishu webhook to send alert message
198-
light_monitor_address=None, # light_monitor address to send heartbeat
199198
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
200199
),
201200
tensorboard=dict(

configs/7B_qwen2.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
CHECKPOINT_EVERY = 50
2424
ckpt = dict(
2525
enable_save_ckpt=False, # enable ckpt save.
26-
enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format.
26+
enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format.
2727
save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
2828
# 'load_ckpt_info' setting guide:
2929
# 1. the 'path' indicates the ckpt path,
@@ -203,7 +203,6 @@
203203
alert=dict(
204204
enable_feishu_alert=DO_ALERT,
205205
feishu_alert_address=None, # feishu webhook to send alert message
206-
light_monitor_address=None, # light_monitor address to send heartbeat
207206
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
208207
),
209208
tensorboard=dict(

configs/7B_sft.py

-1
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,6 @@
205205
alert=dict(
206206
enable_feishu_alert=DO_ALERT,
207207
feishu_alert_address=None, # feishu webhook to send alert message
208-
light_monitor_address=None, # light_monitor address to send heartbeat
209208
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
210209
),
211210
tensorboard=dict(

configs/8x22B_mixtral.py

-1
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,6 @@
221221
alert=dict(
222222
enable_feishu_alert=DO_ALERT,
223223
feishu_alert_address=None, # feishu webhook to send alert message
224-
light_monitor_address=None, # light_monitor address to send heartbeat
225224
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
226225
),
227226
tensorboard=dict(

configs/8x7B_mixtral.py

-1
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,6 @@
221221
alert=dict(
222222
enable_feishu_alert=DO_ALERT,
223223
feishu_alert_address=None, # feishu webhook to send alert message
224-
light_monitor_address=None, # light_monitor address to send heartbeat
225224
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
226225
),
227226
tensorboard=dict(

configs/demo.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
# 'load_ckpt_info' setting guide:
3535
# 1. the 'path' indicates the ckpt path,
3636
# 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
37-
# 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined
37+
# 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined
3838
# load function such as "llama"
3939
load_ckpt_info=dict(path=LOAD_CKPT_FOLDER, content=("model",), ckpt_type="internevo"),
4040
# 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
@@ -141,7 +141,6 @@
141141
alert=dict(
142142
enable_feishu_alert=DO_ALERT,
143143
feishu_alert_address=None, # feishu webhook to send alert message
144-
light_monitor_address=None, # light_monitor address to send heartbeat
145144
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
146145
),
147146
tensorboard=dict(

configs/demo_llava.py

-1
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,6 @@
178178
alert=dict(
179179
enable_feishu_alert=DO_ALERT,
180180
feishu_alert_address=None, # feishu webhook to send alert message
181-
light_monitor_address=None, # light_monitor address to send heartbeat
182181
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
183182
),
184183
tensorboard=dict(

doc/code-docs/locales/en/LC_MESSAGES/monitor.po

+9-85
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ msgid ""
77
msgstr ""
88
"Project-Id-Version: InternLM \n"
99
"Report-Msgid-Bugs-To: \n"
10-
"POT-Creation-Date: 2024-08-30 16:07+0800\n"
10+
"POT-Creation-Date: 2024-11-20 15:01+0800\n"
1111
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
1212
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
1313
"Language: en\n"
@@ -16,7 +16,7 @@ msgstr ""
1616
"MIME-Version: 1.0\n"
1717
"Content-Type: text/plain; charset=utf-8\n"
1818
"Content-Transfer-Encoding: 8bit\n"
19-
"Generated-By: Babel 2.15.0\n"
19+
"Generated-By: Babel 2.14.0\n"
2020

2121
#: ../../source/monitor.rst:2
2222
msgid "监控和告警"
@@ -56,25 +56,12 @@ msgstr ""
5656
"``internlm.monitor.alert.send_feishu_msg_with_webhook()``."
5757

5858
#: ../../source/monitor.rst:25
59-
msgid "轻量监控"
60-
msgstr "Light Monitoring"
59+
msgid "监控告警配置"
60+
msgstr "Monitor Config"
6161

62-
#: ../../source/monitor.rst:27
62+
#: ../../source/monitor.rst:28
6363
msgid ""
64-
"InternEvo轻量级监控工具采用心跳机制实时监测训练过程中的各项指标,如loss、grad_norm、训练阶段的耗时等。同时,InternEvo还可以通过"
65-
" `grafana dashboard <https://grafana.com/grafana/dashboards/>`_ "
66-
"直观地呈现这些指标信息,以便用户进行更加全面和深入的训练分析。"
67-
msgstr ""
68-
"The InternEvo light monitoring tool employs a heartbeat mechanism to "
69-
"real-time monitor various metrics during the training process, such as "
70-
"loss, grad_norm, and training phase duration. Additionally, InternEvo can"
71-
" present these metric details through a `grafana dashboard "
72-
"<https://grafana.com/grafana/dashboards/>`_, allowing users to conduct "
73-
"more comprehensive and in-depth training analysis in an intuitive manner."
74-
75-
#: ../../source/monitor.rst:29
76-
msgid ""
77-
"轻量监控的配置由配置文件中的 ``monitor`` 字段指定, 用户可以通过修改配置文件 `config file "
64+
"配置由配置文件中的 ``monitor`` 字段指定, 用户可以通过修改配置文件 `config file "
7865
"<https://github.com/InternLM/InternEvo/blob/develop/configs/7B_sft.py>`_ "
7966
"来更改监控配置。以下是一个监控配置的示例:"
8067
msgstr ""
@@ -84,23 +71,17 @@ msgstr ""
8471
"<https://github.com/InternLM/InternEvo/blob/develop/configs/7B_sft.py>`_."
8572
" Here is an example of a monitoring configuration:"
8673

87-
#: ../../source/monitor.rst:42
74+
#: ../../source/monitor.rst:40
8875
msgid "enable_feishu_alert (bool):是否启用飞书告警。默认值:False。"
8976
msgstr "enable_feishu_alert: Whether to enable Feishu alerts. Defaults: False."
9077

91-
#: ../../source/monitor.rst:43
78+
#: ../../source/monitor.rst:41
9279
msgid "feishu_alert_address (str):飞书告警的 Webhook 地址。默认值:None。"
9380
msgstr ""
9481
"feishu_alert_address: The webhook address for Feishu alerts. Defaults: "
9582
"None."
9683

97-
#: ../../source/monitor.rst:44
98-
msgid "light_monitor_address (str):轻量监控的地址。默认值:None。"
99-
msgstr ""
100-
"light_monitor_address: The address for lightweight monitoring. Defaults: "
101-
"None."
102-
103-
#: ../../source/monitor.rst:45
84+
#: ../../source/monitor.rst:42
10485
msgid "alert_file_path (str):告警存储路径。默认值:None。"
10586
msgstr "alert_file_path: path of alert. Defaults: None."
10687

@@ -213,60 +194,3 @@ msgstr "alert_file_path: path of alert. Defaults: None."
213194

214195
#~ msgid "示例"
215196
#~ msgstr "Example"
216-
217-
#~ msgid ""
218-
#~ "Initialize the monitoring module with "
219-
#~ "the default address ``initialize_light_monitor()``"
220-
#~ msgstr ""
221-
222-
#~ msgid "Send a heartbeat message to a monitoring server."
223-
#~ msgstr ""
224-
225-
#~ msgid ""
226-
#~ "The type of heartbeat message, e.g., "
227-
#~ "\"train_metrics\", \"init_time\", \"stage_time\"."
228-
#~ msgstr ""
229-
230-
#~ msgid "A dictionary containing message data to be included in the heartbeat."
231-
#~ msgstr ""
232-
233-
#~ msgid ""
234-
#~ "Sending a heartbeat message for training"
235-
#~ " metrics ``send_heartbeat(\"train_metrics\", {\"loss\":"
236-
#~ " 0.1, \"accuracy\": 0.95})``"
237-
#~ msgstr ""
238-
239-
#~ msgid ""
240-
#~ "Sending a heartbeat message for "
241-
#~ "initialization time ``send_heartbeat(\"init_time\", "
242-
#~ "{\"import_time\": 0.25})``"
243-
#~ msgstr ""
244-
245-
#~ msgid ""
246-
#~ "Sending a heartbeat message for stage"
247-
#~ " time ``send_heartbeat(\"stage_time\", {\"fwd_time\":"
248-
#~ " 2.3, \"bwd_time\": 6.2})``"
249-
#~ msgstr ""
250-
251-
#~ msgid ""
252-
#~ "InternEvo 使用 "
253-
#~ "``internlm.monitor.alert.initialize_light_monitor`` "
254-
#~ "来初始化轻量监控客户端。一旦初始化完成,它会建立与监控服务器的连接。在训练过程中,使用 "
255-
#~ "``internlm.monitor.alert.send_heartbeat`` "
256-
#~ "来发送不同类型的心跳信息至监控服务器。监控服务器会根据这些心跳信息来检测训练是否出现异常,并在需要时发送警报消息。"
257-
#~ msgstr ""
258-
#~ "InternEvo uses "
259-
#~ "``internlm.monitor.alert.initialize_light_monitor`` to "
260-
#~ "initialize the lightweight monitoring client."
261-
#~ " Once initialization is complete, it "
262-
#~ "establishes a connection with the "
263-
#~ "monitoring server. During the training "
264-
#~ "process, it uses "
265-
#~ "``internlm.monitor.alert.send_heartbeat`` to send "
266-
#~ "various types of heartbeat messages to"
267-
#~ " the monitoring server. The monitoring "
268-
#~ "server uses these heartbeat messages to"
269-
#~ " detect if the training encounters "
270-
#~ "any abnormalities and sends alert "
271-
#~ "messages as needed."
272-

0 commit comments

Comments
 (0)