From 05ddf09fa198b7074c8c08e6501d79d608901a44 Mon Sep 17 00:00:00 2001 From: ouyangyu Date: Tue, 12 Jan 2021 15:29:32 +0800 Subject: [PATCH 01/23] bert pretraining adam squad sh --- LanguageModeling/BERT/run_pretraining_adam.sh | 45 +++++++++++ LanguageModeling/BERT/run_squad.sh | 74 +++++++++++++++++++ 2 files changed, 119 insertions(+) create mode 100644 LanguageModeling/BERT/run_pretraining_adam.sh create mode 100644 LanguageModeling/BERT/run_squad.sh diff --git a/LanguageModeling/BERT/run_pretraining_adam.sh b/LanguageModeling/BERT/run_pretraining_adam.sh new file mode 100644 index 0000000..f57e55d --- /dev/null +++ b/LanguageModeling/BERT/run_pretraining_adam.sh @@ -0,0 +1,45 @@ +BENCH_ROOT_DIR=/path/to/OneFlow-Benchmark/LanguageModeling/BERT +OUTPUT_DIR=/DATA/disk1/of_output + +DATA_DIR=/DATA/disk1/bert/wiki_seq_len_128 + + +BZ=48 +ITER_NUM=1000000 +max_seq_length=128 +max_predictions_per_seq=20 + +of_log_dir=$OUTPUT_DIR/bert_master/of +rm -rf ${of_log_dir} +mkdir -p ${of_log_dir} +rm -rf core.* + +export PYTHONUNBUFFERED=1 +export ONEFLOW_DEBUG_MODE=True +export GLOG_v=3 +export CUDA_VISIBLE_DEVICES=6 +python3 $BENCH_ROOT_DIR/run_pretraining.py \ + --gpu_num_per_node=1 \ + --num_nodes=1 \ + --learning_rate=1.25e-5 \ + --warmup_proportion=0.01 \ + --weight_decay_rate=0.01 \ + --batch_size_per_device=${BZ} \ + --iter_num=${ITER_NUM} \ + --loss_print_every_n_iter=1 \ + --seq_length=128 \ + --use_fp16 \ + --max_predictions_per_seq=20 \ + --num_hidden_layers=12 \ + --num_attention_heads=12 \ + --max_position_embeddings=512 \ + --type_vocab_size=2 \ + --vocab_size=30522 \ + --attention_probs_dropout_prob=0.1 \ + --hidden_dropout_prob=0.1 \ + --hidden_size_per_head=64 \ + --data_part_num=64 \ + --data_dir=$DATA_DIR \ + --log_dir=${of_log_dir} \ + --model_save_every_n_iter=50000 \ + --model_save_dir=${of_log_dir} diff --git a/LanguageModeling/BERT/run_squad.sh b/LanguageModeling/BERT/run_squad.sh new file mode 100644 index 0000000..20ade70 --- /dev/null +++ b/LanguageModeling/BERT/run_squad.sh @@ -0,0 +1,74 @@ +BENCH_ROOT_DIR=/home/oyy/workspace/OneFlow-Benchmark/LanguageModeling/BERT +# pretrained model dir +PRETRAINED_MODEL=/DATA/disk1/of_output/uncased_L-12_H-768_A-12_oneflow + +# squad ofrecord dataset dir +DATA_ROOT=/DATA/disk1/of_output/bert/of_squad + +# `vocab.txt` dir +REF_ROOT_DIR=/DATA/disk1/of_output/uncased_L-12_H-768_A-12 + +# `evaluate-v*.py` and `dev-v*.json` dir +SQUAD_TOOL_DIR=/DATA/disk1/of_output/bert/of_squad +db_version=${1:-"v2.0"} +if [ $db_version = "v1.1" ]; then + train_example_num=88614 + eval_example_num=10833 + version_2_with_negative="False" +elif [ $db_version = "v2.0" ]; then + train_example_num=131944 + eval_example_num=12232 + version_2_with_negative="True" +else + echo "db_version must be 'v1.1' or 'v2.0'" + exit +fi + +train_data_dir=$DATA_ROOT/train-$db_version +eval_data_dir=$DATA_ROOT/dev-$db_version +LOGFILE=./bert_fp_training.log +export PYTHONUNBUFFERED=1 +export ONEFLOW_DEBUG_MODE=True +export CUDA_VISIBLE_DEVICES=7 +# finetune and eval SQuAD, +# `predictions.json` will be saved to folder `./squad_output` +python3 $BENCH_ROOT_DIR/run_squad.py \ + --model=SQuAD \ + --do_train=True \ + --do_eval=True \ + --gpu_num_per_node=1 \ + --learning_rate=3e-5 \ + --batch_size_per_device=16 \ + --eval_batch_size_per_device=16 \ + --num_epoch=3 \ + --use_fp16 \ + --version_2_with_negative=$version_2_with_negative \ + --loss_print_every_n_iter=20 \ + --do_lower_case=True \ + --seq_length=384 \ + --num_hidden_layers=12 \ + --num_attention_heads=12 \ + 
--max_position_embeddings=512 \ + --type_vocab_size=2 \ + --vocab_size=30522 \ + --attention_probs_dropout_prob=0.1 \ + --hidden_dropout_prob=0.1 \ + --hidden_size_per_head=64 \ + --train_data_dir=$train_data_dir \ + --train_example_num=$train_example_num \ + --eval_data_dir=$eval_data_dir \ + --eval_example_num=$eval_example_num \ + --log_dir=./log \ + --model_load_dir=${PRETRAINED_MODEL} \ + --save_last_snapshot=True \ + --model_save_dir=./squad_snapshots \ + --vocab_file=$REF_ROOT_DIR/vocab.txt \ + --predict_file=$SQUAD_TOOL_DIR/dev-${db_version}.json \ + --output_dir=./squad_output 2>&1 | tee ${LOGFILE} + + +# evaluate predictions.json to get metrics +python3 $SQUAD_TOOL_DIR/evaluate-${db_version}.py \ + $SQUAD_TOOL_DIR/dev-${db_version}.json \ + ./squad_output/predictions.json + From de98facc2827a425e05cc7da63f024233f0f3269 Mon Sep 17 00:00:00 2001 From: ouyangyu Date: Thu, 14 Jan 2021 14:33:25 +0800 Subject: [PATCH 02/23] args num_accumulation_steps --- LanguageModeling/BERT/config.py | 2 ++ LanguageModeling/BERT/run_pretraining_adam.sh | 1 + LanguageModeling/BERT/util.py | 1 + 3 files changed, 4 insertions(+) diff --git a/LanguageModeling/BERT/config.py b/LanguageModeling/BERT/config.py index df45fef..b993449 100644 --- a/LanguageModeling/BERT/config.py +++ b/LanguageModeling/BERT/config.py @@ -58,6 +58,8 @@ def get_parser(parser=None): help='use use fp16 or not') parser.add_argument('--use_xla', type=str2bool, nargs='?', const=True, help='Whether to use use xla') + parser.add_argument("num_accumulation_steps", type=int, default=1, + help='Number of accumulation steps before gradient update, Global batch size = num_accumulation_steps * train_batch_size') # log and resore/save parser.add_argument("--loss_print_every_n_iter", type=int, default=10, required=False, diff --git a/LanguageModeling/BERT/run_pretraining_adam.sh b/LanguageModeling/BERT/run_pretraining_adam.sh index f57e55d..a2bc89e 100644 --- a/LanguageModeling/BERT/run_pretraining_adam.sh +++ b/LanguageModeling/BERT/run_pretraining_adam.sh @@ -32,6 +32,7 @@ python3 $BENCH_ROOT_DIR/run_pretraining.py \ --max_predictions_per_seq=20 \ --num_hidden_layers=12 \ --num_attention_heads=12 \ + --num_accumulation_steps=1 \ --max_position_embeddings=512 \ --type_vocab_size=2 \ --vocab_size=30522 \ diff --git a/LanguageModeling/BERT/util.py b/LanguageModeling/BERT/util.py index 6b04d55..e875d0f 100755 --- a/LanguageModeling/BERT/util.py +++ b/LanguageModeling/BERT/util.py @@ -155,6 +155,7 @@ def CreateOptimizer(args): def GetFunctionConfig(args): config = flow.function_config() config.enable_auto_mixed_precision(args.use_fp16) + config.train.num_gradient_accumulation_steps(args.num_accumulation_steps) if args.use_xla: config.use_xla_jit(True) config.enable_fuse_add_to_output(True) From 89b4e05affd76ffc614092b2fe79a7087029c480 Mon Sep 17 00:00:00 2001 From: ouyangyu Date: Thu, 14 Jan 2021 14:51:16 +0800 Subject: [PATCH 03/23] fix args --- LanguageModeling/BERT/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LanguageModeling/BERT/config.py b/LanguageModeling/BERT/config.py index b993449..ca53964 100644 --- a/LanguageModeling/BERT/config.py +++ b/LanguageModeling/BERT/config.py @@ -58,7 +58,7 @@ def get_parser(parser=None): help='use use fp16 or not') parser.add_argument('--use_xla', type=str2bool, nargs='?', const=True, help='Whether to use use xla') - parser.add_argument("num_accumulation_steps", type=int, default=1, + parser.add_argument("--num_accumulation_steps", type=int, default=1, help='Number of 
accumulation steps before gradient update, Global batch size = num_accumulation_steps * train_batch_size') # log and resore/save From ed66b480a8fc814624ad0659c69cd58aa4d2dd06 Mon Sep 17 00:00:00 2001 From: ouyangyu Date: Tue, 19 Jan 2021 17:16:49 +0800 Subject: [PATCH 04/23] fix batch_size --- LanguageModeling/BERT/run_pretraining.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LanguageModeling/BERT/run_pretraining.py b/LanguageModeling/BERT/run_pretraining.py index 0b04de6..e784400 100755 --- a/LanguageModeling/BERT/run_pretraining.py +++ b/LanguageModeling/BERT/run_pretraining.py @@ -105,7 +105,7 @@ def main(): snapshot = Snapshot(args.model_save_dir, args.model_load_dir) metric = Metric(desc='train', print_steps=args.loss_print_every_n_iter, - batch_size=batch_size, keys=['total_loss', 'mlm_loss', 'nsp_loss']) + batch_size=batch_size * args.num_accumulation_steps, keys=['total_loss', 'mlm_loss', 'nsp_loss']) for step in range(args.iter_num): PretrainJob().async_get(metric.metric_cb(step)) #PretrainJob().async_get(metric.metric_cb(step, epoch=3)) From 0e4d46bb0cef508a804271dbd7e14c1d73f5b3c9 Mon Sep 17 00:00:00 2001 From: ouyangyu Date: Wed, 20 Jan 2021 11:59:42 +0800 Subject: [PATCH 05/23] bert lamb --- LanguageModeling/BERT/config.py | 2 + LanguageModeling/BERT/run_pretraining_lamb.sh | 47 +++++++++++++++++++ LanguageModeling/BERT/util.py | 8 +++- 3 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 LanguageModeling/BERT/run_pretraining_lamb.sh diff --git a/LanguageModeling/BERT/config.py b/LanguageModeling/BERT/config.py index ca53964..79ec270 100644 --- a/LanguageModeling/BERT/config.py +++ b/LanguageModeling/BERT/config.py @@ -60,6 +60,8 @@ def get_parser(parser=None): help='Whether to use use xla') parser.add_argument("--num_accumulation_steps", type=int, default=1, help='Number of accumulation steps before gradient update, Global batch size = num_accumulation_steps * train_batch_size') + parser.add_argument("--optimizer_type", type=str, default="adam", + help="Optimizer used for training - LAMB or ADAM") # log and resore/save parser.add_argument("--loss_print_every_n_iter", type=int, default=10, required=False, diff --git a/LanguageModeling/BERT/run_pretraining_lamb.sh b/LanguageModeling/BERT/run_pretraining_lamb.sh new file mode 100644 index 0000000..a8ffd8b --- /dev/null +++ b/LanguageModeling/BERT/run_pretraining_lamb.sh @@ -0,0 +1,47 @@ +BENCH_ROOT_DIR=/path/to/OneFlow-Benchmark/LanguageModeling/BERT +OUTPUT_DIR=/DATA/disk1/of_output + +DATA_DIR=/DATA/disk1/bert/wiki_seq_len_128 + + +BZ=16 +ITER_NUM=1000000 +max_seq_length=128 +max_predictions_per_seq=20 + +of_log_dir=$OUTPUT_DIR/bert_master/of +rm -rf ${of_log_dir} +mkdir -p ${of_log_dir} +rm -rf core.* + +export PYTHONUNBUFFERED=1 +export ONEFLOW_DEBUG_MODE=True +export GLOG_v=3 + +python3 $BENCH_ROOT_DIR/run_pretraining.py \ + --gpu_num_per_node=8 \ + --num_nodes=1 \ + --learning_rate=1e-4 \ + --warmup_proportion=0.01 \ + --weight_decay_rate=0.01 \ + --batch_size_per_device=${BZ} \ + --iter_num=${ITER_NUM} \ + --loss_print_every_n_iter=1 \ + --seq_length=128 \ + --use_fp16 \ + --optimizer_type="lamb" \ + --max_predictions_per_seq=20 \ + --num_hidden_layers=12 \ + --num_attention_heads=12 \ + --num_accumulation_steps=512 \ + --max_position_embeddings=512 \ + --type_vocab_size=2 \ + --vocab_size=30522 \ + --attention_probs_dropout_prob=0.1 \ + --hidden_dropout_prob=0.1 \ + --hidden_size_per_head=64 \ + --data_part_num=64 \ + --data_dir=$DATA_DIR \ + --log_dir=${of_log_dir} \ + 
--model_save_every_n_iter=50000 \ + --model_save_dir=${of_log_dir} diff --git a/LanguageModeling/BERT/util.py b/LanguageModeling/BERT/util.py index e875d0f..9ace96c 100755 --- a/LanguageModeling/BERT/util.py +++ b/LanguageModeling/BERT/util.py @@ -147,7 +147,13 @@ def CreateOptimizer(args): loss_scale_policy = None if args.use_fp16: loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale(increment_period=2000); - return flow.optimizer.AdamW(lr_scheduler, epsilon=1e-6, weight_decay=args.weight_decay_rate, + + if args.optimizer_type == "lamb": + return flow.optimizer.LAMB(lr_scheduler, beta1=0.9, beta2=0.999, epsilon=1e-6, + grad_clipping=flow.optimizer.grad_clipping.by_global_norm(1.0), + loss_scale_policy=loss_scale_policy) + else: + return flow.optimizer.AdamW(lr_scheduler, epsilon=1e-6, weight_decay=args.weight_decay_rate, weight_decay_excludes=["bias", "LayerNorm", "layer_norm"], grad_clipping=flow.optimizer.grad_clipping.by_global_norm(1.0), loss_scale_policy=loss_scale_policy) From 52adf9f63da4238acc99c8e4b315edd2cd6a3593 Mon Sep 17 00:00:00 2001 From: ouyangyu Date: Wed, 20 Jan 2021 15:16:36 +0800 Subject: [PATCH 06/23] lamb weight decay --- LanguageModeling/BERT/util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/LanguageModeling/BERT/util.py b/LanguageModeling/BERT/util.py index 9ace96c..f6ff7d1 100755 --- a/LanguageModeling/BERT/util.py +++ b/LanguageModeling/BERT/util.py @@ -149,7 +149,8 @@ def CreateOptimizer(args): loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale(increment_period=2000); if args.optimizer_type == "lamb": - return flow.optimizer.LAMB(lr_scheduler, beta1=0.9, beta2=0.999, epsilon=1e-6, + return flow.optimizer.LAMB(lr_scheduler, beta1=0.9, beta2=0.999, epsilon=1e-6, weight_decay=args.weight_decay_rate, + weight_decay_excludes=["bias", "LayerNorm", "layer_norm"], grad_clipping=flow.optimizer.grad_clipping.by_global_norm(1.0), loss_scale_policy=loss_scale_policy) else: From 647dbd6b34b39c779d870d5cc80e1af3107c4c73 Mon Sep 17 00:00:00 2001 From: aishangjj <702572275@qq.com> Date: Wed, 19 May 2021 17:08:33 +0800 Subject: [PATCH 07/23] run_pretraining.py Add some parameters. util.py Add GPU memory for printing. analysis.py Analyze log files to get loss and GPU memory. result_analysis.py Compare the two results to generate a png image. stitching_pic.py Stitching pictures. train_perbert.sh Training script. train_perbert_list.sh Training list. 
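The intended flow, roughly: train_perbert.sh launches run_pretraining.py with the chosen precision/optimizer/accumulation settings (it assumes the dataset at /data/bert_dataset and an initial model at /opt/initial_model), then runs analysis.py on the resulting log to produce log/out.json; result_analysis.py compares that json against one from a reference run and writes a PASS/FAIL plot; stitching_pic.py concatenates the plots into one image. A minimal sketch, using only the flags these scripts define; the reference old_run/out.json and the pic/ output paths are illustrative, not created by this patch:

    mkdir -p pic
    # fp16, debug mode, batch 64, no accumulation, adam; defaults: 8 GPUs, 100 iters
    sh train_perbert.sh 1 1 64 1 adam
    # compare log/out.json from this run against a previously saved reference result
    python result_analysis.py --f32=0 \
        --cmp1_file=old_run/out.json \
        --cmp2_file=log/out.json \
        --out_file=pic/bert_f16_adam_debug.png
    # merge all comparison plots into a single image
    python stitching_pic.py --dir=pic --out_file=pic/all.png

gpu_memory_usage.py, added alongside these scripts, is a standalone monitor; running for example
    python3 gpu_memory_usage.py -g 8 -n 0.5
in parallel with training records the peak device memory per GPU.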
--- LanguageModeling/BERT/analysis.py | 72 +++++ LanguageModeling/BERT/gpu_memory_usage.py | 62 +++++ LanguageModeling/BERT/result_analysis.py | 144 ++++++++++ LanguageModeling/BERT/run_pretraining.py | 26 +- LanguageModeling/BERT/stitching_pic.py | 38 +++ LanguageModeling/BERT/train_perbert.sh | 129 +++++++++ LanguageModeling/BERT/train_perbert_list.sh | 281 ++++++++++++++++++++ LanguageModeling/BERT/util.py | 5 + 8 files changed, 755 insertions(+), 2 deletions(-) create mode 100644 LanguageModeling/BERT/analysis.py create mode 100644 LanguageModeling/BERT/gpu_memory_usage.py create mode 100644 LanguageModeling/BERT/result_analysis.py create mode 100644 LanguageModeling/BERT/stitching_pic.py create mode 100644 LanguageModeling/BERT/train_perbert.sh create mode 100644 LanguageModeling/BERT/train_perbert_list.sh diff --git a/LanguageModeling/BERT/analysis.py b/LanguageModeling/BERT/analysis.py new file mode 100644 index 0000000..012d159 --- /dev/null +++ b/LanguageModeling/BERT/analysis.py @@ -0,0 +1,72 @@ +import argparse +import re +import json + +from ctypes import * + + + +def collect_loss(log_file, gpu_num): + print("loss : ",log_file) + + f = open(log_file,"r") + lines = f.readlines()#读取全部内容 + total_loss = [] + mlm_loss = [] + nsp_loss = [] + throughput =[] + memory=[] + + pattern = re.compile(r"step:\s*(\d+)\s*,\s*total_loss:\s*(\d+\.?\d+)\s*,\s*mlm_loss:\s*(\d+\.?\d+)\s*,\s*nsp_loss:\s*(\d+\.?\d+)\s*,\s*throughput:\s*(\d+\.?\d+)\s*") + for line in lines: + if(line.split(':')[0] == 'step'): + # print(line) + + match = pattern.match(line) + if match: + # print(match.groups()) + total_loss.append(match.group(2)) + mlm_loss.append(match.group(3)) + nsp_loss.append(match.group(4)) + throughput.append(match.group(5)) + if(line.split(' [MiB]\\n')[0] == 'b\'memory.used'): + str_tmp = line.split(' [MiB]\\n')[1] + # print(str_tmp) + for i in range(gpu_num): + memory.append(str_tmp.split(' MiB\\n')[i]) + + return total_loss, mlm_loss, nsp_loss,throughput, memory + + +def main(): + parser = argparse.ArgumentParser(description="collect GPU device memory usage") + parser.add_argument("--log_file", type=str, default=None) + parser.add_argument("--mem_file", type=str, default=None) + parser.add_argument("--out_file", type=str, default=None) + parser.add_argument("--gpu_num", type=int, default=1) + + args = parser.parse_args() + + total_loss, mlm_loss, nsp_loss,throughput, memory = collect_loss(args.log_file, args.gpu_num) + # print(total_loss) + # print(mlm_loss) + # print(nsp_loss) + # print(throughput) + out={} + out['total_loss'] = total_loss + out['mlm_loss'] = mlm_loss + out['nsp_loss'] = nsp_loss + out['throughput'] = throughput + out['memory'] = memory + # print(out) + + string = json.dumps(out) + with open(args.out_file,'w')as f: + f.write(string) + + +if __name__ == "__main__": + # libc = CDLL("libc.so.6") + # msg = "Hello, world!\n" + # libc.printf("Testing: %s \n", msg) + main() diff --git a/LanguageModeling/BERT/gpu_memory_usage.py b/LanguageModeling/BERT/gpu_memory_usage.py new file mode 100644 index 0000000..cf2e3f9 --- /dev/null +++ b/LanguageModeling/BERT/gpu_memory_usage.py @@ -0,0 +1,62 @@ +import time +import argparse +import pynvml + + +class Device(object): + class Status: + INIT = "INIT" + DETECTING = "DETECTING" + STOP = "STOP" + + start_detecting_mem_threshold = 32 * 1024 * 1024 + + def __init__(self, handle): + self.handle = handle + self.status = self.Status.INIT + self.max_mem_usage = 0 + + def update(self): + info = pynvml.nvmlDeviceGetMemoryInfo(self.handle) + if 
self.status == self.Status.INIT: + if info.used > self.start_detecting_mem_threshold: + self.status = self.Status.DETECTING + elif self.status == self.Status.DETECTING: + if info.used < self.start_detecting_mem_threshold: + self.status = self.Status.STOP + return False + else: + self.max_mem_usage = max(self.max_mem_usage, info.used) + elif self.status == self.Status.STOP: + raise ValueError("detecting is stop") + else: + raise ValueError("invalid status") + + return True + + +def main(): + parser = argparse.ArgumentParser(description="collect GPU device memory usage") + parser.add_argument("-g", type=int, default=1, help="number of gpu devices") + parser.add_argument("-n", type=float, default=1, help="metrics rate") + args = parser.parse_args() + + pynvml.nvmlInit() + n_gpus = args.g + devices = [Device(pynvml.nvmlDeviceGetHandleByIndex(i)) for i in range(n_gpus)] + + running = True + while running: + time.sleep(args.n) + running = False + for device in devices: + running |= device.update() + + pynvml.nvmlShutdown() + for i, device in enumerate(devices): + max_mem_usage_mbytes = device.max_mem_usage / 1024 / 1024 + print(f"{max_mem_usage_mbytes:.2f}") + + +if __name__ == "__main__": + main() diff --git a/LanguageModeling/BERT/result_analysis.py b/LanguageModeling/BERT/result_analysis.py new file mode 100644 index 0000000..5f0b8f3 --- /dev/null +++ b/LanguageModeling/BERT/result_analysis.py @@ -0,0 +1,144 @@ +import json +import matplotlib.pyplot as plt +import argparse +import numpy as np + +def read_file(file1, file2): + + with open(file1,'r') as load_f: + dict1 = json.load(load_f) + + with open(file2,'r') as load_f: + dict2 = json.load(load_f) + + + return dict1, dict2 + +def analysis_loss(y3, f32): + # if f32 == 1: + # iter = len(y3) + + # else: + # len = y3.shape[0] + # min = y3.min() + # max = y3.max() + # mean = np.mean(y3) + # var = np.var(y3) + + # random = np.random.randn(len)/10000.0 + # print(random) + # print(y3) + # lst=[y3,random] + # res=np.corrcoef(lst) + # print('----->', res) + + len = y3.shape[0] + + if f32 == 1: + iter = np.count_nonzero(y3==0) + tmp = iter/len + print('count zeor = ', iter) + if iter/len > 0.6: + print('Test passed') + return 1 + else: + print('Test failed') + return 0 + else: + + mean = np.mean(y3) + var = np.var(y3) + print('F16---->', abs(mean), var) + + if abs(mean) < 0.001 and var < 0.00001: + print('Test passed') + return 1 + else: + print('Test failed') + return 0 + +def drawing_loss(dict1, dict2, f32, image): + + + m1 = dict1["memory"] + m2 = dict2["memory"] + + print(m1) + print(m2) + table_len = len(m1) + row_labels = ['old','new'] + table_vals = [m1,m2] + + + y1 = dict1["total_loss"] + y2 = dict2["total_loss"] + y1=list(map(float,y1)) + y2=list(map(float,y2)) + x = np.arange(1, len(y1)+1) + y1 = np.array(y1) + y2 = np.array(y2) + y3 = np.subtract(y1,y2) + # v =list(map(lambda x,y:x - y)) + + result = analysis_loss(y3, f32) + + print(x) + print(y1) + print(y2) + print(y3) + fig = plt.figure(figsize=(24,8), dpi=150) + plt.figure(1) + ax = fig.add_subplot() + + + ax1 = plt.subplot(121) + plt.xlabel('iterations') + plt.plot(x,y1,color='red',label='Diachronic version') + plt.plot(x,y2,color='blue',label='Current version') + plt.title('Loss comparison') + + plt.legend(loc='best') + + ax2 = plt.subplot(122) + plt.xlabel('iterations') + plt.plot(x,y3,color='red') + plt.title('Loss difference') + plt.table(cellText=table_vals, rowLabels=row_labels, colWidths=[0.05]*table_len, loc='best') + + 
plt.suptitle(image.split('/')[1].split('.')[0],fontsize=20,x=0.5,y=0.98) + + if result == 1: + plt.text(0.9, 1,'PASS', fontsize=50, color='blue', transform=ax.transAxes) + else: + plt.text(0.9, 1,'FAILED',fontsize=50,color='red',transform=ax.transAxes) + plt.savefig(image) + + +# def analysis_f32(dict1, dict2): +# return 1 +# def analysis_f16(dict1, dict2): +# return 1 + +def main(): + print('test') + parser = argparse.ArgumentParser(description="Compare and analyze training results and output icons") + parser.add_argument("--cmp1_file", type=str, default=None) + parser.add_argument("--cmp2_file", type=str, default=None) + parser.add_argument("--out_file", type=str, default=None) + parser.add_argument("--f32", type=int, default=0) + args = parser.parse_args() + + # print('---------------------') + dict1, dict2 = read_file(args.cmp1_file, args.cmp2_file) + + # if args.f32 == 1: + # result = analysis_f32(dict1, dict2) + # else: + # result = analysis_f16(dict1, dict2) + + + drawing_loss(dict1, dict2, args.f32, args.out_file) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/LanguageModeling/BERT/run_pretraining.py b/LanguageModeling/BERT/run_pretraining.py index e784400..3155317 100755 --- a/LanguageModeling/BERT/run_pretraining.py +++ b/LanguageModeling/BERT/run_pretraining.py @@ -29,9 +29,26 @@ parser.add_argument("--data_part_num", type=int, default=32, help="data part number in dataset") parser.add_argument("--iter_num", type=int, default=1144000, help="total iterations to run") parser.add_argument("--batch_size_per_device", type=int, default=64) +parser.add_argument("--debug", type=int, default=0) +parser.add_argument("--data_load_random", type=int, default=1) +parser.add_argument("--model_load", type=str, default=None) + + args = parser.parse_args() configs.print_args(args) + +if args.debug == 1: + flow.config.enable_debug_mode(True) + print('Enable Debug !!!!!!!') + +if args.data_load_random == 1: + random_tmp=True + print('Enable random loading of data !!!!!!!') +else: + random_tmp=False + print('Disable random loading of data !!!!!!!') + batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device @@ -39,8 +56,8 @@ def BertDecoder(data_dir, batch_size, data_part_num, seq_length, max_predictions ofrecord = flow.data.ofrecord_reader(data_dir, batch_size=batch_size, data_part_num=data_part_num, - random_shuffle = True, - shuffle_after_epoch=True) + random_shuffle = random_tmp, + shuffle_after_epoch=random_tmp) blob_confs = {} def _blob_conf(name, shape, dtype=flow.int32): blob_confs[name] = flow.data.OFRecordRawDecoder(ofrecord, name, shape=shape, dtype=dtype) @@ -104,6 +121,11 @@ def main(): snapshot = Snapshot(args.model_save_dir, args.model_load_dir) + + if args.model_load != None: + flow.load_variables(flow.checkpoint.get(args.model_load)) + + print('num_accumulation_steps:', args.num_accumulation_steps) metric = Metric(desc='train', print_steps=args.loss_print_every_n_iter, batch_size=batch_size * args.num_accumulation_steps, keys=['total_loss', 'mlm_loss', 'nsp_loss']) for step in range(args.iter_num): diff --git a/LanguageModeling/BERT/stitching_pic.py b/LanguageModeling/BERT/stitching_pic.py new file mode 100644 index 0000000..f2dbb2b --- /dev/null +++ b/LanguageModeling/BERT/stitching_pic.py @@ -0,0 +1,38 @@ +import matplotlib.pyplot as plt +import argparse +import os +from PIL import Image + +def stiiching_pic(dir, out_file): + + hight = 1200 + width = 3600 + file_lsit = os.listdir(dir) + target = Image.new('RGBA', (width, 
hight*len(file_lsit))) + left = 0 + right = hight + for file in file_lsit: + + tmp = dir+'/'+file + print(tmp) + image = Image.open(tmp) + # print(image) + # print(target) + target.paste(image, (0, left, width, right)) + left += hight + right += hight + target.save(out_file) + + +def main(): + print('test') + parser = argparse.ArgumentParser(description="Stitching pictures") + parser.add_argument("--dir", type=str, default=None) + parser.add_argument("--out_file", type=str, default=None) + args = parser.parse_args() + + stiiching_pic(args.dir, args.out_file) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/LanguageModeling/BERT/train_perbert.sh b/LanguageModeling/BERT/train_perbert.sh new file mode 100644 index 0000000..d662ecb --- /dev/null +++ b/LanguageModeling/BERT/train_perbert.sh @@ -0,0 +1,129 @@ +USE_FP16=${1:-0} +#DEBUG +DEBUG_MODE=${2:-0} +# +BATCH_SIZE=${3:-64} +#accumulation +ACCUMULATION_STEMPS=${4-1} +# +OPTIMIZER=${5-adam} +#GPU +GPUS_PER_NODE=${6:-8} +# +ITER_NUM=${7:-100} +# +PRINT_ITER=${8:-1} +# +NODE_RANK=${9:-0} + +LOG_FOLDER=./log/ + +########################################################################################################## +# FP +########################################################################################################## +echo ${USE_FP16} + + +if [ "$USE_FP16" = 1 ];then + FP_CMD=--use_fp16 + FP_NAME=f16 + echo "USE_FP16" +else + FP_CMD= + FP_NAME=f32 + echo "USE_FP32" +fi +########################################################################################################## +# DEBUG_NAME +########################################################################################################## +# if [ DEBUG_MODE==1 ];then +# DEBUG_NAME=debug +# else +# DEBUG_NAME= +# fi + +########################################################################################################## +# Create folder +########################################################################################################## +#bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug +mkdir -p $LOG_FOLDER + +# OUTFILE=bert_pretraining_${FP_NAME}_${GPUS_PER_NODE}gpu_${BATCH_SIZE}bs_${ITER_NUM}iter_${OPTIMIZER}\ +# _${DEBUG_NAME} +# mkdir -p $OUTFILE + +LOGFILE=$LOG_FOLDER/bert_pretraining_${FP_NAME}_${GPUS_PER_NODE}gpu_${BATCH_SIZE}bs_${ITER_NUM}iter_${OPTIMIZER}\ +_${DEBUG_NAME}.log + +MODEL_DIR=./snapshots/ +# DATA_DIR=/DATA/disk1/bert/wiki_seq_len_128/ +DATA_DIR=/data/bert_dataset + +MEM_FILE=$LOG_FOLDER/memory.log + +echo LOGFILE=$LOGFILE +echo DATA_DIR=$DATA_DIR + +#${NNODES}n${GPUS_PER_NODE}g_dp${D_P}_mp${M_P}_pp${P_P}_mbs${MICRO_BATCH_SIZE}_gbs${GLOABAL_BATCH_SIZE}_pretrain_${NODE_RANK}.log +rm -rf ${MODEL_DIR}/* +rm -rf ${LOG_FOLDER}/* + +NVPROF=baseline-report_${NODE_RANK} +#-o ${NVPROF} + + +# -g $GPUS_PER_NODE \ +# -n 0.5 \ + + +#nsys profile --stats=true -o ${NVPROF} \ +python3 run_pretraining.py \ + --gpu_num_per_node=${GPUS_PER_NODE} \ + --num_nodes=1 \ + --learning_rate=1e-4 \ + --warmup_proportion=0.01 \ + --weight_decay_rate=0.01 \ + --batch_size_per_device=${BATCH_SIZE} \ + --iter_num=${ITER_NUM} \ + --loss_print_every_n_iter=${PRINT_ITER} \ + --seq_length=128 \ + --max_predictions_per_seq=20 \ + --num_hidden_layers=12 \ + --num_attention_heads=12 \ + --num_accumulation_steps=${ACCUMULATION_STEMPS} \ + --max_position_embeddings=512 \ + --type_vocab_size=2 \ + --vocab_size=30522 \ + --attention_probs_dropout_prob=0 \ + --hidden_dropout_prob=0 \ + --hidden_size_per_head=64 \ + --data_part_num=64 \ + 
--data_dir=$DATA_DIR \ + --log_dir=${LOG_FOLDER} \ + --model_save_every_n_iter=10000 \ + --save_last_snapshot=True \ + --model_save_dir=./snapshots \ + --debug=${DEBUG_MODE} \ + --data_load_random=0 \ + --model_load=/opt/initial_model \ + ${FP_CMD} \ + --optimizer_type=${OPTIMIZER} \ + 2>&1 | tee ${LOGFILE} + +echo "Writting log to ${LOGFILE}" + +SQLITE=$LOG_FOLDER/bert_pretraining_${GPUS_PER_NODE}gpu_${BATCH_SIZE}bs_${ITER_NUM}iter.sqlite +QDREP=$LOG_FOLDER/bert_pretraining_${GPUS_PER_NODE}gpu_${BATCH_SIZE}bs_${ITER_NUM}iter.qdrep + +# mv $NVPROF.sqlite $SQLITE +# mv $NVPROF.qdrep $QDREP + +json_file=${LOG_FOLDER}out.json +python analysis.py \ + --log_file=$LOGFILE \ + --mem_file=$MEM_FILE \ + --out_file=$json_file \ + --gpu_num=$GPUS_PER_NODE + +# --use_fp16 \ + diff --git a/LanguageModeling/BERT/train_perbert_list.sh b/LanguageModeling/BERT/train_perbert_list.sh new file mode 100644 index 0000000..6dc1fcb --- /dev/null +++ b/LanguageModeling/BERT/train_perbert_list.sh @@ -0,0 +1,281 @@ +NUM=${2-1} + +file_op() +{ + mkdir -p $1 + mv -f log_f* $1 + + # tar -zcvf $1.tar.gz $1 + # rm -rf $1 +} +################################################################################# +rm -rf out +rm -rf pic +mkdir pic + + +# ############################################################################### +# # f32 adam +# ############################################################################### + +# for ((i = 1; i <= ${NUM}; i++ )) +# do +# echo $i +# sh train_perbert.sh 0 0 64 1 adam +# cp -rf log/ log_f32_${i} +# done + +# file_op out/bert_f32_pretraining_8gpu_64bs_100iter + +# # mkdir bert_f32_pretraining_8gpu_64bs_100iter +# # mv -f log_f32_* bert_f32_pretraining_8gpu_64bs_100iter + +# # tar -zcvf bert_f32_pretraining_8gpu_64bs_100iter.tar.gz \ +# # bert_f32_pretraining_8gpu_64bs_100iter +# # rm -rf bert_f32_pretraining_8gpu_64bs_100iter +# ############################################################################### +# # f32 lamb +# ############################################################################### + +# for (( i = 1; i <= ${NUM}; i++ )) +# do +# echo $i +# sh train_perbert.sh 0 0 64 1 lamb +# cp -rf log/ log_f32_${i} +# done + +# file_op out/bert_f32_pretraining_8gpu_64bs_100iter_lamb + +# # ############################################################################### +# # # f16 adam +# # ############################################################################### + +# for (( i = 1; i <= ${NUM}; i++ )) +# do +# echo $i +# sh train_perbert.sh 1 0 64 1 adam +# cp -rf log/ log_f16_${i} +# done + +# file_op out/bert_f16_pretraining_8gpu_64bs_100iter + +# ############################################################################### +# # f16 lamb +# ############################################################################### + +# for (( i = 1; i <= ${NUM}; i++ )) +# do +# echo $i +# sh train_perbert.sh 1 0 64 1 lamb +# cp -rf log/ log_f16_${i} +# done + +# file_op out/bert_f16_pretraining_8gpu_64bs_100iter_lamb + +# ############################################################################### +# # f32 accumulation adam +# ############################################################################### + +# for (( i = 1; i <= ${NUM}; i++ )) +# do +# echo $i +# sh train_perbert.sh 0 0 32 2 adam +# cp -rf log/ log_f32_${i} +# done + +# file_op out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation + +# ############################################################################### +# # f32 accumulation lamb +# 
############################################################################### + +# for (( i = 1; i <= ${NUM}; i++ )) +# do +# echo $i +# sh train_perbert.sh 0 0 32 2 lamb +# cp -rf log/ log_f32_${i} +# done + +# file_op out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb + +# ############################################################################### +# # f16 accumulation adam +# ############################################################################### + +# for (( i = 1; i <= ${NUM}; i++ )) +# do +# echo $i +# sh train_perbert.sh 1 0 32 2 adam +# cp -rf log/ log_f16_${i} +# done + +# file_op out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation + +# ############################################################################### +# # f16 accumulation lamb +# ############################################################################### + +# for (( i = 1; i <= ${NUM}; i++ )) +# do +# echo $i +# sh train_perbert.sh 1 0 32 2 lamb +# cp -rf log/ log_f16_${i} +# done + +# file_op out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb + + +############################################################################### +# f32 adam debug +############################################################################### + +for (( i = 1; i <= ${NUM}; i++ )) +do + echo $i + sh train_perbert.sh 0 1 64 1 adam + cp -rf log/ log_f32_${i} +done + +file_op out/bert_f32_pretraining_8gpu_64bs_100iter_debug + +python result_analysis.py --f32=1 \ + --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_debug/log_f32_1/out.json \ + --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_debug/log_f32_1/out.json \ + --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_debug.png +############################################################################### +# f32 lamb debug +############################################################################### + +for (( i = 1; i <= ${NUM}; i++ )) +do + echo $i + sh train_perbert.sh 0 1 64 1 lamb + cp -rf log/ log_f32_${i} +done + +file_op out/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug + +python result_analysis.py --f32=1 \ + --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug/log_f32_1/out.json \ + --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug/log_f32_1/out.json \ + --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug.png +############################################################################### +# f16 adam debug +############################################################################### + +for (( i = 1; i <= ${NUM}; i++ )) +do + echo $i + sh train_perbert.sh 1 1 64 1 adam + cp -rf log/ log_f16_${i} +done +file_op out/bert_f16_pretraining_8gpu_64bs_100iter_debug + +python result_analysis.py --f32=0 \ + --cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_debug/log_f16_1/out.json \ + --cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_debug/log_f16_1/out.json \ + --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_debug.png +############################################################################### +# f16 lamb debug +############################################################################### + +for (( i = 1; i <= ${NUM}; i++ )) +do + echo $i + sh train_perbert.sh 1 1 64 1 lamb + cp -rf log/ log_f16_${i} +done + +file_op out/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug + +python result_analysis.py --f32=0 \ + --cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug/log_f16_1/out.json \ + 
--cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug/log_f16_1/out.json \ + --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug.png +############################################################################### +# f32 accumulation adam debug +############################################################################### + +for (( i = 1; i <= ${NUM}; i++ )) +do + echo $i + sh train_perbert.sh 0 1 32 2 adam + cp -rf log/ log_f32_${i} + +done + +file_op out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug + +python result_analysis.py --f32=1 \ + --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f32_1/out.json \ + --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f32_1/out.json \ + --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug.png + +############################################################################### +# f32 accumulation lamb debug +############################################################################### + +for (( i = 1; i <= ${NUM}; i++ )) +do + echo $i + sh train_perbert.sh 0 1 32 2 lamb + cp -rf log/ log_f32_${i} +done + +file_op out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug + +python result_analysis.py --f32=1 \ + --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f32_1/out.json \ + --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f32_1/out.json \ + --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug.png + +############################################################################### +# f16 accumulation adam debug +############################################################################### + +for (( i = 1; i <= ${NUM}; i++ )) +do + echo $i + sh train_perbert.sh 1 1 32 2 adam + cp -rf log/ log_f16_${i} +done + +file_op out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug + +python result_analysis.py --f32=0 \ + --cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f16_1/out.json \ + --cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f16_1/out.json \ + --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug.png +############################################################################### +# f16 accumulation lamb +############################################################################### + +for (( i = 1; i <= ${NUM}; i++ )) +do + echo $i + sh train_perbert.sh 1 1 32 2 lamb + cp -rf log/ log_f16_${i} +done + +file_op out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug + +python result_analysis.py --f32=0 \ + --cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f16_1/out.json \ + --cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f16_1/out.json \ + --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug.png +# ############################################################################## +# tar +# ############################################################################## + +tar -zcvf out.tar.gz out + +python stitching_pic.py --dir=pic --out_file=./pic/all.png +# rm -rf out +############################################################################### +# upload +############################################################################### + + + + + diff --git a/LanguageModeling/BERT/util.py b/LanguageModeling/BERT/util.py index f6ff7d1..e62c3e4 100755 --- 
a/LanguageModeling/BERT/util.py +++ b/LanguageModeling/BERT/util.py @@ -21,6 +21,7 @@ import pandas as pd from datetime import datetime import oneflow as flow +import subprocess def InitNodes(args): @@ -103,6 +104,7 @@ def __init__(self, desc='train', print_steps=-1, batch_size=256, keys=[]): self.timer.start() self._clear() + def _clear(self): for key in self.keys: self.metric_dict[key] = 0.0 @@ -118,6 +120,9 @@ def metric_cb(self, step=0, **kwargs): def callback(outputs): if step == 0: self._clear() + if step == 1: + print(subprocess.check_output("nvidia-smi --query-gpu=memory.used --format=csv ", shell=True)) + for key in self.keys: self.metric_dict[key] += outputs[key].sum() self.metric_dict['n_' + key] += outputs[key].size From a4299c082f52f6ec1de11dcb900115b31ecc137b Mon Sep 17 00:00:00 2001 From: aishangjj <702572275@qq.com> Date: Wed, 19 May 2021 17:13:25 +0800 Subject: [PATCH 08/23] Revert "run_pretraining.py Add some parameters." This reverts commit 647dbd6b34b39c779d870d5cc80e1af3107c4c73. --- LanguageModeling/BERT/analysis.py | 72 ----- LanguageModeling/BERT/gpu_memory_usage.py | 62 ----- LanguageModeling/BERT/result_analysis.py | 144 ---------- LanguageModeling/BERT/run_pretraining.py | 26 +- LanguageModeling/BERT/stitching_pic.py | 38 --- LanguageModeling/BERT/train_perbert.sh | 129 --------- LanguageModeling/BERT/train_perbert_list.sh | 281 -------------------- LanguageModeling/BERT/util.py | 5 - 8 files changed, 2 insertions(+), 755 deletions(-) delete mode 100644 LanguageModeling/BERT/analysis.py delete mode 100644 LanguageModeling/BERT/gpu_memory_usage.py delete mode 100644 LanguageModeling/BERT/result_analysis.py delete mode 100644 LanguageModeling/BERT/stitching_pic.py delete mode 100644 LanguageModeling/BERT/train_perbert.sh delete mode 100644 LanguageModeling/BERT/train_perbert_list.sh diff --git a/LanguageModeling/BERT/analysis.py b/LanguageModeling/BERT/analysis.py deleted file mode 100644 index 012d159..0000000 --- a/LanguageModeling/BERT/analysis.py +++ /dev/null @@ -1,72 +0,0 @@ -import argparse -import re -import json - -from ctypes import * - - - -def collect_loss(log_file, gpu_num): - print("loss : ",log_file) - - f = open(log_file,"r") - lines = f.readlines()#读取全部内容 - total_loss = [] - mlm_loss = [] - nsp_loss = [] - throughput =[] - memory=[] - - pattern = re.compile(r"step:\s*(\d+)\s*,\s*total_loss:\s*(\d+\.?\d+)\s*,\s*mlm_loss:\s*(\d+\.?\d+)\s*,\s*nsp_loss:\s*(\d+\.?\d+)\s*,\s*throughput:\s*(\d+\.?\d+)\s*") - for line in lines: - if(line.split(':')[0] == 'step'): - # print(line) - - match = pattern.match(line) - if match: - # print(match.groups()) - total_loss.append(match.group(2)) - mlm_loss.append(match.group(3)) - nsp_loss.append(match.group(4)) - throughput.append(match.group(5)) - if(line.split(' [MiB]\\n')[0] == 'b\'memory.used'): - str_tmp = line.split(' [MiB]\\n')[1] - # print(str_tmp) - for i in range(gpu_num): - memory.append(str_tmp.split(' MiB\\n')[i]) - - return total_loss, mlm_loss, nsp_loss,throughput, memory - - -def main(): - parser = argparse.ArgumentParser(description="collect GPU device memory usage") - parser.add_argument("--log_file", type=str, default=None) - parser.add_argument("--mem_file", type=str, default=None) - parser.add_argument("--out_file", type=str, default=None) - parser.add_argument("--gpu_num", type=int, default=1) - - args = parser.parse_args() - - total_loss, mlm_loss, nsp_loss,throughput, memory = collect_loss(args.log_file, args.gpu_num) - # print(total_loss) - # print(mlm_loss) - # print(nsp_loss) - # 
print(throughput) - out={} - out['total_loss'] = total_loss - out['mlm_loss'] = mlm_loss - out['nsp_loss'] = nsp_loss - out['throughput'] = throughput - out['memory'] = memory - # print(out) - - string = json.dumps(out) - with open(args.out_file,'w')as f: - f.write(string) - - -if __name__ == "__main__": - # libc = CDLL("libc.so.6") - # msg = "Hello, world!\n" - # libc.printf("Testing: %s \n", msg) - main() diff --git a/LanguageModeling/BERT/gpu_memory_usage.py b/LanguageModeling/BERT/gpu_memory_usage.py deleted file mode 100644 index cf2e3f9..0000000 --- a/LanguageModeling/BERT/gpu_memory_usage.py +++ /dev/null @@ -1,62 +0,0 @@ -import time -import argparse -import pynvml - - -class Device(object): - class Status: - INIT = "INIT" - DETECTING = "DETECTING" - STOP = "STOP" - - start_detecting_mem_threshold = 32 * 1024 * 1024 - - def __init__(self, handle): - self.handle = handle - self.status = self.Status.INIT - self.max_mem_usage = 0 - - def update(self): - info = pynvml.nvmlDeviceGetMemoryInfo(self.handle) - if self.status == self.Status.INIT: - if info.used > self.start_detecting_mem_threshold: - self.status = self.Status.DETECTING - elif self.status == self.Status.DETECTING: - if info.used < self.start_detecting_mem_threshold: - self.status = self.Status.STOP - return False - else: - self.max_mem_usage = max(self.max_mem_usage, info.used) - elif self.status == self.Status.STOP: - raise ValueError("detecting is stop") - else: - raise ValueError("invalid status") - - return True - - -def main(): - parser = argparse.ArgumentParser(description="collect GPU device memory usage") - parser.add_argument("-g", type=int, default=1, help="number of gpu devices") - parser.add_argument("-n", type=float, default=1, help="metrics rate") - args = parser.parse_args() - - pynvml.nvmlInit() - n_gpus = args.g - devices = [Device(pynvml.nvmlDeviceGetHandleByIndex(i)) for i in range(n_gpus)] - - running = True - while running: - time.sleep(args.n) - running = False - for device in devices: - running |= device.update() - - pynvml.nvmlShutdown() - for i, device in enumerate(devices): - max_mem_usage_mbytes = device.max_mem_usage / 1024 / 1024 - print(f"{max_mem_usage_mbytes:.2f}") - - -if __name__ == "__main__": - main() diff --git a/LanguageModeling/BERT/result_analysis.py b/LanguageModeling/BERT/result_analysis.py deleted file mode 100644 index 5f0b8f3..0000000 --- a/LanguageModeling/BERT/result_analysis.py +++ /dev/null @@ -1,144 +0,0 @@ -import json -import matplotlib.pyplot as plt -import argparse -import numpy as np - -def read_file(file1, file2): - - with open(file1,'r') as load_f: - dict1 = json.load(load_f) - - with open(file2,'r') as load_f: - dict2 = json.load(load_f) - - - return dict1, dict2 - -def analysis_loss(y3, f32): - # if f32 == 1: - # iter = len(y3) - - # else: - # len = y3.shape[0] - # min = y3.min() - # max = y3.max() - # mean = np.mean(y3) - # var = np.var(y3) - - # random = np.random.randn(len)/10000.0 - # print(random) - # print(y3) - # lst=[y3,random] - # res=np.corrcoef(lst) - # print('----->', res) - - len = y3.shape[0] - - if f32 == 1: - iter = np.count_nonzero(y3==0) - tmp = iter/len - print('count zeor = ', iter) - if iter/len > 0.6: - print('Test passed') - return 1 - else: - print('Test failed') - return 0 - else: - - mean = np.mean(y3) - var = np.var(y3) - print('F16---->', abs(mean), var) - - if abs(mean) < 0.001 and var < 0.00001: - print('Test passed') - return 1 - else: - print('Test failed') - return 0 - -def drawing_loss(dict1, dict2, f32, image): - - - m1 = 
dict1["memory"] - m2 = dict2["memory"] - - print(m1) - print(m2) - table_len = len(m1) - row_labels = ['old','new'] - table_vals = [m1,m2] - - - y1 = dict1["total_loss"] - y2 = dict2["total_loss"] - y1=list(map(float,y1)) - y2=list(map(float,y2)) - x = np.arange(1, len(y1)+1) - y1 = np.array(y1) - y2 = np.array(y2) - y3 = np.subtract(y1,y2) - # v =list(map(lambda x,y:x - y)) - - result = analysis_loss(y3, f32) - - print(x) - print(y1) - print(y2) - print(y3) - fig = plt.figure(figsize=(24,8), dpi=150) - plt.figure(1) - ax = fig.add_subplot() - - - ax1 = plt.subplot(121) - plt.xlabel('iterations') - plt.plot(x,y1,color='red',label='Diachronic version') - plt.plot(x,y2,color='blue',label='Current version') - plt.title('Loss comparison') - - plt.legend(loc='best') - - ax2 = plt.subplot(122) - plt.xlabel('iterations') - plt.plot(x,y3,color='red') - plt.title('Loss difference') - plt.table(cellText=table_vals, rowLabels=row_labels, colWidths=[0.05]*table_len, loc='best') - - plt.suptitle(image.split('/')[1].split('.')[0],fontsize=20,x=0.5,y=0.98) - - if result == 1: - plt.text(0.9, 1,'PASS', fontsize=50, color='blue', transform=ax.transAxes) - else: - plt.text(0.9, 1,'FAILED',fontsize=50,color='red',transform=ax.transAxes) - plt.savefig(image) - - -# def analysis_f32(dict1, dict2): -# return 1 -# def analysis_f16(dict1, dict2): -# return 1 - -def main(): - print('test') - parser = argparse.ArgumentParser(description="Compare and analyze training results and output icons") - parser.add_argument("--cmp1_file", type=str, default=None) - parser.add_argument("--cmp2_file", type=str, default=None) - parser.add_argument("--out_file", type=str, default=None) - parser.add_argument("--f32", type=int, default=0) - args = parser.parse_args() - - # print('---------------------') - dict1, dict2 = read_file(args.cmp1_file, args.cmp2_file) - - # if args.f32 == 1: - # result = analysis_f32(dict1, dict2) - # else: - # result = analysis_f16(dict1, dict2) - - - drawing_loss(dict1, dict2, args.f32, args.out_file) - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/LanguageModeling/BERT/run_pretraining.py b/LanguageModeling/BERT/run_pretraining.py index 3155317..e784400 100755 --- a/LanguageModeling/BERT/run_pretraining.py +++ b/LanguageModeling/BERT/run_pretraining.py @@ -29,26 +29,9 @@ parser.add_argument("--data_part_num", type=int, default=32, help="data part number in dataset") parser.add_argument("--iter_num", type=int, default=1144000, help="total iterations to run") parser.add_argument("--batch_size_per_device", type=int, default=64) -parser.add_argument("--debug", type=int, default=0) -parser.add_argument("--data_load_random", type=int, default=1) -parser.add_argument("--model_load", type=str, default=None) - - args = parser.parse_args() configs.print_args(args) - -if args.debug == 1: - flow.config.enable_debug_mode(True) - print('Enable Debug !!!!!!!') - -if args.data_load_random == 1: - random_tmp=True - print('Enable random loading of data !!!!!!!') -else: - random_tmp=False - print('Disable random loading of data !!!!!!!') - batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device @@ -56,8 +39,8 @@ def BertDecoder(data_dir, batch_size, data_part_num, seq_length, max_predictions ofrecord = flow.data.ofrecord_reader(data_dir, batch_size=batch_size, data_part_num=data_part_num, - random_shuffle = random_tmp, - shuffle_after_epoch=random_tmp) + random_shuffle = True, + shuffle_after_epoch=True) blob_confs = {} def _blob_conf(name, shape, 
dtype=flow.int32): blob_confs[name] = flow.data.OFRecordRawDecoder(ofrecord, name, shape=shape, dtype=dtype) @@ -121,11 +104,6 @@ def main(): snapshot = Snapshot(args.model_save_dir, args.model_load_dir) - - if args.model_load != None: - flow.load_variables(flow.checkpoint.get(args.model_load)) - - print('num_accumulation_steps:', args.num_accumulation_steps) metric = Metric(desc='train', print_steps=args.loss_print_every_n_iter, batch_size=batch_size * args.num_accumulation_steps, keys=['total_loss', 'mlm_loss', 'nsp_loss']) for step in range(args.iter_num): diff --git a/LanguageModeling/BERT/stitching_pic.py b/LanguageModeling/BERT/stitching_pic.py deleted file mode 100644 index f2dbb2b..0000000 --- a/LanguageModeling/BERT/stitching_pic.py +++ /dev/null @@ -1,38 +0,0 @@ -import matplotlib.pyplot as plt -import argparse -import os -from PIL import Image - -def stiiching_pic(dir, out_file): - - hight = 1200 - width = 3600 - file_lsit = os.listdir(dir) - target = Image.new('RGBA', (width, hight*len(file_lsit))) - left = 0 - right = hight - for file in file_lsit: - - tmp = dir+'/'+file - print(tmp) - image = Image.open(tmp) - # print(image) - # print(target) - target.paste(image, (0, left, width, right)) - left += hight - right += hight - target.save(out_file) - - -def main(): - print('test') - parser = argparse.ArgumentParser(description="Stitching pictures") - parser.add_argument("--dir", type=str, default=None) - parser.add_argument("--out_file", type=str, default=None) - args = parser.parse_args() - - stiiching_pic(args.dir, args.out_file) - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/LanguageModeling/BERT/train_perbert.sh b/LanguageModeling/BERT/train_perbert.sh deleted file mode 100644 index d662ecb..0000000 --- a/LanguageModeling/BERT/train_perbert.sh +++ /dev/null @@ -1,129 +0,0 @@ -USE_FP16=${1:-0} -#DEBUG -DEBUG_MODE=${2:-0} -# -BATCH_SIZE=${3:-64} -#accumulation -ACCUMULATION_STEMPS=${4-1} -# -OPTIMIZER=${5-adam} -#GPU -GPUS_PER_NODE=${6:-8} -# -ITER_NUM=${7:-100} -# -PRINT_ITER=${8:-1} -# -NODE_RANK=${9:-0} - -LOG_FOLDER=./log/ - -########################################################################################################## -# FP -########################################################################################################## -echo ${USE_FP16} - - -if [ "$USE_FP16" = 1 ];then - FP_CMD=--use_fp16 - FP_NAME=f16 - echo "USE_FP16" -else - FP_CMD= - FP_NAME=f32 - echo "USE_FP32" -fi -########################################################################################################## -# DEBUG_NAME -########################################################################################################## -# if [ DEBUG_MODE==1 ];then -# DEBUG_NAME=debug -# else -# DEBUG_NAME= -# fi - -########################################################################################################## -# Create folder -########################################################################################################## -#bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug -mkdir -p $LOG_FOLDER - -# OUTFILE=bert_pretraining_${FP_NAME}_${GPUS_PER_NODE}gpu_${BATCH_SIZE}bs_${ITER_NUM}iter_${OPTIMIZER}\ -# _${DEBUG_NAME} -# mkdir -p $OUTFILE - -LOGFILE=$LOG_FOLDER/bert_pretraining_${FP_NAME}_${GPUS_PER_NODE}gpu_${BATCH_SIZE}bs_${ITER_NUM}iter_${OPTIMIZER}\ -_${DEBUG_NAME}.log - -MODEL_DIR=./snapshots/ -# DATA_DIR=/DATA/disk1/bert/wiki_seq_len_128/ -DATA_DIR=/data/bert_dataset - -MEM_FILE=$LOG_FOLDER/memory.log - -echo LOGFILE=$LOGFILE 
-echo DATA_DIR=$DATA_DIR - -#${NNODES}n${GPUS_PER_NODE}g_dp${D_P}_mp${M_P}_pp${P_P}_mbs${MICRO_BATCH_SIZE}_gbs${GLOABAL_BATCH_SIZE}_pretrain_${NODE_RANK}.log -rm -rf ${MODEL_DIR}/* -rm -rf ${LOG_FOLDER}/* - -NVPROF=baseline-report_${NODE_RANK} -#-o ${NVPROF} - - -# -g $GPUS_PER_NODE \ -# -n 0.5 \ - - -#nsys profile --stats=true -o ${NVPROF} \ -python3 run_pretraining.py \ - --gpu_num_per_node=${GPUS_PER_NODE} \ - --num_nodes=1 \ - --learning_rate=1e-4 \ - --warmup_proportion=0.01 \ - --weight_decay_rate=0.01 \ - --batch_size_per_device=${BATCH_SIZE} \ - --iter_num=${ITER_NUM} \ - --loss_print_every_n_iter=${PRINT_ITER} \ - --seq_length=128 \ - --max_predictions_per_seq=20 \ - --num_hidden_layers=12 \ - --num_attention_heads=12 \ - --num_accumulation_steps=${ACCUMULATION_STEMPS} \ - --max_position_embeddings=512 \ - --type_vocab_size=2 \ - --vocab_size=30522 \ - --attention_probs_dropout_prob=0 \ - --hidden_dropout_prob=0 \ - --hidden_size_per_head=64 \ - --data_part_num=64 \ - --data_dir=$DATA_DIR \ - --log_dir=${LOG_FOLDER} \ - --model_save_every_n_iter=10000 \ - --save_last_snapshot=True \ - --model_save_dir=./snapshots \ - --debug=${DEBUG_MODE} \ - --data_load_random=0 \ - --model_load=/opt/initial_model \ - ${FP_CMD} \ - --optimizer_type=${OPTIMIZER} \ - 2>&1 | tee ${LOGFILE} - -echo "Writting log to ${LOGFILE}" - -SQLITE=$LOG_FOLDER/bert_pretraining_${GPUS_PER_NODE}gpu_${BATCH_SIZE}bs_${ITER_NUM}iter.sqlite -QDREP=$LOG_FOLDER/bert_pretraining_${GPUS_PER_NODE}gpu_${BATCH_SIZE}bs_${ITER_NUM}iter.qdrep - -# mv $NVPROF.sqlite $SQLITE -# mv $NVPROF.qdrep $QDREP - -json_file=${LOG_FOLDER}out.json -python analysis.py \ - --log_file=$LOGFILE \ - --mem_file=$MEM_FILE \ - --out_file=$json_file \ - --gpu_num=$GPUS_PER_NODE - -# --use_fp16 \ - diff --git a/LanguageModeling/BERT/train_perbert_list.sh b/LanguageModeling/BERT/train_perbert_list.sh deleted file mode 100644 index 6dc1fcb..0000000 --- a/LanguageModeling/BERT/train_perbert_list.sh +++ /dev/null @@ -1,281 +0,0 @@ -NUM=${2-1} - -file_op() -{ - mkdir -p $1 - mv -f log_f* $1 - - # tar -zcvf $1.tar.gz $1 - # rm -rf $1 -} -################################################################################# -rm -rf out -rm -rf pic -mkdir pic - - -# ############################################################################### -# # f32 adam -# ############################################################################### - -# for ((i = 1; i <= ${NUM}; i++ )) -# do -# echo $i -# sh train_perbert.sh 0 0 64 1 adam -# cp -rf log/ log_f32_${i} -# done - -# file_op out/bert_f32_pretraining_8gpu_64bs_100iter - -# # mkdir bert_f32_pretraining_8gpu_64bs_100iter -# # mv -f log_f32_* bert_f32_pretraining_8gpu_64bs_100iter - -# # tar -zcvf bert_f32_pretraining_8gpu_64bs_100iter.tar.gz \ -# # bert_f32_pretraining_8gpu_64bs_100iter -# # rm -rf bert_f32_pretraining_8gpu_64bs_100iter -# ############################################################################### -# # f32 lamb -# ############################################################################### - -# for (( i = 1; i <= ${NUM}; i++ )) -# do -# echo $i -# sh train_perbert.sh 0 0 64 1 lamb -# cp -rf log/ log_f32_${i} -# done - -# file_op out/bert_f32_pretraining_8gpu_64bs_100iter_lamb - -# # ############################################################################### -# # # f16 adam -# # ############################################################################### - -# for (( i = 1; i <= ${NUM}; i++ )) -# do -# echo $i -# sh train_perbert.sh 1 0 64 1 adam -# cp -rf log/ log_f16_${i} -# done - 
-# file_op out/bert_f16_pretraining_8gpu_64bs_100iter - -# ############################################################################### -# # f16 lamb -# ############################################################################### - -# for (( i = 1; i <= ${NUM}; i++ )) -# do -# echo $i -# sh train_perbert.sh 1 0 64 1 lamb -# cp -rf log/ log_f16_${i} -# done - -# file_op out/bert_f16_pretraining_8gpu_64bs_100iter_lamb - -# ############################################################################### -# # f32 accumulation adam -# ############################################################################### - -# for (( i = 1; i <= ${NUM}; i++ )) -# do -# echo $i -# sh train_perbert.sh 0 0 32 2 adam -# cp -rf log/ log_f32_${i} -# done - -# file_op out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation - -# ############################################################################### -# # f32 accumulation lamb -# ############################################################################### - -# for (( i = 1; i <= ${NUM}; i++ )) -# do -# echo $i -# sh train_perbert.sh 0 0 32 2 lamb -# cp -rf log/ log_f32_${i} -# done - -# file_op out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb - -# ############################################################################### -# # f16 accumulation adam -# ############################################################################### - -# for (( i = 1; i <= ${NUM}; i++ )) -# do -# echo $i -# sh train_perbert.sh 1 0 32 2 adam -# cp -rf log/ log_f16_${i} -# done - -# file_op out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation - -# ############################################################################### -# # f16 accumulation lamb -# ############################################################################### - -# for (( i = 1; i <= ${NUM}; i++ )) -# do -# echo $i -# sh train_perbert.sh 1 0 32 2 lamb -# cp -rf log/ log_f16_${i} -# done - -# file_op out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb - - -############################################################################### -# f32 adam debug -############################################################################### - -for (( i = 1; i <= ${NUM}; i++ )) -do - echo $i - sh train_perbert.sh 0 1 64 1 adam - cp -rf log/ log_f32_${i} -done - -file_op out/bert_f32_pretraining_8gpu_64bs_100iter_debug - -python result_analysis.py --f32=1 \ - --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_debug/log_f32_1/out.json \ - --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_debug/log_f32_1/out.json \ - --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_debug.png -############################################################################### -# f32 lamb debug -############################################################################### - -for (( i = 1; i <= ${NUM}; i++ )) -do - echo $i - sh train_perbert.sh 0 1 64 1 lamb - cp -rf log/ log_f32_${i} -done - -file_op out/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug - -python result_analysis.py --f32=1 \ - --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug/log_f32_1/out.json \ - --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug/log_f32_1/out.json \ - --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug.png -############################################################################### -# f16 adam debug -############################################################################### - -for (( i = 1; i <= ${NUM}; i++ )) -do - echo $i - sh train_perbert.sh 1 1 64 1 adam 
- cp -rf log/ log_f16_${i} -done -file_op out/bert_f16_pretraining_8gpu_64bs_100iter_debug - -python result_analysis.py --f32=0 \ - --cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_debug/log_f16_1/out.json \ - --cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_debug/log_f16_1/out.json \ - --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_debug.png -############################################################################### -# f16 lamb debug -############################################################################### - -for (( i = 1; i <= ${NUM}; i++ )) -do - echo $i - sh train_perbert.sh 1 1 64 1 lamb - cp -rf log/ log_f16_${i} -done - -file_op out/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug - -python result_analysis.py --f32=0 \ - --cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug/log_f16_1/out.json \ - --cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug/log_f16_1/out.json \ - --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug.png -############################################################################### -# f32 accumulation adam debug -############################################################################### - -for (( i = 1; i <= ${NUM}; i++ )) -do - echo $i - sh train_perbert.sh 0 1 32 2 adam - cp -rf log/ log_f32_${i} - -done - -file_op out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug - -python result_analysis.py --f32=1 \ - --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f32_1/out.json \ - --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f32_1/out.json \ - --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug.png - -############################################################################### -# f32 accumulation lamb debug -############################################################################### - -for (( i = 1; i <= ${NUM}; i++ )) -do - echo $i - sh train_perbert.sh 0 1 32 2 lamb - cp -rf log/ log_f32_${i} -done - -file_op out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug - -python result_analysis.py --f32=1 \ - --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f32_1/out.json \ - --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f32_1/out.json \ - --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug.png - -############################################################################### -# f16 accumulation adam debug -############################################################################### - -for (( i = 1; i <= ${NUM}; i++ )) -do - echo $i - sh train_perbert.sh 1 1 32 2 adam - cp -rf log/ log_f16_${i} -done - -file_op out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug - -python result_analysis.py --f32=0 \ - --cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f16_1/out.json \ - --cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f16_1/out.json \ - --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug.png -############################################################################### -# f16 accumulation lamb -############################################################################### - -for (( i = 1; i <= ${NUM}; i++ )) -do - echo $i - sh train_perbert.sh 1 1 32 2 lamb - cp -rf log/ log_f16_${i} -done - -file_op out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug - -python result_analysis.py --f32=0 \ - 
--cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f16_1/out.json \ - --cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f16_1/out.json \ - --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug.png -# ############################################################################## -# tar -# ############################################################################## - -tar -zcvf out.tar.gz out - -python stitching_pic.py --dir=pic --out_file=./pic/all.png -# rm -rf out -############################################################################### -# upload -############################################################################### - - - - - diff --git a/LanguageModeling/BERT/util.py b/LanguageModeling/BERT/util.py index e62c3e4..f6ff7d1 100755 --- a/LanguageModeling/BERT/util.py +++ b/LanguageModeling/BERT/util.py @@ -21,7 +21,6 @@ import pandas as pd from datetime import datetime import oneflow as flow -import subprocess def InitNodes(args): @@ -104,7 +103,6 @@ def __init__(self, desc='train', print_steps=-1, batch_size=256, keys=[]): self.timer.start() self._clear() - def _clear(self): for key in self.keys: self.metric_dict[key] = 0.0 @@ -120,9 +118,6 @@ def metric_cb(self, step=0, **kwargs): def callback(outputs): if step == 0: self._clear() - if step == 1: - print(subprocess.check_output("nvidia-smi --query-gpu=memory.used --format=csv ", shell=True)) - for key in self.keys: self.metric_dict[key] += outputs[key].sum() self.metric_dict['n_' + key] += outputs[key].size From 1ec5a656a3e6516665b6cde5e4e8fe14348fe8d2 Mon Sep 17 00:00:00 2001 From: aishangjj <702572275@qq.com> Date: Wed, 19 May 2021 18:12:13 +0800 Subject: [PATCH 09/23] Add BERT pretraining regression-test tooling: run_pretraining.py adds the debug, data_load_random and model_load parameters; util.py prints GPU memory usage via nvidia-smi during training; analysis.py parses the log files for loss and GPU memory; result_analysis.py compares two runs and generates a png image; stitching_pic.py stitches the pictures; train_perbert.sh is the training script and train_perbert_list.sh the training list.
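These pieces are meant to be chained into one regression pass per configuration, which is what train_perbert_list.sh automates (that script additionally moves each run's logs under out/ before comparing). A minimal sketch of the flow for a single fp32 adam debug run, assuming a saved baseline under old/ as in train_perbert_list.sh:

    # fp32, debug mode, batch 64, 1 accumulation step, adam;
    # train_perbert.sh also runs analysis.py at the end, writing log/out.json
    sh train_perbert.sh 0 1 64 1 adam
    cp -rf log/ log_f32_1
    # compare the new out.json against the saved baseline and render a pass/fail picture
    python result_analysis.py --f32=1 \
        --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_debug/log_f32_1/out.json \
        --cmp2_file=log_f32_1/out.json \
        --out_file=pic/bert_f32_pretraining_8gpu_64bs_100iter_debug.png
    # splice all comparison pictures into one overview image
    python stitching_pic.py --dir=pic --out_file=./pic/all.png

For fp32 comparisons result_analysis.py reports PASS when more than 60% of the per-step total_loss differences are exactly zero; for fp16 it requires the absolute mean of the differences to be below 1e-3 and their variance below 1e-5.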
--- LanguageModeling/BERT/analysis.py | 72 +++++ LanguageModeling/BERT/gpu_memory_usage.py | 62 +++++ LanguageModeling/BERT/result_analysis.py | 144 ++++++++++ LanguageModeling/BERT/run_pretraining.py | 26 +- LanguageModeling/BERT/stitching_pic.py | 38 +++ LanguageModeling/BERT/train_perbert.sh | 129 +++++++++ LanguageModeling/BERT/train_perbert_list.sh | 281 ++++++++++++++++++++ LanguageModeling/BERT/util.py | 4 + 8 files changed, 754 insertions(+), 2 deletions(-) create mode 100644 LanguageModeling/BERT/analysis.py create mode 100644 LanguageModeling/BERT/gpu_memory_usage.py create mode 100644 LanguageModeling/BERT/result_analysis.py create mode 100644 LanguageModeling/BERT/stitching_pic.py create mode 100644 LanguageModeling/BERT/train_perbert.sh create mode 100644 LanguageModeling/BERT/train_perbert_list.sh diff --git a/LanguageModeling/BERT/analysis.py b/LanguageModeling/BERT/analysis.py new file mode 100644 index 0000000..012d159 --- /dev/null +++ b/LanguageModeling/BERT/analysis.py @@ -0,0 +1,72 @@ +import argparse +import re +import json + +from ctypes import * + + + +def collect_loss(log_file, gpu_num): + print("loss : ",log_file) + + f = open(log_file,"r") + lines = f.readlines()#读取全部内容 + total_loss = [] + mlm_loss = [] + nsp_loss = [] + throughput =[] + memory=[] + + pattern = re.compile(r"step:\s*(\d+)\s*,\s*total_loss:\s*(\d+\.?\d+)\s*,\s*mlm_loss:\s*(\d+\.?\d+)\s*,\s*nsp_loss:\s*(\d+\.?\d+)\s*,\s*throughput:\s*(\d+\.?\d+)\s*") + for line in lines: + if(line.split(':')[0] == 'step'): + # print(line) + + match = pattern.match(line) + if match: + # print(match.groups()) + total_loss.append(match.group(2)) + mlm_loss.append(match.group(3)) + nsp_loss.append(match.group(4)) + throughput.append(match.group(5)) + if(line.split(' [MiB]\\n')[0] == 'b\'memory.used'): + str_tmp = line.split(' [MiB]\\n')[1] + # print(str_tmp) + for i in range(gpu_num): + memory.append(str_tmp.split(' MiB\\n')[i]) + + return total_loss, mlm_loss, nsp_loss,throughput, memory + + +def main(): + parser = argparse.ArgumentParser(description="collect GPU device memory usage") + parser.add_argument("--log_file", type=str, default=None) + parser.add_argument("--mem_file", type=str, default=None) + parser.add_argument("--out_file", type=str, default=None) + parser.add_argument("--gpu_num", type=int, default=1) + + args = parser.parse_args() + + total_loss, mlm_loss, nsp_loss,throughput, memory = collect_loss(args.log_file, args.gpu_num) + # print(total_loss) + # print(mlm_loss) + # print(nsp_loss) + # print(throughput) + out={} + out['total_loss'] = total_loss + out['mlm_loss'] = mlm_loss + out['nsp_loss'] = nsp_loss + out['throughput'] = throughput + out['memory'] = memory + # print(out) + + string = json.dumps(out) + with open(args.out_file,'w')as f: + f.write(string) + + +if __name__ == "__main__": + # libc = CDLL("libc.so.6") + # msg = "Hello, world!\n" + # libc.printf("Testing: %s \n", msg) + main() diff --git a/LanguageModeling/BERT/gpu_memory_usage.py b/LanguageModeling/BERT/gpu_memory_usage.py new file mode 100644 index 0000000..cf2e3f9 --- /dev/null +++ b/LanguageModeling/BERT/gpu_memory_usage.py @@ -0,0 +1,62 @@ +import time +import argparse +import pynvml + + +class Device(object): + class Status: + INIT = "INIT" + DETECTING = "DETECTING" + STOP = "STOP" + + start_detecting_mem_threshold = 32 * 1024 * 1024 + + def __init__(self, handle): + self.handle = handle + self.status = self.Status.INIT + self.max_mem_usage = 0 + + def update(self): + info = pynvml.nvmlDeviceGetMemoryInfo(self.handle) + if 
self.status == self.Status.INIT: + if info.used > self.start_detecting_mem_threshold: + self.status = self.Status.DETECTING + elif self.status == self.Status.DETECTING: + if info.used < self.start_detecting_mem_threshold: + self.status = self.Status.STOP + return False + else: + self.max_mem_usage = max(self.max_mem_usage, info.used) + elif self.status == self.Status.STOP: + raise ValueError("detecting is stop") + else: + raise ValueError("invalid status") + + return True + + +def main(): + parser = argparse.ArgumentParser(description="collect GPU device memory usage") + parser.add_argument("-g", type=int, default=1, help="number of gpu devices") + parser.add_argument("-n", type=float, default=1, help="metrics rate") + args = parser.parse_args() + + pynvml.nvmlInit() + n_gpus = args.g + devices = [Device(pynvml.nvmlDeviceGetHandleByIndex(i)) for i in range(n_gpus)] + + running = True + while running: + time.sleep(args.n) + running = False + for device in devices: + running |= device.update() + + pynvml.nvmlShutdown() + for i, device in enumerate(devices): + max_mem_usage_mbytes = device.max_mem_usage / 1024 / 1024 + print(f"{max_mem_usage_mbytes:.2f}") + + +if __name__ == "__main__": + main() diff --git a/LanguageModeling/BERT/result_analysis.py b/LanguageModeling/BERT/result_analysis.py new file mode 100644 index 0000000..5f0b8f3 --- /dev/null +++ b/LanguageModeling/BERT/result_analysis.py @@ -0,0 +1,144 @@ +import json +import matplotlib.pyplot as plt +import argparse +import numpy as np + +def read_file(file1, file2): + + with open(file1,'r') as load_f: + dict1 = json.load(load_f) + + with open(file2,'r') as load_f: + dict2 = json.load(load_f) + + + return dict1, dict2 + +def analysis_loss(y3, f32): + # if f32 == 1: + # iter = len(y3) + + # else: + # len = y3.shape[0] + # min = y3.min() + # max = y3.max() + # mean = np.mean(y3) + # var = np.var(y3) + + # random = np.random.randn(len)/10000.0 + # print(random) + # print(y3) + # lst=[y3,random] + # res=np.corrcoef(lst) + # print('----->', res) + + len = y3.shape[0] + + if f32 == 1: + iter = np.count_nonzero(y3==0) + tmp = iter/len + print('count zeor = ', iter) + if iter/len > 0.6: + print('Test passed') + return 1 + else: + print('Test failed') + return 0 + else: + + mean = np.mean(y3) + var = np.var(y3) + print('F16---->', abs(mean), var) + + if abs(mean) < 0.001 and var < 0.00001: + print('Test passed') + return 1 + else: + print('Test failed') + return 0 + +def drawing_loss(dict1, dict2, f32, image): + + + m1 = dict1["memory"] + m2 = dict2["memory"] + + print(m1) + print(m2) + table_len = len(m1) + row_labels = ['old','new'] + table_vals = [m1,m2] + + + y1 = dict1["total_loss"] + y2 = dict2["total_loss"] + y1=list(map(float,y1)) + y2=list(map(float,y2)) + x = np.arange(1, len(y1)+1) + y1 = np.array(y1) + y2 = np.array(y2) + y3 = np.subtract(y1,y2) + # v =list(map(lambda x,y:x - y)) + + result = analysis_loss(y3, f32) + + print(x) + print(y1) + print(y2) + print(y3) + fig = plt.figure(figsize=(24,8), dpi=150) + plt.figure(1) + ax = fig.add_subplot() + + + ax1 = plt.subplot(121) + plt.xlabel('iterations') + plt.plot(x,y1,color='red',label='Diachronic version') + plt.plot(x,y2,color='blue',label='Current version') + plt.title('Loss comparison') + + plt.legend(loc='best') + + ax2 = plt.subplot(122) + plt.xlabel('iterations') + plt.plot(x,y3,color='red') + plt.title('Loss difference') + plt.table(cellText=table_vals, rowLabels=row_labels, colWidths=[0.05]*table_len, loc='best') + + 
plt.suptitle(image.split('/')[1].split('.')[0],fontsize=20,x=0.5,y=0.98) + + if result == 1: + plt.text(0.9, 1,'PASS', fontsize=50, color='blue', transform=ax.transAxes) + else: + plt.text(0.9, 1,'FAILED',fontsize=50,color='red',transform=ax.transAxes) + plt.savefig(image) + + +# def analysis_f32(dict1, dict2): +# return 1 +# def analysis_f16(dict1, dict2): +# return 1 + +def main(): + print('test') + parser = argparse.ArgumentParser(description="Compare and analyze training results and output icons") + parser.add_argument("--cmp1_file", type=str, default=None) + parser.add_argument("--cmp2_file", type=str, default=None) + parser.add_argument("--out_file", type=str, default=None) + parser.add_argument("--f32", type=int, default=0) + args = parser.parse_args() + + # print('---------------------') + dict1, dict2 = read_file(args.cmp1_file, args.cmp2_file) + + # if args.f32 == 1: + # result = analysis_f32(dict1, dict2) + # else: + # result = analysis_f16(dict1, dict2) + + + drawing_loss(dict1, dict2, args.f32, args.out_file) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/LanguageModeling/BERT/run_pretraining.py b/LanguageModeling/BERT/run_pretraining.py index e784400..3155317 100755 --- a/LanguageModeling/BERT/run_pretraining.py +++ b/LanguageModeling/BERT/run_pretraining.py @@ -29,9 +29,26 @@ parser.add_argument("--data_part_num", type=int, default=32, help="data part number in dataset") parser.add_argument("--iter_num", type=int, default=1144000, help="total iterations to run") parser.add_argument("--batch_size_per_device", type=int, default=64) +parser.add_argument("--debug", type=int, default=0) +parser.add_argument("--data_load_random", type=int, default=1) +parser.add_argument("--model_load", type=str, default=None) + + args = parser.parse_args() configs.print_args(args) + +if args.debug == 1: + flow.config.enable_debug_mode(True) + print('Enable Debug !!!!!!!') + +if args.data_load_random == 1: + random_tmp=True + print('Enable random loading of data !!!!!!!') +else: + random_tmp=False + print('Disable random loading of data !!!!!!!') + batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device @@ -39,8 +56,8 @@ def BertDecoder(data_dir, batch_size, data_part_num, seq_length, max_predictions ofrecord = flow.data.ofrecord_reader(data_dir, batch_size=batch_size, data_part_num=data_part_num, - random_shuffle = True, - shuffle_after_epoch=True) + random_shuffle = random_tmp, + shuffle_after_epoch=random_tmp) blob_confs = {} def _blob_conf(name, shape, dtype=flow.int32): blob_confs[name] = flow.data.OFRecordRawDecoder(ofrecord, name, shape=shape, dtype=dtype) @@ -104,6 +121,11 @@ def main(): snapshot = Snapshot(args.model_save_dir, args.model_load_dir) + + if args.model_load != None: + flow.load_variables(flow.checkpoint.get(args.model_load)) + + print('num_accumulation_steps:', args.num_accumulation_steps) metric = Metric(desc='train', print_steps=args.loss_print_every_n_iter, batch_size=batch_size * args.num_accumulation_steps, keys=['total_loss', 'mlm_loss', 'nsp_loss']) for step in range(args.iter_num): diff --git a/LanguageModeling/BERT/stitching_pic.py b/LanguageModeling/BERT/stitching_pic.py new file mode 100644 index 0000000..f2dbb2b --- /dev/null +++ b/LanguageModeling/BERT/stitching_pic.py @@ -0,0 +1,38 @@ +import matplotlib.pyplot as plt +import argparse +import os +from PIL import Image + +def stiiching_pic(dir, out_file): + + hight = 1200 + width = 3600 + file_lsit = os.listdir(dir) + target = Image.new('RGBA', (width, 
hight*len(file_lsit))) + left = 0 + right = hight + for file in file_lsit: + + tmp = dir+'/'+file + print(tmp) + image = Image.open(tmp) + # print(image) + # print(target) + target.paste(image, (0, left, width, right)) + left += hight + right += hight + target.save(out_file) + + +def main(): + print('test') + parser = argparse.ArgumentParser(description="Stitching pictures") + parser.add_argument("--dir", type=str, default=None) + parser.add_argument("--out_file", type=str, default=None) + args = parser.parse_args() + + stiiching_pic(args.dir, args.out_file) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/LanguageModeling/BERT/train_perbert.sh b/LanguageModeling/BERT/train_perbert.sh new file mode 100644 index 0000000..d662ecb --- /dev/null +++ b/LanguageModeling/BERT/train_perbert.sh @@ -0,0 +1,129 @@ +USE_FP16=${1:-0} +#DEBUG +DEBUG_MODE=${2:-0} +# +BATCH_SIZE=${3:-64} +#accumulation +ACCUMULATION_STEMPS=${4-1} +# +OPTIMIZER=${5-adam} +#GPU +GPUS_PER_NODE=${6:-8} +# +ITER_NUM=${7:-100} +# +PRINT_ITER=${8:-1} +# +NODE_RANK=${9:-0} + +LOG_FOLDER=./log/ + +########################################################################################################## +# FP +########################################################################################################## +echo ${USE_FP16} + + +if [ "$USE_FP16" = 1 ];then + FP_CMD=--use_fp16 + FP_NAME=f16 + echo "USE_FP16" +else + FP_CMD= + FP_NAME=f32 + echo "USE_FP32" +fi +########################################################################################################## +# DEBUG_NAME +########################################################################################################## +# if [ DEBUG_MODE==1 ];then +# DEBUG_NAME=debug +# else +# DEBUG_NAME= +# fi + +########################################################################################################## +# Create folder +########################################################################################################## +#bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug +mkdir -p $LOG_FOLDER + +# OUTFILE=bert_pretraining_${FP_NAME}_${GPUS_PER_NODE}gpu_${BATCH_SIZE}bs_${ITER_NUM}iter_${OPTIMIZER}\ +# _${DEBUG_NAME} +# mkdir -p $OUTFILE + +LOGFILE=$LOG_FOLDER/bert_pretraining_${FP_NAME}_${GPUS_PER_NODE}gpu_${BATCH_SIZE}bs_${ITER_NUM}iter_${OPTIMIZER}\ +_${DEBUG_NAME}.log + +MODEL_DIR=./snapshots/ +# DATA_DIR=/DATA/disk1/bert/wiki_seq_len_128/ +DATA_DIR=/data/bert_dataset + +MEM_FILE=$LOG_FOLDER/memory.log + +echo LOGFILE=$LOGFILE +echo DATA_DIR=$DATA_DIR + +#${NNODES}n${GPUS_PER_NODE}g_dp${D_P}_mp${M_P}_pp${P_P}_mbs${MICRO_BATCH_SIZE}_gbs${GLOABAL_BATCH_SIZE}_pretrain_${NODE_RANK}.log +rm -rf ${MODEL_DIR}/* +rm -rf ${LOG_FOLDER}/* + +NVPROF=baseline-report_${NODE_RANK} +#-o ${NVPROF} + + +# -g $GPUS_PER_NODE \ +# -n 0.5 \ + + +#nsys profile --stats=true -o ${NVPROF} \ +python3 run_pretraining.py \ + --gpu_num_per_node=${GPUS_PER_NODE} \ + --num_nodes=1 \ + --learning_rate=1e-4 \ + --warmup_proportion=0.01 \ + --weight_decay_rate=0.01 \ + --batch_size_per_device=${BATCH_SIZE} \ + --iter_num=${ITER_NUM} \ + --loss_print_every_n_iter=${PRINT_ITER} \ + --seq_length=128 \ + --max_predictions_per_seq=20 \ + --num_hidden_layers=12 \ + --num_attention_heads=12 \ + --num_accumulation_steps=${ACCUMULATION_STEMPS} \ + --max_position_embeddings=512 \ + --type_vocab_size=2 \ + --vocab_size=30522 \ + --attention_probs_dropout_prob=0 \ + --hidden_dropout_prob=0 \ + --hidden_size_per_head=64 \ + --data_part_num=64 \ + 
--data_dir=$DATA_DIR \ + --log_dir=${LOG_FOLDER} \ + --model_save_every_n_iter=10000 \ + --save_last_snapshot=True \ + --model_save_dir=./snapshots \ + --debug=${DEBUG_MODE} \ + --data_load_random=0 \ + --model_load=/opt/initial_model \ + ${FP_CMD} \ + --optimizer_type=${OPTIMIZER} \ + 2>&1 | tee ${LOGFILE} + +echo "Writting log to ${LOGFILE}" + +SQLITE=$LOG_FOLDER/bert_pretraining_${GPUS_PER_NODE}gpu_${BATCH_SIZE}bs_${ITER_NUM}iter.sqlite +QDREP=$LOG_FOLDER/bert_pretraining_${GPUS_PER_NODE}gpu_${BATCH_SIZE}bs_${ITER_NUM}iter.qdrep + +# mv $NVPROF.sqlite $SQLITE +# mv $NVPROF.qdrep $QDREP + +json_file=${LOG_FOLDER}out.json +python analysis.py \ + --log_file=$LOGFILE \ + --mem_file=$MEM_FILE \ + --out_file=$json_file \ + --gpu_num=$GPUS_PER_NODE + +# --use_fp16 \ + diff --git a/LanguageModeling/BERT/train_perbert_list.sh b/LanguageModeling/BERT/train_perbert_list.sh new file mode 100644 index 0000000..6dc1fcb --- /dev/null +++ b/LanguageModeling/BERT/train_perbert_list.sh @@ -0,0 +1,281 @@ +NUM=${2-1} + +file_op() +{ + mkdir -p $1 + mv -f log_f* $1 + + # tar -zcvf $1.tar.gz $1 + # rm -rf $1 +} +################################################################################# +rm -rf out +rm -rf pic +mkdir pic + + +# ############################################################################### +# # f32 adam +# ############################################################################### + +# for ((i = 1; i <= ${NUM}; i++ )) +# do +# echo $i +# sh train_perbert.sh 0 0 64 1 adam +# cp -rf log/ log_f32_${i} +# done + +# file_op out/bert_f32_pretraining_8gpu_64bs_100iter + +# # mkdir bert_f32_pretraining_8gpu_64bs_100iter +# # mv -f log_f32_* bert_f32_pretraining_8gpu_64bs_100iter + +# # tar -zcvf bert_f32_pretraining_8gpu_64bs_100iter.tar.gz \ +# # bert_f32_pretraining_8gpu_64bs_100iter +# # rm -rf bert_f32_pretraining_8gpu_64bs_100iter +# ############################################################################### +# # f32 lamb +# ############################################################################### + +# for (( i = 1; i <= ${NUM}; i++ )) +# do +# echo $i +# sh train_perbert.sh 0 0 64 1 lamb +# cp -rf log/ log_f32_${i} +# done + +# file_op out/bert_f32_pretraining_8gpu_64bs_100iter_lamb + +# # ############################################################################### +# # # f16 adam +# # ############################################################################### + +# for (( i = 1; i <= ${NUM}; i++ )) +# do +# echo $i +# sh train_perbert.sh 1 0 64 1 adam +# cp -rf log/ log_f16_${i} +# done + +# file_op out/bert_f16_pretraining_8gpu_64bs_100iter + +# ############################################################################### +# # f16 lamb +# ############################################################################### + +# for (( i = 1; i <= ${NUM}; i++ )) +# do +# echo $i +# sh train_perbert.sh 1 0 64 1 lamb +# cp -rf log/ log_f16_${i} +# done + +# file_op out/bert_f16_pretraining_8gpu_64bs_100iter_lamb + +# ############################################################################### +# # f32 accumulation adam +# ############################################################################### + +# for (( i = 1; i <= ${NUM}; i++ )) +# do +# echo $i +# sh train_perbert.sh 0 0 32 2 adam +# cp -rf log/ log_f32_${i} +# done + +# file_op out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation + +# ############################################################################### +# # f32 accumulation lamb +# 
############################################################################### + +# for (( i = 1; i <= ${NUM}; i++ )) +# do +# echo $i +# sh train_perbert.sh 0 0 32 2 lamb +# cp -rf log/ log_f32_${i} +# done + +# file_op out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb + +# ############################################################################### +# # f16 accumulation adam +# ############################################################################### + +# for (( i = 1; i <= ${NUM}; i++ )) +# do +# echo $i +# sh train_perbert.sh 1 0 32 2 adam +# cp -rf log/ log_f16_${i} +# done + +# file_op out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation + +# ############################################################################### +# # f16 accumulation lamb +# ############################################################################### + +# for (( i = 1; i <= ${NUM}; i++ )) +# do +# echo $i +# sh train_perbert.sh 1 0 32 2 lamb +# cp -rf log/ log_f16_${i} +# done + +# file_op out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb + + +############################################################################### +# f32 adam debug +############################################################################### + +for (( i = 1; i <= ${NUM}; i++ )) +do + echo $i + sh train_perbert.sh 0 1 64 1 adam + cp -rf log/ log_f32_${i} +done + +file_op out/bert_f32_pretraining_8gpu_64bs_100iter_debug + +python result_analysis.py --f32=1 \ + --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_debug/log_f32_1/out.json \ + --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_debug/log_f32_1/out.json \ + --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_debug.png +############################################################################### +# f32 lamb debug +############################################################################### + +for (( i = 1; i <= ${NUM}; i++ )) +do + echo $i + sh train_perbert.sh 0 1 64 1 lamb + cp -rf log/ log_f32_${i} +done + +file_op out/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug + +python result_analysis.py --f32=1 \ + --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug/log_f32_1/out.json \ + --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug/log_f32_1/out.json \ + --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug.png +############################################################################### +# f16 adam debug +############################################################################### + +for (( i = 1; i <= ${NUM}; i++ )) +do + echo $i + sh train_perbert.sh 1 1 64 1 adam + cp -rf log/ log_f16_${i} +done +file_op out/bert_f16_pretraining_8gpu_64bs_100iter_debug + +python result_analysis.py --f32=0 \ + --cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_debug/log_f16_1/out.json \ + --cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_debug/log_f16_1/out.json \ + --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_debug.png +############################################################################### +# f16 lamb debug +############################################################################### + +for (( i = 1; i <= ${NUM}; i++ )) +do + echo $i + sh train_perbert.sh 1 1 64 1 lamb + cp -rf log/ log_f16_${i} +done + +file_op out/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug + +python result_analysis.py --f32=0 \ + --cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug/log_f16_1/out.json \ + 
--cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug/log_f16_1/out.json \ + --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug.png +############################################################################### +# f32 accumulation adam debug +############################################################################### + +for (( i = 1; i <= ${NUM}; i++ )) +do + echo $i + sh train_perbert.sh 0 1 32 2 adam + cp -rf log/ log_f32_${i} + +done + +file_op out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug + +python result_analysis.py --f32=1 \ + --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f32_1/out.json \ + --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f32_1/out.json \ + --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug.png + +############################################################################### +# f32 accumulation lamb debug +############################################################################### + +for (( i = 1; i <= ${NUM}; i++ )) +do + echo $i + sh train_perbert.sh 0 1 32 2 lamb + cp -rf log/ log_f32_${i} +done + +file_op out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug + +python result_analysis.py --f32=1 \ + --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f32_1/out.json \ + --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f32_1/out.json \ + --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug.png + +############################################################################### +# f16 accumulation adam debug +############################################################################### + +for (( i = 1; i <= ${NUM}; i++ )) +do + echo $i + sh train_perbert.sh 1 1 32 2 adam + cp -rf log/ log_f16_${i} +done + +file_op out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug + +python result_analysis.py --f32=0 \ + --cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f16_1/out.json \ + --cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f16_1/out.json \ + --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug.png +############################################################################### +# f16 accumulation lamb +############################################################################### + +for (( i = 1; i <= ${NUM}; i++ )) +do + echo $i + sh train_perbert.sh 1 1 32 2 lamb + cp -rf log/ log_f16_${i} +done + +file_op out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug + +python result_analysis.py --f32=0 \ + --cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f16_1/out.json \ + --cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f16_1/out.json \ + --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug.png +# ############################################################################## +# tar +# ############################################################################## + +tar -zcvf out.tar.gz out + +python stitching_pic.py --dir=pic --out_file=./pic/all.png +# rm -rf out +############################################################################### +# upload +############################################################################### + + + + + diff --git a/LanguageModeling/BERT/util.py b/LanguageModeling/BERT/util.py index f6ff7d1..c32d380 100755 --- 
a/LanguageModeling/BERT/util.py +++ b/LanguageModeling/BERT/util.py @@ -21,6 +21,7 @@ import pandas as pd from datetime import datetime import oneflow as flow +import subprocess def InitNodes(args): @@ -118,6 +119,9 @@ def metric_cb(self, step=0, **kwargs): def callback(outputs): if step == 0: self._clear() + if step == 1: + print(subprocess.check_output("nvidia-smi --query-gpu=memory.used --format=csv ", shell=True)) + for key in self.keys: self.metric_dict[key] += outputs[key].sum() self.metric_dict['n_' + key] += outputs[key].size From 728dd68603dcb01adbb1329ad74395870d4c260a Mon Sep 17 00:00:00 2001 From: aishangjj <702572275@qq.com> Date: Thu, 20 May 2021 12:08:48 +0800 Subject: [PATCH 10/23] Organize files and move them to the tools directory --- LanguageModeling/BERT/tools/README.md | 12 ++ LanguageModeling/BERT/tools/analysis.py | 63 ++++++ .../BERT/tools/gpu_memory_usage.py | 63 ++++++ .../BERT/tools/result_analysis.py | 134 ++++++++++++ LanguageModeling/BERT/tools/stitching_pic.py | 40 ++++++ 5 files changed, 312 insertions(+) create mode 100644 LanguageModeling/BERT/tools/README.md create mode 100644 LanguageModeling/BERT/tools/analysis.py create mode 100644 LanguageModeling/BERT/tools/gpu_memory_usage.py create mode 100644 LanguageModeling/BERT/tools/result_analysis.py create mode 100644 LanguageModeling/BERT/tools/stitching_pic.py diff --git a/LanguageModeling/BERT/tools/README.md b/LanguageModeling/BERT/tools/README.md new file mode 100644 index 0000000..3a9baa7 --- /dev/null +++ b/LanguageModeling/BERT/tools/README.md @@ -0,0 +1,12 @@ +## OneFlow BERT automatic test tools +Automatically execute the BERT model with different parameters and analyze the results +### Dependencies +- matplotlib + ``` + pip install matplotlib + ``` + +### Features +- `analysis.py`: analyze the log file to get the total loss, mlm loss, nsp loss, throughput and GPU memory +- `result_analysis.py`: compare the running results of two versions and output a report +- `stitching_pic.py`: splice multiple pictures together diff --git a/LanguageModeling/BERT/tools/analysis.py b/LanguageModeling/BERT/tools/analysis.py new file mode 100644 index 0000000..c25b342 --- /dev/null +++ b/LanguageModeling/BERT/tools/analysis.py @@ -0,0 +1,63 @@ +import argparse +import re +import json + +from ctypes import * + + + +def collect_loss(log_file, gpu_num): + print("loss : ",log_file) + + f = open(log_file,"r") + lines = f.readlines() + total_loss = [] + mlm_loss = [] + nsp_loss = [] + throughput =[] + memory=[] + + pattern = re.compile(r"step:\s*(\d+)\s*,\s*total_loss:\s*(\d+\.?\d+)\s*,\s*mlm_loss:\s*(\d+\.?\d+)\s*,\s*nsp_loss:\s*(\d+\.?\d+)\s*,\s*throughput:\s*(\d+\.?\d+)\s*") + for line in lines: + if(line.split(':')[0] == 'step'): + + match = pattern.match(line) + if match: + total_loss.append(match.group(2)) + mlm_loss.append(match.group(3)) + nsp_loss.append(match.group(4)) + throughput.append(match.group(5)) + if(line.split(' [MiB]\\n')[0] == 'b\'memory.used'): + str_tmp = line.split(' [MiB]\\n')[1] + + for i in range(gpu_num): + memory.append(str_tmp.split(' MiB\\n')[i]) + + return total_loss, mlm_loss, nsp_loss,throughput, memory + + +def main(): + parser = argparse.ArgumentParser(description="collect GPU device memory usage") + parser.add_argument("--log_file", type=str, default=None) + parser.add_argument("--mem_file", type=str, default=None) + parser.add_argument("--out_file", type=str, default=None) + parser.add_argument("--gpu_num", type=int, default=1) + + args = parser.parse_args() + + 
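+    # NOTE: collect_loss expects training-log lines of the form
+    #   step: <n>, total_loss: <x>, mlm_loss: <x>, nsp_loss: <x>, throughput: <x>
+    # plus the nvidia-smi 'memory.used [MiB]' block that util.py prints at step 1.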
total_loss, mlm_loss, nsp_loss,throughput, memory = collect_loss(args.log_file, args.gpu_num) + + out={} + out['total_loss'] = total_loss + out['mlm_loss'] = mlm_loss + out['nsp_loss'] = nsp_loss + out['throughput'] = throughput + out['memory'] = memory + + string = json.dumps(out) + with open(args.out_file,'w')as f: + f.write(string) + + +if __name__ == "__main__": + main() diff --git a/LanguageModeling/BERT/tools/gpu_memory_usage.py b/LanguageModeling/BERT/tools/gpu_memory_usage.py new file mode 100644 index 0000000..6f3daa8 --- /dev/null +++ b/LanguageModeling/BERT/tools/gpu_memory_usage.py @@ -0,0 +1,63 @@ +import time +import argparse +import pynvml + + +class Device(object): + class Status: + INIT = "INIT" + DETECTING = "DETECTING" + STOP = "STOP" + + start_detecting_mem_threshold = 32 * 1024 * 1024 + + def __init__(self, handle): + self.handle = handle + self.status = self.Status.INIT + self.max_mem_usage = 0 + + def update(self): + info = pynvml.nvmlDeviceGetMemoryInfo(self.handle) + if self.status == self.Status.INIT: + if info.used > self.start_detecting_mem_threshold: + self.status = self.Status.DETECTING + elif self.status == self.Status.DETECTING: + if info.used < self.start_detecting_mem_threshold: + self.status = self.Status.STOP + return False + else: + self.max_mem_usage = max(self.max_mem_usage, info.used) + elif self.status == self.Status.STOP: + raise ValueError("detecting is stop") + else: + raise ValueError("invalid status") + + return True + + +def main(): + parser = argparse.ArgumentParser(description="collect GPU device memory usage") + parser.add_argument("-g", type=int, default=1, help="number of gpu devices") + parser.add_argument("-n", type=float, default=1, help="metrics rate") + args = parser.parse_args() + + pynvml.nvmlInit() + n_gpus = args.g + devices = [Device(pynvml.nvmlDeviceGetHandleByIndex(i)) for i in range(n_gpus)] + + running = True + while running: + time.sleep(args.n) + running = False + for device in devices: + running |= device.update() + + pynvml.nvmlShutdown() + for i, device in enumerate(devices): + max_mem_usage_mbytes = device.max_mem_usage / 1024 / 1024 + print(f"{max_mem_usage_mbytes:.2f}") + + +if __name__ == "__main__": + main() + diff --git a/LanguageModeling/BERT/tools/result_analysis.py b/LanguageModeling/BERT/tools/result_analysis.py new file mode 100644 index 0000000..fe8b988 --- /dev/null +++ b/LanguageModeling/BERT/tools/result_analysis.py @@ -0,0 +1,134 @@ +import json +import matplotlib.pyplot as plt +import argparse +import numpy as np + +def read_file(file1, file2): + + with open(file1,'r') as load_f: + dict1 = json.load(load_f) + + with open(file2,'r') as load_f: + dict2 = json.load(load_f) + + + return dict1, dict2 + +def analysis_loss(y3, f32): + + # Calculate the correlation coefficient + + # if f32 == 1: + # iter = len(y3) + + # else: + # len = y3.shape[0] + # min = y3.min() + # max = y3.max() + # mean = np.mean(y3) + # var = np.var(y3) + + # random = np.random.randn(len)/10000.0 + # print(random) + # print(y3) + # lst=[y3,random] + # res=np.corrcoef(lst) + # print('----->', res) + + len = y3.shape[0] + + if f32 == 1: + iter = np.count_nonzero(y3==0) + tmp = iter/len + print('count zeor = ', iter) + if iter/len > 0.6: + print('Test passed') + return 1 + else: + print('Test failed') + return 0 + else: + + mean = np.mean(y3) + var = np.var(y3) + print('F16---->', abs(mean), var) + + if abs(mean) < 0.001 and var < 0.00001: + print('Test passed') + return 1 + else: + print('Test failed') + return 0 + +def 
drawing_loss(dict1, dict2, f32, image): + + + m1 = dict1["memory"] + m2 = dict2["memory"] + + print(m1) + print(m2) + table_len = len(m1) + row_labels = ['old','new'] + table_vals = [m1,m2] + + + y1 = dict1["total_loss"] + y2 = dict2["total_loss"] + y1=list(map(float,y1)) + y2=list(map(float,y2)) + x = np.arange(1, len(y1)+1) + y1 = np.array(y1) + y2 = np.array(y2) + y3 = np.subtract(y1,y2) + + result = analysis_loss(y3, f32) + + print(x) + print(y1) + print(y2) + print(y3) + fig = plt.figure(figsize=(24,8), dpi=150) + plt.figure(1) + ax = fig.add_subplot() + + + ax1 = plt.subplot(121) + plt.xlabel('iterations') + plt.plot(x,y1,color='red',label='Diachronic version') + plt.plot(x,y2,color='blue',label='Current version') + plt.title('Loss comparison') + + plt.legend(loc='best') + + ax2 = plt.subplot(122) + plt.xlabel('iterations') + plt.plot(x,y3,color='red') + plt.title('Loss difference') + plt.table(cellText=table_vals, rowLabels=row_labels, colWidths=[0.05]*table_len, loc='best') + + plt.suptitle(image.split('/')[1].split('.')[0],fontsize=20,x=0.5,y=0.98) + + if result == 1: + plt.text(0.9, 1,'PASS', fontsize=50, color='blue', transform=ax.transAxes) + else: + plt.text(0.9, 1,'FAILED',fontsize=50,color='red',transform=ax.transAxes) + plt.savefig(image) + + +def main(): + print('test') + parser = argparse.ArgumentParser(description="Compare and analyze training results and output icons") + parser.add_argument("--cmp1_file", type=str, default=None) + parser.add_argument("--cmp2_file", type=str, default=None) + parser.add_argument("--out_file", type=str, default=None) + parser.add_argument("--f32", type=int, default=0) + args = parser.parse_args() + + dict1, dict2 = read_file(args.cmp1_file, args.cmp2_file) + + drawing_loss(dict1, dict2, args.f32, args.out_file) + + +if __name__ == "__main__": + main() diff --git a/LanguageModeling/BERT/tools/stitching_pic.py b/LanguageModeling/BERT/tools/stitching_pic.py new file mode 100644 index 0000000..c39f8f9 --- /dev/null +++ b/LanguageModeling/BERT/tools/stitching_pic.py @@ -0,0 +1,40 @@ +import matplotlib.pyplot as plt +import argparse +import os +from PIL import Image + +def stiiching_pic(dir, out_file): + + hight = 1200 + width = 3600 + file_lsit = os.listdir(dir) + target = Image.new('RGBA', (width, hight*len(file_lsit))) + left = 0 + right = hight + for file in file_lsit: + + tmp = dir+'/'+file + print(tmp) + image = Image.open(tmp) + # print(image) + # print(target) + target.paste(image, (0, left, width, right)) + left += hight + right += hight + target.save(out_file) + + +def main(): + print('test') + parser = argparse.ArgumentParser(description="Stitching pictures") + parser.add_argument("--dir", type=str, default=None) + parser.add_argument("--out_file", type=str, default=None) + args = parser.parse_args() + + stiiching_pic(args.dir, args.out_file) + + +if __name__ == "__main__": + main() + + \ No newline at end of file From 7f9827216882ac6cbce03b667aa9779fe43e6f05 Mon Sep 17 00:00:00 2001 From: aishangjj <702572275@qq.com> Date: Thu, 20 May 2021 20:17:42 +0800 Subject: [PATCH 11/23] Modify path --- LanguageModeling/BERT/analysis.py | 72 ---------- LanguageModeling/BERT/gpu_memory_usage.py | 62 --------- LanguageModeling/BERT/result_analysis.py | 144 -------------------- LanguageModeling/BERT/stitching_pic.py | 38 ------ LanguageModeling/BERT/train_perbert.sh | 2 +- LanguageModeling/BERT/train_perbert_list.sh | 18 +-- LanguageModeling/BERT/util.py | 2 +- 7 files changed, 11 insertions(+), 327 deletions(-) delete mode 100644 
LanguageModeling/BERT/analysis.py delete mode 100644 LanguageModeling/BERT/gpu_memory_usage.py delete mode 100644 LanguageModeling/BERT/result_analysis.py delete mode 100644 LanguageModeling/BERT/stitching_pic.py diff --git a/LanguageModeling/BERT/analysis.py b/LanguageModeling/BERT/analysis.py deleted file mode 100644 index 012d159..0000000 --- a/LanguageModeling/BERT/analysis.py +++ /dev/null @@ -1,72 +0,0 @@ -import argparse -import re -import json - -from ctypes import * - - - -def collect_loss(log_file, gpu_num): - print("loss : ",log_file) - - f = open(log_file,"r") - lines = f.readlines()#读取全部内容 - total_loss = [] - mlm_loss = [] - nsp_loss = [] - throughput =[] - memory=[] - - pattern = re.compile(r"step:\s*(\d+)\s*,\s*total_loss:\s*(\d+\.?\d+)\s*,\s*mlm_loss:\s*(\d+\.?\d+)\s*,\s*nsp_loss:\s*(\d+\.?\d+)\s*,\s*throughput:\s*(\d+\.?\d+)\s*") - for line in lines: - if(line.split(':')[0] == 'step'): - # print(line) - - match = pattern.match(line) - if match: - # print(match.groups()) - total_loss.append(match.group(2)) - mlm_loss.append(match.group(3)) - nsp_loss.append(match.group(4)) - throughput.append(match.group(5)) - if(line.split(' [MiB]\\n')[0] == 'b\'memory.used'): - str_tmp = line.split(' [MiB]\\n')[1] - # print(str_tmp) - for i in range(gpu_num): - memory.append(str_tmp.split(' MiB\\n')[i]) - - return total_loss, mlm_loss, nsp_loss,throughput, memory - - -def main(): - parser = argparse.ArgumentParser(description="collect GPU device memory usage") - parser.add_argument("--log_file", type=str, default=None) - parser.add_argument("--mem_file", type=str, default=None) - parser.add_argument("--out_file", type=str, default=None) - parser.add_argument("--gpu_num", type=int, default=1) - - args = parser.parse_args() - - total_loss, mlm_loss, nsp_loss,throughput, memory = collect_loss(args.log_file, args.gpu_num) - # print(total_loss) - # print(mlm_loss) - # print(nsp_loss) - # print(throughput) - out={} - out['total_loss'] = total_loss - out['mlm_loss'] = mlm_loss - out['nsp_loss'] = nsp_loss - out['throughput'] = throughput - out['memory'] = memory - # print(out) - - string = json.dumps(out) - with open(args.out_file,'w')as f: - f.write(string) - - -if __name__ == "__main__": - # libc = CDLL("libc.so.6") - # msg = "Hello, world!\n" - # libc.printf("Testing: %s \n", msg) - main() diff --git a/LanguageModeling/BERT/gpu_memory_usage.py b/LanguageModeling/BERT/gpu_memory_usage.py deleted file mode 100644 index cf2e3f9..0000000 --- a/LanguageModeling/BERT/gpu_memory_usage.py +++ /dev/null @@ -1,62 +0,0 @@ -import time -import argparse -import pynvml - - -class Device(object): - class Status: - INIT = "INIT" - DETECTING = "DETECTING" - STOP = "STOP" - - start_detecting_mem_threshold = 32 * 1024 * 1024 - - def __init__(self, handle): - self.handle = handle - self.status = self.Status.INIT - self.max_mem_usage = 0 - - def update(self): - info = pynvml.nvmlDeviceGetMemoryInfo(self.handle) - if self.status == self.Status.INIT: - if info.used > self.start_detecting_mem_threshold: - self.status = self.Status.DETECTING - elif self.status == self.Status.DETECTING: - if info.used < self.start_detecting_mem_threshold: - self.status = self.Status.STOP - return False - else: - self.max_mem_usage = max(self.max_mem_usage, info.used) - elif self.status == self.Status.STOP: - raise ValueError("detecting is stop") - else: - raise ValueError("invalid status") - - return True - - -def main(): - parser = argparse.ArgumentParser(description="collect GPU device memory usage") - parser.add_argument("-g", 
type=int, default=1, help="number of gpu devices") - parser.add_argument("-n", type=float, default=1, help="metrics rate") - args = parser.parse_args() - - pynvml.nvmlInit() - n_gpus = args.g - devices = [Device(pynvml.nvmlDeviceGetHandleByIndex(i)) for i in range(n_gpus)] - - running = True - while running: - time.sleep(args.n) - running = False - for device in devices: - running |= device.update() - - pynvml.nvmlShutdown() - for i, device in enumerate(devices): - max_mem_usage_mbytes = device.max_mem_usage / 1024 / 1024 - print(f"{max_mem_usage_mbytes:.2f}") - - -if __name__ == "__main__": - main() diff --git a/LanguageModeling/BERT/result_analysis.py b/LanguageModeling/BERT/result_analysis.py deleted file mode 100644 index 5f0b8f3..0000000 --- a/LanguageModeling/BERT/result_analysis.py +++ /dev/null @@ -1,144 +0,0 @@ -import json -import matplotlib.pyplot as plt -import argparse -import numpy as np - -def read_file(file1, file2): - - with open(file1,'r') as load_f: - dict1 = json.load(load_f) - - with open(file2,'r') as load_f: - dict2 = json.load(load_f) - - - return dict1, dict2 - -def analysis_loss(y3, f32): - # if f32 == 1: - # iter = len(y3) - - # else: - # len = y3.shape[0] - # min = y3.min() - # max = y3.max() - # mean = np.mean(y3) - # var = np.var(y3) - - # random = np.random.randn(len)/10000.0 - # print(random) - # print(y3) - # lst=[y3,random] - # res=np.corrcoef(lst) - # print('----->', res) - - len = y3.shape[0] - - if f32 == 1: - iter = np.count_nonzero(y3==0) - tmp = iter/len - print('count zeor = ', iter) - if iter/len > 0.6: - print('Test passed') - return 1 - else: - print('Test failed') - return 0 - else: - - mean = np.mean(y3) - var = np.var(y3) - print('F16---->', abs(mean), var) - - if abs(mean) < 0.001 and var < 0.00001: - print('Test passed') - return 1 - else: - print('Test failed') - return 0 - -def drawing_loss(dict1, dict2, f32, image): - - - m1 = dict1["memory"] - m2 = dict2["memory"] - - print(m1) - print(m2) - table_len = len(m1) - row_labels = ['old','new'] - table_vals = [m1,m2] - - - y1 = dict1["total_loss"] - y2 = dict2["total_loss"] - y1=list(map(float,y1)) - y2=list(map(float,y2)) - x = np.arange(1, len(y1)+1) - y1 = np.array(y1) - y2 = np.array(y2) - y3 = np.subtract(y1,y2) - # v =list(map(lambda x,y:x - y)) - - result = analysis_loss(y3, f32) - - print(x) - print(y1) - print(y2) - print(y3) - fig = plt.figure(figsize=(24,8), dpi=150) - plt.figure(1) - ax = fig.add_subplot() - - - ax1 = plt.subplot(121) - plt.xlabel('iterations') - plt.plot(x,y1,color='red',label='Diachronic version') - plt.plot(x,y2,color='blue',label='Current version') - plt.title('Loss comparison') - - plt.legend(loc='best') - - ax2 = plt.subplot(122) - plt.xlabel('iterations') - plt.plot(x,y3,color='red') - plt.title('Loss difference') - plt.table(cellText=table_vals, rowLabels=row_labels, colWidths=[0.05]*table_len, loc='best') - - plt.suptitle(image.split('/')[1].split('.')[0],fontsize=20,x=0.5,y=0.98) - - if result == 1: - plt.text(0.9, 1,'PASS', fontsize=50, color='blue', transform=ax.transAxes) - else: - plt.text(0.9, 1,'FAILED',fontsize=50,color='red',transform=ax.transAxes) - plt.savefig(image) - - -# def analysis_f32(dict1, dict2): -# return 1 -# def analysis_f16(dict1, dict2): -# return 1 - -def main(): - print('test') - parser = argparse.ArgumentParser(description="Compare and analyze training results and output icons") - parser.add_argument("--cmp1_file", type=str, default=None) - parser.add_argument("--cmp2_file", type=str, default=None) - 
parser.add_argument("--out_file", type=str, default=None) - parser.add_argument("--f32", type=int, default=0) - args = parser.parse_args() - - # print('---------------------') - dict1, dict2 = read_file(args.cmp1_file, args.cmp2_file) - - # if args.f32 == 1: - # result = analysis_f32(dict1, dict2) - # else: - # result = analysis_f16(dict1, dict2) - - - drawing_loss(dict1, dict2, args.f32, args.out_file) - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/LanguageModeling/BERT/stitching_pic.py b/LanguageModeling/BERT/stitching_pic.py deleted file mode 100644 index f2dbb2b..0000000 --- a/LanguageModeling/BERT/stitching_pic.py +++ /dev/null @@ -1,38 +0,0 @@ -import matplotlib.pyplot as plt -import argparse -import os -from PIL import Image - -def stiiching_pic(dir, out_file): - - hight = 1200 - width = 3600 - file_lsit = os.listdir(dir) - target = Image.new('RGBA', (width, hight*len(file_lsit))) - left = 0 - right = hight - for file in file_lsit: - - tmp = dir+'/'+file - print(tmp) - image = Image.open(tmp) - # print(image) - # print(target) - target.paste(image, (0, left, width, right)) - left += hight - right += hight - target.save(out_file) - - -def main(): - print('test') - parser = argparse.ArgumentParser(description="Stitching pictures") - parser.add_argument("--dir", type=str, default=None) - parser.add_argument("--out_file", type=str, default=None) - args = parser.parse_args() - - stiiching_pic(args.dir, args.out_file) - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/LanguageModeling/BERT/train_perbert.sh b/LanguageModeling/BERT/train_perbert.sh index d662ecb..d5a80a5 100644 --- a/LanguageModeling/BERT/train_perbert.sh +++ b/LanguageModeling/BERT/train_perbert.sh @@ -119,7 +119,7 @@ QDREP=$LOG_FOLDER/bert_pretraining_${GPUS_PER_NODE}gpu_${BATCH_SIZE}bs_${ITER_NU # mv $NVPROF.qdrep $QDREP json_file=${LOG_FOLDER}out.json -python analysis.py \ +python tools/analysis.py \ --log_file=$LOGFILE \ --mem_file=$MEM_FILE \ --out_file=$json_file \ diff --git a/LanguageModeling/BERT/train_perbert_list.sh b/LanguageModeling/BERT/train_perbert_list.sh index 6dc1fcb..4417b4a 100644 --- a/LanguageModeling/BERT/train_perbert_list.sh +++ b/LanguageModeling/BERT/train_perbert_list.sh @@ -138,7 +138,7 @@ done file_op out/bert_f32_pretraining_8gpu_64bs_100iter_debug -python result_analysis.py --f32=1 \ +python tools/result_analysis.py --f32=1 \ --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_debug/log_f32_1/out.json \ --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_debug/log_f32_1/out.json \ --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_debug.png @@ -155,7 +155,7 @@ done file_op out/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug -python result_analysis.py --f32=1 \ +python tools/result_analysis.py --f32=1 \ --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug/log_f32_1/out.json \ --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug/log_f32_1/out.json \ --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug.png @@ -171,7 +171,7 @@ do done file_op out/bert_f16_pretraining_8gpu_64bs_100iter_debug -python result_analysis.py --f32=0 \ +python tools/result_analysis.py --f32=0 \ --cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_debug/log_f16_1/out.json \ --cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_debug/log_f16_1/out.json \ --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_debug.png @@ -188,7 +188,7 @@ done file_op out/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug -python 
result_analysis.py --f32=0 \ +python tools/result_analysis.py --f32=0 \ --cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug/log_f16_1/out.json \ --cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug/log_f16_1/out.json \ --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug.png @@ -206,7 +206,7 @@ done file_op out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug -python result_analysis.py --f32=1 \ +python tools/result_analysis.py --f32=1 \ --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f32_1/out.json \ --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f32_1/out.json \ --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug.png @@ -224,7 +224,7 @@ done file_op out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug -python result_analysis.py --f32=1 \ +python tools/result_analysis.py --f32=1 \ --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f32_1/out.json \ --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f32_1/out.json \ --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug.png @@ -242,7 +242,7 @@ done file_op out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug -python result_analysis.py --f32=0 \ +python tools/result_analysis.py --f32=0 \ --cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f16_1/out.json \ --cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f16_1/out.json \ --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug.png @@ -259,7 +259,7 @@ done file_op out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug -python result_analysis.py --f32=0 \ +python tools/result_analysis.py --f32=0 \ --cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f16_1/out.json \ --cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f16_1/out.json \ --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug.png @@ -269,7 +269,7 @@ python result_analysis.py --f32=0 \ tar -zcvf out.tar.gz out -python stitching_pic.py --dir=pic --out_file=./pic/all.png +python tools/stitching_pic.py --dir=pic --out_file=./pic/all.png # rm -rf out ############################################################################### # upload diff --git a/LanguageModeling/BERT/util.py b/LanguageModeling/BERT/util.py index c32d380..bb525c5 100755 --- a/LanguageModeling/BERT/util.py +++ b/LanguageModeling/BERT/util.py @@ -137,7 +137,7 @@ def callback(outputs): for key in self.keys: value = self.metric_dict[key] / self.metric_dict['n_' + key] self.update_and_save(key, value, step, **kwargs) - print(', '.join(('{}: {}' if type(v) is int else '{}: {:.3f}').format(k, v) \ + print(', '.join(('{}: {}' if type(v) is int else '{}: {}').format(k, v) \ for k, v in self.metric_dict.items()), time.time()) self._clear() From e0093dd9a683eaa47e3eceb1d3f1277af51c4326 Mon Sep 17 00:00:00 2001 From: luqiang-guo <702572275@qq.com> Date: Mon, 31 May 2021 11:17:57 +0800 Subject: [PATCH 12/23] conda environment multi-machine automatic car market --- LanguageModeling/BERT/train_perbert.sh | 45 +++++++++++++-------- LanguageModeling/BERT/train_perbert_list.sh | 7 ++-- 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/LanguageModeling/BERT/train_perbert.sh b/LanguageModeling/BERT/train_perbert.sh index d5a80a5..21a3b2e 100644 --- a/LanguageModeling/BERT/train_perbert.sh +++ 
b/LanguageModeling/BERT/train_perbert.sh @@ -10,14 +10,24 @@ OPTIMIZER=${5-adam} #GPU GPUS_PER_NODE=${6:-8} # -ITER_NUM=${7:-100} +NNODES=${7:-1} # -PRINT_ITER=${8:-1} -# -NODE_RANK=${9:-0} +MASTER=${8:-0} + +PYTHON=${9:-python} LOG_FOLDER=./log/ +PRINT_ITER=1 +ITER_NUM=100 + +NODE_IPS='10.10.0.2','10.10.0.3','10.10.0.4','10.10.0.5' + +# INIT_MODEL=/opt/initial_model +INIT_MODEL=/data/bert/initial_model/ +#DATA_DIR=/data/bert_dataset +DATA_DIR=/data/bert/wiki_seq_len_128/ + ########################################################################################################## # FP ########################################################################################################## @@ -57,7 +67,6 @@ _${DEBUG_NAME}.log MODEL_DIR=./snapshots/ # DATA_DIR=/DATA/disk1/bert/wiki_seq_len_128/ -DATA_DIR=/data/bert_dataset MEM_FILE=$LOG_FOLDER/memory.log @@ -76,10 +85,14 @@ NVPROF=baseline-report_${NODE_RANK} # -n 0.5 \ +export NCCL_BUG=INFO + #nsys profile --stats=true -o ${NVPROF} \ -python3 run_pretraining.py \ + +$PYTHON run_pretraining.py \ --gpu_num_per_node=${GPUS_PER_NODE} \ - --num_nodes=1 \ + --num_nodes=${NNODES} \ + --node_ips=$NODE_IPS \ --learning_rate=1e-4 \ --warmup_proportion=0.01 \ --weight_decay_rate=0.01 \ @@ -105,7 +118,7 @@ python3 run_pretraining.py \ --model_save_dir=./snapshots \ --debug=${DEBUG_MODE} \ --data_load_random=0 \ - --model_load=/opt/initial_model \ + --model_load=${INIT_MODEL} \ ${FP_CMD} \ --optimizer_type=${OPTIMIZER} \ 2>&1 | tee ${LOGFILE} @@ -118,12 +131,12 @@ QDREP=$LOG_FOLDER/bert_pretraining_${GPUS_PER_NODE}gpu_${BATCH_SIZE}bs_${ITER_NU # mv $NVPROF.sqlite $SQLITE # mv $NVPROF.qdrep $QDREP -json_file=${LOG_FOLDER}out.json -python tools/analysis.py \ - --log_file=$LOGFILE \ - --mem_file=$MEM_FILE \ - --out_file=$json_file \ - --gpu_num=$GPUS_PER_NODE - -# --use_fp16 \ +if [ "$MASTER" = 1 ]; then + json_file=${LOG_FOLDER}out.json + python tools/analysis.py \ + --log_file=$LOGFILE \ + --mem_file=$MEM_FILE \ + --out_file=$json_file \ + --gpu_num=$GPUS_PER_NODE +fi diff --git a/LanguageModeling/BERT/train_perbert_list.sh b/LanguageModeling/BERT/train_perbert_list.sh index 4417b4a..844227d 100644 --- a/LanguageModeling/BERT/train_perbert_list.sh +++ b/LanguageModeling/BERT/train_perbert_list.sh @@ -9,9 +9,10 @@ file_op() # rm -rf $1 } ################################################################################# -rm -rf out -rm -rf pic -mkdir pic +mkdir -p out +mkdir -p pic +rm -rf out/* +rm -rf pic/* # ############################################################################### From 494da78cf28f79c8f873a03bebb9a176d98b8db4 Mon Sep 17 00:00:00 2001 From: luqiang-guo <702572275@qq.com> Date: Mon, 21 Jun 2021 14:38:49 +0800 Subject: [PATCH 13/23] Add multi-machine bert automated test --- LanguageModeling/BERT/run_pretraining.py | 3 +++ LanguageModeling/BERT/train_perbert.sh | 7 ++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/LanguageModeling/BERT/run_pretraining.py b/LanguageModeling/BERT/run_pretraining.py index 3155317..d8fac37 100755 --- a/LanguageModeling/BERT/run_pretraining.py +++ b/LanguageModeling/BERT/run_pretraining.py @@ -117,6 +117,9 @@ def main(): flow.config.gpu_device_num(args.gpu_num_per_node) flow.env.log_dir(args.log_dir) + flow.config.enable_legacy_model_io() + flow.config.enable_model_io_v2(True) + InitNodes(args) snapshot = Snapshot(args.model_save_dir, args.model_load_dir) diff --git a/LanguageModeling/BERT/train_perbert.sh b/LanguageModeling/BERT/train_perbert.sh index 21a3b2e..dba7c10 100644 --- 
a/LanguageModeling/BERT/train_perbert.sh +++ b/LanguageModeling/BERT/train_perbert.sh @@ -19,7 +19,7 @@ PYTHON=${9:-python} LOG_FOLDER=./log/ PRINT_ITER=1 -ITER_NUM=100 +ITER_NUM=130 NODE_IPS='10.10.0.2','10.10.0.3','10.10.0.4','10.10.0.5' @@ -66,7 +66,6 @@ LOGFILE=$LOG_FOLDER/bert_pretraining_${FP_NAME}_${GPUS_PER_NODE}gpu_${BATCH_SIZE _${DEBUG_NAME}.log MODEL_DIR=./snapshots/ -# DATA_DIR=/DATA/disk1/bert/wiki_seq_len_128/ MEM_FILE=$LOG_FOLDER/memory.log @@ -85,7 +84,9 @@ NVPROF=baseline-report_${NODE_RANK} # -n 0.5 \ -export NCCL_BUG=INFO +export NCCL_DEBUG=INFO +export PYTHONUNBUFFERED=1 + #nsys profile --stats=true -o ${NVPROF} \ From 389af5d4de65308da89aea4df7ca3cfd461d5b82 Mon Sep 17 00:00:00 2001 From: luqiang-guo <702572275@qq.com> Date: Mon, 21 Jun 2021 17:24:42 +0800 Subject: [PATCH 14/23] Add automated test script --- LanguageModeling/BERT/oneflow_auto_bert.sh | 265 +++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100755 LanguageModeling/BERT/oneflow_auto_bert.sh diff --git a/LanguageModeling/BERT/oneflow_auto_bert.sh b/LanguageModeling/BERT/oneflow_auto_bert.sh new file mode 100755 index 0000000..9683289 --- /dev/null +++ b/LanguageModeling/BERT/oneflow_auto_bert.sh @@ -0,0 +1,265 @@ +#!/bin/bash + +BENCH_ROOT=$1 +PYTHON_WHL=$2 +CMP_OLD=$3 + +# PYTHON_WHL=oneflow-0.3.5+cu112.git.325160b-cp38-cp38-linux_x86_64.whl +# CMP_OLD=325160bcfb786b166b063e669aea345fadee2da7 + +ENABLE_FP32=0 +GPU_NUM_PER_NODE=8 +BSZ=64 + +BERT_OSSDIR=oss://oneflow-staging/branch/master/bert/ +PORT=57520 +# PYTHON="/home/guoluqiang/miniconda3/envs/of/bin/python" +PYTHON="python3.8" +DOCKER_USER=root + +multi_machine() +{ + # param 1 node + NUM_NODES=$1 + + # param 2 run cmd + RUN_CMD=$2 + + # param 3 output file + OUTPUT_FILE=$3 + + # param 4 python + PYTHON=$4 + + # param 5 + IS_F32=$5 + + declare -a host_list=("10.10.0.2" "10.10.0.3" "10.10.0.4" "10.10.0.5") + + if [ $NUM_NODES -gt ${#host_list[@]} ] + then + echo num_nodes should be less than or equal to length of host_list. 
+ exit + fi + + hosts=("${host_list[@]:0:${NUM_NODES}}") + echo "Working on hosts:${hosts[@]}" + + ips=${hosts[0]} + for host in "${hosts[@]:1}" + do + ips+=",${host}" + done + + for host in "${hosts[@]:1}" + do + echo "start training on ${host}" + + echo -p $PORT $DOCKER_USER@$host "cd ~/oneflow_temp/OneFlow-Benchmark/LanguageModeling/BERT; \ + nohup $RUN_CMD 0 $PYTHON >/dev/null 2>&1 &" + + ssh -p $PORT $DOCKER_USER@$host "cd ~/oneflow_temp/OneFlow-Benchmark/LanguageModeling/BERT; \ + nohup $RUN_CMD 0 $PYTHON >/dev/null 2>&1 &" + + done + + # copy files to master host and start work + host=${hosts[0]} + echo "start training on ${host}" + + echo $DOCKER_USER@$host "cd ~/oneflow_temp/OneFlow-Benchmark/LanguageModeling/BERT; \ + $RUN_CMD 1 $PYTHON " + ssh -p $PORT $DOCKER_USER@$host "cd ~/oneflow_temp/OneFlow-Benchmark/LanguageModeling/BERT; \ + $RUN_CMD 1 $PYTHON " + + + for host in "${hosts[@]}" + do + echo $DOCKER_USER@$host "cd ~/oneflow_temp/OneFlow-Benchmark/LanguageModeling/BERT; \ + mkdir -p out/${OUTPUT_FILE}; mv -f log out/${OUTPUT_FILE}/log_1 " + ssh -p $PORT $DOCKER_USER@$host "cd ~/oneflow_temp/OneFlow-Benchmark/LanguageModeling/BERT; \ + mkdir -p out/${OUTPUT_FILE}; mv -f log out/${OUTPUT_FILE}/log_1 " + done + + # Result analysis + + host=${hosts[0]} + echo "start training on ${host}" + + echo -p $PORT $DOCKER_USER@$host "cd ~/oneflow_temp/OneFlow-Benchmark/LanguageModeling/BERT; \ + $PYTHON tools/result_analysis.py $IS_F32 \ + --cmp1_file=./old/$OUTPUT_FILE/log_1/out.json \ + --cmp2_file=./out/$OUTPUT_FILE/log_1/out.json \ + --out=./pic/$OUTPUT_FILE.png " + + + ssh -p $PORT $DOCKER_USER@$host "cd ~/oneflow_temp/OneFlow-Benchmark/LanguageModeling/BERT; \ + $PYTHON tools/result_analysis.py $IS_F32 \ + --cmp1_file=./old/$OUTPUT_FILE/log_1/out.json \ + --cmp2_file=./out/$OUTPUT_FILE/log_1/out.json \ + --out=./pic/$OUTPUT_FILE.png " + + echo "multi_machine done" + +} + + +####################################################################################### +# 0 prepare the host list ips for training +######################################################################################## +ALL_NODES=4 + +declare -a host_list=("10.11.0.2" "10.11.0.3" "10.11.0.4" "10.11.0.5") + +if [ $ALL_NODES -gt ${#host_list[@]} ] +then + echo num_nodes should be less than or equal to length of host_list. 
+ exit +fi + +hosts=("${host_list[@]:0:${ALL_NODES}}") +echo "Working on hosts:${hosts[@]}" + +ips=${hosts[0]} +for host in "${hosts[@]:1}" +do + ips+=",${host}" +done + +# ####################################################################################### +# # 1 prepare oneflow_temp folder on each host +# ######################################################################################## + +for host in "${hosts[@]}" +do + ssh -p $PORT $DOCKER_USER@$host " rm -rf ~/oneflow_temp ; mkdir -p ~/oneflow_temp" + # scp -P $PORT -r $PYTHON_WHL $DOCKER_USER@$host:~/oneflow_temp/ + scp -P $PORT -r $BENCH_ROOT $DOCKER_USER@$host:~/oneflow_temp/ + echo "tesst--->" + # ssh -p $PORT $DOCKER_USER@$host "cd ~/oneflow_temp/; \ + # $PYTHON -m pip install $PYTHON_WHL; " + + ssh -p $PORT $DOCKER_USER@$host "cd ~/oneflow_temp/OneFlow-Benchmark/LanguageModeling/BERT; \ + mkdir -p pic; rm -rf pic/*; mkdir -p out; rm -rf out/* " + + +done + +#_______________________________________________________________________________________________ +host=${hosts[0]} +ssh -p $PORT $DOCKER_USER@$host "cd ~; rm -rf ~/out; \ + ossutil64 cp ${BERT_OSSDIR}$CMP_OLD/out.tar.gz .; \ + tar xvf out.tar.gz; \ + cp -rf ~/out ~/oneflow_temp/OneFlow-Benchmark/LanguageModeling/BERT/old;" + +####################################################################################### +# 2 run single +######################################################################################## + +NUM_NODES=1 + + +if [ "$ENABLE_FP32" = 1 ];then + + # f32 adam debug + multi_machine ${NUM_NODES} "sh train_perbert.sh 0 1 ${BSZ} 1 adam ${GPU_NUM_PER_NODE} $NUM_NODES " \ + "single_bert_f32_pretraining_8gpu_${BSZ}bs_130iter_adam_debug" \ + $PYTHON "--f32=1" + # f32 lamb debug + multi_machine ${NUM_NODES} "sh train_perbert.sh 0 1 ${BSZ} 1 lamb ${GPU_NUM_PER_NODE} $NUM_NODES " \ + "single_bert_f32_pretraining_8gpu_${BSZ}bs_100iter_lamb_debug" \ + $PYTHON "--f32=1" + # f32 accumulation adam debug + multi_machine ${NUM_NODES} "sh train_perbert.sh 0 1 ${BSZ} 2 adam ${GPU_NUM_PER_NODE} $NUM_NODES " \ + "single_bert_f32_pretraining_8gpu_${BSZ}bs_100iter_accumulation_debug" \ + $PYTHON "--f32=1" + # f32 accumulation lamb debug + multi_machine ${NUM_NODES} "sh train_perbert.sh 0 1 ${BSZ} 2 lamb ${GPU_NUM_PER_NODE} $NUM_NODES " \ + "single_bert_f32_pretraining_8gpu_${BSZ}bs_100iter_accumulation_lamb_debug" \ + $PYTHON "--f32=1" + echo "BERT USE_FP32" +else + # f16 adam debug + multi_machine ${NUM_NODES} "sh train_perbert.sh 1 1 ${BSZ} 1 adam ${GPU_NUM_PER_NODE} $NUM_NODES " \ + "single_bert_f16_pretraining_8gpu_${BSZ}bs_100iter_debug" \ + $PYTHON "--f32=0" + # f16 lamb debug + multi_machine ${NUM_NODES} "sh train_perbert.sh 1 1 ${BSZ} 1 lamb ${GPU_NUM_PER_NODE} $NUM_NODES " \ + "single_bert_f16_pretraining_8gpu_${BSZ}bs_100iter_lamb_debug" \ + $PYTHON "--f32=0" + + # f16 accumulation adam debug + multi_machine ${NUM_NODES} "sh train_perbert.sh 1 1 ${BSZ} 2 adam ${GPU_NUM_PER_NODE} $NUM_NODES " \ + "single_bert_f16_pretraining_8gpu_${BSZ}bs_100iter_accumulation_debug" \ + $PYTHON "--f32=0" + # f16 accumulation lamb + multi_machine ${NUM_NODES} "sh train_perbert.sh 1 1 ${BSZ} 2 lamb ${GPU_NUM_PER_NODE} $NUM_NODES " \ + "single_bert_f16_pretraining_8gpu_${BSZ}bs_100iter_accumulation_lamb_debug" \ + $PYTHON "--f32=0" + echo "BERT USE_FP16" +fi + + + + +#____________________________________________________________________________________________________ + + + +# ####################################################################################### +# # 2 run 
multi-machine +# ######################################################################################## +NUM_NODES=4 + + +if [ "$ENABLE_FP32" = 1 ];then + + # f32 adam debug + multi_machine ${NUM_NODES} "sh train_perbert.sh 0 1 ${BSZ} 1 adam ${GPU_NUM_PER_NODE} $NUM_NODES " \ + "multi_bert_f32_pretraining_8gpu_${BSZ}bs_130iter_adam_debug" \ + $PYTHON "--f32=1" + # f32 lamb debug + multi_machine ${NUM_NODES} "sh train_perbert.sh 0 1 ${BSZ} 1 lamb ${GPU_NUM_PER_NODE} $NUM_NODES " \ + "multi_bert_f32_pretraining_8gpu_${BSZ}bs_100iter_lamb_debug" \ + $PYTHON "--f32=1" + # f32 accumulation adam debug + multi_machine ${NUM_NODES} "sh train_perbert.sh 0 1 ${BSZ} 2 adam ${GPU_NUM_PER_NODE} $NUM_NODES " \ + "multi_bert_f32_pretraining_8gpu_${BSZ}bs_100iter_accumulation_debug" \ + $PYTHON "--f32=1" + # f32 accumulation lamb debug + multi_machine ${NUM_NODES} "sh train_perbert.sh 0 1 ${BSZ} 2 lamb ${GPU_NUM_PER_NODE} $NUM_NODES " \ + "multi_bert_f32_pretraining_8gpu_${BSZ}bs_100iter_accumulation_lamb_debug" \ + $PYTHON "--f32=1" + + echo "BERT USE_FP32" +else + # f16 adam debug + multi_machine ${NUM_NODES} "sh train_perbert.sh 1 1 ${BSZ} 1 adam ${GPU_NUM_PER_NODE} $NUM_NODES " \ + "multi_bert_f16_pretraining_8gpu_${BSZ}bs_100iter_debug" \ + $PYTHON "--f32=0" + # f16 lamb debug + multi_machine ${NUM_NODES} "sh train_perbert.sh 1 1 ${BSZ} 1 lamb ${GPU_NUM_PER_NODE} $NUM_NODES " \ + "multi_bert_f16_pretraining_8gpu_${BSZ}bs_100iter_lamb_debug" \ + $PYTHON "--f32=0" + # f16 accumulation adam debug + multi_machine ${NUM_NODES} "sh train_perbert.sh 1 1 ${BSZ} 2 adam ${GPU_NUM_PER_NODE} $NUM_NODES " \ + "multi_bert_f16_pretraining_8gpu_${BSZ}bs_100iter_accumulation_debug" \ + $PYTHON "--f32=0" + # f16 accumulation lamb + multi_machine ${NUM_NODES} "sh train_perbert.sh 1 1 ${BSZ} 2 lamb ${GPU_NUM_PER_NODE} $NUM_NODES " \ + "multi_bert_f16_pretraining_8gpu_${BSZ}bs_100iter_accumulation_lamb_debug" \ + $PYTHON "--f32=0" + echo "BERT USE_FP16" +fi + + +# __________________________________________________________________________________________ + +host=${hosts[0]} +echo "start tar on ${host}" + +ssh $USER@$host "cd ~/oneflow_temp/OneFlow-Benchmark/LanguageModeling/BERT; \ + tar -zcvf out.tar.gz out; \ + $PYTHON tools/stitching_pic.py --dir=pic --out_file=./pic/all.png " + +echo "multi_machine done" From 8f0997405d6353740d6d0d7306d1e1f016716d88 Mon Sep 17 00:00:00 2001 From: luqiang-guo <702572275@qq.com> Date: Tue, 22 Jun 2021 09:54:54 +0800 Subject: [PATCH 15/23] Modify the automatic test script --- LanguageModeling/BERT/oneflow_auto_bert.sh | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/LanguageModeling/BERT/oneflow_auto_bert.sh b/LanguageModeling/BERT/oneflow_auto_bert.sh index 9683289..fc45bc2 100755 --- a/LanguageModeling/BERT/oneflow_auto_bert.sh +++ b/LanguageModeling/BERT/oneflow_auto_bert.sh @@ -7,13 +7,15 @@ CMP_OLD=$3 # PYTHON_WHL=oneflow-0.3.5+cu112.git.325160b-cp38-cp38-linux_x86_64.whl # CMP_OLD=325160bcfb786b166b063e669aea345fadee2da7 +BERT_OSSDIR=oss://oneflow-staging/branch/master/bert/ +DOWN_FILE="wget https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/bert/${CMP_OLD}/out.tar.gz" +# DOWN_FILE="ossutil64 cp ${BERT_OSSDIR}$CMP_OLD/out.tar.gz .; " ENABLE_FP32=0 GPU_NUM_PER_NODE=8 BSZ=64 -BERT_OSSDIR=oss://oneflow-staging/branch/master/bert/ PORT=57520 -# PYTHON="/home/guoluqiang/miniconda3/envs/of/bin/python" + PYTHON="python3.8" DOCKER_USER=root @@ -133,11 +135,11 @@ done for host in "${hosts[@]}" do ssh -p $PORT $DOCKER_USER@$host " rm -rf 
~/oneflow_temp ; mkdir -p ~/oneflow_temp" - # scp -P $PORT -r $PYTHON_WHL $DOCKER_USER@$host:~/oneflow_temp/ scp -P $PORT -r $BENCH_ROOT $DOCKER_USER@$host:~/oneflow_temp/ echo "tesst--->" - # ssh -p $PORT $DOCKER_USER@$host "cd ~/oneflow_temp/; \ - # $PYTHON -m pip install $PYTHON_WHL; " + scp -P $PORT -r $PYTHON_WHL $DOCKER_USER@$host:~/oneflow_temp/ + ssh -p $PORT $DOCKER_USER@$host "cd ~/oneflow_temp/; \ + $PYTHON -m pip install $PYTHON_WHL; " ssh -p $PORT $DOCKER_USER@$host "cd ~/oneflow_temp/OneFlow-Benchmark/LanguageModeling/BERT; \ mkdir -p pic; rm -rf pic/*; mkdir -p out; rm -rf out/* " @@ -148,10 +150,11 @@ done #_______________________________________________________________________________________________ host=${hosts[0]} ssh -p $PORT $DOCKER_USER@$host "cd ~; rm -rf ~/out; \ - ossutil64 cp ${BERT_OSSDIR}$CMP_OLD/out.tar.gz .; \ + ${DOWN_FILE}; \ tar xvf out.tar.gz; \ cp -rf ~/out ~/oneflow_temp/OneFlow-Benchmark/LanguageModeling/BERT/old;" + ####################################################################################### # 2 run single ######################################################################################## @@ -197,6 +200,7 @@ else "single_bert_f16_pretraining_8gpu_${BSZ}bs_100iter_accumulation_lamb_debug" \ $PYTHON "--f32=0" echo "BERT USE_FP16" + fi From a4cc1eb24733ccf373616eb7586aa411891c7f37 Mon Sep 17 00:00:00 2001 From: luqiang-guo <702572275@qq.com> Date: Tue, 22 Jun 2021 10:20:09 +0800 Subject: [PATCH 16/23] Add multi-machine docker script --- LanguageModeling/BERT/prepare_auto_docker.sh | 50 ++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100755 LanguageModeling/BERT/prepare_auto_docker.sh diff --git a/LanguageModeling/BERT/prepare_auto_docker.sh b/LanguageModeling/BERT/prepare_auto_docker.sh new file mode 100755 index 0000000..18aacbe --- /dev/null +++ b/LanguageModeling/BERT/prepare_auto_docker.sh @@ -0,0 +1,50 @@ +NUM_NODES=${1-4} + + +####################################################################################### +# 0 prepare the host list ips for training +######################################################################################## +declare -a host_list=("10.11.0.2" "10.11.0.3" "10.11.0.4" "10.11.0.5") + +if [ $NUM_NODES -gt ${#host_list[@]} ] +then + echo num_nodes should be less than or equal to length of host_list. 
+ exit +fi + +hosts=("${host_list[@]:0:${NUM_NODES}}") +echo "Working on hosts:${hosts[@]}" + +ips=${hosts[0]} +for host in "${hosts[@]:1}" +do + ips+=",${host}" +done + +####################################################################################### +# 1 prepare docker image +######################################################################################## +WORK_PATH=`pwd` + +wget https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/bert/docker_image/oneflow_autobert.tar + +for host in "${hosts[@]}" +do + ssh $USER@$host "mkdir -p ~/oneflow_docker_temp; rm -rf ~/oneflow_docker_temp/*" + scp -r oneflow_autobert.tar $USER@$host:~/oneflow_docker_temp + ssh $USER@$host " docker load --input ~/oneflow_docker_temp/oneflow_autobert.tar; " + + echo "tesst--->" + ssh $USER@$host " \ + docker run --runtime=nvidia --rm -i -d --privileged --shm-size=16g \ + --ulimit memlock=-1 --net=host \ + --name oneflow-auto-test \ + --cap-add=IPC_LOCK --device=/dev/infiniband \ + -v /data/bert/:/data/bert/ \ + -v /datasets/bert/:/datasets/bert/ \ + -v /datasets/ImageNet/OneFlow/:/datasets/ImageNet/OneFlow/ \ + -v /data/imagenet/ofrecord:/data/imagenet/ofrecord \ + -v ${WORK_PATH}:/workspace/oneflow-test \ + -w /workspace/oneflow-test \ + oneflow:cu11.2-ubuntu18.04 bash -c \"/usr/sbin/sshd -p 57520 && bash\" " +done From 8c32fa4c7c780ac246db54ea2839c771e2e25e8e Mon Sep 17 00:00:00 2001 From: luqiang-guo <702572275@qq.com> Date: Tue, 22 Jun 2021 11:00:47 +0800 Subject: [PATCH 17/23] Add close multi-machine docker script --- LanguageModeling/BERT/stop_all_docker.sh | 31 ++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100755 LanguageModeling/BERT/stop_all_docker.sh diff --git a/LanguageModeling/BERT/stop_all_docker.sh b/LanguageModeling/BERT/stop_all_docker.sh new file mode 100755 index 0000000..78c6078 --- /dev/null +++ b/LanguageModeling/BERT/stop_all_docker.sh @@ -0,0 +1,31 @@ +NUM_NODES=${1-4} + +####################################################################################### +# 0 prepare the host list ips for training +######################################################################################## +declare -a host_list=("10.11.0.2" "10.11.0.3" "10.11.0.4" "10.11.0.5") + +if [ $NUM_NODES -gt ${#host_list[@]} ] +then + echo num_nodes should be less than or equal to length of host_list. 
+ exit +fi + +hosts=("${host_list[@]:0:${NUM_NODES}}") +echo "Working on hosts:${hosts[@]}" + +ips=${hosts[0]} +for host in "${hosts[@]:1}" +do + ips+=",${host}" +done + +####################################################################################### +# 1 prepare docker image +######################################################################################## + +for host in "${hosts[@]}" +do + ssh $USER@$host "docker kill oneflow-auto-test" + +done From 2d36b9a556f40b9530aa0727e8034450a7e48289 Mon Sep 17 00:00:00 2001 From: mu <702572275@qq.com> Date: Tue, 22 Jun 2021 16:46:44 +0800 Subject: [PATCH 18/23] Modify small details --- LanguageModeling/BERT/oneflow_auto_bert.sh | 2 +- LanguageModeling/BERT/run_squad.sh | 2 +- LanguageModeling/BERT/tools/stitching_pic.py | 2 -- LanguageModeling/BERT/train_perbert.sh | 3 +-- requirements.txt | 3 ++- 5 files changed, 5 insertions(+), 7 deletions(-) diff --git a/LanguageModeling/BERT/oneflow_auto_bert.sh b/LanguageModeling/BERT/oneflow_auto_bert.sh index fc45bc2..e8af958 100755 --- a/LanguageModeling/BERT/oneflow_auto_bert.sh +++ b/LanguageModeling/BERT/oneflow_auto_bert.sh @@ -36,7 +36,7 @@ multi_machine() # param 5 IS_F32=$5 - declare -a host_list=("10.10.0.2" "10.10.0.3" "10.10.0.4" "10.10.0.5") + declare -a host_list=("10.11.0.2" "10.11.0.3" "11.10.0.4" "10.11.0.5") if [ $NUM_NODES -gt ${#host_list[@]} ] then diff --git a/LanguageModeling/BERT/run_squad.sh b/LanguageModeling/BERT/run_squad.sh index 20ade70..4fd9e4b 100644 --- a/LanguageModeling/BERT/run_squad.sh +++ b/LanguageModeling/BERT/run_squad.sh @@ -1,4 +1,4 @@ -BENCH_ROOT_DIR=/home/oyy/workspace/OneFlow-Benchmark/LanguageModeling/BERT +BENCH_ROOT_DIR=/path/to/ # pretrained model dir PRETRAINED_MODEL=/DATA/disk1/of_output/uncased_L-12_H-768_A-12_oneflow diff --git a/LanguageModeling/BERT/tools/stitching_pic.py b/LanguageModeling/BERT/tools/stitching_pic.py index c39f8f9..dac5c33 100644 --- a/LanguageModeling/BERT/tools/stitching_pic.py +++ b/LanguageModeling/BERT/tools/stitching_pic.py @@ -36,5 +36,3 @@ def main(): if __name__ == "__main__": main() - - \ No newline at end of file diff --git a/LanguageModeling/BERT/train_perbert.sh b/LanguageModeling/BERT/train_perbert.sh index dba7c10..61f57c5 100644 --- a/LanguageModeling/BERT/train_perbert.sh +++ b/LanguageModeling/BERT/train_perbert.sh @@ -21,9 +21,8 @@ LOG_FOLDER=./log/ PRINT_ITER=1 ITER_NUM=130 -NODE_IPS='10.10.0.2','10.10.0.3','10.10.0.4','10.10.0.5' +NODE_IPS='10.11.0.2','10.11.0.3','10.11.0.4','10.11.0.5' -# INIT_MODEL=/opt/initial_model INIT_MODEL=/data/bert/initial_model/ #DATA_DIR=/data/bert_dataset DATA_DIR=/data/bert/wiki_seq_len_128/ diff --git a/requirements.txt b/requirements.txt index 1eedb7a..1b55c50 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ numpy>=1.17.2 pandas>=1.0.4 -pillow>=7.2.0 \ No newline at end of file +pillow>=7.2.0 +matplotlib>=3.4.2 \ No newline at end of file From 638d576b1a224b30f90fe1a1491b64df5dd5ad70 Mon Sep 17 00:00:00 2001 From: mu <702572275@qq.com> Date: Tue, 22 Jun 2021 16:54:22 +0800 Subject: [PATCH 19/23] oneflow_auto_bert.sh can replace train_perbert_list.sh --- LanguageModeling/BERT/train_perbert_list.sh | 282 -------------------- 1 file changed, 282 deletions(-) delete mode 100644 LanguageModeling/BERT/train_perbert_list.sh diff --git a/LanguageModeling/BERT/train_perbert_list.sh b/LanguageModeling/BERT/train_perbert_list.sh deleted file mode 100644 index 844227d..0000000 --- a/LanguageModeling/BERT/train_perbert_list.sh +++ /dev/null @@ -1,282 +0,0 @@ 
-NUM=${2-1} - -file_op() -{ - mkdir -p $1 - mv -f log_f* $1 - - # tar -zcvf $1.tar.gz $1 - # rm -rf $1 -} -################################################################################# -mkdir -p out -mkdir -p pic -rm -rf out/* -rm -rf pic/* - - -# ############################################################################### -# # f32 adam -# ############################################################################### - -# for ((i = 1; i <= ${NUM}; i++ )) -# do -# echo $i -# sh train_perbert.sh 0 0 64 1 adam -# cp -rf log/ log_f32_${i} -# done - -# file_op out/bert_f32_pretraining_8gpu_64bs_100iter - -# # mkdir bert_f32_pretraining_8gpu_64bs_100iter -# # mv -f log_f32_* bert_f32_pretraining_8gpu_64bs_100iter - -# # tar -zcvf bert_f32_pretraining_8gpu_64bs_100iter.tar.gz \ -# # bert_f32_pretraining_8gpu_64bs_100iter -# # rm -rf bert_f32_pretraining_8gpu_64bs_100iter -# ############################################################################### -# # f32 lamb -# ############################################################################### - -# for (( i = 1; i <= ${NUM}; i++ )) -# do -# echo $i -# sh train_perbert.sh 0 0 64 1 lamb -# cp -rf log/ log_f32_${i} -# done - -# file_op out/bert_f32_pretraining_8gpu_64bs_100iter_lamb - -# # ############################################################################### -# # # f16 adam -# # ############################################################################### - -# for (( i = 1; i <= ${NUM}; i++ )) -# do -# echo $i -# sh train_perbert.sh 1 0 64 1 adam -# cp -rf log/ log_f16_${i} -# done - -# file_op out/bert_f16_pretraining_8gpu_64bs_100iter - -# ############################################################################### -# # f16 lamb -# ############################################################################### - -# for (( i = 1; i <= ${NUM}; i++ )) -# do -# echo $i -# sh train_perbert.sh 1 0 64 1 lamb -# cp -rf log/ log_f16_${i} -# done - -# file_op out/bert_f16_pretraining_8gpu_64bs_100iter_lamb - -# ############################################################################### -# # f32 accumulation adam -# ############################################################################### - -# for (( i = 1; i <= ${NUM}; i++ )) -# do -# echo $i -# sh train_perbert.sh 0 0 32 2 adam -# cp -rf log/ log_f32_${i} -# done - -# file_op out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation - -# ############################################################################### -# # f32 accumulation lamb -# ############################################################################### - -# for (( i = 1; i <= ${NUM}; i++ )) -# do -# echo $i -# sh train_perbert.sh 0 0 32 2 lamb -# cp -rf log/ log_f32_${i} -# done - -# file_op out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb - -# ############################################################################### -# # f16 accumulation adam -# ############################################################################### - -# for (( i = 1; i <= ${NUM}; i++ )) -# do -# echo $i -# sh train_perbert.sh 1 0 32 2 adam -# cp -rf log/ log_f16_${i} -# done - -# file_op out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation - -# ############################################################################### -# # f16 accumulation lamb -# ############################################################################### - -# for (( i = 1; i <= ${NUM}; i++ )) -# do -# echo $i -# sh train_perbert.sh 1 0 32 2 lamb -# cp -rf log/ log_f16_${i} -# done - -# file_op 
out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb - - -############################################################################### -# f32 adam debug -############################################################################### - -for (( i = 1; i <= ${NUM}; i++ )) -do - echo $i - sh train_perbert.sh 0 1 64 1 adam - cp -rf log/ log_f32_${i} -done - -file_op out/bert_f32_pretraining_8gpu_64bs_100iter_debug - -python tools/result_analysis.py --f32=1 \ - --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_debug/log_f32_1/out.json \ - --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_debug/log_f32_1/out.json \ - --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_debug.png -############################################################################### -# f32 lamb debug -############################################################################### - -for (( i = 1; i <= ${NUM}; i++ )) -do - echo $i - sh train_perbert.sh 0 1 64 1 lamb - cp -rf log/ log_f32_${i} -done - -file_op out/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug - -python tools/result_analysis.py --f32=1 \ - --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug/log_f32_1/out.json \ - --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug/log_f32_1/out.json \ - --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug.png -############################################################################### -# f16 adam debug -############################################################################### - -for (( i = 1; i <= ${NUM}; i++ )) -do - echo $i - sh train_perbert.sh 1 1 64 1 adam - cp -rf log/ log_f16_${i} -done -file_op out/bert_f16_pretraining_8gpu_64bs_100iter_debug - -python tools/result_analysis.py --f32=0 \ - --cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_debug/log_f16_1/out.json \ - --cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_debug/log_f16_1/out.json \ - --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_debug.png -############################################################################### -# f16 lamb debug -############################################################################### - -for (( i = 1; i <= ${NUM}; i++ )) -do - echo $i - sh train_perbert.sh 1 1 64 1 lamb - cp -rf log/ log_f16_${i} -done - -file_op out/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug - -python tools/result_analysis.py --f32=0 \ - --cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug/log_f16_1/out.json \ - --cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug/log_f16_1/out.json \ - --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_lamb_debug.png -############################################################################### -# f32 accumulation adam debug -############################################################################### - -for (( i = 1; i <= ${NUM}; i++ )) -do - echo $i - sh train_perbert.sh 0 1 32 2 adam - cp -rf log/ log_f32_${i} - -done - -file_op out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug - -python tools/result_analysis.py --f32=1 \ - --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f32_1/out.json \ - --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f32_1/out.json \ - --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_debug.png - -############################################################################### -# f32 accumulation lamb debug -############################################################################### - -for (( i = 1; i 
<= ${NUM}; i++ )) -do - echo $i - sh train_perbert.sh 0 1 32 2 lamb - cp -rf log/ log_f32_${i} -done - -file_op out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug - -python tools/result_analysis.py --f32=1 \ - --cmp1_file=old/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f32_1/out.json \ - --cmp2_file=out/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f32_1/out.json \ - --out=pic/bert_f32_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug.png - -############################################################################### -# f16 accumulation adam debug -############################################################################### - -for (( i = 1; i <= ${NUM}; i++ )) -do - echo $i - sh train_perbert.sh 1 1 32 2 adam - cp -rf log/ log_f16_${i} -done - -file_op out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug - -python tools/result_analysis.py --f32=0 \ - --cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f16_1/out.json \ - --cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug/log_f16_1/out.json \ - --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_debug.png -############################################################################### -# f16 accumulation lamb -############################################################################### - -for (( i = 1; i <= ${NUM}; i++ )) -do - echo $i - sh train_perbert.sh 1 1 32 2 lamb - cp -rf log/ log_f16_${i} -done - -file_op out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug - -python tools/result_analysis.py --f32=0 \ - --cmp1_file=old/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f16_1/out.json \ - --cmp2_file=out/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug/log_f16_1/out.json \ - --out=pic/bert_f16_pretraining_8gpu_64bs_100iter_accumulation_lamb_debug.png -# ############################################################################## -# tar -# ############################################################################## - -tar -zcvf out.tar.gz out - -python tools/stitching_pic.py --dir=pic --out_file=./pic/all.png -# rm -rf out -############################################################################### -# upload -############################################################################### - - - - - From 4b217e9b9adc6b80d4ddeea85c42dd4d6630748d Mon Sep 17 00:00:00 2001 From: mu <702572275@qq.com> Date: Tue, 22 Jun 2021 16:56:31 +0800 Subject: [PATCH 20/23] modified oneflow_auto_bert.sh ip --- LanguageModeling/BERT/oneflow_auto_bert.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LanguageModeling/BERT/oneflow_auto_bert.sh b/LanguageModeling/BERT/oneflow_auto_bert.sh index e8af958..9d53f41 100755 --- a/LanguageModeling/BERT/oneflow_auto_bert.sh +++ b/LanguageModeling/BERT/oneflow_auto_bert.sh @@ -36,7 +36,7 @@ multi_machine() # param 5 IS_F32=$5 - declare -a host_list=("10.11.0.2" "10.11.0.3" "11.10.0.4" "10.11.0.5") + declare -a host_list=("10.11.0.2" "10.11.0.3" "10.11.0.4" "10.11.0.5") if [ $NUM_NODES -gt ${#host_list[@]} ] then From 45d9aced2d07e712e7675c61a31b003b88e9af10 Mon Sep 17 00:00:00 2001 From: mu <702572275@qq.com> Date: Tue, 22 Jun 2021 18:42:59 +0800 Subject: [PATCH 21/23] Fix merge errors --- LanguageModeling/BERT/config.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/LanguageModeling/BERT/config.py b/LanguageModeling/BERT/config.py index b658833..79ec270 100644 --- a/LanguageModeling/BERT/config.py +++ 
b/LanguageModeling/BERT/config.py @@ -63,9 +63,6 @@ def get_parser(parser=None): parser.add_argument("--optimizer_type", type=str, default="adam", help="Optimizer used for training - LAMB or ADAM") - parser.add_argument("--optimizer_type", type=str, default="adam", - help="Optimizer used for training - LAMB or ADAM") - # log and resore/save parser.add_argument("--loss_print_every_n_iter", type=int, default=10, required=False, help="print loss every n iteration") From bb3d0bf6c65572fcab41a4b29f55ade0c077eb91 Mon Sep 17 00:00:00 2001 From: luqiang-guo <702572275@qq.com> Date: Wed, 23 Jun 2021 10:03:22 +0800 Subject: [PATCH 22/23] Modify the bert automatic test script --- LanguageModeling/BERT/oneflow_auto_bert.sh | 131 +++++---------------- 1 file changed, 31 insertions(+), 100 deletions(-) diff --git a/LanguageModeling/BERT/oneflow_auto_bert.sh b/LanguageModeling/BERT/oneflow_auto_bert.sh index 9d53f41..1623b67 100755 --- a/LanguageModeling/BERT/oneflow_auto_bert.sh +++ b/LanguageModeling/BERT/oneflow_auto_bert.sh @@ -57,11 +57,11 @@ multi_machine() do echo "start training on ${host}" - echo -p $PORT $DOCKER_USER@$host "cd ~/oneflow_temp/OneFlow-Benchmark/LanguageModeling/BERT; \ - nohup $RUN_CMD 0 $PYTHON >/dev/null 2>&1 &" + echo -p $PORT $DOCKER_USER@$host "cd ~/oneflow_temp/OneFlow-Benchmark/LanguageModeling/BERT; \ + nohup $RUN_CMD 0 $PYTHON >/dev/null 2>&1 &" - ssh -p $PORT $DOCKER_USER@$host "cd ~/oneflow_temp/OneFlow-Benchmark/LanguageModeling/BERT; \ - nohup $RUN_CMD 0 $PYTHON >/dev/null 2>&1 &" + ssh -p $PORT $DOCKER_USER@$host "cd ~/oneflow_temp/OneFlow-Benchmark/LanguageModeling/BERT; \ + nohup $RUN_CMD 0 $PYTHON >/dev/null 2>&1 &" done @@ -136,10 +136,10 @@ for host in "${hosts[@]}" do ssh -p $PORT $DOCKER_USER@$host " rm -rf ~/oneflow_temp ; mkdir -p ~/oneflow_temp" scp -P $PORT -r $BENCH_ROOT $DOCKER_USER@$host:~/oneflow_temp/ - echo "tesst--->" - scp -P $PORT -r $PYTHON_WHL $DOCKER_USER@$host:~/oneflow_temp/ - ssh -p $PORT $DOCKER_USER@$host "cd ~/oneflow_temp/; \ - $PYTHON -m pip install $PYTHON_WHL; " + echo "test--->" + # scp -P $PORT -r $PYTHON_WHL $DOCKER_USER@$host:~/oneflow_temp/ + # ssh -p $PORT $DOCKER_USER@$host "cd ~/oneflow_temp/; \ + # $PYTHON -m pip install $PYTHON_WHL; " ssh -p $PORT $DOCKER_USER@$host "cd ~/oneflow_temp/OneFlow-Benchmark/LanguageModeling/BERT; \ mkdir -p pic; rm -rf pic/*; mkdir -p out; rm -rf out/* " @@ -159,105 +159,36 @@ ssh -p $PORT $DOCKER_USER@$host "cd ~; rm -rf ~/out; \ # 2 run single ######################################################################################## -NUM_NODES=1 - - if [ "$ENABLE_FP32" = 1 ];then - - # f32 adam debug - multi_machine ${NUM_NODES} "sh train_perbert.sh 0 1 ${BSZ} 1 adam ${GPU_NUM_PER_NODE} $NUM_NODES " \ - "single_bert_f32_pretraining_8gpu_${BSZ}bs_130iter_adam_debug" \ - $PYTHON "--f32=1" - # f32 lamb debug - multi_machine ${NUM_NODES} "sh train_perbert.sh 0 1 ${BSZ} 1 lamb ${GPU_NUM_PER_NODE} $NUM_NODES " \ - "single_bert_f32_pretraining_8gpu_${BSZ}bs_100iter_lamb_debug" \ - $PYTHON "--f32=1" - # f32 accumulation adam debug - multi_machine ${NUM_NODES} "sh train_perbert.sh 0 1 ${BSZ} 2 adam ${GPU_NUM_PER_NODE} $NUM_NODES " \ - "single_bert_f32_pretraining_8gpu_${BSZ}bs_100iter_accumulation_debug" \ - $PYTHON "--f32=1" - # f32 accumulation lamb debug - multi_machine ${NUM_NODES} "sh train_perbert.sh 0 1 ${BSZ} 2 lamb ${GPU_NUM_PER_NODE} $NUM_NODES " \ - "single_bert_f32_pretraining_8gpu_${BSZ}bs_100iter_accumulation_lamb_debug" \ - $PYTHON "--f32=1" - echo "BERT USE_FP32" + float_types=(0 1) else 
- # f16 adam debug - multi_machine ${NUM_NODES} "sh train_perbert.sh 1 1 ${BSZ} 1 adam ${GPU_NUM_PER_NODE} $NUM_NODES " \ - "single_bert_f16_pretraining_8gpu_${BSZ}bs_100iter_debug" \ - $PYTHON "--f32=0" - # f16 lamb debug - multi_machine ${NUM_NODES} "sh train_perbert.sh 1 1 ${BSZ} 1 lamb ${GPU_NUM_PER_NODE} $NUM_NODES " \ - "single_bert_f16_pretraining_8gpu_${BSZ}bs_100iter_lamb_debug" \ - $PYTHON "--f32=0" - - # f16 accumulation adam debug - multi_machine ${NUM_NODES} "sh train_perbert.sh 1 1 ${BSZ} 2 adam ${GPU_NUM_PER_NODE} $NUM_NODES " \ - "single_bert_f16_pretraining_8gpu_${BSZ}bs_100iter_accumulation_debug" \ - $PYTHON "--f32=0" - # f16 accumulation lamb - multi_machine ${NUM_NODES} "sh train_perbert.sh 1 1 ${BSZ} 2 lamb ${GPU_NUM_PER_NODE} $NUM_NODES " \ - "single_bert_f16_pretraining_8gpu_${BSZ}bs_100iter_accumulation_lamb_debug" \ - $PYTHON "--f32=0" - echo "BERT USE_FP16" - + float_types=(0 ) fi +num_nodes=(1 4) +optimizers=(adam lamb) +accumulations=(1 2) +FLOAT_STR=(f16 f32) +NUM_NODE_STR=(null single multi multi multi) +for ftype in ${float_types[@]} +do + for num_node in ${num_nodes[@]} + do + for optimizer in ${optimizers[@]} + do + for accumulation in ${accumulations[@]} + do + name=${NUM_NODE_STR[$num_node]}_bert_${FLOAT_STR[$ftype]}_pretraining_ + multi_machine ${num_node} "sh train_perbert.sh 1 1 ${BSZ} 1 ${optimizer} ${GPU_NUM_PER_NODE} $num_node " \ + "${name}${GPU_NUM_PER_NODE}gpu_${BSZ}bs_accumulation-${accumulation}_${optimizer}_debug" \ + $PYTHON "--f32=${ftype}" + done #end accumulations + done #end optimizer + done #end num_node +done #float_types -#____________________________________________________________________________________________________ - - - -# ####################################################################################### -# # 2 run multi-machine -# ######################################################################################## -NUM_NODES=4 - - -if [ "$ENABLE_FP32" = 1 ];then - - # f32 adam debug - multi_machine ${NUM_NODES} "sh train_perbert.sh 0 1 ${BSZ} 1 adam ${GPU_NUM_PER_NODE} $NUM_NODES " \ - "multi_bert_f32_pretraining_8gpu_${BSZ}bs_130iter_adam_debug" \ - $PYTHON "--f32=1" - # f32 lamb debug - multi_machine ${NUM_NODES} "sh train_perbert.sh 0 1 ${BSZ} 1 lamb ${GPU_NUM_PER_NODE} $NUM_NODES " \ - "multi_bert_f32_pretraining_8gpu_${BSZ}bs_100iter_lamb_debug" \ - $PYTHON "--f32=1" - # f32 accumulation adam debug - multi_machine ${NUM_NODES} "sh train_perbert.sh 0 1 ${BSZ} 2 adam ${GPU_NUM_PER_NODE} $NUM_NODES " \ - "multi_bert_f32_pretraining_8gpu_${BSZ}bs_100iter_accumulation_debug" \ - $PYTHON "--f32=1" - # f32 accumulation lamb debug - multi_machine ${NUM_NODES} "sh train_perbert.sh 0 1 ${BSZ} 2 lamb ${GPU_NUM_PER_NODE} $NUM_NODES " \ - "multi_bert_f32_pretraining_8gpu_${BSZ}bs_100iter_accumulation_lamb_debug" \ - $PYTHON "--f32=1" - - echo "BERT USE_FP32" -else - # f16 adam debug - multi_machine ${NUM_NODES} "sh train_perbert.sh 1 1 ${BSZ} 1 adam ${GPU_NUM_PER_NODE} $NUM_NODES " \ - "multi_bert_f16_pretraining_8gpu_${BSZ}bs_100iter_debug" \ - $PYTHON "--f32=0" - # f16 lamb debug - multi_machine ${NUM_NODES} "sh train_perbert.sh 1 1 ${BSZ} 1 lamb ${GPU_NUM_PER_NODE} $NUM_NODES " \ - "multi_bert_f16_pretraining_8gpu_${BSZ}bs_100iter_lamb_debug" \ - $PYTHON "--f32=0" - # f16 accumulation adam debug - multi_machine ${NUM_NODES} "sh train_perbert.sh 1 1 ${BSZ} 2 adam ${GPU_NUM_PER_NODE} $NUM_NODES " \ - "multi_bert_f16_pretraining_8gpu_${BSZ}bs_100iter_accumulation_debug" \ - $PYTHON "--f32=0" - # f16 accumulation lamb - 
multi_machine ${NUM_NODES} "sh train_perbert.sh 1 1 ${BSZ} 2 lamb ${GPU_NUM_PER_NODE} $NUM_NODES " \ - "multi_bert_f16_pretraining_8gpu_${BSZ}bs_100iter_accumulation_lamb_debug" \ - $PYTHON "--f32=0" - echo "BERT USE_FP16" -fi - - -# __________________________________________________________________________________________ host=${hosts[0]} echo "start tar on ${host}" From d81c54abec57ba10ea29ed80f9b294e2a7aac18d Mon Sep 17 00:00:00 2001 From: mu <702572275@qq.com> Date: Wed, 23 Jun 2021 10:41:02 +0800 Subject: [PATCH 23/23] Fix merge errors --- LanguageModeling/BERT/run_pretraining.py | 6 ------ LanguageModeling/BERT/train_perbert.sh | 12 +----------- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/LanguageModeling/BERT/run_pretraining.py b/LanguageModeling/BERT/run_pretraining.py index 9e9f85d..3d296e6 100755 --- a/LanguageModeling/BERT/run_pretraining.py +++ b/LanguageModeling/BERT/run_pretraining.py @@ -31,8 +31,6 @@ parser.add_argument("--batch_size_per_device", type=int, default=64) parser.add_argument("--debug", type=int, default=0) parser.add_argument("--data_load_random", type=int, default=1) -parser.add_argument("--model_load", type=str, default=None) - args = parser.parse_args() configs.print_args(args) @@ -125,10 +123,6 @@ def main(): snapshot = Snapshot(args.model_save_dir, args.model_load_dir) - - if args.model_load != None: - flow.load_variables(flow.checkpoint.get(args.model_load)) - print('num_accumulation_steps:', args.num_accumulation_steps) metric = Metric(desc='train', print_steps=args.loss_print_every_n_iter, batch_size=batch_size * args.num_accumulation_steps, keys=['total_loss', 'mlm_loss', 'nsp_loss']) diff --git a/LanguageModeling/BERT/train_perbert.sh b/LanguageModeling/BERT/train_perbert.sh index 61f57c5..079dadf 100644 --- a/LanguageModeling/BERT/train_perbert.sh +++ b/LanguageModeling/BERT/train_perbert.sh @@ -57,10 +57,6 @@ fi #bert_f32_pretraining_8gpu_64bs_100iter_lamb_debug mkdir -p $LOG_FOLDER -# OUTFILE=bert_pretraining_${FP_NAME}_${GPUS_PER_NODE}gpu_${BATCH_SIZE}bs_${ITER_NUM}iter_${OPTIMIZER}\ -# _${DEBUG_NAME} -# mkdir -p $OUTFILE - LOGFILE=$LOG_FOLDER/bert_pretraining_${FP_NAME}_${GPUS_PER_NODE}gpu_${BATCH_SIZE}bs_${ITER_NUM}iter_${OPTIMIZER}\ _${DEBUG_NAME}.log @@ -71,18 +67,12 @@ MEM_FILE=$LOG_FOLDER/memory.log echo LOGFILE=$LOGFILE echo DATA_DIR=$DATA_DIR -#${NNODES}n${GPUS_PER_NODE}g_dp${D_P}_mp${M_P}_pp${P_P}_mbs${MICRO_BATCH_SIZE}_gbs${GLOABAL_BATCH_SIZE}_pretrain_${NODE_RANK}.log rm -rf ${MODEL_DIR}/* rm -rf ${LOG_FOLDER}/* NVPROF=baseline-report_${NODE_RANK} #-o ${NVPROF} - -# -g $GPUS_PER_NODE \ -# -n 0.5 \ - - export NCCL_DEBUG=INFO export PYTHONUNBUFFERED=1 @@ -118,7 +108,7 @@ $PYTHON run_pretraining.py \ --model_save_dir=./snapshots \ --debug=${DEBUG_MODE} \ --data_load_random=0 \ - --model_load=${INIT_MODEL} \ + --model_load_dir=${INIT_MODEL} \ ${FP_CMD} \ --optimizer_type=${OPTIMIZER} \ 2>&1 | tee ${LOGFILE}
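For reference, the positional interface of train_perbert.sh that the wrappers above rely on, reconstructed from the call sites and the parameter defaults visible in these diffs; the angle-bracket names are descriptive placeholders only and do not appear in the script itself:

    # sh train_perbert.sh <use_fp16> <debug> <batch_size_per_device> <accumulation_steps> \
    #                     <optimizer> <gpus_per_node> <num_nodes> <is_master> <python_bin>
    #
    # e.g. an fp16 LAMB run with 2 accumulation steps on 4 nodes with 8 GPUs each,
    # launched on the master host with the interpreter used by oneflow_auto_bert.sh:
    sh train_perbert.sh 1 1 64 2 lamb 8 4 1 python3.8

The last two arguments are appended by multi_machine itself: worker hosts receive 0 (skip the log analysis step) and the master host receives 1, followed by the Python binary to use.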
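Taken together, the scripts introduced in this series form a single automated regression pipeline: prepare_auto_docker.sh loads the prebuilt image and starts an oneflow-auto-test container on every host, oneflow_auto_bert.sh launches the configured single-node and multi-node pretraining runs, compares each run's out.json against the baseline commit fetched from OSS, and stitches the comparison plots into pic/all.png, and stop_all_docker.sh tears the containers down again. A minimal usage sketch, assuming the commands are issued from LanguageModeling/BERT on the master host; the benchmark path is a placeholder, and the wheel name and baseline commit hash simply mirror the commented examples inside oneflow_auto_bert.sh:

    # start the test containers on the four hosts hard-coded in the scripts
    bash prepare_auto_docker.sh 4

    # run the configured precision/optimizer/accumulation/node-count combinations,
    # compare the new out.json files against the old commit's baseline,
    # and stitch the per-configuration plots into pic/all.png
    bash oneflow_auto_bert.sh /path/to/OneFlow-Benchmark \
        oneflow-0.3.5+cu112.git.325160b-cp38-cp38-linux_x86_64.whl \
        325160bcfb786b166b063e669aea345fadee2da7

    # stop the oneflow-auto-test containers once the run has finished
    bash stop_all_docker.sh 4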