From 779f4ea2732a0ef911bf60de6dcff4666f5c95c3 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Tue, 28 May 2024 22:29:34 +0800 Subject: [PATCH 1/8] cuda -> device --- .../classification/image/resnet50/check/check.py | 14 ++++++++------ Vision/classification/image/resnet50/config.py | 1 + Vision/classification/image/resnet50/graph.py | 10 ++++++---- Vision/classification/image/resnet50/infer.py | 4 ++-- .../classification/image/resnet50/models/data.py | 15 ++++++++++----- Vision/classification/image/resnet50/train.py | 13 +++++++------ 6 files changed, 34 insertions(+), 23 deletions(-) diff --git a/Vision/classification/image/resnet50/check/check.py b/Vision/classification/image/resnet50/check/check.py index 2708f0c0a..a822f6e9b 100644 --- a/Vision/classification/image/resnet50/check/check.py +++ b/Vision/classification/image/resnet50/check/check.py @@ -14,6 +14,7 @@ def _parse_args(): parser = argparse.ArgumentParser("flags for train resnet50") + parser.add_argument("--device", type=str, default="cuda", help="device: cpu, cuda...") parser.add_argument( "--save_checkpoint_path", type=str, @@ -68,8 +69,8 @@ def setup(args): graph_model = resnet50() graph_model.load_state_dict(eager_model.state_dict()) - eager_model.to("cuda") - graph_model.to("cuda") + eager_model.to(args.device) + graph_model.to(args.device) # optimizer setup eager_optimizer = flow.optim.SGD( eager_model.parameters(), lr=args.learning_rate, momentum=args.mom @@ -80,7 +81,7 @@ def setup(args): # criterion setup criterion = flow.nn.CrossEntropyLoss() - criterion = criterion.to("cuda") + criterion = criterion.to(args.device) class ModelTrainGraph(flow.nn.Graph): def __init__(self): @@ -145,6 +146,7 @@ def __init__(self, args): self.graph_eval_total_time = 0.0 self.eager_val_total_time = 0.0 + self.device = args.device self.args = args def compare_eager_graph(self, compare_dic): @@ -167,8 +169,8 @@ def compare_eager_graph(self, compare_dic): for b in range(len(train_data_loader)): image, label = train_data_loader() - image = image.to("cuda") - label = label.to("cuda") + image = image.to(self.device) + label = label.to(self.device) # oneflow graph train graph_iter_start_time = time.time() @@ -224,7 +226,7 @@ def compare_eager_graph(self, compare_dic): total_graph_infer_time, total_eager_infer_time = 0, 0 for b in tqdm(range(len(val_data_loader))): image, label = val_data_loader() - image = image.to("cuda") + image = image.to(self.device) # graph val graph_infer_time = time.time() diff --git a/Vision/classification/image/resnet50/config.py b/Vision/classification/image/resnet50/config.py index 63f3e25e2..129c8968c 100644 --- a/Vision/classification/image/resnet50/config.py +++ b/Vision/classification/image/resnet50/config.py @@ -26,6 +26,7 @@ def parse_args(ignore_unknown_args=False): parser = argparse.ArgumentParser( description="OneFlow ResNet50 Arguments", allow_abbrev=False ) + parser.add_argument("--device", type=str, default="cuda", help="device: cpu, cuda...") parser.add_argument( "--save", type=str, diff --git a/Vision/classification/image/resnet50/graph.py b/Vision/classification/image/resnet50/graph.py index dcad741ba..58ab63689 100644 --- a/Vision/classification/image/resnet50/graph.py +++ b/Vision/classification/image/resnet50/graph.py @@ -51,11 +51,12 @@ def __init__( self.cross_entropy = cross_entropy self.data_loader = data_loader self.add_optimizer(optimizer, lr_sch=lr_scheduler) + self.device = args.device def build(self): image, label = self.data_loader() - image = image.to("cuda") - label = label.to("cuda") + image = image.to(self.device) + label = label.to(self.device) logits = self.model(image) loss = self.cross_entropy(logits, label) if self.return_pred_and_label: @@ -79,11 +80,12 @@ def __init__(self, model, data_loader): self.data_loader = data_loader self.model = model + self.device = args.device def build(self): image, label = self.data_loader() - image = image.to("cuda") - label = label.to("cuda") + image = image.to(self.device) + label = label.to(self.device) logits = self.model(image) pred = logits.softmax() return pred, label diff --git a/Vision/classification/image/resnet50/infer.py b/Vision/classification/image/resnet50/infer.py index 85f19ed6a..8837ec39a 100644 --- a/Vision/classification/image/resnet50/infer.py +++ b/Vision/classification/image/resnet50/infer.py @@ -55,7 +55,7 @@ def main(args): print("***** Model Init *****") model = resnet50() model.load_state_dict(flow.load(args.model_path)) - model = model.to("cuda") + model = model.to(args.device) model.eval() end_t = time.perf_counter() print(f"***** Model Init Finish, time escapled {end_t - start_t:.6f} s *****") @@ -65,7 +65,7 @@ def main(args): start_t = end_t image = load_image(args.image_path) - image = flow.Tensor(image, device=flow.device("cuda")) + image = flow.Tensor(image, device=flow.device(args.device)) if args.graph: pred = model_graph(image) else: diff --git a/Vision/classification/image/resnet50/models/data.py b/Vision/classification/image/resnet50/models/data.py index ee8da362f..ca7e40c23 100644 --- a/Vision/classification/image/resnet50/models/data.py +++ b/Vision/classification/image/resnet50/models/data.py @@ -31,8 +31,9 @@ def make_data_loader(args, mode, is_global=False, synthetic=False): placement=placement, sbp=sbp, channel_last=args.channel_last, + device=args.device, ) - return data_loader.to("cuda") + return data_loader.to(args.device) ofrecord_data_loader = OFRecordDataLoader( ofrecord_dir=args.ofrecord_path, @@ -45,6 +46,7 @@ def make_data_loader(args, mode, is_global=False, synthetic=False): placement=placement, sbp=sbp, use_gpu_decode=args.use_gpu_decode, + device=args.device, ) return ofrecord_data_loader @@ -62,6 +64,7 @@ def __init__( placement=None, sbp=None, use_gpu_decode=False, + device="cuda", ): super().__init__() @@ -71,6 +74,7 @@ def __init__( self.total_batch_size = total_batch_size self.dataset_size = dataset_size self.mode = mode + self.device = device random_shuffle = True if mode == "train" else False shuffle_after_epoch = True if mode == "train" else False @@ -159,11 +163,11 @@ def forward(self): else: image_raw_bytes = self.image_decoder(record) image = self.resize(image_raw_bytes)[0] - image = image.to("cuda") + image = image.to(self.device) label = self.label_decoder(record) flip_code = self.flip() - flip_code = flip_code.to("cuda") + flip_code = flip_code.to(self.device) image = self.crop_mirror_norm(image, flip_code) else: record = self.ofrecord_reader() @@ -184,6 +188,7 @@ def __init__( placement=None, sbp=None, channel_last=False, + device="cuda", ): super().__init__() @@ -220,10 +225,10 @@ def __init__( ) else: self.image = flow.randint( - 0, high=256, size=self.image_shape, dtype=flow.float32, device="cuda" + 0, high=256, size=self.image_shape, dtype=flow.float32, device=device, ) self.label = flow.randint( - 0, high=self.num_classes, size=self.label_shape, device="cuda", + 0, high=self.num_classes, size=self.label_shape, device=device, ).to(dtype=flow.int32) def forward(self): diff --git a/Vision/classification/image/resnet50/train.py b/Vision/classification/image/resnet50/train.py index c1ba49ba4..3ae575040 100644 --- a/Vision/classification/image/resnet50/train.py +++ b/Vision/classification/image/resnet50/train.py @@ -26,6 +26,7 @@ class Trainer(object): def __init__(self): args = get_args() + self.device = args.device for k, v in args.__dict__.items(): setattr(self, k, v) @@ -89,12 +90,12 @@ def init_model(self): start_t = time.perf_counter() if self.is_global: - placement = flow.env.all_device_placement("cuda") + placement = flow.env.all_device_placement(self.device) self.model = self.model.to_global( placement=placement, sbp=flow.sbp.broadcast ) else: - self.model = self.model.to("cuda") + self.model = self.model.to(self.device) if self.load_path is None: self.legacy_init_parameters() @@ -311,8 +312,8 @@ def eval(self): def forward(self): image, label = self.train_data_loader() - image = image.to("cuda") - label = label.to("cuda") + image = image.to(self.device) + label = label.to(self.device) logits = self.model(image) loss = self.cross_entropy(logits, label) if self.metric_train_acc: @@ -323,8 +324,8 @@ def forward(self): def inference(self): image, label = self.val_data_loader() - image = image.to("cuda") - label = label.to("cuda") + image = image.to(self.device) + label = label.to(self.device) with flow.no_grad(): logits = self.model(image) pred = logits.softmax() From 8571548b69c22b83551a392def8855f7d6e82505 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Tue, 28 May 2024 22:34:49 +0800 Subject: [PATCH 2/8] recover --- .../classification/image/resnet50/check/check.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/Vision/classification/image/resnet50/check/check.py b/Vision/classification/image/resnet50/check/check.py index a822f6e9b..2708f0c0a 100644 --- a/Vision/classification/image/resnet50/check/check.py +++ b/Vision/classification/image/resnet50/check/check.py @@ -14,7 +14,6 @@ def _parse_args(): parser = argparse.ArgumentParser("flags for train resnet50") - parser.add_argument("--device", type=str, default="cuda", help="device: cpu, cuda...") parser.add_argument( "--save_checkpoint_path", type=str, @@ -69,8 +68,8 @@ def setup(args): graph_model = resnet50() graph_model.load_state_dict(eager_model.state_dict()) - eager_model.to(args.device) - graph_model.to(args.device) + eager_model.to("cuda") + graph_model.to("cuda") # optimizer setup eager_optimizer = flow.optim.SGD( eager_model.parameters(), lr=args.learning_rate, momentum=args.mom @@ -81,7 +80,7 @@ def setup(args): # criterion setup criterion = flow.nn.CrossEntropyLoss() - criterion = criterion.to(args.device) + criterion = criterion.to("cuda") class ModelTrainGraph(flow.nn.Graph): def __init__(self): @@ -146,7 +145,6 @@ def __init__(self, args): self.graph_eval_total_time = 0.0 self.eager_val_total_time = 0.0 - self.device = args.device self.args = args def compare_eager_graph(self, compare_dic): @@ -169,8 +167,8 @@ def compare_eager_graph(self, compare_dic): for b in range(len(train_data_loader)): image, label = train_data_loader() - image = image.to(self.device) - label = label.to(self.device) + image = image.to("cuda") + label = label.to("cuda") # oneflow graph train graph_iter_start_time = time.time() @@ -226,7 +224,7 @@ def compare_eager_graph(self, compare_dic): total_graph_infer_time, total_eager_infer_time = 0, 0 for b in tqdm(range(len(val_data_loader))): image, label = val_data_loader() - image = image.to(self.device) + image = image.to("cuda") # graph val graph_infer_time = time.time() From caa6d4ba09b6d3a4758d022bc5bc8837af4ad210 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 29 May 2024 04:17:54 +0000 Subject: [PATCH 3/8] update --- Vision/classification/image/resnet50/models/data.py | 4 ++-- Vision/classification/image/resnet50/train.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Vision/classification/image/resnet50/models/data.py b/Vision/classification/image/resnet50/models/data.py index ca7e40c23..c5b3e6958 100644 --- a/Vision/classification/image/resnet50/models/data.py +++ b/Vision/classification/image/resnet50/models/data.py @@ -163,11 +163,11 @@ def forward(self): else: image_raw_bytes = self.image_decoder(record) image = self.resize(image_raw_bytes)[0] - image = image.to(self.device) label = self.label_decoder(record) flip_code = self.flip() - flip_code = flip_code.to(self.device) + if self.use_gpu_decode: + flip_code = flip_code.to(self.device) image = self.crop_mirror_norm(image, flip_code) else: record = self.ofrecord_reader() diff --git a/Vision/classification/image/resnet50/train.py b/Vision/classification/image/resnet50/train.py index 3ae575040..2e2fded8f 100644 --- a/Vision/classification/image/resnet50/train.py +++ b/Vision/classification/image/resnet50/train.py @@ -9,6 +9,7 @@ import time import oneflow as flow +import oneflow_npu from oneflow.nn.parallel import DistributedDataParallel as ddp from config import get_args From ac2ecb7cd37659b28e90546fe29bb8d13054f2b9 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 29 May 2024 04:19:38 +0000 Subject: [PATCH 4/8] update --- Vision/classification/image/resnet50/examples/train_eager.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Vision/classification/image/resnet50/examples/train_eager.sh b/Vision/classification/image/resnet50/examples/train_eager.sh index 46461d208..e75b485a0 100644 --- a/Vision/classification/image/resnet50/examples/train_eager.sh +++ b/Vision/classification/image/resnet50/examples/train_eager.sh @@ -26,6 +26,8 @@ VAL_BATCH_SIZE=50 SRC_DIR=$(realpath $(dirname $0)/..) python3 $SRC_DIR/train.py \ + --device npu \ + --label-smoothing 0 \ --ofrecord-path $OFRECORD_PATH \ --ofrecord-part-num $OFRECORD_PART_NUM \ --num-devices-per-node 1 \ From b1908dda7b9669b216e007e739d8f75944d10bc0 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Sun, 2 Jun 2024 12:41:47 +0000 Subject: [PATCH 5/8] eager fp32 --- .../examples/train_eager_distributed_fp32.sh | 54 +++++++++++++++++++ .../image/resnet50/models/data.py | 1 + Vision/classification/image/resnet50/train.py | 2 +- 3 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh diff --git a/Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh b/Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh new file mode 100644 index 000000000..cfbd8c09c --- /dev/null +++ b/Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh @@ -0,0 +1,54 @@ +# set -aux + +DEVICE_NUM_PER_NODE=8 +MASTER_ADDR=127.0.0.1 +NUM_NODES=1 +NODE_RANK=0 + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE +# export NCCL_DEBUG=INFO +# export ONEFLOW_DEBUG_MODE=True + +CHECKPOINT_SAVE_PATH="./graph_distributed_fp32_checkpoints" +if [ ! -d "$CHECKPOINT_SAVE_PATH" ]; then + mkdir $CHECKPOINT_SAVE_PATH +fi + +#OFRECORD_PATH=PATH_TO_IMAGENET_OFRECORD +OFRECORD_PATH="/data0/datasets/ImageNet/ofrecord" + +OFRECORD_PART_NUM=256 +LEARNING_RATE=0.768 +MOM=0.875 +EPOCH=50 +TRAIN_BATCH_SIZE=96 +VAL_BATCH_SIZE=50 + +# SRC_DIR=/path/to/models/resnet50 +SRC_DIR=$(realpath $(dirname $0)/..) + +python3 -m oneflow.distributed.launch \ + --nproc_per_node $DEVICE_NUM_PER_NODE \ + --nnodes $NUM_NODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + $SRC_DIR/train.py \ + --device npu \ + --label-smoothing 0 \ + --print-interval 100 \ + --save $CHECKPOINT_SAVE_PATH \ + --ofrecord-path $OFRECORD_PATH \ + --ofrecord-part-num $OFRECORD_PART_NUM \ + --num-devices-per-node $DEVICE_NUM_PER_NODE \ + --lr $LEARNING_RATE \ + --momentum $MOM \ + --num-epochs $EPOCH \ + --train-batch-size $TRAIN_BATCH_SIZE \ + --val-batch-size $VAL_BATCH_SIZE \ + --scale-grad \ + #--graph \ + #--fuse-bn-relu \ + #--fuse-bn-add-relu \ diff --git a/Vision/classification/image/resnet50/models/data.py b/Vision/classification/image/resnet50/models/data.py index c5b3e6958..2f3cbefa9 100644 --- a/Vision/classification/image/resnet50/models/data.py +++ b/Vision/classification/image/resnet50/models/data.py @@ -167,6 +167,7 @@ def forward(self): label = self.label_decoder(record) flip_code = self.flip() if self.use_gpu_decode: + # todo NPU: image will down grade to cpu flip_code = flip_code.to(self.device) image = self.crop_mirror_norm(image, flip_code) else: diff --git a/Vision/classification/image/resnet50/train.py b/Vision/classification/image/resnet50/train.py index 2e2fded8f..43d54617d 100644 --- a/Vision/classification/image/resnet50/train.py +++ b/Vision/classification/image/resnet50/train.py @@ -278,7 +278,7 @@ def train_eager(self): param.grad /= self.world_size else: loss.backward() - loss = loss / self.world_size + #loss = loss / self.world_size self.optimizer.step() self.optimizer.zero_grad() From ba00c54c48adc420a80bde556125c7927002fce9 Mon Sep 17 00:00:00 2001 From: 0x404 <871206929@qq.com> Date: Tue, 16 Jul 2024 04:02:06 +0000 Subject: [PATCH 6/8] experiment: run resnet50 with graph on npu device --- .../resnet50/examples/train_graph_distributed_fp32.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) mode change 100644 => 100755 Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh diff --git a/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh b/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh old mode 100644 new mode 100755 index 038b1c812..f18654015 --- a/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh +++ b/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh @@ -1,6 +1,6 @@ # set -aux -DEVICE_NUM_PER_NODE=8 +DEVICE_NUM_PER_NODE=1 MASTER_ADDR=127.0.0.1 NUM_NODES=1 NODE_RANK=0 @@ -17,7 +17,7 @@ if [ ! -d "$CHECKPOINT_SAVE_PATH" ]; then mkdir $CHECKPOINT_SAVE_PATH fi -OFRECORD_PATH=PATH_TO_IMAGENET_OFRECORD +OFRECORD_PATH="/data0/datasets/ImageNet/ofrecord" OFRECORD_PART_NUM=256 LEARNING_RATE=0.768 @@ -35,6 +35,9 @@ python3 -m oneflow.distributed.launch \ --node_rank $NODE_RANK \ --master_addr $MASTER_ADDR \ $SRC_DIR/train.py \ + --device npu \ + --label-smoothing 0 \ + --print-interval 10 \ --save $CHECKPOINT_SAVE_PATH \ --ofrecord-path $OFRECORD_PATH \ --ofrecord-part-num $OFRECORD_PART_NUM \ @@ -44,7 +47,6 @@ python3 -m oneflow.distributed.launch \ --num-epochs $EPOCH \ --train-batch-size $TRAIN_BATCH_SIZE \ --val-batch-size $VAL_BATCH_SIZE \ - --use-gpu-decode \ --scale-grad \ --graph \ --fuse-bn-relu \ From 6b0a86b8e1c1a13cc27ada62ce680d319eeb7956 Mon Sep 17 00:00:00 2001 From: 0x404 <871206929@qq.com> Date: Tue, 16 Jul 2024 04:13:48 +0000 Subject: [PATCH 7/8] remove fused kernel options since we don't support them yet --- .../image/resnet50/examples/train_graph_distributed_fp32.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh b/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh index f18654015..81cf7ea1f 100755 --- a/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh +++ b/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh @@ -49,5 +49,4 @@ python3 -m oneflow.distributed.launch \ --val-batch-size $VAL_BATCH_SIZE \ --scale-grad \ --graph \ - --fuse-bn-relu \ - --fuse-bn-add-relu \ + From f69286a631aa7f77ad4985f395bfa55a84a6f5e7 Mon Sep 17 00:00:00 2001 From: 0x404 <871206929@qq.com> Date: Mon, 12 Aug 2024 12:37:55 +0000 Subject: [PATCH 8/8] update scripts to run graph mode on npu device --- .../classification/image/resnet50/examples/train_graph.sh | 8 ++++---- .../resnet50/examples/train_graph_distributed_fp32.sh | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) mode change 100644 => 100755 Vision/classification/image/resnet50/examples/train_graph.sh diff --git a/Vision/classification/image/resnet50/examples/train_graph.sh b/Vision/classification/image/resnet50/examples/train_graph.sh old mode 100644 new mode 100755 index 3e267e0bf..7636391a7 --- a/Vision/classification/image/resnet50/examples/train_graph.sh +++ b/Vision/classification/image/resnet50/examples/train_graph.sh @@ -8,7 +8,7 @@ if [ ! -d "$CHECKPOINT_SAVE_PATH" ]; then mkdir $CHECKPOINT_SAVE_PATH fi -OFRECORD_PATH="./mini-imagenet/ofrecord" +OFRECORD_PATH="/data0/datasets/ImageNet/ofrecord" if [ ! -d "$OFRECORD_PATH" ]; then wget https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/mini-imagenet.zip @@ -26,6 +26,8 @@ VAL_BATCH_SIZE=50 SRC_DIR=$(realpath $(dirname $0)/..) python3 $SRC_DIR/train.py \ + --device npu \ + --label-smoothing 0 \ --ofrecord-path $OFRECORD_PATH \ --ofrecord-part-num $OFRECORD_PART_NUM \ --num-devices-per-node 1 \ @@ -35,9 +37,7 @@ python3 $SRC_DIR/train.py \ --warmup-epochs 0 \ --train-batch-size $TRAIN_BATCH_SIZE \ --val-batch-size $VAL_BATCH_SIZE \ - --save $CHECKPOINT_SAVE_PATH \ --samples-per-epoch 50 \ --val-samples-per-epoch 50 \ - --use-gpu-decode \ - --scale-grad \ --graph \ + --skip-eval \ diff --git a/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh b/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh index 81cf7ea1f..27c748aa0 100755 --- a/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh +++ b/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh @@ -37,8 +37,7 @@ python3 -m oneflow.distributed.launch \ $SRC_DIR/train.py \ --device npu \ --label-smoothing 0 \ - --print-interval 10 \ - --save $CHECKPOINT_SAVE_PATH \ + --print-interval 100 \ --ofrecord-path $OFRECORD_PATH \ --ofrecord-part-num $OFRECORD_PART_NUM \ --num-devices-per-node $DEVICE_NUM_PER_NODE \ @@ -47,6 +46,7 @@ python3 -m oneflow.distributed.launch \ --num-epochs $EPOCH \ --train-batch-size $TRAIN_BATCH_SIZE \ --val-batch-size $VAL_BATCH_SIZE \ - --scale-grad \ --graph \ + --skip-eval \ + # --scale-grad \