From 779f4ea2732a0ef911bf60de6dcff4666f5c95c3 Mon Sep 17 00:00:00 2001
From: ShawnXuan <xiexuanx2@gmail.com>
Date: Tue, 28 May 2024 22:29:34 +0800
Subject: [PATCH 1/8] cuda -> device

---
 .../classification/image/resnet50/check/check.py  | 14 ++++++++------
 Vision/classification/image/resnet50/config.py    |  1 +
 Vision/classification/image/resnet50/graph.py     | 10 ++++++----
 Vision/classification/image/resnet50/infer.py     |  4 ++--
 .../classification/image/resnet50/models/data.py  | 15 ++++++++++-----
 Vision/classification/image/resnet50/train.py     | 13 +++++++------
 6 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/Vision/classification/image/resnet50/check/check.py b/Vision/classification/image/resnet50/check/check.py
index 2708f0c0a..a822f6e9b 100644
--- a/Vision/classification/image/resnet50/check/check.py
+++ b/Vision/classification/image/resnet50/check/check.py
@@ -14,6 +14,7 @@
 
 def _parse_args():
     parser = argparse.ArgumentParser("flags for train resnet50")
+    parser.add_argument("--device", type=str, default="cuda", help="device: cpu, cuda...")
     parser.add_argument(
         "--save_checkpoint_path",
         type=str,
@@ -68,8 +69,8 @@ def setup(args):
     graph_model = resnet50()
     graph_model.load_state_dict(eager_model.state_dict())
 
-    eager_model.to("cuda")
-    graph_model.to("cuda")
+    eager_model.to(args.device)
+    graph_model.to(args.device)
     # optimizer setup
     eager_optimizer = flow.optim.SGD(
         eager_model.parameters(), lr=args.learning_rate, momentum=args.mom
@@ -80,7 +81,7 @@ def setup(args):
 
     # criterion setup
     criterion = flow.nn.CrossEntropyLoss()
-    criterion = criterion.to("cuda")
+    criterion = criterion.to(args.device)
 
     class ModelTrainGraph(flow.nn.Graph):
         def __init__(self):
@@ -145,6 +146,7 @@ def __init__(self, args):
         self.graph_eval_total_time = 0.0
         self.eager_val_total_time = 0.0
 
+        self.device = args.device
         self.args = args
 
     def compare_eager_graph(self, compare_dic):
@@ -167,8 +169,8 @@ def compare_eager_graph(self, compare_dic):
 
             for b in range(len(train_data_loader)):
                 image, label = train_data_loader()
-                image = image.to("cuda")
-                label = label.to("cuda")
+                image = image.to(self.device)
+                label = label.to(self.device)
 
                 # oneflow graph train
                 graph_iter_start_time = time.time()
@@ -224,7 +226,7 @@ def compare_eager_graph(self, compare_dic):
             total_graph_infer_time, total_eager_infer_time = 0, 0
             for b in tqdm(range(len(val_data_loader))):
                 image, label = val_data_loader()
-                image = image.to("cuda")
+                image = image.to(self.device)
 
                 # graph val
                 graph_infer_time = time.time()
diff --git a/Vision/classification/image/resnet50/config.py b/Vision/classification/image/resnet50/config.py
index 63f3e25e2..129c8968c 100644
--- a/Vision/classification/image/resnet50/config.py
+++ b/Vision/classification/image/resnet50/config.py
@@ -26,6 +26,7 @@ def parse_args(ignore_unknown_args=False):
     parser = argparse.ArgumentParser(
         description="OneFlow ResNet50 Arguments", allow_abbrev=False
     )
+    parser.add_argument("--device", type=str, default="cuda", help="device: cpu, cuda...")
     parser.add_argument(
         "--save",
         type=str,
diff --git a/Vision/classification/image/resnet50/graph.py b/Vision/classification/image/resnet50/graph.py
index dcad741ba..58ab63689 100644
--- a/Vision/classification/image/resnet50/graph.py
+++ b/Vision/classification/image/resnet50/graph.py
@@ -51,11 +51,12 @@ def __init__(
         self.cross_entropy = cross_entropy
         self.data_loader = data_loader
         self.add_optimizer(optimizer, lr_sch=lr_scheduler)
+        self.device = args.device
 
     def build(self):
         image, label = self.data_loader()
-        image = image.to("cuda")
-        label = label.to("cuda")
+        image = image.to(self.device)
+        label = label.to(self.device)
         logits = self.model(image)
         loss = self.cross_entropy(logits, label)
         if self.return_pred_and_label:
@@ -79,11 +80,12 @@ def __init__(self, model, data_loader):
 
         self.data_loader = data_loader
         self.model = model
+        self.device = args.device
 
     def build(self):
         image, label = self.data_loader()
-        image = image.to("cuda")
-        label = label.to("cuda")
+        image = image.to(self.device)
+        label = label.to(self.device)
         logits = self.model(image)
         pred = logits.softmax()
         return pred, label
diff --git a/Vision/classification/image/resnet50/infer.py b/Vision/classification/image/resnet50/infer.py
index 85f19ed6a..8837ec39a 100644
--- a/Vision/classification/image/resnet50/infer.py
+++ b/Vision/classification/image/resnet50/infer.py
@@ -55,7 +55,7 @@ def main(args):
     print("***** Model Init *****")
     model = resnet50()
     model.load_state_dict(flow.load(args.model_path))
-    model = model.to("cuda")
+    model = model.to(args.device)
     model.eval()
     end_t = time.perf_counter()
     print(f"***** Model Init Finish, time escapled {end_t - start_t:.6f} s *****")
@@ -65,7 +65,7 @@ def main(args):
 
     start_t = end_t
     image = load_image(args.image_path)
-    image = flow.Tensor(image, device=flow.device("cuda"))
+    image = flow.Tensor(image, device=flow.device(args.device))
     if args.graph:
         pred = model_graph(image)
     else:
diff --git a/Vision/classification/image/resnet50/models/data.py b/Vision/classification/image/resnet50/models/data.py
index ee8da362f..ca7e40c23 100644
--- a/Vision/classification/image/resnet50/models/data.py
+++ b/Vision/classification/image/resnet50/models/data.py
@@ -31,8 +31,9 @@ def make_data_loader(args, mode, is_global=False, synthetic=False):
             placement=placement,
             sbp=sbp,
             channel_last=args.channel_last,
+            device=args.device,
         )
-        return data_loader.to("cuda")
+        return data_loader.to(args.device)
 
     ofrecord_data_loader = OFRecordDataLoader(
         ofrecord_dir=args.ofrecord_path,
@@ -45,6 +46,7 @@ def make_data_loader(args, mode, is_global=False, synthetic=False):
         placement=placement,
         sbp=sbp,
         use_gpu_decode=args.use_gpu_decode,
+        device=args.device,
     )
     return ofrecord_data_loader
 
@@ -62,6 +64,7 @@ def __init__(
         placement=None,
         sbp=None,
         use_gpu_decode=False,
+        device="cuda",
     ):
         super().__init__()
 
@@ -71,6 +74,7 @@ def __init__(
         self.total_batch_size = total_batch_size
         self.dataset_size = dataset_size
         self.mode = mode
+        self.device = device
 
         random_shuffle = True if mode == "train" else False
         shuffle_after_epoch = True if mode == "train" else False
@@ -159,11 +163,11 @@ def forward(self):
             else:
                 image_raw_bytes = self.image_decoder(record)
                 image = self.resize(image_raw_bytes)[0]
-                image = image.to("cuda")
+                image = image.to(self.device)
 
             label = self.label_decoder(record)
             flip_code = self.flip()
-            flip_code = flip_code.to("cuda")
+            flip_code = flip_code.to(self.device)
             image = self.crop_mirror_norm(image, flip_code)
         else:
             record = self.ofrecord_reader()
@@ -184,6 +188,7 @@ def __init__(
         placement=None,
         sbp=None,
         channel_last=False,
+        device="cuda",
     ):
         super().__init__()
 
@@ -220,10 +225,10 @@ def __init__(
             )
         else:
             self.image = flow.randint(
-                0, high=256, size=self.image_shape, dtype=flow.float32, device="cuda"
+                0, high=256, size=self.image_shape, dtype=flow.float32, device=device,
             )
             self.label = flow.randint(
-                0, high=self.num_classes, size=self.label_shape, device="cuda",
+                0, high=self.num_classes, size=self.label_shape, device=device,
             ).to(dtype=flow.int32)
 
     def forward(self):
diff --git a/Vision/classification/image/resnet50/train.py b/Vision/classification/image/resnet50/train.py
index c1ba49ba4..3ae575040 100644
--- a/Vision/classification/image/resnet50/train.py
+++ b/Vision/classification/image/resnet50/train.py
@@ -26,6 +26,7 @@
 class Trainer(object):
     def __init__(self):
         args = get_args()
+        self.device = args.device
         for k, v in args.__dict__.items():
             setattr(self, k, v)
 
@@ -89,12 +90,12 @@ def init_model(self):
         start_t = time.perf_counter()
 
         if self.is_global:
-            placement = flow.env.all_device_placement("cuda")
+            placement = flow.env.all_device_placement(self.device)
             self.model = self.model.to_global(
                 placement=placement, sbp=flow.sbp.broadcast
             )
         else:
-            self.model = self.model.to("cuda")
+            self.model = self.model.to(self.device)
 
         if self.load_path is None:
             self.legacy_init_parameters()
@@ -311,8 +312,8 @@ def eval(self):
 
     def forward(self):
         image, label = self.train_data_loader()
-        image = image.to("cuda")
-        label = label.to("cuda")
+        image = image.to(self.device)
+        label = label.to(self.device)
         logits = self.model(image)
         loss = self.cross_entropy(logits, label)
         if self.metric_train_acc:
@@ -323,8 +324,8 @@ def forward(self):
 
     def inference(self):
         image, label = self.val_data_loader()
-        image = image.to("cuda")
-        label = label.to("cuda")
+        image = image.to(self.device)
+        label = label.to(self.device)
         with flow.no_grad():
             logits = self.model(image)
             pred = logits.softmax()

From 8571548b69c22b83551a392def8855f7d6e82505 Mon Sep 17 00:00:00 2001
From: ShawnXuan <xiexuanx2@gmail.com>
Date: Tue, 28 May 2024 22:34:49 +0800
Subject: [PATCH 2/8] recover check.py: revert the cuda -> device change

---
 .../classification/image/resnet50/check/check.py   | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/Vision/classification/image/resnet50/check/check.py b/Vision/classification/image/resnet50/check/check.py
index a822f6e9b..2708f0c0a 100644
--- a/Vision/classification/image/resnet50/check/check.py
+++ b/Vision/classification/image/resnet50/check/check.py
@@ -14,7 +14,6 @@
 
 def _parse_args():
     parser = argparse.ArgumentParser("flags for train resnet50")
-    parser.add_argument("--device", type=str, default="cuda", help="device: cpu, cuda...")
     parser.add_argument(
         "--save_checkpoint_path",
         type=str,
@@ -69,8 +68,8 @@ def setup(args):
     graph_model = resnet50()
     graph_model.load_state_dict(eager_model.state_dict())
 
-    eager_model.to(args.device)
-    graph_model.to(args.device)
+    eager_model.to("cuda")
+    graph_model.to("cuda")
     # optimizer setup
     eager_optimizer = flow.optim.SGD(
         eager_model.parameters(), lr=args.learning_rate, momentum=args.mom
@@ -81,7 +80,7 @@ def setup(args):
 
     # criterion setup
     criterion = flow.nn.CrossEntropyLoss()
-    criterion = criterion.to(args.device)
+    criterion = criterion.to("cuda")
 
     class ModelTrainGraph(flow.nn.Graph):
         def __init__(self):
@@ -146,7 +145,6 @@ def __init__(self, args):
         self.graph_eval_total_time = 0.0
         self.eager_val_total_time = 0.0
 
-        self.device = args.device
         self.args = args
 
     def compare_eager_graph(self, compare_dic):
@@ -169,8 +167,8 @@ def compare_eager_graph(self, compare_dic):
 
             for b in range(len(train_data_loader)):
                 image, label = train_data_loader()
-                image = image.to(self.device)
-                label = label.to(self.device)
+                image = image.to("cuda")
+                label = label.to("cuda")
 
                 # oneflow graph train
                 graph_iter_start_time = time.time()
@@ -226,7 +224,7 @@ def compare_eager_graph(self, compare_dic):
             total_graph_infer_time, total_eager_infer_time = 0, 0
             for b in tqdm(range(len(val_data_loader))):
                 image, label = val_data_loader()
-                image = image.to(self.device)
+                image = image.to("cuda")
 
                 # graph val
                 graph_infer_time = time.time()

From caa6d4ba09b6d3a4758d022bc5bc8837af4ad210 Mon Sep 17 00:00:00 2001
From: ShawnXuan <xiexuanx2@gmail.com>
Date: Wed, 29 May 2024 04:17:54 +0000
Subject: [PATCH 3/8] update data loader device moves and import oneflow_npu

---
 Vision/classification/image/resnet50/models/data.py | 4 ++--
 Vision/classification/image/resnet50/train.py       | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/Vision/classification/image/resnet50/models/data.py b/Vision/classification/image/resnet50/models/data.py
index ca7e40c23..c5b3e6958 100644
--- a/Vision/classification/image/resnet50/models/data.py
+++ b/Vision/classification/image/resnet50/models/data.py
@@ -163,11 +163,11 @@ def forward(self):
             else:
                 image_raw_bytes = self.image_decoder(record)
                 image = self.resize(image_raw_bytes)[0]
-                image = image.to(self.device)
 
             label = self.label_decoder(record)
             flip_code = self.flip()
-            flip_code = flip_code.to(self.device)
+            if self.use_gpu_decode:
+                flip_code = flip_code.to(self.device)
             image = self.crop_mirror_norm(image, flip_code)
         else:
             record = self.ofrecord_reader()
diff --git a/Vision/classification/image/resnet50/train.py b/Vision/classification/image/resnet50/train.py
index 3ae575040..2e2fded8f 100644
--- a/Vision/classification/image/resnet50/train.py
+++ b/Vision/classification/image/resnet50/train.py
@@ -9,6 +9,7 @@
 import time
 
 import oneflow as flow
+import oneflow_npu
 from oneflow.nn.parallel import DistributedDataParallel as ddp
 
 from config import get_args

From ac2ecb7cd37659b28e90546fe29bb8d13054f2b9 Mon Sep 17 00:00:00 2001
From: ShawnXuan <xiexuanx2@gmail.com>
Date: Wed, 29 May 2024 04:19:38 +0000
Subject: [PATCH 4/8] update train_eager.sh to run on npu device

---
 Vision/classification/image/resnet50/examples/train_eager.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Vision/classification/image/resnet50/examples/train_eager.sh b/Vision/classification/image/resnet50/examples/train_eager.sh
index 46461d208..e75b485a0 100644
--- a/Vision/classification/image/resnet50/examples/train_eager.sh
+++ b/Vision/classification/image/resnet50/examples/train_eager.sh
@@ -26,6 +26,8 @@ VAL_BATCH_SIZE=50
 SRC_DIR=$(realpath $(dirname $0)/..)
 
 python3 $SRC_DIR/train.py \
+    --device npu \
+    --label-smoothing 0 \
     --ofrecord-path $OFRECORD_PATH \
     --ofrecord-part-num $OFRECORD_PART_NUM \
     --num-devices-per-node 1 \

From b1908dda7b9669b216e007e739d8f75944d10bc0 Mon Sep 17 00:00:00 2001
From: ShawnXuan <xiexuanx2@gmail.com>
Date: Sun, 2 Jun 2024 12:41:47 +0000
Subject: [PATCH 5/8] eager fp32

---
 .../examples/train_eager_distributed_fp32.sh  | 54 +++++++++++++++++++
 .../image/resnet50/models/data.py             |  1 +
 Vision/classification/image/resnet50/train.py |  2 +-
 3 files changed, 56 insertions(+), 1 deletion(-)
 create mode 100644 Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh

diff --git a/Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh b/Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh
new file mode 100644
index 000000000..cfbd8c09c
--- /dev/null
+++ b/Vision/classification/image/resnet50/examples/train_eager_distributed_fp32.sh
@@ -0,0 +1,54 @@
+# Launch distributed eager-mode fp32 ResNet50 training on NPU devices.
+# set -aux
+
+DEVICE_NUM_PER_NODE=8
+MASTER_ADDR=127.0.0.1
+NUM_NODES=1
+NODE_RANK=0
+
+export PYTHONUNBUFFERED=1
+echo "PYTHONUNBUFFERED=$PYTHONUNBUFFERED"
+export NCCL_LAUNCH_MODE=PARALLEL
+echo "NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE"
+# export NCCL_DEBUG=INFO
+# export ONEFLOW_DEBUG_MODE=True
+
+CHECKPOINT_SAVE_PATH="./graph_distributed_fp32_checkpoints"
+# -p makes this a no-op when the directory already exists.
+mkdir -p "$CHECKPOINT_SAVE_PATH"
+
+#OFRECORD_PATH=PATH_TO_IMAGENET_OFRECORD
+OFRECORD_PATH="/data0/datasets/ImageNet/ofrecord"
+
+OFRECORD_PART_NUM=256
+LEARNING_RATE=0.768
+MOM=0.875
+EPOCH=50
+TRAIN_BATCH_SIZE=96
+VAL_BATCH_SIZE=50
+
+# SRC_DIR=/path/to/models/resnet50
+SRC_DIR=$(realpath "$(dirname "$0")/..")
+
+python3 -m oneflow.distributed.launch \
+    --nproc_per_node "$DEVICE_NUM_PER_NODE" \
+    --nnodes "$NUM_NODES" \
+    --node_rank "$NODE_RANK" \
+    --master_addr "$MASTER_ADDR" \
+    "$SRC_DIR/train.py" \
+        --device npu \
+        --label-smoothing 0 \
+        --print-interval 100 \
+        --save "$CHECKPOINT_SAVE_PATH" \
+        --ofrecord-path "$OFRECORD_PATH" \
+        --ofrecord-part-num "$OFRECORD_PART_NUM" \
+        --num-devices-per-node "$DEVICE_NUM_PER_NODE" \
+        --lr "$LEARNING_RATE" \
+        --momentum "$MOM" \
+        --num-epochs "$EPOCH" \
+        --train-batch-size "$TRAIN_BATCH_SIZE" \
+        --val-batch-size "$VAL_BATCH_SIZE" \
+        --scale-grad
+        #--graph \
+        #--fuse-bn-relu \
+        #--fuse-bn-add-relu \
diff --git a/Vision/classification/image/resnet50/models/data.py b/Vision/classification/image/resnet50/models/data.py
index c5b3e6958..2f3cbefa9 100644
--- a/Vision/classification/image/resnet50/models/data.py
+++ b/Vision/classification/image/resnet50/models/data.py
@@ -167,6 +167,7 @@ def forward(self):
             label = self.label_decoder(record)
             flip_code = self.flip()
             if self.use_gpu_decode:
+                # TODO(NPU): the image will be downgraded to CPU
                 flip_code = flip_code.to(self.device)
             image = self.crop_mirror_norm(image, flip_code)
         else:
diff --git a/Vision/classification/image/resnet50/train.py b/Vision/classification/image/resnet50/train.py
index 2e2fded8f..43d54617d 100644
--- a/Vision/classification/image/resnet50/train.py
+++ b/Vision/classification/image/resnet50/train.py
@@ -278,7 +278,7 @@ def train_eager(self):
                     param.grad /= self.world_size
         else:
             loss.backward()
-            loss = loss / self.world_size
+            #loss = loss / self.world_size
 
         self.optimizer.step()
         self.optimizer.zero_grad()

From ba00c54c48adc420a80bde556125c7927002fce9 Mon Sep 17 00:00:00 2001
From: 0x404 <871206929@qq.com>
Date: Tue, 16 Jul 2024 04:02:06 +0000
Subject: [PATCH 6/8] experiment: run resnet50 with graph on npu device

---
 .../resnet50/examples/train_graph_distributed_fp32.sh     | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)
 mode change 100644 => 100755 Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh

diff --git a/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh b/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh
old mode 100644
new mode 100755
index 038b1c812..f18654015
--- a/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh
+++ b/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh
@@ -1,6 +1,6 @@
 # set -aux
 
-DEVICE_NUM_PER_NODE=8
+DEVICE_NUM_PER_NODE=1
 MASTER_ADDR=127.0.0.1
 NUM_NODES=1
 NODE_RANK=0
@@ -17,7 +17,7 @@ if [ ! -d "$CHECKPOINT_SAVE_PATH" ]; then
     mkdir $CHECKPOINT_SAVE_PATH
 fi
 
-OFRECORD_PATH=PATH_TO_IMAGENET_OFRECORD
+OFRECORD_PATH="/data0/datasets/ImageNet/ofrecord"
 
 OFRECORD_PART_NUM=256
 LEARNING_RATE=0.768
@@ -35,6 +35,9 @@ python3 -m oneflow.distributed.launch \
     --node_rank $NODE_RANK \
     --master_addr $MASTER_ADDR \
     $SRC_DIR/train.py \
+        --device npu \
+        --label-smoothing 0 \
+        --print-interval 10 \
         --save $CHECKPOINT_SAVE_PATH \
         --ofrecord-path $OFRECORD_PATH \
         --ofrecord-part-num $OFRECORD_PART_NUM \
@@ -44,7 +47,6 @@ python3 -m oneflow.distributed.launch \
         --num-epochs $EPOCH \
         --train-batch-size $TRAIN_BATCH_SIZE \
         --val-batch-size $VAL_BATCH_SIZE \
-        --use-gpu-decode \
         --scale-grad \
         --graph \
         --fuse-bn-relu \

From 6b0a86b8e1c1a13cc27ada62ce680d319eeb7956 Mon Sep 17 00:00:00 2001
From: 0x404 <871206929@qq.com>
Date: Tue, 16 Jul 2024 04:13:48 +0000
Subject: [PATCH 7/8] remove fused kernel options since we don't support them
 yet

---
 .../image/resnet50/examples/train_graph_distributed_fp32.sh    | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh b/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh
index f18654015..81cf7ea1f 100755
--- a/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh
+++ b/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh
@@ -49,5 +49,4 @@ python3 -m oneflow.distributed.launch \
         --val-batch-size $VAL_BATCH_SIZE \
         --scale-grad \
         --graph \
-        --fuse-bn-relu \
-        --fuse-bn-add-relu \
+

From f69286a631aa7f77ad4985f395bfa55a84a6f5e7 Mon Sep 17 00:00:00 2001
From: 0x404 <871206929@qq.com>
Date: Mon, 12 Aug 2024 12:37:55 +0000
Subject: [PATCH 8/8] update scripts to run graph mode on npu device

---
 .../classification/image/resnet50/examples/train_graph.sh | 8 ++++----
 .../resnet50/examples/train_graph_distributed_fp32.sh     | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)
 mode change 100644 => 100755 Vision/classification/image/resnet50/examples/train_graph.sh

diff --git a/Vision/classification/image/resnet50/examples/train_graph.sh b/Vision/classification/image/resnet50/examples/train_graph.sh
old mode 100644
new mode 100755
index 3e267e0bf..7636391a7
--- a/Vision/classification/image/resnet50/examples/train_graph.sh
+++ b/Vision/classification/image/resnet50/examples/train_graph.sh
@@ -8,7 +8,7 @@ if [ ! -d "$CHECKPOINT_SAVE_PATH" ]; then
     mkdir $CHECKPOINT_SAVE_PATH
 fi
 
-OFRECORD_PATH="./mini-imagenet/ofrecord"
+OFRECORD_PATH="/data0/datasets/ImageNet/ofrecord"
 
 if [ ! -d "$OFRECORD_PATH" ]; then
     wget https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/mini-imagenet.zip
@@ -26,6 +26,8 @@ VAL_BATCH_SIZE=50
 SRC_DIR=$(realpath $(dirname $0)/..)
 
 python3 $SRC_DIR/train.py \
+    --device npu \
+    --label-smoothing 0 \
     --ofrecord-path $OFRECORD_PATH \
     --ofrecord-part-num $OFRECORD_PART_NUM \
     --num-devices-per-node 1 \
@@ -35,9 +37,7 @@ python3 $SRC_DIR/train.py \
     --warmup-epochs 0 \
     --train-batch-size $TRAIN_BATCH_SIZE \
     --val-batch-size $VAL_BATCH_SIZE \
-    --save $CHECKPOINT_SAVE_PATH \
     --samples-per-epoch 50 \
     --val-samples-per-epoch 50 \
-    --use-gpu-decode \
-    --scale-grad \
     --graph \
+    --skip-eval \
diff --git a/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh b/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh
index 81cf7ea1f..27c748aa0 100755
--- a/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh
+++ b/Vision/classification/image/resnet50/examples/train_graph_distributed_fp32.sh
@@ -37,8 +37,7 @@ python3 -m oneflow.distributed.launch \
     $SRC_DIR/train.py \
         --device npu \
         --label-smoothing 0 \
-        --print-interval 10 \
-        --save $CHECKPOINT_SAVE_PATH \
+        --print-interval 100 \
         --ofrecord-path $OFRECORD_PATH \
         --ofrecord-part-num $OFRECORD_PART_NUM \
         --num-devices-per-node $DEVICE_NUM_PER_NODE \
@@ -47,6 +46,7 @@ python3 -m oneflow.distributed.launch \
         --num-epochs $EPOCH \
         --train-batch-size $TRAIN_BATCH_SIZE \
         --val-batch-size $VAL_BATCH_SIZE \
-        --scale-grad \
         --graph \
+        --skip-eval \
+        # --scale-grad \