# Patch overview (review notes; the patch text below is unchanged).
#
# fluid/SE-ResNeXt-152/train.py
#   * net_conf, add_optimizer and train_parallel_exe gain print() calls that
#     report len(...block(0).ops) for the default startup and main programs.
#   * add_optimizer: the Momentum optimizer's piecewise_decay learning-rate
#     schedule is replaced by a constant learning_rate=0.01; the previous
#     configuration is kept as commented-out code.
#   * train_parallel_exe: exit(0) is inserted right before the
#     fluid.ParallelExecutor is constructed.
#     NOTE(review): this looks like leftover debugging -- with it in place the
#     executor is never built; confirm before merging.
#   * The args.parallel_mode dispatch is commented out, so
#     train_parallel_exe(args) is always called and train_parallel_do is not.
#
# fluid/resnet50.py
#   * `import paddle.v2 as paddle` becomes `import paddle`.
#
# se_resnext_v2/model.py (new file)
#   * conv_bn_layer: conv2d with bias_attr=False and use_cudnn=False, followed
#     by batch_norm carrying the requested activation.
#   * squeeze_excitation: global average pool -> fc(relu) -> fc(relu) ->
#     elementwise_mul with the input. Both `stdv` values are computed but
#     never used.
#   * shortcut / shortcut_old: return the input unchanged or a conv_bn_layer
#     projection, depending on the channel count (and, for shortcut, stride).
#   * bottleneck_block: three conv_bn_layer calls plus a shortcut add; the
#     squeeze_excitation call is commented out (`scale = conv2`).
#   * SE_ResNeXt: hard-codes depth [3, 4, 6, 3] and ignores its `layers` and
#     `infer` arguments (the selecting code is commented out); returns a list
#     whose last element is the softmax output.
#   * lenet: two conv2d/pool2d stages and two fc layers; the final fc size is
#     fixed at 10 and `class_dim` is not used.
#   NOTE(review): resnet_imagenet reads `basicblock`, `bottleneck`,
#     `layer_warp` and `depth`, none of which are defined or imported in this
#     file; it passes `ch_out=` and `padding=` to conv_bn_layer, which accepts
#     neither keyword; and it returns `Out` although the local is `out`.
#     Calling it would presumably fail -- confirm whether it is dead code.
#   NOTE(review): `padding=(filter_size - 1) / 2` and
#     `size=num_channels / reduction_ratio` use `/`; under Python 3 these are
#     floats. Confirm the target interpreter or switch to `//`.
#
# se_resnext_v2/parse_log.sh (new file)
#   * Prints the second comma-separated field of mem.log through
#     `uniq -u | sort`.
# se_resnext_v2/parse_mem.py (new file, no content in this patch)
#
# se_resnext_v2/run.sh (new file)
#   * Exports MKL/OMP/KMP, CUDA and library-path environment variables, starts
#     nvidia-smi in the background writing mem.log, then runs
#     `python train.py --iterations=10` with output appended to train.log.
#   NOTE(review): `sudo rm -y train.log mem.log` -- `-y` does not appear to be
#     an rm option; presumably `-f` was intended. Confirm.
#
# se_resnext_v2/train.py (new file)
#   * parse_args: --batch_size (default 128) and --iterations (default 35).
#   * cosine_decay: defined but not called anywhere in this file.
#   * train_parallel_exe: builds SE_ResNeXt on a float64 'image' input, adds
#     top-1/top-5 accuracy, cross-entropy loss and SGD(learning_rate=0.01),
#     then iterates over the flowers training reader.
#   NOTE(review): exit(0) sits inside the batch loop before exe.run(...), so
#     as written no training step executes.
#   NOTE(review): `train_exe` (ParallelExecutor) and `feeder` are created but
#     the loop calls exe.run(...) with a hand-built feed dict; the
#     `learning_rate` and `lr_strategy` parameters are never read.
#   NOTE(review): `np.array(map(lambda ...))` relies on map() returning a
#     list (Python 2 behaviour) -- confirm the target interpreter.
#
diff --git a/fluid/SE-ResNeXt-152/train.py b/fluid/SE-ResNeXt-152/train.py index 3ae9ca2..0466ea7 100644 --- a/fluid/SE-ResNeXt-152/train.py +++ b/fluid/SE-ResNeXt-152/train.py @@ -196,7 +196,11 @@ def SE_ResNeXt(input, class_dim, infer=False, layers=152): def net_conf(image, label, class_dim): + print("startup ops : ", len(fluid.default_startup_program().block(0).ops)) + print("main ops : ", len(fluid.default_main_program().block(0).ops)) out = SE_ResNeXt(input=image, class_dim=class_dim) + print("startup ops : ", len(fluid.default_startup_program().block(0).ops)) + print("main ops : ", len(fluid.default_main_program().block(0).ops)) cost = fluid.layers.cross_entropy(input=out, label=label) avg_cost = fluid.layers.mean(x=cost) accuracy = fluid.layers.accuracy(input=out, label=label) @@ -206,12 +210,26 @@ def net_conf(image, label, class_dim): def add_optimizer(args, avg_cost): #optimizer = fluid.optimizer.SGD(learning_rate=0.002) + print("before optimize") + print("startup ops : ", len(fluid.default_startup_program().block(0).ops)) + print("main ops : ", len(fluid.default_main_program().block(0).ops)) + # optimizer = fluid.optimizer.Momentum( + # learning_rate=fluid.layers.piecewise_decay( + # boundaries=[100], values=[0.1, 0.2]), + # momentum=0.9, + # regularization=fluid.regularizer.L2Decay(1e-4)) + # print("startup parameters : ", len(fluid.default_startup_program().block(0).all_parameters())) + # print("main parameters : ", len(fluid.default_main_program().block(0).all_parameters())) optimizer = fluid.optimizer.Momentum( - learning_rate=fluid.layers.piecewise_decay( - boundaries=[100], values=[0.1, 0.2]), + learning_rate=0.01, momentum=0.9, regularization=fluid.regularizer.L2Decay(1e-4)) optimizer.minimize(avg_cost) + print("after optimize") + # print("startup parameters : ", len(fluid.default_startup_program().block(0).all_parameters())) + # print("main parameters : ", len(fluid.default_main_program().block(0).all_parameters())) + print("startup ops : ", 
len(fluid.default_startup_program().block(0).ops)) + print("main ops : ", len(fluid.default_main_program().block(0).ops)) if args.use_mem_opt: fluid.memory_optimize(fluid.default_main_program()) @@ -350,6 +368,10 @@ def train_parallel_exe(args): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if args.balance_parameter_opt_between_cards else fluid.BuildStrategy.ReduceStrategy.AllReduce + print("startup ops : ", len(fluid.default_startup_program().block(0).ops)) + print("main ops : ", len(fluid.default_main_program().block(0).ops)) + exit(0) + exe = fluid.ParallelExecutor( loss_name=avg_cost.name, use_cuda=True, @@ -418,7 +440,7 @@ def train_parallel_exe(args): print_arguments(args) print("cards_num=" + str(cards_num)) - if args.parallel_mode == "parallel_do": - train_parallel_do(args) - else: - train_parallel_exe(args) + # if args.parallel_mode == "parallel_do": + # train_parallel_do(args) + # else: + train_parallel_exe(args) diff --git a/fluid/resnet50.py b/fluid/resnet50.py index a80291c..6d1220a 100644 --- a/fluid/resnet50.py +++ b/fluid/resnet50.py @@ -9,7 +9,7 @@ import cProfile, pstats, StringIO -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.profiler as profiler diff --git a/se_resnext_v2/model.py b/se_resnext_v2/model.py new file mode 100644 index 0000000..a3e0920 --- /dev/null +++ b/se_resnext_v2/model.py @@ -0,0 +1,176 @@ +import math +import numpy as np +import os +import sys +import time + +# import paddle.v2 as paddle +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers.ops as ops +from paddle.fluid.initializer import init_on_cpu +from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter + + +def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1, + act=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + 
stride=stride, + padding=(filter_size - 1) / 2, + groups=groups, + act=None, + bias_attr=False, + use_cudnn=False) + # return conv + return fluid.layers.batch_norm(input=conv, act=act) + + +def squeeze_excitation(input, num_channels, reduction_ratio): + pool = fluid.layers.pool2d( + input=input, pool_size=0, pool_type='avg', global_pooling=True) + ### initializer parameter + # print >> sys.stderr, "pool shape:", pool.shape + stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) + squeeze = fluid.layers.fc(input=pool, + size=num_channels / reduction_ratio, + act='relu') + # print >> sys.stderr, "squeeze shape:", squeeze.shape + stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0) + excitation = fluid.layers.fc(input=squeeze, size=num_channels, act='relu') + scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0) + return scale + + +def shortcut_old(input, ch_out, stride): + ch_in = input.shape[1] + if ch_in != ch_out: + if stride == 1: + filter_size = 1 + else: + filter_size = 3 + return conv_bn_layer(input, ch_out, filter_size, stride) + else: + return input + + +def shortcut(input, ch_out, stride): + ch_in = input.shape[1] + if ch_in != ch_out or stride != 1: + filter_size = 1 + return conv_bn_layer(input, ch_out, filter_size, stride) + else: + return input + + +def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): + conv0 = conv_bn_layer( + input=input, num_filters=num_filters, filter_size=1, act='relu') + conv1 = conv_bn_layer( + input=conv0, + num_filters=num_filters, + filter_size=3, + stride=stride, + groups=cardinality, + act='relu') + conv2 = conv_bn_layer( + input=conv1, num_filters=num_filters * 2, filter_size=1, act=None) + scale = conv2 + # scale = squeeze_excitation( + # input=conv2, + # num_channels=num_filters * 2, + # reduction_ratio=reduction_ratio) + + short = shortcut(input, num_filters * 2, stride) + + return fluid.layers.elementwise_add(x=short, y=scale, act='relu') + + +def resnet_imagenet(input, class_dim, infer=False, 
layers=50): + + cfg = { + 18: ([2, 2, 2, 1], basicblock), + 34: ([3, 4, 6, 3], basicblock), + 50: ([3, 4, 6, 3], bottleneck), + 101: ([3, 4, 23, 3], bottleneck), + 152: ([3, 8, 36, 3], bottleneck) + } + stages, block_func = cfg[depth] + conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3) + pool1 = fluid.layers.pool2d( + input=conv1, pool_type='avg', pool_size=3, pool_stride=2) + res1 = layer_warp(block_func, pool1, 64, stages[0], 1) + res2 = layer_warp(block_func, res1, 128, stages[1], 2) + res3 = layer_warp(block_func, res2, 256, stages[2], 2) + res4 = layer_warp(block_func, res3, 512, stages[3], 2) + pool2 = fluid.layers.pool2d( + input=res4, + pool_size=7, + pool_type='avg', + pool_stride=1, + global_pooling=True) + out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax') + return Out + + +def SE_ResNeXt(input, class_dim, infer=False, layers=50): + # supported_layers = [50, 152] + # if layers not in supported_layers: + # print("supported layers are", supported_layers, \ + # "but input layer is ", layers) + # exit() + # if layers == 50: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 6, 3] + num_filters = [128, 256, 512, 1024] + + ret = [] + conv = conv_bn_layer( + input=input, num_filters=64, filter_size=7, stride=2, act='relu') + conv = fluid.layers.pool2d( + input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='avg') + + # block = 3 + # conv = bottleneck_block( + # input=conv, + # num_filters=num_filters[block], + # stride=2, + # cardinality=cardinality, + # reduction_ratio=reduction_ratio) + + for block in range(len(depth)): + for i in range(depth[block]): + conv = bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=cardinality, + reduction_ratio=reduction_ratio) + + pool = fluid.layers.pool2d( + input=conv, pool_size=7, pool_type='avg', global_pooling=True) + # if not infer: + # drop = fluid.layers.dropout(x=pool, dropout_prob=0.5, 
seed=1) + # else: + # drop = pool + drop = pool + # print >> sys.stderr, "drop shape:", drop.shape + stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0) + out = fluid.layers.fc(input=drop, size=class_dim, act='softmax') + ret.append(out) + return ret + + +def lenet(input, class_dim, infer=False): + conv1 = fluid.layers.conv2d(input, 32, 5, 1, act=None, use_cudnn=False) + # conv1 = fluid.layers.batch_norm(conv1, act='relu') + pool1 = fluid.layers.pool2d(conv1, 2, 'max', 2) + conv2 = fluid.layers.conv2d(pool1, 50, 5, 1, act=None, use_cudnn=False) + # conv2 = fluid.layers.batch_norm(conv2, act='relu') + pool2 = fluid.layers.pool2d(conv2, 2, 'max', 2) + fc1 = fluid.layers.fc(pool2, size=500, act='relu') + fc2 = fluid.layers.fc(fc1, size=10, act='softmax') + return fc2 diff --git a/se_resnext_v2/parse_log.sh b/se_resnext_v2/parse_log.sh new file mode 100644 index 0000000..c7bc3be --- /dev/null +++ b/se_resnext_v2/parse_log.sh @@ -0,0 +1 @@ +cat mem.log | awk -F ',' '{print $2}' | uniq -u | sort diff --git a/se_resnext_v2/parse_mem.py b/se_resnext_v2/parse_mem.py new file mode 100644 index 0000000..e69de29 diff --git a/se_resnext_v2/run.sh b/se_resnext_v2/run.sh new file mode 100755 index 0000000..34140bb --- /dev/null +++ b/se_resnext_v2/run.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# This script benchmarking the PaddlePaddle Fluid on +# single thread single GPU. 
+ +#export FLAGS_fraction_of_gpu_memory_to_use=0.0 +export CUDNN_PATH=/paddle/cudnn_v5 + +# disable openmp and mkl parallel +#https://github.com/PaddlePaddle/Paddle/issues/7199 +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 +ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` +if [ $ht -eq 1 ]; then # HT is OFF + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,0,0" + fi + if [ -z "$OMP_DYNAMIC" ]; then + export OMP_DYNAMIC="FALSE" + fi +else # HT is ON + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,1,0" + fi +fi +# disable multi-gpu if have more than one +export CUDA_VISIBLE_DEVICES=0 +export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH +export PYTHONPATH=/paddle/Paddle/build/python/build/lib-python:$PYTHONPATH +export FLAGS_fraction_of_gpu_memory_to_use=0.0 + +sudo rm -y train.log mem.log +# only query the gpu used +nohup stdbuf -oL nvidia-smi \ + --id=${CUDA_VISIBLE_DEVICES} \ + --query-compute-apps=pid,used_memory \ + --format=csv \ + --filename=mem.log \ + -l 1 & + +stdbuf -oL python train.py \ + --iterations=10 \ + 2>&1 | tee -a train.log diff --git a/se_resnext_v2/train.py b/se_resnext_v2/train.py new file mode 100644 index 0000000..25f505e --- /dev/null +++ b/se_resnext_v2/train.py @@ -0,0 +1,133 @@ +import math +import numpy as np +import os +import sys +import time +import argparse + +import paddle +import paddle.dataset.flowers as flowers +import paddle.fluid as fluid +from paddle.fluid.initializer import init_on_cpu +from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter +import paddle.fluid.profiler as profiler + +from model import SE_ResNeXt, lenet +fluid.default_startup_program().random_seed = 100 + + +def parse_args(): + parser = argparse.ArgumentParser("mnist model benchmark.") + parser.add_argument( + '--batch_size', type=int, default=128, help='The minibatch size.') + parser.add_argument( + 
'--iterations', type=int, default=35, help='The number of minibatches.') + args = parser.parse_args() + return args + + +def cosine_decay(learning_rate, step_each_epoch, epochs=120): + """Applies cosine decay to the learning rate. + lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1) + """ + global_step = _decay_step_counter() + with init_on_cpu(): + epoch = fluid.layers.floor(global_step / step_each_epoch) + lr = learning_rate / 2. + decayed_lr = lr * (fluid.layers.cos(epoch * (math.pi / epochs)) + 1) + return decayed_lr + + +def train_parallel_exe(learning_rate, + batch_size, + num_passes, + lr_strategy=None, + layers=50): + class_dim = 1000 + image_shape = [3, 224, 224] + + image = fluid.layers.data(name='image', shape=image_shape, dtype='float64') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + ret = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers) + out = ret[-1] + # out = lenet(input=image, class_dim=class_dim) + # out = resnet_imagenet(input=image, class_dim=class_dim, layers=layers) + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + cost = fluid.layers.cross_entropy(input=out, label=label) + avg_cost = fluid.layers.mean(x=cost) + optimizer = fluid.optimizer.SGD(learning_rate=0.01) + opts = optimizer.minimize(avg_cost) + + place = fluid.CUDAPlace(0) + + exe = fluid.Executor(place) + print(len(fluid.default_startup_program().block(0).ops)) + exe.run(fluid.default_startup_program()) + + train_reader = paddle.batch( + flowers.train( + use_xmap=False, mapper=flowers.test_mapper, buffered_size=1), + batch_size=batch_size) + feeder = fluid.DataFeeder(place=place, feed_list=[image, label]) + + train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name) + fetch_list = [avg_cost, acc_top1, acc_top5] + + dshape = [3, 224, 224] + args = parse_args() + Iter = 0 + for pass_id in range(num_passes): + train_info = [[], [], []] + for batch_id, data 
in enumerate(train_reader()): + t1 = time.time() + image_data = np.array(map(lambda x: x[0].reshape(dshape), + data)).astype('float64') + label_data = np.array(map(lambda x: x[1], data)).astype('int64') + label_data = label_data.reshape([-1, 1]) + print(len(fluid.default_main_program().block(0).ops)) + exit(0) + ret_numpy = exe.run( + fluid.default_main_program(), + feed={'image': image_data, + 'label': label_data}, + fetch_list=fetch_list) + loss = ret_numpy.pop(0) + acc1 = ret_numpy.pop(0) + acc5 = ret_numpy.pop(0) + + t2 = time.time() + period = t2 - t1 + loss = np.mean(np.array(loss)) + acc1 = np.mean(np.array(acc1)) + acc5 = np.mean(np.array(acc5)) + train_info[0].append(loss) + train_info[1].append(acc1) + train_info[2].append(acc5) + + if batch_id % 1 == 0: + print("Pass {0}, trainbatch {1}, loss {2}, acc1 {3}, acc5 {4}" + .format(pass_id, batch_id, loss, acc1, acc5)) + sys.stdout.flush() + Iter += 1 + if Iter == args.iterations: + # if batch_id == 50: + exit(0) + + +if __name__ == '__main__': + lr_strategy = None + method = train_parallel_exe + # method( + # learning_rate=0.1, + # batch_size=16, + # num_passes=5, + # lr_strategy=lr_strategy, + # layers=50) + + method( + learning_rate=0.1, + batch_size=16, + num_passes=5, + lr_strategy=lr_strategy, + layers=152)