From fbbcaafdba3e885eab44c0dcfd23829c2c80f732 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 27 Aug 2015 21:50:08 +0800 Subject: [PATCH] SINGA-9 Add Support for Restricted Boltzmann Machine (RBM) model * Refactor the CDWorker::TrainOneBatch. * Replace Phase with an int flag in the ComputeFeature and ComputeGradient functions. The flag can be a combination of multiple phases, e.g., kTrain|kForward, where each phase is assigned a single non-zero bit, e.g., 1, 2, 4, 8, etc. (a usage sketch follows the job.proto diff below). * Remove compilation dependency on opencv. * Refactor JobProto to create an AlgProto for TrainOneBatch. * Create an RBMLayer as the base layer for RBM layers. * Update the configurations of all examples. --- Makefile.am | 3 - configure.ac | 18 +- examples/cifar10/job.conf | 5 +- examples/mnist/conv.conf | 4 +- examples/mnist/job.conf | 28 ++- examples/rbm/autoencoder.conf | 89 ++------- examples/rbm/rbm0.conf | 41 ++-- examples/rbm/rbm1.conf | 85 ++++----- examples/rbm/rbm2.conf | 57 ++---- examples/rbm/rbm3.conf | 68 ++----- include/mshadow/tensor_random.h | 17 +- include/neuralnet/base_layer.h | 91 +++++---- include/neuralnet/layer.h | 146 ++++---------- include/trainer/worker.h | 6 - include/utils/param.h | 3 +- src/driver.cc | 8 +- src/neuralnet/base_layer.cc | 49 +++-- src/neuralnet/layer.cc | 324 +++++++++++++++----------------- src/proto/job.proto | 47 +++-- src/trainer/worker.cc | 96 +++------- src/utils/common.cc | 2 +- src/utils/param.cc | 10 +- 22 files changed, 497 insertions(+), 700 deletions(-) diff --git a/Makefile.am b/Makefile.am index fa28848435..ae8a9ddbd7 100644 --- a/Makefile.am +++ b/Makefile.am @@ -95,9 +95,6 @@ singa_LDFLAGS = -I./include \ -lglog \ -lprotobuf \ -lrt \ - -lopencv_highgui \ - -lopencv_imgproc \ - -lopencv_core \ -lopenblas \ -lzmq \ -lczmq \ diff --git a/configure.ac b/configure.ac index 35c6d6101d..569c406fd6 100644 --- a/configure.ac +++ b/configure.ac @@ -44,15 +44,15 @@ if test x"$enable_lmdb" = x"yes"; then AC_DEFINE(LMDB, 1, [Enable Option layer]) fi -AC_CHECK_LIB([opencv_imgproc], [main], [], [ - AC_MSG_ERROR([unable to find opencv_imgproc lib]) - ]) -AC_CHECK_LIB([opencv_highgui], [main], [], [ - AC_MSG_ERROR([unable to find opencv_highgui lib]) - ]) -AC_CHECK_LIB([opencv_core], [main], [], [ - AC_MSG_ERROR([unable to find opencv_core lib]) - ]) +#AC_CHECK_LIB([opencv_imgproc], [main], [], [ # AC_MSG_ERROR([unable to find opencv_imgproc lib]) # ]) +#AC_CHECK_LIB([opencv_highgui], [main], [], [ # AC_MSG_ERROR([unable to find opencv_highgui lib]) # ]) +#AC_CHECK_LIB([opencv_core], [main], [], [ # AC_MSG_ERROR([unable to find opencv_core lib]) # ]) AC_CHECK_LIB([zookeeper_mt], [main], [], [ AC_MSG_ERROR([unable to find zookeeper]) ]) diff --git a/examples/cifar10/job.conf b/examples/cifar10/job.conf index f44ca504e4..9d2590490f 100644 --- a/examples/cifar10/job.conf +++ b/examples/cifar10/job.conf @@ -3,7 +3,10 @@ train_steps: 1000 test_steps: 100 test_freq:300 disp_freq:30 -alg: kBP +debug: true +train_one_batch { + alg: kBP +} updater{ type: kSGD weight_decay:0.004 diff --git a/examples/mnist/conv.conf b/examples/mnist/conv.conf index 1d4d7403fa..aaf34f26a2 100644 --- a/examples/mnist/conv.conf +++ b/examples/mnist/conv.conf @@ -3,7 +3,9 @@ train_steps: 10000 test_steps:100 test_freq:500 disp_freq:50 -alg: kBP +train_one_batch { + alg: kBP +} updater { momentum:0.9 weight_decay:0.0005 diff --git a/examples/mnist/job.conf b/examples/mnist/job.conf index 360e1ecf17..b8d14e8b43 100644 --- a/examples/mnist/job.conf +++ b/examples/mnist/job.conf @@ -3,7 +3,9 @@ train_steps:
1000 test_steps:10 test_freq:60 disp_freq:10 -alg: kBP +train_one_batch { + alg: kBP +} updater{ type: kSGD learning_rate{ @@ -82,6 +84,10 @@ neuralnet { layer{ name: "tanh1" type: kTanh + tanh_conf { + outer_scale: 1.7159047 + inner_scale: 0.6666667 + } srclayers:"fc1" } layer{ @@ -112,6 +118,11 @@ neuralnet { layer{ name: "tanh2" type: kTanh + tanh_conf { + outer_scale: 1.7159047 + inner_scale: 0.6666667 + } + srclayers:"fc2" } layer{ @@ -143,6 +154,11 @@ neuralnet { layer{ name: "tanh3" type: kTanh + tanh_conf { + outer_scale: 1.7159047 + inner_scale: 0.6666667 + } + srclayers:"fc3" } layer{ @@ -174,6 +190,11 @@ neuralnet { layer{ name: "tanh4" type: kTanh + tanh_conf { + outer_scale: 1.7159047 + inner_scale: 0.6666667 + } + srclayers:"fc4" } layer{ @@ -205,6 +226,11 @@ neuralnet { layer{ name: "tanh5" type: kTanh + tanh_conf { + outer_scale: 1.7159047 + inner_scale: 0.6666667 + } + srclayers:"fc5" } layer{ diff --git a/examples/rbm/autoencoder.conf b/examples/rbm/autoencoder.conf index 957532346d..bc32cc7e50 100644 --- a/examples/rbm/autoencoder.conf +++ b/examples/rbm/autoencoder.conf @@ -1,15 +1,15 @@ -name: "deep-big-simple-mlp" +name: "auto-encoder" train_steps: 12200 test_steps:100 -test_freq:100 -disp_freq:20 -checkpoint_after: 1000 -checkpoint_freq: 1000 -checkpoint_path: "examples/rbm/checkpoint/rbm0/checkpoint/step6000-worker0.bin" -checkpoint_path: "examples/rbm/checkpoint/rbm1/checkpoint/step6000-worker0.bin" -checkpoint_path: "examples/rbm/checkpoint/rbm2/checkpoint/step6000-worker0.bin" -checkpoint_path: "examples/rbm/checkpoint/rbm3/checkpoint/step6000-worker0.bin" -alg: kBP +test_freq:1000 +disp_freq:100 +checkpoint_path: "examples/rbm/rbm0/checkpoint/step6000-worker0.bin" +checkpoint_path: "examples/rbm/rbm1/checkpoint/step6000-worker0.bin" +checkpoint_path: "examples/rbm/rbm2/checkpoint/step6000-worker0.bin" +checkpoint_path: "examples/rbm/rbm3/checkpoint/step6000-worker0.bin" +train_one_batch{ + alg: kBP +} updater{ type: kAdaGrad learning_rate{ @@ -23,7 +23,7 @@ neuralnet { name: "data" type: kShardData sharddata_conf { - path: "examples/rbm/mnist_train_shard" + path: "examples/mnist/mnist_train_shard" batchsize: 1000 } exclude: kTest @@ -33,7 +33,7 @@ neuralnet { name: "data" type: kShardData sharddata_conf { - path: "examples/rbm/mnist_test_shard" + path: "examples/mnist/mnist_test_shard" batchsize: 1000 } exclude: kTrain @@ -64,19 +64,9 @@ neuralnet { } param{ name: "w1" - init{ - type: kUniform - low:-0.05 - high:0.05 - } } param{ name: "rb12" - init{ - type: kUniform - low: -0.05 - high:0.05 - } } } @@ -94,19 +84,9 @@ neuralnet { } param{ name: "w2" - init{ - type: kUniform - low:-0.05 - high:0.05 - } } param{ name: "rb22" - init{ - type: kUniform - low: -0.05 - high:0.05 - } } } @@ -125,19 +105,9 @@ neuralnet { } param{ name: "w3" - init{ - type: kUniform - low:-0.05 - high:0.05 - } } param{ name: "rb32" - init{ - type: kUniform - low: -0.05 - high:0.05 - } } } @@ -156,19 +126,10 @@ neuralnet { } param{ name: "w4" - init{ - type: kUniform - low:-0.05 - high:0.05 - } } param{ name: "rb42" - init{ - type: kUniform - low: -0.05 - high:0.05 - } + } } @@ -187,11 +148,6 @@ neuralnet { } param{ name: "rb41" - init{ - type: kUniform - low: -0.05 - high:0.05 - } } } @@ -214,13 +170,7 @@ neuralnet { } param{ name: "rb31" - init{ - type: kUniform - low: -0.05 - high:0.05 - } } - } layer{ @@ -242,11 +192,6 @@ neuralnet { } param{ name: "rb21" - init{ - type: kUniform - low: -0.05 - high:0.05 - } } } @@ -270,13 +215,7 @@ neuralnet { } param{ name: "rb11" - init{ - type: 
kUniform - low: -0.05 - high:0.05 - } } - } layer{ @@ -295,5 +234,5 @@ neuralnet { cluster { nworker_groups: 1 nserver_groups: 1 - workspace: "examples/rbm/checkpoint/autoencoder/" + workspace: "examples/rbm/autoencoder/" } diff --git a/examples/rbm/rbm0.conf b/examples/rbm/rbm0.conf index ef8653f977..a7e503b925 100644 --- a/examples/rbm/rbm0.conf +++ b/examples/rbm/rbm0.conf @@ -1,11 +1,11 @@ -name: "deep-big-simple-dbm" +name: "rbm0" train_steps: 6000 test_steps:100 test_freq:100 disp_freq: 100 -alg: kCD -checkpoint_after: 500 -checkpoint_freq: 1000 +train_one_batch{ + alg: kCD +} updater{ type: kSGD momentum: 0.9 @@ -21,7 +21,7 @@ layer { name: "data" type: kShardData sharddata_conf { - path: "examples/rbm/mnist_train_shard" + path: "examples/mnist/mnist_train_shard" batchsize: 100 } exclude: kTest @@ -32,7 +32,7 @@ layer { name: "data" type: kShardData sharddata_conf { - path: "examples/rbm/mnist_test_shard" + path: "examples/mnist/mnist_test_shard" batchsize: 100 } exclude: kTrain @@ -54,22 +54,15 @@ layer{ type: kRBMVis srclayers:"mnist" srclayers:"RBMHid" - rbmvis_conf{ - num_output: 1000 - } param{ - name: "w1" - init{ - type: kGaussian - mean: 0.0 - std: 0.1 - } + name: "w1_" + share_from: "w1" } param{ name: "rb11" init{ - type: kConstant - value: 0.0 + type: kConstant + value: 0.0 } } } @@ -82,14 +75,18 @@ layer{ hid_dim: 1000 } param{ - name: "w1_1" - share_from: "w1" + name: "w1" + init{ + type: kGaussian + mean: 0.0 + std: 0.1 + } } param{ name: "rb12" init{ - type: kConstant - value: 0.0 + type: kConstant + value: 0.0 } } } @@ -99,5 +96,5 @@ cluster { nserver_groups: 1 nservers_per_group: 1 nworkers_per_group: 1 - workspace: "examples/rbm/checkpoint/rbm0/" + workspace: "examples/rbm/rbm0/" } diff --git a/examples/rbm/rbm1.conf b/examples/rbm/rbm1.conf index f9b4974c35..db27d3a565 100644 --- a/examples/rbm/rbm1.conf +++ b/examples/rbm/rbm1.conf @@ -1,12 +1,12 @@ -name: "deep-big-simple-dbm" +name: "rbm1" train_steps: 6000 test_steps:100 -test_freq:500 +test_freq:1000 disp_freq: 100 -alg: kCD -checkpoint_after: 500 -checkpoint_freq: 1000 -checkpoint_path: "examples/rbm/checkpoint/rbm0/checkpoint/step6000-worker0.bin" +train_one_batch{ + alg: kCD +} +checkpoint_path: "examples/rbm/rbm0/checkpoint/step6000-worker0.bin" updater{ type: kSGD momentum: 0.9 @@ -22,7 +22,7 @@ layer { name: "data" type: kShardData sharddata_conf { - path: "examples/rbm/mnist_train_shard" + path: "examples/mnist/mnist_train_shard" batchsize: 100 } exclude: kTest @@ -33,7 +33,7 @@ layer { name: "data" type: kShardData sharddata_conf { - path: "examples/rbm/mnist_test_shard" + path: "examples/mnist/mnist_test_shard" batchsize: 100 } exclude: kTrain @@ -51,51 +51,34 @@ layer{ } layer{ - name: "fc1" - type: kInnerProduct - srclayers:"mnist" - innerproduct_conf{ - num_output: 1000 - } - param{ - name: "w1" - init{ - type: kUniform - low:-0.05 - high:0.05 - } - } - param{ - name: "rb12" - init{ - type: kUniform - low: -0.05 - high:0.05 - } - } + name: "fc1" + type: kInnerProduct + srclayers:"mnist" + innerproduct_conf{ + num_output: 1000 } - - layer{ - name: "sigmoid1" - type: kSigmoid - srclayers:"fc1" + param{ + name: "w1" } + param{ + name: "rb12" + } +} + +layer{ + name: "sigmoid1" + type: kSigmoid + srclayers:"fc1" +} layer{ name: "RBMVis" type: kRBMVis srclayers:"sigmoid1" srclayers:"RBMHid" - rbmvis_conf{ - num_output: 500 - } param{ - name: "w2" - init{ - type: kGaussian - mean: 0.0 - std: 0.1 - } + name: "w2_" + share_from: "w2" } param{ name: "rb21" @@ -114,14 +97,18 @@ layer{ hid_dim: 500 } param{ 
- name: "w2_1" - share_from: "w2" + name: "w2" + init{ + type: kGaussian + mean: 0.0 + std: 0.1 + } } param{ name: "rb22" init{ - type: kConstant - value: 0.0 + type: kConstant + value: 0.0 } } } @@ -131,5 +118,5 @@ cluster { nserver_groups: 1 nservers_per_group: 1 nworkers_per_group: 1 - workspace: "examples/rbm/checkpoint/rbm1/" + workspace: "examples/rbm/rbm1/" } diff --git a/examples/rbm/rbm2.conf b/examples/rbm/rbm2.conf index 6629481622..fd08907448 100644 --- a/examples/rbm/rbm2.conf +++ b/examples/rbm/rbm2.conf @@ -1,12 +1,12 @@ -name: "deep-big-simple-dbm" +name: "rbm2" train_steps: 6000 test_steps:100 -test_freq:100 +test_freq:1000 disp_freq: 100 -alg: kCD -checkpoint_after: 500 -checkpoint_freq: 1000 -checkpoint_path: "examples/rbm/checkpoint/rbm1/checkpoint/step6000-worker0.bin" +train_one_batch{ + alg: kCD +} +checkpoint_path: "examples/rbm/rbm1/checkpoint/step6000-worker0.bin" updater{ type: kSGD @@ -24,7 +24,7 @@ layer { name: "data" type: kShardData sharddata_conf { - path: "examples/rbm/mnist_train_shard" + path: "examples/mnist/mnist_train_shard" batchsize: 100 } exclude: kTest @@ -35,7 +35,7 @@ layer { name: "data" type: kShardData sharddata_conf { - path: "examples/rbm/mnist_test_shard" + path: "examples/mnist/mnist_test_shard" batchsize: 100 } exclude: kTrain @@ -61,19 +61,9 @@ layer{ } param{ name: "w1" - init { - type: kUniform - low:-0.05 - high:0.05 - } } param{ name: "rb12" - init{ - type: kUniform - low: -0.05 - high:0.05 - } } } @@ -92,19 +82,9 @@ layer{ } param{ name: "w2" - init{ - type: kUniform - low:-0.05 - high:0.05 - } } param{ name: "rb22" - init{ - type: kUniform - low: -0.05 - high:0.05 - } } } @@ -118,16 +98,9 @@ layer{ type: kRBMVis srclayers:"sigmoid2" srclayers:"RBMHid" - rbmvis_conf{ - num_output: 250 - } param{ - name: "w3" - init{ - type: kGaussian - mean: 0.0 - std: 0.1 - } + name: "w3_" + share_from: "w3" } param{ name: "rb31" @@ -146,8 +119,12 @@ layer{ hid_dim: 250 } param{ - name: "w3_1" - share_from: "w3" + name: "w3" + init{ + type: kGaussian + mean: 0.0 + std: 0.1 + } } param{ name: "rb32" @@ -163,5 +140,5 @@ cluster { nserver_groups: 1 nservers_per_group: 1 nworkers_per_group: 1 - workspace: "examples/rbm/checkpoint/rbm2/" + workspace: "examples/rbm/rbm2/" } diff --git a/examples/rbm/rbm3.conf b/examples/rbm/rbm3.conf index 482c5e77ab..fe7cc1ff2d 100644 --- a/examples/rbm/rbm3.conf +++ b/examples/rbm/rbm3.conf @@ -1,12 +1,12 @@ -name: "deep-big-simple-dbm" +name: "rbm3" train_steps: 6000 test_steps: 100 -test_freq: 100 +test_freq: 1000 disp_freq: 100 -alg: kCD -checkpoint_after: 500 -checkpoint_freq: 1000 -checkpoint_path: "examples/rbm/checkpoint/rbm2/checkpoint/step6000-worker0.bin" +train_one_batch{ + alg: kCD +} +checkpoint_path: "examples/rbm/rbm2/checkpoint/step6000-worker0.bin" updater{ type: kSGD momentum: 0.9 @@ -22,7 +22,7 @@ layer { name: "data" type: kShardData sharddata_conf { - path: "examples/rbm/mnist_train_shard" + path: "examples/mnist/mnist_train_shard" batchsize: 100 } exclude: kTest @@ -33,7 +33,7 @@ layer { name: "data" type: kShardData sharddata_conf { - path: "examples/rbm/mnist_test_shard" + path: "examples/mnist/mnist_test_shard" batchsize: 100 } exclude: kTrain @@ -59,19 +59,9 @@ layer{ } param{ name: "w1" - init{ - type: kUniform - low:-0.05 - high:0.05 - } } param{ name: "rb12" - init{ - type: kUniform - low: -0.05 - high:0.05 - } } } @@ -90,19 +80,9 @@ layer{ } param{ name: "w2" - init{ - type: kUniform - low:-0.05 - high:0.05 - } } param{ name: "rb22" - init{ - type: kUniform - low: -0.05 - high:0.05 - } } 
} @@ -121,19 +101,9 @@ layer{ } param{ name: "w3" - init{ - type: kUniform - low:-0.05 - high:0.05 - } } param{ name: "rb32" - init{ - type: kUniform - low: -0.05 - high:0.05 - } } } @@ -148,16 +118,10 @@ layer{ type: kRBMVis srclayers:"sigmoid3" srclayers:"RBMHid" - rbmvis_conf{ - num_output: 30 - } param{ - name: "w4" - init{ - type: kGaussian - mean: 0.0 - std: 0.1 - } + name: "w4_" + share_from: "w4" + } param{ name: "rb41" @@ -177,8 +141,12 @@ layer{ gaussian: true } param{ - name: "w4_1" - share_from: "w4" + name: "w4" + init{ + type: kGaussian + mean: 0.0 + std: 0.1 + } } param{ name: "rb42" @@ -194,5 +162,5 @@ cluster { nserver_groups: 1 nservers_per_group: 1 nworkers_per_group: 1 - workspace: "examples/rbm/checkpoint/rbm3/" + workspace: "examples/rbm/rbm3/" } diff --git a/include/mshadow/tensor_random.h b/include/mshadow/tensor_random.h index 72164a8fa5..59ef08266d 100644 --- a/include/mshadow/tensor_random.h +++ b/include/mshadow/tensor_random.h @@ -68,20 +68,27 @@ namespace mshadow { gen_.seed(seed); #endif } + template + inline void SampleBinary(Tensor &src) { + SampleBinary(src, src); + } + /*! * \brief generate binary data according to a probability matrix + * \param src source * \param dst destination * \param a lower bound of uniform * \param b upper bound of uniform * \tparam dim dimension of tensor */ template - inline void SampleBinary( Tensor &dst) { + inline void SampleBinary(Tensor &dst, Tensor &src) { real_t a=0.0f; real_t b=1.0f; - Tensor mat = dst.FlatTo2D(); + Tensor dmat = dst.FlatTo2D(); + Tensor smat = src.FlatTo2D(); std::uniform_real_distribution distribution (a,b); - for ( index_t i = 0; i < mat.shape[1]; ++i ) { + for ( index_t i = 0; i < dmat.shape[1]; ++i ) { #if MSHADOW_USE_MKL #if MSHADOW_SINGLE_PRECISION int status = vsRngUniform( 0, vStream_, mat.shape[0], mat[i].dptr, a, b ); @@ -96,8 +103,8 @@ namespace mshadow { mat[i][j] = this->RandNext()*(b-a) + a; } */ - for ( index_t j = 0; j < mat.shape[0]; ++j ) { - mat[i][j] = distribution(gen_) > mat[i][j] ? 0.0f: 1.0f; + for ( index_t j = 0; j < dmat.shape[0]; ++j ) { + dmat[i][j] = distribution(gen_) > smat[i][j] ? 0.0f: 1.0f; } #endif } diff --git a/include/neuralnet/base_layer.h b/include/neuralnet/base_layer.h index 5575fc75a9..9aa207dddd 100644 --- a/include/neuralnet/base_layer.h +++ b/include/neuralnet/base_layer.h @@ -49,25 +49,25 @@ class Layer { /** * Compute features of this layer based on connected layers. * - * @param phase kTrain, kTest, kPositive, etc. + * @param flag kTrain, kTest, kPositive, etc. */ - virtual void ComputeFeature(Phase phase, Metric* perf) = 0; + virtual void ComputeFeature(int flag, Metric* perf) = 0; /** * Compute gradients for parameters and connected layers. * - * @param phase kTrain, kTest, kPositive, etc. + * @param flag kTrain, kTest, kPositive, etc. */ virtual void ComputeLoss(Metric* perf) {} - virtual void ComputeGradient(Phase phase) = 0; + virtual void ComputeGradient(int flag) = 0; /** * For print debug info about each layer, e.g., norm of feature vector, * norm of parameters. * * @param step training/test/validation step - * @param phase forward/backward/positive/negative... + * @param flag forward/backward/positive/negative... * @return debug info about this layer. */ - const string DebugString(int step, Phase phase); + const string DebugString(int step, int flag); /** * Layers that have paramters must override this function. 
* @@ -141,10 +141,10 @@ class Layer { /** * @return a const ref for Blob storing neuron values of this layer for BP */ - virtual const Blob& data(const Layer* from, Phase = kPositive) const { + virtual const Blob& data(const Layer* from) const { return data_; } - virtual Blob* mutable_data(const Layer* from, Phase = kPositive) { + virtual Blob* mutable_data(const Layer* from) { return &data_; } @@ -246,15 +246,15 @@ class BridgeSrcLayer: public BridgeLayer { using Layer::ComputeFeature; using Layer::ComputeGradient; - void ComputeFeature(Phase phase, Metric* perf) override {} - void ComputeGradient(Phase phase) override { + void ComputeFeature(int flag, Metric* perf) override {} + void ComputeGradient(int flag) override { ready_ = false; } - const Blob& data(const Layer* from, Phase phase) const override { + const Blob& data(const Layer* from) const override { return srclayers_[0]->data(this); } - Blob* mutable_data(const Layer* from, Phase phase) override { + Blob* mutable_data(const Layer* from) override { return srclayers_[0]->mutable_data(this); } const Blob& grad(const Layer* from) const override { @@ -278,11 +278,11 @@ class BridgeDstLayer: public BridgeLayer { using Layer::ComputeGradient; void Setup(const LayerProto& proto, int npartitions) override; - void ComputeFeature(Phase phase, Metric* perf) override { + void ComputeFeature(int flag, Metric* perf) override { // reset ready_ for next iteration. ready_ = false; } - void ComputeGradient(Phase phase) override {} + void ComputeGradient(int flag) override {} bool is_bridgedstlayer() const { return true; } @@ -297,8 +297,8 @@ class ConcateLayer: public Layer { using Layer::ComputeGradient; void Setup(const LayerProto& proto, int npartitions) override; - void ComputeFeature(Phase phase, Metric* perf) override; - void ComputeGradient(Phase phase) override; + void ComputeFeature(int flag, Metric* perf) override; + void ComputeGradient(int flag) override; }; /** @@ -311,11 +311,11 @@ class DataLayer: public Layer{ using Layer::mutable_grad; using Layer::dst_layer_connection; - void ComputeGradient(Phase phase) override {} + void ComputeGradient(int flag) override {} bool is_datalayer() const override { return true; } - Blob* mutable_data(const Layer* layer, Phase phase) override { + Blob* mutable_data(const Layer* layer) override { return nullptr; } Blob* mutable_grad(const Layer* layer) override { @@ -357,11 +357,11 @@ class PrefetchLayer : public Layer { using Layer::ComputeGradient; void Setup(const LayerProto& proto, int npartitions) override; - void ComputeFeature(Phase phase, Metric* perf) override; - void ComputeGradient(Phase phase) override {}; + void ComputeFeature(int flag, Metric* perf) override; + void ComputeGradient(int flag) override {}; - const Blob& data(const Layer* from, Phase phase) const override; - Blob* mutable_data(const Layer* layer, Phase phase) override; + const Blob& data(const Layer* from) const override; + Blob* mutable_data(const Layer* layer) override; Blob* mutable_grad(const Layer* layer) override { return nullptr; @@ -371,7 +371,7 @@ class PrefetchLayer : public Layer { return grad_; } - void Prefetch(Phase phase); + void Prefetch(int flag); virtual ~PrefetchLayer(); protected: @@ -389,14 +389,14 @@ class SliceLayer: public Layer { using Layer::ComputeGradient; void Setup(const LayerProto& proto, int npartitions) override; - void ComputeFeature(Phase phase, Metric* perf) override; - void ComputeGradient(Phase phase) override; + void ComputeFeature(int flag, Metric* perf) override; + void 
ComputeGradient(int flag) override; ConnectionType dst_layer_connection() const override { return kOneToMany; } - const Blob& data(const Layer* layer, Phase phase) const override; + const Blob& data(const Layer* layer) const override; const Blob& grad(const Layer* layer) const override; - Blob* mutable_data(const Layer* layer, Phase phase) override; + Blob* mutable_data(const Layer* layer) override; Blob* mutable_grad(const Layer* layer) override; protected: @@ -418,8 +418,8 @@ class SplitLayer: public Layer { using Layer::ComputeGradient; void Setup(const LayerProto& proto, int npartitions) override; - void ComputeFeature(Phase phase, Metric* perf) override; - void ComputeGradient(Phase phase) override; + void ComputeFeature(int flag, Metric* perf) override; + void ComputeGradient(int flag) override; ConnectionType dst_layer_connection() const override { return kOneToMany; } @@ -462,12 +462,12 @@ class ParserLayer: public Layer { using Layer::mutable_grad; using Layer::grad; - void ComputeFeature(Phase phase, Metric* perf) override; - void ComputeGradient(Phase phase) override {}; + void ComputeFeature(int flag, Metric* perf) override; + void ComputeGradient(int flag) override {}; /** * Parse records from DataLayer into blob. */ - virtual void ParseRecords(Phase phase, const vector& records, + virtual void ParseRecords(int flag, const vector& records, Blob* blob) = 0; bool is_parserlayer() const override { return true; @@ -480,6 +480,33 @@ class ParserLayer: public Layer { return grad_; } }; + +class RBMLayer: public Layer { + public: + const Blob& neg_data(const Layer* layer) { + return neg_data_; + } + Blob* mutable_neg_data(const Layer* layer) { + return &neg_data_; + } + const vector GetParams() const override { + vector params{weight_, bias_}; + return params; + } + virtual Blob* Sample(int flat) = 0; + + protected: + //! dimension of the hidden layer + int hdim_; + //! 
dimension of the visible layer + int vdim_; + int batchsize_; + Param* weight_, *bias_; + + Blob neg_data_; + Blob neg_sample_; + Blob sample_; +}; } // namespace singa #endif // SINGA_NEURALNET_BASE_LAYER_H_ diff --git a/include/neuralnet/layer.h b/include/neuralnet/layer.h index b1fbbb087d..435d854d0d 100644 --- a/include/neuralnet/layer.h +++ b/include/neuralnet/layer.h @@ -31,8 +31,8 @@ class ConvolutionLayer: public Layer { using Layer::ComputeGradient; void Setup(const LayerProto& proto, int npartitions) override; - void ComputeFeature(Phase phase, Metric *perf) override; - void ComputeGradient(Phase phase) override; + void ComputeFeature(int flag, Metric *perf) override; + void ComputeGradient(int flag) override; const vector GetParams() const override { vector params{weight_, bias_}; return params; @@ -57,8 +57,8 @@ class DropoutLayer: public Layer { using Layer::ComputeGradient; void Setup(const LayerProto& proto, int npartitions) override; - void ComputeFeature(Phase phase, Metric *perf) override; - void ComputeGradient(Phase phase) override; + void ComputeFeature(int flag, Metric *perf) override; + void ComputeGradient(int flag) override; protected: // drop probability @@ -68,112 +68,42 @@ class DropoutLayer: public Layer { */ Blob mask_; }; + /** * RBM visible layer */ -class RBMVisLayer: public Layer { +class RBMVisLayer: public RBMLayer { public: using Layer::ComputeFeature; using Layer::ComputeGradient; - void Setup(const LayerProto& proto, - int npartitions) override; - virtual bool is_vislayer() const { - return true; - } - - void ComputeFeature(Phase phase, - Metric *perf) override; - void ComputeGradient(Phase phase) override; - virtual void ComputeLoss(Metric* perf); - virtual Blob* mutable_data(const Layer* from, Phase phase) { - if (phase == kPositive) { - return &data_; - } else { - return &vis_sample_; - } - } - virtual const Blob& data(const Layer* from, Phase phase) const { - if (phase == kPositive) { - return data_; - } else { - return vis_sample_; - } - } - // virtual void ToProto(LayerProto *layer_proto, bool copyData); - const vector GetParams() const override { - vector params{weight_, bias_}; - return params; - } ~RBMVisLayer(); - - + void Setup(const LayerProto& proto, int npartitions) override; + void ComputeFeature(int flag, Metric *perf) override; + void ComputeGradient(int flag) override; + Blob* Sample(int flat) override; private: - //! dimension of the hidden layer - int hdim_; - //! 
dimension of the visible layer - int vdim_; - int batchsize_; - // batchsize of negative phase - int neg_batchsize_; - bool is_first_iteration_vis_; - float scale_; - // srclayer index - int data_idx_; - int hid_idx_; - Param* weight_, *bias_; - // data to store sampling result - Blob vis_sample_; - // in order to implement Persistent Contrastive Divergence, + RBMLayer* hid_layer_; + Layer* input_layer_; }; /** * RBM hidden layer */ -class RBMHidLayer: public Layer { +class RBMHidLayer: public RBMLayer { public: using Layer::ComputeFeature; using Layer::ComputeGradient; - void Setup(const LayerProto& proto, - int npartitions) override; - virtual bool is_hidlayer() const { - return true; - } - - void ComputeFeature(Phase phase, - Metric *perf) override; - void ComputeGradient(Phase phase) override; - virtual Blob* mutable_data(const Layer* from, Phase phase) { - if (phase == kPositive) - return &data_; - else - return &hid_sample_; - } - virtual const Blob& data(const Layer* from, Phase phase) const { - if (phase == kPositive) - return data_; - else - return hid_sample_; - } - const vector GetParams() const override { - vector params{weight_, bias_}; - return params; - } ~RBMHidLayer(); - + void Setup(const LayerProto& proto, int npartitions) override; + void ComputeFeature(int flag, Metric *perf) override; + void ComputeGradient(int flag) override; + Blob* Sample(int flat) override; private: - //! dimension of the hidden layer - int hdim_; - int vdim_; // dimension of visible layer - int batchsize_; - // batchsize of negative phase - int neg_batchsize_; - float scale_; // whether use gaussian sampling bool gaussian_; - Blob hid_sample_; - Param* weight_, *bias_; + RBMLayer *vis_layer_; }; /** * fully connected layer @@ -184,8 +114,8 @@ class InnerProductLayer: public Layer { using Layer::ComputeGradient; void Setup(const LayerProto& proto, int npartitions) override; - void ComputeFeature(Phase phase, Metric *perf) override; - void ComputeGradient(Phase phase) override; + void ComputeFeature(int flag, Metric *perf) override; + void ComputeGradient(int flag) override; ConnectionType src_neuron_connection(int k) const override { // CHECK_LT(k, srclayers_.size()); @@ -212,7 +142,7 @@ class LabelLayer: public ParserLayer { using ParserLayer::ParseRecords; void Setup(const LayerProto& proto, int npartitions) override; - void ParseRecords(Phase phase, const vector& records, + void ParseRecords(int flag, const vector& records, Blob* blob) override; }; @@ -229,8 +159,8 @@ class LRNLayer: public Layer { using Layer::ComputeGradient; void Setup(const LayerProto& proto, int npartitions) override; - void ComputeFeature(Phase phase, Metric *perf) override; - void ComputeGradient(Phase phase) override; + void ComputeFeature(int flag, Metric *perf) override; + void ComputeGradient(int flag) override; protected: //! 
shape of the bottom layer feature @@ -247,7 +177,7 @@ class MnistLayer: public ParserLayer { using ParserLayer::ParseRecords; void Setup(const LayerProto& proto, int npartitions) override; - void ParseRecords(Phase phase, const vector& records, + void ParseRecords(int flag, const vector& records, Blob* blob) override; ConnectionType dst_layer_connection() const override { return kOneToMany; @@ -269,8 +199,8 @@ class PoolingLayer: public Layer { using Layer::ComputeGradient; void Setup(const LayerProto& proto, int npartitions) override; - void ComputeFeature(Phase phase, Metric *perf) override; - void ComputeGradient(Phase phase) override; + void ComputeFeature(int flag, Metric *perf) override; + void ComputeGradient(int flag) override; protected: int kernel_, pad_, stride_; @@ -284,8 +214,8 @@ class ReLULayer: public Layer { using Layer::ComputeGradient; void Setup(const LayerProto& proto, int npartitions = 1) override; - void ComputeFeature(Phase phase, Metric *perf) override; - void ComputeGradient(Phase phase) override; + void ComputeFeature(int flag, Metric *perf) override; + void ComputeGradient(int flag) override; }; class EuclideanLossLayer: public LossLayer { @@ -294,8 +224,8 @@ class EuclideanLossLayer: public LossLayer { using Layer::ComputeGradient; void Setup(const LayerProto& proto, int npartitions) override; - void ComputeFeature(Phase phase, Metric *perf) override; - void ComputeGradient(Phase phase) override; + void ComputeFeature(int flag, Metric *perf) override; + void ComputeGradient(int flag) override; int partition_dim() const override { @@ -321,8 +251,8 @@ class SoftmaxLossLayer: public LossLayer { using Layer::ComputeGradient; void Setup(const LayerProto& proto, int npartitions) override; - void ComputeFeature(Phase phase, Metric *perf) override; - void ComputeGradient(Phase phase) override; + void ComputeFeature(int flag, Metric *perf) override; + void ComputeGradient(int flag) override; /** * softmax is not recommendeded for partition because it requires the whole @@ -349,7 +279,7 @@ class RGBImageLayer: public ParserLayer { using ParserLayer::ParseRecords; void Setup(const LayerProto& proto, int npartitions) override; - void ParseRecords(Phase phase, const vector& records, + void ParseRecords(int flag, const vector& records, Blob* blob) override; private: @@ -365,7 +295,7 @@ class ShardDataLayer: public DataLayer{ ~ShardDataLayer(); void Setup(const LayerProto& proto, int npartitions) override; - void ComputeFeature(Phase phase, Metric *perf) override; + void ComputeFeature(int flag, Metric *perf) override; private: DataShard* shard_; @@ -382,8 +312,8 @@ class SigmoidLayer: public Layer { using Layer::ComputeGradient; void Setup(const LayerProto& proto, int npartitions) override; - void ComputeFeature(Phase phase, Metric *perf) override; - void ComputeGradient(Phase phase) override; + void ComputeFeature(int flag, Metric *perf) override; + void ComputeGradient(int flag) override; }; /** @@ -397,8 +327,8 @@ class TanhLayer: public Layer { using Layer::ComputeGradient; void Setup(const LayerProto& proto, int npartitions) override; - void ComputeFeature(Phase phase, Metric *perf) override; - void ComputeGradient(Phase phase) override; + void ComputeFeature(int flag, Metric *perf) override; + void ComputeGradient(int flag) override; private: float outer_scale_, inner_scale_; diff --git a/include/trainer/worker.h b/include/trainer/worker.h index 86b1c90a8e..35ce77e736 100644 --- a/include/trainer/worker.h +++ b/include/trainer/worker.h @@ -192,15 +192,9 @@ class 
BPWorker: public Worker{ class CDWorker: public Worker{ public: - ~CDWorker() {} - void Init(int thread_id, int grp_id, int id) override; void TrainOneBatch(int step, Metric* perf) override; void TestOneBatch(int step, Phase phase, shared_ptr net, Metric* perf) override; - void PositivePhase(int step, shared_ptr net, Metric* perf); - void NegativePhase(int step, shared_ptr net, Metric* perf); - void GradientPhase(int step, shared_ptr net); - void LossPhase(int step, shared_ptr net, Metric* perf); }; inline int BlobTrgt(int grp, int layer) { diff --git a/include/utils/param.h b/include/utils/param.h index f7a0982f4f..0d24e9543b 100644 --- a/include/utils/param.h +++ b/include/utils/param.h @@ -74,13 +74,14 @@ class Param { static Param* Create(const ParamProto& proto); Param(); virtual ~Param() {} + void Init(const ParamProto& proto) { proto_ = proto; } /** * Setup param object * * @param conf param configuration, include learning rate multiplier etc. * @param shape one value per dimension */ - virtual void Setup(const ParamProto& conf, const std::vector& shape); + virtual void Setup(const std::vector& shape); /* * Fill the values according to init method, e.g., gaussian distribution. * diff --git a/src/driver.cc b/src/driver.cc index e5045a30d7..9fa4b860a7 100644 --- a/src/driver.cc +++ b/src/driver.cc @@ -1,9 +1,12 @@ -#include "singa.h" #include #include #include +#include "singa.h" + +#include "utils/tinydir.h" + namespace singa { void Driver::Init(int argc, char **argv) { @@ -89,6 +92,9 @@ void Driver::Submit(bool resume, const JobProto& jobConf) { if (singa_conf_.has_log_dir()) SetupLog(singa_conf_.log_dir(), std::to_string(job_id_) + "-" + jobConf.name()); + tinydir_dir workspace; + if (tinydir_open(&workspace, jobConf.cluster().workspace().c_str()) == -1) + LOG(FATAL) << "workspace does not exist: " << jobConf.cluster().workspace(); if (jobConf.num_openblas_threads() != 1) LOG(WARNING) << "openblas with " << jobConf.num_openblas_threads() << " threads"; diff --git a/src/neuralnet/base_layer.cc b/src/neuralnet/base_layer.cc index f995353363..46f8b57b59 100644 --- a/src/neuralnet/base_layer.cc +++ b/src/neuralnet/base_layer.cc @@ -1,5 +1,3 @@ -#include -#include #include #include #include @@ -24,14 +22,13 @@ void Layer::Setup(const LayerProto& proto, int npartitions) { layer_proto_ = proto; } -const string Layer::DebugString(int step, Phase phase) { +const string Layer::DebugString(int step, int flag) { string ret =StringPrintf("Layer %10s ", name().c_str()); - if(data_.count() != 0) - return ret; - if(phase == kForward) { - ret += StringPrintf("data %10s data norm1 %13.9f", data_.asum_data()); - }else if(phase == kBackward) { - ret += StringPrintf("grad norm1 %13.9f\n", grad_.asum_data()); + if ((flag & kForward) == kForward && data_.count() !=0) { + ret += StringPrintf("data norm1 %13.9f", data_.asum_data()); + } else if ((flag & kBackward) == kBackward) { + if (grad_.count() != 0) + ret += StringPrintf("grad norm1 %13.9f\n", grad_.asum_data()); for(Param* p: GetParams()) ret += StringPrintf("param id %2d, name %10s,\ value norm1 %13.9f, grad norm1 %13.9f\n", @@ -68,41 +65,41 @@ void ConcateLayer::Setup(const LayerProto& proto, int npartitions) { grad_.Reshape(shape); } -void ConcateLayer::ComputeFeature(Phase phase, Metric *perf){ +void ConcateLayer::ComputeFeature(int flag, Metric *perf){ LOG(FATAL) << "Not implemented for Concate Layer"; } -void ConcateLayer::ComputeGradient(Phase phase){ +void ConcateLayer::ComputeGradient(int flag){ LOG(FATAL) << "Not implemented for Concate 
Layer"; } /************* Implementation for ParserLayer ***********/ -void ParserLayer::ComputeFeature(Phase phase, Metric *perf){ +void ParserLayer::ComputeFeature(int flag, Metric *perf){ CHECK_EQ(srclayers_.size(),1); auto datalayer=static_cast(*srclayers_.begin()); - ParseRecords(phase, datalayer->records(), &data_); + ParseRecords(flag, datalayer->records(), &data_); } /************* Implementation for PrefetchLayer ***********/ -void PrefetchLayer::Prefetch(Phase phase){ +void PrefetchLayer::Prefetch(int flag){ //clock_t s=clock(); for(auto layer: sublayers_) - layer->ComputeFeature(phase, nullptr); + layer->ComputeFeature(flag, nullptr); //LOG(ERROR)<<(clock()-s)*1.0/CLOCKS_PER_SEC; } -void PrefetchLayer::ComputeFeature(Phase phase, Metric* perf){ +void PrefetchLayer::ComputeFeature(int flag, Metric* perf){ if(thread_.joinable()) thread_.join(); else{ - Prefetch(phase); + Prefetch(flag); } for(auto layer: sublayers_){ if(layer->is_parserlayer()) // TODO replace CopyFrom with Swap? datablobs_.at(layer->name()).CopyFrom(layer->data(this)); } - thread_=std::thread(&PrefetchLayer::Prefetch, this, phase); + thread_=std::thread(&PrefetchLayer::Prefetch, this, flag); } void PrefetchLayer::Setup(const LayerProto& proto, int npartitions) { @@ -133,7 +130,7 @@ void PrefetchLayer::Setup(const LayerProto& proto, int npartitions) { datablobs_[layer->name()]=Blob(layer->data(this).shape()); } -const Blob& PrefetchLayer::data(const Layer* from, Phase phase) const { +const Blob& PrefetchLayer::data(const Layer* from) const { LOG(FATAL) << " needs update"; if(from != nullptr) { return datablobs_.at(""); @@ -143,7 +140,7 @@ const Blob& PrefetchLayer::data(const Layer* from, Phase phase) const { } } -Blob* PrefetchLayer::mutable_data(const Layer* from, Phase phase) { +Blob* PrefetchLayer::mutable_data(const Layer* from) { LOG(FATAL) << " needs update"; if(from!=nullptr){ return &(datablobs_.at("")); @@ -194,7 +191,7 @@ int SliceLayer::SliceID(const Layer* layer) const { return -1; } -const Blob& SliceLayer::data(const Layer* layer, Phase phase) const { +const Blob& SliceLayer::data(const Layer* layer) const { if(layer==nullptr) return data_; return datavec_[SliceID(layer)]; @@ -204,7 +201,7 @@ const Blob& SliceLayer::grad(const Layer* layer) const { return grad_; return gradvec_[SliceID(layer)]; } -Blob* SliceLayer::mutable_data(const Layer* layer, Phase phase) { +Blob* SliceLayer::mutable_data(const Layer* layer) { if(layer==nullptr) return &data_; return &datavec_[SliceID(layer)]; @@ -214,7 +211,7 @@ Blob* SliceLayer::mutable_grad(const Layer* layer){ return &grad_; return &gradvec_[SliceID(layer)]; } -void SliceLayer::ComputeFeature(Phase phase, Metric *perf) { +void SliceLayer::ComputeFeature(int flag, Metric *perf) { CHECK_EQ(srclayers_.size(),1); if(slice_dim_==0){ const auto& blob=srclayers_.at(0)->data(this); @@ -226,7 +223,7 @@ void SliceLayer::ComputeFeature(Phase phase, Metric *perf) { } } } -void SliceLayer::ComputeGradient(Phase phase) { +void SliceLayer::ComputeGradient(int flag) { // LOG(FATAL) << "Not implemented"; } @@ -240,11 +237,11 @@ void SplitLayer::Setup(const LayerProto& proto, int npartitions) { grad_.Reshape(srclayers_[0]->data(this).shape()); } -void SplitLayer::ComputeFeature(Phase phase, Metric *perf) { +void SplitLayer::ComputeFeature(int flag, Metric *perf) { LOG(FATAL) << "Not implemented"; } -void SplitLayer::ComputeGradient(Phase phase) { +void SplitLayer::ComputeGradient(int flag) { LOG(FATAL) << "Not implemented"; } diff --git a/src/neuralnet/layer.cc 
b/src/neuralnet/layer.cc index b5c986eb0e..29a2312d6e 100644 --- a/src/neuralnet/layer.cc +++ b/src/neuralnet/layer.cc @@ -72,11 +72,11 @@ void ConvolutionLayer::Setup(const LayerProto& proto, int npartitions) { weight_ = Param::Create(proto.param(0)); bias_ = Param::Create(proto.param(1)); - weight_->Setup(proto.param(0), vector{num_filters_, col_height_}); - bias_->Setup(proto.param(1), vector{num_filters_}); + weight_->Setup(vector{num_filters_, col_height_}); + bias_->Setup(vector{num_filters_}); } -void ConvolutionLayer::ComputeFeature(Phase phase, Metric* perf){ +void ConvolutionLayer::ComputeFeature(int flag, Metric* perf){ auto src = Tensor4(srclayers_[0]->mutable_data(this)); auto data = Tensor3(&data_); auto col = Tensor2(&col_data_); @@ -93,7 +93,7 @@ void ConvolutionLayer::ComputeFeature(Phase phase, Metric* perf){ data+=broadcast<1>(bias, data.shape); } -void ConvolutionLayer::ComputeGradient(Phase phase) { +void ConvolutionLayer::ComputeGradient(int flag) { auto src = Tensor4(srclayers_[0]->mutable_data(this)); auto col = Tensor2(&col_data_); auto weight = Tensor2(weight_->mutable_data()); @@ -137,9 +137,9 @@ void DropoutLayer::Setup(const LayerProto& proto, int npartitions) { pdrop_ = proto.dropout_conf().dropout_ratio(); } -void DropoutLayer::ComputeFeature(Phase phase, Metric* perf) { +void DropoutLayer::ComputeFeature(int flag, Metric* perf) { // check training - if(phase != kTrain){//!training){ + if((flag & kTrain) != kTrain) { data_.CopyFrom(srclayers_[0]->data(this)); return; } @@ -152,7 +152,7 @@ void DropoutLayer::ComputeFeature(Phase phase, Metric* perf) { data = src * mask; } -void DropoutLayer::ComputeGradient(Phase phase) { +void DropoutLayer::ComputeGradient(int flag) { auto mask = Tensor1(&mask_); auto grad = Tensor1(&grad_); auto gsrc = Tensor1(srclayers_[0]->mutable_grad(this)); @@ -164,94 +164,69 @@ RBMVisLayer::~RBMVisLayer() { delete bias_; } -void RBMVisLayer::Setup(const LayerProto& proto, - int npartitions) { +void RBMVisLayer::Setup(const LayerProto& proto, int npartitions) { Layer::Setup(proto, npartitions); CHECK_EQ(srclayers_.size(), 2); - // hid_idx_: index indicating which srclayer is is hidden layer - // data_idx_: index indicating which srclayer is data layer - for (unsigned int i = 0; i < srclayers_.size(); i++) - for (unsigned int j = 0; j < (srclayers_[i]-> dstlayers()).size(); j++) - if (strcmp(((srclayers_[i]->dstlayers()).at(j)->name().c_str()), - (this->name()).c_str()) == 0) - hid_idx_ = i; - for (unsigned int i = 0; i < srclayers_.size(); i++) - if (i != static_cast(hid_idx_) ) - data_idx_ = i; - const auto& src = srclayers_[data_idx_]->data(this); - is_first_iteration_vis_ = true; + hid_layer_ = nullptr; + for (auto src : srclayers_) { + for (auto dst : src->srclayers()) { + if (dst->name() == name()) { + CHECK(hid_layer_ == nullptr); + hid_layer_ = static_cast(src); + } + } + } + input_layer_ = srclayers_[0] != hid_layer_ ? 
srclayers_[0]: srclayers_[1]; + const auto& src = input_layer_->data(this); batchsize_ = src.shape()[0]; - neg_batchsize_ = batchsize_; - /*gibbs sampling size and input have the same size*/ - vdim_ = src.count()/batchsize_; - hdim_ = proto.rbmvis_conf().num_output(); - data_.Reshape(vector{batchsize_, vdim_}); // this is visible dimension - vis_sample_.Reshape(vector{neg_batchsize_, vdim_}); + data_.ReshapeLike(src); // this is visible dimension + neg_data_.ReshapeLike(data_); + neg_sample_.ReshapeLike(data_); weight_ = Param::Create(proto.param(0)); bias_ = Param::Create(proto.param(1)); - weight_->Setup(proto.param(0), vector{hdim_, vdim_}); - bias_->Setup(proto.param(1), vector{vdim_}); -} - -void RBMVisLayer::ComputeFeature(Phase phase, Metric* perf) { - if (phase == kPositive) { /*positive phase*/ - auto data = Tensor2(&data_); - CHECK_EQ(srclayers_[data_idx_]->data(this).count(), batchsize_*vdim_); - auto src = Tensor2(srclayers_[data_idx_]->mutable_data(this)); - Copy(data, src); - } else if (phase == kNegative) { /*negative phase*/ - auto hid_sample = - Tensor2(srclayers_[hid_idx_]->mutable_data(this, kNegative)); - // fetch sampling results from hidden layer - auto vis_sample = Tensor2(&vis_sample_); - auto weight = Tensor2(weight_->mutable_data()); - auto bias = Tensor1(bias_->mutable_data()); - vis_sample = dot(hid_sample, weight); - vis_sample+=repmat(bias, neg_batchsize_); - vis_sample = F(vis_sample); + bias_->Setup(vector{src.count() / batchsize_}); +} +Blob* RBMVisLayer::Sample(int flag) { + Tensor sample, data; + if ((flag & kPositive) == kPositive) { + LOG(FATAL) << "RBMVisLayer can not be sampled for positive flag"; + } else { + data = Tensor2(&neg_data_); + sample = Tensor2(&neg_sample_); + } + auto random = TSingleton>::Instance(); + random->SampleBinary(sample, data); + return &neg_sample_; +} +void RBMVisLayer::ComputeFeature(int flag, Metric* perf) { + if ((flag & kPositive) == kPositive) { /*positive flag*/ + data_.CopyFrom(input_layer_->data(this), true); + } else if ((flag & kNegative) == kNegative) { /*negative flag*/ + auto hid_sample = Tensor2(hid_layer_->Sample(flag)); + // fetch sampling results from hidden layer + auto data = Tensor2(&neg_data_); + auto weight = Tensor2(weight_->mutable_data()); + auto bias = Tensor1(bias_->mutable_data()); + data = dot(hid_sample, weight); + data += repmat(bias, batchsize_); + data = F(data); + if ((flag & kTest) == kTest) { + const float *dptr = data_.cpu_data(), *rcns = neg_data_.cpu_data(); + float err = 0.f; + for (int i = 0; i < data_.count(); i++) { + err += (dptr[i] - rcns[i]) * (dptr[i] - rcns[i]); + } + perf->Add("Squared Error", err / batchsize_); } + } } -void RBMVisLayer::ComputeGradient(Phase phase) { - auto data = Tensor2(&data_); - auto hid_data = Tensor2(srclayers_[hid_idx_]->mutable_data(this, kPositive)); - auto vis_sample = Tensor2(&vis_sample_); - auto hid_sample = - Tensor2(srclayers_[hid_idx_]->mutable_data(this, kNegative)); - // fetch sampling results from hidden layer - auto gweight = Tensor2(weight_->mutable_grad()); - auto gbias = Tensor1(bias_->mutable_grad()); - gbias = sum_rows(vis_sample); - gbias -= sum_rows(data); - gweight = dot(hid_sample.T(), vis_sample); - gweight -= dot(hid_data.T(), data); - gbias*=(1.0f)/(1.0f*batchsize_); - gweight*=(1.0f)/(1.0f*batchsize_); -} - -void RBMVisLayer::ComputeLoss(Metric* perf) { - float loss_sqr = (0.0f); - CHECK_EQ(srclayers_[data_idx_]->data(this).count(), batchsize_*vdim_); - auto src = Tensor2(srclayers_[data_idx_]->mutable_data(this)); - auto 
hid_data = Tensor2(srclayers_[hid_idx_]->mutable_data(this, kPositive)); - // gibbs using u - auto weight = Tensor2(weight_->mutable_data()); - auto bias = Tensor1(bias_->mutable_data()); - Tensor reconstruct(Shape2(batchsize_, vdim_)); /*reconstruct error*/ - AllocSpace(reconstruct); - reconstruct = dot(hid_data, weight); - reconstruct+=repmat(bias, batchsize_); - reconstruct = F(reconstruct); - float *src_dptr = src.dptr; - for (int i = 0; i < vdim_*batchsize_; i++) { - int recon_row = i / vdim_; - int recon_col = i - recon_row * vdim_; - loss_sqr += (src_dptr[i] - reconstruct[recon_row][recon_col]) * - (src_dptr[i] - reconstruct[recon_row][recon_col]); - } - FreeSpace(reconstruct); - perf->Reset(); - perf->Add("sqr_reconstruct_error", loss_sqr); +void RBMVisLayer::ComputeGradient(int flag) { + auto vis_pos = Tensor2(&data_); + auto vis_neg = Tensor2(&neg_data_); + auto gbias = Tensor1(bias_->mutable_grad()); + gbias = sum_rows(vis_neg); + gbias -= sum_rows(vis_pos); } /**************** Implementation for RBMHidLayer********************/ RBMHidLayer::~RBMHidLayer() { @@ -263,84 +238,75 @@ void RBMHidLayer::Setup(const LayerProto& proto, int npartitions) { Layer::Setup(proto, npartitions); CHECK_EQ(srclayers_.size(), 1); - const auto& src_data = srclayers_[0]->data(this, kPositive); - const auto& src_sample = srclayers_[0]->data(this, kNegative); - scale_ = static_cast (1.0f); + const auto& src_data = srclayers_[0]->data(this); batchsize_ = src_data.shape()[0]; - neg_batchsize_ = src_sample.shape()[0]; vdim_ = src_data.count()/batchsize_; hdim_ = proto.rbmhid_conf().hid_dim(); gaussian_ = proto.rbmhid_conf().gaussian(); data_.Reshape(vector{batchsize_, hdim_}); - hid_sample_.Reshape(vector{neg_batchsize_, hdim_}); + neg_data_.ReshapeLike(data_); + sample_.ReshapeLike(data_); + neg_sample_.ReshapeLike(data_); weight_ = Param::Create(proto.param(0)); bias_ = Param::Create(proto.param(1)); - bias_->Setup(proto.param(1), vector{hdim_}); - weight_->Setup(proto.param(0), vector{hdim_, vdim_}); + bias_->Setup(vector{hdim_}); + weight_->Setup(vector{hdim_, vdim_}); + vis_layer_ = static_cast (srclayers_[0]); } -void RBMHidLayer::ComputeFeature(Phase phase, Metric* perf) { - if (phase == kPositive) { /*postive phase*/ - auto data = Tensor2(&data_); - - auto hid_sample = Tensor2(&hid_sample_); - - CHECK_EQ(srclayers_[0]->data(this, kPositive).count(), batchsize_*vdim_); - auto src = Tensor2(srclayers_[0]->mutable_data(this, kPositive)); - auto weight = Tensor2(weight_->mutable_data()); - auto bias = Tensor1(bias_->mutable_data()); - data = dot(src, weight.T()); - data += repmat(bias, batchsize_); - - if (!gaussian_) - data = F(data); +Blob* RBMHidLayer::Sample(int flag) { + Tensor sample, data; + if ((flag & kPositive) == kPositive) { + data = Tensor2(&data_); + sample = Tensor2(&sample_); + } else { + data = Tensor2(&neg_data_); + sample = Tensor2(&neg_sample_); + } + auto random = TSingleton>::Instance(); + if (gaussian_) { // first gibbs + random->SampleGaussian(sample, 0.0f, 1.0f); + sample += data; + } else { + random->SampleBinary(sample, data); + } + return (flag & kPositive) == kPositive ? 
&sample_ : &neg_sample_; +} - Copy(hid_sample, data); +void RBMHidLayer::ComputeFeature(int flag, Metric* perf) { + auto weight = Tensor2(weight_->mutable_data()); + auto bias = Tensor1(bias_->mutable_data()); - if (gaussian_) { // first gibbs - Tensor gaussian_sample(Shape2(batchsize_, hdim_)); - AllocSpace(gaussian_sample); - auto random = TSingleton>::Instance(); - random->SampleGaussian(gaussian_sample, 0.0f, 1.0f); - hid_sample += gaussian_sample; - FreeSpace(gaussian_sample); - } else { - TSingleton>::Instance()->SampleBinary(hid_sample); - } + Tensor data, src; + if ((flag & kPositive) == kPositive) { /*postive flag*/ + data = Tensor2(&data_); + src = Tensor2(vis_layer_->mutable_data(this)); + } else { + data = Tensor2(&neg_data_); + src = Tensor2(vis_layer_->Sample(flag)); + } + data = dot(src, weight.T()); + data += repmat(bias, batchsize_); - } else if (phase == kNegative) { /*negative phase*/ - CHECK_EQ(srclayers_[0]->data(this, kNegative).count(), - neg_batchsize_*vdim_); - auto src_sample = Tensor2(srclayers_[0]->mutable_data(this, kNegative)); - auto hid_sample = Tensor2(&hid_sample_); - auto bias = Tensor1(bias_->mutable_data()); - auto weight = Tensor2(weight_->mutable_data()); - hid_sample = dot(src_sample, weight.T()); - hid_sample += repmat(bias, neg_batchsize_); - if (!gaussian_) - hid_sample = F(hid_sample); - } else if (phase == kLoss) { /*test phase*/ - auto data = Tensor2(&data_); // data: sigmoid(Wv+b) - if (gaussian_) { - Tensor gaussian_sample(Shape2(batchsize_, hdim_)); - AllocSpace(gaussian_sample); - auto random = TSingleton>::Instance(); - random->SampleGaussian(gaussian_sample, 0.0f, 1.0f); - data += gaussian_sample; - FreeSpace(gaussian_sample); - } - else - TSingleton>::Instance()->SampleBinary(data); - } + if (!gaussian_) + data = F(data); } -void RBMHidLayer::ComputeGradient(Phase phase) { - auto data = Tensor2(&data_); - auto hid_sample = Tensor2(&hid_sample_); +void RBMHidLayer::ComputeGradient(int flag) { + auto hid_pos = Tensor2(&data_); + auto hid_neg = Tensor2(&neg_data_); + auto vis_pos = Tensor2(vis_layer_->mutable_data(this)); + auto vis_neg = Tensor2(vis_layer_->mutable_data(this)); + auto gbias = Tensor1(bias_->mutable_grad()); - gbias = sum_rows(hid_sample); - gbias -= sum_rows(data); - gbias *= scale_/(1.0f*batchsize_); + gbias = sum_rows(hid_neg); + gbias -= sum_rows(hid_pos); + gbias /= batchsize_; + + auto gweight = Tensor2(weight_->mutable_grad()); + gweight = dot(hid_neg.T(), vis_neg); + gweight -= dot(hid_pos.T(), vis_pos); + gweight /= batchsize_; } /*********** Implementation for InnerProductLayer**********/ InnerProductLayer::~InnerProductLayer() { @@ -362,13 +328,13 @@ void InnerProductLayer::Setup(const LayerProto& proto, int npartitions) { weight_ = Param::Create(proto.param(0)); bias_ = Param::Create(proto.param(1)); if (transpose_) - weight_->Setup(proto.param(0), vector{vdim_, hdim_}); + weight_->Setup(vector{vdim_, hdim_}); else - weight_->Setup(proto.param(0), vector{hdim_, vdim_}); - bias_->Setup(proto.param(1), vector{hdim_}); + weight_->Setup(vector{hdim_, vdim_}); + bias_->Setup(vector{hdim_}); } -void InnerProductLayer::ComputeFeature(Phase phase, Metric* perf) { +void InnerProductLayer::ComputeFeature(int flag, Metric* perf) { auto data = Tensor2(&data_); auto src = Tensor2(srclayers_[0]->mutable_data(this)); auto weight = Tensor2(weight_->mutable_data()); @@ -381,7 +347,7 @@ void InnerProductLayer::ComputeFeature(Phase phase, Metric* perf) { data+=repmat(bias, batchsize_); } -void 
InnerProductLayer::ComputeGradient(Phase phas) { +void InnerProductLayer::ComputeGradient(int phas) { auto src = Tensor2(srclayers_[0]->mutable_data(this)); auto grad = Tensor2(&grad_); auto weight = Tensor2(weight_->mutable_data()); @@ -411,7 +377,7 @@ void LabelLayer::Setup(const LayerProto& proto, int npartitions){ data_.Reshape(vector{batchsize}); } -void LabelLayer::ParseRecords(Phase phase, const vector& records, +void LabelLayer::ParseRecords(int flag, const vector& records, Blob* blob){ int rid=0; float *label= blob->mutable_cpu_data() ; @@ -442,7 +408,7 @@ void LRNLayer::Setup(const LayerProto& proto, int npartitions) { width_=s[3]; } -void LRNLayer::ComputeFeature(Phase phase, Metric* perf) { +void LRNLayer::ComputeFeature(int flag, Metric* perf) { const float salpha = alpha_ / lsize_; auto src = Tensor4(srclayers_[0]->mutable_data(this)); auto data = Tensor4(&data_); @@ -452,7 +418,7 @@ void LRNLayer::ComputeFeature(Phase phase, Metric* perf) { data = src * F(norm, -beta_ ); } -void LRNLayer::ComputeGradient(Phase phase) { +void LRNLayer::ComputeGradient(int flag) { const float salpha = alpha_ / lsize_; auto src = Tensor4(srclayers_[0]->mutable_data(this)); auto norm = Tensor4(&norm_); @@ -466,8 +432,10 @@ void LRNLayer::ComputeGradient(Phase phase) { /**************** Implementation for MnistImageLayer******************/ -void MnistLayer::ParseRecords(Phase phase, +void MnistLayer::ParseRecords(int flag, const vector& records, Blob* blob){ + if ((flag & kForward) == 0) + return; LOG_IF(ERROR, records.size()==0)<<"Empty records to parse"; int ndim=records.at(0).image().shape_size(); int inputsize =records.at(0).image().shape(ndim-1); @@ -554,7 +522,7 @@ void PoolingLayer::Setup(const LayerProto& proto, int npartitions) { grad_.ReshapeLike(data_); } -void PoolingLayer::ComputeFeature(Phase phase, Metric* perf) { +void PoolingLayer::ComputeFeature(int flag, Metric* perf) { auto src = Tensor4(srclayers_[0]->mutable_data(this)); auto data = Tensor4(&data_); if(pool_ == PoolingProto_PoolMethod_MAX) @@ -567,7 +535,7 @@ void PoolingLayer::ComputeFeature(Phase phase, Metric* perf) { * partition only on num/channel dim * assume grad and data have the same paritition */ -void PoolingLayer::ComputeGradient(Phase phase) { +void PoolingLayer::ComputeGradient(int flag) { auto src = Tensor4(srclayers_[0]->mutable_data(this)); auto gsrc = Tensor4(srclayers_[0]->mutable_grad(this)); auto data = Tensor4(&data_); @@ -587,13 +555,13 @@ void ReLULayer::Setup(const LayerProto& proto, int npartitions) { grad_.ReshapeLike(*(srclayers_[0]->mutable_grad(this))); } -void ReLULayer::ComputeFeature(Phase phase, Metric* perf) { +void ReLULayer::ComputeFeature(int flag, Metric* perf) { auto data = Tensor1(&data_); auto src = Tensor1(srclayers_[0]->mutable_data(this)); data=F(src); } -void ReLULayer::ComputeGradient(Phase phase) { +void ReLULayer::ComputeGradient(int flag) { auto data = Tensor1(&data_); auto grad = Tensor1(&grad_); auto gsrc = Tensor1(srclayers_[0]->mutable_grad(this)); @@ -602,8 +570,11 @@ void ReLULayer::ComputeGradient(Phase phase) { /*************** Implementation for RGBImageLayer *************************/ -void RGBImageLayer::ParseRecords(Phase phase, +void RGBImageLayer::ParseRecords(int flag, const vector& records, Blob* blob){ + if ((flag & kForward) == 0) + return; + const vector& s=blob->shape(); auto images = Tensor4(&data_); const SingleLabelImageRecord& r=records.at(0).image(); @@ -617,8 +588,8 @@ void RGBImageLayer::ParseRecords(Phase phase, const float* 
meandptr=mean_.cpu_data(); for(const Record& record: records){ auto image=images[rid]; - bool do_crop=cropsize_>0&&(phase == kTrain); - bool do_mirror=mirror_&&rand()%2&&(phase == kTrain); + bool do_crop = cropsize_ > 0 && ((flag & kTrain) == kTrain); + bool do_mirror = mirror_ && rand() % 2 && ((flag & kTrain) == kTrain); float* dptr=nullptr; if(do_crop||do_mirror) dptr=raw_image.dptr; @@ -697,7 +668,10 @@ void RGBImageLayer::Setup(const LayerProto& proto, int npartitions) { } /***************Implementation for ShardDataLayer**************************/ -void ShardDataLayer::ComputeFeature(Phase phase, Metric* perf){ +void ShardDataLayer::ComputeFeature(int flag, Metric* perf){ + if ((flag & kForward) == 0) + return; + if (shard_ == nullptr) shard_ = new DataShard(layer_proto_.sharddata_conf().path(), DataShard::kRead); @@ -747,13 +721,13 @@ void SigmoidLayer::Setup(const LayerProto& proto, int npartitions) { grad_.ReshapeLike(srclayers_[0]->grad(this)); } -void SigmoidLayer::ComputeFeature(Phase phase, Metric* perf) { +void SigmoidLayer::ComputeFeature(int flag, Metric* perf) { auto data = Tensor1(&data_); auto src = Tensor1(srclayers_[0]->mutable_data(this)); data = F(src); } -void SigmoidLayer::ComputeGradient(Phase phase) { +void SigmoidLayer::ComputeGradient(int flag) { auto data = Tensor1(&data_); auto grad = Tensor1(&grad_); auto gsrc = Tensor1(srclayers_[0]->mutable_grad(this)); @@ -766,13 +740,13 @@ void TanhLayer::Setup(const LayerProto& proto, int npartitions){ grad_.ReshapeLike(srclayers_[0]->grad(this)); } -void TanhLayer::ComputeFeature(Phase phase, Metric* perf) { +void TanhLayer::ComputeFeature(int flag, Metric* perf) { auto data = Tensor1(&data_); auto src = Tensor1(srclayers_[0]->mutable_data(this)); data=F(src); } -void TanhLayer::ComputeGradient(Phase phase) { +void TanhLayer::ComputeGradient(int flag) { auto data = Tensor1(&data_); auto grad = Tensor1(&grad_); auto gsrc = Tensor1(srclayers_[0]->mutable_grad(this)); @@ -787,7 +761,7 @@ void EuclideanLossLayer::Setup(const LayerProto& proto, int npartitions) { dim_ = data_.count()/batchsize_; metric_.Reshape(vector{1}); } -void EuclideanLossLayer::ComputeFeature(Phase phase, Metric* perf) { +void EuclideanLossLayer::ComputeFeature(int flag, Metric* perf) { const float* reconstruct_dptr = srclayers_[0]->data(this).cpu_data(); const float* input_dptr = srclayers_[1]->data(this).cpu_data(); float loss = 0; @@ -805,7 +779,7 @@ void EuclideanLossLayer::ComputeFeature(Phase phase, Metric* perf) { srclayers_[1]->data(this).cpu_data() + (batchsize_*dim_)); perf->Add("loss", loss/(1.0f*batchsize_)); } -void EuclideanLossLayer::ComputeGradient(Phase phase) { +void EuclideanLossLayer::ComputeGradient(int flag) { const float* reconstruct_dptr = srclayers_[0]->data(this).cpu_data(); const float* input_dptr = srclayers_[1]->data(this).cpu_data(); Blob* gsrcblob = srclayers_[0]->mutable_grad(this); @@ -828,7 +802,7 @@ void SoftmaxLossLayer::Setup(const LayerProto& proto, int npartitions) { metric_.Reshape(vector{2}); scale_=proto.softmaxloss_conf().scale(); } -void SoftmaxLossLayer::ComputeFeature(Phase phase, Metric* perf) { +void SoftmaxLossLayer::ComputeFeature(int flag, Metric* perf) { Shape<2> s=Shape2(batchsize_, dim_); Tensor prob(data_.mutable_cpu_data(), s); Tensor src(srclayers_[0]->mutable_data(this)->mutable_cpu_data(), s); @@ -863,7 +837,7 @@ void SoftmaxLossLayer::ComputeFeature(Phase phase, Metric* perf) { perf->Add("accuracy", precision*scale_/(1.0f*batchsize_)); } -void SoftmaxLossLayer::ComputeGradient(Phase phase) { 
diff --git a/src/proto/job.proto b/src/proto/job.proto
index 7f030acf6e..1c79aeabba 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -21,18 +21,12 @@ message JobProto {
   required string name = 1;
   // neural net consits of a set of connected layers
   required NetProto neuralnet = 3;
-  // algorithms calculating gradients for one mini-batch/iteration
-  optional TrainOneBatchAlg alg = 5 [default = kUserAlg];
-  // user defined algorithm
-  optional string user_alg = 6;
+  // algorithm for computing gradients over one mini-batch
+  required AlgProto train_one_batch = 5;
   // configuration of SGD updater, including learning rate, etc.
   required UpdaterProto updater = 7;
   // cluster toplogy conf
   required ClusterProto cluster = 9;
-
-  // for setting CD fields
-  optional CDProto cd_conf = 12;
-
   // total num of steps for training
   required int32 train_steps = 16;
   // frequency of displaying training info
@@ -86,6 +80,16 @@
 // Protos used by JobProto
 // -----------------------

+message AlgProto {
+  // algorithms calculating gradients for one mini-batch/iteration
+  optional AlgType alg = 1 [default = kUserAlg];
+  // user defined algorithm
+  optional string user_alg = 2;
+  // for setting CD fields
+  optional CDProto cd_conf = 10;
+
+  extensions 101 to 200;
+}
 message NetProto {
   repeated LayerProto layer = 1;
   // partitioning type for parallelism
@@ -140,7 +144,7 @@ message ClusterProto {

 message CDProto {
   //number of steps for gibbs sampling
-  optional int32 pcd_k = 1 [default = 1];
+  optional int32 cd_k = 1 [default = 1];
 }

 message LayerProto {
@@ -182,8 +186,6 @@ message LayerProto {
   optional PrefetchProto prefetch_conf = 44;
   // configuration for rbmhid layer
   optional RBMHidProto rbmhid_conf = 49;
-  // configuration for rbmvis layer
-  optional RBMVisProto rbmvis_conf = 48;
   // configuration for rectified linear unit layer
   optional ReLUProto relu_conf = 38;
   // configuration for rgb image parser layer
@@ -365,11 +367,6 @@ message DropoutProto {
   optional float dropout_ratio = 30 [default = 0.5];
 }

-message RBMVisProto {
-  optional int32 num_output = 1;  // The number of outputs for the layer
-  optional bool bias_term = 2 [default = true];  // whether to have bias terms
-}
-
 message RBMHidProto {
   optional int32 hid_dim = 1;  // The number of outputs for the layer
   optional bool bias_term = 2 [default = true];  // whether to have bias terms
@@ -559,16 +556,16 @@ enum PartitionType {
 }

 enum Phase {
-  kTrain = 0;
-  kValidation = 1;
-  kTest= 2;
+  kTrain = 1;
+  kValidation = 2;
+  kTest= 4;
   // postivie phase for contrastive divergence algorithm
-  kPositive = 3;
+  kPositive = 8;
   // negative phase for contrastive divergence algorithm
-  kNegative = 4;
-  kForward = 5;
-  kBackward = 6;
-  kLoss = 7;
+  kNegative = 16;
+  kForward = 32;
+  kBackward = 64;
+  kLoss = 128;
 }

 enum ParamType {
@@ -578,7 +575,7 @@ enum ParamType {
   kUser = 103;
 }

-enum TrainOneBatchAlg {
+enum AlgType {
   // Back-propagation algorithm for feed-forward models, e.g., CNN and RNN
   kBP = 1;
   // Contrastive Divergence algorithm for RBM, DBM, etc.
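With JobProto.train_one_batch in place, a job configuration selects the gradient algorithm through a nested AlgProto block instead of the old top-level alg field. A contrastive-divergence job would be configured roughly as in the sketch below; only cd_k is defined by CDProto, and the kCD value name of AlgType is an assumption here:

    # sketch of a CD configuration under the new AlgProto
    train_one_batch {
      alg: kCD          # assumed name of the contrastive-divergence AlgType value
      cd_conf {
        cd_k: 1         # number of Gibbs sampling steps per mini-batch
      }
    }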
diff --git a/src/trainer/worker.cc b/src/trainer/worker.cc
index f112b173c2..a22a8effcf 100644
--- a/src/trainer/worker.cc
+++ b/src/trainer/worker.cc
@@ -13,12 +13,14 @@ using std::thread;
 Worker* Worker::Create(const JobProto& proto) {
   auto factory = Singleton<Factory<Worker>>::Instance();
   Worker* worker = nullptr;
-  if (proto.has_user_alg())
-    worker = factory->Create(proto.user_alg());
+  const auto& conf = proto.train_one_batch();
+  if (conf.has_user_alg())
+    worker = factory->Create(conf.user_alg());
   else
-    worker = factory->Create(proto.alg());
+    worker = factory->Create(conf.alg());
   return worker;
 }
+
 void Worker::Init(int thread_id, int grp_id, int id) {
   thread_id_ = thread_id;
   grp_id_ = grp_id;
@@ -63,7 +65,7 @@ void Worker::InitLocalParams() {
   // the param from previous checkpoint files will be overwritten by
   // the param with the same name in later checkpoint files.
   for (const auto checkpoint : job_conf_.checkpoint_path()) {
-    LOG(INFO) << "Load from checkpoint file " << checkpoint;
+    LOG(ERROR) << "Load from checkpoint file " << checkpoint;
     BlobProtos bps;
     ReadProtoFromBinaryFile(checkpoint.c_str(), &bps);
     for (int i = 0; i < bps.name_size(); i++) {
@@ -342,11 +344,11 @@ void BPWorker::Forward(
           Collect(p, step);
         }
       }
-      layer->ComputeFeature(phase, perf);
+      layer->ComputeFeature(phase | kForward, perf);
       if (layer->is_bridgesrclayer())  // send data to other workers
         SendBlobs(true, false, static_cast(layer), net);
       if (DisplayDebugInfo(step))
-        LOG(INFO) << layer->DebugString(step, kForward);
+        LOG(INFO) << layer->DebugString(step, phase | kForward);
     }
   }
 }
@@ -359,9 +361,9 @@ void BPWorker::Backward(int step, shared_ptr<NeuralNet> net) {
       if(layer->is_bridgesrclayer()) {
        // ReceiveBlobs(false, true, layer, net);
       }
-      layer->ComputeGradient(kTrain);
+      layer->ComputeGradient(kTrain | kBackward);
       if (DisplayDebugInfo(step))
-        LOG(INFO) << layer->DebugString(step, kBackward);
+        LOG(INFO) << layer->DebugString(step, kTrain | kBackward);
       for (Param* p : layer->GetParams())
         Update(p, step);
       if (layer->is_bridgedstlayer()) {
@@ -381,72 +383,34 @@ void BPWorker::TestOneBatch(int step, Phase phase,
   Forward(step, phase, net, perf);
 }

 /****************************CDWorker**********************************/
-void CDWorker::Init(int thread_id, int group_id, int worker_id) {
-  Worker::Init(thread_id, group_id, worker_id);
-}
-
-void CDWorker::PositivePhase(int step,
-    shared_ptr<NeuralNet> net, Metric* perf) {
-  auto& layers = net->layers();
-  // LOG(ERROR)<<"Positive Phase";
-  for (auto& layer : layers) {
-    for (Param* p : layer->GetParams()) {  // wait until param is updated
+void CDWorker::TrainOneBatch(int step, Metric* perf) {
+  const auto& layers = train_net_->layers();
+  for (auto* layer : layers) {
+    for (Param* p : layer->GetParams())  // wait until param is updated
       Collect(p, step);
-    }
-    layer->ComputeFeature(kPositive, perf);
+    layer->ComputeFeature(kPositive | kForward, perf);
   }
-}
-
-void CDWorker::NegativePhase(int step,
-    shared_ptr<NeuralNet> net, Metric* perf) {
-// for negative phase, gibbs sampling only concerns RBM bottom and top layer
-  auto& layers = net->layers();
-  // LOG(ERROR)<<"Negative Phase";
-  for (auto& layer : layers) {
-    if (layer->is_vislayer() || layer->is_hidlayer()) {
-      layer->ComputeFeature(kNegative, perf);
-    }
-  }
-}
-
-void CDWorker::GradientPhase(int step, shared_ptr<NeuralNet> net) {
-  auto& layers = net->layers();
-  // LOG(ERROR)<<"Gradient Phase";
-  for (auto& layer : layers) {
-    if (layer->is_vislayer() || layer->is_hidlayer()) {
-      layer->ComputeGradient(kTrain);
-      for (Param* p : layer->GetParams()) {
-        Update(p, step);
-      }
+  for (auto* layer : layers)
+    layer->ComputeFeature(kNegative | kTest, perf);
+  for (int i = 1; i < job_conf_.train_one_batch().cd_conf().cd_k(); i++) {
+    for (auto* layer : layers) {
+      layer->ComputeFeature(kNegative, perf);
+    }
     }
   }
-}
-
-void CDWorker::LossPhase(int step, shared_ptr<NeuralNet> net, Metric* perf) {
-  auto& layers = net->layers();
-  // LOG(ERROR)<<"Loss Phase";
-  for (auto& layer : layers) {
-    if (layer->is_hidlayer()) {
-      layer->ComputeFeature(kLoss, perf);
+  for (auto* layer : layers) {
+    layer->ComputeGradient(kTrain);
+    for (Param* p : layer->GetParams()) {
+      Update(p, step);
     }
   }
-  for (auto& layer : layers) {
-    if (layer->is_vislayer()) {
-      layer->ComputeLoss(perf);
-    }
-  }
-}
-
-void CDWorker::TrainOneBatch(int step, Metric* perf) {
-  PositivePhase(step, train_net_, perf);
-  NegativePhase(step, train_net_, perf);
-  GradientPhase(step, train_net_);
-  LossPhase(step, train_net_, perf);
 }

 void CDWorker::TestOneBatch(int step, Phase phase,
-    shared_ptr<NeuralNet> net, Metric* perf) {
-  PositivePhase(step, test_net_, perf);
-  LossPhase(step, test_net_, perf);
+    shared_ptr<NeuralNet> net, Metric* perf) {
+  auto& layers = net->layers();
+  for (auto layer : layers)
+    layer->ComputeFeature(kPositive | kForward, perf);
+  for (auto layer : layers)
+    layer->ComputeFeature(kNegative | kTest, perf);
 }
 }  // namespace singa
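The rewritten CDWorker::TrainOneBatch above folds the old Positive, Negative, Gradient and Loss phases into a single pass driven by the phase flags: one positive pass with kPositive | kForward, one reconstruction pass with kNegative | kTest, cd_k - 1 further Gibbs steps with kNegative, and finally ComputeGradient(kTrain) followed by a parameter update for every layer. A compact sketch of that schedule using stub layers; StubLayer and TrainOneBatchCD are illustrative names, not part of the patch:

    // Sketch of the CD-k schedule in CDWorker::TrainOneBatch, with stub layers.
    #include <cstdio>
    #include <vector>

    enum Phase { kTrain = 1, kTest = 4, kPositive = 8, kNegative = 16, kForward = 32 };

    struct StubLayer {   // stands in for the RBM layers; illustrative only
      void ComputeFeature(int flag) { std::printf("feature, flag=%d\n", flag); }
      void ComputeGradient(int flag) { std::printf("gradient, flag=%d\n", flag); }
    };

    void TrainOneBatchCD(std::vector<StubLayer>& layers, int cd_k) {
      for (auto& l : layers) l.ComputeFeature(kPositive | kForward);  // positive phase
      for (auto& l : layers) l.ComputeFeature(kNegative | kTest);     // first reconstruction
      for (int i = 1; i < cd_k; i++)                                  // remaining Gibbs steps
        for (auto& l : layers) l.ComputeFeature(kNegative);
      for (auto& l : layers) l.ComputeGradient(kTrain);               // CD gradient, then update
    }

    int main() {
      std::vector<StubLayer> net(2);
      TrainOneBatchCD(net, 1);   // cd_k = 1 gives the default CD-1 schedule
      return 0;
    }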
diff --git a/src/utils/common.cc b/src/utils/common.cc
index 1888380e53..d13faeacf9 100644
--- a/src/utils/common.cc
+++ b/src/utils/common.cc
@@ -40,7 +40,7 @@ string IntVecToString(const vector<int>& vec) {
  *
  * Formatted string.
  * */
 string VStringPrintf(string fmt, va_list l) {
-  char buffer[32768];
+  char buffer[4096];
   vsnprintf(buffer, sizeof(buffer), fmt.c_str(), l);
   return string(buffer);
 }
diff --git a/src/utils/param.cc b/src/utils/param.cc
index 67f14ab04b..69f697b4a6 100644
--- a/src/utils/param.cc
+++ b/src/utils/param.cc
@@ -73,17 +73,18 @@ Param* Param::Create(const ParamProto& proto) {
     p = factory->Create(proto.user_type());
   else
     p = factory->Create(proto.type());
+  p->Init(proto);
   return p;
 }

 Param::Param():local_version_(-1), slice_start_(0), num_slices_(0),
     num_pending_requests_(0), data_(nullptr) {
 }
-void Param::Setup(const ParamProto& proto, const vector<int>& shape) {
+
+void Param::Setup(const vector<int>& shape) {
   data_ = std::make_shared<Blob<float>>(shape);
   grad_.Reshape(shape);
   history_.Reshape(shape);
-  proto_.CopyFrom(proto);
 }

 void Param::AddSlice(int slice_id, int size) {
@@ -178,7 +179,8 @@ Msg* Param::HandlePutMsg(Msg** msg, bool reserve) {
   proto.set_lr_scale(lr);
   proto.set_wd_scale(wc);
   vector<int> shape{size};
-  Setup(proto, shape);
+  Init(proto);
+  Setup(shape);
   if (ptr == nullptr) {
     CHECK((*msg)->NextFrame());
     CHECK_EQ(size* sizeof(float), (*msg)->FrameSize());
@@ -298,6 +300,8 @@ void Param::ShareFrom(const Param& other) {
         other.data_->shape().begin()));
   }
   data_ = other.data_;
+  if (grad_.count() == 0)
+    grad_.Reshape(data_->shape());
   slice_offset_ = other.slice_offset_;
   slice_size_ = other.slice_size_;
   slice_start_ = other.slice_start_;
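After the param.cc changes above, a Param is configured in two steps: Param::Create builds the object and calls Init(proto), and the shape-only Setup allocates the data, gradient and history blobs, exactly as HandlePutMsg now does. A rough usage sketch under those assumptions; the include path, the shape and the helper name are illustrative:

    // Sketch of the new two-step Param construction (Create + shape-only Setup).
    #include <vector>
    #include "utils/param.h"   // assumed include path within the SINGA source tree

    singa::Param* CreateWeightParam(const singa::ParamProto& proto) {
      singa::Param* p = singa::Param::Create(proto);  // factory also calls p->Init(proto)
      p->Setup(std::vector<int>{784, 1000});          // hypothetical weight matrix shape
      return p;                                       // data_, grad_ and history_ are now allocated
    }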