From 21f5d32ecc76c0edca9f352a146e33f0c1e8b04b Mon Sep 17 00:00:00 2001 From: sabreshao Date: Fri, 25 May 2018 11:34:43 +0800 Subject: [PATCH] Draft of MIOpen optimization. --- paddle/fluid/operators/conv_cudnn_op.cu.cc | 90 +++++++++++++++++++++- paddle/fluid/operators/conv_op.cc | 18 +++++ python/paddle/fluid/layers/nn.py | 20 ++++- 3 files changed, 126 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 193f776e760d2a..ecf44ebc9d3ae2 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/miopen_helper.h" #include "paddle/fluid/platform/float16.h" +#include namespace paddle { namespace operators { @@ -43,6 +44,9 @@ class CUDNNConvOpKernel : public framework::OpKernel { auto* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); auto* output = ctx.Output("Output"); + auto* alg = ctx.Input("Algorithm"); + auto* algOut = ctx.Output("AlgorithmOut"); + algOut->mutable_data(platform::CPUPlace()); std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); @@ -126,13 +130,45 @@ class CUDNNConvOpKernel : public framework::OpKernel { ScalingParamType alpha = 1.0f, beta = 0.0f; miopenConvAlgoPerf_t perfRes; int algoCount = 0; + + VLOG(3) << "X Tensor: " << input->dims()[0] << " " << input->dims()[1] << " " << input->dims()[2] << " " << input->dims()[3]; + VLOG(3) << "W Tensor: " << filter->dims()[0] << " " << filter->dims()[1] << " " << filter->dims()[2] << " " << filter->dims()[3]; + VLOG(3) << "Y Tensor: " << output->dims()[0] << " " << output->dims()[1] << " " << output->dims()[2] << " " << output->dims()[3]; + VLOG(3) << "ctx: " << &ctx << " op: " << &ctx.op() << " scope: " << &ctx.scope(); + VLOG(3) << "alg: " << alg << " get alg str: " << ctx.op().Input("Algorithm"); + VLOG(3) << "get alg ptr: " << ctx.scope().FindVar(ctx.op().Input("Algorithm")); + VLOG(3) << "Input: " << alg->data() << " Output: " << algOut->mutable_data(platform::CPUPlace()); + int pre_alg = (alg->data())[0]; + if (pre_alg == 0) + { + PADDLE_ENFORCE(platform::dynload::miopenFindConvolutionForwardAlgorithm( + handle, cudnn_input_desc, input_data, + cudnn_filter_desc, filter_data, + cudnn_conv_desc, cudnn_output_desc, output_data, + 1, &algoCount, &perfRes, cudnn_workspace, workspace_size_in_bytes, false)); + (algOut->data())[0] = (int)(perfRes.fwd_algo) + 1; + VLOG(3) << "Find Kernel: store " << (algOut->data()) << " kernel :" << perfRes.fwd_algo; + } + else + { + perfRes.fwd_algo = (miopenConvFwdAlgorithm_t)(pre_alg - 1); + //((algOut->mutable_data(platform::CPUPlace()))[0] - 1); + VLOG(3) << "Find Kernel: load " << (alg->data()) << " kernel :" << perfRes.fwd_algo; + } + for (int i = 0; i < groups; i++) { // ------------------- cudnn conv algorithm --------------------- +#if 0 + struct timeval before, after; + gettimeofday(&before, nullptr); PADDLE_ENFORCE(platform::dynload::miopenFindConvolutionForwardAlgorithm( handle, cudnn_input_desc, input_data + i * group_offset_in, cudnn_filter_desc, filter_data + i * group_offset_filter, - cudnn_conv_desc, cudnn_output_desc, output_data + i * group_offset_out, + cudnn_conv_desc, cudnn_output_desc, output_data + i * group_offset_out, 1, &algoCount, &perfRes, cudnn_workspace, workspace_size_in_bytes, false)); + gettimeofday(&after, nullptr); + VLOG(3) << "miopenFindConvolutionForwardAlgorithm: takes "<< (after.tv_sec - before.tv_sec) * 1000000 + after.tv_usec - before.tv_usec << " us"; +#endif // ------------------- cudnn conv forward --------------------- PADDLE_ENFORCE(platform::dynload::miopenConvolutionForward( handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, @@ -157,6 +193,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { auto output_grad = ctx.Input(framework::GradVarName("Output")); auto input_grad = ctx.Output(framework::GradVarName("Input")); auto filter_grad = ctx.Output(framework::GradVarName("Filter")); + //auto* alg = ctx.Input("Algorithm"); + //auto* algOut = ctx.Output("AlgorithmOut"); const T* input_data = input->data(); const T* output_grad_data = output_grad->data(); @@ -268,8 +306,29 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. +#if 0 + if ((algOut->mutable_data(platform::CPUPlace()))[1] == 0) + { + PADDLE_ENFORCE(platform::dynload::miopenFindConvolutionBackwardDataAlgorithm( + handle, + cudnn_output_grad_desc, output_grad_data, + cudnn_filter_desc, filter_data, + cudnn_conv_desc, + cudnn_input_desc, input_grad_data, + 1, &algoCount, &perfRes, cudnn_workspace, workspace_size_in_bytes, false)); + (algOut->mutable_data(platform::CPUPlace()))[1] = (int)(perfRes.bwd_data_algo) + 1; + } + else + { + perfRes.bwd_data_algo = (miopenConvBwdDataAlgorithm_t) + ((algOut->mutable_data(platform::CPUPlace()))[1] - 1); + } +#endif for (int i = 0; i < groups; i++) { +#if 1 + struct timeval before, after; + gettimeofday(&before, nullptr); PADDLE_ENFORCE(platform::dynload::miopenFindConvolutionBackwardDataAlgorithm( handle, cudnn_output_grad_desc, output_grad_data + i * group_offset_out, @@ -277,6 +336,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { cudnn_conv_desc, cudnn_input_desc, input_grad_data + i * group_offset_in, 1, &algoCount, &perfRes, cudnn_workspace, workspace_size_in_bytes, false)); + gettimeofday(&after, nullptr); + //VLOG(3) << "miopenFindConvolutionBackwardDataAlgorithm: takes "<< (after.tv_sec - before.tv_sec) * 1000000 + after.tv_usec - before.tv_usec << " us"; +#endif PADDLE_ENFORCE(platform::dynload::miopenConvolutionBackwardData( handle, &alpha, cudnn_output_grad_desc, output_grad_data + i * group_offset_out, @@ -290,7 +352,30 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { if (filter_grad) { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset filter_grad. +#if 0 + if ((algOut->mutable_data(platform::CPUPlace()))[2] == 0) + { + PADDLE_ENFORCE(platform::dynload::miopenFindConvolutionBackwardWeightsAlgorithm( + handle, + cudnn_output_grad_desc, output_grad_data, + cudnn_input_desc, input_data, + cudnn_conv_desc, + cudnn_filter_desc, filter_grad_data, + 1, &algoCount, &perfRes, + cudnn_workspace, workspace_size_in_bytes, false)); + (algOut->mutable_data(platform::CPUPlace()))[2] = (int)(perfRes.bwd_weights_algo) + 1; + } + else + { + perfRes.bwd_weights_algo = (miopenConvBwdWeightsAlgorithm_t) + ((algOut->mutable_data(platform::CPUPlace()))[2] - 1); + } +#endif + for (int i = 0; i < groups; i++) { +#if 1 + struct timeval before, after; + gettimeofday(&before, nullptr); PADDLE_ENFORCE(platform::dynload::miopenFindConvolutionBackwardWeightsAlgorithm( handle, cudnn_output_grad_desc, output_grad_data + i * group_offset_out, @@ -299,6 +384,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { cudnn_filter_desc, filter_grad_data + i * group_offset_filter, 1, &algoCount, &perfRes, cudnn_workspace, workspace_size_in_bytes, false)); + gettimeofday(&after, nullptr); + //VLOG(3) << "miopenFindConvolutionBackwardWeightsAlgorithm: takes "<< (after.tv_sec - before.tv_sec) * 1000000 + after.tv_usec - before.tv_usec << " us"; +#endif PADDLE_ENFORCE(platform::dynload::miopenConvolutionBackwardWeights( handle, &alpha, cudnn_output_grad_desc, output_grad_data + i * group_offset_out, diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index e3f39218f5db75..ee1f7916e53246 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -38,6 +38,16 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE(ctx->HasOutput("Output"), "Output(Output) of ConvOp should not be null."); +#if 0 + PADDLE_ENFORCE(ctx->HasInput("Algorithm"), + "Input(Algorithm) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("AlgorithmOut"), + "Output(AlgorithmOut) of ConvOp should not be null.");*/ + + PADDLE_ENFORCE_EQ(ctx->Inputs("Algorithm")[0], ctx->Outputs("Algorithm")[0], + "Algorithm and AlgorithmOut should share the same memory"); + ctx->SetOutputDim("AlgorithmOut", {3}); +#endif auto in_dims = ctx->GetInputDim("Input"); auto filter_dims = ctx->GetInputDim("Filter"); std::vector strides = ctx->Attrs().Get>("strides"); @@ -129,9 +139,13 @@ Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker) "H is the height of the filter, and W is the width of the filter. " "If the groups attribute is greater than 1, C equals the number of " "input image channels divided by the groups."); + AddInput("Algorithm", + "Selected algorithm for conv2d"); AddOutput("Output", "(Tensor) The output tensor of convolution operator. " "The format of output tensor is also NCHW."); + AddOutput("AlgorithmOut", + "Tuned algorithm for conv2d"); AddAttr>("strides", "(vector default:{1, 1}), the " "strides(h_stride, w_stride) of " @@ -225,9 +239,13 @@ Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker) "is the width of the filter." "If the groups attribute is greater than 1, C equals the number of " "input image channels divided by the groups."); + AddInput("Algorithm", + "Selected algorithm for conv3d"); AddOutput("Output", "(Tensor) The output tensor of convolution operator." "The format of output tensor is also NCDHW."); + AddOutput("AlgorithmOut", + "Tuned algorithm for conv3d"); AddAttr>("strides", "(vector, default:{1, 1, 1}), the " "strides(d_stride, h_stride, w_stride) of " diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d2e7d58524bfb1..38a9188bf348ad 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1293,15 +1293,33 @@ def _get_default_param_initializer(): dtype=dtype, default_initializer=_get_default_param_initializer()) + #print 'conv2d {0:2d} {1:3d}'.format(filter_size[0], filter_size[1]) + #algorithm = helper.create_tmp_variable(dtype) + #algorithm = helper.create_parameter( + # attr=ParamAttr(name="miopen_algorithm", initializer=Constant(0), trainable=False), + # shape=[3], + # dtype='int') + #algorithm = helper.create_parameter( + # attr=ParamAttr(name=None, initializer=Constant(0), trainable=False), + # shape=[3], + # dtype='int') + #algorithm.stop_gradient = True + algorithm = helper.create_global_variable(dtype='int', shape=[3], persistable=True, nam=None); + pre_bias = helper.create_tmp_variable(dtype) + algorithm_out = algorithm + helper.append_op( type=l_type, inputs={ 'Input': input, 'Filter': filter_param, + 'Algorithm': algorithm, + }, + outputs={'Output': pre_bias, + 'AlgorithmOut': algorithm_out, }, - outputs={"Output": pre_bias}, attrs={ 'strides': stride, 'paddings': padding,