MNN/source/backend/opencl/execution/ScaleExecution.cpp at master · DannyP0/MNN · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
//
//  ScaleExecution.cpp
//  MNN
//
//  Created by MNN on 2019/02/28.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "execution/ScaleExecution.hpp"
#include "Macro.h"
#include "TensorUtils.hpp"
#include "core/OpenCLRunningUtils.hpp"

namespace MNN {
namespace OpenCL {

ScaleExecution::ScaleExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend)
    : Execution(backend) {
#ifdef LOG_VERBOSE
    MNN_PRINT("Start ScaleExecution init !\n");
#endif
    auto openclBackend        = (OpenCLBackend *)backend;
    mOpenCLBackend            = static_cast<OpenCLBackend *>(backend);
    const auto *scaleParams   = op->main_as_Scale();
    int scaleSize             = scaleParams->scaleData()->size();
    const float *scaleDataPtr = scaleParams->scaleData()->data();
    cl::Buffer scaleBuffer(openclBackend->getOpenCLRuntime()->context(), CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                           UP_DIV(scaleSize, 4) * 4 * sizeof(float));
    auto scalePtrCL = openclBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(
        scaleBuffer, true, CL_MAP_WRITE, 0, ALIGN_UP4(scaleSize) * sizeof(float));
    if(nullptr != scalePtrCL){
        ::memset(scalePtrCL, 0, ALIGN_UP4(scaleSize) * sizeof(float));
        ::memcpy(scalePtrCL, scaleDataPtr, scaleSize * sizeof(float));
    }else{
        MNN_ERROR("Map error scalePtrCL == nullptr \n");
    }
    openclBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(scaleBuffer, scalePtrCL);

    mScale.reset(Tensor::createDevice<float>({1, 1, 1, scaleSize}));
    backend->onAcquireBuffer(mScale.get(), Backend::STATIC);
    copyBufferToImage(openclBackend->getOpenCLRuntime(), scaleBuffer, openCLImage(mScale.get()), UP_DIV(scaleSize, 4),
                      1);

    std::set<std::string> buildOptions;
    if (nullptr != scaleParams->biasData() && nullptr != scaleParams->biasData()->data()) {
        int biasSize = scaleParams->biasData()->size();
        MNN_ASSERT(biasSize == scaleSize);
        const float *biasDataPtr = scaleParams->biasData()->data();
        cl::Buffer biasBuffer(openclBackend->getOpenCLRuntime()->context(), CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                              UP_DIV(biasSize, 4) * 4 * sizeof(float));
        auto biasPtrCL = openclBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(
            biasBuffer, true, CL_MAP_WRITE, 0, ALIGN_UP4(biasSize) * sizeof(float));
        if(nullptr != biasPtrCL){
            ::memset(biasPtrCL, 0, ALIGN_UP4(biasSize) * sizeof(float));
            ::memcpy(biasPtrCL, biasDataPtr, biasSize * sizeof(float));
        }else{
            MNN_ERROR("Map error biasPtrCL == nullptr \n");
        }
        openclBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(biasBuffer, biasPtrCL);
        std::shared_ptr<Tensor> bias;
        bias.reset(Tensor::createDevice<float>({1, 1, 1, biasSize}));
        backend->onAcquireBuffer(bias.get(), Backend::STATIC);
        copyBufferToImage(openclBackend->getOpenCLRuntime(), biasBuffer, openCLImage(bias.get()), UP_DIV(biasSize, 4),
                          1);
        mBias = bias;
        buildOptions.emplace("-DHAS_BIAS");
        mHasBias = true;
    }
    std::string kernelName = "scale";
    auto runtime           = mOpenCLBackend->getOpenCLRuntime();
    mKernel                = runtime->buildKernel("scale", kernelName, buildOptions);
    mMaxWorkGroupSize      = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mKernel));

    mAreadySetArg = false;
#ifdef LOG_VERBOSE
    MNN_PRINT("end ScaleExecution init !\n");
#endif
}

// Hands the STATIC buffers acquired in the constructor back to the backend:
// the bias tensor first (only if one was created), then the scale tensor.
ScaleExecution::~ScaleExecution() {
    auto *biasTensor = mBias.get();
    if (biasTensor != nullptr) {
        mOpenCLBackend->onReleaseBuffer(biasTensor, Backend::STATIC);
    }

    auto *scaleTensor = mScale.get();
    mOpenCLBackend->onReleaseBuffer(scaleTensor, Backend::STATIC);
}

// Binds the kernel arguments for the current input/output tensors.
//
// Arguments are set in this order: the three global work sizes
// (channel blocks, width, height * batch), the input image, the scale image,
// the bias image (only when a bias was uploaded), and the output image.
// Always returns NO_ERROR.
ErrorCode ScaleExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
#ifdef LOG_VERBOSE
    MNN_PRINT("Start ScaleExecution onResize !\n");
#endif
    const std::vector<int> inputShape = tensorShapeFormat(inputs[0]);

    const int batch    = inputShape.at(0);
    const int height   = inputShape.at(1);
    const int width    = inputShape.at(2);
    const int channels = inputShape.at(3);

    // Channels are processed four at a time, so the first dimension counts 4-wide blocks.
    const int channelBlocks = UP_DIV(channels, 4);

    const std::vector<uint32_t> gws = {static_cast<uint32_t>(channelBlocks), static_cast<uint32_t>(width),
                                       static_cast<uint32_t>(height * batch)};

    uint32_t idx = 0;
    mKernel.setArg(idx++, gws[0]);
    mKernel.setArg(idx++, gws[1]);
    mKernel.setArg(idx++, gws[2]);

    mKernel.setArg(idx++, openCLImage(inputs[0]));
    mKernel.setArg(idx++, openCLImage(mScale.get()));
    if (mHasBias) {
        mKernel.setArg(idx++, openCLImage(mBias.get()));
    }
    mKernel.setArg(idx++, openCLImage(outputs[0]));

#ifdef LOG_VERBOSE
    // Emitted after the arguments are bound so the log brackets the actual work.
    MNN_PRINT("end ScaleExecution onResize !\n");
#endif
    return NO_ERROR;
}

// Enqueues the scale kernel on the runtime's command queue.
//
// The global work size is derived from the input shape as
// (channel blocks, width, height * batch); each dimension is rounded up to a
// multiple of the chosen local work size before the kernel is enqueued.
// The enqueue status is passed to MNN_CHECK_CL_SUCCESS; the function itself
// always returns NO_ERROR.
ErrorCode ScaleExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
#ifdef LOG_VERBOSE
    MNN_PRINT("Start ScaleExecution onExecute !\n");
#endif
    const std::vector<int> inputShape = tensorShapeFormat(inputs[0]);

    const int batch    = inputShape.at(0);
    const int height   = inputShape.at(1);
    const int width    = inputShape.at(2);
    const int channels = inputShape.at(3);

    const int channelBlocks = UP_DIV(channels, 4);

    const std::vector<uint32_t> gws = {static_cast<uint32_t>(channelBlocks), static_cast<uint32_t>(width),
                                       static_cast<uint32_t>(height * batch)};

    auto runtime = mOpenCLBackend->getOpenCLRuntime();

    const std::vector<uint32_t> lws = localWS3DDefault(gws, mMaxWorkGroupSize, runtime);

    // Round each global dimension up to a multiple of its local size; a local size of 0
    // is treated as 1 so the rounding never divides by zero.
    std::vector<uint32_t> roundUpGroupWorkSize(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
        roundUpGroupWorkSize[i] = ROUND_UP(gws[i], std::max((uint32_t)1, lws[i]));
    }

    cl::Event event;
    cl_int error = runtime->commandQueue().enqueueNDRangeKernel(
        mKernel, cl::NullRange, cl::NDRange(roundUpGroupWorkSize[0], roundUpGroupWorkSize[1], roundUpGroupWorkSize[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);

    MNN_CHECK_CL_SUCCESS(error);

#ifdef LOG_VERBOSE
    MNN_PRINT("end ScaleExecution onExecute !\n");
#endif
    return NO_ERROR;
}

// File-scope registrar object constructed with OpType_Scale; presumably its constructor
// registers a ScaleExecution creator for that op type with the OpenCL backend
// (OpenCLCreatorRegister is defined elsewhere — confirm).
OpenCLCreatorRegister<TypedCreator<ScaleExecution>> __scale_op(OpType_Scale);

} // namespace OpenCL
} // namespace MNN