From fd64662dbecbf442ccad18e42917ce63c07f0fb4 Mon Sep 17 00:00:00 2001 From: kishida Date: Wed, 16 Sep 2015 02:50:42 +0900 Subject: [PATCH 01/22] =?UTF-8?q?=E6=AD=A3=E8=A6=8F=E5=8C=96=E5=B1=A4?= =?UTF-8?q?=E3=81=A7=E5=B9=B3=E5=9D=87=E5=88=86=E6=95=A3=E3=82=92=E4=BF=9D?= =?UTF-8?q?=E6=8C=81=E3=81=97=E3=81=AA=E3=81=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/kishida/cnn/kernels/NormalizeKernel.java | 12 ++---------- .../java/kishida/cnn/layers/MultiNormalizeLayer.java | 8 -------- src/main/java/kishida/cnn/layers/NormalizeLayer.java | 2 +- 3 files changed, 3 insertions(+), 19 deletions(-) diff --git a/src/main/java/kishida/cnn/kernels/NormalizeKernel.java b/src/main/java/kishida/cnn/kernels/NormalizeKernel.java index f6d9dc7..8202643 100644 --- a/src/main/java/kishida/cnn/kernels/NormalizeKernel.java +++ b/src/main/java/kishida/cnn/kernels/NormalizeKernel.java @@ -61,11 +61,7 @@ private void proc(int chxy) { } float std = max(threshold, sqrt(variance / count)); result[chxy] = (input[chxy] - average) / std; - averages[chxy] = average; - rates[chxy] = std; } - float[] averages; - float[] rates; float[] result; float[] input; int inputChannels; @@ -75,11 +71,9 @@ private void proc(int chxy) { float threshold; public float[] normalize(float[] input, int inputChannels, int inputWidth, int inputHeight, - int size, float[] averages, float[] rates, float threshold, boolean useGpu) { + int size, float threshold, float[] result, boolean useGpu) { this.input = input; - this.rates = rates; - this.result = new float[inputChannels * inputWidth * inputHeight]; - this.averages = averages; + this.result = result; this.inputChannels = inputChannels; this.inputWidth = inputWidth; this.inputHeight = inputHeight; @@ -88,8 +82,6 @@ public float[] normalize(float[] input, int inputChannels, int inputWidth, int i if (useGpu) { put(input); execute(inputChannels * inputWidth * inputHeight); - get(averages); - get(rates); get(result); } else { IntStream.range(0, inputChannels).parallel().forEach(ch -> { diff --git a/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java b/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java index c776d3a..50681b6 100644 --- a/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java +++ b/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java @@ -22,9 +22,6 @@ public class MultiNormalizeLayer extends ImageNeuralLayer{ @Getter boolean useGpu; - float[] averages; - float[] rates; - @JsonCreator public MultiNormalizeLayer( @JsonProperty("name") String name, @@ -43,14 +40,11 @@ public final void setPreLayer(NeuralLayer preLayer) { outputChannels = inputChannels; outputWidth = inputWidth; outputHeight = inputHeight; - averages = new float[inputWidth * inputHeight]; - rates = new float[inputWidth * inputHeight]; result = new float[inputChannels * inputHeight * inputWidth]; } @Override public float[] forward(float[] in) { - IntStream.range(0, inputWidth).parallel().forEach(x -> { for(int y = 0; y < inputHeight; ++y){ float total = 0; @@ -90,8 +84,6 @@ public float[] forward(float[] in) { } } float std = Math.max(threshold, (float)Math.sqrt(variance / count)); - averages[x * inputHeight + y] = average; - rates[x * inputHeight + y] = std; for(int ch = 0; ch < inputChannels; ++ch){ int pos = ch * inputHeight * inputWidth + x * inputHeight + y; result[pos] = (in[pos] - average) / std; diff --git a/src/main/java/kishida/cnn/layers/NormalizeLayer.java b/src/main/java/kishida/cnn/layers/NormalizeLayer.java index ff55efe..c4097a7 100644 --- a/src/main/java/kishida/cnn/layers/NormalizeLayer.java +++ b/src/main/java/kishida/cnn/layers/NormalizeLayer.java @@ -38,7 +38,7 @@ public float[] forward(float[] in) { averages = new float[in.length]; rates = new float[in.length]; result = NormalizeKernel.INSTANCE.normalize(in, inputChannels, inputWidth, inputHeight, - size, averages, rates, threshold, useGpu); + size, threshold, result, useGpu); return result; } From be0b33f509fc146f3dc02d18a59c218c1cc76769 Mon Sep 17 00:00:00 2001 From: kishida Date: Fri, 18 Sep 2015 04:03:18 +0900 Subject: [PATCH 02/22] =?UTF-8?q?=E3=81=A8=E3=82=8A=E3=81=82=E3=81=88?= =?UTF-8?q?=E3=81=9AJOCL=E5=AF=BE=E5=BF=9C=E3=82=B3=E3=83=BC=E3=83=89?= =?UTF-8?q?=E3=82=92=E6=9B=B8=E3=81=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pom.xml | 10 ++ .../java/kishida/cnn/ConvolutionalNet.java | 26 ++-- .../ConvolutionBackwordDeltaKernel.java | 9 +- .../kernels/ConvolutionBackwordKernel.java | 6 +- .../kishida/cnn/layers/ConvolutionLayer.java | 24 ++-- .../cnn/opencl/ConvolutionBackwordCL.java | 119 ++++++++++++++++++ src/main/java/kishida/cnn/opencl/OpenCL.java | 78 ++++++++++++ .../resources/kernels/convolution_backword.cl | 99 +++++++++++++++ .../resources/kernels/convolution_forward.cl | 55 ++++++++ src/main/resources/kernels/fully_forward.cl | 35 ++++++ 10 files changed, 433 insertions(+), 28 deletions(-) create mode 100644 src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java create mode 100644 src/main/java/kishida/cnn/opencl/OpenCL.java create mode 100644 src/main/resources/kernels/convolution_backword.cl create mode 100644 src/main/resources/kernels/convolution_forward.cl create mode 100644 src/main/resources/kernels/fully_forward.cl diff --git a/pom.xml b/pom.xml index 5e86896..0cc4506 100644 --- a/pom.xml +++ b/pom.xml @@ -21,6 +21,16 @@ lombok 1.16.6 + + org.jogamp.gluegen + gluegen-rt-main + 2.1.4 + + + org.jogamp.jocl + jocl-main + 2.1.4 + diff --git a/src/main/java/kishida/cnn/ConvolutionalNet.java b/src/main/java/kishida/cnn/ConvolutionalNet.java index 8341adf..74c65ba 100644 --- a/src/main/java/kishida/cnn/ConvolutionalNet.java +++ b/src/main/java/kishida/cnn/ConvolutionalNet.java @@ -8,6 +8,8 @@ import java.awt.Image; import java.awt.image.BufferedImage; import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.io.Reader; import java.io.UncheckedIOException; import java.io.Writer; @@ -57,8 +59,10 @@ public class ConvolutionalNet { private static final int MINI_BATCH = 128; private static final float MOMENTAM = 0.9f; public static final String AVERAGE_PNG = "average.png"; - private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\alexnet.json.txt"; - private static final String RESOURCE_NAME = "/alexnet_def.json"; + //private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\alexnet.json.txt"; + //private static final String RESOURCE_NAME = "/alexnet_def.json"; + private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\tinynet.json.txt"; + private static final String RESOURCE_NAME = "/tinynet_def.json"; static class Img{ @@ -89,6 +93,7 @@ BufferedImage readImage(){ static List historyData = new ArrayList<>(); static LinkedList rateData = new LinkedList<>(); + @SuppressWarnings({"ThrowableInstanceNotThrown", "ThrowableInstanceNeverThrown"}) public static void main(String[] args) throws IOException { System.setProperty("com.amd.aparapi.enableShowGeneratedOpenCL", "false"); String def = "C:\\Users\\naoki\\Desktop\\sampleimg288"; @@ -184,14 +189,15 @@ public static void main(String[] args) throws IOException { NeuralNetwork nn; - /* - try(InputStream is = ConvolutionalNet.class.getResourceAsStream(RESOURCE_NAME); - InputStreamReader isr = new InputStreamReader(is)){ - nn = NeuralNetwork.readFromJson(isr); - }*/ - - try(Reader r = Files.newBufferedReader(Paths.get(FILENAME))){ - nn = NeuralNetwork.readFromJson(r); + if(true){ + try(InputStream is = ConvolutionalNet.class.getResourceAsStream(RESOURCE_NAME); + InputStreamReader isr = new InputStreamReader(is)){ + nn = NeuralNetwork.readFromJson(isr); + } + }else{ + try(Reader r = Files.newBufferedReader(Paths.get(FILENAME))){ + nn = NeuralNetwork.readFromJson(r); + } } nn.init(); diff --git a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordDeltaKernel.java b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordDeltaKernel.java index 44f441e..67f7449 100644 --- a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordDeltaKernel.java +++ b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordDeltaKernel.java @@ -51,7 +51,6 @@ private void proc(int chxxyy) { } newDelta[chxxyy] = tempDelta; } - float[] input; float[] result; int inputChannels; int inputWidth; @@ -65,11 +64,10 @@ private void proc(int chxxyy) { float[] delta; float[] newDelta; - public float[] backword(float[] input, float[] delta, float[] result, + public float[] backword(float[] delta, float[] result, int inputChannels, int inputWidth, int inputHeight, float[] filter, int outputChannels, int outputWidth, int outputHeight, - int filterSize, int stride, boolean useGpu) { - this.input = input; + int filterSize, int stride, float[] newDelta, boolean useGpu) { this.delta = delta; this.inputChannels = inputChannels; this.inputWidth = inputWidth; @@ -81,12 +79,11 @@ public float[] backword(float[] input, float[] delta, float[] result, this.filterSize = filterSize; this.stride = stride; this.result = result; - this.newDelta = new float[inputChannels * inputWidth * inputHeight]; + this.newDelta = newDelta; if (useGpu) { put(filter); put(delta); put(result); - put(input); execute(inputChannels * inputWidth * inputHeight); get(newDelta); } else { diff --git a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordKernel.java b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordKernel.java index 51c0928..de8d9ca 100644 --- a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordKernel.java +++ b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordKernel.java @@ -62,7 +62,7 @@ private void proc(int fxy) { int outputHeight; int filterSize; int stride; - float[] bias; + //float[] bias; float[] delta; float learningRate; float[] tempDelta; @@ -74,7 +74,7 @@ public float[] backward(float[] delta, float[] result, float[] input, int inputChannels, int inputWidth, int inputHeight, float[] filter, int outputChannels, int outputWidth, int outputHeight, float[] filterDelta, float[] biasDelta, - int filterSize, int stride, float[] bias, float learningRate, boolean useGpu) { + int filterSize, int stride, float learningRate, boolean useGpu) { this.delta = delta; this.input = input; this.inputChannels = inputChannels; @@ -86,7 +86,7 @@ public float[] backward(float[] delta, float[] result, this.outputHeight = outputHeight; this.filterSize = filterSize; this.stride = stride; - this.bias = bias; + //this.bias = bias; this.result = result; this.tempDelta = new float[outputChannels * inputChannels * inputWidth * inputHeight]; this.learningRate = learningRate;// / (outputWidth * outputHeight); diff --git a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java index 00c58e2..6b37bf6 100644 --- a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java +++ b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java @@ -5,7 +5,6 @@ */ package kishida.cnn.layers; -import com.amd.aparapi.Kernel; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; @@ -14,12 +13,10 @@ import java.util.stream.IntStream; import kishida.cnn.activation.ActivationFunction; import kishida.cnn.activation.RectifiedLinear; -import kishida.cnn.kernels.ConvolutionBackwordBiasKernel; -import kishida.cnn.kernels.ConvolutionBackwordDeltaKernel; -import kishida.cnn.kernels.ConvolutionBackwordFilterKernel; import kishida.cnn.kernels.ConvolutionBackwordKernel; import kishida.cnn.kernels.ConvolutionForwardKernel; import kishida.cnn.kernels.ConvolutionLocalNormalizationKernel; +import kishida.cnn.opencl.ConvolutionBackwordCL; import kishida.cnn.util.FloatUtil; import lombok.Getter; import lombok.Setter; @@ -49,6 +46,7 @@ public class ConvolutionLayer extends ImageNeuralLayer implements LerningLayer{ @Getter float initBias; float[] tempDelta; + float[] newDelta; public ConvolutionLayer(String name, int filterCount, int size, int stride, float initBias, boolean useGpu) { @@ -108,6 +106,7 @@ public final void setPreLayer(NeuralLayer preLayer) { this.result = new float[outputChannels * outputWidth * outputHeight]; this.tempDelta = new float[result.length]; + this.newDelta = new float[inputChannels * inputWidth * inputHeight]; } /** 畳み込みフィルタを適用する */ @@ -153,9 +152,11 @@ private void localNormalization(float[] result){ public float[] backward(float[] input, float[] delta) { if (useGpu) { // GPUバージョン - float[] newDelta = ConvolutionBackwordDeltaKernel.INSTANCE.backword(input, delta, result, + /* + ConvolutionBackwordDeltaKernel.INSTANCE.backword(delta, result, inputChannels, inputWidth, inputHeight, - filter, outputChannels, outputWidth, outputHeight, filterSize, stride, useGpu); + filter, outputChannels, outputWidth, outputHeight, + filterSize, stride, newDelta, useGpu); ConvolutionBackwordFilterKernel.INSTANCE.backword(delta, result, input, inputChannels, inputWidth, inputHeight, filterDelta, outputChannels, outputWidth, outputHeight, filterSize, stride, parent.getLearningRate(), useGpu); @@ -171,15 +172,20 @@ public float[] backward(float[] input, float[] delta) { System.out.println("delta" + ConvolutionBackwordDeltaKernel.INSTANCE.getExecutionMode()); System.out.println("filter" + ConvolutionBackwordFilterKernel.INSTANCE.getExecutionMode()); System.out.println("bias" + ConvolutionBackwordBiasKernel.INSTANCE.getExecutionMode()); - } - return newDelta; + }*/ + + return ConvolutionBackwordCL.INSTANCE.backward( + delta, result, input, + inputChannels, inputWidth, inputHeight, + filter, outputChannels, outputWidth, outputHeight, + filterDelta, biasDelta, filterSize, stride, newDelta, initBias); } else { // CPUバージョン return ConvolutionBackwordKernel.INSTANCE.backward(delta, result, input, inputChannels, inputWidth, inputHeight, filter, outputChannels, outputWidth, outputHeight, filterDelta, biasDelta, - filterSize, stride, bias, parent.getLearningRate(), false); + filterSize, stride, parent.getLearningRate(), false); } } diff --git a/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java new file mode 100644 index 0000000..c7a23a4 --- /dev/null +++ b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java @@ -0,0 +1,119 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package kishida.cnn.opencl; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLKernel; +import com.jogamp.opencl.CLProgram; +import java.nio.FloatBuffer; + +/** + * + * @author naoki + */ +public class ConvolutionBackwordCL { + public static ConvolutionBackwordCL INSTANCE = new ConvolutionBackwordCL(); + CLProgram prog; + + private ConvolutionBackwordCL() { + } + + public float[] backward(float[] delta, float[] result, + float[] input, int inputChannels, int inputWidth, int inputHeight, + float[] filter, int outputChannels, int outputWidth, int outputHeight, + float[] filterDelta, float[] biasDelta, + int filterSize, int stride, float[] newDelta, float learningRate) { + if(prog == null){ + prog = OpenCL.compile("convolution_backword.cl"); + } + + CLBuffer bufDelta = OpenCL.createReadBuffer(delta); + CLBuffer bufFilter = OpenCL.createReadBuffer(filter); + CLBuffer bufResult = OpenCL.createReadBuffer(result); + CLBuffer bufInput = OpenCL.createReadBuffer(input); + CLBuffer bufFilterDelta = OpenCL.createReadWriteBuffer(filterDelta); + CLBuffer bufTempBias = OpenCL.createReadWriteBuffer(result.length); + CLBuffer bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); + CLBuffer bufNewDelta = OpenCL.createWriteBuffer(newDelta.length); + + OpenCL.getQueue() + .putWriteBuffer(bufDelta, false) + .putWriteBuffer(bufFilter, false) + .putWriteBuffer(bufResult, false) + .putWriteBuffer(bufInput, false) + .putWriteBuffer(bufFilterDelta, false) + .putWriteBuffer(bufBiasDelta, false); + + CLKernel deltaKernel = prog.createCLKernel("delta"); + deltaKernel + .putArg(inputWidth) + .putArg(inputHeight) + .putArg(filterSize) + .putArg(outputChannels) + .putArg(stride) + .putArg(outputWidth) + .putArg(outputHeight) + .putArgs( + bufResult, + bufDelta, + bufFilter) + .putArg(inputChannels) + .putArg(bufNewDelta); + OpenCL.getQueue() + .put1DRangeKernel(deltaKernel, 0, + inputChannels * inputWidth * inputHeight, 256); + deltaKernel.release(); + + CLKernel filterKernel = prog.createCLKernel("filter"); + filterKernel + .putArg(inputChannels) + .putArg(filterSize) + .putArg(outputWidth) + .putArg(outputHeight) + .putArgs( + bufResult, + bufDelta) + .putArg(stride) + .putArg(inputWidth) + .putArg(inputHeight) + .putArg(learningRate) + .putArgs( + bufInput, + bufFilter); + OpenCL.getQueue() + .put1DRangeKernel(filterKernel, 0, + outputChannels * inputChannels * filterSize * filterSize, 128); + filterKernel.release(); + + CLKernel biasKernel = prog.createCLKernel("bias"); + biasKernel + .putArgs( + bufResult, + bufDelta, + bufTempBias) + .putArg(learningRate); + OpenCL.getQueue() + .put1DRangeKernel(biasKernel, 0, + outputChannels * outputWidth * outputHeight, 128); + biasKernel.release(); + + CLKernel biasAfterKernel = prog.createCLKernel("biasAfter"); + biasAfterKernel + .putArg(outputWidth) + .putArg(outputHeight) + .putArgs( + bufTempBias, + bufBiasDelta); + OpenCL.getQueue() + .put1DRangeKernel(biasAfterKernel, 0, outputChannels, 16) + .putReadBuffer(bufBiasDelta, false) + .putReadBuffer(bufFilterDelta, false) + .putReadBuffer(bufNewDelta, true); + bufNewDelta.getBuffer().get(newDelta); + biasAfterKernel.release(); + return newDelta; + } +} diff --git a/src/main/java/kishida/cnn/opencl/OpenCL.java b/src/main/java/kishida/cnn/opencl/OpenCL.java new file mode 100644 index 0000000..33b6443 --- /dev/null +++ b/src/main/java/kishida/cnn/opencl/OpenCL.java @@ -0,0 +1,78 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package kishida.cnn.opencl; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLCommandQueue; +import com.jogamp.opencl.CLContext; +import com.jogamp.opencl.CLDevice; +import com.jogamp.opencl.CLMemory; +import com.jogamp.opencl.CLProgram; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.FloatBuffer; +import lombok.Getter; + +/** + * + * @author naoki + */ +public class OpenCL { + + static CLContext ctx; + @Getter + static CLCommandQueue queue; + + public static void prepare(){ + ctx = CLContext.create(); + CLDevice device = ctx.getMaxFlopsDevice(); + System.out.println(device); + queue = device.createCommandQueue(); + } + + public static void release(){ + queue.finish(); + ctx.release(); + ctx = null; + } + + public static CLContext getCtx() { + if(ctx == null){ + prepare(); + } + return ctx; + } + + public static CLProgram compile(String path){ + try { + return getCtx().createProgram(OpenCL.class.getResourceAsStream("/kernels/" + path)) + .build(); + } catch (IOException ex) { + throw new UncheckedIOException(ex); + } + } + + public static CLBuffer createReadBuffer(float[] data){ + CLBuffer buf = getCtx().createFloatBuffer( + data.length, CLMemory.Mem.READ_ONLY); + buf.getBuffer().put(data); + return buf; + } + public static CLBuffer createReadWriteBuffer(float[] data){ + CLBuffer buf = createReadWriteBuffer(data.length); + buf.getBuffer().put(data); + return buf; + } + public static CLBuffer createReadWriteBuffer(int size){ + CLBuffer buf = getCtx().createFloatBuffer( + size, CLMemory.Mem.READ_WRITE); + return buf; + + } + public static CLBuffer createWriteBuffer(int size){ + return getCtx().createFloatBuffer(size, CLMemory.Mem.WRITE_ONLY); + } +} diff --git a/src/main/resources/kernels/convolution_backword.cl b/src/main/resources/kernels/convolution_backword.cl new file mode 100644 index 0000000..843121b --- /dev/null +++ b/src/main/resources/kernels/convolution_backword.cl @@ -0,0 +1,99 @@ +__kernel void delta( + int inputWidth, + int inputHeight, + int filterSize, + int outputChannels, + int stride, + int outputWidth, + int outputHeight, + __global float *result, + __global float *delta, + __global float *filter, + int inputChannels, + __global float *newDelta +){ + int chxxyy = get_global_id(0); + int ch = chxxyy / (inputWidth * inputHeight); + int xx = (chxxyy % (inputWidth * inputHeight)) / inputHeight; + int yy = chxxyy % inputHeight; + int sizeHalf = filterSize / 2; + float tempDelta = 0.0f; + for (int f = 0; f=0 && x=0 && y=0.0f)?delta[fxy]:0.0f; + tempDelta = tempDelta + (d * filter[((((((f * inputChannels) * filterSize) * filterSize) + ((ch * filterSize) * filterSize)) + (i * filterSize)) + j)]); + } + } + } + } + } + newDelta[chxxyy] = tempDelta; +} + +__kernel void filter( + int inputChannels, + int filterSize, + int outputWidth, + int outputHeight, + __global float *result, + __global float *delta, + int stride, + int inputWidth, + int inputHeight, + float learningRate, + __global float *input, + __global float *filter +){ + int fchij = get_global_id(0); + int f = fchij / ((inputChannels * filterSize) * filterSize); + int ch = (fchij % ((inputChannels * filterSize) * filterSize)) / (filterSize * filterSize); + int i = (fchij % (filterSize * filterSize)) / filterSize; + int j = fchij % filterSize; + float df = 0.0f; + for (int x = 0; x=0.0f)?delta[fxy]:0.0f; + int xx = x * stride + i - filterSize / 2; + if (xx >= 0 && xx < inputWidth){ + int yy = y * stride + j - filterSize / 2; + if (yy >= 0 && yy < inputHeight){ + df = df + d * learningRate * + input[ch * inputWidth * inputHeight + xx * inputHeight + yy]; + } + } + } + } + filter[fchij] = filter[fchij] + df; +} + +__kernel void bias( + __global float *result, + __global float *delta, + __global float *tempBiasDelta, + float learningRate +){ + int fxy = get_global_id(0); + float d = result[fxy]>=0.0f ? delta[fxy] : 0.0f; + tempBiasDelta[fxy] = learningRate * d; +} + +__kernel void biasAfter( + int outputWidth, + int outputHeight, + __global float *tempBiasDelta, + __global float *biasDelta +){ + int f = get_global_id(0); + float b = 0; + for(int xy = 0; xy < outputWidth * outputHeight; ++xy){ + b += tempBiasDelta[f * outputWidth * outputHeight + xy]; + } + biasDelta[f] += b; +} diff --git a/src/main/resources/kernels/convolution_forward.cl b/src/main/resources/kernels/convolution_forward.cl new file mode 100644 index 0000000..79bb82f --- /dev/null +++ b/src/main/resources/kernels/convolution_forward.cl @@ -0,0 +1,55 @@ +void kishida_cnn_kernels_ConvolutionForwardKernel__proc(This *this, int fxy){ + int f = fxy / (this->outputHeight * this->outputWidth); + int x = (fxy % (this->outputHeight * this->outputWidth)) / this->outputHeight; + int y = fxy % this->outputHeight; + float r = 0.0f; + for (int ch = 0; chinputChannels; ch++){ + for (int i = 0; ifilterSize; i++){ + int xx = ((x * this->stride) + i) - (this->filterSize / 2); + if (xx>=0 && xxinputWidth){ + for (int j = 0; jfilterSize; j++){ + int yy = ((y * this->stride) + j) - (this->filterSize / 2); + if (yy>=0 && yyinputHeight){ + r = r + (this->input[((((ch * this->inputWidth) * this->inputHeight) + (xx * this->inputHeight)) + yy)] * this->filter[((((((f * this->inputChannels) * this->filterSize) * this->filterSize) + ((ch * this->filterSize) * this->filterSize)) + (i * this->filterSize)) + j)]); + } + } + } + } + } + this->result[fxy] = r + this->bias[f]; + return; +} +__kernel void run( + int outputHeight, + int outputWidth, + int inputChannels, + int filterSize, + int stride, + int inputWidth, + int inputHeight, + __global float *input, + __global float *filter, + __global float *result, + __global float *bias, + int passid +){ + This thisStruct; + This* this=&thisStruct; + this->outputHeight = outputHeight; + this->outputWidth = outputWidth; + this->inputChannels = inputChannels; + this->filterSize = filterSize; + this->stride = stride; + this->inputWidth = inputWidth; + this->inputHeight = inputHeight; + this->input = input; + this->filter = filter; + this->result = result; + this->bias = bias; + this->passid = passid; + { + int fixy = get_global_id(0); + kishida_cnn_kernels_ConvolutionForwardKernel__proc(this, fixy); + return; + } +} \ No newline at end of file diff --git a/src/main/resources/kernels/fully_forward.cl b/src/main/resources/kernels/fully_forward.cl new file mode 100644 index 0000000..ecb2402 --- /dev/null +++ b/src/main/resources/kernels/fully_forward.cl @@ -0,0 +1,35 @@ +void kishida_cnn_kernels_FullyForwardKernel__proc(This *this, int j){ + if (this->dropout[j]==1){ + for (int i = 0; iinSize; i++){ + this->result[j] = this->result[j] + (this->in[i] * this->weight[((i * this->out) + j)]); + } + this->result[j] = this->result[j] + this->bias[j]; + } + return; +} +__kernel void run( + __global int *dropout, + int inSize, + __global float *result, + __global float *in, + __global float *weight, + int out, + __global float *bias, + int passid +){ + This thisStruct; + This* this=&thisStruct; + this->dropout = dropout; + this->inSize = inSize; + this->result = result; + this->in = in; + this->weight = weight; + this->out = out; + this->bias = bias; + this->passid = passid; + { + int j = get_global_id(0); + kishida_cnn_kernels_FullyForwardKernel__proc(this, j); + return; + } +} From 3c2aae799ad8069f91585faa6b0ce6f3b529a05d Mon Sep 17 00:00:00 2001 From: kishida Date: Fri, 18 Sep 2015 04:04:07 +0900 Subject: [PATCH 03/22] =?UTF-8?q?=E5=AE=9F=E8=A1=8C=E3=82=A8=E3=83=A9?= =?UTF-8?q?=E3=83=BC=E3=83=AD=E3=82=B0=E3=81=AF=E3=82=B3=E3=83=9F=E3=83=83?= =?UTF-8?q?=E3=83=88=E3=81=AB=E5=90=AB=E3=82=81=E3=81=AA=E3=81=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 718edb6..74785ae 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ /target/ /nb-configuration.xml -/nbactions*.xml \ No newline at end of file +/nbactions*.xml +/*.log \ No newline at end of file From 569793e380f442d7d5c24d48e5d5a22db1791969 Mon Sep 17 00:00:00 2001 From: kishida Date: Sat, 19 Sep 2015 13:52:27 +0900 Subject: [PATCH 04/22] =?UTF-8?q?=E7=95=B3=E8=BE=BC=E3=81=BF=E5=B1=A4?= =?UTF-8?q?=E3=81=AE=E9=80=86=E4=BC=9D=E6=92=AD=E3=81=ABJOCL=E3=82=92?= =?UTF-8?q?=E4=BD=BF=E3=81=86=E3=80=821=E5=89=B2=E9=AB=98=E9=80=9F?= =?UTF-8?q?=E5=8C=96=E3=80=8290/=E5=88=86=E2=86=92100/=E5=88=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ConvolutionBackwordFilterKernel.java | 12 +-- .../kishida/cnn/layers/ConvolutionLayer.java | 60 ++++++++------- .../cnn/opencl/ConvolutionBackwordCL.java | 77 +++++++++++++++---- src/main/java/kishida/cnn/opencl/OpenCL.java | 18 ++++- .../resources/kernels/convolution_backword.cl | 29 +++++-- 5 files changed, 140 insertions(+), 56 deletions(-) diff --git a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordFilterKernel.java b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordFilterKernel.java index af465c6..451d11e 100644 --- a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordFilterKernel.java +++ b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordFilterKernel.java @@ -44,7 +44,7 @@ private void proc(int fchij) { } } } - filter[fchij] += df; + filterDelta[fchij] += df; } float[] input; float[] result; @@ -52,7 +52,7 @@ private void proc(int fchij) { int inputChannels; int inputWidth; int inputHeight; - float[] filter; + float[] filterDelta; int outputChannels; int outputWidth; int outputHeight; @@ -62,14 +62,14 @@ private void proc(int fchij) { public void backword(float[] delta, float[] result, float[] input, int inputChannels, int inputWidth, int inputHeight, - float[] filter, int outputChannels, int outputWidth, int outputHeight, + float[] filterDelta, int outputChannels, int outputWidth, int outputHeight, int filterSize, int stride, float learningRate, boolean useGpu) { this.input = input; this.delta = delta; this.inputChannels = inputChannels; this.inputWidth = inputWidth; this.inputHeight = inputHeight; - this.filter = filter; + this.filterDelta = filterDelta; this.outputChannels = outputChannels; this.outputWidth = outputWidth; this.outputHeight = outputHeight; @@ -79,11 +79,11 @@ public void backword(float[] delta, float[] result, this.learningRate = learningRate;// / outputWidth;// * outputHeight); if (useGpu) { put(delta); - put(filter); + put(filterDelta); put(input); put(result); execute(outputChannels * inputChannels * filterSize * filterSize); - get(filter); + get(filterDelta); } else { IntStream.range(0, outputChannels).parallel().forEach((f) -> { for (int chij = 0; chij < inputChannels * filterSize * filterSize; ++chij) { diff --git a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java index 6b37bf6..f27e577 100644 --- a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java +++ b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java @@ -5,6 +5,7 @@ */ package kishida.cnn.layers; +import com.amd.aparapi.Kernel; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; @@ -13,6 +14,9 @@ import java.util.stream.IntStream; import kishida.cnn.activation.ActivationFunction; import kishida.cnn.activation.RectifiedLinear; +import kishida.cnn.kernels.ConvolutionBackwordBiasKernel; +import kishida.cnn.kernels.ConvolutionBackwordDeltaKernel; +import kishida.cnn.kernels.ConvolutionBackwordFilterKernel; import kishida.cnn.kernels.ConvolutionBackwordKernel; import kishida.cnn.kernels.ConvolutionForwardKernel; import kishida.cnn.kernels.ConvolutionLocalNormalizationKernel; @@ -152,33 +156,37 @@ private void localNormalization(float[] result){ public float[] backward(float[] input, float[] delta) { if (useGpu) { // GPUバージョン - /* - ConvolutionBackwordDeltaKernel.INSTANCE.backword(delta, result, - inputChannels, inputWidth, inputHeight, - filter, outputChannels, outputWidth, outputHeight, - filterSize, stride, newDelta, useGpu); - ConvolutionBackwordFilterKernel.INSTANCE.backword(delta, result, - input, inputChannels, inputWidth, inputHeight, - filterDelta, outputChannels, outputWidth, outputHeight, filterSize, stride, parent.getLearningRate(), useGpu); - ConvolutionBackwordBiasKernel.INSTANCE.backwordBias(delta, result, - outputChannels, outputWidth, outputHeight, biasDelta, parent.getLearningRate(), tempDelta, useGpu); - if (ConvolutionBackwordDeltaKernel.INSTANCE.getExecutionMode() != Kernel.EXECUTION_MODE.GPU || - ConvolutionBackwordFilterKernel.INSTANCE.getExecutionMode() != Kernel.EXECUTION_MODE.GPU || - ConvolutionBackwordBiasKernel.INSTANCE.getExecutionMode() != Kernel.EXECUTION_MODE.GPU) { - useGpu = false; + if(false){ + // aparapi + ConvolutionBackwordDeltaKernel.INSTANCE.backword(delta, result, + inputChannels, inputWidth, inputHeight, + filter, outputChannels, outputWidth, outputHeight, + filterSize, stride, newDelta, useGpu); + ConvolutionBackwordFilterKernel.INSTANCE.backword(delta, result, + input, inputChannels, inputWidth, inputHeight, + filterDelta, outputChannels, outputWidth, outputHeight, filterSize, stride, parent.getLearningRate(), useGpu); + ConvolutionBackwordBiasKernel.INSTANCE.backwordBias(delta, result, + outputChannels, outputWidth, outputHeight, biasDelta, parent.getLearningRate(), tempDelta, useGpu); + if (ConvolutionBackwordDeltaKernel.INSTANCE.getExecutionMode() != Kernel.EXECUTION_MODE.GPU || + ConvolutionBackwordFilterKernel.INSTANCE.getExecutionMode() != Kernel.EXECUTION_MODE.GPU || + ConvolutionBackwordBiasKernel.INSTANCE.getExecutionMode() != Kernel.EXECUTION_MODE.GPU) { + useGpu = false; + } + if (!useGpu) { + System.out.println("Can't use GPU on " + name); + System.out.println("delta" + ConvolutionBackwordDeltaKernel.INSTANCE.getExecutionMode()); + System.out.println("filter" + ConvolutionBackwordFilterKernel.INSTANCE.getExecutionMode()); + System.out.println("bias" + ConvolutionBackwordBiasKernel.INSTANCE.getExecutionMode()); + } + return newDelta; + }else{ + // JOCL + return ConvolutionBackwordCL.INSTANCE.backward( + delta, result, input, + inputChannels, inputWidth, inputHeight, + filter, outputChannels, outputWidth, outputHeight, + filterDelta, biasDelta, filterSize, stride, newDelta, parent.getLearningRate()); } - if (!useGpu) { - System.out.println("Can't use GPU on " + name); - System.out.println("delta" + ConvolutionBackwordDeltaKernel.INSTANCE.getExecutionMode()); - System.out.println("filter" + ConvolutionBackwordFilterKernel.INSTANCE.getExecutionMode()); - System.out.println("bias" + ConvolutionBackwordBiasKernel.INSTANCE.getExecutionMode()); - }*/ - - return ConvolutionBackwordCL.INSTANCE.backward( - delta, result, input, - inputChannels, inputWidth, inputHeight, - filter, outputChannels, outputWidth, outputHeight, - filterDelta, biasDelta, filterSize, stride, newDelta, initBias); } else { // CPUバージョン return ConvolutionBackwordKernel.INSTANCE.backward(delta, result, diff --git a/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java index c7a23a4..8d4f0b4 100644 --- a/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java +++ b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java @@ -38,13 +38,13 @@ public float[] backward(float[] delta, float[] result, CLBuffer bufTempBias = OpenCL.createReadWriteBuffer(result.length); CLBuffer bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); CLBuffer bufNewDelta = OpenCL.createWriteBuffer(newDelta.length); - - OpenCL.getQueue() + OpenCL.getQueue().putBarrier() .putWriteBuffer(bufDelta, false) .putWriteBuffer(bufFilter, false) .putWriteBuffer(bufResult, false) .putWriteBuffer(bufInput, false) .putWriteBuffer(bufFilterDelta, false) + .putWriteBuffer(bufTempBias, false) .putWriteBuffer(bufBiasDelta, false); CLKernel deltaKernel = prog.createCLKernel("delta"); @@ -62,9 +62,8 @@ public float[] backward(float[] delta, float[] result, bufFilter) .putArg(inputChannels) .putArg(bufNewDelta); - OpenCL.getQueue() - .put1DRangeKernel(deltaKernel, 0, - inputChannels * inputWidth * inputHeight, 256); + OpenCL.execute(deltaKernel, + inputChannels * inputWidth * inputHeight); deltaKernel.release(); CLKernel filterKernel = prog.createCLKernel("filter"); @@ -82,10 +81,9 @@ public float[] backward(float[] delta, float[] result, .putArg(learningRate) .putArgs( bufInput, - bufFilter); - OpenCL.getQueue() - .put1DRangeKernel(filterKernel, 0, - outputChannels * inputChannels * filterSize * filterSize, 128); + bufFilterDelta); + OpenCL.execute(filterKernel, + outputChannels * inputChannels * filterSize * filterSize); filterKernel.release(); CLKernel biasKernel = prog.createCLKernel("bias"); @@ -95,11 +93,20 @@ public float[] backward(float[] delta, float[] result, bufDelta, bufTempBias) .putArg(learningRate); - OpenCL.getQueue() - .put1DRangeKernel(biasKernel, 0, - outputChannels * outputWidth * outputHeight, 128); + OpenCL.execute(biasKernel, + outputChannels * outputWidth * outputHeight); biasKernel.release(); + /* + OpenCL.getQueue().putReadBuffer(bufTempBias, true); + float[] tempBias = new float[result.length]; + bufTempBias.getBuffer().get(tempBias).rewind(); + + float[] compTempBias = new float[tempBias.length]; + for(int i = 0; i < compTempBias.length; ++i){ + compTempBias[i] = result[i] >= 0 ? delta[i] * learningRate : 0; + }*/ + CLKernel biasAfterKernel = prog.createCLKernel("biasAfter"); biasAfterKernel .putArg(outputWidth) @@ -107,13 +114,51 @@ public float[] backward(float[] delta, float[] result, .putArgs( bufTempBias, bufBiasDelta); + OpenCL.execute(biasAfterKernel, outputChannels); + biasAfterKernel.release(); OpenCL.getQueue() - .put1DRangeKernel(biasAfterKernel, 0, outputChannels, 16) - .putReadBuffer(bufBiasDelta, false) - .putReadBuffer(bufFilterDelta, false) + .putReadBuffer(bufBiasDelta, true) + .putReadBuffer(bufFilterDelta, true) .putReadBuffer(bufNewDelta, true); bufNewDelta.getBuffer().get(newDelta); - biasAfterKernel.release(); + bufFilterDelta.getBuffer().get(filterDelta); + bufBiasDelta.getBuffer().get(biasDelta); + + bufDelta.release(); + bufFilter.release(); + bufResult.release(); + bufInput.release(); + bufFilterDelta.release(); + bufTempBias.release(); + bufBiasDelta.release(); + bufNewDelta.release(); + return newDelta; } + + public static void main(String[] args) { + int inputChannels = 3; + int inputWidth = 200; + int inputHeight = 200; + int stride = 3; + int filterSize = 11; + int outputChannels = 24; + int outputWidth = inputWidth / stride; + int outputHeight = inputHeight / stride; + float[] input = new float[inputChannels * inputWidth * inputHeight]; + float[] newDelta = new float[input.length]; + float[] filter = new float[inputChannels * outputChannels * filterSize * filterSize]; + float[] filterDelta = new float[filter.length]; + float[] biasDelta = new float[outputChannels]; + float[] result = new float[outputChannels * outputWidth * outputHeight]; + float[] delta = new float[result.length]; + float learningRate = 0.001f; + + for(int i = 0; i < 3; ++i){ + System.out.println(i + 1); + ConvolutionBackwordCL.INSTANCE.backward(delta, result, + input, inputChannels, inputWidth, inputHeight, + filter, outputChannels, outputWidth, outputHeight, filterDelta, biasDelta, filterSize, stride, newDelta, learningRate); + } + } } diff --git a/src/main/java/kishida/cnn/opencl/OpenCL.java b/src/main/java/kishida/cnn/opencl/OpenCL.java index 33b6443..09f869c 100644 --- a/src/main/java/kishida/cnn/opencl/OpenCL.java +++ b/src/main/java/kishida/cnn/opencl/OpenCL.java @@ -9,6 +9,7 @@ import com.jogamp.opencl.CLCommandQueue; import com.jogamp.opencl.CLContext; import com.jogamp.opencl.CLDevice; +import com.jogamp.opencl.CLKernel; import com.jogamp.opencl.CLMemory; import com.jogamp.opencl.CLProgram; import java.io.IOException; @@ -25,10 +26,11 @@ public class OpenCL { static CLContext ctx; @Getter static CLCommandQueue queue; + static CLDevice device; public static void prepare(){ ctx = CLContext.create(); - CLDevice device = ctx.getMaxFlopsDevice(); + device = ctx.getMaxFlopsDevice(); System.out.println(device); queue = device.createCommandQueue(); } @@ -58,12 +60,12 @@ public static CLProgram compile(String path){ public static CLBuffer createReadBuffer(float[] data){ CLBuffer buf = getCtx().createFloatBuffer( data.length, CLMemory.Mem.READ_ONLY); - buf.getBuffer().put(data); + buf.getBuffer().put(data).rewind();//rewindしないと不安定になる return buf; } public static CLBuffer createReadWriteBuffer(float[] data){ CLBuffer buf = createReadWriteBuffer(data.length); - buf.getBuffer().put(data); + buf.getBuffer().put(data).rewind();//rewindしないと不安定になる return buf; } public static CLBuffer createReadWriteBuffer(int size){ @@ -75,4 +77,14 @@ public static CLBuffer createReadWriteBuffer(int size){ public static CLBuffer createWriteBuffer(int size){ return getCtx().createFloatBuffer(size, CLMemory.Mem.WRITE_ONLY); } + + public static CLCommandQueue execute(CLKernel kernel, int range){ + int localWorkSize = Math.min(device.getMaxWorkGroupSize(), 256); + int globalWorkSize = roundUp(localWorkSize, range); + kernel.putArg(range); + return getQueue().put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize); + } + static int roundUp(int groupSize, int globalSize){ + return ((globalSize + groupSize - 1) / groupSize) * groupSize; + } } diff --git a/src/main/resources/kernels/convolution_backword.cl b/src/main/resources/kernels/convolution_backword.cl index 843121b..cbf706c 100644 --- a/src/main/resources/kernels/convolution_backword.cl +++ b/src/main/resources/kernels/convolution_backword.cl @@ -10,9 +10,13 @@ __kernel void delta( __global float *delta, __global float *filter, int inputChannels, - __global float *newDelta + __global float *newDelta, + int count ){ int chxxyy = get_global_id(0); + if(chxxyy >= count){ + return; + } int ch = chxxyy / (inputWidth * inputHeight); int xx = (chxxyy % (inputWidth * inputHeight)) / inputHeight; int yy = chxxyy % inputHeight; @@ -27,7 +31,10 @@ __kernel void delta( if ((((yy - j) + sizeHalf) % stride)==0 && y>=0 && y=0.0f)?delta[fxy]:0.0f; - tempDelta = tempDelta + (d * filter[((((((f * inputChannels) * filterSize) * filterSize) + ((ch * filterSize) * filterSize)) + (i * filterSize)) + j)]); + tempDelta = tempDelta + d * filter[ + f * inputChannels * filterSize * filterSize + + ch * filterSize * filterSize + + i * filterSize + j]; } } } @@ -48,9 +55,13 @@ __kernel void filter( int inputHeight, float learningRate, __global float *input, - __global float *filter + __global float *filter, + int count ){ int fchij = get_global_id(0); + if(fchij >= count){ + return; + } int f = fchij / ((inputChannels * filterSize) * filterSize); int ch = (fchij % ((inputChannels * filterSize) * filterSize)) / (filterSize * filterSize); int i = (fchij % (filterSize * filterSize)) / filterSize; @@ -77,9 +88,13 @@ __kernel void bias( __global float *result, __global float *delta, __global float *tempBiasDelta, - float learningRate + float learningRate, + int count ){ int fxy = get_global_id(0); + if(fxy >= count){ + return; + } float d = result[fxy]>=0.0f ? delta[fxy] : 0.0f; tempBiasDelta[fxy] = learningRate * d; } @@ -88,9 +103,13 @@ __kernel void biasAfter( int outputWidth, int outputHeight, __global float *tempBiasDelta, - __global float *biasDelta + __global float *biasDelta, + int count ){ int f = get_global_id(0); + if(f >= count){ + return; + } float b = 0; for(int xy = 0; xy < outputWidth * outputHeight; ++xy){ b += tempBiasDelta[f * outputWidth * outputHeight + xy]; From c0b9058fbb3e4cbe72d79e3210fb986d334b34ab Mon Sep 17 00:00:00 2001 From: kishida Date: Sun, 20 Sep 2015 22:54:28 +0900 Subject: [PATCH 05/22] =?UTF-8?q?=E7=95=B3=E8=BE=BC=E3=81=BF=E5=B1=A4?= =?UTF-8?q?=E3=81=AE=E9=A0=86=E4=BC=9D=E6=92=AD=E3=81=ABJOCL=E3=82=92?= =?UTF-8?q?=E4=BD=BF=E3=81=86=E3=80=821=E5=89=B2=E9=AB=98=E9=80=9F?= =?UTF-8?q?=E5=8C=96=E3=80=82100/=E5=88=86=E2=86=92110/=E5=88=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/kishida/cnn/ConvolutionalNet.java | 8 +- .../ConvolutionLocalNormalizationKernel.java | 38 +++----- .../kishida/cnn/layers/ConvolutionLayer.java | 27 +++++- .../cnn/opencl/ConvolutionBackwordCL.java | 12 +-- .../cnn/opencl/ConvolutionForwardCL.java | 80 +++++++++++++++ .../resources/kernels/convolution_backword.cl | 18 ++-- .../resources/kernels/convolution_forward.cl | 97 +++++++++++-------- 7 files changed, 185 insertions(+), 95 deletions(-) create mode 100644 src/main/java/kishida/cnn/opencl/ConvolutionForwardCL.java diff --git a/src/main/java/kishida/cnn/ConvolutionalNet.java b/src/main/java/kishida/cnn/ConvolutionalNet.java index 74c65ba..8c64a9a 100644 --- a/src/main/java/kishida/cnn/ConvolutionalNet.java +++ b/src/main/java/kishida/cnn/ConvolutionalNet.java @@ -59,10 +59,10 @@ public class ConvolutionalNet { private static final int MINI_BATCH = 128; private static final float MOMENTAM = 0.9f; public static final String AVERAGE_PNG = "average.png"; - //private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\alexnet.json.txt"; - //private static final String RESOURCE_NAME = "/alexnet_def.json"; - private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\tinynet.json.txt"; - private static final String RESOURCE_NAME = "/tinynet_def.json"; + private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\alexnet.json.txt"; + private static final String RESOURCE_NAME = "/alexnet_def.json"; + //private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\tinynet.json.txt"; + //private static final String RESOURCE_NAME = "/tinynet_def.json"; static class Img{ diff --git a/src/main/java/kishida/cnn/kernels/ConvolutionLocalNormalizationKernel.java b/src/main/java/kishida/cnn/kernels/ConvolutionLocalNormalizationKernel.java index 09a7e7e..a60aaa8 100644 --- a/src/main/java/kishida/cnn/kernels/ConvolutionLocalNormalizationKernel.java +++ b/src/main/java/kishida/cnn/kernels/ConvolutionLocalNormalizationKernel.java @@ -23,8 +23,8 @@ public void localNormalization(float[] result, int outputChannels, int outputWid this.outputChannels = outputChannels; this.outputWidth = outputWidth; this.outputHeight = outputHeight; - if(useGpu && outputWidth * outputHeight > 500){ - execute(outputWidth * outputHeight); + if(useGpu){ + execute(outputChannels * outputWidth * outputHeight); //throw new UnsupportedOperationException("because I dont know how to use private memory."); }else{ IntStream.range(0, outputWidth).parallel().forEach(x -> { @@ -38,8 +38,8 @@ public void localNormalization(float[] result, int outputChannels, int outputWid @Override public void run() { - int xy = getGlobalId(); - procGpu(xy); + int chxy = getGlobalId(); + procGpu(chxy); } float[] result; @@ -48,30 +48,22 @@ public void run() { int outputChannels; static final int n = 5; - @PrivateMemorySpace(n) float[] sigma = new float[n]; // not work + //@PrivateMemorySpace(n) float[] sigma = new float[n]; // not work - public void procGpu(int xy){ + public void procGpu(int chxy){ final int k = 2; final float a = 0.0001f; final float b = 0.75f; - int lp = 0; - for(; lp < n / 2; ++lp){ - sigma[lp] = - result[lp * outputWidth * outputHeight + xy] * - result[lp * outputWidth * outputHeight + xy]; - } - for(int ch = 0; ch < outputChannels; ++ch){ - sigma[lp % n] = lp >= outputChannels ? 0 : - result[lp * outputWidth * outputHeight + xy] * - result[lp * outputWidth * outputHeight + xy]; - lp = lp + 1; - float sum = 0; - for(int i = 0; i < n; ++i){ - sum += sigma[i]; - } - result[ch * outputWidth * outputHeight + xy] = result[ch * outputWidth * outputHeight + xy] / - pow(k + a * sum, b); + int ch = chxy / (outputWidth * outputHeight); + int xy = chxy % (outputWidth * outputHeight); + + float sum = 0; + for(int lp = max(0, ch - n / 2); lp <= min(outputChannels - 1, ch + n / 2); ++lp){ + sum += result[lp * outputWidth * outputHeight + xy] * + result[lp * outputWidth * outputHeight + xy]; } + result[chxy] = result[chxy] / + pow(k + a * sum, b); } diff --git a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java index f27e577..a1f00ab 100644 --- a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java +++ b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java @@ -21,6 +21,7 @@ import kishida.cnn.kernels.ConvolutionForwardKernel; import kishida.cnn.kernels.ConvolutionLocalNormalizationKernel; import kishida.cnn.opencl.ConvolutionBackwordCL; +import kishida.cnn.opencl.ConvolutionForwardCL; import kishida.cnn.util.FloatUtil; import lombok.Getter; import lombok.Setter; @@ -116,11 +117,27 @@ public final void setPreLayer(NeuralLayer preLayer) { /** 畳み込みフィルタを適用する */ @Override public float[] forward(float[] img) { - result = ConvolutionForwardKernel.INSTANCE.forward(img, inputChannels, inputWidth, inputHeight, - filter, outputChannels, outputWidth, outputHeight, result, filterSize, stride, bias, activation, useGpu); - //localNormalization(result); - ConvolutionLocalNormalizationKernel.INSTANCE.localNormalization(result, - outputChannels, outputWidth, outputHeight, false); + if(true){ + if(false){ + // aparapi + result = ConvolutionForwardKernel.INSTANCE.forward(img, inputChannels, inputWidth, inputHeight, + filter, outputChannels, outputWidth, outputHeight, result, filterSize, stride, bias, activation, false); + //localNormalization(result); + ConvolutionLocalNormalizationKernel.INSTANCE.localNormalization(result, + outputChannels, outputWidth, outputHeight, false); + } else{ + // JOCL + result = ConvolutionForwardCL.INSTANCE.forward(img, inputChannels, inputWidth, inputHeight, + filter, outputChannels, outputWidth, outputHeight, result, filterSize, stride, bias); + } + }else { + //CPU + result = ConvolutionForwardKernel.INSTANCE.forward(img, inputChannels, inputWidth, inputHeight, + filter, outputChannels, outputWidth, outputHeight, result, filterSize, stride, bias, activation, false); + //localNormalization(result); + ConvolutionLocalNormalizationKernel.INSTANCE.localNormalization(result, + outputChannels, outputWidth, outputHeight, false); + } return result; } diff --git a/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java index 8d4f0b4..99ce5a0 100644 --- a/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java +++ b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java @@ -38,7 +38,7 @@ public float[] backward(float[] delta, float[] result, CLBuffer bufTempBias = OpenCL.createReadWriteBuffer(result.length); CLBuffer bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); CLBuffer bufNewDelta = OpenCL.createWriteBuffer(newDelta.length); - OpenCL.getQueue().putBarrier() + OpenCL.getQueue() .putWriteBuffer(bufDelta, false) .putWriteBuffer(bufFilter, false) .putWriteBuffer(bufResult, false) @@ -97,16 +97,6 @@ public float[] backward(float[] delta, float[] result, outputChannels * outputWidth * outputHeight); biasKernel.release(); - /* - OpenCL.getQueue().putReadBuffer(bufTempBias, true); - float[] tempBias = new float[result.length]; - bufTempBias.getBuffer().get(tempBias).rewind(); - - float[] compTempBias = new float[tempBias.length]; - for(int i = 0; i < compTempBias.length; ++i){ - compTempBias[i] = result[i] >= 0 ? delta[i] * learningRate : 0; - }*/ - CLKernel biasAfterKernel = prog.createCLKernel("biasAfter"); biasAfterKernel .putArg(outputWidth) diff --git a/src/main/java/kishida/cnn/opencl/ConvolutionForwardCL.java b/src/main/java/kishida/cnn/opencl/ConvolutionForwardCL.java new file mode 100644 index 0000000..28bb85e --- /dev/null +++ b/src/main/java/kishida/cnn/opencl/ConvolutionForwardCL.java @@ -0,0 +1,80 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package kishida.cnn.opencl; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLKernel; +import com.jogamp.opencl.CLProgram; +import java.nio.FloatBuffer; + +/** + * + * @author naoki + */ +public class ConvolutionForwardCL { + public static ConvolutionForwardCL INSTANCE = new ConvolutionForwardCL(); + CLProgram prog; + private ConvolutionForwardCL() { + } + + public float[] forward(float[] input, int inputChannels, int inputWidth, int inputHeight, + float[] filter, int outputChannels, int outputWidth, int outputHeight, float[] result, + int filterSize, int stride, float[] bias){ + if(prog == null){ + prog = OpenCL.compile("convolution_forward.cl"); + } + + CLBuffer bufInput = OpenCL.createReadBuffer(input); + CLBuffer bufFilter = OpenCL.createReadBuffer(filter); + CLBuffer bufResult = OpenCL.createReadWriteBuffer(result); + CLBuffer bufBias = OpenCL.createReadBuffer(bias); + + OpenCL.getQueue() + .putWriteBuffer(bufInput, false) + .putWriteBuffer(bufFilter, false) + .putWriteBuffer(bufBias, false); + + CLKernel forwardKernel = prog.createCLKernel("forward"); + forwardKernel + .putArg(outputHeight) + .putArg(outputWidth) + .putArg(inputChannels) + .putArg(filterSize) + .putArg(stride) + .putArg(inputWidth) + .putArg(inputHeight) + .putArgs( + bufInput, + bufFilter, + bufResult, + bufBias); + OpenCL.execute(forwardKernel, + outputChannels * outputWidth * outputHeight); + forwardKernel.release(); + + CLKernel normalizeKernel = prog.createCLKernel("localNormalize"); + normalizeKernel + .putArg(outputWidth) + .putArg(outputHeight) + .putArg(outputChannels) + .putArg(bufResult); + OpenCL.execute(normalizeKernel, + outputChannels * outputWidth * outputHeight); + normalizeKernel.release(); + + OpenCL.getQueue() + .putReadBuffer(bufResult, true); + bufResult.getBuffer().get(result); + + bufBias.release(); + bufResult.release(); + bufInput.release(); + bufFilter.release(); + + return result; + } + +} diff --git a/src/main/resources/kernels/convolution_backword.cl b/src/main/resources/kernels/convolution_backword.cl index cbf706c..cce42a9 100644 --- a/src/main/resources/kernels/convolution_backword.cl +++ b/src/main/resources/kernels/convolution_backword.cl @@ -6,9 +6,9 @@ __kernel void delta( int stride, int outputWidth, int outputHeight, - __global float *result, - __global float *delta, - __global float *filter, + __global const float *result, + __global const float *delta, + __global const float *filter, int inputChannels, __global float *newDelta, int count @@ -48,13 +48,13 @@ __kernel void filter( int filterSize, int outputWidth, int outputHeight, - __global float *result, - __global float *delta, + __global const float *result, + __global const float *delta, int stride, int inputWidth, int inputHeight, float learningRate, - __global float *input, + __global const float *input, __global float *filter, int count ){ @@ -85,8 +85,8 @@ __kernel void filter( } __kernel void bias( - __global float *result, - __global float *delta, + __global const float *result, + __global const float *delta, __global float *tempBiasDelta, float learningRate, int count @@ -102,7 +102,7 @@ __kernel void bias( __kernel void biasAfter( int outputWidth, int outputHeight, - __global float *tempBiasDelta, + __global const float *tempBiasDelta, __global float *biasDelta, int count ){ diff --git a/src/main/resources/kernels/convolution_forward.cl b/src/main/resources/kernels/convolution_forward.cl index 79bb82f..9f256f3 100644 --- a/src/main/resources/kernels/convolution_forward.cl +++ b/src/main/resources/kernels/convolution_forward.cl @@ -1,55 +1,66 @@ -void kishida_cnn_kernels_ConvolutionForwardKernel__proc(This *this, int fxy){ - int f = fxy / (this->outputHeight * this->outputWidth); - int x = (fxy % (this->outputHeight * this->outputWidth)) / this->outputHeight; - int y = fxy % this->outputHeight; +__kernel void forward( + int outputHeight, + int outputWidth, + int inputChannels, + int filterSize, + int stride, + int inputWidth, + int inputHeight, + __global const float *input, + __global const float *filter, + __global float *result, + __global const float *bias, + int count +){ + int fxy = get_global_id(0); + if(fxy >= count){ + return; + } + int f = fxy / (outputHeight * outputWidth); + int x = (fxy % (outputHeight * outputWidth)) / outputHeight; + int y = fxy % outputHeight; float r = 0.0f; - for (int ch = 0; chinputChannels; ch++){ - for (int i = 0; ifilterSize; i++){ - int xx = ((x * this->stride) + i) - (this->filterSize / 2); - if (xx>=0 && xxinputWidth){ - for (int j = 0; jfilterSize; j++){ - int yy = ((y * this->stride) + j) - (this->filterSize / 2); - if (yy>=0 && yyinputHeight){ - r = r + (this->input[((((ch * this->inputWidth) * this->inputHeight) + (xx * this->inputHeight)) + yy)] * this->filter[((((((f * this->inputChannels) * this->filterSize) * this->filterSize) + ((ch * this->filterSize) * this->filterSize)) + (i * this->filterSize)) + j)]); + for (int ch = 0; ch=0 && xx=0 && yyresult[fxy] = r + this->bias[f]; - return; + float rs = r + bias[f]; + result[fxy] = rs >= 0 ? rs : 0; } -__kernel void run( - int outputHeight, + +__kernel void localNormalize( int outputWidth, - int inputChannels, - int filterSize, - int stride, - int inputWidth, - int inputHeight, - __global float *input, - __global float *filter, + int outputHeight, + int outputChannels, __global float *result, - __global float *bias, - int passid + int count ){ - This thisStruct; - This* this=&thisStruct; - this->outputHeight = outputHeight; - this->outputWidth = outputWidth; - this->inputChannels = inputChannels; - this->filterSize = filterSize; - this->stride = stride; - this->inputWidth = inputWidth; - this->inputHeight = inputHeight; - this->input = input; - this->filter = filter; - this->result = result; - this->bias = bias; - this->passid = passid; - { - int fixy = get_global_id(0); - kishida_cnn_kernels_ConvolutionForwardKernel__proc(this, fixy); + int chxy = get_global_id(0); + if(chxy >= count){ return; } -} \ No newline at end of file + float k = 2; + float a = 1.0E-4f; + float b = 0.75f; + int n = 5; + int ch = chxy / (outputWidth * outputHeight); + int xy = chxy % (outputWidth * outputHeight); + float sum = 0.0f; + + for (int lp = max(0, ch - n / 2); lp <= min(outputChannels - 1, ch + n / 2); lp++){ + sum += result[lp * outputWidth * outputHeight + xy] * + result[lp * outputWidth * outputHeight + xy]; + } + result[chxy] /= pow(k + a * sum, b); +} From f3e483e63d902f6bba8e96ab30021ce173bc998f Mon Sep 17 00:00:00 2001 From: kishida Date: Mon, 21 Sep 2015 02:45:09 +0900 Subject: [PATCH 06/22] =?UTF-8?q?=E5=85=A8=E7=B5=90=E5=90=88=E5=B1=A4?= =?UTF-8?q?=E3=81=A8=E6=B4=BB=E6=80=A7=E5=8C=96=E9=96=A2=E6=95=B0=E3=81=AE?= =?UTF-8?q?JOCL=E5=8C=96=E3=80=82=E3=81=97=E3=81=8B=E3=81=97=E9=81=85?= =?UTF-8?q?=E3=81=8F=E3=81=AA=E3=81=A3=E3=81=9F=E3=83=BB=E3=83=BB=E3=83=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cnn/activation/ActivationFunction.java | 1 + .../activation/LimitedRectifiedLinear.java | 5 + .../cnn/activation/LinearFunction.java | 4 + .../cnn/activation/LogisticFunction.java | 4 + .../cnn/activation/RectifiedLinear.java | 4 + .../cnn/activation/SoftMaxFunction.java | 5 + .../cnn/kernels/FullyForwardKernel.java | 7 +- .../java/kishida/cnn/layers/FullyConnect.java | 22 +++-- .../kishida/cnn/opencl/FullyForwardCL.java | 98 +++++++++++++++++++ src/main/java/kishida/cnn/opencl/OpenCL.java | 8 +- src/main/resources/kernels/activation.cl | 64 ++++++++++++ src/main/resources/kernels/fully_forward.cl | 45 ++++----- 12 files changed, 226 insertions(+), 41 deletions(-) create mode 100644 src/main/java/kishida/cnn/opencl/FullyForwardCL.java create mode 100644 src/main/resources/kernels/activation.cl diff --git a/src/main/java/kishida/cnn/activation/ActivationFunction.java b/src/main/java/kishida/cnn/activation/ActivationFunction.java index 4768adc..ac58b07 100644 --- a/src/main/java/kishida/cnn/activation/ActivationFunction.java +++ b/src/main/java/kishida/cnn/activation/ActivationFunction.java @@ -28,4 +28,5 @@ public void applyAfter(float[] values) { /** 微分 */ public abstract float diff(float value); + public abstract String getName(); } diff --git a/src/main/java/kishida/cnn/activation/LimitedRectifiedLinear.java b/src/main/java/kishida/cnn/activation/LimitedRectifiedLinear.java index 4d37ccb..7c4c221 100644 --- a/src/main/java/kishida/cnn/activation/LimitedRectifiedLinear.java +++ b/src/main/java/kishida/cnn/activation/LimitedRectifiedLinear.java @@ -23,4 +23,9 @@ public float diff(float value) { return value >= 0 && value <= 2 ? 1 : 0; } + @Override + public String getName() { + return "limitrelu"; + } + } diff --git a/src/main/java/kishida/cnn/activation/LinearFunction.java b/src/main/java/kishida/cnn/activation/LinearFunction.java index 5bb1ef2..b46eb2a 100644 --- a/src/main/java/kishida/cnn/activation/LinearFunction.java +++ b/src/main/java/kishida/cnn/activation/LinearFunction.java @@ -20,5 +20,9 @@ public float apply(float value) { public float diff(float value) { return 1; } + @Override + public String getName() { + return "linear"; + } } diff --git a/src/main/java/kishida/cnn/activation/LogisticFunction.java b/src/main/java/kishida/cnn/activation/LogisticFunction.java index 637b0f0..577468c 100644 --- a/src/main/java/kishida/cnn/activation/LogisticFunction.java +++ b/src/main/java/kishida/cnn/activation/LogisticFunction.java @@ -17,5 +17,9 @@ public float apply(float value) { public float diff(float value) { return value * (1 - value); } + @Override + public String getName() { + return "logistic"; + } } diff --git a/src/main/java/kishida/cnn/activation/RectifiedLinear.java b/src/main/java/kishida/cnn/activation/RectifiedLinear.java index b37e663..c620a0b 100644 --- a/src/main/java/kishida/cnn/activation/RectifiedLinear.java +++ b/src/main/java/kishida/cnn/activation/RectifiedLinear.java @@ -17,5 +17,9 @@ public float apply(float value) { public float diff(float value) { return value >= 0 ? 1 : 0; } + @Override + public String getName() { + return "relu"; + } } diff --git a/src/main/java/kishida/cnn/activation/SoftMaxFunction.java b/src/main/java/kishida/cnn/activation/SoftMaxFunction.java index b6e9875..86ad50e 100644 --- a/src/main/java/kishida/cnn/activation/SoftMaxFunction.java +++ b/src/main/java/kishida/cnn/activation/SoftMaxFunction.java @@ -30,4 +30,9 @@ public float diff(float value) { return value * (1 - value); } + @Override + public String getName() { + return "softmax"; + } + } diff --git a/src/main/java/kishida/cnn/kernels/FullyForwardKernel.java b/src/main/java/kishida/cnn/kernels/FullyForwardKernel.java index f996d10..17469af 100644 --- a/src/main/java/kishida/cnn/kernels/FullyForwardKernel.java +++ b/src/main/java/kishida/cnn/kernels/FullyForwardKernel.java @@ -26,10 +26,13 @@ public void run() { private void proc(int j){ if(dropout[j] == 1){ + float r = 0; for (int i = 0; i < inSize; ++i) { - result[j] += in[i] * weight[i * out + j]; + r += in[i] * weight[i * out + j]; } - result[j] += bias[j]; + result[j] = r + bias[j]; + }else{ + result[j] = 0; } } int out; diff --git a/src/main/java/kishida/cnn/layers/FullyConnect.java b/src/main/java/kishida/cnn/layers/FullyConnect.java index 9183706..7bd019a 100644 --- a/src/main/java/kishida/cnn/layers/FullyConnect.java +++ b/src/main/java/kishida/cnn/layers/FullyConnect.java @@ -13,6 +13,7 @@ import java.util.stream.IntStream; import kishida.cnn.activation.ActivationFunction; import kishida.cnn.kernels.FullyForwardKernel; +import kishida.cnn.opencl.FullyForwardCL; import kishida.cnn.util.FloatUtil; import lombok.Getter; import lombok.Setter; @@ -132,23 +133,24 @@ public void prepareDropout() { @Override public float[] forward(float[] in) { prepareDropout(); - Arrays.fill(result, 0); - FullyForwardKernel.INSTANCE.forward(outputSize, dropout, in, result, weight, bias, useGpu); - /* - IntStream.range(0, out).parallel().filter(j -> dropout[j] == 1).forEach(j -> { - for (int i = 0; i < in.length; ++i) { - result[j] += in[i] * weight[i * out + j]; + if(useGpu){ + if(false){ + FullyForwardKernel.INSTANCE.forward(outputSize, dropout, in, result, weight, bias, useGpu); + activation.applyAfter(result); + }else{ + FullyForwardCL.INSTANCE.forward(inputSize, outputSize, dropout, in, weight, bias, result, activation); } - result[j] += bias[j]; - });*/ - activation.applyAfter(result); + }else{ + FullyForwardKernel.INSTANCE.forward(outputSize, dropout, in, result, weight, bias, useGpu); + activation.applyAfter(result); + } return result; } @Override public float[] backward(float[] in, float[] delta) { Arrays.fill(newDelta, 0); - Arrays.fill(diffed, 0); + //Arrays.fill(diffed, 0); for(int i = 0; i < result.length; ++i){ diffed[i] = activation.diff(result[i]); } diff --git a/src/main/java/kishida/cnn/opencl/FullyForwardCL.java b/src/main/java/kishida/cnn/opencl/FullyForwardCL.java new file mode 100644 index 0000000..e27010d --- /dev/null +++ b/src/main/java/kishida/cnn/opencl/FullyForwardCL.java @@ -0,0 +1,98 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package kishida.cnn.opencl; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLKernel; +import com.jogamp.opencl.CLProgram; +import java.nio.FloatBuffer; +import java.nio.IntBuffer; +import java.util.Map; +import kishida.cnn.activation.ActivationFunction; +import kishida.cnn.activation.SoftMaxFunction; + +/** + * + * @author naoki + */ +public class FullyForwardCL { + public static FullyForwardCL INSTANCE = new FullyForwardCL(); + CLProgram progFully; + CLProgram progActivation; + CLKernel forwardKernel; + Map actKernels; + public FullyForwardCL() { + } + + public void forward(int inputSize, int outputSize, int[] dropout, + float[] input, float[] weight, float[] bias, float[] result, + ActivationFunction activation){ + if(progFully == null){ + progFully = OpenCL.compile("fully_forward.cl"); + forwardKernel = progFully.createCLKernel("forward"); + } + if(progActivation == null){ + progActivation = OpenCL.compile("activation.cl"); + actKernels = progActivation.createCLKernels(); + } + + CLBuffer bufDropout = OpenCL.createReadBuffer(dropout); + CLBuffer bufInput = OpenCL.createReadBuffer(input); + CLBuffer bufWeight = OpenCL.createReadBuffer(weight); + CLBuffer bufBias = OpenCL.createReadBuffer(bias); + CLBuffer bufResult = OpenCL.createReadWriteBuffer(result.length); + + OpenCL.getQueue() + .putWriteBuffer(bufDropout, false) + .putWriteBuffer(bufInput, false) + .putWriteBuffer(bufWeight, false) + .putWriteBuffer(bufBias, false); + + forwardKernel.rewind() + .putArg(input.length) + .putArg(outputSize) + .putArgs( + bufDropout, + bufInput, + bufWeight, + bufBias, + bufResult); + OpenCL.execute(forwardKernel, outputSize); + + if(activation instanceof SoftMaxFunction){ + CLBuffer bufExped = OpenCL.createReadWriteBuffer(result.length); + CLKernel kernelActPre = actKernels.get("softmax_before"); + kernelActPre.rewind() + .putArg(bufResult) + .putArg(bufExped); + OpenCL.execute(kernelActPre, outputSize); + + CLKernel kernelAct = actKernels.get("softmax"); + kernelAct.rewind() + .putArg(bufExped) + .putArg(bufResult); + OpenCL.execute(kernelAct, outputSize); + + bufExped.release(); + + }else{ + CLKernel kernelAct = actKernels.get(activation.getName()); + kernelAct.rewind() + .putArg(bufResult); + OpenCL.execute(kernelAct, outputSize); + } + + OpenCL.getQueue().putReadBuffer(bufResult, true); + bufResult.getBuffer().get(result); + + bufDropout.release(); + bufInput.release(); + bufWeight.release(); + bufBias.release(); + bufResult.release(); + + } +} diff --git a/src/main/java/kishida/cnn/opencl/OpenCL.java b/src/main/java/kishida/cnn/opencl/OpenCL.java index 09f869c..6d943f0 100644 --- a/src/main/java/kishida/cnn/opencl/OpenCL.java +++ b/src/main/java/kishida/cnn/opencl/OpenCL.java @@ -15,6 +15,7 @@ import java.io.IOException; import java.io.UncheckedIOException; import java.nio.FloatBuffer; +import java.nio.IntBuffer; import lombok.Getter; /** @@ -77,7 +78,12 @@ public static CLBuffer createReadWriteBuffer(int size){ public static CLBuffer createWriteBuffer(int size){ return getCtx().createFloatBuffer(size, CLMemory.Mem.WRITE_ONLY); } - + public static CLBuffer createReadBuffer(int[] data){ + CLBuffer buf = getCtx().createIntBuffer( + data.length, CLMemory.Mem.READ_ONLY); + buf.getBuffer().put(data).rewind(); + return buf; + } public static CLCommandQueue execute(CLKernel kernel, int range){ int localWorkSize = Math.min(device.getMaxWorkGroupSize(), 256); int globalWorkSize = roundUp(localWorkSize, range); diff --git a/src/main/resources/kernels/activation.cl b/src/main/resources/kernels/activation.cl new file mode 100644 index 0000000..0143565 --- /dev/null +++ b/src/main/resources/kernels/activation.cl @@ -0,0 +1,64 @@ +__kernel void relu( + __global float *result, + int count +){ + int i = get_global_id(0); + if(i >= count){ + return; + } + float r = result[i]; + result[i] = r >= 0 ? r : 0; +} + +__kernel void relu_diff( + __global const float* result, + __global float* diff, + int count +){ + int i = get_global_id(0); + if(i >= count){ + return; + } + diff[i] = result[i] >= 0 ? 1 : 0; +} + +__kernel void softmax_before( + __global const float* result, + __global float* exped, + int count +){ + int i = get_global_id(0); + if(i >= count){ + return; + } + exped[i] = exp(min(700.0f, result[i])); +} + +__kernel void softmax( + __global const float* exped, + __global float* result, + int count +){ + int i = get_global_id(0); + if(i >= count){ + return; + } + float sum = 0; + for(int j = 0; j < count; ++j){ + sum += exped[j]; + } + result[i] = exped[i] / sum; +} + +__kernel void softmax_diff( + __global const float* result, + __global float* diff, + int count +){ + int i = get_global_id(0); + if(i >= count){ + return; + } + float r = result[i]; + diff[i] = r * (1 - r); +} diff --git a/src/main/resources/kernels/fully_forward.cl b/src/main/resources/kernels/fully_forward.cl index ecb2402..e8b06a3 100644 --- a/src/main/resources/kernels/fully_forward.cl +++ b/src/main/resources/kernels/fully_forward.cl @@ -1,35 +1,24 @@ -void kishida_cnn_kernels_FullyForwardKernel__proc(This *this, int j){ - if (this->dropout[j]==1){ - for (int i = 0; iinSize; i++){ - this->result[j] = this->result[j] + (this->in[i] * this->weight[((i * this->out) + j)]); - } - this->result[j] = this->result[j] + this->bias[j]; - } - return; -} -__kernel void run( - __global int *dropout, +__kernel void forward( int inSize, - __global float *result, - __global float *in, - __global float *weight, int out, + __global const int *dropout, + __global const float *in, + __global float *weight, __global float *bias, - int passid + __global float *result, + int count ){ - This thisStruct; - This* this=&thisStruct; - this->dropout = dropout; - this->inSize = inSize; - this->result = result; - this->in = in; - this->weight = weight; - this->out = out; - this->bias = bias; - this->passid = passid; - { - int j = get_global_id(0); - kishida_cnn_kernels_FullyForwardKernel__proc(this, j); + int j = get_global_id(0); + if(j >= count){ return; } + if (dropout[j] == 1){ + float r = 0; + for (int i = 0; i Date: Mon, 21 Sep 2015 05:01:28 +0900 Subject: [PATCH 07/22] =?UTF-8?q?=E5=85=A8=E7=B5=90=E5=90=88=E5=B1=A4?= =?UTF-8?q?=E3=81=AE=E9=80=86=E4=BC=9D=E6=92=AD=E3=81=AEJOCL=E5=8C=96?= =?UTF-8?q?=E3=80=82=E3=81=97=E3=81=8B=E3=81=97=E3=81=8B=E3=81=AA=E3=82=8A?= =?UTF-8?q?=E9=81=85=E3=81=8F=E3=81=AA=E3=81=A3=E3=81=9F=E3=83=BB=E3=83=BB?= =?UTF-8?q?=E3=83=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/kishida/cnn/layers/FullyConnect.java | 44 +++--- .../kishida/cnn/opencl/FullyBackwordCL.java | 140 ++++++++++++++++++ src/main/resources/kernels/fully_backword.cl | 67 +++++++++ 3 files changed, 232 insertions(+), 19 deletions(-) create mode 100644 src/main/java/kishida/cnn/opencl/FullyBackwordCL.java create mode 100644 src/main/resources/kernels/fully_backword.cl diff --git a/src/main/java/kishida/cnn/layers/FullyConnect.java b/src/main/java/kishida/cnn/layers/FullyConnect.java index 7bd019a..224dbb9 100644 --- a/src/main/java/kishida/cnn/layers/FullyConnect.java +++ b/src/main/java/kishida/cnn/layers/FullyConnect.java @@ -8,11 +8,11 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; -import java.util.Arrays; import java.util.DoubleSummaryStatistics; import java.util.stream.IntStream; import kishida.cnn.activation.ActivationFunction; import kishida.cnn.kernels.FullyForwardKernel; +import kishida.cnn.opencl.FullyBackwordCL; import kishida.cnn.opencl.FullyForwardCL; import kishida.cnn.util.FloatUtil; import lombok.Getter; @@ -134,7 +134,7 @@ public void prepareDropout() { public float[] forward(float[] in) { prepareDropout(); if(useGpu){ - if(false){ + if(true){ FullyForwardKernel.INSTANCE.forward(outputSize, dropout, in, result, weight, bias, useGpu); activation.applyAfter(result); }else{ @@ -149,24 +149,30 @@ public float[] forward(float[] in) { @Override public float[] backward(float[] in, float[] delta) { - Arrays.fill(newDelta, 0); - //Arrays.fill(diffed, 0); - for(int i = 0; i < result.length; ++i){ - diffed[i] = activation.diff(result[i]); - } - IntStream.range(0, in.length).parallel().forEach((i) -> { - for (int j = 0; j < outputSize; ++j) { - if (dropout[j] != 1) { - continue; - } - float d = diffed[j] * delta[j]; - newDelta[i] += d * weight[i * outputSize + j];//in[i] *; - weightDelta[i * outputSize + j] += d * in[i] * parent.getLearningRate(); + if(false){ + FullyBackwordCL.INSTANCE.backword(inputSize, outputSize, + dropout, in, delta, result, weight, weightDelta, biasDelta, newDelta, + parent.getLearningRate(), activation); + }else{ + for(int i = 0; i < result.length; ++i){ + diffed[i] = activation.diff(result[i]); } - }); - IntStream.range(0, outputSize).parallel().filter(j -> dropout[j] == 1).forEach(j -> { - biasDelta[j] += diffed[j] * delta[j] * parent.getLearningRate(); - }); + IntStream.range(0, in.length).parallel().forEach((i) -> { + float nd = 0; + for (int j = 0; j < outputSize; ++j) { + if (dropout[j] != 1) { + continue; + } + float d = diffed[j] * delta[j]; + nd += d * weight[i * outputSize + j];//in[i] *; + weightDelta[i * outputSize + j] += d * in[i] * parent.getLearningRate(); + } + newDelta[i] = nd; + }); + IntStream.range(0, outputSize).parallel().filter(j -> dropout[j] == 1).forEach(j -> { + biasDelta[j] += diffed[j] * delta[j] * parent.getLearningRate(); + }); + } return newDelta; } diff --git a/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java b/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java new file mode 100644 index 0000000..c6d6c4e --- /dev/null +++ b/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java @@ -0,0 +1,140 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package kishida.cnn.opencl; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLKernel; +import com.jogamp.opencl.CLProgram; +import java.nio.FloatBuffer; +import java.nio.IntBuffer; +import java.util.Map; +import kishida.cnn.activation.ActivationFunction; +import kishida.cnn.activation.RectifiedLinear; + +/** + * + * @author naoki + */ +public class FullyBackwordCL { + public static FullyBackwordCL INSTANCE = new FullyBackwordCL(); + CLProgram prog; + Map kernels; + CLProgram progActivation; + Map actKernels; + + private FullyBackwordCL() { + } + + public void backword(int inputSize, int outputSize, + int[] dropout, float[] input, float[] delta, + float[] result, float[] weight, + float[] weightDelta, float[] biasDelta, + float[] newDelta, + float learningRate, ActivationFunction activation){ + if(prog == null){ + prog = OpenCL.compile("fully_backword.cl"); + kernels = prog.createCLKernels(); + } + if(progActivation == null){ + progActivation = OpenCL.compile("activation.cl"); + actKernels = progActivation.createCLKernels(); + } + + CLBuffer bufDropout = OpenCL.createReadBuffer(dropout); + CLBuffer bufInput = OpenCL.createReadBuffer(input); + CLBuffer bufDelta = OpenCL.createReadBuffer(delta); + CLBuffer bufResult = OpenCL.createReadBuffer(result); + CLBuffer bufWeight = OpenCL.createReadBuffer(weight); + CLBuffer bufNewDelta = OpenCL.createWriteBuffer(newDelta.length); + CLBuffer bufWeightDelta = OpenCL.createReadWriteBuffer(weightDelta); + CLBuffer bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); + CLBuffer bufDiffed = OpenCL.createReadWriteBuffer(result.length); + + OpenCL.getQueue() + .putWriteBuffer(bufDropout ,false) + .putWriteBuffer(bufInput ,false) + .putWriteBuffer(bufDelta ,false) + .putWriteBuffer(bufResult ,false) + .putWriteBuffer(bufWeight ,false) + .putWriteBuffer(bufWeightDelta ,false) + .putWriteBuffer(bufBiasDelta ,false); + + CLKernel actKernel = actKernels.get(activation.getName() + "_diff"); + actKernel.rewind() + .putArg(bufResult) + .putArg(bufDiffed); + OpenCL.execute(actKernel, outputSize); + + CLKernel kernelDelta = kernels.get("backword_delta"); + kernelDelta.rewind() + .putArg(outputSize) + .putArgs( + bufDropout, + bufDelta, + bufDiffed, + bufWeight, + bufNewDelta); + OpenCL.execute(kernelDelta, inputSize); + + CLKernel kernelWeight = kernels.get("backword_weight"); + kernelWeight.rewind() + .putArg(outputSize) + .putArg(learningRate) + .putArgs( + bufDropout, + bufInput, + bufDelta, + bufDiffed, + bufWeight, + bufWeightDelta); + OpenCL.execute(kernelWeight, inputSize * outputSize); + + CLKernel kernelBias = kernels.get("backword_bias"); + kernelBias.rewind() + .putArg(outputSize) + .putArg(learningRate) + .putArg(bufDropout) + .putArg(bufDelta) + .putArg(bufDiffed) + .putArg(bufBiasDelta); + OpenCL.execute(kernelBias, outputSize); + + OpenCL.getQueue() + .putReadBuffer(bufNewDelta ,false) + .putReadBuffer(bufBiasDelta ,false) + .putReadBuffer(bufWeightDelta ,true); + bufNewDelta.getBuffer().get(newDelta); + bufBiasDelta.getBuffer().get(biasDelta); + bufWeightDelta.getBuffer().get(weightDelta); + + bufDropout .release(); + bufInput .release(); + bufDelta .release(); + bufResult .release(); + bufWeight .release(); + bufNewDelta .release(); + bufWeightDelta .release(); + bufBiasDelta .release(); + bufDiffed .release(); + } + + public static void main(String[] args) { + int inputSize = 5; + int outputSize = 9; + int[] dropout = new int[outputSize]; + float[] delta = new float[outputSize]; + float[] input = new float[inputSize]; + float[] result = new float[outputSize]; + float[] weight = new float[inputSize * outputSize]; + float[] weightDelta = new float[weight.length]; + float[] biasDelta = new float[outputSize]; + float[] newDelta = new float[inputSize]; + + INSTANCE.backword(inputSize, outputSize, + dropout, input, delta, result, weight, + weightDelta, biasDelta, newDelta, 0.001f, new RectifiedLinear()); + } +} diff --git a/src/main/resources/kernels/fully_backword.cl b/src/main/resources/kernels/fully_backword.cl new file mode 100644 index 0000000..e2d0b6f --- /dev/null +++ b/src/main/resources/kernels/fully_backword.cl @@ -0,0 +1,67 @@ +__kernel void backword_delta( + int outputSize, + __global const int* dropout, + __global const float* delta, + __global const float* diffed, + __global const float* weight, + __global float* newDelta, + int count +){ + int i = get_global_id(0); + if(i >= count){ + return; + } + float nd = 0; + for (int j = 0; j < outputSize; ++j) { + if (dropout[j] != 1) { + continue; + } + float d = diffed[j] * delta[j]; + nd += d * weight[i * outputSize + j];//in[i] *; + } + newDelta[i] = nd; +} + +__kernel void backword_weight( + int outputSize, + float learningRate, + __global const int* dropout, + __global const float* input, + __global const float* delta, + __global const float* diffed, + __global const float* weight, + __global float* weightDelta, + int count +){ + int ij = get_global_id(0); + if(ij >= count){ + return; + } + int i = ij / outputSize; + int j = ij % outputSize; + if (dropout[j] != 1) { + return; + } + float d = diffed[j] * delta[j]; + weightDelta[ij] += d * input[i] * learningRate; + +} + +__kernel void backword_bias( + int outputSize, + float learningRate, + __global const int* dropout, + __global const float* delta, + __global const float* diffed, + __global float* biasDelta, + int count +){ + int j = get_global_id(0); + if(j >= count){ + return; + } + if (dropout[j] != 1) { + return; + } + biasDelta[j] += diffed[j] * delta[j] * learningRate; +} \ No newline at end of file From 46a48f155df362561d6f9a594af8dc502c10aa6c Mon Sep 17 00:00:00 2001 From: kishida Date: Tue, 22 Sep 2015 13:20:31 +0900 Subject: [PATCH 08/22] =?UTF-8?q?MaxPooling=E3=81=AEOpenCL=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/kishida/cnn/ConvolutionalNet.java | 8 +- .../kishida/cnn/layers/MaxPoolingLayer.java | 143 ++++++++++++------ .../java/kishida/cnn/opencl/MaxPoolingCL.java | 117 ++++++++++++++ src/main/resources/kernels/maxpooling.cl | 95 ++++++++++++ 4 files changed, 316 insertions(+), 47 deletions(-) create mode 100644 src/main/java/kishida/cnn/opencl/MaxPoolingCL.java create mode 100644 src/main/resources/kernels/maxpooling.cl diff --git a/src/main/java/kishida/cnn/ConvolutionalNet.java b/src/main/java/kishida/cnn/ConvolutionalNet.java index 8c64a9a..74c65ba 100644 --- a/src/main/java/kishida/cnn/ConvolutionalNet.java +++ b/src/main/java/kishida/cnn/ConvolutionalNet.java @@ -59,10 +59,10 @@ public class ConvolutionalNet { private static final int MINI_BATCH = 128; private static final float MOMENTAM = 0.9f; public static final String AVERAGE_PNG = "average.png"; - private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\alexnet.json.txt"; - private static final String RESOURCE_NAME = "/alexnet_def.json"; - //private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\tinynet.json.txt"; - //private static final String RESOURCE_NAME = "/tinynet_def.json"; + //private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\alexnet.json.txt"; + //private static final String RESOURCE_NAME = "/alexnet_def.json"; + private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\tinynet.json.txt"; + private static final String RESOURCE_NAME = "/tinynet_def.json"; static class Img{ diff --git a/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java b/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java index ad7f4fe..a3641c4 100644 --- a/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java +++ b/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java @@ -8,7 +8,10 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import java.util.Arrays; +import java.util.Random; +import java.util.stream.Collectors; import java.util.stream.IntStream; +import kishida.cnn.opencl.MaxPoolingCL; import lombok.Getter; /** @@ -45,69 +48,123 @@ public final void setPreLayer(NeuralLayer preLayer) { /** プーリング(max) */ @Override public float[] forward(float[] data) { - IntStream.range(0, inputChannels).parallel().forEach(ch -> { - for (int x = 0; x < outputWidth; ++x) { - for (int y = 0; y < outputHeight; ++y) { - float max = Float.NEGATIVE_INFINITY; - for (int i = 0; i < size; ++i) { - int xx = x * stride + i - size / 2; - if (xx < 0 || xx >= inputWidth) { - continue; - } - for (int j = 0; j < size; ++j) { - int yy = y * stride + j - size / 2; - if (yy < 0 || yy >= inputHeight) { + if(true){ + MaxPoolingCL.INSTANCE.forward(inputChannels, inputWidth, inputHeight, + outputWidth, outputHeight, size, stride, data, result); + }else{ + IntStream.range(0, inputChannels).parallel().forEach(ch -> { + for (int x = 0; x < outputWidth; ++x) { + for (int y = 0; y < outputHeight; ++y) { + float max = Float.NEGATIVE_INFINITY; + for (int i = 0; i < size; ++i) { + int xx = x * stride + i - size / 2; + if (xx < 0 || xx >= inputWidth) { continue; } - float d = data[ch * inputWidth * inputHeight + xx * inputHeight + yy]; - if (max < d) { - max = d; + for (int j = 0; j < size; ++j) { + int yy = y * stride + j - size / 2; + if (yy < 0 || yy >= inputHeight) { + continue; + } + float d = data[ch * inputWidth * inputHeight + xx * inputHeight + yy]; + if (max < d) { + max = d; + } } } + result[ch * outputWidth * outputHeight + x * outputHeight + y] = max; } - result[ch * outputWidth * outputHeight + x * outputHeight + y] = max; } - } - }); + }); + } return result; } @Override public float[] backward(float[] in, float[] delta) { - Arrays.fill(newDelta, 0); - IntStream.range(0, inputChannels).parallel().forEach(ch -> { - for (int x = 0; x < outputWidth; ++x) { - for (int y = 0; y < outputHeight; ++y) { - float max = Float.NEGATIVE_INFINITY; - int maxX = 0; - int maxY = 0; - for (int i = 0; i < size; ++i) { - int xx = x * stride + i - size / 2; - if (xx < 0 || xx >= inputWidth) { - continue; - } - for (int j = 0; j < size; ++j) { - int yy = y * stride + j - size / 2; - if (yy < 0 || yy >= inputHeight) { + return backward(in, delta, false); + } + public float[] backward(float[] in, float[] delta, boolean gpu) { + if(gpu){ + MaxPoolingCL.INSTANCE.backword(inputChannels, inputWidth, inputHeight, + outputWidth, outputHeight, size, stride, in, delta, newDelta); + }else{ + Arrays.fill(newDelta, 0); + IntStream.range(0, inputChannels).parallel().forEach(ch -> { + for (int x = 0; x < outputWidth; ++x) { + for (int y = 0; y < outputHeight; ++y) { + float max = Float.NEGATIVE_INFINITY; + int maxX = 0; + int maxY = 0; + for (int i = 0; i < size; ++i) { + int xx = x * stride + i - size / 2; + if (xx < 0 || xx >= inputWidth) { continue; } - float d = in[ch * inputWidth * inputHeight + xx * inputWidth + yy]; - if (max < d) { - max = d; - maxX = xx; - maxY = yy; + for (int j = 0; j < size; ++j) { + int yy = y * stride + j - size / 2; + if (yy < 0 || yy >= inputHeight) { + continue; + } + float d = in[ch * inputWidth * inputHeight + xx * inputWidth + yy]; + if (max < d) { + max = d; + maxX = xx; + maxY = yy; + } } } + int chxy = ch * outputWidth * outputHeight + x * outputHeight + y; + newDelta[ch * inputWidth * inputHeight + maxX * inputHeight + maxY] += + delta[chxy]; } - int chxy = ch * outputWidth * outputHeight + x * outputHeight + y; - newDelta[ch * inputWidth * inputHeight + maxX * inputHeight + maxY] += - delta[chxy]; } - } - }); + }); + } return newDelta; } + public static void main(String[] args) { + InputLayer input = new InputLayer(6, 6); + MaxPoolingLayer pool = new MaxPoolingLayer("test_pool", 3, 2); + pool.setPreLayer(input); + for(int i = 0; i < pool.newDelta.length; ++i){ + pool.newDelta[i] = 3; + } + float[] in = new float[6 * 6 * 3]; + for(int i = 0; i < in.length; ++i){ + in[i] = i; + } + float[] delta = { + 0.01f, 0.02f, 0.03f, 0.05f, 0.07f, 0.11f, 0.13f, 0.17f, 0.19f, + 1, 2, 3, 5, 7, 11, 13, 17, 19, + 1, 2, 3, 5, 7, 11, 13, 17, 19}; + float[] newDeltaGpu = pool.backward(in, delta, true); + float[] newDeltaCpu = pool.backward(in, delta, false); + System.out.println(Arrays.equals(newDeltaCpu, newDeltaGpu)); + IntStream.range(0, newDeltaGpu.length / 6 / 3).forEach(i -> { + System.out.println(IntStream.range(0, 6) + .map(n -> n + i * 6) + .mapToObj(n -> "" + (int)(newDeltaGpu[n]*100)) + .collect(Collectors.joining(","))); + }); + + Random r = new Random(); + for(int t = 0; t < 1000; ++t){ + for(int i = 0; i < in.length; ++i){ + in[i] = r.nextFloat(); + } + for(int i = 0; i < delta.length; ++i){ + delta[i] = r.nextFloat(); + } + float[] newDeltaGpu2 = pool.backward(in, delta, true); + float[] newDeltaCpu2 = pool.backward(in, delta, false); + if(!Arrays.equals(newDeltaCpu2, newDeltaGpu2)){ + System.out.println("wrong"); + }; + } + } + @Override public String toString() { return String.format("%s:Max pooling size:%dx%d stride:%d in:%dx%dx%d out %dx%dx%d", diff --git a/src/main/java/kishida/cnn/opencl/MaxPoolingCL.java b/src/main/java/kishida/cnn/opencl/MaxPoolingCL.java new file mode 100644 index 0000000..914a4ff --- /dev/null +++ b/src/main/java/kishida/cnn/opencl/MaxPoolingCL.java @@ -0,0 +1,117 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package kishida.cnn.opencl; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLKernel; +import com.jogamp.opencl.CLProgram; +import java.nio.FloatBuffer; +import java.util.Map; + +/** + * + * @author naoki + */ +public class MaxPoolingCL { + public static MaxPoolingCL INSTANCE = new MaxPoolingCL(); + + CLProgram prog; + Map kernels; + + private MaxPoolingCL() { + } + + public void forward(int inputChannel, int inputWidth, int inputHeight, int outputWidth, int ouptutHeight, + int size, int stride, float[] input, float[] result){ + if(prog == null){ + prog = OpenCL.compile("maxpooling.cl"); + kernels = prog.createCLKernels(); + } + + CLBuffer bufInput = OpenCL.createReadBuffer(input); + CLBuffer bufResult = OpenCL.createWriteBuffer(result.length); + + OpenCL.getQueue() + .putWriteBuffer(bufInput, false); + CLKernel kernelForward = kernels.get("forward"); + kernelForward.rewind() + .putArg(inputWidth) + .putArg(inputHeight) + .putArg(outputWidth) + .putArg(ouptutHeight) + .putArg(size) + .putArg(stride) + .putArgs( + bufInput, + bufResult); + OpenCL.execute(kernelForward, + inputChannel * outputWidth * ouptutHeight); + OpenCL.getQueue().putReadBuffer(bufResult, true); + bufResult.getBuffer().get(result); + + bufInput.release(); + bufResult.release(); + } + + public void backword(int inputChannel, int inputWidth, int inputHeight, + int outputWidth, int outputHeight, + int size, int stride, + float[] input, float[] delta, float[] newDelta){ + if(prog == null){ + prog = OpenCL.compile("maxpooling.cl"); + kernels = prog.createCLKernels(); + } + + CLBuffer bufInput = OpenCL.createReadBuffer(input); + CLBuffer bufDelta = OpenCL.createReadBuffer(delta); + CLBuffer bufNewDelta = OpenCL.createReadWriteBuffer(newDelta); + + OpenCL.getQueue() + .putWriteBuffer(bufInput, false) + .putWriteBuffer(bufDelta, false) + .putWriteBuffer(bufNewDelta, false); + CLKernel kernelForward = kernels.get("backword"); + kernelForward.rewind() + .putArg(inputWidth) + .putArg(inputHeight) + .putArg(outputWidth) + .putArg(outputHeight) + .putArg(size) + .putArg(stride) + .putArgs( + bufInput, + bufDelta, + bufNewDelta); + OpenCL.execute(kernelForward, + inputChannel * inputWidth * inputHeight); + OpenCL.getQueue().putReadBuffer(bufNewDelta, true); + bufNewDelta.getBuffer().get(newDelta); + + bufInput.release(); + bufDelta.release(); + bufNewDelta.release(); + + } + + public static void main(String[] args) { + int inputChannel = 3; + int inputWidth = 32; + int inputHeight = 32; + int size = 11; + int stride = 2; + int outputWidth = inputWidth / stride; + int outputHeight = inputHeight / stride; + float[] input = new float[inputChannel * inputWidth * inputHeight]; + float[] result = new float[inputChannel * outputWidth * outputHeight]; + float[] newDelta = new float[input.length]; + float[] delta = new float[result.length]; + + INSTANCE.forward(inputChannel, inputWidth, inputHeight, outputWidth, outputHeight, size, stride, + input,result); + INSTANCE.backword(inputChannel, inputWidth, inputHeight, outputWidth, outputHeight, size, stride, + input, delta, newDelta); + } +} diff --git a/src/main/resources/kernels/maxpooling.cl b/src/main/resources/kernels/maxpooling.cl new file mode 100644 index 0000000..acd4de7 --- /dev/null +++ b/src/main/resources/kernels/maxpooling.cl @@ -0,0 +1,95 @@ +__kernel void forward( + int inputWidth, + int inputHeight, + int outputWidth, + int outputHeight, + int size, + int stride, + __global const float* data, + __global float* result, + int count +){ + int chxy = get_global_id(0); + if(chxy >= count){ + return; + } + + int ch = chxy / (outputWidth * outputHeight); + int x = (chxy % (outputWidth * outputHeight)) / outputHeight; + int y = chxy % outputHeight; + + float max = -INFINITY; + for (int i = 0; i < size; ++i) { + int xx = x * stride + i - size / 2; + if (xx < 0 || xx >= inputWidth) { + continue; + } + for (int j = 0; j < size; ++j) { + int yy = y * stride + j - size / 2; + if (yy < 0 || yy >= inputHeight) { + continue; + } + float d = data[ch * inputWidth * inputHeight + xx * inputHeight + yy]; + if (max < d) { + max = d; + } + } + } + result[chxy] = max; + +} + +__kernel void backword( + int inputWidth, + int inputHeight, + int outputWidth, + int outputHeight, + int size, + int stride, + __global const float* input, + __global const float* delta, + __global float* newDelta, + int count +){ + int chxy = get_global_id(0); + if(chxy >= count){ + return; + } + + int ch = chxy / (inputWidth * inputHeight); + int xi = (chxy % (inputWidth * inputHeight)) / inputHeight; + int yi = chxy % inputHeight; + + float nd = 0; + for(int x = max(0, (xi - size / 2) / stride - 1); + x < min(outputWidth, (xi + size / 2) / stride + 1); ++x){ + for(int y = max(0, (yi - size / 2) / stride - 1); + y < min(outputHeight, (yi + size / 2) / stride + 1); ++y){ + float max = -INFINITY; + int maxX = 0; + int maxY = 0; + for (int i = 0; i < size; ++i) { + int xx = x * stride + i - size / 2; + if (xx < 0 || xx >= inputWidth) { + continue; + } + for (int j = 0; j < size; ++j) { + int yy = y * stride + j - size / 2; + if (yy < 0 || yy >= inputHeight) { + continue; + } + float d = input[ch * inputWidth * inputHeight + xx * inputWidth + yy]; + if (max < d) { + max = d; + maxX = xx; + maxY = yy; + } + } + } + if(maxX == xi && maxY == yi){ + nd += delta[ch * outputWidth * outputHeight + x * outputHeight + y]; + } + } + } + newDelta[chxy] = nd; +} \ No newline at end of file From ae91a1f55e1b699d3af821e51cbb4ce7d7d7fba2 Mon Sep 17 00:00:00 2001 From: kishida Date: Tue, 22 Sep 2015 18:56:56 +0900 Subject: [PATCH 09/22] =?UTF-8?q?MultiNormalize=E3=81=AEOpenCL=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kishida/cnn/layers/MaxPoolingLayer.java | 2 +- .../cnn/layers/MultiNormalizeLayer.java | 79 +++++++++-------- .../kishida/cnn/opencl/MultiNormalizeCL.java | 84 +++++++++++++++++++ src/main/resources/kernels/multi_normalize.cl | 76 +++++++++++++++++ 4 files changed, 203 insertions(+), 38 deletions(-) create mode 100644 src/main/java/kishida/cnn/opencl/MultiNormalizeCL.java create mode 100644 src/main/resources/kernels/multi_normalize.cl diff --git a/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java b/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java index a3641c4..1d94b5b 100644 --- a/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java +++ b/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java @@ -48,7 +48,7 @@ public final void setPreLayer(NeuralLayer preLayer) { /** プーリング(max) */ @Override public float[] forward(float[] data) { - if(true){ + if(false){ MaxPoolingCL.INSTANCE.forward(inputChannels, inputWidth, inputHeight, outputWidth, outputHeight, size, stride, data, result); }else{ diff --git a/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java b/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java index 50681b6..957e863 100644 --- a/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java +++ b/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java @@ -8,6 +8,7 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import java.util.stream.IntStream; +import kishida.cnn.opencl.MultiNormalizeCL; import lombok.Getter; /** @@ -45,52 +46,56 @@ public final void setPreLayer(NeuralLayer preLayer) { @Override public float[] forward(float[] in) { - IntStream.range(0, inputWidth).parallel().forEach(x -> { - for(int y = 0; y < inputHeight; ++y){ - float total = 0; - int count = 0; - for(int i = 0; i < size; ++i){ - int xx = x + i - size / 2; - if(xx < 0 || xx >= inputWidth){ - continue; - } - for(int j = 0; j < size; ++j){ - int yy = y + j - size / 2; - if(yy < 0 || yy >= inputHeight){ + if(false){ + MultiNormalizeCL.INSTANCE.normalize(inputChannels, inputWidth, inputHeight, + size, threshold, in, result); + } else{ + IntStream.range(0, inputWidth).parallel().forEach(x -> { + for(int y = 0; y < inputHeight; ++y){ + float total = 0; + int count = 0; + for(int i = 0; i < size; ++i){ + int xx = x + i - size / 2; + if(xx < 0 || xx >= inputWidth){ continue; } - for(int ch = 0; ch < inputChannels; ++ch){ - total += in[ch * inputHeight * inputWidth + xx * inputHeight + yy]; - ++count; + for(int j = 0; j < size; ++j){ + int yy = y + j - size / 2; + if(yy < 0 || yy >= inputHeight){ + continue; + } + for(int ch = 0; ch < inputChannels; ++ch){ + total += in[ch * inputHeight * inputWidth + xx * inputHeight + yy]; + ++count; + } } } - } - float average = total / count; - float variance = 0; - for(int i = 0; i < size; ++i){ - int xx = x + i - size / 2; - if(xx < 0 || xx >= inputWidth){ - continue; - } - for(int j = 0; j < size; ++j){ - int yy = y + j - size / 2; - if(yy < 0 || yy >= inputHeight){ + float average = total / count; + float variance = 0; + for(int i = 0; i < size; ++i){ + int xx = x + i - size / 2; + if(xx < 0 || xx >= inputWidth){ continue; } - for(int ch = 0; ch < inputChannels; ++ch){ - float data = in[ch * inputHeight * inputWidth + xx * inputHeight + yy]; - variance += (data - average) * (data - average); + for(int j = 0; j < size; ++j){ + int yy = y + j - size / 2; + if(yy < 0 || yy >= inputHeight){ + continue; + } + for(int ch = 0; ch < inputChannels; ++ch){ + float data = in[ch * inputHeight * inputWidth + xx * inputHeight + yy]; + variance += (data - average) * (data - average); + } } } + float std = Math.max(threshold, (float)Math.sqrt(variance / count)); + for(int ch = 0; ch < inputChannels; ++ch){ + int pos = ch * inputHeight * inputWidth + x * inputHeight + y; + result[pos] = (in[pos] - average) / std; + } } - float std = Math.max(threshold, (float)Math.sqrt(variance / count)); - for(int ch = 0; ch < inputChannels; ++ch){ - int pos = ch * inputHeight * inputWidth + x * inputHeight + y; - result[pos] = (in[pos] - average) / std; - } - } - }); - + }); + } return result; } diff --git a/src/main/java/kishida/cnn/opencl/MultiNormalizeCL.java b/src/main/java/kishida/cnn/opencl/MultiNormalizeCL.java new file mode 100644 index 0000000..47cee35 --- /dev/null +++ b/src/main/java/kishida/cnn/opencl/MultiNormalizeCL.java @@ -0,0 +1,84 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package kishida.cnn.opencl; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLKernel; +import com.jogamp.opencl.CLProgram; +import java.nio.FloatBuffer; +import java.util.Map; + +/** + * + * @author naoki + */ +public class MultiNormalizeCL { + public static MultiNormalizeCL INSTANCE = new MultiNormalizeCL(); + + CLProgram prog; + Map kernels; + + public void normalize(int inputChannels, int inputWidth, int inputHeight, + int size, float threshold, + float[] input, float[] result){ + if(prog == null){ + prog = OpenCL.compile("multi_normalize.cl"); + kernels = prog.createCLKernels(); + } + + CLBuffer bufInput = OpenCL.createReadBuffer(input); + CLBuffer bufAverages = OpenCL.createReadWriteBuffer(inputWidth * inputHeight); + CLBuffer bufStds = OpenCL.createReadWriteBuffer(inputWidth * inputHeight); + CLBuffer bufResult = OpenCL.createWriteBuffer(result.length); + + OpenCL.getQueue().putWriteBuffer(bufInput, false); + CLKernel kernelAverage = kernels.get("average"); + kernelAverage.rewind() + .putArg(inputChannels) + .putArg(inputWidth) + .putArg(inputHeight) + .putArg(size) + .putArg(threshold) + .putArgs( + bufInput, + bufAverages, + bufStds); + OpenCL.execute(kernelAverage, inputWidth * inputHeight); + + CLKernel kernelForward = kernels.get("forward"); + kernelForward.rewind() + .putArg(inputChannels) + .putArg(inputWidth) + .putArg(inputHeight) + .putArgs( + bufInput, + bufAverages, + bufStds, + bufResult); + OpenCL.execute(kernelForward, inputChannels * inputWidth * inputHeight); + + OpenCL.getQueue().putReadBuffer(bufResult, true); + + bufResult.getBuffer().get(result); + + bufInput.release(); + bufAverages.release(); + bufStds.release(); + bufResult.release(); + + } + + public static void main(String[] args) { + int inputChannels = 3; + int inputWidth = 32; + int inputHeight = 32; + int size = 5; + float threshold = 0.00001f; + float[] input = new float[inputChannels * inputWidth * inputHeight]; + float[] result = new float[inputChannels * inputWidth * inputHeight]; + new MultiNormalizeCL().normalize(inputChannels, inputWidth, inputHeight, size, threshold, input, result); + } +} diff --git a/src/main/resources/kernels/multi_normalize.cl b/src/main/resources/kernels/multi_normalize.cl new file mode 100644 index 0000000..ad5b87e --- /dev/null +++ b/src/main/resources/kernels/multi_normalize.cl @@ -0,0 +1,76 @@ +__kernel void average( + int inputChannels, + int inputWidth, + int inputHeight, + int size, + float threshold, + __global const float* input, + __global float* averages, + __global float* stds, + int len +){ + int xy = get_global_id(0); + if(xy >= len){ + return; + } + int x = xy / inputHeight; + int y = xy % inputHeight; + + float total = 0; + int count = 0; + for(int i = 0; i < size; ++i){ + int xx = x + i - size / 2; + if(xx < 0 || xx >= inputWidth){ + continue; + } + for(int j = 0; j < size; ++j){ + int yy = y + j - size / 2; + if(yy < 0 || yy >= inputHeight){ + continue; + } + for(int ch = 0; ch < inputChannels; ++ch){ + total += input[ch * inputHeight * inputWidth + xy]; + ++count; + } + } + } + float average = total / count; + float variance = 0; + for(int i = 0; i < size; ++i){ + int xx = x + i - size / 2; + if(xx < 0 || xx >= inputWidth){ + continue; + } + for(int j = 0; j < size; ++j){ + int yy = y + j - size / 2; + if(yy < 0 || yy >= inputHeight){ + continue; + } + for(int ch = 0; ch < inputChannels; ++ch){ + float data = input[ch * inputHeight * inputWidth + xy]; + variance += (data - average) * (data - average); + } + } + } + averages[xy] = average; + stds[xy] = max(threshold, sqrt(variance / count)); + +} + +__kernel void forward( + int inputChannels, + int inputWidth, + int inputHeight, + __global const float* input, + __global const float* averages, + __global const float* stds, + __global float* result, + int count +){ + int chxy = get_global_id(0); + if(chxy >= count){ + return; + } + int xy = chxy % (inputWidth * inputHeight); + result[chxy] = (input[chxy] - averages[xy]) / stds[xy]; +} \ No newline at end of file From 8761e82b8b68deb6f058012b28af4aebc9c58c85 Mon Sep 17 00:00:00 2001 From: kishida Date: Tue, 22 Sep 2015 22:19:49 +0900 Subject: [PATCH 10/22] =?UTF-8?q?=E7=95=B3=E8=BE=BC=E3=81=BF=E5=B1=A4?= =?UTF-8?q?=E3=81=AE=E3=83=95=E3=82=A3=E3=83=AB=E3=82=BF=E3=83=BB=E3=83=90?= =?UTF-8?q?=E3=82=A4=E3=82=A2=E3=82=B9=E3=82=92GPU=E3=83=A1=E3=83=A2?= =?UTF-8?q?=E3=83=AA=E3=81=AB=E7=BD=AE=E3=81=84=E3=81=9F=E3=81=BE=E3=81=BE?= =?UTF-8?q?=E5=87=A6=E7=90=86=E3=81=99=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/kishida/cnn/ConvolutionalNet.java | 8 +- .../kishida/cnn/layers/ConvolutionLayer.java | 97 +++++++++++-------- .../cnn/opencl/ConvolutionBackwordCL.java | 89 +++++++++++------ .../cnn/opencl/ConvolutionForwardCL.java | 75 ++++++++++---- 4 files changed, 177 insertions(+), 92 deletions(-) diff --git a/src/main/java/kishida/cnn/ConvolutionalNet.java b/src/main/java/kishida/cnn/ConvolutionalNet.java index 74c65ba..8c64a9a 100644 --- a/src/main/java/kishida/cnn/ConvolutionalNet.java +++ b/src/main/java/kishida/cnn/ConvolutionalNet.java @@ -59,10 +59,10 @@ public class ConvolutionalNet { private static final int MINI_BATCH = 128; private static final float MOMENTAM = 0.9f; public static final String AVERAGE_PNG = "average.png"; - //private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\alexnet.json.txt"; - //private static final String RESOURCE_NAME = "/alexnet_def.json"; - private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\tinynet.json.txt"; - private static final String RESOURCE_NAME = "/tinynet_def.json"; + private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\alexnet.json.txt"; + private static final String RESOURCE_NAME = "/alexnet_def.json"; + //private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\tinynet.json.txt"; + //private static final String RESOURCE_NAME = "/tinynet_def.json"; static class Img{ diff --git a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java index a1f00ab..6375755 100644 --- a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java +++ b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java @@ -9,7 +9,8 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; -import java.util.Arrays; +import com.jogamp.opencl.CLBuffer; +import java.nio.FloatBuffer; import java.util.DoubleSummaryStatistics; import java.util.stream.IntStream; import kishida.cnn.activation.ActivationFunction; @@ -22,6 +23,7 @@ import kishida.cnn.kernels.ConvolutionLocalNormalizationKernel; import kishida.cnn.opencl.ConvolutionBackwordCL; import kishida.cnn.opencl.ConvolutionForwardCL; +import kishida.cnn.opencl.OpenCL; import kishida.cnn.util.FloatUtil; import lombok.Getter; import lombok.Setter; @@ -30,10 +32,9 @@ public class ConvolutionLayer extends ImageNeuralLayer implements LerningLayer{ @JsonInclude(JsonInclude.Include.NON_NULL) - @Getter float[] filter; + @JsonInclude(JsonInclude.Include.NON_NULL) - @Getter float[] bias; @JsonInclude(JsonInclude.Include.NON_NULL) @Getter @@ -53,6 +54,9 @@ public class ConvolutionLayer extends ImageNeuralLayer implements LerningLayer{ float[] tempDelta; float[] newDelta; + CLBuffer bufFilter; + CLBuffer bufBias; + public ConvolutionLayer(String name, int filterCount, int size, int stride, float initBias, boolean useGpu) { this(name, size, filterCount, stride, null, null, initBias, null, null, useGpu); @@ -112,6 +116,30 @@ public final void setPreLayer(NeuralLayer preLayer) { this.result = new float[outputChannels * outputWidth * outputHeight]; this.tempDelta = new float[result.length]; this.newDelta = new float[inputChannels * inputWidth * inputHeight]; + + if(true){ + this.bufBias = OpenCL.createReadBuffer(bias); + this.bufFilter = OpenCL.createReadBuffer(filter); + OpenCL.getQueue() + .putWriteBuffer(bufFilter, false) + .putWriteBuffer(bufBias, false); + } + } + + public float[] getFilter() { + if(bufFilter != null){ + OpenCL.getQueue().putReadBuffer(bufFilter, true); + bufFilter.getBuffer().get(filter).rewind(); + } + return filter; + } + + public float[] getBias() { + if(bufBias != null){ + OpenCL.getQueue().putReadBuffer(bufBias, true); + bufBias.getBuffer().get(bias).rewind(); + } + return bias; } /** 畳み込みフィルタを適用する */ @@ -127,8 +155,13 @@ public float[] forward(float[] img) { outputChannels, outputWidth, outputHeight, false); } else{ // JOCL - result = ConvolutionForwardCL.INSTANCE.forward(img, inputChannels, inputWidth, inputHeight, - filter, outputChannels, outputWidth, outputHeight, result, filterSize, stride, bias); + if(true){ + ConvolutionForwardCL.INSTANCE.forward(img, inputChannels, inputWidth, inputHeight, + bufFilter, outputChannels, outputWidth, outputHeight, result, filterSize, stride, bufBias); + } else { + ConvolutionForwardCL.INSTANCE.forward(img, inputChannels, inputWidth, inputHeight, + filter, outputChannels, outputWidth, outputHeight, result, filterSize, stride, bias); + } } }else { //CPU @@ -141,33 +174,6 @@ public float[] forward(float[] img) { return result; } - private void localNormalization(float[] result){ - final int n = 5; - final int k = 2; - final float a = 0.0001f; - final float b = 0.75f; - // resultをコピーするほうが楽だけど、メモリを節約するため - final float[] sigma = new float[n]; - for(int x = 0; x < outputWidth; ++x){ - for(int y = 0; y < outputHeight; ++y){ - int xy = x * outputHeight + y; - Arrays.fill(sigma, 0); - int lp = 0; - for(; lp < n / 2; ++lp){ - sigma[lp] = result[lp * outputWidth * outputHeight + xy] * result[lp * outputWidth * outputHeight + xy]; - } - for(int ch = 0; ch < outputChannels; ++ch){ - sigma[lp % 5] = lp >= outputChannels ? 0 : - result[lp * outputWidth * outputHeight + xy] * result[lp * outputWidth * outputHeight + xy]; - lp = lp + 1; - float sum = FloatUtil.floatSum(sigma); - result[ch * outputWidth * outputHeight + xy] = result[ch * outputWidth * outputHeight + xy] / - (float)Math.pow(k + a * sum, b); - } - } - } - } - /** 畳み込み層の学習 */ @Override public float[] backward(float[] input, float[] delta) { @@ -195,15 +201,23 @@ public float[] backward(float[] input, float[] delta) { System.out.println("filter" + ConvolutionBackwordFilterKernel.INSTANCE.getExecutionMode()); System.out.println("bias" + ConvolutionBackwordBiasKernel.INSTANCE.getExecutionMode()); } - return newDelta; }else{ // JOCL - return ConvolutionBackwordCL.INSTANCE.backward( - delta, result, input, - inputChannels, inputWidth, inputHeight, - filter, outputChannels, outputWidth, outputHeight, - filterDelta, biasDelta, filterSize, stride, newDelta, parent.getLearningRate()); + if(true){ + ConvolutionBackwordCL.INSTANCE.backward( + delta, result, input, + inputChannels, inputWidth, inputHeight, + bufFilter, outputChannels, outputWidth, outputHeight, + filterDelta, biasDelta, filterSize, stride, newDelta, parent.getLearningRate()); + }else{ + ConvolutionBackwordCL.INSTANCE.backward( + delta, result, input, + inputChannels, inputWidth, inputHeight, + filter, outputChannels, outputWidth, outputHeight, + filterDelta, biasDelta, filterSize, stride, newDelta, parent.getLearningRate()); + } } + return newDelta; } else { // CPUバージョン return ConvolutionBackwordKernel.INSTANCE.backward(delta, result, @@ -227,6 +241,13 @@ public void joinBatch() { IntStream.range(0, filter.length).parallel().forEach(i -> filter[i] += filterDelta[i] / count - parent.getWeightDecay() * parent.getLearningRate() * filter[i]); IntStream.range(0, bias.length).parallel().forEach(i -> bias[i] += biasDelta[i] / count); + if(bufFilter != null){ + bufFilter.getBuffer().put(filter).rewind(); + bufBias.getBuffer().put(bias).rewind(); + OpenCL.getQueue() + .putWriteBuffer(bufFilter, false) + .putWriteBuffer(bufBias, false); + } } @Override diff --git a/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java index 99ce5a0..404e0bd 100644 --- a/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java +++ b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java @@ -9,6 +9,7 @@ import com.jogamp.opencl.CLKernel; import com.jogamp.opencl.CLProgram; import java.nio.FloatBuffer; +import java.util.Map; /** * @@ -17,25 +18,38 @@ public class ConvolutionBackwordCL { public static ConvolutionBackwordCL INSTANCE = new ConvolutionBackwordCL(); CLProgram prog; + Map kernels; private ConvolutionBackwordCL() { } - public float[] backward(float[] delta, float[] result, + public void backward(float[] delta, float[] result, float[] input, int inputChannels, int inputWidth, int inputHeight, float[] filter, int outputChannels, int outputWidth, int outputHeight, float[] filterDelta, float[] biasDelta, int filterSize, int stride, float[] newDelta, float learningRate) { - if(prog == null){ - prog = OpenCL.compile("convolution_backword.cl"); - } + CLBuffer bufFilter = OpenCL.createReadBuffer(filter); + backward(delta, result, + input, inputChannels, inputWidth, inputHeight, + bufFilter, outputChannels, outputWidth, outputHeight, + filterDelta, biasDelta, + filterSize, stride, newDelta, learningRate); + + OpenCL.getQueue() + .putWriteBuffer(bufFilter, false); + + bufFilter.release(); + } + public void backward(float[] delta, float[] result, + float[] input, int inputChannels, int inputWidth, int inputHeight, + CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight, + float[] filterDelta, float[] biasDelta, + int filterSize, int stride, float[] newDelta, float learningRate) { CLBuffer bufDelta = OpenCL.createReadBuffer(delta); - CLBuffer bufFilter = OpenCL.createReadBuffer(filter); CLBuffer bufResult = OpenCL.createReadBuffer(result); CLBuffer bufInput = OpenCL.createReadBuffer(input); CLBuffer bufFilterDelta = OpenCL.createReadWriteBuffer(filterDelta); - CLBuffer bufTempBias = OpenCL.createReadWriteBuffer(result.length); CLBuffer bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); CLBuffer bufNewDelta = OpenCL.createWriteBuffer(newDelta.length); OpenCL.getQueue() @@ -44,11 +58,44 @@ public float[] backward(float[] delta, float[] result, .putWriteBuffer(bufResult, false) .putWriteBuffer(bufInput, false) .putWriteBuffer(bufFilterDelta, false) - .putWriteBuffer(bufTempBias, false) .putWriteBuffer(bufBiasDelta, false); + backward(bufDelta, bufResult, + bufInput, inputChannels, inputWidth, inputHeight, + bufFilter, outputChannels, outputWidth, outputHeight, + bufFilterDelta, bufBiasDelta, + filterSize, stride, bufNewDelta, learningRate); + + OpenCL.getQueue() + .putReadBuffer(bufBiasDelta, true) + .putReadBuffer(bufFilterDelta, true) + .putReadBuffer(bufNewDelta, true); + bufNewDelta.getBuffer().get(newDelta); + bufFilterDelta.getBuffer().get(filterDelta); + bufBiasDelta.getBuffer().get(biasDelta); + + bufDelta.release(); + bufResult.release(); + bufInput.release(); + bufFilterDelta.release(); + bufBiasDelta.release(); + bufNewDelta.release(); + } + public void backward(CLBuffer bufDelta, CLBuffer bufResult, + CLBuffer bufInput, int inputChannels, int inputWidth, int inputHeight, + CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight, + CLBuffer bufFilterDelta, CLBuffer bufBiasDelta, + int filterSize, int stride, CLBuffer bufNewDelta, float learningRate) { + if(prog == null){ + prog = OpenCL.compile("convolution_backword.cl"); + kernels = prog.createCLKernels(); + } + + CLBuffer bufTempBias = OpenCL.createReadWriteBuffer(outputChannels * outputWidth * outputHeight); + CLKernel deltaKernel = prog.createCLKernel("delta"); deltaKernel + .rewind() .putArg(inputWidth) .putArg(inputHeight) .putArg(filterSize) @@ -64,10 +111,10 @@ public float[] backward(float[] delta, float[] result, .putArg(bufNewDelta); OpenCL.execute(deltaKernel, inputChannels * inputWidth * inputHeight); - deltaKernel.release(); - CLKernel filterKernel = prog.createCLKernel("filter"); + CLKernel filterKernel = kernels.get("filter"); filterKernel + .rewind() .putArg(inputChannels) .putArg(filterSize) .putArg(outputWidth) @@ -84,10 +131,10 @@ public float[] backward(float[] delta, float[] result, bufFilterDelta); OpenCL.execute(filterKernel, outputChannels * inputChannels * filterSize * filterSize); - filterKernel.release(); - CLKernel biasKernel = prog.createCLKernel("bias"); + CLKernel biasKernel = kernels.get("bias"); biasKernel + .rewind() .putArgs( bufResult, bufDelta, @@ -95,35 +142,19 @@ public float[] backward(float[] delta, float[] result, .putArg(learningRate); OpenCL.execute(biasKernel, outputChannels * outputWidth * outputHeight); - biasKernel.release(); - CLKernel biasAfterKernel = prog.createCLKernel("biasAfter"); + CLKernel biasAfterKernel = kernels.get("biasAfter"); biasAfterKernel + .rewind() .putArg(outputWidth) .putArg(outputHeight) .putArgs( bufTempBias, bufBiasDelta); OpenCL.execute(biasAfterKernel, outputChannels); - biasAfterKernel.release(); - OpenCL.getQueue() - .putReadBuffer(bufBiasDelta, true) - .putReadBuffer(bufFilterDelta, true) - .putReadBuffer(bufNewDelta, true); - bufNewDelta.getBuffer().get(newDelta); - bufFilterDelta.getBuffer().get(filterDelta); - bufBiasDelta.getBuffer().get(biasDelta); - bufDelta.release(); - bufFilter.release(); - bufResult.release(); - bufInput.release(); - bufFilterDelta.release(); bufTempBias.release(); - bufBiasDelta.release(); - bufNewDelta.release(); - return newDelta; } public static void main(String[] args) { diff --git a/src/main/java/kishida/cnn/opencl/ConvolutionForwardCL.java b/src/main/java/kishida/cnn/opencl/ConvolutionForwardCL.java index 28bb85e..3ecc145 100644 --- a/src/main/java/kishida/cnn/opencl/ConvolutionForwardCL.java +++ b/src/main/java/kishida/cnn/opencl/ConvolutionForwardCL.java @@ -17,28 +17,72 @@ public class ConvolutionForwardCL { public static ConvolutionForwardCL INSTANCE = new ConvolutionForwardCL(); CLProgram prog; + CLKernel forwardKernel; + CLKernel normalizeKernel; + private ConvolutionForwardCL() { } - public float[] forward(float[] input, int inputChannels, int inputWidth, int inputHeight, + /** + * バッファを外部にもたない + */ + public void forward(float[] input, int inputChannels, int inputWidth, int inputHeight, float[] filter, int outputChannels, int outputWidth, int outputHeight, float[] result, int filterSize, int stride, float[] bias){ - if(prog == null){ - prog = OpenCL.compile("convolution_forward.cl"); - } - CLBuffer bufInput = OpenCL.createReadBuffer(input); CLBuffer bufFilter = OpenCL.createReadBuffer(filter); - CLBuffer bufResult = OpenCL.createReadWriteBuffer(result); CLBuffer bufBias = OpenCL.createReadBuffer(bias); OpenCL.getQueue() - .putWriteBuffer(bufInput, false) .putWriteBuffer(bufFilter, false) .putWriteBuffer(bufBias, false); - CLKernel forwardKernel = prog.createCLKernel("forward"); + forward(input, inputChannels, inputWidth, inputHeight, + bufFilter, outputChannels, outputWidth, outputHeight, result, + filterSize, stride, bufBias); + + bufBias.release(); + bufFilter.release(); + } + + /** + * filterとbiasは外部管理 + */ + public void forward(float[] input, + int inputChannels, int inputWidth, int inputHeight, + CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight, + float[] result, + int filterSize, int stride, CLBuffer bufBias){ + + CLBuffer bufInput = OpenCL.createReadBuffer(input); + CLBuffer bufResult = OpenCL.createReadWriteBuffer(result); + OpenCL.getQueue() + .putWriteBuffer(bufInput, false); + + forward(bufInput, inputChannels, inputWidth, inputHeight, + bufFilter, outputChannels, outputWidth, outputHeight, bufResult, + filterSize, stride, bufBias); + + OpenCL.getQueue() + .putReadBuffer(bufResult, true); + bufResult.getBuffer().get(result); + + bufResult.release(); + bufInput.release(); + + } + public void forward(CLBuffer bufInput, + int inputChannels, int inputWidth, int inputHeight, + CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight, + CLBuffer bufResult, + int filterSize, int stride, CLBuffer bufBias){ + if(prog == null){ + prog = OpenCL.compile("convolution_forward.cl"); + forwardKernel = prog.createCLKernel("forward"); + } + forwardKernel + .rewind() .putArg(outputHeight) .putArg(outputWidth) .putArg(inputChannels) @@ -53,28 +97,17 @@ public float[] forward(float[] input, int inputChannels, int inputWidth, int inp bufBias); OpenCL.execute(forwardKernel, outputChannels * outputWidth * outputHeight); - forwardKernel.release(); - CLKernel normalizeKernel = prog.createCLKernel("localNormalize"); + normalizeKernel = prog.createCLKernel("localNormalize"); normalizeKernel + .rewind() .putArg(outputWidth) .putArg(outputHeight) .putArg(outputChannels) .putArg(bufResult); OpenCL.execute(normalizeKernel, outputChannels * outputWidth * outputHeight); - normalizeKernel.release(); - - OpenCL.getQueue() - .putReadBuffer(bufResult, true); - bufResult.getBuffer().get(result); - - bufBias.release(); - bufResult.release(); - bufInput.release(); - bufFilter.release(); - return result; } } From 0bc644ebc71efc72e3a90290f30501ddff36ebbf Mon Sep 17 00:00:00 2001 From: kishida Date: Tue, 22 Sep 2015 23:52:02 +0900 Subject: [PATCH 11/22] =?UTF-8?q?=E5=85=A8=E7=B5=90=E5=90=88=E5=B1=A4?= =?UTF-8?q?=E3=81=AE=E3=83=95=E3=82=A3=E3=83=AB=E3=82=BF=E3=83=BB=E3=83=90?= =?UTF-8?q?=E3=82=A4=E3=82=A2=E3=82=B9=E3=82=92GPU=E3=83=A1=E3=83=A2?= =?UTF-8?q?=E3=83=AA=E3=81=AB=E7=BD=AE=E3=81=84=E3=81=9F=E3=81=BE=E3=81=BE?= =?UTF-8?q?=E5=87=A6=E7=90=86=E3=81=A7=E3=81=8D=E3=82=8B=E3=82=88=E3=81=86?= =?UTF-8?q?=E3=81=AB=E3=81=99=E3=82=8B=E3=80=82=E3=81=91=E3=81=A9=E9=81=85?= =?UTF-8?q?=E3=81=84=E3=80=82nio.Buffer=E3=81=8C=E9=81=85=E3=81=9D?= =?UTF-8?q?=E3=81=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/kishida/cnn/ConvolutionalNet.java | 8 +- .../java/kishida/cnn/layers/FullyConnect.java | 44 +++++++-- .../kishida/cnn/opencl/FullyBackwordCL.java | 90 +++++++++++++------ .../kishida/cnn/opencl/FullyForwardCL.java | 56 ++++++++---- 4 files changed, 142 insertions(+), 56 deletions(-) diff --git a/src/main/java/kishida/cnn/ConvolutionalNet.java b/src/main/java/kishida/cnn/ConvolutionalNet.java index 8c64a9a..74c65ba 100644 --- a/src/main/java/kishida/cnn/ConvolutionalNet.java +++ b/src/main/java/kishida/cnn/ConvolutionalNet.java @@ -59,10 +59,10 @@ public class ConvolutionalNet { private static final int MINI_BATCH = 128; private static final float MOMENTAM = 0.9f; public static final String AVERAGE_PNG = "average.png"; - private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\alexnet.json.txt"; - private static final String RESOURCE_NAME = "/alexnet_def.json"; - //private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\tinynet.json.txt"; - //private static final String RESOURCE_NAME = "/tinynet_def.json"; + //private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\alexnet.json.txt"; + //private static final String RESOURCE_NAME = "/alexnet_def.json"; + private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\tinynet.json.txt"; + private static final String RESOURCE_NAME = "/tinynet_def.json"; static class Img{ diff --git a/src/main/java/kishida/cnn/layers/FullyConnect.java b/src/main/java/kishida/cnn/layers/FullyConnect.java index 224dbb9..a4b66b2 100644 --- a/src/main/java/kishida/cnn/layers/FullyConnect.java +++ b/src/main/java/kishida/cnn/layers/FullyConnect.java @@ -8,12 +8,15 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; +import com.jogamp.opencl.CLBuffer; +import java.nio.FloatBuffer; import java.util.DoubleSummaryStatistics; import java.util.stream.IntStream; import kishida.cnn.activation.ActivationFunction; import kishida.cnn.kernels.FullyForwardKernel; import kishida.cnn.opencl.FullyBackwordCL; import kishida.cnn.opencl.FullyForwardCL; +import kishida.cnn.opencl.OpenCL; import kishida.cnn.util.FloatUtil; import lombok.Getter; import lombok.Setter; @@ -24,10 +27,8 @@ */ public class FullyConnect extends NeuralLayer implements LerningLayer{ @JsonInclude(JsonInclude.Include.NON_NULL) - @Getter private float[]weight; @JsonInclude(JsonInclude.Include.NON_NULL) - @Getter private float[] bias; @JsonInclude(JsonInclude.Include.NON_NULL) @Getter @@ -51,6 +52,9 @@ public class FullyConnect extends NeuralLayer implements LerningLayer{ @Getter private float initBias; + CLBuffer bufWeight; + CLBuffer bufBias; + public FullyConnect(String name, int outputSize, float initBias, float dropoutRate, ActivationFunction activation, boolean useGpu) { this(name, outputSize, null, null, initBias, null, null, dropoutRate, null, activation, useGpu); } @@ -113,6 +117,29 @@ public final void setPreLayer(NeuralLayer preLayer) { if(biasDelta == null){ this.biasDelta = new float[outputSize]; } + if(useGpu){ + bufWeight = OpenCL.createReadWriteBuffer(weight); + bufBias = OpenCL.createReadWriteBuffer(bias); + OpenCL.getQueue() + .putWriteBuffer(bufWeight, false) + .putWriteBuffer(bufBias, false); + } + } + + public float[] getWeight() { + if(bufWeight != null){ + OpenCL.getQueue().putReadBuffer(bufWeight, true); + bufWeight.getBuffer().get(weight).rewind(); + } + return weight; + } + + public float[] getBias() { + if(bufBias != null){ + OpenCL.getQueue().putReadBuffer(bufBias, true); + bufBias.getBuffer().get(bias).rewind(); + } + return bias; } @JsonProperty("activationObj") @@ -138,7 +165,7 @@ public float[] forward(float[] in) { FullyForwardKernel.INSTANCE.forward(outputSize, dropout, in, result, weight, bias, useGpu); activation.applyAfter(result); }else{ - FullyForwardCL.INSTANCE.forward(inputSize, outputSize, dropout, in, weight, bias, result, activation); + FullyForwardCL.INSTANCE.forward(inputSize, outputSize, dropout, in, bufWeight, bufBias, result, activation); } }else{ FullyForwardKernel.INSTANCE.forward(outputSize, dropout, in, result, weight, bias, useGpu); @@ -149,9 +176,9 @@ public float[] forward(float[] in) { @Override public float[] backward(float[] in, float[] delta) { - if(false){ + if(useGpu && false){ FullyBackwordCL.INSTANCE.backword(inputSize, outputSize, - dropout, in, delta, result, weight, weightDelta, biasDelta, newDelta, + dropout, in, delta, result, bufWeight, weightDelta, biasDelta, newDelta, parent.getLearningRate(), activation); }else{ for(int i = 0; i < result.length; ++i){ @@ -192,6 +219,13 @@ public void joinBatch() { IntStream.range(0, bias.length).parallel().forEach(i -> { bias[i] += biasDelta[i] / parent.getMiniBatch(); }); + if(bufWeight != null){ + bufWeight.getBuffer().put(weight).rewind(); + bufBias.getBuffer().put(bias).rewind(); + OpenCL.getQueue() + .putWriteBuffer(bufWeight, false) + .putWriteBuffer(bufBias, false); + } } @Override diff --git a/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java b/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java index c6d6c4e..ee4d0ce 100644 --- a/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java +++ b/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java @@ -34,34 +34,81 @@ public void backword(int inputSize, int outputSize, float[] weightDelta, float[] biasDelta, float[] newDelta, float learningRate, ActivationFunction activation){ - if(prog == null){ - prog = OpenCL.compile("fully_backword.cl"); - kernels = prog.createCLKernels(); - } - if(progActivation == null){ - progActivation = OpenCL.compile("activation.cl"); - actKernels = progActivation.createCLKernels(); - } + CLBuffer bufWeight = OpenCL.createReadBuffer(weight); + OpenCL.getQueue() + .putWriteBuffer(bufWeight ,false); - CLBuffer bufDropout = OpenCL.createReadBuffer(dropout); + backword(inputSize, outputSize, + dropout, input, delta, + result, bufWeight, weightDelta, biasDelta, + newDelta, + learningRate, activation); + + bufWeight .release(); + } + + public void backword(int inputSize, int outputSize, + int[] dropout, float[] input, float[] delta, + float[] result, CLBuffer bufWeight, + float[] weightDelta, float[] biasDelta, + float[] newDelta, + float learningRate, ActivationFunction activation){ CLBuffer bufInput = OpenCL.createReadBuffer(input); CLBuffer bufDelta = OpenCL.createReadBuffer(delta); CLBuffer bufResult = OpenCL.createReadBuffer(result); - CLBuffer bufWeight = OpenCL.createReadBuffer(weight); CLBuffer bufNewDelta = OpenCL.createWriteBuffer(newDelta.length); CLBuffer bufWeightDelta = OpenCL.createReadWriteBuffer(weightDelta); CLBuffer bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); - CLBuffer bufDiffed = OpenCL.createReadWriteBuffer(result.length); - OpenCL.getQueue() - .putWriteBuffer(bufDropout ,false) .putWriteBuffer(bufInput ,false) .putWriteBuffer(bufDelta ,false) .putWriteBuffer(bufResult ,false) - .putWriteBuffer(bufWeight ,false) .putWriteBuffer(bufWeightDelta ,false) .putWriteBuffer(bufBiasDelta ,false); + backword(inputSize, outputSize, + dropout, bufInput, bufDelta, + bufResult, bufWeight, bufWeightDelta, bufBiasDelta, + bufNewDelta, + learningRate, activation); + + OpenCL.getQueue() + .putReadBuffer(bufNewDelta ,false) + .putReadBuffer(bufBiasDelta ,false) + .putReadBuffer(bufWeightDelta ,true); + bufNewDelta.getBuffer().get(newDelta); + bufBiasDelta.getBuffer().get(biasDelta); + bufWeightDelta.getBuffer().get(weightDelta); + + bufInput .release(); + bufDelta .release(); + bufResult .release(); + bufNewDelta .release(); + bufWeightDelta .release(); + bufBiasDelta .release(); + + } + public void backword(int inputSize, int outputSize, + int[] dropout, CLBuffer bufInput, CLBuffer bufDelta, + CLBuffer bufResult, CLBuffer bufWeight, + CLBuffer bufWeightDelta, CLBuffer bufBiasDelta, + CLBuffer bufNewDelta, + float learningRate, ActivationFunction activation){ + if(prog == null){ + prog = OpenCL.compile("fully_backword.cl"); + kernels = prog.createCLKernels(); + } + if(progActivation == null){ + progActivation = OpenCL.compile("activation.cl"); + actKernels = progActivation.createCLKernels(); + } + + CLBuffer bufDropout = OpenCL.createReadBuffer(dropout); + CLBuffer bufDiffed = OpenCL.createReadWriteBuffer(outputSize); + + OpenCL.getQueue() + .putWriteBuffer(bufDropout ,false); + CLKernel actKernel = actKernels.get(activation.getName() + "_diff"); actKernel.rewind() .putArg(bufResult) @@ -102,22 +149,7 @@ public void backword(int inputSize, int outputSize, .putArg(bufBiasDelta); OpenCL.execute(kernelBias, outputSize); - OpenCL.getQueue() - .putReadBuffer(bufNewDelta ,false) - .putReadBuffer(bufBiasDelta ,false) - .putReadBuffer(bufWeightDelta ,true); - bufNewDelta.getBuffer().get(newDelta); - bufBiasDelta.getBuffer().get(biasDelta); - bufWeightDelta.getBuffer().get(weightDelta); - bufDropout .release(); - bufInput .release(); - bufDelta .release(); - bufResult .release(); - bufWeight .release(); - bufNewDelta .release(); - bufWeightDelta .release(); - bufBiasDelta .release(); bufDiffed .release(); } diff --git a/src/main/java/kishida/cnn/opencl/FullyForwardCL.java b/src/main/java/kishida/cnn/opencl/FullyForwardCL.java index e27010d..fedc596 100644 --- a/src/main/java/kishida/cnn/opencl/FullyForwardCL.java +++ b/src/main/java/kishida/cnn/opencl/FullyForwardCL.java @@ -30,6 +30,41 @@ public FullyForwardCL() { public void forward(int inputSize, int outputSize, int[] dropout, float[] input, float[] weight, float[] bias, float[] result, ActivationFunction activation){ + CLBuffer bufWeight = OpenCL.createReadBuffer(weight); + CLBuffer bufBias = OpenCL.createReadBuffer(bias); + + forward(inputSize, outputSize, dropout, input, bufWeight, bufBias, result, activation); + + OpenCL.getQueue() + .putWriteBuffer(bufWeight, false) + .putWriteBuffer(bufBias, false); + bufWeight.release(); + bufBias.release(); + + } + public void forward(int inputSize, int outputSize, int[] dropout, + float[] input, CLBuffer bufWeight, + CLBuffer bufBias, float[] result, + ActivationFunction activation){ + CLBuffer bufInput = OpenCL.createReadBuffer(input); + CLBuffer bufResult = OpenCL.createReadWriteBuffer(result.length); + + OpenCL.getQueue() + .putWriteBuffer(bufInput, false); + + forward(inputSize, outputSize, dropout, bufInput, bufWeight, bufBias, bufResult, activation); + + OpenCL.getQueue().putReadBuffer(bufResult, true); + bufResult.getBuffer().get(result); + + bufInput.release(); + bufResult.release(); + + } + public void forward(int inputSize, int outputSize, int[] dropout, + CLBuffer bufInput, CLBuffer bufWeight, + CLBuffer bufBias, CLBuffer bufResult, + ActivationFunction activation){ if(progFully == null){ progFully = OpenCL.compile("fully_forward.cl"); forwardKernel = progFully.createCLKernel("forward"); @@ -40,19 +75,11 @@ public void forward(int inputSize, int outputSize, int[] dropout, } CLBuffer bufDropout = OpenCL.createReadBuffer(dropout); - CLBuffer bufInput = OpenCL.createReadBuffer(input); - CLBuffer bufWeight = OpenCL.createReadBuffer(weight); - CLBuffer bufBias = OpenCL.createReadBuffer(bias); - CLBuffer bufResult = OpenCL.createReadWriteBuffer(result.length); - OpenCL.getQueue() - .putWriteBuffer(bufDropout, false) - .putWriteBuffer(bufInput, false) - .putWriteBuffer(bufWeight, false) - .putWriteBuffer(bufBias, false); + .putWriteBuffer(bufDropout, false); forwardKernel.rewind() - .putArg(input.length) + .putArg(inputSize) .putArg(outputSize) .putArgs( bufDropout, @@ -63,7 +90,7 @@ public void forward(int inputSize, int outputSize, int[] dropout, OpenCL.execute(forwardKernel, outputSize); if(activation instanceof SoftMaxFunction){ - CLBuffer bufExped = OpenCL.createReadWriteBuffer(result.length); + CLBuffer bufExped = OpenCL.createReadWriteBuffer(outputSize); CLKernel kernelActPre = actKernels.get("softmax_before"); kernelActPre.rewind() .putArg(bufResult) @@ -85,14 +112,7 @@ public void forward(int inputSize, int outputSize, int[] dropout, OpenCL.execute(kernelAct, outputSize); } - OpenCL.getQueue().putReadBuffer(bufResult, true); - bufResult.getBuffer().get(result); - bufDropout.release(); - bufInput.release(); - bufWeight.release(); - bufBias.release(); - bufResult.release(); } } From e8ae1645d00a7792a8cb51163b3f0b83a710d367 Mon Sep 17 00:00:00 2001 From: kishida Date: Wed, 23 Sep 2015 01:39:37 +0900 Subject: [PATCH 12/22] =?UTF-8?q?=E7=95=B3=E8=BE=BC=E3=81=BF=E5=B1=A4?= =?UTF-8?q?=E3=81=AE=E3=83=95=E3=82=A3=E3=83=AB=E3=82=BF=E6=9B=B4=E6=96=B0?= =?UTF-8?q?=E3=82=92GPU=E5=AF=BE=E5=BF=9C=E3=81=AB=E3=80=820.5=E5=89=B2?= =?UTF-8?q?=E3=81=AF=E3=82=84=E3=81=8F=E3=81=AA=E3=81=A3=E3=81=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kishida/cnn/layers/ConvolutionLayer.java | 63 +++++++++++++---- .../cnn/opencl/ConvolutionBackwordCL.java | 70 +++++++++++++++---- .../resources/kernels/convolution_backword.cl | 43 ++++++++++++ 3 files changed, 147 insertions(+), 29 deletions(-) diff --git a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java index 6375755..f1f83ed 100644 --- a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java +++ b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java @@ -37,10 +37,8 @@ public class ConvolutionLayer extends ImageNeuralLayer implements LerningLayer{ @JsonInclude(JsonInclude.Include.NON_NULL) float[] bias; @JsonInclude(JsonInclude.Include.NON_NULL) - @Getter float[] filterDelta; @JsonInclude(JsonInclude.Include.NON_NULL) - @Getter float[] biasDelta; @Getter int stride; @@ -56,6 +54,8 @@ public class ConvolutionLayer extends ImageNeuralLayer implements LerningLayer{ CLBuffer bufFilter; CLBuffer bufBias; + CLBuffer bufFilterDelta; + CLBuffer bufBiasDelta; public ConvolutionLayer(String name, int filterCount, int size, int stride, float initBias, boolean useGpu) { @@ -118,11 +118,15 @@ public final void setPreLayer(NeuralLayer preLayer) { this.newDelta = new float[inputChannels * inputWidth * inputHeight]; if(true){ - this.bufBias = OpenCL.createReadBuffer(bias); - this.bufFilter = OpenCL.createReadBuffer(filter); + this.bufFilter = OpenCL.createReadWriteBuffer(filter); + this.bufBias = OpenCL.createReadWriteBuffer(bias); + this.bufFilterDelta = OpenCL.createReadWriteBuffer(filter.length); + this.bufBiasDelta = OpenCL.createReadWriteBuffer(bias.length); OpenCL.getQueue() .putWriteBuffer(bufFilter, false) - .putWriteBuffer(bufBias, false); + .putWriteBuffer(bufBias, false) + .putWriteBuffer(bufFilterDelta, false) + .putWriteBuffer(bufBiasDelta, false); } } @@ -142,6 +146,22 @@ public float[] getBias() { return bias; } + public float[] getFilterDelta() { + if(bufFilterDelta != null){ + OpenCL.getQueue().putReadBuffer(bufFilterDelta, true); + bufFilterDelta.getBuffer().get(filterDelta).rewind(); + } + return filterDelta; + } + + public float[] getBiasDelta() { + if(bufBiasDelta != null){ + OpenCL.getQueue().putReadBuffer(bufBiasDelta, true); + bufBiasDelta.getBuffer().get(biasDelta).rewind(); + } + return biasDelta; + } + /** 畳み込みフィルタを適用する */ @Override public float[] forward(float[] img) { @@ -208,7 +228,7 @@ public float[] backward(float[] input, float[] delta) { delta, result, input, inputChannels, inputWidth, inputHeight, bufFilter, outputChannels, outputWidth, outputHeight, - filterDelta, biasDelta, filterSize, stride, newDelta, parent.getLearningRate()); + bufFilterDelta, bufBiasDelta, filterSize, stride, newDelta, parent.getLearningRate()); }else{ ConvolutionBackwordCL.INSTANCE.backward( delta, result, input, @@ -230,23 +250,38 @@ public float[] backward(float[] input, float[] delta) { @Override public void prepareBatch() { - float momentam = parent.getMomentam(); - IntStream.range(0, filterDelta.length).parallel().forEach(i -> filterDelta[i] = filterDelta[i] * momentam); - IntStream.range(0, biasDelta.length).parallel().forEach(i -> biasDelta[i] = biasDelta[i] * momentam); + if(useGpu){ + ConvolutionBackwordCL.INSTANCE.prepare(parent.getMomentam(), + filterDelta.length, biasDelta.length, bufFilterDelta, bufBiasDelta); + }else{ + float momentam = parent.getMomentam(); + IntStream.range(0, filterDelta.length).parallel().forEach(i -> filterDelta[i] = filterDelta[i] * momentam); + IntStream.range(0, biasDelta.length).parallel().forEach(i -> biasDelta[i] = biasDelta[i] * momentam); + } } @Override public void joinBatch() { - float count = parent.getMiniBatch(); - IntStream.range(0, filter.length).parallel().forEach(i -> filter[i] += filterDelta[i] / count - - parent.getWeightDecay() * parent.getLearningRate() * filter[i]); - IntStream.range(0, bias.length).parallel().forEach(i -> bias[i] += biasDelta[i] / count); - if(bufFilter != null){ + if(useGpu){ + ConvolutionBackwordCL.INSTANCE.join( + parent.getWeightDecay(), parent.getLearningRate(), + filter.length, bias.length, + parent.getMiniBatch(), + bufFilter, bufFilterDelta, bufBias, bufBiasDelta); + /* bufFilter.getBuffer().put(filter).rewind(); bufBias.getBuffer().put(bias).rewind(); OpenCL.getQueue() .putWriteBuffer(bufFilter, false) .putWriteBuffer(bufBias, false); + */ + }else{ + float count = parent.getMiniBatch(); + IntStream.range(0, filter.length).parallel().forEach( + i -> filter[i] += filterDelta[i] / count + - parent.getWeightDecay() * parent.getLearningRate() * filter[i]); + IntStream.range(0, bias.length).parallel().forEach( + i -> bias[i] += biasDelta[i] / count); } } diff --git a/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java index 404e0bd..7b0ef25 100644 --- a/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java +++ b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java @@ -29,36 +29,42 @@ public void backward(float[] delta, float[] result, float[] filterDelta, float[] biasDelta, int filterSize, int stride, float[] newDelta, float learningRate) { CLBuffer bufFilter = OpenCL.createReadBuffer(filter); + CLBuffer bufFilterDelta = OpenCL.createReadWriteBuffer(filterDelta); + CLBuffer bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); + OpenCL.getQueue() + .putWriteBuffer(bufFilter, false) + .putWriteBuffer(bufFilterDelta, false) + .putWriteBuffer(bufBiasDelta, false); backward(delta, result, input, inputChannels, inputWidth, inputHeight, bufFilter, outputChannels, outputWidth, outputHeight, - filterDelta, biasDelta, + bufFilterDelta, bufBiasDelta, filterSize, stride, newDelta, learningRate); OpenCL.getQueue() - .putWriteBuffer(bufFilter, false); + .putReadBuffer(bufBiasDelta, true) + .putReadBuffer(bufFilterDelta, true); + bufFilterDelta.getBuffer().get(filterDelta); + bufBiasDelta.getBuffer().get(biasDelta); bufFilter.release(); + bufFilterDelta.release(); + bufBiasDelta.release(); } public void backward(float[] delta, float[] result, float[] input, int inputChannels, int inputWidth, int inputHeight, CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight, - float[] filterDelta, float[] biasDelta, + CLBuffer bufFilterDelta, CLBuffer bufBiasDelta, int filterSize, int stride, float[] newDelta, float learningRate) { CLBuffer bufDelta = OpenCL.createReadBuffer(delta); CLBuffer bufResult = OpenCL.createReadBuffer(result); CLBuffer bufInput = OpenCL.createReadBuffer(input); - CLBuffer bufFilterDelta = OpenCL.createReadWriteBuffer(filterDelta); - CLBuffer bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); CLBuffer bufNewDelta = OpenCL.createWriteBuffer(newDelta.length); OpenCL.getQueue() .putWriteBuffer(bufDelta, false) - .putWriteBuffer(bufFilter, false) .putWriteBuffer(bufResult, false) - .putWriteBuffer(bufInput, false) - .putWriteBuffer(bufFilterDelta, false) - .putWriteBuffer(bufBiasDelta, false); + .putWriteBuffer(bufInput, false); backward(bufDelta, bufResult, bufInput, inputChannels, inputWidth, inputHeight, @@ -67,18 +73,12 @@ public void backward(float[] delta, float[] result, filterSize, stride, bufNewDelta, learningRate); OpenCL.getQueue() - .putReadBuffer(bufBiasDelta, true) - .putReadBuffer(bufFilterDelta, true) .putReadBuffer(bufNewDelta, true); bufNewDelta.getBuffer().get(newDelta); - bufFilterDelta.getBuffer().get(filterDelta); - bufBiasDelta.getBuffer().get(biasDelta); bufDelta.release(); bufResult.release(); bufInput.release(); - bufFilterDelta.release(); - bufBiasDelta.release(); bufNewDelta.release(); } public void backward(CLBuffer bufDelta, CLBuffer bufResult, @@ -157,6 +157,46 @@ public void backward(CLBuffer bufDelta, CLBuffer bufRe } + public void prepare(float momentam, + int filterCount, int biasCount, + CLBuffer bufFilterDelta, + CLBuffer bufBiasDelta){ + + CLKernel kernel = kernels.get("prepare"); + kernel.rewind() + .putArg(momentam) + .putArg(bufFilterDelta); + OpenCL.execute(kernel, filterCount); + kernel.rewind() + .putArg(momentam) + .putArg(bufBiasDelta); + OpenCL.execute(kernel, biasCount); + } + + public void join(float weightDecay, float learningRate, + int filterCount, int biasCount, int count, + CLBuffer bufFilter, CLBuffer bufFilterDelta, + CLBuffer bufBias, CLBuffer bufBiasDelta){ + CLKernel kernelFilter = kernels.get("joinFilter"); + kernelFilter.rewind() + .putArg(weightDecay) + .putArg(learningRate) + .putArg(count) + .putArgs( + bufFilter, + bufFilterDelta); + OpenCL.execute(kernelFilter, filterCount); + + CLKernel kernelBias = kernels.get("joinBias"); + kernelBias.rewind() + .putArg(count) + .putArgs( + bufBias, + bufBiasDelta); + OpenCL.execute(kernelBias, biasCount); + + } + public static void main(String[] args) { int inputChannels = 3; int inputWidth = 200; diff --git a/src/main/resources/kernels/convolution_backword.cl b/src/main/resources/kernels/convolution_backword.cl index cce42a9..6a5ea27 100644 --- a/src/main/resources/kernels/convolution_backword.cl +++ b/src/main/resources/kernels/convolution_backword.cl @@ -116,3 +116,46 @@ __kernel void biasAfter( } biasDelta[f] += b; } + +__kernel void prepare( + float momentam, + __global float* delta, + int count +){ + int f = get_global_id(0); + if(f >= count){ + return; + } + delta[f] *= momentam; +} + +__kernel void joinFilter( + float weightDecay, + float learningRate, + int count, + __global float* filter, + __global const float* filterDelta, + int len +){ + int f = get_global_id(0); + if(f >= len){ + return; + } + filter[f] += filterDelta[f] / count + - weightDecay * learningRate * filter[f]; +} + +__kernel void joinBias( + int count, + __global float* bias, + __global const float* biasDelta, + int len +){ + int f = get_global_id(0); + if(f >= len){ + return; + } + bias[f] += biasDelta[f] / count; +} + + From 21a339d800aef3a2438236273a5c110ef1decdec Mon Sep 17 00:00:00 2001 From: kishida Date: Wed, 23 Sep 2015 02:38:54 +0900 Subject: [PATCH 13/22] =?UTF-8?q?=E5=85=A8=E7=B5=90=E5=90=88=E5=B1=A4?= =?UTF-8?q?=E3=81=AE=E3=83=95=E3=82=A3=E3=83=AB=E3=82=BF=E6=9B=B4=E6=96=B0?= =?UTF-8?q?=E3=82=92GPU=E5=AF=BE=E5=BF=9C=E3=81=AB=E3=80=82alexnet?= =?UTF-8?q?=E3=81=AEfc0=E3=81=8CGPU=E3=81=AB=E8=BC=89=E3=81=9B=E3=82=8C?= =?UTF-8?q?=E3=82=8B=E3=82=88=E3=81=86=E3=81=AB=E3=81=AA=E3=81=A3=E3=81=A6?= =?UTF-8?q?=E5=80=8D=E9=80=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/kishida/cnn/layers/FullyConnect.java | 73 +++++++++++++------ .../kishida/cnn/opencl/FullyBackwordCL.java | 30 ++++---- src/main/resources/alexnet_def.json | 2 +- 3 files changed, 67 insertions(+), 38 deletions(-) diff --git a/src/main/java/kishida/cnn/layers/FullyConnect.java b/src/main/java/kishida/cnn/layers/FullyConnect.java index a4b66b2..db51080 100644 --- a/src/main/java/kishida/cnn/layers/FullyConnect.java +++ b/src/main/java/kishida/cnn/layers/FullyConnect.java @@ -14,6 +14,7 @@ import java.util.stream.IntStream; import kishida.cnn.activation.ActivationFunction; import kishida.cnn.kernels.FullyForwardKernel; +import kishida.cnn.opencl.ConvolutionBackwordCL; import kishida.cnn.opencl.FullyBackwordCL; import kishida.cnn.opencl.FullyForwardCL; import kishida.cnn.opencl.OpenCL; @@ -31,10 +32,8 @@ public class FullyConnect extends NeuralLayer implements LerningLayer{ @JsonInclude(JsonInclude.Include.NON_NULL) private float[] bias; @JsonInclude(JsonInclude.Include.NON_NULL) - @Getter private float[]weightDelta; @JsonInclude(JsonInclude.Include.NON_NULL) - @Getter private float[] biasDelta; @JsonProperty @@ -54,6 +53,9 @@ public class FullyConnect extends NeuralLayer implements LerningLayer{ CLBuffer bufWeight; CLBuffer bufBias; + CLBuffer bufWeightDelta; + CLBuffer bufBiasDelta; + public FullyConnect(String name, int outputSize, float initBias, float dropoutRate, ActivationFunction activation, boolean useGpu) { this(name, outputSize, null, null, initBias, null, null, dropoutRate, null, activation, useGpu); @@ -120,9 +122,13 @@ public final void setPreLayer(NeuralLayer preLayer) { if(useGpu){ bufWeight = OpenCL.createReadWriteBuffer(weight); bufBias = OpenCL.createReadWriteBuffer(bias); + bufWeightDelta = OpenCL.createReadWriteBuffer(weightDelta); + bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); OpenCL.getQueue() .putWriteBuffer(bufWeight, false) - .putWriteBuffer(bufBias, false); + .putWriteBuffer(bufBias, false) + .putWriteBuffer(bufWeightDelta, false) + .putWriteBuffer(bufBiasDelta, false); } } @@ -141,6 +147,21 @@ public float[] getBias() { } return bias; } + public float[] getWeightDelta() { + if(bufWeightDelta != null){ + OpenCL.getQueue().putReadBuffer(bufWeightDelta, true); + bufWeightDelta.getBuffer().get(weightDelta).rewind(); + } + return weightDelta; + } + + public float[] getBiasDelta() { + if(bufBiasDelta != null){ + OpenCL.getQueue().putReadBuffer(bufBiasDelta, true); + bufBiasDelta.getBuffer().get(biasDelta).rewind(); + } + return biasDelta; + } @JsonProperty("activationObj") public ActivationFunction getActivation() { @@ -161,7 +182,7 @@ public void prepareDropout() { public float[] forward(float[] in) { prepareDropout(); if(useGpu){ - if(true){ + if(false){ FullyForwardKernel.INSTANCE.forward(outputSize, dropout, in, result, weight, bias, useGpu); activation.applyAfter(result); }else{ @@ -176,9 +197,9 @@ public float[] forward(float[] in) { @Override public float[] backward(float[] in, float[] delta) { - if(useGpu && false){ + if(useGpu && true){ FullyBackwordCL.INSTANCE.backword(inputSize, outputSize, - dropout, in, delta, result, bufWeight, weightDelta, biasDelta, newDelta, + dropout, in, delta, result, bufWeight, bufWeightDelta, bufBiasDelta, newDelta, parent.getLearningRate(), activation); }else{ for(int i = 0; i < result.length; ++i){ @@ -205,27 +226,33 @@ public float[] backward(float[] in, float[] delta) { @Override public void prepareBatch() { - float momentam = parent.getMomentam(); - IntStream.range(0, weightDelta.length).forEach(i -> weightDelta[i] = weightDelta[i] * momentam); - IntStream.range(0, biasDelta.length).parallel().forEach(i -> biasDelta[i] = biasDelta[i] * momentam); + if(useGpu & true){ + ConvolutionBackwordCL.INSTANCE.prepare(parent.getMomentam(), + weightDelta.length, biasDelta.length, bufWeightDelta, bufBiasDelta); + }else{ + float momentam = parent.getMomentam(); + IntStream.range(0, weightDelta.length).forEach(i -> weightDelta[i] = weightDelta[i] * momentam); + IntStream.range(0, biasDelta.length).parallel().forEach(i -> biasDelta[i] = biasDelta[i] * momentam); + } } @Override public void joinBatch() { - IntStream.range(0, weight.length).parallel().forEach(ij -> { - weight[ij] += weightDelta[ij] / parent.getMiniBatch() - - weight[ij] * parent.getWeightDecay() * parent.getLearningRate(); - }); - IntStream.range(0, bias.length).parallel().forEach(i -> { - bias[i] += biasDelta[i] / parent.getMiniBatch(); - }); - if(bufWeight != null){ - bufWeight.getBuffer().put(weight).rewind(); - bufBias.getBuffer().put(bias).rewind(); - OpenCL.getQueue() - .putWriteBuffer(bufWeight, false) - .putWriteBuffer(bufBias, false); - } + if(useGpu & true){ + ConvolutionBackwordCL.INSTANCE.join( + parent.getWeightDecay(), parent.getLearningRate(), + weight.length, bias.length, + parent.getMiniBatch(), + bufWeight, bufWeightDelta, bufBias, bufBiasDelta); + }else{ + IntStream.range(0, weight.length).parallel().forEach(ij -> { + weight[ij] += weightDelta[ij] / parent.getMiniBatch() + - weight[ij] * parent.getWeightDecay() * parent.getLearningRate(); + }); + IntStream.range(0, bias.length).parallel().forEach(i -> { + bias[i] += biasDelta[i] / parent.getMiniBatch(); + }); + } } @Override diff --git a/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java b/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java index ee4d0ce..c92a121 100644 --- a/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java +++ b/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java @@ -35,36 +35,44 @@ public void backword(int inputSize, int outputSize, float[] newDelta, float learningRate, ActivationFunction activation){ CLBuffer bufWeight = OpenCL.createReadBuffer(weight); + CLBuffer bufWeightDelta = OpenCL.createReadWriteBuffer(weightDelta); + CLBuffer bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); OpenCL.getQueue() + .putWriteBuffer(bufWeightDelta ,false) + .putWriteBuffer(bufBiasDelta ,false) .putWriteBuffer(bufWeight ,false); backword(inputSize, outputSize, dropout, input, delta, - result, bufWeight, weightDelta, biasDelta, + result, bufWeight, bufWeightDelta, bufBiasDelta, newDelta, learningRate, activation); + OpenCL.getQueue() + .putReadBuffer(bufBiasDelta ,false) + .putReadBuffer(bufWeightDelta ,true); + bufBiasDelta.getBuffer().get(biasDelta); + bufWeightDelta.getBuffer().get(weightDelta); + bufWeight .release(); + bufWeightDelta .release(); + bufBiasDelta .release(); } public void backword(int inputSize, int outputSize, int[] dropout, float[] input, float[] delta, float[] result, CLBuffer bufWeight, - float[] weightDelta, float[] biasDelta, + CLBuffer bufWeightDelta, CLBuffer bufBiasDelta, float[] newDelta, float learningRate, ActivationFunction activation){ CLBuffer bufInput = OpenCL.createReadBuffer(input); CLBuffer bufDelta = OpenCL.createReadBuffer(delta); CLBuffer bufResult = OpenCL.createReadBuffer(result); CLBuffer bufNewDelta = OpenCL.createWriteBuffer(newDelta.length); - CLBuffer bufWeightDelta = OpenCL.createReadWriteBuffer(weightDelta); - CLBuffer bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); OpenCL.getQueue() .putWriteBuffer(bufInput ,false) .putWriteBuffer(bufDelta ,false) - .putWriteBuffer(bufResult ,false) - .putWriteBuffer(bufWeightDelta ,false) - .putWriteBuffer(bufBiasDelta ,false); + .putWriteBuffer(bufResult ,false); backword(inputSize, outputSize, dropout, bufInput, bufDelta, @@ -73,19 +81,13 @@ public void backword(int inputSize, int outputSize, learningRate, activation); OpenCL.getQueue() - .putReadBuffer(bufNewDelta ,false) - .putReadBuffer(bufBiasDelta ,false) - .putReadBuffer(bufWeightDelta ,true); + .putReadBuffer(bufNewDelta ,false); bufNewDelta.getBuffer().get(newDelta); - bufBiasDelta.getBuffer().get(biasDelta); - bufWeightDelta.getBuffer().get(weightDelta); bufInput .release(); bufDelta .release(); bufResult .release(); bufNewDelta .release(); - bufWeightDelta .release(); - bufBiasDelta .release(); } public void backword(int inputSize, int outputSize, diff --git a/src/main/resources/alexnet_def.json b/src/main/resources/alexnet_def.json index c5a0e0f..960b9e4 100644 --- a/src/main/resources/alexnet_def.json +++ b/src/main/resources/alexnet_def.json @@ -95,7 +95,7 @@ "initBias" : 1.0, "dropoutRate" : 0.5, "activation" : "RectifiedLinear", - "useGpu" : false + "useGpu" : true } }, { "FullyConnect" : { From c5da4985616e9f6210c3657f82474237da4a4b2a Mon Sep 17 00:00:00 2001 From: kishida Date: Wed, 23 Sep 2015 07:03:22 +0900 Subject: [PATCH 14/22] =?UTF-8?q?=E9=A0=86=E4=BC=9D=E6=92=AD=E3=82=92GPU?= =?UTF-8?q?=E3=83=A1=E3=83=A2=E3=83=AA=E4=B8=8A=E3=81=A7=E5=8B=95=E3=81=8B?= =?UTF-8?q?=E3=81=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kishida/cnn/layers/ConvolutionLayer.java | 29 ++++++++++-- .../kishida/cnn/layers/FullGpuEnabled.java | 21 +++++++++ .../java/kishida/cnn/layers/FullyConnect.java | 41 +++++++++++++++-- .../java/kishida/cnn/layers/InputLayer.java | 27 ++++++++++- .../kishida/cnn/layers/MaxPoolingLayer.java | 26 ++++++++++- .../cnn/layers/MultiNormalizeLayer.java | 25 ++++++++++- .../java/kishida/cnn/layers/NeuralLayer.java | 9 +++- .../kishida/cnn/opencl/FullyForwardCL.java | 45 ++++++++++--------- .../java/kishida/cnn/opencl/MaxPoolingCL.java | 28 +++++++----- .../kishida/cnn/opencl/MultiNormalizeCL.java | 26 +++++++---- src/main/resources/alexnet_def.json | 2 +- src/main/resources/tinynet_def.json | 2 +- 12 files changed, 226 insertions(+), 55 deletions(-) create mode 100644 src/main/java/kishida/cnn/layers/FullGpuEnabled.java diff --git a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java index f1f83ed..0a926e6 100644 --- a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java +++ b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java @@ -7,6 +7,7 @@ import com.amd.aparapi.Kernel; import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; import com.jogamp.opencl.CLBuffer; @@ -29,7 +30,7 @@ import lombok.Setter; /** 畳み込み層 */ -public class ConvolutionLayer extends ImageNeuralLayer implements LerningLayer{ +public class ConvolutionLayer extends ImageNeuralLayer implements LerningLayer, FullGpuEnabled{ @JsonInclude(JsonInclude.Include.NON_NULL) float[] filter; @@ -57,6 +58,10 @@ public class ConvolutionLayer extends ImageNeuralLayer implements LerningLayer{ CLBuffer bufFilterDelta; CLBuffer bufBiasDelta; + @JsonIgnore + @Getter + CLBuffer bufResult; + public ConvolutionLayer(String name, int filterCount, int size, int stride, float initBias, boolean useGpu) { this(name, size, filterCount, stride, null, null, initBias, null, null, useGpu); @@ -120,8 +125,9 @@ public final void setPreLayer(NeuralLayer preLayer) { if(true){ this.bufFilter = OpenCL.createReadWriteBuffer(filter); this.bufBias = OpenCL.createReadWriteBuffer(bias); - this.bufFilterDelta = OpenCL.createReadWriteBuffer(filter.length); - this.bufBiasDelta = OpenCL.createReadWriteBuffer(bias.length); + this.bufFilterDelta = OpenCL.createReadWriteBuffer(filterDelta); + this.bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); + this.bufResult = OpenCL.createReadWriteBuffer(result.length); OpenCL.getQueue() .putWriteBuffer(bufFilter, false) .putWriteBuffer(bufBias, false) @@ -162,6 +168,15 @@ public float[] getBiasDelta() { return biasDelta; } + @Override + public float[] getResult() { + if(bufResult != null){ + OpenCL.getQueue().putReadBuffer(bufResult, true); + bufResult.getBuffer().get(result).rewind(); + } + return result; + } + /** 畳み込みフィルタを適用する */ @Override public float[] forward(float[] img) { @@ -194,6 +209,14 @@ public float[] forward(float[] img) { return result; } + @Override + public void forward(CLBuffer input) { + ConvolutionForwardCL.INSTANCE.forward(input, + inputChannels, inputWidth, inputHeight, + bufFilter, outputChannels, outputWidth, outputHeight, + bufResult, filterSize, stride, bufBias); + } + /** 畳み込み層の学習 */ @Override public float[] backward(float[] input, float[] delta) { diff --git a/src/main/java/kishida/cnn/layers/FullGpuEnabled.java b/src/main/java/kishida/cnn/layers/FullGpuEnabled.java new file mode 100644 index 0000000..4712966 --- /dev/null +++ b/src/main/java/kishida/cnn/layers/FullGpuEnabled.java @@ -0,0 +1,21 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package kishida.cnn.layers; + +import com.jogamp.opencl.CLBuffer; +import java.nio.FloatBuffer; + +/** + * + * @author naoki + */ +public interface FullGpuEnabled { + default boolean isUseGpu(){ + return true; + } + CLBuffer getBufResult(); + void forward(CLBuffer input); +} diff --git a/src/main/java/kishida/cnn/layers/FullyConnect.java b/src/main/java/kishida/cnn/layers/FullyConnect.java index db51080..df8555f 100644 --- a/src/main/java/kishida/cnn/layers/FullyConnect.java +++ b/src/main/java/kishida/cnn/layers/FullyConnect.java @@ -6,10 +6,12 @@ package kishida.cnn.layers; import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; import com.jogamp.opencl.CLBuffer; import java.nio.FloatBuffer; +import java.nio.IntBuffer; import java.util.DoubleSummaryStatistics; import java.util.stream.IntStream; import kishida.cnn.activation.ActivationFunction; @@ -26,7 +28,7 @@ * * @author naoki */ -public class FullyConnect extends NeuralLayer implements LerningLayer{ +public class FullyConnect extends NeuralLayer implements LerningLayer, FullGpuEnabled{ @JsonInclude(JsonInclude.Include.NON_NULL) private float[]weight; @JsonInclude(JsonInclude.Include.NON_NULL) @@ -42,7 +44,8 @@ public class FullyConnect extends NeuralLayer implements LerningLayer{ private int[] dropout; @Getter private float dropoutRate = 1; - @Getter @Setter + //@Getter + @Setter private boolean useGpu; private float[] newDelta; private float[] diffed; @@ -55,7 +58,10 @@ public class FullyConnect extends NeuralLayer implements LerningLayer{ CLBuffer bufBias; CLBuffer bufWeightDelta; CLBuffer bufBiasDelta; - + CLBuffer bufDropout; + @JsonIgnore + @Getter + CLBuffer bufResult; public FullyConnect(String name, int outputSize, float initBias, float dropoutRate, ActivationFunction activation, boolean useGpu) { this(name, outputSize, null, null, initBias, null, null, dropoutRate, null, activation, useGpu); @@ -124,11 +130,14 @@ public final void setPreLayer(NeuralLayer preLayer) { bufBias = OpenCL.createReadWriteBuffer(bias); bufWeightDelta = OpenCL.createReadWriteBuffer(weightDelta); bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); + bufResult = OpenCL.createReadWriteBuffer(result.length); + bufDropout = OpenCL.createReadBuffer(dropout); OpenCL.getQueue() .putWriteBuffer(bufWeight, false) .putWriteBuffer(bufBias, false) .putWriteBuffer(bufWeightDelta, false) - .putWriteBuffer(bufBiasDelta, false); + .putWriteBuffer(bufBiasDelta, false) + .putWriteBuffer(bufDropout, false); } } @@ -163,6 +172,20 @@ public float[] getBiasDelta() { return biasDelta; } + @Override + public boolean isUseGpu() { + return useGpu; + } + + @Override + public float[] getResult() { + if(bufResult != null && isUseGpu()){ + OpenCL.getQueue().putReadBuffer(bufResult, true); + bufResult.getBuffer().get(result).rewind(); + } + return result; + } + @JsonProperty("activationObj") public ActivationFunction getActivation() { return activation; @@ -195,6 +218,16 @@ public float[] forward(float[] in) { return result; } + @Override + public void forward(CLBuffer input) { + prepareDropout(); + bufDropout.getBuffer().put(dropout).rewind(); + OpenCL.getQueue().putWriteBuffer(bufDropout, false); + + FullyForwardCL.INSTANCE.forward(inputSize, outputSize, + bufDropout, input, bufWeight, bufBias, bufResult, activation); + } + @Override public float[] backward(float[] in, float[] delta) { if(useGpu && true){ diff --git a/src/main/java/kishida/cnn/layers/InputLayer.java b/src/main/java/kishida/cnn/layers/InputLayer.java index 88a38f4..df8e787 100644 --- a/src/main/java/kishida/cnn/layers/InputLayer.java +++ b/src/main/java/kishida/cnn/layers/InputLayer.java @@ -6,13 +6,21 @@ package kishida.cnn.layers; import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonProperty; +import com.jogamp.opencl.CLBuffer; +import java.nio.FloatBuffer; +import kishida.cnn.opencl.OpenCL; +import lombok.Getter; /** * * @author naoki */ -public class InputLayer extends ImageNeuralLayer { +public class InputLayer extends ImageNeuralLayer implements FullGpuEnabled { + @JsonIgnore + @Getter + CLBuffer bufResult; public InputLayer(int width, int height) { this("input", width, height); @@ -24,6 +32,7 @@ public InputLayer( @JsonProperty("width") int width, @JsonProperty("height") int height) { super("input", 0, 0, 0, 3, width, height); + bufResult = OpenCL.createWriteBuffer(outputChannels * outputWidth * outputHeight); } @Override @@ -39,12 +48,25 @@ public int getHeight() { return super.outputHeight; } + @Override + public boolean isUseGpu() { + return false; + } + @Override public float[] forward(float[] in) { this.result = in; + bufResult.getBuffer().put(result); + OpenCL.getQueue() + .putWriteBuffer(bufResult, false); return result; } + @Override + public void forward(CLBuffer input) { + // do nothing + } + @Override public float[] backward(float[] in, float[] delta) { // do nothing @@ -53,6 +75,9 @@ public float[] backward(float[] in, float[] delta) { public void setInput(float[] input){ result = input; + bufResult.getBuffer().put(result).rewind(); + OpenCL.getQueue() + .putWriteBuffer(bufResult, false); } @Override diff --git a/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java b/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java index 1d94b5b..32e399e 100644 --- a/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java +++ b/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java @@ -6,24 +6,31 @@ package kishida.cnn.layers; import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonProperty; +import com.jogamp.opencl.CLBuffer; +import java.nio.FloatBuffer; import java.util.Arrays; import java.util.Random; import java.util.stream.Collectors; import java.util.stream.IntStream; import kishida.cnn.opencl.MaxPoolingCL; +import kishida.cnn.opencl.OpenCL; import lombok.Getter; /** * * @author naoki */ -public class MaxPoolingLayer extends ImageNeuralLayer { +public class MaxPoolingLayer extends ImageNeuralLayer implements FullGpuEnabled { @Getter int size; @Getter int stride; float[] newDelta; + @JsonIgnore + @Getter + CLBuffer bufResult; @JsonCreator public MaxPoolingLayer( @@ -43,6 +50,16 @@ public final void setPreLayer(NeuralLayer preLayer) { outputHeight = inputHeight / stride; result = new float[outputChannels * outputWidth * outputHeight]; newDelta = new float[inputChannels * inputWidth * inputHeight]; + bufResult = OpenCL.createReadWriteBuffer(result.length); + } + + @Override + public float[] getResult() { + if(bufResult != null){ + OpenCL.getQueue().putReadBuffer(bufResult, true); + bufResult.getBuffer().get(result).rewind(); + } + return result; } /** プーリング(max) */ @@ -80,6 +97,13 @@ public float[] forward(float[] data) { return result; } + @Override + public void forward(CLBuffer input) { + MaxPoolingCL.INSTANCE.forward(inputChannels, inputWidth, inputHeight, + outputWidth, outputHeight, size, stride, + input, bufResult); + } + @Override public float[] backward(float[] in, float[] delta) { return backward(in, delta, false); diff --git a/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java b/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java index 957e863..3c7c779 100644 --- a/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java +++ b/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java @@ -6,22 +6,29 @@ package kishida.cnn.layers; import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonProperty; +import com.jogamp.opencl.CLBuffer; +import java.nio.FloatBuffer; import java.util.stream.IntStream; import kishida.cnn.opencl.MultiNormalizeCL; +import kishida.cnn.opencl.OpenCL; import lombok.Getter; /** * * @author naoki */ -public class MultiNormalizeLayer extends ImageNeuralLayer{ +public class MultiNormalizeLayer extends ImageNeuralLayer implements FullGpuEnabled{ @Getter int size; @Getter float threshold; @Getter boolean useGpu; + @JsonIgnore + @Getter + CLBuffer bufResult; @JsonCreator public MultiNormalizeLayer( @@ -42,6 +49,16 @@ public final void setPreLayer(NeuralLayer preLayer) { outputWidth = inputWidth; outputHeight = inputHeight; result = new float[inputChannels * inputHeight * inputWidth]; + bufResult = OpenCL.createReadWriteBuffer(result.length); + } + + @Override + public float[] getResult() { + if(bufResult != null){ + OpenCL.getQueue().putReadBuffer(bufResult, true); + bufResult.getBuffer().get(result).rewind(); + } + return result; } @Override @@ -99,6 +116,12 @@ public float[] forward(float[] in) { return result; } + @Override + public void forward(CLBuffer input) { + MultiNormalizeCL.INSTANCE.normalize(inputChannels, inputWidth, inputHeight, size, + threshold, input, bufResult); + } + @Override public float[] backward(float[] in, float[] delta) { return delta; diff --git a/src/main/java/kishida/cnn/layers/NeuralLayer.java b/src/main/java/kishida/cnn/layers/NeuralLayer.java index 27f0b03..3501d97 100644 --- a/src/main/java/kishida/cnn/layers/NeuralLayer.java +++ b/src/main/java/kishida/cnn/layers/NeuralLayer.java @@ -46,9 +46,14 @@ public NeuralLayer(String name) { this.name = name; } - public float[] forward() { + public void forward() { Objects.requireNonNull(preLayer, "preLayer is null on " + name); - return forward(preLayer.result); + if(this instanceof FullGpuEnabled && preLayer instanceof FullGpuEnabled && + ((FullGpuEnabled)this).isUseGpu()){ + ((FullGpuEnabled)this).forward(((FullGpuEnabled)preLayer).getBufResult()); + }else{ + forward(preLayer.getResult()); + } } public float[] backward(float[] delta) { diff --git a/src/main/java/kishida/cnn/opencl/FullyForwardCL.java b/src/main/java/kishida/cnn/opencl/FullyForwardCL.java index fedc596..10cb2e0 100644 --- a/src/main/java/kishida/cnn/opencl/FullyForwardCL.java +++ b/src/main/java/kishida/cnn/opencl/FullyForwardCL.java @@ -48,20 +48,23 @@ public void forward(int inputSize, int outputSize, int[] dropout, ActivationFunction activation){ CLBuffer bufInput = OpenCL.createReadBuffer(input); CLBuffer bufResult = OpenCL.createReadWriteBuffer(result.length); + CLBuffer bufDropout = OpenCL.createReadBuffer(dropout); OpenCL.getQueue() - .putWriteBuffer(bufInput, false); + .putWriteBuffer(bufInput, false) + .putWriteBuffer(bufDropout, false); - forward(inputSize, outputSize, dropout, bufInput, bufWeight, bufBias, bufResult, activation); + forward(inputSize, outputSize, bufDropout, bufInput, bufWeight, bufBias, bufResult, activation); OpenCL.getQueue().putReadBuffer(bufResult, true); bufResult.getBuffer().get(result); bufInput.release(); bufResult.release(); + bufDropout.release(); } - public void forward(int inputSize, int outputSize, int[] dropout, + public void forward(int inputSize, int outputSize, CLBuffer bufDropout, CLBuffer bufInput, CLBuffer bufWeight, CLBuffer bufBias, CLBuffer bufResult, ActivationFunction activation){ @@ -74,10 +77,6 @@ public void forward(int inputSize, int outputSize, int[] dropout, actKernels = progActivation.createCLKernels(); } - CLBuffer bufDropout = OpenCL.createReadBuffer(dropout); - OpenCL.getQueue() - .putWriteBuffer(bufDropout, false); - forwardKernel.rewind() .putArg(inputSize) .putArg(outputSize) @@ -90,20 +89,7 @@ public void forward(int inputSize, int outputSize, int[] dropout, OpenCL.execute(forwardKernel, outputSize); if(activation instanceof SoftMaxFunction){ - CLBuffer bufExped = OpenCL.createReadWriteBuffer(outputSize); - CLKernel kernelActPre = actKernels.get("softmax_before"); - kernelActPre.rewind() - .putArg(bufResult) - .putArg(bufExped); - OpenCL.execute(kernelActPre, outputSize); - - CLKernel kernelAct = actKernels.get("softmax"); - kernelAct.rewind() - .putArg(bufExped) - .putArg(bufResult); - OpenCL.execute(kernelAct, outputSize); - - bufExped.release(); + softmax(outputSize, bufResult); }else{ CLKernel kernelAct = actKernels.get(activation.getName()); @@ -112,7 +98,22 @@ public void forward(int inputSize, int outputSize, int[] dropout, OpenCL.execute(kernelAct, outputSize); } - bufDropout.release(); + } + private void softmax(int outputSize, CLBuffer bufResult) { + CLBuffer bufExped = OpenCL.createReadWriteBuffer(outputSize); + CLKernel kernelActPre = actKernels.get("softmax_before"); + kernelActPre.rewind() + .putArg(bufResult) + .putArg(bufExped); + OpenCL.execute(kernelActPre, outputSize); + + CLKernel kernelAct = actKernels.get("softmax"); + kernelAct.rewind() + .putArg(bufExped) + .putArg(bufResult); + OpenCL.execute(kernelAct, outputSize); + + bufExped.release(); } } diff --git a/src/main/java/kishida/cnn/opencl/MaxPoolingCL.java b/src/main/java/kishida/cnn/opencl/MaxPoolingCL.java index 914a4ff..d250b12 100644 --- a/src/main/java/kishida/cnn/opencl/MaxPoolingCL.java +++ b/src/main/java/kishida/cnn/opencl/MaxPoolingCL.java @@ -26,16 +26,29 @@ private MaxPoolingCL() { public void forward(int inputChannel, int inputWidth, int inputHeight, int outputWidth, int ouptutHeight, int size, int stride, float[] input, float[] result){ + CLBuffer bufInput = OpenCL.createReadBuffer(input); + CLBuffer bufResult = OpenCL.createWriteBuffer(result.length); + OpenCL.getQueue() + .putWriteBuffer(bufInput, false); + + forward(inputChannel, inputWidth, inputHeight, + outputWidth, ouptutHeight, size, stride, bufInput, bufResult); + + OpenCL.getQueue().putReadBuffer(bufResult, true); + bufResult.getBuffer().get(result); + + bufInput.release(); + bufResult.release(); + + } + public void forward(int inputChannel, int inputWidth, int inputHeight, int outputWidth, int ouptutHeight, + int size, int stride, CLBuffer bufInput, CLBuffer bufResult){ + if(prog == null){ prog = OpenCL.compile("maxpooling.cl"); kernels = prog.createCLKernels(); } - CLBuffer bufInput = OpenCL.createReadBuffer(input); - CLBuffer bufResult = OpenCL.createWriteBuffer(result.length); - - OpenCL.getQueue() - .putWriteBuffer(bufInput, false); CLKernel kernelForward = kernels.get("forward"); kernelForward.rewind() .putArg(inputWidth) @@ -49,11 +62,6 @@ public void forward(int inputChannel, int inputWidth, int inputHeight, int outpu bufResult); OpenCL.execute(kernelForward, inputChannel * outputWidth * ouptutHeight); - OpenCL.getQueue().putReadBuffer(bufResult, true); - bufResult.getBuffer().get(result); - - bufInput.release(); - bufResult.release(); } public void backword(int inputChannel, int inputWidth, int inputHeight, diff --git a/src/main/java/kishida/cnn/opencl/MultiNormalizeCL.java b/src/main/java/kishida/cnn/opencl/MultiNormalizeCL.java index 47cee35..294810d 100644 --- a/src/main/java/kishida/cnn/opencl/MultiNormalizeCL.java +++ b/src/main/java/kishida/cnn/opencl/MultiNormalizeCL.java @@ -24,17 +24,31 @@ public class MultiNormalizeCL { public void normalize(int inputChannels, int inputWidth, int inputHeight, int size, float threshold, float[] input, float[] result){ + CLBuffer bufInput = OpenCL.createReadBuffer(input); + CLBuffer bufResult = OpenCL.createWriteBuffer(result.length); + OpenCL.getQueue().putWriteBuffer(bufInput, false); + + OpenCL.getQueue().putReadBuffer(bufResult, true); + bufResult.getBuffer().get(result); + + normalize(inputChannels, inputWidth, inputHeight, size, threshold, + bufInput, bufResult); + + bufInput.release(); + bufResult.release(); + + } + public void normalize(int inputChannels, int inputWidth, int inputHeight, + int size, float threshold, + CLBuffer bufInput, CLBuffer bufResult){ if(prog == null){ prog = OpenCL.compile("multi_normalize.cl"); kernels = prog.createCLKernels(); } - CLBuffer bufInput = OpenCL.createReadBuffer(input); CLBuffer bufAverages = OpenCL.createReadWriteBuffer(inputWidth * inputHeight); CLBuffer bufStds = OpenCL.createReadWriteBuffer(inputWidth * inputHeight); - CLBuffer bufResult = OpenCL.createWriteBuffer(result.length); - OpenCL.getQueue().putWriteBuffer(bufInput, false); CLKernel kernelAverage = kernels.get("average"); kernelAverage.rewind() .putArg(inputChannels) @@ -60,14 +74,8 @@ public void normalize(int inputChannels, int inputWidth, int inputHeight, bufResult); OpenCL.execute(kernelForward, inputChannels * inputWidth * inputHeight); - OpenCL.getQueue().putReadBuffer(bufResult, true); - - bufResult.getBuffer().get(result); - - bufInput.release(); bufAverages.release(); bufStds.release(); - bufResult.release(); } diff --git a/src/main/resources/alexnet_def.json b/src/main/resources/alexnet_def.json index 960b9e4..66771f6 100644 --- a/src/main/resources/alexnet_def.json +++ b/src/main/resources/alexnet_def.json @@ -113,7 +113,7 @@ "initBias" : 1.0, "dropoutRate" : 1.0, "activation" : "SoftMaxFunction", - "useGpu" : false + "useGpu" : true } } ] } \ No newline at end of file diff --git a/src/main/resources/tinynet_def.json b/src/main/resources/tinynet_def.json index 6723860..c54e1cd 100644 --- a/src/main/resources/tinynet_def.json +++ b/src/main/resources/tinynet_def.json @@ -73,7 +73,7 @@ "initBias" : 1.0, "dropoutRate" : 1.0, "activation" : "SoftMaxFunction", - "useGpu" : false + "useGpu" : true } } ] } \ No newline at end of file From fff78277975041274f4d5a21977d566cd90236ca Mon Sep 17 00:00:00 2001 From: kishida Date: Wed, 23 Sep 2015 16:26:42 +0900 Subject: [PATCH 15/22] =?UTF-8?q?=E9=80=86=E4=BC=9D=E6=92=AD=E3=82=92GPU?= =?UTF-8?q?=E3=83=A1=E3=83=A2=E3=83=AA=E4=B8=8A=E3=81=A7=E5=8B=95=E3=81=8B?= =?UTF-8?q?=E3=81=99=E3=80=82=E6=80=9D=E3=81=84=E3=81=AE=E3=81=BB=E3=81=8B?= =?UTF-8?q?=E9=80=9F=E3=81=8F=E3=81=AA=E3=82=89=E3=81=9A=E3=80=82=E3=83=97?= =?UTF-8?q?=E3=83=AD=E3=83=95=E3=82=A1=E3=82=A4=E3=83=AB=E3=81=AB=E3=81=AF?= =?UTF-8?q?=E3=83=8E=E3=83=B3=E3=83=96=E3=83=AD=E3=83=83=E3=82=AD=E3=83=B3?= =?UTF-8?q?=E3=82=B0=E5=87=A6=E7=90=86=E3=81=AE=E6=95=B0=E5=AD=97=E3=81=8C?= =?UTF-8?q?=E3=81=A7=E3=81=AA=E3=81=8F=E3=81=A6=E3=81=A0=E3=81=BE=E3=81=95?= =?UTF-8?q?=E3=82=8C=E3=82=8B=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/kishida/cnn/ConvolutionalNet.java | 10 ++--- src/main/java/kishida/cnn/NeuralNetwork.java | 20 +++++++++- .../kishida/cnn/layers/ConvolutionLayer.java | 31 ++++++++++++++-- .../kishida/cnn/layers/FullGpuEnabled.java | 12 +++++- .../java/kishida/cnn/layers/FullyConnect.java | 11 ++++++ .../java/kishida/cnn/layers/InputLayer.java | 12 ++++++ .../kishida/cnn/layers/MaxPoolingLayer.java | 11 ++++++ .../cnn/layers/MultiNormalizeLayer.java | 5 +++ .../java/kishida/cnn/layers/NeuralLayer.java | 5 ++- .../cnn/opencl/ConvolutionBackwordCL.java | 31 ++++++++-------- .../kishida/cnn/opencl/FullyBackwordCL.java | 13 +++---- .../java/kishida/cnn/opencl/MaxPoolingCL.java | 37 ++++++++++++------- 12 files changed, 149 insertions(+), 49 deletions(-) diff --git a/src/main/java/kishida/cnn/ConvolutionalNet.java b/src/main/java/kishida/cnn/ConvolutionalNet.java index 74c65ba..659b400 100644 --- a/src/main/java/kishida/cnn/ConvolutionalNet.java +++ b/src/main/java/kishida/cnn/ConvolutionalNet.java @@ -59,10 +59,10 @@ public class ConvolutionalNet { private static final int MINI_BATCH = 128; private static final float MOMENTAM = 0.9f; public static final String AVERAGE_PNG = "average.png"; - //private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\alexnet.json.txt"; - //private static final String RESOURCE_NAME = "/alexnet_def.json"; - private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\tinynet.json.txt"; - private static final String RESOURCE_NAME = "/tinynet_def.json"; + private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\alexnet.json.txt"; + private static final String RESOURCE_NAME = "/alexnet_def.json"; + //private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\tinynet.json.txt"; + //private static final String RESOURCE_NAME = "/tinynet_def.json"; static class Img{ @@ -325,7 +325,7 @@ public static void main(String[] args) throws IOException { System.out.printf("weight: %.2f~%.2f average %.2f ", ws.getMin(), ws.getMax(), ws.getAverage()); DoubleSummaryStatistics bs = ((LerningLayer)layer).getBiasStatistics(); - System.out.printf("bias: %.2f~%.2f average %.2f ", + System.out.printf("bias: %.8f~%.8f average %.2f ", bs.getMin(), bs.getMax(), bs.getAverage()); } System.out.println(); diff --git a/src/main/java/kishida/cnn/NeuralNetwork.java b/src/main/java/kishida/cnn/NeuralNetwork.java index 640839d..809adad 100644 --- a/src/main/java/kishida/cnn/NeuralNetwork.java +++ b/src/main/java/kishida/cnn/NeuralNetwork.java @@ -10,11 +10,13 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationFeature; +import com.jogamp.opencl.CLBuffer; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.io.StringWriter; import java.io.Writer; +import java.nio.FloatBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -23,6 +25,7 @@ import java.util.stream.IntStream; import kishida.cnn.activation.LogisticFunction; import kishida.cnn.layers.ConvolutionLayer; +import kishida.cnn.layers.FullGpuEnabled; import kishida.cnn.layers.FullyConnect; import kishida.cnn.layers.InputLayer; import kishida.cnn.layers.MaxPoolingLayer; @@ -165,8 +168,23 @@ public float[] forward(float[] readData, float[] correctData){ delta[idx] = correctData[idx] - output[idx]; } //逆伝播 + CLBuffer bufDelta = null; for(int i = layers.size() - 1; i >= 1; --i){ - delta = layers.get(i).backward(delta); + FullGpuEnabled layer = layers.get(i) instanceof FullGpuEnabled ? + (FullGpuEnabled) layers.get(i) : null; + FullGpuEnabled pre = layers.get(i).getPreLayer() instanceof FullGpuEnabled ? + (FullGpuEnabled)layers.get(i).getPreLayer() : null; + if(true && layer != null && pre != null && layer.isUseGpu()){ + if(bufDelta == null){ + bufDelta = layer.backwardBuf(pre.getBufResult(), delta); + }else{ + bufDelta = layer.backwardBuf(pre.getBufResult(), bufDelta); + } + delta = null; + }else{ + delta = layers.get(i).backward(delta); + bufDelta = null; + } } return output; diff --git a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java index 0a926e6..23c773c 100644 --- a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java +++ b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java @@ -57,6 +57,9 @@ public class ConvolutionLayer extends ImageNeuralLayer implements LerningLayer, CLBuffer bufBias; CLBuffer bufFilterDelta; CLBuffer bufBiasDelta; + CLBuffer bufDelta; + CLBuffer bufNewDelta; + CLBuffer bufTempBias; @JsonIgnore @Getter @@ -128,6 +131,9 @@ public final void setPreLayer(NeuralLayer preLayer) { this.bufFilterDelta = OpenCL.createReadWriteBuffer(filterDelta); this.bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); this.bufResult = OpenCL.createReadWriteBuffer(result.length); + this.bufDelta = OpenCL.createReadWriteBuffer(result.length); + this.bufNewDelta = OpenCL.createWriteBuffer(newDelta.length); + bufTempBias = OpenCL.createReadWriteBuffer(outputChannels * outputWidth * outputHeight); OpenCL.getQueue() .putWriteBuffer(bufFilter, false) .putWriteBuffer(bufBias, false) @@ -247,11 +253,18 @@ public float[] backward(float[] input, float[] delta) { }else{ // JOCL if(true){ + bufDelta.getBuffer().put(delta).rewind(); + OpenCL.getQueue().putWriteBuffer(bufDelta, false); + ConvolutionBackwordCL.INSTANCE.backward( - delta, result, input, + bufDelta, bufResult, ((FullGpuEnabled)preLayer).getBufResult(), inputChannels, inputWidth, inputHeight, bufFilter, outputChannels, outputWidth, outputHeight, - bufFilterDelta, bufBiasDelta, filterSize, stride, newDelta, parent.getLearningRate()); + bufFilterDelta, bufBiasDelta, bufTempBias, filterSize, stride, bufNewDelta, + parent.getLearningRate()); + + OpenCL.getQueue().putReadBuffer(bufNewDelta, true); + bufNewDelta.getBuffer().get(newDelta).rewind(); }else{ ConvolutionBackwordCL.INSTANCE.backward( delta, result, input, @@ -271,6 +284,16 @@ public float[] backward(float[] input, float[] delta) { } } + @Override + public CLBuffer backwardBuf(CLBuffer bufInput, CLBuffer bufDelta) { + ConvolutionBackwordCL.INSTANCE.backward(bufDelta, bufResult, bufInput, + inputChannels, inputWidth, inputHeight, + bufFilter, outputChannels, outputWidth, outputHeight, + bufFilterDelta, bufBiasDelta, bufTempBias, + filterSize, stride, bufNewDelta, parent.getLearningRate()); + return bufNewDelta; + } + @Override public void prepareBatch() { if(useGpu){ @@ -318,12 +341,12 @@ public String toString() { @Override public DoubleSummaryStatistics getWeightStatistics() { - return FloatUtil.summary(filter); + return FloatUtil.summary(getFilter()); } @Override public DoubleSummaryStatistics getBiasStatistics() { - return FloatUtil.summary(bias); + return FloatUtil.summary(getBias()); } } diff --git a/src/main/java/kishida/cnn/layers/FullGpuEnabled.java b/src/main/java/kishida/cnn/layers/FullGpuEnabled.java index 4712966..437ff73 100644 --- a/src/main/java/kishida/cnn/layers/FullGpuEnabled.java +++ b/src/main/java/kishida/cnn/layers/FullGpuEnabled.java @@ -7,6 +7,7 @@ import com.jogamp.opencl.CLBuffer; import java.nio.FloatBuffer; +import kishida.cnn.opencl.OpenCL; /** * @@ -17,5 +18,14 @@ default boolean isUseGpu(){ return true; } CLBuffer getBufResult(); - void forward(CLBuffer input); + void forward(CLBuffer bufInput); + CLBuffer backwardBuf(CLBuffer bufInput, CLBuffer bufDelta); + default CLBuffer backwardBuf(CLBuffer bufInput, float[] delta){ + CLBuffer bufDelta = OpenCL.createReadBuffer(delta); + OpenCL.getQueue().putWriteBuffer(bufDelta, false); + CLBuffer result = backwardBuf(bufInput, bufDelta); + bufDelta.release(); + return result; + } + } diff --git a/src/main/java/kishida/cnn/layers/FullyConnect.java b/src/main/java/kishida/cnn/layers/FullyConnect.java index df8555f..9f93f22 100644 --- a/src/main/java/kishida/cnn/layers/FullyConnect.java +++ b/src/main/java/kishida/cnn/layers/FullyConnect.java @@ -62,6 +62,7 @@ public class FullyConnect extends NeuralLayer implements LerningLayer, FullGpuEn @JsonIgnore @Getter CLBuffer bufResult; + CLBuffer bufNewDelta; public FullyConnect(String name, int outputSize, float initBias, float dropoutRate, ActivationFunction activation, boolean useGpu) { this(name, outputSize, null, null, initBias, null, null, dropoutRate, null, activation, useGpu); @@ -132,6 +133,7 @@ public final void setPreLayer(NeuralLayer preLayer) { bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); bufResult = OpenCL.createReadWriteBuffer(result.length); bufDropout = OpenCL.createReadBuffer(dropout); + bufNewDelta = OpenCL.createReadWriteBuffer(newDelta.length); OpenCL.getQueue() .putWriteBuffer(bufWeight, false) .putWriteBuffer(bufBias, false) @@ -257,6 +259,15 @@ public float[] backward(float[] in, float[] delta) { return newDelta; } + @Override + public CLBuffer backwardBuf(CLBuffer bufInput, CLBuffer bufDelta) { + FullyBackwordCL.INSTANCE.backword(inputSize, outputSize, bufDropout, + bufInput, bufDelta, bufResult, + bufWeight, bufWeightDelta, + bufBiasDelta, bufNewDelta, parent.getLearningRate(), activation); + return bufNewDelta; + } + @Override public void prepareBatch() { if(useGpu & true){ diff --git a/src/main/java/kishida/cnn/layers/InputLayer.java b/src/main/java/kishida/cnn/layers/InputLayer.java index df8e787..80c380e 100644 --- a/src/main/java/kishida/cnn/layers/InputLayer.java +++ b/src/main/java/kishida/cnn/layers/InputLayer.java @@ -73,6 +73,18 @@ public float[] backward(float[] in, float[] delta) { return null; } + @Override + public CLBuffer backwardBuf(CLBuffer bufInput, CLBuffer bufDelta) { + // do nothing + return null; + } + + @Override + public CLBuffer backwardBuf(CLBuffer bufInput, float[] delta) { + // do nothing + return null; + } + public void setInput(float[] input){ result = input; bufResult.getBuffer().put(result).rewind(); diff --git a/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java b/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java index 32e399e..9625241 100644 --- a/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java +++ b/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java @@ -32,6 +32,8 @@ public class MaxPoolingLayer extends ImageNeuralLayer implements FullGpuEnabled @Getter CLBuffer bufResult; + CLBuffer bufNewDelta; + @JsonCreator public MaxPoolingLayer( @JsonProperty("name") String name, @@ -51,6 +53,7 @@ public final void setPreLayer(NeuralLayer preLayer) { result = new float[outputChannels * outputWidth * outputHeight]; newDelta = new float[inputChannels * inputWidth * inputHeight]; bufResult = OpenCL.createReadWriteBuffer(result.length); + bufNewDelta = OpenCL.createReadWriteBuffer(newDelta.length); } @Override @@ -148,6 +151,14 @@ public float[] backward(float[] in, float[] delta, boolean gpu) { return newDelta; } + @Override + public CLBuffer backwardBuf(CLBuffer bufInput, CLBuffer bufDelta) { + MaxPoolingCL.INSTANCE.backword(inputChannels, inputWidth, inputHeight, + outputWidth, outputHeight, size, stride, + bufInput, bufDelta, bufNewDelta); + return bufNewDelta; + } + public static void main(String[] args) { InputLayer input = new InputLayer(6, 6); MaxPoolingLayer pool = new MaxPoolingLayer("test_pool", 3, 2); diff --git a/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java b/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java index 3c7c779..a0f504c 100644 --- a/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java +++ b/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java @@ -127,6 +127,11 @@ public float[] backward(float[] in, float[] delta) { return delta; } + @Override + public CLBuffer backwardBuf(CLBuffer bufInput, CLBuffer bufDelta) { + return bufDelta; + } + @Override public String toString() { return String.format("%s:Multi channel normalize size:%dx%d in:%dx%dx%d out %dx%dx%d", diff --git a/src/main/java/kishida/cnn/layers/NeuralLayer.java b/src/main/java/kishida/cnn/layers/NeuralLayer.java index 3501d97..7906c5e 100644 --- a/src/main/java/kishida/cnn/layers/NeuralLayer.java +++ b/src/main/java/kishida/cnn/layers/NeuralLayer.java @@ -36,7 +36,8 @@ public abstract class NeuralLayer { @Getter float[] result; - @Setter + @JsonIgnore + @Setter @Getter NeuralLayer preLayer; @Setter @@ -76,7 +77,7 @@ public void joinBatch(){ @JsonIgnore public DoubleSummaryStatistics getResultStatistics(){ - return FloatUtil.summary(result); + return FloatUtil.summary(getResult()); } } diff --git a/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java index 7b0ef25..bc981d3 100644 --- a/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java +++ b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java @@ -31,13 +31,17 @@ public void backward(float[] delta, float[] result, CLBuffer bufFilter = OpenCL.createReadBuffer(filter); CLBuffer bufFilterDelta = OpenCL.createReadWriteBuffer(filterDelta); CLBuffer bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); + CLBuffer bufResult = OpenCL.createReadBuffer(result); + CLBuffer bufInput = OpenCL.createReadBuffer(input); OpenCL.getQueue() .putWriteBuffer(bufFilter, false) .putWriteBuffer(bufFilterDelta, false) - .putWriteBuffer(bufBiasDelta, false); + .putWriteBuffer(bufBiasDelta, false) + .putWriteBuffer(bufInput, false) + .putWriteBuffer(bufResult, false); - backward(delta, result, - input, inputChannels, inputWidth, inputHeight, + backward(delta, bufResult, + bufInput, inputChannels, inputWidth, inputHeight, bufFilter, outputChannels, outputWidth, outputHeight, bufFilterDelta, bufBiasDelta, filterSize, stride, newDelta, learningRate); @@ -51,25 +55,24 @@ public void backward(float[] delta, float[] result, bufFilter.release(); bufFilterDelta.release(); bufBiasDelta.release(); + bufInput.release(); + bufResult.release(); } - public void backward(float[] delta, float[] result, - float[] input, int inputChannels, int inputWidth, int inputHeight, + public void backward(float[] delta, CLBuffer bufResult, + CLBuffer bufInput, int inputChannels, int inputWidth, int inputHeight, CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight, CLBuffer bufFilterDelta, CLBuffer bufBiasDelta, int filterSize, int stride, float[] newDelta, float learningRate) { CLBuffer bufDelta = OpenCL.createReadBuffer(delta); - CLBuffer bufResult = OpenCL.createReadBuffer(result); - CLBuffer bufInput = OpenCL.createReadBuffer(input); CLBuffer bufNewDelta = OpenCL.createWriteBuffer(newDelta.length); + CLBuffer bufTempBias = OpenCL.createReadWriteBuffer(outputChannels * outputWidth * outputHeight); OpenCL.getQueue() - .putWriteBuffer(bufDelta, false) - .putWriteBuffer(bufResult, false) - .putWriteBuffer(bufInput, false); + .putWriteBuffer(bufDelta, false); backward(bufDelta, bufResult, bufInput, inputChannels, inputWidth, inputHeight, bufFilter, outputChannels, outputWidth, outputHeight, - bufFilterDelta, bufBiasDelta, + bufFilterDelta, bufBiasDelta, bufTempBias, filterSize, stride, bufNewDelta, learningRate); OpenCL.getQueue() @@ -77,22 +80,20 @@ public void backward(float[] delta, float[] result, bufNewDelta.getBuffer().get(newDelta); bufDelta.release(); - bufResult.release(); - bufInput.release(); bufNewDelta.release(); + bufTempBias.release(); } public void backward(CLBuffer bufDelta, CLBuffer bufResult, CLBuffer bufInput, int inputChannels, int inputWidth, int inputHeight, CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight, CLBuffer bufFilterDelta, CLBuffer bufBiasDelta, + CLBuffer bufTempBias, int filterSize, int stride, CLBuffer bufNewDelta, float learningRate) { if(prog == null){ prog = OpenCL.compile("convolution_backword.cl"); kernels = prog.createCLKernels(); } - CLBuffer bufTempBias = OpenCL.createReadWriteBuffer(outputChannels * outputWidth * outputHeight); - CLKernel deltaKernel = prog.createCLKernel("delta"); deltaKernel .rewind() diff --git a/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java b/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java index c92a121..c08eae1 100644 --- a/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java +++ b/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java @@ -69,13 +69,15 @@ public void backword(int inputSize, int outputSize, CLBuffer bufDelta = OpenCL.createReadBuffer(delta); CLBuffer bufResult = OpenCL.createReadBuffer(result); CLBuffer bufNewDelta = OpenCL.createWriteBuffer(newDelta.length); + CLBuffer bufDropout = OpenCL.createReadBuffer(dropout); OpenCL.getQueue() .putWriteBuffer(bufInput ,false) .putWriteBuffer(bufDelta ,false) - .putWriteBuffer(bufResult ,false); + .putWriteBuffer(bufResult ,false) + .putWriteBuffer(bufDropout ,false); backword(inputSize, outputSize, - dropout, bufInput, bufDelta, + bufDropout, bufInput, bufDelta, bufResult, bufWeight, bufWeightDelta, bufBiasDelta, bufNewDelta, learningRate, activation); @@ -88,10 +90,11 @@ public void backword(int inputSize, int outputSize, bufDelta .release(); bufResult .release(); bufNewDelta .release(); + bufDropout .release(); } public void backword(int inputSize, int outputSize, - int[] dropout, CLBuffer bufInput, CLBuffer bufDelta, + CLBuffer bufDropout, CLBuffer bufInput, CLBuffer bufDelta, CLBuffer bufResult, CLBuffer bufWeight, CLBuffer bufWeightDelta, CLBuffer bufBiasDelta, CLBuffer bufNewDelta, @@ -105,12 +108,8 @@ public void backword(int inputSize, int outputSize, actKernels = progActivation.createCLKernels(); } - CLBuffer bufDropout = OpenCL.createReadBuffer(dropout); CLBuffer bufDiffed = OpenCL.createReadWriteBuffer(outputSize); - OpenCL.getQueue() - .putWriteBuffer(bufDropout ,false); - CLKernel actKernel = actKernels.get(activation.getName() + "_diff"); actKernel.rewind() .putArg(bufResult) diff --git a/src/main/java/kishida/cnn/opencl/MaxPoolingCL.java b/src/main/java/kishida/cnn/opencl/MaxPoolingCL.java index d250b12..e143fe9 100644 --- a/src/main/java/kishida/cnn/opencl/MaxPoolingCL.java +++ b/src/main/java/kishida/cnn/opencl/MaxPoolingCL.java @@ -68,19 +68,34 @@ public void backword(int inputChannel, int inputWidth, int inputHeight, int outputWidth, int outputHeight, int size, int stride, float[] input, float[] delta, float[] newDelta){ - if(prog == null){ - prog = OpenCL.compile("maxpooling.cl"); - kernels = prog.createCLKernels(); - } - CLBuffer bufInput = OpenCL.createReadBuffer(input); CLBuffer bufDelta = OpenCL.createReadBuffer(delta); CLBuffer bufNewDelta = OpenCL.createReadWriteBuffer(newDelta); - OpenCL.getQueue() .putWriteBuffer(bufInput, false) - .putWriteBuffer(bufDelta, false) - .putWriteBuffer(bufNewDelta, false); + .putWriteBuffer(bufDelta, false); + + backword(inputChannel, inputWidth, inputHeight, + outputWidth, outputHeight, size, stride, + bufInput, bufDelta, bufNewDelta); + + OpenCL.getQueue().putReadBuffer(bufNewDelta, true); + bufNewDelta.getBuffer().get(newDelta); + + bufInput.release(); + bufDelta.release(); + bufNewDelta.release(); + } + public void backword(int inputChannel, int inputWidth, int inputHeight, + int outputWidth, int outputHeight, + int size, int stride, + CLBuffer bufInput, + CLBuffer bufDelta, CLBuffer bufNewDelta){ + if(prog == null){ + prog = OpenCL.compile("maxpooling.cl"); + kernels = prog.createCLKernels(); + } + CLKernel kernelForward = kernels.get("backword"); kernelForward.rewind() .putArg(inputWidth) @@ -95,12 +110,6 @@ public void backword(int inputChannel, int inputWidth, int inputHeight, bufNewDelta); OpenCL.execute(kernelForward, inputChannel * inputWidth * inputHeight); - OpenCL.getQueue().putReadBuffer(bufNewDelta, true); - bufNewDelta.getBuffer().get(newDelta); - - bufInput.release(); - bufDelta.release(); - bufNewDelta.release(); } From 0fee26dab87f14fa49093fa291e8462f589d7ca4 Mon Sep 17 00:00:00 2001 From: kishida Date: Wed, 23 Sep 2015 16:45:48 +0900 Subject: [PATCH 16/22] =?UTF-8?q?=E5=BF=85=E8=A6=81=E3=81=AA=E3=83=90?= =?UTF-8?q?=E3=83=83=E3=83=95=E3=82=A1=E3=82=92=E3=83=AA=E3=83=AA=E3=83=BC?= =?UTF-8?q?=E3=82=B9=E3=81=97=E3=81=A6=E3=81=84=E3=81=9F=E3=81=AE=E3=81=A7?= =?UTF-8?q?=E4=BF=AE=E6=95=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java | 2 -- src/main/java/kishida/cnn/opencl/FullyBackwordCL.java | 1 - src/main/java/kishida/cnn/opencl/OpenCL.java | 2 +- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java index bc981d3..0965684 100644 --- a/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java +++ b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java @@ -154,8 +154,6 @@ public void backward(CLBuffer bufDelta, CLBuffer bufRe bufBiasDelta); OpenCL.execute(biasAfterKernel, outputChannels); - bufTempBias.release(); - } public void prepare(float momentam, diff --git a/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java b/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java index c08eae1..99bd99f 100644 --- a/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java +++ b/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java @@ -150,7 +150,6 @@ public void backword(int inputSize, int outputSize, .putArg(bufBiasDelta); OpenCL.execute(kernelBias, outputSize); - bufDropout .release(); bufDiffed .release(); } diff --git a/src/main/java/kishida/cnn/opencl/OpenCL.java b/src/main/java/kishida/cnn/opencl/OpenCL.java index 6d943f0..bb635c1 100644 --- a/src/main/java/kishida/cnn/opencl/OpenCL.java +++ b/src/main/java/kishida/cnn/opencl/OpenCL.java @@ -85,7 +85,7 @@ public static CLBuffer createReadBuffer(int[] data){ return buf; } public static CLCommandQueue execute(CLKernel kernel, int range){ - int localWorkSize = Math.min(device.getMaxWorkGroupSize(), 256); + int localWorkSize = Math.min(device.getMaxWorkGroupSize(), 128); int globalWorkSize = roundUp(localWorkSize, range); kernel.putArg(range); return getQueue().put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize); From e87944054f06e83e51233cdc6b81462d695b3b4a Mon Sep 17 00:00:00 2001 From: kishida Date: Thu, 24 Sep 2015 13:15:47 +0900 Subject: [PATCH 17/22] =?UTF-8?q?=E7=95=B3=E8=BE=BC=E3=81=BF=E9=80=86?= =?UTF-8?q?=E4=BC=9D=E6=92=AD=E3=81=AE3=E3=81=A4=E3=81=AE=E3=82=AB?= =?UTF-8?q?=E3=83=BC=E3=83=8D=E3=83=AB=E3=82=92=E4=B8=A6=E5=88=97=E5=AE=9F?= =?UTF-8?q?=E8=A1=8C=E3=81=99=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cnn/opencl/ConvolutionBackwordCL.java | 55 ++++++- .../resources/kernels/convolution_backword.cl | 141 +++++++++++++++--- 2 files changed, 173 insertions(+), 23 deletions(-) diff --git a/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java index 0965684..c2bd4d8 100644 --- a/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java +++ b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java @@ -83,7 +83,7 @@ public void backward(float[] delta, CLBuffer bufResult, bufNewDelta.release(); bufTempBias.release(); } - public void backward(CLBuffer bufDelta, CLBuffer bufResult, + public void backward_sep(CLBuffer bufDelta, CLBuffer bufResult, CLBuffer bufInput, int inputChannels, int inputWidth, int inputHeight, CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight, CLBuffer bufFilterDelta, CLBuffer bufBiasDelta, @@ -94,7 +94,7 @@ public void backward(CLBuffer bufDelta, CLBuffer bufRe kernels = prog.createCLKernels(); } - CLKernel deltaKernel = prog.createCLKernel("delta"); + CLKernel deltaKernel = prog.createCLKernel("delta_kernel"); deltaKernel .rewind() .putArg(inputWidth) @@ -113,7 +113,7 @@ public void backward(CLBuffer bufDelta, CLBuffer bufRe OpenCL.execute(deltaKernel, inputChannels * inputWidth * inputHeight); - CLKernel filterKernel = kernels.get("filter"); + CLKernel filterKernel = kernels.get("filter_kernel"); filterKernel .rewind() .putArg(inputChannels) @@ -133,7 +133,7 @@ public void backward(CLBuffer bufDelta, CLBuffer bufRe OpenCL.execute(filterKernel, outputChannels * inputChannels * filterSize * filterSize); - CLKernel biasKernel = kernels.get("bias"); + CLKernel biasKernel = kernels.get("bias_kernel"); biasKernel .rewind() .putArgs( @@ -155,7 +155,54 @@ public void backward(CLBuffer bufDelta, CLBuffer bufRe OpenCL.execute(biasAfterKernel, outputChannels); } + public void backward(CLBuffer bufDelta, CLBuffer bufResult, + CLBuffer bufInput, int inputChannels, int inputWidth, int inputHeight, + CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight, + CLBuffer bufFilterDelta, CLBuffer bufBiasDelta, + CLBuffer bufTempBias, + int filterSize, int stride, CLBuffer bufNewDelta, float learningRate) { + if(prog == null){ + prog = OpenCL.compile("convolution_backword.cl"); + kernels = prog.createCLKernels(); + } + CLKernel dfbKernel = kernels.get("dfb"); + dfbKernel.rewind() + .putArg(inputChannels) + .putArg(inputWidth) + .putArg(inputHeight) + .putArg(outputChannels) + .putArg(outputWidth) + .putArg(outputHeight) + .putArg(filterSize) + .putArg(stride) + .putArg(learningRate) + .putArgs( + bufInput, + bufResult, + bufFilter, + bufFilterDelta, + bufDelta, + bufNewDelta, + bufTempBias) + .putArg(inputChannels * inputWidth * inputHeight) + .putArg(outputChannels * inputChannels * filterSize * filterSize) + .putArg(outputChannels * outputWidth * outputHeight); + OpenCL.execute(dfbKernel, + inputChannels * inputWidth * inputHeight + + outputChannels * inputChannels * filterSize * filterSize + + outputChannels * outputWidth * outputHeight); + + CLKernel biasAfterKernel = kernels.get("biasAfter"); + biasAfterKernel + .rewind() + .putArg(outputWidth) + .putArg(outputHeight) + .putArgs( + bufTempBias, + bufBiasDelta); + OpenCL.execute(biasAfterKernel, outputChannels); + } public void prepare(float momentam, int filterCount, int biasCount, CLBuffer bufFilterDelta, diff --git a/src/main/resources/kernels/convolution_backword.cl b/src/main/resources/kernels/convolution_backword.cl index 6a5ea27..0d0d9a0 100644 --- a/src/main/resources/kernels/convolution_backword.cl +++ b/src/main/resources/kernels/convolution_backword.cl @@ -1,4 +1,5 @@ -__kernel void delta( +void delta_proc( + int chxxyy, int inputWidth, int inputHeight, int filterSize, @@ -10,13 +11,8 @@ __kernel void delta( __global const float *delta, __global const float *filter, int inputChannels, - __global float *newDelta, - int count + __global float *newDelta ){ - int chxxyy = get_global_id(0); - if(chxxyy >= count){ - return; - } int ch = chxxyy / (inputWidth * inputHeight); int xx = (chxxyy % (inputWidth * inputHeight)) / inputHeight; int yy = chxxyy % inputHeight; @@ -43,7 +39,43 @@ __kernel void delta( newDelta[chxxyy] = tempDelta; } -__kernel void filter( +__kernel void delta_kernel( + int inputWidth, + int inputHeight, + int filterSize, + int outputChannels, + int stride, + int outputWidth, + int outputHeight, + __global const float *result, + __global const float *delta, + __global const float *filter, + int inputChannels, + __global float *newDelta, + int count +){ + int chxxyy = get_global_id(0); + if(chxxyy >= count){ + return; + } + delta_proc( + chxxyy, + inputWidth, + inputHeight, + filterSize, + outputChannels, + stride, + outputWidth, + outputHeight, + result, + delta, + filter, + inputChannels, + newDelta); +} + +void filter_proc( + int fchij, int inputChannels, int filterSize, int outputWidth, @@ -55,13 +87,9 @@ __kernel void filter( int inputHeight, float learningRate, __global const float *input, - __global float *filter, - int count + __global float *filterDelta ){ - int fchij = get_global_id(0); - if(fchij >= count){ - return; - } + int f = fchij / ((inputChannels * filterSize) * filterSize); int ch = (fchij % ((inputChannels * filterSize) * filterSize)) / (filterSize * filterSize); int i = (fchij % (filterSize * filterSize)) / filterSize; @@ -81,10 +109,46 @@ __kernel void filter( } } } - filter[fchij] = filter[fchij] + df; + filterDelta[fchij] += df; } -__kernel void bias( +__kernel void filter_kernel( + int inputChannels, + int filterSize, + int outputWidth, + int outputHeight, + __global const float *result, + __global const float *delta, + int stride, + int inputWidth, + int inputHeight, + float learningRate, + __global const float *input, + __global float *filterDelta, + int count +){ + int fchij = get_global_id(0); + if(fchij >= count){ + return; + } + filter_proc(fchij, inputChannels, filterSize, + outputWidth, outputHeight, result, delta, + stride, inputWidth, inputHeight, learningRate, + input, filterDelta); +} + +void bias_proc( + int fxy, + __global const float *result, + __global const float *delta, + __global float *tempBiasDelta, + float learningRate +){ + float d = result[fxy]>=0.0f ? delta[fxy] : 0.0f; + tempBiasDelta[fxy] = learningRate * d; +} + +__kernel void bias_kernel( __global const float *result, __global const float *delta, __global float *tempBiasDelta, @@ -95,10 +159,8 @@ __kernel void bias( if(fxy >= count){ return; } - float d = result[fxy]>=0.0f ? delta[fxy] : 0.0f; - tempBiasDelta[fxy] = learningRate * d; + bias_proc(fxy, result, delta, tempBiasDelta, learningRate); } - __kernel void biasAfter( int outputWidth, int outputHeight, @@ -117,6 +179,47 @@ __kernel void biasAfter( biasDelta[f] += b; } +__kernel void dfb( + int inputChannels, + int inputWidth, + int inputHeight, + int outputChannels, + int outputWidth, + int outputHeight, + int filterSize, + int stride, + float learningRate, + __global float *input, + __global float *result, + __global float *filter, + __global float *filterDelta, + __global float *delta, + __global float *newDelta, + __global float *tempBiasDelta, + int deltaCount, + int filterCount, + int biasCount, + int count +){ + int n = get_global_id(0); + if(n < deltaCount){ + delta_proc(n, + inputWidth, inputHeight, + filterSize, outputChannels, stride, + outputWidth, outputHeight, + result, delta, filter, + inputChannels, newDelta); + }else if (n < deltaCount + filterCount){ + filter_proc(n - deltaCount, inputChannels, filterSize, + outputWidth, outputHeight, result, delta, + stride, inputWidth, inputHeight, learningRate, + input, filterDelta); + }else if (n < deltaCount + filterCount + biasCount){ + bias_proc(n - deltaCount - filterCount, + result, delta, tempBiasDelta, learningRate); + } +} + __kernel void prepare( float momentam, __global float* delta, From 52b3b632c00a15cf233efa916d8ebc593727ce60 Mon Sep 17 00:00:00 2001 From: kishida Date: Thu, 24 Sep 2015 13:16:31 +0900 Subject: [PATCH 18/22] =?UTF-8?q?=E3=83=87=E3=83=BC=E3=82=BF=E4=BF=9D?= =?UTF-8?q?=E5=AD=98=E3=81=AE=E4=B8=8D=E5=85=B7=E5=90=88=E3=82=92=E4=BF=AE?= =?UTF-8?q?=E6=95=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/kishida/cnn/activation/ActivationFunction.java | 2 ++ src/main/java/kishida/cnn/layers/ConvolutionLayer.java | 1 + src/main/java/kishida/cnn/layers/FullGpuEnabled.java | 4 ++++ src/main/java/kishida/cnn/layers/FullyConnect.java | 9 ++------- .../java/kishida/cnn/layers/MultiNormalizeLayer.java | 1 + 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/main/java/kishida/cnn/activation/ActivationFunction.java b/src/main/java/kishida/cnn/activation/ActivationFunction.java index ac58b07..efbc29b 100644 --- a/src/main/java/kishida/cnn/activation/ActivationFunction.java +++ b/src/main/java/kishida/cnn/activation/ActivationFunction.java @@ -5,6 +5,7 @@ */ package kishida.cnn.activation; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonSubTypes; import com.fasterxml.jackson.annotation.JsonTypeInfo; @@ -28,5 +29,6 @@ public void applyAfter(float[] values) { /** 微分 */ public abstract float diff(float value); + @JsonIgnore public abstract String getName(); } diff --git a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java index 23c773c..9a8ef59 100644 --- a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java +++ b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java @@ -46,6 +46,7 @@ public class ConvolutionLayer extends ImageNeuralLayer implements LerningLayer, @Getter int filterSize; private ActivationFunction activation; + @JsonProperty @Getter @Setter boolean useGpu; @Getter diff --git a/src/main/java/kishida/cnn/layers/FullGpuEnabled.java b/src/main/java/kishida/cnn/layers/FullGpuEnabled.java index 437ff73..d9f37aa 100644 --- a/src/main/java/kishida/cnn/layers/FullGpuEnabled.java +++ b/src/main/java/kishida/cnn/layers/FullGpuEnabled.java @@ -5,8 +5,10 @@ */ package kishida.cnn.layers; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.jogamp.opencl.CLBuffer; import java.nio.FloatBuffer; +import java.util.Objects; import kishida.cnn.opencl.OpenCL; /** @@ -14,6 +16,7 @@ * @author naoki */ public interface FullGpuEnabled { + @JsonIgnore default boolean isUseGpu(){ return true; } @@ -21,6 +24,7 @@ default boolean isUseGpu(){ void forward(CLBuffer bufInput); CLBuffer backwardBuf(CLBuffer bufInput, CLBuffer bufDelta); default CLBuffer backwardBuf(CLBuffer bufInput, float[] delta){ + Objects.requireNonNull(delta, "delta is null on " + ((NeuralLayer)this).getName()); CLBuffer bufDelta = OpenCL.createReadBuffer(delta); OpenCL.getQueue().putWriteBuffer(bufDelta, false); CLBuffer result = backwardBuf(bufInput, bufDelta); diff --git a/src/main/java/kishida/cnn/layers/FullyConnect.java b/src/main/java/kishida/cnn/layers/FullyConnect.java index 9f93f22..99aca63 100644 --- a/src/main/java/kishida/cnn/layers/FullyConnect.java +++ b/src/main/java/kishida/cnn/layers/FullyConnect.java @@ -44,8 +44,8 @@ public class FullyConnect extends NeuralLayer implements LerningLayer, FullGpuEn private int[] dropout; @Getter private float dropoutRate = 1; - //@Getter - @Setter + @JsonProperty + @Setter @Getter private boolean useGpu; private float[] newDelta; private float[] diffed; @@ -174,11 +174,6 @@ public float[] getBiasDelta() { return biasDelta; } - @Override - public boolean isUseGpu() { - return useGpu; - } - @Override public float[] getResult() { if(bufResult != null && isUseGpu()){ diff --git a/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java b/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java index a0f504c..7f3580b 100644 --- a/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java +++ b/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java @@ -24,6 +24,7 @@ public class MultiNormalizeLayer extends ImageNeuralLayer implements FullGpuEnab int size; @Getter float threshold; + @JsonProperty @Getter boolean useGpu; @JsonIgnore From 568ce27a2525db5cf1077a146e3e8a68eb26d40a Mon Sep 17 00:00:00 2001 From: kishida Date: Mon, 5 Oct 2015 03:54:10 +0900 Subject: [PATCH 19/22] =?UTF-8?q?OpenCL=E6=83=85=E5=A0=B1=E5=8F=96?= =?UTF-8?q?=E5=BE=97=E3=82=B3=E3=83=9E=E3=83=B3=E3=83=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/kishida/imagefiltering/InfoCL.java | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 src/main/java/kishida/imagefiltering/InfoCL.java diff --git a/src/main/java/kishida/imagefiltering/InfoCL.java b/src/main/java/kishida/imagefiltering/InfoCL.java new file mode 100644 index 0000000..f505b94 --- /dev/null +++ b/src/main/java/kishida/imagefiltering/InfoCL.java @@ -0,0 +1,36 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package kishida.imagefiltering; + +import com.jogamp.opencl.CLContext; +import com.jogamp.opencl.CLDevice; + +/** + * + * @author naoki + */ +public class InfoCL { + public static void main(String[] args) { + CLContext ctx = CLContext.create(); + CLDevice dev = ctx.getMaxFlopsDevice(); + System.out.println(dev); + System.out.println(memSizeString(dev.getGlobalMemSize())); + System.out.println(memSizeString(dev.getLocalMemSize())); + System.out.println(dev.getMaxWorkGroupSize()); + ctx.release(); + } + static String memSizeString(long memSize){ + double size = memSize; + String[] unit = {"", "K", "M", "G", "T"}; + for(int i = 0; i < unit.length; ++i){ + if(size < 1024){ + return String.format("%.1f%sB", size, unit[i]); + } + size /= 1024; + } + return String.format("%.1f%sPB", size); + } +} From d10384c69abc40de1824903a854768f4cd08d999 Mon Sep 17 00:00:00 2001 From: kishida Date: Tue, 6 Oct 2015 01:44:53 +0900 Subject: [PATCH 20/22] =?UTF-8?q?=E7=95=B3=E8=BE=BC=E3=81=BF=E5=B1=A4?= =?UTF-8?q?=E3=81=AE=E9=A0=86=E4=BC=9D=E6=92=AD=E3=81=AE=E3=81=A8=E3=81=8D?= =?UTF-8?q?=E3=81=AB=E3=83=AD=E3=83=BC=E3=82=AB=E3=83=AB=E3=83=A1=E3=83=A2?= =?UTF-8?q?=E3=83=AA=E3=82=92=E4=BD=BF=E3=81=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cnn/opencl/ConvolutionBackwordCL.java | 14 +++- .../cnn/opencl/ConvolutionForwardCL.java | 66 ++++++++++++++++++- .../resources/kernels/convolution_forward.cl | 57 ++++++++++++++++ 3 files changed, 135 insertions(+), 2 deletions(-) diff --git a/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java index c2bd4d8..6b2655a 100644 --- a/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java +++ b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java @@ -83,6 +83,18 @@ public void backward(float[] delta, CLBuffer bufResult, bufNewDelta.release(); bufTempBias.release(); } + public void backward(CLBuffer bufDelta, CLBuffer bufResult, + CLBuffer bufInput, int inputChannels, int inputWidth, int inputHeight, + CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight, + CLBuffer bufFilterDelta, CLBuffer bufBiasDelta, + CLBuffer bufTempBias, + int filterSize, int stride, CLBuffer bufNewDelta, float learningRate) { + backward_sep(bufDelta, bufResult, bufInput, inputChannels, inputWidth, inputHeight, + bufFilter, outputChannels, outputWidth, outputHeight, + bufFilterDelta, bufBiasDelta, bufTempBias, + filterSize, stride, bufNewDelta, learningRate); + } + public void backward_sep(CLBuffer bufDelta, CLBuffer bufResult, CLBuffer bufInput, int inputChannels, int inputWidth, int inputHeight, CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight, @@ -155,7 +167,7 @@ public void backward_sep(CLBuffer bufDelta, CLBuffer b OpenCL.execute(biasAfterKernel, outputChannels); } - public void backward(CLBuffer bufDelta, CLBuffer bufResult, + public void backward_gen(CLBuffer bufDelta, CLBuffer bufResult, CLBuffer bufInput, int inputChannels, int inputWidth, int inputHeight, CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight, CLBuffer bufFilterDelta, CLBuffer bufBiasDelta, diff --git a/src/main/java/kishida/cnn/opencl/ConvolutionForwardCL.java b/src/main/java/kishida/cnn/opencl/ConvolutionForwardCL.java index 3ecc145..2a0fb50 100644 --- a/src/main/java/kishida/cnn/opencl/ConvolutionForwardCL.java +++ b/src/main/java/kishida/cnn/opencl/ConvolutionForwardCL.java @@ -78,7 +78,7 @@ public void forward(CLBuffer bufInput, int filterSize, int stride, CLBuffer bufBias){ if(prog == null){ prog = OpenCL.compile("convolution_forward.cl"); - forwardKernel = prog.createCLKernel("forward"); + forwardKernel = prog.createCLKernel("forward_local"); } forwardKernel @@ -95,8 +95,13 @@ public void forward(CLBuffer bufInput, bufFilter, bufResult, bufBias); + /* OpenCL.execute(forwardKernel, outputChannels * outputWidth * outputHeight); + */ + forwardKernel.putArg(outputChannels * outputWidth * outputHeight); + OpenCL.getQueue().put1DRangeKernel(forwardKernel, 0, + outputChannels * outputWidth * outputHeight, outputChannels); normalizeKernel = prog.createCLKernel("localNormalize"); normalizeKernel @@ -109,5 +114,64 @@ public void forward(CLBuffer bufInput, outputChannels * outputWidth * outputHeight); } + public static void main(String[] args) { + CLProgram prog = OpenCL.compile("convolution_forward.cl"); + CLKernel forwardKernel = prog.createCLKernel("forward"); + + int inputChannels = 384; + int inputWidth = 14; + int inputHeight = 14; + int outputChannels = 384; + int outputWidth = 14; + int outputHeight = 14; + int filterSize = 3; + int stride = 1; + CLBuffer bufInput = OpenCL.createReadWriteBuffer( + inputChannels * inputWidth * inputHeight); + CLBuffer bufFilter = OpenCL.createReadWriteBuffer( + inputChannels * outputChannels * filterSize * filterSize); + CLBuffer bufResult = OpenCL.createReadWriteBuffer( + outputChannels * outputWidth * outputHeight); + CLBuffer bufBias = OpenCL.createReadWriteBuffer( + outputChannels); + long start = System.currentTimeMillis(); + for(int i = 0; i < 5000; ++i){ + forwardKernel + .rewind() + .putArg(outputHeight) + .putArg(outputWidth) + .putArg(inputChannels) + .putArg(filterSize) + .putArg(stride) + .putArg(inputWidth) + .putArg(inputHeight) + .putArgs( + bufInput, + bufFilter, + bufResult, + bufBias) + .putArg(outputChannels * outputWidth * outputHeight); + int workSize = outputChannels; + OpenCL.getQueue().put1DRangeKernel(forwardKernel, + 0, outputChannels * outputWidth * outputHeight, + workSize); + } + OpenCL.getQueue().putBarrier(); + System.out.println((System.currentTimeMillis() - start) / 1000.); + bufFilter.release(); + System.out.println((System.currentTimeMillis() - start) / 1000.); + bufInput.release(); + bufResult.release(); + bufBias.release(); + System.out.println((System.currentTimeMillis() - start) / 1000.); + + forwardKernel.release(); + prog.release(); + OpenCL.getQueue().release(); + OpenCL.getCtx().release(); + } + static int roundUp(int groupSize, int globalSize){ + return ((globalSize + groupSize - 1) / groupSize) * groupSize; + } } diff --git a/src/main/resources/kernels/convolution_forward.cl b/src/main/resources/kernels/convolution_forward.cl index 9f256f3..3af321b 100644 --- a/src/main/resources/kernels/convolution_forward.cl +++ b/src/main/resources/kernels/convolution_forward.cl @@ -16,6 +16,7 @@ __kernel void forward( if(fxy >= count){ return; } + int f = fxy / (outputHeight * outputWidth); int x = (fxy % (outputHeight * outputWidth)) / outputHeight; int y = fxy % outputHeight; @@ -39,6 +40,62 @@ __kernel void forward( result[fxy] = rs >= 0 ? rs : 0; } +__kernel void forward_local( + int outputHeight, + int outputWidth, + int inputChannels, + int filterSize, + int stride, + int inputWidth, + int inputHeight, + __global const float *input, + __global const float *filter, + __global float *result, + __global const float *bias, + int count +){ + int fxy = get_global_id(0); + if(fxy >= count){ + return; + } + int f = fxy / (outputHeight * outputWidth); + int x = (fxy % (outputHeight * outputWidth)) / outputHeight; + int y = fxy % outputHeight; + + __local float lfilter[384 * 3 * 3]; // + int len = inputChannels * filterSize * filterSize; + /* + int start = get_local_id(0) * len / outputChannels; + int end = (get_local_id(0) + 1) * len / outputChannels; + for(int i = start; i < end; ++i){ + lfilter[i] = filter[f * inputChannels * filterSize * filterSize + i]; + }*/ + event_t ev; + ev = async_work_group_copy(lfilter, + filter + f * inputChannels * filterSize * filterSize, + len, ev); + wait_group_events(1, &ev); + + float r = 0.0f; + for (int ch = 0; ch=0 && xx=0 && yy= 0 ? rs : 0; +} + + __kernel void localNormalize( int outputWidth, int outputHeight, From 84d5ec39ae77f9bc4631929489283eb31e55e8f2 Mon Sep 17 00:00:00 2001 From: kishida Date: Mon, 12 Oct 2015 05:11:17 +0900 Subject: [PATCH 21/22] =?UTF-8?q?=E6=AF=8E=E5=9B=9E=E3=83=90=E3=82=A4?= =?UTF-8?q?=E3=82=A2=E3=82=B9=E3=82=92=E8=A1=A8=E7=A4=BA=E3=81=99=E3=82=8B?= =?UTF-8?q?=E3=81=A8=E6=99=82=E9=96=93=E3=81=8C=E3=81=8B=E3=81=8B=E3=82=8B?= =?UTF-8?q?=E3=81=AE=E3=81=A7=E7=9C=81=E7=95=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/kishida/cnn/ConvolutionalNet.java | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/main/java/kishida/cnn/ConvolutionalNet.java b/src/main/java/kishida/cnn/ConvolutionalNet.java index 659b400..70de76e 100644 --- a/src/main/java/kishida/cnn/ConvolutionalNet.java +++ b/src/main/java/kishida/cnn/ConvolutionalNet.java @@ -291,23 +291,13 @@ public static void main(String[] args) throws IOException { Image lineGraph = createLineGraph(500, 200, historyData, 1, 0); historyLabel.setIcon(new ImageIcon(lineGraph)); - //一段目のフィルタの表示 - //全結合一段の表示 - firstFc.setIcon(new ImageIcon(createGraph(256, 128, fc1.getResult()))); - //全結合二段の表示 - lastResult.setIcon(new ImageIcon(createGraph(256, 128, output))); - - firstBias.setIcon(new ImageIcon(createGraph(500, 128, conv1.getBias()))); - secondBias.setIcon(new ImageIcon(createGraph(500, 128, - conv2.getBias()))); - fc1Bias.setIcon(new ImageIcon(createGraph(500, 128, fc1.getBias()))); - fc2Bias.setIcon(new ImageIcon(createGraph(500, 128, fc2.getBias()))); //System.out.println(Arrays.stream(output).mapToObj(d -> String.format("%.2f", d)).collect(Collectors.joining(","))); count[0]++; nn.setImageIndex(nn.getImageIndex() + 1); if(count[0] >= MINI_BATCH){ + nn.joinBatch(); batchCount[0]++; System.out.printf("%5d %4d %.2f/m %s %s%n", batchCount[0], @@ -335,6 +325,18 @@ public static void main(String[] args) throws IOException { pStart[0] = System.currentTimeMillis(); nn.prepareBatch(); + //一段目のフィルタの表示 + //全結合一段の表示 + firstFc.setIcon(new ImageIcon(createGraph(256, 128, fc1.getResult()))); + //全結合二段の表示 + lastResult.setIcon(new ImageIcon(createGraph(256, 128, output))); + + firstBias.setIcon(new ImageIcon(createGraph(500, 128, conv1.getBias()))); + secondBias.setIcon(new ImageIcon(createGraph(500, 128, + conv2.getBias()))); + fc1Bias.setIcon(new ImageIcon(createGraph(500, 128, fc1.getBias()))); + fc2Bias.setIcon(new ImageIcon(createGraph(500, 128, fc2.getBias()))); + // 1時間に一回保存する int hour = LocalTime.now().getHour(); if(lastHour[0] != hour){ From 8cd2cb1daf253b56199556df3eeebe873250e924 Mon Sep 17 00:00:00 2001 From: Naoki Kishida Date: Tue, 23 Oct 2018 07:26:26 +0900 Subject: [PATCH 22/22] update libraries --- pom.xml | 12 ++++++------ src/main/java/kishida/cnn/ConvolutionalNet.java | 2 +- .../cnn/kernels/ConvolutionBackwordBiasKernel.java | 2 +- .../cnn/kernels/ConvolutionBackwordDeltaKernel.java | 2 +- .../cnn/kernels/ConvolutionBackwordFilterKernel.java | 2 +- .../cnn/kernels/ConvolutionBackwordKernel.java | 2 +- .../cnn/kernels/ConvolutionForwardKernel.java | 2 +- .../kernels/ConvolutionLocalNormalizationKernel.java | 2 +- .../java/kishida/cnn/kernels/FullyForwardKernel.java | 2 +- .../java/kishida/cnn/kernels/NormalizeKernel.java | 2 +- .../java/kishida/cnn/layers/ConvolutionLayer.java | 2 +- .../kishida/imagefiltering/ConvolutionalNet.java | 2 +- 12 files changed, 17 insertions(+), 17 deletions(-) diff --git a/pom.xml b/pom.xml index 0cc4506..39ae23f 100644 --- a/pom.xml +++ b/pom.xml @@ -7,29 +7,29 @@ jar - com.amd.aparapi + com.aparapi aparapi - 1.0-kishida + 1.8.0 com.fasterxml.jackson.core jackson-databind - 2.4.3 + 2.9.7 org.projectlombok lombok - 1.16.6 + 1.18.2 org.jogamp.gluegen gluegen-rt-main - 2.1.4 + 2.3.2 org.jogamp.jocl jocl-main - 2.1.4 + 2.3.2 diff --git a/src/main/java/kishida/cnn/ConvolutionalNet.java b/src/main/java/kishida/cnn/ConvolutionalNet.java index 70de76e..8672fdf 100644 --- a/src/main/java/kishida/cnn/ConvolutionalNet.java +++ b/src/main/java/kishida/cnn/ConvolutionalNet.java @@ -95,7 +95,7 @@ BufferedImage readImage(){ @SuppressWarnings({"ThrowableInstanceNotThrown", "ThrowableInstanceNeverThrown"}) public static void main(String[] args) throws IOException { - System.setProperty("com.amd.aparapi.enableShowGeneratedOpenCL", "false"); + System.setProperty("com.aparapi.enableShowGeneratedOpenCL", "false"); String def = "C:\\Users\\naoki\\Desktop\\sampleimg288"; Path dir = Paths.get(args.length > 0 ? args[0] : def); List categories = Files.list(dir) diff --git a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordBiasKernel.java b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordBiasKernel.java index cc4cd30..1f384a8 100644 --- a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordBiasKernel.java +++ b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordBiasKernel.java @@ -5,7 +5,7 @@ */ package kishida.cnn.kernels; -import com.amd.aparapi.Kernel; +import com.aparapi.Kernel; import java.util.stream.IntStream; /** diff --git a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordDeltaKernel.java b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordDeltaKernel.java index 67f7449..5412038 100644 --- a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordDeltaKernel.java +++ b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordDeltaKernel.java @@ -5,7 +5,7 @@ */ package kishida.cnn.kernels; -import com.amd.aparapi.Kernel; +import com.aparapi.Kernel; import java.util.stream.IntStream; /** diff --git a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordFilterKernel.java b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordFilterKernel.java index 451d11e..771dded 100644 --- a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordFilterKernel.java +++ b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordFilterKernel.java @@ -5,7 +5,7 @@ */ package kishida.cnn.kernels; -import com.amd.aparapi.Kernel; +import com.aparapi.Kernel; import java.util.stream.IntStream; /** diff --git a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordKernel.java b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordKernel.java index de8d9ca..efd8d72 100644 --- a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordKernel.java +++ b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordKernel.java @@ -5,7 +5,7 @@ */ package kishida.cnn.kernels; -import com.amd.aparapi.Kernel; +import com.aparapi.Kernel; import java.util.Arrays; import java.util.stream.IntStream; diff --git a/src/main/java/kishida/cnn/kernels/ConvolutionForwardKernel.java b/src/main/java/kishida/cnn/kernels/ConvolutionForwardKernel.java index 7f62d19..90a1b68 100644 --- a/src/main/java/kishida/cnn/kernels/ConvolutionForwardKernel.java +++ b/src/main/java/kishida/cnn/kernels/ConvolutionForwardKernel.java @@ -5,7 +5,7 @@ */ package kishida.cnn.kernels; -import com.amd.aparapi.Kernel; +import com.aparapi.Kernel; import java.util.stream.IntStream; import kishida.cnn.activation.ActivationFunction; diff --git a/src/main/java/kishida/cnn/kernels/ConvolutionLocalNormalizationKernel.java b/src/main/java/kishida/cnn/kernels/ConvolutionLocalNormalizationKernel.java index a60aaa8..dd33cce 100644 --- a/src/main/java/kishida/cnn/kernels/ConvolutionLocalNormalizationKernel.java +++ b/src/main/java/kishida/cnn/kernels/ConvolutionLocalNormalizationKernel.java @@ -5,7 +5,7 @@ */ package kishida.cnn.kernels; -import com.amd.aparapi.Kernel; +import com.aparapi.Kernel; import java.util.stream.IntStream; /** diff --git a/src/main/java/kishida/cnn/kernels/FullyForwardKernel.java b/src/main/java/kishida/cnn/kernels/FullyForwardKernel.java index 17469af..2c57ac0 100644 --- a/src/main/java/kishida/cnn/kernels/FullyForwardKernel.java +++ b/src/main/java/kishida/cnn/kernels/FullyForwardKernel.java @@ -5,7 +5,7 @@ */ package kishida.cnn.kernels; -import com.amd.aparapi.Kernel; +import com.aparapi.Kernel; import java.util.stream.IntStream; /** diff --git a/src/main/java/kishida/cnn/kernels/NormalizeKernel.java b/src/main/java/kishida/cnn/kernels/NormalizeKernel.java index 8202643..d4bf793 100644 --- a/src/main/java/kishida/cnn/kernels/NormalizeKernel.java +++ b/src/main/java/kishida/cnn/kernels/NormalizeKernel.java @@ -5,7 +5,7 @@ */ package kishida.cnn.kernels; -import com.amd.aparapi.Kernel; +import com.aparapi.Kernel; import java.util.stream.IntStream; /** diff --git a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java index 9a8ef59..1e1ac22 100644 --- a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java +++ b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java @@ -5,7 +5,7 @@ */ package kishida.cnn.layers; -import com.amd.aparapi.Kernel; +import com.aparapi.Kernel; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonInclude; diff --git a/src/main/java/kishida/imagefiltering/ConvolutionalNet.java b/src/main/java/kishida/imagefiltering/ConvolutionalNet.java index 6bff969..fdb2906 100644 --- a/src/main/java/kishida/imagefiltering/ConvolutionalNet.java +++ b/src/main/java/kishida/imagefiltering/ConvolutionalNet.java @@ -1,6 +1,6 @@ package kishida.imagefiltering; -import com.amd.aparapi.Kernel; +import com.aparapi.Kernel; import java.awt.Color; import java.awt.Graphics; import java.awt.Graphics2D;