diff --git a/.gitignore b/.gitignore index 718edb6..74785ae 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ /target/ /nb-configuration.xml -/nbactions*.xml \ No newline at end of file +/nbactions*.xml +/*.log \ No newline at end of file diff --git a/pom.xml b/pom.xml index 6f79c84..8e06b50 100644 --- a/pom.xml +++ b/pom.xml @@ -14,13 +14,23 @@ com.fasterxml.jackson.core jackson-databind - 2.9.7 + 2.10.0 org.projectlombok lombok 1.18.2 + + org.jogamp.gluegen + gluegen-rt-main + 2.3.2 + + + org.jogamp.jocl + jocl-main + 2.3.2 + @@ -40,4 +50,4 @@ 1.8 1.8 - \ No newline at end of file + diff --git a/src/main/java/kishida/cnn/ConvolutionalNet.java b/src/main/java/kishida/cnn/ConvolutionalNet.java index d8acaeb..c3ff5d6 100644 --- a/src/main/java/kishida/cnn/ConvolutionalNet.java +++ b/src/main/java/kishida/cnn/ConvolutionalNet.java @@ -8,6 +8,8 @@ import java.awt.Image; import java.awt.image.BufferedImage; import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.io.Reader; import java.io.UncheckedIOException; import java.io.Writer; @@ -59,6 +61,8 @@ public class ConvolutionalNet { public static final String AVERAGE_PNG = "average.png"; private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\alexnet.json.txt"; private static final String RESOURCE_NAME = "/alexnet_def.json"; + //private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\tinynet.json.txt"; + //private static final String RESOURCE_NAME = "/tinynet_def.json"; static class Img{ @@ -89,8 +93,9 @@ BufferedImage readImage(){ static List historyData = new ArrayList<>(); static LinkedList rateData = new LinkedList<>(); + @SuppressWarnings({"ThrowableInstanceNotThrown", "ThrowableInstanceNeverThrown"}) public static void main(String[] args) throws IOException { - System.setProperty("com.amd.aparapi.enableShowGeneratedOpenCL", "false"); + System.setProperty("com.aparapi.enableShowGeneratedOpenCL", "false"); String def = 
"C:\\Users\\naoki\\Desktop\\sampleimg288"; Path dir = Paths.get(args.length > 0 ? args[0] : def); List categories = Files.list(dir) @@ -184,14 +189,15 @@ public static void main(String[] args) throws IOException { NeuralNetwork nn; - /* - try(InputStream is = ConvolutionalNet.class.getResourceAsStream(RESOURCE_NAME); - InputStreamReader isr = new InputStreamReader(is)){ - nn = NeuralNetwork.readFromJson(isr); - }*/ - - try(Reader r = Files.newBufferedReader(Paths.get(FILENAME))){ - nn = NeuralNetwork.readFromJson(r); + if(true){ + try(InputStream is = ConvolutionalNet.class.getResourceAsStream(RESOURCE_NAME); + InputStreamReader isr = new InputStreamReader(is)){ + nn = NeuralNetwork.readFromJson(isr); + } + }else{ + try(Reader r = Files.newBufferedReader(Paths.get(FILENAME))){ + nn = NeuralNetwork.readFromJson(r); + } } nn.init(); @@ -285,23 +291,13 @@ public static void main(String[] args) throws IOException { Image lineGraph = createLineGraph(500, 200, historyData, 1, 0); historyLabel.setIcon(new ImageIcon(lineGraph)); - //一段目のフィルタの表示 - //全結合一段の表示 - firstFc.setIcon(new ImageIcon(createGraph(256, 128, fc1.getResult()))); - //全結合二段の表示 - lastResult.setIcon(new ImageIcon(createGraph(256, 128, output))); - - firstBias.setIcon(new ImageIcon(createGraph(500, 128, conv1.getBias()))); - secondBias.setIcon(new ImageIcon(createGraph(500, 128, - conv2.getBias()))); - fc1Bias.setIcon(new ImageIcon(createGraph(500, 128, fc1.getBias()))); - fc2Bias.setIcon(new ImageIcon(createGraph(500, 128, fc2.getBias()))); //System.out.println(Arrays.stream(output).mapToObj(d -> String.format("%.2f", d)).collect(Collectors.joining(","))); count[0]++; nn.setImageIndex(nn.getImageIndex() + 1); if(count[0] >= MINI_BATCH){ + nn.joinBatch(); batchCount[0]++; System.out.printf("%5d %4d %.2f/m %s %s%n", batchCount[0], @@ -319,7 +315,7 @@ public static void main(String[] args) throws IOException { System.out.printf("weight: %.2f~%.2f average %.2f ", ws.getMin(), ws.getMax(), ws.getAverage()); 
DoubleSummaryStatistics bs = ((LerningLayer)layer).getBiasStatistics(); - System.out.printf("bias: %.2f~%.2f average %.2f ", + System.out.printf("bias: %.8f~%.8f average %.2f ", bs.getMin(), bs.getMax(), bs.getAverage()); } System.out.println(); @@ -329,6 +325,18 @@ public static void main(String[] args) throws IOException { pStart[0] = System.currentTimeMillis(); nn.prepareBatch(); + //一段目のフィルタの表示 + //全結合一段の表示 + firstFc.setIcon(new ImageIcon(createGraph(256, 128, fc1.getResult()))); + //全結合二段の表示 + lastResult.setIcon(new ImageIcon(createGraph(256, 128, output))); + + firstBias.setIcon(new ImageIcon(createGraph(500, 128, conv1.getBias()))); + secondBias.setIcon(new ImageIcon(createGraph(500, 128, + conv2.getBias()))); + fc1Bias.setIcon(new ImageIcon(createGraph(500, 128, fc1.getBias()))); + fc2Bias.setIcon(new ImageIcon(createGraph(500, 128, fc2.getBias()))); + // 1時間に一回保存する int hour = LocalTime.now().getHour(); if(lastHour[0] != hour){ diff --git a/src/main/java/kishida/cnn/NeuralNetwork.java b/src/main/java/kishida/cnn/NeuralNetwork.java index 640839d..809adad 100644 --- a/src/main/java/kishida/cnn/NeuralNetwork.java +++ b/src/main/java/kishida/cnn/NeuralNetwork.java @@ -10,11 +10,13 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationFeature; +import com.jogamp.opencl.CLBuffer; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.io.StringWriter; import java.io.Writer; +import java.nio.FloatBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -23,6 +25,7 @@ import java.util.stream.IntStream; import kishida.cnn.activation.LogisticFunction; import kishida.cnn.layers.ConvolutionLayer; +import kishida.cnn.layers.FullGpuEnabled; import kishida.cnn.layers.FullyConnect; import kishida.cnn.layers.InputLayer; import kishida.cnn.layers.MaxPoolingLayer; @@ -165,8 +168,23 @@ public float[] 
forward(float[] readData, float[] correctData){ delta[idx] = correctData[idx] - output[idx]; } //逆伝播 + CLBuffer bufDelta = null; for(int i = layers.size() - 1; i >= 1; --i){ - delta = layers.get(i).backward(delta); + FullGpuEnabled layer = layers.get(i) instanceof FullGpuEnabled ? + (FullGpuEnabled) layers.get(i) : null; + FullGpuEnabled pre = layers.get(i).getPreLayer() instanceof FullGpuEnabled ? + (FullGpuEnabled)layers.get(i).getPreLayer() : null; + if(true && layer != null && pre != null && layer.isUseGpu()){ + if(bufDelta == null){ + bufDelta = layer.backwardBuf(pre.getBufResult(), delta); + }else{ + bufDelta = layer.backwardBuf(pre.getBufResult(), bufDelta); + } + delta = null; + }else{ + delta = layers.get(i).backward(delta); + bufDelta = null; + } } return output; diff --git a/src/main/java/kishida/cnn/activation/ActivationFunction.java b/src/main/java/kishida/cnn/activation/ActivationFunction.java index 4768adc..efbc29b 100644 --- a/src/main/java/kishida/cnn/activation/ActivationFunction.java +++ b/src/main/java/kishida/cnn/activation/ActivationFunction.java @@ -5,6 +5,7 @@ */ package kishida.cnn.activation; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonSubTypes; import com.fasterxml.jackson.annotation.JsonTypeInfo; @@ -28,4 +29,6 @@ public void applyAfter(float[] values) { /** 微分 */ public abstract float diff(float value); + @JsonIgnore + public abstract String getName(); } diff --git a/src/main/java/kishida/cnn/activation/LimitedRectifiedLinear.java b/src/main/java/kishida/cnn/activation/LimitedRectifiedLinear.java index 4d37ccb..7c4c221 100644 --- a/src/main/java/kishida/cnn/activation/LimitedRectifiedLinear.java +++ b/src/main/java/kishida/cnn/activation/LimitedRectifiedLinear.java @@ -23,4 +23,9 @@ public float diff(float value) { return value >= 0 && value <= 2 ? 
1 : 0; } + @Override + public String getName() { + return "limitrelu"; + } + } diff --git a/src/main/java/kishida/cnn/activation/LinearFunction.java b/src/main/java/kishida/cnn/activation/LinearFunction.java index 5bb1ef2..b46eb2a 100644 --- a/src/main/java/kishida/cnn/activation/LinearFunction.java +++ b/src/main/java/kishida/cnn/activation/LinearFunction.java @@ -20,5 +20,9 @@ public float apply(float value) { public float diff(float value) { return 1; } + @Override + public String getName() { + return "linear"; + } } diff --git a/src/main/java/kishida/cnn/activation/LogisticFunction.java b/src/main/java/kishida/cnn/activation/LogisticFunction.java index 637b0f0..577468c 100644 --- a/src/main/java/kishida/cnn/activation/LogisticFunction.java +++ b/src/main/java/kishida/cnn/activation/LogisticFunction.java @@ -17,5 +17,9 @@ public float apply(float value) { public float diff(float value) { return value * (1 - value); } + @Override + public String getName() { + return "logistic"; + } } diff --git a/src/main/java/kishida/cnn/activation/RectifiedLinear.java b/src/main/java/kishida/cnn/activation/RectifiedLinear.java index b37e663..c620a0b 100644 --- a/src/main/java/kishida/cnn/activation/RectifiedLinear.java +++ b/src/main/java/kishida/cnn/activation/RectifiedLinear.java @@ -17,5 +17,9 @@ public float apply(float value) { public float diff(float value) { return value >= 0 ? 
1 : 0; } + @Override + public String getName() { + return "relu"; + } } diff --git a/src/main/java/kishida/cnn/activation/SoftMaxFunction.java b/src/main/java/kishida/cnn/activation/SoftMaxFunction.java index b6e9875..86ad50e 100644 --- a/src/main/java/kishida/cnn/activation/SoftMaxFunction.java +++ b/src/main/java/kishida/cnn/activation/SoftMaxFunction.java @@ -30,4 +30,9 @@ public float diff(float value) { return value * (1 - value); } + @Override + public String getName() { + return "softmax"; + } + } diff --git a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordDeltaKernel.java b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordDeltaKernel.java index 2666102..5412038 100644 --- a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordDeltaKernel.java +++ b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordDeltaKernel.java @@ -51,7 +51,6 @@ private void proc(int chxxyy) { } newDelta[chxxyy] = tempDelta; } - float[] input; float[] result; int inputChannels; int inputWidth; @@ -65,11 +64,10 @@ private void proc(int chxxyy) { float[] delta; float[] newDelta; - public float[] backword(float[] input, float[] delta, float[] result, + public float[] backword(float[] delta, float[] result, int inputChannels, int inputWidth, int inputHeight, float[] filter, int outputChannels, int outputWidth, int outputHeight, - int filterSize, int stride, boolean useGpu) { - this.input = input; + int filterSize, int stride, float[] newDelta, boolean useGpu) { this.delta = delta; this.inputChannels = inputChannels; this.inputWidth = inputWidth; @@ -81,12 +79,11 @@ public float[] backword(float[] input, float[] delta, float[] result, this.filterSize = filterSize; this.stride = stride; this.result = result; - this.newDelta = new float[inputChannels * inputWidth * inputHeight]; + this.newDelta = newDelta; if (useGpu) { put(filter); put(delta); put(result); - put(input); execute(inputChannels * inputWidth * inputHeight); get(newDelta); } else { diff --git 
a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordFilterKernel.java b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordFilterKernel.java index 982d455..771dded 100644 --- a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordFilterKernel.java +++ b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordFilterKernel.java @@ -44,7 +44,7 @@ private void proc(int fchij) { } } } - filter[fchij] += df; + filterDelta[fchij] += df; } float[] input; float[] result; @@ -52,7 +52,7 @@ private void proc(int fchij) { int inputChannels; int inputWidth; int inputHeight; - float[] filter; + float[] filterDelta; int outputChannels; int outputWidth; int outputHeight; @@ -62,14 +62,14 @@ private void proc(int fchij) { public void backword(float[] delta, float[] result, float[] input, int inputChannels, int inputWidth, int inputHeight, - float[] filter, int outputChannels, int outputWidth, int outputHeight, + float[] filterDelta, int outputChannels, int outputWidth, int outputHeight, int filterSize, int stride, float learningRate, boolean useGpu) { this.input = input; this.delta = delta; this.inputChannels = inputChannels; this.inputWidth = inputWidth; this.inputHeight = inputHeight; - this.filter = filter; + this.filterDelta = filterDelta; this.outputChannels = outputChannels; this.outputWidth = outputWidth; this.outputHeight = outputHeight; @@ -79,11 +79,11 @@ public void backword(float[] delta, float[] result, this.learningRate = learningRate;// / outputWidth;// * outputHeight); if (useGpu) { put(delta); - put(filter); + put(filterDelta); put(input); put(result); execute(outputChannels * inputChannels * filterSize * filterSize); - get(filter); + get(filterDelta); } else { IntStream.range(0, outputChannels).parallel().forEach((f) -> { for (int chij = 0; chij < inputChannels * filterSize * filterSize; ++chij) { diff --git a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordKernel.java b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordKernel.java index f867aeb..efd8d72 
100644 --- a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordKernel.java +++ b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordKernel.java @@ -62,7 +62,7 @@ private void proc(int fxy) { int outputHeight; int filterSize; int stride; - float[] bias; + //float[] bias; float[] delta; float learningRate; float[] tempDelta; @@ -74,7 +74,7 @@ public float[] backward(float[] delta, float[] result, float[] input, int inputChannels, int inputWidth, int inputHeight, float[] filter, int outputChannels, int outputWidth, int outputHeight, float[] filterDelta, float[] biasDelta, - int filterSize, int stride, float[] bias, float learningRate, boolean useGpu) { + int filterSize, int stride, float learningRate, boolean useGpu) { this.delta = delta; this.input = input; this.inputChannels = inputChannels; @@ -86,7 +86,7 @@ public float[] backward(float[] delta, float[] result, this.outputHeight = outputHeight; this.filterSize = filterSize; this.stride = stride; - this.bias = bias; + //this.bias = bias; this.result = result; this.tempDelta = new float[outputChannels * inputChannels * inputWidth * inputHeight]; this.learningRate = learningRate;// / (outputWidth * outputHeight); diff --git a/src/main/java/kishida/cnn/kernels/ConvolutionLocalNormalizationKernel.java b/src/main/java/kishida/cnn/kernels/ConvolutionLocalNormalizationKernel.java index dd50c82..dd33cce 100644 --- a/src/main/java/kishida/cnn/kernels/ConvolutionLocalNormalizationKernel.java +++ b/src/main/java/kishida/cnn/kernels/ConvolutionLocalNormalizationKernel.java @@ -23,8 +23,8 @@ public void localNormalization(float[] result, int outputChannels, int outputWid this.outputChannels = outputChannels; this.outputWidth = outputWidth; this.outputHeight = outputHeight; - if(useGpu && outputWidth * outputHeight > 500){ - execute(outputWidth * outputHeight); + if(useGpu){ + execute(outputChannels * outputWidth * outputHeight); //throw new UnsupportedOperationException("because I dont know how to use private memory."); 
}else{ IntStream.range(0, outputWidth).parallel().forEach(x -> { @@ -38,8 +38,8 @@ public void localNormalization(float[] result, int outputChannels, int outputWid @Override public void run() { - int xy = getGlobalId(); - procGpu(xy); + int chxy = getGlobalId(); + procGpu(chxy); } float[] result; @@ -48,30 +48,22 @@ public void run() { int outputChannels; static final int n = 5; - @PrivateMemorySpace(n) float[] sigma = new float[n]; // not work + //@PrivateMemorySpace(n) float[] sigma = new float[n]; // not work - public void procGpu(int xy){ + public void procGpu(int chxy){ final int k = 2; final float a = 0.0001f; final float b = 0.75f; - int lp = 0; - for(; lp < n / 2; ++lp){ - sigma[lp] = - result[lp * outputWidth * outputHeight + xy] * - result[lp * outputWidth * outputHeight + xy]; - } - for(int ch = 0; ch < outputChannels; ++ch){ - sigma[lp % n] = lp >= outputChannels ? 0 : - result[lp * outputWidth * outputHeight + xy] * - result[lp * outputWidth * outputHeight + xy]; - lp = lp + 1; - float sum = 0; - for(int i = 0; i < n; ++i){ - sum += sigma[i]; - } - result[ch * outputWidth * outputHeight + xy] = result[ch * outputWidth * outputHeight + xy] / - pow(k + a * sum, b); + int ch = chxy / (outputWidth * outputHeight); + int xy = chxy % (outputWidth * outputHeight); + + float sum = 0; + for(int lp = max(0, ch - n / 2); lp <= min(outputChannels - 1, ch + n / 2); ++lp){ + sum += result[lp * outputWidth * outputHeight + xy] * + result[lp * outputWidth * outputHeight + xy]; } + result[chxy] = result[chxy] / + pow(k + a * sum, b); } diff --git a/src/main/java/kishida/cnn/kernels/FullyForwardKernel.java b/src/main/java/kishida/cnn/kernels/FullyForwardKernel.java index ffaabb9..2c57ac0 100644 --- a/src/main/java/kishida/cnn/kernels/FullyForwardKernel.java +++ b/src/main/java/kishida/cnn/kernels/FullyForwardKernel.java @@ -26,10 +26,13 @@ public void run() { private void proc(int j){ if(dropout[j] == 1){ + float r = 0; for (int i = 0; i < inSize; ++i) { - result[j] += 
in[i] * weight[i * out + j]; + r += in[i] * weight[i * out + j]; } - result[j] += bias[j]; + result[j] = r + bias[j]; + }else{ + result[j] = 0; } } int out; diff --git a/src/main/java/kishida/cnn/kernels/NormalizeKernel.java b/src/main/java/kishida/cnn/kernels/NormalizeKernel.java index ff5e854..d4bf793 100644 --- a/src/main/java/kishida/cnn/kernels/NormalizeKernel.java +++ b/src/main/java/kishida/cnn/kernels/NormalizeKernel.java @@ -61,11 +61,7 @@ private void proc(int chxy) { } float std = max(threshold, sqrt(variance / count)); result[chxy] = (input[chxy] - average) / std; - averages[chxy] = average; - rates[chxy] = std; } - float[] averages; - float[] rates; float[] result; float[] input; int inputChannels; @@ -75,11 +71,9 @@ private void proc(int chxy) { float threshold; public float[] normalize(float[] input, int inputChannels, int inputWidth, int inputHeight, - int size, float[] averages, float[] rates, float threshold, boolean useGpu) { + int size, float threshold, float[] result, boolean useGpu) { this.input = input; - this.rates = rates; - this.result = new float[inputChannels * inputWidth * inputHeight]; - this.averages = averages; + this.result = result; this.inputChannels = inputChannels; this.inputWidth = inputWidth; this.inputHeight = inputHeight; @@ -88,8 +82,6 @@ public float[] normalize(float[] input, int inputChannels, int inputWidth, int i if (useGpu) { put(input); execute(inputChannels * inputWidth * inputHeight); - get(averages); - get(rates); get(result); } else { IntStream.range(0, inputChannels).parallel().forEach(ch -> { diff --git a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java index a63476e..1e1ac22 100644 --- a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java +++ b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java @@ -7,9 +7,11 @@ import com.aparapi.Kernel; import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; 
import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; -import java.util.Arrays; +import com.jogamp.opencl.CLBuffer; +import java.nio.FloatBuffer; import java.util.DoubleSummaryStatistics; import java.util.stream.IntStream; import kishida.cnn.activation.ActivationFunction; @@ -20,35 +22,49 @@ import kishida.cnn.kernels.ConvolutionBackwordKernel; import kishida.cnn.kernels.ConvolutionForwardKernel; import kishida.cnn.kernels.ConvolutionLocalNormalizationKernel; +import kishida.cnn.opencl.ConvolutionBackwordCL; +import kishida.cnn.opencl.ConvolutionForwardCL; +import kishida.cnn.opencl.OpenCL; import kishida.cnn.util.FloatUtil; import lombok.Getter; import lombok.Setter; /** 畳み込み層 */ -public class ConvolutionLayer extends ImageNeuralLayer implements LerningLayer{ +public class ConvolutionLayer extends ImageNeuralLayer implements LerningLayer, FullGpuEnabled{ @JsonInclude(JsonInclude.Include.NON_NULL) - @Getter float[] filter; + @JsonInclude(JsonInclude.Include.NON_NULL) - @Getter float[] bias; @JsonInclude(JsonInclude.Include.NON_NULL) - @Getter float[] filterDelta; @JsonInclude(JsonInclude.Include.NON_NULL) - @Getter float[] biasDelta; @Getter int stride; @Getter int filterSize; private ActivationFunction activation; + @JsonProperty @Getter @Setter boolean useGpu; @Getter float initBias; float[] tempDelta; + float[] newDelta; + + CLBuffer bufFilter; + CLBuffer bufBias; + CLBuffer bufFilterDelta; + CLBuffer bufBiasDelta; + CLBuffer bufDelta; + CLBuffer bufNewDelta; + CLBuffer bufTempBias; + + @JsonIgnore + @Getter + CLBuffer bufResult; public ConvolutionLayer(String name, int filterCount, int size, int stride, float initBias, boolean useGpu) { @@ -108,44 +124,104 @@ public final void setPreLayer(NeuralLayer preLayer) { this.result = new float[outputChannels * outputWidth * outputHeight]; this.tempDelta = new float[result.length]; + this.newDelta = new float[inputChannels * inputWidth * inputHeight]; + + if(true){ + 
this.bufFilter = OpenCL.createReadWriteBuffer(filter); + this.bufBias = OpenCL.createReadWriteBuffer(bias); + this.bufFilterDelta = OpenCL.createReadWriteBuffer(filterDelta); + this.bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); + this.bufResult = OpenCL.createReadWriteBuffer(result.length); + this.bufDelta = OpenCL.createReadWriteBuffer(result.length); + this.bufNewDelta = OpenCL.createWriteBuffer(newDelta.length); + bufTempBias = OpenCL.createReadWriteBuffer(outputChannels * outputWidth * outputHeight); + OpenCL.getQueue() + .putWriteBuffer(bufFilter, false) + .putWriteBuffer(bufBias, false) + .putWriteBuffer(bufFilterDelta, false) + .putWriteBuffer(bufBiasDelta, false); + } + } + + public float[] getFilter() { + if(bufFilter != null){ + OpenCL.getQueue().putReadBuffer(bufFilter, true); + bufFilter.getBuffer().get(filter).rewind(); + } + return filter; + } + + public float[] getBias() { + if(bufBias != null){ + OpenCL.getQueue().putReadBuffer(bufBias, true); + bufBias.getBuffer().get(bias).rewind(); + } + return bias; + } + + public float[] getFilterDelta() { + if(bufFilterDelta != null){ + OpenCL.getQueue().putReadBuffer(bufFilterDelta, true); + bufFilterDelta.getBuffer().get(filterDelta).rewind(); + } + return filterDelta; + } + + public float[] getBiasDelta() { + if(bufBiasDelta != null){ + OpenCL.getQueue().putReadBuffer(bufBiasDelta, true); + bufBiasDelta.getBuffer().get(biasDelta).rewind(); + } + return biasDelta; } - /** 畳み込みフィルタを適用する */ @Override - public float[] forward(float[] img) { - result = ConvolutionForwardKernel.INSTANCE.forward(img, inputChannels, inputWidth, inputHeight, - filter, outputChannels, outputWidth, outputHeight, result, filterSize, stride, bias, activation, useGpu); - //localNormalization(result); - ConvolutionLocalNormalizationKernel.INSTANCE.localNormalization(result, - outputChannels, outputWidth, outputHeight, false); + public float[] getResult() { + if(bufResult != null){ + OpenCL.getQueue().putReadBuffer(bufResult, 
true); + bufResult.getBuffer().get(result).rewind(); + } return result; } - private void localNormalization(float[] result){ - final int n = 5; - final int k = 2; - final float a = 0.0001f; - final float b = 0.75f; - // resultをコピーするほうが楽だけど、メモリを節約するため - final float[] sigma = new float[n]; - for(int x = 0; x < outputWidth; ++x){ - for(int y = 0; y < outputHeight; ++y){ - int xy = x * outputHeight + y; - Arrays.fill(sigma, 0); - int lp = 0; - for(; lp < n / 2; ++lp){ - sigma[lp] = result[lp * outputWidth * outputHeight + xy] * result[lp * outputWidth * outputHeight + xy]; - } - for(int ch = 0; ch < outputChannels; ++ch){ - sigma[lp % 5] = lp >= outputChannels ? 0 : - result[lp * outputWidth * outputHeight + xy] * result[lp * outputWidth * outputHeight + xy]; - lp = lp + 1; - float sum = FloatUtil.floatSum(sigma); - result[ch * outputWidth * outputHeight + xy] = result[ch * outputWidth * outputHeight + xy] / - (float)Math.pow(k + a * sum, b); + /** 畳み込みフィルタを適用する */ + @Override + public float[] forward(float[] img) { + if(true){ + if(false){ + // aparapi + result = ConvolutionForwardKernel.INSTANCE.forward(img, inputChannels, inputWidth, inputHeight, + filter, outputChannels, outputWidth, outputHeight, result, filterSize, stride, bias, activation, false); + //localNormalization(result); + ConvolutionLocalNormalizationKernel.INSTANCE.localNormalization(result, + outputChannels, outputWidth, outputHeight, false); + } else{ + // JOCL + if(true){ + ConvolutionForwardCL.INSTANCE.forward(img, inputChannels, inputWidth, inputHeight, + bufFilter, outputChannels, outputWidth, outputHeight, result, filterSize, stride, bufBias); + } else { + ConvolutionForwardCL.INSTANCE.forward(img, inputChannels, inputWidth, inputHeight, + filter, outputChannels, outputWidth, outputHeight, result, filterSize, stride, bias); } } + }else { + //CPU + result = ConvolutionForwardKernel.INSTANCE.forward(img, inputChannels, inputWidth, inputHeight, + filter, outputChannels, outputWidth, outputHeight, 
result, filterSize, stride, bias, activation, false); + //localNormalization(result); + ConvolutionLocalNormalizationKernel.INSTANCE.localNormalization(result, + outputChannels, outputWidth, outputHeight, false); } + return result; + } + + @Override + public void forward(CLBuffer input) { + ConvolutionForwardCL.INSTANCE.forward(input, + inputChannels, inputWidth, inputHeight, + bufFilter, outputChannels, outputWidth, outputHeight, + bufResult, filterSize, stride, bufBias); } /** 畳み込み層の学習 */ @@ -153,24 +229,50 @@ private void localNormalization(float[] result){ public float[] backward(float[] input, float[] delta) { if (useGpu) { // GPUバージョン - float[] newDelta = ConvolutionBackwordDeltaKernel.INSTANCE.backword(input, delta, result, - inputChannels, inputWidth, inputHeight, - filter, outputChannels, outputWidth, outputHeight, filterSize, stride, useGpu); - ConvolutionBackwordFilterKernel.INSTANCE.backword(delta, result, - input, inputChannels, inputWidth, inputHeight, - filterDelta, outputChannels, outputWidth, outputHeight, filterSize, stride, parent.getLearningRate(), useGpu); - ConvolutionBackwordBiasKernel.INSTANCE.backwordBias(delta, result, - outputChannels, outputWidth, outputHeight, biasDelta, parent.getLearningRate(), tempDelta, useGpu); - if (ConvolutionBackwordDeltaKernel.INSTANCE.getExecutionMode() != Kernel.EXECUTION_MODE.GPU || - ConvolutionBackwordFilterKernel.INSTANCE.getExecutionMode() != Kernel.EXECUTION_MODE.GPU || - ConvolutionBackwordBiasKernel.INSTANCE.getExecutionMode() != Kernel.EXECUTION_MODE.GPU) { - useGpu = false; - } - if (!useGpu) { - System.out.println("Can't use GPU on " + name); - System.out.println("delta" + ConvolutionBackwordDeltaKernel.INSTANCE.getExecutionMode()); - System.out.println("filter" + ConvolutionBackwordFilterKernel.INSTANCE.getExecutionMode()); - System.out.println("bias" + ConvolutionBackwordBiasKernel.INSTANCE.getExecutionMode()); + if(false){ + // aparapi + ConvolutionBackwordDeltaKernel.INSTANCE.backword(delta, 
result, + inputChannels, inputWidth, inputHeight, + filter, outputChannels, outputWidth, outputHeight, + filterSize, stride, newDelta, useGpu); + ConvolutionBackwordFilterKernel.INSTANCE.backword(delta, result, + input, inputChannels, inputWidth, inputHeight, + filterDelta, outputChannels, outputWidth, outputHeight, filterSize, stride, parent.getLearningRate(), useGpu); + ConvolutionBackwordBiasKernel.INSTANCE.backwordBias(delta, result, + outputChannels, outputWidth, outputHeight, biasDelta, parent.getLearningRate(), tempDelta, useGpu); + if (ConvolutionBackwordDeltaKernel.INSTANCE.getExecutionMode() != Kernel.EXECUTION_MODE.GPU || + ConvolutionBackwordFilterKernel.INSTANCE.getExecutionMode() != Kernel.EXECUTION_MODE.GPU || + ConvolutionBackwordBiasKernel.INSTANCE.getExecutionMode() != Kernel.EXECUTION_MODE.GPU) { + useGpu = false; + } + if (!useGpu) { + System.out.println("Can't use GPU on " + name); + System.out.println("delta" + ConvolutionBackwordDeltaKernel.INSTANCE.getExecutionMode()); + System.out.println("filter" + ConvolutionBackwordFilterKernel.INSTANCE.getExecutionMode()); + System.out.println("bias" + ConvolutionBackwordBiasKernel.INSTANCE.getExecutionMode()); + } + }else{ + // JOCL + if(true){ + bufDelta.getBuffer().put(delta).rewind(); + OpenCL.getQueue().putWriteBuffer(bufDelta, false); + + ConvolutionBackwordCL.INSTANCE.backward( + bufDelta, bufResult, ((FullGpuEnabled)preLayer).getBufResult(), + inputChannels, inputWidth, inputHeight, + bufFilter, outputChannels, outputWidth, outputHeight, + bufFilterDelta, bufBiasDelta, bufTempBias, filterSize, stride, bufNewDelta, + parent.getLearningRate()); + + OpenCL.getQueue().putReadBuffer(bufNewDelta, true); + bufNewDelta.getBuffer().get(newDelta).rewind(); + }else{ + ConvolutionBackwordCL.INSTANCE.backward( + delta, result, input, + inputChannels, inputWidth, inputHeight, + filter, outputChannels, outputWidth, outputHeight, + filterDelta, biasDelta, filterSize, stride, newDelta, parent.getLearningRate()); 
+ } } return newDelta; } else { @@ -179,23 +281,55 @@ public float[] backward(float[] input, float[] delta) { input, inputChannels, inputWidth, inputHeight, filter, outputChannels, outputWidth, outputHeight, filterDelta, biasDelta, - filterSize, stride, bias, parent.getLearningRate(), false); + filterSize, stride, parent.getLearningRate(), false); } } + @Override + public CLBuffer backwardBuf(CLBuffer bufInput, CLBuffer bufDelta) { + ConvolutionBackwordCL.INSTANCE.backward(bufDelta, bufResult, bufInput, + inputChannels, inputWidth, inputHeight, + bufFilter, outputChannels, outputWidth, outputHeight, + bufFilterDelta, bufBiasDelta, bufTempBias, + filterSize, stride, bufNewDelta, parent.getLearningRate()); + return bufNewDelta; + } + @Override public void prepareBatch() { - float momentam = parent.getMomentam(); - IntStream.range(0, filterDelta.length).parallel().forEach(i -> filterDelta[i] = filterDelta[i] * momentam); - IntStream.range(0, biasDelta.length).parallel().forEach(i -> biasDelta[i] = biasDelta[i] * momentam); + if(useGpu){ + ConvolutionBackwordCL.INSTANCE.prepare(parent.getMomentam(), + filterDelta.length, biasDelta.length, bufFilterDelta, bufBiasDelta); + }else{ + float momentam = parent.getMomentam(); + IntStream.range(0, filterDelta.length).parallel().forEach(i -> filterDelta[i] = filterDelta[i] * momentam); + IntStream.range(0, biasDelta.length).parallel().forEach(i -> biasDelta[i] = biasDelta[i] * momentam); + } } @Override public void joinBatch() { - float count = parent.getMiniBatch(); - IntStream.range(0, filter.length).parallel().forEach(i -> filter[i] += filterDelta[i] / count - - parent.getWeightDecay() * parent.getLearningRate() * filter[i]); - IntStream.range(0, bias.length).parallel().forEach(i -> bias[i] += biasDelta[i] / count); + if(useGpu){ + ConvolutionBackwordCL.INSTANCE.join( + parent.getWeightDecay(), parent.getLearningRate(), + filter.length, bias.length, + parent.getMiniBatch(), + bufFilter, bufFilterDelta, bufBias, bufBiasDelta); 
+ /* + bufFilter.getBuffer().put(filter).rewind(); + bufBias.getBuffer().put(bias).rewind(); + OpenCL.getQueue() + .putWriteBuffer(bufFilter, false) + .putWriteBuffer(bufBias, false); + */ + }else{ + float count = parent.getMiniBatch(); + IntStream.range(0, filter.length).parallel().forEach( + i -> filter[i] += filterDelta[i] / count + - parent.getWeightDecay() * parent.getLearningRate() * filter[i]); + IntStream.range(0, bias.length).parallel().forEach( + i -> bias[i] += biasDelta[i] / count); + } } @Override @@ -208,12 +342,12 @@ public String toString() { @Override public DoubleSummaryStatistics getWeightStatistics() { - return FloatUtil.summary(filter); + return FloatUtil.summary(getFilter()); } @Override public DoubleSummaryStatistics getBiasStatistics() { - return FloatUtil.summary(bias); + return FloatUtil.summary(getBias()); } } diff --git a/src/main/java/kishida/cnn/layers/FullGpuEnabled.java b/src/main/java/kishida/cnn/layers/FullGpuEnabled.java new file mode 100644 index 0000000..d9f37aa --- /dev/null +++ b/src/main/java/kishida/cnn/layers/FullGpuEnabled.java @@ -0,0 +1,35 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. 
+ */ +package kishida.cnn.layers; + +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.jogamp.opencl.CLBuffer; +import java.nio.FloatBuffer; +import java.util.Objects; +import kishida.cnn.opencl.OpenCL; + +/** + * + * @author naoki + */ +public interface FullGpuEnabled { + @JsonIgnore + default boolean isUseGpu(){ + return true; + } + CLBuffer getBufResult(); + void forward(CLBuffer bufInput); + CLBuffer backwardBuf(CLBuffer bufInput, CLBuffer bufDelta); + default CLBuffer backwardBuf(CLBuffer bufInput, float[] delta){ + Objects.requireNonNull(delta, "delta is null on " + ((NeuralLayer)this).getName()); + CLBuffer bufDelta = OpenCL.createReadBuffer(delta); + OpenCL.getQueue().putWriteBuffer(bufDelta, false); + CLBuffer result = backwardBuf(bufInput, bufDelta); + bufDelta.release(); + return result; + } + +} diff --git a/src/main/java/kishida/cnn/layers/FullyConnect.java b/src/main/java/kishida/cnn/layers/FullyConnect.java index 9183706..99aca63 100644 --- a/src/main/java/kishida/cnn/layers/FullyConnect.java +++ b/src/main/java/kishida/cnn/layers/FullyConnect.java @@ -6,13 +6,20 @@ package kishida.cnn.layers; import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; -import java.util.Arrays; +import com.jogamp.opencl.CLBuffer; +import java.nio.FloatBuffer; +import java.nio.IntBuffer; import java.util.DoubleSummaryStatistics; import java.util.stream.IntStream; import kishida.cnn.activation.ActivationFunction; import kishida.cnn.kernels.FullyForwardKernel; +import kishida.cnn.opencl.ConvolutionBackwordCL; +import kishida.cnn.opencl.FullyBackwordCL; +import kishida.cnn.opencl.FullyForwardCL; +import kishida.cnn.opencl.OpenCL; import kishida.cnn.util.FloatUtil; import lombok.Getter; import lombok.Setter; @@ -21,18 +28,14 @@ * * @author naoki */ -public class FullyConnect extends NeuralLayer implements 
LerningLayer{ +public class FullyConnect extends NeuralLayer implements LerningLayer, FullGpuEnabled{ @JsonInclude(JsonInclude.Include.NON_NULL) - @Getter private float[]weight; @JsonInclude(JsonInclude.Include.NON_NULL) - @Getter private float[] bias; @JsonInclude(JsonInclude.Include.NON_NULL) - @Getter private float[]weightDelta; @JsonInclude(JsonInclude.Include.NON_NULL) - @Getter private float[] biasDelta; @JsonProperty @@ -41,7 +44,8 @@ public class FullyConnect extends NeuralLayer implements LerningLayer{ private int[] dropout; @Getter private float dropoutRate = 1; - @Getter @Setter + @JsonProperty + @Setter @Getter private boolean useGpu; private float[] newDelta; private float[] diffed; @@ -50,6 +54,16 @@ public class FullyConnect extends NeuralLayer implements LerningLayer{ @Getter private float initBias; + CLBuffer bufWeight; + CLBuffer bufBias; + CLBuffer bufWeightDelta; + CLBuffer bufBiasDelta; + CLBuffer bufDropout; + @JsonIgnore + @Getter + CLBuffer bufResult; + CLBuffer bufNewDelta; + public FullyConnect(String name, int outputSize, float initBias, float dropoutRate, ActivationFunction activation, boolean useGpu) { this(name, outputSize, null, null, initBias, null, null, dropoutRate, null, activation, useGpu); } @@ -112,6 +126,61 @@ public final void setPreLayer(NeuralLayer preLayer) { if(biasDelta == null){ this.biasDelta = new float[outputSize]; } + if(useGpu){ + bufWeight = OpenCL.createReadWriteBuffer(weight); + bufBias = OpenCL.createReadWriteBuffer(bias); + bufWeightDelta = OpenCL.createReadWriteBuffer(weightDelta); + bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); + bufResult = OpenCL.createReadWriteBuffer(result.length); + bufDropout = OpenCL.createReadBuffer(dropout); + bufNewDelta = OpenCL.createReadWriteBuffer(newDelta.length); + OpenCL.getQueue() + .putWriteBuffer(bufWeight, false) + .putWriteBuffer(bufBias, false) + .putWriteBuffer(bufWeightDelta, false) + .putWriteBuffer(bufBiasDelta, false) + .putWriteBuffer(bufDropout, 
false); + } + } + + public float[] getWeight() { + if(bufWeight != null){ + OpenCL.getQueue().putReadBuffer(bufWeight, true); + bufWeight.getBuffer().get(weight).rewind(); + } + return weight; + } + + public float[] getBias() { + if(bufBias != null){ + OpenCL.getQueue().putReadBuffer(bufBias, true); + bufBias.getBuffer().get(bias).rewind(); + } + return bias; + } + public float[] getWeightDelta() { + if(bufWeightDelta != null){ + OpenCL.getQueue().putReadBuffer(bufWeightDelta, true); + bufWeightDelta.getBuffer().get(weightDelta).rewind(); + } + return weightDelta; + } + + public float[] getBiasDelta() { + if(bufBiasDelta != null){ + OpenCL.getQueue().putReadBuffer(bufBiasDelta, true); + bufBiasDelta.getBuffer().get(biasDelta).rewind(); + } + return biasDelta; + } + + @Override + public float[] getResult() { + if(bufResult != null && isUseGpu()){ + OpenCL.getQueue().putReadBuffer(bufResult, true); + bufResult.getBuffer().get(result).rewind(); + } + return result; } @JsonProperty("activationObj") @@ -132,58 +201,97 @@ public void prepareDropout() { @Override public float[] forward(float[] in) { prepareDropout(); - Arrays.fill(result, 0); - FullyForwardKernel.INSTANCE.forward(outputSize, dropout, in, result, weight, bias, useGpu); - /* - IntStream.range(0, out).parallel().filter(j -> dropout[j] == 1).forEach(j -> { - for (int i = 0; i < in.length; ++i) { - result[j] += in[i] * weight[i * out + j]; + if(useGpu){ + if(false){ + FullyForwardKernel.INSTANCE.forward(outputSize, dropout, in, result, weight, bias, useGpu); + activation.applyAfter(result); + }else{ + FullyForwardCL.INSTANCE.forward(inputSize, outputSize, dropout, in, bufWeight, bufBias, result, activation); } - result[j] += bias[j]; - });*/ - activation.applyAfter(result); + }else{ + FullyForwardKernel.INSTANCE.forward(outputSize, dropout, in, result, weight, bias, useGpu); + activation.applyAfter(result); + } return result; } + @Override + public void forward(CLBuffer input) { + prepareDropout(); + 
bufDropout.getBuffer().put(dropout).rewind(); + OpenCL.getQueue().putWriteBuffer(bufDropout, false); + + FullyForwardCL.INSTANCE.forward(inputSize, outputSize, + bufDropout, input, bufWeight, bufBias, bufResult, activation); + } + @Override public float[] backward(float[] in, float[] delta) { - Arrays.fill(newDelta, 0); - Arrays.fill(diffed, 0); - for(int i = 0; i < result.length; ++i){ - diffed[i] = activation.diff(result[i]); - } - IntStream.range(0, in.length).parallel().forEach((i) -> { - for (int j = 0; j < outputSize; ++j) { - if (dropout[j] != 1) { - continue; - } - float d = diffed[j] * delta[j]; - newDelta[i] += d * weight[i * outputSize + j];//in[i] *; - weightDelta[i * outputSize + j] += d * in[i] * parent.getLearningRate(); + if(useGpu && true){ + FullyBackwordCL.INSTANCE.backword(inputSize, outputSize, + dropout, in, delta, result, bufWeight, bufWeightDelta, bufBiasDelta, newDelta, + parent.getLearningRate(), activation); + }else{ + for(int i = 0; i < result.length; ++i){ + diffed[i] = activation.diff(result[i]); } - }); - IntStream.range(0, outputSize).parallel().filter(j -> dropout[j] == 1).forEach(j -> { - biasDelta[j] += diffed[j] * delta[j] * parent.getLearningRate(); - }); + IntStream.range(0, in.length).parallel().forEach((i) -> { + float nd = 0; + for (int j = 0; j < outputSize; ++j) { + if (dropout[j] != 1) { + continue; + } + float d = diffed[j] * delta[j]; + nd += d * weight[i * outputSize + j];//in[i] *; + weightDelta[i * outputSize + j] += d * in[i] * parent.getLearningRate(); + } + newDelta[i] = nd; + }); + IntStream.range(0, outputSize).parallel().filter(j -> dropout[j] == 1).forEach(j -> { + biasDelta[j] += diffed[j] * delta[j] * parent.getLearningRate(); + }); + } return newDelta; } + @Override + public CLBuffer backwardBuf(CLBuffer bufInput, CLBuffer bufDelta) { + FullyBackwordCL.INSTANCE.backword(inputSize, outputSize, bufDropout, + bufInput, bufDelta, bufResult, + bufWeight, bufWeightDelta, + bufBiasDelta, bufNewDelta, 
parent.getLearningRate(), activation); + return bufNewDelta; + } + @Override public void prepareBatch() { - float momentam = parent.getMomentam(); - IntStream.range(0, weightDelta.length).forEach(i -> weightDelta[i] = weightDelta[i] * momentam); - IntStream.range(0, biasDelta.length).parallel().forEach(i -> biasDelta[i] = biasDelta[i] * momentam); + if(useGpu & true){ + ConvolutionBackwordCL.INSTANCE.prepare(parent.getMomentam(), + weightDelta.length, biasDelta.length, bufWeightDelta, bufBiasDelta); + }else{ + float momentam = parent.getMomentam(); + IntStream.range(0, weightDelta.length).forEach(i -> weightDelta[i] = weightDelta[i] * momentam); + IntStream.range(0, biasDelta.length).parallel().forEach(i -> biasDelta[i] = biasDelta[i] * momentam); + } } @Override public void joinBatch() { - IntStream.range(0, weight.length).parallel().forEach(ij -> { - weight[ij] += weightDelta[ij] / parent.getMiniBatch() - - weight[ij] * parent.getWeightDecay() * parent.getLearningRate(); - }); - IntStream.range(0, bias.length).parallel().forEach(i -> { - bias[i] += biasDelta[i] / parent.getMiniBatch(); - }); + if(useGpu & true){ + ConvolutionBackwordCL.INSTANCE.join( + parent.getWeightDecay(), parent.getLearningRate(), + weight.length, bias.length, + parent.getMiniBatch(), + bufWeight, bufWeightDelta, bufBias, bufBiasDelta); + }else{ + IntStream.range(0, weight.length).parallel().forEach(ij -> { + weight[ij] += weightDelta[ij] / parent.getMiniBatch() + - weight[ij] * parent.getWeightDecay() * parent.getLearningRate(); + }); + IntStream.range(0, bias.length).parallel().forEach(i -> { + bias[i] += biasDelta[i] / parent.getMiniBatch(); + }); + } } @Override diff --git a/src/main/java/kishida/cnn/layers/InputLayer.java b/src/main/java/kishida/cnn/layers/InputLayer.java index 88a38f4..80c380e 100644 --- a/src/main/java/kishida/cnn/layers/InputLayer.java +++ b/src/main/java/kishida/cnn/layers/InputLayer.java @@ -6,13 +6,21 @@ package kishida.cnn.layers; import 
com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonProperty; +import com.jogamp.opencl.CLBuffer; +import java.nio.FloatBuffer; +import kishida.cnn.opencl.OpenCL; +import lombok.Getter; /** * * @author naoki */ -public class InputLayer extends ImageNeuralLayer { +public class InputLayer extends ImageNeuralLayer implements FullGpuEnabled { + @JsonIgnore + @Getter + CLBuffer bufResult; public InputLayer(int width, int height) { this("input", width, height); @@ -24,6 +32,7 @@ public InputLayer( @JsonProperty("width") int width, @JsonProperty("height") int height) { super("input", 0, 0, 0, 3, width, height); + bufResult = OpenCL.createWriteBuffer(outputChannels * outputWidth * outputHeight); } @Override @@ -39,20 +48,48 @@ public int getHeight() { return super.outputHeight; } + @Override + public boolean isUseGpu() { + return false; + } + @Override public float[] forward(float[] in) { this.result = in; + bufResult.getBuffer().put(result); + OpenCL.getQueue() + .putWriteBuffer(bufResult, false); return result; } + @Override + public void forward(CLBuffer input) { + // do nothing + } + @Override public float[] backward(float[] in, float[] delta) { // do nothing return null; } + @Override + public CLBuffer backwardBuf(CLBuffer bufInput, CLBuffer bufDelta) { + // do nothing + return null; + } + + @Override + public CLBuffer backwardBuf(CLBuffer bufInput, float[] delta) { + // do nothing + return null; + } + public void setInput(float[] input){ result = input; + bufResult.getBuffer().put(result).rewind(); + OpenCL.getQueue() + .putWriteBuffer(bufResult, false); } @Override diff --git a/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java b/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java index ad7f4fe..9625241 100644 --- a/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java +++ b/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java @@ -6,21 +6,33 @@ package 
kishida.cnn.layers; import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonProperty; +import com.jogamp.opencl.CLBuffer; +import java.nio.FloatBuffer; import java.util.Arrays; +import java.util.Random; +import java.util.stream.Collectors; import java.util.stream.IntStream; +import kishida.cnn.opencl.MaxPoolingCL; +import kishida.cnn.opencl.OpenCL; import lombok.Getter; /** * * @author naoki */ -public class MaxPoolingLayer extends ImageNeuralLayer { +public class MaxPoolingLayer extends ImageNeuralLayer implements FullGpuEnabled { @Getter int size; @Getter int stride; float[] newDelta; + @JsonIgnore + @Getter + CLBuffer bufResult; + + CLBuffer bufNewDelta; @JsonCreator public MaxPoolingLayer( @@ -40,74 +52,154 @@ public final void setPreLayer(NeuralLayer preLayer) { outputHeight = inputHeight / stride; result = new float[outputChannels * outputWidth * outputHeight]; newDelta = new float[inputChannels * inputWidth * inputHeight]; + bufResult = OpenCL.createReadWriteBuffer(result.length); + bufNewDelta = OpenCL.createReadWriteBuffer(newDelta.length); + } + + @Override + public float[] getResult() { + if(bufResult != null){ + OpenCL.getQueue().putReadBuffer(bufResult, true); + bufResult.getBuffer().get(result).rewind(); + } + return result; } /** プーリング(max) */ @Override public float[] forward(float[] data) { - IntStream.range(0, inputChannels).parallel().forEach(ch -> { - for (int x = 0; x < outputWidth; ++x) { - for (int y = 0; y < outputHeight; ++y) { - float max = Float.NEGATIVE_INFINITY; - for (int i = 0; i < size; ++i) { - int xx = x * stride + i - size / 2; - if (xx < 0 || xx >= inputWidth) { - continue; - } - for (int j = 0; j < size; ++j) { - int yy = y * stride + j - size / 2; - if (yy < 0 || yy >= inputHeight) { + if(false){ + MaxPoolingCL.INSTANCE.forward(inputChannels, inputWidth, inputHeight, + outputWidth, outputHeight, size, stride, data, result); + }else{ + 
IntStream.range(0, inputChannels).parallel().forEach(ch -> { + for (int x = 0; x < outputWidth; ++x) { + for (int y = 0; y < outputHeight; ++y) { + float max = Float.NEGATIVE_INFINITY; + for (int i = 0; i < size; ++i) { + int xx = x * stride + i - size / 2; + if (xx < 0 || xx >= inputWidth) { continue; } - float d = data[ch * inputWidth * inputHeight + xx * inputHeight + yy]; - if (max < d) { - max = d; + for (int j = 0; j < size; ++j) { + int yy = y * stride + j - size / 2; + if (yy < 0 || yy >= inputHeight) { + continue; + } + float d = data[ch * inputWidth * inputHeight + xx * inputHeight + yy]; + if (max < d) { + max = d; + } } } + result[ch * outputWidth * outputHeight + x * outputHeight + y] = max; } - result[ch * outputWidth * outputHeight + x * outputHeight + y] = max; } - } - }); + }); + } return result; } + @Override + public void forward(CLBuffer input) { + MaxPoolingCL.INSTANCE.forward(inputChannels, inputWidth, inputHeight, + outputWidth, outputHeight, size, stride, + input, bufResult); + } + @Override public float[] backward(float[] in, float[] delta) { - Arrays.fill(newDelta, 0); - IntStream.range(0, inputChannels).parallel().forEach(ch -> { - for (int x = 0; x < outputWidth; ++x) { - for (int y = 0; y < outputHeight; ++y) { - float max = Float.NEGATIVE_INFINITY; - int maxX = 0; - int maxY = 0; - for (int i = 0; i < size; ++i) { - int xx = x * stride + i - size / 2; - if (xx < 0 || xx >= inputWidth) { - continue; - } - for (int j = 0; j < size; ++j) { - int yy = y * stride + j - size / 2; - if (yy < 0 || yy >= inputHeight) { + return backward(in, delta, false); + } + public float[] backward(float[] in, float[] delta, boolean gpu) { + if(gpu){ + MaxPoolingCL.INSTANCE.backword(inputChannels, inputWidth, inputHeight, + outputWidth, outputHeight, size, stride, in, delta, newDelta); + }else{ + Arrays.fill(newDelta, 0); + IntStream.range(0, inputChannels).parallel().forEach(ch -> { + for (int x = 0; x < outputWidth; ++x) { + for (int y = 0; y < 
outputHeight; ++y) { + float max = Float.NEGATIVE_INFINITY; + int maxX = 0; + int maxY = 0; + for (int i = 0; i < size; ++i) { + int xx = x * stride + i - size / 2; + if (xx < 0 || xx >= inputWidth) { continue; } - float d = in[ch * inputWidth * inputHeight + xx * inputWidth + yy]; - if (max < d) { - max = d; - maxX = xx; - maxY = yy; + for (int j = 0; j < size; ++j) { + int yy = y * stride + j - size / 2; + if (yy < 0 || yy >= inputHeight) { + continue; + } + float d = in[ch * inputWidth * inputHeight + xx * inputWidth + yy]; + if (max < d) { + max = d; + maxX = xx; + maxY = yy; + } } } + int chxy = ch * outputWidth * outputHeight + x * outputHeight + y; + newDelta[ch * inputWidth * inputHeight + maxX * inputHeight + maxY] += + delta[chxy]; } - int chxy = ch * outputWidth * outputHeight + x * outputHeight + y; - newDelta[ch * inputWidth * inputHeight + maxX * inputHeight + maxY] += - delta[chxy]; } - } - }); + }); + } return newDelta; } + @Override + public CLBuffer backwardBuf(CLBuffer bufInput, CLBuffer bufDelta) { + MaxPoolingCL.INSTANCE.backword(inputChannels, inputWidth, inputHeight, + outputWidth, outputHeight, size, stride, + bufInput, bufDelta, bufNewDelta); + return bufNewDelta; + } + + public static void main(String[] args) { + InputLayer input = new InputLayer(6, 6); + MaxPoolingLayer pool = new MaxPoolingLayer("test_pool", 3, 2); + pool.setPreLayer(input); + for(int i = 0; i < pool.newDelta.length; ++i){ + pool.newDelta[i] = 3; + } + float[] in = new float[6 * 6 * 3]; + for(int i = 0; i < in.length; ++i){ + in[i] = i; + } + float[] delta = { + 0.01f, 0.02f, 0.03f, 0.05f, 0.07f, 0.11f, 0.13f, 0.17f, 0.19f, + 1, 2, 3, 5, 7, 11, 13, 17, 19, + 1, 2, 3, 5, 7, 11, 13, 17, 19}; + float[] newDeltaGpu = pool.backward(in, delta, true); + float[] newDeltaCpu = pool.backward(in, delta, false); + System.out.println(Arrays.equals(newDeltaCpu, newDeltaGpu)); + IntStream.range(0, newDeltaGpu.length / 6 / 3).forEach(i -> { + System.out.println(IntStream.range(0, 6) + 
.map(n -> n + i * 6) + .mapToObj(n -> "" + (int)(newDeltaGpu[n]*100)) + .collect(Collectors.joining(","))); + }); + + Random r = new Random(); + for(int t = 0; t < 1000; ++t){ + for(int i = 0; i < in.length; ++i){ + in[i] = r.nextFloat(); + } + for(int i = 0; i < delta.length; ++i){ + delta[i] = r.nextFloat(); + } + float[] newDeltaGpu2 = pool.backward(in, delta, true); + float[] newDeltaCpu2 = pool.backward(in, delta, false); + if(!Arrays.equals(newDeltaCpu2, newDeltaGpu2)){ + System.out.println("wrong"); + }; + } + } + @Override public String toString() { return String.format("%s:Max pooling size:%dx%d stride:%d in:%dx%dx%d out %dx%dx%d", diff --git a/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java b/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java index c776d3a..7f3580b 100644 --- a/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java +++ b/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java @@ -6,24 +6,30 @@ package kishida.cnn.layers; import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonProperty; +import com.jogamp.opencl.CLBuffer; +import java.nio.FloatBuffer; import java.util.stream.IntStream; +import kishida.cnn.opencl.MultiNormalizeCL; +import kishida.cnn.opencl.OpenCL; import lombok.Getter; /** * * @author naoki */ -public class MultiNormalizeLayer extends ImageNeuralLayer{ +public class MultiNormalizeLayer extends ImageNeuralLayer implements FullGpuEnabled{ @Getter int size; @Getter float threshold; + @JsonProperty @Getter boolean useGpu; - - float[] averages; - float[] rates; + @JsonIgnore + @Getter + CLBuffer bufResult; @JsonCreator public MultiNormalizeLayer( @@ -43,70 +49,90 @@ public final void setPreLayer(NeuralLayer preLayer) { outputChannels = inputChannels; outputWidth = inputWidth; outputHeight = inputHeight; - averages = new float[inputWidth * inputHeight]; - rates = new float[inputWidth * inputHeight]; result = new 
float[inputChannels * inputHeight * inputWidth]; + bufResult = OpenCL.createReadWriteBuffer(result.length); } @Override - public float[] forward(float[] in) { + public float[] getResult() { + if(bufResult != null){ + OpenCL.getQueue().putReadBuffer(bufResult, true); + bufResult.getBuffer().get(result).rewind(); + } + return result; + } - IntStream.range(0, inputWidth).parallel().forEach(x -> { - for(int y = 0; y < inputHeight; ++y){ - float total = 0; - int count = 0; - for(int i = 0; i < size; ++i){ - int xx = x + i - size / 2; - if(xx < 0 || xx >= inputWidth){ - continue; - } - for(int j = 0; j < size; ++j){ - int yy = y + j - size / 2; - if(yy < 0 || yy >= inputHeight){ + @Override + public float[] forward(float[] in) { + if(false){ + MultiNormalizeCL.INSTANCE.normalize(inputChannels, inputWidth, inputHeight, + size, threshold, in, result); + } else{ + IntStream.range(0, inputWidth).parallel().forEach(x -> { + for(int y = 0; y < inputHeight; ++y){ + float total = 0; + int count = 0; + for(int i = 0; i < size; ++i){ + int xx = x + i - size / 2; + if(xx < 0 || xx >= inputWidth){ continue; } - for(int ch = 0; ch < inputChannels; ++ch){ - total += in[ch * inputHeight * inputWidth + xx * inputHeight + yy]; - ++count; + for(int j = 0; j < size; ++j){ + int yy = y + j - size / 2; + if(yy < 0 || yy >= inputHeight){ + continue; + } + for(int ch = 0; ch < inputChannels; ++ch){ + total += in[ch * inputHeight * inputWidth + xx * inputHeight + yy]; + ++count; + } } } - } - float average = total / count; - float variance = 0; - for(int i = 0; i < size; ++i){ - int xx = x + i - size / 2; - if(xx < 0 || xx >= inputWidth){ - continue; - } - for(int j = 0; j < size; ++j){ - int yy = y + j - size / 2; - if(yy < 0 || yy >= inputHeight){ + float average = total / count; + float variance = 0; + for(int i = 0; i < size; ++i){ + int xx = x + i - size / 2; + if(xx < 0 || xx >= inputWidth){ continue; } - for(int ch = 0; ch < inputChannels; ++ch){ - float data = in[ch * inputHeight * 
inputWidth + xx * inputHeight + yy]; - variance += (data - average) * (data - average); + for(int j = 0; j < size; ++j){ + int yy = y + j - size / 2; + if(yy < 0 || yy >= inputHeight){ + continue; + } + for(int ch = 0; ch < inputChannels; ++ch){ + float data = in[ch * inputHeight * inputWidth + xx * inputHeight + yy]; + variance += (data - average) * (data - average); + } } } + float std = Math.max(threshold, (float)Math.sqrt(variance / count)); + for(int ch = 0; ch < inputChannels; ++ch){ + int pos = ch * inputHeight * inputWidth + x * inputHeight + y; + result[pos] = (in[pos] - average) / std; + } } - float std = Math.max(threshold, (float)Math.sqrt(variance / count)); - averages[x * inputHeight + y] = average; - rates[x * inputHeight + y] = std; - for(int ch = 0; ch < inputChannels; ++ch){ - int pos = ch * inputHeight * inputWidth + x * inputHeight + y; - result[pos] = (in[pos] - average) / std; - } - } - }); - + }); + } return result; } + @Override + public void forward(CLBuffer input) { + MultiNormalizeCL.INSTANCE.normalize(inputChannels, inputWidth, inputHeight, size, + threshold, input, bufResult); + } + @Override public float[] backward(float[] in, float[] delta) { return delta; } + @Override + public CLBuffer backwardBuf(CLBuffer bufInput, CLBuffer bufDelta) { + return bufDelta; + } + @Override public String toString() { return String.format("%s:Multi channel normalize size:%dx%d in:%dx%dx%d out %dx%dx%d", diff --git a/src/main/java/kishida/cnn/layers/NeuralLayer.java b/src/main/java/kishida/cnn/layers/NeuralLayer.java index 27f0b03..7906c5e 100644 --- a/src/main/java/kishida/cnn/layers/NeuralLayer.java +++ b/src/main/java/kishida/cnn/layers/NeuralLayer.java @@ -36,7 +36,8 @@ public abstract class NeuralLayer { @Getter float[] result; - @Setter + @JsonIgnore + @Setter @Getter NeuralLayer preLayer; @Setter @@ -46,9 +47,14 @@ public NeuralLayer(String name) { this.name = name; } - public float[] forward() { + public void forward() { 
Objects.requireNonNull(preLayer, "preLayer is null on " + name); - return forward(preLayer.result); + if(this instanceof FullGpuEnabled && preLayer instanceof FullGpuEnabled && + ((FullGpuEnabled)this).isUseGpu()){ + ((FullGpuEnabled)this).forward(((FullGpuEnabled)preLayer).getBufResult()); + }else{ + forward(preLayer.getResult()); + } } public float[] backward(float[] delta) { @@ -71,7 +77,7 @@ public void joinBatch(){ @JsonIgnore public DoubleSummaryStatistics getResultStatistics(){ - return FloatUtil.summary(result); + return FloatUtil.summary(getResult()); } } diff --git a/src/main/java/kishida/cnn/layers/NormalizeLayer.java b/src/main/java/kishida/cnn/layers/NormalizeLayer.java index ff55efe..c4097a7 100644 --- a/src/main/java/kishida/cnn/layers/NormalizeLayer.java +++ b/src/main/java/kishida/cnn/layers/NormalizeLayer.java @@ -38,7 +38,7 @@ public float[] forward(float[] in) { averages = new float[in.length]; rates = new float[in.length]; result = NormalizeKernel.INSTANCE.normalize(in, inputChannels, inputWidth, inputHeight, - size, averages, rates, threshold, useGpu); + size, threshold, result, useGpu); return result; } diff --git a/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java new file mode 100644 index 0000000..6b2655a --- /dev/null +++ b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java @@ -0,0 +1,283 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. 
+ */ +package kishida.cnn.opencl; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLKernel; +import com.jogamp.opencl.CLProgram; +import java.nio.FloatBuffer; +import java.util.Map; + +/** + * + * @author naoki + */ +public class ConvolutionBackwordCL { + public static ConvolutionBackwordCL INSTANCE = new ConvolutionBackwordCL(); + CLProgram prog; + Map kernels; + + private ConvolutionBackwordCL() { + } + + public void backward(float[] delta, float[] result, + float[] input, int inputChannels, int inputWidth, int inputHeight, + float[] filter, int outputChannels, int outputWidth, int outputHeight, + float[] filterDelta, float[] biasDelta, + int filterSize, int stride, float[] newDelta, float learningRate) { + CLBuffer bufFilter = OpenCL.createReadBuffer(filter); + CLBuffer bufFilterDelta = OpenCL.createReadWriteBuffer(filterDelta); + CLBuffer bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); + CLBuffer bufResult = OpenCL.createReadBuffer(result); + CLBuffer bufInput = OpenCL.createReadBuffer(input); + OpenCL.getQueue() + .putWriteBuffer(bufFilter, false) + .putWriteBuffer(bufFilterDelta, false) + .putWriteBuffer(bufBiasDelta, false) + .putWriteBuffer(bufInput, false) + .putWriteBuffer(bufResult, false); + + backward(delta, bufResult, + bufInput, inputChannels, inputWidth, inputHeight, + bufFilter, outputChannels, outputWidth, outputHeight, + bufFilterDelta, bufBiasDelta, + filterSize, stride, newDelta, learningRate); + + OpenCL.getQueue() + .putReadBuffer(bufBiasDelta, true) + .putReadBuffer(bufFilterDelta, true); + bufFilterDelta.getBuffer().get(filterDelta); + bufBiasDelta.getBuffer().get(biasDelta); + + bufFilter.release(); + bufFilterDelta.release(); + bufBiasDelta.release(); + bufInput.release(); + bufResult.release(); + } + public void backward(float[] delta, CLBuffer bufResult, + CLBuffer bufInput, int inputChannels, int inputWidth, int inputHeight, + CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight, + CLBuffer 
bufFilterDelta, CLBuffer bufBiasDelta, + int filterSize, int stride, float[] newDelta, float learningRate) { + CLBuffer bufDelta = OpenCL.createReadBuffer(delta); + CLBuffer bufNewDelta = OpenCL.createWriteBuffer(newDelta.length); + CLBuffer bufTempBias = OpenCL.createReadWriteBuffer(outputChannels * outputWidth * outputHeight); + OpenCL.getQueue() + .putWriteBuffer(bufDelta, false); + + backward(bufDelta, bufResult, + bufInput, inputChannels, inputWidth, inputHeight, + bufFilter, outputChannels, outputWidth, outputHeight, + bufFilterDelta, bufBiasDelta, bufTempBias, + filterSize, stride, bufNewDelta, learningRate); + + OpenCL.getQueue() + .putReadBuffer(bufNewDelta, true); + bufNewDelta.getBuffer().get(newDelta); + + bufDelta.release(); + bufNewDelta.release(); + bufTempBias.release(); + } + public void backward(CLBuffer bufDelta, CLBuffer bufResult, + CLBuffer bufInput, int inputChannels, int inputWidth, int inputHeight, + CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight, + CLBuffer bufFilterDelta, CLBuffer bufBiasDelta, + CLBuffer bufTempBias, + int filterSize, int stride, CLBuffer bufNewDelta, float learningRate) { + backward_sep(bufDelta, bufResult, bufInput, inputChannels, inputWidth, inputHeight, + bufFilter, outputChannels, outputWidth, outputHeight, + bufFilterDelta, bufBiasDelta, bufTempBias, + filterSize, stride, bufNewDelta, learningRate); + } + + public void backward_sep(CLBuffer bufDelta, CLBuffer bufResult, + CLBuffer bufInput, int inputChannels, int inputWidth, int inputHeight, + CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight, + CLBuffer bufFilterDelta, CLBuffer bufBiasDelta, + CLBuffer bufTempBias, + int filterSize, int stride, CLBuffer bufNewDelta, float learningRate) { + if(prog == null){ + prog = OpenCL.compile("convolution_backword.cl"); + kernels = prog.createCLKernels(); + } + + CLKernel deltaKernel = prog.createCLKernel("delta_kernel"); + deltaKernel + .rewind() + .putArg(inputWidth) + 
.putArg(inputHeight) + .putArg(filterSize) + .putArg(outputChannels) + .putArg(stride) + .putArg(outputWidth) + .putArg(outputHeight) + .putArgs( + bufResult, + bufDelta, + bufFilter) + .putArg(inputChannels) + .putArg(bufNewDelta); + OpenCL.execute(deltaKernel, + inputChannels * inputWidth * inputHeight); + + CLKernel filterKernel = kernels.get("filter_kernel"); + filterKernel + .rewind() + .putArg(inputChannels) + .putArg(filterSize) + .putArg(outputWidth) + .putArg(outputHeight) + .putArgs( + bufResult, + bufDelta) + .putArg(stride) + .putArg(inputWidth) + .putArg(inputHeight) + .putArg(learningRate) + .putArgs( + bufInput, + bufFilterDelta); + OpenCL.execute(filterKernel, + outputChannels * inputChannels * filterSize * filterSize); + + CLKernel biasKernel = kernels.get("bias_kernel"); + biasKernel + .rewind() + .putArgs( + bufResult, + bufDelta, + bufTempBias) + .putArg(learningRate); + OpenCL.execute(biasKernel, + outputChannels * outputWidth * outputHeight); + + CLKernel biasAfterKernel = kernels.get("biasAfter"); + biasAfterKernel + .rewind() + .putArg(outputWidth) + .putArg(outputHeight) + .putArgs( + bufTempBias, + bufBiasDelta); + OpenCL.execute(biasAfterKernel, outputChannels); + + } + public void backward_gen(CLBuffer bufDelta, CLBuffer bufResult, + CLBuffer bufInput, int inputChannels, int inputWidth, int inputHeight, + CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight, + CLBuffer bufFilterDelta, CLBuffer bufBiasDelta, + CLBuffer bufTempBias, + int filterSize, int stride, CLBuffer bufNewDelta, float learningRate) { + if(prog == null){ + prog = OpenCL.compile("convolution_backword.cl"); + kernels = prog.createCLKernels(); + } + CLKernel dfbKernel = kernels.get("dfb"); + dfbKernel.rewind() + .putArg(inputChannels) + .putArg(inputWidth) + .putArg(inputHeight) + .putArg(outputChannels) + .putArg(outputWidth) + .putArg(outputHeight) + .putArg(filterSize) + .putArg(stride) + .putArg(learningRate) + .putArgs( + bufInput, + bufResult, + 
bufFilter, + bufFilterDelta, + bufDelta, + bufNewDelta, + bufTempBias) + .putArg(inputChannels * inputWidth * inputHeight) + .putArg(outputChannels * inputChannels * filterSize * filterSize) + .putArg(outputChannels * outputWidth * outputHeight); + OpenCL.execute(dfbKernel, + inputChannels * inputWidth * inputHeight + + outputChannels * inputChannels * filterSize * filterSize + + outputChannels * outputWidth * outputHeight); + + CLKernel biasAfterKernel = kernels.get("biasAfter"); + biasAfterKernel + .rewind() + .putArg(outputWidth) + .putArg(outputHeight) + .putArgs( + bufTempBias, + bufBiasDelta); + OpenCL.execute(biasAfterKernel, outputChannels); + + } + public void prepare(float momentam, + int filterCount, int biasCount, + CLBuffer bufFilterDelta, + CLBuffer bufBiasDelta){ + + CLKernel kernel = kernels.get("prepare"); + kernel.rewind() + .putArg(momentam) + .putArg(bufFilterDelta); + OpenCL.execute(kernel, filterCount); + kernel.rewind() + .putArg(momentam) + .putArg(bufBiasDelta); + OpenCL.execute(kernel, biasCount); + } + + public void join(float weightDecay, float learningRate, + int filterCount, int biasCount, int count, + CLBuffer bufFilter, CLBuffer bufFilterDelta, + CLBuffer bufBias, CLBuffer bufBiasDelta){ + CLKernel kernelFilter = kernels.get("joinFilter"); + kernelFilter.rewind() + .putArg(weightDecay) + .putArg(learningRate) + .putArg(count) + .putArgs( + bufFilter, + bufFilterDelta); + OpenCL.execute(kernelFilter, filterCount); + + CLKernel kernelBias = kernels.get("joinBias"); + kernelBias.rewind() + .putArg(count) + .putArgs( + bufBias, + bufBiasDelta); + OpenCL.execute(kernelBias, biasCount); + + } + + public static void main(String[] args) { + int inputChannels = 3; + int inputWidth = 200; + int inputHeight = 200; + int stride = 3; + int filterSize = 11; + int outputChannels = 24; + int outputWidth = inputWidth / stride; + int outputHeight = inputHeight / stride; + float[] input = new float[inputChannels * inputWidth * inputHeight]; + float[] 
newDelta = new float[input.length]; + float[] filter = new float[inputChannels * outputChannels * filterSize * filterSize]; + float[] filterDelta = new float[filter.length]; + float[] biasDelta = new float[outputChannels]; + float[] result = new float[outputChannels * outputWidth * outputHeight]; + float[] delta = new float[result.length]; + float learningRate = 0.001f; + + for(int i = 0; i < 3; ++i){ + System.out.println(i + 1); + ConvolutionBackwordCL.INSTANCE.backward(delta, result, + input, inputChannels, inputWidth, inputHeight, + filter, outputChannels, outputWidth, outputHeight, filterDelta, biasDelta, filterSize, stride, newDelta, learningRate); + } + } +} diff --git a/src/main/java/kishida/cnn/opencl/ConvolutionForwardCL.java b/src/main/java/kishida/cnn/opencl/ConvolutionForwardCL.java new file mode 100644 index 0000000..2a0fb50 --- /dev/null +++ b/src/main/java/kishida/cnn/opencl/ConvolutionForwardCL.java @@ -0,0 +1,177 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. 
+ */ +package kishida.cnn.opencl; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLKernel; +import com.jogamp.opencl.CLProgram; +import java.nio.FloatBuffer; + +/** + * + * @author naoki + */ +public class ConvolutionForwardCL { + public static ConvolutionForwardCL INSTANCE = new ConvolutionForwardCL(); + CLProgram prog; + CLKernel forwardKernel; + CLKernel normalizeKernel; + + private ConvolutionForwardCL() { + } + + /** + * バッファを外部にもたない + */ + public void forward(float[] input, int inputChannels, int inputWidth, int inputHeight, + float[] filter, int outputChannels, int outputWidth, int outputHeight, float[] result, + int filterSize, int stride, float[] bias){ + + CLBuffer bufFilter = OpenCL.createReadBuffer(filter); + CLBuffer bufBias = OpenCL.createReadBuffer(bias); + + OpenCL.getQueue() + .putWriteBuffer(bufFilter, false) + .putWriteBuffer(bufBias, false); + + forward(input, inputChannels, inputWidth, inputHeight, + bufFilter, outputChannels, outputWidth, outputHeight, result, + filterSize, stride, bufBias); + + bufBias.release(); + bufFilter.release(); + } + + /** + * filterとbiasは外部管理 + */ + public void forward(float[] input, + int inputChannels, int inputWidth, int inputHeight, + CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight, + float[] result, + int filterSize, int stride, CLBuffer bufBias){ + + CLBuffer bufInput = OpenCL.createReadBuffer(input); + CLBuffer bufResult = OpenCL.createReadWriteBuffer(result); + OpenCL.getQueue() + .putWriteBuffer(bufInput, false); + + forward(bufInput, inputChannels, inputWidth, inputHeight, + bufFilter, outputChannels, outputWidth, outputHeight, bufResult, + filterSize, stride, bufBias); + + OpenCL.getQueue() + .putReadBuffer(bufResult, true); + bufResult.getBuffer().get(result); + + bufResult.release(); + bufInput.release(); + + } + public void forward(CLBuffer bufInput, + int inputChannels, int inputWidth, int inputHeight, + CLBuffer bufFilter, int outputChannels, int outputWidth, 
int outputHeight, + CLBuffer bufResult, + int filterSize, int stride, CLBuffer bufBias){ + if(prog == null){ + prog = OpenCL.compile("convolution_forward.cl"); + forwardKernel = prog.createCLKernel("forward_local"); + } + + forwardKernel + .rewind() + .putArg(outputHeight) + .putArg(outputWidth) + .putArg(inputChannels) + .putArg(filterSize) + .putArg(stride) + .putArg(inputWidth) + .putArg(inputHeight) + .putArgs( + bufInput, + bufFilter, + bufResult, + bufBias); + /* + OpenCL.execute(forwardKernel, + outputChannels * outputWidth * outputHeight); + */ + forwardKernel.putArg(outputChannels * outputWidth * outputHeight); + OpenCL.getQueue().put1DRangeKernel(forwardKernel, 0, + outputChannels * outputWidth * outputHeight, outputChannels); + + normalizeKernel = prog.createCLKernel("localNormalize"); + normalizeKernel + .rewind() + .putArg(outputWidth) + .putArg(outputHeight) + .putArg(outputChannels) + .putArg(bufResult); + OpenCL.execute(normalizeKernel, + outputChannels * outputWidth * outputHeight); + + } + public static void main(String[] args) { + CLProgram prog = OpenCL.compile("convolution_forward.cl"); + CLKernel forwardKernel = prog.createCLKernel("forward"); + + int inputChannels = 384; + int inputWidth = 14; + int inputHeight = 14; + int outputChannels = 384; + int outputWidth = 14; + int outputHeight = 14; + int filterSize = 3; + int stride = 1; + CLBuffer bufInput = OpenCL.createReadWriteBuffer( + inputChannels * inputWidth * inputHeight); + CLBuffer bufFilter = OpenCL.createReadWriteBuffer( + inputChannels * outputChannels * filterSize * filterSize); + CLBuffer bufResult = OpenCL.createReadWriteBuffer( + outputChannels * outputWidth * outputHeight); + CLBuffer bufBias = OpenCL.createReadWriteBuffer( + outputChannels); + long start = System.currentTimeMillis(); + for(int i = 0; i < 5000; ++i){ + forwardKernel + .rewind() + .putArg(outputHeight) + .putArg(outputWidth) + .putArg(inputChannels) + .putArg(filterSize) + .putArg(stride) + .putArg(inputWidth) + 
.putArg(inputHeight) + .putArgs( + bufInput, + bufFilter, + bufResult, + bufBias) + .putArg(outputChannels * outputWidth * outputHeight); + int workSize = outputChannels; + OpenCL.getQueue().put1DRangeKernel(forwardKernel, + 0, outputChannels * outputWidth * outputHeight, + workSize); + } + OpenCL.getQueue().putBarrier(); + System.out.println((System.currentTimeMillis() - start) / 1000.); + bufFilter.release(); + System.out.println((System.currentTimeMillis() - start) / 1000.); + bufInput.release(); + bufResult.release(); + bufBias.release(); + System.out.println((System.currentTimeMillis() - start) / 1000.); + + forwardKernel.release(); + prog.release(); + + OpenCL.getQueue().release(); + OpenCL.getCtx().release(); + } + static int roundUp(int groupSize, int globalSize){ + return ((globalSize + groupSize - 1) / groupSize) * groupSize; + } +} diff --git a/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java b/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java new file mode 100644 index 0000000..99bd99f --- /dev/null +++ b/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java @@ -0,0 +1,172 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. 
+ */ +package kishida.cnn.opencl; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLKernel; +import com.jogamp.opencl.CLProgram; +import java.nio.FloatBuffer; +import java.nio.IntBuffer; +import java.util.Map; +import kishida.cnn.activation.ActivationFunction; +import kishida.cnn.activation.RectifiedLinear; + +/** + * + * @author naoki + */ +public class FullyBackwordCL { + public static FullyBackwordCL INSTANCE = new FullyBackwordCL(); + CLProgram prog; + Map kernels; + CLProgram progActivation; + Map actKernels; + + private FullyBackwordCL() { + } + + public void backword(int inputSize, int outputSize, + int[] dropout, float[] input, float[] delta, + float[] result, float[] weight, + float[] weightDelta, float[] biasDelta, + float[] newDelta, + float learningRate, ActivationFunction activation){ + CLBuffer bufWeight = OpenCL.createReadBuffer(weight); + CLBuffer bufWeightDelta = OpenCL.createReadWriteBuffer(weightDelta); + CLBuffer bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta); + OpenCL.getQueue() + .putWriteBuffer(bufWeightDelta ,false) + .putWriteBuffer(bufBiasDelta ,false) + .putWriteBuffer(bufWeight ,false); + + backword(inputSize, outputSize, + dropout, input, delta, + result, bufWeight, bufWeightDelta, bufBiasDelta, + newDelta, + learningRate, activation); + + OpenCL.getQueue() + .putReadBuffer(bufBiasDelta ,false) + .putReadBuffer(bufWeightDelta ,true); + bufBiasDelta.getBuffer().get(biasDelta); + bufWeightDelta.getBuffer().get(weightDelta); + + bufWeight .release(); + bufWeightDelta .release(); + bufBiasDelta .release(); + } + + public void backword(int inputSize, int outputSize, + int[] dropout, float[] input, float[] delta, + float[] result, CLBuffer bufWeight, + CLBuffer bufWeightDelta, CLBuffer bufBiasDelta, + float[] newDelta, + float learningRate, ActivationFunction activation){ + CLBuffer bufInput = OpenCL.createReadBuffer(input); + CLBuffer bufDelta = OpenCL.createReadBuffer(delta); + CLBuffer bufResult = 
OpenCL.createReadBuffer(result);
+        CLBuffer bufNewDelta = OpenCL.createWriteBuffer(newDelta.length);
+        CLBuffer bufDropout = OpenCL.createReadBuffer(dropout);
+        OpenCL.getQueue()
+                .putWriteBuffer(bufInput, false)
+                .putWriteBuffer(bufDelta, false)
+                .putWriteBuffer(bufResult, false)
+                .putWriteBuffer(bufDropout, false);
+
+        backword(inputSize, outputSize,
+                bufDropout, bufInput, bufDelta,
+                bufResult, bufWeight, bufWeightDelta, bufBiasDelta,
+                bufNewDelta,
+                learningRate, activation);
+
+        // blocking read: the host copies newDelta immediately below, so the
+        // transfer must be complete here (a non-blocking read would race)
+        OpenCL.getQueue()
+                .putReadBuffer(bufNewDelta, true);
+        bufNewDelta.getBuffer().get(newDelta);
+
+        bufInput.release();
+        bufDelta.release();
+        bufResult.release();
+        bufNewDelta.release();
+        bufDropout.release();
+
+    }
+    public void backword(int inputSize, int outputSize,
+            CLBuffer bufDropout, CLBuffer bufInput, CLBuffer bufDelta,
+            CLBuffer bufResult, CLBuffer bufWeight,
+            CLBuffer bufWeightDelta, CLBuffer bufBiasDelta,
+            CLBuffer bufNewDelta,
+            float learningRate, ActivationFunction activation){
+        if(prog == null){
+            prog = OpenCL.compile("fully_backword.cl");
+            kernels = prog.createCLKernels();
+        }
+        if(progActivation == null){
+            progActivation = OpenCL.compile("activation.cl");
+            actKernels = progActivation.createCLKernels();
+        }
+
+        CLBuffer bufDiffed = OpenCL.createReadWriteBuffer(outputSize);
+
+        CLKernel actKernel = actKernels.get(activation.getName() + "_diff");
+        actKernel.rewind()
+                .putArg(bufResult)
+                .putArg(bufDiffed);
+        OpenCL.execute(actKernel, outputSize);
+
+        CLKernel kernelDelta = kernels.get("backword_delta");
+        kernelDelta.rewind()
+                .putArg(outputSize)
+                .putArgs(
+                        bufDropout,
+                        bufDelta,
+                        bufDiffed,
+                        bufWeight,
+                        bufNewDelta);
+        OpenCL.execute(kernelDelta, inputSize);
+
+        CLKernel kernelWeight = kernels.get("backword_weight");
+        kernelWeight.rewind()
+                .putArg(outputSize)
+                .putArg(learningRate)
+                .putArgs(
+                        bufDropout,
+                        bufInput,
+                        bufDelta,
+                        bufDiffed,
+                        bufWeight,
+                        bufWeightDelta);
+        OpenCL.execute(kernelWeight, inputSize * outputSize);
+
+ CLKernel kernelBias = kernels.get("backword_bias"); + kernelBias.rewind() + .putArg(outputSize) + .putArg(learningRate) + .putArg(bufDropout) + .putArg(bufDelta) + .putArg(bufDiffed) + .putArg(bufBiasDelta); + OpenCL.execute(kernelBias, outputSize); + + bufDiffed .release(); + } + + public static void main(String[] args) { + int inputSize = 5; + int outputSize = 9; + int[] dropout = new int[outputSize]; + float[] delta = new float[outputSize]; + float[] input = new float[inputSize]; + float[] result = new float[outputSize]; + float[] weight = new float[inputSize * outputSize]; + float[] weightDelta = new float[weight.length]; + float[] biasDelta = new float[outputSize]; + float[] newDelta = new float[inputSize]; + + INSTANCE.backword(inputSize, outputSize, + dropout, input, delta, result, weight, + weightDelta, biasDelta, newDelta, 0.001f, new RectifiedLinear()); + } +} diff --git a/src/main/java/kishida/cnn/opencl/FullyForwardCL.java b/src/main/java/kishida/cnn/opencl/FullyForwardCL.java new file mode 100644 index 0000000..10cb2e0 --- /dev/null +++ b/src/main/java/kishida/cnn/opencl/FullyForwardCL.java @@ -0,0 +1,119 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. 
+ */
+package kishida.cnn.opencl;
+
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLKernel;
+import com.jogamp.opencl.CLProgram;
+import java.nio.FloatBuffer;
+import java.nio.IntBuffer;
+import java.util.Map;
+import kishida.cnn.activation.ActivationFunction;
+import kishida.cnn.activation.SoftMaxFunction;
+
+/**
+ *
+ * @author naoki
+ */
+public class FullyForwardCL {
+    public static FullyForwardCL INSTANCE = new FullyForwardCL();
+    CLProgram progFully;
+    CLProgram progActivation;
+    CLKernel forwardKernel;
+    Map actKernels;
+    public FullyForwardCL() {
+    }
+
+    public void forward(int inputSize, int outputSize, int[] dropout,
+            float[] input, float[] weight, float[] bias, float[] result,
+            ActivationFunction activation){
+        CLBuffer bufWeight = OpenCL.createReadBuffer(weight);
+        CLBuffer bufBias = OpenCL.createReadBuffer(bias);
+
+        // upload weight/bias BEFORE dispatching the kernel; enqueueing the
+        // writes after forward() would run the kernel on uninitialized buffers
+        OpenCL.getQueue()
+                .putWriteBuffer(bufWeight, false)
+                .putWriteBuffer(bufBias, false);
+
+        forward(inputSize, outputSize, dropout, input, bufWeight, bufBias, result, activation);
+
+        bufWeight.release();
+        bufBias.release();
+    }
+    public void forward(int inputSize, int outputSize, int[] dropout,
+            float[] input, CLBuffer bufWeight,
+            CLBuffer bufBias, float[] result,
+            ActivationFunction activation){
+        CLBuffer bufInput = OpenCL.createReadBuffer(input);
+        CLBuffer bufResult = OpenCL.createReadWriteBuffer(result.length);
+        CLBuffer bufDropout = OpenCL.createReadBuffer(dropout);
+
+        OpenCL.getQueue()
+                .putWriteBuffer(bufInput, false)
+                .putWriteBuffer(bufDropout, false);
+
+        forward(inputSize, outputSize, bufDropout, bufInput, bufWeight, bufBias, bufResult, activation);
+
+        OpenCL.getQueue().putReadBuffer(bufResult, true);
+        bufResult.getBuffer().get(result);
+
+        bufInput.release();
+        bufResult.release();
+        bufDropout.release();
+
+    }
+    public void forward(int inputSize, int outputSize, CLBuffer bufDropout,
+            CLBuffer bufInput, CLBuffer bufWeight,
+            CLBuffer bufBias, CLBuffer bufResult,
+            ActivationFunction activation){
+ if(progFully == null){ + progFully = OpenCL.compile("fully_forward.cl"); + forwardKernel = progFully.createCLKernel("forward"); + } + if(progActivation == null){ + progActivation = OpenCL.compile("activation.cl"); + actKernels = progActivation.createCLKernels(); + } + + forwardKernel.rewind() + .putArg(inputSize) + .putArg(outputSize) + .putArgs( + bufDropout, + bufInput, + bufWeight, + bufBias, + bufResult); + OpenCL.execute(forwardKernel, outputSize); + + if(activation instanceof SoftMaxFunction){ + softmax(outputSize, bufResult); + + }else{ + CLKernel kernelAct = actKernels.get(activation.getName()); + kernelAct.rewind() + .putArg(bufResult); + OpenCL.execute(kernelAct, outputSize); + } + + } + + private void softmax(int outputSize, CLBuffer bufResult) { + CLBuffer bufExped = OpenCL.createReadWriteBuffer(outputSize); + CLKernel kernelActPre = actKernels.get("softmax_before"); + kernelActPre.rewind() + .putArg(bufResult) + .putArg(bufExped); + OpenCL.execute(kernelActPre, outputSize); + + CLKernel kernelAct = actKernels.get("softmax"); + kernelAct.rewind() + .putArg(bufExped) + .putArg(bufResult); + OpenCL.execute(kernelAct, outputSize); + + bufExped.release(); + } +} diff --git a/src/main/java/kishida/cnn/opencl/MaxPoolingCL.java b/src/main/java/kishida/cnn/opencl/MaxPoolingCL.java new file mode 100644 index 0000000..e143fe9 --- /dev/null +++ b/src/main/java/kishida/cnn/opencl/MaxPoolingCL.java @@ -0,0 +1,134 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. 
+ */ +package kishida.cnn.opencl; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLKernel; +import com.jogamp.opencl.CLProgram; +import java.nio.FloatBuffer; +import java.util.Map; + +/** + * + * @author naoki + */ +public class MaxPoolingCL { + public static MaxPoolingCL INSTANCE = new MaxPoolingCL(); + + CLProgram prog; + Map kernels; + + private MaxPoolingCL() { + } + + public void forward(int inputChannel, int inputWidth, int inputHeight, int outputWidth, int ouptutHeight, + int size, int stride, float[] input, float[] result){ + CLBuffer bufInput = OpenCL.createReadBuffer(input); + CLBuffer bufResult = OpenCL.createWriteBuffer(result.length); + OpenCL.getQueue() + .putWriteBuffer(bufInput, false); + + forward(inputChannel, inputWidth, inputHeight, + outputWidth, ouptutHeight, size, stride, bufInput, bufResult); + + OpenCL.getQueue().putReadBuffer(bufResult, true); + bufResult.getBuffer().get(result); + + bufInput.release(); + bufResult.release(); + + } + public void forward(int inputChannel, int inputWidth, int inputHeight, int outputWidth, int ouptutHeight, + int size, int stride, CLBuffer bufInput, CLBuffer bufResult){ + + if(prog == null){ + prog = OpenCL.compile("maxpooling.cl"); + kernels = prog.createCLKernels(); + } + + CLKernel kernelForward = kernels.get("forward"); + kernelForward.rewind() + .putArg(inputWidth) + .putArg(inputHeight) + .putArg(outputWidth) + .putArg(ouptutHeight) + .putArg(size) + .putArg(stride) + .putArgs( + bufInput, + bufResult); + OpenCL.execute(kernelForward, + inputChannel * outputWidth * ouptutHeight); + } + + public void backword(int inputChannel, int inputWidth, int inputHeight, + int outputWidth, int outputHeight, + int size, int stride, + float[] input, float[] delta, float[] newDelta){ + CLBuffer bufInput = OpenCL.createReadBuffer(input); + CLBuffer bufDelta = OpenCL.createReadBuffer(delta); + CLBuffer bufNewDelta = OpenCL.createReadWriteBuffer(newDelta); + OpenCL.getQueue() + .putWriteBuffer(bufInput, 
false) + .putWriteBuffer(bufDelta, false); + + backword(inputChannel, inputWidth, inputHeight, + outputWidth, outputHeight, size, stride, + bufInput, bufDelta, bufNewDelta); + + OpenCL.getQueue().putReadBuffer(bufNewDelta, true); + bufNewDelta.getBuffer().get(newDelta); + + bufInput.release(); + bufDelta.release(); + bufNewDelta.release(); + } + public void backword(int inputChannel, int inputWidth, int inputHeight, + int outputWidth, int outputHeight, + int size, int stride, + CLBuffer bufInput, + CLBuffer bufDelta, CLBuffer bufNewDelta){ + if(prog == null){ + prog = OpenCL.compile("maxpooling.cl"); + kernels = prog.createCLKernels(); + } + + CLKernel kernelForward = kernels.get("backword"); + kernelForward.rewind() + .putArg(inputWidth) + .putArg(inputHeight) + .putArg(outputWidth) + .putArg(outputHeight) + .putArg(size) + .putArg(stride) + .putArgs( + bufInput, + bufDelta, + bufNewDelta); + OpenCL.execute(kernelForward, + inputChannel * inputWidth * inputHeight); + + } + + public static void main(String[] args) { + int inputChannel = 3; + int inputWidth = 32; + int inputHeight = 32; + int size = 11; + int stride = 2; + int outputWidth = inputWidth / stride; + int outputHeight = inputHeight / stride; + float[] input = new float[inputChannel * inputWidth * inputHeight]; + float[] result = new float[inputChannel * outputWidth * outputHeight]; + float[] newDelta = new float[input.length]; + float[] delta = new float[result.length]; + + INSTANCE.forward(inputChannel, inputWidth, inputHeight, outputWidth, outputHeight, size, stride, + input,result); + INSTANCE.backword(inputChannel, inputWidth, inputHeight, outputWidth, outputHeight, size, stride, + input, delta, newDelta); + } +} diff --git a/src/main/java/kishida/cnn/opencl/MultiNormalizeCL.java b/src/main/java/kishida/cnn/opencl/MultiNormalizeCL.java new file mode 100644 index 0000000..294810d --- /dev/null +++ b/src/main/java/kishida/cnn/opencl/MultiNormalizeCL.java @@ -0,0 +1,92 @@ +/* + * To change this license 
header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package kishida.cnn.opencl;
+
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLKernel;
+import com.jogamp.opencl.CLProgram;
+import java.nio.FloatBuffer;
+import java.util.Map;
+
+/**
+ *
+ * @author naoki
+ */
+public class MultiNormalizeCL {
+    public static MultiNormalizeCL INSTANCE = new MultiNormalizeCL();
+
+    CLProgram prog;
+    Map kernels;
+
+    public void normalize(int inputChannels, int inputWidth, int inputHeight,
+            int size, float threshold,
+            float[] input, float[] result){
+        CLBuffer bufInput = OpenCL.createReadBuffer(input);
+        CLBuffer bufResult = OpenCL.createWriteBuffer(result.length);
+        OpenCL.getQueue().putWriteBuffer(bufInput, false);
+
+        normalize(inputChannels, inputWidth, inputHeight, size, threshold,
+                bufInput, bufResult);
+
+        // read the result only AFTER the kernels have been enqueued;
+        // reading before normalize() would return an uninitialized buffer
+        OpenCL.getQueue().putReadBuffer(bufResult, true);
+        bufResult.getBuffer().get(result);
+
+        bufInput.release();
+        bufResult.release();
+
+    }
+    public void normalize(int inputChannels, int inputWidth, int inputHeight,
+            int size, float threshold,
+            CLBuffer bufInput, CLBuffer bufResult){
+        if(prog == null){
+            prog = OpenCL.compile("multi_normalize.cl");
+            kernels = prog.createCLKernels();
+        }
+
+        CLBuffer bufAverages = OpenCL.createReadWriteBuffer(inputWidth * inputHeight);
+        CLBuffer bufStds = OpenCL.createReadWriteBuffer(inputWidth * inputHeight);
+
+        CLKernel kernelAverage = kernels.get("average");
+        kernelAverage.rewind()
+                .putArg(inputChannels)
+                .putArg(inputWidth)
+                .putArg(inputHeight)
+                .putArg(size)
+                .putArg(threshold)
+                .putArgs(
+                        bufInput,
+                        bufAverages,
+                        bufStds);
+        OpenCL.execute(kernelAverage, inputWidth * inputHeight);
+
+        CLKernel kernelForward = kernels.get("forward");
+        kernelForward.rewind()
+                .putArg(inputChannels)
+                .putArg(inputWidth)
+                .putArg(inputHeight)
+                .putArgs(
+                        bufInput,
+                        bufAverages,
+                        bufStds,
+                        bufResult);
+
OpenCL.execute(kernelForward, inputChannels * inputWidth * inputHeight); + + bufAverages.release(); + bufStds.release(); + + } + + public static void main(String[] args) { + int inputChannels = 3; + int inputWidth = 32; + int inputHeight = 32; + int size = 5; + float threshold = 0.00001f; + float[] input = new float[inputChannels * inputWidth * inputHeight]; + float[] result = new float[inputChannels * inputWidth * inputHeight]; + new MultiNormalizeCL().normalize(inputChannels, inputWidth, inputHeight, size, threshold, input, result); + } +} diff --git a/src/main/java/kishida/cnn/opencl/OpenCL.java b/src/main/java/kishida/cnn/opencl/OpenCL.java new file mode 100644 index 0000000..bb635c1 --- /dev/null +++ b/src/main/java/kishida/cnn/opencl/OpenCL.java @@ -0,0 +1,96 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package kishida.cnn.opencl; + +import com.jogamp.opencl.CLBuffer; +import com.jogamp.opencl.CLCommandQueue; +import com.jogamp.opencl.CLContext; +import com.jogamp.opencl.CLDevice; +import com.jogamp.opencl.CLKernel; +import com.jogamp.opencl.CLMemory; +import com.jogamp.opencl.CLProgram; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.FloatBuffer; +import java.nio.IntBuffer; +import lombok.Getter; + +/** + * + * @author naoki + */ +public class OpenCL { + + static CLContext ctx; + @Getter + static CLCommandQueue queue; + static CLDevice device; + + public static void prepare(){ + ctx = CLContext.create(); + device = ctx.getMaxFlopsDevice(); + System.out.println(device); + queue = device.createCommandQueue(); + } + + public static void release(){ + queue.finish(); + ctx.release(); + ctx = null; + } + + public static CLContext getCtx() { + if(ctx == null){ + prepare(); + } + return ctx; + } + + public static CLProgram compile(String path){ + try { + return 
getCtx().createProgram(OpenCL.class.getResourceAsStream("/kernels/" + path)) + .build(); + } catch (IOException ex) { + throw new UncheckedIOException(ex); + } + } + + public static CLBuffer createReadBuffer(float[] data){ + CLBuffer buf = getCtx().createFloatBuffer( + data.length, CLMemory.Mem.READ_ONLY); + buf.getBuffer().put(data).rewind();//rewindしないと不安定になる + return buf; + } + public static CLBuffer createReadWriteBuffer(float[] data){ + CLBuffer buf = createReadWriteBuffer(data.length); + buf.getBuffer().put(data).rewind();//rewindしないと不安定になる + return buf; + } + public static CLBuffer createReadWriteBuffer(int size){ + CLBuffer buf = getCtx().createFloatBuffer( + size, CLMemory.Mem.READ_WRITE); + return buf; + + } + public static CLBuffer createWriteBuffer(int size){ + return getCtx().createFloatBuffer(size, CLMemory.Mem.WRITE_ONLY); + } + public static CLBuffer createReadBuffer(int[] data){ + CLBuffer buf = getCtx().createIntBuffer( + data.length, CLMemory.Mem.READ_ONLY); + buf.getBuffer().put(data).rewind(); + return buf; + } + public static CLCommandQueue execute(CLKernel kernel, int range){ + int localWorkSize = Math.min(device.getMaxWorkGroupSize(), 128); + int globalWorkSize = roundUp(localWorkSize, range); + kernel.putArg(range); + return getQueue().put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize); + } + static int roundUp(int groupSize, int globalSize){ + return ((globalSize + groupSize - 1) / groupSize) * groupSize; + } +} diff --git a/src/main/java/kishida/imagefiltering/InfoCL.java b/src/main/java/kishida/imagefiltering/InfoCL.java new file mode 100644 index 0000000..9af0e73 --- /dev/null +++ b/src/main/java/kishida/imagefiltering/InfoCL.java @@ -0,0 +1,37 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. 
+ */ +package kishida.imagefiltering; + +import com.jogamp.opencl.CLContext; +import com.jogamp.opencl.CLDevice; + +/** + * + * @author naoki + */ +public class InfoCL { + public static void main(String[] args) { + CLContext ctx = CLContext.create(); + CLDevice dev = ctx.getMaxFlopsDevice(); + System.out.println(dev); + System.out.println(memSizeString(dev.getGlobalMemSize())); + System.out.println(memSizeString(dev.getLocalMemSize())); + System.out.println(dev.getMaxWorkGroupSize()); + System.out.println(dev.getCVersion()); + ctx.release(); + } + static String memSizeString(long memSize){ + double size = memSize; + String[] unit = {"", "K", "M", "G", "T"}; + for(int i = 0; i < unit.length; ++i){ + if(size < 1024){ + return String.format("%.1f%sB", size, unit[i]); + } + size /= 1024; + } + return String.format("%.1f%sPB", size); + } +} diff --git a/src/main/java/kishida/imagefiltering/KernelBench.java b/src/main/java/kishida/imagefiltering/KernelBench.java index cc236bd..8308817 100644 --- a/src/main/java/kishida/imagefiltering/KernelBench.java +++ b/src/main/java/kishida/imagefiltering/KernelBench.java @@ -6,10 +6,8 @@ package kishida.imagefiltering; import java.util.Arrays; -import java.util.Objects; import java.util.Random; import java.util.function.Consumer; -import java.util.function.DoubleConsumer; /** * diff --git a/src/main/resources/alexnet_def.json b/src/main/resources/alexnet_def.json index c5a0e0f..66771f6 100644 --- a/src/main/resources/alexnet_def.json +++ b/src/main/resources/alexnet_def.json @@ -95,7 +95,7 @@ "initBias" : 1.0, "dropoutRate" : 0.5, "activation" : "RectifiedLinear", - "useGpu" : false + "useGpu" : true } }, { "FullyConnect" : { @@ -113,7 +113,7 @@ "initBias" : 1.0, "dropoutRate" : 1.0, "activation" : "SoftMaxFunction", - "useGpu" : false + "useGpu" : true } } ] } \ No newline at end of file diff --git a/src/main/resources/kernels/activation.cl b/src/main/resources/kernels/activation.cl new file mode 100644 index 0000000..0143565 --- 
/dev/null +++ b/src/main/resources/kernels/activation.cl @@ -0,0 +1,64 @@ +__kernel void relu( + __global float *result, + int count +){ + int i = get_global_id(0); + if(i >= count){ + return; + } + float r = result[i]; + result[i] = r >= 0 ? r : 0; +} + +__kernel void relu_diff( + __global const float* result, + __global float* diff, + int count +){ + int i = get_global_id(0); + if(i >= count){ + return; + } + diff[i] = result[i] >= 0 ? 1 : 0; +} + +__kernel void softmax_before( + __global const float* result, + __global float* exped, + int count +){ + int i = get_global_id(0); + if(i >= count){ + return; + } + exped[i] = exp(min(700.0f, result[i])); +} + +__kernel void softmax( + __global const float* exped, + __global float* result, + int count +){ + int i = get_global_id(0); + if(i >= count){ + return; + } + float sum = 0; + for(int j = 0; j < count; ++j){ + sum += exped[j]; + } + result[i] = exped[i] / sum; +} + +__kernel void softmax_diff( + __global const float* result, + __global float* diff, + int count +){ + int i = get_global_id(0); + if(i >= count){ + return; + } + float r = result[i]; + diff[i] = r * (1 - r); +} diff --git a/src/main/resources/kernels/convolution_backword.cl b/src/main/resources/kernels/convolution_backword.cl new file mode 100644 index 0000000..0d0d9a0 --- /dev/null +++ b/src/main/resources/kernels/convolution_backword.cl @@ -0,0 +1,264 @@ +void delta_proc( + int chxxyy, + int inputWidth, + int inputHeight, + int filterSize, + int outputChannels, + int stride, + int outputWidth, + int outputHeight, + __global const float *result, + __global const float *delta, + __global const float *filter, + int inputChannels, + __global float *newDelta +){ + int ch = chxxyy / (inputWidth * inputHeight); + int xx = (chxxyy % (inputWidth * inputHeight)) / inputHeight; + int yy = chxxyy % inputHeight; + int sizeHalf = filterSize / 2; + float tempDelta = 0.0f; + for (int f = 0; f=0 && x=0 && y=0.0f)?delta[fxy]:0.0f; + tempDelta = tempDelta + d * 
filter[ + f * inputChannels * filterSize * filterSize + + ch * filterSize * filterSize + + i * filterSize + j]; + } + } + } + } + } + newDelta[chxxyy] = tempDelta; +} + +__kernel void delta_kernel( + int inputWidth, + int inputHeight, + int filterSize, + int outputChannels, + int stride, + int outputWidth, + int outputHeight, + __global const float *result, + __global const float *delta, + __global const float *filter, + int inputChannels, + __global float *newDelta, + int count +){ + int chxxyy = get_global_id(0); + if(chxxyy >= count){ + return; + } + delta_proc( + chxxyy, + inputWidth, + inputHeight, + filterSize, + outputChannels, + stride, + outputWidth, + outputHeight, + result, + delta, + filter, + inputChannels, + newDelta); +} + +void filter_proc( + int fchij, + int inputChannels, + int filterSize, + int outputWidth, + int outputHeight, + __global const float *result, + __global const float *delta, + int stride, + int inputWidth, + int inputHeight, + float learningRate, + __global const float *input, + __global float *filterDelta +){ + + int f = fchij / ((inputChannels * filterSize) * filterSize); + int ch = (fchij % ((inputChannels * filterSize) * filterSize)) / (filterSize * filterSize); + int i = (fchij % (filterSize * filterSize)) / filterSize; + int j = fchij % filterSize; + float df = 0.0f; + for (int x = 0; x=0.0f)?delta[fxy]:0.0f; + int xx = x * stride + i - filterSize / 2; + if (xx >= 0 && xx < inputWidth){ + int yy = y * stride + j - filterSize / 2; + if (yy >= 0 && yy < inputHeight){ + df = df + d * learningRate * + input[ch * inputWidth * inputHeight + xx * inputHeight + yy]; + } + } + } + } + filterDelta[fchij] += df; +} + +__kernel void filter_kernel( + int inputChannels, + int filterSize, + int outputWidth, + int outputHeight, + __global const float *result, + __global const float *delta, + int stride, + int inputWidth, + int inputHeight, + float learningRate, + __global const float *input, + __global float *filterDelta, + int count +){ + 
int fchij = get_global_id(0); + if(fchij >= count){ + return; + } + filter_proc(fchij, inputChannels, filterSize, + outputWidth, outputHeight, result, delta, + stride, inputWidth, inputHeight, learningRate, + input, filterDelta); +} + +void bias_proc( + int fxy, + __global const float *result, + __global const float *delta, + __global float *tempBiasDelta, + float learningRate +){ + float d = result[fxy]>=0.0f ? delta[fxy] : 0.0f; + tempBiasDelta[fxy] = learningRate * d; +} + +__kernel void bias_kernel( + __global const float *result, + __global const float *delta, + __global float *tempBiasDelta, + float learningRate, + int count +){ + int fxy = get_global_id(0); + if(fxy >= count){ + return; + } + bias_proc(fxy, result, delta, tempBiasDelta, learningRate); +} +__kernel void biasAfter( + int outputWidth, + int outputHeight, + __global const float *tempBiasDelta, + __global float *biasDelta, + int count +){ + int f = get_global_id(0); + if(f >= count){ + return; + } + float b = 0; + for(int xy = 0; xy < outputWidth * outputHeight; ++xy){ + b += tempBiasDelta[f * outputWidth * outputHeight + xy]; + } + biasDelta[f] += b; +} + +__kernel void dfb( + int inputChannels, + int inputWidth, + int inputHeight, + int outputChannels, + int outputWidth, + int outputHeight, + int filterSize, + int stride, + float learningRate, + __global float *input, + __global float *result, + __global float *filter, + __global float *filterDelta, + __global float *delta, + __global float *newDelta, + __global float *tempBiasDelta, + int deltaCount, + int filterCount, + int biasCount, + int count +){ + int n = get_global_id(0); + if(n < deltaCount){ + delta_proc(n, + inputWidth, inputHeight, + filterSize, outputChannels, stride, + outputWidth, outputHeight, + result, delta, filter, + inputChannels, newDelta); + }else if (n < deltaCount + filterCount){ + filter_proc(n - deltaCount, inputChannels, filterSize, + outputWidth, outputHeight, result, delta, + stride, inputWidth, inputHeight, 
learningRate, + input, filterDelta); + }else if (n < deltaCount + filterCount + biasCount){ + bias_proc(n - deltaCount - filterCount, + result, delta, tempBiasDelta, learningRate); + } +} + +__kernel void prepare( + float momentam, + __global float* delta, + int count +){ + int f = get_global_id(0); + if(f >= count){ + return; + } + delta[f] *= momentam; +} + +__kernel void joinFilter( + float weightDecay, + float learningRate, + int count, + __global float* filter, + __global const float* filterDelta, + int len +){ + int f = get_global_id(0); + if(f >= len){ + return; + } + filter[f] += filterDelta[f] / count + - weightDecay * learningRate * filter[f]; +} + +__kernel void joinBias( + int count, + __global float* bias, + __global const float* biasDelta, + int len +){ + int f = get_global_id(0); + if(f >= len){ + return; + } + bias[f] += biasDelta[f] / count; +} + + diff --git a/src/main/resources/kernels/convolution_forward.cl b/src/main/resources/kernels/convolution_forward.cl new file mode 100644 index 0000000..3af321b --- /dev/null +++ b/src/main/resources/kernels/convolution_forward.cl @@ -0,0 +1,123 @@ +__kernel void forward( + int outputHeight, + int outputWidth, + int inputChannels, + int filterSize, + int stride, + int inputWidth, + int inputHeight, + __global const float *input, + __global const float *filter, + __global float *result, + __global const float *bias, + int count +){ + int fxy = get_global_id(0); + if(fxy >= count){ + return; + } + + int f = fxy / (outputHeight * outputWidth); + int x = (fxy % (outputHeight * outputWidth)) / outputHeight; + int y = fxy % outputHeight; + float r = 0.0f; + for (int ch = 0; ch=0 && xx=0 && yy= 0 ? 
rs : 0; +} + +__kernel void forward_local( + int outputHeight, + int outputWidth, + int inputChannels, + int filterSize, + int stride, + int inputWidth, + int inputHeight, + __global const float *input, + __global const float *filter, + __global float *result, + __global const float *bias, + int count +){ + int fxy = get_global_id(0); + if(fxy >= count){ + return; + } + int f = fxy / (outputHeight * outputWidth); + int x = (fxy % (outputHeight * outputWidth)) / outputHeight; + int y = fxy % outputHeight; + + __local float lfilter[384 * 3 * 3]; // + int len = inputChannels * filterSize * filterSize; + /* + int start = get_local_id(0) * len / outputChannels; + int end = (get_local_id(0) + 1) * len / outputChannels; + for(int i = start; i < end; ++i){ + lfilter[i] = filter[f * inputChannels * filterSize * filterSize + i]; + }*/ + event_t ev; + ev = async_work_group_copy(lfilter, + filter + f * inputChannels * filterSize * filterSize, + len, ev); + wait_group_events(1, &ev); + + float r = 0.0f; + for (int ch = 0; ch=0 && xx=0 && yy= 0 ? 
rs : 0; +} + + +__kernel void localNormalize( + int outputWidth, + int outputHeight, + int outputChannels, + __global float *result, + int count +){ + int chxy = get_global_id(0); + if(chxy >= count){ + return; + } + float k = 2; + float a = 1.0E-4f; + float b = 0.75f; + int n = 5; + int ch = chxy / (outputWidth * outputHeight); + int xy = chxy % (outputWidth * outputHeight); + float sum = 0.0f; + + for (int lp = max(0, ch - n / 2); lp <= min(outputChannels - 1, ch + n / 2); lp++){ + sum += result[lp * outputWidth * outputHeight + xy] * + result[lp * outputWidth * outputHeight + xy]; + } + result[chxy] /= pow(k + a * sum, b); +} diff --git a/src/main/resources/kernels/fully_backword.cl b/src/main/resources/kernels/fully_backword.cl new file mode 100644 index 0000000..e2d0b6f --- /dev/null +++ b/src/main/resources/kernels/fully_backword.cl @@ -0,0 +1,67 @@ +__kernel void backword_delta( + int outputSize, + __global const int* dropout, + __global const float* delta, + __global const float* diffed, + __global const float* weight, + __global float* newDelta, + int count +){ + int i = get_global_id(0); + if(i >= count){ + return; + } + float nd = 0; + for (int j = 0; j < outputSize; ++j) { + if (dropout[j] != 1) { + continue; + } + float d = diffed[j] * delta[j]; + nd += d * weight[i * outputSize + j];//in[i] *; + } + newDelta[i] = nd; +} + +__kernel void backword_weight( + int outputSize, + float learningRate, + __global const int* dropout, + __global const float* input, + __global const float* delta, + __global const float* diffed, + __global const float* weight, + __global float* weightDelta, + int count +){ + int ij = get_global_id(0); + if(ij >= count){ + return; + } + int i = ij / outputSize; + int j = ij % outputSize; + if (dropout[j] != 1) { + return; + } + float d = diffed[j] * delta[j]; + weightDelta[ij] += d * input[i] * learningRate; + +} + +__kernel void backword_bias( + int outputSize, + float learningRate, + __global const int* dropout, + __global const 
float* delta, + __global const float* diffed, + __global float* biasDelta, + int count +){ + int j = get_global_id(0); + if(j >= count){ + return; + } + if (dropout[j] != 1) { + return; + } + biasDelta[j] += diffed[j] * delta[j] * learningRate; +} \ No newline at end of file diff --git a/src/main/resources/kernels/fully_forward.cl b/src/main/resources/kernels/fully_forward.cl new file mode 100644 index 0000000..e8b06a3 --- /dev/null +++ b/src/main/resources/kernels/fully_forward.cl @@ -0,0 +1,24 @@ +__kernel void forward( + int inSize, + int out, + __global const int *dropout, + __global const float *in, + __global float *weight, + __global float *bias, + __global float *result, + int count +){ + int j = get_global_id(0); + if(j >= count){ + return; + } + if (dropout[j] == 1){ + float r = 0; + for (int i = 0; i= count){ + return; + } + + int ch = chxy / (outputWidth * outputHeight); + int x = (chxy % (outputWidth * outputHeight)) / outputHeight; + int y = chxy % outputHeight; + + float max = -INFINITY; + for (int i = 0; i < size; ++i) { + int xx = x * stride + i - size / 2; + if (xx < 0 || xx >= inputWidth) { + continue; + } + for (int j = 0; j < size; ++j) { + int yy = y * stride + j - size / 2; + if (yy < 0 || yy >= inputHeight) { + continue; + } + float d = data[ch * inputWidth * inputHeight + xx * inputHeight + yy]; + if (max < d) { + max = d; + } + } + } + result[chxy] = max; + +} + +__kernel void backword( + int inputWidth, + int inputHeight, + int outputWidth, + int outputHeight, + int size, + int stride, + __global const float* input, + __global const float* delta, + __global float* newDelta, + int count +){ + int chxy = get_global_id(0); + if(chxy >= count){ + return; + } + + int ch = chxy / (inputWidth * inputHeight); + int xi = (chxy % (inputWidth * inputHeight)) / inputHeight; + int yi = chxy % inputHeight; + + float nd = 0; + for(int x = max(0, (xi - size / 2) / stride - 1); + x < min(outputWidth, (xi + size / 2) / stride + 1); ++x){ + for(int y = 
max(0, (yi - size / 2) / stride - 1); + y < min(outputHeight, (yi + size / 2) / stride + 1); ++y){ + float max = -INFINITY; + int maxX = 0; + int maxY = 0; + for (int i = 0; i < size; ++i) { + int xx = x * stride + i - size / 2; + if (xx < 0 || xx >= inputWidth) { + continue; + } + for (int j = 0; j < size; ++j) { + int yy = y * stride + j - size / 2; + if (yy < 0 || yy >= inputHeight) { + continue; + } + float d = input[ch * inputWidth * inputHeight + xx * inputWidth + yy]; + if (max < d) { + max = d; + maxX = xx; + maxY = yy; + } + } + } + if(maxX == xi && maxY == yi){ + nd += delta[ch * outputWidth * outputHeight + x * outputHeight + y]; + } + } + } + newDelta[chxy] = nd; +} \ No newline at end of file diff --git a/src/main/resources/kernels/multi_normalize.cl b/src/main/resources/kernels/multi_normalize.cl new file mode 100644 index 0000000..ad5b87e --- /dev/null +++ b/src/main/resources/kernels/multi_normalize.cl @@ -0,0 +1,76 @@ +__kernel void average( + int inputChannels, + int inputWidth, + int inputHeight, + int size, + float threshold, + __global const float* input, + __global float* averages, + __global float* stds, + int len +){ + int xy = get_global_id(0); + if(xy >= len){ + return; + } + int x = xy / inputHeight; + int y = xy % inputHeight; + + float total = 0; + int count = 0; + for(int i = 0; i < size; ++i){ + int xx = x + i - size / 2; + if(xx < 0 || xx >= inputWidth){ + continue; + } + for(int j = 0; j < size; ++j){ + int yy = y + j - size / 2; + if(yy < 0 || yy >= inputHeight){ + continue; + } + for(int ch = 0; ch < inputChannels; ++ch){ + total += input[ch * inputHeight * inputWidth + xy]; + ++count; + } + } + } + float average = total / count; + float variance = 0; + for(int i = 0; i < size; ++i){ + int xx = x + i - size / 2; + if(xx < 0 || xx >= inputWidth){ + continue; + } + for(int j = 0; j < size; ++j){ + int yy = y + j - size / 2; + if(yy < 0 || yy >= inputHeight){ + continue; + } + for(int ch = 0; ch < inputChannels; ++ch){ + float data = 
input[ch * inputHeight * inputWidth + xy]; + variance += (data - average) * (data - average); + } + } + } + averages[xy] = average; + stds[xy] = max(threshold, sqrt(variance / count)); + +} + +__kernel void forward( + int inputChannels, + int inputWidth, + int inputHeight, + __global const float* input, + __global const float* averages, + __global const float* stds, + __global float* result, + int count +){ + int chxy = get_global_id(0); + if(chxy >= count){ + return; + } + int xy = chxy % (inputWidth * inputHeight); + result[chxy] = (input[chxy] - averages[xy]) / stds[xy]; +} \ No newline at end of file diff --git a/src/main/resources/tinynet_def.json b/src/main/resources/tinynet_def.json index 6723860..c54e1cd 100644 --- a/src/main/resources/tinynet_def.json +++ b/src/main/resources/tinynet_def.json @@ -73,7 +73,7 @@ "initBias" : 1.0, "dropoutRate" : 1.0, "activation" : "SoftMaxFunction", - "useGpu" : false + "useGpu" : true } } ] } \ No newline at end of file