diff --git a/.gitignore b/.gitignore
index 718edb6..74785ae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
/target/
/nb-configuration.xml
-/nbactions*.xml
\ No newline at end of file
+/nbactions*.xml
+/*.log
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 6f79c84..8e06b50 100644
--- a/pom.xml
+++ b/pom.xml
@@ -14,13 +14,23 @@
com.fasterxml.jackson.core
jackson-databind
- 2.9.7
+ 2.10.0
org.projectlombok
lombok
1.18.2
+
+ org.jogamp.gluegen
+ gluegen-rt-main
+ 2.3.2
+
+
+ org.jogamp.jocl
+ jocl-main
+ 2.3.2
+
@@ -40,4 +50,4 @@
1.8
1.8
-
\ No newline at end of file
+
diff --git a/src/main/java/kishida/cnn/ConvolutionalNet.java b/src/main/java/kishida/cnn/ConvolutionalNet.java
index d8acaeb..c3ff5d6 100644
--- a/src/main/java/kishida/cnn/ConvolutionalNet.java
+++ b/src/main/java/kishida/cnn/ConvolutionalNet.java
@@ -8,6 +8,8 @@
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.io.Writer;
@@ -59,6 +61,8 @@ public class ConvolutionalNet {
public static final String AVERAGE_PNG = "average.png";
private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\alexnet.json.txt";
private static final String RESOURCE_NAME = "/alexnet_def.json";
+ //private static final String FILENAME = "C:\\Users\\naoki\\Desktop\\tinynet.json.txt";
+ //private static final String RESOURCE_NAME = "/tinynet_def.json";
static class Img{
@@ -89,8 +93,9 @@ BufferedImage readImage(){
static List historyData = new ArrayList<>();
static LinkedList rateData = new LinkedList<>();
+ @SuppressWarnings({"ThrowableInstanceNotThrown", "ThrowableInstanceNeverThrown"})
public static void main(String[] args) throws IOException {
- System.setProperty("com.amd.aparapi.enableShowGeneratedOpenCL", "false");
+ System.setProperty("com.aparapi.enableShowGeneratedOpenCL", "false");
String def = "C:\\Users\\naoki\\Desktop\\sampleimg288";
Path dir = Paths.get(args.length > 0 ? args[0] : def);
List categories = Files.list(dir)
@@ -184,14 +189,15 @@ public static void main(String[] args) throws IOException {
NeuralNetwork nn;
- /*
- try(InputStream is = ConvolutionalNet.class.getResourceAsStream(RESOURCE_NAME);
- InputStreamReader isr = new InputStreamReader(is)){
- nn = NeuralNetwork.readFromJson(isr);
- }*/
-
- try(Reader r = Files.newBufferedReader(Paths.get(FILENAME))){
- nn = NeuralNetwork.readFromJson(r);
+ if(true){
+ try(InputStream is = ConvolutionalNet.class.getResourceAsStream(RESOURCE_NAME);
+ InputStreamReader isr = new InputStreamReader(is)){
+ nn = NeuralNetwork.readFromJson(isr);
+ }
+ }else{
+ try(Reader r = Files.newBufferedReader(Paths.get(FILENAME))){
+ nn = NeuralNetwork.readFromJson(r);
+ }
}
nn.init();
@@ -285,23 +291,13 @@ public static void main(String[] args) throws IOException {
Image lineGraph = createLineGraph(500, 200,
historyData, 1, 0);
historyLabel.setIcon(new ImageIcon(lineGraph));
- //一段目のフィルタの表示
- //全結合一段の表示
- firstFc.setIcon(new ImageIcon(createGraph(256, 128, fc1.getResult())));
- //全結合二段の表示
- lastResult.setIcon(new ImageIcon(createGraph(256, 128, output)));
-
- firstBias.setIcon(new ImageIcon(createGraph(500, 128, conv1.getBias())));
- secondBias.setIcon(new ImageIcon(createGraph(500, 128,
- conv2.getBias())));
- fc1Bias.setIcon(new ImageIcon(createGraph(500, 128, fc1.getBias())));
- fc2Bias.setIcon(new ImageIcon(createGraph(500, 128, fc2.getBias())));
//System.out.println(Arrays.stream(output).mapToObj(d -> String.format("%.2f", d)).collect(Collectors.joining(",")));
count[0]++;
nn.setImageIndex(nn.getImageIndex() + 1);
if(count[0] >= MINI_BATCH){
+
nn.joinBatch();
batchCount[0]++;
System.out.printf("%5d %4d %.2f/m %s %s%n", batchCount[0],
@@ -319,7 +315,7 @@ public static void main(String[] args) throws IOException {
System.out.printf("weight: %.2f~%.2f average %.2f ",
ws.getMin(), ws.getMax(), ws.getAverage());
DoubleSummaryStatistics bs = ((LerningLayer)layer).getBiasStatistics();
- System.out.printf("bias: %.2f~%.2f average %.2f ",
+ System.out.printf("bias: %.8f~%.8f average %.2f ",
bs.getMin(), bs.getMax(), bs.getAverage());
}
System.out.println();
@@ -329,6 +325,18 @@ public static void main(String[] args) throws IOException {
pStart[0] = System.currentTimeMillis();
nn.prepareBatch();
+ //一段目のフィルタの表示
+ //全結合一段の表示
+ firstFc.setIcon(new ImageIcon(createGraph(256, 128, fc1.getResult())));
+ //全結合二段の表示
+ lastResult.setIcon(new ImageIcon(createGraph(256, 128, output)));
+
+ firstBias.setIcon(new ImageIcon(createGraph(500, 128, conv1.getBias())));
+ secondBias.setIcon(new ImageIcon(createGraph(500, 128,
+ conv2.getBias())));
+ fc1Bias.setIcon(new ImageIcon(createGraph(500, 128, fc1.getBias())));
+ fc2Bias.setIcon(new ImageIcon(createGraph(500, 128, fc2.getBias())));
+
// 1時間に一回保存する
int hour = LocalTime.now().getHour();
if(lastHour[0] != hour){
diff --git a/src/main/java/kishida/cnn/NeuralNetwork.java b/src/main/java/kishida/cnn/NeuralNetwork.java
index 640839d..809adad 100644
--- a/src/main/java/kishida/cnn/NeuralNetwork.java
+++ b/src/main/java/kishida/cnn/NeuralNetwork.java
@@ -10,11 +10,13 @@
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
+import com.jogamp.opencl.CLBuffer;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
+import java.nio.FloatBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -23,6 +25,7 @@
import java.util.stream.IntStream;
import kishida.cnn.activation.LogisticFunction;
import kishida.cnn.layers.ConvolutionLayer;
+import kishida.cnn.layers.FullGpuEnabled;
import kishida.cnn.layers.FullyConnect;
import kishida.cnn.layers.InputLayer;
import kishida.cnn.layers.MaxPoolingLayer;
@@ -165,8 +168,23 @@ public float[] forward(float[] readData, float[] correctData){
delta[idx] = correctData[idx] - output[idx];
}
//逆伝播
+ CLBuffer bufDelta = null;
for(int i = layers.size() - 1; i >= 1; --i){
- delta = layers.get(i).backward(delta);
+ FullGpuEnabled layer = layers.get(i) instanceof FullGpuEnabled ?
+ (FullGpuEnabled) layers.get(i) : null;
+ FullGpuEnabled pre = layers.get(i).getPreLayer() instanceof FullGpuEnabled ?
+ (FullGpuEnabled)layers.get(i).getPreLayer() : null;
+ if(true && layer != null && pre != null && layer.isUseGpu()){
+ if(bufDelta == null){
+ bufDelta = layer.backwardBuf(pre.getBufResult(), delta);
+ }else{
+ bufDelta = layer.backwardBuf(pre.getBufResult(), bufDelta);
+ }
+ delta = null;
+ }else{
+ delta = layers.get(i).backward(delta);
+ bufDelta = null;
+ }
}
return output;
diff --git a/src/main/java/kishida/cnn/activation/ActivationFunction.java b/src/main/java/kishida/cnn/activation/ActivationFunction.java
index 4768adc..efbc29b 100644
--- a/src/main/java/kishida/cnn/activation/ActivationFunction.java
+++ b/src/main/java/kishida/cnn/activation/ActivationFunction.java
@@ -5,6 +5,7 @@
*/
package kishida.cnn.activation;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonSubTypes;
import com.fasterxml.jackson.annotation.JsonTypeInfo;
@@ -28,4 +29,6 @@ public void applyAfter(float[] values) {
/** 微分 */
public abstract float diff(float value);
+ @JsonIgnore
+ public abstract String getName();
}
diff --git a/src/main/java/kishida/cnn/activation/LimitedRectifiedLinear.java b/src/main/java/kishida/cnn/activation/LimitedRectifiedLinear.java
index 4d37ccb..7c4c221 100644
--- a/src/main/java/kishida/cnn/activation/LimitedRectifiedLinear.java
+++ b/src/main/java/kishida/cnn/activation/LimitedRectifiedLinear.java
@@ -23,4 +23,9 @@ public float diff(float value) {
return value >= 0 && value <= 2 ? 1 : 0;
}
+ @Override
+ public String getName() {
+ return "limitrelu";
+ }
+
}
diff --git a/src/main/java/kishida/cnn/activation/LinearFunction.java b/src/main/java/kishida/cnn/activation/LinearFunction.java
index 5bb1ef2..b46eb2a 100644
--- a/src/main/java/kishida/cnn/activation/LinearFunction.java
+++ b/src/main/java/kishida/cnn/activation/LinearFunction.java
@@ -20,5 +20,9 @@ public float apply(float value) {
public float diff(float value) {
return 1;
}
+ @Override
+ public String getName() {
+ return "linear";
+ }
}
diff --git a/src/main/java/kishida/cnn/activation/LogisticFunction.java b/src/main/java/kishida/cnn/activation/LogisticFunction.java
index 637b0f0..577468c 100644
--- a/src/main/java/kishida/cnn/activation/LogisticFunction.java
+++ b/src/main/java/kishida/cnn/activation/LogisticFunction.java
@@ -17,5 +17,9 @@ public float apply(float value) {
public float diff(float value) {
return value * (1 - value);
}
+ @Override
+ public String getName() {
+ return "logistic";
+ }
}
diff --git a/src/main/java/kishida/cnn/activation/RectifiedLinear.java b/src/main/java/kishida/cnn/activation/RectifiedLinear.java
index b37e663..c620a0b 100644
--- a/src/main/java/kishida/cnn/activation/RectifiedLinear.java
+++ b/src/main/java/kishida/cnn/activation/RectifiedLinear.java
@@ -17,5 +17,9 @@ public float apply(float value) {
public float diff(float value) {
return value >= 0 ? 1 : 0;
}
+ @Override
+ public String getName() {
+ return "relu";
+ }
}
diff --git a/src/main/java/kishida/cnn/activation/SoftMaxFunction.java b/src/main/java/kishida/cnn/activation/SoftMaxFunction.java
index b6e9875..86ad50e 100644
--- a/src/main/java/kishida/cnn/activation/SoftMaxFunction.java
+++ b/src/main/java/kishida/cnn/activation/SoftMaxFunction.java
@@ -30,4 +30,9 @@ public float diff(float value) {
return value * (1 - value);
}
+ @Override
+ public String getName() {
+ return "softmax";
+ }
+
}
diff --git a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordDeltaKernel.java b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordDeltaKernel.java
index 2666102..5412038 100644
--- a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordDeltaKernel.java
+++ b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordDeltaKernel.java
@@ -51,7 +51,6 @@ private void proc(int chxxyy) {
}
newDelta[chxxyy] = tempDelta;
}
- float[] input;
float[] result;
int inputChannels;
int inputWidth;
@@ -65,11 +64,10 @@ private void proc(int chxxyy) {
float[] delta;
float[] newDelta;
- public float[] backword(float[] input, float[] delta, float[] result,
+ public float[] backword(float[] delta, float[] result,
int inputChannels, int inputWidth, int inputHeight,
float[] filter, int outputChannels, int outputWidth, int outputHeight,
- int filterSize, int stride, boolean useGpu) {
- this.input = input;
+ int filterSize, int stride, float[] newDelta, boolean useGpu) {
this.delta = delta;
this.inputChannels = inputChannels;
this.inputWidth = inputWidth;
@@ -81,12 +79,11 @@ public float[] backword(float[] input, float[] delta, float[] result,
this.filterSize = filterSize;
this.stride = stride;
this.result = result;
- this.newDelta = new float[inputChannels * inputWidth * inputHeight];
+ this.newDelta = newDelta;
if (useGpu) {
put(filter);
put(delta);
put(result);
- put(input);
execute(inputChannels * inputWidth * inputHeight);
get(newDelta);
} else {
diff --git a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordFilterKernel.java b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordFilterKernel.java
index 982d455..771dded 100644
--- a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordFilterKernel.java
+++ b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordFilterKernel.java
@@ -44,7 +44,7 @@ private void proc(int fchij) {
}
}
}
- filter[fchij] += df;
+ filterDelta[fchij] += df;
}
float[] input;
float[] result;
@@ -52,7 +52,7 @@ private void proc(int fchij) {
int inputChannels;
int inputWidth;
int inputHeight;
- float[] filter;
+ float[] filterDelta;
int outputChannels;
int outputWidth;
int outputHeight;
@@ -62,14 +62,14 @@ private void proc(int fchij) {
public void backword(float[] delta, float[] result,
float[] input, int inputChannels, int inputWidth, int inputHeight,
- float[] filter, int outputChannels, int outputWidth, int outputHeight,
+ float[] filterDelta, int outputChannels, int outputWidth, int outputHeight,
int filterSize, int stride, float learningRate, boolean useGpu) {
this.input = input;
this.delta = delta;
this.inputChannels = inputChannels;
this.inputWidth = inputWidth;
this.inputHeight = inputHeight;
- this.filter = filter;
+ this.filterDelta = filterDelta;
this.outputChannels = outputChannels;
this.outputWidth = outputWidth;
this.outputHeight = outputHeight;
@@ -79,11 +79,11 @@ public void backword(float[] delta, float[] result,
this.learningRate = learningRate;// / outputWidth;// * outputHeight);
if (useGpu) {
put(delta);
- put(filter);
+ put(filterDelta);
put(input);
put(result);
execute(outputChannels * inputChannels * filterSize * filterSize);
- get(filter);
+ get(filterDelta);
} else {
IntStream.range(0, outputChannels).parallel().forEach((f) -> {
for (int chij = 0; chij < inputChannels * filterSize * filterSize; ++chij) {
diff --git a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordKernel.java b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordKernel.java
index f867aeb..efd8d72 100644
--- a/src/main/java/kishida/cnn/kernels/ConvolutionBackwordKernel.java
+++ b/src/main/java/kishida/cnn/kernels/ConvolutionBackwordKernel.java
@@ -62,7 +62,7 @@ private void proc(int fxy) {
int outputHeight;
int filterSize;
int stride;
- float[] bias;
+ //float[] bias;
float[] delta;
float learningRate;
float[] tempDelta;
@@ -74,7 +74,7 @@ public float[] backward(float[] delta, float[] result,
float[] input, int inputChannels, int inputWidth, int inputHeight,
float[] filter, int outputChannels, int outputWidth, int outputHeight,
float[] filterDelta, float[] biasDelta,
- int filterSize, int stride, float[] bias, float learningRate, boolean useGpu) {
+ int filterSize, int stride, float learningRate, boolean useGpu) {
this.delta = delta;
this.input = input;
this.inputChannels = inputChannels;
@@ -86,7 +86,7 @@ public float[] backward(float[] delta, float[] result,
this.outputHeight = outputHeight;
this.filterSize = filterSize;
this.stride = stride;
- this.bias = bias;
+ //this.bias = bias;
this.result = result;
this.tempDelta = new float[outputChannels * inputChannels * inputWidth * inputHeight];
this.learningRate = learningRate;// / (outputWidth * outputHeight);
diff --git a/src/main/java/kishida/cnn/kernels/ConvolutionLocalNormalizationKernel.java b/src/main/java/kishida/cnn/kernels/ConvolutionLocalNormalizationKernel.java
index dd50c82..dd33cce 100644
--- a/src/main/java/kishida/cnn/kernels/ConvolutionLocalNormalizationKernel.java
+++ b/src/main/java/kishida/cnn/kernels/ConvolutionLocalNormalizationKernel.java
@@ -23,8 +23,8 @@ public void localNormalization(float[] result, int outputChannels, int outputWid
this.outputChannels = outputChannels;
this.outputWidth = outputWidth;
this.outputHeight = outputHeight;
- if(useGpu && outputWidth * outputHeight > 500){
- execute(outputWidth * outputHeight);
+ if(useGpu){
+ execute(outputChannels * outputWidth * outputHeight);
//throw new UnsupportedOperationException("because I dont know how to use private memory.");
}else{
IntStream.range(0, outputWidth).parallel().forEach(x -> {
@@ -38,8 +38,8 @@ public void localNormalization(float[] result, int outputChannels, int outputWid
@Override
public void run() {
- int xy = getGlobalId();
- procGpu(xy);
+ int chxy = getGlobalId();
+ procGpu(chxy);
}
float[] result;
@@ -48,30 +48,22 @@ public void run() {
int outputChannels;
static final int n = 5;
- @PrivateMemorySpace(n) float[] sigma = new float[n]; // not work
+ //@PrivateMemorySpace(n) float[] sigma = new float[n]; // not work
- public void procGpu(int xy){
+ public void procGpu(int chxy){
final int k = 2;
final float a = 0.0001f;
final float b = 0.75f;
- int lp = 0;
- for(; lp < n / 2; ++lp){
- sigma[lp] =
- result[lp * outputWidth * outputHeight + xy] *
- result[lp * outputWidth * outputHeight + xy];
- }
- for(int ch = 0; ch < outputChannels; ++ch){
- sigma[lp % n] = lp >= outputChannels ? 0 :
- result[lp * outputWidth * outputHeight + xy] *
- result[lp * outputWidth * outputHeight + xy];
- lp = lp + 1;
- float sum = 0;
- for(int i = 0; i < n; ++i){
- sum += sigma[i];
- }
- result[ch * outputWidth * outputHeight + xy] = result[ch * outputWidth * outputHeight + xy] /
- pow(k + a * sum, b);
+ int ch = chxy / (outputWidth * outputHeight);
+ int xy = chxy % (outputWidth * outputHeight);
+
+ float sum = 0;
+ for(int lp = max(0, ch - n / 2); lp <= min(outputChannels - 1, ch + n / 2); ++lp){
+ sum += result[lp * outputWidth * outputHeight + xy] *
+ result[lp * outputWidth * outputHeight + xy];
}
+ result[chxy] = result[chxy] /
+ pow(k + a * sum, b);
}
diff --git a/src/main/java/kishida/cnn/kernels/FullyForwardKernel.java b/src/main/java/kishida/cnn/kernels/FullyForwardKernel.java
index ffaabb9..2c57ac0 100644
--- a/src/main/java/kishida/cnn/kernels/FullyForwardKernel.java
+++ b/src/main/java/kishida/cnn/kernels/FullyForwardKernel.java
@@ -26,10 +26,13 @@ public void run() {
private void proc(int j){
if(dropout[j] == 1){
+ float r = 0;
for (int i = 0; i < inSize; ++i) {
- result[j] += in[i] * weight[i * out + j];
+ r += in[i] * weight[i * out + j];
}
- result[j] += bias[j];
+ result[j] = r + bias[j];
+ }else{
+ result[j] = 0;
}
}
int out;
diff --git a/src/main/java/kishida/cnn/kernels/NormalizeKernel.java b/src/main/java/kishida/cnn/kernels/NormalizeKernel.java
index ff5e854..d4bf793 100644
--- a/src/main/java/kishida/cnn/kernels/NormalizeKernel.java
+++ b/src/main/java/kishida/cnn/kernels/NormalizeKernel.java
@@ -61,11 +61,7 @@ private void proc(int chxy) {
}
float std = max(threshold, sqrt(variance / count));
result[chxy] = (input[chxy] - average) / std;
- averages[chxy] = average;
- rates[chxy] = std;
}
- float[] averages;
- float[] rates;
float[] result;
float[] input;
int inputChannels;
@@ -75,11 +71,9 @@ private void proc(int chxy) {
float threshold;
public float[] normalize(float[] input, int inputChannels, int inputWidth, int inputHeight,
- int size, float[] averages, float[] rates, float threshold, boolean useGpu) {
+ int size, float threshold, float[] result, boolean useGpu) {
this.input = input;
- this.rates = rates;
- this.result = new float[inputChannels * inputWidth * inputHeight];
- this.averages = averages;
+ this.result = result;
this.inputChannels = inputChannels;
this.inputWidth = inputWidth;
this.inputHeight = inputHeight;
@@ -88,8 +82,6 @@ public float[] normalize(float[] input, int inputChannels, int inputWidth, int i
if (useGpu) {
put(input);
execute(inputChannels * inputWidth * inputHeight);
- get(averages);
- get(rates);
get(result);
} else {
IntStream.range(0, inputChannels).parallel().forEach(ch -> {
diff --git a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java
index a63476e..1e1ac22 100644
--- a/src/main/java/kishida/cnn/layers/ConvolutionLayer.java
+++ b/src/main/java/kishida/cnn/layers/ConvolutionLayer.java
@@ -7,9 +7,11 @@
import com.aparapi.Kernel;
import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
-import java.util.Arrays;
+import com.jogamp.opencl.CLBuffer;
+import java.nio.FloatBuffer;
import java.util.DoubleSummaryStatistics;
import java.util.stream.IntStream;
import kishida.cnn.activation.ActivationFunction;
@@ -20,35 +22,49 @@
import kishida.cnn.kernels.ConvolutionBackwordKernel;
import kishida.cnn.kernels.ConvolutionForwardKernel;
import kishida.cnn.kernels.ConvolutionLocalNormalizationKernel;
+import kishida.cnn.opencl.ConvolutionBackwordCL;
+import kishida.cnn.opencl.ConvolutionForwardCL;
+import kishida.cnn.opencl.OpenCL;
import kishida.cnn.util.FloatUtil;
import lombok.Getter;
import lombok.Setter;
/** 畳み込み層 */
-public class ConvolutionLayer extends ImageNeuralLayer implements LerningLayer{
+public class ConvolutionLayer extends ImageNeuralLayer implements LerningLayer, FullGpuEnabled{
@JsonInclude(JsonInclude.Include.NON_NULL)
- @Getter
float[] filter;
+
@JsonInclude(JsonInclude.Include.NON_NULL)
- @Getter
float[] bias;
@JsonInclude(JsonInclude.Include.NON_NULL)
- @Getter
float[] filterDelta;
@JsonInclude(JsonInclude.Include.NON_NULL)
- @Getter
float[] biasDelta;
@Getter
int stride;
@Getter
int filterSize;
private ActivationFunction activation;
+ @JsonProperty
@Getter @Setter
boolean useGpu;
@Getter
float initBias;
float[] tempDelta;
+ float[] newDelta;
+
+ CLBuffer bufFilter;
+ CLBuffer bufBias;
+ CLBuffer bufFilterDelta;
+ CLBuffer bufBiasDelta;
+ CLBuffer bufDelta;
+ CLBuffer bufNewDelta;
+ CLBuffer bufTempBias;
+
+ @JsonIgnore
+ @Getter
+ CLBuffer bufResult;
public ConvolutionLayer(String name,
int filterCount, int size, int stride, float initBias, boolean useGpu) {
@@ -108,44 +124,104 @@ public final void setPreLayer(NeuralLayer preLayer) {
this.result = new float[outputChannels * outputWidth * outputHeight];
this.tempDelta = new float[result.length];
+ this.newDelta = new float[inputChannels * inputWidth * inputHeight];
+
+ if(true){
+ this.bufFilter = OpenCL.createReadWriteBuffer(filter);
+ this.bufBias = OpenCL.createReadWriteBuffer(bias);
+ this.bufFilterDelta = OpenCL.createReadWriteBuffer(filterDelta);
+ this.bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta);
+ this.bufResult = OpenCL.createReadWriteBuffer(result.length);
+ this.bufDelta = OpenCL.createReadWriteBuffer(result.length);
+ this.bufNewDelta = OpenCL.createWriteBuffer(newDelta.length);
+ bufTempBias = OpenCL.createReadWriteBuffer(outputChannels * outputWidth * outputHeight);
+ OpenCL.getQueue()
+ .putWriteBuffer(bufFilter, false)
+ .putWriteBuffer(bufBias, false)
+ .putWriteBuffer(bufFilterDelta, false)
+ .putWriteBuffer(bufBiasDelta, false);
+ }
+ }
+
+ public float[] getFilter() {
+ if(bufFilter != null){
+ OpenCL.getQueue().putReadBuffer(bufFilter, true);
+ bufFilter.getBuffer().get(filter).rewind();
+ }
+ return filter;
+ }
+
+ public float[] getBias() {
+ if(bufBias != null){
+ OpenCL.getQueue().putReadBuffer(bufBias, true);
+ bufBias.getBuffer().get(bias).rewind();
+ }
+ return bias;
+ }
+
+ public float[] getFilterDelta() {
+ if(bufFilterDelta != null){
+ OpenCL.getQueue().putReadBuffer(bufFilterDelta, true);
+ bufFilterDelta.getBuffer().get(filterDelta).rewind();
+ }
+ return filterDelta;
+ }
+
+ public float[] getBiasDelta() {
+ if(bufBiasDelta != null){
+ OpenCL.getQueue().putReadBuffer(bufBiasDelta, true);
+ bufBiasDelta.getBuffer().get(biasDelta).rewind();
+ }
+ return biasDelta;
}
- /** 畳み込みフィルタを適用する */
@Override
- public float[] forward(float[] img) {
- result = ConvolutionForwardKernel.INSTANCE.forward(img, inputChannels, inputWidth, inputHeight,
- filter, outputChannels, outputWidth, outputHeight, result, filterSize, stride, bias, activation, useGpu);
- //localNormalization(result);
- ConvolutionLocalNormalizationKernel.INSTANCE.localNormalization(result,
- outputChannels, outputWidth, outputHeight, false);
+ public float[] getResult() {
+ if(bufResult != null){
+ OpenCL.getQueue().putReadBuffer(bufResult, true);
+ bufResult.getBuffer().get(result).rewind();
+ }
return result;
}
- private void localNormalization(float[] result){
- final int n = 5;
- final int k = 2;
- final float a = 0.0001f;
- final float b = 0.75f;
- // resultをコピーするほうが楽だけど、メモリを節約するため
- final float[] sigma = new float[n];
- for(int x = 0; x < outputWidth; ++x){
- for(int y = 0; y < outputHeight; ++y){
- int xy = x * outputHeight + y;
- Arrays.fill(sigma, 0);
- int lp = 0;
- for(; lp < n / 2; ++lp){
- sigma[lp] = result[lp * outputWidth * outputHeight + xy] * result[lp * outputWidth * outputHeight + xy];
- }
- for(int ch = 0; ch < outputChannels; ++ch){
- sigma[lp % 5] = lp >= outputChannels ? 0 :
- result[lp * outputWidth * outputHeight + xy] * result[lp * outputWidth * outputHeight + xy];
- lp = lp + 1;
- float sum = FloatUtil.floatSum(sigma);
- result[ch * outputWidth * outputHeight + xy] = result[ch * outputWidth * outputHeight + xy] /
- (float)Math.pow(k + a * sum, b);
+ /** 畳み込みフィルタを適用する */
+ @Override
+ public float[] forward(float[] img) {
+ if(true){
+ if(false){
+ // aparapi
+ result = ConvolutionForwardKernel.INSTANCE.forward(img, inputChannels, inputWidth, inputHeight,
+ filter, outputChannels, outputWidth, outputHeight, result, filterSize, stride, bias, activation, false);
+ //localNormalization(result);
+ ConvolutionLocalNormalizationKernel.INSTANCE.localNormalization(result,
+ outputChannels, outputWidth, outputHeight, false);
+ } else{
+ // JOCL
+ if(true){
+ ConvolutionForwardCL.INSTANCE.forward(img, inputChannels, inputWidth, inputHeight,
+ bufFilter, outputChannels, outputWidth, outputHeight, result, filterSize, stride, bufBias);
+ } else {
+ ConvolutionForwardCL.INSTANCE.forward(img, inputChannels, inputWidth, inputHeight,
+ filter, outputChannels, outputWidth, outputHeight, result, filterSize, stride, bias);
}
}
+ }else {
+ //CPU
+ result = ConvolutionForwardKernel.INSTANCE.forward(img, inputChannels, inputWidth, inputHeight,
+ filter, outputChannels, outputWidth, outputHeight, result, filterSize, stride, bias, activation, false);
+ //localNormalization(result);
+ ConvolutionLocalNormalizationKernel.INSTANCE.localNormalization(result,
+ outputChannels, outputWidth, outputHeight, false);
}
+ return result;
+ }
+
+ @Override
+ public void forward(CLBuffer input) {
+ ConvolutionForwardCL.INSTANCE.forward(input,
+ inputChannels, inputWidth, inputHeight,
+ bufFilter, outputChannels, outputWidth, outputHeight,
+ bufResult, filterSize, stride, bufBias);
}
/** 畳み込み層の学習 */
@@ -153,24 +229,50 @@ private void localNormalization(float[] result){
public float[] backward(float[] input, float[] delta) {
if (useGpu) {
// GPUバージョン
- float[] newDelta = ConvolutionBackwordDeltaKernel.INSTANCE.backword(input, delta, result,
- inputChannels, inputWidth, inputHeight,
- filter, outputChannels, outputWidth, outputHeight, filterSize, stride, useGpu);
- ConvolutionBackwordFilterKernel.INSTANCE.backword(delta, result,
- input, inputChannels, inputWidth, inputHeight,
- filterDelta, outputChannels, outputWidth, outputHeight, filterSize, stride, parent.getLearningRate(), useGpu);
- ConvolutionBackwordBiasKernel.INSTANCE.backwordBias(delta, result,
- outputChannels, outputWidth, outputHeight, biasDelta, parent.getLearningRate(), tempDelta, useGpu);
- if (ConvolutionBackwordDeltaKernel.INSTANCE.getExecutionMode() != Kernel.EXECUTION_MODE.GPU ||
- ConvolutionBackwordFilterKernel.INSTANCE.getExecutionMode() != Kernel.EXECUTION_MODE.GPU ||
- ConvolutionBackwordBiasKernel.INSTANCE.getExecutionMode() != Kernel.EXECUTION_MODE.GPU) {
- useGpu = false;
- }
- if (!useGpu) {
- System.out.println("Can't use GPU on " + name);
- System.out.println("delta" + ConvolutionBackwordDeltaKernel.INSTANCE.getExecutionMode());
- System.out.println("filter" + ConvolutionBackwordFilterKernel.INSTANCE.getExecutionMode());
- System.out.println("bias" + ConvolutionBackwordBiasKernel.INSTANCE.getExecutionMode());
+ if(false){
+ // aparapi
+ ConvolutionBackwordDeltaKernel.INSTANCE.backword(delta, result,
+ inputChannels, inputWidth, inputHeight,
+ filter, outputChannels, outputWidth, outputHeight,
+ filterSize, stride, newDelta, useGpu);
+ ConvolutionBackwordFilterKernel.INSTANCE.backword(delta, result,
+ input, inputChannels, inputWidth, inputHeight,
+ filterDelta, outputChannels, outputWidth, outputHeight, filterSize, stride, parent.getLearningRate(), useGpu);
+ ConvolutionBackwordBiasKernel.INSTANCE.backwordBias(delta, result,
+ outputChannels, outputWidth, outputHeight, biasDelta, parent.getLearningRate(), tempDelta, useGpu);
+ if (ConvolutionBackwordDeltaKernel.INSTANCE.getExecutionMode() != Kernel.EXECUTION_MODE.GPU ||
+ ConvolutionBackwordFilterKernel.INSTANCE.getExecutionMode() != Kernel.EXECUTION_MODE.GPU ||
+ ConvolutionBackwordBiasKernel.INSTANCE.getExecutionMode() != Kernel.EXECUTION_MODE.GPU) {
+ useGpu = false;
+ }
+ if (!useGpu) {
+ System.out.println("Can't use GPU on " + name);
+ System.out.println("delta" + ConvolutionBackwordDeltaKernel.INSTANCE.getExecutionMode());
+ System.out.println("filter" + ConvolutionBackwordFilterKernel.INSTANCE.getExecutionMode());
+ System.out.println("bias" + ConvolutionBackwordBiasKernel.INSTANCE.getExecutionMode());
+ }
+ }else{
+ // JOCL
+ if(true){
+ bufDelta.getBuffer().put(delta).rewind();
+ OpenCL.getQueue().putWriteBuffer(bufDelta, false);
+
+ ConvolutionBackwordCL.INSTANCE.backward(
+ bufDelta, bufResult, ((FullGpuEnabled)preLayer).getBufResult(),
+ inputChannels, inputWidth, inputHeight,
+ bufFilter, outputChannels, outputWidth, outputHeight,
+ bufFilterDelta, bufBiasDelta, bufTempBias, filterSize, stride, bufNewDelta,
+ parent.getLearningRate());
+
+ OpenCL.getQueue().putReadBuffer(bufNewDelta, true);
+ bufNewDelta.getBuffer().get(newDelta).rewind();
+ }else{
+ ConvolutionBackwordCL.INSTANCE.backward(
+ delta, result, input,
+ inputChannels, inputWidth, inputHeight,
+ filter, outputChannels, outputWidth, outputHeight,
+ filterDelta, biasDelta, filterSize, stride, newDelta, parent.getLearningRate());
+ }
}
return newDelta;
} else {
@@ -179,23 +281,55 @@ public float[] backward(float[] input, float[] delta) {
input, inputChannels, inputWidth, inputHeight,
filter, outputChannels, outputWidth, outputHeight,
filterDelta, biasDelta,
- filterSize, stride, bias, parent.getLearningRate(), false);
+ filterSize, stride, parent.getLearningRate(), false);
}
}
+ @Override
+ public CLBuffer backwardBuf(CLBuffer bufInput, CLBuffer bufDelta) {
+ ConvolutionBackwordCL.INSTANCE.backward(bufDelta, bufResult, bufInput,
+ inputChannels, inputWidth, inputHeight,
+ bufFilter, outputChannels, outputWidth, outputHeight,
+ bufFilterDelta, bufBiasDelta, bufTempBias,
+ filterSize, stride, bufNewDelta, parent.getLearningRate());
+ return bufNewDelta;
+ }
+
@Override
public void prepareBatch() {
- float momentam = parent.getMomentam();
- IntStream.range(0, filterDelta.length).parallel().forEach(i -> filterDelta[i] = filterDelta[i] * momentam);
- IntStream.range(0, biasDelta.length).parallel().forEach(i -> biasDelta[i] = biasDelta[i] * momentam);
+ if(useGpu){
+ ConvolutionBackwordCL.INSTANCE.prepare(parent.getMomentam(),
+ filterDelta.length, biasDelta.length, bufFilterDelta, bufBiasDelta);
+ }else{
+ float momentam = parent.getMomentam();
+ IntStream.range(0, filterDelta.length).parallel().forEach(i -> filterDelta[i] = filterDelta[i] * momentam);
+ IntStream.range(0, biasDelta.length).parallel().forEach(i -> biasDelta[i] = biasDelta[i] * momentam);
+ }
}
@Override
public void joinBatch() {
- float count = parent.getMiniBatch();
- IntStream.range(0, filter.length).parallel().forEach(i -> filter[i] += filterDelta[i] / count
- - parent.getWeightDecay() * parent.getLearningRate() * filter[i]);
- IntStream.range(0, bias.length).parallel().forEach(i -> bias[i] += biasDelta[i] / count);
+ if(useGpu){
+ ConvolutionBackwordCL.INSTANCE.join(
+ parent.getWeightDecay(), parent.getLearningRate(),
+ filter.length, bias.length,
+ parent.getMiniBatch(),
+ bufFilter, bufFilterDelta, bufBias, bufBiasDelta);
+ /*
+ bufFilter.getBuffer().put(filter).rewind();
+ bufBias.getBuffer().put(bias).rewind();
+ OpenCL.getQueue()
+ .putWriteBuffer(bufFilter, false)
+ .putWriteBuffer(bufBias, false);
+ */
+ }else{
+ float count = parent.getMiniBatch();
+ IntStream.range(0, filter.length).parallel().forEach(
+ i -> filter[i] += filterDelta[i] / count
+ - parent.getWeightDecay() * parent.getLearningRate() * filter[i]);
+ IntStream.range(0, bias.length).parallel().forEach(
+ i -> bias[i] += biasDelta[i] / count);
+ }
}
@Override
@@ -208,12 +342,12 @@ public String toString() {
@Override
public DoubleSummaryStatistics getWeightStatistics() {
- return FloatUtil.summary(filter);
+ return FloatUtil.summary(getFilter());
}
@Override
public DoubleSummaryStatistics getBiasStatistics() {
- return FloatUtil.summary(bias);
+ return FloatUtil.summary(getBias());
}
}
diff --git a/src/main/java/kishida/cnn/layers/FullGpuEnabled.java b/src/main/java/kishida/cnn/layers/FullGpuEnabled.java
new file mode 100644
index 0000000..d9f37aa
--- /dev/null
+++ b/src/main/java/kishida/cnn/layers/FullGpuEnabled.java
@@ -0,0 +1,35 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package kishida.cnn.layers;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.jogamp.opencl.CLBuffer;
+import java.nio.FloatBuffer;
+import java.util.Objects;
+import kishida.cnn.opencl.OpenCL;
+
+/**
+ *
+ * @author naoki
+ */
+public interface FullGpuEnabled {
+ @JsonIgnore
+ default boolean isUseGpu(){
+ return true;
+ }
+ CLBuffer getBufResult();
+ void forward(CLBuffer bufInput);
+ CLBuffer backwardBuf(CLBuffer bufInput, CLBuffer bufDelta);
+ default CLBuffer backwardBuf(CLBuffer bufInput, float[] delta){
+ Objects.requireNonNull(delta, "delta is null on " + ((NeuralLayer)this).getName());
+ CLBuffer bufDelta = OpenCL.createReadBuffer(delta);
+ OpenCL.getQueue().putWriteBuffer(bufDelta, false);
+ CLBuffer result = backwardBuf(bufInput, bufDelta);
+ bufDelta.release();
+ return result;
+ }
+
+}
diff --git a/src/main/java/kishida/cnn/layers/FullyConnect.java b/src/main/java/kishida/cnn/layers/FullyConnect.java
index 9183706..99aca63 100644
--- a/src/main/java/kishida/cnn/layers/FullyConnect.java
+++ b/src/main/java/kishida/cnn/layers/FullyConnect.java
@@ -6,13 +6,20 @@
package kishida.cnn.layers;
import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
-import java.util.Arrays;
+import com.jogamp.opencl.CLBuffer;
+import java.nio.FloatBuffer;
+import java.nio.IntBuffer;
import java.util.DoubleSummaryStatistics;
import java.util.stream.IntStream;
import kishida.cnn.activation.ActivationFunction;
import kishida.cnn.kernels.FullyForwardKernel;
+import kishida.cnn.opencl.ConvolutionBackwordCL;
+import kishida.cnn.opencl.FullyBackwordCL;
+import kishida.cnn.opencl.FullyForwardCL;
+import kishida.cnn.opencl.OpenCL;
import kishida.cnn.util.FloatUtil;
import lombok.Getter;
import lombok.Setter;
@@ -21,18 +28,14 @@
*
* @author naoki
*/
-public class FullyConnect extends NeuralLayer implements LerningLayer{
+public class FullyConnect extends NeuralLayer implements LerningLayer, FullGpuEnabled{
@JsonInclude(JsonInclude.Include.NON_NULL)
- @Getter
private float[]weight;
@JsonInclude(JsonInclude.Include.NON_NULL)
- @Getter
private float[] bias;
@JsonInclude(JsonInclude.Include.NON_NULL)
- @Getter
private float[]weightDelta;
@JsonInclude(JsonInclude.Include.NON_NULL)
- @Getter
private float[] biasDelta;
@JsonProperty
@@ -41,7 +44,8 @@ public class FullyConnect extends NeuralLayer implements LerningLayer{
private int[] dropout;
@Getter
private float dropoutRate = 1;
- @Getter @Setter
+ @JsonProperty
+ @Setter @Getter
private boolean useGpu;
private float[] newDelta;
private float[] diffed;
@@ -50,6 +54,16 @@ public class FullyConnect extends NeuralLayer implements LerningLayer{
@Getter
private float initBias;
+ CLBuffer bufWeight;
+ CLBuffer bufBias;
+ CLBuffer bufWeightDelta;
+ CLBuffer bufBiasDelta;
+ CLBuffer bufDropout;
+ @JsonIgnore
+ @Getter
+ CLBuffer bufResult;
+ CLBuffer bufNewDelta;
+
public FullyConnect(String name, int outputSize, float initBias, float dropoutRate, ActivationFunction activation, boolean useGpu) {
this(name, outputSize, null, null, initBias, null, null, dropoutRate, null, activation, useGpu);
}
@@ -112,6 +126,61 @@ public final void setPreLayer(NeuralLayer preLayer) {
if(biasDelta == null){
this.biasDelta = new float[outputSize];
}
+ if(useGpu){
+ bufWeight = OpenCL.createReadWriteBuffer(weight);
+ bufBias = OpenCL.createReadWriteBuffer(bias);
+ bufWeightDelta = OpenCL.createReadWriteBuffer(weightDelta);
+ bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta);
+ bufResult = OpenCL.createReadWriteBuffer(result.length);
+ bufDropout = OpenCL.createReadBuffer(dropout);
+ bufNewDelta = OpenCL.createReadWriteBuffer(newDelta.length);
+ OpenCL.getQueue()
+ .putWriteBuffer(bufWeight, false)
+ .putWriteBuffer(bufBias, false)
+ .putWriteBuffer(bufWeightDelta, false)
+ .putWriteBuffer(bufBiasDelta, false)
+ .putWriteBuffer(bufDropout, false);
+ }
+ }
+
+ public float[] getWeight() {
+ if(bufWeight != null){
+ OpenCL.getQueue().putReadBuffer(bufWeight, true);
+ bufWeight.getBuffer().get(weight).rewind();
+ }
+ return weight;
+ }
+
+ public float[] getBias() {
+ if(bufBias != null){
+ OpenCL.getQueue().putReadBuffer(bufBias, true);
+ bufBias.getBuffer().get(bias).rewind();
+ }
+ return bias;
+ }
+ public float[] getWeightDelta() {
+ if(bufWeightDelta != null){
+ OpenCL.getQueue().putReadBuffer(bufWeightDelta, true);
+ bufWeightDelta.getBuffer().get(weightDelta).rewind();
+ }
+ return weightDelta;
+ }
+
+ public float[] getBiasDelta() {
+ if(bufBiasDelta != null){
+ OpenCL.getQueue().putReadBuffer(bufBiasDelta, true);
+ bufBiasDelta.getBuffer().get(biasDelta).rewind();
+ }
+ return biasDelta;
+ }
+
+ @Override
+ public float[] getResult() {
+ if(bufResult != null && isUseGpu()){
+ OpenCL.getQueue().putReadBuffer(bufResult, true);
+ bufResult.getBuffer().get(result).rewind();
+ }
+ return result;
}
@JsonProperty("activationObj")
@@ -132,58 +201,97 @@ public void prepareDropout() {
@Override
public float[] forward(float[] in) {
prepareDropout();
- Arrays.fill(result, 0);
- FullyForwardKernel.INSTANCE.forward(outputSize, dropout, in, result, weight, bias, useGpu);
- /*
- IntStream.range(0, out).parallel().filter(j -> dropout[j] == 1).forEach(j -> {
- for (int i = 0; i < in.length; ++i) {
- result[j] += in[i] * weight[i * out + j];
+ if(useGpu){
+ if(false){
+ FullyForwardKernel.INSTANCE.forward(outputSize, dropout, in, result, weight, bias, useGpu);
+ activation.applyAfter(result);
+ }else{
+ FullyForwardCL.INSTANCE.forward(inputSize, outputSize, dropout, in, bufWeight, bufBias, result, activation);
}
- result[j] += bias[j];
- });*/
- activation.applyAfter(result);
+ }else{
+ FullyForwardKernel.INSTANCE.forward(outputSize, dropout, in, result, weight, bias, useGpu);
+ activation.applyAfter(result);
+ }
return result;
}
+ @Override
+ public void forward(CLBuffer input) {
+ prepareDropout();
+ bufDropout.getBuffer().put(dropout).rewind();
+ OpenCL.getQueue().putWriteBuffer(bufDropout, false);
+
+ FullyForwardCL.INSTANCE.forward(inputSize, outputSize,
+ bufDropout, input, bufWeight, bufBias, bufResult, activation);
+ }
+
@Override
public float[] backward(float[] in, float[] delta) {
- Arrays.fill(newDelta, 0);
- Arrays.fill(diffed, 0);
- for(int i = 0; i < result.length; ++i){
- diffed[i] = activation.diff(result[i]);
- }
- IntStream.range(0, in.length).parallel().forEach((i) -> {
- for (int j = 0; j < outputSize; ++j) {
- if (dropout[j] != 1) {
- continue;
- }
- float d = diffed[j] * delta[j];
- newDelta[i] += d * weight[i * outputSize + j];//in[i] *;
- weightDelta[i * outputSize + j] += d * in[i] * parent.getLearningRate();
+ if(useGpu && true){
+ FullyBackwordCL.INSTANCE.backword(inputSize, outputSize,
+ dropout, in, delta, result, bufWeight, bufWeightDelta, bufBiasDelta, newDelta,
+ parent.getLearningRate(), activation);
+ }else{
+ for(int i = 0; i < result.length; ++i){
+ diffed[i] = activation.diff(result[i]);
}
- });
- IntStream.range(0, outputSize).parallel().filter(j -> dropout[j] == 1).forEach(j -> {
- biasDelta[j] += diffed[j] * delta[j] * parent.getLearningRate();
- });
+ IntStream.range(0, in.length).parallel().forEach((i) -> {
+ float nd = 0;
+ for (int j = 0; j < outputSize; ++j) {
+ if (dropout[j] != 1) {
+ continue;
+ }
+ float d = diffed[j] * delta[j];
+ nd += d * weight[i * outputSize + j];//in[i] *;
+ weightDelta[i * outputSize + j] += d * in[i] * parent.getLearningRate();
+ }
+ newDelta[i] = nd;
+ });
+ IntStream.range(0, outputSize).parallel().filter(j -> dropout[j] == 1).forEach(j -> {
+ biasDelta[j] += diffed[j] * delta[j] * parent.getLearningRate();
+ });
+ }
return newDelta;
}
+ @Override
+ public CLBuffer backwardBuf(CLBuffer bufInput, CLBuffer bufDelta) {
+ FullyBackwordCL.INSTANCE.backword(inputSize, outputSize, bufDropout,
+ bufInput, bufDelta, bufResult,
+ bufWeight, bufWeightDelta,
+ bufBiasDelta, bufNewDelta, parent.getLearningRate(), activation);
+ return bufNewDelta;
+ }
+
@Override
public void prepareBatch() {
- float momentam = parent.getMomentam();
- IntStream.range(0, weightDelta.length).forEach(i -> weightDelta[i] = weightDelta[i] * momentam);
- IntStream.range(0, biasDelta.length).parallel().forEach(i -> biasDelta[i] = biasDelta[i] * momentam);
+ if(useGpu & true){
+ ConvolutionBackwordCL.INSTANCE.prepare(parent.getMomentam(),
+ weightDelta.length, biasDelta.length, bufWeightDelta, bufBiasDelta);
+ }else{
+ float momentam = parent.getMomentam();
+ IntStream.range(0, weightDelta.length).forEach(i -> weightDelta[i] = weightDelta[i] * momentam);
+ IntStream.range(0, biasDelta.length).parallel().forEach(i -> biasDelta[i] = biasDelta[i] * momentam);
+ }
}
@Override
public void joinBatch() {
- IntStream.range(0, weight.length).parallel().forEach(ij -> {
- weight[ij] += weightDelta[ij] / parent.getMiniBatch()
- - weight[ij] * parent.getWeightDecay() * parent.getLearningRate();
- });
- IntStream.range(0, bias.length).parallel().forEach(i -> {
- bias[i] += biasDelta[i] / parent.getMiniBatch();
- });
+ if(useGpu & true){
+ ConvolutionBackwordCL.INSTANCE.join(
+ parent.getWeightDecay(), parent.getLearningRate(),
+ weight.length, bias.length,
+ parent.getMiniBatch(),
+ bufWeight, bufWeightDelta, bufBias, bufBiasDelta);
+ }else{
+ IntStream.range(0, weight.length).parallel().forEach(ij -> {
+ weight[ij] += weightDelta[ij] / parent.getMiniBatch()
+ - weight[ij] * parent.getWeightDecay() * parent.getLearningRate();
+ });
+ IntStream.range(0, bias.length).parallel().forEach(i -> {
+ bias[i] += biasDelta[i] / parent.getMiniBatch();
+ });
+ }
}
@Override
diff --git a/src/main/java/kishida/cnn/layers/InputLayer.java b/src/main/java/kishida/cnn/layers/InputLayer.java
index 88a38f4..80c380e 100644
--- a/src/main/java/kishida/cnn/layers/InputLayer.java
+++ b/src/main/java/kishida/cnn/layers/InputLayer.java
@@ -6,13 +6,21 @@
package kishida.cnn.layers;
import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
+import com.jogamp.opencl.CLBuffer;
+import java.nio.FloatBuffer;
+import kishida.cnn.opencl.OpenCL;
+import lombok.Getter;
/**
*
* @author naoki
*/
-public class InputLayer extends ImageNeuralLayer {
+public class InputLayer extends ImageNeuralLayer implements FullGpuEnabled {
+ @JsonIgnore
+ @Getter
+ CLBuffer bufResult;
public InputLayer(int width, int height) {
this("input", width, height);
@@ -24,6 +32,7 @@ public InputLayer(
@JsonProperty("width") int width,
@JsonProperty("height") int height) {
super("input", 0, 0, 0, 3, width, height);
+ bufResult = OpenCL.createWriteBuffer(outputChannels * outputWidth * outputHeight);
}
@Override
@@ -39,20 +48,48 @@ public int getHeight() {
return super.outputHeight;
}
+ @Override
+ public boolean isUseGpu() {
+ return false;
+ }
+
@Override
public float[] forward(float[] in) {
this.result = in;
+ bufResult.getBuffer().put(result);
+ OpenCL.getQueue()
+ .putWriteBuffer(bufResult, false);
return result;
}
+ @Override
+ public void forward(CLBuffer input) {
+ // do nothing
+ }
+
@Override
public float[] backward(float[] in, float[] delta) {
// do nothing
return null;
}
+ @Override
+ public CLBuffer backwardBuf(CLBuffer bufInput, CLBuffer bufDelta) {
+ // do nothing
+ return null;
+ }
+
+ @Override
+ public CLBuffer backwardBuf(CLBuffer bufInput, float[] delta) {
+ // do nothing
+ return null;
+ }
+
public void setInput(float[] input){
result = input;
+ bufResult.getBuffer().put(result).rewind();
+ OpenCL.getQueue()
+ .putWriteBuffer(bufResult, false);
}
@Override
diff --git a/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java b/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java
index ad7f4fe..9625241 100644
--- a/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java
+++ b/src/main/java/kishida/cnn/layers/MaxPoolingLayer.java
@@ -6,21 +6,33 @@
package kishida.cnn.layers;
import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
+import com.jogamp.opencl.CLBuffer;
+import java.nio.FloatBuffer;
import java.util.Arrays;
+import java.util.Random;
+import java.util.stream.Collectors;
import java.util.stream.IntStream;
+import kishida.cnn.opencl.MaxPoolingCL;
+import kishida.cnn.opencl.OpenCL;
import lombok.Getter;
/**
*
* @author naoki
*/
-public class MaxPoolingLayer extends ImageNeuralLayer {
+public class MaxPoolingLayer extends ImageNeuralLayer implements FullGpuEnabled {
@Getter
int size;
@Getter
int stride;
float[] newDelta;
+ @JsonIgnore
+ @Getter
+ CLBuffer bufResult;
+
+ CLBuffer bufNewDelta;
@JsonCreator
public MaxPoolingLayer(
@@ -40,74 +52,154 @@ public final void setPreLayer(NeuralLayer preLayer) {
outputHeight = inputHeight / stride;
result = new float[outputChannels * outputWidth * outputHeight];
newDelta = new float[inputChannels * inputWidth * inputHeight];
+ bufResult = OpenCL.createReadWriteBuffer(result.length);
+ bufNewDelta = OpenCL.createReadWriteBuffer(newDelta.length);
+ }
+
+ @Override
+ public float[] getResult() {
+ if(bufResult != null){
+ OpenCL.getQueue().putReadBuffer(bufResult, true);
+ bufResult.getBuffer().get(result).rewind();
+ }
+ return result;
}
/** プーリング(max) */
@Override
public float[] forward(float[] data) {
- IntStream.range(0, inputChannels).parallel().forEach(ch -> {
- for (int x = 0; x < outputWidth; ++x) {
- for (int y = 0; y < outputHeight; ++y) {
- float max = Float.NEGATIVE_INFINITY;
- for (int i = 0; i < size; ++i) {
- int xx = x * stride + i - size / 2;
- if (xx < 0 || xx >= inputWidth) {
- continue;
- }
- for (int j = 0; j < size; ++j) {
- int yy = y * stride + j - size / 2;
- if (yy < 0 || yy >= inputHeight) {
+ if(false){
+ MaxPoolingCL.INSTANCE.forward(inputChannels, inputWidth, inputHeight,
+ outputWidth, outputHeight, size, stride, data, result);
+ }else{
+ IntStream.range(0, inputChannels).parallel().forEach(ch -> {
+ for (int x = 0; x < outputWidth; ++x) {
+ for (int y = 0; y < outputHeight; ++y) {
+ float max = Float.NEGATIVE_INFINITY;
+ for (int i = 0; i < size; ++i) {
+ int xx = x * stride + i - size / 2;
+ if (xx < 0 || xx >= inputWidth) {
continue;
}
- float d = data[ch * inputWidth * inputHeight + xx * inputHeight + yy];
- if (max < d) {
- max = d;
+ for (int j = 0; j < size; ++j) {
+ int yy = y * stride + j - size / 2;
+ if (yy < 0 || yy >= inputHeight) {
+ continue;
+ }
+ float d = data[ch * inputWidth * inputHeight + xx * inputHeight + yy];
+ if (max < d) {
+ max = d;
+ }
}
}
+ result[ch * outputWidth * outputHeight + x * outputHeight + y] = max;
}
- result[ch * outputWidth * outputHeight + x * outputHeight + y] = max;
}
- }
- });
+ });
+ }
return result;
}
+ @Override
+ public void forward(CLBuffer input) {
+ MaxPoolingCL.INSTANCE.forward(inputChannels, inputWidth, inputHeight,
+ outputWidth, outputHeight, size, stride,
+ input, bufResult);
+ }
+
@Override
public float[] backward(float[] in, float[] delta) {
- Arrays.fill(newDelta, 0);
- IntStream.range(0, inputChannels).parallel().forEach(ch -> {
- for (int x = 0; x < outputWidth; ++x) {
- for (int y = 0; y < outputHeight; ++y) {
- float max = Float.NEGATIVE_INFINITY;
- int maxX = 0;
- int maxY = 0;
- for (int i = 0; i < size; ++i) {
- int xx = x * stride + i - size / 2;
- if (xx < 0 || xx >= inputWidth) {
- continue;
- }
- for (int j = 0; j < size; ++j) {
- int yy = y * stride + j - size / 2;
- if (yy < 0 || yy >= inputHeight) {
+ return backward(in, delta, false);
+ }
+ public float[] backward(float[] in, float[] delta, boolean gpu) {
+ if(gpu){
+ MaxPoolingCL.INSTANCE.backword(inputChannels, inputWidth, inputHeight,
+ outputWidth, outputHeight, size, stride, in, delta, newDelta);
+ }else{
+ Arrays.fill(newDelta, 0);
+ IntStream.range(0, inputChannels).parallel().forEach(ch -> {
+ for (int x = 0; x < outputWidth; ++x) {
+ for (int y = 0; y < outputHeight; ++y) {
+ float max = Float.NEGATIVE_INFINITY;
+ int maxX = 0;
+ int maxY = 0;
+ for (int i = 0; i < size; ++i) {
+ int xx = x * stride + i - size / 2;
+ if (xx < 0 || xx >= inputWidth) {
continue;
}
- float d = in[ch * inputWidth * inputHeight + xx * inputWidth + yy];
- if (max < d) {
- max = d;
- maxX = xx;
- maxY = yy;
+ for (int j = 0; j < size; ++j) {
+ int yy = y * stride + j - size / 2;
+ if (yy < 0 || yy >= inputHeight) {
+ continue;
+ }
+ float d = in[ch * inputWidth * inputHeight + xx * inputWidth + yy];
+ if (max < d) {
+ max = d;
+ maxX = xx;
+ maxY = yy;
+ }
}
}
+ int chxy = ch * outputWidth * outputHeight + x * outputHeight + y;
+ newDelta[ch * inputWidth * inputHeight + maxX * inputHeight + maxY] +=
+ delta[chxy];
}
- int chxy = ch * outputWidth * outputHeight + x * outputHeight + y;
- newDelta[ch * inputWidth * inputHeight + maxX * inputHeight + maxY] +=
- delta[chxy];
}
- }
- });
+ });
+ }
return newDelta;
}
+ @Override
+ public CLBuffer backwardBuf(CLBuffer bufInput, CLBuffer bufDelta) {
+ MaxPoolingCL.INSTANCE.backword(inputChannels, inputWidth, inputHeight,
+ outputWidth, outputHeight, size, stride,
+ bufInput, bufDelta, bufNewDelta);
+ return bufNewDelta;
+ }
+
+ public static void main(String[] args) {
+ InputLayer input = new InputLayer(6, 6);
+ MaxPoolingLayer pool = new MaxPoolingLayer("test_pool", 3, 2);
+ pool.setPreLayer(input);
+ for(int i = 0; i < pool.newDelta.length; ++i){
+ pool.newDelta[i] = 3;
+ }
+ float[] in = new float[6 * 6 * 3];
+ for(int i = 0; i < in.length; ++i){
+ in[i] = i;
+ }
+ float[] delta = {
+ 0.01f, 0.02f, 0.03f, 0.05f, 0.07f, 0.11f, 0.13f, 0.17f, 0.19f,
+ 1, 2, 3, 5, 7, 11, 13, 17, 19,
+ 1, 2, 3, 5, 7, 11, 13, 17, 19};
+ float[] newDeltaGpu = pool.backward(in, delta, true);
+ float[] newDeltaCpu = pool.backward(in, delta, false);
+ System.out.println(Arrays.equals(newDeltaCpu, newDeltaGpu));
+ IntStream.range(0, newDeltaGpu.length / 6 / 3).forEach(i -> {
+ System.out.println(IntStream.range(0, 6)
+ .map(n -> n + i * 6)
+ .mapToObj(n -> "" + (int)(newDeltaGpu[n]*100))
+ .collect(Collectors.joining(",")));
+ });
+
+ Random r = new Random();
+ for(int t = 0; t < 1000; ++t){
+ for(int i = 0; i < in.length; ++i){
+ in[i] = r.nextFloat();
+ }
+ for(int i = 0; i < delta.length; ++i){
+ delta[i] = r.nextFloat();
+ }
+ float[] newDeltaGpu2 = pool.backward(in, delta, true);
+ float[] newDeltaCpu2 = pool.backward(in, delta, false);
+ if(!Arrays.equals(newDeltaCpu2, newDeltaGpu2)){
+ System.out.println("wrong");
+ };
+ }
+ }
+
@Override
public String toString() {
return String.format("%s:Max pooling size:%dx%d stride:%d in:%dx%dx%d out %dx%dx%d",
diff --git a/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java b/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java
index c776d3a..7f3580b 100644
--- a/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java
+++ b/src/main/java/kishida/cnn/layers/MultiNormalizeLayer.java
@@ -6,24 +6,30 @@
package kishida.cnn.layers;
import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
+import com.jogamp.opencl.CLBuffer;
+import java.nio.FloatBuffer;
import java.util.stream.IntStream;
+import kishida.cnn.opencl.MultiNormalizeCL;
+import kishida.cnn.opencl.OpenCL;
import lombok.Getter;
/**
*
* @author naoki
*/
-public class MultiNormalizeLayer extends ImageNeuralLayer{
+public class MultiNormalizeLayer extends ImageNeuralLayer implements FullGpuEnabled{
@Getter
int size;
@Getter
float threshold;
+ @JsonProperty
@Getter
boolean useGpu;
-
- float[] averages;
- float[] rates;
+ @JsonIgnore
+ @Getter
+ CLBuffer bufResult;
@JsonCreator
public MultiNormalizeLayer(
@@ -43,70 +49,90 @@ public final void setPreLayer(NeuralLayer preLayer) {
outputChannels = inputChannels;
outputWidth = inputWidth;
outputHeight = inputHeight;
- averages = new float[inputWidth * inputHeight];
- rates = new float[inputWidth * inputHeight];
result = new float[inputChannels * inputHeight * inputWidth];
+ bufResult = OpenCL.createReadWriteBuffer(result.length);
}
@Override
- public float[] forward(float[] in) {
+ public float[] getResult() {
+ if(bufResult != null){
+ OpenCL.getQueue().putReadBuffer(bufResult, true);
+ bufResult.getBuffer().get(result).rewind();
+ }
+ return result;
+ }
- IntStream.range(0, inputWidth).parallel().forEach(x -> {
- for(int y = 0; y < inputHeight; ++y){
- float total = 0;
- int count = 0;
- for(int i = 0; i < size; ++i){
- int xx = x + i - size / 2;
- if(xx < 0 || xx >= inputWidth){
- continue;
- }
- for(int j = 0; j < size; ++j){
- int yy = y + j - size / 2;
- if(yy < 0 || yy >= inputHeight){
+ @Override
+ public float[] forward(float[] in) {
+ if(false){
+ MultiNormalizeCL.INSTANCE.normalize(inputChannels, inputWidth, inputHeight,
+ size, threshold, in, result);
+ } else{
+ IntStream.range(0, inputWidth).parallel().forEach(x -> {
+ for(int y = 0; y < inputHeight; ++y){
+ float total = 0;
+ int count = 0;
+ for(int i = 0; i < size; ++i){
+ int xx = x + i - size / 2;
+ if(xx < 0 || xx >= inputWidth){
continue;
}
- for(int ch = 0; ch < inputChannels; ++ch){
- total += in[ch * inputHeight * inputWidth + xx * inputHeight + yy];
- ++count;
+ for(int j = 0; j < size; ++j){
+ int yy = y + j - size / 2;
+ if(yy < 0 || yy >= inputHeight){
+ continue;
+ }
+ for(int ch = 0; ch < inputChannels; ++ch){
+ total += in[ch * inputHeight * inputWidth + xx * inputHeight + yy];
+ ++count;
+ }
}
}
- }
- float average = total / count;
- float variance = 0;
- for(int i = 0; i < size; ++i){
- int xx = x + i - size / 2;
- if(xx < 0 || xx >= inputWidth){
- continue;
- }
- for(int j = 0; j < size; ++j){
- int yy = y + j - size / 2;
- if(yy < 0 || yy >= inputHeight){
+ float average = total / count;
+ float variance = 0;
+ for(int i = 0; i < size; ++i){
+ int xx = x + i - size / 2;
+ if(xx < 0 || xx >= inputWidth){
continue;
}
- for(int ch = 0; ch < inputChannels; ++ch){
- float data = in[ch * inputHeight * inputWidth + xx * inputHeight + yy];
- variance += (data - average) * (data - average);
+ for(int j = 0; j < size; ++j){
+ int yy = y + j - size / 2;
+ if(yy < 0 || yy >= inputHeight){
+ continue;
+ }
+ for(int ch = 0; ch < inputChannels; ++ch){
+ float data = in[ch * inputHeight * inputWidth + xx * inputHeight + yy];
+ variance += (data - average) * (data - average);
+ }
}
}
+ float std = Math.max(threshold, (float)Math.sqrt(variance / count));
+ for(int ch = 0; ch < inputChannels; ++ch){
+ int pos = ch * inputHeight * inputWidth + x * inputHeight + y;
+ result[pos] = (in[pos] - average) / std;
+ }
}
- float std = Math.max(threshold, (float)Math.sqrt(variance / count));
- averages[x * inputHeight + y] = average;
- rates[x * inputHeight + y] = std;
- for(int ch = 0; ch < inputChannels; ++ch){
- int pos = ch * inputHeight * inputWidth + x * inputHeight + y;
- result[pos] = (in[pos] - average) / std;
- }
- }
- });
-
+ });
+ }
return result;
}
+ @Override
+ public void forward(CLBuffer input) {
+ MultiNormalizeCL.INSTANCE.normalize(inputChannels, inputWidth, inputHeight, size,
+ threshold, input, bufResult);
+ }
+
@Override
public float[] backward(float[] in, float[] delta) {
return delta;
}
+ @Override
+ public CLBuffer backwardBuf(CLBuffer bufInput, CLBuffer bufDelta) {
+ return bufDelta;
+ }
+
@Override
public String toString() {
return String.format("%s:Multi channel normalize size:%dx%d in:%dx%dx%d out %dx%dx%d",
diff --git a/src/main/java/kishida/cnn/layers/NeuralLayer.java b/src/main/java/kishida/cnn/layers/NeuralLayer.java
index 27f0b03..7906c5e 100644
--- a/src/main/java/kishida/cnn/layers/NeuralLayer.java
+++ b/src/main/java/kishida/cnn/layers/NeuralLayer.java
@@ -36,7 +36,8 @@ public abstract class NeuralLayer {
@Getter
float[] result;
- @Setter
+ @JsonIgnore
+ @Setter @Getter
NeuralLayer preLayer;
@Setter
@@ -46,9 +47,14 @@ public NeuralLayer(String name) {
this.name = name;
}
- public float[] forward() {
+ public void forward() {
Objects.requireNonNull(preLayer, "preLayer is null on " + name);
- return forward(preLayer.result);
+ if(this instanceof FullGpuEnabled && preLayer instanceof FullGpuEnabled &&
+ ((FullGpuEnabled)this).isUseGpu()){
+ ((FullGpuEnabled)this).forward(((FullGpuEnabled)preLayer).getBufResult());
+ }else{
+ forward(preLayer.getResult());
+ }
}
public float[] backward(float[] delta) {
@@ -71,7 +77,7 @@ public void joinBatch(){
@JsonIgnore
public DoubleSummaryStatistics getResultStatistics(){
- return FloatUtil.summary(result);
+ return FloatUtil.summary(getResult());
}
}
diff --git a/src/main/java/kishida/cnn/layers/NormalizeLayer.java b/src/main/java/kishida/cnn/layers/NormalizeLayer.java
index ff55efe..c4097a7 100644
--- a/src/main/java/kishida/cnn/layers/NormalizeLayer.java
+++ b/src/main/java/kishida/cnn/layers/NormalizeLayer.java
@@ -38,7 +38,7 @@ public float[] forward(float[] in) {
averages = new float[in.length];
rates = new float[in.length];
result = NormalizeKernel.INSTANCE.normalize(in, inputChannels, inputWidth, inputHeight,
- size, averages, rates, threshold, useGpu);
+ size, threshold, result, useGpu);
return result;
}
diff --git a/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java
new file mode 100644
index 0000000..6b2655a
--- /dev/null
+++ b/src/main/java/kishida/cnn/opencl/ConvolutionBackwordCL.java
@@ -0,0 +1,283 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package kishida.cnn.opencl;
+
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLKernel;
+import com.jogamp.opencl.CLProgram;
+import java.nio.FloatBuffer;
+import java.util.Map;
+
+/**
+ *
+ * @author naoki
+ */
+public class ConvolutionBackwordCL {
+ public static ConvolutionBackwordCL INSTANCE = new ConvolutionBackwordCL();
+ CLProgram prog;
+ Map kernels;
+
+ private ConvolutionBackwordCL() {
+ }
+
+ public void backward(float[] delta, float[] result,
+ float[] input, int inputChannels, int inputWidth, int inputHeight,
+ float[] filter, int outputChannels, int outputWidth, int outputHeight,
+ float[] filterDelta, float[] biasDelta,
+ int filterSize, int stride, float[] newDelta, float learningRate) {
+ CLBuffer bufFilter = OpenCL.createReadBuffer(filter);
+ CLBuffer bufFilterDelta = OpenCL.createReadWriteBuffer(filterDelta);
+ CLBuffer bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta);
+ CLBuffer bufResult = OpenCL.createReadBuffer(result);
+ CLBuffer bufInput = OpenCL.createReadBuffer(input);
+ OpenCL.getQueue()
+ .putWriteBuffer(bufFilter, false)
+ .putWriteBuffer(bufFilterDelta, false)
+ .putWriteBuffer(bufBiasDelta, false)
+ .putWriteBuffer(bufInput, false)
+ .putWriteBuffer(bufResult, false);
+
+ backward(delta, bufResult,
+ bufInput, inputChannels, inputWidth, inputHeight,
+ bufFilter, outputChannels, outputWidth, outputHeight,
+ bufFilterDelta, bufBiasDelta,
+ filterSize, stride, newDelta, learningRate);
+
+ OpenCL.getQueue()
+ .putReadBuffer(bufBiasDelta, true)
+ .putReadBuffer(bufFilterDelta, true);
+ bufFilterDelta.getBuffer().get(filterDelta);
+ bufBiasDelta.getBuffer().get(biasDelta);
+
+ bufFilter.release();
+ bufFilterDelta.release();
+ bufBiasDelta.release();
+ bufInput.release();
+ bufResult.release();
+ }
+ public void backward(float[] delta, CLBuffer bufResult,
+ CLBuffer bufInput, int inputChannels, int inputWidth, int inputHeight,
+ CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight,
+ CLBuffer bufFilterDelta, CLBuffer bufBiasDelta,
+ int filterSize, int stride, float[] newDelta, float learningRate) {
+ CLBuffer bufDelta = OpenCL.createReadBuffer(delta);
+ CLBuffer bufNewDelta = OpenCL.createWriteBuffer(newDelta.length);
+ CLBuffer bufTempBias = OpenCL.createReadWriteBuffer(outputChannels * outputWidth * outputHeight);
+ OpenCL.getQueue()
+ .putWriteBuffer(bufDelta, false);
+
+ backward(bufDelta, bufResult,
+ bufInput, inputChannels, inputWidth, inputHeight,
+ bufFilter, outputChannels, outputWidth, outputHeight,
+ bufFilterDelta, bufBiasDelta, bufTempBias,
+ filterSize, stride, bufNewDelta, learningRate);
+
+ OpenCL.getQueue()
+ .putReadBuffer(bufNewDelta, true);
+ bufNewDelta.getBuffer().get(newDelta);
+
+ bufDelta.release();
+ bufNewDelta.release();
+ bufTempBias.release();
+ }
+ public void backward(CLBuffer bufDelta, CLBuffer bufResult,
+ CLBuffer bufInput, int inputChannels, int inputWidth, int inputHeight,
+ CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight,
+ CLBuffer bufFilterDelta, CLBuffer bufBiasDelta,
+ CLBuffer bufTempBias,
+ int filterSize, int stride, CLBuffer bufNewDelta, float learningRate) {
+ backward_sep(bufDelta, bufResult, bufInput, inputChannels, inputWidth, inputHeight,
+ bufFilter, outputChannels, outputWidth, outputHeight,
+ bufFilterDelta, bufBiasDelta, bufTempBias,
+ filterSize, stride, bufNewDelta, learningRate);
+ }
+
+ public void backward_sep(CLBuffer bufDelta, CLBuffer bufResult,
+ CLBuffer bufInput, int inputChannels, int inputWidth, int inputHeight,
+ CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight,
+ CLBuffer bufFilterDelta, CLBuffer bufBiasDelta,
+ CLBuffer bufTempBias,
+ int filterSize, int stride, CLBuffer bufNewDelta, float learningRate) {
+ if(prog == null){
+ prog = OpenCL.compile("convolution_backword.cl");
+ kernels = prog.createCLKernels();
+ }
+
+ CLKernel deltaKernel = prog.createCLKernel("delta_kernel");
+ deltaKernel
+ .rewind()
+ .putArg(inputWidth)
+ .putArg(inputHeight)
+ .putArg(filterSize)
+ .putArg(outputChannels)
+ .putArg(stride)
+ .putArg(outputWidth)
+ .putArg(outputHeight)
+ .putArgs(
+ bufResult,
+ bufDelta,
+ bufFilter)
+ .putArg(inputChannels)
+ .putArg(bufNewDelta);
+ OpenCL.execute(deltaKernel,
+ inputChannels * inputWidth * inputHeight);
+
+ CLKernel filterKernel = kernels.get("filter_kernel");
+ filterKernel
+ .rewind()
+ .putArg(inputChannels)
+ .putArg(filterSize)
+ .putArg(outputWidth)
+ .putArg(outputHeight)
+ .putArgs(
+ bufResult,
+ bufDelta)
+ .putArg(stride)
+ .putArg(inputWidth)
+ .putArg(inputHeight)
+ .putArg(learningRate)
+ .putArgs(
+ bufInput,
+ bufFilterDelta);
+ OpenCL.execute(filterKernel,
+ outputChannels * inputChannels * filterSize * filterSize);
+
+ CLKernel biasKernel = kernels.get("bias_kernel");
+ biasKernel
+ .rewind()
+ .putArgs(
+ bufResult,
+ bufDelta,
+ bufTempBias)
+ .putArg(learningRate);
+ OpenCL.execute(biasKernel,
+ outputChannels * outputWidth * outputHeight);
+
+ CLKernel biasAfterKernel = kernels.get("biasAfter");
+ biasAfterKernel
+ .rewind()
+ .putArg(outputWidth)
+ .putArg(outputHeight)
+ .putArgs(
+ bufTempBias,
+ bufBiasDelta);
+ OpenCL.execute(biasAfterKernel, outputChannels);
+
+ }
+ public void backward_gen(CLBuffer bufDelta, CLBuffer bufResult,
+ CLBuffer bufInput, int inputChannels, int inputWidth, int inputHeight,
+ CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight,
+ CLBuffer bufFilterDelta, CLBuffer bufBiasDelta,
+ CLBuffer bufTempBias,
+ int filterSize, int stride, CLBuffer bufNewDelta, float learningRate) {
+ if(prog == null){
+ prog = OpenCL.compile("convolution_backword.cl");
+ kernels = prog.createCLKernels();
+ }
+ CLKernel dfbKernel = kernels.get("dfb");
+ dfbKernel.rewind()
+ .putArg(inputChannels)
+ .putArg(inputWidth)
+ .putArg(inputHeight)
+ .putArg(outputChannels)
+ .putArg(outputWidth)
+ .putArg(outputHeight)
+ .putArg(filterSize)
+ .putArg(stride)
+ .putArg(learningRate)
+ .putArgs(
+ bufInput,
+ bufResult,
+ bufFilter,
+ bufFilterDelta,
+ bufDelta,
+ bufNewDelta,
+ bufTempBias)
+ .putArg(inputChannels * inputWidth * inputHeight)
+ .putArg(outputChannels * inputChannels * filterSize * filterSize)
+ .putArg(outputChannels * outputWidth * outputHeight);
+ OpenCL.execute(dfbKernel,
+ inputChannels * inputWidth * inputHeight +
+ outputChannels * inputChannels * filterSize * filterSize +
+ outputChannels * outputWidth * outputHeight);
+
+ CLKernel biasAfterKernel = kernels.get("biasAfter");
+ biasAfterKernel
+ .rewind()
+ .putArg(outputWidth)
+ .putArg(outputHeight)
+ .putArgs(
+ bufTempBias,
+ bufBiasDelta);
+ OpenCL.execute(biasAfterKernel, outputChannels);
+
+ }
+ public void prepare(float momentam,
+ int filterCount, int biasCount,
+ CLBuffer bufFilterDelta,
+ CLBuffer bufBiasDelta){
+
+ CLKernel kernel = kernels.get("prepare");
+ kernel.rewind()
+ .putArg(momentam)
+ .putArg(bufFilterDelta);
+ OpenCL.execute(kernel, filterCount);
+ kernel.rewind()
+ .putArg(momentam)
+ .putArg(bufBiasDelta);
+ OpenCL.execute(kernel, biasCount);
+ }
+
+ public void join(float weightDecay, float learningRate,
+ int filterCount, int biasCount, int count,
+ CLBuffer bufFilter, CLBuffer bufFilterDelta,
+ CLBuffer bufBias, CLBuffer bufBiasDelta){
+ CLKernel kernelFilter = kernels.get("joinFilter");
+ kernelFilter.rewind()
+ .putArg(weightDecay)
+ .putArg(learningRate)
+ .putArg(count)
+ .putArgs(
+ bufFilter,
+ bufFilterDelta);
+ OpenCL.execute(kernelFilter, filterCount);
+
+ CLKernel kernelBias = kernels.get("joinBias");
+ kernelBias.rewind()
+ .putArg(count)
+ .putArgs(
+ bufBias,
+ bufBiasDelta);
+ OpenCL.execute(kernelBias, biasCount);
+
+ }
+
+ public static void main(String[] args) {
+ int inputChannels = 3;
+ int inputWidth = 200;
+ int inputHeight = 200;
+ int stride = 3;
+ int filterSize = 11;
+ int outputChannels = 24;
+ int outputWidth = inputWidth / stride;
+ int outputHeight = inputHeight / stride;
+ float[] input = new float[inputChannels * inputWidth * inputHeight];
+ float[] newDelta = new float[input.length];
+ float[] filter = new float[inputChannels * outputChannels * filterSize * filterSize];
+ float[] filterDelta = new float[filter.length];
+ float[] biasDelta = new float[outputChannels];
+ float[] result = new float[outputChannels * outputWidth * outputHeight];
+ float[] delta = new float[result.length];
+ float learningRate = 0.001f;
+
+ for(int i = 0; i < 3; ++i){
+ System.out.println(i + 1);
+ ConvolutionBackwordCL.INSTANCE.backward(delta, result,
+ input, inputChannels, inputWidth, inputHeight,
+ filter, outputChannels, outputWidth, outputHeight, filterDelta, biasDelta, filterSize, stride, newDelta, learningRate);
+ }
+ }
+}
diff --git a/src/main/java/kishida/cnn/opencl/ConvolutionForwardCL.java b/src/main/java/kishida/cnn/opencl/ConvolutionForwardCL.java
new file mode 100644
index 0000000..2a0fb50
--- /dev/null
+++ b/src/main/java/kishida/cnn/opencl/ConvolutionForwardCL.java
@@ -0,0 +1,177 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package kishida.cnn.opencl;
+
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLKernel;
+import com.jogamp.opencl.CLProgram;
+import java.nio.FloatBuffer;
+
+/**
+ *
+ * @author naoki
+ */
+/**
+ * GPU forward pass of a convolution layer via the "convolution_forward.cl"
+ * program ("forward_local" kernel followed by "localNormalize").
+ * Singleton; program/kernels are compiled lazily on first use.
+ */
+public class ConvolutionForwardCL {
+    public static ConvolutionForwardCL INSTANCE = new ConvolutionForwardCL();
+    CLProgram prog;
+    CLKernel forwardKernel;
+    CLKernel normalizeKernel;
+
+    private ConvolutionForwardCL() {
+    }
+
+    /**
+     * Forward pass where no device buffers are held externally: filter and
+     * bias buffers are created, written and released inside this call.
+     */
+    public void forward(float[] input, int inputChannels, int inputWidth, int inputHeight,
+            float[] filter, int outputChannels, int outputWidth, int outputHeight, float[] result,
+            int filterSize, int stride, float[] bias){
+
+        CLBuffer bufFilter = OpenCL.createReadBuffer(filter);
+        CLBuffer bufBias = OpenCL.createReadBuffer(bias);
+
+        OpenCL.getQueue()
+                .putWriteBuffer(bufFilter, false)
+                .putWriteBuffer(bufBias, false);
+
+        forward(input, inputChannels, inputWidth, inputHeight,
+                bufFilter, outputChannels, outputWidth, outputHeight, result,
+                filterSize, stride, bufBias);
+
+        bufBias.release();
+        bufFilter.release();
+    }
+
+    /**
+     * Forward pass where the filter and bias buffers are managed by the caller;
+     * input/result buffers are created here and the result is copied back to
+     * the host array via a blocking read.
+     */
+    public void forward(float[] input,
+            int inputChannels, int inputWidth, int inputHeight,
+            CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight,
+            float[] result,
+            int filterSize, int stride, CLBuffer bufBias){
+
+        CLBuffer bufInput = OpenCL.createReadBuffer(input);
+        CLBuffer bufResult = OpenCL.createReadWriteBuffer(result);
+        OpenCL.getQueue()
+                .putWriteBuffer(bufInput, false);
+
+        forward(bufInput, inputChannels, inputWidth, inputHeight,
+                bufFilter, outputChannels, outputWidth, outputHeight, bufResult,
+                filterSize, stride, bufBias);
+
+        OpenCL.getQueue()
+                .putReadBuffer(bufResult, true);
+        bufResult.getBuffer().get(result);
+
+        bufResult.release();
+        bufInput.release();
+
+    }
+    /**
+     * Forward pass entirely on device buffers: enqueues "forward_local" then
+     * "localNormalize" on the shared in-order queue. No host transfers happen here.
+     */
+    public void forward(CLBuffer bufInput,
+            int inputChannels, int inputWidth, int inputHeight,
+            CLBuffer bufFilter, int outputChannels, int outputWidth, int outputHeight,
+            CLBuffer bufResult,
+            int filterSize, int stride, CLBuffer bufBias){
+        if(prog == null){
+            prog = OpenCL.compile("convolution_forward.cl");
+            forwardKernel = prog.createCLKernel("forward_local");
+            // FIX: create this kernel once with the program. It used to be
+            // re-created with createCLKernel on every forward call, leaking
+            // CLKernel objects (they were never released).
+            normalizeKernel = prog.createCLKernel("localNormalize");
+        }
+
+        forwardKernel
+                .rewind()
+                .putArg(outputHeight)
+                .putArg(outputWidth)
+                .putArg(inputChannels)
+                .putArg(filterSize)
+                .putArg(stride)
+                .putArg(inputWidth)
+                .putArg(inputHeight)
+                .putArgs(
+                        bufInput,
+                        bufFilter,
+                        bufResult,
+                        bufBias);
+        // Enqueued directly (instead of OpenCL.execute) so the work-group size can
+        // be pinned to outputChannels — presumably to match the per-group __local
+        // filter copy inside forward_local; TODO confirm against the kernel source.
+        forwardKernel.putArg(outputChannels * outputWidth * outputHeight);
+        OpenCL.getQueue().put1DRangeKernel(forwardKernel, 0,
+                outputChannels * outputWidth * outputHeight, outputChannels);
+
+        normalizeKernel
+                .rewind()
+                .putArg(outputWidth)
+                .putArg(outputHeight)
+                .putArg(outputChannels)
+                .putArg(bufResult);
+        OpenCL.execute(normalizeKernel,
+                outputChannels * outputWidth * outputHeight);
+
+    }
+    /** Standalone benchmark of the plain "forward" kernel (5000 dispatches). */
+    public static void main(String[] args) {
+        CLProgram prog = OpenCL.compile("convolution_forward.cl");
+        CLKernel forwardKernel = prog.createCLKernel("forward");
+
+        int inputChannels = 384;
+        int inputWidth = 14;
+        int inputHeight = 14;
+        int outputChannels = 384;
+        int outputWidth = 14;
+        int outputHeight = 14;
+        int filterSize = 3;
+        int stride = 1;
+        CLBuffer bufInput = OpenCL.createReadWriteBuffer(
+                inputChannels * inputWidth * inputHeight);
+        CLBuffer bufFilter = OpenCL.createReadWriteBuffer(
+                inputChannels * outputChannels * filterSize * filterSize);
+        CLBuffer bufResult = OpenCL.createReadWriteBuffer(
+                outputChannels * outputWidth * outputHeight);
+        CLBuffer bufBias = OpenCL.createReadWriteBuffer(
+                outputChannels);
+        long start = System.currentTimeMillis();
+        for(int i = 0; i < 5000; ++i){
+            forwardKernel
+                    .rewind()
+                    .putArg(outputHeight)
+                    .putArg(outputWidth)
+                    .putArg(inputChannels)
+                    .putArg(filterSize)
+                    .putArg(stride)
+                    .putArg(inputWidth)
+                    .putArg(inputHeight)
+                    .putArgs(
+                            bufInput,
+                            bufFilter,
+                            bufResult,
+                            bufBias)
+                    .putArg(outputChannels * outputWidth * outputHeight);
+            int workSize = outputChannels;
+            OpenCL.getQueue().put1DRangeKernel(forwardKernel,
+                    0, outputChannels * outputWidth * outputHeight,
+                    workSize);
+        }
+        OpenCL.getQueue().putBarrier();
+        System.out.println((System.currentTimeMillis() - start) / 1000.);
+        bufFilter.release();
+        System.out.println((System.currentTimeMillis() - start) / 1000.);
+        bufInput.release();
+        bufResult.release();
+        bufBias.release();
+        System.out.println((System.currentTimeMillis() - start) / 1000.);
+
+        forwardKernel.release();
+        prog.release();
+
+        OpenCL.getQueue().release();
+        OpenCL.getCtx().release();
+    }
+    /** Rounds globalSize up to the next multiple of groupSize. */
+    static int roundUp(int groupSize, int globalSize){
+        return ((globalSize + groupSize - 1) / groupSize) * groupSize;
+    }
+}
diff --git a/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java b/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java
new file mode 100644
index 0000000..99bd99f
--- /dev/null
+++ b/src/main/java/kishida/cnn/opencl/FullyBackwordCL.java
@@ -0,0 +1,172 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package kishida.cnn.opencl;
+
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLKernel;
+import com.jogamp.opencl.CLProgram;
+import java.nio.FloatBuffer;
+import java.nio.IntBuffer;
+import java.util.Map;
+import kishida.cnn.activation.ActivationFunction;
+import kishida.cnn.activation.RectifiedLinear;
+
+/**
+ *
+ * @author naoki
+ */
+/**
+ * GPU backward pass of a fully-connected layer via "fully_backword.cl"
+ * (kernels: backword_delta / backword_weight / backword_bias) plus the
+ * activation-derivative kernels in "activation.cl".
+ * Singleton; programs are compiled lazily on first use.
+ * NOTE(review): "backword" is a typo for "backward" kept for API compatibility.
+ */
+public class FullyBackwordCL {
+    public static FullyBackwordCL INSTANCE = new FullyBackwordCL();
+    CLProgram prog;
+    Map kernels;
+    CLProgram progActivation;
+    Map actKernels;
+
+    private FullyBackwordCL() {
+    }
+
+    /**
+     * Backward pass where all buffers live on the host: weight is uploaded,
+     * weightDelta/biasDelta are uploaded, accumulated on the device and
+     * copied back into the given arrays before returning.
+     */
+    public void backword(int inputSize, int outputSize,
+            int[] dropout, float[] input, float[] delta,
+            float[] result, float[] weight,
+            float[] weightDelta, float[] biasDelta,
+            float[] newDelta,
+            float learningRate, ActivationFunction activation){
+        CLBuffer bufWeight = OpenCL.createReadBuffer(weight);
+        CLBuffer bufWeightDelta = OpenCL.createReadWriteBuffer(weightDelta);
+        CLBuffer bufBiasDelta = OpenCL.createReadWriteBuffer(biasDelta);
+        OpenCL.getQueue()
+                .putWriteBuffer(bufWeightDelta ,false)
+                .putWriteBuffer(bufBiasDelta ,false)
+                .putWriteBuffer(bufWeight ,false);
+
+        backword(inputSize, outputSize,
+                dropout, input, delta,
+                result, bufWeight, bufWeightDelta, bufBiasDelta,
+                newDelta,
+                learningRate, activation);
+
+        // the second read is blocking, so both transfers are complete afterwards
+        OpenCL.getQueue()
+                .putReadBuffer(bufBiasDelta ,false)
+                .putReadBuffer(bufWeightDelta ,true);
+        bufBiasDelta.getBuffer().get(biasDelta);
+        bufWeightDelta.getBuffer().get(weightDelta);
+
+        bufWeight .release();
+        bufWeightDelta .release();
+        bufBiasDelta .release();
+    }
+
+    /**
+     * Backward pass where the weight/delta buffers are managed by the caller
+     * (and stay on the device); input/delta/result/dropout are uploaded here
+     * and newDelta is copied back to the host array.
+     */
+    public void backword(int inputSize, int outputSize,
+            int[] dropout, float[] input, float[] delta,
+            float[] result, CLBuffer bufWeight,
+            CLBuffer bufWeightDelta, CLBuffer bufBiasDelta,
+            float[] newDelta,
+            float learningRate, ActivationFunction activation){
+        CLBuffer bufInput = OpenCL.createReadBuffer(input);
+        CLBuffer bufDelta = OpenCL.createReadBuffer(delta);
+        CLBuffer bufResult = OpenCL.createReadBuffer(result);
+        CLBuffer bufNewDelta = OpenCL.createWriteBuffer(newDelta.length);
+        CLBuffer bufDropout = OpenCL.createReadBuffer(dropout);
+        OpenCL.getQueue()
+                .putWriteBuffer(bufInput ,false)
+                .putWriteBuffer(bufDelta ,false)
+                .putWriteBuffer(bufResult ,false)
+                .putWriteBuffer(bufDropout ,false);
+
+        backword(inputSize, outputSize,
+                bufDropout, bufInput, bufDelta,
+                bufResult, bufWeight, bufWeightDelta, bufBiasDelta,
+                bufNewDelta,
+                learningRate, activation);
+
+        // NOTE(review): non-blocking read followed immediately by get(); this
+        // relies on the in-order queue having finished — confirm it is safe.
+        OpenCL.getQueue()
+                .putReadBuffer(bufNewDelta ,false);
+        bufNewDelta.getBuffer().get(newDelta);
+
+        bufInput .release();
+        bufDelta .release();
+        bufResult .release();
+        bufNewDelta .release();
+        bufDropout .release();
+
+    }
+    /**
+     * Backward pass entirely on device buffers. Order: activation derivative
+     * into a temporary buffer, then delta propagation (per input unit), weight
+     * delta accumulation (per weight), and bias delta accumulation (per output).
+     */
+    public void backword(int inputSize, int outputSize,
+            CLBuffer bufDropout, CLBuffer bufInput, CLBuffer bufDelta,
+            CLBuffer bufResult, CLBuffer bufWeight,
+            CLBuffer bufWeightDelta, CLBuffer bufBiasDelta,
+            CLBuffer bufNewDelta,
+            float learningRate, ActivationFunction activation){
+        if(prog == null){
+            prog = OpenCL.compile("fully_backword.cl");
+            kernels = prog.createCLKernels();
+        }
+        if(progActivation == null){
+            progActivation = OpenCL.compile("activation.cl");
+            actKernels = progActivation.createCLKernels();
+        }
+
+        CLBuffer bufDiffed = OpenCL.createReadWriteBuffer(outputSize);
+
+        // kernel name convention: "<activation>_diff" in activation.cl
+        CLKernel actKernel = actKernels.get(activation.getName() + "_diff");
+        actKernel.rewind()
+                .putArg(bufResult)
+                .putArg(bufDiffed);
+        OpenCL.execute(actKernel, outputSize);
+
+        CLKernel kernelDelta = kernels.get("backword_delta");
+        kernelDelta.rewind()
+                .putArg(outputSize)
+                .putArgs(
+                        bufDropout,
+                        bufDelta,
+                        bufDiffed,
+                        bufWeight,
+                        bufNewDelta);
+        OpenCL.execute(kernelDelta, inputSize);
+
+        CLKernel kernelWeight = kernels.get("backword_weight");
+        kernelWeight.rewind()
+                .putArg(outputSize)
+                .putArg(learningRate)
+                .putArgs(
+                        bufDropout,
+                        bufInput,
+                        bufDelta,
+                        bufDiffed,
+                        bufWeight,
+                        bufWeightDelta);
+        OpenCL.execute(kernelWeight, inputSize * outputSize);
+
+        CLKernel kernelBias = kernels.get("backword_bias");
+        kernelBias.rewind()
+                .putArg(outputSize)
+                .putArg(learningRate)
+                .putArg(bufDropout)
+                .putArg(bufDelta)
+                .putArg(bufDiffed)
+                .putArg(bufBiasDelta);
+        OpenCL.execute(kernelBias, outputSize);
+
+        bufDiffed .release();
+    }
+
+    /** Standalone smoke test on tiny zero-filled buffers. */
+    public static void main(String[] args) {
+        int inputSize = 5;
+        int outputSize = 9;
+        int[] dropout = new int[outputSize];
+        float[] delta = new float[outputSize];
+        float[] input = new float[inputSize];
+        float[] result = new float[outputSize];
+        float[] weight = new float[inputSize * outputSize];
+        float[] weightDelta = new float[weight.length];
+        float[] biasDelta = new float[outputSize];
+        float[] newDelta = new float[inputSize];
+
+        INSTANCE.backword(inputSize, outputSize,
+                dropout, input, delta, result, weight,
+                weightDelta, biasDelta, newDelta, 0.001f, new RectifiedLinear());
+    }
+}
diff --git a/src/main/java/kishida/cnn/opencl/FullyForwardCL.java b/src/main/java/kishida/cnn/opencl/FullyForwardCL.java
new file mode 100644
index 0000000..10cb2e0
--- /dev/null
+++ b/src/main/java/kishida/cnn/opencl/FullyForwardCL.java
@@ -0,0 +1,119 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package kishida.cnn.opencl;
+
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLKernel;
+import com.jogamp.opencl.CLProgram;
+import java.nio.FloatBuffer;
+import java.nio.IntBuffer;
+import java.util.Map;
+import kishida.cnn.activation.ActivationFunction;
+import kishida.cnn.activation.SoftMaxFunction;
+
+/**
+ *
+ * @author naoki
+ */
+public class FullyForwardCL {
+ public static FullyForwardCL INSTANCE = new FullyForwardCL();
+ CLProgram progFully;
+ CLProgram progActivation;
+ CLKernel forwardKernel;
+ Map actKernels;
+ public FullyForwardCL() {
+ }
+
+ public void forward(int inputSize, int outputSize, int[] dropout,
+ float[] input, float[] weight, float[] bias, float[] result,
+ ActivationFunction activation){
+ CLBuffer bufWeight = OpenCL.createReadBuffer(weight);
+ CLBuffer bufBias = OpenCL.createReadBuffer(bias);
+
+ forward(inputSize, outputSize, dropout, input, bufWeight, bufBias, result, activation);
+
+ OpenCL.getQueue()
+ .putWriteBuffer(bufWeight, false)
+ .putWriteBuffer(bufBias, false);
+ bufWeight.release();
+ bufBias.release();
+
+ }
+ public void forward(int inputSize, int outputSize, int[] dropout,
+ float[] input, CLBuffer bufWeight,
+ CLBuffer bufBias, float[] result,
+ ActivationFunction activation){
+ CLBuffer bufInput = OpenCL.createReadBuffer(input);
+ CLBuffer bufResult = OpenCL.createReadWriteBuffer(result.length);
+ CLBuffer bufDropout = OpenCL.createReadBuffer(dropout);
+
+ OpenCL.getQueue()
+ .putWriteBuffer(bufInput, false)
+ .putWriteBuffer(bufDropout, false);
+
+ forward(inputSize, outputSize, bufDropout, bufInput, bufWeight, bufBias, bufResult, activation);
+
+ OpenCL.getQueue().putReadBuffer(bufResult, true);
+ bufResult.getBuffer().get(result);
+
+ bufInput.release();
+ bufResult.release();
+ bufDropout.release();
+
+ }
+ public void forward(int inputSize, int outputSize, CLBuffer bufDropout,
+ CLBuffer bufInput, CLBuffer bufWeight,
+ CLBuffer bufBias, CLBuffer bufResult,
+ ActivationFunction activation){
+ if(progFully == null){
+ progFully = OpenCL.compile("fully_forward.cl");
+ forwardKernel = progFully.createCLKernel("forward");
+ }
+ if(progActivation == null){
+ progActivation = OpenCL.compile("activation.cl");
+ actKernels = progActivation.createCLKernels();
+ }
+
+ forwardKernel.rewind()
+ .putArg(inputSize)
+ .putArg(outputSize)
+ .putArgs(
+ bufDropout,
+ bufInput,
+ bufWeight,
+ bufBias,
+ bufResult);
+ OpenCL.execute(forwardKernel, outputSize);
+
+ if(activation instanceof SoftMaxFunction){
+ softmax(outputSize, bufResult);
+
+ }else{
+ CLKernel kernelAct = actKernels.get(activation.getName());
+ kernelAct.rewind()
+ .putArg(bufResult);
+ OpenCL.execute(kernelAct, outputSize);
+ }
+
+ }
+
+ private void softmax(int outputSize, CLBuffer bufResult) {
+ CLBuffer bufExped = OpenCL.createReadWriteBuffer(outputSize);
+ CLKernel kernelActPre = actKernels.get("softmax_before");
+ kernelActPre.rewind()
+ .putArg(bufResult)
+ .putArg(bufExped);
+ OpenCL.execute(kernelActPre, outputSize);
+
+ CLKernel kernelAct = actKernels.get("softmax");
+ kernelAct.rewind()
+ .putArg(bufExped)
+ .putArg(bufResult);
+ OpenCL.execute(kernelAct, outputSize);
+
+ bufExped.release();
+ }
+}
diff --git a/src/main/java/kishida/cnn/opencl/MaxPoolingCL.java b/src/main/java/kishida/cnn/opencl/MaxPoolingCL.java
new file mode 100644
index 0000000..e143fe9
--- /dev/null
+++ b/src/main/java/kishida/cnn/opencl/MaxPoolingCL.java
@@ -0,0 +1,134 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package kishida.cnn.opencl;
+
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLKernel;
+import com.jogamp.opencl.CLProgram;
+import java.nio.FloatBuffer;
+import java.util.Map;
+
+/**
+ *
+ * @author naoki
+ */
+/**
+ * GPU max-pooling layer (forward and backward) via "maxpooling.cl".
+ * Singleton; the program is compiled lazily on first use.
+ * NOTE(review): parameter name "ouptutHeight" is a typo for "outputHeight";
+ * rename in a follow-up (source-compatible for callers).
+ */
+public class MaxPoolingCL {
+    public static MaxPoolingCL INSTANCE = new MaxPoolingCL();
+
+    CLProgram prog;
+    Map kernels;
+
+    private MaxPoolingCL() {
+    }
+
+    /**
+     * Forward pass on host arrays: uploads input, runs the "forward" kernel,
+     * and copies the pooled output back into result (blocking read).
+     */
+    public void forward(int inputChannel, int inputWidth, int inputHeight, int outputWidth, int ouptutHeight,
+            int size, int stride, float[] input, float[] result){
+        CLBuffer bufInput = OpenCL.createReadBuffer(input);
+        CLBuffer bufResult = OpenCL.createWriteBuffer(result.length);
+        OpenCL.getQueue()
+                .putWriteBuffer(bufInput, false);
+
+        forward(inputChannel, inputWidth, inputHeight,
+                outputWidth, ouptutHeight, size, stride, bufInput, bufResult);
+
+        OpenCL.getQueue().putReadBuffer(bufResult, true);
+        bufResult.getBuffer().get(result);
+
+        bufInput.release();
+        bufResult.release();
+
+    }
+    /** Forward pass on device buffers; one work-item per output element. */
+    public void forward(int inputChannel, int inputWidth, int inputHeight, int outputWidth, int ouptutHeight,
+            int size, int stride, CLBuffer bufInput, CLBuffer bufResult){
+
+        if(prog == null){
+            prog = OpenCL.compile("maxpooling.cl");
+            kernels = prog.createCLKernels();
+        }
+
+        CLKernel kernelForward = kernels.get("forward");
+        kernelForward.rewind()
+                .putArg(inputWidth)
+                .putArg(inputHeight)
+                .putArg(outputWidth)
+                .putArg(ouptutHeight)
+                .putArg(size)
+                .putArg(stride)
+                .putArgs(
+                        bufInput,
+                        bufResult);
+        OpenCL.execute(kernelForward,
+                inputChannel * outputWidth * ouptutHeight);
+    }
+
+    /**
+     * Backward pass on host arrays: uploads input and delta, runs the
+     * "backword" kernel, and copies newDelta back (blocking read).
+     */
+    public void backword(int inputChannel, int inputWidth, int inputHeight,
+            int outputWidth, int outputHeight,
+            int size, int stride,
+            float[] input, float[] delta, float[] newDelta){
+        CLBuffer bufInput = OpenCL.createReadBuffer(input);
+        CLBuffer bufDelta = OpenCL.createReadBuffer(delta);
+        CLBuffer bufNewDelta = OpenCL.createReadWriteBuffer(newDelta);
+        OpenCL.getQueue()
+                .putWriteBuffer(bufInput, false)
+                .putWriteBuffer(bufDelta, false);
+
+        backword(inputChannel, inputWidth, inputHeight,
+                outputWidth, outputHeight, size, stride,
+                bufInput, bufDelta, bufNewDelta);
+
+        OpenCL.getQueue().putReadBuffer(bufNewDelta, true);
+        bufNewDelta.getBuffer().get(newDelta);
+
+        bufInput.release();
+        bufDelta.release();
+        bufNewDelta.release();
+    }
+    /** Backward pass on device buffers; one work-item per input element. */
+    public void backword(int inputChannel, int inputWidth, int inputHeight,
+            int outputWidth, int outputHeight,
+            int size, int stride,
+            CLBuffer bufInput,
+            CLBuffer bufDelta, CLBuffer bufNewDelta){
+        if(prog == null){
+            prog = OpenCL.compile("maxpooling.cl");
+            kernels = prog.createCLKernels();
+        }
+
+        CLKernel kernelForward = kernels.get("backword");
+        kernelForward.rewind()
+                .putArg(inputWidth)
+                .putArg(inputHeight)
+                .putArg(outputWidth)
+                .putArg(outputHeight)
+                .putArg(size)
+                .putArg(stride)
+                .putArgs(
+                        bufInput,
+                        bufDelta,
+                        bufNewDelta);
+        OpenCL.execute(kernelForward,
+                inputChannel * inputWidth * inputHeight);
+
+    }
+
+    /** Standalone smoke test on zero-filled buffers. */
+    public static void main(String[] args) {
+        int inputChannel = 3;
+        int inputWidth = 32;
+        int inputHeight = 32;
+        int size = 11;
+        int stride = 2;
+        int outputWidth = inputWidth / stride;
+        int outputHeight = inputHeight / stride;
+        float[] input = new float[inputChannel * inputWidth * inputHeight];
+        float[] result = new float[inputChannel * outputWidth * outputHeight];
+        float[] newDelta = new float[input.length];
+        float[] delta = new float[result.length];
+
+        INSTANCE.forward(inputChannel, inputWidth, inputHeight, outputWidth, outputHeight, size, stride,
+                input,result);
+        INSTANCE.backword(inputChannel, inputWidth, inputHeight, outputWidth, outputHeight, size, stride,
+                input, delta, newDelta);
+    }
+}
diff --git a/src/main/java/kishida/cnn/opencl/MultiNormalizeCL.java b/src/main/java/kishida/cnn/opencl/MultiNormalizeCL.java
new file mode 100644
index 0000000..294810d
--- /dev/null
+++ b/src/main/java/kishida/cnn/opencl/MultiNormalizeCL.java
@@ -0,0 +1,92 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package kishida.cnn.opencl;
+
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLKernel;
+import com.jogamp.opencl.CLProgram;
+import java.nio.FloatBuffer;
+import java.util.Map;
+
+/**
+ *
+ * @author naoki
+ */
+/**
+ * GPU local-contrast normalization via "multi_normalize.cl": an "average"
+ * kernel computes per-pixel mean/std over a size x size window, then the
+ * "forward" kernel normalizes every element with them.
+ */
+public class MultiNormalizeCL {
+    public static MultiNormalizeCL INSTANCE = new MultiNormalizeCL();
+
+    CLProgram prog;
+    Map kernels;
+
+    /**
+     * Normalization on host arrays: uploads input, runs the kernels, and
+     * copies the normalized data back into result.
+     */
+    public void normalize(int inputChannels, int inputWidth, int inputHeight,
+            int size, float threshold,
+            float[] input, float[] result){
+        CLBuffer bufInput = OpenCL.createReadBuffer(input);
+        CLBuffer bufResult = OpenCL.createWriteBuffer(result.length);
+        OpenCL.getQueue().putWriteBuffer(bufInput, false);
+
+        normalize(inputChannels, inputWidth, inputHeight, size, threshold,
+                bufInput, bufResult);
+
+        // FIX: read the result only after normalize() has enqueued the kernels.
+        // The blocking read used to be issued BEFORE the kernels ran, so the
+        // host array was filled with the buffer's uninitialized contents.
+        OpenCL.getQueue().putReadBuffer(bufResult, true);
+        bufResult.getBuffer().get(result);
+
+        bufInput.release();
+        bufResult.release();
+
+    }
+    /** Normalization on device buffers; temp mean/std buffers are local to the call. */
+    public void normalize(int inputChannels, int inputWidth, int inputHeight,
+            int size, float threshold,
+            CLBuffer bufInput, CLBuffer bufResult){
+        if(prog == null){
+            prog = OpenCL.compile("multi_normalize.cl");
+            kernels = prog.createCLKernels();
+        }
+
+        CLBuffer bufAverages = OpenCL.createReadWriteBuffer(inputWidth * inputHeight);
+        CLBuffer bufStds = OpenCL.createReadWriteBuffer(inputWidth * inputHeight);
+
+        CLKernel kernelAverage = kernels.get("average");
+        kernelAverage.rewind()
+                .putArg(inputChannels)
+                .putArg(inputWidth)
+                .putArg(inputHeight)
+                .putArg(size)
+                .putArg(threshold)
+                .putArgs(
+                        bufInput,
+                        bufAverages,
+                        bufStds);
+        OpenCL.execute(kernelAverage, inputWidth * inputHeight);
+
+        CLKernel kernelForward = kernels.get("forward");
+        kernelForward.rewind()
+                .putArg(inputChannels)
+                .putArg(inputWidth)
+                .putArg(inputHeight)
+                .putArgs(
+                        bufInput,
+                        bufAverages,
+                        bufStds,
+                        bufResult);
+        OpenCL.execute(kernelForward, inputChannels * inputWidth * inputHeight);
+
+        bufAverages.release();
+        bufStds.release();
+
+    }
+
+    /** Standalone smoke test on zero-filled buffers. */
+    public static void main(String[] args) {
+        int inputChannels = 3;
+        int inputWidth = 32;
+        int inputHeight = 32;
+        int size = 5;
+        float threshold = 0.00001f;
+        float[] input = new float[inputChannels * inputWidth * inputHeight];
+        float[] result = new float[inputChannels * inputWidth * inputHeight];
+        new MultiNormalizeCL().normalize(inputChannels, inputWidth, inputHeight, size, threshold, input, result);
+    }
+}
diff --git a/src/main/java/kishida/cnn/opencl/OpenCL.java b/src/main/java/kishida/cnn/opencl/OpenCL.java
new file mode 100644
index 0000000..bb635c1
--- /dev/null
+++ b/src/main/java/kishida/cnn/opencl/OpenCL.java
@@ -0,0 +1,96 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package kishida.cnn.opencl;
+
+import com.jogamp.opencl.CLBuffer;
+import com.jogamp.opencl.CLCommandQueue;
+import com.jogamp.opencl.CLContext;
+import com.jogamp.opencl.CLDevice;
+import com.jogamp.opencl.CLKernel;
+import com.jogamp.opencl.CLMemory;
+import com.jogamp.opencl.CLProgram;
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.nio.FloatBuffer;
+import java.nio.IntBuffer;
+import lombok.Getter;
+
+/**
+ *
+ * @author naoki
+ */
+/**
+ * Process-wide OpenCL context/queue holder and buffer/kernel helpers.
+ * The context is created lazily by getCtx(); all buffers are allocated from it.
+ * NOTE(review): static mutable state, not thread-safe — confirm single-threaded use.
+ */
+public class OpenCL {
+
+    static CLContext ctx;
+    @Getter
+    static CLCommandQueue queue;
+    static CLDevice device;
+
+    /** Creates the context on the max-FLOPS device and an in-order command queue. */
+    public static void prepare(){
+        ctx = CLContext.create();
+        device = ctx.getMaxFlopsDevice();
+        System.out.println(device);
+        queue = device.createCommandQueue();
+    }
+
+    /** Drains the queue and releases the context; getCtx() will re-create it. */
+    public static void release(){
+        queue.finish();
+        ctx.release();
+        ctx = null;
+    }
+
+    public static CLContext getCtx() {
+        if(ctx == null){
+            prepare();
+        }
+        return ctx;
+    }
+
+    /** Compiles an OpenCL program from the classpath resource /kernels/&lt;path&gt;. */
+    public static CLProgram compile(String path){
+        try {
+            return getCtx().createProgram(OpenCL.class.getResourceAsStream("/kernels/" + path))
+                    .build();
+        } catch (IOException ex) {
+            throw new UncheckedIOException(ex);
+        }
+    }
+
+    /** Read-only float buffer pre-filled with data (still needs putWriteBuffer). */
+    public static CLBuffer createReadBuffer(float[] data){
+        CLBuffer buf = getCtx().createFloatBuffer(
+                data.length, CLMemory.Mem.READ_ONLY);
+        buf.getBuffer().put(data).rewind();//without the rewind the transfer becomes unstable
+        return buf;
+    }
+    /** Read-write float buffer pre-filled with data. */
+    public static CLBuffer createReadWriteBuffer(float[] data){
+        CLBuffer buf = createReadWriteBuffer(data.length);
+        buf.getBuffer().put(data).rewind();//without the rewind the transfer becomes unstable
+        return buf;
+    }
+    /** Uninitialized read-write float buffer of the given element count. */
+    public static CLBuffer createReadWriteBuffer(int size){
+        CLBuffer buf = getCtx().createFloatBuffer(
+                size, CLMemory.Mem.READ_WRITE);
+        return buf;
+
+    }
+    /** Uninitialized write-only float buffer of the given element count. */
+    public static CLBuffer createWriteBuffer(int size){
+        return getCtx().createFloatBuffer(size, CLMemory.Mem.WRITE_ONLY);
+    }
+    /** Read-only int buffer pre-filled with data. */
+    public static CLBuffer createReadBuffer(int[] data){
+        CLBuffer buf = getCtx().createIntBuffer(
+                data.length, CLMemory.Mem.READ_ONLY);
+        buf.getBuffer().put(data).rewind();
+        return buf;
+    }
+    /**
+     * Enqueues kernel over `range` items, padding the global size up to the
+     * work-group size. Appends `range` as the kernel's trailing int argument,
+     * so every kernel must declare a final `count` parameter and guard on it.
+     */
+    public static CLCommandQueue execute(CLKernel kernel, int range){
+        int localWorkSize = Math.min(device.getMaxWorkGroupSize(), 128);
+        int globalWorkSize = roundUp(localWorkSize, range);
+        kernel.putArg(range);
+        return getQueue().put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize);
+    }
+    /** Rounds globalSize up to the next multiple of groupSize. */
+    static int roundUp(int groupSize, int globalSize){
+        return ((globalSize + groupSize - 1) / groupSize) * groupSize;
+    }
+}
diff --git a/src/main/java/kishida/imagefiltering/InfoCL.java b/src/main/java/kishida/imagefiltering/InfoCL.java
new file mode 100644
index 0000000..9af0e73
--- /dev/null
+++ b/src/main/java/kishida/imagefiltering/InfoCL.java
@@ -0,0 +1,37 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package kishida.imagefiltering;
+
+import com.jogamp.opencl.CLContext;
+import com.jogamp.opencl.CLDevice;
+
+/**
+ *
+ * @author naoki
+ */
+/** Prints basic capabilities of the best available OpenCL device. */
+public class InfoCL {
+    public static void main(String[] args) {
+        CLContext ctx = CLContext.create();
+        CLDevice dev = ctx.getMaxFlopsDevice();
+        System.out.println(dev);
+        System.out.println(memSizeString(dev.getGlobalMemSize()));
+        System.out.println(memSizeString(dev.getLocalMemSize()));
+        System.out.println(dev.getMaxWorkGroupSize());
+        System.out.println(dev.getCVersion());
+        ctx.release();
+    }
+    /** Formats a byte count with binary-prefix units, e.g. 1536 -> "1.5KB". */
+    static String memSizeString(long memSize){
+        double size = memSize;
+        String[] unit = {"", "K", "M", "G", "T"};
+        for(int i = 0; i < unit.length; ++i){
+            if(size < 1024){
+                return String.format("%.1f%sB", size, unit[i]);
+            }
+            size /= 1024;
+        }
+        // FIX: "%.1f%sPB" had two format specifiers but only one argument and
+        // threw MissingFormatArgumentException for sizes >= 1024 TB.
+        return String.format("%.1fPB", size);
+    }
+}
diff --git a/src/main/java/kishida/imagefiltering/KernelBench.java b/src/main/java/kishida/imagefiltering/KernelBench.java
index cc236bd..8308817 100644
--- a/src/main/java/kishida/imagefiltering/KernelBench.java
+++ b/src/main/java/kishida/imagefiltering/KernelBench.java
@@ -6,10 +6,8 @@
package kishida.imagefiltering;
import java.util.Arrays;
-import java.util.Objects;
import java.util.Random;
import java.util.function.Consumer;
-import java.util.function.DoubleConsumer;
/**
*
diff --git a/src/main/resources/alexnet_def.json b/src/main/resources/alexnet_def.json
index c5a0e0f..66771f6 100644
--- a/src/main/resources/alexnet_def.json
+++ b/src/main/resources/alexnet_def.json
@@ -95,7 +95,7 @@
"initBias" : 1.0,
"dropoutRate" : 0.5,
"activation" : "RectifiedLinear",
- "useGpu" : false
+ "useGpu" : true
}
}, {
"FullyConnect" : {
@@ -113,7 +113,7 @@
"initBias" : 1.0,
"dropoutRate" : 1.0,
"activation" : "SoftMaxFunction",
- "useGpu" : false
+ "useGpu" : true
}
} ]
}
\ No newline at end of file
diff --git a/src/main/resources/kernels/activation.cl b/src/main/resources/kernels/activation.cl
new file mode 100644
index 0000000..0143565
--- /dev/null
+++ b/src/main/resources/kernels/activation.cl
@@ -0,0 +1,64 @@
+// Element-wise activation kernels. The trailing `count` argument is appended
+// automatically by OpenCL.execute; each kernel guards its global id against it
+// because the global range is rounded up to the work-group size.
+
+// In-place ReLU: result[i] = max(result[i], 0).
+__kernel void relu(
+    __global float *result,
+    int count
+){
+    int i = get_global_id(0);
+    if(i >= count){
+        return;
+    }
+    float r = result[i];
+    result[i] = r >= 0 ? r : 0;
+}
+
+// ReLU derivative: 1 where the forward output was >= 0, else 0.
+__kernel void relu_diff(
+    __global const float* result,
+    __global float* diff,
+    int count
+){
+    int i = get_global_id(0);
+    if(i >= count){
+        return;
+    }
+    diff[i] = result[i] >= 0 ? 1 : 0;
+}
+
+// Softmax stage 1: exponentiate each logit.
+// NOTE(review): the 700 clamp looks inherited from a double implementation;
+// float exp() already overflows to +inf above ~88 — confirm intended behavior.
+__kernel void softmax_before(
+    __global const float* result,
+    __global float* exped,
+    int count
+){
+    int i = get_global_id(0);
+    if(i >= count){
+        return;
+    }
+    exped[i] = exp(min(700.0f, result[i]));
+}
+
+// Softmax stage 2: normalize by the total. Every work-item re-sums all
+// `count` elements, i.e. O(count^2) work overall — fine only for small layers.
+__kernel void softmax(
+    __global const float* exped,
+    __global float* result,
+    int count
+){
+    int i = get_global_id(0);
+    if(i >= count){
+        return;
+    }
+    float sum = 0;
+    for(int j = 0; j < count; ++j){
+        sum += exped[j];
+    }
+    result[i] = exped[i] / sum;
+}
+
+// Softmax derivative using only the diagonal of the Jacobian: r * (1 - r).
+__kernel void softmax_diff(
+    __global const float* result,
+    __global float* diff,
+    int count
+){
+    int i = get_global_id(0);
+    if(i >= count){
+        return;
+    }
+    float r = result[i];
+    diff[i] = r * (1 - r);
+}
diff --git a/src/main/resources/kernels/convolution_backword.cl b/src/main/resources/kernels/convolution_backword.cl
new file mode 100644
index 0000000..0d0d9a0
--- /dev/null
+++ b/src/main/resources/kernels/convolution_backword.cl
@@ -0,0 +1,264 @@
+// Computes newDelta for one input element (index chxxyy decomposed into
+// channel ch and position xx,yy) by accumulating ReLU-gated output deltas
+// weighted by the corresponding filter taps.
+// NOTE(review): the loop header below is corrupted (an extraction artifact ate
+// the '<' comparisons and collapsed several nested for-loops into one line).
+// Restore this function from version control before relying on it — TODO.
+void delta_proc(
+    int chxxyy,
+    int inputWidth,
+    int inputHeight,
+    int filterSize,
+    int outputChannels,
+    int stride,
+    int outputWidth,
+    int outputHeight,
+    __global const float *result,
+    __global const float *delta,
+    __global const float *filter,
+    int inputChannels,
+    __global float *newDelta
+){
+    int ch = chxxyy / (inputWidth * inputHeight);
+    int xx = (chxxyy % (inputWidth * inputHeight)) / inputHeight;
+    int yy = chxxyy % inputHeight;
+    int sizeHalf = filterSize / 2;
+    float tempDelta = 0.0f;
+    for (int f = 0; f=0 && x=0 && y=0.0f)?delta[fxy]:0.0f;
+                    tempDelta = tempDelta + d * filter[
+                            f * inputChannels * filterSize * filterSize +
+                            ch * filterSize * filterSize +
+                            i * filterSize + j];
+                }
+            }
+        }
+    }
+    }
+    newDelta[chxxyy] = tempDelta;
+}
+
+// One work-item per input element: guard against the padded global range,
+// then delegate to delta_proc (shared with the fused "dfb" kernel).
+__kernel void delta_kernel(
+    int inputWidth,
+    int inputHeight,
+    int filterSize,
+    int outputChannels,
+    int stride,
+    int outputWidth,
+    int outputHeight,
+    __global const float *result,
+    __global const float *delta,
+    __global const float *filter,
+    int inputChannels,
+    __global float *newDelta,
+    int count
+){
+    int chxxyy = get_global_id(0);
+    if(chxxyy >= count){
+        return;
+    }
+    delta_proc(
+        chxxyy,
+        inputWidth,
+        inputHeight,
+        filterSize,
+        outputChannels,
+        stride,
+        outputWidth,
+        outputHeight,
+        result,
+        delta,
+        filter,
+        inputChannels,
+        newDelta);
+}
+
+// Accumulates the (learning-rate scaled) gradient for one filter tap, indexed
+// by fchij = (filter f, channel ch, tap i, tap j), into filterDelta.
+// NOTE(review): the loop header below is corrupted (extraction ate the '<'
+// comparisons and the inner y-loop); restore from version control — TODO.
+void filter_proc(
+    int fchij,
+    int inputChannels,
+    int filterSize,
+    int outputWidth,
+    int outputHeight,
+    __global const float *result,
+    __global const float *delta,
+    int stride,
+    int inputWidth,
+    int inputHeight,
+    float learningRate,
+    __global const float *input,
+    __global float *filterDelta
+){
+
+    int f = fchij / ((inputChannels * filterSize) * filterSize);
+    int ch = (fchij % ((inputChannels * filterSize) * filterSize)) / (filterSize * filterSize);
+    int i = (fchij % (filterSize * filterSize)) / filterSize;
+    int j = fchij % filterSize;
+    float df = 0.0f;
+    for (int x = 0; x=0.0f)?delta[fxy]:0.0f;
+            int xx = x * stride + i - filterSize / 2;
+            if (xx >= 0 && xx < inputWidth){
+                int yy = y * stride + j - filterSize / 2;
+                if (yy >= 0 && yy < inputHeight){
+                    df = df + d * learningRate *
+                        input[ch * inputWidth * inputHeight + xx * inputHeight + yy];
+                }
+            }
+        }
+    }
+    filterDelta[fchij] += df;
+}
+
+// One work-item per filter tap: guard against the padded global range,
+// then delegate to filter_proc (shared with the fused "dfb" kernel).
+__kernel void filter_kernel(
+    int inputChannels,
+    int filterSize,
+    int outputWidth,
+    int outputHeight,
+    __global const float *result,
+    __global const float *delta,
+    int stride,
+    int inputWidth,
+    int inputHeight,
+    float learningRate,
+    __global const float *input,
+    __global float *filterDelta,
+    int count
+){
+    int fchij = get_global_id(0);
+    if(fchij >= count){
+        return;
+    }
+    filter_proc(fchij, inputChannels, filterSize,
+            outputWidth, outputHeight, result, delta,
+            stride, inputWidth, inputHeight, learningRate,
+            input, filterDelta);
+}
+
+// Per output element: delta gated by the ReLU derivative (result >= 0),
+// scaled by the learning rate, written into a temporary buffer for later
+// per-filter reduction by biasAfter.
+void bias_proc(
+    int fxy,
+    __global const float *result,
+    __global const float *delta,
+    __global float *tempBiasDelta,
+    float learningRate
+){
+    float d = result[fxy]>=0.0f ? delta[fxy] : 0.0f;
+    tempBiasDelta[fxy] = learningRate * d;
+}
+
+// One work-item per output element; delegates to bias_proc.
+__kernel void bias_kernel(
+    __global const float *result,
+    __global const float *delta,
+    __global float *tempBiasDelta,
+    float learningRate,
+    int count
+){
+    int fxy = get_global_id(0);
+    if(fxy >= count){
+        return;
+    }
+    bias_proc(fxy, result, delta, tempBiasDelta, learningRate);
+}
+// Reduction stage: one work-item per filter f sums its outputWidth*outputHeight
+// slice of tempBiasDelta and accumulates into biasDelta[f].
+__kernel void biasAfter(
+    int outputWidth,
+    int outputHeight,
+    __global const float *tempBiasDelta,
+    __global float *biasDelta,
+    int count
+){
+    int f = get_global_id(0);
+    if(f >= count){
+        return;
+    }
+    float b = 0;
+    for(int xy = 0; xy < outputWidth * outputHeight; ++xy){
+        b += tempBiasDelta[f * outputWidth * outputHeight + xy];
+    }
+    biasDelta[f] += b;
+}
+
+// Fused backward kernel: a single 1D range covers three disjoint index bands —
+// [0, deltaCount) runs delta_proc, [deltaCount, deltaCount+filterCount) runs
+// filter_proc, and the next biasCount items run bias_proc. This lets one
+// dispatch do the whole backward step. `count` (appended by OpenCL.execute)
+// is unused here because the three band bounds already limit the work.
+__kernel void dfb(
+    int inputChannels,
+    int inputWidth,
+    int inputHeight,
+    int outputChannels,
+    int outputWidth,
+    int outputHeight,
+    int filterSize,
+    int stride,
+    float learningRate,
+    __global float *input,
+    __global float *result,
+    __global float *filter,
+    __global float *filterDelta,
+    __global float *delta,
+    __global float *newDelta,
+    __global float *tempBiasDelta,
+    int deltaCount,
+    int filterCount,
+    int biasCount,
+    int count
+){
+    int n = get_global_id(0);
+    if(n < deltaCount){
+        delta_proc(n,
+                inputWidth, inputHeight,
+                filterSize, outputChannels, stride,
+                outputWidth, outputHeight,
+                result, delta, filter,
+                inputChannels, newDelta);
+    }else if (n < deltaCount + filterCount){
+        filter_proc(n - deltaCount, inputChannels, filterSize,
+                outputWidth, outputHeight, result, delta,
+                stride, inputWidth, inputHeight, learningRate,
+                input, filterDelta);
+    }else if (n < deltaCount + filterCount + biasCount){
+        bias_proc(n - deltaCount - filterCount,
+            result, delta, tempBiasDelta, learningRate);
+    }
+}
+
+// Scales an existing delta buffer by the momentum factor before new gradients
+// are accumulated into it. (Note: "momentam" is a spelling of "momentum"
+// kept as-is to match the host-side argument order.)
+__kernel void prepare(
+    float momentam,
+    __global float* delta,
+    int count
+){
+    int f = get_global_id(0);
+    if(f >= count){
+        return;
+    }
+    delta[f] *= momentam;
+}
+
+// Applies accumulated filter deltas: average over `count` samples plus an
+// L2 weight-decay term. `len` is the padded-range guard appended by the host.
+__kernel void joinFilter(
+    float weightDecay,
+    float learningRate,
+    int count,
+    __global float* filter,
+    __global const float* filterDelta,
+    int len
+){
+    int f = get_global_id(0);
+    if(f >= len){
+        return;
+    }
+    filter[f] += filterDelta[f] / count
+            - weightDecay * learningRate * filter[f];
+}
+
+// Applies accumulated bias deltas averaged over `count` samples.
+__kernel void joinBias(
+    int count,
+    __global float* bias,
+    __global const float* biasDelta,
+    int len
+){
+    int f = get_global_id(0);
+    if(f >= len){
+        return;
+    }
+    bias[f] += biasDelta[f] / count;
+}
+
+
diff --git a/src/main/resources/kernels/convolution_forward.cl b/src/main/resources/kernels/convolution_forward.cl
new file mode 100644
index 0000000..3af321b
--- /dev/null
+++ b/src/main/resources/kernels/convolution_forward.cl
@@ -0,0 +1,123 @@
+// Forward pass of a convolution layer with ReLU activation.
+// One work-item computes one output element; the flat global id fxy encodes
+// (filter f, output x, output y) with y fastest, matching the buffer layout
+// used throughout these kernels: buffer[ch * width * height + x * height + y].
+// NOTE(review): the loop body below was reconstructed from a garbled diff;
+// the index math follows the layout of the intact sibling kernels — verify.
+__kernel void forward(
+    int outputHeight,
+    int outputWidth,
+    int inputChannels,
+    int filterSize,
+    int stride,
+    int inputWidth,
+    int inputHeight,
+    __global const float *input,
+    __global const float *filter,
+    __global float *result,
+    __global const float *bias,
+    int count
+){
+    int fxy = get_global_id(0);
+    if(fxy >= count){
+        return;
+    }
+
+    int f = fxy / (outputHeight * outputWidth);
+    int x = (fxy % (outputHeight * outputWidth)) / outputHeight;
+    int y = fxy % outputHeight;
+    float r = 0.0f;
+    for (int ch = 0; ch < inputChannels; ++ch) {
+        for (int i = 0; i < filterSize; ++i) {
+            // Center the filter window on (x * stride, y * stride).
+            int xx = x * stride + i - filterSize / 2;
+            if (xx >= 0 && xx < inputWidth) {
+                for (int j = 0; j < filterSize; ++j) {
+                    int yy = y * stride + j - filterSize / 2;
+                    if (yy >= 0 && yy < inputHeight) {
+                        r += input[ch * inputWidth * inputHeight + xx * inputHeight + yy]
+                                * filter[f * inputChannels * filterSize * filterSize
+                                        + ch * filterSize * filterSize
+                                        + i * filterSize + j];
+                    }
+                }
+            }
+        }
+    }
+    // Add bias, then ReLU.
+    float rs = r + bias[f];
+    result[fxy] = rs >= 0 ? rs : 0;
+}
+
+// Forward convolution + ReLU, variant that first copies the current filter's
+// weights into __local memory so the inner loops read local instead of
+// global memory.
+// NOTE(review): reconstructed from a garbled diff; indexing mirrors the plain
+// `forward` kernel above — verify.
+// NOTE(review): async_work_group_copy must be reached by every work-item of a
+// work-group with identical arguments; the early return on `fxy >= count` and
+// the per-item `f` make this safe only if the host sizes work-groups so a
+// group never straddles filters or the count boundary — confirm against the
+// launch code.
+__kernel void forward_local(
+    int outputHeight,
+    int outputWidth,
+    int inputChannels,
+    int filterSize,
+    int stride,
+    int inputWidth,
+    int inputHeight,
+    __global const float *input,
+    __global const float *filter,
+    __global float *result,
+    __global const float *bias,
+    int count
+){
+    int fxy = get_global_id(0);
+    if(fxy >= count){
+        return;
+    }
+    int f = fxy / (outputHeight * outputWidth);
+    int x = (fxy % (outputHeight * outputWidth)) / outputHeight;
+    int y = fxy % outputHeight;
+
+    __local float lfilter[384 * 3 * 3]; // sized for the largest filter used
+    int len = inputChannels * filterSize * filterSize;
+    // BUG FIX: the original passed an uninitialized event_t as the last
+    // argument; the spec requires 0 to have the copy create a new event.
+    event_t ev = async_work_group_copy(lfilter,
+            filter + f * inputChannels * filterSize * filterSize,
+            len, 0);
+    wait_group_events(1, &ev);
+
+    float r = 0.0f;
+    for (int ch = 0; ch < inputChannels; ++ch) {
+        for (int i = 0; i < filterSize; ++i) {
+            int xx = x * stride + i - filterSize / 2;
+            if (xx >= 0 && xx < inputWidth) {
+                for (int j = 0; j < filterSize; ++j) {
+                    int yy = y * stride + j - filterSize / 2;
+                    if (yy >= 0 && yy < inputHeight) {
+                        r += input[ch * inputWidth * inputHeight + xx * inputHeight + yy]
+                                * lfilter[ch * filterSize * filterSize + i * filterSize + j];
+                    }
+                }
+            }
+        }
+    }
+    // Add bias, then ReLU.
+    float rs = r + bias[f];
+    result[fxy] = rs >= 0 ? rs : 0;
+}
+
+
+// Local response normalization (LRN) across channels, AlexNet-style:
+//   result[ch] /= (k + a * sum_{nearby ch'} result[ch']^2) ^ b
+// with k=2, a=1e-4, b=0.75 over a window of n=5 channels.
+// NOTE(review): this kernel reads neighboring channels' values from `result`
+// while other work-items overwrite them in place — the sum can mix normalized
+// and raw neighbor values depending on scheduling; confirm whether this race
+// is acceptable or a separate output buffer is needed.
+__kernel void localNormalize(
+    int outputWidth,
+    int outputHeight,
+    int outputChannels,
+    __global float *result,
+    int count
+){
+    int chxy = get_global_id(0);
+    if(chxy >= count){
+        return;
+    }
+    float k = 2;
+    float a = 1.0E-4f;
+    float b = 0.75f;
+    int n = 5;
+    int ch = chxy / (outputWidth * outputHeight);
+    int xy = chxy % (outputWidth * outputHeight);
+    float sum = 0.0f;
+
+    // Sum of squares over up to n channels centered on ch, clamped in range.
+    for (int lp = max(0, ch - n / 2); lp <= min(outputChannels - 1, ch + n / 2); lp++){
+        sum += result[lp * outputWidth * outputHeight + xy] *
+                result[lp * outputWidth * outputHeight + xy];
+    }
+    result[chxy] /= pow(k + a * sum, b);
+}
diff --git a/src/main/resources/kernels/fully_backword.cl b/src/main/resources/kernels/fully_backword.cl
new file mode 100644
index 0000000..e2d0b6f
--- /dev/null
+++ b/src/main/resources/kernels/fully_backword.cl
@@ -0,0 +1,67 @@
+// Back-propagates delta through the fully connected layer: for input unit i,
+// sums (activation-derivative * delta) of each surviving output unit j,
+// weighted by weight[i][j]. Dropped-out units (dropout[j] != 1) contribute
+// nothing.
+__kernel void backword_delta(
+    int outputSize,
+    __global const int* dropout,
+    __global const float* delta,
+    __global const float* diffed,
+    __global const float* weight,
+    __global float* newDelta,
+    int count
+){
+    int i = get_global_id(0);
+    if(i >= count){
+        return;
+    }
+    float acc = 0;
+    for (int j = 0; j < outputSize; ++j) {
+        if (dropout[j] == 1) {
+            acc += diffed[j] * delta[j] * weight[i * outputSize + j];
+        }
+    }
+    newDelta[i] = acc;
+}
+
+// Accumulates the gradient contribution for weight (i, j):
+//   diffed[j] * delta[j] * input[i] * learningRate,
+// skipping output units removed by dropout. One work-item per (i, j) pair,
+// flattened as ij = i * outputSize + j.
+__kernel void backword_weight(
+    int outputSize,
+    float learningRate,
+    __global const int* dropout,
+    __global const float* input,
+    __global const float* delta,
+    __global const float* diffed,
+    __global const float* weight,
+    __global float* weightDelta,
+    int count
+){
+    int ij = get_global_id(0);
+    if(ij >= count){
+        return;
+    }
+    int j = ij % outputSize;
+    if (dropout[j] == 1) {
+        int i = ij / outputSize;
+        weightDelta[ij] += diffed[j] * delta[j] * input[i] * learningRate;
+    }
+}
+
+// Accumulates the bias-gradient contribution for output unit j, skipping
+// units removed by dropout. (`outputSize` is unused but kept so the host's
+// positional argument binding stays unchanged.)
+__kernel void backword_bias(
+    int outputSize,
+    float learningRate,
+    __global const int* dropout,
+    __global const float* delta,
+    __global const float* diffed,
+    __global float* biasDelta,
+    int count
+){
+    int j = get_global_id(0);
+    if(j < count && dropout[j] == 1){
+        biasDelta[j] += diffed[j] * delta[j] * learningRate;
+    }
+}
\ No newline at end of file
diff --git a/src/main/resources/kernels/fully_forward.cl b/src/main/resources/kernels/fully_forward.cl
new file mode 100644
index 0000000..e8b06a3
--- /dev/null
+++ b/src/main/resources/kernels/fully_forward.cl
@@ -0,0 +1,24 @@
+__kernel void forward(
+ int inSize,
+ int out,
+ __global const int *dropout,
+ __global const float *in,
+ __global float *weight,
+ __global float *bias,
+ __global float *result,
+ int count
+){
+ int j = get_global_id(0);
+ if(j >= count){
+ return;
+ }
+ if (dropout[j] == 1){
+ float r = 0;
+        for (int i = 0; i < inSize; ++i) {
+            r += in[i] * weight[i * out + j];
+        }
+        result[j] = r + bias[j];
+    } else {
+        result[j] = 0;
+    }
+}
diff --git a/src/main/resources/kernels/maxpooling.cl b/src/main/resources/kernels/maxpooling.cl
new file mode 100644
index 0000000..0000000
--- /dev/null
+++ b/src/main/resources/kernels/maxpooling.cl
@@ -0,0 +1,96 @@
+__kernel void forward(
+    int inputWidth,
+    int inputHeight,
+    int outputWidth,
+    int outputHeight,
+    int size,
+    int stride,
+    __global const float* data,
+    __global float* result,
+    int count
+){
+    int chxy = get_global_id(0);
+    if(chxy >= count){
+        return;
+    }
+
+ int ch = chxy / (outputWidth * outputHeight);
+ int x = (chxy % (outputWidth * outputHeight)) / outputHeight;
+ int y = chxy % outputHeight;
+
+ float max = -INFINITY;
+ for (int i = 0; i < size; ++i) {
+ int xx = x * stride + i - size / 2;
+ if (xx < 0 || xx >= inputWidth) {
+ continue;
+ }
+ for (int j = 0; j < size; ++j) {
+ int yy = y * stride + j - size / 2;
+ if (yy < 0 || yy >= inputHeight) {
+ continue;
+ }
+ float d = data[ch * inputWidth * inputHeight + xx * inputHeight + yy];
+ if (max < d) {
+ max = d;
+ }
+ }
+ }
+ result[chxy] = max;
+
+}
+
+// Backward pass of max pooling: routes each output delta back to the input
+// position that won the max in the forward pass.
+// One work-item handles one input element (ch, xi, yi); it re-runs the
+// forward max search for every output cell whose pooling window can contain
+// (xi, yi) and accumulates that cell's delta when this element was the max.
+__kernel void backword(
+    int inputWidth,
+    int inputHeight,
+    int outputWidth,
+    int outputHeight,
+    int size,
+    int stride,
+    __global const float* input,
+    __global const float* delta,
+    __global float* newDelta,
+    int count
+){
+    int chxy = get_global_id(0);
+    if(chxy >= count){
+        return;
+    }
+
+    // Flat layout: ch * W * H + x * H + y (y fastest).
+    int ch = chxy / (inputWidth * inputHeight);
+    int xi = (chxy % (inputWidth * inputHeight)) / inputHeight;
+    int yi = chxy % inputHeight;
+
+    float nd = 0;
+    // Conservative over-approximation of the output cells whose window may
+    // cover (xi, yi).
+    for(int x = max(0, (xi - size / 2) / stride - 1);
+            x < min(outputWidth, (xi + size / 2) / stride + 1); ++x){
+        for(int y = max(0, (yi - size / 2) / stride - 1);
+                y < min(outputHeight, (yi + size / 2) / stride + 1); ++y){
+            // Re-run the forward max search for output cell (x, y).
+            float max = -INFINITY;
+            int maxX = 0;
+            int maxY = 0;
+            for (int i = 0; i < size; ++i) {
+                int xx = x * stride + i - size / 2;
+                if (xx < 0 || xx >= inputWidth) {
+                    continue;
+                }
+                for (int j = 0; j < size; ++j) {
+                    int yy = y * stride + j - size / 2;
+                    if (yy < 0 || yy >= inputHeight) {
+                        continue;
+                    }
+                    // BUG FIX: was `xx * inputWidth + yy`, which disagrees
+                    // with the `xx * inputHeight + yy` layout used by the
+                    // forward pass and by this kernel's own chxy decode, so
+                    // the max was mis-located whenever width != height.
+                    float d = input[ch * inputWidth * inputHeight + xx * inputHeight + yy];
+                    if (max < d) {
+                        max = d;
+                        maxX = xx;
+                        maxY = yy;
+                    }
+                }
+            }
+            if(maxX == xi && maxY == yi){
+                nd += delta[ch * outputWidth * outputHeight + x * outputHeight + y];
+            }
+        }
+    }
+    newDelta[chxy] = nd;
+}
\ No newline at end of file
diff --git a/src/main/resources/kernels/multi_normalize.cl b/src/main/resources/kernels/multi_normalize.cl
new file mode 100644
index 0000000..ad5b87e
--- /dev/null
+++ b/src/main/resources/kernels/multi_normalize.cl
@@ -0,0 +1,76 @@
+// Computes, for every spatial position (x, y), the mean and standard
+// deviation of input values over a size x size neighborhood across all
+// channels; `forward` below uses them to normalize each pixel.
+__kernel void average(
+    int inputChannels,
+    int inputWidth,
+    int inputHeight,
+    int size,
+    float threshold,
+    __global const float* input,
+    __global float* averages,
+    __global float* stds,
+    int len
+){
+    int xy = get_global_id(0);
+    if(xy >= len){
+        return;
+    }
+    int x = xy / inputHeight;
+    int y = xy % inputHeight;
+
+    float total = 0;
+    int count = 0;
+    for(int i = 0; i < size; ++i){
+        int xx = x + i - size / 2;
+        if(xx < 0 || xx >= inputWidth){
+            continue;
+        }
+        for(int j = 0; j < size; ++j){
+            int yy = y + j - size / 2;
+            if(yy < 0 || yy >= inputHeight){
+                continue;
+            }
+            for(int ch = 0; ch < inputChannels; ++ch){
+                // BUG FIX: read the neighborhood sample at (xx, yy); the
+                // original indexed the center pixel (`+ xy`), which made the
+                // window loops and their bounds checks meaningless.
+                total += input[ch * inputHeight * inputWidth + xx * inputHeight + yy];
+                ++count;
+            }
+        }
+    }
+    float average = total / count;
+    float variance = 0;
+    for(int i = 0; i < size; ++i){
+        int xx = x + i - size / 2;
+        if(xx < 0 || xx >= inputWidth){
+            continue;
+        }
+        for(int j = 0; j < size; ++j){
+            int yy = y + j - size / 2;
+            if(yy < 0 || yy >= inputHeight){
+                continue;
+            }
+            for(int ch = 0; ch < inputChannels; ++ch){
+                // BUG FIX: same center-pixel indexing error as above.
+                float data = input[ch * inputHeight * inputWidth + xx * inputHeight + yy];
+                variance += (data - average) * (data - average);
+            }
+        }
+    }
+    averages[xy] = average;
+    // Clamp the std at `threshold` to avoid divide-by-near-zero in `forward`.
+    stds[xy] = max(threshold, sqrt(variance / count));
+}
+
+// Normalizes each value using the per-position mean and (clamped) standard
+// deviation produced by the `average` kernel: out = (in - mean) / std.
+// The same (mean, std) pair is shared by all channels at a spatial position.
+__kernel void forward(
+    int inputChannels,
+    int inputWidth,
+    int inputHeight,
+    __global const float* input,
+    __global const float* averages,
+    __global const float* stds,
+    __global float* result,
+    int count
+){
+    int chxy = get_global_id(0);
+    if(chxy < count){
+        int xy = chxy % (inputWidth * inputHeight);
+        result[chxy] = (input[chxy] - averages[xy]) / stds[xy];
+    }
+}
\ No newline at end of file
diff --git a/src/main/resources/tinynet_def.json b/src/main/resources/tinynet_def.json
index 6723860..c54e1cd 100644
--- a/src/main/resources/tinynet_def.json
+++ b/src/main/resources/tinynet_def.json
@@ -73,7 +73,7 @@
"initBias" : 1.0,
"dropoutRate" : 1.0,
"activation" : "SoftMaxFunction",
- "useGpu" : false
+ "useGpu" : true
}
} ]
}
\ No newline at end of file