|
| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one |
| 3 | + * or more contributor license agreements. See the NOTICE file |
| 4 | + * distributed with this work for additional information |
| 5 | + * regarding copyright ownership. The ASF licenses this file |
| 6 | + * to you under the Apache License, Version 2.0 (the |
| 7 | + * "License"); you may not use this file except in compliance |
| 8 | + * with the License. You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, |
| 13 | + * software distributed under the License is distributed on an |
| 14 | + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 15 | + * KIND, either express or implied. See the License for the |
| 16 | + * specific language governing permissions and limitations |
| 17 | + * under the License. |
| 18 | + */ |
| 19 | + |
| 20 | +import jcuda.*; |
| 21 | +import jcuda.driver.*; |
| 22 | + |
| 23 | +import java.io.BufferedReader; |
| 24 | +import java.io.File; |
| 25 | +import java.io.FileWriter; |
| 26 | +import java.io.InputStreamReader; |
| 27 | +import java.util.ArrayList; |
| 28 | +import java.util.List; |
| 29 | +import java.util.Random; |
| 30 | + |
| 31 | +import static java.nio.file.Files.readAllBytes; |
| 32 | +import static jcuda.driver.JCudaDriver.*; |
| 33 | + |
| 34 | +public class PhiloxRuntimeCompilationExample implements AutoCloseable { |
| 35 | + private static String philox4x64KernelSource = "#include <cuda_runtime.h>\n" + |
| 36 | + "#include <Random123/philox.h>\n" + |
| 37 | + "extern \"C\" __global__ void philox_4_64(ulong* output, uint64_t startingCounter, uint64_t seed, size_t numElements) {\n" |
| 38 | + + |
| 39 | + " uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n" + |
| 40 | + " if (idx * 4 < numElements) {\n" + |
| 41 | + " r123::Philox4x64 rng;\n" + |
| 42 | + " r123::Philox4x64::ctr_type ctr = {{startingCounter + idx, 0, 0, 0}};\n" + |
| 43 | + " r123::Philox4x64::key_type key = {{seed}};\n" + |
| 44 | + " r123::Philox4x64::ctr_type result = rng(ctr, key);\n" + |
| 45 | + " for (int i = 0; i < 4; ++i) {\n" + |
| 46 | + " size_t outputIdx = idx * 4 + i;\n" + |
| 47 | + " if (outputIdx < numElements) {\n" + |
| 48 | + " output[outputIdx] = result[i];\n" + |
| 49 | + " }\n" + |
| 50 | + " }\n" + |
| 51 | + " }\n" + |
| 52 | + "}\n"; |
| 53 | + |
| 54 | + private final CUcontext context; |
| 55 | + private final CUmodule module; |
| 56 | + private final CUfunction function; |
| 57 | + private final int blockSize; |
| 58 | + |
| 59 | + public PhiloxRuntimeCompilationExample() { |
| 60 | + JCudaDriver.setExceptionsEnabled(true); |
| 61 | + // Initialize CUDA |
| 62 | + cuInit(0); |
| 63 | + CUdevice device = new CUdevice(); |
| 64 | + cuDeviceGet(device, 0); |
| 65 | + context = new CUcontext(); |
| 66 | + int result = cuCtxCreate(context, 0, device); |
| 67 | + if (result != CUresult.CUDA_SUCCESS) { |
| 68 | + throw new RuntimeException( |
| 69 | + "Kontext-Erstellung fehlgeschlagen: " + result + ", " + CUresult.stringFor(result)); |
| 70 | + } |
| 71 | + |
| 72 | + // Compile to PTX |
| 73 | + String ptx = compileToTPX(philox4x64KernelSource); |
| 74 | + |
| 75 | + // Load the PTX |
| 76 | + module = new CUmodule(); |
| 77 | + cuModuleLoadData(module, ptx); |
| 78 | + function = new CUfunction(); |
| 79 | + cuModuleGetFunction(function, module, "philox_4_64"); |
| 80 | + |
| 81 | + // Set block size based on device capabilities |
| 82 | + blockSize = 64; // Can be adjusted based on device properties |
| 83 | + } |
| 84 | + |
| 85 | + private String compileToTPX(String source) { |
| 86 | + try { |
| 87 | + // Temporäre Dateien erstellen |
| 88 | + File sourceFile = File.createTempFile("philox_kernel", ".cu"); |
| 89 | + File outputFile = File.createTempFile("philox_kernel", ".ptx"); |
| 90 | + |
| 91 | + // CUDA-Quellcode in temporäre Datei schreiben |
| 92 | + try (FileWriter writer = new FileWriter(sourceFile)) { |
| 93 | + writer.write(philox4x64KernelSource); |
| 94 | + } |
| 95 | + |
| 96 | + // nvcc Kommando zusammenbauen |
| 97 | + List<String> command = new ArrayList<>(); |
| 98 | + command.add("/usr/local/cuda/bin/nvcc"); |
| 99 | + command.add("-ccbin"); |
| 100 | + command.add("gcc-8"); |
| 101 | + command.add("--ptx"); // PTX-Output generieren |
| 102 | + command.add("-o"); |
| 103 | + command.add(outputFile.getAbsolutePath()); |
| 104 | + command.add("-I"); |
| 105 | + command.add("./lib/random123/include"); |
| 106 | + command.add(sourceFile.getAbsolutePath()); |
| 107 | + |
| 108 | + // Prozess erstellen und ausführen |
| 109 | + ProcessBuilder pb = new ProcessBuilder(command); |
| 110 | + pb.redirectErrorStream(true); |
| 111 | + Process process = pb.start(); |
| 112 | + |
| 113 | + // Output des Kompilers lesen |
| 114 | + try (BufferedReader reader = new BufferedReader( |
| 115 | + new InputStreamReader(process.getInputStream()))) { |
| 116 | + String line; |
| 117 | + StringBuilder output = new StringBuilder(); |
| 118 | + while ((line = reader.readLine()) != null) { |
| 119 | + output.append(line).append("\n"); |
| 120 | + } |
| 121 | + System.out.println("Compiler Output: " + output.toString()); |
| 122 | + } |
| 123 | + |
| 124 | + // Auf Prozessende warten |
| 125 | + int exitCode = process.waitFor(); |
| 126 | + if (exitCode != 0) { |
| 127 | + throw new RuntimeException("nvcc Kompilierung fehlgeschlagen mit Exit-Code: " + exitCode); |
| 128 | + } |
| 129 | + |
| 130 | + // PTX-Datei einlesen |
| 131 | + String ptxCode = new String(readAllBytes(outputFile.toPath())); |
| 132 | + |
| 133 | + // Aufräumen |
| 134 | + sourceFile.delete(); |
| 135 | + outputFile.delete(); |
| 136 | + |
| 137 | + return ptxCode; |
| 138 | + |
| 139 | + } catch (Exception e) { |
| 140 | + throw new RuntimeException("Fehler bei der CUDA-Kompilierung: " + e.getMessage(), e); |
| 141 | + } |
| 142 | + } |
| 143 | + |
| 144 | + /** |
| 145 | + * Generates random numbers using the Philox4x64 algorithm |
| 146 | + * |
| 147 | + * @param startingCounter Initial counter value |
| 148 | + * @param seed Random seed |
| 149 | + * @param numElements Number of random numbers to generate |
| 150 | + * @return Array of random numbers |
| 151 | + */ |
| 152 | + public CUdeviceptr Philox4x64(long startingCounter, long seed, int numElements) { |
| 153 | + // Allocate host memory for results |
| 154 | + // long[] hostOutput = new long[numElements]; |
| 155 | + |
| 156 | + // Allocate device memory |
| 157 | + CUdeviceptr deviceOutput = new CUdeviceptr(); |
| 158 | + cuMemAlloc(deviceOutput, (long) numElements * Sizeof.LONG); |
| 159 | + |
| 160 | + try { |
| 161 | + // Set up kernel parameters mit Debugging |
| 162 | + System.out.printf("numElements: %d, seed: %d, startingCounter: %d%n", |
| 163 | + numElements, seed, startingCounter); |
| 164 | + |
| 165 | + Pointer kernelParams = Pointer.to( |
| 166 | + Pointer.to(deviceOutput), |
| 167 | + Pointer.to(new long[] { startingCounter }), |
| 168 | + Pointer.to(new long[] { seed }), |
| 169 | + Pointer.to(new long[] { numElements })); |
| 170 | + |
| 171 | + // Calculate grid size |
| 172 | + int gridSize = (numElements + (blockSize * 4) - 1) / (blockSize * 4); |
| 173 | + |
| 174 | + // Launch kernel mit Fehlerprüfung |
| 175 | + int kernelResult = cuLaunchKernel(function, |
| 176 | + gridSize, 1, 1, // Grid dimension |
| 177 | + blockSize, 1, 1, // Block dimension |
| 178 | + 0, null, // Shared memory size and stream |
| 179 | + kernelParams, null // Kernel parameters and extra parameters |
| 180 | + ); |
| 181 | + if (kernelResult != CUresult.CUDA_SUCCESS) { |
| 182 | + throw new RuntimeException( |
| 183 | + "Kernel-Launch fehlgeschlagen: " + kernelResult + ", " + CUresult.stringFor(kernelResult)); |
| 184 | + } |
| 185 | + |
| 186 | + // Copy results back to host |
| 187 | + // cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput, (long) numElements * |
| 188 | + // Sizeof.LONG); |
| 189 | + } finally { |
| 190 | + // Free device memory |
| 191 | + // cuMemFree(deviceOutput); |
| 192 | + } |
| 193 | + |
| 194 | + // return hostOutput; |
| 195 | + return deviceOutput; |
| 196 | + } |
| 197 | + |
| 198 | + /** |
| 199 | + * Cleans up CUDA resources |
| 200 | + */ |
| 201 | + public void close() { |
| 202 | + cuModuleUnload(module); |
| 203 | + cuCtxDestroy(context); |
| 204 | + } |
| 205 | + |
| 206 | + // Example usage |
| 207 | + public static void main(String[] args) { |
| 208 | + try (PhiloxRuntimeCompilationExample generator = new PhiloxRuntimeCompilationExample()) { |
| 209 | + // Generate 1 million random numbers |
| 210 | + int numElements = 1_000_000; |
| 211 | + long seed = 0L; |
| 212 | + long startingCounter = 0L; |
| 213 | + |
| 214 | + CUdeviceptr randomNumbers = generator.Philox4x64(startingCounter, seed, numElements); |
| 215 | + |
| 216 | + long[] elements = new long[10]; |
| 217 | + cuMemcpyDtoH(Pointer.to(elements), randomNumbers, 10L * Sizeof.LONG); |
| 218 | + cuMemFree(randomNumbers); |
| 219 | + |
| 220 | + // Print first few numbers |
| 221 | + System.out.println("First 10 random numbers:"); |
| 222 | + for (int i = 0; i < 10; i++) { |
| 223 | + System.out.printf("%d: %x%n", i, elements[i]); |
| 224 | + } |
| 225 | + |
| 226 | + int size = 10_000_000; |
| 227 | + long start = System.currentTimeMillis(); |
| 228 | + CUdeviceptr ptr = generator.Philox4x64(0L, 0L, size); |
| 229 | + long end = System.currentTimeMillis(); |
| 230 | + System.out.println("philox4x64 speed test: " + (end - start) * 1000 + " microseconds"); |
| 231 | + cuMemFree(ptr); |
| 232 | + Random r = new Random(); |
| 233 | + long javaStart = System.currentTimeMillis(); |
| 234 | + for (int i = 0; i < size; i++) { |
| 235 | + r.nextLong(); |
| 236 | + } |
| 237 | + long javaEnd = System.currentTimeMillis(); |
| 238 | + System.out.println("java speed test: " + (javaEnd - javaStart) * 1000 + " microseconds"); |
| 239 | + System.out.println("philox4x64 is " + (double) (javaEnd - javaStart) / (double) (end - start) |
| 240 | + + " times faster than java"); |
| 241 | + |
| 242 | + } |
| 243 | + } |
| 244 | +} |
0 commit comments