-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkernel.cu
executable file
·234 lines (200 loc) · 8.25 KB
/
kernel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "ImageUtil.h"
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <iomanip>
//smooth out the potential to avoid divide by zero errors
#define EPS2 0.00001
//GPU parameters
const int nThreads=1024;
/*Number of particles to skip in input. nSkip=1
gives 100,000 particles. nSkip=10 gives 10,000
particles. Overall 100,000/nSkip total particles. */
const int nSkip=10;
//Produce only nFrames worth of animation. 10 seconds at 30fps is 300 frames, which is a good number for homeworks.
const int nFrames=30;
// GPU Gems style HANDLE_ERROR function
static void HandleError(cudaError_t err,
const char *file,
int line) {
if (err != cudaSuccess) {
std::cout << cudaGetErrorString(err) << " in " << file << " at line " << line << std::endl;
exit(EXIT_FAILURE);
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
//Some string manipulation functions for saving files. pad_int(1234,5) returns "01234".
std::string pad_int(int arg, int padcount) {
std::stringstream ss;
ss << std::setfill('0') << std::setw(padcount) << arg;
return ss.str();
}
//Returns a file name in the form of "prefix00###suffix". For example "image0032.bmp"
std::string getFilename(std::string prefix, int num, int padcount, std::string suffix) {
return prefix + pad_int(num, padcount) + suffix;
}
//Hybrid of GPU Gems 3 ch. 31 and CUDA nbody example.
//20 FLOPs
__device__ float3 bodyBodyInteraction(float4 bi, float4 bj, float3 ai) {
float3 r;
// r_ij [3 FLOPs]
r.x = bj.x - bi.x;
r.y = bj.y - bi.y;
r.z = bj.z - bi.z;
// distSqr = dot(r_ij, r_ij) + EPS^2 [6 FLOPs]
float distSqr = r.x * r.x + r.y * r.y + r.z * r.z + EPS2;
// invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
float distSixth = distSqr * distSqr * distSqr;
float invDistCube = 1.0f / sqrtf(distSixth);
// s = m_j * invDistCube [1 FLOP]
float s = bj.w * invDistCube;
// a_i = a_i + s * r_ij [6 FLOPs]
ai.x += r.x * s;
ai.y += r.y * s;
ai.z += r.z * s;
return ai;
}
/* Hybrid of GPU Gems 3 ch. 31 and samples/5_Simulations/nbody/bodyststemcuda.cu */
__device__ float4 calculate_acceleration(float4 *devX, int numParticles, int deviceOffset) {
__shared__ float4 shPosition[nThreads];
float4 myPosition;
int tile;
float3 acc = { 0.0f, 0.0f, 0.0f };
int gtid = deviceOffset + blockIdx.x * blockDim.x + threadIdx.x;
myPosition = devX[gtid];
for (tile = 0; tile<numParticles / blockDim.x; tile++) {
shPosition[threadIdx.x] = devX[tile * blockDim.x + threadIdx.x];
__syncthreads();
#pragma unroll 64
for (unsigned int j = 0; j < blockDim.x; j++) {
acc = bodyBodyInteraction(myPosition, shPosition[j], acc);
}
__syncthreads();
}
float4 acc4 = { acc.x, acc.y, acc.z, 0.0f };
return acc4;
}
__global__ void kernel_step(float4 *devX, float4 *devV, float dt, int numParticles, int deviceOffset) {
int index = deviceOffset + blockIdx.x * blockDim.x + threadIdx.x;
//6 FLOPs
devX[index].x += devV[index].x*dt;
devX[index].y += devV[index].y*dt;
devX[index].z += devV[index].z*dt;
float4 acc = calculate_acceleration(devX, numParticles, deviceOffset);
//6 FLOPs.
devV[index].x += acc.x*dt;
devV[index].y += acc.y*dt;
devV[index].z += acc.z*dt;
}
int main()
{
int nParticles;
//Particle loading code!
std::vector<float4> particlepos;
std::vector<float4> particlevel;
//from http://stackoverflow.com/a/8421315/1030718
std::ifstream source; // build a read-Stream
source.open("galaxy1.txt", std::ios_base::in); // open data
int ctr=-1;
for (std::string line; std::getline(source, line); ) //read stream line by line
{
ctr++;
std::istringstream in(line); //make a stream for the line itself
if(ctr%nSkip!=0)
continue;
float m;
float x;
float y;
float z;
float vx;
float vy;
float vz;
in >> m >> x >> y >> z >> vx >> vy >> vz;
//multiplying my nSkip makes sure that if we only have one particle for every 10 particles in the data file, it has 10 times the mass.
particlepos.push_back(make_float4(x,y,z,m*nSkip ));
particlevel.push_back(make_float4( vx,vy,vz,0.0 ));
}
nParticles = (int)particlepos.size();
//ensure nParticles is a multiple of nThreads*deviceCount
int nParticlesNew = nThreads*int(float(nParticles) / (nThreads));
if (nParticles != nParticlesNew) {
std::cout << "WARNING in GPU_Phy::initialize. Position list argument size is not a multiple of nThreads*deviceCount!" << std::endl;
std::cout << "Continuing by chopping to nearest multiple." << std::endl;
nParticles = nParticlesNew; //this rounds down. nParticles<=positions.size().
}
float4 *host_x = new float4[nParticles];
float4 *host_v = new float4[nParticles];
for (int i = 0; i < nParticles; i++) {
host_x[i] = particlepos[i];
host_v[i] = particlevel[i];
}
int nBlocks = nParticles / (nThreads);
HANDLE_ERROR(cudaSetDevice(0));
//initialize device memory
float4 *dev_x;
float4 *dev_v;
//Allocate the memory on the device
HANDLE_ERROR(cudaMalloc((void**)(&dev_x), nParticles*sizeof(float4)));
HANDLE_ERROR(cudaMalloc((void**)(&dev_v), nParticles*sizeof(float4)));
//Copy the memory to the device
HANDLE_ERROR(cudaMemcpy(dev_x, host_x, nParticles*sizeof(float4), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(dev_v, host_v, nParticles*sizeof(float4), cudaMemcpyHostToDevice));
//timestep and save pictures
float dt = 0.01;
for (int i = 0; i < nFrames; i++) {
std::cout << i << std::endl;
//Do the computation
for (int k = 0; k < 15; k++){
kernel_step <<<nBlocks, nThreads >>>(dev_x, dev_v, dt, nParticles, 0);
cudaDeviceSynchronize();
}
//wait for the computation to finish
cudaDeviceSynchronize();
//copy the memory to the computer
HANDLE_ERROR(cudaMemcpy(host_x, dev_x, nParticles*sizeof(float4), cudaMemcpyDeviceToHost));
//construct an image.
//First fill up two arrays of scalars representing the particle density.
//One for the dark matter (which will be reddish), and one for regular matter (bluish).
DoubleImage regularMass(400, 400);
DoubleImage darkMatter(400, 400);
Image pic(400,400);
for (int j = 0; j < nParticles; j++) {
int x = int(host_x[j].x * 20) + 400 / 2;
int y = int(host_x[j].y * 20) + 400 / 2;
if(j<50000/nSkip){
regularMass.increase(x, y, 1);
regularMass.increase(x + 1, y, 0.5);
regularMass.increase(x - 1, y, 0.5);
regularMass.increase(x, y + 1, 0.5);
regularMass.increase(x, y - 1, 0.5);
} else {
darkMatter.increase(x, y, 1);
darkMatter.increase(x + 1, y, 0.5);
darkMatter.increase(x - 1, y, 0.5);
darkMatter.increase(x, y + 1, 0.5);
darkMatter.increase(x, y - 1, 0.5);
}
}
//Combine the two scalar arrays into an array of rgb values.
for (int x = 0; x < 400; x++) {
for (int y = 0; y < 400; y++) {
//Human eye sees brightness logarithmically, so take a log.
double scalar1 = log(1+3*regularMass.get(x, y))/4.0 ;
double scalar2 = log(1 + darkMatter.get(x, y)) / 3.0;
//combine everything into a nice color.
pic.put(x, y, floatToRGB(scalar2*1.2+scalar1, 0+scalar1*1.5, scalar1*1.7));
}
}
//actually save the image
pic.save(getFilename("out/image", i, 3, ".bmp"));
}
delete [] host_x;
delete [] host_v;
HANDLE_ERROR(cudaFree(dev_x));
HANDLE_ERROR(cudaFree(dev_v));
cudaDeviceReset();
return 0;
}