-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaddvector_gpu.cu
136 lines (105 loc) · 4.81 KB
/
addvector_gpu.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#define N 4194304
/*
 * CHECK(call)
 *
 * Evaluates `call` (an expression yielding a cudaError_t) exactly once.
 * If the result is not cudaSuccess, prints the source file, line, numeric
 * error code and the cudaGetErrorString() text to stderr, then terminates
 * the process with EXIT_FAILURE.
 *
 * The body is wrapped in do { ... } while (0) so that `CHECK(x);` behaves as
 * a single statement, including inside an unbraced if/else. The argument is
 * parenthesised so any expression can be passed safely.
 */
#define CHECK(call)                                                          \
    do                                                                       \
    {                                                                        \
        const cudaError_t checkError_ = (call);                              \
        if (checkError_ != cudaSuccess)                                      \
        {                                                                    \
            fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);           \
            fprintf(stderr, "code: %d, reason: %s\n",                        \
                    static_cast<int>(checkError_),                           \
                    cudaGetErrorString(checkError_));                        \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)
// includes CUDA
#include <cuda_runtime.h>
#include <helper_functions.h> // helper functions for SDK examples
// Host-side vector addition routine with C linkage. Only the declaration is
// visible here; the definition lives outside this part of the file.
// addVec() calls it with host pointers to time the CPU path.
extern "C" void addVectorCpu(int numberElements, float *firstArray, float *secondArray, float *resultArray);
// GPU kernel (__global__) with C linkage; declaration only, defined elsewhere.
// addVec() launches it with a 1D grid of 1D blocks and passes device pointers.
// NOTE(review): presumably the kernel bounds-checks its index against
// numberElements, since the grid is rounded up to whole blocks - confirm in
// the kernel definition.
extern "C" __global__ void addVectorGpu(int numberElements, float *firstArray, float *secondArray, float *resultArray);
/**
 * Adds two float vectors element-wise on the host or on the GPU and prints
 * the measured processing time.
 *
 * @param numberElements Number of elements in each of the three arrays.
 * @param firstArray     Host pointer to the first input vector.
 * @param secondArray    Host pointer to the second input vector.
 * @param resultArray    Host pointer receiving the result.
 * @param useDevice      false (default): run addVectorCpu on the host arrays.
 *                       true: print the properties of device 0, copy the
 *                       inputs to device memory, launch addVectorGpu, and
 *                       copy the result back into resultArray.
 *
 * On the GPU path only the kernel launch plus the following synchronisation
 * is timed; allocation and host<->device copies are excluded. Any failing
 * CUDA runtime call terminates the process via CHECK. When useDevice is true
 * and numberElements <= 0, nothing is launched and resultArray is untouched.
 */
void addVec(int numberElements, float *firstArray, float *secondArray, float *resultArray, bool useDevice = false)
{
    StopWatchInterface *timer = 0;
    if (useDevice == false)
    {
        sdkCreateTimer(&timer);
        sdkStartTimer(&timer);
        addVectorCpu(numberElements, firstArray, secondArray, resultArray);
        sdkStopTimer(&timer);
        printf("Processing time on CPU: %f (ms)\n", sdkGetTimerValue(&timer));
    }
    else
    {
        cudaDeviceProp devProv;
        CHECK(cudaGetDeviceProperties(&devProv, 0));
        printf("**********GPU info**********\n");
        printf("Name: %s\n", devProv.name);
        printf("Compute capability: %d.%d\n", devProv.major, devProv.minor);
        printf("GMEM: %zu byte\n", devProv.totalGlobalMem);
        printf("****************************\n");

        // An empty (or negative) element count would otherwise produce a
        // wrapped-around byte count and grid size. No timer exists yet, so
        // there is nothing to release before returning.
        if (numberElements <= 0)
        {
            return;
        }

        // Host allocates memories on device
        float *d_firstArray = 0, *d_secondArray = 0, *d_resultArray = 0;
        const size_t nBytes = static_cast<size_t>(numberElements) * sizeof(float);
        CHECK(cudaMalloc(&d_firstArray, nBytes));
        CHECK(cudaMalloc(&d_secondArray, nBytes));
        CHECK(cudaMalloc(&d_resultArray, nBytes));

        // Host copies data to device memories
        CHECK(cudaMemcpy(d_firstArray, firstArray, nBytes, cudaMemcpyHostToDevice));
        CHECK(cudaMemcpy(d_secondArray, secondArray, nBytes, cudaMemcpyHostToDevice));

        // Host invokes kernel function to add vectors on device.
        // 1D launch: 512 threads per block, enough blocks to cover every
        // element (ceiling division done entirely in unsigned arithmetic).
        const unsigned int elementCount = static_cast<unsigned int>(numberElements);
        dim3 blockSize(512);
        dim3 gridSize((elementCount - 1u) / blockSize.x + 1u);

        sdkCreateTimer(&timer);
        sdkStartTimer(&timer);
        addVectorGpu<<<gridSize, blockSize>>>(numberElements, d_firstArray, d_secondArray, d_resultArray);
        // A kernel launch returns no status itself: launch-configuration
        // errors are reported by cudaGetLastError(), execution errors by the
        // next synchronising call.
        CHECK(cudaGetLastError());
        CHECK(cudaDeviceSynchronize());
        sdkStopTimer(&timer);
        printf("Processing time on GPU: %f (ms)\n", sdkGetTimerValue(&timer));

        // Host copies result from device memory
        CHECK(cudaMemcpy(resultArray, d_resultArray, nBytes, cudaMemcpyDeviceToHost));

        // Free device memories
        CHECK(cudaFree(d_firstArray));
        CHECK(cudaFree(d_secondArray));
        CHECK(cudaFree(d_resultArray));
    }
    sdkDeleteTimer(&timer);
}
/**
 * Program entry point: fills two N-element host vectors with pseudo-random
 * values in [0, 1], adds them once on the host and once on the GPU through
 * addVec(), and compares the two results element by element.
 *
 * @return 0 when every element matches ("CORRECT."), 1 when a mismatch is
 *         found ("INCORRECT."), EXIT_FAILURE when a host allocation fails.
 */
int main(int argc, char **argv)
{
    // Command-line arguments are not used.
    (void)argc;
    (void)argv;

    const size_t nBytes = N * sizeof(float);

    // Host input vectors
    float *firstArray = static_cast<float *>(malloc(nBytes));
    float *secondArray = static_cast<float *>(malloc(nBytes));
    // Host output vectors: device result and host-computed reference
    float *resultArray = static_cast<float *>(malloc(nBytes));
    float *correctResultArray = static_cast<float *>(malloc(nBytes));

    if (firstArray == NULL || secondArray == NULL || resultArray == NULL || correctResultArray == NULL)
    {
        fprintf(stderr, "Error: failed to allocate host memory\n");
        // free(NULL) is a no-op, so releasing all four is safe.
        free(firstArray);
        free(secondArray);
        free(resultArray);
        free(correctResultArray);
        return EXIT_FAILURE;
    }

    // Fill the inputs; rand() is not seeded, so the data is the same each run.
    for (int i = 0; i < N; ++i)
    {
        firstArray[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
        secondArray[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
    }

    // Add vectors (on host) to produce the reference result
    addVec(N, firstArray, secondArray, correctResultArray);
    // Add the same vectors on device
    addVec(N, firstArray, secondArray, resultArray, true);

    // Check correctness: stop at the first element that differs
    // (exact float comparison, as in the original check).
    bool isCorrect = true;
    for (int i = 0; i < N; ++i)
    {
        if (resultArray[i] != correctResultArray[i])
        {
            isCorrect = false;
            break;
        }
    }

    if (isCorrect)
    {
        printf("CORRECT.\n");
    }
    else
    {
        printf("INCORRECT.\n");
    }

    // Release host buffers on every path
    free(firstArray);
    free(secondArray);
    free(resultArray);
    free(correctResultArray);

    return isCorrect ? 0 : 1;
}