-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhw08.cu
172 lines (133 loc) · 4.28 KB
/
hw08.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
/*
Barker Homework 8
Finding the problem with the GPU dot product
To compile: nvcc hw08.cu -O3 -o dotProductRobustNot -lcudart
To run: ./dotProductRobustNot lengthofvector sizeofblock
  lengthofvector = number of elements in each vector (N)
  sizeofblock    = number of threads per block
*/
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
// Launch limits assumed when this program was written (they depend on the device -- TODO confirm for your GPU):
// max number of blocks 65535
// max number of threads per block 1024
// max number of threads 67107840 (= 65535 * 1024)
//#define THREADSPERBLOCK 1024
int THREADSPERBLOCK; //Number of threads per block. Loaded from the second command-line argument in main.
int N; //Global that holds the length of the vectors. It is loaded from the first command-line argument in main.
double *A_CPU, *B_CPU, *C_CPU; //CPU (host) pointers: the two input vectors and the buffer that receives the GPU results
double *A_GPU, *B_GPU, *C_GPU; //GPU (device) pointers: the two input vectors and the buffer the kernel writes products/partial sums into
dim3 dimBlock; //This variable holds the dimensions of a block (filled in by SetUpCudaDevices)
dim3 dimGrid; //This variable holds the dimensions of the grid (filled in by SetUpCudaDevices)
//Select the block and grid architecture for the threads on the GPU.
//Reads the globals THREADSPERBLOCK and N and fills in the globals dimBlock and dimGrid
//with a 1-D launch configuration: THREADSPERBLOCK threads per block and
//ceil(N/THREADSPERBLOCK) blocks.
//Exits with an error message if THREADSPERBLOCK is not positive, because the block count
//below would otherwise divide by zero (or use a wrapped-around unsigned block size).
void SetUpCudaDevices()
{
    if(THREADSPERBLOCK <= 0)
    {
        fprintf(stderr, "SetUpCudaDevices: block size must be positive (got %d)\n", THREADSPERBLOCK);
        exit(1);
    }

    //Threads in a block
    dimBlock.x = (unsigned int)THREADSPERBLOCK;
    dimBlock.y = 1;
    dimBlock.z = 1;

    //Blocks in a grid
    if(N > 0)
    {
        //Ceiling division: enough blocks to cover the whole vector. Done entirely in unsigned
        //arithmetic on a value known to be >= 1, so N - 1 cannot wrap around.
        //The launch itself still fails if this exceeds the device's maximum grid size.
        dimGrid.x = ((unsigned int)N - 1u)/dimBlock.x + 1u;
    }
    else
    {
        //Nothing to process: one block whose threads all fail the kernel's bounds check
        //keeps the launch configuration valid instead of wrapping to a huge block count.
        dimGrid.x = 1;
    }
    dimGrid.y = 1;
    dimGrid.z = 1;
}
//Allocates the three device vectors (A_GPU, B_GPU, C_GPU) and the three host vectors
//(A_CPU, B_CPU, C_CPU), each large enough for N doubles.
//Exits with an error message if N is negative or if any allocation fails, so later code
//never dereferences a null or invalid pointer.
void AllocateMemory()
{
    if(N < 0)
    {
        fprintf(stderr, "AllocateMemory: vector length must not be negative (got %d)\n", N);
        exit(1);
    }

    //Computed once in size_t so the byte count cannot overflow int.
    const size_t bytes = (size_t)N*sizeof(double);

    double **deviceVectors[3] = {&A_GPU, &B_GPU, &C_GPU};
    double **hostVectors[3] = {&A_CPU, &B_CPU, &C_CPU};
    const char *names[3] = {"A", "B", "C"};

    //Allocate Device (GPU) Memory. cudaMalloc stores the device address through the pointer it is given.
    for(int v = 0; v < 3; v++)
    {
        cudaError_t err = cudaMalloc(deviceVectors[v], bytes);
        if(err != cudaSuccess)
        {
            fprintf(stderr, "AllocateMemory: cudaMalloc of vector %s (%zu bytes) failed: %s\n",
                    names[v], bytes, cudaGetErrorString(err));
            exit(1);
        }
    }

    //Allocate Host (CPU) Memory. The (double*) cast is required because malloc returns void*.
    for(int v = 0; v < 3; v++)
    {
        *hostVectors[v] = (double*)malloc(bytes);
        //malloc may legitimately return NULL for a zero-byte request, so only treat NULL
        //as a failure when memory was actually requested.
        if(*hostVectors[v] == NULL && bytes > 0)
        {
            fprintf(stderr, "AllocateMemory: malloc of vector %s (%zu bytes) failed\n", names[v], bytes);
            exit(1);
        }
    }
}
//Fills both host input vectors (A_CPU and B_CPU) with 1.0 in every one of their N entries.
//With these values the expected dot product is simply N.
void Innitialize()
{
    const double one = (double)1;
    int idx = 0;
    while(idx < N)
    {
        A_CPU[idx] = one;
        B_CPU[idx] = one;
        ++idx;
    }
}
//Releases the memory used by the program once the computation is finished.
//The first three arguments are host buffers released with free(); the last three are
//device buffers released with cudaFree(). Note that the parameter names shadow the
//globals of the same names, so only the pointers passed in are freed.
void CleanUp(double *A_CPU,double *B_CPU,double *C_CPU,double *A_GPU,double *B_GPU,double *C_GPU)
{
    double *hostBuffers[3] = {A_CPU, B_CPU, C_CPU};
    double *deviceBuffers[3] = {A_GPU, B_GPU, C_GPU};

    //Host memory first, in the order A, B, C.
    for(int k = 0; k < 3; k++)
    {
        free(hostBuffers[k]);
    }

    //Then device memory, in the same order.
    for(int k = 0; k < 3; k++)
    {
        cudaFree(deviceBuffers[k]);
    }
}
//This is the kernel. It is the function that will run on the GPU.
//Despite its name it computes the per-block pieces of a dot product:
//  1. each thread with a valid index writes C[id] = A[id]*B[id];
//  2. each block then sums its own blockDim.x-wide slice of C in place by repeated halving,
//     leaving the block's partial sum in the slice's first element, C[blockDim.x*blockIdx.x].
//The caller is expected to add up those first elements to obtain the final result.
//Expects a 1-D grid of 1-D blocks; n is the number of valid elements in A, B and C.
//Block sizes that are not powers of two are handled by letting thread 0 absorb the odd element.
__global__ void Addition(double *A, double *B, double *C, int n)
{
    const int lane = threadIdx.x;
    const int id = blockDim.x*blockIdx.x + threadIdx.x;

    //Element-wise products (threads past the end of the vector do nothing here).
    if(id < n)
    {
        C[id] = A[id] * B[id];
    }
    //Every thread of the block reaches this barrier, valid index or not.
    __syncthreads();

    //width = number of slice elements still holding live partial sums.
    for(int width = blockDim.x; width > 1; width /= 2)
    {
        //An odd width cannot be halved evenly: thread 0 first folds the last live element
        //into its own, which leaves an even number of elements to pair up.
        if(width%2 != 0 && lane == 0)
        {
            const int tail = id + (width - 1);
            if(tail < n)
            {
                C[id] += C[tail];
            }
        }

        //Integer division gives the same half for width and width - 1 when width is odd,
        //so every thread agrees on it and the loop (and its barrier) stays uniform.
        const int half = width/2;

        //Fold the upper half onto the lower half; the bounds check skips partners that lie
        //beyond the end of the vector (only possible in the last block).
        if(lane < half)
        {
            const int partner = id + half;
            if(partner < n)
            {
                C[id] += C[partner];
            }
        }
        __syncthreads();
    }
}
//Reports a failed CUDA call. Prints the error in the program's usual format on stdout and
//the step that failed on stderr. Returns 1 if err is a failure, 0 if it is cudaSuccess.
static int CudaFailed(cudaError_t err, const char *step)
{
    if(err == cudaSuccess)
    {
        return 0;
    }
    printf("\n CUDA error = %s\n", cudaGetErrorString(err));
    fprintf(stderr, "Failed step: %s\n", step);
    return 1;
}

//Program entry point.
//Usage: program lengthofvector sizeofblock
//Computes the dot product of two length-N vectors on the GPU, prints the elapsed time for
//the copies plus the kernel, and prints the result.
//Returns 0 on success and 1 on bad arguments or any CUDA failure.
int main(int argc, char** argv)
{
    //Both arguments are required; without this check argv[1]/argv[2] may be NULL.
    if(argc < 3)
    {
        fprintf(stderr, "Usage: %s lengthofvector sizeofblock\n", (argc > 0) ? argv[0] : "dotProduct");
        return(1);
    }

    N = atoi(argv[1]); //Reading the length of the vectors from the command line
    THREADSPERBLOCK = atoi(argv[2]); //Reading the size of the blocks

    //atoi returns 0 for non-numeric text, so this also rejects garbage input.
    //A zero block size would otherwise cause a division by zero when sizing the grid.
    if(N <= 0 || THREADSPERBLOCK <= 0)
    {
        fprintf(stderr, "lengthofvector and sizeofblock must both be positive integers (got %d and %d)\n",
                N, THREADSPERBLOCK);
        return(1);
    }

    struct timeval start, end;
    const size_t bytes = (size_t)N*sizeof(double);

    //Set the thread structure that you will be using on the GPU
    SetUpCudaDevices();

    //Partitioning off the memory that you will be using.
    AllocateMemory();

    //Loading up values to be added.
    Innitialize();

    //Starting the timer
    gettimeofday(&start, NULL);

    //Copy Memory from CPU to GPU
    int failed = CudaFailed(cudaMemcpyAsync(A_GPU, A_CPU, bytes, cudaMemcpyHostToDevice), "copying A to the GPU")
              || CudaFailed(cudaMemcpyAsync(B_GPU, B_CPU, bytes, cudaMemcpyHostToDevice), "copying B to the GPU");

    if(!failed)
    {
        //Calling the Kernel (GPU) function.
        Addition<<<dimGrid, dimBlock>>>(A_GPU, B_GPU, C_GPU, N);

        //Checking to see if the Kernel launch had any problems (e.g. an invalid block or grid size).
        failed = CudaFailed(cudaGetLastError(), "launching the kernel");
    }

    if(!failed)
    {
        //Copy Memory from GPU to CPU
        failed = CudaFailed(cudaMemcpyAsync(C_CPU, C_GPU, bytes, cudaMemcpyDeviceToHost), "copying C back to the CPU");
    }

    if(!failed)
    {
        //The copies and the kernel were issued asynchronously. Wait for all of them so the
        //timer covers the real work, C_CPU is safe to read, and faults that happened while
        //the kernel was running are reported instead of being silently ignored.
        failed = CudaFailed(cudaDeviceSynchronize(), "waiting for the GPU to finish");
    }

    //Stopping the timer
    gettimeofday(&end, NULL);

    if(failed)
    {
        //Release everything before bailing out.
        CleanUp(A_CPU,B_CPU,C_CPU,A_GPU,B_GPU,C_GPU);
        return(1);
    }

    //Calculating the total time used and converting it to milliseconds.
    //Done in double so long runs do not lose precision.
    double elapsedMs = (double)(end.tv_sec - start.tv_sec)*1000.0
                     + (double)(end.tv_usec - start.tv_usec)/1000.0;

    //Displaying the time
    printf("GPU Time in milliseconds= %.15f\n", elapsedMs);

    //Add up the remaining info to get the final dot product: the kernel leaves one partial
    //sum at the first element of every block-sized slice of C.
    //A size_t index cannot overflow when stepping past the end of a very long vector.
    double dot = 0.0;
    for(size_t i = 0; i < (size_t)N; i += dimBlock.x)
    {
        dot = dot + C_CPU[i];
    }

    //Displaying the dot product.
    printf("\n *** N = %d DotProduct = %.15f ***\n",N, dot);

    //You're done so cleanup your mess.
    CleanUp(A_CPU,B_CPU,C_CPU,A_GPU,B_GPU,C_GPU);

    return(0);
}