-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhw07.cu
148 lines (117 loc) · 3.53 KB
/
hw07.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
/*
Mary Barker
Homework 7
Vector dot product on GPU with more blocks than allowed.
to compile: nvcc BarkerHW7.cu
*/
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
// Length of the two input vectors.
#define N 300000
// Smaller of x and y. Arguments and the whole expansion are parenthesised so the
// macro composes correctly inside larger expressions (e.g. 2*MIN(a,b), MIN(a,b)+1).
// Note: the selected argument is evaluated twice, so avoid side effects in x or y.
#define MIN(x,y) (((x)<(y))?(x):(y))
// Threads per block; also the size of the kernel's shared scratch array.
#define threadsPerBlock 1024
// Host (CPU) buffers: the two input vectors and the result buffer.
float *A_CPU;
float *B_CPU;
float *C_CPU;
// Device (GPU) counterparts of the buffers above.
float *A_GPU;
float *B_GPU;
float *C_GPU;
// Kernel launch configuration; both are assigned in AllocateMemory().
dim3 grid;
dim3 block;
#ifndef CUDA_CHECK
// Evaluates a CUDA runtime call and terminates with file/line and the CUDA
// error string if it did not return cudaSuccess.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t cudaCheckStatus_ = (call);                            \
        if (cudaCheckStatus_ != cudaSuccess) {                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(cudaCheckStatus_));                \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)
#endif

// Allocates the three device buffers and the three host buffers (N floats
// each), then sets the global launch configuration: `block` gets
// threadsPerBlock threads and `grid` gets ceil(N / block.x) blocks.
// Exits the program with a message if any allocation fails.
void AllocateMemory()
{
    const size_t bytes = N * sizeof(float);

    //Allocate Device (GPU) Memory
    CUDA_CHECK(cudaMalloc(&A_GPU, bytes));
    CUDA_CHECK(cudaMalloc(&B_GPU, bytes));
    // The kernel writes only one value per block into C; N slots is an upper
    // bound on the block count, so the buffer is sized like the inputs.
    CUDA_CHECK(cudaMalloc(&C_GPU, bytes));

    //Allocate Host (CPU) Memory
    A_CPU = (float*)malloc(bytes);
    B_CPU = (float*)malloc(bytes);
    C_CPU = (float*)malloc(bytes);
    if (A_CPU == NULL || B_CPU == NULL || C_CPU == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)bytes);
        exit(EXIT_FAILURE);
    }

    block = threadsPerBlock;
    // Ceiling division so a final, partially filled block covers the tail.
    grid = (N - 1) / block.x + 1;
    // dim3 members are unsigned, hence %u.
    printf("Grid dim = %u, block dim = %u, total = %u\n", grid.x, block.x, grid.x*block.x);
}
//Loads values into the vectors whose dot product we will compute.
void Innitialize()
{
    // Every component of both host vectors is set to 1, so the expected
    // dot product is simply N.
    const float one = 1.0f;
    for (int k = 0; k < N; ++k)
    {
        A_CPU[k] = one;
        B_CPU[k] = one;
    }
}
//Cleaning up memory after we are finished.
void CleanUp(float *A_CPU,float *B_CPU,float *C_CPU,float *A_GPU,float *B_GPU,float *C_GPU)
{
    // Note: the parameters shadow the file-level pointers of the same names;
    // only the pointers passed in are released.
    float *hostBuffers[3] = { A_CPU, B_CPU, C_CPU };
    float *deviceBuffers[3] = { A_GPU, B_GPU, C_GPU };

    // Host buffers go back through free() ...
    for (int k = 0; k < 3; ++k)
    {
        free(hostBuffers[k]);
    }
    // ... and device buffers through cudaFree().
    for (int k = 0; k < 3; ++k)
    {
        cudaFree(deviceBuffers[k]);
    }
}
//This is the kernel. It is the function that will run on the GPU.
//It multiplies vectors A and B element-wise, sums the products within each block,
//and stores each block's partial sum in C[blockIdx.x].
/*
 * Per-block partial dot product of A and B.
 *
 * Launch layout: 1-D grid of 1-D blocks. blockDim.x must not exceed
 * threadsPerBlock, the size of the static shared scratch array. Any grid
 * size works because each thread strides over the input by the total
 * number of launched threads.
 *
 *   A, B : device arrays holding at least n floats (read only).
 *   C    : device array holding at least gridDim.x floats; C[b] receives
 *          the partial sum produced by block b.
 *   n    : number of elements; for n <= 0 every block writes 0.
 *
 * The caller adds C[0 .. gridDim.x-1] to obtain the full dot product.
 */
__global__ void DotProduct(float *A, float *B, float *C, int n)
{
    __shared__ float partial[threadsPerBlock];

    const unsigned int tid = threadIdx.x;

    // 64-bit indices: with 32-bit int arithmetic, index + stride can wrap to a
    // negative value when n is close to INT_MAX and then pass the bound test.
    const long long stride = (long long)blockDim.x * gridDim.x;
    float sum = 0.0f;
    for (long long i = (long long)blockDim.x * blockIdx.x + tid; i < n; i += stride)
    {
        sum += A[i] * B[i];
    }
    // Threads that start past the end of the data store 0, so the reduction
    // below needs no bounds test against n.
    partial[tid] = sum;

    // 'Fold' the block's values in half repeatedly. When the active count is
    // odd, thread 0 also absorbs the element left over past the two halves.
    unsigned int active = blockDim.x;
    while (active > 1)
    {
        const unsigned int leftover = active & 1u;
        active >>= 1;
        // Reached by every thread of the block (the loop condition is uniform);
        // makes the previous round's writes visible before they are read.
        __syncthreads();
        if (tid < active)
        {
            partial[tid] += partial[tid + active];
            if (leftover && tid == 0) partial[0] += partial[2 * active];
        }
    }

    // Thread 0 produced partial[0] itself, so no further barrier is needed.
    if (tid == 0)
    {
        C[blockIdx.x] = partial[0];
    }
}
#ifndef CUDA_CHECK
// Evaluates a CUDA runtime call and terminates with file/line and the CUDA
// error string if it did not return cudaSuccess.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t cudaCheckStatus_ = (call);                            \
        if (cudaCheckStatus_ != cudaSuccess) {                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(cudaCheckStatus_));                \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)
#endif

// Allocates and fills the vectors, computes their dot product on the GPU
// (per-block partial sums, finished on the CPU), and prints the elapsed time
// for copy + kernel + final sum together with the result. Returns 0.
int main()
{
    timeval start, end;

    //Partitioning off the memory that you will be using.
    AllocateMemory();

    //Loading up the values of the two input vectors.
    Innitialize();

    //Starting the timer
    gettimeofday(&start, NULL);

    //Copy Memory from CPU to GPU
    CUDA_CHECK(cudaMemcpy(A_GPU, A_CPU, N*sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(B_GPU, B_CPU, N*sizeof(float), cudaMemcpyHostToDevice));

    //Calling the Kernel (GPU) function.
    DotProduct<<<grid, block>>>(A_GPU, B_GPU, C_GPU, N);
    // A launch does not return a status; bad launch configurations show up here.
    CUDA_CHECK(cudaGetLastError());

    //Copy Memory from GPU to CPU
    // The kernel writes exactly one partial sum per block, so only grid.x
    // floats are meaningful. This blocking copy also waits for the kernel and
    // reports any error raised while it ran.
    CUDA_CHECK(cudaMemcpy(C_CPU, C_GPU, grid.x*sizeof(float), cudaMemcpyDeviceToHost));

    // Finish the reduction on the host: fold every block's partial sum into C_CPU[0].
    for (unsigned int b = 1; b < grid.x; b++)
    {
        C_CPU[0] += C_CPU[b];
    }

    //Stopping the timer
    gettimeofday(&end, NULL);

    //Calculating the total time used and converting it to milliseconds.
    // Differences are taken before scaling so the seconds term stays small.
    float time = (end.tv_sec - start.tv_sec) * 1000000L + (end.tv_usec - start.tv_usec);

    //Displaying the time
    printf("Time in milliseconds= %.15f\n", (time/1000.0));

    // Debug aid for small N: print the inputs to make sure everything is set up correctly.
    //for(int i = 0; i < N; i++) printf("A[%d] = %.15f B[%d] = %.15f\n", i, A_CPU[i], i, B_CPU[i]);

    //Displaying the value of the dot product
    printf("Value is %f\n", C_CPU[0]);

    //You're done so cleanup your mess.
    CleanUp(A_CPU,B_CPU,C_CPU,A_GPU,B_GPU,C_GPU);

    return(0);
}