cuda_helper.h
#ifndef _LEGION_CNN_HELPER_H_
#define _LEGION_CNN_HELPER_H_
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <cuda_runtime.h>
#include <cudnn.h>
#include "legion.h"
// Print a message with file/line context to stderr, then abort.
#define FatalError(s) do {                                           \
  std::stringstream _message;                                        \
  _message << std::string(s) << "\n" << __FILE__ << ':' << __LINE__; \
  std::cerr << _message.str() << "\nAborting...\n";                  \
  exit(1);                                                           \
} while (0)
#ifndef DISABLE_COMPUTATION
// Abort on any cuDNN error; the expression is evaluated exactly once.
#define checkCUDNN(status) do {                                      \
  cudnnStatus_t _status = (status);                                  \
  if (_status != CUDNN_STATUS_SUCCESS) {                             \
    std::stringstream _error;                                        \
    _error << "CUDNN failure: " << cudnnGetErrorString(_status);     \
    FatalError(_error.str());                                        \
  }                                                                  \
} while (0)
#endif // DISABLE_COMPUTATION
// Abort on any nonzero CUDA status; the expression is evaluated exactly once.
#define checkCUDA(status) do {                                       \
  int _status = (int)(status);                                       \
  if (_status != 0) {                                                \
    std::stringstream _error;                                        \
    _error << "Cuda failure: " << _status;                           \
    FatalError(_error.str());                                        \
  }                                                                  \
} while (0)
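// Usage sketch (hypothetical caller, not part of this header): wrap any call
// that returns a status code, e.g.
//   checkCUDA(cudaMalloc(&d_buf, 1024 * sizeof(float)));
//   checkCUDNN(cudnnCreate(&dnn_handle));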
// CUDA: grid-stride loop; each thread starts at its global index and
// strides by the total number of threads until it passes n.
#define CUDA_KERNEL_LOOP(i, n)                                       \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);       \
       i += blockDim.x * gridDim.x)
// Use 1024 threads per block, which requires compute capability 2.x or above.
const int CUDA_NUM_THREADS = 1024;
const int BLOCK_SIZE_LIMIT = 32768;
// Number of blocks needed to cover N elements, capped at BLOCK_SIZE_LIMIT;
// the grid-stride loop above picks up whatever the cap leaves over.
inline int GET_BLOCKS(const int N)
{
  int ret = (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
  return (ret > BLOCK_SIZE_LIMIT) ? BLOCK_SIZE_LIMIT : ret;
}
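// Launch-pattern sketch (ptr and size are hypothetical; assign_kernel is
// declared below and assumed to use CUDA_KERNEL_LOOP):
//   assign_kernel<<<GET_BLOCKS(size), CUDA_NUM_THREADS>>>(ptr, size, 0.0f);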
// coord_t, Rect, and PointInRectIterator below come from Legion.
using namespace Legion;
// Kernel and host helper declarations; the definitions live in the
// companion .cu translation unit.
__global__
void scale_kernel(float* ptr, coord_t size, float a, float b);
__global__
void ones_kernel(float* ptr, coord_t size);
__global__
void assign_kernel(float* ptr, coord_t size, float value);
__global__
void reluBackward(float* grad_ptr, const float* input, int n);
__global__
void apply_add_with_scale(float *data_ptr, const float *grad_ptr,
                          size_t size, float scale);
__host__
void updateGAS(float* para_ptr, const float* grad_ptr, size_t replica_size,
               int num_replica, float learning_rate);
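// A minimal sketch of how one of these kernels could be defined in the .cu
// file, assuming the CUDA_KERNEL_LOOP pattern above (illustrative, not
// necessarily the actual definition):
//   __global__
//   void assign_kernel(float* ptr, coord_t size, float value)
//   {
//     CUDA_KERNEL_LOOP(i, size)
//       ptr[i] = value;
//   }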
template<unsigned DIM, typename T>
void print_tensor(const T* ptr, Rect<DIM> rect, const char* prefix)
{
  // Synchronize so pending device writes to the tensor have completed.
  checkCUDA(cudaDeviceSynchronize());
  // Stage the tensor into pinned host memory, then print every element.
  T* host_ptr;
  checkCUDA(cudaHostAlloc(&host_ptr, sizeof(T) * rect.volume(),
                          cudaHostAllocPortable | cudaHostAllocMapped));
  checkCUDA(cudaMemcpy(host_ptr, ptr, sizeof(T) * rect.volume(),
                       cudaMemcpyDeviceToHost));
  int idx = 0;
  printf("%s", prefix);
  for (PointInRectIterator<DIM> it(rect); it(); it++, idx++) {
    printf(" %.4lf", host_ptr[idx]);
  }
  printf("\n");
  checkCUDA(cudaFreeHost(host_ptr));
}
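// Usage sketch (d_ptr is a hypothetical device pointer to 8 floats):
//   Rect<1> rect(Point<1>(0), Point<1>(7));
//   print_tensor<1, float>(d_ptr, rect, "tensor:");
// Note that the "%.4lf" format assumes a floating-point T.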
#endif // _LEGION_CNN_HELPER_H_