-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathGetRecLengthsAndColCountErrors.cu
123 lines (102 loc) · 4.77 KB
/
GetRecLengthsAndColCountErrors.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
/******************************************************************************
* Copyright (c) 2016-2018, Brian Kennedy. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/******************************************************************************
*
* See https://github.com/Simantex/CSVImporter for repository and documentation.
*
******************************************************************************/
#include "CSV_kernel_declarations.cuh"
#include "csvImporter.h"
// Goes through the Records Table. Gets record lengths, Column counts per record, and sets column count errors flag if column nums don't match setting.
// This version of the launcher and kernel use shared mem to store last lane values in block. Since need to compare current value to prev, this saves a buffer read.
// However we do not write out last lane of last thread to SMEM (would be throw away), so we don't allocate SMEM for that.
extern "C" void
launch_GetRecLengthsAndColCountErrorsSMEM2(uint32_t * d_RecsTabl, uint32_t * d_RecsToColsTabl,
uint32_t * d_RecLengths, uint32_t * d_ColCountErrors, uint32_t * d_ColumnCountsPerRecordTable,
uint32_t ValuesCount, uint16_t numcols)
{
// Call stream compact kernel.
int iThreads = 256;
float fBlocks = (float)ValuesCount / ((float)iThreads);
int iBlocks = ValuesCount / iThreads;
fBlocks = fBlocks - iBlocks;
if (fBlocks > 0)
iBlocks++;
float fwarpsperblock = (float)iThreads / ((float)32);
int iwarpsperblock = iThreads / 32;
fwarpsperblock = fwarpsperblock - iwarpsperblock;
if (fwarpsperblock > 0)
iwarpsperblock++;
// need 2 values per warp.
int smemvalues = (iwarpsperblock - 1) * 2; // need # warps - 1
GetRecLengthsAndColCountErrorsSMEM2<<<iBlocks, iThreads, smemvalues * sizeof(uint32_t)>>> (d_RecsTabl, d_RecsToColsTabl, d_RecLengths, d_ColCountErrors, d_ColumnCountsPerRecordTable, ValuesCount, numcols);
Check_cuda_Errors("GetRecLengthsAndColCountErrorsSMEM2");
}
__global__ void GetRecLengthsAndColCountErrorsSMEM2(uint32_t * d_RecsTabl, uint32_t * d_RecsToColsTabl,
uint32_t * d_RecLengths, uint32_t * d_ColCountErrors, uint32_t * d_ColumnCountsPerRecordTable,
uint32_t ValuesCount, uint16_t numcols)
{
int ix = blockIdx.x * blockDim.x + threadIdx.x;
if (ix >= ValuesCount) return; // ignore anything in last block beyond source arrays length.
// get current record array idx and recstocols
uint32_t myrecs = d_RecsTabl[ix];
uint32_t myrecstocols = d_RecsToColsTabl[ix];
// look for myrecs (offset 0) and myrecstocols (offset 1). so 2 uint32_t's per warp.
extern __shared__ uint32_t finallanevalues[];
int warpnum = threadIdx.x >> 5; // int divide by 32, throw away remainder
int laneid = ix & 0x1f;
// last lane writes input to shared, unless it is the last thread in the block (value would not be used).
if ((laneid == 31) && (threadIdx.x != (blockDim.x - 1)))
{
finallanevalues[warpnum * 2] = myrecs;
finallanevalues[(warpnum * 2) + 1] = myrecstocols;
}
// ensure all warps in the block have access to these values in shared.
__syncthreads();
// try to load prev based on warp shuffles
uint32_t prevrecs = __shfl_up(myrecs, 1);
uint32_t prevrecstocols = __shfl_up(myrecstocols, 1);
// now mod based on thread and lane
// set to 0 for very first thread (0) since nothing to look up
if (ix == 0)
{
prevrecs = 0;
prevrecstocols = 0;
}
// for the first thread in the block, must look up from mem.
else if (threadIdx.x == 0)
{
prevrecs = d_RecsTabl[ix - 1];
prevrecstocols = d_RecsToColsTabl[ix - 1];
}
// otherwise if the first thread in the warp (not 1st in block), can get from shared. look back 1 warp position.
else if ((ix % 32) == 0)
{
prevrecs = finallanevalues[(warpnum - 1) * 2];
prevrecstocols = finallanevalues[((warpnum - 1) * 2) + 1];
}
uint32_t reclen = myrecs - prevrecs;
uint32_t colscount = myrecstocols - prevrecstocols;
// add 1 if this is the first value to adjust for initial 0-based
// (rectocols 0 is 1 less than rectocols 1 minus rectocols 0, but consistent from there up)
if (ix == 0) colscount += 1;
uint32_t colscounterrorflag = ((uint16_t)colscount == numcols) ? 0 : 1;
d_RecLengths[ix] = reclen;
d_ColCountErrors[ix] = colscounterrorflag;
d_ColumnCountsPerRecordTable[ix] = colscount;
return;
}