From e0800a9534410a918481081b293c752c435c3e02 Mon Sep 17 00:00:00 2001
From: Due
Date: Fri, 22 Nov 2024 16:16:20 +0100
Subject: [PATCH] A start on adding a CUDA backend.

---
 alpacc.cabal                               |   4 +-
 cuda/lexer.cu                              | 633 +++++++++++++++++++++
 src/Alpacc/Generator/Cuda/Cudafy.hs        |  57 ++
 src/Alpacc/Generator/Futhark/FutPrinter.hs |  77 ---
 src/Alpacc/Generator/Futhark/Futharkify.hs |  77 +++
 src/Alpacc/Generator/Futhark/Generator.hs  |   4 +-
 src/Alpacc/Generator/Futhark/Lexer.hs      |  12 +-
 src/Alpacc/Generator/Futhark/Parser.hs     |  28 +-
 8 files changed, 792 insertions(+), 100 deletions(-)
 create mode 100644 cuda/lexer.cu
 create mode 100644 src/Alpacc/Generator/Cuda/Cudafy.hs
 delete mode 100644 src/Alpacc/Generator/Futhark/FutPrinter.hs
 create mode 100644 src/Alpacc/Generator/Futhark/Futharkify.hs

diff --git a/alpacc.cabal b/alpacc.cabal
index f5e3bc5..c0a0089 100644
--- a/alpacc.cabal
+++ b/alpacc.cabal
@@ -16,6 +16,7 @@ extra-source-files:
   CHANGELOG.md
   futhark/parser.fut
   futhark/lexer.fut
+  cuda/lexer.cu
 
 library
   default-language: GHC2021
@@ -62,9 +63,10 @@ library
     Alpacc.Types
    Alpacc.Generator.Futhark.Lexer
    Alpacc.Generator.Futhark.Parser
-    Alpacc.Generator.Futhark.FutPrinter
+    Alpacc.Generator.Futhark.Futharkify
    Alpacc.Generator.Futhark.Generator
    Alpacc.Generator.Futhark.Util
+    Alpacc.Generator.Cuda.Cudafy
    Alpacc.Lexer.RegularExpression
    Alpacc.Lexer.NFA
    Alpacc.Lexer.DFA
diff --git a/cuda/lexer.cu b/cuda/lexer.cu
new file mode 100644
index 0000000..00f3c7a
--- /dev/null
+++ b/cuda/lexer.cu
@@ -0,0 +1,633 @@
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
+#include <vector>
+#include <sys/time.h>
+#include <cuda_runtime.h>
+// NOTE: gpuAssert, timeval_subtract and compute_descriptors are used below
+// but not defined in this file; they are assumed to come from shared
+// benchmarking helpers.
+
+using token_t = unsigned char;
+using state_t = unsigned short;
+
+const unsigned int NUM_STATES = 12;
+const unsigned int NUM_TRANS = 256;
+// const token_t IGNORE_TOKEN = 0;
+const state_t ENDO_MASK = 15;
+const state_t ENDO_OFFSET = 0;
+const state_t TOKEN_MASK = 112;
+const state_t TOKEN_OFFSET = 4;
+const state_t ACCEPT_MASK = 128;
+const state_t ACCEPT_OFFSET = 7;
+const state_t PRODUCE_MASK = 256;
+const state_t PRODUCE_OFFSET = 8;
+const state_t IDENTITY = 74;
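+// Illustrative only (not part of the generated tables): a state word packs
+// an endomorphism index, a token id, an accept bit and a produce bit under
+// the masks above. Decoding the table entry 147 gives
+//   endo    = 147 & 15          = 3
+//   token   = (147 & 112) >> 4  = 1
+//   accept  = (147 & 128) >> 7  = 1
+//   produce = (147 & 256) >> 8  = 0
+static_assert((147 & ENDO_MASK) >> ENDO_OFFSET == 3, "endo index of 147");
+static_assert((147 & TOKEN_MASK) >> TOKEN_OFFSET == 1, "token id of 147");
+static_assert((147 & ACCEPT_MASK) >> ACCEPT_OFFSET == 1, "accept bit of 147");
+static_assert((147 & PRODUCE_MASK) >> PRODUCE_OFFSET == 0, "produce bit of 147");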
+
+state_t h_to_state[NUM_TRANS] =
+  {75, 75, 75, 75, 75, 75, 75, 75, 75, 128, 128, 75, 75, 128, 75, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75,
+   128, 75, 75, 75, 75, 75, 75, 75, 161, 178, 75, 75, 75, 75, 75, 75,
+   147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 75, 75, 75, 75, 75, 75,
+   75, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147,
+   147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 75, 75, 75, 75, 75,
+   75, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147,
+   147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 75, 75, 75, 75, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75};
+
+state_t h_compose[NUM_STATES * NUM_STATES] =
+  {132, 392, 392, 392, 132, 392, 392, 392, 132, 392, 128, 75,
+   421, 421, 421, 421, 421, 421, 421, 421, 421, 421, 161, 75,
+   438, 438, 438, 438, 438, 438, 438, 438, 438, 438, 178, 75,
+   407, 407, 407, 153, 407, 407, 407, 153, 407, 153, 147, 75,
+   132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 75,
+   421, 421, 421, 421, 421, 421, 421, 421, 421, 421, 421, 75,
+   438, 438, 438, 438, 438, 438, 438, 438, 438, 438, 438, 75,
+   407, 407, 407, 407, 407, 407, 407, 407, 407, 407, 407, 75,
+   392, 392, 392, 392, 392, 392, 392, 392, 392, 392, 392, 75,
+   153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 75,
+   128, 161, 178, 147, 132, 421, 438, 407, 392, 153, 74, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75};
+
+const unsigned char LG_WARP = 5;
+const unsigned char WARP = 1 << LG_WARP;
+
+template<typename T, typename I, I ITEMS_PER_THREAD>
+__device__ inline void
+glbToShmemCpy(const I glb_offs,
+              const I size,
+              const T ne,
+              T* d_read,
+              volatile T* shmem_write) {
+  #pragma unroll
+  for (I i = 0; i < ITEMS_PER_THREAD; i++) {
+    I lid = i * blockDim.x + threadIdx.x;
+    I gid = glb_offs + lid;
+    shmem_write[lid] = gid < size ? d_read[gid] : ne;
+  }
+  __syncthreads();
+}
+
+template<typename T, typename I, I ITEMS_PER_THREAD>
+__device__ inline void
+shmemToGlbCpy(const I glb_offs,
+              const I size,
+              T* d_write,
+              volatile T* shmem_read) {
+  #pragma unroll
+  for (I i = 0; i < ITEMS_PER_THREAD; i++) {
+    I lid = blockDim.x * i + threadIdx.x;
+    I gid = glb_offs + lid;
+    if (gid < size)
+      d_write[gid] = shmem_read[lid];
+  }
+  __syncthreads();
+}
+
+template<typename I, typename T, typename OP, I ITEMS_PER_THREAD>
+__device__ inline void
+scanThread(volatile T* shmem,
+           volatile T* shmem_aux,
+           OP op) {
+  const I offset = threadIdx.x * ITEMS_PER_THREAD;
+  const I upper = offset + ITEMS_PER_THREAD;
+  T acc = shmem[offset];
+  #pragma unroll
+  for (I lid = offset + 1; lid < upper; lid++) {
+    T tmp = shmem[lid];
+    acc = op(acc, tmp);
+    shmem[lid] = acc;
+  }
+  shmem_aux[threadIdx.x] = acc;
+  __syncthreads();
+}
+
+template<typename T, typename OP>
+__device__ inline T
+scanWarp(volatile T* shmem,
+         OP op,
+         const unsigned char lane) {
+  unsigned char h;
+
+  #pragma unroll
+  for (unsigned char d = 0; d < LG_WARP; d++)
+    if ((h = 1 << d) <= lane)
+      shmem[threadIdx.x] = op(shmem[threadIdx.x - h], shmem[threadIdx.x]);
+
+  return shmem[threadIdx.x];
+}
+
+template<typename I, typename T, typename OP>
+__device__ inline void
+scanBlock(volatile T* shmem,
+          OP op) {
+  const unsigned char lane = threadIdx.x & (WARP - 1);
+  const I warpid = threadIdx.x >> LG_WARP;
+
+  T res = scanWarp(shmem, op, lane);
+  __syncthreads();
+
+  if (lane == (WARP - 1))
+    shmem[warpid] = res;
+  __syncthreads();
+
+  if (warpid == 0)
+    scanWarp(shmem, op, lane);
+  __syncthreads();
+
+  if (warpid > 0)
+    res = op(shmem[warpid-1], res);
+  __syncthreads();
+
+  shmem[threadIdx.x] = res;
+  __syncthreads();
+}
+
+template<typename I, typename T, typename OP, I ITEMS_PER_THREAD>
+__device__ inline void
+addAuxBlockScan(volatile T* shmem,
+                volatile T* shmem_aux,
+                OP op) {
+  if (threadIdx.x > 0) {
+    const I offset = threadIdx.x * ITEMS_PER_THREAD;
+    const I upper = offset + ITEMS_PER_THREAD;
+    const T val = shmem_aux[threadIdx.x - 1];
+    #pragma unroll
+    for (I lid = offset; lid < upper; lid++) {
+      shmem[lid] = op(val, shmem[lid]);
+    }
+  }
+  __syncthreads();
+}
+
+template<typename I, typename T, typename OP, I ITEMS_PER_THREAD>
+__device__ inline void
+scanBlock(volatile T* block,
+          volatile T* block_aux,
+          OP op) {
+  scanThread<I, T, OP, ITEMS_PER_THREAD>(block, block_aux, op);
+
+  scanBlock<I, T, OP>(block_aux, op);
+
+  addAuxBlockScan<I, T, OP, ITEMS_PER_THREAD>(block, block_aux, op);
+}
+
+__device__ inline unsigned int dynamicIndex(volatile unsigned int* dyn_idx_ptr) {
+  volatile __shared__ unsigned int dyn_idx;
+
+  if (threadIdx.x == 0)
+    dyn_idx = atomicAdd(const_cast<unsigned int*>(dyn_idx_ptr), 1);
+
+  __syncthreads();
+  return dyn_idx;
+}
+
+enum Status: unsigned char {
+  Invalid = 0,
+  Aggregate = 1,
+  Prefix = 2,
+};
+
+template<typename T>
+struct State {
+  T aggregate;
+  T prefix;
+  Status status = Invalid;
+};
+
+__device__ inline Status
+combine(Status a, Status b) {
+  if (b == Aggregate)
+    return a;
+  return b;
+}
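+
+// Illustrative walk-through of the decoupled look-back below, which uses
+// `combine`: every block first publishes (aggregate, Aggregate). A warp of a
+// later block reads the states of up to WARP preceding blocks and warp-scans
+// the (value, status) pairs; `combine` keeps the newer status unless the
+// newer one is Aggregate, so the window collapses to Prefix exactly when a
+// preceding block has already published a Prefix. Once that happens, the
+// block publishes (op(prefix, aggregate), Prefix) for blocks after it.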
+
+template<typename I, typename T, typename OP>
+__device__ inline void
+scanWarp(volatile T* values,
+         volatile Status* statuses,
+         OP op,
+         const unsigned char lane) {
+  unsigned char h;
+  const I tid = threadIdx.x;
+
+  #pragma unroll
+  for (unsigned char d = 0; d < LG_WARP; d++) {
+    if ((h = 1 << d) <= lane) {
+      bool is_not_aggregate = statuses[tid] != Aggregate;
+      values[tid] = is_not_aggregate ? const_cast<T*>(values)[tid] : op(values[tid - h], values[tid]);
+      statuses[tid] = combine(statuses[tid - h], statuses[tid]);
+    }
+  }
+}
+
+template<typename I, typename OP, I ITEMS_PER_THREAD, typename T>
+__device__ inline T
+decoupledLookbackScanNoWrite(volatile State<T>* states,
+                             volatile T* shmem,
+                             OP op,
+                             const T ne,
+                             unsigned int dyn_idx) {
+  volatile __shared__ T values[WARP];
+  volatile __shared__ Status statuses[WARP];
+  volatile __shared__ T shmem_prefix;
+  const unsigned char lane = threadIdx.x & (WARP - 1);
+  const bool is_first = threadIdx.x == 0;
+
+  T aggregate = shmem[ITEMS_PER_THREAD * blockDim.x - 1];
+
+  if (is_first) {
+    states[dyn_idx].aggregate = aggregate;
+  }
+
+  if (dyn_idx == 0 && is_first) {
+    states[dyn_idx].prefix = aggregate;
+  }
+
+  __threadfence();
+  if (dyn_idx == 0 && is_first) {
+    states[dyn_idx].status = Prefix;
+  } else if (is_first) {
+    states[dyn_idx].status = Aggregate;
+  }
+
+  T prefix = ne;
+  if (threadIdx.x < WARP && dyn_idx != 0) {
+    I lookback_idx = threadIdx.x + dyn_idx;
+    I lookback_warp = WARP;
+    Status status = Aggregate;
+    do {
+      if (lookback_warp <= lookback_idx) {
+        I idx = lookback_idx - lookback_warp;
+        status = states[idx].status;
+        statuses[threadIdx.x] = status;
+        values[threadIdx.x] = status == Prefix ? states[idx].prefix : states[idx].aggregate;
+      } else {
+        statuses[threadIdx.x] = Aggregate;
+        values[threadIdx.x] = ne;
+      }
+
+      scanWarp<I, T, OP>(values, statuses, op, lane);
+
+      T result = values[WARP - 1];
+      status = statuses[WARP - 1];
+
+      if (status == Invalid)
+        continue;
+
+      if (is_first) {
+        prefix = op(result, prefix);
+      }
+
+      lookback_warp += WARP;
+    } while (status != Prefix);
+  }
+
+  if (is_first) {
+    shmem_prefix = prefix;
+  }
+
+  __syncthreads();
+
+  if (is_first) {
+    states[dyn_idx].prefix = op(prefix, aggregate);
+    __threadfence();
+    states[dyn_idx].status = Prefix;
+  }
+
+  return shmem_prefix;
+}
+
+// Inclusive block scan followed by a decoupled look-back across blocks; the
+// look-back prefix is composed onto every element so `block` ends up globally
+// scanned. Referenced by the lexer kernel below.
+template<typename I, typename T, typename OP, I ITEMS_PER_THREAD>
+__device__ inline void
+scan(volatile T* block,
+     volatile T* block_aux,
+     volatile State<T>* states,
+     OP op,
+     const T ne,
+     unsigned int dyn_idx) {
+  scanBlock<I, T, OP, ITEMS_PER_THREAD>(block, block_aux, op);
+
+  T prefix = decoupledLookbackScanNoWrite<I, OP, ITEMS_PER_THREAD>(states, block, op, ne, dyn_idx);
+
+  const I offset = threadIdx.x * ITEMS_PER_THREAD;
+  #pragma unroll
+  for (I i = 0; i < ITEMS_PER_THREAD; i++) {
+    block[offset + i] = op(prefix, block[offset + i]);
+  }
+  __syncthreads();
+}
+
+__device__ __host__ __forceinline__ state_t get_index(state_t state) {
+  return (state & ENDO_MASK) >> ENDO_OFFSET;
+}
+
+__device__ __host__ __forceinline__ token_t get_token(state_t state) {
+  return (state & TOKEN_MASK) >> TOKEN_OFFSET;
+}
+
+__device__ bool is_accept(state_t state) {
+  return (state & ACCEPT_MASK) >> ACCEPT_OFFSET;
+}
+
+__device__ __host__ __forceinline__ bool is_produce(state_t state) {
+  return (state & PRODUCE_MASK) >> PRODUCE_OFFSET;
+}
+
+struct LexerCtx {
+  state_t* d_to_state;
+  state_t* d_compose;
+
+  LexerCtx() : d_to_state(NULL), d_compose(NULL) {
+    cudaMalloc(&d_to_state, sizeof(h_to_state));
+    cudaMemcpy(d_to_state, h_to_state, sizeof(h_to_state),
+               cudaMemcpyHostToDevice);
+    cudaMalloc(&d_compose, sizeof(h_compose));
+    cudaMemcpy(d_compose, h_compose, sizeof(h_compose),
+               cudaMemcpyHostToDevice);
+  }
+
+  void Cleanup() {
+    if (d_to_state) cudaFree(d_to_state);
+    if (d_compose) cudaFree(d_compose);
+  }
+
+  __device__ __host__ __forceinline__
+  state_t operator()(const state_t &a, const state_t &b) const {
+    return d_compose[get_index(b) * NUM_STATES + get_index(a)];
+  }
+
+  __device__ __host__ __forceinline__
+  state_t operator()(const volatile state_t &a, const volatile state_t &b) const {
+    return d_compose[get_index(b) * NUM_STATES + get_index(a)];
+  }
+
+  __device__ __host__ __forceinline__
+  state_t to_state(const unsigned char &a) const {
+    // unsigned, so bytes >= 128 do not index with a negative char.
+    return d_to_state[a];
+  }
+};
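+
+// Illustrative only (host-side sketch, not used by the kernels): composing
+// two table entries by hand, mirroring LexerCtx::operator() on the host
+// copies of the tables. For example, both 'a' and 'b' map to entry 147
+// (endomorphism 3), and h_compose[3 * NUM_STATES + 3] = 153, whose accept
+// bit is set and produce bit is clear: the second character extends the
+// current token instead of starting a new one.
+inline state_t composeOnHost(state_t a, state_t b) {
+  return h_compose[get_index(b) * NUM_STATES + get_index(a)];
+}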
+
+template<typename I>
+struct Add {
+  __device__ __forceinline__ I operator()(I a, I b) const {
+    return a + b;
+  }
+};
+
+template<typename I, I BLOCK_SIZE, I ITEMS_PER_THREAD>
+__global__ void
+lexer(LexerCtx ctx,
+      unsigned char* d_in,
+      unsigned int* d_index_out,
+      token_t* d_token_out,
+      volatile State<state_t>* state_states,
+      volatile State<I>* index_states,
+      I size,
+      I num_logical_blocks,
+      volatile unsigned int* dyn_index_ptr,
+      volatile I* new_size,
+      volatile bool* is_valid) {
+  volatile __shared__ state_t states[ITEMS_PER_THREAD * BLOCK_SIZE];
+  volatile __shared__ I indices[ITEMS_PER_THREAD * BLOCK_SIZE];
+  volatile __shared__ I indices_aux[BLOCK_SIZE];
+  __shared__ state_t next_block_first_state;
+  // states_aux aliases indices; the two uses are separated by barriers.
+  volatile state_t* states_aux = (volatile state_t*) indices;
+  const I REG_MEM = 1 + ITEMS_PER_THREAD / sizeof(unsigned long long);
+  unsigned long long copy_reg[REG_MEM];
+  unsigned char *chars_reg = (unsigned char*) copy_reg;
+  unsigned int is_produce_state = 0;
+
+  unsigned int dyn_index = dynamicIndex(dyn_index_ptr);
+  I glb_offs = dyn_index * BLOCK_SIZE * ITEMS_PER_THREAD;
+
+  states_aux[threadIdx.x] = ctx.to_state(threadIdx.x);
+
+  if (threadIdx.x == I()) {
+    next_block_first_state = IDENTITY;
+  }
+
+  __syncthreads();
+
+  #pragma unroll
+  for (I i = 0; i < REG_MEM; i++) {
+    I uint64_lid = i * blockDim.x + threadIdx.x;
+    I lid = sizeof(unsigned long long) * uint64_lid;
+    I gid = glb_offs + lid;
+    if (gid + sizeof(unsigned long long) < size) {
+      copy_reg[i] = *((unsigned long long*) (gid + (unsigned char*) d_in));
+    } else {
+      for (I j = 0; j < sizeof(unsigned long long); j++) {
+        I loc_gid = gid + j;
+        if (loc_gid < size) {
+          chars_reg[sizeof(unsigned long long) * i + j] = d_in[loc_gid];
+        }
+      }
+    }
+  }
+
+  #pragma unroll
+  for (I i = 0; i < REG_MEM; i++) {
+    I lid = i * blockDim.x + threadIdx.x;
+    I _gid = glb_offs + sizeof(unsigned long long) * lid;
+    for (I j = 0; j < sizeof(unsigned long long); j++) {
+      I gid = _gid + j;
+      I lid_off = sizeof(unsigned long long) * lid + j;
+      I reg_off = sizeof(unsigned long long) * i + j;
+      bool is_in_block = lid_off < ITEMS_PER_THREAD * BLOCK_SIZE;
+      if (gid < size && is_in_block) {
+        states[lid_off] = states_aux[chars_reg[reg_off]];
+      } else if (is_in_block) {
+        states[lid_off] = IDENTITY;
+      } else if (lid_off == ITEMS_PER_THREAD * BLOCK_SIZE) {
+        next_block_first_state = states_aux[chars_reg[reg_off]];
+      }
+    }
+  }
+
+  __syncthreads();
+
+  scan<I, state_t, LexerCtx, ITEMS_PER_THREAD>(states, states_aux, state_states, ctx, IDENTITY, dyn_index);
+
+  #pragma unroll
+  for (I i = 0; i < ITEMS_PER_THREAD; i++) {
+    I lid = i * blockDim.x + threadIdx.x;
+    I gid = glb_offs + lid;
+    bool temp = false;
+    if (gid < size) {
+      if (lid == ITEMS_PER_THREAD * BLOCK_SIZE - 1) {
+        temp = gid == size - 1 || is_produce(ctx(states[lid], next_block_first_state));
+      } else {
+        temp = gid == size - 1 || is_produce(states[lid + 1]);
+      }
+    }
+    is_produce_state |= temp << i;
+    indices[lid] = temp;
+  }
+
+  __syncthreads();
+
+  scanBlock<I, I, Add<I>, ITEMS_PER_THREAD>(indices, indices_aux, Add<I>());
+
+  I prefix = decoupledLookbackScanNoWrite<I, Add<I>, ITEMS_PER_THREAD>(index_states, indices, Add<I>(), I(), dyn_index);
+
+  #pragma unroll
+  for (I i = 0; i < ITEMS_PER_THREAD; i++) {
+    I lid = blockDim.x * i + threadIdx.x;
+    I gid = glb_offs + lid;
+    if (gid < size && ((is_produce_state >> i) & 1)) {
+      I offset = Add<I>()(prefix, indices[lid]) - 1;
+      d_index_out[offset] = gid;
+      d_token_out[offset] = get_token(states[lid]);
+    }
+  }
+
+  if (dyn_index == num_logical_blocks - 1 && threadIdx.x == blockDim.x - 1) {
+    *new_size = Add<I>()(prefix, indices[ITEMS_PER_THREAD * BLOCK_SIZE - 1]);
+    *is_valid = is_accept(states[ITEMS_PER_THREAD * BLOCK_SIZE - 1]);
+  }
+}
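+
+// Illustrative only: with the launch configuration used in testLexer below
+// (BLOCK_SIZE = 256, ITEMS_PER_THREAD = 31, I = unsigned int), each thread
+// stages its 31 bytes through 4 64-bit registers (REG_MEM = 1 + 31/8 = 4),
+// the 31 per-thread produce flags fit a 32-bit mask, and shared memory adds
+// up to 7936 * 2 B (states) + 7936 * 4 B (indices) + 256 * 4 B (indices_aux)
+// = 48,640 B, just under the default 48 KiB limit.
+static_assert(1 + 31 / sizeof(unsigned long long) == 4,
+              "REG_MEM at 31 items per thread");
+static_assert(31 * 256 * (sizeof(unsigned short) + sizeof(unsigned int))
+              + 256 * sizeof(unsigned int) <= 49152,
+              "shared memory fits in the default 48 KiB");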
+
+void testLexer(unsigned char* input,
+               size_t input_size,
+               unsigned int* expected_indices,
+               token_t* expected_tokens,
+               size_t expected_size) {
+  using I = unsigned int;
+  const I size = input_size;
+  const I BLOCK_SIZE = 256;
+  const I ITEMS_PER_THREAD = 31;
+  const I NUM_LOGICAL_BLOCKS = (size + BLOCK_SIZE * ITEMS_PER_THREAD - 1) / (BLOCK_SIZE * ITEMS_PER_THREAD);
+  const I IN_ARRAY_BYTES = size * sizeof(unsigned char);
+  const I INDEX_OUT_ARRAY_BYTES = size * sizeof(I);
+  const I TOKEN_OUT_ARRAY_BYTES = size * sizeof(token_t);
+  const I STATE_STATES_BYTES = NUM_LOGICAL_BLOCKS * sizeof(State<state_t>);
+  const I INDEX_STATES_BYTES = NUM_LOGICAL_BLOCKS * sizeof(State<I>);
+  const I WARMUP_RUNS = 500;
+  const I RUNS = 50;
+
+  std::vector<token_t> h_token_out(size, 0);
+  std::vector<I> h_index_out(size, 0);
+
+  unsigned int* d_dyn_index_ptr;
+  I* d_new_size;
+  bool* d_is_valid;
+  unsigned char *d_in;
+  I *d_index_out;
+  token_t *d_token_out;
+  State<I>* d_index_states;
+  State<state_t>* d_state_states;
+  gpuAssert(cudaMalloc((void**)&d_dyn_index_ptr, sizeof(unsigned int)));
+  gpuAssert(cudaMalloc((void**)&d_new_size, sizeof(I)));
+  gpuAssert(cudaMalloc((void**)&d_is_valid, sizeof(bool)));
+  cudaMemset(d_dyn_index_ptr, 0, sizeof(unsigned int));
+  cudaMemset(d_is_valid, false, sizeof(bool));
+  gpuAssert(cudaMalloc((void**)&d_index_states, INDEX_STATES_BYTES));
+  gpuAssert(cudaMalloc((void**)&d_state_states, STATE_STATES_BYTES));
+  I padding = IN_ARRAY_BYTES; // sizeof(unsigned long long) - (IN_ARRAY_BYTES % sizeof(unsigned long long));
+  gpuAssert(cudaMalloc((void**)&d_in, IN_ARRAY_BYTES + padding));
+  gpuAssert(cudaMalloc((void**)&d_index_out, INDEX_OUT_ARRAY_BYTES));
+  gpuAssert(cudaMalloc((void**)&d_token_out, TOKEN_OUT_ARRAY_BYTES));
+  gpuAssert(cudaMemcpy(d_in, input, IN_ARRAY_BYTES, cudaMemcpyHostToDevice));
+
+  LexerCtx ctx = LexerCtx();
+
+  for (I i = 0; i < WARMUP_RUNS; ++i) {
+    lexer<I, BLOCK_SIZE, ITEMS_PER_THREAD><<<NUM_LOGICAL_BLOCKS, BLOCK_SIZE>>>(
+      ctx,
+      d_in,
+      d_index_out,
+      d_token_out,
+      d_state_states,
+      d_index_states,
+      size,
+      NUM_LOGICAL_BLOCKS,
+      d_dyn_index_ptr,
+      d_new_size,
+      d_is_valid
+    );
+    cudaDeviceSynchronize();
+    cudaMemset(d_dyn_index_ptr, 0, sizeof(unsigned int));
+    gpuAssert(cudaPeekAtLastError());
+  }
+
+  timeval * temp = (timeval *) malloc(sizeof(timeval) * RUNS);
+  timeval prev;
+  timeval curr;
+  timeval t_diff;
+
+  for (I i = 0; i < RUNS; ++i) {
+    gettimeofday(&prev, NULL);
+    lexer<I, BLOCK_SIZE, ITEMS_PER_THREAD><<<NUM_LOGICAL_BLOCKS, BLOCK_SIZE>>>(
+      ctx,
+      d_in,
+      d_index_out,
+      d_token_out,
+      d_state_states,
+      d_index_states,
+      size,
+      NUM_LOGICAL_BLOCKS,
+      d_dyn_index_ptr,
+      d_new_size,
+      d_is_valid
+    );
+    cudaDeviceSynchronize();
+    gettimeofday(&curr, NULL);
+    timeval_subtract(&t_diff, &curr, &prev);
+    temp[i] = t_diff;
+    cudaMemset(d_dyn_index_ptr, 0, sizeof(unsigned int));
+    gpuAssert(cudaPeekAtLastError());
+  }
+
+  I temp_size = 0;
+  gpuAssert(cudaMemcpy(&temp_size, d_new_size, sizeof(I), cudaMemcpyDeviceToHost));
+  const I OUT_WRITE = temp_size * (sizeof(I) + sizeof(token_t));
+  const I IN_READ = IN_ARRAY_BYTES;
+  const I IN_STATE_MAP = sizeof(state_t) * size;
+  const I SCAN_READ = sizeof(state_t) * (size + size / 2); // Lower bound; the scan reads more than this in practice.
+
+  lexer<I, BLOCK_SIZE, ITEMS_PER_THREAD><<<NUM_LOGICAL_BLOCKS, BLOCK_SIZE>>>(
+    ctx,
+    d_in,
+    d_index_out,
+    d_token_out,
+    d_state_states,
+    d_index_states,
+    size,
+    NUM_LOGICAL_BLOCKS,
+    d_dyn_index_ptr,
+    d_new_size,
+    d_is_valid
+  );
+  cudaDeviceSynchronize();
+  gpuAssert(cudaPeekAtLastError());
+  bool is_valid = false;
+  gpuAssert(cudaMemcpy(h_index_out.data(), d_index_out, INDEX_OUT_ARRAY_BYTES, cudaMemcpyDeviceToHost));
+  gpuAssert(cudaMemcpy(h_token_out.data(), d_token_out, TOKEN_OUT_ARRAY_BYTES, cudaMemcpyDeviceToHost));
+  gpuAssert(cudaMemcpy(&temp_size, d_new_size, sizeof(I), cudaMemcpyDeviceToHost));
+  gpuAssert(cudaMemcpy(&is_valid, d_is_valid, sizeof(bool), cudaMemcpyDeviceToHost));
+
+  bool test_passes = is_valid;
+
+  if (!test_passes) {
+    std::cout << "Lexer Test Failed: The input given to the lexer does not result in an accepting state." << std::endl;
+  }
+
+  test_passes = test_passes && temp_size == expected_size;
+  if (temp_size != expected_size) {
+    std::cout << "Lexer Test Failed: Expected size=" << expected_size << " but got size=" << temp_size << std::endl;
+  } else {
+    for (I i = 0; i < expected_size; ++i) {
+      test_passes &= h_index_out[i] == expected_indices[i];
+      test_passes &= h_token_out[i] == expected_tokens[i];
+
+      if (!test_passes) {
+        std::cout << "Lexer Test Failed: Due to elements mismatch at index=" << i << std::endl;
+        break;
+      }
+    }
+  }
+
+  if (test_passes) {
+    compute_descriptors(temp, RUNS, IN_READ + IN_STATE_MAP + SCAN_READ + OUT_WRITE);
+  }
+
+  free(temp);
+  gpuAssert(cudaFree(d_in));
+  gpuAssert(cudaFree(d_token_out));
+  gpuAssert(cudaFree(d_index_out));
+  gpuAssert(cudaFree(d_index_states));
+  gpuAssert(cudaFree(d_state_states));
+  gpuAssert(cudaFree(d_dyn_index_ptr));
+  gpuAssert(cudaFree(d_new_size));
+  gpuAssert(cudaFree(d_is_valid));
+
+  ctx.Cleanup();
+}
+
diff --git a/src/Alpacc/Generator/Cuda/Cudafy.hs b/src/Alpacc/Generator/Cuda/Cudafy.hs
new file mode 100644
index 0000000..3437cdb
--- /dev/null
+++ b/src/Alpacc/Generator/Cuda/Cudafy.hs
@@ -0,0 +1,57 @@
+module Alpacc.Generator.Cuda.Cudafy
+  ( Cudafy (..)
+  , RawString (..)
+  )
+where
+
+import Data.Array as Array hiding (Array)
+import Data.List qualified as List
+import Data.Array.Unboxed (UArray)
+import Data.Array.IArray as IArray
+import Alpacc.Types
+import Numeric.Natural
+
+newtype RawString = RawString String deriving (Show, Eq, Ord, Read)
+
+class Cudafy a where
+  cudafy :: a -> String
+
+instance Cudafy UInt where
+  cudafy U8 = "unsigned char"
+  cudafy U16 = "unsigned short"
+  cudafy U32 = "unsigned int"
+  cudafy U64 = "unsigned long long"
+
+instance Cudafy IInt where
+  cudafy I8 = "char"
+  cudafy I16 = "short"
+  cudafy I32 = "int"
+  cudafy I64 = "long long"
+
+instance Cudafy RawString where
+  cudafy (RawString s) = s
+
+instance Cudafy String where
+  cudafy = show
+
+instance Cudafy Int where
+  cudafy = show
+
+instance Cudafy Bool where
+  cudafy True = "true"
+  cudafy False = "false"
+
+instance Cudafy Natural where
+  cudafy = show
+
+instance Cudafy Integer where
+  cudafy = show
+
+-- CUDA initialisers use braces, unlike Futhark's [...] array syntax.
+instance (Cudafy a) => Cudafy [a] where
+  cudafy = ("{"<>) . (<>"}") . List.intercalate ", " . fmap cudafy
+
+instance (Cudafy a) => Cudafy (Array i a) where
+  cudafy = cudafy . Array.elems
+
+instance (Cudafy a, IArray UArray a, Ix i) => Cudafy (UArray i a) where
+  cudafy = cudafy . IArray.elems
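+
+-- Illustrative examples of the rendering (assumed behaviour, not
+-- exercised by tests):
+--   cudafy U16                   == "unsigned short"
+--   cudafy ([74, 75] :: [Int])   == "{74, 75}"
+--   cudafy (RawString "state_t") == "state_t"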
diff --git a/src/Alpacc/Generator/Futhark/FutPrinter.hs b/src/Alpacc/Generator/Futhark/FutPrinter.hs
deleted file mode 100644
index 72082e7..0000000
--- a/src/Alpacc/Generator/Futhark/FutPrinter.hs
+++ /dev/null
@@ -1,77 +0,0 @@
-module Alpacc.Generator.Futhark.FutPrinter
-  ( FutPrinter (..)
-  , NTuple (..)
-  , RawString (..)
-  )
-where
-
-import Data.Foldable
-import Data.Array as Array hiding (Array)
-import Data.List qualified as List
-import Data.Array.Unboxed (UArray)
-import Data.Array.IArray as IArray
-import Data.String.Interpolate ( i )
-import Alpacc.Types
-import Numeric.Natural
-
-newtype NTuple a = NTuple [a] deriving (Show, Eq, Ord, Read, Foldable)
-
-newtype RawString = RawString String deriving (Show, Eq, Ord, Read)
-
-class FutPrinter a where
-  futPrint :: a -> String
-
-instance FutPrinter UInt where
-  futPrint U8 = "u8"
-  futPrint U16 = "u16"
-  futPrint U32 = "u32"
-  futPrint U64 = "u64"
-
-instance FutPrinter IInt where
-  futPrint I8 = "i8"
-  futPrint I16 = "i16"
-  futPrint I32 = "i32"
-  futPrint I64 = "i64"
-
-instance FutPrinter RawString where
-  futPrint (RawString s) = s
-
-instance FutPrinter String where
-  futPrint = show
-
-instance FutPrinter Int where
-  futPrint = show
-
-instance FutPrinter Bool where
-  futPrint True = "true"
-  futPrint False = "false"
-
-instance FutPrinter Natural where
-  futPrint = show
-
-instance FutPrinter Integer where
-  futPrint = show
-
-instance (FutPrinter a, FutPrinter b) => FutPrinter (a, b) where
-  futPrint (a, b) = [i|(#{futPrint a}, #{futPrint b})|]
-
-instance (FutPrinter a) => FutPrinter [a] where
-  futPrint = ("["<>) . (<>"]") . List.intercalate ", " . fmap futPrint
-
-instance (FutPrinter a) => FutPrinter (NTuple a) where
-  futPrint =
-    ("("<>)
-      . (<>")")
-      . List.intercalate ", "
-      . fmap futPrint
-      . toList
-
-instance (FutPrinter a) => FutPrinter (Array i a) where
-  futPrint = futPrint . Array.elems
-
-instance (FutPrinter a, IArray UArray a, Ix i) => FutPrinter (UArray i a) where
-  futPrint = futPrint . IArray.elems
-
-instance (FutPrinter a) => FutPrinter (Maybe a) where
-  futPrint (Just a) = "#some " <> futPrint a
-  futPrint Nothing = "#none"
diff --git a/src/Alpacc/Generator/Futhark/Futharkify.hs b/src/Alpacc/Generator/Futhark/Futharkify.hs
new file mode 100644
index 0000000..ec4ba75
--- /dev/null
+++ b/src/Alpacc/Generator/Futhark/Futharkify.hs
@@ -0,0 +1,77 @@
+module Alpacc.Generator.Futhark.Futharkify
+  ( Futharkify (..)
+  , NTuple (..)
+  , RawString (..)
+  )
+where
+
+import Data.Foldable
+import Data.Array as Array hiding (Array)
+import Data.List qualified as List
+import Data.Array.Unboxed (UArray)
+import Data.Array.IArray as IArray
+import Data.String.Interpolate ( i )
+import Alpacc.Types
+import Numeric.Natural
+
+newtype NTuple a = NTuple [a] deriving (Show, Eq, Ord, Read, Foldable)
+
+newtype RawString = RawString String deriving (Show, Eq, Ord, Read)
+
+class Futharkify a where
+  futharkify :: a -> String
+
+instance Futharkify UInt where
+  futharkify U8 = "u8"
+  futharkify U16 = "u16"
+  futharkify U32 = "u32"
+  futharkify U64 = "u64"
+
+instance Futharkify IInt where
+  futharkify I8 = "i8"
+  futharkify I16 = "i16"
+  futharkify I32 = "i32"
+  futharkify I64 = "i64"
+
+instance Futharkify RawString where
+  futharkify (RawString s) = s
+
+instance Futharkify String where
+  futharkify = show
+
+instance Futharkify Int where
+  futharkify = show
+
+instance Futharkify Bool where
+  futharkify True = "true"
+  futharkify False = "false"
+
+instance Futharkify Natural where
+  futharkify = show
+
+instance Futharkify Integer where
+  futharkify = show
+
+instance (Futharkify a, Futharkify b) => Futharkify (a, b) where
+  futharkify (a, b) = [i|(#{futharkify a}, #{futharkify b})|]
+
+instance (Futharkify a) => Futharkify [a] where
+  futharkify = ("["<>) . (<>"]") . List.intercalate ", " . fmap futharkify
+
+instance (Futharkify a) => Futharkify (NTuple a) where
+  futharkify =
+    ("("<>)
+      . (<>")")
+      . List.intercalate ", "
+      . fmap futharkify
+      . toList
+
+instance (Futharkify a) => Futharkify (Array i a) where
+  futharkify = futharkify . Array.elems
+
+instance (Futharkify a, IArray UArray a, Ix i) => Futharkify (UArray i a) where
+  futharkify = futharkify . IArray.elems
+
+instance (Futharkify a) => Futharkify (Maybe a) where
+  futharkify (Just a) = "#some " <> futharkify a
+  futharkify Nothing = "#none"
diff --git a/src/Alpacc/Generator/Futhark/Generator.hs b/src/Alpacc/Generator/Futhark/Generator.hs
index 7dea8a9..ff71214 100644
--- a/src/Alpacc/Generator/Futhark/Generator.hs
+++ b/src/Alpacc/Generator/Futhark/Generator.hs
@@ -11,7 +11,7 @@ import Data.Map qualified as Map
 import Data.Map ( Map )
 import Alpacc.Types
 import Data.Either.Extra
-import Alpacc.Generator.Futhark.FutPrinter
+import Alpacc.Generator.Futhark.Futharkify
 
 parentVectorTest :: String
 parentVectorTest =
@@ -47,7 +47,7 @@ lexerFunction t =
   [i|
 entry lex s =
   match lexer.lex_chunked 16777216 s
   case #some r ->
-    map (\\(a, (b, c)) -> [i32.#{futPrint t} a, b, c]) r
+    map (\\(a, (b, c)) -> [i32.#{futharkify t} a, b, c]) r
   case #none -> []
 |]
diff --git a/src/Alpacc/Generator/Futhark/Lexer.hs b/src/Alpacc/Generator/Futhark/Lexer.hs
index 1112cd4..31b7ca3 100644
--- a/src/Alpacc/Generator/Futhark/Lexer.hs
+++ b/src/Alpacc/Generator/Futhark/Lexer.hs
@@ -14,7 +14,7 @@ import Alpacc.Lexer.ParallelLexing
 import Data.List qualified as List
 import Data.Either.Extra
 import Alpacc.Types
-import Alpacc.Generator.Futhark.FutPrinter
+import Alpacc.Generator.Futhark.Futharkify
 
 futharkLexer :: String
 futharkLexer = $(embedStringFile "futhark/lexer.fut")
@@ -49,8 +49,8 @@ compositionsArray int parallel_lexer =
 #{ps} :> [endomorphism_size * endomorphism_size]endomorphism
 |]
   where
-    ps = futPrint $ p <$> listCompositions parallel_lexer
-    p = RawString . (<> futPrint int) . futPrint
+    ps = futharkify $ p <$> listCompositions parallel_lexer
+    p = RawString . (<> futharkify int) . futharkify
 
 ignoreFunction :: Map T Int -> String
 ignoreFunction terminal_index_map =
@@ -83,8 +83,8 @@ generateLexer lexer terminal_index_map terminal_type = do
     futharkLexer
       <> [i|
 module lexer = mk_lexer {
-  module terminal_module = #{futPrint terminal_type}
-  module endomorphism_module = #{futPrint endomorphism_type}
+  module terminal_module = #{futharkify terminal_type}
+  module endomorphism_module = #{futharkify endomorphism_type}
 
   type endomorphism = endomorphism_module.t
   type terminal = terminal_module.t
@@ -103,7 +103,7 @@ module lexer = mk_lexer {
   #{defEndomorphismSize parallel_lexer}
 
   def accept_array: [endomorphism_size]bool =
-    sized endomorphism_size #{futPrint accept_array}
+    sized endomorphism_size #{futharkify accept_array}
 
   #{transitions_to_endo}
 
diff --git a/src/Alpacc/Generator/Futhark/Parser.hs b/src/Alpacc/Generator/Futhark/Parser.hs
index a87b405..8d63b8e 100644
--- a/src/Alpacc/Generator/Futhark/Parser.hs
+++ b/src/Alpacc/Generator/Futhark/Parser.hs
@@ -17,7 +17,7 @@ import Data.Map (Map)
 import Data.Map qualified as Map
 import Data.String.Interpolate (i)
 import Data.Tuple.Extra
-import Alpacc.Generator.Futhark.FutPrinter
+import Alpacc.Generator.Futhark.Futharkify
 import Alpacc.Generator.Futhark.Util
 import Data.Composition
 import Alpacc.HashTable
@@ -174,7 +174,7 @@ maxAoPi table = (max_alpha_omega, max_pi)
     max_pi = maximum $ length . snd <$> values
 
 createNe :: Int -> Int -> String
-createNe max_alpha_omega max_pi = futPrint ne
+createNe max_alpha_omega max_pi = futharkify ne
   where
     stacks = replicate max_alpha_omega (RawString "epsilon")
     rules = replicate max_pi (RawString "empty_production")
@@ -184,7 +184,7 @@
 -- n - 1 in the Futhark language.
 toTupleIndexArray :: (Show a, Num a, Enum a) => String -> a -> String
 toTupleIndexArray name n =
-  futPrint $ NTuple $ map (indexArray name) [0 .. n - 1]
+  futharkify $ NTuple $ map (indexArray name) [0 .. n - 1]
 
 createHashFunction :: Int -> Int -> String
 createHashFunction q k =
@@ -194,8 +194,8 @@ def hash_no_mod #{a_arg} #{b_arg} =
 |]
   where
     qk = q + k
-    a_arg = futPrint $ NTuple $ map RawString as
-    b_arg = futPrint $ NTuple $ map RawString bs
+    a_arg = futharkify $ NTuple $ map RawString as
+    b_arg = futharkify $ NTuple $ map RawString bs
     as = ["a" ++ show j | j <- [0..(qk - 1)]]
     bs = ["b" ++ show j | j <- [0..(qk - 1)]]
     concatWith a b c = b ++ a ++ c
@@ -240,25 +240,25 @@ generateParser q k grammar symbol_index_map = do
       padAndStringifyTable empty_terminal q k max_ao max_pi symbol_index_map table
   hash_table <- hashTable terminal_type 13 $ Map.mapKeys (fmap fromIntegral) integer_table
   let ne = createNe max_ao max_pi
-  let offsets_array_str = futPrint $ offsetArray hash_table
+  let offsets_array_str = futharkify $ offsetArray hash_table
   let hash_table_mem_size = ABase.numElements $ elementArray hash_table
   let hash_table_str =
-        futPrint
+        futharkify
           $ fmap (first NTuple)
           <$> elementArray hash_table
   let consts_array = constsArray hash_table
-  let consts_array_str = futPrint $ fmap (fmap NTuple) consts_array
-  let size_array = futPrint $ sizeArray hash_table
-  let consts = futPrint $ NTuple $ initHashConsts hash_table
+  let consts_array_str = futharkify $ fmap (fmap NTuple) consts_array
+  let size_array = futharkify $ sizeArray hash_table
+  let consts = futharkify $ NTuple $ initHashConsts hash_table
   let hash_table_size = ABase.numElements consts_array
   return
     . (,terminal_type)
     $ futharkParser
       <> [i|
module parser = mk_parser {

-module terminal_module = #{futPrint terminal_type}
-module production_module = #{futPrint production_type}
-module bracket_module = #{futPrint bracket_type}
+module terminal_module = #{futharkify terminal_type}
+module production_module = #{futharkify production_type}
+module bracket_module = #{futharkify bracket_type}

#{declarations}
@@ -315,6 +315,6 @@ def ne: ([max_ao]bracket, [max_pi]production) =
     augmented_grammar = augmentGrammar grammar
     terminals' = terminals augmented_grammar
     look_type =
-      futPrint
+      futharkify
        $ NTuple
        $ replicate (q + k) (RawString "terminal")