From e0800a9534410a918481081b293c752c435c3e02 Mon Sep 17 00:00:00 2001
From: Due
Date: Fri, 22 Nov 2024 16:16:20 +0100
Subject: [PATCH] A start on adding a CUDA backend.

---
 alpacc.cabal                               |   4 +-
 cuda/lexer.cu                              | 633 +++++++++++++++++++++
 src/Alpacc/Generator/Cuda/Cudafy.hs        |  57 ++
 src/Alpacc/Generator/Futhark/FutPrinter.hs |  77 ---
 src/Alpacc/Generator/Futhark/Futharkify.hs |  77 +++
 src/Alpacc/Generator/Futhark/Generator.hs  |   4 +-
 src/Alpacc/Generator/Futhark/Lexer.hs      |  12 +-
 src/Alpacc/Generator/Futhark/Parser.hs     |  28 +-
 8 files changed, 792 insertions(+), 100 deletions(-)
 create mode 100644 cuda/lexer.cu
 create mode 100644 src/Alpacc/Generator/Cuda/Cudafy.hs
 delete mode 100644 src/Alpacc/Generator/Futhark/FutPrinter.hs
 create mode 100644 src/Alpacc/Generator/Futhark/Futharkify.hs

diff --git a/alpacc.cabal b/alpacc.cabal
index f5e3bc5..c0a0089 100644
--- a/alpacc.cabal
+++ b/alpacc.cabal
@@ -16,6 +16,7 @@ extra-source-files:
   CHANGELOG.md
   futhark/parser.fut
   futhark/lexer.fut
+  cuda/lexer.cu
 
 library
   default-language: GHC2021
@@ -62,9 +63,10 @@ library
     Alpacc.Types
    Alpacc.Generator.Futhark.Lexer
    Alpacc.Generator.Futhark.Parser
-    Alpacc.Generator.Futhark.FutPrinter
+    Alpacc.Generator.Futhark.Futharkify
    Alpacc.Generator.Futhark.Generator
    Alpacc.Generator.Futhark.Util
+    Alpacc.Generator.Cuda.Cudafy
    Alpacc.Lexer.RegularExpression
    Alpacc.Lexer.NFA
    Alpacc.Lexer.DFA
diff --git a/cuda/lexer.cu b/cuda/lexer.cu
new file mode 100644
index 0000000..00f3c7a
--- /dev/null
+++ b/cuda/lexer.cu
@@ -0,0 +1,633 @@
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
+#include <vector>
+#include <sys/time.h>
+#include <cuda_runtime.h>
+// NOTE: gpuAssert, timeval_subtract and compute_descriptors are used below
+// but not defined in this file; they are assumed to come from shared
+// benchmarking helpers.
+
+using token_t = unsigned char;
+using state_t = unsigned short;
+
+const unsigned int NUM_STATES = 12;
+const unsigned int NUM_TRANS = 256;
+// const token_t IGNORE_TOKEN = 0;
+const state_t ENDO_MASK = 15;
+const state_t ENDO_OFFSET = 0;
+const state_t TOKEN_MASK = 112;
+const state_t TOKEN_OFFSET = 4;
+const state_t ACCEPT_MASK = 128;
+const state_t ACCEPT_OFFSET = 7;
+const state_t PRODUCE_MASK = 256;
+const state_t PRODUCE_OFFSET = 8;
+const state_t IDENTITY = 74;
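+// Illustrative only (not part of the generated tables): a state word packs
+// an endomorphism index, a token id, an accept bit and a produce bit under
+// the masks above. Decoding the table entry 147 gives
+//   endo    = 147 & 15          = 3
+//   token   = (147 & 112) >> 4  = 1
+//   accept  = (147 & 128) >> 7  = 1
+//   produce = (147 & 256) >> 8  = 0
+static_assert((147 & ENDO_MASK) >> ENDO_OFFSET == 3, "endo index of 147");
+static_assert((147 & TOKEN_MASK) >> TOKEN_OFFSET == 1, "token id of 147");
+static_assert((147 & ACCEPT_MASK) >> ACCEPT_OFFSET == 1, "accept bit of 147");
+static_assert((147 & PRODUCE_MASK) >> PRODUCE_OFFSET == 0, "produce bit of 147");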
+
+state_t h_to_state[NUM_TRANS] =
+  {75, 75, 75, 75, 75, 75, 75, 75, 75, 128, 128, 75, 75, 128, 75, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75,
+   128, 75, 75, 75, 75, 75, 75, 75, 161, 178, 75, 75, 75, 75, 75, 75,
+   147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 75, 75, 75, 75, 75, 75,
+   75, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147,
+   147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 75, 75, 75, 75, 75,
+   75, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147,
+   147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 75, 75, 75, 75, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75};
+
+state_t h_compose[NUM_STATES * NUM_STATES] =
+  {132, 392, 392, 392, 132, 392, 392, 392, 132, 392, 128, 75,
+   421, 421, 421, 421, 421, 421, 421, 421, 421, 421, 161, 75,
+   438, 438, 438, 438, 438, 438, 438, 438, 438, 438, 178, 75,
+   407, 407, 407, 153, 407, 407, 407, 153, 407, 153, 147, 75,
+   132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 75,
+   421, 421, 421, 421, 421, 421, 421, 421, 421, 421, 421, 75,
+   438, 438, 438, 438, 438, 438, 438, 438, 438, 438, 438, 75,
+   407, 407, 407, 407, 407, 407, 407, 407, 407, 407, 407, 75,
+   392, 392, 392, 392, 392, 392, 392, 392, 392, 392, 392, 75,
+   153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 75,
+   128, 161, 178, 147, 132, 421, 438, 407, 392, 153, 74, 75,
+   75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75};
+
+const unsigned char LG_WARP = 5;
+const unsigned char WARP = 1 << LG_WARP;
+
+template<typename T, typename I, I ITEMS_PER_THREAD>
+__device__ inline void
+glbToShmemCpy(const I glb_offs,
+              const I size,
+              const T ne,
+              T* d_read,
+              volatile T* shmem_write) {
+  #pragma unroll
+  for (I i = 0; i < ITEMS_PER_THREAD; i++) {
+    I lid = i * blockDim.x + threadIdx.x;
+    I gid = glb_offs + lid;
+    shmem_write[lid] = gid < size ? d_read[gid] : ne;
+  }
+  __syncthreads();
+}
+
+template<typename T, typename I, I ITEMS_PER_THREAD>
+__device__ inline void
+shmemToGlbCpy(const I glb_offs,
+              const I size,
+              T* d_write,
+              volatile T* shmem_read) {
+  #pragma unroll
+  for (I i = 0; i < ITEMS_PER_THREAD; i++) {
+    I lid = blockDim.x * i + threadIdx.x;
+    I gid = glb_offs + lid;
+    if (gid < size)
+      d_write[gid] = shmem_read[lid];
+  }
+  __syncthreads();
+}
+
+template<typename I, typename T, typename OP, I ITEMS_PER_THREAD>
+__device__ inline void
+scanThread(volatile T* shmem,
+           volatile T* shmem_aux,
+           OP op) {
+  const I offset = threadIdx.x * ITEMS_PER_THREAD;
+  const I upper = offset + ITEMS_PER_THREAD;
+  T acc = shmem[offset];
+  #pragma unroll
+  for (I lid = offset + 1; lid < upper; lid++) {
+    T tmp = shmem[lid];
+    acc = op(acc, tmp);
+    shmem[lid] = acc;
+  }
+  shmem_aux[threadIdx.x] = acc;
+  __syncthreads();
+}
+
+template<typename T, typename OP>
+__device__ inline T
+scanWarp(volatile T* shmem,
+         OP op,
+         const unsigned char lane) {
+  unsigned char h;
+
+  #pragma unroll
+  for (unsigned char d = 0; d < LG_WARP; d++)
+    if ((h = 1 << d) <= lane)
+      shmem[threadIdx.x] = op(shmem[threadIdx.x - h], shmem[threadIdx.x]);
+
+  return shmem[threadIdx.x];
+}
+
+template<typename I, typename T, typename OP>
+__device__ inline void
+scanBlock(volatile T* shmem,
+          OP op) {
+  const unsigned char lane = threadIdx.x & (WARP - 1);
+  const I warpid = threadIdx.x >> LG_WARP;
+
+  T res = scanWarp(shmem, op, lane);
+  __syncthreads();
+
+  if (lane == (WARP - 1))
+    shmem[warpid] = res;
+  __syncthreads();
+
+  if (warpid == 0)
+    scanWarp(shmem, op, lane);
+  __syncthreads();
+
+  if (warpid > 0)
+    res = op(shmem[warpid-1], res);
+  __syncthreads();
+
+  shmem[threadIdx.x] = res;
+  __syncthreads();
+}
+
+template<typename I, typename T, typename OP, I ITEMS_PER_THREAD>
+__device__ inline void
+addAuxBlockScan(volatile T* shmem,
+                volatile T* shmem_aux,
+                OP op) {
+  if (threadIdx.x > 0) {
+    const I offset = threadIdx.x * ITEMS_PER_THREAD;
+    const I upper = offset + ITEMS_PER_THREAD;
+    const T val = shmem_aux[threadIdx.x - 1];
+    #pragma unroll
+    for (I lid = offset; lid < upper; lid++) {
+      shmem[lid] = op(val, shmem[lid]);
+    }
+  }
+  __syncthreads();
+}
+
+template<typename I, typename T, typename OP, I ITEMS_PER_THREAD>
+__device__ inline void
+scanBlock(volatile T* block,
+          volatile T* block_aux,
+          OP op) {
+  scanThread<I, T, OP, ITEMS_PER_THREAD>(block, block_aux, op);
+
+  scanBlock<I, T, OP>(block_aux, op);
+
+  addAuxBlockScan<I, T, OP, ITEMS_PER_THREAD>(block, block_aux, op);
+}
+
+__device__ inline unsigned int dynamicIndex(volatile unsigned int* dyn_idx_ptr) {
+  volatile __shared__ unsigned int dyn_idx;
+
+  if (threadIdx.x == 0)
+    dyn_idx = atomicAdd(const_cast<unsigned int*>(dyn_idx_ptr), 1);
+
+  __syncthreads();
+  return dyn_idx;
+}
+
+enum Status: unsigned char {
+  Invalid = 0,
+  Aggregate = 1,
+  Prefix = 2,
+};
+
+template<typename T>
+struct State {
+  T aggregate;
+  T prefix;
+  Status status = Invalid;
+};
+
+__device__ inline Status
+combine(Status a, Status b) {
+  if (b == Aggregate)
+    return a;
+  return b;
+}
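+
+// Illustrative walk-through of the decoupled look-back below, which uses
+// `combine`: every block first publishes (aggregate, Aggregate). A warp of a
+// later block reads the states of up to WARP preceding blocks and warp-scans
+// the (value, status) pairs; `combine` keeps the newer status unless the
+// newer one is Aggregate, so the window collapses to Prefix exactly when a
+// preceding block has already published a Prefix. Once that happens, the
+// block publishes (op(prefix, aggregate), Prefix) for blocks after it.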
+
+template<typename I, typename T, typename OP>
+__device__ inline void
+scanWarp(volatile T* values,
+         volatile Status* statuses,
+         OP op,
+         const unsigned char lane) {
+  unsigned char h;
+  const I tid = threadIdx.x;
+
+  #pragma unroll
+  for (unsigned char d = 0; d < LG_WARP; d++) {
+    if ((h = 1 << d) <= lane) {
+      bool is_not_aggregate = statuses[tid] != Aggregate;
+      values[tid] = is_not_aggregate ? const_cast<T*>(values)[tid] : op(values[tid - h], values[tid]);
+      statuses[tid] = combine(statuses[tid - h], statuses[tid]);
+    }
+  }
+}
+
+template<typename I, typename OP, I ITEMS_PER_THREAD, typename T>
+__device__ inline T
+decoupledLookbackScanNoWrite(volatile State<T>* states,
+                             volatile T* shmem,
+                             OP op,
+                             const T ne,
+                             unsigned int dyn_idx) {
+  volatile __shared__ T values[WARP];
+  volatile __shared__ Status statuses[WARP];
+  volatile __shared__ T shmem_prefix;
+  const unsigned char lane = threadIdx.x & (WARP - 1);
+  const bool is_first = threadIdx.x == 0;
+
+  T aggregate = shmem[ITEMS_PER_THREAD * blockDim.x - 1];
+
+  if (is_first) {
+    states[dyn_idx].aggregate = aggregate;
+  }
+
+  if (dyn_idx == 0 && is_first) {
+    states[dyn_idx].prefix = aggregate;
+  }
+
+  __threadfence();
+  if (dyn_idx == 0 && is_first) {
+    states[dyn_idx].status = Prefix;
+  } else if (is_first) {
+    states[dyn_idx].status = Aggregate;
+  }
+
+  T prefix = ne;
+  if (threadIdx.x < WARP && dyn_idx != 0) {
+    I lookback_idx = threadIdx.x + dyn_idx;
+    I lookback_warp = WARP;
+    Status status = Aggregate;
+    do {
+      if (lookback_warp <= lookback_idx) {
+        I idx = lookback_idx - lookback_warp;
+        status = states[idx].status;
+        statuses[threadIdx.x] = status;
+        values[threadIdx.x] = status == Prefix ? states[idx].prefix : states[idx].aggregate;
+      } else {
+        statuses[threadIdx.x] = Aggregate;
+        values[threadIdx.x] = ne;
+      }
+
+      scanWarp<I, T, OP>(values, statuses, op, lane);
+
+      T result = values[WARP - 1];
+      status = statuses[WARP - 1];
+
+      if (status == Invalid)
+        continue;
+
+      if (is_first) {
+        prefix = op(result, prefix);
+      }
+
+      lookback_warp += WARP;
+    } while (status != Prefix);
+  }
+
+  if (is_first) {
+    shmem_prefix = prefix;
+  }
+
+  __syncthreads();
+
+  if (is_first) {
+    states[dyn_idx].prefix = op(prefix, aggregate);
+    __threadfence();
+    states[dyn_idx].status = Prefix;
+  }
+
+  return shmem_prefix;
+}
+
+// Inclusive block scan followed by a decoupled look-back across blocks; the
+// look-back prefix is composed onto every element so `block` ends up globally
+// scanned. Referenced by the lexer kernel below.
+template<typename I, typename T, typename OP, I ITEMS_PER_THREAD>
+__device__ inline void
+scan(volatile T* block,
+     volatile T* block_aux,
+     volatile State<T>* states,
+     OP op,
+     const T ne,
+     unsigned int dyn_idx) {
+  scanBlock<I, T, OP, ITEMS_PER_THREAD>(block, block_aux, op);
+
+  T prefix = decoupledLookbackScanNoWrite<I, OP, ITEMS_PER_THREAD>(states, block, op, ne, dyn_idx);
+
+  const I offset = threadIdx.x * ITEMS_PER_THREAD;
+  #pragma unroll
+  for (I i = 0; i < ITEMS_PER_THREAD; i++) {
+    block[offset + i] = op(prefix, block[offset + i]);
+  }
+  __syncthreads();
+}
+
+__device__ __host__ __forceinline__ state_t get_index(state_t state) {
+  return (state & ENDO_MASK) >> ENDO_OFFSET;
+}
+
+__device__ __host__ __forceinline__ token_t get_token(state_t state) {
+  return (state & TOKEN_MASK) >> TOKEN_OFFSET;
+}
+
+__device__ bool is_accept(state_t state) {
+  return (state & ACCEPT_MASK) >> ACCEPT_OFFSET;
+}
+
+__device__ __host__ __forceinline__ bool is_produce(state_t state) {
+  return (state & PRODUCE_MASK) >> PRODUCE_OFFSET;
+}
+
+struct LexerCtx {
+  state_t* d_to_state;
+  state_t* d_compose;
+
+  LexerCtx() : d_to_state(NULL), d_compose(NULL) {
+    cudaMalloc(&d_to_state, sizeof(h_to_state));
+    cudaMemcpy(d_to_state, h_to_state, sizeof(h_to_state),
+               cudaMemcpyHostToDevice);
+    cudaMalloc(&d_compose, sizeof(h_compose));
+    cudaMemcpy(d_compose, h_compose, sizeof(h_compose),
+               cudaMemcpyHostToDevice);
+  }
+
+  void Cleanup() {
+    if (d_to_state) cudaFree(d_to_state);
+    if (d_compose) cudaFree(d_compose);
+  }
+
+  __device__ __host__ __forceinline__
+  state_t operator()(const state_t &a, const state_t &b) const {
+    return d_compose[get_index(b) * NUM_STATES + get_index(a)];
+  }
+
+  __device__ __host__ __forceinline__
+  state_t operator()(const volatile state_t &a, const volatile state_t &b) const {
+    return d_compose[get_index(b) * NUM_STATES + get_index(a)];
+  }
+
+  __device__ __host__ __forceinline__
+  state_t to_state(const unsigned char &a) const {
+    // unsigned, so bytes >= 128 do not index with a negative char.
+    return d_to_state[a];
+  }
+};
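+
+// Illustrative only (host-side sketch, not used by the kernels): composing
+// two table entries by hand, mirroring LexerCtx::operator() on the host
+// copies of the tables. For example, both 'a' and 'b' map to entry 147
+// (endomorphism 3), and h_compose[3 * NUM_STATES + 3] = 153, whose accept
+// bit is set and produce bit is clear: the second character extends the
+// current token instead of starting a new one.
+inline state_t composeOnHost(state_t a, state_t b) {
+  return h_compose[get_index(b) * NUM_STATES + get_index(a)];
+}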
+
+template<typename I>
+struct Add {
+  __device__ __forceinline__ I operator()(I a, I b) const {
+    return a + b;
+  }
+};
+
+template<typename I, I BLOCK_SIZE, I ITEMS_PER_THREAD>
+__global__ void
+lexer(LexerCtx ctx,
+      unsigned char* d_in,
+      unsigned int* d_index_out,
+      token_t* d_token_out,
+      volatile State<state_t>* state_states,
+      volatile State<I>* index_states,
+      I size,
+      I num_logical_blocks,
+      volatile unsigned int* dyn_index_ptr,
+      volatile I* new_size,
+      volatile bool* is_valid) {
+  volatile __shared__ state_t states[ITEMS_PER_THREAD * BLOCK_SIZE];
+  volatile __shared__ I indices[ITEMS_PER_THREAD * BLOCK_SIZE];
+  volatile __shared__ I indices_aux[BLOCK_SIZE];
+  __shared__ state_t next_block_first_state;
+  // states_aux aliases indices; the two uses are separated by barriers.
+  volatile state_t* states_aux = (volatile state_t*) indices;
+  const I REG_MEM = 1 + ITEMS_PER_THREAD / sizeof(unsigned long long);
+  unsigned long long copy_reg[REG_MEM];
+  unsigned char *chars_reg = (unsigned char*) copy_reg;
+  unsigned int is_produce_state = 0;
+
+  unsigned int dyn_index = dynamicIndex(dyn_index_ptr);
+  I glb_offs = dyn_index * BLOCK_SIZE * ITEMS_PER_THREAD;
+
+  states_aux[threadIdx.x] = ctx.to_state(threadIdx.x);
+
+  if (threadIdx.x == I()) {
+    next_block_first_state = IDENTITY;
+  }
+
+  __syncthreads();
+
+  #pragma unroll
+  for (I i = 0; i < REG_MEM; i++) {
+    I uint64_lid = i * blockDim.x + threadIdx.x;
+    I lid = sizeof(unsigned long long) * uint64_lid;
+    I gid = glb_offs + lid;
+    if (gid + sizeof(unsigned long long) < size) {
+      copy_reg[i] = *((unsigned long long*) (gid + (unsigned char*) d_in));
+    } else {
+      for (I j = 0; j < sizeof(unsigned long long); j++) {
+        I loc_gid = gid + j;
+        if (loc_gid < size) {
+          chars_reg[sizeof(unsigned long long) * i + j] = d_in[loc_gid];
+        }
+      }
+    }
+  }
+
+  #pragma unroll
+  for (I i = 0; i < REG_MEM; i++) {
+    I lid = i * blockDim.x + threadIdx.x;
+    I _gid = glb_offs + sizeof(unsigned long long) * lid;
+    for (I j = 0; j < sizeof(unsigned long long); j++) {
+      I gid = _gid + j;
+      I lid_off = sizeof(unsigned long long) * lid + j;
+      I reg_off = sizeof(unsigned long long) * i + j;
+      bool is_in_block = lid_off < ITEMS_PER_THREAD * BLOCK_SIZE;
+      if (gid < size && is_in_block) {
+        states[lid_off] = states_aux[chars_reg[reg_off]];
+      } else if (is_in_block) {
+        states[lid_off] = IDENTITY;
+      } else if (lid_off == ITEMS_PER_THREAD * BLOCK_SIZE) {
+        next_block_first_state = states_aux[chars_reg[reg_off]];
+      }
+    }
+  }
+
+  __syncthreads();
+
+  scan<I, state_t, LexerCtx, ITEMS_PER_THREAD>(states, states_aux, state_states, ctx, IDENTITY, dyn_index);
+
+  #pragma unroll
+  for (I i = 0; i < ITEMS_PER_THREAD; i++) {
+    I lid = i * blockDim.x + threadIdx.x;
+    I gid = glb_offs + lid;
+    bool temp = false;
+    if (gid < size) {
+      if (lid == ITEMS_PER_THREAD * BLOCK_SIZE - 1) {
+        temp = gid == size - 1 || is_produce(ctx(states[lid], next_block_first_state));
+      } else {
+        temp = gid == size - 1 || is_produce(states[lid + 1]);
+      }
+    }
+    is_produce_state |= temp << i;
+    indices[lid] = temp;
+  }
+
+  __syncthreads();
+
+  scanBlock<I, I, Add<I>, ITEMS_PER_THREAD>(indices, indices_aux, Add<I>());
+
+  I prefix = decoupledLookbackScanNoWrite<I, Add<I>, ITEMS_PER_THREAD>(index_states, indices, Add<I>(), I(), dyn_index);
+
+  #pragma unroll
+  for (I i = 0; i < ITEMS_PER_THREAD; i++) {
+    I lid = blockDim.x * i + threadIdx.x;
+    I gid = glb_offs + lid;
+    if (gid < size && ((is_produce_state >> i) & 1)) {
+      I offset = Add<I>()(prefix, indices[lid]) - 1;
+      d_index_out[offset] = gid;
+      d_token_out[offset] = get_token(states[lid]);
+    }
+  }
+
+  if (dyn_index == num_logical_blocks - 1 && threadIdx.x == blockDim.x - 1) {
+    *new_size = Add<I>()(prefix, indices[ITEMS_PER_THREAD * BLOCK_SIZE - 1]);
+    *is_valid = is_accept(states[ITEMS_PER_THREAD * BLOCK_SIZE - 1]);
+  }
+}
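+
+// Illustrative only: with the launch configuration used in testLexer below
+// (BLOCK_SIZE = 256, ITEMS_PER_THREAD = 31, I = unsigned int), each thread
+// stages its 31 bytes through 4 64-bit registers (REG_MEM = 1 + 31/8 = 4),
+// the 31 per-thread produce flags fit a 32-bit mask, and shared memory adds
+// up to 7936 * 2 B (states) + 7936 * 4 B (indices) + 256 * 4 B (indices_aux)
+// = 48,640 B, just under the default 48 KiB limit.
+static_assert(1 + 31 / sizeof(unsigned long long) == 4,
+              "REG_MEM at 31 items per thread");
+static_assert(31 * 256 * (sizeof(unsigned short) + sizeof(unsigned int))
+              + 256 * sizeof(unsigned int) <= 49152,
+              "shared memory fits in the default 48 KiB");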
+
+void testLexer(unsigned char* input,
+               size_t input_size,
+               unsigned int* expected_indices,
+               token_t* expected_tokens,
+               size_t expected_size) {
+  using I = unsigned int;
+  const I size = input_size;
+  const I BLOCK_SIZE = 256;
+  const I ITEMS_PER_THREAD = 31;
+  const I NUM_LOGICAL_BLOCKS = (size + BLOCK_SIZE * ITEMS_PER_THREAD - 1) / (BLOCK_SIZE * ITEMS_PER_THREAD);
+  const I IN_ARRAY_BYTES = size * sizeof(unsigned char);
+  const I INDEX_OUT_ARRAY_BYTES = size * sizeof(I);
+  const I TOKEN_OUT_ARRAY_BYTES = size * sizeof(token_t);
+  const I STATE_STATES_BYTES = NUM_LOGICAL_BLOCKS * sizeof(State<state_t>);
+  const I INDEX_STATES_BYTES = NUM_LOGICAL_BLOCKS * sizeof(State<I>);
+  const I WARMUP_RUNS = 500;
+  const I RUNS = 50;
+
+  std::vector<token_t> h_token_out(size, 0);
+  std::vector<I> h_index_out(size, 0);
+
+  unsigned int* d_dyn_index_ptr;
+  I* d_new_size;
+  bool* d_is_valid;
+  unsigned char *d_in;
+  I *d_index_out;
+  token_t *d_token_out;
+  State<I>* d_index_states;
+  State<state_t>* d_state_states;
+  gpuAssert(cudaMalloc((void**)&d_dyn_index_ptr, sizeof(unsigned int)));
+  gpuAssert(cudaMalloc((void**)&d_new_size, sizeof(I)));
+  gpuAssert(cudaMalloc((void**)&d_is_valid, sizeof(bool)));
+  cudaMemset(d_dyn_index_ptr, 0, sizeof(unsigned int));
+  cudaMemset(d_is_valid, false, sizeof(bool));
+  gpuAssert(cudaMalloc((void**)&d_index_states, INDEX_STATES_BYTES));
+  gpuAssert(cudaMalloc((void**)&d_state_states, STATE_STATES_BYTES));
+  I padding = IN_ARRAY_BYTES; // sizeof(unsigned long long) - (IN_ARRAY_BYTES % sizeof(unsigned long long));
+  gpuAssert(cudaMalloc((void**)&d_in, IN_ARRAY_BYTES + padding));
+  gpuAssert(cudaMalloc((void**)&d_index_out, INDEX_OUT_ARRAY_BYTES));
+  gpuAssert(cudaMalloc((void**)&d_token_out, TOKEN_OUT_ARRAY_BYTES));
+  gpuAssert(cudaMemcpy(d_in, input, IN_ARRAY_BYTES, cudaMemcpyHostToDevice));
+
+  LexerCtx ctx = LexerCtx();
+
+  for (I i = 0; i < WARMUP_RUNS; ++i) {
+    lexer<I, BLOCK_SIZE, ITEMS_PER_THREAD><<<NUM_LOGICAL_BLOCKS, BLOCK_SIZE>>>(
+      ctx,
+      d_in,
+      d_index_out,
+      d_token_out,
+      d_state_states,
+      d_index_states,
+      size,
+      NUM_LOGICAL_BLOCKS,
+      d_dyn_index_ptr,
+      d_new_size,
+      d_is_valid
+    );
+    cudaDeviceSynchronize();
+    cudaMemset(d_dyn_index_ptr, 0, sizeof(unsigned int));
+    gpuAssert(cudaPeekAtLastError());
+  }
+
+  timeval * temp = (timeval *) malloc(sizeof(timeval) * RUNS);
+  timeval prev;
+  timeval curr;
+  timeval t_diff;
+
+  for (I i = 0; i < RUNS; ++i) {
+    gettimeofday(&prev, NULL);
+    lexer<I, BLOCK_SIZE, ITEMS_PER_THREAD><<<NUM_LOGICAL_BLOCKS, BLOCK_SIZE>>>(
+      ctx,
+      d_in,
+      d_index_out,
+      d_token_out,
+      d_state_states,
+      d_index_states,
+      size,
+      NUM_LOGICAL_BLOCKS,
+      d_dyn_index_ptr,
+      d_new_size,
+      d_is_valid
+    );
+    cudaDeviceSynchronize();
+    gettimeofday(&curr, NULL);
+    timeval_subtract(&t_diff, &curr, &prev);
+    temp[i] = t_diff;
+    cudaMemset(d_dyn_index_ptr, 0, sizeof(unsigned int));
+    gpuAssert(cudaPeekAtLastError());
+  }
+
+  I temp_size = 0;
+  gpuAssert(cudaMemcpy(&temp_size, d_new_size, sizeof(I), cudaMemcpyDeviceToHost));
+  const I OUT_WRITE = temp_size * (sizeof(I) + sizeof(token_t));
+  const I IN_READ = IN_ARRAY_BYTES;
+  const I IN_STATE_MAP = sizeof(state_t) * size;
+  const I SCAN_READ = sizeof(state_t) * (size + size / 2); // Lower bound; the scan reads more than this in practice.
+
+  lexer<I, BLOCK_SIZE, ITEMS_PER_THREAD><<<NUM_LOGICAL_BLOCKS, BLOCK_SIZE>>>(
+    ctx,
+    d_in,
+    d_index_out,
+    d_token_out,
+    d_state_states,
+    d_index_states,
+    size,
+    NUM_LOGICAL_BLOCKS,
+    d_dyn_index_ptr,
+    d_new_size,
+    d_is_valid
+  );
+  cudaDeviceSynchronize();
+  gpuAssert(cudaPeekAtLastError());
+  bool is_valid = false;
+  gpuAssert(cudaMemcpy(h_index_out.data(), d_index_out, INDEX_OUT_ARRAY_BYTES, cudaMemcpyDeviceToHost));
+  gpuAssert(cudaMemcpy(h_token_out.data(), d_token_out, TOKEN_OUT_ARRAY_BYTES, cudaMemcpyDeviceToHost));
+  gpuAssert(cudaMemcpy(&temp_size, d_new_size, sizeof(I), cudaMemcpyDeviceToHost));
+  gpuAssert(cudaMemcpy(&is_valid, d_is_valid, sizeof(bool), cudaMemcpyDeviceToHost));
+
+  bool test_passes = is_valid;
+
+  if (!test_passes) {
+    std::cout << "Lexer Test Failed: The input given to the lexer does not result in an accepting state." << std::endl;
+  }
+
+  test_passes = test_passes && temp_size == expected_size;
+  if (temp_size != expected_size) {
+    std::cout << "Lexer Test Failed: Expected size=" << expected_size << " but got size=" << temp_size << std::endl;
+  } else {
+    for (I i = 0; i < expected_size; ++i) {
+      test_passes &= h_index_out[i] == expected_indices[i];
+      test_passes &= h_token_out[i] == expected_tokens[i];
+
+      if (!test_passes) {
+        std::cout << "Lexer Test Failed: Due to elements mismatch at index=" << i << std::endl;
+        break;
+      }
+    }
+  }
+
+  if (test_passes) {
+    compute_descriptors(temp, RUNS, IN_READ + IN_STATE_MAP + SCAN_READ + OUT_WRITE);
+  }
+
+  free(temp);
+  gpuAssert(cudaFree(d_in));
+  gpuAssert(cudaFree(d_token_out));
+  gpuAssert(cudaFree(d_index_out));
+  gpuAssert(cudaFree(d_index_states));
+  gpuAssert(cudaFree(d_state_states));
+  gpuAssert(cudaFree(d_dyn_index_ptr));
+  gpuAssert(cudaFree(d_new_size));
+  gpuAssert(cudaFree(d_is_valid));
+
+  ctx.Cleanup();
+}
+
diff --git a/src/Alpacc/Generator/Cuda/Cudafy.hs b/src/Alpacc/Generator/Cuda/Cudafy.hs
new file mode 100644
index 0000000..3437cdb
--- /dev/null
+++ b/src/Alpacc/Generator/Cuda/Cudafy.hs
@@ -0,0 +1,57 @@
+module Alpacc.Generator.Cuda.Cudafy
+  ( Cudafy (..)
+  , RawString (..)
+  )
+where
+
+import Data.Array as Array hiding (Array)
+import Data.List qualified as List
+import Data.Array.Unboxed (UArray)
+import Data.Array.IArray as IArray
+import Alpacc.Types
+import Numeric.Natural
+
+newtype RawString = RawString String deriving (Show, Eq, Ord, Read)
+
+class Cudafy a where
+  cudafy :: a -> String
+
+instance Cudafy UInt where
+  cudafy U8 = "unsigned char"
+  cudafy U16 = "unsigned short"
+  cudafy U32 = "unsigned int"
+  cudafy U64 = "unsigned long long"
+
+instance Cudafy IInt where
+  cudafy I8 = "char"
+  cudafy I16 = "short"
+  cudafy I32 = "int"
+  cudafy I64 = "long long"
+
+instance Cudafy RawString where
+  cudafy (RawString s) = s
+
+instance Cudafy String where
+  cudafy = show
+
+instance Cudafy Int where
+  cudafy = show
+
+instance Cudafy Bool where
+  cudafy True = "true"
+  cudafy False = "false"
+
+instance Cudafy Natural where
+  cudafy = show
+
+instance Cudafy Integer where
+  cudafy = show
+
+-- CUDA initialisers use braces, unlike Futhark's [...] array syntax.
+instance (Cudafy a) => Cudafy [a] where
+  cudafy = ("{"<>) . (<>"}") . List.intercalate ", " . fmap cudafy
+
+instance (Cudafy a) => Cudafy (Array i a) where
+  cudafy = cudafy . Array.elems
+
+instance (Cudafy a, IArray UArray a, Ix i) => Cudafy (UArray i a) where
+  cudafy = cudafy . IArray.elems
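+
+-- Illustrative examples of the rendering (assumed behaviour, not
+-- exercised by tests):
+--   cudafy U16                   == "unsigned short"
+--   cudafy ([74, 75] :: [Int])   == "{74, 75}"
+--   cudafy (RawString "state_t") == "state_t"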
diff --git a/src/Alpacc/Generator/Futhark/FutPrinter.hs b/src/Alpacc/Generator/Futhark/FutPrinter.hs
deleted file mode 100644
index 72082e7..0000000
--- a/src/Alpacc/Generator/Futhark/FutPrinter.hs
+++ /dev/null
@@ -1,77 +0,0 @@
-module Alpacc.Generator.Futhark.FutPrinter
-  ( FutPrinter (..)
-  , NTuple (..)
-  , RawString (..)
-  )
-where
-
-import Data.Foldable
-import Data.Array as Array hiding (Array)
-import Data.List qualified as List
-import Data.Array.Unboxed (UArray)
-import Data.Array.IArray as IArray
-import Data.String.Interpolate ( i )
-import Alpacc.Types
-import Numeric.Natural
-
-newtype NTuple a = NTuple [a] deriving (Show, Eq, Ord, Read, Foldable)
-
-newtype RawString = RawString String deriving (Show, Eq, Ord, Read)
-
-class FutPrinter a where
-  futPrint :: a -> String
-
-instance FutPrinter UInt where
-  futPrint U8 = "u8"
-  futPrint U16 = "u16"
-  futPrint U32 = "u32"
-  futPrint U64 = "u64"
-
-instance FutPrinter IInt where
-  futPrint I8 = "i8"
-  futPrint I16 = "i16"
-  futPrint I32 = "i32"
-  futPrint I64 = "i64"
-
-instance FutPrinter RawString where
-  futPrint (RawString s) = s
-
-instance FutPrinter String where
-  futPrint = show
-
-instance FutPrinter Int where
-  futPrint = show
-
-instance FutPrinter Bool where
-  futPrint True = "true"
-  futPrint False = "false"
-
-instance FutPrinter Natural where
-  futPrint = show
-
-instance FutPrinter Integer where
-  futPrint = show
-
-instance (FutPrinter a, FutPrinter b) => FutPrinter (a, b) where
-  futPrint (a, b) = [i|(#{futPrint a}, #{futPrint b})|]
-
-instance (FutPrinter a) => FutPrinter [a] where
-  futPrint = ("["<>) . (<>"]") . List.intercalate ", " . fmap futPrint
-
-instance (FutPrinter a) => FutPrinter (NTuple a) where
-  futPrint =
-    ("("<>)
-      . (<>")")
-      . List.intercalate ", "
-      . fmap futPrint
-      . toList
-
-instance (FutPrinter a) => FutPrinter (Array i a) where
-  futPrint = futPrint . Array.elems
-
-instance (FutPrinter a, IArray UArray a, Ix i) => FutPrinter (UArray i a) where
-  futPrint = futPrint . IArray.elems
-
-instance (FutPrinter a) => FutPrinter (Maybe a) where
-  futPrint (Just a) = "#some " <> futPrint a
-  futPrint Nothing = "#none"
diff --git a/src/Alpacc/Generator/Futhark/Futharkify.hs b/src/Alpacc/Generator/Futhark/Futharkify.hs
new file mode 100644
index 0000000..ec4ba75
--- /dev/null
+++ b/src/Alpacc/Generator/Futhark/Futharkify.hs
@@ -0,0 +1,77 @@
+module Alpacc.Generator.Futhark.Futharkify
+  ( Futharkify (..)
+  , NTuple (..)
+  , RawString (..)
+  )
+where
+
+import Data.Foldable
+import Data.Array as Array hiding (Array)
+import Data.List qualified as List
+import Data.Array.Unboxed (UArray)
+import Data.Array.IArray as IArray
+import Data.String.Interpolate ( i )
+import Alpacc.Types
+import Numeric.Natural
+
+newtype NTuple a = NTuple [a] deriving (Show, Eq, Ord, Read, Foldable)
+
+newtype RawString = RawString String deriving (Show, Eq, Ord, Read)
+
+class Futharkify a where
+  futharkify :: a -> String
+
+instance Futharkify UInt where
+  futharkify U8 = "u8"
+  futharkify U16 = "u16"
+  futharkify U32 = "u32"
+  futharkify U64 = "u64"
+
+instance Futharkify IInt where
+  futharkify I8 = "i8"
+  futharkify I16 = "i16"
+  futharkify I32 = "i32"
+  futharkify I64 = "i64"
+
+instance Futharkify RawString where
+  futharkify (RawString s) = s
+
+instance Futharkify String where
+  futharkify = show
+
+instance Futharkify Int where
+  futharkify = show
+
+instance Futharkify Bool where
+  futharkify True = "true"
+  futharkify False = "false"
+
+instance Futharkify Natural where
+  futharkify = show
+
+instance Futharkify Integer where
+  futharkify = show
+
+instance (Futharkify a, Futharkify b) => Futharkify (a, b) where
+  futharkify (a, b) = [i|(#{futharkify a}, #{futharkify b})|]
+
+instance (Futharkify a) => Futharkify [a] where
+  futharkify = ("["<>) . (<>"]") . List.intercalate ", " . fmap futharkify
+
+instance (Futharkify a) => Futharkify (NTuple a) where
+  futharkify =
+    ("("<>)
+      . (<>")")
+      . List.intercalate ", "
+      . fmap futharkify
+      . toList
+
+instance (Futharkify a) => Futharkify (Array i a) where
+  futharkify = futharkify . Array.elems
+
+instance (Futharkify a, IArray UArray a, Ix i) => Futharkify (UArray i a) where
+  futharkify = futharkify . IArray.elems
+
+instance (Futharkify a) => Futharkify (Maybe a) where
+  futharkify (Just a) = "#some " <> futharkify a
+  futharkify Nothing = "#none"
diff --git a/src/Alpacc/Generator/Futhark/Generator.hs b/src/Alpacc/Generator/Futhark/Generator.hs
index 7dea8a9..ff71214 100644
--- a/src/Alpacc/Generator/Futhark/Generator.hs
+++ b/src/Alpacc/Generator/Futhark/Generator.hs
@@ -11,7 +11,7 @@ import Data.Map qualified as Map
 import Data.Map ( Map )
 import Alpacc.Types
 import Data.Either.Extra
-import Alpacc.Generator.Futhark.FutPrinter
+import Alpacc.Generator.Futhark.Futharkify
 
 parentVectorTest :: String
 parentVectorTest =
@@ -47,7 +47,7 @@ lexerFunction t =
   [i|
 entry lex s =
   match lexer.lex_chunked 16777216 s
   case #some r ->
-    map (\\(a, (b, c)) -> [i32.#{futPrint t} a, b, c]) r
+    map (\\(a, (b, c)) -> [i32.#{futharkify t} a, b, c]) r
   case #none -> []
 |]
diff --git a/src/Alpacc/Generator/Futhark/Lexer.hs b/src/Alpacc/Generator/Futhark/Lexer.hs
index 1112cd4..31b7ca3 100644
--- a/src/Alpacc/Generator/Futhark/Lexer.hs
+++ b/src/Alpacc/Generator/Futhark/Lexer.hs
@@ -14,7 +14,7 @@ import Alpacc.Lexer.ParallelLexing
 import Data.List qualified as List
 import Data.Either.Extra
 import Alpacc.Types
-import Alpacc.Generator.Futhark.FutPrinter
+import Alpacc.Generator.Futhark.Futharkify
 
 futharkLexer :: String
 futharkLexer = $(embedStringFile "futhark/lexer.fut")
@@ -49,8 +49,8 @@ compositionsArray int parallel_lexer =
 #{ps} :> [endomorphism_size * endomorphism_size]endomorphism
 |]
   where
-    ps = futPrint $ p <$> listCompositions parallel_lexer
-    p = RawString . (<> futPrint int) . futPrint
+    ps = futharkify $ p <$> listCompositions parallel_lexer
+    p = RawString . (<> futharkify int) . futharkify
 
 ignoreFunction :: Map T Int -> String
 ignoreFunction terminal_index_map =
@@ -83,8 +83,8 @@ generateLexer lexer terminal_index_map terminal_type = do
     futharkLexer
       <> [i|
 module lexer = mk_lexer {
-  module terminal_module = #{futPrint terminal_type}
-  module endomorphism_module = #{futPrint endomorphism_type}
+  module terminal_module = #{futharkify terminal_type}
+  module endomorphism_module = #{futharkify endomorphism_type}
 
   type endomorphism = endomorphism_module.t
   type terminal = terminal_module.t
@@ -103,7 +103,7 @@ module lexer = mk_lexer {
   #{defEndomorphismSize parallel_lexer}
 
   def accept_array: [endomorphism_size]bool =
-    sized endomorphism_size #{futPrint accept_array}
+    sized endomorphism_size #{futharkify accept_array}
 
   #{transitions_to_endo}
 
diff --git a/src/Alpacc/Generator/Futhark/Parser.hs b/src/Alpacc/Generator/Futhark/Parser.hs
index a87b405..8d63b8e 100644
--- a/src/Alpacc/Generator/Futhark/Parser.hs
+++ b/src/Alpacc/Generator/Futhark/Parser.hs
@@ -17,7 +17,7 @@ import Data.Map (Map)
 import Data.Map qualified as Map
 import Data.String.Interpolate (i)
 import Data.Tuple.Extra
-import Alpacc.Generator.Futhark.FutPrinter
+import Alpacc.Generator.Futhark.Futharkify
 import Alpacc.Generator.Futhark.Util
 import Data.Composition
 import Alpacc.HashTable
@@ -174,7 +174,7 @@ maxAoPi table = (max_alpha_omega, max_pi)
     max_pi = maximum $ length . snd <$> values
 
 createNe :: Int -> Int -> String
-createNe max_alpha_omega max_pi = futPrint ne
+createNe max_alpha_omega max_pi = futharkify ne
   where
     stacks = replicate max_alpha_omega (RawString "epsilon")
     rules = replicate max_pi (RawString "empty_production")
@@ -184,7 +184,7 @@
 -- n - 1 in the Futhark language.
 toTupleIndexArray :: (Show a, Num a, Enum a) => String -> a -> String
 toTupleIndexArray name n =
-  futPrint $ NTuple $ map (indexArray name) [0 .. n - 1]
+  futharkify $ NTuple $ map (indexArray name) [0 .. n - 1]
 
 createHashFunction :: Int -> Int -> String
 createHashFunction q k =
@@ -194,8 +194,8 @@ def hash_no_mod #{a_arg} #{b_arg} =
 |]
   where
     qk = q + k
-    a_arg = futPrint $ NTuple $ map RawString as
-    b_arg = futPrint $ NTuple $ map RawString bs
+    a_arg = futharkify $ NTuple $ map RawString as
+    b_arg = futharkify $ NTuple $ map RawString bs
     as = ["a" ++ show j | j <- [0..(qk - 1)]]
     bs = ["b" ++ show j | j <- [0..(qk - 1)]]
     concatWith a b c = b ++ a ++ c
@@ -240,25 +240,25 @@ generateParser q k grammar symbol_index_map = do
       padAndStringifyTable empty_terminal q k max_ao max_pi symbol_index_map table
   hash_table <- hashTable terminal_type 13 $ Map.mapKeys (fmap fromIntegral) integer_table
   let ne = createNe max_ao max_pi
-  let offsets_array_str = futPrint $ offsetArray hash_table
+  let offsets_array_str = futharkify $ offsetArray hash_table
   let hash_table_mem_size = ABase.numElements $ elementArray hash_table
   let hash_table_str =
-        futPrint
+        futharkify
           $ fmap (first NTuple)
           <$> elementArray hash_table
   let consts_array = constsArray hash_table
-  let consts_array_str = futPrint $ fmap (fmap NTuple) consts_array
-  let size_array = futPrint $ sizeArray hash_table
-  let consts = futPrint $ NTuple $ initHashConsts hash_table
+  let consts_array_str = futharkify $ fmap (fmap NTuple) consts_array
+  let size_array = futharkify $ sizeArray hash_table
+  let consts = futharkify $ NTuple $ initHashConsts hash_table
   let hash_table_size = ABase.numElements consts_array
   return
     . (,terminal_type)
     $ futharkParser
       <> [i|
module parser = mk_parser {

-module terminal_module = #{futPrint terminal_type}
-module production_module = #{futPrint production_type}
-module bracket_module = #{futPrint bracket_type}
+module terminal_module = #{futharkify terminal_type}
+module production_module = #{futharkify production_type}
+module bracket_module = #{futharkify bracket_type}

#{declarations}
@@ -315,6 +315,6 @@ def ne: ([max_ao]bracket, [max_pi]production) =
     augmented_grammar = augmentGrammar grammar
     terminals' = terminals augmented_grammar
     look_type =
-      futPrint
+      futharkify
        $ NTuple
        $ replicate (q + k) (RawString "terminal")