diff --git a/sources/algo/ethash/CMakeLists.txt b/sources/algo/ethash/CMakeLists.txt
index 8e28f54..39c9f34 100644
--- a/sources/algo/ethash/CMakeLists.txt
+++ b/sources/algo/ethash/CMakeLists.txt
@@ -15,6 +15,14 @@ if (BUILD_EXE_UNIT_TEST)
     )
 endif()
 
+if (BUILD_EXE_MINER)
+    target_sources(${BENCH_EXE} PUBLIC
+        ${HEADERS}
+        ${SOURCES}
+    )
+endif()
+
+
 add_subdirectory(opencl)
 add_subdirectory(tests)
 
diff --git a/sources/algo/ethash/ethash.cpp b/sources/algo/ethash/ethash.cpp
index 4338337..fa7b5df 100644
--- a/sources/algo/ethash/ethash.cpp
+++ b/sources/algo/ethash/ethash.cpp
@@ -71,8 +71,7 @@ void algo::ethash::initializeDagContext(
     uint64_t const dagCountItemsGrowth,
     uint64_t const dagCountItemsInit,
     uint32_t const lightCacheCountItemsGrowth,
-    uint32_t const lightCacheCountItemsInit,
-    bool const buildOnCPU)
+    uint32_t const lightCacheCountItemsInit)
 {
     ////////////////////////////////////////////////////////////////////////////
     context.epoch = castU32(currentEpoch);
@@ -115,12 +114,23 @@ void algo::ethash::initializeDagContext(
         seed = algo::keccak(seed);
     }
     algo::copyHash(context.originalSeedCache, seed);
+}
+
+
+void algo::ethash::buildLightCache(
+    algo::DagContext& context,
+    bool const buildOnCPU)
+{
+    ////////////////////////////////////////////////////////////////////////////
+    algo::hash512 item
+    {
+        algo::keccak<algo::hash512, algo::hash256>(context.originalSeedCache)
+    };
 
     ////////////////////////////////////////////////////////////////////////////
     if (false == buildOnCPU)
     {
-        algo::hash512 const hashedSeed{ algo::keccak<algo::hash512, algo::hash256>(seed) };
-        algo::copyHash(context.hashedSeedCache, hashedSeed);
+        algo::copyHash(context.hashedSeedCache, item);
         return;
     }
 
@@ -133,27 +143,12 @@ void algo::ethash::initializeDagContext(
         logErr() << "Cannot alloc context data";
         return;
     }
-
-    ////////////////////////////////////////////////////////////////////////////
     context.lightCache.hash = castPtrHash512(context.data + algo::LEN_HASH_512);
+    context.lightCache.hash[0] = item;
 
     ////////////////////////////////////////////////////////////////////////////
-    if (true == buildOnCPU)
-    {
-        logInfo() << "Building light cache on CPU";
-        common::ChronoGuard chrono{ "Built light cache", common::CHRONO_UNIT::MS };
-        buildLightCache(context, seed);
-    }
-}
-
-
-void algo::ethash::buildLightCache(
-    algo::DagContext& context,
-    algo::hash256 const& seed)
-{
-    ////////////////////////////////////////////////////////////////////////////
-    algo::hash512 item{ algo::keccak<algo::hash512, algo::hash256>(seed) };
-    context.lightCache.hash[0] = item;
+    logInfo() << "Building light cache on CPU";
+    common::ChronoGuard chrono{ "Built light cache", common::CHRONO_UNIT::MS };
 
     ////////////////////////////////////////////////////////////////////////////
     for (uint64_t i{ 1ull }; i < context.lightCache.numberItem; ++i)
diff --git a/sources/algo/ethash/ethash.hpp b/sources/algo/ethash/ethash.hpp
index a04a98f..4156814 100644
--- a/sources/algo/ethash/ethash.hpp
+++ b/sources/algo/ethash/ethash.hpp
@@ -40,11 +40,10 @@ namespace algo
                                   uint64_t const dagCountItemsGrowth,
                                   uint64_t const dagCountItemsInit,
                                   uint32_t const lightCacheCountItemsGrowth,
-                                  uint32_t const lightCacheCountItemsInit,
-                                  bool const buildOnCPU);
+                                  uint32_t const lightCacheCountItemsInit);
         void freeDagContext(algo::DagContext& context);
         void buildLightCache(algo::DagContext& context,
-                             algo::hash256 const& seed);
+                             bool const buildOnCPU);
 #endif
     }
 }
diff --git a/sources/algo/ethash/tests/ethash.cpp b/sources/algo/ethash/tests/ethash.cpp
index 7d9a873..2a32b79 100644
--- a/sources/algo/ethash/tests/ethash.cpp
+++ b/sources/algo/ethash/tests/ethash.cpp
@@ -63,9 +63,9 @@ TEST_F(EthashTest, lightCacheBuild)
         dagCountItemsGrowth,
         dagCountItemsInit,
         lightCacheCountItemsGrowth,
-        lightCacheCountItemsInit,
-        true
+        lightCacheCountItemsInit
     );
+    algo::ethash::buildLightCache(context, true);
 
     // light cache size
     ASSERT_EQ(1411061ull,    context.lightCache.numberItem);
diff --git a/sources/algo/progpow/opencl/progpow.cl b/sources/algo/progpow/opencl/progpow.cl
index 2bdd579..8d1dc0a 100644
--- a/sources/algo/progpow/opencl/progpow.cl
+++ b/sources/algo/progpow/opencl/progpow.cl
@@ -122,60 +122,69 @@ void progpow_search(
     uint const thread_id = get_global_id(0) + (get_global_id(1) * GROUP_SIZE);
     uint const lane_id = thread_id % LANES;
     uint const worker_group = get_global_id(0) / LANES;
-    ulong const nonce = start_nonce + thread_id;
+    ulong nonce = start_nonce + thread_id;
     uint const index_share_seed = get_global_id(0) / BATCH_GROUP_LANE;
 
     ////////////////////////////////////////////////////////////////////////
     initialize_header(dag, header_dag, (thread_id % GROUP_SIZE));
     ulong const seed = initialize_seed(header, state_mix, nonce);
 
+#if defined(INTERNAL_LOOP)
     __attribute__((opencl_unroll_hint(1)))
-    for (uint l_id = 0u; l_id < LANES; ++l_id)
+    for (uint i = 0; i < INTERNAL_LOOP; ++i)
     {
-        ////////////////////////////////////////////////////////////////////////
-        if (l_id == lane_id)
+        nonce += (i * TOTAL_THREADS);
+#endif
+        __attribute__((opencl_unroll_hint(1)))
+        for (uint l_id = 0u; l_id < LANES; ++l_id)
         {
-            share_msb_lsb[index_share_seed] = seed;
+            ////////////////////////////////////////////////////////////////////////
+            if (l_id == lane_id)
+            {
+                share_msb_lsb[index_share_seed] = seed;
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            ////////////////////////////////////////////////////////////////////////
+            ulong const seedShare = share_msb_lsb[index_share_seed];
+
+            ////////////////////////////////////////////////////////////////////////
+            fill_hash(hash, lane_id, seedShare);
+            loop_math(dag, share_hash0, header_dag, hash, lane_id, worker_group);
+            reduce_hash(
+                share_fnv1a,
+                hash,
+                digest,
+                worker_group,
+                l_id == lane_id);
         }
-        barrier(CLK_LOCAL_MEM_FENCE);
 
         ////////////////////////////////////////////////////////////////////////
-        ulong const seedShare = share_msb_lsb[index_share_seed];
-
-        ////////////////////////////////////////////////////////////////////////
-        fill_hash(hash, lane_id, seedShare);
-        loop_math(dag, share_hash0, header_dag, hash, lane_id, worker_group);
-        reduce_hash(
-            share_fnv1a,
-            hash,
-            digest,
-            worker_group,
-            l_id == lane_id);
-    }
+    #if defined(__KERNEL_PROGPOW)
+        ulong const bytes_result = is_valid(header, digest, seed);
+    #else
+        ulong const bytes_result = is_valid(state_mix, digest);
+    #endif
 
-    ////////////////////////////////////////////////////////////////////////
-#if defined(__KERNEL_PROGPOW)
-    ulong const bytes_result = is_valid(header, digest, seed);
-#else
-    ulong const bytes_result = is_valid(state_mix, digest);
-#endif
-
-    if (bytes_result <= boundary)
-    {
-        uint const index = atomic_inc(&result->count);
-        if (index < MAX_RESULT)
+        if (bytes_result <= boundary)
         {
-            result->found = true;
-            result->nonces[index] = nonce;
-
-            result->hash[index][0] = digest[0];
-            result->hash[index][1] = digest[1];
-            result->hash[index][2] = digest[2];
-            result->hash[index][3] = digest[3];
-            result->hash[index][4] = digest[4];
-            result->hash[index][5] = digest[5];
-            result->hash[index][6] = digest[6];
-            result->hash[index][7] = digest[7];
+            uint const index = atomic_inc(&result->count);
+            if (index < MAX_RESULT)
+            {
+                result->found = true;
+                result->nonces[index] = nonce;
+
+                result->hash[index][0] = digest[0];
+                result->hash[index][1] = digest[1];
+                result->hash[index][2] = digest[2];
+                result->hash[index][3] = digest[3];
+                result->hash[index][4] = digest[4];
+                result->hash[index][5] = digest[5];
+                result->hash[index][6] = digest[6];
+                result->hash[index][7] = digest[7];
+            }
         }
+#if defined(INTERNAL_LOOP)
     }
+#endif
 }
diff --git a/sources/benchmark/amd/kawpow.cpp b/sources/benchmark/amd/kawpow.cpp
index 2b37a7a..5ac9e76 100644
--- a/sources/benchmark/amd/kawpow.cpp
+++ b/sources/benchmark/amd/kawpow.cpp
@@ -2,6 +2,8 @@
 
 #include <CL/opencl.hpp>
 
+#include <algo/dag_context.hpp>
+#include <algo/ethash/ethash.hpp>
 #include <algo/hash.hpp>
 #include <algo/hash_utils.hpp>
 #include <algo/progpow/progpow.hpp>
@@ -19,14 +21,18 @@ bool benchmark::BenchmarkWorkflow::runAmdKawpow()
 
     ////////////////////////////////////////////////////////////////////////////
     bool dagInitialized{ false };
-    uint64_t const dagItems{ 16777213ull };
-    uint64_t const dagItemsKawpow{ dagItems / 2ull };
-    auto const header
+    algo::hash256 const headerHash
     {
         algo::toHash256("71c967486cb3b70d5dfcb2ebd8eeef138453637cacbf3ccb580a41a7e96986bb")
     };
+    algo::hash256 const seedHash
+    {
+        algo::toHash256("7c4fb8a5d141973b69b521ce76b0dc50f0d2834d817c7f8310a6ab5becc6bb0c")
+    };
+    int32_t const epoch{ algo::ethash::findEpoch(seedHash, algo::ethash::EPOCH_LENGTH) };
 
     ////////////////////////////////////////////////////////////////////////////
+    common::opencl::Buffer<algo::hash512> lightCache { CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY };
     common::opencl::Buffer<algo::hash1024> dagCache{ CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS };
     common::opencl::BufferMapped<uint32_t> headerCache
     {
@@ -36,14 +42,29 @@ bool benchmark::BenchmarkWorkflow::runAmdKawpow()
     common::opencl::BufferMapped<t_result> resultCache{ CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR };
 
     ////////////////////////////////////////////////////////////////////////////
-    dagCache.setSize(dagItems * algo::LEN_HASH_1024);
+    algo::DagContext dagContext{};
+    algo::ethash::initializeDagContext
+    (
+        dagContext,
+        epoch,
+        algo::ethash::MAX_EPOCH_NUMBER,
+        algo::ethash::DAG_COUNT_ITEMS_GROWTH,
+        algo::ethash::DAG_COUNT_ITEMS_INIT,
+        algo::ethash::LIGHT_CACHE_COUNT_ITEMS_GROWTH,
+        algo::ethash::LIGHT_CACHE_COUNT_ITEMS_INIT
+    );
+
+    ////////////////////////////////////////////////////////////////////////////
+    dagCache.setSize(dagContext.dagCache.size);
+
+    ////////////////////////////////////////////////////////////////////////////
     dagCache.alloc(propertiesAmd.clContext);
     headerCache.alloc(&propertiesAmd.clQueue, propertiesAmd.clContext);
     resultCache.alloc(&propertiesAmd.clQueue, propertiesAmd.clContext);
 
     ////////////////////////////////////////////////////////////////////////////
     if (false == headerCache.setBufferDevice(&propertiesAmd.clQueue,
-                                             (uint32_t*)header.word32))
+                                             (uint32_t*)headerHash.word32))
     {
         logErr() << "Fail to copy header in cache";
     }
@@ -66,7 +87,7 @@ bool benchmark::BenchmarkWorkflow::runAmdKawpow()
             ///////////////////////////////////////////////////////////////////
             auto& clKernel{ generator.clKernel };
             OPENCL_ER(clKernel.setArg(0u, *dagCache.getBuffer()));
-            OPENCL_ER(clKernel.setArg(1u, castU32(dagItems)));
+            OPENCL_ER(clKernel.setArg(1u, castU32(dagContext.dagCache.numberItem)));
 
             ///////////////////////////////////////////////////////////////////
             OPENCL_ER(
@@ -87,10 +108,10 @@ bool benchmark::BenchmarkWorkflow::runAmdKawpow()
 
     ///////////////////////////////////////////////////////////////////////////
     auto benchKawpow = [&](std::string const& kernelName,
+                           uint32_t const loop,
                            uint32_t const groupSize,
                            uint32_t const workerGroupCount,
-                           uint32_t const workItemCollaborate,
-                           uint32_t const loop) -> bool
+                           uint32_t const workItemCollaborate) -> bool
     {
         ///////////////////////////////////////////////////////////////////////
         common::KernelGeneratorOpenCL generator{};
@@ -113,7 +134,7 @@ bool benchmark::BenchmarkWorkflow::runAmdKawpow()
         generator.addDefine("MODULE_CACHE", algo::progpow::MODULE_CACHE);
         generator.addDefine("HEADER_ITEM_BY_THREAD", algo::progpow::MODULE_CACHE / groupSize);
         generator.addDefine("COUNT_DAG", algo::progpow::COUNT_DAG);
-        generator.addDefine("DAG_SIZE", castU32(dagItemsKawpow));
+        generator.addDefine("DAG_SIZE", castU32(dagContext.dagCache.numberItem / 2ull));
         generator.addDefine("BATCH_GROUP_LANE", batchGroupLane);
         generator.addDefine("SHARE_SEED_SIZE", batchGroupLane);
         generator.addDefine("SHARE_HASH0_SIZE", batchGroupLane);
@@ -145,10 +166,10 @@ bool benchmark::BenchmarkWorkflow::runAmdKawpow()
 
         ///////////////////////////////////////////////////////////////////////
         auto& clKernel{ generator.clKernel };
-        OPENCL_ER(clKernel.setArg(0u, 0ull));
-        OPENCL_ER(clKernel.setArg(1u, *headerCache.getBuffer()));
-        OPENCL_ER(clKernel.setArg(2u, *dagCache.getBuffer()));
-        OPENCL_ER(clKernel.setArg(3u, *resultCache.getBuffer()));
+        OPENCL_ER(clKernel.setArg(0u, *dagCache.getBuffer()));
+        OPENCL_ER(clKernel.setArg(1u, *resultCache.getBuffer()));
+        OPENCL_ER(clKernel.setArg(2u, *headerCache.getBuffer()));
+        OPENCL_ER(clKernel.setArg(3u, 0ull));
 
         ///////////////////////////////////////////////////////////////////////
         setGrid(groupSize, workerGroupCount);
@@ -177,11 +198,15 @@ bool benchmark::BenchmarkWorkflow::runAmdKawpow()
     ////////////////////////////////////////////////////////////////////////////
     if (true == dagInitialized)
     {
-        benchKawpow("kawpow_lm1", 256u, 1024u, algo::progpow::LANES, 1u); // Parallele + LDS
-        benchKawpow("kawpow_lm2", 256u, 1024u, algo::progpow::LANES, 1u); // Parallel + crosslane
-        benchKawpow("kawpow_lm3", 256u, 1024u, algo::progpow::LANES, 1u); // Parallel + crosslane + LDS header
+        benchKawpow("kawpow_lm1", 1u, 256u, 1024u, algo::progpow::LANES); // Parallele + LDS
+        benchKawpow("kawpow_lm2", 1u, 256u, 1024u, algo::progpow::LANES); // Parallel + crosslane
+        benchKawpow("kawpow_lm3", 1u, 256u, 1024u, algo::progpow::LANES); // Parallel + crosslane + LDS header
+        benchKawpow("kawpow_lm4", 1u, 256u, 1024u, algo::progpow::LANES); // Parallel + crosslane + LDS header
     }
 
+    ////////////////////////////////////////////////////////////////////////////
+    algo::ethash::freeDagContext(dagContext);
+
     ////////////////////////////////////////////////////////////////////////////
     dagCache.free();
     headerCache.free();
diff --git a/sources/benchmark/opencl/CMakeLists.txt b/sources/benchmark/opencl/CMakeLists.txt
index e680705..b9cb7fe 100644
--- a/sources/benchmark/opencl/CMakeLists.txt
+++ b/sources/benchmark/opencl/CMakeLists.txt
@@ -8,6 +8,7 @@ set(BENCHMARK_OPENCL_FILES
     kawpow/kawpow_lm1.cl
     kawpow/kawpow_lm2.cl
     kawpow/kawpow_lm3.cl
+    kawpow/kawpow_lm4.cl
     kawpow/sequence_dynamic.cl
     kawpow/sequence_dynamic_local.cl
 )
@@ -28,6 +29,7 @@ add_custom_target(copy_benchmark_opencl ALL
     ${OUT_COMMON}/kawpow/kawpow_lm1.cl
     ${OUT_COMMON}/kawpow/kawpow_lm2.cl
     ${OUT_COMMON}/kawpow/kawpow_lm3.cl
+    ${OUT_COMMON}/kawpow/kawpow_lm4.cl
     ${OUT_COMMON}/kawpow/sequence_dynamic.cl
     ${OUT_COMMON}/kawpow/sequence_dynamic_local.cl
 )
diff --git a/sources/benchmark/opencl/kawpow/kawpow_lm1.cl b/sources/benchmark/opencl/kawpow/kawpow_lm1.cl
index 1a17ea0..c99cd4a 100644
--- a/sources/benchmark/opencl/kawpow/kawpow_lm1.cl
+++ b/sources/benchmark/opencl/kawpow/kawpow_lm1.cl
@@ -149,10 +149,11 @@ void loop_math(
         barrier(CLK_LOCAL_MEM_FENCE);
 
         uint dag_index = share_hash0[worker_group];
-        uint fd = dag_index;
         dag_index %= DAG_SIZE;
-        dag_index *= WORK_ITEM_COLLABORATE;
-        dag_index += ((lane_id ^ cnt) % WORK_ITEM_COLLABORATE);
+
+        // TODO: FIX
+        // dag_index *= WORK_ITEM_COLLABORATE;
+        // dag_index += ((lane_id ^ cnt) % WORK_ITEM_COLLABORATE);
 
         uint4 const entries = dag[dag_index];
         sequence_dynamic_local(header_dag, hash, entries);
@@ -208,10 +209,10 @@ void initialize_header(
 
 __kernel
 void kawpow_lm1(
-    ulong const start_nonce,
-    __constant uint4 const* const restrict header,
-    __global uint4 const* const restrict dag,
-    __global t_result* const restrict result)
+    __global uint const* const restrict dag,
+    __global t_result* const restrict result,
+    __constant uint const* const restrict header,
+    ulong const start_nonce)
 {
     __local uint header_dag[MODULE_CACHE];
     __local ulong share_msb_lsb[SHARE_SEED_SIZE];
@@ -259,6 +260,8 @@ void kawpow_lm1(
 
     ///////////////////////////////////////////////////////////////////////////
     ulong const bytes_result = is_valid(state_mix, digest);
+    PRINT_U32_IF("bytes_result", 0u, bytes_result);
+    PRINT_U32_IF("bytes_result", 16u, bytes_result);
     if (bytes_result <= 0)
     {
         uint const index = atomic_inc(&result->count);
diff --git a/sources/benchmark/opencl/kawpow/kawpow_lm2.cl b/sources/benchmark/opencl/kawpow/kawpow_lm2.cl
index 45bf938..9f8fdb7 100644
--- a/sources/benchmark/opencl/kawpow/kawpow_lm2.cl
+++ b/sources/benchmark/opencl/kawpow/kawpow_lm2.cl
@@ -71,8 +71,10 @@ void loop_math(
         uint dag_index = reg_load(mix0, cnt % WORK_ITEM_COLLABORATE, WORK_ITEM_COLLABORATE);
         uint fd = dag_index;
         dag_index %= DAG_SIZE;
-        dag_index *= WORK_ITEM_COLLABORATE;
-        dag_index += ((lane_id ^ cnt) % WORK_ITEM_COLLABORATE);
+
+        // TODO: FIX
+        // dag_index *= WORK_ITEM_COLLABORATE;
+        // dag_index += ((lane_id ^ cnt) % WORK_ITEM_COLLABORATE);
 
         uint4 entries = ((uint4*)dag)[dag_index];
         sequence_dynamic(dag, hash, entries);
@@ -172,10 +174,10 @@ ulong is_valid(
 
 __kernel
 void kawpow_lm2(
-    ulong const start_nonce,
-    __constant uint const* const restrict header,
     __global uint const* const restrict dag,
-    __global t_result* const restrict result)
+    __global t_result* const restrict result,
+    __constant uint const* const restrict header,
+    ulong const start_nonce)
 {
     ///////////////////////////////////////////////////////////////////////////
     uint state[STATE_SIZE];
@@ -210,6 +212,8 @@ void kawpow_lm2(
 
     ///////////////////////////////////////////////////////////////////////////
     ulong const bytes_result = is_valid(state, digest);
+    PRINT_U32_IF("bytes_result", 0u, bytes_result);
+    PRINT_U32_IF("bytes_result", 16u, bytes_result);
     if (bytes_result <= 0)
     {
         uint const index = atomic_inc(&result->count);
diff --git a/sources/benchmark/opencl/kawpow/kawpow_lm3.cl b/sources/benchmark/opencl/kawpow/kawpow_lm3.cl
index a49a7ba..1fbd216 100644
--- a/sources/benchmark/opencl/kawpow/kawpow_lm3.cl
+++ b/sources/benchmark/opencl/kawpow/kawpow_lm3.cl
@@ -7,7 +7,7 @@ void initialize_header(
     __attribute__((opencl_unroll_hint))
     for (uint i = 0u; i < HEADER_ITEM_BY_THREAD; ++i)
     {
-        uint const index_dag = i * GROUP_SIZE + thread_id;
+        uint const index_dag = i * GROUP_SIZE + thread_id; // TODO: mad4
         uint const item_dag = dag[index_dag];
         header_dag[index_dag] = item_dag;
     }
@@ -80,7 +80,8 @@ void loop_math(
     __global uint* const restrict dag,
     __local uint* const restrict header_dag,
     uint* const restrict hash,
-    uint const lane_id)
+    uint const lane_id,
+uint const l_id)
 {
     __attribute__((opencl_unroll_hint(1)))
     for (uint cnt = 0u; cnt < COUNT_DAG; ++cnt)
@@ -89,8 +90,10 @@ void loop_math(
         uint dag_index = reg_load(mix0, cnt % WORK_ITEM_COLLABORATE, WORK_ITEM_COLLABORATE);
         uint fd = dag_index;
         dag_index %= DAG_SIZE;
-        dag_index *= WORK_ITEM_COLLABORATE;
-        dag_index += ((lane_id ^ cnt) % WORK_ITEM_COLLABORATE);
+
+        // TODO: FIX
+        // dag_index *= WORK_ITEM_COLLABORATE;
+        // dag_index += ((lane_id ^ cnt) % WORK_ITEM_COLLABORATE);
 
         uint4 entries = ((uint4*)dag)[dag_index];
         sequence_dynamic_local(header_dag, hash, entries);
@@ -190,10 +193,10 @@ ulong is_valid(
 
 __kernel
 void kawpow_lm3(
-    ulong const start_nonce,
-    __constant uint const* const restrict header,
     __global uint const* const restrict dag,
-    __global t_result* const restrict result)
+    __global t_result* const restrict result,
+    __constant uint const* const restrict header,
+    ulong const start_nonce)
 {
     ///////////////////////////////////////////////////////////////////////////
     __local uint header_dag[MODULE_CACHE];
@@ -228,12 +231,14 @@ void kawpow_lm3(
         fill_hash(hash, lane_id % WORK_ITEM_COLLABORATE, lane_lsb, lane_msb, l_id);
 
         ///////////////////////////////////////////////////////////////////////
-        loop_math(dag, header_dag, hash, lane_id % WORK_ITEM_COLLABORATE);
+        loop_math(dag, header_dag, hash, lane_id % WORK_ITEM_COLLABORATE, l_id);
         reduce_hash(hash, digest, l_id == lane_id % WORK_ITEM_COLLABORATE);
     }
 
     ///////////////////////////////////////////////////////////////////////////
     ulong const bytes_result = is_valid(state, digest);
+    PRINT_U32_IF("bytes_result", 0u, bytes_result);
+    PRINT_U32_IF("bytes_result", 16u, bytes_result);
     if (bytes_result <= 0)
     {
         uint const index = atomic_inc(&result->count);
diff --git a/sources/benchmark/opencl/kawpow/kawpow_lm4.cl b/sources/benchmark/opencl/kawpow/kawpow_lm4.cl
new file mode 100644
index 0000000..da9dba9
--- /dev/null
+++ b/sources/benchmark/opencl/kawpow/kawpow_lm4.cl
@@ -0,0 +1,251 @@
+inline
+void initialize_header(
+    __global uint const* restrict const dag,
+    __local uint * restrict const header_dag,
+    uint const thread_id)
+{
+    __attribute__((opencl_unroll_hint))
+    for (uint i = 0u; i < HEADER_ITEM_BY_THREAD; ++i)
+    {
+        uint const index_dag = i * GROUP_SIZE + thread_id; // TODO: mad4
+        uint const item_dag = dag[index_dag];
+        header_dag[index_dag] = item_dag;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+
+inline
+void initialize_state(
+    __constant uint const* restrict header,
+    uint* const restrict state,
+    ulong const nonce)
+{
+    __attribute__((opencl_unroll_hint))
+    for (uint i = 0; i < 8u; ++i)
+    {
+        state[i] = header[i];
+    }
+
+    state[8] = nonce;
+    state[9] = (nonce >> 32);
+
+    state[10] = 'r';
+    state[11] = 'A';
+    state[12] = 'V';
+    state[13] = 'E';
+    state[14] = 'N';
+
+    state[15] = 'C';
+    state[16] = 'O';
+    state[17] = 'I';
+    state[18] = 'N';
+
+    state[19] = 'K';
+    state[20] = 'A';
+    state[21] = 'W';
+    state[22] = 'P';
+    state[23] = 'O';
+    state[24] = 'W';
+
+    keccak_f800(state);
+}
+
+
+inline
+void fill_hash(
+    uint* const restrict hash,
+    uint const lane_id,
+    uint const lsb,
+    uint const msb,
+    uint const l_id)
+{
+    uint4 data;
+
+    data.x = fnv1a_u32(FNV1_OFFSET, msb);
+    data.y = fnv1a_u32(data.x, lsb);
+    data.z = fnv1a_u32(data.y, lane_id);
+    data.w = fnv1a_u32(data.z, lane_id);
+
+    __attribute__((opencl_unroll_hint))
+    for (uint i = 0u; i < REGS; ++i)
+    {
+        hash[i] = kiss99(&data);
+    }
+}
+
+
+inline
+void loop_math(
+    __global uint* const restrict dag,
+    __local uint* const restrict header_dag,
+    uint* const restrict hash,
+    uint const lane_id,
+uint const l_id)
+{
+    __attribute__((opencl_unroll_hint(1)))
+    for (uint cnt = 0u; cnt < COUNT_DAG; ++cnt)
+    {
+        uint const mix0 = hash[0];
+        uint dag_index = reg_load(mix0, cnt % WORK_ITEM_COLLABORATE, WORK_ITEM_COLLABORATE);
+        uint fd = dag_index;
+        dag_index %= DAG_SIZE;
+
+        // TODO: FIX
+        // dag_index *= WORK_ITEM_COLLABORATE;
+        // dag_index += ((lane_id ^ cnt) % WORK_ITEM_COLLABORATE);
+
+        uint4 entries = ((uint4*)dag)[dag_index];
+        sequence_dynamic_local(header_dag, hash, entries);
+    }
+}
+
+
+inline
+void reduce_hash(
+    uint* const restrict hash,
+    uint* const restrict digest,
+    bool const is_same_lane)
+{
+    uint value = FNV1_OFFSET;
+
+    __attribute__((opencl_unroll_hint))
+    for (uint i = 0u; i < REGS; ++i)
+    {
+        value = fnv1a_u32(value, hash[i]);
+    }
+
+    uint tmp[DIGEST_SIZE];
+    __attribute__((opencl_unroll_hint))
+    for (uint i = 0u; i < DIGEST_SIZE; ++i)
+    {
+        tmp[i] = reg_load(value, i, WORK_ITEM_COLLABORATE);
+    }
+
+    if (true == is_same_lane)
+    {
+        __attribute__((opencl_unroll_hint))
+        for (uint i = 0u; i < DIGEST_SIZE; ++i)
+        {
+            digest[i] = tmp[i];
+        }
+    }
+}
+
+
+inline
+ulong sha3(
+    uint const* const restrict digest_1,
+    uint* const restrict digest_2)
+{
+    uint state[STATE_SIZE];
+
+    __attribute__((opencl_unroll_hint))
+    for (uint i = 0u; i < 8u; ++i)
+    {
+        state[i] = digest_1[i];
+    }
+
+    state[8] = digest_2[0];
+    state[9] = digest_2[1];
+    state[10] = digest_2[2];
+    state[11] = digest_2[3];
+    state[12] = digest_2[4];
+    state[13] = digest_2[5];
+    state[14] = digest_2[6];
+    state[15] = digest_2[7];
+
+    state[16] = 'r';
+    state[17] = 'A';
+    state[18] = 'V';
+    state[19] = 'E';
+    state[20] = 'N';
+
+    state[21] = 'C';
+    state[22] = 'O';
+    state[23] = 'I';
+    state[24] = 'N';
+
+    keccak_f800(state);
+
+    ulong const res = ((ulong)state[1]) << 32 | state[0];
+    return as_ulong(as_uchar8(res).s76543210);
+}
+
+
+inline
+ulong is_valid(
+    uint const* const restrict state,
+    uint* const restrict digest)
+{
+    digest[0] = fnv1a_u32(fnv1a_u32(FNV1_OFFSET, digest[0]), digest[8]);
+    digest[1] = fnv1a_u32(fnv1a_u32(FNV1_OFFSET, digest[1]), digest[9]);
+    digest[2] = fnv1a_u32(fnv1a_u32(FNV1_OFFSET, digest[2]), digest[10]);
+    digest[3] = fnv1a_u32(fnv1a_u32(FNV1_OFFSET, digest[3]), digest[11]);
+    digest[4] = fnv1a_u32(fnv1a_u32(FNV1_OFFSET, digest[4]), digest[12]);
+    digest[5] = fnv1a_u32(fnv1a_u32(FNV1_OFFSET, digest[5]), digest[13]);
+    digest[6] = fnv1a_u32(fnv1a_u32(FNV1_OFFSET, digest[6]), digest[14]);
+    digest[7] = fnv1a_u32(fnv1a_u32(FNV1_OFFSET, digest[7]), digest[15]);
+
+    return sha3(state, digest);
+}
+
+
+__kernel
+void kawpow_lm4(
+    __global uint const* const restrict dag,
+    __global t_result* const restrict result,
+    __constant uint const* const restrict header,
+    ulong const start_nonce)
+{
+    ///////////////////////////////////////////////////////////////////////////
+    __local uint header_dag[MODULE_CACHE];
+
+    ///////////////////////////////////////////////////////////////////////////
+    uint state[STATE_SIZE];
+    uint digest[DIGEST_SIZE];
+    uint hash[HASH_SIZE];
+
+    ///////////////////////////////////////////////////////////////////////////
+    uint const thread_id = get_thread_id_2d();
+    uint const lane_id = get_sub_group_local_id();
+    ulong const nonce = start_nonce + thread_id;
+
+    ///////////////////////////////////////////////////////////////////////////
+    initialize_header(dag, header_dag, get_local_id(0));
+
+    ///////////////////////////////////////////////////////////////////////////
+    initialize_state(header, state, nonce);
+    uint msb = state[0];
+    uint lsb = state[1];
+
+    ///////////////////////////////////////////////////////////////////////////
+    __attribute__((opencl_unroll_hint(1)))
+    for (uint l_id = 0u; l_id < WORK_ITEM_COLLABORATE; ++l_id)
+    {
+        ///////////////////////////////////////////////////////////////////////
+        uint const lane_msb = reg_load(msb, l_id, WORK_ITEM_COLLABORATE);
+        uint const lane_lsb = reg_load(lsb, l_id, WORK_ITEM_COLLABORATE);
+
+        ///////////////////////////////////////////////////////////////////////
+        fill_hash(hash, lane_id % WORK_ITEM_COLLABORATE, lane_lsb, lane_msb, l_id);
+
+        ///////////////////////////////////////////////////////////////////////
+        loop_math(dag, header_dag, hash, lane_id % WORK_ITEM_COLLABORATE, l_id);
+        reduce_hash(hash, digest, l_id == lane_id % WORK_ITEM_COLLABORATE);
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    ulong const bytes_result = is_valid(state, digest);
+    PRINT_U32_IF("bytes_result", 0u, bytes_result);
+    PRINT_U32_IF("bytes_result", 16u, bytes_result);
+    if (bytes_result <= 0)
+    {
+        uint const index = atomic_inc(&result->count);
+        if (index < 1)
+        {
+            result->found = true;
+            result->nonce = nonce;
+        }
+    }
+}
diff --git a/sources/common/opencl/cross_lane.cl b/sources/common/opencl/cross_lane.cl
index 3852a59..6a1e294 100644
--- a/sources/common/opencl/cross_lane.cl
+++ b/sources/common/opencl/cross_lane.cl
@@ -1,24 +1,47 @@
-
+///////////////////////////////////////////////////////////////////////////////
+// WAVEFRONT 32
+///////////////////////////////////////////////////////////////////////////////
 #if WAVEFRONT == 32
-    inline uint reg_load(
+    inline
+    uint reg_load(
         uint const var,
-        uint const lane_target,
+        uint const lane_id,
         uint const width)
     {
+#if defined CROSSLANE_BY_LDS
+        uint const gap = get_thread_id_2d() / width;
+        uint const value = sub_group_shuffle(var, gap + lane_id);
+
+        return value;
+#else
         uint const local_id = get_sub_group_local_id();
         uint const group_id = local_id / width;
 
-        uint const val_group_0 = sub_group_broadcast(var, lane_target);
-        uint const val_group_1 = sub_group_broadcast(var, width + lane_target);
+        uint const val_group_0 = sub_group_broadcast(var, lane_id);
+        uint const val_group_1 = sub_group_broadcast(var, width + lane_id);
 
         return (group_id == 0u) ? val_group_0 : val_group_1;
+#endif
     }
-#else // WAVEFRONT == 64
-    inline uint reg_load(
+#endif // WAVEFRONT == 32
+
+
+///////////////////////////////////////////////////////////////////////////////
+// WAVEFRONT 64
+///////////////////////////////////////////////////////////////////////////////
+#if WAVEFRONT == 64
+    inline
+    uint reg_load(
         uint const var,
         uint const lane_target,
         uint const width)
     {
+#if defined CROSSLANE_BY_LDS
+        uint const gap = get_thread_id_2d() / width;
+        uint const value = sub_group_shuffle(var, gap + lane_id);
+
+        return value;
+#else
         uint const local_id = get_sub_group_local_id();
         uint const group_id = local_id / width;
 
@@ -37,5 +60,6 @@
         }
 
         return result;
-    }
 #endif
+    }
+#endif // WAVEFRONT == 64
diff --git a/sources/device/amd.cpp b/sources/device/amd.cpp
index 78133d0..9ba1cd9 100644
--- a/sources/device/amd.cpp
+++ b/sources/device/amd.cpp
@@ -11,7 +11,8 @@ bool device::DeviceAmd::initialize()
     cleanUp();
 
     clContext = cl::Context(clDevice);
-    clQueue = cl::CommandQueue(clContext, clDevice);
+    clQueue[0] = cl::CommandQueue(clContext, clDevice);
+    clQueue[1] = cl::CommandQueue(clContext, clDevice);
 
     resolver::ResolverAmd* const resolverAmd{ dynamic_cast<resolver::ResolverAmd* const>(resolver) };
     if (nullptr == resolverAmd)
@@ -21,7 +22,7 @@ bool device::DeviceAmd::initialize()
 
     resolverAmd->setDevice(&clDevice);
     resolverAmd->setContext(&clContext);
-    resolverAmd->setQueue(&clQueue);
+    resolverAmd->setQueue(clQueue);
 
     return true;
 }
diff --git a/sources/device/amd.hpp b/sources/device/amd.hpp
index a66e684..2b4a34b 100644
--- a/sources/device/amd.hpp
+++ b/sources/device/amd.hpp
@@ -16,7 +16,7 @@ namespace device
 
     protected:
         cl::Context      clContext{};
-        cl::CommandQueue clQueue{};
+        cl::CommandQueue clQueue[2]{};
 
         bool initialize() final;
         void cleanUp() final;
diff --git a/sources/resolver/amd/amd.cpp b/sources/resolver/amd/amd.cpp
index b810816..221ecd1 100644
--- a/sources/resolver/amd/amd.cpp
+++ b/sources/resolver/amd/amd.cpp
@@ -20,7 +20,8 @@ void resolver::ResolverAmd::setContext(
 void resolver::ResolverAmd::setQueue(
     cl::CommandQueue* const queue)
 {
-    clQueue = queue;
+    clQueue[0] = &queue[0];
+    clQueue[1] = &queue[1];
 }
 
 
diff --git a/sources/resolver/amd/amd.hpp b/sources/resolver/amd/amd.hpp
index a1e2a88..9ada547 100644
--- a/sources/resolver/amd/amd.hpp
+++ b/sources/resolver/amd/amd.hpp
@@ -10,7 +10,6 @@ namespace resolver
     class ResolverAmd : public resolver::Resolver
     {
     public:
-
         virtual ~ResolverAmd() = default;
 
         void setDevice(cl::Device* const device);
@@ -22,7 +21,7 @@ namespace resolver
     protected:
         cl::Context*      clContext{ nullptr };
         cl::Device*       clDevice{ nullptr };
-        cl::CommandQueue* clQueue{ nullptr };
+        cl::CommandQueue* clQueue[2]{ nullptr, nullptr };
 
         void overrideOccupancy(uint32_t const defaultThreads,
                                uint32_t const defaultBlocks) final;
diff --git a/sources/resolver/amd/autolykos_v2.cpp b/sources/resolver/amd/autolykos_v2.cpp
index df35133..d644e51 100644
--- a/sources/resolver/amd/autolykos_v2.cpp
+++ b/sources/resolver/amd/autolykos_v2.cpp
@@ -25,7 +25,9 @@ bool resolver::ResolverAmdAutolykosV2::updateMemory(
     stratum::StratumJobInfo const& jobInfo)
 {
     ////////////////////////////////////////////////////////////////////////////
-    if (nullptr == clContext || nullptr == clQueue)
+    if (   nullptr == clContext
+        || nullptr == clQueue[0]
+        || nullptr == clQueue[1])
     {
         return false;
     }
@@ -66,9 +68,9 @@ bool resolver::ResolverAmdAutolykosV2::updateMemory(
     ////////////////////////////////////////////////////////////////////////////
     if (   false == parameters.BHashes.alloc(*clContext)
         || false == parameters.dagCache.alloc(*clContext)
-        || false == parameters.boundaryCache.alloc(clQueue, *clContext)
-        || false == parameters.headerCache.alloc(clQueue, *clContext)
-        || false == parameters.resultCache.alloc(clQueue, *clContext))
+        || false == parameters.boundaryCache.alloc(clQueue[currentIndexStream], *clContext)
+        || false == parameters.headerCache.alloc(clQueue[currentIndexStream], *clContext)
+        || false == parameters.resultCache.alloc(clQueue[currentIndexStream], *clContext))
     {
         return false;
     }
@@ -113,14 +115,14 @@ bool resolver::ResolverAmdAutolykosV2::updateConstants(
 
     ////////////////////////////////////////////////////////////////////////////
     uint32_t const* const boundary { jobInfo.boundary.word32 };
-    if (false == parameters.boundaryCache.setBufferDevice(clQueue, boundary))
+    if (false == parameters.boundaryCache.setBufferDevice(clQueue[currentIndexStream], boundary))
     {
         return false;
     }
 
     ////////////////////////////////////////////////////////////////////////////
     uint32_t const* const header { jobInfo.headerHash.word32 };
-    if (false == parameters.headerCache.setBufferDevice(clQueue, header))
+    if (false == parameters.headerCache.setBufferDevice(clQueue[currentIndexStream], header))
     {
         return false;
     }
@@ -183,12 +185,12 @@ bool resolver::ResolverAmdAutolykosV2::fillDAG()
     uint32_t const blockDim { algo::autolykos_v2::AMD_BLOCK_DIM };
     uint32_t globalDimX { ((parameters.hostPeriod / blockDim) + 1) * blockDim };
     OPENCL_ER(
-        clQueue->enqueueNDRangeKernel(
+        clQueue[currentIndexStream]->enqueueNDRangeKernel(
             clKernel,
             cl::NullRange,
             cl::NDRange(globalDimX, 1, 1),
             cl::NDRange(blockDim,   1, 1)));
-    OPENCL_ER(clQueue->finish());
+    OPENCL_ER(clQueue[currentIndexStream]->finish());
 
     return true;
 }
@@ -281,12 +283,12 @@ bool resolver::ResolverAmdAutolykosV2::executeSync(
     OPENCL_ER(clKernel.setArg(3u, parameters.hostNonce));
     OPENCL_ER(clKernel.setArg(4u, parameters.hostPeriod));
     OPENCL_ER(
-        clQueue->enqueueNDRangeKernel(
+        clQueue[currentIndexStream]->enqueueNDRangeKernel(
             clKernel,
             cl::NullRange,
             cl::NDRange(maxGroupSizeSearch,                1, 1),
             cl::NDRange(algo::autolykos_v2::AMD_BLOCK_DIM, 1, 1)));
-    OPENCL_ER(clQueue->finish());
+    OPENCL_ER(clQueue[currentIndexStream]->finish());
 
     ////////////////////////////////////////////////////////////////////////////
     auto& clKernelVerify { kernelGeneratorVerify.clKernel };
@@ -298,12 +300,12 @@ bool resolver::ResolverAmdAutolykosV2::executeSync(
     OPENCL_ER(clKernelVerify.setArg(5u, parameters.hostPeriod));
     OPENCL_ER(clKernelVerify.setArg(6u, parameters.hostHeight));
     OPENCL_ER(
-        clQueue->enqueueNDRangeKernel(
+        clQueue[currentIndexStream]->enqueueNDRangeKernel(
             clKernelVerify,
             cl::NullRange,
             cl::NDRange(maxGroupSizeVerify,                1, 1),
             cl::NDRange(algo::autolykos_v2::AMD_BLOCK_DIM, 1, 1)));
-    OPENCL_ER(clQueue->finish());
+    OPENCL_ER(clQueue[currentIndexStream]->finish());
 
     ////////////////////////////////////////////////////////////////////////////
     if (false == getResultCache(jobInfo.jobIDStr,
@@ -334,13 +336,13 @@ bool resolver::ResolverAmdAutolykosV2::getResultCache(
     algo::hash256 boundary{};
 
     ////////////////////////////////////////////////////////////////////////////
-    if (false == parameters.resultCache.getBufferHost(clQueue, &data))
+    if (false == parameters.resultCache.getBufferHost(clQueue[currentIndexStream], &data))
     {
         return false;
     }
 
     ////////////////////////////////////////////////////////////////////////////
-    if (false == parameters.boundaryCache.getBufferHost(clQueue, boundary.word32))
+    if (false == parameters.boundaryCache.getBufferHost(clQueue[currentIndexStream], boundary.word32))
     {
         return false;
     }
@@ -384,7 +386,7 @@ bool resolver::ResolverAmdAutolykosV2::getResultCache(
             resultShare.jobId.assign(_jobId);
         }
 
-        if (false == parameters.resultCache.resetBufferHost(clQueue))
+        if (false == parameters.resultCache.resetBufferHost(clQueue[currentIndexStream]))
         {
             return false;
         }
diff --git a/sources/resolver/amd/etchash.cpp b/sources/resolver/amd/etchash.cpp
index 40113bf..8ea9fcd 100644
--- a/sources/resolver/amd/etchash.cpp
+++ b/sources/resolver/amd/etchash.cpp
@@ -26,9 +26,9 @@ bool resolver::ResolverAmdEtchash::updateContext(
         dagCountItemsGrowth,
         dagCountItemsInit,
         lightCacheCountItemsGrowth,
-        lightCacheCountItemsInit,
-        config.deviceAlgorithm.ethashBuildLightCacheCPU
+        lightCacheCountItemsInit
     );
+    algo::ethash::buildLightCache(context, config.deviceAlgorithm.ethashBuildLightCacheCPU);
 
     if (   context.lightCache.numberItem == 0ull
         || context.lightCache.size == 0ull
diff --git a/sources/resolver/amd/ethash.cpp b/sources/resolver/amd/ethash.cpp
index c53b206..7f82ea0 100644
--- a/sources/resolver/amd/ethash.cpp
+++ b/sources/resolver/amd/ethash.cpp
@@ -36,9 +36,10 @@ bool resolver::ResolverAmdEthash::updateContext(
         dagCountItemsGrowth,
         dagCountItemsInit,
         lightCacheCountItemsGrowth,
-        lightCacheCountItemsInit,
-        true // TODO: config.deviceAlgorithm.ethashBuildLightCacheCPU
+        lightCacheCountItemsInit
     );
+    // TODO: config.deviceAlgorithm.ethashBuildLightCacheCPU
+    algo::ethash::buildLightCache(context, true);
 
     if (   context.lightCache.numberItem == 0ull
         || context.lightCache.size == 0ull
@@ -78,7 +79,8 @@ bool resolver::ResolverAmdEthash::updateMemory(
     {
         return false;
     }
-    if (nullptr == clQueue)
+    if (   nullptr == clQueue[0]
+        || nullptr == clQueue[1])
     {
         return false;
     }
@@ -102,8 +104,8 @@ bool resolver::ResolverAmdEthash::updateMemory(
     ////////////////////////////////////////////////////////////////////////////
     if (   false == parameters.lightCache.alloc(*clContext)
         || false == parameters.dagCache.alloc(*clContext)
-        || false == parameters.headerCache.alloc(clQueue, *clContext)
-        || false == parameters.resultCache.alloc(clQueue, *clContext))
+        || false == parameters.headerCache.alloc(clQueue[currentIndexStream], *clContext)
+        || false == parameters.resultCache.alloc(clQueue[currentIndexStream], *clContext))
     {
         return false;
     }
@@ -111,7 +113,7 @@ bool resolver::ResolverAmdEthash::updateMemory(
     ////////////////////////////////////////////////////////////////////////////
     if (false == parameters.lightCache.write(context.lightCache.hash,
                                              context.lightCache.size,
-                                             clQueue))
+                                             clQueue[currentIndexStream]))
     {
         return false;
     }
@@ -136,7 +138,7 @@ bool resolver::ResolverAmdEthash::updateConstants(
 {
     ////////////////////////////////////////////////////////////////////////////
     uint32_t const* const header { jobInfo.headerHash.word32 };
-    if (false == parameters.headerCache.setBufferDevice(clQueue, header))
+    if (false == parameters.headerCache.setBufferDevice(clQueue[currentIndexStream], header))
     {
         return false;
     }
@@ -193,12 +195,12 @@ bool resolver::ResolverAmdEthash::buildDAG()
     uint32_t const maxGroupSize { getMaxGroupSize() };
     uint32_t const threadKernel { castU32(context.dagCache.numberItem) / maxGroupSize };
     OPENCL_ER(
-        clQueue->enqueueNDRangeKernel(
+        clQueue[currentIndexStream]->enqueueNDRangeKernel(
             clKernel,
             cl::NullRange,
             cl::NDRange(maxGroupSize, threadKernel, 1),
             cl::NDRange(maxGroupSize, 1,            1)));
-    OPENCL_ER(clQueue->finish());
+    OPENCL_ER(clQueue[currentIndexStream]->finish());
 
     ////////////////////////////////////////////////////////////////////////////
     parameters.lightCache.free();
@@ -263,12 +265,12 @@ bool resolver::ResolverAmdEthash::executeSync(
     OPENCL_ER(clKernel.setArg(4u, jobInfo.boundaryU64));
 
     OPENCL_ER(
-        clQueue->enqueueNDRangeKernel(
+        clQueue[currentIndexStream]->enqueueNDRangeKernel(
             clKernel,
             cl::NullRange,
             cl::NDRange(blocks, threads, 1),
             cl::NDRange(blocks, 1,       1)));
-    OPENCL_ER(clQueue->finish());
+    OPENCL_ER(clQueue[currentIndexStream]->finish());
 
     if (false == getResultCache(jobInfo.jobIDStr, jobInfo.extraNonceSize))
     {
@@ -292,7 +294,7 @@ bool resolver::ResolverAmdEthash::getResultCache(
 {
     algo::ethash::Result data{};
 
-    if (false == parameters.resultCache.getBufferHost(clQueue, &data))
+    if (false == parameters.resultCache.getBufferHost(clQueue[currentIndexStream], &data))
     {
         return false;
     }
@@ -314,7 +316,7 @@ bool resolver::ResolverAmdEthash::getResultCache(
             resultShare.nonces[i] = data.nonces[i];
         }
 
-        if (false == parameters.resultCache.resetBufferHost(clQueue))
+        if (false == parameters.resultCache.resetBufferHost(clQueue[currentIndexStream]))
         {
             return false;
         }
diff --git a/sources/resolver/amd/progpow.cpp b/sources/resolver/amd/progpow.cpp
index 1782854..b89691a 100644
--- a/sources/resolver/amd/progpow.cpp
+++ b/sources/resolver/amd/progpow.cpp
@@ -32,9 +32,10 @@ bool resolver::ResolverAmdProgPOW::updateContext(
         dagCountItemsGrowth,
         dagCountItemsInit,
         lightCacheCountItemsGrowth,
-        lightCacheCountItemsInit,
-        true // config.deviceAlgorithm.ethashBuildLightCacheCPU
+        lightCacheCountItemsInit
     );
+    // TODO: config.deviceAlgorithm.ethashBuildLightCacheCPU
+    algo::ethash::buildLightCache(context, true);
 
     if (   0ull == context.lightCache.numberItem
         || 0ull == context.lightCache.size
@@ -71,7 +72,7 @@ bool resolver::ResolverAmdProgPOW::updateMemory(
     stratum::StratumJobInfo const& jobInfo)
 {
     IS_NULL(clContext);
-    IS_NULL(clQueue);
+    IS_NULL(clQueue[currentIndexStream]);
 
     ////////////////////////////////////////////////////////////////////////////
     if (false == updateContext(jobInfo))
@@ -92,8 +93,8 @@ bool resolver::ResolverAmdProgPOW::updateMemory(
     ////////////////////////////////////////////////////////////////////////////
     if (   false == parameters.lightCache.alloc(*clContext)
         || false == parameters.dagCache.alloc(*clContext)
-        || false == parameters.headerCache.alloc(clQueue, *clContext)
-        || false == parameters.resultCache.alloc(clQueue, *clContext))
+        || false == parameters.headerCache.alloc(clQueue[currentIndexStream], *clContext)
+        || false == parameters.resultCache.alloc(clQueue[currentIndexStream], *clContext))
     {
         return false;
     }
@@ -101,7 +102,7 @@ bool resolver::ResolverAmdProgPOW::updateMemory(
     ////////////////////////////////////////////////////////////////////////////
     if (false == parameters.lightCache.write(context.lightCache.hash,
                                              context.lightCache.size,
-                                             clQueue))
+                                             clQueue[currentIndexStream]))
     {
         return false;
     }
@@ -140,7 +141,7 @@ bool resolver::ResolverAmdProgPOW::updateConstants(
 
     ////////////////////////////////////////////////////////////////////////////
     uint32_t const* const header { jobInfo.headerHash.word32 };
-    if (false == parameters.headerCache.setBufferDevice(clQueue, header))
+    if (false == parameters.headerCache.setBufferDevice(clQueue[currentIndexStream], header))
     {
         return false;
     }
@@ -193,12 +194,12 @@ bool resolver::ResolverAmdProgPOW::buildDAG()
     uint32_t const maxGroupSize { getMaxGroupSize() };
     uint32_t const threadKernel { castU32(context.dagCache.numberItem) / maxGroupSize };
     OPENCL_ER(
-        clQueue->enqueueNDRangeKernel(
+        clQueue[currentIndexStream]->enqueueNDRangeKernel(
             clKernel,
             cl::NullRange,
             cl::NDRange(maxGroupSize, threadKernel, 1),
             cl::NDRange(maxGroupSize, 1,            1)));
-    OPENCL_ER(clQueue->finish());
+    OPENCL_ER(clQueue[currentIndexStream]->finish());
 
     ////////////////////////////////////////////////////////////////////////////
     parameters.lightCache.free();
@@ -210,6 +211,9 @@ bool resolver::ResolverAmdProgPOW::buildDAG()
 
 bool resolver::ResolverAmdProgPOW::buildSearch()
 {
+    ////////////////////////////////////////////////////////////////////////////
+    auto const& config{ common::Config::instance() };
+
     ////////////////////////////////////////////////////////////////////////////
     algo::progpow::writeMathRandomKernelOpenCL(progpowVersion,
                                                deviceId,
@@ -278,6 +282,12 @@ bool resolver::ResolverAmdProgPOW::buildSearch()
     kernelGenerator.addDefine("SHARE_FNV1A_SIZE", maxThreadByGroup);
     kernelGenerator.addDefine("MODULE_CACHE_GROUP", maxThreadByGroup * 4u);
     kernelGenerator.addDefine("MODULE_LOOP", algo::progpow::MODULE_CACHE / (maxThreadByGroup / 4u));
+    kernelGenerator.addDefine("TOTAL_THREADS", getBlocks() * getThreads());
+    if (std::nullopt != config.occupancy.internalLoop)
+    {
+        uint32_t const internalLoop{ *config.occupancy.internalLoop };
+        kernelGenerator.addDefine("INTERNAL_LOOP", internalLoop);
+    }
 
     ////////////////////////////////////////////////////////////////////////////
     kernelGenerator.addInclude("kernel/common/rotate_byte.cl");
@@ -332,27 +342,28 @@ bool resolver::ResolverAmdProgPOW::buildSearch()
 bool resolver::ResolverAmdProgPOW::executeSync(
     stratum::StratumJobInfo const& jobInfo)
 {
+    ///////////////////////////////////////////////////////////////////////////
     auto& clKernel { kernelGenerator.clKernel };
-
     OPENCL_ER(clKernel.setArg(0u, jobInfo.nonce));
     OPENCL_ER(clKernel.setArg(1u, jobInfo.boundaryU64));
     OPENCL_ER(clKernel.setArg(2u, *(parameters.headerCache.getBuffer())));
     OPENCL_ER(clKernel.setArg(3u, *(parameters.dagCache.getBuffer())));
     OPENCL_ER(clKernel.setArg(4u, *(parameters.resultCache.getBuffer())));
-
     OPENCL_ER(
-        clQueue->enqueueNDRangeKernel(
+        clQueue[currentIndexStream]->enqueueNDRangeKernel(
             clKernel,
             cl::NullRange,
             cl::NDRange(blocks, threads, 1),
             cl::NDRange(blocks, 1,       1)));
-    OPENCL_ER(clQueue->finish());
+    OPENCL_ER(clQueue[currentIndexStream]->finish());
 
+    ///////////////////////////////////////////////////////////////////////////
     if (false == getResultCache(jobInfo.jobIDStr, jobInfo.extraNonceSize))
     {
         return false;
     }
 
+    ///////////////////////////////////////////////////////////////////////////
     return true;
 }
 
@@ -360,7 +371,36 @@ bool resolver::ResolverAmdProgPOW::executeSync(
 bool resolver::ResolverAmdProgPOW::executeAsync(
     stratum::StratumJobInfo const& jobInfo)
 {
-    return executeSync(jobInfo);
+    ///////////////////////////////////////////////////////////////////////////
+    OPENCL_ER(clQueue[currentIndexStream]->finish());
+
+    ///////////////////////////////////////////////////////////////////////////
+    swapIndexStream();
+    auto& clKernel { kernelGenerator.clKernel };
+    OPENCL_ER(clKernel.setArg(0u, jobInfo.nonce));
+    OPENCL_ER(clKernel.setArg(1u, jobInfo.boundaryU64));
+    OPENCL_ER(clKernel.setArg(2u, *(parameters.headerCache.getBuffer())));
+    OPENCL_ER(clKernel.setArg(3u, *(parameters.dagCache.getBuffer())));
+    OPENCL_ER(clKernel.setArg(4u, *(parameters.resultCache.getBuffer())));
+    OPENCL_ER(
+        clQueue[currentIndexStream]->enqueueNDRangeKernel(
+            clKernel,
+            cl::NullRange,
+            cl::NDRange(blocks, threads, 1),
+            cl::NDRange(blocks, 1,       1)));
+
+    ///////////////////////////////////////////////////////////////////////////
+    swapIndexStream();
+    if (false == getResultCache(jobInfo.jobIDStr, jobInfo.extraNonceSize))
+    {
+        return false;
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    swapIndexStream();
+
+    ///////////////////////////////////////////////////////////////////////////
+    return true;
 }
 
 
@@ -370,7 +410,7 @@ bool resolver::ResolverAmdProgPOW::getResultCache(
 {
     algo::progpow::Result data{};
 
-    if (false == parameters.resultCache.getBufferHost(clQueue, &data))
+    if (false == parameters.resultCache.getBufferHost(clQueue[currentIndexStream], &data))
     {
         return false;
     }
@@ -399,7 +439,7 @@ bool resolver::ResolverAmdProgPOW::getResultCache(
             }
         }
 
-        if (false == parameters.resultCache.resetBufferHost(clQueue))
+        if (false == parameters.resultCache.resetBufferHost(clQueue[currentIndexStream]))
         {
             return false;
         }
diff --git a/sources/resolver/cpu/progpow.cpp b/sources/resolver/cpu/progpow.cpp
index d9cd9bd..f1bab19 100644
--- a/sources/resolver/cpu/progpow.cpp
+++ b/sources/resolver/cpu/progpow.cpp
@@ -25,9 +25,10 @@ bool resolver::ResolverCpuProgPOW::updateContext(stratum::StratumJobInfo const&
         dagCountItemsGrowth,
         dagCountItemsInit,
         lightCacheCountItemsGrowth,
-        lightCacheCountItemsInit,
-        true
+        lightCacheCountItemsInit
     );
+    // TODO: config.deviceAlgorithm.ethashBuildLightCacheCPU
+    algo::ethash::buildLightCache(context, true);
 
     if (   0ull == context.lightCache.numberItem
         || 0ull == context.lightCache.size
diff --git a/sources/resolver/mocker.cpp b/sources/resolver/mocker.cpp
index 356fcb8..f0e700e 100644
--- a/sources/resolver/mocker.cpp
+++ b/sources/resolver/mocker.cpp
@@ -48,9 +48,9 @@ bool resolver::ResolverMocker::updateMemory(
         dagCountItemsGrowth,
         dagCountItemsInit,
         lightCacheCountItemsGrowth,
-        lightCacheCountItemsInit,
-        config.deviceAlgorithm.ethashBuildLightCacheCPU
+        lightCacheCountItemsInit
     );
+    algo::ethash::buildLightCache(context, config.deviceAlgorithm.ethashBuildLightCacheCPU);
 
     ///////////////////////////////////////////////////////////////////////////
     boost::this_thread::sleep_for(WAIT_UPDATE_MEMORY);
diff --git a/sources/resolver/nvidia/blake3.cpp b/sources/resolver/nvidia/blake3.cpp
index 8b65eb5..9134c0d 100644
--- a/sources/resolver/nvidia/blake3.cpp
+++ b/sources/resolver/nvidia/blake3.cpp
@@ -100,7 +100,7 @@ bool resolver::ResolverNvidiaBlake3::executeAsync(
     CUDA_ER(cudaGetLastError());
 
     ////////////////////////////////////////////////////////////////////////////
-    swapIndexStrean();
+    swapIndexStream();
     parameters.hostNonce = jobInfo.nonce;
     blake3Search(cuStream[currentIndexStream],
                  parameters,
@@ -109,7 +109,7 @@ bool resolver::ResolverNvidiaBlake3::executeAsync(
                  threads);
 
     ////////////////////////////////////////////////////////////////////////////
-    swapIndexStrean();
+    swapIndexStream();
     algo::blake3::Result* resultCache { &parameters.resultCache[currentIndexStream] };
     if (true == resultCache->found)
     {
@@ -135,7 +135,7 @@ bool resolver::ResolverNvidiaBlake3::executeAsync(
     }
 
     ////////////////////////////////////////////////////////////////////////////
-    swapIndexStrean();
+    swapIndexStream();
 
     return true;
 }
diff --git a/sources/resolver/nvidia/etchash.cpp b/sources/resolver/nvidia/etchash.cpp
index f53e612..75e8542 100644
--- a/sources/resolver/nvidia/etchash.cpp
+++ b/sources/resolver/nvidia/etchash.cpp
@@ -25,8 +25,8 @@ bool resolver::ResolverNvidiaEtchash::updateContext(
         dagCountItemsInit,
         lightCacheCountItemsGrowth,
         lightCacheCountItemsInit,
-        config.deviceAlgorithm.ethashBuildLightCacheCPU
     );
+    algo::ethash::buildLightCache(context, config.deviceAlgorithm.ethashBuildLightCacheCPU);
 
     ///////////////////////////////////////////////////////////////////////////
     if (   context.lightCache.numberItem == 0ull
diff --git a/sources/resolver/nvidia/ethash.cpp b/sources/resolver/nvidia/ethash.cpp
index 49ad852..e9bd014 100644
--- a/sources/resolver/nvidia/ethash.cpp
+++ b/sources/resolver/nvidia/ethash.cpp
@@ -29,9 +29,9 @@ bool resolver::ResolverNvidiaEthash::updateContext(
         dagCountItemsGrowth,
         dagCountItemsInit,
         lightCacheCountItemsGrowth,
-        lightCacheCountItemsInit,
-        config.deviceAlgorithm.ethashBuildLightCacheCPU
+        lightCacheCountItemsInit
     );
+    algo::ethash::buildLightCache(context, config.deviceAlgorithm.ethashBuildLightCacheCPU);
 
     if (   context.lightCache.numberItem == 0ull
         || context.lightCache.size == 0ull
@@ -185,7 +185,7 @@ bool resolver::ResolverNvidiaEthash::executeAsync(
     CUDA_ER(cudaGetLastError());
 
     ////////////////////////////////////////////////////////////////////////////
-    swapIndexStrean();
+    swapIndexStream();
     ethashSearch(cuStream[currentIndexStream],
                  &parameters.resultCache[currentIndexStream],
                  blocks,
@@ -193,7 +193,7 @@ bool resolver::ResolverNvidiaEthash::executeAsync(
                  jobInfo.nonce);
 
     ////////////////////////////////////////////////////////////////////////////
-    swapIndexStrean();
+    swapIndexStream();
     algo::ethash::Result* resultCache{ &parameters.resultCache[currentIndexStream] };
     if (true == resultCache->found)
     {
@@ -217,7 +217,7 @@ bool resolver::ResolverNvidiaEthash::executeAsync(
     }
 
     ////////////////////////////////////////////////////////////////////////////
-    swapIndexStrean();
+    swapIndexStream();
 
     return true;
 }
diff --git a/sources/resolver/nvidia/progpow.cpp b/sources/resolver/nvidia/progpow.cpp
index 5994179..460cfc4 100644
--- a/sources/resolver/nvidia/progpow.cpp
+++ b/sources/resolver/nvidia/progpow.cpp
@@ -31,9 +31,9 @@ bool resolver::ResolverNvidiaProgPOW::updateContext(
         dagCountItemsGrowth,
         dagCountItemsInit,
         lightCacheCountItemsGrowth,
-        lightCacheCountItemsInit,
-        config.deviceAlgorithm.ethashBuildLightCacheCPU
+        lightCacheCountItemsInit
     );
+    algo::ethash::buildLightCache(context, config.deviceAlgorithm.ethashBuildLightCacheCPU);
 
     if (   context.lightCache.numberItem == 0ull
         || context.lightCache.size == 0ull
@@ -260,7 +260,7 @@ bool resolver::ResolverNvidiaProgPOW::buildSearch()
     kernelGenerator.addDefine("COUNT_DAG", algo::progpow::COUNT_DAG);
     kernelGenerator.addDefine("STATE_LEN", 25u);
     kernelGenerator.addDefine("TOTAL_THREADS", getThreads() * getBlocks());
-    if (std::nullopt != config.occupancy.internalLoop )
+    if (std::nullopt != config.occupancy.internalLoop)
     {
         uint32_t const internalLoop{ *config.occupancy.internalLoop };
         kernelGenerator.addDefine("INTERNAL_LOOP", internalLoop);
@@ -397,7 +397,7 @@ bool resolver::ResolverNvidiaProgPOW::executeAsync(
     CUDA_ER(cudaGetLastError());
 
     ////////////////////////////////////////////////////////////////////////////
-    swapIndexStrean();
+    swapIndexStream();
     uint64_t nonce{ jobInfo.nonce };
     uint64_t boundary{ jobInfo.boundaryU64 };
     algo::progpow::Result* result{ &parameters.resultCache[currentIndexStream] };
@@ -418,7 +418,7 @@ bool resolver::ResolverNvidiaProgPOW::executeAsync(
                          nullptr));
 
     ////////////////////////////////////////////////////////////////////////////
-    swapIndexStrean();
+    swapIndexStream();
     algo::progpow::Result* resultCache { &parameters.resultCache[currentIndexStream] };
     if (true == resultCache->found)
     {
@@ -450,7 +450,7 @@ bool resolver::ResolverNvidiaProgPOW::executeAsync(
     }
 
     ////////////////////////////////////////////////////////////////////////////
-    swapIndexStrean();
+    swapIndexStream();
 
     ////////////////////////////////////////////////////////////////////////////
     return true;
diff --git a/sources/resolver/resolver.cpp b/sources/resolver/resolver.cpp
index c613ba0..678c451 100644
--- a/sources/resolver/resolver.cpp
+++ b/sources/resolver/resolver.cpp
@@ -4,7 +4,7 @@
 #include <resolver/resolver.hpp>
 
 
-void resolver::Resolver::swapIndexStrean()
+void resolver::Resolver::swapIndexStream()
 {
     if (0u == currentIndexStream)
     {
diff --git a/sources/resolver/resolver.hpp b/sources/resolver/resolver.hpp
index 0af9221..812ca61 100644
--- a/sources/resolver/resolver.hpp
+++ b/sources/resolver/resolver.hpp
@@ -23,7 +23,7 @@ namespace resolver
         Resolver& operator=(Resolver const&) = delete;
         Resolver& operator=(Resolver&&) = delete;
 
-        void swapIndexStrean();
+        void swapIndexStream();
         void setBlocks(uint32_t const newBlocks);
         void setThreads(uint32_t const newThreads);
         uint32_t getBlocks() const;