diff --git a/sources/algo/ethash/CMakeLists.txt b/sources/algo/ethash/CMakeLists.txt index 8e28f54..39c9f34 100644 --- a/sources/algo/ethash/CMakeLists.txt +++ b/sources/algo/ethash/CMakeLists.txt @@ -15,6 +15,14 @@ if (BUILD_EXE_UNIT_TEST) ) endif() +if (BUILD_EXE_MINER) + target_sources(${BENCH_EXE} PUBLIC + ${HEADERS} + ${SOURCES} + ) +endif() + + add_subdirectory(opencl) add_subdirectory(tests) diff --git a/sources/algo/ethash/ethash.cpp b/sources/algo/ethash/ethash.cpp index 4338337..fa7b5df 100644 --- a/sources/algo/ethash/ethash.cpp +++ b/sources/algo/ethash/ethash.cpp @@ -71,8 +71,7 @@ void algo::ethash::initializeDagContext( uint64_t const dagCountItemsGrowth, uint64_t const dagCountItemsInit, uint32_t const lightCacheCountItemsGrowth, - uint32_t const lightCacheCountItemsInit, - bool const buildOnCPU) + uint32_t const lightCacheCountItemsInit) { //////////////////////////////////////////////////////////////////////////// context.epoch = castU32(currentEpoch); @@ -115,12 +114,23 @@ void algo::ethash::initializeDagContext( seed = algo::keccak(seed); } algo::copyHash(context.originalSeedCache, seed); +} + + +void algo::ethash::buildLightCache( + algo::DagContext& context, + bool const buildOnCPU) +{ + //////////////////////////////////////////////////////////////////////////// + algo::hash512 item + { + algo::keccak(context.originalSeedCache) + }; //////////////////////////////////////////////////////////////////////////// if (false == buildOnCPU) { - algo::hash512 const hashedSeed{ algo::keccak(seed) }; - algo::copyHash(context.hashedSeedCache, hashedSeed); + algo::copyHash(context.hashedSeedCache, item); return; } @@ -133,27 +143,12 @@ void algo::ethash::initializeDagContext( logErr() << "Cannot alloc context data"; return; } - - //////////////////////////////////////////////////////////////////////////// context.lightCache.hash = castPtrHash512(context.data + algo::LEN_HASH_512); + context.lightCache.hash[0] = item; //////////////////////////////////////////////////////////////////////////// - if (true == buildOnCPU) - { - logInfo() << "Building light cache on CPU"; - common::ChronoGuard chrono{ "Built light cache", common::CHRONO_UNIT::MS }; - buildLightCache(context, seed); - } -} - - -void algo::ethash::buildLightCache( - algo::DagContext& context, - algo::hash256 const& seed) -{ - //////////////////////////////////////////////////////////////////////////// - algo::hash512 item{ algo::keccak(seed) }; - context.lightCache.hash[0] = item; + logInfo() << "Building light cache on CPU"; + common::ChronoGuard chrono{ "Built light cache", common::CHRONO_UNIT::MS }; //////////////////////////////////////////////////////////////////////////// for (uint64_t i{ 1ull }; i < context.lightCache.numberItem; ++i) diff --git a/sources/algo/ethash/ethash.hpp b/sources/algo/ethash/ethash.hpp index a04a98f..4156814 100644 --- a/sources/algo/ethash/ethash.hpp +++ b/sources/algo/ethash/ethash.hpp @@ -40,11 +40,10 @@ namespace algo uint64_t const dagCountItemsGrowth, uint64_t const dagCountItemsInit, uint32_t const lightCacheCountItemsGrowth, - uint32_t const lightCacheCountItemsInit, - bool const buildOnCPU); + uint32_t const lightCacheCountItemsInit); void freeDagContext(algo::DagContext& context); void buildLightCache(algo::DagContext& context, - algo::hash256 const& seed); + bool const buildOnCPU); #endif } } diff --git a/sources/algo/ethash/tests/ethash.cpp b/sources/algo/ethash/tests/ethash.cpp index 7d9a873..2a32b79 100644 --- a/sources/algo/ethash/tests/ethash.cpp +++ b/sources/algo/ethash/tests/ethash.cpp @@ -63,9 +63,9 @@ TEST_F(EthashTest, lightCacheBuild) dagCountItemsGrowth, dagCountItemsInit, lightCacheCountItemsGrowth, - lightCacheCountItemsInit, - true + lightCacheCountItemsInit ); + algo::ethash::buildLightCache(context, true); // light cache size ASSERT_EQ(1411061ull, context.lightCache.numberItem); diff --git a/sources/algo/progpow/opencl/progpow.cl b/sources/algo/progpow/opencl/progpow.cl index 2bdd579..8d1dc0a 100644 --- a/sources/algo/progpow/opencl/progpow.cl +++ b/sources/algo/progpow/opencl/progpow.cl @@ -122,60 +122,69 @@ void progpow_search( uint const thread_id = get_global_id(0) + (get_global_id(1) * GROUP_SIZE); uint const lane_id = thread_id % LANES; uint const worker_group = get_global_id(0) / LANES; - ulong const nonce = start_nonce + thread_id; + ulong nonce = start_nonce + thread_id; uint const index_share_seed = get_global_id(0) / BATCH_GROUP_LANE; //////////////////////////////////////////////////////////////////////// initialize_header(dag, header_dag, (thread_id % GROUP_SIZE)); ulong const seed = initialize_seed(header, state_mix, nonce); +#if defined(INTERNAL_LOOP) __attribute__((opencl_unroll_hint(1))) - for (uint l_id = 0u; l_id < LANES; ++l_id) + for (uint i = 0; i < INTERNAL_LOOP; ++i) { - //////////////////////////////////////////////////////////////////////// - if (l_id == lane_id) + nonce += (i * TOTAL_THREADS); +#endif + __attribute__((opencl_unroll_hint(1))) + for (uint l_id = 0u; l_id < LANES; ++l_id) { - share_msb_lsb[index_share_seed] = seed; + //////////////////////////////////////////////////////////////////////// + if (l_id == lane_id) + { + share_msb_lsb[index_share_seed] = seed; + } + barrier(CLK_LOCAL_MEM_FENCE); + + //////////////////////////////////////////////////////////////////////// + ulong const seedShare = share_msb_lsb[index_share_seed]; + + //////////////////////////////////////////////////////////////////////// + fill_hash(hash, lane_id, seedShare); + loop_math(dag, share_hash0, header_dag, hash, lane_id, worker_group); + reduce_hash( + share_fnv1a, + hash, + digest, + worker_group, + l_id == lane_id); } - barrier(CLK_LOCAL_MEM_FENCE); //////////////////////////////////////////////////////////////////////// - ulong const seedShare = share_msb_lsb[index_share_seed]; - - //////////////////////////////////////////////////////////////////////// - fill_hash(hash, lane_id, seedShare); - loop_math(dag, share_hash0, header_dag, hash, lane_id, worker_group); - reduce_hash( - share_fnv1a, - hash, - digest, - worker_group, - l_id == lane_id); - } + #if defined(__KERNEL_PROGPOW) + ulong const bytes_result = is_valid(header, digest, seed); + #else + ulong const bytes_result = is_valid(state_mix, digest); + #endif - //////////////////////////////////////////////////////////////////////// -#if defined(__KERNEL_PROGPOW) - ulong const bytes_result = is_valid(header, digest, seed); -#else - ulong const bytes_result = is_valid(state_mix, digest); -#endif - - if (bytes_result <= boundary) - { - uint const index = atomic_inc(&result->count); - if (index < MAX_RESULT) + if (bytes_result <= boundary) { - result->found = true; - result->nonces[index] = nonce; - - result->hash[index][0] = digest[0]; - result->hash[index][1] = digest[1]; - result->hash[index][2] = digest[2]; - result->hash[index][3] = digest[3]; - result->hash[index][4] = digest[4]; - result->hash[index][5] = digest[5]; - result->hash[index][6] = digest[6]; - result->hash[index][7] = digest[7]; + uint const index = atomic_inc(&result->count); + if (index < MAX_RESULT) + { + result->found = true; + result->nonces[index] = nonce; + + result->hash[index][0] = digest[0]; + result->hash[index][1] = digest[1]; + result->hash[index][2] = digest[2]; + result->hash[index][3] = digest[3]; + result->hash[index][4] = digest[4]; + result->hash[index][5] = digest[5]; + result->hash[index][6] = digest[6]; + result->hash[index][7] = digest[7]; + } } +#if defined(INTERNAL_LOOP) } +#endif } diff --git a/sources/benchmark/amd/kawpow.cpp b/sources/benchmark/amd/kawpow.cpp index 2b37a7a..5ac9e76 100644 --- a/sources/benchmark/amd/kawpow.cpp +++ b/sources/benchmark/amd/kawpow.cpp @@ -2,6 +2,8 @@ #include +#include +#include #include #include #include @@ -19,14 +21,18 @@ bool benchmark::BenchmarkWorkflow::runAmdKawpow() //////////////////////////////////////////////////////////////////////////// bool dagInitialized{ false }; - uint64_t const dagItems{ 16777213ull }; - uint64_t const dagItemsKawpow{ dagItems / 2ull }; - auto const header + algo::hash256 const headerHash { algo::toHash256("71c967486cb3b70d5dfcb2ebd8eeef138453637cacbf3ccb580a41a7e96986bb") }; + algo::hash256 const seedHash + { + algo::toHash256("7c4fb8a5d141973b69b521ce76b0dc50f0d2834d817c7f8310a6ab5becc6bb0c") + }; + int32_t const epoch{ algo::ethash::findEpoch(seedHash, algo::ethash::EPOCH_LENGTH) }; //////////////////////////////////////////////////////////////////////////// + common::opencl::Buffer lightCache { CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY }; common::opencl::Buffer dagCache{ CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS }; common::opencl::BufferMapped headerCache { @@ -36,14 +42,29 @@ bool benchmark::BenchmarkWorkflow::runAmdKawpow() common::opencl::BufferMapped resultCache{ CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR }; //////////////////////////////////////////////////////////////////////////// - dagCache.setSize(dagItems * algo::LEN_HASH_1024); + algo::DagContext dagContext{}; + algo::ethash::initializeDagContext + ( + dagContext, + epoch, + algo::ethash::MAX_EPOCH_NUMBER, + algo::ethash::DAG_COUNT_ITEMS_GROWTH, + algo::ethash::DAG_COUNT_ITEMS_INIT, + algo::ethash::LIGHT_CACHE_COUNT_ITEMS_GROWTH, + algo::ethash::LIGHT_CACHE_COUNT_ITEMS_INIT + ); + + //////////////////////////////////////////////////////////////////////////// + dagCache.setSize(dagContext.dagCache.size); + + //////////////////////////////////////////////////////////////////////////// dagCache.alloc(propertiesAmd.clContext); headerCache.alloc(&propertiesAmd.clQueue, propertiesAmd.clContext); resultCache.alloc(&propertiesAmd.clQueue, propertiesAmd.clContext); //////////////////////////////////////////////////////////////////////////// if (false == headerCache.setBufferDevice(&propertiesAmd.clQueue, - (uint32_t*)header.word32)) + (uint32_t*)headerHash.word32)) { logErr() << "Fail to copy header in cache"; } @@ -66,7 +87,7 @@ bool benchmark::BenchmarkWorkflow::runAmdKawpow() /////////////////////////////////////////////////////////////////// auto& clKernel{ generator.clKernel }; OPENCL_ER(clKernel.setArg(0u, *dagCache.getBuffer())); - OPENCL_ER(clKernel.setArg(1u, castU32(dagItems))); + OPENCL_ER(clKernel.setArg(1u, castU32(dagContext.dagCache.numberItem))); /////////////////////////////////////////////////////////////////// OPENCL_ER( @@ -87,10 +108,10 @@ bool benchmark::BenchmarkWorkflow::runAmdKawpow() /////////////////////////////////////////////////////////////////////////// auto benchKawpow = [&](std::string const& kernelName, + uint32_t const loop, uint32_t const groupSize, uint32_t const workerGroupCount, - uint32_t const workItemCollaborate, - uint32_t const loop) -> bool + uint32_t const workItemCollaborate) -> bool { /////////////////////////////////////////////////////////////////////// common::KernelGeneratorOpenCL generator{}; @@ -113,7 +134,7 @@ bool benchmark::BenchmarkWorkflow::runAmdKawpow() generator.addDefine("MODULE_CACHE", algo::progpow::MODULE_CACHE); generator.addDefine("HEADER_ITEM_BY_THREAD", algo::progpow::MODULE_CACHE / groupSize); generator.addDefine("COUNT_DAG", algo::progpow::COUNT_DAG); - generator.addDefine("DAG_SIZE", castU32(dagItemsKawpow)); + generator.addDefine("DAG_SIZE", castU32(dagContext.dagCache.numberItem / 2ull)); generator.addDefine("BATCH_GROUP_LANE", batchGroupLane); generator.addDefine("SHARE_SEED_SIZE", batchGroupLane); generator.addDefine("SHARE_HASH0_SIZE", batchGroupLane); @@ -145,10 +166,10 @@ bool benchmark::BenchmarkWorkflow::runAmdKawpow() /////////////////////////////////////////////////////////////////////// auto& clKernel{ generator.clKernel }; - OPENCL_ER(clKernel.setArg(0u, 0ull)); - OPENCL_ER(clKernel.setArg(1u, *headerCache.getBuffer())); - OPENCL_ER(clKernel.setArg(2u, *dagCache.getBuffer())); - OPENCL_ER(clKernel.setArg(3u, *resultCache.getBuffer())); + OPENCL_ER(clKernel.setArg(0u, *dagCache.getBuffer())); + OPENCL_ER(clKernel.setArg(1u, *resultCache.getBuffer())); + OPENCL_ER(clKernel.setArg(2u, *headerCache.getBuffer())); + OPENCL_ER(clKernel.setArg(3u, 0ull)); /////////////////////////////////////////////////////////////////////// setGrid(groupSize, workerGroupCount); @@ -177,11 +198,15 @@ bool benchmark::BenchmarkWorkflow::runAmdKawpow() //////////////////////////////////////////////////////////////////////////// if (true == dagInitialized) { - benchKawpow("kawpow_lm1", 256u, 1024u, algo::progpow::LANES, 1u); // Parallele + LDS - benchKawpow("kawpow_lm2", 256u, 1024u, algo::progpow::LANES, 1u); // Parallel + crosslane - benchKawpow("kawpow_lm3", 256u, 1024u, algo::progpow::LANES, 1u); // Parallel + crosslane + LDS header + benchKawpow("kawpow_lm1", 1u, 256u, 1024u, algo::progpow::LANES); // Parallele + LDS + benchKawpow("kawpow_lm2", 1u, 256u, 1024u, algo::progpow::LANES); // Parallel + crosslane + benchKawpow("kawpow_lm3", 1u, 256u, 1024u, algo::progpow::LANES); // Parallel + crosslane + LDS header + benchKawpow("kawpow_lm4", 1u, 256u, 1024u, algo::progpow::LANES); // Parallel + crosslane + LDS header } + //////////////////////////////////////////////////////////////////////////// + algo::ethash::freeDagContext(dagContext); + //////////////////////////////////////////////////////////////////////////// dagCache.free(); headerCache.free(); diff --git a/sources/benchmark/opencl/CMakeLists.txt b/sources/benchmark/opencl/CMakeLists.txt index e680705..b9cb7fe 100644 --- a/sources/benchmark/opencl/CMakeLists.txt +++ b/sources/benchmark/opencl/CMakeLists.txt @@ -8,6 +8,7 @@ set(BENCHMARK_OPENCL_FILES kawpow/kawpow_lm1.cl kawpow/kawpow_lm2.cl kawpow/kawpow_lm3.cl + kawpow/kawpow_lm4.cl kawpow/sequence_dynamic.cl kawpow/sequence_dynamic_local.cl ) @@ -28,6 +29,7 @@ add_custom_target(copy_benchmark_opencl ALL ${OUT_COMMON}/kawpow/kawpow_lm1.cl ${OUT_COMMON}/kawpow/kawpow_lm2.cl ${OUT_COMMON}/kawpow/kawpow_lm3.cl + ${OUT_COMMON}/kawpow/kawpow_lm4.cl ${OUT_COMMON}/kawpow/sequence_dynamic.cl ${OUT_COMMON}/kawpow/sequence_dynamic_local.cl ) diff --git a/sources/benchmark/opencl/kawpow/kawpow_lm1.cl b/sources/benchmark/opencl/kawpow/kawpow_lm1.cl index 1a17ea0..c99cd4a 100644 --- a/sources/benchmark/opencl/kawpow/kawpow_lm1.cl +++ b/sources/benchmark/opencl/kawpow/kawpow_lm1.cl @@ -149,10 +149,11 @@ void loop_math( barrier(CLK_LOCAL_MEM_FENCE); uint dag_index = share_hash0[worker_group]; - uint fd = dag_index; dag_index %= DAG_SIZE; - dag_index *= WORK_ITEM_COLLABORATE; - dag_index += ((lane_id ^ cnt) % WORK_ITEM_COLLABORATE); + + // TODO: FIX + // dag_index *= WORK_ITEM_COLLABORATE; + // dag_index += ((lane_id ^ cnt) % WORK_ITEM_COLLABORATE); uint4 const entries = dag[dag_index]; sequence_dynamic_local(header_dag, hash, entries); @@ -208,10 +209,10 @@ void initialize_header( __kernel void kawpow_lm1( - ulong const start_nonce, - __constant uint4 const* const restrict header, - __global uint4 const* const restrict dag, - __global t_result* const restrict result) + __global uint const* const restrict dag, + __global t_result* const restrict result, + __constant uint const* const restrict header, + ulong const start_nonce) { __local uint header_dag[MODULE_CACHE]; __local ulong share_msb_lsb[SHARE_SEED_SIZE]; @@ -259,6 +260,8 @@ void kawpow_lm1( /////////////////////////////////////////////////////////////////////////// ulong const bytes_result = is_valid(state_mix, digest); + PRINT_U32_IF("bytes_result", 0u, bytes_result); + PRINT_U32_IF("bytes_result", 16u, bytes_result); if (bytes_result <= 0) { uint const index = atomic_inc(&result->count); diff --git a/sources/benchmark/opencl/kawpow/kawpow_lm2.cl b/sources/benchmark/opencl/kawpow/kawpow_lm2.cl index 45bf938..9f8fdb7 100644 --- a/sources/benchmark/opencl/kawpow/kawpow_lm2.cl +++ b/sources/benchmark/opencl/kawpow/kawpow_lm2.cl @@ -71,8 +71,10 @@ void loop_math( uint dag_index = reg_load(mix0, cnt % WORK_ITEM_COLLABORATE, WORK_ITEM_COLLABORATE); uint fd = dag_index; dag_index %= DAG_SIZE; - dag_index *= WORK_ITEM_COLLABORATE; - dag_index += ((lane_id ^ cnt) % WORK_ITEM_COLLABORATE); + + // TODO: FIX + // dag_index *= WORK_ITEM_COLLABORATE; + // dag_index += ((lane_id ^ cnt) % WORK_ITEM_COLLABORATE); uint4 entries = ((uint4*)dag)[dag_index]; sequence_dynamic(dag, hash, entries); @@ -172,10 +174,10 @@ ulong is_valid( __kernel void kawpow_lm2( - ulong const start_nonce, - __constant uint const* const restrict header, __global uint const* const restrict dag, - __global t_result* const restrict result) + __global t_result* const restrict result, + __constant uint const* const restrict header, + ulong const start_nonce) { /////////////////////////////////////////////////////////////////////////// uint state[STATE_SIZE]; @@ -210,6 +212,8 @@ void kawpow_lm2( /////////////////////////////////////////////////////////////////////////// ulong const bytes_result = is_valid(state, digest); + PRINT_U32_IF("bytes_result", 0u, bytes_result); + PRINT_U32_IF("bytes_result", 16u, bytes_result); if (bytes_result <= 0) { uint const index = atomic_inc(&result->count); diff --git a/sources/benchmark/opencl/kawpow/kawpow_lm3.cl b/sources/benchmark/opencl/kawpow/kawpow_lm3.cl index a49a7ba..1fbd216 100644 --- a/sources/benchmark/opencl/kawpow/kawpow_lm3.cl +++ b/sources/benchmark/opencl/kawpow/kawpow_lm3.cl @@ -7,7 +7,7 @@ void initialize_header( __attribute__((opencl_unroll_hint)) for (uint i = 0u; i < HEADER_ITEM_BY_THREAD; ++i) { - uint const index_dag = i * GROUP_SIZE + thread_id; + uint const index_dag = i * GROUP_SIZE + thread_id; // TODO: mad4 uint const item_dag = dag[index_dag]; header_dag[index_dag] = item_dag; } @@ -80,7 +80,8 @@ void loop_math( __global uint* const restrict dag, __local uint* const restrict header_dag, uint* const restrict hash, - uint const lane_id) + uint const lane_id, +uint const l_id) { __attribute__((opencl_unroll_hint(1))) for (uint cnt = 0u; cnt < COUNT_DAG; ++cnt) @@ -89,8 +90,10 @@ void loop_math( uint dag_index = reg_load(mix0, cnt % WORK_ITEM_COLLABORATE, WORK_ITEM_COLLABORATE); uint fd = dag_index; dag_index %= DAG_SIZE; - dag_index *= WORK_ITEM_COLLABORATE; - dag_index += ((lane_id ^ cnt) % WORK_ITEM_COLLABORATE); + + // TODO: FIX + // dag_index *= WORK_ITEM_COLLABORATE; + // dag_index += ((lane_id ^ cnt) % WORK_ITEM_COLLABORATE); uint4 entries = ((uint4*)dag)[dag_index]; sequence_dynamic_local(header_dag, hash, entries); @@ -190,10 +193,10 @@ ulong is_valid( __kernel void kawpow_lm3( - ulong const start_nonce, - __constant uint const* const restrict header, __global uint const* const restrict dag, - __global t_result* const restrict result) + __global t_result* const restrict result, + __constant uint const* const restrict header, + ulong const start_nonce) { /////////////////////////////////////////////////////////////////////////// __local uint header_dag[MODULE_CACHE]; @@ -228,12 +231,14 @@ void kawpow_lm3( fill_hash(hash, lane_id % WORK_ITEM_COLLABORATE, lane_lsb, lane_msb, l_id); /////////////////////////////////////////////////////////////////////// - loop_math(dag, header_dag, hash, lane_id % WORK_ITEM_COLLABORATE); + loop_math(dag, header_dag, hash, lane_id % WORK_ITEM_COLLABORATE, l_id); reduce_hash(hash, digest, l_id == lane_id % WORK_ITEM_COLLABORATE); } /////////////////////////////////////////////////////////////////////////// ulong const bytes_result = is_valid(state, digest); + PRINT_U32_IF("bytes_result", 0u, bytes_result); + PRINT_U32_IF("bytes_result", 16u, bytes_result); if (bytes_result <= 0) { uint const index = atomic_inc(&result->count); diff --git a/sources/benchmark/opencl/kawpow/kawpow_lm4.cl b/sources/benchmark/opencl/kawpow/kawpow_lm4.cl new file mode 100644 index 0000000..da9dba9 --- /dev/null +++ b/sources/benchmark/opencl/kawpow/kawpow_lm4.cl @@ -0,0 +1,251 @@ +inline +void initialize_header( + __global uint const* restrict const dag, + __local uint * restrict const header_dag, + uint const thread_id) +{ + __attribute__((opencl_unroll_hint)) + for (uint i = 0u; i < HEADER_ITEM_BY_THREAD; ++i) + { + uint const index_dag = i * GROUP_SIZE + thread_id; // TODO: mad4 + uint const item_dag = dag[index_dag]; + header_dag[index_dag] = item_dag; + } + barrier(CLK_LOCAL_MEM_FENCE); +} + + +inline +void initialize_state( + __constant uint const* restrict header, + uint* const restrict state, + ulong const nonce) +{ + __attribute__((opencl_unroll_hint)) + for (uint i = 0; i < 8u; ++i) + { + state[i] = header[i]; + } + + state[8] = nonce; + state[9] = (nonce >> 32); + + state[10] = 'r'; + state[11] = 'A'; + state[12] = 'V'; + state[13] = 'E'; + state[14] = 'N'; + + state[15] = 'C'; + state[16] = 'O'; + state[17] = 'I'; + state[18] = 'N'; + + state[19] = 'K'; + state[20] = 'A'; + state[21] = 'W'; + state[22] = 'P'; + state[23] = 'O'; + state[24] = 'W'; + + keccak_f800(state); +} + + +inline +void fill_hash( + uint* const restrict hash, + uint const lane_id, + uint const lsb, + uint const msb, + uint const l_id) +{ + uint4 data; + + data.x = fnv1a_u32(FNV1_OFFSET, msb); + data.y = fnv1a_u32(data.x, lsb); + data.z = fnv1a_u32(data.y, lane_id); + data.w = fnv1a_u32(data.z, lane_id); + + __attribute__((opencl_unroll_hint)) + for (uint i = 0u; i < REGS; ++i) + { + hash[i] = kiss99(&data); + } +} + + +inline +void loop_math( + __global uint* const restrict dag, + __local uint* const restrict header_dag, + uint* const restrict hash, + uint const lane_id, +uint const l_id) +{ + __attribute__((opencl_unroll_hint(1))) + for (uint cnt = 0u; cnt < COUNT_DAG; ++cnt) + { + uint const mix0 = hash[0]; + uint dag_index = reg_load(mix0, cnt % WORK_ITEM_COLLABORATE, WORK_ITEM_COLLABORATE); + uint fd = dag_index; + dag_index %= DAG_SIZE; + + // TODO: FIX + // dag_index *= WORK_ITEM_COLLABORATE; + // dag_index += ((lane_id ^ cnt) % WORK_ITEM_COLLABORATE); + + uint4 entries = ((uint4*)dag)[dag_index]; + sequence_dynamic_local(header_dag, hash, entries); + } +} + + +inline +void reduce_hash( + uint* const restrict hash, + uint* const restrict digest, + bool const is_same_lane) +{ + uint value = FNV1_OFFSET; + + __attribute__((opencl_unroll_hint)) + for (uint i = 0u; i < REGS; ++i) + { + value = fnv1a_u32(value, hash[i]); + } + + uint tmp[DIGEST_SIZE]; + __attribute__((opencl_unroll_hint)) + for (uint i = 0u; i < DIGEST_SIZE; ++i) + { + tmp[i] = reg_load(value, i, WORK_ITEM_COLLABORATE); + } + + if (true == is_same_lane) + { + __attribute__((opencl_unroll_hint)) + for (uint i = 0u; i < DIGEST_SIZE; ++i) + { + digest[i] = tmp[i]; + } + } +} + + +inline +ulong sha3( + uint const* const restrict digest_1, + uint* const restrict digest_2) +{ + uint state[STATE_SIZE]; + + __attribute__((opencl_unroll_hint)) + for (uint i = 0u; i < 8u; ++i) + { + state[i] = digest_1[i]; + } + + state[8] = digest_2[0]; + state[9] = digest_2[1]; + state[10] = digest_2[2]; + state[11] = digest_2[3]; + state[12] = digest_2[4]; + state[13] = digest_2[5]; + state[14] = digest_2[6]; + state[15] = digest_2[7]; + + state[16] = 'r'; + state[17] = 'A'; + state[18] = 'V'; + state[19] = 'E'; + state[20] = 'N'; + + state[21] = 'C'; + state[22] = 'O'; + state[23] = 'I'; + state[24] = 'N'; + + keccak_f800(state); + + ulong const res = ((ulong)state[1]) << 32 | state[0]; + return as_ulong(as_uchar8(res).s76543210); +} + + +inline +ulong is_valid( + uint const* const restrict state, + uint* const restrict digest) +{ + digest[0] = fnv1a_u32(fnv1a_u32(FNV1_OFFSET, digest[0]), digest[8]); + digest[1] = fnv1a_u32(fnv1a_u32(FNV1_OFFSET, digest[1]), digest[9]); + digest[2] = fnv1a_u32(fnv1a_u32(FNV1_OFFSET, digest[2]), digest[10]); + digest[3] = fnv1a_u32(fnv1a_u32(FNV1_OFFSET, digest[3]), digest[11]); + digest[4] = fnv1a_u32(fnv1a_u32(FNV1_OFFSET, digest[4]), digest[12]); + digest[5] = fnv1a_u32(fnv1a_u32(FNV1_OFFSET, digest[5]), digest[13]); + digest[6] = fnv1a_u32(fnv1a_u32(FNV1_OFFSET, digest[6]), digest[14]); + digest[7] = fnv1a_u32(fnv1a_u32(FNV1_OFFSET, digest[7]), digest[15]); + + return sha3(state, digest); +} + + +__kernel +void kawpow_lm4( + __global uint const* const restrict dag, + __global t_result* const restrict result, + __constant uint const* const restrict header, + ulong const start_nonce) +{ + /////////////////////////////////////////////////////////////////////////// + __local uint header_dag[MODULE_CACHE]; + + /////////////////////////////////////////////////////////////////////////// + uint state[STATE_SIZE]; + uint digest[DIGEST_SIZE]; + uint hash[HASH_SIZE]; + + /////////////////////////////////////////////////////////////////////////// + uint const thread_id = get_thread_id_2d(); + uint const lane_id = get_sub_group_local_id(); + ulong const nonce = start_nonce + thread_id; + + /////////////////////////////////////////////////////////////////////////// + initialize_header(dag, header_dag, get_local_id(0)); + + /////////////////////////////////////////////////////////////////////////// + initialize_state(header, state, nonce); + uint msb = state[0]; + uint lsb = state[1]; + + /////////////////////////////////////////////////////////////////////////// + __attribute__((opencl_unroll_hint(1))) + for (uint l_id = 0u; l_id < WORK_ITEM_COLLABORATE; ++l_id) + { + /////////////////////////////////////////////////////////////////////// + uint const lane_msb = reg_load(msb, l_id, WORK_ITEM_COLLABORATE); + uint const lane_lsb = reg_load(lsb, l_id, WORK_ITEM_COLLABORATE); + + /////////////////////////////////////////////////////////////////////// + fill_hash(hash, lane_id % WORK_ITEM_COLLABORATE, lane_lsb, lane_msb, l_id); + + /////////////////////////////////////////////////////////////////////// + loop_math(dag, header_dag, hash, lane_id % WORK_ITEM_COLLABORATE, l_id); + reduce_hash(hash, digest, l_id == lane_id % WORK_ITEM_COLLABORATE); + } + + /////////////////////////////////////////////////////////////////////////// + ulong const bytes_result = is_valid(state, digest); + PRINT_U32_IF("bytes_result", 0u, bytes_result); + PRINT_U32_IF("bytes_result", 16u, bytes_result); + if (bytes_result <= 0) + { + uint const index = atomic_inc(&result->count); + if (index < 1) + { + result->found = true; + result->nonce = nonce; + } + } +} diff --git a/sources/common/opencl/cross_lane.cl b/sources/common/opencl/cross_lane.cl index 3852a59..6a1e294 100644 --- a/sources/common/opencl/cross_lane.cl +++ b/sources/common/opencl/cross_lane.cl @@ -1,24 +1,47 @@ - +/////////////////////////////////////////////////////////////////////////////// +// WAVEFRONT 32 +/////////////////////////////////////////////////////////////////////////////// #if WAVEFRONT == 32 - inline uint reg_load( + inline + uint reg_load( uint const var, - uint const lane_target, + uint const lane_id, uint const width) { +#if defined CROSSLANE_BY_LDS + uint const gap = get_thread_id_2d() / width; + uint const value = sub_group_shuffle(var, gap + lane_id); + + return value; +#else uint const local_id = get_sub_group_local_id(); uint const group_id = local_id / width; - uint const val_group_0 = sub_group_broadcast(var, lane_target); - uint const val_group_1 = sub_group_broadcast(var, width + lane_target); + uint const val_group_0 = sub_group_broadcast(var, lane_id); + uint const val_group_1 = sub_group_broadcast(var, width + lane_id); return (group_id == 0u) ? val_group_0 : val_group_1; +#endif } -#else // WAVEFRONT == 64 - inline uint reg_load( +#endif // WAVEFRONT == 32 + + +/////////////////////////////////////////////////////////////////////////////// +// WAVEFRONT 64 +/////////////////////////////////////////////////////////////////////////////// +#if WAVEFRONT == 64 + inline + uint reg_load( uint const var, uint const lane_target, uint const width) { +#if defined CROSSLANE_BY_LDS + uint const gap = get_thread_id_2d() / width; + uint const value = sub_group_shuffle(var, gap + lane_id); + + return value; +#else uint const local_id = get_sub_group_local_id(); uint const group_id = local_id / width; @@ -37,5 +60,6 @@ } return result; - } #endif + } +#endif // WAVEFRONT == 64 diff --git a/sources/device/amd.cpp b/sources/device/amd.cpp index 78133d0..9ba1cd9 100644 --- a/sources/device/amd.cpp +++ b/sources/device/amd.cpp @@ -11,7 +11,8 @@ bool device::DeviceAmd::initialize() cleanUp(); clContext = cl::Context(clDevice); - clQueue = cl::CommandQueue(clContext, clDevice); + clQueue[0] = cl::CommandQueue(clContext, clDevice); + clQueue[1] = cl::CommandQueue(clContext, clDevice); resolver::ResolverAmd* const resolverAmd{ dynamic_cast(resolver) }; if (nullptr == resolverAmd) @@ -21,7 +22,7 @@ bool device::DeviceAmd::initialize() resolverAmd->setDevice(&clDevice); resolverAmd->setContext(&clContext); - resolverAmd->setQueue(&clQueue); + resolverAmd->setQueue(clQueue); return true; } diff --git a/sources/device/amd.hpp b/sources/device/amd.hpp index a66e684..2b4a34b 100644 --- a/sources/device/amd.hpp +++ b/sources/device/amd.hpp @@ -16,7 +16,7 @@ namespace device protected: cl::Context clContext{}; - cl::CommandQueue clQueue{}; + cl::CommandQueue clQueue[2]{}; bool initialize() final; void cleanUp() final; diff --git a/sources/resolver/amd/amd.cpp b/sources/resolver/amd/amd.cpp index b810816..221ecd1 100644 --- a/sources/resolver/amd/amd.cpp +++ b/sources/resolver/amd/amd.cpp @@ -20,7 +20,8 @@ void resolver::ResolverAmd::setContext( void resolver::ResolverAmd::setQueue( cl::CommandQueue* const queue) { - clQueue = queue; + clQueue[0] = &queue[0]; + clQueue[1] = &queue[1]; } diff --git a/sources/resolver/amd/amd.hpp b/sources/resolver/amd/amd.hpp index a1e2a88..9ada547 100644 --- a/sources/resolver/amd/amd.hpp +++ b/sources/resolver/amd/amd.hpp @@ -10,7 +10,6 @@ namespace resolver class ResolverAmd : public resolver::Resolver { public: - virtual ~ResolverAmd() = default; void setDevice(cl::Device* const device); @@ -22,7 +21,7 @@ namespace resolver protected: cl::Context* clContext{ nullptr }; cl::Device* clDevice{ nullptr }; - cl::CommandQueue* clQueue{ nullptr }; + cl::CommandQueue* clQueue[2]{ nullptr, nullptr }; void overrideOccupancy(uint32_t const defaultThreads, uint32_t const defaultBlocks) final; diff --git a/sources/resolver/amd/autolykos_v2.cpp b/sources/resolver/amd/autolykos_v2.cpp index df35133..d644e51 100644 --- a/sources/resolver/amd/autolykos_v2.cpp +++ b/sources/resolver/amd/autolykos_v2.cpp @@ -25,7 +25,9 @@ bool resolver::ResolverAmdAutolykosV2::updateMemory( stratum::StratumJobInfo const& jobInfo) { //////////////////////////////////////////////////////////////////////////// - if (nullptr == clContext || nullptr == clQueue) + if ( nullptr == clContext + || nullptr == clQueue[0] + || nullptr == clQueue[1]) { return false; } @@ -66,9 +68,9 @@ bool resolver::ResolverAmdAutolykosV2::updateMemory( //////////////////////////////////////////////////////////////////////////// if ( false == parameters.BHashes.alloc(*clContext) || false == parameters.dagCache.alloc(*clContext) - || false == parameters.boundaryCache.alloc(clQueue, *clContext) - || false == parameters.headerCache.alloc(clQueue, *clContext) - || false == parameters.resultCache.alloc(clQueue, *clContext)) + || false == parameters.boundaryCache.alloc(clQueue[currentIndexStream], *clContext) + || false == parameters.headerCache.alloc(clQueue[currentIndexStream], *clContext) + || false == parameters.resultCache.alloc(clQueue[currentIndexStream], *clContext)) { return false; } @@ -113,14 +115,14 @@ bool resolver::ResolverAmdAutolykosV2::updateConstants( //////////////////////////////////////////////////////////////////////////// uint32_t const* const boundary { jobInfo.boundary.word32 }; - if (false == parameters.boundaryCache.setBufferDevice(clQueue, boundary)) + if (false == parameters.boundaryCache.setBufferDevice(clQueue[currentIndexStream], boundary)) { return false; } //////////////////////////////////////////////////////////////////////////// uint32_t const* const header { jobInfo.headerHash.word32 }; - if (false == parameters.headerCache.setBufferDevice(clQueue, header)) + if (false == parameters.headerCache.setBufferDevice(clQueue[currentIndexStream], header)) { return false; } @@ -183,12 +185,12 @@ bool resolver::ResolverAmdAutolykosV2::fillDAG() uint32_t const blockDim { algo::autolykos_v2::AMD_BLOCK_DIM }; uint32_t globalDimX { ((parameters.hostPeriod / blockDim) + 1) * blockDim }; OPENCL_ER( - clQueue->enqueueNDRangeKernel( + clQueue[currentIndexStream]->enqueueNDRangeKernel( clKernel, cl::NullRange, cl::NDRange(globalDimX, 1, 1), cl::NDRange(blockDim, 1, 1))); - OPENCL_ER(clQueue->finish()); + OPENCL_ER(clQueue[currentIndexStream]->finish()); return true; } @@ -281,12 +283,12 @@ bool resolver::ResolverAmdAutolykosV2::executeSync( OPENCL_ER(clKernel.setArg(3u, parameters.hostNonce)); OPENCL_ER(clKernel.setArg(4u, parameters.hostPeriod)); OPENCL_ER( - clQueue->enqueueNDRangeKernel( + clQueue[currentIndexStream]->enqueueNDRangeKernel( clKernel, cl::NullRange, cl::NDRange(maxGroupSizeSearch, 1, 1), cl::NDRange(algo::autolykos_v2::AMD_BLOCK_DIM, 1, 1))); - OPENCL_ER(clQueue->finish()); + OPENCL_ER(clQueue[currentIndexStream]->finish()); //////////////////////////////////////////////////////////////////////////// auto& clKernelVerify { kernelGeneratorVerify.clKernel }; @@ -298,12 +300,12 @@ bool resolver::ResolverAmdAutolykosV2::executeSync( OPENCL_ER(clKernelVerify.setArg(5u, parameters.hostPeriod)); OPENCL_ER(clKernelVerify.setArg(6u, parameters.hostHeight)); OPENCL_ER( - clQueue->enqueueNDRangeKernel( + clQueue[currentIndexStream]->enqueueNDRangeKernel( clKernelVerify, cl::NullRange, cl::NDRange(maxGroupSizeVerify, 1, 1), cl::NDRange(algo::autolykos_v2::AMD_BLOCK_DIM, 1, 1))); - OPENCL_ER(clQueue->finish()); + OPENCL_ER(clQueue[currentIndexStream]->finish()); //////////////////////////////////////////////////////////////////////////// if (false == getResultCache(jobInfo.jobIDStr, @@ -334,13 +336,13 @@ bool resolver::ResolverAmdAutolykosV2::getResultCache( algo::hash256 boundary{}; //////////////////////////////////////////////////////////////////////////// - if (false == parameters.resultCache.getBufferHost(clQueue, &data)) + if (false == parameters.resultCache.getBufferHost(clQueue[currentIndexStream], &data)) { return false; } //////////////////////////////////////////////////////////////////////////// - if (false == parameters.boundaryCache.getBufferHost(clQueue, boundary.word32)) + if (false == parameters.boundaryCache.getBufferHost(clQueue[currentIndexStream], boundary.word32)) { return false; } @@ -384,7 +386,7 @@ bool resolver::ResolverAmdAutolykosV2::getResultCache( resultShare.jobId.assign(_jobId); } - if (false == parameters.resultCache.resetBufferHost(clQueue)) + if (false == parameters.resultCache.resetBufferHost(clQueue[currentIndexStream])) { return false; } diff --git a/sources/resolver/amd/etchash.cpp b/sources/resolver/amd/etchash.cpp index 40113bf..8ea9fcd 100644 --- a/sources/resolver/amd/etchash.cpp +++ b/sources/resolver/amd/etchash.cpp @@ -26,9 +26,9 @@ bool resolver::ResolverAmdEtchash::updateContext( dagCountItemsGrowth, dagCountItemsInit, lightCacheCountItemsGrowth, - lightCacheCountItemsInit, - config.deviceAlgorithm.ethashBuildLightCacheCPU + lightCacheCountItemsInit ); + algo::ethash::buildLightCache(context, config.deviceAlgorithm.ethashBuildLightCacheCPU); if ( context.lightCache.numberItem == 0ull || context.lightCache.size == 0ull diff --git a/sources/resolver/amd/ethash.cpp b/sources/resolver/amd/ethash.cpp index c53b206..7f82ea0 100644 --- a/sources/resolver/amd/ethash.cpp +++ b/sources/resolver/amd/ethash.cpp @@ -36,9 +36,10 @@ bool resolver::ResolverAmdEthash::updateContext( dagCountItemsGrowth, dagCountItemsInit, lightCacheCountItemsGrowth, - lightCacheCountItemsInit, - true // TODO: config.deviceAlgorithm.ethashBuildLightCacheCPU + lightCacheCountItemsInit ); + // TODO: config.deviceAlgorithm.ethashBuildLightCacheCPU + algo::ethash::buildLightCache(context, true); if ( context.lightCache.numberItem == 0ull || context.lightCache.size == 0ull @@ -78,7 +79,8 @@ bool resolver::ResolverAmdEthash::updateMemory( { return false; } - if (nullptr == clQueue) + if ( nullptr == clQueue[0] + || nullptr == clQueue[1]) { return false; } @@ -102,8 +104,8 @@ bool resolver::ResolverAmdEthash::updateMemory( //////////////////////////////////////////////////////////////////////////// if ( false == parameters.lightCache.alloc(*clContext) || false == parameters.dagCache.alloc(*clContext) - || false == parameters.headerCache.alloc(clQueue, *clContext) - || false == parameters.resultCache.alloc(clQueue, *clContext)) + || false == parameters.headerCache.alloc(clQueue[currentIndexStream], *clContext) + || false == parameters.resultCache.alloc(clQueue[currentIndexStream], *clContext)) { return false; } @@ -111,7 +113,7 @@ bool resolver::ResolverAmdEthash::updateMemory( //////////////////////////////////////////////////////////////////////////// if (false == parameters.lightCache.write(context.lightCache.hash, context.lightCache.size, - clQueue)) + clQueue[currentIndexStream])) { return false; } @@ -136,7 +138,7 @@ bool resolver::ResolverAmdEthash::updateConstants( { //////////////////////////////////////////////////////////////////////////// uint32_t const* const header { jobInfo.headerHash.word32 }; - if (false == parameters.headerCache.setBufferDevice(clQueue, header)) + if (false == parameters.headerCache.setBufferDevice(clQueue[currentIndexStream], header)) { return false; } @@ -193,12 +195,12 @@ bool resolver::ResolverAmdEthash::buildDAG() uint32_t const maxGroupSize { getMaxGroupSize() }; uint32_t const threadKernel { castU32(context.dagCache.numberItem) / maxGroupSize }; OPENCL_ER( - clQueue->enqueueNDRangeKernel( + clQueue[currentIndexStream]->enqueueNDRangeKernel( clKernel, cl::NullRange, cl::NDRange(maxGroupSize, threadKernel, 1), cl::NDRange(maxGroupSize, 1, 1))); - OPENCL_ER(clQueue->finish()); + OPENCL_ER(clQueue[currentIndexStream]->finish()); //////////////////////////////////////////////////////////////////////////// parameters.lightCache.free(); @@ -263,12 +265,12 @@ bool resolver::ResolverAmdEthash::executeSync( OPENCL_ER(clKernel.setArg(4u, jobInfo.boundaryU64)); OPENCL_ER( - clQueue->enqueueNDRangeKernel( + clQueue[currentIndexStream]->enqueueNDRangeKernel( clKernel, cl::NullRange, cl::NDRange(blocks, threads, 1), cl::NDRange(blocks, 1, 1))); - OPENCL_ER(clQueue->finish()); + OPENCL_ER(clQueue[currentIndexStream]->finish()); if (false == getResultCache(jobInfo.jobIDStr, jobInfo.extraNonceSize)) { @@ -292,7 +294,7 @@ bool resolver::ResolverAmdEthash::getResultCache( { algo::ethash::Result data{}; - if (false == parameters.resultCache.getBufferHost(clQueue, &data)) + if (false == parameters.resultCache.getBufferHost(clQueue[currentIndexStream], &data)) { return false; } @@ -314,7 +316,7 @@ bool resolver::ResolverAmdEthash::getResultCache( resultShare.nonces[i] = data.nonces[i]; } - if (false == parameters.resultCache.resetBufferHost(clQueue)) + if (false == parameters.resultCache.resetBufferHost(clQueue[currentIndexStream])) { return false; } diff --git a/sources/resolver/amd/progpow.cpp b/sources/resolver/amd/progpow.cpp index 1782854..b89691a 100644 --- a/sources/resolver/amd/progpow.cpp +++ b/sources/resolver/amd/progpow.cpp @@ -32,9 +32,10 @@ bool resolver::ResolverAmdProgPOW::updateContext( dagCountItemsGrowth, dagCountItemsInit, lightCacheCountItemsGrowth, - lightCacheCountItemsInit, - true // config.deviceAlgorithm.ethashBuildLightCacheCPU + lightCacheCountItemsInit ); + // TODO: config.deviceAlgorithm.ethashBuildLightCacheCPU + algo::ethash::buildLightCache(context, true); if ( 0ull == context.lightCache.numberItem || 0ull == context.lightCache.size @@ -71,7 +72,7 @@ bool resolver::ResolverAmdProgPOW::updateMemory( stratum::StratumJobInfo const& jobInfo) { IS_NULL(clContext); - IS_NULL(clQueue); + IS_NULL(clQueue[currentIndexStream]); //////////////////////////////////////////////////////////////////////////// if (false == updateContext(jobInfo)) @@ -92,8 +93,8 @@ bool resolver::ResolverAmdProgPOW::updateMemory( //////////////////////////////////////////////////////////////////////////// if ( false == parameters.lightCache.alloc(*clContext) || false == parameters.dagCache.alloc(*clContext) - || false == parameters.headerCache.alloc(clQueue, *clContext) - || false == parameters.resultCache.alloc(clQueue, *clContext)) + || false == parameters.headerCache.alloc(clQueue[currentIndexStream], *clContext) + || false == parameters.resultCache.alloc(clQueue[currentIndexStream], *clContext)) { return false; } @@ -101,7 +102,7 @@ bool resolver::ResolverAmdProgPOW::updateMemory( //////////////////////////////////////////////////////////////////////////// if (false == parameters.lightCache.write(context.lightCache.hash, context.lightCache.size, - clQueue)) + clQueue[currentIndexStream])) { return false; } @@ -140,7 +141,7 @@ bool resolver::ResolverAmdProgPOW::updateConstants( //////////////////////////////////////////////////////////////////////////// uint32_t const* const header { jobInfo.headerHash.word32 }; - if (false == parameters.headerCache.setBufferDevice(clQueue, header)) + if (false == parameters.headerCache.setBufferDevice(clQueue[currentIndexStream], header)) { return false; } @@ -193,12 +194,12 @@ bool resolver::ResolverAmdProgPOW::buildDAG() uint32_t const maxGroupSize { getMaxGroupSize() }; uint32_t const threadKernel { castU32(context.dagCache.numberItem) / maxGroupSize }; OPENCL_ER( - clQueue->enqueueNDRangeKernel( + clQueue[currentIndexStream]->enqueueNDRangeKernel( clKernel, cl::NullRange, cl::NDRange(maxGroupSize, threadKernel, 1), cl::NDRange(maxGroupSize, 1, 1))); - OPENCL_ER(clQueue->finish()); + OPENCL_ER(clQueue[currentIndexStream]->finish()); //////////////////////////////////////////////////////////////////////////// parameters.lightCache.free(); @@ -210,6 +211,9 @@ bool resolver::ResolverAmdProgPOW::buildDAG() bool resolver::ResolverAmdProgPOW::buildSearch() { + //////////////////////////////////////////////////////////////////////////// + auto const& config{ common::Config::instance() }; + //////////////////////////////////////////////////////////////////////////// algo::progpow::writeMathRandomKernelOpenCL(progpowVersion, deviceId, @@ -278,6 +282,12 @@ bool resolver::ResolverAmdProgPOW::buildSearch() kernelGenerator.addDefine("SHARE_FNV1A_SIZE", maxThreadByGroup); kernelGenerator.addDefine("MODULE_CACHE_GROUP", maxThreadByGroup * 4u); kernelGenerator.addDefine("MODULE_LOOP", algo::progpow::MODULE_CACHE / (maxThreadByGroup / 4u)); + kernelGenerator.addDefine("TOTAL_THREADS", getBlocks() * getThreads()); + if (std::nullopt != config.occupancy.internalLoop) + { + uint32_t const internalLoop{ *config.occupancy.internalLoop }; + kernelGenerator.addDefine("INTERNAL_LOOP", internalLoop); + } //////////////////////////////////////////////////////////////////////////// kernelGenerator.addInclude("kernel/common/rotate_byte.cl"); @@ -332,27 +342,28 @@ bool resolver::ResolverAmdProgPOW::buildSearch() bool resolver::ResolverAmdProgPOW::executeSync( stratum::StratumJobInfo const& jobInfo) { + /////////////////////////////////////////////////////////////////////////// auto& clKernel { kernelGenerator.clKernel }; - OPENCL_ER(clKernel.setArg(0u, jobInfo.nonce)); OPENCL_ER(clKernel.setArg(1u, jobInfo.boundaryU64)); OPENCL_ER(clKernel.setArg(2u, *(parameters.headerCache.getBuffer()))); OPENCL_ER(clKernel.setArg(3u, *(parameters.dagCache.getBuffer()))); OPENCL_ER(clKernel.setArg(4u, *(parameters.resultCache.getBuffer()))); - OPENCL_ER( - clQueue->enqueueNDRangeKernel( + clQueue[currentIndexStream]->enqueueNDRangeKernel( clKernel, cl::NullRange, cl::NDRange(blocks, threads, 1), cl::NDRange(blocks, 1, 1))); - OPENCL_ER(clQueue->finish()); + OPENCL_ER(clQueue[currentIndexStream]->finish()); + /////////////////////////////////////////////////////////////////////////// if (false == getResultCache(jobInfo.jobIDStr, jobInfo.extraNonceSize)) { return false; } + /////////////////////////////////////////////////////////////////////////// return true; } @@ -360,7 +371,36 @@ bool resolver::ResolverAmdProgPOW::executeSync( bool resolver::ResolverAmdProgPOW::executeAsync( stratum::StratumJobInfo const& jobInfo) { - return executeSync(jobInfo); + /////////////////////////////////////////////////////////////////////////// + OPENCL_ER(clQueue[currentIndexStream]->finish()); + + /////////////////////////////////////////////////////////////////////////// + swapIndexStream(); + auto& clKernel { kernelGenerator.clKernel }; + OPENCL_ER(clKernel.setArg(0u, jobInfo.nonce)); + OPENCL_ER(clKernel.setArg(1u, jobInfo.boundaryU64)); + OPENCL_ER(clKernel.setArg(2u, *(parameters.headerCache.getBuffer()))); + OPENCL_ER(clKernel.setArg(3u, *(parameters.dagCache.getBuffer()))); + OPENCL_ER(clKernel.setArg(4u, *(parameters.resultCache.getBuffer()))); + OPENCL_ER( + clQueue[currentIndexStream]->enqueueNDRangeKernel( + clKernel, + cl::NullRange, + cl::NDRange(blocks, threads, 1), + cl::NDRange(blocks, 1, 1))); + + /////////////////////////////////////////////////////////////////////////// + swapIndexStream(); + if (false == getResultCache(jobInfo.jobIDStr, jobInfo.extraNonceSize)) + { + return false; + } + + /////////////////////////////////////////////////////////////////////////// + swapIndexStream(); + + /////////////////////////////////////////////////////////////////////////// + return true; } @@ -370,7 +410,7 @@ bool resolver::ResolverAmdProgPOW::getResultCache( { algo::progpow::Result data{}; - if (false == parameters.resultCache.getBufferHost(clQueue, &data)) + if (false == parameters.resultCache.getBufferHost(clQueue[currentIndexStream], &data)) { return false; } @@ -399,7 +439,7 @@ bool resolver::ResolverAmdProgPOW::getResultCache( } } - if (false == parameters.resultCache.resetBufferHost(clQueue)) + if (false == parameters.resultCache.resetBufferHost(clQueue[currentIndexStream])) { return false; } diff --git a/sources/resolver/cpu/progpow.cpp b/sources/resolver/cpu/progpow.cpp index d9cd9bd..f1bab19 100644 --- a/sources/resolver/cpu/progpow.cpp +++ b/sources/resolver/cpu/progpow.cpp @@ -25,9 +25,10 @@ bool resolver::ResolverCpuProgPOW::updateContext(stratum::StratumJobInfo const& dagCountItemsGrowth, dagCountItemsInit, lightCacheCountItemsGrowth, - lightCacheCountItemsInit, - true + lightCacheCountItemsInit ); + // TODO: config.deviceAlgorithm.ethashBuildLightCacheCPU + algo::ethash::buildLightCache(context, true); if ( 0ull == context.lightCache.numberItem || 0ull == context.lightCache.size diff --git a/sources/resolver/mocker.cpp b/sources/resolver/mocker.cpp index 356fcb8..f0e700e 100644 --- a/sources/resolver/mocker.cpp +++ b/sources/resolver/mocker.cpp @@ -48,9 +48,9 @@ bool resolver::ResolverMocker::updateMemory( dagCountItemsGrowth, dagCountItemsInit, lightCacheCountItemsGrowth, - lightCacheCountItemsInit, - config.deviceAlgorithm.ethashBuildLightCacheCPU + lightCacheCountItemsInit ); + algo::ethash::buildLightCache(context, config.deviceAlgorithm.ethashBuildLightCacheCPU); /////////////////////////////////////////////////////////////////////////// boost::this_thread::sleep_for(WAIT_UPDATE_MEMORY); diff --git a/sources/resolver/nvidia/blake3.cpp b/sources/resolver/nvidia/blake3.cpp index 8b65eb5..9134c0d 100644 --- a/sources/resolver/nvidia/blake3.cpp +++ b/sources/resolver/nvidia/blake3.cpp @@ -100,7 +100,7 @@ bool resolver::ResolverNvidiaBlake3::executeAsync( CUDA_ER(cudaGetLastError()); //////////////////////////////////////////////////////////////////////////// - swapIndexStrean(); + swapIndexStream(); parameters.hostNonce = jobInfo.nonce; blake3Search(cuStream[currentIndexStream], parameters, @@ -109,7 +109,7 @@ bool resolver::ResolverNvidiaBlake3::executeAsync( threads); //////////////////////////////////////////////////////////////////////////// - swapIndexStrean(); + swapIndexStream(); algo::blake3::Result* resultCache { ¶meters.resultCache[currentIndexStream] }; if (true == resultCache->found) { @@ -135,7 +135,7 @@ bool resolver::ResolverNvidiaBlake3::executeAsync( } //////////////////////////////////////////////////////////////////////////// - swapIndexStrean(); + swapIndexStream(); return true; } diff --git a/sources/resolver/nvidia/etchash.cpp b/sources/resolver/nvidia/etchash.cpp index f53e612..75e8542 100644 --- a/sources/resolver/nvidia/etchash.cpp +++ b/sources/resolver/nvidia/etchash.cpp @@ -25,8 +25,8 @@ bool resolver::ResolverNvidiaEtchash::updateContext( dagCountItemsInit, lightCacheCountItemsGrowth, lightCacheCountItemsInit, - config.deviceAlgorithm.ethashBuildLightCacheCPU ); + algo::ethash::buildLightCache(context, config.deviceAlgorithm.ethashBuildLightCacheCPU); /////////////////////////////////////////////////////////////////////////// if ( context.lightCache.numberItem == 0ull diff --git a/sources/resolver/nvidia/ethash.cpp b/sources/resolver/nvidia/ethash.cpp index 49ad852..e9bd014 100644 --- a/sources/resolver/nvidia/ethash.cpp +++ b/sources/resolver/nvidia/ethash.cpp @@ -29,9 +29,9 @@ bool resolver::ResolverNvidiaEthash::updateContext( dagCountItemsGrowth, dagCountItemsInit, lightCacheCountItemsGrowth, - lightCacheCountItemsInit, - config.deviceAlgorithm.ethashBuildLightCacheCPU + lightCacheCountItemsInit ); + algo::ethash::buildLightCache(context, config.deviceAlgorithm.ethashBuildLightCacheCPU); if ( context.lightCache.numberItem == 0ull || context.lightCache.size == 0ull @@ -185,7 +185,7 @@ bool resolver::ResolverNvidiaEthash::executeAsync( CUDA_ER(cudaGetLastError()); //////////////////////////////////////////////////////////////////////////// - swapIndexStrean(); + swapIndexStream(); ethashSearch(cuStream[currentIndexStream], ¶meters.resultCache[currentIndexStream], blocks, @@ -193,7 +193,7 @@ bool resolver::ResolverNvidiaEthash::executeAsync( jobInfo.nonce); //////////////////////////////////////////////////////////////////////////// - swapIndexStrean(); + swapIndexStream(); algo::ethash::Result* resultCache{ ¶meters.resultCache[currentIndexStream] }; if (true == resultCache->found) { @@ -217,7 +217,7 @@ bool resolver::ResolverNvidiaEthash::executeAsync( } //////////////////////////////////////////////////////////////////////////// - swapIndexStrean(); + swapIndexStream(); return true; } diff --git a/sources/resolver/nvidia/progpow.cpp b/sources/resolver/nvidia/progpow.cpp index 5994179..460cfc4 100644 --- a/sources/resolver/nvidia/progpow.cpp +++ b/sources/resolver/nvidia/progpow.cpp @@ -31,9 +31,9 @@ bool resolver::ResolverNvidiaProgPOW::updateContext( dagCountItemsGrowth, dagCountItemsInit, lightCacheCountItemsGrowth, - lightCacheCountItemsInit, - config.deviceAlgorithm.ethashBuildLightCacheCPU + lightCacheCountItemsInit ); + algo::ethash::buildLightCache(context, config.deviceAlgorithm.ethashBuildLightCacheCPU); if ( context.lightCache.numberItem == 0ull || context.lightCache.size == 0ull @@ -260,7 +260,7 @@ bool resolver::ResolverNvidiaProgPOW::buildSearch() kernelGenerator.addDefine("COUNT_DAG", algo::progpow::COUNT_DAG); kernelGenerator.addDefine("STATE_LEN", 25u); kernelGenerator.addDefine("TOTAL_THREADS", getThreads() * getBlocks()); - if (std::nullopt != config.occupancy.internalLoop ) + if (std::nullopt != config.occupancy.internalLoop) { uint32_t const internalLoop{ *config.occupancy.internalLoop }; kernelGenerator.addDefine("INTERNAL_LOOP", internalLoop); @@ -397,7 +397,7 @@ bool resolver::ResolverNvidiaProgPOW::executeAsync( CUDA_ER(cudaGetLastError()); //////////////////////////////////////////////////////////////////////////// - swapIndexStrean(); + swapIndexStream(); uint64_t nonce{ jobInfo.nonce }; uint64_t boundary{ jobInfo.boundaryU64 }; algo::progpow::Result* result{ ¶meters.resultCache[currentIndexStream] }; @@ -418,7 +418,7 @@ bool resolver::ResolverNvidiaProgPOW::executeAsync( nullptr)); //////////////////////////////////////////////////////////////////////////// - swapIndexStrean(); + swapIndexStream(); algo::progpow::Result* resultCache { ¶meters.resultCache[currentIndexStream] }; if (true == resultCache->found) { @@ -450,7 +450,7 @@ bool resolver::ResolverNvidiaProgPOW::executeAsync( } //////////////////////////////////////////////////////////////////////////// - swapIndexStrean(); + swapIndexStream(); //////////////////////////////////////////////////////////////////////////// return true; diff --git a/sources/resolver/resolver.cpp b/sources/resolver/resolver.cpp index c613ba0..678c451 100644 --- a/sources/resolver/resolver.cpp +++ b/sources/resolver/resolver.cpp @@ -4,7 +4,7 @@ #include -void resolver::Resolver::swapIndexStrean() +void resolver::Resolver::swapIndexStream() { if (0u == currentIndexStream) { diff --git a/sources/resolver/resolver.hpp b/sources/resolver/resolver.hpp index 0af9221..812ca61 100644 --- a/sources/resolver/resolver.hpp +++ b/sources/resolver/resolver.hpp @@ -23,7 +23,7 @@ namespace resolver Resolver& operator=(Resolver const&) = delete; Resolver& operator=(Resolver&&) = delete; - void swapIndexStrean(); + void swapIndexStream(); void setBlocks(uint32_t const newBlocks); void setThreads(uint32_t const newThreads); uint32_t getBlocks() const;