[DRAFT][NATIVECPU] using tbb::parallel_for when oneTBB is enabled #20064
base: sycl
Changes from all commits
@@ -96,6 +96,113 @@ static inline native_cpu::state getState(const native_cpu::NDRDescT &ndr) {
  return resized_state;
}

static inline void invoke_kernel(native_cpu::state &state,
                                 const ur_kernel_handle_t_ &kernel, size_t g0,
                                 size_t g1, size_t g2,
                                 size_t numParallelThreads, size_t threadId,
                                 const native_cpu::NDRDescT &ndr) {
#ifdef NATIVECPU_USE_OCK
  state.update(g0, g1, g2);
  kernel._subhandler(kernel.getArgs(numParallelThreads, threadId).data(),
                     &state);
  (void)ndr;
#else
  for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) {
    for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) {
      for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) {
        state.update(g0, g1, g2, local0, local1, local2);
        kernel._subhandler(kernel.getArgs(numParallelThreads, threadId).data(),
                           &state);
      }
    }
  }
#endif
}

#ifdef NATIVECPU_WITH_ONETBB

#define NATIVECPU_WITH_ONETBB_PARALLELFOR

using IndexT = std::array<size_t, 3>;
using RangeT = native_cpu::NDRDescT::RangeT;

static inline void execute_range(native_cpu::state &state,
                                 const ur_kernel_handle_t_ &hKernel,
                                 IndexT first, IndexT lastPlusOne,
                                 size_t numParallelThreads, size_t threadId,
                                 const native_cpu::NDRDescT &ndr) {
  for (size_t g2 = first[2]; g2 < lastPlusOne[2]; g2++) {
    for (size_t g1 = first[1]; g1 < lastPlusOne[1]; g1++) {
      for (size_t g0 = first[0]; g0 < lastPlusOne[0]; g0 += 1) {
        invoke_kernel(state, hKernel, g0, g1, g2, numParallelThreads, threadId,
                      ndr);
      }
    }
  }
}

namespace native_cpu {

class nativecpu_tbb_executor {
  const native_cpu::NDRDescT ndr;

protected:
  const ur_kernel_handle_t_ &hKernel;
  const size_t numParallelThreads;

  void execute(IndexT first, IndexT last_plus_one) const {
    auto state = getState(ndr);
    auto threadId = native_cpu::getTBBThreadID();
    execute_range(state, hKernel, first, last_plus_one, numParallelThreads,
                  threadId, ndr);
  }

public:
  void operator()(const tbb::blocked_range3d<size_t> &r) const {
    execute({r.pages().begin(), r.rows().begin(), r.cols().begin()},
            {r.pages().end(), r.rows().end(), r.cols().end()});
  }
  void operator()(const tbb::blocked_range2d<size_t> &r) const {
    execute({r.rows().begin(), r.cols().begin(), 0},
            {r.rows().end(), r.cols().end(), 1});
  }
  void operator()(const tbb::blocked_range<size_t> &r) const {
    execute({r.begin(), 0, 0}, {r.end(), 1, 1});
  }
  nativecpu_tbb_executor(const native_cpu::NDRDescT &n,
                         const ur_kernel_handle_t_ &k,
                         const size_t numParallelThreads)
      : ndr(n), hKernel(k), numParallelThreads(numParallelThreads) {}
};

using tbb_nd_executor = nativecpu_tbb_executor;

template <template <class> class RangeTpl, class... T>
static inline void invoke_tbb_parallel_for(const tbb_nd_executor &tbb_ex,
                                           T... inits) {
  RangeTpl<size_t> range(inits...);
  tbb::parallel_for(range, tbb_ex);
}

static inline void invoke_tbb_parallel_for(size_t workDim,
                                           const nativecpu_tbb_executor &tbb_ex,
                                           IndexT first, IndexT last) {
  if (workDim == 3) {
    native_cpu::invoke_tbb_parallel_for<tbb::blocked_range3d>(
        tbb_ex, first[0], last[0], first[1], last[1], first[2], last[2]);
  } else if (workDim == 2) {
    native_cpu::invoke_tbb_parallel_for<tbb::blocked_range2d>(
        tbb_ex, first[0], last[0], first[1], last[1]);
  } else {
    native_cpu::invoke_tbb_parallel_for<tbb::blocked_range>(tbb_ex, first[0],
                                                            last[0]);
  }
}

} // namespace native_cpu

#endif // NATIVECPU_WITH_ONETBB

UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,

@@ -175,6 +282,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
  auto InEvents =
      native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList, Tasks);

#ifndef NATIVECPU_WITH_ONETBB_PARALLELFOR
  const size_t numWG = numWG0 * numWG1 * numWG2;
  const size_t numWGPerThread = numWG / numParallelThreads;
  const size_t remainderWG = numWG - numWGPerThread * numParallelThreads;

@@ -196,21 +304,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
      for (size_t g0 = rangeStart[0], g1 = rangeStart[1], g2 = rangeStart[2],
                  g3 = rangeStart[3];
           g3 < rangeEnd; ++g3) {
#ifdef NATIVECPU_USE_OCK
        state.update(g0, g1, g2);
        kernel._subhandler(kernel.getArgs(numParallelThreads, threadId).data(),
                           &state);
#else
        for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) {
          for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) {
            for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) {
              state.update(g0, g1, g2, local0, local1, local2);
              kernel._subhandler(
                  kernel.getArgs(numParallelThreads, threadId).data(), &state);
            }
          }
        }
#endif
        invoke_kernel(state, kernel, g0, g1, g2, numParallelThreads, threadId,
                      ndr);
        if (++g0 == numWG0) {
          g0 = 0;
          if (++g1 == numWG1) {

@@ -222,6 +317,45 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
    });
    rangeStart = rangeEnd;
  }
#else
  const IndexT numWG = {numWG0, numWG1, numWG2};
  IndexT groupsPerThread;
  size_t dim = 0;
  for (size_t t = 0; t < 3; t++)
    groupsPerThread[t] = numWG[t] / numParallelThreads;

Review comment: This is confusing. More to the point, in #19550, for non-oneTBB, I simplified the splitting across threads so that it is done over the linear range rather than over any specific dimension, and that would probably be better with oneTBB as well.
(A standalone sketch of such a linear-range split appears after this file's diff.)

  if (groupsPerThread[0] == 0) {
    if (groupsPerThread[1])
      dim = 1;
    else if (groupsPerThread[2])
      dim = 2;
  }
  IndexT first = {0, 0, 0}, last = numWG;
  size_t wg_start = 0;
  const native_cpu::tbb_nd_executor tbb_ex(ndr, *kernel, numParallelThreads);
  auto invoke_parallel_for = [workDim, &tbb_ex, &first, &last]() {
    if (workDim == 3) {
      native_cpu::invoke_tbb_parallel_for<tbb::blocked_range3d>(
          tbb_ex, first[0], last[0], first[1], last[1], first[2], last[2]);
    } else if (workDim == 2) {
      native_cpu::invoke_tbb_parallel_for<tbb::blocked_range2d>(
          tbb_ex, first[0], last[0], first[1], last[1]);
    } else {
      native_cpu::invoke_tbb_parallel_for<tbb::blocked_range>(tbb_ex, first[0],
                                                              last[0]);
    }
  };

  if (groupsPerThread[dim]) {
    native_cpu::invoke_tbb_parallel_for(workDim, tbb_ex, first, last);
    wg_start = groupsPerThread[dim] * numParallelThreads;
  }
  if (wg_start < numWG[dim]) {
    first[dim] = wg_start;
    last[dim] = numWG[dim];
    native_cpu::invoke_tbb_parallel_for(workDim, tbb_ex, first, last);
  }
#endif // NATIVECPU_WITH_ONETBB_PARALLELFOR

  event->set_tasksinfo(Tasks.getMovedTaskInfo());

  if (phEvent) {

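For context, a linear-range split of the kind the reviewer points to could look roughly like the standalone sketch below. This is not code from #19550 or from this PR; numWG0/numWG1/numWG2 and the printf body are placeholders for the adapter's group counts and kernel invocation. The idea is one tbb::blocked_range over the flattened group count, with the 3D group id recovered inside the body (g0 varying fastest, matching the wrap-around order of the existing non-TBB loop).

// Standalone sketch: split the flattened work-group range with a single
// tbb::blocked_range and delinearize inside the body, instead of blocking
// over one chosen dimension.
#include <cstddef>
#include <cstdio>
#include <tbb/blocked_range.h>
#include <tbb/parallel_for.h>

int main() {
  // Hypothetical work-group counts; stand-ins for numWG0/1/2 in the adapter.
  const std::size_t numWG0 = 4, numWG1 = 3, numWG2 = 2;
  tbb::parallel_for(
      tbb::blocked_range<std::size_t>(0, numWG0 * numWG1 * numWG2),
      [&](const tbb::blocked_range<std::size_t> &r) {
        for (std::size_t flat = r.begin(); flat != r.end(); ++flat) {
          // Recover the 3D group id from the linear index.
          const std::size_t g0 = flat % numWG0;
          const std::size_t g1 = (flat / numWG0) % numWG1;
          const std::size_t g2 = flat / (numWG0 * numWG1);
          std::printf("work-group (%zu, %zu, %zu)\n", g0, g1, g2);
        }
      });
  return 0;
}

Splitting over the flattened range would sidestep the per-dimension groupsPerThread/dim bookkeeping above and leave chunking entirely to TBB's partitioner.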
@@ -268,15 +268,20 @@ class TBB_TasksInfo {
  TBB_TasksInfo(TBB_threadpool &t) : tp(&t) {}
};

inline auto getTBBThreadID() {
  auto thread_id = tbb::this_task_arena::current_thread_index();
  assert(thread_id >= 0 &&
         thread_id < oneapi::tbb::info::default_concurrency());
  return thread_id;
}

template <>
struct Scheduler<TBB_threadpool>
    : Scheduler_base<TBB_threadpool, TBB_TasksInfo> {
  using Scheduler_base<TBB_threadpool, TBB_TasksInfo>::Scheduler_base;
  template <class T> void schedule(T &&task_) {
    ref.Tasks().run([task = std::move(task_)]() {
      auto thread_id = tbb::this_task_arena::current_thread_index();
      assert(thread_id >= 0 &&
             thread_id < oneapi::tbb::info::default_concurrency());
      auto thread_id = getTBBThreadID();
      task(thread_id);

Review comment on lines +284 to 285 (suggested change): The variable doesn't seem like it's still needed. Likewise in the other file.
(A standalone illustration follows at the end of this hunk.)

    });
  }
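The body of the suggested change was not captured on this page, but it presumably just drops the named local and passes the thread index straight to the task. A minimal standalone illustration of that shape, using tbb::task_group directly rather than the adapter's Scheduler wrapper (the task lambda is a placeholder):

#include <cstdio>
#include <tbb/task_arena.h>
#include <tbb/task_group.h>

int main() {
  tbb::task_group tasks;
  auto task = [](int threadId) {
    std::printf("running on TBB thread %d\n", threadId);
  };
  // Pass the current TBB thread index directly instead of storing it in a
  // named local first, as the review comment suggests.
  tasks.run([&task] { task(tbb::this_task_arena::current_thread_index()); });
  tasks.wait();
  return 0;
}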

Review comment: I think it would be clearer to remove this function and call tbb::parallel_for directly in the other invoke_tbb_parallel_for overload.
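A rough sketch of that suggestion is below. It is not the PR's code: dummy_executor is a placeholder that only mirrors the operator() overloads of nativecpu_tbb_executor, and the workDim dispatch builds the range and passes it to tbb::parallel_for directly, so the template helper disappears.

#include <array>
#include <cstddef>
#include <cstdio>
#include <tbb/blocked_range.h>
#include <tbb/blocked_range2d.h>
#include <tbb/blocked_range3d.h>
#include <tbb/parallel_for.h>

using IndexT = std::array<std::size_t, 3>;

// Stand-in for nativecpu_tbb_executor: same call shape, trivial body.
struct dummy_executor {
  void operator()(const tbb::blocked_range3d<std::size_t> &r) const {
    std::printf("3d chunk: pages [%zu, %zu)\n", r.pages().begin(),
                r.pages().end());
  }
  void operator()(const tbb::blocked_range2d<std::size_t> &r) const {
    std::printf("2d chunk: rows [%zu, %zu)\n", r.rows().begin(),
                r.rows().end());
  }
  void operator()(const tbb::blocked_range<std::size_t> &r) const {
    std::printf("1d chunk: [%zu, %zu)\n", r.begin(), r.end());
  }
};

// Single overload: construct the right range type for the work dimension and
// call tbb::parallel_for directly, instead of going through a template helper.
static void invoke_tbb_parallel_for(std::size_t workDim,
                                    const dummy_executor &ex, IndexT first,
                                    IndexT last) {
  if (workDim == 3) {
    tbb::parallel_for(
        tbb::blocked_range3d<std::size_t>(first[0], last[0], first[1], last[1],
                                          first[2], last[2]),
        ex);
  } else if (workDim == 2) {
    tbb::parallel_for(tbb::blocked_range2d<std::size_t>(first[0], last[0],
                                                        first[1], last[1]),
                      ex);
  } else {
    tbb::parallel_for(tbb::blocked_range<std::size_t>(first[0], last[0]), ex);
  }
}

int main() {
  invoke_tbb_parallel_for(3, dummy_executor{}, {0, 0, 0}, {4, 3, 2});
  return 0;
}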