Skip to content

Add asynchronous copy #45

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions platforms/artic/runtime.impala
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#[import(cc = "C", name = "anydsl_alloc_host")] fn runtime_alloc_host(_device: i32, _size: i64) -> &mut [i8];
#[import(cc = "C", name = "anydsl_alloc_unified")] fn runtime_alloc_unified(_device: i32, _size: i64) -> &mut [i8];
#[import(cc = "C", name = "anydsl_copy")] fn runtime_copy(_src_device: i32, _src_ptr: &[i8], _src_offset: i64, _dst_device: i32, _dst_ptr: &mut [i8], _dst_offset: i64, _size: i64) -> ();
#[import(cc = "C", name = "anydsl_copy_async")] fn runtime_copy_async(_src_device: i32, _src_ptr: &[i8], _src_offset: i64, _dst_device: i32, _dst_ptr: &mut [i8], _dst_offset: i64, _size: i64) -> ();
#[import(cc = "C", name = "anydsl_get_device_ptr")] fn runtime_get_device_ptr(_device: i32, _ptr: &[i8]) -> &[i8];
#[import(cc = "C", name = "anydsl_synchronize")] fn runtime_synchronize(_device: i32) -> ();
#[import(cc = "C", name = "anydsl_release")] fn runtime_release(_device: i32, _ptr: &[i8]) -> ();
Expand Down Expand Up @@ -114,6 +115,8 @@ fn @synchronize_hsa(dev: i32) = runtime_synchronize(runtime_device(3, dev));

// Copies the whole of `src` into `dst` through runtime_copy: both offsets are 0
// and `src.size` is passed as the size argument. The size of `dst` is not
// consulted here.
fn @copy(src: Buffer, dst: Buffer) = runtime_copy(src.device, src.data, 0, dst.device, dst.data, 0, src.size);
// Copies `size` units from `src` (starting at `off_src`) into `dst` (starting at
// `off_dst`) through runtime_copy. Offsets and size are forwarded unchanged and
// no bounds are checked here.
// NOTE(review): the C++ wrappers scale offsets and size by sizeof(T) before
// calling the runtime, so these values are presumably byte counts — confirm.
fn @copy_offset(src: Buffer, off_src: i64, dst: Buffer, off_dst: i64, size: i64) = runtime_copy(src.device, src.data, off_src, dst.device, dst.data, off_dst, size);
// Same arguments as `copy`, but routed through runtime_copy_async (the
// anydsl_copy_async import): both offsets are 0 and `src.size` is the size.
// The call returns unit, so no completion handle is available here.
// NOTE(review): callers presumably have to synchronize the device before
// relying on the contents of `dst` — confirm per platform.
fn @copy_async(src: Buffer, dst: Buffer) = runtime_copy_async(src.device, src.data, 0, dst.device, dst.data, 0, src.size);
// Same arguments as `copy_offset`, but routed through runtime_copy_async (the
// anydsl_copy_async import). Offsets and size are forwarded unchanged and no
// bounds are checked here.
// NOTE(review): callers presumably have to synchronize the device before
// relying on the contents of `dst` — confirm per platform.
fn @copy_offset_async(src: Buffer, off_src: i64, dst: Buffer, off_dst: i64, size: i64) = runtime_copy_async(src.device, src.data, off_src, dst.device, dst.data, off_dst, size);


// range, range_step, unroll, unroll_step, etc.
Expand Down
9 changes: 9 additions & 0 deletions platforms/impala/runtime.impala
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ extern "C" {
fn "anydsl_alloc_host" runtime_alloc_host(i32, i64) -> &[i8];
fn "anydsl_alloc_unified" runtime_alloc_unified(i32, i64) -> &[i8];
fn "anydsl_copy" runtime_copy(i32, &[i8], i64, i32, &[i8], i64, i64) -> ();
fn "anydsl_copy_async" runtime_copy_async(i32, &[i8], i64, i32, &[i8], i64, i64) -> ();
fn "anydsl_get_device_ptr" runtime_get_device_ptr(i32, &[i8]) -> &[i8];
fn "anydsl_release" runtime_release(i32, &[i8]) -> ();
fn "anydsl_release_host" runtime_release_host(i32, &[i8]) -> ();
Expand Down Expand Up @@ -88,6 +89,14 @@ fn @copy_offset(src: Buffer, off_src: i64, dst: Buffer, off_dst: i64, size: i64)
runtime_copy(src.device, src.data, off_src, dst.device, dst.data, off_dst, size)
}

// Copies the whole of `src` into `dst` through runtime_copy_async (the
// anydsl_copy_async import): both offsets are 0i64 and `src.size` is the size.
// The size of `dst` is not consulted here.
// NOTE(review): callers presumably have to synchronize the device before
// relying on the contents of `dst` — confirm per platform.
fn @copy_async(src: Buffer, dst: Buffer) -> () {
runtime_copy_async(src.device, src.data, 0i64, dst.device, dst.data, 0i64, src.size)
}

// Copies `size` units from `src` (starting at `off_src`) into `dst` (starting at
// `off_dst`) through runtime_copy_async (the anydsl_copy_async import).
// Offsets and size are forwarded unchanged and no bounds are checked here.
// NOTE(review): callers presumably have to synchronize the device before
// relying on the contents of `dst` — confirm per platform.
fn @copy_offset_async(src: Buffer, off_src: i64, dst: Buffer, off_dst: i64, size: i64) -> () {
runtime_copy_async(src.device, src.data, off_src, dst.device, dst.data, off_dst, size)
}


// range, range_step, unroll, unroll_step, etc.
fn @(?lower & ?upper & ?step) unroll_step(lower: i32, upper: i32, @step: i32, body: fn(i32) -> ()) -> () {
Expand Down
10 changes: 9 additions & 1 deletion src/anydsl_runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,15 @@ void anydsl_copy(
int32_t mask_dst, void* dst, int64_t offset_dst, int64_t size) {
runtime().copy(
to_platform(mask_src), to_device(mask_src), src, offset_src,
to_platform(mask_dst), to_device(mask_dst), dst, offset_dst, size);
to_platform(mask_dst), to_device(mask_dst), dst, offset_dst, size, false);
}

void anydsl_copy_async(
int32_t mask_src, const void* src, int64_t offset_src,
int32_t mask_dst, void* dst, int64_t offset_dst, int64_t size) {
runtime().copy(
to_platform(mask_src), to_device(mask_src), src, offset_src,
to_platform(mask_dst), to_device(mask_dst), dst, offset_dst, size, true);
}

void anydsl_launch_kernel(
Expand Down
1 change: 1 addition & 0 deletions src/anydsl_runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ AnyDSL_runtime_API void anydsl_release(int32_t, void*);
AnyDSL_runtime_API void anydsl_release_host(int32_t, void*);

AnyDSL_runtime_API void anydsl_copy(int32_t, const void*, int64_t, int32_t, void*, int64_t, int64_t);
/// Variant of anydsl_copy that passes the asynchronous hint to the runtime.
/// Arguments, in order: source device mask, source pointer, source offset,
/// destination device mask, destination pointer, destination offset, size.
AnyDSL_runtime_API void anydsl_copy_async(int32_t, const void*, int64_t, int32_t, void*, int64_t, int64_t);

AnyDSL_runtime_API void anydsl_launch_kernel(
int32_t, const char*, const char*,
Expand Down
21 changes: 21 additions & 0 deletions src/anydsl_runtime.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,27 @@ void copy(const Array<T>& a, int64_t offset_a, Array<T>& b, int64_t offset_b, in
size * sizeof(T));
}

/// Issues an anydsl_copy_async of all of `a` into `b`, starting at the
/// beginning of both arrays. The transfer length is a.size() * sizeof(T);
/// the size of `b` is not inspected here.
template <typename T>
void copy_async(const Array<T>& a, Array<T>& b) {
    const auto num_bytes = a.size() * sizeof(T);
    const void* const from = (const void*)a.data();
    void* const to = (void*)b.data();

    anydsl_copy_async(a.device(), from, 0, b.device(), to, 0, num_bytes);
}

/// Issues an anydsl_copy_async of the first `size` elements of `a` into the
/// start of `b`. The transfer length is size * sizeof(T); neither array's
/// own size is inspected here.
template <typename T>
void copy_async(const Array<T>& a, Array<T>& b, int64_t size) {
    const auto num_bytes = size * sizeof(T);
    const void* const from = (const void*)a.data();
    void* const to = (void*)b.data();

    anydsl_copy_async(a.device(), from, 0, b.device(), to, 0, num_bytes);
}

/// Issues an anydsl_copy_async of `size` elements from `a`, beginning at
/// element `offset_a`, into `b`, beginning at element `offset_b`. Offsets and
/// size are given in elements and scaled by sizeof(T) before being passed on;
/// no bounds are checked here.
template <typename T>
void copy_async(const Array<T>& a, int64_t offset_a, Array<T>& b, int64_t offset_b, int64_t size) {
    const auto from_offset = offset_a * sizeof(T);
    const auto to_offset   = offset_b * sizeof(T);
    const auto num_bytes   = size * sizeof(T);
    const void* const from = (const void*)a.data();
    void* const to = (void*)b.data();

    anydsl_copy_async(a.device(), from, from_offset,
                      b.device(), to, to_offset,
                      num_bytes);
}

} // namespace anydsl

#endif
8 changes: 5 additions & 3 deletions src/cpu_platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,17 @@ class CpuPlatform : public Platform {
}

void copy(DeviceId, const void* src, int64_t offset_src,
DeviceId, void* dst, int64_t offset_dst, int64_t size) override {
DeviceId, void* dst, int64_t offset_dst, int64_t size, bool) override {
copy(src, offset_src, dst, offset_dst, size);
}

void copy_from_host(const void* src, int64_t offset_src, DeviceId,
void* dst, int64_t offset_dst, int64_t size) override {
void* dst, int64_t offset_dst, int64_t size, bool) override {
copy(src, offset_src, dst, offset_dst, size);
}

void copy_to_host(DeviceId, const void* src, int64_t offset_src,
void* dst, int64_t offset_dst, int64_t size) override {
void* dst, int64_t offset_dst, int64_t size, bool) override {
copy(src, offset_src, dst, offset_dst, size);
}

Expand Down
34 changes: 25 additions & 9 deletions src/cuda_platform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -247,37 +247,53 @@ void CudaPlatform::synchronize(DeviceId dev) {
erase_profiles(false);
}

void CudaPlatform::copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) {
void CudaPlatform::copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) {
assert(dev_src == dev_dst);
unused(dev_dst);

cuCtxPushCurrent(devices_[dev_src].ctx);

CUdeviceptr src_mem = (CUdeviceptr)src;
CUdeviceptr dst_mem = (CUdeviceptr)dst;
CUresult err = cuMemcpyDtoD(dst_mem + offset_dst, src_mem + offset_src, size);
CHECK_CUDA(err, "cuMemcpyDtoD()");
if (!hint_async) {
CUresult err = cuMemcpyDtoD(dst_mem + offset_dst, src_mem + offset_src, size);
CHECK_CUDA(err, "cuMemcpyDtoD()");
} else {
CUresult err = cuMemcpyDtoDAsync(dst_mem + offset_dst, src_mem + offset_src, size, 0);
CHECK_CUDA(err, "cuMemcpyDtoDAsync()");
}

cuCtxPopCurrent(NULL);
}

void CudaPlatform::copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) {
void CudaPlatform::copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) {
cuCtxPushCurrent(devices_[dev_dst].ctx);

CUdeviceptr dst_mem = (CUdeviceptr)dst;

CUresult err = cuMemcpyHtoD(dst_mem + offset_dst, (char*)src + offset_src, size);
CHECK_CUDA(err, "cuMemcpyHtoD()");
if (!hint_async) {
CUresult err = cuMemcpyHtoD(dst_mem + offset_dst, (char*)src + offset_src, size);
CHECK_CUDA(err, "cuMemcpyHtoD()");
} else {
CUresult err = cuMemcpyHtoDAsync(dst_mem + offset_dst, (char*)src + offset_src, size, 0);
CHECK_CUDA(err, "cuMemcpyHtoDAsync()");
}

cuCtxPopCurrent(NULL);
}

void CudaPlatform::copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) {
void CudaPlatform::copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size, bool hint_async) {
cuCtxPushCurrent(devices_[dev_src].ctx);

CUdeviceptr src_mem = (CUdeviceptr)src;
CUresult err = cuMemcpyDtoH((char*)dst + offset_dst, src_mem + offset_src, size);
CHECK_CUDA(err, "cuMemcpyDtoH()");

if (!hint_async) {
CUresult err = cuMemcpyDtoH((char*)dst + offset_dst, src_mem + offset_src, size);
CHECK_CUDA(err, "cuMemcpyDtoH()");
} else {
CUresult err = cuMemcpyDtoHAsync((char*)dst + offset_dst, src_mem + offset_src, size, 0);
CHECK_CUDA(err, "cuMemcpyDtoHAsync()");
}

cuCtxPopCurrent(NULL);
}
Expand Down
6 changes: 3 additions & 3 deletions src/cuda_platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ class CudaPlatform : public Platform {
void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override;
void synchronize(DeviceId dev) override;

void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override;
void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override;
void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override;
void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override;

size_t dev_count() const override { return devices_.size(); }
std::string name() const override { return "CUDA"; }
Expand Down
6 changes: 3 additions & 3 deletions src/dummy_platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ class DummyPlatform : public Platform {
void launch_kernel(DeviceId, const LaunchParams&) override { platform_error(); }
void synchronize(DeviceId) override { platform_error(); }

void copy(DeviceId, const void*, int64_t, DeviceId, void*, int64_t, int64_t) override { platform_error(); }
void copy_from_host(const void*, int64_t, DeviceId, void*, int64_t, int64_t) override { platform_error(); }
void copy_to_host(DeviceId, const void*, int64_t, void*, int64_t, int64_t) override { platform_error(); }
void copy(DeviceId, const void*, int64_t, DeviceId, void*, int64_t, int64_t, bool) override { platform_error(); }
void copy_from_host(const void*, int64_t, DeviceId, void*, int64_t, int64_t, bool) override { platform_error(); }
void copy_to_host(DeviceId, const void*, int64_t, void*, int64_t, int64_t, bool) override { platform_error(); }

size_t dev_count() const override { return 0; }
std::string name() const override { return name_; }
Expand Down
4 changes: 3 additions & 1 deletion src/hsa_platform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,9 @@ void HSAPlatform::synchronize(DeviceId dev) {
error("HSA signal completion failed: %", completion);
}

void HSAPlatform::copy(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) {
void HSAPlatform::copy(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size, bool hint_async) {
unused(hint_async);

hsa_status_t status = hsa_memory_copy((char*)dst + offset_dst, (char*)src + offset_src, size);
CHECK_HSA(status, "hsa_memory_copy()");
}
Expand Down
8 changes: 4 additions & 4 deletions src/hsa_platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,10 @@ class HSAPlatform : public Platform {
void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override;
void synchronize(DeviceId dev) override;

void copy(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size);
void copy(DeviceId, const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst, int64_t size) override { copy(src, offset_src, dst, offset_dst, size); }
void copy_from_host(const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst, int64_t size) override { copy(src, offset_src, dst, offset_dst, size); }
void copy_to_host(DeviceId, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override { copy(src, offset_src, dst, offset_dst, size); }
void copy(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size, bool hint_async);
void copy(DeviceId, const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override { copy(src, offset_src, dst, offset_dst, size, hint_async); }
void copy_from_host(const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override { copy(src, offset_src, dst, offset_dst, size, hint_async); }
void copy_to_host(DeviceId, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override { copy(src, offset_src, dst, offset_dst, size, hint_async); }

size_t dev_count() const override { return devices_.size(); }
std::string name() const override { return "HSA"; }
Expand Down
15 changes: 9 additions & 6 deletions src/opencl_platform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,7 @@ void OpenCLPlatform::synchronize(DeviceId dev) {
}
}

void OpenCLPlatform::copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) {
void OpenCLPlatform::copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) {
assert(dev_src == dev_dst);
unused(dev_dst);

Expand All @@ -458,27 +458,30 @@ void OpenCLPlatform::copy(DeviceId dev_src, const void* src, int64_t offset_src,
#endif

cl_int err = clEnqueueCopyBuffer(devices_[dev_src].queue, (cl_mem)src, (cl_mem)dst, offset_src, offset_dst, size, 0, NULL, NULL);
err |= clFinish(devices_[dev_src].queue);
if (!hint_async)
err |= clFinish(devices_[dev_src].queue);
CHECK_OPENCL(err, "clEnqueueCopyBuffer()");
}

void OpenCLPlatform::copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) {
void OpenCLPlatform::copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) {
#ifdef CL_VERSION_2_0
if (devices_[dev_dst].version_major == 2)
return copy_svm(src, offset_src, dst, offset_dst, size);
#endif
cl_int err = clEnqueueWriteBuffer(devices_[dev_dst].queue, (cl_mem)dst, CL_FALSE, offset_dst, size, (char*)src + offset_src, 0, NULL, NULL);
err |= clFinish(devices_[dev_dst].queue);
if (!hint_async)
err |= clFinish(devices_[dev_dst].queue);
CHECK_OPENCL(err, "clEnqueueWriteBuffer()");
}

void OpenCLPlatform::copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) {
void OpenCLPlatform::copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size, bool hint_async) {
#ifdef CL_VERSION_2_0
if (devices_[dev_src].version_major == 2)
return copy_svm(src, offset_src, dst, offset_dst, size);
#endif
cl_int err = clEnqueueReadBuffer(devices_[dev_src].queue, (cl_mem)src, CL_FALSE, offset_src, size, (char*)dst + offset_dst, 0, NULL, NULL);
err |= clFinish(devices_[dev_src].queue);
if (!hint_async)
err |= clFinish(devices_[dev_src].queue);
CHECK_OPENCL(err, "clEnqueueReadBuffer()");
}

Expand Down
7 changes: 4 additions & 3 deletions src/opencl_platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,10 @@ class OpenCLPlatform : public Platform {
void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override;
void synchronize(DeviceId dev) override;

void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override;
void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override;
void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override;
void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override;
void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override;

void copy_svm(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size);
void dynamic_profile(DeviceId dev, const std::string& filename);

Expand Down
6 changes: 3 additions & 3 deletions src/platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,11 @@ class Platform {
virtual void synchronize(DeviceId dev) = 0;

/// Copies memory. Copy can only be performed between devices in the same platform.
virtual void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) = 0;
virtual void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) = 0;
/// Copies memory from the host (CPU).
virtual void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) = 0;
virtual void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) = 0;
/// Copies memory to the host (CPU).
virtual void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) = 0;
virtual void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size, bool hint_async) = 0;

/// Returns the platform name.
virtual std::string name() const = 0;
Expand Down
9 changes: 5 additions & 4 deletions src/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,22 +74,23 @@ void Runtime::release_host(PlatformId plat, DeviceId dev, void* ptr) {

void Runtime::copy(
PlatformId plat_src, DeviceId dev_src, const void* src, int64_t offset_src,
PlatformId plat_dst, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) {
PlatformId plat_dst, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size,
bool hint_async) {
check_device(plat_src, dev_src);
check_device(plat_dst, dev_dst);
if (plat_src == plat_dst) {
// Copy from same platform
platforms_[plat_src]->copy(dev_src, src, offset_src, dev_dst, dst, offset_dst, size);
platforms_[plat_src]->copy(dev_src, src, offset_src, dev_dst, dst, offset_dst, size, hint_async);
debug("Copy between devices % and % on platform %", dev_src, dev_dst, plat_src);
} else {
// Copy from another platform
if (plat_src == 0) {
// Source is the CPU platform
platforms_[plat_dst]->copy_from_host(src, offset_src, dev_dst, dst, offset_dst, size);
platforms_[plat_dst]->copy_from_host(src, offset_src, dev_dst, dst, offset_dst, size, hint_async);
debug("Copy from host to device % on platform %", dev_dst, plat_dst);
} else if (plat_dst == 0) {
// Destination is the CPU platform
platforms_[plat_src]->copy_to_host(dev_src, src, offset_src, dst, offset_dst, size);
platforms_[plat_src]->copy_to_host(dev_src, src, offset_src, dst, offset_dst, size, hint_async);
debug("Copy to host from device % on platform %", dev_src, plat_src);
} else {
error("Cannot copy memory between different platforms");
Expand Down
3 changes: 2 additions & 1 deletion src/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ class Runtime {
/// Copies memory between devices.
void copy(
PlatformId plat_src, DeviceId dev_src, const void* src, int64_t offset_src,
PlatformId plat_dst, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size);
PlatformId plat_dst, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size,
bool hint_async = false);

/// Launches a kernel on the platform and device.
void launch_kernel(PlatformId plat, DeviceId dev, const LaunchParams& launch_params);
Expand Down