diff --git a/platforms/artic/runtime.impala b/platforms/artic/runtime.impala index a0086adf..10651aca 100644 --- a/platforms/artic/runtime.impala +++ b/platforms/artic/runtime.impala @@ -6,6 +6,7 @@ #[import(cc = "C", name = "anydsl_alloc_host")] fn runtime_alloc_host(_device: i32, _size: i64) -> &mut [i8]; #[import(cc = "C", name = "anydsl_alloc_unified")] fn runtime_alloc_unified(_device: i32, _size: i64) -> &mut [i8]; #[import(cc = "C", name = "anydsl_copy")] fn runtime_copy(_src_device: i32, _src_ptr: &[i8], _src_offset: i64, _dst_device: i32, _dst_ptr: &mut [i8], _dst_offset: i64, _size: i64) -> (); +#[import(cc = "C", name = "anydsl_copy_async")] fn runtime_copy_async(_src_device: i32, _src_ptr: &[i8], _src_offset: i64, _dst_device: i32, _dst_ptr: &mut [i8], _dst_offset: i64, _size: i64) -> (); #[import(cc = "C", name = "anydsl_get_device_ptr")] fn runtime_get_device_ptr(_device: i32, _ptr: &[i8]) -> &[i8]; #[import(cc = "C", name = "anydsl_synchronize")] fn runtime_synchronize(_device: i32) -> (); #[import(cc = "C", name = "anydsl_release")] fn runtime_release(_device: i32, _ptr: &[i8]) -> (); @@ -114,6 +115,8 @@ fn @synchronize_hsa(dev: i32) = runtime_synchronize(runtime_device(3, dev)); fn @copy(src: Buffer, dst: Buffer) = runtime_copy(src.device, src.data, 0, dst.device, dst.data, 0, src.size); fn @copy_offset(src: Buffer, off_src: i64, dst: Buffer, off_dst: i64, size: i64) = runtime_copy(src.device, src.data, off_src, dst.device, dst.data, off_dst, size); +fn @copy_async(src: Buffer, dst: Buffer) = runtime_copy_async(src.device, src.data, 0, dst.device, dst.data, 0, src.size); +fn @copy_offset_async(src: Buffer, off_src: i64, dst: Buffer, off_dst: i64, size: i64) = runtime_copy_async(src.device, src.data, off_src, dst.device, dst.data, off_dst, size); // range, range_step, unroll, unroll_step, etc. diff --git a/platforms/impala/runtime.impala b/platforms/impala/runtime.impala index b3b9aad4..8cbbb3d9 100644 --- a/platforms/impala/runtime.impala +++ b/platforms/impala/runtime.impala @@ -7,6 +7,7 @@ extern "C" { fn "anydsl_alloc_host" runtime_alloc_host(i32, i64) -> &[i8]; fn "anydsl_alloc_unified" runtime_alloc_unified(i32, i64) -> &[i8]; fn "anydsl_copy" runtime_copy(i32, &[i8], i64, i32, &[i8], i64, i64) -> (); + fn "anydsl_copy_async" runtime_copy_async(i32, &[i8], i64, i32, &[i8], i64, i64) -> (); fn "anydsl_get_device_ptr" runtime_get_device_ptr(i32, &[i8]) -> &[i8]; fn "anydsl_release" runtime_release(i32, &[i8]) -> (); fn "anydsl_release_host" runtime_release_host(i32, &[i8]) -> (); @@ -88,6 +89,14 @@ fn @copy_offset(src: Buffer, off_src: i64, dst: Buffer, off_dst: i64, size: i64) runtime_copy(src.device, src.data, off_src, dst.device, dst.data, off_dst, size) } +fn @copy_async(src: Buffer, dst: Buffer) -> () { + runtime_copy_async(src.device, src.data, 0i64, dst.device, dst.data, 0i64, src.size) +} + +fn @copy_offset_async(src: Buffer, off_src: i64, dst: Buffer, off_dst: i64, size: i64) -> () { + runtime_copy_async(src.device, src.data, off_src, dst.device, dst.data, off_dst, size) +} + // range, range_step, unroll, unroll_step, etc. fn @(?lower & ?upper & ?step) unroll_step(lower: i32, upper: i32, @step: i32, body: fn(i32) -> ()) -> () { diff --git a/src/anydsl_runtime.cpp b/src/anydsl_runtime.cpp index 42b392c1..7a26e1e0 100644 --- a/src/anydsl_runtime.cpp +++ b/src/anydsl_runtime.cpp @@ -111,7 +111,15 @@ void anydsl_copy( int32_t mask_dst, void* dst, int64_t offset_dst, int64_t size) { runtime().copy( to_platform(mask_src), to_device(mask_src), src, offset_src, - to_platform(mask_dst), to_device(mask_dst), dst, offset_dst, size); + to_platform(mask_dst), to_device(mask_dst), dst, offset_dst, size, false); +} + +void anydsl_copy_async( + int32_t mask_src, const void* src, int64_t offset_src, + int32_t mask_dst, void* dst, int64_t offset_dst, int64_t size) { + runtime().copy( + to_platform(mask_src), to_device(mask_src), src, offset_src, + to_platform(mask_dst), to_device(mask_dst), dst, offset_dst, size, true); } void anydsl_launch_kernel( diff --git a/src/anydsl_runtime.h b/src/anydsl_runtime.h index 901626f1..65241b78 100644 --- a/src/anydsl_runtime.h +++ b/src/anydsl_runtime.h @@ -32,6 +32,7 @@ AnyDSL_runtime_API void anydsl_release(int32_t, void*); AnyDSL_runtime_API void anydsl_release_host(int32_t, void*); AnyDSL_runtime_API void anydsl_copy(int32_t, const void*, int64_t, int32_t, void*, int64_t, int64_t); +AnyDSL_runtime_API void anydsl_copy_async(int32_t, const void*, int64_t, int32_t, void*, int64_t, int64_t); AnyDSL_runtime_API void anydsl_launch_kernel( int32_t, const char*, const char*, diff --git a/src/anydsl_runtime.hpp b/src/anydsl_runtime.hpp index d2c63fbb..b8907620 100644 --- a/src/anydsl_runtime.hpp +++ b/src/anydsl_runtime.hpp @@ -123,6 +123,27 @@ void copy(const Array& a, int64_t offset_a, Array& b, int64_t offset_b, in size * sizeof(T)); } +template +void copy_async(const Array& a, Array& b) { + anydsl_copy_async(a.device(), (const void*)a.data(), 0, + b.device(), (void*)b.data(), 0, + a.size() * sizeof(T)); +} + +template +void copy_async(const Array& a, Array& b, int64_t size) { + anydsl_copy_async(a.device(), (const void*)a.data(), 0, + b.device(), (void*)b.data(), 0, + size * sizeof(T)); +} + +template +void copy_async(const Array& a, int64_t offset_a, Array& b, int64_t offset_b, int64_t size) { + anydsl_copy_async(a.device(), (const void*)a.data(), offset_a * sizeof(T), + b.device(), (void*)b.data(), offset_b * sizeof(T), + size * sizeof(T)); +} + } // namespace anydsl #endif diff --git a/src/cpu_platform.h b/src/cpu_platform.h index d52356e2..adafe075 100644 --- a/src/cpu_platform.h +++ b/src/cpu_platform.h @@ -51,15 +51,17 @@ class CpuPlatform : public Platform { } void copy(DeviceId, const void* src, int64_t offset_src, - DeviceId, void* dst, int64_t offset_dst, int64_t size) override { + DeviceId, void* dst, int64_t offset_dst, int64_t size, bool) override { copy(src, offset_src, dst, offset_dst, size); } + void copy_from_host(const void* src, int64_t offset_src, DeviceId, - void* dst, int64_t offset_dst, int64_t size) override { + void* dst, int64_t offset_dst, int64_t size, bool) override { copy(src, offset_src, dst, offset_dst, size); } + void copy_to_host(DeviceId, const void* src, int64_t offset_src, - void* dst, int64_t offset_dst, int64_t size) override { + void* dst, int64_t offset_dst, int64_t size, bool) override { copy(src, offset_src, dst, offset_dst, size); } diff --git a/src/cuda_platform.cpp b/src/cuda_platform.cpp index 75635148..d3c7b94b 100644 --- a/src/cuda_platform.cpp +++ b/src/cuda_platform.cpp @@ -247,7 +247,7 @@ void CudaPlatform::synchronize(DeviceId dev) { erase_profiles(false); } -void CudaPlatform::copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) { +void CudaPlatform::copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) { assert(dev_src == dev_dst); unused(dev_dst); @@ -255,29 +255,45 @@ void CudaPlatform::copy(DeviceId dev_src, const void* src, int64_t offset_src, D CUdeviceptr src_mem = (CUdeviceptr)src; CUdeviceptr dst_mem = (CUdeviceptr)dst; - CUresult err = cuMemcpyDtoD(dst_mem + offset_dst, src_mem + offset_src, size); - CHECK_CUDA(err, "cuMemcpyDtoD()"); + if (!hint_async) { + CUresult err = cuMemcpyDtoD(dst_mem + offset_dst, src_mem + offset_src, size); + CHECK_CUDA(err, "cuMemcpyDtoD()"); + } else { + CUresult err = cuMemcpyDtoDAsync(dst_mem + offset_dst, src_mem + offset_src, size, 0); + CHECK_CUDA(err, "cuMemcpyDtoDAsync()"); + } cuCtxPopCurrent(NULL); } -void CudaPlatform::copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) { +void CudaPlatform::copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) { cuCtxPushCurrent(devices_[dev_dst].ctx); CUdeviceptr dst_mem = (CUdeviceptr)dst; - CUresult err = cuMemcpyHtoD(dst_mem + offset_dst, (char*)src + offset_src, size); - CHECK_CUDA(err, "cuMemcpyHtoD()"); + if (!hint_async) { + CUresult err = cuMemcpyHtoD(dst_mem + offset_dst, (char*)src + offset_src, size); + CHECK_CUDA(err, "cuMemcpyHtoD()"); + } else { + CUresult err = cuMemcpyHtoDAsync(dst_mem + offset_dst, (char*)src + offset_src, size, 0); + CHECK_CUDA(err, "cuMemcpyHtoDAsync()"); + } cuCtxPopCurrent(NULL); } -void CudaPlatform::copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) { +void CudaPlatform::copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size, bool hint_async) { cuCtxPushCurrent(devices_[dev_src].ctx); CUdeviceptr src_mem = (CUdeviceptr)src; - CUresult err = cuMemcpyDtoH((char*)dst + offset_dst, src_mem + offset_src, size); - CHECK_CUDA(err, "cuMemcpyDtoH()"); + + if (!hint_async) { + CUresult err = cuMemcpyDtoH((char*)dst + offset_dst, src_mem + offset_src, size); + CHECK_CUDA(err, "cuMemcpyDtoH()"); + } else { + CUresult err = cuMemcpyDtoHAsync((char*)dst + offset_dst, src_mem + offset_src, size, 0); + CHECK_CUDA(err, "cuMemcpyDtoHAsync()"); + } cuCtxPopCurrent(NULL); } diff --git a/src/cuda_platform.h b/src/cuda_platform.h index 0989c2e1..951ee6dc 100644 --- a/src/cuda_platform.h +++ b/src/cuda_platform.h @@ -40,9 +40,9 @@ class CudaPlatform : public Platform { void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override; void synchronize(DeviceId dev) override; - void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override; - void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override; - void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override; + void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override; + void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override; + void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override; size_t dev_count() const override { return devices_.size(); } std::string name() const override { return "CUDA"; } diff --git a/src/dummy_platform.h b/src/dummy_platform.h index 480a46be..c533389c 100644 --- a/src/dummy_platform.h +++ b/src/dummy_platform.h @@ -24,9 +24,9 @@ class DummyPlatform : public Platform { void launch_kernel(DeviceId, const LaunchParams&) override { platform_error(); } void synchronize(DeviceId) override { platform_error(); } - void copy(DeviceId, const void*, int64_t, DeviceId, void*, int64_t, int64_t) override { platform_error(); } - void copy_from_host(const void*, int64_t, DeviceId, void*, int64_t, int64_t) override { platform_error(); } - void copy_to_host(DeviceId, const void*, int64_t, void*, int64_t, int64_t) override { platform_error(); } + void copy(DeviceId, const void*, int64_t, DeviceId, void*, int64_t, int64_t, bool) override { platform_error(); } + void copy_from_host(const void*, int64_t, DeviceId, void*, int64_t, int64_t, bool) override { platform_error(); } + void copy_to_host(DeviceId, const void*, int64_t, void*, int64_t, int64_t, bool) override { platform_error(); } size_t dev_count() const override { return 0; } std::string name() const override { return name_; } diff --git a/src/hsa_platform.cpp b/src/hsa_platform.cpp index 5f68d1c9..0f35a125 100644 --- a/src/hsa_platform.cpp +++ b/src/hsa_platform.cpp @@ -443,7 +443,9 @@ void HSAPlatform::synchronize(DeviceId dev) { error("HSA signal completion failed: %", completion); } -void HSAPlatform::copy(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) { +void HSAPlatform::copy(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size, bool hint_async) { + unused(hint_async); + hsa_status_t status = hsa_memory_copy((char*)dst + offset_dst, (char*)src + offset_src, size); CHECK_HSA(status, "hsa_memory_copy()"); } diff --git a/src/hsa_platform.h b/src/hsa_platform.h index eff4bdb9..56ecfa33 100644 --- a/src/hsa_platform.h +++ b/src/hsa_platform.h @@ -33,10 +33,10 @@ class HSAPlatform : public Platform { void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override; void synchronize(DeviceId dev) override; - void copy(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size); - void copy(DeviceId, const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst, int64_t size) override { copy(src, offset_src, dst, offset_dst, size); } - void copy_from_host(const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst, int64_t size) override { copy(src, offset_src, dst, offset_dst, size); } - void copy_to_host(DeviceId, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override { copy(src, offset_src, dst, offset_dst, size); } + void copy(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size, bool hint_async); + void copy(DeviceId, const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override { copy(src, offset_src, dst, offset_dst, size, hint_async); } + void copy_from_host(const void* src, int64_t offset_src, DeviceId, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override { copy(src, offset_src, dst, offset_dst, size, hint_async); } + void copy_to_host(DeviceId, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override { copy(src, offset_src, dst, offset_dst, size, hint_async); } size_t dev_count() const override { return devices_.size(); } std::string name() const override { return "HSA"; } diff --git a/src/opencl_platform.cpp b/src/opencl_platform.cpp index 2b9d59ec..848525bd 100644 --- a/src/opencl_platform.cpp +++ b/src/opencl_platform.cpp @@ -445,7 +445,7 @@ void OpenCLPlatform::synchronize(DeviceId dev) { } } -void OpenCLPlatform::copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) { +void OpenCLPlatform::copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) { assert(dev_src == dev_dst); unused(dev_dst); @@ -458,27 +458,30 @@ void OpenCLPlatform::copy(DeviceId dev_src, const void* src, int64_t offset_src, #endif cl_int err = clEnqueueCopyBuffer(devices_[dev_src].queue, (cl_mem)src, (cl_mem)dst, offset_src, offset_dst, size, 0, NULL, NULL); - err |= clFinish(devices_[dev_src].queue); + if (!hint_async) + err |= clFinish(devices_[dev_src].queue); CHECK_OPENCL(err, "clEnqueueCopyBuffer()"); } -void OpenCLPlatform::copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) { +void OpenCLPlatform::copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) { #ifdef CL_VERSION_2_0 if (devices_[dev_dst].version_major == 2) return copy_svm(src, offset_src, dst, offset_dst, size); #endif cl_int err = clEnqueueWriteBuffer(devices_[dev_dst].queue, (cl_mem)dst, CL_FALSE, offset_dst, size, (char*)src + offset_src, 0, NULL, NULL); - err |= clFinish(devices_[dev_dst].queue); + if (!hint_async) + err |= clFinish(devices_[dev_dst].queue); CHECK_OPENCL(err, "clEnqueueWriteBuffer()"); } -void OpenCLPlatform::copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) { +void OpenCLPlatform::copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size, bool hint_async) { #ifdef CL_VERSION_2_0 if (devices_[dev_src].version_major == 2) return copy_svm(src, offset_src, dst, offset_dst, size); #endif cl_int err = clEnqueueReadBuffer(devices_[dev_src].queue, (cl_mem)src, CL_FALSE, offset_src, size, (char*)dst + offset_dst, 0, NULL, NULL); - err |= clFinish(devices_[dev_src].queue); + if (!hint_async) + err |= clFinish(devices_[dev_src].queue); CHECK_OPENCL(err, "clEnqueueReadBuffer()"); } diff --git a/src/opencl_platform.h b/src/opencl_platform.h index 6f9d6c37..21c1fb05 100644 --- a/src/opencl_platform.h +++ b/src/opencl_platform.h @@ -34,9 +34,10 @@ class OpenCLPlatform : public Platform { void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override; void synchronize(DeviceId dev) override; - void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override; - void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) override; - void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) override; + void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override; + void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override; + void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override; + void copy_svm(const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size); void dynamic_profile(DeviceId dev, const std::string& filename); diff --git a/src/platform.h b/src/platform.h index c647a293..1946cf78 100644 --- a/src/platform.h +++ b/src/platform.h @@ -42,11 +42,11 @@ class Platform { virtual void synchronize(DeviceId dev) = 0; /// Copies memory. Copy can only be performed devices in the same platform. - virtual void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) = 0; + virtual void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) = 0; /// Copies memory from the host (CPU). - virtual void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) = 0; + virtual void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) = 0; /// Copies memory to the host (CPU). - virtual void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) = 0; + virtual void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size, bool hint_async) = 0; /// Returns the platform name. virtual std::string name() const = 0; diff --git a/src/runtime.cpp b/src/runtime.cpp index a13cec2f..7d5e136c 100644 --- a/src/runtime.cpp +++ b/src/runtime.cpp @@ -74,22 +74,23 @@ void Runtime::release_host(PlatformId plat, DeviceId dev, void* ptr) { void Runtime::copy( PlatformId plat_src, DeviceId dev_src, const void* src, int64_t offset_src, - PlatformId plat_dst, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) { + PlatformId plat_dst, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, + bool hint_async) { check_device(plat_src, dev_src); check_device(plat_dst, dev_dst); if (plat_src == plat_dst) { // Copy from same platform - platforms_[plat_src]->copy(dev_src, src, offset_src, dev_dst, dst, offset_dst, size); + platforms_[plat_src]->copy(dev_src, src, offset_src, dev_dst, dst, offset_dst, size, hint_async); debug("Copy between devices % and % on platform %", dev_src, dev_dst, plat_src); } else { // Copy from another platform if (plat_src == 0) { // Source is the CPU platform - platforms_[plat_dst]->copy_from_host(src, offset_src, dev_dst, dst, offset_dst, size); + platforms_[plat_dst]->copy_from_host(src, offset_src, dev_dst, dst, offset_dst, size, hint_async); debug("Copy from host to device % on platform %", dev_dst, plat_dst); } else if (plat_dst == 0) { // Destination is the CPU platform - platforms_[plat_src]->copy_to_host(dev_src, src, offset_src, dst, offset_dst, size); + platforms_[plat_src]->copy_to_host(dev_src, src, offset_src, dst, offset_dst, size, hint_async); debug("Copy to host from device % on platform %", dev_src, plat_src); } else { error("Cannot copy memory between different platforms"); diff --git a/src/runtime.h b/src/runtime.h index f975ac5d..d10bd289 100644 --- a/src/runtime.h +++ b/src/runtime.h @@ -70,7 +70,8 @@ class Runtime { /// Copies memory between devices. void copy( PlatformId plat_src, DeviceId dev_src, const void* src, int64_t offset_src, - PlatformId plat_dst, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size); + PlatformId plat_dst, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, + bool hint_async = false); /// Launches a kernel on the platform and device. void launch_kernel(PlatformId plat, DeviceId dev, const LaunchParams& launch_params);