Skip to content

Commit

Permalink
Use new thrust::cuda::par_nosync execution policy (#715)
Browse files Browse the repository at this point in the history
* Use new `thrust::cuda::par_nosync` execution policy

This replaces the workaround we implemented to prevent stream
synchronization.

* Use more specific cub includes

* Remove unneeded include
  • Loading branch information
guillaumekln authored Feb 23, 2022
1 parent 5eb5b8a commit 25532cb
Show file tree
Hide file tree
Showing 6 changed files with 8 additions and 21 deletions.
16 changes: 1 addition & 15 deletions src/cuda/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,22 +40,8 @@ namespace ctranslate2 {
bool gpu_has_int8_tensor_cores(int device = -1);
bool gpu_has_fp16_tensor_cores(int device = -1);

// Define a custom execution policy to set the default stream and disable synchronization.
//
// The policy derives from thrust::device_execution_policy and exposes two friend
// functions taking the policy by reference. NOTE(review): these are presumably the
// customization hooks Thrust looks up on the policy type -- confirm against the Thrust
// version in third_party/thrust.
struct thrust_execution_policy : thrust::device_execution_policy<thrust_execution_policy> {
private:
// Stream captured from get_cuda_stream() when the policy object is constructed.
cudaStream_t _stream = get_cuda_stream();

// Returns the stream stored in the policy.
friend __host__ __device__ cudaStream_t get_stream(thrust_execution_policy& policy) {
return policy._stream;
}

// Performs no synchronization: always reports success without touching the stream.
friend __host__ __device__ cudaError_t synchronize_stream(thrust_execution_policy&) {
return cudaSuccess;
}
};

// Convenience macro to call Thrust functions with a default execution policy.
// FUN is invoked with the execution policy as its first argument, followed by the
// arguments supplied by the caller.
// Previous definition (shown as deleted in this diff): passes a default-constructed
// ctranslate2::cuda::thrust_execution_policy.
#define THRUST_CALL(FUN, ...) FUN(ctranslate2::cuda::thrust_execution_policy(), __VA_ARGS__)
// New definition (shown as added in this diff): passes thrust::cuda::par_nosync bound
// to the stream returned by ctranslate2::cuda::get_cuda_stream().
#define THRUST_CALL(FUN, ...) FUN(thrust::cuda::par_nosync.on(ctranslate2::cuda::get_cuda_stream()), __VA_ARGS__)

}
}
4 changes: 2 additions & 2 deletions src/ops/layer_norm_gpu.cu
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
#include "ctranslate2/ops/layer_norm.h"

#include <cub/cub.cuh>

#include "cuda/helpers.h"
#include "cuda/utils.h"

Expand Down Expand Up @@ -132,6 +130,8 @@ namespace ctranslate2 {
POSSIBILITY OF SUCH DAMAGE.
*/

#include <cub/block/block_reduce.cuh>

namespace at {
namespace native {

Expand Down
2 changes: 1 addition & 1 deletion src/ops/mean_gpu.cu
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "ctranslate2/ops/mean.h"

#include <cub/cub.cuh>
#include <cub/block/block_reduce.cuh>

#include "type_dispatch.h"
#include "cuda/helpers.h"
Expand Down
3 changes: 2 additions & 1 deletion src/ops/multinomial_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

#include <memory>

#include <cub/cub.cuh>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_scan.cuh>
#include <curand_kernel.h>

#include "ctranslate2/utils.h"
Expand Down
2 changes: 1 addition & 1 deletion src/ops/topk_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ namespace ctranslate2 {
SOFTWARE.
*/

#include <cub/cub.cuh>
#include <cub/block/block_reduce.cuh>

namespace fastertransformer {

Expand Down
2 changes: 1 addition & 1 deletion third_party/thrust
Submodule thrust updated 679 files

0 comments on commit 25532cb

Please sign in to comment.