diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_kernel_warp_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_kernel_warp_template.cu
index 322d997e83..5137b5766c 100644
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_kernel_warp_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_kernel_warp_template.cu
@@ -137,7 +137,7 @@ batch_index_select_dim0_codegen_backward_kernel_warp_per_row(
     const bool enable_optimizer_offloading,
     {%- endif %}
     {%- if is_index_select %}
-    const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> grad_offsets,
+    const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> grad_offsets,
     const bool permute_output_dim_0_1
     {%- else %}
     {{ args.split_kernel_args | replace_pta_namespace() | join(",\n ") }}
@@ -436,7 +436,7 @@ batch_index_select_dim0_codegen_backward_kernel_warp_per_row
     const bool enable_optimizer_offloading,
     {%- endif %}
     {%- if is_index_select %}
-    const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> grad_offsets,
+    const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> grad_offsets,
     const bool permute_output_dim_0_1
     {%- else %}
     {{ args.split_kernel_args_no_defaults |
@@ -606,7 +606,7 @@ hip_split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}{{ vd
     const int32_t max_D,
     const int32_t max_vecs_per_thread,
     {%- if is_index_select %}
-    const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> grad_offsets,
+    const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> grad_offsets,
     const bool permute_output_dim_0_1
     {%- else %}
     {{ args.split_kernel_args | replace_pta_namespace() | join(",\n ") }}
@@ -771,7 +771,7 @@ hip_split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}{{ vd
     const int32_t max_D,
     const int32_t max_vecs_per_thread,
     {%- if is_index_select %}
-    const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> grad_offsets,
+    const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> grad_offsets,
     const bool permute_output_dim_0_1
     {%- else %}
     {{ args.split_kernel_args_no_defaults | replace_pta_namespace() | join(",\n ") | replace("cache_t", cache_type) }}
diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
index 186a9d529f..bc9322d0bc 100644
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
@@ -220,7 +220,7 @@ batch_index_select_dim0_codegen_backward_kernel_warp_per_row(
     const bool enable_optimizer_offloading,
     {%- endif %}
     {%- if is_index_select %}
-    const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> grad_offsets,
+    const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> grad_offsets,
     const bool permute_output_dim_0_1
     {%- else %}
     {{ args.split_kernel_args | replace_pta_namespace() | join(",\n ") }}
@@ -292,7 +292,7 @@ hip_split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}{{ vd
     const int32_t max_D,
     const int32_t max_vecs_per_thread,
     {%- if is_index_select %}
-    const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> grad_offsets,
+    const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> grad_offsets,
     const bool permute_output_dim_0_1
     {%- else %}
     {{ args.split_kernel_args | replace_pta_namespace() | join(",\n ") }}
@@ -966,7 +966,6 @@ Tensor {{ embedding_cuda_op }}(
     {%- endif %}
 
     DISPATCH_OPTIMAL_KERNEL(max_D, [&] {
-
       auto long_run_ids = at::empty({indices.numel()},
           sorted_linear_indices_run_lengths.options());
       auto num_long_run_ids = at::zeros({1}, indices.options().dtype(at::kInt));
@@ -982,7 +981,6 @@ Tensor {{ embedding_cuda_op }}(
           at::empty({indices.numel()}, sorted_linear_indices_run_lengths.options());
       }
-
       auto num_really_long_run_ids = at::zeros({1},
          indices.options().dtype(at::kInt));
       auto grad_accum_counter = at::empty(
          use_deterministic_algorithms ? 0 : (indices.numel() / max_segment_length_per_cta),
@@ -1292,7 +1290,7 @@ Tensor {{ embedding_cuda_op }}(
             enable_optimizer_offloading,
             {%- endif %}
             {%- if is_index_select %}
-            grad_offsets.packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),
+            PTA_B(grad_offsets, int64_t, 1, 32),
             permute_output_dim_0_1
             {%- else %}
             {{ args.split_kernel_arg_constructors | make_pta_acc_builder_format() | join(",\n ") }}
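
Both files make the same migration on the is_index_select path: the kernel's grad_offsets parameter moves from the raw at::PackedTensorAccessor32 to the pta:: namespace, and the host-side launch builds it with the PTA_B macro instead of calling packed_accessor32 directly. The standalone sketch below is not part of this diff; it only illustrates the plain ATen accessor pattern that the pre-diff call site uses, and the kernel/launcher names (read_offsets_kernel, launch_read_offsets) are hypothetical. PTA_B is assumed here to produce the equivalent pta:: accessor with added run-time bounds checking.

// Standalone sketch of the pre-diff accessor pattern; not FBGEMM code.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

__global__ void read_offsets_kernel(
    const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> grad_offsets,
    int64_t* out) {
  const int32_t i = blockIdx.x * blockDim.x + threadIdx.x;
  // The accessor carries sizes/strides, so the kernel can bound its own reads.
  if (i < grad_offsets.size(0)) {
    out[i] = grad_offsets[i];
  }
}

void launch_read_offsets(const at::Tensor& grad_offsets, at::Tensor& out) {
  // Pre-diff call-site form; the diff swaps this construction for
  // PTA_B(grad_offsets, int64_t, 1, 32), which (assumption) yields the matching
  // pta:: accessor plus bounds checks keyed by the tensor's name.
  auto acc = grad_offsets.packed_accessor32<int64_t, 1, at::RestrictPtrTraits>();
  constexpr int kThreads = 128;
  const int blocks =
      static_cast<int>((grad_offsets.numel() + kThreads - 1) / kThreads);
  read_offsets_kernel<<<blocks, kThreads, 0, at::cuda::getCurrentCUDAStream()>>>(
      acc, out.data_ptr<int64_t>());
}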