From 04d8370ff40257a98b43674620672f544304764d Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Mon, 31 Mar 2025 22:58:15 -0700 Subject: [PATCH] Debug stochastic rounding issue (#3906) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/3906 Differential Revision: D72191058 --- ...optimizer_split_device_kernel_template.cuh | 7 +- .../include/fbgemm_gpu/utils/weight_row.cuh | 12 + .../tbe/training/backward_adagrad_common.py | 4 + .../tbe/training/backward_adagrad_test.py | 286 ++++++++++-------- 4 files changed, 174 insertions(+), 135 deletions(-) diff --git a/fbgemm_gpu/codegen/training/optimizer/embedding_optimizer_split_device_kernel_template.cuh b/fbgemm_gpu/codegen/training/optimizer/embedding_optimizer_split_device_kernel_template.cuh index 18eb5d2a91..c2c28f1627 100644 --- a/fbgemm_gpu/codegen/training/optimizer/embedding_optimizer_split_device_kernel_template.cuh +++ b/fbgemm_gpu/codegen/training/optimizer/embedding_optimizer_split_device_kernel_template.cuh @@ -98,6 +98,9 @@ DEVICE_INLINE void {{ mdesc }}_{{ optimizer }}_table_update_kernel( {%- endfor %} StochasticRoundingRNGState state; + if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0) { + printf("STOC_DEBUG: before optimizer &state=%p, stochastic_rounding=%d\n", &state, (int) stochastic_rounding); + } auto weight_row_template = WeightRow>( weights, @@ -106,6 +109,9 @@ DEVICE_INLINE void {{ mdesc }}_{{ optimizer }}_table_update_kernel( stochastic_rounding ? &state : nullptr, &stochastic_rounding_philox_args, threadIdx.x + run_id * blockDim.x); + if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0) { + printf("STOC_DEBUG: after optimizer stochastic_rounding %d\n", stochastic_rounding ? 1 : 0); + } float2 qparams_template; if (kIsInt8 && !cache_weights) { @@ -113,7 +119,6 @@ DEVICE_INLINE void {{ mdesc }}_{{ optimizer }}_table_update_kernel( } {{ split_precomputation }} - {# /* Note: technically, global weight decay (gwd) compensation should be done before `split_precomputation`). But since decouple mode in `rowwise_adagrad` only computes correction, the order of applying gwd does not matter. We perform gwd update before `split_weight_update` diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/weight_row.cuh b/fbgemm_gpu/include/fbgemm_gpu/utils/weight_row.cuh index d023ce3f43..b6387a2f4a 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/utils/weight_row.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/utils/weight_row.cuh @@ -28,8 +28,14 @@ DEVICE_INLINE void quantize_store( StochasticRoundingRNGState* state, const float2 qparams) { if (!state) { + if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0) { + printf("STOC_DEBUG: nearest_rounding_vector\n"); + } nearest_rounding_vector(output, value, qparams); } else { + if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0) { + printf("STOC_DEBUG: stochastic_rounding_vector\n"); + } stochastic_rounding_vector(output, value, *state, qparams); } } @@ -133,9 +139,15 @@ struct WeightRow { : row_(row), cache_row_(cache_row), dim_(dim) { // Set the internal stoc_rounding_state_ stoc_rounding_state_ = stoc_rounding_state; + if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0) { + printf("STOC_DEBUG: WeightRow has stoc_rounding_state_ %p\n", stoc_rounding_state); + } if constexpr (!std::is_same_v) { if (stoc_rounding_state != nullptr) { + if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0) { + printf("STOC_DEBUG: WeightRow init stoc_rounding_state\n"); + } const auto stochastic_rounding_seeds = at::cuda::philox::unpack(*stochastic_rounding_philox_args); diff --git a/fbgemm_gpu/test/tbe/training/backward_adagrad_common.py b/fbgemm_gpu/test/tbe/training/backward_adagrad_common.py index 918af5c63b..38c9aad6a1 100755 --- a/fbgemm_gpu/test/tbe/training/backward_adagrad_common.py +++ b/fbgemm_gpu/test/tbe/training/backward_adagrad_common.py @@ -8,6 +8,7 @@ # pyre-strict import sys +import logging from typing import Any, Dict @@ -288,6 +289,7 @@ def execute_backward_adagrad( # noqa C901 # do SGD update lr = 0.5 eps = 0.2 + logging.info("STOC_DEBUG: stochastic_rounding {}".format(stochastic_rounding)) optimizer = OptimType.EXACT_ROWWISE_ADAGRAD if row_wise else OptimType.EXACT_ADAGRAD cc = emb_op( @@ -354,6 +356,8 @@ def execute_backward_adagrad( # noqa C901 split_optimizer_states = cc.split_optimizer_states() assert len(split_optimizer_states) == T + return + get_optimizer_states = None if row_wise: # get_optimizer_state should/must be implemented for rowwise diff --git a/fbgemm_gpu/test/tbe/training/backward_adagrad_test.py b/fbgemm_gpu/test/tbe/training/backward_adagrad_test.py index d78f3a3a86..c3255b4af9 100644 --- a/fbgemm_gpu/test/tbe/training/backward_adagrad_test.py +++ b/fbgemm_gpu/test/tbe/training/backward_adagrad_test.py @@ -23,6 +23,7 @@ gpu_unavailable, optests, PoolingMode, + CacheAlgorithm, SparseType, st, ) @@ -36,15 +37,32 @@ test_st_cpu["output_dtype"] = st.sampled_from([SparseType.FP32, SparseType.FP16]) -@optests.generate_opcheck_tests(fast=True, additional_decorators=additional_decorators) +#@optests.generate_opcheck_tests(fast=True, additional_decorators=additional_decorators) class BackwardAdagradTest(unittest.TestCase): - @unittest.skipIf(*gpu_unavailable) - @given(mixed_B=st.booleans(), **test_st) - @settings(**common_settings) + #@unittest.skipIf(*gpu_unavailable) + #@given(mixed_B=st.booleans(), **test_st) + #@settings(**common_settings) def test_backward_adagrad_fp16_pmSUM( # noqa C901 self, - **kwargs: Any, + # **kwargs: Any, ) -> None: + kwargs = { + "mixed_B": False, + "T": 2, + "D": 74, + "B": 65, + "log_E": 5, + "L": 2, + "D_gradcheck": 2, + "stochastic_rounding": True, + "weighted": True, + "row_wise": True, + "mixed": True, + "use_cache": False, + "cache_algorithm": CacheAlgorithm.LRU, + "use_cpu": False, + "output_dtype": SparseType.FP32, + } kwargs = adjust_mixed_B_st(kwargs) execute_backward_adagrad( weights_precision=SparseType.FP16, @@ -53,135 +71,135 @@ def test_backward_adagrad_fp16_pmSUM( # noqa C901 **kwargs, ) - @unittest.skipIf(*gpu_unavailable) - @given( - mixed_B=st.booleans(), - compile=st.booleans(), - **test_st, - ) - @settings(**common_settings) - def test_backward_adagrad_fp16_pmMEAN( # noqa C901 - self, - **kwargs: Any, - ) -> None: - kwargs = adjust_mixed_B_st(kwargs) - execute_backward_adagrad( - weights_precision=SparseType.FP16, - pooling_mode=PoolingMode.MEAN, - **kwargs, - ) - - @unittest.skipIf(*gpu_unavailable) - @given( - compile=st.booleans(), - **test_st, - ) - @settings(**common_settings) - def test_backward_adagrad_fp16_pmNONE( # noqa C901 - self, - **kwargs: Any, - ) -> None: - execute_backward_adagrad( - weights_precision=SparseType.FP16, - pooling_mode=PoolingMode.NONE, - mixed_B=False, - **kwargs, - ) - - @given( - mixed_B=st.booleans(), - compile=st.booleans(), - **test_st, - ) - @settings(**common_settings) - def test_backward_adagrad_fp32_pmSUM( # noqa C901 - self, - **kwargs: Any, - ) -> None: - kwargs = adjust_mixed_B_st(kwargs) - execute_backward_adagrad( - weights_precision=SparseType.FP32, - pooling_mode=PoolingMode.SUM, - **kwargs, - ) - - @given( - compile=st.booleans(), - pooling_mode=st.sampled_from([PoolingMode.SUM, PoolingMode.MEAN]), - **test_st_cpu, - ) - @settings(**common_settings) - def test_backward_adagrad_fp32_cpu( # noqa C901 - self, - pooling_mode: PoolingMode, - **kwargs: Any, - ) -> None: - """ - Test VBE support for CPU on rowwise adagrad - """ - kwargs = adjust_mixed_B_st(kwargs) - execute_backward_adagrad( - weights_precision=SparseType.FP32, - pooling_mode=pooling_mode, - mixed_B=True, - **kwargs, - ) - - @given( - mixed_B=st.booleans(), - compile=st.booleans(), - **test_st, - ) - @settings(**common_settings) - def test_backward_adagrad_fp32_pmMEAN( # noqa C901 - self, - **kwargs: Any, - ) -> None: - kwargs = adjust_mixed_B_st(kwargs) - execute_backward_adagrad( - weights_precision=SparseType.FP32, - pooling_mode=PoolingMode.MEAN, - **kwargs, - ) - - @unittest.skipIf(*gpu_unavailable) - @given( - compile=st.booleans(), - **test_st, - ) - @settings(**common_settings) - def test_backward_adagrad_fp32_pmNONE( # noqa C901 - self, - **kwargs: Any, - ) -> None: - execute_backward_adagrad( - weights_precision=SparseType.FP32, - mixed_B=False, - pooling_mode=PoolingMode.NONE, - **kwargs, - ) - - @unittest.skipIf(*gpu_unavailable) - @given( - mixed_B=st.booleans(), - max_norm=st.floats(min_value=0.01, max_value=1.0), - **test_st, - ) - @settings(**common_settings) - def test_backward_adagrad_fp16_pmSUM_with_max_norm( # noqa C901 - self, - **kwargs: Any, - ) -> None: - kwargs = adjust_mixed_B_st(kwargs) - fixed_strategy = {"row_wise": True, "use_cpu": False} - for key, val in fixed_strategy.items(): - assert key in kwargs - kwargs[key] = val - execute_backward_adagrad( - weights_precision=SparseType.FP16, - pooling_mode=PoolingMode.SUM, - **kwargs, - ) +# @unittest.skipIf(*gpu_unavailable) +# @given( +# mixed_B=st.booleans(), +# compile=st.booleans(), +# **test_st, +# ) +# @settings(**common_settings) +# def test_backward_adagrad_fp16_pmMEAN( # noqa C901 +# self, +# **kwargs: Any, +# ) -> None: +# kwargs = adjust_mixed_B_st(kwargs) +# execute_backward_adagrad( +# weights_precision=SparseType.FP16, +# pooling_mode=PoolingMode.MEAN, +# **kwargs, +# ) +# +# @unittest.skipIf(*gpu_unavailable) +# @given( +# compile=st.booleans(), +# **test_st, +# ) +# @settings(**common_settings) +# def test_backward_adagrad_fp16_pmNONE( # noqa C901 +# self, +# **kwargs: Any, +# ) -> None: +# execute_backward_adagrad( +# weights_precision=SparseType.FP16, +# pooling_mode=PoolingMode.NONE, +# mixed_B=False, +# **kwargs, +# ) +# +# @given( +# mixed_B=st.booleans(), +# compile=st.booleans(), +# **test_st, +# ) +# @settings(**common_settings) +# def test_backward_adagrad_fp32_pmSUM( # noqa C901 +# self, +# **kwargs: Any, +# ) -> None: +# kwargs = adjust_mixed_B_st(kwargs) +# execute_backward_adagrad( +# weights_precision=SparseType.FP32, +# pooling_mode=PoolingMode.SUM, +# **kwargs, +# ) +# +# @given( +# compile=st.booleans(), +# pooling_mode=st.sampled_from([PoolingMode.SUM, PoolingMode.MEAN]), +# **test_st_cpu, +# ) +# @settings(**common_settings) +# def test_backward_adagrad_fp32_cpu( # noqa C901 +# self, +# pooling_mode: PoolingMode, +# **kwargs: Any, +# ) -> None: +# """ +# Test VBE support for CPU on rowwise adagrad +# """ +# kwargs = adjust_mixed_B_st(kwargs) +# execute_backward_adagrad( +# weights_precision=SparseType.FP32, +# pooling_mode=pooling_mode, +# mixed_B=True, +# **kwargs, +# ) +# +# @given( +# mixed_B=st.booleans(), +# compile=st.booleans(), +# **test_st, +# ) +# @settings(**common_settings) +# def test_backward_adagrad_fp32_pmMEAN( # noqa C901 +# self, +# **kwargs: Any, +# ) -> None: +# kwargs = adjust_mixed_B_st(kwargs) +# execute_backward_adagrad( +# weights_precision=SparseType.FP32, +# pooling_mode=PoolingMode.MEAN, +# **kwargs, +# ) +# +# @unittest.skipIf(*gpu_unavailable) +# @given( +# compile=st.booleans(), +# **test_st, +# ) +# @settings(**common_settings) +# def test_backward_adagrad_fp32_pmNONE( # noqa C901 +# self, +# **kwargs: Any, +# ) -> None: +# execute_backward_adagrad( +# weights_precision=SparseType.FP32, +# mixed_B=False, +# pooling_mode=PoolingMode.NONE, +# **kwargs, +# ) +# +# @unittest.skipIf(*gpu_unavailable) +# @given( +# mixed_B=st.booleans(), +# max_norm=st.floats(min_value=0.01, max_value=1.0), +# **test_st, +# ) +# @settings(**common_settings) +# def test_backward_adagrad_fp16_pmSUM_with_max_norm( # noqa C901 +# self, +# **kwargs: Any, +# ) -> None: +# kwargs = adjust_mixed_B_st(kwargs) +# fixed_strategy = {"row_wise": True, "use_cpu": False} +# for key, val in fixed_strategy.items(): +# assert key in kwargs +# kwargs[key] = val +# execute_backward_adagrad( +# weights_precision=SparseType.FP16, +# pooling_mode=PoolingMode.SUM, +# **kwargs, +# ) if __name__ == "__main__":