
Debug stochastic rounding issue #3906


Open · wants to merge 1 commit into base: main
@@ -98,6 +98,9 @@ DEVICE_INLINE void {{ mdesc }}_{{ optimizer }}_table_update_kernel(
{%- endfor %}

StochasticRoundingRNGState state;
if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0) {
printf("STOC_DEBUG: before optimizer &state=%p, stochastic_rounding=%d\n", &state, (int) stochastic_rounding);
}
auto weight_row_template =
WeightRow<emb_t, cache_t, at::acc_type<cache_t, true>>(
weights,
@@ -106,14 +109,16 @@ DEVICE_INLINE void {{ mdesc }}_{{ optimizer }}_table_update_kernel(
stochastic_rounding ? &state : nullptr,
&stochastic_rounding_philox_args,
threadIdx.x + run_id * blockDim.x);
if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0) {
printf("STOC_DEBUG: after optimizer stochastic_rounding %d\n", stochastic_rounding ? 1 : 0);
}

float2 qparams_template;
if (kIsInt8 && !cache_weights) {
qparams_template = weight_row_template.load_qparams();
}

{{ split_precomputation }}

{# /* Note: technically, global weight decay (gwd) compensation should be done before
`split_precomputation`). But since decouple mode in `rowwise_adagrad` only computes correction,
the order of applying gwd does not matter. We perform gwd update before `split_weight_update`
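The STOC_DEBUG prints added above are gated so that only one thread in the grid writes to the device-side printf buffer. A minimal, self-contained sketch of that gating pattern (illustrative only; the kernel name and parameter are placeholders, not FBGEMM's actual kernel):

#include <cstdio>

// Illustrative debug kernel: only thread (0, 0) of block 0 prints, so the
// device-side printf buffer is not flooded by every thread in the grid.
__global__ void stoc_debug_print_example(const bool stochastic_rounding) {
  if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0) {
    printf("STOC_DEBUG: stochastic_rounding=%d\n", (int)stochastic_rounding);
  }
}

Note that device-side printf output is only flushed at synchronization points (e.g. cudaDeviceSynchronize()), so the host should synchronize before inspecting the log.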
12 changes: 12 additions & 0 deletions fbgemm_gpu/include/fbgemm_gpu/utils/weight_row.cuh
@@ -28,8 +28,14 @@ DEVICE_INLINE void quantize_store(
StochasticRoundingRNGState* state,
const float2 qparams) {
if (!state) {
if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0) {
printf("STOC_DEBUG: nearest_rounding_vector\n");
}
nearest_rounding_vector<dst_t, src_t>(output, value, qparams);
} else {
if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0) {
printf("STOC_DEBUG: stochastic_rounding_vector\n");
}
stochastic_rounding_vector<dst_t, src_t>(output, value, *state, qparams);
}
}
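For context, quantize_store above falls back to nearest rounding when no RNG state is supplied and otherwise applies stochastic rounding. A minimal sketch of float-to-half stochastic rounding using the cuda_fp16.h intrinsics (an illustration of the technique, not FBGEMM's stochastic_rounding_vector; rand01 is assumed to be a uniform sample in [0, 1)):

#include <cuda_fp16.h>

// Round x to one of its two neighboring half values. The probability of
// rounding up is proportional to how close x is to the upper neighbor, so the
// result is unbiased in expectation, unlike round-to-nearest.
__device__ __half stochastic_round_to_half(float x, float rand01) {
  const __half lo = __float2half_rd(x);  // round toward -infinity
  const __half hi = __float2half_ru(x);  // round toward +infinity
  const float flo = __half2float(lo);
  const float fhi = __half2float(hi);
  if (flo == fhi) {
    return lo;  // x is exactly representable in half precision
  }
  const float p_up = (x - flo) / (fhi - flo);
  return (rand01 < p_up) ? hi : lo;
}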
@@ -133,9 +139,15 @@ struct WeightRow {
: row_(row), cache_row_(cache_row), dim_(dim) {
// Set the internal stoc_rounding_state_
stoc_rounding_state_ = stoc_rounding_state;
if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0) {
printf("STOC_DEBUG: WeightRow has stoc_rounding_state_ %p\n", stoc_rounding_state);
}

if constexpr (!std::is_same_v<emb_t, float>) {
if (stoc_rounding_state != nullptr) {
if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0) {
printf("STOC_DEBUG: WeightRow init stoc_rounding_state\n");
}
const auto stochastic_rounding_seeds =
at::cuda::philox::unpack(*stochastic_rounding_philox_args);

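The WeightRow constructor above unpacks the Philox seeds and initializes the per-thread stochastic-rounding RNG state only when the embedding type is not float. A rough cuRAND analogue of that per-thread setup (illustrative only: curandStatePhilox4_32_10_t stands in for StochasticRoundingRNGState, and seed/offset are placeholder arguments):

#include <curand_kernel.h>

// Each thread keys its own Philox state by (seed, subsequence, offset),
// mirroring the threadIdx.x + run_id * blockDim.x counter passed to WeightRow.
__global__ void init_rounding_rng_example(uint64_t seed, uint64_t offset, float* out) {
  const uint64_t subsequence =
      threadIdx.x + static_cast<uint64_t>(blockIdx.x) * blockDim.x;
  curandStatePhilox4_32_10_t state;
  curand_init(seed, subsequence, offset, &state);
  // Draw four uniforms in (0, 1], e.g. one per lane of a float4 row chunk.
  const float4 r = curand_uniform4(&state);
  out[threadIdx.x + blockIdx.x * blockDim.x] = r.x;
}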
4 changes: 4 additions & 0 deletions fbgemm_gpu/test/tbe/training/backward_adagrad_common.py
@@ -8,6 +8,7 @@
# pyre-strict

import sys
import logging

from typing import Any, Dict

@@ -288,6 +289,7 @@ def execute_backward_adagrad( # noqa C901
# do SGD update
lr = 0.5
eps = 0.2
logging.info("STOC_DEBUG: stochastic_rounding {}".format(stochastic_rounding))

optimizer = OptimType.EXACT_ROWWISE_ADAGRAD if row_wise else OptimType.EXACT_ADAGRAD
cc = emb_op(
@@ -354,6 +356,8 @@ def execute_backward_adagrad( # noqa C901
split_optimizer_states = cc.split_optimizer_states()
assert len(split_optimizer_states) == T

return

get_optimizer_states = None
if row_wise:
# get_optimizer_state should/must be implemented for rowwise
286 changes: 152 additions & 134 deletions fbgemm_gpu/test/tbe/training/backward_adagrad_test.py
@@ -12,9 +12,9 @@
import unittest
from typing import Any, Dict

from hypothesis import given, settings

[GitHub Actions / run-lint (3.13), line 15 in fbgemm_gpu/test/tbe/training/backward_adagrad_test.py] F401: 'hypothesis.given' and 'hypothesis.settings' imported but unused

[GitHub Actions / run-lint (3.13), line 17 in fbgemm_gpu/test/tbe/training/backward_adagrad_test.py] F401: '.backward_adagrad_common.additional_decorators', 'common_settings', 'gpu_unavailable', and 'optests' imported but unused

from .backward_adagrad_common import (
additional_decorators,
adjust_mixed_B_st,
common_settings,
@@ -23,6 +23,7 @@
gpu_unavailable,
optests,
PoolingMode,
CacheAlgorithm,
SparseType,
st,
)
@@ -36,15 +37,32 @@
test_st_cpu["output_dtype"] = st.sampled_from([SparseType.FP32, SparseType.FP16])


@optests.generate_opcheck_tests(fast=True, additional_decorators=additional_decorators)
#@optests.generate_opcheck_tests(fast=True, additional_decorators=additional_decorators)
class BackwardAdagradTest(unittest.TestCase):
@unittest.skipIf(*gpu_unavailable)
@given(mixed_B=st.booleans(), **test_st)
@settings(**common_settings)
#@unittest.skipIf(*gpu_unavailable)
#@given(mixed_B=st.booleans(), **test_st)
#@settings(**common_settings)
def test_backward_adagrad_fp16_pmSUM( # noqa C901
self,
**kwargs: Any,
# **kwargs: Any,
) -> None:
kwargs = {
"mixed_B": False,
"T": 2,
"D": 74,
"B": 65,
"log_E": 5,
"L": 2,
"D_gradcheck": 2,
"stochastic_rounding": True,
"weighted": True,
"row_wise": True,
"mixed": True,
"use_cache": False,
"cache_algorithm": CacheAlgorithm.LRU,
"use_cpu": False,
"output_dtype": SparseType.FP32,
}
kwargs = adjust_mixed_B_st(kwargs)
execute_backward_adagrad(
weights_precision=SparseType.FP16,
@@ -53,135 +71,135 @@
**kwargs,
)

@unittest.skipIf(*gpu_unavailable)
@given(
mixed_B=st.booleans(),
compile=st.booleans(),
**test_st,
)
@settings(**common_settings)
def test_backward_adagrad_fp16_pmMEAN( # noqa C901
self,
**kwargs: Any,
) -> None:
kwargs = adjust_mixed_B_st(kwargs)
execute_backward_adagrad(
weights_precision=SparseType.FP16,
pooling_mode=PoolingMode.MEAN,
**kwargs,
)

@unittest.skipIf(*gpu_unavailable)
@given(
compile=st.booleans(),
**test_st,
)
@settings(**common_settings)
def test_backward_adagrad_fp16_pmNONE( # noqa C901
self,
**kwargs: Any,
) -> None:
execute_backward_adagrad(
weights_precision=SparseType.FP16,
pooling_mode=PoolingMode.NONE,
mixed_B=False,
**kwargs,
)

@given(
mixed_B=st.booleans(),
compile=st.booleans(),
**test_st,
)
@settings(**common_settings)
def test_backward_adagrad_fp32_pmSUM( # noqa C901
self,
**kwargs: Any,
) -> None:
kwargs = adjust_mixed_B_st(kwargs)
execute_backward_adagrad(
weights_precision=SparseType.FP32,
pooling_mode=PoolingMode.SUM,
**kwargs,
)

@given(
compile=st.booleans(),
pooling_mode=st.sampled_from([PoolingMode.SUM, PoolingMode.MEAN]),
**test_st_cpu,
)
@settings(**common_settings)
def test_backward_adagrad_fp32_cpu( # noqa C901
self,
pooling_mode: PoolingMode,
**kwargs: Any,
) -> None:
"""
Test VBE support for CPU on rowwise adagrad
"""
kwargs = adjust_mixed_B_st(kwargs)
execute_backward_adagrad(
weights_precision=SparseType.FP32,
pooling_mode=pooling_mode,
mixed_B=True,
**kwargs,
)

@given(
mixed_B=st.booleans(),
compile=st.booleans(),
**test_st,
)
@settings(**common_settings)
def test_backward_adagrad_fp32_pmMEAN( # noqa C901
self,
**kwargs: Any,
) -> None:
kwargs = adjust_mixed_B_st(kwargs)
execute_backward_adagrad(
weights_precision=SparseType.FP32,
pooling_mode=PoolingMode.MEAN,
**kwargs,
)

@unittest.skipIf(*gpu_unavailable)
@given(
compile=st.booleans(),
**test_st,
)
@settings(**common_settings)
def test_backward_adagrad_fp32_pmNONE( # noqa C901
self,
**kwargs: Any,
) -> None:
execute_backward_adagrad(
weights_precision=SparseType.FP32,
mixed_B=False,
pooling_mode=PoolingMode.NONE,
**kwargs,
)

@unittest.skipIf(*gpu_unavailable)
@given(
mixed_B=st.booleans(),
max_norm=st.floats(min_value=0.01, max_value=1.0),
**test_st,
)
@settings(**common_settings)
def test_backward_adagrad_fp16_pmSUM_with_max_norm( # noqa C901
self,
**kwargs: Any,
) -> None:
kwargs = adjust_mixed_B_st(kwargs)
fixed_strategy = {"row_wise": True, "use_cpu": False}
for key, val in fixed_strategy.items():
assert key in kwargs
kwargs[key] = val
execute_backward_adagrad(
weights_precision=SparseType.FP16,
pooling_mode=PoolingMode.SUM,
**kwargs,
)
# @unittest.skipIf(*gpu_unavailable)
# @given(
# mixed_B=st.booleans(),
# compile=st.booleans(),
# **test_st,
# )
# @settings(**common_settings)
# def test_backward_adagrad_fp16_pmMEAN( # noqa C901
# self,
# **kwargs: Any,
# ) -> None:
# kwargs = adjust_mixed_B_st(kwargs)
# execute_backward_adagrad(
# weights_precision=SparseType.FP16,
# pooling_mode=PoolingMode.MEAN,
# **kwargs,
# )
#
# @unittest.skipIf(*gpu_unavailable)
# @given(
# compile=st.booleans(),
# **test_st,
# )
# @settings(**common_settings)
# def test_backward_adagrad_fp16_pmNONE( # noqa C901
# self,
# **kwargs: Any,
# ) -> None:
# execute_backward_adagrad(
# weights_precision=SparseType.FP16,
# pooling_mode=PoolingMode.NONE,
# mixed_B=False,
# **kwargs,
# )
#
# @given(
# mixed_B=st.booleans(),
# compile=st.booleans(),
# **test_st,
# )
# @settings(**common_settings)
# def test_backward_adagrad_fp32_pmSUM( # noqa C901
# self,
# **kwargs: Any,
# ) -> None:
# kwargs = adjust_mixed_B_st(kwargs)
# execute_backward_adagrad(
# weights_precision=SparseType.FP32,
# pooling_mode=PoolingMode.SUM,
# **kwargs,
# )
#
# @given(
# compile=st.booleans(),
# pooling_mode=st.sampled_from([PoolingMode.SUM, PoolingMode.MEAN]),
# **test_st_cpu,
# )
# @settings(**common_settings)
# def test_backward_adagrad_fp32_cpu( # noqa C901
# self,
# pooling_mode: PoolingMode,
# **kwargs: Any,
# ) -> None:
# """
# Test VBE support for CPU on rowwise adagrad
# """
# kwargs = adjust_mixed_B_st(kwargs)
# execute_backward_adagrad(
# weights_precision=SparseType.FP32,
# pooling_mode=pooling_mode,
# mixed_B=True,
# **kwargs,
# )
#
# @given(
# mixed_B=st.booleans(),
# compile=st.booleans(),
# **test_st,
# )
# @settings(**common_settings)
# def test_backward_adagrad_fp32_pmMEAN( # noqa C901
# self,
# **kwargs: Any,
# ) -> None:
# kwargs = adjust_mixed_B_st(kwargs)
# execute_backward_adagrad(
# weights_precision=SparseType.FP32,
# pooling_mode=PoolingMode.MEAN,
# **kwargs,
# )
#
# @unittest.skipIf(*gpu_unavailable)
# @given(
# compile=st.booleans(),
# **test_st,
# )
# @settings(**common_settings)
# def test_backward_adagrad_fp32_pmNONE( # noqa C901
# self,
# **kwargs: Any,
# ) -> None:
# execute_backward_adagrad(
# weights_precision=SparseType.FP32,
# mixed_B=False,
# pooling_mode=PoolingMode.NONE,
# **kwargs,
# )
#
# @unittest.skipIf(*gpu_unavailable)
# @given(
# mixed_B=st.booleans(),
# max_norm=st.floats(min_value=0.01, max_value=1.0),
# **test_st,
# )
# @settings(**common_settings)
# def test_backward_adagrad_fp16_pmSUM_with_max_norm( # noqa C901
# self,
# **kwargs: Any,
# ) -> None:
# kwargs = adjust_mixed_B_st(kwargs)
# fixed_strategy = {"row_wise": True, "use_cpu": False}
# for key, val in fixed_strategy.items():
# assert key in kwargs
# kwargs[key] = val
# execute_backward_adagrad(
# weights_precision=SparseType.FP16,
# pooling_mode=PoolingMode.SUM,
# **kwargs,
# )


if __name__ == "__main__":