-
Notifications
You must be signed in to change notification settings - Fork 740
[PyTorch Debug] Add scale_inv_std stat and skip NVFP4 layers in LogFp8TensorStats #3044
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
80ad4d9
441a793
4149e81
dfdf630
0478e3f
5f4de10
a0f417b
97d6d80
4cdf022
f211f76
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,14 +11,21 @@ | |
| import torch | ||
| import nvdlfw_inspect.api as debug_api | ||
|
|
||
| from nvdlfw_inspect.debug_features.log_tensor_stats import LogTensorStats as BaseLogTensorStats | ||
| from nvdlfw_inspect.debug_features.log_tensor_stats import ( | ||
| LogTensorStats as BaseLogTensorStats, | ||
| ) | ||
| from nvdlfw_inspect.registry import Registry, api_method | ||
|
|
||
| from transformer_engine.debug.features.utils.stats_buffer import STATS_BUFFERS | ||
| from transformer_engine.pytorch.tensor import Quantizer, QuantizedTensor | ||
| from transformer_engine.pytorch.tensor.nvfp4_tensor import NVFP4Quantizer | ||
| from transformer_engine.debug.features.utils import get_reduction_params, next_enabled_iter | ||
| from transformer_engine.pytorch.tensor.storage.nvfp4_tensor_storage import NVFP4TensorStorage | ||
| from transformer_engine.debug.features.utils import ( | ||
| get_reduction_params, | ||
| next_enabled_iter, | ||
| ) | ||
| from transformer_engine.pytorch.tensor.storage.nvfp4_tensor_storage import ( | ||
| NVFP4TensorStorage, | ||
| ) | ||
|
|
||
|
|
||
| @Registry.register_feature(namespace="transformer_engine") | ||
|
|
@@ -45,6 +52,10 @@ class LogNvfp4TensorStats(BaseLogTensorStats): | |
| List of statistics to collect. Available stats: | ||
| - underflows% - percentage of non-zero elements clipped to 0 (from packed FP4 data) | ||
| - mse - mean squared error = sum((quantized_tensor - original_tensor)**2) / num_elements | ||
| - scale_inv_min - minimum of the inverse of the scaling factors | ||
| - scale_inv_max - maximum of the inverse of the scaling factors | ||
| - scale_inv_std - population standard deviation of the inverse of the scaling factors; | ||
| useful for spotting clipping that min/max alone can miss | ||
|
|
||
| tensors/tensors_struct: List[str] | ||
| list of tensors to log | ||
|
|
@@ -85,13 +96,18 @@ class LogNvfp4TensorStats(BaseLogTensorStats): | |
|
|
||
| def check_if_stat_is_supported(self, stat: str): | ||
| """Returns True if stat is supported, raises ValueError otherwise.""" | ||
| bare = stat[: -len("_columnwise")] if stat.endswith("_columnwise") else stat | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. `mse_columnwise` and `underflows%_columnwise` should not pass this function, but they would with the logic as written.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I added them as exceptions. We should arguably have stats for them as well, but that is outside the scope of this PR. |
||
| supported_stats = [ | ||
| "underflows%", | ||
| "mse", | ||
| "scale_inv_min", | ||
| "scale_inv_max", | ||
| "scale_inv_std", | ||
| ] | ||
| if stat not in supported_stats: | ||
| if bare not in supported_stats: | ||
| raise ValueError( | ||
| f"Stat {stat} is not supported for NVFP4. Supported stats: {supported_stats}" | ||
| " (any of these may take an optional '_columnwise' suffix)" | ||
| ) | ||
| return True | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.