diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index 88dc4220349..caf8167c3b2 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -7,6 +7,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. ### Added +* Added ``--pc-sampling-rows`` analyze option to cap the PC sampling table at the top N rows (default 10); set ``0`` to show all. Must be non-negative. + * Added ``--bench-only`` profile mode option to run the roofline microbenchmark standalone (without profiling an application or collecting performance counters). No application run is required. Useful for regenerating ``roofline.csv`` in an existing workload directory or running the microbenchmark on systems where only HIP is available but rocprofiler-sdk is not. * Added ``--overwrite`` profile mode option to explicitly allow replacing existing workload output. @@ -22,6 +24,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. ### Changed +* `--pc-sampling-sorting-type` now defaults to `count` (was `offset`), so the PC sampling table shows the most-sampled instructions first. + * Renamed the `Pct of Peak` / `PoP` analysis column to `Percent of Peak` in analysis output. * Moved `--gui` and `--tui` analyze options to experimental status. These features now require the `--experimental` flag to be enabled (e.g., `rocprof-compute analyze --experimental --gui`). diff --git a/projects/rocprofiler-compute/docs/how-to/pc_sampling.rst b/projects/rocprofiler-compute/docs/how-to/pc_sampling.rst index 5a7bec68346..8449b98ad7d 100644 --- a/projects/rocprofiler-compute/docs/how-to/pc_sampling.rst +++ b/projects/rocprofiler-compute/docs/how-to/pc_sampling.rst @@ -40,7 +40,8 @@ Analysis options ================ For using analysis options for PC sampling the configuration needed are: -* ``--pc-sampling-sorting-type``: ``offset`` or ``count``. 
The default option is ``offset``. ``offset`` is an assembly instruction offset in the code object. +* ``--pc-sampling-sorting-type``: ``offset`` or ``count``. The default option is ``count``, which surfaces the most-sampled instructions (hotspots) first. ``offset`` is an assembly instruction offset in the code object. +* ``--pc-sampling-rows``: Maximum number of rows shown in the PC sampling table (DEFAULT: 10). Must be a non-negative integer; use ``0`` to show all rows. **Sample command:** diff --git a/projects/rocprofiler-compute/src/argparser.py b/projects/rocprofiler-compute/src/argparser.py index 6fcdf78fc99..4d3264050ef 100644 --- a/projects/rocprofiler-compute/src/argparser.py +++ b/projects/rocprofiler-compute/src/argparser.py @@ -104,6 +104,21 @@ def block_token_or_alias(s: str) -> str: return s +def non_negative_int(value: str) -> int: + """Argparse ``type=`` callable: parse *value* as an integer >= 0.""" + try: + parsed = int(value) + except ValueError: + raise argparse.ArgumentTypeError( + f"expected an integer, got {value!r}" + ) from None + if parsed < 0: + raise argparse.ArgumentTypeError( + f"must be a non-negative integer (0 means all), got {parsed}" + ) + return parsed + + def print_avail_arch(avail_arch: list[str], args: str) -> str: ret_str = f"List all available {args} for analysis on specified arch:" for arch in avail_arch: @@ -786,10 +801,21 @@ def omniarg_parser( required=False, metavar="", dest="pc_sampling_sorting_type", - default="offset", + default="count", type=str, + choices=["offset", "count"], help="\t\tSet the sorting type of pc sampling: " - "offset or count (DEFAULT: offset).", + "offset or count (DEFAULT: count).", + ) + analyze_group.add_argument( + "--pc-sampling-rows", + required=False, + metavar="", + dest="pc_sampling_rows", + default=10, + type=non_negative_int, + help="\t\tSpecify the maximum number of rows shown in the PC " + "sampling table; use 0 to show all rows (DEFAULT: 10).", ) ## Roofline Command Line Options (analyze: visualization) diff --git a/projects/rocprofiler-compute/src/utils/parser.py 
b/projects/rocprofiler-compute/src/utils/parser.py index fbf30e06ccb..22a845c011d 100755 --- a/projects/rocprofiler-compute/src/utils/parser.py +++ b/projects/rocprofiler-compute/src/utils/parser.py @@ -404,6 +404,7 @@ def load_pc_sampling_data_per_kernel( tool_data: dict[str, Any], sorting_type: str, kernel_name: Optional[str] = None, + num_rows: Optional[int] = None, ) -> pd.DataFrame: """Build the detailed per-instruction PC sampling table from *tool_data*. @@ -413,6 +414,8 @@ def load_pc_sampling_data_per_kernel( :param tool_data: The parsed ``rocprofiler-sdk-tool[0]`` dict. :param sorting_type: "offset" or "count". :param kernel_name: Kernel to filter to, or None for all kernels. + :param num_rows: Keep only the first *num_rows* rows after sorting; None or + 0 keeps every row. """ kernel_context = f"kernel '{kernel_name}'" if kernel_name else "all kernels" pc_samples = tool_data["buffer_records"][ @@ -471,6 +474,10 @@ def load_pc_sampling_data_per_kernel( ) return pd.DataFrame() + # num_rows of 0 or None (or a negative passed programmatically) shows all. + if num_rows and num_rows > 0: + df_sorted = df_sorted.head(num_rows) + df_sorted["offset"] = df_sorted["offset"].apply(hex) # Stochastic adds issue/stall detail on top of the host_trap columns. @@ -489,6 +496,7 @@ def load_pc_sampling_data( file_prefix: str, sorting_type: str, tool_data: Optional[dict[str, Any]], + num_rows: Optional[int] = None, ) -> pd.DataFrame: """Return the detailed per-instruction table for a single kernel or all. 
@@ -513,6 +521,7 @@ def load_pc_sampling_data( pc_sampling_method, tool_data, sorting_type, + num_rows=num_rows, ) if len(workload.filter_kernel_ids) > 1: @@ -539,6 +548,7 @@ def load_pc_sampling_data( tool_data, sorting_type, kernel_name, + num_rows=num_rows, ) @@ -630,6 +640,7 @@ def load_non_mertrics_table( df.loc[0, "from_pc_sampling"], args.pc_sampling_sorting_type, pc_sampling_tool_data, + num_rows=args.pc_sampling_rows, ) workload.dfs.update(tmp) diff --git a/projects/rocprofiler-compute/tests/test_argparser.py b/projects/rocprofiler-compute/tests/test_argparser.py index 56800795152..e9440ba0ec8 100644 --- a/projects/rocprofiler-compute/tests/test_argparser.py +++ b/projects/rocprofiler-compute/tests/test_argparser.py @@ -7,6 +7,7 @@ import argparse from pathlib import Path +from unittest.mock import patch import pytest from common import SUPPORTED_ARCHS @@ -113,3 +114,31 @@ def test_config_dir_requires_value(capsys): build_args(["--config-dir"]) assert exc.value.code == 2 assert "--config-dir" in capsys.readouterr().err + + +def test_pc_sampling_analyze_options(): + """Defaults, overrides, and validation for the analyze PC sampling options.""" + defaults = build_args(["analyze"]) + assert defaults.pc_sampling_sorting_type == "count" + assert defaults.pc_sampling_rows == 10 + + overrides = build_args([ + "analyze", + "--pc-sampling-sorting-type", + "offset", + "--pc-sampling-rows", + "25", + ]) + assert overrides.pc_sampling_sorting_type == "offset" + assert overrides.pc_sampling_rows == 25 + + # 0 is allowed and means "show all rows". + assert build_args(["analyze", "--pc-sampling-rows", "0"]).pc_sampling_rows == 0 + + # Negative row counts trigger an argparse error. 
+ with patch.object( + argparse.ArgumentParser, "error", side_effect=SystemExit(2) + ) as mock_error: + with pytest.raises(SystemExit): + build_args(["analyze", "--pc-sampling-rows", "-1"]) + mock_error.assert_called_once() diff --git a/projects/rocprofiler-compute/tests/test_pc_sampling_analysis.py b/projects/rocprofiler-compute/tests/test_pc_sampling_analysis.py index 39a41ba4578..10b3b7a931a 100644 --- a/projects/rocprofiler-compute/tests/test_pc_sampling_analysis.py +++ b/projects/rocprofiler-compute/tests/test_pc_sampling_analysis.py @@ -614,6 +614,22 @@ def test_load_per_kernel_offset_sort_is_numeric() -> None: assert df["offset"].tolist() == ["0x20", "0x100"] +@pytest.mark.parametrize("num_rows, expected_rows", [(1, 1), (0, 2), (None, 2)]) +def test_load_per_kernel_num_rows_limit( + num_rows: int | None, + expected_rows: int, +) -> None: + """num_rows caps the table after sorting; 0 or None keeps every row.""" + df = load_pc_sampling_data_per_kernel( + method="host_trap", + tool_data=setup_per_kernel_data(), + kernel_name="vecCopy", + sorting_type="count", + num_rows=num_rows, + ) + assert len(df) == expected_rows + + def make_per_kernel_guard_data( instructions: list | None, comments: list | None, @@ -1240,7 +1256,7 @@ def test_load_non_mertrics_table_populates_pc_sampling_from_tool_data( tmp_path: Path, ) -> None: """A ``from_pc_sampling`` table is populated when tool data is provided.""" - args = argparse.Namespace(pc_sampling_sorting_type="count") + args = argparse.Namespace(pc_sampling_sorting_type="count", pc_sampling_rows=10) workload = schema.Workload() workload.dfs = {2101: pd.DataFrame({"from_pc_sampling": ["ps_file"]})} tool_data = make_tool_data(**sample_tool_data_kwargs()) @@ -1254,7 +1270,7 @@ def test_load_non_mertrics_table_pc_sampling_empty_without_tool_data( tmp_path: Path, ) -> None: """Without tool data the ``from_pc_sampling`` table stays empty (no crash).""" - args = argparse.Namespace(pc_sampling_sorting_type="count") + args = 
argparse.Namespace(pc_sampling_sorting_type="count", pc_sampling_rows=10) workload = schema.Workload() workload.dfs = {2101: pd.DataFrame({"from_pc_sampling": ["ps_file"]})} load_non_mertrics_table(workload, str(tmp_path), args)