Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ Set via CLI (`--mode`) or env var (`RTL_MODE=lite`).

**lite** skips packets that already have a completion signal (e.g., NCCL kernels, barriers), which gives near-zero overhead and keeps profiling safe on ROCm <= 7.2. This is the default when `--mode` is not specified. **standard** mode profiles all count==1 dispatches, including those with signals. **full** profiles everything, including CUDAGraph replay batches, but requires ROCm 7.13+ to avoid a [known ROCR heap overflow](https://github.com/sunway513/rocm-trace-lite/issues/67).

> **Graph replay note:** `lite`, `standard`, and `hip` modes do **not** profile kernels dispatched via HIP graph replay (`hipGraphLaunch`). If your workload uses CUDAGraph / HIP graph (e.g., ATOM/vLLM with `--enforce-eager` disabled), these modes will produce an incomplete trace — the tool will emit a `WARNING: INCOMPLETE TRACE` message at shutdown and record the gap in trace metadata. Use `--mode full` to capture all graph replay kernels.

Sample output:

```text
Expand Down
57 changes: 42 additions & 15 deletions rocm_trace_lite/cli.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python3
"""rtl: Self-contained GPU kernel profiler for ROCm."""

import argparse
import sys

Expand All @@ -9,33 +10,53 @@ def main():
prog="rtl",
description="Self-contained GPU kernel profiler for ROCm. Zero roctracer dependency.",
)
parser.add_argument("--version", action="version", version=f"%(prog)s {get_version()}")
parser.add_argument(
"--version", action="version", version=f"%(prog)s {get_version()}"
)
sub = parser.add_subparsers(dest="command")

# trace
trace_p = sub.add_parser("trace", help="Trace a workload")
trace_p.add_argument("-o", "--output", default="trace.db", help="Output trace file (default: trace.db)")
trace_p.add_argument("-m", "--mode", choices=["lite", "standard", "default", "full", "hip"], default=None,
help="Profiling mode: "
"lite (~0%% overhead, skip has-signal packets, safe for all ROCm, default), "
"standard (GPU timing for all count==1 dispatches, skip graph replay), "
"hip (standard + HIP API interception for CPU-GPU correlation), "
"full (profile everything including graph replay, requires ROCm 7.13+). "
"'default' is accepted as alias for 'standard'.")
trace_p.add_argument(
"-o",
"--output",
default="trace.db",
help="Output trace file (default: trace.db)",
)
trace_p.add_argument(
"-m",
"--mode",
choices=["lite", "standard", "full", "hip"],
default=None,
help="Profiling mode: "
"lite (~0%% overhead, skip has-signal packets, safe for all ROCm, default), "
"standard (GPU timing for all count==1 dispatches, skip graph replay), "
"hip (standard + HIP API interception for CPU-GPU correlation), "
"full (profile everything including graph replay, requires ROCm 7.13+).",
)
trace_p.add_argument("cmd", nargs=argparse.REMAINDER, help="Command to trace")

# convert
conv_p = sub.add_parser("convert", help="Convert RPD trace to Perfetto JSON or rocprofv3 JSON")
conv_p = sub.add_parser(
"convert", help="Convert RPD trace to Perfetto JSON or rocprofv3 JSON"
)
conv_p.add_argument("input", help="Input .db file")
conv_p.add_argument("-o", "--output", default=None, help="Output file path")
conv_p.add_argument("-f", "--format", choices=["perfetto", "rocprofv3"], default="perfetto",
help="Output format: perfetto (Chrome Trace JSON, default) or "
"rocprofv3 (rocprofiler-sdk-tool JSON for TraceLens)")
conv_p.add_argument(
"-f",
"--format",
choices=["perfetto", "rocprofv3"],
default="perfetto",
help="Output format: perfetto (Chrome Trace JSON, default) or "
"rocprofv3 (rocprofiler-sdk-tool JSON for TraceLens)",
)

# summary
sum_p = sub.add_parser("summary", help="Show top kernels from trace")
sum_p.add_argument("input", help="Input .db file")
sum_p.add_argument("-n", "--limit", type=int, default=20, help="Number of rows (default: 20)")
sum_p.add_argument(
"-n", "--limit", type=int, default=20, help="Number of rows (default: 20)"
)

# info
info_p = sub.add_parser("info", help="Show trace metadata")
Expand All @@ -49,25 +70,31 @@ def main():
# dispatch
if args.command == "trace":
from rocm_trace_lite.cmd_trace import run_trace

run_trace(args)
elif args.command == "convert":
if getattr(args, 'format', 'perfetto') == 'rocprofv3':
if getattr(args, "format", "perfetto") == "rocprofv3":
from rocm_trace_lite.cmd_convert_rocprofv3 import run_convert_rocprofv3

run_convert_rocprofv3(args)
else:
from rocm_trace_lite.cmd_convert import run_convert

run_convert(args)
elif args.command == "summary":
from rocm_trace_lite.cmd_summary import run_summary

run_summary(args)
elif args.command == "info":
from rocm_trace_lite.cmd_info import run_info

run_info(args)


def get_version():
    """Return the rocm_trace_lite package version string.

    Falls back to the literal "unknown" when ``__version__`` cannot be
    imported from ``rocm_trace_lite`` (ImportError).
    """
    version = "unknown"
    try:
        # Imported lazily so a missing package only degrades the version string.
        from rocm_trace_lite import __version__ as version
    except ImportError:
        pass
    return version
Expand Down
28 changes: 25 additions & 3 deletions rocm_trace_lite/cmd_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,12 @@ def run_info(args):
print(f" Size: {os.path.getsize(args.input) / 1024 / 1024:.1f} MB")

# Tables
tables = [r[0] for r in conn.execute(
"SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
).fetchall()]
tables = [
r[0]
for r in conn.execute(
"SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
).fetchall()
]
print(f" Tables: {', '.join(tables)}")

# Record counts (guard on table existence)
Expand Down Expand Up @@ -57,3 +60,22 @@ def run_info(args):
"SELECT count(DISTINCT description_id) FROM rocpd_op"
).fetchone()[0]
print(f" Unique kernels: {kernels}")

# Trace completeness warning
if "rocpd_metadata" in tables:
try:
row = conn.execute(
"SELECT value FROM rocpd_metadata WHERE tag='drop_batch_skip'"
).fetchone()
if row and int(row[0]) > 0:
mode_row = conn.execute(
"SELECT value FROM rocpd_metadata WHERE tag='rtl_mode'"
).fetchone()
mode = mode_row[0] if mode_row else "unknown"
print(
f"\n WARNING: INCOMPLETE TRACE — {row[0]} kernel(s) from "
f"graph replay were not profiled (mode={mode})."
)
print(" Use '--mode full' for complete HIP graph replay coverage.")
except (sqlite3.OperationalError, ValueError):
pass
39 changes: 37 additions & 2 deletions src/hsa_intercept.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -651,7 +651,17 @@ static void shutdown() {
fprintf(stderr, " drop (ts fail): %" PRIu64 "\n", g_drop_ts_fail.load());
fprintf(stderr, " drop (ts invalid): %" PRIu64 "\n", g_drop_ts_invalid.load());
fprintf(stderr, " recorded OK: %" PRIu64 "\n", g_recorded_ok.load());
fprintf(stderr, "====================================\n\n");
fprintf(stderr, "====================================\n");

uint64_t batch_skipped = g_drop_batch_skip.load();
if (batch_skipped > 0) {
fprintf(stderr,
"\n*** WARNING: INCOMPLETE TRACE — %" PRIu64 " GPU kernel(s) from graph replay "
"were not profiled. ***\n"
"*** Use '--mode full' (or RTL_MODE=full) for complete HIP graph replay coverage. ***\n",
batch_skipped);
}
fprintf(stderr, "\n");

// Signal worker to stop
g_shutdown.store(true, std::memory_order_release);
Expand All @@ -676,6 +686,27 @@ static void shutdown() {
}
}

// Write profiling metadata to DB before closing. The rtl_mode and
// drop_batch_skip tags are what `rtl info` reads back to print its
// incomplete-trace warning.
{
    // Index -> name table for g_rtl_mode.
    // NOTE(review): assumes the RtlMode enumerators are numbered in this
    // order (standard, lite, full) — confirm against the enum definition.
    static const char* const mode_names[] = {"standard", "lite", "full"};
    constexpr int mode_count =
        static_cast<int>(sizeof(mode_names) / sizeof(mode_names[0]));

    // Bounds-check before indexing: an enum value outside the table (for
    // example a mode added later) must not read past the end of the array.
    const int mode_idx = static_cast<int>(g_rtl_mode);
    std::string mode_str =
        (mode_idx >= 0 && mode_idx < mode_count) ? mode_names[mode_idx] : "unknown";
    if (hip_api::g_enabled.load()) mode_str += "+hip";

    // Bind the DB handle once; auto&& works whether the accessor returns a
    // reference or a temporary.
    auto&& db = trace_db::get_trace_db();
    db.record_metadata("rtl_mode", mode_str.c_str());

    // Counters are stored as decimal text; 64 bytes is ample for a uint64.
    char buf[64];
    snprintf(buf, sizeof(buf), "%" PRIu64, batch_skipped);
    db.record_metadata("drop_batch_skip", buf);

    snprintf(buf, sizeof(buf), "%" PRIu64, g_recorded_ok.load());
    db.record_metadata("recorded_ok", buf);

    // The trace counts as complete only when no graph-replay batch was skipped.
    db.record_metadata("trace_complete", batch_skipped > 0 ? "false" : "true");
}

// Flush and close trace DB
trace_db::get_trace_db().flush();
trace_db::get_trace_db().close();
Expand Down Expand Up @@ -737,8 +768,12 @@ extern "C" bool OnLoad(void* pTable,
if (mode_env) {
if (strcmp(mode_env, "lite") == 0) {
g_rtl_mode = RtlMode::LITE;
} else if (strcmp(mode_env, "standard") == 0 || strcmp(mode_env, "default") == 0) {
} else if (strcmp(mode_env, "standard") == 0) {
g_rtl_mode = RtlMode::STANDARD;
} else if (strcmp(mode_env, "default") == 0) {
g_rtl_mode = RtlMode::STANDARD;
fprintf(stderr, "rtl: WARNING: RTL_MODE='default' is deprecated and misleading "
"(actual default is 'lite'). Use 'standard' instead.\n");
} else if (strcmp(mode_env, "full") == 0) {
g_rtl_mode = RtlMode::FULL;
} else if (strcmp(mode_env, "hip") == 0) {
Expand Down
14 changes: 14 additions & 0 deletions src/trace_db.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,20 @@ void TraceDB::flush() {
}
}

void TraceDB::record_metadata(const char* tag, const char* value) {
std::lock_guard<std::mutex> lock(g_db_mutex);
if (!db_) return;
sqlite3_stmt* s = nullptr;
if (sqlite3_prepare_v2(db_, "INSERT INTO rocpd_metadata(tag,value) VALUES(?1,?2)",
-1, &s, nullptr) != SQLITE_OK) {
return;
}
sqlite3_bind_text(s, 1, tag, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(s, 2, value, -1, SQLITE_TRANSIENT);
sqlite3_step(s);
sqlite3_finalize(s);
}

void TraceDB::record_hip_api(const char* name, const char* args,
uint64_t start_ns, uint64_t duration_ns,
uint64_t correlation_id, int pid, int tid) {
Expand Down
3 changes: 3 additions & 0 deletions src/trace_db.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ class TraceDB {
void record_roctx(const char* message, uint64_t start_ns, uint64_t duration_ns,
uint64_t correlation_id);

// Record a metadata tag/value pair into rocpd_metadata.
// Issues a plain INSERT on every call; it does not update an existing row
// for the same tag. Takes the DB mutex for the duration of the call and
// silently does nothing if the database is not open or the statement cannot
// be prepared.
void record_metadata(const char* tag, const char* value);

// Flush buffered records
void flush();

Expand Down