diff --git a/README.md b/README.md index 3a671d4..69d18eb 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,8 @@ Set via CLI (`--mode`) or env var (`RTL_MODE=lite`). **lite** skips packets that already have a completion signal (e.g., NCCL kernels, barriers), resulting in near-zero overhead and safety on ROCm <= 7.2. This is the default when `--mode` is not specified. **standard** mode profiles all count==1 dispatches including those with signals. **full** profiles everything including CUDAGraph replay batches, but requires ROCm 7.13+ to avoid a [known ROCR heap overflow](https://github.com/sunway513/rocm-trace-lite/issues/67). +> **Graph replay note:** `lite`, `standard`, and `hip` modes do **not** profile kernels dispatched via HIP graph replay (`hipGraphLaunch`). If your workload uses CUDAGraph / HIP graph (e.g., ATOM/vLLM run without `--enforce-eager`), these modes produce an incomplete trace — the tool prints a `WARNING: INCOMPLETE TRACE` message at shutdown and records the gap in the trace metadata, which `rtl info` also reports. Use `--mode full` (requires ROCm 7.13+) to capture all graph replay kernels. + Sample output: ```text diff --git a/rocm_trace_lite/cli.py b/rocm_trace_lite/cli.py index a0e10e6..205c2c3 100644 --- a/rocm_trace_lite/cli.py +++ b/rocm_trace_lite/cli.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 """rtl: Self-contained GPU kernel profiler for ROCm.""" + import argparse import sys @@ -9,33 +10,53 @@ def main(): prog="rtl", description="Self-contained GPU kernel profiler for ROCm. 
Zero roctracer dependency.", ) - parser.add_argument("--version", action="version", version=f"%(prog)s {get_version()}") + parser.add_argument( + "--version", action="version", version=f"%(prog)s {get_version()}" + ) sub = parser.add_subparsers(dest="command") # trace trace_p = sub.add_parser("trace", help="Trace a workload") - trace_p.add_argument("-o", "--output", default="trace.db", help="Output trace file (default: trace.db)") - trace_p.add_argument("-m", "--mode", choices=["lite", "standard", "default", "full", "hip"], default=None, - help="Profiling mode: " - "lite (~0%% overhead, skip has-signal packets, safe for all ROCm, default), " - "standard (GPU timing for all count==1 dispatches, skip graph replay), " - "hip (standard + HIP API interception for CPU-GPU correlation), " - "full (profile everything including graph replay, requires ROCm 7.13+). " - "'default' is accepted as alias for 'standard'.") + trace_p.add_argument( + "-o", + "--output", + default="trace.db", + help="Output trace file (default: trace.db)", + ) + trace_p.add_argument( + "-m", + "--mode", + choices=["lite", "standard", "default", "full", "hip"],  # "default" stays accepted so existing invocations keep working; RTL_MODE='default' is still honoured by the runtime with a deprecation warning + default=None, + help="Profiling mode: " + "lite (~0%% overhead, skip has-signal packets, safe for all ROCm, default), " + "standard (GPU timing for all count==1 dispatches, skip graph replay), " + "hip (standard + HIP API interception for CPU-GPU correlation), " + "full (profile everything including graph replay, requires ROCm 7.13+). 'default' is a deprecated alias for 'standard'.", + ) trace_p.add_argument("cmd", nargs=argparse.REMAINDER, help="Command to trace") # convert - conv_p = sub.add_parser("convert", help="Convert RPD trace to Perfetto JSON or rocprofv3 JSON") + conv_p = sub.add_parser( + "convert", help="Convert RPD trace to Perfetto JSON or rocprofv3 JSON" + ) conv_p.add_argument("input", help="Input .db file") conv_p.add_argument("-o", "--output", default=None, help="Output file path") - conv_p.add_argument("-f", "--format", choices=["perfetto", "rocprofv3"], default="perfetto", - 
help="Output format: perfetto (Chrome Trace JSON, default) or " - "rocprofv3 (rocprofiler-sdk-tool JSON for TraceLens)") + conv_p.add_argument( + "-f", + "--format", + choices=["perfetto", "rocprofv3"], + default="perfetto", + help="Output format: perfetto (Chrome Trace JSON, default) or " + "rocprofv3 (rocprofiler-sdk-tool JSON for TraceLens)", + ) # summary sum_p = sub.add_parser("summary", help="Show top kernels from trace") sum_p.add_argument("input", help="Input .db file") - sum_p.add_argument("-n", "--limit", type=int, default=20, help="Number of rows (default: 20)") + sum_p.add_argument( + "-n", "--limit", type=int, default=20, help="Number of rows (default: 20)" + ) # info info_p = sub.add_parser("info", help="Show trace metadata") @@ -49,25 +70,31 @@ def main(): # dispatch if args.command == "trace": from rocm_trace_lite.cmd_trace import run_trace + run_trace(args) elif args.command == "convert": - if getattr(args, 'format', 'perfetto') == 'rocprofv3': + if getattr(args, "format", "perfetto") == "rocprofv3": from rocm_trace_lite.cmd_convert_rocprofv3 import run_convert_rocprofv3 + run_convert_rocprofv3(args) else: from rocm_trace_lite.cmd_convert import run_convert + run_convert(args) elif args.command == "summary": from rocm_trace_lite.cmd_summary import run_summary + run_summary(args) elif args.command == "info": from rocm_trace_lite.cmd_info import run_info + run_info(args) def get_version(): try: from rocm_trace_lite import __version__ + return __version__ except ImportError: return "unknown" diff --git a/rocm_trace_lite/cmd_info.py b/rocm_trace_lite/cmd_info.py index e07e987..e7d828e 100644 --- a/rocm_trace_lite/cmd_info.py +++ b/rocm_trace_lite/cmd_info.py @@ -20,9 +20,12 @@ def run_info(args): print(f" Size: {os.path.getsize(args.input) / 1024 / 1024:.1f} MB") # Tables - tables = [r[0] for r in conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name" - ).fetchall()] + tables = [ + r[0] + for r in conn.execute( + "SELECT name 
FROM sqlite_master WHERE type='table' ORDER BY name" + ).fetchall() + ] print(f" Tables: {', '.join(tables)}") # Record counts (guard on table existence) @@ -57,3 +60,22 @@ def run_info(args): "SELECT count(DISTINCT description_id) FROM rocpd_op" ).fetchone()[0] print(f" Unique kernels: {kernels}") + + # Trace completeness warning + if "rocpd_metadata" in tables: + try: + row = conn.execute( + "SELECT value FROM rocpd_metadata WHERE tag='drop_batch_skip'" + ).fetchone() + if row and int(row[0]) > 0: + mode_row = conn.execute( + "SELECT value FROM rocpd_metadata WHERE tag='rtl_mode'" + ).fetchone() + mode = mode_row[0] if mode_row else "unknown" + print( + f"\n WARNING: INCOMPLETE TRACE — {row[0]} kernel(s) from " + f"graph replay were not profiled (mode={mode})." + ) + print(" Use '--mode full' for complete HIP graph replay coverage.") + except (sqlite3.OperationalError, ValueError): + pass diff --git a/src/hsa_intercept.cpp b/src/hsa_intercept.cpp index 21baca5..0c32d08 100644 --- a/src/hsa_intercept.cpp +++ b/src/hsa_intercept.cpp @@ -651,7 +651,17 @@ static void shutdown() { fprintf(stderr, " drop (ts fail): %" PRIu64 "\n", g_drop_ts_fail.load()); fprintf(stderr, " drop (ts invalid): %" PRIu64 "\n", g_drop_ts_invalid.load()); fprintf(stderr, " recorded OK: %" PRIu64 "\n", g_recorded_ok.load()); - fprintf(stderr, "====================================\n\n"); + fprintf(stderr, "====================================\n"); + + uint64_t batch_skipped = g_drop_batch_skip.load(); + if (batch_skipped > 0) { + fprintf(stderr, + "\n*** WARNING: INCOMPLETE TRACE — %" PRIu64 " GPU kernel(s) from graph replay " + "were not profiled. ***\n" + "*** Use '--mode full' (or RTL_MODE=full) for complete HIP graph replay coverage. 
***\n", + batch_skipped); + } + fprintf(stderr, "\n"); // Signal worker to stop g_shutdown.store(true, std::memory_order_release); @@ -676,6 +686,29 @@ static void shutdown() { } } + // Write profiling metadata to DB before closing + { + static const char* const mode_names[] = {"standard", "lite", "full"}; + // Bounds-check the enum-to-index lookup so an RtlMode value outside the table records "unknown" instead of reading past the array. + const size_t mode_idx = static_cast<size_t>(g_rtl_mode); + std::string mode_str = mode_idx < sizeof(mode_names) / sizeof(mode_names[0]) ? mode_names[mode_idx] : "unknown"; + if (hip_api::g_enabled.load()) mode_str += "+hip"; + trace_db::get_trace_db().record_metadata("rtl_mode", mode_str.c_str()); + + char buf[64]; + snprintf(buf, sizeof(buf), "%" PRIu64, batch_skipped); + trace_db::get_trace_db().record_metadata("drop_batch_skip", buf); + + snprintf(buf, sizeof(buf), "%" PRIu64, g_recorded_ok.load()); + trace_db::get_trace_db().record_metadata("recorded_ok", buf); + + if (batch_skipped > 0) { + trace_db::get_trace_db().record_metadata("trace_complete", "false"); + } else { + trace_db::get_trace_db().record_metadata("trace_complete", "true"); + } + } + // Flush and close trace DB trace_db::get_trace_db().flush(); trace_db::get_trace_db().close(); @@ -737,8 +770,12 @@ extern "C" bool OnLoad(void* pTable, if (mode_env) { if (strcmp(mode_env, "lite") == 0) { g_rtl_mode = RtlMode::LITE; - } else if (strcmp(mode_env, "standard") == 0 || strcmp(mode_env, "default") == 0) { + } else if (strcmp(mode_env, "standard") == 0) { + g_rtl_mode = RtlMode::STANDARD; + } else if (strcmp(mode_env, "default") == 0) { g_rtl_mode = RtlMode::STANDARD; + fprintf(stderr, "rtl: WARNING: RTL_MODE='default' is deprecated and misleading " + "(actual default is 'lite'). 
Use 'standard' instead.\n"); } else if (strcmp(mode_env, "full") == 0) { g_rtl_mode = RtlMode::FULL; } else if (strcmp(mode_env, "hip") == 0) { diff --git a/src/trace_db.cpp b/src/trace_db.cpp index bd36b60..d613908 100644 --- a/src/trace_db.cpp +++ b/src/trace_db.cpp @@ -279,6 +279,24 @@ void TraceDB::flush() { } } +// Best-effort insert of one tag/value row into rocpd_metadata; failures are not reported to the caller. +void TraceDB::record_metadata(const char* tag, const char* value) { + std::lock_guard lock(g_db_mutex); + // No-op without an open DB; a null tag/value would otherwise be bound as SQL NULL. + if (!db_ || !tag || !value) return; + sqlite3_stmt* stmt = nullptr; + if (sqlite3_prepare_v2(db_, "INSERT INTO rocpd_metadata(tag,value) VALUES(?1,?2)", + -1, &stmt, nullptr) != SQLITE_OK) { + return; + } + // Step only when both parameters bound, so a failed bind never inserts a row with a missing column. + if (sqlite3_bind_text(stmt, 1, tag, -1, SQLITE_TRANSIENT) == SQLITE_OK && + sqlite3_bind_text(stmt, 2, value, -1, SQLITE_TRANSIENT) == SQLITE_OK) { + sqlite3_step(stmt); + } + sqlite3_finalize(stmt); +} + void TraceDB::record_hip_api(const char* name, const char* args, uint64_t start_ns, uint64_t duration_ns, uint64_t correlation_id, int pid, int tid) { diff --git a/src/trace_db.h b/src/trace_db.h index 0f27f6d..dc27212 100644 --- a/src/trace_db.h +++ b/src/trace_db.h @@ -48,6 +48,11 @@ class TraceDB { void record_roctx(const char* message, uint64_t start_ns, uint64_t duration_ns, uint64_t correlation_id); + // Record a metadata tag/value pair into rocpd_metadata. + // Best-effort: does nothing if the DB is not open or an argument is null, + // and SQLite errors are not reported to the caller. + void record_metadata(const char* tag, const char* value); + // Flush buffered records void flush();