diff --git a/.github/workflows/cuda-perf.yml b/.github/workflows/cuda-perf.yml index 71e3adf5abc..c6846a61a44 100644 --- a/.github/workflows/cuda-perf.yml +++ b/.github/workflows/cuda-perf.yml @@ -61,7 +61,7 @@ jobs: shell: bash env: # All available models and quantizations - ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it' + ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it,nvidia/parakeet-tdt' ALL_QUANTIZATIONS: 'non-quantized,quantized-int4-tile-packed,quantized-int4-weight-only' NUM_RUNS: ${{ inputs.num_runs || '50' }} RUN_ALL_MODELS: ${{ inputs.run_all_models || 'false' }} @@ -234,6 +234,12 @@ jobs: if [ -f "${RUNNER_ARTIFACT_DIR}/output.wav" ]; then cp "${RUNNER_ARTIFACT_DIR}/output.wav" model_artifacts/ fi + if [ -f "${RUNNER_ARTIFACT_DIR}/tokenizer.model" ]; then + cp "${RUNNER_ARTIFACT_DIR}/tokenizer.model" model_artifacts/ + fi + if [ -f "${RUNNER_ARTIFACT_DIR}/test_audio.wav" ]; then + cp "${RUNNER_ARTIFACT_DIR}/test_audio.wav" model_artifacts/ + fi # Copy tokenizer files for file in tokenizer.json tokenizer_config.json special_tokens_map.json; do if [ -f "${RUNNER_ARTIFACT_DIR}/$file" ]; then @@ -286,6 +292,13 @@ jobs: RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --image_path $IMAGE --temperature 0" MODEL_NAME="gemma3_${{ matrix.quant }}" ;; + nvidia/parakeet-tdt) + RUNNER="cmake-out/examples/models/parakeet/parakeet_runner" + AUDIO="model_artifacts/test_audio.wav" + TOKENIZER="model_artifacts/tokenizer.model" + RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --audio_path $AUDIO --tokenizer_path $TOKENIZER" + MODEL_NAME="parakeet_${{ matrix.quant }}" + ;; *) echo "Error: Unsupported model '${{ matrix.model }}'" exit 1 diff --git 
a/examples/models/parakeet/main.cpp b/examples/models/parakeet/main.cpp index cb0df6cf72f..22dd68dcd58 100644 --- a/examples/models/parakeet/main.cpp +++ b/examples/models/parakeet/main.cpp @@ -26,6 +26,7 @@ #include "types.h" #include +#include <executorch/extension/llm/runner/stats.h> #include #include #include @@ -334,6 +335,10 @@ std::vector<int64_t> greedy_decode_executorch( int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); + // Initialize stats for benchmarking + ::executorch::extension::llm::Stats stats; + stats.model_load_start_ms = ::executorch::extension::llm::time_in_ms(); + TimestampOutputMode timestamp_mode; try { timestamp_mode = parse_timestamp_output_mode(FLAGS_timestamps); @@ -362,6 +367,8 @@ int main(int argc, char** argv) { ET_LOG(Error, "Failed to load model."); return 1; } + stats.model_load_end_ms = ::executorch::extension::llm::time_in_ms(); + stats.inference_start_ms = ::executorch::extension::llm::time_in_ms(); // Load audio ET_LOG(Info, "Loading audio from: %s", FLAGS_audio_path.c_str()); @@ -412,6 +419,10 @@ int main(int argc, char** argv) { ET_LOG(Error, "Encoder forward failed."); return 1; } + stats.prompt_eval_end_ms = ::executorch::extension::llm::time_in_ms(); + stats.first_token_ms = + stats.prompt_eval_end_ms; // For ASR, first token is at end of encoding + auto& enc_outputs = enc_result.get(); auto f_proj = enc_outputs[0].toTensor(); // [B, T, joint_hidden] int64_t encoded_len = enc_outputs[1].toTensor().const_data_ptr<int64_t>()[0]; @@ -488,6 +499,15 @@ int main(int argc, char** argv) { decoded_tokens, *tokenizer); std::cout << "Transcribed text: " << text << std::endl; + // Record inference end time and token counts + stats.inference_end_ms = ::executorch::extension::llm::time_in_ms(); + stats.num_prompt_tokens = + encoded_len; // Use encoder output length as "prompt" tokens + stats.num_generated_tokens = static_cast<int64_t>(decoded_tokens.size()); + + // Print PyTorchObserver stats for benchmarking + ::executorch::extension::llm::print_report(stats); + #ifdef 
ET_BUILD_METAL executorch::backends::metal::print_metal_backend_stats(); #endif // ET_BUILD_METAL