Add more logging to sharktank data tests #1017

Draft · wants to merge 4 commits into base: main
2 changes: 1 addition & 1 deletion .github/workflows/ci-llama-large-tests.yaml
@@ -68,7 +68,7 @@ jobs:
- name: Run llama tests
run: |
source ${VENV_DIR}/bin/activate
pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 --iree-device=hip://0 --html=out/llm/llama/benchmark/index.html
pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v --log-cli-level=info -s --run-nightly-llama-tests --iree-hip-target=gfx942 --iree-device=hip://0 --html=out/llm/llama/benchmark/index.html

- name: Deploy to GitHub Pages
uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
2 changes: 1 addition & 1 deletion .github/workflows/ci-llama-quick-tests.yaml
@@ -67,7 +67,7 @@ jobs:
- name: Run llama 8b f16 decomposed test
run: |
source ${VENV_DIR}/bin/activate
pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --iree-device=hip://0 --run-quick-llama-test
pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v --log-cli-level=info -s --iree-hip-target=gfx942 --iree-device=hip://0 --run-quick-llama-test

- name: Upload llama executable files
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
52 changes: 52 additions & 0 deletions sharktank/tests/models/llama/benchmark_amdgpu_test.py
@@ -20,6 +20,8 @@
IreeCompileException,
)

logger = logging.getLogger(__name__)

is_mi300x = pytest.mark.skipif("config.getoption('iree_hip_target') != 'gfx942'")
skipif_run_quick_llama_test = pytest.mark.skipif(
'config.getoption("run-quick-llama-test") and not config.getoption("run-nightly-llama-tests")',
@@ -106,6 +108,8 @@ def save_benchmarks(

@is_mi300x
class BenchmarkLlama3_1_8B(BaseBenchmarkTest):
logger.info("Testing BenchmarkLlama3_1_8B...")
Review comment (Member) on lines 110 to +111:
I would remove these class-scoped log lines and instead rely on the -v option already included on the pytest commands in the workflow files.


I don't think logs at the class scope are doing what you want / are useful: https://github.com/nod-ai/shark-ai/actions/runs/13667985379/job/38212871488?pr=1017#step:7:22

snippet:

collecting ... 
----------------------------- live log collection ------------------------------
INFO:tests.models.llama.benchmark_amdgpu_test BenchmarkLlama3_1_8B...
INFO:tests.models.llama.benchmark_amdgpu_test Testing Benchmark8B_f16_Non_Decomposed_Input_Len_128...
INFO:tests.models.llama.benchmark_amdgpu_test Testing Benchmark405B_fp8_TP8_Non_Decomposed...
collected 11 items

Pytest has a few phases while running tests. This is the "collection" phase, where pytest discovers test cases from the directory tree based on any conftest.py files, settings/configuration files, and command line options. These log lines run during that collection phase, when you really want them when the tests are actually starting. Only after collection completes are individual tests actually run; then there are a few other phases.

I can't find docs on the phases, but https://docs.pytest.org/en/stable/reference/reference.html#hooks is close enough to get the point across:

  1. bootstrapping
  2. initialization
  3. collection
  4. test running
  5. reporting

However, pytest can already log when a test starts. See https://docs.pytest.org/en/stable/how-to/output.html and the -v option in particular, which changes the output from

=========================== test session starts ============================
collected 4 items

test_verbosity_example.py .FFF                                       [100%]

================================= FAILURES =================================

to

=========================== test session starts ============================
collecting ... collected 4 items

test_verbosity_example.py::test_ok PASSED                            [ 25%]
test_verbosity_example.py::test_words_fail FAILED                    [ 50%]
test_verbosity_example.py::test_numbers_fail FAILED                  [ 75%]
test_verbosity_example.py::test_long_text_fail FAILED                [100%]

================================= FAILURES =================================
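The per-test logging the reviewer describes can also be sketched as a small conftest.py hook. `pytest_runtest_logstart` is a real pytest hook that fires just before each test item runs (i.e. in the run phase, not collection); the logger name and message below are assumptions for illustration, not code from this PR:

```python
# conftest.py -- sketch: log when each test actually starts (run phase),
# instead of module/class-scoped statements that fire at collection time.
import logging

logger = logging.getLogger(__name__)


def pytest_runtest_logstart(nodeid, location):
    # pytest calls this hook right before running each test item,
    # so the message appears next to that test's live log output.
    logger.info("Starting %s", nodeid)
```

With `--log-cli-level=info` enabled, each test then announces itself at the point it starts rather than during collection.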


def setUp(self):
super().setUp()
# TODO: add numpy files to Azure and download from it
@@ -191,6 +195,7 @@ def setUp(self):
]

def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_128(self):
logger.info("Testing Benchmark8B_f16_TP1_Non_Decomposed_Input_Len_128...")
output_file_name = self.dir_path_8b / "f16_torch_128_tp1"
output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file(
suffix=".mlir", prefix=output_file_name
@@ -208,6 +213,7 @@ def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_128(self):
mlir_path=output_mlir,
json_path=output_json,
)
logger.info("Compiling MLIR file...")
self.llama8b_f16_torch_sdpa_artifacts.compile_to_vmfb(
Review comment (Member) on lines +216 to 217:
This helper already does some logging, but the extra logs don't hurt. What we could do in these places is provide context for what or why we're compiling/running. The "IREE Benchmark Prefill..." logs you added are a good example of that. Just seeing iree-run-module ... in the logs doesn't immediately provide such context, but "benchmark prefill" does.

INFO:tests.models.llama.benchmark_amdgpu_test Compiling MLIR file...

INFO:eval      Launching compile command:
cd /home/runner/_work/shark-ai/shark-ai && iree-compile /home/runner/_work/shark-ai/shark-ai/2025-03-05/llama-8b/f16_torch_128.mlir --iree-hip-target=gfx942 -o=/home/runner/_work/shark-ai/shark-ai/2025-03-05/llama-8b/f16_torch_128.vmfb --iree-hal-target-device=hip --iree-hal-dump-executable-files-to=/home/runner/_work/shark-ai/shark-ai/2025-03-05/llama-8b/f16_torch_128/files --iree-dispatch-creation-enable-aggressive-fusion=true --iree-global-opt-propagate-transposes=true --iree-opt-aggressively-propagate-transposes=true --iree-opt-data-tiling=false --iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' --iree-stream-resource-memory-model=discrete --iree-hal-indirect-command-buffers=true --iree-hal-memoization=true --iree-opt-strip-assertions

INFO:eval      compile_to_vmfb: 16.77 secs
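The reviewer's point — say *what* or *why* rather than just "Compiling MLIR file..." — could be factored into a tiny wrapper. This is a hypothetical helper, not the sharktank API; the stage strings are illustrative:

```python
# Sketch of a contextual logging wrapper (hypothetical, not sharktank code):
# name the stage before delegating, so the raw iree-compile /
# iree-run-module command logs that follow have human-readable context.
import logging

logger = logging.getLogger(__name__)


def log_stage(stage, fn, *args, **kwargs):
    # Emit one contextual line, then run the existing helper unchanged.
    logger.info("%s...", stage)
    return fn(*args, **kwargs)
```

Usage would look something like `log_stage("IREE benchmark prefill (8b f16, len 128)", artifacts.iree_benchmark_vmfb, ...)`, assuming the artifact helpers from these test classes.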

mlir_path=str(output_mlir),
vmfb_path=output_vmfb,
@@ -216,6 +222,7 @@ def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_128(self):
args=self.compile_args,
)
# benchmark prefill
logger.info("IREE Benchmark Prefill...")
self.llama8b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
@@ -225,6 +232,7 @@ def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_128(self):
cwd=self.repo_root,
)
# benchmark decode
logger.info("IREE Benchmark Decode...")
self.llama8b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
@@ -236,6 +244,7 @@ def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_128(self):

@skipif_run_quick_llama_test
def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_2048(self):
logger.info("Testing Benchmark8B_f16_TP1_Non_Decomposed_Input_Len_2048...")
output_file_name = self.dir_path_8b / "f16_torch_2048_tp1"
output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file(
suffix=".mlir", prefix=output_file_name
@@ -253,6 +262,7 @@ def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_2048(self):
mlir_path=output_mlir,
json_path=output_json,
)
logger.info("Compiling MLIR file...")
self.llama8b_f16_torch_sdpa_artifacts.compile_to_vmfb(
mlir_path=str(output_mlir),
vmfb_path=output_vmfb,
@@ -261,6 +271,7 @@ def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_2048(self):
args=self.compile_args,
)
# benchmark prefill
logger.info("IREE Benchmark Prefill...")
self.llama8b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
@@ -270,6 +281,7 @@ def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_2048(self):
cwd=self.repo_root,
)
# benchmark decode
logger.info("IREE Benchmark Decode...")
self.llama8b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
@@ -281,6 +293,7 @@ def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_2048(self):

@skipif_run_quick_llama_test
def testBenchmark8B_fp8_TP1_Non_Decomposed(self):
logger.info("Testing Benchmark8B_fp8_TP1_Non_Decomposed...")
output_file_name = self.dir_path_8b / "fp8_torch_tp1"
output_mlir = self.llama8b_fp8_torch_sdpa_artifacts.create_file(
suffix=".mlir", prefix=output_file_name
@@ -298,6 +311,7 @@ def testBenchmark8B_fp8_TP1_Non_Decomposed(self):
mlir_path=output_mlir,
json_path=output_json,
)
logger.info("Compiling MLIR file...")
self.llama8b_fp8_torch_sdpa_artifacts.compile_to_vmfb(
mlir_path=str(output_mlir),
vmfb_path=output_vmfb,
@@ -306,6 +320,7 @@ def testBenchmark8B_fp8_TP1_Non_Decomposed(self):
args=self.compile_args,
)
# benchmark prefill
logger.info("IREE Benchmark Prefill...")
self.llama8b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
@@ -315,6 +330,7 @@ def testBenchmark8B_fp8_TP1_Non_Decomposed(self):
cwd=self.repo_root,
)
# benchmark decode
logger.info("IREE Benchmark Decode...")
self.llama8b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
@@ -328,6 +344,9 @@ def testBenchmark8B_fp8_TP1_Non_Decomposed(self):
@is_mi300x
@skipif_run_quick_llama_test
class BenchmarkLlama3_1_70B(BaseBenchmarkTest):

logger.info("Testing BenchmarkLlama3_1_70B...")

def setUp(self):
super().setUp()
# TODO: add numpy files to Azure and download from it
@@ -450,6 +469,7 @@ def setUp(self):
]

def testBenchmark70B_f16_TP1_Non_Decomposed_Input_Len_128(self):
logger.info("Testing Benchmark70B_f16_TP1_Non_Decomposed_Input_Len_128...")
output_file_name = self.dir_path_70b / "f16_torch_128_tp1"
output_mlir = self.llama70b_f16_torch_sdpa_artifacts_tp1.create_file(
suffix=".mlir", prefix=output_file_name
@@ -467,6 +487,7 @@ def testBenchmark70B_f16_TP1_Non_Decomposed_Input_Len_128(self):
mlir_path=output_mlir,
json_path=output_json,
)
logger.info("Compiling MLIR file...")
self.llama70b_f16_torch_sdpa_artifacts_tp1.compile_to_vmfb(
mlir_path=str(output_mlir),
vmfb_path=output_vmfb,
@@ -475,6 +496,7 @@ def testBenchmark70B_f16_TP1_Non_Decomposed_Input_Len_128(self):
args=self.compile_args,
)
# benchmark prefill
logger.info("IREE Benchmark Prefill...")
self.llama70b_f16_torch_sdpa_artifacts_tp1.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
@@ -484,6 +506,7 @@ def testBenchmark70B_f16_TP1_Non_Decomposed_Input_Len_128(self):
cwd=self.repo_root,
)
# benchmark decode
logger.info("IREE Benchmark Decode...")
self.llama70b_f16_torch_sdpa_artifacts_tp1.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
@@ -494,6 +517,7 @@ def testBenchmark70B_f16_TP1_Non_Decomposed_Input_Len_128(self):
)

def testBenchmark70B_f16_TP1_Non_Decomposed_Input_Len_2048(self):
logger.info("Testing Benchmark70B_f16_TP1_Non_Decomposed_Input_Len_2048...")
output_file_name = self.dir_path_70b / "f16_torch_2048_tp1"
output_mlir = self.llama70b_f16_torch_sdpa_artifacts_tp1.create_file(
suffix=".mlir", prefix=output_file_name
@@ -511,6 +535,7 @@ def testBenchmark70B_f16_TP1_Non_Decomposed_Input_Len_2048(self):
mlir_path=output_mlir,
json_path=output_json,
)
logger.info("Compiling MLIR file...")
self.llama70b_f16_torch_sdpa_artifacts_tp1.compile_to_vmfb(
mlir_path=str(output_mlir),
vmfb_path=output_vmfb,
@@ -519,6 +544,7 @@ def testBenchmark70B_f16_TP1_Non_Decomposed_Input_Len_2048(self):
args=self.compile_args,
)
# benchmark prefill
logger.info("IREE Benchmark Prefill...")
self.llama70b_f16_torch_sdpa_artifacts_tp1.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
@@ -528,6 +554,7 @@ def testBenchmark70B_f16_TP1_Non_Decomposed_Input_Len_2048(self):
cwd=self.repo_root,
)
# benchmark decode
logger.info("IREE Benchmark Decode...")
self.llama70b_f16_torch_sdpa_artifacts_tp1.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
@@ -541,6 +568,7 @@ def testBenchmark70B_f16_TP1_Non_Decomposed_Input_Len_2048(self):
reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
)
def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_128(self):
logger.info("Testing Benchmark70B_f16_TP8_Non_Decomposed_Input_Len_128...")
output_file_name = self.dir_path_70b / "f16_torch_128_tp8"
output_mlir = self.llama70b_f16_torch_sdpa_artifacts_tp8.create_file(
suffix=".mlir", prefix=output_file_name
@@ -563,6 +591,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_128(self):
mlir_path=output_mlir,
json_path=output_json,
)
logger.info("Compiling MLIR file...")
self.llama70b_f16_torch_sdpa_artifacts_tp8.compile_to_vmfb(
mlir_path=str(output_mlir),
vmfb_path=output_vmfb,
@@ -571,6 +600,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_128(self):
args=self.compile_args,
)
# benchmark prefill
logger.info("IREE Benchmark Prefill...")
Review comment (Member) on lines 602 to +603:
nit: when comments and logs are saying the same thing, you can remove the comments

self.llama70b_f16_torch_sdpa_artifacts_tp8.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
@@ -579,6 +609,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_128(self):
cwd=self.repo_root,
)
# benchmark decode
logger.info("IREE Benchmark Decode...")
self.llama70b_f16_torch_sdpa_artifacts_tp8.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
@@ -591,6 +622,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_128(self):
reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
)
def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_2048(self):
logger.info("Testing Benchmark70B_f16_TP8_Non_Decomposed_Input_Len_2048...")
output_file_name = self.dir_path_70b / "f16_torch_2048_tp8"
output_mlir = self.llama70b_f16_torch_sdpa_artifacts_tp8.create_file(
suffix=".mlir", prefix=output_file_name
@@ -613,6 +645,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_2048(self):
mlir_path=output_mlir,
json_path=output_json,
)
logger.info("Compiling MLIR file...")
self.llama70b_f16_torch_sdpa_artifacts_tp8.compile_to_vmfb(
mlir_path=str(output_mlir),
vmfb_path=output_vmfb,
@@ -621,6 +654,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_2048(self):
args=self.compile_args,
)
# benchmark prefill
logger.info("IREE Benchmark Prefill...")
self.llama70b_f16_torch_sdpa_artifacts_tp8.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
@@ -629,6 +663,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_2048(self):
cwd=self.repo_root,
)
# benchmark decode
logger.info("IREE Benchmark Decode...")
self.llama70b_f16_torch_sdpa_artifacts_tp8.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
@@ -641,6 +676,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_2048(self):
reason="70b fp8 irpa does not exist", strict=True, raises=ExportMlirException
)
def testBenchmark70B_fp8_TP1_Non_Decomposed(self):
logger.info("Testing Benchmark70B_fp8_TP1_Non_Decomposed...")
output_file_name = self.dir_path_70b / "fp8_torch_tp1"
output_mlir = self.llama70b_fp8_torch_sdpa_artifacts_tp1.create_file(
suffix=".mlir", prefix=output_file_name
@@ -655,6 +691,7 @@ def testBenchmark70B_fp8_TP1_Non_Decomposed(self):
mlir_path=output_mlir,
json_path=output_json,
)
logger.info("Compiling MLIR file...")
self.llama70b_fp8_torch_sdpa_artifacts_tp1.compile_to_vmfb(
mlir_path=str(output_mlir),
vmfb_path=output_vmfb,
@@ -663,6 +700,7 @@ def testBenchmark70B_fp8_TP1_Non_Decomposed(self):
args=self.compile_args,
)
# benchmark prefill
logger.info("IREE Benchmark Prefill...")
self.llama70b_fp8_torch_sdpa_artifacts_tp1.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
@@ -671,6 +709,7 @@ def testBenchmark70B_fp8_TP1_Non_Decomposed(self):
cwd=self.repo_root,
)
# benchmark decode
logger.info("IREE Benchmark Decode...")
self.llama70b_fp8_torch_sdpa_artifacts_tp1.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
@@ -683,6 +722,9 @@ def testBenchmark70B_fp8_TP1_Non_Decomposed(self):
@is_mi300x
@skipif_run_quick_llama_test
class BenchmarkLlama3_1_405B(BaseBenchmarkTest):

logger.info("Testing BenchmarkLlama3_1_405B...")

def setUp(self):
super().setUp()
# TODO: add numpy files to Azure and download from it
@@ -773,6 +815,7 @@ def setUp(self):
reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
)
def testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128(self):
logger.info("Testing Benchmark405B_f16_TP8_Non_Decomposed_Input_Len_128...")
output_file_name = self.dir_path_405b / "f16_torch_128"
output_mlir = self.llama405b_f16_torch_sdpa_artifacts.create_file(
suffix=".mlir", prefix=output_file_name
@@ -793,6 +836,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128(self):
mlir_path=output_mlir,
json_path=output_json,
)
logger.info("Compiling MLIR file...")
self.llama405b_f16_torch_sdpa_artifacts.compile_to_vmfb(
mlir_path=str(output_mlir),
vmfb_path=output_vmfb,
@@ -801,6 +845,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128(self):
args=self.compile_args,
)
# benchmark prefill
logger.info("IREE Benchmark Prefill...")
self.llama405b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
@@ -814,6 +859,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_2048(self):
reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
)
def testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_2048(self):
logger.info("Testing Benchmark405B_f16_TP8_Non_Decomposed_Input_Len_2048...")
output_file_name = self.dir_path_405b / "f16_torch_2048"
output_mlir = self.llama405b_f16_torch_sdpa_artifacts.create_file(
suffix=".mlir", prefix=output_file_name
@@ -834,6 +880,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_2048(self):
mlir_path=output_mlir,
json_path=output_json,
)
logger.info("Compiling MLIR file...")
self.llama405b_f16_torch_sdpa_artifacts.compile_to_vmfb(
mlir_path=str(output_mlir),
vmfb_path=output_vmfb,
@@ -842,6 +889,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_2048(self):
args=self.compile_args,
)
# benchmark prefill
logger.info("IREE Benchmark Prefill...")
self.llama405b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
@@ -855,6 +903,7 @@ def testBenchmark405B_fp8_TP8_Non_Decomposed(self):
reason="KeyError in theta.py", strict=True, raises=ExportMlirException
)
def testBenchmark405B_fp8_TP8_Non_Decomposed(self):
logger.info("Testing Benchmark405B_fp8_TP8_Non_Decomposed...")
output_file_name = self.dir_path_405b / "fp8_torch"
output_mlir = self.llama405b_fp8_torch_sdpa_artifacts.create_file(
suffix=".mlir", prefix=output_file_name
@@ -875,6 +924,7 @@ def testBenchmark405B_fp8_TP8_Non_Decomposed(self):
mlir_path=output_mlir,
json_path=output_json,
)
logger.info("Compiling MLIR file...")
self.llama405b_fp8_torch_sdpa_artifacts.compile_to_vmfb(
mlir_path=str(output_mlir),
vmfb_path=output_vmfb,
@@ -883,6 +933,7 @@ def testBenchmark405B_fp8_TP8_Non_Decomposed(self):
args=self.compile_args,
)
# benchmark prefill
logger.info("IREE Benchmark Prefill...")
self.llama405b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
@@ -891,6 +942,7 @@ def testBenchmark405B_fp8_TP8_Non_Decomposed(self):
cwd=self.repo_root,
)
# benchmark decode
logger.info("IREE Benchmark Decode...")
self.llama405b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,