diff --git a/.github/workflows/run_benchmarks.yml b/.github/workflows/run_benchmarks.yml
index 6cba63662a5..cf944044c15 100644
--- a/.github/workflows/run_benchmarks.yml
+++ b/.github/workflows/run_benchmarks.yml
@@ -7,7 +7,7 @@ on:
       - 'releases/**'
 
 jobs:
-  benchmark:
+  build-benchmark-master:  # builds vw-benchmarks.out and uploads it as the master-benchmark-bin artifact
     container:
       image: vowpalwabbit/ubuntu1804-build:latest
     runs-on: ubuntu-latest
@@ -19,46 +19,89 @@ jobs:
       - name: Install google benchmarks
         shell: bash
         run: ./.scripts/linux/install-benchmarks.sh
-      - name: Build ${{ github.base_ref }}
-        shell: bash
-        run: ./.scripts/linux/build-with-benchmarks.sh
-      - name: Benchmark ${{ github.base_ref }}
-        shell: bash
-        run: ./.scripts/linux/run-benchmarks.sh master-benchmarks.json
-      - name: Upload ${{ github.base_ref }} benchmark results
-        uses: actions/upload-artifact@v2
-        with:
-          name: master-benchmarks
-          path: master-benchmarks.json
       - name: Upload benchmark compare
         uses: actions/upload-artifact@v2
         with:
           name: benchmark-compare
           path: benchmark/tools/
-      - name: Install benchmark compare requirements
+      - name: Build ${{ github.base_ref }}
         shell: bash
-        run: |
-          python3 -m pip install -r benchmark/tools/requirements.txt
-          # The above requirements file is missing the pandas dependency.
-          python3 -m pip install pandas
-      - run: rm -rf benchmark build vowpalwabbit/parser/flatbuffer/generated/ # generated or downloaded files
+        run: ./.scripts/linux/build-with-benchmarks.sh
+      - name: Upload benchmark binary
+        uses: actions/upload-artifact@v2
+        with:
+          name: master-benchmark-bin
+          path: build/test/benchmarks/vw-benchmarks.out
+  build-benchmark-branch:
+    container:
+      image: vowpalwabbit/ubuntu1804-build:latest
+    runs-on: ubuntu-latest
+    steps:
       - uses: actions/checkout@v1
         with:
           submodules: 'recursive'
-      - name: Download ${{ github.base_ref }} benchmark results
-        uses: actions/download-artifact@v2
+      - name: Install google benchmarks
+        shell: bash
+        run: ./.scripts/linux/install-benchmarks.sh
+      - name: Build branch
+        shell: bash
+        run: ./.scripts/linux/build-with-benchmarks.sh
+      - name: Upload benchmark binary
+        uses: actions/upload-artifact@v2
         with:
-          name: master-benchmarks
+          name: branch-benchmark-bin
+          path: build/test/benchmarks/vw-benchmarks.out
+  # Executes the two downloaded binaries in one job and compares their JSON output with compare.py.
+  run-benchmarks:
+    container:
+      image: vowpalwabbit/ubuntu1804-build:latest
+    runs-on: ubuntu-latest
+    needs: [build-benchmark-master, build-benchmark-branch]
+    steps:
       - name: Download benchmark compare
         uses: actions/download-artifact@v2
         with:
           name: benchmark-compare
-      - name: Build branch
-        shell: bash
-        run: ./.scripts/linux/build-with-benchmarks.sh
-      - name: Benchmark branch
+          path: tools/
+      - name: Download master bin
+        uses: actions/download-artifact@v2
+        with:
+          name: master-benchmark-bin
+          path: master-benchmark-bin/
+      - name: Download branch bin
+        uses: actions/download-artifact@v2
+        with:
+          name: branch-benchmark-bin
+          path: branch-benchmark-bin/
+      - name: Update permissions
+        shell: bash
+        # Artifact upload does not preserve file permissions, so restore the executable bit.
+        run: |
+          chmod +x ./master-benchmark-bin/vw-benchmarks.out
+          chmod +x ./branch-benchmark-bin/vw-benchmarks.out
+      - name: Run master benchmark
+        shell: bash
+        # Benchmarks registered with ->MinTime(...) in the sources ignore --benchmark_min_time.
+        run: >
+          ./master-benchmark-bin/vw-benchmarks.out
+          --benchmark_min_time=3
+          --benchmark_format=console
+          --benchmark_out_format=json
+          --benchmark_out=master-benchmarks.json
+      - name: Run branch benchmark
+        shell: bash
+        run: >
+          ./branch-benchmark-bin/vw-benchmarks.out
+          --benchmark_min_time=3
+          --benchmark_format=console
+          --benchmark_out_format=json
+          --benchmark_out=branch-benchmarks.json
+      - name: Install benchmark compare requirements
         shell: bash
-        run: ./.scripts/linux/run-benchmarks.sh branch-benchmarks.json
+        run: |
+          python3 -m pip install -r tools/requirements.txt
+          # The above requirements file is missing the pandas dependency.
+          python3 -m pip install pandas
       - name: Compare benchmarks
         shell: bash
-        run: ./.scripts/linux/compare-benchmarks.sh master-benchmarks.json branch-benchmarks.json
+        run: python3 tools/compare.py benchmarks master-benchmarks.json branch-benchmarks.json
diff --git a/.scripts/linux/install-benchmarks.sh b/.scripts/linux/install-benchmarks.sh
index 526f78c063f..6ac2bfaf10f 100755
--- a/.scripts/linux/install-benchmarks.sh
+++ b/.scripts/linux/install-benchmarks.sh
@@ -12,6 +12,7 @@ cd benchmark
 git checkout v1.6.1
 
 # Generate build system files with cmake.
-cmake -S . -B build -G Ninja -DBENCHMARK_ENABLE_GTEST_TESTS=OFF -DCMAKE_BUILD_TYPE=Release
+# Google Benchmark's own tests are not needed here, so do not build them.
+cmake -S . -B build -G Ninja -DBENCHMARK_ENABLE_GTEST_TESTS=OFF -DBENCHMARK_ENABLE_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
 # Install globally
 sudo cmake --build "build" --config Release --target install
diff --git a/test/benchmarks/standalone/benchmark_text_input.cc b/test/benchmarks/standalone/benchmark_text_input.cc
index 34c332a502c..711725d975a 100644
--- a/test/benchmarks/standalone/benchmark_text_input.cc
+++ b/test/benchmarks/standalone/benchmark_text_input.cc
@@ -247,7 +247,7 @@ BENCHMARK_CAPTURE(benchmark_ccb_adf_learn, many_features_no_predic,
     "a b c d e f g h i j k l m n o p q r s t u v w x y z", " --no_predict");
 
 BENCHMARK_CAPTURE(benchmark_cb_adf_learn, few_features, 2);
-BENCHMARK_CAPTURE(benchmark_cb_adf_learn, many_features, 120);
+BENCHMARK_CAPTURE(benchmark_cb_adf_learn, many_features, 120)->MinTime(15.0);  // overrides --benchmark_min_time
 
 #ifdef PRIVACY_ACTIVATION
 BENCHMARK_CAPTURE(benchmark_cb_adf_learn_privacy_preserving, few_features, 2);
@@ -255,22 +255,32 @@ BENCHMARK_CAPTURE(benchmark_cb_adf_learn_privacy_preserving, many_features, 120)
 #endif
 
 BENCHMARK_CAPTURE(benchmark_multi, cb_adf_no_namespaces, gen_cb_examples(100, 7, 3, 6, 1, 4, 14, 2, false),
-    "--cb_explore_adf --quiet");
+    "--cb_explore_adf --quiet")
+    ->MinTime(15.0);  // overrides --benchmark_min_time
 BENCHMARK_CAPTURE(benchmark_multi, cb_adf_diff_char_no_interactions, gen_cb_examples(100, 7, 3, 6, 3, 4, 14, 2, false),
-    "--cb_explore_adf --quiet");
+    "--cb_explore_adf --quiet")
+    ->MinTime(15.0);
 BENCHMARK_CAPTURE(benchmark_multi, cb_adf_diff_char_interactions, gen_cb_examples(100, 7, 3, 6, 3, 4, 14, 2, false),
-    "--cb_explore_adf --quiet -q ::");
+    "--cb_explore_adf --quiet -q ::")
+    ->MinTime(15.0);
 BENCHMARK_CAPTURE(benchmark_multi, cb_adf_same_char_no_interactions, gen_cb_examples(100, 7, 3, 6, 3, 4, 14, 2, true),
-    "--cb_explore_adf --quiet");
+    "--cb_explore_adf --quiet")
+    ->MinTime(15.0);
 BENCHMARK_CAPTURE(benchmark_multi, cb_adf_same_char_interactions, gen_cb_examples(100, 7, 3, 6, 3, 4, 14, 2, true),
-    "--cb_explore_adf --quiet -q ::");
+    "--cb_explore_adf --quiet -q ::")
+    ->MinTime(15.0);
 BENCHMARK_CAPTURE(benchmark_multi, ccb_adf_no_namespaces, gen_ccb_examples(50, 7, 3, 6, 1, 4, 14, 2, false, 3),
-    "--ccb_explore_adf --quiet");
+    "--ccb_explore_adf --quiet")
+    ->MinTime(15.0);
 BENCHMARK_CAPTURE(benchmark_multi, ccb_adf_diff_char_no_interactions,
-    gen_ccb_examples(50, 7, 3, 6, 3, 4, 14, 2, false, 3), "--ccb_explore_adf --quiet");
+    gen_ccb_examples(50, 7, 3, 6, 3, 4, 14, 2, false, 3), "--ccb_explore_adf --quiet")
+    ->MinTime(15.0);
 BENCHMARK_CAPTURE(benchmark_multi, ccb_adf_diff_char_interactions, gen_ccb_examples(50, 7, 3, 6, 3, 4, 14, 2, false, 3),
-    "--ccb_explore_adf --quiet -q ::");
+    "--ccb_explore_adf --quiet -q ::")
+    ->MinTime(15.0);
 BENCHMARK_CAPTURE(benchmark_multi, ccb_adf_same_char_no_interactions,
-    gen_ccb_examples(50, 7, 3, 6, 3, 4, 14, 2, true, 3), "--ccb_explore_adf --quiet");
+    gen_ccb_examples(50, 7, 3, 6, 3, 4, 14, 2, true, 3), "--ccb_explore_adf --quiet")
+    ->MinTime(15.0);
 BENCHMARK_CAPTURE(benchmark_multi, ccb_adf_same_char_interactions, gen_ccb_examples(50, 7, 3, 6, 3, 4, 14, 2, true, 3),
-    "--ccb_explore_adf --quiet -q ::");
+    "--ccb_explore_adf --quiet -q ::")
+    ->MinTime(15.0);
diff --git a/test/benchmarks/standalone/rcv1_benchmarks.cc b/test/benchmarks/standalone/rcv1_benchmarks.cc
index 8f7130e7404..5eb99e86932 100644
--- a/test/benchmarks/standalone/rcv1_benchmarks.cc
+++ b/test/benchmarks/standalone/rcv1_benchmarks.cc
@@ -5332,5 +5332,5 @@ static void benchmark_rcv1_dataset(benchmark::State& state, std::string command_
   VW::finish(*vw);
 }
 
-BENCHMARK_CAPTURE(benchmark_rcv1_dataset, simple, "--quiet");
-BENCHMARK_CAPTURE(benchmark_rcv1_dataset, quadratic, "--quiet -q ::");
+BENCHMARK_CAPTURE(benchmark_rcv1_dataset, simple, "--quiet")->MinTime(15.0);  // overrides --benchmark_min_time
+BENCHMARK_CAPTURE(benchmark_rcv1_dataset, quadratic, "--quiet -q ::")->MinTime(15.0);