trishullab · amit9oct · Nov 29, 2025 · Nov 29, 2025
diff --git a/docs/leaderboard.html b/docs/leaderboard.html
@@ -105,79 +105,97 @@ <h5>Legend</h5>
             <div class="legend-item">💚 Fully open-sourced method</div>
             <div class="legend-item">💙 Partially open-sourced method</div>
             <div class="legend-item"><strong>FS:</strong> Few-shot prompting with 1-2 examples</div>
-            <div class="legend-item"><strong>COPRA:</strong> Symbolic proof search agent</div>
+            <div class="legend-item"><strong>COPRA:</strong> Symbolic proof search agent <a href="https://arxiv.org/abs/2310.04353">[arXiv]</a></div>
             <div class="legend-item"><strong>Compiled:</strong> Code is syntactically valid and type-checks</div>
             <div class="legend-item"><strong>Proved:</strong> Proofs accepted by Lean's kernel</div>
             <div class="legend-item"><strong>Pass@k-sec:</strong> Success rate within 600-second time budget (k=600)</div>
+            <div class="legend-item"><strong># Proofs Generated:</strong> # Proofs generated while Spec and Impl Certification</div>
         </div>
 
         <!-- Overall End-to-End Performance -->
         <div class="task-header">
             🏆 End-to-End Performance Rankings
         </div>
         <div class="metric-description">
-            Complete pipeline success: specification certification + implementation certification (Pass@600-sec)
+            Complete pipeline success: specification certification + implementation certification (Pass@600-sec) <br>
+            Sorted based on first "End-to-End Code Generation" column, then "# Proofs Generated"
         </div>
         <div class="table-responsive">
             <table class="table table-striped table-bordered">
                 <thead>
                     <tr>
                         <th style="width: 5%">#</th>
-                        <th style="width: 30%">Model</th>
+                        <th style="width: 20%">Model</th>
                         <th style="width: 20%">Approach</th>
-                        <th style="width: 15%">End-to-End</th>
-                        <th style="width: 15%">Problems Solved</th>
+                        <th style="width: 25%">End-to-End Code Generation</th>
                         <th style="width: 15%">Note</th>
+                        <th style="width: 15%"># Proofs Generated</th>
                     </tr>
                 </thead>
                 <tbody>
                     <tr>
                         <td>1</td>
-                        <td class="model-name">💙 GPT-4o mini</td>
-                        <td>Few-Shot</td>
-                        <td><strong>0.621%</strong></td>
-                        <td>1/161</td>
+                        <td class="model-name">💙 Claude-3.7</td>
+                        <td>COPRA-enhanced</td>
+                        <td><strong>1/161</strong></td>
                         <td>Problem 53</td>
+                        <td><strong>2/161 (spec) + 14/161 (impl) = 16/282</strong></td>
                     </tr>
                     <tr>
                         <td>1</td>
-                        <td class="model-name">💙 Claude-3.7</td>
+                        <td class="model-name">💚 DeepSeek-R1</td>
                         <td>Few-Shot</td>
-                        <td><strong>0.621%</strong></td>
-                        <td>1/161</td>
+                        <td><strong>1/161</strong></td>
                         <td>Problem 53</td>
+                        <td>1/161 (spec) + 9/161 (impl) = 10/282</td>
                     </tr>
                     <tr>
                         <td>1</td>
-                        <td class="model-name">💙 DeepSeek-R1</td>
-                        <td>Few-Shot</td>
-                        <td><strong>0.621%</strong></td>
-                        <td>1/161</td>
+                        <td class="model-name">💚 GPT OSS 20b</td>
+                        <td>COPRA-enhanced</td>
+                        <td><strong>1/161</strong></td>
                         <td>Problem 53</td>
+                        <td>2/161 (spec) + 8/161 (impl) = 10/282</td>
                     </tr>
                     <tr>
                         <td>1</td>
                         <td class="model-name">💙 GPT-4o</td>
                         <td>COPRA-enhanced</td>
-                        <td><strong>0.621%</strong></td>
-                        <td>1/161</td>
+                        <td><strong>1/161</strong></td>
+                        <td>Problem 53</td>
+                        <td>3/161 (spec) + 6/161 (impl) = 9/282</td>
+                    </tr>
+                    <tr>
+                        <td>1</td>
+                        <td class="model-name">💙 GPT-4o mini</td>
+                        <td>Few-Shot</td>
+                        <td><strong>1/161</strong></td>
                         <td>Problem 53</td>
+                        <td>2/161 (spec) + 3 / 161 (impl) = 5/282</td>
                     </tr>
                     <tr>
                         <td>1</td>
                         <td class="model-name">💙 Claude-3.7</td>
-                        <td>COPRA-enhanced</td>
-                        <td><strong>0.621%</strong></td>
-                        <td>1/161</td>
+                        <td>Few-Shot</td>
+                        <td><strong>1/161</strong></td>
                         <td>Problem 53</td>
+                        <td>1/161 (spec) + 3/161 (impl) = 4/282</td>
                     </tr>
                     <tr>
                         <td>6</td>
                         <td class="model-name">💙 GPT-4o</td>
                         <td>Few-Shot</td>
-                        <td>0%</td>
                         <td>0/161</td>
                         <td>-</td>
+                        <td>1/161 (spec) + 1/161 (impl) = 2/282</td>
+                    </tr>
+                    <tr>
+                        <td>6</td>
+                        <td class="model-name">💙 GPT-5 mini (For Code Generation) + Kimina Prover (For proofs)</td>
+                        <td>Few-Shot</td>
+                        <td>0/161</td>
+                        <td>-</td>
+                        <td>0/161 (spec) + 1/161 (impl) = 1/282</td>
                     </tr>
                 </tbody>
             </table>
@@ -188,7 +206,9 @@ <h5>Legend</h5>
             📊 Detailed Performance Breakdown
         </div>
         <div class="metric-description">
-            Pass@600-sec rates for specification certification and implementation certification
+            Pass@600-sec rates for specification certification and implementation certification <br>
+            (Compiled: syntactically valid and type-checks; Proved: proofs accepted by Lean's kernel) <br>
+            Sorted based on average compilation rates across both stages
         </div>
         <div class="table-responsive">
             <table class="table table-striped table-bordered">
@@ -211,15 +231,6 @@ <h5>Legend</h5>
                     <tr style="border-top: 3px solid #007bff;">
                         <td colspan="7" class="text-center fw-bold" style="background-color: #f8f9fa;">Few-Shot Baseline</td>
                     </tr>
-                    <tr>
-                        <td class="model-name">💙 GPT-4o</td>
-                        <td>Few-Shot</td>
-                        <td>84.472%</td>
-                        <td>0.621%</td>
-                        <td>68.323%</td>
-                        <td>0.621%</td>
-                        <td><strong>0%</strong></td>
-                    </tr>
                     <tr>
                         <td class="model-name">💙 GPT-4o mini</td>
                         <td>Few-Shot</td>
@@ -239,7 +250,16 @@ <h5>Legend</h5>
                         <td><strong>0.621%</strong></td>
                     </tr>
                     <tr>
-                        <td class="model-name">💙 DeepSeek-R1</td>
+                        <td class="model-name">💙 GPT-4o</td>
+                        <td>Few-Shot</td>
+                        <td>84.472%</td>
+                        <td>0.621%</td>
+                        <td>68.323%</td>
+                        <td>0.621%</td>
+                        <td><strong>0%</strong></td>
+                    </tr>
+                    <tr>
+                        <td class="model-name">💚 DeepSeek-R1</td>
                         <td>Few-Shot</td>
                         <td>71.42%</td>
                         <td>0.621%</td>
@@ -250,15 +270,6 @@ <h5>Legend</h5>
                     <tr style="border-top: 3px solid #007bff;">
                         <td colspan="7" class="text-center fw-bold" style="background-color: #f8f9fa;">COPRA Baseline</td>
                     </tr>
-                    <tr>
-                        <td class="model-name">💙 GPT-4o</td>
-                        <td>COPRA-enhanced</td>
-                        <td>76.398%</td>
-                        <td>1.863%</td>
-                        <td>68.323%</td>
-                        <td>3.727%</td>
-                        <td><strong>0.621%</strong></td>
-                    </tr>
                     <tr>
                         <td class="model-name">💙 Claude-3.7</td>
                         <td>COPRA-enhanced</td>
@@ -268,6 +279,24 @@ <h5>Legend</h5>
                         <td>8.696%</td>
                         <td><strong>0.621%</strong></td>
                     </tr>
+                    <tr>
+                        <td class="model-name">💚 GPT OSS 20b</td>
+                        <td>COPRA-enhanced</td>
+                        <td>78.261%</td>
+                        <td>1.242%</td>
+                        <td>65.839%</td>
+                        <td>4.969%</td>
+                        <td><strong>0.621%</strong></td>
+                    </tr>
+                    <tr>
+                        <td class="model-name">💙 GPT-4o</td>
+                        <td>COPRA-enhanced</td>
+                        <td>76.398%</td>
+                        <td>1.863%</td>
+                        <td>68.323%</td>
+                        <td>3.727%</td>
+                        <td><strong>0.621%</strong></td>
+                    </tr>
                 </tbody>
             </table>
         </div>