From ec748a182ff32e8d04a68f9865aceec8aa9e45b5 Mon Sep 17 00:00:00 2001
From: yycer <yyc15900617310@gmail.com>
Date: Sun, 27 Apr 2025 19:14:54 +0800
Subject: [PATCH] feat: add GaoKaoMath dataset for training and evaluation

---
 rllm/data/preprocess/math/gaokao_math.ipynb | 126 ++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 rllm/data/preprocess/math/gaokao_math.ipynb

diff --git a/rllm/data/preprocess/math/gaokao_math.ipynb b/rllm/data/preprocess/math/gaokao_math.ipynb
new file mode 100644
index 000000000..1318be03e
--- /dev/null
+++ b/rllm/data/preprocess/math/gaokao_math.ipynb
@@ -0,0 +1,126 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 16,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "9K79xdtZ4Snl",
+        "outputId": "5d76b199-9b14-4429-cf70-3fbec79334bc"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "==================================================\n",
+            "📦 Dataset Overview\n",
+            "==================================================\n",
+            "DatasetDict({\n",
+            "    test: Dataset({\n",
+            "        features: ['number', 'problem', 'answer', 'score', 'id', 'year', 'province'],\n",
+            "        num_rows: 28\n",
+            "    })\n",
+            "})\n",
+            "\n",
+            "==================================================\n",
+            "📄 Features in split 'test'\n",
+            "==================================================\n",
+            "{'answer': Value(dtype='string', id=None),\n",
+            " 'id': Value(dtype='int64', id=None),\n",
+            " 'number': Value(dtype='string', id=None),\n",
+            " 'problem': Value(dtype='string', id=None),\n",
+            " 'province': Value(dtype='string', id=None),\n",
+            " 'score': Value(dtype='int64', id=None),\n",
+            " 'year': Value(dtype='string', id=None)}\n",
+            "\n",
+            "==================================================\n",
+            "🔍 Example(s) from split 'test'\n",
+            "==================================================\n",
+            "Example 1:\n",
+            "{'answer': '\\\\{1, 3, 5\\\\}',\n",
+            " 'id': 1,\n",
+            " 'number': '1',\n",
+            " 'problem': 'Let the universal set be $U = \\\\{1, 2, 3, 4, 5\\\\}$, and set $A = '\n",
+            "            '\\\\{2, 4\\\\}$. Find $C_U A$.',\n",
+            " 'province': 'Shanghai',\n",
+            " 'score': 4,\n",
+            " 'year': '2024'}\n",
+            "------------------------------\n"
+          ]
+        }
+      ],
+      "source": [
+        "import json\n",
+        "from pprint import pprint\n",
+        "\n",
+        "from datasets import load_dataset\n",
+        "\n",
+        "ds = load_dataset(\"FrankieYao/GaoKaoMath\")\n",
+        "\n",
+        "\n",
+        "def show_dataset_info(ds, split_name=\"test\", num_examples=1):\n",
+        "    \"\"\"Print the dataset's repr, the features of `split_name`, and the first\n",
+        "    `num_examples` rows of that split (capped at the split's length).\"\"\"\n",
+        "    rule = \"=\" * 50\n",
+        "    print(rule, \"📦 Dataset Overview\", rule, sep=\"\\n\")\n",
+        "    print(ds)\n",
+        "\n",
+        "    split = ds[split_name]\n",
+        "    print(\"\", rule, f\"📄 Features in split '{split_name}'\", rule, sep=\"\\n\")\n",
+        "    pprint(split.features)\n",
+        "\n",
+        "    print(\"\", rule, f\"🔍 Example(s) from split '{split_name}'\", rule, sep=\"\\n\")\n",
+        "    # Clamp the count so a request larger than the split cannot raise IndexError.\n",
+        "    for i in range(min(num_examples, len(split))):\n",
+        "        print(f\"Example {i + 1}:\")\n",
+        "        pprint(split[i])\n",
+        "        print(\"-\" * 30)\n",
+        "\n",
+        "\n",
+        "show_dataset_info(ds, split_name=\"test\", num_examples=1)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 12,
+      "metadata": {
+        "id": "RyoijzRC5hT-"
+      },
+      "outputs": [],
+      "source": [
+        "dataset = [\n",
+        "    {\n",
+        "        \"id\":       entry[\"id\"],       # Unique identifier for the problem\n",
+        "        \"problem\":  entry[\"problem\"],  # The full math question in LaTeX-compatible format. Note: Proof-based questions have been filtered out\n",
+        "        \"answer\":   entry[\"answer\"],   # The final answer in LaTeX or natural format\n",
+        "        \"score\":    entry[\"score\"],    # Assigned score based on the exam's marking scheme\n",
+        "        \"number\":   entry[\"number\"],   # Question number in the exam paper. For problems that consist of multiple sub-questions, we split them into separate individual entries, and denote them using this format (e.g., \"1\" or \"17.2\")\n",
+        "        \"year\":     entry[\"year\"],     # Exam year (e.g., \"2024\")\n",
+        "        \"province\": entry[\"province\"], # Province or region (e.g., \"Shanghai\")\n",
+        "    }\n",
+        "    for entry in ds[\"test\"]\n",
+        "]\n",
+        "\n",
+        "with open(\"gaokao_math.json\", \"w\", encoding=\"utf-8\") as f:\n",
+        "    json.dump(dataset, f, indent=4, ensure_ascii=False)  # write non-ASCII text as-is (UTF-8) instead of ASCII escapes"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}