From ec748a182ff32e8d04a68f9865aceec8aa9e45b5 Mon Sep 17 00:00:00 2001 From: yycer Date: Sun, 27 Apr 2025 19:14:54 +0800 Subject: [PATCH] feat: add GaoKaoMath dataset for training and evaluation --- rllm/data/preprocess/math/gaokao_math.ipynb | 126 ++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 rllm/data/preprocess/math/gaokao_math.ipynb diff --git a/rllm/data/preprocess/math/gaokao_math.ipynb b/rllm/data/preprocess/math/gaokao_math.ipynb new file mode 100644 index 000000000..1318be03e --- /dev/null +++ b/rllm/data/preprocess/math/gaokao_math.ipynb @@ -0,0 +1,126 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9K79xdtZ4Snl", + "outputId": "5d76b199-9b14-4429-cf70-3fbec79334bc" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================================================\n", + "📦 Dataset Overview\n", + "==================================================\n", + "DatasetDict({\n", + " test: Dataset({\n", + " features: ['number', 'problem', 'answer', 'score', 'id', 'year', 'province'],\n", + " num_rows: 28\n", + " })\n", + "})\n", + "\n", + "==================================================\n", + "📄 Features in split 'test'\n", + "==================================================\n", + "{'answer': Value(dtype='string', id=None),\n", + " 'id': Value(dtype='int64', id=None),\n", + " 'number': Value(dtype='string', id=None),\n", + " 'problem': Value(dtype='string', id=None),\n", + " 'province': Value(dtype='string', id=None),\n", + " 'score': Value(dtype='int64', id=None),\n", + " 'year': Value(dtype='string', id=None)}\n", + "\n", + "==================================================\n", + "🔍 Example(s) from split 'test'\n", + "==================================================\n", + "Example 1:\n", + "{'answer': '\\\\{1, 3, 5\\\\}',\n", + " 'id': 1,\n", + " 'number': '1',\n", + " 'problem': 'Let 
# Export the GaoKaoMath dataset ("FrankieYao/GaoKaoMath") to a local JSON file.
#
# The script first prints an overview of the dataset, then copies a fixed set
# of fields from every entry of the "test" split into "gaokao_math.json".

import json
from pprint import pprint

# Fields copied from each dataset entry, in the order they are written out.
RECORD_FIELDS = (
    "id",  # Unique identifier for the problem
    "problem",  # The full math question in LaTeX-compatible format.
    #             Note: proof-based questions have been filtered out.
    "answer",  # The final answer in LaTeX or natural format
    "score",  # Assigned score based on the exam's marking scheme
    "number",  # Question number in the exam paper. Problems that consist of
    #            multiple sub-questions are split into separate individual
    #            entries and denoted using this format (e.g., "1" or "17.2").
    "year",  # Exam year (e.g., "2024")
    "province",  # Province or region (e.g., "Shanghai")
)

_BANNER_WIDTH = 50
_SEPARATOR_WIDTH = 30


def _print_banner(title, *, leading_newline=False):
    """Print ``title`` framed by two rules of ``=`` characters."""
    rule = "=" * _BANNER_WIDTH
    print("\n" + rule if leading_newline else rule)
    print(title)
    print(rule)


def show_dataset_info(ds, split_name="test", num_examples=1):
    """Print an overview of ``ds`` and a closer look at one of its splits.

    Three sections are printed: the dataset object itself, the ``features``
    of the chosen split, and the first ``num_examples`` entries of that split.

    Args:
        ds: Mapping from split name to split (e.g. the object returned by
            ``load_dataset``). The chosen split must expose ``features`` and
            support ``len()`` and integer indexing.
        split_name: Name of the split to inspect.
        num_examples: Maximum number of leading entries to print. Values
            larger than the split are clamped to the split length instead of
            raising ``IndexError``; zero or negative values print no entries.

    Raises:
        KeyError: If ``split_name`` is not present in ``ds`` (raised by the
            lookup, after the overview section has been printed).
    """
    _print_banner("📦 Dataset Overview")
    print(ds)

    split = ds[split_name]

    _print_banner(f"📄 Features in split '{split_name}'", leading_newline=True)
    pprint(split.features)

    _print_banner(f"🔍 Example(s) from split '{split_name}'", leading_newline=True)
    # Clamp so that asking for more examples than exist is not an error.
    for i in range(min(num_examples, len(split))):
        print(f"Example {i + 1}:")
        pprint(split[i])
        print("-" * _SEPARATOR_WIDTH)


def extract_records(entries, fields=RECORD_FIELDS):
    """Return a list of plain dicts holding only ``fields`` from each entry.

    Args:
        entries: Iterable of mappings (one per problem).
        fields: Keys to copy, in output order. Defaults to ``RECORD_FIELDS``.

    Returns:
        A list with one dict per entry; keys follow the order of ``fields``.

    Raises:
        KeyError: If an entry lacks one of the requested fields.
    """
    return [{name: entry[name] for name in fields} for entry in entries]


def write_json(records, path):
    """Write ``records`` to ``path`` as UTF-8 JSON indented by four spaces."""
    with open(path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the UTF-8 file
        # instead of escaping it as \uXXXX sequences; the parsed content is
        # the same either way.
        json.dump(records, f, indent=4, ensure_ascii=False)


# Inside a notebook kernel (or when run as a script) __name__ is "__main__",
# so the download and export still run there and `ds` / `dataset` remain
# module-level names. Importing the helpers elsewhere triggers neither the
# download nor the file write.
if __name__ == "__main__":
    # Imported here so the helpers above do not require the third-party
    # `datasets` package.
    from datasets import load_dataset

    ds = load_dataset("FrankieYao/GaoKaoMath")
    show_dataset_info(ds, split_name="test", num_examples=1)

    dataset = extract_records(ds["test"])
    write_json(dataset, "gaokao_math.json")