Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 126 additions & 0 deletions rllm/data/preprocess/math/gaokao_math.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "9K79xdtZ4Snl",
"outputId": "5d76b199-9b14-4429-cf70-3fbec79334bc"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==================================================\n",
"📦 Dataset Overview\n",
"==================================================\n",
"DatasetDict({\n",
" test: Dataset({\n",
" features: ['number', 'problem', 'answer', 'score', 'id', 'year', 'province'],\n",
" num_rows: 28\n",
" })\n",
"})\n",
"\n",
"==================================================\n",
"📄 Features in split 'test'\n",
"==================================================\n",
"{'answer': Value(dtype='string', id=None),\n",
" 'id': Value(dtype='int64', id=None),\n",
" 'number': Value(dtype='string', id=None),\n",
" 'problem': Value(dtype='string', id=None),\n",
" 'province': Value(dtype='string', id=None),\n",
" 'score': Value(dtype='int64', id=None),\n",
" 'year': Value(dtype='string', id=None)}\n",
"\n",
"==================================================\n",
"🔍 Example(s) from split 'test'\n",
"==================================================\n",
"Example 1:\n",
"{'answer': '\\\\{1, 3, 5\\\\}',\n",
" 'id': 1,\n",
" 'number': '1',\n",
" 'problem': 'Let the universal set be $U = \\\\{1, 2, 3, 4, 5\\\\}$, and set $A = '\n",
" '\\\\{2, 4\\\\}$. Find $C_U A$.',\n",
" 'province': 'Shanghai',\n",
" 'score': 4,\n",
" 'year': '2024'}\n",
"------------------------------\n"
]
}
],
"source": [
"from datasets import load_dataset\n",
"import json\n",
"\n",
"# Load the dataset identified by \"FrankieYao/GaoKaoMath\". The recorded output of\n",
"# this cell shows a DatasetDict with a single 'test' split containing 28 rows.\n",
"ds = load_dataset(\"FrankieYao/GaoKaoMath\")\n",
"\n",
"def show_dataset_info(ds, split_name=\"test\", num_examples=1):\n",
"    \"\"\"Print an overview of `ds`, one split's features, and its first rows.\n",
"\n",
"    Args:\n",
"        ds: Mapping from split name to dataset, e.g. the DatasetDict returned\n",
"            by load_dataset. The selected split must expose `.features` and\n",
"            support len() and integer indexing.\n",
"        split_name: Key of the split to inspect.\n",
"        num_examples: Upper bound on how many leading rows are pretty-printed;\n",
"            fewer are shown when the split is shorter than this.\n",
"\n",
"    Returns:\n",
"        None. Everything is written to stdout.\n",
"    \"\"\"\n",
"    from pprint import pprint\n",
"\n",
"    divider = \"=\" * 50\n",
"\n",
"    print(divider)\n",
"    print(\"📦 Dataset Overview\")\n",
"    print(divider)\n",
"    print(ds)\n",
"\n",
"    # Looked up only after the overview so a bad split name still shows which\n",
"    # splits exist before the lookup fails.\n",
"    split = ds[split_name]\n",
"\n",
"    print(\"\\n\" + divider)\n",
"    print(f\"📄 Features in split '{split_name}'\")\n",
"    print(divider)\n",
"    pprint(split.features)\n",
"\n",
"    print(\"\\n\" + divider)\n",
"    print(f\"🔍 Example(s) from split '{split_name}'\")\n",
"    print(divider)\n",
"    # Clamp to the split size: asking for more rows than exist used to raise\n",
"    # IndexError part-way through the printout.\n",
"    for i in range(min(num_examples, len(split))):\n",
"        print(f\"Example {i+1}:\")\n",
"        pprint(split[i])\n",
"        print(\"-\" * 30)\n",
"\n",
"\n",
"# Preview the 'test' split: dataset overview, feature schema, and the first row.\n",
"show_dataset_info(ds, split_name=\"test\", num_examples=1)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"id": "RyoijzRC5hT-"
},
"outputs": [],
"source": [
"# Fields copied from every row of the 'test' split, in output order:\n",
"#   id       - Unique identifier for the problem\n",
"#   problem  - The full math question in LaTeX-compatible format.\n",
"#              Note: Proof-based questions have been filtered out\n",
"#   answer   - The final answer in LaTeX or natural format\n",
"#   score    - Assigned score based on the exam's marking scheme\n",
"#   number   - Question number in the exam paper. Problems that consist of\n",
"#              multiple sub-questions are split into separate individual\n",
"#              entries, denoted using this format (e.g., \"1\" or \"17.2\")\n",
"#   year     - Exam year (e.g., \"2024\")\n",
"#   province - Province or region (e.g., \"Shanghai\")\n",
"dataset = [\n",
"    {\n",
"        key: row[key]\n",
"        for key in (\"id\", \"problem\", \"answer\", \"score\", \"number\", \"year\", \"province\")\n",
"    }\n",
"    for row in ds[\"test\"]\n",
"]\n",
"\n",
"# Serialize the whole list as one 4-space-indented JSON array, written to a\n",
"# path relative to the current working directory.\n",
"with open(\"gaokao_math.json\", \"w\", encoding=\"utf-8\") as f:\n",
"    json.dump(dataset, f, indent=4)"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}