forked from AkariAsai/self-rag
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharc.py
50 lines (45 loc) · 1.78 KB
/
arc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import json
import argparse
import random
import jsonlines
import datasets
from task_instructions import TASK_INST
def main():
    """Convert the ARC-Easy train split into instruction-style JSON lines.

    Command-line arguments:
        --n            Optional number of examples to randomly sample from the
                       converted data. Values larger than the number of
                       converted examples keep everything.
        --output_file  Path of the JSON-lines file to write (required).
        --data_prefix  Key into ``TASK_INST`` selecting the task instruction;
                       also used as the id prefix and as ``dataset_name``
                       (required).

    Each written record has the keys ``instruction``, ``output``, ``input``
    (always an empty string), ``id`` and ``dataset_name``.

    Exits through ``parser.error`` (status 2) when a required argument is
    missing, ``--n`` is negative, or ``--data_prefix`` is not in ``TASK_INST``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--n', type=int, default=None)
    parser.add_argument('--output_file', type=str, default=None)
    parser.add_argument('--data_prefix', type=str, default=None)
    args = parser.parse_args()

    # Validate up front so a bad invocation fails before the dataset is loaded
    # instead of crashing half-way through or at write time.
    if args.output_file is None:
        parser.error("--output_file is required")
    if args.data_prefix is None:
        parser.error("--data_prefix is required")
    if args.n is not None and args.n < 0:
        parser.error("--n must be non-negative")
    try:
        # Loop-invariant: look the instruction up once.
        task_instruction = TASK_INST[args.data_prefix]
    except KeyError:
        parser.error(
            "--data_prefix {0!r} has no entry in TASK_INST".format(args.data_prefix))

    option_labels = ("A", "B", "C", "D")
    # Some items carry numeric answer keys; normalise them to letters.
    numeric_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D"}

    data = datasets.load_dataset("ai2_arc", "ARC-Easy")["train"]
    new_data = []
    for item in data:
        choice_texts = item["choices"]["text"]
        # Questions with fewer than four options cannot fill the A-D template.
        if len(choice_texts) < len(option_labels):
            continue

        answer_key = numeric_to_letter.get(item["answerKey"], item["answerKey"])
        # Only the first four options are shown in the prompt, so an answer
        # that points past them (e.g. a fifth option) would yield a target
        # that is absent from the input. Drop such items.
        if answer_key not in option_labels:
            continue

        # zip stops at four labels, so any extra options are ignored.
        answer_labels = dict(zip(option_labels, choice_texts))
        input_question = "{0}\nA: {1}\nB: {2}\nC: {3}\nD: {4}".format(
            item["question"],
            answer_labels["A"],
            answer_labels["B"],
            answer_labels["C"],
            answer_labels["D"],
        )
        new_data.append({
            "instruction": task_instruction + "## Input:\n\n" + input_question,
            "output": answer_key,
            "input": "",
            "id": "{0}_{1}".format(args.data_prefix, item["id"]),
            "dataset_name": args.data_prefix,
        })

    if args.n is not None:
        # random.sample raises ValueError when k exceeds the population size,
        # so clamp the request to what is available.
        new_data = random.sample(new_data, k=min(args.n, len(new_data)))

    with jsonlines.open(args.output_file, 'w') as writer:
        writer.write_all(new_data)
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()