-
Notifications
You must be signed in to change notification settings - Fork 41
/
extract_data.py
88 lines (73 loc) · 2.27 KB
/
extract_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import os
import glob
import json
import shutil
import argparse
from tqdm import tqdm
def get_contents(line):
    """Decode one JSON-encoded line and return its ``(summary, text)`` pair.

    The line must hold a JSON object with ``"summary"`` and ``"text"`` keys;
    any other keys are ignored.
    """
    record = json.loads(line)
    summary = record["summary"]
    text = record["text"]
    return summary, text
def extract_data(input_dir, output_dir):
    """Split every ``*.jsonl`` file in *input_dir* into source/target files.

    File names are parsed as ``<lang>_<split>.jsonl``: everything before the
    last underscore is the language, the remainder names the split.  For
    each input file this writes

    * ``<output_dir>/individual/<lang>/<split>.source`` with each record's
      ``text`` field, and
    * ``<output_dir>/individual/<lang>/<split>.target`` with each record's
      ``summary`` field,

    one ``print`` call per record.  When the source path ends with
    ``train.source``, both files are also copied into
    ``<output_dir>/multilingual/`` under the name ``<lang>_<file name>``.

    All files are opened as UTF-8 so the result does not depend on the
    platform's locale encoding.  Blank (whitespace-only) input lines are
    skipped instead of being handed to the JSON parser.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.jsonl`` files.
        output_dir: Directory under which ``individual/`` and
            ``multilingual/`` are created if missing.

    Raises:
        IndexError: If a matched file name contains no underscore.
    """
    multilingual_dir = os.path.join(output_dir, "multilingual")
    os.makedirs(multilingual_dir, exist_ok=True)

    input_files = glob.glob(os.path.join(input_dir, "*.jsonl"))

    for input_file in tqdm(input_files):
        base_name = os.path.basename(input_file)

        # Language = everything before the last underscore
        # (e.g. "chinese_simplified_train.jsonl" -> "chinese_simplified").
        lang = base_name.rpartition("_")[0]
        lang_dir = os.path.join(output_dir, "individual", lang)
        os.makedirs(lang_dir, exist_ok=True)

        # Part after the last underscore, e.g. "train.jsonl".
        split_name = base_name.rsplit("_", 1)[1]
        source_file = os.path.join(
            lang_dir, split_name.replace(".jsonl", ".source")
        )
        target_file = os.path.join(
            lang_dir, split_name.replace(".jsonl", ".target")
        )

        # Explicit UTF-8: the decoded JSON strings may contain any Unicode
        # text, which a locale-dependent default encoding cannot always
        # read or write.
        with open(input_file, encoding="utf-8") as inpf, \
                open(source_file, "w", encoding="utf-8") as srcf, \
                open(target_file, "w", encoding="utf-8") as tgtf:
            for line in inpf:
                if not line.strip():
                    # Tolerate empty lines (e.g. a trailing newline at EOF).
                    continue
                summary, text = get_contents(line)
                print(text, file=srcf)
                print(summary, file=tgtf)

        # Copy only after the files above are closed, so the copies are
        # complete.  Training splits are additionally pooled across languages.
        if source_file.endswith("train.source"):
            for path in (source_file, target_file):
                shutil.copy(
                    path,
                    os.path.join(
                        multilingual_dir,
                        lang + "_" + os.path.basename(path)
                    )
                )
if __name__ == "__main__":
    # Command-line entry point: both directories are mandatory PATH options.
    cli = argparse.ArgumentParser()
    for flags, description in (
        (('--input_dir', '-i'), "Input directory"),
        (('--output_dir', '-o'), "Output directory"),
    ):
        cli.add_argument(
            *flags,
            type=str,
            required=True,
            metavar='PATH',
            help=description,
        )

    options = cli.parse_args()
    extract_data(options.input_dir, options.output_dir)