-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathJMdictParser.py
147 lines (117 loc) · 5.03 KB
/
JMdictParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
""" This is a script for parsing the JMdict_e data that can be found at http://www.edrdg.org/jmdict/edict_doc.html.
This script parses through entries looking for words made entirely of kanji and builds a Python dictionary containing
readings, definitions, and JLPT levels for those words. The Python dictionary is then exported to a json file to be
used for other projects. JLPT data is obtained from the KANJIDIC project at
http://www.edrdg.org/wiki/index.php/KANJIDIC_Project."""
import json
import re
def read_jmdict():
""" Creates a line generator from the JMdict_e file"""
try:
f = open('JMdict_e', 'r')
for line in f:
yield line
except FileNotFoundError:
print("JMdict_e not found in directory")
raise FileNotFoundError
def yield_entries():
""" Groups together lines that belong to the same entry and creates a generator for entries"""
entry = ""
new_entry = False
for line in read_jmdict():
if line == "<entry>\n":
new_entry = True
if new_entry:
entry += line
if line == "</entry>\n":
new_entry = False
yield entry
entry = ""
def kanji_check(character):
""" Determines if the character parameter is a kanji character based on UTF-8 code"""
if (ord(character) >= ord('\u4e00')) and (ord(character) <= ord('\u9faf')):
return True
else:
return False
def trim_entries_for_kanjiwords():
""" Parses for entries that are composed of only kanji and are of length > 1"""
for entry in yield_entries():
entry_name = re.search('<keb>(.*?)</keb>', entry)
if not entry_name:
pass
else:
entry_name = entry_name.group(1)
kanji_word = True
for char in entry_name:
if not kanji_check(char):
kanji_word = False
if kanji_word and len(entry_name) > 1:
yield entry
def trim_data():
""" Trims extraneous data and yields dictionaries for each entry"""
for entry in trim_entries_for_kanjiwords():
trimmed_entry = {}
entry_name = re.search('<keb>(.*?)</keb>', entry).group(1)
trimmed_entry[entry_name] = {}
entry_reading = re.search('<reb>(.*?)</reb>', entry).group(1)
trimmed_entry[entry_name]['reading'] = entry_reading
entry_senses = re.findall(re.compile('<sense>(.*?)</sense>', re.DOTALL), entry)
entry_glosses = re.findall('<gloss>(.*?)</gloss>', entry_senses[0])
trimmed_entry[entry_name]['meaning'] = entry_glosses
yield trimmed_entry
def fill_in_jlpt(compound_dict):
""" Fills in JLPT levels using KANJIDIC data"""
try:
f = open("kanji_dict.json", 'rb')
kanji_dict = json.load(f)
f.close()
except FileNotFoundError:
print("kanji_dict.json not found, cannot get jlpt data")
raise FileNotFoundError
for i in compound_dict:
jlpt_list = []
for j in i: # gets jlpt for each kanji in the compound and puts them in a list
try:
jlpt_list.append(kanji_dict[j]["jlpt"][0])
except IndexError:
compound_dict[i]["jlpt"] = jlpt_list
jlpt_final = []
for jlpt in jlpt_list:
if jlpt == ' ':
jlpt_final.append('N/A')
else:
jlpt_final.append(jlpt)
compound_dict[i]["jlpt"] = jlpt_final
return compound_dict
def build_compounddict():
""" Creates a python dictionary from the generator produced from trim_data() and fills in JLPT data"""
compound_dict = {}
for entry in trim_data():
compound_dict.update(entry)
# Entries that need to be manually filled in due to errors in the JMdict data
compound_dict['新平民']['meaning'] = ['name given to the lowest rank of the '
'Japanese caste system after its abolition'
]
compound_dict['春塵']['meaning'] = ["spring dust",
"frost and snow that's blown like dust in the air by the spring wind"]
compound_dict['雲白肉']['meaning'] = ['dish of spicy boiled pork', 'usu. served with slices of cucumber']
compound_dict['神幸']['meaning'] = ['transferring a shintai in a portable shrine']
return fill_in_jlpt(compound_dict)
def export_to_json(compound_dict):
""" Exports dictionary to json file"""
f = open("compound_dict.json", 'w')
json.dump(compound_dict, f)
f.close()
# Script for creating a new compound dictionary
a = input("Create new compound_dict.json? [y/n]: ")
while (a != 'y') and (a != 'n'):
print("Invalid Input...")
a = input("Create new compound_dict.json? [y/n]: ")
if a == 'y':
print("Creating compound_dict.json...")
comp = build_compounddict()
export_to_json(comp)
print("Compound dictionary created and exported to compound_dict.json")
print("Compound dictionary contains " + str(len(comp)) + " entries.")
else:
print("Exiting JMdictParser.py...")