forked from zensoup/ulauncher-unicode
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgenerate_character_list.py
109 lines (90 loc) · 3.16 KB
/
generate_character_list.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""
Download the latest Unicode tables from https://www.unicode.org and create a .txt file
containing the name, comment, character code and block name of every listed character.
"""
import os
import logging
from urllib import request
# Directory part of this script's own path.
# NOTE(review): not referenced again in the visible part of this file —
# confirm whether it is still needed (e.g. as the output directory).
curr_path = os.path.dirname(__file__)
# Configure the root logger at import time so that DEBUG and higher messages
# (including the progress messages emitted via logging.info below) are shown.
logging.basicConfig(level=logging.DEBUG)
def get_blocks():
    """Download the info file for Unicode blocks (``Blocks.txt``).

    Returns:
        str: The decoded text of the latest ``Blocks.txt`` served by
        www.unicode.org.

    Raises:
        urllib.error.URLError: If the file cannot be downloaded.
    """
    logging.info("Downloading block data...")
    # The context manager closes the HTTP response even if reading fails.
    with request.urlopen(
        "https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt"
    ) as response:
        # bytes.decode() defaults to UTF-8; spelled out here for clarity.
        content = response.read().decode("utf-8")
    logging.info("Done")
    return content
def get_data():
    """Download the info file for Unicode characters (``UnicodeData.txt``).

    Returns:
        str: The decoded text of the latest ``UnicodeData.txt`` served by
        www.unicode.org.

    Raises:
        urllib.error.URLError: If the file cannot be downloaded.
    """
    logging.info("Downloading character data...")
    # The context manager closes the HTTP response even if reading fails.
    with request.urlopen(
        "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
    ) as response:
        # bytes.decode() defaults to UTF-8; spelled out here for clarity.
        content = response.read().decode("utf-8")
    logging.info("Done")
    return content
def clean(text):
    """Remove all blank or commented lines from a string.

    Every line is stripped of surrounding whitespace *before* it is tested,
    so comment lines that are indented are dropped as well.

    Args:
        text (str): Multi-line text in which ``#`` starts a comment line.

    Returns:
        str: The remaining lines, each stripped, joined by newline
        characters. An empty string if nothing remains.
    """
    stripped_lines = (line.strip() for line in text.split("\n"))
    return "\n".join(
        line for line in stripped_lines if line and not line.startswith("#")
    )
def load_blocks():
    """Load and parse the block data and return a block lookup function.

    The block data is fetched with ``get_blocks()`` and filtered with
    ``clean()``; every remaining line is expected to look like
    ``<start>..<stop>; <block name>`` with hexadecimal bounds.

    Returns:
        callable: ``locate_block(code, left=0, right=None)`` which maps an
        integer character code to the name of the block containing it, or
        ``None`` when no block contains the code.

    Raises:
        ValueError: If a non-empty line does not match the expected format.
    """
    # Function-scope import: only this function needs bisect.
    from bisect import bisect_right

    entries = []
    block_data = clean(get_blocks())
    for line in block_data.split("\n"):
        if not line:
            # clean() yields an empty string when there is no data at all.
            continue
        code_range, name = line.split(";")
        start, stop = code_range.split("..")
        entries.append((int(start, 16), int(stop, 16), name.strip()))

    # The binary search below needs the intervals ordered by their start.
    entries.sort()
    starts = [entry[0] for entry in entries]
    stops = [entry[1] for entry in entries]
    blocks = [entry[2] for entry in entries]

    def locate_block(code, left=0, right=None):
        """Binary search on an ordered list of intervals.

        Args:
            code (int): Character code to look up.
            left (int): First interval index to consider (inclusive).
            right (int | None): Last interval index to consider (exclusive);
                defaults to the number of intervals.

        Returns:
            str | None: Name of the block containing ``code``, or ``None``
            if no interval in the window contains it.
        """
        if right is None:
            right = len(starts)
        # Index of the last interval in the window starting at or before code.
        position = bisect_right(starts, code, left, right) - 1
        if position < left:
            # code lies before every interval in the window.
            return None
        if code > stops[position]:
            # code falls into a gap after the closest preceding interval.
            return None
        return blocks[position]

    return locate_block
def main():
    """Read the character and block data and unite them in a text file.

    The result is written to ``unicode_list.txt`` (relative to the current
    working directory). Each line contains the following fields:
    `<character name> <character comment> <code> <block name>`
    separated by tab characters. The block name is left empty when the
    lookup function returns ``None`` for a character code.
    """
    get_block = load_blocks()
    characters = clean(get_data())
    logging.info("Parsing character data...")

    output = []
    for line in characters.split("\n"):
        # Parse the needed data from the character's line
        attributes = line.strip().split(";")
        if len(attributes) < 11:
            # Too few fields to hold name (1) and comment (10); skip the line
            # instead of failing with an IndexError.
            logging.warning("Skipping malformed line: %s", line)
            continue
        code = attributes[0]
        name = attributes[1]
        comment = attributes[10]

        # Convert the hexadecimal character code to an integer
        try:
            num = int(code, 16)
        except ValueError:
            logging.warning("Could not convert %s", code)
            continue

        # Find the character's block
        blk = get_block(num)
        if blk is None:
            logging.warning(
                "Code %s not found in any block, char: %s", num, chr(num)
            )
            blk = ""
        output.append("\t".join((name, comment, code, blk)))

    # Explicit encoding keeps the output independent of the platform locale.
    with open("unicode_list.txt", "w", encoding="utf-8") as target:
        target.write("\n".join(output))


if __name__ == "__main__":
    main()