-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathldpy2ldc.py
148 lines (117 loc) · 4.59 KB
/
ldpy2ldc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""
Repack the Python model into a form that langid.c can read.
Marco Lui, July 2014
"""
import argparse
import langid.langid as langid
import array
import sys
from itertools import islice
# Template for the generated C source file.  The brace-delimited fields are
# substituted with str.format; every array field receives a ready-made C
# initializer string and nb_ptc_size receives the flattened nb_ptc length.
model_template = (
    '#include "model.h"\n'
    'unsigned tk_nextmove[NUM_STATES][256] = {tk_nextmove};\n'
    'unsigned tk_output_c[NUM_STATES] = {tk_output_c};\n'
    'unsigned tk_output_s[NUM_STATES] = {tk_output_s};\n'
    'unsigned tk_output[] = {tk_output};\n'
    'double nb_pc[NUM_LANGS] = {nb_pc};\n'
    'double nb_ptc[{nb_ptc_size}] = {nb_ptc};\n'
    'char *nb_classes[NUM_LANGS] = {nb_classes};\n'
)
# Template for the generated C header (model.h).  The brace-delimited fields
# (num_feats, num_langs, num_states, nb_ptc_size) are substituted with
# str.format; the extern declarations mirror the arrays of the C source
# template.
header_template = (
    '#ifndef _MODEL_H\n'
    '#define _MODEL_H\n'
    '#define NUM_FEATS {num_feats}\n'
    '#define NUM_LANGS {num_langs}\n'
    '#define NUM_STATES {num_states}\n'
    'extern unsigned tk_nextmove[NUM_STATES][256];\n'
    'extern unsigned tk_output_c[NUM_STATES];\n'
    'extern unsigned tk_output_s[NUM_STATES];\n'
    'extern unsigned tk_output[];\n'
    'extern double nb_pc[NUM_LANGS];\n'
    'extern double nb_ptc[{nb_ptc_size}];\n'
    'extern char *nb_classes[NUM_LANGS];\n'
    '#endif\n'
)
def pack_tk_output(ident):
    """
    Flatten the tokenizer's state -> features mapping into three parallel lists.

    `ident` must provide `tk_nextmove`, a flat transition table holding 256
    entries per state (so `len(tk_nextmove) >> 8` is the number of states),
    and `tk_output`, a mapping from state number to the sequence of feature
    labels completed by entering that state.

    Returns a tuple `(tk_output_c, tk_output_s, tk_output)` where, for each
    state `s` in `range(num_states)`:

      tk_output_c[s]  number of features completed by entering `s`
      tk_output_s[s]  offset into `tk_output` at which the features of `s` start
      tk_output       all feature labels, concatenated in state order
    """
    num_states = len(ident.tk_nextmove) >> 8
    state_feats = ident.tk_output

    tk_output_c = []
    tk_output_s = []
    tk_output = []
    for state in range(num_states):
        # A state that is absent from the mapping, or mapped to an empty
        # value, completes no features: count 0, offset still recorded.
        feats = state_feats.get(state) or ()
        tk_output_c.append(len(feats))
        tk_output_s.append(len(tk_output))
        tk_output.extend(feats)
    return tk_output_c, tk_output_s, tk_output
def as_c_array_init(seq):
    """
    Render an iterable as a C brace-enclosed initializer list.

    Each item is converted with `str` and the results are joined with commas
    (no spaces) inside a single pair of braces, e.g. `[1, 2, 3]` becomes
    `"{1,2,3}"`.  Items that are already initializer strings are embedded
    verbatim, which is how nested (2-D) initializers are built.  An empty
    iterable yields `"{}"`.
    """
    return "{" + ",".join(map(str, seq)) + "}"
def chunk(seq, chunksize):
    """
    Break a sequence into chunks not exceeding a predetermined size.

    Generator: yields successive tuples of up to `chunksize` items taken from
    `seq` in order.  Only the final tuple can be shorter than `chunksize`;
    nothing is yielded once `seq` is exhausted, so an empty `seq` yields no
    chunks at all.
    """
    seq_iter = iter(seq)
    while True:
        # Named `piece` rather than `chunk` so the local does not shadow this
        # function's own name.
        piece = tuple(islice(seq_iter, chunksize))
        if not piece:
            break
        yield piece
def _log(*fields):
    """Write one space-separated diagnostic line to stderr."""
    # Diagnostics must not go to stdout: stdout is the default --output
    # target, so anything printed there would end up inside the generated
    # model.
    sys.stderr.write(" ".join(map(str, fields)) + "\n")


def main():
    """
    Command-line entry point: export a langid.py model for langid.c.

    Loads the model named on the command line and writes exactly one of the
    following to --output (stdout by default):

      * default     the C source produced from `model_template`
      * --header    the C header produced from `header_template`
      * --protobuf  a serialized `langid_pb2.LanguageIdentifier` message

    --header and --protobuf are mutually exclusive.  A summary of the loaded
    model's components is written to stderr.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", "-o", default=sys.stdout, help="write exported model to", type=argparse.FileType('w'))
    parser.add_argument("--header", action="store_true", help="produce header file")
    parser.add_argument("--protobuf", action="store_true", help="produce model in protocol buffer format")
    parser.add_argument("model", help="read model from")
    args = parser.parse_args()

    if args.protobuf and args.header:
        parser.error("can only specify one of --protobuf or --header")

    ident = langid.LanguageIdentifier.from_modelpath(args.model)

    _log("NB_PTC", type(ident.nb_ptc), ident.nb_ptc.shape, ident.nb_ptc.dtype)
    _log("NB_PC", type(ident.nb_pc), ident.nb_pc.shape, ident.nb_pc.dtype)
    _log("NB_NUMFEATS", type(ident.nb_numfeats), ident.nb_numfeats)
    _log("NB_CLASSES", type(ident.nb_classes), len(ident.nb_classes))
    _log("TK_NEXTMOVE", type(ident.tk_nextmove), len(ident.tk_nextmove), ident.tk_nextmove.typecode, ident.tk_nextmove.itemsize)
    _log("TK_OUTPUT", type(ident.tk_output), len(ident.tk_output))
    _log()

    num_feats, num_langs = ident.nb_ptc.shape
    # tk_nextmove is a flat table with 256 entries per state.
    num_states = len(ident.tk_nextmove) >> 8
    nb_ptc_size = num_feats * num_langs

    if args.protobuf:
        # Imported lazily so the C/header outputs do not require the
        # generated protobuf module.
        import langid_pb2

        tk_output_c, tk_output_s, tk_output = pack_tk_output(ident)
        lid = langid_pb2.LanguageIdentifier()

        # basic parameters
        lid.num_feats = num_feats
        lid.num_langs = num_langs
        lid.num_states = num_states

        # pack the tokenizer
        lid.tk_nextmove.extend(ident.tk_nextmove)
        lid.tk_output_c.extend(tk_output_c)
        lid.tk_output_s.extend(tk_output_s)
        lid.tk_output.extend(tk_output)

        # pack the classifier parameters
        lid.nb_pc.extend(ident.nb_pc.tolist())
        lid.nb_ptc.extend(ident.nb_ptc.ravel().tolist())

        # pack the class labels
        lid.nb_classes.extend('{}'.format(c) for c in ident.nb_classes)

        # The serialized message is bytes.  A Python 3 text stream rejects
        # bytes, so write through its underlying binary buffer when it has
        # one; Python 2 file objects have no `buffer` and accept the payload
        # directly.
        sink = getattr(args.output, "buffer", args.output)
        sink.write(lid.SerializeToString())

    elif args.header:
        args.output.write(header_template.format(
            num_feats=num_feats,
            num_langs=num_langs,
            num_states=num_states,
            nb_ptc_size=nb_ptc_size,
        ))

    else:
        # chunk tk_nextmove back into length-256 array initializers, to avoid
        # C warnings about initialization mismatches
        nextmove_rows = (as_c_array_init(row) for row in chunk(ident.tk_nextmove, 256))
        tk_output_c, tk_output_s, tk_output = pack_tk_output(ident)

        args.output.write(model_template.format(
            tk_nextmove=as_c_array_init(nextmove_rows),
            tk_output_c=as_c_array_init(tk_output_c),
            tk_output_s=as_c_array_init(tk_output_s),
            tk_output=as_c_array_init(tk_output),
            nb_pc=as_c_array_init(ident.nb_pc),
            nb_ptc=as_c_array_init(ident.nb_ptc.ravel()),
            nb_ptc_size=nb_ptc_size,
            nb_classes=as_c_array_init('"{}"'.format(c) for c in ident.nb_classes),
        ))


if __name__ == "__main__":
    main()