5
5
CamembertTokenizer ,
6
6
pipeline ,
7
7
)
8
+ import warnings
9
+ from pythainlp .tokenize import word_tokenize
8
10
9
11
_model_name = "wangchanberta-base-att-spm-uncased"
10
12
_tokenizer = CamembertTokenizer .from_pretrained (
@@ -48,7 +50,7 @@ def _clear_tag(self, tag):
48
50
return tag .replace ("B-" , "" ).replace ("I-" , "" )
49
51
50
52
def get_ner (
51
- self , text : str , tag : bool = False
53
+ self , text : str , pos : bool = False , tag : bool = False
52
54
) -> Union [List [Tuple [str , str ]], str ]:
53
55
"""
54
56
This function tags named-entitiy from text in IOB format.
@@ -64,6 +66,8 @@ def get_ner(
64
66
word and NER tag
65
67
:rtype: Union[list[tuple[str, str]]], str
66
68
"""
69
+ if pos :
70
+ warnings .warn ("This model doesn't support output postag and It doesn't output the postag." )
67
71
text = re .sub (" " , "<_>" , text )
68
72
self .json_ner = self .classify_tokens (text )
69
73
self .output = ""
@@ -121,6 +125,86 @@ def get_ner(
121
125
return self .sent_ner
122
126
123
127
128
class NamedEntityRecognition:
    def __init__(self, model: str = "pythainlp/thainer-corpus-v2-base-model") -> None:
        """
        Named-entity tagger (IOB format) backed by a wangchanberta
        token-classification model.

        Powered by wangchanberta from VISTEC-depa
        AI Research Institute of Thailand

        :param str model: The model that use wangchanberta pretrained.
        """
        # Imported lazily so the module can load without transformers installed.
        from transformers import AutoTokenizer
        from transformers import AutoModelForTokenClassification

        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForTokenClassification.from_pretrained(model)

    def _fix_span_error(self, words, ner):
        """
        Re-align decoded subword tokens with their predicted IOB tags.

        Drops special/empty tokens (``''``, ``<s>``, ``</s>``), maps the
        ``<_>`` space placeholder back to a real space, and demotes a
        ``B-`` tag on a whitespace-only token to ``O`` (whitespace cannot
        start an entity).

        :param words: sequence of token ids, decoded one-by-one via the tokenizer
        :param ner: predicted IOB tag per token (same length as ``words``)
        :return: list of ``(token_text, tag)`` tuples
        """
        _new_tag = []
        for word_id, tag_label in zip(words, ner):
            token = self.tokenizer.decode(word_id)
            # A pure-whitespace token cannot begin an entity span.
            if token.isspace() and tag_label.startswith("B-"):
                tag_label = "O"
            # Skip empty strings and the model's special tokens entirely.
            if token in ("", "<s>", "</s>"):
                continue
            # Restore the space that was masked out before tokenization.
            if token == "<_>":
                token = " "
            _new_tag.append((token, tag_label))
        return _new_tag

    def get_ner(
        self, text: str, pos: bool = False, tag: bool = False
    ) -> Union[List[Tuple[str, str]], str]:
        """
        This function tags named-entity from text in IOB format.

        Powered by wangchanberta from VISTEC-depa
        AI Research Institute of Thailand

        :param str text: text in Thai to be tagged
        :param bool pos: unsupported by this model; a warning is emitted
            when ``True`` and no POS tags are produced.
        :param bool tag: output like html tag.
        :return: a list of tuple associated with tokenized word group, NER tag, \
                 and output like html tag (if the parameter `tag` is \
                 specified as `True`). \
                 Otherwise, return a list of tuple associated with tokenized \
                 word and NER tag
        :rtype: Union[list[tuple[str, str]]], str
        """
        import torch

        if pos:
            warnings.warn("This model doesn't support output postag and It doesn't output the postag.")
        # Spaces are masked as "<_>" so they survive word tokenization.
        words_token = word_tokenize(text.replace(" ", "<_>"))
        inputs = self.tokenizer(
            words_token, is_split_into_words=True, return_tensors="pt"
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        # Pure inference: disable gradient tracking to save time and memory.
        with torch.no_grad():
            outputs = self.model(ids, attention_mask=mask)
        logits = outputs[0]
        predictions = torch.argmax(logits, dim=2)
        predicted_token_class = [
            self.model.config.id2label[t.item()] for t in predictions[0]
        ]
        ner_tag = self._fix_span_error(inputs["input_ids"][0], predicted_token_class)
        if tag:
            # Render entities as <TYPE>...</TYPE> wrappers around the text.
            open_entity = ""
            sent = ""
            for idx, (word, ner) in enumerate(ner_tag):
                if ner.startswith("B-") and open_entity != "":
                    # Close the previous entity before opening the new one.
                    sent += "</" + open_entity + ">"
                    open_entity = ner[2:]
                    sent += "<" + open_entity + ">"
                elif ner.startswith("B-"):
                    open_entity = ner[2:]
                    sent += "<" + open_entity + ">"
                elif ner == "O" and open_entity != "":
                    sent += "</" + open_entity + ">"
                    open_entity = ""
                sent += word

                # Close a still-open entity at the end of the sentence.
                if idx == len(ner_tag) - 1 and open_entity != "":
                    sent += "</" + open_entity + ">"

            return sent
        return ner_tag
124
208
def segment (text : str ) -> List [str ]:
125
209
"""
126
210
Subword tokenize. SentencePiece from wangchanberta model.
0 commit comments