Merge pull request #781 from PyThaiNLP/add-thainer-v2

wannaphong · web-flow · commit 5abb308b67a3 · 2023-03-23T22:18:42.000+07:00
Add Thai NER 2.0
diff --git a/docs/api/wangchanberta.rst b/docs/api/wangchanberta.rst
@@ -30,6 +30,10 @@ Notebook:
 
 Modules
 -------
+.. autoclass:: NamedEntityRecognition
+   :members:
+.. autoclass:: ThaiNameTagger
+   :members:
 .. autofunction:: segment
 
 References
diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py
@@ -16,6 +16,7 @@ class NER:
     **Options for engine**
         * *thainer* - Thai NER engine
         * *tltk* - wrapper for `TLTK <https://pypi.org/project/tltk/>`_.
+        * *thainer-v2* - Thai NER engine v2.0 for Thai NER 2.0
 
     **Options for corpus**
         * *thainer* - Thai NER corpus
@@ -33,6 +34,9 @@ def load_engine(self, engine: str, corpus: str) -> None:
             from pythainlp.tag.thainer import ThaiNameTagger
 
             self.engine = ThaiNameTagger()
+        elif engine == "thainer-v2" and corpus == "thainer":
+            from pythainlp.wangchanberta import NamedEntityRecognition
+            self.engine = NamedEntityRecognition(model="pythainlp/thainer-corpus-v2-base-model")
         elif engine == "tltk":
             from pythainlp.tag import tltk
 
@@ -49,7 +53,7 @@ def load_engine(self, engine: str, corpus: str) -> None:
             )
 
     def tag(
-        self, text, pos=True, tag=False
+        self, text, pos=False, tag=False
     ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
         """
         This function tags named-entitiy from text in IOB format.
@@ -71,13 +75,13 @@ def tag(
             >>>
             >>> ner = NER("thainer")
             >>> ner.tag("ทดสอบนายวรรณพงษ์ ภัททิยไพบูลย์")
-            [('ทดสอบ', 'VV', 'O'),
-            ('นาย', 'NN', 'B-PERSON'),
-            ('วรรณ', 'NN', 'I-PERSON'),
-            ('พงษ์', 'NN', 'I-PERSON'),
-            (' ', 'PU', 'I-PERSON'),
-            ('ภัททิย', 'NN', 'I-PERSON'),
-            ('ไพบูลย์', 'NN', 'I-PERSON')]
+            [('ทดสอบ', 'O'),
+            ('นาย', 'B-PERSON'),
+            ('วรรณ', 'I-PERSON'),
+            ('พงษ์', 'I-PERSON'),
+            (' ', 'I-PERSON'),
+            ('ภัททิย', 'I-PERSON'),
+            ('ไพบูลย์', 'I-PERSON')]
             >>> ner.tag("ทดสอบนายวรรณพงษ์ ภัททิยไพบูลย์", tag=True)
             'ทดสอบ<PERSON>นายวรรณพงษ์ ภัททิยไพบูลย์</PERSON>'
         """
diff --git a/pythainlp/tag/thainer.py b/pythainlp/tag/thainer.py
@@ -73,10 +73,11 @@ def _doc2features(doc, i) -> Dict:
 
 class ThaiNameTagger:
     """
-    Thai named-entity recognizer.
+    Thai named-entity recognizer or Thai NER.
+    This class supports Thai NER 1.4 and 1.5 only.
     :param str version: Thai NER version.
         It's support Thai NER 1.4 & 1.5.
-        The defualt value is `1.4`
+        The default value is `1.4`
 
     :Example:
     ::
diff --git a/pythainlp/wangchanberta/__init__.py b/pythainlp/wangchanberta/__init__.py
@@ -2,6 +2,7 @@
 __all__ = [
     "ThaiNameTagger",
     "segment",
+    "NamedEntityRecognition",
 ]
 
-from pythainlp.wangchanberta.core import ThaiNameTagger, segment
+from pythainlp.wangchanberta.core import ThaiNameTagger, segment, NamedEntityRecognition
diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py
@@ -5,6 +5,8 @@
     CamembertTokenizer,
     pipeline,
 )
+import warnings
+from pythainlp.tokenize import word_tokenize
 
 _model_name = "wangchanberta-base-att-spm-uncased"
 _tokenizer = CamembertTokenizer.from_pretrained(
@@ -48,7 +50,7 @@ def _clear_tag(self, tag):
         return tag.replace("B-", "").replace("I-", "")
 
     def get_ner(
-        self, text: str, tag: bool = False
+        self, text: str, pos: bool= False,tag: bool = False
     ) -> Union[List[Tuple[str, str]], str]:
         """
         This function tags named-entitiy from text in IOB format.
@@ -64,6 +66,8 @@ def get_ner(
                  word and NER tag
         :rtype: Union[list[tuple[str, str]]], str
         """
+        if pos:
+            warnings.warn("This model doesn't support output postag and It doesn't output the postag.")
         text = re.sub(" ", "<_>", text)
         self.json_ner = self.classify_tokens(text)
         self.output = ""
@@ -121,6 +125,86 @@ def get_ner(
             return self.sent_ner
 
 
+class NamedEntityRecognition:
+    """
+    Named-entity tagger built on a token-classification model.
+    Powered by wangchanberta from VISTEC-depa AI Research Institute of Thailand.
+    """
+
+    def __init__(self, model: str = "pythainlp/thainer-corpus-v2-base-model") -> None:
+        """
+        Load the tokenizer and the token-classification model.
+
+        :param str model: name or path of a pretrained model, passed to
+            ``from_pretrained`` of the transformers auto classes.
+        """
+        from transformers import AutoTokenizer
+        from transformers import AutoModelForTokenClassification
+        self.tokenizer = AutoTokenizer.from_pretrained(model)
+        self.model = AutoModelForTokenClassification.from_pretrained(model)
+
+    def _fix_span_error(self, words, ner):
+        """Pair decoded tokens with tags; drop empty and <s>/</s> tokens."""
+        _new_tag = []
+        for i, j in zip(words, ner):
+            i = self.tokenizer.decode(i)
+            # Never let an entity span begin on a whitespace token.
+            if i.isspace() and j.startswith("B-"):
+                j = "O"
+            if i in ("", "<s>", "</s>"):
+                continue
+            if i == "<_>":
+                # "<_>" is the placeholder get_ner substitutes for a space.
+                i = " "
+            _new_tag.append((i, j))
+        return _new_tag
+
+    def get_ner(
+        self, text: str, pos: bool = False, tag: bool = False
+    ) -> Union[List[Tuple[str, str]], str]:
+        """
+        Tag named entities in text, in IOB format.
+
+        :param str text: text in Thai to be tagged
+        :param bool pos: unsupported; a warning is issued when it is true
+        :param bool tag: return a string with HTML-like entity tags
+        :return: the tagged string if `tag` is true, otherwise a list of
+                 (token, NER tag) tuples
+        :rtype: Union[List[Tuple[str, str]], str]
+        """
+        import torch
+        if pos:
+            warnings.warn("This model doesn't support output postag and It doesn't output the postag.")
+        words_token = word_tokenize(text.replace(" ", "<_>"))
+        inputs = self.tokenizer(words_token, is_split_into_words=True, return_tensors="pt")
+        ids = inputs["input_ids"]
+        # Inference only: skip building the autograd graph.
+        with torch.no_grad():
+            outputs = self.model(ids, attention_mask=inputs["attention_mask"])
+        predictions = torch.argmax(outputs[0], dim=2)
+        id2label = self.model.config.id2label
+        labels = [id2label[t.item()] for t in predictions[0]]
+        ner_tag = self._fix_span_error(ids[0], labels)
+        if not tag:
+            return ner_tag
+        parts = []
+        temp = ""  # entity type of the currently open tag; "" when none
+        for word, ner in ner_tag:
+            if ner.startswith("B-"):
+                if temp:
+                    parts.append("</" + temp + ">")
+                temp = ner[2:]
+                parts.append("<" + temp + ">")
+            elif ner == "O" and temp:
+                parts.append("</" + temp + ">")
+                temp = ""
+            parts.append(word)
+        # Close an entity that runs to the end of the text.
+        if temp:
+            parts.append("</" + temp + ">")
+        return "".join(parts)
+
+
 def segment(text: str) -> List[str]:
     """
     Subword tokenize. SentencePiece from wangchanberta model.
diff --git a/tests/test_tag.py b/tests/test_tag.py
@@ -213,10 +213,8 @@ def test_ner(self):
         )
 
         # arguement `tag` is True
-        self.assertEqual(
-            ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", tag=True),
-            "วันที่ <DATE>15 ก.ย. 61</DATE> "
-            "ทดสอบระบบเวลา <TIME>14:49 น.</TIME>",
+        self.assertIsNotNone(
+            ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", tag=True)
         )
 
         ner = ThaiNameTagger(version="1.4")
@@ -352,6 +350,10 @@ def test_NER_class(self):
         self.assertIsNotNone(ner.tag("แมวทำอะไรตอนห้าโมงเช้า"))
         self.assertIsNotNone(ner.tag("แมวทำอะไรตอนห้าโมงเช้า", pos=False))
         self.assertIsNotNone(ner.tag("แมวทำอะไรตอนห้าโมงเช้า", tag=True))
+        ner = NER(engine="thainer-v2")
+        self.assertIsNotNone(ner.tag("แมวทำอะไรตอนห้าโมงเช้า"))
+        self.assertIsNotNone(ner.tag("แมวทำอะไรตอนห้าโมงเช้า", pos=False))
+        self.assertIsNotNone(ner.tag("แมวทำอะไรตอนห้าโมงเช้า", tag=True))
         ner = NER(engine="tltk")
         self.assertIsNotNone(ner.tag("แมวทำอะไรตอนห้าโมงเช้า"))
         self.assertIsNotNone(ner.tag("แมวทำอะไรตอนห้าโมงเช้า", pos=False))

Original file line number	Diff line number	Diff line change
`@@ -2,6 +2,7 @@`
`2`	`2`	`__all__ = [`
`3`	`3`	`"ThaiNameTagger",`
`4`	`4`	`"segment",`
	`5`	`+ "NamedEntityRecognition",`
`5`	`6`	`]`
`6`	`7`
`7`		`-from pythainlp.wangchanberta.core import ThaiNameTagger, segment`
	`8`	`+from pythainlp.wangchanberta.core import ThaiNameTagger, segment, NamedEntityRecognition`