
Commit 89c21d3

Merge pull request #309 from PyThaiNLP/update-tokenizer
Update tokenizers and test cases
2 parents 69ab4d8 + d19fb29

File tree: 4 files changed, +66 -39 lines


pythainlp/tokenize/longest.py

Lines changed: 8 additions & 2 deletions

@@ -142,8 +142,14 @@ def tokenize(self, text: str) -> List[str]:
         return tokens


-def segment(text: str, custom_dict: Trie = None) -> List[str]:
-    """Tokenize Thai text with the longest matching method."""
+def segment(text: str, custom_dict: Trie = DEFAULT_DICT_TRIE) -> List[str]:
+    """
+    Dictionary-based longest matching word segmentation.
+
+    :param str text: text to be tokenized to words
+    :param pythainlp.trie.Trie custom_dict: dictionary for tokenization
+    :return: list of words, tokenized from the text
+    """
     if not text or not isinstance(text, str):
         return []
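With this change, segment() defaults to DEFAULT_DICT_TRIE instead of None, so callers get the bundled dictionary without passing one explicitly, while empty or non-string input still returns an empty list. A minimal usage sketch, assuming the module layout in this diff (the sample text is illustrative):

from pythainlp.tokenize import longest

# No custom_dict given: the new DEFAULT_DICT_TRIE default is used.
print(longest.segment("ตัดคำภาษาไทย"))

# The existing guard keeps empty input safe.
print(longest.segment(""))  # []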

pythainlp/tokenize/multi_cut.py

Lines changed: 10 additions & 4 deletions

@@ -47,7 +47,7 @@ def __init__(self, value, multi=None, in_dict=True):
 _PAT_ENG = re.compile(_RE_ENG)


-def _multicut(text: str, custom_dict: Trie = None):
+def _multicut(text: str, custom_dict: Trie = DEFAULT_DICT_TRIE):
     """
     Return LatticeStrings, chunk by chunk.
     """

@@ -122,17 +122,23 @@ def _combine(ww: str):
         yield m.replace("/", "|") + "|" + tail


-def segment(text: str, custom_dict: Trie = None) -> List[str]:
+def segment(text: str, custom_dict: Trie = DEFAULT_DICT_TRIE) -> List[str]:
     """
-    Find the list of all possible word segmentations.
+    Dictionary-based maximum matching word segmentation.
+
+    :param str text: text to be tokenized to words
+    :param pythainlp.trie.Trie custom_dict: dictionary for tokenization
+    :return: list of words, tokenized from the text
     """
     if not text or not isinstance(text, str):
         return []

     return list(_multicut(text, custom_dict=custom_dict))


-def find_all_segment(text: str, custom_dict: Trie = None) -> List[str]:
+def find_all_segment(
+    text: str, custom_dict: Trie = DEFAULT_DICT_TRIE
+) -> List[str]:
     """
     Get all possible segment variations
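Both public entry points here pick up the same DEFAULT_DICT_TRIE default. A minimal sketch of the two APIs, assuming the module layout in this diff (the sample text is illustrative):

from pythainlp.tokenize import multi_cut

text = "ตัดคำภาษาไทย"

# One maximum-matching segmentation, using the default dictionary trie.
print(multi_cut.segment(text))

# Every segmentation the dictionary allows, one "word|word|..." string each.
for variant in multi_cut.find_all_segment(text):
    print(variant)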

pythainlp/tokenize/ssg.py

Lines changed: 3 additions & 0 deletions

@@ -5,4 +5,7 @@


 def segment(text: str) -> List[str]:
+    if not text or not isinstance(text, str):
+        return []
+
     return ssg.syllable_tokenize(text)
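The new guard mirrors the other tokenizers: empty or non-string input now short-circuits to [] before reaching the external ssg library. A minimal sketch, assuming the ssg package is installed:

from pythainlp.tokenize import ssg as ssg_tokenizer

# Syllable segmentation, delegated to ssg.syllable_tokenize().
print(ssg_tokenizer.segment("สวัสดีครับ"))

# Empty input no longer reaches the ssg library.
print(ssg_tokenizer.segment(""))  # []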
