
Commit 89c21d3

Merge pull request #309 from PyThaiNLP/update-tokenizer
Update tokenizers and test cases
2 parents 69ab4d8 + d19fb29

File tree: 4 files changed, +66 -39 lines


pythainlp/tokenize/longest.py

Lines changed: 8 additions & 2 deletions

@@ -142,8 +142,14 @@ def tokenize(self, text: str) -> List[str]:
         return tokens


-def segment(text: str, custom_dict: Trie = None) -> List[str]:
-    """Tokenize Thai text with the longest matching method."""
+def segment(text: str, custom_dict: Trie = DEFAULT_DICT_TRIE) -> List[str]:
+    """
+    Dictionary-based longest matching word segmentation.
+
+    :param str text: text to be tokenized to words
+    :param pythainlp.trie.Trie custom_dict: dictionary for tokenization
+    :return: list of words, tokenized from the text
+    """
     if not text or not isinstance(text, str):
         return []
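With this change, segment() defaults to DEFAULT_DICT_TRIE instead of None, so callers get the bundled dictionary without passing one explicitly, while empty or non-string input still returns an empty list. A minimal usage sketch, assuming the module layout in this diff (the sample text is illustrative):

from pythainlp.tokenize import longest

# No custom_dict given: the new DEFAULT_DICT_TRIE default is used.
print(longest.segment("ตัดคำภาษาไทย"))

# The existing guard keeps empty input safe.
print(longest.segment(""))  # []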

pythainlp/tokenize/multi_cut.py

Lines changed: 10 additions & 4 deletions

@@ -47,7 +47,7 @@ def __init__(self, value, multi=None, in_dict=True):
 _PAT_ENG = re.compile(_RE_ENG)


-def _multicut(text: str, custom_dict: Trie = None):
+def _multicut(text: str, custom_dict: Trie = DEFAULT_DICT_TRIE):
     """
     Return LatticeStrings, chunk by chunk.
     """

@@ -122,17 +122,23 @@ def _combine(ww: str):
         yield m.replace("/", "|") + "|" + tail


-def segment(text: str, custom_dict: Trie = None) -> List[str]:
+def segment(text: str, custom_dict: Trie = DEFAULT_DICT_TRIE) -> List[str]:
     """
-    Find the list of all possible word segmentations.
+    Dictionary-based maximum matching word segmentation.
+
+    :param str text: text to be tokenized to words
+    :param pythainlp.trie.Trie custom_dict: dictionary for tokenization
+    :return: list of words, tokenized from the text
     """
     if not text or not isinstance(text, str):
         return []

     return list(_multicut(text, custom_dict=custom_dict))


-def find_all_segment(text: str, custom_dict: Trie = None) -> List[str]:
+def find_all_segment(
+    text: str, custom_dict: Trie = DEFAULT_DICT_TRIE
+) -> List[str]:
     """
     Get all possible segment variations
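Both public entry points here pick up the same DEFAULT_DICT_TRIE default. A minimal sketch of the two APIs, assuming the module layout in this diff (the sample text is illustrative):

from pythainlp.tokenize import multi_cut

text = "ตัดคำภาษาไทย"

# One maximum-matching segmentation, using the default dictionary trie.
print(multi_cut.segment(text))

# Every segmentation the dictionary allows, one "word|word|..." string each.
for variant in multi_cut.find_all_segment(text):
    print(variant)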

pythainlp/tokenize/ssg.py

Lines changed: 3 additions & 0 deletions

@@ -5,4 +5,7 @@


 def segment(text: str) -> List[str]:
+    if not text or not isinstance(text, str):
+        return []
+
     return ssg.syllable_tokenize(text)
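The new guard mirrors the other tokenizers: empty or non-string input now short-circuits to [] before reaching the external ssg library. A minimal sketch, assuming the ssg package is installed:

from pythainlp.tokenize import ssg as ssg_tokenizer

# Syllable segmentation, delegated to ssg.syllable_tokenize().
print(ssg_tokenizer.segment("สวัสดีครับ"))

# Empty input no longer reaches the ssg library.
print(ssg_tokenizer.segment(""))  # []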
