Skip to content

Commit bac4516

Browse files
authored
Merge pull request #27 from PyThaiNLP/dev
update from origin
2 parents 46a569b + bfae642 commit bac4516

File tree

10 files changed

+22
-11
lines changed

10 files changed

+22
-11
lines changed

README-pypi.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
![PyThaiNLP Logo](https://avatars0.githubusercontent.com/u/32934255?s=200&v=4)
22

3-
# PyThaiNLP 2.0.1
3+
# PyThaiNLP 2.0.2
44

55
[![Codacy Badge](https://api.codacy.com/project/badge/Grade/cb946260c87a4cc5905ca608704406f7)](https://www.codacy.com/app/pythainlp/pythainlp_2?utm_source=github.com&utm_medium=referral&utm_content=PyThaiNLP/pythainlp&utm_campaign=Badge_Grade)[![pypi](https://img.shields.io/pypi/v/pythainlp.svg)](https://pypi.python.org/pypi/pythainlp)
66
[![Build Status](https://travis-ci.org/PyThaiNLP/pythainlp.svg?branch=develop)](https://travis-ci.org/PyThaiNLP/pythainlp)

bin/pythainlp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,4 +45,4 @@ elif args.soundex!=None:
4545
args.engine="lk82"
4646
print(soundex(args.soundex, engine=args.engine))
4747
else:
48-
print("PyThaiNLP 2.0")
48+
print("PyThaiNLP 2.0.2")

conda.recipe/meta.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
{% set version = "2.0.1" %}
1+
{% set version = "2.0.2" %}
22

33
package:
44
name: pythainlp

meta.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
{% set version = "2.0.1" %}
1+
{% set version = "2.0.2" %}
22

33
package:
44
name: pythainlp

pythainlp/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
# -*- coding: utf-8 -*-
22

3-
__version__ = "2.0.1"
3+
__version__ = "2.0.2"
44

55
thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars
66
thai_vowels = "ฤฦะ\u0e31าำ\u0e34\u0e35\u0e36\u0e37\u0e38\u0e39เแโใไ\u0e45\u0e47" # 19

pythainlp/tokenize/__init__.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -122,17 +122,25 @@ def sent_tokenize(text: str, engine: str = "whitespace+newline") -> List[str]:
122122
def subword_tokenize(text: str, engine: str = "tcc") -> List[str]:
123123
"""
124124
:param str text: text to be tokenized
125-
:param str engine: choosing 'tcc' uses the Thai Character Cluster rule to segment words into the smallest unique units.
125+
:param str engine: subword tokenizer
126+
:Parameters for engine:
127+
* tcc (default) - Thai Character Cluster (Theeramunkong et al. 2000)
128+
* etcc - Enhanced Thai Character Cluster (Inrut et al. 2001) [In development]
126129
:return: a list of tokenized strings.
127130
"""
128131
if not text:
129132
return ""
130133

131134
from .tcc import tcc
135+
from .etcc import etcc
132136

137+
if engine == "tcc":
138+
return tcc(text)
139+
elif engine == "etcc":
140+
return etcc(text).split("/")
141+
#default
133142
return tcc(text)
134143

135-
136144
def syllable_tokenize(text: str) -> List[str]:
137145
"""
138146
:param str text: input string to be tokenized

pythainlp/tokenize/etcc.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,8 @@
33
โปรแกรม ETCC ใน Python
44
พัฒนาโดย นาย วรรณพงษ์ ภัททิยไพบูลย์
55
19 มิ.ย. 2560
6+
Reference: Inrut, Jeeragone, Patiroop Yuanghirun, Sarayut Paludkong, Supot Nitsuwat, and Para Limmaneepraserth. "Thai word segmentation using combination of forward and backward longest matching techniques." In International Symposium on Communications and Information Technology (ISCIT), pp. 37-40. 2001.
7+
68
79
วิธีใช้งาน
810
etcc(คำ)

pythainlp/tokenize/tcc.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,9 @@
11
# -*- coding: utf-8 -*-
22
"""
33
Separate Thai text into Thai Character Cluster (TCC).
4-
Based on "Character cluster based Thai information retrieval" (Theeramunkong et al. 2002)
5-
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.59.2548
4+
Based on "Character cluster based Thai information retrieval" (Theeramunkong et al. 2000)
5+
https://dl.acm.org/citation.cfm?id=355225
6+
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.59.2548
67
78
Credits:
89
- TCC: Jakkrit TeCho

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 2.0.1
2+
current_version = 2.0.2
33
commit = True
44
tag = True
55

setup.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@
3434

3535
setup(
3636
name="pythainlp",
37-
version="2.0.1",
37+
version="2.0.2",
3838
description="Thai Natural Language Processing library",
3939
long_description=readme,
4040
long_description_content_type="text/markdown",

0 commit comments

Comments
 (0)