add docs

wannaphong · wannaphong · commit b55caabb94d0 · 2017-05-30T19:18:16.000+07:00
diff --git a/docs/pythainlp-1-3-thai.md b/docs/pythainlp-1-3-thai.md
@@ -0,0 +1,167 @@
+# คู่มือการใช้งาน PyThaiNLP 1.3
+
+## API
+
+### ตัดคำไทย
+
+สำหรับการตัดคำไทยนั้น ใน PyThaiNLP 1.3 ได้ทำการเปลี่ยน API ใหม่ โดยยกเลิก pythainlp.segment และให้เปลี่ยนไปใช้ API ชุดใหม่ดังนี้
+
+```python
+from pythainlp.tokenize import word_tokenize
+word_tokenize(text,engine)
+```
+text คือ ข้อความในรูปแบบสตริง str เท่านั้น
+
+engine คือ ระบบตัดคำไทย ปัจจุบันนี้ PyThaiNLP ได้พัฒนามี 3 engine ให้ใช้งานกันดังนี้
+
+1. icu -  engine ตัวดั้งเดิมของ PyThaiNLP (ความแม่นยำต่ำ) และเป็นค่าเริ่มต้น
+2. dict - เป็นการตัดคำโดยใช้พจนานุกรมจาก thaiword.txt ใน corpus  (ความแม่นยำปานกลาง)
+3. mm - ใช้ Maximum Matching algorithm ในการตัดคำภาษาไทย
+
+คืนค่าเป็น ''list'' ของคำที่ตัดแล้ว เช่น ['ผม', 'รัก', 'คุณ']
+
+**ตัวอย่าง**
+
+```python
+from pythainlp.tokenize import word_tokenize
+text='ผมรักคุณนะครับโอเคบ่พวกเราเป็นคนไทยรักภาษาไทยภาษาบ้านเกิด'
+a=word_tokenize(text,engine='icu') # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอ', 'เค', 'บ่', 'พวก', 'เรา', 'เป็น', 'คน', 'ไทย', 'รัก', 'ภาษา', 'ไทย', 'ภาษา', 'บ้าน', 'เกิด']
+b=word_tokenize(text,engine='dict') # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คนไทย', 'รัก', 'ภาษาไทย', 'ภาษา', 'บ้านเกิด']
+c=word_tokenize(text,engine='mm') # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คนไทย', 'รัก', 'ภาษาไทย', 'ภาษา', 'บ้านเกิด']
+```
+
+### Postaggers ภาษาไทย
+
+ตั้งแต่ PyThaiNLP 1.3 เป็นต้นไป ได้ทำการยกเลิก pythainlp.postaggers เดิม เปลี่ยนไปใช้ API ชุดใหม่ดังนี้
+
+```python
+from pythainlp.tag import pos_tag
+pos_tag(list,engine='old')
+```
+
+list คือ list ที่เก็บข้อความหลังผ่านการตัดคำแล้ว
+
+engine คือ ชุดเครื่องมือในการ postaggers มี 2 ตัวดังนี้
+
+1. old เป็น UnigramTagger (ค่าเริ่มต้น)
+2. artagger เป็น RDR POS Tagger ละเอียดยิ่งกว่าเดิม รองรับเฉพาะ Python 3 เท่านั้น
+
+### แปลงข้อความเป็น Latin
+
+```python
+from pythainlp.romanization import romanization
+romanization(str)
+```
+**ตัวอย่าง**
+
+```python
+from pythainlp.romanization import romanization
+romanization("แมว") # 'mæw'
+```
+
+### เช็คคำผิด * 
+
+*ความสามารถนี้รองรับเฉพาะ Python 3
+
+ก่อนใช้งานความสามารถนี้ ให้ทำการติดตั้ง hunspell และ hunspell-th ก่อน
+
+**วิธีติดตั้ง** สำหรับบน Debian , Ubuntu
+
+```
+sudo apt-get install hunspell hunspell-th
+```
+
+ให้ใช้ pythainlp.spell ตามตัวอย่างนี้
+
+```python
+from pythainlp.spell import *
+a=spell("สี่เหลียม")
+print(a) # ['สี่เหลี่ยม', 'เสียเหลี่ยม', 'เหลี่ยม']
+```
+### pythainlp.number
+
+```python
+from pythainlp.number import *
+```
+จัดการกับตัวเลข โดยมีดังนี้
+
+- nttn(str)  - เป็นการแปลงเลขไทยสู่เลข
+- nttt(str) - เลขไทยสู่ข้อความ
+- ntnt(str) - เลขสู่เลขไทย
+- ntt(str) - เลขสู่ข้อความ
+- ttn(str) - ข้อความสู่เลข
+- numtowords(float) -  อ่านจำนวนตัวเลขภาษาไทย (บาท) รับค่าเป็น ''float'' คืนค่าเป็น ''str''
+
+### เรียงลำดับข้อมูลภาษาไทยใน List
+
+```python
+from pythainlp.collation import collation
+print(collation(['ไก่','ไข่','ก','ฮา'])) # ['ก', 'ไก่', 'ไข่', 'ฮา']
+```
+
+รับ list คืนค่า list
+
+### รับเวลาปัจจุบันเป็นภาษาไทย
+
+```
+from pythainlp.date import now
+now() # '30 พฤษภาคม 2560 18:45:24'
+```
+### WordNet ภาษาไทย
+
+เรียกใช้งาน
+
+```
+from pythainlp.corpus import wordnet
+```
+
+**รับ Synset**
+
+```
+wordnet.getSynset(คำ)
+```
+
+เป็นคำสั่ง ใช้รับ Synset รับค่า str ส่งออกเป็น tuple ('Synset', 'synset li')
+
+**รับคำจาก id**
+
+```
+wordnet.getWords(id)
+```
+
+เป็นคำสั่ง ใช้รับคำจาก ID รับค่า str ส่งออกเป็น tuple ('Word', 'synsetid li')
+
+### stopword ภาษาไทย
+
+```python
+from pythainlp.corpus import stopwords
+stopwords = stopwords.words('thai')
+```
+
+### หาคำที่มีจำนวนการใช้งานมากที่สุด
+
+```
+pythainlp.rank.rank(list)
+```
+
+คืนค่าออกมาเป็น dict (ชนิด collections.Counter) ที่เก็บจำนวนครั้งของแต่ละคำ
+
+**ตัวอย่างการใช้งาน**
+
+```
+>>> pythainlp.rank.rank(['แมง','แมง','คน'])
+Counter({'แมง': 2, 'คน': 1})
+```
+
+### แก้ไขปัญหาการพิมพ์ลืมเปลี่ยนภาษา
+
+```
+pythainlp.change.คำสั่ง()
+```
+
+มีคำสั่งดังนี้
+
+- texttothai(str) แปลงแป้นตัวอักษรภาษาอังกฤษเป็นภาษาไทย
+- texttoeng(str) แปลงแป้นตัวอักษรภาษาไทยเป็นภาษาอังกฤษ
+
+คืนค่าออกมาเป็น str
diff --git a/example/segment.py b/example/segment.py
@@ -1,4 +1,4 @@
-from pythainlp.segment import segment
+from pythainlp.tokenize import word_tokenize
 a = 'ฉันรักภาษาไทยเพราะฉันเป็นคนไทยและฉันใช้ภาษาไทย'
-b = segment(a)
+b = word_tokenize(a)
 print(b)
diff --git a/pythainlp/postaggers/__init__.py b/pythainlp/postaggers/__init__.py
@@ -1,20 +1,19 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import,division,unicode_literals
-import pythainlp
 import codecs
 import os
 import json
 import nltk.tag
 import nltk.data
-templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), 'corpus')
-template_file = os.path.join(templates_dir, 'thaipos.json')
 def data():
+	import pythainlp
+	templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), 'corpus')
+	template_file = os.path.join(templates_dir, 'thaipos.json')
 	with codecs.open(template_file,'r',encoding='utf-8-sig') as handle:
 		model = json.load(handle)
 	return model
 def tag(text):
 	"""
-	หมายเหตุ API ชุดนี้เตรียมหยุดการใช้งาน
 	รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]"""
 	tagger = nltk.tag.UnigramTagger(model=data())# backoff=default_tagger)
 	return tagger.tag(text)
diff --git a/pythainlp/test/__init__.py b/pythainlp/test/__init__.py
@@ -30,6 +30,6 @@ def testTag(self):
 		self.assertEqual(pos_tag(word_tokenize("คุณกำลังประชุม"),engine='old'),[('คุณ', 'PPRS'), ('กำลัง', 'XVBM'), ('ประชุม', 'VACT')])
 	def testTagnew(self):
     		if sys.version_info > (3,3):
-    				self.assertEqual(pos_tag(word_tokenize("คุณกำลังประชุม"),engine='artagger'),[('ผม', 'PPRS'), ('รัก', 'VSTA'), ('คุณ', 'PPRS')])
+    				self.assertEqual(pos_tag(word_tokenize("ผมรักคุณ"),engine='artagger'),[('ผม', 'PPRS'), ('รัก', 'VSTA'), ('คุณ', 'PPRS')])
 if __name__ == '__main__':
     unittest.main()
diff --git a/setup.py b/setup.py
@@ -37,6 +37,5 @@
         'License :: OSI Approved :: Apache Software License',
         'Natural Language :: Thai',
         'Topic :: Text Processing :: Linguistic',
-        'Programming Language :: Python :: Implementation'
-    ],
+        'Programming Language :: Python :: Implementation'],
 )

Original file line number	Diff line number	Diff line change
`@@ -37,6 +37,5 @@`
`37`	`37`	`'License :: OSI Approved :: Apache Software License',`
`38`	`38`	`'Natural Language :: Thai',`
`39`	`39`	`'Topic :: Text Processing :: Linguistic',`
`40`		`- 'Programming Language :: Python :: Implementation'`
`41`		`- ],`
	`40`	`+ 'Programming Language :: Python :: Implementation'],`
`42`	`41`	`)`