+ # -*- coding: utf-8 -*-
# Tool for building the sentiment model used in pythainlp
- # Version 0.2
- # 2017/08/17
+ # Version 0.3
+ # 2018/01/18
# Written by วรรณพงษ์ ภัททิยไพบูลย์
- # Companion code for an article on python3.wannaphong.com
# cc-by 3.0 Thai Sentiment Text https://github.com/wannaphongcom/lexicon-thai/tree/master/ข้อความ/
# Read the article at https://python3.wannaphong.com/2017/02/ทำ-sentiment-analysis-ภาษาไทยใน-python.html
from nltk import NaiveBayesClassifier as nbc
@@ -11,28 +11,31 @@ from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import stopwords
import codecs
from itertools import chain
- a = stopwords.words('thai')
+ thaistopwords = stopwords.words('thai')
# pos.txt
with codecs.open('pos.txt', 'r', "utf-8") as f:
    lines = f.readlines()
- listpos=[e.strip() for e in lines]
+ listpos=[x for x in [e.strip() for e in lines] if x not in thaistopwords]
del lines
f.close() # close the file
# neg.txt
with codecs.open('neg.txt', 'r', "utf-8") as f:
    lines = f.readlines()
- listneg=[e.strip() for e in lines]
+ listneg=[x for x in [e.strip() for e in lines] if x not in thaistopwords]
f.close() # close the file
print(1)
pos1=['pos']*len(listpos)
neg1=['neg']*len(listneg)
print(2)
training_data = list(zip(listpos,pos1)) + list(zip(listneg,neg1))
print(3)
- vocabulary = set(chain(*[(set(word_tokenize(i[0]))-set(stopwords.words('thai'))) for i in training_data]))
+ # vocabulary = set(chain(*[(set(word_tokenize(i[0]))-set(stopwords.words('thai'))) for i in training_data]))
# vocabulary = set(chain(*[x for x in a if x not in [list(set(word_tokenize(i[0]))) for i in training_data]]))
+ vocabulary = set(chain(*[word_tokenize(i[0]) for i in training_data]))
+ # print(vocabulary)
print(3.1)
feature_set = [({i:(i in word_tokenize(sentence)) for i in vocabulary},tag) for sentence, tag in training_data]
+ # print(feature_set)
print(4)
classifier = nbc.train(feature_set)
print(5)
@@ -42,4 +45,4 @@ out_strm.close()
with open('sentiment.data', 'wb') as out_strm:
    dill.dump(classifier,out_strm)
out_strm.close()
- print('OK')
+ print('OK')
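For reference, a minimal sketch of how the pickled classifier might be used after this script has run. It assumes the `sentiment.data` file written above is in the working directory; the feature dict here is built only from the tokens of the input text (NLTK's `NaiveBayesClassifier` discards feature names it never saw during training), which is a simplification of the training-time featurization over the full vocabulary.

```python
# Minimal usage sketch (assumes sentiment.data was produced by the script above).
import dill
from pythainlp.tokenize import word_tokenize

with open('sentiment.data', 'rb') as f:
    classifier = dill.load(f)

def classify_text(text):
    # Build a bag-of-words feature dict from the tokens of the input text.
    # Feature names unseen at training time are ignored by NLTK's classifier.
    tokens = word_tokenize(text)
    features = {word: True for word in tokens}
    return classifier.classify(features)  # returns 'pos' or 'neg'

print(classify_text('อาหารอร่อยมาก'))  # example Thai sentence ("the food is very tasty")
```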