Skip to content

Commit 5d598c6

Browse files
committed
update sentiment
1 parent b4442dd commit 5d598c6

File tree

3 files changed

+11
-8
lines changed

3 files changed

+11
-8
lines changed
Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1+
# -*- coding: utf-8 -*-
12
# เครื่องมือในการ build sentiment เพื่อใช้ใน pythainlp
2-
# เวชั่น 0.2
3-
# 2017/08/17
3+
# เวชั่น 0.3
4+
# 2018/01/18
45
# เขียนโดย นาย วรรณพงษ์ ภัททิยไพบูลย์
5-
# ใช้ประกอบบทความใน python3.wannaphong.com
66
# cc-by 3.0 Thai Sentiment Text https://github.com/wannaphongcom/lexicon-thai/tree/master/ข้อความ/
77
# อ่านบทความได้ที่ https://python3.wannaphong.com/2017/02/ทำ-sentiment-analysis-ภาษาไทยใน-python.html
88
from nltk import NaiveBayesClassifier as nbc
@@ -11,28 +11,31 @@ from pythainlp.tokenize import word_tokenize
1111
from pythainlp.corpus import stopwords
1212
import codecs
1313
from itertools import chain
14-
a = stopwords.words('thai')
14+
thaistopwords = stopwords.words('thai')
1515
# pos.txt
1616
with codecs.open('pos.txt', 'r', "utf-8") as f:
1717
lines = f.readlines()
18-
listpos=[e.strip() for e in lines]
18+
listpos=[x for x in [e.strip() for e in lines] if x not in thaistopwords]
1919
del lines
2020
f.close() # ปิดไฟล์
2121
# neg.txt
2222
with codecs.open('neg.txt', 'r', "utf-8") as f:
2323
lines = f.readlines()
24-
listneg=[e.strip() for e in lines]
24+
listneg=[x for x in [e.strip() for e in lines] if x not in thaistopwords]
2525
f.close() # ปิดไฟล์
2626
print(1)
2727
pos1=['pos']*len(listpos)
2828
neg1=['neg']*len(listneg)
2929
print(2)
3030
training_data = list(zip(listpos,pos1)) + list(zip(listneg,neg1))
3131
print(3)
32-
vocabulary = set(chain(*[(set(word_tokenize(i[0]))-set(stopwords.words('thai'))) for i in training_data]))
32+
#vocabulary = set(chain(*[(set(word_tokenize(i[0]))-set(stopwords.words('thai'))) for i in training_data]))
3333
#vocabulary = set(chain(*[x for x in a if x not in [list(set(word_tokenize(i[0]))) for i in training_data]]))
34+
vocabulary = set(chain(*[word_tokenize(i[0]) for i in training_data]))
35+
#print(vocabulary)
3436
print(3.1)
3537
feature_set = [({i:(i in word_tokenize(sentence)) for i in vocabulary},tag) for sentence, tag in training_data]
38+
#print(feature_set)
3639
print(4)
3740
classifier = nbc.train(feature_set)
3841
print(5)
@@ -42,4 +45,4 @@ out_strm.close()
4245
with open('sentiment.data', 'wb') as out_strm:
4346
dill.dump(classifier,out_strm)
4447
out_strm.close()
45-
print('OK')
48+
print('OK')

pythainlp/sentiment/sentiment.data

66.4 KB
Binary file not shown.

pythainlp/sentiment/vocabulary.data

12.2 KB
Binary file not shown.

0 commit comments

Comments (0)