+ # -*- coding: utf-8 -*-
# Tool for building the sentiment model used in pythainlp
- # Version 0.2
- # 2017/08/17
+ # Version 0.3
+ # 2018/01/18
# Written by วรรณพงษ์ ภัททิยไพบูลย์
- # Companion code for an article on python3.wannaphong.com
# cc-by 3.0 Thai Sentiment Text https://github.com/wannaphongcom/lexicon-thai/tree/master/ข้อความ/
# Read the article at https://python3.wannaphong.com/2017/02/ทำ-sentiment-analysis-ภาษาไทยใน-python.html
from nltk import NaiveBayesClassifier as nbc
@@ -11,28 +11,31 @@ from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import stopwords
import codecs
from itertools import chain
- a = stopwords.words('thai')
+ thaistopwords = stopwords.words('thai')
# pos.txt
with codecs.open('pos.txt', 'r', "utf-8") as f:
    lines = f.readlines()
- listpos=[e.strip() for e in lines]
+ listpos=[x for x in [e.strip() for e in lines] if x not in thaistopwords]
del lines
f.close() # close the file
# neg.txt
with codecs.open('neg.txt', 'r', "utf-8") as f:
    lines = f.readlines()
- listneg=[e.strip() for e in lines]
+ listneg=[x for x in [e.strip() for e in lines] if x not in thaistopwords]
f.close() # close the file
print(1)
pos1=['pos']*len(listpos)
neg1=['neg']*len(listneg)
print(2)
training_data = list(zip(listpos,pos1)) + list(zip(listneg,neg1))
print(3)
- vocabulary = set(chain(*[(set(word_tokenize(i[0]))-set(stopwords.words('thai'))) for i in training_data]))
+ # vocabulary = set(chain(*[(set(word_tokenize(i[0]))-set(stopwords.words('thai'))) for i in training_data]))
# vocabulary = set(chain(*[x for x in a if x not in [list(set(word_tokenize(i[0]))) for i in training_data]]))
+ vocabulary = set(chain(*[word_tokenize(i[0]) for i in training_data]))
+ # print(vocabulary)
print(3.1)
feature_set = [({i:(i in word_tokenize(sentence)) for i in vocabulary},tag) for sentence, tag in training_data]
+ # print(feature_set)
print(4)
classifier = nbc.train(feature_set)
print(5)
@@ -42,4 +45,4 @@ out_strm.close()
with open('sentiment.data', 'wb') as out_strm:
    dill.dump(classifier,out_strm)
out_strm.close()
- print('OK')
+ print('OK')
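For reference, a minimal sketch of how the pickled classifier might be used after this script has run. It assumes the `sentiment.data` file written above is in the working directory; the feature dict here is built only from the tokens of the input text (NLTK's `NaiveBayesClassifier` discards feature names it never saw during training), which is a simplification of the training-time featurization over the full vocabulary.

```python
# Minimal usage sketch (assumes sentiment.data was produced by the script above).
import dill
from pythainlp.tokenize import word_tokenize

with open('sentiment.data', 'rb') as f:
    classifier = dill.load(f)

def classify_text(text):
    # Build a bag-of-words feature dict from the tokens of the input text.
    # Feature names unseen at training time are ignored by NLTK's classifier.
    tokens = word_tokenize(text)
    features = {word: True for word in tokens}
    return classifier.classify(features)  # returns 'pos' or 'neg'

print(classify_text('อาหารอร่อยมาก'))  # example Thai sentence ("the food is very tasty")
```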