-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeatures.py
84 lines (67 loc) · 1.76 KB
/
features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import re
import nltk
import string
from nltk.util import ngrams
from nltk import FreqDist
from collections import Counter
def detect_yeah(row, feature):
    """Append 1 to ``feature`` if the first word of ``row`` is "yeah", else 0.

    The first whitespace-delimited token of ``row`` is stripped of every
    character outside ``a-z``/``A-Z`` and lower-cased before the comparison,
    so "Yeah," and "YEAH!" both count.

    Args:
        row: Text of one utterance.
        feature: List that receives the 0/1 flag (mutated in place).
    """
    # Split on any run of whitespace so leading blanks, tabs or newlines do
    # not hide the first word or get glued onto it; blank rows give no token.
    tokens = row.split(None, 1)
    first_word = re.sub(r"[^a-zA-Z]", "", tokens[0]).lower() if tokens else ""
    feature.append(1 if first_word == "yeah" else 0)
def detect_prop(row):
    """Return 1 if ``row`` starts with "I", "I'm", "you" or "you're", else 0.

    The first whitespace-delimited token of ``row`` is stripped of every
    character outside ``a-z``/``A-Z`` and lower-cased, then compared against
    the normalized forms "i", "im", "you" and "youre".

    Args:
        row: Text of one utterance.

    Returns:
        1 when the first word is one of the pronoun forms above, otherwise 0.
    """
    # Split on any run of whitespace so leading blanks, tabs or newlines do
    # not hide the first word or get glued onto it.
    tokens = row.split(None, 1)
    if not tokens:
        return 0
    first_word = re.sub(r"[^a-zA-Z]", "", tokens[0]).lower()
    return int(first_word in {"i", "im", "you", "youre"})
def get_uni(first, second, uni):
    """Print unigram frequency diagnostics for the lines in ``first``.

    Each line is tokenized with ``nltk.word_tokenize``; tokens consisting
    solely of ASCII punctuation are dropped and the rest are counted as
    1-grams in an ``nltk.FreqDist``. The 50 most common unigrams and the
    count recorded for "but" are then printed.

    Args:
        first: Iterable of text lines to count.
        second: Not used by the current implementation.
        uni: Not used by the current implementation.

    Returns:
        None; the results are only printed.
    """
    # Escape the punctuation before building the class: unescaped, the
    # backslash in string.punctuation escapes the following "]" and is
    # itself never matched, so a lone "\" token slipped through the filter.
    punctuation_only = re.compile('[' + re.escape(string.punctuation) + ']+')
    unigram_fdist = FreqDist()
    for line in first:
        tokens = [
            tok for tok in nltk.word_tokenize(line)
            if not punctuation_only.fullmatch(tok)
        ]
        unigram_fdist.update(ngrams(tokens, 1))
    print(unigram_fdist.most_common(50))
    # ngrams(..., 1) yields 1-tuples, so the key for "but" is ("but",);
    # looking up the bare string always returned None.
    print(unigram_fdist.get(("but",)))
# print(bigramfdist.viewitems())
# regex = re.compile('[^a-zA-Z]')
# for line in first:
# for word in line.split():
# word = regex.sub('', word).lower()
# if(uni.get(word) == None):
# uni[word] = 1
# else:
# uni[word] += 1
# for line in second:
# for word in line.split():
# word = regex.sub('', word).lower()
# if(uni.get(word) == None):
# uni[word] = 1
# else:
# uni[word] += 1
# for key, value in dict(uni).items():
# if value < 5000:
# del uni[key]
# for key in uni:
# print(key)
# print(uni)
# print(len(uni))
def uni_feature(row, uni, feature):
    """Append one occurrence count per vocabulary word to ``feature``.

    Every whitespace-delimited token of ``row`` is stripped of characters
    outside ``a-z``/``A-Z`` and lower-cased. For each key of ``uni``, in
    iteration order, the number of normalized tokens equal to that key is
    appended to ``feature``.

    Args:
        row: Text of one utterance.
        uni: Mapping whose keys form the vocabulary; the values are not read.
        feature: List that receives ``len(uni)`` counts (mutated in place).
    """
    non_letters = re.compile(r"[^a-zA-Z]")
    # Count every normalized token once, then read the counts back in
    # vocabulary order; a Counter returns 0 for words absent from the row.
    counts = Counter(non_letters.sub("", word).lower() for word in row.split())
    feature.extend(counts[key] for key in uni)