-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtf.py
More file actions
56 lines (42 loc) · 1.81 KB
/
Copy pathtf.py
File metadata and controls
56 lines (42 loc) · 1.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import pandas as pd
from featureExtractor import FeatureExtractor
class TF(FeatureExtractor):
    """Character-level term-frequency feature extractor.

    Every string in the first column of a data frame is split into single
    characters, each character is lower-cased, and the relative frequency of
    each resulting token (occurrences / string length) is recorded per row.

    NOTE(review): this class relies on ``FeatureExtractor.__init__`` storing
    the data frame it receives as ``self.train_set``; ``'tf_'`` is presumably
    a column-name prefix -- confirm against ``FeatureExtractor``.
    """

    def __init__(self, df):
        """Initialize a TF instance.

        df: m * 1 data frame whose first column holds the strings to tokenize.
        """
        FeatureExtractor.__init__(self, df, 'tf_')
        # Maps each unique token seen during training to a list of size m
        # holding that token's relative frequency for every training example.
        self.tokens = {}

    @staticmethod
    def _accumulate_frequencies(strings, tokens, n_rows, learn_vocabulary):
        """Add the per-row character frequencies of ``strings`` into ``tokens``.

        strings: iterable of the n_rows strings to process, in row order.
        tokens: dict mapping token -> list of n_rows floats, updated in place.
        n_rows: length of the list created for a newly discovered token.
        learn_vocabulary: when True, unseen tokens are added to ``tokens``;
            when False, they are ignored so the set of keys stays fixed.

        Returns the same ``tokens`` dict for convenience.
        """
        for row, text in enumerate(strings):
            text_len = len(text)
            # An empty string contributes nothing: the inner loop never runs,
            # so the division below is never evaluated with text_len == 0.
            for char in text:
                token = char.lower()
                if token not in tokens:
                    if not learn_vocabulary:
                        continue
                    tokens[token] = [0.0] * n_rows
                tokens[token][row] += 1.0 / text_len
        return tokens

    def extractFeatures(self):
        """Learn the token vocabulary from the training set and return its features.

        The vocabulary is rebuilt from scratch on every call, so calling this
        method more than once does not double-count frequencies.

        Returns a DataFrame with one row per training example (in positional
        order, with a default integer index) and one column per token, in
        order of first appearance.
        """
        # Positional access: rows are read in order regardless of the index
        # labels, so filtered or shuffled frames are handled correctly.
        strings = self.train_set.iloc[:, 0]
        self.tokens = self._accumulate_frequencies(
            strings, {}, len(strings), learn_vocabulary=True)
        return pd.DataFrame(self.tokens)

    def applyToTest(self, test_df):
        """Compute frequencies of the learned tokens for a test set.

        test_df: m * 1 array (data frame); only its first column is read.

        Tokens that were not seen by ``extractFeatures`` are ignored, so the
        result always has exactly the columns of the training features, in
        the same order. Returns a DataFrame with one row per test example
        (in positional order, with a default integer index).
        """
        strings = test_df.iloc[:, 0]
        n_rows = len(strings)
        # Start every known token at zero so tokens absent from the test set
        # still get a (float) column.
        test_tokens = {token: [0.0] * n_rows for token in self.tokens}
        self._accumulate_frequencies(
            strings, test_tokens, n_rows, learn_vocabulary=False)
        return pd.DataFrame(test_tokens)