-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbase.py
More file actions
42 lines (28 loc) · 1.16 KB
/
Copy pathbase.py
File metadata and controls
42 lines (28 loc) · 1.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from preprocessor import Preprocessor
from tf import TF
from tfidf import TFIDF
from training import Training
# Training script: load the CSV training data, run it through the
# preprocessor, append the output of each feature extractor as new columns,
# and hand the resulting frame to Training with a random-forest classifier.

print("> Loading training set")
X, y = (pd.read_csv(csv_path) for csv_path in ("train_set_x.csv", "train_set_y.csv"))

# NOTE: for quick experiments both frames can be shortened to the same
# number of rows right here, e.g. with DataFrame.truncate(after=10000).

# The 'Id' column is removed from both the inputs and the labels.
for frame in (X, y):
    frame.drop('Id', axis=1, inplace=True)

# NOTE(review): presumably this rewrites X in place (inplace=True) --
# confirm against Preprocessor.process.
preprocessor = Preprocessor()
preprocessor.process(X, inplace=True)

print("> Creating feature extraction pipeline")
# Both extractors are constructed from the preprocessed frame; TFIDF is
# additionally given the label frame.
feature_extraction_pipeline = [
    TF(X),
    TFIDF(X, category_df=y),
]

print("> Extracting features from training set")
# Run every extractor in order and append its (prefixed) output to X as
# additional columns.
for extractor in feature_extraction_pipeline:
    prefixed_features = extractor.addPrefix(extractor.extractFeatures())
    X = pd.concat([X, prefixed_features], axis=1)

# The raw text column is not needed once the features have been extracted.
X.drop('Text', axis=1, inplace=True)

classifier = RandomForestClassifier(n_estimators=400)
trainer = Training(X, y, feature_extraction_pipeline, classifier)
# NOTE(review): validation=True is passed straight through to
# Training.train; its exact effect is defined there.
trainer.train(validation=True)

print("> Done")