# Language-Classification/base.py at master · zeyadhazem/Language-Classification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

from preprocessor import Preprocessor
from tf import TF
from tfidf import TFIDF
from training import Training

# Training entry point: load the data set, build the feature extractors,
# derive the feature columns and fit a random forest on the result.
print("> Loading training set")

X = pd.read_csv("train_set_x.csv")
y = pd.read_csv("train_set_y.csv")

# For a quicker experiment both frames can be shortened here first,
# e.g. by calling DataFrame.truncate(after=10000) on X and on y.

# The 'Id' column is discarded from the samples and from the labels.
for frame in (X, y):
    frame.drop('Id', axis=1, inplace=True)

# Preprocessor is a project class; it is asked to work in place, so X itself
# is presumably modified rather than copied -- confirm in preprocessor.py.
preprocessor = Preprocessor()
preprocessor.process(X, inplace=True)

print("> Creating feature extraction pipeline")

# Extractors are built from the preprocessed X; the second one is also given
# the labels through category_df.
feature_extraction_pipeline = [
    TF(X),
    TFIDF(X, category_df=y),
]

print("> Extracting features from training set")

# Run every extractor in order and attach its prefixed output to X as
# additional columns.
for extractor in feature_extraction_pipeline:
    features = extractor.extractFeatures()
    prefixed_features = extractor.addPrefix(features)
    X = pd.concat([X, prefixed_features], axis=1)

# The raw text is no longer needed once the features have been extracted.
X.drop('Text', axis=1, inplace=True)

classifier = RandomForestClassifier(n_estimators=400)
trainer = Training(X, y, feature_extraction_pipeline, classifier)
trainer.train(validation=True)

print("> Done")