"""
classifier.py
==========
Using the trained data generated by ``train.py`` classify the input articles
found in the folder ``input``.
Usage
-----
$ python classifier.py
"""
import os
import re
import sys
import json
from glob import glob

# joblib is a standalone package in recent scikit-learn versions
# (sklearn.externals.joblib was removed in 0.23), and Bunch now lives
# in sklearn.utils.
import joblib
from sklearn.utils import Bunch

import scraper


def get_data(data_path):
    """Load the articles to classify from the ``*.json`` files in *data_path*."""
    all_data = []
    for path in glob(os.path.join(data_path, '*.json')):
        with open(path) as jsonfile:
            data = json.load(jsonfile)
        # Each file holds a list of scraped articles; keep the cleaned text.
        for article in data.get('articles', []):
            all_data.append(scraper.clean(article['content']))
    return Bunch(categories=list(scraper.CATEGORIES.keys()),
                 values=None,
                 data=all_data)
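
# Each input file is expected to look roughly like this (schema inferred
# from the keys accessed above; the exact shape is an assumption):
#
#   {"articles": [{"content": "Plain text of one article ..."},
#                 {"content": "..."}]}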


def main(path):
    # Check that at least one model exists before picking the latest one:
    # max() on an empty list raises ValueError, so the emptiness check
    # has to come first.
    files = glob(os.path.join(path, '*.pkl'))
    if not files:
        print("No models found in %s" % path)
        sys.exit(1)
    # Pick the newest model, assuming the filenames carry a numeric id.
    filename = max(files, key=lambda x: int(re.sub(r'\D', '', x) or 0))
    # Load the pipeline components from the generated .pkl file.
    model = joblib.load(filename)
    data = get_data('input')
    # Apply the same transformations used at training time, then predict.
    data_weighted = model['vectorizer'].transform(data.data)
    data_weighted = model['feature_selection'].transform(data_weighted)
    predictions = model['clf'].predict(data_weighted)
    # Print the predicted category next to the start of each article.
    for text, label in zip(data.data, predictions):
        print(data.categories[label].ljust(15), text[:100], '...')


if __name__ == '__main__':
    main('training')
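
# For reference, a minimal sketch of how ``train.py`` presumably produces
# the .pkl consumed above. The dict keys ('vectorizer', 'feature_selection',
# 'clf') are taken from main(); the concrete estimators and the k value are
# assumptions, not the original training code:
#
#   import joblib
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   from sklearn.feature_selection import SelectKBest, chi2
#   from sklearn.naive_bayes import MultinomialNB
#
#   vectorizer = TfidfVectorizer()
#   weighted = vectorizer.fit_transform(texts)          # texts: list of str
#   selector = SelectKBest(chi2, k=1000).fit(weighted, labels)
#   clf = MultinomialNB().fit(selector.transform(weighted), labels)
#   joblib.dump({'vectorizer': vectorizer,
#                'feature_selection': selector,
#                'clf': clf},
#               'training/1.pkl')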