forked from csc4790-fall2019/sp-hagan-solomon-josh-nelson
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathzipf.py
More file actions
121 lines (87 loc) · 3.71 KB
/
zipf.py
File metadata and controls
121 lines (87 loc) · 3.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import json
from pathlib import Path
from nltk import FreqDist
import matplotlib.pyplot as plt
import re
import collections
# Common English filler words (plus a few Reddit-specific terms) that are
# excluded from frequency counts because they carry no topical meaning.
stopwords = [
    'a', 'to', 'from', 'i', 'the', 'and', 'you', 'what', 'of', 'is',
    'your', 'that', 'in', 'do', 'are', 'have', 'would', 'for', 'on',
    'with', 'or', 'thing', 'reddit', 'who',
]
def populate_freq_dist_stop_list(subreddit_name):
    """Build a word-frequency distribution from the titles of every JSON post
    file under subreddits/<subreddit_name>/ and persist it to disk.

    Tokens found in the module-level ``stopwords`` list are skipped, and
    all-caps words (acronyms/shouting) are stripped from each title first.

    Writes two files into subreddits_freq_dist/:
      <name>_freq_dist.json      -- {word: count}, ordered by descending count
      <name>_freq_dist_size.json -- {'size': ..., 'top_10': [...], 'top_25': [...]}
    """
    freq_dist = FreqDist()
    subreddit = Path('subreddits/{0}'.format(subreddit_name))
    fd_path = Path('subreddits_freq_dist')
    # Ensure the output directory exists before writing results.
    if not fd_path.exists():
        fd_path.mkdir(parents=True)
    directories = [x for x in subreddit.iterdir() if x.is_dir()]
    for directory in directories:
        for file in directory.iterdir():
            with open(file) as f:
                data = json.load(f)
            title = data['title']
            # BUG FIX: the original computed this cleaned string but then
            # split the raw title, so all-caps words were never removed.
            # (Also renamed the local from `str`, which shadowed the builtin.)
            normal = re.sub(r'\b[A-Z]+\b', '', title)
            for token in normal.split():
                if token not in stopwords:
                    # FreqDist is a Counter subclass: missing keys read as 0,
                    # so no explicit membership check is needed.
                    freq_dist[token] += 1
    # Order words by descending frequency.
    sorted_freq_dist = collections.OrderedDict(freq_dist.most_common())
    num_top_10 = round(len(sorted_freq_dist) * 0.1)
    num_top_25 = round(len(sorted_freq_dist) * 0.25)
    top_10_percent = []
    top_25_percent = []
    # BUG FIX: the original compared against hard-coded counts 10 and 20
    # instead of the computed 10% / 25% cutoffs it had just calculated.
    for counter, key in enumerate(sorted_freq_dist, start=1):
        if counter <= num_top_10:
            top_10_percent.append(key)
        if counter <= num_top_25:
            top_25_percent.append(key)
        else:
            break  # keys are frequency-ordered; no later key can qualify
    with open(fd_path / '{0}_freq_dist.json'.format(subreddit_name), 'w', encoding='utf-8') as file:
        json.dump(sorted_freq_dist, file)
    size = {'size': len(sorted_freq_dist), 'top_10': top_10_percent, 'top_25': top_25_percent}
    with open(fd_path / '{0}_freq_dist_size.json'.format(subreddit_name), 'w', encoding='utf-8') as file:
        json.dump(size, file)
def stringify(subreddit):
    """Print the saved top-10% and top-25% word lists for *subreddit*.

    Reads subreddits_freq_dist/<subreddit>_freq_dist_size.json (written by
    populate_freq_dist_stop_list) and prints each list one word per line.
    Raises FileNotFoundError if the size file has not been generated yet.
    """
    subreddit_dist_size = Path('subreddits_freq_dist/{}_freq_dist_size.json'.format(subreddit))
    with open(subreddit_dist_size) as f:
        size_data = json.load(f)
    # Join with newlines so every word prints on its own line.
    str_top_10 = '\n'.join(size_data['top_10'])
    str_top_25 = '\n'.join(size_data['top_25'])
    print(str_top_10)
    print(str_top_25)
def determine_top_words(title, subreddit):
    """Report which words of *title* rank in the top 10% / top 25% most
    frequent words of *subreddit*'s saved frequency distribution.

    Prints both sets, and returns them as a (top_10, top_25) tuple so the
    result can also be used programmatically (the original returned None,
    so the added return value is backward-compatible).

    Raises FileNotFoundError if the distribution files have not been
    generated by populate_freq_dist_stop_list yet.
    """
    subreddit_dist_size = Path('subreddits_freq_dist/{}_freq_dist_size.json'.format(subreddit))
    subreddit_dist = Path('subreddits_freq_dist/{}_freq_dist.json'.format(subreddit))
    with open(subreddit_dist_size) as f:
        length_of_dict = json.load(f)['size']
    with open(subreddit_dist) as g:
        # JSON objects preserve insertion order, so this keeps the
        # descending-frequency order the file was written in.
        data = collections.OrderedDict(json.load(g))
    # BUG FIX: re.sub returns a new string; the original discarded it, so
    # all-caps words were never stripped before tokenizing.
    tokens = re.sub(r'\b[A-Z]+\b', '', title).split()
    num_top_10 = round(length_of_dict * 0.1)
    num_top_25 = round(length_of_dict * 0.25)
    # Rank of each word = its position in the frequency-ordered dict.
    # BUG FIX: the original called .index() on an OrderedDict / dict_keys
    # (neither has it) and .add() on dicts (only sets have it); it would
    # have raised AttributeError on any title word present in the data.
    ranks = {word: position for position, word in enumerate(data)}
    top_10_percent = set()
    top_25_percent = set()
    for token in tokens:
        rank = ranks.get(token)
        if rank is None:
            continue  # word never seen in the subreddit's titles
        if rank <= num_top_10:
            top_10_percent.add(token)
        if rank <= num_top_25:
            top_25_percent.add(token)
    print(top_10_percent)
    print(top_25_percent)
    return top_10_percent, top_25_percent
# Ad-hoc driver calls. NOTE(review): stringify runs on import because there is
# no `if __name__ == "__main__":` guard — importing this module has the side
# effect of printing AskReddit's top-word lists (and raises FileNotFoundError
# if the freq-dist files were never generated). Consider adding a main guard.
#populate_freq_dist_stop_list('AskReddit')
stringify('AskReddit')
#determine_top_words('What it be people made random fluff noise', 'AskReddit')