forked from csc4790-fall2019/sp-hagan-solomon-josh-nelson
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathzipf.py
More file actions
121 lines (87 loc) · 3.71 KB
/
zipf.py
File metadata and controls
121 lines (87 loc) · 3.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import json
from pathlib import Path
from nltk import FreqDist
import matplotlib.pyplot as plt
import re
import collections
# Common English filler words (plus a few Reddit-specific terms) that are
# excluded from frequency counts because they carry no topical meaning.
stopwords = [
    'a', 'to', 'from', 'i', 'the', 'and', 'you', 'what', 'of', 'is',
    'your', 'that', 'in', 'do', 'are', 'have', 'would', 'for', 'on',
    'with', 'or', 'thing', 'reddit', 'who',
]
def populate_freq_dist_stop_list(subreddit_name):
    """Build a word-frequency distribution from the titles of every JSON post
    file under subreddits/<subreddit_name>/ and persist it to disk.

    Tokens found in the module-level ``stopwords`` list are skipped, and
    all-caps words (acronyms/shouting) are stripped from each title first.

    Writes two files into subreddits_freq_dist/:
      <name>_freq_dist.json      -- {word: count}, ordered by descending count
      <name>_freq_dist_size.json -- {'size': ..., 'top_10': [...], 'top_25': [...]}
    """
    freq_dist = FreqDist()
    subreddit = Path('subreddits/{0}'.format(subreddit_name))
    fd_path = Path('subreddits_freq_dist')
    # Ensure the output directory exists before writing results.
    if not fd_path.exists():
        fd_path.mkdir(parents=True)
    directories = [x for x in subreddit.iterdir() if x.is_dir()]
    for directory in directories:
        for file in directory.iterdir():
            with open(file) as f:
                data = json.load(f)
            title = data['title']
            # BUG FIX: the original computed this cleaned string but then
            # split the raw title, so all-caps words were never removed.
            # (Also renamed the local from `str`, which shadowed the builtin.)
            normal = re.sub(r'\b[A-Z]+\b', '', title)
            for token in normal.split():
                if token not in stopwords:
                    # FreqDist is a Counter subclass: missing keys read as 0,
                    # so no explicit membership check is needed.
                    freq_dist[token] += 1
    # Order words by descending frequency.
    sorted_freq_dist = collections.OrderedDict(freq_dist.most_common())
    num_top_10 = round(len(sorted_freq_dist) * 0.1)
    num_top_25 = round(len(sorted_freq_dist) * 0.25)
    top_10_percent = []
    top_25_percent = []
    # BUG FIX: the original compared against hard-coded counts 10 and 20
    # instead of the computed 10% / 25% cutoffs it had just calculated.
    for counter, key in enumerate(sorted_freq_dist, start=1):
        if counter <= num_top_10:
            top_10_percent.append(key)
        if counter <= num_top_25:
            top_25_percent.append(key)
        else:
            break  # keys are frequency-ordered; no later key can qualify
    with open(fd_path / '{0}_freq_dist.json'.format(subreddit_name), 'w', encoding='utf-8') as file:
        json.dump(sorted_freq_dist, file)
    size = {'size': len(sorted_freq_dist), 'top_10': top_10_percent, 'top_25': top_25_percent}
    with open(fd_path / '{0}_freq_dist_size.json'.format(subreddit_name), 'w', encoding='utf-8') as file:
        json.dump(size, file)
def stringify(subreddit):
    """Print the saved top-10% and top-25% word lists for *subreddit*.

    Reads subreddits_freq_dist/<subreddit>_freq_dist_size.json (written by
    populate_freq_dist_stop_list) and prints each list one word per line.
    Raises FileNotFoundError if the size file has not been generated yet.
    """
    subreddit_dist_size = Path('subreddits_freq_dist/{}_freq_dist_size.json'.format(subreddit))
    with open(subreddit_dist_size) as f:
        size_data = json.load(f)
    # Join with newlines so every word prints on its own line.
    str_top_10 = '\n'.join(size_data['top_10'])
    str_top_25 = '\n'.join(size_data['top_25'])
    print(str_top_10)
    print(str_top_25)
def determine_top_words(title, subreddit):
    """Report which words of *title* rank in the top 10% / top 25% most
    frequent words of *subreddit*'s saved frequency distribution.

    Prints both sets, and returns them as a (top_10, top_25) tuple so the
    result can also be used programmatically (the original returned None,
    so the added return value is backward-compatible).

    Raises FileNotFoundError if the distribution files have not been
    generated by populate_freq_dist_stop_list yet.
    """
    subreddit_dist_size = Path('subreddits_freq_dist/{}_freq_dist_size.json'.format(subreddit))
    subreddit_dist = Path('subreddits_freq_dist/{}_freq_dist.json'.format(subreddit))
    with open(subreddit_dist_size) as f:
        length_of_dict = json.load(f)['size']
    with open(subreddit_dist) as g:
        # JSON objects preserve insertion order, so this keeps the
        # descending-frequency order the file was written in.
        data = collections.OrderedDict(json.load(g))
    # BUG FIX: re.sub returns a new string; the original discarded it, so
    # all-caps words were never stripped before tokenizing.
    tokens = re.sub(r'\b[A-Z]+\b', '', title).split()
    num_top_10 = round(length_of_dict * 0.1)
    num_top_25 = round(length_of_dict * 0.25)
    # Rank of each word = its position in the frequency-ordered dict.
    # BUG FIX: the original called .index() on an OrderedDict / dict_keys
    # (neither has it) and .add() on dicts (only sets have it); it would
    # have raised AttributeError on any title word present in the data.
    ranks = {word: position for position, word in enumerate(data)}
    top_10_percent = set()
    top_25_percent = set()
    for token in tokens:
        rank = ranks.get(token)
        if rank is None:
            continue  # word never seen in the subreddit's titles
        if rank <= num_top_10:
            top_10_percent.add(token)
        if rank <= num_top_25:
            top_25_percent.add(token)
    print(top_10_percent)
    print(top_25_percent)
    return top_10_percent, top_25_percent
# Ad-hoc driver calls. NOTE(review): stringify runs on import because there is
# no `if __name__ == "__main__":` guard — importing this module has the side
# effect of printing AskReddit's top-word lists (and raises FileNotFoundError
# if the freq-dist files were never generated). Consider adding a main guard.
#populate_freq_dist_stop_list('AskReddit')
stringify('AskReddit')
#determine_top_words('What it be people made random fluff noise', 'AskReddit')