-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstatistics_calculations.py
182 lines (145 loc) · 6.57 KB
/
statistics_calculations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import numpy as np
import pandas as pd
from defaults import *
from parsing_tools import Message
import matplotlib.pyplot as plt
def count_word(df: pd.DataFrame, word: str) -> pd.DataFrame:
    """Count, per author, the messages in which ``word`` is used.

    ``word`` is interpolated unescaped into a regular expression, so it may
    itself be a regex. A message matches when the word is immediately
    followed by whitespace or by the end of the text. There is no boundary
    check in front of the word, so it also matches as the tail of a longer
    token.

    Returns the result of ``groupby("author").count()`` on the matching
    rows: one row per author that has at least one match, one column per
    remaining column of ``df``.
    """
    # Non-capturing group: a capturing group makes pandas emit a
    # "pattern has match groups" warning on every call.
    pattern = r"{}(?:$|\s)".format(word)
    # na=False: rows with missing text count as "no match" instead of
    # producing NaN in the mask, which boolean indexing rejects.
    matches = df["text"].str.contains(pattern, na=False)
    return df[matches].groupby("author").count()
def count_haha(df: pd.DataFrame) -> pd.DataFrame:
    """Count, per author, the messages that contain a laugh.

    A laugh is a run of one or more "ח" that stands alone: it is preceded by
    the start of the text or a non-word character, and followed by the end
    of the text or a non-word character.

    Returns the result of ``groupby("author").count()`` on the matching
    rows.
    """
    # Non-capturing groups avoid pandas' match-group warning; na=False keeps
    # rows with missing text from breaking the boolean mask.
    laughs = df["text"].str.contains(r"(?:^|\W)ח+(?:$|\W)", na=False)
    return df[laughs].groupby("author").count()
def count_emoji(df: pd.DataFrame) -> pd.DataFrame:
    """Count, per author, the messages that contain at least one emoji.

    A message is counted once no matter how many emojis it holds; the
    pattern used is ``EMOJI_REGEX``.

    Returns the result of ``groupby("author").count()`` on the matching
    rows.
    """
    # na=False: rows with missing text are treated as "no emoji" rather than
    # yielding NaN in the mask, which boolean indexing rejects.
    has_emoji = df["text"].str.contains(EMOJI_REGEX, na=False)
    return df[has_emoji].groupby("author").count()
def count_questions(df: pd.DataFrame) -> pd.DataFrame:
    """Count, per author, the messages that contain a question mark.

    A message is counted once regardless of how many "?" it contains.

    Returns the result of ``groupby("author").count()`` on the matching
    rows.
    """
    # A literal substring test is equivalent to the regex r"\?+" for a
    # contains check. na=False treats missing text as "no question".
    asks = df["text"].str.contains("?", regex=False, na=False)
    return df[asks].groupby("author").count()
def count_curses(df: pd.DataFrame) -> pd.DataFrame:
    """Tally how often each curse word is used, broken down by author.

    The returned frame has one row per curse plus a final "total" row, and
    one column per author plus a final "total" column. All values are ints.
    """
    curses = ("פאק", "פאקינג", "שיט", "סעמק", "כוסאמק", "זונה", "דמט",
              "דאם", "דאמ", "לעזאזל", "זין", "רבאק")

    table = pd.DataFrame()
    for curse in curses:
        # Follow every letter with "+" so stretched spellings (repeated
        # letters) are matched as well.
        stretched = "".join(letter + "+" for letter in curse)
        column = pd.DataFrame()
        # Second column of the grouped counts; presumably the text column,
        # verify against the caller's frame layout.
        column[curse] = count_word(df, stretched).iloc[:, 1]
        table = pd.concat([table, column], axis=1)

    # Here rows are authors and columns are curses: total per author.
    table["total"] = table.sum(axis=1)
    table = table.fillna(0).transpose()
    # After the transpose rows are curses: total per curse.
    table["total"] = table.sum(axis=1)
    return table.astype(int)
def counter_by_user(df: pd.DataFrame, media_df: pd.DataFrame) -> pd.DataFrame:
    """
    Collect all per-author counters in a single frame.

    Rows: messages, media, words, hhh, emoji, questions, keilu.
    Columns: one per author, followed by a "total" column.
    Missing values are filled with 0 and everything is cast to int.
    """
    summary = pd.DataFrame()
    # .iloc[:, 1] takes the second column of the grouped counts; which
    # column that is depends on the layout of the input frame.
    summary["messages"] = df.groupby("author").count().iloc[:, 1]
    summary["media"] = media_df.groupby("author").count().iloc[:, 1]

    summary["words"] = None
    for author in Message.authors:
        own_texts = df.loc[df["author"] == author, "text"]
        # Filter here if you want to remove "yes" and whatever
        tokens = " ".join(own_texts).split()
        summary.loc[author, "words"] = len(tokens)

    summary["hhh"] = count_haha(df).iloc[:, 1]
    summary["emoji"] = count_emoji(df).iloc[:, 1]
    summary["questions"] = count_questions(df).iloc[:, 1]
    summary["keilu"] = count_word(df, "כאילו").iloc[:, 1]

    # Flip so that categories become rows and authors become columns.
    summary = summary.transpose()
    summary["total"] = summary.sum(axis=1)
    return summary.fillna(0).astype(int)
def reverse_hebrew_columns(authors: List[str]) -> Dict[str, str]:
    """Build a renaming dict for plot labels.

    Each name whose first character is in ``HEBREW_LETTERS`` is mapped to
    its reversed form; every other name is mapped to itself. Empty names
    are mapped to themselves instead of raising ``IndexError``.
    """
    return {
        # `name and ...` short-circuits on "" so name[0] is never evaluated
        # for an empty label.
        name: name[::-1] if name and name[0] in HEBREW_LETTERS else name
        for name in authors
    }
def plot_percentage(counter: pd.DataFrame) -> None:
    """
    Draw a stacked horizontal bar per category showing each user's share in percent.

    counter holds one column per user plus a "total" column, and one row per category.
    """
    # Users in alphabetical order, "total" kept last
    user_cols = sorted(counter.drop(columns=["total"]).columns)
    counter = counter[user_cols + ["total"]]
    # Reverse hebrew labels on both axes
    counter = counter.rename(index=reverse_hebrew_columns(counter.index))
    counter = counter.rename(columns=reverse_hebrew_columns(counter.columns))

    # Turn counts into percentages; rows whose total is 0 keep their original values
    for name in counter.columns[:-1]:
        share = counter[name] / counter["total"] * 100
        counter[name] = counter[name].mask(counter["total"] != 0, share)

    fig, ax = plt.subplots()
    fig.set_figheight(len(counter) * 0.6)

    categories = counter.index
    # First segment starts at 0
    ax.barh(categories, counter.iloc[:, 0], label=counter.columns[0])
    # Every further segment starts where the previous ones end
    last_user = len(counter.columns) - 1
    for pos in range(1, last_user):
        offset = counter.iloc[:, :pos].sum(axis=1)
        ax.barh(categories, counter.iloc[:, pos], height=0.8, left=offset,
                label=counter.columns[pos])

    # Exactly two users (plus "total"): labels are pinned to the two ends
    two_users = len(counter.columns) == 3
    for bar in ax.patches:
        caption = str(bar.get_width().round(1)) + '%'
        if not two_users:
            # More users: label at the left edge of the segment
            x_pos = bar.get_x()
        elif bar.get_x() == 0:
            x_pos = 0
        else:
            x_pos = 90
        ax.text(x_pos, bar.get_y() + 0.4, caption)

    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.xlim(0, 100)
    plt.show()
def plot_word(df: pd.DataFrame, word: str) -> None:
    """Count the use of a specific word by month and plot it per user.

    ``word`` is interpolated unescaped into a regular expression, so it may
    itself be a regex. A message matches when the word is immediately
    followed by whitespace or by the end of the text.

    If no message matches, a notice is printed and nothing is plotted.
    """
    # Take only messages where the word was used. The non-capturing group
    # avoids pandas' match-group warning; na=False treats rows with missing
    # text as "no match" instead of breaking the boolean mask.
    used = df["text"].str.contains(r"{}(?:$|\s)".format(word), na=False)
    word_df = df[used]
    # Sort columns alphabetically
    word_df = word_df.sort_index(axis=1)
    if word_df.empty:
        print(f"No matches found for {word}")
        return

    # Flip hebrew words to rtl for the title. The `word and` guard keeps an
    # empty pattern from raising IndexError on word[0].
    if word and word[0] in HEBREW_LETTERS:
        word = word[::-1]

    # Group by month and author
    # NOTE(review): recent pandas releases deprecate the 'M' alias in favour
    # of 'ME'; left as-is to stay compatible with older versions.
    word_df = word_df.groupby([pd.Grouper(freq='M', key='date'), 'author']).count()

    fig, ax = plt.subplots(figsize=(15, 7))
    word_df.unstack().plot(ax=ax)
    # Anchor the y axis at 0 while keeping the automatic upper limit
    plt.ylim(0, plt.ylim()[1])
    plt.title(f"Use Of {word} By User")
    plt.show()
def plot_hhh_distribution(df: pd.DataFrame) -> pd.DataFrame:
    """Plot a bar graph of laugh lengths (number of "ח" per laugh) for each user.

    Returns the table behind the plot: the index is the laugh length, there
    is one column per author (sorted), and missing combinations are 0.
    """
    haha_regex = r"(?:^|\W)(ח+)(?:$|\W)"
    laughs = df[df["text"].str.contains(haha_regex)]

    haha_df = pd.DataFrame()
    # Build one length histogram per author and join them on the length
    for author in laughs["author"].unique():
        own_texts = laughs[laughs["author"] == author]["text"]
        # NOTE(review): the pattern consumes the delimiter after a laugh, so
        # a second laugh separated by a single character may not be found.
        # Counts are per match, not per message - confirm this is intended.
        matches = own_texts.str.extractall(haha_regex)
        matches[0] = matches[0].apply(len)
        histogram = matches[0].value_counts()
        histogram.name = author
        haha_df = haha_df.merge(histogram, how="outer",
                                left_index=True, right_index=True)

    # Authors in alphabetical order, gaps filled with 0
    haha_df = haha_df.sort_index(axis=1).fillna(0)

    # Plot with hebrew author names reversed; the returned table keeps the originals
    plot_df = haha_df.rename(columns=reverse_hebrew_columns(haha_df.columns))
    plot_df.plot.bar()
    plt.title("Number of ח in חחח")
    plt.xlabel("Number of ח")
    plt.ylabel("Messages")
    return haha_df