-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmetrics.py
78 lines (68 loc) · 3.5 KB
/
metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import pandas as pd
from functools import reduce
# precision(df, True, 1) == 0.5
# precision(df, False, None) == 0.5
def precision(df, single=False, query_number=None):
    """
    Compute precision for one query or macro-averaged over every query.

    :param df: DataFrame with at least a 'query' column (query number) and a
        'y_true' column (relevance label of each retrieved row)
    :param single: Boolean: when truthy, evaluate only ``query_number``;
        otherwise evaluate every query present in ``df``
    :param query_number: Integer/None: the query to evaluate when ``single``
        is truthy; ignored otherwise
    :return: the mean of 'y_true' for the selected query, or the mean of the
        per-query means when ``single`` is falsy (NaN if no rows are selected)
    """
    if not single:
        # Macro average: every query weighs the same regardless of how many
        # rows were retrieved for it.
        per_query_precision = df.groupby('query')['y_true'].mean()
        return per_query_precision.mean()

    query_rows = df[df['query'] == query_number]
    return query_rows['y_true'].mean()
def recall_single(df, num_of_relevant, query_number):
    """
    Calculate the recall of a single query.

    :param df: DataFrame with at least a 'query' column (query number) and a
        'y_true' column (relevance label of each retrieved row)
    :param num_of_relevant: Integer: total number of relevant tweets that
        exist for this query
    :param query_number: Integer: the query whose recall is evaluated
    :return: Double - sum of 'y_true' over the query's rows divided by
        ``num_of_relevant``; 0.0 when ``num_of_relevant`` is 0
    """
    # With nothing relevant to find, recall is undefined. Report 0.0 rather
    # than dividing by zero, which would produce inf/nan and poison any
    # average built on top of this value.
    if num_of_relevant == 0:
        return 0.0

    retrieved_relevant = df.loc[df['query'] == query_number, 'y_true'].sum()
    return retrieved_relevant / num_of_relevant
# recall(df, {1: 2}) == 0.5
# recall(df, {1: 2, 2: 3, 3: 1}) == 0.388
def recall(df, num_of_relevant):
    """
    Calculate the recall macro-averaged over the queries in ``num_of_relevant``.

    :param df: DataFrame with at least a 'query' column (query number) and a
        'y_true' column (relevance label of each retrieved row)
    :param num_of_relevant: Dictionary: keys are query numbers and values are
        the number of relevant tweets for that query. Only the queries listed
        here are evaluated.
    :return: Double - the mean of the per-query recalls; 0.0 when
        ``num_of_relevant`` is empty
    """
    # No queries to score: return 0.0 instead of dividing by zero.
    if not num_of_relevant:
        return 0.0

    total_recall = sum(
        recall_single(df, relevant, query_number)
        for query_number, relevant in num_of_relevant.items()
    )
    return total_recall / len(num_of_relevant)
# precision_at_n(df, 1, 2) == 0.5
# precision_at_n(df, 3, 1) == 0
def precision_at_n(df, query_number=1, n=5):
    """
    Compute the precision over the first n rows retrieved for one query.

    :param df: DataFrame with at least a 'query' column (query number) and a
        'y_true' column (relevance label of each retrieved row)
    :param query_number: Integer: the query to evaluate
    :param n: Integer: how many of the query's leading rows (in the order they
        appear in ``df``) to keep
    :return: Double: the precision of those n rows
    """
    rows_for_query = df[df['query'] == query_number]
    # Positional slice: keeps the first n rows of this query as ordered in df.
    leading_rows = rows_for_query[:n]
    return precision(leading_rows, True, query_number)
# map(df) == 2/3
def map(df):
    """
    Calculate the mean average precision (MAP) over all queries in the df.

    For each query, the precision is taken at the position of every row whose
    'y_true' equals 1 and those precisions are averaged; a query with no such
    row contributes 0. The result is the mean of these per-query values.

    NOTE: this function shadows the builtin ``map`` inside this module; the
    name is kept so existing callers keep working.

    :param df: DataFrame with at least a 'query' column (query number) and a
        'y_true' column (relevance label); rows of each query are taken in the
        order they appear in ``df``
    :return: Double: the mean average precision, or 0.0 when ``df`` holds no
        queries
    """
    average_precisions = []
    # Iterate over the actual groups so the result does not depend on query
    # numbers being exactly 1..N.
    for _, group in df.groupby('query'):
        relevance = group['y_true'].reset_index(drop=True)
        hit_positions = relevance.index[relevance == 1].tolist()
        if not hit_positions:
            average_precisions.append(0)
            continue
        # Precision at each relevant position = mean relevance of the rows up
        # to and including that position.
        precisions = [relevance.iloc[:pos + 1].mean() for pos in hit_positions]
        average_precisions.append(sum(precisions) / len(precisions))

    # No queries at all: return 0.0 instead of dividing by zero.
    if not average_precisions:
        return 0.0
    return sum(average_precisions) / len(average_precisions)