algs.py
import pandas as pd
import scipy.sparse as sp
import sklearn.preprocessing as pp
from math import exp
from heapq import heappush, heappop


# conventional i2i
class CosineSimilarity:
    # expects a DataFrame loaded from ratings.csv
    def __init__(self, df, limit=20):
        self.limit = limit
        # no need for the timestamp here
        df = df.drop(labels='timestamp', axis=1)
        # find the mean rating of each movie,
        df_mean = df.groupby(['movieId'], sort=False).mean().rename(columns={'rating': 'mean'})
        # join the mean values to the original DataFrame
        df = df.join(df_mean, on='movieId', sort=False)
        # and subtract the mean from each rating,
        # so that a rating of 0 becomes neutral
        df['rating'] = df['rating'] - df['mean']
        # now pivot the DataFrame so that it becomes a feature/document matrix,
        # and fill all NaNs (where a user hasn't rated a movie) with zeros,
        # which is legal now that ratings are mean-centered
        df = df.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)
        # movies with only equal ratings (i.e. all 4.0) have zero variance after
        # centering, so we can't really recommend anything for them; drop them
        df = df.loc[:, (df != 0).any(axis=0)]
        # here we go: turn the DataFrame into a sparse matrix and normalize the ratings,
        cnm = pp.normalize(sp.csr_matrix(df.values), axis=0)
        # then calculate the recommendations and turn the sparse matrix back into
        # a DataFrame with movieId index, movieId columns, and values representing
        # the relevance of movie A to movie B
        self.recs = pd.DataFrame((cnm.T * cnm).todense(), columns=df.columns, index=df.columns)
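        # self.recs is now a square movieId-by-movieId DataFrame:
        # self.recs.loc[a, b] holds the cosine similarity of movies a and b
        # (with 1.0 on the diagonal, since every movie matches itself perfectly)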

    # retrieves up to "limit" recommendations for a given movie_id
    # out of the precalculated DataFrame
    def recommend(self, movie_id):
        if movie_id not in self.recs.index.values:
            return pd.DataFrame([], columns=['movieId']).set_index('movieId')
        r = self.recs[movie_id].sort_values(ascending=False)
        # recommending a movie to itself is way too obvious
        r = r.drop(labels=movie_id)
        r = r.to_frame().rename(columns={movie_id: 'score'})
        return r.head(n=self.limit)
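

# A minimal usage sketch; loading with a (userId, movieId) index is an
# assumption here (it is what ShortestPath below expects), and movieId 1
# is just an arbitrary probe:
#
#   df = pd.read_csv('ratings.csv', index_col=['userId', 'movieId'])
#   cs = CosineSimilarity(df, limit=10)
#   cs.recommend(1)  # -> a DataFrame of up to 10 movieIds with a 'score' column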


# now, the new way of getting i2i recommendations
class ShortestPath:
    # expects a DataFrame loaded from ratings.csv,
    # indexed by (userId, movieId)
    def __init__(self, df, limit=20):
        self.limit = limit
        # the order of events is crucial here,
        df = df.sort_values(by=['userId', 'timestamp'])
        # once sorted, we can remove the timestamp (saves a couple of bytes)
        df = df.drop(labels='timestamp', axis=1)
        # mids stands for movie IDs (I'm yet another lazy coder)
        self.mids = df.index.get_level_values('movieId').unique()
        df = df.reset_index(level='movieId')
        # al is the adjacency list, by the way
        al = {}
        for uid in df.index.get_level_values('userId').unique():
            # let's examine each user's path
            path = df.loc[uid]
            if isinstance(path, pd.DataFrame):
                # otherwise the user made only one rating, which is not quite helpful here
                for m1, m2 in zip(path[:-1].itertuples(), path[1:].itertuples()):
                    # for each pair of a rated movie and the next rated movie,
                    al.setdefault(m1.movieId, {}).setdefault(m2.movieId, 0)
                    # we take the user's word on how similar they are:
                    # the closer the two ratings, the higher the similarity
                    al[m1.movieId][m2.movieId] += 1 / exp(abs(m1.rating - m2.rating)) - 0.5
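                    # a worked example of this weight (values approximate):
                    #   equal ratings      -> 1/exp(0.0) - 0.5 =  0.5
                    #   ratings 1.5 apart  -> 1/exp(1.5) - 0.5 ~ -0.277
                    #   ratings 4.5 apart  -> 1/exp(4.5) - 0.5 ~ -0.489
                    # so agreeing users strengthen an edge, disagreeing users weaken it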
        for mid in al:
            # make a list out of each movie's adjacency dict, so that the
            # adjacency list becomes a list indeed; along the way, negate the
            # summed-up similarity, so that the higher the similarity,
            # the shorter the edge in the movie graph
            al[mid] = list(map(lambda kv: (kv[0], -kv[1]), al[mid].items()))
            # yes, the list is required to be sorted from closest to farthest
            al[mid].sort(key=lambda r: r[1])
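        # at this point al maps every movieId to its outgoing edges, sorted
        # shortest-first; roughly this shape (numbers purely illustrative):
        #   al = {1: [(50, -7.3), (32, -2.1), ...], 50: [...], ...}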
        res = {}
        for mid in al:
            # you still here? sweet
            # what we have here is a BFS with a priority queue;
            # I always thought its name was Dijkstra's algorithm,
            # although lately I realized that Dijkstra's used to be
            # a little more naive. What should I do?
            # r stands for Results
            r = {}
            # e stands for Elements in the queue
            e = {}
            # q stands for Queue (sincerely, C.O.)
            q = []
            # d stands for Depth of search
            # (well, actually, there's no depth in breadth-first search,
            # it's just the number of nodes we're willing to visit)
            d = limit + 1
            # starting from the originator itself
            e[mid] = [0, mid]
            heappush(q, e[mid])
            while q:
                # while there are vertices in the queue,
                v = heappop(q)
                # skip dummies (-1 is explained below) and vertices
                # that have no outgoing edges
                if v[1] == -1 or v[1] not in al:
                    continue
                d -= 1
                # and stop once the required depth is reached:
                if d < 0:
                    break
                # we consider the current vertex the next relevant recommendation
                r[v[1]] = v[0]
                # and we have to fill the queue with
                # the other adjacent vertices
                for av in al[v[1]]:
                    if av[0] in r:
                        # this one we've already seen
                        continue
                    # we're getting farther from the originator
                    alt = v[0] + av[1]
                    if av[0] in e:
                        # but what if the next adjacent vertex is already queued?
                        if alt < e[av[0]][0]:
                            # well, if we found a shorter path, let's prioritize
                            # this vertex higher in the queue
                            ii = e[av[0]][1]
                            # here we go: -1 is a dummy distance value for a vertex
                            # that has been moved in the queue; one doesn't simply
                            # remove a node from a heapified list, if you know what I mean
                            e[av[0]][1] = -1
                            # so we enqueue a fresh entry instead
                            e[av[0]] = [alt, ii]
                            heappush(q, e[av[0]])
                    else:
                        # otherwise, just put it in the queue
                        e[av[0]] = [alt, av[0]]
                        heappush(q, e[av[0]])
            # of course, recommending a movie to itself is way too obvious
            del r[mid]
            # listify and sort the other recommendations, shortest path
            # (i.e. highest accumulated similarity) first
            res[mid] = list(r.items())
            res[mid].sort(key=lambda r: r[1])
        # save the results
        self.recs = res
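        # self.recs now maps each movieId to a ready-made, already-limited
        # list of (movieId, path length) pairs, most relevant first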

    # returns all recommendations for a given movie_id;
    # the trick is that "limit" has already been applied during calculation,
    # and the higher the "limit", the longer the calculation takes (linearly),
    # so there's no magical overtaking of fancy scipy sparse matrices
    # by a pure-python algorithm
    def recommend(self, movie_id):
        if movie_id not in self.recs:
            return pd.DataFrame([], columns=['movieId']).set_index('movieId')
        r = pd.DataFrame(self.recs[movie_id], columns=['movieId', 'score'])
        return r.set_index('movieId')
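

# A minimal smoke test, a sketch only: the file name, the (userId, movieId)
# index layout, and the probed movieId 1 are assumptions for illustration.
if __name__ == '__main__':
    ratings = pd.read_csv('ratings.csv', index_col=['userId', 'movieId'])
    print(CosineSimilarity(ratings).recommend(1))
    print(ShortestPath(ratings).recommend(1))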