# -*- coding: utf-8 -*-
"""Spotify_skip_prediction.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/19OQT2RcMDknjt9H5ah6Omvd2PxUr56Y6
# **SPOTIFY SKIP PREDICTION**
team 9: Varshitha, Winnie, Claudia
## 1. Data loading & Importing GPU library
"""
# We need the AIcrowd CLI to interact with their datasets
!pip install -U aicrowd-cli
import os
# Get the AIcrowd API key (left blank here; paste your own key before running)
AICROWD_API_KEY = ""
# Downloading the tar file from aicrowd to colab notebook
!aicrowd login --api-key $AICROWD_API_KEY
!aicrowd dataset download --challenge spotify-sequential-skip-prediction-challenge '*Training_Set_And_Track_Features_Mini*'
# List of files present in the challenge dataset
!aicrowd dataset list -c spotify-sequential-skip-prediction-challenge
# Extract the files from the tar archive into the data folder
!tar -xvzf /content/16772e7f-7871-4d42-a44f-5f399f40fd94_training_set_track_features_mini.tar.gz
"""$$$$ Importing SVM GPU Library ThunderSVM
"""
!git clone https://github.com/Xtra-Computing/thundersvm.git
!cd thundersvm && mkdir build && cd build && cmake .. && make -j
!python /content/thundersvm/python/setup.py install
from importlib.machinery import SourceFileLoader
thundersvm = SourceFileLoader("thundersvm", "/content/thundersvm/python/thundersvm/thundersvm.py").load_module()
from thundersvm import SVC
"""## 2. Exploring the data
#### Statistical description of Data
---
"""
# Import statements
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# compute the statistical description of our two datasets,
# including describe(), number of null values, distinct values, and duplicates
def getDfSummary(input_data):
    output_data = input_data.describe().T
    del output_data['count']
    output_data.insert(0, 'number_nan', input_data.isnull().sum())
    output_data.insert(1, 'number_distinct', input_data.nunique(dropna=True))
    output_data.insert(2, 'duplicate', input_data.duplicated().any())
    return output_data
""">> user interaction session data"""
session_df = pd.read_csv("/content/data/training_set/log_mini.csv")
print(session_df.shape)
session_df.head()
session_df_summary = getDfSummary(session_df)
session_df_summary
# looking at the unique values in some categorical columns
print(session_df['context_type'].unique())
print(session_df['hist_user_behavior_reason_start'].unique())
print(session_df['hist_user_behavior_reason_end'].unique())
""">>
>> track features data
"""
track_df = pd.read_csv("/content/data/track_features/tf_mini.csv")
print(track_df.shape)
track_df.head()
track_df_summary = getDfSummary(track_df)
track_df_summary
""">> **Q: Are there any Tracks that are in session but not in tracks?**"""
# answer
tracks_in_session = session_df['track_id_clean'].unique()
tracks_in_session.sort()
print('number of unique songs in session data:', len(tracks_in_session))
total_tracks_info = track_df['track_id'].unique()
total_tracks_info.sort()
print('number of unique songs in track feature data:', len(total_tracks_info))
print('all songs in the session are included in tracks:', np.array_equal(tracks_in_session, total_tracks_info))
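# A quick check (not in the original notebook) backing the release-year claim
# below, assuming tf_mini.csv has a 'release_year' column as in the mini dataset.
print('release years span:', track_df['release_year'].min(), '-', track_df['release_year'].max())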
"""**Conclusion:**
1. By taking a look at null/missing values in the mini-dataset, there seems to be no null values. And there is no duplicated information(rows).
2. All the tracks in sessions have track information in track features dataset.
3. Tracks have information about songs released from 1950 to 2018.
>>
#### EDA before Data processing
---
**Q. Are there any features that are highly correlated?**
"""
# Prints only the most highly correlated feature pairs (>0.75 or <-0.75).
# Within each highly correlated pair, we will later remove one of the two features.
def high_corr_pairs(df):
    corr_plot = df.corr()
    all_pairs = set()
    for col in corr_plot.columns:
        for row in corr_plot.columns:
            if (corr_plot[row][col] > 0.75 or corr_plot[row][col] < -0.75) and (col != row) and (col, row) not in all_pairs:
                print(row + ", " + col + " = " + str(corr_plot[row][col]))
                all_pairs.add((row, col))
high_corr_pairs(session_df)
high_corr_pairs(track_df)
# plotting correlations (matplotlib and seaborn already imported above)
# user interaction sessions
plt.figure(figsize=(25, 10))
sns.heatmap(session_df.corr(), vmin=-1, vmax=1, annot=True, cmap="Greens")
# audio features
plt.figure(figsize=(25, 10))
sns.heatmap(track_df.corr(), vmin=-1, vmax=1, annot=True, cmap="Blues")
# Unique combinations of skip columns and their sizes
session_df.groupby(['skip_1','skip_2','skip_3','not_skipped']).size().reset_index()
# Unique combinations of pause columns and their sizes
session_df.groupby(['no_pause_before_play','short_pause_before_play','long_pause_before_play']).size().reset_index()
"""**Conclusion:**
1. All the skips columns are correlated in session dataframe and it makes sense because each skip is an indicator of when they skipped. So if one of them is 1, the others should be 0. It also makes sense that the columns regarding the pauses before playing the track be mutually exclusive for the same reason. Because each one is dependent on each other.
2. There seems to be high correlation within all pairs of 'beat_strength', 'bounciness', 'danceability' and 'dyn_range_mean'in the case of track features dataset.
#### Data pre-processing
---
1. Data sampling - Since the main dataset is huge, we will be using the mini-dataset, which was created by Spotify itself.
2. Data cleaning - The data seems to be mostly clean, with no null or missing values.
3. **Data integration** - We need to integrate the session dataframe with its respective tracks in order to train the model. There are a few things to consider:
- Make sure the order of the tracks does not get mixed up during merging. (No shuffling)
"""
# Integrating datasets
df = session_df.merge(track_df, how='left', left_on="track_id_clean", right_on="track_id")
print('Dataframe shape:', df.shape)
print('Are there any missing values in the dataframe:', df.isnull().any().any())
df.head()
"""4. **Feature Selection**
- remove redundant information like 'track_id_clean', since after integrating the datasets we already have a 'track_id' column.
- remove features that are highly correlated.
>> Creating Customer Target Variable - 'skipped'
"""
# add custom target variable: start from 'skip_2' (True - skipped, False - not skipped)
df['skipped'] = df['skip_2']
# if all four skip flags are False, the song was never played at all; mark it as skipped
df['skipped'] = np.where((df['skip_1'] | df['skip_2'] | df['skip_3'] | df['not_skipped']) == False, True, df['skip_2'])
df.groupby(['skip_1','skip_2','skip_3','not_skipped', 'skipped']).size().reset_index()
1. As seen above, a custom target variable 'skipped' was created from the 'skip_2' values. 'skip_2' conveys whether the song was skipped before one third of it was played. Row 0 of the table above, however, shows a combination where all four flags are False, meaning the song was never played at all; we assume those cases are skips and change only those rows to True.
2. We can also see that this target variable yields balanced classes.
3. All other skip columns are dropped.
"""
# Dropping redundant information
df = df.drop(columns=['skip_1','skip_2','skip_3','not_skipped','track_id_clean','date'])
df.shape
""">>
>> One-Hot Encoding
"""
# one-hot encode all the categorical elements
cols = []
for col in df.columns:
    if (df[col].dtypes == object or df[col].dtypes == bool) and (col not in ['session_id', 'track_id']):
        cols.append(col)  # all categorical and boolean columns
df = pd.get_dummies(df, columns=cols, drop_first=True)  # one-hot encode and drop the original columns
print(df.shape)
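# A quick check (not in the original notebook): after one-hot encoding, only
# numeric columns should remain apart from the two id columns.
print(df.drop(columns=['session_id', 'track_id']).dtypes.value_counts())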
""">>
Feature selection via a decision tree - the tree splits on entropy, so training it intrinsically gives us the feature importances of the columns.
"""
# Decision Tree and Feature importance
from sklearn.tree import DecisionTreeClassifier
num_data = df
num_data = num_data.drop('session_id', axis = 1)
num_data = num_data.drop('track_id', axis = 1)
dt = DecisionTreeClassifier(criterion="entropy")
df_pred = num_data.drop(['skipped_True'], axis=1)
df_target = num_data['skipped_True']
dt = dt.fit(df_pred, df_target)
# plot importances
train_importance = dt.feature_importances_
pred_var = list(df_pred.columns)
importance = pd.DataFrame({"Features":pred_var,"Importance":train_importance}).sort_values(by='Importance')
plt.figure(figsize=(12,20))
plt.barh(importance['Features'], importance['Importance'], color="#4c9191")
plt.ylabel("Features")
plt.xlabel("Importance")
plt.title("Feature Importances")
"""**Conclusion:**
1. Surprisingly, most of the top 5 features seem to be more about the user interaction behavior and session instead of track features.
2. The top most important feature is 'hist_user_behavior_reason_end_trackdone' talks about why the track is done playing. **The reason hist_user_behavior_reason_end_trackdone is the highest correlated is because there is a direct correlation of it with skipped variable. As in, if the track is done, it would mean the user didn't skip.** These variables would not be available during actual predictions. Therefore, all history_user_behaviour_end variables are removed.
"""
# removing the hist_user_behavior_reason_end variables
user_behavior_end_features = []
for new_col_name in num_data.columns:
    if new_col_name.startswith('hist_user_behavior_reason_end'):
        user_behavior_end_features.append(new_col_name)
num_data = num_data.drop(user_behavior_end_features, axis=1)
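# A quick check (not in the original notebook) that no reason_end columns remain.
assert not any(c.startswith('hist_user_behavior_reason_end') for c in num_data.columns)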
""">>
## 3. Model Selection and Feature selection
#### kFold Cross Validation for the C hyperparameter: Logistic Regression and SVM
"""
# using the first 5000 rows only since we have very limited RAM
cv_data = num_data.iloc[:5000]
# Dependencies
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
# use kFold CV to find the best value for the hyperparameter C
# (in scikit-learn, C is the inverse of the regularization strength)
def cv_kFold(dataset, cs, model):
    X = dataset.drop(['skipped_True'], axis=1).values
    y = dataset['skipped_True'].values
    # one list of per-fold accuracies for each C value
    accuracy = {}
    for c in cs:
        accuracy[c] = []
    kf = KFold(n_splits=10, random_state=None)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for c in cs:
            if model == "Logistic":
                logr = LogisticRegression(max_iter=3000, C=c)
                logr.fit(X_train, y_train)
                logr_pred = logr.predict(X_test)
                acc_c_k = metrics.accuracy_score(y_test, logr_pred)
            elif model == "SVM":
                svm = SVC(kernel='linear', C=c)
                svm.fit(X_train, y_train)
                svm_pred = svm.predict(X_test)
                acc_c_k = metrics.accuracy_score(y_test, svm_pred)
            accuracy[c].append(acc_c_k)
    return accuracy
# sequence of C values
cs = [0.001, 0.01, 0.1, 0.5, 1, 2, 4, 10, 20, 40, 100]
# generate per-fold accuracies for each C value
logr_accs = cv_kFold(cv_data, cs, "Logistic")
svm_accs = cv_kFold(cv_data, cs, "SVM")
from scipy.stats import sem
means = []
stderrs = []
model_accs = {"Logistic": logr_accs, "SVM": svm_accs}
for model_name, accs in model_accs.items():
    print(model_name)
    for c in accs.keys():
        means.append(np.mean(accs[c]))
        stderrs.append(sem(accs[c]))
        print('c:', c, ', mean:', np.mean(accs[c]), ', standard error:', sem(accs[c]))
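# A minimal visualization sketch (not in the original notebook): mean CV
# accuracy with standard-error bars for each C value, one curve per model,
# to support the discussion below. Assumes model_accs from the cell above.
plt.figure(figsize=(8, 5))
for model_name, accs in model_accs.items():
    cs_sorted = sorted(accs.keys())
    plt.errorbar(cs_sorted,
                 [np.mean(accs[c]) for c in cs_sorted],
                 yerr=[sem(accs[c]) for c in cs_sorted],
                 marker='o', capsize=3, label=model_name)
plt.xscale('log')
plt.xlabel('C (log scale)')
plt.ylabel('mean 10-fold CV accuracy')
plt.legend()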
"""**Logistic Regression:**
We can see that c=0.1 and c=40 had the highest mean accuracy, with very similar standard error value. In the case of logistic regression, a high C value of 40 puts a lot of emphasis on the training data and has a very low regularization strength. But since we are only using a small sample of the data for cross validation, the training data here might not be too representative of the entire dataset. Therefore, it might be better to usue c=0.1 to give more weight to this complexity penalty, less to the training data, when it comes to classifying the entire dataset.
**SVM:**
In the case of Support Veector Machines, the accuracy is highest when C = 0.01. In fact, models with C values >= 0.5 do exceptionally worse, at an average of 0.65 accuracy. This suggests that when regularization is smaller and training data is weighted greater, the models overfit the training data and do not do as well.
>>
#### Testing different models on different feature selections
>> We are testing the accuracy of the models based on the feature selection. There are five different feature selections that are being done for comparison.
"""
# Different Feature selectors
"""
The following are the features we want to remove from our dataset before training.
For reusability of the code and data, we select columns instead of dropping
them.
"""
"""
Features selected based on pairwise correlations in the EDA section.
When several columns are highly correlated, only one of them is kept
while the others are removed.
"""
list_of_features_1 = ['short_pause_before_play', 'long_pause_before_play', 'acousticness', 'organism', 'bounciness', 'beat_strength', 'dyn_range_mean']
"""
The top features from the decision tree feature importances are selected.
"""
# Sorting list based on feature importance in ascending order
sorted_asc_col = [x for _,x in sorted(zip(dt.feature_importances_,num_data.drop(columns = ['skipped_True']).columns))]
# Bottom 50% important features
list_of_features_2 = sorted_asc_col[:len(sorted_asc_col)//2]
# Bottom 30% important features
list_of_features_3 = sorted_asc_col[:int(len(sorted_asc_col) * 0.3)]
# User behaviour features
list_of_features_4 = []
for col_name in session_df.columns:
    for new_col_name in num_data.columns:
        if new_col_name.startswith(col_name):
            list_of_features_4.append(new_col_name)
# Track features
list_of_features_5 = []
for col_name in track_df.columns:
    for new_col_name in num_data.columns:
        if new_col_name.startswith(col_name):
            list_of_features_5.append(new_col_name)
# Adding 'skipped_True' to each list so the target is also removed from the training data
list_of_features_1.append('skipped_True')
list_of_features_2.append('skipped_True')
list_of_features_3.append('skipped_True')
list_of_features_4.append('skipped_True')
list_of_features_5.append('skipped_True')
# Train-Test Split
from sklearn.model_selection import train_test_split
def get_train_test_split(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test
# Dependencies
import keras
from keras.models import Sequential
from keras.layers import Dense,SimpleRNN
import tensorflow as tf
# Neural network - define the keras model
# Note: the 'CNN' label is kept from the original notebook, but the model is a
# plain feedforward (dense) network.
def construct_model(X_train, model_type='CNN'):
    if model_type == 'CNN':
        tf.random.set_seed(124)
        model = Sequential()
        model.add(Dense(20, input_shape=(X_train.shape[1],), activation='relu'))
        model.add(Dense(12, activation='relu'))
        model.add(Dense(8, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
    elif model_type == 'LogReg':  # Logistic Regression
        model = LogisticRegression(C=0.1, max_iter=3000)
    elif model_type == 'SVM':
        model = SVC(kernel='linear', C=0.01)
    return model
# Training and testing with different feature selections.
# By default it removes only the target column.
def train_test_model_different_config(list_of_features_to_remove=['skipped_True'], model_type='CNN'):
    X_train, X_test, y_train, y_test = get_train_test_split(num_data.loc[:, ~num_data.columns.isin(list_of_features_to_remove)], num_data['skipped_True'])
    model = construct_model(X_train, model_type)
    if model_type == 'CNN':
        # compile the keras model
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        history = model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.2)
        # evaluate the keras model
        _, accuracy = model.evaluate(X_test, y_test)
    elif model_type == 'LogReg':  # Logistic Regression
        model.fit(X_train, y_train)
        logr_pred_test = model.predict(X_test)
        accuracy = metrics.accuracy_score(y_test, logr_pred_test)
    elif model_type == "SVM":
        model.fit(X_train, y_train)
        svm_pred = model.predict(X_test)
        accuracy = metrics.accuracy_score(y_test, svm_pred)
    print('Test Accuracy: %.2f' % (accuracy * 100))
    return accuracy * 100
"""The below code tests different feature selections and models. Therefore, it'll take a lot of time for their runtime ~ 15 - 20 mins."""
# Testing different feature selections and models
list_of_features = [list_of_features_1,list_of_features_2,list_of_features_3,list_of_features_4,list_of_features_5]
list_of_names = ['Remove Pairwise Highly correlated', 'Remove Bottom 50% important', 'Remove Bottom 30% important', 'Remove User behaviour features', 'Remove Track features']
result_dict = dict()
# With no feature selection
test_acc_1 = train_test_model_different_config()
test_acc_2 = train_test_model_different_config(model_type='LogReg')
test_acc_3 = train_test_model_different_config(model_type = "SVM")
result_dict['No feature selection'] = [test_acc_1,test_acc_2, test_acc_3]
# With feature selection
for i in range(len(list_of_features)):
    test_acc_1 = train_test_model_different_config(list_of_features[i])
    test_acc_2 = train_test_model_different_config(list_of_features[i], 'LogReg')
    test_acc_3 = train_test_model_different_config(list_of_features[i], 'SVM')
    result_dict[list_of_names[i]] = [test_acc_1, test_acc_2, test_acc_3]
# Features removed for each set
print(list_of_features_1)
print(list_of_features_2)
print(list_of_features_3)
print(list_of_features_4)
print(list_of_features_5)
# Visualizing the Results from different configurations
results = pd.DataFrame(data=result_dict, index=['CNN', 'Logistic Regression', 'SVM']).T
results
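# A minimal visualization sketch (not in the original notebook): a grouped bar
# chart of the test accuracies above for easier visual comparison.
results.plot(kind='bar', figsize=(12, 6), rot=30)
plt.ylabel('Test accuracy (%)')
plt.legend(title='Model')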
""">>
#### Plotting the ROC-AUC curves
"""
# plotting the ROC-AUC curve
# code taken from homework 5
from sklearn.metrics import roc_auc_score, roc_curve, auc
def getAUC(truth, pred):
    fpr, tpr, thresholds = roc_curve(truth, pred)
    return auc(fpr, tpr)
def plotAUC(truth, pred, lab, title):
    fpr, tpr, thresholds = roc_curve(truth, pred)
    roc_auc = auc(fpr, tpr)
    c = (np.random.rand(), np.random.rand(), np.random.rand())
    plt.plot(fpr, tpr, color=c, label=lab + ' (AUC = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title(title)
    plt.legend(loc="lower right")
def AUC_plot(list_of_features_to_remove=['skipped_True'], name='', model_type="LogReg"):
    X_train, X_test, y_train, y_test = get_train_test_split(num_data.loc[:, ~num_data.columns.isin(list_of_features_to_remove)], num_data['skipped_True'])
    model = construct_model(X_train, model_type)
    if model_type == 'LogReg':  # Logistic Regression
        model.fit(X_train, y_train)
        logr_pred = model.predict_proba(X_test)[:, 1]
        roc_auc = getAUC(y_test, logr_pred)
        plotAUC(y_test, logr_pred, name, 'Logistic Regression ROC-AUC Curve')
    elif model_type == 'SVM':
        model.fit(X_train, y_train)
        svm_pred = model.decision_function(X_test)
        roc_auc = getAUC(y_test, svm_pred)
        plotAUC(y_test, svm_pred, name, "SVM ROC-AUC Curve")
    return roc_auc
# Logistic Regression
# With no feature selection
test_auc_1 = AUC_plot(name ='No feature selection', model_type='LogReg')
# With feature selection for LogReg
for i in range(len(list_of_features)):
    test_auc_1 = AUC_plot(list_of_features[i], list_of_names[i], 'LogReg')
#SVM
# With no feature selection
test_auc_2 = AUC_plot(name = "No feature selection", model_type = "SVM")
# With feature selection for SVM
for i in range(len(list_of_features)):
    test_auc_2 = AUC_plot(list_of_features[i], list_of_names[i], 'SVM')
"""For logistic regression and SVM, the feature selection that resulted in the highest AUC score is with removing only the track features.
>>
#### RNN: Proposed method
>> **Restructuring of data -** In order to give the input to the RNN, the data is restructured in such a way that one row in the input would contain track features of the entire session (20 tracks). The output would be the skipped variable for all 20 tracks. Since, some sessions do not have 20 tracks, they are padded with zeros.
"""
def restructure_data(new_df, list_of_features_to_remove):
    X = []
    y = []
    curr_row = 0
    while curr_row < new_df.shape[0]:  # iterate over the rows, one session at a time
        X_temp = []
        y_temp = []
        sess_len = new_df.iloc[curr_row]['session_length']  # session length
        sess_end_row = curr_row + sess_len - 1
        # check that the whole block belongs to the same session id
        if new_df.iloc[curr_row]['session_id'] == new_df.iloc[sess_end_row]['session_id']:
            for i in range(curr_row, sess_end_row + 1):
                X_temp.append(list(new_df.loc[i, ~new_df.columns.isin(['session_id', 'track_id', 'skipped_True'] + list_of_features_to_remove)]))
            y_temp = list(new_df.iloc[curr_row:sess_end_row + 1]['skipped_True'])
            if sess_len != 20:  # pad shorter sessions with zeros
                for i in range(0, 20 - sess_len):
                    X_temp.append([0] * len(X_temp[0]))
                    y_temp.append(0)
            X.append(X_temp)
            y.append(y_temp)
        curr_row += sess_len
    # Train test split
    X_train, X_test, y_train, y_test = get_train_test_split(np.array(X), np.array(y))
    return X_train, X_test, y_train, y_test
# Redefine construct_model for the RNN (this overrides the earlier definition)
def construct_model(X_train):
    tf.random.set_seed(1234)
    model = Sequential()
    model.add(Dense(20, input_shape=(X_train.shape[1], X_train.shape[2]), activation='relu'))
    model.add(SimpleRNN(units=20, return_sequences=True, activation=tf.nn.relu))
    model.add(Dense(12, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    return model
def train_RNN(df, list_of_features_to_remove):
    X_train, X_test, y_train, y_test = restructure_data(df, list_of_features_to_remove)
    model = construct_model(X_train)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    history = model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.2)
    # evaluate the keras model
    _, accuracy = model.evaluate(X_test, y_test)
    # returns test accuracy
    return accuracy
# Need the data with session id to combine them into one row
new_df = df
# Removing the history end variables as explained previously
list_of_hist_end = ['hist_user_behavior_reason_end_clickrow',
'hist_user_behavior_reason_end_endplay',
'hist_user_behavior_reason_end_fwdbtn',
'hist_user_behavior_reason_end_logout',
'hist_user_behavior_reason_end_remote',
'hist_user_behavior_reason_end_trackdone']
new_df = new_df.drop(list_of_hist_end, axis = 1)
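# A quick sanity check (not in the original notebook): each input should be one
# session of 20 (padded) tracks, and each label row the 20 matching skip flags.
X_tr, X_te, y_tr, y_te = restructure_data(new_df, [])
print(X_tr.shape)  # expected: (n_train_sessions, 20, n_features)
print(y_tr.shape)  # expected: (n_train_sessions, 20)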
result_dict = dict()
# With no feature selection (note: use new_df, which has the reason_end columns dropped)
test_acc_1 = train_RNN(new_df, [])
result_dict['No feature selection'] = [test_acc_1]
# With feature selection
for i in range(len(list_of_features)):
    test_acc_1 = train_RNN(new_df, list_of_features[i])
    result_dict[list_of_names[i]] = [test_acc_1]
# Visualizing the Results from different configurations
results = pd.DataFrame(data=result_dict, index=['RNN']).T
results
"""**Conclusion and Discussion**
1. Overall, RNN seems to be performing (test accuracy) better than any other model. It could be because each session belongs to the same user and hence keeping 20 session features as one input could predict the skipability better.
2. While trying to train the RNN, it was observed that random seed needed to be fixed to observe consistent results.
3. Overall, removing track features was giving the highest accuracy for all models. This made us come to conclusion that Spotify needs to focus more on user interaction to predict the skipability.
"""