# machine_prediction_interactive.py
from library import database, classification, my_time, normilisation, feature_helper
import json
import datetime
import sys
'''
purpose: test to see if a random forest can make occupancy predictions
features to put into the random forest:
day of week, month, week number, city demand level/duration of demand, listing_cluster number, enquiry #, cancellation #, (#days in advance)
price? Transformed???? (price / location average)
#instead of listing_cluster, can we also just put the listing_cluster data straight into the random forest?
#going to have to first test just with Barcelona, 2015
ALSO keep in mind that all data from the jsons are strings.
'''
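#Illustrative only: with the default feature_data_space below, one day's feature
#vector is, in order:
#  [weekday_number, week_number, quarter_in_year, day_number, month_number, listing_cluster, days_active]
#paired with a 0/1 classification (empty/occupied)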
json_files = {"day_features": {}, "day_features_normalised": {}, "occupancy_dict": {}}
#general data
testing_listings = []
all_data = {}
#listing_id: {"created_at", "updated_at", "deleted_at", "active"}
listing_important_dates = {}
#dates + some market intelligence
#feature_data_space = [{"listing_cluster": False}, {"days_active": False}, {"day_features": "day_number"}, {"day_features": "weekday_number"}, {"day_features": "week_number"}, {"day_features": "quarter_in_year"}]
#just dates feature_space
#feature_data_space = [{"day_features_normalised": "weekday_number"}, {"day_features_normalised": "week_number"}, {"day_features_normalised": "quarter_in_year"}]
feature_data_space = [{"day_features": "weekday_number"}, {"day_features": "week_number"}, {"day_features": "quarter_in_year"}, {"day_features": "day_number"}, {"day_features": "month_number"}, {"listing_cluster": True}, {"days_active": True}]
point_of_view = None
training_history = None
#training and testing data
current_location = None
listing_cluster_normalisation = {}
#need to get listing data where there's at least one year of data. Just using Barcelona 2014 as default. Predict into 2015.
def get_testing_listings(list_of_location_ids = [1]):
#return listings that have at least one year of occupancy data
    #and restricted to Barcelona
global testing_listings, json_files
testing_listings = []
thesis_data = database.database("Thesis")
query_entries = ""
for x in list_of_location_ids:
query_entries += str(x) + ","
    #pop off the final comma
    query_entries = query_entries[:-1]
query = "SELECT `listing_locations_DBSCAN_final`.`listing_id`, `listing_clusters_plain`.`cluster_id`, `listing_locations_DBSCAN_final`.`label_id` FROM `listing_locations_DBSCAN_final` INNER JOIN `listing_clusters_plain` ON `listing_locations_DBSCAN_final`.`listing_id` = `listing_clusters_plain`.`listing_id` WHERE `label_id` IN(" + query_entries + ");"
initial_data = thesis_data.get_data(query)
for listing_data in initial_data:
try:
sample = json_files["occupancy_dict"][str(listing_data[0])]
testing_listings.append(listing_data)
except KeyError:
pass
thesis_data.destroy_connection()
def fill_global_data(inclusive = True):
global listing_important_dates, json_files, feature_data_space, listing_cluster_normalisation
#json_files loading: occupancy_dict, date_dict
for filename in json_files.keys():
json_files[filename] = _load_json(filename, inclusive)
#earliest dates
worldhomes_data = database.database("worldhomes")
listing_important_dates_list = worldhomes_data.get_data("SELECT `id`, `created_at`, `updated_at`, `deleted_at`, `active` FROM `listings`")
listing_important_dates = {entry[0]: {"created_at": entry[1], "updated_at": entry[2], "deleted_at": entry[3], "active": entry[4]} for entry in listing_important_dates_list}
worldhomes_data.destroy_connection()
#listing_cluster_normalisation, to be made automatic so we don't have to deal with cluster-changing issues.
'''
all dates need to be in dictionary format with:
end_date
start_date
This is done per listing.
By default, take all the data.
'''
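#Example date specs (the same ranges the experiments below use):
#training_dates = {"start_date": datetime.date(2014, 1, 1), "end_date": datetime.date(2015, 1, 1)}
#testing_dates = {"start_date": datetime.date(2015, 1, 1), "end_date": datetime.date(2016, 1, 1)}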
def fill_training_and_testing_data(listing_id, testing_dates, training_dates = None):
global all_data
'''
if listing_ids:
training_with = listing_ids
else:
training_with = [entry[0] for entry in testing_listings]
'''
    training_data = {"features": [], "classification": []}
    testing_data = {"features": [], "classification": [], "days": []}
#does listing have occupancy data to begin with
try:
occupancy_test = json_files["occupancy_dict"][str(listing_id)]
if training_dates:
#force it to take only listings that have at least started by the start date
start = training_dates["start_date"]
            #disallows the majority of the dataset
'''
if listing_important_dates[listing_id]["created_at"].date() > start:
print "not enough training data for listing: ", listing_id
#can put in training?
return None
'''
else:
            start = datetime.date(2008, 1, 1)
for day in my_time._daterange(start, testing_dates["end_date"]):
try:
if day < testing_dates["start_date"]:
training_data["features"].append(all_data[listing_id][day]["feature_data"])
training_data["classification"].append(all_data[listing_id][day]["classification"])
else:
testing_data["features"].append(all_data[listing_id][day]["feature_data"])
testing_data["classification"].append(all_data[listing_id][day]["classification"])
testing_data["days"].append(day.strftime("%Y-%m-%d"))
#print "putting data in testing_data"
except KeyError as e: #this day didn't have data, but listing might have data
#shouldn't be an issue anymore because we deleted the day structure if it has no data in all_data
pass
    except KeyError as e:
        #listing has no occupancy data at all
        print e
        print "no occupancy data for listing:", listing_id
        return None
return (training_data, testing_data)
'''
all_data holds:
listing_id: day: {feature_data: [feature_space], classification: 0/1}
automatically filters out listings that don't have full data
normalised says whether or not to normalise all the features to between 0 and 1
'''
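#Sketch of one all_data entry (listing id hypothetical):
#all_data[12345] = {
#    datetime.date(2014, 1, 1): {"feature_data": [...], "classification": 1},
#    ...
#}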
def fill_all_data(start_date, end_date, normalised = True):
global all_data, testing_listings, json_files, point_of_view
#clear
all_data = {}
for listing_data in testing_listings:
#setup basic structure
listing_id = listing_data[0]
#if earliest_dates[listing_id] <= start_date:
all_data[listing_id] = my_time.compact_default_date_structure(start_date, end_date)
for day in all_data[listing_id].keys():
#this is a valid, active day for the listing, can be 0 or 1
if json_files["occupancy_dict"][str(listing_id)][day.strftime("%Y")][day.strftime("%Y-%m-%d")] is not None:
#day is datetime object
if point_of_view:
all_data[listing_id][day] = {"feature_data" : _get_feature_data_with_pointofview(listing_data, day, normalised), "classification" : json_files["occupancy_dict"][str(listing_id)][day.strftime("%Y")][day.strftime("%Y-%m-%d")]}
else:
feature_data = _get_feature_data(listing_data, day, normalised)
                    if feature_data is not False:
                        all_data[listing_id][day] = {"feature_data": feature_data, "classification": json_files["occupancy_dict"][str(listing_id)][day.strftime("%Y")][day.strftime("%Y-%m-%d")]}
else:
print "deleting ", listing_id, "from all_data"
del all_data[listing_id]
break
else:
#there is no occupancy data for this day
del all_data[listing_id][day]
#else:
# print "this listing_id doens't have enough data %s " % (listing_id)
# pass
#listing_cluster, day of month, day of week, week number,historical demand level_for_day (default to kmeans = 3),
##days active, months active, #of this type of demand level active
#day is a datetime
#listing_data: id, listing_cluster, location_cluster
def _get_feature_data(listing_data, day, normalised):
global feature_data_space, listing_important_dates, json_files, current_location
final = []
for this_dict in feature_data_space:
for dict_name, specification in this_dict.iteritems():
#load json if it doesn't exist in json_files:
if dict_name in ["price_dict", "k-means_season_clusters", 'cluster_averages', 'cluster_averages_year'] and dict_name not in json_files.keys():
with open("data/" + dict_name + ".json") as jsonFile:
json_files[dict_name] = json.load(jsonFile)
#check if there's a point of view requirement
global point_of_view
try:
view_date = day + datetime.timedelta(point_of_view)
except TypeError: #point of view is still none
view_date = day
#the real stuff of feature addition
if dict_name in ["listing_cluster", "days_active", "day_features", "day_features_normalised"]:
if dict_name == "listing_cluster":
if normalised:
#FIX THIS
normalised_listing_cluster = {1:float(1)/11, 2:float(2)/11, 3:float(3)/11, 4: float(4)/11, 5: float(5)/11, 7: float(6)/11, 13: float(7)/11, 14: float(8)/11, 15: float(9)/11, 26: float(10)/11, 27: float(11)/11}
final.append(normalised_listing_cluster[listing_data[1]])
else:
final.append(listing_data[1])
elif dict_name == "days_active":
days_diff = int( (day - listing_important_dates[listing_data[0]]["created_at"].date()).days)
if normalised:
possible_normalised = my_time.get_listing_activity_span(listing_important_dates[listing_data[0]])
if days_diff:
days_diff = possible_normalised
else:
#last confirmed dates of known occupancies
days_diff = float(days_diff)/(int((datetime.date(2016, 1, 29) - day).days ) )
else:
final.append(days_diff)
elif dict_name in ["day_features", "day_features_normalised"]:
final.append(json_files[dict_name][day.strftime("%Y-%m-%d")][specification])
elif dict_name in ["k-means_season_clusters", "cluster_averages", 'cluster_averages_year']:
#specification in this case is just going to be the amount of data around the week to include around the view_date, which is a year ago.
#k-Means_cluster
#year: {location_id: k_means_type: day: cluster_designation}
#
#cluster_averages_year
#full_year structure is by year for the primary key, then location_cluster, then listing_cluster: day: average
#
#for now just take the 3 clusters
#
#HAVE TO USE CURRENT YEAR CLUSTERS
#last_year = datetime.date((day.year - 1), day.month, day.day)
if specification:
if specification == "one_week":
#reverse input for the week. Shouldn't be a problem
                        for x in range(0, 7):
                            this_day = view_date + datetime.timedelta(days=x)
if dict_name == "k-means_season_clusters":
#need to cast to int
final.append(int (json_files[dict_name][this_day.strftime("%Y")][str(current_location)]["3"][this_day.strftime("%Y-%m-%d")]) )
else:
#append all averages
                                for listing_cluster in json_files[dict_name][this_day.strftime("%Y")][str(current_location)].keys():
                                    final.append(int(json_files[dict_name][this_day.strftime("%Y")][str(current_location)][listing_cluster][this_day.strftime("%Y-%m-%d")]))
else:
#HAVE TO USE THIS YEAR'S DATE
if dict_name == "k-means_season_clusters":
to_add = int(json_files[dict_name][view_date.strftime("%Y")][str(current_location)]["3"][view_date.strftime("%Y-%m-%d")])
final.append(to_add)
else:
                            for listing_cluster in json_files[dict_name][view_date.strftime("%Y")][str(current_location)].keys():
                                final.append(json_files[dict_name][view_date.strftime("%Y")][str(current_location)][listing_cluster][view_date.strftime("%Y-%m-%d")])
elif not specification and dict_name not in ["listing_cluster", "days_active"]: #for dicts that just have dates
try:
to_add = json_files[dict_name][str(listing_data[0])][view_date.strftime("%Y")][view_date.strftime("%Y-%m-%d")]
except KeyError:
#delete this entry from consideration
#hopefully only returns false when the listing id is not in the dictionary we are looking at (looking at you Price_dict)
if dict_name == "price_dict":
return False
else:
#for other dictionaries, it just means that this listing has none of these types of data
to_add = 0
#for enquiries: item = {"id": value}
if isinstance(to_add, list):
for item in to_add:
if item is None:
final.append(0)
elif isinstance(item, dict):
final.append(item.values()[0]) #should only be one value
else:
final.append(item)
else:
#make none's 0 for learning algorithms
if to_add is None:
final.append(0)
elif isinstance(to_add, dict):
final.append(to_add.values()[0])
else:
final.append(to_add)
return final
#else:
#this is a day specific entry, need to check if there's a "time ahead"
#time_ahead will tell us to add known data known the number of days in time_ahead ahead of the current day.
#if "time_ahead" in feature_data_space.keys():
'''
The features that we can put in point of view are: #of enquiries known for that day, #of cancellations for that day, %booking for the day that we are looking at.
'''
def _get_feature_data_with_pointofview(listing_data, day, normalised):
global feature_data_space, listing_important_dates, json_files
pass
def _load_json(filename, inclusive):
if inclusive and filename in ["k-means_season_clusters", "cluster_averages", "occupancy_dict", "cluster_averages_year"]:
filepath = "data/inclusive_occupancy_jsons/" + filename + ".json"
else:
filepath = "data/" + filename + ".json"
#there is a json file for this
with open(filepath) as jsonFile:
this_json = json.load(jsonFile)
'''
elif filename == "day_features": #need to pull this data from database
thesis_data = database.database("Thesis")
#week_number, day_number, month_number, weekday, year, null, null
day_features_list = thesis_data.get_data("SELECT * FROM `date_to_day`")
this_json = {datetime.date(entry[4], entry[2], entry[1]).strftime("%Y-%m-%d"): {"weekday_number": float(entry[3] + 1)/7, "day_number": float(entry[1])/7, "week_number": float(entry[0])/5, "month_number": float(entry[2])/12, "quarter_in_year": float((entry[2]-1)/3 + 1)/4, "year": float(entry[4] - 2008)/(2016 - 2008)} for entry in day_features_list}
'''
return this_json
'''
time ahead up to one year
time_ahead should be passed as an INT for days
new_feature_specifications must be a dictionary (or a list of them) with:
key: json file name
value: special specification, if any
'''
def add_feature_to_feature_space(feature_dict, inclusive = True):
#if there is a time_ahead, then
global json_files, feature_data_space, point_of_view
    if isinstance(feature_dict, list):
        for ind_feature_dict in feature_dict:
            feature_data_space.append(ind_feature_dict)
            #load json
            dict_filename = ind_feature_dict.keys()[0]
            if dict_filename not in json_files.keys():
                json_files[dict_filename] = _load_json(dict_filename, inclusive)
    elif isinstance(feature_dict, dict):
        feature_data_space.append(feature_dict)
        #load json
        dict_filename = feature_dict.keys()[0]
        if dict_filename not in json_files.keys():
            json_files[dict_filename] = _load_json(dict_filename, inclusive)
#load feature_dict if necessary
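#Example call (mirrors the price experiments below; "price_dict" is one of the known json names):
#add_feature_to_feature_space([{"price_dict": None}])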
'''
Classification part of code
'''
def make_classification_model(model_name, training_dict):
prediction_class = classification.classification(model_name)
prediction_class.train_with(training_dict["features"], training_dict["classification"])
return prediction_class
def test_classification(classification_model, testing_data_dict):
testing_features = testing_data_dict["features"]
answers = testing_data_dict["classification"]
predictions = classification_model.predict(testing_features)
if predictions is not False:
return classification.results(predictions, answers).get_results()
else:
return False
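#Minimal usage sketch, assuming training_dict/testing_dict come from fill_training_and_testing_data:
#model = make_classification_model("random_forest", training_dict)
#results = test_classification(model, testing_dict)  #dict of precision/recall/fOne results, or False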
'''
other things
'''
def save_to_file(folder, filename, to_save):
with open(folder + filename + ".json", "w") as output:
json.dump(to_save, output)
'''
expecting dicts to be like:
listing_id: {value1: , value2: , ...}
or, for averages:
{value1: , value2: , ...}
'''
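#e.g. per-listing results: {12345L: {"random_forest": {"occupancy_precision": 0.8, ...}}}
#     averaged results: {"random_forest": {"occupancy_precision": 0.8, ...}} (values hypothetical)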
def save_to_database(table_name, experiment_name, city_name, full_dict):
thesis_data = database.database("Thesis")
#delete similar entries
query = "DELETE FROM `" + table_name + "` WHERE `city` = '" + city_name + "' AND `experiment` = '" + experiment_name + "';"
#print query
thesis_data.execute(query)
print "saving to database " + table_name + " experiment results: " + experiment_name
#put entries in, then the keys are lists and what I want to store are the true_true,
if full_dict and isinstance(full_dict.keys()[0], long):
insert_query = "INSERT INTO " + table_name + " VALUES('%s','%s',%s,'%s',%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
for listing_id, full_records in full_dict.iteritems():
#experiment , city , listing_id, method, true_true, true_false, false_true, false_false, occupancy_precision, occupancy_recall, empty_precision, empty_recall, occupancy_fOne, empty_fOne, correct_overall
for method, method_results in full_records.iteritems():
to_insert = [experiment_name, city_name, listing_id, method]
for this_thing in ["true_true", "true_false", "false_true", "false_false", "occupancy_precision", "occupancy_recall", "empty_precision", "empty_recall", "occupancy_fOne", "empty_fOne", "correct_overall"]:
if method_results[this_thing]:
to_insert.append(method_results[this_thing])
else:
to_insert.append("null")
#print (insert_query % to_insert)
thesis_data.execute(insert_query % tuple(to_insert))
elif full_dict:
insert_query = "INSERT INTO " + table_name + " VALUES('%s','%s','%s',%s,%s,%s,%s,%s, %s)"
#experiment , city , method, occupancy_precision, occupancy_recall, empty_precision, empty_recall, occupancy_fOne, empty_fOne,
for method, method_results in full_dict.iteritems():
to_insert = [experiment_name, city_name, method]
for this_thing in ["occupancy_precision", "occupancy_recall", "empty_precision", "empty_recall", "occupancy_fOne", "empty_fOne"]:
if method_results[this_thing]:
to_insert.append(method_results[this_thing])
else:
to_insert.append("null")
thesis_data.execute(insert_query % tuple(to_insert))
thesis_data.destroy_connection()
def instructions():
print "set up global data have fun! Here's what you need to do to finish setup:"
print "def get_testing_listings(list_of_location_ids = [1]), defaults to just barcelona"
print "if you want to change the point of view, use change_point_of_view(days_in_advance), enter a positive number to indicate how many days in the future you are predicting."
#point_of_view(7)
print "you can play with the feature space too with:\n"
print "add_feature_to_feature_space(new_feature_specifications, time_ahead = None) where time_ahead is an int, new_feature_specifications is either a dict {feature_space_dict: specific feature name or None to indicate day} "
print "and the feature space dic codes are: "
print '{"occupancy": occupancy, "enquiry" : enquiry, "cancellation": cancellation, "doublebooked": doublebooked, "confirmed_advanced": confirmed_advanced, "enquiry_advanced": enquiry_advanced, "cluster_designations": cluster_designations, "day_features" : day_features, "unavailable": unavailable, "unconfirmed": unconfirmed} \n'
print "fill_all_data(start_date, end_date, point_of_view = False) [start and end date should should include both testing and training data times], datetime objects please!"
print "fill_training_and_testing_data(training_dates, testing_dates, listing_ids = None), \nreturns a list with two dictionaries: \ntesting_data{ features: [testing_data_features], classifications: []; \ntraining_data{ features: [], classifications : []}}"
print "where testing_dates and training dates are a dict with these values: {start_date: datetimeObject; end_date: datetimeObject}"
print "returns a tuple with (training_data, testing_data)\n"
print "for classification tests use the classification class:"
print "classification.classification(model_name).train_with(training_data_list, answers).predict(testing_data), returns list of predictions"
print "model_names: random_forest, centroid_prediction, linearSVC, nearest_neighbor, decision_tree, svc"
print "for getting results, use classification's result's class"
print "classification.results(prediction, classification).get_results(), returns dict"
'''
expecting structure to be:
listing_id: method: full results
'''
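#i.e. final_results_dict = {listing_id: {"random_forest": {...}, "svc": {...}, ...}, ...}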
def results_averaging(final_results_dict):
    '''
    full results keys: "true_true", "true_false", "false_true", "false_false", "occupancy_precision", "empty_precision", "correct_overall", "occupancy_recall", "empty_recall", "occupancy_fOne", "empty_fOne"
    '''
#method: occupancy_precision_count, occupancy_tot, ...
results_store = {}
for listing_ids, full_data in final_results_dict.iteritems():
if full_data:
for method, full_results in full_data.iteritems():
if method not in results_store.keys():
results_store[method] = {}
for result_type in ["occupancy_precision", "empty_precision", "occupancy_recall", "empty_recall", "occupancy_fOne", "empty_fOne"]:
if result_type not in results_store[method].keys():
results_store[method][result_type] = []
if full_results[result_type]:
results_store[method][result_type].append(full_results[result_type])
else: #if the result was None or 0
#often because there weren't many occupancies or falses in a test set
print "didn't have good data here"
print listing_ids, ", ", method
pass
#get the average
final = {}
for method, result_type_data in results_store.iteritems():
final[method] = {}
for result_type, tot_in_list in result_type_data.iteritems():
if len(tot_in_list) > 0:
final[method][result_type] = float(sum(tot_in_list))/len(tot_in_list)
else:
final[method][result_type] = None
return final
'''
Full experiments record
'''
def single_listing_oneyr_history_training():
global current_location
#barcelona, varenna, rotterdam, majorca
fill_global_data()
start_date = datetime.date(2014, 1, 1)
end_date = datetime.date(2016, 1, 1)
training_dates = {"start_date": datetime.date(2014, 1, 1), "end_date": datetime.date(2015, 1, 1)}
testing_dates = {"start_date": datetime.date(2015, 1, 1), "end_date": datetime.date(2016, 1, 1)}
get_testing_listings([1])
fill_all_data(start_date, end_date, normalised = False) #good
#classification_data = fill_training_and_testing_data(training_dates, testing_dates)
all_results = {}
for listing in testing_listings:
all_results[listing[0]] = {}
classification_data = fill_training_and_testing_data(listing[0], testing_dates, training_dates)
if not classification_data:
print "not enough data for listing ", listing[0]
del all_results[listing[0]]
else:
training_dict = classification_data[0]
testing_dict = classification_data[1] #good
for model_name in ["random_forest", "centroid_prediction", "linearSVC", "nearest_neighbor", "decision_tree", "svc"]:
try:
prediction_model = make_classification_model(model_name, training_dict)
except ValueError: #there isn't enough training material sorry
break
results = test_classification(prediction_model, testing_dict)
if results is not False:
all_results[listing[0]][model_name] = results
else:
del all_results[listing[0]]
break
print "number of records we predicted for, " , len(all_results)
analysis = results_averaging(all_results)
location_dict = {1: "Barcelona", 0: "Rome", 6: "Varenna", 11: "Mallorca", 19: "Rotterdam"}
save_to_database("machine_learning_average_results", "single_listing_date_only_added", "Barcelona", analysis)
save_to_database("machine_learning_individual_results", "single_listing_date_only_added", "Barcelona", all_results)
print "finished!"
def fullLocation_training(experiment_name, normalised = False):
global current_location
fill_global_data()
start_date = datetime.date(2014, 1, 1)
end_date = datetime.date(2016, 1, 1)
training_dates = {"start_date": datetime.date(2014, 1, 1), "end_date": datetime.date(2015, 1, 1)}
testing_dates = {"start_date": datetime.date(2015, 1, 1), "end_date": datetime.date(2016, 1, 1)}
#three city, single listing training and prediction test
#take out 6
#for location_id in [1, 6, 11, 19]:
for location_id in [0, 1, 19]:
current_location = location_id
print "on location: ", location_id
get_testing_listings([location_id])
print "number of listings for this location: ", len(testing_listings)
fill_all_data(start_date, end_date, normalised) #good
#classification_data = fill_training_and_testing_data(training_dates, testing_dates)
all_results = {}
all_training = {"features": [], "classification": []}
all_testing = {}
        #set up training and testing
for listing in testing_listings:
classification_data = fill_training_and_testing_data(listing[0], testing_dates, training_dates)
if classification_data:
temp_training = classification_data[0]
all_training["features"] += temp_training["features"]
all_training["classification"] += temp_training["classification"]
all_testing[listing[0]] = classification_data[1] #good
#train please
for model_name in ["random_forest", "centroid_prediction", "linearSVC", "nearest_neighbor", "decision_tree", "svc"]:
try:
prediction_model = make_classification_model(model_name, all_training)
except ValueError: #there isn't enough training material sorry
break
for listing_id, testing_dict in all_testing.iteritems():
results = test_classification(prediction_model, testing_dict)
if results is not False:
if listing_id not in all_results.keys():
all_results[listing_id] = {}
all_results[listing_id][model_name] = results
#save all_results
location_dict = {1: "Barcelona", 0: "Rome", 6: "Varenna", 11: "Mallorca", 19: "Rotterdam"}
save_to_database("machine_learning_individual_results", experiment_name, location_dict[location_id], all_results)
print "saved individual results"
analysis = results_averaging(all_results)
save_to_database("machine_learning_average_results", experiment_name, location_dict[location_id], analysis)
print "saved average results"
print analysis
print "analyzed ", len(all_results), " records"
'''
folder = "data/prediction_results/machine_learning/"
filename = location_dict[location_id] + "_fullLocation_ListingInfo_training"
save_to_file(folder, filename, analysis)
'''
print "finished!"
def listingCluster_training(experiment_name, normalised = False):
#listing type training
global current_location
fill_global_data()
start_date = datetime.date(2014, 1, 1)
end_date = datetime.date(2016, 1, 1)
training_dates = {"start_date": datetime.date(2014, 1, 1), "end_date": datetime.date(2015, 1, 1)}
testing_dates = {"start_date": datetime.date(2015, 1, 1), "end_date": datetime.date(2016, 1, 1)}
#three city, single listing training and prediction test
for location_id in [0, 1, 19]:
current_location = location_id
print "on location: ", location_id
get_testing_listings([location_id])
print "number of listings for this location: ", len(testing_listings)
#make listing_clusters: listing_cluster [ids]
listing_clusters = {entry[1]: [] for entry in testing_listings}
fill_all_data(start_date, end_date, normalised) #good
#fill listing_clusters
for entry in testing_listings:
listing_clusters[entry[1]].append(entry[0])
#classification_data = fill_training_and_testing_data(training_dates, testing_dates)
all_results = {}
        #set up training and testing
for listing_cluster, listing_id_list in listing_clusters.iteritems():
all_training = {"features": [], "classification": []}
all_testing = {}
if len(listing_id_list) > 2:
for listing in listing_id_list:
classification_data = fill_training_and_testing_data(listing, testing_dates, training_dates)
if classification_data:
temp_training = classification_data[0]
all_training["features"] += temp_training["features"]
all_training["classification"] += temp_training["classification"]
all_testing[listing] = classification_data[1] #good
#train please
for model_name in ["random_forest", "centroid_prediction", "linearSVC", "nearest_neighbor", "decision_tree", "svc"]:
try:
prediction_model = make_classification_model(model_name, all_training)
except ValueError: #there isn't enough training material sorry
break
for listing_id, testing_dict in all_testing.iteritems():
results = test_classification(prediction_model, testing_dict)
if results:
if listing_id not in all_results.keys():
all_results[listing_id] = {}
all_results[listing_id][model_name] = results
print "analyzed ", len(all_results), " records"
analysis = results_averaging(all_results)
location_dict = {1: "Barcelona", 0: "Rome", 6: "Varenna", 11: "Mallorca", 19: "Rotterdam"}
save_to_database("machine_learning_individual_results", experiment_name, location_dict[location_id], all_results)
save_to_database("machine_learning_average_results", experiment_name, location_dict[location_id], analysis)
print "finished!" #sys.exit()
def date_only_experiments():
global feature_data_space, point_of_view
##SIMPLE SOPHISTICATION
#date only testing
feature_data_space = [{"day_features": "weekday_number"}, {"day_features": "week_number"}, {"day_features": "quarter_in_year"}, {"day_features": "day_number"}, {"day_features": "month_number"}]
single_listing_oneyr_history_training()
listingCluster_training("listing_cluster_date_only")
fullLocation_training("full_location_date_only_added")
#date + listing_cluster
feature_data_space = [{"day_features": "weekday_number"}, {"day_features": "week_number"}, {"day_features": "quarter_in_year"}, {"day_features": "day_number"}, {"day_features": "month_number"}, {"listing_cluster": True}]
fullLocation_training("full_location_listing_cluster_added")
    #date + listing_cluster + days active
feature_data_space = [{"day_features": "weekday_number"}, {"day_features": "week_number"}, {"day_features": "quarter_in_year"}, {"day_features": "day_number"}, {"day_features": "month_number"}, {"listing_cluster": True}, {"days_active": True}]
#not normalised leads to bad results, many nulls
fullLocation_training("full_location_date_listingC_daysActive", False)
listingCluster_training("listing_cluster_date_listingC_daysActive", False)
#fullLocation_training("full_location_date_listingC_daysActiveNormalised", True)
#listingCluster_training("listing_cluster_date_listingC_daysActiveNormalised", True)
def price_experiments():
    global feature_data_space
    #date + listing_cluster + days active, sold_for
    feature_data_space = [{"price_dict": None}]
#
fullLocation_training("full_location_date_price_alone", False)
#listingCluster_training("listing_cluster_price_alone", False) Not enough data
feature_data_space = [{"day_features": "weekday_number"}, {"day_features": "week_number"}, {"day_features": "quarter_in_year"}, {"day_features": "day_number"}, {"day_features": "month_number"}, {"listing_cluster": True}, {"days_active": True}, {"price_dict": None}]
#
fullLocation_training("full_location_date_price_fullListingF", False)
listingCluster_training("listing_cluster_price_fullListingF", False)
#where the magic happens
if __name__ == '__main__':
#with market intelligence
#same year k-means
feature_data_space = [{"k-means_season_clusters": None}]
fullLocation_training("full_location_lastYearSeason", False)
#
#MID SOPHISTICATION
#with "additional features", since it seams that
#feature_data_space = [{"day_features": "weekday_number"}, {"day_features": "week_number"}, {"day_features": "quarter_in_year"}, {"day_features": "day_number"}, {"day_features": "month_number"}, {"listing_cluster": True}]
    #point of view
'''
feature_data_space = [{'CANCELLED':None}, {'CANCELLED':'occupancy_advanced'}, {'ENQUIRY': None}, {'ENQUIRY': 'occupancy_advanced'}, {'confirmed_advanced': None}, {'occupancy_advanced': 'percent_of_occupation'}]
point_of_view = 7
fullLocation_training("full_location_point_view_alone", False)
listingCluster_training("listing_cluster_price_alone", False)
'''