-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathpart2.py
725 lines (548 loc) · 18.5 KB
/
part2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
"""
Part 2: Performance Comparisons
**Released: Wednesday, October 16**
In this part, we will explore comparing the performance
of different pipelines.
First, we will set up some helper classes.
Then we will do a few comparisons
between two or more versions of a pipeline
to report which one is faster.
"""
import time

import pandas as pd

import part1
"""
=== Questions 1-5: Throughput and Latency Helpers ===
We will design and fill out two helper classes.
The first is a helper class for throughput (Q1).
The class is created by adding a series of pipelines
(via .add_pipeline(name, size, func))
where name is a title describing the pipeline,
size is the number of elements in the input dataset for the pipeline,
and func is a function that can be run on zero arguments
which runs the pipeline (like def f()).
The second is a similar helper class for latency (Q3).
1. Throughput helper class
Fill in the add_pipeline, eval_throughput, and generate_plot functions below.
"""
# Number of times to run each pipeline in the following results.
# You may modify this as you go through the file if you like, but make sure
# you set it back to 10 at the end before you submit.
NUM_RUNS = 10
class ThroughputHelper:
    """Measure and compare the throughput of several pipelines.

    Usage: register pipelines with add_pipeline(name, size, func),
    measure them with compare_throughput(), then visualize with
    generate_plot(filename).
    """

    def __init__(self):
        # Pipelines: a list of functions, where each function
        # can be run on no arguments.
        # (like: def f(): ... )
        self.pipelines = []
        # Pipeline names: a list of names for each pipeline.
        self.names = []
        # Pipeline input sizes (number of items each pipeline processes per run).
        self.sizes = []
        # Pipeline throughputs, in items/second.
        # None until compare_throughput() is called.
        self.throughputs = None

    def add_pipeline(self, name, size, func):
        # Register one pipeline: name (str), size (item count per run),
        # func (zero-argument callable that runs the pipeline).
        self.names.append(name)
        self.sizes.append(size)
        self.pipelines.append(func)

    def compare_throughput(self):
        # Measure the throughput of all pipelines and store the results
        # in self.throughputs, in **number of items per second**.
        # Each pipeline is executed NUM_RUNS times; throughput is the
        # total number of items processed divided by total elapsed time.
        self.throughputs = []
        for size, func in zip(self.sizes, self.pipelines):
            start = time.perf_counter()
            for _ in range(NUM_RUNS):
                func()
            elapsed = time.perf_counter() - start
            self.throughputs.append(size * NUM_RUNS / elapsed)
        return self.throughputs

    def generate_plot(self, filename):
        # Generate a bar chart of throughput per pipeline and save it
        # to `filename`. Requires compare_throughput() to have run first.
        # matplotlib is imported lazily so the rest of the module can be
        # used in environments without plotting support.
        import matplotlib
        matplotlib.use("Agg")  # headless backend; we only save to file
        import matplotlib.pyplot as plt
        plt.figure()
        plt.bar(self.names, self.throughputs, label="throughput (items/sec)")
        plt.ylabel("Throughput (items/sec)")
        plt.legend()
        plt.savefig(filename)
        plt.close()
"""
As your answer to this part,
return the name of the method you decided to use in
matplotlib.
(Example: "boxplot" or "scatter")
"""
def q1():
    # Return plot method (as a string) from matplotlib.
    # generate_plot() draws a bar chart (the spec above notes a bar chart
    # "probably makes the most sense"), i.e. matplotlib's bar() method.
    return "bar"
"""
2. A simple test case
To make sure your monitor is working, test it on a very simple
pipeline that adds up the total of all elements in a list.
We will compare three versions of the pipeline depending on the
input size.
"""
LIST_SMALL = [10] * 100
LIST_MEDIUM = [10] * 100_000
LIST_LARGE = [10] * 100_000_000
def add_list(l):
    # Return the sum of all elements in l, using an explicit for loop
    # (per the instructions, not the built-in sum()).
    total = 0
    for x in l:
        total += x
    return total
def q2a():
    # Create a ThroughputHelper object
    h = ThroughputHelper()
    # Add the 3 pipelines (names: small, medium, large).
    # Each pipeline sums its list with add_list; size is the list length.
    h.add_pipeline("small", len(LIST_SMALL), lambda: add_list(LIST_SMALL))
    h.add_pipeline("medium", len(LIST_MEDIUM), lambda: add_list(LIST_MEDIUM))
    h.add_pipeline("large", len(LIST_LARGE), lambda: add_list(LIST_LARGE))
    # Measure, then generate and save a plot.
    throughputs = h.compare_throughput()
    h.generate_plot('output/part2-q2a.png')
    # Finally, return the throughputs as a list.
    return throughputs
"""
2b.
Which pipeline has the highest throughput?
Is this what you expected?
=== ANSWER Q2b BELOW ===
=== END OF Q2b ANSWER ===
"""
"""
3. Latency helper class.
Now we will create a similar helper class for latency.
The helper should assume a pipeline that only has *one* element
in the input dataset.
It should use the NUM_RUNS variable as with throughput.
"""
class LatencyHelper:
    """Measure and compare the latency of several pipelines.

    Each pipeline is assumed to process an input dataset with exactly
    one item, so the elapsed time of one run is the latency of that item.
    """

    def __init__(self):
        # Pipelines: a list of functions, where each function
        # can be run on no arguments.
        # (like: def f(): ... )
        self.pipelines = []
        # Pipeline names: a list of names for each pipeline.
        self.names = []
        # Pipeline latencies, in milliseconds.
        # None until compare_latency() is called.
        self.latencies = None

    def add_pipeline(self, name, func):
        # Register one pipeline: name (str) and func (zero-argument callable).
        self.names.append(name)
        self.pipelines.append(func)

    def compare_latency(self):
        # Measure the latency of all pipelines and store the results
        # in self.latencies, in **milliseconds**.
        # Each pipeline runs NUM_RUNS times; we report the average time
        # per run, converted from seconds to milliseconds.
        self.latencies = []
        for func in self.pipelines:
            start = time.perf_counter()
            for _ in range(NUM_RUNS):
                func()
            elapsed = time.perf_counter() - start
            self.latencies.append(elapsed / NUM_RUNS * 1000.0)
        return self.latencies

    def generate_plot(self, filename):
        # Generate a bar chart of latency per pipeline and save it to
        # `filename`. Requires compare_latency() to have run first.
        # matplotlib is imported lazily (see ThroughputHelper.generate_plot).
        import matplotlib
        matplotlib.use("Agg")  # headless backend; we only save to file
        import matplotlib.pyplot as plt
        plt.figure()
        plt.bar(self.names, self.latencies, label="latency (ms)")
        plt.ylabel("Latency (ms)")
        plt.legend()
        plt.savefig(filename)
        plt.close()
"""
As your answer to this part,
return the number of input items that each pipeline should
process if the class is used correctly.
"""
def q3():
    # Return the number of input items in each dataset,
    # for the latency helper to run correctly.
    # LatencyHelper assumes every pipeline's input dataset has exactly
    # one element (see the Q3 spec above).
    return 1
"""
4. To make sure your monitor is working, test it on
the simple pipeline from Q2.
For latency, all three pipelines would only process
one item. Therefore instead of using
LIST_SMALL, LIST_MEDIUM, and LIST_LARGE,
for this question run the same pipeline three times
on a single list item.
"""
LIST_SINGLE_ITEM = [10] # Note: a list with only 1 item
def q4a():
    # Create a LatencyHelper object
    h = LatencyHelper()
    # Add the same single-item pipeline three times; each copy sums
    # LIST_SINGLE_ITEM with add_list.
    pipeline = lambda: add_list(LIST_SINGLE_ITEM)
    h.add_pipeline("copy1", pipeline)
    h.add_pipeline("copy2", pipeline)
    h.add_pipeline("copy3", pipeline)
    # Measure, then generate and save a plot.
    latencies = h.compare_latency()
    h.generate_plot('output/part2-q4a.png')
    # Finally, return the latencies as a list.
    return latencies
"""
4b.
How much did the latency vary between the three copies of the pipeline?
Is this more or less than what you expected?
=== ANSWER Q4b BELOW ===
=== END OF Q4b ANSWER ===
"""
"""
Now that we have our helpers, let's do a simple comparison.
NOTE: you may add other helper functions that you may find useful
as you go through this file.
5. Comparison on Part 1
Finally, use the helpers above to calculate the throughput and latency
of the pipeline in part 1.
"""
# You will need these:
# part1.load_input
# part1.PART_1_PIPELINE
def q5a():
    # Return the throughput of the pipeline in part 1.
    # NOTE(review): assumes len(part1.load_input()) reflects the number
    # of items in Part 1's input dataset -- confirm against part1's API.
    h = ThroughputHelper()
    h.add_pipeline("part1", len(part1.load_input()), part1.PART_1_PIPELINE)
    return h.compare_throughput()[0]
def q5b():
    # Return the latency of the pipeline in part 1.
    # NOTE(review): LatencyHelper averages over the whole pipeline run,
    # which here processes the full Part 1 dataset rather than a single
    # item -- confirm this is the intended measurement.
    h = LatencyHelper()
    h.add_pipeline("part1", part1.PART_1_PIPELINE)
    return h.compare_latency()[0]
"""
===== Questions 6-10: Performance Comparison 1 =====
For our first performance comparison,
let's look at the cost of getting input from a file, vs. in an existing DataFrame.
6. We will use the same population dataset
that we used in lecture 3.
Load the data using load_input() given the file name.
- Make sure that you clean the data by removing
continents and world data!
(World data is listed under OWID_WRL)
Then, set up a simple pipeline that computes summary statistics
for the following:
- *Year over year increase* in population, per country
(min, median, max, mean, and standard deviation)
How you should compute this:
- For each country, we need the maximum year and the minimum year
in the data. We should divide the population difference
over this time by the length of the time period.
- Make sure you throw out the cases where there is only one year
(if any).
- We should at this point have one data point per country.
- Finally, as your answer, return a list of the:
min, median, max, mean, and standard deviation
of the data.
Hints:
You can use the describe() function in Pandas to get these statistics.
You should be able to do something like
df.describe().loc["min"]["column_name"]
to get a specific value from the describe() function.
You shouldn't use any for loops.
See if you can compute this using Pandas functions only.
"""
def load_input(filename):
    """Load the population CSV and return a cleaned DataFrame.

    Cleaning removes aggregate rows: continents/regions (which have no
    ISO country code) and the world total (code OWID_WRL).

    NOTE(review): assumes the OWID schema with columns
    Entity, Code, Year, Population -- confirm against the data file.
    """
    df = pd.read_csv(filename)
    # Country rows have a non-null Code; drop OWID_WRL (world aggregate).
    keep = df["Code"].notna() & (df["Code"] != "OWID_WRL")
    return df[keep]
def population_pipeline(df):
    """Return [min, median, max, mean, std] of the per-country
    year-over-year population increase.

    For each country: (population at its max year - population at its
    min year) / (max year - min year). Countries with only one year of
    data are dropped. Computed with vectorized Pandas only (no loops).

    NOTE(review): assumes columns Entity, Year, Population -- confirm
    against load_input().
    """
    # Sort by year so groupby().first()/.last() give min-year/max-year rows.
    ordered = df.sort_values("Year")
    grouped = ordered.groupby("Entity")
    first = grouped.first()
    last = grouped.last()
    span = last["Year"] - first["Year"]
    growth = (last["Population"] - first["Population"]) / span
    # Throw out countries with a single year of data (span == 0).
    growth = growth[span > 0]
    desc = growth.describe()
    # describe() reports sample std (ddof=1); "50%" is the median.
    return [desc["min"], desc["50%"], desc["max"], desc["mean"], desc["std"]]
def q6():
    # Call load_input() and then population_pipeline();
    # return a list of min, median, max, mean, and standard deviation.
    # NOTE(review): assumes the lecture-3 dataset lives at
    # data/population.csv -- confirm the path.
    return population_pipeline(load_input("data/population.csv"))
"""
7. Varying the input size
Next we want to set up three different datasets of different sizes.
Create three new files,
- data/population-small.csv
with the first 600 rows
- data/population-medium.csv
with the first 6000 rows
- data/population-single-row.csv
with only the first row
(for calculating latency)
You can edit the csv file directly to extract the first rows
(remember to also include the header row)
and save a new file.
Make four versions of load input that load your datasets.
(The _large one should use the full population dataset.)
Each should return a dataframe.
The input CSV file will have 600 rows, but the DataFrame (after your cleaning) may have less than that.
"""
def load_input_small():
    # First 600 rows of the dataset (cleaning applied by load_input,
    # so the returned DataFrame may have fewer rows).
    return load_input("data/population-small.csv")
def load_input_medium():
    # First 6000 rows of the dataset (cleaned by load_input).
    return load_input("data/population-medium.csv")
def load_input_large():
    # The full population dataset (cleaned by load_input).
    return load_input("data/population.csv")
def load_input_single_row():
    # This is the pipeline we will use for latency:
    # a dataset containing only the first row.
    return load_input("data/population-single-row.csv")
def q7():
    # Don't modify this part
    # Sanity check: return the post-cleaning row counts of the four
    # datasets, as [small, medium, large, single_row].
    s = load_input_small()
    m = load_input_medium()
    l = load_input_large()
    x = load_input_single_row()
    return [len(s), len(m), len(l), len(x)]
"""
8.
Create baseline pipelines
First let's create our baseline pipelines.
Create four pipelines,
baseline_small
baseline_medium
baseline_large
baseline_latency
based on the three datasets above.
Each should call your population_pipeline from Q6.
Your baseline_latency function will not be very interesting
as the pipeline does not produce any meaningful output on a single row!
You may choose to instead run an example with two rows,
or you may fill in this function in any other way that you choose
that you think is meaningful.
"""
def baseline_small():
    # Baseline: load the small dataset from file, then run the Q6 pipeline.
    return population_pipeline(load_input_small())
def baseline_medium():
    # Baseline: load the medium dataset from file, then run the Q6 pipeline.
    return population_pipeline(load_input_medium())
def baseline_large():
    # Baseline: load the full dataset from file, then run the Q6 pipeline.
    return population_pipeline(load_input_large())
def baseline_latency():
    # Latency baseline on the single-row dataset. The statistics are not
    # meaningful on one row (the pipeline drops single-year countries and
    # returns NaNs), but we only care about the load + compute timing here,
    # as the Q8 spec permits.
    return population_pipeline(load_input_single_row())
def q8():
    # Don't modify this part
    # Smoke-run one baseline pipeline, then list the four pipeline names.
    _ = baseline_medium()
    return ["baseline_small", "baseline_medium", "baseline_large", "baseline_latency"]
"""
9.
Finally, let's compare whether loading an input from file is faster or slower
than getting it from an existing Pandas dataframe variable.
Create four new dataframes (constant global variables)
directly in the script.
Then use these to write 3 new pipelines:
fromvar_small
fromvar_medium
fromvar_large
fromvar_latency
These pipelines should produce the same answers as in Q8.
As your answer to this part:
a. Generate a plot in output/part2-q9a.png of the throughputs
Return the list of 6 throughputs in this order:
baseline_small, baseline_medium, baseline_large, fromvar_small, fromvar_medium, fromvar_large
b. Generate a plot in output/part2-q9b.png of the latencies
Return the list of 2 latencies in this order:
baseline_latency, fromvar_latency
"""
# TODO
# POPULATION_SMALL =
# POPULATION_MEDIUM =
# POPULATION_LARGE =
# POPULATION_SINGLE_ROW =
def fromvar_small():
    # Same computation as baseline_small, but the DataFrame is loaded only
    # once and cached on the function, so repeated runs measure in-memory
    # access rather than file I/O.
    # NOTE(review): caching lazily (instead of a module-level constant)
    # avoids import-time file I/O; the first call still pays the load cost.
    if not hasattr(fromvar_small, "_df"):
        fromvar_small._df = load_input_small()
    return population_pipeline(fromvar_small._df)
def fromvar_medium():
    # Medium dataset from an in-memory DataFrame (lazily cached;
    # see fromvar_small for rationale).
    if not hasattr(fromvar_medium, "_df"):
        fromvar_medium._df = load_input_medium()
    return population_pipeline(fromvar_medium._df)
def fromvar_large():
    # Full dataset from an in-memory DataFrame (lazily cached;
    # see fromvar_small for rationale).
    if not hasattr(fromvar_large, "_df"):
        fromvar_large._df = load_input_large()
    return population_pipeline(fromvar_large._df)
def fromvar_latency():
    # Single-row dataset from an in-memory DataFrame (lazily cached;
    # see fromvar_small for rationale and baseline_latency for the
    # caveat about meaningless statistics on one row).
    if not hasattr(fromvar_latency, "_df"):
        fromvar_latency._df = load_input_single_row()
    return population_pipeline(fromvar_latency._df)
def q9a():
    # Add all 6 pipelines for a throughput comparison,
    # generate a plot in output/part2-q9a.png,
    # and return the list of 6 throughputs in the order:
    # baseline_small, baseline_medium, baseline_large,
    # fromvar_small, fromvar_medium, fromvar_large.
    h = ThroughputHelper()
    # Input sizes: number of (cleaned) rows in each dataset.
    small = len(load_input_small())
    medium = len(load_input_medium())
    large = len(load_input_large())
    h.add_pipeline("baseline_small", small, baseline_small)
    h.add_pipeline("baseline_medium", medium, baseline_medium)
    h.add_pipeline("baseline_large", large, baseline_large)
    h.add_pipeline("fromvar_small", small, fromvar_small)
    h.add_pipeline("fromvar_medium", medium, fromvar_medium)
    h.add_pipeline("fromvar_large", large, fromvar_large)
    throughputs = h.compare_throughput()
    h.generate_plot("output/part2-q9a.png")
    return throughputs
def q9b():
    # Add 2 pipelines for a latency comparison,
    # generate a plot in output/part2-q9b.png,
    # and return the list of 2 latencies in the order:
    # baseline_latency, fromvar_latency.
    h = LatencyHelper()
    h.add_pipeline("baseline_latency", baseline_latency)
    h.add_pipeline("fromvar_latency", fromvar_latency)
    latencies = h.compare_latency()
    h.generate_plot("output/part2-q9b.png")
    return latencies
"""
10.
Comment on the plots above!
How dramatic is the difference between the two pipelines?
Which differs more, throughput or latency?
What does this experiment show?
===== ANSWER Q10 BELOW =====
===== END OF Q10 ANSWER =====
"""
"""
===== Questions 11-14: Performance Comparison 2 =====
Our second performance comparison will explore vectorization.
Operations in Pandas use Numpy arrays and vectorization to enable
fast operations.
In particular, they are often much faster than using for loops.
Let's explore whether this is true!
11.
First, we need to set up our pipelines for comparison as before.
We already have the baseline pipelines from Q8,
so let's just set up a comparison pipeline
which uses a for loop to calculate the same statistics.
Your pipeline should produce the same answers as in Q6 and Q8.
Create a new pipeline:
- Iterate through the dataframe entries. You can assume they are sorted.
- Manually compute the minimum and maximum year for each country.
- Compute the same answers as in Q6.
- Manually compute the summary statistics for the resulting list (min, median, max, mean, and standard deviation).
"""
def for_loop_pipeline(df):
    """For-loop version of population_pipeline: the same statistics,
    computed manually without vectorized Pandas operations.

    Returns [min, median, max, mean, std] of the per-country
    year-over-year population increase; countries with a single year of
    data are dropped, matching population_pipeline.

    NOTE(review): assumes columns Entity, Year, Population -- confirm
    against load_input().
    """
    # Manually track the earliest/latest year (and population) per country.
    min_year = {}
    min_pop = {}
    max_year = {}
    max_pop = {}
    for _, row in df.iterrows():
        country = row["Entity"]
        year = row["Year"]
        pop = row["Population"]
        if country not in min_year or year < min_year[country]:
            min_year[country] = year
            min_pop[country] = pop
        if country not in max_year or year > max_year[country]:
            max_year[country] = year
            max_pop[country] = pop
    # One growth-rate data point per country with at least two years.
    growth = []
    for country in min_year:
        span = max_year[country] - min_year[country]
        if span > 0:
            growth.append((max_pop[country] - min_pop[country]) / span)
    n = len(growth)
    if n == 0:
        # Mirror describe() on an empty series: all statistics are NaN.
        nan = float("nan")
        return [nan, nan, nan, nan, nan]
    # Manual summary statistics (sample std, ddof=1, to match describe()).
    growth.sort()
    minimum = growth[0]
    maximum = growth[-1]
    if n % 2 == 1:
        median = growth[n // 2]
    else:
        median = (growth[n // 2 - 1] + growth[n // 2]) / 2
    mean = sum(growth) / n
    if n > 1:
        std = (sum((x - mean) ** 2 for x in growth) / (n - 1)) ** 0.5
    else:
        std = float("nan")
    return [minimum, median, maximum, mean, std]
def q11():
    # Call load_input() and then for_loop_pipeline() to return the
    # 5 numbers (these should match the numbers from Q6).
    # NOTE(review): assumes the same data path as q6 -- confirm.
    return for_loop_pipeline(load_input("data/population.csv"))
"""
12.
Now, let's create our pipelines for comparison.
As before, write 4 pipelines based on the datasets from Q7.
"""
def for_loop_small():
    # For-loop pipeline on the small dataset (loaded from file).
    return for_loop_pipeline(load_input_small())
def for_loop_medium():
    # For-loop pipeline on the medium dataset (loaded from file).
    return for_loop_pipeline(load_input_medium())
def for_loop_large():
    # For-loop pipeline on the full dataset (loaded from file).
    return for_loop_pipeline(load_input_large())
def for_loop_latency():
    # For-loop pipeline on the single-row dataset (statistics are NaN on
    # one row; only the timing matters here -- see baseline_latency).
    return for_loop_pipeline(load_input_single_row())
def q12():
    # Don't modify this part
    # Smoke-run one for-loop pipeline, then list the four pipeline names.
    _ = for_loop_medium()
    return ["for_loop_small", "for_loop_medium", "for_loop_large", "for_loop_latency"]
"""
13.
Finally, let's compare our two pipelines,
as we did in Q9.
a. Generate a plot in output/part2-q13a.png of the throughputs
Return the list of 6 throughputs in this order:
baseline_small, baseline_medium, baseline_large, for_loop_small, for_loop_medium, for_loop_large
b. Generate a plot in output/part2-q13b.png of the latencies
Return the list of 2 latencies in this order:
baseline_latency, for_loop_latency
"""
def q13a():
    # Add all 6 pipelines for a throughput comparison,
    # generate a plot in output/part2-q13a.png,
    # and return the list of 6 throughputs in the order:
    # baseline_small, baseline_medium, baseline_large,
    # for_loop_small, for_loop_medium, for_loop_large.
    h = ThroughputHelper()
    # Input sizes: number of (cleaned) rows in each dataset.
    small = len(load_input_small())
    medium = len(load_input_medium())
    large = len(load_input_large())
    h.add_pipeline("baseline_small", small, baseline_small)
    h.add_pipeline("baseline_medium", medium, baseline_medium)
    h.add_pipeline("baseline_large", large, baseline_large)
    h.add_pipeline("for_loop_small", small, for_loop_small)
    h.add_pipeline("for_loop_medium", medium, for_loop_medium)
    h.add_pipeline("for_loop_large", large, for_loop_large)
    throughputs = h.compare_throughput()
    h.generate_plot("output/part2-q13a.png")
    return throughputs
def q13b():
    # Add 2 pipelines for a latency comparison,
    # generate a plot in output/part2-q13b.png,
    # and return the list of 2 latencies in the order:
    # baseline_latency, for_loop_latency.
    h = LatencyHelper()
    h.add_pipeline("baseline_latency", baseline_latency)
    h.add_pipeline("for_loop_latency", for_loop_latency)
    latencies = h.compare_latency()
    h.generate_plot("output/part2-q13b.png")
    return latencies
"""
14.
Comment on the results you got!
14a. Which pipelines is faster in terms of throughput?
===== ANSWER Q14a BELOW =====
===== END OF Q14a ANSWER =====
14b. Which pipeline is faster in terms of latency?
===== ANSWER Q14b BELOW =====
===== END OF Q14b ANSWER =====
14c. Do you notice any other interesting observations?
What does this experiment show?
===== ANSWER Q14c BELOW =====
===== END OF Q14c ANSWER =====
"""
"""
===== Questions 15-17: Reflection Questions =====
15.
Take a look at all your pipelines above.
Which factor that we tested (file vs. variable, vectorized vs. for loop)
had the biggest impact on performance?
===== ANSWER Q15 BELOW =====
===== END OF Q15 ANSWER =====
16.
Based on all of your plots, form a hypothesis as to how throughput
varies with the size of the input dataset.
(Any hypothesis is OK as long as it is supported by your data!
This is an open ended question.)
===== ANSWER Q16 BELOW =====
===== END OF Q16 ANSWER =====
17.
Based on all of your plots, form a hypothesis as to how
throughput is related to latency.
(Any hypothesis is OK as long as it is supported by your data!
This is an open ended question.)
===== ANSWER Q17 BELOW =====
===== END OF Q17 ANSWER =====
"""
"""
===== Extra Credit =====
This part is optional.
Use your pipeline to compare something else!
Here are some ideas for what to try:
- the cost of random sampling vs. the cost of getting rows from the
DataFrame manually
- the cost of cloning a DataFrame
- the cost of sorting a DataFrame prior to doing a computation
- the cost of using different encodings (like one-hot encoding)
and encodings for null values
- the cost of querying via Pandas methods vs querying via SQL
For this part: you would want to use something like
pandasql that can run SQL queries on Pandas data frames. See:
https://stackoverflow.com/a/45866311/2038713
As your answer to this part,
as before, return
a. the list of 6 throughputs
and
b. the list of 2 latencies.
and generate plots for each of these in the following files:
output/part2-ec-a.png
output/part2-ec-b.png
"""
# Extra credit (optional)
def extra_credit_a():
    # Optional extra credit, part (a): intentionally left unimplemented.
    # If attempted, return the list of 6 throughputs and save the plot
    # to output/part2-ec-a.png.
    raise NotImplementedError
def extra_credit_b():
    # Optional extra credit, part (b): intentionally left unimplemented.
    # If attempted, return the list of 2 latencies and save the plot
    # to output/part2-ec-b.png.
    raise NotImplementedError
"""
===== Wrapping things up =====
**Don't modify this part.**
To wrap things up, we have collected
your answers and saved them to a file below.
This will be run when you run the code.
"""
# Output file where each question's answer is appended as a "name,answer" line.
ANSWER_FILE = "output/part2-answers.txt"
# Global tally of questions whose implementation raised NotImplementedError
# (incremented by log_answer, reported by PART_2_PIPELINE).
UNFINISHED = 0
def log_answer(name, func, *args):
    # Run the answer function `func(*args)`, print its result, and append
    # a "name,answer" line to ANSWER_FILE.
    # Unimplemented questions (those that raise NotImplementedError) are
    # logged as "Not Implemented" and counted in the global UNFINISHED tally.
    try:
        answer = func(*args)
        print(f"{name} answer: {answer}")
        with open(ANSWER_FILE, 'a') as f:
            f.write(f'{name},{answer}\n')
        print(f"Answer saved to {ANSWER_FILE}")
    except NotImplementedError:
        print(f"Warning: {name} not implemented.")
        with open(ANSWER_FILE, 'a') as f:
            f.write(f'{name},Not Implemented\n')
        global UNFINISHED
        UNFINISHED += 1
def PART_2_PIPELINE():
    # Run every question in order, logging each answer to ANSWER_FILE.
    # Returns the number of questions that are not implemented.
    # Truncate the answer file from any previous run.
    open(ANSWER_FILE, 'w').close()
    # Q1-5
    log_answer("q1", q1)
    log_answer("q2a", q2a)
    # 2b: commentary
    log_answer("q3", q3)
    log_answer("q4a", q4a)
    # 4b: commentary
    log_answer("q5a", q5a)
    log_answer("q5b", q5b)
    # Q6-10
    log_answer("q6", q6)
    log_answer("q7", q7)
    log_answer("q8", q8)
    log_answer("q9a", q9a)
    log_answer("q9b", q9b)
    # 10: commentary
    # Q11-14
    log_answer("q11", q11)
    log_answer("q12", q12)
    log_answer("q13a", q13a)
    log_answer("q13b", q13b)
    # 14: commentary
    # 15-17: reflection
    # 15: commentary
    # 16: commentary
    # 17: commentary
    # Extra credit
    log_answer("extra credit (a)", extra_credit_a)
    log_answer("extra credit (b)", extra_credit_b)
    # Answer: return the number of questions that are not implemented
    if UNFINISHED > 0:
        print("Warning: there are unfinished questions.")
    return UNFINISHED
"""
=== END OF PART 2 ===
Main function
"""
# Script entry point: run the full Part 2 pipeline and log all answers.
if __name__ == '__main__':
    log_answer("PART 2", PART_2_PIPELINE)