-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata_version_control_presentation.html
769 lines (697 loc) · 36.7 KB
/
data_version_control_presentation.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
<meta charset="utf-8">
<meta name="generator" content="quarto-1.2.269">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
<meta name="author" content="Mainye B">
<title>Data Version Control</title>
<style>
/* Let long code lines wrap while keeping their whitespace. */
code {
  white-space: pre-wrap;
}

/* Small-caps spans. */
span.smallcaps {
  font-variant: small-caps;
}

/* Multi-column layout: a flex row whose columns may scroll horizontally. */
div.columns {
  display: flex;
  gap: min(4vw, 1.5em);
}
div.column {
  flex: auto;
  overflow-x: auto;
}

/* Hanging indent: first line starts at the margin, later lines are indented. */
div.hanging-indent {
  margin-left: 1.5em;
  text-indent: -1.5em;
}

/* Task lists: no bullets, checkbox pulled into the left margin. */
ul.task-list {
  list-style: none;
}
ul.task-list li input[type="checkbox"] {
  width: 0.8em;
  margin: 0 0.8em 0.2em -1.6em;
  vertical-align: middle;
}
</style>
<script src="data_version_control_presentation_files/libs/clipboard/clipboard.min.js"></script>
<script src="data_version_control_presentation_files/libs/quarto-html/quarto.js"></script>
<script src="data_version_control_presentation_files/libs/quarto-html/popper.min.js"></script>
<script src="data_version_control_presentation_files/libs/quarto-html/tippy.umd.min.js"></script>
<script src="data_version_control_presentation_files/libs/quarto-html/anchor.min.js"></script>
<link href="data_version_control_presentation_files/libs/quarto-html/tippy.css" rel="stylesheet">
<link href="data_version_control_presentation_files/libs/quarto-html/quarto-syntax-highlighting-dark.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<script src="data_version_control_presentation_files/libs/bootstrap/bootstrap.min.js"></script>
<link href="data_version_control_presentation_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="data_version_control_presentation_files/libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="dark">
</head>
<body>
<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
<nav id="TOC" role="doc-toc" class="toc-active">
<h2 id="toc-title">Table of contents</h2>
<ul>
<li><a href="#what-is-it" id="toc-what-is-it" class="nav-link active" data-scroll-target="#what-is-it"><span class="toc-section-number">1</span> What is it?</a></li>
<li><a href="#why-is-it-important" id="toc-why-is-it-important" class="nav-link" data-scroll-target="#why-is-it-important"><span class="toc-section-number">2</span> Why is it important?</a>
<ul class="collapse">
<li><a href="#needs" id="toc-needs" class="nav-link" data-scroll-target="#needs"><span class="toc-section-number">2.1</span> Needs</a>
<ul class="collapse">
<li><a href="#data-examples" id="toc-data-examples" class="nav-link" data-scroll-target="#data-examples"><span class="toc-section-number">2.1.1</span> Data Examples</a></li>
<li><a href="#data-science-process" id="toc-data-science-process" class="nav-link" data-scroll-target="#data-science-process"><span class="toc-section-number">2.1.2</span> Data science process</a></li>
</ul></li>
<li><a href="#try-something-different-with-dvc-and-makefiles" id="toc-try-something-different-with-dvc-and-makefiles" class="nav-link" data-scroll-target="#try-something-different-with-dvc-and-makefiles"><span class="toc-section-number">2.2</span> Try something different with DVC and Makefiles</a>
<ul class="collapse">
<li><a href="#makefile" id="toc-makefile" class="nav-link" data-scroll-target="#makefile"><span class="toc-section-number">2.2.1</span> Makefile</a></li>
<li><a href="#using-a-makefile-for-machine-learning-workflow" id="toc-using-a-makefile-for-machine-learning-workflow" class="nav-link" data-scroll-target="#using-a-makefile-for-machine-learning-workflow"><span class="toc-section-number">2.2.2</span> Using a Makefile for Machine Learning Workflow</a></li>
<li><a href="#dvc" id="toc-dvc" class="nav-link" data-scroll-target="#dvc"><span class="toc-section-number">2.2.3</span> DVC</a></li>
<li><a href="#conclusion" id="toc-conclusion" class="nav-link" data-scroll-target="#conclusion"><span class="toc-section-number">2.2.4</span> Conclusion</a></li>
</ul></li>
<li><a href="#references" id="toc-references" class="nav-link" data-scroll-target="#references"><span class="toc-section-number">2.3</span> References</a></li>
</ul></li>
</ul>
</nav>
</div>
<main class="content" id="quarto-document-content">
<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title">Data Version Control</h1>
</div>
<div class="quarto-title-meta">
<div>
<div class="quarto-title-meta-heading">Author</div>
<div class="quarto-title-meta-contents">
<p>Mainye B </p>
</div>
</div>
</div>
</header>
<section id="what-is-it" class="level1" data-number="1">
<h1 data-number="1"><span class="header-section-number">1</span> What is it?</h1>
<p>Data version control is a way of keeping a reproducible journal so that your data science workflow can be replicated. When you work in a team, everyone has their own way of doing things, so how can we agree on a unified way of working together so that we don’t step on each other’s toes? And is there a way to make data science projects easier to manage and track? We will discuss that in this presentation.</p>
<p>There are several tools that have been created to address this problem. They include the following:</p>
<ul>
<li><a href="https://dvc.org/">DVC</a></li>
<li><a href="https://mlflow.org/">MLflow</a></li>
<li><a href="https://neptune.ai/">neptune.ai</a></li>
<li><a href="https://delta.io/">Delta Lake</a></li>
<li><a href="https://metaflow.org/">Metaflow</a></li>
</ul>
<div class="callout callout callout-style-simple no-icon">
<div class="callout-body d-flex">
<div class="callout-icon-container">
<i class="callout-icon no-icon"></i>
</div>
<div class="callout-body-container">
<p>We’ll go through DVC and Makefiles. Great Expectations is another tool that can be used to validate data.</p>
</div>
</div>
</div>
</section>
<section id="why-is-it-important" class="level1" data-number="2">
<h1 data-number="2"><span class="header-section-number">2</span> Why is it important?</h1>
<p>As professionals who have worked on various projects in data science and machine learning, we have discovered that the path from idea to product needs a frictionless workflow. This allows us to focus on implementing ideas rather than handling all that goes on in the background.</p>
<p>It is important mostly because it can get very confusing when handling projects and keeping track of our experiments. In data science, we don’t have predefined outputs. We can create reports, dashboards, applications, and APIs. There are so many things that go into that process, such as data importing, exploratory data analysis, feature engineering, and modeling. Each of these steps can take different routes to reach our destination.</p>
<p><img src="https://unsplash.com/photos/aerial-photography-of-road-zS4lUqLEiNA" title="Title: Aerial Photography by Jack Anstey" alt="Aerial Photography by Jack Anstey"></p>
<section id="needs" class="level2" data-number="2.1">
<h2 data-number="2.1" class="anchored" data-anchor-id="needs"><span class="header-section-number">2.1</span> Needs</h2>
<ul>
<li>How can we track different parts of our work?</li>
<li>How can we record hyperparameters for different versions of our experiments?</li>
<li>How can we store metadata of our projects, such as models and slices of data?</li>
<li>How can we unify and organize metrics?</li>
<li>Can I fully replicate someone else’s work, or at least a significant portion of it?</li>
</ul>
<blockquote class="blockquote">
<p>All of the solutions mentioned above can help address these challenges and can improve your workflows.</p>
</blockquote>
<section id="data-examples" class="level3" data-number="2.1.1">
<h3 data-number="2.1.1" class="anchored" data-anchor-id="data-examples"><span class="header-section-number">2.1.1</span> Data Examples</h3>
<p>We will be using two datasets for this presentation. The first dataset is the Medical Cost Personal Datasets. This dataset contains information about the medical costs of individuals. The second dataset is the Telco dataset. This dataset contains information about the customers of a telecommunications company. Both datasets are available on Kaggle.</p>
<p>We recommend visiting the <a href="https://www.kaggle.com/">Kaggle website</a> to download the datasets and explore them further, and then trying to implement the ideas from this presentation with the second dataset.</p>
<div class="callout-important callout callout-style-default callout-captioned">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-caption-container flex-fill">
Important
</div>
</div>
<div class="callout-body-container callout-body">
<p><strong><a href="https://www.kaggle.com/datasets/mirichoi0218/insurance">Medical Cost Personal Datasets</a></strong></p>
</div>
</div>
<div class="callout-tip callout callout-style-default callout-captioned">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-caption-container flex-fill">
Tip
</div>
</div>
<div class="callout-body-container callout-body">
<p><strong><a href="https://www.kaggle.com/datasets/blastchar/telco-customer-churn">Telco dataset</a></strong></p>
</div>
</div>
<p>The second dataset, the Telco dataset, has a number of observations and measurements that are crucial for a prediction task: finding churn. Churn refers to the likelihood that a client will stop using the telecommunications company.</p>
<p>Other very common metrics that you can be asked to calculate in the data science team include:</p>
<!-- Common data science tasks/metrics, each with a short explanation and a reference link. -->
<table class="table">
<thead>
<tr class="header">
<th scope="col">Metric</th>
<th scope="col" style="text-align: left;">Explanation</th>
<th scope="col" style="text-align: right;">Associated link</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>Hypothesis testing</td>
<td style="text-align: left;">Making the website better via focus group testing.</td>
<td style="text-align: right;"><a href="https://medium.com/@gajendra.k.s/hypothesis-testing-33aaeeff5336">Hypothesis testing article (Medium)</a></td>
</tr>
<tr class="even">
<td>Conversion rate</td>
<!-- A rate, not a duration: the share of prospects that end up converting. -->
<td style="text-align: left;">The proportion of clients who move from discovery to becoming paying customers.</td>
<td style="text-align: right;"><a href="https://www.geeksforgeeks.org/conversion-rate-what-is-it-how-to-calculate-it/">Conversion rate guide (GeeksforGeeks)</a></td>
</tr>
<tr class="odd">
<td>Customer lifetime value (LTV)</td>
<td style="text-align: left;">How much revenue a client will generate over their lifetime.</td>
<td style="text-align: right;"><a href="https://www.datacamp.com/tutorial/customer-life-time-value">Customer lifetime value tutorial (DataCamp)</a></td>
</tr>
<tr class="even">
<td>Recommendation systems</td>
<td style="text-align: left;">How we can cross-sell our existing products better.</td>
<td style="text-align: right;"><a href="https://medium.com/@Karthickk_Rajah/clustering-based-algorithms-in-recommendation-system-205fcb15bc9b">Clustering-based recommendation systems (Medium)</a></td>
</tr>
<tr class="odd">
<td>Optimization</td>
<td style="text-align: left;">Adjusting the cost of a product; this involves using specific techniques to find the maximum or minimum value of something to reap better revenues.</td>
<td style="text-align: right;"><a href="https://towardsdatascience.com/production-fixed-horizon-planning-with-python-8dd38b468e86">Fixed-horizon production planning with Python (Towards Data Science)</a></td>
</tr>
</tbody>
</table>
</section>
<section id="data-science-process" class="level3" data-number="2.1.2">
<h3 data-number="2.1.2" class="anchored" data-anchor-id="data-science-process"><span class="header-section-number">2.1.2</span> Data science process</h3>
<p>We will be referencing a cool notebook that someone in the Kaggle community has written. Here’s the original <a href="https://www.kaggle.com/code/hely333/eda-regression">notebook</a>.</p>
<p>The author did a really cool job. However, I wish more one-hot encoding had been done and that techniques such as OneR had been explored. We’ll explore that later. For the moment, let’s turn our attention to the data science process.</p>
<!-- Figure 1: two-cell panel. The first image links to the Manning page for
     "Data Science with Python and Dask"; the second is a plain image.
     Alt text mirrors each sub-figure's caption. -->
<div id="fig-datasci" class="quarto-layout-panel">
<figure class="figure">
<div class="quarto-layout-row quarto-layout-valign-top">
<div class="quarto-figure quarto-figure-center" style="flex-basis: 50.0%;justify-content: center;">
<figure class="figure">
<p><a href="https://www.manning.com/books/data-science-with-python-and-dask" id="fig-process"><img src="Screenshot%20from%202023-02-13-10-57-10.png" class="img-fluid figure-img" alt="Data science process diagram; links to the book Data Science with Python and Dask"></a></p>
<p></p><figcaption class="figure-caption">Data science process</figcaption><p></p>
</figure>
</div>
<div class="quarto-layout-cell" style="flex-basis: 50.0%;justify-content: center;">
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="Screenshot%20from%202023-02-13-10-57-41.png" class="img-fluid figure-img" width="100" alt="Transforming data"></p>
<p></p><figcaption class="figure-caption">Transforming-data</figcaption><p></p>
</figure>
</div>
</div>
</div>
<p></p><figcaption class="figure-caption">Figure 1: What is done in data science</figcaption><p></p>
</figure>
</div>
<p>As you can see above, we transform data into various forms that help us understand it better. We can use it to make predictions, make recommendations, and optimize our products.</p>
<p>Oftentimes you can simply write a notebook, and your work is done. There are tools that allow you to do <a href="https://www.kaggle.com/discussions/getting-started/293861">scheduled notebook reruns</a> on Kaggle, using <a href="https://papermill.readthedocs.io/en/latest/">papermill</a> and <a href="https://towardsdatascience.com/how-to-schedule-jupyter-notebooks-in-amazon-sagemaker-d50fa1c8c0ad">Sagemaker</a>.</p>
</section>
</section>
<section id="try-something-different-with-dvc-and-makefiles" class="level2" data-number="2.2">
<h2 data-number="2.2" class="anchored" data-anchor-id="try-something-different-with-dvc-and-makefiles"><span class="header-section-number">2.2</span> Try something different with DVC and Makefiles</h2>
<section id="makefile" class="level3" data-number="2.2.1">
<h3 data-number="2.2.1" class="anchored" data-anchor-id="makefile"><span class="header-section-number">2.2.1</span> Makefile</h3>
<p>On most Unix-like systems (macOS and Linux) you’ll find that the <code>make</code> command is already installed. If not, it is very easy to install.</p>
<div class="callout-tip callout callout-style-default callout-captioned">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-caption-container flex-fill">
Tip
</div>
</div>
<div class="callout-body-container callout-body">
<p>How to install</p>
<pre class="{bash}"><code># update packages
sudo apt-get update
# just say yes to make
sudo apt-get -y install make
# what version was installed
make -v</code></pre>
</div>
</div>
<p>Using these files makes it easy to hide the complexity of the commands you need to run in order to follow best practices. As an example:</p>
<blockquote class="blockquote">
<p>Running in bash</p>
</blockquote>
<pre class="{bash}"><code>#| echo: false
# Run the pylint tool with specific configurations to check for errors in Python files.
# The `--disable=R,C` flag turns off pylint's refactor (R) and convention (C) message categories.
# The `--errors-only` flag ensures that only error messages are displayed.
# The `*.py utils/*.py testing/*.py` arguments are glob patterns selecting the Python files pylint checks.
pylint --disable=R,C --errors-only *.py utils/*.py testing/*.py</code></pre>
<p><strong>Code linting.</strong> Linting is crucial for maintaining high-quality code. It helps catch errors and inconsistencies early on, reducing bugs and improving readability.</p>
<p>Why Lint?</p>
<ul>
<li>Reduced bugs: Catch errors before runtime.</li>
<li>Improved readability: Enforce consistent coding standards.</li>
<li>Faster development: Identify issues quickly.</li>
</ul>
<blockquote class="blockquote">
<p>Within your Makefile</p>
</blockquote>
<pre class="{bash}"><code>#| echo: false
# Recipe lines must start with a tab character
lint: activate_venv install format # These are prerequisites: that is they must be run first
	# flake8 or #pylint
	pylint --disable=R,C --errors-only *.py utils/*.py testing/*.py</code></pre>
<blockquote class="blockquote">
<p>In Terminal</p>
</blockquote>
<pre class="{bash}"><code>#| echo: false
make lint</code></pre>
<div class="callout-tip callout callout-style-default callout-captioned">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-caption-container flex-fill">
Tip
</div>
</div>
<div class="callout-body-container callout-body">
<p>Instead of memorizing long commands, you can store them in a Makefile and run them with a single command. For example, <code>make all</code> will run the <code>all</code> target together with every target it depends on. Also, see <a href="https://www.youtube.com/watch?v=2wSBAkJGcug">Continuous Integration/Continuous Deployment</a>.</p>
</div>
</div>
</section>
<section id="using-a-makefile-for-machine-learning-workflow" class="level3" data-number="2.2.2">
<h3 data-number="2.2.2" class="anchored" data-anchor-id="using-a-makefile-for-machine-learning-workflow"><span class="header-section-number">2.2.2</span> Using a Makefile for Machine Learning Workflow</h3>
<p>At this juncture, you are probably realizing how amazing a Makefile is. Get this: you can use it with any language you prefer for data science and machine learning. Here are more <a href="https://gist.github.com/Shuyib/ae87774fd82c69706803725db9a681dc">examples in Julia and R</a>.</p>
<p>Let’s create a Makefile to assist us with <strong>Making</strong> a machine learning workflow to help us handle the project better.</p>
<p>In the directory <code>datavc_makefile</code> we have a custom Makefile, written specifically for a machine learning project, that we can use to run our commands.</p>
<pre class="{makefile}"><code>#| echo: false
# Makefile for the machine learning workflow: environment setup, data import,
# cleaning, EDA, splitting, model evaluation and Docker packaging.
# NOTE: every recipe line below must start with a tab character.
# .DEFAULT_GOAL tells make which target to run when no target is specified
.DEFAULT_GOAL := all
# .PHONY tells make that these targets do not represent actual files
.PHONY: all install clean format lint docstring create_dirs activate_venv import_data clean_data eda split_data evaluate_model docker_build docker_run docker_clean
# run all commands
all: create_dirs install activate_venv import_data clean_data eda split_data evaluate_model
# Specify python location in virtual environment it ensures that the correct version of python is used
# Specify pip location in virtual environment it ensures that the correct version of pip is used
ORIGINAL_PY_VERSION := $(shell python3 --version)
PYTHON := .venv/bin/python3
PIP := .venv/bin/pip3
DOCKER_CONTAINER_NAME := ml_regression_workflow:v0.0.0
DATA_DIR := data/
OUTPUT_DIR := output/
MODEL_OUTPUT_DIR := model_output/
# The target name matches the file that `python3 -m venv .venv` creates.
# Each recipe line runs in its own shell, so the activation on the last line
# does not carry over; later rules call $(PYTHON) and $(PIP) directly.
.venv/bin/activate: requirements.txt
	# create virtual environment
	python3 -m venv .venv
	# make command executable
	chmod +x .venv/bin/activate
	# activate virtual environment
	. .venv/bin/activate
activate_venv:
	# activate virtual environment
	# run . .venv/bin/activate manually if it doesn't work
	@echo "Activating virtual environment"
	chmod +x activate_venv.sh
	./activate_venv.sh
install: .venv/bin/activate requirements.txt # prerequisite
	# install commands
	# This is step 1: install the virtual environment
	# Py version using py 3.10 from envname
	@echo "Python version: $(ORIGINAL_PY_VERSION)"
	@echo "Installing virtual environment"
	@echo "This is step 1: install the virtual environment"
	$(PIP) --no-cache-dir install --upgrade pip &&\
	$(PIP) --no-cache-dir install -r requirements.txt
docstring:
	# format docstring
	pyment -w -o numpydoc *.py
format:
	# format code
	black *.py
clean:
	@echo "Cleaning up"
	# clean directory of cache
	rm -rf __pycache__ &&\
	rm -rf utils/__pycache__ &&\
	rm -rf testing/__pycache__ &&\
	rm -rf .pytest_cache &&\
	rm -rf .venv
	rm -rf db
	rm -rf data
	rm -rf output
	rm -rf model_output
# The prerequisites (activate_venv, install, format) run before linting
lint: activate_venv install format
	# flake8 or #pylint
	pylint --disable=R,C --errors-only *.py utils/*.py testing/*.py
# Make sure the directories have been created
create_dirs:
	@echo "Creating directories"
	@echo "This is step 2: create directories"
	mkdir -p -v $(DATA_DIR)
	mkdir -p -v $(OUTPUT_DIR)
	mkdir -p -v $(MODEL_OUTPUT_DIR)
	@echo "Directories created"
	@echo "remember to follow these steps https://www.kaggle.com/discussions/general/74235"
import_data: create_dirs
	@echo "Importing data from Kaggle"
	@echo "This is step 3: import data"
	@echo "The data folder has a new dataset"
	@echo "Your task Can you accurately predict insurance costs? Regression problem"
	# make sure script is executable
	chmod +x import_data.sh
	# run script
	./import_data.sh
clean_data: import_data data/original_data/insurance.csv
	@echo "Cleaning data"
	@echo "This is step 4: clean data"
	@echo "The data folder has a cleaned dataset in data/transform"
	$(PYTHON) cleandata.py load_data --file_path data/original_data/insurance.csv
	$(PYTHON) cleandata.py summary --file_path data/original_data/insurance.csv
	$(PYTHON) cleandata.py check_missing --file_path data/original_data/insurance.csv
	$(PYTHON) cleandata.py check_duplicate --file_path data/original_data/insurance.csv
	$(PYTHON) cleandata.py encode_data --file_path data/original_data/insurance.csv --version 000
	@echo "Data cleaned"
eda: clean_data
	@echo "Performing EDA"
	@echo "This is step 5: EDA"
	@echo "The output folder has an EDA report in output/eda"
	$(PYTHON) eda.py --input data/transform/insurance_000.parquet --output output/eda_combined_plots.png
split_data: eda
	@echo "Splitting data"
	@echo "This is step 6: split data"
	@echo "The output folder has a split dataset in data/transform/validation"
	@echo "For train test split"
	$(PYTHON) split_data.py --data data/transform/insurance_000.parquet --strategy train_test_split --test_size 0.2
	@echo "For kfold split"
	#$(PYTHON) split_data.py --data data/transform/insurance_000.parquet --strategy kfold --test_size 0.2 --n_splits 5
evaluate_model: split_data
	@echo "Evaluating model"
	@echo "This is step 7: evaluate model"
	@echo "The output folder has a model evaluation in output/model_evaluation"
	$(PYTHON) evaluate.py --criterion squared_error --min_samples_leaf 10 --max_leaf_nodes 5 --degree 3
docker_build: requirements.txt Dockerfile
	@echo "Building docker image"
	sudo docker build -t $(DOCKER_CONTAINER_NAME) .
docker_run: docker_build
	@echo "Running docker container"
	sudo docker run -it --rm $(DOCKER_CONTAINER_NAME)
docker_clean:
	@echo "Cleaning up docker"
	sudo docker rmi $(DOCKER_CONTAINER_NAME)</code></pre>
<p>This Makefile encompasses the whole machine learning workflow. It is a great way to keep track of your work, and also to <code>Make</code> sure that you are following best practices. For example, it can encompass your development, testing, and deployment workflow based on software engineering principles. Furthermore, adding a Dockerfile improves the reproducibility of your work. You can run the commands in the Makefile by running <code>make all</code> in the terminal. If something goes wrong in one part of the workflow, the later parts of the workflow will not run. This helps us isolate any potential issues that may arise, and improves the reliability and maintainability of the project.</p>
<div class="callout-tip callout callout-style-default callout-captioned">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-caption-container flex-fill">
Tip
</div>
</div>
<div class="callout-body-container callout-body">
<p>$(PYTHON) is a variable that points to the Python interpreter you want to use (here, the one inside the virtual environment). This is important because you may have multiple versions of Python installed on your machine. This ensures that the correct version of Python is used.</p>
<p>$(PIP) is a variable that points to the pip executable you want to use (here, the one inside the virtual environment). This is important because you may have multiple versions of pip installed on your machine. This ensures that the correct version of pip is used.</p>
<p>It is also convenient that you can specify the $(DOCKER_CONTAINER_NAME) variable and easily change it for different versions of your project.</p>
</div>
</div>
<p>That’s it for the Makefile. Let’s move on to DVC.</p>
</section>
<section id="dvc" class="level3" data-number="2.2.3">
<h3 data-number="2.2.3" class="anchored" data-anchor-id="dvc"><span class="header-section-number">2.2.3</span> DVC</h3>
<p>DVC is another tool that can help you track your data science projects. Most of the time, it is used independently. But we thought: wouldn’t it be awesome if we combined Makefile + DVC? That’s what we did, and the gains are tremendous. With DVC, you can version control your data, models, and experiments. It allows you to track changes, collaborate with others, and reproduce your results. By integrating DVC with Makefile, you can automate your data science workflow and ensure that all the necessary steps are executed in the correct order. This combination provides a powerful and efficient way to manage your projects and make them more reproducible.</p>
<p>Here is a simple example of how you can use DVC with Makefile.</p>
<pre class="{makefile}"><code>#| echo: false
# Makefile that delegates the workflow stages to DVC (`dvc repro STAGE`) and
# adds targets for comparing metrics/hyperparameters and clearing the DVC cache.
# NOTE: every recipe line below must start with a tab character.
# .DEFAULT_GOAL tells make which target to run when no target is specified
.DEFAULT_GOAL := all
# .PHONY tells make that these targets do not represent actual files
.PHONY: all install clean format lint docstring init create_dirs activate_venv import_data clean_data eda split_data evaluate_model compare_metrics hyperparam_diff clear_cache docker_build docker_run
# run all commands
all:
	dvc repro
# Specify python location in virtual environment
# Specify pip location in virtual environment
ORIGINAL_PY_VERSION := $(shell python3 --version)
PYTHON := .venv/bin/python3
PIP := .venv/bin/pip3
# Docker image (repository) names must be lowercase
DOCKER_CONTAINER_NAME := ml_workflow:v0.0.0
DATA_DIR := data/
OUTPUT_DIR := output/
MODEL_OUTPUT_DIR := model_output/
# The target name matches the file that `python3 -m venv .venv` creates.
# Each recipe line runs in its own shell, so the activation on the last line
# does not carry over; later rules call $(PIP) directly.
.venv/bin/activate: requirements.txt
	# create virtual environment
	python3 -m venv .venv
	# make command executable
	chmod +x .venv/bin/activate
	# activate virtual environment
	. .venv/bin/activate
activate_venv:
	# activate virtual environment
	# run . .venv/bin/activate manually if it doesn't work
	@echo "Activating virtual environment"
	dvc repro activate_venv
install: .venv/bin/activate requirements.txt # prerequisite
	# install commands
	# This is step 1: install the virtual environment
	# Py version using py 3.10 from envname
	@echo "Python version: $(ORIGINAL_PY_VERSION)"
	@echo "Installing virtual environment"
	@echo "This is step 1: install the virtual environment"
	$(PIP) --no-cache-dir install --upgrade pip &&\
	$(PIP) --no-cache-dir install -r requirements.txt
docstring:
	# format docstring
	pyment -w -o numpydoc *.py
format:
	# format code
	black *.py
clean:
	@echo "Cleaning up"
	# clean directory of cache
	rm -rf __pycache__ &&\
	rm -rf utils/__pycache__ &&\
	rm -rf testing/__pycache__ &&\
	rm -rf .pytest_cache &&\
	rm -rf .venv
	rm -rf db
	rm -rf data
	rm -rf output
	rm -rf model_output
# The prerequisites (activate_venv, install, format) run before linting
lint: activate_venv install format
	# flake8 or #pylint
	pylint --disable=R,C --errors-only *.py utils/*.py testing/*.py
init:
	@echo "Initializing DVC"
	dvc init
# Make sure the directories have been created
create_dirs:
	@echo "Creating directories"
	@echo "This is step 2: create directories"
	dvc repro create_dirs
import_data:
	@echo "Importing data from Kaggle"
	@echo "This is step 3: import data"
	@echo "The data folder has a new dataset"
	@echo "Your task Can you accurately predict insurance costs? Regression problem"
	dvc repro import_data
clean_data: import_data data/original_data/insurance.csv
	@echo "Cleaning data"
	@echo "This is step 4: clean data"
	@echo "The data folder has a cleaned dataset in data/transform"
	dvc repro clean_data
eda:
	@echo "Performing EDA"
	@echo "This is step 5: EDA"
	@echo "The output folder has an EDA report in output/eda"
	dvc repro eda
split_data:
	@echo "Splitting data"
	@echo "This is step 6: split data"
	@echo "The output folder has a split dataset in data/transform/validation"
	@echo "For train test split"
	dvc repro split_data
evaluate_model:
	@echo "Evaluating model"
	@echo "This is step 7: evaluate model"
	@echo "The output folder has a model evaluation in output/model_evaluation"
	dvc repro evaluate_model
compare_metrics:
	@echo "Comparing metrics"
	@echo "This is step 8: compare metrics"
	@echo "The output folder has a model evaluation in output/model_evaluation"
	dvc metrics diff
hyperparam_diff:
	@echo "Comparing hyperparameters"
	@echo "This is step 9: compare hyperparameters"
	@echo "The output folder has a model evaluation in output/model_evaluation"
	dvc params diff
clear_cache:
	@echo "Clearing cache"
	@echo "This is step 10: clear cache"
	@echo "The output folder has a model evaluation in output/model_evaluation"
	rm -rf .dvc/cache
docker_build: requirements.txt Dockerfile
	@echo "Building docker image"
	sudo docker build -t $(DOCKER_CONTAINER_NAME) .
docker_run: docker_build
	@echo "Running docker container"
	sudo docker run -it --rm $(DOCKER_CONTAINER_NAME)</code></pre>
<p>The difference here is that the Makefile now includes DVC-specific commands: <code>dvc init</code>, <code>dvc repro</code>, <code>dvc metrics diff</code>, and <code>dvc params diff</code>, plus <code>rm -rf .dvc/cache</code>. These commands are used to initialize DVC so that it can track changes, reproduce the pipeline, compare metrics, compare hyperparameters, and clear the cache, respectively. The <code>dvc repro</code> command is used to reproduce the results of the workflow. This ensures that the workflow is executed in the correct order and that all the necessary steps are executed. The <code>dvc metrics diff</code> command is used to compare the metrics of different experiments. The <code>dvc params diff</code> command is used to compare the hyperparameters of different experiments. The <code>rm -rf .dvc/cache</code> command is used to clear the cache. This is important since the cache can take up a lot of space and slow down the workflow. By clearing the cache, you can free up space and speed up the workflow.</p>
</section>
<section id="conclusion" class="level3" data-number="2.2.4">
<h3 data-number="2.2.4" class="anchored" data-anchor-id="conclusion"><span class="header-section-number">2.2.4</span> Conclusion</h3>
<p>In conclusion, combining Makefile and DVC is a powerful way to manage your data science projects. It allows you to automate your workflow, track changes, collaborate with others, and reproduce your results. By using Makefile and DVC together, you can ensure that your projects are more reproducible, reliable, and maintainable. This can help you save time, reduce errors, and improve the quality of your work. So, next time you start a new data science project, consider using Makefile and DVC to manage your workflow. You won’t regret it.</p>
<div class="callout-tip callout callout-style-default callout-captioned">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-caption-container flex-fill">
Tip
</div>
</div>
<div class="callout-body-container callout-body">
<p>We recommend visiting the <a href="datavc_makefile/README.md">Makefile ML</a> and <a href="datavc_full/README.md">Makefile &amp; DVC</a> README files to see how to implement the ideas we have presented for the Makefile and for DVC.</p>
</div>
</div>
</section>
</section>
<section id="references" class="level2" data-number="2.3">
<h2 data-number="2.3" class="anchored" data-anchor-id="references"><span class="header-section-number">2.3</span> References</h2>
<p>1. DVC documentation: <a href="https://dvc.org/doc" class="uri">https://dvc.org/doc</a><br>
2. DVC YouTube channel: <a href="https://www.youtube.com/playlist?list=PL7WG7YrwYcnDb0qdPl9-KEStsL-3oaEjg" class="uri">https://www.youtube.com/playlist?list=PL7WG7YrwYcnDb0qdPl9-KEStsL-3oaEjg</a><br>
3. Pragmatic AI Labs: <a href="https://youtu.be/rKRG6oQf-bQ?si=4BzXMhS7owl6uWef" class="uri">https://youtu.be/rKRG6oQf-bQ?si=4BzXMhS7owl6uWef</a><br>
4. Kaggle notebook by Dandelion: <a href="https://www.kaggle.com/code/hely333/eda-regression" class="uri">https://www.kaggle.com/code/hely333/eda-regression</a><br>
5. Predicting Chronic Kidney Disease: <a href="https://github.com/Shuyib/chronic-kidney-disease-kaggle" class="uri">https://github.com/Shuyib/chronic-kidney-disease-kaggle</a></p>
</section>
</section>
</main>
<!-- /main column -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
// Mirror the stylesheet's `data-mode` attribute onto <body>: a value of
// "dark" yields the `quarto-dark` class, anything else yields `quarto-light`.
// The opposite class is removed so only one of the two is ever present.
const toggleBodyColorMode = (bsSheetEl) => {
  const isDark = bsSheetEl.getAttribute("data-mode") === "dark";
  const bodyClasses = window.document.querySelector("body").classList;
  const [applied, cleared] = isDark
    ? ["quarto-dark", "quarto-light"]
    : ["quarto-light", "quarto-dark"];
  bodyClasses.add(applied);
  bodyClasses.remove(cleared);
};
// Sync the body colour class with the primary Bootstrap stylesheet
// (`link#quarto-bootstrap`). Does nothing when that <link> is not on the page.
const toggleBodyColorPrimary = () => {
  const primarySheet = window.document.querySelector("link#quarto-bootstrap");
  if (!primarySheet) {
    return;
  }
  toggleBodyColorMode(primarySheet);
};
// Apply once as soon as the DOM is ready.
toggleBodyColorPrimary();
const icon = "";
// Configure AnchorJS to place its anchor on the right-hand side using the
// `icon` glyph declared above, then apply it to every `.anchored` element.
const anchorJS = new window.AnchorJS();
anchorJS.options = { placement: 'right', icon };
anchorJS.add('.anchored');
// Wire ClipboardJS to every `.code-copy-button`; the copy target is the
// element immediately preceding the button that was clicked.
const clipboard = new window.ClipboardJS('.code-copy-button', {
  target: (trigger) => trigger.previousElementSibling
});
// After a successful copy: flash a "checked" state on the button, show a
// "Copied!" tooltip when Bootstrap is available, and restore the button to
// its previous state one second later.
clipboard.on('success', (e) => {
  // The button that triggered the copy.
  const copyButton = e.trigger;
  // Release focus so the button does not stay highlighted.
  copyButton.blur();
  // Switch to the "checked" look and remember the title to restore later.
  copyButton.classList.add('code-copy-button-checked');
  const previousTitle = copyButton.getAttribute("title");
  copyButton.setAttribute("title", "Copied!");

  // The tooltip is optional: it is only created when Bootstrap is loaded.
  let tooltip;
  if (window.bootstrap) {
    copyButton.setAttribute("data-bs-toggle", "tooltip");
    copyButton.setAttribute("data-bs-placement", "left");
    copyButton.setAttribute("data-bs-title", "Copied!");
    tooltip = new bootstrap.Tooltip(copyButton, {
      trigger: "manual",
      customClass: "code-copy-button-tooltip",
      offset: [0, -8]
    });
    tooltip.show();
  }

  // Undo everything above once the feedback has been visible for a second.
  const restoreButton = () => {
    if (tooltip) {
      tooltip.hide();
      copyButton.removeAttribute("data-bs-title");
      copyButton.removeAttribute("data-bs-toggle");
      copyButton.removeAttribute("data-bs-placement");
    }
    copyButton.setAttribute("title", previousTitle);
    copyButton.classList.remove('code-copy-button-checked');
  };
  setTimeout(restoreButton, 1000);

  // Clear the text selection left behind by the copy.
  e.clearSelection();
});
// Attach a tippy popup to `el`. `contentFn` is handed to tippy as the
// `content` option; the popup is appended to the parent of the element that
// tippy passes to `appendTo`.
function tippyHover(el, contentFn) {
  window.tippy(el, {
    theme: 'quarto',
    placement: 'bottom-start',
    content: contentFn,
    allowHTML: true,
    maxWidth: 500,
    delay: 100,
    arrow: false,
    interactive: true,
    interactiveBorder: 10,
    appendTo: (reference) => reference.parentElement
  });
}
// Give every footnote reference a hover popup showing the footnote's HTML.
// The footnote is looked up by the fragment of the link's
// `data-footnote-href` (preferred) or `href` attribute.
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
noterefs.forEach((ref) => {
  tippyHover(ref, () => {
    // Fall back to an empty string so a link without either attribute cannot
    // make `href.replace` throw below.
    let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href') || "";
    // Absolute URLs are reduced to their hash; anything `URL` cannot parse
    // (e.g. a bare "#fn1") is kept as-is.
    try { href = new URL(href).hash; } catch {}
    // Strip the leading "#" or "#/" to obtain the element id.
    const id = href.replace(/^#\/?/, "");
    const note = window.document.getElementById(id);
    // A dangling reference yields an empty popup instead of a TypeError
    // thrown from inside the tooltip content callback.
    return note ? note.innerHTML : "";
  });
});
// Walk up from `el` until an element is found whose parent carries a
// non-empty `data-cites` attribute. Returns `{ el, cites }`, where `el` is
// that child element and `cites` is the attribute split on single spaces,
// or `undefined` when the top of the tree is reached without a match.
const findCites = (el) => {
  let current = el;
  while (current.parentElement) {
    const citeAttr = current.parentElement.dataset.cites;
    if (citeAttr) {
      return { el: current, cites: citeAttr.split(' ') };
    }
    current = current.parentElement;
  }
  return undefined;
};
// Give every bibliography citation a hover popup that lists the entries it
// cites. The popup is attached to the element returned by `findCites`;
// citation links for which `findCites` finds nothing are skipped.
const bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
bibliorefs.forEach((biblioLink) => {
  const citeInfo = findCites(biblioLink);
  if (!citeInfo) {
    return;
  }
  tippyHover(citeInfo.el, () => {
    // Build the popup in a detached container and return its markup.
    const popup = window.document.createElement('div');
    for (const citeKey of citeInfo.cites) {
      const entry = window.document.createElement('div');
      entry.classList.add('hanging-indent', 'csl-entry');
      // Copy the rendered entry when `ref-<key>` exists; otherwise the
      // (empty) div is still appended.
      const rendered = window.document.getElementById('ref-' + citeKey);
      if (rendered) {
        entry.innerHTML = rendered.innerHTML;
      }
      popup.appendChild(entry);
    }
    return popup.innerHTML;
  });
});
});
</script>
</div> <!-- /content -->
</body></html>