data_version_control_presentation.html

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>

<meta charset="utf-8">
<meta name="generator" content="quarto-1.2.269">

<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">

<meta name="author" content="Mainye B">

<title>Data Version Control</title>
<style>
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
div.columns{display: flex; gap: min(4vw, 1.5em);}
div.column{flex: auto; overflow-x: auto;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
ul.task-list li input[type="checkbox"] {
  width: 0.8em;
  margin: 0 0.8em 0.2em -1.6em;
  vertical-align: middle;
}
</style>


<script src="data_version_control_presentation_files/libs/clipboard/clipboard.min.js"></script>
<script src="data_version_control_presentation_files/libs/quarto-html/quarto.js"></script>
<script src="data_version_control_presentation_files/libs/quarto-html/popper.min.js"></script>
<script src="data_version_control_presentation_files/libs/quarto-html/tippy.umd.min.js"></script>
<script src="data_version_control_presentation_files/libs/quarto-html/anchor.min.js"></script>
<link href="data_version_control_presentation_files/libs/quarto-html/tippy.css" rel="stylesheet">
<link href="data_version_control_presentation_files/libs/quarto-html/quarto-syntax-highlighting-dark.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<script src="data_version_control_presentation_files/libs/bootstrap/bootstrap.min.js"></script>
<link href="data_version_control_presentation_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="data_version_control_presentation_files/libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="dark">


</head>

<body>

<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
  <nav id="TOC" role="doc-toc" class="toc-active">
    <h2 id="toc-title">Table of contents</h2>
   
  <ul>
  <li><a href="#what-is-it" id="toc-what-is-it" class="nav-link active" data-scroll-target="#what-is-it"><span class="toc-section-number">1</span>  What is it?</a></li>
  <li><a href="#why-is-it-important" id="toc-why-is-it-important" class="nav-link" data-scroll-target="#why-is-it-important"><span class="toc-section-number">2</span>  Why is it important?</a>
  <ul class="collapse">
  <li><a href="#needs" id="toc-needs" class="nav-link" data-scroll-target="#needs"><span class="toc-section-number">2.1</span>  Needs</a>
  <ul class="collapse">
  <li><a href="#data-examples" id="toc-data-examples" class="nav-link" data-scroll-target="#data-examples"><span class="toc-section-number">2.1.1</span>  Data Examples</a></li>
  <li><a href="#data-science-process" id="toc-data-science-process" class="nav-link" data-scroll-target="#data-science-process"><span class="toc-section-number">2.1.2</span>  Data science process</a></li>
  </ul></li>
  <li><a href="#try-something-different-with-dvc-and-makefiles" id="toc-try-something-different-with-dvc-and-makefiles" class="nav-link" data-scroll-target="#try-something-different-with-dvc-and-makefiles"><span class="toc-section-number">2.2</span>  Try something different with DVC and Makefiles</a>
  <ul class="collapse">
  <li><a href="#makefile" id="toc-makefile" class="nav-link" data-scroll-target="#makefile"><span class="toc-section-number">2.2.1</span>  Makefile</a></li>
  <li><a href="#using-a-makefile-for-machine-learning-workflow" id="toc-using-a-makefile-for-machine-learning-workflow" class="nav-link" data-scroll-target="#using-a-makefile-for-machine-learning-workflow"><span class="toc-section-number">2.2.2</span>  Using a Makefile for Machine Learning Workflow</a></li>
  <li><a href="#dvc" id="toc-dvc" class="nav-link" data-scroll-target="#dvc"><span class="toc-section-number">2.2.3</span>  DVC</a></li>
  <li><a href="#conclusion" id="toc-conclusion" class="nav-link" data-scroll-target="#conclusion"><span class="toc-section-number">2.2.4</span>  Conclusion</a></li>
  </ul></li>
  <li><a href="#references" id="toc-references" class="nav-link" data-scroll-target="#references"><span class="toc-section-number">2.3</span>  References</a></li>
  </ul></li>
  </ul>
</nav>
</div>
<main class="content" id="quarto-document-content">

<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title">Data Version Control</h1>
</div>


<div class="quarto-title-meta">

    <div>
    <div class="quarto-title-meta-heading">Author</div>
    <div class="quarto-title-meta-contents">
             <p>Mainye B </p>
          </div>
  </div>
    
  
  </div>
  

</header>

<section id="what-is-it" class="level1" data-number="1">
<h1 data-number="1"><span class="header-section-number">1</span> What is it?</h1>
<p>Data version control is a way of keeping a reproducible journal so that your data science workflow can be replicated. When you work in a team, everyone has their own way of doing things, so how can we reach a consensus on a unified way of working together so that we don’t step on each other’s toes? And is there a way to make data science projects easier to manage and easier to track? We will discuss both questions in this presentation.</p>
<p>There are several tools that have been created to address this problem. They include the following:</p>
<ul>
<li><a href="https://dvc.org/">DVC</a></li>
<li><a href="https://mlflow.org/">MLflow</a></li>
<li><a href="https://neptune.ai/">neptune.ai</a></li>
<li><a href="https://delta.io/">Delta Lake</a></li>
<li><a href="https://metaflow.org/">Metaflow</a></li>
</ul>
<div class="callout callout callout-style-simple no-icon">
<div class="callout-body d-flex">
<div class="callout-icon-container">
<i class="callout-icon no-icon"></i>
</div>
<div class="callout-body-container">
<p>We’ll go through DVC, and Makefiles. Great Expectations is another tool that can be used to validate data.</p>
</div>
</div>
</div>
</section>
<section id="why-is-it-important" class="level1" data-number="2">
<h1 data-number="2"><span class="header-section-number">2</span> Why is it important?</h1>
<p>As professionals who have worked on various projects in data science and machine learning, we have discovered that the path from idea to product needs a frictionless workflow. This allows us to focus on implementing ideas rather than handling all that goes on in the background.</p>
<p>It is important mostly because it can get very confusing when handling projects and keeping track of our experiments. In data science, we don’t have predefined outputs. We can create reports, dashboards, applications, and APIs. There are so many things that go into that process, such as data importing, exploratory data analysis, feature engineering, and modeling. Each of these steps can take different routes to reach our destination.</p>
<p><img src="https://unsplash.com/photos/aerial-photography-of-road-zS4lUqLEiNA" title="Title: Aerial Photography by Jack Anstey" alt="Aerial Photography by Jack Anstey"></p>
<section id="needs" class="level2" data-number="2.1">
<h2 data-number="2.1" class="anchored" data-anchor-id="needs"><span class="header-section-number">2.1</span> Needs</h2>
<ul>
<li>How can we track different parts of our work?</li>
<li>How can we record hyperparameters for different versions of our experiments?</li>
<li>How can we store metadata of our projects, such as models and slices of data?</li>
<li>How can we unify and organize metrics?</li>
<li>Can I fully replicate their work or at least a significant portion of it?</li>
</ul>
<blockquote class="blockquote">
<p>All of the solutions mentioned above can help address these challenges and can improve your workflows.</p>
</blockquote>
<section id="data-examples" class="level3" data-number="2.1.1">
<h3 data-number="2.1.1" class="anchored" data-anchor-id="data-examples"><span class="header-section-number">2.1.1</span> Data Examples</h3>
<p>We will be using two datasets for this presentation. The first dataset is the Medical Cost Personal Datasets. This dataset contains information about the medical costs of individuals. The second dataset is the Telco dataset. This dataset contains information about the customers of a telecommunications company. Both datasets are available on Kaggle.</p>
<p>We recommend visiting the <a href="https://www.kaggle.com/">Kaggle website</a> to download the datasets and explore them further, and then trying out the ideas from this presentation on the second dataset.</p>
<div class="callout-important callout callout-style-default callout-captioned">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-caption-container flex-fill">
Important
</div>
</div>
<div class="callout-body-container callout-body">
<p><strong><a href="https://www.kaggle.com/datasets/mirichoi0218/insurance">Medical Cost Personal Datasets</a></strong></p>
</div>
</div>
<div class="callout-tip callout callout-style-default callout-captioned">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-caption-container flex-fill">
Tip
</div>
</div>
<div class="callout-body-container callout-body">
<p><strong><a href="https://www.kaggle.com/datasets/blastchar/telco-customer-churn">Telco dataset</a></strong></p>
</div>
</div>
<p>The Telco dataset has a number of observations and measurements that are crucial for its prediction task: predicting churn. Churn is when a client stops using the telecommunications company’s services, and the task is to predict how likely each client is to do so.</p>
<p>Other very common metrics and tasks that you may be asked to work on in a data science team include:</p>
<table class="table">
<thead>
<!-- scope="col" ties each header cell to the data cells of its column for assistive technology -->
<tr class="header">
<th scope="col">Metric</th>
<th scope="col" style="text-align: left;">Explanation</th>
<th scope="col" style="text-align: right;">Associated link</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>Hypothesis testing</td>
<td style="text-align: left;">Making the website better via focus group testing.</td>
<td style="text-align: right;"><a href="https://medium.com/@gajendra.k.s/hypothesis-testing-33aaeeff5336">Hypothesis testing article (Medium)</a></td>
</tr>
<tr class="even">
<td>Conversion rate</td>
<td style="text-align: left;">the percentage of visitors or leads who go from discovery to becoming paying customers.</td>
<td style="text-align: right;"><a href="https://www.geeksforgeeks.org/conversion-rate-what-is-it-how-to-calculate-it/">Conversion rate guide (GeeksforGeeks)</a></td>
</tr>
<tr class="odd">
<td>Customer lifetime value (LTV)</td>
<td style="text-align: left;">how much revenue a client will generate over their lifetime as a customer.</td>
<td style="text-align: right;"><a href="https://www.datacamp.com/tutorial/customer-life-time-value">Customer lifetime value tutorial (DataCamp)</a></td>
</tr>
<tr class="even">
<td>Recommendation systems</td>
<td style="text-align: left;">how can we cross-sell our existing products better?</td>
<td style="text-align: right;"><a href="https://medium.com/@Karthickk_Rajah/clustering-based-algorithms-in-recommendation-system-205fcb15bc9b">Clustering-based recommendation systems (Medium)</a></td>
</tr>
<tr class="odd">
<td>Optimization</td>
<td style="text-align: left;">adjusting the cost of a product; this involves using specific techniques to find the maximum or minimum value of something in order to improve revenue.</td>
<td style="text-align: right;"><a href="https://towardsdatascience.com/production-fixed-horizon-planning-with-python-8dd38b468e86">Fixed-horizon production planning with Python (Towards Data Science)</a></td>
</tr>
</tbody>
</table>
</section>
<section id="data-science-process" class="level3" data-number="2.1.2">
<h3 data-number="2.1.2" class="anchored" data-anchor-id="data-science-process"><span class="header-section-number">2.1.2</span> Data science process</h3>
<p>We will be referencing a cool notebook that someone in the Kaggle community has shared. Here’s the original <a href="https://www.kaggle.com/code/hely333/eda-regression">notebook</a>.</p>
<p>The author did a really cool job. However, I wish more one-hot encoding had been done and that techniques such as OneR had been explored. We’ll explore that later. For the moment, let’s turn our attention to the data science process.</p>
<div id="fig-datasci" class="quarto-layout-panel">
<figure class="figure">
<div class="quarto-layout-row quarto-layout-valign-top">
<div class="quarto-figure quarto-figure-center" style="flex-basis: 50.0%;justify-content: center;">
<figure class="figure">
<p><a href="https://www.manning.com/books/data-science-with-python-and-dask" id="fig-process"><img src="Screenshot%20from%202023-02-13-10-57-10.png" class="img-fluid figure-img" alt="Data science process diagram, linked to the book Data Science with Python and Dask"></a></p>
<p></p><figcaption class="figure-caption">Data science process</figcaption><p></p>
</figure>
</div>
<div class="quarto-layout-cell" style="flex-basis: 50.0%;justify-content: center;">
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="Screenshot%20from%202023-02-13-10-57-41.png" class="img-fluid figure-img" width="100" alt="Transforming data"></p>
<p></p><figcaption class="figure-caption">Transforming-data</figcaption><p></p>
</figure>
</div>
</div>
</div>
<p></p><figcaption class="figure-caption">Figure&nbsp;1: What is done in data science</figcaption><p></p>
</figure>
</div>
<p>As you can see above, we transform data into various forms that help us understand it better. We can then use it to make predictions, make recommendations, and optimize our products.</p>
<p>Oftentimes you can simply make a notebook, and your work is done. There are also tools that let you rerun notebooks on a schedule: <a href="https://www.kaggle.com/discussions/getting-started/293861">scheduled notebook reruns</a> on Kaggle, <a href="https://papermill.readthedocs.io/en/latest/">papermill</a>, and <a href="https://towardsdatascience.com/how-to-schedule-jupyter-notebooks-in-amazon-sagemaker-d50fa1c8c0ad">SageMaker</a>.</p>
</section>
</section>
<section id="try-something-different-with-dvc-and-makefiles" class="level2" data-number="2.2">
<h2 data-number="2.2" class="anchored" data-anchor-id="try-something-different-with-dvc-and-makefiles"><span class="header-section-number">2.2</span> Try something different with DVC and Makefiles</h2>
<section id="makefile" class="level3" data-number="2.2.1">
<h3 data-number="2.2.1" class="anchored" data-anchor-id="makefile"><span class="header-section-number">2.2.1</span> Makefile</h3>
<p>On most Unix-like systems (macOS and Linux) you’ll find that the <code>make</code> command is already installed. If not, it is very easy to install.</p>
<div class="callout-tip callout callout-style-default callout-captioned">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-caption-container flex-fill">
Tip
</div>
</div>
<div class="callout-body-container callout-body">
<p>How to install (Debian/Ubuntu)</p>
<pre class="{bash}"><code># update packages
sudo apt-get update
# just say yes to make
sudo apt-get -y install make
# what version was installed
make -v</code></pre>
</div>
</div>
<p>Makefiles make it easy to hide the complexity of the commands you need to run in order to follow best practices. As an example:</p>
<blockquote class="blockquote">
<p>Running in bash</p>
</blockquote>
<pre class="{bash}"><code>#| echo: false

# This code runs the pylint tool with specific configurations to check for errors in Python files.
# The `--disable=R,C` flag disables pylint's refactoring (R) and convention (C) message categories.
# The `--errors-only` flag ensures that only error messages are displayed.
# The `*.py utils/*.py testing/*.py` argument specifies the files and directories to be checked by pylint.
pylint --disable=R,C --errors-only *.py utils/*.py testing/*.py</code></pre>
<p>Code linting is crucial for maintaining high-quality code. It helps catch errors and inconsistencies early on, reducing bugs and improving readability.</p>
<p>Why Lint?</p>
<ul>
<li>Reduced bugs: Catch errors before runtime.</li>
<li>Improved readability: Enforce consistent coding standards.</li>
<li>Faster development: Identify issues quickly.</li>
</ul>
<blockquote class="blockquote">
<p>Within your Makefile</p>
</blockquote>
<pre class="{bash}"><code>#| echo: false
# lint the code; activate_venv, install and format are the targets that run first
lint: activate_venv install format # These are prerequisites: that is they must be run first
    # flake8 or #pylint
    pylint --disable=R,C --errors-only *.py utils/*.py testing/*.py</code></pre>
<blockquote class="blockquote">
<p>In Terminal</p>
</blockquote>
<pre class="{bash}"><code>#| echo: false
make lint</code></pre>
<div class="callout-tip callout callout-style-default callout-captioned">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-caption-container flex-fill">
Tip
</div>
</div>
<div class="callout-body-container callout-body">
<p>Instead of memorizing long commands you can store them in a Makefile and run them with a single command. For example, <code>make all</code> runs the <code>all</code> target together with every target it lists as a prerequisite. Makefiles also work well in <a href="https://www.youtube.com/watch?v=2wSBAkJGcug">Continuous Integration/Continuous Deployment</a> pipelines.</p>
</div>
</div>
</section>
<section id="using-a-makefile-for-machine-learning-workflow" class="level3" data-number="2.2.2">
<h3 data-number="2.2.2" class="anchored" data-anchor-id="using-a-makefile-for-machine-learning-workflow"><span class="header-section-number">2.2.2</span> Using a Makefile for Machine Learning Workflow</h3>
<p>At this juncture, you are probably realizing how useful a Makefile is. Better still, you can use it with any language you prefer for data science and machine learning. Here are more <a href="https://gist.github.com/Shuyib/ae87774fd82c69706803725db9a681dc">examples in Julia and R</a>.</p>
<p>Let’s create a Makefile to assist us with <strong>Making</strong> a machine learning workflow that helps us handle the project better.</p>
<p>In the directory <code>datavc_makefile</code> we have a custom Makefile, written specifically for a machine learning project, that we can use to run our commands.</p>
<pre class="{makefile}"><code>#| echo: false
# .DEFAULT_GOAL tells make which target to run when no target is specified
.DEFAULT_GOAL := all

# .PHONY tells make that these targets do not represent actual files
.PHONY: all install clean format lint create_dirs activate_venv import_data clean_data eda split_data evaluate_model

# run all commands
all: create_dirs install activate_venv import_data clean_data eda split_data evaluate_model

# Specify python location in virtual environment it ensures that the correct version of python is used
# Specify pip location in virtual environment it ensures that the correct version of pip is used
ORIGINAL_PY_VERSION := $(shell python3 --version)
PYTHON := .venv/bin/python3
PIP := .venv/bin/pip3
DOCKER_CONTAINER_NAME := ml_regression_workflow:v0.0.0
DATA_DIR := data/
OUTPUT_DIR := output/
MODEL_OUTPUT_DIR := model_output/


# The target is the file that `python3 -m venv .venv` creates, so make can tell
# whether the virtual environment already exists and is newer than requirements.txt.
# Each recipe line runs in its own shell, so the activation on the last line does
# not carry over to other recipes; they call $(PYTHON) and $(PIP) directly instead.
.venv/bin/activate: requirements.txt
    # create virtual environment
    python3 -m venv .venv
    # make command executable
    chmod +x .venv/bin/activate
    # activate virtual environment
    . .venv/bin/activate

activate_venv:
    # activate virtual environment
    # run . .venv/bin/activate manually if it doesn't work
    @echo "Activating virtual environment"
    chmod +x activate_venv.sh
    ./activate_venv.sh


# upgrade pip, then install the packages listed in requirements.txt into the virtual environment
install: .venv/bin/activate requirements.txt # prerequisite
    # install commands
    # This is step 1: install the virtual environment
    # Py version using py 3.10 from envname
    @echo "Python version: $(ORIGINAL_PY_VERSION)"
    @echo "Installing virtual environment"
    @echo "This is step 1: install the virtual environment"
    $(PIP) --no-cache-dir install --upgrade pip &amp;&amp;\
        $(PIP) --no-cache-dir install -r requirements.txt

docstring: 
    # format docstring
    pyment -w -o numpydoc *.py
  
format: 
    # format code
    black *.py

clean:
    @echo "Cleaning up"
    # clean directory of cache
    rm -rf __pycache__ &amp;&amp;\
    rm -rf utils/__pycache__ &amp;&amp;\
    rm -rf testing/__pycache__ &amp;&amp;\
    rm -rf .pytest_cache &amp;&amp;\
    rm -rf .venv
    rm -rf db
    rm -rf data
    rm -rf output
    rm -rf model_output


# lint the code; activate_venv, install and format are the targets that run first
lint: activate_venv install format
    # flake8 or #pylint
    pylint --disable=R,C --errors-only *.py utils/*.py testing/*.py

# Make sure the directories have been created
create_dirs:
    @echo "Creating directories"
    @echo "This is step 2: create directories"
    mkdir -p -v $(DATA_DIR)
    mkdir -p -v $(OUTPUT_DIR)
    mkdir -p -v $(MODEL_OUTPUT_DIR)
    @echo "Directories created"
    @echo "remember to follow these steps https://www.kaggle.com/discussions/general/74235"

import_data: create_dirs 
    @echo "Importing data from Kaggle"
    @echo "This is step 3: import data"
    @echo "The data folder has a new dataset"
    @echo "Your task Can you accurately predict insurance costs? Regression problem"
    # make sure script is executable
    chmod +x import_data.sh
    # run script
    ./import_data.sh

clean_data: import_data data/original_data/insurance.csv
    @echo "Cleaning data"
    @echo "This is step 4: clean data"
    @echo "The data folder has a cleaned dataset in data/transform"
    $(PYTHON) cleandata.py load_data --file_path data/original_data/insurance.csv
    $(PYTHON) cleandata.py summary --file_path data/original_data/insurance.csv
    $(PYTHON) cleandata.py check_missing --file_path data/original_data/insurance.csv
    $(PYTHON) cleandata.py check_duplicate --file_path data/original_data/insurance.csv
    $(PYTHON) cleandata.py encode_data --file_path data/original_data/insurance.csv --version 000
    @echo "Data cleaned"

eda: clean_data
    @echo "Performing EDA"
    @echo "This is step 5: EDA"
    @echo "The output folder has an EDA report in output/eda"
    $(PYTHON) eda.py --input data/transform/insurance_000.parquet --output output/eda_combined_plots.png

split_data: eda
    @echo "Splitting data"
    @echo "This is step 6: split data"
    @echo "The output folder has a split dataset in data/transform/validation"
    @echo "For train test split"
    $(PYTHON) split_data.py --data data/transform/insurance_000.parquet --strategy train_test_split --test_size 0.2
    @echo "For kfold split"
    #$(PYTHON) split_data.py --data data/transform/insurance_000.parquet --strategy kfold --test_size 0.2 --n_splits 5

evaluate_model: split_data
    @echo "Evaluating model"
    @echo "This is step 7: evaluate model"
    @echo "The output folder has a model evaluation in output/model_evaluation"
    $(PYTHON) evaluate.py --criterion squared_error --min_samples_leaf 10 --max_leaf_nodes 5 --degree 3

docker_build: requirements.txt Dockerfile
    @echo "Building docker image"
    sudo docker build -t $(DOCKER_CONTAINER_NAME) .

docker_run: docker_build
    @echo "Running docker container"
    sudo docker run -it --rm $(DOCKER_CONTAINER_NAME)

docker_clean:
    @echo "Cleaning up docker"
    sudo docker rmi $(DOCKER_CONTAINER_NAME)</code></pre>
<p>This Makefile encompasses the whole machine learning workflow. It is a great way to keep track of your work, and also to <code>Make</code> sure that you are following best practices. For example, it can encompass your development, testing, and deployment workflow based on software engineering principles. Adding a Dockerfile further improves the reproducibility of your work. You can run the commands in the Makefile by running <code>make all</code> in the terminal. If a step of the workflow fails, the steps that come after it will not run. This helps us isolate any potential issues that may arise, and improves the reliability and maintainability of the project.</p>
<div class="callout-tip callout callout-style-default callout-captioned">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-caption-container flex-fill">
Tip
</div>
</div>
<div class="callout-body-container callout-body">
<p>$(PYTHON) is a variable that holds the path to the Python interpreter inside the virtual environment (<code>.venv/bin/python3</code>). This is important because you may have multiple versions of Python installed on your machine. Using the variable ensures that the correct version of Python is used.</p>
<p>$(PIP) is a variable that holds the path to pip inside the virtual environment (<code>.venv/bin/pip3</code>). This is important because you may have multiple versions of pip installed on your machine. Using the variable ensures that the correct version of pip is used.</p>
<p>It is also convenient that you can specify the $(DOCKER_CONTAINER_NAME) variable and easily change it for different versions of your project.</p>
</div>
</div>
<p>That’s it for the Makefile. Let’s move on to DVC.</p>
</section>
<section id="dvc" class="level3" data-number="2.2.3">
<h3 data-number="2.2.3" class="anchored" data-anchor-id="dvc"><span class="header-section-number">2.2.3</span> DVC</h3>
<p>DVC is another tool that can help you track your data science projects. Most of the time, it is used independently. But we thought: wouldn’t it be awesome if we combined Makefile + DVC? That’s what we did, and the gains are tremendous. With DVC, you can version control your data, models, and experiments. It allows you to track changes, collaborate with others, and reproduce your results. By integrating DVC with Makefile, you can automate your data science workflow and ensure that all the necessary steps are executed in the correct order. This combination provides a powerful and efficient way to manage your projects and make them more reproducible.</p>
<p>Here is a simple example of how you can use DVC with Makefile.</p>
<pre class="{makefile}"><code>#| echo: false
# .DEFAULT_GOAL tells make which target to run when no target is specified
.DEFAULT_GOAL := all

# .PHONY tells make that these targets do not represent actual files
.PHONY: all install clean format lint create_dirs activate_venv import_data clean_data eda split_data evaluate_model

# run all commands
all: 
    dvc repro

# Specify python location in virtual environment
# Specify pip location in virtual environment
ORIGINAL_PY_VERSION := $(shell python3 --version)
PYTHON := .venv/bin/python3
PIP := .venv/bin/pip3
# Used as the image tag for docker build and docker run.
# Docker image names must be lowercase, otherwise docker rejects the reference.
DOCKER_CONTAINER_NAME := ml_workflow:v0.0.0
DATA_DIR := data/
OUTPUT_DIR := output/
MODEL_OUTPUT_DIR := model_output/


# The target is the file that `python3 -m venv .venv` creates, so make can tell
# whether the virtual environment already exists and is newer than requirements.txt.
# Each recipe line runs in its own shell, so the activation on the last line does
# not carry over to other recipes; they call $(PYTHON) and $(PIP) directly instead.
.venv/bin/activate: requirements.txt
    # create virtual environment
    python3 -m venv .venv
    # make command executable
    chmod +x .venv/bin/activate
    # activate virtual environment
    . .venv/bin/activate

activate_venv:
    # activate virtual environment
    # run . .venv/bin/activate manually if it doesn't work
    @echo "Activating virtual environment"
    dvc repro activate_venv


# upgrade pip, then install the packages listed in requirements.txt into the virtual environment
install: .venv/bin/activate requirements.txt # prerequisite
    # install commands
    # This is step 1: install the virtual environment
    # Py version using py 3.10 from envname
    @echo "Python version: $(ORIGINAL_PY_VERSION)"
    @echo "Installing virtual environment"
    @echo "This is step 1: install the virtual environment"
    $(PIP) --no-cache-dir install --upgrade pip &amp;&amp;\
        $(PIP) --no-cache-dir install -r requirements.txt
docstring: 
    # format docstring
    pyment -w -o numpydoc *.py
  
format: 
    # format code
    black *.py

clean:
    @echo "Cleaning up"
    # clean directory of cache
    rm -rf __pycache__ &amp;&amp;\
    rm -rf utils/__pycache__ &amp;&amp;\
    rm -rf testing/__pycache__ &amp;&amp;\
    rm -rf .pytest_cache &amp;&amp;\
    rm -rf .venv
    rm -rf db
    rm -rf data
    rm -rf output
    rm -rf model_output
    

# lint the code; activate_venv, install and format are the targets that run first
lint: activate_venv install format
    # flake8 or #pylint
    pylint --disable=R,C --errors-only *.py utils/*.py testing/*.py

init:
    @echo "Initializing DVC"
    dvc init

# Make sure the directories have been created
create_dirs:
    @echo "Creating directories"
    @echo "This is step 2: create directories"
    dvc repro create_dirs

import_data: 
    @echo "Importing data from Kaggle"
    @echo "This is step 3: import data"
    @echo "The data folder has a new dataset"
    @echo "Your task Can you accurately predict insurance costs? Regression problem"
    dvc repro import_data

clean_data: import_data data/original_data/insurance.csv
    @echo "Cleaning data"
    @echo "This is step 4: clean data"
    @echo "The data folder has a cleaned dataset in data/transform"
    dvc repro clean_data

eda:
    @echo "Performing EDA"
    @echo "This is step 5: EDA"
    @echo "The output folder has an EDA report in output/eda"
    dvc repro eda
    
split_data: 
    @echo "Splitting data"
    @echo "This is step 6: split data"
    @echo "The output folder has a split dataset in data/transform/validation"
    @echo "For train test split"
    dvc repro split_data
evaluate_model: 
    @echo "Evaluating model"
    @echo "This is step 7: evaluate model"
    @echo "The output folder has a model evaluation in output/model_evaluation"
    dvc repro evaluate_model

compare_metrics:
    @echo "Comparing metrics"
    @echo "This is step 8: compare metrics"
    @echo "The output folder has a model evaluation in output/model_evaluation"
    dvc metrics diff

hyperparam_diff:
    @echo "Comparing hyperparameters"
    @echo "This is step 9: compare hyperparameters"
    @echo "The output folder has a model evaluation in output/model_evaluation"
    dvc params diff

clear_cache:
    @echo "Clearing cache"
    @echo "This is step 10: clear cache"
    @echo "The output folder has a model evaluation in output/model_evaluation"
    rm -rf .dvc/cache

# build the image tagged $(DOCKER_CONTAINER_NAME) from the Dockerfile in this directory
docker_build: requirements.txt Dockerfile
    @echo "Building docker image"
    sudo docker build -t $(DOCKER_CONTAINER_NAME) .

# build the image first, then start an interactive container that is removed on exit (--rm)
docker_run: docker_build
    @echo "Running docker container"
    sudo docker run -it --rm $(DOCKER_CONTAINER_NAME)</code></pre>
<p>The difference here is that the targets delegate their work to DVC. The commands used are <code>dvc init</code>, <code>dvc repro</code>, <code>dvc metrics diff</code>, <code>dvc params diff</code>, and the shell command <code>rm -rf .dvc/cache</code>. They are used to initialize DVC in the project, reproduce the pipeline, compare metrics, compare hyperparameters, and clear the cache, respectively. The <code>dvc repro</code> command is used to reproduce the results of the workflow. This ensures that the workflow is executed in the correct order and that all the necessary steps are executed. The <code>dvc metrics diff</code> command is used to compare the metrics of different experiments. The <code>dvc params diff</code> command is used to compare the hyperparameters of different experiments. The <code>rm -rf .dvc/cache</code> command is used to clear the cache. This is useful since the cache can take up a lot of disk space. Be aware that deleting the cache also deletes the locally stored versions of your tracked data and models, so push anything you still need to a DVC remote first.</p>
</section>
<section id="conclusion" class="level3" data-number="2.2.4">
<h3 data-number="2.2.4" class="anchored" data-anchor-id="conclusion"><span class="header-section-number">2.2.4</span> Conclusion</h3>
<p>In conclusion, combining Makefile and DVC is a powerful way to manage your data science projects. It allows you to automate your workflow, track changes, collaborate with others, and reproduce your results. By using Makefile and DVC together, you can ensure that your projects are more reproducible, reliable, and maintainable. This can help you save time, reduce errors, and improve the quality of your work. So, next time you start a new data science project, consider using Makefile and DVC to manage your workflow. You won’t regret it.</p>
<div class="callout-tip callout callout-style-default callout-captioned">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-caption-container flex-fill">
Tip
</div>
</div>
<div class="callout-body-container callout-body">
<p>We recommend visiting the <a href="datavc_makefile/README.md">Makefile ML</a> &amp; <a href="datavc_full/README.md">Makefile &amp; DVC</a> files for implementing the ideas we have put across for the Makefile and for DVC.</p>
</div>
</div>
</section>
</section>
<section id="references" class="level2" data-number="2.3">
<h2 data-number="2.3" class="anchored" data-anchor-id="references"><span class="header-section-number">2.3</span> References</h2>
<p>1. DVC documentation: <a href="https://dvc.org/doc" class="uri">https://dvc.org/doc</a><br>
2. DVC YouTube channel: <a href="https://www.youtube.com/playlist?list=PL7WG7YrwYcnDb0qdPl9-KEStsL-3oaEjg" class="uri">https://www.youtube.com/playlist?list=PL7WG7YrwYcnDb0qdPl9-KEStsL-3oaEjg</a><br>
3. Pragmatic AI labs: <a href="https://youtu.be/rKRG6oQf-bQ?si=4BzXMhS7owl6uWef" class="uri">https://youtu.be/rKRG6oQf-bQ?si=4BzXMhS7owl6uWef</a><br>
4. Kaggle notebook by Dandelion: <a href="https://www.kaggle.com/code/hely333/eda-regression" class="uri">https://www.kaggle.com/code/hely333/eda-regression</a><br>
5. Predicting Chronic Kidney Disease: <a href="https://github.com/Shuyib/chronic-kidney-disease-kaggle" class="uri">https://github.com/Shuyib/chronic-kidney-disease-kaggle</a></p>
</section>
</section>

</main>
<!-- /main column -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
  // Mirror a stylesheet's colour mode onto <body>: a sheet whose `data-mode`
  // attribute is "dark" yields the `quarto-dark` class, any other value yields
  // `quarto-light`; the opposite class is removed.
  const toggleBodyColorMode = (bsSheetEl) => {
    const isDark = bsSheetEl.getAttribute("data-mode") === "dark";
    const bodyClasses = window.document.querySelector("body").classList;
    bodyClasses.add(isDark ? "quarto-dark" : "quarto-light");
    bodyClasses.remove(isDark ? "quarto-light" : "quarto-dark");
  };
  // Apply the colour mode of the `link#quarto-bootstrap` stylesheet to <body>;
  // does nothing when no such <link> element exists.
  const toggleBodyColorPrimary = () => {
    const primarySheet = window.document.querySelector("link#quarto-bootstrap");
    if (!primarySheet) {
      return;
    }
    toggleBodyColorMode(primarySheet);
  };
  // Run once as soon as the DOM is ready.
  toggleBodyColorPrimary();
  const icon = "";
  // Add AnchorJS links to every element carrying the `.anchored` class,
  // placed on the right and rendered with the `icon` string defined above.
  const anchorJS = new window.AnchorJS();
  anchorJS.options = { placement: 'right', icon };
  anchorJS.add('.anchored');
  // Copy-to-clipboard buttons: ClipboardJS copies the content of the element
  // that immediately precedes each `.code-copy-button`.
  const clipboard = new window.ClipboardJS('.code-copy-button', {
    target: function(trigger) {
      return trigger.previousElementSibling;
    }
  });
  // After a successful copy, flash a "Copied!" state on the button for one
  // second (plus a Bootstrap tooltip when Bootstrap is loaded), then restore
  // the button to the state it had before the click.
  clipboard.on('success', function(e) {
    // button target
    const button = e.trigger;
    // don't keep focus
    button.blur();
    // flash "checked"
    button.classList.add('code-copy-button-checked');
    // getAttribute returns null when the button has no title attribute
    const currentTitle = button.getAttribute("title");
    button.setAttribute("title", "Copied!");
    let tooltip;
    if (window.bootstrap) {
      button.setAttribute("data-bs-toggle", "tooltip");
      button.setAttribute("data-bs-placement", "left");
      button.setAttribute("data-bs-title", "Copied!");
      tooltip = new bootstrap.Tooltip(button, 
        { trigger: "manual", 
          customClass: "code-copy-button-tooltip",
          offset: [0, -8]});
      tooltip.show();    
    }
    setTimeout(function() {
      if (tooltip) {
        tooltip.hide();
        button.removeAttribute("data-bs-title");
        button.removeAttribute("data-bs-toggle");
        button.removeAttribute("data-bs-placement");
      }
      // setAttribute stringifies its value, so writing back a null title
      // would leave the literal text "null"; drop the attribute instead.
      if (currentTitle === null) {
        button.removeAttribute("title");
      } else {
        button.setAttribute("title", currentTitle);
      }
      button.classList.remove('code-copy-button-checked');
    }, 1000);
    // clear code selection
    e.clearSelection();
  });
  // Attach a tippy.js hover popup to `el`. `contentFn` is handed to tippy as
  // the `content` option and HTML content is allowed.
  function tippyHover(el, contentFn) {
    window.tippy(el, {
      allowHTML: true,
      content: contentFn,
      maxWidth: 500,
      delay: 100,
      arrow: false,
      // mount the popup in the parent of the element tippy passes in
      appendTo: (reference) => reference.parentElement,
      interactive: true,
      interactiveBorder: 10,
      theme: 'quarto',
      placement: 'bottom-start'
    });
  }
  // Footnote references: hovering an `a[role="doc-noteref"]` link shows the
  // HTML of the element its href fragment points at.
  const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
  for (const ref of noterefs) {
    tippyHover(ref, function() {
      // use id or data attribute instead here
      let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
      // No target at all: show an empty popup rather than throwing below.
      if (!href) {
        return "";
      }
      // Absolute URLs are reduced to their fragment; anything URL() rejects
      // (e.g. a bare "#fn1") is kept unchanged.
      try { href = new URL(href).hash; } catch {}
      const id = href.replace(/^#\/?/, "");
      const note = window.document.getElementById(id);
      // getElementById returns null when the footnote is not in the document.
      return note ? note.innerHTML : "";
    });
  }
  // Walk upwards from `el` until a parent element carries a non-empty
  // `data-cites` attribute. Returns `{ el, cites }`, where `el` is the node
  // directly beneath that parent and `cites` is the attribute value split on
  // single spaces. Returns undefined when no ancestor has the attribute.
  const findCites = (el) => {
    let node = el;
    while (node.parentElement) {
      const cites = node.parentElement.dataset.cites;
      if (cites) {
        return { el: node, cites: cites.split(' ') };
      }
      node = node.parentElement;
    }
    return undefined;
  };
  // Citation references: for each `a[role="doc-biblioref"]` link that sits
  // under an element with `data-cites`, attach a hover popup that lists the
  // HTML of the matching `ref-<cite>` bibliography entries.
  const bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
  bibliorefs.forEach((ref) => {
    const citeInfo = findCites(ref);
    if (!citeInfo) {
      return;
    }
    tippyHover(citeInfo.el, () => {
      const popup = window.document.createElement('div');
      for (const cite of citeInfo.cites) {
        const entry = window.document.createElement('div');
        entry.classList.add('hanging-indent', 'csl-entry');
        const source = window.document.getElementById('ref-' + cite);
        // A cite key with no bibliography entry still gets an (empty) div.
        if (source) {
          entry.innerHTML = source.innerHTML;
        }
        popup.appendChild(entry);
      }
      return popup.innerHTML;
    });
  });
});
</script>
</div> <!-- /content -->


</body></html>