diff --git a/README.md b/README.md index bf8ee23..582c0c9 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,23 @@ # redback-data-warehouse Data Warehouse storage of code and configurations + +## Garmin Run Data – ETL Pipeline Update + +This ETL pipeline processes `Garmin_run_data.csv` and includes: + +### Data cleaning: +- Removes duplicate rows +- Standardizes column names (lowercase, underscores) +- Converts timestamps to datetime +- Fills missing numeric values with column means +- Removes outliers in `heart_rate` (keeps values between 30–220 bpm) +- Converts distance from meters to kilometers +- Converts speed from m/s to km/h + +### Data aggregation: +- Groups data by year and week +- Calculates total runs, total distance (km), average speed (km/h), and average pace (min/km) per week + +### Outputs: +- `cleaned_garmin_run_data.csv` → cleaned dataset + diff --git a/Requirement Gathering (4).pdf b/Requirement Gathering (4).pdf new file mode 100644 index 0000000..aaa2831 Binary files /dev/null and b/Requirement Gathering (4).pdf differ diff --git a/etl_scripts/ETL pipeline.ipynb b/etl_scripts/ETL pipeline.ipynb new file mode 100644 index 0000000..cae4104 --- /dev/null +++ b/etl_scripts/ETL pipeline.ipynb @@ -0,0 +1,239 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "27156c5d-4dfb-4f95-a4e6-d88976d9c7c5", + "metadata": {}, + "source": [ + "# Importing required libraries\r\n", + "# We use pandas for data manipulation and matplotlib for visualization" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ced94ba9-4fef-457e-9c36-855cc56c90bb", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n" + ] + }, + { + "cell_type": "markdown", + "id": "0501979d-4574-4efc-8409-43de021a8bb6", + "metadata": {}, + "source": [ + "# Extract – Load the Garmin running data\r\n", + "# Reading the original raw CSV file" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": 
# ============================
# ⚙️ Configuration
# ============================
# File names are relative to the notebook's working directory.
RAW_CSV_PATH = "Garmin_run_data.csv"
CLEANED_CSV_PATH = "cleaned_garmin_run_data.csv"
WEEKLY_STATS_CSV_PATH = "weekly_stats_garmin_run_data.csv"

# Heart-rate readings outside this inclusive range (bpm) are treated as outliers.
HEART_RATE_MIN_BPM = 30
HEART_RATE_MAX_BPM = 220


# ============================
# 🧹 Data Cleaning
# ============================
def clean_garmin_data(df_raw, hr_min=HEART_RATE_MIN_BPM, hr_max=HEART_RATE_MAX_BPM):
    """Return a cleaned copy of the raw Garmin run data; ``df_raw`` is not modified.

    Steps, in order:
      1. Drop exact duplicate rows.
      2. Standardize column names (stripped, lowercase, spaces -> underscores).
      3. Parse ``timestamp`` to datetime; unparseable values become ``NaT``.
      4. Fill missing numeric values with the column mean.
      5. Keep only rows whose ``heart_rate`` lies in ``[hr_min, hr_max]``.
      6. Add ``distance_km`` (``distance`` / 1000) and ``speed_kmh`` (``speed`` * 3.6).
      7. Add ``week`` (ISO week number), ``month`` and ``year`` (calendar) columns.

    Every step that depends on a specific column is skipped when that column
    is absent, so partial exports still clean without errors.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Raw data as loaded from the Garmin CSV.
    hr_min, hr_max : numeric
        Inclusive heart-rate bounds in bpm.

    Returns
    -------
    pandas.DataFrame
        A new, independent cleaned frame.
    """
    # The results of drop_duplicates() and boolean filters can trigger
    # SettingWithCopyWarning when columns are assigned afterwards (pandas
    # versions without Copy-on-Write), so take an explicit copy each time.
    df = df_raw.drop_duplicates().copy()

    # str() guards against non-string labels (e.g. integer headers).
    df.columns = [str(col).strip().lower().replace(" ", "_") for col in df.columns]

    if "timestamp" in df.columns:
        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")

    numeric_cols = df.select_dtypes(include="number").columns
    if len(numeric_cols) > 0:
        # NOTE: means are computed before the heart-rate filter below, so
        # out-of-range readings still influence the fill values. The order is
        # kept to match the documented pipeline.
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

    if "heart_rate" in df.columns:
        df = df[df["heart_rate"].between(hr_min, hr_max)].copy()

    if "distance" in df.columns:
        df["distance_km"] = df["distance"] / 1000  # meters -> kilometers
    if "speed" in df.columns:
        df["speed_kmh"] = df["speed"] * 3.6  # m/s -> km/h

    if "timestamp" in df.columns:
        df["week"] = df["timestamp"].dt.isocalendar().week
        df["month"] = df["timestamp"].dt.month
        df["year"] = df["timestamp"].dt.year

    return df


# ============================
# 📊 Data Aggregation (Weekly Stats)
# ============================
def compute_weekly_stats(df_cleaned):
    """Aggregate cleaned run data per ISO week.

    Rows are grouped by ISO year and ISO week. The ISO year (not the calendar
    year) is paired with the ISO week number so that a week spanning New Year
    stays in a single group: 2024-12-30 and 2025-01-02 both belong to
    (2025, week 1). Rows without a valid timestamp are excluded.

    Parameters
    ----------
    df_cleaned : pandas.DataFrame
        Output of :func:`clean_garmin_data`.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when there is no ``timestamp`` column. Otherwise one row per
        (``year``, ``week``) with ``total_runs`` and, when the source columns
        exist, ``total_distance_km``, ``average_speed_kmh`` and
        ``average_pace_min_per_km``. Here ``year`` is the ISO year.
    """
    if "timestamp" not in df_cleaned.columns:
        return None

    # Re-coercing is a no-op for already-parsed data and keeps `.dt` safe if
    # the caller passes a frame whose timestamps are still strings.
    timestamps = pd.to_datetime(df_cleaned["timestamp"], errors="coerce")
    iso_calendar = timestamps.dt.isocalendar()

    # Override year/week on a temporary frame only; df_cleaned keeps its
    # calendar-year column untouched.
    keyed = df_cleaned.assign(year=iso_calendar["year"], week=iso_calendar["week"])

    # Only aggregate columns that actually exist, otherwise .agg() raises KeyError.
    aggregations = {"total_runs": ("timestamp", "count")}
    if "distance_km" in keyed.columns:
        aggregations["total_distance_km"] = ("distance_km", "sum")
    if "speed_kmh" in keyed.columns:
        aggregations["average_speed_kmh"] = ("speed_kmh", "mean")

    weekly_stats = keyed.groupby(["year", "week"]).agg(**aggregations).reset_index()

    if "average_speed_kmh" in weekly_stats.columns:
        # Pace is undefined for a non-positive speed: emit NaN instead of inf.
        average_speed = weekly_stats["average_speed_kmh"]
        weekly_stats["average_pace_min_per_km"] = 60 / average_speed.where(average_speed > 0)

    return weekly_stats


# ============================
# 📥 Extract → 🧹 Transform → 💾 Save
# ============================
# Jupyter kernels (and `python script.py`) run with __name__ == "__main__",
# so the pipeline executes in the notebook as before, while the helpers above
# can be imported (e.g. after `jupyter nbconvert --to script`) without
# triggering any file I/O.
if __name__ == "__main__":
    df_raw = pd.read_csv(RAW_CSV_PATH)
    df_cleaned = clean_garmin_data(df_raw)

    # Always (re)assigned, so a stale `weekly_stats` left in the kernel by an
    # earlier run can never be written out by mistake.
    weekly_stats = compute_weekly_stats(df_cleaned)

    df_cleaned.to_csv(CLEANED_CSV_PATH, index=False)

    if weekly_stats is not None:
        weekly_stats.to_csv(WEEKLY_STATS_CSV_PATH, index=False)
        print("✅ Cleaned data and weekly stats saved.")
    else:
        print("✅ Cleaned data saved (no 'timestamp' column, weekly stats skipped).")
# Visualize – plotting distributions for insight
def _show_histogram(frame, column, bin_count, fill_color, chart_title, x_label):
    """Draw and show one histogram of ``frame[column]``; do nothing if the column is absent."""
    if column not in frame.columns:
        return
    plt.figure(figsize=(8, 4))
    plt.hist(frame[column], bins=bin_count, color=fill_color, edgecolor='black')
    plt.title(chart_title)
    plt.xlabel(x_label)
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()


# Heart rate first, then distance (km) – same order as the figures appear below.
_show_histogram(df_cleaned, 'heart_rate', 30, 'skyblue',
                "Heart Rate Distribution", "Heart Rate (bpm)")
_show_histogram(df_cleaned, 'distance_km', 20, 'lightgreen',
                "Distance Distribution (km)", "Distance (km)")

# Load – save the cleaned dataset to a new CSV
df_cleaned.to_csv("cleaned_garmin_run_data.csv", index=False)
print("✅ Cleaned data saved to 'cleaned_garmin_run_data.csv'")
5 +}