feat: simple column updates/conversions

thekaveman · thekaveman · commit 624a7b32bebc · 2024-04-05T20:49:58.000Z
* rename columns that can be imported as-is
* add static calculated columns
* calculate email column, drop name columns
* calculate string duration column
diff --git a/notebooks/harvest-to-toggl.ipynb b/notebooks/harvest-to-toggl.ipynb
@@ -6,6 +6,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "from datetime import datetime\n",
     "import os\n",
     "from pathlib import Path\n",
     "import pandas as pd\n",
@@ -21,8 +22,76 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "source = pd.read_csv(DATA_SOURCE)\n",
-    "source.head()"
+    "# assign category dtype for efficiency on repeating text columns\n",
+    "dtypes = {\n",
+    "    \"Client\": \"category\",\n",
+    "    \"Project\": \"category\",\n",
+    "    \"First Name\": \"category\",\n",
+    "    \"Last Name\": \"category\",\n",
+    "}\n",
+    "# skip reading the columns we don't care about for Toggl\n",
+    "cols = list(dtypes) + [\n",
+    "    \"Date\",\n",
+    "    \"Notes\",\n",
+    "    \"Hours\",\n",
+    "]\n",
+    "# read CSV file, parsing dates\n",
+    "source = pd.read_csv(DATA_SOURCE, dtype=dtypes, usecols=cols, parse_dates=[\"Date\"], cache_dates=True)\n",
+    "source.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# rename columns that can be imported as-is\n",
+    "source.rename(columns={\"Project\": \"Task\", \"Notes\": \"Description\", \"Date\": \"Start date\"}, inplace=True)\n",
+    "source.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# update static calculated columns\n",
+    "source[\"Client\"] = \"Xentrans\"\n",
+    "source[\"Client\"] = source[\"Client\"].astype(\"category\")\n",
+    "source[\"Project\"] = \"Xentrans\"\n",
+    "source[\"Project\"] = source[\"Project\"].astype(\"category\")\n",
+    "source.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# add the Email column\n",
+    "source[\"Email\"] = source[\"First Name\"].apply(lambda x: f\"{x.lower()}@compiler.la\").astype(\"category\")\n",
+    "# drop individual name columns\n",
+    "source.drop(columns=[\"First Name\", \"Last Name\"], inplace=True)\n",
+    "source.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert numeric Hours to string Duration\n",
+    "source[\"Duration\"] = source[\"Hours\"].apply(\n",
+    "    # round the decimal hours e.g. 1.5 to whole minutes, split them into\n",
+    "    # (hours, minutes) with divmod and format as a string e.g. 01:30\n",
+    "    # plain arithmetic avoids the local-timezone shift of datetime.fromtimestamp and the wrap at 24 hours\n",
+    "    lambda x: \"{:02d}:{:02d}\".format(*divmod(int(round(x * 60)), 60))\n",
+    ")\n",
+    "source[\"Duration\"].head()"
    ]
   }
  ],