|
6 | 6 | "metadata": {},
|
7 | 7 | "outputs": [],
|
8 | 8 | "source": [
|
| 9 | + "from datetime import datetime\n", |
9 | 10 | "import os\n",
|
10 | 11 | "from pathlib import Path\n",
|
11 | 12 | "import pandas as pd\n",
|
|
21 | 22 | "metadata": {},
|
22 | 23 | "outputs": [],
|
23 | 24 | "source": [
|
24 |
| - "source = pd.read_csv(DATA_SOURCE)\n", |
25 |
| - "source.head()" |
| 25 | + "# assign category dtype for efficiency on repeating text columns\n", |
| 26 | + "dtypes = {\n", |
| 27 | + " \"Client\": \"category\",\n", |
| 28 | + " \"Project\": \"category\",\n", |
| 29 | + " \"First Name\": \"category\",\n", |
| 30 | + " \"Last Name\": \"category\",\n", |
| 31 | + "}\n", |
| 32 | + "# skip reading the columns we don't care about for Toggl\n", |
| 33 | + "cols = list(dtypes) + [\n", |
| 34 | + " \"Date\",\n", |
| 35 | + " \"Notes\",\n", |
| 36 | + " \"Hours\",\n", |
| 37 | + "]\n", |
| 38 | + "# read CSV file, parsing dates\n", |
| 39 | + "source = pd.read_csv(DATA_SOURCE, dtype=dtypes, usecols=cols, parse_dates=[\"Date\"], cache_dates=True)\n", |
| 40 | + "source.dtypes" |
| 41 | + ] |
| 42 | + }, |
| 43 | + { |
| 44 | + "cell_type": "code", |
| 45 | + "execution_count": null, |
| 46 | + "metadata": {}, |
| 47 | + "outputs": [], |
| 48 | + "source": [ |
| 49 | + "# rename columns that can be imported as-is\n", |
| 50 | + "source.rename(columns={\"Project\": \"Task\", \"Notes\": \"Description\", \"Date\": \"Start date\"}, inplace=True)\n", |
| 51 | + "source.dtypes" |
| 52 | + ] |
| 53 | + }, |
| 54 | + { |
| 55 | + "cell_type": "code", |
| 56 | + "execution_count": null, |
| 57 | + "metadata": {}, |
| 58 | + "outputs": [], |
| 59 | + "source": [ |
| 60 | + "# update static calculated columns\n", |
| 61 | + "source[\"Client\"] = \"Xentrans\"\n", |
| 62 | + "source[\"Client\"] = source[\"Client\"].astype(\"category\")\n", |
| 63 | + "source[\"Project\"] = \"Xentrans\"\n", |
| 64 | + "source[\"Project\"] = source[\"Project\"].astype(\"category\")\n", |
| 65 | + "source.dtypes" |
| 66 | + ] |
| 67 | + }, |
| 68 | + { |
| 69 | + "cell_type": "code", |
| 70 | + "execution_count": null, |
| 71 | + "metadata": {}, |
| 72 | + "outputs": [], |
| 73 | + "source": [ |
| 74 | + "# add the Email column\n", |
| 75 | + "source[\"Email\"] = source[\"First Name\"].apply(lambda x: f\"{x.lower()}@compiler.la\").astype(\"category\")\n", |
| 76 | + "# drop individual name columns\n", |
| 77 | + "source.drop(columns=[\"First Name\", \"Last Name\"], inplace=True)\n", |
| 78 | + "source.dtypes" |
| 79 | + ] |
| 80 | + }, |
| 81 | + { |
| 82 | + "cell_type": "code", |
| 83 | + "execution_count": null, |
| 84 | + "metadata": {}, |
| 85 | + "outputs": [], |
| 86 | + "source": [ |
| 87 | + "# Convert numeric Hours to string Duration\n", |
| 88 | + "source[\"Duration\"] = source[\"Hours\"].apply(\n", |
| 89 | + " # first convert the numeric hours e.g. 1.5 to a timedelta\n", |
| 90 | + " # then use the total seconds to convert to a datetime\n", |
| 91 | + " # and format as a string e.g. 01:30\n", |
| 92 | + " lambda x: datetime.fromtimestamp(pd.to_timedelta(x, unit=\"hours\").total_seconds()).strftime(\"%H:%M\")\n", |
| 93 | + ")\n", |
| 94 | + "source[\"Duration\"].head()" |
26 | 95 | ]
|
27 | 96 | }
|
28 | 97 | ],
|
|
0 commit comments