Skip to content

Commit 624a7b3

Browse files
committed
feat: simple column updates/conversions
* rename columns that can be imported as-is
* add static calculated columns
* calculate email column, drop name columns
* calculate string duration column
1 parent 13dde34 commit 624a7b3

File tree

1 file changed

+71
-2
lines changed

1 file changed

+71
-2
lines changed

notebooks/harvest-to-toggl.ipynb

+71-2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"metadata": {},
77
"outputs": [],
88
"source": [
9+
"from datetime import datetime\n",
910
"import os\n",
1011
"from pathlib import Path\n",
1112
"import pandas as pd\n",
@@ -21,8 +22,76 @@
2122
"metadata": {},
2223
"outputs": [],
2324
"source": [
24-
"source = pd.read_csv(DATA_SOURCE)\n",
25-
"source.head()"
25+
"# assign category dtype for efficiency on repeating text columns\n",
26+
"dtypes = {\n",
27+
" \"Client\": \"category\",\n",
28+
" \"Project\": \"category\",\n",
29+
" \"First Name\": \"category\",\n",
30+
" \"Last Name\": \"category\",\n",
31+
"}\n",
32+
"# skip reading the columns we don't care about for Toggl\n",
33+
"cols = list(dtypes) + [\n",
34+
" \"Date\",\n",
35+
" \"Notes\",\n",
36+
" \"Hours\",\n",
37+
"]\n",
38+
"# read CSV file, parsing dates\n",
39+
"source = pd.read_csv(DATA_SOURCE, dtype=dtypes, usecols=cols, parse_dates=[\"Date\"], cache_dates=True)\n",
40+
"source.dtypes"
41+
]
42+
},
43+
{
44+
"cell_type": "code",
45+
"execution_count": null,
46+
"metadata": {},
47+
"outputs": [],
48+
"source": [
49+
"# rename columns that can be imported as-is\n",
50+
"source.rename(columns={\"Project\": \"Task\", \"Notes\": \"Description\", \"Date\": \"Start date\"}, inplace=True)\n",
51+
"source.dtypes"
52+
]
53+
},
54+
{
55+
"cell_type": "code",
56+
"execution_count": null,
57+
"metadata": {},
58+
"outputs": [],
59+
"source": [
60+
"# update static calculated columns\n",
61+
"source[\"Client\"] = \"Xentrans\"\n",
62+
"source[\"Client\"] = source[\"Client\"].astype(\"category\")\n",
63+
"source[\"Project\"] = \"Xentrans\"\n",
64+
"source[\"Project\"] = source[\"Project\"].astype(\"category\")\n",
65+
"source.dtypes"
66+
]
67+
},
68+
{
69+
"cell_type": "code",
70+
"execution_count": null,
71+
"metadata": {},
72+
"outputs": [],
73+
"source": [
74+
"# add the Email column\n",
75+
"source[\"Email\"] = (source[\"First Name\"].str.lower() + \"@compiler.la\").astype(\"category\")\n",
76+
"# drop individual name columns\n",
77+
"source.drop(columns=[\"First Name\", \"Last Name\"], inplace=True)\n",
78+
"source.dtypes"
79+
]
80+
},
81+
{
82+
"cell_type": "code",
83+
"execution_count": null,
84+
"metadata": {},
85+
"outputs": [],
86+
"source": [
87+
"# Convert numeric Hours to string Duration\n",
88+
"source[\"Duration\"] = source[\"Hours\"].apply(\n",
89+
" # first convert the numeric hours e.g. 1.5 to a timedelta\n",
90+
" # then use the total seconds to convert to a datetime\n",
91+
" # and format as a string e.g. 01:30\n",
92+
"    # build the datetime with pd.Timestamp (timezone-naive, counted from the epoch)\n",
"    # rather than datetime.fromtimestamp, which converts to the machine's local time\n",
"    # and would shift the result by the UTC offset\n",
"    # NOTE: durations of 24 hours or more wrap around (e.g. 25 -> 01:00)\n",
"    lambda x: pd.Timestamp(pd.to_timedelta(x, unit=\"hours\").total_seconds(), unit=\"s\").strftime(\"%H:%M\")\n",
93+
")\n",
94+
"source[\"Duration\"].head()"
2695
]
2796
}
2897
],

0 commit comments

Comments
 (0)