Skip to content
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 140 additions & 31 deletions 02_activities/assignments/assignment_1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "4a3485d6-ba58-4660-a983-5680821c5719",
"metadata": {},
"outputs": [],
Expand All @@ -59,7 +59,35 @@
"execution_count": null,
"id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 178 entries, 0 to 177\n",
"Data columns (total 14 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 alcohol 178 non-null float64\n",
" 1 malic_acid 178 non-null float64\n",
" 2 ash 178 non-null float64\n",
" 3 alcalinity_of_ash 178 non-null float64\n",
" 4 magnesium 178 non-null float64\n",
" 5 total_phenols 178 non-null float64\n",
" 6 flavanoids 178 non-null float64\n",
" 7 nonflavanoid_phenols 178 non-null float64\n",
" 8 proanthocyanins 178 non-null float64\n",
" 9 color_intensity 178 non-null float64\n",
" 10 hue 178 non-null float64\n",
" 11 od280/od315_of_diluted_wines 178 non-null float64\n",
" 12 proline 178 non-null float64\n",
" 13 class 178 non-null int64 \n",
"dtypes: float64(13), int64(1)\n",
"memory usage: 19.6 KB\n"
]
}
],
"source": [
"from sklearn.datasets import load_wine\n",
"\n",
Expand Down Expand Up @@ -91,12 +119,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"id": "56916892",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"178"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Your answer here"
"wine_df.shape[0]"
]
},
{
Expand All @@ -109,12 +148,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"id": "df0ef103",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"14"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Your answer here"
"wine_df.shape[1]"
]
},
{
Expand All @@ -127,12 +177,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"id": "47989426",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"int64\n",
"[0 1 2]\n"
]
}
],
"source": [
"# Your answer here"
"print(wine_df['class'].dtype) # e.g., int64\n",
"print(wine_df['class'].unique()) # array([0, 1, 2])"
]
},
{
Expand All @@ -146,12 +206,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"id": "bd7b0910",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"13"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Your answer here"
"wine_df.shape[1] - 1 # or len(wine_df.columns) - 1"
]
},
{
Expand All @@ -175,10 +246,37 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"id": "cc899b59",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" alcohol malic_acid ash alcalinity_of_ash magnesium \\\n",
"0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n",
"1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n",
"2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n",
"3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n",
"4 0.295700 0.227694 1.840403 0.451946 1.281985 \n",
"\n",
" total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n",
"0 0.808997 1.034819 -0.659563 1.224884 \n",
"1 0.568648 0.733629 -0.820719 -0.544721 \n",
"2 0.808997 1.215533 -0.498407 2.135968 \n",
"3 2.491446 1.466525 -0.981875 1.032155 \n",
"4 0.808997 0.663351 0.226796 0.401404 \n",
"\n",
" color_intensity hue od280/od315_of_diluted_wines proline \n",
"0 0.251717 0.362177 1.847920 1.013009 \n",
"1 -0.293321 0.406051 1.113449 0.965242 \n",
"2 0.269020 0.318304 0.788587 1.395148 \n",
"3 1.186068 -0.427544 1.184071 2.334574 \n",
"4 -0.319276 0.362177 0.449601 -0.037874 \n"
]
}
],
"source": [
"# Select predictors (excluding the last column)\n",
"predictors = wine_df.iloc[:, :-1]\n",
Expand All @@ -204,7 +302,7 @@
"id": "403ef0bb",
"metadata": {},
"source": [
"> Your answer here..."
"Since we are using Euclidean distance to measure how close values are to each other, if we have unstandardized values, then predictors measured with larger numbers will dominate distances, and bias results towards those predictors. Standardizing all predictors allows for equal contribution independent of the scale of values used in predictors."
]
},
{
Expand All @@ -220,7 +318,7 @@
"id": "fdee5a15",
"metadata": {},
"source": [
"> Your answer here..."
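"Class is the categorical response: its values (0, 1, 2) are labels that identify the wine class, not quantities on a numeric scale. Standardizing them would distort the meaning of the variable we are trying to predict, and the response is not part of the distance calculation between observations in any case."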
]
},
{
Expand All @@ -236,7 +334,7 @@
"id": "f0676c21",
"metadata": {},
"source": [
"> Your answer here..."
"Setting a random seed allows randomness to be used in the analysis (for example, when shuffling and splitting the data) while keeping the results reproducible if the code is re-run at a later point or by someone else. The specific number doesn't matter; any value will work, since it only fixes the starting point of the pseudo-random number sequence so that the same sequence is generated on every run."
]
},
{
Expand All @@ -251,7 +349,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"id": "72c101f2",
"metadata": {},
"outputs": [],
Expand All @@ -260,8 +358,13 @@
"np.random.seed(123)\n",
"\n",
"# split the data into a training and testing set. hint: use train_test_split !\n",
"\n",
"# Your code here ..."
"X_train, X_test, y_train, y_test = train_test_split(\n",
" predictors_standardized, \n",
" wine_df['class'],\n",
" train_size = 0.75,\n",
" shuffle = True,\n",
" stratify = wine_df['class']\n",
")"
]
},
{
Expand Down Expand Up @@ -289,7 +392,15 @@
"metadata": {},
"outputs": [],
"source": [
"# Your code here..."
"knn = KNeighborsClassifier()\n",
"\n",
"param_grid = {'n_neighbors': range(1, 51)}\n",
"\n",
"grid_search = GridSearchCV(knn, param_grid, cv=10)\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"best_k = grid_search.best_params_['n_neighbors']\n",
"best_k"
]
},
{
Expand All @@ -310,7 +421,10 @@
"metadata": {},
"outputs": [],
"source": [
"# Your code here..."
"best_knn = KNeighborsClassifier(n_neighbors=best_k)\n",
"best_knn.fit(X_train, y_train)\n",
"\n",
"best_knn.score(X_test, y_test)"
]
},
{
Expand Down Expand Up @@ -365,7 +479,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.4",
"display_name": "lcr-env (3.11.13)",
"language": "python",
"name": "python3"
},
Expand All @@ -379,12 +493,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
},
"vscode": {
"interpreter": {
"hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e"
}
"version": "3.11.13"
}
},
"nbformat": 4,
Expand Down