diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb
index 28d4df017..a3c53aac5 100644
--- a/02_activities/assignments/assignment_1.ipynb
+++ b/02_activities/assignments/assignment_1.ipynb
@@ -34,7 +34,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"id": "4a3485d6-ba58-4660-a983-5680821c5719",
"metadata": {},
"outputs": [],
@@ -56,10 +56,288 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " alcohol | \n",
+ " malic_acid | \n",
+ " ash | \n",
+ " alcalinity_of_ash | \n",
+ " magnesium | \n",
+ " total_phenols | \n",
+ " flavanoids | \n",
+ " nonflavanoid_phenols | \n",
+ " proanthocyanins | \n",
+ " color_intensity | \n",
+ " hue | \n",
+ " od280/od315_of_diluted_wines | \n",
+ " proline | \n",
+ " class | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 14.23 | \n",
+ " 1.71 | \n",
+ " 2.43 | \n",
+ " 15.6 | \n",
+ " 127.0 | \n",
+ " 2.80 | \n",
+ " 3.06 | \n",
+ " 0.28 | \n",
+ " 2.29 | \n",
+ " 5.64 | \n",
+ " 1.04 | \n",
+ " 3.92 | \n",
+ " 1065.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 13.20 | \n",
+ " 1.78 | \n",
+ " 2.14 | \n",
+ " 11.2 | \n",
+ " 100.0 | \n",
+ " 2.65 | \n",
+ " 2.76 | \n",
+ " 0.26 | \n",
+ " 1.28 | \n",
+ " 4.38 | \n",
+ " 1.05 | \n",
+ " 3.40 | \n",
+ " 1050.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 13.16 | \n",
+ " 2.36 | \n",
+ " 2.67 | \n",
+ " 18.6 | \n",
+ " 101.0 | \n",
+ " 2.80 | \n",
+ " 3.24 | \n",
+ " 0.30 | \n",
+ " 2.81 | \n",
+ " 5.68 | \n",
+ " 1.03 | \n",
+ " 3.17 | \n",
+ " 1185.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 14.37 | \n",
+ " 1.95 | \n",
+ " 2.50 | \n",
+ " 16.8 | \n",
+ " 113.0 | \n",
+ " 3.85 | \n",
+ " 3.49 | \n",
+ " 0.24 | \n",
+ " 2.18 | \n",
+ " 7.80 | \n",
+ " 0.86 | \n",
+ " 3.45 | \n",
+ " 1480.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 13.24 | \n",
+ " 2.59 | \n",
+ " 2.87 | \n",
+ " 21.0 | \n",
+ " 118.0 | \n",
+ " 2.80 | \n",
+ " 2.69 | \n",
+ " 0.39 | \n",
+ " 1.82 | \n",
+ " 4.32 | \n",
+ " 1.04 | \n",
+ " 2.93 | \n",
+ " 735.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 173 | \n",
+ " 13.71 | \n",
+ " 5.65 | \n",
+ " 2.45 | \n",
+ " 20.5 | \n",
+ " 95.0 | \n",
+ " 1.68 | \n",
+ " 0.61 | \n",
+ " 0.52 | \n",
+ " 1.06 | \n",
+ " 7.70 | \n",
+ " 0.64 | \n",
+ " 1.74 | \n",
+ " 740.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 174 | \n",
+ " 13.40 | \n",
+ " 3.91 | \n",
+ " 2.48 | \n",
+ " 23.0 | \n",
+ " 102.0 | \n",
+ " 1.80 | \n",
+ " 0.75 | \n",
+ " 0.43 | \n",
+ " 1.41 | \n",
+ " 7.30 | \n",
+ " 0.70 | \n",
+ " 1.56 | \n",
+ " 750.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 175 | \n",
+ " 13.27 | \n",
+ " 4.28 | \n",
+ " 2.26 | \n",
+ " 20.0 | \n",
+ " 120.0 | \n",
+ " 1.59 | \n",
+ " 0.69 | \n",
+ " 0.43 | \n",
+ " 1.35 | \n",
+ " 10.20 | \n",
+ " 0.59 | \n",
+ " 1.56 | \n",
+ " 835.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 176 | \n",
+ " 13.17 | \n",
+ " 2.59 | \n",
+ " 2.37 | \n",
+ " 20.0 | \n",
+ " 120.0 | \n",
+ " 1.65 | \n",
+ " 0.68 | \n",
+ " 0.53 | \n",
+ " 1.46 | \n",
+ " 9.30 | \n",
+ " 0.60 | \n",
+ " 1.62 | \n",
+ " 840.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 177 | \n",
+ " 14.13 | \n",
+ " 4.10 | \n",
+ " 2.74 | \n",
+ " 24.5 | \n",
+ " 96.0 | \n",
+ " 2.05 | \n",
+ " 0.76 | \n",
+ " 0.56 | \n",
+ " 1.35 | \n",
+ " 9.20 | \n",
+ " 0.61 | \n",
+ " 1.60 | \n",
+ " 560.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
178 rows × 14 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n",
+ "0 14.23 1.71 2.43 15.6 127.0 2.80 \n",
+ "1 13.20 1.78 2.14 11.2 100.0 2.65 \n",
+ "2 13.16 2.36 2.67 18.6 101.0 2.80 \n",
+ "3 14.37 1.95 2.50 16.8 113.0 3.85 \n",
+ "4 13.24 2.59 2.87 21.0 118.0 2.80 \n",
+ ".. ... ... ... ... ... ... \n",
+ "173 13.71 5.65 2.45 20.5 95.0 1.68 \n",
+ "174 13.40 3.91 2.48 23.0 102.0 1.80 \n",
+ "175 13.27 4.28 2.26 20.0 120.0 1.59 \n",
+ "176 13.17 2.59 2.37 20.0 120.0 1.65 \n",
+ "177 14.13 4.10 2.74 24.5 96.0 2.05 \n",
+ "\n",
+ " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n",
+ "0 3.06 0.28 2.29 5.64 1.04 \n",
+ "1 2.76 0.26 1.28 4.38 1.05 \n",
+ "2 3.24 0.30 2.81 5.68 1.03 \n",
+ "3 3.49 0.24 2.18 7.80 0.86 \n",
+ "4 2.69 0.39 1.82 4.32 1.04 \n",
+ ".. ... ... ... ... ... \n",
+ "173 0.61 0.52 1.06 7.70 0.64 \n",
+ "174 0.75 0.43 1.41 7.30 0.70 \n",
+ "175 0.69 0.43 1.35 10.20 0.59 \n",
+ "176 0.68 0.53 1.46 9.30 0.60 \n",
+ "177 0.76 0.56 1.35 9.20 0.61 \n",
+ "\n",
+ " od280/od315_of_diluted_wines proline class \n",
+ "0 3.92 1065.0 0 \n",
+ "1 3.40 1050.0 0 \n",
+ "2 3.17 1185.0 0 \n",
+ "3 3.45 1480.0 0 \n",
+ "4 2.93 735.0 0 \n",
+ ".. ... ... ... \n",
+ "173 1.74 740.0 2 \n",
+ "174 1.56 750.0 2 \n",
+ "175 1.56 835.0 2 \n",
+ "176 1.62 840.0 2 \n",
+ "177 1.60 560.0 2 \n",
+ "\n",
+ "[178 rows x 14 columns]"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"from sklearn.datasets import load_wine\n",
"\n",
@@ -89,14 +367,39 @@
"_(i)_ How many observations (rows) does the dataset contain?"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"id": "56916892",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of observations (rows): 178\n"
+ ]
+ }
+ ],
"source": [
- "# Your answer here"
+ "from sklearn.datasets import load_wine\n",
+ "wine = load_wine()\n",
+ "X = wine.data\n",
+ "num_rows = X.shape[0]\n",
+ "print(\"Number of observations (rows):\", num_rows)"
]
},
{
@@ -109,12 +412,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"id": "df0ef103",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of variables (columns): 13\n"
+ ]
+ }
+ ],
"source": [
- "# Your answer here"
+ "# Your answer here\n",
+ "num_columns = X.shape[1]\n",
+ "print(\"Number of variables (columns):\", num_columns)\n"
]
},
{
@@ -127,12 +440,27 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"id": "47989426",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "class variable type: int64\n",
+ "class levels: {np.int64(0), np.int64(1), np.int64(2)}\n",
+ "class names: ['class_0' 'class_1' 'class_2']\n"
+ ]
+ }
+ ],
"source": [
- "# Your answer here"
+ "# Your answer here\n",
+ "y = wine.target\n",
+ "print(\"class variable type:\", y.dtype)\n",
+ "unique_levels = set(y)\n",
+ "print(\"class levels:\", unique_levels)\n",
+ "print(\"class names:\", wine.target_names)"
]
},
{
@@ -146,12 +474,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"id": "bd7b0910",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of predictor variables: 13\n"
+ ]
+ }
+ ],
"source": [
- "# Your answer here"
+ "# Your answer here\n",
+ "num_predictors = X.shape[1]\n",
+ "print(\"Number of predictor variables:\", num_predictors)\n"
]
},
{
@@ -175,10 +513,37 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"id": "cc899b59",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n",
+ "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n",
+ "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n",
+ "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n",
+ "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n",
+ "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n",
+ "\n",
+ " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n",
+ "0 0.808997 1.034819 -0.659563 1.224884 \n",
+ "1 0.568648 0.733629 -0.820719 -0.544721 \n",
+ "2 0.808997 1.215533 -0.498407 2.135968 \n",
+ "3 2.491446 1.466525 -0.981875 1.032155 \n",
+ "4 0.808997 0.663351 0.226796 0.401404 \n",
+ "\n",
+ " color_intensity hue od280/od315_of_diluted_wines proline \n",
+ "0 0.251717 0.362177 1.847920 1.013009 \n",
+ "1 -0.293321 0.406051 1.113449 0.965242 \n",
+ "2 0.269020 0.318304 0.788587 1.395148 \n",
+ "3 1.186068 -0.427544 1.184071 2.334574 \n",
+ "4 -0.319276 0.362177 0.449601 -0.037874 \n"
+ ]
+ }
+ ],
"source": [
"# Select predictors (excluding the last column)\n",
"predictors = wine_df.iloc[:, :-1]\n",
@@ -196,7 +561,8 @@
"id": "9981ca48",
"metadata": {},
"source": [
- "(i) Why is it important to standardize the predictor variables?"
+ "(i) Why is it important to standardize the predictor variables?\n",
+ "\n"
]
},
{
@@ -204,7 +570,10 @@
"id": "403ef0bb",
"metadata": {},
"source": [
- "> Your answer here..."
+ "> Your answer here...\n",
+ "It eliminates bias caused by different scales, enhancing accuracy \n",
+ "\n",
+ "\n"
]
},
{
@@ -220,7 +589,8 @@
"id": "fdee5a15",
"metadata": {},
"source": [
- "> Your answer here..."
+ "> Your answer here...\n",
+ "class is an identifier, not a continuous variable\n"
]
},
{
@@ -236,7 +606,12 @@
"id": "f0676c21",
"metadata": {},
"source": [
- "> Your answer here..."
+ "> Your answer here...Setting a seed makes results reproducible, and the specific value doesn’t matter as long as it’s consistent.\n",
+ "\n",
+ "import random\n",
+ "random.seed(10)\n",
+ "\n",
+ "Random operations will yield different outputs every time unless a seed is set. Fixing a seed guarantees that the same sequence is generated each time. thus, results are consistent and repeatable. The specific seed doesn’t matter for reproducibility. Each sequence is deterministic for its chosen seed.\n"
]
},
{
@@ -251,17 +626,41 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"id": "72c101f2",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training set (predictors): (133, 13)\n",
+ "Training set (response): (133,)\n",
+ "Test set (predictors): (45, 13)\n",
+ "Test set (response): (45,)\n"
+ ]
+ }
+ ],
"source": [
"# set a seed for reproducibility\n",
"np.random.seed(123)\n",
"\n",
"# split the data into a training and testing set. hint: use train_test_split !\n",
"\n",
- "# Your code here ..."
+ "\n",
+ "# Your code here ...\n",
+ "\n",
+ "scaler = StandardScaler()\n",
+ "X_std = scaler.fit_transform(X)\n",
+ "\n",
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
+ " X_std, y, test_size=0.25, random_state=42, stratify=y\n",
+ ")\n",
+ "\n",
+ "print(\"Training set (predictors):\", X_train.shape)\n",
+ "print(\"Training set (response):\", y_train.shape)\n",
+ "print(\"Test set (predictors):\", X_test.shape)\n",
+ "print(\"Test set (response):\", y_test.shape)"
]
},
{
@@ -287,9 +686,41 @@
"execution_count": null,
"id": "08818c64",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Best n_neighbors: 21\n",
+ "Best cross-validation accuracy: 0.976923076923077\n"
+ ]
+ }
+ ],
"source": [
- "# Your code here..."
+ "# Your code here...\n",
+ "\n",
+ "from sklearn.neighbors import KNeighborsClassifier\n",
+ "from sklearn.model_selection import GridSearchCV\n",
+ "\n",
+ "knn = KNeighborsClassifier()\n",
+ "\n",
+ "param_grid = {'n_neighbors': list(range(1, 51))}\n",
+ "\n",
+ "grid_search = GridSearchCV(\n",
+ " estimator=knn,\n",
+ " param_grid=param_grid,\n",
+ " cv=10, \n",
+ " scoring='accuracy', \n",
+ " n_jobs=-1 \n",
+ ")\n",
+ "\n",
+ "grid_search.fit(X_train, y_train)\n",
+ "\n",
+ "best_n_neighbors = grid_search.best_params_['n_neighbors']\n",
+ "best_score = grid_search.best_score_\n",
+ "\n",
+ "print(\"Best n_neighbors:\", best_n_neighbors)\n",
+ "print(\"Best cross-validation accuracy:\", best_score)\n"
]
},
{
@@ -308,9 +739,34 @@
"execution_count": null,
"id": "ffefa9f2",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Best n_neighbors: 21\n",
+ "Test set accuracy: 1.0\n"
+ ]
+ }
+ ],
"source": [
- "# Your code here..."
+ "# Your code here...\n",
+ "\n",
+ "from sklearn.neighbors import KNeighborsClassifier\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "\n",
+ "best_n_neighbors = grid_search.best_params_['n_neighbors']\n",
+ "\n",
+ "knn_best = KNeighborsClassifier(n_neighbors=best_n_neighbors)\n",
+ "\n",
+ "knn_best.fit(X_train, y_train)\n",
+ "\n",
+ "y_pred = knn_best.predict(X_test)\n",
+ "\n",
+ "test_accuracy = accuracy_score(y_test, y_pred)\n",
+ "\n",
+ "print(\"Best n_neighbors:\", best_n_neighbors)\n",
+ "print(\"Test set accuracy:\", test_accuracy)\n"
]
},
{
@@ -365,7 +821,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3.10.4",
+ "display_name": "lcr-env (3.11.13)",
"language": "python",
"name": "python3"
},
@@ -379,12 +835,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.19"
- },
- "vscode": {
- "interpreter": {
- "hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e"
- }
+ "version": "3.11.13"
}
},
"nbformat": 4,