diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 28d4df017..281c767b4 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -20,6 +20,12 @@ "Please bring questions that you cannot work out on your own to office hours, work periods or share with your peers on Slack. We will work with you through the issue." ] }, + { + "cell_type": "markdown", + "id": "e4c1403f", + "metadata": {}, + "source": [] + }, { "cell_type": "markdown", "id": "5fc5001c-7715-4ebe-b0f7-e4bd04349629", @@ -34,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "4a3485d6-ba58-4660-a983-5680821c5719", "metadata": {}, "outputs": [], @@ -43,6 +49,7 @@ "import pandas as pd\n", "import numpy as np\n", "import random\n", + "from scipy import stats  # scipy.stats; a bare top-level stats module is not in the standard library\n", "import matplotlib.pyplot as plt\n", "import matplotlib.colors as mcolors\n", "from sklearn.preprocessing import StandardScaler\n", @@ -56,10 +63,288 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... 
\n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.datasets import load_wine\n", "\n", @@ -91,12 +376,82 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "56916892", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "count 178.000000 178.000000 178.000000 178.000000 178.000000 \n", + "mean 13.000618 2.336348 2.366517 19.494944 99.741573 \n", + "std 0.811827 1.117146 0.274344 3.339564 14.282484 \n", + "min 11.030000 0.740000 1.360000 10.600000 70.000000 \n", + "25% 12.362500 1.602500 2.210000 17.200000 88.000000 \n", + "50% 13.050000 1.865000 2.360000 19.500000 98.000000 \n", + "75% 13.677500 3.082500 2.557500 21.500000 107.000000 \n", + "max 14.830000 5.800000 3.230000 30.000000 162.000000 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "count 178.000000 178.000000 178.000000 178.000000 \n", + "mean 2.295112 2.029270 0.361854 1.590899 \n", + "std 0.625851 0.998859 0.124453 0.572359 \n", + "min 0.980000 0.340000 0.130000 0.410000 \n", + "25% 1.742500 1.205000 0.270000 1.250000 \n", + "50% 2.355000 2.135000 0.340000 1.555000 \n", + "75% 2.800000 2.875000 0.437500 1.950000 \n", + "max 3.880000 5.080000 0.660000 3.580000 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \\\n", + "count 178.000000 178.000000 178.000000 178.000000 \n", + "mean 5.058090 0.957449 2.611685 746.893258 \n", + "std 2.318286 0.228572 0.709990 314.907474 \n", + "min 1.280000 0.480000 1.270000 278.000000 \n", + "25% 3.220000 0.782500 1.937500 500.500000 \n", + "50% 4.690000 0.965000 2.780000 673.500000 \n", + "75% 6.200000 1.120000 3.170000 
985.000000 \n", + "max 13.000000 1.710000 4.000000 1680.000000 \n", + "\n", + " class \n", + "count 178.000000 \n", + "mean 0.938202 \n", + "std 0.775035 \n", + "min 0.000000 \n", + "25% 0.000000 \n", + "50% 1.000000 \n", + "75% 2.000000 \n", + "max 2.000000 \n" + ] + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "print(wine_df.describe())" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "3637c1dd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "178\n" + ] + } + ], + "source": [ + "summary = wine_df.describe()\n", + "# describe()'s 'count' row only counts non-null values per column, so it can undercount rows if data is missing.\n", + "# The number of observations is the number of rows, which .shape[0] reports regardless of missing values.\n", + "count = wine_df.shape[0]\n", + "\n", + "print(int(count))" ] }, { @@ -104,17 +459,27 @@ "id": "f7573b59", "metadata": {}, "source": [ + "\n", "_(ii)_ How many variables (columns) does the dataset contain?" 
] }, { "cell_type": "code", "execution_count": null, - "id": "df0ef103", + "id": "376319b4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "14\n" + ] + } + ], "source": [ - "# Your answer here" + "# .shape returns a tuple (rows, columns), so index [1] is the number of columns.\n", + "print(wine_df.shape[1])\n" ] }, { @@ -127,12 +492,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "id": "47989426", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "int64\n", + "[0 1 2]\n" + ] + } + ], "source": [ - "# Your answer here" + "# From the pandas bonus material: .dtype gives a column's data type and .unique() lists its distinct values.\n", + "print(wine_df['class'].dtype) \n", + "\n", + "print(wine_df['class'].unique())" ] }, { @@ -146,12 +523,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "id": "bd7b0910", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "13" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# Predictors = total number of columns minus the single response column ('class').\n", + "wine_df.shape[1] - 1\n" ] }, { @@ -175,10 +564,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id": "cc899b59", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 
-0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 -0.498407 2.135968 \n", + "3 2.491446 1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "0 0.251717 0.362177 1.847920 1.013009 \n", + "1 -0.293321 0.406051 1.113449 0.965242 \n", + "2 0.269020 0.318304 0.788587 1.395148 \n", + "3 1.186068 -0.427544 1.184071 2.334574 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 \n" + ] + } + ], "source": [ "# Select predictors (excluding the last column)\n", "predictors = wine_df.iloc[:, :-1]\n", @@ -204,7 +620,7 @@ "id": "403ef0bb", "metadata": {}, "source": [ - "> Your answer here..." + "KNN classifies a point using the Euclidean distance to its neighbours. If variables are on different scales (for example, one ranging from 1 to 1000 and another from 0.001 to 0.1), the variable with the larger scale dominates the distance and the smaller one is effectively ignored. Standardization rescales every variable to a mean of 0 and a standard deviation of 1, ensuring each variable contributes equally." ] }, { @@ -220,7 +636,7 @@ "id": "fdee5a15", "metadata": {}, "source": [ - "> Your answer here..." + "The class variable is a label, not a measurement. KNN calculates distances between the numeric predictors to find neighbours and then predicts the class from those neighbours' labels; it does not calculate distances between the target classes. If we standardized the class column, it would turn the integer labels we want to predict into decimals that would mean nothing." ] }, { @@ -236,7 +652,7 @@ "id": "f0676c21", "metadata": {}, "source": [ - "> Your answer here..." + "This is for reproducibility, so that the random split of the data happens exactly the same way every time the code is run. 123 is not special; any number is fine as long as it is recorded so it can be reused." 
] }, { @@ -251,17 +667,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "id": "72c101f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training Data Shape: (133, 13)\n", + "Testing Data Shape: (45, 13)\n" + ] + } + ], "source": [ "# set a seed for reproducibility\n", "np.random.seed(123)\n", "\n", - "# split the data into a training and testing set. hint: use train_test_split !\n", + "# split the data into a training and testing set. hint: use train_test_split\n", + "# X = predictors_standardized (The features we scaled above)\n", + "# y = wine_df['class'] (The response variable)\n", + "# test_size=0.25 (To get the requested 75% Train / 25% Test split)\n", "\n", - "# Your code here ..." + "X_train, X_test, y_train, y_test = train_test_split(\n", + " predictors_standardized, \n", + " wine_df['class'], \n", + " test_size=0.25,\n", + " stratify=wine_df['class'], random_state=123  # explicit seed so the split does not depend on global RNG state\n", + ")\n", + "\n", + "print(\"Training Data Shape:\", X_train.shape)\n", + "print(\"Testing Data Shape:\", X_test.shape)" ] }, { @@ -284,12 +720,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "id": "08818c64", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best k value: 7\n" + ] + } + ], "source": [ - "# Your code here..." + "# 1. Initialize the KNN classifier\n", + "knn = KNeighborsClassifier()\n", + "# 2. range(1, 51) covers k = 1 through 50 (a Python range excludes its stop value)\n", + "param_grid = {'n_neighbors': range(1, 51)}\n", + "\n", + "# 3. Set up GridSearchCV\n", + "# cv=10 runs 10-fold cross-validation on the training data.\n", + "grid_search = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')\n", + "\n", + "# 4. 
Fit the model to the training data (Never use the test data for training)\n", + "grid_search.fit(X_train, y_train)\n", + "\n", + "best_k = grid_search.best_params_['n_neighbors']\n", + "print(\"Best k value:\", best_k)" ] }, { @@ -305,12 +762,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "id": "ffefa9f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test Set Accuracy: 0.9333333333333333\n" + ] + } + ], + "source": [ + "## Initialize the final model using the best value for n_neighbors found in Q3\n", + "final_knn = KNeighborsClassifier(n_neighbors=best_k)\n", + "## Fit the model to the training data\n", + "final_knn.fit(X_train, y_train)\n", + "#Evaluate the model on the test data\n", + "y_pred = final_knn.predict(X_test)\n", + "# Calculate accuracy using accuracy_score\n", + "accuracy = accuracy_score(y_test, y_pred)\n", + "print(\"Test Set Accuracy:\", accuracy)" + ] + }, + { + "cell_type": "markdown", + "id": "e37d304a", + "metadata": {}, "source": [ - "# Your code here..." + "After rewatching the class, I added `stratify` to the train/test split. This changed the best k from 15 to 7, but it did not improve the test accuracy, which stayed the same." ] }, { @@ -365,7 +846,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.4", + "display_name": "assignments", "language": "python", "name": "python3" }, @@ -379,12 +860,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" - }, - "vscode": { - "interpreter": { - "hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e" - } + "version": "3.11.14" } }, "nbformat": 4,