diff --git a/02_assignments/assignment_3.ipynb b/02_assignments/assignment_3.ipynb index bfe27b7e4..36914b223 100644 --- a/02_assignments/assignment_3.ipynb +++ b/02_assignments/assignment_3.ipynb @@ -70,7 +70,13 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "def alpha_func(D, idx):\n", + " # Extract Sepal's width and length based on idx\n", + " sepal_width = D.loc[idx, 'Sepal.Width']\n", + " sepal_length = D.loc[idx, 'Sepal.Length']\n", + " alpha = np.mean(sepal_width)*np.mean(sepal_length)\n", + " \n", + " return alpha" ] }, { @@ -106,11 +112,15 @@ "metadata": {}, "outputs": [], "source": [ + "def alpha_func(D, idx):\n", + " sepal_length = D.loc[idx, 'Sepal.Length']\n", + " avg_sepal_length = np.mean(sepal_length)\n", + " return avg_sepal_length\n", + "\n", "rng = np.random.default_rng(0)\n", - "alpha_func(df,\n", - " rng.choice(100,\n", - " 100,\n", - " replace=True))" + "bootstrapped = rng.choice(100, 100, replace=True)\n", + "average_sepal_length = alpha_func(df, bootstrapped)\n", + "print(\"Average Sepal Length (bootstrapped):\", average_sepal_length)" ] }, { @@ -122,6 +132,8 @@ "\n", "_(iii)_ Why is it (perhaps) not sufficient to simply calculate the mean of `Sepal.Length`? What more information will preforming a bootstrap provide to us? \n", "\n", + "It is only bootstrapped once here. The aim of bootstrapping is to resample multiple times to observe the variability in the statistic of interest across the different bootstrap samples, and calculate the average results from multiple samples to compute the confidence interval. But since there is only one sample here, no average result could be achieved, and the obtained mean of 'Sepal.Length' could be highly affected by bias in a specific bootstrap sample.\n", + "\n", "_(iv)_ We can perform bootstrapping in Python by defining a simple function using `boot_SE()` for computing the bootstrap standard error. 
Remember, because bootstrapping involves randomness, we must first set a seed for reproducibility!" ] }, @@ -132,7 +144,17 @@ "metadata": {}, "outputs": [], "source": [ - "# Add your code here to set the seed" + "def boot_SE(D, statistic_func, num_bootstraps=1000, seed=0):  # bootstrap SE of statistic_func(D, idx)\n", + " rng = np.random.default_rng(seed)  # seeded so repeated runs give the same SE\n", + " bootstrapped_result = []\n", + " # Resample row positions with replacement and evaluate the statistic on each resample\n", + " for i in range(num_bootstraps):\n", + " idx = rng.choice(len(D), size=len(D), replace=True)\n", + " value = statistic_func(D, idx)\n", + " bootstrapped_result.append(value)\n", + " # The spread of the resampled statistics is the bootstrap standard error\n", + " bootstrap_se = np.std(bootstrapped_result)\n", + " return bootstrap_se" ] }, { @@ -150,7 +172,20 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "def boot_SE(D, statistic_func, num_bootstraps=1000, seed=0):  # bootstrap SE of statistic_func(D, idx)\n", + " rng = np.random.default_rng(seed)  # seeded so repeated runs give the same SE\n", + " bootstrapped_result = []\n", + " # Resample row positions with replacement and evaluate the statistic on each resample\n", + " for i in range(num_bootstraps):\n", + " idx = rng.choice(len(D), size=len(D), replace=True)\n", + " value = statistic_func(D, idx)\n", + " bootstrapped_result.append(value)\n", + " # The spread of the resampled statistics is the bootstrap standard error\n", + " bootstrap_se = np.std(bootstrapped_result)\n", + " return bootstrap_se\n", + "\n", + "bootstrap_standard_error = boot_SE(df, alpha_func)\n", + "print(\"Bootstrap Standard Error (SE) for mean Sepal Length:\", bootstrap_standard_error)" ] }, { @@ -160,6 +195,9 @@ "source": [ "_(vi)_ What is the original mean value of `Sepal.Length`?\n", "\n", + "The mean value is 5.8433\n", + "\n", + "\n", "Next, let's look _inside_ our bootstrapping to understand the new, bootstrapped sample we have created. Let's review the bootstrapped range, by using `t_range = np.ptp(boot_se_samples)`.\n", "\n", "_(vii)_. Write code to review the bootstrapped mean value, and the standard deviation of the bootstrapped samples. Compare the mean against its original value." 
@@ -172,7 +210,31 @@ "metadata": {}, "outputs": [], "source": [ - "# Add your code here" + "# original mean\n", + "def original_mean_sepal_length(D):\n", + " return np.mean(D['Sepal.Length'])\n", + "original_mean = original_mean_sepal_length(df)\n", + "\n", + "# bootstrapped mean\n", + "def bootstrap_mean_sepal_length(D, num_bootstraps=1000):\n", + " rng = np.random.default_rng(0)\n", + " bootstrapped_means = []\n", + "\n", + " for _ in range(num_bootstraps):\n", + " idx = rng.choice(len(D), size=len(D), replace=True)\n", + " bootstrapped_mean = np.mean(D.loc[idx, 'Sepal.Length'])\n", + " bootstrapped_means.append(bootstrapped_mean)\n", + "\n", + " return bootstrapped_means\n", + "\n", + "bootstrapped_means = bootstrap_mean_sepal_length(df) \n", + "bootstrapped_mean = np.mean(bootstrapped_means)\n", + "\n", + "# compare the mean\n", + "mean_difference = bootstrapped_mean - original_mean\n", + "\n", + "# the standard deviation of Bootstrapped Means:\n", + "bootstrapped_std = np.std(bootstrapped_means)" ] }, { @@ -190,7 +252,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Add your code here" + "confidence_interval = np.percentile(bootstrapped_means, [2.5, 97.5])" ] }, { @@ -214,19 +276,19 @@ "fig, ax = plt.subplots()\n", "\n", "# Create the histogram\n", - "#Add your code here\n", + "plt.hist(bootstrapped_means, bins=30, edgecolor='black', alpha=0.75)\n", "\n", "# Add a title\n", - "#Add your code here\n", + "plt.title('Bootstrapped Means for Sepal Length')\n", "\n", "# Add a label to the x-axis\n", - "#Add your code here\n", + "plt.xlabel('Bootstrapped Mean Sepal Length')\n", "\n", "# Add a label to the y-axis\n", - "#Add your code here\n", + "plt.ylabel('Frequency')\n", "\n", "# Show the plot\n", - "plt.show()" + "plt.show()\n" ] }, { @@ -244,7 +306,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Write your answer here" + "# We are 95% confident that the true mean Sepal Length falls between 5.7186 and 5.9747 units. 
\n", + "# I would recommend that the shipping company prepare a space 6 units in length to fit the iris flowers.\n" ] }, {