Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 78 additions & 15 deletions 02_assignments/assignment_3.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,13 @@
"metadata": {},
"outputs": [],
"source": [
"# Your code here"
"def alpha_func(D, idx):\n",
" # Extract Sepal's width and length based on idx\n",
" sepal_width = D.loc[idx, 'Sepal.Width']\n",
" sepal_length = D.loc[idx, 'Sepal.Length']\n",
" alpha = np.mean(sepal_width)*np.mean(sepal_length)\n",
" \n",
" return alpha"
]
},
{
Expand Down Expand Up @@ -106,11 +112,15 @@
"metadata": {},
"outputs": [],
"source": [
"def alpha_func(D, idx):\n",
" sepal_length = D.loc[idx, 'Sepal.Length']\n",
" avg_sepal_length = np.mean(sepal_length)\n",
" return avg_sepal_length\n",
"\n",
"rng = np.random.default_rng(0)\n",
"alpha_func(df,\n",
" rng.choice(100,\n",
" 100,\n",
" replace=True))"
"bootstrapped = rng.choice(100, 100, replace=True)\n",
"average_sepal_length = alpha_func(df, bootstrapped)\n",
"print(\"Average Sepal Length (bootstrapped):\", average_sepal_length)"
]
},
{
Expand All @@ -122,6 +132,8 @@
"\n",
"_(iii)_ Why is it (perhaps) not sufficient to simply calculate the mean of `Sepal.Length`? What more information will performing a bootstrap provide to us? \n",
"\n",
"The data is bootstrapped only once here. The aim of bootstrapping is to resample multiple times, observe the variability in the statistic of interest across the different bootstrap samples, and calculate the average result from multiple samples to compute the confidence interval. Because there is only one sample here, no averaged result can be obtained, and the mean of 'Sepal.Length' could be strongly affected by bias in that specific bootstrap sample.\n",
"\n",
"_(iv)_ We can perform bootstrapping in Python by defining a simple function using `boot_SE()` for computing the bootstrap standard error. Remember, because bootstrapping involves randomness, we must first set a seed for reproducibility!"
]
},
Expand All @@ -132,7 +144,17 @@
"metadata": {},
"outputs": [],
"source": [
"# Add your code here to set the seed"
"def boot_SE(D, statistic_func, num_bootstraps=1000):\n",
" rng = np.random.default_rng() \n",
" bootstrapped_result = []\n",
"\n",
" for i in range(num_bootstraps):\n",
" idx = rng.choice(len(D), size=len(D), replace=True)\n",
" value = alpha_func(D, idx)\n",
" bootstrapped_result.append(value)\n",
"\n",
" bootstrap_se = np.std(bootstrapped_result)\n",
" return bootstrap_se"
]
},
{
Expand All @@ -150,7 +172,20 @@
"metadata": {},
"outputs": [],
"source": [
"# Your code here"
"# Your code here\n",
"def boot_SE(D, statistic_func, num_bootstraps=1000):\n",
" rng = np.random.default_rng() \n",
" bootstrapped_result = []\n",
"\n",
" for i in range(num_bootstraps):\n",
" idx = rng.choice(len(D), size=len(D), replace=True)\n",
" value = alpha_func(D, idx)\n",
" bootstrapped_result.append(value)\n",
"\n",
" bootstrap_se = np.std(bootstrapped_result)\n",
" return bootstrap_se\n",
"\n",
"bootstrap_standard_error = boot_SE(df, average_sepal_length)\n",
"print(\"Bootstrap Standard Error (SE) for mean Sepal Length:\", bootstrap_standard_error)"
]
},
{
Expand All @@ -160,6 +195,9 @@
"source": [
"_(vi)_ What is the original mean value of `Sepal.Length`?\n",
"\n",
"The mean value is 5.8433\n",
"\n",
"\n",
"Next, let's look _inside_ our bootstrapping to understand the new, bootstrapped sample we have created. Let's review the bootstrapped range, by using `t_range = np.ptp(boot_se_samples)`.\n",
"\n",
"_(vii)_. Write code to review the bootstrapped mean value, and the standard deviation of the bootstrapped samples. Compare the mean against its original value."
Expand All @@ -172,7 +210,31 @@
"metadata": {},
"outputs": [],
"source": [
"# Add your code here"
"# original mean\n",
"def original_mean_sepal_length(D):\n",
" return np.mean(D['Sepal.Length'])\n",
"original_mean = original_mean_sepal_length(df)\n",
"\n",
"# bootstrapped mean\n",
"def bootstrap_mean_sepal_length(D, num_bootstraps=1000):\n",
" rng = np.random.default_rng(0)\n",
" bootstrapped_means = []\n",
"\n",
" for _ in range(num_bootstraps):\n",
" idx = rng.choice(len(D), size=len(D), replace=True)\n",
" bootstrapped_mean = np.mean(D.loc[idx, 'Sepal.Length'])\n",
" bootstrapped_means.append(bootstrapped_mean)\n",
"\n",
" return bootstrapped_means\n",
"\n",
"bootstrapped_means = bootstrap_mean_sepal_length(df) \n",
"bootstrapped_mean = np.mean(bootstrapped_means)\n",
"\n",
"# compare the mean\n",
"mean_difference = bootstrapped_mean - original_mean\n",
"\n",
"# the standard deviation of Bootstrapped Means:\n",
"bootstrapped_std = np.std(bootstrapped_means)"
]
},
{
Expand All @@ -190,7 +252,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Add your code here"
"confidence_interval = np.percentile(bootstrapped_means, [2.5, 97.5])"
]
},
{
Expand All @@ -214,19 +276,19 @@
"fig, ax = plt.subplots()\n",
"\n",
"# Create the histogram\n",
"#Add your code here\n",
"plt.hist(bootstrapped_means, bins=30, edgecolor='black', alpha=0.75)\n",
"\n",
"# Add a title\n",
"#Add your code here\n",
"plt.title('Bootstrapped Means for Sepal Length')\n",
"\n",
"# Add a label to the x-axis\n",
"#Add your code here\n",
"plt.xlabel('Bootstrapped Mean Sepal Length')\n",
"\n",
"# Add a label to the y-axis\n",
"#Add your code here\n",
"plt.ylabel('Frequency')\n",
"\n",
"# Show the plot\n",
"plt.show()"
"plt.show()\n"
]
},
{
Expand All @@ -244,7 +306,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Write your answer here"
"We are 95% confident that the true mean Sepal Length falls between 5.7186 and 5.9747 units. \n",
"I would recommend that the shipping company prepare a space 6 units in length to fit the iris flowers.\n"
]
},
{
Expand Down