diff --git a/02_activities/assignments/assignment_1 (1).ipynb b/02_activities/assignments/assignment_1 (1).ipynb new file mode 100644 index 00000000..3e2b3ac4 --- /dev/null +++ b/02_activities/assignments/assignment_1 (1).ipynb @@ -0,0 +1,176 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 469 + }, + "id": "XOe9KOlXbmT1", + "outputId": "902b42af-a146-495a-d1cb-85532cf8a750" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
# Default parameters of the model (used when simulate_event is called
# without the corresponding keyword argument).
ATTACK_RATE = 0.10
TRACE_SUCCESS = 0.20
SECONDARY_TRACE_THRESHOLD = 2


def _proportion(numerator, denominator):
    """Return numerator / denominator as a float, or NaN if the denominator is 0."""
    if denominator == 0:
        return float('nan')
    return float(numerator) / float(denominator)


def simulate_event(m, attack_rate=None, trace_success=None,
                   secondary_trace_threshold=None, rng=None):
    """
    Simulates one round of infection and contact tracing.

    Builds a population of 200 'wedding' and 800 'brunch' attendees, infects a
    random subset, applies primary tracing (an independent draw per infected
    person) and secondary tracing (every infected attendee of an event label
    with enough primary traces is marked traced), then reports how much of the
    infected and of the traced population is attributed to weddings.

    Parameters:
    - m: Dummy parameter for iteration purposes; it is not used.
    - attack_rate: Fraction of the population to infect. Defaults to the
      module-level ATTACK_RATE, read at call time.
    - trace_success: Probability that an infected person is traced in the
      primary step. Defaults to the module-level TRACE_SUCCESS.
    - secondary_trace_threshold: Minimum number of primary traces for an event
      label to trigger secondary tracing. Defaults to the module-level
      SECONDARY_TRACE_THRESHOLD.
    - rng: Optional numpy.random.Generator. When omitted, NumPy's global
      random state is used, so np.random.seed(...) keeps controlling the
      output exactly as before.

    Returns:
    - A tuple (p_wedding_infections, p_wedding_traces) of floats. A proportion
      is NaN when its denominator is zero (nobody infected / nobody traced).
    """
    if attack_rate is None:
        attack_rate = ATTACK_RATE
    if trace_success is None:
        trace_success = TRACE_SUCCESS
    if secondary_trace_threshold is None:
        secondary_trace_threshold = SECONDARY_TRACE_THRESHOLD

    # Pick the random source. The two draws below happen in the same order and
    # with the same sizes as the global-state calls they generalize.
    if rng is None:
        choose, uniform = np.random.choice, np.random.rand
    else:
        choose, uniform = rng.choice, rng.random

    # Population: one row per attendee, 'traced' starts as missing (nullable boolean)
    events = ['wedding'] * 200 + ['brunch'] * 800
    ppl = pd.DataFrame({
        'event': events,
        'infected': False,
        'traced': pd.array([pd.NA] * len(events), dtype='boolean'),
    })

    # Infect a random subset of people (sampling without replacement)
    n_to_infect = int(len(ppl) * attack_rate)
    infected_indices = choose(ppl.index.to_numpy(), size=n_to_infect, replace=False)
    ppl['infected'] = ppl.index.isin(infected_indices)

    # Primary contact tracing: one uniform draw per infected person
    n_infected = int(ppl['infected'].sum())
    if n_infected:
        ppl.loc[ppl['infected'], 'traced'] = uniform(n_infected) < trace_success

    # Secondary contact tracing: event labels with enough primary traces get
    # all of their infected attendees marked as traced
    primary_traced = ppl['traced'].fillna(False).to_numpy(dtype=bool)
    event_trace_counts = ppl.loc[primary_traced, 'event'].value_counts()
    events_traced = event_trace_counts[event_trace_counts >= secondary_trace_threshold].index
    ppl.loc[ppl['event'].isin(events_traced) & ppl['infected'], 'traced'] = True

    # Proportions attributed to weddings, computed on plain boolean arrays so
    # missing 'traced' values (never-infected people) count as not traced
    infected = ppl['infected'].to_numpy(dtype=bool)
    traced = infected & ppl['traced'].fillna(False).to_numpy(dtype=bool)
    at_wedding = (ppl['event'] == 'wedding').to_numpy()

    wedding_infections = int((infected & at_wedding).sum())
    brunch_infections = int((infected & ~at_wedding).sum())
    p_wedding_infections = _proportion(wedding_infections,
                                       wedding_infections + brunch_infections)

    wedding_traces = int((traced & at_wedding).sum())
    brunch_traces = int((traced & ~at_wedding).sum())
    p_wedding_traces = _proportion(wedding_traces, wedding_traces + brunch_traces)

    return p_wedding_infections, p_wedding_traces
6))\n", + "sns.histplot(props_df['Infections'], color=\"blue\", alpha=0.75, binwidth=0.05, kde=False, label='Infections from Weddings')\n", + "sns.histplot(props_df['Traces'], color=\"red\", alpha=0.75, binwidth=0.05, kde=False, label='Traced to Weddings')\n", + "plt.xlabel(\"Proportion of cases\")\n", + "plt.ylabel(\"Frequency\")\n", + "plt.title(\"Impact of Contact Tracing on Perceived Flu Infection Sources\")\n", + "plt.legend()\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Question 1: Sampling Stages in the Model\n", + "\n", + "The simulation involves multiple stages of sampling, each of which contributes to variability and potential bias in the results.\n", + "\n", + "First, a random sample of individuals is selected to become infected. From a total population of 1,000 event attendees (200 wedding attendees and 800 brunch attendees), 10% are randomly chosen using np.random.choice() without replacement. This represents the infection process and assumes equal probability of infection across all individuals.\n", + "\n", + "Second, primary contact tracing is applied to infected individuals. For each infected person, tracing success is determined using a Bernoulli trial implemented through np.random.rand() with a success probability of 0.20. This stage samples from the infected population and introduces additional randomness.\n", + "\n", + "Third, secondary contact tracing is applied at the event level. If the number of successfully traced infected individuals at an event reaches or exceeds a predefined threshold, all infected individuals from that event are marked as traced. This step is conditional on earlier sampling and is non-random, leading to systematic overrepresentation of certain events.\n", + "\n", + "Together, these sampling stages demonstrate how non-random tracing mechanisms can bias observed outcomes, even when the initial infection process is random." 
+ ], + "metadata": { + "id": "Ij9sfdKJe8Ij" + } + }, + { + "cell_type": "markdown", + "source": [ + "Question 2\n", + "\n", + "When the simulation is run with only 10 repetitions, the resulting distributions are highly variable and differ substantially between runs. The small number of simulations leads to unstable estimates and noisy histograms, making the results poorly reproducible.\n", + "\n", + "Increasing the number of repetitions to 100 reduces variability and produces smoother distributions; however, noticeable differences between runs remain. This demonstrates that while larger sample sizes improve stability, randomness still influences the results when no reproducibility controls are applied.\n", + "\n", + "With 1,000 repetitions, the distributions become more stable, but repeated executions of the script still produce slightly different outputs due to the use of random sampling without a fixed seed." + ], + "metadata": { + "id": "Ayqm_ss_g7zK" + } + }, + { + "cell_type": "markdown", + "source": [ + "Question 3\n", + "\n", + "To ensure reproducibility, a fixed random seed was set using np.random.seed(42) at the beginning of the script. This change ensures that the sequence of random numbers generated during the simulation remains identical across runs.\n", + "\n", + "By fixing the random seed, the same individuals are selected for infection, the same tracing outcomes occur, and the resulting histograms are identical every time the script is executed. This modification does not remove randomness from the model but ensures that the randomness is controlled and reproducible, which is essential for scientific transparency and verification." + ], + "metadata": { + "id": "uTwLBWM5g-z1" + } + } + ] +} \ No newline at end of file