diff --git a/FOI/Children_Act/FOI_251007047.ipynb b/FOI/Children_Act/FOI_251007047.ipynb new file mode 100644 index 0000000..585ee3f --- /dev/null +++ b/FOI/Children_Act/FOI_251007047.ipynb @@ -0,0 +1,548 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "b672a306-f4fa-4786-a991-5e86bfa0559d", + "metadata": {}, + "outputs": [], + "source": [ + "# FOI 251007047\n", + "#I would like to request the following information regarding Financial Dispute Resolution (FDR) hearings at the Family Court at Nottingham:\n", + "#For all Financial Dispute Resolution cases that were initiated in the second half of the year 2022 (1 July 2022 to 31 December 2022), please provide the mean, median, and mode average time taken, in weeks, from the date of the application to the date of the final resolution of the case. Family Court Nottingham only please.\n", + "#If you are unable to provide the exact figures, please provide the closest available data and specify the time period it covers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbd30d59-4a31-4686-85af-ceeed7d76a55", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd # a module which provides the data structures and functions to store and manipulate tables in dataframes\n", + "import pydbtools as pydb # A module which allows SQL queries to be run on the Analytical Platform from Python, see https://github.com/moj-analytical-services/pydbtools\n", + "import boto3 # allows you to directly create, update, and delete AWS resources from Python scripts\n", + "\n", + "# sets display options so dataframes are easier to view\n", + "pd.set_option(\"display.max_columns\", 100)\n", + "pd.set_option(\"display.width\", 900)\n", + "pd.set_option(\"display.max_colwidth\", 200)" + ] + }, + { + "cell_type": "markdown", + "id": "cb7b93ef-fada-41b0-bbf4-ed9ffddbdd20", + "metadata": {}, + "source": [ + "Uncomment the variables below if you need to run this notebook individually" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1a341e3-307e-40e5-b721-dffdb5b936d1", + "metadata": {}, + "outputs": [], + "source": [ + "#Variables to be used in this notebook\n", + "\n", + "#this is the athena database we will be storing our tables in\n", + "fcsq_database = \"fcsq\"\n", + "\n", + "#this is the s3 bucket we will be saving data to\n", + "s3 = boto3.resource(\"s3\")\n", + "bucket = s3.Bucket(\"alpha-family-data\")\n", + "\n", + "#Last full year to be published - including this publication\n", + "#annual_year = 2024\n", + "\n", + "#Current publication variables\n", + "#current_year = 2025\n", + "#current_quarter = 2\n", + "\n", + "#Next publication variables\n", + "#next_quarter_year = 2025\n", + "#next_quarter = 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e060ee63-0976-4f3d-b234-8de753e35fcf", + "metadata": {}, + "outputs": [], + "source": [ + "#this test should return no results as the SDP extraction step should have excluded all schedule 1 cases\n", + "#test = pydb.read_sql_query(\"SELECT * from __temp__.fr_data_filtered where casetypekey = '3.002305'\")\n", + "#test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0656aad-0dc7-4a83-9b69-5f17deed9b63", + "metadata": {}, + "outputs": [], + "source": [ + "#pydb.delete_database_and_data(\"__temp__\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63e972dd-d58e-4376-84c4-650755a5af8a", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Table 10 creation 
started\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e5035cd-f0eb-41f9-ab5f-1748366a368f", + "metadata": {}, + "outputs": [], + "source": [ + "#bringing in dim date dataset\n", + "create_dim_date_table = f\"\"\"\n", + " select \n", + " * \n", + " from common_lookup.dim_date\n", + "\"\"\"\n", + "pydb.create_temp_table(create_dim_date_table,'dim_date')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d263883b-2432-4ed5-a89b-1e53a32f98cc", + "metadata": {}, + "outputs": [], + "source": [ + "#test = pydb.read_sql_query(\"SELECT * from __temp__.dim_date limit 10\")\n", + "#test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f48f445-1578-48fe-91a0-aed4c14930eb", + "metadata": {}, + "outputs": [], + "source": [ + "#bringing in court dfj region dataset\n", + "create_dfj_region_table = f\"\"\"\n", + " select \n", + " * \n", + " from fcsq.div_court_dfj_region_lookup\n", + "\"\"\"\n", + "pydb.create_temp_table(create_dfj_region_table,'dfj_region')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8ec2d1f-ac77-4a42-a6b2-38a9c4b5cdfb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#test = pydb.read_sql_query(\"SELECT * from __temp__.court_dfj_region limit 10\")\n", + "#test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1a1b375-3242-4ca2-aa9f-88bf0b94a184", + "metadata": {}, + "outputs": [], + "source": [ + "#imports fr application type key table \n", + "fr_application_type_table = pd.read_csv(\"s3://alpha-family-data/CSVs/lookups/fr_application_type_lookup.csv\", low_memory=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e8a0dfa-fe53-4968-b011-cbbb00d860d2", + "metadata": {}, + "outputs": [], + "source": [ + "pydb.dataframe_to_temp_table(fr_application_type_table, \"fr_dim\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0da8f52-7a47-41f7-bbf9-65f617cf33c2", + "metadata": {}, + "outputs": [], + "source": [ + "#test = pydb.read_sql_query(\"SELECT * from __temp__.fr_dim limit 10\")\n", + "#test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7d1be05-a191-4a14-be08-a5d8d0e76b87", + "metadata": {}, + "outputs": [], + "source": [ + "# Removing hyphens and filtering for only substantive and first applications\n", + "# Adding a date key for case received date as start year is needed for FOI\n", + "create_fr_data_filtered_table = f\"\"\"\n", + " select \n", + " *,\n", + " cast(date_format(casedisposeddate,'%Y%m%d') as int) as date_key,\n", + " cast(date_format(casereceiveddate,'%Y%m%d') as int) as date_key2\n", + " from fcsq.fr_fct_legal_case_details\n", + " where substantiveapplicationind = 1\n", + " and firstapplicationforcaseind = 1\n", + "\"\"\"\n", + "pydb.create_temp_table(create_fr_data_filtered_table,'fr_data_filtered')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "324a93e6-66ed-47de-98fc-1278e8dc80c8", + "metadata": {}, + "outputs": [], + "source": [ + "#test = pydb.read_sql_query(\"SELECT * from __temp__.fr_data_filtered limit 10\")\n", + "#test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8a41239-4bbc-4e45-b918-ab5e8fbfb430", + "metadata": {}, + "outputs": [], + "source": [ + "#fr_timeliness_info with start year and quarter added\n", + "create_fr_timeliness_info_table = f\"\"\"\n", + " select \n", + " d.calendar_year as year,\n", + " d.calendar_year_qtr as year_quarter,\n", + " dd.calendar_year as 
start_year,\n", + " dd.calendar_year_qtr as start_year_quarter,\n", + " fr_data_filtered.sourcecasereferencecid,\n", + " fr_data_filtered.divorcecasereferencecid,\n", + " fr_data_filtered.casereceiveddate,\n", + " fr_data_filtered.finalorderdate,\n", + " fr_dim.applicantrepresentedcind as petitioner_represented,\n", + " fr_dim.respondentrepresentedcind as respondent_represented,\n", + " round((to_unixtime(fr_data_filtered.finalorderdate) - \n", + " to_unixtime(fr_data_filtered.casereceiveddate))/86400 , 0)\n", + " as fr_first_disp_day_durtn,\n", + " fr_data_filtered.locationkey as location_key,\n", + " fr_data_filtered.sourcetypename,\n", + " fr_data_filtered.adtclmninsertedbyprocessname\n", + " from __temp__.fr_data_filtered\n", + " left join __temp__.dim_date d\n", + " on fr_data_filtered.date_key = d.date_key\n", + " left join __temp__.dim_date dd\n", + " on fr_data_filtered.date_key2 = dd.date_key\n", + " left join __temp__.fr_dim\n", + " on fr_data_filtered.financialremedyapplicationtypekey = \n", + " fr_dim.financialremedyapplicationtypekey\n", + "\n", + "\"\"\"\n", + "pydb.create_temp_table(create_fr_timeliness_info_table,'fr_timeliness_info')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9bf53a95-9f63-4726-885b-4c2297f26967", + "metadata": {}, + "outputs": [], + "source": [ + "#test = pydb.read_sql_query(\"SELECT * from __temp__.fr_timeliness_info\")\n", + "#test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5c4defd-7776-40b3-bfd6-4f64e839d338", + "metadata": {}, + "outputs": [], + "source": [ + "#fr_timeliness_info_minus - filters out cases where start date is after final order date and any nulls in final order date\n", + "create_fr_timeliness_info_minus_table = f\"\"\"\n", + " select * from __temp__.fr_timeliness_info\n", + " where fr_first_disp_day_durtn > -1\n", + " and finalorderdate is not null\n", + "\n", + "\"\"\"\n", + "pydb.create_temp_table(create_fr_timeliness_info_minus_table,'fr_timeliness_info_minus')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "469a31e2-4d58-46c1-8730-443954709ae4", + "metadata": {}, + "outputs": [], + "source": [ + "#test = pydb.read_sql_query(\"SELECT * from __temp__.fr_timeliness_info_minus\")\n", + "#test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02486b70-a321-4b7f-8a37-a9b3ba8010e5", + "metadata": {}, + "outputs": [], + "source": [ + "#Cases with FDR hearings taking place in Nottingham\n", + "#4.000072 Hearing Type Key for Financial Dispute Resolution\n", + "#Check RDM.dimHearingType for types\n", + "create_fdr_hearings = f\"\"\"\n", + "select distinct sourcecasereferencecid\n", + "FROM fcsq.fr_fct_contested_hearing t1\n", + "left join __temp__.dfj_region t2\n", + " on t1.locationkey \n", + " = t2.location_key\n", + "WHERE t1.hearingtypekey = 4.000072\n", + "AND t2.location_name in ('Nottingham County', 'Nottingham FRC')\n", + "\"\"\"\n", + "pydb.create_temp_table(create_fdr_hearings, \"fdr_nottingham_cases\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8ebce27-25f2-4964-902e-c9c4e4f28b93", + "metadata": {}, + "outputs": [], + "source": [ + "# Modified versions of FCSQ Timeliness Table with an extra column for FDR hearings and FDR hearings in Nottingham\n", + "create_fr_timeliness_with_rep_table = f\"\"\"\n", + " select\n", + " fr_timeliness_info_minus.sourcecasereferencecid as case_number,\n", + " fr_timeliness_info_minus.year,\n", + " fr_timeliness_info_minus.year_quarter,\n", + " 
fr_timeliness_info_minus.start_year,\n", + "        fr_timeliness_info_minus.start_year_quarter,\n", + "        case when fr_timeliness_info_minus.petitioner_represented = 'Yes'\n", + "        and fr_timeliness_info_minus.respondent_represented = 'No'\n", + "        then 1 else 0\n", + "        end as pet_rep,\n", + "        case when fr_timeliness_info_minus.petitioner_represented = 'No'\n", + "        and fr_timeliness_info_minus.respondent_represented = 'Yes'\n", + "        then 1 else 0\n", + "        end as resp_rep,\n", + "        case when fr_timeliness_info_minus.petitioner_represented = 'Yes'\n", + "        and fr_timeliness_info_minus.respondent_represented = 'Yes'\n", + "        then 1 else 0\n", + "        end as both_rep,\n", + "        case when fr_timeliness_info_minus.petitioner_represented = 'No'\n", + "        and fr_timeliness_info_minus.respondent_represented = 'No'\n", + "        then 1 else 0\n", + "        end as neither_rep,\n", + "        case when fr_timeliness_info_minus.sourcecasereferencecid in (SELECT distinct sourcecasereferencecid FROM fcsq.fr_fct_contested_hearing WHERE hearingtypekey = 4.000072)\n", + "        then 1 else 0\n", + "        end as fdr_hearing_ind,\n", + "        case when fr_timeliness_info_minus.sourcecasereferencecid in (SELECT distinct sourcecasereferencecid FROM __temp__.fdr_nottingham_cases)\n", + "        then 1 else 0\n", + "        end as fdr_nottingham_hearing_ind,\n", + "        dfj_region.location_name,\n", + "        dfj_region.region_lookup as region,\n", + "        round((fr_timeliness_info_minus.fr_first_disp_day_durtn/7),1)\n", + "        as app_to_first_fr_weeks,\n", + "        sourcetypename\n", + "    from __temp__.fr_timeliness_info_minus\n", + "    join __temp__.dfj_region\n", + "        on fr_timeliness_info_minus.location_key \n", + "        = dfj_region.location_key\n", + "\"\"\"\n", + "pydb.create_temp_table(create_fr_timeliness_with_rep_table,'fr_timeliness_with_rep')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d7e7dda-127c-442e-bb62-dfee34c17dbf", + "metadata": {}, + "outputs": [], + "source": [ + "#test = pydb.read_sql_query(\"SELECT * from __temp__.fr_timeliness_with_rep WHERE fdr_nottingham_hearing_ind = 1 and start_year = 2022\")\n", + "#test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "201fb6e3-380d-4863-b119-c119122f4bb2", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1f9a526-f63d-4066-bc44-bc6dec18c0ad", + "metadata": {}, + "outputs": [], + "source": [ + "#test = pydb.read_sql_query(\"SELECT * from __temp__.fr_timeliness_with_rep WHERE fdr_hearing_ind = 1 and start_year = 2022\")\n", + "#test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae5919b4-c060-48b6-bd0a-129540650812", + "metadata": {}, + "outputs": [], + "source": [ + "#test = pydb.read_sql_query(\"SELECT * from __temp__.fr_timeliness_with_rep WHERE fdr_nottingham_hearing_ind = 1 and start_year_quarter in ('2022-Q3', '2022-Q4')\")\n", + "#test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4946afb-4839-436e-97dd-858bf6ee720d", + "metadata": {}, + "outputs": [], + "source": [ + "#Filtering to the Nottingham court only, for the second half of 2022\n", + "#Year and Quarter are from the case start rather than the case end\n", + "#Time is for final order\n", + "#Hearing Location is Nottingham\n", + "create_fr_repgrp_timeliness_table = f\"\"\"\n", + "    select \n", + "        case_number,\n", + "        start_year as year,\n", + "        start_year_quarter as quarter,\n", + "        app_to_first_fr_weeks\n", + "    from __temp__.fr_timeliness_with_rep\n", + "    where \n", + "        start_year_quarter in ('2022-Q3', '2022-Q4')\n", + "        and 
fdr_nottingham_hearing_ind = 1 \n", + "\"\"\"\n", + "pydb.create_temp_table(create_fr_repgrp_timeliness_table,'fr_repgrp_timeliness')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2096586d-c9e8-408d-89c2-f5b0541dd094", + "metadata": {}, + "outputs": [], + "source": [ + "#pydb.read_sql_query(\"SELECT * FROM __temp__.fr_repgrp_timeliness limit 10\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a1a3399-9c81-4567-a05a-3e810c40ec3b", + "metadata": {}, + "outputs": [], + "source": [ + "#Mode Duration - Rounded week with the most count will be the mode\n", + "mode = pydb.read_sql_query(\"SELECT ROUND(app_to_first_fr_weeks, 0) AS round, count(*) as count FROM __temp__.fr_repgrp_timeliness GROUP BY ROUND(app_to_first_fr_weeks, 0)\")\n", + "mode.sort_values(by='count', ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60c65e04-e5bb-4246-9f3e-d13093616108", + "metadata": {}, + "outputs": [], + "source": [ + "# Adding a column for median and mean durations\n", + "create_fr_timeliness_rep_ew_table = f\"\"\"\n", + "with fr_repgrp_timeliness_ew_data_half as (\n", + "\n", + " select\n", + " case_number,\n", + " quarter,\n", + " app_to_first_fr_weeks,\n", + " ntile(2)\n", + " OVER (\n", + " order by app_to_first_fr_weeks\n", + " ) AS data_half\n", + " from __temp__.fr_repgrp_timeliness\n", + " where app_to_first_fr_weeks is not null\n", + "),\n", + "\n", + "fr_timeliness_rep_ew as (\n", + " select\n", + " 'Financial Remedy' as case_type,\n", + " 'Nottingham' as region_ew,\n", + " COUNT(*) as number_of_disposals,\n", + " round(avg(app_to_first_fr_weeks), 1) as mean_duration,\n", + " round(case when count(*) % 2 = 0 \n", + " then(max(case when data_half = 1 then app_to_first_fr_weeks end) + min(case when data_half = 2 then app_to_first_fr_weeks END)) / 2.0\n", + " else \n", + " max(case when data_half = 1 then app_to_first_fr_weeks end)\n", + " end, 1) as median_duration\n", + " from fr_repgrp_timeliness_ew_data_half\n", + " \n", + ")\n", + "\n", + "select * from fr_timeliness_rep_ew\n", + "\n", + "\"\"\"\n", + "pydb.create_temp_table(create_fr_timeliness_rep_ew_table,'fr_timeliness_rep_ew')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f91763e-89db-4266-96e6-93fa49e91921", + "metadata": {}, + "outputs": [], + "source": [ + "#test = pydb.read_sql_query(\"SELECT * from __temp__.fr_timeliness_rep_region\")\n", + "#test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be910a15-97f8-43ea-a588-0ceed6dba08a", + "metadata": {}, + "outputs": [], + "source": [ + "#Creating final data\n", + "final = pydb.read_sql_query(\"SELECT * from __temp__.fr_timeliness_rep_ew\")\n", + "final" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "fcsq_data_min", + "language": "python", + "name": "venv_fcsq_data_min" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ad_hoc_requests/children_act/public_law_regional_breakdown/Adhoc_Secondary_Suppression.ipynb b/ad_hoc_requests/children_act/public_law_regional_breakdown/Adhoc_Secondary_Suppression.ipynb new file mode 100644 index 0000000..2b8cd15 --- /dev/null +++ b/ad_hoc_requests/children_act/public_law_regional_breakdown/Adhoc_Secondary_Suppression.ipynb @@ 
-0,0 +1,411 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "80fdd3b4-24d3-4512-b7da-56c5540e40fa", + "metadata": {}, + "outputs": [], + "source": [ + "# Import packages\n", + "import pandas as pd\n", + "import itertools\n", + "import numpy as np\n", + "import pydbtools as pydb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6150030b-1559-4d2b-8158-28c11c940d8a", + "metadata": {}, + "outputs": [], + "source": [ + "#Setting folders to read files from\n", + "folder_link = \"s3://alpha-family-data/CSVs/Public_Law_SDP\"\n", + "adhoc_link = \"s3://alpha-family-data/Adhoc\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fd667bf-5977-41f7-9ed2-86e65a37beba", + "metadata": {}, + "outputs": [], + "source": [ + "#pydb.delete_database_and_data(database=\"__temp__\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e37e07fc-781f-43a5-957f-48993ccd1850", + "metadata": {}, + "outputs": [], + "source": [ + "#Adhoc for regional breakdown of applications and orders by region from 2019 Q1 - 2025 Q2\n", + "#There is a need for secondary suppression which is done in this notebook\n", + "#First getting the files created in SDP SQL\n", + "apps_regional_breakdown = pd.read_csv(f\"\"\"{adhoc_link}/apps_regional_breakdown_v2.csv\"\"\", keep_default_na = False, na_values = ['', 'NULL'])\n", + "pydb.dataframe_to_temp_table(apps_regional_breakdown, \"apps_regional_breakdown\")\n", + "\n", + "ords_regional_breakdown = pd.read_csv(f\"\"\"{adhoc_link}/ords_regional_breakdown_v2.csv\"\"\", keep_default_na = False, na_values = ['', 'NULL'])\n", + "pydb.dataframe_to_temp_table(ords_regional_breakdown, \"ords_regional_breakdown\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "728efe67-ac3b-470f-84c5-d853728f4ac6", + "metadata": {}, + "outputs": [], + "source": [ + "# Getting the lookups\n", + "ca_apps_lookup = pd.read_csv(f\"\"\"{folder_link}/ca_apps_lookup.csv\"\"\")\n", + "pydb.dataframe_to_temp_table(ca_apps_lookup, \"ca_apps_lookup\")\n", + "\n", + "ca_ords_lookup = pd.read_csv(f\"\"\"{folder_link}/ca_ords_lookup.csv\"\"\")\n", + "pydb.dataframe_to_temp_table(ca_ords_lookup, \"ca_ords_lookup\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8f6d3bf-ec2e-4ba1-b3aa-5d5fefe91ffc", + "metadata": {}, + "outputs": [], + "source": [ + "#Joining applications to lookup\n", + "pydb.create_temp_table(\n", + " f\"\"\" SELECT t1.Year, t1.Qtr, t1.Type, \n", + " CASE WHEN t1.Region IS NULL or t1.Region = 'Not Specified' \n", + " THEN 'Not Specified' ELSE t1.Region \n", + " END AS Region, \n", + " t2.Order_type_code, t2.Order_desc as Order_description, SUM(Count) as Count\n", + " FROM __temp__.apps_regional_breakdown t1\n", + " LEFT JOIN __temp__.ca_apps_lookup t2\n", + " ON t1.applicationtypekey = t2.applicationtypekey\n", + " GROUP BY t1.Year, t1.Qtr, t1.Type, t1.Region, t2.Order_type_code, t2.Order_desc\n", + " \"\"\",\n", + " \"ca_apps_regional_join\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9eac2076-8db7-4982-8bd4-789049cb598d", + "metadata": {}, + "outputs": [], + "source": [ + "#Joining orders to lookup\n", + "pydb.create_temp_table(\n", + " f\"\"\" SELECT t1.Year, t1.Qtr, t1.Type, \n", + " CASE WHEN t1.Region IS NULL or t1.Region = 'Not Specified' \n", + " THEN 'Not Specified' ELSE t1.Region \n", + " END AS Region,\n", + " t2.Order_type_code, t2.Order_desc as Order_description, SUM(Count) as Count\n", + " FROM 
__temp__.ords_regional_breakdown t1\n", + " LEFT JOIN __temp__.ca_ords_lookup t2\n", + " ON t1.OrderMadeTypeKey = t2.OrderTypeKey\n", + " GROUP BY t1.Year, t1.Qtr, t1.Type, t1.Region, t2.Order_type_code, t2.Order_desc\n", + " \"\"\",\n", + " \"ca_ords_regional_join\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7ffa92d-1ca6-4fcc-ad55-1e28af77e0c0", + "metadata": {}, + "outputs": [], + "source": [ + "#Joining applications and orders data together\n", + "df = pydb.read_sql_query(f\"\"\"SELECT * FROM __temp__.ca_apps_regional_join\n", + "UNION ALL\n", + "SELECT * FROM __temp__.ca_ords_regional_join\n", + "ORDER BY Year, Qtr, Type, Order_type_code, Order_description, Region\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68fcba63-4d79-4b09-9a98-0d319cb79ba4", + "metadata": {}, + "outputs": [], + "source": [ + "#Read the CSV and convert to pandas dataframe\n", + "#df1 = pd.read_csv(\"combined_regional_breakdown.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f408058-87d9-4382-bd58-93be4db1c5d7", + "metadata": {}, + "outputs": [], + "source": [ + "#Applying some formatting to the data frame\n", + "df['order_type_code'] = df['order_type_code'].apply(np.int64)\n", + "df.columns = df.columns.str.capitalize()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf86057a-498a-443a-93ad-34ec8f188c9f", + "metadata": {}, + "outputs": [], + "source": [ + "# Reordering columns\n", + "df = df[['Year', 'Qtr', 'Type', 'Order_type_code', 'Order_description', 'Region', 'Count']]\n", + "#df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb3c7f14-7b18-4727-b4d8-2904cef57bf9", + "metadata": {}, + "outputs": [], + "source": [ + "#Extracting data before suppression for QA\n", + "df.to_csv (r'Regional_Unsuppressed.csv', header = True, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "565bdfbe-9332-49d4-9dad-403af9076ceb", + "metadata": {}, + "outputs": [], + "source": [ + "#These are the variables which change in the regional totals\n", + "iterating_variables = ['Year','Qtr','Type', 'Order_type_code']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eafe4ec3-9315-4c5b-a626-4447b5f63c5b", + "metadata": {}, + "outputs": [], + "source": [ + "#Aggregating to remove any duplicates in model\n", + "#df1 = df1.groupby(['Year','Qtr','Type', 'Order_type_code', 'Order_description', 'Region']).sum(\"Count\").reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f4fbe29-8d7f-4af1-85c1-f655101e3fe2", + "metadata": {}, + "outputs": [], + "source": [ + "#Creates lists of all the distinct values for each variable except Region and redundant order description\n", + "Years = df.Year.unique()\n", + "Quarters = df.Qtr.unique()\n", + "Types = df.Type.unique()\n", + "#Region = df.Region.unique()\n", + "#Order_descriptions = df.Order_description.unique()\n", + "Order_type_codes = df.Order_type_code.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "415181a2-c61b-43d5-aee6-289535eafa0a", + "metadata": {}, + "outputs": [], + "source": [ + "#Creates a list of all the unique combinations of totals \n", + "combinations = [(a,b,c,d) for a in Years for b in Quarters for c in Types for d in Order_type_codes]\n", + "#combinations = list(itertools.product(Years, Quarters, Types, Order_type_codes))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"1678c318-1690-4757-b5c0-4a9265bb3a30", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#Checking Length\n", + "len(combinations)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "657f3b30-0be9-4241-87ac-2fde95e847c5", + "metadata": {}, + "outputs": [], + "source": [ + "# Checking categories\n", + "#df.loc[(df[iterating_variables[0]] == combinations[0][0]) &\n", + "# (df[iterating_variables[1]] == combinations[0][1])&\n", + "# (df[iterating_variables[2]] == combinations[0][2])& \n", + "# (df[iterating_variables[3]] == combinations[0][3])]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6951e556-606a-4c30-b59d-152e46228225", + "metadata": {}, + "outputs": [], + "source": [ + "list_to_secondary_suppress = []\n", + "for i in range(len(combinations)): #Runs through each combination\n", + " \n", + " #For each combination, get a list of all the Count totals\n", + " iteration_of_Count = df.loc[(df[iterating_variables[0]] == combinations[i][0]) &\n", + " (df[iterating_variables[1]] == combinations[i][1])&\n", + " (df[iterating_variables[2]] == combinations[i][2])& \n", + " (df[iterating_variables[3]] == combinations[i][3])].Count\n", + " \n", + " iteration_app_total = sum(iteration_of_Count) #Calculate total Count for this combination\n", + " \n", + " if iteration_app_total > 5: #If regional total isn't suppressed \n", + " suppressed_apps = df.loc[(df[iterating_variables[0]] == combinations[i][0]) &\n", + " (df[iterating_variables[1]] == combinations[i][1])&\n", + " (df[iterating_variables[2]] == combinations[i][2])& \n", + " (df[iterating_variables[3]] == combinations[i][3])\n", + " & (df['Count']<=5)& (df['Count']>=1)].Count \n", + " #gets a list of values in the range to be suppressed\n", + " \n", + " #If there is only 1 value to be primary suppressed, or if values are all 1s or all 5s then flag combination as needing secondary suppression\n", + " if (len(suppressed_apps)==1 or ((len(suppressed_apps)==sum(suppressed_apps) or 5*len(suppressed_apps)==sum(suppressed_apps)) and len(suppressed_apps)>0)):\n", + " list_to_secondary_suppress.append(combinations[i])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f3d91fe-872f-4a29-9e73-dec7f2c95555", + "metadata": {}, + "outputs": [], + "source": [ + "#check the list produced\n", + "len(list_to_secondary_suppress)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c6a5fa8-75a1-4e30-9093-e2d9a13ad943", + "metadata": {}, + "outputs": [], + "source": [ + "#list_to_secondary_suppress" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d08c6d1-d640-487b-acba-8cb9b8ce4cc1", + "metadata": {}, + "outputs": [], + "source": [ + "#Applying primary suppression for data frame rows. 
-1 is used to keep the column numeric; it is replaced later\n", + "for index,row in df.iterrows():\n", + "    #primary suppress any values between 1 and 5\n", + "    if row['Count']<=5 and row['Count']>=1:\n", + "        df.at[index,'Count']= -1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6af7c83c-d34b-4d3a-a733-0f0e6bd6a510", + "metadata": {}, + "outputs": [], + "source": [ + "for i in range(len(list_to_secondary_suppress)):\n", + "    \n", + "    #Iterates through the secondary suppression flag list and creates a list of applicant totals for every combination (ignoring already suppressed values)\n", + "    iteration_of_Count = df.loc[(df[iterating_variables[0]] == list_to_secondary_suppress[i][0]) &\n", + "                  (df[iterating_variables[1]] == list_to_secondary_suppress[i][1])&\n", + "                  (df[iterating_variables[2]] == list_to_secondary_suppress[i][2])& \n", + "                  (df[iterating_variables[3]] == list_to_secondary_suppress[i][3])&\n", + "                  (df['Count']!= -1)]\n", + "    \n", + "    #Finds the index of the lowest value not primary suppressed\n", + "    min_apps = -1\n", + "    for index in iteration_of_Count.index:\n", + "        if min_apps==-1:\n", + "            min_apps = df.at[index,'Count']\n", + "            min_index = index\n", + "        else:\n", + "            if df.at[index,'Count']