diff --git a/data/deprivation_of_liberty/.ipynb_checkpoints/deprivation_csv-checkpoint.ipynb b/data/deprivation_of_liberty/.ipynb_checkpoints/deprivation_csv-checkpoint.ipynb new file mode 100644 index 0000000..345c4c3 --- /dev/null +++ b/data/deprivation_of_liberty/.ipynb_checkpoints/deprivation_csv-checkpoint.ipynb @@ -0,0 +1,718 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "393de839-52ce-437d-8574-a852509b9f39", + "metadata": {}, + "source": [ + "## 1. Import packages and set options" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "950a6017-0770-47a6-9d31-3cba3f6db693", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd # a module which provides the data structures and functions to store and manipulate tables in dataframes\n", + "import pydbtools as pydb # A module which allows SQL queries to be run on the Analytical Platform from Python, see https://github.com/moj-analytical-services/pydbtools\n", + "import boto3 # allows you to directly create, update, and delete AWS resources from Python scripts\n", + "import numpy as np\n", + "import re\n", + "import math\n", + "#import pandasql\n", + "\n", + "# sets parameters to view dataframes for tables easier\n", + "pd.set_option(\"display.max_columns\", 100)\n", + "pd.set_option(\"display.width\", 900)\n", + "pd.set_option(\"display.max_colwidth\", 200)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b16ae6d-7c13-4fd5-bc15-0bd1c8d8bc1b", + "metadata": {}, + "outputs": [], + "source": [ + "#change these to the current quarter not the quarter being published\n", + "latest_quarter = 3\n", + "latest_year = 2025\n", + "\n", + "#change these to the current quarter being published\n", + "pub_quarter = 2\n", + "pub_year = 2025" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "650f0c53-99f0-4c9d-91a9-8d80ce051bfe", + "metadata": {}, + "outputs": [], + "source": [ + "#imports DOL extract data from S3 bucket into a temporary table\n", + "dol_table = pd.read_csv(\"s3://alpha-family-data/CSVs/Deprivation_of_Liberty/DoL_extract.csv\", low_memory = False)\n", + "dol_table.columns = dol_table.columns.str.lower()\n", + "#Converting some columns to datetime\n", + "dol_table['issuedate'] = pd.to_datetime(dol_table['issuedate'], format = '%d/%m/%Y')\n", + "dol_table['orderdate'] = pd.to_datetime(dol_table['orderdate'], format = '%d/%m/%Y')\n", + "dol_table['orderfinished2'] = dol_table['orderfinished'].astype(str)\n", + "\n", + "#Adding some extra columns for year/quarter and time difference\n", + "dol_table['year'] = dol_table['issuedate'].dt.year\n", + "dol_table['quarter'] = dol_table['issuedate'].dt.quarter\n", + "\n", + "dol_table['ordyear'] = dol_table['orderdate'].dt.year\n", + "dol_table['ordquarter'] = dol_table['orderdate'].dt.quarter\n", + "\n", + "\n", + "#Adding age band column\n", + "dol_table['ageband'] = np.where(pd.isnull(dol_table['ageofchild']), 'Unknown',\n", + " np.where(dol_table['ageofchild'] < 13, '0-12 years',\n", + " np.where(dol_table['ageofchild'] < 16, '13-15 years',\n", + " np.where(dol_table['ageofchild'] < 19, '16-18 years', 'Other'))))\n", + "\n", + "# Order State\n", + "#Checks whether orderfinished has certain conditions or is empty\n", + "cond = [dol_table['orderfinished2'] == 'No order', dol_table['orderfinished2'] == 'Refused', dol_table['orderfinished2'] == 'Withdrawn', pd.notnull(dol_table['orderdate'])]\n", + "results = ['No Order', 'Refused', 'Withdrawn', 'Order Made']\n", + "dol_table['orderstate2'] = np.select(cond, results, 'Ongoing case')\n", + "dol_table['orderstate'] = np.where((dol_table['orderfinished2'] != 'nan') & (dol_table['orderstate2'] == 'Order Made'), 'Final Order Made', dol_table['orderstate2'])\n", + "\n", + "#Order Finished Date\n", + "dol_table['ordfindate'] = pd.to_datetime(np.where(dol_table['orderstate'] == 'Final Order Made', dol_table['orderfinished'], pd.NaT), format = '%d/%m/%Y', errors = 'coerce')\n", + "\n", + "dol_table['ordspan_days'] = (dol_table['ordfindate'] - dol_table['orderdate']).astype(int)\n", + "dol_table['ordspan_months'] = (dol_table['ordfindate'] - dol_table['orderdate'])/np.timedelta64(1, 'M')\n", + "\n", + "ordspan_cond = [dol_table['ordspan_months'] < 3, dol_table['ordspan_months'] < 6, dol_table['ordspan_months'] < 9, dol_table['ordspan_months'] < 12, dol_table['ordspan_months'] > 12]\n", + "ordspan_result = ['0-3 months', '3-6 months', '6-9 months', '9-12 months', 'Over 12 months']\n", + "dol_table['spanband'] = np.select(ordspan_cond, ordspan_result, 'N/A')\n", + "\n", + "# Final Order Date\n", + "dol_table['finordyear'] = dol_table['ordfindate'].dt.year\n", + "dol_table['finordquarter'] = dol_table['ordfindate'].dt.quarter\n", + " \n", + " \n", + "#Renaming region to avoid clash\n", + "dol_table.rename(columns = {\"region\":\"court_region\"}, inplace = True)\n", + "\n", + "#Filtering errors\n", + "dol_table = dol_table[dol_table['error'] != 'Yes'].drop('error', axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcf56274-2ea1-456e-b34e-4375730b0b73", + "metadata": {}, + "outputs": [], + "source": [ + "#dol_table[dol_table['orderfinished2'] == 'Refused']\n", + "#dol_table[pd.notnull(dol_table['orderfinished'])]\n", + "#dol_table[dol_table['orderstate2'] == 'Order Made']\n", + "#dol_table[(dol_table['orderfinished2'] != 'nan') & (dol_table['orderstate2'] == 'Order Made')]\n", + "#dol_table[dol_table['orderstate'] == 'Final Order Made']\n", + "#dol_table.dtypes\n", + "#dol_table = dol_table[dol_table['error'] != 'Yes'].drop('error', axis = 1)\n", + "#x = dol_table[(dol_table['ageofchild'] >= 13) & (dol_table['ageofchild'] < 16)]\n", + "#x[(x['quarter'] == 4)]\n", + "#pd.pivot_table(dol_table, values = 'casenumber', index = ['quarter', 'ageofchild'], aggfunc = 'count')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ef139a1-88c4-4fd8-8cde-0a34c633b77f", + "metadata": {}, + "outputs": [], + "source": [ + "# Deleting data from temp tables\n", + "#pydb.delete_table_and_data(database=\"__temp__\", table=\"dol_new\")\n", + "#pydb.delete_table_and_data(database=\"__temp__\", table=\"dol_region_lookup\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34a0416b-0756-47e4-bf12-3c2942c51843", + "metadata": {}, + "outputs": [], + "source": [ + "# Creating a temporary table from dataframe\n", + "pydb.dataframe_to_temp_table(dol_table, \"dol_new\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b33fb4db-9fb8-4add-aac8-a46da9f9b511", + "metadata": {}, + "outputs": [], + "source": [ + "# Importing Region Lookup and making it a temporary table\n", + "dol_region_lookup = pd.read_csv(\"s3://alpha-family-data/CSVs/Deprivation_of_Liberty/Council_Lookup.csv\", low_memory = False)\n", + "pydb.dataframe_to_temp_table(dol_region_lookup, \"dol_region_lookup\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ee7619d-e196-4d8a-944e-ef5a462221b3", + "metadata": {}, + "outputs": [], + "source": [ + "# Selecting distinct applications by removing child information and adding a count for rows. \n", + "#This definition may change when multiple children/extended applications start to appear in one case\n", + "pydb.create_temp_table(\n", + "f\"\"\"\n", + "SELECT\n", + "*,\n", + "ROW_NUMBER() OVER(PARTITION BY casenumber\n", + " ORDER BY issuedate) AS app_count,\n", + "\n", + "\n", + "\n", + "CASE WHEN ROW_NUMBER() OVER(PARTITION BY casenumber\n", + " ORDER BY issuedate) = 1\n", + "THEN 'Initial'\n", + "WHEN ROW_NUMBER() OVER(PARTITION BY casenumber\n", + " ORDER BY issuedate) > 1\n", + "THEN 'Extended'\n", + "ELSE 'Unknown'\n", + "END AS App_type\n", + "\n", + "FROM(\n", + "SELECT\n", + "DISTINCT\n", + "t1.Year, \n", + "t1.Quarter, \n", + "t1.casenumber,\n", + "t1.issuedate,\n", + "t1.ordyear,\n", + "t1.ordquarter,\n", + "t1.orderdate,\n", + "t1.orderstate2,\n", + "t1.orderstate,\n", + "t1.ordfindate,\n", + "t1.finordyear,\n", + "t1.finordquarter,\n", + "t1.ordspan_days,\n", + "t1.spanband,\n", + "t2.party,\n", + "t2.region,\n", + "t2.party_type\n", + "FROM \n", + "__temp__.dol_new t1\n", + "LEFT JOIN \n", + "__temp__.dol_region_lookup t2\n", + "ON t1.partyname = t2.Party)\n", + "\"\"\",\n", + "\"dol_apps_all\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c475268-ee81-4ee4-ab83-ce32f5543af6", + "metadata": {}, + "outputs": [], + "source": [ + "# Selecting distinct children in a case - \n", + "#currently this is done for a full year like in Children Act meaning that if a child appears twice in the same year in different quarters, they will only be counted for the first one\n", + "#This definition may change when multiple children/extended applications start to appear in one case\n", + "pydb.create_temp_table(\n", + "f\"\"\"\n", + "SELECT\n", + "*,\n", + "ROW_NUMBER() OVER(PARTITION BY year, casenumber\n", + " ORDER BY issuedate) AS dup_rank\n", + "\n", + "FROM \n", + "__temp__.dol_new t1\n", + "LEFT JOIN \n", + "__temp__.dol_region_lookup t2\n", + "ON t1.partyname = t2.Party\n", + "\"\"\",\n", + "\"dol_child_dup\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2098a8a-44f4-4b2b-b238-cc2bc8e79470", + "metadata": {}, + "outputs": [], + "source": [ + "# Application Count\n", + "pydb.create_temp_table(\n", + "f\"\"\"\n", + "SELECT \n", + "t1.Year, \n", + "t1.Quarter, \n", + "'N/A' as ageband,\n", + "'Applications' as Count_type,\n", + "t1.App_type,\n", + "t1.Region,\n", + "t1.Party_type,\n", + "'N/A' as gender,\n", + "'N/A' as spanband,\n", + "Count(*) as Count\n", + "FROM \n", + "__temp__.dol_apps_all t1\n", + "GROUP BY\n", + "t1.Year, \n", + "t1.Quarter, \n", + "t1.App_type,\n", + "t1.Region,\n", + "t1.Party_type\n", + "\"\"\",\n", + "\"dol_apps_agg\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f3bd5c9-fc81-4bfa-b2e4-35a880169b30", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Case Count.\n", + "pydb.create_temp_table(\n", + "f\"\"\"\n", + "SELECT \n", + "t1.Year, \n", + "t1.Quarter, \n", + "'N/A' as ageband,\n", + "'Cases Started' as Count_type,\n", + "'N/A' as App_type,\n", + "t1.Region,\n", + "t1.Party_type,\n", + "'N/A' as gender,\n", + "'N/A' as spanband,\n", + "Count(*) as Count\n", + "FROM \n", + "__temp__.dol_apps_all t1\n", + "WHERE app_count = 1\n", + "GROUP BY\n", + "t1.Year, \n", + "t1.Quarter, \n", + "t1.Region,\n", + "t1.Party_type\n", + "\n", + "\n", + "\"\"\",\n", + "\"dol_cases\")\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b45c4fd-9284-41c3-93ab-e9481dcc56f9", + "metadata": {}, + "outputs": [], + "source": [ + "# Child Age Count of DOL\n", + "pydb.create_temp_table(\n", + "f\"\"\"\n", + "SELECT \n", + "t1.Year, \n", + "t1.Quarter, \n", + "t1.ageband,\n", + "'Child_Age' as Count_type,\n", + "'All' as App_type,\n", + "'N/A' as Region,\n", + "'N/A' as Party_type,\n", + "'N/A' as gender,\n", + "'N/A' as spanband,\n", + "Count(*) as Count\n", + "\n", + "FROM \n", + "__temp__.dol_child_dup t1\n", + "WHERE dup_rank = 1\n", + "\n", + "GROUP BY\n", + "t1.Year, \n", + "t1.Quarter, \n", + "t1.ageband\n", + "\"\"\",\n", + "\"dol_child_age\")\n", + "\n", + "# Child Region of DOL - Likely redundant for now\n", + "pydb.create_temp_table(\n", + "f\"\"\"\n", + "SELECT \n", + "t1.Year, \n", + "t1.Quarter, \n", + "'N/A' as ageband,\n", + "'Child_Region' as Count_type,\n", + "'All' as App_type,\n", + "t1.Region,\n", + "t1.Party_type,\n", + "'N/A' as gender,\n", + "'N/A' as spanband,\n", + "Count(*) as Count\n", + "FROM \n", + "__temp__.dol_child_dup t1\n", + "WHERE dup_rank = 1\n", + "GROUP BY\n", + "t1.Year, \n", + "t1.Quarter, \n", + "t1.Region,\n", + "t1.Party_type\n", + "\"\"\",\n", + "\"dol_child_region\")\n", + "\n", + "\n", + "# Child Gender of DOL\n", + "pydb.create_temp_table(\n", + "f\"\"\"\n", + "SELECT \n", + "t1.Year, \n", + "t1.Quarter, \n", + "'N/A' as ageband,\n", + "'Child_Sex' as Count_type,\n", + "'All' as App_type,\n", + "'N/A' as Region,\n", + "'N/A' as Party_type,\n", + "t1.gender,\n", + "'N/A' as spanband,\n", + "Count(*) as Count\n", + "FROM \n", + "__temp__.dol_child_dup t1\n", + "WHERE dup_rank = 1\n", + "GROUP BY\n", + "t1.Year, \n", + "t1.Quarter, \n", + "t1.gender\n", + "\"\"\",\n", + "\"dol_child_sex\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51edcd37-8287-4240-9d2a-1ccb72cc8ac5", + "metadata": {}, + "outputs": [], + "source": [ + "# Child Count of DOL\n", + "#pydb.create_temp_table(\n", + "#f\"\"\"\n", + "#SELECT \n", + "#t1.Year, \n", + "#t1.Quarter, \n", + "#t1.ageband,\n", + "#'Child' as Count_type,\n", + "#'All' as App_type,\n", + "#t2.Region,\n", + "#t2.Party_type,\n", + "#t1.gender,\n", + "#Count(*) as Count\n", + "#FROM \n", + "#__temp__.dol_new t1\n", + "#LEFT JOIN \n", + "#__temp__.dol_region_lookup t2\n", + "#ON t1.partyname = t2.Party\n", + "#GROUP BY\n", + "#t1.Year, \n", + "#t1.Quarter, \n", + "#t1.ageband,\n", + "#t2.Region,\n", + "#t2.Party_type,\n", + "#t1.gender\n", + "#\"\"\",\n", + "#\"dol_child\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d72ed02-6238-4530-b5c0-adf022ce12bb", + "metadata": {}, + "outputs": [], + "source": [ + "# Aggregating first DOL Orders made\n", + "\n", + "pydb.create_temp_table(\n", + "f\"\"\"\n", + "SELECT \n", + "t1.OrdYear as Year, \n", + "t1.OrdQuarter as Quarter, \n", + "'N/A' as ageband,\n", + "'Orders' as Count_type,\n", + "'N/A' as App_type,\n", + "'N/A' as Region,\n", + "'N/A' as Party_type,\n", + "'N/A' as gender,\n", + "'N/A' as spanband,\n", + "Count(*) as Count\n", + "FROM \n", + "__temp__.dol_apps_all t1\n", + "WHERE t1.orderstate2 = 'Order Made'\n", + "GROUP BY\n", + "t1.OrdYear, \n", + "t1.OrdQuarter \n", + "\"\"\",\n", + "\"dol_ords_agg\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "978ea9ae-1acb-41c4-8d81-d11382630583", + "metadata": {}, + "outputs": [], + "source": [ + "# Selecting final orders based on date of original order\n", + "pydb.create_temp_table(\n", + "f\"\"\"\n", + "SELECT \n", + "t1.OrdYear as Year, \n", + "t1.OrdQuarter as Quarter, \n", + "'N/A' as ageband,\n", + "'Final Order Made' as Count_type,\n", + "'N/A' as App_type,\n", + "'N/A' as Region,\n", + "'N/A' as Party_type,\n", + "'N/A' as gender,\n", + "t1.spanband,\n", + "Count(*) as Count\n", + "FROM \n", + "__temp__.dol_apps_all t1\n", + "WHERE t1.orderstate = 'Final Order Made'\n", + "GROUP BY\n", + "t1.OrdYear, \n", + "t1.OrdQuarter,\n", + "t1.spanband\n", + "\"\"\",\n", + "\"dol_final_agg\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee2cd18e-9977-4b34-b51c-07fef60eb064", + "metadata": {}, + "outputs": [], + "source": [ + "# Selecting final orders based on date of final order\n", + "pydb.create_temp_table(\n", + "f\"\"\"\n", + "SELECT \n", + "t1.FinOrdYear as Year, \n", + "t1.FinOrdQuarter as Quarter, \n", + "'N/A' as ageband,\n", + "'Cases Closed' as Count_type,\n", + "'N/A' as App_type,\n", + "'N/A' as Region,\n", + "'N/A' as Party_type,\n", + "'N/A' as gender,\n", + "'N/A' as spanband,\n", + "Count(*) as Count\n", + "FROM \n", + "__temp__.dol_apps_all t1\n", + "WHERE t1.orderstate = 'Final Order Made'\n", + "GROUP BY\n", + "t1.FinOrdYear, \n", + "t1.FinOrdQuarter \n", + "\"\"\",\n", + "\"dol_close_agg\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e87f5b25-aa0d-4e64-9700-6df7acf48dd2", + "metadata": {}, + "outputs": [], + "source": [ + "# Joining different parts together into one csv\n", + "pydb.create_temp_table(\n", + "f\"\"\"\n", + "SELECT\n", + "*\n", + "FROM __temp__.dol_cases\n", + "\n", + "UNION ALL\n", + "\n", + "SELECT\n", + "*\n", + "FROM __temp__.dol_apps_agg\n", + "\n", + "UNION ALL\n", + "\n", + "SELECT\n", + "*\n", + "FROM __temp__.dol_child_age\n", + "\n", + "\n", + "UNION ALL\n", + "\n", + "SELECT\n", + "*\n", + "FROM __temp__.dol_child_sex\n", + "\n", + "UNION ALL\n", + "\n", + "SELECT\n", + "*\n", + "FROM __temp__.dol_close_agg\n", + "\n", + "UNION ALL\n", + "\n", + "SELECT\n", + "*\n", + "FROM __temp__.dol_ords_agg\n", + "\n", + "UNION ALL\n", + "\n", + "SELECT\n", + "*\n", + "FROM __temp__.dol_final_agg\n", + "\n", + "\"\"\",\n", + "\"dol_csv_gender\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af761846-337f-4686-b41b-7347fb1c17df", + "metadata": {}, + "outputs": [], + "source": [ + "#pydb.read_sql_query('SELECT * FROM __temp__.dol_close_agg')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f0874f9-6e4f-489d-ad0a-6e5cbb86892e", + "metadata": {}, + "outputs": [], + "source": [ + "#csv_frame2 = pydb.read_sql_query(f\"SELECT * FROM __temp__.dol_csv WHERE year != {latest_year} OR quarter != {latest_quarter} ORDER BY year, quarter, count_type, region, party_type, ageband, gender, spanband\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f508e8c-90de-4f77-add0-8c462f0c5f60", + "metadata": {}, + "outputs": [], + "source": [ + "#Apply trim to sex column\n", + "pydb.create_temp_table(\n", + "f\"\"\"\n", + "select\n", + " year, \n", + " quarter, \n", + " ageband, \n", + " count_type,\n", + " app_type,\n", + " region,\n", + " party_type,\n", + " trim(gender) as sex, \n", + " spanband,\n", + " count\n", + "from __temp__.dol_csv_gender\n", + "\"\"\",\n", + "\"dol_csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f8e3a07-46d6-43bd-b66d-f4655ba3a803", + "metadata": {}, + "outputs": [], + "source": [ + "test = pydb.read_sql_query(\"SELECT * from __temp__.dol_csv LIMIT 10\")\n", + "test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e64c6b89-acc4-4575-983f-c714db14dc6e", + "metadata": {}, + "outputs": [], + "source": [ + "# Ordering csv and filtering out next quarter data. Done in Pandas rather than SQL due to issues with ordering\n", + "csv_frame = pydb.read_sql_query(\"SELECT * FROM __temp__.dol_csv\").sort_values(by = [\"year\", \"quarter\", \"count_type\", \"region\", \"party_type\", \"ageband\", \"sex\", \"spanband\"])\n", + "csv_frame2 = csv_frame[(csv_frame.year != latest_year) | (csv_frame.quarter != latest_quarter)]\n", + "csv_frame2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41607a03-73e4-4249-9f8a-82be425ec570", + "metadata": {}, + "outputs": [], + "source": [ + "#Exporting csv to S3 bucket\n", + "csv_frame2.to_csv(path_or_buf = 's3://alpha-family-data/CSVs/Deprivation_of_Liberty/dol_csv.csv', index = False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c24b161-870f-40ae-988f-9334875e4e83", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0a87e8b-828d-49bf-82d4-1ca5fae0eab1", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5b67177-ab23-4f43-9e15-72b295931a51", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b30fc4a-d2a8-4ef8-b9ad-a7f5adfba40b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "FCSQ_data", + "language": "python", + "name": "venv_fcsq_data" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data/deprivation_of_liberty/deprivation_csv.ipynb b/data/deprivation_of_liberty/deprivation_csv.ipynb index d218ca2..d8a90a5 100644 --- a/data/deprivation_of_liberty/deprivation_csv.ipynb +++ b/data/deprivation_of_liberty/deprivation_csv.ipynb @@ -37,12 +37,12 @@ "outputs": [], "source": [ "#change these to the current quarter not the quarter being published\n", - "latest_quarter = 3\n", - "latest_year = 2024\n", + "latest_quarter = 4\n", + "latest_year = 2025\n", "\n", "#change these to the current quarter being published\n", - "pub_quarter = 2\n", - "pub_year = 2024" + "pub_quarter = 3\n", + "pub_year = 2025" ] }, { @@ -85,7 +85,7 @@ "dol_table['ordfindate'] = pd.to_datetime(np.where(dol_table['orderstate'] == 'Final Order Made', dol_table['orderfinished'], pd.NaT), format = '%d/%m/%Y', errors = 'coerce')\n", "\n", "dol_table['ordspan_days'] = (dol_table['ordfindate'] - dol_table['orderdate']).astype(int)\n", - "dol_table['ordspan_months'] = (dol_table['ordfindate'] - dol_table['orderdate']) /np.timedelta64(1, 'M')\n", + "dol_table['ordspan_months'] = (dol_table['ordfindate'] - dol_table['orderdate'])/pd.Timedelta(days=30.417)\n", "\n", "ordspan_cond = [dol_table['ordspan_months'] < 3, dol_table['ordspan_months'] < 6, dol_table['ordspan_months'] < 9, dol_table['ordspan_months'] < 12, dol_table['ordspan_months'] > 12]\n", "ordspan_result = ['0-3 months', '3-6 months', '6-9 months', '9-12 months', 'Over 12 months']\n", @@ -145,18 +145,6 @@ "pydb.dataframe_to_temp_table(dol_table, \"dol_new\")" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "b33fb4db-9fb8-4add-aac8-a46da9f9b511", - "metadata": {}, - "outputs": [], - "source": [ - "# Importing Region Lookup and making it a temporary table\n", - "dol_region_lookup = pd.read_csv(\"s3://alpha-family-data/CSVs/Deprivation_of_Liberty/Council_Lookup.csv\", low_memory = False)\n", - "pydb.dataframe_to_temp_table(dol_region_lookup, \"dol_region_lookup\")" - ] - }, { "cell_type": "code", "execution_count": null, @@ -200,15 +188,10 @@ "t1.finordyear,\n", "t1.finordquarter,\n", "t1.ordspan_days,\n", - "t1.spanband,\n", - "t2.party,\n", - "t2.region,\n", - "t2.party_type\n", + "t1.spanband\n", "FROM \n", - "__temp__.dol_new t1\n", - "LEFT JOIN \n", - "__temp__.dol_region_lookup t2\n", - "ON t1.partyname = t2.Party)\n", + "__temp__.dol_new t1)\n", + "\n", "\"\"\",\n", "\"dol_apps_all\")" ] @@ -232,9 +215,6 @@ "\n", "FROM \n", "__temp__.dol_new t1\n", - "LEFT JOIN \n", - "__temp__.dol_region_lookup t2\n", - "ON t1.partyname = t2.Party\n", "\"\"\",\n", "\"dol_child_dup\")" ] @@ -255,9 +235,7 @@ "'N/A' as ageband,\n", "'Applications' as Count_type,\n", "t1.App_type,\n", - "t1.Region,\n", - "t1.Party_type,\n", - "'N/A' as gender,\n", + "'N/A' as sex,\n", "'N/A' as spanband,\n", "Count(*) as Count\n", "FROM \n", @@ -265,9 +243,7 @@ "GROUP BY\n", "t1.Year, \n", "t1.Quarter, \n", - "t1.App_type,\n", - "t1.Region,\n", - "t1.Party_type\n", + "t1.App_type\n", "\"\"\",\n", "\"dol_apps_agg\")" ] @@ -289,9 +265,7 @@ "'N/A' as ageband,\n", "'Cases Started' as Count_type,\n", "'N/A' as App_type,\n", - "t1.Region,\n", - "t1.Party_type,\n", - "'N/A' as gender,\n", + "'N/A' as sex,\n", "'N/A' as spanband,\n", "Count(*) as Count\n", "FROM \n", @@ -299,9 +273,7 @@ "WHERE app_count = 1\n", "GROUP BY\n", "t1.Year, \n", - "t1.Quarter, \n", - "t1.Region,\n", - "t1.Party_type\n", + "t1.Quarter\n", "\n", "\n", "\"\"\",\n", @@ -326,9 +298,7 @@ "t1.ageband,\n", "'Child_Age' as Count_type,\n", "'All' as App_type,\n", - "'N/A' as Region,\n", - "'N/A' as Party_type,\n", - "'N/A' as gender,\n", + "'N/A' as sex,\n", "'N/A' as spanband,\n", "Count(*) as Count\n", "\n", @@ -343,44 +313,16 @@ "\"\"\",\n", "\"dol_child_age\")\n", "\n", - "# Child Region of DOL - Likely redundant for now\n", + "# Child Sex of DOL\n", "pydb.create_temp_table(\n", "f\"\"\"\n", "SELECT \n", "t1.Year, \n", "t1.Quarter, \n", "'N/A' as ageband,\n", - "'Child_Region' as Count_type,\n", + "'Child_Sex' as Count_type,\n", "'All' as App_type,\n", - "t1.Region,\n", - "t1.Party_type,\n", - "'N/A' as gender,\n", - "'N/A' as spanband,\n", - "Count(*) as Count\n", - "FROM \n", - "__temp__.dol_child_dup t1\n", - "WHERE dup_rank = 1\n", - "GROUP BY\n", - "t1.Year, \n", - "t1.Quarter, \n", - "t1.Region,\n", - "t1.Party_type\n", - "\"\"\",\n", - "\"dol_child_region\")\n", - "\n", - "\n", - "# Child Gender of DOL\n", - "pydb.create_temp_table(\n", - "f\"\"\"\n", - "SELECT \n", - "t1.Year, \n", - "t1.Quarter, \n", - "'N/A' as ageband,\n", - "'Child_Gender' as Count_type,\n", - "'All' as App_type,\n", - "'N/A' as Region,\n", - "'N/A' as Party_type,\n", - "t1.gender,\n", + "t1.gender as sex,\n", "'N/A' as spanband,\n", "Count(*) as Count\n", "FROM \n", @@ -391,7 +333,7 @@ "t1.Quarter, \n", "t1.gender\n", "\"\"\",\n", - "\"dol_child_gender\")" + "\"dol_child_sex\")" ] }, { @@ -447,9 +389,7 @@ "'N/A' as ageband,\n", "'Orders' as Count_type,\n", "'N/A' as App_type,\n", - "'N/A' as Region,\n", - "'N/A' as Party_type,\n", - "'N/A' as gender,\n", + "'N/A' as sex,\n", "'N/A' as spanband,\n", "Count(*) as Count\n", "FROM \n", @@ -478,9 +418,7 @@ "'N/A' as ageband,\n", "'Final Order Made' as Count_type,\n", "'N/A' as App_type,\n", - "'N/A' as Region,\n", - "'N/A' as Party_type,\n", - "'N/A' as gender,\n", + "'N/A' as sex,\n", "t1.spanband,\n", "Count(*) as Count\n", "FROM \n", @@ -510,9 +448,7 @@ "'N/A' as ageband,\n", "'Cases Closed' as Count_type,\n", "'N/A' as App_type,\n", - "'N/A' as Region,\n", - "'N/A' as Party_type,\n", - "'N/A' as gender,\n", + "'N/A' as sex,\n", "'N/A' as spanband,\n", "Count(*) as Count\n", "FROM \n", @@ -556,7 +492,7 @@ "\n", "SELECT\n", "*\n", - "FROM __temp__.dol_child_gender\n", + "FROM __temp__.dol_child_sex\n", "\n", "UNION ALL\n", "\n", @@ -577,7 +513,7 @@ "FROM __temp__.dol_final_agg\n", "\n", "\"\"\",\n", - "\"dol_csv\")\n" + "\"dol_csv_sex\")\n" ] }, { @@ -600,6 +536,81 @@ "#csv_frame2 = pydb.read_sql_query(f\"SELECT * FROM __temp__.dol_csv WHERE year != {latest_year} OR quarter != {latest_quarter} ORDER BY year, quarter, count_type, region, party_type, ageband, gender, spanband\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f508e8c-90de-4f77-add0-8c462f0c5f60", + "metadata": {}, + "outputs": [], + "source": [ + "#Apply trim to sex column\n", + "pydb.create_temp_table(\n", + "f\"\"\"\n", + "select\n", + " year, \n", + " quarter, \n", + " ageband, \n", + " count_type,\n", + " app_type,\n", + " trim(sex) as sex, \n", + " spanband,\n", + " count\n", + "from __temp__.dol_csv_sex\n", + "\"\"\",\n", + "\"dol_csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f8e3a07-46d6-43bd-b66d-f4655ba3a803", + "metadata": {}, + "outputs": [], + "source": [ + "test = pydb.read_sql_query(\"SELECT * from __temp__.dol_csv LIMIT 10\")\n", + "test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "170d0abc-bac8-44b6-ad92-6403672652f7", + "metadata": {}, + "outputs": [], + "source": [ + "#Convert everything apart from male/female to other\n", + "pydb.create_temp_table(\n", + "f\"\"\"\n", + "select\n", + " year, \n", + " quarter, \n", + " ageband, \n", + " count_type,\n", + " app_type,\n", + " case\n", + " when sex in('Male') then 'Male'\n", + " when sex in('Female') then 'Female'\n", + " when sex in('N/A') then 'N/A'\n", + " else 'Other'\n", + " end as sex, \n", + " spanband,\n", + " count\n", + "from __temp__.dol_csv\n", + "\"\"\",\n", + "\"dol_csv_other\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7645b874-1d33-41d9-86cc-608ab313e18c", + "metadata": {}, + "outputs": [], + "source": [ + "test = pydb.read_sql_query(\"SELECT * from __temp__.dol_csv_other\")\n", + "test" + ] + }, { "cell_type": "code", "execution_count": null, @@ -608,7 +619,7 @@ "outputs": [], "source": [ "# Ordering csv and filtering out next quarter data. Done in Pandas rather than SQL due to issues with ordering\n", - "csv_frame = pydb.read_sql_query(\"SELECT * FROM __temp__.dol_csv\").sort_values(by = [\"year\", \"quarter\", \"count_type\", \"region\", \"party_type\", \"ageband\", \"gender\", \"spanband\"])\n", + "csv_frame = pydb.read_sql_query(\"SELECT * FROM __temp__.dol_csv_other\").sort_values(by = [\"year\", \"quarter\", \"count_type\", \"ageband\", \"sex\", \"spanband\"])\n", "csv_frame2 = csv_frame[(csv_frame.year != latest_year) | (csv_frame.quarter != latest_quarter)]\n", "csv_frame2" ] @@ -659,9 +670,9 @@ ], "metadata": { "kernelspec": { - "display_name": "My project (Python3)", + "display_name": "FCSQ_data", "language": "python", - "name": "venv_fcsqproject" + "name": "venv_fcsq_data" }, "language_info": { "codemirror_mode": {