diff --git a/data/data-pipeline/data_pipeline/ipython/compare_take_2_2010_and_2020_zip_codes.ipynb b/data/data-pipeline/data_pipeline/ipython/compare_take_2_2010_and_2020_zip_codes.ipynb new file mode 100644 index 000000000..1cb5af0c7 --- /dev/null +++ b/data/data-pipeline/data_pipeline/ipython/compare_take_2_2010_and_2020_zip_codes.ipynb @@ -0,0 +1,506 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "id": "bb24db55", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import geopandas as gpd\n", + "import pyogrio\n", + "from data_pipeline.etl.sources.census.etl import CensusETL\n", + "from data_pipeline.etl.sources.geocorr_alternatives.etl import GeoCorrAlternativesETL\n", + "\n", + "import geopandas as gpd\n", + "import pandas as pd\n", + "from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel\n", + "from data_pipeline.etl.sources.geo_utils import (\n", + " add_tracts_for_geometries,\n", + " get_tract_geojson,\n", + ")\n", + "from data_pipeline.score import field_names\n", + "from data_pipeline.utils import get_module_logger, unzip_file_from_url\n", + "\n", + "logger = get_module_logger(__name__)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "41bd360f", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# ZCTA_2020_SHAPEFILE_PATH = (\n", + "# \"https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_us_zcta520_500k.zip\"\n", + "# )\n", + "\n", + "# ZCTA_2010_SHAPEFILE_PATH = (\n", + "# \"https://www2.census.gov/geo/tiger/GENZ2019/shp/cb_2019_us_zcta510_500k.zip\"\n", + "# )\n", + "\n", + "ZCTA_2020_SHAPEFILE_PATH = (\n", + " \"~/Downloads/cb_2020_us_zcta520_500k\"\n", + ")\n", + "\n", + "ZCTA_2010_SHAPEFILE_PATH = (\n", + " \"~/Downloads/cb_2019_us_zcta510_500k\"\n", + ")\n", + "\n", + "ZCTA_2010_FIELD = \"ZCTA5CE10\"\n", + "\n", + "PERCENT_OF_2020_in_2010_FIELD = \"percent of 2020 in 2010\"" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": 
"37ca370a", + "metadata": {}, + "outputs": [], + "source": [ + "# Read in the 2020 ZCTA data.\n", + "zcta_2020_gdf = gpd.read_file(\n", + " filename=ZCTA_2020_SHAPEFILE_PATH\n", + ")\n", + "zcta_2020_gdf = zcta_2020_gdf.rename(\n", + " columns={GeoCorrAlternativesETL.ZIP_CODE_INPUT_FIELD: field_names.ZIP_CODE},\n", + " errors=\"raise\",\n", + ")\n", + "\n", + "\n", + "# Read in the 2010 ZCTA data.\n", + "zcta_2010_gdf = gpd.read_file(\n", + " filename=ZCTA_2010_SHAPEFILE_PATH\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "6178cd2f", + "metadata": {}, + "outputs": [], + "source": [ + "# Switch both datasets to the projected CRS before computing areas.\n", + "zcta_2020_gdf=zcta_2020_gdf.to_crs(crs=GeoCorrAlternativesETL.CRS_INTEGER)\n", + "zcta_2010_gdf=zcta_2010_gdf.to_crs(crs=GeoCorrAlternativesETL.CRS_INTEGER)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "ff3532eb", + "metadata": {}, + "outputs": [], + "source": [ + "zcta_2020_gdf[\"zcta_2020_area\"] = zcta_2020_gdf.area" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "7d95b1ef", + "metadata": {}, + "outputs": [], + "source": [ + "joined_gdf = gpd.overlay(\n", + " df1=zcta_2020_gdf,\n", + " df2=zcta_2010_gdf,\n", + " how=\"intersection\",\n", + " keep_geom_type=False,\n", + " )\n", + "\n", + "# Calculating the areas of the newly-created overlapping geometries\n", + "joined_gdf[GeoCorrAlternativesETL.AREA_JOINED_FIELD] = joined_gdf.area\n", + "\n", + "# Calculating the area of each newly-created overlap geometry as a\n", + "# fraction of the area of the original 2020 ZCTA geometry\n", + "joined_gdf[PERCENT_OF_2020_in_2010_FIELD] = (\n", + " joined_gdf[GeoCorrAlternativesETL.AREA_JOINED_FIELD] / joined_gdf[\"zcta_2020_area\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "b206ad11", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0 164183\n", + "1.0 17832\n", + "0.9 9812\n", + "0.1 8771\n", + "0.8 2787\n", + "0.2 2029\n", + "0.7 1454\n", + "0.3 1146\n", + "0.6 
988\n", + "0.4 894\n", + "0.5 800\n", + "Name: percent of 2020 in 2010, dtype: int64" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "joined_gdf[PERCENT_OF_2020_in_2010_FIELD].round(decimals=1).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "4df18de2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Zip codeAFFGEOID20GEOID20NAME20LSAD20ALAND20AWATER20zcta_2020_areaZCTA5CE10AFFGEOID10GEOID10ALAND10AWATER10geometryarea_joinedpercent of 2020 in 2010
035768860Z200US357683576835768Z544623199037360146.688056e+08357768600000US35776357762340724611041223GEOMETRYCOLLECTION (POLYGON ((-9597648.456 411...2.193417e+060.003280
135769860Z200US357693576935769Z5163279214578357093.270629e+08357768600000US35776357762340724611041223GEOMETRYCOLLECTION (POLYGON ((-9596251.174 410...4.423302e+030.000014
235776860Z200US357763577635776Z526837668912770833.994217e+08357768600000US35776357762340724611041223GEOMETRYCOLLECTION (POLYGON ((-9609657.158 411...3.377086e+080.845494
335774860Z200US357743577435774Z5361393373629695.424829e+07357768600000US35776357762340724611041223GEOMETRYCOLLECTION (POLYGON ((-9592577.408 413...3.181710e+060.058651
435747860Z200US357473574735747Z519511209493008853.016341e+08357768600000US35776357762340724611041223GEOMETRYCOLLECTION (POLYGON ((-9608245.738 410...5.117346e+060.016965
...................................................
21069172046860Z200US720467204672046Z5603015221203835139.214782e+08720378600000US72037720373887140MULTIPOLYGON (((-10226266.281 4102252.880, -10...5.753217e+050.000624
21069213156860Z200US131561315613156Z57621482360282691.521320e+08130648600000US13064130644618302948339MULTIPOLYGON (((-8538756.977 5360283.214, -853...6.455585e+060.042434
21069306850860Z200US068500685006850Z5175638362066643.140506e+07068568600000US068560685695680POLYGON ((-8173142.581 5028860.482, -8173156.2...1.245927e+040.000397
21069499632860Z200US996329963299632Z56515394702.965687e+08996328600000US9963299632651539470POLYGON ((-18231773.509 8888791.228, -18230845...2.959838e+080.998028
21069599658860Z200US996589965899658Z51110576513181455.059522e+08996588600000US9965899658110490326913652POLYGON ((-18196204.928 8884024.527, -18158121...5.010581e+080.990327
\n", + "

210696 rows × 16 columns

\n", + "
" + ], + "text/plain": [ + " Zip code AFFGEOID20 GEOID20 NAME20 LSAD20 ALAND20 AWATER20 \\\n", + "0 35768 860Z200US35768 35768 35768 Z5 446231990 3736014 \n", + "1 35769 860Z200US35769 35769 35769 Z5 163279214 57835709 \n", + "2 35776 860Z200US35776 35776 35776 Z5 268376689 1277083 \n", + "3 35774 860Z200US35774 35774 35774 Z5 36139337 362969 \n", + "4 35747 860Z200US35747 35747 35747 Z5 195112094 9300885 \n", + "... ... ... ... ... ... ... ... \n", + "210691 72046 860Z200US72046 72046 72046 Z5 603015221 20383513 \n", + "210692 13156 860Z200US13156 13156 13156 Z5 76214823 6028269 \n", + "210693 06850 860Z200US06850 06850 06850 Z5 17563836 206664 \n", + "210694 99632 860Z200US99632 99632 99632 Z5 65153947 0 \n", + "210695 99658 860Z200US99658 99658 99658 Z5 111057651 318145 \n", + "\n", + " zcta_2020_area ZCTA5CE10 AFFGEOID10 GEOID10 ALAND10 AWATER10 \\\n", + "0 6.688056e+08 35776 8600000US35776 35776 234072461 1041223 \n", + "1 3.270629e+08 35776 8600000US35776 35776 234072461 1041223 \n", + "2 3.994217e+08 35776 8600000US35776 35776 234072461 1041223 \n", + "3 5.424829e+07 35776 8600000US35776 35776 234072461 1041223 \n", + "4 3.016341e+08 35776 8600000US35776 35776 234072461 1041223 \n", + "... ... ... ... ... ... ... \n", + "210691 9.214782e+08 72037 8600000US72037 72037 388714 0 \n", + "210692 1.521320e+08 13064 8600000US13064 13064 461830 2948339 \n", + "210693 3.140506e+07 06856 8600000US06856 06856 9568 0 \n", + "210694 2.965687e+08 99632 8600000US99632 99632 65153947 0 \n", + "210695 5.059522e+08 99658 8600000US99658 99658 110490326 913652 \n", + "\n", + " geometry area_joined \\\n", + "0 GEOMETRYCOLLECTION (POLYGON ((-9597648.456 411... 2.193417e+06 \n", + "1 GEOMETRYCOLLECTION (POLYGON ((-9596251.174 410... 4.423302e+03 \n", + "2 GEOMETRYCOLLECTION (POLYGON ((-9609657.158 411... 3.377086e+08 \n", + "3 GEOMETRYCOLLECTION (POLYGON ((-9592577.408 413... 3.181710e+06 \n", + "4 GEOMETRYCOLLECTION (POLYGON ((-9608245.738 410... 5.117346e+06 \n", + "... ... ... 
\n", + "210691 MULTIPOLYGON (((-10226266.281 4102252.880, -10... 5.753217e+05 \n", + "210692 MULTIPOLYGON (((-8538756.977 5360283.214, -853... 6.455585e+06 \n", + "210693 POLYGON ((-8173142.581 5028860.482, -8173156.2... 1.245927e+04 \n", + "210694 POLYGON ((-18231773.509 8888791.228, -18230845... 2.959838e+08 \n", + "210695 POLYGON ((-18196204.928 8884024.527, -18158121... 5.010581e+08 \n", + "\n", + " percent of 2020 in 2010 \n", + "0 0.003280 \n", + "1 0.000014 \n", + "2 0.845494 \n", + "3 0.058651 \n", + "4 0.016965 \n", + "... ... \n", + "210691 0.000624 \n", + "210692 0.042434 \n", + "210693 0.000397 \n", + "210694 0.998028 \n", + "210695 0.990327 \n", + "\n", + "[210696 rows x 16 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "joined_gdf" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}