diff --git a/data/data-pipeline/data_pipeline/ipython/compare_take_2_2010_and_2020_zip_codes.ipynb b/data/data-pipeline/data_pipeline/ipython/compare_take_2_2010_and_2020_zip_codes.ipynb new file mode 100644 index 000000000..1cb5af0c7 --- /dev/null +++ b/data/data-pipeline/data_pipeline/ipython/compare_take_2_2010_and_2020_zip_codes.ipynb @@ -0,0 +1,506 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "id": "bb24db55", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import geopandas as gpd\n", + "import pyogrio\n", + "from data_pipeline.etl.sources.census.etl import CensusETL\n", + "from data_pipeline.etl.sources.geocorr_alternatives.etl import GeoCorrAlternativesETL\n", + "\n", + "import geopandas as gpd\n", + "import pandas as pd\n", + "from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel\n", + "from data_pipeline.etl.sources.geo_utils import (\n", + " add_tracts_for_geometries,\n", + " get_tract_geojson,\n", + ")\n", + "from data_pipeline.score import field_names\n", + "from data_pipeline.utils import get_module_logger, unzip_file_from_url\n", + "\n", + "logger = get_module_logger(__name__)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "41bd360f", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# ZCTA_2020_SHAPEFILE_PATH = (\n", + "# \"https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_us_zcta520_500k.zip\"\n", + "# )\n", + "\n", + "# ZCTA_2010_SHAPEFILE_PATH = (\n", + "# \"https://www2.census.gov/geo/tiger/GENZ2019/shp/cb_2019_us_zcta510_500k.zip\"\n", + "# )\n", + "\n", + "ZCTA_2020_SHAPEFILE_PATH = (\n", + " \"~/Downloads/cb_2020_us_zcta520_500k\"\n", + ")\n", + "\n", + "ZCTA_2010_SHAPEFILE_PATH = (\n", + " \"~/Downloads/cb_2019_us_zcta510_500k\"\n", + ")\n", + "\n", + "ZCTA_2010_FIELD = \"ZCTA5CE10\"\n", + "\n", + "PERCENT_OF_2020_in_2010_FIELD = \"percent of 2020 in 2010\"" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": 
"37ca370a", + "metadata": {}, + "outputs": [], + "source": [ + "# Read in 2020 ZCTA data.\n", + "zcta_2020_gdf = gpd.read_file(\n", + " filename=ZCTA_2020_SHAPEFILE_PATH\n", + ")\n", + "zcta_2020_gdf = zcta_2020_gdf.rename(\n", + " columns={GeoCorrAlternativesETL.ZIP_CODE_INPUT_FIELD: field_names.ZIP_CODE},\n", + " errors=\"raise\",\n", + ")\n", + "\n", + "\n", + "# Read in 2010 ZCTA data.\n", + "zcta_2010_gdf = gpd.read_file(\n", + " filename=ZCTA_2010_SHAPEFILE_PATH\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "6178cd2f", + "metadata": {}, + "outputs": [], + "source": [ + "# Switch to a projected CRS before computing areas.\n", + "zcta_2020_gdf=zcta_2020_gdf.to_crs(crs=GeoCorrAlternativesETL.CRS_INTEGER)\n", + "zcta_2010_gdf=zcta_2010_gdf.to_crs(crs=GeoCorrAlternativesETL.CRS_INTEGER)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "ff3532eb", + "metadata": {}, + "outputs": [], + "source": [ + "zcta_2020_gdf[\"zcta_2020_area\"] = zcta_2020_gdf.area" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "7d95b1ef", + "metadata": {}, + "outputs": [], + "source": [ + "joined_gdf = gpd.overlay(\n", + " df1=zcta_2020_gdf,\n", + " df2=zcta_2010_gdf,\n", + " how=\"intersection\",\n", + " keep_geom_type=False,\n", + " )\n", + "\n", + "# Calculating the areas of the newly-created overlapping geometries\n", + "joined_gdf[GeoCorrAlternativesETL.AREA_JOINED_FIELD] = joined_gdf.area\n", + "\n", + "# Calculating the areas of the newly-created geometries in relation\n", + "# to the original 2020 ZCTA geometries\n", + "joined_gdf[PERCENT_OF_2020_in_2010_FIELD] = (\n", + " joined_gdf[GeoCorrAlternativesETL.AREA_JOINED_FIELD] / joined_gdf[\"zcta_2020_area\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "b206ad11", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0 164183\n", + "1.0 17832\n", + "0.9 9812\n", + "0.1 8771\n", + "0.8 2787\n", + "0.2 2029\n", + "0.7 1454\n", + "0.3 1146\n", + "0.6 
988\n", + "0.4 894\n", + "0.5 800\n", + "Name: percent of 2020 in 2010, dtype: int64" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "joined_gdf[PERCENT_OF_2020_in_2010_FIELD].round(decimals=1).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "4df18de2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | Zip code | \n", + "AFFGEOID20 | \n", + "GEOID20 | \n", + "NAME20 | \n", + "LSAD20 | \n", + "ALAND20 | \n", + "AWATER20 | \n", + "zcta_2020_area | \n", + "ZCTA5CE10 | \n", + "AFFGEOID10 | \n", + "GEOID10 | \n", + "ALAND10 | \n", + "AWATER10 | \n", + "geometry | \n", + "area_joined | \n", + "percent of 2020 in 2010 | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "35768 | \n", + "860Z200US35768 | \n", + "35768 | \n", + "35768 | \n", + "Z5 | \n", + "446231990 | \n", + "3736014 | \n", + "6.688056e+08 | \n", + "35776 | \n", + "8600000US35776 | \n", + "35776 | \n", + "234072461 | \n", + "1041223 | \n", + "GEOMETRYCOLLECTION (POLYGON ((-9597648.456 411... | \n", + "2.193417e+06 | \n", + "0.003280 | \n", + "
1 | \n", + "35769 | \n", + "860Z200US35769 | \n", + "35769 | \n", + "35769 | \n", + "Z5 | \n", + "163279214 | \n", + "57835709 | \n", + "3.270629e+08 | \n", + "35776 | \n", + "8600000US35776 | \n", + "35776 | \n", + "234072461 | \n", + "1041223 | \n", + "GEOMETRYCOLLECTION (POLYGON ((-9596251.174 410... | \n", + "4.423302e+03 | \n", + "0.000014 | \n", + "
2 | \n", + "35776 | \n", + "860Z200US35776 | \n", + "35776 | \n", + "35776 | \n", + "Z5 | \n", + "268376689 | \n", + "1277083 | \n", + "3.994217e+08 | \n", + "35776 | \n", + "8600000US35776 | \n", + "35776 | \n", + "234072461 | \n", + "1041223 | \n", + "GEOMETRYCOLLECTION (POLYGON ((-9609657.158 411... | \n", + "3.377086e+08 | \n", + "0.845494 | \n", + "
3 | \n", + "35774 | \n", + "860Z200US35774 | \n", + "35774 | \n", + "35774 | \n", + "Z5 | \n", + "36139337 | \n", + "362969 | \n", + "5.424829e+07 | \n", + "35776 | \n", + "8600000US35776 | \n", + "35776 | \n", + "234072461 | \n", + "1041223 | \n", + "GEOMETRYCOLLECTION (POLYGON ((-9592577.408 413... | \n", + "3.181710e+06 | \n", + "0.058651 | \n", + "
4 | \n", + "35747 | \n", + "860Z200US35747 | \n", + "35747 | \n", + "35747 | \n", + "Z5 | \n", + "195112094 | \n", + "9300885 | \n", + "3.016341e+08 | \n", + "35776 | \n", + "8600000US35776 | \n", + "35776 | \n", + "234072461 | \n", + "1041223 | \n", + "GEOMETRYCOLLECTION (POLYGON ((-9608245.738 410... | \n", + "5.117346e+06 | \n", + "0.016965 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
210691 | \n", + "72046 | \n", + "860Z200US72046 | \n", + "72046 | \n", + "72046 | \n", + "Z5 | \n", + "603015221 | \n", + "20383513 | \n", + "9.214782e+08 | \n", + "72037 | \n", + "8600000US72037 | \n", + "72037 | \n", + "388714 | \n", + "0 | \n", + "MULTIPOLYGON (((-10226266.281 4102252.880, -10... | \n", + "5.753217e+05 | \n", + "0.000624 | \n", + "
210692 | \n", + "13156 | \n", + "860Z200US13156 | \n", + "13156 | \n", + "13156 | \n", + "Z5 | \n", + "76214823 | \n", + "6028269 | \n", + "1.521320e+08 | \n", + "13064 | \n", + "8600000US13064 | \n", + "13064 | \n", + "461830 | \n", + "2948339 | \n", + "MULTIPOLYGON (((-8538756.977 5360283.214, -853... | \n", + "6.455585e+06 | \n", + "0.042434 | \n", + "
210693 | \n", + "06850 | \n", + "860Z200US06850 | \n", + "06850 | \n", + "06850 | \n", + "Z5 | \n", + "17563836 | \n", + "206664 | \n", + "3.140506e+07 | \n", + "06856 | \n", + "8600000US06856 | \n", + "06856 | \n", + "9568 | \n", + "0 | \n", + "POLYGON ((-8173142.581 5028860.482, -8173156.2... | \n", + "1.245927e+04 | \n", + "0.000397 | \n", + "
210694 | \n", + "99632 | \n", + "860Z200US99632 | \n", + "99632 | \n", + "99632 | \n", + "Z5 | \n", + "65153947 | \n", + "0 | \n", + "2.965687e+08 | \n", + "99632 | \n", + "8600000US99632 | \n", + "99632 | \n", + "65153947 | \n", + "0 | \n", + "POLYGON ((-18231773.509 8888791.228, -18230845... | \n", + "2.959838e+08 | \n", + "0.998028 | \n", + "
210695 | \n", + "99658 | \n", + "860Z200US99658 | \n", + "99658 | \n", + "99658 | \n", + "Z5 | \n", + "111057651 | \n", + "318145 | \n", + "5.059522e+08 | \n", + "99658 | \n", + "8600000US99658 | \n", + "99658 | \n", + "110490326 | \n", + "913652 | \n", + "POLYGON ((-18196204.928 8884024.527, -18158121... | \n", + "5.010581e+08 | \n", + "0.990327 | \n", + "
210696 rows × 16 columns
\n", + "