Skip to content

Commit 40620d6

Browse files
feat: adds generation script for income.json (#672)
* feat: adds generation script for income.json
* style: format income.py with ruff
* refactor: convert lambda sort key to named function

  Replace lambda sort key in process_state_records with a named get_state_income_sort_key function for better readability and maintainability. This makes the sorting logic more explicit and follows Python's guidance on avoiding complex lambdas.
* ci(typing): Include `income.py` for `pyright`
* fix: Avoid `CRLF` on `win32` c8f3056, #653
* feat(typing): Utilize typing some more
  - shared `group` field is now hinted for all 4 places it is used
    - Including as a key function
  - added `Region` to indicate only `5` unique values
  - changed global constants `dict` -> `Mapping` to reflect they are not mutated
  - changed `AggregatedIncomeGroup` field to required
    - Otherwise it is exactly `BaseIncomeGroup`
    - The annotation already reflects that `BaseIncomeGroup | AggregatedIncomeGroup`
* build: run `build_datapackage.py`

  Should have been done during #671 (`point.json`). The diff on `income.json` seems like removing a newline char?

---------

Co-authored-by: dangotbanned <[email protected]>
1 parent 4ff1874 commit 40620d6

File tree

5 files changed

+252
-48
lines changed

5 files changed

+252
-48
lines changed

data/income.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -4159,4 +4159,4 @@
41594159
"total": 1254274,
41604160
"group": "200000+"
41614161
}
4162-
]
4162+
]

datapackage.json

+3-31
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
}
2121
],
2222
"version": "2.11.0",
23-
"created": "2025-01-23T15:50:44.713255+00:00",
23+
"created": "2025-01-25T11:55:43.947980+00:00",
2424
"resources": [
2525
{
2626
"name": "7zip.png",
@@ -1644,8 +1644,8 @@
16441644
"format": "json",
16451645
"mediatype": "text/json",
16461646
"encoding": "utf-8",
1647-
"hash": "sha1:ebfd02fd584009ee391bfc5d33972e4c94f507ab",
1648-
"bytes": 72771,
1647+
"hash": "sha1:50bc780ef4a81e4f67c5ab2686ff10ba9798a951",
1648+
"bytes": 72770,
16491649
"dialect": {
16501650
"json": {
16511651
"keyed": true
@@ -2332,34 +2332,6 @@
23322332
]
23332333
}
23342334
},
2335-
{
2336-
"name": "points.json",
2337-
"type": "table",
2338-
"path": "points.json",
2339-
"scheme": "file",
2340-
"format": "json",
2341-
"mediatype": "text/json",
2342-
"encoding": "utf-8",
2343-
"hash": "sha1:4716a117308962f3596179d7d7d2ad729a19cda7",
2344-
"bytes": 4926,
2345-
"dialect": {
2346-
"json": {
2347-
"keyed": true
2348-
}
2349-
},
2350-
"schema": {
2351-
"fields": [
2352-
{
2353-
"name": "x",
2354-
"type": "number"
2355-
},
2356-
{
2357-
"name": "y",
2358-
"type": "number"
2359-
}
2360-
]
2361-
}
2362-
},
23632335
{
23642336
"name": "political-contributions.json",
23652337
"type": "table",

datapackage.md

+1-10
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# vega-datasets
2-
`2.11.0` | [GitHub](http://github.com/vega/vega-datasets.git) | 2025-01-23 15:50:44 [UTC]
2+
`2.11.0` | [GitHub](http://github.com/vega/vega-datasets.git) | 2025-01-25 11:55:43 [UTC]
33

44
Common repository for example datasets used by Vega related projects.
55
BSD-3-Clause license applies only to package code and infrastructure. Users should verify their use of datasets
@@ -988,15 +988,6 @@ Assets from the video game Celeste.
988988
| title | path |
989989
|:-------------|:----------------------------|
990990
| Celeste Game | http://www.celestegame.com/ |
991-
## `points.json`
992-
### path
993-
points.json
994-
### schema
995-
996-
| name | type |
997-
|:-------|:-------|
998-
| x | number |
999-
| y | number |
1000991
## `political-contributions.json`
1001992
### path
1002993
political-contributions.json

pyproject.toml

+10-6
Original file line numberDiff line numberDiff line change
@@ -109,9 +109,13 @@ select = [
109109

110110
[tool.pyright]
111111
enableExperimentalFeatures = true
112-
ignore = ["../../../**/Lib", ".venv"]
113-
include = ["./scripts/build_datapackage.py", "./scripts/flights.py"]
114-
pythonPlatform = "All"
115-
pythonVersion = "3.12"
116-
reportUnusedExpression = "none"
117-
typeCheckingMode = "basic"
112+
ignore = ["../../../**/Lib", ".venv"]
113+
include = [
114+
"./scripts/build_datapackage.py",
115+
"./scripts/flights.py",
116+
"./scripts/income.py",
117+
]
118+
pythonPlatform = "All"
119+
pythonVersion = "3.12"
120+
reportUnusedExpression = "none"
121+
typeCheckingMode = "basic"

scripts/income.py

+237
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
from __future__ import annotations
2+
3+
import json
4+
import typing
5+
from pathlib import Path
6+
from typing import TYPE_CHECKING, Any, Literal, TypedDict
7+
8+
import niquests
9+
10+
if TYPE_CHECKING:
11+
from collections.abc import Callable, Mapping, Sequence
12+
13+
type IncomeGroup = Literal[
14+
"<10000",
15+
"10000 to 14999",
16+
"15000 to 24999",
17+
"25000 to 34999",
18+
"35000 to 49999",
19+
"75000 to 99999",
20+
"50000 to 74999",
21+
"100000 to 149999",
22+
"150000 to 199999",
23+
"200000+",
24+
]
25+
"""Income group order for sorting."""
26+
27+
# Closed set of region labels; these are the only values REGION_MAPPING may
# assign to a state code.
type Region = Literal["south", "west", "northeast", "midwest", "other"]
28+
29+
# Repository structure
# Two `.parent` hops from this file: the first is the directory holding the
# script, the second is the repository root.
REPO_ROOT = Path(__file__).parent.parent
OUTPUT_DIR = REPO_ROOT / "data"
# Destination file that `main` writes the generated records to.
OUTPUT_FILE = OUTPUT_DIR / "income.json"

# API Configuration
# Census API endpoint; the path segments select the 2013 `acs/acs3` dataset.
URL_BASE = "https://api.census.gov/data/2013/acs/acs3"
# NOTE(review): presumably `group(...)` returns every variable of table
# B19001 in one request - confirm against the Census API documentation.
QUERY_FIELDS = "group(B19001)"
# Full request URL: the fields above, for every `state` geography (`*`).
URL_QUERY = f"{URL_BASE}?get={QUERY_FIELDS}&for=state:*"
38+
39+
40+
class CensusResponse(TypedDict):
    """
    Census API response after initial parsing.

    Produced by ``get_census_data``, which splits the decoded JSON payload
    into its first row and all remaining rows.
    """

    # First row of the payload; used as the column names for every data row.
    header: list[str]
    # Remaining rows; each one is paired positionally with ``header``.
    data: list[list[str]]
45+
46+
47+
class BaseIncomeGroup(TypedDict):
    """Base income group with required fields."""

    # Label of the income bracket; restricted to the ``IncomeGroup`` literals.
    group: IncomeGroup
51+
52+
53+
class AggregatedIncomeGroup(BaseIncomeGroup):
    """Income group that includes aggregation variables."""

    # Census variable codes whose values are summed to form this group's count.
    sum_vars: Sequence[str]
57+
58+
59+
class StateIncome(BaseIncomeGroup):
    """Record for each state and income group combination."""

    # Value of the response's "NAME" column for the row.
    name: str
    # Region looked up in REGION_MAPPING using the row's "state" code.
    region: Region
    # The row's "state" code converted to an integer.
    id: int
    # This group's count divided by ``total``, rounded to three decimals.
    pct: float
    # Value of the "B19001_001E" column for the row, as an integer.
    total: int
67+
68+
69+
# Income groupings and variable mappings
# Keys are Census variable codes from group B19001; a comma-joined key lists
# the codes that are combined into one output group.  A plain-string value
# (only "total" here) marks a column that is not an income group and is
# skipped when records are built.  A dict value names the output group and,
# through "sum_vars", the columns whose counts are added together; when
# "sum_vars" is absent the key itself is the single column that is read.
INCOME_MAPPING: Mapping[str, str | BaseIncomeGroup | AggregatedIncomeGroup] = {
    "B19001_001E": "total",
    "B19001_002E": {"group": "<10000"},
    "B19001_003E": {"group": "10000 to 14999"},
    "B19001_004E,B19001_005E": {
        "group": "15000 to 24999",
        "sum_vars": ["B19001_004E", "B19001_005E"],
    },
    "B19001_006E,B19001_007E": {
        "group": "25000 to 34999",
        "sum_vars": ["B19001_006E", "B19001_007E"],
    },
    "B19001_008E,B19001_009E,B19001_010E": {
        "group": "35000 to 49999",
        "sum_vars": ["B19001_008E", "B19001_009E", "B19001_010E"],
    },
    "B19001_011E,B19001_012E": {
        "group": "50000 to 74999",
        "sum_vars": ["B19001_011E", "B19001_012E"],
    },
    "B19001_013E": {"group": "75000 to 99999"},
    "B19001_014E,B19001_015E": {
        "group": "100000 to 149999",
        "sum_vars": ["B19001_014E", "B19001_015E"],
    },
    "B19001_016E": {"group": "150000 to 199999"},
    "B19001_017E": {"group": "200000+"},
}
98+
99+
# Region definitions
# Keys are the two-character codes found in the response's "state" column
# (kept as strings, leading zero included); values are the region assigned to
# that state.  Rows whose code is not listed here are skipped by
# `process_state_records`.
REGION_MAPPING: Mapping[str, Region] = {
    "01": "south",
    "02": "west",
    "04": "west",
    "05": "south",
    "06": "west",
    "08": "west",
    "09": "northeast",
    "10": "south",
    "11": "south",
    "12": "south",
    "13": "south",
    "15": "west",
    "16": "west",
    "17": "midwest",
    "18": "midwest",
    "19": "midwest",
    "20": "midwest",
    "21": "south",
    "22": "south",
    "23": "northeast",
    "24": "south",
    "25": "northeast",
    "26": "midwest",
    "27": "midwest",
    "28": "south",
    "29": "midwest",
    "30": "west",
    "31": "midwest",
    "32": "west",
    "33": "northeast",
    "34": "northeast",
    "35": "west",
    "36": "northeast",
    "37": "south",
    "38": "midwest",
    "39": "midwest",
    "40": "south",
    "41": "west",
    "42": "northeast",
    "44": "northeast",
    "45": "south",
    "46": "midwest",
    "47": "south",
    "48": "south",
    "49": "west",
    "50": "northeast",
    "51": "south",
    "53": "west",
    "54": "south",
    "55": "midwest",
    "56": "west",
    "72": "other",
}
154+
155+
156+
def get_census_data() -> CensusResponse:
    """
    Download the household income table from the Census API.

    The decoded JSON payload is returned split into its first row
    (``header``) and every following row (``data``).

    Raises ``ValueError`` when the payload is empty or has fewer than two
    rows.  HTTP error statuses are surfaced by ``raise_for_status``.
    """
    reply = niquests.get(URL_QUERY)
    reply.raise_for_status()
    payload = reply.json()

    # A usable payload needs a header row plus at least one data row.
    has_rows = bool(payload) and len(payload) >= 2
    if not has_rows:
        msg = f"Invalid API response format: {payload}"
        raise ValueError(msg)

    return {"header": payload[0], "data": payload[1:]}
167+
168+
169+
def by_state_income_group(tp: Any, /) -> Callable[[StateIncome], tuple[int, int]]:
    """
    Build a sort-key function for state income records.

    ``tp`` is either a ``Literal[...]`` form or a ``type`` alias wrapping one;
    the order of its arguments defines the income group ranking.

    The returned callable maps a record to ``(state_id, group_rank)`` so that
    records sort by state first and by income group second.
    """
    # A `type X = ...` alias keeps the real annotation on `__value__`;
    # anything without that attribute is inspected directly.
    literal_form = getattr(tp, "__value__", tp)
    ranked_groups = typing.get_args(literal_form)

    def sort_key(entry: StateIncome, /) -> tuple[int, int]:
        state_id = entry["id"]
        group_rank = ranked_groups.index(entry["group"])
        return state_id, group_rank

    return sort_key
182+
183+
184+
def process_state_records(census_data: CensusResponse) -> list[StateIncome]:
    """
    Convert raw census rows into one record per state and income group.

    Rows whose ``state`` code is missing from ``REGION_MAPPING`` are dropped.
    Each group's share is its count divided by the row's ``B19001_001E``
    total, rounded to three decimals.  The result is ordered by state id and
    then by income group.
    """
    columns = census_data["header"]
    collected: list[StateIncome] = []

    for values in census_data["data"]:
        fields = dict(zip(columns, values, strict=True))
        fips = fields["state"]
        if fips not in REGION_MAPPING:
            # No region is defined for this geography.
            continue

        overall = int(fields["B19001_001E"])

        for code, spec in INCOME_MAPPING.items():
            if not isinstance(spec, dict):
                # Plain-string entries are not income groups.
                continue

            label = spec["group"]
            # Without "sum_vars" the mapping key is the only column to read.
            source_codes = spec["sum_vars"] if "sum_vars" in spec else (code,)
            count = sum(int(fields[column]) for column in source_codes)

            collected.append(
                StateIncome(
                    name=fields["NAME"],
                    region=REGION_MAPPING[fips],
                    id=int(fips),
                    pct=round(count / overall, 3),
                    total=overall,
                    group=label,
                )
            )

    collected.sort(key=by_state_income_group(IncomeGroup))
    return collected
218+
219+
220+
def write_json(data: list[StateIncome], output: Path) -> None:
    """
    Serialize ``data`` as 2-space-indented JSON and save it to ``output``.

    The file is written as UTF-8 with line endings pinned to LF, so no CRLF
    translation happens on Windows.
    """
    serialized = json.dumps(data, indent=2)
    with output.open("w", encoding="utf-8", newline="\n") as stream:
        stream.write(serialized)
223+
224+
225+
def main() -> None:
    """Fetch the census data, build the income records and save them as JSON."""
    records = process_state_records(get_census_data())
    print(f"Found {len(records)} state-income group combinations")

    # Make sure the output directory exists before writing into it.
    destination = OUTPUT_FILE
    destination.parent.mkdir(parents=True, exist_ok=True)
    write_json(records, destination)
    print(f"Data written to {destination}")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)