Commit 9f23ccd

refactor(DRAFT): Migrate to datapackage.json only

Major switch from multiple GitHub/npm endpoints to a single file. Only possible following vega/vega-datasets#665. Still need to rewrite/fill out the `Metadata` doc, then move on to features.
1 parent df26bc2 · commit 9f23ccd

16 files changed: +106 −981 lines

altair/datasets/_cache.py

+6 −6

@@ -43,9 +43,7 @@
 _T = TypeVar("_T")

 _URL: Final[Path] = Path(__file__).parent / "_metadata" / "url.csv.gz"
-_SCHEMA: Final[Path] = (
-    Path(__file__).parent / "_metadata" / "datapackage_schemas.json.gz"
-)
+_SCHEMA: Final[Path] = Path(__file__).parent / "_metadata" / "schemas.json.gz"

 _FIELD_TO_DTYPE: Mapping[FlFieldStr, type[DType]] = {
     "integer": nw.Int64,
@@ -118,7 +116,7 @@ def __init__(
         fp: Path,
         /,
         *,
-        columns: tuple[str, str] = ("dataset_name", "url_npm"),
+        columns: tuple[str, str],
         tp: type[MutableMapping[_KT, _VT]] = dict["_KT", "_VT"],
     ) -> None:
         self.fp: Path = fp
@@ -253,7 +251,7 @@ def download_all(self) -> None:
         stems = tuple(fp.stem for fp in self)
         predicates = (~(nw.col("sha").is_in(stems)),) if stems else ()
         frame = (
-            self._rd._scan_metadata(predicates, is_image=False)  # type: ignore
+            self._rd._scan_metadata(predicates, is_image=False)
             .select("sha", "suffix", "url")
             .unique("sha")
             .collect()
@@ -338,5 +336,7 @@ def _ensure_active(self) -> None:
         raise ValueError(msg)


-url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL)
+url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(
+    _URL, columns=("dataset_name", "url")
+)
 schema_cache = SchemaCache(_SCHEMA)
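With the datapackage-derived columns, `UrlCache` no longer assumes `("dataset_name", "url_npm")`, so the module-level `url_cache` now names its columns explicitly. As a rough illustration of what such a lookup boils down to, here is a minimal sketch assuming a gzip-compressed CSV with a header row; the real class lazily fills a `MutableMapping` and is typed over its key/value pair, so treat this as a stand-in rather than the actual implementation:

```python
import csv
import gzip
from pathlib import Path


def load_url_mapping(fp: Path, columns: tuple[str, str]) -> dict[str, str]:
    """Illustrative stand-in for ``UrlCache``: map dataset names to urls."""
    key_col, value_col = columns
    with gzip.open(fp, mode="rt", encoding="utf-8", newline="") as f:
        return {row[key_col]: row[value_col] for row in csv.DictReader(f)}


# Hypothetical usage mirroring the new construction above:
# urls = load_url_mapping(_URL, columns=("dataset_name", "url"))
```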
Binary file not shown.

−9.85 KB
Binary file not shown.

altair/datasets/_metadata/url.csv.gz

3 Bytes
Binary file not shown.

altair/datasets/_readers.py

+1 −4

@@ -86,9 +86,6 @@
 __all__ = ["backend"]

 _METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet"
-_DATAPACKAGE: Final[Path] = (
-    Path(__file__).parent / "_metadata" / "datapackage_features.parquet"
-)


 class AltairDatasetsError(Exception): ...
@@ -209,7 +206,7 @@ def query(
     def _scan_metadata(
         self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata]
     ) -> nw.LazyFrame:
-        frame = nw.from_native(self.scan_fn(_DATAPACKAGE)(_DATAPACKAGE)).lazy()
+        frame = nw.from_native(self.scan_fn(_METADATA)(_METADATA)).lazy()
         if predicates or constraints:
             return frame.filter(*predicates, **constraints)
         return frame
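Everything now funnels through the single `metadata.parquet`: `_scan_metadata` wraps whatever the backend's scan function returns in a narwhals `LazyFrame`, then applies any positional predicates plus `Metadata` keyword constraints. A minimal sketch of the same pattern, using `polars.scan_parquet` as a stand-in for the backend-dispatched `scan_fn` (the column names in the comment come from the reworked `Metadata` keys; the whole snippet is illustrative, not the shipped method):

```python
import narwhals.stable.v1 as nw
import polars as pl


def scan_metadata(path: str, *predicates, **constraints) -> nw.LazyFrame:
    # polars stands in for whichever scan function the active backend provides
    frame = nw.from_native(pl.scan_parquet(path)).lazy()
    if predicates or constraints:
        return frame.filter(*predicates, **constraints)
    return frame


# e.g. only tabular ``.json`` entries:
# scan_metadata("metadata.parquet", is_tabular=True, suffix=".json").collect()
```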

altair/datasets/_typing.py

+44 −91

@@ -22,17 +22,10 @@
 from typing_extensions import TypeAlias


-__all__ = [
-    "EXTENSION_SUFFIXES",
-    "VERSION_LATEST",
-    "Dataset",
-    "Extension",
-    "Metadata",
-    "Version",
-    "is_ext_read",
-]
+__all__ = ["EXTENSION_SUFFIXES", "Dataset", "Extension", "Metadata", "is_ext_read"]

 Dataset: TypeAlias = Literal[
+    "7zip",
     "airports",
     "annual-precip",
     "anscombe",
@@ -42,13 +35,13 @@
     "budgets",
     "burtin",
     "cars",
-    "climate",
     "co2-concentration",
     "countries",
     "crimea",
     "disasters",
     "driving",
     "earthquakes",
+    "ffox",
     "flare",
     "flare-dependencies",
     "flights-10k",
@@ -61,12 +54,11 @@
     "football",
     "gapminder",
     "gapminder-health-income",
+    "gimp",
     "github",
     "global-temp",
-    "graticule",
     "income",
     "iowa-electricity",
-    "iris",
     "jobs",
     "la-riots",
     "londonBoroughs",
@@ -86,10 +78,8 @@
     "political-contributions",
     "population",
     "population_engineers_hurricanes",
-    "seattle-temps",
     "seattle-weather",
     "seattle-weather-hourly-normals",
-    "sf-temps",
     "sp500",
     "sp500-2000",
     "stocks",
@@ -102,71 +92,24 @@
     "us-state-capitals",
     "volcano",
     "weather",
-    "weball26",
+    "weekly-weather",
     "wheat",
     "windvectors",
     "world-110m",
     "zipcodes",
 ]
-Version: TypeAlias = Literal[
-    "v2.11.0",
-    "v2.10.0",
-    "v2.9.0",
-    "v2.8.1",
-    "v2.8.0",
-    "v2.7.0",
-    "v2.5.4",
-    "v2.5.3",
-    "v2.5.3-next.0",
-    "v2.5.2",
-    "v2.5.2-next.0",
-    "v2.5.1",
-    "v2.5.1-next.0",
-    "v2.5.0",
-    "v2.5.0-next.0",
-    "v2.4.0",
-    "v2.3.1",
-    "v2.3.0",
-    "v2.1.0",
-    "v2.0.0",
-    "v1.31.1",
-    "v1.31.0",
-    "v1.30.4",
-    "v1.30.3",
-    "v1.30.2",
-    "v1.30.1",
-    "v1.29.0",
-    "v1.24.0",
-    "v1.22.0",
-    "v1.21.1",
-    "v1.21.0",
-    "v1.20.0",
-    "v1.19.0",
-    "v1.18.0",
-    "v1.17.0",
-    "v1.16.0",
-    "v1.15.0",
-    "v1.14.0",
-    "v1.12.0",
-    "v1.11.0",
-    "v1.10.0",
-    "v1.8.0",
-    "v1.7.0",
-    "v1.5.0",
-]
-Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow", ".parquet"]
-VERSION_LATEST: Literal["v2.11.0"] = "v2.11.0"
+Extension: TypeAlias = Literal[".arrow", ".csv", ".json", ".parquet", ".tsv"]
 EXTENSION_SUFFIXES: tuple[
+    Literal[".arrow"],
     Literal[".csv"],
     Literal[".json"],
-    Literal[".tsv"],
-    Literal[".arrow"],
     Literal[".parquet"],
-] = (".csv", ".json", ".tsv", ".arrow", ".parquet")
+    Literal[".tsv"],
+] = (".arrow", ".csv", ".json", ".parquet", ".tsv")


 def is_ext_read(suffix: Any) -> TypeIs[Extension]:
-    return suffix in {".csv", ".json", ".tsv", ".arrow", ".parquet"}
+    return suffix in {".arrow", ".csv", ".json", ".parquet", ".tsv"}


 class Metadata(TypedDict, total=False):
@@ -177,29 +120,34 @@ class Metadata(TypedDict, total=False):
     ----------
     dataset_name
         Name of the dataset/`Path.stem`_.
-    ext_supported
-        Dataset can be read as tabular data.
+    suffix
+        File extension/`Path.suffix`_.
     file_name
         Equivalent to `Path.name`_.
-    name_collision
-        Dataset is available via multiple formats.
-
-        .. note::
-            Requires specifying a preference in calls to ``data(name, suffix=...)``
+    bytes
+        File size in *bytes*.
+    is_image
+        _description_
+    is_tabular
+        Can be read as tabular data.
+    is_geo
+        _description_
+    is_topo
+        _description_
+    is_spatial
+        _description_
+    is_json
+        _description_
+    has_schema
+        Data types available for improved ``pandas`` parsing.
     sha
         Unique hash for the dataset.

         .. note::
-            If the dataset did *not* change between ``v1.0.0``-``v2.0.0``;
+            E.g. if the dataset did *not* change between ``v1.0.0``-``v2.0.0``;

-            then all ``tag``(s) in this range would **share** this value.
-    size
-        File size (*bytes*).
-    suffix
-        File extension/`Path.suffix`_.
-    tag
-        Version identifier for a `vega-datasets release`_.
-    url_npm
+            then this value would remain stable.
+    url
         Remote url used to access dataset.

     .. _Path.stem:
@@ -208,13 +156,14 @@ class Metadata(TypedDict, total=False):
         https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.name
     .. _Path.suffix:
         https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix
-    .. _vega-datasets release:
-        https://github.com/vega/vega-datasets/releases
+

     Examples
     --------
     ``Metadata`` keywords form constraints to filter a table like the below sample:

+    ### FIXME: NEEDS UPDATING TO DATAPACKAGE VERSION
+
     ```
     shape: (2_879, 9)
     ┌───────────┬──────────┬──────────┬──────────┬───┬────────┬─────────┬──────────┐
@@ -249,14 +198,18 @@ class Metadata(TypedDict, total=False):
     """

     dataset_name: str
-    ext_supported: bool
+    suffix: str
     file_name: str
-    name_collision: bool
+    bytes: int
+    is_image: bool
+    is_tabular: bool
+    is_geo: bool
+    is_topo: bool
+    is_spatial: bool
+    is_json: bool
+    has_schema: bool
     sha: str
-    size: int
-    suffix: str
-    tag: str
-    url_npm: str
+    url: str


 FlFieldStr: TypeAlias = Literal[
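For orientation, the reshaped `Metadata` keys double as keyword constraints (as the docstring notes), and `is_ext_read` is the `TypeIs` guard over the readable suffixes. A small usage sketch against this branch's `altair.datasets._typing` module, with placeholder values:

```python
from pathlib import Path

from altair.datasets._typing import Metadata, is_ext_read

# ``total=False`` means any subset of keys is a valid partial constraint mapping,
# e.g. "only tabular JSON entries".
constraints: Metadata = {"suffix": ".json", "is_tabular": True}

suffix = Path("flights-10k.json").suffix
if is_ext_read(suffix):
    # ``suffix`` is narrowed from ``str`` to the ``Extension`` Literal here
    constraints["suffix"] = suffix
```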

tests/test_datasets.py

+2 −7

@@ -16,7 +16,7 @@
 from narwhals.stable.v1 import dependencies as nw_dep

 from altair.datasets import Loader, url
-from altair.datasets._readers import _METADATA, AltairDatasetsError
+from altair.datasets._readers import AltairDatasetsError
 from altair.datasets._typing import Dataset, Extension, Metadata, is_ext_read
 from tests import skip_requires_pyarrow, slow

@@ -296,9 +296,6 @@ def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None:
     assert match_url("flights-10k", url("flights-10k"))
     assert match_url("flights-200k", url("flights-200k"))

-    with pytest.raises(TypeError, match="cannot be loaded via url"):
-        url("climate")
-
     with pytest.raises(TypeError, match="cannot be loaded via url"):
         url("flights-3m")

@@ -690,9 +687,7 @@ def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -
 def test_metadata_columns(backend: _Backend, metadata_columns: frozenset[str]) -> None:
     """Ensure all backends will query the same column names."""
     data = Loader.from_backend(backend)
-    fn = data._reader.scan_fn(_METADATA)
-    native = fn(_METADATA)
-    schema_columns = nw.from_native(native).lazy().collect().columns
+    schema_columns = data._reader._scan_metadata().collect().columns
     assert set(schema_columns) == metadata_columns
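The `metadata_columns` fixture itself is not shown in this commit; presumably it now mirrors the reworked `Metadata` keys. A hedged guess at its shape, purely for illustration:

```python
# Hypothetical fixture value: mirrors the keys of the new ``Metadata`` TypedDict.
metadata_columns = frozenset(
    (
        "dataset_name", "suffix", "file_name", "bytes",
        "is_image", "is_tabular", "is_geo", "is_topo",
        "is_spatial", "is_json", "has_schema", "sha", "url",
    )
)
```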