From 7933771fdfd8590c892935b23e2bf3816100db36 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 2 Oct 2024 22:13:46 +0100 Subject: [PATCH 001/201] wip --- tools/vendor_datasets.py | 660 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 660 insertions(+) create mode 100644 tools/vendor_datasets.py diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py new file mode 100644 index 000000000..003e55062 --- /dev/null +++ b/tools/vendor_datasets.py @@ -0,0 +1,660 @@ +from __future__ import annotations + +import json +import pkgutil +import sys +import textwrap +from functools import partial +from io import BytesIO +from pathlib import Path +from typing import Any, Iterable, Literal, cast +from urllib.request import urlopen + +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +import pandas as pd +import polars as pl + +# This is the tag in http://github.com/vega/vega-datasets from +# which the datasets in this repository are sourced. +SOURCE_TAG = "v1.29.0" # 5 years ago +CURRENT_TAG = "v2.9.0" +USE_TAG = CURRENT_TAG + +BASE_URL = f"https://cdn.jsdelivr.net/npm/vega-datasets@{USE_TAG}/data/" + +ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv"] + + +def _load_dataset_info() -> dict[str, dict[str, Any]]: + """ + Loads dataset info from three package files. + + vega_datasets/datasets.json + vega_datasets/dataset_info.json + vega_datasets/local_datasets.json + + It returns a dictionary with dataset information. + """ + + def load_json(path: str) -> dict[str, Any]: + raw = pkgutil.get_data("vega_datasets", path) + if raw is None: + msg = f"Cannot locate package path vega_datasets:{path}" + raise ValueError(msg) + return json.loads(raw.decode()) + + info = load_json("datasets.json") + descriptions = load_json("dataset_info.json") + local_datasets = load_json("local_datasets.json") + + for name in info: + info[name]["is_local"] = name in local_datasets + for name in descriptions: + info[name].update(descriptions[name]) + + return info + + +class Dataset: + """Class to load a particular dataset by name.""" + + _instance_doc = """Loader for the {name} dataset. + + {data_description} + + {bundle_info} + Dataset source: {url} + + Usage + ----- + + >>> from vega_datasets import data + >>> {methodname} = data.{methodname}() + >>> type({methodname}) + {return_type} + + Equivalently, you can use + + >>> {methodname} = data('{name}') + + To get the raw dataset rather than the dataframe, use + + >>> data_bytes = data.{methodname}.raw() + >>> type(data_bytes) + bytes + + To find the dataset url, use + + >>> data.{methodname}.url + '{url}' + {additional_docs} + Attributes + ---------- + filename : string + The filename in which the dataset is stored + url : string + The full URL of the dataset at http://vega.github.io + format : string + The format of the dataset: usually one of {{'csv', 'tsv', 'json'}} + pkg_filename : string + The path to the local dataset within the vega_datasets package + is_local : bool + True if the dataset is available locally in the package + filepath : string + If is_local is True, the local file path to the dataset. 
+ + {reference_info} + """ + _additional_docs = "" + _reference_info = """ + For information on this dataset, see https://github.com/vega/vega-datasets/ + """ + base_url = "https://cdn.jsdelivr.net/npm/vega-datasets@" + SOURCE_TAG + "/data/" + _dataset_info = _load_dataset_info() + _pd_read_kwds: dict[str, Any] = {} + _return_type = pd.DataFrame + name: str + + @classmethod + def init(cls, name: str) -> Dataset: + """Return an instance of this class or an appropriate subclass.""" + clsdict = { + subcls.name: subcls + for subcls in cls.__subclasses__() + if hasattr(subcls, "name") + } + return clsdict.get(name, cls)(name) + + def __init__(self, name: str): + info = self._infodict(name) + self.name = name + self.methodname = name.replace("-", "_") + self.filename = info["filename"] + self.url = self.base_url + info["filename"] + self.format = info["format"] + self.pkg_filename = "_data/" + self.filename + self.is_local = info["is_local"] + self.description = info.get("description", None) + self.references = info.get("references", None) + self.__doc__ = self._make_docstring() + + @classmethod + def list_datasets(cls) -> list[str]: + """Return a list of names of available datasets.""" + return sorted(cls._dataset_info.keys()) + + @classmethod + def list_local_datasets(cls) -> list[str]: + return sorted( + name for name, info in cls._dataset_info.items() if info["is_local"] + ) + + @classmethod + def _infodict(cls, name: str) -> dict[str, str]: + """Load the info dictionary for the given name.""" + info = cls._dataset_info.get(name, None) + if info is None: + msg = ( + f"No such dataset {name} exists, " + "use list_datasets() to get a list " + "of available datasets." + ) + raise ValueError(msg) + return info + + def raw(self, use_local: bool = True) -> bytes: + """Load the raw dataset from remote URL or local file.""" + if use_local and self.is_local: + out = pkgutil.get_data("vega_datasets", self.pkg_filename) + if out is not None: + return out + msg = f"Cannot locate package path vega_datasets:{self.pkg_filename}" + raise ValueError(msg) + else: + return urlopen(self.url).read() + + def __call__(self, use_local: bool = True, **kwargs) -> pd.DataFrame: + """Load and parse the dataset from remote URL or local file.""" + datasource = BytesIO(self.raw(use_local=use_local)) + + kwds = self._pd_read_kwds.copy() + kwds.update(kwargs) + + if self.format == "json": + return pd.read_json(datasource, **kwds) + elif self.format == "csv": + return pd.read_csv(datasource, **kwds) + elif self.format == "tsv": + kwds.setdefault("sep", "\t") + return pd.read_csv(datasource, **kwds) + else: + msg = ( + f"Unrecognized file format: {self.format}. " + "Valid options are ['json', 'csv', 'tsv']." + ) + raise ValueError(msg) + + @property + def filepath(self) -> str: + if not self.is_local: + msg = "filepath is only valid for local datasets" + raise ValueError(msg) + else: + return str((Path(__file__).parent / "_data" / self.filename).resolve()) + + def _make_docstring(self) -> str: + info = self._infodict(self.name) + + # construct, indent, and line-wrap dataset description + description = info.get("description", "") + if not description: + description = ( + "This dataset is described at " "https://github.com/vega/vega-datasets/" + ) + wrapper = textwrap.TextWrapper( + width=70, initial_indent="", subsequent_indent=4 * " " + ) + description = "\n".join(wrapper.wrap(description)) + + # construct, indent, and join references + reflist: Iterable[str] = info.get("references", []) + reflist = (f".. 
[{i + 1}] " + ref for i, ref in enumerate(reflist)) + wrapper = textwrap.TextWrapper( + width=70, initial_indent=4 * " ", subsequent_indent=7 * " " + ) + reflist = ("\n".join(wrapper.wrap(ref)) for ref in reflist) + references: str = "\n\n".join(reflist) + if references.strip(): + references = "References\n ----------\n" + references + + # add information about bundling of data + if self.is_local: + bundle_info = ( + "This dataset is bundled with vega_datasets; " + "it can be loaded without web access." + ) + else: + bundle_info = ( + "This dataset is not bundled with vega_datasets; " + "it requires web access to load." + ) + + return self._instance_doc.format( + additional_docs=self._additional_docs, + data_description=description, + reference_info=references, + bundle_info=bundle_info, + return_type=self._return_type, + **self.__dict__, + ) + + +def getattr_to_df(name: str, /) -> pl.DataFrame: + """Subset of what `Dataset` does.""" + js_name = name.replace("_", "-") + file_name = DATASETS_JSON[js_name]["filename"] + suffix = Path(file_name).suffix + if suffix in {".csv", ".json", ".tsv"}: + extension = cast(ExtSupported, suffix) + else: + raise NotImplementedError(suffix, file_name) + + url = f"{BASE_URL}{file_name}" + with urlopen(url) as f: + content = ext_fn(extension)(f) + return content + + +class DSet: + def __init__(self, name: str, /) -> None: + self.name: str = name + js_name = name.replace("_", "-") + file_name = DATASETS_JSON[js_name]["filename"] + suffix = Path(file_name).suffix + self.extension: ExtSupported + if suffix in {".csv", ".json", ".tsv"}: + self.extension = cast(ExtSupported, suffix) + else: + raise NotImplementedError(suffix, file_name) + + self.url: str = f"{BASE_URL}{file_name}" + + def __call__(self, **kwds: Any) -> pl.DataFrame: + with urlopen(self.url) as f: + content = ext_fn(self.extension, **kwds)(f) + return content + + def __repr__(self) -> str: + return ( + f"{type(self).__name__}(\n " + f"name={self.name!r},\n " + f"url={self.url!r}\n" + ")" + ) + + +def ext_fn(ext: ExtSupported, /): + """Very basic mapping to `polars` eager functions.""" + if ext == ".csv": + return pl.read_csv + elif ext == ".json": + return pl.read_json + elif ext == ".tsv": + return partial(pl.read_csv, separator="\t") + else: + raise + + +DATASET_NAMES_USED = [ + "airports", + "anscombe", + "barley", + "cars", + "co2_concentration", + "countries", + "disasters", + "driving", + "earthquakes", + "flights_2k", + "flights_5k", + "flights_airport", + "gapminder_health_income", + "github", + "income", + "iowa_electricity", + "iris", + "jobs", + "londonBoroughs", + "londonCentroids", + "londonTubeLines", + "monarchs", + "movies", + "normal_2d", + "ohlc", + "population", + "population_engineers_hurricanes", + "seattle_weather", + "sp500", + "stocks", + "unemployment", + "unemployment_across_industries", + "us_10m", + "us_employment", + "us_state_capitals", + "us_unemployment", + "wheat", + "windvectors", + "world_110m", + "zipcodes", +] + +DATASETS_JSON = { + # "7zip": {"filename": "7zip.png", "format": "png"}, + "airports": {"filename": "airports.csv", "format": "csv"}, + "annual-precip": {"filename": "annual-precip.json", "format": "json"}, + "anscombe": {"filename": "anscombe.json", "format": "json"}, + "barley": {"filename": "barley.json", "format": "json"}, + "birdstrikes": {"filename": "birdstrikes.json", "format": "json"}, + "budget": {"filename": "budget.json", "format": "json"}, + "budgets": {"filename": "budgets.json", "format": "json"}, + "burtin": {"filename": "burtin.json", 
"format": "json"}, + "cars": {"filename": "cars.json", "format": "json"}, + "climate": {"filename": "climate.json", "format": "json"}, + "co2-concentration": {"filename": "co2-concentration.csv", "format": "csv"}, + "countries": {"filename": "countries.json", "format": "json"}, + "crimea": {"filename": "crimea.json", "format": "json"}, + "disasters": {"filename": "disasters.csv", "format": "csv"}, + "driving": {"filename": "driving.json", "format": "json"}, + "earthquakes": {"filename": "earthquakes.json", "format": "json"}, + # "ffox": {"filename": "ffox.png", "format": "png"}, + "flare": {"filename": "flare.json", "format": "json"}, + "flare-dependencies": {"filename": "flare-dependencies.json", "format": "json"}, + "flights-10k": {"filename": "flights-10k.json", "format": "json"}, + "flights-200k": {"filename": "flights-200k.json", "format": "json"}, + "flights-20k": {"filename": "flights-20k.json", "format": "json"}, + "flights-2k": {"filename": "flights-2k.json", "format": "json"}, + "flights-3m": {"filename": "flights-3m.csv", "format": "csv"}, + "flights-5k": {"filename": "flights-5k.json", "format": "json"}, + "flights-airport": {"filename": "flights-airport.csv", "format": "csv"}, + "gapminder": {"filename": "gapminder.json", "format": "json"}, + "gapminder-health-income": { + "filename": "gapminder-health-income.csv", + "format": "csv", + }, + # "gimp": {"filename": "gimp.png", "format": "png"}, + "github": {"filename": "github.csv", "format": "csv"}, + "graticule": {"filename": "graticule.json", "format": "json"}, + "income": {"filename": "income.json", "format": "json"}, + "iowa-electricity": {"filename": "iowa-electricity.csv", "format": "csv"}, + "iris": {"filename": "iris.json", "format": "json"}, + "jobs": {"filename": "jobs.json", "format": "json"}, + "la-riots": {"filename": "la-riots.csv", "format": "csv"}, + "londonBoroughs": {"filename": "londonBoroughs.json", "format": "json"}, + "londonCentroids": {"filename": "londonCentroids.json", "format": "json"}, + "londonTubeLines": {"filename": "londonTubeLines.json", "format": "json"}, + "lookup_groups": {"filename": "lookup_groups.csv", "format": "csv"}, + "lookup_people": {"filename": "lookup_people.csv", "format": "csv"}, + "miserables": {"filename": "miserables.json", "format": "json"}, + "monarchs": {"filename": "monarchs.json", "format": "json"}, + "movies": {"filename": "movies.json", "format": "json"}, + "normal-2d": {"filename": "normal-2d.json", "format": "json"}, + "obesity": {"filename": "obesity.json", "format": "json"}, + "ohlc": {"filename": "ohlc.json", "format": "json"}, + "points": {"filename": "points.json", "format": "json"}, + "population": {"filename": "population.json", "format": "json"}, + "population_engineers_hurricanes": { + "filename": "population_engineers_hurricanes.csv", + "format": "csv", + }, + "seattle-temps": {"filename": "seattle-temps.csv", "format": "csv"}, + "seattle-weather": {"filename": "seattle-weather.csv", "format": "csv"}, + "sf-temps": {"filename": "sf-temps.csv", "format": "csv"}, + "sp500": {"filename": "sp500.csv", "format": "csv"}, + "stocks": {"filename": "stocks.csv", "format": "csv"}, + "udistrict": {"filename": "udistrict.json", "format": "json"}, + "unemployment": {"filename": "unemployment.tsv", "format": "tsv"}, + "unemployment-across-industries": { + "filename": "unemployment-across-industries.json", + "format": "json", + }, + "uniform-2d": {"filename": "uniform-2d.json", "format": "json"}, + "us-10m": {"filename": "us-10m.json", "format": "json"}, + "us-employment": 
{"filename": "us-employment.csv", "format": "csv"}, + "us-state-capitals": {"filename": "us-state-capitals.json", "format": "json"}, + "volcano": {"filename": "volcano.json", "format": "json"}, + "weather": {"filename": "weather.json", "format": "json"}, + "weball26": {"filename": "weball26.json", "format": "json"}, + "wheat": {"filename": "wheat.json", "format": "json"}, + "windvectors": {"filename": "windvectors.csv", "format": "csv"}, + "world-110m": {"filename": "world-110m.json", "format": "json"}, + "zipcodes": {"filename": "zipcodes.csv", "format": "csv"}, +} + + +class Stocks(Dataset): + name = "stocks" + _additional_docs = """ + For convenience, the stocks dataset supports pivoted output using the + optional `pivoted` keyword. If pivoted is set to True, each company's + price history will be returned in a separate column: + + >>> df = data.stocks() # not pivoted + >>> df.head(3) + symbol date price + 0 MSFT 2000-01-01 39.81 + 1 MSFT 2000-02-01 36.35 + 2 MSFT 2000-03-01 43.22 + + >>> df_pivoted = data.stocks(pivoted=True) + >>> df_pivoted.head() + symbol AAPL AMZN GOOG IBM MSFT + date + 2000-01-01 25.94 64.56 NaN 100.52 39.81 + 2000-02-01 28.66 68.87 NaN 92.11 36.35 + 2000-03-01 33.95 67.00 NaN 106.11 43.22 + """ + _pd_read_kwds = {"parse_dates": ["date"]} + + def __call__(self, pivoted=False, use_local=True, **kwargs): + """ + Load and parse the dataset from remote URL or local file. + + Parameters + ---------- + pivoted : boolean, default False + If True, then pivot data so that each stock is in its own column. + use_local : boolean + If True (default), then attempt to load the dataset locally. If + False or if the dataset is not available locally, then load the + data from an external URL. + **kwargs : + additional keyword arguments are passed to data parser (usually + pd.read_csv or pd.read_json, depending on the format of the data + source) + + Returns + ------- + data : DataFrame + parsed data + """ + __doc__ = super().__call__.__doc__ # noqa:F841 + data = super().__call__(use_local=use_local, **kwargs) + if pivoted: + data = data.pivot(index="date", columns="symbol", values="price") + return data + + +class Cars(Dataset): + name = "cars" + _pd_read_kwds = {"convert_dates": ["Year"]} + + +class Climate(Dataset): + name = "climate" + _pd_read_kwds = {"convert_dates": ["DATE"]} + + +class Github(Dataset): + name = "github" + _pd_read_kwds = {"parse_dates": ["time"]} + + +class IowaElectricity(Dataset): + name = "iowa-electricity" + _pd_read_kwds = {"parse_dates": ["year"]} + + +class LARiots(Dataset): + name = "la-riots" + _pd_read_kwds = {"parse_dates": ["death_date"]} + + +class Miserables(Dataset): + name = "miserables" + _return_type = tuple + _additional_docs = """ + The miserables data contains two dataframes, ``nodes`` and ``links``, + both of which are returned from this function. 
+ """ + + def __call__(self, use_local=True, **kwargs): + __doc__ = super().__call__.__doc__ # noqa:F841 + dct = json.loads(self.raw(use_local=use_local).decode(), **kwargs) + nodes = pd.DataFrame.from_records(dct["nodes"], index="index") + links = pd.DataFrame.from_records(dct["links"]) + return nodes, links + + +class SeattleTemps(Dataset): + name = "seattle-temps" + _pd_read_kwds = {"parse_dates": ["date"]} + + +class SeattleWeather(Dataset): + name = "seattle-weather" + _pd_read_kwds = {"parse_dates": ["date"]} + + +class SFTemps(Dataset): + name = "sf-temps" + _pd_read_kwds = {"parse_dates": ["date"]} + + +class Sp500(Dataset): + name = "sp500" + _pd_read_kwds = {"parse_dates": ["date"]} + + +class UnemploymentAcrossIndustries(Dataset): + name = "unemployment-across-industries" + _pd_read_kwds = {"convert_dates": ["date"]} + + +class US_10M(Dataset): + name = "us-10m" + _return_type = dict + _additional_docs = """ + The us-10m dataset is a TopoJSON file, with a structure that is not + suitable for storage in a dataframe. For this reason, the loader returns + a simple Python dictionary. + """ + + def __call__(self, use_local=True, **kwargs): + __doc__ = super().__call__.__doc__ # noqa:F841 + return json.loads(self.raw(use_local=use_local).decode(), **kwargs) + + +class World_110M(Dataset): + name = "world-110m" + _return_type = dict + _additional_docs = """ + The world-100m dataset is a TopoJSON file, with a structure that is not + suitable for storage in a dataframe. For this reason, the loader returns + a simple Python dictionary. + """ + + def __call__(self, use_local=True, **kwargs): + __doc__ = super().__call__.__doc__ # noqa:F841 + return json.loads(self.raw(use_local=use_local).decode(), **kwargs) + + +class ZIPCodes(Dataset): + name = "zipcodes" + _pd_read_kwds = {"dtype": {"zip_code": "object"}} + + +class DataLoader: + """ + Load a dataset from a local file or remote URL. + + There are two ways to call this; for example to load the iris dataset, you + can call this object and pass the dataset name by string: + + >>> from vega_datasets import data + >>> df = data("iris") + + or you can call the associated named method: + + >>> df = data.iris() + + Optionally, additional parameters can be passed to either of these + + Optional parameters + ------------------- + return_raw : boolean + If True, then return the raw string or bytes. + If False (default), then return a pandas dataframe. + use_local : boolean + If True (default), then attempt to load the dataset locally. If + False or if the dataset is not available locally, then load the + data from an external URL. + **kwargs : + additional keyword arguments are passed to the pandas parsing function, + either ``read_csv()`` or ``read_json()`` depending on the data format. 
+ """ + + _datasets = {name.replace("-", "_"): name for name in Dataset.list_datasets()} + + def list_datasets(self): + return Dataset.list_datasets() + + def __call__(self, name, return_raw=False, use_local=True, **kwargs): + loader = getattr(self, name.replace("-", "_")) + if return_raw: + return loader.raw(use_local=use_local, **kwargs) + else: + return loader(use_local=use_local, **kwargs) + + def __getattr__(self, dataset_name): + if dataset_name in self._datasets: + return Dataset.init(self._datasets[dataset_name]) + else: + msg = f"No dataset named '{dataset_name}'" + raise AttributeError(msg) + + def __dir__(self): + return list(self._datasets.keys()) + + +class LocalDataLoader(DataLoader): + _datasets = {name.replace("-", "_"): name for name in Dataset.list_local_datasets()} + + def list_datasets(self): + return Dataset.list_local_datasets() + + def __getattr__(self, dataset_name): + if dataset_name in self._datasets: + return Dataset.init(self._datasets[dataset_name]) + elif dataset_name in DataLoader._datasets: + msg = ( + f"'{dataset_name}' dataset is not available locally. To " + f"download it, use ``vega_datasets.data.{dataset_name}()" + ) + raise ValueError(msg) + else: + msg = f"No dataset named '{dataset_name}'" + raise AttributeError(msg) From b30081e9de975bed60247c65b477012d68b4e132 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 4 Oct 2024 18:33:06 +0100 Subject: [PATCH 002/201] feat(DRAFT): Minimal reimplementation --- tools/vendor_datasets.py | 478 ++------------------------------------- 1 file changed, 17 insertions(+), 461 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 003e55062..4a435c253 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -1,13 +1,9 @@ from __future__ import annotations -import json -import pkgutil import sys -import textwrap -from functools import partial -from io import BytesIO +from functools import cached_property, partial from pathlib import Path -from typing import Any, Iterable, Literal, cast +from typing import Any, Literal, cast from urllib.request import urlopen if sys.version_info >= (3, 10): @@ -15,7 +11,6 @@ else: from typing_extensions import TypeAlias -import pandas as pd import polars as pl # This is the tag in http://github.com/vega/vega-datasets from @@ -29,247 +24,7 @@ ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv"] -def _load_dataset_info() -> dict[str, dict[str, Any]]: - """ - Loads dataset info from three package files. - - vega_datasets/datasets.json - vega_datasets/dataset_info.json - vega_datasets/local_datasets.json - - It returns a dictionary with dataset information. - """ - - def load_json(path: str) -> dict[str, Any]: - raw = pkgutil.get_data("vega_datasets", path) - if raw is None: - msg = f"Cannot locate package path vega_datasets:{path}" - raise ValueError(msg) - return json.loads(raw.decode()) - - info = load_json("datasets.json") - descriptions = load_json("dataset_info.json") - local_datasets = load_json("local_datasets.json") - - for name in info: - info[name]["is_local"] = name in local_datasets - for name in descriptions: - info[name].update(descriptions[name]) - - return info - - class Dataset: - """Class to load a particular dataset by name.""" - - _instance_doc = """Loader for the {name} dataset. 
- - {data_description} - - {bundle_info} - Dataset source: {url} - - Usage - ----- - - >>> from vega_datasets import data - >>> {methodname} = data.{methodname}() - >>> type({methodname}) - {return_type} - - Equivalently, you can use - - >>> {methodname} = data('{name}') - - To get the raw dataset rather than the dataframe, use - - >>> data_bytes = data.{methodname}.raw() - >>> type(data_bytes) - bytes - - To find the dataset url, use - - >>> data.{methodname}.url - '{url}' - {additional_docs} - Attributes - ---------- - filename : string - The filename in which the dataset is stored - url : string - The full URL of the dataset at http://vega.github.io - format : string - The format of the dataset: usually one of {{'csv', 'tsv', 'json'}} - pkg_filename : string - The path to the local dataset within the vega_datasets package - is_local : bool - True if the dataset is available locally in the package - filepath : string - If is_local is True, the local file path to the dataset. - - {reference_info} - """ - _additional_docs = "" - _reference_info = """ - For information on this dataset, see https://github.com/vega/vega-datasets/ - """ - base_url = "https://cdn.jsdelivr.net/npm/vega-datasets@" + SOURCE_TAG + "/data/" - _dataset_info = _load_dataset_info() - _pd_read_kwds: dict[str, Any] = {} - _return_type = pd.DataFrame - name: str - - @classmethod - def init(cls, name: str) -> Dataset: - """Return an instance of this class or an appropriate subclass.""" - clsdict = { - subcls.name: subcls - for subcls in cls.__subclasses__() - if hasattr(subcls, "name") - } - return clsdict.get(name, cls)(name) - - def __init__(self, name: str): - info = self._infodict(name) - self.name = name - self.methodname = name.replace("-", "_") - self.filename = info["filename"] - self.url = self.base_url + info["filename"] - self.format = info["format"] - self.pkg_filename = "_data/" + self.filename - self.is_local = info["is_local"] - self.description = info.get("description", None) - self.references = info.get("references", None) - self.__doc__ = self._make_docstring() - - @classmethod - def list_datasets(cls) -> list[str]: - """Return a list of names of available datasets.""" - return sorted(cls._dataset_info.keys()) - - @classmethod - def list_local_datasets(cls) -> list[str]: - return sorted( - name for name, info in cls._dataset_info.items() if info["is_local"] - ) - - @classmethod - def _infodict(cls, name: str) -> dict[str, str]: - """Load the info dictionary for the given name.""" - info = cls._dataset_info.get(name, None) - if info is None: - msg = ( - f"No such dataset {name} exists, " - "use list_datasets() to get a list " - "of available datasets." 
- ) - raise ValueError(msg) - return info - - def raw(self, use_local: bool = True) -> bytes: - """Load the raw dataset from remote URL or local file.""" - if use_local and self.is_local: - out = pkgutil.get_data("vega_datasets", self.pkg_filename) - if out is not None: - return out - msg = f"Cannot locate package path vega_datasets:{self.pkg_filename}" - raise ValueError(msg) - else: - return urlopen(self.url).read() - - def __call__(self, use_local: bool = True, **kwargs) -> pd.DataFrame: - """Load and parse the dataset from remote URL or local file.""" - datasource = BytesIO(self.raw(use_local=use_local)) - - kwds = self._pd_read_kwds.copy() - kwds.update(kwargs) - - if self.format == "json": - return pd.read_json(datasource, **kwds) - elif self.format == "csv": - return pd.read_csv(datasource, **kwds) - elif self.format == "tsv": - kwds.setdefault("sep", "\t") - return pd.read_csv(datasource, **kwds) - else: - msg = ( - f"Unrecognized file format: {self.format}. " - "Valid options are ['json', 'csv', 'tsv']." - ) - raise ValueError(msg) - - @property - def filepath(self) -> str: - if not self.is_local: - msg = "filepath is only valid for local datasets" - raise ValueError(msg) - else: - return str((Path(__file__).parent / "_data" / self.filename).resolve()) - - def _make_docstring(self) -> str: - info = self._infodict(self.name) - - # construct, indent, and line-wrap dataset description - description = info.get("description", "") - if not description: - description = ( - "This dataset is described at " "https://github.com/vega/vega-datasets/" - ) - wrapper = textwrap.TextWrapper( - width=70, initial_indent="", subsequent_indent=4 * " " - ) - description = "\n".join(wrapper.wrap(description)) - - # construct, indent, and join references - reflist: Iterable[str] = info.get("references", []) - reflist = (f".. [{i + 1}] " + ref for i, ref in enumerate(reflist)) - wrapper = textwrap.TextWrapper( - width=70, initial_indent=4 * " ", subsequent_indent=7 * " " - ) - reflist = ("\n".join(wrapper.wrap(ref)) for ref in reflist) - references: str = "\n\n".join(reflist) - if references.strip(): - references = "References\n ----------\n" + references - - # add information about bundling of data - if self.is_local: - bundle_info = ( - "This dataset is bundled with vega_datasets; " - "it can be loaded without web access." - ) - else: - bundle_info = ( - "This dataset is not bundled with vega_datasets; " - "it requires web access to load." - ) - - return self._instance_doc.format( - additional_docs=self._additional_docs, - data_description=description, - reference_info=references, - bundle_info=bundle_info, - return_type=self._return_type, - **self.__dict__, - ) - - -def getattr_to_df(name: str, /) -> pl.DataFrame: - """Subset of what `Dataset` does.""" - js_name = name.replace("_", "-") - file_name = DATASETS_JSON[js_name]["filename"] - suffix = Path(file_name).suffix - if suffix in {".csv", ".json", ".tsv"}: - extension = cast(ExtSupported, suffix) - else: - raise NotImplementedError(suffix, file_name) - - url = f"{BASE_URL}{file_name}" - with urlopen(url) as f: - content = ext_fn(extension)(f) - return content - - -class DSet: def __init__(self, name: str, /) -> None: self.name: str = name js_name = name.replace("_", "-") @@ -435,226 +190,27 @@ def ext_fn(ext: ExtSupported, /): } -class Stocks(Dataset): - name = "stocks" - _additional_docs = """ - For convenience, the stocks dataset supports pivoted output using the - optional `pivoted` keyword. 
If pivoted is set to True, each company's - price history will be returned in a separate column: - - >>> df = data.stocks() # not pivoted - >>> df.head(3) - symbol date price - 0 MSFT 2000-01-01 39.81 - 1 MSFT 2000-02-01 36.35 - 2 MSFT 2000-03-01 43.22 - - >>> df_pivoted = data.stocks(pivoted=True) - >>> df_pivoted.head() - symbol AAPL AMZN GOOG IBM MSFT - date - 2000-01-01 25.94 64.56 NaN 100.52 39.81 - 2000-02-01 28.66 68.87 NaN 92.11 36.35 - 2000-03-01 33.95 67.00 NaN 106.11 43.22 - """ - _pd_read_kwds = {"parse_dates": ["date"]} - - def __call__(self, pivoted=False, use_local=True, **kwargs): - """ - Load and parse the dataset from remote URL or local file. - - Parameters - ---------- - pivoted : boolean, default False - If True, then pivot data so that each stock is in its own column. - use_local : boolean - If True (default), then attempt to load the dataset locally. If - False or if the dataset is not available locally, then load the - data from an external URL. - **kwargs : - additional keyword arguments are passed to data parser (usually - pd.read_csv or pd.read_json, depending on the format of the data - source) - - Returns - ------- - data : DataFrame - parsed data - """ - __doc__ = super().__call__.__doc__ # noqa:F841 - data = super().__call__(use_local=use_local, **kwargs) - if pivoted: - data = data.pivot(index="date", columns="symbol", values="price") - return data - - -class Cars(Dataset): - name = "cars" - _pd_read_kwds = {"convert_dates": ["Year"]} - - -class Climate(Dataset): - name = "climate" - _pd_read_kwds = {"convert_dates": ["DATE"]} - - -class Github(Dataset): - name = "github" - _pd_read_kwds = {"parse_dates": ["time"]} - - -class IowaElectricity(Dataset): - name = "iowa-electricity" - _pd_read_kwds = {"parse_dates": ["year"]} - - -class LARiots(Dataset): - name = "la-riots" - _pd_read_kwds = {"parse_dates": ["death_date"]} - - -class Miserables(Dataset): - name = "miserables" - _return_type = tuple - _additional_docs = """ - The miserables data contains two dataframes, ``nodes`` and ``links``, - both of which are returned from this function. - """ - - def __call__(self, use_local=True, **kwargs): - __doc__ = super().__call__.__doc__ # noqa:F841 - dct = json.loads(self.raw(use_local=use_local).decode(), **kwargs) - nodes = pd.DataFrame.from_records(dct["nodes"], index="index") - links = pd.DataFrame.from_records(dct["links"]) - return nodes, links - - -class SeattleTemps(Dataset): - name = "seattle-temps" - _pd_read_kwds = {"parse_dates": ["date"]} - - -class SeattleWeather(Dataset): - name = "seattle-weather" - _pd_read_kwds = {"parse_dates": ["date"]} - - -class SFTemps(Dataset): - name = "sf-temps" - _pd_read_kwds = {"parse_dates": ["date"]} - - -class Sp500(Dataset): - name = "sp500" - _pd_read_kwds = {"parse_dates": ["date"]} - - -class UnemploymentAcrossIndustries(Dataset): - name = "unemployment-across-industries" - _pd_read_kwds = {"convert_dates": ["date"]} - - -class US_10M(Dataset): - name = "us-10m" - _return_type = dict - _additional_docs = """ - The us-10m dataset is a TopoJSON file, with a structure that is not - suitable for storage in a dataframe. For this reason, the loader returns - a simple Python dictionary. 
- """ - - def __call__(self, use_local=True, **kwargs): - __doc__ = super().__call__.__doc__ # noqa:F841 - return json.loads(self.raw(use_local=use_local).decode(), **kwargs) - - -class World_110M(Dataset): - name = "world-110m" - _return_type = dict - _additional_docs = """ - The world-100m dataset is a TopoJSON file, with a structure that is not - suitable for storage in a dataframe. For this reason, the loader returns - a simple Python dictionary. - """ - - def __call__(self, use_local=True, **kwargs): - __doc__ = super().__call__.__doc__ # noqa:F841 - return json.loads(self.raw(use_local=use_local).decode(), **kwargs) - - -class ZIPCodes(Dataset): - name = "zipcodes" - _pd_read_kwds = {"dtype": {"zip_code": "object"}} - - class DataLoader: - """ - Load a dataset from a local file or remote URL. + @cached_property + def _dataset_names(self) -> list[str]: + return sorted(DATASETS_JSON) - There are two ways to call this; for example to load the iris dataset, you - can call this object and pass the dataset name by string: + @cached_property + def _py_js_names(self) -> dict[str, str]: + return {name.replace("-", "_"): name for name in self._dataset_names} - >>> from vega_datasets import data - >>> df = data("iris") + def list_datasets(self) -> list[str]: + return list(self._py_js_names) - or you can call the associated named method: - - >>> df = data.iris() - - Optionally, additional parameters can be passed to either of these - - Optional parameters - ------------------- - return_raw : boolean - If True, then return the raw string or bytes. - If False (default), then return a pandas dataframe. - use_local : boolean - If True (default), then attempt to load the dataset locally. If - False or if the dataset is not available locally, then load the - data from an external URL. - **kwargs : - additional keyword arguments are passed to the pandas parsing function, - either ``read_csv()`` or ``read_json()`` depending on the data format. - """ - - _datasets = {name.replace("-", "_"): name for name in Dataset.list_datasets()} - - def list_datasets(self): - return Dataset.list_datasets() - - def __call__(self, name, return_raw=False, use_local=True, **kwargs): - loader = getattr(self, name.replace("-", "_")) - if return_raw: - return loader.raw(use_local=use_local, **kwargs) + def __getattr__(self, name: str) -> Dataset: + if name in self._py_js_names: + return Dataset(self._py_js_names[name]) else: - return loader(use_local=use_local, **kwargs) - - def __getattr__(self, dataset_name): - if dataset_name in self._datasets: - return Dataset.init(self._datasets[dataset_name]) - else: - msg = f"No dataset named '{dataset_name}'" + msg = f"No dataset named '{name}'" raise AttributeError(msg) - def __dir__(self): - return list(self._datasets.keys()) - + def __dir__(self) -> list[str]: + return self.list_datasets() -class LocalDataLoader(DataLoader): - _datasets = {name.replace("-", "_"): name for name in Dataset.list_local_datasets()} - def list_datasets(self): - return Dataset.list_local_datasets() - - def __getattr__(self, dataset_name): - if dataset_name in self._datasets: - return Dataset.init(self._datasets[dataset_name]) - elif dataset_name in DataLoader._datasets: - msg = ( - f"'{dataset_name}' dataset is not available locally. 
To " - f"download it, use ``vega_datasets.data.{dataset_name}()" - ) - raise ValueError(msg) - else: - msg = f"No dataset named '{dataset_name}'" - raise AttributeError(msg) +data = DataLoader() From 279586b17dc766382b7a06e5874983e704789bf9 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 4 Oct 2024 19:26:09 +0100 Subject: [PATCH 003/201] refactor: Make version accessible via `data.source_tag` - Allow quickly switching between version tags https://github.com/vega/altair/discussions/3150#discussioncomment-6719752 --- tools/vendor_datasets.py | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 4a435c253..a50297420 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -3,7 +3,7 @@ import sys from functools import cached_property, partial from pathlib import Path -from typing import Any, Literal, cast +from typing import Any, ClassVar, Literal, cast from urllib.request import urlopen if sys.version_info >= (3, 10): @@ -15,20 +15,25 @@ # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. -SOURCE_TAG = "v1.29.0" # 5 years ago -CURRENT_TAG = "v2.9.0" -USE_TAG = CURRENT_TAG +_OLD_SOURCE_TAG = "v1.29.0" # 5 years ago +_CURRENT_SOURCE_TAG = "v2.9.0" + + +def _py_to_js(s: str, /): + return s.replace("_", "-") + + +def _js_to_py(s: str, /): + return s.replace("-", "_") -BASE_URL = f"https://cdn.jsdelivr.net/npm/vega-datasets@{USE_TAG}/data/" ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv"] class Dataset: - def __init__(self, name: str, /) -> None: + def __init__(self, name: str, /, base_url: str) -> None: self.name: str = name - js_name = name.replace("_", "-") - file_name = DATASETS_JSON[js_name]["filename"] + file_name = DATASETS_JSON[_py_to_js(name)]["filename"] suffix = Path(file_name).suffix self.extension: ExtSupported if suffix in {".csv", ".json", ".tsv"}: @@ -36,7 +41,7 @@ def __init__(self, name: str, /) -> None: else: raise NotImplementedError(suffix, file_name) - self.url: str = f"{BASE_URL}{file_name}" + self.url: str = f"{base_url}{file_name}" def __call__(self, **kwds: Any) -> pl.DataFrame: with urlopen(self.url) as f: @@ -191,22 +196,29 @@ def ext_fn(ext: ExtSupported, /): class DataLoader: + source_tag: ClassVar[str] = "v2.9.0" + _base_url_fmt: str = "https://cdn.jsdelivr.net/npm/vega-datasets@{0}/data/" + + @property + def base_url(self) -> str: + return self._base_url_fmt.format(self.source_tag) + @cached_property def _dataset_names(self) -> list[str]: return sorted(DATASETS_JSON) @cached_property def _py_js_names(self) -> dict[str, str]: - return {name.replace("-", "_"): name for name in self._dataset_names} + return {_js_to_py(name): name for name in self._dataset_names} def list_datasets(self) -> list[str]: return list(self._py_js_names) def __getattr__(self, name: str) -> Dataset: if name in self._py_js_names: - return Dataset(self._py_js_names[name]) + return Dataset(self._py_js_names[name], self.base_url) else: - msg = f"No dataset named '{name}'" + msg = f"No dataset named {name!r}" raise AttributeError(msg) def __dir__(self) -> list[str]: From 32150ad6b4b1f79b05be988bcf359e172ea017bf Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 4 Oct 2024 19:47:09 +0100 Subject: [PATCH 004/201] refactor: `ext_fn` -> `Dataset.read_fn` --- tools/vendor_datasets.py | 45 
++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index a50297420..e79ad6010 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -3,9 +3,13 @@ import sys from functools import cached_property, partial from pathlib import Path -from typing import Any, ClassVar, Literal, cast +from typing import Any, Callable, ClassVar, Literal from urllib.request import urlopen +if sys.version_info >= (3, 13): + from typing import TypeIs +else: + from typing_extensions import TypeIs if sys.version_info >= (3, 10): from typing import TypeAlias else: @@ -18,6 +22,12 @@ _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago _CURRENT_SOURCE_TAG = "v2.9.0" +ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv"] + + +def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: + return suffix in {".csv", ".json", ".tsv"} + def _py_to_js(s: str, /): return s.replace("_", "-") @@ -27,17 +37,19 @@ def _js_to_py(s: str, /): return s.replace("-", "_") -ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv"] - - class Dataset: + read_fn: ClassVar[dict[ExtSupported, Callable[..., pl.DataFrame]]] = { + ".csv": pl.read_csv, + ".json": pl.read_json, + ".tsv": partial(pl.read_csv, separator="\t"), + } + def __init__(self, name: str, /, base_url: str) -> None: self.name: str = name file_name = DATASETS_JSON[_py_to_js(name)]["filename"] suffix = Path(file_name).suffix - self.extension: ExtSupported - if suffix in {".csv", ".json", ".tsv"}: - self.extension = cast(ExtSupported, suffix) + if is_ext_supported(suffix): + self.extension: ExtSupported = suffix else: raise NotImplementedError(suffix, file_name) @@ -45,7 +57,8 @@ def __init__(self, name: str, /, base_url: str) -> None: def __call__(self, **kwds: Any) -> pl.DataFrame: with urlopen(self.url) as f: - content = ext_fn(self.extension, **kwds)(f) + fn = self.read_fn[self.extension] + content = fn(f, **kwds) return content def __repr__(self) -> str: @@ -57,19 +70,7 @@ def __repr__(self) -> str: ) -def ext_fn(ext: ExtSupported, /): - """Very basic mapping to `polars` eager functions.""" - if ext == ".csv": - return pl.read_csv - elif ext == ".json": - return pl.read_json - elif ext == ".tsv": - return partial(pl.read_csv, separator="\t") - else: - raise - - -DATASET_NAMES_USED = [ +DATASET_NAMES_USED = ( "airports", "anscombe", "barley", @@ -110,7 +111,7 @@ def ext_fn(ext: ExtSupported, /): "windvectors", "world_110m", "zipcodes", -] +) DATASETS_JSON = { # "7zip": {"filename": "7zip.png", "format": "png"}, From f1d18a2d3baee9edbb9d17146c90b73a29d7905b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 4 Oct 2024 19:47:57 +0100 Subject: [PATCH 005/201] docs: Add trailing docs to long literals --- tools/vendor_datasets.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index e79ad6010..5b0f25fe8 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -112,6 +112,8 @@ def __repr__(self) -> str: "world_110m", "zipcodes", ) +"""Every name that is referenced in *at least* one example/test.""" + DATASETS_JSON = { # "7zip": {"filename": "7zip.png", "format": "png"}, @@ -194,6 +196,13 @@ def __repr__(self) -> str: "world-110m": {"filename": "world-110m.json", "format": "json"}, "zipcodes": {"filename": "zipcodes.csv", "format": "csv"}, } +"""Inlined `datasets.json`_. + +- Excluding images + +.. 
_datasets.json: + https://github.com/altair-viz/vega_datasets/blob/136e850447b49031f04baa137ce5c37a6678bbb1/vega_datasets/datasets.json +""" class DataLoader: From 4d3c5509f1e656adc08015f5456fe3f5671c7ecd Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 4 Oct 2024 19:51:24 +0100 Subject: [PATCH 006/201] docs: Add module-level doc --- tools/vendor_datasets.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 5b0f25fe8..08c3094e7 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -1,3 +1,10 @@ +""" +Adapted from `altair-viz/vega_datasets`_. + +.. _altair-viz/vega_datasets: + https://github.com/altair-viz/vega_datasets +""" + from __future__ import annotations import sys From 3a284a5ea97ebe0ef500c9911eaeddebe88ad741 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 7 Oct 2024 17:05:34 +0100 Subject: [PATCH 007/201] feat: Adds `.arrow` support To support [flights-200k.arrow](https://github.com/vega/vega-datasets/blob/f637f85f6a16f4b551b9e2eb669599cc21d77e69/data/flights-200k.arrow) --- tools/vendor_datasets.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 08c3094e7..26e1207c4 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -8,6 +8,7 @@ from __future__ import annotations import sys +import tempfile from functools import cached_property, partial from pathlib import Path from typing import Any, Callable, ClassVar, Literal @@ -29,11 +30,11 @@ _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago _CURRENT_SOURCE_TAG = "v2.9.0" -ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv"] +ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"] def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: - return suffix in {".csv", ".json", ".tsv"} + return suffix in {".csv", ".json", ".tsv", ".arrow"} def _py_to_js(s: str, /): @@ -49,6 +50,7 @@ class Dataset: ".csv": pl.read_csv, ".json": pl.read_json, ".tsv": partial(pl.read_csv, separator="\t"), + ".arrow": partial(pl.read_ipc, use_pyarrow=True), } def __init__(self, name: str, /, base_url: str) -> None: @@ -63,9 +65,10 @@ def __init__(self, name: str, /, base_url: str) -> None: self.url: str = f"{base_url}{file_name}" def __call__(self, **kwds: Any) -> pl.DataFrame: - with urlopen(self.url) as f: - fn = self.read_fn[self.extension] - content = fn(f, **kwds) + fn = self.read_fn[self.extension] + with tempfile.NamedTemporaryFile() as tmp, urlopen(self.url) as f: + tmp.write(f.read()) + content = fn(tmp, **kwds) return content def __repr__(self) -> str: From 22a50396822dc48d4ed63bae3c8837dc28dab6ad Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 7 Oct 2024 19:46:40 +0100 Subject: [PATCH 008/201] feat: Add support for caching metadata --- .../_vega_datasets_data/metadata-schema.json | 12 ++ tools/_vega_datasets_data/metadata.parquet | Bin 0 -> 9100 bytes tools/vendor_datasets.py | 121 +++++++++++++++++- 3 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 tools/_vega_datasets_data/metadata-schema.json create mode 100644 tools/_vega_datasets_data/metadata.parquet diff --git a/tools/_vega_datasets_data/metadata-schema.json b/tools/_vega_datasets_data/metadata-schema.json new file mode 100644 index 000000000..2b5b9d955 --- /dev/null +++ 
b/tools/_vega_datasets_data/metadata-schema.json @@ -0,0 +1,12 @@ +{ + "ext_supported": "bool", + "file_name": "str", + "name_collision": "bool", + "name_js": "str", + "name_py": "str", + "size": "int", + "suffix": "str", + "tag": "str", + "url_github": "str", + "url_npm": "str" +} \ No newline at end of file diff --git a/tools/_vega_datasets_data/metadata.parquet b/tools/_vega_datasets_data/metadata.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1ab0fb17143528da9cd460e84a0fb18a9f1d5b73 GIT binary patch literal 9100 zcmds7c|4SB`+o*87`tc8ls(4IkZnYaonv3JhMBP>%V0=J*=eyCB9e}>CKM+vgwlc{ zOS@2(N>SDp^?RN{r*qDG&Zpno`^WqH&6wxjuj{(6`+ME@bzct~qMb4bfD)j(JeZ9D znc=UXIv|ZeU=RTGk#1007Uj(X07j^gjX#F! zSh$Zjh(c#0(C+A998&vtG7yJkG+4_<=K-woKt=h7P^jKMG(`$+CkCkj6%|8*{Z_ep zB``(s186Z%oM_e~0JZ;lG_}Ayo4ww_DYyX1BrSmm3OWG>;-&{_JTZmI2@DQ^Ghhb@ z_GtLFo?Gv4ul1}qf}|dl^N~4szAnaIp(>G-WGq?xak=pQhw41+o5cb1TC>4Pi+ldn zm+PlC$Uc@#rY?$x`Mjg`Yv5 z6{4W49DY>iPJ+ih)$Fu^9r4FW`W3V@>M7QYlu2#cI?_oq6t@>j^+}pQ7jc|7>vM^T zOlgzn!6?V~mYf+t(mTE4-ugR;v99YQu|HMp|4H88^*2?cf?@mfWEU z@>u^yafQJrpJg|4NR-T+Bnv7>N0i%bX;#9VKhS*Y>N8i%w@=vmsEr((?uR|QR5O9S zuGHkN$akkNd7d(wDi)Mn$5-8AU)8RuX_P&+6+R z=;Iko3i4UO`#2h%$&Twnz>xo2R6|C912zA_%=q%ZnHdr`0)Uax3i_?u87IunlR^y) z^r!h#gI5@PphIwj9DlU5VQz2%`A17bu2~v}3AVB|3&S&}@c_BvBndrQSUW!LY~#uP z9Q{Ts?qBvOc(U|q<+sf40`X;~`IT==LRp*uSHK>4G~Bv={e8{C>2}}QY|G)a>4_%J z;=r8c$Ee)dvoW>`RSqQPY5QZ(m@>L|q~}g84iG-*y`gaH^LPtyZDuEgQVsa*pFnN&&60_DkTv3H z7G*uFH7+@t{>p8iO@6>!v<-G6CGFBJukJQv`Z0lE^c>S(^Ln6j6)F%Ibu5f&}zHhqK&$Uy;CJiFAd)E|4w`^5ipU~!Gg zh9JkIRbpE>9&m`sJ*@}sOH>d%$VgLVL#d^fe=wu#W zzWa;&;hxFwt~c39&`sQ?gST% z#lOG}f^cj8lQl0thFevP7| z(hhjk(~8C3SBFvA_b*Xt_#G;9AXLgUomfytiXm0F!Qa54n!%uc!=g-{QC+=U{Q8m3_%9gOr2s-i_R2cNGxXdhFJ2op8J7kD?!_A;nk* zD94-hN>tvN8p>v~j-60BVEV=Ca5mzG!=Zqz5%UKEPYYk9U3e&u+uzqA-dyIb4vP46txCpA^= zEv{}WiRnf+yZ3ry7?c?hCB2TH~2sn)%_?heQv6j8zF zTi~1xTloE4m7{L@yMJX-Cltp8Tv21hax;}N15#oomAp+$>xzwm-`G<3BWR{T zTq5EYGjP-_iG!nv_lY4OopRjfc{azsIAMsvV~Gt98X06G9r$lCO8|(}0Nx@opj8sV zCkhzY#V{KPB0jAHZkcl;g@9DTM%EEc1SHE|zyV*yaK;_C9s9b?J_Xz%#2_K29&h1G6M|fQh$op0rv0cDdh33gUdT;)L>>fLkbI{M$((Y zX*;3)!V2`FFAVpZx5%UjHDH)mI`oO$f+8Lg5^DdWkjyJhnALPmOnBfbY#=B4n2wQVR14q0*Fg`uLIP4+U>hIEm&HPE&&V zD3JrHWSWw%uLeoOM_q$JQC0U<@mBUBYpPKw1Qj(EWio+)_f;lokklv|B+O6nz~eRW z1T_L#1EPi}sA{N@NhCEqM4h0iqKYT^Xe#4X)O|EmR#oIBB&eneL^Z`?gkoiKI5*+< zO)XX{@3-D5;%lmJcOiQ!C>x`OsYm1>c0u3qOJdjC&H4ni@IHS1T|%3mTCvMLmoPAl zk!Gu5pW1tpR2W5Ryfds9aed%;Ubc`x+oTYSQQ0^un5RCb<7}_tP2tzpPv>%?i8`B= z$TjO3X9+hNxeHcW|D)%X@FI3Ly+&8ljicYW{Y_n%! 
zt>@ihW!oLIdwOrY2Eh*PgfZKC>fA<=!vPQOeX!FGOiqi$Xb5{O4WIGPVdi5+5SsJa z64O$DapI9X|9U4cj}tE+`y-(j%1Z>p-rcvw#PThS+=1#saM$R zH=l}V%jZdU?oPq9mN2?XU|Y^`iej-xhS>|h zD9c$>dzC7Ig&g-Sd4@rs<;>nDM?r|GwE3OMWJS}|VSO$~d z`Y%f^#$>f$=Q165WCM)M`I1Ak4Ju6TRFMxhxXCpQw!h48Rl7OnA&m2wyW;VA_akro z`@lA-ea+!!@;q!$AGAnSjT!IeOyb!<8tT}yD4kz&kDA)&?XKYC%iGe{cA}*4X-8eo z&`p*AA9mih?u3d&DPNtC#l%7+{}F$uelj5?xXb9x^5ssO0V|`7Glw2J%2n4XepH+G z8mbKH5M|3~{-o)+uIXrK7x{!`!}`ZfLzl-yhUV_x>$_r23>SRN?s7P{eN#pWV|(vM z9M#!^!&pX`^Ou7npXPcVbLz%_>9jDY7Aigq}T?m@)5(+is#5Jy0>3X z&kIHOWnMp%vq&(PxX|Gz_3V`!!y<@2mSoRH{<2I(qcIN=zux`of!@&WCHvuN-k9oX zFS(0Orz!ZRX0cGWr#pC@uDs86_vszkJZku?|E;B`&TTPy8Mc%Jx%?O735yu5L%FPq z<~wX2B;OvtJ9*wilJ#O8!r;;i66L;I%xT?7k(c<;Wl`OMlZ-eap4-^UM|HXCzY_%`Vh-%zIBxC@%}2iHf=sK09lRpAYpXa_9oHn+n+Lu>_ij7<6wDlNVdy8RxvOl>$MM_po!uLr(?_#G{=)rpP8;eB& zc*j=9>d8KKi!|~toW|Ii0DSM)n!=&&9?l!PUd6DnFC;x!$KGn-J5}kZ`ob-s`1$L% zyl>?0Jl~%BIWCNZo7e9gmT0)|iF)>Br20mSY^9kIj-^}m`QuzY>>*Axd0$AMwzl8G zMXkI^_Nwl*GUp3)=)7>OPh!rfGaPwAgH8vnbFU&y<7 z;Q$V?|IxcJSo1C@-ocQQ=Is;czcWlR*e6^mI3(zgj)mTe&!Yr88cJ({26{+O_unM) z=SZLmzk&e3JP{;`4}Em?s;EedhNa}KvHWT|4NEicF@uUNzb>5T%m4Chqqul{f{icW z3wQz?ogW{xROC7q%ozS^=e)en<3sdd=b?p9I~u&R)wGU;ub1<4UP$k#xV`hh$m68_ z?rLAUYe2cX7dvG36D7p`ISb56hl|rQ|NK;q`s}@NLux%%l&9Pi%HwMVwSnImowGIk82_*i znth7E{#w@r3S5xO=W@1vmc8USb;r8sO#bsM$LeOkJ{jiF@qxY(H8YgkY}5`z+q6zp zSF6?i+^$fq#S6t8i;AB!tN31Or=EJ!;u1Q!`-z2%M<-8pQD#%S5L>>}DMVIP2dyrQ zO(hc@5WFv^*R??#mss&#;Lu)M^`NWx*N)b2p+giLI+S}8x$Umf!5E@=7Qd!7b}*x) zpH&Bkll*jJF4gGNQ?5rdcii`?YpCL5%0~?KGs(&3{5by-LWl%6D=!!+ZsS})L1GdC zv8AsX4@DBTZ6EFHxrAp5TXxv1A!RQ%lhP=Un9hwjI+)TctMlnI;giU+o+8DYaipeta9}m z`o_brZCW#Wb+me0~@5|pg8sAs1Z33%x(2F&!AhsH}QxH<UgQpn* z+;NX@vtKilm8t(1o|WiQSnzs3@FW9(F3n?r3>k++7dY_4TOkHTw6n8z(xOqe`vj3N zH(8Y6kEW^tkw`RxenickPl@p5^gsm68%>DQK3|y z0vAo-YAd2Xk!WBGbtgK~Y1l$#Lpm{k`iM39M4|>vZ#SJDotO#SPBe0a9=1e%Gnf#R z8o~Ub54|6eXcGks4bg*|;40ItyUE6c2#N{*r|B8MQmoJmCOO+ty`7->Skw8@UTe62cyq14`{y^iG6`gj*7%1_HjXKiYw>>A`6;sa5o z>o1Bf5B*^RvnCqB>iP%$IY5#T=slrc(8mv6El5s3Sl`h6twMZ);3fDEt&fS?cCyn} zUy5nK&(9xT?^SxFV9URw2dm*9=Kni>3}LOU(sNNE(8wl%Q6wiq1hjrMc>OG(=2h^y zz*(Xlq%e3<{UCk8sqG3R4C#$Tq7vOcAl5{pIb7u$6i9R3N{DcAQVxX6e}nfQmls|y wldY=W#)K$uXZ=X>CUd;2lPwXzFxyZANntc>HRMzL!yj-OdfRUU{U6T%06c=}+W-In literal 0 HcmV?d00001 diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 26e1207c4..871ac14af 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -7,11 +7,12 @@ from __future__ import annotations +import json import sys import tempfile from functools import cached_property, partial from pathlib import Path -from typing import Any, Callable, ClassVar, Literal +from typing import Any, Callable, ClassVar, Literal, TypedDict from urllib.request import urlopen if sys.version_info >= (3, 13): @@ -25,12 +26,130 @@ import polars as pl + +class GitHubTree(TypedDict): + path: str + mode: str + type: str + sha: str + size: int + url: str + + +class GitHubTreeResponse(TypedDict): + sha: str + url: str + tree: list[GitHubTree] + truncated: bool + + +class GitHubBlobResponse(TypedDict): + content: str + sha: str + node_id: str + size: int | None + encoding: str + url: str + + +class ParsedTree(TypedDict): + file_name: str + name_js: str + name_py: str + suffix: str + size: int + url: str + ext_supported: bool + + +class ParsedTreeResponse(TypedDict): + tag: str + url: str + tree: list[ParsedTree] + + +_GITHUB_TREE_BASE_URL = "https://api.github.com/repos/vega/vega-datasets/git/trees/" +_NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" +_SUB_DIR = "data" + + +def request_trees(tag: str, /) -> 
GitHubTreeResponse: + with urlopen(f"{_GITHUB_TREE_BASE_URL}{tag}") as response: + content: GitHubTreeResponse = json.load(response) + query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) + if data_url := next(query, None): + with urlopen(data_url) as response: + data_dir: GitHubTreeResponse = json.load(response) + return data_dir + else: + raise FileNotFoundError + + +def parse_github_tree(tree: GitHubTree, /) -> ParsedTree: + path = Path(tree["path"]) + return ParsedTree( + file_name=path.name, + name_js=path.stem, + name_py=_js_to_py(path.stem), + suffix=path.suffix, + size=tree["size"], + url=tree["url"], + ext_supported=is_ext_supported(path.suffix), + ) + + +def parse_github_tree_response( + tree: GitHubTreeResponse, /, tag: str +) -> ParsedTreeResponse: + return ParsedTreeResponse( + tag=tag, url=tree["url"], tree=[parse_github_tree(t) for t in tree["tree"]] + ) + + +def request_trees_to_df(tag: str, /) -> pl.DataFrame: + response = request_trees(tag) + parsed = parse_github_tree_response(response, tag=tag) + df = ( + pl.DataFrame(parsed["tree"]) + .lazy() + .rename({"url": "url_github"}) + .with_columns(name_collision=pl.col("name_py").is_duplicated(), tag=pl.lit(tag)) + .with_columns( + url_npm=pl.concat_str( + pl.lit(_NPM_BASE_URL), + pl.col("tag"), + pl.lit(f"/{_SUB_DIR}/"), + pl.col("file_name"), + ) + ) + .collect() + ) + return df.select(*sorted(df.columns)) + + +def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> None: + metadata = request_trees_to_df(tag) + if not fp.exists(): + fp.touch() + metadata.write_parquet(fp, compression="zstd", compression_level=17) + if write_schema: + schema = {name: tp.__name__ for name, tp in metadata.schema.to_python().items()} + fp_schema = fp.with_name(f"{fp.stem}-schema.json") + if not fp_schema.exists(): + fp_schema.touch() + with fp_schema.open("w") as f: + json.dump(schema, f, indent=2) + + # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. 
_OLD_SOURCE_TAG = "v1.29.0" # 5 years ago _CURRENT_SOURCE_TAG = "v2.9.0" ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"] +""" +- `'flights-200k.(arrow|json)'` key collison using stem +""" def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: From a618ffc6450922f602391b5511edda37b2fe325c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 7 Oct 2024 21:49:43 +0100 Subject: [PATCH 009/201] feat: Support env var `VEGA_GITHUB_TOKEN` Not required for these requests, but may be helpful to avoid limits --- tools/vendor_datasets.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 871ac14af..259999fa0 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -8,12 +8,13 @@ from __future__ import annotations import json +import os import sys import tempfile from functools import cached_property, partial from pathlib import Path from typing import Any, Callable, ClassVar, Literal, TypedDict -from urllib.request import urlopen +from urllib.request import Request, urlopen if sys.version_info >= (3, 13): from typing import TypeIs @@ -73,8 +74,15 @@ class ParsedTreeResponse(TypedDict): _SUB_DIR = "data" +def request_github(url: str, /) -> Request: + headers = {} + if tok := os.environ.get("VEGA_GITHUB_TOKEN"): + headers["Authorization"] = tok + return Request(url, headers=headers) + + def request_trees(tag: str, /) -> GitHubTreeResponse: - with urlopen(f"{_GITHUB_TREE_BASE_URL}{tag}") as response: + with urlopen(request_github(f"{_GITHUB_TREE_BASE_URL}{tag}")) as response: content: GitHubTreeResponse = json.load(response) query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) if data_url := next(query, None): From 17923404866003e27a510be793ab65c290d8802a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 7 Oct 2024 21:51:45 +0100 Subject: [PATCH 010/201] feat: Add support for multi-version metadata As an example, for comparing against the most recent I've added the 5 most recent --- .../metadata_v2.5.4-v2.9.0.parquet | Bin 0 -> 11354 bytes tools/vendor_datasets.py | 11 +++++++++++ 2 files changed, 11 insertions(+) create mode 100644 tools/_vega_datasets_data/metadata_v2.5.4-v2.9.0.parquet diff --git a/tools/_vega_datasets_data/metadata_v2.5.4-v2.9.0.parquet b/tools/_vega_datasets_data/metadata_v2.5.4-v2.9.0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5626093db560b805b33261bdc5f6b7754ab3451d GIT binary patch literal 11354 zcmeHtc|25Y*#8;p*w+~|_H`0s>`S(>FJl@)!h^xs85ts>$et}(s;3enl|2N!-mSkyBTJWA3R8PppmljUo(yIm2w_ zV1x892FjnH8XygUARqvENMBZ15tV0y0FP3$Bcz#Ltq<^af_m3 z;{_(ioFzo7>L7@|Kf4OCajFkb|7k);BV};VUc_FQ-#vBZuwh4~iH8la3%~=&D863x z(p#7KeNDAS`@#rnfw`Y)@|J22?A&%SlyJy|zWVcu_S;Y8`N+AC54DCQ{X<1(RP5z< z?`6rI<8HaSowMP>viddqH0@a#oU$l?dZL72&GkGchwWnIq-}s&mBkmkT5Q=k&7-$G zhDk#c+S4LdLPHJ}sA(iBN9&Ovd07&j*rRL-6+U5j>xDsIu)3o>Ez;8PZIWAw%M1$9{Urs*Rvqx^Wy#=Mlob)*F^35m#o*otx$2xCrREkC(j`zheUYhor zfVB5jN+kwIbce6STU!l!;IbiYA}zN=q{ZvZdZUh26}3Lc0Jlsc^B<6M+lE9?ViGCv-1S#4iASE5^#X^t4TpML4_l4mvSrDgBzrsMP!7J%{fm zwMW0|tw{c?9_A9`(ZjIsZOPZy>C;~j9~hfCKs4mh!#>ke@xZx>2i8d6U=NZfku2{) z-iv^$gGu?|An(noYYCiU1kSQ=V1)6Hg9qL8r4j7O@$St&8p2u0o(AU~yS0|RD^xW_olJIm>Nl{S* zCvzgLc%cKDKZ8+H{5W1Fj|Ejc6s&ccLt2vmh(#%nOo?NSVd!vED+2Q8p1g@{ z5cL=)`H4s+VDE;lqb7-kOBgH$-bnZbsbJa;E52_XRg2cos*ofF(?+6@!jvR)2ea- z_U^7?=sm+3Pg09yYXeWc>arSuCbo)txII}Db#y3$E%eNm(AxrzfDLe%6?R+qk1;pa 
zG=Kh{kAzF+o$$r>kMC@gWVaL7)Z$)#opr_DWwLT;{?b2r^_EBRiHV~tm0A+82fCB7 z_nyD_vl`j&e~n?ayZl`q5$QH0RwKuv-)$iEQNc#Een2JYw1fR!#?_PQxKrb4UEW=* z@u|~l8W52g#d5Av%{4b!2sC)luu;!!;UWHqlkSh??%qBVc~6-`RGW*1=B<%%il@z@ znQ+$40Z++WiUUQ=&$=5HO`Zn86$Ehthn7QPJZcYUirvU5=o-7(nb|$tZkm1(IkAH; z>XPbVx$`XWi%sctbgd5vy*l7OgOs_&Ev#e7++rs(bqst-? z?i9&545y%7oxt`3^|AOhj_BQa3>`OeAlB0QCc%&E*+frvoAtB$r*i9?A9<@{OF!zf z;uR&gqx;xQpw+Qgoaqdc-R#?g6d@PQyU5x`gXyrF(A|GJsnnFi{3G>Z5}>0MvNpGk zN2ctzG{23P(2NOs&+C)G&@bW9AikvCWDG5<%om~+u6ne+`6l`Lse&VEfWSy_VD-4K z|InTLx$Ha(H)?L}he7+(Zi_DM-rtRjFOQXWF5BH6d&uY}4bQrB`0$f+fNnYm1RcdO ze0K`zj6nRaoWgITngUCczdMEBMl~;cim}Asox*oiGxBn$n@J5Y<=Rv@Y?R`lln5Q(?{)O2BjGU@Dx3ZVgBY47^#@_Bd>$82o7&& zLyhoI!1y?SNVD6kCwCs@+2!;;Ibfr|pm3JNT7f)rThfv{v)hvnf7SSlfY zgeAjIu&kxPl0{IPa~48{rNN?^AO)7K7R9DMw0*R7WdN#WPN`3Q+ns>%lP*; zF|Xp*P}l^I?cB$A$OZ|;z!## z^RFC9M&hwhSm6ckhGqH!6W}UyS~7%e2E@igIvIdt z<_TF|opl@6JDw6nK1vMkd^rx068}IMl281}OY|Mut5PCog`f?c((rjfGBt1kGI7j)X-fh*&56dB{-Y~aX%<> z*|FwcA(VSw_mbJ&B)Ot#esS8Sboc1cJE!{(KjJZI5xA)kI*c1{F&Q9Ps+5dG-d=cJ z2V*?^c56d)N1D@2^}DF4>Cc?eO}09O`%KKhxO*Bs_-gE(U+6c-Oti9QFz6Jo;pw+b55Fi@@$%fEmL1?oebl+5^4JCN5>$wbpzlWeI zbt!uOO(3Z0`)F9a;9tB%q;Q)$z<)=d9~rDJCDd1D>>{}+C%d>7AwD2!?#p2NkVU!* zk_Y|?vsmkqc_spMuQZ1oyvBV8;$2RmmN= zs{^;YG&(u^L{j5sY4#WRAkK{(t5DX!|FP%tREcLt0y?q-(|2C;kW>8Yf;avuT(upa zcdAEAx#I8R(jIH1wfBByX^_ZU38uQvWYADyZ!EV(WAHR_s?(s&rqUSd(S=O3(y}iGI6Q=DEgGMy6&6XmfKjgoX|p&^5cTBosA})mD-FQPV=QzTJDv=qo3hk1+V3 z2|JnHS64`IIARD=e1FXaF3{p8^XXasOO;YL*~VR^9_WVomDFl|j^u&NKS-EeB3W9r zH*as--Sh6r zrZZ$yS z$~-0I6juVF7vB>!@z{d&;?U(e^kS8p0-17ZrrpFQ8P&U6$X-1BRww|c_uv#o*FZE7x=G=1GX|;1cJ(qE=u2be_#RXZ2P;veje)T%m zHm`1n$W7~gE`CPJA|AOwO5yCqX5%y}@-uCf92|zU8qg02I|9rPsq(U!oAlHNW?T#3 zIR(=q4tUl2@sA`u=9lCt5H&&-J@&cJ^?+Sfm0MeL%IbW+qCvi%$LmBLI%|9(@{)5( zD%@iDjVY%GtG)g4%q?cCIxx@LCA_Nb`S~|rkvlCoEN>E(ur&wpC7dOxk)msISI$pZ z*sVC&>dLn6xxT#bO8!{T3w-l|;YX|m9XqixxGNBI-`q37 zePB0%4&6yElvD~k4*Rmgbj|^8)9>%SFCs&qlv9vgIUlrTq^vsWI*NwM%&cwd) z?az%yAIw|1>U4?8ZDme6BzyeDyF@tU@a7ig|r? 
z`;zDMl;VnTMP%fSNH5s&=XF9O5eJ;GXYqqI>ZS(L*N@kunuzt{wdV!^) z@RVI;Ke4I`nm?uNde=}#O(!?=UT%%KS4-H;C;UmiukTk4OHFo?c9vb-BR07s_ja#g z>pX(#X>#I&q%AUHl3q_AaDTk(wkNgLqT}N!f3#h*UHS9?3pU-|lHCMZ>5uMPt2{a4 zmHN+K;)(CerBC!q4%J3wm z*?`l>8|oA0F7H7KR0tsklgJ8tiQWY2X`3e@ zlt3nik`=%@3hn_Ucd~+)mpVb6sHU#zp`zxc*HHEF&{R@YQgqkUM0+U`)CsB{ z>IB5Ucm^7+j@DGwbXNzZp*2<1Row{$RWvAFQ$tAwO(1G0qLtK$>Pj07;{R!4yBYIj z4hsozT_4JIqVy^e0)HWaio;Nt(!Ust|5*bL&gYyGTG(qCm0ALu5pl!K3GLw-1d?%2 zVu0`75cwcts6tS1;9uvm{yGa)pv-NB%~%;VkgoTCCIqCg>q35xzgzeaWr5P!D7k}H zTFPizW%MY|TAum&&YUY|miv#^5g!)$qP86bwDu`y^+MygeHF6-;7BMC1_S`ulBLqw z`)>$w72_k`tL5p}wp#4c@MpQcz-8}JxJAZmWqiWFvy42cQYI00i2Z4A@xCVy3PR+R zEV0$9i#G}sF6w_haaG)3sgt3w5X-U%tM$%w&%*J{ym^r;P%t<{tHU2N?3oRW9*{g@ z#tYDyNOGI&;{z5L=9BgpJ%W*YeKQm8k(|~_aWGon)Gafz9Vho@@f3SU^c%>8P1yBq zFFg_x9Z}LD$~txI-7|I`2A+U)qeZtuql0zZ{n5vHnXv5A%UpHyw|gB#2G3>Mc+b;3 ze-!`zWzj*x8;MA7X2=`dEbLJd>pYLlg!bOE&c)u$MGsb#69n!JH+GcKHp(cp4mUSD zN9B!V_IxI*zjS!wn*ZkdO2qMNO(D7074AQ!C&fFslC8aC3bj`7IUd>ShR@EIlxa@v z4Q3}N$e!O{vCy`+Y{D38DoskzvYr_F;9YS`nm)bR(ZAyI;J1^K^4|`|6SN8Eo;V$W zENPGyNLo7A^St?EZBOhb>u7%X;;{4c>q6uB2i18@F|?40mW!eRB&^QG9N&+UFP8&X z#&x;e9rX;o__YynQDsFHvB_!4W|(ZrZV0!EfjB2mx&XIZFo90&3@1&pTf9C7${oiE zo@f#P!m`kZ+;;gpI&nKg4rL#<%@EF3X&#++5jvR|;ofNaRZlWr+3*pjms>R_u(;#I z?UdjP1bJ7%a2X+jmbq5cr!#L}DG5h>l;3^ij+)dXiDa`C3wJSDAC;^b7u5T=uAbN3 z1l}39SU)^I_F61%FDh2B+OLMoszcU-10KY48ex3QO$T>)ckZao>+Y(ff832>TEg%3 zdo8jt-+3>fb!uA?%3AzoqYa>0|E=?id9uZrkhA?1oMG$dV*u9_C77qmQ=mxrhMqv7CB~ zHvC9I4vw$Va-?Y|-N?$C%Q=;2R{i^sK7y+GqnCLc^lpkT6}ca-Qx|o8;JJq4>lP-U5@_>Z!i=|`j`#txKJ~=K$ zDl*ge?29*bH@mWc0)`m9dE5kJ|D$NV zp~47(msK5^7WMuyb6;f$tH(v|;4*oXjwEC6-bzX|XjPB2=|01xR@QhuYUQ!v4DY+n zv*f$p`~{JPjQ7_pC#Shc7L^?iS*!Uk%7;45lG@2>`CeiQPmOh1Tvgs--d+A=rSnm& zXZ|?5l>L_0$06E)3JNl~o$Lr3GOU#Tt&pZ>v>ymEJ(D>jMF%$USd3Ng+UR`|`J?CieEiE!={-L}J&z7Wh45b@;Nca@or%za)#5nT|Hk#!&YpwCo zwDeDB=2A|z$k5MJA3G>fLd?qTX)R#qezR@(Mt9szy;mQDm}I|>?(WOkR`^tsOEKgC ziRbSAiVt~qcHNKqM#7yck%^`PZHBb)*jgQe_UB@^p-)H+t87X2w@kG*c==lTeuM4w zi9JGeX@>4+(;o8C?BKI$h85!wvk*NAn%SdsPUn$GYftZiygB3W`)*uG{HW@s;}3@< zgvuI^BJtLA7hf(uXJ(V=&Na~!pJ)q9xh1}rIS$9q#x^85jop$_dAR-4LW=bv-m64g z@uB z%un`Om-wH2F908;wl9yTs&iKCyE>c3U2po!-A(vBRY(sgL=8aoDEutsw{~OTiRqM{ z#3xXN)M*O-m*VU2(^M&qpcGvIbx-HJD!&_F+ z?Z~9(cL6`ltaYpY-}ycTpQIXK3^YI~Er41)2twq4&1wo?r7EA83V{{T0Ptg(^dQnv zQ267$;0_v65W$MEvDvXpi|pY;3?v}#Fe*?E4HZKS24f8VFdBQobc8kK7EDvKEHM}| z2CM*HtueBAusl4RQf`jHcyH7rfmZ_$O45{)jl&pVFoxD(b&MTVhBcTrq6+h+w%C-9 z!KhQ@?W4-03NxjYV~p*NpWi=^s9J(*G-jWMQ}>lgO31vS&8)&y&T zhc{)kKs~)F_6GZp3nm6qhTy+xd`wk++;`b~d6@bC`~FkLdqW-}$m*ZuQOxj*{(ql8 zMii@U$lI-?Np?34h$QUN3MgX#_1i*mJIpAtsYLJUTM ziVyJ77>ora%P}y3>}anUwtJUi044no^ZobrrHq%Uy^5!aW~8TsLAblJ1=?|!H3mR1 Z;7I|55HhkHBptu_2RfhtfCs-S{tx(skB9&O literal 0 HcmV?d00001 diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 259999fa0..61c701e1e 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -135,6 +135,17 @@ def request_trees_to_df(tag: str, /) -> pl.DataFrame: return df.select(*sorted(df.columns)) +def request_trees_to_df_batched(*tags: str, delay: int = 5) -> pl.DataFrame: + import random + import time + + dfs: list[pl.DataFrame] = [] + for tag in tags: + time.sleep(delay + random.triangular()) + dfs.append(request_trees_to_df(tag)) + return pl.concat(dfs) + + def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> None: metadata = request_trees_to_df(tag) if not fp.exists(): From fa2c9e73c1e09e9721a2e095e4715e1dfac9939c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 8 Oct 2024 21:58:33 +0100 
Subject: [PATCH 011/201] refactor: Renaming, docs, reorganize --- tools/vendor_datasets.py | 146 ++++++++++++++++++++++++++++++--------- 1 file changed, 113 insertions(+), 33 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 61c701e1e..296a5f590 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -9,26 +9,51 @@ import json import os -import sys import tempfile +import warnings from functools import cached_property, partial from pathlib import Path -from typing import Any, Callable, ClassVar, Literal, TypedDict +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypedDict, TypeVar from urllib.request import Request, urlopen -if sys.version_info >= (3, 13): - from typing import TypeIs -else: - from typing_extensions import TypeIs -if sys.version_info >= (3, 10): - from typing import TypeAlias -else: - from typing_extensions import TypeAlias - import polars as pl +if TYPE_CHECKING: + import sys + + if sys.version_info >= (3, 13): + from typing import TypeIs + else: + from typing_extensions import TypeIs + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + from tools.schemapi.utils import OneOrSeq + + _T = TypeVar("_T") + _Guard: TypeAlias = Callable[[Any], TypeIs[_T]] + +_GITHUB_URL = "https://api.github.com/" +_GITHUB_VEGA_DATASETS_URL = f"{_GITHUB_URL}repos/vega/vega-datasets/" +_GITHUB_TREES_URL = f"{_GITHUB_VEGA_DATASETS_URL}git/trees/" +_NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" +_SUB_DIR = "data" + +def _is_str(obj: Any) -> TypeIs[str]: + return isinstance(obj, str) + + + class GitHubTree(TypedDict): + """ + A single file's metadata within the response of `Get a tree`_. + + .. _Get a tree: + https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree + """ + path: str mode: str type: str @@ -37,7 +62,16 @@ class GitHubTree(TypedDict): url: str -class GitHubTreeResponse(TypedDict): +class GitHubTreesResponse(TypedDict): + """ + Response from `Get a tree`_. + + Describes directory metadata, with files stored in ``"tree"``. + + .. _Get a tree: + https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree + """ + sha: str url: str tree: list[GitHubTree] @@ -45,6 +79,15 @@ class GitHubTreeResponse(TypedDict): class GitHubBlobResponse(TypedDict): + """ + Response from `Get a blob`_. + + Obtained by following ``GitHubTree["url"]``. + + .. _Get a blob: + https://docs.github.com/en/rest/git/blobs?apiVersion=2022-11-28#get-a-blob + """ + content: str sha: str node_id: str @@ -63,37 +106,55 @@ class ParsedTree(TypedDict): ext_supported: bool -class ParsedTreeResponse(TypedDict): +class ParsedTreesResponse(TypedDict): tag: str url: str tree: list[ParsedTree] -_GITHUB_TREE_BASE_URL = "https://api.github.com/repos/vega/vega-datasets/git/trees/" -_NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" -_SUB_DIR = "data" +def _request_github(url: str, /, *, raw: bool = False) -> Request: + """ + Wrap a request url with a `personal access token`_ - if set as an env var. + By default the endpoint returns json, specify raw to get blob data. + See `Media types`_. -def request_github(url: str, /) -> Request: + .. _personal access token: + https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens + .. 
_Media types: + https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types + """ headers = {} if tok := os.environ.get("VEGA_GITHUB_TOKEN"): headers["Authorization"] = tok + if raw: + headers["Accept"] = "application/vnd.github.raw+json" return Request(url, headers=headers) -def request_trees(tag: str, /) -> GitHubTreeResponse: - with urlopen(request_github(f"{_GITHUB_TREE_BASE_URL}{tag}")) as response: - content: GitHubTreeResponse = json.load(response) +def _request_trees(tag: str | Any, /) -> GitHubTreesResponse: + """ + For a given ``tag``, perform 2x requests to get directory metadata. + + Returns response unchanged - but with annotations. + """ + if _is_str(tag): + url = tag if tag.startswith(_GITHUB_TREES_URL) else f"{_GITHUB_TREES_URL}{tag}" + else: + url = tag["trees_url"] + with urlopen(_request_github(url)) as response: + content: GitHubTreesResponse = json.load(response) query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) if data_url := next(query, None): with urlopen(data_url) as response: - data_dir: GitHubTreeResponse = json.load(response) + data_dir: GitHubTreesResponse = json.load(response) return data_dir else: raise FileNotFoundError -def parse_github_tree(tree: GitHubTree, /) -> ParsedTree: +def _parse_tree(tree: GitHubTree, /) -> ParsedTree: + """For a single tree (file) convert to an IR with only relevant properties.""" path = Path(tree["path"]) return ParsedTree( file_name=path.name, @@ -106,17 +167,18 @@ def parse_github_tree(tree: GitHubTree, /) -> ParsedTree: ) -def parse_github_tree_response( - tree: GitHubTreeResponse, /, tag: str -) -> ParsedTreeResponse: - return ParsedTreeResponse( - tag=tag, url=tree["url"], tree=[parse_github_tree(t) for t in tree["tree"]] +def _parse_trees_response( + tree: GitHubTreesResponse, /, tag: str +) -> ParsedTreesResponse: + """For a tree response (directory of files) convert to an IR with only relevant properties.""" + return ParsedTreesResponse( + tag=tag, url=tree["url"], tree=[_parse_tree(t) for t in tree["tree"]] ) def request_trees_to_df(tag: str, /) -> pl.DataFrame: - response = request_trees(tag) - parsed = parse_github_tree_response(response, tag=tag) + response = _request_trees(tag) + parsed = _parse_trees_response(response, tag=tag) df = ( pl.DataFrame(parsed["tree"]) .lazy() @@ -146,13 +208,21 @@ def request_trees_to_df_batched(*tags: str, delay: int = 5) -> pl.DataFrame: return pl.concat(dfs) -def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> None: - metadata = request_trees_to_df(tag) +def _write_parquet( + frame: pl.DataFrame | pl.LazyFrame, fp: Path, /, *, write_schema: bool +) -> None: + """ + Write ``frame`` to ``fp``, with some extra safety. + + When ``write_schema``, an addtional ``...-schema.json`` file is produced + that describes the metadata columns. 
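+
+    For example, writing ``metadata.parquet`` would also produce a sibling
+    ``metadata-schema.json``, mapping each column name to its Python type name.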
+ """ if not fp.exists(): fp.touch() - metadata.write_parquet(fp, compression="zstd", compression_level=17) + df = frame.lazy().collect() + df.write_parquet(fp, compression="zstd", compression_level=17) if write_schema: - schema = {name: tp.__name__ for name, tp in metadata.schema.to_python().items()} + schema = {name: tp.__name__ for name, tp in df.schema.to_python().items()} fp_schema = fp.with_name(f"{fp.stem}-schema.json") if not fp_schema.exists(): fp_schema.touch() @@ -160,6 +230,16 @@ def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> Non json.dump(schema, f, indent=2) +def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> None: + """ + Retrieve directory info for a given version ``tag``, writing to ``fp``. + + When ``write_schema``, an addtional ``...-schema.json`` file is produced + that describes the metadata columns. + """ + metadata = request_trees_to_df(tag) + _write_parquet(metadata, fp, write_schema=write_schema) + # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago From 24cd7d7d9752d7424f9b8e37436d032f31bc54c1 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 8 Oct 2024 22:02:13 +0100 Subject: [PATCH 012/201] feat: Support collecting release tags See https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags --- tools/vendor_datasets.py | 74 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 296a5f590..0604df780 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -36,14 +36,32 @@ _GITHUB_URL = "https://api.github.com/" _GITHUB_VEGA_DATASETS_URL = f"{_GITHUB_URL}repos/vega/vega-datasets/" +_GITHUB_TAGS_URL = f"{_GITHUB_VEGA_DATASETS_URL}tags" _GITHUB_TREES_URL = f"{_GITHUB_VEGA_DATASETS_URL}git/trees/" _NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" _SUB_DIR = "data" +_TAGS_MAX_PAGE: Literal[100] = 100 +_SEM_VER_FIELDS: tuple[ + Literal["major"], Literal["minor"], Literal["patch"], Literal["pre_release"] +] = "major", "minor", "patch", "pre_release" + def _is_str(obj: Any) -> TypeIs[str]: return isinstance(obj, str) +class GitHubTag(TypedDict): + name: str + node_id: str + commit: dict[Literal["sha", "url"], str] + zipball_url: str + tarball_url: str + + +class ParsedTag(TypedDict): + tag: str + sha: str + trees_url: str class GitHubTree(TypedDict): @@ -153,6 +171,55 @@ def _request_trees(tag: str | Any, /) -> GitHubTreesResponse: raise FileNotFoundError +def _request_tags(n: int = 30, *, warn_lower: bool) -> list[GitHubTag]: + """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" + if n < 1 or n > _TAGS_MAX_PAGE: + raise ValueError(n) + with urlopen(_request_github(f"{_GITHUB_TAGS_URL}?per_page={n}")) as response: + content: list[GitHubTag] = json.load(response) + if warn_lower and len(content) < n: + earliest = response[-1]["name"] + n_response = len(content) + msg = f"Requested {n=} tags, but got {n_response}\n" f"{earliest=}" + warnings.warn(msg, stacklevel=3) + return content + + +def _parse_tag(tag: GitHubTag, /) -> ParsedTag: + sha = tag["commit"]["sha"] + return ParsedTag(tag=tag["name"], sha=sha, trees_url=f"{_GITHUB_TREES_URL}{sha}") + + +def _with_sem_ver(df: pl.DataFrame, *, col_tag: str = "tag") -> pl.DataFrame: + """ + Extracts components of a `SemVer`_ string 
into sortable columns. + + .. _SemVer: + https://semver.org/#backusnaur-form-grammar-for-valid-semver-versions + """ + fields = pl.col(_SEM_VER_FIELDS) + pattern = r"""(?x) + v(?[[:digit:]]*)\. + (?[[:digit:]]*)\. + (?[[:digit:]]*) + (\-next\.)? + (?[[:digit:]]*)? + """ + sem_ver = pl.col(col_tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) + return ( + df.lazy() + .with_columns(sem_ver) + .with_columns(pl.when(fields.str.len_chars() > 0).then(fields).cast(pl.Int64)) + .with_columns(is_pre_release=pl.col("pre_release").is_not_null()) + .collect() + ) + + +def request_tags_to_df(n_head: int | None, *, warn_lower: bool = False) -> pl.DataFrame: + response = _request_tags(n=n_head or _TAGS_MAX_PAGE, warn_lower=warn_lower) + return pl.DataFrame([_parse_tag(tag) for tag in response]).pipe(_with_sem_ver) + + def _parse_tree(tree: GitHubTree, /) -> ParsedTree: """For a single tree (file) convert to an IR with only relevant properties.""" path = Path(tree["path"]) @@ -240,6 +307,13 @@ def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> Non metadata = request_trees_to_df(tag) _write_parquet(metadata, fp, write_schema=write_schema) + +def collect_tags( + n_head: int | None, fp: Path, *, warn_lower: bool = False, write_schema: bool = True +): + tags = request_tags_to_df(n_head, warn_lower=warn_lower) + _write_parquet(tags, fp, write_schema=write_schema) + # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago From 7dd461ff536205b5e07c62b2a4e09ab1e4bf5612 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 8 Oct 2024 22:05:32 +0100 Subject: [PATCH 013/201] feat: Adds `refresh_tags` - Basic mechanism for discovering new versions - Tries to minimise number of and total size of requests --- tools/_vega_datasets_data/tags-schema.json | 10 ++++++++ tools/_vega_datasets_data/tags.parquet | Bin 0 -> 6210 bytes tools/vendor_datasets.py | 26 +++++++++++++++++++++ 3 files changed, 36 insertions(+) create mode 100644 tools/_vega_datasets_data/tags-schema.json create mode 100644 tools/_vega_datasets_data/tags.parquet diff --git a/tools/_vega_datasets_data/tags-schema.json b/tools/_vega_datasets_data/tags-schema.json new file mode 100644 index 000000000..80f248a66 --- /dev/null +++ b/tools/_vega_datasets_data/tags-schema.json @@ -0,0 +1,10 @@ +{ + "tag": "str", + "sha": "str", + "trees_url": "str", + "major": "int", + "minor": "int", + "patch": "int", + "pre_release": "int", + "is_pre_release": "bool" +} \ No newline at end of file diff --git a/tools/_vega_datasets_data/tags.parquet b/tools/_vega_datasets_data/tags.parquet new file mode 100644 index 0000000000000000000000000000000000000000..dc0ff652ed261eebeed70ead42c0f7352ea4e8c3 GIT binary patch literal 6210 zcmds63pi9;``;oFdYNbQk}%r{U;3o$qm;^Z)+O|M~V~_IlTT*Lv6Qec!cyYp>0U zWy63WI1vxWVKrbUAd0{u6kSM7WS5pEQA9*I>nywL4i0T0cNwb{sg%0Bh)s3;^pobhe%i6lsF0)=)saaFDoSWb_P6?EratJ1OkR3!b@jlc0tVsqD?z7=TnwlH@>#-M3B(EM#v>hAD&Fv$fPpjj<=_m$SAMCv>+9$- z=-O^N%en5_`b=FtEw|-5I!qlsrYnQ1t;5rEXKr_O)6yZS(Y3g)dJL{RkH>IzqwBgd z+_vlUv~}EEwe{V!-Q8T7JUyni)^@Gw(7X_tFAs$WPDm1S;XEPb3w8>{`%YPv6tAA@ z8G)WcLr_YQA#VV2;^-}39HTfeG*kE+)sdB|FLmwLXuCe%07RdQYdKJ|0e{sRHs zK5D5=1daF@vC|gzUccl#tEgbU+G{sZ+FPS`-12duTvmgOSuJa^hsTO_bY{HuZRJBH zHf4q_rnXL=OCox68Q!0j><5mWy}7@mC09o*q~L?D%e_>+(2t)_ayk1~7gpPQSRHx4 ze?+YDcyU3|GtG^bstGRCiB3f&5mo1NJL2kH6%OOY$-c2?oASF}+y0o{<1D+QXJ2F0 zi1|C9D$-l2jz_JfoZ|yko{6!zY 
zMSIy{k2G*~gP*o)S9b2W+2Hgv_sI`zYg5EI(9_{lereafRV(7G!cz%OcQtD&117QJ z;c-cajxQW?%bNFk$B<1`?vilBhff;JmYnMvLQJ!L!WNO&OWkYPJswb6LkPY3c17r> zY`2;m=l1!x&-7WPpBXtwGbDbut6|q#lZ;-@xUA>c666Zs<^0_C2xhy9JiN0JHsQPslfx!zH3za24=wWzc@uh1ct zeg*e7Jwf@>jg;>Vn@=o%)KQhZVChw*kt<YM)Ti1UHpg$~09XyrhBDYd< zfkNKL-9g;W=5FF>R#b{3=fhQd_9v;Ahb=CzTUPNoyj|mzxpUL<{vsW7(dNyHOX4Cd z``BV{h_AJ9UZK0v>*|xrC6gX!9qi035A7f=XpFsWXlweYFL?W%EZetnk?k9twk%#K z>9nRgyT&(Sb?eoX%}Xwmq_1x@?R|el{Pv_?MW5cpKqax*`rR3$x7jP3aRI8GY6%6Z z0iD^k!oJ4Ec^?LeKU05uQM7ipMj!YZT`L7 zlA0@AA@^#HR!Oe@0l#-%5bUNW`i!)LVi86`}8+?sia{)u{WVzUP$!nKoko_ShT z;n{^Ec6KecHbRn{a`D00Lss$3%6B(jHFFa&j6&i$f-+7CufW1{-wTy3x`UDbh%q@q zh=!gNU7+|*9G#CPU`J`!9)gu`m3#4_Q2TBg1euvY@rDX#XE-XzlgsC*3I$|@NUat^ zN$NksK7i*cE#OMxS3KwH864~%WI&^F{k=3jyn;PLTs3)qKD0o0f4?BwE_V+u|F+DH z8_W%I4-TS%HniYCclRLLoEh?OjPX~w)seVY5jq;uD<$J^VQJ7JFXffz*JD&Tyut0L zvbsZ2vB8dUA#uzr=mmtdypw*|d_GcH_oSyOVfCe47jDk;!WV6|Tip&cHn^sYC^$+8 z|I<9J^iIYK`w= zh(B3gP=-l(` zgG&?iwL`xu2#EJx97?>8uLy`YrVr=c5hw|WH=f39h-ST5rK~%c%gP7|vH_aXeP+xM( zNAi_#tsj<&o|3K9yS>ADQ`_F9`V&xLF9$po&@R;#bR>av2?+u&AF9H^B zxVtMMI8{Wo>Em!NU}4oOpZd>?=Cps;yzxy!K4IK8^|^)LXtfMy&C?ev?^~&;^-Oh# zQP-~?lh}!rM;4B&l$tWB*4-4*sOXofS$@Nd^RJ18$L*yZpTJ+fn%b#yr~gBk=z-?T z8^@faA9r0BI&Fq8ii{TpEZjvL&6HETkTrJOzU*^CXlves{I}havO4=c582a5qa8<& z=Z>Z){#vINP?q+fLUDahRj}+~dR)iQBlk6CC?QJ)EaoLeJkE^lRN$HB8n=l{8NcQ< zN%rM0_i9uBo-Oz0b!TUDT*|Yj8EIW5rt(jETb3SrU|Jin(d^&}QPOj{RY=sRUP03O zq)VpvTsFL_%aVUlBlG0dxbn@T+x%K?M1SWsx!a7L7Vs_}5OPD55piZxz5wm zDkK`RlL>2Zd-2OwmMs!1!C2!JC#I$*<@QWsxLG8L_#yk8W6#>|$1+?uOCXK&q+bg) zZzIt);zxhH@pCmIA+ew9O5eFRw-Xz<8H>eDQ?)3c_gq2(z4?Sa#zDrUsfmkF z!u}f+_y3DVVH_+ED|0|0cwLnztb-sd3lps{HHm}39)iGX=ng`dJ^|pdpe1^d@R%zi{+*AT%D@P43(prlX=|x|=CDl*2{ZAs_T3fUmZdpb4THeC ztZfTvmP=l~0+%icBQ;p+ID_xfw*~<&S;3kdSZvxOd9@G-iiyF-GKf>3-`tX$!k~fH zfQ`Qfj1%DeBVeR_I?zI*`1*C05YD7t9U?%KSuywL2J<}mF@K6dx-rzFI(&fNG710^ zU=jz$&H%*ALJ$l3WBTpXW`g%eRA~Bz0cK$n*bst*W@07oAL#B9=knPk- zChEPw2$_f_&zxY%B*B>$SU@0%ubcB3k4&2LtJ^H|e}g^d`hr9z{f$B9N>Q_1n&AR6 zX=Yi^R!=WEpq^eLf1;kApnsyCHiW9}zGMpnkx3Ygpgje_!yQL4vPB}*rXYjeCa^I?Ez_QDWA1wrauWrT!9s0}>CUq87q$�JYJ< zw~@&F-Nx*>J2$>ZCQ*-4*(jRF1{AQA=gj5EVKdORoee8ONPaX1aiZdpWQH`l{7Dil z8=DRG20=VecONdPUWA5TS{|k>7Rwy`So*s_Jn|xW z<11M#8h^fmlmi3kBc7J0=bz2Lx#L5pB%AJF&tpL(5i37$ZeS4k0NC=r`3I@Y10TP@ He-Hi#`swv< literal 0 HcmV?d00001 diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 0604df780..86deec8ee 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -314,6 +314,32 @@ def collect_tags( tags = request_tags_to_df(n_head, warn_lower=warn_lower) _write_parquet(tags, fp, write_schema=write_schema) + +def refresh_tags(fp: Path, *, limit_new: int = 10) -> pl.DataFrame: + if fp.exists(): + print("Checking for new tags") + prev = pl.read_parquet(fp) + prev_latest = prev.sort(_SEM_VER_FIELDS, descending=True).head(1) + curr_latest = request_tags_to_df(1) + if curr_latest.equals(prev_latest): + print(f"Already up-to-date {fp!s}") + return prev + else: + # Work out how far behind? + print(f"Refreshing {fp!s}") + fresh = ( + pl.concat((request_tags_to_df(limit_new), prev), how="vertical") + .unique("sha") + .sort(_SEM_VER_FIELDS, descending=True) + ) + _write_parquet(fresh, fp, write_schema=True) + print(f"Collected {fresh.height - prev.height} new tags") + return fresh + else: + print(f"Initializing {fp!s}") + collect_tags(None, fp) + return pl.read_parquet(fp) + # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. 
_OLD_SOURCE_TAG = "v1.29.0" # 5 years ago From 9768495f9974173ecb6b835464174a7b0bea808b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 8 Oct 2024 22:20:03 +0100 Subject: [PATCH 014/201] feat(DRAFT): Adds `url_from` Experimenting with querying the url cache w/ expressions --- tools/vendor_datasets.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 86deec8ee..65802d130 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -13,7 +13,7 @@ import warnings from functools import cached_property, partial from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypedDict, TypeVar +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypedDict from urllib.request import Request, urlopen import polars as pl @@ -31,8 +31,9 @@ from typing_extensions import TypeAlias from tools.schemapi.utils import OneOrSeq - _T = TypeVar("_T") - _Guard: TypeAlias = Callable[[Any], TypeIs[_T]] + +_ItemSlice: TypeAlias = "tuple[int | None, int | str | None]" +"""Query result scalar selection.""" _GITHUB_URL = "https://api.github.com/" _GITHUB_VEGA_DATASETS_URL = f"{_GITHUB_URL}repos/vega/vega-datasets/" @@ -340,6 +341,28 @@ def refresh_tags(fp: Path, *, limit_new: int = 10) -> pl.DataFrame: collect_tags(None, fp) return pl.read_parquet(fp) + +def url_from( + fp: Path, + *predicates: OneOrSeq[str | pl.Expr], + item: _ItemSlice = (0, "url_npm"), + **constraints: Any, +) -> str: + """Querying multi-version trees metadata for `npm` url to fetch.""" + if fp.suffix != ".parquet": + raise NotImplementedError(fp.suffix) + items = pl.scan_parquet(fp).filter(*predicates, **constraints).collect() + if items.is_empty(): + msg = f"Found no results for:\n" f"{predicates!r}\n{constraints!r}" + raise NotImplementedError(msg) + r = items.item(*item) + if _is_str(r): + return r + else: + msg = f"Expected 'str' but got {type(r).__name__!r} from {r!r}." + raise TypeError(msg) + + # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. 
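# A hypothetical query against the cached multi-version metadata (the path and
# the filter values below are illustrative only):
#
#   url_from(
#       Path("tools/_vega_datasets_data/metadata_v2.5.4-v2.9.0.parquet"),
#       pl.col("name_js") == "cars",
#       tag="v2.9.0",
#   )
#
# With the default ``item=(0, "url_npm")`` this returns the first matching
# jsDelivr URL, e.g. for ``cars.json``.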
_OLD_SOURCE_TAG = "v1.29.0" # 5 years ago From c38c235fec976be66f7298f484e83828f2edf8a0 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 8 Oct 2024 22:31:21 +0100 Subject: [PATCH 015/201] fix: Wrap all requests with auth --- tools/vendor_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 65802d130..c5ad91459 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -165,7 +165,7 @@ def _request_trees(tag: str | Any, /) -> GitHubTreesResponse: content: GitHubTreesResponse = json.load(response) query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) if data_url := next(query, None): - with urlopen(data_url) as response: + with urlopen(_request_github(data_url)) as response: data_dir: GitHubTreesResponse = json.load(response) return data_dir else: From a22cc8a2d8231d0ac56117c3cd2fc56a2cffe762 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 9 Oct 2024 21:16:45 +0100 Subject: [PATCH 016/201] chore: Remove `DATASET_NAMES_USED` --- tools/vendor_datasets.py | 45 ---------------------------------------- 1 file changed, 45 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index c5ad91459..167c55590 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -421,51 +421,6 @@ def __repr__(self) -> str: ) -DATASET_NAMES_USED = ( - "airports", - "anscombe", - "barley", - "cars", - "co2_concentration", - "countries", - "disasters", - "driving", - "earthquakes", - "flights_2k", - "flights_5k", - "flights_airport", - "gapminder_health_income", - "github", - "income", - "iowa_electricity", - "iris", - "jobs", - "londonBoroughs", - "londonCentroids", - "londonTubeLines", - "monarchs", - "movies", - "normal_2d", - "ohlc", - "population", - "population_engineers_hurricanes", - "seattle_weather", - "sp500", - "stocks", - "unemployment", - "unemployment_across_industries", - "us_10m", - "us_employment", - "us_state_capitals", - "us_unemployment", - "wheat", - "windvectors", - "world_110m", - "zipcodes", -) -"""Every name that is referenced in *at least* one example/test.""" - - DATASETS_JSON = { # "7zip": {"filename": "7zip.png", "format": "png"}, "airports": {"filename": "airports.csv", "format": "csv"}, From 1181860ca6fa4abcd8662b0c9f5de2257b041b76 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 11 Oct 2024 17:01:24 +0100 Subject: [PATCH 017/201] feat: Major `GitHub` rewrite, handle rate limiting - `metadata_full.parquet` stores **all known** file metadata - `GitHub.refresh()` to maintain integrity in a safe manner - Roughly 3000 rows - Single release: **9kb** vs 46 releases: **21kb** --- .../metadata_full-schema.json | 12 + .../_vega_datasets_data/metadata_full.parquet | Bin 0 -> 21362 bytes tools/vendor_datasets.py | 668 +++++++++++++----- 3 files changed, 488 insertions(+), 192 deletions(-) create mode 100644 tools/_vega_datasets_data/metadata_full-schema.json create mode 100644 tools/_vega_datasets_data/metadata_full.parquet diff --git a/tools/_vega_datasets_data/metadata_full-schema.json b/tools/_vega_datasets_data/metadata_full-schema.json new file mode 100644 index 000000000..2b5b9d955 --- /dev/null +++ b/tools/_vega_datasets_data/metadata_full-schema.json @@ -0,0 +1,12 @@ +{ + "ext_supported": "bool", + "file_name": "str", + "name_collision": "bool", + "name_js": "str", + 
"name_py": "str", + "size": "int", + "suffix": "str", + "tag": "str", + "url_github": "str", + "url_npm": "str" +} \ No newline at end of file diff --git a/tools/_vega_datasets_data/metadata_full.parquet b/tools/_vega_datasets_data/metadata_full.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7a4e691cb414735738f276950d79e8c72c5f4b48 GIT binary patch literal 21362 zcmeIa1y~i`{x-Zf-Mu&6-JQ}6(hVZr(xHGzcXvytA}NhD(j|&?hltWj!?!o;<8#jW zpXYnN_q^YAz2A48VKXzUe)F3(Yu)#nd)ADaga#)F0GfaYeFCz8lG#BB&_5DB00R(6 z00cl5hI;mE6B~dwEI=9#77|Wo2R?)(Am7~HbZ7w8lqi6(Hxx=ZK=_+wl@4g^vi!cR zw4^BfjUO12#MgnhjKjvl1Vp@9_G9Rv0pe1au)3?K3UI@s>%;awM3W<|amXJ|l$y{$ zSrD=_@zPRz*9`&?f)EG^&@DaPjomz)ot<3WEiFK3;AD1iGy=E?h7L*iw}&7&P!0j; z0!0Z_00jMV7!&?E2#`u;O2v`jy4VORu1v1?pqiCVeebV^+D+}mbQ9sf&le{9^(z{y z9>4&g325o;N<)QnbD+KurSa$Eo=tpvdhCOt+t6(fGkpHBqbyas)>$$aX6DL~%_}%3 zH2uqIshdgA+%{ZZo!+^{EH574eQ$@46zh>|%OgUnFsy~uU-CSK$>KlpMAOf9Q(DeD1K}0Rue>^HSV=F z7| zR%@)fS;}`sS2w!*RyF43`B5oq7P^6jBK4AwkkFN0;*yeAg$6$J}V z2UTdVp6W8_0nhPZoW`wOw{_BdT^#ggyB+q=dHNY{DOmg_zQ$2YwHWZEE<#E;B4yw%gmoxbHiD&UFaj`>h6!fD>OdJP9}Rh08B(-vd7 zhuexF_rfi^^mzEt;-b2E7f)erMD~UoPr-Zo_sbf-kiOW-2&zTD>x)>w`XZXQt+Tn4 zg{2#tx!Yp`Bz{Pc&DqiVS7SFpentd%&G@r-B7TGS_csw|AixL@HtVtxAd3&TkSy0@ zOOLkd+3^(3i?@+x%1|jww(GNNHmXjymwh?w*p%(qzkBZ6nddCeLKG)2F3v%LhlVIE zS!2ROf+;V}m9NEuuOwTy>Ml&KCeK5aqC~HySZ`Ku$gZcvSNOr6-%zD-{hN=NiJAZn zI7W_!f`R}GB@~t_Uyp+*7LkT3LyajFm9B8sSttjSfrcnSu>_Z?*r3Ls5}$=GYtpWP zn5}gF(7TMb=678V*3d=D@|jfDWc9~LJ;i2*bWpEPMNKsrv}m|Mg#8?95f z=-NVBwS5T;>ZYu*5vBqHyZqHSNU)Im0w8f z4_ZVfwZ@>kWU&^r&MNoTaXnXK{1ExU^$j_P0F7i62L%Hy)XMx&}sZS@I8qk z2T5=ywE!}&9OP{2a<}B`%?54Ozxhhr93rtnTA77ytQK=>^j4dQn~>4I+>U1^^@m8> z!q|_O*!g|yfzU#d6l&58QjC~wI2=;S60ZOrGOq;&Y)F+(2)h=5a{$01r{BgN8YKFOVi0}y%(v<}CF1LA6biMMF6SICMsgI$b{w5XHjr?+37`C= zOAM%!#^z4;_O@=ePL4m^J%9r*0O=}8|B)~&u|avrN}$Z&gsBV5ivyUZHmBrpnZqPf zz+tX!PhCvSD#1pJkVq|exDB!~%|AQbYcn#?1sDUg0qM5ttXx`1ms+Ml%qfou$4~R? 
zyavGs5Z2TRW4>p3%NMwLwQH<2jI1uub@^_qv9qu+vyk@kp@*!1+GQppUsRG)x0QU= zSH-I={cTP(?#^1NKvM$$3Vpq`Cj9{AI<>*;C_Ka)ftW1%)DN`*V%CM4P6R>g+?IZl zGVVmt{&6Qn)ACa}e$^=exF!LIg6(ey5;1meHEsl<`f6{sG7vFXuy%oF5qfmEiUF2u z{#WGm)Ic)7vH-PP%;%;A2>ox87r!Q@)_l<38S&vQ(%c2u%-hMTN!^ZpYNUd-UXI$ld z>QU9Lj-6y*gk>14#vIz=ma3;<6^w6G9Z}lk#(Dk1(q{LWsYXD*ZulA&I}~5|lNW+(7FimZvt*s~(@HreVgN z`L3ZC4<$p&Y!V4c;19OK#KBXHlr8#69y<22nscepL4_oxn;s7g3^>NWGoo;WLL$HD z_vk}-8Iu8zD{QSyUX6jVhEUn|!5W0q$9r6~Xc!nZ$kiAuFp45aZ4~EmMAs9ke$gp2 zUm>Hz{qtlz2rc<17Bra%0BL3Da*7y=4f?Dkx2oAJ6e2ZVXxo>(W_4+Xj2Bkj6AO~< zKv~EcJ!^Mdq>yJ?V`dXh$HK;<9SW-^%OG8bD2>>8X%dc3nK_)C&)7Q zg$_)xF$n{w)qlO}b9_v|DN(y)&OwbtvM zwBiK05eG}io*F%p!YGA;wNy63jB|v(lx7o!j$xqqYMxO*d~5RL@fsEO4pKEO(0NlI zZ5Ig(0X{8z)?{=@Eep&n_qbOttwaGf@f~Z4uVGKO+G~%>J=3hZb}o*5Z|y&*3Goi_ z?c9D6(><_1KHVpBDANB+XM@}%K`=l2pD~{ng87AJloSbbEcP;!Abs`H@7pEFKzJCA zgOppm6>DG^if?QJH!BqLwSHs1L=a>0BLXPq+y0N3ulyVH&7qht$yW9(1~#j2kl&QM z=;LGRo6(A6f>1(}X38!OH1P^;fq#$rXKP0pwPN{HEoicxa^C37MYiMjW)uN1hS_|R z$KGdovh~d~f@nP~Wk$y%u!x}WZwF=KOgMu@c~maqH*rbv08GF1f53bR$gR>Cm7RdY zm(}6(SF6D%8or+cZ|UywRS}c(R$2|^GM!nuf%V^okrM0R7ByymTZf!_Zf<*UIwL;*j9)R{1Bu_fS2%qK}8HCQb4pIHS($eP#FMt2GHVo3G>PjWE2X>M6w05BLG)hK%%$+ z{a$Q)VMO~5xCd>36+HwXGT;NdFd^dyn7S}Pd5GXBa*Ygt6thZ0VGam{2hbvIo&jCo zX^oI!H^4L50~63*zyOr^Nfz(`?jOa>1Yz4b0g0TjEg}GfSll`qxTO}r$w%O)vz`MO zTU}T;U)&NDfHMgU2__&F6vP5_K?Kax!=yTs&!(lY09y4O5$t?$7oOh9o&ei2EDiD% zOssCq=*mo#T`SRR|5t4vO`A>N#yKe;a#&!8%pO517An`YVE)rk7g9}-@ht#AfCM@B zU(214YJ&Wn3?R+%SHs=^1*s+xBGu$u{G(LEBJx|R>9hE2sb&x&)u?g(kZO3Ce@Hc{ zf0AmM~>8 zcFu!8zrvvUUZI_Z5Nv(qFE3uG_(Qx&vbqy*#)`M8(IZNJiZ>$;Q1PZ##{nYVqzEvZ znTUJRVTQx;yeOiabZ>TOws>tY{bCfN;I!db%Y3<0aK`;*Q+_Hq0%#kKmRh&B!=Rk?93E4(#P(xm-VCTnRALKR<)p0y&$09i>gp&{*VTiQX z4ZTI*#oGuiG#GLjus-_mhn|x&2+I$QVPYb;wGm>UnQ+f-yq^tP*lJzbDcfq;Oq}f2 zf5|6SmEK>rA6WWB-ASK^s5_|Euja1X%t&BY=<*i;eX?`z zUrOD!G|o)N^}<2`s4C~2Hn{4aOn<&3-?uOX)qi^R50L`0{Fw+bmr#RnSqm+pnviv0;n5d}A#I9In&gZ+*tV`W56hd- zkx>Y~gtAl^AbjI06gZupGspq*w}?9+YWhzGk`k2R?vX#iS$hSOsZJ+#7L&Wm27d_H zuY94j2xLdJ`679Q^d=B)d|gi(Spw64A~fquHpPkt ztX5*uboWAGwSR)EEiz)TeN*1p>hKEkA;p$^*Qd{82I4y__KDd9!FI6C^r>C$(m^;_ zrSG)3IT-Ogi>ixAFij=G0MXaW#Ga6jcDI5%7GpGwgTKX?H| z_geK1p8qFEfDFp5K#ocvv|o=cD7Za3Zq*i2nrot{7d=ni`x!_uMx z_;7`3ogIX(WI9&kY)JPGu9shOC00>H@G$k^nlU6McLha$YyG53jlu=s0&oJx8DBfU ze?3o@Df>vwCHH08!w?o=NZ z$yGEwmiV){P>qa(eJr0%n_{5sX?!%V>bSI*#j}KstRtjdWltHW1>;XV*LC*H#wB(> z(>x7jgo|`!b@a^RtfOJJrQuDh!7&UsPi=_lL(yFXt#-|651i&U5bR_Wg|`X|7;Uca?N1UC;W zD_hSWY~6$lu12K0u!J^;f4V-9hH!-l`VLeA8UAVtCO3c>SO8Za23Ga1yEeg^hlA@d z5Ap7S4Xk5kP$EiYoGpgp=kHIILPS$5uG>$#zui~3l(To@mdktF-LdnTPPNj(8wTo!NLUes*lrs~N-9%WTZ@q()7~4iKZ#YIEEgPQmi%6I z#+!*WYkpmxB}$W9ZcQ(1Pa+#>`m{6hQ9EYk%f$xsXCOWfwN)h4RoA$Tv1VnY7FmLk zfoP4zmiYXmMtws3{N%MaXOdHxK4RU>0CX*{* z(zkQJe#gw&bNBh+9vnJ-HlccW{y7%)%v^OvrwKM%H9R~5qGKN79H0y+me&c8fML(` z2f!JcpbI!t_y@qi&`bmd0?=?sNJuEe{W}q&0_YgjMuweR;mn6XgHP(Fi}8_h*$eoo zd%5sI9JEM_&xzIj@+?Es3Mp!6U%k04i=?DErr9YyE6d?4wHd2dF*e?gu$+veMR*>b zX-MFxnw;#cE@*iXccXXusFnq%yD^whN8uf^^|zx0MT6O)(4@Nj29a`FbRAS48bZPK zdcnOV<>&Ylj9Gk1+I0s@W<3-sXK<8U*nP53WAhq9kg2x@1&tNoL}RyH5o+>(0ZU5& zU#4#cmf4683f*up%Dz;iJU)aQ?4VCqY9Mfjo! 
zj6BaDE)DCaIbwcO;#Gvq^%DOZrQn$%j>lmYO|!Lh=jYr{#<}U*ww2$7FxzGq@RO6r zKy%EZWzt%0jJa@bmYHSO>iv|WulGz`^3^;KwtM(9)IE1oo<6O}G|1hpEc0BNH?j8X z8&Yn6N@6T(HpX*xSQp)EAhVN-N2S#O!>Ek+Rso}!Ot4CkF?6vy@qtKoj%_7f+EcG| z&&gBo!)^T}osHRG)hnN!L;{COP=QN_Fy1{8H5kMpCiL0WnX)~iM2_qD?}^$0UR56* zp2A;`E8DRha9b-sucdXb%`s!Rj~vx7ltSzn@>L*lV*lRjwt*&F8{M~!o%G-nldf#$ z+A{oSMH!c=A8RT)FfuaUCx;zoBqp6s6N_>biK;a9p{Trzd1+G5!1y-Em%2f+sGTLM zxjp7cVp;PflHB(+>)v&%g?{;Te)LU*avf}I@CB_+RGh#f)#>_E_LrE zNg&(PGpyAkpUnqeW&OpaM+wbbyn}{KY$8qp$0Mry%Uryf-dQY$1o&n*TF3<6;WMPv zZ)5c2pLrITrKY>!rZyQPz?L$+boW_qHYi7_Gz%Af)A=6Ba8M`tF)0G>^?u+K{wV6B znTngzY7=Am=PB@*+BGjj#iiO!cinsz9~M08^CF2#5K|ko(ik^Dpf4A>>O1&KKIVP( zvLH2hz+!5IsoeoNx!i_1>bz_8=pcJ`ccNLEoWtGg5DvUoY)};UHQT_wMHk)n9{zW; zntQ8>5Bemvw|D&>ACu{694oZUtAx!x9LRB2G03nrN)Z?XskRn{v)7U~d$7&3RRnv2 zi^~Yydh0(kgb8`Lm!!ry>4vd8DDOJrW+{@)*_|DS4qvARNzN z!e>^rx{sf@c)+npFV{k@@Ldj_VPRuzHAj)or`dTH?1w8`{``%EsfA!J_U}ns52jD$ z3E6t)U*jr!?kh}Ysy)*j$gHCCTU;u2MWCvz%1rAh!s^yxZaJOnT{V20Zm{(NtjCW} ztyhbbg)YeZt-<(7Q$_XqgzWWFuvZqR#kf4Tc};1OI0~)etXqcZq?a^U6q`TKx}XBhf~!g~S5{K8~sJp!Uc z+~x=!w#q3T*CdnQIg^6)+r0!U1{3IVu>g8(6@%iXN}5lc1)8Z>8U7sP~VH9=^f zs~LyA=(>`oDG9Qk>74!lESi|vxVt;M39+-AI@_{Y+q&C$n6a5VIk3B0Iyi1_7Zl*);V`un;N;-qv*hRc zb+h$fq30i`=KQ}b^nCk6=rQ{x^w@pJ`2H&8@98{_P@Shi$I-WwB2|Fd?jLlXS*Xr4 zH$4Q^d3qr_&)5GOou?J5^UNDGLvOcbyGSlX=lSx7&eL_L z^9*nE+@)=;LY9;Nz6cPfu2YoWudh3ua~57$?SG3~s3me1)X z_I8PM@F7Fa)ezOE6}CYYkB0aghHxtuYX0NSwlSPiUxR&_Ivf%-Kw*d&ntS-|Va~w3 zIEf4(nh$%G-srIe<<$!MgR@AP9-LHo44ksF7%%HI{830gf^8y$~K(2V^8T! z5+Q&jJ`!Ozg-IFkMC)C{t4AoXMkpw>;bB??c}o(O=QL!&!Vkq;$7YduU>g_(`NHg? z48tv*g$laxEkK376gz!*UHTI|WRfsreR*G719#hNWr7MAw5 zk6qauE#282ogMxWsKtMX2kcPbX8$z_h*v5Kd~>vG2iGU$BlmY>G<|#qbR685=$%PSr_7FdAS+ zT%Cy{*`^_BTkj6zEpLs6%%<>N6it7~s$8x(w3|^ol|Hx2;%$g}C&)r+<%WFt!D}*P zF3H_vAS*@jSYMH7tfc&PNn$LQw1NcLZgWm5>^!KjC$uGOn(@{7TsPjUes4PLkfZ*U zRb`l0$`2e8r%IO8n3y%}`mG(w>>~n&)m`z1fc198kaNX?VW5X>V1W4E3WtKpLpzz8 z=DM^v2HO2ETlkruy_1p3u0Xj(Wci%j>7_7~q5NT*TBXO}fo9CiLe_DC(5El4PLYc> zlPi|`8!ywgMv02uOx&tOJ=NEfbbK7fD#yOq4aLpnN&9@nn|F;*r2RHIEgwNT^MaB3 zYh!uSY(a&Mlya7`@rQ{-2mQ2c%`lG;d8K~D$xKu!Z`bJ=)Z|PyK}%Xi#-V7RDcg8E z8hJ^DXM3Zci5jM4H-fZ1JWQtK1#H<~FzJ2BYDr{6xP>fxe_k-Y{0MPz|J%h4q3|J- z@SH4y^jjF%e05F$HfO+K{D8Qjp$%R4Y55XnkxDaor=y_>UnuTz}P%yVKA?;O@6>8VukJ$be)+0q}>k9vm1r0FntJfPx|b5`+OzH0=XC zak3LyFP&9SbJ!!3HKk*j_fcizgm|k6P8{i?)%zxa@XO}j!Z|g@iYtC>1KHxi5htR_ z7}U2zGX&2pj(o3!xG03aMOoxj;~(90sms86`V3rF)?_;>;9Rn{iCl`lvC?ojgV)zZ zNcf((JuLQwba^@rEI9Yf*S+^VH}v?LbA9|aw;24v73=C=e`&w&p?S=E`0k{>Q{qnz z4BPqL^tHE2Qx9g3_YcO<2g1|J=ZlWAJgMErs_7ft68DP+Rd>s+E@^rAOlb2QPFs*R9(ghVu7)LA7h8G|Lz}Jh(kkojjaC+V3$v6@(cpZ_f}rHBL=0dmVN# zF4U2;9O>({gb{C)mUS!smP!mC6c^t91pVXCZ0pn$K5;4B^e^}AKkgTP#=XT>Da}zh zXGHmGBx-Rp35Zslz9*Qi=_h(XMb{Of3FP8~i+ZUK??mpa`?i1RGj8%WBHWSDp7{6U zq=Fk0^@=H}w@LV8amcYF=HmgiAG;$XwLZZXlOv8!Stm#(B?Z%4ya-`vO$MUG4m4`Y zbzztHcDLHTTH`FT#p?G9rIL*N!)n=}Ftx?@5GF4@bl34<6 z!o;-^mzqSK1-=S@jvN&m(W27;IsPbryk4(g>N?V)SM%zFNs-N~ZUn8N@e_}Cg+a3e z@*ub5(ahnnoG7|>pdvu79X5PeFnqFsO2B8W#qvtsom`Fl3x)gU7|2uDj-2pz?EOAx zNqa8aweGlTh`HFMdxG4B`)ZM02^ieJmqPK#C5ltgupAJ@wXC?~VfLO_^^b;(nWyFF7K$P&?@ zfy$z%(3no~GWpUaSZ)sGyy!-bcq6$@jE$34x4OyTqlDrhtn1L6cq+LM*UOAH?{V>_{QHa7yc+^#eeFr7L|jgikB4}CGv%oo5>&f{d$O~{ zc$4W2)7WE$@We9wa2)Sr5<7Hy!dxk~VnwJm35ur(B_B}Kb1iiHjEA-!NRPr(axN03 z3Q&0(iT64;iLX_x5FQILBxA7pBpPvBq&LML!m>q{#-v&K)^>G^4Aia7rAJ$xs-Y;!qCPXTs~Argja6i@qZP6*ow-0YR#0??WL!w zkbUs}lKL67qNiBV(w^8r^7F2 z-4pl3kd$t7ne$f8kmN;g-Y!PsEkO(4)pT&7uB4GKnH__uMH|hzktgpn z7eCid)=e4d^CzBkUW$+X(~k{@3Jlw1HW8Q7O^IsWm5t+I_G4^#7WbtTzIf-i3VXQ-FA)-6s}w-8`;E1bc)zi8o(Rr?&EK=qcV7v+6~V!gL6(5)URG z-q^`~5@MVjBJ3GPq_+UWf!2cY0+8UfIw3k$Rr4 
ztS~*V6kSe~7t0X0iQ^m+>m@;?1smdoE7^oD+T@phK;JzEe5zuPNbyFSGr6EII(+6 zxvWZY0;F_1+zX|N!wNTYpY|i|Z%AR(M2KFHNh8>UuO&}%b=wQcmxZ%2ltwVKf%!9e zT5{9IYo753)8XPO5&$DcFJFT?-I#zM6{{hy?d`4^L$(i#dVS&}v27~&W{ z?Q|1+sGH=g5bH$(?h!l^2`%%UhOZ zk$6GXG}Jl7SfVFo2~Y2#EYAyG%nh8{9~ay7y~LhTkj>VItL|NVTVA&6C4D&D`USGIh+@vVzt{gg>906VL1zUC=`3Ir^NZKbt z5SCCsZkp7er3SlO29_yP99(p*li?lse53Y$_IA~+?(>`O>TE%FrW4CGC#k*k>iEYo z;GbPrO)u{uTE?Z@a}ftZ z;EHF{x#A{Wg>`7bS$M5u&=lOQGovjc(`DjSgK$OjH(hBnH zK3&8e{q1VFM&W!h7`|3DE)(-ypPu0s_0gr26n|%UOBt(K`C1rXHjA{dmtXY=v9Ne_ zJbQz{4}4}_GG>ZD+WXC5iI~q66FVve%ZLyrRhYD#4!j0V>CtRf75EI-Ukq_OE;4_n zHWRIiO1H=KL?3DW((?VP{C*m_XY18QY|JdeI|>ifM|=a9x<`LkGvt-3IWeg zFJ8&di|D(FL|#mhgK7Ggt3cjQXy>e~x>HneyRvzhbzTBZU)m(ox0W$ctnp|+QMgB> zeQ0cdt#uL-zM}O2cRAhL!%=&v%uV)1rs&Y^f+E|Z+3*6c6F!c5bv)E4xES0Wjl*^n(@BJ8gDi@e7Zw^<7V24D-Eu8DhG_;YhY|i8;f7=!|g1+R6i{HuCSI8rAk4jn7!BHR_`O)i~+Ld zS_FNwn`_1`2#CX0%O$YD6gsHJEH*51sL_QD7%F&{ z5W_DaOstZWjh8FG+GS2fbFA8vb{q$CwGfRs5Re_5pj506_~w11Ry&LN0y)=$3e>LO zUJJ|KC&&3rwXN=?aCLmNCyeUhP)fL2*_B^7>$eW+G=5U4wiqSutI<4`%P3*280=#= z{x)`@u$(70H_c8`z?Ya=lNuyORX!W2>@*L7J@_iZq?h1kNe0=*7MjmOtpQ#IAOjuEYeKKC3$5 z^iL{{VRZ0mWH|WuzIuvEl7y<>EY=b!uM3?vdAwV+{ zERXHq!i3!@1j^=ikE0;;Nn*q|v=0rSO zc~-kyH=;OO=AV2=i?t+Lj#UTc=Azoii3m0+!b6*0tODwIB zgxhBbpv2fd(rJoUH!;)GbrIjjc-GYpFNsz!r`x6|iX6<6xM$OeHmM-f-o;E@%%yPV!WZW?qI{q-QAo&30s1k%9%h;o?MSs~c?d2i`=4cP(aLQ6q@5kI$8-N(@9XE>zQ{d2z4xNupzX3!Axgm#M`Ii@t=Ao%uFMd|05NBL zN;g_5{FLK*KX$9|yQ*T>7uMdaIFrn|ks&YHY;SW=_#uzpzBP+kuWqb45YQYm{W9U~ zdCz3zE%4A^$dJJ!PDynt>!2p@IXYRDyqiJ=1${m#tt5hU9&BZDtPpvXtZjJk6Cm%} z8X^JJ2$ETNB2eyz;<@Q!9#eKARywBvrIJ5^Oymr8;xfg? zQBGH~V=8|Y1b=|Gen!h%xixnWQ9OxrMR5xmc58X%*}ZoU-iEPDU1=UHq;$}@h!qed z+Q1$;X~hc2KFRkHFR~>Z1+d{u={dLI98rBkq>vG+L`pI#ArwVLB0Ut!4`bjenb z<;+3$2I)QEtw2dHnp!VqL%OW85%2>728M;m25P9mjuL=FQKnOhL>5HHwl|Rh08l^Z+9XjCa`KuhV zQrWy-6Pt*;B`IFerEqEc8u@yHVHw`S{4F=|&3pa=Oj7PO=T@mF>K9#yT-P?8=;0N% z!GVNBGl!WEq(^qS-)7rndoOK&h8e{pK@Y69sMqzE*WC))3S2xZq|X}ATQ_J7kmS4# z3X(TsUcCZ*{0c)$gVVe+)L0q1W=2eCTZAxNiH{t+$}*N#fg+Yul@`=793F~oUV;Z> z<6m%1>0=bxj^enrvySTR92GtEVLyNU;blX>WVOec~pU}@AmyY zD~q30uek@OFLd|J^eJ8q8yT;=mAl+NoNB{!vKpPwe~gwQV7YXWAgt6+>*MxevO>5Z z>gt;2;YX@ZXuZbj`^2K)aq{9}{Ts?fz{i05aba;iLPfnYO;jZHr0<$U7W}e-Cpw

*1L@E^lrhJ;~e%ORkvV6Zr`B(p%srH zkEFJ-rg20+HuCqhZHs=tAZg-G*o#Hr#A^Y09cy%9VdKalWA!x7KpGazcROp2ehk?J z4ch^y!-q6gTu)G4IIDy}-rgn?GQQ|KTsYXT8E7#m{njwI3#p7K>^Bt4M<(+(5Cwgd z*vnC0%4_A>zeetay+ZhEPf%o#oD*EV`J~sIfL%J&G-RPSPXJ$0x-;Fv z)^;kc>RsBTzCUJc^a!K7*^r-u+-6CUwHnTp$l#mp;sMn6UYvVP-mKz?eTY&!hcv-H zS!cB5Y4T^eqRKda=rnWUh^dbB(&&9sexR3YhZ|3=1X6HI8M?$Q#@?}SS2huvF3UZi zUriV0biA)fsVAZx?&%p1zRpTT2EbHPP zW|_iROztbX@88o?Vu%wB@{S~meRl8^ROv)redbGB2R;Sfb{T6BBBGeWMjRR2QHxuA zmb^_Rf0K+T>jm@?$R@o-W;VB07q0t^upZ*Ck?T}HOkAWO0@K96Z|uhq7VOc1RT-a@Gw<0X>uE0^lh80Z#F%qBLNZF~Je(b}*$^-w3e{(@$Yk zm`OIqV#ui+9>}?ae>Qu3phH6N0)8c*#jq{5C+mEx)OxR7V_|A={1dt=)z%xD^8hln z_q@|_w3}LoX3_2Kd7-*lz6tiO3|KWf`k(pZZrs1*e3*EemrDwx#}DWJlr<#RnykmO zYDTaKuPNcf*zK1}MJKvzw~4N9!N?%>>7tYUSW1!G(x=XmDj*&Ci2&yqJ%^No4 zAe+gNYqN`NYy5$%y^)OM`55V&(l*eZb>QurEhz_8%vMwq=`J%hnPSk&!AKq9b5IuS zdl5_vbT8Ge6Vs<0-@{KcNal(2>!KU1{H*NnBQh+f;P;juj30w^p9W>Gh!m`JGwy_V zGm8q6FlJjF@-`O;#IFhowNm;ui47KE8VbJ$S@2$K(C`z`cs!$+aek;tah{(eYo$Ve z5b7f=rs(@%uZJvdc@r_(oLPk*PyDdv3T9UNU~Wz9^w=|N&4Q%<>|iy^^_Jwh()UC? z`wL@r;Pdao<1c_tXAwN(mXoMjE@rWuzAO*uBl*{N)G3R6r*^X7qSiz5IEOVlY)`g1 zw<{2qV6VSmbt*ku{J zqm-_X3_U5qQ5Y+^G-3rkl3#`*KxbVaGWeY0+UNVR((3dy0q@#K~l&n}w7Ku;|N!M$$M&Gaq(JQ*JPtB`Q`u z4Od|;rlO?GdHZmqMN3Z$<0A)@N#r^7E*88te2=uv_?whp3yc)5KFES8N2=}$?caHJ zgCr4`{Pap$XACJ{)NORw_7{|oXkp=g&8zE7*Iq(T zI?x)J@?Fke;Wep6kBjA&MtQeKTrbypA#~?5+^UO`L_Cm=2_!G{y#aE@&z(fTLArby zXOC-#*TFXBzV7aH;1i568_e)={@zAw1!u7v9FE#F;VkK)$2$y=XM(f<$oFaxp^pS1 z&%b^=4-6e81`pWNbyowS4+Q^Dd0<9}4MPa|un*)PG7pvz3;^6k{P9T~&PkH|$>5n&JASHjziT_i{pHtQe3GQY{|0&_u)MG+|pR;@al=dGy zZ2h~_Itc%hxg&q~iSD0H=|6rnN=Wd(qs9I7hy;uTDuRJD762g$I1dAqi1Bm3k`%N6 z{JCGo2?JWlao6L4*&wt!u-=Gp&%2%A#uOI&_N4OB7qiz=KQCxVf@JY>y7DO8KK30#5sHY zii3m1-Mz)}qsY%W{J&D6C7`p;{~Z3)RDeH6|AK>`ageW95t0yA&{e>YK&ticf8Po` zR0abAgV%ClKvV#_S`v~!rXc)2@gIouw?0h*UV)e+00-y6fMfx5k>tOe`KS8&Vex-+IFlTy|%qApg!t zyd}y1(n~nt&O0kJ5G$epx)%<=UHQ9jZ-8g+ETk5|fVcp3UtE7%_`9zsfDi603_vUh z0O-~#fWJ-rscrqJ*Z+S`GXaM~rq&RF{R?40C9sfY`5e+o{y)U3;DS5%OG;oMI0*pf z^pHb(I~@}EAQ1)+3Yr9H5*iw+xkB_!M=AQ>QAbqQuQNcxi}&~ya}3G1KvoFH3Cb7)Ww8m%HBDIp=H4#_T|eP=@* z5|+L*W_wrSmwgEdeyBaKJ9~G=*B8e$KU z1dWo{@iLPY^e~s>_>b*LLB054&(Ty*!^uJyaz3he$8&>v00}`(_0CmES;#KL2~DUU zkgdkg`cQ^uft;@Z#7TCD|BxhTlq83~uK9oL-`%0*RX7bC44f??R`2T1`_7-cl^pcY z5;D;0`iuJ0f_Nr)mlKi;vRFf}7Q|0$XnjM@U&Yzd0eT7kH(eh&UK=xA9V>JBNB{Zx zL$CKwd!~-c|IQw?8vf$^f43iLXs!LUr_UwmW+rFvZK^Bi0l9u|(CeoJN&eaR^r5>F z8juQuUQ}yHeL=VCKbnN}U7~~p`(6Kl99lv`5gKLSVDDz2Bj};8%V`e{|C{#y_x*)l yFF73^3t2&L3q8pvX55M#2D<7J00IOxCwo&@H^K_Y$m1`5fDy= (3, 13): from typing import TypeIs else: from typing_extensions import TypeIs + if sys.version_info >= (3, 11): + from typing import LiteralString, Required + else: + from typing_extensions import LiteralString, Required if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias from tools.schemapi.utils import OneOrSeq + _Frame = TypeVar("_Frame", pl.DataFrame, pl.LazyFrame) + _PathName: TypeAlias = Literal["dir", "tags", "trees"] + -_ItemSlice: TypeAlias = "tuple[int | None, int | str | None]" +_ItemSlice: TypeAlias = ( + "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" +) """Query result scalar selection.""" -_GITHUB_URL = "https://api.github.com/" -_GITHUB_VEGA_DATASETS_URL = f"{_GITHUB_URL}repos/vega/vega-datasets/" -_GITHUB_TAGS_URL = f"{_GITHUB_VEGA_DATASETS_URL}tags" -_GITHUB_TREES_URL = f"{_GITHUB_VEGA_DATASETS_URL}git/trees/" _NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" _SUB_DIR = "data" -_TAGS_MAX_PAGE: Literal[100] = 100 _SEM_VER_FIELDS: tuple[ Literal["major"], 
Literal["minor"], Literal["patch"], Literal["pre_release"] ] = "major", "minor", "patch", "pre_release" @@ -51,6 +74,14 @@ def _is_str(obj: Any) -> TypeIs[str]: return isinstance(obj, str) +class GitHubUrl(NamedTuple): + BASE: LiteralString + RATE: LiteralString + REPO: LiteralString + TAGS: LiteralString + TREES: LiteralString + + class GitHubTag(TypedDict): name: str node_id: str @@ -65,6 +96,14 @@ class ParsedTag(TypedDict): trees_url: str +class ReParsedTag(ParsedTag): + major: int + minor: int + patch: int + pre_release: int | None + is_pre_release: bool + + class GitHubTree(TypedDict): """ A single file's metadata within the response of `Get a tree`_. @@ -97,24 +136,6 @@ class GitHubTreesResponse(TypedDict): truncated: bool -class GitHubBlobResponse(TypedDict): - """ - Response from `Get a blob`_. - - Obtained by following ``GitHubTree["url"]``. - - .. _Get a blob: - https://docs.github.com/en/rest/git/blobs?apiVersion=2022-11-28#get-a-blob - """ - - content: str - sha: str - node_id: str - size: int | None - encoding: str - url: str - - class ParsedTree(TypedDict): file_name: str name_js: str @@ -123,6 +144,11 @@ class ParsedTree(TypedDict): size: int url: str ext_supported: bool + tag: str + + +class QueryTree(ParsedTree, total=False): + name_js: Required[str] class ParsedTreesResponse(TypedDict): @@ -131,64 +157,442 @@ class ParsedTreesResponse(TypedDict): tree: list[ParsedTree] -def _request_github(url: str, /, *, raw: bool = False) -> Request: +class GitHubRateLimit(TypedDict): + limit: int + used: int + remaining: int + reset: int + + +class ParsedRateLimit(GitHubRateLimit): + reset_time: time.struct_time + is_limited: bool + is_auth: bool + + +class GitHubRateLimitResources(TypedDict, total=False): + """ + A subset of response from `Get rate limit status for the authenticated user`_. + + .. _Get rate limit status for the authenticated user: + https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user + """ + + core: Required[GitHubRateLimit] + search: Required[GitHubRateLimit] + graphql: GitHubRateLimit + integration_manifest: GitHubRateLimit + code_search: GitHubRateLimit + + +class _ErrorHandler(urllib.request.BaseHandler): + """ + Adds `rate limit`_ info to a forbidden error. + + .. _rate limit: + https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28 + """ + + def http_error_default( + self, req: Request, fp: IO[bytes] | None, code: int, msg: str, hdrs: Message + ): + if code == 403 and (reset := hdrs.get("X-RateLimit-Reset", None)): + limit = hdrs.get("X-RateLimit-Limit", "") + remaining = hdrs.get("X-RateLimit-Remaining", "") + msg = ( + f"{msg}\n\nFailed to balance rate limit.\n" + f"{limit=}, {remaining=}\n" + f"Reset: {time.localtime(int(reset))!r}" + ) + raise urllib.request.HTTPError(req.full_url, code, msg, hdrs, fp) + + +class _GitHubRequestNamespace: + """ + Fetching resources from the `GitHub API`_. + + .. _GitHub API: + https://docs.github.com/en/rest/about-the-rest-api/about-the-rest-api?apiVersion=2022-11-28 """ - Wrap a request url with a `personal access token`_ - if set as an env var. - By default the endpoint returns json, specify raw to get blob data. - See `Media types`_. 
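+    # The class constants below mirror GitHub's documented behaviour: an
+    # unauthenticated client is capped at 60 core requests per hour, hence the
+    # longer delay and the small trees limit. An illustrative way to lift the
+    # cap is exporting a token before running, e.g.
+    #   VEGA_GITHUB_TOKEN=<personal access token> python tools/vendor_datasets.py
+    # (the command line is a sketch; only the env var name comes from this module).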
+ _ENV_VAR: LiteralString = "VEGA_GITHUB_TOKEN" + _TAGS_MAX_PAGE: Literal[100] = 100 + _VERSION: LiteralString = "2022-11-28" + _UNAUTH_RATE_LIMIT: Literal[60] = 60 + _TAGS_COST: Literal[1] = 1 + _TREES_COST: Literal[2] = 2 + _UNAUTH_DELAY: Literal[5] = 5 + _AUTH_DELAY: Literal[1] = 1 + _UNAUTH_TREES_LIMIT: Literal[10] = 10 + + def __init__(self, gh: _GitHub, /) -> None: + self._gh = gh + + @property + def url(self) -> GitHubUrl: + return self._gh.url + + def rate_limit(self) -> GitHubRateLimitResources: + """https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user.""" + with self._gh._opener.open(self._request(self.url.RATE)) as response: + content: GitHubRateLimitResources = json.load(response)["resources"] + return content + + def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: + """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" + if n < 1 or n > self._TAGS_MAX_PAGE: + raise ValueError(n) + req = self._request(f"{self.url.TAGS}?per_page={n}") + with self._gh._opener.open(req) as response: + content: list[GitHubTag] = json.load(response) + if warn_lower and len(content) < n: + earliest = response[-1]["name"] + n_response = len(content) + msg = f"Requested {n=} tags, but got {n_response}\n" f"{earliest=}" + warnings.warn(msg, stacklevel=3) + return content + + def trees(self, tag: str | ParsedTag, /) -> GitHubTreesResponse: + """ + For a given ``tag``, perform **2x requests** to get directory metadata. + + Returns response unchanged - but with annotations. + """ + if _is_str(tag): + url = tag if tag.startswith(self.url.TREES) else f"{self.url.TREES}{tag}" + else: + url = tag["trees_url"] + with self._gh._opener.open(self._request(url)) as response: + content: GitHubTreesResponse = json.load(response) + query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) + if data_url := next(query, None): + with self._gh._opener.open(self._request(data_url)) as response: + data_dir: GitHubTreesResponse = json.load(response) + return data_dir + else: + raise FileNotFoundError + + def _request(self, url: str, /, *, raw: bool = False) -> Request: + """ + Wrap a request url with a `personal access token`_ - if set as an env var. + + By default the endpoint returns json, specify raw to get blob data. + See `Media types`_. - .. _personal access token: + .. _personal access token: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens - .. _Media types: + .. _Media types: https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types + """ + headers: MutableMapping[str, str] = {"X-GitHub-Api-Version": self._VERSION} + if tok := os.environ.get(self._ENV_VAR): + headers["Authorization"] = ( + tok if tok.startswith("Bearer ") else f"Bearer {tok}" + ) + if raw: + headers["Accept"] = "application/vnd.github.raw+json" + return urllib.request.Request(url, headers=headers) + + +class _GitHubParseNamespace: + """ + Transform responses into intermediate representations. 
+ + Where relevant: + - Adding cheap to compute metadata + - Dropping information that we don't need for the task """ - headers = {} - if tok := os.environ.get("VEGA_GITHUB_TOKEN"): - headers["Authorization"] = tok - if raw: - headers["Accept"] = "application/vnd.github.raw+json" - return Request(url, headers=headers) + def __init__(self, gh: _GitHub, /) -> None: + self._gh = gh -def _request_trees(tag: str | Any, /) -> GitHubTreesResponse: + @property + def url(self) -> GitHubUrl: + return self._gh.url + + def rate_limit(self, rate_limit: GitHubRateLimitResources, /) -> ParsedRateLimit: + core = rate_limit["core"] + reset = core["reset"] + return ParsedRateLimit( + **core, + reset_time=time.localtime(reset), + is_limited=core["remaining"] == 0, + is_auth=core["limit"] > self._gh.req._UNAUTH_RATE_LIMIT, + ) + + def tag(self, tag: GitHubTag, /) -> ParsedTag: + sha = tag["commit"]["sha"] + return ParsedTag(tag=tag["name"], sha=sha, trees_url=f"{self.url.TREES}{sha}") + + def tags(self, tags: list[GitHubTag], /) -> list[ParsedTag]: + return [self.tag(t) for t in tags] + + def tree(self, tree: GitHubTree, tag: str, /) -> ParsedTree: + """For a single tree (file) convert to an IR with only relevant properties.""" + path = Path(tree["path"]) + return ParsedTree( + file_name=path.name, + name_js=path.stem, + name_py=_js_to_py(path.stem), + suffix=path.suffix, + size=tree["size"], + url=tree["url"], + ext_supported=is_ext_supported(path.suffix), + tag=tag, + ) + + def trees(self, tree: GitHubTreesResponse, /, tag: str) -> list[ParsedTree]: + """For a tree response (directory of files) convert to an IR with only relevant properties.""" + return [self.tree(t, tag) for t in tree["tree"]] + + +class _GitHubQueryNamespace: + """**WIP** Interfacing with the cached metadata.""" + + def __init__(self, gh: _GitHub, /) -> None: + self._gh = gh + + @property + def paths(self) -> dict[_PathName, Path]: + return self._gh._paths + + def url_from( + self, + *predicates: OneOrSeq[str | pl.Expr], + item: _ItemSlice = (0, "url_npm"), + **constraints: Any, + ) -> str: + """Querying multi-version trees metadata for `npm` url to fetch.""" + fp = self.paths["trees"] + if fp.suffix != ".parquet": + raise NotImplementedError(fp.suffix) + items = pl.scan_parquet(fp).filter(*predicates, **constraints).collect() + if items.is_empty(): + msg = f"Found no results for:\n" f"{predicates!r}\n{constraints!r}" + raise NotImplementedError(msg) + r = items.item(*item) + if _is_str(r): + return r + else: + msg = f"Expected 'str' but got {type(r).__name__!r} from {r!r}." + raise TypeError(msg) + + +class _GitHub: """ - For a given ``tag``, perform 2x requests to get directory metadata. + Primary interface with the GitHub API. + + Maintains up-to-date metadata, describing **every** available dataset across **all known** releases. + + - Uses `tags`_, `trees`_, `rate_limit`_ endpoints. + - Organizes distinct groups of operations into property accessor namespaces. + + + .. _tags: + https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags + .. _trees: + https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree + .. _rate_limit: + https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user - Returns response unchanged - but with annotations. 
""" - if _is_str(tag): - url = tag if tag.startswith(_GITHUB_TREES_URL) else f"{_GITHUB_TREES_URL}{tag}" - else: - url = tag["trees_url"] - with urlopen(_request_github(url)) as response: - content: GitHubTreesResponse = json.load(response) - query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) - if data_url := next(query, None): - with urlopen(_request_github(data_url)) as response: - data_dir: GitHubTreesResponse = json.load(response) - return data_dir - else: - raise FileNotFoundError + _opener: ClassVar[OpenerDirector] = urllib.request.build_opener(_ErrorHandler) + + def __init__( + self, + output_dir: Path, + name_tags: str, + name_trees: str, + *, + write_schema: bool, + base_url: LiteralString = "https://api.github.com/", + ) -> None: + # When ``write_schema``, addtional ``...-schema.json`` file(s) are produced + # that describes column types - in a non-binary format. + self._write_schema: bool = write_schema + output_dir.mkdir(exist_ok=True) + self._paths: dict[_PathName, Path] = { + "dir": output_dir, + "tags": output_dir / f"{name_tags}.parquet", + "trees": output_dir / f"{name_trees}.parquet", + } + repo = f"{base_url}repos/vega/vega-datasets/" + self._url = GitHubUrl( + BASE=base_url, + RATE=f"{base_url}rate_limit", + REPO=repo, + TAGS=f"{repo}tags", + TREES=f"{repo}git/trees/", + ) -def _request_tags(n: int = 30, *, warn_lower: bool) -> list[GitHubTag]: - """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" - if n < 1 or n > _TAGS_MAX_PAGE: - raise ValueError(n) - with urlopen(_request_github(f"{_GITHUB_TAGS_URL}?per_page={n}")) as response: - content: list[GitHubTag] = json.load(response) - if warn_lower and len(content) < n: - earliest = response[-1]["name"] - n_response = len(content) - msg = f"Requested {n=} tags, but got {n_response}\n" f"{earliest=}" - warnings.warn(msg, stacklevel=3) - return content + @property + def req(self) -> _GitHubRequestNamespace: + return _GitHubRequestNamespace(self) + @property + def parse(self) -> _GitHubParseNamespace: + return _GitHubParseNamespace(self) -def _parse_tag(tag: GitHubTag, /) -> ParsedTag: - sha = tag["commit"]["sha"] - return ParsedTag(tag=tag["name"], sha=sha, trees_url=f"{_GITHUB_TREES_URL}{sha}") + @property + def query(self) -> _GitHubQueryNamespace: + return _GitHubQueryNamespace(self) + + @property + def url(self) -> GitHubUrl: + return self._url + + def rate_limit(self) -> ParsedRateLimit: + return self.parse.rate_limit(self.req.rate_limit()) + + def tags(self, n_head: int | None, *, warn_lower: bool = False) -> pl.DataFrame: + tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) + return pl.DataFrame(self.parse.tags(tags)).pipe(_with_sem_ver) + + def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: + """Retrieve directory info for a given version ``tag``.""" + trees = self.req.trees(tag) + tag_v = _tag_from(tag) if _is_str(tag) else tag["tag"] + parsed = self.parse.trees(trees, tag=tag_v) + df = ( + pl.DataFrame(parsed) + .lazy() + .rename({"url": "url_github"}) + .with_columns(name_collision=pl.col("name_py").is_duplicated()) + .with_columns( + url_npm=pl.concat_str( + pl.lit(_NPM_BASE_URL), + pl.col("tag"), + pl.lit(f"/{_SUB_DIR}/"), + pl.col("file_name"), + ) + ) + .collect() + ) + return df.select(*sorted(df.columns)) + + def refresh( + self, fp_tags: Path | None = None, fp_trees: Path | None = None + ) -> pl.DataFrame: + """ + Use known tags to discover and update missing trees metadata. 
+ + Aims to stay well-within API rate limits, both for authenticated ad unauthenticated users. + """ + rate_limit = self.rate_limit() + if rate_limit["is_limited"]: + raise NotImplementedError(rate_limit) + fp_tags = fp_tags or self._paths["tags"] + fp_trees = fp_trees or self._paths["trees"] + IS_AUTH = rate_limit["is_auth"] + UNAUTH_LIMIT = self.req._UNAUTH_TREES_LIMIT + + tags = ( + self._refresh_tags(fp_tags) + if IS_AUTH or rate_limit["remaining"] > UNAUTH_LIMIT + else pl.read_parquet(fp_tags) + ) + trees = pl.read_parquet(fp_trees) + + missing_trees = tags.join( + trees.select(pl.col("tag").unique()), on="tag", how="anti" + ) + if missing_trees.is_empty(): + print(f"Already up-to-date {fp_trees!s}") + return trees + else: + missing = ( + ReParsedTag(**row) + for row in islice( + missing_trees.iter_rows(named=True), + None if IS_AUTH else UNAUTH_LIMIT, + ) + ) + fresh_rows = self._trees_batched(missing) + print( + f"Finished collection.\n" + f"Writing {fresh_rows.height} new rows to {fp_trees!s}" + ) + refreshed = pl.concat((trees, fresh_rows)).pipe(_sort_sem_ver) + _write_parquet(refreshed, fp_trees, write_schema=self._write_schema) + return refreshed + + def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: + rate_limit = self.rate_limit() + if rate_limit["is_limited"]: + raise NotImplementedError(rate_limit) + elif not isinstance(tags, Sequence): + tags = tuple(tags) + req = self.req + n = len(tags) + cost = req._TREES_COST * n + if rate_limit["remaining"] < cost: + raise NotImplementedError(rate_limit, cost) + delay_secs = req._AUTH_DELAY if rate_limit["is_auth"] else req._UNAUTH_DELAY + print( + f"Collecting metadata for {n} missing releases.\n" + f"Using {delay_secs=} between requests ..." + ) + dfs: list[pl.DataFrame] = [] + for tag in tags: + time.sleep(delay_secs + random.triangular()) + dfs.append(self.trees(tag)) + return pl.concat(dfs) + + def _refresh_tags( + self, fp: Path | None = None, *, limit_new: int | None = None + ) -> pl.DataFrame: + n_new_tags: int = 0 + fp = fp or self._paths["tags"] + if not fp.exists(): + print(f"Initializing {fp!s}") + tags = self.tags(limit_new) + n_new_tags = tags.height + else: + print("Checking for new tags") + prev = pl.scan_parquet(fp) + curr_latest = self.tags(1) + if curr_latest.equals(prev.pipe(_sort_sem_ver).head(1).collect()): + print(f"Already up-to-date {fp!s}") + return prev.collect() + else: + print(f"Refreshing {fp!s}") + prev_eager = prev.collect() + tags = ( + pl.concat((self.tags(limit_new), prev_eager), how="vertical") + .unique("sha") + .pipe(_sort_sem_ver) + ) + n_new_tags = tags.height - prev_eager.height + print(f"Collected {n_new_tags} new tags") + _write_parquet(tags, fp, write_schema=self._write_schema) + return tags + + +GitHub = _GitHub( + Path(__file__).parent / "_vega_datasets_data", + name_trees="metadata_full", + name_tags="tags", + write_schema=True, +) + +####################################################################################### + + +def _tag_from(s: str, /) -> str: + # - Actual tag + # - Trees url (using ref name) + # - npm url (works w/o the `v` prefix) + trees_url = GitHub.url.TREES + if s.startswith("v"): + return s + elif s.startswith(trees_url): + return s.replace(trees_url, "") + elif s.startswith(_NPM_BASE_URL): + s, _ = s.replace(_NPM_BASE_URL, "").split("/") + return s if s.startswith("v") else f"v{s}" + else: + raise TypeError(s) def _with_sem_ver(df: pl.DataFrame, *, col_tag: str = "tag") -> pl.DataFrame: @@ -216,64 +620,9 @@ def _with_sem_ver(df: 
pl.DataFrame, *, col_tag: str = "tag") -> pl.DataFrame: ) -def request_tags_to_df(n_head: int | None, *, warn_lower: bool = False) -> pl.DataFrame: - response = _request_tags(n=n_head or _TAGS_MAX_PAGE, warn_lower=warn_lower) - return pl.DataFrame([_parse_tag(tag) for tag in response]).pipe(_with_sem_ver) - - -def _parse_tree(tree: GitHubTree, /) -> ParsedTree: - """For a single tree (file) convert to an IR with only relevant properties.""" - path = Path(tree["path"]) - return ParsedTree( - file_name=path.name, - name_js=path.stem, - name_py=_js_to_py(path.stem), - suffix=path.suffix, - size=tree["size"], - url=tree["url"], - ext_supported=is_ext_supported(path.suffix), - ) - - -def _parse_trees_response( - tree: GitHubTreesResponse, /, tag: str -) -> ParsedTreesResponse: - """For a tree response (directory of files) convert to an IR with only relevant properties.""" - return ParsedTreesResponse( - tag=tag, url=tree["url"], tree=[_parse_tree(t) for t in tree["tree"]] - ) - - -def request_trees_to_df(tag: str, /) -> pl.DataFrame: - response = _request_trees(tag) - parsed = _parse_trees_response(response, tag=tag) - df = ( - pl.DataFrame(parsed["tree"]) - .lazy() - .rename({"url": "url_github"}) - .with_columns(name_collision=pl.col("name_py").is_duplicated(), tag=pl.lit(tag)) - .with_columns( - url_npm=pl.concat_str( - pl.lit(_NPM_BASE_URL), - pl.col("tag"), - pl.lit(f"/{_SUB_DIR}/"), - pl.col("file_name"), - ) - ) - .collect() - ) - return df.select(*sorted(df.columns)) - - -def request_trees_to_df_batched(*tags: str, delay: int = 5) -> pl.DataFrame: - import random - import time - - dfs: list[pl.DataFrame] = [] - for tag in tags: - time.sleep(delay + random.triangular()) - dfs.append(request_trees_to_df(tag)) - return pl.concat(dfs) +def _sort_sem_ver(frame: _Frame, /) -> _Frame: + """Sort ``frame``, displaying in descending release order.""" + return frame.sort(_SEM_VER_FIELDS, descending=True) def _write_parquet( @@ -298,71 +647,6 @@ def _write_parquet( json.dump(schema, f, indent=2) -def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> None: - """ - Retrieve directory info for a given version ``tag``, writing to ``fp``. - - When ``write_schema``, an addtional ``...-schema.json`` file is produced - that describes the metadata columns. - """ - metadata = request_trees_to_df(tag) - _write_parquet(metadata, fp, write_schema=write_schema) - - -def collect_tags( - n_head: int | None, fp: Path, *, warn_lower: bool = False, write_schema: bool = True -): - tags = request_tags_to_df(n_head, warn_lower=warn_lower) - _write_parquet(tags, fp, write_schema=write_schema) - - -def refresh_tags(fp: Path, *, limit_new: int = 10) -> pl.DataFrame: - if fp.exists(): - print("Checking for new tags") - prev = pl.read_parquet(fp) - prev_latest = prev.sort(_SEM_VER_FIELDS, descending=True).head(1) - curr_latest = request_tags_to_df(1) - if curr_latest.equals(prev_latest): - print(f"Already up-to-date {fp!s}") - return prev - else: - # Work out how far behind? 
- print(f"Refreshing {fp!s}") - fresh = ( - pl.concat((request_tags_to_df(limit_new), prev), how="vertical") - .unique("sha") - .sort(_SEM_VER_FIELDS, descending=True) - ) - _write_parquet(fresh, fp, write_schema=True) - print(f"Collected {fresh.height - prev.height} new tags") - return fresh - else: - print(f"Initializing {fp!s}") - collect_tags(None, fp) - return pl.read_parquet(fp) - - -def url_from( - fp: Path, - *predicates: OneOrSeq[str | pl.Expr], - item: _ItemSlice = (0, "url_npm"), - **constraints: Any, -) -> str: - """Querying multi-version trees metadata for `npm` url to fetch.""" - if fp.suffix != ".parquet": - raise NotImplementedError(fp.suffix) - items = pl.scan_parquet(fp).filter(*predicates, **constraints).collect() - if items.is_empty(): - msg = f"Found no results for:\n" f"{predicates!r}\n{constraints!r}" - raise NotImplementedError(msg) - r = items.item(*item) - if _is_str(r): - return r - else: - msg = f"Expected 'str' but got {type(r).__name__!r} from {r!r}." - raise TypeError(msg) - - # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago From 31eeb2042a6cfae6c2ca95874797b61e339e41d8 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 11 Oct 2024 17:26:47 +0100 Subject: [PATCH 018/201] feat(DRAFT): Partial implement `data("name")` --- tools/vendor_datasets.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 6bb0d0216..d02ef5130 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -824,5 +824,38 @@ def __getattr__(self, name: str) -> Dataset: def __dir__(self) -> list[str]: return self.list_datasets() + # BUG: # 1.6.0 exists on GH but not npm? + def __call__( + self, + name: str, + ext: ExtSupported | None = None, + /, + tag: LiteralString | Literal["latest"] | None = None, + ): + """ + **WIP** Will be using this *instead of* attribute access. + + - Original supports this as well + - Will only be using the actual (js_name) + - Some have hyphens, others underscores + """ + constraints: dict[Literal["tag", "suffix"], str] = {} + if tag == "latest": + raise NotImplementedError(tag) + elif tag is not None: + constraints["tag"] = tag + if name.endswith(get_args(ExtSupported)): + name, suffix = name.rsplit(".", maxsplit=1) + suffix = "." 
+ suffix + else: + suffix = ext + if suffix is not None: + if not is_ext_supported(suffix): + raise TypeError(suffix) + else: + constraints["suffix"] = suffix + q = QueryTree(name_js=name, **constraints) + return GitHub.query.url_from(**q) + data = DataLoader() From 511a8455f9caa285a7220bf989f6d607a704f070 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 11 Oct 2024 17:28:01 +0100 Subject: [PATCH 019/201] fix(typing): Resolve some `mypy` errors --- tools/vendor_datasets.py | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index d02ef5130..2c0f47a90 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -10,6 +10,7 @@ import json import os import random +import sys import tempfile import time import urllib.request @@ -24,18 +25,23 @@ Callable, ClassVar, Iterable, + Iterator, Literal, NamedTuple, Sequence, - TypedDict, + cast, get_args, ) from urllib.request import urlopen import polars as pl +if sys.version_info >= (3, 14): + from typing import TypedDict +else: + from typing_extensions import TypedDict + if TYPE_CHECKING: - import sys from email.message import Message from typing import MutableMapping, TypeVar from urllib.request import OpenerDirector, Request @@ -147,8 +153,15 @@ class ParsedTree(TypedDict): tag: str -class QueryTree(ParsedTree, total=False): +class QueryTree(TypedDict, total=False): + file_name: str name_js: Required[str] + name_py: str + suffix: str + size: int + url: str + ext_supported: bool + tag: str class ParsedTreesResponse(TypedDict): @@ -501,13 +514,10 @@ def refresh( print(f"Already up-to-date {fp_trees!s}") return trees else: - missing = ( - ReParsedTag(**row) - for row in islice( - missing_trees.iter_rows(named=True), - None if IS_AUTH else UNAUTH_LIMIT, - ) + it = islice( + missing_trees.iter_rows(named=True), None if IS_AUTH else UNAUTH_LIMIT ) + missing = cast("Iterator[ReParsedTag]", it) fresh_rows = self._trees_batched(missing) print( f"Finished collection.\n" @@ -847,14 +857,16 @@ def __call__( if name.endswith(get_args(ExtSupported)): name, suffix = name.rsplit(".", maxsplit=1) suffix = "." 
+ suffix - else: - suffix = ext - if suffix is not None: if not is_ext_supported(suffix): raise TypeError(suffix) else: constraints["suffix"] = suffix - q = QueryTree(name_js=name, **constraints) + elif ext is not None: + if not is_ext_supported(ext): + raise TypeError(ext) + else: + constraints["suffix"] = ext + q = QueryTree(name_js=name, **constraints) # type: ignore[typeddict-item] return GitHub.query.url_from(**q) From a770ba9247300809cc18c6a6863cb38c0c7819f5 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 24 Oct 2024 09:27:35 +0100 Subject: [PATCH 020/201] fix(ruff): Apply `3.8` fixes https://github.com/vega/altair/actions/runs/11495437283/job/31994955413 --- tools/vendor_datasets.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 2c0f47a90..dc31cc61e 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -15,6 +15,7 @@ import time import urllib.request import warnings +from collections.abc import Iterable, Iterator, Sequence from functools import cached_property, partial from itertools import islice from pathlib import Path @@ -24,11 +25,8 @@ Any, Callable, ClassVar, - Iterable, - Iterator, Literal, NamedTuple, - Sequence, cast, get_args, ) @@ -42,8 +40,9 @@ from typing_extensions import TypedDict if TYPE_CHECKING: + from collections.abc import MutableMapping from email.message import Message - from typing import MutableMapping, TypeVar + from typing import TypeVar from urllib.request import OpenerDirector, Request if sys.version_info >= (3, 13): From 686a48599f86cffb49549d72e697c88aa4440d45 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 24 Oct 2024 09:31:28 +0100 Subject: [PATCH 021/201] docs(typing): Add `WorkInProgress` marker to `data(...)` - Still undecided exactly how this functionality should work - Need to resolve `npm` tags != `gh` tags issue as well --- tools/vendor_datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index dc31cc61e..ad8debbc5 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -61,6 +61,7 @@ _Frame = TypeVar("_Frame", pl.DataFrame, pl.LazyFrame) _PathName: TypeAlias = Literal["dir", "tags", "trees"] + WorkInProgress: TypeAlias = Any _ItemSlice: TypeAlias = ( @@ -840,7 +841,7 @@ def __call__( ext: ExtSupported | None = None, /, tag: LiteralString | Literal["latest"] | None = None, - ): + ) -> WorkInProgress: """ **WIP** Will be using this *instead of* attribute access. 
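For reference, the suffix handling performed by the draft ``data(...)`` call in the patches above can be read as the standalone sketch below. It is illustrative only and not part of any patch in this series; the helper name is invented, and ``ExtSupported`` is assumed to be the ``Literal[".csv", ".json", ".tsv"]`` alias defined near the top of ``vendor_datasets.py``.

from __future__ import annotations

from typing import Literal, get_args

ExtSupported = Literal[".csv", ".json", ".tsv"]


def split_dataset_request(name: str, ext: str | None = None) -> tuple[str, str | None]:
    """Return ``(name_js, suffix)``, preferring a suffix embedded in ``name``."""
    if name.endswith(get_args(ExtSupported)):
        # "cars.json" -> ("cars", ".json"); an embedded suffix wins over ``ext``.
        stem, end = name.rsplit(".", maxsplit=1)
        return stem, f".{end}"
    if ext is not None and ext not in get_args(ExtSupported):
        # Mirrors the ``TypeError`` raised in ``DataLoader.__call__`` for unsupported extensions.
        raise TypeError(ext)
    return name, ext


# split_dataset_request("cars.json")          -> ("cars", ".json")
# split_dataset_request("flights-2k", ".csv") -> ("flights-2k", ".csv")
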
From 0bbf2e9ec2ff2f1d79b4d4f68128625daab2d947 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 5 Nov 2024 19:42:18 +0000 Subject: [PATCH 022/201] feat(DRAFT): Add a source for available `npm` versions --- .../_vega_datasets_data/tags_npm-schema.json | 9 +++ tools/_vega_datasets_data/tags_npm.parquet | Bin 0 -> 3114 bytes tools/vendor_datasets.py | 56 +++++++++++++++++- 3 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 tools/_vega_datasets_data/tags_npm-schema.json create mode 100644 tools/_vega_datasets_data/tags_npm.parquet diff --git a/tools/_vega_datasets_data/tags_npm-schema.json b/tools/_vega_datasets_data/tags_npm-schema.json new file mode 100644 index 000000000..8de9881a0 --- /dev/null +++ b/tools/_vega_datasets_data/tags_npm-schema.json @@ -0,0 +1,9 @@ +{ + "tag": "str", + "major": "int", + "minor": "int", + "patch": "int", + "pre_release": "int", + "is_pre_release": "bool", + "v_tag": "str" +} \ No newline at end of file diff --git a/tools/_vega_datasets_data/tags_npm.parquet b/tools/_vega_datasets_data/tags_npm.parquet new file mode 100644 index 0000000000000000000000000000000000000000..38be9c271c7638490835298d6fff114c9e921a7c GIT binary patch literal 3114 zcmcIndrVVT96q-%Dim#X=XkHF6Hz;u2nCUqh3PI2d03=WMIJM}+O?1(q!28~W<*>z zgU?i>lWc@Jooqvw>9UZ>7*j^ns4+U3n^QM3G`<-7V;ZO3@7%Un1QyKZO?%Jp+}}Ch z?>pZ;=WA-8PN}`NwMf6SB`dMBtx24H;>-SBa}0yOvR&I#4rYah zZpfNk+*0s>T#UMH`63h}T!HJEFd(SJGAvWapb%+Pcx? z!7V^CCC7@Z3N2Vc9OUN!vQ46q$Bj}BL<_>R_0V`rE7K7Y@yz;}uE!Dtj0p}IMp#aH z(TkNc?ttE@+Cc$1FcmruhKXVpBP{U{k(L#fmfHm?Ee!dZ(F{r`mr;>`$$%J}^mySA zqxRvQ4*Ct8MU|?K;?ZX+* z@bF~io`5z2W9@pc&QjG4^|G|kkDbOIHkjr+AO5noKhIKk@7Q0L(>%3q?moY^-)C7R zt3Au=kdosfjnhw9)@bEmSp_&sj}=3!(ryH?#NvkdV5RH+ck9+Y`f~=ATZhwzlb2F5 zaE&CdzV4M|j_ z#37=(oe0%$5O0j1sd7CywdnD{)qEF2scNv!hpH(=E^{*-h3b?&2>C&F!t3}xqeEgT zx@PxBPZ$PN*v`39ws^0U8 z<5L$S4$C-pVo7EG(*?6hsitodmBj-CLj3KIKO1Hz9gJCmin>+G<~vROeXgPfcb4q2 zt{{(hEZty~9XJFYl6vKvwnKS(QfaWid~9{{n6EM%Lt1BF^h- z9P4&Dke?tVWQM$hV~1?OH|^O;2-4o;G)CIHiN{EL_asf5+_f1%Wf-6FUZsGrTRTrq zO-_sOX$CkWPE8I=ajhqG%JFgU<1GQW$s6|t;!Q*GBwXqlJ}?EJ6CPs`*O(%)mzP)J zi3ls^hy;(GgY;@BfhEv)TctOq{^E&+BkYcS4-1?_nDx|#c0;>gl?R+SI5k(N*)35& z)6WBGS0fP3}sEEhl@a4ft0=(TEIM#7!oLhI}S08%QQuV4<|lq&f(zrwt!V*QXUi$O==_ zC>4Ov(|&PEnk{>Nnyn}~%UT?@!p*-{kV3do;($sk{tN%*P$X6u38Ub$@Clhzc#B7$ zc&jOSX*taZ`uyMc$&nRBWwABS5#L(qi~k??i?D@KjMm&W{A-An8Om*i_DZe}ymaIT Mt%ctS1N=+)2T| TypeIs[str]: @@ -142,6 +144,30 @@ class GitHubTreesResponse(TypedDict): truncated: bool +class NpmVersion(TypedDict): + version: str + links: dict[Literal["self", "entrypoints", "stats"], str] + + +class NpmPackageMetadataResponse(TypedDict): + """ + Response from `Get package metadata`_. + + Using: + + headers={"Accept": "application/json"} + + .. _Get package metadata: + https://data.jsdelivr.com/v1/packages/npm/vega-datasets + """ + + type: str + name: str + tags: dict[Literal["canary", "next", "latest"], str] + versions: list[NpmVersion] + links: dict[Literal["stats"], str] + + class ParsedTree(TypedDict): file_name: str name_js: str @@ -589,6 +615,31 @@ def _refresh_tags( ####################################################################################### +def _npm_metadata(*args: WorkInProgress) -> pl.DataFrame: + """ + Request, parse npm tags metadata. 
+ + Notes + ----- + - Ignores canary releases + - Github tag is stored as `"v_tag"` + - npm tag is `"tag"` + """ + req = urllib.request.Request( + _NPM_METADATA_URL, headers={"Accept": "application/json"} + ) + with urllib.request.urlopen(req) as response: + content: NpmPackageMetadataResponse = json.load(response) + versions = [ + v["version"] for v in content["versions"] if _CANARY not in v["version"] + ] + return ( + pl.DataFrame({"tag": versions}) + .pipe(_with_sem_ver) + .with_columns(v_tag=pl.concat_str(pl.lit("v"), pl.col("tag"))) + ) + + def _tag_from(s: str, /) -> str: # - Actual tag # - Trees url (using ref name) @@ -614,10 +665,10 @@ def _with_sem_ver(df: pl.DataFrame, *, col_tag: str = "tag") -> pl.DataFrame: """ fields = pl.col(_SEM_VER_FIELDS) pattern = r"""(?x) - v(?[[:digit:]]*)\. + v?(?[[:digit:]]*)\. (?[[:digit:]]*)\. (?[[:digit:]]*) - (\-next\.)? + (\-(next)?(beta)?\.)? (?[[:digit:]]*)? """ sem_ver = pl.col(col_tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) @@ -835,6 +886,7 @@ def __dir__(self) -> list[str]: return self.list_datasets() # BUG: # 1.6.0 exists on GH but not npm? + # https://www.jsdelivr.com/docs/data.jsdelivr.com#overview def __call__( self, name: str, From 9c386e26515b23b0bccbe5505ed9c9bbcb05b96c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 10:41:23 +0000 Subject: [PATCH 023/201] refactor: Bake `"v"` prefix into `tags_npm` --- .../_vega_datasets_data/tags_npm-schema.json | 3 +-- tools/_vega_datasets_data/tags_npm.parquet | Bin 3114 -> 2596 bytes tools/vendor_datasets.py | 16 ++++++++-------- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/tools/_vega_datasets_data/tags_npm-schema.json b/tools/_vega_datasets_data/tags_npm-schema.json index 8de9881a0..90ea9d52e 100644 --- a/tools/_vega_datasets_data/tags_npm-schema.json +++ b/tools/_vega_datasets_data/tags_npm-schema.json @@ -4,6 +4,5 @@ "minor": "int", "patch": "int", "pre_release": "int", - "is_pre_release": "bool", - "v_tag": "str" + "is_pre_release": "bool" } \ No newline at end of file diff --git a/tools/_vega_datasets_data/tags_npm.parquet b/tools/_vega_datasets_data/tags_npm.parquet index 38be9c271c7638490835298d6fff114c9e921a7c..d2e9a34b78eef3da66b7b70e82ed4a6dcf0a5502 100644 GIT binary patch delta 834 zcmaJ=O=uHA6rN2s+ikKT*6a+sWoxmAP!RK1tyP-f#G0f<+a{?^Q@kV_n^HFho7#pb z>dAwkmjMwefL^$SLDORvT8sv*Yo{0f@bQldvtI^=uU|8!~Wf9@7UIt804b^?+64Tq#I=0g_{S* zmW_a;-O#N-OEj>epGMLVUHgAK zkzgbo3?oDys8}OgJ$M8K+LD3A1GlSMT&gYG<7f$_Ed=b=bsu@IVnTlO;5Z6=>o%C> z= zT|XW`+pJ6O{z(>yF0?l+p{7W+Ksd~nbld(0uS~1 ze6l&8VIgljoZkettsV`Hv#!)gV_u;81HCs404xK d$s3f$Vn~ywYgTc&u3H8|e|_kah!FfZzW{02$58+P delta 1411 zcmbtUZAep57(REqyAyLZt8=<{-9QV6LfNKPla_IuwiT9}MAix=x}-Od)|-dc|W+fdr?#j zIdQ2O^6*$9yrwFKK?nfIxVLJuvW?rqgVN(EDYw2h+zZ-ntk_%nJU0>^i3v{{^}^N5 zeUIhv*oL8!_iZ}|@RM-KuDdsb-v(C-o!%F%(cVd4-&2Fs=jk~8?1G5{Y1STn|J%Dl z7c?$SEv#&R)ZJy#n?9YcF1B@EaToUYM%13icfOoYcTRj3V`m-L-I`~wCObB2DY~xnY3iVKcg1iP~XtJ~NvP`0I_G!h!ck|F0-k4zh%}gv@t9pXBk^m=? 
z*W3ejSZgI|Dv|W4I@Ga5EBQcmm--YJCLh)Hx{>)Tnd3x`j^H+rIE$uf_rWS3OhQ34 zoP;j%#%n1oL};4g)i#;4%%&_808)xd5{0RoBx=Y${vz4Q%e!PWpn?M&qJWZ&ZfRi4 zv>Hx;kK*Qzc`y>!M|AMyPklbDn+_je2wy2?sFKoXlBFZY3%m@Iq}4#sOQJg7#UF+gZvjOmD57ozbAd4`$RxrU;|8#8e*>b|!F985J&EunE&d`YysY#jeC* z>};V)7Mf3849Zcyr^ddqo`ooQWwQYp?wkM%Ss8YJizla`x!j%CT)sW%-{INn$C#W+ zaD|E)iPy2&@5$NhueQ5uz1d|`q%BJQieO~430Cu8IJeOz@{~v#MUM~LTx=-5>81*5 zYwTMC%%e1=;D=^R>Hi4q=7#G1dCfBk{(&E2(Kx<_e{^6s*H-;bFncR=Ub8~!D6L&t ceh9AzqTmeps~QhTN9Z>>`vHeF0I)yr4{sECDF6Tf diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 342575da9..abc90629f 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -622,8 +622,10 @@ def _npm_metadata(*args: WorkInProgress) -> pl.DataFrame: Notes ----- - Ignores canary releases - - Github tag is stored as `"v_tag"` - - npm tag is `"tag"` + - ``npm`` can accept either, but this endpoint returns without "v": + + {tag} + v{tag} """ req = urllib.request.Request( _NPM_METADATA_URL, headers={"Accept": "application/json"} @@ -631,13 +633,11 @@ def _npm_metadata(*args: WorkInProgress) -> pl.DataFrame: with urllib.request.urlopen(req) as response: content: NpmPackageMetadataResponse = json.load(response) versions = [ - v["version"] for v in content["versions"] if _CANARY not in v["version"] + f"v{version}" + for v in content["versions"] + if (version := v["version"]) and _CANARY not in version ] - return ( - pl.DataFrame({"tag": versions}) - .pipe(_with_sem_ver) - .with_columns(v_tag=pl.concat_str(pl.lit("v"), pl.col("tag"))) - ) + return pl.DataFrame({"tag": versions}).pipe(_with_sem_ver) def _tag_from(s: str, /) -> str: From 1937f2b74df00d2a649ec52879e90ef2fe469cbc Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 11:31:53 +0000 Subject: [PATCH 024/201] refactor: Move `_npm_metadata` into a class --- tools/vendor_datasets.py | 87 +++++++++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 23 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index abc90629f..ed094d0c0 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -90,6 +90,11 @@ class GitHubUrl(NamedTuple): TREES: LiteralString +class NpmUrl(NamedTuple): + CDN: LiteralString + TAGS: LiteralString + + class GitHubTag(TypedDict): name: str node_id: str @@ -446,6 +451,8 @@ def __init__( *, write_schema: bool, base_url: LiteralString = "https://api.github.com/", + org: LiteralString = "vega", + package: LiteralString = "vega-datasets", ) -> None: # When ``write_schema``, addtional ``...-schema.json`` file(s) are produced # that describes column types - in a non-binary format. @@ -456,7 +463,7 @@ def __init__( "tags": output_dir / f"{name_tags}.parquet", "trees": output_dir / f"{name_trees}.parquet", } - repo = f"{base_url}repos/vega/vega-datasets/" + repo = f"{base_url}repos/{org}/{package}/" self._url = GitHubUrl( BASE=base_url, RATE=f"{base_url}rate_limit", @@ -605,8 +612,10 @@ def _refresh_tags( return tags +_root_dir: Path = Path(__file__).parent + GitHub = _GitHub( - Path(__file__).parent / "_vega_datasets_data", + _root_dir / "_vega_datasets_data", name_trees="metadata_full", name_tags="tags", write_schema=True, @@ -615,29 +624,61 @@ def _refresh_tags( ####################################################################################### -def _npm_metadata(*args: WorkInProgress) -> pl.DataFrame: - """ - Request, parse npm tags metadata. 
+class _Npm: + def __init__( + self, + output_dir: Path, + name_tags: str, + *, + write_schema: bool, + jsdelivr: Literal["jsdelivr"] = "jsdelivr", + npm: Literal["npm"] = "npm", + package: LiteralString = "vega-datasets", + jsdelivr_version: LiteralString = "v1", + ) -> None: + self._write_schema: bool = write_schema + output_dir.mkdir(exist_ok=True) + self._paths: dict[Literal["tags"], Path] = { + "tags": output_dir / f"{name_tags}.parquet" + } + self._url: NpmUrl = NpmUrl( + CDN=f"https://cdn.{jsdelivr}.net/{npm}/{package}@", + TAGS=f"https://data.{jsdelivr}.com/{jsdelivr_version}/packages/{npm}/{package}", + ) - Notes - ----- - - Ignores canary releases - - ``npm`` can accept either, but this endpoint returns without "v": + @property + def url(self) -> NpmUrl: + return self._url - {tag} - v{tag} - """ - req = urllib.request.Request( - _NPM_METADATA_URL, headers={"Accept": "application/json"} - ) - with urllib.request.urlopen(req) as response: - content: NpmPackageMetadataResponse = json.load(response) - versions = [ - f"v{version}" - for v in content["versions"] - if (version := v["version"]) and _CANARY not in version - ] - return pl.DataFrame({"tag": versions}).pipe(_with_sem_ver) + def tags(self) -> pl.DataFrame: + """ + Request, parse tags from `Get package metadata`_. + + Notes + ----- + - Ignores canary releases + - ``npm`` can accept either, but this endpoint returns without "v": + + {tag} + v{tag} + + .. _Get package metadata: + https://www.jsdelivr.com/docs/data.jsdelivr.com#get-/v1/packages/npm/-package- + """ + req = urllib.request.Request( + self.url.TAGS, headers={"Accept": "application/json"} + ) + with urllib.request.urlopen(req) as response: + content: NpmPackageMetadataResponse = json.load(response) + versions = [ + f"v{tag}" + for v in content["versions"] + if (tag := v["version"]) and _CANARY not in tag + ] + return pl.DataFrame({"tag": versions}).pipe(_with_sem_ver) + + +Npm = _Npm(_root_dir / "_vega_datasets_data", name_tags="tags_npm", write_schema=True) def _tag_from(s: str, /) -> str: From 66fa6d15cd967a25752e35814a6c3f03ea771487 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 11:35:11 +0000 Subject: [PATCH 025/201] chore: Remove unused, add todo --- tools/vendor_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index ed094d0c0..048ff8771 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -70,7 +70,6 @@ """Query result scalar selection.""" _NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" -_NPM_METADATA_URL = "https://data.jsdelivr.com/v1/packages/npm/vega-datasets" _SUB_DIR = "data" _SEM_VER_FIELDS: tuple[ Literal["major"], Literal["minor"], Literal["patch"], Literal["pre_release"] @@ -595,6 +594,7 @@ def _refresh_tags( print("Checking for new tags") prev = pl.scan_parquet(fp) curr_latest = self.tags(1) + # TODO: Needs a hook for `_npm_metadata()` if curr_latest.equals(prev.pipe(_sort_sem_ver).head(1).collect()): print(f"Already up-to-date {fp!s}") return prev.collect() From 21b2edd0ee1c55ab09e8a31535a3a15f5ab55720 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:19:24 +0000 Subject: [PATCH 026/201] feat: Adds `app` context for github<->npm --- tools/_vega_datasets_data/tags.parquet | Bin 6210 -> 6200 bytes tools/vendor_datasets.py | 222 ++++++++++++++----------- 2 files changed, 127 insertions(+), 95 deletions(-) 
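Before the diff below, a quick orientation: the request performed by ``_Npm.tags()`` (added earlier in this series) reduces to the following sketch. It is illustrative only; the endpoint and ``Accept`` header come from the docstrings above, the canary check is assumed to be a plain substring test (the module uses a ``_CANARY`` constant for this), and the real method returns a polars DataFrame piped through ``_with_sem_ver`` rather than a list.

import json
import urllib.request

TAGS_URL = "https://data.jsdelivr.com/v1/packages/npm/vega-datasets"


def npm_version_tags(url: str = TAGS_URL) -> list[str]:
    """Fetch non-canary vega-datasets versions, prefixed with "v" to line up with GitHub tags."""
    req = urllib.request.Request(url, headers={"Accept": "application/json"})
    with urllib.request.urlopen(req) as response:
        content = json.load(response)
    return [
        f"v{v['version']}"
        for v in content["versions"]
        if "canary" not in v["version"]  # assumed value of the ``_CANARY`` check
    ]
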
diff --git a/tools/_vega_datasets_data/tags.parquet b/tools/_vega_datasets_data/tags.parquet index dc0ff652ed261eebeed70ead42c0f7352ea4e8c3..1cd7b957b3ce87cfa8e80b05548f72c9133b5d1c 100644 GIT binary patch delta 3091 zcmc&%do)!07eB`ghGxbL&M`B_GmY06GhRLDW+?*0Aj+?jG;+I< zaMJ^IM-wU1NECW_Cz0~#Mfg#-x!ve@_|?rH-QW5>{`j4>&RTnY*Jq#a-fQpu*`ITh zS-})}u-CI7G*qMvje{JB3jqLNQf4{D>j0urfcgqNK{6cWMe2|4b!)WI9v%-uU1Xb9 zdoJz3-T!^(ON$pu#f$)J7CT?_L$8+4gpTf|)?ZFtjy6(+0LTa0N=Kp~WDxa4`gA@B zC?ElGK_#4{K6W{u>RLeO3^;fgxp9TQK4(HVz1YJuIUEq28&ouN4q?f*fD(aBLPhbn zKvqAW4lq&LVZ~U?V3L$~uox^Rg9!jg+T&EH6F)cYMM^jwz$_>ehnI*7 zQAQ$CT1itxEnl^(1Ly$SfW)}9&q5uBR|+%^*rB(dp7#Oje02Er{hr4w>KD=JaD#3^ zorj07%`3CetC1-sAujmijXf#%clV!P9{*vxX$iaiVE01%$H5T|thiVjxyx**aW6I7 zFgY-9C7x2*sW#Y1rE#~Uw%^IKy{RfJeZ&jtm zJ+G79#?I>fa6Tlrko<0wE8Y9j#Us{H0FboaHjyJeyECWNCsppXG_*2%$ZLmmiHN(Y zC@Q5cbEH}2#e@EdfS+x&Jnv*l7TXxc-yR12lQIj}*)MzFojck&wEz3nr|@j|Reyx$ z7*!c--uxrb;Qi9Rv4Z@v4H{Tu-Y;2^gaZ_-xQ2Y;8zvNcA?PyE`PC|d?<1$3!F_R# zS(6xTJ)HY#G5_%Mj@H6$=X5DMtC~_A|3AV5 zYUo=3LvzA4lJ9|M-0vHmo)UC_ZnO^ER#T@L80hSN%VDbcu7Y?BlW8=axvJ9bP3hE< zt#FI(uYSz_^R`Rb1LnlDqILZ|<4WO*oboVnuQEksx-$ON79A@)^y}KH@u$`gxFdvZ zORSGXgbyCOJ(~Zd!p!jY1N5Qa)aD<&A zjFpvSm)A70du0`Inl;O`FFFpM$v+R59eb#yWy%UzXV=d>95yZb`_-ZNV9NED(&09T@4dm)H5DXUd}82G{rCeKX2ay!G;Lscad`me=!R z5+VBv@jVP_t+-|Dx{=Wg|DK>^(Bd3%gkY0Qb_qJfx}zr1Y6bN_ssZSo8``KJ2h_|J z3>63pdq92t1fupR#>8A1NF@S*e_m0GACaN@@}Jd|MmT83^+Sf0_8N<7+Xv`XE|aLn#4&;xV>_jhn(otTR+ikwBu(_ z+O*-kX#U!cwsy#=DAOH(13TTR+o$*<#iv77A;=eFYOz1s%y1yl78>p3n#(VqtPij`D(nU)p<3d1Q1JEgbK~mahxaL3u4tx z#uZTnBOW*`bKHwkL{mNTP)Y<|ULG5TLb$H0D7U=#-h^4@_c~?_uWL62t9QIip4_s1 z*Co!0gQfv%x~LT5je&Owmh8sXb0<&j4qY2FJW)$8T)SWY?CB{xcl9S4s?YXFkNlB4 z7Sqzau~NJt8l_$gF{rv6_N2u@ziC%;TDp>P zYXfuTkH7WoLKU2+LU6Qxbhb`}KNSA2S zgMAH$e&4BbsJ_x`Vw2|J!`rA|mSfKUoAf{viT=OzVEwP@p+&U++XNw;Ll9x!Q+FYR zBtBvjYYCM3nIC0Z`%rAGv;5PiDNqknTQ6Y6vq{ckoxL?YR>^}>5if+kuTH>QT5A8k zB{7Z9qLY+rr!Cz`x+)%+Y|I?m5Vt6Hdi?jip%N7#6h`_-M}*9>$GMfTM3W{HYtMTJ za>Ycfg&`LNjX)-{us`p2#Zdm%u!vc30SH%sv_>%xtLub<{5TNvi-gga*@D1Cjgrcd zYb;gAah$Q|CsCoNi+t1OH4CCR2F_@SvQ_{%jzddvNGcr24-p>X2CT(0E~tpRecTCS zkW6seM*w?%yij(`-E@B3#nM7`tCX3uIe`Rk`y9wMG+d-W8(}$!o3SLZ8;*{|9*eWE zmW|Vk;J1o164!>K}wJk)s7=3 zru`8vWT$MV9qv^SKoy`2xRV z3my4lzg_qzWKfJJW(l}tD)|uYE{ggeiZQs$2_nG-_Ru1L)(y?^{?<;hgvV#l-S$v5 z2++pOzrz2rmF}Cvsm_Sc@ukoi6hKR2QRfu#8{I^q!+FjS3|Bis-vP8No4N3B*PITQ z&s^|v)}hO8j>BJ+*&Dt;<8X~VbHnJOuMYFVBZU4D!Vn=pQaIZkbBb<)L25Spkp8p) z0De9bt9ZIjO}uy{uLf*?|T1oUwRRPF>B0`PA;VJ-~=k29q*%~%x1SQ!9j H{sjI31)iZf delta 3230 zcmai%3p7-F7r@V%85(9VgLBMikVk`%NAe~ODvydNVm!tpkLwZTk}i`+5-E&VqDIM^ zyoxKYh%5=ohe95=yi*=UCB7NA+r8_4_xrxH);VXdb@tx>z5i?P-+!MPrDi4AeE%_J z2!LYHP!vKQqOwFzh?A5ALWyM@bSD(piUg93aCqjs%sitMrgZ9IlflNTGj_xQJ(+@? 
zrH%`cNw#0_?OvTwywOh&j9;h;i=UyVeQurPRxXhaJ#L?-(rE_ToQhV+$B*^<&xw>l zxym>ksLbh-qMSAWv}tj*{sdMgDm>A9d^}tA(X7>qf?kqL`V9@+7-0y)5_bZeL;~_G z%^kwRRCo{#_aPF$5wQR?EC!|GwgP6nLOIi&7C3o!03iYEym%&*gOs{&C0-bN+q|Uk z;O5XgFb&KAi3K_?lQ7jxeb4p?iOCs9H#mGzwl3aX#MrykO8lpb=I9vNH-)2c}6AlqhG@Hs@&19r}F@5hLu!ht4z90~+i6it4KuCP1t*z;@Z z6<_Mj&)ub@wWODirW1?WswHnkvt3PWYR+h)-C5ta|60YW?!rY*CZd z40oLDOL4)jTRcx0ZoO2qG=*wSRf((Zb3zVBI(qZ7Zk_kB2*ND;rS|42w##}p(&HIX zLKi8)qw=W6sjv2`RsCLF&D3e;+g~Ua2Czeb$H}j`r30}Cw4=;Jl5o_QyX&g_HxQVR zsQ7C)1!i2+w=H|k9IIxEhUmP0+iWOWJTLiVP|bRyHW zF3X{Z=lR`n^OU>uR~2>eyMAu|*_5uII%c*a;;|$OKlL?Pn$Ic4k2glb?kctlu_L-m zaoU87>cwG`ueihru65n=(>v`oTWk6EiCqsZvLNWT3RcUl8}s@+G|)1RWH^TgM$lVt zxCgKDq_@RTip0wbH#2rm=SdAQUQe{lUQfVH&}%qSnBH{P->&P67+0BBiqre}QW{X7 zQZ7l=K1)2W({^jmXkT>#zuY6q`G(*nNvw2o)$T;N`^i&(rPn1%Q@0t8V*7ac#dE%# zJ45Sl8^kZB(-UpYK0mUieC2zeZ2a)BLdCa`9{Jlw4y}783icXtwb@9BMunP;Q@B6k zmsL=nK|iN7HpQ3o#=l9w+MiZV59%ZEw?sbIu{0PRKYOY--SSgZSkFY}2Ju7QSh ztC8>ErDseEb>!0CT47gV)HObBX|9s1HQzh!_1J#~BaW7C9=!;dwl{2C2gsMcMGM zC|6|+RpEc<$0Aoh-N_bzQ3A*a9ODx{ZET{|zJt#*$8ShCfA*+rhLr5dfNpUw#qOmwQLUq+`(F=BQ5J){z7NHz3Y{hX6M9taa%0Mzl<2Ay{X#P?H#Ay zTqx+YpRSW;aU)MGjm?ex(8|gzavG^6&pj!;28Udfm24I%jJRH++kol9loy8k zw^F3>(L~=fEO2B`QDCQoKqbYFbT=7y$$qK4Ax zqhm~F(^wPFioP9SG=>o*>_})5kkJ4@y)tgW(@sDHl2fWDA_BznlSqPP0M4oDLO6h6 zWTMIR#=<&=IYuSye2OOSuQdt2xW)a^04* zLo2?!5v0o39w18l?argiXH`0Vp4a7=j+pDJeh5y(2x!{gvm9(eKNz=nRo8Wwd~^pn z)q(M!@@_oiZj|r~M%Y~Fj;V6yF4L1}QnRGxxwcM)n)K#Q@Ak{-dGr*Hk%%V|U52Cn z7ZM$ZJ)#(kh-7+LG#9IH2k?t&ViNb$m!4afeY+IY zku#F}X)sJ=Z=A<9Yem9hU&c-5VoJ=5Mj8Jy#z=+4k)i6dBFRcoeKVtOhYZ;*%;&$| zB|h{`T3Elhivd%w6T_#sY}U#5J-l@kioo4PH4O{m_`cW1&v??S~bqt_&x?_GjU{JU7l6QW*ZL z(X1A3rt{8Ri27Qe8u&sjSTlb+A?=#PntKZrGpwrmuiFonzD!8$jF#^>Kx$ijwyCIB zJ*;^6j`|6nf?u4w*Bl$?mclFj_&ey3d~PH?{o|l_mj|vh#HSGFu9#EZoh|p}eb0jr zvy#agenfx`0)b-n_V%r@ zfa`34o7)-3zyR;97GE-hmjuZ%jBx%g##@XuNM-O~6(p!2Ce)Z^Hsx5%4B zkPhs}QUI#mOfp_?VDK1M7E;U=9OFavKrE)1K@KdUFzT=hx>S&Zhb!CCMD}j-v33Hu z{=R;-!NpE4OQm^=WhG+SPWVeip1$8Bs^x&|5abm`gMctYpGTgRf)9^^0u@AZp_>ky z04V(?SpWx+{*ujU+F2Kmt!$=2cn~CKFrM&eb5kMQ))=Q|#V{9ywYGBFfjJ2P9IOO6 z^dTJpK>k=Cb2`AyF~H5+jdsRu%Tp2v7lY(sb)DbMxmcNrLQ3q+&hv`Nu`>I^2?P)! z)_BJ2dB*YYKCIwV6!1SKfsyLe#_R0e*RC-y=Kf5=xgeyD6{LZ{do5Q3O_2o>PxcNW3{5Oybnh^LED zi6r6`@-Tw*<6!vPNJ)@|mnaYn*J(o<067QxQ}G{95XMmfPMH;Lr~w4XU*vu$|JRD_ z-o-S42ylfil#2w&vC5<$yZ?{%D9i)b7_g!l)Q9u{av39AK#+?H|5+W|>0NAhsr%WV zm(>2^?LX>auZ|iZgGRA7hJ6vxzrWc}c#`d-Z_7uF=AS _GitHubQueryNamespace: def url(self) -> GitHubUrl: return self._url - def rate_limit(self) -> ParsedRateLimit: - return self.parse.rate_limit(self.req.rate_limit()) + def rate_limit(self, *, strict: bool = False) -> ParsedRateLimit: + limit = self.parse.rate_limit(self.req.rate_limit()) + if strict and limit["is_limited"]: + raise NotImplementedError(limit) + return limit - def tags(self, n_head: int | None, *, warn_lower: bool = False) -> pl.DataFrame: + def tags( + self, n_head: int | None = None, *, warn_lower: bool = False + ) -> pl.DataFrame: tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) return pl.DataFrame(self.parse.tags(tags)).pipe(_with_sem_ver) @@ -516,48 +521,65 @@ def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: ) return df.select(*sorted(df.columns)) - def refresh( - self, fp_tags: Path | None = None, fp_trees: Path | None = None - ) -> pl.DataFrame: + def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: """ Use known tags to discover and update missing trees metadata. Aims to stay well-within API rate limits, both for authenticated ad unauthenticated users. 
""" - rate_limit = self.rate_limit() - if rate_limit["is_limited"]: - raise NotImplementedError(rate_limit) - fp_tags = fp_tags or self._paths["tags"] - fp_trees = fp_trees or self._paths["trees"] - IS_AUTH = rate_limit["is_auth"] - UNAUTH_LIMIT = self.req._UNAUTH_TREES_LIMIT - - tags = ( - self._refresh_tags(fp_tags) - if IS_AUTH or rate_limit["remaining"] > UNAUTH_LIMIT - else pl.read_parquet(fp_tags) - ) - trees = pl.read_parquet(fp_trees) - - missing_trees = tags.join( + rate_limit = self.rate_limit(strict=True) + fp = self._paths["trees"] + trees = pl.read_parquet(fp) + missing_trees = gh_tags.join( trees.select(pl.col("tag").unique()), on="tag", how="anti" ) if missing_trees.is_empty(): - print(f"Already up-to-date {fp_trees!s}") + print(f"Already up-to-date {fp!s}") return trees else: - it = islice( - missing_trees.iter_rows(named=True), None if IS_AUTH else UNAUTH_LIMIT - ) + stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT + it = islice(missing_trees.iter_rows(named=True), stop) missing = cast("Iterator[ReParsedTag]", it) fresh_rows = self._trees_batched(missing) print( f"Finished collection.\n" - f"Writing {fresh_rows.height} new rows to {fp_trees!s}" + f"Writing {fresh_rows.height} new rows to {fp!s}" + ) + return pl.concat((trees, fresh_rows)).pipe(_sort_sem_ver) + + def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: + limit = self.rate_limit(strict=True) + npm_tag_only = npm_tags.lazy().select("tag") + fp = self._paths["tags"] + if not limit["is_auth"] and limit["remaining"] <= self.req._TAGS_COST: + return ( + pl.scan_parquet(fp).join(npm_tag_only, on="tag", how="inner").collect() + ) + elif not fp.exists(): + print(f"Initializing {fp!s}") + tags = ( + self.tags().lazy().join(npm_tag_only, on="tag", how="inner").collect() + ) + print(f"Collected {tags.height} new tags") + return tags + else: + print("Checking for new tags") + prev = pl.scan_parquet(fp) + latest = ( + self.tags(1).lazy().join(npm_tag_only, on="tag", how="inner").collect() + ) + if latest.equals(prev.pipe(_sort_sem_ver).head(1).collect()): + print(f"Already up-to-date {fp!s}") + return prev.collect() + print(f"Refreshing {fp!s}") + prev_eager = prev.collect() + tags = ( + pl.concat((self.tags(), prev_eager), how="vertical") + .unique("sha") + .pipe(_sort_sem_ver) ) - refreshed = pl.concat((trees, fresh_rows)).pipe(_sort_sem_ver) - _write_parquet(refreshed, fp_trees, write_schema=self._write_schema) - return refreshed + print(f"Collected {tags.height - prev_eager.height} new tags") + return tags def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: rate_limit = self.rate_limit() @@ -581,45 +603,6 @@ def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: dfs.append(self.trees(tag)) return pl.concat(dfs) - def _refresh_tags( - self, fp: Path | None = None, *, limit_new: int | None = None - ) -> pl.DataFrame: - n_new_tags: int = 0 - fp = fp or self._paths["tags"] - if not fp.exists(): - print(f"Initializing {fp!s}") - tags = self.tags(limit_new) - n_new_tags = tags.height - else: - print("Checking for new tags") - prev = pl.scan_parquet(fp) - curr_latest = self.tags(1) - # TODO: Needs a hook for `_npm_metadata()` - if curr_latest.equals(prev.pipe(_sort_sem_ver).head(1).collect()): - print(f"Already up-to-date {fp!s}") - return prev.collect() - else: - print(f"Refreshing {fp!s}") - prev_eager = prev.collect() - tags = ( - pl.concat((self.tags(limit_new), prev_eager), how="vertical") - .unique("sha") - .pipe(_sort_sem_ver) - ) - 
n_new_tags = tags.height - prev_eager.height - print(f"Collected {n_new_tags} new tags") - _write_parquet(tags, fp, write_schema=self._write_schema) - return tags - - -_root_dir: Path = Path(__file__).parent - -GitHub = _GitHub( - _root_dir / "_vega_datasets_data", - name_trees="metadata_full", - name_tags="tags", - write_schema=True, -) ####################################################################################### @@ -678,14 +661,85 @@ def tags(self) -> pl.DataFrame: return pl.DataFrame({"tag": versions}).pipe(_with_sem_ver) -Npm = _Npm(_root_dir / "_vega_datasets_data", name_tags="tags_npm", write_schema=True) +class Application: + """ + Top-level context. + + When ``write_schema``, addtional ``...-schema.json`` files are produced + that describes the metadata columns. + """ + + def __init__( + self, + output_dir: Path, + *, + write_schema: bool, + trees_gh: str = "metadata_full", + tags_gh: str = "tags", + tags_npm: str = "tags_npm", + kwds_gh: Mapping[str, Any] | None = None, + kwds_npm: Mapping[str, Any] | None = None, + ) -> None: + output_dir.mkdir(exist_ok=True) + kwds_gh = kwds_gh or {} + kwds_npm = kwds_npm or {} + self._write_schema: bool = write_schema + self._github: _GitHub = _GitHub( + output_dir, + name_tags=tags_gh, + name_trees=trees_gh, + write_schema=write_schema, + **kwds_gh, + ) + self._npm: _Npm = _Npm( + output_dir, + name_tags=tags_npm, + write_schema=write_schema, + **kwds_npm, + ) + + @property + def github(self) -> _GitHub: + return self._github + + @property + def npm(self) -> _Npm: + return self._npm + + def refresh(self) -> pl.DataFrame: + npm_tags = self.npm.tags() + self.write_parquet(npm_tags, self.npm._paths["tags"]) + + gh_tags = self.github.refresh_tags(npm_tags) + self.write_parquet(gh_tags, self.github._paths["tags"]) + + gh_trees = self.github.refresh_trees(gh_tags) + self.write_parquet(gh_trees, self.github._paths["trees"]) + return gh_trees + + def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: + """Write ``frame`` to ``fp``, with some extra safety.""" + if not fp.exists(): + fp.touch() + df = frame.lazy().collect() + df.write_parquet(fp, compression="zstd", compression_level=17) + if self._write_schema: + schema = {name: tp.__name__ for name, tp in df.schema.to_python().items()} + fp_schema = fp.with_name(f"{fp.stem}-schema.json") + if not fp_schema.exists(): + fp_schema.touch() + with fp_schema.open("w") as f: + json.dump(schema, f, indent=2) + + +app = Application(Path(__file__).parent / "_vega_datasets_data", write_schema=True) def _tag_from(s: str, /) -> str: # - Actual tag # - Trees url (using ref name) # - npm url (works w/o the `v` prefix) - trees_url = GitHub.url.TREES + trees_url = app.github.url.TREES if s.startswith("v"): return s elif s.startswith(trees_url): @@ -727,28 +781,6 @@ def _sort_sem_ver(frame: _Frame, /) -> _Frame: return frame.sort(_SEM_VER_FIELDS, descending=True) -def _write_parquet( - frame: pl.DataFrame | pl.LazyFrame, fp: Path, /, *, write_schema: bool -) -> None: - """ - Write ``frame`` to ``fp``, with some extra safety. - - When ``write_schema``, an addtional ``...-schema.json`` file is produced - that describes the metadata columns. 
- """ - if not fp.exists(): - fp.touch() - df = frame.lazy().collect() - df.write_parquet(fp, compression="zstd", compression_level=17) - if write_schema: - schema = {name: tp.__name__ for name, tp in df.schema.to_python().items()} - fp_schema = fp.with_name(f"{fp.stem}-schema.json") - if not fp_schema.exists(): - fp_schema.touch() - with fp_schema.open("w") as f: - json.dump(schema, f, indent=2) - - # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago @@ -960,7 +992,7 @@ def __call__( else: constraints["suffix"] = ext q = QueryTree(name_js=name, **constraints) # type: ignore[typeddict-item] - return GitHub.query.url_from(**q) + return app.github.query.url_from(**q) data = DataLoader() From 6527305cc5d82f54c529faafeceb90ca301b1e73 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:24:36 +0000 Subject: [PATCH 027/201] fix: Invalidate old trees --- .../_vega_datasets_data/metadata_full.parquet | Bin 21362 -> 20768 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/tools/_vega_datasets_data/metadata_full.parquet b/tools/_vega_datasets_data/metadata_full.parquet index 7a4e691cb414735738f276950d79e8c72c5f4b48..071e4bd6cf68fcc17952c5057858fa29399c9415 100644 GIT binary patch delta 9809 zcmdUVby!qi+wKG~bPpXvH_|22-6aBwGz!w)Lzi?bNH>BYsC0LiB1$)cG%|xQ9E|Vx zzUO@3`ObCzIDedV?Q8F~)?WA8&ph|D?maW>IVlC^7Xk@&WcB!=0B9OIbO*={-RDXs zB!{a3?t=jaXkY*Uu+IgQ1_A(Bs2fy=9#BUE1_YyE8pJ?!)UsBG*01hml|v;3IVp4$ zob2|V$nBjD(q&>^ZXg7;3}*)t!gSacFuIOMF&k`_=VhmSu*+;r=o&H3!T=ScIub z2}oo6YagT|H5r>EHGoa9o*Io11c#*zJr$#!C=(YTGLd}olIdeCOHsnyPa+AJEKEm% zT|X02APIvd;e}5VueApgdS{~ntldmyK%-qDO*oy@3V=PrwlNd^EnuMy2# zJYvafxA}fXr{r|Bz^G!Uwzp{8N~~_pllatGqB2Kd^ejI<=#uTUv9%2t-WFgC?9n>~ z^&&%~<$dPq$vVE?M}H}gu_C8~FSD9&Zr6|3s{y{3*^`m7{f^?T&`QO$UNb)hYcIc? 
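The binary change below replaces the cached trees metadata; for context, the anti-join that ``_GitHub.refresh_trees`` (added in the previous patch) uses to decide which releases still need collecting behaves as in this toy sketch. The data here is made up for illustration and is not taken from the series.

import polars as pl

gh_tags = pl.DataFrame({"tag": ["v2.9.0", "v2.8.1", "v2.8.0"]})
trees = pl.DataFrame({"tag": ["v2.8.1", "v2.8.0"], "file_name": ["cars.json", "flights-2k.json"]})

# Keep only the ``gh_tags`` rows whose ``tag`` has no match in ``trees``:
# here that is the single "v2.9.0" row, i.e. the release whose directory
# listing still has to be requested from the GitHub trees endpoint.
missing_trees = gh_tags.join(trees.select(pl.col("tag").unique()), on="tag", how="anti")
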
zlE3jR?f6zh*OhvCC@B<|QW$_iNJ)WZYv*8N=49b$^T#P0(qt~{o z<49DkTBt?9ovP2gl3X z5xLTb5_+q%l3^GC>M??yT`=Na;>)>($w8tr(l=snsl)LulW+PrXaqu1x4UC;(yRgFD8EPpai(jU`^x7?{1gM!#HW}YlE0;92WbqB((*5;A z3|0eK?0*P^{C1YT`(OIR$lA$R3sWsL$m)0S{{Qd|{Qt3UTzvoWO*0wmpvbQX!u*fW zra%)-{YXL&G^jr(p#VxP_ye6V0QLbSx8Vf<*p(CN(XS8?b`wlS4kR?sFJok`iQ^9o z#njt79x$<1GFrUXxD~it1?j~9tsv@9mfQGyM*}G4`6vEe8#35o>kR84?ui z0C1_cA@0aW0}Wm^1F$N?eNKaIXRR@Pd!96@%Swja)qvh_G1=bl{`|2(A^4=zcRG$b+86AeNE6KAi_QXb^8JFV5sSmEXQY>q%*QZ5XZ1xg zCD?PuA$0^pv2T=+ITizo_E%ld-0l4S3d@oZ*%~S9H=t&!HKjS$Fd_6U`frzwQPRU% zz=7BR+i3~5Bre*Uny5&-}}h*qan z`W|U6cc(Yxi}iE2Q}IueThuHKSoHAO&&^(?Lrf2!D_E5%8`TYSU7jwfer9w4W_?4B zOFlGsHOWHf$&#?d_0h+;Ji7Jmu#ynm+s8>OgTPfbTkc!zk9WA~EGtY(;2Xp7mC=?t zzWhG94x3rS3(px4>m6O}*10HiY~ z1|~ci5INp7z^k}m&&YVOKaV1AIqVRrLrp_}`rc%6>f`#;h$2=b| zt95hKG_Dhtb?-nbdA(|hX)=kJZPt0)z$Dp&`~T8-40lgkTRWdWt004zEE2-{-s;yX zK;!lB@u0v+mDI$9L(!m>5afHL@IyQe^HrdLNdpT%mOJvq5=nF4v=$H__jDl$0{4Hbk2$cK`qej&j1rbs@e^NWr!;4hKcw%2_qgJ`j zS;K(yK5wt-)d;dh{Iz(Fc${xQlLPVrD(zOXhm?LVx-sL2x?S> z^r|!h2Nfc{(r~x!6)R2k#gZ)Gufmd2Qh+@yp8RoM5;oZvG}hc_^;0xR3|&PI&VCx# zYKh?jE!tNVo9on1yE*2^{LNLz*mj-n(%O47_;-uj*_2{gpvFz0rw1ZjiE3rS6_uQ} zz5R&FUe_9*FD!}y$=?U7-RHe5SbE*-shJ=Bex3c5lVH~QKXPLIpX4O@_La8%TS>bA zO-bBx^KX@;o~y|5my%Fb8r=1pmw;*hhmrsn=Fx<~XJBh^@H3uN1bPG%cDe^5Ehk>9 z21v-snEz1|w=9qk$UBSku}>Xee}Q=zU|U=COqn;-5wO66{UUe-ovJxWQr7UP`uYno zGVy{zAjM~w^hGd`*Heba*y3T5e+C`}2am9{%%g5-9j;j^Td(qHWq4hcd<w9vPtORVM8=ubPrY_7Eq0 z?MC!Cl_#C?+szH;^%$TR@CGnZ!0DhVUSERXtkR?)6wyqdX3(!KIH>p>7QxTrHvS{- z3YY)H^Jz&bM4sZ21IPQ1DKqu5u^c9eF}PT8T?#vLtou*)Mo5>p#v9_NGE>1wCOT2$ku6%C~+2@(pbigbHT zxf$TEP^R9<=BQ5`)ku>b(&_;}b|PRGeRz~8PJ@W!J5{Z0KY1PFh#9D6-AwPItuQI)whMpfc&?*2w9OLslhFu-xV zW_;g%IOz*aF<_4Lt6NeE+xgTdl^EL3@JbG*?;k1}=L=symDkMHH2XN2;%JBZts9A^K_w+QMQ4zD_ ztrRf&nBAPhi}8!ZgMTk%`+r&jg8x%V?cjx^cJiY0xhoZ!^2(SDL@Ho_nYvtcVf?xw z^ZA1+){marf*89Q^))3V;`sY1tbU!sazj&p@Y2B{*eM>L^|g#mor@9N?}YD9 z!>rtOoXo2x?8kS%aK8Gi6IH=E@wVU`E3WfTXx(_f__y+VN*lhv6QBJJzh5Ka{N|Y1 zksy66*V@XO?-orm3X`Ig%qyFm8F82+=#2UmM#i-o9*!Tw61sphM@gVB z;|las_zPE%{Oy*uimMBxK$9i>dtCoB8s2pSdH$n3%@|KW000C50L-cTcbMS85C9Ao z5)1%<0Id+!)%qRhafJ;OnQ=AM)Nw^ga1Ij(pNmqj*o`mDp#m_ZeGQa zn)<pTNp}3`%=6CU&Jwlxec7tauhz*_T?2r417tRw$D#^%T6Ke&db$8s8d+)b>`EdJdD1 zU!7c{DQ=?Q3eMf=Gx$~hku+26&x6U`{p!1Rd6TUm5}+^fbJ2aOi7rx!m2} zj3uyQ2f>&+ydU)j<$IQPSf+lHj^J1|T9kTqt(wP+VK7!4@v*Lhc*AJHGOmyd+F7{p zL9b}ivZU|x({?opO1x^dGs3!bHCe?mCQhh@{2Yg(JX<&i@}wY5p;{6rbStp|O0Kxq zAGHaYZt~TsPA05@F=Gd%bb5G3ugO9`x)jMOve7?+%f7gHnXmAQ!YhBrM5qy6?P@4^ zn1LFiGu7R6!>p-w&L%p^(Gt|$H;Ba2+9!F*5;P$YBmPe4iNDG8J*30RQ-#rw!jkbp zphTtTqPerdW`fO+)Irs=<2Z)X9IQv2i#4=@Q*CnX<=eXW?rJ{<(n~i$Wcqu#svK_7 zDastR@XW&EmXsv<7>pJxKuJVO_>rp6;4A-W+;0MV#o(QnE6>EB_pt6N;_&Vo<|rgr z@qE=bD2NuH!rXcvQ^~j*5fj8MdO_?-v(`%gUYA}xFaL77_1@28L%LECDmk;4J|E-~ ztT^Q&%?b}f^12Us8-8>=|6mqTe#%l-#0fT)Z{Ei ztvy{1YrP)ry9|?Mkv&H6YUi+?=jL~$IbI=|3xlWw#-ULR+(6D1o3$rBsHa&Fjo?bE zwrBQ93Z|bkWiLLgg5y6>eYR9Blr=t+#~&WhSni!D$?Naek#vz#zZ0FCc1>Sf9KZsn zW5mOAe&O_FjYnGH6rFu^CjJ~BmfHv8%V8k&HQZN1(omPcpFdGm#wDH+fwa0?^>Oue z4BfSB-oZbS+kLu|alCPMZC85FaG~0|u^c0cWE=1OvmPAVo9M|RM*ba%5LBL=Q+$We z8fH3CrgqeKZ0#3shL&+|G|kL%X(v_SzR{gQ1j{Nksd)?mW|{M|IB%wq!8RCKN{-LY zcehu|pHtN2x4IbUp&kYRBos-MMUT>__iU!Fl3cuwvCaz-cI9{1v%gqao+-};=?AcT zz|DMV&abv!c=4?^xq>3Lz)mR+m>k)oZ7pL4$PcMZ*ln6~8o=-)WdKlE!Ft-rF@@ERo) zeIDNEH}9e5;WMrul~52{R1g070$yTgv}_?DwR!C4_Tn`>3|01ewaLoazAj`SW^fb( zVTSq5-^OcpH5G!eIPvMlqDV&{X7ayg#gFQ(jvPc0TCb&<<(0I7ge&`r8@FI?`m-TSI>LOQY7Q5qER5T?jw7C(_7 zuwu(Z_~eeLHWuxK@3%>Q10s(DbMN;;@oOzv`{N#u<#}0doC|K!L;}1=)dPi&#E6W) z5~6Tq10K&O$oZUfc4SG?iot=(?_`ier7eA{49C?(#&n^$t$Mad2ghxcVrS&~vxzt) 
zajtXJ5W&hN9FG3U_2F9WJJ%_y|(sRF^dhP#pJOqt;COmN%r?7pC$uSAJe z9W3~a!sLm{F`_YS){C$7Gs@2~WhRhYTm=Ipi4sR~2AIpVbBR2f>W|1%=XNKaOt)C0 zxXPUs$I4Jko8FGZZ{Ft+6TWg>T{5Q2c7jC1TmT;*<`$>DHS{T=d~44nPf)?r?(1H* zD@ksvK4Xl0%KJ#cNdfMkr=XF~(=4GgArZyaxT%UHs-2HW3CX$Kpnm*``Xjrku`1zP z9YgDkG3L9ob8=q3jl&7)fI4j}MP*5cy1quFqqEdHKI~He z#5iic7)1jV=kN!IH*X}Z=GqDa6{b&{qlfTA<`|?WYMxExMR>r)3rjZLI?@Dt(Z5yi zRUZI3LUUivH5c0}Rqsq#SUg?ze3zZru$by8T3jxEZ6WJ%b^T2S{-Ds&mZNc+pJ?p5 z-trx3()(B?-;?M{5aXxP%1jUIB2HA_mbPLGA*EJ{FhfGcgYB1UF%C|}*wPbG_#x2E zlSxCjay^x!e0+8I9Inn{X7I_buE+Wb>DOrePj=_12z?N8(MJaL?q?Z7{ThWxqjCR! zMd+8F>@Rh)Q>rkhT}v);wq^2cAfMO@`Ax;$9F@A#arEhGa7}zpefl~3%28)(ulO!a z$ed@UqTO*^E`6&Fop1(TiT^kaAR~UbJ0vV)?Rt7X`Jp9zIWV0`kD@@1e&+?mB)%92G8NN-$))NBoN2qlvS0%DO9RN(}SJ-FIF3k`VD5 z9p5K((eGcwGbjXJ*|P`nw6WdrvugO$CUJaEJ2N+3Mo z>{8u+s;&ELM7@yDKbJkn=f=%`=qoxjx5(saHqV87H8Ay#5HQWpL%eh%OCLS8p{xp5 z*ceOqaU#A*%r0Y=?0nrar{XVxD3EfxaJ-Thcq=)Qb&owXY$1FL?JgZ39sZ6-L~RgC z^3f617F~S;b;pecKi?x5oq*XC z&;!wuqCqm{49~=9zr1GE;$DzzOxLE$3U4;}P?5gLT90d}yUMCk6jR&HPPlbca*2^s zOVfD%fhQJaO0ctMv3bF12O!-@mFGF=S(h^v(vW zzUsYdez17An{_sI6!%wajK0-R+ z)ZTb?VcPP6a)~0yHcfGT8BiZ912V<2b>t z=9<2)%Y2~?Z{&`(^$c-JXu0D1@q>XKpXuf&t{5}zb85@5hD++|k|!j)uXg(iz7UJ# z?kAmUg%Jv0vD&<)6dqPwS9ugf@^F@m@Vh*}>`lCg4(c8NwFzmH1}G&nkOXtR7J+}& z3JIqs!Y?*&;#o$EU!np~0;mALel$LbW>!sMR-F#Hy}Wn)`s-M?z-CsJNdf$6IeyJh z!5sYi`|{1NvA4(XQwlT3@9~bmT0?)k{`7q9Hj?4fVMoI+G7m=FV&SRGQC8*kV|(?! zXiVgkjwN^`I!Y@(AWYCQELaAHIg53LbqU!OYR_n0-l#9Dg{*xqBHUdoc-N%HElD-j z5*1F@vof(y+4Hkt>gPt|s)$&-=4m4F;q_X!%44_d43l!UWgO$1iqAjAuH{|wqI$H2 z&?Xwjk`V~!oo_+xZg6VKB>^=`#3M~*IyQ-~E(m4G8`8k#=r`+S8zXB!>?v4X749Sl z^tp@6nAG?uu)e7s=|j88&ZKBxsKnt*AC6LJ1O#rV-ao*{s07=?KMWHwB)P}J-$vAI zrAje0@rs0hZKsWK<(CbvC zIm;xbKxv-J`?#+-CL!3X>r?X5h$7_%ocZz0HNXkcqIm8EOTF??1cRGPxR`k?ni|m| z|BU)rd=zpS2!osU6m6ONN?y{ES|ZW#(3q7>0R9m{L{CABR^6PHB1AVEuDcgK$^ zgw-@1L6cd66UD{bI`j*v`cwZajuX+71)`eA6&x@@R}=xl2;(N*iIRz&l_BGBxBd(U zm_P7&lnPJFwSD=80jDNYzH=a9y9{?^le%>kgh0hb35dy?mkH#bJKNT*O^w~zv~$H< zwFxkpuA7>;{WGe_4tdUZ?sg?A-ham9M{71qFM z6S1|)-)u?sg~h=K21I+mOCZyH$aAC^TD0=ia;U~h%<%8w{h+#s)jg@|KGr2VgpV$H zmjtcaAAdRI`vOkl3|a-lQ$ z2CF-;?k75kBjJW!T8H$V?VM;w+^Ylv2bp`~(8BBQ)Rw#cC-)0|QxHqS_`;HK)-4(_ zH03?{>TelcGhuUiNF9%*d}jY7&kg$YIJOq)(>c!_j%1pS^A~9jd=qaG4+!(sWZT$t z{Q@>koI9E4Vcy*1>1V+nv5#q+W3{;t$275@j5-fYXeV`xq!KXCqhEYY7AhlPaRS>- z4}i^js%_hfVPY=shX|{7Svem#C$YQ{AKeN4WAeN>ZT7>LemZIyW(-%@9nT?a>r)(? z+4Hu?J6ZFl%lLUD0YVuj2m_Dc&+v)I8|!!(QtKXA?Gz zoH?iPh0rmO!FmcXiY9)M=o3}UOwC1#I zn4Ia_ZxDIy)~>#?Ox}^Z(|2)a7h^{-NNNyj#KL;~Y@EGkgy`{zqc)5{x5Mn*o)m9E z=G4s9&C}yDvc1mNPw~q@l8l>1E9o05Z)M1(D4r3R5(|HZj8H03re;y^=mhao@D8W? 
zHb(@Tf*!K0$$gWnr~d(5&*;$$NTm3&_>zi<^(DTvA)lN&XF3En8cRQL$<(pqJ2SIE z-o4wGXpMtuJvCC7vxaBf!d`tgHyK?#Cm$-8Z15AzPz?t^xm}&JIQ4{;h}0{Hxxe z56vR@+uv>tMWO$#zjlOlhl`L)1cdI8z*&`!tX2`vSJV` z=D($Nkd@|vpk!q286YSlfOU(S=AT@L|1w!6RTKhqCTqRLP>`+MVw3=^y>vAHm#HLV zv%m4BnZKX8BK)ss{_pZD$l?3HA%nLN5ddqb=Kl)$KcyYqW*`6nWa0V0>_+{WWYm)8 zzu7w1{GGTEd0KHx5dc6TfvNes^6NyF^Lv_xED%%*z}jm6FI)c0z28)RO;WCg-;k4A zhyZ{U;ruVif8Op)ouVNlCL8??D1HHgiUC;9%7HZh)yCX7|HkbwDzb&!VWr?(`cgr^ mX8u2fUjilSH!AcM2wDsRkfn9d-ezdAVv>!<)LUp1q5Ti6Q?g$G delta 10242 zcmeHt_g7TUvhNH744EO!kYUIrVI zFiZSET2KS`-}OO%psP95>*=szSOC7b#&WPGN8Y;TE9(LK)gM8s_WO7O)Fjci4vB`m z>GA8Wa-K5gyBeFmPgy_4vKJ+OIwvdWT@C#uq>R*4L4m&y^u14by`3cED z+mAu=9J;LN%@8V%BNwSo9hl`-=S~0Y=!YTF#3vpR(0FZaS>zg(qFL9_o|Sux9K-m1 zfetER?Dkn1QAvX)gR zLuzq3WvPPb$r=v^(0x^=8e75?GZHdjF*rxAmP1n|g}cq8&hNb{YMjYjh%aJCj}uR8 z1gZ5AcCnl zzA~XLDX8nqh|T}M@%~?or%UlK<6R&)#wD+fCkR%pPo|&-%e&w@22jjDzJ{x7awv!C zGcA$Ax=7=zo)Ua}Ab%PBvh|bI8%yjlVP-QSTe66m1MoHQT_Z~~>F z`YXrZ2y?K#t~wOSGj^DjXn3>1!Nk9m6;8olWcLek9`+U;Y+oxfLUypFX4OMlJKvgB zG!f$yP;W=_Ro=t>AyOoDJT-eS7cAMV=nORoo(N?CsLLsKn0&}rUSd7H&7pFVkhd&= zvk%%4GHTjmu?lDJ#08t`&i6A*!<<%^wwY~11)=ua$_IFFL$SwJ@2cV}kp_=PW^d;p zm-HcqDd?%L`J$TOCms&?&~T@w{L$s1CH#Hlx>x7dtHO2I#R?2{5meL2aVVQ24h@r6iA zH8MtwrC0p7P;zG|-X7GK3K>z#NlN*x2H#2U+K7C0E|IeEwUN1`0cqMMe( z6S%G(+tYDI61z)3*evJWDy8KiWIAH{)b>LBND`*W?J^QBZKD|XC}5<#YV?x{LdaL^ z4l-@M`;4AW;?>lt?5VXNA5utN4D}C>{6`$+!56XAB489QU#qr6e15Gi;Z`JCtQZy* zmz9A9qXB<)dm=CIHHXc~Dj+4mI&DaR5k$3AZrq`P-RH!9pXmjwa$cGc?1>4`u^5eh_J(U|?*{y9Z@HBh_+*)pg3BPHgGCF1)hv1;Aq={O|h z>>aP8JUxyzITL|9JLl7K})h<{0%voAHbjyc?v0 zM?3z;ElJ)?pvVIPYKR8KY6uZ=!#uNp9T+DxPN7vXST6O*#DcC76K-5D=mdtw5DLE;%-=kFssMt&+v?#r2VhnKX&B;;E0>Rg0DTj?;R_ zs;7?G;*D|drTLg2c;fGW^qYv7MSFSoXQ!bKO;EI>r4%Mc!Wa(!;9$tONzK1T1PBEdfw>>Ba!LJ-k zKp+4F!{Xw+{?S-qDu1ng>D0l^rv^60?Bw^1`d&0k@&>+FQ^~0$z(D!7Qak*#2D5H` zoMP4KzIWdsVR}CMuu$gfcA{(SLhZzooyppZjP()P5>HFdYI)z=tLQrc&ZAYM+m3@t zv-zq4pD5=%l2h1!OiXFUFibz^X8r!UqG?96u+m=TR`xB6kK-xMCK)*fQQi@nH~XLy zPYG50J*K7!Q=bY*+p(j#2jc@K9g-c{G?ldicSg2o8zDVEhjal90Z>8pmX@C z@kzFKev=5glOKT|=f@WE#4R%8k)2h?RegsADYz{RA`=c7NbXlOHMQsHS`J|`(2|jf z7k)==^Xn3p8-j)kFakJuDPB{eCK{mzIwoeW#V35P#Kr57HlBJemM_K}M>l7A5~lT{ zDtN}~3Vz^Y{?e|9=zr+9x)#g7n!ld@i|P8V+yRqFz%D)Lv@|ly?)Gr^&3ENd>=-iQ zqasa@Qz;dBVrdfdb@n;*85ZP00dsfF_RN11#w;u>*u@L~MCr3m{}~3Z*M2U97mEDb z&i=z4f+3ANDo^!e=foyLwi?7Yp#Qw8HpfMa{G7j4?p*%u1AXTc91~;QCqGj*h7^1NhQ+B2 zg!F7+kXO%XUgY7W@ah;QuLSYY19owvudL5_-zMP$P8YiOnC@#M*G54XXC3C|S|7hYKcw<{-A-l3^R zlkgLVZN@_CK6S;$8h(Y8FhNHqA3jk*qr*9EpGR=Dq=E?&`(GPqbdpu{bhS9VT#-EN z-Ry;=MxIj$7Q^CvhBePP+9;==`m(q7`O=cS!e-+R2GPFIk!8F)KFzwOTZZ2-B@`8q zlr)q_lRHS705=SY+wH#te3zVoN@6jYTGCzK4E~2MYZ%d^k znXADfK)c}D&qOsMnS8egNMFU<_j|K^e?sodMHjB&;MkG(`=YR!eoeS%>d4cfsN6V? 
zHZTgL(FTbgl8&BeWRVJ3X|}t#?Zu?Ww2k!oG79&Vb!4K!jDFY!giG7<1TKxobR%pO z&b^*!oOvx5JC-64eLHE$f867QQKl83IFUuvLif`&M4$RMo=saZC!<1*9vz-9@E?ig zhLm(?Wt2G2_HM-tQI)XO9q^@#E4WE-wv_qeR?0T{L_MghUN)o@pQq$+E;ex|Q>$uplAnlV*;2ic`GSKWwuhcWrp9zmo65mqq6%B$*5qwK$p)8#M1E=qQ~RsnOlhA~&kryU@?r>B!qe$*;X#rstj3A-4a7;mIlQY=yljd;c zWnOY#X)Dk=EBS8waeA|+R9yshq+eyZDLxP~1uUBfqH6;vpOLy*k%$@IQ1m5HR(6|J zIjNMQ_4L^0J{P1<$rHBCyhp!gR7zRRRn3tY)44(yjjj`g0P&n_XgXME+N$;i`Z?2zw)TDb7&&Pl$(nCTZ(7mw_rNk_pXaGq^AnfBYQgjM zzMk$v=#f#cLfGY-F+*C&0qF$gTw%T5(vQK%lTJkP4+7gID9 zvJ~x;ga;LR=%MTgQ}XDW_K^$r1!W(Jza4_URtv_Y`4df8xy_13NmO-MvUUmFP@tdv zsiD-EE7DdAY}rmamt+jd4pF^3h>EA~#*5A{PB75yuFsK?O)SS&xbZoU|7MaD1HiE< zI$xGDq;;k7bvM@OiUCVcj`lgDDy|daQu!#)xUGn3Q8tI<<}hgvxL`WpP-Dts#W#6x zDq2xT31rUg7j*CK<2?7oeW7NB(MTiTV4uv<;1VV)c2lXFc8rbWrsSrNJ3JN4jL|__ zi2#SFBbeHd)M8A@VXVsH_70s#B5+>v(PuhMx7NzIZH9ZUOD$1Ar)QCl*cwP=v5B16=xB$TIN1PNR zc^xgwxB)EQ7*(}6h1q@Uw=MeoDox9HY2VHA0PJxi+^*vwIFtZH-!>kB`C z>ET{>q2;%bduN@ijFfu;KUn<(-z|IAZ@ulR$&nW1IkH=ERoTg`Nq&@|h>aYWWL{4M-tqV4+8*(JqO5bzXL8Oo;3eOtQ`A4c z=?2iDg4iVrjkBDS~P-Nn>YOU-nnMlCIyyo^5nFI>>M`jYP!t$Ju{FFX00Xy~fZ@lZ_he z*Rll)SQ0IIB9=CJ0o_B*w?{swr3Jg;*r`33(X7J?60psPdhz`(4GboB$G0a8aX(w$zt|mu=uC#1E?r#83-evnzT^KO*>TFQdWr25#)mpwfF4KwB zmw32kyZPrug?0w-Ju?B|leV&s&Vi$mr*OFaP1;{}6jczaD{@n_9TgoRc&0 zl#4x^WJ0j@Emp(*eb{FoJm^Z(q3FyJC*5h zB6`X2KE-0DzqgCgV7aII^A5itd(xL&#-z@4Q9KdYBwICJy&!l15W|tDqjt=%JE3k$ z?*HOgFo%3#@g3{Ic^+x6?(?X=W80_NQ5oaeM3*0H4J7TFonvb&6*Yj)ne7{RC3r+7Bf=C!> zdFi#sdh*B(me6MT?Y1{eQK`?&fo#+4^nSbmjVUGdK z+Wj8$NO*k`QNx$^Xsgby%v`$N73u!dsM#Dh#SUxn53sQ}z`gy)o6M&aHeIP3Aw+fh z{$_K{Gtb-{m+>!A`}Jh`76%SNa#IIIVlm{gDoM0Ajt2`kpsHs!9CluTxSX{r61(F5 zPHEJL>b}PWdBaRGi9^tYDC1myC++(%26@ISR8Vq?0ofy{Xx-b9r*9Neq}-I(fo@N$ z4hmI7jcWx67`xHP-JCI+yx2?8QFeZNzx=1xN?M#Qk`a}&)9m>wK>iI0{GrGK{DOae#*t$7JwVhv1_5=NT zKQg6*jIjxIC`Ps+OV)4>UL)wCc!;y zgadV3>yL_-$40uNSj+~~qOHp>fN1_7?W!4)3@WV&H$^W-^7+o=WMK(phn|wHf-+IL zKK55{09O_83)1WfeR}iifHguvHZyQHm5waKIh6patbkJj!$yy&Lei&=(ww90k= zed4Gmy7Oe2L$m0#ri%|&x%cW!J*S4c$$Jjbi!meJ`DV3z%;NyD+(74B`DKfpGle{g zQR&8id30t!Jb(1Ku!QOb*+rPOqjE7Kx1_QMAj8ON?zP*I)i_U8k%Eo`tM`O`bxk%@ zi}uX#TybXYLf+5a2W#I)p0wM=T1#4(V$Q1;=7=7q=f`|I@O#^fJ$mtzc z)Rk7KdS2HU0dB7>mQI|a#&30w-S`&K^< zWNebXc`6LLKf00JaJTc|)B7B-B-Pcoc{w-aJ_)`lWp8b>2wmT^AZpKZm|?^dOdu{i zC9#)b3im;(+*%~c6lS%u9g@=4V!IFOq9(`PJcLpeW z`2rXl$DD#MQJ^U_8!T8*a@Ei)(NXMuWR?D3`VWC$K8BySPs7f z?+446a(O4+)Sb-UtIdB#%vi1Isa1*OEMQ<)##POSRHY`$FjcENM2Gu;#aC9K)GWhz z*3l_ojVt6ctA%`?oD^85@IB_6!PIK8(`44Y@B_|q6I3{r5=CV=u!~o%&)$G4qJ@`G>-doM#ihU-@9)2h5>&Y`*qcvlXLDC5q)xGi9Jm@L zN~!x41Sl3e(2Rfts8sH{wUQsO{D30Wvx;%#GMDnYdGOm#O{XP=+NzF|dS zJPAWzGH;A@Z~Y$IFwV7zV{7wH6#VvsWFaYo=!#p5iqGw{&V7+f`woCOy3!#$lxA>x z|LJ|z;cueva_n>bKW}VdjZo4Pht}A>G7i=>UXNH0T{tP?%H!N^>}Bce1ZyU%A&WN-EX>;ax1Cn{wN<1Q}Q z%s#PvCF-%by-O#L7-K3KGPz=20DTJ4PKrwEmMQL0Yhs~)#qhpKZXU=1AKmfoiZvk! z`Z3&#fvH*A&v%IdX)>m!8{pNhJo!>=Uu5#O~Hxd@9viy^$!6vLf0q1JF zuLS{CIZ~#|_^y)x5W^Z`iqR-zTYLRP`FuR&Y83xT-HN+@H!Uqyt2=lPcz56Zu=e6f zAz5`(d&-dMF=IebS>M`5layX?HTB&bfu2QT*Z1=lLZ=WjmM-bWyxswm{Rd*wV#-Dq z1{N`WWcWWbHf;NVFjcw^$k_sTyg^Fy&O>5ZS&P^~i`yB1a3~ue>HEzU7l12=x^W}q zcxa!kTEvIYUAS5X?(c6ot`l;?v_#EvOo46*1qFFGYn??Z~usaF|T0 zexEU65=;st#t(CQSq}ov8ed9_AL@}$$_>2TDCsBs;3vG(Ho20Q|+u{@UXQQfU-rT%8KGqwfVbs+wr57B!yDEoaIK zyJ(}B@5&iP`}!s$F0<3|L0C0N7QcMPIC_#OWlKQmA8UgyqOh>{IH#@GTXHOg(E6z? 
zjK^!q_`?TIW)el(0rBA!g)L`aX`K$j<-j2Jdc-jp(`lhk10}G6#2i>SvMSncDPz)^ z-ljs;{lEcIISd$lUYm!vW$U+aS0jS;^ITsI(G_dSVKs5__!WGPzMePKP07JpUpc-$ z44B2)Y`k^)*IvOMYu zwu7z}IAGyTs*{<+y*u>O&vBLt_)J?9yR%Q%%O38u>CaCNjD026Wm$jAb{fK{_d$Fr ziT#V=zIA+CTYjW*cF+?i4=z~k9g{7|B;d+xJNM)Gyow*(yMk_>#3E%m-dG1X`aEeA;1SY zQd?p_-OeMc$3j1&kF)6K=nCrN8y^45gW6cAT)HhP81H}yU!Ubxw?6qxj1NYGaEZ@H6mYlNWb0TXhV3T z68AIYavRohGjK)7Kf20RB`$xp+x5W(kLPFZxP@pBH{d5lZyync!NHVN=QTias`LGm zigAD%Cj<3C6hD(Hw33tgjO}Hbm11&x$xEvKuELAO0P$wfyz;7H`I)q9p30)8!l`O= z*o2D6q%Idg+_o5XDJU*VR74>)X{R_9&|P63*L7aI+J8b>z+^yWs%m{B>A~xj2qFOs ze5;vVjq0eV!`!uu_^8q-%Gbb`9TXHvm&(ZSrHJ_ngm`hlT$S8>`r~F<+7-zgrOIVr z(`AV5WSokPcrQh~eA#hPa%}%CdQG9bXY+n*K?g?Gl1;Z7q=t)@?`LCG;MH_S_H71U z;VC7hKEAkhX9DXY*kA0?zOd+Rzb1EVyZ#7Qw#0JRI81!h#x}txqNr#9c(c69bLl7J z(@xaLQ{aB`lCnuJeoP^+EY80z=5n#l54S6i3v*kZKIXn^LMT&F&=puXdG;s<`+Dn_ z3wL|8`|WKo?;3xb3iTn0vL}rmlk9o@P|Ho>id?8JLpED=@X;pMbux+_bX^Js{gsPy zadZBYh>9GcL-gZ1adg#yWzq7yarJp>YFM@@B#7($3(fVvUIzF)_;0_Q8{R?uw;!+c zpZ=90JO%c5JoF*F9QSWO-wi&D^|${u3k*La|NC$~7rfsIt*QAQ>$>QQUJJ>7ojH!j zMHFLk;QdM*gTd6l>MRh0DmYh_<0(0f{Nf?vx16w07UQB<(xieK8cuURBO z9K$;FXeA2Dzr{3v>Oo*A1b_K&row{XOv!fi|JT9<7^361$n>>{Jc#4DGr9vsiMpQp zf60|2rhkj17h=IhKpfj1^#9Hz{*TFFDzUxa{QcMbQXr0%O7MRK{@2vwh!`UJI!#H7 z2rj~cmqM;>_zdfp=>I8D{u3v(h{E53rKMQ#Vk{6Mx0~s@q>cj*v6oU$dW#JEzW_M8 BVU_>@ From 336eeca4d273ae756b57b234601f681ff30dcd7d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:30:21 +0000 Subject: [PATCH 028/201] chore: Remove early test files# --- tools/_vega_datasets_data/metadata-schema.json | 12 ------------ tools/_vega_datasets_data/metadata.parquet | Bin 9100 -> 0 bytes .../metadata_v2.5.4-v2.9.0.parquet | Bin 11354 -> 0 bytes 3 files changed, 12 deletions(-) delete mode 100644 tools/_vega_datasets_data/metadata-schema.json delete mode 100644 tools/_vega_datasets_data/metadata.parquet delete mode 100644 tools/_vega_datasets_data/metadata_v2.5.4-v2.9.0.parquet diff --git a/tools/_vega_datasets_data/metadata-schema.json b/tools/_vega_datasets_data/metadata-schema.json deleted file mode 100644 index 2b5b9d955..000000000 --- a/tools/_vega_datasets_data/metadata-schema.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "ext_supported": "bool", - "file_name": "str", - "name_collision": "bool", - "name_js": "str", - "name_py": "str", - "size": "int", - "suffix": "str", - "tag": "str", - "url_github": "str", - "url_npm": "str" -} \ No newline at end of file diff --git a/tools/_vega_datasets_data/metadata.parquet b/tools/_vega_datasets_data/metadata.parquet deleted file mode 100644 index 1ab0fb17143528da9cd460e84a0fb18a9f1d5b73..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9100 zcmds7c|4SB`+o*87`tc8ls(4IkZnYaonv3JhMBP>%V0=J*=eyCB9e}>CKM+vgwlc{ zOS@2(N>SDp^?RN{r*qDG&Zpno`^WqH&6wxjuj{(6`+ME@bzct~qMb4bfD)j(JeZ9D znc=UXIv|ZeU=RTGk#1007Uj(X07j^gjX#F! 
[GIT binary patch data omitted]

diff --git a/tools/_vega_datasets_data/metadata_v2.5.4-v2.9.0.parquet b/tools/_vega_datasets_data/metadata_v2.5.4-v2.9.0.parquet
deleted file mode 100644
index 5626093db560b805b33261bdc5f6b7754ab3451d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 11354
[GIT binary patch data omitted]
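For reference, the regenerated metadata files above can be inspected locally with polars (already used throughout this series); a minimal sketch, assuming the repository root as the working directory and the pre-rename file names used at this point in the series:

    import polars as pl

    # Vendored multi-version dataset metadata written by tools/vendor_datasets.py
    df = pl.read_parquet("tools/_vega_datasets_data/metadata_full.parquet")
    print(df.schema)  # column types are also mirrored in metadata_full-schema.json
    print(df.head())
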
From 225be0a15520d166bddd162307ff1b82f2552bf7 Mon Sep 17 00:00:00 2001
From: dangotbanned <125183946+dangotbanned@users.noreply.github.com>
Date: Wed, 6 Nov 2024 14:33:20 +0000
Subject: [PATCH 029/201] refactor: Rename `metadata_full` -> `metadata`

Suffix was only added due to *now-removed* test files
---
 ...tadata_full-schema.json => metadata-schema.json} |   0
 .../{metadata_full.parquet => metadata.parquet}     | Bin
 tools/vendor_datasets.py                            |   2 +-
 3 files changed, 1 insertion(+), 1 deletion(-)
 rename tools/_vega_datasets_data/{metadata_full-schema.json => metadata-schema.json} (100%)
 rename tools/_vega_datasets_data/{metadata_full.parquet => metadata.parquet} (100%)

diff --git a/tools/_vega_datasets_data/metadata_full-schema.json b/tools/_vega_datasets_data/metadata-schema.json
similarity index 100%
rename from tools/_vega_datasets_data/metadata_full-schema.json
rename to
tools/_vega_datasets_data/metadata-schema.json diff --git a/tools/_vega_datasets_data/metadata_full.parquet b/tools/_vega_datasets_data/metadata.parquet similarity index 100% rename from tools/_vega_datasets_data/metadata_full.parquet rename to tools/_vega_datasets_data/metadata.parquet diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 208834ebf..45fa27614 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -674,7 +674,7 @@ def __init__( output_dir: Path, *, write_schema: bool, - trees_gh: str = "metadata_full", + trees_gh: str = "metadata", tags_gh: str = "tags", tags_npm: str = "tags_npm", kwds_gh: Mapping[str, Any] | None = None, From e91baab65642dd9b81020b88f50314943d5b15c4 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:42:16 +0000 Subject: [PATCH 030/201] refactor: `tools.vendor_datasets` -> `tools.datasets` package Will be following up with some more splitting into composite modules --- tools/{vendor_datasets.py => datasets/__init__.py} | 2 +- .../_metadata}/metadata-schema.json | 0 .../_metadata}/metadata.parquet | Bin .../_metadata}/tags-schema.json | 0 .../_metadata}/tags.parquet | Bin .../_metadata}/tags_npm-schema.json | 0 .../_metadata}/tags_npm.parquet | Bin 7 files changed, 1 insertion(+), 1 deletion(-) rename tools/{vendor_datasets.py => datasets/__init__.py} (99%) rename tools/{_vega_datasets_data => datasets/_metadata}/metadata-schema.json (100%) rename tools/{_vega_datasets_data => datasets/_metadata}/metadata.parquet (100%) rename tools/{_vega_datasets_data => datasets/_metadata}/tags-schema.json (100%) rename tools/{_vega_datasets_data => datasets/_metadata}/tags.parquet (100%) rename tools/{_vega_datasets_data => datasets/_metadata}/tags_npm-schema.json (100%) rename tools/{_vega_datasets_data => datasets/_metadata}/tags_npm.parquet (100%) diff --git a/tools/vendor_datasets.py b/tools/datasets/__init__.py similarity index 99% rename from tools/vendor_datasets.py rename to tools/datasets/__init__.py index 45fa27614..e27f011f0 100644 --- a/tools/vendor_datasets.py +++ b/tools/datasets/__init__.py @@ -732,7 +732,7 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None json.dump(schema, f, indent=2) -app = Application(Path(__file__).parent / "_vega_datasets_data", write_schema=True) +app = Application(Path(__file__).parent / "_metadata", write_schema=True) def _tag_from(s: str, /) -> str: diff --git a/tools/_vega_datasets_data/metadata-schema.json b/tools/datasets/_metadata/metadata-schema.json similarity index 100% rename from tools/_vega_datasets_data/metadata-schema.json rename to tools/datasets/_metadata/metadata-schema.json diff --git a/tools/_vega_datasets_data/metadata.parquet b/tools/datasets/_metadata/metadata.parquet similarity index 100% rename from tools/_vega_datasets_data/metadata.parquet rename to tools/datasets/_metadata/metadata.parquet diff --git a/tools/_vega_datasets_data/tags-schema.json b/tools/datasets/_metadata/tags-schema.json similarity index 100% rename from tools/_vega_datasets_data/tags-schema.json rename to tools/datasets/_metadata/tags-schema.json diff --git a/tools/_vega_datasets_data/tags.parquet b/tools/datasets/_metadata/tags.parquet similarity index 100% rename from tools/_vega_datasets_data/tags.parquet rename to tools/datasets/_metadata/tags.parquet diff --git a/tools/_vega_datasets_data/tags_npm-schema.json b/tools/datasets/_metadata/tags_npm-schema.json similarity index 100% rename from 
tools/_vega_datasets_data/tags_npm-schema.json rename to tools/datasets/_metadata/tags_npm-schema.json diff --git a/tools/_vega_datasets_data/tags_npm.parquet b/tools/datasets/_metadata/tags_npm.parquet similarity index 100% rename from tools/_vega_datasets_data/tags_npm.parquet rename to tools/datasets/_metadata/tags_npm.parquet From 7782925b3291a8d3b6ff38b5572e3e47c06ebed3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:55:10 +0000 Subject: [PATCH 031/201] refactor: Move `TypedDict`, `NamedTuple`(s) -> `datasets.models` --- tools/datasets/__init__.py | 187 ++++--------------------------------- tools/datasets/models.py | 166 ++++++++++++++++++++++++++++++++ 2 files changed, 186 insertions(+), 167 deletions(-) create mode 100644 tools/datasets/models.py diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index e27f011f0..2b87ded3b 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -10,7 +10,6 @@ import json import os import random -import sys import tempfile import time import urllib.request @@ -19,27 +18,28 @@ from functools import cached_property, partial from itertools import islice from pathlib import Path -from typing import ( - IO, - TYPE_CHECKING, - Any, - Callable, - ClassVar, - Literal, - NamedTuple, - cast, - get_args, -) +from typing import IO, TYPE_CHECKING, Any, Callable, ClassVar, Literal, cast, get_args from urllib.request import urlopen import polars as pl -if sys.version_info >= (3, 14): - from typing import TypedDict -else: - from typing_extensions import TypedDict +from tools.datasets.models import ( + GitHubRateLimitResources, + GitHubTag, + GitHubTree, + GitHubTreesResponse, + GitHubUrl, + NpmPackageMetadataResponse, + NpmUrl, + ParsedRateLimit, + ParsedTag, + ParsedTree, + QueryTree, + ReParsedTag, +) if TYPE_CHECKING: + import sys from collections.abc import Mapping, MutableMapping from email.message import Message from typing import TypeVar @@ -50,9 +50,9 @@ else: from typing_extensions import TypeIs if sys.version_info >= (3, 11): - from typing import LiteralString, Required + from typing import LiteralString else: - from typing_extensions import LiteralString, Required + from typing_extensions import LiteralString if sys.version_info >= (3, 10): from typing import TypeAlias else: @@ -81,153 +81,6 @@ def _is_str(obj: Any) -> TypeIs[str]: return isinstance(obj, str) -class GitHubUrl(NamedTuple): - BASE: LiteralString - RATE: LiteralString - REPO: LiteralString - TAGS: LiteralString - TREES: LiteralString - - -class NpmUrl(NamedTuple): - CDN: LiteralString - TAGS: LiteralString - - -class GitHubTag(TypedDict): - name: str - node_id: str - commit: dict[Literal["sha", "url"], str] - zipball_url: str - tarball_url: str - - -class ParsedTag(TypedDict): - tag: str - sha: str - trees_url: str - - -class ReParsedTag(ParsedTag): - major: int - minor: int - patch: int - pre_release: int | None - is_pre_release: bool - - -class GitHubTree(TypedDict): - """ - A single file's metadata within the response of `Get a tree`_. - - .. _Get a tree: - https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree - """ - - path: str - mode: str - type: str - sha: str - size: int - url: str - - -class GitHubTreesResponse(TypedDict): - """ - Response from `Get a tree`_. - - Describes directory metadata, with files stored in ``"tree"``. - - .. 
_Get a tree: - https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree - """ - - sha: str - url: str - tree: list[GitHubTree] - truncated: bool - - -class NpmVersion(TypedDict): - version: str - links: dict[Literal["self", "entrypoints", "stats"], str] - - -class NpmPackageMetadataResponse(TypedDict): - """ - Response from `Get package metadata`_. - - Using: - - headers={"Accept": "application/json"} - - .. _Get package metadata: - https://data.jsdelivr.com/v1/packages/npm/vega-datasets - """ - - type: str - name: str - tags: dict[Literal["canary", "next", "latest"], str] - versions: list[NpmVersion] - links: dict[Literal["stats"], str] - - -class ParsedTree(TypedDict): - file_name: str - name_js: str - name_py: str - suffix: str - size: int - url: str - ext_supported: bool - tag: str - - -class QueryTree(TypedDict, total=False): - file_name: str - name_js: Required[str] - name_py: str - suffix: str - size: int - url: str - ext_supported: bool - tag: str - - -class ParsedTreesResponse(TypedDict): - tag: str - url: str - tree: list[ParsedTree] - - -class GitHubRateLimit(TypedDict): - limit: int - used: int - remaining: int - reset: int - - -class ParsedRateLimit(GitHubRateLimit): - reset_time: time.struct_time - is_limited: bool - is_auth: bool - - -class GitHubRateLimitResources(TypedDict, total=False): - """ - A subset of response from `Get rate limit status for the authenticated user`_. - - .. _Get rate limit status for the authenticated user: - https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user - """ - - core: Required[GitHubRateLimit] - search: Required[GitHubRateLimit] - graphql: GitHubRateLimit - integration_manifest: GitHubRateLimit - code_search: GitHubRateLimit - - class _ErrorHandler(urllib.request.BaseHandler): """ Adds `rate limit`_ info to a forbidden error. @@ -608,6 +461,8 @@ def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: class _Npm: + """https://www.jsdelivr.com/docs/data.jsdelivr.com#overview.""" + def __init__( self, output_dir: Path, @@ -958,8 +813,6 @@ def __getattr__(self, name: str) -> Dataset: def __dir__(self) -> list[str]: return self.list_datasets() - # BUG: # 1.6.0 exists on GH but not npm? 
- # https://www.jsdelivr.com/docs/data.jsdelivr.com#overview def __call__( self, name: str, diff --git a/tools/datasets/models.py b/tools/datasets/models.py new file mode 100644 index 000000000..5a6598fed --- /dev/null +++ b/tools/datasets/models.py @@ -0,0 +1,166 @@ +"""API-related data structures.""" + +from __future__ import annotations + +import sys +from typing import TYPE_CHECKING, Literal, NamedTuple + +if sys.version_info >= (3, 14): + from typing import TypedDict +else: + from typing_extensions import TypedDict + +if TYPE_CHECKING: + import time + + if sys.version_info >= (3, 11): + from typing import LiteralString, Required + else: + from typing_extensions import LiteralString, Required + + +class GitHubUrl(NamedTuple): + BASE: LiteralString + RATE: LiteralString + REPO: LiteralString + TAGS: LiteralString + TREES: LiteralString + + +class NpmUrl(NamedTuple): + CDN: LiteralString + TAGS: LiteralString + + +class GitHubTag(TypedDict): + name: str + node_id: str + commit: dict[Literal["sha", "url"], str] + zipball_url: str + tarball_url: str + + +class ParsedTag(TypedDict): + tag: str + sha: str + trees_url: str + + +class ReParsedTag(ParsedTag): + major: int + minor: int + patch: int + pre_release: int | None + is_pre_release: bool + + +class GitHubTree(TypedDict): + """ + A single file's metadata within the response of `Get a tree`_. + + .. _Get a tree: + https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree + """ + + path: str + mode: str + type: str + sha: str + size: int + url: str + + +class GitHubTreesResponse(TypedDict): + """ + Response from `Get a tree`_. + + Describes directory metadata, with files stored in ``"tree"``. + + .. _Get a tree: + https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree + """ + + sha: str + url: str + tree: list[GitHubTree] + truncated: bool + + +class NpmVersion(TypedDict): + version: str + links: dict[Literal["self", "entrypoints", "stats"], str] + + +class NpmPackageMetadataResponse(TypedDict): + """ + Response from `Get package metadata`_. + + Using: + + headers={"Accept": "application/json"} + + .. _Get package metadata: + https://data.jsdelivr.com/v1/packages/npm/vega-datasets + """ + + type: str + name: str + tags: dict[Literal["canary", "next", "latest"], str] + versions: list[NpmVersion] + links: dict[Literal["stats"], str] + + +class ParsedTree(TypedDict): + file_name: str + name_js: str + name_py: str + suffix: str + size: int + url: str + ext_supported: bool + tag: str + + +class QueryTree(TypedDict, total=False): + file_name: str + name_js: Required[str] + name_py: str + suffix: str + size: int + url: str + ext_supported: bool + tag: str + + +class ParsedTreesResponse(TypedDict): + tag: str + url: str + tree: list[ParsedTree] + + +class GitHubRateLimit(TypedDict): + limit: int + used: int + remaining: int + reset: int + + +class ParsedRateLimit(GitHubRateLimit): + reset_time: time.struct_time + is_limited: bool + is_auth: bool + + +class GitHubRateLimitResources(TypedDict, total=False): + """ + A subset of response from `Get rate limit status for the authenticated user`_. + + .. 
_Get rate limit status for the authenticated user: + https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user + """ + + core: Required[GitHubRateLimit] + search: Required[GitHubRateLimit] + graphql: GitHubRateLimit + integration_manifest: GitHubRateLimit + code_search: GitHubRateLimit From bc86ca18101e9e688caec7ea5e66afc2810ef993 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:16:05 +0000 Subject: [PATCH 032/201] refactor: Move, rename `semver`-related tools --- tools/datasets/__init__.py | 55 ++++++------------------------------ tools/datasets/semver.py | 57 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 46 deletions(-) create mode 100644 tools/datasets/semver.py diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 2b87ded3b..ce61dbbe7 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -23,6 +23,7 @@ import polars as pl +from tools.datasets import semver from tools.datasets.models import ( GitHubRateLimitResources, GitHubTag, @@ -42,7 +43,6 @@ import sys from collections.abc import Mapping, MutableMapping from email.message import Message - from typing import TypeVar from urllib.request import OpenerDirector, Request if sys.version_info >= (3, 13): @@ -59,7 +59,6 @@ from typing_extensions import TypeAlias from tools.schemapi.utils import OneOrSeq - _Frame = TypeVar("_Frame", pl.DataFrame, pl.LazyFrame) _PathName: TypeAlias = Literal["dir", "tags", "trees"] WorkInProgress: TypeAlias = Any @@ -71,10 +70,6 @@ _NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" _SUB_DIR = "data" -_SEM_VER_FIELDS: tuple[ - Literal["major"], Literal["minor"], Literal["patch"], Literal["pre_release"] -] = "major", "minor", "patch", "pre_release" -_CANARY: Literal["--canary"] = "--canary" def _is_str(obj: Any) -> TypeIs[str]: @@ -350,7 +345,7 @@ def tags( self, n_head: int | None = None, *, warn_lower: bool = False ) -> pl.DataFrame: tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) - return pl.DataFrame(self.parse.tags(tags)).pipe(_with_sem_ver) + return pl.DataFrame(self.parse.tags(tags)).pipe(semver.with_columns) def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: """Retrieve directory info for a given version ``tag``.""" @@ -398,7 +393,7 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: f"Finished collection.\n" f"Writing {fresh_rows.height} new rows to {fp!s}" ) - return pl.concat((trees, fresh_rows)).pipe(_sort_sem_ver) + return pl.concat((trees, fresh_rows)).pipe(semver.sort) def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: limit = self.rate_limit(strict=True) @@ -421,7 +416,7 @@ def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: latest = ( self.tags(1).lazy().join(npm_tag_only, on="tag", how="inner").collect() ) - if latest.equals(prev.pipe(_sort_sem_ver).head(1).collect()): + if latest.equals(prev.pipe(semver.sort).head(1).collect()): print(f"Already up-to-date {fp!s}") return prev.collect() print(f"Refreshing {fp!s}") @@ -429,16 +424,14 @@ def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: tags = ( pl.concat((self.tags(), prev_eager), how="vertical") .unique("sha") - .pipe(_sort_sem_ver) + .pipe(semver.sort) ) print(f"Collected {tags.height - prev_eager.height} new tags") return tags def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: - rate_limit = 
self.rate_limit() - if rate_limit["is_limited"]: - raise NotImplementedError(rate_limit) - elif not isinstance(tags, Sequence): + rate_limit = self.rate_limit(strict=True) + if not isinstance(tags, Sequence): tags = tuple(tags) req = self.req n = len(tags) @@ -511,9 +504,9 @@ def tags(self) -> pl.DataFrame: versions = [ f"v{tag}" for v in content["versions"] - if (tag := v["version"]) and _CANARY not in tag + if (tag := v["version"]) and semver.CANARY not in tag ] - return pl.DataFrame({"tag": versions}).pipe(_with_sem_ver) + return pl.DataFrame({"tag": versions}).pipe(semver.with_columns) class Application: @@ -606,36 +599,6 @@ def _tag_from(s: str, /) -> str: raise TypeError(s) -def _with_sem_ver(df: pl.DataFrame, *, col_tag: str = "tag") -> pl.DataFrame: - """ - Extracts components of a `SemVer`_ string into sortable columns. - - .. _SemVer: - https://semver.org/#backusnaur-form-grammar-for-valid-semver-versions - """ - fields = pl.col(_SEM_VER_FIELDS) - pattern = r"""(?x) - v?(?[[:digit:]]*)\. - (?[[:digit:]]*)\. - (?[[:digit:]]*) - (\-(next)?(beta)?\.)? - (?[[:digit:]]*)? - """ - sem_ver = pl.col(col_tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) - return ( - df.lazy() - .with_columns(sem_ver) - .with_columns(pl.when(fields.str.len_chars() > 0).then(fields).cast(pl.Int64)) - .with_columns(is_pre_release=pl.col("pre_release").is_not_null()) - .collect() - ) - - -def _sort_sem_ver(frame: _Frame, /) -> _Frame: - """Sort ``frame``, displaying in descending release order.""" - return frame.sort(_SEM_VER_FIELDS, descending=True) - - # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago diff --git a/tools/datasets/semver.py b/tools/datasets/semver.py new file mode 100644 index 000000000..cb4c6c799 --- /dev/null +++ b/tools/datasets/semver.py @@ -0,0 +1,57 @@ +""" +Parsing/transforming semantic versioning strings. + +.. _semantic versioning: + https://semver.org/ +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal + +import polars as pl + +if TYPE_CHECKING: + from typing import TypeVar + + _Frame = TypeVar("_Frame", pl.DataFrame, pl.LazyFrame) + +__all__ = ["CANARY", "sort", "with_columns"] + +_SEM_VER_FIELDS: tuple[ + Literal["major"], Literal["minor"], Literal["patch"], Literal["pre_release"] +] = "major", "minor", "patch", "pre_release" +CANARY: Literal["--canary"] = "--canary" + + +def with_columns(frame: _Frame, /, *, col_tag: str = "tag") -> _Frame: + """ + Extracts components of a `SemVer`_ string into sortable columns. + + .. _SemVer: + https://semver.org/#backusnaur-form-grammar-for-valid-semver-versions + """ + fields = pl.col(_SEM_VER_FIELDS) + pattern = r"""(?x) + v?(?[[:digit:]]*)\. + (?[[:digit:]]*)\. + (?[[:digit:]]*) + (\-(next)?(beta)?\.)? + (?[[:digit:]]*)? 
+ """ + sem_ver = pl.col(col_tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) + ldf = ( + frame.lazy() + .with_columns(sem_ver) + .with_columns(pl.when(fields.str.len_chars() > 0).then(fields).cast(pl.Int64)) + .with_columns(is_pre_release=pl.col("pre_release").is_not_null()) + ) + if isinstance(frame, pl.DataFrame): + return ldf.collect() + else: + return ldf + + +def sort(frame: _Frame, /) -> _Frame: + """Sort ``frame``, displaying in descending release order.""" + return frame.sort(_SEM_VER_FIELDS, descending=True) From a6f56452df200ef2049aa3203e79f1d70005a198 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:19:13 +0000 Subject: [PATCH 033/201] refactor: Remove `write_schema` from `_Npm`, `_GitHub` Handled in `Application` now --- tools/datasets/__init__.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index ce61dbbe7..e26472c2f 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -296,14 +296,10 @@ def __init__( name_tags: str, name_trees: str, *, - write_schema: bool, base_url: LiteralString = "https://api.github.com/", org: LiteralString = "vega", package: LiteralString = "vega-datasets", ) -> None: - # When ``write_schema``, addtional ``...-schema.json`` file(s) are produced - # that describes column types - in a non-binary format. - self._write_schema: bool = write_schema output_dir.mkdir(exist_ok=True) self._paths: dict[_PathName, Path] = { "dir": output_dir, @@ -461,13 +457,11 @@ def __init__( output_dir: Path, name_tags: str, *, - write_schema: bool, jsdelivr: Literal["jsdelivr"] = "jsdelivr", npm: Literal["npm"] = "npm", package: LiteralString = "vega-datasets", jsdelivr_version: LiteralString = "v1", ) -> None: - self._write_schema: bool = write_schema output_dir.mkdir(exist_ok=True) self._paths: dict[Literal["tags"], Path] = { "tags": output_dir / f"{name_tags}.parquet" @@ -533,18 +527,9 @@ def __init__( kwds_npm = kwds_npm or {} self._write_schema: bool = write_schema self._github: _GitHub = _GitHub( - output_dir, - name_tags=tags_gh, - name_trees=trees_gh, - write_schema=write_schema, - **kwds_gh, - ) - self._npm: _Npm = _Npm( - output_dir, - name_tags=tags_npm, - write_schema=write_schema, - **kwds_npm, + output_dir, name_tags=tags_gh, name_trees=trees_gh, **kwds_gh ) + self._npm: _Npm = _Npm(output_dir, name_tags=tags_npm, **kwds_npm) @property def github(self) -> _GitHub: From 07a8342c95544fbbacff808f8d4d3868a1215a2c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 16:00:12 +0000 Subject: [PATCH 034/201] refactor: Rename, split `_Npm`, `_GitHub` into own modules `tools.datasets.npm` will later be performing the requests that are in `Dataset.__call__` currently --- tools/datasets/__init__.py | 497 +------------------------------------ tools/datasets/github.py | 455 +++++++++++++++++++++++++++++++++ tools/datasets/npm.py | 76 ++++++ 3 files changed, 541 insertions(+), 487 deletions(-) create mode 100644 tools/datasets/github.py create mode 100644 tools/datasets/npm.py diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index e26472c2f..bcbe725a1 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -8,42 +8,21 @@ from __future__ import annotations import json -import os -import random import tempfile -import time -import urllib.request -import warnings -from 
collections.abc import Iterable, Iterator, Sequence from functools import cached_property, partial -from itertools import islice from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, Callable, ClassVar, Literal, cast, get_args +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, get_args from urllib.request import urlopen import polars as pl -from tools.datasets import semver -from tools.datasets.models import ( - GitHubRateLimitResources, - GitHubTag, - GitHubTree, - GitHubTreesResponse, - GitHubUrl, - NpmPackageMetadataResponse, - NpmUrl, - ParsedRateLimit, - ParsedTag, - ParsedTree, - QueryTree, - ReParsedTag, -) +from tools.datasets.github import GitHub +from tools.datasets.models import QueryTree +from tools.datasets.npm import Npm if TYPE_CHECKING: import sys - from collections.abc import Mapping, MutableMapping - from email.message import Message - from urllib.request import OpenerDirector, Request + from collections.abc import Mapping if sys.version_info >= (3, 13): from typing import TypeIs @@ -57,450 +36,10 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias - from tools.schemapi.utils import OneOrSeq - _PathName: TypeAlias = Literal["dir", "tags", "trees"] WorkInProgress: TypeAlias = Any - -_ItemSlice: TypeAlias = ( - "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" -) -"""Query result scalar selection.""" - -_NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" -_SUB_DIR = "data" - - -def _is_str(obj: Any) -> TypeIs[str]: - return isinstance(obj, str) - - -class _ErrorHandler(urllib.request.BaseHandler): - """ - Adds `rate limit`_ info to a forbidden error. - - .. _rate limit: - https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28 - """ - - def http_error_default( - self, req: Request, fp: IO[bytes] | None, code: int, msg: str, hdrs: Message - ): - if code == 403 and (reset := hdrs.get("X-RateLimit-Reset", None)): - limit = hdrs.get("X-RateLimit-Limit", "") - remaining = hdrs.get("X-RateLimit-Remaining", "") - msg = ( - f"{msg}\n\nFailed to balance rate limit.\n" - f"{limit=}, {remaining=}\n" - f"Reset: {time.localtime(int(reset))!r}" - ) - raise urllib.request.HTTPError(req.full_url, code, msg, hdrs, fp) - - -class _GitHubRequestNamespace: - """ - Fetching resources from the `GitHub API`_. - - .. 
_GitHub API: - https://docs.github.com/en/rest/about-the-rest-api/about-the-rest-api?apiVersion=2022-11-28 - """ - - _ENV_VAR: LiteralString = "VEGA_GITHUB_TOKEN" - _TAGS_MAX_PAGE: Literal[100] = 100 - _VERSION: LiteralString = "2022-11-28" - _UNAUTH_RATE_LIMIT: Literal[60] = 60 - _TAGS_COST: Literal[1] = 1 - _TREES_COST: Literal[2] = 2 - _UNAUTH_DELAY: Literal[5] = 5 - _AUTH_DELAY: Literal[1] = 1 - _UNAUTH_TREES_LIMIT: Literal[10] = 10 - - def __init__(self, gh: _GitHub, /) -> None: - self._gh = gh - - @property - def url(self) -> GitHubUrl: - return self._gh.url - - def rate_limit(self) -> GitHubRateLimitResources: - """https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user.""" - with self._gh._opener.open(self._request(self.url.RATE)) as response: - content: GitHubRateLimitResources = json.load(response)["resources"] - return content - - def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: - """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" - if n < 1 or n > self._TAGS_MAX_PAGE: - raise ValueError(n) - req = self._request(f"{self.url.TAGS}?per_page={n}") - with self._gh._opener.open(req) as response: - content: list[GitHubTag] = json.load(response) - if warn_lower and len(content) < n: - earliest = response[-1]["name"] - n_response = len(content) - msg = f"Requested {n=} tags, but got {n_response}\n" f"{earliest=}" - warnings.warn(msg, stacklevel=3) - return content - - def trees(self, tag: str | ParsedTag, /) -> GitHubTreesResponse: - """ - For a given ``tag``, perform **2x requests** to get directory metadata. - - Returns response unchanged - but with annotations. - """ - if _is_str(tag): - url = tag if tag.startswith(self.url.TREES) else f"{self.url.TREES}{tag}" - else: - url = tag["trees_url"] - with self._gh._opener.open(self._request(url)) as response: - content: GitHubTreesResponse = json.load(response) - query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) - if data_url := next(query, None): - with self._gh._opener.open(self._request(data_url)) as response: - data_dir: GitHubTreesResponse = json.load(response) - return data_dir - else: - raise FileNotFoundError - - def _request(self, url: str, /, *, raw: bool = False) -> Request: - """ - Wrap a request url with a `personal access token`_ - if set as an env var. - - By default the endpoint returns json, specify raw to get blob data. - See `Media types`_. - - .. _personal access token: - https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens - .. _Media types: - https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types - """ - headers: MutableMapping[str, str] = {"X-GitHub-Api-Version": self._VERSION} - if tok := os.environ.get(self._ENV_VAR): - headers["Authorization"] = ( - tok if tok.startswith("Bearer ") else f"Bearer {tok}" - ) - if raw: - headers["Accept"] = "application/vnd.github.raw+json" - return urllib.request.Request(url, headers=headers) - - -class _GitHubParseNamespace: - """ - Transform responses into intermediate representations. 
- - Where relevant: - - Adding cheap to compute metadata - - Dropping information that we don't need for the task - """ - - def __init__(self, gh: _GitHub, /) -> None: - self._gh = gh - - @property - def url(self) -> GitHubUrl: - return self._gh.url - - def rate_limit(self, rate_limit: GitHubRateLimitResources, /) -> ParsedRateLimit: - core = rate_limit["core"] - reset = core["reset"] - return ParsedRateLimit( - **core, - reset_time=time.localtime(reset), - is_limited=core["remaining"] == 0, - is_auth=core["limit"] > self._gh.req._UNAUTH_RATE_LIMIT, - ) - - def tag(self, tag: GitHubTag, /) -> ParsedTag: - sha = tag["commit"]["sha"] - return ParsedTag(tag=tag["name"], sha=sha, trees_url=f"{self.url.TREES}{sha}") - - def tags(self, tags: list[GitHubTag], /) -> list[ParsedTag]: - return [self.tag(t) for t in tags] - - def tree(self, tree: GitHubTree, tag: str, /) -> ParsedTree: - """For a single tree (file) convert to an IR with only relevant properties.""" - path = Path(tree["path"]) - return ParsedTree( - file_name=path.name, - name_js=path.stem, - name_py=_js_to_py(path.stem), - suffix=path.suffix, - size=tree["size"], - url=tree["url"], - ext_supported=is_ext_supported(path.suffix), - tag=tag, - ) - - def trees(self, tree: GitHubTreesResponse, /, tag: str) -> list[ParsedTree]: - """For a tree response (directory of files) convert to an IR with only relevant properties.""" - return [self.tree(t, tag) for t in tree["tree"]] - - -class _GitHubQueryNamespace: - """**WIP** Interfacing with the cached metadata.""" - - def __init__(self, gh: _GitHub, /) -> None: - self._gh = gh - - @property - def paths(self) -> dict[_PathName, Path]: - return self._gh._paths - - def url_from( - self, - *predicates: OneOrSeq[str | pl.Expr], - item: _ItemSlice = (0, "url_npm"), - **constraints: Any, - ) -> str: - """Querying multi-version trees metadata for `npm` url to fetch.""" - fp = self.paths["trees"] - if fp.suffix != ".parquet": - raise NotImplementedError(fp.suffix) - items = pl.scan_parquet(fp).filter(*predicates, **constraints).collect() - if items.is_empty(): - msg = f"Found no results for:\n" f"{predicates!r}\n{constraints!r}" - raise NotImplementedError(msg) - r = items.item(*item) - if _is_str(r): - return r - else: - msg = f"Expected 'str' but got {type(r).__name__!r} from {r!r}." - raise TypeError(msg) - - -class _GitHub: - """ - Primary interface with the GitHub API. - - Maintains up-to-date metadata, describing **every** available dataset across **all known** releases. - - - Uses `tags`_, `trees`_, `rate_limit`_ endpoints. - - Organizes distinct groups of operations into property accessor namespaces. - - - .. _tags: - https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags - .. _trees: - https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree - .. 
_rate_limit: - https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user - - """ - - _opener: ClassVar[OpenerDirector] = urllib.request.build_opener(_ErrorHandler) - - def __init__( - self, - output_dir: Path, - name_tags: str, - name_trees: str, - *, - base_url: LiteralString = "https://api.github.com/", - org: LiteralString = "vega", - package: LiteralString = "vega-datasets", - ) -> None: - output_dir.mkdir(exist_ok=True) - self._paths: dict[_PathName, Path] = { - "dir": output_dir, - "tags": output_dir / f"{name_tags}.parquet", - "trees": output_dir / f"{name_trees}.parquet", - } - repo = f"{base_url}repos/{org}/{package}/" - self._url = GitHubUrl( - BASE=base_url, - RATE=f"{base_url}rate_limit", - REPO=repo, - TAGS=f"{repo}tags", - TREES=f"{repo}git/trees/", - ) - - @property - def req(self) -> _GitHubRequestNamespace: - return _GitHubRequestNamespace(self) - - @property - def parse(self) -> _GitHubParseNamespace: - return _GitHubParseNamespace(self) - - @property - def query(self) -> _GitHubQueryNamespace: - return _GitHubQueryNamespace(self) - - @property - def url(self) -> GitHubUrl: - return self._url - - def rate_limit(self, *, strict: bool = False) -> ParsedRateLimit: - limit = self.parse.rate_limit(self.req.rate_limit()) - if strict and limit["is_limited"]: - raise NotImplementedError(limit) - return limit - - def tags( - self, n_head: int | None = None, *, warn_lower: bool = False - ) -> pl.DataFrame: - tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) - return pl.DataFrame(self.parse.tags(tags)).pipe(semver.with_columns) - - def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: - """Retrieve directory info for a given version ``tag``.""" - trees = self.req.trees(tag) - tag_v = _tag_from(tag) if _is_str(tag) else tag["tag"] - parsed = self.parse.trees(trees, tag=tag_v) - df = ( - pl.DataFrame(parsed) - .lazy() - .rename({"url": "url_github"}) - .with_columns(name_collision=pl.col("name_py").is_duplicated()) - .with_columns( - url_npm=pl.concat_str( - pl.lit(_NPM_BASE_URL), - pl.col("tag"), - pl.lit(f"/{_SUB_DIR}/"), - pl.col("file_name"), - ) - ) - .collect() - ) - return df.select(*sorted(df.columns)) - - def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: - """ - Use known tags to discover and update missing trees metadata. - - Aims to stay well-within API rate limits, both for authenticated ad unauthenticated users. 
- """ - rate_limit = self.rate_limit(strict=True) - fp = self._paths["trees"] - trees = pl.read_parquet(fp) - missing_trees = gh_tags.join( - trees.select(pl.col("tag").unique()), on="tag", how="anti" - ) - if missing_trees.is_empty(): - print(f"Already up-to-date {fp!s}") - return trees - else: - stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT - it = islice(missing_trees.iter_rows(named=True), stop) - missing = cast("Iterator[ReParsedTag]", it) - fresh_rows = self._trees_batched(missing) - print( - f"Finished collection.\n" - f"Writing {fresh_rows.height} new rows to {fp!s}" - ) - return pl.concat((trees, fresh_rows)).pipe(semver.sort) - - def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: - limit = self.rate_limit(strict=True) - npm_tag_only = npm_tags.lazy().select("tag") - fp = self._paths["tags"] - if not limit["is_auth"] and limit["remaining"] <= self.req._TAGS_COST: - return ( - pl.scan_parquet(fp).join(npm_tag_only, on="tag", how="inner").collect() - ) - elif not fp.exists(): - print(f"Initializing {fp!s}") - tags = ( - self.tags().lazy().join(npm_tag_only, on="tag", how="inner").collect() - ) - print(f"Collected {tags.height} new tags") - return tags - else: - print("Checking for new tags") - prev = pl.scan_parquet(fp) - latest = ( - self.tags(1).lazy().join(npm_tag_only, on="tag", how="inner").collect() - ) - if latest.equals(prev.pipe(semver.sort).head(1).collect()): - print(f"Already up-to-date {fp!s}") - return prev.collect() - print(f"Refreshing {fp!s}") - prev_eager = prev.collect() - tags = ( - pl.concat((self.tags(), prev_eager), how="vertical") - .unique("sha") - .pipe(semver.sort) - ) - print(f"Collected {tags.height - prev_eager.height} new tags") - return tags - - def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: - rate_limit = self.rate_limit(strict=True) - if not isinstance(tags, Sequence): - tags = tuple(tags) - req = self.req - n = len(tags) - cost = req._TREES_COST * n - if rate_limit["remaining"] < cost: - raise NotImplementedError(rate_limit, cost) - delay_secs = req._AUTH_DELAY if rate_limit["is_auth"] else req._UNAUTH_DELAY - print( - f"Collecting metadata for {n} missing releases.\n" - f"Using {delay_secs=} between requests ..." - ) - dfs: list[pl.DataFrame] = [] - for tag in tags: - time.sleep(delay_secs + random.triangular()) - dfs.append(self.trees(tag)) - return pl.concat(dfs) - - -####################################################################################### - - -class _Npm: - """https://www.jsdelivr.com/docs/data.jsdelivr.com#overview.""" - - def __init__( - self, - output_dir: Path, - name_tags: str, - *, - jsdelivr: Literal["jsdelivr"] = "jsdelivr", - npm: Literal["npm"] = "npm", - package: LiteralString = "vega-datasets", - jsdelivr_version: LiteralString = "v1", - ) -> None: - output_dir.mkdir(exist_ok=True) - self._paths: dict[Literal["tags"], Path] = { - "tags": output_dir / f"{name_tags}.parquet" - } - self._url: NpmUrl = NpmUrl( - CDN=f"https://cdn.{jsdelivr}.net/{npm}/{package}@", - TAGS=f"https://data.{jsdelivr}.com/{jsdelivr_version}/packages/{npm}/{package}", - ) - - @property - def url(self) -> NpmUrl: - return self._url - - def tags(self) -> pl.DataFrame: - """ - Request, parse tags from `Get package metadata`_. - - Notes - ----- - - Ignores canary releases - - ``npm`` can accept either, but this endpoint returns without "v": - - {tag} - v{tag} - - .. 
_Get package metadata: - https://www.jsdelivr.com/docs/data.jsdelivr.com#get-/v1/packages/npm/-package- - """ - req = urllib.request.Request( - self.url.TAGS, headers={"Accept": "application/json"} - ) - with urllib.request.urlopen(req) as response: - content: NpmPackageMetadataResponse = json.load(response) - versions = [ - f"v{tag}" - for v in content["versions"] - if (tag := v["version"]) and semver.CANARY not in tag - ] - return pl.DataFrame({"tag": versions}).pipe(semver.with_columns) +__all__ = ["app", "data"] class Application: @@ -526,17 +65,17 @@ def __init__( kwds_gh = kwds_gh or {} kwds_npm = kwds_npm or {} self._write_schema: bool = write_schema - self._github: _GitHub = _GitHub( + self._github: GitHub = GitHub( output_dir, name_tags=tags_gh, name_trees=trees_gh, **kwds_gh ) - self._npm: _Npm = _Npm(output_dir, name_tags=tags_npm, **kwds_npm) + self._npm: Npm = Npm(output_dir, name_tags=tags_npm, **kwds_npm) @property - def github(self) -> _GitHub: + def github(self) -> GitHub: return self._github @property - def npm(self) -> _Npm: + def npm(self) -> Npm: return self._npm def refresh(self) -> pl.DataFrame: @@ -568,22 +107,6 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None app = Application(Path(__file__).parent / "_metadata", write_schema=True) -def _tag_from(s: str, /) -> str: - # - Actual tag - # - Trees url (using ref name) - # - npm url (works w/o the `v` prefix) - trees_url = app.github.url.TREES - if s.startswith("v"): - return s - elif s.startswith(trees_url): - return s.replace(trees_url, "") - elif s.startswith(_NPM_BASE_URL): - s, _ = s.replace(_NPM_BASE_URL, "").split("/") - return s if s.startswith("v") else f"v{s}" - else: - raise TypeError(s) - - # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. 
_OLD_SOURCE_TAG = "v1.29.0" # 5 years ago diff --git a/tools/datasets/github.py b/tools/datasets/github.py new file mode 100644 index 000000000..e245b91b1 --- /dev/null +++ b/tools/datasets/github.py @@ -0,0 +1,455 @@ +from __future__ import annotations + +import json +import os +import random +import time +import urllib.request +import warnings +from collections.abc import Iterable, Iterator, Sequence +from itertools import islice +from pathlib import Path +from typing import IO, TYPE_CHECKING, Any, ClassVar, Literal, cast + +import polars as pl + +from tools.datasets import semver +from tools.datasets.models import ( + GitHubRateLimitResources, + GitHubTag, + GitHubTree, + GitHubTreesResponse, + GitHubUrl, + ParsedRateLimit, + ParsedTag, + ParsedTree, +) + +if TYPE_CHECKING: + import sys + from collections.abc import MutableMapping + from email.message import Message + from urllib.request import OpenerDirector, Request + + from tools.datasets import ExtSupported + from tools.datasets.models import ReParsedTag + from tools.schemapi.utils import OneOrSeq + + if sys.version_info >= (3, 13): + from typing import TypeIs + else: + from typing_extensions import TypeIs + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + + _PathName: TypeAlias = Literal["dir", "tags", "trees"] + +__all__ = ["GitHub"] + +_ItemSlice: TypeAlias = ( + "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" +) +"""Query result scalar selection.""" + +# TODO: Work on where these should live/be accessed +_NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" +_SUB_DIR = "data" + + +def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: + return suffix in {".csv", ".json", ".tsv", ".arrow"} + + +def _is_str(obj: Any) -> TypeIs[str]: + return isinstance(obj, str) + + +class _ErrorHandler(urllib.request.BaseHandler): + """ + Adds `rate limit`_ info to a forbidden error. + + .. _rate limit: + https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28 + """ + + def http_error_default( + self, req: Request, fp: IO[bytes] | None, code: int, msg: str, hdrs: Message + ): + if code == 403 and (reset := hdrs.get("X-RateLimit-Reset", None)): + limit = hdrs.get("X-RateLimit-Limit", "") + remaining = hdrs.get("X-RateLimit-Remaining", "") + msg = ( + f"{msg}\n\nFailed to balance rate limit.\n" + f"{limit=}, {remaining=}\n" + f"Reset: {time.localtime(int(reset))!r}" + ) + raise urllib.request.HTTPError(req.full_url, code, msg, hdrs, fp) + + +class _GitHubRequestNamespace: + """ + Fetching resources from the `GitHub API`_. + + .. 
_GitHub API: + https://docs.github.com/en/rest/about-the-rest-api/about-the-rest-api?apiVersion=2022-11-28 + """ + + _ENV_VAR: LiteralString = "VEGA_GITHUB_TOKEN" + _TAGS_MAX_PAGE: Literal[100] = 100 + _VERSION: LiteralString = "2022-11-28" + _UNAUTH_RATE_LIMIT: Literal[60] = 60 + _TAGS_COST: Literal[1] = 1 + _TREES_COST: Literal[2] = 2 + _UNAUTH_DELAY: Literal[5] = 5 + _AUTH_DELAY: Literal[1] = 1 + _UNAUTH_TREES_LIMIT: Literal[10] = 10 + + def __init__(self, gh: GitHub, /) -> None: + self._gh = gh + + @property + def url(self) -> GitHubUrl: + return self._gh.url + + def rate_limit(self) -> GitHubRateLimitResources: + """https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user.""" + with self._gh._opener.open(self._request(self.url.RATE)) as response: + content: GitHubRateLimitResources = json.load(response)["resources"] + return content + + def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: + """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" + if n < 1 or n > self._TAGS_MAX_PAGE: + raise ValueError(n) + req = self._request(f"{self.url.TAGS}?per_page={n}") + with self._gh._opener.open(req) as response: + content: list[GitHubTag] = json.load(response) + if warn_lower and len(content) < n: + earliest = response[-1]["name"] + n_response = len(content) + msg = f"Requested {n=} tags, but got {n_response}\n" f"{earliest=}" + warnings.warn(msg, stacklevel=3) + return content + + def trees(self, tag: str | ParsedTag, /) -> GitHubTreesResponse: + """ + For a given ``tag``, perform **2x requests** to get directory metadata. + + Returns response unchanged - but with annotations. + """ + if _is_str(tag): + url = tag if tag.startswith(self.url.TREES) else f"{self.url.TREES}{tag}" + else: + url = tag["trees_url"] + with self._gh._opener.open(self._request(url)) as response: + content: GitHubTreesResponse = json.load(response) + query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) + if data_url := next(query, None): + with self._gh._opener.open(self._request(data_url)) as response: + data_dir: GitHubTreesResponse = json.load(response) + return data_dir + else: + raise FileNotFoundError + + def _request(self, url: str, /, *, raw: bool = False) -> Request: + """ + Wrap a request url with a `personal access token`_ - if set as an env var. + + By default the endpoint returns json, specify raw to get blob data. + See `Media types`_. + + .. _personal access token: + https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens + .. _Media types: + https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types + """ + headers: MutableMapping[str, str] = {"X-GitHub-Api-Version": self._VERSION} + if tok := os.environ.get(self._ENV_VAR): + headers["Authorization"] = ( + tok if tok.startswith("Bearer ") else f"Bearer {tok}" + ) + if raw: + headers["Accept"] = "application/vnd.github.raw+json" + return urllib.request.Request(url, headers=headers) + + +class _GitHubParseNamespace: + """ + Transform responses into intermediate representations. 
+ + Where relevant: + - Adding cheap to compute metadata + - Dropping information that we don't need for the task + """ + + def __init__(self, gh: GitHub, /) -> None: + self._gh = gh + + @property + def url(self) -> GitHubUrl: + return self._gh.url + + def rate_limit(self, rate_limit: GitHubRateLimitResources, /) -> ParsedRateLimit: + core = rate_limit["core"] + reset = core["reset"] + return ParsedRateLimit( + **core, + reset_time=time.localtime(reset), + is_limited=core["remaining"] == 0, + is_auth=core["limit"] > self._gh.req._UNAUTH_RATE_LIMIT, + ) + + def tag(self, tag: GitHubTag, /) -> ParsedTag: + sha = tag["commit"]["sha"] + return ParsedTag(tag=tag["name"], sha=sha, trees_url=f"{self.url.TREES}{sha}") + + def tags(self, tags: list[GitHubTag], /) -> list[ParsedTag]: + return [self.tag(t) for t in tags] + + def tree(self, tree: GitHubTree, tag: str, /) -> ParsedTree: + """For a single tree (file) convert to an IR with only relevant properties.""" + path = Path(tree["path"]) + return ParsedTree( + file_name=path.name, + name_js=path.stem, + name_py=path.stem.replace("-", "_"), + suffix=path.suffix, + size=tree["size"], + url=tree["url"], + ext_supported=is_ext_supported(path.suffix), + tag=tag, + ) + + def trees(self, tree: GitHubTreesResponse, /, tag: str) -> list[ParsedTree]: + """For a tree response (directory of files) convert to an IR with only relevant properties.""" + return [self.tree(t, tag) for t in tree["tree"]] + + def tag_from_str(self, s: str, /) -> str: + # - Actual tag + # - Trees url (using ref name) + # - npm url (works w/o the `v` prefix) + trees_url = self.url.TREES + if s.startswith("v"): + return s + elif s.startswith(trees_url): + return s.replace(trees_url, "") + elif s.startswith(_NPM_BASE_URL): + s, _ = s.replace(_NPM_BASE_URL, "").split("/") + return s if s.startswith("v") else f"v{s}" + else: + raise TypeError(s) + + +class _GitHubQueryNamespace: + """**WIP** Interfacing with the cached metadata.""" + + def __init__(self, gh: GitHub, /) -> None: + self._gh = gh + + @property + def paths(self) -> dict[_PathName, Path]: + return self._gh._paths + + def url_from( + self, + *predicates: OneOrSeq[str | pl.Expr], + item: _ItemSlice = (0, "url_npm"), + **constraints: Any, + ) -> str: + """Querying multi-version trees metadata for `npm` url to fetch.""" + fp = self.paths["trees"] + if fp.suffix != ".parquet": + raise NotImplementedError(fp.suffix) + items = pl.scan_parquet(fp).filter(*predicates, **constraints).collect() + if items.is_empty(): + msg = f"Found no results for:\n" f"{predicates!r}\n{constraints!r}" + raise NotImplementedError(msg) + r = items.item(*item) + if _is_str(r): + return r + else: + msg = f"Expected 'str' but got {type(r).__name__!r} from {r!r}." + raise TypeError(msg) + + +class GitHub: + """ + Primary interface with the GitHub API. + + Maintains up-to-date metadata, describing **every** available dataset across **all known** releases. + + - Uses `tags`_, `trees`_, `rate_limit`_ endpoints. + - Organizes distinct groups of operations into property accessor namespaces. + + + .. _tags: + https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags + .. _trees: + https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree + .. 
_rate_limit: + https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user + + """ + + _opener: ClassVar[OpenerDirector] = urllib.request.build_opener(_ErrorHandler) + + def __init__( + self, + output_dir: Path, + name_tags: str, + name_trees: str, + *, + base_url: LiteralString = "https://api.github.com/", + org: LiteralString = "vega", + package: LiteralString = "vega-datasets", + ) -> None: + output_dir.mkdir(exist_ok=True) + self._paths: dict[_PathName, Path] = { + "dir": output_dir, + "tags": output_dir / f"{name_tags}.parquet", + "trees": output_dir / f"{name_trees}.parquet", + } + repo = f"{base_url}repos/{org}/{package}/" + self._url = GitHubUrl( + BASE=base_url, + RATE=f"{base_url}rate_limit", + REPO=repo, + TAGS=f"{repo}tags", + TREES=f"{repo}git/trees/", + ) + + @property + def req(self) -> _GitHubRequestNamespace: + return _GitHubRequestNamespace(self) + + @property + def parse(self) -> _GitHubParseNamespace: + return _GitHubParseNamespace(self) + + @property + def query(self) -> _GitHubQueryNamespace: + return _GitHubQueryNamespace(self) + + @property + def url(self) -> GitHubUrl: + return self._url + + def rate_limit(self, *, strict: bool = False) -> ParsedRateLimit: + limit = self.parse.rate_limit(self.req.rate_limit()) + if strict and limit["is_limited"]: + raise NotImplementedError(limit) + return limit + + def tags( + self, n_head: int | None = None, *, warn_lower: bool = False + ) -> pl.DataFrame: + tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) + return pl.DataFrame(self.parse.tags(tags)).pipe(semver.with_columns) + + def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: + """Retrieve directory info for a given version ``tag``.""" + trees = self.req.trees(tag) + tag_v = self.parse.tag_from_str(tag) if _is_str(tag) else tag["tag"] + parsed = self.parse.trees(trees, tag=tag_v) + df = ( + pl.DataFrame(parsed) + .lazy() + .rename({"url": "url_github"}) + .with_columns(name_collision=pl.col("name_py").is_duplicated()) + .with_columns( + url_npm=pl.concat_str( + pl.lit(_NPM_BASE_URL), + pl.col("tag"), + pl.lit(f"/{_SUB_DIR}/"), + pl.col("file_name"), + ) + ) + .collect() + ) + return df.select(*sorted(df.columns)) + + def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: + """ + Use known tags to discover and update missing trees metadata. + + Aims to stay well-within API rate limits, both for authenticated ad unauthenticated users. 
+ """ + rate_limit = self.rate_limit(strict=True) + fp = self._paths["trees"] + trees = pl.read_parquet(fp) + missing_trees = gh_tags.join( + trees.select(pl.col("tag").unique()), on="tag", how="anti" + ) + if missing_trees.is_empty(): + print(f"Already up-to-date {fp!s}") + return trees + else: + stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT + it = islice(missing_trees.iter_rows(named=True), stop) + missing = cast("Iterator[ReParsedTag]", it) + fresh_rows = self._trees_batched(missing) + print( + f"Finished collection.\n" + f"Writing {fresh_rows.height} new rows to {fp!s}" + ) + return pl.concat((trees, fresh_rows)).pipe(semver.sort) + + def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: + limit = self.rate_limit(strict=True) + npm_tag_only = npm_tags.lazy().select("tag") + fp = self._paths["tags"] + if not limit["is_auth"] and limit["remaining"] <= self.req._TAGS_COST: + return ( + pl.scan_parquet(fp).join(npm_tag_only, on="tag", how="inner").collect() + ) + elif not fp.exists(): + print(f"Initializing {fp!s}") + tags = ( + self.tags().lazy().join(npm_tag_only, on="tag", how="inner").collect() + ) + print(f"Collected {tags.height} new tags") + return tags + else: + print("Checking for new tags") + prev = pl.scan_parquet(fp) + latest = ( + self.tags(1).lazy().join(npm_tag_only, on="tag", how="inner").collect() + ) + if latest.equals(prev.pipe(semver.sort).head(1).collect()): + print(f"Already up-to-date {fp!s}") + return prev.collect() + print(f"Refreshing {fp!s}") + prev_eager = prev.collect() + tags = ( + pl.concat((self.tags(), prev_eager), how="vertical") + .unique("sha") + .pipe(semver.sort) + ) + print(f"Collected {tags.height - prev_eager.height} new tags") + return tags + + def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: + rate_limit = self.rate_limit(strict=True) + if not isinstance(tags, Sequence): + tags = tuple(tags) + req = self.req + n = len(tags) + cost = req._TREES_COST * n + if rate_limit["remaining"] < cost: + raise NotImplementedError(rate_limit, cost) + delay_secs = req._AUTH_DELAY if rate_limit["is_auth"] else req._UNAUTH_DELAY + print( + f"Collecting metadata for {n} missing releases.\n" + f"Using {delay_secs=} between requests ..." 
+ ) + dfs: list[pl.DataFrame] = [] + for tag in tags: + time.sleep(delay_secs + random.triangular()) + dfs.append(self.trees(tag)) + return pl.concat(dfs) diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py new file mode 100644 index 000000000..bdc20f83b --- /dev/null +++ b/tools/datasets/npm.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +import json +import urllib.request +from typing import TYPE_CHECKING, Literal + +import polars as pl + +from tools.datasets import semver +from tools.datasets.models import NpmUrl + +if TYPE_CHECKING: + import sys + from pathlib import Path + + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + from tools.datasets.models import NpmPackageMetadataResponse + +__all__ = ["Npm"] + + +class Npm: + """https://www.jsdelivr.com/docs/data.jsdelivr.com#overview.""" + + def __init__( + self, + output_dir: Path, + name_tags: str, + *, + jsdelivr: Literal["jsdelivr"] = "jsdelivr", + npm: Literal["npm"] = "npm", + package: LiteralString = "vega-datasets", + jsdelivr_version: LiteralString = "v1", + ) -> None: + output_dir.mkdir(exist_ok=True) + self._paths: dict[Literal["tags"], Path] = { + "tags": output_dir / f"{name_tags}.parquet" + } + self._url: NpmUrl = NpmUrl( + CDN=f"https://cdn.{jsdelivr}.net/{npm}/{package}@", + TAGS=f"https://data.{jsdelivr}.com/{jsdelivr_version}/packages/{npm}/{package}", + ) + + @property + def url(self) -> NpmUrl: + return self._url + + def tags(self) -> pl.DataFrame: + """ + Request, parse tags from `Get package metadata`_. + + Notes + ----- + - Ignores canary releases + - ``npm`` can accept either, but this endpoint returns without "v": + + {tag} + v{tag} + + .. _Get package metadata: + https://www.jsdelivr.com/docs/data.jsdelivr.com#get-/v1/packages/npm/-package- + """ + req = urllib.request.Request( + self.url.TAGS, headers={"Accept": "application/json"} + ) + with urllib.request.urlopen(req) as response: + content: NpmPackageMetadataResponse = json.load(response) + versions = [ + f"v{tag}" + for v in content["versions"] + if (tag := v["version"]) and semver.CANARY not in tag + ] + return pl.DataFrame({"tag": versions}).pipe(semver.with_columns) From b89e6dc31691cdeb2c33811c27db92c70ded7940 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 16:21:28 +0000 Subject: [PATCH 035/201] refactor: Move `DataLoader.__call__` -> `DataLoader.url()` -`data.name()` -> `data(name)` - `data.name.url` -> `data.url(name)` --- tools/datasets/__init__.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index bcbe725a1..c9114aa01 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -284,20 +284,13 @@ def __getattr__(self, name: str) -> Dataset: def __dir__(self) -> list[str]: return self.list_datasets() - def __call__( + def url( self, name: str, ext: ExtSupported | None = None, /, tag: LiteralString | Literal["latest"] | None = None, - ) -> WorkInProgress: - """ - **WIP** Will be using this *instead of* attribute access. 
- - - Original supports this as well - - Will only be using the actual (js_name) - - Some have hyphens, others underscores - """ + ) -> str: constraints: dict[Literal["tag", "suffix"], str] = {} if tag == "latest": raise NotImplementedError(tag) @@ -318,5 +311,21 @@ def __call__( q = QueryTree(name_js=name, **constraints) # type: ignore[typeddict-item] return app.github.query.url_from(**q) + def __call__( + self, + name: str, + ext: ExtSupported | None = None, + /, + tag: LiteralString | Literal["latest"] | None = None, + ) -> WorkInProgress: + """ + **WIP** Will be using this *instead of* attribute access. + + - Original supports this as well + - Will only be using the actual (js_name) + - Some have hyphens, others underscores + """ + return self.url(name, ext, tag=tag) + data = DataLoader() From 7b0fe294fabe3a562cf7d291951f1bd0da3e2b93 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 17:53:59 +0000 Subject: [PATCH 036/201] feat(typing): Generate annotations based on known datasets --- tools/datasets/__init__.py | 62 +++++++++++++++++ tools/datasets/_typing.py | 137 +++++++++++++++++++++++++++++++++++++ 2 files changed, 199 insertions(+) create mode 100644 tools/datasets/_typing.py diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index c9114aa01..bf5b7f187 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -16,9 +16,11 @@ import polars as pl +from tools.codemod import ruff from tools.datasets.github import GitHub from tools.datasets.models import QueryTree from tools.datasets.npm import Npm +from tools.schemapi import utils if TYPE_CHECKING: import sys @@ -37,10 +39,17 @@ else: from typing_extensions import TypeAlias + _PathAlias: TypeAlias = Literal["npm_tags", "gh_tags", "gh_trees"] + WorkInProgress: TypeAlias = Any __all__ = ["app", "data"] +HEADER_COMMENT = """\ +# The contents of this file are automatically written by +# tools/datasets.__init__.py. Do not modify directly. 
+""" + class Application: """ @@ -78,6 +87,14 @@ def github(self) -> GitHub: def npm(self) -> Npm: return self._npm + @property + def _aliases(self) -> dict[_PathAlias, Path]: + return { + "npm_tags": self.npm._paths["tags"], + "gh_tags": self.github._paths["tags"], + "gh_trees": self.github._paths["trees"], + } + def refresh(self) -> pl.DataFrame: npm_tags = self.npm.tags() self.write_parquet(npm_tags, self.npm._paths["tags"]) @@ -89,6 +106,21 @@ def refresh(self) -> pl.DataFrame: self.write_parquet(gh_trees, self.github._paths["trees"]) return gh_trees + def read(self, name: _PathAlias, /) -> pl.DataFrame: + """Read existing metadata from file.""" + return pl.read_parquet(self._from_alias(name)) + + def scan(self, name: _PathAlias, /) -> pl.LazyFrame: + """Scan existing metadata from file.""" + return pl.scan_parquet(self._from_alias(name)) + + def _from_alias(self, name: _PathAlias, /) -> Path: + if name not in {"npm_tags", "gh_tags", "gh_trees"}: + msg = f'Expected one of {["npm_tags", "gh_tags", "gh_trees"]!r}, but got: {name!r}' + raise TypeError(msg) + else: + return self._aliases[name] + def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: """Write ``frame`` to ``fp``, with some extra safety.""" if not fp.exists(): @@ -118,6 +150,36 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None """ +def generate_datasets_typing(application: Application, output: Path, /) -> None: + app = application + tags = app.scan("gh_tags").select("tag").collect().to_series() + names = ( + app.scan("gh_trees") + .filter("ext_supported") + .unique("name_js") + .select("name_js") + .sort("name_js") + .collect() + .to_series() + ) + NAME = "DatasetName" + TAG = "VersionTag" + EXT = "Extension" + contents = ( + f"{HEADER_COMMENT}", + "from __future__ import annotations\n", + "import sys", + "from typing import Literal, TYPE_CHECKING", + utils.import_typing_extensions((3, 10), "TypeAlias"), + "\n", + f"__all__ = {[NAME, TAG, EXT]}\n\n" + f"{NAME}: TypeAlias = {utils.spell_literal(names)}", + f"{TAG}: TypeAlias = {utils.spell_literal(tags)}", + f'{EXT}: TypeAlias = {utils.spell_literal([".csv", ".json", ".tsv", ".arrow"])}', + ) + ruff.write_lint_format(output, contents) + + def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: return suffix in {".csv", ".json", ".tsv", ".arrow"} diff --git a/tools/datasets/_typing.py b/tools/datasets/_typing.py new file mode 100644 index 000000000..9414aaab4 --- /dev/null +++ b/tools/datasets/_typing.py @@ -0,0 +1,137 @@ +# The contents of this file are automatically written by +# tools/datasets.__init__.py. Do not modify directly. 
+ +from __future__ import annotations + +import sys +from typing import Literal + +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + + +__all__ = ["DatasetName", "Extension", "VersionTag"] + +DatasetName: TypeAlias = Literal[ + "airports", + "annual-precip", + "anscombe", + "barley", + "birdstrikes", + "budget", + "budgets", + "burtin", + "cars", + "climate", + "co2-concentration", + "countries", + "crimea", + "disasters", + "driving", + "earthquakes", + "flare", + "flare-dependencies", + "flights-10k", + "flights-200k", + "flights-20k", + "flights-2k", + "flights-3m", + "flights-5k", + "flights-airport", + "football", + "gapminder", + "gapminder-health-income", + "github", + "global-temp", + "graticule", + "income", + "iowa-electricity", + "iris", + "jobs", + "la-riots", + "londonBoroughs", + "londonCentroids", + "londonTubeLines", + "lookup_groups", + "lookup_people", + "miserables", + "monarchs", + "movies", + "normal-2d", + "obesity", + "ohlc", + "penguins", + "platformer-terrain", + "points", + "political-contributions", + "population", + "population_engineers_hurricanes", + "seattle-temps", + "seattle-weather", + "seattle-weather-hourly-normals", + "sf-temps", + "sp500", + "sp500-2000", + "stocks", + "udistrict", + "unemployment", + "unemployment-across-industries", + "uniform-2d", + "us-10m", + "us-employment", + "us-state-capitals", + "volcano", + "weather", + "weball26", + "wheat", + "windvectors", + "world-110m", + "zipcodes", +] +VersionTag: TypeAlias = Literal[ + "v2.9.0", + "v2.8.1", + "v2.8.0", + "v2.7.0", + "v2.5.4", + "v2.5.3", + "v2.5.3-next.0", + "v2.5.2", + "v2.5.2-next.0", + "v2.5.1", + "v2.5.1-next.0", + "v2.5.0", + "v2.5.0-next.0", + "v2.4.0", + "v2.3.1", + "v2.3.0", + "v2.1.0", + "v2.0.0", + "v1.31.1", + "v1.31.0", + "v1.30.4", + "v1.30.3", + "v1.30.2", + "v1.30.1", + "v1.29.0", + "v1.24.0", + "v1.22.0", + "v1.21.1", + "v1.21.0", + "v1.20.0", + "v1.19.0", + "v1.18.0", + "v1.17.0", + "v1.16.0", + "v1.15.0", + "v1.14.0", + "v1.12.0", + "v1.11.0", + "v1.10.0", + "v1.8.0", + "v1.7.0", + "v1.5.0", +] +Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"] From 572d069842ea80c085db22cf90aee7286e5a4bfd Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 18:02:42 +0000 Subject: [PATCH 037/201] refactor(typing): Utilize `datasets._typing` --- tools/datasets/__init__.py | 28 ++++++++++++---------------- tools/datasets/github.py | 4 ++-- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index bf5b7f187..a92aeb2fc 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -11,7 +11,7 @@ import tempfile from functools import cached_property, partial from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, get_args +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal from urllib.request import urlopen import polars as pl @@ -38,6 +38,7 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias + from tools.datasets._typing import DatasetName, Extension, VersionTag _PathAlias: TypeAlias = Literal["npm_tags", "gh_tags", "gh_trees"] @@ -144,11 +145,6 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago _CURRENT_SOURCE_TAG = "v2.9.0" -ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"] -""" -- 
`'flights-200k.(arrow|json)'` key collison using stem -""" - def generate_datasets_typing(application: Application, output: Path, /) -> None: app = application @@ -180,7 +176,7 @@ def generate_datasets_typing(application: Application, output: Path, /) -> None: ruff.write_lint_format(output, contents) -def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: +def is_ext_supported(suffix: str) -> TypeIs[Extension]: return suffix in {".csv", ".json", ".tsv", ".arrow"} @@ -193,7 +189,7 @@ def _js_to_py(s: str, /): class Dataset: - read_fn: ClassVar[dict[ExtSupported, Callable[..., pl.DataFrame]]] = { + read_fn: ClassVar[dict[Extension, Callable[..., pl.DataFrame]]] = { ".csv": pl.read_csv, ".json": pl.read_json, ".tsv": partial(pl.read_csv, separator="\t"), @@ -205,7 +201,7 @@ def __init__(self, name: str, /, base_url: str) -> None: file_name = DATASETS_JSON[_py_to_js(name)]["filename"] suffix = Path(file_name).suffix if is_ext_supported(suffix): - self.extension: ExtSupported = suffix + self.extension: Extension = suffix else: raise NotImplementedError(suffix, file_name) @@ -348,17 +344,17 @@ def __dir__(self) -> list[str]: def url( self, - name: str, - ext: ExtSupported | None = None, + name: DatasetName | LiteralString, + ext: Extension | None = None, /, - tag: LiteralString | Literal["latest"] | None = None, + tag: VersionTag | Literal["latest"] | None = None, ) -> str: constraints: dict[Literal["tag", "suffix"], str] = {} if tag == "latest": raise NotImplementedError(tag) elif tag is not None: constraints["tag"] = tag - if name.endswith(get_args(ExtSupported)): + if name.endswith((".csv", ".json", ".tsv", ".arrow")): name, suffix = name.rsplit(".", maxsplit=1) suffix = "." + suffix if not is_ext_supported(suffix): @@ -375,10 +371,10 @@ def url( def __call__( self, - name: str, - ext: ExtSupported | None = None, + name: DatasetName | LiteralString, + ext: Extension | None = None, /, - tag: LiteralString | Literal["latest"] | None = None, + tag: VersionTag | Literal["latest"] | None = None, ) -> WorkInProgress: """ **WIP** Will be using this *instead of* attribute access. 
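As a usage sketch of the typed signature introduced here (the tag is one entry from `VersionTag`, and both spellings should resolve to the same jsDelivr address):

    from tools.datasets import data

    data.url("cars", ".json", tag="v2.9.0")
    data.url("cars.json", tag="v2.9.0")
    # -> "https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json"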
diff --git a/tools/datasets/github.py b/tools/datasets/github.py index e245b91b1..fc0a899f2 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -31,7 +31,7 @@ from email.message import Message from urllib.request import OpenerDirector, Request - from tools.datasets import ExtSupported + from tools.datasets._typing import Extension from tools.datasets.models import ReParsedTag from tools.schemapi.utils import OneOrSeq @@ -62,7 +62,7 @@ _SUB_DIR = "data" -def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: +def is_ext_supported(suffix: str) -> TypeIs[Extension]: return suffix in {".csv", ".json", ".tsv", ".arrow"} From 07dcc0baaf955d10c65b68c65165c86bc2cb9ddb Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 20:16:07 +0000 Subject: [PATCH 038/201] feat: Adds `Npm.dataset` for remote reading] --- tools/datasets/__init__.py | 5 ++-- tools/datasets/npm.py | 55 +++++++++++++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index a92aeb2fc..b1a5b8550 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -375,7 +375,8 @@ def __call__( ext: Extension | None = None, /, tag: VersionTag | Literal["latest"] | None = None, - ) -> WorkInProgress: + **kwds: Any, + ) -> pl.DataFrame: """ **WIP** Will be using this *instead of* attribute access. @@ -383,7 +384,7 @@ def __call__( - Will only be using the actual (js_name) - Some have hyphens, others underscores """ - return self.url(name, ext, tag=tag) + return app.npm.dataset(self.url(name, ext, tag=tag), **kwds) data = DataLoader() diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index bdc20f83b..589db4660 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -2,7 +2,9 @@ import json import urllib.request -from typing import TYPE_CHECKING, Literal +from functools import partial +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal import polars as pl @@ -11,20 +13,43 @@ if TYPE_CHECKING: import sys - from pathlib import Path + from urllib.request import OpenerDirector + if sys.version_info >= (3, 13): + from typing import TypeIs + else: + from typing_extensions import TypeIs if sys.version_info >= (3, 11): from typing import LiteralString else: from typing_extensions import LiteralString + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + from tools.datasets._typing import Extension from tools.datasets.models import NpmPackageMetadataResponse + ReadFn: TypeAlias = Callable[..., pl.DataFrame] + __all__ = ["Npm"] +def is_ext_supported(suffix: str) -> TypeIs[Extension]: + return suffix in {".csv", ".json", ".tsv", ".arrow"} + + class Npm: """https://www.jsdelivr.com/docs/data.jsdelivr.com#overview.""" + _read_fn: ClassVar[dict[Extension, ReadFn]] = { + ".csv": pl.read_csv, + ".json": pl.read_json, + ".tsv": partial(pl.read_csv, separator="\t"), + ".arrow": partial(pl.read_ipc, use_pyarrow=True), + } + _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() + def __init__( self, output_dir: Path, @@ -48,6 +73,30 @@ def __init__( def url(self) -> NpmUrl: return self._url + @classmethod + def reader_from(cls, url: str, /) -> ReadFn: + suffix = Path(url).suffix + if is_ext_supported(suffix): + return cls._read_fn[suffix] + else: + msg = f"Unexpected file extension {suffix!r}, from:\n{url}" + raise NotImplementedError(msg) + 
+ def dataset(self, url: str, /, **kwds: Any) -> pl.DataFrame: + """ + Fetch a remote dataset. + + Parameters + ---------- + url + Full path to a known dataset. + **kwds + Arguments passed to the underlying read function. + """ + fn = self.reader_from(url) + with self._opener.open(url) as f: + return fn(f.read(), **kwds) + def tags(self) -> pl.DataFrame: """ Request, parse tags from `Get package metadata`_. @@ -66,7 +115,7 @@ def tags(self) -> pl.DataFrame: req = urllib.request.Request( self.url.TAGS, headers={"Accept": "application/json"} ) - with urllib.request.urlopen(req) as response: + with self._opener.open(req) as response: content: NpmPackageMetadataResponse = json.load(response) versions = [ f"v{tag}" From d8f37918b130d7f89defcb6f1104268db1997420 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 20:24:38 +0000 Subject: [PATCH 039/201] refactor: Remove dead code --- tools/datasets/__init__.py | 173 ++----------------------------------- 1 file changed, 6 insertions(+), 167 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index b1a5b8550..ab1af8d4b 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -8,11 +8,8 @@ from __future__ import annotations import json -import tempfile -from functools import cached_property, partial from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal -from urllib.request import urlopen +from typing import TYPE_CHECKING, Any, Literal import polars as pl @@ -180,167 +177,9 @@ def is_ext_supported(suffix: str) -> TypeIs[Extension]: return suffix in {".csv", ".json", ".tsv", ".arrow"} -def _py_to_js(s: str, /): - return s.replace("_", "-") - - -def _js_to_py(s: str, /): - return s.replace("-", "_") - - -class Dataset: - read_fn: ClassVar[dict[Extension, Callable[..., pl.DataFrame]]] = { - ".csv": pl.read_csv, - ".json": pl.read_json, - ".tsv": partial(pl.read_csv, separator="\t"), - ".arrow": partial(pl.read_ipc, use_pyarrow=True), - } - - def __init__(self, name: str, /, base_url: str) -> None: - self.name: str = name - file_name = DATASETS_JSON[_py_to_js(name)]["filename"] - suffix = Path(file_name).suffix - if is_ext_supported(suffix): - self.extension: Extension = suffix - else: - raise NotImplementedError(suffix, file_name) - - self.url: str = f"{base_url}{file_name}" - - def __call__(self, **kwds: Any) -> pl.DataFrame: - fn = self.read_fn[self.extension] - with tempfile.NamedTemporaryFile() as tmp, urlopen(self.url) as f: - tmp.write(f.read()) - content = fn(tmp, **kwds) - return content - - def __repr__(self) -> str: - return ( - f"{type(self).__name__}(\n " - f"name={self.name!r},\n " - f"url={self.url!r}\n" - ")" - ) - - -DATASETS_JSON = { - # "7zip": {"filename": "7zip.png", "format": "png"}, - "airports": {"filename": "airports.csv", "format": "csv"}, - "annual-precip": {"filename": "annual-precip.json", "format": "json"}, - "anscombe": {"filename": "anscombe.json", "format": "json"}, - "barley": {"filename": "barley.json", "format": "json"}, - "birdstrikes": {"filename": "birdstrikes.json", "format": "json"}, - "budget": {"filename": "budget.json", "format": "json"}, - "budgets": {"filename": "budgets.json", "format": "json"}, - "burtin": {"filename": "burtin.json", "format": "json"}, - "cars": {"filename": "cars.json", "format": "json"}, - "climate": {"filename": "climate.json", "format": "json"}, - "co2-concentration": {"filename": "co2-concentration.csv", "format": "csv"}, - "countries": 
{"filename": "countries.json", "format": "json"}, - "crimea": {"filename": "crimea.json", "format": "json"}, - "disasters": {"filename": "disasters.csv", "format": "csv"}, - "driving": {"filename": "driving.json", "format": "json"}, - "earthquakes": {"filename": "earthquakes.json", "format": "json"}, - # "ffox": {"filename": "ffox.png", "format": "png"}, - "flare": {"filename": "flare.json", "format": "json"}, - "flare-dependencies": {"filename": "flare-dependencies.json", "format": "json"}, - "flights-10k": {"filename": "flights-10k.json", "format": "json"}, - "flights-200k": {"filename": "flights-200k.json", "format": "json"}, - "flights-20k": {"filename": "flights-20k.json", "format": "json"}, - "flights-2k": {"filename": "flights-2k.json", "format": "json"}, - "flights-3m": {"filename": "flights-3m.csv", "format": "csv"}, - "flights-5k": {"filename": "flights-5k.json", "format": "json"}, - "flights-airport": {"filename": "flights-airport.csv", "format": "csv"}, - "gapminder": {"filename": "gapminder.json", "format": "json"}, - "gapminder-health-income": { - "filename": "gapminder-health-income.csv", - "format": "csv", - }, - # "gimp": {"filename": "gimp.png", "format": "png"}, - "github": {"filename": "github.csv", "format": "csv"}, - "graticule": {"filename": "graticule.json", "format": "json"}, - "income": {"filename": "income.json", "format": "json"}, - "iowa-electricity": {"filename": "iowa-electricity.csv", "format": "csv"}, - "iris": {"filename": "iris.json", "format": "json"}, - "jobs": {"filename": "jobs.json", "format": "json"}, - "la-riots": {"filename": "la-riots.csv", "format": "csv"}, - "londonBoroughs": {"filename": "londonBoroughs.json", "format": "json"}, - "londonCentroids": {"filename": "londonCentroids.json", "format": "json"}, - "londonTubeLines": {"filename": "londonTubeLines.json", "format": "json"}, - "lookup_groups": {"filename": "lookup_groups.csv", "format": "csv"}, - "lookup_people": {"filename": "lookup_people.csv", "format": "csv"}, - "miserables": {"filename": "miserables.json", "format": "json"}, - "monarchs": {"filename": "monarchs.json", "format": "json"}, - "movies": {"filename": "movies.json", "format": "json"}, - "normal-2d": {"filename": "normal-2d.json", "format": "json"}, - "obesity": {"filename": "obesity.json", "format": "json"}, - "ohlc": {"filename": "ohlc.json", "format": "json"}, - "points": {"filename": "points.json", "format": "json"}, - "population": {"filename": "population.json", "format": "json"}, - "population_engineers_hurricanes": { - "filename": "population_engineers_hurricanes.csv", - "format": "csv", - }, - "seattle-temps": {"filename": "seattle-temps.csv", "format": "csv"}, - "seattle-weather": {"filename": "seattle-weather.csv", "format": "csv"}, - "sf-temps": {"filename": "sf-temps.csv", "format": "csv"}, - "sp500": {"filename": "sp500.csv", "format": "csv"}, - "stocks": {"filename": "stocks.csv", "format": "csv"}, - "udistrict": {"filename": "udistrict.json", "format": "json"}, - "unemployment": {"filename": "unemployment.tsv", "format": "tsv"}, - "unemployment-across-industries": { - "filename": "unemployment-across-industries.json", - "format": "json", - }, - "uniform-2d": {"filename": "uniform-2d.json", "format": "json"}, - "us-10m": {"filename": "us-10m.json", "format": "json"}, - "us-employment": {"filename": "us-employment.csv", "format": "csv"}, - "us-state-capitals": {"filename": "us-state-capitals.json", "format": "json"}, - "volcano": {"filename": "volcano.json", "format": "json"}, - "weather": {"filename": 
"weather.json", "format": "json"}, - "weball26": {"filename": "weball26.json", "format": "json"}, - "wheat": {"filename": "wheat.json", "format": "json"}, - "windvectors": {"filename": "windvectors.csv", "format": "csv"}, - "world-110m": {"filename": "world-110m.json", "format": "json"}, - "zipcodes": {"filename": "zipcodes.csv", "format": "csv"}, -} -"""Inlined `datasets.json`_. - -- Excluding images - -.. _datasets.json: - https://github.com/altair-viz/vega_datasets/blob/136e850447b49031f04baa137ce5c37a6678bbb1/vega_datasets/datasets.json -""" - - class DataLoader: - source_tag: ClassVar[str] = "v2.9.0" - _base_url_fmt: str = "https://cdn.jsdelivr.net/npm/vega-datasets@{0}/data/" - - @property - def base_url(self) -> str: - return self._base_url_fmt.format(self.source_tag) - - @cached_property - def _dataset_names(self) -> list[str]: - return sorted(DATASETS_JSON) - - @cached_property - def _py_js_names(self) -> dict[str, str]: - return {_js_to_py(name): name for name in self._dataset_names} - - def list_datasets(self) -> list[str]: - return list(self._py_js_names) - - def __getattr__(self, name: str) -> Dataset: - if name in self._py_js_names: - return Dataset(self._py_js_names[name], self.base_url) - else: - msg = f"No dataset named {name!r}" - raise AttributeError(msg) - - def __dir__(self) -> list[str]: - return self.list_datasets() + def __init__(self, application: Application, /) -> None: + self._app: Application = application def url( self, @@ -367,7 +206,7 @@ def url( else: constraints["suffix"] = ext q = QueryTree(name_js=name, **constraints) # type: ignore[typeddict-item] - return app.github.query.url_from(**q) + return self._app.github.query.url_from(**q) def __call__( self, @@ -384,7 +223,7 @@ def __call__( - Will only be using the actual (js_name) - Some have hyphens, others underscores """ - return app.npm.dataset(self.url(name, ext, tag=tag), **kwds) + return self._app.npm.dataset(self.url(name, ext, tag=tag), **kwds) -data = DataLoader() +data = DataLoader(app) From 4642a238971edea66b4bd5f5e3636a287de2db96 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 11:26:34 +0000 Subject: [PATCH 040/201] refactor: Replace `name_js`, `name_py` with `dataset_name` Since we're just using strings, there is no need for 2 forms of the name. 
The legacy package needed this for `__getattr__` access with valid identifiers --- tools/datasets/__init__.py | 9 +++++---- tools/datasets/_metadata/metadata-schema.json | 3 +-- tools/datasets/_metadata/metadata.parquet | Bin 20768 -> 19087 bytes tools/datasets/github.py | 5 ++--- tools/datasets/models.py | 6 ++---- 5 files changed, 10 insertions(+), 13 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index ab1af8d4b..8217ab355 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -146,12 +146,13 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None def generate_datasets_typing(application: Application, output: Path, /) -> None: app = application tags = app.scan("gh_tags").select("tag").collect().to_series() + DATASET_NAME = "dataset_name" names = ( app.scan("gh_trees") .filter("ext_supported") - .unique("name_js") - .select("name_js") - .sort("name_js") + .unique(DATASET_NAME) + .select(DATASET_NAME) + .sort(DATASET_NAME) .collect() .to_series() ) @@ -205,7 +206,7 @@ def url( raise TypeError(ext) else: constraints["suffix"] = ext - q = QueryTree(name_js=name, **constraints) # type: ignore[typeddict-item] + q = QueryTree(dataset_name=name, **constraints) # type: ignore[typeddict-item] return self._app.github.query.url_from(**q) def __call__( diff --git a/tools/datasets/_metadata/metadata-schema.json b/tools/datasets/_metadata/metadata-schema.json index 2b5b9d955..d3da3f86d 100644 --- a/tools/datasets/_metadata/metadata-schema.json +++ b/tools/datasets/_metadata/metadata-schema.json @@ -1,9 +1,8 @@ { + "dataset_name": "str", "ext_supported": "bool", "file_name": "str", "name_collision": "bool", - "name_js": "str", - "name_py": "str", "size": "int", "suffix": "str", "tag": "str", diff --git a/tools/datasets/_metadata/metadata.parquet b/tools/datasets/_metadata/metadata.parquet index 071e4bd6cf68fcc17952c5057858fa29399c9415..97f235546beb0c56abede1cb419eab4afb89dd9c 100644 GIT binary patch delta 1026 zcmY*YUr3Wt6u;lsHnmnR^8b=EMyZU$h`A zfmROA%D`J=t`dG@yEKL5`Eu|zSN?JNX#WC3(=VTHlHcJb>FApGa6Eq zGrwuvR9^%194BxfohY=+H@vN>CqIgW^atT1$IrB-A85QBKYEmGYenfjTEiCpi8tL@ zp4P}7OmOw2VfQ6+tw~N~%_U@^X)C$gWJ+&uzQ+we&x6ZK1OQNLQKlHC_^kiB3_jbO z^~X){Gne(J>)}*>)}OP(WqH=W>V_{?au9*rbX5lk0)UPfXXsn zpgmBnHK7c&X#pS}3nq^($Mw_VGo*z|bv00^)Qq~uT;DF04XGB+^H4x@e>hkJ#P@3L z*kq|LQ@q+S0MJi~u}B#^fjC)3{MH?NGaANF4xQ=^#zS2RLom@VsL*{0{7S(FAf}FxM>d@})5eaJ z8IprZL1FOs^~DEhnb#6Cb%uvVAb#>_$x?~>-zOBbK&pt0ON#Qxw;rJLh;hS%Bt8b@ZW$xoG?sM^yHryd}k6 zwyKYjJ@x~}ZcJA<4r5$}n=rP8%@~`V*hwwh+o^*at&HSih61F)(U@;xe#m1>c#PEp z0Vg@=z=f@Au~2WQB~s}Pla$@O5f4zn;)?hN7){^8Xu&FX%omHAFg}!EY7VntBQw}5 zS*MkF9XfrTgMt-OgbI@=HYS2nsXAK67_IS9jZ~DHcY0}#UXW@9V$DgPyY6(rIGFHz z+Is^=6S1~87O?x8WB!i8C{-Q_bR1ik+RkeFJdp}dReNR7V(t(2xS~EBUQVU3F#n`@5T3{Awh1N1REd`OEQlui%ZB&u? z6QkH&WGMci5F`ugW>uz9r*rWGigV@t3stg%oN6PLw) zmPN0Nt}o`=d!Fud7Isks7oQGgZJ3diKPswu`25L|`B#0^;&ab#JnB5#(z4SO`Rv0A zxuE^iEsUlMU5Uo{*4wM5xUO{!an}bgJrRCR$dPqqi~GJdtW`TRyIyP#@^*>0G)-%` zb!4rypk<+Dq;GJ?5$V2&lN{w)mCbiXYhh2)opg*?yvr`SK4cqdRvtTXN5VNjv;Cu? 
zZiY9x@y~&F+4O*}*5-&6GFKhghWXhW*Jh6T?KW~*mrmFJ@;m+O#-{Hhip1vr`&a)y zc)xa7Tm86Zl%FIz96CAS(6TFgr;jYNQidBF!a{X{t(`k>$Hq*sYwr%q&4;dA_r)nM zhIFnY`~z=g7Yo<*`S)bpY#2N|swt^Nr(fV*oH%ZB4g?u+Vo$O2eM&Kb=q+* z4dFm=BzmFJ*S{I|v>yv%kfhxWBlQEoCiv$hQp2mj;YrIsRs}Zp?-#F8v|Z1fEAb~ z4f}tiS(GV>E;qMkHvxMKqw2p2gWoJZo9f;~1ffRba%in~fqstFv^?Kz_ZGtLQYE&YHi_@-$NMa_G zWJ69Q1&Ki`B^mHiCq-pJOyYJRU5Eh{LJb8-EKHVBmT%66#w$&#gyp$<-7-^Z@mMyb z)8jLis>iZbA?It^R4`i ParsedTree: path = Path(tree["path"]) return ParsedTree( file_name=path.name, - name_js=path.stem, - name_py=path.stem.replace("-", "_"), + dataset_name=path.stem, suffix=path.suffix, size=tree["size"], url=tree["url"], @@ -361,7 +360,7 @@ def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: pl.DataFrame(parsed) .lazy() .rename({"url": "url_github"}) - .with_columns(name_collision=pl.col("name_py").is_duplicated()) + .with_columns(name_collision=pl.col("dataset_name").is_duplicated()) .with_columns( url_npm=pl.concat_str( pl.lit(_NPM_BASE_URL), diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 5a6598fed..0271d09de 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -112,8 +112,7 @@ class NpmPackageMetadataResponse(TypedDict): class ParsedTree(TypedDict): file_name: str - name_js: str - name_py: str + dataset_name: str suffix: str size: int url: str @@ -123,8 +122,7 @@ class ParsedTree(TypedDict): class QueryTree(TypedDict, total=False): file_name: str - name_js: Required[str] - name_py: str + dataset_name: Required[str] suffix: str size: int url: str From 65f87fc2e99b49b781844993a6e45489ed648a65 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 11:28:20 +0000 Subject: [PATCH 041/201] fix: Remove invalid `semver.sort` op I think this was added in error, since the schema of the file never had `semver` columns Only noticed the bug when doing a full rebuild --- tools/datasets/github.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 33d7289af..9b6671646 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -397,7 +397,7 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: f"Finished collection.\n" f"Writing {fresh_rows.height} new rows to {fp!s}" ) - return pl.concat((trees, fresh_rows)).pipe(semver.sort) + return pl.concat((trees, fresh_rows)) def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: limit = self.rate_limit(strict=True) From 6349b0f255fab9df3173b5b75c660056317dfe82 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 13:08:04 +0000 Subject: [PATCH 042/201] fix: Add missing init path for `refresh_trees` --- tools/datasets/github.py | 81 ++++++++++++++++++++++++++++++---------- 1 file changed, 61 insertions(+), 20 deletions(-) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 9b6671646..cb9d74751 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -3,13 +3,14 @@ import json import os import random +import sys import time import urllib.request import warnings -from collections.abc import Iterable, Iterator, Sequence +from collections.abc import Iterable, Iterator, Mapping, Sequence from itertools import islice from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, ClassVar, Literal, cast +from typing import IO, TYPE_CHECKING, Any, ClassVar, Literal, TypeVar, cast import polars as pl @@ -23,16 +24,20 @@ 
ParsedRateLimit, ParsedTag, ParsedTree, + ReParsedTag, ) +if sys.version_info >= (3, 13): + from typing import is_typeddict +else: + from typing_extensions import is_typeddict + if TYPE_CHECKING: - import sys from collections.abc import MutableMapping from email.message import Message from urllib.request import OpenerDirector, Request from tools.datasets._typing import Extension - from tools.datasets.models import ReParsedTag from tools.schemapi.utils import OneOrSeq if sys.version_info >= (3, 13): @@ -50,8 +55,11 @@ _PathName: TypeAlias = Literal["dir", "tags", "trees"] + __all__ = ["GitHub"] +_TD = TypeVar("_TD", bound=Mapping[str, Any]) + _ItemSlice: TypeAlias = ( "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" ) @@ -379,25 +387,27 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: Aims to stay well-within API rate limits, both for authenticated ad unauthenticated users. """ + if gh_tags.is_empty(): + msg = f"Expected rows present in `gh_tags`, but got:\n{gh_tags!r}" + raise NotImplementedError(msg) rate_limit = self.rate_limit(strict=True) + stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT fp = self._paths["trees"] - trees = pl.read_parquet(fp) - missing_trees = gh_tags.join( - trees.select(pl.col("tag").unique()), on="tag", how="anti" - ) - if missing_trees.is_empty(): - print(f"Already up-to-date {fp!s}") - return trees + TP = ReParsedTag + if not fp.exists(): + print(f"Initializing {fp!s}") + return self._trees_batched(_iter_rows(gh_tags, stop, TP)) else: - stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT - it = islice(missing_trees.iter_rows(named=True), stop) - missing = cast("Iterator[ReParsedTag]", it) - fresh_rows = self._trees_batched(missing) - print( - f"Finished collection.\n" - f"Writing {fresh_rows.height} new rows to {fp!s}" + trees = pl.read_parquet(fp) + missing_trees = gh_tags.join( + trees.select(pl.col("tag").unique()), on="tag", how="anti" ) - return pl.concat((trees, fresh_rows)) + if missing_trees.is_empty(): + print(f"Already up-to-date {fp!s}") + return trees + else: + fresh = self._trees_batched(_iter_rows(missing_trees, stop, TP)) + return pl.concat((trees, fresh)) def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: limit = self.rate_limit(strict=True) @@ -451,4 +461,35 @@ def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: for tag in tags: time.sleep(delay_secs + random.triangular()) dfs.append(self.trees(tag)) - return pl.concat(dfs) + df = pl.concat(dfs) + print(f"Finished collection.\n" f"Found {df.height} new rows") + return df + + +def _iter_rows(df: pl.DataFrame, stop: int | None, /, tp: type[_TD]) -> Iterator[_TD]: + """ + Wraps `pl.DataFrame.iter_rows`_ with typing to preserve key completions. + + Parameters + ---------- + df + Target dataframe. + stop + Passed to `itertools.islice`_. + tp + Static type representing a row/record. + + .. note:: + Performs a **very basic** runtime check on the type of ``tp`` (*not* ``df``). + + Primarily used to override ``dict[str, Any]`` when a *narrower* type is known. + + .. _itertools.islice: + https://docs.python.org/3/library/itertools.html#itertools.islice + .. 
_pl.DataFrame.iter_rows: + https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.iter_rows.html + """ + if not TYPE_CHECKING: + assert is_typeddict(tp) or issubclass(tp, Mapping) + + return cast(Iterator[_TD], islice(df.iter_rows(named=True), stop)) From f1d610c528e81c12381114b2fafea13d53267bab Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 15:41:54 +0000 Subject: [PATCH 043/201] refactor: Move public interface to `_io` Temporary home, see module docstring --- tools/datasets/__init__.py | 47 ++-------- tools/datasets/_io.py | 178 +++++++++++++++++++++++++++++++++++++ tools/datasets/github.py | 42 --------- tools/datasets/models.py | 10 --- tools/datasets/npm.py | 49 +--------- 5 files changed, 188 insertions(+), 138 deletions(-) create mode 100644 tools/datasets/_io.py diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 8217ab355..3adc2321b 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -14,8 +14,8 @@ import polars as pl from tools.codemod import ruff +from tools.datasets._io import Reader from tools.datasets.github import GitHub -from tools.datasets.models import QueryTree from tools.datasets.npm import Npm from tools.schemapi import utils @@ -23,10 +23,6 @@ import sys from collections.abc import Mapping - if sys.version_info >= (3, 13): - from typing import TypeIs - else: - from typing_extensions import TypeIs if sys.version_info >= (3, 11): from typing import LiteralString else: @@ -174,13 +170,9 @@ def generate_datasets_typing(application: Application, output: Path, /) -> None: ruff.write_lint_format(output, contents) -def is_ext_supported(suffix: str) -> TypeIs[Extension]: - return suffix in {".csv", ".json", ".tsv", ".arrow"} - - class DataLoader: - def __init__(self, application: Application, /) -> None: - self._app: Application = application + def __init__(self, metadata: Path, /) -> None: + self._reader = Reader(metadata) def url( self, @@ -189,25 +181,8 @@ def url( /, tag: VersionTag | Literal["latest"] | None = None, ) -> str: - constraints: dict[Literal["tag", "suffix"], str] = {} - if tag == "latest": - raise NotImplementedError(tag) - elif tag is not None: - constraints["tag"] = tag - if name.endswith((".csv", ".json", ".tsv", ".arrow")): - name, suffix = name.rsplit(".", maxsplit=1) - suffix = "." + suffix - if not is_ext_supported(suffix): - raise TypeError(suffix) - else: - constraints["suffix"] = suffix - elif ext is not None: - if not is_ext_supported(ext): - raise TypeError(ext) - else: - constraints["suffix"] = ext - q = QueryTree(dataset_name=name, **constraints) # type: ignore[typeddict-item] - return self._app.github.query.url_from(**q) + """Return the address of a remote dataset.""" + return self._reader.url(name, ext, tag=tag) def __call__( self, @@ -217,14 +192,8 @@ def __call__( tag: VersionTag | Literal["latest"] | None = None, **kwds: Any, ) -> pl.DataFrame: - """ - **WIP** Will be using this *instead of* attribute access. 
- - - Original supports this as well - - Will only be using the actual (js_name) - - Some have hyphens, others underscores - """ - return self._app.npm.dataset(self.url(name, ext, tag=tag), **kwds) + """Get a remote dataset and load as tabular data.""" + return self._reader.dataset(self.url(name, ext, tag=tag), **kwds) -data = DataLoader(app) +data = DataLoader(app._from_alias("gh_trees")) diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py new file mode 100644 index 000000000..4a6dce431 --- /dev/null +++ b/tools/datasets/_io.py @@ -0,0 +1,178 @@ +""" +Will be part of the public ``alt.datasets`` subpackage. + +- Interfacing with the cached metadata. + - But not updating it +- Performing requests from those urls +- Dispatching read function on file extension + +Note +---- +- Building with ``polars`` first, then will work backwards with ``narwhals``. + - Since ``narwhals`` is a subset of ``polars`` +""" + +from __future__ import annotations + +import urllib.request +from functools import partial +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypeVar + +import polars as pl + +if TYPE_CHECKING: + import sys + from urllib.request import OpenerDirector + + from _typeshed import StrPath + + if sys.version_info >= (3, 13): + from typing import TypeIs + else: + from typing_extensions import TypeIs + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + from narwhals import typing as nw_typing # noqa: F401 + + from tools.datasets._typing import DatasetName, Extension, VersionTag + from tools.schemapi.utils import OneOrSeq + + _ExtensionScan: TypeAlias = Literal[".parquet"] + + ReadFn: TypeAlias = Callable[..., pl.DataFrame] + ScanFn: TypeAlias = Callable[..., pl.LazyFrame] + _T = TypeVar("_T") + +__all__ = ["Reader"] + +_ItemSlice: TypeAlias = ( + "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" +) +"""Query result scalar selection.""" + + +class Reader: + _read_fn: ClassVar[dict[Extension, ReadFn]] = { + ".csv": pl.read_csv, + ".json": pl.read_json, + ".tsv": partial(pl.read_csv, separator="\t"), + ".arrow": partial(pl.read_ipc, use_pyarrow=True), + } + _scan_fn: ClassVar[dict[_ExtensionScan, ScanFn]] = {".parquet": pl.scan_parquet} + _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() + + def __init__(self, fp_trees: Path, /) -> None: + self._fp_trees: Path = fp_trees + + @classmethod + def reader_from(cls, source: StrPath, /) -> ReadFn: + suffix = validate_suffix(source, is_ext_supported) + return cls._read_fn[suffix] + + @classmethod + def scanner_from(cls, source: StrPath, /) -> ScanFn: + suffix = validate_suffix(source, is_ext_scan) + return cls._scan_fn[suffix] + + def url( + self, + name: DatasetName | LiteralString, + ext: Extension | None = None, + /, + tag: VersionTag | Literal["latest"] | None = None, + ) -> str: + constraints: dict[str, str] = {} + if tag == "latest": + raise NotImplementedError(tag) + elif tag is not None: + constraints["tag"] = tag + # NOTE: Probably need to remove/move this + if name.endswith((".csv", ".json", ".tsv", ".arrow")): + name, suffix = name.rsplit(".", maxsplit=1) + suffix = "." 
+ suffix + if not is_ext_supported(suffix): + raise TypeError(suffix) + else: + constraints["suffix"] = suffix + elif ext is not None: + if not is_ext_supported(ext): + raise TypeError(ext) + else: + constraints["suffix"] = ext + return self._url_from(item=(0, "url_npm"), dataset_name=name, **constraints) + + def _url_from( + self, + *predicates: OneOrSeq[str | pl.Expr], + item: _ItemSlice = (0, "url_npm"), + **constraints: Any, + ) -> str: + r""" + Querying multi-version trees metadata for `npm` url to fetch. + + Parameters + ---------- + \*predicates, \*\*constraints + Passed directly to `pl.LazyFrame.filter`_. + item + Scalar selection args for `pl.DataFrame.item`_. + + .. _pl.LazyFrame.filter: + https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html + .. _pl.DataFrame.item: + https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.item.html + """ + source = self._fp_trees + fn = self.scanner_from(self._fp_trees) + results = fn(source).filter(*predicates, **constraints).collect() + if not results.is_empty(): + url = results.item(*item) + if isinstance(url, str): + return url + else: + msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." + raise TypeError(msg) + else: + terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) + msg = f"Found no results for:\n{terms}" + raise NotImplementedError(msg) + + def dataset(self, url: str, /, **kwds: Any) -> pl.DataFrame: + """ + Fetch a remote dataset. + + Parameters + ---------- + url + Full path to a known dataset. + **kwds + Arguments passed to the underlying read function. + """ + fn = self.reader_from(url) + with self._opener.open(url) as f: + return fn(f.read(), **kwds) + + +def validate_suffix(source: StrPath, guard: Callable[..., TypeIs[_T]], /) -> _T: + suffix: Any = Path(source).suffix + if guard(suffix): + return suffix + else: + msg = f"Unexpected file extension {suffix!r}, from:\n{source}" + raise TypeError(msg) + + +def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]: + return suffix == ".parquet" + + +def is_ext_supported(suffix: Any) -> TypeIs[Extension]: + return suffix in {".csv", ".json", ".tsv", ".arrow"} diff --git a/tools/datasets/github.py b/tools/datasets/github.py index cb9d74751..951221765 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -38,7 +38,6 @@ from urllib.request import OpenerDirector, Request from tools.datasets._typing import Extension - from tools.schemapi.utils import OneOrSeq if sys.version_info >= (3, 13): from typing import TypeIs @@ -60,10 +59,6 @@ _TD = TypeVar("_TD", bound=Mapping[str, Any]) -_ItemSlice: TypeAlias = ( - "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" -) -"""Query result scalar selection.""" # TODO: Work on where these should live/be accessed _NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" @@ -253,38 +248,6 @@ def tag_from_str(self, s: str, /) -> str: raise TypeError(s) -class _GitHubQueryNamespace: - """**WIP** Interfacing with the cached metadata.""" - - def __init__(self, gh: GitHub, /) -> None: - self._gh = gh - - @property - def paths(self) -> dict[_PathName, Path]: - return self._gh._paths - - def url_from( - self, - *predicates: OneOrSeq[str | pl.Expr], - item: _ItemSlice = (0, "url_npm"), - **constraints: Any, - ) -> str: - """Querying multi-version trees metadata for `npm` url to fetch.""" - fp = self.paths["trees"] - if fp.suffix != ".parquet": - raise NotImplementedError(fp.suffix) - items = pl.scan_parquet(fp).filter(*predicates, 
**constraints).collect() - if items.is_empty(): - msg = f"Found no results for:\n" f"{predicates!r}\n{constraints!r}" - raise NotImplementedError(msg) - r = items.item(*item) - if _is_str(r): - return r - else: - msg = f"Expected 'str' but got {type(r).__name__!r} from {r!r}." - raise TypeError(msg) - - class GitHub: """ Primary interface with the GitHub API. @@ -294,7 +257,6 @@ class GitHub: - Uses `tags`_, `trees`_, `rate_limit`_ endpoints. - Organizes distinct groups of operations into property accessor namespaces. - .. _tags: https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags .. _trees: @@ -339,10 +301,6 @@ def req(self) -> _GitHubRequestNamespace: def parse(self) -> _GitHubParseNamespace: return _GitHubParseNamespace(self) - @property - def query(self) -> _GitHubQueryNamespace: - return _GitHubQueryNamespace(self) - @property def url(self) -> GitHubUrl: return self._url diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 0271d09de..6ea7992ae 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -120,16 +120,6 @@ class ParsedTree(TypedDict): tag: str -class QueryTree(TypedDict, total=False): - file_name: str - dataset_name: Required[str] - suffix: str - size: int - url: str - ext_supported: bool - tag: str - - class ParsedTreesResponse(TypedDict): tag: str url: str diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index 589db4660..a5f068082 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -2,9 +2,7 @@ import json import urllib.request -from functools import partial -from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal +from typing import TYPE_CHECKING, ClassVar, Literal import polars as pl @@ -13,41 +11,22 @@ if TYPE_CHECKING: import sys + from pathlib import Path from urllib.request import OpenerDirector - if sys.version_info >= (3, 13): - from typing import TypeIs - else: - from typing_extensions import TypeIs if sys.version_info >= (3, 11): from typing import LiteralString else: from typing_extensions import LiteralString - if sys.version_info >= (3, 10): - from typing import TypeAlias - else: - from typing_extensions import TypeAlias - from tools.datasets._typing import Extension from tools.datasets.models import NpmPackageMetadataResponse - ReadFn: TypeAlias = Callable[..., pl.DataFrame] __all__ = ["Npm"] -def is_ext_supported(suffix: str) -> TypeIs[Extension]: - return suffix in {".csv", ".json", ".tsv", ".arrow"} - - class Npm: """https://www.jsdelivr.com/docs/data.jsdelivr.com#overview.""" - _read_fn: ClassVar[dict[Extension, ReadFn]] = { - ".csv": pl.read_csv, - ".json": pl.read_json, - ".tsv": partial(pl.read_csv, separator="\t"), - ".arrow": partial(pl.read_ipc, use_pyarrow=True), - } _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() def __init__( @@ -73,30 +52,6 @@ def __init__( def url(self) -> NpmUrl: return self._url - @classmethod - def reader_from(cls, url: str, /) -> ReadFn: - suffix = Path(url).suffix - if is_ext_supported(suffix): - return cls._read_fn[suffix] - else: - msg = f"Unexpected file extension {suffix!r}, from:\n{url}" - raise NotImplementedError(msg) - - def dataset(self, url: str, /, **kwds: Any) -> pl.DataFrame: - """ - Fetch a remote dataset. - - Parameters - ---------- - url - Full path to a known dataset. - **kwds - Arguments passed to the underlying read function. 
- """ - fn = self.reader_from(url) - with self._opener.open(url) as f: - return fn(f.read(), **kwds) - def tags(self) -> pl.DataFrame: """ Request, parse tags from `Get package metadata`_. From c4ef112e0d21872807126c51a62cd144d535dccc Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 15:43:16 +0000 Subject: [PATCH 044/201] refactor(perf): Don't recreate path mapping on every attribute access --- tools/datasets/__init__.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 3adc2321b..47575278c 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -8,6 +8,7 @@ from __future__ import annotations import json +import types from pathlib import Path from typing import TYPE_CHECKING, Any, Literal @@ -72,6 +73,13 @@ def __init__( output_dir, name_tags=tags_gh, name_trees=trees_gh, **kwds_gh ) self._npm: Npm = Npm(output_dir, name_tags=tags_npm, **kwds_npm) + self._paths = types.MappingProxyType["_PathAlias", Path]( + { + "npm_tags": self.npm._paths["tags"], + "gh_tags": self.github._paths["tags"], + "gh_trees": self.github._paths["trees"], + } + ) @property def github(self) -> GitHub: @@ -81,23 +89,15 @@ def github(self) -> GitHub: def npm(self) -> Npm: return self._npm - @property - def _aliases(self) -> dict[_PathAlias, Path]: - return { - "npm_tags": self.npm._paths["tags"], - "gh_tags": self.github._paths["tags"], - "gh_trees": self.github._paths["trees"], - } - def refresh(self) -> pl.DataFrame: npm_tags = self.npm.tags() - self.write_parquet(npm_tags, self.npm._paths["tags"]) + self.write_parquet(npm_tags, self._paths["npm_tags"]) gh_tags = self.github.refresh_tags(npm_tags) - self.write_parquet(gh_tags, self.github._paths["tags"]) + self.write_parquet(gh_tags, self._paths["gh_tags"]) gh_trees = self.github.refresh_trees(gh_tags) - self.write_parquet(gh_trees, self.github._paths["trees"]) + self.write_parquet(gh_trees, self._paths["gh_trees"]) return gh_trees def read(self, name: _PathAlias, /) -> pl.DataFrame: @@ -113,7 +113,7 @@ def _from_alias(self, name: _PathAlias, /) -> Path: msg = f'Expected one of {["npm_tags", "gh_tags", "gh_trees"]!r}, but got: {name!r}' raise TypeError(msg) else: - return self._aliases[name] + return self._paths[name] def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: """Write ``frame`` to ``fp``, with some extra safety.""" From eb876ebc945776b2f7524ad6e7774347dd7d45ac Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 16:58:30 +0000 Subject: [PATCH 045/201] refactor: Split `Reader._url_from` into `url`, `_query` - Much more generic now in what it can be used for - For the caching, I'll need more columns than just `"url_npm"` - `"url_github" contains a hash --- tools/datasets/_io.py | 89 ++++++++++++++++++++++------------------ tools/datasets/models.py | 14 +++++++ 2 files changed, 62 insertions(+), 41 deletions(-) diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index 4a6dce431..812a9eeb0 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -28,14 +28,13 @@ from _typeshed import StrPath if sys.version_info >= (3, 13): - from typing import TypeIs + from typing import TypeIs, Unpack else: - from typing_extensions import TypeIs + from typing_extensions import TypeIs, Unpack if sys.version_info >= (3, 11): from typing import LiteralString else: from 
typing_extensions import LiteralString - if sys.version_info >= (3, 10): from typing import TypeAlias else: @@ -43,6 +42,7 @@ from narwhals import typing as nw_typing # noqa: F401 from tools.datasets._typing import DatasetName, Extension, VersionTag + from tools.datasets.models import Metadata from tools.schemapi.utils import OneOrSeq _ExtensionScan: TypeAlias = Literal[".parquet"] @@ -56,7 +56,12 @@ _ItemSlice: TypeAlias = ( "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" ) -"""Query result scalar selection.""" +""" +Scalar selection args for `pl.DataFrame.item`_. + +.. _pl.DataFrame.item: + https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.item.html +""" class Reader: @@ -89,57 +94,34 @@ def url( /, tag: VersionTag | Literal["latest"] | None = None, ) -> str: - constraints: dict[str, str] = {} - if tag == "latest": - raise NotImplementedError(tag) - elif tag is not None: - constraints["tag"] = tag - # NOTE: Probably need to remove/move this - if name.endswith((".csv", ".json", ".tsv", ".arrow")): - name, suffix = name.rsplit(".", maxsplit=1) - suffix = "." + suffix - if not is_ext_supported(suffix): - raise TypeError(suffix) - else: - constraints["suffix"] = suffix - elif ext is not None: - if not is_ext_supported(ext): - raise TypeError(ext) - else: - constraints["suffix"] = ext - return self._url_from(item=(0, "url_npm"), dataset_name=name, **constraints) - - def _url_from( - self, - *predicates: OneOrSeq[str | pl.Expr], - item: _ItemSlice = (0, "url_npm"), - **constraints: Any, - ) -> str: + df = self._query(**validate_constraints(name, ext, tag)) + item: _ItemSlice = (0, "url_npm") + url = df.item(*item) + if isinstance(url, str): + return url + else: + msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." + raise TypeError(msg) + + def _query( + self, *predicates: OneOrSeq[str | pl.Expr], **constraints: Unpack[Metadata] + ) -> pl.DataFrame: r""" - Querying multi-version trees metadata for `npm` url to fetch. + Query multi-version trees metadata. Parameters ---------- \*predicates, \*\*constraints Passed directly to `pl.LazyFrame.filter`_. - item - Scalar selection args for `pl.DataFrame.item`_. .. _pl.LazyFrame.filter: https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html - .. _pl.DataFrame.item: - https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.item.html """ source = self._fp_trees fn = self.scanner_from(self._fp_trees) results = fn(source).filter(*predicates, **constraints).collect() if not results.is_empty(): - url = results.item(*item) - if isinstance(url, str): - return url - else: - msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." 
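A small sketch of what `validate_constraints` (added further down in this diff) is expected to hand to `_query`; the argument values are illustrative only:

    validate_constraints("cars.json", None, "v2.9.0")
    # -> {"tag": "v2.9.0", "dataset_name": "cars", "suffix": ".json"}
    validate_constraints("cars", ".json", None)
    # -> {"suffix": ".json", "dataset_name": "cars"}
    validate_constraints("cars", None, "latest")
    # -> raises NotImplementedError (not supported yet)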
- raise TypeError(msg) + return results else: terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) msg = f"Found no results for:\n{terms}" @@ -161,6 +143,31 @@ def dataset(self, url: str, /, **kwds: Any) -> pl.DataFrame: return fn(f.read(), **kwds) +def validate_constraints( + name: DatasetName | LiteralString, + ext: Extension | None, + tag: VersionTag | Literal["latest"] | None, + /, +) -> Metadata: + constraints: Metadata = {} + if tag == "latest": + raise NotImplementedError(tag) + elif tag is not None: + constraints["tag"] = tag + if name.endswith((".csv", ".json", ".tsv", ".arrow")): + fp = Path(name) + constraints["dataset_name"] = fp.stem + constraints["suffix"] = fp.suffix + return constraints + elif ext is not None: + if not is_ext_supported(ext): + raise TypeError(ext) + else: + constraints["suffix"] = ext + constraints["dataset_name"] = name + return constraints + + def validate_suffix(source: StrPath, guard: Callable[..., TypeIs[_T]], /) -> _T: suffix: Any = Path(source).suffix if guard(suffix): diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 6ea7992ae..fa0972035 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -126,6 +126,20 @@ class ParsedTreesResponse(TypedDict): tree: list[ParsedTree] +class Metadata(TypedDict, total=False): + """Full schema for `metadata.parquet`.""" + + dataset_name: str + ext_supported: bool + file_name: str + name_collision: bool + size: int + suffix: str + tag: str + url_github: str + url_npm: str + + class GitHubRateLimit(TypedDict): limit: int used: int From 661a3851034c39c1c8249a7426ae33821f802f14 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 17:01:41 +0000 Subject: [PATCH 046/201] feat(DRAFT): Adds `GitHubUrl.BLOBS` - Common prefix to all rows in `metadata[url_github]` - Stripping this leaves only `sha` - For **2800** rows, there are only **109** unique hashes, so these can be used to reduce cache size --- tools/datasets/github.py | 1 + tools/datasets/models.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 951221765..4f15140e3 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -287,6 +287,7 @@ def __init__( repo = f"{base_url}repos/{org}/{package}/" self._url = GitHubUrl( BASE=base_url, + BLOBS=f"{repo}git/blobs/", RATE=f"{base_url}rate_limit", REPO=repo, TAGS=f"{repo}tags", diff --git a/tools/datasets/models.py b/tools/datasets/models.py index fa0972035..2bca343aa 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -21,6 +21,7 @@ class GitHubUrl(NamedTuple): BASE: LiteralString + BLOBS: LiteralString RATE: LiteralString REPO: LiteralString TAGS: LiteralString From 22dcb17868246c0d79796e3e65c1419442c11c61 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 18:31:36 +0000 Subject: [PATCH 047/201] feat: Store `sha` instead of `github_url` Related 661a3851034c39c1c8249a7426ae33821f802f14 --- tools/datasets/_io.py | 13 +------------ tools/datasets/_metadata/metadata-schema.json | 2 +- tools/datasets/_metadata/metadata.parquet | Bin 19087 -> 18495 bytes tools/datasets/github.py | 3 +-- tools/datasets/models.py | 4 ++-- 5 files changed, 5 insertions(+), 17 deletions(-) diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index 812a9eeb0..e27bbcb7a 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -53,16 +53,6 @@ __all__ = ["Reader"] 
-_ItemSlice: TypeAlias = ( - "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" -) -""" -Scalar selection args for `pl.DataFrame.item`_. - -.. _pl.DataFrame.item: - https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.item.html -""" - class Reader: _read_fn: ClassVar[dict[Extension, ReadFn]] = { @@ -95,8 +85,7 @@ def url( tag: VersionTag | Literal["latest"] | None = None, ) -> str: df = self._query(**validate_constraints(name, ext, tag)) - item: _ItemSlice = (0, "url_npm") - url = df.item(*item) + url = df.item(0, "url_npm") if isinstance(url, str): return url else: diff --git a/tools/datasets/_metadata/metadata-schema.json b/tools/datasets/_metadata/metadata-schema.json index d3da3f86d..53d9978b3 100644 --- a/tools/datasets/_metadata/metadata-schema.json +++ b/tools/datasets/_metadata/metadata-schema.json @@ -3,9 +3,9 @@ "ext_supported": "bool", "file_name": "str", "name_collision": "bool", + "sha": "str", "size": "int", "suffix": "str", "tag": "str", - "url_github": "str", "url_npm": "str" } \ No newline at end of file diff --git a/tools/datasets/_metadata/metadata.parquet b/tools/datasets/_metadata/metadata.parquet index 97f235546beb0c56abede1cb419eab4afb89dd9c..8bf0e17e3673d2b7cfbbe1ddba345f492d12e674 100644 GIT binary patch delta 3023 zcmaJ?c{G%L8=kT6>&zI8H)NNwO^qc@GnTPNwm~I>$&!6gCJZ8BGV<6$XsGBFvZX>V zs;`tnNq9pfiiqr6-!o3%Io~%Ol0x$i$7W-jz`7PPKfn7(ceuT-*y z4@2P)2t=((d&jF)+{F^IDLSMmQg@S6iH^Loe>S3vEbu{%LmhnbQ`4l{)@X~!komO+ZxXUC_y85Ttzsl0PGf?7{Z*n09tbLisw^AAVzU~i8w{3=+nXNO~7MP?N0--@WD z7eDpN(P{>akmu+^kOcW=Dp?!xND}?P<$!Zk`>r-mVcAh#?XK?<40V#ySdT}(r;=^! znxVl@eaVhH+P*FNjP8%*}84eo7#a&m;Gsop2 zBNYQ9$So$v9XrjZa=zj=eLmK{sJG-BlQ`q1Y^7Q}@RCT^2%u=n9(#Rhc(o!A>*PhM`8NwHxCnS@fo55ti8zv1SR2T)S)3NF+dKNQFezqU$ zPKa@HS9|9PvRF{%+4ZgpXqJm1LI}b3;r~XVZ$dtmpS&{hr3QA+$eyeabh9Oz7T^uNE6%fy1Y|T6Ou2_FF zFHCLZj*CL!yCGvFqW-G5sg?6LJ*!^ez{^l|2?$S^NwZWWJX5mF}-R*tSnmqgOb5^IjLJHw9a9 zY?5?q$VC?yxv~7Cz%2geAcl#-pIKoYQQ|GNVWD?(mDU)1Zab1Yvyu%OJp-Q8iP)m8 zr0o6&gbPo3b1$c3rvR@STgfKeH-jgs@8u4lVFIGNvFj@O0-SmQe+>V863;<v40DUge%<`JLeyBl^*`Kw}ug#Xn$a)Bgv5Ddt7_dQh!aW z*2kSLr>9?~E;^~Nth#U)8|!5mR~(iBFRqX!bp#%FWbp1-1f9P8E}x=1c>nm3nG5Nj zi(L!x)Z6s@1DXRyIeoZ$R$(1OOwq%#U*yBv6~bj7HYuW)yGapzaJlyV$CmBBk1pRJ z4$)cJisNBnmMRZoewGmY`W%unvSnaRi24nEqd!Sjw%GT*!^)zv#<44Yo7Al< zXl*nX1-+*MBkkwuC#;pqs(nH;le6M-TU6-rl(5O0^KBn-*B9gkq+~8dyBczxTK%EW zIknM!f60MqV?VW@N(sz<5O#_-$}xSdbo4PL5VLZ~y+sH8WV=d2XIbo?u?3@CT(C;i z-uab;en_Xp+Dor-7$rAz!I-4!(%~Z1cU!7JCQ)-Jy}zc*i}mYN*d{B6EL=433x@EV z6Rc{eAHSgyJs{@)92*puS^LPQvDSd;W3r!KR@_rj(j;`u@xpq>rx9H7$~wthi~s72 zTi0A-M9yJ!_a(2D!0z%7 zU{k6$0+$BA$7%S&2Udc`j8hfeux^F46I8BJ%A}NCHHj1HuL2gNQ+of(``A#Ra-zlade! 
zM@?gg8yJyKNO12|1poOECh$5hBK(CTIcz2s?H1x>d;gsT} z`jg>8h&fF}GZit8QRzQ)_z*{RPyvuz)PVc z{D`H!XzwPlXlkI7p2#lD+uDQFpGCoiqz^)P#H9B`{%KU260E(_7O)iDmCAM^6a@@R z$oyw`;)sWP`PFSG{0Ky4E)5FAo5`ty(_{~k3mthr63K%mM?{34gFq1dAXWy^xC4dj zK~(NAp}=2eGMfKq;2?Z%5B>#&8$eXb)<9)doZSEJAk@Mfdnt*{T(S#=S8zZO6E{%c z_Eq>09~Pi6_VkfLct=Njk`dL%pAt+)F7s%!4@L)=wJ=lg!TaIyx_EOu-WpH9;|W&y z<9NIo+qc7m#}p4Z5KH;Z*tRdxA_Q==BGBDIhd}csc?6T)z-oIMNN0dyL#)^r0gQn+ zn;4{V!h?+9+Y!$W{H6sZbl6RRTHpb*d!=XD9e@<OF5>2(N z*+7rrAgYI}VU#;bcTeu0=n!V?)`+ecUkgK;uN&cXnoIAE%Y$g4qm*GNf(lA|3HsKX_v`)e-MiLa`|NL@?|%E7ebzl)X&kAU9Cg*a4b~#2 zd!SD>KpsO(Lxb1^2lT2%L*^ke2QS9tn&eQA%{A!uYi0l7jrwc4%?Gnwba8{slMA1ggw>foKEb*Tu@a{v|pmyW^5^ zi|;!kGAy`fiG5^3O(sD#{$R0Jopp`RpRG~j{U@9Jb4a0K_&t;HWa7eshny02Wz16} z2_La-vZ$M@D}0x31j&UviDsWtA4Sd;JFoCf-F$h5r+8`RPO2RoP;$UFFd8cnWx+K> z-*2(RP1Tv%@p7@lR`cYTfx*d~qjUP&l!*^befIL^XSI2R*6NS*cg!vIK94O^Wo*ab zzjO#F7T5=*!Aox>+joixg=%_yuV(JD3pS~GQ~zvE&81({EODkqzh@E}QOH2coKrdB zUT4$VeN9(cN_)ubbBg6irPgD4gRiw~>bdYS`etRGo@8OAyNrn!)+B)%(VgJg$zN0T zwwZn&p@FxagmX{&r{)aZwuHBtpda-o*}iQ{ySCgy#-Ofcyzf{uct%~HRXZu29$jKn ze&rZnOT2RXeAc$)>GUs}5uN6YJiR7|V@ordU29TrzUTr9`Y=>So7wMS{5GisWqhTE$h3a>h`T zm9z77yxT~s6pUFAm+OS~Ii8W>XQSozJ#~w;>Um2+=x$3i#?kC4r~8-X%jT4^!5FaN zTC;AY6t5$p}6dgFf~_V&QZ+=Yk0zVx#=L<+{$*?hn)Uo~Gt2 z!lO2aUCO8OM_Qr{#~g2Rjg$_J_td2&C!x)YZLg3kCk}r6vbyBmN$!9DVsIS`NNOfM zvAS@{S7m|8jcy&Z8YtaZ?~;?l-{LKKvV9sU;*HPTSTOvkR^}q#7He@-{`SJAfK8-N z+_O~C&M2jJ@u2-(gvT`r->y9@k9{DyEZZEy=EP{G~G+gu?d~a zjEJbtr4-K9RD_Pdc5!F+4O(_aV4d}7LuwnJ8j|i&jAyg<9kOqR%3AKbZ^n21fL5)! zY|Pub^pm>zS3PQ^uSA4phmNdFm?iwTA8;l;a(n z9IaT`pU%W@rhQF!hzzU!!#iTn<`c``l$KQ8Eemc*1>U+!!yV&HY#z)M@wqgwnLfOz z-PqpW>cKd1zolCS`Pro>U;b7F>U?R=de$HH)m?l!InOg1#C-^;&jN8%oL!eI|sXl~^%3p)he%y}YvS93Oc zynJMnCNQ~t?&Ha@iU-%rmoMKYs1G=`D(U)0t~@%v zI6+X)4bM|>LZfI~_MGVOxKUAw`^hBJ^P$DGtZe34;jC6?*sgM1Rp7aa+muSK8d{uQ zNB48M(|}_}uqaG;b1~`}>M^(HX!TZkor|++LFOKQhx)1*eZx-bT)?@vr;5+_g<%sf zAF&=fWBY^xld06*=v$gUI2687Rh$*w@A~YKT&Fi@MkPZ&@mtU1<)!?wx#8PJ2l0Vn zpLQd$3)d;7sq^`iz_t^-9@?m{w0iBy^pkxC4%2f?&~cv|v)`wbROouNg)`+^Q~xwX>Fx8vph zutPPqxmUVM1@1b^x2?XqH|cagn=*9+Ng|>oNVnj5yjtpCnw>*htLy$8HrX7H4$D(< zePXIguP+DnnOsulV*xqTkuW2qo*=Qvy{vL9Hc#w=p;`Lu>{s-(RAPEAR$*>B@#gf$ zyS3+vV`fw64~%VG&hwEmIt!84i8=??dNuX-tI}az9+sJ1o7j=BM}t#sOPWgGga}XM zh~9g$MoLz!8R8N=-DW7R;9*Bn9D*k}x)HzVG-kG@P~vWcS$@l-MZQwI+u-;3;RQ(p z$-uNb1K|$Z{N+i@$w?_(GV_@>xW`LkB1&f3elD$6wUQsnV|=V%!BlhlaNX-A{|}aJ ztu&Eu+EbicC4D5F{;8ItBMy|y7xu{ehjO%kjvm)76Xk;%Q4g0+(kXGrbkdJ5buv24tSRB+k zbWEQ=4ezD*r-dCA%SocX@}(O4(~jV#;dKxjauUtzGIH%v(o zxwI2~!-^h)NN1^H!G}D0oXEI1cJ9_TC~lz1U$is0cM-}?X*>ol8i@S34ez&U-GQc9 zkRC+3)DDZ_!Kg!E0vI$bAn42)kI)|*oh5{f*(-h3g1yhXl>$Qp(J&>dzrSw?1cDsi zVJMA+A_x%acO)>}aQ}~MHIj*(*}-12FinW`o68*kq5ns1|KDEvNAH8^Bci|DR>H`O z@$98ciGm`^c0r_vHaM{0u&(HjPD|EAX0zb^$i_=hL@5-4tcww4oeXI>GJS<(AA8f$ zJbG&J5D;k$ z;C3*&s-ri>+e-}qi~`saj ParsedTree: dataset_name=path.stem, suffix=path.suffix, size=tree["size"], - url=tree["url"], + sha=tree["sha"], ext_supported=is_ext_supported(path.suffix), tag=tag, ) @@ -326,7 +326,6 @@ def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: df = ( pl.DataFrame(parsed) .lazy() - .rename({"url": "url_github"}) .with_columns(name_collision=pl.col("dataset_name").is_duplicated()) .with_columns( url_npm=pl.concat_str( diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 2bca343aa..556aafa1a 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -116,7 +116,7 @@ class ParsedTree(TypedDict): dataset_name: str suffix: str size: int - url: str + sha: str ext_supported: bool tag: str @@ -134,10 +134,10 @@ class Metadata(TypedDict, 
total=False): ext_supported: bool file_name: str name_collision: bool + sha: str size: int suffix: str tag: str - url_github: str url_npm: str From 669df027cef9d857f2207c77279281a8a42a03d6 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 19:07:54 +0000 Subject: [PATCH 048/201] feat(perf): Adds caching to `ALTAIR_DATASETS_DIR` --- tools/datasets/__init__.py | 2 +- tools/datasets/_io.py | 51 ++++++++++++++++++++++++++++++++------ 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 47575278c..de98cd281 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -193,7 +193,7 @@ def __call__( **kwds: Any, ) -> pl.DataFrame: """Get a remote dataset and load as tabular data.""" - return self._reader.dataset(self.url(name, ext, tag=tag), **kwds) + return self._reader.dataset(name, ext, tag=tag, **kwds) data = DataLoader(app._from_alias("gh_trees")) diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index e27bbcb7a..228bb9ce1 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -14,10 +14,11 @@ from __future__ import annotations +import os import urllib.request from functools import partial from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypeVar +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypeVar, cast import polars as pl @@ -63,10 +64,25 @@ class Reader: } _scan_fn: ClassVar[dict[_ExtensionScan, ScanFn]] = {".parquet": pl.scan_parquet} _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() + _ENV_VAR: LiteralString = "ALTAIR_DATASETS_DIR" def __init__(self, fp_trees: Path, /) -> None: self._fp_trees: Path = fp_trees + @property + def _datasets_dir(self) -> Path | None: # type: ignore[return] + """ + Returns path to datasets cache, if possible. + + Requires opt-in via environment variable:: + + Reader._ENV_VAR + """ + if _dir := os.environ.get(self._ENV_VAR): + datasets_dir = Path(_dir) + datasets_dir.mkdir(exist_ok=True) + return datasets_dir + @classmethod def reader_from(cls, source: StrPath, /) -> ReadFn: suffix = validate_suffix(source, is_ext_supported) @@ -116,20 +132,41 @@ def _query( msg = f"Found no results for:\n{terms}" raise NotImplementedError(msg) - def dataset(self, url: str, /, **kwds: Any) -> pl.DataFrame: + def dataset( + self, + name: DatasetName | LiteralString, + ext: Extension | None = None, + /, + tag: VersionTag | Literal["latest"] | None = None, + **kwds: Any, + ) -> pl.DataFrame: """ - Fetch a remote dataset. + Fetch a remote dataset, attempt caching if possible. Parameters ---------- - url - Full path to a known dataset. + name, ext, tag + TODO **kwds Arguments passed to the underlying read function. 
""" + df = self._query(**validate_constraints(name, ext, tag)) + result = cast("Metadata", df.row(0, named=True)) + url = result["url_npm"] fn = self.reader_from(url) - with self._opener.open(url) as f: - return fn(f.read(), **kwds) + + if cache := self._datasets_dir: + fp = cache / (result["sha"] + result["suffix"]) + if fp.exists(): + return fn(fp, **kwds) + else: + fp.touch() + with self._opener.open(url) as f: + fp.write_bytes(f.read()) + return fn(fp, **kwds) + else: + with self._opener.open(url) as f: + return fn(f.read(), **kwds) def validate_constraints( From 20514100497595b52bd14e55dec0b139b4d1578a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 23:01:06 +0000 Subject: [PATCH 049/201] feat(DRAFT): Adds initial generic backends --- tools/datasets/__init__.py | 4 +- tools/datasets/_io.py | 200 +++++++++++++++++++++++++++---------- 2 files changed, 151 insertions(+), 53 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index de98cd281..96932b9af 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -15,7 +15,7 @@ import polars as pl from tools.codemod import ruff -from tools.datasets._io import Reader +from tools.datasets._io import get_backend from tools.datasets.github import GitHub from tools.datasets.npm import Npm from tools.schemapi import utils @@ -172,7 +172,7 @@ def generate_datasets_typing(application: Application, output: Path, /) -> None: class DataLoader: def __init__(self, metadata: Path, /) -> None: - self._reader = Reader(metadata) + self._reader = get_backend("polars")(metadata) def url( self, diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index 228bb9ce1..2074def12 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -17,10 +17,25 @@ import os import urllib.request from functools import partial +from itertools import chain, islice from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypeVar, cast - +from typing import ( + TYPE_CHECKING, + Any, + Callable, + ClassVar, + Generic, + Literal, + Protocol, + TypeVar, + cast, + overload, +) + +import narwhals.stable.v1 as nw +import pandas as pd import polars as pl +from narwhals.typing import IntoDataFrameT, IntoExpr, IntoFrameT if TYPE_CHECKING: import sys @@ -40,34 +55,30 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias - from narwhals import typing as nw_typing # noqa: F401 from tools.datasets._typing import DatasetName, Extension, VersionTag from tools.datasets.models import Metadata from tools.schemapi.utils import OneOrSeq _ExtensionScan: TypeAlias = Literal[".parquet"] - - ReadFn: TypeAlias = Callable[..., pl.DataFrame] - ScanFn: TypeAlias = Callable[..., pl.LazyFrame] _T = TypeVar("_T") -__all__ = ["Reader"] +__all__ = ["get_backend"] -class Reader: - _read_fn: ClassVar[dict[Extension, ReadFn]] = { - ".csv": pl.read_csv, - ".json": pl.read_json, - ".tsv": partial(pl.read_csv, separator="\t"), - ".arrow": partial(pl.read_ipc, use_pyarrow=True), - } - _scan_fn: ClassVar[dict[_ExtensionScan, ScanFn]] = {".parquet": pl.scan_parquet} - _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() - _ENV_VAR: LiteralString = "ALTAIR_DATASETS_DIR" - def __init__(self, fp_trees: Path, /) -> None: - self._fp_trees: Path = fp_trees +class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): + """ + Common functionality between backends. 
+ + Trying to use ``narwhals`` as much as possible + """ + + _read_fn: dict[Extension, Callable[..., IntoDataFrameT]] + _scan_fn: dict[_ExtensionScan, Callable[..., IntoFrameT]] + _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() + _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" + _metadata: Path @property def _datasets_dir(self) -> Path | None: # type: ignore[return] @@ -83,15 +94,13 @@ def _datasets_dir(self) -> Path | None: # type: ignore[return] datasets_dir.mkdir(exist_ok=True) return datasets_dir - @classmethod - def reader_from(cls, source: StrPath, /) -> ReadFn: + def reader_from(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: suffix = validate_suffix(source, is_ext_supported) - return cls._read_fn[suffix] + return self._read_fn[suffix] - @classmethod - def scanner_from(cls, source: StrPath, /) -> ScanFn: + def scanner_from(self, source: StrPath, /) -> Callable[..., IntoFrameT]: suffix = validate_suffix(source, is_ext_scan) - return cls._scan_fn[suffix] + return self._scan_fn[suffix] def url( self, @@ -108,30 +117,6 @@ def url( msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." raise TypeError(msg) - def _query( - self, *predicates: OneOrSeq[str | pl.Expr], **constraints: Unpack[Metadata] - ) -> pl.DataFrame: - r""" - Query multi-version trees metadata. - - Parameters - ---------- - \*predicates, \*\*constraints - Passed directly to `pl.LazyFrame.filter`_. - - .. _pl.LazyFrame.filter: - https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html - """ - source = self._fp_trees - fn = self.scanner_from(self._fp_trees) - results = fn(source).filter(*predicates, **constraints).collect() - if not results.is_empty(): - return results - else: - terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) - msg = f"Found no results for:\n{terms}" - raise NotImplementedError(msg) - def dataset( self, name: DatasetName | LiteralString, @@ -139,7 +124,7 @@ def dataset( /, tag: VersionTag | Literal["latest"] | None = None, **kwds: Any, - ) -> pl.DataFrame: + ) -> IntoDataFrameT: """ Fetch a remote dataset, attempt caching if possible. @@ -151,7 +136,8 @@ def dataset( Arguments passed to the underlying read function. """ df = self._query(**validate_constraints(name, ext, tag)) - result = cast("Metadata", df.row(0, named=True)) + it = islice(df.iter_rows(named=True), 1) + result = cast("Metadata", next(it)) url = result["url_npm"] fn = self.reader_from(url) @@ -168,6 +154,91 @@ def dataset( with self._opener.open(url) as f: return fn(f.read(), **kwds) + def _query( + self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] + ) -> nw.DataFrame[IntoDataFrameT]: + r""" + Query multi-version trees metadata. + + Parameters + ---------- + \*predicates, \*\*constraints + Passed directly to `pl.LazyFrame.filter`_. + + .. 
_pl.LazyFrame.filter: + https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html + """ + source = self._metadata + fn = self.scanner_from(source) + frame = nw.from_native(fn(source), pass_through=False) + result = frame.filter(_filter_reduce(predicates, constraints)) + df: nw.DataFrame[Any] = ( + result.collect() if isinstance(result, nw.LazyFrame) else result + ) + if not df.is_empty(): + return df + else: + terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) + msg = f"Found no results for:\n{terms}" + raise NotImplementedError(msg) + + +class _PandasPyArrowReader(_Reader["pd.DataFrame", "pd.DataFrame"]): + _read_fn = { + ".csv": cast( + partial["pd.DataFrame"], partial(pd.read_csv, dtype_backend="pyarrow") + ), + ".json": cast( + partial["pd.DataFrame"], partial(pd.read_json, dtype_backend="pyarrow") + ), + ".tsv": cast( + partial["pd.DataFrame"], + partial(pd.read_csv, sep="\t", dtype_backend="pyarrow"), + ), + ".arrow": partial(pd.read_feather, dtype_backend="pyarrow"), + } + _scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend="pyarrow")} + + def __init__(self, metadata: Path, /) -> None: + self._metadata = metadata + + +class _PandasReader(_Reader["pd.DataFrame", "pd.DataFrame"]): + _read_fn = { + ".csv": pd.read_csv, + ".json": pd.read_json, + ".tsv": cast(partial["pd.DataFrame"], partial(pd.read_csv, sep="\t")), + ".arrow": pd.read_feather, + } + _scan_fn = {".parquet": pd.read_parquet} + + def __init__(self, metadata: Path, /) -> None: + self._metadata = metadata + + +class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): + _read_fn = { + ".csv": pl.read_csv, + ".json": pl.read_json, + ".tsv": partial(pl.read_csv, separator="\t"), + ".arrow": partial(pl.read_ipc, use_pyarrow=True), + } + _scan_fn = {".parquet": pl.scan_parquet} + + def __init__(self, metadata: Path, /) -> None: + self._metadata = metadata + + +def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw.Expr: + """ + ``narwhals`` only accepts ``filter(*predicates)`. + + Manually converts the constraints into ``==`` + """ + return nw.all_horizontal( + chain(predicates, (nw.col(name) == v for name, v in constraints.items())) + ) + def validate_constraints( name: DatasetName | LiteralString, @@ -209,3 +280,30 @@ def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]: def is_ext_supported(suffix: Any) -> TypeIs[Extension]: return suffix in {".csv", ".json", ".tsv", ".arrow"} + + +@overload +def get_backend(backend: Literal["polars"], /) -> type[_PolarsReader]: ... +@overload +def get_backend(backend: Literal["pandas"], /) -> type[_PandasReader]: ... +@overload +def get_backend( + backend: Literal["pandas[pyarrow]"], / +) -> type[_PandasPyArrowReader]: ... 
+def get_backend( + backend: Literal["polars", "pandas", "pandas[pyarrow]"], / +) -> type[_PolarsReader] | type[_PandasPyArrowReader] | type[_PandasReader]: + if backend == "polars": + return _PolarsReader + elif backend == "pandas[pyarrow]": + return _PandasPyArrowReader + elif backend == "pandas": + return _PandasReader + elif backend in {"pyarrow", "duckdb"}: + msg = "Included in ``dev``, not investigated yet" + raise NotImplementedError(msg) + elif backend in {"ibis", "cudf", "dask", "modin"}: + msg = "Supported by ``narwhals``, not investigated yet" + raise NotImplementedError(msg) + else: + raise TypeError(backend) From 0ea4e21348bcc7cf799cec11c72f19e06e1c8a49 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 10:35:11 +0000 Subject: [PATCH 050/201] feat: Generate and move `Metadata` (`TypedDict`) to `datasets._typing` --- tools/datasets/__init__.py | 47 +++++++++++++++++++++++++++++++- tools/datasets/_io.py | 3 +-- tools/datasets/_typing.py | 55 +++++++++++++++++++++++++++++++++++++- tools/datasets/models.py | 14 ---------- 4 files changed, 101 insertions(+), 18 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 96932b9af..b569e55d0 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -140,8 +140,12 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None def generate_datasets_typing(application: Application, output: Path, /) -> None: + from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT + app = application tags = app.scan("gh_tags").select("tag").collect().to_series() + metadata_schema = app.scan("gh_trees").collect_schema().to_python() + DATASET_NAME = "dataset_name" names = ( app.scan("gh_trees") @@ -152,20 +156,61 @@ def generate_datasets_typing(application: Application, output: Path, /) -> None: .collect() .to_series() ) + indent = " " * 4 NAME = "DatasetName" TAG = "VersionTag" EXT = "Extension" + METADATA_TD = "Metadata" + DESCRIPTION_DEFAULT = "_description_" + NOTE_SEP = f"\n\n{indent * 2}" f".. note::\n{indent * 3}" + + name_collision = ( + f"Dataset is available via multiple ``suffix``(s).{NOTE_SEP}" + "Requires specifying a preference in calls to ``data(ext=...)``." + ) + sha = ( + f"Unique hash for the dataset.{NOTE_SEP}" + f"If the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{indent * 3}" + f"then all ``tag``(s) in this range would **share** this value." 
+ ) + descriptions: dict[str, str] = { + "dataset_name": "Equivalent to ``Pathlib.Path.stem``.", + "ext_supported": "Dataset can be read as tabular data.", + "file_name": "Equivalent to ``Pathlib.Path.name``.", + "name_collision": name_collision, + "sha": sha, + "size": "File size (*bytes*).", + "suffix": f"File extension.{NOTE_SEP}Equivalent to ``Pathlib.Path.suffix``", + "tag": "``vega-datasets`` release version.", + "url_npm": "Remote url used to access dataset.", + } + metadata_doc = f"\n{indent}".join( + f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" + for param in metadata_schema + ) + contents = ( f"{HEADER_COMMENT}", "from __future__ import annotations\n", "import sys", "from typing import Literal, TYPE_CHECKING", + utils.import_typing_extensions((3, 14), "TypedDict"), utils.import_typing_extensions((3, 10), "TypeAlias"), "\n", - f"__all__ = {[NAME, TAG, EXT]}\n\n" + f"__all__ = {[NAME, TAG, EXT, METADATA_TD]}\n\n" f"{NAME}: TypeAlias = {utils.spell_literal(names)}", f"{TAG}: TypeAlias = {utils.spell_literal(tags)}", f'{EXT}: TypeAlias = {utils.spell_literal([".csv", ".json", ".tsv", ".arrow"])}', + UNIVERSAL_TYPED_DICT.format( + name=METADATA_TD, + metaclass_kwds=", total=False", + td_args=f"\n{indent}".join( + f"{param}: {tp.__name__}" for param, tp in metadata_schema.items() + ), + summary="Full schema for ``metadata.parquet``.", + doc=metadata_doc, + comment="", + ), ) ruff.write_lint_format(output, contents) diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index 2074def12..14159218d 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -56,8 +56,7 @@ else: from typing_extensions import TypeAlias - from tools.datasets._typing import DatasetName, Extension, VersionTag - from tools.datasets.models import Metadata + from tools.datasets._typing import DatasetName, Extension, Metadata, VersionTag from tools.schemapi.utils import OneOrSeq _ExtensionScan: TypeAlias = Literal[".parquet"] diff --git a/tools/datasets/_typing.py b/tools/datasets/_typing.py index 9414aaab4..0a86bc6ba 100644 --- a/tools/datasets/_typing.py +++ b/tools/datasets/_typing.py @@ -6,13 +6,18 @@ import sys from typing import Literal +if sys.version_info >= (3, 14): + from typing import TypedDict +else: + from typing_extensions import TypedDict + if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias -__all__ = ["DatasetName", "Extension", "VersionTag"] +__all__ = ["DatasetName", "Extension", "Metadata", "VersionTag"] DatasetName: TypeAlias = Literal[ "airports", @@ -135,3 +140,51 @@ "v1.5.0", ] Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"] + + +class Metadata(TypedDict, total=False): + """ + Full schema for ``metadata.parquet``. + + Parameters + ---------- + dataset_name + Equivalent to ``Pathlib.Path.stem``. + ext_supported + Dataset can be read as tabular data. + file_name + Equivalent to ``Pathlib.Path.name``. + name_collision + Dataset is available via multiple ``suffix``(s). + + .. note:: + Requires specifying a preference in calls to ``data(ext=...)``. + sha + Unique hash for the dataset. + + .. note:: + If the dataset did *not* change between ``v1.0.0``-``v2.0.0``; + + then all ``tag``(s) in this range would **share** this value. + size + File size (*bytes*). + suffix + File extension. + + .. note:: + Equivalent to ``Pathlib.Path.suffix`` + tag + ``vega-datasets`` release version. + url_npm + Remote url used to access dataset. 
+ """ + + dataset_name: str + ext_supported: bool + file_name: str + name_collision: bool + sha: str + size: int + suffix: str + tag: str + url_npm: str diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 556aafa1a..044447707 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -127,20 +127,6 @@ class ParsedTreesResponse(TypedDict): tree: list[ParsedTree] -class Metadata(TypedDict, total=False): - """Full schema for `metadata.parquet`.""" - - dataset_name: str - ext_supported: bool - file_name: str - name_collision: bool - sha: str - size: int - suffix: str - tag: str - url_npm: str - - class GitHubRateLimit(TypedDict): limit: int used: int From a2e9baa5ddd825efedd26d3aa3a3dfe5630d4e07 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 13:30:55 +0000 Subject: [PATCH 051/201] feat: Adds optional backends, `polars[pyarrow]`, `with_backend` --- tools/datasets/__init__.py | 48 +++++++++++-- tools/datasets/_io.py | 137 +++++++++++++++++++++++-------------- 2 files changed, 127 insertions(+), 58 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index b569e55d0..864829cf6 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -10,9 +10,10 @@ import json import types from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, Generic, Literal, overload import polars as pl +from narwhals.typing import IntoDataFrameT, IntoFrameT from tools.codemod import ruff from tools.datasets._io import get_backend @@ -24,6 +25,8 @@ import sys from collections.abc import Mapping + import pandas as pd + if sys.version_info >= (3, 11): from typing import LiteralString else: @@ -32,6 +35,7 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias + from tools.datasets._io import _Backend, _Reader from tools.datasets._typing import DatasetName, Extension, VersionTag _PathAlias: TypeAlias = Literal["npm_tags", "gh_tags", "gh_trees"] @@ -215,9 +219,8 @@ def generate_datasets_typing(application: Application, output: Path, /) -> None: ruff.write_lint_format(output, contents) -class DataLoader: - def __init__(self, metadata: Path, /) -> None: - self._reader = get_backend("polars")(metadata) +class DataLoader(Generic[IntoDataFrameT, IntoFrameT]): + _reader: _Reader[IntoDataFrameT, IntoFrameT] def url( self, @@ -236,9 +239,40 @@ def __call__( /, tag: VersionTag | Literal["latest"] | None = None, **kwds: Any, - ) -> pl.DataFrame: + ) -> IntoDataFrameT: """Get a remote dataset and load as tabular data.""" return self._reader.dataset(name, ext, tag=tag, **kwds) - -data = DataLoader(app._from_alias("gh_trees")) + @overload + @classmethod + def with_backend( + cls, backend: Literal["polars", "polars[pyarrow]"], / + ) -> DataLoader[pl.DataFrame, pl.LazyFrame]: ... + + @overload + @classmethod + def with_backend( + cls, backend: Literal["pandas", "pandas[pyarrow]"], / + ) -> DataLoader[pd.DataFrame, pd.DataFrame]: ... + + @classmethod + def with_backend(cls, backend: _Backend, /) -> DataLoader[Any, Any]: + """ + Initialize a new loader, using the specified backend. + + Parameters + ---------- + backend + DataFrame package/config used to return data. 
+ + * *polars*: _ + * *polars[pyarrow]*: Using ``use_pyarrow=True`` + * *pandas*: _ + * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` + """ + obj = DataLoader.__new__(DataLoader) + obj._reader = get_backend(backend) + return obj + + +data = DataLoader.with_backend("polars") diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index 14159218d..9bdb6e5e9 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -17,6 +17,8 @@ import os import urllib.request from functools import partial +from importlib import import_module +from importlib.util import find_spec from itertools import chain, islice from pathlib import Path from typing import ( @@ -33,14 +35,15 @@ ) import narwhals.stable.v1 as nw -import pandas as pd -import polars as pl from narwhals.typing import IntoDataFrameT, IntoExpr, IntoFrameT if TYPE_CHECKING: import sys from urllib.request import OpenerDirector + import pandas as pd + import polars as pl + import pyarrow as pa # noqa: F401 from _typeshed import StrPath if sys.version_info >= (3, 13): @@ -61,6 +64,9 @@ _ExtensionScan: TypeAlias = Literal[".parquet"] _T = TypeVar("_T") + _Backend: TypeAlias = Literal[ + "polars", "pandas", "pandas[pyarrow]", "polars[pyarrow]" + ] __all__ = ["get_backend"] @@ -77,7 +83,7 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): _scan_fn: dict[_ExtensionScan, Callable[..., IntoFrameT]] _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" - _metadata: Path + _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" @property def _datasets_dir(self) -> Path | None: # type: ignore[return] @@ -181,51 +187,76 @@ def _query( msg = f"Found no results for:\n{terms}" raise NotImplementedError(msg) + def _import(self, name: str, /) -> Any: + if spec := find_spec(name): + return import_module(spec.name) + else: + msg = f"{type(self).__name__!r} requires missing dependency {name!r}." + raise ModuleNotFoundError(msg, name=name) + + def __init__(self, *specs: str) -> None: ... 
+ class _PandasPyArrowReader(_Reader["pd.DataFrame", "pd.DataFrame"]): - _read_fn = { - ".csv": cast( - partial["pd.DataFrame"], partial(pd.read_csv, dtype_backend="pyarrow") - ), - ".json": cast( - partial["pd.DataFrame"], partial(pd.read_json, dtype_backend="pyarrow") - ), - ".tsv": cast( - partial["pd.DataFrame"], - partial(pd.read_csv, sep="\t", dtype_backend="pyarrow"), - ), - ".arrow": partial(pd.read_feather, dtype_backend="pyarrow"), - } - _scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend="pyarrow")} - - def __init__(self, metadata: Path, /) -> None: - self._metadata = metadata + def __init__(self, _pd: str, _pa: str, /) -> None: + if not TYPE_CHECKING: + pd = self._import(_pd) + pa = self._import(_pa) # noqa: F841 + + self._read_fn = { + ".csv": cast( + partial["pd.DataFrame"], partial(pd.read_csv, dtype_backend="pyarrow") + ), + ".json": cast( + partial["pd.DataFrame"], partial(pd.read_json, dtype_backend="pyarrow") + ), + ".tsv": cast( + partial["pd.DataFrame"], + partial(pd.read_csv, sep="\t", dtype_backend="pyarrow"), + ), + ".arrow": partial(pd.read_feather, dtype_backend="pyarrow"), + } + self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend="pyarrow")} class _PandasReader(_Reader["pd.DataFrame", "pd.DataFrame"]): - _read_fn = { - ".csv": pd.read_csv, - ".json": pd.read_json, - ".tsv": cast(partial["pd.DataFrame"], partial(pd.read_csv, sep="\t")), - ".arrow": pd.read_feather, - } - _scan_fn = {".parquet": pd.read_parquet} - - def __init__(self, metadata: Path, /) -> None: - self._metadata = metadata + def __init__(self, _pd: str, /) -> None: + if not TYPE_CHECKING: + pd = self._import(_pd) + self._read_fn = { + ".csv": pd.read_csv, + ".json": pd.read_json, + ".tsv": cast(partial["pd.DataFrame"], partial(pd.read_csv, sep="\t")), + ".arrow": pd.read_feather, + } + self._scan_fn = {".parquet": pd.read_parquet} class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - _read_fn = { - ".csv": pl.read_csv, - ".json": pl.read_json, - ".tsv": partial(pl.read_csv, separator="\t"), - ".arrow": partial(pl.read_ipc, use_pyarrow=True), - } - _scan_fn = {".parquet": pl.scan_parquet} - - def __init__(self, metadata: Path, /) -> None: - self._metadata = metadata + def __init__(self, _pl: str, /) -> None: + if not TYPE_CHECKING: + pl = self._import(_pl) + self._read_fn = { + ".csv": pl.read_csv, + ".json": pl.read_json, + ".tsv": partial(pl.read_csv, separator="\t"), + ".arrow": pl.read_ipc, + } + self._scan_fn = {".parquet": pl.scan_parquet} + + +class _PolarsPyArrowReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): + def __init__(self, _pl: str, _pa: str, /) -> None: + if not TYPE_CHECKING: + pl = self._import(_pl) + pa = self._import(_pa) # noqa: F841 + self._read_fn = { + ".csv": partial(pl.read_csv, use_pyarrow=True), + ".json": pl.read_json, + ".tsv": partial(pl.read_csv, separator="\t", use_pyarrow=True), + ".arrow": partial(pl.read_ipc, use_pyarrow=True), + } + self._scan_fn = {".parquet": pl.scan_parquet} def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw.Expr: @@ -281,23 +312,27 @@ def is_ext_supported(suffix: Any) -> TypeIs[Extension]: return suffix in {".csv", ".json", ".tsv", ".arrow"} -@overload -def get_backend(backend: Literal["polars"], /) -> type[_PolarsReader]: ... -@overload -def get_backend(backend: Literal["pandas"], /) -> type[_PandasReader]: ... @overload def get_backend( - backend: Literal["pandas[pyarrow]"], / -) -> type[_PandasPyArrowReader]: ... 
+ backend: Literal["polars", "polars[pyarrow]"], / +) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... + + +@overload def get_backend( - backend: Literal["polars", "pandas", "pandas[pyarrow]"], / -) -> type[_PolarsReader] | type[_PandasPyArrowReader] | type[_PandasReader]: + backend: Literal["pandas", "pandas[pyarrow]"], / +) -> _Reader[pd.DataFrame, pd.DataFrame]: ... + + +def get_backend(backend: _Backend, /) -> _Reader[Any, Any]: if backend == "polars": - return _PolarsReader + return _PolarsReader("polars") + elif backend == "polars[pyarrow]": + return _PolarsPyArrowReader("polars", "pyarrow") elif backend == "pandas[pyarrow]": - return _PandasPyArrowReader + return _PandasPyArrowReader("pandas", "pyarrow") elif backend == "pandas": - return _PandasReader + return _PandasReader("pandas") elif backend in {"pyarrow", "duckdb"}: msg = "Included in ``dev``, not investigated yet" raise NotImplementedError(msg) From c8a1429064d20a1ed89e7723363c52779b5650cc Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 15:19:10 +0000 Subject: [PATCH 052/201] feat: Adds `pyarrow` backend --- tools/datasets/__init__.py | 7 +++++ tools/datasets/_io.py | 59 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 864829cf6..3c1c8b13d 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -26,6 +26,7 @@ from collections.abc import Mapping import pandas as pd + import pyarrow as pa if sys.version_info >= (3, 11): from typing import LiteralString @@ -255,6 +256,12 @@ def with_backend( cls, backend: Literal["pandas", "pandas[pyarrow]"], / ) -> DataLoader[pd.DataFrame, pd.DataFrame]: ... + @overload + @classmethod + def with_backend( + cls, backend: Literal["pyarrow"], / + ) -> DataLoader[pa.Table, pa.Table]: ... + @classmethod def with_backend(cls, backend: _Backend, /) -> DataLoader[Any, Any]: """ diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index 9bdb6e5e9..a75d0bd17 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -43,8 +43,12 @@ import pandas as pd import polars as pl - import pyarrow as pa # noqa: F401 + import pyarrow as pa from _typeshed import StrPath + from pyarrow.csv import read_csv as pa_read_csv # noqa: F401 + from pyarrow.feather import read_table as pa_read_feather # noqa: F401 + from pyarrow.json import read_json as pa_read_json # noqa: F401 + from pyarrow.parquet import read_table as pa_read_parquet # noqa: F401 if sys.version_info >= (3, 13): from typing import TypeIs, Unpack @@ -65,7 +69,7 @@ _ExtensionScan: TypeAlias = Literal[".parquet"] _T = TypeVar("_T") _Backend: TypeAlias = Literal[ - "polars", "pandas", "pandas[pyarrow]", "polars[pyarrow]" + "polars", "pandas", "pandas[pyarrow]", "polars[pyarrow]", "pyarrow" ] @@ -259,6 +263,49 @@ def __init__(self, _pl: str, _pa: str, /) -> None: self._scan_fn = {".parquet": pl.scan_parquet} +class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): + """ + Reader backed by `pyarrow.Table`_. + + Warning + ------- + **JSON**: Only supports `line-delimited`_ JSON. + Likely to raise the following error: + + ArrowInvalid: JSON parse error: Column() changed from object to array in row 0 + + .. _pyarrow.Table: + https://arrow.apache.org/docs/python/generated/pyarrow.Table.html + .. 
_line-delimited: + https://arrow.apache.org/docs/python/json.html#reading-json-files + """ + + def __init__(self, _pa: str, /) -> None: + if not TYPE_CHECKING: + pa = self._import(_pa) # noqa: F841 + pa_csv = self._import(f"{_pa}.csv") + pa_feather = self._import(f"{_pa}.feather") + pa_json = self._import(f"{_pa}.json") + pa_parquet = self._import(f"{_pa}.parquet") + + pa_read_csv = pa_csv.read_csv + pa_read_feather = pa_feather.read_table + pa_read_json = pa_json.read_json + pa_read_parquet = pa_parquet.read_table + + # opt1 = ParseOptions(delimiter="\t") # type: ignore + # Stubs suggest using a dataclass, but no way to construct it + opt2: Any = {"delimiter": "\t"} + + self._read_fn = { + ".csv": pa_read_csv, + ".json": pa_read_json, + ".tsv": partial(pa_read_csv, parse_options=opt2), + ".arrow": pa_read_feather, + } + self._scan_fn = {".parquet": pa_read_parquet} + + def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw.Expr: """ ``narwhals`` only accepts ``filter(*predicates)`. @@ -324,6 +371,10 @@ def get_backend( ) -> _Reader[pd.DataFrame, pd.DataFrame]: ... +@overload +def get_backend(backend: Literal["pyarrow"], /) -> _Reader[pa.Table, pa.Table]: ... + + def get_backend(backend: _Backend, /) -> _Reader[Any, Any]: if backend == "polars": return _PolarsReader("polars") @@ -333,7 +384,9 @@ def get_backend(backend: _Backend, /) -> _Reader[Any, Any]: return _PandasPyArrowReader("pandas", "pyarrow") elif backend == "pandas": return _PandasReader("pandas") - elif backend in {"pyarrow", "duckdb"}: + elif backend == "pyarrow": + return _PyArrowReader("pyarrow") + elif backend == "duckdb": msg = "Included in ``dev``, not investigated yet" raise NotImplementedError(msg) elif backend in {"ibis", "cudf", "dask", "modin"}: From 279fea952007d83bd99e6cba1dfb79ca1a8ff70a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 15:19:52 +0000 Subject: [PATCH 053/201] docs: Update `.with_backend()` --- tools/datasets/__init__.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 3c1c8b13d..6592d5d93 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -272,10 +272,21 @@ def with_backend(cls, backend: _Backend, /) -> DataLoader[Any, Any]: backend DataFrame package/config used to return data. - * *polars*: _ + * *polars*: Using `polars defaults`_ * *polars[pyarrow]*: Using ``use_pyarrow=True`` - * *pandas*: _ + * *pandas*: Using `pandas defaults`_. * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` + * *pyarrow*: (*Experimental*) + + .. warning:: + Most datasets use a `JSON format not supported`_ by ``pyarrow`` + + .. _polars defaults: + https://docs.pola.rs/api/python/stable/reference/io.html + .. _pandas defaults: + https://pandas.pydata.org/docs/reference/io.html + .. 
_JSON format not supported: + https://arrow.apache.org/docs/python/json.html#reading-json-files """ obj = DataLoader.__new__(DataLoader) obj._reader = get_backend(backend) From 7d6c7ca2dce60c30b3c5e0107f9a496a17cb9695 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 16:17:40 +0000 Subject: [PATCH 054/201] chore: Remove `duckdb` comment Not planning to support this anymore, requires `fsspec` which isn't in `dev` ``` InvalidInputException Traceback (most recent call last) Cell In[6], line 5 3 with duck._reader._opener.open(url) as f: 4 fn = duck._reader._read_fn['.json'] ----> 5 thing = fn(f.read()) InvalidInputException: Invalid Input Error: This operation could not be completed because required module 'fsspec' is not installed" ``` --- tools/datasets/_io.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index a75d0bd17..7989ae282 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -386,9 +386,6 @@ def get_backend(backend: _Backend, /) -> _Reader[Any, Any]: return _PandasReader("pandas") elif backend == "pyarrow": return _PyArrowReader("pyarrow") - elif backend == "duckdb": - msg = "Included in ``dev``, not investigated yet" - raise NotImplementedError(msg) elif backend in {"ibis", "cudf", "dask", "modin"}: msg = "Supported by ``narwhals``, not investigated yet" raise NotImplementedError(msg) From 0bb4210b5aa5ff22c345946a8e73a432373529ff Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 16:21:09 +0000 Subject: [PATCH 055/201] ci(typing): Add `pyarrow-stubs` to `dev` dependencies Will put this in another PR, but need it here for IDE support --- pyproject.toml | 1 + tests/utils/test_utils.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ae15a8a4b..4132f0a25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ dev = [ "duckdb>=1.0", "ipython[kernel]", "pandas>=1.1.3", + "pyarrow-stubs", "pytest", "pytest-cov", "pytest-xdist[psutil]~=3.5", diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index c3b329cf0..36ed1b097 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -137,10 +137,11 @@ def test_sanitize_pyarrow_table_columns() -> None: ) # Create pyarrow table with explicit schema so that date32 type is preserved + # error: Argument 1 to "schema" has incompatible type "list[object]"; expected "Iterable[Field[Any]] | Iterable[tuple[str, DataType]] | Mapping[str, DataType]" [arg-type] pa_table = pa.Table.from_pandas( df, pa.schema( - [ + ( pa.field("s", pa.string()), pa.field("f", pa.float64()), pa.field("i", pa.int64()), @@ -148,7 +149,7 @@ def test_sanitize_pyarrow_table_columns() -> None: pa.field("d", pa.date32()), pa.field("c", pa.dictionary(pa.int8(), pa.string())), pa.field("p", pa.timestamp("ns", tz="UTC")), - ] + ) ), ) sanitized = sanitize_narwhals_dataframe(nw.from_native(pa_table, eager_only=True)) From 89844253a51de27d4dac0590b013fb4f5361dd35 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 16:30:24 +0000 Subject: [PATCH 056/201] refactor: `generate_datasets_typing` -> `Application.generate_typing` --- tools/datasets/__init__.py | 150 ++++++++++++++++++------------------- 1 file changed, 74 insertions(+), 76 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 
6592d5d93..645775fb4 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -134,6 +134,80 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None with fp_schema.open("w") as f: json.dump(schema, f, indent=2) + def generate_typing(self, output: Path, /) -> None: + from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT + + tags = self.scan("gh_tags").select("tag").collect().to_series() + metadata_schema = self.scan("gh_trees").collect_schema().to_python() + + DATASET_NAME = "dataset_name" + names = ( + self.scan("gh_trees") + .filter("ext_supported") + .unique(DATASET_NAME) + .select(DATASET_NAME) + .sort(DATASET_NAME) + .collect() + .to_series() + ) + indent = " " * 4 + NAME = "DatasetName" + TAG = "VersionTag" + EXT = "Extension" + METADATA_TD = "Metadata" + DESCRIPTION_DEFAULT = "_description_" + NOTE_SEP = f"\n\n{indent * 2}" f".. note::\n{indent * 3}" + + name_collision = ( + f"Dataset is available via multiple ``suffix``(s).{NOTE_SEP}" + "Requires specifying a preference in calls to ``data(ext=...)``." + ) + sha = ( + f"Unique hash for the dataset.{NOTE_SEP}" + f"If the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{indent * 3}" + f"then all ``tag``(s) in this range would **share** this value." + ) + descriptions: dict[str, str] = { + "dataset_name": "Equivalent to ``Pathlib.Path.stem``.", + "ext_supported": "Dataset can be read as tabular data.", + "file_name": "Equivalent to ``Pathlib.Path.name``.", + "name_collision": name_collision, + "sha": sha, + "size": "File size (*bytes*).", + "suffix": f"File extension.{NOTE_SEP}Equivalent to ``Pathlib.Path.suffix``", + "tag": "``vega-datasets`` release version.", + "url_npm": "Remote url used to access dataset.", + } + metadata_doc = f"\n{indent}".join( + f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" + for param in metadata_schema + ) + + contents = ( + f"{HEADER_COMMENT}", + "from __future__ import annotations\n", + "import sys", + "from typing import Literal, TYPE_CHECKING", + utils.import_typing_extensions((3, 14), "TypedDict"), + utils.import_typing_extensions((3, 10), "TypeAlias"), + "\n", + f"__all__ = {[NAME, TAG, EXT, METADATA_TD]}\n\n" + f"{NAME}: TypeAlias = {utils.spell_literal(names)}", + f"{TAG}: TypeAlias = {utils.spell_literal(tags)}", + f'{EXT}: TypeAlias = {utils.spell_literal([".csv", ".json", ".tsv", ".arrow"])}', + UNIVERSAL_TYPED_DICT.format( + name=METADATA_TD, + metaclass_kwds=", total=False", + td_args=f"\n{indent}".join( + f"{param}: {tp.__name__}" for param, tp in metadata_schema.items() + ), + summary="Full schema for ``metadata.parquet``.", + doc=metadata_doc, + comment="", + ), + ) + ruff.write_lint_format(output, contents) + app = Application(Path(__file__).parent / "_metadata", write_schema=True) @@ -144,82 +218,6 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None _CURRENT_SOURCE_TAG = "v2.9.0" -def generate_datasets_typing(application: Application, output: Path, /) -> None: - from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT - - app = application - tags = app.scan("gh_tags").select("tag").collect().to_series() - metadata_schema = app.scan("gh_trees").collect_schema().to_python() - - DATASET_NAME = "dataset_name" - names = ( - app.scan("gh_trees") - .filter("ext_supported") - .unique(DATASET_NAME) - .select(DATASET_NAME) - .sort(DATASET_NAME) - .collect() - .to_series() - ) - indent = " " * 4 - NAME = "DatasetName" - TAG = "VersionTag" - EXT = "Extension" - METADATA_TD = 
"Metadata" - DESCRIPTION_DEFAULT = "_description_" - NOTE_SEP = f"\n\n{indent * 2}" f".. note::\n{indent * 3}" - - name_collision = ( - f"Dataset is available via multiple ``suffix``(s).{NOTE_SEP}" - "Requires specifying a preference in calls to ``data(ext=...)``." - ) - sha = ( - f"Unique hash for the dataset.{NOTE_SEP}" - f"If the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{indent * 3}" - f"then all ``tag``(s) in this range would **share** this value." - ) - descriptions: dict[str, str] = { - "dataset_name": "Equivalent to ``Pathlib.Path.stem``.", - "ext_supported": "Dataset can be read as tabular data.", - "file_name": "Equivalent to ``Pathlib.Path.name``.", - "name_collision": name_collision, - "sha": sha, - "size": "File size (*bytes*).", - "suffix": f"File extension.{NOTE_SEP}Equivalent to ``Pathlib.Path.suffix``", - "tag": "``vega-datasets`` release version.", - "url_npm": "Remote url used to access dataset.", - } - metadata_doc = f"\n{indent}".join( - f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" - for param in metadata_schema - ) - - contents = ( - f"{HEADER_COMMENT}", - "from __future__ import annotations\n", - "import sys", - "from typing import Literal, TYPE_CHECKING", - utils.import_typing_extensions((3, 14), "TypedDict"), - utils.import_typing_extensions((3, 10), "TypeAlias"), - "\n", - f"__all__ = {[NAME, TAG, EXT, METADATA_TD]}\n\n" - f"{NAME}: TypeAlias = {utils.spell_literal(names)}", - f"{TAG}: TypeAlias = {utils.spell_literal(tags)}", - f'{EXT}: TypeAlias = {utils.spell_literal([".csv", ".json", ".tsv", ".arrow"])}', - UNIVERSAL_TYPED_DICT.format( - name=METADATA_TD, - metaclass_kwds=", total=False", - td_args=f"\n{indent}".join( - f"{param}: {tp.__name__}" for param, tp in metadata_schema.items() - ), - summary="Full schema for ``metadata.parquet``.", - doc=metadata_doc, - comment="", - ), - ) - ruff.write_lint_format(output, contents) - - class DataLoader(Generic[IntoDataFrameT, IntoFrameT]): _reader: _Reader[IntoDataFrameT, IntoFrameT] From 9d062c8c8e030d4ea6b1288cf9e93692c60c78a0 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 18:22:21 +0000 Subject: [PATCH 057/201] refactor: Split `datasets` into public/private packages - `tools.datasets`: Building & updating metadata file(s), generating annotations - `altair.datasets`: Consuming metadata, remote & cached dataset management --- altair/__init__.py | 3 +- altair/datasets/__init__.py | 117 ++++++++++++++++++ .../datasets/_metadata/metadata.parquet | Bin .../_io.py => altair/datasets/_readers.py | 11 +- {tools => altair}/datasets/_typing.py | 0 tools/datasets/__init__.py | 112 +++-------------- tools/datasets/_metadata/metadata-schema.json | 11 -- tools/datasets/github.py | 14 ++- 8 files changed, 146 insertions(+), 122 deletions(-) create mode 100644 altair/datasets/__init__.py rename {tools => altair}/datasets/_metadata/metadata.parquet (100%) rename tools/datasets/_io.py => altair/datasets/_readers.py (97%) rename {tools => altair}/datasets/_typing.py (100%) delete mode 100644 tools/datasets/_metadata/metadata-schema.json diff --git a/altair/__init__.py b/altair/__init__.py index d4e20f02f..d0d23dbaf 100644 --- a/altair/__init__.py +++ b/altair/__init__.py @@ -603,6 +603,7 @@ "core", "data", "data_transformers", + "datasets", "datum", "default_data_transformer", "display", @@ -653,7 +654,7 @@ def __dir__(): from altair.jupyter import JupyterChart from altair.expr import expr from altair.utils import 
AltairDeprecationWarning, parse_shorthand, Undefined -from altair import typing +from altair import typing, datasets def load_ipython_extension(ipython): diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py new file mode 100644 index 000000000..15c8069f9 --- /dev/null +++ b/altair/datasets/__init__.py @@ -0,0 +1,117 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Generic, overload + +from narwhals.typing import IntoDataFrameT, IntoFrameT + +from altair.datasets._readers import _Reader, get_backend + +if TYPE_CHECKING: + import sys + from typing import Any, Literal + + import pandas as pd + import polars as pl + import pyarrow as pa + + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + from altair.datasets._readers import _Backend + from altair.datasets._typing import DatasetName, Extension, VersionTag + +__all__ = ["Loader", "data"] + + +class Loader(Generic[IntoDataFrameT, IntoFrameT]): + _reader: _Reader[IntoDataFrameT, IntoFrameT] + + def url( + self, + name: DatasetName | LiteralString, + ext: Extension | None = None, + /, + tag: VersionTag | Literal["latest"] | None = None, + ) -> str: + """Return the address of a remote dataset.""" + return self._reader.url(name, ext, tag=tag) + + def __call__( + self, + name: DatasetName | LiteralString, + ext: Extension | None = None, + /, + tag: VersionTag | Literal["latest"] | None = None, + **kwds: Any, + ) -> IntoDataFrameT: + """Get a remote dataset and load as tabular data.""" + return self._reader.dataset(name, ext, tag=tag, **kwds) + + def __repr__(self) -> str: + return f"{type(self).__name__}[{type(self._reader).__name__}]" + + @overload + @classmethod + def with_backend( + cls, backend: Literal["polars", "polars[pyarrow]"], / + ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... + + @overload + @classmethod + def with_backend( + cls, backend: Literal["pandas", "pandas[pyarrow]"], / + ) -> Loader[pd.DataFrame, pd.DataFrame]: ... + + @overload + @classmethod + def with_backend( + cls, backend: Literal["pyarrow"], / + ) -> Loader[pa.Table, pa.Table]: ... + + @classmethod + def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: + """ + Initialize a new loader, using the specified backend. + + Parameters + ---------- + backend + DataFrame package/config used to return data. + + * *polars*: Using `polars defaults`_ + * *polars[pyarrow]*: Using ``use_pyarrow=True`` + * *pandas*: Using `pandas defaults`_. + * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` + * *pyarrow*: (*Experimental*) + + .. warning:: + Most datasets use a `JSON format not supported`_ by ``pyarrow`` + + .. _polars defaults: + https://docs.pola.rs/api/python/stable/reference/io.html + .. _pandas defaults: + https://pandas.pydata.org/docs/reference/io.html + .. 
_JSON format not supported: + https://arrow.apache.org/docs/python/json.html#reading-json-files + """ + obj = Loader.__new__(Loader) + obj._reader = get_backend(backend) + return obj + + +def __getattr__(name): + if name == "data": + global data + data = Loader.with_backend("pandas") + from altair.utils.deprecation import deprecated_warn + + deprecated_warn( + "Added only for backwards compatibility with `altair-viz/vega_datasets`.", + version="5.5.0", + alternative="altair.datasets.Loader.with_backend(...)", + stacklevel=3, + ) + return data + else: + raise AttributeError(name) diff --git a/tools/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet similarity index 100% rename from tools/datasets/_metadata/metadata.parquet rename to altair/datasets/_metadata/metadata.parquet diff --git a/tools/datasets/_io.py b/altair/datasets/_readers.py similarity index 97% rename from tools/datasets/_io.py rename to altair/datasets/_readers.py index 7989ae282..cbb02cd00 100644 --- a/tools/datasets/_io.py +++ b/altair/datasets/_readers.py @@ -1,15 +1,10 @@ """ -Will be part of the public ``alt.datasets`` subpackage. +Backends for ``alt.datasets.Loader``. - Interfacing with the cached metadata. - But not updating it - Performing requests from those urls - Dispatching read function on file extension - -Note ----- -- Building with ``polars`` first, then will work backwards with ``narwhals``. - - Since ``narwhals`` is a subset of ``polars`` """ from __future__ import annotations @@ -63,8 +58,8 @@ else: from typing_extensions import TypeAlias - from tools.datasets._typing import DatasetName, Extension, Metadata, VersionTag - from tools.schemapi.utils import OneOrSeq + from altair.datasets._typing import DatasetName, Extension, Metadata, VersionTag + from altair.vegalite.v5.schema._typing import OneOrSeq _ExtensionScan: TypeAlias = Literal[".parquet"] _T = TypeVar("_T") diff --git a/tools/datasets/_typing.py b/altair/datasets/_typing.py similarity index 100% rename from tools/datasets/_typing.py rename to altair/datasets/_typing.py diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 645775fb4..d9b00d9a5 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -10,13 +10,11 @@ import json import types from pathlib import Path -from typing import TYPE_CHECKING, Any, Generic, Literal, overload +from typing import TYPE_CHECKING, Any, Literal import polars as pl -from narwhals.typing import IntoDataFrameT, IntoFrameT from tools.codemod import ruff -from tools.datasets._io import get_backend from tools.datasets.github import GitHub from tools.datasets.npm import Npm from tools.schemapi import utils @@ -25,25 +23,14 @@ import sys from collections.abc import Mapping - import pandas as pd - import pyarrow as pa - - if sys.version_info >= (3, 11): - from typing import LiteralString - else: - from typing_extensions import LiteralString if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias - from tools.datasets._io import _Backend, _Reader - from tools.datasets._typing import DatasetName, Extension, VersionTag _PathAlias: TypeAlias = Literal["npm_tags", "gh_tags", "gh_trees"] - WorkInProgress: TypeAlias = Any - -__all__ = ["app", "data"] +__all__ = ["app"] HEADER_COMMENT = """\ # The contents of this file are automatically written by @@ -61,7 +48,8 @@ class Application: def __init__( self, - output_dir: Path, + out_dir_tools: Path, + out_dir_altair: Path, *, write_schema: bool, trees_gh: str = 
"metadata", @@ -70,14 +58,18 @@ def __init__( kwds_gh: Mapping[str, Any] | None = None, kwds_npm: Mapping[str, Any] | None = None, ) -> None: - output_dir.mkdir(exist_ok=True) + out_dir_tools.mkdir(exist_ok=True) kwds_gh = kwds_gh or {} kwds_npm = kwds_npm or {} self._write_schema: bool = write_schema self._github: GitHub = GitHub( - output_dir, name_tags=tags_gh, name_trees=trees_gh, **kwds_gh + out_dir_tools, + out_dir_altair, + name_tags=tags_gh, + name_trees=trees_gh, + **kwds_gh, ) - self._npm: Npm = Npm(output_dir, name_tags=tags_npm, **kwds_npm) + self._npm: Npm = Npm(out_dir_tools, name_tags=tags_npm, **kwds_npm) self._paths = types.MappingProxyType["_PathAlias", Path]( { "npm_tags": self.npm._paths["tags"], @@ -209,86 +201,14 @@ def generate_typing(self, output: Path, /) -> None: ruff.write_lint_format(output, contents) -app = Application(Path(__file__).parent / "_metadata", write_schema=True) +app = Application( + Path(__file__).parent / "_metadata", + Path(__file__).parent.parent.parent / "altair" / "datasets" / "_metadata", + write_schema=False, +) # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago _CURRENT_SOURCE_TAG = "v2.9.0" - - -class DataLoader(Generic[IntoDataFrameT, IntoFrameT]): - _reader: _Reader[IntoDataFrameT, IntoFrameT] - - def url( - self, - name: DatasetName | LiteralString, - ext: Extension | None = None, - /, - tag: VersionTag | Literal["latest"] | None = None, - ) -> str: - """Return the address of a remote dataset.""" - return self._reader.url(name, ext, tag=tag) - - def __call__( - self, - name: DatasetName | LiteralString, - ext: Extension | None = None, - /, - tag: VersionTag | Literal["latest"] | None = None, - **kwds: Any, - ) -> IntoDataFrameT: - """Get a remote dataset and load as tabular data.""" - return self._reader.dataset(name, ext, tag=tag, **kwds) - - @overload - @classmethod - def with_backend( - cls, backend: Literal["polars", "polars[pyarrow]"], / - ) -> DataLoader[pl.DataFrame, pl.LazyFrame]: ... - - @overload - @classmethod - def with_backend( - cls, backend: Literal["pandas", "pandas[pyarrow]"], / - ) -> DataLoader[pd.DataFrame, pd.DataFrame]: ... - - @overload - @classmethod - def with_backend( - cls, backend: Literal["pyarrow"], / - ) -> DataLoader[pa.Table, pa.Table]: ... - - @classmethod - def with_backend(cls, backend: _Backend, /) -> DataLoader[Any, Any]: - """ - Initialize a new loader, using the specified backend. - - Parameters - ---------- - backend - DataFrame package/config used to return data. - - * *polars*: Using `polars defaults`_ - * *polars[pyarrow]*: Using ``use_pyarrow=True`` - * *pandas*: Using `pandas defaults`_. - * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` - * *pyarrow*: (*Experimental*) - - .. warning:: - Most datasets use a `JSON format not supported`_ by ``pyarrow`` - - .. _polars defaults: - https://docs.pola.rs/api/python/stable/reference/io.html - .. _pandas defaults: - https://pandas.pydata.org/docs/reference/io.html - .. 
_JSON format not supported: - https://arrow.apache.org/docs/python/json.html#reading-json-files - """ - obj = DataLoader.__new__(DataLoader) - obj._reader = get_backend(backend) - return obj - - -data = DataLoader.with_backend("polars") diff --git a/tools/datasets/_metadata/metadata-schema.json b/tools/datasets/_metadata/metadata-schema.json deleted file mode 100644 index 53d9978b3..000000000 --- a/tools/datasets/_metadata/metadata-schema.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "dataset_name": "str", - "ext_supported": "bool", - "file_name": "str", - "name_collision": "bool", - "sha": "str", - "size": "int", - "suffix": "str", - "tag": "str", - "url_npm": "str" -} \ No newline at end of file diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 0238aab69..8b58e8690 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -37,7 +37,7 @@ from email.message import Message from urllib.request import OpenerDirector, Request - from tools.datasets._typing import Extension + from altair.datasets._typing import Extension if sys.version_info >= (3, 13): from typing import TypeIs @@ -270,7 +270,8 @@ class GitHub: def __init__( self, - output_dir: Path, + out_dir_tools: Path, + out_dir_altair: Path, name_tags: str, name_trees: str, *, @@ -278,11 +279,12 @@ def __init__( org: LiteralString = "vega", package: LiteralString = "vega-datasets", ) -> None: - output_dir.mkdir(exist_ok=True) + out_dir_tools.mkdir(exist_ok=True) + out_dir_altair.mkdir(exist_ok=True) self._paths: dict[_PathName, Path] = { - "dir": output_dir, - "tags": output_dir / f"{name_tags}.parquet", - "trees": output_dir / f"{name_trees}.parquet", + "dir": out_dir_tools, + "tags": out_dir_tools / f"{name_tags}.parquet", + "trees": out_dir_altair / f"{name_trees}.parquet", } repo = f"{base_url}repos/{org}/{package}/" self._url = GitHubUrl( From a17d674303558f0989b2aaac835efa3d04de80cc Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 18:44:57 +0000 Subject: [PATCH 058/201] refactor: Provide `npm` url to `GitHub(...)` --- tools/datasets/__init__.py | 3 ++- tools/datasets/github.py | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index d9b00d9a5..6319bd65e 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -62,14 +62,15 @@ def __init__( kwds_gh = kwds_gh or {} kwds_npm = kwds_npm or {} self._write_schema: bool = write_schema + self._npm: Npm = Npm(out_dir_tools, name_tags=tags_npm, **kwds_npm) self._github: GitHub = GitHub( out_dir_tools, out_dir_altair, name_tags=tags_gh, name_trees=trees_gh, + npm_cdn_url=self._npm.url.CDN, **kwds_gh, ) - self._npm: Npm = Npm(out_dir_tools, name_tags=tags_npm, **kwds_npm) self._paths = types.MappingProxyType["_PathAlias", Path]( { "npm_tags": self.npm._paths["tags"], diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 8b58e8690..c2d7141aa 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -59,10 +59,7 @@ _TD = TypeVar("_TD", bound=Mapping[str, Any]) - -# TODO: Work on where these should live/be accessed -_NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" -_SUB_DIR = "data" +_DATA = "data" def is_ext_supported(suffix: str) -> TypeIs[Extension]: @@ -152,7 +149,7 @@ def trees(self, tag: str | ParsedTag, /) -> GitHubTreesResponse: url = tag["trees_url"] with self._gh._opener.open(self._request(url)) as response: content: GitHubTreesResponse = 
json.load(response) - query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) + query = (tree["url"] for tree in content["tree"] if tree["path"] == _DATA) if data_url := next(query, None): with self._gh._opener.open(self._request(data_url)) as response: data_dir: GitHubTreesResponse = json.load(response) @@ -237,12 +234,13 @@ def tag_from_str(self, s: str, /) -> str: # - Trees url (using ref name) # - npm url (works w/o the `v` prefix) trees_url = self.url.TREES + npm_url = self._gh._npm_cdn_url if s.startswith("v"): return s elif s.startswith(trees_url): return s.replace(trees_url, "") - elif s.startswith(_NPM_BASE_URL): - s, _ = s.replace(_NPM_BASE_URL, "").split("/") + elif s.startswith(npm_url): + s, _ = s.replace(npm_url, "").split("/") return s if s.startswith("v") else f"v{s}" else: raise TypeError(s) @@ -275,6 +273,7 @@ def __init__( name_tags: str, name_trees: str, *, + npm_cdn_url: LiteralString, base_url: LiteralString = "https://api.github.com/", org: LiteralString = "vega", package: LiteralString = "vega-datasets", @@ -295,6 +294,7 @@ def __init__( TAGS=f"{repo}tags", TREES=f"{repo}git/trees/", ) + self._npm_cdn_url: LiteralString = npm_cdn_url @property def req(self) -> _GitHubRequestNamespace: @@ -331,9 +331,9 @@ def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: .with_columns(name_collision=pl.col("dataset_name").is_duplicated()) .with_columns( url_npm=pl.concat_str( - pl.lit(_NPM_BASE_URL), + pl.lit(self._npm_cdn_url), pl.col("tag"), - pl.lit(f"/{_SUB_DIR}/"), + pl.lit(f"/{_DATA}/"), pl.col("file_name"), ) ) @@ -345,7 +345,7 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: """ Use known tags to discover and update missing trees metadata. - Aims to stay well-within API rate limits, both for authenticated ad unauthenticated users. + Aims to stay well-within API rate limits, both for authenticated and unauthenticated users. 
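As an aside on the ``url_npm`` column assembled in this hunk: it is just the CDN root, the release tag, the ``data/`` directory and the file name concatenated. A minimal sketch of the same ``pl.concat_str`` construction, using a hypothetical one-row frame in place of the real ``trees`` output and assuming the jsDelivr root passed in as ``npm_cdn_url``:

import polars as pl

# Stand-in for one row of the frame built in ``GitHub.trees`` (not part of the patch).
files = pl.DataFrame({"tag": ["v2.9.0"], "file_name": ["cars.json"]})
# Assumed CDN root, matching the value handed over as ``npm_cdn_url``.
NPM_CDN_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@"

urls = files.with_columns(
    url_npm=pl.concat_str(
        pl.lit(NPM_CDN_URL), pl.col("tag"), pl.lit("/data/"), pl.col("file_name")
    )
)
print(urls.item(0, "url_npm"))
# https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json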
""" if gh_tags.is_empty(): msg = f"Expected rows present in `gh_tags`, but got:\n{gh_tags!r}" From 69a619caeaa803599dfc080ed7b3b34f0ca10386 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 18:55:42 +0000 Subject: [PATCH 059/201] refactor: Rename `ext` -> `suffix` --- altair/datasets/__init__.py | 8 ++++---- altair/datasets/_readers.py | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 15c8069f9..0db434979 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -30,23 +30,23 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): def url( self, name: DatasetName | LiteralString, - ext: Extension | None = None, + suffix: Extension | None = None, /, tag: VersionTag | Literal["latest"] | None = None, ) -> str: """Return the address of a remote dataset.""" - return self._reader.url(name, ext, tag=tag) + return self._reader.url(name, suffix, tag=tag) def __call__( self, name: DatasetName | LiteralString, - ext: Extension | None = None, + suffix: Extension | None = None, /, tag: VersionTag | Literal["latest"] | None = None, **kwds: Any, ) -> IntoDataFrameT: """Get a remote dataset and load as tabular data.""" - return self._reader.dataset(name, ext, tag=tag, **kwds) + return self._reader.dataset(name, suffix, tag=tag, **kwds) def __repr__(self) -> str: return f"{type(self).__name__}[{type(self._reader).__name__}]" diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index cbb02cd00..cebbe1526 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -109,11 +109,11 @@ def scanner_from(self, source: StrPath, /) -> Callable[..., IntoFrameT]: def url( self, name: DatasetName | LiteralString, - ext: Extension | None = None, + suffix: Extension | None = None, /, tag: VersionTag | Literal["latest"] | None = None, ) -> str: - df = self._query(**validate_constraints(name, ext, tag)) + df = self._query(**validate_constraints(name, suffix, tag)) url = df.item(0, "url_npm") if isinstance(url, str): return url @@ -124,7 +124,7 @@ def url( def dataset( self, name: DatasetName | LiteralString, - ext: Extension | None = None, + suffix: Extension | None = None, /, tag: VersionTag | Literal["latest"] | None = None, **kwds: Any, @@ -134,12 +134,12 @@ def dataset( Parameters ---------- - name, ext, tag + name, suffix, tag TODO **kwds Arguments passed to the underlying read function. """ - df = self._query(**validate_constraints(name, ext, tag)) + df = self._query(**validate_constraints(name, suffix, tag)) it = islice(df.iter_rows(named=True), 1) result = cast("Metadata", next(it)) url = result["url_npm"] @@ -314,7 +314,7 @@ def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw. 
def validate_constraints( name: DatasetName | LiteralString, - ext: Extension | None, + suffix: Extension | None, tag: VersionTag | Literal["latest"] | None, /, ) -> Metadata: @@ -328,11 +328,11 @@ def validate_constraints( constraints["dataset_name"] = fp.stem constraints["suffix"] = fp.suffix return constraints - elif ext is not None: - if not is_ext_supported(ext): - raise TypeError(ext) + elif suffix is not None: + if not is_ext_supported(suffix): + raise TypeError(suffix) else: - constraints["suffix"] = ext + constraints["suffix"] = suffix constraints["dataset_name"] = name return constraints From a259b1070e1a2dcd356992bc6fd95982cf6b9ef6 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 18:57:31 +0000 Subject: [PATCH 060/201] refactor: Remove unimplemented `tag="latest"` Since `metadata.parquet` is sorted, this was already the behavior when not providing a tag --- altair/datasets/__init__.py | 4 ++-- altair/datasets/_readers.py | 10 ++++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 0db434979..c2ccee2fe 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -32,7 +32,7 @@ def url( name: DatasetName | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | Literal["latest"] | None = None, + tag: VersionTag | None = None, ) -> str: """Return the address of a remote dataset.""" return self._reader.url(name, suffix, tag=tag) @@ -42,7 +42,7 @@ def __call__( name: DatasetName | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | Literal["latest"] | None = None, + tag: VersionTag | None = None, **kwds: Any, ) -> IntoDataFrameT: """Get a remote dataset and load as tabular data.""" diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index cebbe1526..b344bd67a 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -111,7 +111,7 @@ def url( name: DatasetName | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | Literal["latest"] | None = None, + tag: VersionTag | None = None, ) -> str: df = self._query(**validate_constraints(name, suffix, tag)) url = df.item(0, "url_npm") @@ -126,7 +126,7 @@ def dataset( name: DatasetName | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | Literal["latest"] | None = None, + tag: VersionTag | None = None, **kwds: Any, ) -> IntoDataFrameT: """ @@ -315,13 +315,11 @@ def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw. 
def validate_constraints( name: DatasetName | LiteralString, suffix: Extension | None, - tag: VersionTag | Literal["latest"] | None, + tag: VersionTag | None, /, ) -> Metadata: constraints: Metadata = {} - if tag == "latest": - raise NotImplementedError(tag) - elif tag is not None: + if tag is not None: constraints["tag"] = tag if name.endswith((".csv", ".json", ".tsv", ".arrow")): fp = Path(name) From 88968c8bf188f5c6817fac2edf3c0b8a44602ec3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 11:56:01 +0000 Subject: [PATCH 061/201] feat: Rename `_datasets_dir`, make configurable, add docs Still on the fence about `Loader.cache_dir` vs `Loader.cache` --- altair/datasets/__init__.py | 31 +++++++++++++++++++++++++++++++ altair/datasets/_readers.py | 10 +++++----- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index c2ccee2fe..c89163a48 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -8,11 +8,13 @@ if TYPE_CHECKING: import sys + from pathlib import Path from typing import Any, Literal import pandas as pd import polars as pl import pyarrow as pa + from _typeshed import StrPath if sys.version_info >= (3, 11): from typing import LiteralString @@ -99,6 +101,35 @@ def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: obj._reader = get_backend(backend) return obj + @property + def cache_dir(self) -> Path | None: + """ + Returns path to datasets cache. + + By default, this can be configured using the environment variable: + + "ALTAIR_DATASETS_DIR" + + You *may* also set this directly, but the value will **not** persist between sessions: + + from pathlib import Path + + from altair.datasets import Loader + + data = Loader.with_backend("polars") + data.cache_dir = Path.home() / ".altair_cache" + + data.cache_dir.relative_to(Path.home()).as_posix() + '.altair_cache' + """ + return self._reader._cache + + @cache_dir.setter + def cache_dir(self, source: StrPath, /) -> None: + import os + + os.environ[self._reader._ENV_VAR] = str(source) + def __getattr__(name): if name == "data": diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index b344bd67a..673e2e6d1 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -85,7 +85,7 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" @property - def _datasets_dir(self) -> Path | None: # type: ignore[return] + def _cache(self) -> Path | None: # type: ignore[return] """ Returns path to datasets cache, if possible. 
@@ -94,9 +94,9 @@ def _datasets_dir(self) -> Path | None: # type: ignore[return] Reader._ENV_VAR """ if _dir := os.environ.get(self._ENV_VAR): - datasets_dir = Path(_dir) - datasets_dir.mkdir(exist_ok=True) - return datasets_dir + cache_dir = Path(_dir) + cache_dir.mkdir(exist_ok=True) + return cache_dir def reader_from(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: suffix = validate_suffix(source, is_ext_supported) @@ -145,7 +145,7 @@ def dataset( url = result["url_npm"] fn = self.reader_from(url) - if cache := self._datasets_dir: + if cache := self._cache: fp = cache / (result["sha"] + result["suffix"]) if fp.exists(): return fn(fp, **kwds) From b98730887d0392ac0a2fbb5d226f5013862201c3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 12:13:25 +0000 Subject: [PATCH 062/201] docs: Adds examples to `Loader.with_backend` --- altair/datasets/__init__.py | 49 ++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index c89163a48..4bcf768b6 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -27,6 +27,13 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): + """ + Load examples **remotely** from `vega-datasets`_, with *optional* caching. + + .. _vega-datasets: + https://github.com/vega/vega-datasets + """ + _reader: _Reader[IntoDataFrameT, IntoFrameT] def url( @@ -74,7 +81,7 @@ def with_backend( @classmethod def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: """ - Initialize a new loader, using the specified backend. + Initialize a new loader, with the specified backend. Parameters ---------- @@ -96,6 +103,46 @@ def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: https://pandas.pydata.org/docs/reference/io.html .. 
_JSON format not supported: https://arrow.apache.org/docs/python/json.html#reading-json-files + + Examples + -------- + Using ``polars``: + + from altair.datasets import Loader + + data = Loader.with_backend("polars") + cars = data("cars") + + type(cars) + polars.dataframe.frame.DataFrame + + Using ``pandas``: + + data = Loader.with_backend("pandas") + cars = data("cars") + + type(cars) + pandas.core.frame.DataFrame + + Using ``pandas``, backed by ``pyarrow`` dtypes: + + data = Loader.with_backend("pandas[pyarrow]") + cars = data("cars", tag="v1.29.0") + + type(cars) + pandas.core.frame.DataFrame + + cars.dtypes + Name string[pyarrow] + Miles_per_Gallon double[pyarrow] + Cylinders int64[pyarrow] + Displacement double[pyarrow] + Horsepower int64[pyarrow] + Weight_in_lbs int64[pyarrow] + Acceleration double[pyarrow] + Year string[pyarrow] + Origin string[pyarrow] + dtype: object """ obj = Loader.__new__(Loader) obj._reader = get_backend(backend) From 4a2a2e068f85d118244ceda09350cf3690781227 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 14:59:40 +0000 Subject: [PATCH 063/201] refactor: Clean up requirements -> imports --- altair/datasets/_readers.py | 100 ++++++++++++++++++++++++++---------- 1 file changed, 72 insertions(+), 28 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 673e2e6d1..78ee784a6 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -63,9 +63,14 @@ _ExtensionScan: TypeAlias = Literal[".parquet"] _T = TypeVar("_T") - _Backend: TypeAlias = Literal[ - "polars", "pandas", "pandas[pyarrow]", "polars[pyarrow]", "pyarrow" - ] + + _Polars: TypeAlias = Literal["polars"] + _Pandas: TypeAlias = Literal["pandas"] + _PyArrow: TypeAlias = Literal["pyarrow"] + _ConcreteT = TypeVar("_ConcreteT", _Polars, _Pandas, _PyArrow) + _PolarsAny: TypeAlias = Literal[_Polars, "polars[pyarrow]"] + _PandasAny: TypeAlias = Literal[_Pandas, "pandas[pyarrow]"] + _Backend: TypeAlias = Literal[_PolarsAny, _PandasAny, _PyArrow] __all__ = ["get_backend"] @@ -80,6 +85,7 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): _read_fn: dict[Extension, Callable[..., IntoDataFrameT]] _scan_fn: dict[_ExtensionScan, Callable[..., IntoFrameT]] + _name: LiteralString _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" @@ -193,11 +199,16 @@ def _import(self, name: str, /) -> Any: msg = f"{type(self).__name__!r} requires missing dependency {name!r}." raise ModuleNotFoundError(msg, name=name) - def __init__(self, *specs: str) -> None: ... + def __repr__(self) -> str: + return f"Reader[{self._name}]" + + def __init__(self, name: LiteralString, /) -> None: ... 
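Before the concrete readers that follow, a short sketch (not part of the patch) of the suffix-to-reader dispatch they populate: each backend fills ``_read_fn`` with plain read functions, and ``.tsv`` is simply ``read_csv`` with a pre-bound separator via ``functools.partial``.

from functools import partial
from pathlib import Path

import pandas as pd

# Illustrative mapping only; the real ones live on the ``_Reader`` subclasses.
read_fn = {
    ".csv": pd.read_csv,
    ".tsv": partial(pd.read_csv, sep="\t"),
    ".json": pd.read_json,
}

def read(url: str) -> pd.DataFrame:
    # Dispatch on the file extension, mirroring ``_Reader.read_fn``.
    return read_fn[Path(url).suffix](url)

# read("https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json")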
class _PandasPyArrowReader(_Reader["pd.DataFrame", "pd.DataFrame"]): - def __init__(self, _pd: str, _pa: str, /) -> None: + def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: + _pd, _pa = _requirements(name) + self._name = name if not TYPE_CHECKING: pd = self._import(_pd) pa = self._import(_pa) # noqa: F841 @@ -219,9 +230,10 @@ def __init__(self, _pd: str, _pa: str, /) -> None: class _PandasReader(_Reader["pd.DataFrame", "pd.DataFrame"]): - def __init__(self, _pd: str, /) -> None: + def __init__(self, name: _Pandas, /) -> None: + self._name = _requirements(name) if not TYPE_CHECKING: - pd = self._import(_pd) + pd = self._import(self._name) self._read_fn = { ".csv": pd.read_csv, ".json": pd.read_json, @@ -232,9 +244,10 @@ def __init__(self, _pd: str, /) -> None: class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - def __init__(self, _pl: str, /) -> None: + def __init__(self, name: _Polars, /) -> None: + self._name = _requirements(name) if not TYPE_CHECKING: - pl = self._import(_pl) + pl = self._import(self._name) self._read_fn = { ".csv": pl.read_csv, ".json": pl.read_json, @@ -245,7 +258,9 @@ def __init__(self, _pl: str, /) -> None: class _PolarsPyArrowReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - def __init__(self, _pl: str, _pa: str, /) -> None: + def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: + _pl, _pa = _requirements(name) + self._name = name if not TYPE_CHECKING: pl = self._import(_pl) pa = self._import(_pa) # noqa: F841 @@ -275,13 +290,14 @@ class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): https://arrow.apache.org/docs/python/json.html#reading-json-files """ - def __init__(self, _pa: str, /) -> None: + def __init__(self, name: _PyArrow, /) -> None: + self._name = _requirements(name) if not TYPE_CHECKING: - pa = self._import(_pa) # noqa: F841 - pa_csv = self._import(f"{_pa}.csv") - pa_feather = self._import(f"{_pa}.feather") - pa_json = self._import(f"{_pa}.json") - pa_parquet = self._import(f"{_pa}.parquet") + pa = self._import(self._name) # noqa: F841 + pa_csv = self._import(f"{self._name}.csv") + pa_feather = self._import(f"{self._name}.feather") + pa_json = self._import(f"{self._name}.json") + pa_parquet = self._import(f"{self._name}.parquet") pa_read_csv = pa_csv.read_csv pa_read_feather = pa_feather.read_table @@ -353,34 +369,62 @@ def is_ext_supported(suffix: Any) -> TypeIs[Extension]: @overload -def get_backend( - backend: Literal["polars", "polars[pyarrow]"], / -) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... +def get_backend(backend: _PolarsAny, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... @overload -def get_backend( - backend: Literal["pandas", "pandas[pyarrow]"], / -) -> _Reader[pd.DataFrame, pd.DataFrame]: ... +def get_backend(backend: _PandasAny, /) -> _Reader[pd.DataFrame, pd.DataFrame]: ... @overload -def get_backend(backend: Literal["pyarrow"], /) -> _Reader[pa.Table, pa.Table]: ... +def get_backend(backend: _PyArrow, /) -> _Reader[pa.Table, pa.Table]: ... 
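The readers above defer their requirement parsing to ``_requirements(name)``, defined later in this diff, which leans on ``packaging.requirements.Requirement`` to split a spec like ``"pandas[pyarrow]"`` into a base package plus an extra. A quick standalone illustration:

from packaging.requirements import Requirement

req = Requirement("pandas[pyarrow]")
print(req.name)    # pandas
print(req.extras)  # {'pyarrow'}

# A bare backend name has no extras to resolve.
print(Requirement("polars").extras)  # set()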
def get_backend(backend: _Backend, /) -> _Reader[Any, Any]: if backend == "polars": - return _PolarsReader("polars") + return _PolarsReader(backend) elif backend == "polars[pyarrow]": - return _PolarsPyArrowReader("polars", "pyarrow") + return _PolarsPyArrowReader(backend) elif backend == "pandas[pyarrow]": - return _PandasPyArrowReader("pandas", "pyarrow") + return _PandasPyArrowReader(backend) elif backend == "pandas": - return _PandasReader("pandas") + return _PandasReader(backend) elif backend == "pyarrow": - return _PyArrowReader("pyarrow") + return _PyArrowReader(backend) elif backend in {"ibis", "cudf", "dask", "modin"}: msg = "Supported by ``narwhals``, not investigated yet" raise NotImplementedError(msg) else: raise TypeError(backend) + + +@overload +def _requirements(s: _ConcreteT, /) -> _ConcreteT: ... + + +@overload +def _requirements(s: Literal["pandas[pyarrow]"], /) -> tuple[_Pandas, _PyArrow]: ... + + +@overload +def _requirements(s: Literal["polars[pyarrow]"], /) -> tuple[_Polars, _PyArrow]: ... + + +def _requirements(s: _Backend, /): + concrete: set[Literal[_Polars, _Pandas, _PyArrow]] = {"polars", "pandas", "pyarrow"} + if s in concrete: + return s + else: + from packaging.requirements import Requirement + + req = Requirement(s) + supports_extras: set[Literal[_Polars, _Pandas]] = {"polars", "pandas"} + if req.name in supports_extras: + name = req.name + if (extras := req.extras) and extras == {"pyarrow"}: + extra = "pyarrow" + return name, extra + else: + raise NotImplementedError(s) + else: + raise NotImplementedError(s) From e6dd27e6fb680b965e7d698a636d47a389c3e7df Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 15:03:17 +0000 Subject: [PATCH 064/201] docs: Add basic example to `Loader` class Also incorporates changes from previous commit into `__repr__` 4a2a2e068f85d118244ceda09350cf3690781227 --- altair/datasets/__init__.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 4bcf768b6..6d7a922d3 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -30,12 +30,20 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): """ Load examples **remotely** from `vega-datasets`_, with *optional* caching. + A new ``Loader`` must be initialized by specifying a backend: + + from altair.datasets import Loader + + data = Loader.with_backend("polars") + Loader[polars] + .. 
_vega-datasets: https://github.com/vega/vega-datasets """ _reader: _Reader[IntoDataFrameT, IntoFrameT] + # TODO: docs (parameters, examples) def url( self, name: DatasetName | LiteralString, @@ -46,6 +54,7 @@ def url( """Return the address of a remote dataset.""" return self._reader.url(name, suffix, tag=tag) + # TODO: docs (parameters, examples) def __call__( self, name: DatasetName | LiteralString, @@ -58,7 +67,7 @@ def __call__( return self._reader.dataset(name, suffix, tag=tag, **kwds) def __repr__(self) -> str: - return f"{type(self).__name__}[{type(self._reader).__name__}]" + return f"{type(self).__name__}[{self._reader._name}]" @overload @classmethod From 2a7bc4f5bbcfea11e416453fa00abbee11ad8c5b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 15:50:51 +0000 Subject: [PATCH 065/201] refactor: Reorder `alt.datasets` module --- altair/datasets/__init__.py | 52 ++++++++++----------- altair/datasets/_readers.py | 92 ++++++++++++++++++------------------- 2 files changed, 72 insertions(+), 72 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 6d7a922d3..260258882 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -43,32 +43,6 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): _reader: _Reader[IntoDataFrameT, IntoFrameT] - # TODO: docs (parameters, examples) - def url( - self, - name: DatasetName | LiteralString, - suffix: Extension | None = None, - /, - tag: VersionTag | None = None, - ) -> str: - """Return the address of a remote dataset.""" - return self._reader.url(name, suffix, tag=tag) - - # TODO: docs (parameters, examples) - def __call__( - self, - name: DatasetName | LiteralString, - suffix: Extension | None = None, - /, - tag: VersionTag | None = None, - **kwds: Any, - ) -> IntoDataFrameT: - """Get a remote dataset and load as tabular data.""" - return self._reader.dataset(name, suffix, tag=tag, **kwds) - - def __repr__(self) -> str: - return f"{type(self).__name__}[{self._reader._name}]" - @overload @classmethod def with_backend( @@ -157,6 +131,29 @@ def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: obj._reader = get_backend(backend) return obj + # TODO: docs (parameters, examples) + def __call__( + self, + name: DatasetName | LiteralString, + suffix: Extension | None = None, + /, + tag: VersionTag | None = None, + **kwds: Any, + ) -> IntoDataFrameT: + """Get a remote dataset and load as tabular data.""" + return self._reader.dataset(name, suffix, tag=tag, **kwds) + + # TODO: docs (parameters, examples) + def url( + self, + name: DatasetName | LiteralString, + suffix: Extension | None = None, + /, + tag: VersionTag | None = None, + ) -> str: + """Return the address of a remote dataset.""" + return self._reader.url(name, suffix, tag=tag) + @property def cache_dir(self) -> Path | None: """ @@ -186,6 +183,9 @@ def cache_dir(self, source: StrPath, /) -> None: os.environ[self._reader._ENV_VAR] = str(source) + def __repr__(self) -> str: + return f"{type(self).__name__}[{self._reader._name}]" + def __getattr__(name): if name == "data": diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 78ee784a6..53a18b2d6 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -86,24 +86,10 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): _read_fn: dict[Extension, Callable[..., IntoDataFrameT]] _scan_fn: dict[_ExtensionScan, Callable[..., IntoFrameT]] _name: LiteralString - _opener: 
ClassVar[OpenerDirector] = urllib.request.build_opener() _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" + _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" - @property - def _cache(self) -> Path | None: # type: ignore[return] - """ - Returns path to datasets cache, if possible. - - Requires opt-in via environment variable:: - - Reader._ENV_VAR - """ - if _dir := os.environ.get(self._ENV_VAR): - cache_dir = Path(_dir) - cache_dir.mkdir(exist_ok=True) - return cache_dir - def reader_from(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: suffix = validate_suffix(source, is_ext_supported) return self._read_fn[suffix] @@ -112,21 +98,6 @@ def scanner_from(self, source: StrPath, /) -> Callable[..., IntoFrameT]: suffix = validate_suffix(source, is_ext_scan) return self._scan_fn[suffix] - def url( - self, - name: DatasetName | LiteralString, - suffix: Extension | None = None, - /, - tag: VersionTag | None = None, - ) -> str: - df = self._query(**validate_constraints(name, suffix, tag)) - url = df.item(0, "url_npm") - if isinstance(url, str): - return url - else: - msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." - raise TypeError(msg) - def dataset( self, name: DatasetName | LiteralString, @@ -145,7 +116,7 @@ def dataset( **kwds Arguments passed to the underlying read function. """ - df = self._query(**validate_constraints(name, suffix, tag)) + df = self.query(**validate_constraints(name, suffix, tag)) it = islice(df.iter_rows(named=True), 1) result = cast("Metadata", next(it)) url = result["url_npm"] @@ -164,7 +135,22 @@ def dataset( with self._opener.open(url) as f: return fn(f.read(), **kwds) - def _query( + def url( + self, + name: DatasetName | LiteralString, + suffix: Extension | None = None, + /, + tag: VersionTag | None = None, + ) -> str: + df = self.query(**validate_constraints(name, suffix, tag)) + url = df.item(0, "url_npm") + if isinstance(url, str): + return url + else: + msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." + raise TypeError(msg) + + def query( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] ) -> nw.DataFrame[IntoDataFrameT]: r""" @@ -192,6 +178,20 @@ def _query( msg = f"Found no results for:\n{terms}" raise NotImplementedError(msg) + @property + def _cache(self) -> Path | None: # type: ignore[return] + """ + Returns path to datasets cache, if possible. + + Requires opt-in via environment variable:: + + Reader._ENV_VAR + """ + if _dir := os.environ.get(self._ENV_VAR): + cache_dir = Path(_dir) + cache_dir.mkdir(exist_ok=True) + return cache_dir + def _import(self, name: str, /) -> Any: if spec := find_spec(name): return import_module(spec.name) @@ -205,6 +205,20 @@ def __repr__(self) -> str: def __init__(self, name: LiteralString, /) -> None: ... 
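A sketch of the ``query`` path above, outside the patch: scan the metadata file, wrap it with ``narwhals``, filter on the constraint columns, then collect. The local file path here is an assumption; the column names follow ``metadata-schema.json``.

import polars as pl
from narwhals.stable import v1 as nw

# Assumes a local copy of ``metadata.parquet`` with ``dataset_name``/``suffix``/``url_npm`` columns.
frame = nw.from_native(pl.scan_parquet("metadata.parquet"))
result = frame.filter(
    (nw.col("dataset_name") == "cars") & (nw.col("suffix") == ".json")
)
df = result.collect()  # ``query`` collects only when the backend scan is lazy
print(df.get_column("url_npm").to_list())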
+class _PandasReader(_Reader["pd.DataFrame", "pd.DataFrame"]): + def __init__(self, name: _Pandas, /) -> None: + self._name = _requirements(name) + if not TYPE_CHECKING: + pd = self._import(self._name) + self._read_fn = { + ".csv": pd.read_csv, + ".json": pd.read_json, + ".tsv": cast(partial["pd.DataFrame"], partial(pd.read_csv, sep="\t")), + ".arrow": pd.read_feather, + } + self._scan_fn = {".parquet": pd.read_parquet} + + class _PandasPyArrowReader(_Reader["pd.DataFrame", "pd.DataFrame"]): def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: _pd, _pa = _requirements(name) @@ -229,20 +243,6 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend="pyarrow")} -class _PandasReader(_Reader["pd.DataFrame", "pd.DataFrame"]): - def __init__(self, name: _Pandas, /) -> None: - self._name = _requirements(name) - if not TYPE_CHECKING: - pd = self._import(self._name) - self._read_fn = { - ".csv": pd.read_csv, - ".json": pd.read_json, - ".tsv": cast(partial["pd.DataFrame"], partial(pd.read_csv, sep="\t")), - ".arrow": pd.read_feather, - } - self._scan_fn = {".parquet": pd.read_parquet} - - class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): def __init__(self, name: _Polars, /) -> None: self._name = _requirements(name) From c572180ebc7d876714a38688c53f7e4af87abd93 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 16:59:10 +0000 Subject: [PATCH 066/201] docs: Fill out `Loader.url` --- altair/datasets/__init__.py | 40 +++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 260258882..b7f87bdaa 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -143,7 +143,6 @@ def __call__( """Get a remote dataset and load as tabular data.""" return self._reader.dataset(name, suffix, tag=tag, **kwds) - # TODO: docs (parameters, examples) def url( self, name: DatasetName | LiteralString, @@ -151,7 +150,44 @@ def url( /, tag: VersionTag | None = None, ) -> str: - """Return the address of a remote dataset.""" + """ + Return the address of a remote dataset. + + Parameters + ---------- + name + Name of the dataset/`stem`_ of filename. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + tag + `vega-datasets release`_ version. + + .. _stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. 
_vega-datasets release: + https://github.com/vega/vega-datasets/releases + + Examples + -------- + The returned url will always point to an accessible dataset: + + import altair as alt + from altair.datasets import Loader + + data = Loader.with_backend("polars") + data.url("cars", tag="v2.9.0") + 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json' + + We can pass the result directly to a chart: + + url = data.url("cars", tag="v2.9.0") + alt.Chart(url).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q") + """ return self._reader.url(name, suffix, tag=tag) @property From 9ab9463007a8509c25cc69665ba995f42e84792d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 18:06:03 +0000 Subject: [PATCH 067/201] feat: Adds `_Reader._read_metadata` --- altair/datasets/_readers.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 53a18b2d6..ea8d7088c 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -166,7 +166,7 @@ def query( """ source = self._metadata fn = self.scanner_from(source) - frame = nw.from_native(fn(source), pass_through=False) + frame = nw.from_native(fn(source)) result = frame.filter(_filter_reduce(predicates, constraints)) df: nw.DataFrame[Any] = ( result.collect() if isinstance(result, nw.LazyFrame) else result @@ -178,6 +178,19 @@ def query( msg = f"Found no results for:\n{terms}" raise NotImplementedError(msg) + def _read_metadata(self) -> IntoDataFrameT: + """ + Return the full contents of ``metadata.parquet``. + + Effectively an eager read, no filters. + """ + fn = self.scanner_from(self._metadata) + frame = nw.from_native(fn(self._metadata)) + df: nw.DataFrame[Any] = ( + frame.collect() if isinstance(frame, nw.LazyFrame) else frame + ) + return df.to_native() + @property def _cache(self) -> Path | None: # type: ignore[return] """ From dd3edd66e2eb38be3c73f0ad0411e738f2f81495 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 20:43:53 +0000 Subject: [PATCH 068/201] refactor: Rename `(reader|scanner_from()` -> `(read|scan)_fn()` --- altair/datasets/_readers.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index ea8d7088c..afa1d2f54 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -90,11 +90,11 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" - def reader_from(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: - suffix = validate_suffix(source, is_ext_supported) + def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: + suffix = validate_suffix(source, is_ext_read) return self._read_fn[suffix] - def scanner_from(self, source: StrPath, /) -> Callable[..., IntoFrameT]: + def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: suffix = validate_suffix(source, is_ext_scan) return self._scan_fn[suffix] @@ -120,7 +120,7 @@ def dataset( it = islice(df.iter_rows(named=True), 1) result = cast("Metadata", next(it)) url = result["url_npm"] - fn = self.reader_from(url) + fn = self.read_fn(url) if cache := self._cache: fp = cache / (result["sha"] + result["suffix"]) @@ -165,7 +165,7 @@ def query( 
https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html """ source = self._metadata - fn = self.scanner_from(source) + fn = self.scan_fn(source) frame = nw.from_native(fn(source)) result = frame.filter(_filter_reduce(predicates, constraints)) df: nw.DataFrame[Any] = ( @@ -184,7 +184,7 @@ def _read_metadata(self) -> IntoDataFrameT: Effectively an eager read, no filters. """ - fn = self.scanner_from(self._metadata) + fn = self.scan_fn(self._metadata) frame = nw.from_native(fn(self._metadata)) df: nw.DataFrame[Any] = ( frame.collect() if isinstance(frame, nw.LazyFrame) else frame @@ -356,7 +356,7 @@ def validate_constraints( constraints["suffix"] = fp.suffix return constraints elif suffix is not None: - if not is_ext_supported(suffix): + if not is_ext_read(suffix): raise TypeError(suffix) else: constraints["suffix"] = suffix @@ -377,7 +377,7 @@ def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]: return suffix == ".parquet" -def is_ext_supported(suffix: Any) -> TypeIs[Extension]: +def is_ext_read(suffix: Any) -> TypeIs[Extension]: return suffix in {".csv", ".json", ".tsv", ".arrow"} From 146cb50c60d0839cf56552b00472f768ec58001c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 21:29:44 +0000 Subject: [PATCH 069/201] refactor(typing): Replace some explicit casts --- altair/datasets/_readers.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index afa1d2f54..78e330047 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -226,7 +226,7 @@ def __init__(self, name: _Pandas, /) -> None: self._read_fn = { ".csv": pd.read_csv, ".json": pd.read_json, - ".tsv": cast(partial["pd.DataFrame"], partial(pd.read_csv, sep="\t")), + ".tsv": partial["pd.DataFrame"](pd.read_csv, sep="\t"), ".arrow": pd.read_feather, } self._scan_fn = {".parquet": pd.read_parquet} @@ -241,19 +241,12 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: pa = self._import(_pa) # noqa: F841 self._read_fn = { - ".csv": cast( - partial["pd.DataFrame"], partial(pd.read_csv, dtype_backend="pyarrow") - ), - ".json": cast( - partial["pd.DataFrame"], partial(pd.read_json, dtype_backend="pyarrow") - ), - ".tsv": cast( - partial["pd.DataFrame"], - partial(pd.read_csv, sep="\t", dtype_backend="pyarrow"), - ), - ".arrow": partial(pd.read_feather, dtype_backend="pyarrow"), + ".csv": partial["pd.DataFrame"](pd.read_csv, dtype_backend=_pa), + ".json": partial["pd.DataFrame"](pd.read_json, dtype_backend=_pa), + ".tsv": partial["pd.DataFrame"](pd.read_csv, sep="\t", dtype_backend=_pa), + ".arrow": partial(pd.read_feather, dtype_backend=_pa), } - self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend="pyarrow")} + self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend=_pa)} class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): From 94ad0d1b879f43359dbead2b796db540531a2504 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 12:51:27 +0000 Subject: [PATCH 070/201] refactor: Shorten and document request delays --- tools/datasets/github.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index c2d7141aa..2d0d16fca 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -106,8 +106,10 @@ class 
_GitHubRequestNamespace: _UNAUTH_RATE_LIMIT: Literal[60] = 60 _TAGS_COST: Literal[1] = 1 _TREES_COST: Literal[2] = 2 - _UNAUTH_DELAY: Literal[5] = 5 - _AUTH_DELAY: Literal[1] = 1 + _UNAUTH_DELAY: Literal[5_000] = 5_000 + """**ms** delay added between **unauthenticated** ``trees`` requests.""" + _AUTH_DELAY: Literal[500] = 500 + """**ms** delay added between **authenticated** ``trees`` requests.""" _UNAUTH_TREES_LIMIT: Literal[10] = 10 def __init__(self, gh: GitHub, /) -> None: @@ -123,6 +125,10 @@ def rate_limit(self) -> GitHubRateLimitResources: content: GitHubRateLimitResources = json.load(response)["resources"] return content + def delay(self, *, is_auth: bool) -> float: + ms = self._AUTH_DELAY if is_auth else self._UNAUTH_DELAY + return (ms + random.triangular()) / 1_000 + def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" if n < 1 or n > self._TAGS_MAX_PAGE: @@ -314,6 +320,11 @@ def rate_limit(self, *, strict: bool = False) -> ParsedRateLimit: raise NotImplementedError(limit) return limit + def delay(self, rate_limit: ParsedRateLimit | None = None, /) -> float: + """Return a delay time in seconds, corresponding with authentication status.""" + limit = rate_limit or self.rate_limit(strict=True) + return self.req.delay(is_auth=limit["is_auth"]) + def tags( self, n_head: int | None = None, *, warn_lower: bool = False ) -> pl.DataFrame: @@ -412,14 +423,13 @@ def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: cost = req._TREES_COST * n if rate_limit["remaining"] < cost: raise NotImplementedError(rate_limit, cost) - delay_secs = req._AUTH_DELAY if rate_limit["is_auth"] else req._UNAUTH_DELAY print( f"Collecting metadata for {n} missing releases.\n" - f"Using {delay_secs=} between requests ..." + f"Using {self.delay(rate_limit)}[ms] between requests ..." 
) dfs: list[pl.DataFrame] = [] for tag in tags: - time.sleep(delay_secs + random.triangular()) + time.sleep(self.delay(rate_limit)) dfs.append(self.trees(tag)) df = pl.concat(dfs) print(f"Finished collection.\n" f"Found {df.height} new rows") From 409338397ebb9ff2ec7abb146394f17702762b08 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 17:26:01 +0000 Subject: [PATCH 071/201] feat(DRAFT): Make `[tag]` a `pl.Enum` --- altair/datasets/_metadata/metadata.parquet | Bin 18495 -> 18641 bytes tools/datasets/__init__.py | 6 +++++ tools/datasets/github.py | 17 ++++++++++--- tools/datasets/semver.py | 28 ++++++++++++++++++--- 4 files changed, 44 insertions(+), 7 deletions(-) diff --git a/altair/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet index 8bf0e17e3673d2b7cfbbe1ddba345f492d12e674..5e7b3bd06439ace1d5eb8387efecde322cca91d7 100644 GIT binary patch delta 725 zcmdl#f$`!*#trL~>c24hh=zzVFfeFr)&HAtm7$fH;gbuWz=qc)p%RZu@A{rHk$(Ll zgLz$*ldAAX^S8QgF~)a3?RlNFaBlbpHYyy|qy~%HsrG>;kfK|T&s{mGC;j*9AbCgGm3!xAi?7p6yzUnRh*oWnwu!m!l4fa*^?K$I4e1O zI;J~120D5=ItG+GIXXHU06C7Sj;=r^i0@K7nZq@{J{P1V*AWOj%YlSbK8P6!BAkHQ zfux(0r=xp1P#ci%4mPeFWJahXh=dZB0YF14K>8hhL4-4y4g}F)MM0saVIe@H9MTNQo%?5#hETE|tFav@90J_G}#R%kJ aFbBj>2l+2>^K92SOdRGM3=9E|L52YRdh{Ux delta 516 zcmcaOk#YY7#trL~>OV00h=zzVFfeFr)&HAtm7$cGLCBR)V8iQ@P>DyScYRNpNWcD& z!Mx5pQZquZmnD1Qmf`LI%bA`Z;$0qC> zM3)%~2rwER%bIGq;Y$4XYcF)4v#r)kF*bBuDDvxW#3xy~O_vtW-%`q?!aCVdIm{zH zYgww-s>fRE6uGy}f4y_=>7TnT*I3T6?5V8Tne#LJ-<1DP^nbeloACef=Eud4&;Qu- zL;GK+{iFIH(*IiRAKL%m{?};#V2kzM$>){j7;kNUr_3nD*thwEZ8sC&y&gu_FmcNXmMht4J`Wa<^{|8T3~3yAOQ2n5cKlet{uCri3IY_@cpz{Ihfoq-|1F~|@AOW4Er diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 6319bd65e..f318f292e 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -88,6 +88,7 @@ def npm(self) -> Npm: return self._npm def refresh(self) -> pl.DataFrame: + """Update and sync all metadata files.""" npm_tags = self.npm.tags() self.write_parquet(npm_tags, self._paths["npm_tags"]) @@ -98,6 +99,11 @@ def refresh(self) -> pl.DataFrame: self.write_parquet(gh_trees, self._paths["gh_trees"]) return gh_trees + def reset(self) -> None: + """Remove all metadata files.""" + for fp in self._paths.values(): + fp.unlink(missing_ok=True) + def read(self, name: _PathAlias, /) -> pl.DataFrame: """Read existing metadata from file.""" return pl.read_parquet(self._from_alias(name)) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 2d0d16fca..6bde876ae 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -357,6 +357,10 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: Use known tags to discover and update missing trees metadata. Aims to stay well-within API rate limits, both for authenticated and unauthenticated users. + + Notes + ----- + Internally handles regenerating the ``tag`` enum. 
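A compact illustration (not part of the patch) of why ``tag`` becomes a ``pl.Enum``: the categories are the known tags in ascending release order, so sorts and comparisons follow release order rather than plain string order. The tag values here are hardcoded stand-ins for the ``semver``-sorted ``gh_tags`` column.

import polars as pl

releases = ["v1.5.0", "v1.29.0", "v2.9.0"]  # assumed ascending release order
tag_enum = pl.Enum(releases)

df = pl.DataFrame({"tag": ["v2.9.0", "v1.5.0", "v1.29.0"]}).with_columns(
    pl.col("tag").cast(tag_enum)
)
# Most recent release first; a plain string sort would rank "v1.5.0" above "v1.29.0".
print(df.sort("tag", descending=True))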
""" if gh_tags.is_empty(): msg = f"Expected rows present in `gh_tags`, but got:\n{gh_tags!r}" @@ -367,18 +371,23 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: TP = ReParsedTag if not fp.exists(): print(f"Initializing {fp!s}") - return self._trees_batched(_iter_rows(gh_tags, stop, TP)) + result = self._trees_batched(_iter_rows(gh_tags, stop, TP)) else: - trees = pl.read_parquet(fp) + trees = ( + pl.scan_parquet(fp) + .with_columns(pl.col("tag").cast(pl.String)) + .collect() + ) missing_trees = gh_tags.join( trees.select(pl.col("tag").unique()), on="tag", how="anti" ) if missing_trees.is_empty(): print(f"Already up-to-date {fp!s}") - return trees + result = trees else: fresh = self._trees_batched(_iter_rows(missing_trees, stop, TP)) - return pl.concat((trees, fresh)) + result = pl.concat((trees, fresh)) + return result.with_columns(pl.col("tag").cast(semver.tag_enum(gh_tags))) def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: limit = self.rate_limit(strict=True) diff --git a/tools/datasets/semver.py b/tools/datasets/semver.py index cb4c6c799..57f6d509f 100644 --- a/tools/datasets/semver.py +++ b/tools/datasets/semver.py @@ -52,6 +52,28 @@ def with_columns(frame: _Frame, /, *, col_tag: str = "tag") -> _Frame: return ldf -def sort(frame: _Frame, /) -> _Frame: - """Sort ``frame``, displaying in descending release order.""" - return frame.sort(_SEM_VER_FIELDS, descending=True) +def tag_enum(frame: _Frame, /, *, col_tag: str = "tag") -> pl.Enum: + """Extract an **ascending** order ``pl.Enum`` from ``col_tag``.""" + return pl.Enum( + frame.lazy() + .pipe(sort, descending=False) + .select(col_tag) + .collect() + .get_column(col_tag) + ) + + +def sort(frame: _Frame, /, descending: bool = True) -> _Frame: + """ + Sort ``frame``, displaying in release order. + + Parameters + ---------- + descending + By default, **most recent** is first. + + Notes + ----- + Ensures pre release versions maintain order, always appearing before actual releases. + """ + return frame.sort(_SEM_VER_FIELDS, descending=descending, nulls_last=not descending) From 76cdd45af0e1dc7ac632899b3618c199be5291ee Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 19:12:55 +0000 Subject: [PATCH 072/201] fix: Handle `pyarrow` scalars conversion --- altair/datasets/_readers.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 78e330047..3b122df10 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -147,8 +147,15 @@ def url( if isinstance(url, str): return url else: - msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." - raise TypeError(msg) + converted = nw.to_py_scalar(url) + if isinstance(converted, str): + return converted + else: + msg = ( + f"Expected 'str' but got {type(converted).__name__!r}\n" + f"from {converted!r}." 
+ ) + raise TypeError(msg) def query( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] From bb7bc171a7005fd63f39b3d949902f4d553801f0 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 19:15:52 +0000 Subject: [PATCH 073/201] test: Adds `test_datasets` Initially quite basic, need to add more parameterize and test caching --- tests/test_datasets.py | 45 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 tests/test_datasets.py diff --git a/tests/test_datasets.py b/tests/test_datasets.py new file mode 100644 index 000000000..a15fb9411 --- /dev/null +++ b/tests/test_datasets.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +import re +from typing import TYPE_CHECKING + +import pytest +from narwhals.dependencies import is_into_dataframe +from narwhals.stable import v1 as nw + +import altair as alt # noqa: F401 +from altair.datasets import Loader + +if TYPE_CHECKING: + from altair.datasets._readers import _Backend + +backends = pytest.mark.parametrize( + "backend", ["polars", "polars[pyarrow]", "pandas", "pandas[pyarrow]", "pyarrow"] +) + + +@backends +def test_loader_with_backend(backend: _Backend) -> None: + data = Loader.with_backend(backend) + assert data._reader._name == backend + + +@backends +def test_loader_url(backend: _Backend) -> None: + data = Loader.with_backend(backend) + dataset_name = "volcano" + pattern = re.compile( + rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{dataset_name}\..+" + ) + url = data.url(dataset_name) + assert isinstance(url, str) + assert pattern.match(url) is not None + + +@backends +def test_loader_call(backend: _Backend) -> None: + data = Loader.with_backend(backend) + frame = data("stocks", ".csv") + assert is_into_dataframe(frame) + nw_frame = nw.from_native(frame) + assert set(nw_frame.columns) == {"symbol", "date", "price"} From ebc1bfaa0b35e554da15bab7dd7d7e2a95f17e63 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 19:31:53 +0000 Subject: [PATCH 074/201] fix(DRAFT): hotfix `pyarrow` read --- altair/datasets/_readers.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 3b122df10..f58fcd56d 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -98,6 +98,10 @@ def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: suffix = validate_suffix(source, is_ext_scan) return self._scan_fn[suffix] + def _response_hook(self, f): + # HACK: pyarrow wants the file obj + return f.read() + def dataset( self, name: DatasetName | LiteralString, @@ -133,7 +137,7 @@ def dataset( return fn(fp, **kwds) else: with self._opener.open(url) as f: - return fn(f.read(), **kwds) + return fn(self._response_hook(f), **kwds) def url( self, @@ -329,6 +333,9 @@ def __init__(self, name: _PyArrow, /) -> None: } self._scan_fn = {".parquet": pa_read_parquet} + def _response_hook(self, f): + return f + def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw.Expr: """ From fe0ae88201cc699b32ee1e9c07b602d9d7a8d439 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 20:56:22 +0000 Subject: [PATCH 075/201] fix(DRAFT): Treat `polars` as exception, invalidate cache Possibly fix https://github.com/vega/altair/actions/runs/11768349827/job/32778071725?pr=3631 --- altair/datasets/_readers.py 
| 13 ++++++++----- tests/test_datasets.py | 1 + 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index f58fcd56d..eea9f18db 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -99,8 +99,8 @@ def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: return self._scan_fn[suffix] def _response_hook(self, f): - # HACK: pyarrow wants the file obj - return f.read() + # HACK: `pyarrow` + `pandas` wants the file obj + return f def dataset( self, @@ -273,6 +273,9 @@ def __init__(self, name: _Polars, /) -> None: } self._scan_fn = {".parquet": pl.scan_parquet} + def _response_hook(self, f): + return f.read() + class _PolarsPyArrowReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: @@ -289,6 +292,9 @@ def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: } self._scan_fn = {".parquet": pl.scan_parquet} + def _response_hook(self, f): + return f.read() + class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): """ @@ -333,9 +339,6 @@ def __init__(self, name: _PyArrow, /) -> None: } self._scan_fn = {".parquet": pa_read_parquet} - def _response_hook(self, f): - return f - def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw.Expr: """ diff --git a/tests/test_datasets.py b/tests/test_datasets.py index a15fb9411..c37bc0046 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -39,6 +39,7 @@ def test_loader_url(backend: _Backend) -> None: @backends def test_loader_call(backend: _Backend) -> None: data = Loader.with_backend(backend) + data.cache_dir = "" frame = data("stocks", ".csv") assert is_into_dataframe(frame) nw_frame = nw.from_native(frame) From 7089f2af693c6db2025ee265f31ec4ef228dd8c3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 21:11:07 +0000 Subject: [PATCH 076/201] test: Skip `pyarrow` tests on `3.9` Forgot that this gets uninstalled in CI https://github.com/vega/altair/actions/runs/11768424121/job/32778234026?pr=3631 --- tests/test_datasets.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index c37bc0046..ec2f9014f 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -9,12 +9,15 @@ import altair as alt # noqa: F401 from altair.datasets import Loader +from tests import skip_requires_pyarrow if TYPE_CHECKING: from altair.datasets._readers import _Backend -backends = pytest.mark.parametrize( - "backend", ["polars", "polars[pyarrow]", "pandas", "pandas[pyarrow]", "pyarrow"] +backends = skip_requires_pyarrow( + pytest.mark.parametrize( + "backend", ["polars", "polars[pyarrow]", "pandas", "pandas[pyarrow]", "pyarrow"] + ) ) @@ -39,7 +42,7 @@ def test_loader_url(backend: _Backend) -> None: @backends def test_loader_call(backend: _Backend) -> None: data = Loader.with_backend(backend) - data.cache_dir = "" + data.cache_dir = "" # type: ignore[assignment] frame = data("stocks", ".csv") assert is_into_dataframe(frame) nw_frame = nw.from_native(frame) From e1290d4384d4926c24f22a3a23f103e284cfbe1e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 13:50:54 +0000 Subject: [PATCH 077/201] refactor: Tidy up changes from last 4 commits - Rename and properly document "file-like object" handling - Also made a bit clearer what is being called and when - Use a more granular 
approach to skipping in `@backends` - Previously, everything was skipped regardless of whether it required `pyarrow` - Now, `polars`, `pandas` **always** run - with `pandas` expected to fail - I had to clean up `skip_requires_pyarrow` to make it compatible with `pytest.param` - It has a runtime check for if `MarkDecorator`, instead of just a callable https://github.com/vega/altair/pull/3631/commits/bb7bc171a7005fd63f39b3d949902f4d553801f0, https://github.com/vega/altair/pull/3631/commits/ebc1bfaa0b35e554da15bab7dd7d7e2a95f17e63, https://github.com/vega/altair/pull/3631/commits/fe0ae88201cc699b32ee1e9c07b602d9d7a8d439, https://github.com/vega/altair/pull/3631/commits/7089f2af693c6db2025ee265f31ec4ef228dd8c3 --- altair/datasets/_readers.py | 33 ++++++++++++++++++++++----------- tests/__init__.py | 31 +++++++++++++++++++------------ tests/test_datasets.py | 26 ++++++++++++++++++++++---- 3 files changed, 63 insertions(+), 27 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index eea9f18db..a3435d231 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -12,6 +12,7 @@ import os import urllib.request from functools import partial +from http.client import HTTPResponse from importlib import import_module from importlib.util import find_spec from itertools import chain, islice @@ -76,6 +77,10 @@ __all__ = ["get_backend"] +def _identity(_: _T, /) -> _T: + return _ + + class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): """ Common functionality between backends. @@ -88,6 +93,18 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): _name: LiteralString _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() + _response: ClassVar[staticmethod[[HTTPResponse], Any]] = staticmethod(_identity) + """ + Backends that do not support `file-like objects`_, must override with conversion. + + Used only for **remote** files, as *cached* files use a `pathlib.Path`_. + + .. _file-like objects: + https://docs.python.org/3/glossary.html#term-file-object + .. 
_pathlib.Path: + https://docs.python.org/3/library/pathlib.html#pathlib.Path + """ + _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: @@ -98,10 +115,6 @@ def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: suffix = validate_suffix(source, is_ext_scan) return self._scan_fn[suffix] - def _response_hook(self, f): - # HACK: `pyarrow` + `pandas` wants the file obj - return f - def dataset( self, name: DatasetName | LiteralString, @@ -137,7 +150,7 @@ def dataset( return fn(fp, **kwds) else: with self._opener.open(url) as f: - return fn(self._response_hook(f), **kwds) + return fn(self._response(f), **kwds) def url( self, @@ -261,6 +274,8 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): + _response = staticmethod(HTTPResponse.read) + def __init__(self, name: _Polars, /) -> None: self._name = _requirements(name) if not TYPE_CHECKING: @@ -273,11 +288,10 @@ def __init__(self, name: _Polars, /) -> None: } self._scan_fn = {".parquet": pl.scan_parquet} - def _response_hook(self, f): - return f.read() - class _PolarsPyArrowReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): + _response = staticmethod(HTTPResponse.read) + def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: _pl, _pa = _requirements(name) self._name = name @@ -292,9 +306,6 @@ def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: } self._scan_fn = {".parquet": pl.scan_parquet} - def _response_hook(self, f): - return f.read() - class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): """ diff --git a/tests/__init__.py b/tests/__init__.py index 617cfca80..17a33e91e 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -5,14 +5,14 @@ import sys from importlib.util import find_spec from pathlib import Path -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, overload import pytest from tests import examples_arguments_syntax, examples_methods_syntax if TYPE_CHECKING: - from collections.abc import Callable, Collection, Iterator, Mapping + from collections.abc import Collection, Iterator, Mapping from re import Pattern if sys.version_info >= (3, 11): @@ -20,6 +20,7 @@ else: from typing_extensions import TypeAlias from _pytest.mark import ParameterSet + from _pytest.mark.structures import Markable MarksType: TypeAlias = ( "pytest.MarkDecorator | Collection[pytest.MarkDecorator | pytest.Mark]" @@ -96,9 +97,21 @@ def windows_has_tzdata() -> bool: """ +@overload def skip_requires_pyarrow( - fn: Callable[..., Any] | None = None, /, *, requires_tzdata: bool = False -) -> Callable[..., Any]: + fn: None = ..., /, *, requires_tzdata: bool = ... +) -> pytest.MarkDecorator: ... + + +@overload +def skip_requires_pyarrow( + fn: Markable, /, *, requires_tzdata: bool = ... +) -> Markable: ... + + +def skip_requires_pyarrow( + fn: Markable | None = None, /, *, requires_tzdata: bool = False +) -> pytest.MarkDecorator | Markable: """ ``pytest.mark.skipif`` decorator. @@ -109,7 +122,7 @@ def skip_requires_pyarrow( https://github.com/vega/altair/issues/3050 .. _pyarrow: - https://pypi.org/project/pyarrow/ + https://pypi.org/project/pyarrow/ """ composed = pytest.mark.skipif( find_spec("pyarrow") is None, reason="`pyarrow` not installed." 
@@ -120,13 +133,7 @@ def skip_requires_pyarrow( reason="Timezone database is not installed on Windows", )(composed) - def wrap(test_fn: Callable[..., Any], /) -> Callable[..., Any]: - return composed(test_fn) - - if fn is None: - return wrap - else: - return wrap(fn) + return composed if fn is None else composed(fn) def id_func_str_only(val) -> str: diff --git a/tests/test_datasets.py b/tests/test_datasets.py index ec2f9014f..7a4ab51f1 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,6 +1,7 @@ from __future__ import annotations import re +from importlib.util import find_spec from typing import TYPE_CHECKING import pytest @@ -14,10 +15,27 @@ if TYPE_CHECKING: from altair.datasets._readers import _Backend -backends = skip_requires_pyarrow( - pytest.mark.parametrize( - "backend", ["polars", "polars[pyarrow]", "pandas", "pandas[pyarrow]", "pyarrow"] - ) + +requires_pyarrow = skip_requires_pyarrow() + +backends = pytest.mark.parametrize( + "backend", + [ + "polars", + pytest.param( + "pandas", + marks=pytest.mark.xfail( + find_spec("pyarrow") is None, + reason=( + "`pandas` supports backends other than `pyarrow` for `.parquet`.\n" + "However, none of these are currently an `altair` dependency." + ), + ), + ), + pytest.param("polars[pyarrow]", marks=requires_pyarrow), + pytest.param("pandas[pyarrow]", marks=requires_pyarrow), + pytest.param("pyarrow", marks=requires_pyarrow), + ], ) From 9d88e1bbb20b6b24bc3cefc40c62108e259edf65 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 14:37:21 +0000 Subject: [PATCH 078/201] refactor: Rework `_readers.py` - Moved `_Reader._metadata` -> module-level constant `_METADATA`. - It was never modified and is based on the relative directory of this module - Generally improved the readability with more method-chaining (less assignment) - Renamed, improved doc `_filter_reduce` -> `_parse_predicates_constraints` --- altair/datasets/_readers.py | 55 ++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index a3435d231..b2f41af89 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -22,6 +22,7 @@ Any, Callable, ClassVar, + Final, Generic, Literal, Protocol, @@ -76,6 +77,8 @@ __all__ = ["get_backend"] +_METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" + def _identity(_: _T, /) -> _T: return _ @@ -105,8 +108,6 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): https://docs.python.org/3/library/pathlib.html#pathlib.Path """ - _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" - def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: suffix = validate_suffix(source, is_ext_read) return self._read_fn[suffix] @@ -159,20 +160,13 @@ def url( /, tag: VersionTag | None = None, ) -> str: - df = self.query(**validate_constraints(name, suffix, tag)) - url = df.item(0, "url_npm") + frame = self.query(**validate_constraints(name, suffix, tag)) + url = nw.to_py_scalar(frame.item(0, "url_npm")) if isinstance(url, str): return url else: - converted = nw.to_py_scalar(url) - if isinstance(converted, str): - return converted - else: - msg = ( - f"Expected 'str' but got {type(converted).__name__!r}\n" - f"from {converted!r}." - ) - raise TypeError(msg) + msg = f"Expected 'str' but got {type(url).__name__!r}\n" f"from {url!r}." 
+ raise TypeError(msg) def query( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] @@ -188,15 +182,14 @@ def query( .. _pl.LazyFrame.filter: https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html """ - source = self._metadata - fn = self.scan_fn(source) - frame = nw.from_native(fn(source)) - result = frame.filter(_filter_reduce(predicates, constraints)) - df: nw.DataFrame[Any] = ( - result.collect() if isinstance(result, nw.LazyFrame) else result + frame = ( + nw.from_native(self.scan_fn(_METADATA)(_METADATA)) + .filter(_parse_predicates_constraints(predicates, constraints)) + .lazy() + .collect() ) - if not df.is_empty(): - return df + if not frame.is_empty(): + return frame else: terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) msg = f"Found no results for:\n{terms}" @@ -208,12 +201,12 @@ def _read_metadata(self) -> IntoDataFrameT: Effectively an eager read, no filters. """ - fn = self.scan_fn(self._metadata) - frame = nw.from_native(fn(self._metadata)) - df: nw.DataFrame[Any] = ( - frame.collect() if isinstance(frame, nw.LazyFrame) else frame + return ( + nw.from_native(self.scan_fn(_METADATA)(_METADATA)) + .lazy() + .collect() + .to_native() ) - return df.to_native() @property def _cache(self) -> Path | None: # type: ignore[return] @@ -351,11 +344,15 @@ def __init__(self, name: _PyArrow, /) -> None: self._scan_fn = {".parquet": pa_read_parquet} -def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw.Expr: +def _parse_predicates_constraints( + predicates: tuple[Any, ...], constraints: Metadata, / +) -> nw.Expr: """ - ``narwhals`` only accepts ``filter(*predicates)`. + ``narwhals`` only accepts ``filter(*predicates)``. + + So we convert each item in ``**constraints`` here as:: - Manually converts the constraints into ``==`` + col("column_name") == literal_value """ return nw.all_horizontal( chain(predicates, (nw.col(name) == v for name, v in constraints.items())) From 60d39f5f7f175f94b2511b221ee2fd1760eacb9e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 16:40:12 +0000 Subject: [PATCH 079/201] test: Adds tests for missing dependencies --- altair/datasets/_readers.py | 14 ++++++++++- tests/test_datasets.py | 48 +++++++++++++++++++++++++++++++++++-- 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index b2f41af89..20b308aed 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -226,7 +226,19 @@ def _import(self, name: str, /) -> Any: if spec := find_spec(name): return import_module(spec.name) else: - msg = f"{type(self).__name__!r} requires missing dependency {name!r}." 
+ reqs = _requirements(self._name) # type: ignore[call-overload] + if isinstance(reqs, tuple): + depends = ", ".join(f"{req!r}" for req in reqs) + " packages" + else: + depends = f"{reqs!r} package" + + msg = ( + f"Backend {self._name!r} requires the {depends}, but {name!r} could not be found.\n" + f"This can be installed with pip using:\n" + f" pip install {name}\n" + f"Or with conda using:\n" + f" conda install -c conda-forge {name}" + ) raise ModuleNotFoundError(msg, name=name) def __repr__(self) -> str: diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 7a4ab51f1..de932137f 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,6 +1,7 @@ from __future__ import annotations import re +import sys from importlib.util import find_spec from typing import TYPE_CHECKING @@ -13,8 +14,12 @@ from tests import skip_requires_pyarrow if TYPE_CHECKING: + from typing import Literal + from altair.datasets._readers import _Backend +CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" + requires_pyarrow = skip_requires_pyarrow() @@ -58,10 +63,49 @@ def test_loader_url(backend: _Backend) -> None: @backends -def test_loader_call(backend: _Backend) -> None: +def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv(CACHE_ENV_VAR, raising=False) + data = Loader.with_backend(backend) - data.cache_dir = "" # type: ignore[assignment] frame = data("stocks", ".csv") assert is_into_dataframe(frame) nw_frame = nw.from_native(frame) assert set(nw_frame.columns) == {"symbol", "date", "price"} + + +@backends +def test_missing_dependency_single( + backend: _Backend, monkeypatch: pytest.MonkeyPatch +) -> None: + if backend in {"polars[pyarrow]", "pandas[pyarrow]"}: + pytest.skip("Testing single dependency backends only") + + monkeypatch.setitem(sys.modules, backend, None) + + with pytest.raises( + ModuleNotFoundError, + match=re.compile( + rf"{backend}.+requires.+{backend}.+but.+{backend}.+not.+found.+pip install {backend}", + flags=re.DOTALL, + ), + ): + Loader.with_backend(backend) + + +@pytest.mark.parametrize("backend", ["polars[pyarrow]", "pandas[pyarrow]"]) +@skip_requires_pyarrow +def test_missing_dependency_multi( + backend: _Backend, monkeypatch: pytest.MonkeyPatch +) -> None: + secondary = "pyarrow" + primary = backend.removesuffix(f"[{secondary}]") + monkeypatch.setitem(sys.modules, secondary, None) + + with pytest.raises( + ModuleNotFoundError, + match=re.compile( + rf"{re.escape(backend)}.+requires.+'{primary}', '{secondary}'.+but.+{secondary}.+not.+found.+pip install {secondary}", + flags=re.DOTALL, + ), + ): + Loader.with_backend(backend) From d6f0e45a3ade1fd9ca08e22b2ae9f6710eabd496 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 18:36:28 +0000 Subject: [PATCH 080/201] test: Adds `test_dataset_not_found` --- altair/datasets/_readers.py | 10 ++-- tests/test_datasets.py | 95 +++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 4 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 20b308aed..ebd996d65 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -192,8 +192,8 @@ def query( return frame else: terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) - msg = f"Found no results for:\n{terms}" - raise NotImplementedError(msg) + msg = f"Found no results for:\n {terms}" + raise ValueError(msg) def _read_metadata(self) -> IntoDataFrameT: """ @@ -378,16 
+378,18 @@ def validate_constraints( /, ) -> Metadata: constraints: Metadata = {} + suffixes = ".csv", ".json", ".tsv", ".arrow" if tag is not None: constraints["tag"] = tag - if name.endswith((".csv", ".json", ".tsv", ".arrow")): + if name.endswith(suffixes): fp = Path(name) constraints["dataset_name"] = fp.stem constraints["suffix"] = fp.suffix return constraints elif suffix is not None: if not is_ext_read(suffix): - raise TypeError(suffix) + msg = f"Expected 'suffix' to be one of {suffixes!r},\nbut got: {suffix!r}" + raise TypeError(msg) else: constraints["suffix"] = suffix constraints["dataset_name"] = name diff --git a/tests/test_datasets.py b/tests/test_datasets.py index de932137f..cf26fc0f8 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -109,3 +109,98 @@ def test_missing_dependency_multi( ), ): Loader.with_backend(backend) + + +@backends +def test_dataset_not_found(backend: _Backend) -> None: + """ + Various queries that should **always raise** due to non-existent dataset. + + ``Loader.url`` is used since it doesn't require a remote connection. + """ + import polars as pl + + data = Loader.with_backend(backend) + real_name: Literal["disasters"] = "disasters" + real_suffix: Literal[".csv"] = ".csv" + real_tag: Literal["v1.14.0"] = "v1.14.0" + + invalid_name: Literal["fake name"] = "fake name" + invalid_suffix: Literal["fake suffix"] = "fake suffix" + invalid_tag: Literal["fake tag"] = "fake tag" + + incorrect_suffix: Literal[".json"] = ".json" + incorrect_tag: Literal["v1.5.0"] = "v1.5.0" + + ERR_NO_RESULT = ValueError + # NOTE: ``polars`` enforces enums stricter than other packages. + # Rather than returning an empty dataframe, filtering on a value + # *outside* of the enum range raises an internal error. + ERR_NO_RESULT_OR_ENUM = (ERR_NO_RESULT, pl.exceptions.InvalidOperationError) + + MSG_NO_RESULT = "Found no results for" + NAME = "dataset_name" + SUFFIX = "suffix" + TAG = "tag" + + with pytest.raises( + ERR_NO_RESULT, + match=re.compile(rf"{MSG_NO_RESULT}.+{NAME}.+{invalid_name}", re.DOTALL), + ): + data.url(invalid_name) + + with pytest.raises( + TypeError, + match=re.compile( + rf"Expected '{SUFFIX}' to be one of.+\(.+\).+but got.+{invalid_suffix}", + re.DOTALL, + ), + ): + data.url(real_name, invalid_suffix) # type: ignore[arg-type] + + with pytest.raises( + ERR_NO_RESULT_OR_ENUM, + match=re.compile(rf"{invalid_tag}", re.DOTALL), + ): + data.url(real_name, tag=invalid_tag) # type: ignore[arg-type] + + with pytest.raises( + ERR_NO_RESULT_OR_ENUM, + match=re.compile(rf"{invalid_tag}", re.DOTALL), + ): + data.url(real_name, real_suffix, tag=invalid_tag) # type: ignore[arg-type] + + with pytest.raises( + ERR_NO_RESULT, + match=re.compile( + rf"{MSG_NO_RESULT}.+{TAG}.+{incorrect_tag}.+{SUFFIX}.+{real_suffix}.+{NAME}.+{real_name}", + re.DOTALL, + ), + ): + data.url(real_name, real_suffix, tag=incorrect_tag) + + with pytest.raises( + ERR_NO_RESULT, + match=re.compile( + rf"{MSG_NO_RESULT}.+{SUFFIX}.+{incorrect_suffix}.+{NAME}.+{real_name}", + re.DOTALL, + ), + ): + data.url(real_name, incorrect_suffix) + + with pytest.raises( + ERR_NO_RESULT, + match=re.compile( + rf"{MSG_NO_RESULT}.+{TAG}.+{real_tag}.+{SUFFIX}.+{incorrect_suffix}.+{NAME}.+{real_name}", + re.DOTALL, + ), + ): + data.url(real_name, incorrect_suffix, tag=real_tag) + + with pytest.raises( + ERR_NO_RESULT, + match=re.compile( + rf"{MSG_NO_RESULT}.+{TAG}.+{incorrect_tag}.+{NAME}.+{real_name}", re.DOTALL + ), + ): + data.url(real_name, tag=incorrect_tag) From b7d57a0b497de6bc824f3e2600894cc75f5ad413 
Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 19:44:28 +0000 Subject: [PATCH 081/201] test: Adds `test_reader_cache` --- tests/test_datasets.py | 74 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index cf26fc0f8..b3cd1ab8c 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -3,10 +3,10 @@ import re import sys from importlib.util import find_spec -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import pytest -from narwhals.dependencies import is_into_dataframe +from narwhals.dependencies import is_into_dataframe, is_polars_dataframe from narwhals.stable import v1 as nw import altair as alt # noqa: F401 @@ -14,6 +14,7 @@ from tests import skip_requires_pyarrow if TYPE_CHECKING: + from pathlib import Path from typing import Literal from altair.datasets._readers import _Backend @@ -204,3 +205,72 @@ def test_dataset_not_found(backend: _Backend) -> None: ), ): data.url(real_name, tag=incorrect_tag) + + +@backends +def test_reader_cache( + backend: _Backend, monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + """ + Using a sample of the smallest datasets, make *"requests"* that are all caught by prior hits. + + Note + ---- + `tmp_path`_ is a built-in fixture. + + .. _tmp_path: + https://docs.pytest.org/en/stable/getting-started.html#request-a-unique-temporary-directory-for-functional-tests + """ + import polars as pl + from polars.testing import assert_frame_equal + + monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) + + data = Loader.with_backend(backend) + cache_dir = data.cache_dir + assert cache_dir is not None + assert cache_dir == tmp_path + + assert tuple(cache_dir.iterdir()) == () + + # smallest csvs + lookup_groups = data("lookup_groups", tag="v2.5.3") + data("lookup_people", tag="v2.4.0") + data("iowa-electricity", tag="v2.3.1") + data("global-temp", tag="v2.9.0") + + cached_paths = tuple(cache_dir.iterdir()) + assert len(cached_paths) == 4 + + if is_polars_dataframe(lookup_groups): + left, right = ( + lookup_groups, + cast(pl.DataFrame, data("lookup_groups", tag="v2.5.3")), + ) + else: + left, right = ( + pl.DataFrame(lookup_groups), + pl.DataFrame(data("lookup_groups", tag="v2.5.3")), + ) + + assert_frame_equal(left, right) + assert len(tuple(cache_dir.iterdir())) == 4 + assert cached_paths == tuple(cache_dir.iterdir()) + + data("iowa-electricity", tag="v1.30.2") + data("global-temp", tag="v2.8.1") + data("global-temp", tag="v2.8.0") + + assert len(tuple(cache_dir.iterdir())) == 4 + assert cached_paths == tuple(cache_dir.iterdir()) + + data("lookup_people", tag="v1.10.0") + data("lookup_people", tag="v1.11.0") + data("lookup_people", tag="v1.20.0") + data("lookup_people", tag="v1.21.0") + data("lookup_people", tag="v2.1.0") + data("lookup_people", tag="v2.3.0") + data("lookup_people", tag="v2.5.0-next.0") + + assert len(tuple(cache_dir.iterdir())) == 4 + assert cached_paths == tuple(cache_dir.iterdir()) From b70aef883721ce1ce905e1ec8e82938eb4859257 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 21:23:43 +0000 Subject: [PATCH 082/201] docs: Finish `_Reader`, fill parameters of `Loader.__call__` Still need examples for `Loader.__call__` --- altair/datasets/__init__.py | 31 +++++++++++++++++++--- altair/datasets/_readers.py | 52 +++++++++++++++++++++++-------------- 2 files changed, 60 
insertions(+), 23 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index b7f87bdaa..4260314d1 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -131,7 +131,7 @@ def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: obj._reader = get_backend(backend) return obj - # TODO: docs (parameters, examples) + # TODO: docs (examples) def __call__( self, name: DatasetName | LiteralString, @@ -140,7 +140,30 @@ def __call__( tag: VersionTag | None = None, **kwds: Any, ) -> IntoDataFrameT: - """Get a remote dataset and load as tabular data.""" + """ + Get a remote dataset and load as tabular data. + + Parameters + ---------- + name + Name of the dataset/`stem`_ of file name. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + tag + Version identifier for a `vega-datasets release`_. + **kwds + Arguments passed to the underlying read function. + + .. _stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. _vega-datasets release: + https://github.com/vega/vega-datasets/releases + """ return self._reader.dataset(name, suffix, tag=tag, **kwds) def url( @@ -156,14 +179,14 @@ def url( Parameters ---------- name - Name of the dataset/`stem`_ of filename. + Name of the dataset/`stem`_ of file name. suffix File extension/`Path.suffix`_. .. note:: Only needed if ``name`` is available in multiple formats. tag - `vega-datasets release`_ version. + Version identifier for a `vega-datasets release`_. .. _stem: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index ebd996d65..fe8f8212f 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -23,7 +23,6 @@ Callable, ClassVar, Final, - Generic, Literal, Protocol, TypeVar, @@ -84,16 +83,42 @@ def _identity(_: _T, /) -> _T: return _ -class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): +class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): """ - Common functionality between backends. + Describes basic IO for remote & local tabular resources. - Trying to use ``narwhals`` as much as possible + Subclassing this protocol directly will provide a *mostly* complete implementation. + + Each of the following must be explicitly assigned: + + _Reader._read_fn + _Reader._scan_fn + _Reader._name """ _read_fn: dict[Extension, Callable[..., IntoDataFrameT]] + """ + Eager file read functions. + + Each corresponds to a known file extension within ``vega-datasets``. + """ + _scan_fn: dict[_ExtensionScan, Callable[..., IntoFrameT]] + """ + *Optionally*-lazy file read/scan functions. + + Used exclusively for ``metadata.parquet``. + + Currently ``polars`` backends are the only lazy options. + """ + _name: LiteralString + """ + Used in error messages, repr and matching ``@overload``(s). + + Otherwise, has no concrete meaning. + """ + _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() _response: ClassVar[staticmethod[[HTTPResponse], Any]] = staticmethod(_identity) @@ -124,16 +149,6 @@ def dataset( tag: VersionTag | None = None, **kwds: Any, ) -> IntoDataFrameT: - """ - Fetch a remote dataset, attempt caching if possible. - - Parameters - ---------- - name, suffix, tag - TODO - **kwds - Arguments passed to the underlying read function. 
- """ df = self.query(**validate_constraints(name, suffix, tag)) it = islice(df.iter_rows(named=True), 1) result = cast("Metadata", next(it)) @@ -171,13 +186,12 @@ def url( def query( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] ) -> nw.DataFrame[IntoDataFrameT]: - r""" + """ Query multi-version trees metadata. - Parameters - ---------- - \*predicates, \*\*constraints - Passed directly to `pl.LazyFrame.filter`_. + Notes + ----- + Arguments correspond to those seen in `pl.LazyFrame.filter`_. .. _pl.LazyFrame.filter: https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html From 403b7874f360fc2f1734de538e81a91e4c4ddffe Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 21:48:21 +0000 Subject: [PATCH 083/201] refactor: Rename `backend` -> `backend_name`, `get_backend` -> `backend` `get_` was the wrong term since it isn't a free operation --- altair/datasets/__init__.py | 14 ++++++------- altair/datasets/_readers.py | 40 +++++++++++++++++++------------------ 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 4260314d1..b6f983754 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -4,7 +4,7 @@ from narwhals.typing import IntoDataFrameT, IntoFrameT -from altair.datasets._readers import _Reader, get_backend +from altair.datasets._readers import _Reader, backend if TYPE_CHECKING: import sys @@ -46,29 +46,29 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): @overload @classmethod def with_backend( - cls, backend: Literal["polars", "polars[pyarrow]"], / + cls, backend_name: Literal["polars", "polars[pyarrow]"], / ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... @overload @classmethod def with_backend( - cls, backend: Literal["pandas", "pandas[pyarrow]"], / + cls, backend_name: Literal["pandas", "pandas[pyarrow]"], / ) -> Loader[pd.DataFrame, pd.DataFrame]: ... @overload @classmethod def with_backend( - cls, backend: Literal["pyarrow"], / + cls, backend_name: Literal["pyarrow"], / ) -> Loader[pa.Table, pa.Table]: ... @classmethod - def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: + def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: """ Initialize a new loader, with the specified backend. Parameters ---------- - backend + backend_name DataFrame package/config used to return data. * *polars*: Using `polars defaults`_ @@ -128,7 +128,7 @@ def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: dtype: object """ obj = Loader.__new__(Loader) - obj._reader = get_backend(backend) + obj._reader = backend(backend_name) return obj # TODO: docs (examples) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index fe8f8212f..9645d0bb2 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -74,7 +74,7 @@ _Backend: TypeAlias = Literal[_PolarsAny, _PandasAny, _PyArrow] -__all__ = ["get_backend"] +__all__ = ["backend"] _METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" @@ -428,33 +428,35 @@ def is_ext_read(suffix: Any) -> TypeIs[Extension]: @overload -def get_backend(backend: _PolarsAny, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... +def backend(name: _PolarsAny, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... @overload -def get_backend(backend: _PandasAny, /) -> _Reader[pd.DataFrame, pd.DataFrame]: ... 
+def backend(name: _PandasAny, /) -> _Reader[pd.DataFrame, pd.DataFrame]: ... @overload -def get_backend(backend: _PyArrow, /) -> _Reader[pa.Table, pa.Table]: ... - - -def get_backend(backend: _Backend, /) -> _Reader[Any, Any]: - if backend == "polars": - return _PolarsReader(backend) - elif backend == "polars[pyarrow]": - return _PolarsPyArrowReader(backend) - elif backend == "pandas[pyarrow]": - return _PandasPyArrowReader(backend) - elif backend == "pandas": - return _PandasReader(backend) - elif backend == "pyarrow": - return _PyArrowReader(backend) - elif backend in {"ibis", "cudf", "dask", "modin"}: +def backend(name: _PyArrow, /) -> _Reader[pa.Table, pa.Table]: ... + + +def backend(name: _Backend, /) -> _Reader[Any, Any]: + """Reader initialization dispatcher.""" + if name == "polars": + return _PolarsReader(name) + elif name == "polars[pyarrow]": + return _PolarsPyArrowReader(name) + elif name == "pandas[pyarrow]": + return _PandasPyArrowReader(name) + elif name == "pandas": + return _PandasReader(name) + elif name == "pyarrow": + return _PyArrowReader(name) + elif name in {"ibis", "cudf", "dask", "modin"}: msg = "Supported by ``narwhals``, not investigated yet" raise NotImplementedError(msg) else: - raise TypeError(backend) + msg = f"Unknown backend {name!r}" + raise TypeError(msg) @overload From 3fbc759233fdf0203a2f8685245152732f57276a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 00:04:49 +0000 Subject: [PATCH 084/201] fix(DRAFT): Add multiple fallbacks for `pyarrow` JSON --- altair/datasets/_readers.py | 62 ++++++++++++++++++++++++++++++++----- tests/test_datasets.py | 40 +++++++++++++++++++++++- 2 files changed, 94 insertions(+), 8 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 9645d0bb2..0f30e58b9 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -11,6 +11,7 @@ import os import urllib.request +from collections.abc import Mapping, Sequence from functools import partial from http.client import HTTPResponse from importlib import import_module @@ -34,6 +35,7 @@ from narwhals.typing import IntoDataFrameT, IntoExpr, IntoFrameT if TYPE_CHECKING: + import json # noqa: F401 import sys from urllib.request import OpenerDirector @@ -346,25 +348,71 @@ class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): def __init__(self, name: _PyArrow, /) -> None: self._name = _requirements(name) if not TYPE_CHECKING: - pa = self._import(self._name) # noqa: F841 + pa = self._import(self._name) pa_csv = self._import(f"{self._name}.csv") pa_feather = self._import(f"{self._name}.feather") - pa_json = self._import(f"{self._name}.json") pa_parquet = self._import(f"{self._name}.parquet") - pa_read_csv = pa_csv.read_csv pa_read_feather = pa_feather.read_table - pa_read_json = pa_json.read_json pa_read_parquet = pa_parquet.read_table - # opt1 = ParseOptions(delimiter="\t") # type: ignore + # HACK: Multiple alternatives to `pyarrow.json.read_json` + # ------------------------------------------------------- + # NOTE: Prefer `polars` since it is zero-copy and fast (1) + if find_spec("polars") is not None: + import polars as pl + + def pa_read_json(source: StrPath, /, **kwds) -> pa.Table: + return pl.read_json(source).to_arrow() + + else: + import json + + def stdlib_read_json(source: Any, /, **kwds) -> pa.Table: + if not isinstance(source, (Path)): + obj = json.load(source) + else: + with Path(source).open(encoding="utf-8") as f: + obj = json.load(f) + # Very naive check, 
but still less likely to fail + if isinstance(obj, Sequence) and isinstance(obj[0], Mapping): + return pa.Table.from_pylist(obj) + else: + # NOTE: Almost certainly will fail on read as of `v2.9.0` + pa_json = self._import(f"{self._name}.json") + return pa_json.read_json(source) + + # NOTE: Use `pandas` as a slower fallback (2) + if find_spec("pandas") is not None: + import pandas as pd + + def pa_read_json(source: StrPath, /, **kwds) -> pa.Table: + try: + table = ( + nw.from_native( + pd.read_json( + source, dtype_backend="pyarrow" + ).convert_dtypes(dtype_backend="pyarrow") + ) + .with_columns( + nw.selectors.by_dtype(nw.Object).cast(nw.String) + ) + .to_arrow() + ) + except ValueError: + table = stdlib_read_json(source) + return table + else: + # NOTE: Convert inline from stdlib json (3) + pa_read_json = stdlib_read_json + # Stubs suggest using a dataclass, but no way to construct it - opt2: Any = {"delimiter": "\t"} + tab_sep: Any = {"delimiter": "\t"} self._read_fn = { ".csv": pa_read_csv, ".json": pa_read_json, - ".tsv": partial(pa_read_csv, parse_options=opt2), + ".tsv": partial(pa_read_csv, parse_options=tab_sep), ".arrow": pa_read_feather, } self._scan_fn = {".parquet": pa_read_parquet} diff --git a/tests/test_datasets.py b/tests/test_datasets.py index b3cd1ab8c..e39497fb4 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -17,7 +17,8 @@ from pathlib import Path from typing import Literal - from altair.datasets._readers import _Backend + from altair.datasets._readers import _Backend, _Pandas, _Polars + from altair.datasets._typing import DatasetName CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" @@ -274,3 +275,40 @@ def test_reader_cache( assert len(tuple(cache_dir.iterdir())) == 4 assert cached_paths == tuple(cache_dir.iterdir()) + + +@pytest.mark.parametrize( + "dataset", + [ + "cars", + "movies", + "wheat", + "barley", + "gapminder", + "income", + "burtin", + pytest.param( + "earthquakes", + marks=pytest.mark.xfail( + reason="GeoJSON seems to not work with pandas -> pyarrow" + ), + ), + ], +) +@pytest.mark.parametrize("fallback", ["polars", "pandas", None]) +@skip_requires_pyarrow +def test_pyarrow_read_json( + fallback: _Polars | _Pandas | None, + dataset: DatasetName, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setenv(CACHE_ENV_VAR, "") + + if fallback == "polars" or fallback is None: + monkeypatch.delitem(sys.modules, "pandas", raising=False) + elif fallback == "pandas" or fallback is None: + monkeypatch.setitem(sys.modules, "polars", None) + + data = Loader.with_backend("pyarrow") + + data(dataset, ".json") From 4f5b4de6d894a1297bd2edfaecb72c5eefa48bc7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 13:56:07 +0000 Subject: [PATCH 085/201] test: Remove `pandas` fallback for `pyarrow` There are enough alternatives here, it only added complexity --- altair/datasets/_readers.py | 40 ++++++++++---------------------- tests/test_datasets.py | 46 ++++++++++++++++++++++--------------- 2 files changed, 40 insertions(+), 46 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 0f30e58b9..2e20fd375 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -366,46 +366,30 @@ def pa_read_json(source: StrPath, /, **kwds) -> pa.Table: return pl.read_json(source).to_arrow() else: + # NOTE: Convert inline from stdlib json (2) import json - def stdlib_read_json(source: Any, /, **kwds) -> pa.Table: - if not 
isinstance(source, (Path)): + pa_json = self._import(f"{self._name}.json") + + def pa_read_json(source: Any, /, **kwds) -> pa.Table: + if not isinstance(source, Path): obj = json.load(source) else: with Path(source).open(encoding="utf-8") as f: obj = json.load(f) - # Very naive check, but still less likely to fail + # NOTE: Common case of {"values": [{...}]}, missing the `"values"` keys if isinstance(obj, Sequence) and isinstance(obj[0], Mapping): return pa.Table.from_pylist(obj) + elif isinstance(obj, Mapping) and "type" in obj: + msg = ( + "Inferred file as geojson, unsupported by pyarrow.\n" + "Try installing `polars` or using `Loader.url(...)` instead." + ) + raise NotImplementedError(msg) else: # NOTE: Almost certainly will fail on read as of `v2.9.0` - pa_json = self._import(f"{self._name}.json") return pa_json.read_json(source) - # NOTE: Use `pandas` as a slower fallback (2) - if find_spec("pandas") is not None: - import pandas as pd - - def pa_read_json(source: StrPath, /, **kwds) -> pa.Table: - try: - table = ( - nw.from_native( - pd.read_json( - source, dtype_backend="pyarrow" - ).convert_dtypes(dtype_backend="pyarrow") - ) - .with_columns( - nw.selectors.by_dtype(nw.Object).cast(nw.String) - ) - .to_arrow() - ) - except ValueError: - table = stdlib_read_json(source) - return table - else: - # NOTE: Convert inline from stdlib json (3) - pa_read_json = stdlib_read_json - # Stubs suggest using a dataclass, but no way to construct it tab_sep: Any = {"delimiter": "\t"} diff --git a/tests/test_datasets.py b/tests/test_datasets.py index e39497fb4..01167cf10 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -11,21 +11,23 @@ import altair as alt # noqa: F401 from altair.datasets import Loader -from tests import skip_requires_pyarrow +from altair.datasets._typing import DatasetName if TYPE_CHECKING: from pathlib import Path from typing import Literal - from altair.datasets._readers import _Backend, _Pandas, _Polars - from altair.datasets._typing import DatasetName + import polars as pl + from _pytest.mark.structures import ParameterSet + + from altair.datasets._readers import _Backend, _Polars CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" -requires_pyarrow = skip_requires_pyarrow() +requires_pyarrow: pytest.MarkDecorator = skip_requires_pyarrow() -backends = pytest.mark.parametrize( +backends: pytest.MarkDecorator = pytest.mark.parametrize( "backend", [ "polars", @@ -277,36 +279,44 @@ def test_reader_cache( assert cached_paths == tuple(cache_dir.iterdir()) +movies_fail: ParameterSet = pytest.param( + "movies", + marks=pytest.mark.xfail( + reason="Only working for `polars`.\n" + "`pyarrow` isn't happy with the mixed `int`/`str` column." 
+ ), +) +earthquakes_fail: ParameterSet = pytest.param( + "earthquakes", + marks=pytest.mark.xfail( + reason="Only working for `polars`.\n" "GeoJSON fails on native `pyarrow`" + ), +) + + @pytest.mark.parametrize( "dataset", [ "cars", - "movies", + movies_fail, "wheat", "barley", "gapminder", "income", "burtin", - pytest.param( - "earthquakes", - marks=pytest.mark.xfail( - reason="GeoJSON seems to not work with pandas -> pyarrow" - ), - ), + earthquakes_fail, ], ) -@pytest.mark.parametrize("fallback", ["polars", "pandas", None]) +@pytest.mark.parametrize("fallback", ["polars", None]) @skip_requires_pyarrow def test_pyarrow_read_json( - fallback: _Polars | _Pandas | None, + fallback: _Polars | None, dataset: DatasetName, monkeypatch: pytest.MonkeyPatch, ) -> None: monkeypatch.setenv(CACHE_ENV_VAR, "") - - if fallback == "polars" or fallback is None: - monkeypatch.delitem(sys.modules, "pandas", raising=False) - elif fallback == "pandas" or fallback is None: + monkeypatch.delitem(sys.modules, "pandas", raising=False) + if fallback is None: monkeypatch.setitem(sys.modules, "polars", None) data = Loader.with_backend("pyarrow") From 69a72b6e32625687223987d04e3c3f925421c1ab Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 13:59:05 +0000 Subject: [PATCH 086/201] test: Adds `test_all_datasets` Disabled by default, since there are 74 datasets --- pyproject.toml | 8 ++++++-- tests/test_datasets.py | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4132f0a25..2297ca2ea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -420,10 +420,14 @@ docstring-code-line-length = 88 # They contain examples which are being executed by the # test_examples tests. norecursedirs = ["tests/examples_arguments_syntax", "tests/examples_methods_syntax"] -addopts = ["--numprocesses=logical"] +addopts = [ + "--numprocesses=logical", + "-m not datasets_debug" +] # https://docs.pytest.org/en/stable/how-to/mark.html#registering-marks markers = [ - "slow: Label tests as slow (deselect with '-m \"not slow\"')" + "slow: Label tests as slow (deselect with '-m \"not slow\"')", + "datasets_debug: Disabled by default due to high number of requests" ] [tool.mypy] diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 01167cf10..d3f7625cd 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -3,15 +3,15 @@ import re import sys from importlib.util import find_spec -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, cast, get_args import pytest from narwhals.dependencies import is_into_dataframe, is_polars_dataframe from narwhals.stable import v1 as nw -import altair as alt # noqa: F401 from altair.datasets import Loader from altair.datasets._typing import DatasetName +from tests import skip_requires_pyarrow, slow if TYPE_CHECKING: from pathlib import Path @@ -47,6 +47,27 @@ ], ) +datasets_debug: pytest.MarkDecorator = slow(pytest.mark.datasets_debug) +""" +Custom ``pytest.mark`` decorator. + +Use for more exhaustive tests that require many requests. + +**Disabled** by default in ``pyproject.toml``: + + [tool.pytest.ini_options] + addopts = ... 
+""" + + +@pytest.fixture(scope="session") +def polars_loader( + tmp_path_factory: pytest.TempPathFactory, +) -> Loader[pl.DataFrame, pl.LazyFrame]: + data = Loader.with_backend("polars") + data.cache_dir = tmp_path_factory.mktemp("loader-cache-polars") + return data + @backends def test_loader_with_backend(backend: _Backend) -> None: @@ -322,3 +343,13 @@ def test_pyarrow_read_json( data = Loader.with_backend("pyarrow") data(dataset, ".json") + + +@datasets_debug +@pytest.mark.parametrize("name", get_args(DatasetName)) +def test_all_datasets( + name: DatasetName, polars_loader: Loader[pl.DataFrame, pl.LazyFrame] +) -> None: + """Ensure all annotated datasets can be loaded with the most reliable backend.""" + frame = polars_loader(name) + assert is_polars_dataframe(frame) From 08101cc33aa1d08f25323ea1de161c6863f30ceb Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 14:07:42 +0000 Subject: [PATCH 087/201] refactor: Remove `_Reader._response` Can't reproduce the original issue that led to adding this. All backends are supporting `HTTPResponse` directly --- altair/datasets/_readers.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 2e20fd375..65df737e8 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -13,7 +13,6 @@ import urllib.request from collections.abc import Mapping, Sequence from functools import partial -from http.client import HTTPResponse from importlib import import_module from importlib.util import find_spec from itertools import chain, islice @@ -81,10 +80,6 @@ _METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" -def _identity(_: _T, /) -> _T: - return _ - - class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): """ Describes basic IO for remote & local tabular resources. @@ -123,17 +118,6 @@ class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() - _response: ClassVar[staticmethod[[HTTPResponse], Any]] = staticmethod(_identity) - """ - Backends that do not support `file-like objects`_, must override with conversion. - - Used only for **remote** files, as *cached* files use a `pathlib.Path`_. - - .. _file-like objects: - https://docs.python.org/3/glossary.html#term-file-object - .. 
_pathlib.Path: - https://docs.python.org/3/library/pathlib.html#pathlib.Path - """ def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: suffix = validate_suffix(source, is_ext_read) @@ -168,7 +152,7 @@ def dataset( return fn(fp, **kwds) else: with self._opener.open(url) as f: - return fn(self._response(f), **kwds) + return fn(f, **kwds) def url( self, @@ -295,8 +279,6 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - _response = staticmethod(HTTPResponse.read) - def __init__(self, name: _Polars, /) -> None: self._name = _requirements(name) if not TYPE_CHECKING: @@ -311,8 +293,6 @@ def __init__(self, name: _Polars, /) -> None: class _PolarsPyArrowReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - _response = staticmethod(HTTPResponse.read) - def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: _pl, _pa = _requirements(name) self._name = name From 90428a625bc3928684018d57861f608574812fd8 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 15:49:33 +0000 Subject: [PATCH 088/201] fix: Correctly handle no remote connection Previously, `Path.touch()` appeared to be a cache-hit - despite being an empty file. - Fixes that bug - Adds tests --- altair/datasets/_readers.py | 4 ++-- tests/test_datasets.py | 47 ++++++++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 65df737e8..57b290c32 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -143,11 +143,11 @@ def dataset( if cache := self._cache: fp = cache / (result["sha"] + result["suffix"]) - if fp.exists(): + if fp.exists() and fp.stat().st_size: return fn(fp, **kwds) else: - fp.touch() with self._opener.open(url) as f: + fp.touch() fp.write_bytes(f.read()) return fn(fp, **kwds) else: diff --git a/tests/test_datasets.py b/tests/test_datasets.py index d3f7625cd..1b866cf58 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -2,8 +2,10 @@ import re import sys +from functools import partial from importlib.util import find_spec -from typing import TYPE_CHECKING, cast, get_args +from typing import TYPE_CHECKING, Any, cast, get_args +from urllib.error import URLError import pytest from narwhals.dependencies import is_into_dataframe, is_polars_dataframe @@ -353,3 +355,46 @@ def test_all_datasets( """Ensure all annotated datasets can be loaded with the most reliable backend.""" frame = polars_loader(name) assert is_polars_dataframe(frame) + + +def _raise_exception(e: type[Exception], *args: Any, **kwds: Any): + raise e(*args, **kwds) + + +def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + from polars.testing import assert_frame_equal + + data = Loader.with_backend("polars") + data.cache_dir = tmp_path + + data("londonCentroids") + data("stocks") + data("driving") + + cached_paths = tuple(tmp_path.iterdir()) + assert len(cached_paths) == 3 + + raiser = partial(_raise_exception, URLError) + with monkeypatch.context() as mp: + mp.setattr(data._reader._opener, "open", raiser) + # Existing cache entries don't trigger an error + data("londonCentroids") + data("stocks") + data("driving") + # Mocking cache-miss without remote conn + with pytest.raises(URLError): + data("birdstrikes") + assert len(tuple(tmp_path.iterdir())) == 3 + + # Now we can get a cache-hit + frame = data("birdstrikes") + assert 
is_polars_dataframe(frame) + assert len(tuple(tmp_path.iterdir())) == 4 + + with monkeypatch.context() as mp: + mp.setattr(data._reader._opener, "open", raiser) + # Here, the remote conn isn't considered - we already have the file + frame_from_cache = data("birdstrikes") + assert len(tuple(tmp_path.iterdir())) == 4 + + assert_frame_equal(frame, frame_from_cache) From 8ad78c174933c9b728f30db653354da6aff64f23 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:26:41 +0000 Subject: [PATCH 089/201] docs: Align `_typing.Metadata` and `Loader.(url|__call__)` descriptions Related https://github.com/vega/altair/commit/c572180ebc7d876714a38688c53f7e4af87abd93 --- altair/datasets/__init__.py | 8 ++++---- altair/datasets/_typing.py | 24 +++++++++++++++--------- tools/datasets/__init__.py | 28 +++++++++++++++++++--------- 3 files changed, 38 insertions(+), 22 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index b6f983754..d6acbf4c2 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -146,7 +146,7 @@ def __call__( Parameters ---------- name - Name of the dataset/`stem`_ of file name. + Name of the dataset/`Path.stem`_. suffix File extension/`Path.suffix`_. @@ -157,7 +157,7 @@ def __call__( **kwds Arguments passed to the underlying read function. - .. _stem: + .. _Path.stem: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem .. _Path.suffix: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix @@ -179,7 +179,7 @@ def url( Parameters ---------- name - Name of the dataset/`stem`_ of file name. + Name of the dataset/`Path.stem`_. suffix File extension/`Path.suffix`_. @@ -188,7 +188,7 @@ def url( tag Version identifier for a `vega-datasets release`_. - .. _stem: + .. _Path.stem: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem .. _Path.suffix: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index 0a86bc6ba..ed9ca99a6 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -149,16 +149,16 @@ class Metadata(TypedDict, total=False): Parameters ---------- dataset_name - Equivalent to ``Pathlib.Path.stem``. + Name of the dataset/`Path.stem`_. ext_supported Dataset can be read as tabular data. file_name - Equivalent to ``Pathlib.Path.name``. + Equivalent to `Path.name`_. name_collision - Dataset is available via multiple ``suffix``(s). + Dataset is available via multiple formats. .. note:: - Requires specifying a preference in calls to ``data(ext=...)``. + Requires specifying a preference in calls to ``data(name, suffix=...)`` sha Unique hash for the dataset. @@ -169,14 +169,20 @@ class Metadata(TypedDict, total=False): size File size (*bytes*). suffix - File extension. - - .. note:: - Equivalent to ``Pathlib.Path.suffix`` + File extension/`Path.suffix`_. tag - ``vega-datasets`` release version. + Version identifier for a `vega-datasets release`_. url_npm Remote url used to access dataset. + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.name: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.name + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. 
_vega-datasets release: + https://github.com/vega/vega-datasets/releases """ dataset_name: str diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index f318f292e..5e2ca1dd7 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -158,28 +158,38 @@ def generate_typing(self, output: Path, /) -> None: NOTE_SEP = f"\n\n{indent * 2}" f".. note::\n{indent * 3}" name_collision = ( - f"Dataset is available via multiple ``suffix``(s).{NOTE_SEP}" - "Requires specifying a preference in calls to ``data(ext=...)``." + f"Dataset is available via multiple formats.{NOTE_SEP}" + "Requires specifying a preference in calls to ``data(name, suffix=...)``" ) sha = ( f"Unique hash for the dataset.{NOTE_SEP}" f"If the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{indent * 3}" f"then all ``tag``(s) in this range would **share** this value." ) + links = ( + f".. _Path.stem:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem\n" + f".. _Path.name:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.name\n" + f".. _Path.suffix:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix\n" + f".. _vega-datasets release:\n{indent * 2}https://github.com/vega/vega-datasets/releases" + ) + descriptions: dict[str, str] = { - "dataset_name": "Equivalent to ``Pathlib.Path.stem``.", + "dataset_name": "Name of the dataset/`Path.stem`_.", "ext_supported": "Dataset can be read as tabular data.", - "file_name": "Equivalent to ``Pathlib.Path.name``.", + "file_name": "Equivalent to `Path.name`_.", "name_collision": name_collision, "sha": sha, "size": "File size (*bytes*).", - "suffix": f"File extension.{NOTE_SEP}Equivalent to ``Pathlib.Path.suffix``", - "tag": "``vega-datasets`` release version.", + "suffix": "File extension/`Path.suffix`_.", + "tag": "Version identifier for a `vega-datasets release`_.", "url_npm": "Remote url used to access dataset.", } - metadata_doc = f"\n{indent}".join( - f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" - for param in metadata_schema + metadata_doc = ( + f"\n{indent}".join( + f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" + for param in metadata_schema + ) + + f"\n\n{links}" ) contents = ( From e6504546f89831930168e6bcaa7150f690ef4709 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 18:14:32 +0000 Subject: [PATCH 090/201] feat: Update to `v2.10.0`, fix tag inconsistency - Noticed one branch that missed the join to `npm` - Moved the join to `.tags()` and added a doc - https://github.com/vega/vega-datasets/releases/tag/v2.10.0 --- altair/datasets/_metadata/metadata.parquet | Bin 18641 -> 19128 bytes altair/datasets/_typing.py | 1 + tools/datasets/_metadata/tags.parquet | Bin 6200 -> 6247 bytes tools/datasets/_metadata/tags_npm.parquet | Bin 2596 -> 2597 bytes tools/datasets/github.py | 43 ++++++++++++++------- 5 files changed, 31 insertions(+), 13 deletions(-) diff --git a/altair/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet index 5e7b3bd06439ace1d5eb8387efecde322cca91d7..969f64b18f44b812f11e0e1f34a58c6b592c994a 100644 GIT binary patch delta 7562 zcmb7J1z1#1zdpNkFR=?t=Mo|zNP{%ef`BLu0*Z7fQfEOLq*IWRk`NF@8l+1>KtPag zr9)EoBL3_9?!Di0@7a0I&Y5TC-PzwebDsB|-=|nebt0tP7D~{@2_Glsg`e@pJ40Du zbNCEMKo5ll0Kge91n*O2HdGr520*eYe9Q&EL>g31>ZmZ^Q07uh!Yg4hGK%9TjVotu 
diff --git a/tools/datasets/github.py b/tools/datasets/github.py
index 6bde876ae..3e57cd469 100644
--- a/tools/datasets/github.py
+++ b/tools/datasets/github.py
@@ -326,10 +326,33 @@ def delay(self, rate_limit: ParsedRateLimit | None = None, /) ->
float: return self.req.delay(is_auth=limit["is_auth"]) def tags( - self, n_head: int | None = None, *, warn_lower: bool = False + self, + n_head: int | None = None, + *, + npm_tags: pl.DataFrame | pl.LazyFrame | None = None, + warn_lower: bool = False, ) -> pl.DataFrame: + """ + Get release info, enhance with `SemVer`_ context. + + Parameters + ---------- + n_head + Limit to most recent releases. + npm_tags + Used to remove any github-only releases. + warn_lower + Emit a warning if fewer than ``n_head`` tags were returned. + + .. _SemVer: + https://semver.org/#semantic-versioning-200 + """ tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) - return pl.DataFrame(self.parse.tags(tags)).pipe(semver.with_columns) + frame = pl.DataFrame(self.parse.tags(tags)).pipe(semver.with_columns) + if npm_tags is not None: + return frame.lazy().join(npm_tags.lazy().select("tag"), on="tag").collect() + else: + return frame def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: """Retrieve directory info for a given version ``tag``.""" @@ -394,29 +417,23 @@ def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: npm_tag_only = npm_tags.lazy().select("tag") fp = self._paths["tags"] if not limit["is_auth"] and limit["remaining"] <= self.req._TAGS_COST: - return ( - pl.scan_parquet(fp).join(npm_tag_only, on="tag", how="inner").collect() - ) + return pl.scan_parquet(fp).join(npm_tag_only, on="tag").collect() elif not fp.exists(): print(f"Initializing {fp!s}") - tags = ( - self.tags().lazy().join(npm_tag_only, on="tag", how="inner").collect() - ) + tags = self.tags(npm_tags=npm_tag_only) print(f"Collected {tags.height} new tags") return tags else: print("Checking for new tags") prev = pl.scan_parquet(fp) - latest = ( - self.tags(1).lazy().join(npm_tag_only, on="tag", how="inner").collect() - ) + latest = self.tags(1, npm_tags=npm_tag_only) if latest.equals(prev.pipe(semver.sort).head(1).collect()): print(f"Already up-to-date {fp!s}") return prev.collect() print(f"Refreshing {fp!s}") prev_eager = prev.collect() tags = ( - pl.concat((self.tags(), prev_eager), how="vertical") + pl.concat((self.tags(npm_tags=npm_tag_only), prev_eager)) .unique("sha") .pipe(semver.sort) ) @@ -434,7 +451,7 @@ def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: raise NotImplementedError(rate_limit, cost) print( f"Collecting metadata for {n} missing releases.\n" - f"Using {self.delay(rate_limit)}[ms] between requests ..." + f"Using {self.delay(rate_limit):.2f}[ms] between requests ..." ) dfs: list[pl.DataFrame] = [] for tag in tags: From 72296b0e630dad0d2d7c397c6e4887d74c537846 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 18:25:13 +0000 Subject: [PATCH 091/201] refactor: Tidying up `tools.datasets` --- tools/datasets/github.py | 31 ++++++++++++++----------------- tools/datasets/semver.py | 19 ++++++++----------- 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 3e57cd469..385ac1079 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -13,6 +13,7 @@ from typing import IO, TYPE_CHECKING, Any, ClassVar, Literal, TypeVar, cast import polars as pl +from polars import col from tools.datasets import semver from tools.datasets.models import ( @@ -171,9 +172,9 @@ def _request(self, url: str, /, *, raw: bool = False) -> Request: See `Media types`_. .. 
_personal access token: - https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens + https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens .. _Media types: - https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types + https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types """ headers: MutableMapping[str, str] = {"X-GitHub-Api-Version": self._VERSION} if tok := os.environ.get(self._ENV_VAR): @@ -267,7 +268,6 @@ class GitHub: https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree .. _rate_limit: https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user - """ _opener: ClassVar[OpenerDirector] = urllib.request.build_opener(_ErrorHandler) @@ -359,17 +359,16 @@ def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: trees = self.req.trees(tag) tag_v = self.parse.tag_from_str(tag) if _is_str(tag) else tag["tag"] parsed = self.parse.trees(trees, tag=tag_v) + url = pl.concat_str( + pl.lit(self._npm_cdn_url), + col("tag"), + pl.lit(f"/{_DATA}/"), + col("file_name"), + ) df = ( - pl.DataFrame(parsed) - .lazy() - .with_columns(name_collision=pl.col("dataset_name").is_duplicated()) + pl.LazyFrame(parsed) .with_columns( - url_npm=pl.concat_str( - pl.lit(self._npm_cdn_url), - pl.col("tag"), - pl.lit(f"/{_DATA}/"), - pl.col("file_name"), - ) + name_collision=col("dataset_name").is_duplicated(), url_npm=url ) .collect() ) @@ -397,12 +396,10 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: result = self._trees_batched(_iter_rows(gh_tags, stop, TP)) else: trees = ( - pl.scan_parquet(fp) - .with_columns(pl.col("tag").cast(pl.String)) - .collect() + pl.scan_parquet(fp).with_columns(col("tag").cast(pl.String)).collect() ) missing_trees = gh_tags.join( - trees.select(pl.col("tag").unique()), on="tag", how="anti" + trees.select(col("tag").unique()), on="tag", how="anti" ) if missing_trees.is_empty(): print(f"Already up-to-date {fp!s}") @@ -410,7 +407,7 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: else: fresh = self._trees_batched(_iter_rows(missing_trees, stop, TP)) result = pl.concat((trees, fresh)) - return result.with_columns(pl.col("tag").cast(semver.tag_enum(gh_tags))) + return result.with_columns(col("tag").cast(semver.tag_enum(gh_tags))) def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: limit = self.rate_limit(strict=True) diff --git a/tools/datasets/semver.py b/tools/datasets/semver.py index 57f6d509f..f18e1e992 100644 --- a/tools/datasets/semver.py +++ b/tools/datasets/semver.py @@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Literal import polars as pl +from polars import col if TYPE_CHECKING: from typing import TypeVar @@ -24,14 +25,14 @@ CANARY: Literal["--canary"] = "--canary" -def with_columns(frame: _Frame, /, *, col_tag: str = "tag") -> _Frame: +def with_columns(frame: _Frame, /, *, tag: str = "tag") -> _Frame: """ Extracts components of a `SemVer`_ string into sortable columns. .. _SemVer: https://semver.org/#backusnaur-form-grammar-for-valid-semver-versions """ - fields = pl.col(_SEM_VER_FIELDS) + fields = col(_SEM_VER_FIELDS) pattern = r"""(?x) v?(?[[:digit:]]*)\. (?[[:digit:]]*)\. @@ -39,12 +40,12 @@ def with_columns(frame: _Frame, /, *, col_tag: str = "tag") -> _Frame: (\-(next)?(beta)?\.)? 
(?[[:digit:]]*)? """ - sem_ver = pl.col(col_tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) + sem_ver = col(tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) ldf = ( frame.lazy() .with_columns(sem_ver) .with_columns(pl.when(fields.str.len_chars() > 0).then(fields).cast(pl.Int64)) - .with_columns(is_pre_release=pl.col("pre_release").is_not_null()) + .with_columns(is_pre_release=col("pre_release").is_not_null()) ) if isinstance(frame, pl.DataFrame): return ldf.collect() @@ -52,14 +53,10 @@ def with_columns(frame: _Frame, /, *, col_tag: str = "tag") -> _Frame: return ldf -def tag_enum(frame: _Frame, /, *, col_tag: str = "tag") -> pl.Enum: - """Extract an **ascending** order ``pl.Enum`` from ``col_tag``.""" +def tag_enum(frame: _Frame, /, *, tag: str = "tag") -> pl.Enum: + """Extract an **ascending** order ``pl.Enum`` from ``tag``.""" return pl.Enum( - frame.lazy() - .pipe(sort, descending=False) - .select(col_tag) - .collect() - .get_column(col_tag) + frame.lazy().pipe(sort, descending=False).select(tag).collect().get_column(tag) ) From ca1b500c220a5ef7042bac75070d679696923cc8 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 18:57:38 +0000 Subject: [PATCH 092/201] revert: Remove tags schema files --- tools/datasets/_metadata/tags-schema.json | 10 ---------- tools/datasets/_metadata/tags_npm-schema.json | 8 -------- 2 files changed, 18 deletions(-) delete mode 100644 tools/datasets/_metadata/tags-schema.json delete mode 100644 tools/datasets/_metadata/tags_npm-schema.json diff --git a/tools/datasets/_metadata/tags-schema.json b/tools/datasets/_metadata/tags-schema.json deleted file mode 100644 index 80f248a66..000000000 --- a/tools/datasets/_metadata/tags-schema.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "tag": "str", - "sha": "str", - "trees_url": "str", - "major": "int", - "minor": "int", - "patch": "int", - "pre_release": "int", - "is_pre_release": "bool" -} \ No newline at end of file diff --git a/tools/datasets/_metadata/tags_npm-schema.json b/tools/datasets/_metadata/tags_npm-schema.json deleted file mode 100644 index 90ea9d52e..000000000 --- a/tools/datasets/_metadata/tags_npm-schema.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "tag": "str", - "major": "int", - "minor": "int", - "patch": "int", - "pre_release": "int", - "is_pre_release": "bool" -} \ No newline at end of file From 5bd70d11bce05e75ffce42274ffe5307aaf5cf21 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 19:21:24 +0000 Subject: [PATCH 093/201] ci: Introduce `datasets` refresh to `generate_schema_wrapper` Unrelated to schema, but needs to hook in somewhere --- tools/datasets/__init__.py | 21 ++++++++++++++++++--- tools/generate_schema_wrapper.py | 3 +++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 5e2ca1dd7..b0730bd32 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -50,6 +50,7 @@ def __init__( self, out_dir_tools: Path, out_dir_altair: Path, + out_fp_typing: Path, *, write_schema: bool, trees_gh: str = "metadata", @@ -78,6 +79,7 @@ def __init__( "gh_trees": self.github._paths["trees"], } ) + self._fp_typing: Path = out_fp_typing @property def github(self) -> GitHub: @@ -87,8 +89,16 @@ def github(self) -> GitHub: def npm(self) -> Npm: return self._npm - def refresh(self) -> pl.DataFrame: - """Update and sync all metadata files.""" + def refresh(self, *, 
include_typing: bool = False) -> pl.DataFrame: + """ + Update and sync all dataset metadata files. + + Parameters + ---------- + include_typing + Regenerate ``altair.datasets._typing``. + """ + print("Syncing datasets ...") npm_tags = self.npm.tags() self.write_parquet(npm_tags, self._paths["npm_tags"]) @@ -97,6 +107,9 @@ def refresh(self) -> pl.DataFrame: gh_trees = self.github.refresh_trees(gh_tags) self.write_parquet(gh_trees, self._paths["gh_trees"]) + + if include_typing: + self.generate_typing(self._fp_typing) return gh_trees def reset(self) -> None: @@ -218,9 +231,11 @@ def generate_typing(self, output: Path, /) -> None: ruff.write_lint_format(output, contents) +_alt_datasets = Path(__file__).parent.parent.parent / "altair" / "datasets" app = Application( Path(__file__).parent / "_metadata", - Path(__file__).parent.parent.parent / "altair" / "datasets" / "_metadata", + _alt_datasets / "_metadata", + _alt_datasets / "_typing.py", write_schema=False, ) diff --git a/tools/generate_schema_wrapper.py b/tools/generate_schema_wrapper.py index e024c2ca1..39b672082 100644 --- a/tools/generate_schema_wrapper.py +++ b/tools/generate_schema_wrapper.py @@ -1373,6 +1373,8 @@ def generate_encoding_artifacts( def main() -> None: + from tools import datasets + parser = argparse.ArgumentParser( prog="generate_schema_wrapper.py", description="Generate the Altair package." ) @@ -1387,6 +1389,7 @@ def main() -> None: output=EXPR_FILE, header=HEADER_COMMENT, ) + datasets.app.refresh(include_typing=True) # The modules below are imported after the generation of the new schema files # as these modules import Altair. This allows them to use the new changes From 012f98b9516ddb05dfb6888e802f3d0c894f206f Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 19:34:28 +0000 Subject: [PATCH 094/201] docs: Add `tools.datasets.Application` doc --- tools/datasets/__init__.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index b0730bd32..f66c22795 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -42,8 +42,27 @@ class Application: """ Top-level context. - When ``write_schema``, addtional ``...-schema.json`` files are produced - that describes the metadata columns. + Parameters + ---------- + out_dir_tools, out_dir_altair + Directories to store ``.parquet`` metadata files. + out_fp_typing + Path to write metadata-derived typing module. + write_schema + Produce addtional ``...-schema.json`` files that describe table columns. + trees_gh + ``GitHub.trees`` metadata file name. + tags_gh + ``GitHub.tags`` metadata file name. + tags_npm + ``Npm.tags`` metadata file name. + kwds_gh, kwds_npm + Arguments passed to corresponding constructor. 
+ + See Also + -------- + - tools.datasets.github.GitHub + - tools.datasets.npm.Npm """ def __init__( From 5e677c05447e177a5bcd78086a2f080584b731e9 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 20:10:19 +0000 Subject: [PATCH 095/201] revert: Remove comment --- tests/utils/test_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index 36ed1b097..2e8ae1214 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -137,7 +137,6 @@ def test_sanitize_pyarrow_table_columns() -> None: ) # Create pyarrow table with explicit schema so that date32 type is preserved - # error: Argument 1 to "schema" has incompatible type "list[object]"; expected "Iterable[Field[Any]] | Iterable[tuple[str, DataType]] | Mapping[str, DataType]" [arg-type] pa_table = pa.Table.from_pandas( df, pa.schema( From a99d2c924786f3a2585f2f84bc4641002f9bafce Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 10:44:10 +0000 Subject: [PATCH 096/201] docs: Add a table preview to `Metadata` --- altair/datasets/_typing.py | 36 ++++++++++++++++++++++++++++++++ tools/datasets/__init__.py | 42 +++++++++++++++++++++++++++++++++++++- 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index 270ac9ab8..c13f847c0 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -184,6 +184,42 @@ class Metadata(TypedDict, total=False): https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix .. _vega-datasets release: https://github.com/vega/vega-datasets/releases + + Examples + -------- + ``Metadata`` keywords form constraints to filter a table like the below sample: + + ``` + shape: (2_879, 9) + ┌───────────┬──────────┬──────────┬──────────┬───┬────────┬─────────┬──────────┐ + │ dataset_n ┆ ext_supp ┆ file_nam ┆ name_col ┆ … ┆ suffix ┆ tag ┆ url_npm │ + │ a… ┆ or… ┆ e ┆ li… ┆ ┆ --- ┆ --- ┆ --- │ + │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ str ┆ enum ┆ str │ + │ str ┆ bool ┆ str ┆ bool ┆ ┆ ┆ ┆ │ + ╞═══════════╪══════════╪══════════╪══════════╪═══╪════════╪═════════╪══════════╡ + │ cars ┆ true ┆ cars.jso ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ + │ ┆ ┆ n ┆ ┆ ┆ ┆ ┆ cd… │ + │ flights-2 ┆ true ┆ flights- ┆ true ┆ … ┆ .arrow ┆ v1.31.1 ┆ https:// │ + │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ + │ flights-2 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v2.9.0 ┆ https:// │ + │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ + │ unemploym ┆ true ┆ unemploy ┆ false ┆ … ┆ .json ┆ v2.7.0 ┆ https:// │ + │ e… ┆ ┆ me… ┆ ┆ ┆ ┆ ┆ cd… │ + │ ffox ┆ false ┆ ffox.png ┆ false ┆ … ┆ .png ┆ v2.5.2 ┆ https:// │ + │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ cd… │ + │ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │ + │ flights-a ┆ true ┆ flights- ┆ false ┆ … ┆ .csv ┆ v1.18.0 ┆ https:// │ + │ i… ┆ ┆ ai… ┆ ┆ ┆ ┆ ┆ cd… │ + │ income ┆ true ┆ income.j ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ + │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ + │ burtin ┆ true ┆ burtin.j ┆ false ┆ … ┆ .json ┆ v2.8.0 ┆ https:// │ + │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ + │ flights-5 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v1.8.0 ┆ https:// │ + │ k ┆ ┆ 5k… ┆ ┆ ┆ ┆ ┆ cd… │ + │ wheat ┆ true ┆ wheat.js ┆ false ┆ … ┆ .json ┆ v1.18.0 ┆ https:// │ + │ ┆ ┆ on ┆ ┆ ┆ ┆ ┆ cd… │ + └───────────┴──────────┴──────────┴──────────┴───┴────────┴─────────┴──────────┘ + ``` """ dataset_name: str diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index f66c22795..44c766850 100644 --- a/tools/datasets/__init__.py +++ 
b/tools/datasets/__init__.py @@ -204,6 +204,45 @@ def generate_typing(self, output: Path, /) -> None: f".. _Path.suffix:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix\n" f".. _vega-datasets release:\n{indent * 2}https://github.com/vega/vega-datasets/releases" ) + import textwrap + + examples = f"""\ + Examples + -------- + ``{METADATA_TD}`` keywords form constraints to filter a table like the below sample: + + ``` + shape: (2_879, 9) + ┌───────────┬──────────┬──────────┬──────────┬───┬────────┬─────────┬──────────┐ + │ dataset_n ┆ ext_supp ┆ file_nam ┆ name_col ┆ … ┆ suffix ┆ tag ┆ url_npm │ + │ a… ┆ or… ┆ e ┆ li… ┆ ┆ --- ┆ --- ┆ --- │ + │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ str ┆ enum ┆ str │ + │ str ┆ bool ┆ str ┆ bool ┆ ┆ ┆ ┆ │ + ╞═══════════╪══════════╪══════════╪══════════╪═══╪════════╪═════════╪══════════╡ + │ cars ┆ true ┆ cars.jso ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ + │ ┆ ┆ n ┆ ┆ ┆ ┆ ┆ cd… │ + │ flights-2 ┆ true ┆ flights- ┆ true ┆ … ┆ .arrow ┆ v1.31.1 ┆ https:// │ + │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ + │ flights-2 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v2.9.0 ┆ https:// │ + │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ + │ unemploym ┆ true ┆ unemploy ┆ false ┆ … ┆ .json ┆ v2.7.0 ┆ https:// │ + │ e… ┆ ┆ me… ┆ ┆ ┆ ┆ ┆ cd… │ + │ ffox ┆ false ┆ ffox.png ┆ false ┆ … ┆ .png ┆ v2.5.2 ┆ https:// │ + │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ cd… │ + │ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │ + │ flights-a ┆ true ┆ flights- ┆ false ┆ … ┆ .csv ┆ v1.18.0 ┆ https:// │ + │ i… ┆ ┆ ai… ┆ ┆ ┆ ┆ ┆ cd… │ + │ income ┆ true ┆ income.j ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ + │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ + │ burtin ┆ true ┆ burtin.j ┆ false ┆ … ┆ .json ┆ v2.8.0 ┆ https:// │ + │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ + │ flights-5 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v1.8.0 ┆ https:// │ + │ k ┆ ┆ 5k… ┆ ┆ ┆ ┆ ┆ cd… │ + │ wheat ┆ true ┆ wheat.js ┆ false ┆ … ┆ .json ┆ v1.18.0 ┆ https:// │ + │ ┆ ┆ on ┆ ┆ ┆ ┆ ┆ cd… │ + └───────────┴──────────┴──────────┴──────────┴───┴────────┴─────────┴──────────┘ + ``` + """ descriptions: dict[str, str] = { "dataset_name": "Name of the dataset/`Path.stem`_.", @@ -221,7 +260,8 @@ def generate_typing(self, output: Path, /) -> None: f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" for param in metadata_schema ) - + f"\n\n{links}" + + f"\n\n{links}\n\n" + f"{textwrap.indent(textwrap.dedent(examples), indent)}" ) contents = ( From 7e6da39db8f9bbb691c5a734b2ed96e953fe35f4 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 11:49:30 +0000 Subject: [PATCH 097/201] docs: Add examples for `Loader.__call__` --- altair/datasets/__init__.py | 88 ++++++++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 7 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index d6acbf4c2..d3a93cfa7 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -35,6 +35,7 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): from altair.datasets import Loader data = Loader.with_backend("polars") + >>> data # doctest: +SKIP Loader[polars] .. 
_vega-datasets: @@ -96,7 +97,7 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: data = Loader.with_backend("polars") cars = data("cars") - type(cars) + >>> type(cars) # doctest: +SKIP polars.dataframe.frame.DataFrame Using ``pandas``: @@ -104,7 +105,7 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: data = Loader.with_backend("pandas") cars = data("cars") - type(cars) + >>> type(cars) # doctest: +SKIP pandas.core.frame.DataFrame Using ``pandas``, backed by ``pyarrow`` dtypes: @@ -112,10 +113,10 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: data = Loader.with_backend("pandas[pyarrow]") cars = data("cars", tag="v1.29.0") - type(cars) + >>> type(cars) # doctest: +SKIP pandas.core.frame.DataFrame - cars.dtypes + >>> cars.dtypes # doctest: +SKIP Name string[pyarrow] Miles_per_Gallon double[pyarrow] Cylinders int64[pyarrow] @@ -131,7 +132,6 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: obj._reader = backend(backend_name) return obj - # TODO: docs (examples) def __call__( self, name: DatasetName | LiteralString, @@ -163,6 +163,80 @@ def __call__( https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix .. _vega-datasets release: https://github.com/vega/vega-datasets/releases + + Examples + -------- + Using ``polars``: + + from altair.datasets import Loader + + data = Loader.with_backend("polars") + source = data("stocks", tag="v2.10.0") + + >>> source.columns # doctest: +SKIP + ['symbol', 'date', 'price'] + + >>> source # doctest: +SKIP + shape: (560, 3) + ┌────────┬────────────┬────────┐ + │ symbol ┆ date ┆ price │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ f64 │ + ╞════════╪════════════╪════════╡ + │ MSFT ┆ Jan 1 2000 ┆ 39.81 │ + │ MSFT ┆ Feb 1 2000 ┆ 36.35 │ + │ MSFT ┆ Mar 1 2000 ┆ 43.22 │ + │ MSFT ┆ Apr 1 2000 ┆ 28.37 │ + │ MSFT ┆ May 1 2000 ┆ 25.45 │ + │ … ┆ … ┆ … │ + │ AAPL ┆ Nov 1 2009 ┆ 199.91 │ + │ AAPL ┆ Dec 1 2009 ┆ 210.73 │ + │ AAPL ┆ Jan 1 2010 ┆ 192.06 │ + │ AAPL ┆ Feb 1 2010 ┆ 204.62 │ + │ AAPL ┆ Mar 1 2010 ┆ 223.02 │ + └────────┴────────────┴────────┘ + + Using ``pandas``: + + data = Loader.with_backend("pandas") + source = data("stocks", tag="v2.10.0") + + >>> source.columns # doctest: +SKIP + Index(['symbol', 'date', 'price'], dtype='object') + + >>> source # doctest: +SKIP + symbol date price + 0 MSFT Jan 1 2000 39.81 + 1 MSFT Feb 1 2000 36.35 + 2 MSFT Mar 1 2000 43.22 + 3 MSFT Apr 1 2000 28.37 + 4 MSFT May 1 2000 25.45 + .. ... ... ... 
+ 555 AAPL Nov 1 2009 199.91 + 556 AAPL Dec 1 2009 210.73 + 557 AAPL Jan 1 2010 192.06 + 558 AAPL Feb 1 2010 204.62 + 559 AAPL Mar 1 2010 223.02 + + [560 rows x 3 columns] + + Using ``pyarrow``: + + data = Loader.with_backend("pyarrow") + source = data("stocks", tag="v2.10.0") + + >>> source.column_names # doctest: +SKIP + ['symbol', 'date', 'price'] + + >>> source # doctest: +SKIP + pyarrow.Table + symbol: string + date: string + price: double + ---- + symbol: [["MSFT","MSFT","MSFT","MSFT","MSFT",...,"AAPL","AAPL","AAPL","AAPL","AAPL"]] + date: [["Jan 1 2000","Feb 1 2000","Mar 1 2000","Apr 1 2000","May 1 2000",...,"Nov 1 2009","Dec 1 2009","Jan 1 2010","Feb 1 2010","Mar 1 2010"]] + price: [[39.81,36.35,43.22,28.37,25.45,...,199.91,210.73,192.06,204.62,223.02]] """ return self._reader.dataset(name, suffix, tag=tag, **kwds) @@ -203,7 +277,7 @@ def url( from altair.datasets import Loader data = Loader.with_backend("polars") - data.url("cars", tag="v2.9.0") + >>> data.url("cars", tag="v2.9.0") # doctest: +SKIP 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json' We can pass the result directly to a chart: @@ -231,7 +305,7 @@ def cache_dir(self) -> Path | None: data = Loader.with_backend("polars") data.cache_dir = Path.home() / ".altair_cache" - data.cache_dir.relative_to(Path.home()).as_posix() + >>> data.cache_dir.relative_to(Path.home()).as_posix() # doctest: +SKIP '.altair_cache' """ return self._reader._cache From b49e679e58729930513a54d13f039039bc9a0837 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 12:02:43 +0000 Subject: [PATCH 098/201] refactor: Rename `DatasetName` -> `Dataset`, `VersionTag` -> `Version` --- altair/datasets/__init__.py | 10 +++++----- altair/datasets/_readers.py | 15 ++++++--------- altair/datasets/_typing.py | 6 +++--- tests/test_datasets.py | 10 ++++------ tools/datasets/__init__.py | 4 ++-- 5 files changed, 20 insertions(+), 25 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index d3a93cfa7..3760a4f2a 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -21,7 +21,7 @@ else: from typing_extensions import LiteralString from altair.datasets._readers import _Backend - from altair.datasets._typing import DatasetName, Extension, VersionTag + from altair.datasets._typing import Dataset, Extension, Version __all__ = ["Loader", "data"] @@ -134,10 +134,10 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: def __call__( self, - name: DatasetName | LiteralString, + name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | None = None, + tag: Version | None = None, **kwds: Any, ) -> IntoDataFrameT: """ @@ -242,10 +242,10 @@ def __call__( def url( self, - name: DatasetName | LiteralString, + name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | None = None, + tag: Version | None = None, ) -> str: """ Return the address of a remote dataset. 
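(Illustrative sketch, not part of the patch: how the renamed ``Dataset``/``Version`` aliases
are intended to pair with ``Loader`` after this commit. It assumes the ``polars`` backend and
the ``"cars"``/``"v2.10.0"`` values shown in the docstring examples above, and that the aliases
remain importable from the private ``altair.datasets._typing`` module.)

    from altair.datasets import Loader
    from altair.datasets._typing import Dataset, Version

    # Annotating with the generated Literal aliases lets a type checker flag
    # misspelled dataset names or unknown release tags before any request is made.
    name: Dataset = "cars"
    tag: Version = "v2.10.0"

    data = Loader.with_backend("polars")
    cars = data(name, tag=tag)     # load the dataset as a polars DataFrame
    url = data.url(name, tag=tag)  # or only resolve the remote CDN address
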
diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 57b290c32..9b0e7007c 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -60,7 +60,7 @@ else: from typing_extensions import TypeAlias - from altair.datasets._typing import DatasetName, Extension, Metadata, VersionTag + from altair.datasets._typing import Dataset, Extension, Metadata, Version from altair.vegalite.v5.schema._typing import OneOrSeq _ExtensionScan: TypeAlias = Literal[".parquet"] @@ -129,10 +129,10 @@ def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: def dataset( self, - name: DatasetName | LiteralString, + name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | None = None, + tag: Version | None = None, **kwds: Any, ) -> IntoDataFrameT: df = self.query(**validate_constraints(name, suffix, tag)) @@ -156,10 +156,10 @@ def dataset( def url( self, - name: DatasetName | LiteralString, + name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | None = None, + tag: Version | None = None, ) -> str: frame = self.query(**validate_constraints(name, suffix, tag)) url = nw.to_py_scalar(frame.item(0, "url_npm")) @@ -398,10 +398,7 @@ def _parse_predicates_constraints( def validate_constraints( - name: DatasetName | LiteralString, - suffix: Extension | None, - tag: VersionTag | None, - /, + name: Dataset | LiteralString, suffix: Extension | None, tag: Version | None, / ) -> Metadata: constraints: Metadata = {} suffixes = ".csv", ".json", ".tsv", ".arrow" diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index c13f847c0..e9546d2b1 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -17,9 +17,9 @@ from typing_extensions import TypeAlias -__all__ = ["DatasetName", "Extension", "Metadata", "VersionTag"] +__all__ = ["Dataset", "Extension", "Metadata", "Version"] -DatasetName: TypeAlias = Literal[ +Dataset: TypeAlias = Literal[ "airports", "annual-precip", "anscombe", @@ -95,7 +95,7 @@ "world-110m", "zipcodes", ] -VersionTag: TypeAlias = Literal[ +Version: TypeAlias = Literal[ "v2.10.0", "v2.9.0", "v2.8.1", diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 1b866cf58..6d349dc9b 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -12,7 +12,7 @@ from narwhals.stable import v1 as nw from altair.datasets import Loader -from altair.datasets._typing import DatasetName +from altair.datasets._typing import Dataset from tests import skip_requires_pyarrow, slow if TYPE_CHECKING: @@ -333,9 +333,7 @@ def test_reader_cache( @pytest.mark.parametrize("fallback", ["polars", None]) @skip_requires_pyarrow def test_pyarrow_read_json( - fallback: _Polars | None, - dataset: DatasetName, - monkeypatch: pytest.MonkeyPatch, + fallback: _Polars | None, dataset: Dataset, monkeypatch: pytest.MonkeyPatch ) -> None: monkeypatch.setenv(CACHE_ENV_VAR, "") monkeypatch.delitem(sys.modules, "pandas", raising=False) @@ -348,9 +346,9 @@ def test_pyarrow_read_json( @datasets_debug -@pytest.mark.parametrize("name", get_args(DatasetName)) +@pytest.mark.parametrize("name", get_args(Dataset)) def test_all_datasets( - name: DatasetName, polars_loader: Loader[pl.DataFrame, pl.LazyFrame] + name: Dataset, polars_loader: Loader[pl.DataFrame, pl.LazyFrame] ) -> None: """Ensure all annotated datasets can be loaded with the most reliable backend.""" frame = polars_loader(name) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 44c766850..c1c7e0655 100644 --- 
a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -182,8 +182,8 @@ def generate_typing(self, output: Path, /) -> None: .to_series() ) indent = " " * 4 - NAME = "DatasetName" - TAG = "VersionTag" + NAME = "Dataset" + TAG = "Version" EXT = "Extension" METADATA_TD = "Metadata" DESCRIPTION_DEFAULT = "_description_" From 7a14394093cba4b78613f0afe0754a8d0886d966 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 12:18:49 +0000 Subject: [PATCH 099/201] fix: Ensure latest `[tag]` appears first When updating from `v2.9.0` -> `v2.10.0`, new tags were appended to the bottom. This invalidated an assumption in `Loader.(dataset|url)` that the first result is the latest --- altair/datasets/_metadata/metadata.parquet | Bin 19128 -> 18921 bytes tools/datasets/github.py | 7 ++++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/altair/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet index 969f64b18f44b812f11e0e1f34a58c6b592c994a..d47c4ebed0528df5c68dedf307f03f66fec5e63f 100644 GIT binary patch delta 12802 zcma)?2Uru|y6)2m9YP5Z2oNAt5vkI9?;VjQy$MPeX$id}5I{hY-m8F$i1aSKD2Pgz zA_#~gO}J72d+&3edu~~oJekZ|Yi2U@{_?%wtkseU+`<7H+(E!K+-(pKfhq!k=Elv_ zBXP~3A+Wvm7d zT~f>{FkXX;R;$ma%Tbg`ZvcLy}YYVx|vJiiIWiBDqt~ zd)NJ>ZN`6sq0vg8lbn0D0u{hYU0w?4lmvp}OrbDUlAjS6o~WkKYnWPKU_?uj)8lP2?D}OI2J;Gl{v~clAVIZrc8A$!NDUEpom#qXmEG|v5lZpGAgR$ zu@lj_;-j|)IR>F61eIkBTDGC}f{EUdR3oRbEb@Xc1hznf;2|rhD9oEO@rtU)@Hti@ z&hZQ@i3SKL4S~N9f}qjbBJNAj{laY9RO2t{FP{4)VGd@=+$X=PU6cicPy3vdZ1MUK2}2oM7Uj36+&31+Y#EDB_WC5_Hf;(chZ zJgH=-Nv-%Xgqlv|Wa+_#0H~GqYgdKSMOb0>=LfdVtwe5bZ4Z8u0@z~w0azj!%7FU= zwCiAmsM%?8UU7=aUez*%K}Q?t^<^`>v;uV&3&e#(Ss;%=Tsuw0&_xjNF)or=kbsjB zpu&sSxIT`MBnu}8X2np%tFBTJ|8GLhNLY^;a=rdIWrN02-w1zO-yQzKD*o0fOB&mM zROR#9wh>Y5{OaGT#8VNd@>|M^Zg?qc|CaI}CI1#B0x!xFLFJ;)j?KA;M4uPjCuoyb}=fW{}#}C`lS2Vf{47WmqwHziiey~#lCVR zwL((ZV)TCSXNvfVqJuaH1PUp|gUSCdS>q&_$72!HV(R}!T3fAJ^6oh17q*Ly%0h-6 z7+zwlV8BQb*K3r`X6XBgk^xCw*vA0NcxvLTF@qF9)P8Q0^~D4GlXQiQNXwj0WClrJ zA$d&#t(0isJLD?1v77Vymd~i+Kis_GES`P*!X@ktUgz#9c%3VaQc3;MdGGIk2>gYe zMpl~fV-Rhh9uQq+(HTv1?@ON?`0^``S@E~jO9=g*+S&x%x?|_*1Sg}r;HO(rpvyvVtue~r9uOl5;Q)9u1YY(V0}LZT7XJ+xr4(}ZF^YZMDp->lR;${ zw=qp0-vS4OS&XD4PL`|~Iea_w=jm6BJ(K?sn9N*vwamgWVZko7^Fe6PCdN9J zwMpV58}n3}5bL&8bI#a)8%>tkF*@8rR7$(oN1scv zpkFSOhDIh!y|(gc(G6Mld~wcwkX|z|e`u<-@V&CuB0_!Or0?369^_vC=ktPvcU9Gq z37&gDLsvF3cN9Y$Jy^99+*tcXj11ewJms8QRYwuVk~8vS_JYH73&xMKYvwQmyzVF0 z_Fn7nbEyxnkZ=rD`30;w_%PJ@wbQ;BtvEF*(BZM#aYSgSIUx*9yxe_dF>T9wb% z`UNhoB|Rg>A7hU9B4TL9l6ZP2 zLgJ`AJk6m11=vA-f(11@PFJMK1V#(Q$nliGN)uYsFwi>U0XEYg4u&UusFc;dao^XF zdmOPO7t`%!p)9KqF8Oi`Wux5u&Xg&jwAe;cJ127Zxw~l`m;Vt(Otgn%uK{&t>xkB` z3!eIp*t>_LN_6#RpEiityX-;&zFi4P^uE>EX&Ru#wZ3?1Y8am8$q;(+vvJN96Ir=- zkCm@nh`@4E)=|o0$bT<3j>J#-aoEm2+;5q@a{4&aEx>ey(ppYdhAZvXme?qVZtBg~ z6_i6=8|f8^7|tPC4v^WU%H?Tqjh64TclVb@Kerc@_|;9=94K-Yt>~Vhv#(66>^_j9 zI4JZdOeBr&j+AF@A4=-BtpBj!z&$&2YIRSwPh{7#p{c`uooi7x)W4w69o073dOgT6CyA%iX=0$_QlrrE|$nAB)??J7K=^w>u=Qasgy~0 zF3|7Y)sBZ``0tFS!5ZtS-}Tm$-D#V7{o<_rkXdC}sb^x0FSg(fi=0Y}%&ikMmTm8? 
z$`Hpo9bIAiI_+`GC7HwNvllwaWuLf6>HDLzjL`gP4$ z!VqMSi)yA0LO0w?`C1a#PQQbUIlMS64d44S=zQ-W)w?Bnt;VkDG`~9{l+E9aPR69K zjuAwD5>q>V;D0av2)^N66UbJDkHYL7_)#Me!YI>fmXR#Wfh-uZ~t*suXkPG zdrsAmwB5m4-}wML&R*XDa1c?(hkA)PT8oL0dAWu@ai-Fg9kqlXngODGyKdcLL0^bS zmEEJCydI^VB)CHn==;b{(w*(lUf!+UDfB_cvT&QO^kme}C$zQ1v*OV;Cm8o{?{-7^ zTu>{gRrgP{N!UD|6$RJpBFCJPDxK=w!~K1QA~+Kp=2MN0@1FU`z9EsO@SNGx&3=JU zK1vc5p$nEGE-@&|kbQ`t%Tsq;rtt;`Ip^kvwh*s~XP|SQ+1dO?M7;5*D8j^Jg8?iTs~>FSV+Wq|Iu5MlZA0APmJSF9x)5;;ZX5| zym`g*qzAKeE88`&)4=#lA-kipcVZqGpyB8)fYCZ~19|xTjhD44%3fKU(@-}^ZM9@%?p!&hbZz&j2GpB}>iBz_!7TX&Eo^|jd?+zJSX;}jh;V>Wh{IY)q z6$-))AyHIfP>fq4T*{$9odaM*00=uEPQe=J^$s72{eA108&aFnC=0+* z@VCpQCL>f)dTJ|*q|{JJFEPo`b~eVff;e#245VyhVD`d5Va60fG`pq|-!NxW<-AwH z6G|R5W13bO4PR*jE}nq}L$Co%Gmt@$4}huD(53M8^!XVN34?J$TIysC&S~OIBh{N% z-;vVcMqR}g$RAGc2Ffl{GSL1PQ+Mi%*9I~Xj$V&K8 zF7M1C=4Fh^+&|8{qUE>qUhiEjMq7mfDauuq*;B`w zIyR~RKE69I@g5wrL+#SXK@DSoRgCpW7(SEBQIgHStr5kXV2O9+&`&Xc9XT11$%ftI z3XA-4u_$JGG!2n-y$;t{Laet#j7WgV2cIEV*Zu2{U*D3{b^mcQ88JTh*QFbJ<6U}T z*Me8rx@c7r&Fn}}e+oJIm2yLJIRr&N*^iJh%p8@J&E5^YRmMxm=ZcL; z=E@@b-_Bj)3QG^rjyv=HTpAx=)@0{z|2}6O+RjXv;>~P`YT(L1G=U|AI!$SPh`tjH z_G53KOdNRRXp+9(tp(jREYm$Ew~^^|(wV#t5)YyBBkJ$5M)pgPnPC-KvcsCccW6V& zAOtjkjwhE-dNi3O8t^Thg2Kb6OAz8Pk>EEe$Z+2K1e~cg<_S>td0p%#effus81j^2 zC8q))ghHafIT#K6(mr4IkbEkERV*A8{~w1B_I0)U>#KL}(KU#3J;&8D*}_{0bl`9Q z9f&jh!Qv=PHY4gk1Uz>HY;=gDE+loK%z^862f zJ?p*`7a!c{XaPVhUYlz zpQk)y3G3-2uGf)e8#MZs5?5q$p6Jgy{&$XfD2G;R%ms#$oDUuG2xI<9nN4y37lwgV zc!mwP(*LH+y*Hm)r6WdhTPHfo?AuN6|1kW+{of2f?Y;SV=zEBgbF5JCQ7af^f+x6W zit{hQ@lEzG(IUCr(}y`RDl=4-m_H9+xL*7w`2Cmb^0qKM!A*iFg-k*fc0TLr)HtcN z=Xij1gFk+@Kl(#&g-WURUwUy*bjDp6sSW?3H}}JDdfn<5`2V7d&zrG)lpj@0S{*CP z*r@!ghCF|x+lwcc0ah@qEKZdvfVnR$oA0Kf!tC>#+p^bp{+&r1tQS6t(|U-%hU20t z%=F*mr1rjW;?cbWb{6zsz>{mg>;LHW;yoQlKO^$&o|(Vo>fXVV8~^N`XV{`px*yFX z{%3cVrRdh2rKFJwASj?C-$S}*exv)7nZ~8h0fDi`<9pcaqvTaA zWVS9k-UnIa*Y+dL|4<8vL<#^84PGU%3Pe!Y-s6|dv9XSD!Fcl*HHW{#?y$g? 
zjKC&CBnwJkZ<=+MG{7nl!uPN(J+oLU8i3Q!569BR1b`;EC%uvzHktVG6NQN-kEtZ4 zxSL?F2S_aM{Rm-SODRt%yOVd`4^ji?^if5!1_>WiD5Js>)3pKlohg$xVcl}M+muzITy!;@>)nZRpP0?g-7{dmPT{Ap+II$d zUad#pa>XX^JkcogbJxH2ZexV6LjgB}dX91>}8F1Y#@EZ2El!zes&k{xFnUh|m z<2LK@{^L}ZZvrV3O6Y|lO zkq3C0T(}c?;|j!Oqo*{DM$F!QC-=3;o8((J$aN(zbZHl*H5p?`0N~PGMO&Cy_0a5g zODfQ;OSId9!JKpRmaxVA9D9lXIP@;&It@j8(iW4iqD5`<)x5n;MmwYT{Le7pyRG|U zR+-nY@i)jmwEIQqb}4L0p3{Ck;g-{jgpViHIQLI7&*NsfJ*D1qeQgU;LTZ{RSKXI= zR@Kn&G~A>(x25Ie`*=+D&Q`Z9#ciKiwnB9(-k*2PLeg2pjkAi^p>J*GkR;^v#F&@} zQnkhcL+qdxlFO#cfIoLwQK@%?cc3%D!uPPLrf7D=L64En#xc1%rk-Mcc;H^#a867R zeAMJ(Jfg$Z|D~^pRfnd3izMGq+3B!%o072y#_VYt;fJ$szXC1trXAdCJT9|so(?uk zV>>H^ZnBGT3LB-OHGOT>(29(?yBO-ueb*#51qp~E5iAB^M??#XMzI+X^%HfnW=A+w z`M{7>6$w?mHGEni$~&eid~>aQz>yygrZ^b|l%=_a>cW<5wFV4mP!}=nN6eppL-(Fr zJ7A?u$^mSsT;rn{_0&GX=;re48b%?vY&fxE!Fi2F?&HaXt}ur((<0Z~m}k^Y#4fdg zT&i>KD;f%x&-{(<^wE~MMiA67>RhM4L8;lcUbVKK^0Hgu^VDoqy0&%Sd?3*)&>*@@ z)$PqEzK6)6 z>@19xw^)zZ$GM0GdyofzG&N>yumW|;<|J?`4v${NrMeqhSXI5q^>D8V|{=sqYP%-S&Vc>3UNxZ z&a-r+FC(LNzlWg>-P7i|&384?J_4Og7}4vi*Zoq|uIdAwHtt&jfMHrnRkNvNVDe-O zLS&jm0x;Jwwz#RKpsOq)AcXCz*hhdHxYRUrDX8PpA;l052}PxS2gnr&8Sny_Oi5uD z?Ijqf62dx&Ee2moVxVu?yT`>|PK2T3ot{ljy)g`*dZiOvLa!aG`)>2c(#9UA9ETil z2#X)jiC9-+|6PcF?Az5Y`76h7$_(pz*~(tzvM%Px-q7%0PAYVT-(pJk3sOB9oT^@< znHDd9pk7&)lYT|CVMupsd5UzCqx7EI>o)CYSa`e1WR0LIlnx_PHn_deLmbuVZE2G0 zo{Fj{6Wk|ylz#QZ$Cw-pSfvH{Ryi&6`|};BxRx84-guIpdMU-%Um%wmAus8)+S;Gh$xe%Q^YFd0O}|Lq9r#n+f))!aB4-> z@3MV~FNEO{6?Va7-g;y&l7Q5MY!D6zHwkq_KHG5=-~!ss$dS4znod^QP#SU2qH%}3 zO{7HWP-h}8Z(z+QcdM1c?Ae5VPxY@{tVt&SehOxaPk3aXy1~g6%>RuzR57Sn87`>q zKy7ZM(fiz+0i2%KiY@B7BjHPweCG+#24&Y43N;PapY}R2Wz+siysz&rg-|q4h^%KqDuI z2l1E-9rRbDi63XQd(XbBt%tGxLN{FgU>)I%*YXQ$<ncm>{ zi?GXouzI=vKnGg|e?utUHz6z8pxiXTy8l7LT-3yFA4T;Vg8tNT=)zNHXd3mROPDr@ z;Ka)u<>dcl-PP|Zl6Az6#_pLN_i~1pkAgZpJa{4a;nA~a?!lOuEss~^GX(?BcK~Jv z1|wp2oV+)sUm4lX2F`OmTMbgq>%T_dIi(ZYyf0ucfU+Q%1MMzt`+iTVuet70UQafc zH0?;FKO$iBJo(ATOiw7;utbKFwB~cq)KqT?tB+5Ewuf9Ji}dCHzL?_a;q%Ddt3iX#u)WuGmZfA%wETH~Es z&znAVtpY0gJsO2+%Qhfz4ty*4>bu5*w|2hR3H#9w=iLlHJ-QmMgsn-7+QsvY(@yJ` z?c_HIi`ddzMl{ClO`EdbSAo~VD-w)r%cN)%`zr!=?9VE#F(kYa@Pa(Pvd_}=6*F!Y z9R99{n+ERIhIZ6a6a-X3gr*Aa0MpHg6BC*3ZzgOvI+d$q-9Sg~>{HC9!vL)_k@+v1 z4TFL?r0iuM>rBeN$S|<5VZY6W4=j2hSMA{mu91{|slt13sYDDle7A6HXl zQsvGHpo+1in3R4ewhNM#r3MABACHrJ=5CV73H5MuZe!`f9}W5en0W!m7qtN(v_a@uhuq*Ld3hWqyo+MetnDm|w5ROcLMm9$plF5>q%(51RNV`zb1dF})M7@pu4xFS94Y1;$6rJMR{Z&&AkAgMq}_Pcl4+L%+E2MaeANd}S!$5!01$az)T1w7VJ^pjcKb zS~!;7Tvyih{P>Cr?_NB;L-U@|0t+uuQ9v|^h8?t!Q#!~gJlj(tSpTn(_raYTzv zypm}hUaJtPk{joh_GazdftkS}Sz&=!y!jK1rY}VTzF(nJmik&tJ?mVrDV)>2?!6KSKan71A&o2wT48mF9i~oAsKuFXpS&;?*_IDi2M)85*g@jx@`R)A-(V*Wlc zFGn}ul9ZliL43>L)nr97{#Twr)uJ|Y20$`3#mYQDQ$jlRhTTNPN>rVoxtjJTVUY2u z`cn|SNO_=tPBb(&oYCHO?|~cLGJsDD4L^N6Va;rTlslv%WpXYkGcwB4SZ+Ef z|Md++kh|7R_uRo;ctPuI_2M~z!6jR3d%{5`{fS}C%6^)#q``gb%^&&2fmc~1EGoHmL0mXTj!yogcob7Q>4lxmDs6Ap=SZ~!}XDrQ+s%J3i z5By&Doj`MqdgqI6r|5Uj3$bfX8tK{@%*U%wFvCbNt)?UKA~uTksdULT>LS!3gGLvw z0!7Ne#<1z(k6>X-=@Rx9CDTjb22Xhjr26zdFzf=FdB0NOLsq&EAJ$B2+#>p4Zs4M81E|8aZO*VQv zLv_NN34Rrb-&|I7=1B~!e^&YJxb9ctMAOzG^NIoNHi#Vkuzl=n$?(9|#769jc=-kB zCvUuu`XVl@A~~~MmlPZl=kE9iZ}ZD`K(G4()o;j_%3f`p)6B|HkVaTDVYB;l|M^< z43>1!xICugjn{hZZ6Y7(*&zKbf-3VImzN$UQoCepEc4JH#@>r*aLG31lu0)dQ{L@; z-^&?VHqxx{8%0B+-=W4R9IZL^H)_y%*$HprD=7c@{yZ<|p8{6?V=eJ@r(h~7s^1X`-xK5U zC*1rAS-cAA?Ee(%zai|OSVTz-g%AON;8=WT`F~n~V&7NbM!(Rw;{%hf(tm!tx%M5Cch5diA1eo>vrbYPJV{dypil z0eS>8t|>Kb z9@8t*5kp^mH9CDRarg>is>aS*`bt{(+AMj3scN^S-b$yHjsLB|iYb%N&oQZA179-4 z-ykUAJCdnxquIX~z8Ri&(D6MwBFg?2+;TDI^TO&8-chn5C z*+3L~L|zobZTHc7m!wrqtvqDbNyn2DpEwBRDoJzE?VA$AunzB48HVu*RYEBR)`3?K 
zZAu~5ElTHI9?)X^u?!_0mQiy`7ID~_LX0;HA@oDp>ao< z?qDm5z0P`W_Lny;1R}+YfcbXNXSsuosny4vQpL#6fbilCiR{N@-5dmlrW>w7$Vu#H zZg4;%U<=4bZSB(x3dHcr8Cv}s`#~!jr;9708dUhar%kZZ@`$STMys2jv}8;@3);}3 z&yZ!pVAD`(CK*Lb#{)_1R1PWbMMNM|cD|tR3nOn3nxz?IKga1=N$Sh^X&ZbaV~h{- z$-=e^`oXIoP*?+advs~0D%2Hb>KhaK*%%Fz^{1pm{NKk>%3|Wt&y&1cHo6|zee;Mj z+9(CT-A4r!&5f{uQoi^J{>qNaXRjMHRq~C~Q<3BrZq+Vg-KM^`$KN-jT)gFMCifu% z^s6$}XJ4ri!_5a9_=IxDxUCNM4Kc`khiTUS*bB4zz)*7r)rDIEgyWuciIa>FO-Q)j z<<1%R3oK>GG{7Jfdi{zrgTHB7f=cgAD1KnuAU}!@uQ$KHSpjhsx+7J>u3Z$ac^jw{ zEpQAMT&78>f$ALcdxAp<-!9j3D>j-VkYDEag>T5_qTPKYdx7Uvdxk7r*e<(Q64_kW z5Z>3)IZLKEZ1{LYo=}WQ-rX16qI9!~(Hw_qe$XZ4nZdMpi7Oh+VqKpw1-imNaN}BlQAy*)R8GXl30d3W?Gr1?-V31vP{WglXb|9cfD>nidfy8~DrTq|Rgi)bCM)kw-R>iy%~( z%Q0cHb-+OH zIGL3@@}Pid_+vTFe+!yRQQn8}MSX-fEWlTOt7J;iDuH4`!8g5W08dl%Nrp8|z zC_kz;^z+J*k5(m*-BKvbcQZ`aU5eY(jh2CE#T9=7h+Z10FRz#cE1Yk>y|U%rC7qw5 z4xKI6G)PdlB{@((Hq#RbEk$Y{8#c7P8Dki-C2Lb*AZbc6aZnJU-NXhLl`$w{R=Fp` zXF7TrIE7HYD-ZRmQo-?!QMm;?xeS7YDkQQ?Wl>WT?1L~3|3+hAbPOk(RJu4SSD(=+ zj!{T&Mudc1Rf**(+J5&7@Gy#>L+Bwd7RpfivXmHDt?@MOJz!ag2Lh$5wMX<97ALZd z>44g}FV_vnzN+W=%l~>!^i#_Lb%S)pa`4L0SJ%9Ip7#T9@X>nNlGC2%3ZQR7MySU8 z1{YK~y(GqKRH-2x$uTKJC|#QhP`bXyZYh$J-OWVMf(SoDl@oxUAxh})<8kYbm*?Lh z?Cb_cx^K>0D;Yo;92I9`laoR<5YtRfhaaFuAnA7K;g3Q8tY2`#&;PakNEE&T{Ac?I zYj^_HKkH=y@O{dE*5l%U@G9s(U+U6;@QHuz34bv7H|O6qa1f$K1LAY{t{@oBqGdXr)Vu z{@rQ?--;c;b&Bg!W`X`^<}D>%*`MmDOCVegz%_@1sj#sB5A$N_+W#~g#y7+700i z7x4G#Lj?Q+{TQGEc0N9L?*RaGF)<)GBi$yxgD`;W0%xa+{5u@RAYAaLCN}~EmjH0h ooZ%1t|8q+j>01Bvo5J@i0RiZ+SZ4gZWG*7Qr0-34)U+V~3;G_F8~^|S delta 13034 zcmd^_byOVByXJ?%T?co!5G*(ZcZU#yy9EmtEcD>6gF|qa1VXT&!6CRya8C$Ma9EPB z{_eedcK7eC(`TlutGaro`uWT}Z}qzigKJ2FtFi;b&BI=TnGq>SK-!G3CRB3RM{qyD z#z*-+)n2xI&RK9VX6ED;Dv|3f&rS*l_Zh0rNsfcs9WdrBLXF!WBw4DBqe9lG97y*X_tH0**iDIPll{P&yizbyivjtW`#E^~R z0-o4<0wACpk1BufarsIzge^<+BA7OLdxuNOqhcds;$X zq8%gyHLa6204^8L*Mw^>N(ZP})R3cAmt~etMy{aI*H3til~6_79?a~TNMp~Qm>hPE zCE2XcUC&7k8k`6D>hqXr?@}j{5jZ=03c7)yGS?AwO%n4tZCwuRdxRv)>MHU^lMn{` zn2)p@(vDDM6GUXIFl2;x^H73m&2e53x&i0Q&?$&8%Bdf1bo|qRi*YL13@0Rqqol3vX4&A(Ytv8 z1fB^Q`5L6iHe(P9R|Pys8kd@ysjm`=7nfR)XBlyJ?z=tCDY&}m5i-(*CfZl;$#zX` z^^Oh*hy;Q}LX2tY>1OQe?(FR3;$~?9!63Y2CtN}#%mFj55sC1DnBm}v;Xoh|1OZZ0 zi!=&`OMr!-aw6W6fTWopM8B!aRG(eI`7rY}ix0Q;{q{9n;4jIF(EfMD|B~!)Vfsq` zBfd@V&lWs-Lmt1?`%Ao`U3o`g4~mB=qEi6W#MMc$!n!n5X*66dKBk}i z99+UCCdbQ_S;s|1!9Y>e{afWc^mZ}w3=^nHnw5a=qm(Ee*QKmqEKAXT^ppmJ6)5AO z?a&-va-8r&PN}@Df1yjoDL^k7$w|$Oi7ahaIv7_X$gMZz<+Js`@&QpdF-~F#JZ-hM zRf*tZbys6&`5?mFt&)T>yWs_%w#|a{{6D!$R+23jbQ-x0m?Q@t)E|^i_n`fL8v%-u zVnJ?PJ(09@x~T|;{Fs?*++Vtf{{F#eTGf)FR+nbhO8H0j+|W?%`F3$WJjL%}*I>!7 z4tSures?%jPGsaQ4v-TLToeaIkV~K$a%j9%D5$EzrOR~rY86CV+iUg}q zX9IYdWNCf7lEU^TOQ6r#1Sa1;YkOuWSjQ(}o*r8wVkixv<7}hTVy#*Nx;Ku`AvRbf zP;?1FP;g*CAqhVY59X->mcC-vj9K3}v{+GTx5q@6CN7?H{!8AzGhZ?0ma!zH0CtN( zju1t{xCEoXq(P`cfK)svpIn_7WC;e1cVa+Bw?QBbaU2cWHZy6ONoocY9=v##wh<=~ zkbkOwYqa;;^??7fvb;NNdHZ~x#6@}m8;x4M{M99A*K)N0sdPG6qk#Oyiq`eQ9TF`(f-)Zz?|tmCXM1TW0bz>Zk0I z>-kw-kowb9gbuQ)Ke1(Aq(7O-l_Yh$&W@CCUoBc2-XHNMGcWo6cQ0wKWF zsNqn+0gP3Ex;%{{#GURQ!vH@_(N%Be>E5 zofmr>&o2nqTcQ5tdx^>eZTIDQV7dj)tD6H$pn42RtN%buTh$z`F3+dW?PJ3CC2U;C zPKdzm>?+$%2bpH^5!=LGSt`{hUvE!k+P3kR>2m4RHQpLj34tRF8btSs2|lt26?frx zXW=iBw(Q=Yv}X*xkxH5c7qR1lJoi z0`P54o|J*)Z?TQB72rp3H7C0j+i&7%0JerZv{v8p&@O zD4_b-wxaIiu!_uw$`{Pvrf3`Y&kJhs%H<$(K3~xZIUn0}!e230k}fV&O>^KOo+Zeb z#;i4MQrK1um{8=>NEN*L{BCdFfK7Vc55tSu!#f87EAbiWhfY{Lo+nVRA-3ZaVRY1a z;OANlq_DG12_=p_tyXaUJQs^CLbgTys+Hyy*{51sj?;nXE+`O+Ryx&Fl^;u|xpH%P z3A8KD&hm_nUME?S)??}ckqM*(wS*rod4$TJD=70%kIpk-lL`BtSzo!H$7A7#Qf(Ah z5jHof3y7pT-w19pm3;&Bc(S#EUX7!+Xi#Yq@M37@ydxz0fhWAMcBs9x&2>vz>FwR( 
z{fvgY+jgPV6VO+rqXiV2E)lhnZIXX(6NxZkq)^z8vP8WCxThmIhE;m<>wj?K$~ z`eXH2CACdDS4+EuMrT9sCGd7y_EMXr9q$?6iv~`PuA5Rk&QlzqT39wJ9~Zq8TTOcq zkH@hC@BE9&3V~%(%z^N8wr8K8VT=_MrIPxy%|(pA>om!`H*~(r^D@QEn7St*+(R#| z`LL49>M@05)5mZBCgy#U(#Q7#vF4(5Ku+dRK~^n>s;19Hs-x1wIard zXLWKYLUsi5xIV%rNg>)^AR@43N+{oxkLYF-Ef1y)#ck{1$#jWK~F!!1(9 z)tSMgQ5AsVS2>ExkXaqu^LyQ5Gs1(X^ZHJC1j?xfxcja5ozXM1rWz3uTyh5GF)k%q za@Ka27f9*OiXJDFx|$~npB7X@=MDR^oK^MGY$a3MYIM4rfu^B>{Bc{Ava$%0YOYq+ zSD9>C9tya|7Q!yy4eAVaQI9egp_E{Tb2cp6U_xHwI&Ax+A=y&8rg*bA2^7(aKIzWU zOF|tTaeBGdPBqeQ{u%H~U#)aw^!l1-)zY0_EYn#wU^q>v@~*Zp$-I<2Q=xg;nttQ% zlRZEZ*3fEa1l%P^qUcxdEWN}JbtN&kODoqiSu}_;>8G{l+nAhO@LOE{&=Dg=cIJb> z<Ddh+bpb=)tsne^mKyD zvWl;@2hP)V^Cxu;LTu8WyI8?h1uCYs0(8uJNsE&wqdIFsv}IdQHk~36?Iuu0I-Wef zjz@_fmDp$e{_KZHCieh8w-9UQ$LF%Syq$se4fH?S{a1xr5)!Q~g(s&Z(iRjYI~_Z} z#Pjd2MK}N(2L{s-ubRDHr;UA~OK6f_ro2RQr{5F^#RROm{6$Aj@_IA7?^Sm=hC(dj z9tpXY;EG7*;P?iJ->jrzQMbZ@pde~62myow;=G58D+htXM8F^@90&}3_+MVZ27z%< zKsYGWAkaL{!-3zWy`-j+#J?Ro__2a0G6V$aVuD1$+==WFHz`28j~^_m8f^lyv}CRY zFFCfnG*4yg5;tOl^VR=d%)Zm0a0>FY7zwK~O2ov~fIlKYg_>Wn9O?G> zNqmU(MD3<`zfEn~S^8gA*(|&!J=%5mGRW>H*3ZhM45T3IW2n4%zTzLDAk6BwV;(Nt zro#*``qMEFI{tFZbvllI?bOKvY<4E%o{ZRG2)tzlG-Gaa4viKc=cfmTfCt~)g=a1E zTw;{_UI(l_{H1ZM6F}<)!j= z|1i&xngy2@D$X#5;RsG@5g4))nDp7Yfw$OroQ=?4odLI=!^_RTO*D|zAIJ|E&B{t; zYa{gdV$|(IDHF29*u1t|VpYGHFxI2b#{ZNv?MumFMDZV%I(6~EQj39%aChOSJ5Y?HNjWl z!iOx(EA@1|^M%(D&aDwQhT3B^5L)Ba=Wv3!-xxx{Pb8b+a`LeeWHT^U`8};dS%R#c zLo1Jii7?H{(fQ<^@BE~I4cb|x&x@LU0D_Tf&Qvfxz6AqsR&TXDegp>8ADw7URJ^?q zX%Ysh3;tyN0vaEnc>>?XV1WDO(DE|K6Q`!k#Lb!9R~u6Z*6#>$ z?=|5SA7cf-2+W^Im+T=ptiptRKvafI|Kp%_!Eew&K`=>jDJ0g{+64Vbun2N*7&3(s zZ*PhY!Y_91N^mr(R1vx^?CJZL=9U{K{{reF;P;1hyd$ zOs^9^Fg^5o_Yzu6*Ss{s#>o8(>&@R-xBd%j&8*Z1tN~ZHe9K>0?~{~C3s3xwHR{89 zSqaa7#aivZV=c!j%|419hY6ZFR$gOamH?%-a)|qr(kv0@kT`G$Ew9Y&YZ8yFVCiq7 zKPK%FAheHLxw#Fw>yxR%3}`Bh15leY#mmtZ7*7@PL}Jb383{;0@I)LRe4tPz%n~Lx zdvy1a4^&i@gD}Rx zifBv`ptz7e7&An*8)#*_HsQ-)dZK^6m7; zpy-w5Jld!WODQ-m5EqCOq@qLKQa3_jl#zVB(Ndmj7;N8al%sEh>h|&2@Vs@2UABgD ztc%rrNSsn)<>+C7DS$ZIox64j8Q+@*Ym!kvG2?x82;VX@kgKwj(=%Pmqe4EXM^DNo z+7a>0Xe|3&wS&+!MppUE*E;J_nd%N+%qKj#gCx6Vc5P{+7YuaiZNeh_OkYA9a#U#C zWv=x?2JjBKVpx|?>s^qnms*g(j|i_G<=1_BpFzm!6|R}XI2HCo^f||7#dY#m(^tR* z5i>QXEHBC@VN-653;q=Xu7Iu?PQG?b<&O4J%(TK39UKz43+H$$s_D9#fv}e`k*Iu0 z9rx(nkMSFyA@xj?_@BCZGANcf?vAfE&C*OTB@V2_iR<`^cyTGQIuJFsbH%ahOo$-| zU=IgH$QfYpIIE1q-8|frU&XMhB)-L=e`#+4^D<#i71;Oyqvv{5M~KpW@3h9G&nh#1 z546d^4S~8q??EHEtoAB`HH8H! 
zrHBpeS?Z<|CTb)iSoC7UF;F##0b3%}XV%}p5G-#Eh0G=KT@_4EWxQLiIP#uVKbO9= z%Xm^3IWEY~VC9N_H03oGG7ob*>&r+|Jb9r?I$Tuxu_z%%NLoRL@_S=eGW-IBt}EnI z=rl{^rCujNWv@3gF5G9TpiqAFk9-cb3aT02QtB__b3A;vwmv79^x-qei34%sm3k}> ze;>h6e}acMe?e|CZVAr7)1DGBwSh z+Nr0n6*K!a<0MaPXD`Mn;&yCo#qz~QdCJxhX`!o$>wB?Ojdhrgufy=W;XS*7xOr+B z-x-1hm$(F`v$1LUaEjS77J%+YeQCp7UWJX6N`{K@)M$dkifpO9n-jfhgYz+c>*|z7B;q`$OMI>n0Q}f}Xj%n@q?H*m9I{>rQ2SO5i~H zbrbjhT?qI7sB!cBmje7#ca_aIY5s{PMXU#I=6{Jxka=bx86{Y#z?T>&l=l#CFt7V4|2vUfjB z8^xtZ`AO(kHd&LSP`)72YL)6%DZ4pL1;%`an>dVnrTcU5pO5x?YPfrdeV&x!?*V@t ze`+38{e~Dp0_V@|_V4b%=X>PHpCySeFuA!wa9|LK-aH%u4sI)#<^_lV3MY>T0)asn zaLgz+)DQ6n2!scI2sog~AZlv=hbkCE-EaUpym?D~5P+3ZM%=os*6rHEOnX?k9VmXb zK^F?#6Xh>H$>h?cY0dGxD8N?R@i<3qZtFfF3zO7n2~D8eFkU6?#E!}|emcr7u5ugi z#tR~C^L{LH ze&ewf<@3J7&3lKrJRyQ{VEgd$NP6#MuDcv?(VXK2USvD=dVUdbD)`b*$NR=rzyl<4 z#r-3T*EiZozEQOY)m>nf!p7UytIQ!A<%eMAcO3O|X6=^@7aQv18B==~X-*@D^6{5Y zJqu~wbytjI(KpPmvT4R|JyYGopjxQLJv08r#Cr=Pw67A_eXd4Do<#mu;NnS2!+@Bh5ENuuJ1!g(|)%_fn&}5pcjmhV}60Efs_GRT47Jjd1nzS2$_nUBMN|2z?Vo0cO@^u%d}w+DfQgFzwOd#jMm zfuucD5Ij}ghPR>n)hsHH4bq+me682aA2loNp0#OJ6e7W_R6ND0PEwSR8KPr_m`csC z$Vf2-v0NVKCQ4U|pa*QleuNOoeD4X}L>_PSQm>51c}Go;=9kd!<{q{t0hw~nmylti zGLT3+PtTElOYD)eqsQBTsCd&49iSmYRv+tVyr)-DJ!9e@Woh#J)ZKRppsMcSr!)AC zaEA-l@>=`ojf-B|uh>WrPCXHc_5;VtM)GG(LXCMo87P4(Cx_8>##tB-Sr^|?@Qk%c zww7#bV7V&(>`f}#0269`&r)D<2}_V;sY=N!XiA8a3P)-(2Neb<1RW~y_PzBP$N0wm zy#RhEedVP9L=;t62AyLU(8-jJFMqV4rkxumIEJ!S7hXoY8XWG&&VP>guu9oXRi{BE zn4NPq-Yj~3q(fQ6M=EKY?pZGxW6mlWVw|_{pWSi5@$qL{WW8~4xevH9sVC#L8o?qS zC9Ui8&}YMIc(zwQ$s}Ci?dmplNI9R^qXRm5n(@M(QlYhdKP|JD1a>$8vtywnSASUY{<0lCyk$r(CtyPXg#+YTODPKbATF`*Ok*=A- z2~0|Lksr5(9AU!*ixdfp6ii}q6_^K{S(fG;BKfX?HOgw3G8DL31T+Gfw(){Gyt0{C zNdt0`_x-PXNQxNcfEzGT_wLGKBAu=8D%zU8A*s@>>soE;l{T06o`41v3vc5Q9tL~P1B9uBMi(4vKqli^R!&=@!_6gqo*$X- zFeH?Q9gcOya?QH)&L$OE8Cpi#^3pC&+Hf{S;=FzpKDwTC)=VK!;>D>H)h!B_GB-~ z$qIo;YXvzB0B?d1x!Kjco2|R8U)c834%V^cuFX#J(Z=bmZIP(X*Gh|q5~NW4ZOpot zo#S)Y9JfFQeb97#n9%jLLh3wfkW&%ag;h zX?%@S<|fhKk6l77ky2$HUY_l4ua-m-zsqTM*4Dh!2Yf+7GWc@*he_k#Eyr%+oIQ?E z&+;y8ON7@m=1t8`9Xf7Nv=4`h zb88Fp)3@mU4lz?AJg15x*;p|Cb_Y3KP*p5T(MWQ}OvmfkjISprc3Y27$Al*;lDW+1 z0%5M|z;eVF9NZ+Ui~bOLVM9v+(gDgQdX1*;(n1l(+m~ja89{vcJEhk%UeUrjCjE3< zQr=YW?Wy#=8wYD{?36tQarh&H8ocM+6x}?BwL)WZBl2tDuhR-`b(c-K#Ws(;UD7Im zz`Np#)kbqCyK4X5@V-H$3uBaTK9(Mnt1#pX!1UO&3zawtae&U}mJutot1_hT4##3W z=|gw#xBC+k#HiT#oy)zJj=;rm7PRIcc+9U@3x-Y8A(dLCbK}(pM4h|r}3Ko^eb{HKukI6-0}75dyD9T zGH^^Vve;JS=c`XG!)6>k62rY>MTcYkm|qQ*V#Mp)D3>;_+rG)`I^O8DrqsP*HzO$y zhI^;n`$Rt9*MmxLcn<||Uo7C3F6|w*?_}CT)?bc9UE*aqh4y2wEC6)uphR>K+Hew_=rvE^+HShP=wV&%g5!~(rT4No2haTSo4l*Z9c-oT zaQdciO>jnOJd%0kupQPVxJ&Lo%$UGHx`%YcVi{eH**KzCcJ8}1W^G?a`=UkY8pG6BzS?nf2vu<}aF`v1eVh$n z=Sl8%ivY`!ihd$iE6-?BNzD#WdL-R(Q_^D-y~+3yaK!7X?v2Y*>MS5jeVODIy<%%H za+*X^eM3fqWD>Gl`&ml_S@2%n%bGH*t|FP3`>h?b-=h|$doD(0ABs2@KE?M*PtpzX zu-R{N7TLte7*6%?R9Na~0l%azEpgdC)AemXfy9lN9lwFXpm!rCta{8nwR|zT+&#xr z)%vU1yZozfyc&+qZehWWEPQ!ky{p*$=teHA9s4HHF4%J}_0o9oAw7F9J&~e3Bl^Kd zhDj@~y&e(k$2I1~HbNI}0JB(!w?gyY8g3Wz?Dewy@XWCBPc0dH;8T-$w#uiOAdio9 zU0<;2=H`$+)I*94Y%EF2gQPcxk~|&p&SNu*=|$Qrnr7sEgf4Q$9M2tYo^pQ>`I0Ki z91!?5XbVA@l9Li^$1S+Z?@q+vkaUYzc{j^PuH6s6@eZBKlV@q!%d#^GXQTsJp*12l z*Y_|LyT4`faOc!L7!c#6lDVML;{F!KJJNnCVr-K)_q-4_$(|EJrPq1}uhw0|eM{e5 ze_Fme*S(I&Z1^on-dz38y^rR0Rl1n}t$1_vFwe~}Eyr!F2ucv{N#ql9W`WQwz~-do zKnjkHW?{#L94p5Siik~b2pS4+QTRAv7v2{a_Bk){TH(ylK}+GMQ3 z(9u|Bl+O>Z>R`s%IxM_G%B&)9IIDjYac79V^WAjwoYfXYumO|kl;pMj$pqfSM-Ic+ z+kG^i+PHl#t@K!ji5ayF-?B(0a z8`TeLCIG^Taz6-O;b7{Arx-ULOW^$Th4h zTFfgOf)arWr$No({T6H{MmiEA7>58N);g42dGxb1CtM>%v=UhiAHC&za=<#CQ;W_; 
zz#R1P>j91Jqrgq4B0+RV>YJIyg>~_Hil7Ge2rGAgM1_`7d+yL0F9__FrmPE-SNp4>26ejs@=AhRxS;D@h2$%gYoic8`?nos>s z6m++VEF%YE)MS8|JkN!XD>qvDgdr9e&Fk_ZfxaIMCIgXyC_!y^4`C&hCA~tz!$$b< zGG7NcEYvwHV6kcaIv7r^ApWcRYcr(uzx*Qz#lFA)CrIa^`+YLKLe1}PvI!WP=CuCu zSPaCZaes_!|HlZZ!KhNWG$>fXL8h1Y@S&$AokQPYUI-_IBxwrA_&BmFzF-tEDj49$ zL3!AD134iML#P;zRF8I?2y_9)6bO5QmGGyE49xH zYorlU0+`QA;&t<<%Rk>rNH z?Bz#VLRNbe--dC3^Kjz+8qc1a7xDt<; z_R?$ZLo&i_s}wvGzacCtYqEE_E2e0dtX>9LVwUv^+vC$OSHxc37&BfrY>bPC&4}85 zC~{7rMa_y#Nnx&gCS&rg3kYE{fF8@Q#ACoAm7&`;I=R!S#JS!*rAW2NzT>z`ednlAScJrFWjUEn<12B6%Sgo%@~YtS#i0zE05OjAp#pq} z!`iX{jxa@AiAA@v@Fyh}0?5$8IyF9HqB`}U(&GR#1UwrBd9U3dNr0EMx1>nCXOgHt z_hUnBN<$yL;R>v_r!3DB?$5BFh`EB<_kmxue9WXehtY76o))3#`6}Y_D%D zehz!n$#YfTEc%ryabLNX!kQp3x)ZrO+!sbuIm(oPa`FmprigWe3tK&JX}Ms@oWvV@ zL!A8pB$%wSG~miubpTMUtrBrP%QC!ZIP3y(gNb8hP z(sES#wv;x>2Ak!QSbR`wY~-i9XEd>1P`tA{#xu$qO4~JclW%WB$f{|6X@=zL7hYSID7tA%H{Q@`3)Rq|G z)?vq2IzleP%ol_E6YJ2~((wBL?F$7gaZSh%k+3f^RZ_d3>3zge4Sm$e3#s1qH3C;z zCRa{HFR{rIn*?$77LHOUt1>$S_#@x|v!5)}Ey>t@wjoZ@k;k76&V?rhG9Tmf7jMdN zCD@IPUp~JXzhDB|+c+5Z#a*SnGb(*-Hzi(~U$>cf%Bw`a-4oyBg$qNZQH8#P+H3*Fn#!8}Y{Hv8->X z%KCa&sZwtjF#u`!|FN=i;xWf4-f$Hv5r(Y9b~qDw&a#`OmBZm9%(fXnNeeXO=A48MQi?Bjli&u-HC zNJwZniK=@%7UUE^vehE}x}Yr*c^BKeerNUO`K8UJ(}yxo>#NtkMmdk$CYHa~6wIb> zArQ0fw7+lKeX{!Qz^yJ54yr;TIH=P;P%!D7a5<||FqRcxt zN%Xb36lev2D)xI~VwSe_W#*Vo>^P#H>q5boqK!#dgKvqjW3npY$b*e#h0Di%HsN

| ztV$by*O$B=Oaiu}1AVVDXWVeZWg#v?7cmEL!jH0}KKFYZ)%^);hDPDW`;V(dRudgd zsJ)V|sC@<|n0|iVX{oA(G1!y%J_U76^&d6`?>lC1$(CwAPQgy()rO8f zCAd=q_{k>7%wU;ex9EuOfW2m#IP&{Pfp5j-BR!%&FRWcx-t>{T z#RZAArFt?w%qLwZx~0gp^Nru=`a8kz@*Y2=8y3Z0p)r4+wGwsnxzkO396?2-4QEua zb@}r0{n`lV_3N*G_iqj4Z&WkevewpWQhBAr$ZW+>D0GWojEPEWgkZ)k6a6rzHp9)j zzXt4rC=J$nUhJ;9d0ZpB*|J!Ru*i~<`=&yrar0z{GS&r{|4HtsDbU>nZ5@Q~Kq}8Y zSikGpc-G*ZkJc&?qHi;pjcIgJczNYU5^vM?v9jxPnPQc*;_^5Q`GTamUgV!n zdNLgW7IopqQpqbg@)=>_LaKDmaVMpT9uqX!Un&hYDa6lO*;ls)}2|3{!5hMrv=l%r)2O0fqq=#!lF6@4_K)@sw@(3IQ3JB>y`bo>-kQF$@eLCLHhwL-;_90i@*rZ~5zl6~9l127n>7AjWSz zGcu3$Lw|9sN=9}ilce@#2eU&#=}C@K%r@kb$y3jg(8`+~p_F%aWGG)zsJ_@8Hl zPSOzEf`<;IgdxFT2sel^83soVyQe4l<8dJ|ZtL&HhKEK05aaRPL*xH^t4RrGfA>6m a=ve>;5&pa pl.DataFrame: else: fresh = self._trees_batched(_iter_rows(missing_trees, stop, TP)) result = pl.concat((trees, fresh)) - return result.with_columns(col("tag").cast(semver.tag_enum(gh_tags))) + return ( + result.lazy() + .with_columns(col("tag").cast(semver.tag_enum(gh_tags))) + .sort("tag", descending=True) + .collect() + ) def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: limit = self.rate_limit(strict=True) From 99f823eda9cc51189d3de53c298c9ac861306441 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 12:49:10 +0000 Subject: [PATCH 100/201] refactor: Misc `models.py` updates - Remove unused `ParsedTreesResponse` - Align more of the doc style - Rename `ReParsedTag` -> `SemVerTag` --- tools/datasets/github.py | 15 ++++----------- tools/datasets/models.py | 37 +++++++++++++++++++++++++++++++------ tools/datasets/semver.py | 2 +- 3 files changed, 36 insertions(+), 18 deletions(-) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index fe8a0ab33..921fdfc75 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -25,7 +25,7 @@ ParsedRateLimit, ParsedTag, ParsedTree, - ReParsedTag, + SemVerTag, ) if sys.version_info >= (3, 13): @@ -121,7 +121,6 @@ def url(self) -> GitHubUrl: return self._gh.url def rate_limit(self) -> GitHubRateLimitResources: - """https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user.""" with self._gh._opener.open(self._request(self.url.RATE)) as response: content: GitHubRateLimitResources = json.load(response)["resources"] return content @@ -131,7 +130,6 @@ def delay(self, *, is_auth: bool) -> float: return (ms + random.triangular()) / 1_000 def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: - """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" if n < 1 or n > self._TAGS_MAX_PAGE: raise ValueError(n) req = self._request(f"{self.url.TAGS}?per_page={n}") @@ -145,11 +143,7 @@ def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: return content def trees(self, tag: str | ParsedTag, /) -> GitHubTreesResponse: - """ - For a given ``tag``, perform **2x requests** to get directory metadata. - - Returns response unchanged - but with annotations. 
- """ + """For a given ``tag``, perform **2x requests** to get directory metadata.""" if _is_str(tag): url = tag if tag.startswith(self.url.TREES) else f"{self.url.TREES}{tag}" else: @@ -390,10 +384,9 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: rate_limit = self.rate_limit(strict=True) stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT fp = self._paths["trees"] - TP = ReParsedTag if not fp.exists(): print(f"Initializing {fp!s}") - result = self._trees_batched(_iter_rows(gh_tags, stop, TP)) + result = self._trees_batched(_iter_rows(gh_tags, stop, SemVerTag)) else: trees = ( pl.scan_parquet(fp).with_columns(col("tag").cast(pl.String)).collect() @@ -405,7 +398,7 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: print(f"Already up-to-date {fp!s}") result = trees else: - fresh = self._trees_batched(_iter_rows(missing_trees, stop, TP)) + fresh = self._trees_batched(_iter_rows(missing_trees, stop, SemVerTag)) result = pl.concat((trees, fresh)) return ( result.lazy() diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 044447707..449c412ef 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -34,6 +34,13 @@ class NpmUrl(NamedTuple): class GitHubTag(TypedDict): + """ + A single release's metadata within the response of `List repository tags`_. + + .. _List repository tags: + https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags. + """ + name: str node_id: str commit: dict[Literal["sha", "url"], str] @@ -47,7 +54,22 @@ class ParsedTag(TypedDict): trees_url: str -class ReParsedTag(ParsedTag): +class SemVerTag(ParsedTag): + """ + Extends ``ParsedTag`` with `semantic versioning`_. + + These values are extracted via: + + tools.datasets.with_columns + + Describes a row in the dataframe returned by: + + tools.datasets.GitHub.tags + + .. _semantic versioning: + https://semver.org/ + """ + major: int minor: int patch: int @@ -121,13 +143,16 @@ class ParsedTree(TypedDict): tag: str -class ParsedTreesResponse(TypedDict): - tag: str - url: str - tree: list[ParsedTree] +class GitHubRateLimit(TypedDict): + """ + An individual item in `Get rate limit status for the authenticated user`_. + All categories share this schema. + + .. _Get rate limit status for the authenticated user: + https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user + """ -class GitHubRateLimit(TypedDict): limit: int used: int remaining: int diff --git a/tools/datasets/semver.py b/tools/datasets/semver.py index f18e1e992..788bbb2a2 100644 --- a/tools/datasets/semver.py +++ b/tools/datasets/semver.py @@ -1,5 +1,5 @@ """ -Parsing/transforming semantic versioning strings. +Parsing/transforming `semantic versioning`_ strings. .. _semantic versioning: https://semver.org/ From dcef1d984b79cf622f418b7e6ecb72214656e62a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 13:22:44 +0000 Subject: [PATCH 101/201] docs: Update `tools.datasets.__init__.py` --- tools/datasets/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index c1c7e0655..c8e67c394 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -1,6 +1,14 @@ """ -Adapted from `altair-viz/vega_datasets`_. +Metadata generation from `vega/vega-datasets`_. +Inspired by `altair-viz/vega_datasets`_. 
+ +The core interface of this package is provided by:: + + tools.datasets.app + +.. _vega/vega-datasets: + https://github.com/vega/vega-datasets .. _altair-viz/vega_datasets: https://github.com/altair-viz/vega_datasets """ From 173f3d6f5c43a0f248502240c8f1bf6ca7536415 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 14:55:31 +0000 Subject: [PATCH 102/201] test: Fix `@datasets_debug` selection Wasn't being recognised by `-m not datasets_debug` and always ran --- pyproject.toml | 4 ++++ tests/test_datasets.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2297ca2ea..e7ce8ca7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -141,6 +141,10 @@ test-slow = [ "ruff check .", "ruff format .", "pytest -p no:randomly -n logical --numprocesses=logical --doctest-modules tests altair tools -m \"slow\" {args}" ] +test-datasets = [ + "ruff check .", "ruff format .", + "pytest -p no:randomly -n logical tests -k test_datasets -m \"\" {args}" +] [tool.hatch.envs.hatch-test] # https://hatch.pypa.io/latest/tutorials/testing/overview/ diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 6d349dc9b..fa2543ced 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -49,7 +49,7 @@ ], ) -datasets_debug: pytest.MarkDecorator = slow(pytest.mark.datasets_debug) +datasets_debug: pytest.MarkDecorator = pytest.mark.datasets_debug() """ Custom ``pytest.mark`` decorator. @@ -345,6 +345,7 @@ def test_pyarrow_read_json( data(dataset, ".json") +@slow @datasets_debug @pytest.mark.parametrize("name", get_args(Dataset)) def test_all_datasets( From 3f5a805b34d22727e93d4eb4dad27874e68461f0 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 15:25:56 +0000 Subject: [PATCH 103/201] test: Add support for overrides in `test_all_datasets` https://github.com/vega/vega-datasets/issues/627 --- tests/test_datasets.py | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index fa2543ced..fc61caf8c 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -4,7 +4,7 @@ import sys from functools import partial from importlib.util import find_spec -from typing import TYPE_CHECKING, Any, cast, get_args +from typing import TYPE_CHECKING, Any, TypedDict, cast, get_args from urllib.error import URLError import pytest @@ -12,10 +12,11 @@ from narwhals.stable import v1 as nw from altair.datasets import Loader -from altair.datasets._typing import Dataset +from altair.datasets._typing import Dataset, Extension, Version from tests import skip_requires_pyarrow, slow if TYPE_CHECKING: + from collections.abc import Iterator, Mapping from pathlib import Path from typing import Literal @@ -23,6 +24,7 @@ from _pytest.mark.structures import ParameterSet from altair.datasets._readers import _Backend, _Polars + from tests import MarksType CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" @@ -345,14 +347,43 @@ def test_pyarrow_read_json( data(dataset, ".json") +class DatasetSpec(TypedDict, total=False): + """Exceptional cases which cannot rely on defaults.""" + + suffix: Extension + tag: Version + marks: MarksType + + +def _dataset_params(overrides: Mapping[Dataset, DatasetSpec]) -> Iterator[ParameterSet]: + """https://github.com/vega/vega-datasets/issues/627.""" + names: tuple[Dataset, ...] 
= get_args(Dataset) + args: tuple[Dataset, Extension | None, Version | None] + for name in names: + marks: MarksType = () + if name in overrides: + el = overrides[name] + args = name, el.get("suffix"), el.get("tag") + marks = el.get("marks", ()) + else: + args = name, None, None + yield pytest.param(*args, marks=marks) + + @slow @datasets_debug -@pytest.mark.parametrize("name", get_args(Dataset)) +@pytest.mark.parametrize( + ("name", "suffix", "tag"), + list(_dataset_params({"flights-3m": DatasetSpec(tag="v2.9.0")})), +) def test_all_datasets( - name: Dataset, polars_loader: Loader[pl.DataFrame, pl.LazyFrame] + polars_loader: Loader[pl.DataFrame, pl.LazyFrame], + name: Dataset, + suffix: Extension, + tag: Version, ) -> None: """Ensure all annotated datasets can be loaded with the most reliable backend.""" - frame = polars_loader(name) + frame = polars_loader(name, suffix, tag=tag) assert is_polars_dataframe(frame) From 4fc84469c4d69331bd4c1f5bf30c63b396c99b4d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 17:54:43 +0000 Subject: [PATCH 104/201] test: Adds `test_metadata_columns` --- tests/test_datasets.py | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index fc61caf8c..205a0d958 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -4,7 +4,7 @@ import sys from functools import partial from importlib.util import find_spec -from typing import TYPE_CHECKING, Any, TypedDict, cast, get_args +from typing import TYPE_CHECKING, Any, cast, get_args from urllib.error import URLError import pytest @@ -12,9 +12,15 @@ from narwhals.stable import v1 as nw from altair.datasets import Loader -from altair.datasets._typing import Dataset, Extension, Version +from altair.datasets._readers import _METADATA +from altair.datasets._typing import Dataset, Extension, Metadata, Version from tests import skip_requires_pyarrow, slow +if sys.version_info >= (3, 14): + from typing import TypedDict +else: + from typing_extensions import TypedDict + if TYPE_CHECKING: from collections.abc import Iterator, Mapping from pathlib import Path @@ -73,6 +79,26 @@ def polars_loader( return data +@pytest.fixture +def metadata_columns() -> frozenset[str]: + """ + Returns all defined keys ``Metadata`` (``TypedDict``). + + Note + ---- + - ``# type: ignore``(s) are to fix a false positive. + - Should be recognised by this stub `typing_extensions.pyi`_ + + .. 
_typing_extensions.pyi: + https://github.com/python/typeshed/blob/51d0f0194c27347ab7d0083bd7b11210a09fef75/stdlib/typing_extensions.pyi#L222-L229 + """ + return Metadata.__required_keys__.union( + Metadata.__optional_keys__, + Metadata.__readonly_keys__, # type: ignore[attr-defined] + Metadata.__mutable_keys__, # type: ignore[attr-defined] + ) + + @backends def test_loader_with_backend(backend: _Backend) -> None: data = Loader.with_backend(backend) @@ -428,3 +454,13 @@ def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - assert len(tuple(tmp_path.iterdir())) == 4 assert_frame_equal(frame, frame_from_cache) + + +@backends +def test_metadata_columns(backend: _Backend, metadata_columns: frozenset[str]) -> None: + """Ensure all backends will query the same column names.""" + data = Loader.with_backend(backend) + fn = data._reader.scan_fn(_METADATA) + native = fn(_METADATA) + schema_columns = nw.from_native(native).lazy().collect().columns + assert set(schema_columns) == metadata_columns From 9e9deeb95668d2c4e7d30311e85a8f9f6acdc88c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 18:13:52 +0000 Subject: [PATCH 105/201] fix: Warn instead of raise for hit rate limit There should be enough handling elsewhere to stop requesting https://github.com/vega/altair/actions/runs/11823002117/job/32941324941#step:8:102 --- tools/datasets/github.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 921fdfc75..6f55c1d52 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -311,7 +311,11 @@ def url(self) -> GitHubUrl: def rate_limit(self, *, strict: bool = False) -> ParsedRateLimit: limit = self.parse.rate_limit(self.req.rate_limit()) if strict and limit["is_limited"]: - raise NotImplementedError(limit) + warnings.warn( + f"Reached rate limit:\n{limit!r}\n\n" + f"Try setting environment variable {self.req._ENV_VAR!r}", + stacklevel=2, + ) return limit def delay(self, rate_limit: ParsedRateLimit | None = None, /) -> float: From fa5bea8b25f55cc5bba32c1ae8963a89f66481ee Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 16 Nov 2024 20:33:13 +0000 Subject: [PATCH 106/201] feat: Update for `v2.11.0` https://github.com/vega/vega-datasets/releases/tag/v2.11.0 Includes support for `.parquet` following: - https://github.com/vega/vega-datasets/pull/628 - https://github.com/vega/vega-datasets/issues/627 --- altair/datasets/_metadata/metadata.parquet | Bin 18921 -> 18777 bytes altair/datasets/_readers.py | 19 ++++++++++------ altair/datasets/_typing.py | 24 ++++++++++++++++++--- pyproject.toml | 2 ++ tests/test_datasets.py | 2 +- tools/datasets/__init__.py | 13 ++++++++--- tools/datasets/_metadata/tags.parquet | Bin 6247 -> 6290 bytes tools/datasets/_metadata/tags_npm.parquet | Bin 2597 -> 2599 bytes tools/datasets/github.py | 2 +- 9 files changed, 47 insertions(+), 15 deletions(-) diff --git a/altair/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet index d47c4ebed0528df5c68dedf307f03f66fec5e63f..3eaa28ca39d5ab0230c23b9bb1799d78ffd64eb4 100644 GIT binary patch delta 13381 zcmb8W2V7H4_cfY?00}je(2>x4?;0qnVO002O$01$trsRYCTL>&R6m01mjeUTMLye6Y35&ia}7O(t!*L<&9kAKC% zi|%1=Z~}wDILK~K1vW}T)!xG}Wi|?e3cEHNkv!t6l>}aiE01$rRcq%wr9Y&+J^C4@ zSmLS9M+XFUhv^A1z~ajsdq3SvE!Hz=pw)4tYV2i73^4Foc>YAj6ZT;FV!a^-s^qY)1W@FwAvljeJJxs`|s 
zPj(e*qT=$+c>#Na1}mXl?65Tw3^2y7ZH(ib_9TfMMde%-7X6M8_2RS|Q85Dwb$!B2 zTM`9zE@gkYl4~Gv5}gf>t;P5Y%Y>)JFX67@z|S0x&DE3Sfe_pRMr+i9U0))ONh ze3ICgy;VIO5xUlQN&*vBO9`Xrz+33Tha}{MQHCl`Ms=#%nZ#Jk07fF{My+Nf>S2Qj zI~*1t4X?9|U491gJk!$Sk}EdELOh~8EK!8Qk_8L`A698?_}_%-&2;+lBdgyohNzv^ z|HzR74C!eBQ=OD7-a-NS%^4y#$mm*hCoU}{Re)+yhN_+#{SlYWxgHn347@NQINrq{ zg9LyJ2&4gGZ3w{CBKXtB3?PdGFi?h3b;(N5o`8x|Fq}Gf-fJHDvOh1!umKg9ezNg7 zewmO*29d#jEnP{n+S|X0dGuXPNVEgu{Lb^_s~`YOXNCueL)+ppp^6Vo`PY-Wj2YWaX6}DSDxKu?Tj%u1~|Mmh7hL6y|Qwg=<<=C2kI>nBA0taxg z-eN(Zy_4s%DjsFD++vwyhYteYhC!d;W;TAH_?FaIl-nTH#5ljU2osv|iQAJtQC%5n4K@Xq!;7#f$ zYxlg85DKa27zhrtm9pLMuo46v=JEZk#&!n+x&H+>v3O>IYMxXQSzBefejKB)4u8Ys zBYH!PlP~kZmx6@>=Lw|@ymA+N9n{3WL&h~GlNJ(}S=o)}>=24qw^t4HXF-E2ebWcey<{x`>oF2l>r6uH|Cp-?W+Bi4`+>-mHjfv@4$y z0nNMKLh7fPv25kNhmFWrB;xKLo{9RYM?O^CX$}p!!`=^ zWP>TI9C=9z=rHOKq_6WT-{D5BOxX0yB&T#dREoyc$Ti4Gx452Ub%Q+_sZ{Rt#aPcO z@7F_{9ak$Yl1{ZuhWl~aY##XRJUNO6eW+^`S8 zRwd8M56jAX&V0C10Q8((Hwm<1zh*mi=&D@G)~>8x2fGit zow2u@reKro^&=`iD-vhn-qBi1Rv_grwM1X<%6~I4s>p zj_Run0<;gmv@80=_Bl=5QwD)>dM0!JlDx(gQGCI_s}pH9 zK4N_3h?-ZaL(=>BL?rPO%R~?@8y;OzHt&?|B$J?pV${W=^c@#3s$wZjT?me@qV-N8 zTxFnUATR45lD2PtcZBz9L9lJ3`p-JDmOkM!PA~RiU0RKrhUkWC({SDM%}{ z*hG(-Et*^G6LK0?b+HcTmN}pnQ2pHgQFyD^;wTwC%w9mTK((=>pMe}ejie3R27+;j zs9qryv~ME3q&pLaS&Q=~T@<*@RPPX4fM2(S2X7Sg_INEr%<%5$#8E)f{7EYJ5E5m$ zFGHcmIUqO@tvCP=MoR;Abar*P>2B-h@SAaykW`;(IZEl#SBMc;`@rzx|7|T`0f>VD z-b#ez_1nIMHw>z~R-Kv4bMm}hL~zl;#QZpE40q9W#Iz_ZlZW9_%>y}7CH_w)kJE)e z;?Rb#|I6fk$ju{ zOK8o{Gg|MDC}{Ccba8;NmubAh_T8T`C&3YS1s7sG4tK>#`Ae zl!dxCB_EmB|NZ_K)J#4XE(M2a)bssw&lPe1)+aeGD%|d|iXizfpGxjBhUT2T_Ql)% z2&?R*F0DYjkdTKPwWN*QRRWQ6>OAnX85wiRyy_oVh(|=r=}h*Mic8AMW}kw1B}#C8 zi%5ciTB>=Y!hSF}d7*(@M!YIOW#||Dg(?MeP@TNrRP3tg32LV=F1c4-*p<}G>($q8 zzn1s@b&q@S@OT((nXUGOr=*`k72iT4X>;{4mee9fza)59STYRSsc$;TxXKp}D5rah zZ5?|LFG(CmKmG9eL7!IRmD}T{P`9b7Coa${j2qN1%CC~m)8-#Vy5FpQM8fl=!*P)3OLU*M}>quWL-|9_q`wU7skpQ`p%#Mlo8-q(vk{ zCf+7AU&O_|RO&r)%=EKi@2$N#Ta*V>v@X4zVr*-THM6-flbG|il3S307F`{cle&oa zz#X6Yj(i!xVnYcxj#|g1j{X=$oYL^wATSHi~Qmz#O^Q{Ngz`f)u!n$P-S`#70$MDMqqRnu=&E1hBCwL2Nw zBQMTtOF^vfh0cELJ2Xt-84mBu8C}0c@T4?o$hSFtdx4Y;z8NF+b9RY$vxnZG>X3J8 z2b06}EI9Cv zwQQC2cbs<9W0M&>&qRN=eOHf-?(%Tk2QAX9{D#hq>e2JyM^d7OF|P z2~c!++{wF6c|RWJdRMgBVtMToC&z@z0jRfD!7;Af5A0H*0F&=L>g zkf(8RXRpVZXpa1+T0;RN|CMWcAy;6~ZTN_?^h3T} zovIkRli+%0eSb}iTbs3rh1=-vKhn*5xz153IZ*Dq5=4&wC42+xjh|gZkk9aDT9rvfv9jhdik><+}9Q$sMMvHzJAS8~-S}Nv~0?qU*8p%9i52kaOlj2%`u1 z-aUcQBD1MnSJ@e#T45hlV0;r+)xFsWFQ~UJ(JLz=m0%l58RW;xJ3^oS3w4LOkat^0 z3fIjZ5_o41nUD&{Z%yK!Utf%QT-+OWWp;USz!nfewW@f0^(H+< z)?)X<<5fsU1bDo!VF;=s*VT0cc8Eg`b_q8y8#ceg{1wG_)yBXKGEJ0wRiS{|sVfeu zKzmKn{Wb<{^F(DE$p!Z;1cd{ua>t~ia-uE$S0RfWg2hn+yw@ib*q%cR#1jE-V(|S| zh(aWL>0=3nW%VK=-{^XruAC?LlWyD)d%xPKsMp{KD;LN8M<(LmQA$=WlztFF?S$Zyh$-p-y(pzWQwZ;D{nJ7kIvWeq2j`x1!_yHq37RPQ( z0LuRs$E2Xd3tvcpHF^yTVWHm;*5-nbs^m_%O6mLugr)yL_%jy5Ilm#yF`lqi`qD|F zpz=Q;T=2g`*!BX#+P@);zTNMwf8kqA9YLM=i;8u|IlUX7W_8;KnD9|Ms5XU5$UYbu z{!au0QU8Hp*bOX#KV=Mmt}_!DNy2+~99+$;?>FLn3EX-gDDkg^+OQZ# zVO=zAgLqM!pBHu1t77ReDTJX$jK9cl_eV-XC1sf~tuk|NSQsBtjlQ~oF{$pyRp-XU6HHn4rVKcm3A`16Fx#cYgdV)xVmSiZvu8m7H z&vWXiCdFv%(ItveSQsyb6>x5DsjuyfT%tHj<$o;AWRn_L*P6-aa?nwQ? 
zbaWd0^}Q=8D%ZIh0eE|$IFy1dcfo3%MOaW4zYsTy8q^Z=k`#8gl}^YGR7_MvuUj_f zm`qh-)OCUjW-NH+8^hce3~Xc_1V{#iN03$dz9Z5m!0j||f;jh<99-|ku`y%&6^3WV zP(D;o4PjJBa{kV>v(-6QnN5sI5RXK}bHl-ZTs*LkbKoEE?o5KJoQh-G`x*+np$5SM z5`3|*M+4Eh$SxBW&!O;8Z8*u_y)}e<3lLBmNk~@gMwB|r|I|{^xa{16Jv;|QZ+af= zb~4WI-cWK(EhH>)Ov=?#CVwxPru=vu2#6lV#R9mU41SDzxtj?4yPF6W@V)1YW@3S% ztZ||V0qfI>oMAdLqC)44?q6qK)YR@+5iw_3w)Rt)CxJZ_j<{JFxs`0`8~mC51x4yV<_wFjB_KZ3qm12_IegJVJny8#cAjmOt**n{;=kyqJwN2 zm+4t=zj5e095~_LTaYD2Ott=%!tI(y_e$+C1!K#mz_(xD#|0Vt4a632#h4;7-XYJu zy)B@mX7hS06LA#c+)T}xA+K**n7#HtPmiNBwU`{+Ht%EgHdxbsLHMX@*RZt%m64jZ z&s1J#FtL74beDY3kc1PVpsGCdlB89n&{8>Q@jY#<3XZ?c5Ys#V^=rxg^a_@Dt&Z}K zlPbx-{RD|5KH_+*zv@4DBh-2r`iz^HQ!|@nu69)7F^Hz&MMWwZlZJVeLI7^{qNV!x zaEBO9_A9t&f@Z-Y6@N(`ZLUms;hGmyP|f0VEXM;ceDllMp}`Zd_4f9-3jkovVFn-m z1E>u*`)d$3+kgt}4sbsS4D0n9(=7Xa&>0b<-#&1{*$Y|1a2kn_p{B6=+v2_a~<5ymj^g0A}85SSy>wfG4lCD-H}6u8-%H50n$qa!tiN>iktT zMMQ2ZjIpL752wP-y`5{_UmO~5^GQqAo$+pmIV418w)Dj4+lUVM|VZYGzN!{ z(Y^*B4?>qfO-4pmiDpqrpyW1St7I47eYFc9(ZV$X89$+`hde$Y+V>2j$LI8dM*QK4IQY0VeTGq(V z>L?zGjfWB|p>qpQ*v;`BiI<#3;rxYU$`z_@tbGY{<+r{Vea?N-Kl(myNx^N{Q(Cp- zsb2qj-+DBv?j^bGrotK`Okq!%C8a#CZ)_+}F;T@_k>eYpaC}Mg29m5dKrd2(h-D}1 z%Kbrb*Qa~et^k3jdK9SwUq>}FSX=_QJxZPu)wbs(j@uzzE%Lpe`nKf^NOW-rNq?#9 zHENJ1Wfymt)VTh>NT7yM%h?O5%Q?gX-I$$vP8|b8X>(?F6Um?fs_7GEj_XdL!x8Ig zaLa`08+z}wHrchE*hTTlWH()R`uml8tBGoIE0D*{lO)$qtKCN!;iaXOZic@|yILhu zYBH&+ruoE8p1EhGEjLuY%&;euK5{;uERB#RurtXI%=qqRnIi7yNAc5l$#9^cLyB`D z(UD{s#SQO>dZqE)Ir)^2%{)b?U9bcr=IBwCvu9hNU*+pbfpn2>49jp7J#A(tAj*z_ zjP`V~E0(4z-s`fW<7i0NQ)sX!;K9W|{~r?yT%2pLQw!_8w3c8&HiH9H$TM9LI5h>OeXYVXWFt$i7B zemXkIHg0hDfozBfiH9Vw?Bq_!%Fyfx2dxV@5aPDsoDa|AWP9~4`p%lOmVU*p4ZuXV}Cx?Cy+OZ;l@ zAz?t;8e2P3X1$8{D*eY9;kKU?6QO!0eV;R2kGT6=S9B_$MA!t%#QUkmI@j!g7W<3h znfdZ#bw7ljXe>ZINxOe_fg)zaW&+jfqPlzceI_au&EIOw-I4vk8$;Id#FCg*R>TW+ zmC2_2-GewN(?ss%yHwq;OMsDt2Nnv$jyjl%^yTQ$#w~N-7;SuDEv1g?VO?Yl*;z;d zTXc%%f}nC<@?xGpv$0%hNAeZt5EVUVCZx9&-sYRI)bZT2XOxTwkf1wMrla4+vx^zR zqklptthrwZ32Kr%wG#`NrZsX=ubzgYT-6z6;|WczOxFY&g*4g?7J1NG2@2o+j3p_@cCyP7}j7L&S<=@CuOoRlB=_T zo50y%s3I~I_7YYHLx5kN0XfZ27TD%vgJsug&)QeIiq21=D3=N2oyE0*DIK+dp5dmi zo}%xUZl%Xs){oN9yY4m)b+JE_d2hUP+MILSE_H4VdAv&|^!PTb@aMTRou;87OFn|T`(@L@H`Vx04;#hT093Tv|F$D9{zN3F;DQyZl6OsDG5 z!Lpk?a?*Wa@yJcdi! zy8PgLa;n_)rHAf@XNIB+X2*6xTTEU%^w}}!_;$rYa%T+cF1Jh9az{p~2BG13_!`{s z#nyB0bfv}NC&g14bmYD_3$G^mKoH>Pcap2S zp-&xma(_u?fy2Ys-}9|q0t&wsptini z>E3_CBiG+v_#<%6ZG~Dlq%{^zuA8u65*pN(QqBp@R%cAk6}gx3^ULI@lj)t0G>no~ zyGg@M5-^gx)5??2o)PzYF}h?135KL?<}7VFUmvV_IR+=4FA&#Ew`n_OHj-2sr@@Ts2HYYwvg75{vSj_`WY2eFup1?A?b0NljTlA4g3F5vQpl(V8(UIn+mX=Ka+2r7 zH~7wg-NACJ@=*S^A5_e{vL2w;5pAR+wu*Dvp*? 
zjhQ)xf{aqL3yhsi#h>9>G~{V%5_sh52p9@127U`6kBC|%i-=NSR)D;o4`J}!)R^Fz z=QVsS{-jZW&qq}bmCF_2U^AC$#?F5~k1nX>RR$7IfE;rUlz-b&%8DiUuszX>%k zcWEiQjU=R;MynIJ&1q1B`8&OWajc9~ru4J>5)Lb;>oi>PjU4Th4iH4dd_s?T9imXr z%h~XKKkP(u^Y`y@ede;EQtpYf*)COGWQTN;gDk`;r@|-IR(;3f_y~#Eh#Hw$;+K+7 z7b=!p=xdW5jar)=oEIAoy#Nix?FNin=(AC}ep8o57cq3|yZvq1)+v!EcFw{3YAeDj zFP)Srw1i)W-sI0Os>nV{UknxfSwDx8Pi6j+``tRr#h3Ka4$xCCtC9~WG2Ew5-uRKx z`aL5n3t{rl&^`%wVS;@IdpiT>Z}sOM(&91fw>f&81ZkCU%TgV>el46Gz4g%B;$>`l z-|_34{?bvY9hz&Ao$&F_S=+{Aqi5-^(QBnIGOQ`6jY9*suL9;sf1KRu06|;QQ4)i7 zB`vvZfo#dcXS9S{qDfuB$<`mA2a-@*RgmgS&)@Ca@~D5p!@ZVUhOsXzs|>`e*01=U zXmdWTMh*%V0yu7p)7t1LGBW}K?wy+|vv*cS$>PoF7|_Jn0&tiCVgMe%rBwibU^COd z2J=sjx+vkEb84isX{3{rW5x0ROFH~xV@!njbbL0&3dK?BsbPQZ?&ihaE^ozlij0b; z{C3eOPOlIV_VxSg|N9-4&X)Fvb+|7**`VY0{DK$as+dd4= zM`oQzV&f#dbVvGYVi0rG5d~fGjmm_w$yr2(#dA@X29(ZCJSL&A!T~bf%+`ZwuV+`@ zxyf9ay5%ddrJS?ez@URp@>&Pd7n-iWGO#xCBE-mOTKnXkEqzT4ej(ffZdG!9mF`>q zC=ye-`St37MVsW1bJKi&FnEgZ6iEJJQdQEpdSwy}V|)XD&Ko)MT~A?M)I@ z&@@eGD>#nP-(x@0EwkyEu4Ua(xyEFWR)oGDDrNJY2pMHI*#K@>4l6`i#{6-j57G}`yprSVARekSAj_Q3MJvOn|DnN@|61kejcWPH0dKN+~UXJby zYNNzX@4nXa4}eKf#>aDM-u#}^OZuj~xY_w(i7P^wBw@teU~;VZ5Xgq0Xya1ejm!XQ zC8@-FQr>%jgQ@81CDPLeU3&$HCI`uDB43*)D5fxTQEDgJ_Ce@J1c(#86&en2iL_6E z-;gi*X~!ao*VI7c61X_FTg)sg94dvgVAReWsX}G|e1@1MIvvtgP40w8U`h23Fk9Yg zC=^~!i@V*`sIgqs>fyzE@nONFGNC4`NFb*St}xzAO4k!`zKLT=Xr~ zGM&=f)HZL*eka4o4*yv#x}?g(#&SM>6X-Q6k&!Ule3g6v?^2vsU}4U^te;2eA5mqu z!nm)oK9;Or6MD$$!d&}Fh@XWY<1|`rOfseaWuVcN;>GCnHguPvmnDay;!2Xp8^niI zcYz>yPXf}KHeCDEhOUQ=E-=Ubxt6|4aigN4!TtL#@*IVE%~CY8j-{rHF$2*sMkhlJ z>j{SvodbcS?+$5@JZ^J*zN;#~_GnOKr3sN5=ph){To*9OAr61?br;() zB$`7sA}YyOhWeIr)%X|9z@@CiInO33$*%HN$c-xa#Ib%vOw{FZus)Q|R~SY;)q0`v zg<{HDdum+@&$f~)P`KI0&^i5X!#ssvoxxTnSq?6ZH^wbWHDJNceg#xP8Af{|aO3S2 zu$O|cE8^7I(+UbDjC4(u07>Xi8>=9=D@UYk+E~41s>0JL8PsW@Sz(&QS1@;MERdWd z+2RaHSmZp-Iei?ux6tH>?;(n_e=Te8Yo=LLf>Cxn`O8JY2EFO!z%g9_DRJhKp=CmJ zcN%x2Nh4HUlsJYvN4+c>T6gE|Cm}PVH!42Hq{HsyblD=4$i;2r!f^E<`y%nE$A#*f zN$GyY>4uA^kHXn9Y1D<{iNZ687#>b6W@tN1v)=i2GfvTpXoC)d?$trvIjVb--Q$qI zcaq5$k>H!H0mb%VPH(zFW>7bbFgAUO8{3DaROd#42B><|H4nP{T^NG?AbV{fP8|c_%PbT2s6;9WkpQ^ZmEDjdz=##V z0-y%`L-MGbVIv&!F;G6ILbPpJJ(pH*JRt`VH@EdTu8wK zHcyaH)*B|x&+(Hzdbv1;;06s4rkQ$4#qkGin3f(3>;bVppg{N=J1ph=bTNxy*|(Bn zs*1zIhFAGG@9;#jxQ1G0HT zz8s;X-#zD7`y{t}BIj`J-N~+Jhdu3}t1{mK2P<$#%q`o?mX_^1AO^#sRc03{VQbh% z{vZt)@?6>qAr$5eWyIa&u|TAml6;Q$7HBw*anNH^CZi>$Qpk`a)Ca~{zcnYpUDSe! zDH{`{>QfFLW#Jh^TcDH`;{t$VSuzWctxyy@;1z-+CRj{4S!?EpviJHO1B#L{oR3@z zHxJBxxyP z-UThg@*k2ree-?sg+jN*?1$3)(wq7g{L;>Hl9$(%Z$e`#vKbuao31a4i8B`p5y!m) zMq69XnEUOe@^X#Z#!@hGQ6E~+eM7I_LY3QF2sIp+!7%Zm0@pv3Y(9v~G8-99MrZEC zF+X775r8+69CMas$hf7&9sq~ws_yBn`-;kQ_D$*2iT9z`V<%lyD?QE(IhC%&M{sew z?1)x?ecb()N`$W&G`G_ydGq`#r@dmh+Um<0w+VV3j(fA@+mc3ig8WDnMH(019%_}Q zUuK?K-;ncz?x5(*bAJ3wTUa@MJ)|E!qG6Eq6sF&&BLQr3cXyTdGo#MXPhG?jLgu4>XD4DF`#Ocn<_`!nAWZ47&t-eP5_EF8IPFOp)J$~EAa1u&NknvwH!#ZTu- zJpf6S=G3JmBv&jogQxKTlej26yaVaBQSjwf`|nNm-{^S}*8bzh`>}?L2iym>Iu1cg7`AJ#72pBvMu-Wz*cmxC)Lo1Rp_gnDcF8CgEb%*`Fc>{! 
z<3DV1KkjdWgtb@e$t}*!5|hWlzOi1on_35?95%i?qWBcUZ`BV}c@UL!TEatJ+@K+- zmQWJ?=EL|~Otaiyg|fPno)sjy_1ywF3J0xwA`f9d2pswf$k-m52@RBlmqAL=5d*|z zM|lwY2hnd(K}v2j8pC#tDCX-FLh_FMyW+95{YvF`>f&j{JQKja215tT1~)0fzcl&o z7fUEbg24CyyHOIz;-@l}5W51xf5@-m?~pF@t=~gB?ERgqz4?bp!Puc8I)UA_hPP;O z$WiLBZxj&F`D27ls=TDVA?H*ana$PhoL=RCoFsiBTB(FOhjme|0XvO}&B$91!4HkL zV_dg}8H929*8Tl+xR}a?6O?eSSKJO4(_i0Jo;d7oQN?o`@Kf^NT0_gEo&-yAuoH|$ ziZ|WH7afV+7krR-jY}^aBB4 z5}$;mz6NK}h)?-yc~gdl0?EGv{4=_XvB_6!!nbntjH5NfSR5#4pSZn$!jBzcz1*Q@ zQamM(3l2FK83Ihw4IyN`HF`s=I zZ9yL@D5I0um7B8UYk{Z(Vk#^@&@xa})4+>Bf69{_@E(#s`%Gz z9((}yr+k(Ggm>Zn`Ouny{qXlr*f!y_wio|_gJ|p2!QLJozHoAEM1vhfLu7C3i|$e^ zB}f&ZWuPtKXr*TOzkeD)jiQ>tMqg;Cl70Z;5&(_?I)?v$C-0iXBgt@az*OnKKVV8#ba^f2 z=VgOCp8q!Ze;jA{_cVhU)sxE_FR?W`fdGzgaX^OuHH`W{MNz57F7K0Y4us1AIG&}U zb(P_S{}F0Kdnm*C*0I;nQ4L|Q5y3WCBZq1=nu(timp(TImdT~s*3R*(X4G@o8p p_-$kVA54snD(|v-V;Bft4gyd`%`;+uVZi~R+Bs}`prQ%>e*if<3Wopy delta 13569 zcma)@2RIyW+xE9vomCbq%3{@s=)L#eiD=PVh%SQI)w}3Hh~8^R5D~pcCkc^g(L)d- zTKG2Q|2)t89`EThuk$WDmjc{M1=c%)@O#~M#Eq__Ce3yqT zDlrzPXxU$uIR!sL|Cu>&7IE2Uxlnnv$!J6VTwTt$v2SBXoz&;a+ZlB^>{iNEbvZbn zA+(&$RtO?Sn4UR7Aarm-<(>nKB8mbRpMX_~MSsXgzQpjl@pa)MPB)FyVA%q6k{CBF zFeFY{wpTAHUss=!AgjmIaLz-V0mYF0m>}HZQICQz=Uft7bJN zp6U$JQE95_slSLF4IiUA&eRmcelCCaNHsoSy*t>caJ z3?mykgJlrsf5EW=>IM#(Lj_@;q;c0>hc7Uq(RSw;F%*DDehB;pR{#W!&=7E0g6FIuXtXW@`(8Yld$+fPzaBC-ox^0`HQWYDJ9d zh${6b4lRBm4txptyM`d4)T(*_AuvE60;8Is2m8RnLFO33hzxno$2JO+^44nPa-V|8 zsRT}!9!>CoS{c7~mD^ti7i6wKvT|s}bAD%a@PiP*6zL1V;H6@qj3DHBY5|lMjs3a> z9I6};+YYs)Fn|LHqyb_pARGdJN&`P5(1aHP;n?>aFK+&{BE`vt12UMcnR%1FJ~hKj zYSUi$GS4Yy%wTj0yII`1E@Loz!coe*yF@ok0Hy!~Kx=2$V?v0x8_StA3L`8qm(u^^ zFo@(YUq}p??RM4XWI^Ri3HcgXg-; zE)@m$GCfL8hnZIk8yp4Dl=M4#KMSKV9DlGr6w(_Dqp7EKYB}ekLW4DNm`_!7m1kBO z2lv%xsnO+X&qHmDIpf3pCCxf5{Ny5y&LZ6td$90ur-c;>DNi9I{j8g_=MQh zPMu{TNL>8FBxS+Ha$p3aSn2+ta7-wG$FP5pCasG$lF;Lj`Aw}{R=p-qNfHO*{VH;Jc;;B&rN z@97lCT*r)x`s4YKTIj~O&j;79kW(&8^YwGBb5{6zFPRH`yt=749zFc&lb&psU{OO% z&Z@Uy;SBrCqJA(O0LRL04)HIT;4nB84yD6ZccRozdJ z5VdCNGX`l8AzSJJ0}nAy4=`Dmn}&D*OrETnDhTFG-6EhxhYmRj=Ok`WnCa+aak`nQ^Ty|!vV)HuBQml`YBO!_Ngl^Q7LEQ%!8=nI|nkVKLVh5r&`1Xhes zc@+xR?Ve{J9*=DKwLRU#B_fDYbQ+DEJ_wQrhfZM?Jmar|qf_upekmcX_Q0Waq~hqm zgmW!r(q(L3K*q{LHJlg8L5!~axk}7^{3oW1iOf`z85mM*C96wI65Xqx$)xAKKuUwABQNNq zfu%h&bWok4<3ZG{w@G|Dd8Ob%6DKFur7c$OBBL5n)OyJ~37k^#+Td9?7Q zFV9iSG})0t`rxoc!v`L# zyj*BoV_Y~I&v!VuEPDN&O}#682)_LF zT;`*H0)6d`Qd7OydF!OkM-KxwEut7pr3G{4&PBp2_9C3NinGHMkw$LC-##UE_NYxe z(q}dZn64$}T9yG7`qfMoJ8%d{sAta=dqkJxUW$h$bmNfhKH!}5bSS!yc8%2@b+#mH zK=;*6Dh@Jy(fM@5bgC&O!DsWCSD$DeY;1n`(8FT4iT+DfeMks_iV5E1z!9G@H~9&n zZYyPnwC#5hMCl!)!!39vlzV;D*(CEiWqipfx&-mp=3XsYLCbE<7wiWq)dO>fM)LDV z6*Z=zDg&o|H@CDQ_xsl`(E0Q4E33j{-S&PwT-iw9kqfeOWz>juX6zTx*J~GYlXhrT z8buh0&B%<|@D5YW8$8LZo*m$HIlZ~}T4$e4Wq5^vWvJ4}Z`Ia|rq-vOvU#-pAaSL? 
zhmf)D2C-a?;z-ixz@dI&ejW=YE-Q=X)aaI!v^3}DA=~{%rToJ*1+;u+HnNc5CC^Nu z4qg4qQOO4%KGC-CV|MpLBPqt>IeI68qRCv{OrQW+*g;*aDLHehmO#}>6RU1%VGZ{> zuE?7nZ-+XC4f2Pb6Kj1GCM}O$lBJuxT_(3Ig^G90XXE<5hHKn|6B^~#UB!j4ap;b` z@puKX9fl9xZ+urmjgO|dSt!EN%TEpGWySlHdrc3WZy))%2>Fb)=Ji#1EEDPwwN-Xf z{#3ZkY3zmeojV&b_LGlTp}Eo7<}~+o($YNi#O%)=U7Gy3+rUlU-5icER-3|i{QfOj z-E?j^GPSXwNveXC*qBfdg%JDvLhS2;Rxb4f=M&*nqsILOn;s-O{RG0qPu8a#n3&%b zfzPr&Bz8yrbv#+?1;RjNhe}ZnSe3aJTJ9t}B!0=usa&PYba((6>R`{*Y+|U4OU_z>N=v_4@c#x>WmjQ z@R+--gZ#c-4~p}=)7fd{r_Q#vcx9v)lI%wF@bX8)tW#LU>U~D8GCmx$Nhv#V(;?rz zsAvKog`D7>eYnpuamDmWy0f3r2&sj%lq6g7oh_kJ7Of=o?bqd`LtPsw<#A}%At@G+ z@s;A$S#Pzv_lx%rmPXgx^NW3ICoB)-SPNIQPEncHrxkY}iIW@@_~OP9Ms$bCFt!iH zcbnCHoVR^Db7p>DsZU_nt-i6tW{qu8D)A;GpCj$%$!%qunnE890x~YM5YRKdPeLf` zGp1eED$wC>>R0kz^qdIlonhv4D}&&BqT#95%&dy zO>SCs4?X>`!gun*Xl2LmTrpeyw5G3Ct!<;~E;Z3#rKhY{_QNhH--U`$6GY)sB+8Us zC>mMZUH6XV)fO)h*;a@Rjg%vpd23T-VdH2<_mtqLWa(m&5n8A2RvnjOsi@mL^`5&% z3?$8WXEYhsP)Gj0w~pvu+sy0c^Rh#F#bx=Pi7~FI{5K5JiY=0NPK_D1J+~@?>}oZ& z_^E3(#?6)_52w$YH4{n~*a)fnBQj--K#>g0egn~-f;LRwmt$gY+w^OhtOO&7atf=b z4jyi}lyJ4gGO3;&fectYSk3f4_|j+|br5Rb5xiM#-FTMQ9r}>T*O*Guu&y!d%G5U^i=n!*kGW2+iSf45O)sqL{|>LCNbj`zQNDQVo4 zV#72I2ZPjB5KF2v0U`4kn7P28pEeN}#jK{!HO$&rG3Sy0l#~NSo2*_+!BrC5keWfj zz_fw&T#|_2o(ok1UrU*raTV1a$E74sP-UIQPuEvUm1AT;haO^7IiAKxeg?*o~SQh&QOHGNY(o-D0V%I#CMxdQW!;JnvstY!0iGy0EZd)0f}f#7?P{JW@wKwAH6 zV_?wR2ePQjckF-K-o?)g&H0~XbIu&Tj8vTcYuw6Pei^qVH&Gd@dW{@-2QN}L7gw>+ z<)QKWBs>XtvpK&haU1c5Urgi5kKg{S(y&Gc{iiWLRH{3^K}&A^*S!_4{Bmz0&C6K# z#^Y)H%e^&N$>xibepY;YW>;OyMCQlEb?+tC#O;tf_OXz|Xkg`IJ)(Lqq_gCu^6sjJ z)5n@&4IOkL@{ge-!ZX~k&M7y|8y5hC+gT`jb#4Y!3 zX@91D$WI6J(|>6vEaXtW5k_B8X!FbYiC$;u0a~-Cf0#|?;>wup{N?{mlpe&}oQ9uqa=r5v3R25zb&g=I-glfoql;;p^QRkh@-~)(NqtWT(C6q%cS%h|CAC zzsG{EUzErgBgc>#+!)<*4ZKPfl9~_1|dL z^S|Tk%{+|7)#MMZS-M;q8vI3b9-04vtIHo;*M2pT{@|*a1AoJsHR&p<`5(9n{lWDc z7T1hFxMFM2TFQu%FkR^eg^ro%b;+!!I&ba%E3O=*Lo3xLJVWsghj#ygYU=+3RbV9+ zRlTj0KMkwi+t18X5TmJEr&|WN|G_d?(1rf4zo(KF=dCsbe^j^R4EPZ|NU+pd3wSp(n>W{Iva9mK4p86Tv6tOh zny6KMo|sI;gCK#990hgH{6h2xJ%wYREy4mz>0z&znuDNK>4pM*MYj3BDCN@`n@O%5 zN}qiG3sP!&i&A@Oj89%nk!=Ngr%D zL@+P^^|oW!}%bT-`h;w4a#iqnfskk*CAz8j;LPL%Lqyn<*5(Z0KMexhy4;FGP!mmB^@y4(47`0Ra(jaIh&gk_^6p^E;&uF*BkS zz&8j1Y-uu_N!RJG&;yl~u@Z?m;WO02vlqbwQyfGqkq1wXOK3Tsk*&Q0)h!H1f`kBq z*gu}`{(7ashWBS;J$Df@rWU3(UM&7-P9eG{hNx8KjMSLv>EL;2d` za_B4P7DmA}>CHWE?oVHDk(1i0-`Z(=$IPG;#&ze+!inAu-M*U}t`T$vvrTX%U_(3l z$9dqgA$64n?fh_C_;bw^1DECzQL)evM>{W3RF{9XR*?JnF*r%3MD_*i%9VuZaPon~0 zYL^1VuRhT!a;gSgHcS-oSHl^?zv`X^|&tT3WmW#^q`(kfK9PD&THFm(Bd^nGOD5zdnTZ$KGTE8 zyYMP1yC??7GA(Q}fL{RNmw);UI9Qp{!7spol_)yKBq`|CuSKDqVaRat?NtYOie4Sf z@w4D4YAP?uv~&AzJ;BZdx!W@B9+1TYbe-WQj{uJdkA$?0w!&p{$EwV!!DV{K8u~%P zgWS`*mhP81=#9No^kqJJ^Cq^%RE*woZ++OB(3;z7(Gmd0mLKrDhlA=@7YEqK+VM{Y zt0th@gs^2iN<}JC6JyIdTSE(Pu=`!L&sj6kt{Py#b9#WVv;R%ZU2y?k@b5L8$}KCU zP}BLX>xWNM8Q!sk*N}Y8dKAg^bbc=ro@e2kmMwzZ0B!&~0I7jkV8-`n=+qmHtjkf# zrS)e&KYrs>x+(TyPw#GNY7JQ>N|EpIO@pnZ_U$G!;zb^XwVeZZzZ$3HcT(D7L}+Bv zAO*r7^|n`rp0OxG-xxN0W7Rva_PPBk&A2iVNY3Mjj1g@H|uVMWiC8)bvM;-yacso90e(t?6bv5f?bN3sM@6F~kB=lbr-xt}$w( zn5~x-p&3^w=Xu>(hv!@TrgO8*#c1DgsJk!)NqhX(HE%i7n&&ri_cm#*^*?aGK!e>| z_s7iBg>Q1#OFg#!iR*mDZ$_NeE*yKuzB%^OX;rGrf@D+lTXr|`cWhtV0_5q`j1?*$ zNWG}6@3$Xrl$+gBxA)E&le)LnEk$zI>n&4(3K{1QcjKTG1`&gdqBZC{%UL=!0Wmdx zWGJC>L%tqn(43CVvP+jcdst4scZ74GGuG7mu(7)E?TD>5EtRERLRDlP$=vY3{pjJW z$N>1L;pKQ}hm-G1ZvpcTHQyF7t{+mBunNf)mluahw| z)V9(KMps!LTgh3?r4Ay!XQaqA+sXwT`EF~JDk+OJGci?~-*T!_g#mSIL#KW4x$|ym z-FI#Wtdxq|g7p-tz2w55*@WufUVdFo%jcX4$Ct~$s8-F+nT+iUwk<^)6*}F0LEeb( zSmV#8H0!dWDr@$_SN~ogWwBEzPA#pbF!e1`wYIg&)wRTz-LmUbZ^KhGEc)jB@m_%j 
zQKd@GZx*;7(+wqDsHI80T)Rq7ymq~lZ|5?mg;d^Bv#kEDp+2s?HwCuudGnOT)7dgj zc$mk~k~@QE#dRkyHg0VQJ%N8g8pu~)KA%m~z896C_FdXPu_-;^VG1)f%d5wcFVQVE z#>(iO`LVJVixHb>N5Mc>;=oVrP~Ee$f{3&?)iI{SuPse7_KF5WBYVQ~OdIu%E+$1M zcgg1PiWE%tx~G>;-ZRALE@Li*B~pm53(|_|Zcx%BCHcm8O)(khpaDv>lG_eKgfrrZ zGlDgarDGjQN#zGUG;OG!Hn(lA8*w(Fs07^5UT^L0mx9)nAE`7_zr_M*rp1-i8jA-e zPq!d=M)3py6IBD#+v>7f3ZguGn6C1D1h}3}SuLA{JURtZ1Yr@Cli#<6T!#<=FM$c9 zB*qaQy#9(ojDr|-5%@|B`i{AKTuAV1d{pF;FQQlp*|Skgf6Sy^p*onw zV?=9ylErnq4eep@eVn|^l#Di@((prmY&hJ)P)HXMZuo2pUy&F<9*%>zp?6>p&t;ZKCFQAo(B@gayJ*KpvXMbP^7@gm%-iBrOW25S7%IgdVi0 z-Xm@kD3(9eoQTdHSoO-@Y9%p#F`?5_^)nk|h)(C;Pn_c7AK53bw|4?_f5U$$7f_@C z=T)&KH_=z^ZStf6r{uO`3VZH}dgCSBdy2O~+O>s5PQ$fX)Ksd!9f8;$WCNdM_+#p2 z({0J_QQQdeQ;X=lE=TKiNbiC-6|=2Qqy9d3N$kF#Q16T)4TBlkv@iX|nS36!?pO$@ zkG8jc6mzMG`l>(i{hV^|#gXz_Fyl{De0erm2K!A8_~>xduA?&7qjs*(C!5!9XUdM8d4z^JCO8j7)q# z9cVe2GczO<*>MWp`DYi+kAqCy%v4yobkXv``gT`r4RxMpyF1(H8OIT{*7|YOHJ8iK zkAWv0Yu~xMA1^x!F6WI07VUEu9FRnu3JrJMwLioiy5XyK@795}OLG)DMW>pnjRDXQ zMGnt*BU!Qztag8=UZZ9#XeWSw%g7vTWMl-#K0QP1|3AxPobY`~B#Ck;+F4RF%!3A3 znJ<%Dt80lnO&FkTEY!4&8^fFG)~{!ZIK+AE_M5tl-B#b}$vN7wYf$0VN0*G`k}z5;(G z`yPR&Us2ut5=S=O$=Yqo!N<#-k50(XNNKkZSB(PHq z7Mq|9*#gu>b0)}e@;_}@RX(^36Qjy)ZZ|uJA%=?yFd%`?k4Uai=Y9Bm$C^K1S9N56 zPPPhEKA{M3+`7JY>gM~AFlD&qY7CH_cIcoKc!kfGJ zDFbj$Jxq}HzE9Vjd~VP&j#yJzzp!RsPV?}RRe^^D&Idj|M!$IB5;(Kv`igiaf1qgx zV63Y-HHrH+-E^iT!t(-E^U zr^U;5;#;_dOerlRs^d0BjTs**!D}Jqu?97z;*@dy<$gP8oAU|_0!~qQelAz(x&(Fk zjI$|=uhZeCu8W19HMuwm4p{)Mk*o{AXfyQGP;&d5A=9l+g{mlL(6I~i6ur?fK>b`` z?#pKVAa52SbLppA!_qI3Gz>lMw9Wl`Ec$7BST{kK-3{f26*Fr^sV^U zlw#`lw>${PXjR<{o=C;J-?};qKBuI=#hf@mLcB3H6&b#I>KTsv=25hOA4QPpb%gO%PuSL& zzgRZT%usg{&yF7GX}o!)Vei=h?#T2y-SaR#A}K<{w~U2B5{@JkfkLm{knf_7fRoe@ z{qC`15E^&6{WAgVbTqUHwrKIB?r&=qR?7M&cqACZ1@(5(R(--X*BD1G z?&M^5`SCLcsH1weHzWm4X=?9;fmkj0r|@eDCc-9rV6i4QFEIx<=@eLph)zBrDHq7A z@Dyx0@>os~BA(O4pdn~Vgi=xd3~_o~&KJQnR|qm=nohy9w^#f#AKFPQcZ+S3+WKaU zrdEEM)irO_cakos`FRS6nvJ%N0u70>nWT9XjcUI1MoKPQex=Fh5YiI0cS2AjG`bpS zpcqDTN;sPjq%}Hlr++;D^CSQ&aGU{yv!{JqO_S2Um^%`eU63Xxf$VW(S`#mIz3Cd( z+?URHm$cgu;*Unql&GCWS;2>;u6d;#5k;qkQcW|0C%t3?U)_&l-Y!_Z_A_=@WmFN9Z&Rh}ZeVH>=+cjCBu*3i3S~uJ2y~OO?=gb0$yKyh&mK4l@X{sIkec z3_G=WNp*#chhAc%dwC5hvI=wGrCnto!JZA?(n(ZzbIE@9PS;rVdz504gw41ro+#+E zbFyp(7XuTcuBmKRxgkK9*9CFGL2P&NS;IHIgovIh7U9bAs@n#p%QG^lTp#E0C12Ex z*gkpX%#8$XOH4(8M{q6^bL!ns`!zG<=tV1+|68VQ~rqGE)-=Dgjg9Ni16;+Z{UAv*OWp z@8Ux1L73A)(_Y%HS@sKVp-0b2Z~^#V6PyOEp^Xbl90cws&Y75KnS2TrS#6_v;S%1B)|3W_SIG6h?NU^Z%p7Ex8j`kN*V)D=*WC77HseVJ0!9pw0DdP zZr-c*QE8i+elt{ZFnx-a2wx?n{It8`i4wketJp~Aa85n!&KXu9buCSU&Ag`s%$-B# z%a9j|AzU;li|lygu~u^z&r9NMfX)_9rAe0KOIJhjU7}TnIcNZF@8vYOc$*YGW!Ozu z*6sPg!vR`4@?7@&!7cu+Fl9<`OWvd*ow<97EP!g7l_I}l3dqiWw&?Dx-1HHZMDcSm zkRCt}U;Qg{}f1 z1UZ59unbXwiOO5y03pCL`Yl1WxB(^dgS~FJ;z2o`sI<9VxE-o_5tcZHizG*f*aL=w zCk;FjpqQ@Ps(^<-<&Q$2irZv)ZR103BZ|*(%-0mWyIZ2AsbQ-M7g~OY~aF)%?yC?<|v;_3SQ><3+!(&L`f>`L$38 zwHgx)XSt)RlF%xakX8FLXod%!AZSYz60#kdJy;W*oP^vEylnF|8ZTKiqb529x98&X zW;_F&7l?1N@qdz{li$k-C}*-k^7in@v?9~PH{GLqGi`d>A$N+qD^yP=IH>Ka_z!N% zH*TI#%TeJ&o4@LxHHjX+f*2_?GnTxP;J^8nIMzt1+e~|<)7;Yc&R{wA`L8!6ef57y z6LSlvnCn=ws*Pg*e#mA>@BXw$&1HSaqNW8av8* znM@#(Jvt#ZwHyM4)sykT`%K`zqodbiwEVZ0#Cm|PF z_up9b`#p5nI7q(s>~ZRhIS)}TvTXLg?g|j0}pu{geygxIe^O$P~jpV(fwH3wK`CBy#8Mn#r?{W9dC=_iu7)yT) z1^ui@^4gbgK(lkf1{RPmXy?y^eM2;oM=-UT?|Z@KA8D%3p;~Y=0Dtt;E)jweya`b! 
zclNAtAAfWSO}Z{Zw%4aHJ@A{F8K~s`gxp7_4dUa7kUEnGo8=HEzI)=u%o>FuYIlM1 z5j-bw-ero!YN+NRw;MPJ_uX<0yIg|_g6_-QKL0JLY?OTp*Ow|jdH_2ji+v0j7-h)sbF z+1;>y1zmC?t2j}T<4-xCB*IRnnBgy)DGy^lhAa`!lbr6Sw@xIRe$_}m|KyRAl^3Nk z;e*Z=P5Bz%20QMZ9h^Ynk2t6vvhFgpX^(Z);i}GtE^?4{Etrz)wjJmc~8>z$Cs#k9_-=r zNJKSA_ptX9$;L?#d#ADR_Xf+1jn5^90!q~BkfL9Ehh92lm>LQ zPZAi}!w&NChCh|18b0YGoAe|1rkmg;X#+Z4qO^K?3MlL(J>{jCZrfE{3kLS-aRfkuB*xu4)em@k(8r4Aq!#9=C`rThutIRvAYPW0;A-i*-7!Ro?~!U_>M~#=fd#`O5rsd5!l&9eImz#cc5U@mHtZ`)&{XZ*ft2SP@g6W%HnJLq^EP zduw1hE3>m9uPGjO;Dr@{odJsL>*adqo`>6? zp_j~hTB>gjY%6KM$FNeZWffD^*c2O~GF3qIb7Cs&gi-_@)ebd02lRLQk`;dO&*#U2 z@D^x^SJgWGk KMp7kp$o~QO3ASPY diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 9b0e7007c..cd9ef157f 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -33,6 +33,8 @@ import narwhals.stable.v1 as nw from narwhals.typing import IntoDataFrameT, IntoExpr, IntoFrameT +from altair.datasets._typing import EXTENSION_SUFFIXES, is_ext_read + if TYPE_CHECKING: import json # noqa: F401 import sys @@ -257,6 +259,7 @@ def __init__(self, name: _Pandas, /) -> None: ".json": pd.read_json, ".tsv": partial["pd.DataFrame"](pd.read_csv, sep="\t"), ".arrow": pd.read_feather, + ".parquet": pd.read_parquet, } self._scan_fn = {".parquet": pd.read_parquet} @@ -274,6 +277,7 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: ".json": partial["pd.DataFrame"](pd.read_json, dtype_backend=_pa), ".tsv": partial["pd.DataFrame"](pd.read_csv, sep="\t", dtype_backend=_pa), ".arrow": partial(pd.read_feather, dtype_backend=_pa), + ".parquet": partial(pd.read_parquet, dtype_backend=_pa), } self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend=_pa)} @@ -288,6 +292,7 @@ def __init__(self, name: _Polars, /) -> None: ".json": pl.read_json, ".tsv": partial(pl.read_csv, separator="\t"), ".arrow": pl.read_ipc, + ".parquet": pl.read_parquet, } self._scan_fn = {".parquet": pl.scan_parquet} @@ -304,6 +309,7 @@ def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: ".json": pl.read_json, ".tsv": partial(pl.read_csv, separator="\t", use_pyarrow=True), ".arrow": partial(pl.read_ipc, use_pyarrow=True), + ".parquet": partial(pl.read_parquet, use_pyarrow=True), } self._scan_fn = {".parquet": pl.scan_parquet} @@ -378,6 +384,7 @@ def pa_read_json(source: Any, /, **kwds) -> pa.Table: ".json": pa_read_json, ".tsv": partial(pa_read_csv, parse_options=tab_sep), ".arrow": pa_read_feather, + ".parquet": pa_read_parquet, } self._scan_fn = {".parquet": pa_read_parquet} @@ -401,17 +408,19 @@ def validate_constraints( name: Dataset | LiteralString, suffix: Extension | None, tag: Version | None, / ) -> Metadata: constraints: Metadata = {} - suffixes = ".csv", ".json", ".tsv", ".arrow" if tag is not None: constraints["tag"] = tag - if name.endswith(suffixes): + if name.endswith(EXTENSION_SUFFIXES): fp = Path(name) constraints["dataset_name"] = fp.stem constraints["suffix"] = fp.suffix return constraints elif suffix is not None: if not is_ext_read(suffix): - msg = f"Expected 'suffix' to be one of {suffixes!r},\nbut got: {suffix!r}" + msg = ( + f"Expected 'suffix' to be one of {EXTENSION_SUFFIXES!r},\n" + f"but got: {suffix!r}" + ) raise TypeError(msg) else: constraints["suffix"] = suffix @@ -432,10 +441,6 @@ def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]: return suffix == ".parquet" -def is_ext_read(suffix: Any) -> TypeIs[Extension]: - return suffix in {".csv", ".json", ".tsv", ".arrow"} - - @overload def backend(name: _PolarsAny, /) -> _Reader[pl.DataFrame, 
pl.LazyFrame]: ... diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index e9546d2b1..cdaa57322 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -4,20 +4,32 @@ from __future__ import annotations import sys -from typing import Literal +from typing import Any, Literal if sys.version_info >= (3, 14): from typing import TypedDict else: from typing_extensions import TypedDict +if sys.version_info >= (3, 13): + from typing import TypeIs +else: + from typing_extensions import TypeIs + if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias -__all__ = ["Dataset", "Extension", "Metadata", "Version"] +__all__ = [ + "EXTENSION_SUFFIXES", + "Dataset", + "Extension", + "Metadata", + "Version", + "is_ext_read", +] Dataset: TypeAlias = Literal[ "airports", @@ -96,6 +108,7 @@ "zipcodes", ] Version: TypeAlias = Literal[ + "v2.11.0", "v2.10.0", "v2.9.0", "v2.8.1", @@ -140,7 +153,12 @@ "v1.7.0", "v1.5.0", ] -Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"] +Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow", ".parquet"] +EXTENSION_SUFFIXES = (".csv", ".json", ".tsv", ".arrow", ".parquet") + + +def is_ext_read(suffix: Any) -> TypeIs[Extension]: + return suffix in {".csv", ".json", ".tsv", ".arrow", ".parquet"} class Metadata(TypedDict, total=False): diff --git a/pyproject.toml b/pyproject.toml index a3f99b7e9..43370cf7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -250,6 +250,8 @@ extend-safe-fixes=[ "ANN204", # unnecessary-dict-comprehension-for-iterable "C420", + # unnecessary-literal-set + "C405" ] # https://docs.astral.sh/ruff/preview/#using-rules-that-are-in-preview diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 205a0d958..e325147b2 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -400,7 +400,7 @@ def _dataset_params(overrides: Mapping[Dataset, DatasetSpec]) -> Iterator[Parame @datasets_debug @pytest.mark.parametrize( ("name", "suffix", "tag"), - list(_dataset_params({"flights-3m": DatasetSpec(tag="v2.9.0")})), + list(_dataset_params({"flights-3m": DatasetSpec(tag="v2.11.0")})), ) def test_all_datasets( polars_loader: Loader[pl.DataFrame, pl.LazyFrame], diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index c8e67c394..3702028ac 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -193,6 +193,9 @@ def generate_typing(self, output: Path, /) -> None: NAME = "Dataset" TAG = "Version" EXT = "Extension" + EXTENSION_TYPES = ".csv", ".json", ".tsv", ".arrow", ".parquet" + EXTENSION_SUFFIXES = "EXTENSION_SUFFIXES" + EXTENSION_GUARD = "is_ext_read" METADATA_TD = "Metadata" DESCRIPTION_DEFAULT = "_description_" NOTE_SEP = f"\n\n{indent * 2}" f".. 
note::\n{indent * 3}" @@ -276,14 +279,18 @@ def generate_typing(self, output: Path, /) -> None: f"{HEADER_COMMENT}", "from __future__ import annotations\n", "import sys", - "from typing import Literal, TYPE_CHECKING", + "from typing import Any, Literal, TYPE_CHECKING", utils.import_typing_extensions((3, 14), "TypedDict"), + utils.import_typing_extensions((3, 13), "TypeIs"), utils.import_typing_extensions((3, 10), "TypeAlias"), "\n", - f"__all__ = {[NAME, TAG, EXT, METADATA_TD]}\n\n" + f"__all__ = {[NAME, TAG, EXT, METADATA_TD, EXTENSION_GUARD, EXTENSION_SUFFIXES]}\n\n" f"{NAME}: TypeAlias = {utils.spell_literal(names)}", f"{TAG}: TypeAlias = {utils.spell_literal(tags)}", - f'{EXT}: TypeAlias = {utils.spell_literal([".csv", ".json", ".tsv", ".arrow"])}', + f"{EXT}: TypeAlias = {utils.spell_literal(EXTENSION_TYPES)}", + f"{EXTENSION_SUFFIXES} = {EXTENSION_TYPES!r}", + f"def {EXTENSION_GUARD}(suffix: Any) -> TypeIs[{EXT}]:\n" + f"{indent}return suffix in set({EXTENSION_TYPES!r})\n", UNIVERSAL_TYPED_DICT.format( name=METADATA_TD, metaclass_kwds=", total=False", diff --git a/tools/datasets/_metadata/tags.parquet b/tools/datasets/_metadata/tags.parquet index b932af7c5de7eaa7decace6422fd8191fcecb3c0..f8ed6f54e46e03902d48eed24ad595faeeae94fc 100644 GIT binary patch delta 2269 zcmai#2UHX37RP5uXd#doGDCuaP_vX!1w@DFaDDAmQ3sN+Su)v6L zA_};GC_JSZkS0Z11f|JS5EVg%=#!!-_z+%#tjm`3-pn~O_sqHX|K0iS@4NT4X{s4@ zr~Pg-2o3eiLLb3x5DNkTK<}2x7q2%!A_`!}r%-9Tvpn8XOKO|j+IGjz)1dv;+-?xq zWLSW&94RL!LweqLN-W)tjAD&84HWg&p73yYDXl&+LD|x{sCIRGb z0pbM?(;-$$x&g>aRnS(sv2BC_Vab+&44zCvg|dAiEX;y4Qs$}LY;~vzw=$z-86R~( z_IbhLQZxgr)SSV@aw7~f29b`wA+T(CTFh_ z)`?{Ei)D5l?P_`ac)RkpB(|$XyLTaxj+^cxs_>nu)*S4<#kRD$SJ&TbS(IUE{D|nVwJ(Whh$gUf>w27@l{dP*0i4X;dJ>sx7v6OXH*JRHH%qtx{b_J zgU=6_jL*!wYP}Y?pB@mrm^2FSHte_&Q>biJo0a6BS9<2cp}yGkoc--p%&wRb{Zp>I zS5J-2Ti7a*hwM$ph5G$zvvX~4_a!!t>E~DeIJ_g6GpN?HSv8aRrcim_@F}CI`e?m{ z6LGKN$vt2G6>4v%`}pz)nCkW*E)KugWK@9m`H%MM2WXMYm3Y2lPE^4M-Qt@Jo$i37 zCnIJHnREGd@#k6c>g-R=2$jpW&dL49neyWseoBg+I+DgvxitTIHYiz6^I?{`U&S>? 
zfhUgdSl(86+=+23U$23*_3pUm`B|(_MG>#}qY-(ilJP$4$90psWwG7b9woB1lXJTg zdQz(@b@qfDZm%>=eMXk6)+0II;)&00pA%*7Z%j;{)wrs@MzYnM@*KLZ+envg(y_d^iHrLvO z2=96x%rSm@EsS5qbXh{M;n~1Q_468hp(@Fp9!g+m3|m#~^;ZpxGWaEAu&_k%PtqM9 zhYy&t!Y87*?g+u#;*~|?6NF>m|J)ZNLC{%_ z-QL@GEZkb)%~_WbG_iIv2ta+3R+Pbdb=8wOsDA@{>#10;;$TmJs`CuJ!G{$i~G(o&`u*m z1}BU$({Dyj?$U~LFDolQ_;wn1)}5usX&APfcQX6G(cLBL?AoVoAdS9fbb)bR&IrSQnbOa zAUq@k$xy6zB3vSDvou^Z6g&VxT1<#Eu*VtzP~S382n_b|4EFK&VTbrENu`2t5lH)J z&G;HFFPHsB$=S{M+)v6)JQx!HbD6( zv6?yM+s^$h1FA}J-zEr#r|qE)08MC#bSo>11y>YTl`E167BeLd=d4%g{k5D7D?m>U zE#5WBMUwmamOSAYN3q7cC5EdUlp!YrB`oNHe8k{E;fFMAqHGynhq delta 2132 zcmZvddpwkB8^@o=Fb!tL44%gflbmLZnP$vr9TUbWWRb(v`-W!P9Lphx(pH8^X{{PO zs0t z$Mfex%FvZ1&|A6LD8Tu8?lPKx{A|~zxdSg*?(Ro)pITQA&bTcB z2q2?^&@|MjcLJ*55(w$swoQk*>nBUt`~#uS0~synB-&Cx#_YQ6X}m`L$TQ>f*4s@~ zAOKnqMK6c=xkniwA7QE+>#)6t>>(`K5Z+4Sd8E`DI$4W?MtCFjm2$k zMxS2_R7JnQ;p<9KCL&Tj`)0zgLbdz-fZ{9Pc%gfL;DO;sY>PJGg_9nzOC3p#kxdIJ zW$mcF**Sbg$G@Oi{Q6W1Bex_~ptgnZ1|3sa^zxglPf6}>h`rS7>ZF0jx7$wcoJ!x@ zCaQZQ2`-4MwKMlO-g}RJr)c<(-BD{EJs5eHxu46@KPd|>;)Th~8<^N#RmEks&4Fe` zl+=QW4L@d+3RCKG}#G%`!LeZ zXB>-+a$l7~?mS$>RA$*ACO$%yVM#}SIs8LeF!}Pxp@E%L0N}ZPtSXj1yfw){=t?~z zP|JbSo^t$Oy=yKeWF=Em9V|ZZOE}+WnZ>Sqk5C4-XC~#LO|Q?4D$iwE_e}h$$2myc z$9E9~%~^)@`g%%DFJI5RyTZ`$+FZOkgV+D5y=_IhU0&qLaF>c@j6JGjObySZW5(}N zvK=E@D&q{PeDUi0dtNL zr-A@MAPE`AbXBVZP8fjF!hG@>F$Ap=^A}@!Uow^$DV%34#{eE6B7T~|3eK*%LWP9I zoEZ_Q27qpCFCI!`j$TpjOmoe>d4QPLXX2@8VKiqPb9&@PX&rj4dGzD4t>qom8xtc1n+4VROpS!CS%ja{^=%`^{_U8JKNR%}1HBvVGSd>}?z>o5J{mQuW*FpUnz<1z+9DYj zzjm&P76jM#dS(ds|M4KYKI+AErG0nI*|q`crjJK(x0zkF3DsIwmyf4zEp6C&B>YaI zVSRANMd$9`0+YB=23zDU-t%*$7rn&#;ac@jTgSBY4`MV5&50buHl}X}Sc4j9j>m~B zI#{QiG?f|@3WwJw#5<+qNlt0$Vl)mb9}xl+bd(Vy*ucp){`JdE2vsxv|b|<^>1cMVqf5(w%r+^7b?}InJ|+mJ{_>TN0qFPW+{+ zJZ8taOP#U&OoX8NP)VPSOsP6)um8}O|0b8Iwp|_khc{2PCx7t84HVzDHo7waZZ zfpIJcKB$6wK?=YaW)Pg)2}WtV&YYL_nT2KaSV{w(%M||>Rf~#UsCH2qu2Q0PS|I5- zMWGuwmN_3psX76%{z?Xbk`G7#;6-i33zNQ!oNwa=2;T>3k1XwxZX5|&i8oZ^Lj(v7 zp($2ANI0H~N@G|dhwyqhKAZ=_H6ZPTjlFzFir1GPa~Ho%P2R7Z5*3B)6J8g?hXdho zTC3eT1C8H6&wwSWMvd-*?7 z)ue>5u*8kY30vo{=nHdM@USB5?JB4Y1<(e$ltpfQ-CHL7$pwPpLLTG*(5AjwobWaK z$V@ojRer+2yk2U(NdNz?AtBnj_!@=$Cuc?ZE?WwciziupVHO=Dzxw$_`WyiOEFXk0 wd~ZIjVkHHc!;|TMn`dtY&_R#{%9G#$g)H9%G6LYbT`-@9fxEfq2#^@{Ul5T0H~;_u diff --git a/tools/datasets/_metadata/tags_npm.parquet b/tools/datasets/_metadata/tags_npm.parquet index acd04f2c79bb6936ef8776b2a066caa0c99d7515..dac952f9fa86a2de165343eb808b376195022c29 100644 GIT binary patch delta 815 zcmZvaUr19?9LIm>?Eaa5rkn0}xHm7DQn$5oZIPp3h#{;nDVYQYJvbIL?T@3xhdM%x zhzcRb@iC#K(HleCRQx!U{5c3aLiDK{&KV(iNC`M&tt$<&d2s=HXh9Xwy=&Z%sH+68foI>aT~*3OB`12iKVfJhWQN?@W76sy8j zCOMKyuluL$3$fo-$HX!x?*sKrtgFUBN6&Y-vZIIpbBV#{(*a7y} zs>+5sNZNye!FAc_RH+%KCSAR?Y0=>_>t^fc4 delta 776 zcmZvZUr3Wt7{=dozJJ@?bZdJL-?vRHp|5S^+9XGXg(CeSx~SzvAVexyS*{TxaVm(> zMMMPabg30G(F^IKKSPAV!kdC3qHYWdZ&V7j3!#gc^nGi_pj{kzIQ)3t=Y8JEz(k*s3Vv*{iTS~Y%dQn9+oEi{!rR~a%*iUJnn31-5Im=PDK+J-tJv9( zz-|RlXGI_G0drOjSJhhMP{9~`6hOvSeRu%O6=yYG;|dmDwB4t-*!Zlw3jeQm@lXxb zzqje{WqPANEM7@^t*ZSVwq%1-e%tn|@M@tQm4+MaB?+ TypeIs[Extension]: - return suffix in {".csv", ".json", ".tsv", ".arrow"} + return suffix in {".csv", ".json", ".tsv", ".arrow", ".parquet"} def _is_str(obj: Any) -> TypeIs[str]: From 95582df0847c84c61b41a349887a4a1b703477cb Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 16 Nov 2024 20:35:45 +0000 Subject: [PATCH 107/201] feat: Always use `pl.read_csv(try_parse_dates=True)` Related https://github.com/vega/altair/pull/3631#issuecomment-2480670438 --- 
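A minimal sketch (not part of the patch itself) of what `try_parse_dates=True`
changes, using a made-up two-row CSV; the column names and values below are
illustrative only:

    import io

    import polars as pl

    buf = io.BytesIO(b"symbol,date,price\nA,2000-01-01,39.81\nB,2000-02-01,36.35\n")
    plain = pl.read_csv(buf)  # "date" is inferred as a plain string column
    buf.seek(0)
    parsed = pl.read_csv(buf, try_parse_dates=True)  # "date" comes back as pl.Date
    assert plain.schema["date"] == pl.String
    assert parsed.schema["date"] == pl.Date

The same keyword is applied to both the ".csv" and ".tsv" readers in the diff
below, so date-like columns arrive already typed instead of as strings.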
altair/datasets/_readers.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index cd9ef157f..54edb909e 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -288,9 +288,9 @@ def __init__(self, name: _Polars, /) -> None: if not TYPE_CHECKING: pl = self._import(self._name) self._read_fn = { - ".csv": pl.read_csv, + ".csv": partial(pl.read_csv, try_parse_dates=True), ".json": pl.read_json, - ".tsv": partial(pl.read_csv, separator="\t"), + ".tsv": partial(pl.read_csv, separator="\t", try_parse_dates=True), ".arrow": pl.read_ipc, ".parquet": pl.read_parquet, } @@ -305,9 +305,11 @@ def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: pl = self._import(_pl) pa = self._import(_pa) # noqa: F841 self._read_fn = { - ".csv": partial(pl.read_csv, use_pyarrow=True), + ".csv": partial(pl.read_csv, use_pyarrow=True, try_parse_dates=True), ".json": pl.read_json, - ".tsv": partial(pl.read_csv, separator="\t", use_pyarrow=True), + ".tsv": partial( + pl.read_csv, separator="\t", use_pyarrow=True, try_parse_dates=True + ), ".arrow": partial(pl.read_ipc, use_pyarrow=True), ".parquet": partial(pl.read_parquet, use_pyarrow=True), } From dc4a23013d39b88b2047c8408b902081a30aec96 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 16 Nov 2024 21:46:07 +0000 Subject: [PATCH 108/201] feat: Adds `_pl_read_json_roundtrip` First mentioned in https://github.com/vega/altair/pull/3631#issuecomment-2480670438 Addresses most of the `polars` part of https://github.com/vega/altair/pull/3631#issuecomment-2479333070 --- altair/datasets/_readers.py | 36 ++++++++++++++++++++++++++++++-- tests/test_datasets.py | 41 +++++++++++++++++++++++++++++-------- 2 files changed, 66 insertions(+), 11 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 54edb909e..e55d28359 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -38,6 +38,7 @@ if TYPE_CHECKING: import json # noqa: F401 import sys + from io import IOBase from urllib.request import OpenerDirector import pandas as pd @@ -282,6 +283,37 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend=_pa)} +def _pl_read_json_roundtrip(source: Path | IOBase, /, **kwds: Any) -> pl.DataFrame: + """ + Try to utilize better date parsing available in `pl.read_csv`_. + + `pl.read_json`_ has few options when compared to `pl.read_csv`_. + + Chaining the two together - *where possible* - is still usually faster than `pandas.read_json`_. + + .. _pl.read_json: + https://docs.pola.rs/api/python/stable/reference/api/polars.read_json.html + .. _pl.read_csv: + https://docs.pola.rs/api/python/stable/reference/api/polars.read_csv.html + .. 
_pandas.read_json: + https://pandas.pydata.org/docs/reference/api/pandas.read_json.html + """ + from io import BytesIO + + import polars as pl + + df = pl.read_json(source, **kwds) + if any(tp.is_nested() for tp in df.schema.dtypes()): + # NOTE: Inferred as `(Geo|Topo)JSON`, which wouldn't be supported by `read_csv` + return df + buf = BytesIO() + df.write_csv(buf) + if kwds: + SHARED_KWDS = {"schema", "schema_overrides", "infer_schema_length"} + kwds = {k: v for k, v in kwds.items() if k in SHARED_KWDS} + return pl.read_csv(buf, try_parse_dates=True, **kwds) + + class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): def __init__(self, name: _Polars, /) -> None: self._name = _requirements(name) @@ -289,7 +321,7 @@ def __init__(self, name: _Polars, /) -> None: pl = self._import(self._name) self._read_fn = { ".csv": partial(pl.read_csv, try_parse_dates=True), - ".json": pl.read_json, + ".json": _pl_read_json_roundtrip, ".tsv": partial(pl.read_csv, separator="\t", try_parse_dates=True), ".arrow": pl.read_ipc, ".parquet": pl.read_parquet, @@ -306,7 +338,7 @@ def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: pa = self._import(_pa) # noqa: F841 self._read_fn = { ".csv": partial(pl.read_csv, use_pyarrow=True, try_parse_dates=True), - ".json": pl.read_json, + ".json": _pl_read_json_roundtrip, ".tsv": partial( pl.read_csv, separator="\t", use_pyarrow=True, try_parse_dates=True ), diff --git a/tests/test_datasets.py b/tests/test_datasets.py index e325147b2..221666c35 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,5 +1,6 @@ from __future__ import annotations +import datetime as dt import re import sys from functools import partial @@ -35,6 +36,15 @@ CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" +class DatasetSpec(TypedDict, total=False): + """Exceptional cases which cannot rely on defaults.""" + + name: Dataset + suffix: Extension + tag: Version + marks: MarksType + + requires_pyarrow: pytest.MarkDecorator = skip_requires_pyarrow() backends: pytest.MarkDecorator = pytest.mark.parametrize( @@ -346,7 +356,7 @@ def test_reader_cache( @pytest.mark.parametrize( - "dataset", + "name", [ "cars", movies_fail, @@ -361,7 +371,7 @@ def test_reader_cache( @pytest.mark.parametrize("fallback", ["polars", None]) @skip_requires_pyarrow def test_pyarrow_read_json( - fallback: _Polars | None, dataset: Dataset, monkeypatch: pytest.MonkeyPatch + fallback: _Polars | None, name: Dataset, monkeypatch: pytest.MonkeyPatch ) -> None: monkeypatch.setenv(CACHE_ENV_VAR, "") monkeypatch.delitem(sys.modules, "pandas", raising=False) @@ -370,15 +380,28 @@ def test_pyarrow_read_json( data = Loader.with_backend("pyarrow") - data(dataset, ".json") + data(name, ".json") -class DatasetSpec(TypedDict, total=False): - """Exceptional cases which cannot rely on defaults.""" - - suffix: Extension - tag: Version - marks: MarksType +@pytest.mark.parametrize( + ("spec", "column"), + [ + (DatasetSpec(name="cars", tag="v2.11.0"), "Year"), + (DatasetSpec(name="unemployment-across-industries", tag="v2.11.0"), "date"), + (DatasetSpec(name="flights-10k", tag="v2.11.0"), "date"), + (DatasetSpec(name="football", tag="v2.11.0"), "date"), + (DatasetSpec(name="crimea", tag="v2.11.0"), "date"), + (DatasetSpec(name="ohlc", tag="v2.11.0"), "date"), + ], +) +def test_polars_read_json_roundtrip( + polars_loader: Loader[pl.DataFrame, pl.LazyFrame], + spec: DatasetSpec, + column: str, +) -> None: + frame = polars_loader(spec["name"], ".json", tag=spec["tag"]) + tp = 
frame.schema.to_python()[column] + assert tp is dt.date or issubclass(tp, dt.date) def _dataset_params(overrides: Mapping[Dataset, DatasetSpec]) -> Iterator[ParameterSet]: From 7ddb2a8c1e8ec6477cfc646c385e0b168f2fd330 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 17 Nov 2024 19:28:43 +0000 Subject: [PATCH 109/201] feat(DRAFT): Adds infer-based `altair.datasets.load` Requested by @joelostblom in: https://github.com/vega/altair/pull/3631#issuecomment-2480832609 https://github.com/vega/altair/pull/3631#issuecomment-2479333070 --- altair/datasets/__init__.py | 35 +++++++++++++++--------- altair/datasets/_readers.py | 32 +++++++++++++++++++++- tests/test_datasets.py | 54 +++++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 13 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 3760a4f2a..4545d36b0 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -23,7 +23,7 @@ from altair.datasets._readers import _Backend from altair.datasets._typing import Dataset, Extension, Version -__all__ = ["Loader", "data"] +__all__ = ["Loader", "load"] class Loader(Generic[IntoDataFrameT, IntoFrameT]): @@ -320,18 +320,29 @@ def __repr__(self) -> str: return f"{type(self).__name__}[{self._reader._name}]" +load: Loader[Any, Any] + + def __getattr__(name): - if name == "data": - global data - data = Loader.with_backend("pandas") - from altair.utils.deprecation import deprecated_warn - - deprecated_warn( - "Added only for backwards compatibility with `altair-viz/vega_datasets`.", - version="5.5.0", - alternative="altair.datasets.Loader.with_backend(...)", + if name == "load": + import warnings + + from altair.datasets._readers import infer_backend + + reader = infer_backend() + global load + load = Loader.__new__(Loader) + load._reader = reader + + warnings.warn( + "For full IDE completions, instead use:\n\n" + " from altair.datasets import Loader\n" + " load = Loader.with_backend(...)\n\n" + "Related: https://github.com/vega/altair/pull/3631#issuecomment-2480832609", + UserWarning, stacklevel=3, ) - return data + return load else: - raise AttributeError(name) + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index e55d28359..953401bae 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -11,7 +11,7 @@ import os import urllib.request -from collections.abc import Mapping, Sequence +from collections.abc import Iterable, Mapping, Sequence from functools import partial from importlib import import_module from importlib.util import find_spec @@ -475,6 +475,36 @@ def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]: return suffix == ".parquet" +def is_available(pkg_names: str | Iterable[str], *more_pkg_names: str) -> bool: + pkgs_names = pkg_names if not isinstance(pkg_names, str) else (pkg_names,) + names = chain(pkgs_names, more_pkg_names) + return all(find_spec(name) is not None for name in names) + + +def infer_backend( + *, priority: Sequence[_Backend] = ("polars", "pandas[pyarrow]", "pandas", "pyarrow") +) -> _Reader[Any, Any]: + """ + Return the first available reader in order of `priority`. 
+ + Notes + ----- + - ``"polars"``: can natively load every dataset (including ``(Geo|Topo)JSON``) + - ``"pandas[pyarrow]"``: can load *most* datasets, guarantees ``.parquet`` support + - ``"pandas"``: supports ``.parquet``, if `fastparquet`_ is installed + - ``"pyarrow"``: least reliable + + .. _fastparquet: + https://github.com/dask/fastparquet + + """ + it = (backend(name) for name in priority if is_available(_requirements(name))) + if reader := next(it, None): + return reader + msg = f"Found no supported backend, searched:\n" f"{priority!r}" + raise NotImplementedError(msg) + + @overload def backend(name: _PolarsAny, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 221666c35..f903d500a 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -3,7 +3,9 @@ import datetime as dt import re import sys +import warnings from functools import partial +from importlib import import_module from importlib.util import find_spec from typing import TYPE_CHECKING, Any, cast, get_args from urllib.error import URLError @@ -127,6 +129,58 @@ def test_loader_url(backend: _Backend) -> None: assert pattern.match(url) is not None +def test_load(monkeypatch: pytest.MonkeyPatch) -> None: + """ + Inferring the best backend available. + + Based on the following order: + + priority: Sequence[_Backend] = "polars", "pandas[pyarrow]", "pandas", "pyarrow" + """ + import altair.datasets + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=UserWarning) + from altair.datasets import load + + assert load._reader._name == "polars" + monkeypatch.delattr(altair.datasets, "load") + + monkeypatch.setitem(sys.modules, "polars", None) + + from altair.datasets import load + + if find_spec("pyarrow") is None: + # NOTE: We can end the test early for the CI job that removes `pyarrow` + assert load._reader._name == "pandas" + monkeypatch.delattr(altair.datasets, "load") + monkeypatch.setitem(sys.modules, "pandas", None) + with pytest.raises(NotImplementedError, match="no.+backend"): + from altair.datasets import load + else: + assert load._reader._name == "pandas[pyarrow]" + monkeypatch.delattr(altair.datasets, "load") + + monkeypatch.setitem(sys.modules, "pyarrow", None) + + from altair.datasets import load + + assert load._reader._name == "pandas" + monkeypatch.delattr(altair.datasets, "load") + + monkeypatch.setitem(sys.modules, "pandas", None) + monkeypatch.delitem(sys.modules, "pyarrow") + monkeypatch.setitem(sys.modules, "pyarrow", import_module("pyarrow")) + from altair.datasets import load + + assert load._reader._name == "pyarrow" + monkeypatch.delattr(altair.datasets, "load") + monkeypatch.setitem(sys.modules, "pyarrow", None) + + with pytest.raises(NotImplementedError, match="no.+backend"): + from altair.datasets import load + + @backends def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv(CACHE_ENV_VAR, raising=False) From 9544d9b68e1e6c1786d823cdd9ef3e961497cfa3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 18 Nov 2024 21:39:24 +0000 Subject: [PATCH 110/201] refactor: Rename `Loader.with_backend` -> `Loader.from_backend` https://github.com/vega/altair/pull/3631#discussion_r1847157544 --- altair/datasets/__init__.py | 28 ++++++++++++++-------------- tests/test_datasets.py | 24 ++++++++++++------------ 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/altair/datasets/__init__.py 
b/altair/datasets/__init__.py index 4545d36b0..d01ef6f60 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -34,7 +34,7 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): from altair.datasets import Loader - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") >>> data # doctest: +SKIP Loader[polars] @@ -46,24 +46,24 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): @overload @classmethod - def with_backend( + def from_backend( cls, backend_name: Literal["polars", "polars[pyarrow]"], / ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... @overload @classmethod - def with_backend( + def from_backend( cls, backend_name: Literal["pandas", "pandas[pyarrow]"], / ) -> Loader[pd.DataFrame, pd.DataFrame]: ... @overload @classmethod - def with_backend( + def from_backend( cls, backend_name: Literal["pyarrow"], / ) -> Loader[pa.Table, pa.Table]: ... @classmethod - def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: + def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: """ Initialize a new loader, with the specified backend. @@ -94,7 +94,7 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: from altair.datasets import Loader - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") cars = data("cars") >>> type(cars) # doctest: +SKIP @@ -102,7 +102,7 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: Using ``pandas``: - data = Loader.with_backend("pandas") + data = Loader.from_backend("pandas") cars = data("cars") >>> type(cars) # doctest: +SKIP @@ -110,7 +110,7 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: Using ``pandas``, backed by ``pyarrow`` dtypes: - data = Loader.with_backend("pandas[pyarrow]") + data = Loader.from_backend("pandas[pyarrow]") cars = data("cars", tag="v1.29.0") >>> type(cars) # doctest: +SKIP @@ -170,7 +170,7 @@ def __call__( from altair.datasets import Loader - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") source = data("stocks", tag="v2.10.0") >>> source.columns # doctest: +SKIP @@ -198,7 +198,7 @@ def __call__( Using ``pandas``: - data = Loader.with_backend("pandas") + data = Loader.from_backend("pandas") source = data("stocks", tag="v2.10.0") >>> source.columns # doctest: +SKIP @@ -222,7 +222,7 @@ def __call__( Using ``pyarrow``: - data = Loader.with_backend("pyarrow") + data = Loader.from_backend("pyarrow") source = data("stocks", tag="v2.10.0") >>> source.column_names # doctest: +SKIP @@ -276,7 +276,7 @@ def url( import altair as alt from altair.datasets import Loader - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") >>> data.url("cars", tag="v2.9.0") # doctest: +SKIP 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json' @@ -302,7 +302,7 @@ def cache_dir(self) -> Path | None: from altair.datasets import Loader - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") data.cache_dir = Path.home() / ".altair_cache" >>> data.cache_dir.relative_to(Path.home()).as_posix() # doctest: +SKIP @@ -337,7 +337,7 @@ def __getattr__(name): warnings.warn( "For full IDE completions, instead use:\n\n" " from altair.datasets import Loader\n" - " load = Loader.with_backend(...)\n\n" + " load = Loader.from_backend(...)\n\n" "Related: https://github.com/vega/altair/pull/3631#issuecomment-2480832609", UserWarning, stacklevel=3, diff --git a/tests/test_datasets.py b/tests/test_datasets.py index f903d500a..0d2deae7f 100644 --- 
a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -86,7 +86,7 @@ class DatasetSpec(TypedDict, total=False): def polars_loader( tmp_path_factory: pytest.TempPathFactory, ) -> Loader[pl.DataFrame, pl.LazyFrame]: - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") data.cache_dir = tmp_path_factory.mktemp("loader-cache-polars") return data @@ -112,14 +112,14 @@ def metadata_columns() -> frozenset[str]: @backends -def test_loader_with_backend(backend: _Backend) -> None: - data = Loader.with_backend(backend) +def test_loader_from_backend(backend: _Backend) -> None: + data = Loader.from_backend(backend) assert data._reader._name == backend @backends def test_loader_url(backend: _Backend) -> None: - data = Loader.with_backend(backend) + data = Loader.from_backend(backend) dataset_name = "volcano" pattern = re.compile( rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{dataset_name}\..+" @@ -185,7 +185,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv(CACHE_ENV_VAR, raising=False) - data = Loader.with_backend(backend) + data = Loader.from_backend(backend) frame = data("stocks", ".csv") assert is_into_dataframe(frame) nw_frame = nw.from_native(frame) @@ -208,7 +208,7 @@ def test_missing_dependency_single( flags=re.DOTALL, ), ): - Loader.with_backend(backend) + Loader.from_backend(backend) @pytest.mark.parametrize("backend", ["polars[pyarrow]", "pandas[pyarrow]"]) @@ -227,7 +227,7 @@ def test_missing_dependency_multi( flags=re.DOTALL, ), ): - Loader.with_backend(backend) + Loader.from_backend(backend) @backends @@ -239,7 +239,7 @@ def test_dataset_not_found(backend: _Backend) -> None: """ import polars as pl - data = Loader.with_backend(backend) + data = Loader.from_backend(backend) real_name: Literal["disasters"] = "disasters" real_suffix: Literal[".csv"] = ".csv" real_tag: Literal["v1.14.0"] = "v1.14.0" @@ -344,7 +344,7 @@ def test_reader_cache( monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) - data = Loader.with_backend(backend) + data = Loader.from_backend(backend) cache_dir = data.cache_dir assert cache_dir is not None assert cache_dir == tmp_path @@ -432,7 +432,7 @@ def test_pyarrow_read_json( if fallback is None: monkeypatch.setitem(sys.modules, "polars", None) - data = Loader.with_backend("pyarrow") + data = Loader.from_backend("pyarrow") data(name, ".json") @@ -497,7 +497,7 @@ def _raise_exception(e: type[Exception], *args: Any, **kwds: Any): def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: from polars.testing import assert_frame_equal - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") data.cache_dir = tmp_path data("londonCentroids") @@ -536,7 +536,7 @@ def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - @backends def test_metadata_columns(backend: _Backend, metadata_columns: frozenset[str]) -> None: """Ensure all backends will query the same column names.""" - data = Loader.with_backend(backend) + data = Loader.from_backend(backend) fn = data._reader.scan_fn(_METADATA) native = fn(_METADATA) schema_columns = nw.from_native(native).lazy().collect().columns From 7b3a89e5b5374eb391b7ae73ace219327069f979 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 18 Nov 2024 21:52:47 +0000 Subject: [PATCH 111/201] feat(DRAFT): Add optional `backend` parameter for `load(...)` Requested by @jonmmease 
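A minimal sketch of the intended call-style (illustrative only; it mirrors the overloads and the test added below, nothing beyond them):

    from altair.datasets import load

    cars = load("cars")                           # backend inferred by priority
    tbl = load("cars", backend="pyarrow")         # force a specific backend
    df = load("cars", backend="pandas[pyarrow]")  # pandas with pyarrow dtypes
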
https://github.com/vega/altair/pull/3631#discussion_r1847111064 https://github.com/vega/altair/pull/3631#discussion_r1847176465 --- altair/datasets/__init__.py | 94 +++++++++++++++++++++++++++++++------ tests/test_datasets.py | 81 ++++++++++++++++++++------------ 2 files changed, 132 insertions(+), 43 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index d01ef6f60..26fd39b20 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Generic, overload +from typing import TYPE_CHECKING, Generic, final, overload from narwhals.typing import IntoDataFrameT, IntoFrameT @@ -320,28 +320,94 @@ def __repr__(self) -> str: return f"{type(self).__name__}[{self._reader._name}]" -load: Loader[Any, Any] +@final +class _Load(Loader[IntoDataFrameT, IntoFrameT]): + @overload + def __call__( # pyright: ignore[reportOverlappingOverload] + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: None = ..., + **kwds: Any, + ) -> IntoDataFrameT: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["polars", "polars[pyarrow]"] = ..., + **kwds: Any, + ) -> pl.DataFrame: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["pandas", "pandas[pyarrow]"] = ..., + **kwds: Any, + ) -> pd.DataFrame: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["pyarrow"] = ..., + **kwds: Any, + ) -> pa.Table: ... 
+ def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, + backend: _Backend | None = None, + **kwds: Any, + ) -> IntoDataFrameT | pl.DataFrame | pd.DataFrame | pa.Table: + if backend is None: + return super().__call__(name, suffix, tag, **kwds) + else: + return self.from_backend(backend)(name, suffix, tag=tag, **kwds) + + +load: _Load[Any, Any] +""" +For full IDE completions, instead use: + + from altair.datasets import Loader + load = Loader.from_backend("polars") + cars = load("cars") + movies = load("movies") + +Alternatively, specify ``backend`` during a call: + + from altair.datasets import load + cars = load("cars", backend="polars") + movies = load("movies", backend="polars") + +Related +------- +- https://github.com/vega/altair/pull/3631#issuecomment-2480832609 +- https://github.com/vega/altair/pull/3631#discussion_r1847111064 +- https://github.com/vega/altair/pull/3631#discussion_r1847176465 +""" def __getattr__(name): if name == "load": - import warnings - from altair.datasets._readers import infer_backend reader = infer_backend() global load - load = Loader.__new__(Loader) + load = _Load.__new__(_Load) load._reader = reader - - warnings.warn( - "For full IDE completions, instead use:\n\n" - " from altair.datasets import Loader\n" - " load = Loader.from_backend(...)\n\n" - "Related: https://github.com/vega/altair/pull/3631#issuecomment-2480832609", - UserWarning, - stacklevel=3, - ) return load else: msg = f"module {__name__!r} has no attribute {name!r}" diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 0d2deae7f..3d986ec75 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -3,7 +3,6 @@ import datetime as dt import re import sys -import warnings from functools import partial from importlib import import_module from importlib.util import find_spec @@ -11,7 +10,12 @@ from urllib.error import URLError import pytest -from narwhals.dependencies import is_into_dataframe, is_polars_dataframe +from narwhals.dependencies import ( + is_into_dataframe, + is_pandas_dataframe, + is_polars_dataframe, + is_pyarrow_table, +) from narwhals.stable import v1 as nw from altair.datasets import Loader @@ -138,47 +142,66 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: priority: Sequence[_Backend] = "polars", "pandas[pyarrow]", "pandas", "pyarrow" """ import altair.datasets + from altair.datasets import load - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) - from altair.datasets import load + assert load._reader._name == "polars" + monkeypatch.delattr(altair.datasets, "load") + + monkeypatch.setitem(sys.modules, "polars", None) - assert load._reader._name == "polars" + from altair.datasets import load + + if find_spec("pyarrow") is None: + # NOTE: We can end the test early for the CI job that removes `pyarrow` + assert load._reader._name == "pandas" + monkeypatch.delattr(altair.datasets, "load") + monkeypatch.setitem(sys.modules, "pandas", None) + with pytest.raises(NotImplementedError, match="no.+backend"): + from altair.datasets import load + else: + assert load._reader._name == "pandas[pyarrow]" monkeypatch.delattr(altair.datasets, "load") - monkeypatch.setitem(sys.modules, "polars", None) + monkeypatch.setitem(sys.modules, "pyarrow", None) from altair.datasets import load - if find_spec("pyarrow") is None: - # NOTE: We can end the test early for the CI job that removes `pyarrow` - assert load._reader._name == "pandas" - 
monkeypatch.delattr(altair.datasets, "load") - monkeypatch.setitem(sys.modules, "pandas", None) - with pytest.raises(NotImplementedError, match="no.+backend"): - from altair.datasets import load - else: - assert load._reader._name == "pandas[pyarrow]" - monkeypatch.delattr(altair.datasets, "load") + assert load._reader._name == "pandas" + monkeypatch.delattr(altair.datasets, "load") + + monkeypatch.setitem(sys.modules, "pandas", None) + monkeypatch.delitem(sys.modules, "pyarrow") + monkeypatch.setitem(sys.modules, "pyarrow", import_module("pyarrow")) + from altair.datasets import load - monkeypatch.setitem(sys.modules, "pyarrow", None) + assert load._reader._name == "pyarrow" + monkeypatch.delattr(altair.datasets, "load") + monkeypatch.setitem(sys.modules, "pyarrow", None) + with pytest.raises(NotImplementedError, match="no.+backend"): from altair.datasets import load - assert load._reader._name == "pandas" - monkeypatch.delattr(altair.datasets, "load") - monkeypatch.setitem(sys.modules, "pandas", None) - monkeypatch.delitem(sys.modules, "pyarrow") - monkeypatch.setitem(sys.modules, "pyarrow", import_module("pyarrow")) - from altair.datasets import load +@requires_pyarrow +def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: + import altair.datasets + + monkeypatch.delattr(altair.datasets, "load", raising=False) + + load = altair.datasets.load + assert load._reader._name == "polars" - assert load._reader._name == "pyarrow" - monkeypatch.delattr(altair.datasets, "load") - monkeypatch.setitem(sys.modules, "pyarrow", None) + default = load("cars") + df_pyarrow = load("cars", backend="pyarrow") + df_pandas = load("cars", backend="pandas[pyarrow]") + default_2 = load("cars") + df_polars = load("cars", backend="polars") - with pytest.raises(NotImplementedError, match="no.+backend"): - from altair.datasets import load + assert is_polars_dataframe(default) + assert is_pyarrow_table(df_pyarrow) + assert is_pandas_dataframe(df_pandas) + assert is_polars_dataframe(default_2) + assert is_polars_dataframe(df_polars) @backends From c835c131282cc189b9bc4cc91bef2492c0b2dd36 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 20 Nov 2024 13:25:27 +0000 Subject: [PATCH 112/201] feat(DRAFT): Adds `altair.datasets.url` A dataframe package is still required currently,. Can later be adapted to fit the requirements of (https://github.com/vega/altair/pull/3631#discussion_r1846662053). 
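A rough usage sketch (illustrative; the address shown is just an example of the jsdelivr pattern, not a pinned value):

    import altair as alt
    from altair.datasets import url

    source = url("cars")  # e.g. 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json'
    alt.Chart(source).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q")
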
Related: - https://github.com/vega/altair/pull/3631#issuecomment-2484826592 - https://github.com/vega/altair/pull/3631#issuecomment-2480832711 - https://github.com/vega/altair/discussions/3150#discussioncomment-11280516 @mattijn, @joelostblom --- altair/datasets/__init__.py | 415 ++++-------------------------------- altair/datasets/_loader.py | 394 ++++++++++++++++++++++++++++++++++ tests/test_datasets.py | 59 ++++- 3 files changed, 491 insertions(+), 377 deletions(-) create mode 100644 altair/datasets/_loader.py diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 26fd39b20..ac7ac9f06 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -1,380 +1,23 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Generic, final, overload +from typing import TYPE_CHECKING -from narwhals.typing import IntoDataFrameT, IntoFrameT - -from altair.datasets._readers import _Reader, backend +from altair.datasets._loader import Loader if TYPE_CHECKING: import sys - from pathlib import Path - from typing import Any, Literal - - import pandas as pd - import polars as pl - import pyarrow as pa - from _typeshed import StrPath + from typing import Any if sys.version_info >= (3, 11): from typing import LiteralString else: from typing_extensions import LiteralString - from altair.datasets._readers import _Backend - from altair.datasets._typing import Dataset, Extension, Version - -__all__ = ["Loader", "load"] - - -class Loader(Generic[IntoDataFrameT, IntoFrameT]): - """ - Load examples **remotely** from `vega-datasets`_, with *optional* caching. - - A new ``Loader`` must be initialized by specifying a backend: - - from altair.datasets import Loader - - data = Loader.from_backend("polars") - >>> data # doctest: +SKIP - Loader[polars] - - .. _vega-datasets: - https://github.com/vega/vega-datasets - """ - - _reader: _Reader[IntoDataFrameT, IntoFrameT] - - @overload - @classmethod - def from_backend( - cls, backend_name: Literal["polars", "polars[pyarrow]"], / - ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... - - @overload - @classmethod - def from_backend( - cls, backend_name: Literal["pandas", "pandas[pyarrow]"], / - ) -> Loader[pd.DataFrame, pd.DataFrame]: ... - - @overload - @classmethod - def from_backend( - cls, backend_name: Literal["pyarrow"], / - ) -> Loader[pa.Table, pa.Table]: ... - - @classmethod - def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: - """ - Initialize a new loader, with the specified backend. - - Parameters - ---------- - backend_name - DataFrame package/config used to return data. - - * *polars*: Using `polars defaults`_ - * *polars[pyarrow]*: Using ``use_pyarrow=True`` - * *pandas*: Using `pandas defaults`_. - * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` - * *pyarrow*: (*Experimental*) - - .. warning:: - Most datasets use a `JSON format not supported`_ by ``pyarrow`` - - .. _polars defaults: - https://docs.pola.rs/api/python/stable/reference/io.html - .. _pandas defaults: - https://pandas.pydata.org/docs/reference/io.html - .. 
_JSON format not supported: - https://arrow.apache.org/docs/python/json.html#reading-json-files - - Examples - -------- - Using ``polars``: - - from altair.datasets import Loader - - data = Loader.from_backend("polars") - cars = data("cars") - - >>> type(cars) # doctest: +SKIP - polars.dataframe.frame.DataFrame - - Using ``pandas``: - - data = Loader.from_backend("pandas") - cars = data("cars") - - >>> type(cars) # doctest: +SKIP - pandas.core.frame.DataFrame - - Using ``pandas``, backed by ``pyarrow`` dtypes: - - data = Loader.from_backend("pandas[pyarrow]") - cars = data("cars", tag="v1.29.0") - - >>> type(cars) # doctest: +SKIP - pandas.core.frame.DataFrame - - >>> cars.dtypes # doctest: +SKIP - Name string[pyarrow] - Miles_per_Gallon double[pyarrow] - Cylinders int64[pyarrow] - Displacement double[pyarrow] - Horsepower int64[pyarrow] - Weight_in_lbs int64[pyarrow] - Acceleration double[pyarrow] - Year string[pyarrow] - Origin string[pyarrow] - dtype: object - """ - obj = Loader.__new__(Loader) - obj._reader = backend(backend_name) - return obj - - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - tag: Version | None = None, - **kwds: Any, - ) -> IntoDataFrameT: - """ - Get a remote dataset and load as tabular data. - - Parameters - ---------- - name - Name of the dataset/`Path.stem`_. - suffix - File extension/`Path.suffix`_. - - .. note:: - Only needed if ``name`` is available in multiple formats. - tag - Version identifier for a `vega-datasets release`_. - **kwds - Arguments passed to the underlying read function. - - .. _Path.stem: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem - .. _Path.suffix: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - .. _vega-datasets release: - https://github.com/vega/vega-datasets/releases - - Examples - -------- - Using ``polars``: - - from altair.datasets import Loader - data = Loader.from_backend("polars") - source = data("stocks", tag="v2.10.0") - - >>> source.columns # doctest: +SKIP - ['symbol', 'date', 'price'] - - >>> source # doctest: +SKIP - shape: (560, 3) - ┌────────┬────────────┬────────┐ - │ symbol ┆ date ┆ price │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ f64 │ - ╞════════╪════════════╪════════╡ - │ MSFT ┆ Jan 1 2000 ┆ 39.81 │ - │ MSFT ┆ Feb 1 2000 ┆ 36.35 │ - │ MSFT ┆ Mar 1 2000 ┆ 43.22 │ - │ MSFT ┆ Apr 1 2000 ┆ 28.37 │ - │ MSFT ┆ May 1 2000 ┆ 25.45 │ - │ … ┆ … ┆ … │ - │ AAPL ┆ Nov 1 2009 ┆ 199.91 │ - │ AAPL ┆ Dec 1 2009 ┆ 210.73 │ - │ AAPL ┆ Jan 1 2010 ┆ 192.06 │ - │ AAPL ┆ Feb 1 2010 ┆ 204.62 │ - │ AAPL ┆ Mar 1 2010 ┆ 223.02 │ - └────────┴────────────┴────────┘ - - Using ``pandas``: - - data = Loader.from_backend("pandas") - source = data("stocks", tag="v2.10.0") - - >>> source.columns # doctest: +SKIP - Index(['symbol', 'date', 'price'], dtype='object') - - >>> source # doctest: +SKIP - symbol date price - 0 MSFT Jan 1 2000 39.81 - 1 MSFT Feb 1 2000 36.35 - 2 MSFT Mar 1 2000 43.22 - 3 MSFT Apr 1 2000 28.37 - 4 MSFT May 1 2000 25.45 - .. ... ... ... 
- 555 AAPL Nov 1 2009 199.91 - 556 AAPL Dec 1 2009 210.73 - 557 AAPL Jan 1 2010 192.06 - 558 AAPL Feb 1 2010 204.62 - 559 AAPL Mar 1 2010 223.02 - - [560 rows x 3 columns] - - Using ``pyarrow``: - - data = Loader.from_backend("pyarrow") - source = data("stocks", tag="v2.10.0") - - >>> source.column_names # doctest: +SKIP - ['symbol', 'date', 'price'] - - >>> source # doctest: +SKIP - pyarrow.Table - symbol: string - date: string - price: double - ---- - symbol: [["MSFT","MSFT","MSFT","MSFT","MSFT",...,"AAPL","AAPL","AAPL","AAPL","AAPL"]] - date: [["Jan 1 2000","Feb 1 2000","Mar 1 2000","Apr 1 2000","May 1 2000",...,"Nov 1 2009","Dec 1 2009","Jan 1 2010","Feb 1 2010","Mar 1 2010"]] - price: [[39.81,36.35,43.22,28.37,25.45,...,199.91,210.73,192.06,204.62,223.02]] - """ - return self._reader.dataset(name, suffix, tag=tag, **kwds) - - def url( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - tag: Version | None = None, - ) -> str: - """ - Return the address of a remote dataset. - - Parameters - ---------- - name - Name of the dataset/`Path.stem`_. - suffix - File extension/`Path.suffix`_. - - .. note:: - Only needed if ``name`` is available in multiple formats. - tag - Version identifier for a `vega-datasets release`_. - - .. _Path.stem: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem - .. _Path.suffix: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - .. _vega-datasets release: - https://github.com/vega/vega-datasets/releases - - Examples - -------- - The returned url will always point to an accessible dataset: - - import altair as alt - from altair.datasets import Loader - - data = Loader.from_backend("polars") - >>> data.url("cars", tag="v2.9.0") # doctest: +SKIP - 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json' - - We can pass the result directly to a chart: - - url = data.url("cars", tag="v2.9.0") - alt.Chart(url).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q") - """ - return self._reader.url(name, suffix, tag=tag) - - @property - def cache_dir(self) -> Path | None: - """ - Returns path to datasets cache. - - By default, this can be configured using the environment variable: - - "ALTAIR_DATASETS_DIR" - - You *may* also set this directly, but the value will **not** persist between sessions: - - from pathlib import Path - - from altair.datasets import Loader - - data = Loader.from_backend("polars") - data.cache_dir = Path.home() / ".altair_cache" - - >>> data.cache_dir.relative_to(Path.home()).as_posix() # doctest: +SKIP - '.altair_cache' - """ - return self._reader._cache - - @cache_dir.setter - def cache_dir(self, source: StrPath, /) -> None: - import os - - os.environ[self._reader._ENV_VAR] = str(source) - - def __repr__(self) -> str: - return f"{type(self).__name__}[{self._reader._name}]" + from altair.datasets._loader import _Load + from altair.datasets._typing import Dataset, Extension, Version -@final -class _Load(Loader[IntoDataFrameT, IntoFrameT]): - @overload - def __call__( # pyright: ignore[reportOverlappingOverload] - self, - name: Dataset | LiteralString, - suffix: Extension | None = ..., - /, - tag: Version | None = ..., - backend: None = ..., - **kwds: Any, - ) -> IntoDataFrameT: ... - @overload - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = ..., - /, - tag: Version | None = ..., - backend: Literal["polars", "polars[pyarrow]"] = ..., - **kwds: Any, - ) -> pl.DataFrame: ... 
- @overload - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = ..., - /, - tag: Version | None = ..., - backend: Literal["pandas", "pandas[pyarrow]"] = ..., - **kwds: Any, - ) -> pd.DataFrame: ... - @overload - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = ..., - /, - tag: Version | None = ..., - backend: Literal["pyarrow"] = ..., - **kwds: Any, - ) -> pa.Table: ... - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - tag: Version | None = None, - backend: _Backend | None = None, - **kwds: Any, - ) -> IntoDataFrameT | pl.DataFrame | pd.DataFrame | pa.Table: - if backend is None: - return super().__call__(name, suffix, tag, **kwds) - else: - return self.from_backend(backend)(name, suffix, tag=tag, **kwds) +__all__ = ["Loader", "load", "url"] load: _Load[Any, Any] @@ -400,14 +43,50 @@ def __call__( """ +def url( + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, +) -> str: + """ + Return the address of a remote dataset. + + Parameters + ---------- + name + Name of the dataset/`Path.stem`_. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + tag + Version identifier for a `vega-datasets release`_. + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. _vega-datasets release: + https://github.com/vega/vega-datasets/releases + + Related + ------- + - https://github.com/vega/altair/pull/3631#issuecomment-2484826592 + - https://github.com/vega/altair/pull/3631#issuecomment-2480832711 + - https://github.com/vega/altair/discussions/3150#discussioncomment-11280516 + - https://github.com/vega/altair/pull/3631#discussion_r1846662053 + """ + from altair.datasets._loader import load + + return load.url(name, suffix, tag=tag) + + def __getattr__(name): if name == "load": - from altair.datasets._readers import infer_backend + from altair.datasets._loader import load - reader = infer_backend() - global load - load = _Load.__new__(_Load) - load._reader = reader return load else: msg = f"module {__name__!r} has no attribute {name!r}" diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py new file mode 100644 index 000000000..3c2a0ee21 --- /dev/null +++ b/altair/datasets/_loader.py @@ -0,0 +1,394 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Generic, final, overload + +from narwhals.typing import IntoDataFrameT, IntoFrameT + +from altair.datasets._readers import _Reader, backend + +if TYPE_CHECKING: + import sys + from pathlib import Path + from typing import Any, Literal + + import pandas as pd + import polars as pl + import pyarrow as pa + from _typeshed import StrPath + + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + from altair.datasets._readers import _Backend + from altair.datasets._typing import Dataset, Extension, Version + +__all__ = ["Loader", "load"] + + +class Loader(Generic[IntoDataFrameT, IntoFrameT]): + """ + Load examples **remotely** from `vega-datasets`_, with *optional* caching. + + A new ``Loader`` must be initialized by specifying a backend: + + from altair.datasets import Loader + + data = Loader.from_backend("polars") + >>> data # doctest: +SKIP + Loader[polars] + + .. 
_vega-datasets: + https://github.com/vega/vega-datasets + """ + + _reader: _Reader[IntoDataFrameT, IntoFrameT] + + @overload + @classmethod + def from_backend( + cls, backend_name: Literal["polars", "polars[pyarrow]"], / + ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... + + @overload + @classmethod + def from_backend( + cls, backend_name: Literal["pandas", "pandas[pyarrow]"], / + ) -> Loader[pd.DataFrame, pd.DataFrame]: ... + + @overload + @classmethod + def from_backend( + cls, backend_name: Literal["pyarrow"], / + ) -> Loader[pa.Table, pa.Table]: ... + + @classmethod + def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: + """ + Initialize a new loader, with the specified backend. + + Parameters + ---------- + backend_name + DataFrame package/config used to return data. + + * *polars*: Using `polars defaults`_ + * *polars[pyarrow]*: Using ``use_pyarrow=True`` + * *pandas*: Using `pandas defaults`_. + * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` + * *pyarrow*: (*Experimental*) + + .. warning:: + Most datasets use a `JSON format not supported`_ by ``pyarrow`` + + .. _polars defaults: + https://docs.pola.rs/api/python/stable/reference/io.html + .. _pandas defaults: + https://pandas.pydata.org/docs/reference/io.html + .. _JSON format not supported: + https://arrow.apache.org/docs/python/json.html#reading-json-files + + Examples + -------- + Using ``polars``: + + from altair.datasets import Loader + + data = Loader.from_backend("polars") + cars = data("cars") + + >>> type(cars) # doctest: +SKIP + polars.dataframe.frame.DataFrame + + Using ``pandas``: + + data = Loader.from_backend("pandas") + cars = data("cars") + + >>> type(cars) # doctest: +SKIP + pandas.core.frame.DataFrame + + Using ``pandas``, backed by ``pyarrow`` dtypes: + + data = Loader.from_backend("pandas[pyarrow]") + cars = data("cars", tag="v1.29.0") + + >>> type(cars) # doctest: +SKIP + pandas.core.frame.DataFrame + + >>> cars.dtypes # doctest: +SKIP + Name string[pyarrow] + Miles_per_Gallon double[pyarrow] + Cylinders int64[pyarrow] + Displacement double[pyarrow] + Horsepower int64[pyarrow] + Weight_in_lbs int64[pyarrow] + Acceleration double[pyarrow] + Year string[pyarrow] + Origin string[pyarrow] + dtype: object + """ + obj = Loader.__new__(Loader) + obj._reader = backend(backend_name) + return obj + + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, + **kwds: Any, + ) -> IntoDataFrameT: + """ + Get a remote dataset and load as tabular data. + + Parameters + ---------- + name + Name of the dataset/`Path.stem`_. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + tag + Version identifier for a `vega-datasets release`_. + **kwds + Arguments passed to the underlying read function. + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. 
_vega-datasets release: + https://github.com/vega/vega-datasets/releases + + Examples + -------- + Using ``polars``: + + from altair.datasets import Loader + + data = Loader.from_backend("polars") + source = data("stocks", tag="v2.10.0") + + >>> source.columns # doctest: +SKIP + ['symbol', 'date', 'price'] + + >>> source # doctest: +SKIP + shape: (560, 3) + ┌────────┬────────────┬────────┐ + │ symbol ┆ date ┆ price │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ f64 │ + ╞════════╪════════════╪════════╡ + │ MSFT ┆ Jan 1 2000 ┆ 39.81 │ + │ MSFT ┆ Feb 1 2000 ┆ 36.35 │ + │ MSFT ┆ Mar 1 2000 ┆ 43.22 │ + │ MSFT ┆ Apr 1 2000 ┆ 28.37 │ + │ MSFT ┆ May 1 2000 ┆ 25.45 │ + │ … ┆ … ┆ … │ + │ AAPL ┆ Nov 1 2009 ┆ 199.91 │ + │ AAPL ┆ Dec 1 2009 ┆ 210.73 │ + │ AAPL ┆ Jan 1 2010 ┆ 192.06 │ + │ AAPL ┆ Feb 1 2010 ┆ 204.62 │ + │ AAPL ┆ Mar 1 2010 ┆ 223.02 │ + └────────┴────────────┴────────┘ + + Using ``pandas``: + + data = Loader.from_backend("pandas") + source = data("stocks", tag="v2.10.0") + + >>> source.columns # doctest: +SKIP + Index(['symbol', 'date', 'price'], dtype='object') + + >>> source # doctest: +SKIP + symbol date price + 0 MSFT Jan 1 2000 39.81 + 1 MSFT Feb 1 2000 36.35 + 2 MSFT Mar 1 2000 43.22 + 3 MSFT Apr 1 2000 28.37 + 4 MSFT May 1 2000 25.45 + .. ... ... ... + 555 AAPL Nov 1 2009 199.91 + 556 AAPL Dec 1 2009 210.73 + 557 AAPL Jan 1 2010 192.06 + 558 AAPL Feb 1 2010 204.62 + 559 AAPL Mar 1 2010 223.02 + + [560 rows x 3 columns] + + Using ``pyarrow``: + + data = Loader.from_backend("pyarrow") + source = data("stocks", tag="v2.10.0") + + >>> source.column_names # doctest: +SKIP + ['symbol', 'date', 'price'] + + >>> source # doctest: +SKIP + pyarrow.Table + symbol: string + date: string + price: double + ---- + symbol: [["MSFT","MSFT","MSFT","MSFT","MSFT",...,"AAPL","AAPL","AAPL","AAPL","AAPL"]] + date: [["Jan 1 2000","Feb 1 2000","Mar 1 2000","Apr 1 2000","May 1 2000",...,"Nov 1 2009","Dec 1 2009","Jan 1 2010","Feb 1 2010","Mar 1 2010"]] + price: [[39.81,36.35,43.22,28.37,25.45,...,199.91,210.73,192.06,204.62,223.02]] + """ + return self._reader.dataset(name, suffix, tag=tag, **kwds) + + def url( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, + ) -> str: + """ + Return the address of a remote dataset. + + Parameters + ---------- + name + Name of the dataset/`Path.stem`_. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + tag + Version identifier for a `vega-datasets release`_. + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. _vega-datasets release: + https://github.com/vega/vega-datasets/releases + + Examples + -------- + The returned url will always point to an accessible dataset: + + import altair as alt + from altair.datasets import Loader + + data = Loader.from_backend("polars") + >>> data.url("cars", tag="v2.9.0") # doctest: +SKIP + 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json' + + We can pass the result directly to a chart: + + url = data.url("cars", tag="v2.9.0") + alt.Chart(url).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q") + """ + return self._reader.url(name, suffix, tag=tag) + + @property + def cache_dir(self) -> Path | None: + """ + Returns path to datasets cache. 
+ + By default, this can be configured using the environment variable: + + "ALTAIR_DATASETS_DIR" + + You *may* also set this directly, but the value will **not** persist between sessions: + + from pathlib import Path + + from altair.datasets import Loader + + data = Loader.from_backend("polars") + data.cache_dir = Path.home() / ".altair_cache" + + >>> data.cache_dir.relative_to(Path.home()).as_posix() # doctest: +SKIP + '.altair_cache' + """ + return self._reader._cache + + @cache_dir.setter + def cache_dir(self, source: StrPath, /) -> None: + import os + + os.environ[self._reader._ENV_VAR] = str(source) + + def __repr__(self) -> str: + return f"{type(self).__name__}[{self._reader._name}]" + + +@final +class _Load(Loader[IntoDataFrameT, IntoFrameT]): + @overload + def __call__( # pyright: ignore[reportOverlappingOverload] + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: None = ..., + **kwds: Any, + ) -> IntoDataFrameT: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["polars", "polars[pyarrow]"] = ..., + **kwds: Any, + ) -> pl.DataFrame: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["pandas", "pandas[pyarrow]"] = ..., + **kwds: Any, + ) -> pd.DataFrame: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["pyarrow"] = ..., + **kwds: Any, + ) -> pa.Table: ... + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, + backend: _Backend | None = None, + **kwds: Any, + ) -> IntoDataFrameT | pl.DataFrame | pd.DataFrame | pa.Table: + if backend is None: + return super().__call__(name, suffix, tag, **kwds) + else: + return self.from_backend(backend)(name, suffix, tag=tag, **kwds) + + +load: _Load[Any, Any] + + +def __getattr__(name): + if name == "load": + from altair.datasets._readers import infer_backend + + reader = infer_backend() + global load + load = _Load.__new__(_Load) + load._reader = reader + return load + else: + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 3d986ec75..6de691ff2 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -141,11 +141,11 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: priority: Sequence[_Backend] = "polars", "pandas[pyarrow]", "pandas", "pyarrow" """ - import altair.datasets + import altair.datasets._loader from altair.datasets import load assert load._reader._name == "polars" - monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load", raising=False) monkeypatch.setitem(sys.modules, "polars", None) @@ -154,20 +154,20 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: if find_spec("pyarrow") is None: # NOTE: We can end the test early for the CI job that removes `pyarrow` assert load._reader._name == "pandas" - monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pandas", None) with pytest.raises(NotImplementedError, match="no.+backend"): from altair.datasets import load else: assert load._reader._name == "pandas[pyarrow]" - 
monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pyarrow", None) from altair.datasets import load assert load._reader._name == "pandas" - monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pandas", None) monkeypatch.delitem(sys.modules, "pyarrow") @@ -175,7 +175,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: from altair.datasets import load assert load._reader._name == "pyarrow" - monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pyarrow", None) with pytest.raises(NotImplementedError, match="no.+backend"): @@ -184,11 +184,11 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: @requires_pyarrow def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: - import altair.datasets + import altair.datasets._loader - monkeypatch.delattr(altair.datasets, "load", raising=False) + monkeypatch.delattr(altair.datasets._loader, "load", raising=False) + from altair.datasets import load - load = altair.datasets.load assert load._reader._name == "polars" default = load("cars") @@ -204,6 +204,47 @@ def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: assert is_polars_dataframe(df_polars) +@pytest.mark.parametrize( + "name", + [ + "jobs", + "la-riots", + "londonBoroughs", + "londonCentroids", + "londonTubeLines", + "lookup_groups", + "lookup_people", + "miserables", + "monarchs", + "movies", + "normal-2d", + "obesity", + "ohlc", + "penguins", + "platformer-terrain", + "points", + "political-contributions", + "population", + "population_engineers_hurricanes", + "seattle-temps", + "seattle-weather", + "seattle-weather-hourly-normals", + "sf-temps", + "sp500", + "sp500-2000", + "stocks", + "udistrict", + ], +) +def test_url(name: Dataset) -> None: + from altair.datasets import url + + pattern = re.compile(rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{name}\..+") + result = url(name) + assert isinstance(result, str) + assert pattern.match(result) is not None + + @backends def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv(CACHE_ENV_VAR, raising=False) From 0817ff8503f728a4bc0c8d160abaab311f829fd7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 20 Nov 2024 21:46:22 +0000 Subject: [PATCH 113/201] feat: Support `url(...)` without dependencies https://github.com/vega/altair/pull/3631#discussion_r1846662053, https://github.com/vega/altair/pull/3631#issuecomment-2488621316, https://github.com/vega/altair/pull/3631#issuecomment-2481977891 --- altair/datasets/__init__.py | 13 ++++- altair/datasets/_loader.py | 77 +++++++++++++++++++++++++-- altair/datasets/_metadata/url.csv.gz | Bin 0 -> 855 bytes altair/datasets/_readers.py | 5 +- tests/test_datasets.py | 70 +++++++++++++++++++----- tools/datasets/__init__.py | 23 ++++++++ 6 files changed, 168 insertions(+), 20 deletions(-) create mode 100644 altair/datasets/_metadata/url.csv.gz diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index ac7ac9f06..e426ca467 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -78,9 +78,18 @@ def url( - https://github.com/vega/altair/discussions/3150#discussioncomment-11280516 - https://github.com/vega/altair/pull/3631#discussion_r1846662053 """ - from altair.datasets._loader import load + from 
altair.datasets._readers import AltairDatasetsError - return load.url(name, suffix, tag=tag) + try: + from altair.datasets._loader import load + + url = load.url(name, suffix, tag=tag) + except AltairDatasetsError: + from altair.datasets._loader import url_cache + + url = url_cache[name] + + return url def __getattr__(name): diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 3c2a0ee21..5d8c1ec8b 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Generic, final, overload +from pathlib import Path +from typing import TYPE_CHECKING, Generic, TypeVar, final, get_args, overload from narwhals.typing import IntoDataFrameT, IntoFrameT @@ -8,8 +9,8 @@ if TYPE_CHECKING: import sys - from pathlib import Path - from typing import Any, Literal + from collections.abc import MutableMapping + from typing import Any, Final, Literal import pandas as pd import polars as pl @@ -23,8 +24,15 @@ from altair.datasets._readers import _Backend from altair.datasets._typing import Dataset, Extension, Version + __all__ = ["Loader", "load"] +_KT = TypeVar("_KT") +_VT = TypeVar("_VT") +_T = TypeVar("_T") + +_URL: Final[Path] = Path(__file__).parent / "_metadata" / "url.csv.gz" + class Loader(Generic[IntoDataFrameT, IntoFrameT]): """ @@ -377,6 +385,69 @@ def __call__( return self.from_backend(backend)(name, suffix, tag=tag, **kwds) +class UrlCache(Generic[_KT, _VT]): + """ + `csv`_, `gzip`_ -based, lazy url lookup. + + Operates on a subset of available datasets: + - Only the latest version + - Excludes `.parquet`, which `cannot be read via url`_ + - Name collisions are pre-resolved + - Only provide the smallest (e.g. ``weather.json`` instead of ``weather.csv``) + + .. _csv: + https://docs.python.org/3/library/csv.html + .. _gzip: + https://docs.python.org/3/library/gzip.html + .. _cannot be read via url: + https://github.com/vega/vega/issues/3961 + """ + + def __init__( + self, + fp: Path, + /, + *, + columns: tuple[str, str] = ("dataset_name", "url_npm"), + tp: type[MutableMapping[_KT, _VT]] = dict["_KT", "_VT"], + ) -> None: + self.fp: Path = fp + self.columns: tuple[str, str] = columns + self._mapping: MutableMapping[_KT, _VT] = tp() + + def read(self) -> Any: + import csv + import gzip + + with gzip.open(self.fp, mode="rb") as f: + b_lines = f.readlines() + reader = csv.reader((bs.decode() for bs in b_lines), dialect=csv.unix_dialect) + header = tuple(next(reader)) + if header != self.columns: + msg = f"Expected header to match {self.columns!r},\n" f"but got: {header!r}" + raise ValueError(msg) + return dict(reader) + + def __getitem__(self, key: _KT, /) -> _VT: + if url := self.get(key, None): + return url + + from altair.datasets._typing import Dataset + + if key in get_args(Dataset): + msg = f"{key!r} cannot be loaded via url." + raise TypeError(msg) + else: + msg = f"{key!r} does not refer to a known dataset." 
+ raise TypeError(msg) + + def get(self, key: _KT, default: _T) -> _VT | _T: + if not self._mapping: + self._mapping.update(self.read()) + return self._mapping.get(key, default) + + +url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL) load: _Load[Any, Any] diff --git a/altair/datasets/_metadata/url.csv.gz b/altair/datasets/_metadata/url.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..3580606d7cca77cefee4c5bd2b48134f9fac22d9 GIT binary patch literal 855 zcmV-d1E~BTiwFn-B0gsV|8;U~E@N|c0Ik?dQ`;~Q0N^{n1vw!zOlNxPwAUVc&&ZNu zi^y7452x_!tCyXSP&%W{2?*?`_3GgjoO3H3_r`tQY#7(w zi{nDc*>+m^P5g_^ECxz=iFM!RUHA0VZ8zzIO$zRe9v-N)2CR3@(gJkM%@0)TKov1o zFhp|ilo$y*!j8ez3xrvK!u8ZD@!E`)@JdO`owxER+G}`W zEtvIEBdio&C`N62QYpAHupLh2qjt z=LMpUtB{|STRBTTv}+~4Bqyl#OjpT<8rlR6jb?__SlKys|TI+ysb(2Ji^3oO1m3l7I%_CtIcgP|{!TI~FZ z5nzH6z(o`Ca%eSoP~I-#4E#o3^u+CDCVsCkDHGIC#d&IkW>6RrX~Y| zRj;Hh`SzhdXFnSGUPBezJa4zDciy(MD{&TaSaCeCBciT3JWC;7FH^hF-VLupS%yK! z7D>VD6yKbLG7HYdW|IepyP<#1-VS}2fjXZmq-8pJFh~EHsEMX~-qgJq{nRH8>u8dgNo+|G_y6nVA$Qi?D~SmD^hzYb99B`0-a+w4v7E$To`#? hPGA)$PnhL%CQ6!b(lFqNy}1B6!M}FHK(GfF008u|rd9v| literal 0 HcmV?d00001 diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 953401bae..e93fb55e1 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -83,6 +83,9 @@ _METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" +class AltairDatasetsError(Exception): ... + + class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): """ Describes basic IO for remote & local tabular resources. @@ -502,7 +505,7 @@ def infer_backend( if reader := next(it, None): return reader msg = f"Found no supported backend, searched:\n" f"{priority!r}" - raise NotImplementedError(msg) + raise AltairDatasetsError(msg) @overload diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 6de691ff2..e5d1f1d3f 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,5 +1,6 @@ from __future__ import annotations +import contextlib import datetime as dt import re import sys @@ -18,8 +19,8 @@ ) from narwhals.stable import v1 as nw -from altair.datasets import Loader -from altair.datasets._readers import _METADATA +from altair.datasets import Loader, url +from altair.datasets._readers import _METADATA, AltairDatasetsError from altair.datasets._typing import Dataset, Extension, Metadata, Version from tests import skip_requires_pyarrow, slow @@ -115,6 +116,13 @@ def metadata_columns() -> frozenset[str]: ) +def match_url(name: Dataset, url: str) -> bool: + return ( + re.match(rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{name}\..+", url) + is not None + ) + + @backends def test_loader_from_backend(backend: _Backend) -> None: data = Loader.from_backend(backend) @@ -124,13 +132,8 @@ def test_loader_from_backend(backend: _Backend) -> None: @backends def test_loader_url(backend: _Backend) -> None: data = Loader.from_backend(backend) - dataset_name = "volcano" - pattern = re.compile( - rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{dataset_name}\..+" - ) - url = data.url(dataset_name) - assert isinstance(url, str) - assert pattern.match(url) is not None + dataset_name: Dataset = "volcano" + assert match_url(dataset_name, data.url(dataset_name)) def test_load(monkeypatch: pytest.MonkeyPatch) -> None: @@ -178,7 +181,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pyarrow", None) - with pytest.raises(NotImplementedError, match="no.+backend"): + 
with pytest.raises(AltairDatasetsError, match="no.+backend"): from altair.datasets import load @@ -239,10 +242,49 @@ def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: def test_url(name: Dataset) -> None: from altair.datasets import url - pattern = re.compile(rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{name}\..+") - result = url(name) - assert isinstance(result, str) - assert pattern.match(result) is not None + assert match_url(name, url(name)) + + +def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: + import altair.datasets + from altair.datasets._loader import url_cache + + monkeypatch.setitem(sys.modules, "polars", None) + monkeypatch.setitem(sys.modules, "pandas", None) + monkeypatch.setitem(sys.modules, "pyarrow", None) + + assert url_cache._mapping == {} + + with contextlib.suppress(AltairDatasetsError): + monkeypatch.delattr(altair.datasets._loader, "load", raising=False) + with pytest.raises(AltairDatasetsError): + from altair.datasets import load as load + + assert match_url("jobs", url("jobs")) + + assert url_cache._mapping != {} + + assert match_url("cars", url("cars")) + assert match_url("stocks", url("stocks")) + assert match_url("countries", url("countries")) + assert match_url("crimea", url("crimea")) + assert match_url("disasters", url("disasters")) + assert match_url("driving", url("driving")) + assert match_url("earthquakes", url("earthquakes")) + assert match_url("flare", url("flare")) + assert match_url("flights-10k", url("flights-10k")) + assert match_url("flights-200k", url("flights-200k")) + + with pytest.raises(TypeError, match="cannot be loaded via url"): + url("climate") + + with pytest.raises(TypeError, match="cannot be loaded via url"): + url("flights-3m") + + with pytest.raises( + TypeError, match="'fake data' does not refer to a known dataset" + ): + url("fake data") @backends diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 3702028ac..ae4d0b583 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -15,12 +15,15 @@ from __future__ import annotations +import gzip import json import types +from io import BytesIO from pathlib import Path from typing import TYPE_CHECKING, Any, Literal import polars as pl +from polars import col from tools.codemod import ruff from tools.datasets.github import GitHub @@ -107,6 +110,7 @@ def __init__( } ) self._fp_typing: Path = out_fp_typing + self._fp_url: Path = out_dir_altair / "url.csv.gz" @property def github(self) -> GitHub: @@ -135,6 +139,14 @@ def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: gh_trees = self.github.refresh_trees(gh_tags) self.write_parquet(gh_trees, self._paths["gh_trees"]) + npm_urls_min = ( + gh_trees.lazy() + .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") + .filter(col("size") == col("size").min().over("dataset_name")) + .select("dataset_name", "url_npm") + ) + self.write_csv_gzip(npm_urls_min, self._fp_url) + if include_typing: self.generate_typing(self._fp_typing) return gh_trees @@ -159,6 +171,17 @@ def _from_alias(self, name: _PathAlias, /) -> Path: else: return self._paths[name] + def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: + if fp.suffix != ".gz": + fp = fp.with_suffix(".csv.gz") + if not fp.exists(): + fp.touch() + df = frame.lazy().collect() + buf = BytesIO() + with gzip.open(fp, mode="wb") as f: + df.write_csv(buf) + f.write(buf.getbuffer()) + def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: """Write ``frame`` 
to ``fp``, with some extra safety.""" if not fp.exists(): From e01fdd727b2bbfa389e995d126506d647d60ea9f Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 20 Nov 2024 21:52:32 +0000 Subject: [PATCH 114/201] fix(DRAFT): Don't generate csv on refresh https://github.com/vega/altair/actions/runs/11942284568/job/33288974210?pr=3631 --- altair/datasets/_metadata/url.csv.gz | Bin 855 -> 855 bytes tools/datasets/__init__.py | 21 +++++++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/altair/datasets/_metadata/url.csv.gz b/altair/datasets/_metadata/url.csv.gz index 3580606d7cca77cefee4c5bd2b48134f9fac22d9..49a227404cc162e9177aee307c297cfffd4869a1 100644 GIT binary patch delta 15 Wcmcc4cAbq)zMF$1Dsm%RATt0aRs=Z! delta 15 Wcmcc4cAbq)zMF%CQE4MvATt0XEd!|l diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index ae4d0b583..398c06f84 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -120,7 +120,9 @@ def github(self) -> GitHub: def npm(self) -> Npm: return self._npm - def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: + def refresh( + self, *, include_typing: bool = False, include_csv: bool = False + ) -> pl.DataFrame: """ Update and sync all dataset metadata files. @@ -139,13 +141,16 @@ def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: gh_trees = self.github.refresh_trees(gh_tags) self.write_parquet(gh_trees, self._paths["gh_trees"]) - npm_urls_min = ( - gh_trees.lazy() - .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") - .filter(col("size") == col("size").min().over("dataset_name")) - .select("dataset_name", "url_npm") - ) - self.write_csv_gzip(npm_urls_min, self._fp_url) + if include_csv: + # BUG: Non-deterministic + # https://github.com/vega/altair/actions/runs/11942284568/job/33288974210?pr=3631 + npm_urls_min = ( + gh_trees.lazy() + .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") + .filter(col("size") == col("size").min().over("dataset_name")) + .select("dataset_name", "url_npm") + ) + self.write_csv_gzip(npm_urls_min, self._fp_url) if include_typing: self.generate_typing(self._fp_typing) From 0c5195e92d428033b311b784b30c69f5ebeac6ee Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 20 Nov 2024 21:57:19 +0000 Subject: [PATCH 115/201] test: Replace rogue `NotImplementedError` https://github.com/vega/altair/actions/runs/11942364658/job/33289235198?pr=3631 --- tests/test_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index e5d1f1d3f..a4bbe40c4 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -159,7 +159,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: assert load._reader._name == "pandas" monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pandas", None) - with pytest.raises(NotImplementedError, match="no.+backend"): + with pytest.raises(AltairDatasetsError, match="no.+backend"): from altair.datasets import load else: assert load._reader._name == "pandas[pyarrow]" From 5595d905c29a89d6388b12b46caa016e9cd91d27 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 21 Nov 2024 10:50:12 +0000 Subject: [PATCH 116/201] fix: Omit `.gz` last modification time header Previously was creating a diff on every refresh, since the current time updated. 
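A standalone sketch (not part of this patch) of why pinning ``mtime`` makes the compressed bytes reproducible:

    import gzip
    from io import BytesIO

    def compress_deterministic(data: bytes) -> bytes:
        buf = BytesIO()
        # mtime=0 fixes the 4-byte MTIME field in the gzip header, so the
        # same input always yields byte-identical output (no diff on refresh).
        with gzip.GzipFile(fileobj=buf, mode="wb", mtime=0) as f:
            f.write(data)
        return buf.getvalue()

    assert compress_deterministic(b"a,b\n1,2\n") == compress_deterministic(b"a,b\n1,2\n")
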
https://docs.python.org/3/library/gzip.html#gzip.GzipFile.mtime https://github.com/vega/altair/actions/runs/11942284568/job/33288974210?pr=3631 --- altair/datasets/_metadata/url.csv.gz | Bin 855 -> 855 bytes tools/datasets/__init__.py | 23 +++++++++-------------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/altair/datasets/_metadata/url.csv.gz b/altair/datasets/_metadata/url.csv.gz index 49a227404cc162e9177aee307c297cfffd4869a1..07cb52ec1c834808609b204ed2ffe0b4cd83f62e 100644 GIT binary patch delta 17 Xcmcc4cAbqwzMF%C0SGp7_%j0lCyxV& delta 17 Ycmcc4cAbqwzMF$1D$*`}BZogT05C-a-~a#s diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 398c06f84..a3690f65f 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -120,9 +120,7 @@ def github(self) -> GitHub: def npm(self) -> Npm: return self._npm - def refresh( - self, *, include_typing: bool = False, include_csv: bool = False - ) -> pl.DataFrame: + def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: """ Update and sync all dataset metadata files. @@ -141,16 +139,13 @@ def refresh( gh_trees = self.github.refresh_trees(gh_tags) self.write_parquet(gh_trees, self._paths["gh_trees"]) - if include_csv: - # BUG: Non-deterministic - # https://github.com/vega/altair/actions/runs/11942284568/job/33288974210?pr=3631 - npm_urls_min = ( - gh_trees.lazy() - .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") - .filter(col("size") == col("size").min().over("dataset_name")) - .select("dataset_name", "url_npm") - ) - self.write_csv_gzip(npm_urls_min, self._fp_url) + npm_urls_min = ( + gh_trees.lazy() + .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") + .filter(col("size") == col("size").min().over("dataset_name")) + .select("dataset_name", "url_npm") + ) + self.write_csv_gzip(npm_urls_min, self._fp_url) if include_typing: self.generate_typing(self._fp_typing) @@ -183,7 +178,7 @@ def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> Non fp.touch() df = frame.lazy().collect() buf = BytesIO() - with gzip.open(fp, mode="wb") as f: + with gzip.GzipFile(fp, mode="wb", mtime=0) as f: df.write_csv(buf) f.write(buf.getbuffer()) From 9f621519ac4eb84e506632d81e6b794e55eee00c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 21 Nov 2024 11:01:02 +0000 Subject: [PATCH 117/201] docs: Add doc for `Application.write_csv_gzip` --- tools/datasets/__init__.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index a3690f65f..26955e9c0 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -172,6 +172,17 @@ def _from_alias(self, name: _PathAlias, /) -> Path: return self._paths[name] def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: + """ + Write ``frame`` as a `gzip`_ compressed `csv`_ file. + + - *Much smaller* than a regular ``.csv``. + - Still readable using ``stdlib`` modules. + + .. _gzip: + https://docs.python.org/3/library/gzip.html + .. 
_csv: + https://docs.python.org/3/library/csv.html + """ if fp.suffix != ".gz": fp = fp.with_suffix(".csv.gz") if not fp.exists(): From 1bd455206d5898800ae87d7c22cafba05c9c012e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 21 Nov 2024 12:34:02 +0000 Subject: [PATCH 118/201] revert: Remove `"polars[pyarrow]" backend Partially related to https://github.com/vega/altair/pull/3631#issuecomment-2484826592 After some thought, this backend didn't add support for any unique dependency configs. I've only ever used `use_pyarrow=True` for `pl.DataFrame.write_parquet` to resolve an issue with invalid headers in `"polars<1.0.0;>=0.19.0"` --- altair/datasets/_loader.py | 5 ++--- altair/datasets/_readers.py | 32 +++----------------------------- tests/test_datasets.py | 5 ++--- 3 files changed, 7 insertions(+), 35 deletions(-) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 5d8c1ec8b..3e31aea2e 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -55,7 +55,7 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): @overload @classmethod def from_backend( - cls, backend_name: Literal["polars", "polars[pyarrow]"], / + cls, backend_name: Literal["polars"], / ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... @overload @@ -81,7 +81,6 @@ def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: DataFrame package/config used to return data. * *polars*: Using `polars defaults`_ - * *polars[pyarrow]*: Using ``use_pyarrow=True`` * *pandas*: Using `pandas defaults`_. * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` * *pyarrow*: (*Experimental*) @@ -347,7 +346,7 @@ def __call__( suffix: Extension | None = ..., /, tag: Version | None = ..., - backend: Literal["polars", "polars[pyarrow]"] = ..., + backend: Literal["polars"] = ..., **kwds: Any, ) -> pl.DataFrame: ... @overload diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index e93fb55e1..f7b8aecf5 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -73,9 +73,8 @@ _Pandas: TypeAlias = Literal["pandas"] _PyArrow: TypeAlias = Literal["pyarrow"] _ConcreteT = TypeVar("_ConcreteT", _Polars, _Pandas, _PyArrow) - _PolarsAny: TypeAlias = Literal[_Polars, "polars[pyarrow]"] _PandasAny: TypeAlias = Literal[_Pandas, "pandas[pyarrow]"] - _Backend: TypeAlias = Literal[_PolarsAny, _PandasAny, _PyArrow] + _Backend: TypeAlias = Literal[_Polars, _PandasAny, _PyArrow] __all__ = ["backend"] @@ -332,25 +331,6 @@ def __init__(self, name: _Polars, /) -> None: self._scan_fn = {".parquet": pl.scan_parquet} -class _PolarsPyArrowReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: - _pl, _pa = _requirements(name) - self._name = name - if not TYPE_CHECKING: - pl = self._import(_pl) - pa = self._import(_pa) # noqa: F841 - self._read_fn = { - ".csv": partial(pl.read_csv, use_pyarrow=True, try_parse_dates=True), - ".json": _pl_read_json_roundtrip, - ".tsv": partial( - pl.read_csv, separator="\t", use_pyarrow=True, try_parse_dates=True - ), - ".arrow": partial(pl.read_ipc, use_pyarrow=True), - ".parquet": partial(pl.read_parquet, use_pyarrow=True), - } - self._scan_fn = {".parquet": pl.scan_parquet} - - class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): """ Reader backed by `pyarrow.Table`_. @@ -509,7 +489,7 @@ def infer_backend( @overload -def backend(name: _PolarsAny, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... 
+def backend(name: _Polars, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... @overload @@ -524,8 +504,6 @@ def backend(name: _Backend, /) -> _Reader[Any, Any]: """Reader initialization dispatcher.""" if name == "polars": return _PolarsReader(name) - elif name == "polars[pyarrow]": - return _PolarsPyArrowReader(name) elif name == "pandas[pyarrow]": return _PandasPyArrowReader(name) elif name == "pandas": @@ -548,10 +526,6 @@ def _requirements(s: _ConcreteT, /) -> _ConcreteT: ... def _requirements(s: Literal["pandas[pyarrow]"], /) -> tuple[_Pandas, _PyArrow]: ... -@overload -def _requirements(s: Literal["polars[pyarrow]"], /) -> tuple[_Polars, _PyArrow]: ... - - def _requirements(s: _Backend, /): concrete: set[Literal[_Polars, _Pandas, _PyArrow]] = {"polars", "pandas", "pyarrow"} if s in concrete: @@ -560,7 +534,7 @@ def _requirements(s: _Backend, /): from packaging.requirements import Requirement req = Requirement(s) - supports_extras: set[Literal[_Polars, _Pandas]] = {"polars", "pandas"} + supports_extras: set[Literal[_Pandas]] = {"pandas"} if req.name in supports_extras: name = req.name if (extras := req.extras) and extras == {"pyarrow"}: diff --git a/tests/test_datasets.py b/tests/test_datasets.py index a4bbe40c4..e31f7990e 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -68,7 +68,6 @@ class DatasetSpec(TypedDict, total=False): ), ), ), - pytest.param("polars[pyarrow]", marks=requires_pyarrow), pytest.param("pandas[pyarrow]", marks=requires_pyarrow), pytest.param("pyarrow", marks=requires_pyarrow), ], @@ -302,7 +301,7 @@ def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None def test_missing_dependency_single( backend: _Backend, monkeypatch: pytest.MonkeyPatch ) -> None: - if backend in {"polars[pyarrow]", "pandas[pyarrow]"}: + if backend == "pandas[pyarrow]": pytest.skip("Testing single dependency backends only") monkeypatch.setitem(sys.modules, backend, None) @@ -317,7 +316,7 @@ def test_missing_dependency_single( Loader.from_backend(backend) -@pytest.mark.parametrize("backend", ["polars[pyarrow]", "pandas[pyarrow]"]) +@pytest.mark.parametrize("backend", ["pandas[pyarrow]"]) @skip_requires_pyarrow def test_missing_dependency_multi( backend: _Backend, monkeypatch: pytest.MonkeyPatch From 11da9c8f584e466a02a021ef8e93b895145fb333 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 21 Nov 2024 12:41:10 +0000 Subject: [PATCH 119/201] test: Add a complex `xfail` for `test_load_call` Doesn't happen in CI, still unclear why the import within `pandas` breaks under these conditions. 
Have tried multiple combinations of `pytest.MonkeyPatch`, hard imports, but had no luck in fixing the bug --- tests/test_datasets.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index e31f7990e..50ece0a26 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -86,6 +86,19 @@ class DatasetSpec(TypedDict, total=False): """ +@pytest.fixture +def is_flaky_datasets(request: pytest.FixtureRequest) -> bool: + mark_filter = request.config.getoption("-m", None) # pyright: ignore[reportArgumentType] + if mark_filter is None: + return False + elif mark_filter == "": + return True + elif isinstance(mark_filter, str): + return False + else: + raise TypeError(mark_filter) + + @pytest.fixture(scope="session") def polars_loader( tmp_path_factory: pytest.TempPathFactory, @@ -184,6 +197,20 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: from altair.datasets import load +# HACK: Using a fixture to get a command line option +# https://docs.pytest.org/en/stable/example/simple.html#pass-different-values-to-a-test-function-depending-on-command-line-options +@pytest.mark.xfail( + is_flaky_datasets, # type: ignore + reason=( + "'pandas[pyarrow]' seems to break locally when running:\n" + ">>> pytest -p no:randomly -n logical tests -k test_datasets -m ''\n\n" + "Possibly related:\n" + " https://github.com/modin-project/modin/issues/951\n" + " https://github.com/pandas-dev/pandas/blob/1c986d6213904fd7d9acc5622dc91d029d3f1218/pandas/io/parquet.py#L164\n" + " https://github.com/pandas-dev/pandas/blob/1c986d6213904fd7d9acc5622dc91d029d3f1218/pandas/io/parquet.py#L257\n" + ), + raises=AttributeError, +) @requires_pyarrow def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: import altair.datasets._loader From 694ada0ad496ecd0e07f49ff97e0c5c0753a6085 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:46:48 +0000 Subject: [PATCH 120/201] refactor: Renaming/recomposing `_readers.py` The next commits benefit from having functionality decoupled from `_Reader.query`. Mainly, keeping things lazy and not raising a user-facing error --- altair/datasets/_readers.py | 68 +++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index f7b8aecf5..2c8d53820 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -69,6 +69,13 @@ _ExtensionScan: TypeAlias = Literal[".parquet"] _T = TypeVar("_T") + # NOTE: Using a constrained instead of bound `TypeVar` + # error: Incompatible return value type (got "DataFrame[Any] | LazyFrame[Any]", expected "FrameT") [return-value] + # - https://typing.readthedocs.io/en/latest/spec/generics.html#introduction + # - https://typing.readthedocs.io/en/latest/spec/generics.html#type-variables-with-an-upper-bound + # https://github.com/narwhals-dev/narwhals/blob/21b8436567de3631c584ef67632317ad70ae5de0/narwhals/typing.py#L59 + FrameT = TypeVar("FrameT", nw.DataFrame[Any], nw.LazyFrame) + _Polars: TypeAlias = Literal["polars"] _Pandas: TypeAlias = Literal["pandas"] _PyArrow: TypeAlias = Literal["pyarrow"] @@ -111,7 +118,7 @@ class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): Used exclusively for ``metadata.parquet``. - Currently ``polars`` backends are the only lazy options. + Currently ``"polars"`` is the only lazy option. 
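As an aside on the constrained ``TypeVar`` added above in this diff, the two flavours behave roughly as follows; this is an illustrative sketch, not part of the patch::

    from typing import Any, TypeVar

    import narwhals.stable.v1 as nw

    # constrained: FrameT resolves to exactly one of the listed types, so a
    # function returning the same kind of frame it was given type-checks cleanly
    FrameT = TypeVar("FrameT", nw.DataFrame[Any], nw.LazyFrame)

    # bound: the return expression is typed as the whole union, which is what
    # triggers the "DataFrame[Any] | LazyFrame[Any]" error quoted in the comment
    FrameBoundT = TypeVar("FrameBoundT", bound="nw.DataFrame[Any] | nw.LazyFrame")
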
""" _name: LiteralString @@ -125,12 +132,10 @@ class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: - suffix = validate_suffix(source, is_ext_read) - return self._read_fn[suffix] + return self._read_fn[_extract_suffix(source, is_ext_read)] def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: - suffix = validate_suffix(source, is_ext_scan) - return self._scan_fn[suffix] + return self._scan_fn[_extract_suffix(source, is_ext_scan)] def dataset( self, @@ -140,7 +145,7 @@ def dataset( tag: Version | None = None, **kwds: Any, ) -> IntoDataFrameT: - df = self.query(**validate_constraints(name, suffix, tag)) + df = self.query(**_extract_constraints(name, suffix, tag)) it = islice(df.iter_rows(named=True), 1) result = cast("Metadata", next(it)) url = result["url_npm"] @@ -166,7 +171,7 @@ def url( /, tag: Version | None = None, ) -> str: - frame = self.query(**validate_constraints(name, suffix, tag)) + frame = self.query(**_extract_constraints(name, suffix, tag)) url = nw.to_py_scalar(frame.item(0, "url_npm")) if isinstance(url, str): return url @@ -180,6 +185,8 @@ def query( """ Query multi-version trees metadata. + Applies a filter, erroring out when no results would be returned. + Notes ----- Arguments correspond to those seen in `pl.LazyFrame.filter`_. @@ -187,12 +194,7 @@ def query( .. _pl.LazyFrame.filter: https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html """ - frame = ( - nw.from_native(self.scan_fn(_METADATA)(_METADATA)) - .filter(_parse_predicates_constraints(predicates, constraints)) - .lazy() - .collect() - ) + frame = self._scan_metadata(*predicates, **constraints).collect() if not frame.is_empty(): return frame else: @@ -200,18 +202,13 @@ def query( msg = f"Found no results for:\n {terms}" raise ValueError(msg) - def _read_metadata(self) -> IntoDataFrameT: - """ - Return the full contents of ``metadata.parquet``. - - Effectively an eager read, no filters. - """ - return ( - nw.from_native(self.scan_fn(_METADATA)(_METADATA)) - .lazy() - .collect() - .to_native() - ) + def _scan_metadata( + self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] + ) -> nw.LazyFrame: + frame = nw.from_native(self.scan_fn(_METADATA)(_METADATA)).lazy() + if predicates or constraints: + return _filter(frame, *predicates, **constraints) + return frame @property def _cache(self) -> Path | None: # type: ignore[return] @@ -406,24 +403,30 @@ def pa_read_json(source: Any, /, **kwds) -> pa.Table: self._scan_fn = {".parquet": pa_read_parquet} -def _parse_predicates_constraints( - predicates: tuple[Any, ...], constraints: Metadata, / -) -> nw.Expr: +def _filter( + frame: FrameT, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] +) -> FrameT: """ ``narwhals`` only accepts ``filter(*predicates)``. 
So we convert each item in ``**constraints`` here as:: col("column_name") == literal_value + + - https://github.com/narwhals-dev/narwhals/issues/1383 + - https://github.com/narwhals-dev/narwhals/pull/1417 """ - return nw.all_horizontal( - chain(predicates, (nw.col(name) == v for name, v in constraints.items())) + return frame.filter( + nw.all_horizontal( + *chain(predicates, (nw.col(name) == v for name, v in constraints.items())) + ) ) -def validate_constraints( +def _extract_constraints( name: Dataset | LiteralString, suffix: Extension | None, tag: Version | None, / ) -> Metadata: + """Transform args into a mapping to column names.""" constraints: Metadata = {} if tag is not None: constraints["tag"] = tag @@ -445,7 +448,7 @@ def validate_constraints( return constraints -def validate_suffix(source: StrPath, guard: Callable[..., TypeIs[_T]], /) -> _T: +def _extract_suffix(source: StrPath, guard: Callable[..., TypeIs[_T]], /) -> _T: suffix: Any = Path(source).suffix if guard(suffix): return suffix @@ -479,7 +482,6 @@ def infer_backend( .. _fastparquet: https://github.com/dask/fastparquet - """ it = (backend(name) for name in priority if is_available(_requirements(name))) if reader := next(it, None): From 6f41c7e5b830bff1e901ecbe1fcec862f72c4683 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:49:46 +0000 Subject: [PATCH 121/201] build: Generate `VERSION_LATEST` Simplifies logic that relies on enum/categoricals that may not be recognised as ordered --- altair/datasets/_typing.py | 10 +++++++++- tools/datasets/__init__.py | 10 ++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index cdaa57322..0b681b834 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -24,6 +24,7 @@ __all__ = [ "EXTENSION_SUFFIXES", + "VERSION_LATEST", "Dataset", "Extension", "Metadata", @@ -154,7 +155,14 @@ "v1.5.0", ] Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow", ".parquet"] -EXTENSION_SUFFIXES = (".csv", ".json", ".tsv", ".arrow", ".parquet") +VERSION_LATEST: Literal["v2.11.0"] = "v2.11.0" +EXTENSION_SUFFIXES: tuple[ + Literal[".csv"], + Literal[".json"], + Literal[".tsv"], + Literal[".arrow"], + Literal[".parquet"], +] = (".csv", ".json", ".tsv", ".arrow", ".parquet") def is_ext_read(suffix: Any) -> TypeIs[Extension]: diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 26955e9c0..1402a9c7b 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -226,9 +226,14 @@ def generate_typing(self, output: Path, /) -> None: indent = " " * 4 NAME = "Dataset" TAG = "Version" + LATEST = "VERSION_LATEST" + LATEST_TAG = f"{tags.first()!r}" EXT = "Extension" EXTENSION_TYPES = ".csv", ".json", ".tsv", ".arrow", ".parquet" EXTENSION_SUFFIXES = "EXTENSION_SUFFIXES" + EXTENSION_TYPE_TP = ( + f"tuple[{', '.join(f'Literal[{el!r}]' for el in EXTENSION_TYPES)}]" + ) EXTENSION_GUARD = "is_ext_read" METADATA_TD = "Metadata" DESCRIPTION_DEFAULT = "_description_" @@ -318,11 +323,12 @@ def generate_typing(self, output: Path, /) -> None: utils.import_typing_extensions((3, 13), "TypeIs"), utils.import_typing_extensions((3, 10), "TypeAlias"), "\n", - f"__all__ = {[NAME, TAG, EXT, METADATA_TD, EXTENSION_GUARD, EXTENSION_SUFFIXES]}\n\n" + f"__all__ = {[NAME, TAG, EXT, METADATA_TD, EXTENSION_GUARD, EXTENSION_SUFFIXES, LATEST]}\n\n" f"{NAME}: TypeAlias = {utils.spell_literal(names)}", f"{TAG}: TypeAlias = 
{utils.spell_literal(tags)}", f"{EXT}: TypeAlias = {utils.spell_literal(EXTENSION_TYPES)}", - f"{EXTENSION_SUFFIXES} = {EXTENSION_TYPES!r}", + f"{LATEST}: Literal[{LATEST_TAG}] = {LATEST_TAG}", + f"{EXTENSION_SUFFIXES}: {EXTENSION_TYPE_TP} = {EXTENSION_TYPES!r}", f"def {EXTENSION_GUARD}(suffix: Any) -> TypeIs[{EXT}]:\n" f"{indent}return suffix in set({EXTENSION_TYPES!r})\n", UNIVERSAL_TYPED_DICT.format( From 88d06a64ac8a21350314b5300fbd7142d57e13cf Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:54:16 +0000 Subject: [PATCH 122/201] feat: Adds `_cache.py` for `UrlCache`, `DatasetCache` Docs to follow --- altair/datasets/__init__.py | 2 +- altair/datasets/_cache.py | 226 ++++++++++++++++++++++++++++++++++++ altair/datasets/_loader.py | 110 ++---------------- altair/datasets/_readers.py | 21 +--- tests/test_datasets.py | 75 +++++++++--- 5 files changed, 304 insertions(+), 130 deletions(-) create mode 100644 altair/datasets/_cache.py diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index e426ca467..70d01eacc 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -85,7 +85,7 @@ def url( url = load.url(name, suffix, tag=tag) except AltairDatasetsError: - from altair.datasets._loader import url_cache + from altair.datasets._cache import url_cache url = url_cache[name] diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py new file mode 100644 index 000000000..9239911fd --- /dev/null +++ b/altair/datasets/_cache.py @@ -0,0 +1,226 @@ +from __future__ import annotations + +import os +from pathlib import Path +from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, get_args + +import narwhals.stable.v1 as nw +from narwhals.dependencies import get_pyarrow +from narwhals.typing import IntoDataFrameT, IntoFrameT + +from altair.datasets._typing import VERSION_LATEST + +if TYPE_CHECKING: + import sys + from collections.abc import Iterator, MutableMapping + from typing import Any, Final + + from _typeshed import StrPath + + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + from altair.datasets._readers import _Reader + from altair.datasets._typing import Dataset + +__all__ = ["DatasetCache", "UrlCache", "url_cache"] + + +_KT = TypeVar("_KT") +_VT = TypeVar("_VT") +_T = TypeVar("_T") + +_URL: Final[Path] = Path(__file__).parent / "_metadata" / "url.csv.gz" + + +class UrlCache(Generic[_KT, _VT]): + """ + `csv`_, `gzip`_ -based, lazy url lookup. + + Operates on a subset of available datasets: + - Only the latest version + - Excludes `.parquet`, which `cannot be read via url`_ + - Name collisions are pre-resolved + - Only provide the smallest (e.g. ``weather.json`` instead of ``weather.csv``) + + .. _csv: + https://docs.python.org/3/library/csv.html + .. _gzip: + https://docs.python.org/3/library/gzip.html + .. 
_cannot be read via url: + https://github.com/vega/vega/issues/3961 + """ + + def __init__( + self, + fp: Path, + /, + *, + columns: tuple[str, str] = ("dataset_name", "url_npm"), + tp: type[MutableMapping[_KT, _VT]] = dict["_KT", "_VT"], + ) -> None: + self.fp: Path = fp + self.columns: tuple[str, str] = columns + self._mapping: MutableMapping[_KT, _VT] = tp() + + def read(self) -> Any: + import csv + import gzip + + with gzip.open(self.fp, mode="rb") as f: + b_lines = f.readlines() + reader = csv.reader((bs.decode() for bs in b_lines), dialect=csv.unix_dialect) + header = tuple(next(reader)) + if header != self.columns: + msg = f"Expected header to match {self.columns!r},\n" f"but got: {header!r}" + raise ValueError(msg) + return dict(reader) + + def __getitem__(self, key: _KT, /) -> _VT: + if url := self.get(key, None): + return url + + from altair.datasets._typing import Dataset + + if key in get_args(Dataset): + msg = f"{key!r} cannot be loaded via url." + raise TypeError(msg) + else: + msg = f"{key!r} does not refer to a known dataset." + raise TypeError(msg) + + def get(self, key: _KT, default: _T) -> _VT | _T: + if not self._mapping: + self._mapping.update(self.read()) + return self._mapping.get(key, default) + + +class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): + _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" + + def __init__(self, reader: _Reader[IntoDataFrameT, IntoFrameT], /) -> None: + self._rd: _Reader[IntoDataFrameT, IntoFrameT] = reader + + def download_all(self) -> None: + """ + Download any missing datasets for latest version. + + ``v2.11.0`` stats + ----------------- + - **66** items + - **27.8** MB + - Only 1 file > 2 MB + """ + stems = tuple(fp.stem for fp in self) + latest = nw.col("tag") == nw.lit(VERSION_LATEST) + predicates = (~(nw.col("sha").is_in(stems)), latest) if stems else (latest,) + frame = ( + self._rd._scan_metadata( + *predicates, ext_supported=True, name_collision=False + ) + .select("sha", "suffix", "url_npm") + .unique("sha") + .collect() + ) + if frame.is_empty(): + print("Already downloaded all datasets") + return None + print(f"Downloading {len(frame)} missing datasets...") + for row in frame.iter_rows(named=True): + fp: Path = self.path / (row["sha"] + row["suffix"]) + with self._rd._opener.open(row["url_npm"]) as f: + fp.touch() + fp.write_bytes(f.read()) + print("Finished downloads") + return None + + def clear(self) -> None: + # unlink all matching sha + # stricter than `__iter__` + # - to avoid deleting unrelated files in dir + self.ensure_active() + if self.is_empty(): + return None + ser = ( + self._rd._scan_metadata() + .select("sha", "suffix") + .unique("sha") + .select(nw.concat_str("sha", "suffix").alias("sha_suffix")) + .collect() + .get_column("sha_suffix") + ) + names = set[str]( + ser.to_list() if nw.get_native_namespace(ser) is get_pyarrow() else ser + ) + for fp in self: + if fp.name in names: + fp.unlink() + + def __iter__(self) -> Iterator[Path]: + yield from self.path.iterdir() + + def __repr__(self): + name = type(self).__name__ + if self.is_not_active(): + return f"{name}" + else: + return f"{name}<{self.path.as_posix()!r}>" + + def is_active(self) -> bool: + return not self.is_not_active() + + def is_not_active(self) -> bool: + return os.environ.get(self._ENV_VAR) is None + + def is_empty(self) -> bool: + """Cache is active, but no files in the directory.""" + return next(iter(self), None) is None + + def ensure_active(self) -> None: + # Fail fast when the cache op is later + # Otherwise, just get the error 
from `self.path` + if self.is_not_active(): + msg = ( + f"Cache is unset.\n" + f"To enable dataset caching, set the environment variable:\n" + f" {self._ENV_VAR!r}\n\n" + f"You can set this for the current session via:\n" + f" from pathlib import Path\n" + f" from altair.datasets import load\n\n" + f" load.cache.path = Path.home() / '.altair_cache'" + ) + raise ValueError(msg) + + @property + def path(self) -> Path: + """ + Returns path to datasets cache. + + By default, this can be configured using the environment variable: + + "ALTAIR_DATASETS_DIR" + + You can set this for the current session via: + + >>> from pathlib import Path + >>> from altair.datasets import load + >>> load.cache.path = Path.home() / ".altair_cache" + + >>> load.cache.path.relative_to(Path.home()).as_posix() + '.altair_cache' + """ + self.ensure_active() + fp = Path(os.environ[self._ENV_VAR]) + fp.mkdir(exist_ok=True) + return fp + + @path.setter + def path(self, source: StrPath | None, /) -> None: + if source is not None: + os.environ[self._ENV_VAR] = str(Path(source).resolve()) + else: + os.environ.pop(self._ENV_VAR, None) + + +url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 3e31aea2e..ac56aa892 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -1,7 +1,6 @@ from __future__ import annotations -from pathlib import Path -from typing import TYPE_CHECKING, Generic, TypeVar, final, get_args, overload +from typing import TYPE_CHECKING, Generic, final, overload from narwhals.typing import IntoDataFrameT, IntoFrameT @@ -9,13 +8,13 @@ if TYPE_CHECKING: import sys - from collections.abc import MutableMapping - from typing import Any, Final, Literal + from typing import Any, Literal import pandas as pd import polars as pl import pyarrow as pa - from _typeshed import StrPath + + from altair.datasets._cache import DatasetCache if sys.version_info >= (3, 11): from typing import LiteralString @@ -27,12 +26,6 @@ __all__ = ["Loader", "load"] -_KT = TypeVar("_KT") -_VT = TypeVar("_VT") -_T = TypeVar("_T") - -_URL: Final[Path] = Path(__file__).parent / "_metadata" / "url.csv.gz" - class Loader(Generic[IntoDataFrameT, IntoFrameT]): """ @@ -294,34 +287,18 @@ def url( """ return self._reader.url(name, suffix, tag=tag) + # TODO: Examples for tasklist @property - def cache_dir(self) -> Path | None: + def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: """ - Returns path to datasets cache. - - By default, this can be configured using the environment variable: - - "ALTAIR_DATASETS_DIR" - - You *may* also set this directly, but the value will **not** persist between sessions: - - from pathlib import Path - - from altair.datasets import Loader - - data = Loader.from_backend("polars") - data.cache_dir = Path.home() / ".altair_cache" + Dataset caching. 
- >>> data.cache_dir.relative_to(Path.home()).as_posix() # doctest: +SKIP - '.altair_cache' + - [x] Enable via 2 examples + - [ ] Disable after enabling (self.cache.path = None) + - [ ] Pre-download missing + - [ ] Clear entire cache """ - return self._reader._cache - - @cache_dir.setter - def cache_dir(self, source: StrPath, /) -> None: - import os - - os.environ[self._reader._ENV_VAR] = str(source) + return self._reader.cache def __repr__(self) -> str: return f"{type(self).__name__}[{self._reader._name}]" @@ -384,69 +361,6 @@ def __call__( return self.from_backend(backend)(name, suffix, tag=tag, **kwds) -class UrlCache(Generic[_KT, _VT]): - """ - `csv`_, `gzip`_ -based, lazy url lookup. - - Operates on a subset of available datasets: - - Only the latest version - - Excludes `.parquet`, which `cannot be read via url`_ - - Name collisions are pre-resolved - - Only provide the smallest (e.g. ``weather.json`` instead of ``weather.csv``) - - .. _csv: - https://docs.python.org/3/library/csv.html - .. _gzip: - https://docs.python.org/3/library/gzip.html - .. _cannot be read via url: - https://github.com/vega/vega/issues/3961 - """ - - def __init__( - self, - fp: Path, - /, - *, - columns: tuple[str, str] = ("dataset_name", "url_npm"), - tp: type[MutableMapping[_KT, _VT]] = dict["_KT", "_VT"], - ) -> None: - self.fp: Path = fp - self.columns: tuple[str, str] = columns - self._mapping: MutableMapping[_KT, _VT] = tp() - - def read(self) -> Any: - import csv - import gzip - - with gzip.open(self.fp, mode="rb") as f: - b_lines = f.readlines() - reader = csv.reader((bs.decode() for bs in b_lines), dialect=csv.unix_dialect) - header = tuple(next(reader)) - if header != self.columns: - msg = f"Expected header to match {self.columns!r},\n" f"but got: {header!r}" - raise ValueError(msg) - return dict(reader) - - def __getitem__(self, key: _KT, /) -> _VT: - if url := self.get(key, None): - return url - - from altair.datasets._typing import Dataset - - if key in get_args(Dataset): - msg = f"{key!r} cannot be loaded via url." - raise TypeError(msg) - else: - msg = f"{key!r} does not refer to a known dataset." - raise TypeError(msg) - - def get(self, key: _KT, default: _T) -> _VT | _T: - if not self._mapping: - self._mapping.update(self.read()) - return self._mapping.get(key, default) - - -url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL) load: _Load[Any, Any] diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 2c8d53820..e7c97b9d1 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -9,7 +9,6 @@ from __future__ import annotations -import os import urllib.request from collections.abc import Iterable, Mapping, Sequence from functools import partial @@ -33,6 +32,7 @@ import narwhals.stable.v1 as nw from narwhals.typing import IntoDataFrameT, IntoExpr, IntoFrameT +from altair.datasets._cache import DatasetCache from altair.datasets._typing import EXTENSION_SUFFIXES, is_ext_read if TYPE_CHECKING: @@ -128,7 +128,6 @@ class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): Otherwise, has no concrete meaning. 
""" - _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: @@ -151,8 +150,8 @@ def dataset( url = result["url_npm"] fn = self.read_fn(url) - if cache := self._cache: - fp = cache / (result["sha"] + result["suffix"]) + if self.cache.is_active(): + fp = self.cache.path / (result["sha"] + result["suffix"]) if fp.exists() and fp.stat().st_size: return fn(fp, **kwds) else: @@ -211,18 +210,8 @@ def _scan_metadata( return frame @property - def _cache(self) -> Path | None: # type: ignore[return] - """ - Returns path to datasets cache, if possible. - - Requires opt-in via environment variable:: - - Reader._ENV_VAR - """ - if _dir := os.environ.get(self._ENV_VAR): - cache_dir = Path(_dir) - cache_dir.mkdir(exist_ok=True) - return cache_dir + def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: + return DatasetCache(self) def _import(self, name: str, /) -> Any: if spec := find_spec(name): diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 50ece0a26..1d0990abf 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -7,6 +7,7 @@ from functools import partial from importlib import import_module from importlib.util import find_spec +from pathlib import Path from typing import TYPE_CHECKING, Any, cast, get_args from urllib.error import URLError @@ -21,7 +22,7 @@ from altair.datasets import Loader, url from altair.datasets._readers import _METADATA, AltairDatasetsError -from altair.datasets._typing import Dataset, Extension, Metadata, Version +from altair.datasets._typing import Dataset, Extension, Metadata, Version, is_ext_read from tests import skip_requires_pyarrow, slow if sys.version_info >= (3, 14): @@ -104,7 +105,7 @@ def polars_loader( tmp_path_factory: pytest.TempPathFactory, ) -> Loader[pl.DataFrame, pl.LazyFrame]: data = Loader.from_backend("polars") - data.cache_dir = tmp_path_factory.mktemp("loader-cache-polars") + data.cache.path = tmp_path_factory.mktemp("loader-cache-polars") return data @@ -273,7 +274,7 @@ def test_url(name: Dataset) -> None: def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: import altair.datasets - from altair.datasets._loader import url_cache + from altair.datasets._cache import url_cache monkeypatch.setitem(sys.modules, "polars", None) monkeypatch.setitem(sys.modules, "pandas", None) @@ -477,11 +478,11 @@ def test_reader_cache( monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) data = Loader.from_backend(backend) - cache_dir = data.cache_dir - assert cache_dir is not None + assert data.cache.is_active() + cache_dir = data.cache.path assert cache_dir == tmp_path - assert tuple(cache_dir.iterdir()) == () + assert tuple(data.cache) == () # smallest csvs lookup_groups = data("lookup_groups", tag="v2.5.3") @@ -489,7 +490,7 @@ def test_reader_cache( data("iowa-electricity", tag="v2.3.1") data("global-temp", tag="v2.9.0") - cached_paths = tuple(cache_dir.iterdir()) + cached_paths = tuple(data.cache) assert len(cached_paths) == 4 if is_polars_dataframe(lookup_groups): @@ -504,15 +505,15 @@ def test_reader_cache( ) assert_frame_equal(left, right) - assert len(tuple(cache_dir.iterdir())) == 4 - assert cached_paths == tuple(cache_dir.iterdir()) + assert len(tuple(data.cache)) == 4 + assert cached_paths == tuple(data.cache) data("iowa-electricity", tag="v1.30.2") data("global-temp", tag="v2.8.1") data("global-temp", tag="v2.8.0") - assert len(tuple(cache_dir.iterdir())) == 4 - assert 
cached_paths == tuple(cache_dir.iterdir()) + assert len(tuple(data.cache)) == 4 + assert cached_paths == tuple(data.cache) data("lookup_people", tag="v1.10.0") data("lookup_people", tag="v1.11.0") @@ -522,8 +523,52 @@ def test_reader_cache( data("lookup_people", tag="v2.3.0") data("lookup_people", tag="v2.5.0-next.0") - assert len(tuple(cache_dir.iterdir())) == 4 - assert cached_paths == tuple(cache_dir.iterdir()) + assert len(tuple(data.cache)) == 4 + assert cached_paths == tuple(data.cache) + + +@slow +@datasets_debug +@backends +def test_reader_cache_exhaustive( + backend: _Backend, monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + """ + Fully populate and then purge the cache for all backends. + + - Does not attempt to read the files + - Checking we can support pre-downloading and safely deleting + """ + monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) + data = Loader.from_backend(backend) + assert data.cache.is_active() + cache_dir = data.cache.path + assert cache_dir == tmp_path + assert tuple(data.cache) == () + + data.cache.download_all() + cached_paths = tuple(data.cache) + assert cached_paths != () + + # NOTE: Approximating all datasets downloaded + assert len(cached_paths) >= 40 + assert all( + bool(fp.exists() and is_ext_read(fp.suffix) and fp.stat().st_size) + for fp in data.cache + ) + # NOTE: Confirm this is a no-op + data.cache.download_all() + assert len(cached_paths) == len(tuple(data.cache)) + + # NOTE: Ensure unrelated files in the directory are not removed + dummy: Path = tmp_path / "dummy.json" + dummy.touch(exist_ok=False) + data.cache.clear() + + remaining = tuple(tmp_path.iterdir()) + assert len(remaining) == 1 + assert remaining[0] == dummy + dummy.unlink() movies_fail: ParameterSet = pytest.param( @@ -559,7 +604,7 @@ def test_reader_cache( def test_pyarrow_read_json( fallback: _Polars | None, name: Dataset, monkeypatch: pytest.MonkeyPatch ) -> None: - monkeypatch.setenv(CACHE_ENV_VAR, "") + monkeypatch.delenv(CACHE_ENV_VAR, raising=False) monkeypatch.delitem(sys.modules, "pandas", raising=False) if fallback is None: monkeypatch.setitem(sys.modules, "polars", None) @@ -630,7 +675,7 @@ def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - from polars.testing import assert_frame_equal data = Loader.from_backend("polars") - data.cache_dir = tmp_path + data.cache.path = tmp_path data("londonCentroids") data("stocks") From f21b52b6c932c517383de02087f75228af0f7a28 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 22 Nov 2024 17:59:09 +0000 Subject: [PATCH 123/201] ci(ruff): Ignore `0.8.0` violations https://github.com/vega/altair/discussions/3687#discussioncomment-11351453 --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c43e00504..e398dfb6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -378,7 +378,9 @@ ignore = [ # doc-line-too-long "W505", # Any as annotation - "ANN401" + "ANN401", + # 0.8.0 + "RUF039", "RUF200" ] # https://docs.astral.sh/ruff/settings/#lintpydocstyle pydocstyle={ convention="numpy" } From e7974d90c78a38c06d7e19aeeb54e32179948022 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 22 Nov 2024 19:53:01 +0000 Subject: [PATCH 124/201] fix: Use stable `narwhals` imports https://github.com/narwhals-dev/narwhals/issues/1426, https://github.com/vega/altair/pull/3693#discussion_r1854513083 --- altair/datasets/_cache.py | 8 +++++--- 
altair/datasets/_loader.py | 2 +- altair/datasets/_readers.py | 2 +- tests/test_datasets.py | 25 ++++++++++--------------- 4 files changed, 17 insertions(+), 20 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 9239911fd..0166c50e8 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -5,8 +5,8 @@ from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, get_args import narwhals.stable.v1 as nw -from narwhals.dependencies import get_pyarrow -from narwhals.typing import IntoDataFrameT, IntoFrameT +from narwhals.stable.v1 import dependencies as nw_dep +from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT from altair.datasets._typing import VERSION_LATEST @@ -151,7 +151,9 @@ def clear(self) -> None: .get_column("sha_suffix") ) names = set[str]( - ser.to_list() if nw.get_native_namespace(ser) is get_pyarrow() else ser + ser.to_list() + if nw.get_native_namespace(ser) is nw_dep.get_pyarrow() + else ser ) for fp in self: if fp.name in names: diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index ac56aa892..5be85e60a 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Generic, final, overload -from narwhals.typing import IntoDataFrameT, IntoFrameT +from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT from altair.datasets._readers import _Reader, backend diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index e7c97b9d1..5adcf3751 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -30,7 +30,7 @@ ) import narwhals.stable.v1 as nw -from narwhals.typing import IntoDataFrameT, IntoExpr, IntoFrameT +from narwhals.stable.v1.typing import IntoDataFrameT, IntoExpr, IntoFrameT from altair.datasets._cache import DatasetCache from altair.datasets._typing import EXTENSION_SUFFIXES, is_ext_read diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 1d0990abf..20515069b 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -12,13 +12,8 @@ from urllib.error import URLError import pytest -from narwhals.dependencies import ( - is_into_dataframe, - is_pandas_dataframe, - is_polars_dataframe, - is_pyarrow_table, -) from narwhals.stable import v1 as nw +from narwhals.stable.v1 import dependencies as nw_dep from altair.datasets import Loader, url from altair.datasets._readers import _METADATA, AltairDatasetsError @@ -227,11 +222,11 @@ def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: default_2 = load("cars") df_polars = load("cars", backend="polars") - assert is_polars_dataframe(default) - assert is_pyarrow_table(df_pyarrow) - assert is_pandas_dataframe(df_pandas) - assert is_polars_dataframe(default_2) - assert is_polars_dataframe(df_polars) + assert nw_dep.is_polars_dataframe(default) + assert nw_dep.is_pyarrow_table(df_pyarrow) + assert nw_dep.is_pandas_dataframe(df_pandas) + assert nw_dep.is_polars_dataframe(default_2) + assert nw_dep.is_polars_dataframe(df_polars) @pytest.mark.parametrize( @@ -320,7 +315,7 @@ def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None data = Loader.from_backend(backend) frame = data("stocks", ".csv") - assert is_into_dataframe(frame) + assert nw_dep.is_into_dataframe(frame) nw_frame = nw.from_native(frame) assert set(nw_frame.columns) == {"symbol", "date", "price"} @@ -493,7 +488,7 @@ def test_reader_cache( cached_paths = tuple(data.cache) assert len(cached_paths) == 4 - if 
is_polars_dataframe(lookup_groups): + if nw_dep.is_polars_dataframe(lookup_groups): left, right = ( lookup_groups, cast(pl.DataFrame, data("lookup_groups", tag="v2.5.3")), @@ -664,7 +659,7 @@ def test_all_datasets( ) -> None: """Ensure all annotated datasets can be loaded with the most reliable backend.""" frame = polars_loader(name, suffix, tag=tag) - assert is_polars_dataframe(frame) + assert nw_dep.is_polars_dataframe(frame) def _raise_exception(e: type[Exception], *args: Any, **kwds: Any): @@ -698,7 +693,7 @@ def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - # Now we can get a cache-hit frame = data("birdstrikes") - assert is_polars_dataframe(frame) + assert nw_dep.is_polars_dataframe(frame) assert len(tuple(tmp_path.iterdir())) == 4 with monkeypatch.context() as mp: From c907dc500504cdff8e2342f488fb679cd2108975 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 24 Nov 2024 13:52:44 +0000 Subject: [PATCH 125/201] revert(ruff): Ignore `0.8.0` violations f21b52b6c932c517383de02087f75228af0f7a28 --- pyproject.toml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a44b4459e..c353b9b9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -377,9 +377,7 @@ ignore = [ # doc-line-too-long "W505", # Any as annotation - "ANN401", - # 0.8.0 - "RUF039", "RUF200" + "ANN401" ] # https://docs.astral.sh/ruff/settings/#lintpydocstyle pydocstyle={ convention="numpy" } From a3b38c49836c850681c41c797865351bddfccbb7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 24 Nov 2024 13:58:53 +0000 Subject: [PATCH 126/201] revert: Remove `_readers._filter` Feature has been adopted upstream in https://github.com/narwhals-dev/narwhals/pull/1417 --- altair/datasets/_readers.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 5adcf3751..354a45532 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -206,7 +206,7 @@ def _scan_metadata( ) -> nw.LazyFrame: frame = nw.from_native(self.scan_fn(_METADATA)(_METADATA)).lazy() if predicates or constraints: - return _filter(frame, *predicates, **constraints) + return frame.filter(*predicates, **constraints) return frame @property @@ -392,26 +392,6 @@ def pa_read_json(source: Any, /, **kwds) -> pa.Table: self._scan_fn = {".parquet": pa_read_parquet} -def _filter( - frame: FrameT, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] -) -> FrameT: - """ - ``narwhals`` only accepts ``filter(*predicates)``. 
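Since the keyword-constraint form has been adopted upstream (the ``narwhals`` PR linked above), the helper being deleted here is no longer needed and the call site reduces to a plain ``filter``; a minimal sketch, assuming a polars-backed frame::

    import narwhals.stable.v1 as nw
    import polars as pl

    lf = nw.from_native(
        pl.LazyFrame({"dataset_name": ["cars", "stocks"], "suffix": [".json", ".csv"]})
    )

    # keyword constraints are shorthand for nw.col("dataset_name") == "cars"
    lf.filter(dataset_name="cars").collect()
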
- - So we convert each item in ``**constraints`` here as:: - - col("column_name") == literal_value - - - https://github.com/narwhals-dev/narwhals/issues/1383 - - https://github.com/narwhals-dev/narwhals/pull/1417 - """ - return frame.filter( - nw.all_horizontal( - *chain(predicates, (nw.col(name) == v for name, v in constraints.items())) - ) - ) - - def _extract_constraints( name: Dataset | LiteralString, suffix: Extension | None, tag: Version | None, / ) -> Metadata: From a6c5096ddab82fd4682006f90158b71b0f3aa479 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 24 Nov 2024 14:43:11 +0000 Subject: [PATCH 127/201] feat: Adds example and tests for disabling caching --- altair/datasets/_cache.py | 4 ++++ altair/datasets/_loader.py | 2 +- tests/test_datasets.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 0166c50e8..f801a26d1 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -211,6 +211,10 @@ def path(self) -> Path: >>> load.cache.path.relative_to(Path.home()).as_posix() '.altair_cache' + + You can *later* disable caching via: + + >>> load.cache.path = None """ self.ensure_active() fp = Path(os.environ[self._ENV_VAR]) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 5be85e60a..111af950b 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -294,7 +294,7 @@ def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: Dataset caching. - [x] Enable via 2 examples - - [ ] Disable after enabling (self.cache.path = None) + - [x] Disable after enabling (self.cache.path = None) - [ ] Pre-download missing - [ ] Clear entire cache """ diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 20515069b..5d2b93c2d 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -566,6 +566,36 @@ def test_reader_cache_exhaustive( dummy.unlink() +def test_reader_cache_disable(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + from altair.datasets import load + + monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) + + assert load.cache.is_active() + assert load.cache.path == tmp_path + assert load.cache.is_empty() + load("cars") + assert not load.cache.is_empty() + + # RELATED: https://github.com/python/mypy/issues/3004 + load.cache.path = None # type: ignore[assignment] + + assert load.cache.is_not_active() + with pytest.raises( + ValueError, + match=re.compile( + rf"Cache.+unset.+{CACHE_ENV_VAR}.+\.cache\.path =", flags=re.DOTALL + ), + ): + tuple(load.cache) + + load.cache.path = tmp_path + + assert load.cache.is_active() + assert load.cache.path == tmp_path + assert not load.cache.is_empty() + + movies_fail: ParameterSet = pytest.param( "movies", marks=pytest.mark.xfail( From 71423eadfe63a767c2b591f743b3a36272d59c7d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 24 Nov 2024 15:11:41 +0000 Subject: [PATCH 128/201] refactor: Tidy up `DatasetCache` --- altair/datasets/_cache.py | 124 ++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 67 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index f801a26d1..f9e3c683a 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, get_args import narwhals.stable.v1 as nw -from narwhals.stable.v1 import dependencies as 
nw_dep +from narwhals.stable.v1.dependencies import get_pyarrow from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT from altair.datasets._typing import VERSION_LATEST @@ -102,22 +102,38 @@ class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): def __init__(self, reader: _Reader[IntoDataFrameT, IntoFrameT], /) -> None: self._rd: _Reader[IntoDataFrameT, IntoFrameT] = reader + def clear(self) -> None: + """Delete all previously cached datasets.""" + self._ensure_active() + if self.is_empty(): + return None + ser = ( + self._rd._scan_metadata() + .select("sha", "suffix") + .unique("sha") + .select(nw.concat_str("sha", "suffix").alias("sha_suffix")) + .collect() + .get_column("sha_suffix") + ) + names = set[str]( + ser.to_list() if nw.get_native_namespace(ser) is get_pyarrow() else ser + ) + for fp in self: + if fp.name in names: + fp.unlink() + def download_all(self) -> None: """ Download any missing datasets for latest version. - ``v2.11.0`` stats - ----------------- - - **66** items - - **27.8** MB - - Only 1 file > 2 MB + Requires **30-50MB** of disk-space. """ stems = tuple(fp.stem for fp in self) latest = nw.col("tag") == nw.lit(VERSION_LATEST) predicates = (~(nw.col("sha").is_in(stems)), latest) if stems else (latest,) frame = ( self._rd._scan_metadata( - *predicates, ext_supported=True, name_collision=False + predicates, ext_supported=True, name_collision=False ) .select("sha", "suffix", "url_npm") .unique("sha") @@ -135,65 +151,6 @@ def download_all(self) -> None: print("Finished downloads") return None - def clear(self) -> None: - # unlink all matching sha - # stricter than `__iter__` - # - to avoid deleting unrelated files in dir - self.ensure_active() - if self.is_empty(): - return None - ser = ( - self._rd._scan_metadata() - .select("sha", "suffix") - .unique("sha") - .select(nw.concat_str("sha", "suffix").alias("sha_suffix")) - .collect() - .get_column("sha_suffix") - ) - names = set[str]( - ser.to_list() - if nw.get_native_namespace(ser) is nw_dep.get_pyarrow() - else ser - ) - for fp in self: - if fp.name in names: - fp.unlink() - - def __iter__(self) -> Iterator[Path]: - yield from self.path.iterdir() - - def __repr__(self): - name = type(self).__name__ - if self.is_not_active(): - return f"{name}" - else: - return f"{name}<{self.path.as_posix()!r}>" - - def is_active(self) -> bool: - return not self.is_not_active() - - def is_not_active(self) -> bool: - return os.environ.get(self._ENV_VAR) is None - - def is_empty(self) -> bool: - """Cache is active, but no files in the directory.""" - return next(iter(self), None) is None - - def ensure_active(self) -> None: - # Fail fast when the cache op is later - # Otherwise, just get the error from `self.path` - if self.is_not_active(): - msg = ( - f"Cache is unset.\n" - f"To enable dataset caching, set the environment variable:\n" - f" {self._ENV_VAR!r}\n\n" - f"You can set this for the current session via:\n" - f" from pathlib import Path\n" - f" from altair.datasets import load\n\n" - f" load.cache.path = Path.home() / '.altair_cache'" - ) - raise ValueError(msg) - @property def path(self) -> Path: """ @@ -216,7 +173,7 @@ def path(self) -> Path: >>> load.cache.path = None """ - self.ensure_active() + self._ensure_active() fp = Path(os.environ[self._ENV_VAR]) fp.mkdir(exist_ok=True) return fp @@ -228,5 +185,38 @@ def path(self, source: StrPath | None, /) -> None: else: os.environ.pop(self._ENV_VAR, None) + def __iter__(self) -> Iterator[Path]: + yield from self.path.iterdir() + + def __repr__(self) -> str: + name = 
type(self).__name__ + if self.is_not_active(): + return f"{name}" + else: + return f"{name}<{self.path.as_posix()!r}>" + + def is_active(self) -> bool: + return not self.is_not_active() + + def is_not_active(self) -> bool: + return os.environ.get(self._ENV_VAR) is None + + def is_empty(self) -> bool: + """Cache is active, but no files are stored in ``self.path``.""" + return next(iter(self), None) is None + + def _ensure_active(self) -> None: + if self.is_not_active(): + msg = ( + f"Cache is unset.\n" + f"To enable dataset caching, set the environment variable:\n" + f" {self._ENV_VAR!r}\n\n" + f"You can set this for the current session via:\n" + f" from pathlib import Path\n" + f" from altair.datasets import load\n\n" + f" load.cache.path = Path.home() / '.altair_cache'" + ) + raise ValueError(msg) + url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL) From 7dd9c18a6eef4c15baa91540ef887c30e38bff04 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 24 Nov 2024 15:25:13 +0000 Subject: [PATCH 129/201] docs: Finish `Loader.cache` Not using doctest style here, none of these return anything but I want them hinted at --- altair/datasets/_cache.py | 2 ++ altair/datasets/_loader.py | 18 ++++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index f9e3c683a..ce058c561 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -97,6 +97,8 @@ def get(self, key: _KT, default: _T) -> _VT | _T: class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): + """Optional caching of remote dataset requests.""" + _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" def __init__(self, reader: _Reader[IntoDataFrameT, IntoFrameT], /) -> None: diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 111af950b..ce2559aed 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -287,16 +287,22 @@ def url( """ return self._reader.url(name, suffix, tag=tag) - # TODO: Examples for tasklist @property def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: """ - Dataset caching. + Optional caching of remote dataset requests. - - [x] Enable via 2 examples - - [x] Disable after enabling (self.cache.path = None) - - [ ] Pre-download missing - - [ ] Clear entire cache + Enable caching: + + self.cache.path = ... + + Download the latest datasets *ahead-of-time*: + + self.cache.download_all() + + Remove all downloaded datasets: + + self.cache.clear() """ return self._reader.cache From a982759715061c436ea93aea8234cd04dfca4657 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 24 Nov 2024 17:26:20 +0000 Subject: [PATCH 130/201] refactor(typing): Use `Mapping` instead of `dict` Mutability is not needed. Also see https://github.com/vega/altair/pull/3573 --- altair/datasets/_readers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 354a45532..9228c5531 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -105,14 +105,14 @@ class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): _Reader._name """ - _read_fn: dict[Extension, Callable[..., IntoDataFrameT]] + _read_fn: Mapping[Extension, Callable[..., IntoDataFrameT]] """ Eager file read functions. Each corresponds to a known file extension within ``vega-datasets``. 
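For a concrete picture of this mapping, the eager ``polars`` variant seen earlier in the series looks roughly like the sketch below; the real reader substitutes a custom JSON round-trip helper for ``pl.read_json``, so treat this as illustrative only::

    from functools import partial

    import polars as pl

    _read_fn = {
        ".csv": pl.read_csv,
        ".json": pl.read_json,
        ".tsv": partial(pl.read_csv, separator="\t"),
        ".arrow": pl.read_ipc,
        ".parquet": pl.read_parquet,
    }
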
""" - _scan_fn: dict[_ExtensionScan, Callable[..., IntoFrameT]] + _scan_fn: Mapping[_ExtensionScan, Callable[..., IntoFrameT]] """ *Optionally*-lazy file read/scan functions. From d20e9c11071898bb3f418fda22bf3f915ff949e8 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 30 Nov 2024 14:44:42 +0000 Subject: [PATCH 131/201] perf: Use `to_list()` for all backends https://github.com/narwhals-dev/narwhals/issues/1443#issuecomment-2508957161, https://github.com/narwhals-dev/narwhals/issues/1443#issuecomment-2508928135, https://github.com/narwhals-dev/narwhals/issues/1443#issuecomment-2508981618 --- altair/datasets/_cache.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index ce058c561..edca990d6 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -5,7 +5,6 @@ from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, get_args import narwhals.stable.v1 as nw -from narwhals.stable.v1.dependencies import get_pyarrow from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT from altair.datasets._typing import VERSION_LATEST @@ -117,9 +116,7 @@ def clear(self) -> None: .collect() .get_column("sha_suffix") ) - names = set[str]( - ser.to_list() if nw.get_native_namespace(ser) is get_pyarrow() else ser - ) + names = set[str](ser.to_list()) for fp in self: if fp.name in names: fp.unlink() From 909e7d05e57718b2f634a7e6781cb4e58a835837 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Dec 2024 15:38:12 +0000 Subject: [PATCH 132/201] feat(DRAFT): Utilize `datapackage` schemas in `pandas` backends Provides a generalized solution to `pd.read_(csv|json)` requiring the names of date columns to attempt parsing. 
cc @joelostblom The solution is possible in large part to https://github.com/vega/vega-datasets/pull/631 https://github.com/vega/altair/pull/3631#issuecomment-2480816377 --- altair/datasets/_cache.py | 149 ++++++++++++++++-- .../_metadata/datapackage_schemas.json.gz | Bin 0 -> 2490 bytes altair/datasets/_readers.py | 37 ++++- altair/datasets/_typing.py | 22 +++ tests/test_datasets.py | 68 +++++++- tools/datasets/__init__.py | 85 +++++++--- tools/datasets/datapackage.py | 133 ++++++++++++++++ tools/datasets/models.py | 94 ++++++++++- tools/datasets/npm.py | 54 ++++++- 9 files changed, 600 insertions(+), 42 deletions(-) create mode 100644 altair/datasets/_metadata/datapackage_schemas.json.gz create mode 100644 tools/datasets/datapackage.py diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index edca990d6..22c652bf3 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import sys from pathlib import Path from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, get_args @@ -9,19 +10,32 @@ from altair.datasets._typing import VERSION_LATEST +if sys.version_info >= (3, 12): + from typing import Protocol +else: + from typing_extensions import Protocol + if TYPE_CHECKING: - import sys - from collections.abc import Iterator, MutableMapping + from collections.abc import Iterator, Mapping, MutableMapping + from io import IOBase from typing import Any, Final from _typeshed import StrPath + from narwhals.stable.v1.dtypes import DType if sys.version_info >= (3, 11): from typing import LiteralString else: from typing_extensions import LiteralString + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias from altair.datasets._readers import _Reader - from altair.datasets._typing import Dataset + from altair.datasets._typing import Dataset, FlFieldStr + + _Dataset: TypeAlias = "Dataset | LiteralString" + _FlSchema: TypeAlias = Mapping[str, FlFieldStr] __all__ = ["DatasetCache", "UrlCache", "url_cache"] @@ -31,9 +45,62 @@ _T = TypeVar("_T") _URL: Final[Path] = Path(__file__).parent / "_metadata" / "url.csv.gz" +_SCHEMA: Final[Path] = ( + Path(__file__).parent / "_metadata" / "datapackage_schemas.json.gz" +) + +_FIELD_TO_DTYPE: Mapping[FlFieldStr, type[DType]] = { + "integer": nw.Int64, + "number": nw.Float64, + "boolean": nw.Boolean, + "string": nw.String, + "object": nw.Struct, + "array": nw.List, + "date": nw.Date, + "datetime": nw.Datetime, + # "time": nw.Time, (Not Implemented, but we don't have any cases using it anyway) + "duration": nw.Duration, +} +""" +Similar to an inverted `pl.datatypes.convert.dtype_to_ffiname`_. + +But using the string repr of ``frictionless`` `Field Types`_ to `narwhals.dtypes`_. + +.. _pl.datatypes.convert.dtype_to_ffiname: + https://github.com/pola-rs/polars/blob/85d078c066860e012f5e7e611558e6382b811b82/py-polars/polars/datatypes/convert.py#L139-L165 +.. _Field Types: + https://datapackage.org/standard/table-schema/#field-types +.. _narwhals.dtypes: + https://narwhals-dev.github.io/narwhals/api-reference/dtypes/ +""" + +_DTYPE_TO_FIELD: Mapping[type[DType], FlFieldStr] = { + v: k for k, v in _FIELD_TO_DTYPE.items() +} + + +class CompressedCache(Protocol[_KT, _VT]): + fp: Path + _mapping: MutableMapping[_KT, _VT] + + def read(self) -> Any: ... + def __getitem__(self, key: _KT, /) -> _VT: ... 
+ + def __enter__(self) -> IOBase: + import gzip + + return gzip.open(self.fp, mode="rb").__enter__() + def __exit__(self, *args) -> None: + return -class UrlCache(Generic[_KT, _VT]): + def get(self, key: _KT, default: _T, /) -> _VT | _T: + if not self._mapping: + self._mapping.update(self.read()) + return self._mapping.get(key, default) + + +class UrlCache(CompressedCache[_KT, _VT]): """ `csv`_, `gzip`_ -based, lazy url lookup. @@ -65,9 +132,8 @@ def __init__( def read(self) -> Any: import csv - import gzip - with gzip.open(self.fp, mode="rb") as f: + with self as f: b_lines = f.readlines() reader = csv.reader((bs.decode() for bs in b_lines), dialect=csv.unix_dialect) header = tuple(next(reader)) @@ -89,10 +155,72 @@ def __getitem__(self, key: _KT, /) -> _VT: msg = f"{key!r} does not refer to a known dataset." raise TypeError(msg) - def get(self, key: _KT, default: _T) -> _VT | _T: - if not self._mapping: - self._mapping.update(self.read()) - return self._mapping.get(key, default) + +class SchemaCache(CompressedCache["_Dataset", "_FlSchema"]): + """ + `json`_, `gzip`_ -based, lazy schema lookup. + + - Primarily benefits ``pandas``, which needs some help identifying **temporal** columns. + - Utilizes `data package`_ schema types. + - All methods return falsy containers instead of exceptions + + .. _json: + https://docs.python.org/3/library/json.html + .. _gzip: + https://docs.python.org/3/library/gzip.html + .. _data package: + https://github.com/vega/vega-datasets/pull/631 + """ + + def __init__( + self, + fp: Path, + /, + *, + tp: type[MutableMapping[_Dataset, _FlSchema]] = dict["_Dataset", "_FlSchema"], + ) -> None: + self.fp: Path = fp + self._mapping: MutableMapping[_Dataset, _FlSchema] = tp() + + def read(self) -> Any: + import json + + with self as f: + return json.load(f) + + def __getitem__(self, key: _Dataset, /) -> _FlSchema: + return self.get(key, {}) + + def by_dtype(self, name: _Dataset, *dtypes: type[DType]) -> list[str]: + """ + Return column names specfied in ``name``'s schema. + + Parameters + ---------- + name + Dataset name. + *dtypes + Optionally, only return columns matching the given data type(s). + """ + if (match := self[name]) and dtypes: + include = {_DTYPE_TO_FIELD[tp] for tp in dtypes} + return [col for col, tp_str in match.items() if tp_str in include] + else: + return list(match) + + def schema(self, name: _Dataset, /) -> Mapping[str, DType]: + return { + column: _FIELD_TO_DTYPE[tp_str]() for column, tp_str in self[name].items() + } + + def schema_cast(self, name: _Dataset, /) -> Iterator[nw.Expr]: + """ + Can be passed directly to `.with_columns(...). + + BUG: `cars` doesnt work in either pandas backend + """ + for column, dtype in self.schema(name).items(): + yield nw.col(column).cast(dtype) class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): @@ -219,3 +347,4 @@ def _ensure_active(self) -> None: url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL) +schema_cache = SchemaCache(_SCHEMA) diff --git a/altair/datasets/_metadata/datapackage_schemas.json.gz b/altair/datasets/_metadata/datapackage_schemas.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..537dcd28ba9377319523683299cb1773ddf40e79 GIT binary patch literal 2490 zcmV;r2}SlFiwFn+00002|72lwVQ^t%Yhh<)Uvpz$tz$R-L!ItIS||(ffKCc2x9p&f;3z=l4IH%W+WdN6

XuiARkfAhqQ;6aWN zvyuC)aAS!);apf|9XB@6{-~V5biPnJSDd-ZVjh{YnFf{ur8NhZ?3B`f5ScF{DT0u# zaiUk=CCh**!KMW6xB`Po84iu)UpvDar|T}mnBRB|`pK|5T{|X}Cp@|06IZQ8I+AQ= zgm%4(3Sq2E6|0eNNEQWClY;~0+)PgyhFvZ9Do%qlj+~O^x#8AguM}mq(WcTmpJH6s zU@o;Pg==#@Rfu<61oeW~`l&bK%gKXq64CniI{9R@wMf3t+Q@t>avsIPM^}GbC2J-* zM%@{mx0#QhZ4-MA^IKIP5K0e6Pj>A_K`C z7H3EG?DTQM`72y*C(uaL=Cw`WtfSpKzzhBQy6m=N8S%%Xmy z8;)F{0zXFP(OaZqsV&Y2qEt+x){Hw+F+o}Op!vi1IJ>nk+H}cE=a}|zB-hEeQPw3Hd`@^iCp@1Mp3f1_55JQWM22%B!#R;*L1b7E85Tr_ z1(9JvWLOXx7DVq0!t;XgydXR;3C~Ny^OEqqBs?z(&r8DdlJLAFJTD2)OT;rY<);KP zCy+A&c}E~W6Uci4`9L7&1hOCy!t>p^5d3|e7p0>qhCD1kXrH-?3mJ3;XN<>3d#G)L3m3x zjr&9KdtN6qrC$=nLh%1Q-VRN8X>{B4nIrrp?+wNtX%zd9`rnaP02pm}#%v7r#AJ(i zGsEhuCkjae?e-(fBk~jx*+WKK_xN%qw#o5~_1nKU4r#5O1ukY8^OltRB@tN1MC%VqR3yQXut#T*W-p- z=UsBctWAC@F>c+zUX29!Q#8Iu?gfB9X{2L*yLJ92C&0SQ4McZXp2M&xa{m1pSkO0# zl36^Z?}o`)#hG*!4BVNn`yn^#jC=fa)b-y^MLoJfH&Creh}!0T=+0sTWCK~4ADKGm zKYmxd5hnM49|ym<=rch=8xz1}h7}GR?vW=q;5%p=^RXyk&Q31sFtF(&$yq%t5)Oxc z38E%=)wCxLKcrARxee=kyvgdfmr113-Btei2`=q*IYoXJGWV;1WDT9iF;py~!`oDl z1Ub%1TbJBM9ybEQ7kv@Mgwme;4achAIVaZ_>jZ>*RN9e(R%^vfRvp{rgCiJ6-gfYk z-G|0AraTG}N|$(d^mPd@SDY6T2kWl9jeHOCHF^}=uex@Ap|mQPsqu}FK4{xkY7Kos z6J9Fmo1Nw62_nzqp|PO&foA%>b9H!_P9f2!QkNw?07C)D*guxyZ<%Tfmbuo@&+^u= zjpS*N0&jh_`WD(5mh;s4KHJA`|6mAhnRJ3A&26P2lQJ33geQPX-7Lb}$CZN{@-9TZ8unpEIxV7-DKLxjM{KWv-*$aQb!3hq$ z_9^|?S;>WeLF`Xz8+wPKwBe`BV Callable[..., IntoDataFrameT]: def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: return self._scan_fn[_extract_suffix(source, is_ext_scan)] + def _schema_kwds(self, result: Metadata, /) -> dict[str, Any]: + """Hook to provide additional schema metadata on read.""" + return {} + def dataset( self, name: Dataset | LiteralString, @@ -149,6 +153,8 @@ def dataset( result = cast("Metadata", next(it)) url = result["url_npm"] fn = self.read_fn(url) + if default_kwds := self._schema_kwds(result): + kwds = default_kwds | kwds if kwds else default_kwds if self.cache.is_active(): fp = self.cache.path / (result["sha"] + result["suffix"]) @@ -238,7 +244,32 @@ def __repr__(self) -> str: def __init__(self, name: LiteralString, /) -> None: ... -class _PandasReader(_Reader["pd.DataFrame", "pd.DataFrame"]): +class _PandasReaderBase(_Reader["pd.DataFrame", "pd.DataFrame"], Protocol): + """ + Provides temporal column names as keyword arguments on read. 
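    A rough illustration of the keyword arguments this mixin is expected to inject,
    using dataset/column names from the bundled schemas; the ``reader`` instance is
    hypothetical, so the examples are skipped rather than executed:

        >>> reader._schema_kwds({"dataset_name": "cars", "suffix": ".json"})  # doctest: +SKIP
        {'convert_dates': ['Year']}
        >>> reader._schema_kwds({"dataset_name": "co2-concentration", "suffix": ".csv"})  # doctest: +SKIP
        {'parse_dates': ['Date']}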
+ + Related + ------- + - https://github.com/vega/altair/pull/3631#issuecomment-2480816377 + - https://github.com/vega/vega-datasets/pull/631 + - https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html + - https://pandas.pydata.org/docs/reference/api/pandas.read_json.html + """ + + def _schema_kwds(self, result: Metadata, /) -> dict[str, Any]: + from altair.datasets._cache import schema_cache + + name: Any = result["dataset_name"] + suffix = result["suffix"] + if cols := schema_cache.by_dtype(name, nw.Date, nw.Datetime): + if suffix == ".json": + return {"convert_dates": cols} + elif suffix in {".csv", ".tsv"}: + return {"parse_dates": cols} + return super()._schema_kwds(result) + + +class _PandasReader(_PandasReaderBase): def __init__(self, name: _Pandas, /) -> None: self._name = _requirements(name) if not TYPE_CHECKING: @@ -253,7 +284,7 @@ def __init__(self, name: _Pandas, /) -> None: self._scan_fn = {".parquet": pd.read_parquet} -class _PandasPyArrowReader(_Reader["pd.DataFrame", "pd.DataFrame"]): +class _PandasPyArrowReader(_PandasReaderBase): def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: _pd, _pa = _requirements(name) self._name = name diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index 0b681b834..c83c6066e 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -257,3 +257,25 @@ class Metadata(TypedDict, total=False): suffix: str tag: str url_npm: str + + +FlFieldStr: TypeAlias = Literal[ + "integer", + "number", + "boolean", + "string", + "object", + "array", + "date", + "datetime", + "time", + "duration", +] +""" +String representation of `frictionless`_ `Field Types`_. + +.. _frictionless: + https://github.com/frictionlessdata/frictionless-py +.. _Field Types: + https://datapackage.org/standard/table-schema/#field-types +""" diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 5d2b93c2d..9d91c275e 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -30,10 +30,12 @@ from pathlib import Path from typing import Literal + import pandas as pd import polars as pl from _pytest.mark.structures import ParameterSet - from altair.datasets._readers import _Backend, _Polars + from altair.datasets._readers import _Backend, _PandasAny, _Polars + from altair.vegalite.v5.schema._typing import OneOrSeq from tests import MarksType CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" @@ -743,3 +745,67 @@ def test_metadata_columns(backend: _Backend, metadata_columns: frozenset[str]) - native = fn(_METADATA) schema_columns = nw.from_native(native).lazy().collect().columns assert set(schema_columns) == metadata_columns + + +@skip_requires_pyarrow +@pytest.mark.parametrize("backend", ["pandas", "pandas[pyarrow]"]) +@pytest.mark.parametrize( + ("name", "columns"), + [ + ("birdstrikes", "Flight Date"), + ("cars", "Year"), + ("co2-concentration", "Date"), + ("crimea", "date"), + ("football", "date"), + ("iowa-electricity", "year"), + ("la-riots", "death_date"), + ("ohlc", "date"), + ("seattle-weather-hourly-normals", "date"), + ("seattle-weather", "date"), + ("sp500-2000", "date"), + ("unemployment-across-industries", "date"), + ("us-employment", "month"), + ], +) +def test_pandas_date_parse( + backend: _PandasAny, + name: Dataset, + columns: OneOrSeq[str], + polars_loader: Loader[pl.DataFrame, pl.LazyFrame], +) -> None: + """ + Ensure schema defaults are correctly parsed. + + NOTE: + - Depends on ``frictionless`` being able to detect the date/datetime columns. 
+ - Not all format strings work + """ + date_columns: list[str] = [columns] if isinstance(columns, str) else list(columns) + + load = Loader.from_backend(backend) + url = load.url(name) + kwds: dict[str, Any] = ( + {"convert_dates": date_columns} + if url.endswith(".json") + else {"parse_dates": date_columns} + ) + kwds_empty: dict[str, Any] = {k: [] for k in kwds} + + df_schema_derived: pd.DataFrame = load(name) + nw_schema = nw.from_native(df_schema_derived).schema + + df_manually_specified: pd.DataFrame = load(name, **kwds) + df_dates_empty: pd.DataFrame = load(name, **kwds_empty) + + assert set(date_columns).issubset(nw_schema) + for column in date_columns: + assert nw_schema[column] in {nw.Date, nw.Datetime} + + assert nw_schema == nw.from_native(df_manually_specified).schema + assert nw_schema != nw.from_native(df_dates_empty).schema + + # NOTE: Checking `polars` infers the same[1] as what `pandas` needs a hint for + # [1] Doesn't need to be exact, just recognise as *some kind* of date/datetime + pl_schema: pl.Schema = polars_loader(name).schema + for column in date_columns: + assert pl_schema[column].is_temporal() diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 1402a9c7b..66c31e6f6 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -39,7 +39,15 @@ else: from typing_extensions import TypeAlias - _PathAlias: TypeAlias = Literal["npm_tags", "gh_tags", "gh_trees"] + _PathAlias: TypeAlias = Literal[ + "npm_tags", + "gh_tags", + "gh_trees", + "typing", + "url", + "dpkg_features", + "dpkg_schemas", + ] __all__ = ["app"] @@ -102,15 +110,17 @@ def __init__( npm_cdn_url=self._npm.url.CDN, **kwds_gh, ) - self._paths = types.MappingProxyType["_PathAlias", Path]( + self.paths = types.MappingProxyType["_PathAlias", Path]( { "npm_tags": self.npm._paths["tags"], "gh_tags": self.github._paths["tags"], "gh_trees": self.github._paths["trees"], + "typing": out_fp_typing, + "url": out_dir_altair / "url.csv.gz", + "dpkg_features": out_dir_altair / "datapackage_features.parquet", + "dpkg_schemas": out_dir_altair / "datapackage_schemas.json.gz", } ) - self._fp_typing: Path = out_fp_typing - self._fp_url: Path = out_dir_altair / "url.csv.gz" @property def github(self) -> GitHub: @@ -131,13 +141,13 @@ def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: """ print("Syncing datasets ...") npm_tags = self.npm.tags() - self.write_parquet(npm_tags, self._paths["npm_tags"]) + self.write_parquet(npm_tags, self.paths["npm_tags"]) gh_tags = self.github.refresh_tags(npm_tags) - self.write_parquet(gh_tags, self._paths["gh_tags"]) + self.write_parquet(gh_tags, self.paths["gh_tags"]) gh_trees = self.github.refresh_trees(gh_tags) - self.write_parquet(gh_trees, self._paths["gh_trees"]) + self.write_parquet(gh_trees, self.paths["gh_trees"]) npm_urls_min = ( gh_trees.lazy() @@ -145,31 +155,29 @@ def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: .filter(col("size") == col("size").min().over("dataset_name")) .select("dataset_name", "url_npm") ) - self.write_csv_gzip(npm_urls_min, self._fp_url) + self.write_csv_gzip(npm_urls_min, self.paths["url"]) + + package = self.npm.datapackage() + # TODO: Re-enable after deciding on how best to utilize + # self.write_parquet(package["features"], self.paths["dpkg_features"]) + self.write_json_gzip(package["schemas"], self.paths["dpkg_schemas"]) if include_typing: - self.generate_typing(self._fp_typing) + self.generate_typing() return gh_trees def reset(self) -> None: """Remove all metadata files.""" - for 
fp in self._paths.values(): + for fp in self.paths.values(): fp.unlink(missing_ok=True) def read(self, name: _PathAlias, /) -> pl.DataFrame: """Read existing metadata from file.""" - return pl.read_parquet(self._from_alias(name)) + return pl.read_parquet(self.paths[name]) def scan(self, name: _PathAlias, /) -> pl.LazyFrame: """Scan existing metadata from file.""" - return pl.scan_parquet(self._from_alias(name)) - - def _from_alias(self, name: _PathAlias, /) -> Path: - if name not in {"npm_tags", "gh_tags", "gh_trees"}: - msg = f'Expected one of {["npm_tags", "gh_tags", "gh_trees"]!r}, but got: {name!r}' - raise TypeError(msg) - else: - return self._paths[name] + return pl.scan_parquet(self.paths[name]) def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: """ @@ -193,6 +201,21 @@ def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> Non df.write_csv(buf) f.write(buf.getbuffer()) + def write_json_gzip(self, obj: Any, fp: Path, /) -> None: + """ + Write ``obj`` as a `gzip`_ compressed ``json`` file. + + .. _gzip: + https://docs.python.org/3/library/gzip.html + """ + if fp.suffix != ".gz": + fp = fp.with_suffix(".json.gz") + if not fp.exists(): + fp.touch() + + with gzip.GzipFile(fp, mode="wb", mtime=0) as f: + f.write(json.dumps(obj).encode()) + def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: """Write ``frame`` to ``fp``, with some extra safety.""" if not fp.exists(): @@ -207,7 +230,7 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None with fp_schema.open("w") as f: json.dump(schema, f, indent=2) - def generate_typing(self, output: Path, /) -> None: + def generate_typing(self) -> None: from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT tags = self.scan("gh_tags").select("tag").collect().to_series() @@ -314,6 +337,20 @@ def generate_typing(self, output: Path, /) -> None: f"{textwrap.indent(textwrap.dedent(examples), indent)}" ) + FIELD = "FlFieldStr" + FIELD_TYPES = ( + "integer", + "number", + "boolean", + "string", + "object", + "array", + "date", + "datetime", + "time", + "duration", + ) + contents = ( f"{HEADER_COMMENT}", "from __future__ import annotations\n", @@ -341,8 +378,14 @@ def generate_typing(self, output: Path, /) -> None: doc=metadata_doc, comment="", ), + f"{FIELD}: TypeAlias = {utils.spell_literal(FIELD_TYPES)}\n" + '"""\n' + "String representation of `frictionless`_ `Field Types`_.\n\n" + f".. _frictionless:\n{indent}https://github.com/frictionlessdata/frictionless-py\n" + f".. _Field Types:\n{indent}https://datapackage.org/standard/table-schema/#field-types\n" + '"""\n', ) - ruff.write_lint_format(output, contents) + ruff.write_lint_format(self.paths["typing"], contents) _alt_datasets = Path(__file__).parent.parent.parent / "altair" / "datasets" diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py new file mode 100644 index 000000000..da1f8375e --- /dev/null +++ b/tools/datasets/datapackage.py @@ -0,0 +1,133 @@ +""" +``frictionless`` `datapackage`_ parsing. + +.. 
_datapackage: + https://datapackage.org/ +""" + +from __future__ import annotations + +from collections import deque +from pathlib import Path +from typing import TYPE_CHECKING, Any, Literal, get_args + +import polars as pl +from polars import col +from polars import selectors as cs + +from tools.datasets.models import ParsedPackage +from tools.schemapi import utils + +if TYPE_CHECKING: + from collections.abc import Iterable, Iterator, Mapping, Sequence + + from altair.datasets._typing import Dataset, FlFieldStr + from tools.datasets.models import FlPackage + + +__all__ = ["parse_package"] + + +DATASET_NAME: Literal["dataset_name"] = "dataset_name" + +# # NOTE: Flag columns +# Storing these instead of the full **56KB** `datapackage.json` +FEATURES: Sequence[pl.Expr] = ( + (col("format") == "png").alias("is_image"), + (col("type") == "table").alias("is_tabular"), + (col("format") == "geojson").alias("is_geo"), + (col("format") == "topojson").alias("is_topo"), + col("format").is_in(("geojson", "topojson")).alias("is_spatial"), + (col("format").str.contains("json")).alias("is_json"), +) + + +def parse_package(pkg: FlPackage, /) -> ParsedPackage: + return ParsedPackage(features=extract_features(pkg), schemas=extract_schemas(pkg)) + + +def extract_schemas(pkg: FlPackage, /) -> Mapping[Dataset, Mapping[str, FlFieldStr]]: + """Reduce all datasets with schemas to a minimal mapping.""" + m: Any = { + Path(rsrc["path"]).stem: {f["name"]: f["type"] for f in s["fields"]} + for rsrc in pkg["resources"] + if (s := rsrc.get("schema")) + } + return m + + +def extract_features(pkg: FlPackage, /) -> pl.DataFrame: + # NOTE: `is_name_collision` != `GitHub.trees`/`Metadata.name_collision` + # - This only considers latest version + # - Those others are based on whatever tag the tree refers to + # https://github.com/vega/vega-datasets/issues/633 + EXCLUDE = ( + "name", + "type", + "format", + "scheme", + "mediatype", + "encoding", + "dialect", + "schema", + ) + return ( + pl.LazyFrame(pkg["resources"]) + .with_columns( + path_stem("path").alias(DATASET_NAME), + cs.exclude("name"), + col("name").is_duplicated().alias("is_name_collision"), + ) + .select( + DATASET_NAME, + path_suffix("path").alias("suffix"), + ~cs.by_name(DATASET_NAME, EXCLUDE), + *FEATURES, + col("schema").is_not_null().alias("has_schema"), + ) + .collect() + ) + + +def path_stem(column: str | pl.Expr, /) -> pl.Expr: + """ + The final path component, minus its last suffix. + + Needed since `Resource.name`_ must be lowercase. + + .. _Resource.name: + https://specs.frictionlessdata.io/data-resource/#name + """ + path = col(column) if isinstance(column, str) else column + rfind = (path.str.len_bytes() - 1) - path.str.reverse().str.find(r"\.") + return path.str.head(rfind) + + +def path_suffix(column: str | pl.Expr, /) -> pl.Expr: + """ + The final component's last suffix. + + This includes the leading period. For example: '.txt'. 
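    A minimal sketch of both path helpers against a typical resource path
    (illustrative only, hence skipped):

        >>> import polars as pl  # doctest: +SKIP
        >>> frame = pl.DataFrame({"path": ["cars.json"]})  # doctest: +SKIP
        >>> frame.select(stem=path_stem("path"), suffix=path_suffix("path")).row(0)  # doctest: +SKIP
        ('cars', '.json')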
+ """ + path = col(column) if isinstance(column, str) else column + return path.str.tail(path.str.reverse().str.find(r"\.") + 1) + + +def features_typing(frame: pl.LazyFrame | pl.DataFrame, /) -> Iterator[str]: + guards = deque[str]() + ldf = frame.lazy() + for feat in FEATURES: + guard_name = feat.meta.output_name() + alias_name = guard_name.removeprefix("is_").capitalize() + members = ldf.filter(guard_name).select(DATASET_NAME).collect().to_series() + guards.append(guard_literal(alias_name, guard_name, members)) + yield f"{alias_name}: TypeAlias = {utils.spell_literal(members)}" + yield from guards + + +def guard_literal(alias_name: str, guard_name: str, members: Iterable[str], /) -> str: + """Type narrowing function, all members must be literal strings.""" + return ( + f"def {guard_name}(obj: Any) -> TypeIs[{alias_name}]:\n" + f" return obj in set({sorted(set(members))!r})\n" + ) diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 449c412ef..a454ed30c 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -3,7 +3,8 @@ from __future__ import annotations import sys -from typing import TYPE_CHECKING, Literal, NamedTuple +from collections.abc import Mapping, Sequence +from typing import TYPE_CHECKING, Any, Literal, NamedTuple if sys.version_info >= (3, 14): from typing import TypedDict @@ -14,9 +15,18 @@ import time if sys.version_info >= (3, 11): - from typing import LiteralString, Required + from typing import LiteralString, NotRequired, Required else: - from typing_extensions import LiteralString, Required + from typing_extensions import LiteralString, NotRequired, Required + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + import polars as pl + + from altair.datasets._typing import Dataset, FlFieldStr + +Map: TypeAlias = Mapping[str, Any] class GitHubUrl(NamedTuple): @@ -31,6 +41,7 @@ class GitHubUrl(NamedTuple): class NpmUrl(NamedTuple): CDN: LiteralString TAGS: LiteralString + GH: LiteralString class GitHubTag(TypedDict): @@ -178,3 +189,80 @@ class GitHubRateLimitResources(TypedDict, total=False): graphql: GitHubRateLimit integration_manifest: GitHubRateLimit code_search: GitHubRateLimit + + +##################################################### +# frictionless datapackage +##################################################### + + +FlCsvDialect: TypeAlias = Mapping[ + Literal["csv"], Mapping[Literal["delimiter"], Literal["\t"]] +] +FlJsonDialect: TypeAlias = Mapping[ + Literal[r"json"], Mapping[Literal["keyed"], Literal[True]] +] + + +class FlField(TypedDict): + """https://datapackage.org/standard/table-schema/#field.""" + + name: str + type: FlFieldStr + + +class FlSchema(TypedDict): + """https://datapackage.org/standard/table-schema/#properties.""" + + fields: Sequence[FlField] + + +class FlResource(TypedDict): + """https://datapackage.org/standard/data-resource/#properties.""" + + name: Dataset + type: Literal["table", "file", r"json"] + path: str + format: Literal[ + "arrow", "csv", "geojson", r"json", "parquet", "png", "topojson", "tsv" + ] + mediatype: Literal[ + "application/parquet", + "application/vnd.apache.arrow.file", + "image/png", + "text/csv", + "text/tsv", + r"text/json", + "text/geojson", + "text/topojson", + ] + schema: NotRequired[FlSchema] + scheme: Literal["file"] + dialect: NotRequired[FlCsvDialect | FlJsonDialect] + encoding: NotRequired[Literal["utf-8"]] + + +class FlPackage(TypedDict): + """ + A subset of the `Data Package`_ standard. + + .. 
_Data Package: + https://datapackage.org/standard/data-package/#properties + """ + + name: Literal["vega-datasets"] + version: str + homepage: str + description: str + licenses: Sequence[Map] + contributors: Sequence[Map] + sources: Sequence[Map] + created: str + resources: Sequence[FlResource] + + +class ParsedPackage(TypedDict): + """Minimal representations to write to disk.""" + + features: pl.DataFrame + schemas: Mapping[Dataset, Mapping[str, FlFieldStr]] diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index a5f068082..f71037d5c 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -2,23 +2,28 @@ import json import urllib.request -from typing import TYPE_CHECKING, ClassVar, Literal +from pathlib import Path +from typing import TYPE_CHECKING, Any, ClassVar, Literal import polars as pl -from tools.datasets import semver +from tools.datasets import datapackage, semver from tools.datasets.models import NpmUrl if TYPE_CHECKING: import sys - from pathlib import Path from urllib.request import OpenerDirector if sys.version_info >= (3, 11): from typing import LiteralString else: from typing_extensions import LiteralString - from tools.datasets.models import NpmPackageMetadataResponse + from altair.datasets._typing import Version + from tools.datasets.models import ( + FlPackage, + NpmPackageMetadataResponse, + ParsedPackage, + ) __all__ = ["Npm"] @@ -46,6 +51,7 @@ def __init__( self._url: NpmUrl = NpmUrl( CDN=f"https://cdn.{jsdelivr}.net/{npm}/{package}@", TAGS=f"https://data.{jsdelivr}.com/{jsdelivr_version}/packages/{npm}/{package}", + GH=f"https://cdn.{jsdelivr}.net/gh/vega/{package}@", ) @property @@ -78,3 +84,43 @@ def tags(self) -> pl.DataFrame: if (tag := v["version"]) and semver.CANARY not in tag ] return pl.DataFrame({"tag": versions}).pipe(semver.with_columns) + + def file_gh( + self, + branch_or_tag: Literal["main"] | Version | LiteralString, + path: str, + /, + ) -> Any: + """ + Request a file from the `jsdelivr GitHub`_ endpoint. + + Parameters + ---------- + branch_or_tag + Version of the file, see `branches`_ and `tags`_. + path + Relative filepath from the root of the repo. + + .. _jsdelivr GitHub: + https://www.jsdelivr.com/documentation#id-github + .. _branches: + https://github.com/vega/vega-datasets/branches + .. 
_tags: + https://github.com/vega/vega-datasets/tags + """ + path = path.lstrip("./") + suffix = Path(path).suffix + if suffix == ".json": + headers = {"Accept": "application/json"} + read_fn = json.load + else: + raise NotImplementedError(path, suffix) + req = urllib.request.Request( + f"{self.url.GH}{branch_or_tag}/{path}", headers=headers + ) + with self._opener.open(req) as response: + return read_fn(response) + + def datapackage(self, *, tag: LiteralString | None = None) -> ParsedPackage: + pkg: FlPackage = self.file_gh(tag or "main", "datapackage.json") + return datapackage.parse_package(pkg) From 9274284a16962c55df1faff2db20ec1e0d55313f Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Dec 2024 16:08:48 +0000 Subject: [PATCH 133/201] refactor(ruff): Apply `TC006` fixes in new code Related https://github.com/vega/altair/pull/3706 --- tests/test_datasets.py | 2 +- tools/datasets/datapackage.py | 2 +- tools/datasets/github.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 9d91c275e..f9dd4c5a3 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -493,7 +493,7 @@ def test_reader_cache( if nw_dep.is_polars_dataframe(lookup_groups): left, right = ( lookup_groups, - cast(pl.DataFrame, data("lookup_groups", tag="v2.5.3")), + cast("pl.DataFrame", data("lookup_groups", tag="v2.5.3")), ) else: left, right = ( diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index da1f8375e..deb63fbb9 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -9,7 +9,7 @@ from collections import deque from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal, get_args +from typing import TYPE_CHECKING, Any, Literal import polars as pl from polars import col diff --git a/tools/datasets/github.py b/tools/datasets/github.py index b9b156c60..406eca3dc 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -487,4 +487,4 @@ def _iter_rows(df: pl.DataFrame, stop: int | None, /, tp: type[_TD]) -> Iterator if not TYPE_CHECKING: assert is_typeddict(tp) or issubclass(tp, Mapping) - return cast(Iterator[_TD], islice(df.iter_rows(named=True), stop)) + return cast("Iterator[_TD]", islice(df.iter_rows(named=True), stop)) From 8e232b8d38d39c2832e64f5b959482585c4cc4e3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Dec 2024 17:01:45 +0000 Subject: [PATCH 134/201] docs(DRAFT): Add notes on `datapackage.features_typing` --- tools/datasets/datapackage.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index deb63fbb9..445974795 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -114,6 +114,15 @@ def path_suffix(column: str | pl.Expr, /) -> pl.Expr: def features_typing(frame: pl.LazyFrame | pl.DataFrame, /) -> Iterator[str]: + """ + Current plan is to use type aliases in overloads. 
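    Each feature flag is expected to expand into an alias + guard pair roughly like
    the following (member names are illustrative; the shape mirrors ``guard_literal``):

        Image: TypeAlias = Literal["7zip", "ffox", "gimp"]

        def is_image(obj: Any) -> TypeIs[Image]:
            return obj in set(['7zip', 'ffox', 'gimp'])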
+ + - ``Tabular`` can be treated interchangeably + - ``Image`` can only work with ``url`` + - ``(Spatial|Geo|Topo)`` can be read with ``polars`` + - A future version may implement dedicated support https://github.com/vega/altair/pull/3631#discussion_r1845931955 + - ``Json`` should warn when using the ``pyarrow`` backend + """ guards = deque[str]() ldf = frame.lazy() for feat in FEATURES: From 93308958fbf40873fc4023d6b20e1e81bc97d5ab Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Dec 2024 18:16:04 +0000 Subject: [PATCH 135/201] docs: Update `Loader.from_backend` example w/ dtypes Related https://github.com/vega/altair/pull/3631/commits/909e7d05e57718b2f634a7e6781cb4e58a835837 --- altair/datasets/_loader.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index ce2559aed..f9190f789 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -117,15 +117,15 @@ def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: pandas.core.frame.DataFrame >>> cars.dtypes # doctest: +SKIP - Name string[pyarrow] - Miles_per_Gallon double[pyarrow] - Cylinders int64[pyarrow] - Displacement double[pyarrow] - Horsepower int64[pyarrow] - Weight_in_lbs int64[pyarrow] - Acceleration double[pyarrow] - Year string[pyarrow] - Origin string[pyarrow] + Name string[pyarrow] + Miles_per_Gallon double[pyarrow] + Cylinders int64[pyarrow] + Displacement double[pyarrow] + Horsepower int64[pyarrow] + Weight_in_lbs int64[pyarrow] + Acceleration double[pyarrow] + Year timestamp[ns][pyarrow] + Origin string[pyarrow] dtype: object """ obj = Loader.__new__(Loader) From caf534da20f9b96187283d67a458f17c0b0346bb Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Dec 2024 18:27:33 +0000 Subject: [PATCH 136/201] feat: Use `_pl_read_json_roundtrip` instead of `pl.read_json` for `pyarrow` Provides better dtype inference --- altair/datasets/_readers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 5b9829b9e..e2607acbc 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -380,10 +380,9 @@ def __init__(self, name: _PyArrow, /) -> None: # ------------------------------------------------------- # NOTE: Prefer `polars` since it is zero-copy and fast (1) if find_spec("polars") is not None: - import polars as pl def pa_read_json(source: StrPath, /, **kwds) -> pa.Table: - return pl.read_json(source).to_arrow() + return _pl_read_json_roundtrip(source).to_arrow() else: # NOTE: Convert inline from stdlib json (2) From 75bf2bad9d5d8f59c6084f1f58686085409f604c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Dec 2024 18:57:03 +0000 Subject: [PATCH 137/201] docs: Replace example dataset Switching to one with a timestamp that `frictionless` recognises https://github.com/vega/vega-datasets/blob/8745f5c61ba951fe057a42562b8b88604b4a3735/datapackage.json#L2674-L2689 https://github.com/vega/vega-datasets/blob/8745f5c61ba951fe057a42562b8b88604b4a3735/datapackage.json#L45-L57 --- altair/datasets/_loader.py | 88 +++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index f9190f789..2b8a2cd95 100644 --- a/altair/datasets/_loader.py +++ 
b/altair/datasets/_loader.py @@ -171,72 +171,72 @@ def __call__( from altair.datasets import Loader data = Loader.from_backend("polars") - source = data("stocks", tag="v2.10.0") + source = data("iowa-electricity", tag="v2.10.0") >>> source.columns # doctest: +SKIP - ['symbol', 'date', 'price'] + ['year', 'source', 'net_generation'] >>> source # doctest: +SKIP - shape: (560, 3) - ┌────────┬────────────┬────────┐ - │ symbol ┆ date ┆ price │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ f64 │ - ╞════════╪════════════╪════════╡ - │ MSFT ┆ Jan 1 2000 ┆ 39.81 │ - │ MSFT ┆ Feb 1 2000 ┆ 36.35 │ - │ MSFT ┆ Mar 1 2000 ┆ 43.22 │ - │ MSFT ┆ Apr 1 2000 ┆ 28.37 │ - │ MSFT ┆ May 1 2000 ┆ 25.45 │ - │ … ┆ … ┆ … │ - │ AAPL ┆ Nov 1 2009 ┆ 199.91 │ - │ AAPL ┆ Dec 1 2009 ┆ 210.73 │ - │ AAPL ┆ Jan 1 2010 ┆ 192.06 │ - │ AAPL ┆ Feb 1 2010 ┆ 204.62 │ - │ AAPL ┆ Mar 1 2010 ┆ 223.02 │ - └────────┴────────────┴────────┘ + shape: (51, 3) + ┌────────────┬──────────────┬────────────────┐ + │ year ┆ source ┆ net_generation │ + │ --- ┆ --- ┆ --- │ + │ date ┆ str ┆ i64 │ + ╞════════════╪══════════════╪════════════════╡ + │ 2001-01-01 ┆ Fossil Fuels ┆ 35361 │ + │ 2002-01-01 ┆ Fossil Fuels ┆ 35991 │ + │ 2003-01-01 ┆ Fossil Fuels ┆ 36234 │ + │ 2004-01-01 ┆ Fossil Fuels ┆ 36205 │ + │ 2005-01-01 ┆ Fossil Fuels ┆ 36883 │ + │ … ┆ … ┆ … │ + │ 2013-01-01 ┆ Renewables ┆ 16476 │ + │ 2014-01-01 ┆ Renewables ┆ 17452 │ + │ 2015-01-01 ┆ Renewables ┆ 19091 │ + │ 2016-01-01 ┆ Renewables ┆ 21241 │ + │ 2017-01-01 ┆ Renewables ┆ 21933 │ + └────────────┴──────────────┴────────────────┘ Using ``pandas``: data = Loader.from_backend("pandas") - source = data("stocks", tag="v2.10.0") + source = data("iowa-electricity", tag="v2.10.0") >>> source.columns # doctest: +SKIP - Index(['symbol', 'date', 'price'], dtype='object') + Index(['year', 'source', 'net_generation'], dtype='object') >>> source # doctest: +SKIP - symbol date price - 0 MSFT Jan 1 2000 39.81 - 1 MSFT Feb 1 2000 36.35 - 2 MSFT Mar 1 2000 43.22 - 3 MSFT Apr 1 2000 28.37 - 4 MSFT May 1 2000 25.45 - .. ... ... ... - 555 AAPL Nov 1 2009 199.91 - 556 AAPL Dec 1 2009 210.73 - 557 AAPL Jan 1 2010 192.06 - 558 AAPL Feb 1 2010 204.62 - 559 AAPL Mar 1 2010 223.02 - - [560 rows x 3 columns] + year source net_generation + 0 2001-01-01 Fossil Fuels 35361 + 1 2002-01-01 Fossil Fuels 35991 + 2 2003-01-01 Fossil Fuels 36234 + 3 2004-01-01 Fossil Fuels 36205 + 4 2005-01-01 Fossil Fuels 36883 + .. ... ... ... 
+ 46 2013-01-01 Renewables 16476 + 47 2014-01-01 Renewables 17452 + 48 2015-01-01 Renewables 19091 + 49 2016-01-01 Renewables 21241 + 50 2017-01-01 Renewables 21933 + + [51 rows x 3 columns] Using ``pyarrow``: data = Loader.from_backend("pyarrow") - source = data("stocks", tag="v2.10.0") + source = data("iowa-electricity", tag="v2.10.0") >>> source.column_names # doctest: +SKIP - ['symbol', 'date', 'price'] + ['year', 'source', 'net_generation'] >>> source # doctest: +SKIP pyarrow.Table - symbol: string - date: string - price: double + year: date32[day] + source: string + net_generation: int64 ---- - symbol: [["MSFT","MSFT","MSFT","MSFT","MSFT",...,"AAPL","AAPL","AAPL","AAPL","AAPL"]] - date: [["Jan 1 2000","Feb 1 2000","Mar 1 2000","Apr 1 2000","May 1 2000",...,"Nov 1 2009","Dec 1 2009","Jan 1 2010","Feb 1 2010","Mar 1 2010"]] - price: [[39.81,36.35,43.22,28.37,25.45,...,199.91,210.73,192.06,204.62,223.02]] + year: [[2001-01-01,2002-01-01,2003-01-01,2004-01-01,2005-01-01,...,2013-01-01,2014-01-01,2015-01-01,2016-01-01,2017-01-01]] + source: [["Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels",...,"Renewables","Renewables","Renewables","Renewables","Renewables"]] + net_generation: [[35361,35991,36234,36205,36883,...,16476,17452,19091,21241,21933]] """ return self._reader.dataset(name, suffix, tag=tag, **kwds) From d4930e7e91f2518c98edc917e7c8ceec6787e517 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 20 Dec 2024 22:10:52 +0000 Subject: [PATCH 138/201] fix(ruff): resolve `RUF043` warnings https://github.com/vega/altair/actions/runs/12439154550/job/34732432411?pr=3631 --- tests/test_datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index f9dd4c5a3..33779efa8 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -169,7 +169,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: assert load._reader._name == "pandas" monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pandas", None) - with pytest.raises(AltairDatasetsError, match="no.+backend"): + with pytest.raises(AltairDatasetsError, match=r"no.+backend"): from altair.datasets import load else: assert load._reader._name == "pandas[pyarrow]" @@ -191,7 +191,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pyarrow", None) - with pytest.raises(AltairDatasetsError, match="no.+backend"): + with pytest.raises(AltairDatasetsError, match=r"no.+backend"): from altair.datasets import load From 5a31333c9ff425f623134d03bab0aacd8c62c74e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 20 Dec 2024 22:12:51 +0000 Subject: [PATCH 139/201] build: run `generate-schema-wrapper` https://github.com/vega/altair/actions/runs/12439184312/job/34732516789?pr=3631 --- .../_metadata/datapackage_schemas.json.gz | Bin 2490 -> 2473 bytes altair/datasets/_metadata/metadata.parquet | Bin 18777 -> 19296 bytes tools/datasets/_metadata/tags.parquet | Bin 6290 -> 6289 bytes tools/datasets/_metadata/tags_npm.parquet | Bin 2599 -> 2598 bytes 4 files changed, 0 insertions(+), 0 deletions(-) diff --git a/altair/datasets/_metadata/datapackage_schemas.json.gz b/altair/datasets/_metadata/datapackage_schemas.json.gz index 537dcd28ba9377319523683299cb1773ddf40e79..34433838d4623a280939f19d83e067930800a66f 100644 GIT binary 
patch delta 2463 zcmV;Q31IfR6R8uBB!608bKAHP|0*0$AFeaijOAQnsc*KNOFeTwyS97jJ#iosBoTuE z2LLTA8GrXKKvE)i!HfIQ<|)ns*ne2;{&uNvCrp?|8)r}co_sqI%rW@m#5yBXdGfd9 zM6sGrDVcElDaAVG@Fv|VXV5;$oN#R(wyL(?aucXpm6-erbbqF7rs01i@v>v|2V$^gM)G|)z875PQ?btsIATlooEgN`iivH3u`s#+?-TF5cG0;o`HlDr zRx-m1m)x)?p4{l1%jt%XMZq(dT-4rQi7t|y)vQEZevzWATyoWWIrsmLnZqAaD4yIl z+%Tuj^z^L}rGHS#1HGiXidpW37D2#t@w1S*6a`Nn#8*DI)bNa4%cH1yX9;rdMq8KM zx{6~LhlP9IKq$sm9<^gK`3qVC**s6KFCG#w?n&to!G{gun%cbNt_vUCnv5qKa6zff zKH2Cl+{XO6E!nVz27bT8w%cJ-oW*EEgv&+ARklwus((ShLXTXwhO;(yq+f|h$-)ba zis_f=p+{h5ThMx5$6K{Xw1Mj_76=a(+rE zOF|(!pA((WiO%Ol=X0d<<3Gs`V#7JH;hfm8A~vju4J%^9irBCsHmryZE8_PR(RoF5 zUJ;$wM1SWs(RodDUK5?yMCUcpc};X)6P?#Y=QYy#?DX`MP?m&pMkwzH=IGm|Zb58Ku_p#sZGmN~!=2-)0mB{Csp*nczv1`p2g#_4*W;g#QbjQYv2I)&;g zlxI8<@rkR}B3m#UnEd5j9R&WRa*@M*)+yvl~@)tlZ$XP0#IRQ?ZJ40b^P}l$j zVnN#+*}a1UIrg}YV4_@DD8?LP8X&_H;C%>vRE~i0$n&`3%vA&)nQFG$Cf!&X@O-~@Y*fmQ|X$gjdyXPGp5**gY{JXQ$+}r4;l?T_1HXB}2k012t<^eRjMU}~< zFG_eTNJf;|0ud`IK^@_$b{ljVGQoE)x0D+0cHwy4%$d-yjpdBIP5V9KD1QK_+y1^z%|DE{)@ z8b(~}E#xLcZj)?bgzgb(A&hk?b)RHWazDNqdx!UdH}*ZtNV0({o#JQ9^4xH1v4Uc{Y zYJarhx{-5%OI=*?VS+V|m0BxivTDu@NaO|w16&oN_^k0lIX@4`=219_82{@b`DC=k zVCl0qGT#Bzimt+;tKYAZ2S!oWozZ!l`PYzK9;Pm%^?P_`ew24bD%j24#YJ)tca2+< z`&RjWB?;5IS}596z}9U`rhkbKkq3LjzG#y`vt!W0#z0JqXP!Kgu)Gq`%@OYR+BvS0 zM_n_gxgBXf((U@@>TmnyD)HLGB!LG5AWcEvUjuxA zGHh7i(7&LA7^BIy)|T{fmbuo%00YeoBF7b3;!u6!xb15+@U%w-kCJhEY&1XukwHX4 zr`)|W(5gGSf%_0XTz}GQ=%1V@U?0VQ$6a)caIG>gO1X4X0UCs>;KmOqT#GLM4$|K~ zjm*Nkg^Qv9=n@eg2J-1W&$wuuotpY+n6iL+Y+4vyrx&fO0N8%zImoGN0neLcw~ifT zoNa4tm}1QGbb6lSI79(R`|Or(m_?84Bls=4uo>S(r~JXm(|?`6Y8Krw9+UfyJ`Ef3 z5a)1lel_)9J@yfkj||Y3Nn5{lHoe_}CoWN_cP_nF9`xre{JBAK7FQ|qjp(@8m4wWZ zPc+1Do$_}>kk8+In+$b0$QHm9N6RPVD z36DJ9h#6bEKr#+C41uGxDwwJ98#dyZ){hbzJDWBE_EP{1UVMi8^6QY{=z7mH{ocMh ze6mO(!+)nzmnGc*=mhSm>pbvTrrLsK@W|=09AMT~@-$e1u@*F#f3oZh%X#X2!S{o) zE=fXtkWP?Vu&Xq*O3*H);nehk01PuBV=IWcw|>VAhm>bYN707ijlcjyC?6WJ%v%V> zR3@P4fp4JxMAU}cC?IXZS-%UUZ@plk4EKc(aDNDbqpsa3Y;LdQ!YAZzr?x(84$~+6 z!C{U?=}W8{KCw)%&IfRsRZNv6N5KmSHEWTvAhmn&5@+mb+L-zA(Db9m>m*fso-suH zgR904?#0bPg<)~b1>ghzbmC<_hpzbGyf=<+!hDy0LNXhMFVXs+%^;Y#WPPdJHZ;8|G1=wt|ac+Bk5@?B%*ifWE zQgO6F{`(Cn*_LMLHutc-H+zN}&i5BzPK4gF8J9eBMr6WuC;v=dPCCvE{5Z*(<0t<} zPONTC#y!NsF;#k);w~+@;)Xe)mB$I?c*)JlD_|Slv~q_z5P#eqffKCc2x9p&f;3z= zl4IH%W+WdN6

XuiARkfAhqQ;6aWNvyuC)aAS!);apf|9XB@6{-~V5biPnJSDd-Z zVjh{YnFf{ur8NhZ?3B`f5ScF{DT0u#aiUk=CCh**!KMW6xB`Po84iu)UpvDar|T}m znBRB|`pK|5U4J_!lqWp7;uBY`MLLpfW`uUViV9(@OBJh;Z%7seQ1I-g=(*I+KSDuruvK2?Z!TLkrj*ZQe9;>*c{a1zn_ z_d5Axw6#dS&)Ud*D{>yi!bewsTqSEJIY!+Xowu2fpMT`C%}eeO{vJY>S)N=mhd8;p zySPa1K?ZEi`&M}qCl8|LNi7uiW8sf(TBngc){)@9VPCaLz&{K^tPI#nJn`g#sQXG7 z$TFDvy>^bPLcHk9fPjxw;%-4z>!2KeXc{09(cDM(VES$_1l|91%5x>P9v` zWUZ8gZ+|Ebxgf1t1F4az!T8n@l@rpF>^Da2sZPb1Trm+NGhi(iF2?@9)ve00-)L8H zoS`7Cd0)Y#0qDSwXImd#+{`Mlv+QO{o0>^Wu5`dikj)2OYf#WQ(6%oH1hEw=M_Mi< z#Hh0~CQPGEZyjw!*}LR8>@8S)uf_u+1IZm0XMaca?DTQM`gyW6glCb&gONmQ`O63NVyT~#~z=zi7K<*1_oo=xwMM5tc-VkS*XoQR0;e#!$ z*>env%R7oc57}QjW+}v!*;Tj{nwOobzS%_;9NH8)^hqO+5Cu>9a|6CJMX2=17og>1 z-}J0?h+#0NDPo|g<-IdIuexnw_rr_!X4h7~3>|I)wTYM7bjeER znD%fa+Z~}M98TDFvpWTa(@96QPw3H zd`@^iCp@1Mp3f1_55JQWM22%B!#R;*L1b7E85Tr_1(9JvWLOXx7DVq0!hiFE@Vp>A zFA2{}!t;{wyd*p?3C~Ny^OEqqBs?z(&r8HJH07rRGAEET0(nOuKNHA%0{K87=LE7K z5W@4E@H{6x&k4_S!t;!aOP;ehd2`WC9+Q9wHHE8mK# zT==SyUPD22qJVRBU_6|n1AoP}%6vIeE?reHQNV-_v&9hX(%#!;~%jmZ31ZPA^(lK^6Ox=WtJ53+Fi54E5w-p4qf8xcF}8 zC{3r2Q#_o%cWcH?Y%y9G^LqVB&}=`~#SB=Q!1u8^O_Sz6Og&;i-6(=yyMM)cM* zf46h;1MC%VqR3yQXut#T*W-p-=YL&t!>mnyDlu-|zg~?5 z_)|2#NA3lHKWU_6e!F%4CnvzV%nd|$Sf0bMD02S&8CcLaiIQ17rSFExS;d)j6%5>& zuKOW3>Wq8*bkz0VPDMSsK{rsXNQm0zedx|&17rhPm>-!s=RbZ|yb&h%e;)_Gxac!M zLK_pnWQG+E9DnYSCpX|bXdCmfC}7S`F6uC_=_1KlJuDIqhkgm7CV17fCk{WPP&~N} z>wCP(>bI9kq|)70{`mT%4 z5ygbkp8O5Rs^2*$*B9#qgnLxlk%Crh#Y|Qm+vS5J7=K6JcJPwjhsHCeJPHv?mw0#d zbqOz5oEH)Y>#n?wd=K(9dKBBQx^{k{v?`dX@r{r^Xxmn54ShiqUMlIEo#o~UBG2QY zv7q>YX8OHzb$FOgA z^VIo1+keMy|6mAhnRJ3A&26P2lQJ33g zeQPX-7Lb}$CZN{@-9TZ8unpEIxV7-DKLxjM{KWv-*$aQb!3hq$_9^|?S;>WeLF`Xz z8+wPKwBe`BV$XiXhwHQLy2 zSr-9^(>Buv2TM3l5aVL>{)gA1uX@ubIQW6`+oYL^w+OM7I24f#H;MUS5_&;TBcwSD zO@-88qM7gakL{=jT&$$Ot0!6RhWYXthFWm^*u u+$lo1x?{TDA;%*YbQ!o6v`%M_!)ruWZ?ue$f8XfKUjGZA$V*s_CIA2g4#rLZ diff --git a/altair/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet index 3eaa28ca39d5ab0230c23b9bb1799d78ffd64eb4..633815d1ff27144689ab23aacff171a20598293e 100644 GIT binary patch delta 1870 zcmaJ=eN0BvKJMx5QLuO#T-r-US5m3#;2TPQ?@5Aj)C#6v0UHXx%sAm|HB`DpS=9X&kIMttAsbs&EB-`^xXZGuVQaB zcbpqvK6|2N4>2fDBZ_A~(49Ka-f^C+e&d_l&5`-%?o59lERdZ~O``~9Awen?8Li>| z=Dtu*Pj_#B>oF>w9x0|LNP0LnL%vUfQ6Z4=IIj1?qw96b2$?)RYtf%E{mNr>gLzqX zD_57=J^at=7gyHaT3_D2*1xu@xH99*I(g*y`q`6*s}7H=QsjgoK2nwrUs z<3T;O1mbwKnYx)B#}i@d+RiwB@H(KT$@t~tF?2~N&Bd}|3b0*&Nk%E?m;VNoo=|_1 zqe}Gif1{SEFDP(*@g1Ol5#uLOd_JKbDpLv1X5?)f-B&40?oiq2!Bbd$0kLL6y;Q!b zo|x8F!_rIT&xiu!>!SP};U2RO%nCM@XE%LGWUe7*3J~i1uL6}&nV*xGZWjHe$PfJs zs4CQ$FlEdG^TL9W&R#@B4n2bi9f~`R;wB4~;tMFV{}Q@e%1`T;C}+#y69`r6Ug=)x{spkHTgDAm>m zpsudO9j*=dLROQz{8)eu)fr0%xQNJyty0_|vHO#lllX8FpC~7F;*$Z=1a-*(UXqMQ z#U+ZsI2d*)A|}JrL5bqi;cL-UHl||< zS#cv9d<5C?^bl`HJR8}-&XCop4S8JAc1=l%&nhbcc(KfOh`^aeAO+HwT1$a?F3_#9U z!znCO7ZowsRKsaKot~an1V!hF=qGhHBIjJjfrKh>Fy3)=+Mv;nbK%_R)NnnmolcA% znKMoNf7Ni^ZoBX=-%Bc)Cm(iQ>HXQH+1N`;vRk zx#v6I`R=*zr#Il0*Wu&_rgyS{orjy)Asx%mPh$dA5ONTZRrP^>Wq%OVt^;1X3eD7- z)VF1$CnhT0?LWGH>#F!{&X5PpI1bG zNZh`o2lC<|1CFXn>nn52=S#DSsmRqU_;N|Ju1>UpbSu2r(BRz{d)^-|b55;1P+I@) zvws*SjI$$kXJ^fi@yY&y*EDG=kWq8Wj&MBE)Y2J?MN`qv4z>^t=}=z*%0Yf8irt5U>{TC&m5NvX1+VAHDK2<-bl!9DrXmOZ@j4kmFNtnVD1bhcGP5M}DFpo8qfo=j)ir=osX2Knnu)9qec6P@5I>is zoQkhb`Q`M-7SG!rt(YHx6ff6xb$4-$E}TqurU5`d6J#;^ znjou5LlN3F(u=8;ef>1COcGlGKP3wG(tqF2g3d_EbA2*)Oa@SY?h*1Y@CvlJ1lQ9i ze6Jhc2c*IoGTpFz1{Q(yXj_=tcO}sjn@E|69aB~U6vb}G&3-_q)^WLksP qbn|AHAruhb6a=+2y^zykXbkR&k-Sujac!+=+*PSCD*&j+UgJOY5TF16 diff --git a/tools/datasets/_metadata/tags.parquet b/tools/datasets/_metadata/tags.parquet 
index f8ed6f54e46e03902d48eed24ad595faeeae94fc..189dbbcae0b49d624a63d54b76e6ca9ce9425e3c 100644 GIT binary patch delta 255 zcmbPaIMHy!c|IQrQ7s+O2vG)6Hc>ViGX@PtMhQj+Akf&V|5t%0k%55;$cbQN2n7j> zGDxa2ut`erWfsR56s5)&rRJn27N?5xh((C`i6v~7<#%LaOqd)mc!sURMN~rU(qu!S z8ny)>X4mA+Lf_eTfS4yHZxFU;yWk=!CiZUfS7AH0XCTovlPyK;*#3c-uO`=uxcEwAQoH;}+ZL>7LBNJoVkHMeEdeq6CT|q_&b9}{JTrNn zusz!q7f~^>Pm@0j+p)a@iEfx|E@H>lViGX@PtMhQj+Akf&V|5t&pkb!{-$f;pu2z|f~ z5*B5URApe3l;F!OjxQ)mjW0^gNlh$H73C4D5epJane5Nw^XW0RRYpV5?EY>9}dnAoSuW^8tByFk(#CRYKOmq5&SllQT?_`L!#*9eQ+F{nKf wkil=4+7Eskgp Date: Sat, 21 Dec 2024 16:37:08 +0000 Subject: [PATCH 140/201] chore: update schemas Changes from https://github.com/vega/vega-datasets/pull/648 Currently pinned on `main` until `v3.0.0` introduces `datapackage.json` https://github.com/vega/vega-datasets/tree/main --- .../_metadata/datapackage_schemas.json.gz | Bin 2473 -> 2483 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/altair/datasets/_metadata/datapackage_schemas.json.gz b/altair/datasets/_metadata/datapackage_schemas.json.gz index 34433838d4623a280939f19d83e067930800a66f..2655d6baca5f11208703bd123308b16b171b0f53 100644 GIT binary patch delta 2472 zcmV;Z30L;16SEVLC4W(K+_(+?D;iH9t~1>kuXBla`)03msb|h-*LE+xCk)%))}G7lfNY= ziq(8d$%NZaDb_KE7wJ|xgVssrglqG#RJHY%n?Tj7#N=0?Gk;|>4gU*m5p z4RhK|ci$RO3V)S6&`Y|jnB`t+5fn_BpM}h&D0uQ9zVbO%!!vR%kD}(?Nsx0l+PdV{ zRUErGFz$H+p%~*lYR6>q7sLYHJWsAK9uhd*lhPrA4-3LIwRy>17e2Z*8BaFgf>N7( zve8|*jpcPqvSA4g{C9Xv#O8Bi^Et8k9NGN%PqKoX;hdb|oSb1r&afh9SdlZV$Qf4T3@dVm74iFu*t{Y( zuZYcSVt@0R*t{k-uZhiTV)L5Vye2lUiOp+b^BUQFc6xeBC`&>)Bb0Z9@-v~lCzKC_ za!x2KLLoLUiOox5^OD%SBsMRJ%}ZkQlGwZ?HZO_IOJeicDK4lXm2}0l`}+@Chzo6a z#%zrB%w&tZS?w}Mc+z82S3Pn8q#3aBXTnzVMYtD^D^ebT-$ueGZ<))_nLmSH* zy(1O(Gxr5Q6)KfmOX~xd9!Lm{PC9sg03aQ*z~|Fvi%hEwj5)~oc7n0X=JsqVo8@0x z>wlo4PdTp#EGc@hl>#UnPWq?7t>6#Ks^vJWh;>=Y9HswY< zcai)Jz;DjK>*(5tEuxjoqFXE^GklTrhJQm0t8BPV8!!{v?U?DI#n`YQp=(oev;kq6 zZnY0_yI7t+O>W|N6aeFFPu=dAA_i$3>0!Ht1ao*M5!H~dd);r>2!_nGpp`t`HQVy@(5;>KwdFnW;r)0Kc`+o>I ztcGsgkbLFW6UJ*u5KSGb*`nf1x@sX5B=+ukM~4Ro{#bsucbbzM-L&%Hn$cpzOX~51 z9o?LOX1Ay@ne@d9F9pqrGFv!er6!moT-9!aEkh>w&gGU;!`&_%uba6e^t`d2k+*3d zNF3!i;FR0oBRSuu<1%1$+aQk%y?LDtRWnnam;x2{8bOcSte)yud zT}6RE(0Gc!e7A-X*ZLE3lOeZBwlG5XNVO2gx|F(4vM9M9>x{j_`@kFfo@FH2K$TAM zvt@a1xV6|rkUqBBR9fd#j@vib%E;I)KBIf>R_9u6JgCzmw=;p0y&!nBCx2q}Q`@8= zEIJ{F2$?$Gl84%FZMbgaT;Ng{SA3Xkjbo+OikYmMI|d|jgM$IC3Q>I4c%hP?2W0ao zoJ4~E^^klr+G4QuSsR(}0BS|Ku<7detK@-Elyzrx-e&$aB$tP2%!q#v&&-ebu1E#D zxx2VX?%}R+%W~f;-^V0jihoxNMSBfcx^2mn@geeHZ`c=Y5@>=9Vr&e=w0P#pBMHkZ z0bL%Eey^S5DtXj3bDG}y!oMRXU#_!s3f_0TPH6+ioJlt4|Cd4Rs^_JTq zUy*cZdKVwJGlc&7Z1vJ=y^$Pu~JbgCUH7<=zk=jtZaqq;xszM zy|$7mltV5#drE$Vt*%Obs_VZ_H3#k$bj~s6FH$t{s`2jeBva@6aVt|2W29e{6EODqlaoi3z8f4m|fk(|a zJvJI3fyf{tp@Z(;8Gl&SZQa0q2p=x#HS|wT6tIrszvC)8Ubt47SEXFKsQ?YaRdC}6 z7p_H@e+TJrpGIcn9pj=X0J=n!he3UM&oeF>XQxMfG)!4QJvJ?juG5RwRRCYYokl?VNK3x94{a(2fv;KGupaE~c1clW&Ta~B^c$HQAvEf0(0 zdR}pN5k=b+gnwB6LJ&@mVrFEg@HmCyKD4XWfOnZ1<|XthC^VnzfDTeosTr%7SY)A2 z1BzMfJO=f>8dQ0?Uvy>D&UWUFho-GT@58qt9=sFAPN*DZLZmoaGeJux8sQ?7O0O!d z*-MOy=eJ0^ryP`y)3|NkpPnxIuLS?Q3eNH`5$dys+J8j?CYUrcqiEntF^acNXHR_` z?dz>ZK;dmd+Wh$7zoBy1H4u1%0>4Wga@B%m9^yi`pSSin)U}b%`>{zQ%K>I>B~QaCFxG+w^H12$ zu$-sP7kob$>yjkY2k8W<1-nW^s|4**8g@-D2*5BCGR8r|{pokia7cNUbQEnE-UtjZ zgz}*g%e;k9Ol1O!9{2|8PsD7vjRMjp?DeZa`+wFe2Fh?>cmszZINI8c!sgaWE__1n zR%+|B<}iK2A0p;Nl)l8O;nU0X>U;pFS;bUIaumFPP_q^p3tGDeFLB16rj3~&6-_^C zyiQWJml;FEKe%e_;9lGuR2Z0JDF7evrxUO1IdsJb=e==s6Xv`0(~{XRe2LcooCfLr mDj^#G6Oc|R{3nh4Amr`8>H4pJIySlI*Z%-?tB3`rCIA559?s?f delta 2462 
zcmV;P31RlL6R8uBC4XOY+qe<`DjZKAt~1q)N^Z?>FEJ##+0wtMM4aUc>T5rY5+ z04*ySfA=mxQX+T3i~G>#Db51ee^~7PcByYCOqfO+XHWi~d^-`$G5F)eIwMqh^0(wf zv6@dQnQ;3l#X9EjCfzD$&_2nWaBUv8s4uL*!84a!)ZSl-E|Q$ptVCRXk)o_za@Bh|_y3NW!yi&8p4>Lv zFsIG*^sN!4P=Cn-y`;N}S?+}vLBMqJvyizI1y3HtS3bAY@Qhr`qo{dj33Bd6TbJCr ziendtg?rvWD8^PEwPP~*3t9r%JWsAK9uhF_N$C*5hYjJH+Pvhh3m@H@j3*m#L8;9? z+2}6Z#{9Z1*|3EMe!s)E+hJ3j#b`r>%SFjmwofvuL4UtOk6gBfvo?05Ux`S`!V8Rw z>6hrCM_^`K(0X3STeV2Ef$J?62ltLREY}~-2<082{7fkC3FQN!oD<55P}Z1oeo81y zLLoYz6P?eA&gVqubENa*KgkYa!#T0xoY=4;HmryZD`LZn*svlttcVRO;`bHNc|~+y z5uMjW=YKWPc};X)6P?#Y=QYuJO>|xpo!3O?HPZR)^z@WamV|OfDDMd6XF_>TC?5#r zoKRMTLUdjdotH%CCDD0FbY2pjmqh0!(RoR9UJ{*`MCY?pTu?(Q>56Ih_aC$n7uxWQ z+1S!ElP!i1+txUt0?SL5IlvDH+1zm@G9hKyG=BmH563W~xmEU-b`pK|5h3YDl zXFL({iL2HkTQg=lLn&Ig5t7>!ig@~oNq|MT816yWoEwYiSHd=uWxVFfO;!7cHkLPf zM=I`T?hAe@R4TWY)(0*4# zLE9YJy@LZe_PCBjeF%M2j)3vV^SI*7RRkX8Mm%>B`wc*D&cEx> z+J`5imCT~sDkL*}k@JQ_tg39dHXE=JI)Ck$=^?~evB05gM{;xkVU}*U4`sVpo<2=H zel!I@H``P9I;Mz18b@?EZXuBzo`{llKx_^#73k z{)buJH{U`0+c^x0hddmu-4-fGH1p)}+AZNz>6)jGvwBKqJGPII!*1xx4Yoe)Cw~jC zAwe{Cs9KAPGwG^@P>{g8=N%mq9M;GDyR*~W+vuj12iJ@?8(vb6AN1(v0W`ZsmC2+p zN_ZSsnrt$&B8JeGyg$ceiY8qg608T;Xj-gXrQ{y<|W{_@=# zMqKMHpJY*TKfW1zhxdUu_C3o;vVkg{;%Ce9+;D5Lhai1y zwW+kurz>vXU@ar#wfHRVwcDL*wejFhi`>oxPWFOe(4Mf-Pi>Qiu;`c^B7bD+cuO8? zf3)Gck#m7dU0m^Df;EnnS}SI9aO6-vQK$uEL?K->;GfMp4$C(RrKs*N|KurY@uPdw6Dkly^lc*v;L=MRE^!ja!rZ zR{4G<3Dde-DB4rN)@@6siGL4~2YbW5Xp=y*W6;9JKun8go;;GUyb{pO5$^ZeIj)jN zT{EY-9cez&?fT~GaAfd>O1O+nvZ1AKro zY*^pWzo3H{qsg|`mh^F!xz@x01I-K~#}!%PP<`UK?Q1mfv_}Pxl5u)$G(ZB8K}15Q z+`TiWm=F1=PB^ye-7xj}IjS1Iz1=(yOGgv^mo zG{kS6@^?d!&)=2W)C!PTpmOO=fOli5h=Y^lI_&7NlZ%MU0Y>LZy#oa{|ZBr0p z`3pffJ&Kv(oqxjP6pH)Mu37`$Wonq0(5s-(e69mJa7CqNEMlUOg*pu=X0h`a)c0yo z<>h|Sl}$U_nKvGqwg$Zq--dYbP8N1TrfnHfa`SAU98ymdNz>f>l%Z#4o6Z)4Kt z#|QrnmAkHizy}ogUFwjl7A*4+7rOntwa4MDjeN$BO&TFDRB!0qC>SGAiWPeks_PC3 zk38Op8C$zRG7dHjfupo4n5pp_HsYDqj}jU?n>GRVQveKJe1`k->yY8-de1Zc-o83~ zvPdDrr+-qHCEWn%1n#NpJn&hj+Ja^9$my{hVAfXhG+2SL7BrZDvg{1YdFp(@_k*!6 zNkV;)PLNu#t2DGq&@QFn)bxS?3^O5PD~P$be#Z=llxImt(T3rTzyL!i9~!aDTL{He zCZOnnZ=n7})P~zAAZ@}~zYC;qyBvpH!F+}`> ztHuuQ#mzy5VR6g_-~;}2;$=ODuK3`*H;!(?e3yPgG8=|3(fXgwAh};f Date: Sun, 22 Dec 2024 12:40:22 +0000 Subject: [PATCH 141/201] feat(typing): Update `frictionless` model hierarchy - Adds some incomplete types for fields (`sources`, `licenses`) - Misc changes from https://github.com/vega/vega-datasets/pull/651, https://github.com/vega/vega-datasets/pull/643 --- tools/datasets/models.py | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/tools/datasets/models.py b/tools/datasets/models.py index a454ed30c..f8414f739 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -209,6 +209,7 @@ class FlField(TypedDict): name: str type: FlFieldStr + description: NotRequired[str] class FlSchema(TypedDict): @@ -217,12 +218,29 @@ class FlSchema(TypedDict): fields: Sequence[FlField] +class FlSource(TypedDict, total=False): + title: str + path: Required[str] + email: str + version: str + + +class FlLicense(TypedDict): + name: str + path: str + title: NotRequired[str] + + class FlResource(TypedDict): """https://datapackage.org/standard/data-resource/#properties.""" name: Dataset type: Literal["table", "file", r"json"] + description: NotRequired[str] + licenses: NotRequired[Sequence[FlLicense]] + sources: NotRequired[Sequence[FlSource]] path: str + scheme: Literal["file"] format: Literal[ "arrow", "csv", "geojson", r"json", "parquet", "png", "topojson", "tsv" ] @@ -236,10 +254,20 @@ class 
FlResource(TypedDict): "text/geojson", "text/topojson", ] - schema: NotRequired[FlSchema] - scheme: Literal["file"] - dialect: NotRequired[FlCsvDialect | FlJsonDialect] encoding: NotRequired[Literal["utf-8"]] + bytes: int + dialect: NotRequired[FlCsvDialect | FlJsonDialect] + schema: NotRequired[FlSchema] + + +class Contributor(TypedDict, total=False): + title: str + givenName: str + familyName: str + path: str + email: str + roles: Sequence[str] + organization: str class FlPackage(TypedDict): @@ -254,9 +282,9 @@ class FlPackage(TypedDict): version: str homepage: str description: str - licenses: Sequence[Map] - contributors: Sequence[Map] - sources: Sequence[Map] + licenses: Sequence[FlLicense] + contributors: Sequence[Contributor] + sources: Sequence[FlSource] created: str resources: Sequence[FlResource] From fdffed0a15be3967c6b9513787fd40feb59c9cdc Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 12 Jan 2025 12:08:31 +0000 Subject: [PATCH 142/201] chore: Freeze all metadata Mainly for `datapackage.json`, which is now temporarily stored un-transformed Using version (https://github.com/vega/vega-datasets/commit/7c2e67f6e7ba69b00e7cb1473503518942385d11) --- tools/datasets/__init__.py | 50 +++++++++++++++-------- tools/datasets/_metadata/datapackage.json | 1 + tools/datasets/npm.py | 15 +++++-- tools/generate_schema_wrapper.py | 2 +- 4 files changed, 45 insertions(+), 23 deletions(-) create mode 100644 tools/datasets/_metadata/datapackage.json diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 395119dd7..c30c43867 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -130,7 +130,9 @@ def github(self) -> GitHub: def npm(self) -> Npm: return self._npm - def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: + def refresh( + self, *, include_typing: bool = False, frozen: bool = False + ) -> pl.DataFrame: """ Update and sync all dataset metadata files. @@ -138,26 +140,38 @@ def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: ---------- include_typing Regenerate ``altair.datasets._typing``. - """ - print("Syncing datasets ...") - npm_tags = self.npm.tags() - self.write_parquet(npm_tags, self.paths["npm_tags"]) - - gh_tags = self.github.refresh_tags(npm_tags) - self.write_parquet(gh_tags, self.paths["gh_tags"]) + frozen + Don't perform any requests or attempt to check for new versions. - gh_trees = self.github.refresh_trees(gh_tags) - self.write_parquet(gh_trees, self.paths["gh_trees"]) + .. note:: + **Temporary** measure to work from ``main`` until `vega-datasets@3`_. - npm_urls_min = ( - gh_trees.lazy() - .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") - .filter(col("size") == col("size").min().over("dataset_name")) - .select("dataset_name", "url_npm") - ) - self.write_csv_gzip(npm_urls_min, self.paths["url"]) + .. 
_vega-datasets@3: + https://github.com/vega/vega-datasets/issues/654 + """ + if not frozen: + print("Syncing datasets ...") + npm_tags = self.npm.tags() + self.write_parquet(npm_tags, self.paths["npm_tags"]) + + gh_tags = self.github.refresh_tags(npm_tags) + self.write_parquet(gh_tags, self.paths["gh_tags"]) + + gh_trees = self.github.refresh_trees(gh_tags) + self.write_parquet(gh_trees, self.paths["gh_trees"]) + + npm_urls_min = ( + gh_trees.lazy() + .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") + .filter(col("size") == col("size").min().over("dataset_name")) + .select("dataset_name", "url_npm") + ) + self.write_csv_gzip(npm_urls_min, self.paths["url"]) + else: + print("Reusing frozen metadata ...") + gh_trees = pl.read_parquet(self.paths["gh_trees"]) - package = self.npm.datapackage() + package = self.npm.datapackage(frozen=frozen) # TODO: Re-enable after deciding on how best to utilize # self.write_parquet(package["features"], self.paths["dpkg_features"]) self.write_json_gzip(package["schemas"], self.paths["dpkg_schemas"]) diff --git a/tools/datasets/_metadata/datapackage.json b/tools/datasets/_metadata/datapackage.json new file mode 100644 index 000000000..dbb2e51dc --- /dev/null +++ b/tools/datasets/_metadata/datapackage.json @@ -0,0 +1 @@ +{"name": "vega-datasets", "description": "Common repository for example datasets used by Vega related projects. \nBSD-3-Clause license applies only to package code and infrastructure. Users should verify their use of datasets \ncomplies with the license terms of the original sources. Dataset license information, where included, \nis a reference starting point only and is provided without any warranty of accuracy or completeness.\n", "homepage": "http://github.com/vega/vega-datasets.git", "licenses": [{"name": "BSD-3-Clause", "path": "https://opensource.org/license/bsd-3-clause", "title": "The 3-Clause BSD License"}], "contributors": [{"title": "UW Interactive Data Lab", "path": "http://idl.cs.washington.edu"}, {"title": "vega-datasets contributors", "path": "https://github.com/vega/vega-datasets/graphs/contributors"}], "version": "2.11.0", "created": "2024-12-31T18:32:26.970186+00:00", "resources": [{"name": "7zip.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "7zip.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "bytes": 3969}, {"name": "airports.csv", "type": "table", "path": "airports.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 210365, "schema": {"fields": [{"name": "iata", "type": "string"}, {"name": "name", "type": "string"}, {"name": "city", "type": "string"}, {"name": "state", "type": "string"}, {"name": "country", "type": "string"}, {"name": "latitude", "type": "number"}, {"name": "longitude", "type": "number"}]}}, {"name": "annual-precip.json", "type": "json", "description": "A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell.", "sources": [{"title": "Climate Forecast System Version 2", "path": "https://www.ncdc.noaa.gov/data-access/model-data/model-datasets/climate-forecast-system-version2-cfsv2"}], "path": "annual-precip.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 266265}, {"name": "anscombe.json", "type": "table", "description": "Graphs in Statistical Analysis, F. J. 
Anscombe, The American Statistician.", "path": "anscombe.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1703, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Series", "type": "string"}, {"name": "X", "type": "integer"}, {"name": "Y", "type": "number"}]}}, {"name": "barley.json", "type": "table", "description": "The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites.\n\nIt was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper \"Statistical Determination of Barley Varietal Adaption\".\n\nR.A. Fisher's popularized its use in the field of statistics when he included it in his book \"The Design of Experiments\".\n\nSince then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s.\n", "sources": [{"title": "The Design of Experiments Reference", "path": "https://en.wikipedia.org/wiki/The_Design_of_Experiments"}, {"title": "Trellis Charts Paper", "path": "http://ml.stat.purdue.edu/stat695t/writings/TrellisDesignControl.pdf"}], "path": "barley.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 8487, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "yield", "type": "number"}, {"name": "variety", "type": "string"}, {"name": "year", "type": "integer"}, {"name": "site", "type": "string"}]}}, {"name": "birdstrikes.csv", "type": "table", "description": "Records of reported wildlife strikes received by the U.S. FAA", "sources": [{"title": "FAA Wildlife Strike Database", "path": "http://wildlife.faa.gov"}], "path": "birdstrikes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 1223329, "schema": {"fields": [{"name": "Airport Name", "type": "string"}, {"name": "Aircraft Make Model", "type": "string"}, {"name": "Effect Amount of damage", "type": "string"}, {"name": "Flight Date", "type": "date"}, {"name": "Aircraft Airline Operator", "type": "string"}, {"name": "Origin State", "type": "string"}, {"name": "Phase of flight", "type": "string"}, {"name": "Wildlife Size", "type": "string"}, {"name": "Wildlife Species", "type": "string"}, {"name": "Time of day", "type": "string"}, {"name": "Cost Other", "type": "integer"}, {"name": "Cost Repair", "type": "integer"}, {"name": "Cost Total $", "type": "integer"}, {"name": "Speed IAS in knots", "type": "integer"}]}}, {"name": "budget.json", "type": "table", "description": "Historical and forecasted federal revenue/receipts produced in 2016 by the U.S. 
Office of Management and Budget.", "sources": [{"title": "Office of Management and Budget - Budget FY 2016 - Receipts", "path": "https://www.govinfo.gov/app/details/BUDGET-2016-DB/BUDGET-2016-DB-3"}], "path": "budget.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 391353, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Source Category Code", "type": "integer"}, {"name": "Source category name", "type": "string"}, {"name": "Source subcategory", "type": "integer"}, {"name": "Source subcategory name", "type": "string"}, {"name": "Agency code", "type": "integer"}, {"name": "Agency name", "type": "string"}, {"name": "Bureau code", "type": "integer"}, {"name": "Bureau name", "type": "string"}, {"name": "Account code", "type": "integer"}, {"name": "Account name", "type": "string"}, {"name": "Treasury Agency code", "type": "integer"}, {"name": "On- or off-budget", "type": "string"}, {"name": "1962", "type": "string"}, {"name": "1963", "type": "string"}, {"name": "1964", "type": "string"}, {"name": "1965", "type": "string"}, {"name": "1966", "type": "string"}, {"name": "1967", "type": "string"}, {"name": "1968", "type": "string"}, {"name": "1969", "type": "string"}, {"name": "1970", "type": "string"}, {"name": "1971", "type": "string"}, {"name": "1972", "type": "string"}, {"name": "1973", "type": "string"}, {"name": "1974", "type": "string"}, {"name": "1975", "type": "string"}, {"name": "1976", "type": "string"}, {"name": "TQ", "type": "string"}, {"name": "1977", "type": "string"}, {"name": "1978", "type": "string"}, {"name": "1979", "type": "string"}, {"name": "1980", "type": "string"}, {"name": "1981", "type": "string"}, {"name": "1982", "type": "string"}, {"name": "1983", "type": "string"}, {"name": "1984", "type": "string"}, {"name": "1985", "type": "string"}, {"name": "1986", "type": "string"}, {"name": "1987", "type": "string"}, {"name": "1988", "type": "string"}, {"name": "1989", "type": "string"}, {"name": "1990", "type": "string"}, {"name": "1991", "type": "string"}, {"name": "1992", "type": "string"}, {"name": "1993", "type": "string"}, {"name": "1994", "type": "string"}, {"name": "1995", "type": "string"}, {"name": "1996", "type": "string"}, {"name": "1997", "type": "string"}, {"name": "1998", "type": "string"}, {"name": "1999", "type": "string"}, {"name": "2000", "type": "string"}, {"name": "2001", "type": "string"}, {"name": "2002", "type": "string"}, {"name": "2003", "type": "string"}, {"name": "2004", "type": "string"}, {"name": "2005", "type": "string"}, {"name": "2006", "type": "string"}, {"name": "2007", "type": "string"}, {"name": "2008", "type": "string"}, {"name": "2009", "type": "string"}, {"name": "2010", "type": "string"}, {"name": "2011", "type": "string"}, {"name": "2012", "type": "string"}, {"name": "2013", "type": "string"}, {"name": "2014", "type": "string"}, {"name": "2015", "type": "string"}, {"name": "2016", "type": "string"}, {"name": "2017", "type": "string"}, {"name": "2018", "type": "string"}, {"name": "2019", "type": "string"}, {"name": "2020", "type": "string"}]}}, {"name": "budgets.json", "type": "table", "path": "budgets.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 18079, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "budgetYear", "type": "integer"}, {"name": "forecastYear", "type": "integer"}, {"name": "value", "type": "number"}]}}, {"name": "burtin.json", "type": "table", "description": "The burtin.json dataset 
is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine.\n\nThe dataset compares the performance of three antibiotics against 16 different bacteria.\n\nNumerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness.\n\nThe dataset was featured as an example in the Protovis project, a precursor to D3.js.\n\nAs noted in the Protovis example, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 \u03bcg/ml, and an exaggeration of some values for penicillin\".\n\nThe vega-datasets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together.\n\nThe caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) \nreads as follows:\n\n> ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin\n>\n>\n> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in red and gram- in blue) with their sensitivities to penicillin, and streptomycin.\n>\n> The effectiveness of the antibiotics is expressed as the highest dilution in \u03bc/ml. which inhibits the test organism.\n>\n> High dilutions are toward the periphery; consequently the length of the colored bar is proportional to the effectiveness.\n>\n> It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. vulgaris, S. schottmuelleri and M. tuberculosis.\n>\n> Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to neomycin, although the majority of these are sensitive to neomycin.\n>\n> It also inhibits actinomycetes, but is inactive against viruses and fungi. 
Its mode of action is not understood.\n", "sources": [{"title": "Scope Magazine", "path": "https://graphicdesignarchives.org/projects/scope-magazine-vol-iii-5/"}, {"title": "Protovis Antibiotics Example", "path": "https://mbostock.github.io/protovis/ex/antibiotics-burtin.html"}], "path": "burtin.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2743, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Bacteria", "type": "string"}, {"name": "Penicillin", "type": "number"}, {"name": "Streptomycin", "type": "number"}, {"name": "Neomycin", "type": "number"}, {"name": "Gram_Staining", "type": "string"}, {"name": "Genus", "type": "string"}]}}, {"name": "cars.json", "type": "table", "description": "Collection of car specifications and performance metrics from various automobile manufacturers.", "sources": [{"title": "StatLib Datasets Archive", "path": "http://lib.stat.cmu.edu/datasets/"}], "path": "cars.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 100492, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Name", "type": "string"}, {"name": "Miles_per_Gallon", "type": "integer"}, {"name": "Cylinders", "type": "integer"}, {"name": "Displacement", "type": "number"}, {"name": "Horsepower", "type": "integer"}, {"name": "Weight_in_lbs", "type": "integer"}, {"name": "Acceleration", "type": "number"}, {"name": "Year", "type": "date"}, {"name": "Origin", "type": "string"}]}}, {"name": "co2-concentration.csv", "type": "table", "description": "Scripps CO2 program data, but modified to only include date, CO2, seasonally adjusted CO2. \nOnly includes rows with valid data.", "sources": [{"title": "Scripps CO2 Program", "path": "https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record"}], "path": "co2-concentration.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 18547, "schema": {"fields": [{"name": "Date", "type": "date"}, {"name": "CO2", "type": "number"}, {"name": "adjusted CO2", "type": "number"}]}}, {"name": "countries.json", "type": "table", "description": "This dataset combines key demographic indicators (life expectancy at birth and\nfertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year\nintervals. It includes both current values and adjacent time period values (previous and next)\nfor each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) \nnotes that its philosophy is to fill data gaps with estimates and use current\ngeographic boundaries for historical data. 
Gapminder states that it aims to \"show people the\nbig picture\" rather than support detailed numeric analysis.", "licenses": [{"title": "Creative Commons Attribution 4.0 International", "path": "https://www.gapminder.org/free-material/"}], "sources": [{"title": "Gapminder Foundation - Life Expectancy", "path": "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Fertility", "path": "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676", "version": "14"}], "path": "countries.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 99457, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "_comment", "type": "string"}, {"name": "year", "type": "integer", "description": "Years from 1955 to 2000 at 5-year intervals"}, {"name": "fertility", "type": "number", "description": "Fertility rate (average number of children per woman) for the given year"}, {"name": "life_expect", "type": "number", "description": "Life expectancy in years for the given year"}, {"name": "n_fertility", "type": "number", "description": "Fertility rate for the next 5-year interval"}, {"name": "n_life_expect", "type": "number", "description": "Life expectancy for the next 5-year interval"}, {"name": "country", "type": "string", "description": "Name of the country"}]}}, {"name": "crimea.json", "type": "table", "description": "This dataset, which informed Florence Nightingale's groundbreaking work in public health, details \nmonthly mortality rates from British military hospitals during the Crimean War (1854-1856). \n\nNightingale credits Dr. William Farr for compiling the data from the 1858 [Medical and Surgical \nHistory of the British Army](http://resource.nlm.nih.gov/62510370R). The dataset categorizes \ndeaths into \"zymotic\" diseases (preventable infectious diseases), wounds/injuries, and other causes. \nCovering the period from April 1854 to March 1856, the dataset includes monthly army strength \nalongside mortality figures. Nightingale transformed this data into her now-famous [polar area \ndiagrams](https://iiif.lib.harvard.edu/manifests/view/drs:7420433$25i). \n\nThe annual mortality rates plotted in the chart can be calculated from the dataset using the formula \n> (Deaths × 1000 × 12) ÷ Army Size. \n\nAs [The Lancet](https://pmc.ncbi.nlm.nih.gov/articles/PMC7252134/) argued in 2020, Nightingale's \ninnovative visualizations proved that \"far more men died of disease, infection, and exposure \nthan in battle\u2014a fact that shocked the British nation.\" Her work also vividly illustrated \nthe dramatic impact of sanitary reforms, particularly in reducing preventable deaths.", "sources": [{"title": "Nightingale, Florence. A contribution to the sanitary history of the British army during the late war with Russia. London : John W. Parker and Son, 1859. Table II. 
Table showing the Estimated Average Monthly Strength of the Army; and the Deaths and Annual Rate of Mortality per 1,000 in each month, from April 1854, to March 1856 (inclusive), in the Hospitals of the Army in the East.\n", "path": "https://nrs.lib.harvard.edu/urn-3:hms.count:1177146?n=21"}], "path": "crimea.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2183, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date", "description": "First day of each month during the observation period, in ISO 8601 format (YYYY-MM-DD)"}, {"name": "wounds", "type": "integer", "description": "Deaths from \"Wounds and Injuries\" which comprised: Luxatio (dislocation), Sub-Luxatio (partial dislocation), Vulnus Sclopitorum (gunshot wounds), Vulnus Incisum (incised wounds), Contusio (bruising), Fractura (fractures), Ambustio (burns) and Concussio-Cerebri (brain concussion)\n"}, {"name": "other", "type": "integer", "description": "Deaths from All Other Causes"}, {"name": "disease", "type": "integer", "description": "Deaths from Zymotic Diseases (preventable infectious diseases)"}, {"name": "army_size", "type": "integer", "description": "Estimated Average Monthly Strength of the Army"}]}}, {"name": "disasters.csv", "type": "table", "description": "Annual number of deaths from disasters.", "sources": [{"title": "Our World in Data - Natural Catastrophes", "path": "https://ourworldindata.org/natural-catastrophes"}], "path": "disasters.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 18840, "schema": {"fields": [{"name": "Entity", "type": "string"}, {"name": "Year", "type": "integer"}, {"name": "Deaths", "type": "integer"}]}}, {"name": "driving.json", "type": "table", "sources": [{"title": "New York Times", "path": "https://archive.nytimes.com/www.nytimes.com/imagepages/2010/05/02/business/02metrics.html"}], "path": "driving.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 3461, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "side", "type": "string"}, {"name": "year", "type": "integer"}, {"name": "miles", "type": "integer"}, {"name": "gas", "type": "number"}]}}, {"name": "earthquakes.json", "type": "json", "description": "Earthquake data retrieved Feb 6, 2018", "sources": [{"title": "USGS Earthquake Feed", "path": "https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson"}], "path": "earthquakes.json", "scheme": "file", "format": "geojson", "mediatype": "text/geojson", "encoding": "utf-8", "bytes": 1219853}, {"name": "ffox.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "ffox.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "bytes": 17628}, {"name": "flare-dependencies.json", "type": "table", "path": "flare-dependencies.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 34600, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "source", "type": "integer"}, {"name": "target", "type": "integer"}]}}, {"name": "flare.json", "type": "table", "path": "flare.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 20638, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "id", "type": "integer"}, {"name": "name", "type": "string"}]}}, {"name": "flights-10k.json", "type": "table", 
"description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-10k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 892400, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-200k.arrow", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-200k.arrow", "scheme": "file", "format": ".arrow", "mediatype": "application/vnd.apache.arrow.file", "bytes": 1600864, "schema": {"fields": [{"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "time", "type": "number"}]}}, {"name": "flights-200k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-200k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 9863892, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "time", "type": "number"}]}}, {"name": "flights-20k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-20k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1784867, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-2k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-2k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 178495, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-3m.parquet", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. 
Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-3m.parquet", "scheme": "file", "format": "parquet", "mediatype": "application/parquet", "bytes": 13493022, "schema": {"fields": [{"name": "date", "type": "datetime"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-5k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-5k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 446167, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-airport.csv", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-airport.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 65572, "schema": {"fields": [{"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}, {"name": "count", "type": "integer"}]}}, {"name": "football.json", "type": "table", "description": "Football match outcomes across multiple divisions from 2013 to 2017, part of a\nlarger dataset from OpenFootball. The subset was made such that there are records for all five\nchosen divisions over the time period.", "sources": [{"title": "OpenFootball", "path": "https://github.com/openfootball/football.json"}], "path": "football.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1207180, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "division", "type": "string"}, {"name": "home_team", "type": "string"}, {"name": "away_team", "type": "string"}, {"name": "home_score", "type": "integer"}, {"name": "away_score", "type": "integer"}]}}, {"name": "gapminder-health-income.csv", "type": "table", "description": "Per-capita income, life expectancy, population and regional grouping. Dataset does not specify \nthe reference year for the data. Gapminder historical data is subject to revisions.\n\nGapminder (v30, 2023) defines per-capita income as follows:\n>\"This is real GDP per capita (gross domestic product per person adjusted for inflation) \n>converted to international dollars using purchasing power parity rates. An international dollar \n>has the same purchasing power over GDP as the U.S. 
dollar has in the United States.\"\n", "licenses": [{"title": "Creative Commons Attribution 4.0 International", "path": "https://www.gapminder.org/free-material/"}], "sources": [{"title": "Gapminder Foundation", "path": "https://www.gapminder.org"}, {"title": "Gapminder GDP Per Capita Data", "path": "https://docs.google.com/spreadsheets/d/1i5AEui3WZNZqh7MQ4AKkJuCz4rRxGR_pw_9gtbcBOqQ/edit?gid=501532268#gid=501532268"}], "path": "gapminder-health-income.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 8605, "schema": {"fields": [{"name": "country", "type": "string"}, {"name": "income", "type": "integer"}, {"name": "health", "type": "number"}, {"name": "population", "type": "integer"}, {"name": "region", "type": "string"}]}}, {"name": "gapminder.json", "type": "table", "description": "This dataset combines key demographic indicators (life expectancy at birth, \npopulation, and fertility rate measured as babies per woman) for various countries from 1955 \nto 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable \ngrouping countries. Gapminder's data documentation notes that its philosophy is to fill data \ngaps with estimates and use current geographic boundaries for historical data. Gapminder \nstates that it aims to \"show people the big picture\" rather than support detailed numeric \nanalysis.\n\nNotes:\n1. Country Selection: The set of countries in this file matches the version of this dataset \n originally added to this collection in 2015. The specific criteria for country selection \n in that version are not known. Data for Aruba are no longer available in the new version. \n Hong Kong has been revised to Hong Kong, China in the new version.\n\n2. Data Precision: The precision of float values may have changed from the original version. \n These changes reflect the most recent source data used for each indicator.\n\n3. Regional Groupings: The 'cluster' column represents a regional mapping of countries \n corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. To \n preserve continuity with previous versions of this dataset, we have retained the column \n name 'cluster' instead of renaming it to 'six_regions'. 
The six regions represented are: \n `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`.", "sources": [{"title": "Gapminder Foundation - Life Expectancy (Data)", "path": "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Life Expectancy (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd004/"}, {"title": "Gapminder Foundation - Population (Data)", "path": "https://docs.google.com/spreadsheets/d/1c1luQNdpH90tNbMIeU7jD__59wQ0bdIGRFpbMm8ZBTk/edit?gid=176703676#gid=176703676", "version": "7"}, {"title": "Gapminder Foundation - Population (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd003/"}, {"title": "Gapminder Foundation - Fertility (Data)", "path": "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Fertility (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd008/"}, {"title": "Gapminder Foundation - Data Geographies (Data)", "path": "https://docs.google.com/spreadsheets/d/1qHalit8sXC0R8oVXibc2wa2gY7bkwGzOybEMTWp-08o/edit?gid=1597424158#gid=1597424158", "version": "2"}, {"title": "Gapminder Foundation - Data Geographies (Documentation)", "path": "https://www.gapminder.org/data/geo/"}, {"title": "Gapminder Data Documentation", "path": "https://www.gapminder.org/data/documentation/"}], "path": "gapminder.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 75201, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer", "description": "Years from 1955 to 2005 at 5-year intervals"}, {"name": "country", "type": "string", "description": "Name of the country"}, {"name": "cluster", "type": "integer", "description": "A categorical variable (values 0-5) grouping countries by region"}, {"name": "pop", "type": "integer", "description": "Population of the country"}, {"name": "life_expect", "type": "number", "description": "Life expectancy in years"}, {"name": "fertility", "type": "number", "description": "Fertility rate (average number of children per woman)"}]}}, {"name": "gimp.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "gimp.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "bytes": 8211}, {"name": "github.csv", "type": "table", "description": "Generated using `/scripts/github.py`.", "path": "github.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 21059, "schema": {"fields": [{"name": "time", "type": "string"}, {"name": "count", "type": "integer"}]}}, {"name": "global-temp.csv", "type": "table", "description": "Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023.", "sources": [{"title": "NASA Goddard Institute for Space Studies", "path": "https://data.giss.nasa.gov/gistemp/"}], "path": "global-temp.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 1663, "schema": {"fields": [{"name": "year", "type": "integer"}, {"name": "temp", "type": "number"}]}}, {"name": "income.json", "type": "table", "path": "income.json", "scheme": "file", "format": "json", "mediatype": "text/json", 
"encoding": "utf-8", "bytes": 72771, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "region", "type": "string"}, {"name": "id", "type": "integer"}, {"name": "pct", "type": "number"}, {"name": "total", "type": "integer"}, {"name": "group", "type": "string"}]}}, {"name": "iowa-electricity.csv", "type": "table", "description": "The state of Iowa has dramatically increased its production of renewable \nwind power in recent years. This file contains the annual net generation of electricity in \nthe state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. \nIt is useful for illustrating stacked area charts.", "sources": [{"title": "U.S. Energy Information Administration", "path": "https://www.eia.gov/beta/electricity/data/browser/#/topic/0?agg=2,0,1&fuel=vvg&geo=00000g&sec=g&linechart=ELEC.GEN.OTH-IA-99.A~ELEC.GEN.COW-IA-99.A~ELEC.GEN.PEL-IA-99.A~ELEC.GEN.PC-IA-99.A~ELEC.GEN.NG-IA-99.A~~ELEC.GEN.NUC-IA-99.A~ELEC.GEN.HYC-IA-99.A~ELEC.GEN.AOR-IA-99.A~ELEC.GEN.HPS-IA-99.A~&columnchart=ELEC.GEN.ALL-IA-99.A&map=ELEC.GEN.ALL-IA-99.A&freq=A&start=2001&end=2017&ctype=linechart<ype=pin&tab=overview&maptype=0&rse=0&pin="}], "path": "iowa-electricity.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 1531, "schema": {"fields": [{"name": "year", "type": "date"}, {"name": "source", "type": "string"}, {"name": "net_generation", "type": "integer"}]}}, {"name": "jobs.json", "type": "table", "description": "U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790.\n\nOriginally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Vi\u00e9gas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). \nThe dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/).\n\nData is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of IPUMS USA, according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions).\n\nIPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, stating:\n>We're excited to hear that this dataset made its way to this repository and is being used by students for data visualization. We allow for these types of redistributions of summary data so long as the underlying microdata records are not shared.\n\nThis dataset contains only summary statistics and does not include any underlying microdata records.\n\n1. This dataset represents summary data. The underlying microdata records are not included.\n2. Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) \n(person weight) variable as an expansion factor when working with IPUMS USA extracts.\n3. 
Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly.\n\nWhen using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml).\nThe organization requests use of the following citation for this json file:\n\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0\n", "sources": [{"title": "IPUMS USA", "path": "https://usa.ipums.org/usa/", "version": "6.0"}], "path": "jobs.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 936649, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "job", "type": "string", "description": "The occupation title"}, {"name": "sex", "type": "string", "description": "Sex (men/women)"}, {"name": "year", "type": "integer", "description": "Census year"}, {"name": "count", "type": "integer", "description": "Number of individuals in the occupation"}, {"name": "perc", "type": "number", "description": "Percentage of the workforce in the occupation"}]}}, {"name": "la-riots.csv", "type": "table", "description": "More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles \nfor five days starting on April 29, 1992. This file contains metadata about each person, including the geographic \ncoordinates of their death. Compiled and published by the Los Angeles Times Data Desk.", "sources": [{"title": "LA Riots Deaths, Los Angeles Times Data Desk", "path": "http://spreadsheets.latimes.com/la-riots-deaths/"}], "path": "la-riots.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 7432, "schema": {"fields": [{"name": "first_name", "type": "string"}, {"name": "last_name", "type": "string"}, {"name": "age", "type": "integer"}, {"name": "gender", "type": "string"}, {"name": "race", "type": "string"}, {"name": "death_date", "type": "date"}, {"name": "address", "type": "string"}, {"name": "neighborhood", "type": "string"}, {"name": "type", "type": "string"}, {"name": "longitude", "type": "number"}, {"name": "latitude", "type": "number"}]}}, {"name": "londonboroughs.json", "type": "json", "description": "Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. 
\nOriginal data \"contains National Statistics data \u00a9 Crown copyright and database right (2015)\" \nand \"Contains Ordnance Survey data \u00a9 Crown copyright and database right [2015]\".", "sources": [{"title": "Statistical GIS Boundary Files, London Datastore", "path": "https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london"}], "path": "londonBoroughs.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "bytes": 14732}, {"name": "londoncentroids.json", "type": "table", "description": "Calculated from `londonBoroughs.json` using [`d3.geoCentroid`](https://d3js.org/d3-geo/math#geoCentroid).", "path": "londonCentroids.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2339, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "cx", "type": "number"}, {"name": "cy", "type": "number"}]}}, {"name": "londontubelines.json", "type": "json", "description": "Selected rail lines simplified from source.", "sources": [{"title": "London Tube Data", "path": "https://github.com/oobrien/vis/tree/master/tube/data"}], "path": "londonTubeLines.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "bytes": 80097}, {"name": "lookup_groups.csv", "type": "table", "path": "lookup_groups.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 77, "schema": {"fields": [{"name": "group", "type": "integer"}, {"name": "person", "type": "string"}]}}, {"name": "lookup_people.csv", "type": "table", "path": "lookup_people.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 125, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "integer"}, {"name": "height", "type": "integer"}]}}, {"name": "miserables.json", "type": "json", "path": "miserables.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 12372}, {"name": "monarchs.json", "type": "table", "description": "A chronological list of English and British monarchs from Elizabeth I through George IV.\nEach entry includes the monarch's name, the years their reign began and ended, and a sequential index giving the chronological order of rulers.\n\nThe dataset contains two intentional inaccuracies to maintain compatibility with \nthe [Wheat and Wages](https://vega.github.io/vega/examples/wheat-and-wages/) example visualization:\n1. the start date for the reign of Elizabeth I is shown as 1565, instead of 1558;\n2. the end date for the reign of George IV is shown as 1820, instead of 1830.\nThese discrepancies align the `monarchs.json` dataset with the start and end dates of the `wheat.json` dataset used in the visualization.\nThe entry \"W&M\" represents the joint reign of William III and Mary II. While the dataset shows their reign as 1689-1702, \nthe official Web site of the British royal family indicates that Mary II's reign ended in 1694, though William III continued to rule until 1702.\nThe `commonwealth` field is used to flag the period from 1649 to 1660, which includes the Commonwealth of England, the Protectorate, \nand the period leading to the Restoration. While historically more accurate to call this the \"interregnum,\" the field name of `commonwealth` \nfrom the original dataset is retained for backwards compatibility.\nThe dataset was revised in Aug. 2024. 
James II's reign now ends in 1688 (previously 1689).\nSource data has been verified against the kings & queens and interregnum pages of the official website of the British royal family (retrieved in Aug. 2024).\nContent on the site is protected by Crown Copyright. \nUnder the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most \nCrown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/).", "sources": [{"title": "The Royal Family - Kings & Queens", "path": "https://www.royal.uk/kings-and-queens-1066"}, {"title": "The Royal Family - Interregnum", "path": "https://www.royal.uk/interregnum-1649-1660"}], "path": "monarchs.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 683, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string", "description": "The ruler's name or identifier (e.g., \"W&M\" for William and Mary, \"Cromwell\" for the period of interregnum)"}, {"name": "start", "type": "integer", "description": "The year their rule began"}, {"name": "end", "type": "integer", "description": "The year their rule ended"}, {"name": "index", "type": "integer", "description": "A zero-based sequential number assigned to each entry, representing the chronological order of rulers"}]}}, {"name": "movies.json", "type": "table", "description": "The dataset has well known and intentionally included errors. \nThis dataset is provided for instructional purposes, including the need to reckon with dirty data.", "path": "movies.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1399981, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Title", "type": "string"}, {"name": "US Gross", "type": "integer"}, {"name": "Worldwide Gross", "type": "integer"}, {"name": "US DVD Sales", "type": "integer"}, {"name": "Production Budget", "type": "integer"}, {"name": "Release Date", "type": "string"}, {"name": "MPAA Rating", "type": "string"}, {"name": "Running Time min", "type": "integer"}, {"name": "Distributor", "type": "string"}, {"name": "Source", "type": "string"}, {"name": "Major Genre", "type": "string"}, {"name": "Creative Type", "type": "string"}, {"name": "Director", "type": "string"}, {"name": "Rotten Tomatoes Rating", "type": "integer"}, {"name": "IMDB Rating", "type": "number"}, {"name": "IMDB Votes", "type": "integer"}]}}, {"name": "normal-2d.json", "type": "table", "path": "normal-2d.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 34398, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "u", "type": "number"}, {"name": "v", "type": "number"}]}}, {"name": "obesity.json", "type": "table", "path": "obesity.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2202, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "id", "type": "integer"}, {"name": "rate", "type": "number"}, {"name": "state", "type": "string"}]}}, {"name": "ohlc.json", "type": "table", "description": "This dataset contains the performance of the Chicago Board Options Exchange \n[Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/%5EVIX#overview))\nin the summer of 2009.", "sources": [{"title": 
"Yahoo Finance VIX Data", "path": "https://finance.yahoo.com/chart/%5EVIX"}], "path": "ohlc.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 5737, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "open", "type": "number"}, {"name": "high", "type": "number"}, {"name": "low", "type": "number"}, {"name": "close", "type": "number"}, {"name": "signal", "type": "string"}, {"name": "ret", "type": "number"}]}}, {"name": "penguins.json", "type": "table", "description": "Palmer Archipelago (Antarctica) penguin data collected and made available by \n[Dr. Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) \nand the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research \nNetwork](https://lternet.edu/).", "sources": [{"title": "Palmer Station Antarctica LTER", "path": "https://pal.lternet.edu/"}, {"title": "Allison Horst's Penguins Repository", "path": "https://github.com/allisonhorst/penguins"}], "path": "penguins.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 67119, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Species", "type": "string"}, {"name": "Island", "type": "string"}, {"name": "Beak Length (mm)", "type": "number"}, {"name": "Beak Depth (mm)", "type": "number"}, {"name": "Flipper Length (mm)", "type": "integer"}, {"name": "Body Mass (g)", "type": "integer"}, {"name": "Sex", "type": "string"}]}}, {"name": "platformer-terrain.json", "type": "table", "description": "Assets from the video game Celeste.", "sources": [{"title": "Celeste Game", "path": "http://www.celestegame.com/"}], "path": "platformer-terrain.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1424097, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "x", "type": "integer"}, {"name": "y", "type": "integer"}, {"name": "lumosity", "type": "number"}, {"name": "saturation", "type": "integer"}, {"name": "name", "type": "string"}, {"name": "id", "type": "string"}, {"name": "color", "type": "string"}, {"name": "key", "type": "string"}]}}, {"name": "points.json", "type": "table", "path": "points.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 4926, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "x", "type": "number"}, {"name": "y", "type": "number"}]}}, {"name": "political-contributions.json", "type": "table", "description": "Summary financial information on contributions to candidates for U.S. \nelections. An updated version of this datset is available from the \"all candidates\" files \n(in pipe-delimited format) on the bulk data download page of the U.S. Federal Election \nCommission, or, alternatively, via OpenFEC. 
Information on each of the 25 columns is \navailable from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/).\nThe sample dataset in `political-contributions.json` contains 58 records with dates from 2015.\n\nFEC data is subject to the commission's:\n- [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/)\n- [Privacy and Security Policy](https://www.fec.gov/about/privacy-and-security-policy/)\n- [Acceptable Use Policy](https://github.com/fecgov/FEC/blob/master/ACCEPTABLE-USE-POLICY.md)\n\nAdditionally, the FEC's Github [repository](https://github.com/fecgov/FEC) states:\n> This project is in the public domain within the United States, and we waive worldwide \n> copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/)\n> dedication. Read more on our license page.\n> A few restrictions limit the way you can use FEC data. For example, you can't use \n> contributor lists for commercial purposes or to solicit donations. Learn more on \n> [FEC.gov](https://www.fec.gov/).", "sources": [{"title": "Federal Election Commission Bulk Data", "path": "https://www.fec.gov/data/browse-data/?tab=bulk-data"}, {"title": "OpenFEC API", "path": "https://api.open.fec.gov/developers/"}], "path": "political-contributions.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 50265, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Candidate_Identification", "type": "string"}, {"name": "Candidate_Name", "type": "string"}, {"name": "Incumbent_Challenger_Status", "type": "string"}, {"name": "Party_Code", "type": "integer"}, {"name": "Party_Affiliation", "type": "string"}, {"name": "Total_Receipts", "type": "number"}, {"name": "Transfers_from_Authorized_Committees", "type": "integer"}, {"name": "Total_Disbursements", "type": "number"}, {"name": "Transfers_to_Authorized_Committees", "type": "number"}, {"name": "Beginning_Cash", "type": "number"}, {"name": "Ending_Cash", "type": "number"}, {"name": "Contributions_from_Candidate", "type": "number"}, {"name": "Loans_from_Candidate", "type": "integer"}, {"name": "Other_Loans", "type": "integer"}, {"name": "Candidate_Loan_Repayments", "type": "number"}, {"name": "Other_Loan_Repayments", "type": "integer"}, {"name": "Debts_Owed_By", "type": "number"}, {"name": "Total_Individual_Contributions", "type": "integer"}, {"name": "Candidate_State", "type": "string"}, {"name": "Candidate_District", "type": "integer"}, {"name": "Contributions_from_Other_Political_Committees", "type": "integer"}, {"name": "Contributions_from_Party_Committees", "type": "integer"}, {"name": "Coverage_End_Date", "type": "string"}, {"name": "Refunds_to_Individuals", "type": "integer"}, {"name": "Refunds_to_Committees", "type": "integer"}]}}, {"name": "population.json", "type": "table", "description": "United States population statistics by sex and age group across decades between 1850 and 2000. \nThe dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census \nmicrodata\" from as early as 1790.\n\nIPUMS updates and revises datasets over time, which may result in discrepancies between this \ndataset and current IPUMS data. Details on data revisions are available here.\n\nWhen using this dataset, please refer to IPUMS USA terms of use. 
The organization requests the \nuse of the following citation for this json file:\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated \nPublic Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. \nhttp://doi.org/10.18128/D010.V6.0\n", "sources": [{"title": "IPUMS USA", "path": "https://usa.ipums.org/usa/"}], "path": "population.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 27665, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer", "description": "Four-digit year of the survey"}, {"name": "age", "type": "integer", "description": "Age group in 5-year intervals (0=0-4, 5=5-9, 10=10-14, ..., 90=90+)"}, {"name": "sex", "type": "integer", "description": "Sex (1=men, 2=women)"}, {"name": "people", "type": "integer", "description": "Number of individuals (IPUMS PERWT)"}]}}, {"name": "population_engineers_hurricanes.csv", "type": "table", "description": "Per-state data on population, number of engineers, and hurricanes. Used in Vega-Lite example,\n[Three Choropleths Representing Disjoint Data from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html)", "sources": [{"title": "Bureau of Labor Statistics", "path": "https://www.bls.gov/oes/tables.htm"}, {"title": "American Community Survey", "path": "https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_07_3YR_S1901&prodType=table"}, {"title": "NOAA National Climatic Data Center", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/records"}], "path": "population_engineers_hurricanes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 1852, "schema": {"fields": [{"name": "state", "type": "string"}, {"name": "id", "type": "integer"}, {"name": "population", "type": "integer"}, {"name": "engineers", "type": "number"}, {"name": "hurricanes", "type": "integer"}]}}, {"name": "seattle-weather-hourly-normals.csv", "type": "table", "description": "Hourly weather normals with metric units. The 1981-2010 Climate Normals are \nNCDC's three-decade averages of climatological variables, including temperature and \nprecipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf).\nWe only included temperature, wind, and pressure \nand updated the format to be easier to parse.", "sources": [{"title": "NOAA National Climatic Data Center (NCDC)", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/normals"}], "path": "seattle-weather-hourly-normals.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 311148, "schema": {"fields": [{"name": "date", "type": "datetime"}, {"name": "pressure", "type": "number"}, {"name": "temperature", "type": "number"}, {"name": "wind", "type": "number"}]}}, {"name": "seattle-weather.csv", "type": "table", "description": "Daily weather records with metric units. Transformed using `/scripts/weather.py`. \nThe categorical \"weather\" field is synthesized from multiple fields in the original dataset. 
\nThis data is intended for instructional purposes.", "sources": [{"title": "NOAA National Climatic Data Center", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/records"}], "path": "seattle-weather.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 48219, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "precipitation", "type": "number"}, {"name": "temp_max", "type": "number"}, {"name": "temp_min", "type": "number"}, {"name": "wind", "type": "number"}, {"name": "weather", "type": "string"}]}}, {"name": "sp500-2000.csv", "type": "table", "description": "S&P 500 index values from 2000 to 2020.", "sources": [{"title": "Yahoo Finance", "path": "https://finance.yahoo.com/quote/%5EDJI/history/"}], "path": "sp500-2000.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 415968, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "open", "type": "number"}, {"name": "high", "type": "number"}, {"name": "low", "type": "number"}, {"name": "close", "type": "number"}, {"name": "adjclose", "type": "number"}, {"name": "volume", "type": "integer"}]}}, {"name": "sp500.csv", "type": "table", "path": "sp500.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 2305, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "price", "type": "number"}]}}, {"name": "stocks.csv", "type": "table", "path": "stocks.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 12245, "schema": {"fields": [{"name": "symbol", "type": "string"}, {"name": "date", "type": "string"}, {"name": "price", "type": "number"}]}}, {"name": "udistrict.json", "type": "table", "path": "udistrict.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 6460, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "key", "type": "string"}, {"name": "lat", "type": "number"}]}}, {"name": "unemployment-across-industries.json", "type": "table", "description": "Industry-level unemployment statistics from the Current Population Survey \n(CPS), published monthly by the U.S. Bureau of Labor Statistics. Includes unemployed persons \nand unemployment rate across 11 private industries, as well as agricultural, government, and \nself-employed workers. Covers January 2000 through February 2010. Industry classification \nfollows format of CPS Table A-31.\n\nThe dataset can be replicated using the BLS API. For more, see the `scripts` folder of this \nrepository.\n\nThe BLS Web site states:\n> \"Users of the public API should cite the date that data were accessed or retrieved using \n> the API. Users must clearly state that \"BLS.gov cannot vouch for the data or analyses \n> derived from these data after the data have been retrieved from BLS.gov.\" The BLS.gov logo \n> may not be used by persons who are not BLS employees or on products (including web pages) \n> that are not BLS-sponsored.\"\n\nSee full BLS [terms of service](https://www.bls.gov/developers/termsOfService.htm).", "sources": [{"title": "U.S. 
Census Bureau Current Population Survey", "path": "https://www.census.gov/programs-surveys/cps.html"}, {"title": "BLS LAUS Data Tools", "path": "https://www.bls.gov/lau/data.htm"}, {"title": "Bureau of Labor Statistics Table A-31", "path": "https://www.bls.gov/web/empsit/cpseea31.htm"}], "path": "unemployment-across-industries.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 185641, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "series", "type": "string", "description": "Industry name"}, {"name": "year", "type": "integer", "description": "Year (2000-2010)"}, {"name": "month", "type": "integer", "description": "Month (1-12)"}, {"name": "count", "type": "integer", "description": "Number of unemployed persons (in thousands)"}, {"name": "rate", "type": "number", "description": "Unemployment rate (percentage)"}, {"name": "date", "type": "datetime", "description": "ISO 8601-formatted date string (e.g., \"2000-01-01T08:00:00.000Z\")"}]}}, {"name": "unemployment.tsv", "type": "table", "description": "This dataset contains county-level unemployment rates in the United States, with data generally\nconsistent with levels reported in 2009. The dataset is structured as tab-separated values.\nThe unemployment rate represents the number of unemployed persons as a percentage of the labor\nforce. According to the Bureau of Labor Statistics (BLS) glossary:\n\nUnemployed persons (Current Population Survey) [are] persons aged 16 years and older who had\nno employment during the reference week, were available for work, except for temporary\nillness, and had made specific efforts to find employment sometime during the 4-week period\nending with the reference week. Persons who were waiting to be recalled to a job from which\nthey had been laid off need not have been looking for work to be classified as unemployed.\n\nThis dataset is derived from the [Local Area Unemployment Statistics (LAUS)](https://www.bls.gov/lau/) program, \na federal-state cooperative effort overseen by the Bureau of Labor Statistics (BLS). \nThe LAUS program produces monthly and annual employment, unemployment, and labor force data for census regions and divisions,\nstates, counties, metropolitan areas, and many cities and towns.\n\nFor the most up-to-date LAUS data:\n1. **Monthly and Annual Data Downloads**:\n- Visit the [LAUS Data Tools](https://www.bls.gov/lau/data.htm) page for [monthly](https://www.bls.gov/lau/tables.htm#mcounty) \nand [annual](https://www.bls.gov/lau/tables.htm#cntyaa) county data.\n2. 
**BLS Public Data API**:\n- The BLS provides an API for developers to access various datasets, including LAUS data.\n- To use the API for LAUS data, refer to the [LAUS Series ID Formats](https://www.bls.gov/help/hlpforma.htm#LA) to construct your query.\n- API documentation and examples are available on the BLS Developers page.\n\nWhen using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm).", "sources": [{"title": "BLS Developers API", "path": "https://www.bls.gov/developers/"}, {"title": "BLS Handbook of Methods", "path": "https://www.bls.gov/opub/hom/lau/home.htm"}], "path": "unemployment.tsv", "scheme": "file", "format": "tsv", "mediatype": "text/tsv", "encoding": "utf-8", "bytes": 34739, "dialect": {"csv": {"delimiter": "\t"}}, "schema": {"fields": [{"name": "id", "type": "integer", "description": "The combined state and county FIPS code"}, {"name": "rate", "type": "number", "description": "The unemployment rate for the county"}]}}, {"name": "uniform-2d.json", "type": "table", "path": "uniform-2d.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 34217, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "u", "type": "number"}, {"name": "v", "type": "number"}]}}, {"name": "us-10m.json", "type": "json", "path": "us-10m.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "bytes": 642361}, {"name": "us-employment.csv", "type": "table", "description": "In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job \nlosses across the United States. The downturn in employment, and the slow recovery in hiring that \nfollowed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau \nof Labor Statistics.\n\nThis file contains the monthly employment total in a variety of job categories from January 2006 \nthrough December 2015. The numbers are seasonally adjusted and reported in thousands. The data \nwere downloaded on Nov. 11, 2018, and reformatted for use in this library.\n\nTotals are included for the [22 \"supersectors\"](https://download.bls.gov/pub/time.series/ce/ce.supersector)\ntracked by the BLS. The \"nonfarm\" total is the category typically used by \neconomists and journalists as a stand-in for the country's employment total.\n\nA calculated \"nonfarm_change\" column has been appended with the month-to-month change in that \nsupersector's employment. It is useful for illustrating how to make bar charts that report both \nnegative and positive values.\n", "sources": [{"title": "U.S. 
Bureau of Labor Statistics Current Employment Statistics", "path": "https://www.bls.gov/ces/"}], "path": "us-employment.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 17841, "schema": {"fields": [{"name": "month", "type": "date"}, {"name": "nonfarm", "type": "integer"}, {"name": "private", "type": "integer"}, {"name": "goods_producing", "type": "integer"}, {"name": "service_providing", "type": "integer"}, {"name": "private_service_providing", "type": "integer"}, {"name": "mining_and_logging", "type": "integer"}, {"name": "construction", "type": "integer"}, {"name": "manufacturing", "type": "integer"}, {"name": "durable_goods", "type": "integer"}, {"name": "nondurable_goods", "type": "integer"}, {"name": "trade_transportation_utilties", "type": "integer"}, {"name": "wholesale_trade", "type": "number"}, {"name": "retail_trade", "type": "number"}, {"name": "transportation_and_warehousing", "type": "number"}, {"name": "utilities", "type": "number"}, {"name": "information", "type": "integer"}, {"name": "financial_activities", "type": "integer"}, {"name": "professional_and_business_services", "type": "integer"}, {"name": "education_and_health_services", "type": "integer"}, {"name": "leisure_and_hospitality", "type": "integer"}, {"name": "other_services", "type": "integer"}, {"name": "government", "type": "integer"}, {"name": "nonfarm_change", "type": "integer"}]}}, {"name": "us-state-capitals.json", "type": "table", "path": "us-state-capitals.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 3869, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "lon", "type": "number"}, {"name": "lat", "type": "number"}, {"name": "state", "type": "string"}, {"name": "city", "type": "string"}]}}, {"name": "volcano.json", "type": "json", "description": "Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. \nThis data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a \ntopographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate.", "sources": [{"title": "R Datasets", "path": "https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.html"}], "path": "volcano.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 21167}, {"name": "weather.csv", "type": "table", "description": "NOAA data transformed using `/scripts/weather.py`. Categorical \"weather\" field synthesized \nfrom multiple fields in the original dataset. 
This data is intended for instructional purposes.", "sources": [{"title": "NOAA Climate Data Online", "path": "http://www.ncdc.noaa.gov/cdo-web/datatools/findstation"}], "path": "weather.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 121417, "schema": {"fields": [{"name": "location", "type": "string"}, {"name": "date", "type": "date"}, {"name": "precipitation", "type": "number"}, {"name": "temp_max", "type": "number"}, {"name": "temp_min", "type": "number"}, {"name": "wind", "type": "number"}, {"name": "weather", "type": "string"}]}}, {"name": "weekly-weather.json", "type": "json", "description": "Instructional dataset showing actual and predicted temperature data.\n\n> [!IMPORTANT]\n> Named `weather.json` in previous versions (`v1.4.0` - `v2.11.0`).\n", "path": "weekly-weather.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1281}, {"name": "wheat.json", "type": "table", "description": "In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair),\na Scottish engineer who is often credited as the founder of statistical graphics, \npublished an elegant chart on the price of wheat. It plots 250 years of prices alongside \nweekly wages and the reigning monarch. He intended to demonstrate that:\n> \"never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time.\"\n", "sources": [{"title": "1822 Playfair Chart", "path": "http://dh101.humanities.ucla.edu/wp-content/uploads/2014/08/Vis_2.jpg"}], "path": "wheat.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2085, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer"}, {"name": "wheat", "type": "number"}, {"name": "wages", "type": "number"}]}}, {"name": "windvectors.csv", "type": "table", "description": "Simulated wind patterns over northwestern Europe.", "path": "windvectors.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 129253, "schema": {"fields": [{"name": "longitude", "type": "number"}, {"name": "latitude", "type": "number"}, {"name": "dir", "type": "integer"}, {"name": "dirCat", "type": "integer"}, {"name": "speed", "type": "number"}]}}, {"name": "world-110m.json", "type": "json", "path": "world-110m.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "bytes": 119410}, {"name": "zipcodes.csv", "type": "table", "description": "GeoNames.org", "sources": [{"title": "GeoNames", "path": "https://www.geonames.org"}], "path": "zipcodes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 2018388, "schema": {"fields": [{"name": "zip_code", "type": "integer"}, {"name": "latitude", "type": "number"}, {"name": "longitude", "type": "number"}, {"name": "city", "type": "string"}, {"name": "state", "type": "string"}, {"name": "county", "type": "string"}]}}]} \ No newline at end of file diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index f71037d5c..fd2aa848d 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -45,8 +45,9 @@ def __init__( jsdelivr_version: LiteralString = "v1", ) -> None: output_dir.mkdir(exist_ok=True) - self._paths: dict[Literal["tags"], Path] = { - "tags": output_dir / f"{name_tags}.parquet" + self._paths: dict[Literal["tags", "datapackage"], Path] = { + "tags": output_dir / f"{name_tags}.parquet", + 
"datapackage": output_dir / "datapackage.json", } self._url: NpmUrl = NpmUrl( CDN=f"https://cdn.{jsdelivr}.net/{npm}/{package}@", @@ -121,6 +122,12 @@ def file_gh( with self._opener.open(req) as response: return read_fn(response) - def datapackage(self, *, tag: LiteralString | None = None) -> ParsedPackage: - pkg: FlPackage = self.file_gh(tag or "main", "datapackage.json") + def datapackage( + self, *, tag: LiteralString | None = None, frozen: bool = False + ) -> ParsedPackage: + pkg: FlPackage = ( + json.loads(self._paths["datapackage"].read_text("utf-8")) + if frozen + else self.file_gh(tag or "main", "datapackage.json") + ) return datapackage.parse_package(pkg) diff --git a/tools/generate_schema_wrapper.py b/tools/generate_schema_wrapper.py index 27ef56f97..3177b56cf 100644 --- a/tools/generate_schema_wrapper.py +++ b/tools/generate_schema_wrapper.py @@ -1394,7 +1394,7 @@ def main() -> None: copy_schemapi_util() vegalite_main(args.skip_download) write_expr_module(VERSIONS.vlc_vega, output=EXPR_FILE, header=HEADER_COMMENT) - datasets.app.refresh(include_typing=True) + datasets.app.refresh(include_typing=True, frozen=True) # The modules below are imported after the generation of the new schema files # as these modules import Altair. This allows them to use the new changes From e259fbabfc38c3803de0a952f7e2b081a22a3ba3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 13 Jan 2025 12:46:46 +0000 Subject: [PATCH 143/201] feat: Support and extract `hash` from `datapackage.json` Related https://github.com/vega/vega-datasets/pull/665 --- tools/datasets/_metadata/datapackage.json | 2 +- tools/datasets/datapackage.py | 5 +++++ tools/datasets/models.py | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/datasets/_metadata/datapackage.json b/tools/datasets/_metadata/datapackage.json index dbb2e51dc..df9d40e85 100644 --- a/tools/datasets/_metadata/datapackage.json +++ b/tools/datasets/_metadata/datapackage.json @@ -1 +1 @@ -{"name": "vega-datasets", "description": "Common repository for example datasets used by Vega related projects. \nBSD-3-Clause license applies only to package code and infrastructure. Users should verify their use of datasets \ncomplies with the license terms of the original sources. 
Dataset license information, where included, \nis a reference starting point only and is provided without any warranty of accuracy or completeness.\n", "homepage": "http://github.com/vega/vega-datasets.git", "licenses": [{"name": "BSD-3-Clause", "path": "https://opensource.org/license/bsd-3-clause", "title": "The 3-Clause BSD License"}], "contributors": [{"title": "UW Interactive Data Lab", "path": "http://idl.cs.washington.edu"}, {"title": "vega-datasets contributors", "path": "https://github.com/vega/vega-datasets/graphs/contributors"}], "version": "2.11.0", "created": "2024-12-31T18:32:26.970186+00:00", "resources": [{"name": "7zip.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "7zip.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "bytes": 3969}, {"name": "airports.csv", "type": "table", "path": "airports.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 210365, "schema": {"fields": [{"name": "iata", "type": "string"}, {"name": "name", "type": "string"}, {"name": "city", "type": "string"}, {"name": "state", "type": "string"}, {"name": "country", "type": "string"}, {"name": "latitude", "type": "number"}, {"name": "longitude", "type": "number"}]}}, {"name": "annual-precip.json", "type": "json", "description": "A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell.", "sources": [{"title": "Climate Forecast System Version 2", "path": "https://www.ncdc.noaa.gov/data-access/model-data/model-datasets/climate-forecast-system-version2-cfsv2"}], "path": "annual-precip.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 266265}, {"name": "anscombe.json", "type": "table", "description": "Graphs in Statistical Analysis, F. J. Anscombe, The American Statistician.", "path": "anscombe.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1703, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Series", "type": "string"}, {"name": "X", "type": "integer"}, {"name": "Y", "type": "number"}]}}, {"name": "barley.json", "type": "table", "description": "The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites.\n\nIt was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper \"Statistical Determination of Barley Varietal Adaption\".\n\nR.A. 
Fisher's popularized its use in the field of statistics when he included it in his book \"The Design of Experiments\".\n\nSince then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s.\n", "sources": [{"title": "The Design of Experiments Reference", "path": "https://en.wikipedia.org/wiki/The_Design_of_Experiments"}, {"title": "Trellis Charts Paper", "path": "http://ml.stat.purdue.edu/stat695t/writings/TrellisDesignControl.pdf"}], "path": "barley.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 8487, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "yield", "type": "number"}, {"name": "variety", "type": "string"}, {"name": "year", "type": "integer"}, {"name": "site", "type": "string"}]}}, {"name": "birdstrikes.csv", "type": "table", "description": "Records of reported wildlife strikes received by the U.S. FAA", "sources": [{"title": "FAA Wildlife Strike Database", "path": "http://wildlife.faa.gov"}], "path": "birdstrikes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 1223329, "schema": {"fields": [{"name": "Airport Name", "type": "string"}, {"name": "Aircraft Make Model", "type": "string"}, {"name": "Effect Amount of damage", "type": "string"}, {"name": "Flight Date", "type": "date"}, {"name": "Aircraft Airline Operator", "type": "string"}, {"name": "Origin State", "type": "string"}, {"name": "Phase of flight", "type": "string"}, {"name": "Wildlife Size", "type": "string"}, {"name": "Wildlife Species", "type": "string"}, {"name": "Time of day", "type": "string"}, {"name": "Cost Other", "type": "integer"}, {"name": "Cost Repair", "type": "integer"}, {"name": "Cost Total $", "type": "integer"}, {"name": "Speed IAS in knots", "type": "integer"}]}}, {"name": "budget.json", "type": "table", "description": "Historical and forecasted federal revenue/receipts produced in 2016 by the U.S. 
Office of Management and Budget.", "sources": [{"title": "Office of Management and Budget - Budget FY 2016 - Receipts", "path": "https://www.govinfo.gov/app/details/BUDGET-2016-DB/BUDGET-2016-DB-3"}], "path": "budget.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 391353, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Source Category Code", "type": "integer"}, {"name": "Source category name", "type": "string"}, {"name": "Source subcategory", "type": "integer"}, {"name": "Source subcategory name", "type": "string"}, {"name": "Agency code", "type": "integer"}, {"name": "Agency name", "type": "string"}, {"name": "Bureau code", "type": "integer"}, {"name": "Bureau name", "type": "string"}, {"name": "Account code", "type": "integer"}, {"name": "Account name", "type": "string"}, {"name": "Treasury Agency code", "type": "integer"}, {"name": "On- or off-budget", "type": "string"}, {"name": "1962", "type": "string"}, {"name": "1963", "type": "string"}, {"name": "1964", "type": "string"}, {"name": "1965", "type": "string"}, {"name": "1966", "type": "string"}, {"name": "1967", "type": "string"}, {"name": "1968", "type": "string"}, {"name": "1969", "type": "string"}, {"name": "1970", "type": "string"}, {"name": "1971", "type": "string"}, {"name": "1972", "type": "string"}, {"name": "1973", "type": "string"}, {"name": "1974", "type": "string"}, {"name": "1975", "type": "string"}, {"name": "1976", "type": "string"}, {"name": "TQ", "type": "string"}, {"name": "1977", "type": "string"}, {"name": "1978", "type": "string"}, {"name": "1979", "type": "string"}, {"name": "1980", "type": "string"}, {"name": "1981", "type": "string"}, {"name": "1982", "type": "string"}, {"name": "1983", "type": "string"}, {"name": "1984", "type": "string"}, {"name": "1985", "type": "string"}, {"name": "1986", "type": "string"}, {"name": "1987", "type": "string"}, {"name": "1988", "type": "string"}, {"name": "1989", "type": "string"}, {"name": "1990", "type": "string"}, {"name": "1991", "type": "string"}, {"name": "1992", "type": "string"}, {"name": "1993", "type": "string"}, {"name": "1994", "type": "string"}, {"name": "1995", "type": "string"}, {"name": "1996", "type": "string"}, {"name": "1997", "type": "string"}, {"name": "1998", "type": "string"}, {"name": "1999", "type": "string"}, {"name": "2000", "type": "string"}, {"name": "2001", "type": "string"}, {"name": "2002", "type": "string"}, {"name": "2003", "type": "string"}, {"name": "2004", "type": "string"}, {"name": "2005", "type": "string"}, {"name": "2006", "type": "string"}, {"name": "2007", "type": "string"}, {"name": "2008", "type": "string"}, {"name": "2009", "type": "string"}, {"name": "2010", "type": "string"}, {"name": "2011", "type": "string"}, {"name": "2012", "type": "string"}, {"name": "2013", "type": "string"}, {"name": "2014", "type": "string"}, {"name": "2015", "type": "string"}, {"name": "2016", "type": "string"}, {"name": "2017", "type": "string"}, {"name": "2018", "type": "string"}, {"name": "2019", "type": "string"}, {"name": "2020", "type": "string"}]}}, {"name": "budgets.json", "type": "table", "path": "budgets.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 18079, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "budgetYear", "type": "integer"}, {"name": "forecastYear", "type": "integer"}, {"name": "value", "type": "number"}]}}, {"name": "burtin.json", "type": "table", "description": "The burtin.json dataset 
is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine.\n\nThe dataset compares the performance of three antibiotics against 16 different bacteria.\n\nNumerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness.\n\nThe dataset was featured as an example in the Protovis project, a precursor to D3.js.\n\nAs noted in the Protovis example, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 \u03bcg/ml, and an exaggeration of some values for penicillin\".\n\nThe vega-datsets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together.\n\nThe caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) \nreads as follows:\n\n> ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin\n>\n>\n> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in red and gram- in blue) with their sensitivities to penicillin, and streptomycin.\n>\n> The effectiveness of the antibiotics is expressed as the highest dilution in \u03bc/ml. which inhibits the test organism.\n>\n> High dilutions are toward the periphery; consequently the length of the colored bar is proportional to the effectiveness.\n>\n> It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. vulgaris, S. schottmuelleri and M. tuberculosis.\n>\n> Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to neomycin, although the majority of these are sensitive to neomycin.\n>\n> It also inhibits actinomycetes, but is inactive against viruses and fungi. 
Its mode of action is not understood.\n", "sources": [{"title": "Scope Magazine", "path": "https://graphicdesignarchives.org/projects/scope-magazine-vol-iii-5/"}, {"title": "Protovis Antibiotics Example", "path": "https://mbostock.github.io/protovis/ex/antibiotics-burtin.html"}], "path": "burtin.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2743, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Bacteria", "type": "string"}, {"name": "Penicillin", "type": "number"}, {"name": "Streptomycin", "type": "number"}, {"name": "Neomycin", "type": "number"}, {"name": "Gram_Staining", "type": "string"}, {"name": "Genus", "type": "string"}]}}, {"name": "cars.json", "type": "table", "description": "Collection of car specifications and performance metrics from various automobile manufacturers.", "sources": [{"title": "StatLib Datasets Archive", "path": "http://lib.stat.cmu.edu/datasets/"}], "path": "cars.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 100492, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Name", "type": "string"}, {"name": "Miles_per_Gallon", "type": "integer"}, {"name": "Cylinders", "type": "integer"}, {"name": "Displacement", "type": "number"}, {"name": "Horsepower", "type": "integer"}, {"name": "Weight_in_lbs", "type": "integer"}, {"name": "Acceleration", "type": "number"}, {"name": "Year", "type": "date"}, {"name": "Origin", "type": "string"}]}}, {"name": "co2-concentration.csv", "type": "table", "description": "Scripps CO2 program data ut modified to only include date, CO2, seasonally adjusted CO2. \nOnly includes rows with valid data.", "sources": [{"title": "Scripps CO2 Program", "path": "https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record"}], "path": "co2-concentration.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 18547, "schema": {"fields": [{"name": "Date", "type": "date"}, {"name": "CO2", "type": "number"}, {"name": "adjusted CO2", "type": "number"}]}}, {"name": "countries.json", "type": "table", "description": "This dataset combines key demographic indicators (life expectancy at birth and\nfertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year\nintervals. It includes both current values and adjacent time period values (previous and next)\nfor each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) \nnotes that its philosophy is to fill data gaps with estimates and use current\ngeographic boundaries for historical data. 
Gapminder states that it aims to \"show people the\nbig picture\" rather than support detailed numeric analysis.", "licenses": [{"title": "Creative Commons Attribution 4.0 International", "path": "https://www.gapminder.org/free-material/"}], "sources": [{"title": "Gapminder Foundation - Life Expectancy", "path": "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Fertility", "path": "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676", "version": "14"}], "path": "countries.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 99457, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "_comment", "type": "string"}, {"name": "year", "type": "integer", "description": "Years from 1955 to 2000 at 5-year intervals"}, {"name": "fertility", "type": "number", "description": "Fertility rate (average number of children per woman) for the given year"}, {"name": "life_expect", "type": "number", "description": "Life expectancy in years for the given year"}, {"name": "n_fertility", "type": "number", "description": "Fertility rate for the next 5-year interval"}, {"name": "n_life_expect", "type": "number", "description": "Life expectancy for the next 5-year interval"}, {"name": "country", "type": "string", "description": "Name of the country"}]}}, {"name": "crimea.json", "type": "table", "description": "This dataset, which informed Florence Nightingale's groundbreaking work in public health, details \nmonthly mortality rates from British military hospitals during the Crimean War (1854-1856). \n\nNightingale credits Dr. William Farr for compiling the data from the 1858 [Medical and Surgical \nHistory of the British Army](http://resource.nlm.nih.gov/62510370R). The dataset categorizes \ndeaths into \"zymotic\" diseases (preventable infectious diseases), wounds/injuries, and other causes. \nCovering the period from April 1854 to March 1856, the dataset includes monthly army strength \nalongside mortality figures. Nightingale transformed this data into her now-famous [polar area \ndiagrams](https://iiif.lib.harvard.edu/manifests/view/drs:7420433$25i). \n\nThe annual mortality rates plotted in the chart can be calculated from the dataset using the formula \n> (Deaths × 1000 × 12) ÷ Army Size. \n\nAs [The Lancet](https://pmc.ncbi.nlm.nih.gov/articles/PMC7252134/) argued in 2020, Nightingale's \ninnovative visualizations proved that \"far more men died of disease, infection, and exposure \nthan in battle\u2014a fact that shocked the British nation.\" Her work also vividly illustrated \nthe dramatic impact of sanitary reforms, particularly in reducing preventable deaths.", "sources": [{"title": "Nightingale, Florence. A contribution to the sanitary history of the British army during the late war with Russia. London : John W. Parker and Son, 1859. Table II. 
Table showing the Estimated Average Monthly Strength of the Army; and the Deaths and Annual Rate of Mortality per 1,000 in each month, from April 1854, to March 1856 (inclusive), in the Hospitals of the Army in the East.\n", "path": "https://nrs.lib.harvard.edu/urn-3:hms.count:1177146?n=21"}], "path": "crimea.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2183, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date", "description": "First day of each month during the observation period, in ISO 8601 format (YYYY-MM-DD)"}, {"name": "wounds", "type": "integer", "description": "Deaths from \"Wounds and Injuries\" which comprised: Luxatio (dislocation), Sub-Luxatio (partial dislocation), Vulnus Sclopitorum (gunshot wounds), Vulnus Incisum (incised wounds), Contusio (bruising), Fractura (fractures), Ambustio (burns) and Concussio-Cerebri (brain concussion)\n"}, {"name": "other", "type": "integer", "description": "Deaths from All Other Causes"}, {"name": "disease", "type": "integer", "description": "Deaths from Zymotic Diseases (preventable infectious diseases)"}, {"name": "army_size", "type": "integer", "description": "Estimated Average Monthly Strength of the Army"}]}}, {"name": "disasters.csv", "type": "table", "description": "Annual number of deaths from disasters.", "sources": [{"title": "Our World in Data - Natural Catastrophes", "path": "https://ourworldindata.org/natural-catastrophes"}], "path": "disasters.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 18840, "schema": {"fields": [{"name": "Entity", "type": "string"}, {"name": "Year", "type": "integer"}, {"name": "Deaths", "type": "integer"}]}}, {"name": "driving.json", "type": "table", "sources": [{"title": "New York Times", "path": "https://archive.nytimes.com/www.nytimes.com/imagepages/2010/05/02/business/02metrics.html"}], "path": "driving.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 3461, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "side", "type": "string"}, {"name": "year", "type": "integer"}, {"name": "miles", "type": "integer"}, {"name": "gas", "type": "number"}]}}, {"name": "earthquakes.json", "type": "json", "description": "Earthquake data retrieved Feb 6, 2018", "sources": [{"title": "USGS Earthquake Feed", "path": "https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson"}], "path": "earthquakes.json", "scheme": "file", "format": "geojson", "mediatype": "text/geojson", "encoding": "utf-8", "bytes": 1219853}, {"name": "ffox.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "ffox.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "bytes": 17628}, {"name": "flare-dependencies.json", "type": "table", "path": "flare-dependencies.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 34600, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "source", "type": "integer"}, {"name": "target", "type": "integer"}]}}, {"name": "flare.json", "type": "table", "path": "flare.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 20638, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "id", "type": "integer"}, {"name": "name", "type": "string"}]}}, {"name": "flights-10k.json", "type": "table", 
"description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-10k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 892400, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-200k.arrow", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-200k.arrow", "scheme": "file", "format": ".arrow", "mediatype": "application/vnd.apache.arrow.file", "bytes": 1600864, "schema": {"fields": [{"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "time", "type": "number"}]}}, {"name": "flights-200k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-200k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 9863892, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "time", "type": "number"}]}}, {"name": "flights-20k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-20k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1784867, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-2k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-2k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 178495, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-3m.parquet", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. 
Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-3m.parquet", "scheme": "file", "format": "parquet", "mediatype": "application/parquet", "bytes": 13493022, "schema": {"fields": [{"name": "date", "type": "datetime"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-5k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-5k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 446167, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-airport.csv", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-airport.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 65572, "schema": {"fields": [{"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}, {"name": "count", "type": "integer"}]}}, {"name": "football.json", "type": "table", "description": "Football match outcomes across multiple divisions from 2013 to 2017, part of a\nlarger dataset from OpenFootball. The subset was made such that there are records for all five\nchosen divisions over the time period.", "sources": [{"title": "OpenFootball", "path": "https://github.com/openfootball/football.json"}], "path": "football.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1207180, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "division", "type": "string"}, {"name": "home_team", "type": "string"}, {"name": "away_team", "type": "string"}, {"name": "home_score", "type": "integer"}, {"name": "away_score", "type": "integer"}]}}, {"name": "gapminder-health-income.csv", "type": "table", "description": "Per-capita income, life expectancy, population and regional grouping. Dataset does not specify \nthe reference year for the data. Gapminder historical data is subject to revisions.\n\nGapminder (v30, 2023) defines per-capita income as follows:\n>\"This is real GDP per capita (gross domestic product per person adjusted for inflation) \n>converted to international dollars using purchasing power parity rates. An international dollar \n>has the same purchasing power over GDP as the U.S. 
dollar has in the United States.\"\n", "licenses": [{"title": "Creative Commons Attribution 4.0 International", "path": "https://www.gapminder.org/free-material/"}], "sources": [{"title": "Gapminder Foundation", "path": "https://www.gapminder.org"}, {"title": "Gapminder GDP Per Capita Data", "path": "https://docs.google.com/spreadsheets/d/1i5AEui3WZNZqh7MQ4AKkJuCz4rRxGR_pw_9gtbcBOqQ/edit?gid=501532268#gid=501532268"}], "path": "gapminder-health-income.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 8605, "schema": {"fields": [{"name": "country", "type": "string"}, {"name": "income", "type": "integer"}, {"name": "health", "type": "number"}, {"name": "population", "type": "integer"}, {"name": "region", "type": "string"}]}}, {"name": "gapminder.json", "type": "table", "description": "This dataset combines key demographic indicators (life expectancy at birth, \npopulation, and fertility rate measured as babies per woman) for various countries from 1955 \nto 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable \ngrouping countries. Gapminder's data documentation notes that its philosophy is to fill data \ngaps with estimates and use current geographic boundaries for historical data. Gapminder \nstates that it aims to \"show people the big picture\" rather than support detailed numeric \nanalysis.\n\nNotes:\n1. Country Selection: The set of countries in this file matches the version of this dataset \n originally added to this collection in 2015. The specific criteria for country selection \n in that version are not known. Data for Aruba are no longer available in the new version. \n Hong Kong has been revised to Hong Kong, China in the new version.\n\n2. Data Precision: The precision of float values may have changed from the original version. \n These changes reflect the most recent source data used for each indicator.\n\n3. Regional Groupings: The 'cluster' column represents a regional mapping of countries \n corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. To \n preserve continuity with previous versions of this dataset, we have retained the column \n name 'cluster' instead of renaming it to 'six_regions'. 
The six regions represented are: \n `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`.", "sources": [{"title": "Gapminder Foundation - Life Expectancy (Data)", "path": "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundatio - Life Expectancy (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd004/"}, {"title": "Gapminder Foundation - Population (Data)", "path": "https://docs.google.com/spreadsheets/d/1c1luQNdpH90tNbMIeU7jD__59wQ0bdIGRFpbMm8ZBTk/edit?gid=176703676#gid=176703676", "version": "7"}, {"title": "Gapminder Foundation - Population (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd003/"}, {"title": "Gapminder Foundation - Fertility (Data)", "path": "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Fertility Documentation (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd008/"}, {"title": "Gapminder Foundation - Data Geographies (Data)", "path": "https://docs.google.com/spreadsheets/d/1qHalit8sXC0R8oVXibc2wa2gY7bkwGzOybEMTWp-08o/edit?gid=1597424158#gid=1597424158", "version": "2"}, {"title": "Gapminder Foundation - Data Geographies (Documentation)", "path": "https://www.gapminder.org/data/geo/"}, {"title": "Gapminder Data Documentation", "path": "https://www.gapminder.org/data/documentation/"}], "path": "gapminder.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 75201, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer", "description": "Years from 1955 to 2005 at 5-year intervals"}, {"name": "country", "type": "string", "description": "Name of the country"}, {"name": "cluster", "type": "integer", "description": "A categorical variable (values 0-5) grouping countries by region"}, {"name": "pop", "type": "integer", "description": "Population of the country"}, {"name": "life_expect", "type": "number", "description": "Life expectancy in years"}, {"name": "fertility", "type": "number", "description": "Fertility rate (average number of children per woman"}]}}, {"name": "gimp.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "gimp.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "bytes": 8211}, {"name": "github.csv", "type": "table", "description": "Generated using `/scripts/github.py`.", "path": "github.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 21059, "schema": {"fields": [{"name": "time", "type": "string"}, {"name": "count", "type": "integer"}]}}, {"name": "global-temp.csv", "type": "table", "description": "Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023.", "sources": [{"title": "NASA Goddard Institute for Space Studies", "path": "https://data.giss.nasa.gov/gistemp/"}], "path": "global-temp.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 1663, "schema": {"fields": [{"name": "year", "type": "integer"}, {"name": "temp", "type": "number"}]}}, {"name": "income.json", "type": "table", "path": "income.json", "scheme": "file", "format": "json", "mediatype": "text/json", 
"encoding": "utf-8", "bytes": 72771, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "region", "type": "string"}, {"name": "id", "type": "integer"}, {"name": "pct", "type": "number"}, {"name": "total", "type": "integer"}, {"name": "group", "type": "string"}]}}, {"name": "iowa-electricity.csv", "type": "table", "description": "The state of Iowa has dramatically increased its production of renewable \nwind power in recent years. This file contains the annual net generation of electricity in \nthe state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. \nIt is useful for illustrating stacked area charts.", "sources": [{"title": "U.S. Energy Information Administration", "path": "https://www.eia.gov/beta/electricity/data/browser/#/topic/0?agg=2,0,1&fuel=vvg&geo=00000g&sec=g&linechart=ELEC.GEN.OTH-IA-99.A~ELEC.GEN.COW-IA-99.A~ELEC.GEN.PEL-IA-99.A~ELEC.GEN.PC-IA-99.A~ELEC.GEN.NG-IA-99.A~~ELEC.GEN.NUC-IA-99.A~ELEC.GEN.HYC-IA-99.A~ELEC.GEN.AOR-IA-99.A~ELEC.GEN.HPS-IA-99.A~&columnchart=ELEC.GEN.ALL-IA-99.A&map=ELEC.GEN.ALL-IA-99.A&freq=A&start=2001&end=2017&ctype=linechart<ype=pin&tab=overview&maptype=0&rse=0&pin="}], "path": "iowa-electricity.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 1531, "schema": {"fields": [{"name": "year", "type": "date"}, {"name": "source", "type": "string"}, {"name": "net_generation", "type": "integer"}]}}, {"name": "jobs.json", "type": "table", "description": "U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790.\n\nOriginally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Vi\u00e9gas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). \nThe dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/).\n\nData is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of IPUMS USA, according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions).\n\nIPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, stating:\n>We're excited to hear that this dataset made its way to this repository and is being used by students for data visualization. We allow for these types of redistributions of summary data so long as the underlying microdata records are not shared.\n\nThis dataset contains only summary statistics and does not include any underlying microdata records.\n\n1. This dataset represents summary data. The underlying microdata records are not included.\n2. Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) \n(person weight) variable as an expansion factor when working with IPUMS USA extracts.\n3. 
Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly.\n\nWhen using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml).\nThe organization requests use of the following citation for this json file:\n\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0\n", "sources": [{"title": "IPUMS USA", "path": "https://usa.ipums.org/usa/", "version": "6.0"}], "path": "jobs.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 936649, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "job", "type": "string", "description": "The occupation title"}, {"name": "sex", "type": "string", "description": "Sex (men/women)"}, {"name": "year", "type": "integer", "description": "Census year"}, {"name": "count", "type": "integer", "description": "Number of individuals in the occupation"}, {"name": "perc", "type": "number", "description": "Percentage of the workforce in the occupation"}]}}, {"name": "la-riots.csv", "type": "table", "description": "More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles \nfor five days starting on April 29, 1992. This file contains metadata about each person, including the geographic \ncoordinates of their death. Compiled and published by the Los Angeles Times Data Desk.", "sources": [{"title": "LA Riots Deaths, Los Angeles Times Data Desk", "path": "http://spreadsheets.latimes.com/la-riots-deaths/"}], "path": "la-riots.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 7432, "schema": {"fields": [{"name": "first_name", "type": "string"}, {"name": "last_name", "type": "string"}, {"name": "age", "type": "integer"}, {"name": "gender", "type": "string"}, {"name": "race", "type": "string"}, {"name": "death_date", "type": "date"}, {"name": "address", "type": "string"}, {"name": "neighborhood", "type": "string"}, {"name": "type", "type": "string"}, {"name": "longitude", "type": "number"}, {"name": "latitude", "type": "number"}]}}, {"name": "londonboroughs.json", "type": "json", "description": "Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. 
\nOriginal data \"contains National Statistics data \u00a9 Crown copyright and database right (2015)\" \nand \"Contains Ordnance Survey data \u00a9 Crown copyright and database right [2015].", "sources": [{"title": "Statistical GIS Boundary Files, London Datastore", "path": "https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london"}], "path": "londonBoroughs.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "bytes": 14732}, {"name": "londoncentroids.json", "type": "table", "description": "Calculated from `londonBoroughs.json` using [`d3.geoCentroid`](https://d3js.org/d3-geo/math#geoCentroid).", "path": "londonCentroids.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2339, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "cx", "type": "number"}, {"name": "cy", "type": "number"}]}}, {"name": "londontubelines.json", "type": "json", "description": "Selected rail lines simplified from source.", "sources": [{"title": "London Tube Data", "path": "https://github.com/oobrien/vis/tree/master/tube/data"}], "path": "londonTubeLines.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "bytes": 80097}, {"name": "lookup_groups.csv", "type": "table", "path": "lookup_groups.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 77, "schema": {"fields": [{"name": "group", "type": "integer"}, {"name": "person", "type": "string"}]}}, {"name": "lookup_people.csv", "type": "table", "path": "lookup_people.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 125, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "integer"}, {"name": "height", "type": "integer"}]}}, {"name": "miserables.json", "type": "json", "path": "miserables.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 12372}, {"name": "monarchs.json", "type": "table", "description": "A chronological list of English and British monarchs from Elizabeth I through George IV.\nEach entry includes:\n\nThe dataset contains two intentional inaccuracies to maintain compatibility with \nthe [Wheat and Wages](https://vega.github.io/vega/examples/wheat-and-wages/) example visualization:\n1. the start date for the reign of Elizabeth I is shown as 1565, instead of 1558;\n2. the end date for the reign of George IV is shown as 1820, instead of 1830.\nThese discrepancies align the `monarchs.json` dataset with the start and end dates of the `wheat.json` dataset used i the visualization.\nThe entry \"W&M\" represents the joint reign of William III and Mary II. While the dataset shows their reign as 1689-1702, \nthe official Web site of the British royal family indicates that Mary II's reign ended in 1694, though William III continued to rule until 1702.\nThe `commonwealth` field is used to flag the period from 1649 to 1660, which includes the Commonwealth of England, the Protectorate, \nand the period leading to the Restoration. While historically more accurate to call this the \"interregnum,\" the field name of `commonwealth` \nfrom the original dataset is retained for backwards compatibility.\nThe dataset was revised in Aug. 2024. 
James II's reign now ends in 1688 (previously 1689).\nSource data has been verified against the kings & queens and interregnum pages of the official website of the British royal family (retrieved in Aug. 2024).\nContent on the site is protected by Crown Copyright. \nUnder the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most \nCrown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/).", "sources": [{"title": "The Royal Family - Kings & Queens", "path": "https://www.royal.uk/kings-and-queens-1066"}, {"title": "The Royal Family - Interregnum", "path": "https://www.royal.uk/interregnum-1649-1660"}], "path": "monarchs.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 683, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string", "description": "The ruler's name or identifier (e.g., \"W&M\" for William and Mary, \"Cromwell\" for the period of interregnum)"}, {"name": "start", "type": "integer", "description": "The year their rule began"}, {"name": "end", "type": "integer", "description": "The year their rule ended"}, {"name": "index", "type": "integer", "description": "A zero-based sequential number assigned to each entry, representing the chronological order of rulers"}]}}, {"name": "movies.json", "type": "table", "description": "The dataset has well known and intentionally included errors. \nThis dataset is provided for instructional purposes, including the need to reckon with dirty data.", "path": "movies.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1399981, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Title", "type": "string"}, {"name": "US Gross", "type": "integer"}, {"name": "Worldwide Gross", "type": "integer"}, {"name": "US DVD Sales", "type": "integer"}, {"name": "Production Budget", "type": "integer"}, {"name": "Release Date", "type": "string"}, {"name": "MPAA Rating", "type": "string"}, {"name": "Running Time min", "type": "integer"}, {"name": "Distributor", "type": "string"}, {"name": "Source", "type": "string"}, {"name": "Major Genre", "type": "string"}, {"name": "Creative Type", "type": "string"}, {"name": "Director", "type": "string"}, {"name": "Rotten Tomatoes Rating", "type": "integer"}, {"name": "IMDB Rating", "type": "number"}, {"name": "IMDB Votes", "type": "integer"}]}}, {"name": "normal-2d.json", "type": "table", "path": "normal-2d.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 34398, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "u", "type": "number"}, {"name": "v", "type": "number"}]}}, {"name": "obesity.json", "type": "table", "path": "obesity.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2202, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "id", "type": "integer"}, {"name": "rate", "type": "number"}, {"name": "state", "type": "string"}]}}, {"name": "ohlc.json", "type": "table", "description": "This dataset contains the performance of the Chicago Board Options Exchange \n[Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/%5EVIX#overview))\nin the summer of 2009.", "sources": [{"title": 
"Yahoo Finance VIX Data", "path": "https://finance.yahoo.com/chart/%5EVIX"}], "path": "ohlc.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 5737, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "open", "type": "number"}, {"name": "high", "type": "number"}, {"name": "low", "type": "number"}, {"name": "close", "type": "number"}, {"name": "signal", "type": "string"}, {"name": "ret", "type": "number"}]}}, {"name": "penguins.json", "type": "table", "description": "Palmer Archipelago (Antarctica) penguin data collected and made available by \n[Dr. Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) \nand the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research \nNetwork](https://lternet.edu/).", "sources": [{"title": "Palmer Station Antarctica LTER", "path": "https://pal.lternet.edu/"}, {"title": "Allison Horst's Penguins Repository", "path": "https://github.com/allisonhorst/penguins"}], "path": "penguins.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 67119, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Species", "type": "string"}, {"name": "Island", "type": "string"}, {"name": "Beak Length (mm)", "type": "number"}, {"name": "Beak Depth (mm)", "type": "number"}, {"name": "Flipper Length (mm)", "type": "integer"}, {"name": "Body Mass (g)", "type": "integer"}, {"name": "Sex", "type": "string"}]}}, {"name": "platformer-terrain.json", "type": "table", "description": "Assets from the video game Celeste.", "sources": [{"title": "Celeste Game", "path": "http://www.celestegame.com/"}], "path": "platformer-terrain.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1424097, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "x", "type": "integer"}, {"name": "y", "type": "integer"}, {"name": "lumosity", "type": "number"}, {"name": "saturation", "type": "integer"}, {"name": "name", "type": "string"}, {"name": "id", "type": "string"}, {"name": "color", "type": "string"}, {"name": "key", "type": "string"}]}}, {"name": "points.json", "type": "table", "path": "points.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 4926, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "x", "type": "number"}, {"name": "y", "type": "number"}]}}, {"name": "political-contributions.json", "type": "table", "description": "Summary financial information on contributions to candidates for U.S. \nelections. An updated version of this datset is available from the \"all candidates\" files \n(in pipe-delimited format) on the bulk data download page of the U.S. Federal Election \nCommission, or, alternatively, via OpenFEC. 
Information on each of the 25 columns is \navailable from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/).\nThe sample dataset in `political-contributions.json` contains 58 records with dates from 2015.\n\nFEC data is subject to the commission's:\n- [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/)\n- [Privacy and Security Policy](https://www.fec.gov/about/privacy-and-security-policy/)\n- [Acceptable Use Policy](https://github.com/fecgov/FEC/blob/master/ACCEPTABLE-USE-POLICY.md)\n\nAdditionally, the FEC's Github [repository](https://github.com/fecgov/FEC) states:\n> This project is in the public domain within the United States, and we waive worldwide \n> copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/)\n> dedication. Read more on our license page.\n> A few restrictions limit the way you can use FEC data. For example, you can't use \n> contributor lists for commercial purposes or to solicit donations. Learn more on \n> [FEC.gov](https://www.fec.gov/).", "sources": [{"title": "Federal Election Commission Bulk Data", "path": "https://www.fec.gov/data/browse-data/?tab=bulk-data"}, {"title": "OpenFEC API", "path": "https://api.open.fec.gov/developers/"}], "path": "political-contributions.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 50265, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Candidate_Identification", "type": "string"}, {"name": "Candidate_Name", "type": "string"}, {"name": "Incumbent_Challenger_Status", "type": "string"}, {"name": "Party_Code", "type": "integer"}, {"name": "Party_Affiliation", "type": "string"}, {"name": "Total_Receipts", "type": "number"}, {"name": "Transfers_from_Authorized_Committees", "type": "integer"}, {"name": "Total_Disbursements", "type": "number"}, {"name": "Transfers_to_Authorized_Committees", "type": "number"}, {"name": "Beginning_Cash", "type": "number"}, {"name": "Ending_Cash", "type": "number"}, {"name": "Contributions_from_Candidate", "type": "number"}, {"name": "Loans_from_Candidate", "type": "integer"}, {"name": "Other_Loans", "type": "integer"}, {"name": "Candidate_Loan_Repayments", "type": "number"}, {"name": "Other_Loan_Repayments", "type": "integer"}, {"name": "Debts_Owed_By", "type": "number"}, {"name": "Total_Individual_Contributions", "type": "integer"}, {"name": "Candidate_State", "type": "string"}, {"name": "Candidate_District", "type": "integer"}, {"name": "Contributions_from_Other_Political_Committees", "type": "integer"}, {"name": "Contributions_from_Party_Committees", "type": "integer"}, {"name": "Coverage_End_Date", "type": "string"}, {"name": "Refunds_to_Individuals", "type": "integer"}, {"name": "Refunds_to_Committees", "type": "integer"}]}}, {"name": "population.json", "type": "table", "description": "United States population statistics by sex and age group across decades between 1850 and 2000. \nThe dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census \nmicrodata\" from as early as 1790.\n\nIPUMS updates and revises datasets over time, which may result in discrepancies between this \ndataset and current IPUMS data. Details on data revisions are available here.\n\nWhen using this dataset, please refer to IPUMS USA terms of use. 
The organization requests the \nuse of the following citation for this json file:\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated \nPublic Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. \nhttp://doi.org/10.18128/D010.V6.0\n", "sources": [{"title": "IPUMS USA", "path": "https://usa.ipums.org/usa/"}], "path": "population.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 27665, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer", "description": "Four-digit year of the survey"}, {"name": "age", "type": "integer", "description": "Age group in 5-year intervals (0=0-4, 5=5-9, 10=10-14, ..., 90=90+)"}, {"name": "sex", "type": "integer", "description": "Sex (1=men, 2=women)"}, {"name": "people", "type": "integer", "description": "Number of individuals (IPUMS PERWT)"}]}}, {"name": "population_engineers_hurricanes.csv", "type": "table", "description": "Per-state data on population, number of engineers, and hurricanes. Used in Vega-Lite example,\n[Three Choropleths Representing Disjoint Data from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html)", "sources": [{"title": "Bureau of Labor Statistics", "path": "https://www.bls.gov/oes/tables.htm"}, {"title": "American Community Survey", "path": "https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_07_3YR_S1901&prodType=table"}, {"title": "NOAA National Climatic Data Center", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/records"}], "path": "population_engineers_hurricanes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 1852, "schema": {"fields": [{"name": "state", "type": "string"}, {"name": "id", "type": "integer"}, {"name": "population", "type": "integer"}, {"name": "engineers", "type": "number"}, {"name": "hurricanes", "type": "integer"}]}}, {"name": "seattle-weather-hourly-normals.csv", "type": "table", "description": "Hourly weather normals with metric units. The 1981-2010 Climate Normals are \nNCDC's three-decade averages of climatological variables, including temperature and \nprecipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf).\nWe only included temperature, wind, and pressure \nand updated the format to be easier to parse.", "sources": [{"title": "NOAA National Climatic Data Center (NCDC)", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/normals"}], "path": "seattle-weather-hourly-normals.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 311148, "schema": {"fields": [{"name": "date", "type": "datetime"}, {"name": "pressure", "type": "number"}, {"name": "temperature", "type": "number"}, {"name": "wind", "type": "number"}]}}, {"name": "seattle-weather.csv", "type": "table", "description": "Daily weather records with metric units. Transformed using `/scripts/weather.py`. \nThe categorical \"weather\" field is synthesized from multiple fields in the original dataset. 
\nThis data is intended for instructional purposes.", "sources": [{"title": "NOAA National Climatic Data Center", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/records"}], "path": "seattle-weather.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 48219, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "precipitation", "type": "number"}, {"name": "temp_max", "type": "number"}, {"name": "temp_min", "type": "number"}, {"name": "wind", "type": "number"}, {"name": "weather", "type": "string"}]}}, {"name": "sp500-2000.csv", "type": "table", "description": "S&P 500 index values from 2000 to 2020.", "sources": [{"title": "Yahoo Finance", "path": "https://finance.yahoo.com/quote/%5EDJI/history/"}], "path": "sp500-2000.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 415968, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "open", "type": "number"}, {"name": "high", "type": "number"}, {"name": "low", "type": "number"}, {"name": "close", "type": "number"}, {"name": "adjclose", "type": "number"}, {"name": "volume", "type": "integer"}]}}, {"name": "sp500.csv", "type": "table", "path": "sp500.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 2305, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "price", "type": "number"}]}}, {"name": "stocks.csv", "type": "table", "path": "stocks.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 12245, "schema": {"fields": [{"name": "symbol", "type": "string"}, {"name": "date", "type": "string"}, {"name": "price", "type": "number"}]}}, {"name": "udistrict.json", "type": "table", "path": "udistrict.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 6460, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "key", "type": "string"}, {"name": "lat", "type": "number"}]}}, {"name": "unemployment-across-industries.json", "type": "table", "description": "Industry-level unemployment statistics from the Current Population Survey \n(CPS), published monthly by the U.S. Bureau of Labor Statistics. Includes unemployed persons \nand unemployment rate across 11 private industries, as well as agricultural, government, and \nself-employed workers. Covers January 2000 through February 2010. Industry classification \nfollows format of CPS Table A-31.\n\nThe dataset can be replicated using the BLS API. For more, see the `scripts` folder of this \nrepository.\n\nThe BLS Web site states:\n> \"Users of the public API should cite the date that data were accessed or retrieved using \n> the API. Users must clearly state that \"BLS.gov cannot vouch for the data or analyses \n> derived from these data after the data have been retrieved from BLS.gov.\" The BLS.gov logo \n> may not be used by persons who are not BLS employees or on products (including web pages) \n> that are not BLS-sponsored.\"\n\nSee full BLS [terms of service](https://www.bls.gov/developers/termsOfService.htm).", "sources": [{"title": "U.S. 
Census Bureau Current Population Survey", "path": "https://www.census.gov/programs-surveys/cps.html"}, {"title": "BLS LAUS Data Tools", "path": "https://www.bls.gov/lau/data.htm"}, {"title": "Bureau of Labor Statistics Table A-31", "path": "https://www.bls.gov/web/empsit/cpseea31.htm"}], "path": "unemployment-across-industries.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 185641, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "series", "type": "string", "description": "Industry name"}, {"name": "year", "type": "integer", "description": "Year (2000-2010)"}, {"name": "month", "type": "integer", "description": "Month (1-12)"}, {"name": "count", "type": "integer", "description": "Number of unemployed persons (in thousands)"}, {"name": "rate", "type": "number", "description": "Unemployment rate (percentage)"}, {"name": "date", "type": "datetime", "description": "ISO 8601-formatted date string (e.g., \"2000-01-01T08:00:00.000Z\")"}]}}, {"name": "unemployment.tsv", "type": "table", "description": "This dataset contains county-level unemployment rates in the United States, with data generally\nconsistent with levels reported in 2009. The dataset is structured as tab-separated values.\nThe unemployment rate represents the number of unemployed persons as a percentage of the labor\nforce. According to the Bureau of Labor Statistics (BLS) glossary:\n\nUnemployed persons (Current Population Survey) [are] persons aged 16 years and older who had\nno employment during the reference week, were available for work, except for temporary\nillness, and had made specific efforts to find employment sometime during the 4-week period\nending with the reference week. Persons who were waiting to be recalled to a job from which\nthey had been laid off need not have been looking for work to be classified as unemployed.\n\nThis dataset is derived from the [Local Area Unemployment Statistics (LAUS)](https://www.bls.gov/lau/) program, \na federal-state cooperative effort overseen by the Bureau of Labor Statistics (BLS). \nThe LAUS program produces monthly and annual employment, unemployment, and labor force data for census regions and divisions,\nstates, counties, metropolitan areas, and many cities and towns.\n\nFor the most up-to-date LAUS data:\n1. **Monthly and Annual Data Downloads**:\n- Visit the [LAUS Data Tools](https://www.bls.gov/lau/data.htm) page for [monthly](https://www.bls.gov/lau/tables.htm#mcounty) \nand [annual](https://www.bls.gov/lau/tables.htm#cntyaa) county data.\n2. 
**BLS Public Data API**:\n- The BLS provides an API for developers to access various datasets, including LAUS data.\n- To use the API for LAUS data, refer to the [LAUS Series ID Formats](https://www.bls.gov/help/hlpforma.htm#LA) to construct your query.\n- API documentation and examples are available on the BLS Developers page.\n\nWhen using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm).", "sources": [{"title": "BLS Developers API", "path": "https://www.bls.gov/developers/"}, {"title": "BLS Handbook of Methods", "path": "https://www.bls.gov/opub/hom/lau/home.htm"}], "path": "unemployment.tsv", "scheme": "file", "format": "tsv", "mediatype": "text/tsv", "encoding": "utf-8", "bytes": 34739, "dialect": {"csv": {"delimiter": "\t"}}, "schema": {"fields": [{"name": "id", "type": "integer", "description": "The combined state and county FIPS code"}, {"name": "rate", "type": "number", "description": "The unemployment rate for the county"}]}}, {"name": "uniform-2d.json", "type": "table", "path": "uniform-2d.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 34217, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "u", "type": "number"}, {"name": "v", "type": "number"}]}}, {"name": "us-10m.json", "type": "json", "path": "us-10m.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "bytes": 642361}, {"name": "us-employment.csv", "type": "table", "description": "In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job \nlosses across the United States. The downturn in employment, and the slow recovery in hiring that \nfollowed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau \nof Labor Statistics.\n\nThis file contains the monthly employment total in a variety of job categories from January 2006 \nthrough December 2015. The numbers are seasonally adjusted and reported in thousands. The data \nwere downloaded on Nov. 11, 2018, and reformatted for use in this library.\n\nTotals are included for the [22 \"supersectors\"](https://download.bls.gov/pub/time.series/ce/ce.supersector)\ntracked by the BLS. The \"nonfarm\" total is the category typically used by \neconomists and journalists as a stand-in for the country's employment total.\n\nA calculated \"nonfarm_change\" column has been appended with the month-to-month change in that \nsupersector's employment. It is useful for illustrating how to make bar charts that report both \nnegative and positive values.\n", "sources": [{"title": "U.S. 
Bureau of Labor Statistics Current Employment Statistics", "path": "https://www.bls.gov/ces/"}], "path": "us-employment.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 17841, "schema": {"fields": [{"name": "month", "type": "date"}, {"name": "nonfarm", "type": "integer"}, {"name": "private", "type": "integer"}, {"name": "goods_producing", "type": "integer"}, {"name": "service_providing", "type": "integer"}, {"name": "private_service_providing", "type": "integer"}, {"name": "mining_and_logging", "type": "integer"}, {"name": "construction", "type": "integer"}, {"name": "manufacturing", "type": "integer"}, {"name": "durable_goods", "type": "integer"}, {"name": "nondurable_goods", "type": "integer"}, {"name": "trade_transportation_utilties", "type": "integer"}, {"name": "wholesale_trade", "type": "number"}, {"name": "retail_trade", "type": "number"}, {"name": "transportation_and_warehousing", "type": "number"}, {"name": "utilities", "type": "number"}, {"name": "information", "type": "integer"}, {"name": "financial_activities", "type": "integer"}, {"name": "professional_and_business_services", "type": "integer"}, {"name": "education_and_health_services", "type": "integer"}, {"name": "leisure_and_hospitality", "type": "integer"}, {"name": "other_services", "type": "integer"}, {"name": "government", "type": "integer"}, {"name": "nonfarm_change", "type": "integer"}]}}, {"name": "us-state-capitals.json", "type": "table", "path": "us-state-capitals.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 3869, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "lon", "type": "number"}, {"name": "lat", "type": "number"}, {"name": "state", "type": "string"}, {"name": "city", "type": "string"}]}}, {"name": "volcano.json", "type": "json", "description": "Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. \nThis data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a \ntopographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate.", "sources": [{"title": "R Datasets", "path": "https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.html"}], "path": "volcano.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 21167}, {"name": "weather.csv", "type": "table", "description": "NOAA data transformed using `/scripts/weather.py`. Categorical \"weather\" field synthesized \nfrom multiple fields in the original dataset. 
This data is intended for instructional purposes.", "sources": [{"title": "NOAA Climate Data Online", "path": "http://www.ncdc.noaa.gov/cdo-web/datatools/findstation"}], "path": "weather.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 121417, "schema": {"fields": [{"name": "location", "type": "string"}, {"name": "date", "type": "date"}, {"name": "precipitation", "type": "number"}, {"name": "temp_max", "type": "number"}, {"name": "temp_min", "type": "number"}, {"name": "wind", "type": "number"}, {"name": "weather", "type": "string"}]}}, {"name": "weekly-weather.json", "type": "json", "description": "Instructional dataset showing actual and predicted temperature data.\n\n> [!IMPORTANT]\n> Named `weather.json` in previous versions (`v1.4.0` - `v2.11.0`).\n", "path": "weekly-weather.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1281}, {"name": "wheat.json", "type": "table", "description": "In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair),\na Scottish engineer who is often credited as the founder of statistical graphics, \npublished an elegant chart on the price of wheat. It plots 250 years of prices alongside \nweekly wages and the reigning monarch. He intended to demonstrate that:\n> \"never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time.\"\n", "sources": [{"title": "1822 Playfair Chart", "path": "http://dh101.humanities.ucla.edu/wp-content/uploads/2014/08/Vis_2.jpg"}], "path": "wheat.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2085, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer"}, {"name": "wheat", "type": "number"}, {"name": "wages", "type": "number"}]}}, {"name": "windvectors.csv", "type": "table", "description": "Simulated wind patterns over northwestern Europe.", "path": "windvectors.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 129253, "schema": {"fields": [{"name": "longitude", "type": "number"}, {"name": "latitude", "type": "number"}, {"name": "dir", "type": "integer"}, {"name": "dirCat", "type": "integer"}, {"name": "speed", "type": "number"}]}}, {"name": "world-110m.json", "type": "json", "path": "world-110m.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "bytes": 119410}, {"name": "zipcodes.csv", "type": "table", "description": "GeoNames.org", "sources": [{"title": "GeoNames", "path": "https://www.geonames.org"}], "path": "zipcodes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 2018388, "schema": {"fields": [{"name": "zip_code", "type": "integer"}, {"name": "latitude", "type": "number"}, {"name": "longitude", "type": "number"}, {"name": "city", "type": "string"}, {"name": "state", "type": "string"}, {"name": "county", "type": "string"}]}}]} \ No newline at end of file +{"name": "vega-datasets", "description": "Common repository for example datasets used by Vega related projects. \nBSD-3-Clause license applies only to package code and infrastructure. Users should verify their use of datasets \ncomplies with the license terms of the original sources. 
Dataset license information, where included, \nis a reference starting point only and is provided without any warranty of accuracy or completeness.\n", "homepage": "http://github.com/vega/vega-datasets.git", "licenses": [{"name": "BSD-3-Clause", "path": "https://opensource.org/license/bsd-3-clause", "title": "The 3-Clause BSD License"}], "contributors": [{"title": "UW Interactive Data Lab", "path": "http://idl.cs.washington.edu"}, {"title": "vega-datasets contributors", "path": "https://github.com/vega/vega-datasets/graphs/contributors"}], "version": "2.11.0", "created": "2025-01-12T14:23:04.938086+00:00", "resources": [{"name": "7zip.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "7zip.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "hash": "sha1:6586d6c00887cd48850099c174a42bb1677ade0c", "bytes": 3969}, {"name": "airports.csv", "type": "table", "path": "airports.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:608ba6d51fa70584c3fa1d31eb94533302553838", "bytes": 210365, "schema": {"fields": [{"name": "iata", "type": "string"}, {"name": "name", "type": "string"}, {"name": "city", "type": "string"}, {"name": "state", "type": "string"}, {"name": "country", "type": "string"}, {"name": "latitude", "type": "number"}, {"name": "longitude", "type": "number"}]}}, {"name": "annual-precip.json", "type": "json", "description": "A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell.", "sources": [{"title": "Climate Forecast System Version 2", "path": "https://www.ncdc.noaa.gov/data-access/model-data/model-datasets/climate-forecast-system-version2-cfsv2"}], "path": "annual-precip.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:719e73406cfc08f16dda651513ae1113edd75845", "bytes": 266265}, {"name": "anscombe.json", "type": "table", "description": "Graphs in Statistical Analysis, F. J. Anscombe, The American Statistician.", "path": "anscombe.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:11ae97090b6263bdf0c8661156a44a5b782e0787", "bytes": 1703, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Series", "type": "string"}, {"name": "X", "type": "integer"}, {"name": "Y", "type": "number"}]}}, {"name": "barley.json", "type": "table", "description": "The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites.\n\nIt was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper \"Statistical Determination of Barley Varietal Adaption\".\n\nR.A. 
Fisher popularized its use in the field of statistics when he included it in his book \"The Design of Experiments\".\n\nSince then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s.\n", "sources": [{"title": "The Design of Experiments Reference", "path": "https://en.wikipedia.org/wiki/The_Design_of_Experiments"}, {"title": "Trellis Charts Paper", "path": "http://ml.stat.purdue.edu/stat695t/writings/TrellisDesignControl.pdf"}], "path": "barley.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:8dc50de2509b6e197ce95c24c98f90d9d1ab138c", "bytes": 8487, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "yield", "type": "number"}, {"name": "variety", "type": "string"}, {"name": "year", "type": "integer"}, {"name": "site", "type": "string"}]}}, {"name": "birdstrikes.csv", "type": "table", "description": "Records of reported wildlife strikes received by the U.S. FAA", "sources": [{"title": "FAA Wildlife Strike Database", "path": "http://wildlife.faa.gov"}], "path": "birdstrikes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:1b8b190c9bc02ef7bcbfe5a8a70f61b1616d3f6c", "bytes": 1223329, "schema": {"fields": [{"name": "Airport Name", "type": "string"}, {"name": "Aircraft Make Model", "type": "string"}, {"name": "Effect Amount of damage", "type": "string"}, {"name": "Flight Date", "type": "date"}, {"name": "Aircraft Airline Operator", "type": "string"}, {"name": "Origin State", "type": "string"}, {"name": "Phase of flight", "type": "string"}, {"name": "Wildlife Size", "type": "string"}, {"name": "Wildlife Species", "type": "string"}, {"name": "Time of day", "type": "string"}, {"name": "Cost Other", "type": "integer"}, {"name": "Cost Repair", "type": "integer"}, {"name": "Cost Total $", "type": "integer"}, {"name": "Speed IAS in knots", "type": "integer"}]}}, {"name": "budget.json", "type": "table", "description": "Historical and forecasted federal revenue/receipts produced in 2016 by the U.S. 
Office of Management and Budget.", "sources": [{"title": "Office of Management and Budget - Budget FY 2016 - Receipts", "path": "https://www.govinfo.gov/app/details/BUDGET-2016-DB/BUDGET-2016-DB-3"}], "path": "budget.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:5b18c08b28fb782f54ca98ce6a1dd220f269adf1", "bytes": 391353, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Source Category Code", "type": "integer"}, {"name": "Source category name", "type": "string"}, {"name": "Source subcategory", "type": "integer"}, {"name": "Source subcategory name", "type": "string"}, {"name": "Agency code", "type": "integer"}, {"name": "Agency name", "type": "string"}, {"name": "Bureau code", "type": "integer"}, {"name": "Bureau name", "type": "string"}, {"name": "Account code", "type": "integer"}, {"name": "Account name", "type": "string"}, {"name": "Treasury Agency code", "type": "integer"}, {"name": "On- or off-budget", "type": "string"}, {"name": "1962", "type": "string"}, {"name": "1963", "type": "string"}, {"name": "1964", "type": "string"}, {"name": "1965", "type": "string"}, {"name": "1966", "type": "string"}, {"name": "1967", "type": "string"}, {"name": "1968", "type": "string"}, {"name": "1969", "type": "string"}, {"name": "1970", "type": "string"}, {"name": "1971", "type": "string"}, {"name": "1972", "type": "string"}, {"name": "1973", "type": "string"}, {"name": "1974", "type": "string"}, {"name": "1975", "type": "string"}, {"name": "1976", "type": "string"}, {"name": "TQ", "type": "string"}, {"name": "1977", "type": "string"}, {"name": "1978", "type": "string"}, {"name": "1979", "type": "string"}, {"name": "1980", "type": "string"}, {"name": "1981", "type": "string"}, {"name": "1982", "type": "string"}, {"name": "1983", "type": "string"}, {"name": "1984", "type": "string"}, {"name": "1985", "type": "string"}, {"name": "1986", "type": "string"}, {"name": "1987", "type": "string"}, {"name": "1988", "type": "string"}, {"name": "1989", "type": "string"}, {"name": "1990", "type": "string"}, {"name": "1991", "type": "string"}, {"name": "1992", "type": "string"}, {"name": "1993", "type": "string"}, {"name": "1994", "type": "string"}, {"name": "1995", "type": "string"}, {"name": "1996", "type": "string"}, {"name": "1997", "type": "string"}, {"name": "1998", "type": "string"}, {"name": "1999", "type": "string"}, {"name": "2000", "type": "string"}, {"name": "2001", "type": "string"}, {"name": "2002", "type": "string"}, {"name": "2003", "type": "string"}, {"name": "2004", "type": "string"}, {"name": "2005", "type": "string"}, {"name": "2006", "type": "string"}, {"name": "2007", "type": "string"}, {"name": "2008", "type": "string"}, {"name": "2009", "type": "string"}, {"name": "2010", "type": "string"}, {"name": "2011", "type": "string"}, {"name": "2012", "type": "string"}, {"name": "2013", "type": "string"}, {"name": "2014", "type": "string"}, {"name": "2015", "type": "string"}, {"name": "2016", "type": "string"}, {"name": "2017", "type": "string"}, {"name": "2018", "type": "string"}, {"name": "2019", "type": "string"}, {"name": "2020", "type": "string"}]}}, {"name": "budgets.json", "type": "table", "path": "budgets.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:8a909e24f698a3b0f6c637c30ec95e7e17df7ef6", "bytes": 18079, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "budgetYear", "type": "integer"}, {"name": "forecastYear", "type": "integer"}, 
{"name": "value", "type": "number"}]}}, {"name": "burtin.json", "type": "table", "description": "The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine.\n\nThe dataset compares the performance of three antibiotics against 16 different bacteria.\n\nNumerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness.\n\nThe dataset was featured as an example in the Protovis project, a precursor to D3.js.\n\nAs noted in the Protovis example, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 \u03bcg/ml, and an exaggeration of some values for penicillin\".\n\nThe vega-datsets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together.\n\nThe caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) \nreads as follows:\n\n> ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin\n>\n>\n> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in red and gram- in blue) with their sensitivities to penicillin, and streptomycin.\n>\n> The effectiveness of the antibiotics is expressed as the highest dilution in \u03bc/ml. which inhibits the test organism.\n>\n> High dilutions are toward the periphery; consequently the length of the colored bar is proportional to the effectiveness.\n>\n> It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. vulgaris, S. schottmuelleri and M. tuberculosis.\n>\n> Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to neomycin, although the majority of these are sensitive to neomycin.\n>\n> It also inhibits actinomycetes, but is inactive against viruses and fungi. 
Its mode of action is not understood.\n", "sources": [{"title": "Scope Magazine", "path": "https://graphicdesignarchives.org/projects/scope-magazine-vol-iii-5/"}, {"title": "Protovis Antibiotics Example", "path": "https://mbostock.github.io/protovis/ex/antibiotics-burtin.html"}], "path": "burtin.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d8a82abaad7dba4f9cd8cee402ba3bf07e70d0e4", "bytes": 2743, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Bacteria", "type": "string"}, {"name": "Penicillin", "type": "number"}, {"name": "Streptomycin", "type": "number"}, {"name": "Neomycin", "type": "number"}, {"name": "Gram_Staining", "type": "string"}, {"name": "Genus", "type": "string"}]}}, {"name": "cars.json", "type": "table", "description": "Collection of car specifications and performance metrics from various automobile manufacturers.", "sources": [{"title": "StatLib Datasets Archive", "path": "http://lib.stat.cmu.edu/datasets/"}], "path": "cars.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:1d56d3fa6da01af9ece2d6397892fe5bb6f47c3d", "bytes": 100492, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Name", "type": "string"}, {"name": "Miles_per_Gallon", "type": "integer"}, {"name": "Cylinders", "type": "integer"}, {"name": "Displacement", "type": "number"}, {"name": "Horsepower", "type": "integer"}, {"name": "Weight_in_lbs", "type": "integer"}, {"name": "Acceleration", "type": "number"}, {"name": "Year", "type": "date"}, {"name": "Origin", "type": "string"}]}}, {"name": "co2-concentration.csv", "type": "table", "description": "Scripps CO2 program data, modified to only include date, CO2, and seasonally adjusted CO2. \nOnly includes rows with valid data.", "sources": [{"title": "Scripps CO2 Program", "path": "https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record"}], "path": "co2-concentration.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:b8715cbd2a8d0c139020a73fdb4d231f8bde193a", "bytes": 18547, "schema": {"fields": [{"name": "Date", "type": "date"}, {"name": "CO2", "type": "number"}, {"name": "adjusted CO2", "type": "number"}]}}, {"name": "countries.json", "type": "table", "description": "This dataset combines key demographic indicators (life expectancy at birth and\nfertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year\nintervals. It includes both current values and adjacent time period values (previous and next)\nfor each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) \nnotes that its philosophy is to fill data gaps with estimates and use current\ngeographic boundaries for historical data. 
Gapminder states that it aims to \"show people the\nbig picture\" rather than support detailed numeric analysis.", "licenses": [{"title": "Creative Commons Attribution 4.0 International", "path": "https://www.gapminder.org/free-material/"}], "sources": [{"title": "Gapminder Foundation - Life Expectancy", "path": "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Fertility", "path": "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676", "version": "14"}], "path": "countries.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:0070959b7f1a09475baa5099098240ae81026e72", "bytes": 99457, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "_comment", "type": "string"}, {"name": "year", "type": "integer", "description": "Years from 1955 to 2000 at 5-year intervals"}, {"name": "fertility", "type": "number", "description": "Fertility rate (average number of children per woman) for the given year"}, {"name": "life_expect", "type": "number", "description": "Life expectancy in years for the given year"}, {"name": "n_fertility", "type": "number", "description": "Fertility rate for the next 5-year interval"}, {"name": "n_life_expect", "type": "number", "description": "Life expectancy for the next 5-year interval"}, {"name": "country", "type": "string", "description": "Name of the country"}]}}, {"name": "crimea.json", "type": "table", "description": "This dataset, which informed Florence Nightingale's groundbreaking work in public health, details \nmonthly mortality rates from British military hospitals during the Crimean War (1854-1856). \n\nNightingale credits Dr. William Farr for compiling the data from the 1858 [Medical and Surgical \nHistory of the British Army](http://resource.nlm.nih.gov/62510370R). The dataset categorizes \ndeaths into \"zymotic\" diseases (preventable infectious diseases), wounds/injuries, and other causes. \nCovering the period from April 1854 to March 1856, the dataset includes monthly army strength \nalongside mortality figures. Nightingale transformed this data into her now-famous [polar area \ndiagrams](https://iiif.lib.harvard.edu/manifests/view/drs:7420433$25i). \n\nThe annual mortality rates plotted in the chart can be calculated from the dataset using the formula \n> (Deaths × 1000 × 12) ÷ Army Size. \n\nAs [The Lancet](https://pmc.ncbi.nlm.nih.gov/articles/PMC7252134/) argued in 2020, Nightingale's \ninnovative visualizations proved that \"far more men died of disease, infection, and exposure \nthan in battle\u2014a fact that shocked the British nation.\" Her work also vividly illustrated \nthe dramatic impact of sanitary reforms, particularly in reducing preventable deaths.", "sources": [{"title": "Nightingale, Florence. A contribution to the sanitary history of the British army during the late war with Russia. London : John W. Parker and Son, 1859. Table II. 
Table showing the Estimated Average Monthly Strength of the Army; and the Deaths and Annual Rate of Mortality per 1,000 in each month, from April 1854, to March 1856 (inclusive), in the Hospitals of the Army in the East.\n", "path": "https://nrs.lib.harvard.edu/urn-3:hms.count:1177146?n=21"}], "path": "crimea.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d2df500c612051a21fe324237a465a62d5fe01b6", "bytes": 2183, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date", "description": "First day of each month during the observation period, in ISO 8601 format (YYYY-MM-DD)"}, {"name": "wounds", "type": "integer", "description": "Deaths from \"Wounds and Injuries\" which comprised: Luxatio (dislocation), Sub-Luxatio (partial dislocation), Vulnus Sclopitorum (gunshot wounds), Vulnus Incisum (incised wounds), Contusio (bruising), Fractura (fractures), Ambustio (burns) and Concussio-Cerebri (brain concussion)\n"}, {"name": "other", "type": "integer", "description": "Deaths from All Other Causes"}, {"name": "disease", "type": "integer", "description": "Deaths from Zymotic Diseases (preventable infectious diseases)"}, {"name": "army_size", "type": "integer", "description": "Estimated Average Monthly Strength of the Army"}]}}, {"name": "disasters.csv", "type": "table", "description": "Annual number of deaths from disasters.", "sources": [{"title": "Our World in Data - Natural Catastrophes", "path": "https://ourworldindata.org/natural-catastrophes"}], "path": "disasters.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0584ed86190870b0089d9ea67c94f3dd3feb0ec8", "bytes": 18840, "schema": {"fields": [{"name": "Entity", "type": "string"}, {"name": "Year", "type": "integer"}, {"name": "Deaths", "type": "integer"}]}}, {"name": "driving.json", "type": "table", "sources": [{"title": "New York Times", "path": "https://archive.nytimes.com/www.nytimes.com/imagepages/2010/05/02/business/02metrics.html"}], "path": "driving.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:33d0afc57fb1005e69cd3e8a6c77a26670d91979", "bytes": 3461, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "side", "type": "string"}, {"name": "year", "type": "integer"}, {"name": "miles", "type": "integer"}, {"name": "gas", "type": "number"}]}}, {"name": "earthquakes.json", "type": "json", "description": "Earthquake data retrieved Feb 6, 2018", "sources": [{"title": "USGS Earthquake Feed", "path": "https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson"}], "path": "earthquakes.json", "scheme": "file", "format": "geojson", "mediatype": "text/geojson", "encoding": "utf-8", "hash": "sha1:ed4c47436c09d5cc5f428c233fbd8074c0346fd0", "bytes": 1219853}, {"name": "ffox.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "ffox.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "hash": "sha1:0691709484a75e9d8ee55a22b1980d67d239c2c4", "bytes": 17628}, {"name": "flare-dependencies.json", "type": "table", "path": "flare-dependencies.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:10bbe538daaa34014cd5173b331f7d3c10bfda49", "bytes": 34600, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "source", "type": "integer"}, {"name": "target", "type": "integer"}]}}, {"name": 
"flare.json", "type": "table", "path": "flare.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d232ea60f875de87a7d8fc414876e19356a98b6b", "bytes": 20638, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "id", "type": "integer"}, {"name": "name", "type": "string"}]}}, {"name": "flights-10k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-10k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:769a34f3d0442be8f356651463fe925ad8b3759d", "bytes": 892400, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-200k.arrow", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-200k.arrow", "scheme": "file", "format": ".arrow", "mediatype": "application/vnd.apache.arrow.file", "hash": "sha1:74f6b3cf8b779e3ff204be2f5a9762763d50a095", "bytes": 1600864, "schema": {"fields": [{"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "time", "type": "number"}]}}, {"name": "flights-200k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-200k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4722e02637cf5f38ad9ea5d1f48cae7872dce22d", "bytes": 9863892, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "time", "type": "number"}]}}, {"name": "flights-20k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-20k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:20c920b46db4f664bed3e1420b8348527cd7c41e", "bytes": 1784867, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-2k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. 
Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-2k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d9221dc7cd477209bf87e680be3c881d8fee53cd", "bytes": 178495, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-3m.parquet", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-3m.parquet", "scheme": "file", "format": "parquet", "mediatype": "application/parquet", "hash": "sha1:9c4e0b480a1a60954a7e5c6bcc43e1c91a73caaa", "bytes": 13493022, "schema": {"fields": [{"name": "date", "type": "datetime"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-5k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-5k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:8459fa09e3ba8197928b5dba0b9f5cc380629758", "bytes": 446167, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-airport.csv", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-airport.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0ba03114891e97cfc3f83d9e3569259e7f07af7b", "bytes": 65572, "schema": {"fields": [{"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}, {"name": "count", "type": "integer"}]}}, {"name": "football.json", "type": "table", "description": "Football match outcomes across multiple divisions from 2013 to 2017, part of a\nlarger dataset from OpenFootball. 
The subset was made such that there are records for all five\nchosen divisions over the time period.", "sources": [{"title": "OpenFootball", "path": "https://github.com/openfootball/football.json"}], "path": "football.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d07898748997b9716ae699e9c2d5b91b4bb48a51", "bytes": 1207180, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "division", "type": "string"}, {"name": "home_team", "type": "string"}, {"name": "away_team", "type": "string"}, {"name": "home_score", "type": "integer"}, {"name": "away_score", "type": "integer"}]}}, {"name": "gapminder-health-income.csv", "type": "table", "description": "Per-capita income, life expectancy, population and regional grouping. Dataset does not specify \nthe reference year for the data. Gapminder historical data is subject to revisions.\n\nGapminder (v30, 2023) defines per-capita income as follows:\n>\"This is real GDP per capita (gross domestic product per person adjusted for inflation) \n>converted to international dollars using purchasing power parity rates. An international dollar \n>has the same purchasing power over GDP as the U.S. dollar has in the United States.\"\n", "licenses": [{"title": "Creative Commons Attribution 4.0 International", "path": "https://www.gapminder.org/free-material/"}], "sources": [{"title": "Gapminder Foundation", "path": "https://www.gapminder.org"}, {"title": "Gapminder GDP Per Capita Data", "path": "https://docs.google.com/spreadsheets/d/1i5AEui3WZNZqh7MQ4AKkJuCz4rRxGR_pw_9gtbcBOqQ/edit?gid=501532268#gid=501532268"}], "path": "gapminder-health-income.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:abce37a932917085023a345b1a004396e9355ac3", "bytes": 8605, "schema": {"fields": [{"name": "country", "type": "string"}, {"name": "income", "type": "integer"}, {"name": "health", "type": "number"}, {"name": "population", "type": "integer"}, {"name": "region", "type": "string"}]}}, {"name": "gapminder.json", "type": "table", "description": "This dataset combines key demographic indicators (life expectancy at birth, \npopulation, and fertility rate measured as babies per woman) for various countries from 1955 \nto 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable \ngrouping countries. Gapminder's data documentation notes that its philosophy is to fill data \ngaps with estimates and use current geographic boundaries for historical data. Gapminder \nstates that it aims to \"show people the big picture\" rather than support detailed numeric \nanalysis.\n\nNotes:\n1. Country Selection: The set of countries in this file matches the version of this dataset \n originally added to this collection in 2015. The specific criteria for country selection \n in that version are not known. Data for Aruba are no longer available in the new version. \n Hong Kong has been revised to Hong Kong, China in the new version.\n\n2. Data Precision: The precision of float values may have changed from the original version. \n These changes reflect the most recent source data used for each indicator.\n\n3. Regional Groupings: The 'cluster' column represents a regional mapping of countries \n corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. 
To \n   preserve continuity with previous versions of this dataset, we have retained the column \n   name 'cluster' instead of renaming it to 'six_regions'. The six regions represented are: \n   `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`.", "sources": [{"title": "Gapminder Foundation - Life Expectancy (Data)", "path": "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Life Expectancy (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd004/"}, {"title": "Gapminder Foundation - Population (Data)", "path": "https://docs.google.com/spreadsheets/d/1c1luQNdpH90tNbMIeU7jD__59wQ0bdIGRFpbMm8ZBTk/edit?gid=176703676#gid=176703676", "version": "7"}, {"title": "Gapminder Foundation - Population (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd003/"}, {"title": "Gapminder Foundation - Fertility (Data)", "path": "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Fertility Documentation (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd008/"}, {"title": "Gapminder Foundation - Data Geographies (Data)", "path": "https://docs.google.com/spreadsheets/d/1qHalit8sXC0R8oVXibc2wa2gY7bkwGzOybEMTWp-08o/edit?gid=1597424158#gid=1597424158", "version": "2"}, {"title": "Gapminder Foundation - Data Geographies (Documentation)", "path": "https://www.gapminder.org/data/geo/"}, {"title": "Gapminder Data Documentation", "path": "https://www.gapminder.org/data/documentation/"}], "path": "gapminder.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:8cb2f0fc23ce612e5f0c7bbe3dcac57f6764b7b3", "bytes": 75201, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer", "description": "Years from 1955 to 2005 at 5-year intervals"}, {"name": "country", "type": "string", "description": "Name of the country"}, {"name": "cluster", "type": "integer", "description": "A categorical variable (values 0-5) grouping countries by region"}, {"name": "pop", "type": "integer", "description": "Population of the country"}, {"name": "life_expect", "type": "number", "description": "Life expectancy in years"}, {"name": "fertility", "type": "number", "description": "Fertility rate (average number of children per woman)"}]}}, {"name": "gimp.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "gimp.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "hash": "sha1:cf0505dd72eb52558f6f71bd6f43663df4f2f82c", "bytes": 8211}, {"name": "github.csv", "type": "table", "description": "Generated using `/scripts/github.py`.", "path": "github.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:18547064dd687c328ea2fb5023cae6417ca6f050", "bytes": 21059, "schema": {"fields": [{"name": "time", "type": "string"}, {"name": "count", "type": "integer"}]}}, {"name": "global-temp.csv", "type": "table", "description": "Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023.", "sources": [{"title": "NASA Goddard Institute for Space Studies", "path": "https://data.giss.nasa.gov/gistemp/"}], "path": 
"global-temp.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:01a4f05ed45ce939307dcd9bc4e75ed5cd1ab202", "bytes": 1663, "schema": {"fields": [{"name": "year", "type": "integer"}, {"name": "temp", "type": "number"}]}}, {"name": "income.json", "type": "table", "path": "income.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:ebfd02fd584009ee391bfc5d33972e4c94f507ab", "bytes": 72771, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "region", "type": "string"}, {"name": "id", "type": "integer"}, {"name": "pct", "type": "number"}, {"name": "total", "type": "integer"}, {"name": "group", "type": "string"}]}}, {"name": "iowa-electricity.csv", "type": "table", "description": "The state of Iowa has dramatically increased its production of renewable \nwind power in recent years. This file contains the annual net generation of electricity in \nthe state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. \nIt is useful for illustrating stacked area charts.", "sources": [{"title": "U.S. Energy Information Administration", "path": "https://www.eia.gov/beta/electricity/data/browser/#/topic/0?agg=2,0,1&fuel=vvg&geo=00000g&sec=g&linechart=ELEC.GEN.OTH-IA-99.A~ELEC.GEN.COW-IA-99.A~ELEC.GEN.PEL-IA-99.A~ELEC.GEN.PC-IA-99.A~ELEC.GEN.NG-IA-99.A~~ELEC.GEN.NUC-IA-99.A~ELEC.GEN.HYC-IA-99.A~ELEC.GEN.AOR-IA-99.A~ELEC.GEN.HPS-IA-99.A~&columnchart=ELEC.GEN.ALL-IA-99.A&map=ELEC.GEN.ALL-IA-99.A&freq=A&start=2001&end=2017&ctype=linechart<ype=pin&tab=overview&maptype=0&rse=0&pin="}], "path": "iowa-electricity.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:214238f23d7a57e3398f4e9f1e87e61abb23cafc", "bytes": 1531, "schema": {"fields": [{"name": "year", "type": "date"}, {"name": "source", "type": "string"}, {"name": "net_generation", "type": "integer"}]}}, {"name": "jobs.json", "type": "table", "description": "U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790.\n\nOriginally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Vi\u00e9gas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). \nThe dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/).\n\nData is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of IPUMS USA, according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions).\n\nIPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, stating:\n>We're excited to hear that this dataset made its way to this repository and is being used by students for data visualization. 
We allow for these types of redistributions of summary data so long as the underlying microdata records are not shared.\n\nThis dataset contains only summary statistics and does not include any underlying microdata records.\n\n1. This dataset represents summary data. The underlying microdata records are not included.\n2. Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) \n(person weight) variable as an expansion factor when working with IPUMS USA extracts.\n3. Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly.\n\nWhen using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml).\nThe organization requests use of the following citation for this json file:\n\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0\n", "sources": [{"title": "IPUMS USA", "path": "https://usa.ipums.org/usa/", "version": "6.0"}], "path": "jobs.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:69d386f47305f4d8fd2886e805004fbdd71568e9", "bytes": 936649, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "job", "type": "string", "description": "The occupation title"}, {"name": "sex", "type": "string", "description": "Sex (men/women)"}, {"name": "year", "type": "integer", "description": "Census year"}, {"name": "count", "type": "integer", "description": "Number of individuals in the occupation"}, {"name": "perc", "type": "number", "description": "Percentage of the workforce in the occupation"}]}}, {"name": "la-riots.csv", "type": "table", "description": "More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles \nfor five days starting on April 29, 1992. This file contains metadata about each person, including the geographic \ncoordinates of their death. Compiled and published by the Los Angeles Times Data Desk.", "sources": [{"title": "LA Riots Deaths, Los Angeles Times Data Desk", "path": "http://spreadsheets.latimes.com/la-riots-deaths/"}], "path": "la-riots.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:94ee8ad8198d2954f77e3a98268d8b1f7fe7d086", "bytes": 7432, "schema": {"fields": [{"name": "first_name", "type": "string"}, {"name": "last_name", "type": "string"}, {"name": "age", "type": "integer"}, {"name": "gender", "type": "string"}, {"name": "race", "type": "string"}, {"name": "death_date", "type": "date"}, {"name": "address", "type": "string"}, {"name": "neighborhood", "type": "string"}, {"name": "type", "type": "string"}, {"name": "longitude", "type": "number"}, {"name": "latitude", "type": "number"}]}}, {"name": "londonboroughs.json", "type": "json", "description": "Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. 
\nOriginal data \"contains National Statistics data \u00a9 Crown copyright and database right (2015)\" \nand \"Contains Ordnance Survey data \u00a9 Crown copyright and database right [2015].", "sources": [{"title": "Statistical GIS Boundary Files, London Datastore", "path": "https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london"}], "path": "londonBoroughs.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "hash": "sha1:d90805055ffdfe5163a7655c4847dc61df45f92b", "bytes": 14732}, {"name": "londoncentroids.json", "type": "table", "description": "Calculated from `londonBoroughs.json` using [`d3.geoCentroid`](https://d3js.org/d3-geo/math#geoCentroid).", "path": "londonCentroids.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:2e24c01140cfbcad5e1c859be6df4efebca2fbf5", "bytes": 2339, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "cx", "type": "number"}, {"name": "cy", "type": "number"}]}}, {"name": "londontubelines.json", "type": "json", "description": "Selected rail lines simplified from source.", "sources": [{"title": "London Tube Data", "path": "https://github.com/oobrien/vis/tree/master/tube/data"}], "path": "londonTubeLines.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "hash": "sha1:1b21ea5339320090b106082bd9d39a1055aadb18", "bytes": 80097}, {"name": "lookup_groups.csv", "type": "table", "path": "lookup_groups.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:741df36729a9d84d18ec42f23a386b53e7e3c428", "bytes": 77, "schema": {"fields": [{"name": "group", "type": "integer"}, {"name": "person", "type": "string"}]}}, {"name": "lookup_people.csv", "type": "table", "path": "lookup_people.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:c79f69afb3ff81a0c8ddc01f5cf2f078e288457c", "bytes": 125, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "integer"}, {"name": "height", "type": "integer"}]}}, {"name": "miserables.json", "type": "json", "path": "miserables.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:a8b0faaa94c7425c49fe36ea1a93319430fec426", "bytes": 12372}, {"name": "monarchs.json", "type": "table", "description": "A chronological list of English and British monarchs from Elizabeth I through George IV.\nEach entry includes:\n\nThe dataset contains two intentional inaccuracies to maintain compatibility with \nthe [Wheat and Wages](https://vega.github.io/vega/examples/wheat-and-wages/) example visualization:\n1. the start date for the reign of Elizabeth I is shown as 1565, instead of 1558;\n2. the end date for the reign of George IV is shown as 1820, instead of 1830.\nThese discrepancies align the `monarchs.json` dataset with the start and end dates of the `wheat.json` dataset used i the visualization.\nThe entry \"W&M\" represents the joint reign of William III and Mary II. While the dataset shows their reign as 1689-1702, \nthe official Web site of the British royal family indicates that Mary II's reign ended in 1694, though William III continued to rule until 1702.\nThe `commonwealth` field is used to flag the period from 1649 to 1660, which includes the Commonwealth of England, the Protectorate, \nand the period leading to the Restoration. 
While historically more accurate to call this the \"interregnum,\" the field name of `commonwealth` \nfrom the original dataset is retained for backwards compatibility.\nThe dataset was revised in Aug. 2024. James II's reign now ends in 1688 (previously 1689).\nSource data has been verified against the kings & queens and interregnum pages of the official website of the British royal family (retrieved in Aug. 2024).\nContent on the site is protected by Crown Copyright. \nUnder the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most \nCrown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/).", "sources": [{"title": "The Royal Family - Kings & Queens", "path": "https://www.royal.uk/kings-and-queens-1066"}, {"title": "The Royal Family - Interregnum", "path": "https://www.royal.uk/interregnum-1649-1660"}], "path": "monarchs.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:921dfa487a4198cfe78f743aa0aa87ad921642df", "bytes": 683, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string", "description": "The ruler's name or identifier (e.g., \"W&M\" for William and Mary, \"Cromwell\" for the period of interregnum)"}, {"name": "start", "type": "integer", "description": "The year their rule began"}, {"name": "end", "type": "integer", "description": "The year their rule ended"}, {"name": "index", "type": "integer", "description": "A zero-based sequential number assigned to each entry, representing the chronological order of rulers"}]}}, {"name": "movies.json", "type": "table", "description": "The dataset has well known and intentionally included errors. 
\nThis dataset is provided for instructional purposes, including the need to reckon with dirty data.", "path": "movies.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:e38178f99454568c5160fc759184a1a1471cc558", "bytes": 1399981, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Title", "type": "string"}, {"name": "US Gross", "type": "integer"}, {"name": "Worldwide Gross", "type": "integer"}, {"name": "US DVD Sales", "type": "integer"}, {"name": "Production Budget", "type": "integer"}, {"name": "Release Date", "type": "string"}, {"name": "MPAA Rating", "type": "string"}, {"name": "Running Time min", "type": "integer"}, {"name": "Distributor", "type": "string"}, {"name": "Source", "type": "string"}, {"name": "Major Genre", "type": "string"}, {"name": "Creative Type", "type": "string"}, {"name": "Director", "type": "string"}, {"name": "Rotten Tomatoes Rating", "type": "integer"}, {"name": "IMDB Rating", "type": "number"}, {"name": "IMDB Votes", "type": "integer"}]}}, {"name": "normal-2d.json", "type": "table", "path": "normal-2d.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4303306ec275209fcba008cbd3a5f29c9e612424", "bytes": 34398, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "u", "type": "number"}, {"name": "v", "type": "number"}]}}, {"name": "obesity.json", "type": "table", "path": "obesity.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:6da8129ed0b0333c88302e153824b06f7859aac9", "bytes": 2202, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "id", "type": "integer"}, {"name": "rate", "type": "number"}, {"name": "state", "type": "string"}]}}, {"name": "ohlc.json", "type": "table", "description": "This dataset contains the performance of the Chicago Board Options Exchange \n[Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/%5EVIX#overview))\nin the summer of 2009.", "sources": [{"title": "Yahoo Finance VIX Data", "path": "https://finance.yahoo.com/chart/%5EVIX"}], "path": "ohlc.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:9b3d93e8479d3ddeee29b5e22909132346ac0a3b", "bytes": 5737, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "open", "type": "number"}, {"name": "high", "type": "number"}, {"name": "low", "type": "number"}, {"name": "close", "type": "number"}, {"name": "signal", "type": "string"}, {"name": "ret", "type": "number"}]}}, {"name": "penguins.json", "type": "table", "description": "Palmer Archipelago (Antarctica) penguin data collected and made available by \n[Dr. 
Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) \nand the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research \nNetwork](https://lternet.edu/).", "sources": [{"title": "Palmer Station Antarctica LTER", "path": "https://pal.lternet.edu/"}, {"title": "Allison Horst's Penguins Repository", "path": "https://github.com/allisonhorst/penguins"}], "path": "penguins.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:517b6d3267174b1b65691a37cbd59c1739155866", "bytes": 67119, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Species", "type": "string"}, {"name": "Island", "type": "string"}, {"name": "Beak Length (mm)", "type": "number"}, {"name": "Beak Depth (mm)", "type": "number"}, {"name": "Flipper Length (mm)", "type": "integer"}, {"name": "Body Mass (g)", "type": "integer"}, {"name": "Sex", "type": "string"}]}}, {"name": "platformer-terrain.json", "type": "table", "description": "Assets from the video game Celeste.", "sources": [{"title": "Celeste Game", "path": "http://www.celestegame.com/"}], "path": "platformer-terrain.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:01df4411cb16bf758fe8ffa6529507419189edc2", "bytes": 1424097, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "x", "type": "integer"}, {"name": "y", "type": "integer"}, {"name": "lumosity", "type": "number"}, {"name": "saturation", "type": "integer"}, {"name": "name", "type": "string"}, {"name": "id", "type": "string"}, {"name": "color", "type": "string"}, {"name": "key", "type": "string"}]}}, {"name": "points.json", "type": "table", "path": "points.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4716a117308962f3596179d7d7d2ad729a19cda7", "bytes": 4926, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "x", "type": "number"}, {"name": "y", "type": "number"}]}}, {"name": "political-contributions.json", "type": "table", "description": "Summary financial information on contributions to candidates for U.S. \nelections. An updated version of this datset is available from the \"all candidates\" files \n(in pipe-delimited format) on the bulk data download page of the U.S. Federal Election \nCommission, or, alternatively, via OpenFEC. Information on each of the 25 columns is \navailable from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/).\nThe sample dataset in `political-contributions.json` contains 58 records with dates from 2015.\n\nFEC data is subject to the commission's:\n- [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/)\n- [Privacy and Security Policy](https://www.fec.gov/about/privacy-and-security-policy/)\n- [Acceptable Use Policy](https://github.com/fecgov/FEC/blob/master/ACCEPTABLE-USE-POLICY.md)\n\nAdditionally, the FEC's Github [repository](https://github.com/fecgov/FEC) states:\n> This project is in the public domain within the United States, and we waive worldwide \n> copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/)\n> dedication. Read more on our license page.\n> A few restrictions limit the way you can use FEC data. For example, you can't use \n> contributor lists for commercial purposes or to solicit donations. 
Learn more on \n> [FEC.gov](https://www.fec.gov/).", "sources": [{"title": "Federal Election Commission Bulk Data", "path": "https://www.fec.gov/data/browse-data/?tab=bulk-data"}, {"title": "OpenFEC API", "path": "https://api.open.fec.gov/developers/"}], "path": "political-contributions.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4aa2e19fa392cc9448aa8ffbdad15b014371f499", "bytes": 50265, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Candidate_Identification", "type": "string"}, {"name": "Candidate_Name", "type": "string"}, {"name": "Incumbent_Challenger_Status", "type": "string"}, {"name": "Party_Code", "type": "integer"}, {"name": "Party_Affiliation", "type": "string"}, {"name": "Total_Receipts", "type": "number"}, {"name": "Transfers_from_Authorized_Committees", "type": "integer"}, {"name": "Total_Disbursements", "type": "number"}, {"name": "Transfers_to_Authorized_Committees", "type": "number"}, {"name": "Beginning_Cash", "type": "number"}, {"name": "Ending_Cash", "type": "number"}, {"name": "Contributions_from_Candidate", "type": "number"}, {"name": "Loans_from_Candidate", "type": "integer"}, {"name": "Other_Loans", "type": "integer"}, {"name": "Candidate_Loan_Repayments", "type": "number"}, {"name": "Other_Loan_Repayments", "type": "integer"}, {"name": "Debts_Owed_By", "type": "number"}, {"name": "Total_Individual_Contributions", "type": "integer"}, {"name": "Candidate_State", "type": "string"}, {"name": "Candidate_District", "type": "integer"}, {"name": "Contributions_from_Other_Political_Committees", "type": "integer"}, {"name": "Contributions_from_Party_Committees", "type": "integer"}, {"name": "Coverage_End_Date", "type": "string"}, {"name": "Refunds_to_Individuals", "type": "integer"}, {"name": "Refunds_to_Committees", "type": "integer"}]}}, {"name": "population.json", "type": "table", "description": "United States population statistics by sex and age group across decades between 1850 and 2000. \nThe dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census \nmicrodata\" from as early as 1790.\n\nIPUMS updates and revises datasets over time, which may result in discrepancies between this \ndataset and current IPUMS data. Details on data revisions are available here.\n\nWhen using this dataset, please refer to IPUMS USA terms of use. The organization requests the \nuse of the following citation for this json file:\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated \nPublic Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. \nhttp://doi.org/10.18128/D010.V6.0\n", "sources": [{"title": "IPUMS USA", "path": "https://usa.ipums.org/usa/"}], "path": "population.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:680fd336e777314198450721c31227a11f02411f", "bytes": 27665, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer", "description": "Four-digit year of the survey"}, {"name": "age", "type": "integer", "description": "Age group in 5-year intervals (0=0-4, 5=5-9, 10=10-14, ..., 90=90+)"}, {"name": "sex", "type": "integer", "description": "Sex (1=men, 2=women)"}, {"name": "people", "type": "integer", "description": "Number of individuals (IPUMS PERWT)"}]}}, {"name": "population_engineers_hurricanes.csv", "type": "table", "description": "Per-state data on population, number of engineers, and hurricanes. 
Used in Vega-Lite example,\n[Three Choropleths Representing Disjoint Data from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html)", "sources": [{"title": "Bureau of Labor Statistics", "path": "https://www.bls.gov/oes/tables.htm"}, {"title": "American Community Survey", "path": "https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_07_3YR_S1901&prodType=table"}, {"title": "NOAA National Climatic Data Center", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/records"}], "path": "population_engineers_hurricanes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:3bad66ef911b93c641edc21f2034302348bffaf9", "bytes": 1852, "schema": {"fields": [{"name": "state", "type": "string"}, {"name": "id", "type": "integer"}, {"name": "population", "type": "integer"}, {"name": "engineers", "type": "number"}, {"name": "hurricanes", "type": "integer"}]}}, {"name": "seattle-weather-hourly-normals.csv", "type": "table", "description": "Hourly weather normals with metric units. The 1981-2010 Climate Normals are \nNCDC's three-decade averages of climatological variables, including temperature and \nprecipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf).\nWe only included temperature, wind, and pressure \nand updated the format to be easier to parse.", "sources": [{"title": "NOAA National Climatic Data Center (NCDC)", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/normals"}], "path": "seattle-weather-hourly-normals.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:d55461adc9742bb061f6072b694aaf73e8b529db", "bytes": 311148, "schema": {"fields": [{"name": "date", "type": "datetime"}, {"name": "pressure", "type": "number"}, {"name": "temperature", "type": "number"}, {"name": "wind", "type": "number"}]}}, {"name": "seattle-weather.csv", "type": "table", "description": "Daily weather records with metric units. Transformed using `/scripts/weather.py`. \nThe categorical \"weather\" field is synthesized from multiple fields in the original dataset. 
\nThis data is intended for instructional purposes.", "sources": [{"title": "NOAA National Climatic Data Center", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/records"}], "path": "seattle-weather.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0f38b53bdc1c42c5e5d484f33b9d4d7b229e0e59", "bytes": 48219, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "precipitation", "type": "number"}, {"name": "temp_max", "type": "number"}, {"name": "temp_min", "type": "number"}, {"name": "wind", "type": "number"}, {"name": "weather", "type": "string"}]}}, {"name": "sp500-2000.csv", "type": "table", "description": "S&P 500 index values from 2000 to 2020.", "sources": [{"title": "Yahoo Finance", "path": "https://finance.yahoo.com/quote/%5EDJI/history/"}], "path": "sp500-2000.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:b82f20656d0521801db7c5599a6c990415a8aaff", "bytes": 415968, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "open", "type": "number"}, {"name": "high", "type": "number"}, {"name": "low", "type": "number"}, {"name": "close", "type": "number"}, {"name": "adjclose", "type": "number"}, {"name": "volume", "type": "integer"}]}}, {"name": "sp500.csv", "type": "table", "path": "sp500.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0eb287fb7c207f4ed392821d67a92267180fc8cf", "bytes": 2305, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "price", "type": "number"}]}}, {"name": "stocks.csv", "type": "table", "path": "stocks.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:58e2ce1bed01eeebe29f5b4be32344aaec5532c0", "bytes": 12245, "schema": {"fields": [{"name": "symbol", "type": "string"}, {"name": "date", "type": "string"}, {"name": "price", "type": "number"}]}}, {"name": "udistrict.json", "type": "table", "path": "udistrict.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:65675107d81c19ffab260ac1f235f3e477fe8982", "bytes": 6460, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "key", "type": "string"}, {"name": "lat", "type": "number"}]}}, {"name": "unemployment-across-industries.json", "type": "table", "description": "Industry-level unemployment statistics from the Current Population Survey \n(CPS), published monthly by the U.S. Bureau of Labor Statistics. Includes unemployed persons \nand unemployment rate across 11 private industries, as well as agricultural, government, and \nself-employed workers. Covers January 2000 through February 2010. Industry classification \nfollows format of CPS Table A-31.\n\nThe dataset can be replicated using the BLS API. For more, see the `scripts` folder of this \nrepository.\n\nThe BLS Web site states:\n> \"Users of the public API should cite the date that data were accessed or retrieved using \n> the API. Users must clearly state that \"BLS.gov cannot vouch for the data or analyses \n> derived from these data after the data have been retrieved from BLS.gov.\" The BLS.gov logo \n> may not be used by persons who are not BLS employees or on products (including web pages) \n> that are not BLS-sponsored.\"\n\nSee full BLS [terms of service](https://www.bls.gov/developers/termsOfService.htm).", "sources": [{"title": "U.S. 
Census Bureau Current Population Survey", "path": "https://www.census.gov/programs-surveys/cps.html"}, {"title": "BLS LAUS Data Tools", "path": "https://www.bls.gov/lau/data.htm"}, {"title": "Bureau of Labor Statistics Table A-31", "path": "https://www.bls.gov/web/empsit/cpseea31.htm"}], "path": "unemployment-across-industries.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4d769356c95c40a9807a7d048ab81aa56ae77df0", "bytes": 185641, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "series", "type": "string", "description": "Industry name"}, {"name": "year", "type": "integer", "description": "Year (2000-2010)"}, {"name": "month", "type": "integer", "description": "Month (1-12)"}, {"name": "count", "type": "integer", "description": "Number of unemployed persons (in thousands)"}, {"name": "rate", "type": "number", "description": "Unemployment rate (percentage)"}, {"name": "date", "type": "datetime", "description": "ISO 8601-formatted date string (e.g., \"2000-01-01T08:00:00.000Z\")"}]}}, {"name": "unemployment.tsv", "type": "table", "description": "This dataset contains county-level unemployment rates in the United States, with data generally\nconsistent with levels reported in 2009. The dataset is structured as tab-separated values.\nThe unemployment rate represents the number of unemployed persons as a percentage of the labor\nforce. According to the Bureau of Labor Statistics (BLS) glossary:\n\nUnemployed persons (Current Population Survey) [are] persons aged 16 years and older who had\nno employment during the reference week, were available for work, except for temporary\nillness, and had made specific efforts to find employment sometime during the 4-week period\nending with the reference week. Persons who were waiting to be recalled to a job from which\nthey had been laid off need not have been looking for work to be classified as unemployed.\n\nThis dataset is derived from the [Local Area Unemployment Statistics (LAUS)](https://www.bls.gov/lau/) program, \na federal-state cooperative effort overseen by the Bureau of Labor Statistics (BLS). \nThe LAUS program produces monthly and annual employment, unemployment, and labor force data for census regions and divisions,\nstates, counties, metropolitan areas, and many cities and towns.\n\nFor the most up-to-date LAUS data:\n1. **Monthly and Annual Data Downloads**:\n- Visit the [LAUS Data Tools](https://www.bls.gov/lau/data.htm) page for [monthly](https://www.bls.gov/lau/tables.htm#mcounty) \nand [annual](https://www.bls.gov/lau/tables.htm#cntyaa) county data.\n2. 
**BLS Public Data API**:\n- The BLS provides an API for developers to access various datasets, including LAUS data.\n- To use the API for LAUS data, refer to the [LAUS Series ID Formats](https://www.bls.gov/help/hlpforma.htm#LA) to construct your query.\n- API documentation and examples are available on the BLS Developers page.\n\nWhen using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm).", "sources": [{"title": "BLS Developers API", "path": "https://www.bls.gov/developers/"}, {"title": "BLS Handbook of Methods", "path": "https://www.bls.gov/opub/hom/lau/home.htm"}], "path": "unemployment.tsv", "scheme": "file", "format": "tsv", "mediatype": "text/tsv", "encoding": "utf-8", "hash": "sha1:d1aca19c4821fdc3b4270989661a1787d38588d0", "bytes": 34739, "dialect": {"csv": {"delimiter": "\t"}}, "schema": {"fields": [{"name": "id", "type": "integer", "description": "The combined state and county FIPS code"}, {"name": "rate", "type": "number", "description": "The unemployment rate for the county"}]}}, {"name": "uniform-2d.json", "type": "table", "path": "uniform-2d.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:c6120dd8887a0841a9fcc31e247463dbd3d0a996", "bytes": 34217, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "u", "type": "number"}, {"name": "v", "type": "number"}]}}, {"name": "us-10m.json", "type": "json", "path": "us-10m.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "hash": "sha1:ff7a7e679c46f2d1eb85cc92521b990f1a7a5c7a", "bytes": 642361}, {"name": "us-employment.csv", "type": "table", "description": "In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job \nlosses across the United States. The downturn in employment, and the slow recovery in hiring that \nfollowed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau \nof Labor Statistics.\n\nThis file contains the monthly employment total in a variety of job categories from January 2006 \nthrough December 2015. The numbers are seasonally adjusted and reported in thousands. The data \nwere downloaded on Nov. 11, 2018, and reformatted for use in this library.\n\nTotals are included for the [22 \"supersectors\"](https://download.bls.gov/pub/time.series/ce/ce.supersector)\ntracked by the BLS. The \"nonfarm\" total is the category typically used by \neconomists and journalists as a stand-in for the country's employment total.\n\nA calculated \"nonfarm_change\" column has been appended with the month-to-month change in that \nsupersector's employment. It is useful for illustrating how to make bar charts that report both \nnegative and positive values.\n", "sources": [{"title": "U.S. 
Bureau of Labor Statistics Current Employment Statistics", "path": "https://www.bls.gov/ces/"}], "path": "us-employment.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:8795be57cf1e004f4ecba44cab2b324a074330df", "bytes": 17841, "schema": {"fields": [{"name": "month", "type": "date"}, {"name": "nonfarm", "type": "integer"}, {"name": "private", "type": "integer"}, {"name": "goods_producing", "type": "integer"}, {"name": "service_providing", "type": "integer"}, {"name": "private_service_providing", "type": "integer"}, {"name": "mining_and_logging", "type": "integer"}, {"name": "construction", "type": "integer"}, {"name": "manufacturing", "type": "integer"}, {"name": "durable_goods", "type": "integer"}, {"name": "nondurable_goods", "type": "integer"}, {"name": "trade_transportation_utilties", "type": "integer"}, {"name": "wholesale_trade", "type": "number"}, {"name": "retail_trade", "type": "number"}, {"name": "transportation_and_warehousing", "type": "number"}, {"name": "utilities", "type": "number"}, {"name": "information", "type": "integer"}, {"name": "financial_activities", "type": "integer"}, {"name": "professional_and_business_services", "type": "integer"}, {"name": "education_and_health_services", "type": "integer"}, {"name": "leisure_and_hospitality", "type": "integer"}, {"name": "other_services", "type": "integer"}, {"name": "government", "type": "integer"}, {"name": "nonfarm_change", "type": "integer"}]}}, {"name": "us-state-capitals.json", "type": "table", "path": "us-state-capitals.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:9c3211c5058c899412c30f5992a77c54a1b80066", "bytes": 3869, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "lon", "type": "number"}, {"name": "lat", "type": "number"}, {"name": "state", "type": "string"}, {"name": "city", "type": "string"}]}}, {"name": "volcano.json", "type": "json", "description": "Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. \nThis data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a \ntopographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate.", "sources": [{"title": "R Datasets", "path": "https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.html"}], "path": "volcano.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:841151dbfbc5f6db3e19904557abd7a7aad0efd2", "bytes": 21167}, {"name": "weather.csv", "type": "table", "description": "NOAA data transformed using `/scripts/weather.py`. Categorical \"weather\" field synthesized \nfrom multiple fields in the original dataset. 
This data is intended for instructional purposes.", "sources": [{"title": "NOAA Climate Data Online", "path": "http://www.ncdc.noaa.gov/cdo-web/datatools/findstation"}], "path": "weather.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0e7e853f4c5b67615da261d5d343824a43510f50", "bytes": 121417, "schema": {"fields": [{"name": "location", "type": "string"}, {"name": "date", "type": "date"}, {"name": "precipitation", "type": "number"}, {"name": "temp_max", "type": "number"}, {"name": "temp_min", "type": "number"}, {"name": "wind", "type": "number"}, {"name": "weather", "type": "string"}]}}, {"name": "weekly-weather.json", "type": "json", "description": "Instructional dataset showing actual and predicted temperature data.\n\n> [!IMPORTANT]\n> Named `weather.json` in previous versions (`v1.4.0` - `v2.11.0`).\n", "path": "weekly-weather.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:bd42a3e2403e7ccd6baaa89f93e7f0c164e0c185", "bytes": 1281}, {"name": "wheat.json", "type": "table", "description": "In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair),\na Scottish engineer who is often credited as the founder of statistical graphics, \npublished an elegant chart on the price of wheat. It plots 250 years of prices alongside \nweekly wages and the reigning monarch. He intended to demonstrate that:\n> \"never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time.\"\n", "sources": [{"title": "1822 Playfair Chart", "path": "http://dh101.humanities.ucla.edu/wp-content/uploads/2014/08/Vis_2.jpg"}], "path": "wheat.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:cde46b43fc82f4c3c2a37ddcfe99fd5f4d8d8791", "bytes": 2085, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer"}, {"name": "wheat", "type": "number"}, {"name": "wages", "type": "number"}]}}, {"name": "windvectors.csv", "type": "table", "description": "Simulated wind patterns over northwestern Europe.", "path": "windvectors.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:ed686b0ba613abd59d09fcd946b5030a918b8154", "bytes": 129253, "schema": {"fields": [{"name": "longitude", "type": "number"}, {"name": "latitude", "type": "number"}, {"name": "dir", "type": "integer"}, {"name": "dirCat", "type": "integer"}, {"name": "speed", "type": "number"}]}}, {"name": "world-110m.json", "type": "json", "path": "world-110m.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "hash": "sha1:a1ce852de6f2713c94c0c284039506ca2d4f3dee", "bytes": 119410}, {"name": "zipcodes.csv", "type": "table", "description": "GeoNames.org", "sources": [{"title": "GeoNames", "path": "https://www.geonames.org"}], "path": "zipcodes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:d3df33e12be0d0544c95f1bd47005add4b7010be", "bytes": 2018388, "schema": {"fields": [{"name": "zip_code", "type": "integer"}, {"name": "latitude", "type": "number"}, {"name": "longitude", "type": "number"}, {"name": "city", "type": "string"}, {"name": "state", "type": "string"}, {"name": "county", "type": "string"}]}}]} \ No newline at end of file diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index 445974795..ac6ae7087 100644 --- 
a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -70,6 +70,10 @@ def extract_features(pkg: FlPackage, /) -> pl.DataFrame: "encoding", "dialect", "schema", + "sources", + "licenses", + "hash", + "description", ) return ( pl.LazyFrame(pkg["resources"]) @@ -84,6 +88,7 @@ def extract_features(pkg: FlPackage, /) -> pl.DataFrame: ~cs.by_name(DATASET_NAME, EXCLUDE), *FEATURES, col("schema").is_not_null().alias("has_schema"), + col("hash").str.split(":").list.last().alias("sha"), ) .collect() ) diff --git a/tools/datasets/models.py b/tools/datasets/models.py index f8414f739..e2036b4ea 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -255,6 +255,7 @@ class FlResource(TypedDict): "text/topojson", ] encoding: NotRequired[Literal["utf-8"]] + hash: str bytes: int dialect: NotRequired[FlCsvDialect | FlJsonDialect] schema: NotRequired[FlSchema] From 3fa7cacca21ae9f440619aac8df2ce81a606a8c7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 13 Jan 2025 14:21:54 +0000 Subject: [PATCH 144/201] feat: Build dataset url with `datapackage.json` New column deviates from original approach, to support working from `main` https://github.com/vega/altair/blob/e259fbabfc38c3803de0a952f7e2b081a22a3ba3/altair/datasets/_readers.py#L154 --- tools/datasets/datapackage.py | 9 ++++++--- tools/datasets/npm.py | 32 +++++++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index ac6ae7087..49baf7a32 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -42,8 +42,10 @@ ) -def parse_package(pkg: FlPackage, /) -> ParsedPackage: - return ParsedPackage(features=extract_features(pkg), schemas=extract_schemas(pkg)) +def parse_package(pkg: FlPackage, base_url: str, /) -> ParsedPackage: + return ParsedPackage( + features=extract_features(pkg, base_url), schemas=extract_schemas(pkg) + ) def extract_schemas(pkg: FlPackage, /) -> Mapping[Dataset, Mapping[str, FlFieldStr]]: @@ -56,7 +58,7 @@ def extract_schemas(pkg: FlPackage, /) -> Mapping[Dataset, Mapping[str, FlFieldS return m -def extract_features(pkg: FlPackage, /) -> pl.DataFrame: +def extract_features(pkg: FlPackage, base_url: str, /) -> pl.DataFrame: # NOTE: `is_name_collision` != `GitHub.trees`/`Metadata.name_collision` # - This only considers latest version # - Those others are based on whatever tag the tree refers to @@ -89,6 +91,7 @@ def extract_features(pkg: FlPackage, /) -> pl.DataFrame: *FEATURES, col("schema").is_not_null().alias("has_schema"), col("hash").str.split(":").list.last().alias("sha"), + pl.concat_str(pl.lit(base_url), "path").alias("url"), ) .collect() ) diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index fd2aa848d..8f9182c45 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +import string import urllib.request from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Literal @@ -18,6 +19,10 @@ from typing import LiteralString else: from typing_extensions import LiteralString + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias from altair.datasets._typing import Version from tools.datasets.models import ( FlPackage, @@ -25,6 +30,8 @@ ParsedPackage, ) + BranchOrTag: TypeAlias = 'Literal["main"] | Version | LiteralString' # noqa: TC008 + __all__ = ["Npm"] @@ -55,6 +62,19 @@ def 
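# A minimal, standalone sketch of the two derived columns added to `extract_features`
# above. The resource `path`/`hash` values are copied from the `datapackage.json`
# shown earlier in this patch, and the base url mirrors the jsdelivr npm CDN layout
# used in the docstrings later in the series (the pinned version is illustrative).
# This is not the real `extract_features` signature, only the column expressions.
import polars as pl

base_url = "https://cdn.jsdelivr.net/npm/vega-datasets@v2.11.0/data/"
resources = pl.LazyFrame(
    {
        "path": ["sp500.csv", "stocks.csv"],
        "hash": [
            "sha1:0eb287fb7c207f4ed392821d67a92267180fc8cf",
            "sha1:58e2ce1bed01eeebe29f5b4be32344aaec5532c0",
        ],
    }
)
print(
    resources.select(
        # "sha1:<digest>" -> "<digest>", as stored in the new "sha" column
        pl.col("hash").str.split(":").list.last().alias("sha"),
        # prefix every resource path with the version-pinned base url
        pl.concat_str(pl.lit(base_url), pl.col("path")).alias("url"),
    ).collect()
)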
__init__( GH=f"https://cdn.{jsdelivr}.net/gh/vega/{package}@", ) + def dataset_base_url(self, version: BranchOrTag, /) -> LiteralString: + """ + Common url prefix for all datasets derived from ``version``. + + Notes + ----- + - Encodes the endpoint at this stage + - Use github if its the only option (since its slower otherwise) + - npm only has releases/tags (not branches) + - So the column can be renamed ``"url_npm"`` -> ``"url"`` + """ + return f"{self.url.GH if is_branch(version) else self.url.CDN}{version}/data/" + @property def url(self) -> NpmUrl: return self._url @@ -88,7 +108,7 @@ def tags(self) -> pl.DataFrame: def file_gh( self, - branch_or_tag: Literal["main"] | Version | LiteralString, + branch_or_tag: BranchOrTag, path: str, /, ) -> Any: @@ -125,9 +145,15 @@ def file_gh( def datapackage( self, *, tag: LiteralString | None = None, frozen: bool = False ) -> ParsedPackage: + tag = tag or "main" pkg: FlPackage = ( json.loads(self._paths["datapackage"].read_text("utf-8")) if frozen - else self.file_gh(tag or "main", "datapackage.json") + else self.file_gh(tag, "datapackage.json") ) - return datapackage.parse_package(pkg) + + return datapackage.parse_package(pkg, self.dataset_base_url(tag)) + + +def is_branch(s: BranchOrTag, /) -> bool: + return s == "main" or not (s.startswith(tuple("v" + string.digits))) From 34b869e7d1420287887796a2612fe12af133cf3b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 13 Jan 2025 15:02:28 +0000 Subject: [PATCH 145/201] revert: Removes `is_name_collision` Not relevant following upstream change https://github.com/vega/vega-datasets/issues/633 --- tools/datasets/datapackage.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index 49baf7a32..2ff40c32b 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -59,10 +59,6 @@ def extract_schemas(pkg: FlPackage, /) -> Mapping[Dataset, Mapping[str, FlFieldS def extract_features(pkg: FlPackage, base_url: str, /) -> pl.DataFrame: - # NOTE: `is_name_collision` != `GitHub.trees`/`Metadata.name_collision` - # - This only considers latest version - # - Those others are based on whatever tag the tree refers to - # https://github.com/vega/vega-datasets/issues/633 EXCLUDE = ( "name", "type", @@ -82,7 +78,6 @@ def extract_features(pkg: FlPackage, base_url: str, /) -> pl.DataFrame: .with_columns( path_stem("path").alias(DATASET_NAME), cs.exclude("name"), - col("name").is_duplicated().alias("is_name_collision"), ) .select( DATASET_NAME, From 5af370162945c41efcddb55059623afea2bc098b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 13 Jan 2025 15:13:03 +0000 Subject: [PATCH 146/201] build: Re-enable and generate `datapackage_features.parquet` Eventually, will replace `metadata.parquet` - But for a single version (current) only - Paired with a **limited** `.csv.gz` version, to support cases where `.parquet` reading is not available (`pandas` w/o (`pyarrow`|`fastparquet`)) --- .../_metadata/datapackage_features.parquet | Bin 0 -> 9189 bytes tools/datasets/__init__.py | 3 +-- tools/datasets/datapackage.py | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 altair/datasets/_metadata/datapackage_features.parquet diff --git a/altair/datasets/_metadata/datapackage_features.parquet b/altair/datasets/_metadata/datapackage_features.parquet new file mode 100644 index 
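# A minimal sketch of the endpoint choice made by `dataset_base_url`/`is_branch`
# above: npm only publishes releases/tags, so a branch name such as "main" falls
# back to the slower GitHub-backed jsdelivr endpoint. The GH prefix below assumes
# `package == "vega-datasets"`; the CDN prefix matches the urls quoted in the
# docstrings later in this series.
import string

CDN = "https://cdn.jsdelivr.net/npm/vega-datasets@"
GH = "https://cdn.jsdelivr.net/gh/vega/vega-datasets@"


def is_branch(s: str) -> bool:
    # treat anything that does not start with "v" or a digit as a branch name
    return s == "main" or not s.startswith(tuple("v" + string.digits))


def dataset_base_url(version: str) -> str:
    return f"{GH if is_branch(version) else CDN}{version}/data/"


assert dataset_base_url("v2.11.0") == CDN + "v2.11.0/data/"
assert dataset_base_url("main") == GH + "main/data/"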
0000000000000000000000000000000000000000..c76395167255bd0c1b8c374e51ce292a7c51de48 GIT binary patch literal 9189 zcmd5?2{={V`rq3;&-*w>;+PIf=IQuM$4tnSvFJFCc|M|0<}pLYhz6n%8B&obLxeO) ziAwTmP%0^Z@2H>``q*B>~{_C?|s)=?^>zMO%Dj!SNT~I!&-lZuY5@-UVE%fp{sw>swV^pGi>m$5@u0U| z4UU>KPklq4vv}BOTE7>!(L2}9IB%3mOqU(_NbkfLz3&z-*qMEO*NqBg-WG$o0uKT0 zUBW1u$rN?2-m>v5b!^m$-el5o{WDiAXeC>UD38*cMx>loiyg@f#7^Qu*lP;qO zy3&}D%H5wqU)H$0vq(!w-_Kz224lsuIQk0*awWr$OmlTy45N*kv*6C<=sB*O^+%Z6 z(9FJ*9epVO_K2$YqKip1!dZHCfFbmb7RRT^V9NX)ai|x$bm?EB%pc)=y%_}eM zjPI9vN7LhipAZhns&EpGH)OoJDTn5D=k>{F&GpvW5EnI@q7S<8v+TOi zG8X>Od(d8-zF@I*5*68IWKI_X;EhxREJn8WmVvi7xDx-BbWIT*wANdMG-2 zhqeToEjiw`-BVRyf3llSiH#Jmsln+-A?kt_MvB6Drs6gxXI#&0ld(5dK6cwzWt;iA zKj$K~c37y2qS6hQ+aMN<5b|h>Jhbj|{P(vO&}}E>s#bjzRrr4#Cmy z5A8CASXM(Ht8xegggcoy_nXnrTtwxogoU-dI+YHYgLTb|mkN8bs~1J8Ob1;=Z(Do? zobXJ3-5$OME&6i%~_i< z{-_t1M)|HKywH=bl`1rc8c$p(EZ3xF`xL}0MLBG`_0{ujC(o#?JL$kD-O z4sXWZ$=E;Cr07OfKKFj_Rq=Wg{+ND8tCEj_pGP9LCrNxgtc0xq2lDMXNiTO9O%Szq zo9~?3zM)y8O6%580qtj67Kz+ch;U%;4qNI>$l@&Z<3*Q-N#^L3iuxU{JXM&|qrG6X z=zje(tlBKXnC!CsY@4`L&lIvg6e!Ko*zhJWHBZ@W+V0&k7On~Xq^g73Y*i`iaF+$M z1a#=b_s#c3_A?cXA1mec4N5c)jEK>;{a`Ae5b4AGAr5<8weM~)>*yD;iCX+wDg22f zuHL@8*pWWt3%0TOv(X>6J0^VIGjd1!wCpB{xjuo=_&kflsyqjE&ODxcyz$Lmv(eo3 ze%O<)HcUKuWZppX`DYe6-5=fv+3quu%nzrT#63X|w-b*LG*Kz<7Ya<$z$WT|=(5JE zDfu;g6tTum|DYP}N%quHQ@!Hzr8h`?CmJk8BE#8v@Vf`W_tZZ>CRkRF3273%Ul@?1 zLORgjV|B;;ljEY&Ub-O7y>n`p=F$D0dK(JOD{{RrdpA4R75EyQ9Pw`|_V0P|aqp$0 zfksk%nOBzE)^-+6ATh*qd0$ndQG+bFqwL7O`b&@C9k!nl8mInYhZ%uN^ne>n3_H-O zM2+E~IY9@A3zbxHHP6!?F#ygnr6q$DBOoRoOkw~Ej1yQ{4{_bs0}fkAB+VRR-QkBG zip>{MU#HbfCGh9K@HIM708I1YY8L|!2?5+#U8^WMU4HO0EzoPi#;_hp!;3IJQU%Qb zU=Ls;dy=YV8|0h_9Axl9$cJ_AR{ntPj_1eP{*83lDth- znE>Di80+mZLSKy6Al1I6&$XDbu0b;n zS#wjzUXjaa8TG!nnWXkgD9cM-*)8Quh zmrnVgxdVGH3@I)h(7@{b1cGP~@pj2m&J~!ygV8NKUeHG@qy@N-vMA1 zpAN?NAP0QI9>V=DHVl0f0HUMUu;~%s0|LJTPm}mvaMzK-kTEO{|!8G<8OlhWK}(hu=gZSvLEpq{w1mJ@oAs}d1xH{!)cPn ztRx3)IAg=hR##3f8ih`;?ho!HGTjXZ1HeG=NsG`rly#hCYEozRM31J%yR)5{M>Q3< zs~0QoDWSFwH5j;Gf9qvqoxHic-n*pI@$;9)#QC}_Roq{&u-Vvcd%0V3BuFT2EXX3n z``S24=(EQHpIQ8L{o;_BOE2%xX>Ji^HW9HM5heQej8QKby-VJCdS7udxa=w#7{6!B z_1y&=A?bWqudrBeZ|Sr6c&{;SpM}XA=g5+&(Yb{BTVQ3Wr=lSs(uRJ<#MiAe2pd&s#ZL=Nc#>N++T%2}X+{~lvC*|hxdN+6X2?-UN zab#^h#eC)&H?L!Hn?3z@7Pf#^=7vAYcXkwVCfW}qq1#KS9RUE#P&paql7Fipw&t00 zVMhDp`41aAs5G$VTA{79+xKWcXIbb|Y}wGxzFo6bAm_^UcL$$j9e1;pncmnODV;Jp zZvq4dTl@lV6z?pqJmJh-+`rRZ~P_TWg>gNe;zv9rdsc4LA&`6oJWY!^Dkl0YBT z)+R|)#oih$C@|1{=jr)PQioG+rw!^_=~q-kl4j^`W43YueC^KG2KNH@-y!aP0PNg-DsP@ePE50B7f;?|^)F?@G;{5#4= zGXpL!Pz@{GD>%$`sz4MVzjM%o|N5 zOIqG)4+)~>Up%f5y;FAPgW%Ts4Ta*yT9*yy8u9v+Q#(I>Qrn`#_a(mTDECdQUFXY` zY%AV;65;Dj!IyjEr1shD*&HA3o*mNR)f!or-QLl6r#GR5Ey8s(Uv-J6k$iBsALV{) zky`By$s*;Pt+!@0yY65M=V^rGIWHSTWE>5xsLUGg#Rr(Qrg5HMz>rj?g_2{L1_~aV z*fd%UW@yEtNp%fW?1vX6x;$ zp7)f_ItXR_dHMS3R+=!8B&Nk%ZSM2x-I6#h@q5u_idvU51``e)q{Ot9vyq){OX=mP zMbL}H`?_^ma21}vn)+o?qW0o&*o>Jj>YVM^Q;Do`z9WYsPX;jFx3l*}nM)>r{BY61 zjXq8DYlMR@OB~oH6y)^4o71j!Hh&xW=A%C*^d1hqGIP=B6Oxi-N{ZW9IQb;*16plg zKBKgWx5eGWz9-k8H9Ctip05XWTPBI5VW+6$T491yvO!<4TDKEt-E|+$Fm=3nntP<8 zYTdz)#iiZGC&D{5yDA##Dl2Nl(#{$;)pgIgJ$owmMW7};yfxe%Dx7FsKej8%$+S#1 zo?+Wf`LAWq%5Ipud!9kH6@3(o*E?A)-oJ;s^HSEtD&rsmw7d>-q^i! 
z=hR1wV%~*!i@L`RtOKkoP6i1**^<|BL$~V{nvwru*r|3KTD$tvEi$r7shjXeK zJ>BP>+<6>a3jW?HpUHKq`VQsv++SYzO{G6cQ0k~&JW;p9C3$-thq_z$DMx+XwlnYX zZLg$nwoRyKysu65w#w=yC=AS>6|?9rjY}vHyzeo|p3AII$?HfbUfH_nI{%*CL*p#P z_vV!I8TsjL@dr3stPkAZP;ymmfZDu5hym0&%E6_#E1u&q{iOqJ*fu!f1Q=o%;LK~5 zmm?+_5PWG*1E^F)z;|sXU0D|w7&tGh2OO;h!FWI$a7h})Qyop>a2YlJ`P-O;ySoZe zg{-WCCn+ks%e%>uT~(DxB)q(myqqf@FY7KxR3R#nRETH^Sy>fXyb|731+J6DE2=2D z5{XK(a38#?yrL|TtSTofuS`~vhdud^z0Iv=?$i{|m3#7(5-OP=7cGt4l>d>zAaC(@ zP#^OT_wE&*2Ph!)grBf%5GCV;S|_nA(KG)Jl_Ul2BC_jc|{ zVv$ian+Lle#2NFx?A}u48MzoJvc!{BPWxX^P@#*CsVFgZ<#0_*jXiEgi5BHx zw1QNG1MN}q3>@*}U!#Hq3FyMT*Hg<_wa^lCtX*XnAJ@O(jWBx`T_9>u8%9l0r#Pr? zy^YH-Fv(z8XSw~X9Lr(an%OYsmrQq*_+P~^3Ud_}U5GzAcngm=8*KK+?Z=1@OngAS zzWs@)H=y)<(e?$W?nip7COvP~j@p{TO|=g~2uG8vS!0?5S}n)L_aDk~ot-o8-6J|W zAJ=(i;X-Pc#}{7W*Mb4f`O!d{G0rA)sO*oEIJxefz7ZGhN(#snyoD0a4YZ3m@>JaBE19?LyampQD)H!-ybIpmE>_*KUaefTP9#({6l5Cgc`@C<$C5qO z&^5B4JarL0hfS0d5RZ*L`SZg<-!mj6kfJ6dLvr(%_NKUzeZ4}1rTxhvG9I2Xp=1x@ z($}UHR*j8h>& zPQ@=@K^XKeUj^R{;_t^L2L11ej}@o*QMrszG!?>#V)#+{tWX8GB2QqT{j2qpSSWc_ zzAFOxu5`&2TN8(HYhs$jn%FJ5CMIrN6E8`viP45oGW81Ds2%ifNrtE}(?&GG0RbTp z9{{n4QT@oGivgzmu)9kQD@^qFRZA4wY6T6fyiCi$IW{b>@g~c~?}(1&iWO#Jxv)wl zeCxZC`+sXjlJ%A5Z`t*-YsmYV^*BO!raI@>a>Hn()Br4Y(#z}R73;^7;FhBHLTo|ev0lDD< zX#%(|QGPfye~IKmb-6<*RK^ZS7{Ep2*3`aR)Nee;59}3c6j7{OQoJO-rudI(Dbx(2 zG`OTRFR`Yy8YJIL{lGV&5|l0a}rzLv}^f2$z)yO;2nzzvahg8o+c%aWjDOn?>Pr#|8j{8{Qp zAXtRMN~$mlYyhb;vfb@U(4)YWGNwqk<<1U>NU(zYz##e_us!5yi8#3G4S^uDG(OnG zN)<`o)041e37>-WTk5Q{_@?We6vMj%} z9y-W+82X0WD_Dh+3|0R=9y0X^p1hl{D~$bj{nK5=Q?T-M@VBKn*uyLQx7Qm1)d?iY z;R9Xm4E!CJw$t+ZY=K*Th(}A<-alAxJQ9yf+Y9Nnq-ugpJsi>oNZG;PSJu%U3Aw*m z|6h-HI8F%!lA(bt5fT1A9ugx2l7g+Dqo1$hzX5M)Upd15!TZS9%>cjC!A{PXK+w>{tGPJB$Ee0sjY|{{sw;My>z= literal 0 HcmV?d00001 diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index c30c43867..a4ef8f833 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -172,8 +172,7 @@ def refresh( gh_trees = pl.read_parquet(self.paths["gh_trees"]) package = self.npm.datapackage(frozen=frozen) - # TODO: Re-enable after deciding on how best to utilize - # self.write_parquet(package["features"], self.paths["dpkg_features"]) + self.write_parquet(package["features"], self.paths["dpkg_features"]) self.write_json_gzip(package["schemas"], self.paths["dpkg_schemas"]) if include_typing: diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index 2ff40c32b..549889f6d 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -88,6 +88,7 @@ def extract_features(pkg: FlPackage, base_url: str, /) -> pl.DataFrame: col("hash").str.split(":").list.last().alias("sha"), pl.concat_str(pl.lit(base_url), "path").alias("url"), ) + .sort(DATASET_NAME, "bytes") .collect() ) From c3139f11a017b62b53c62a06a560d6e32379e164 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:00:30 +0000 Subject: [PATCH 147/201] feat: add temp `_Reader.*_dpkg` methods - Will be replacing the non-suffixed versions - Need to do this gradually as `tag` will likely be dropped - Breaking most of the tests --- altair/datasets/_readers.py | 68 +++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index efd1af7ce..37357ae5a 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -87,6 +87,9 @@ __all__ = ["backend"] _METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" +_DATAPACKAGE: 
Final[Path] = ( + Path(__file__).parent / "_metadata" / "datapackage_features.parquet" +) class AltairDatasetsError(Exception): ... @@ -215,6 +218,71 @@ def _scan_metadata( return frame.filter(*predicates, **constraints) return frame + def dataset_dpkg( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, + **kwds: Any, + ) -> IntoDataFrameT: + df = self.query_dpkg(**_extract_constraints(name, suffix, tag)) + result = next(df.iter_rows(named=True)) + url = result["url"] + fn = self.read_fn(url) + if default_kwds := self._schema_kwds(result): # type: ignore + kwds = default_kwds | kwds if kwds else default_kwds + + if self.cache.is_active(): + fp = self.cache.path / (result["sha"] + result["suffix"]) + if fp.exists() and fp.stat().st_size: + return fn(fp, **kwds) + else: + with self._opener.open(url) as f: + fp.touch() + fp.write_bytes(f.read()) + return fn(fp, **kwds) + else: + with self._opener.open(url) as f: + return fn(f, **kwds) + + def url_dpkg( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, + ) -> str: + frame = self.query_dpkg(**_extract_constraints(name, suffix, tag)) + url = frame.item(0, "url") + if isinstance(url, str): + return url + else: + msg = f"Expected 'str' but got {type(url).__name__!r}\nfrom {url!r}." + raise TypeError(msg) + + def query_dpkg( + self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] + ) -> nw.DataFrame[IntoDataFrameT]: + frame = self._scan_dpkg(*predicates, **constraints).collect() + if not frame.is_empty(): + return frame + else: + terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) + msg = f"Found no results for:\n {terms}" + raise ValueError(msg) + + def _scan_dpkg( + self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] + ) -> nw.LazyFrame: + if "tag" in constraints: + msg = f"{_DATAPACKAGE.name!r} only supports the latest version, but got: {constraints.get('tag')!r}" + raise NotImplementedError(msg) + frame = nw.from_native(self.scan_fn(_DATAPACKAGE)(_DATAPACKAGE)).lazy() + if predicates or constraints: + return frame.filter(*predicates, **constraints) + return frame + @property def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: return DatasetCache(self) From 6035b39ac69b70330373e3c5a58838fb60fd1e88 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 13 Jan 2025 18:40:36 +0000 Subject: [PATCH 148/201] test: Remove/replace all `tag` based tests --- tests/test_datasets.py | 143 +++++++++++++---------------------------- 1 file changed, 46 insertions(+), 97 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index b35efa60e..66353b9e4 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -17,7 +17,7 @@ from altair.datasets import Loader, url from altair.datasets._readers import _METADATA, AltairDatasetsError -from altair.datasets._typing import Dataset, Extension, Metadata, Version, is_ext_read +from altair.datasets._typing import Dataset, Extension, Metadata, is_ext_read from tests import skip_requires_pyarrow, slow if sys.version_info >= (3, 14): @@ -26,7 +26,7 @@ from typing_extensions import TypedDict if TYPE_CHECKING: - from collections.abc import Iterator, Mapping + from collections.abc import Container, Iterator from pathlib import Path from typing import Literal @@ -46,7 +46,6 @@ class DatasetSpec(TypedDict, total=False): name: Dataset suffix: Extension - tag: Version marks: 
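# A rough sketch of the cache lookup performed by `dataset_dpkg` above: cached
# files are keyed by the resource's content hash plus its suffix, so a dataset
# whose bytes are unchanged across versions is downloaded at most once. `row`
# stands in for the record queried from `datapackage_features.parquet`, and
# `read_fn` for the backend-specific read function; both are simplifications of
# the reader internals.
from pathlib import Path
from urllib.request import urlopen


def read_cached(row: dict, cache_dir: Path, read_fn):
    fp = cache_dir / (row["sha"] + row["suffix"])
    if fp.exists() and fp.stat().st_size:
        return read_fn(fp)  # cache hit: parse the previously downloaded bytes
    with urlopen(row["url"]) as f:  # cache miss: fetch once, then parse from disk
        fp.write_bytes(f.read())
    return read_fn(fp)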
MarksType @@ -127,10 +126,8 @@ def metadata_columns() -> frozenset[str]: def match_url(name: Dataset, url: str) -> bool: - return ( - re.match(rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{name}\..+", url) - is not None - ) + pattern = rf".+/vega-datasets@.+/data/{name}\..+" + return re.match(pattern, url) is not None @backends @@ -253,10 +250,10 @@ def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: "political-contributions", "population", "population_engineers_hurricanes", - "seattle-temps", + "unemployment", "seattle-weather", "seattle-weather-hourly-normals", - "sf-temps", + "gapminder-health-income", "sp500", "sp500-2000", "stocks", @@ -367,30 +364,16 @@ def test_dataset_not_found(backend: _Backend) -> None: ``Loader.url`` is used since it doesn't require a remote connection. """ - import polars as pl - data = Loader.from_backend(backend) real_name: Literal["disasters"] = "disasters" - real_suffix: Literal[".csv"] = ".csv" - real_tag: Literal["v1.14.0"] = "v1.14.0" - invalid_name: Literal["fake name"] = "fake name" invalid_suffix: Literal["fake suffix"] = "fake suffix" - invalid_tag: Literal["fake tag"] = "fake tag" - incorrect_suffix: Literal[".json"] = ".json" - incorrect_tag: Literal["v1.5.0"] = "v1.5.0" ERR_NO_RESULT = ValueError - # NOTE: ``polars`` enforces enums stricter than other packages. - # Rather than returning an empty dataframe, filtering on a value - # *outside* of the enum range raises an internal error. - ERR_NO_RESULT_OR_ENUM = (ERR_NO_RESULT, pl.exceptions.InvalidOperationError) - MSG_NO_RESULT = "Found no results for" NAME = "dataset_name" SUFFIX = "suffix" - TAG = "tag" with pytest.raises( ERR_NO_RESULT, @@ -407,27 +390,6 @@ def test_dataset_not_found(backend: _Backend) -> None: ): data.url(real_name, invalid_suffix) # type: ignore[arg-type] - with pytest.raises( - ERR_NO_RESULT_OR_ENUM, - match=re.compile(rf"{invalid_tag}", re.DOTALL), - ): - data.url(real_name, tag=invalid_tag) # type: ignore[arg-type] - - with pytest.raises( - ERR_NO_RESULT_OR_ENUM, - match=re.compile(rf"{invalid_tag}", re.DOTALL), - ): - data.url(real_name, real_suffix, tag=invalid_tag) # type: ignore[arg-type] - - with pytest.raises( - ERR_NO_RESULT, - match=re.compile( - rf"{MSG_NO_RESULT}.+{TAG}.+{incorrect_tag}.+{SUFFIX}.+{real_suffix}.+{NAME}.+{real_name}", - re.DOTALL, - ), - ): - data.url(real_name, real_suffix, tag=incorrect_tag) - with pytest.raises( ERR_NO_RESULT, match=re.compile( @@ -437,23 +399,6 @@ def test_dataset_not_found(backend: _Backend) -> None: ): data.url(real_name, incorrect_suffix) - with pytest.raises( - ERR_NO_RESULT, - match=re.compile( - rf"{MSG_NO_RESULT}.+{TAG}.+{real_tag}.+{SUFFIX}.+{incorrect_suffix}.+{NAME}.+{real_name}", - re.DOTALL, - ), - ): - data.url(real_name, incorrect_suffix, tag=real_tag) - - with pytest.raises( - ERR_NO_RESULT, - match=re.compile( - rf"{MSG_NO_RESULT}.+{TAG}.+{incorrect_tag}.+{NAME}.+{real_name}", re.DOTALL - ), - ): - data.url(real_name, tag=incorrect_tag) - @backends def test_reader_cache( @@ -482,10 +427,10 @@ def test_reader_cache( assert tuple(data.cache) == () # smallest csvs - lookup_groups = data("lookup_groups", tag="v2.5.3") - data("lookup_people", tag="v2.4.0") - data("iowa-electricity", tag="v2.3.1") - data("global-temp", tag="v2.9.0") + lookup_groups = data("lookup_groups") + data("lookup_people") + data("iowa-electricity") + data("global-temp") cached_paths = tuple(data.cache) assert len(cached_paths) == 4 @@ -493,32 +438,29 @@ def test_reader_cache( if nw_dep.is_polars_dataframe(lookup_groups): left, right = 
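# The relaxed `match_url` pattern above accepts either endpoint style produced by
# `dataset_base_url`; a quick check with illustrative urls:
import re

pattern = r".+/vega-datasets@.+/data/cars\..+"
assert re.match(pattern, "https://cdn.jsdelivr.net/npm/vega-datasets@v2.11.0/data/cars.json")
assert re.match(pattern, "https://cdn.jsdelivr.net/gh/vega/vega-datasets@main/data/cars.json")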
( lookup_groups, - cast("pl.DataFrame", data("lookup_groups", tag="v2.5.3")), + cast("pl.DataFrame", data("lookup_groups", ".csv")), ) else: left, right = ( pl.DataFrame(lookup_groups), - pl.DataFrame(data("lookup_groups", tag="v2.5.3")), + pl.DataFrame(data("lookup_groups", ".csv")), ) assert_frame_equal(left, right) assert len(tuple(data.cache)) == 4 assert cached_paths == tuple(data.cache) - data("iowa-electricity", tag="v1.30.2") - data("global-temp", tag="v2.8.1") - data("global-temp", tag="v2.8.0") + data("iowa-electricity", ".csv") + data("global-temp", ".csv") + data("global-temp.csv") assert len(tuple(data.cache)) == 4 assert cached_paths == tuple(data.cache) - data("lookup_people", tag="v1.10.0") - data("lookup_people", tag="v1.11.0") - data("lookup_people", tag="v1.20.0") - data("lookup_people", tag="v1.21.0") - data("lookup_people", tag="v2.1.0") - data("lookup_people", tag="v2.3.0") - data("lookup_people", tag="v2.5.0-next.0") + data("lookup_people") + data("lookup_people.csv") + data("lookup_people", ".csv") + data("lookup_people") assert len(tuple(data.cache)) == 4 assert cached_paths == tuple(data.cache) @@ -644,12 +586,12 @@ def test_pyarrow_read_json( @pytest.mark.parametrize( ("spec", "column"), [ - (DatasetSpec(name="cars", tag="v2.11.0"), "Year"), - (DatasetSpec(name="unemployment-across-industries", tag="v2.11.0"), "date"), - (DatasetSpec(name="flights-10k", tag="v2.11.0"), "date"), - (DatasetSpec(name="football", tag="v2.11.0"), "date"), - (DatasetSpec(name="crimea", tag="v2.11.0"), "date"), - (DatasetSpec(name="ohlc", tag="v2.11.0"), "date"), + (DatasetSpec(name="cars"), "Year"), + (DatasetSpec(name="unemployment-across-industries"), "date"), + (DatasetSpec(name="flights-10k"), "date"), + (DatasetSpec(name="football"), "date"), + (DatasetSpec(name="crimea"), "date"), + (DatasetSpec(name="ohlc"), "date"), ], ) def test_polars_read_json_roundtrip( @@ -657,40 +599,47 @@ def test_polars_read_json_roundtrip( spec: DatasetSpec, column: str, ) -> None: - frame = polars_loader(spec["name"], ".json", tag=spec["tag"]) + frame = polars_loader(spec["name"], ".json") tp = frame.schema.to_python()[column] assert tp is dt.date or issubclass(tp, dt.date) -def _dataset_params(overrides: Mapping[Dataset, DatasetSpec]) -> Iterator[ParameterSet]: - """https://github.com/vega/vega-datasets/issues/627.""" +def _dataset_params(*, skip: Container[str] = ()) -> Iterator[ParameterSet]: + """Temp way of excluding datasets that were removed.""" names: tuple[Dataset, ...] 
= get_args(Dataset) - args: tuple[Dataset, Extension | None, Version | None] + args: tuple[Dataset, Extension | None] for name in names: marks: MarksType = () - if name in overrides: - el = overrides[name] - args = name, el.get("suffix"), el.get("tag") - marks = el.get("marks", ()) - else: - args = name, None, None + if name in skip: + continue + args = name, None yield pytest.param(*args, marks=marks) @slow @datasets_debug @pytest.mark.parametrize( - ("name", "suffix", "tag"), - list(_dataset_params({"flights-3m": DatasetSpec(tag="v2.11.0")})), + ("name", "suffix"), + list( + _dataset_params( + skip=( + "climate", + "graticule", + "sf-temps", + "iris", + "weball26", + "seattle-temps", + ) + ) + ), ) def test_all_datasets( polars_loader: Loader[pl.DataFrame, pl.LazyFrame], name: Dataset, suffix: Extension, - tag: Version, ) -> None: """Ensure all annotated datasets can be loaded with the most reliable backend.""" - frame = polars_loader(name, suffix, tag=tag) + frame = polars_loader(name, suffix) assert nw_dep.is_polars_dataframe(frame) From 5d8b6db6774421bd232dfebd2db25e731ab2c89c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 13 Jan 2025 19:46:07 +0000 Subject: [PATCH 149/201] revert: Remove all `tag` based features --- altair/datasets/__init__.py | 9 +--- altair/datasets/_cache.py | 16 ++----- altair/datasets/_loader.py | 39 +++++----------- altair/datasets/_readers.py | 89 ++++--------------------------------- 4 files changed, 27 insertions(+), 126 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 70d01eacc..6095dd404 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -14,7 +14,7 @@ from typing_extensions import LiteralString from altair.datasets._loader import _Load - from altair.datasets._typing import Dataset, Extension, Version + from altair.datasets._typing import Dataset, Extension __all__ = ["Loader", "load", "url"] @@ -47,7 +47,6 @@ def url( name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: Version | None = None, ) -> str: """ Return the address of a remote dataset. @@ -61,15 +60,11 @@ def url( .. note:: Only needed if ``name`` is available in multiple formats. - tag - Version identifier for a `vega-datasets release`_. .. _Path.stem: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem .. _Path.suffix: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - .. _vega-datasets release: - https://github.com/vega/vega-datasets/releases Related ------- @@ -83,7 +78,7 @@ def url( try: from altair.datasets._loader import load - url = load.url(name, suffix, tag=tag) + url = load.url(name, suffix) except AltairDatasetsError: from altair.datasets._cache import url_cache diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index c247988d6..0cbb7a251 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -8,8 +8,6 @@ import narwhals.stable.v1 as nw from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT -from altair.datasets._typing import VERSION_LATEST - if sys.version_info >= (3, 12): from typing import Protocol else: @@ -105,10 +103,7 @@ class UrlCache(CompressedCache[_KT, _VT]): `csv`_, `gzip`_ -based, lazy url lookup. Operates on a subset of available datasets: - - Only the latest version - Excludes `.parquet`, which `cannot be read via url`_ - - Name collisions are pre-resolved - - Only provide the smallest (e.g. ``weather.json`` instead of ``weather.csv``) .. 
_csv: https://docs.python.org/3/library/csv.html @@ -256,13 +251,10 @@ def download_all(self) -> None: Requires **30-50MB** of disk-space. """ stems = tuple(fp.stem for fp in self) - latest = nw.col("tag") == nw.lit(VERSION_LATEST) - predicates = (~(nw.col("sha").is_in(stems)), latest) if stems else (latest,) + predicates = (~(nw.col("sha").is_in(stems)),) if stems else () frame = ( - self._rd._scan_metadata( - predicates, ext_supported=True, name_collision=False - ) - .select("sha", "suffix", "url_npm") + self._rd._scan_metadata(predicates, is_image=False) # type: ignore + .select("sha", "suffix", "url") .unique("sha") .collect() ) @@ -272,7 +264,7 @@ def download_all(self) -> None: print(f"Downloading {len(frame)} missing datasets...") for row in frame.iter_rows(named=True): fp: Path = self.path / (row["sha"] + row["suffix"]) - with self._rd._opener.open(row["url_npm"]) as f: + with self._rd._opener.open(row["url"]) as f: fp.touch() fp.write_bytes(f.read()) print("Finished downloads") diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 2b8a2cd95..63bd5f3f7 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -21,7 +21,7 @@ else: from typing_extensions import LiteralString from altair.datasets._readers import _Backend - from altair.datasets._typing import Dataset, Extension, Version + from altair.datasets._typing import Dataset, Extension __all__ = ["Loader", "load"] @@ -111,7 +111,7 @@ def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: Using ``pandas``, backed by ``pyarrow`` dtypes: data = Loader.from_backend("pandas[pyarrow]") - cars = data("cars", tag="v1.29.0") + cars = data("cars") >>> type(cars) # doctest: +SKIP pandas.core.frame.DataFrame @@ -137,7 +137,6 @@ def __call__( name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: Version | None = None, **kwds: Any, ) -> IntoDataFrameT: """ @@ -152,8 +151,6 @@ def __call__( .. note:: Only needed if ``name`` is available in multiple formats. - tag - Version identifier for a `vega-datasets release`_. **kwds Arguments passed to the underlying read function. @@ -161,8 +158,6 @@ def __call__( https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem .. _Path.suffix: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - .. 
_vega-datasets release: - https://github.com/vega/vega-datasets/releases Examples -------- @@ -171,7 +166,7 @@ def __call__( from altair.datasets import Loader data = Loader.from_backend("polars") - source = data("iowa-electricity", tag="v2.10.0") + source = data("iowa-electricity") >>> source.columns # doctest: +SKIP ['year', 'source', 'net_generation'] @@ -199,7 +194,7 @@ def __call__( Using ``pandas``: data = Loader.from_backend("pandas") - source = data("iowa-electricity", tag="v2.10.0") + source = data("iowa-electricity") >>> source.columns # doctest: +SKIP Index(['year', 'source', 'net_generation'], dtype='object') @@ -223,7 +218,7 @@ def __call__( Using ``pyarrow``: data = Loader.from_backend("pyarrow") - source = data("iowa-electricity", tag="v2.10.0") + source = data("iowa-electricity") >>> source.column_names # doctest: +SKIP ['year', 'source', 'net_generation'] @@ -238,14 +233,13 @@ def __call__( source: [["Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels",...,"Renewables","Renewables","Renewables","Renewables","Renewables"]] net_generation: [[35361,35991,36234,36205,36883,...,16476,17452,19091,21241,21933]] """ - return self._reader.dataset(name, suffix, tag=tag, **kwds) + return self._reader.dataset(name, suffix, **kwds) def url( self, name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: Version | None = None, ) -> str: """ Return the address of a remote dataset. @@ -259,15 +253,11 @@ def url( .. note:: Only needed if ``name`` is available in multiple formats. - tag - Version identifier for a `vega-datasets release`_. .. _Path.stem: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem .. _Path.suffix: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - .. _vega-datasets release: - https://github.com/vega/vega-datasets/releases Examples -------- @@ -277,15 +267,15 @@ def url( from altair.datasets import Loader data = Loader.from_backend("polars") - >>> data.url("cars", tag="v2.9.0") # doctest: +SKIP - 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json' + >>> data.url("cars") # doctest: +SKIP + 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.11.0/data/cars.json' We can pass the result directly to a chart: - url = data.url("cars", tag="v2.9.0") + url = data.url("cars") alt.Chart(url).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q") """ - return self._reader.url(name, suffix, tag=tag) + return self._reader.url(name, suffix) @property def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: @@ -318,7 +308,6 @@ def __call__( # pyright: ignore[reportOverlappingOverload] name: Dataset | LiteralString, suffix: Extension | None = ..., /, - tag: Version | None = ..., backend: None = ..., **kwds: Any, ) -> IntoDataFrameT: ... @@ -328,7 +317,6 @@ def __call__( name: Dataset | LiteralString, suffix: Extension | None = ..., /, - tag: Version | None = ..., backend: Literal["polars"] = ..., **kwds: Any, ) -> pl.DataFrame: ... @@ -338,7 +326,6 @@ def __call__( name: Dataset | LiteralString, suffix: Extension | None = ..., /, - tag: Version | None = ..., backend: Literal["pandas", "pandas[pyarrow]"] = ..., **kwds: Any, ) -> pd.DataFrame: ... @@ -348,7 +335,6 @@ def __call__( name: Dataset | LiteralString, suffix: Extension | None = ..., /, - tag: Version | None = ..., backend: Literal["pyarrow"] = ..., **kwds: Any, ) -> pa.Table: ... 
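Taken together, the hunks above and below leave ``Loader.__call__`` / ``Loader.url`` with only a ``name`` and an optional ``suffix``. A minimal sketch of the resulting call surface, assuming the in-development ``altair.datasets`` module from this series (names are taken from the surrounding docstrings, not an exhaustive API):

    from altair.datasets import Loader

    data = Loader.from_backend("polars")
    cars = data("cars")                   # was: data("cars", tag="v2.9.0")
    cars_url = data.url("cars", ".json")  # suffix only needed when a name exists in several formats

After the revert, the vega-datasets version is pinned when the metadata is generated (see the following patches) rather than selected per call.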
@@ -357,14 +343,13 @@ def __call__( name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: Version | None = None, backend: _Backend | None = None, **kwds: Any, ) -> IntoDataFrameT | pl.DataFrame | pd.DataFrame | pa.Table: if backend is None: - return super().__call__(name, suffix, tag, **kwds) + return super().__call__(name, suffix, **kwds) else: - return self.from_backend(backend)(name, suffix, tag=tag, **kwds) + return self.from_backend(backend)(name, suffix, **kwds) load: _Load[Any, Any] diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 37357ae5a..6ac13695e 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -14,7 +14,7 @@ from functools import partial from importlib import import_module from importlib.util import find_spec -from itertools import chain, islice +from itertools import chain from pathlib import Path from typing import ( TYPE_CHECKING, @@ -25,7 +25,6 @@ Literal, Protocol, TypeVar, - cast, overload, ) @@ -63,7 +62,7 @@ else: from typing_extensions import TypeAlias - from altair.datasets._typing import Dataset, Extension, Metadata, Version + from altair.datasets._typing import Dataset, Extension, Metadata from altair.vegalite.v5.schema._typing import OneOrSeq _ExtensionScan: TypeAlias = Literal[".parquet"] @@ -148,15 +147,13 @@ def dataset( name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: Version | None = None, **kwds: Any, ) -> IntoDataFrameT: - df = self.query(**_extract_constraints(name, suffix, tag)) - it = islice(df.iter_rows(named=True), 1) - result = cast("Metadata", next(it)) - url = result["url_npm"] + df = self.query(**_extract_constraints(name, suffix)) + result = next(df.iter_rows(named=True)) + url = result["url"] fn = self.read_fn(url) - if default_kwds := self._schema_kwds(result): + if default_kwds := self._schema_kwds(result): # type: ignore kwds = default_kwds | kwds if kwds else default_kwds if self.cache.is_active(): @@ -177,10 +174,9 @@ def url( name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: Version | None = None, ) -> str: - frame = self.query(**_extract_constraints(name, suffix, tag)) - url = nw.to_py_scalar(frame.item(0, "url_npm")) + frame = self.query(**_extract_constraints(name, suffix)) + url = frame.item(0, "url") if isinstance(url, str): return url else: @@ -213,71 +209,6 @@ def query( def _scan_metadata( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] ) -> nw.LazyFrame: - frame = nw.from_native(self.scan_fn(_METADATA)(_METADATA)).lazy() - if predicates or constraints: - return frame.filter(*predicates, **constraints) - return frame - - def dataset_dpkg( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - tag: Version | None = None, - **kwds: Any, - ) -> IntoDataFrameT: - df = self.query_dpkg(**_extract_constraints(name, suffix, tag)) - result = next(df.iter_rows(named=True)) - url = result["url"] - fn = self.read_fn(url) - if default_kwds := self._schema_kwds(result): # type: ignore - kwds = default_kwds | kwds if kwds else default_kwds - - if self.cache.is_active(): - fp = self.cache.path / (result["sha"] + result["suffix"]) - if fp.exists() and fp.stat().st_size: - return fn(fp, **kwds) - else: - with self._opener.open(url) as f: - fp.touch() - fp.write_bytes(f.read()) - return fn(fp, **kwds) - else: - with self._opener.open(url) as f: - return fn(f, **kwds) - - def url_dpkg( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - tag: 
Version | None = None, - ) -> str: - frame = self.query_dpkg(**_extract_constraints(name, suffix, tag)) - url = frame.item(0, "url") - if isinstance(url, str): - return url - else: - msg = f"Expected 'str' but got {type(url).__name__!r}\nfrom {url!r}." - raise TypeError(msg) - - def query_dpkg( - self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] - ) -> nw.DataFrame[IntoDataFrameT]: - frame = self._scan_dpkg(*predicates, **constraints).collect() - if not frame.is_empty(): - return frame - else: - terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) - msg = f"Found no results for:\n {terms}" - raise ValueError(msg) - - def _scan_dpkg( - self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] - ) -> nw.LazyFrame: - if "tag" in constraints: - msg = f"{_DATAPACKAGE.name!r} only supports the latest version, but got: {constraints.get('tag')!r}" - raise NotImplementedError(msg) frame = nw.from_native(self.scan_fn(_DATAPACKAGE)(_DATAPACKAGE)).lazy() if predicates or constraints: return frame.filter(*predicates, **constraints) @@ -491,12 +422,10 @@ def pa_read_json(source: Any, /, **kwds) -> pa.Table: def _extract_constraints( - name: Dataset | LiteralString, suffix: Extension | None, tag: Version | None, / + name: Dataset | LiteralString, suffix: Extension | None, / ) -> Metadata: """Transform args into a mapping to column names.""" constraints: Metadata = {} - if tag is not None: - constraints["tag"] = tag if name.endswith(EXTENSION_SUFFIXES): fp = Path(name) constraints["dataset_name"] = fp.stem From df26bc23de09102175d4afee25255bf354c19760 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 14 Jan 2025 11:49:39 +0000 Subject: [PATCH 150/201] feat: Source version from `tool.altair.vega.vega-datasets` --- altair/utils/schemapi.py | 2 +- pyproject.toml | 2 +- tools/datasets/__init__.py | 6 ++++-- tools/datasets/npm.py | 5 +---- tools/generate_schema_wrapper.py | 2 +- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 3a49b928d..a6e5464d8 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -1684,7 +1684,7 @@ def with_property_setters(cls: type[TSchemaBase]) -> type[TSchemaBase]: ], str, ] = { - "vega-datasets": "v2.11.0", + "vega-datasets": "main", "vega-embed": "6", "vega-lite": "v5.21.0", "vegafusion": "1.6.6", diff --git a/pyproject.toml b/pyproject.toml index 7fba9a9ee..c582fd1b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -102,7 +102,7 @@ doc = [ [tool.altair.vega] # Minimum/exact versions, for projects under the `vega` organization -vega-datasets = "v2.11.0" # https://github.com/vega/vega-datasets +vega-datasets = "main" # https://github.com/vega/vega-datasets vega-embed = "6" # https://github.com/vega/vega-embed vega-lite = "v5.21.0" # https://github.com/vega/vega-lite diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index a4ef8f833..c9f35ae7f 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -131,13 +131,15 @@ def npm(self) -> Npm: return self._npm def refresh( - self, *, include_typing: bool = False, frozen: bool = False + self, tag: Any, /, *, include_typing: bool = False, frozen: bool = False ) -> pl.DataFrame: """ Update and sync all dataset metadata files. Parameters ---------- + tag + Branch or release version to build against. include_typing Regenerate ``altair.datasets._typing``. 
frozen @@ -171,7 +173,7 @@ def refresh( print("Reusing frozen metadata ...") gh_trees = pl.read_parquet(self.paths["gh_trees"]) - package = self.npm.datapackage(frozen=frozen) + package = self.npm.datapackage(tag=tag, frozen=frozen) self.write_parquet(package["features"], self.paths["dpkg_features"]) self.write_json_gzip(package["schemas"], self.paths["dpkg_schemas"]) diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index 8f9182c45..99d5fe5b0 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -142,10 +142,7 @@ def file_gh( with self._opener.open(req) as response: return read_fn(response) - def datapackage( - self, *, tag: LiteralString | None = None, frozen: bool = False - ) -> ParsedPackage: - tag = tag or "main" + def datapackage(self, *, tag: LiteralString, frozen: bool = False) -> ParsedPackage: pkg: FlPackage = ( json.loads(self._paths["datapackage"].read_text("utf-8")) if frozen diff --git a/tools/generate_schema_wrapper.py b/tools/generate_schema_wrapper.py index 3177b56cf..94ad19faf 100644 --- a/tools/generate_schema_wrapper.py +++ b/tools/generate_schema_wrapper.py @@ -1394,7 +1394,7 @@ def main() -> None: copy_schemapi_util() vegalite_main(args.skip_download) write_expr_module(VERSIONS.vlc_vega, output=EXPR_FILE, header=HEADER_COMMENT) - datasets.app.refresh(include_typing=True, frozen=True) + datasets.app.refresh(VERSIONS["vega-datasets"], include_typing=True, frozen=True) # The modules below are imported after the generation of the new schema files # as these modules import Altair. This allows them to use the new changes From 9f23ccdaca6fa3b9e2a5e6bef40dbed4fb8f0ddd Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 14 Jan 2025 14:08:22 +0000 Subject: [PATCH 151/201] refactor(DRAFT): Migrate to `datapackage.json` only Major switch from multiple github/npm endpoints -> a single file. 
Was Only possible following https://github.com/vega/vega-datasets/pull/665 Still need to rewrite/fill out the `Metadata` doc, then moving onto features --- altair/datasets/_cache.py | 12 +- .../_metadata/datapackage_features.parquet | Bin 9189 -> 0 bytes altair/datasets/_metadata/metadata.parquet | Bin 19296 -> 9208 bytes ...ackage_schemas.json.gz => schemas.json.gz} | Bin 2483 -> 2471 bytes altair/datasets/_metadata/url.csv.gz | Bin 855 -> 858 bytes altair/datasets/_readers.py | 5 +- altair/datasets/_typing.py | 135 ++--- tests/test_datasets.py | 9 +- tools/datasets/__init__.py | 149 ++---- tools/datasets/_metadata/tags.parquet | Bin 6289 -> 0 bytes tools/datasets/_metadata/tags_npm.parquet | Bin 2598 -> 0 bytes tools/datasets/datapackage.py | 2 + tools/datasets/github.py | 490 ------------------ tools/datasets/models.py | 163 +----- tools/datasets/npm.py | 46 +- tools/datasets/semver.py | 76 --- 16 files changed, 106 insertions(+), 981 deletions(-) delete mode 100644 altair/datasets/_metadata/datapackage_features.parquet rename altair/datasets/_metadata/{datapackage_schemas.json.gz => schemas.json.gz} (88%) delete mode 100644 tools/datasets/_metadata/tags.parquet delete mode 100644 tools/datasets/_metadata/tags_npm.parquet delete mode 100644 tools/datasets/github.py delete mode 100644 tools/datasets/semver.py diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 0cbb7a251..3e4beb82d 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -43,9 +43,7 @@ _T = TypeVar("_T") _URL: Final[Path] = Path(__file__).parent / "_metadata" / "url.csv.gz" -_SCHEMA: Final[Path] = ( - Path(__file__).parent / "_metadata" / "datapackage_schemas.json.gz" -) +_SCHEMA: Final[Path] = Path(__file__).parent / "_metadata" / "schemas.json.gz" _FIELD_TO_DTYPE: Mapping[FlFieldStr, type[DType]] = { "integer": nw.Int64, @@ -118,7 +116,7 @@ def __init__( fp: Path, /, *, - columns: tuple[str, str] = ("dataset_name", "url_npm"), + columns: tuple[str, str], tp: type[MutableMapping[_KT, _VT]] = dict["_KT", "_VT"], ) -> None: self.fp: Path = fp @@ -253,7 +251,7 @@ def download_all(self) -> None: stems = tuple(fp.stem for fp in self) predicates = (~(nw.col("sha").is_in(stems)),) if stems else () frame = ( - self._rd._scan_metadata(predicates, is_image=False) # type: ignore + self._rd._scan_metadata(predicates, is_image=False) .select("sha", "suffix", "url") .unique("sha") .collect() @@ -338,5 +336,7 @@ def _ensure_active(self) -> None: raise ValueError(msg) -url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL) +url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache( + _URL, columns=("dataset_name", "url") +) schema_cache = SchemaCache(_SCHEMA) diff --git a/altair/datasets/_metadata/datapackage_features.parquet b/altair/datasets/_metadata/datapackage_features.parquet deleted file mode 100644 index c76395167255bd0c1b8c374e51ce292a7c51de48..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9189 zcmd5?2{={V`rq3;&-*w>;+PIf=IQuM$4tnSvFJFCc|M|0<}pLYhz6n%8B&obLxeO) ziAwTmP%0^Z@2H>``q*B>~{_C?|s)=?^>zMO%Dj!SNT~I!&-lZuY5@-UVE%fp{sw>swV^pGi>m$5@u0U| z4UU>KPklq4vv}BOTE7>!(L2}9IB%3mOqU(_NbkfLz3&z-*qMEO*NqBg-WG$o0uKT0 zUBW1u$rN?2-m>v5b!^m$-el5o{WDiAXeC>UD38*cMx>loiyg@f#7^Qu*lP;qO zy3&}D%H5wqU)H$0vq(!w-_Kz224lsuIQk0*awWr$OmlTy45N*kv*6C<=sB*O^+%Z6 z(9FJ*9epVO_K2$YqKip1!dZHCfFbmb7RRT^V9NX)ai|x$bm?EB%pc)=y%_}eM zjPI9vN7LhipAZhns&EpGH)OoJDTn5D=k>{F&GpvW5EnI@q7S<8v+TOi zG8X>Od(d8-zF@I*5*68IWKI_X;EhxREJn8WmVvi7xDx-BbWIT*wANdMG-2 
zhqeToEjiw`-BVRyf3llSiH#Jmsln+-A?kt_MvB6Drs6gxXI#&0ld(5dK6cwzWt;iA zKj$K~c37y2qS6hQ+aMN<5b|h>Jhbj|{P(vO&}}E>s#bjzRrr4#Cmy z5A8CASXM(Ht8xegggcoy_nXnrTtwxogoU-dI+YHYgLTb|mkN8bs~1J8Ob1;=Z(Do? zobXJ3-5$OME&6i%~_i< z{-_t1M)|HKywH=bl`1rc8c$p(EZ3xF`xL}0MLBG`_0{ujC(o#?JL$kD-O z4sXWZ$=E;Cr07OfKKFj_Rq=Wg{+ND8tCEj_pGP9LCrNxgtc0xq2lDMXNiTO9O%Szq zo9~?3zM)y8O6%580qtj67Kz+ch;U%;4qNI>$l@&Z<3*Q-N#^L3iuxU{JXM&|qrG6X z=zje(tlBKXnC!CsY@4`L&lIvg6e!Ko*zhJWHBZ@W+V0&k7On~Xq^g73Y*i`iaF+$M z1a#=b_s#c3_A?cXA1mec4N5c)jEK>;{a`Ae5b4AGAr5<8weM~)>*yD;iCX+wDg22f zuHL@8*pWWt3%0TOv(X>6J0^VIGjd1!wCpB{xjuo=_&kflsyqjE&ODxcyz$Lmv(eo3 ze%O<)HcUKuWZppX`DYe6-5=fv+3quu%nzrT#63X|w-b*LG*Kz<7Ya<$z$WT|=(5JE zDfu;g6tTum|DYP}N%quHQ@!Hzr8h`?CmJk8BE#8v@Vf`W_tZZ>CRkRF3273%Ul@?1 zLORgjV|B;;ljEY&Ub-O7y>n`p=F$D0dK(JOD{{RrdpA4R75EyQ9Pw`|_V0P|aqp$0 zfksk%nOBzE)^-+6ATh*qd0$ndQG+bFqwL7O`b&@C9k!nl8mInYhZ%uN^ne>n3_H-O zM2+E~IY9@A3zbxHHP6!?F#ygnr6q$DBOoRoOkw~Ej1yQ{4{_bs0}fkAB+VRR-QkBG zip>{MU#HbfCGh9K@HIM708I1YY8L|!2?5+#U8^WMU4HO0EzoPi#;_hp!;3IJQU%Qb zU=Ls;dy=YV8|0h_9Axl9$cJ_AR{ntPj_1eP{*83lDth- znE>Di80+mZLSKy6Al1I6&$XDbu0b;n zS#wjzUXjaa8TG!nnWXkgD9cM-*)8Quh zmrnVgxdVGH3@I)h(7@{b1cGP~@pj2m&J~!ygV8NKUeHG@qy@N-vMA1 zpAN?NAP0QI9>V=DHVl0f0HUMUu;~%s0|LJTPm}mvaMzK-kTEO{|!8G<8OlhWK}(hu=gZSvLEpq{w1mJ@oAs}d1xH{!)cPn ztRx3)IAg=hR##3f8ih`;?ho!HGTjXZ1HeG=NsG`rly#hCYEozRM31J%yR)5{M>Q3< zs~0QoDWSFwH5j;Gf9qvqoxHic-n*pI@$;9)#QC}_Roq{&u-Vvcd%0V3BuFT2EXX3n z``S24=(EQHpIQ8L{o;_BOE2%xX>Ji^HW9HM5heQej8QKby-VJCdS7udxa=w#7{6!B z_1y&=A?bWqudrBeZ|Sr6c&{;SpM}XA=g5+&(Yb{BTVQ3Wr=lSs(uRJ<#MiAe2pd&s#ZL=Nc#>N++T%2}X+{~lvC*|hxdN+6X2?-UN zab#^h#eC)&H?L!Hn?3z@7Pf#^=7vAYcXkwVCfW}qq1#KS9RUE#P&paql7Fipw&t00 zVMhDp`41aAs5G$VTA{79+xKWcXIbb|Y}wGxzFo6bAm_^UcL$$j9e1;pncmnODV;Jp zZvq4dTl@lV6z?pqJmJh-+`rRZ~P_TWg>gNe;zv9rdsc4LA&`6oJWY!^Dkl0YBT z)+R|)#oih$C@|1{=jr)PQioG+rw!^_=~q-kl4j^`W43YueC^KG2KNH@-y!aP0PNg-DsP@ePE50B7f;?|^)F?@G;{5#4= zGXpL!Pz@{GD>%$`sz4MVzjM%o|N5 zOIqG)4+)~>Up%f5y;FAPgW%Ts4Ta*yT9*yy8u9v+Q#(I>Qrn`#_a(mTDECdQUFXY` zY%AV;65;Dj!IyjEr1shD*&HA3o*mNR)f!or-QLl6r#GR5Ey8s(Uv-J6k$iBsALV{) zky`By$s*;Pt+!@0yY65M=V^rGIWHSTWE>5xsLUGg#Rr(Qrg5HMz>rj?g_2{L1_~aV z*fd%UW@yEtNp%fW?1vX6x;$ zp7)f_ItXR_dHMS3R+=!8B&Nk%ZSM2x-I6#h@q5u_idvU51``e)q{Ot9vyq){OX=mP zMbL}H`?_^ma21}vn)+o?qW0o&*o>Jj>YVM^Q;Do`z9WYsPX;jFx3l*}nM)>r{BY61 zjXq8DYlMR@OB~oH6y)^4o71j!Hh&xW=A%C*^d1hqGIP=B6Oxi-N{ZW9IQb;*16plg zKBKgWx5eGWz9-k8H9Ctip05XWTPBI5VW+6$T491yvO!<4TDKEt-E|+$Fm=3nntP<8 zYTdz)#iiZGC&D{5yDA##Dl2Nl(#{$;)pgIgJ$owmMW7};yfxe%Dx7FsKej8%$+S#1 zo?+Wf`LAWq%5Ipud!9kH6@3(o*E?A)-oJ;s^HSEtD&rsmw7d>-q^i! z=hR1wV%~*!i@L`RtOKkoP6i1**^<|BL$~V{nvwru*r|3KTD$tvEi$r7shjXeK zJ>BP>+<6>a3jW?HpUHKq`VQsv++SYzO{G6cQ0k~&JW;p9C3$-thq_z$DMx+XwlnYX zZLg$nwoRyKysu65w#w=yC=AS>6|?9rjY}vHyzeo|p3AII$?HfbUfH_nI{%*CL*p#P z_vV!I8TsjL@dr3stPkAZP;ymmfZDu5hym0&%E6_#E1u&q{iOqJ*fu!f1Q=o%;LK~5 zmm?+_5PWG*1E^F)z;|sXU0D|w7&tGh2OO;h!FWI$a7h})Qyop>a2YlJ`P-O;ySoZe zg{-WCCn+ks%e%>uT~(DxB)q(myqqf@FY7KxR3R#nRETH^Sy>fXyb|731+J6DE2=2D z5{XK(a38#?yrL|TtSTofuS`~vhdud^z0Iv=?$i{|m3#7(5-OP=7cGt4l>d>zAaC(@ zP#^OT_wE&*2Ph!)grBf%5GCV;S|_nA(KG)Jl_Ul2BC_jc|{ zVv$ian+Lle#2NFx?A}u48MzoJvc!{BPWxX^P@#*CsVFgZ<#0_*jXiEgi5BHx zw1QNG1MN}q3>@*}U!#Hq3FyMT*Hg<_wa^lCtX*XnAJ@O(jWBx`T_9>u8%9l0r#Pr? 
zy^YH-Fv(z8XSw~X9Lr(an%OYsmrQq*_+P~^3Ud_}U5GzAcngm=8*KK+?Z=1@OngAS zzWs@)H=y)<(e?$W?nip7COvP~j@p{TO|=g~2uG8vS!0?5S}n)L_aDk~ot-o8-6J|W zAJ=(i;X-Pc#}{7W*Mb4f`O!d{G0rA)sO*oEIJxefz7ZGhN(#snyoD0a4YZ3m@>JaBE19?LyampQD)H!-ybIpmE>_*KUaefTP9#({6l5Cgc`@C<$C5qO z&^5B4JarL0hfS0d5RZ*L`SZg<-!mj6kfJ6dLvr(%_NKUzeZ4}1rTxhvG9I2Xp=1x@ z($}UHR*j8h>& zPQ@=@K^XKeUj^R{;_t^L2L11ej}@o*QMrszG!?>#V)#+{tWX8GB2QqT{j2qpSSWc_ zzAFOxu5`&2TN8(HYhs$jn%FJ5CMIrN6E8`viP45oGW81Ds2%ifNrtE}(?&GG0RbTp z9{{n4QT@oGivgzmu)9kQD@^qFRZA4wY6T6fyiCi$IW{b>@g~c~?}(1&iWO#Jxv)wl zeCxZC`+sXjlJ%A5Z`t*-YsmYV^*BO!raI@>a>Hn()Br4Y(#z}R73;^7;FhBHLTo|ev0lDD< zX#%(|QGPfye~IKmb-6<*RK^ZS7{Ep2*3`aR)Nee;59}3c6j7{OQoJO-rudI(Dbx(2 zG`OTRFR`Yy8YJIL{lGV&5|l0a}rzLv}^f2$z)yO;2nzzvahg8o+c%aWjDOn?>Pr#|8j{8{Qp zAXtRMN~$mlYyhb;vfb@U(4)YWGNwqk<<1U>NU(zYz##e_us!5yi8#3G4S^uDG(OnG zN)<`o)041e37>-WTk5Q{_@?We6vMj%} z9y-W+82X0WD_Dh+3|0R=9y0X^p1hl{D~$bj{nK5=Q?T-M@VBKn*uyLQx7Qm1)d?iY z;R9Xm4E!CJw$t+ZY=K*Th(}A<-alAxJQ9yf+Y9Nnq-ugpJsi>oNZG;PSJu%U3Aw*m z|6h-HI8F%!lA(bt5fT1A9ugx2l7g+Dqo1$hzX5M)Upd15!TZS9%>cjC!A{PXK+w>{tGPJB$Ee0sjY|{{sw;My>z= diff --git a/altair/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet index 633815d1ff27144689ab23aacff171a20598293e..293c93a975929f24b4ccd533d4cb475df654bf9a 100644 GIT binary patch literal 9208 zcmd5?2{={V`rq3;&-*w>;+PIf=IQuM$4tnSvFIF*Df1D9GLM<3NC{CWp;RQw5FsiJ zK0Zl44N4`YRMLNMWBs4Ld%J!2-skQ|d#~yJz3*D3^h@Z9eOJ^m(t@9fpKXtn02nzG5BiE1}l|FDhua z1NML=;0b#YNFtHye=5%$#!<*I{&rIij`GX<Y52-TJ zywkV6Z+@O?WAV9w>>zMm%Dk1aNT}t)?(v5=)chUt!TgO}eP;n0)TXA$EULDnCj#Gg zo^{lmdFm7VoW;XN)B3%*jo!sJ#(AR*Vw!CKM|vm5sDpQKLC)+OJ8xAg^EMmI6?h0} z?-fSTOeU*y^^}iisbeEg_9VHU&_93Of>yG*i1H|{aahV(^@uXvo5u@X0XkulT6t}e zIf6w)X&HQW3SGyf=iPUs9IO&IVr;p~FCHF}lVlIIdN`{?K69H=DsRxm7fNXx>7Yw5 zg045Dr*QYB)0a2w>nPF^()Tr3yv11gESCQApL8uOtg0ew~1EPcI_>}<(hA^)HSARD$1x=#@Cs(`+yKNs8Kn-WIL69Db zirS?ufo4mJBei*|3S=g^>6F??@tPW(dlalLXknx%oM$R-V{)E!eus>`sq*oAJ}NuR zFa9|fp|#6GRTPzGxLgOZV3Ytl4h)*bjTlU%kb_!3P&C!D(5Z^}&4}dwe>-+B0NC#6w&~(zopdkOy)y|9K!PU-FLrLXER^>*SipDnO zSW6z`WKrYna#3R&<1Ar)IK9>>I?5h*cdxz6-6^I*U82Ipu^|2Ve;VN4q@$oRCR1b< zR@ZVOjRp^pAWRg(Qby9B_gfnWg^TVs&(;MsfE`}1-&e#~^EQWj=dB*zZE&__DGt{D=o^w4cou_s}ncDBm+oyVddU@_M9SkqRU1^S)d3#w#=I;#$8-1S?7Z z#`q&&Tpi&Xh<~9cT`N^+4mF&-Tv(w=&GspfSBg?>y8YGj9jDH#Y&hk>C*+`z?Zs!j zAr^1O-oe;+w^7lJs$%Z_fg9rWCj8NTj#i}~13nLj?@W~VTC9Yv1pD*tIbC1wHJTu5 z?K9sqwR2OGMzz+Ry9KnLX;~z4Qy{{jx%+G>Gr^0q)Q=Zk&Q3B%C0Ew(BJosX%1U~` zDA7#)^Q_t|!kFyxOtvjts^<$?9}1LZX>58EkdmitHf{IrI1AT=eq!}uZMN#<4Y+H9 zSpqurVF%}XBl?&M#*dfr`UEBz2ZTp!+kP;WkB`{R{2>;5Q?>W^Al8vDViUFa3sU%# ziCjIszhj4cjW65Azsc)`FQi217;(+ z8-1~-NH$D7d1T%I@}*}MIb9##2-zMqk<1UHnZ!Lo5491G5j0WB?-vS8Qo%-Q|EThY z8_D@Kd=#;U4!^(}?Me2O5mUV*m&$Ir@|`?uDH0LJ&V%1K2)?KOahPCP4HMF?@OoiD zLWOjIpU3Ko`6uC`%3iu4)xBeCuVzW6r{1PQ^U7TBYu-)Hbp<{Kr-uC+kN9=J_;}!I zNq~_QU&i(2y0w+X=;`&Ro;ANJ{2#r&Jx6F({C3?V(C7K;* zQKH6h(43?L#Dz+$xtiu_j~M_LnNpKLiV+YU2PQHA1;+8LtVg*<^?+gviNu+sth;>C zcVqHJ)Hi4~Q3?FnKQurm3V^A*x!T0Qqe1{TR@W+$PM072ObhgwurX`|Qt={;k5oZ3 z0N4-M$eyCA*#S8x0EZd8P%?Djqf3w`H{j68>7!2XblK+$4af+L+tb~X0n{<)5+(0Z zRmB7Ne#UxzU?Y32mGKA&(gI=(kLSUlL7N)_@TdcTQ6&dSaGagvEPLUf!^TB#qiWEM zcP%>o+8ebu+ROcT78zi}mxR#_uwAPUa-8^Q0sog~kY**|(~|ZDlPSxg@dPs}i4OOQ ze<{@e$Q;;mVMuW)N&~B#6bPb0#M&iGIg^3a=nMj&_Hr*E03!y_VbBa-6c;aFqQ}a< z;)wFWO^6`6Rv`Wr^iPqCS6n`Sjob|f>Sc5$7(JLs+T%kETJ8;kUo7^E_(0UM?EtWf zPY2_Bkp0)O$8f)j4MUdzKvdKkHa%jyf5123`4Ybh4jcYGI5IGl0AjEg(P!QGTav$s zKl;ZjzUK>q&n|ptyf?+)Zyh{y^Dlz`U{*beu=TE*lg@}yxb=_94M4J7HARd zJur?E`s}g5XBIbIzj)WorHA+KIc^bUHW9I1;idZajFB%Gy-VMDdS7=jxJD8Uh}*yI 
z=Dq@s;55D)*IBH0HuqY5{G%cDpoPgB=ZMm&kppUJ$)b-u?@p0(I(#Qiu{S@_NpMqL4^lpZ5ZpOX+aM;G&!xqnx642m!`R8f8@T``(sq_Rwz5Pv2DdU5y70@zYl)2 zQuj$n3q`94IM0vQc;(S?GlKZmL+x>Csax$hH#fWp;o`L8;$|M%I4L)e*So#1S4gPP zj3aCN8RqlPxOp9qwA$0}WMT7fVLtm?#h&&;&IJ4ZM08szwIcvv8LB2jUGnb~#MC@< zE=+Hmy!2snJCz33Tq~r7cISTW=PV1oip`ta*mr8S2;^M9`R?$OtP^gwGSi!zBBYZ? z=Iyda70PaX68|(L?!NtS&4Yo1Dn*}TvImE&A53f&icWHYW9{OL4p3R`%f=zkt$BUmpZ6xrC)g(k~Bki8M9Rg;A;oRY#V1hPLEDDcfWZe z(pH$Dbs*o;v_53F*ie|NhC(A<(`a8u=fSb?mif00J-a4{Y`)$WEzO|G^@;jyZnc4lpgNk)$znD7>Zqg z_ZEd=>1#s~uJ&;otS9caLhVB#m1)N5Q<$g49Tf8EL&|eL`eEUkOx&6jGKTLyk6(Mm zNQVElg`B4kCKP<?Y>@|IO_@0Ly$-tJre z2!C$qUXz}Sy|178we}XbgYU9xzaG@*w&8gzKPn9s1nJLe#NDsptxD#-UBpQ$WZrBl zS=#(o`>r5b{>9@;(fj4+KL~EG-&81WtaZ(Bt^u!4IkV@}C$()ld|%=^OSo@i?K)m2 zXIt^+yArCFPGF#AtiavsHQikH}am-?eez1c>) z>gU~MvkpS(e_p$Ju7xI4B#~+HPOJO8dY2?lOZ<4Wh{4^yIBE7-_R_oVc4 z)WYdS;(XjXEw~CV-AMVeC{cT5D0Ie57j@Bg?5RZ7IN!0O5vTkaNA2u=Q09_JA3t1i zaHCJv{2K1y!x9U&3I#en@aD8@na$rpzWwO83B8ARUzxdR^a@EyG9|`tE}VQ4`vI+X zFrQJ{#M|Qcgx)7NpEWp(FB z**JzBx8+;!^fa(fWcBJ9FuP_64ZOxyGuMgVPNfVU3n@{=*hOc_FKB0uh5MA4@1wi+0fe6mu-`gRZ7`{N0S@R zcBWM#_N%mZ3g(_Z0I`OKOMbi9xb`OmU z6rbBu&gbQ)cf=jyXtq8yx~cSrT0gaUr4R$CQ^LWew>OUCG5ys;Y}i)#)(J2~Gr-rc zSzeClBtY<`Jq@5z84mBY8Fb~HTwuVwtR7HO3xaWgHsF#tf~P7;yl6EVoaJ7Cr2j%oy^b0aG>_TaB{qt+vzI$xBbHcI=PZY-K)s@-hXAG}g$s1^U_C__Mo2+JX?J?$t^xR$^)z6Xen6pt~ zvinZ<9*9qwQl5_{%p2DdYZPC83`XbDGt`fm-hA;W2gf&r>m0DOXn9h|%}*y#WFY8@ zjW&rR#x}EadDUrCWqU^qG)jbK=BI> zrp75LXZs_{wvQI>yL~-t61GVzZ$hQe*IDI&cidsQMH|vN{6W(xk){YW<;%jx?B34Z zsT?*BgJP72o-rHNgSkvl$aA{lqgXS zMk`1~IKUni$G{Oc{xvdCkbo{ca5JTxRSPXK$J$wb<#GKR-f**bQ3av~wV~7$b&7-P z_ItQ=1Cw-y4VF7E$gvdD*35=7zht_v#Q!RqQJAZ+=yF`i;2k{PY_Q1>mx&SYpZI`! zeeV-duV3l;qU{S#-H-HEje6d!?X@+fek6cqjI+@kD*x>iPOfW@Px$5EB?V*(-a-kNhKe^Ck8DgX zqvvYAaX-eBJQch3dd59FZ-I-VN<8|d?}B!=iJflWs8%6bClaC>0x}JDznDJD$C5pD zwsUwvdFl#!4x1n+ARZHQ>c_!C-!nKkfTAWNA~#TDeDLJE6wdai6v#A?AN*2r!&fm}SBg zzH+V(Fno)HS+u396+#higYb*rRID*4Fb%pYhfpftHd(!z!_cc2du`K(YSxRRd0K$)xMudqyw8EVh;$!>uLrCDxL+B-fIeo7a-PQftW+Lnw)Q1#Qv}dbgB@s4&w;G(rCU z!4Mw+v4~N9Po;|iru?wEOBE}0^tVOpJlbk04UDkN&A=CJIK4)mEN8!AJC+Mp=!xab zDxa|4c7*^~uSRa{E7j{)>}C6qL&T~XNJo$rzmkW455VwaMsc~!{A(QGzxa~@>J=G6 zqfR?R$^fpFZ5`%kNqSU*J1jrp49Nqy*$wOR>qEFq6#Ntwi7H0Ga$O(;0QX*)ANnO$ z40Qnk8F7I$0o;HnKkTGGMR%bF+#wXIU>76|;8JjFV&BZ{I^*#jkA<2<1TQTK_KL3w z{$qj)^$8IgT@s3sToYRLlyz0#F;A#W4_In(N$T>ZHK~72DWOgvf+a+xa|@f-1QB)r zfGa{>Mx@%8q%Le(lVV!;&~-b%BY{wZh~VUsV7AG+Q~Vdt!>DH-uoondAowbR``zgq z8~)cfOjML7YgOC$;Bmx{I$2o!=YfdGFk37ak8`p{6M-k3n}SZODVd|e4}u>w+T;sH0; zF+-|#k-V9u4xG}5#R*o+m~b9ZNg%i*uO&6hZxsZ8-xB^3xFJ$c(BBTfED1Ws1XvLM z>Ld2R&r&-A!6FP6QiV}q1xS&R?LHDgj{+CUm?F)V8#^FLf)(5b2GRF`^&wwN#K2W+ z2n3m>{=q6%id^MAJqg>E@F_^UrN%l-rs*IbSgAFl7yeqUjE5Q04i;BM^dl+=1b;ZM zF^PWWAtaB#jR()~GM=k~j;EWUk0%Mv z|HXVD)ACdEp@Ynap--5-f>nsCq3YkqLxvu~lXvqW!PtM-KHXJ31uIVnKU<1}Jv_sI zd%h7+od8!kd;rPLz|V1MIW5o6Hn`?Hf3$@4{e$_&BmTIwypUE)q9(}D!!B)rGim@RE*5!9DN-F$o~esrKn+O t1^cH#n4`UwJIU7|7#?2@08MA%5C2$1!HmKuv!D3`CK&<10{%Zg{|BD>P{#lO literal 19296 zcmdVC2Urx%(my)8EMbXDSTd4xT5?drE;;9nNS2&If*?8PoF!)iAPR~|jw%urMI{I* zK|#Q*+*#09&w0=JzVqGh-uwLTJh;;{)l*&FU0uJfu5R_wh9XE9@+vv90^IOJFudBpOVpQQ1G|)uuWResug+ihcM2Al&- zfI?RTK~ZCZuf}#U{CVzllOHSFL&m6ghd=HcC-9$kI+BcD`?hsXdjf7C-S>5R{Z?;v zb#XQ4*Rz4D()wSE@g-Aoy!)NiukC1lE3w!RAr|XxQ;N2ujclt_a+Ouloa2K8C=HKd1 z8*ib1cfAOoQ~mmhoU~SS_u&>wP_8&qAq6{(;>{JQDZKmG!{;69boLZ*}XJ zxF1+Iq-M1a?^wggspj1Et5+x{b!YS6Upq2&!we-kCEyr7CEi=t3pOapQn&~W7Un&7 zAiuyFd+8lK`%s@MHxoKiCz*Bng6u&`N=%-hb^J%;2ic#PAvt5PXf!FKuZvU!)PhG* z)pq`oEKA{(`cqGi~N9?6!YTR>*c5-gz`Z#5V*c-BPY@k`_uk9Yaq8I&!GU(NK 
zSDsM?p@cfymVd|W{wEREire}haCE!PykT$3-sxOWUGtHwSbl0dFypqI)z2^C(VgH$ z_O2^ZeLoJpy{sB8h0lokLRW|cko`Q!3K9WirVtW=#6#jSN&-9tp{_m- z-i}WGv@j_!`AhFu;S%(KBM%PvoGApsU&W1#v{mGxB~6UA?{kqpm6_;>sWcX#(N=9= z4miWCuP)A(r_F7w*c97Ugw&%sut&x4%6&6)C$W}3vOqDWKcoPp;VUNIMZOhldYz(wL2Nx{rc zu-r>BS}MYX!76eDf|vkz0FtFIKua=$Ly@J;lbgI2J?0`=n!bTUn50^h{U%nk(X=Ih z3pZuV`9|?h{KT7}?(%o|#DsB5_EQ7|1bDI0%aQ6l)T!~SkR)|3D$OCkK1Xo|-I0Ar zoHD!NSOBKhnBR2bFnY*Q%<@6dz*@j*o5_zjcyax#tu8Pa3Y5s_5wJn$wsvqImn zl}^XiYQ^t1)3uGSkQfZBe-xzBR+@!EZ76!m=L`$^SS+T|T&VDV{LCH;y0XSanXgE< zA=holZOB=?)-&nMOEMlRQD0*Yo?NOcPg3h{2>Qlh)-%n;q0sbX-0J0EMYe2s3@k3Y zioDTLw$iW(*^Dfi%Du^XQK3SAl#x0uvUjCHU7v>+tQKM1eLHDBG72iqWK$1Sqo?_u zrCuZZ*BOy;&E#Avd@91C0bYBt;^~`%U^|p|)+8E>mg)1(CC?>4aM655A-@Vr%&9^{ zTv79SQOMD-)f!aFn7AROx&cw19jT%^ zk8+TFv5o?2=Nf9Dy>5&XvcrSYK36hH=sUpzRu|2Bxrty~Zz<4zMg!R1Sb;JmZJr0|{ zhVPf$9YIElVza7)%2T!^C_XVhwpbz&sZvIv=db8&1?M95=X(MLF%6HG!qq-s|5hjs zjOuGaGF_A`9|I*faz$-1am`3#Itc&|K}SpMXcu7T?-X#(%g)mYNlcw7M7<jFFM<%48p=w^lLs=(9KHLTbEn zPdRMOFPgczUU|9S-TGnUh1JX%^wj0SSsjGs)=r4L%G5dc%QbIG!tAJkNCHAYXs}T< zhTp}#4o@xZ{=&J?MeFt6XCBZxTtXy3#|hxk(ULd?2b}W{^zrfb3vhBgk-i0wIu*** zK#LXsr(7V2jng7CiL{Y5zd?)>_Yetiu{*G_#_p3pF`7) zXlBCeJyO)4W6Set4$o(}=Zqct<4qf+Y4<;nM1|Mh9ljW8CvErcuC*|9uUO!F1K4N; z1Q{Q2Q!3_PP%n~6VVdi$)(_$p*O7C~{$gj^h~Ci&x|3s5AA>DfbMs zX$Ps>6S<8$oCK5)&aduTQF2Zh6U#pR&XX5I#L3p0!4k)^G0^p1gN}*!<59NUZsqM; z>MJRytc!^&&f!Jy!?XKiJEC{PwD`Tkz6E?^BkCaSVa$d9^cooIAYUT)o7~ zuSWzeHqI0^EzWhBx+jR6-noHpv#*&Eg)VwHU<|U%**1%xA;uMIQ*htyeHIt`o~S+< z8m7Qim&B5_KHz!(#g?r0=Z_wJiX<*Eb_%?a)>d^ixwD*L|DLOM)3gxTx-%I>_YQ?~u{Y3X;b zD8=Dw6d4v~Tk57-pXE%$DAl-pFwwUz{&B^2+rwImv_~zEF(Xl%!yBKIuMllGc&WF> zf3TR#D{b>$LHPylkhol!jj_%n97==W$1@$z-TYb**#UghRKnwwp-J8c!*8`(A3T>~ zb9a!wq?F}EdkbYi_GJ3eOaoU@wh@ z1ZAeSat!YB5A)fWuZ7(eZWqp3xhT3wTv76cRP_E+WR#tC8WI)g7Zf%Rke76zdqY`;IEj5l`Xu&}>rlIgSs$U-S z!*mzt1Uy&%2|S5^fv1pze-JGJ7C?o3yw3j`o~&>=2Edr_zXDOLxlYBJclnCU-{!i@ z=K3)ncwvp|*5`=c3M?LEsQ(>AjRguE{}T+B=Acef^5tuZ{wEj=LijK{#os{` zIl)=D!j2DYpv;0LZcS2&;e<8Gz{f#Jyohyi>+J7M_2uay=AS8Vj zS7O`v3%&ftTsk&LitjR`q&ZY+5&7VZq|ew{{S~7S-IWjBX#a#km#JW7C={oEy6_jI zYRnKPmHwU}a8}QiOSwe!E@jdRp2~{sMX?Bw;44QrKFA!E0jC?JNmDDt1X{LydSV=Y z=>@5#VJ|AM(}J(Kc(JwM1O$c5si*TxCwj$5!$?s*G%DkG*yre-*ccRx-FA5-k9LMSPza6&rtRwv4LkR`GDP5p#q z?lVE4P*Ejk@w;i+jV^;kmda5a;?yv$hze7E8jd(#@i&-RT-Bu(90v=a8EClUcr~h1 zd})H50pTcwE>&&2JH$u<&|v5yw;(VaV(JGhirVK`d}Vr)$Ji^1r`;5J&E1JCVUIeZ zE^d?#^!u(5nB%$YBvKM&1(DXhWs$7Ly&nNLDTMxovT$ct52rsMHWfqto{lS<0Si+9 zcD$ZoZNqW{;FcRpo&e}n%i;L{=Poqh+h z2!){UB@uG+ay5)F0zewp%<;+eJ;mJF5I=`L*S@_>lwjB|TqVU$2F^Kndw96|yLx;5 z1+1?iT#=Ih2&{>E$g5o1$o1dAI!NKE4OpJK!a;hy7Kh^!Z5sQQ-+&$A6IG}hR253a zW_%OMt3x%cL0WrC=9(sN{PXMCR&u4~u2;DV!a@pzr^2q$Tq>7vRVml`G&orxl!T7O zNbxC5Vrua&YwJ)odt@L2W_dE%J|OPS!K_vll0#O6T0*m&&0f1P9T{JKPmrtefH6IG zH>S{^316%nhNzOx;wZN>L)Ma{^~&_yqgLG^lNirdp)V5eY+tx}4zmzu^n8_$ zL%cE^5=l`lv9xpB2;I-W6g;C4Q#|xFsmW!#Z9enAOJ#;tQX(02TxOWx_A&@wz?I&N3; z@6TkNiPcd!K+^acFzDQTH zALDhd`6?;j&ATr@CS5jb!{5tl4Hw|83#Xk_SNS}D<8$E~YKimOYx+mhDSbRv9(tbk zZT8Id4QuzhKg#!&F!HAGr*bq;FJxtm*>z5THe8gf9zLDSKqh=~_og_kgF-|p{T(O6 zv(e4~WMgFDb6vI-O+&-F=Uf?mlhjcUjY#2*ZqbQ+uWsw7$ouvt&lI@L3@z=>pU!y> zr`MGht3oL4v#MV2u%iP2w6w;~vT(fjk7J)ng|>>7#eFib{Dh7B zHVF?ozeH9}q;OyN5uRfU_t==Nnz`$y{;cadP0Dk4!6F}ZlkB-88XO({io$~BAoy@D zfA%})D_X)7TsF$ys)l)K~dj@izx>Kd@ zWj#F)DJQC!wTNZOCAv_HtzK6bR=i1mhxwPe>&KHOYm&%4o+9Ny@?{@NXq4nQ21L_QslS#aGkGbhT+-~x*hQ~;tEkr zwKzd;59z|Sq;oyznh-jh+f2kn3!{mZ9DIRAd0)O8*jyrexn2?{U~{E=l3Y1@;M4Xi zvrp8kJ&{q(+qv4~_l}yYpzP03U%tI_YMsI}8ha;itm{B{vnp&f;70b=3o>%#>v-w! 
zPnP*#_cIvQ@A1!UHx)A9z8D&;&313vy;!Z7r9ISuIXh;O(}3nkGdoNb*776i$6=C1 z-n!eh{oaxX;b@IF)|GcxVlh0LpT=!pWD7OtQ1Bz~LDR(90wAjJ4P--kGZ|Lw=;sb5rKOh9qF- zWRY@LNiW|1$eWmM6qaw-<`=v^CHp9SUP>m_kLQ}ni>*5wh0V^7m58)*D8~d|W_FT1 zw$otEqY7YYqavy1Fj-18r5WUKF7oI*94U%L_qJ?jCWpm9O zbqwwXddv9b;^un5l@szgokJ9&)CV^LqCVBg%-^9LXAmb&Z z$K%yEOGyh#D)aOW<~!IgjL0yRyK8f|2~o6i!}DDCJwZ+i{rOfYHOy#_rZwd~(at@? zk1yU{es}>9M?|7@x|uOrDC`qr`TfN;tB{6~+WPldU7`|W)}2Cf-ckGa_1?V{l_(A^ z5wNDEvHxKVrwvVf!o)s2l^8dhz|H5JL?>mJ^wkOFlia2`^0BNruW5-d{`Cxzc3N+q zhl=ZCJJi&XkaYM|_x1P(o0kZQKE%yhSnTQ}@3;|&zhby4Kl1oF=To!Nk=F7o1^tK4 zhLe_DD!hEdjJ{W7$18Q2${N>J8C?2#7aoM!YCPb6<8_+J?E-69zl^I>9o+86LrdiT zTS@*5&3cK(CO+rE;ZIG2zE^{va;jX++8Mp#zYt`{Ip`mR6H1V~cuhKy%JR|01^Mek zQVg}}+p5Wj8bRWGJFfi_p<4t*N^WuGcOOtp;d+q<`g=naoR_-dJ(PxPre_M9T6FKo9^hHXu>)EX3zA z=)DE{UuC+UDG0H>3;>Z3Vitg0QE`w6AW$qEjBkYiEC7VD!5D-63d#n+dK=oWfk;VK zB0R=+;wOG1`^QCA9%pAMJ1HkgDH#WGNoP?<5hr_T2?qxmQ3+8Idl?y&vxuFPorHsw z9W4(EC54iakg=Bn-=Sp0r6la_>?BZN9vNv-ag?2tv%6~x%(xA3p;nRZc%hbuH;=RZ&8&yC!5{0q8W#Uzo%gqo*}o4i)K;7< z)aK5NXD^|I0u)e`yk~_QY zRyo9}s?;?9ew|T!rK;Rz&GCQdc2emTQJG0&fSytbRdZZ|`QqUuP&G|5Si5Af`iNj`h7sJo&p`Oc>ZUs7~)a%LqcLy!VFJc{2%wdZe z&+%F>h0|=qMhr<(+fUzg+IIr$v=#xyI)0|AVtU>=Y zG^ri*PvQ|SONpg&*j|hT}Dv|%tFGqEAUs2cm8a~5#I#GTu(|IAv^FuN+ zYI4j1vZrOex*Tr=6!Jd=m)BoRws1X69+4nyS!aB`eeAySz3Bwq*J-<}(hBXxk4NDf zD(BVmNsHz>@hXY^(?jK7Zxe_$XOUp&NY$;n93{(&{b%>b(v7EEkGYPjy$Byl{o0w(F$#i4Gu2f@|>m0eP?Kr7LItUUemee%7>O<`nbFp?3#E#-Ul3t;w zjs#QEQTUyyyo>8g@z*K_B2PV8SsJzjqN!h@4^E$Bpv+(DdvWa*!QE)sRYe8$1>J;~0)3niN)|xL@DiR9BjhVC zFs4aLzyzequ)~+7AvypM4j7}Fp;xStHt|3)fh)Bz9%T78Ql1j97$o;NgYR(GdjilHC-VT_ssb>PT{F~p3W4APoCIs1Ab!)v_fL%!nDSjPCclZx z=K;wgZh`{ZQjCjKK)_=Kdl{S_5g=Ox=e8VxmrmKv0ef*9ICuS~7LA3<41 z2>38U8OeZrWQ+jB7aw@W4bAsqUMR>D0Pa{UhC#gW0Jn>jpxFih_XdO^^X>u(qMz(h zd`4Md^9`AWq-Dc^^cX*#6_1XfkK|%a22$-@Ct&jSpTL9wloN(!LrVa*!msN%TFeUf zWCp1I4+euGFzA=^KU{E(TLZxWbpi(3V9-wmyWptv2N=lwH(>A{1cSn#QM<|HwJPj_ zW8EKMQ2JkkfgKhMv`@ew=lqbL0d~Q0W*_b%SYDws$?ey6E5FY%#8iOBNwqynQttBj z*uTR9#90sv-sFzGZ!s4dPsMwBaIt~aAaLCE7z%ch@BRe^79bShc!tV{-cf3h^GsQb zg0S8Oprwr^t5!1)*rrq=pOGIfg-OUlV4IGWP$VA0^2eb)&ZfaT%E zzJ9-?qh6mt-;_=iVa)V{;`~sI6kJMeW6E^OQ0{tU?e*y3;8JcRJw{T zPaA;u7MgfT(T=zDRf}b0SiT^Nmj>Dqf1eC-p_3kE53L|BXV9x&a89GHH10jbg)x;r z2#9ALya;Jy9|5F7qN2&`1D+CV6XNz*v=g`vRPO2y;MkgT{D{OeXRH~$PD8+?nCkkK zXM3|}p{{_WDOn`MZYx3@Uz+q^_andgzCRfq!W%Krsv+<%<#!h`~X18DJt?EL(^FMuZM*gDzZ#1R4OvuN%}9a%Bd5tG-) z`FoAc+t$Ra`Bt4nloqKlZ^eD>t}FHza_~{q2RKrI6d(=weAQDOnbH`Qd`&Np6JYdS z-t&^+Llj2x9PeJ>qUy!jrk798ZND|_9HpbVn%%6zMj#*r0E!5@we-ce!g1bwiz+5? 
z_9yeMZM_7P5)U6eK}^}n7y5D=IbWoId+F8fmT)rjMg7&OL`n%><{Nq8 zkIq?|KMFd^PNX-poPM}vG05&`xTbxN=xY6rQRm&<%&d3JH7$ly>qo>FDBc>8awAw?09!&*2 zr2^F}8j5^-vM~SGrWL#(Q6U*26fJ&`h>!?M2=!CLSfT=P0E$HVQz)}|904ZBMtnR1 z`eYh%a`L(yHkD*5UPF#LPKlidI{>K`t})c)CjE7SYrDkne27G}iID?CLc`aF0~bf) z7Dw|I$7&h*)h6yTib(lOT=18<o5;k2u)-g(KUD-r_0U(JgXV0Zqc zZAS^IpTf8bvya>M?~;Nw8jWE<7M}aZxW1B0K!A_Gtgx_yqnD7IzoV0fYmlFims5bS zmyf4#kkffP!Jpd){1s1XFN}Q_{)Y&p;{SxnNzEigoPt0;>r-Y5eVowDWpfMR^{%`{+}vH5GHK7^VOipLE1;`zShANlOy)z& zfN`q~8K;EPw1)1pa-l{hEmvQR9``64eB;T?9h!KEHg{ehv1|^YnmuLiyzUY)7QLQ@ zv`U^mtN&E%HK(=KQXmZjrgVwv&%MpqdfjB$<$(Uo=hc zz|5OUZhg`p$}eh2wr_fpj;X1r=O11hHoaFW$q>cYU~raAlwH&?lS9KlD=m{iz&`mk zgd(y+J&{!r6$`N1vOr;M>Ii5Y3#&ds9fAH=h_b}?{JM$++JIKFiF7=X2}u$9OqI{E z+g*e2U;?Gr*pNxtUt5Joqbz399^6*a^f^YN9@EQKuF07MF@E4MWNEB}=G3oE9;Bkx zGa-XzI5kxbKDyEr)jMfxQf6EcV&aLSi8pz^h6XH1r<@0q~KFIbx-l=QL9H1~!V z6Gp}d{=f?@G&cb>9VCd7ZJj#X3=2ryObJ=k#V;e%682mq99}+il`s;wbyMj`I<(vo z<8*3fN_X*;5L0?*8Wa7RD`Fr|luII`StPr2+@x`PgO6C5@+?V3O-r-;!mZ}}(MO*r zra2}JFI<)j7bW$U;+LD=4xis~eE7IckyVCun2cwH?!1(1!n%UoS&Ew)Wj1#{ zyfl&3ZoK$`@V>($kI~j@@(~f8;P>$;bxQf0jGse#v@AX_{=48>A+f+=u1Yz#fW)q(#3o00+;=aSD zY=Pd@#SbGOFl4^OTQ*t2G}5p zNdZ4RI7x4ZpJH#O(E948=5a7lsI$S^VFfTa z8k32*k7z-#!0vy6a9bR{;8;w!D7Q}crF*rv{OB{>ZOUYOX>E8$M=hj(to@^p*t2Da z>;$Xp6AX(UJ8h%AoVR74nQVW)QFz`yb72j0utSczcAnj80`}w!lR{;4u^{UuiTLxG zi>%x|NkI{eWOd}ilMumB+4^f#d)<4Vv#Rkd+2*e5arxIA@Z`vpRgF`W&CU z%%IWcc?~_u4rMQgi-I}a3oiveCvPryQ_XCxGhIpgz8GIfCYVSaXEQe$&m&_~vwJi> zQ)71DTW`ZB7wy)xZTCW3TtPeH_5t+ZeC><0p7;yAZoMmabE`CnjEvb^v`BVkieV*QV-d%mAU zs^m?`zn#UCmDO*3mD!DUa?d81d)dXBQ)yVcPF`I@0zgNIDJxVaAOO>9g_6!Q)$4^f zh=E-KxC1QRk7a_Km_Bm%q&pQvs7|^U{DpvBn)LyNm!+$nk71E9E$q#jx zeG6UiT&2+q?@Y*{&`W+{8WA>_Uc(J9P-jXj5)ID%{$YB;#mxOBEt8b>PU={@B!cw9 ztn&2j+av?NOm2B$!r@u33zs)tbw?WSKSYu(mP%-5+jbqW8cV56(&A0SOFDMxLLA79 z?fJR2tk{3JI0~E{>65T=YgZ^>naDw0tSK)|C#N21>qw{T#=u|IklrCVEARzUg8-&e z$tg2sCLnp6ldri8pHL7p=H5Qt4kiE9MZbpNU-wJIedfTJoe0mcbRr#<8Y^o$B{`L5 zFAVHS^!KQIT8gYJNjwU576eQj_LzVoI(CUXI#!WYk>Jr{IHS*NjVZoGexpYcH`{~+ z{8i;I74d{P*)C+7a|&jJpMB;lTOIYqIgL9`Wbwg=$I|$jXp+qva}kCXZXM<4F+^0e zIqHO-3mP;q!5-g>IM&80GX@2N$$NFPEgBy9#?FqZyDY>c0;p^JPO*1tIPe3WZ^!To z4!z}h&tpri+@EN>Ri(PbNzg+9wUnTmiJDe>74R@ofQXobxQ&%9X*uoli`tbAhUPS9 zol-H_U+L9E)?F8EjiLs*-^w1%8@qq zg^p-bEVcJyn$62Qp`MIlKiOL)v&d0EZP22l#+vjevXuZ)U%gJzq1oBR9 z@4s0;jm^;!vn_3f;&LAppO}ROrO$$nZEUC>xh5fI9h;=$D6WA|85x<(f{N>vffB%? zkT5EkeAJT3JDkw|$=Y(*^oEJho1)hn8w}5zEz%pq*0=l_UH8p<_QfYj`RVr!*2D=c zF6~p&SDdX&uAY9vl52TKjICAY93C?&vTT@KFRybq&iD4Ir=GH>W*h>9HkAukS{Ze6 zQhnDU3}t5P4-9RLeTkZ6HEsNhzbt=Dxp+^c1JZ*fV2Vw+tv+A0wieqXdprz>U=#PFvh0&p*#lYuJ5H4 z!~QPhUDVFY#d4J-au(tT;3y<`JLJ*QvNl84K z=e`yWkj>Rp+;F{8>A@mGnmq1hIQ_6<55mDh*~O#06O#+kN>xenp$fi?(^T6#K&)>7 zJ@WvFqkt-CVjfu}qtjV=sI*h;1_>C(g-BBT6kGQkM7yV8a}-N~+6fqvH8tp@BrcBK zCM(-2mrB_a*!BXMVqOSxo`fwfo1nLW!Ud1eiux(=q^8pd6;(rry9FM;d9`WHjxqF! 
z6sHiN%S*fSwxS+Qkw}FM@oEUz0%{%Ek zQ|tEHY6JvL;b*8t$0O}ZR7wE;Fl2u+#)d9R`?D>5KL>qi zq2nDb1C@$4w2@&(hMNLcS@8{N+9%FcW=rwIaR??CBMqA=r%Ih&q13NVSp>zr7We|E zRerpsC9g`3(a0G^kT3K?Qk@d>XF3wN#OGheBr`pMv=+ig>VmcJ;A_!---%QY4|0 z2D2tAEWCB&(zadfezNsZ*;I_`wD9~$O_Ebh?zWZ~?(qT%Mhqfmk@kooF=wF6bYI{K z_t0AY+mCtXvB|~-Nfh^&gj@AzS3)1^0c0e3%SKkoaeZ05ZKiE-bup57-a_^2ICzWu z<2NXC<2e<76S6Tc3i<-kY0T1=Nm-P7m}9v_?6orW*Qwco71>5hpRY!7n zi5agDngL3RiDG<4s?(!zURSt>)J%Taex~aiP*Q%L_>sVEAH*Ew4=d%XW?= z)!BFWBb?2lI4z98hbPS7hQv%#Vj)A)rCOghU(v{t~yL3A`{^OMh}*I%1A~*RZm>1wCcg7ELt`p=h8iMR34K z`Uv~ref%(iZ#c?2Q1>(JGHY^ONr;UoZyt>BE{6{kl|RR<`95j7U%v>)2-d0rZMs3T ztm6EQE>cUM4RM*o04Npt#ED2h`n;4+xDrrDIaAN&ZOgBGP`r02@4D4e9x}dvT4MOK zYk^(ZoLFO5 z!QPbZPx!z@g)!?Hh9-bkX?KF<6t~z)c#4W_z&;hx5$%g5bZUdd>Q?ylyoKC;?#2bT z)cqNufe+=6XL?%QxJX|S=SA>nrnE0mu4-TY)Kz2mrDwuV_p#E};6{T+n6%L(S(S=y z9^TVi{CUfR_^*p^u93#ntBT`YRdX;F(pd4i09`?pT%mCJ6mahYrCx{mH>Jhpx$7PH zRXsJN_s=MwgU8nvFgh)^>n@2)u$G}n5}!iiY^>%j0^erx^Gw(!P%`t->{-%(%6a8b z<7kO$J*Y-BB}EA7KCgUzIWgaSd?GC;Z#$9oG9#Z5vW@hByE<3aGb>>iGDcq?tiK*0 zrocToV?Zx4n6sWR?WS7i{l$n|=~PlQ53k#{SS`%oD{#3|CMh$>y7`xabMqt6`82!3 z+DKU>X2nr7-71r3*f}4tkP4fZ5A?v#=E>ZJN>>%Oq$j7X_UJys8xe zn`Wg`DlA$R66p<$h?s`5{XCcjoU8TFw(;9Z5JQVbM^57?aJY8A}hOui40f4u?V=@rjZEdUTD&Hd8^$(TIB{k=*w5R^fnhNMWi0 zF`aaBi_^Lo&#=A5)aw|BE3hlZTk#&7V~irW0_#CRg*?nPBFRcPy0zz{#0}PWl&ALk zI#lsIhXa*@HrH}wGY>CHb8!+rjFD(Rk1sZ!@J{%0$`Q^|1o2P}x}AXFVJ+1I&TwZb z@+%q@d2Jsvb-GGRrR;yqlRYN{rtwKhuQ%c>84D<%u4&KJP$d0by`Oc%)=Z&66S-Na zZxW{&$>v1$@G7Azp^#npc$Bgf33`|Dw%z&Izqnu{E6yA@KD?s zYyFhM1E0c>Z!Q#$Vi&V|`YkY2^lQrP1_t6gj7I@kfvt^4e$R~ezaou?H#wFg4P9mm zLU^6@eV(vXmJ3hSb>~J`t*;xeCl=eZVK%%7x4!MFU8oXHdY!lFRzeL%Cyo#A;RpJ8 zoCD(sCjpAYcn0bxc3krX#}SJEQvwfk4RhlvKQ~?Lt8HR@QV|un8+t~SPBwK*P;&d^Y#uvQh=d0PUt_L zq2&a$8L_zlknoS^Q%^GgJ____&VM>_bCMOiiRxD-7|8B8oPI+3894n##Y%w<_Wk+% zYkd8D2cGPuW27tP>ZD=(D?a!?Jp4=j2n>7DQpbOTASD5=8}y7|`~OeQ;Znao{=OUh zo(a^K|OlRC-w+}lm|N}3uN=_jQ4*WWv5;O zdC*h$gFHk*QS$%b@gIkhslOcaH~@J_16)@>AM^PCjlrkRjm5S=OC%Io0|lsepEH5u zhKqoDWe-AywJx;8XhTCiGg*HJ7bj0U+Eij;>?N+Ef<~jU|IpGw;B&A6HVHl-ztTdZ z{n1KjG07XCivYsK3&`%L+X%3)?neIX~MbL>p|AleWp(kmRKpiEKKudEM)bH=XcYo52{srCmkMCXo1>NlrddR<^hhqCqBM@wV)KB@u zewTm#FZsc?5B^D)0O?{UGOJ+g`YR3U!m-w&(W){A#;P)^#vZ1+<|39DV^vdCljE^~ z{@3G2-yT#t>0>1YRZa~%t}tk2X{Zxox$2>>ow1hW7D delta 31 lcmZ23yjfUGzMF%C0SK7>rzDmn79=KTC#I*yZ{(WG2>^)t33vbi diff --git a/altair/datasets/_metadata/url.csv.gz b/altair/datasets/_metadata/url.csv.gz index 07cb52ec1c834808609b204ed2ffe0b4cd83f62e..7de1154e5be7e104ad07d9b3a7bce639375c8c67 100644 GIT binary patch delta 850 zcmV-Y1Fihm2HFOY6@PErFbn|Tdw)xnqy>rrI}O`)$K3;2Vr(_C=;`6E{q@t!j^h*? 
z5c<+42Kb4VWr~XAo;&bI!+UrPRzH0CP{U*A{ouat_d=T8tCOJBXmXv!PGH{v1ua7H z8g4bb->;SUYAGw7x9SXpd0ZfQDw`l%@B}Z%*97xtN5$NvS+5KQOYy}L+!E4C(bd6c zCeR2l-g2*yklqcIABVsYzbncUo{5^WRXuP5m*a~imE+C>WiDqT->$T3RHh|x1t{qO zx4!!wIOzphAb&+Yo$$~7T#C*m@T9pVl$9J_6G|VMgaM2MBS?$o)|$ppZRZ_(I-Du^ zj9jbFhb`Eotldr6wYmkmGG8_@Ush(_BI$62@*8?W^bOaVpjM$I+w!4Tajn_=!I@Zz ztOKrn$CQa1Fl^vogLyhy)e{UP>@p9e#4boisZ&^IEI%KG zZ=8V!5jQ2FJfRs{OF7k7YzXx)MVB-+XI3GldzplsmPRAdd`F9+?I`G#Q&Y{?ex%L; zl_cXcwA6ov27W37WPOGv9nWDnw())jlD%N+gy$;*;GlsF7be5Ka$tF*DN}JV$}IF~ zxD}LjF@Fh@HDghL=mw8M|$xqP_<4A0u`f+6I-4(C_|XFNvNLY z{uFOK?i;bcwVZNxxwYyTs0^h_ytN$I#tU?Aw7yS<@R%Yrb0Z3rZ$4mF3&_+jlsn3N#lDTw^IVTPiTCyk7 c0=*a|(GN8Ul$2L>5^{=v0TW#UNTU}307+|?ng9R* delta 847 zcmV-V1F-zs2G<6V6@OFPFc1LXJHG`vAu~*8dg`>-9(&Kol46UE>V)5GI*G?%)n1$F;meyekE8d-ecfyr*WHWbLdn^7TKG-;i_I(s zN_dHN-{xKS^we!P>4QxQ?u;HDsI&&Gcg@lQbj-~URO>($F@IJtL~`$x7zsMUj=|Im zgjs;X_0%fy+KiR(N=eI|xAG|3Yj|PgGWLNF!nbodyr!4GcSy@EnDtU4tQ1}-Ms0gi zDY!DQBMdYX+FR-+v?q5X<(Xg*;?EK>x2sSSwsI1*UCZHx;?hy)1)`3tke`)XIZHjX zYbCxUC#dz~?|(os+v36~=-~i=nza<1kGzATRuC?X&|GMSncQ2dCYnCSsS6~B(L3_A zJtDZ(c#rP3cTf}3QETCJ;ogJ2PWBGw75R)OpRdUJHblGC$@}sE;Crf6J6eH~>OjpT z<8rlR6jb?6n zo5=a7<|N8iS!Plv?cki4kCwz`8}r?j)VC zv{#5aoPUW@f(bOW46z~>VBuy=<50p2rNK^d|L=ISwh9G=UO3EVukjTd6>FDbaA9JH zGz6*JE4BcYQ`8^gQ^&VX>|ZTK+)8e#Is{0KIFw5GLMiDbi&GSD81V${L%LvK><_(V zFcTE+E_5cY$$gxKc&x-L!+s1lz6&Vktoo0ekADIusP{^cQ~WxnCIf<1ucR#b_Mty# zKN|U7LlxURZ?~y;-nO$VaTak{aXhdiqOLhSOCeq_Q@haK4X;93hC#;`Nx|zB-<*Up z3(m!6lLv>pp?}Wa4tu8dgNo+|G_y6nVA$Qi?D~SmD^hzYb99B`0-a+w4v7E$To`#?PGA)$PnhL% ZCQ6!b(lFqNy}1B6!M}FHK(GfF007c1q}l)g diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 6ac13695e..d0094f5ff 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -86,9 +86,6 @@ __all__ = ["backend"] _METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" -_DATAPACKAGE: Final[Path] = ( - Path(__file__).parent / "_metadata" / "datapackage_features.parquet" -) class AltairDatasetsError(Exception): ... 
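The hunk below repoints ``_scan_metadata`` at the single ``metadata.parquet``. As a rough standalone equivalent (an illustrative sketch, not code from this patch), the same lookup can be expressed directly in ``polars`` against the bundled file, using the column names from the reworked ``Metadata`` TypedDict:

    import polars as pl

    # assumes the repository layout used in this series
    metadata = pl.scan_parquet("altair/datasets/_metadata/metadata.parquet")
    result = (
        metadata.filter(
            (pl.col("dataset_name") == "cars") & (pl.col("suffix") == ".json")
        )
        .select("dataset_name", "suffix", "sha", "url")
        .collect()
    )
    cars_url = result.item(0, "url")  # what the reader's url()/dataset() methods resolve to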
@@ -209,7 +206,7 @@ def query( def _scan_metadata( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] ) -> nw.LazyFrame: - frame = nw.from_native(self.scan_fn(_DATAPACKAGE)(_DATAPACKAGE)).lazy() + frame = nw.from_native(self.scan_fn(_METADATA)(_METADATA)).lazy() if predicates or constraints: return frame.filter(*predicates, **constraints) return frame diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index c83c6066e..87d1ac366 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -22,17 +22,10 @@ from typing_extensions import TypeAlias -__all__ = [ - "EXTENSION_SUFFIXES", - "VERSION_LATEST", - "Dataset", - "Extension", - "Metadata", - "Version", - "is_ext_read", -] +__all__ = ["EXTENSION_SUFFIXES", "Dataset", "Extension", "Metadata", "is_ext_read"] Dataset: TypeAlias = Literal[ + "7zip", "airports", "annual-precip", "anscombe", @@ -42,13 +35,13 @@ "budgets", "burtin", "cars", - "climate", "co2-concentration", "countries", "crimea", "disasters", "driving", "earthquakes", + "ffox", "flare", "flare-dependencies", "flights-10k", @@ -61,12 +54,11 @@ "football", "gapminder", "gapminder-health-income", + "gimp", "github", "global-temp", - "graticule", "income", "iowa-electricity", - "iris", "jobs", "la-riots", "londonBoroughs", @@ -86,10 +78,8 @@ "political-contributions", "population", "population_engineers_hurricanes", - "seattle-temps", "seattle-weather", "seattle-weather-hourly-normals", - "sf-temps", "sp500", "sp500-2000", "stocks", @@ -102,71 +92,24 @@ "us-state-capitals", "volcano", "weather", - "weball26", + "weekly-weather", "wheat", "windvectors", "world-110m", "zipcodes", ] -Version: TypeAlias = Literal[ - "v2.11.0", - "v2.10.0", - "v2.9.0", - "v2.8.1", - "v2.8.0", - "v2.7.0", - "v2.5.4", - "v2.5.3", - "v2.5.3-next.0", - "v2.5.2", - "v2.5.2-next.0", - "v2.5.1", - "v2.5.1-next.0", - "v2.5.0", - "v2.5.0-next.0", - "v2.4.0", - "v2.3.1", - "v2.3.0", - "v2.1.0", - "v2.0.0", - "v1.31.1", - "v1.31.0", - "v1.30.4", - "v1.30.3", - "v1.30.2", - "v1.30.1", - "v1.29.0", - "v1.24.0", - "v1.22.0", - "v1.21.1", - "v1.21.0", - "v1.20.0", - "v1.19.0", - "v1.18.0", - "v1.17.0", - "v1.16.0", - "v1.15.0", - "v1.14.0", - "v1.12.0", - "v1.11.0", - "v1.10.0", - "v1.8.0", - "v1.7.0", - "v1.5.0", -] -Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow", ".parquet"] -VERSION_LATEST: Literal["v2.11.0"] = "v2.11.0" +Extension: TypeAlias = Literal[".arrow", ".csv", ".json", ".parquet", ".tsv"] EXTENSION_SUFFIXES: tuple[ + Literal[".arrow"], Literal[".csv"], Literal[".json"], - Literal[".tsv"], - Literal[".arrow"], Literal[".parquet"], -] = (".csv", ".json", ".tsv", ".arrow", ".parquet") + Literal[".tsv"], +] = (".arrow", ".csv", ".json", ".parquet", ".tsv") def is_ext_read(suffix: Any) -> TypeIs[Extension]: - return suffix in {".csv", ".json", ".tsv", ".arrow", ".parquet"} + return suffix in {".arrow", ".csv", ".json", ".parquet", ".tsv"} class Metadata(TypedDict, total=False): @@ -177,29 +120,34 @@ class Metadata(TypedDict, total=False): ---------- dataset_name Name of the dataset/`Path.stem`_. - ext_supported - Dataset can be read as tabular data. + suffix + File extension/`Path.suffix`_. file_name Equivalent to `Path.name`_. - name_collision - Dataset is available via multiple formats. - - .. note:: - Requires specifying a preference in calls to ``data(name, suffix=...)`` + bytes + File size in *bytes*. + is_image + _description_ + is_tabular + Can be read as tabular data. 
+ is_geo + _description_ + is_topo + _description_ + is_spatial + _description_ + is_json + _description_ + has_schema + Data types available for improved ``pandas`` parsing. sha Unique hash for the dataset. .. note:: - If the dataset did *not* change between ``v1.0.0``-``v2.0.0``; + E.g. if the dataset did *not* change between ``v1.0.0``-``v2.0.0``; - then all ``tag``(s) in this range would **share** this value. - size - File size (*bytes*). - suffix - File extension/`Path.suffix`_. - tag - Version identifier for a `vega-datasets release`_. - url_npm + then this value would remain stable. + url Remote url used to access dataset. .. _Path.stem: @@ -208,13 +156,14 @@ class Metadata(TypedDict, total=False): https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.name .. _Path.suffix: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - .. _vega-datasets release: - https://github.com/vega/vega-datasets/releases + Examples -------- ``Metadata`` keywords form constraints to filter a table like the below sample: + ### FIXME: NEEDS UPDATING TO DATAPACKAGE VERSION + ``` shape: (2_879, 9) ┌───────────┬──────────┬──────────┬──────────┬───┬────────┬─────────┬──────────┐ @@ -249,14 +198,18 @@ class Metadata(TypedDict, total=False): """ dataset_name: str - ext_supported: bool + suffix: str file_name: str - name_collision: bool + bytes: int + is_image: bool + is_tabular: bool + is_geo: bool + is_topo: bool + is_spatial: bool + is_json: bool + has_schema: bool sha: str - size: int - suffix: str - tag: str - url_npm: str + url: str FlFieldStr: TypeAlias = Literal[ diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 66353b9e4..95a6fb0ad 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -16,7 +16,7 @@ from narwhals.stable.v1 import dependencies as nw_dep from altair.datasets import Loader, url -from altair.datasets._readers import _METADATA, AltairDatasetsError +from altair.datasets._readers import AltairDatasetsError from altair.datasets._typing import Dataset, Extension, Metadata, is_ext_read from tests import skip_requires_pyarrow, slow @@ -296,9 +296,6 @@ def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: assert match_url("flights-10k", url("flights-10k")) assert match_url("flights-200k", url("flights-200k")) - with pytest.raises(TypeError, match="cannot be loaded via url"): - url("climate") - with pytest.raises(TypeError, match="cannot be loaded via url"): url("flights-3m") @@ -690,9 +687,7 @@ def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - def test_metadata_columns(backend: _Backend, metadata_columns: frozenset[str]) -> None: """Ensure all backends will query the same column names.""" data = Loader.from_backend(backend) - fn = data._reader.scan_fn(_METADATA) - native = fn(_METADATA) - schema_columns = nw.from_native(native).lazy().collect().columns + schema_columns = data._reader._scan_metadata().collect().columns assert set(schema_columns) == metadata_columns diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index c9f35ae7f..131e15bac 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -26,7 +26,6 @@ from polars import col from tools.codemod import ruff -from tools.datasets.github import GitHub from tools.datasets.npm import Npm from tools.schemapi import utils @@ -40,13 +39,10 @@ from typing_extensions import TypeAlias _PathAlias: TypeAlias = Literal[ - "npm_tags", - "gh_tags", - "gh_trees", "typing", "url", - "dpkg_features", - "dpkg_schemas", + 
"metadata", + "schemas", ] __all__ = ["app"] @@ -67,20 +63,11 @@ class Application: Directories to store ``.parquet`` metadata files. out_fp_typing Path to write metadata-derived typing module. - write_schema - Produce addtional ``...-schema.json`` files that describe table columns. - trees_gh - ``GitHub.trees`` metadata file name. - tags_gh - ``GitHub.tags`` metadata file name. - tags_npm - ``Npm.tags`` metadata file name. - kwds_gh, kwds_npm + kwds_npm Arguments passed to corresponding constructor. See Also -------- - - tools.datasets.github.GitHub - tools.datasets.npm.Npm """ @@ -90,42 +77,20 @@ def __init__( out_dir_altair: Path, out_fp_typing: Path, *, - write_schema: bool, - trees_gh: str = "metadata", - tags_gh: str = "tags", - tags_npm: str = "tags_npm", - kwds_gh: Mapping[str, Any] | None = None, kwds_npm: Mapping[str, Any] | None = None, ) -> None: out_dir_tools.mkdir(exist_ok=True) - kwds_gh = kwds_gh or {} kwds_npm = kwds_npm or {} - self._write_schema: bool = write_schema - self._npm: Npm = Npm(out_dir_tools, name_tags=tags_npm, **kwds_npm) - self._github: GitHub = GitHub( - out_dir_tools, - out_dir_altair, - name_tags=tags_gh, - name_trees=trees_gh, - npm_cdn_url=self._npm.url.CDN, - **kwds_gh, - ) + self._npm: Npm = Npm(out_dir_tools, **kwds_npm) self.paths = types.MappingProxyType["_PathAlias", Path]( { - "npm_tags": self.npm._paths["tags"], - "gh_tags": self.github._paths["tags"], - "gh_trees": self.github._paths["trees"], "typing": out_fp_typing, "url": out_dir_altair / "url.csv.gz", - "dpkg_features": out_dir_altair / "datapackage_features.parquet", - "dpkg_schemas": out_dir_altair / "datapackage_schemas.json.gz", + "metadata": out_dir_altair / "metadata.parquet", + "schemas": out_dir_altair / "schemas.json.gz", } ) - @property - def github(self) -> GitHub: - return self._github - @property def npm(self) -> Npm: return self._npm @@ -151,35 +116,26 @@ def refresh( .. 
_vega-datasets@3: https://github.com/vega/vega-datasets/issues/654 """ - if not frozen: - print("Syncing datasets ...") - npm_tags = self.npm.tags() - self.write_parquet(npm_tags, self.paths["npm_tags"]) - - gh_tags = self.github.refresh_tags(npm_tags) - self.write_parquet(gh_tags, self.paths["gh_tags"]) - - gh_trees = self.github.refresh_trees(gh_tags) - self.write_parquet(gh_trees, self.paths["gh_trees"]) - - npm_urls_min = ( - gh_trees.lazy() - .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") - .filter(col("size") == col("size").min().over("dataset_name")) - .select("dataset_name", "url_npm") - ) - self.write_csv_gzip(npm_urls_min, self.paths["url"]) - else: - print("Reusing frozen metadata ...") - gh_trees = pl.read_parquet(self.paths["gh_trees"]) - + print("Syncing datasets ...") package = self.npm.datapackage(tag=tag, frozen=frozen) - self.write_parquet(package["features"], self.paths["dpkg_features"]) - self.write_json_gzip(package["schemas"], self.paths["dpkg_schemas"]) + self.write_parquet(package["features"], self.paths["metadata"]) + self.write_json_gzip(package["schemas"], self.paths["schemas"]) + # FIXME: 2-Part replacement + # - [x] Switch source to `"metadata"` + refresh (easy) + # - [ ] Rewriting `UrlCache` to operate on result rows (difficult) + urls_min = ( + package["features"] + .lazy() + .filter(~(col("suffix").is_in((".parquet", ".arrow")))) + .select("dataset_name", "url") + .sort("dataset_name") + .collect() + ) + self.write_csv_gzip(urls_min, self.paths["url"]) if include_typing: self.generate_typing() - return gh_trees + return package["features"] def reset(self) -> None: """Remove all metadata files.""" @@ -237,25 +193,16 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None fp.touch() df = frame.lazy().collect() df.write_parquet(fp, compression="zstd", compression_level=17) - if self._write_schema: - schema = {name: tp.__name__ for name, tp in df.schema.to_python().items()} - fp_schema = fp.with_name(f"{fp.stem}-schema.json") - if not fp_schema.exists(): - fp_schema.touch() - with fp_schema.open("w") as f: - json.dump(schema, f, indent=2) def generate_typing(self) -> None: from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT - tags = self.scan("gh_tags").select("tag").collect().to_series() - metadata_schema = self.scan("gh_trees").collect_schema().to_python() + dpkg = self.scan("metadata") + metadata_schema = dpkg.collect_schema().to_python() DATASET_NAME = "dataset_name" names = ( - self.scan("gh_trees") - .filter("ext_supported") - .unique(DATASET_NAME) + dpkg.unique(DATASET_NAME) .select(DATASET_NAME) .sort(DATASET_NAME) .collect() @@ -263,34 +210,32 @@ def generate_typing(self) -> None: ) indent = " " * 4 NAME = "Dataset" - TAG = "Version" - LATEST = "VERSION_LATEST" - LATEST_TAG = f"{tags.first()!r}" EXT = "Extension" - EXTENSION_TYPES = ".csv", ".json", ".tsv", ".arrow", ".parquet" + EXT_TYPES = tuple( + dpkg.filter(is_image=False) + .select(col("suffix").unique().sort()) + .collect() + .to_series() + .to_list() + ) EXTENSION_SUFFIXES = "EXTENSION_SUFFIXES" EXTENSION_TYPE_TP = ( - f"tuple[{', '.join(f'Literal[{el!r}]' for el in EXTENSION_TYPES)}]" + f"tuple[{', '.join(f'Literal[{el!r}]' for el in EXT_TYPES)}]" ) EXTENSION_GUARD = "is_ext_read" METADATA_TD = "Metadata" DESCRIPTION_DEFAULT = "_description_" NOTE_SEP = f"\n\n{indent * 2}.. 
note::\n{indent * 3}" - name_collision = ( - f"Dataset is available via multiple formats.{NOTE_SEP}" - "Requires specifying a preference in calls to ``data(name, suffix=...)``" - ) sha = ( f"Unique hash for the dataset.{NOTE_SEP}" - f"If the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{indent * 3}" - f"then all ``tag``(s) in this range would **share** this value." + f"E.g. if the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{indent * 3}" + f"then this value would remain stable." ) links = ( f".. _Path.stem:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem\n" f".. _Path.name:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.name\n" f".. _Path.suffix:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix\n" - f".. _vega-datasets release:\n{indent * 2}https://github.com/vega/vega-datasets/releases" ) import textwrap @@ -299,6 +244,8 @@ def generate_typing(self) -> None: -------- ``{METADATA_TD}`` keywords form constraints to filter a table like the below sample: + ### FIXME: NEEDS UPDATING TO DATAPACKAGE VERSION + ``` shape: (2_879, 9) ┌───────────┬──────────┬──────────┬──────────┬───┬────────┬─────────┬──────────┐ @@ -334,14 +281,13 @@ def generate_typing(self) -> None: descriptions: dict[str, str] = { "dataset_name": "Name of the dataset/`Path.stem`_.", - "ext_supported": "Dataset can be read as tabular data.", + "suffix": "File extension/`Path.suffix`_.", "file_name": "Equivalent to `Path.name`_.", - "name_collision": name_collision, + "bytes": "File size in *bytes*.", + "is_tabular": "Can be read as tabular data.", + "has_schema": "Data types available for improved ``pandas`` parsing.", "sha": sha, - "size": "File size (*bytes*).", - "suffix": "File extension/`Path.suffix`_.", - "tag": "Version identifier for a `vega-datasets release`_.", - "url_npm": "Remote url used to access dataset.", + "url": "Remote url used to access dataset.", } metadata_doc = ( f"\n{indent}".join( @@ -375,14 +321,12 @@ def generate_typing(self) -> None: utils.import_typing_extensions((3, 13), "TypeIs"), utils.import_typing_extensions((3, 10), "TypeAlias"), "\n", - f"__all__ = {[NAME, TAG, EXT, METADATA_TD, EXTENSION_GUARD, EXTENSION_SUFFIXES, LATEST]}\n\n" + f"__all__ = {[NAME, EXT, METADATA_TD, EXTENSION_GUARD, EXTENSION_SUFFIXES]}\n\n" f"{NAME}: TypeAlias = {utils.spell_literal(names)}", - f"{TAG}: TypeAlias = {utils.spell_literal(tags)}", - f"{EXT}: TypeAlias = {utils.spell_literal(EXTENSION_TYPES)}", - f"{LATEST}: Literal[{LATEST_TAG}] = {LATEST_TAG}", - f"{EXTENSION_SUFFIXES}: {EXTENSION_TYPE_TP} = {EXTENSION_TYPES!r}", + f"{EXT}: TypeAlias = {utils.spell_literal(EXT_TYPES)}", + f"{EXTENSION_SUFFIXES}: {EXTENSION_TYPE_TP} = {EXT_TYPES!r}", f"def {EXTENSION_GUARD}(suffix: Any) -> TypeIs[{EXT}]:\n" - f"{indent}return suffix in set({EXTENSION_TYPES!r})\n", + f"{indent}return suffix in set({EXT_TYPES!r})\n", UNIVERSAL_TYPED_DICT.format( name=METADATA_TD, metaclass_kwds=", total=False", @@ -408,7 +352,6 @@ def generate_typing(self) -> None: Path(__file__).parent / "_metadata", _alt_datasets / "_metadata", _alt_datasets / "_typing.py", - write_schema=False, ) diff --git a/tools/datasets/_metadata/tags.parquet b/tools/datasets/_metadata/tags.parquet deleted file mode 100644 index 189dbbcae0b49d624a63d54b76e6ca9ce9425e3c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6289 zcmds62|Sct+rNj&mSw_yo6*QVAbIrA0j2=qVI$ 
z*@=`bq#{ypo3v=tf>wHp@7!a_^FHFP<-;WKoD2FH0zR5xkQ4aAVh!7L(~O`uU=#P~D2Th`?ca zc{eu@o>ANxd8GP?>r2(b>I==y%VR%N;MG-8?KsGKRxY99Q3)vp*5MM!vHH#=QC3g= z?flN_5LXwwqNG7sK{{faj0`Od zX+-f&bVEALkTwQTO`nZ}L_mdw6O);*CT-0o+a1-ne`M*PwZw_HhL!EN`e}rx&Q33% z!~Bz{Kszi=GKItlB%)|A+Z&cfSV+1g(jrQcu>9sEASRhJ;C#8sF-Z~Q&47A+^<`xX zj*DtV5ep431=X{qZr+2sz-Rh)FFHOcE3=N@vGR7%)5?5b*>LsLGqV`f?K$!mz2eyg zVvhb-8*klOs$jp9?PPM1o2RHR_vWgiQiiQ6gD<^mq&aoy$;r1Grbhf^W|z;oD`{mt zR}Mu+@rl+I7r?JNMTW zK7aq&N#kjm%eLEP5Bm+m+Gkxp6_cl6R()ir@A0Bv_pj}YP0Lz+v66W;#(3dFMQ6|9 z^n(!@{Fc=A1Fv%rxh!3Ax~a<`B;)o5xu#9h2I2x%Lq86RR;zN2fYbHXu<6=!;_8Nq z*&#*t8A;EJ9-D2mX_yxBrS8%~SLM>Tg3|`Q8`#5FmVbIT#X|d4dElP@qYe^%2jcI4 z3_qWgaKr72Yx-1`l7q{1)-~5HmNi)4-TB_FoxhE+XqkM>tUK{(X00Bc=&pi+c7etr z>9?Nl25WkePODu_RjbpRW8cs2;v^Q!^l8*TN2gotn%kPH^sx0-z?UmSFCxkEQo(crDLJ;LB$t&3ZG})v#hW3uQ9Pv zbd%Y-^2|mn$z3xy>>_AGi0(NbKhwfoVL_z_uF~* z5q{{0LH*ZL&Jul6b|~^qtC5?-{z{ww#cc z90#(C%+^QVhSq;{Hz#!WAGl_dYtxnX;ih1isL*hT_50`iONzr5$EXfB`%3i>mPw{7 z1Z(1-i0|&_*Kf4m5W1ukT@?~hFV+&^x=*DpfY|+cgcE$}ONU7)_xfUP!hVGKHk2qV`$NQ*FM+fx!E6oOysDXv}b&D3m#sUyVi=Dia7>pJhobfCj@k+rC9N zIy{D{Ih|(WWopdknJ_HO%#FN^O-(IK%`G@|wuvd%oM*9?<7H$@)S(%%Ip%aWkISWV zyl7?|y4P9;*TmF|W5Vz<;dyZ^xaJlnMr)0Jp0Nacwve%&sUZpSlma|PS^p8Bfb~*r z;6}n^)=MN@QG+r9)a2mkS-my*mT2qUmlCJ#>CjpzZ=}&@HUd3?dZ6@{yYa1I4^LP} zehSphM>~^lo6ou96LgUt9!eqI)O*|#OFp0xw>HyOR=MhiTu%#`;^dv!*_>%wCxyQC zJ6EDzAD7v!x}EE0$-GcKThP+r;gx0h@(?2xibdiI$H_$#cQ3L!Wd;-zD0UqUw!{t27Ez%^L!wh;l!+j_HDW`>6_&1)D;E@tw0&ih6T1gO zYtqzsKAo*?)nz$V8;98YJ94a2R&+#8pDEcV8Q*`-wK<~gV-=tOkL&n)$(y;4$#F*$ zq&YI?aU zqVB0~b~#{3`PQ;I$M%{qbZ@wbe*p|R(4|C8?u&D#pE^0uBtG-V9^Yl5Nln*1nas^2 z4-)jyTf+ZB8!<8)h4)1O@b@HoeZeSD~lT)P~>j zK~6L2{2*Y<9GfonSEpwVde{>-yC=UXHdGiE*okZ1?JLbcP*!^h?W(B=SFD{k{nY%P z?5q<=Y3e_flyvD{j%E*OJN<{F2F^T=^ii%ZTxa)KA*^xa)0#B7gAscjhVwk>Dx+i> z>p4o6>eUglYucl>mBj@QsJ22SD-n(o!WvIa?9eODD1EtTxtpHL-dcjhH_(VP@dYWOe~D;X;Ea(I(1A>hf+#YSYwj zQ7DHW92_8+O{KB}dcCz{fRmLYqTvX^vPpmkgt%uYZf7iFy$Z-Bk!Mp|9>k)V8c*@wd_GLzRMAoK#&NE z+p0b?jEBH^9)WfKbe@;M4(-@Ql)#M&*d2T4P@C!shJp$Uhtt8)AQ(S(VxQv2UgsYq z4B}B7Qh}ok)36|Z(-J@uEegBa!?X!RHKsV|O|6(@N+n%te!tAj6(do{SJ(`$<}TI_ zwpX{p7lrKi!_w?K_CPd~a60l~|Bqh@=gPk%+QB z0*`;4Y$@VNNuPuHK4yY6O@uV@5=tVDy_NzK2m}drKYSh}5r6pXHp%=?P{xlk9*Ou9 zll&+zO>$|107%4%XL+)EY#~EHJ^p}vM?JngzoQ;EgsB}yvxNgzabWinrjLvq6oJcd zAd?V@jC@jn&x(x6(Yo+4{Fr=}4u_73876SSc=yj{aLTxxYcbI!eGcr2^@c@A2kr1d z{{F!*0fMEqMT9>R z(h;5I0Q8$TWBP?C>OIy^SPH9KurUi!CQ#5ochn5)u?XwWT!M+10$mmgj4%Y42#kP6 z;dmxifYB?kc|f^*%)W3Oidr+kASxISR3_M<0508PGuO{~E%*xDTx>$!z%O+U0KW5u`$hPeEzrX2eVyFH*vA3 z{DPv z8<@sQ$I+Bn3zaBJ6|Ps1a$o>!;u`V!-&=n_jt^UsOq#0;7lnvo4*tIEAOUFy*!8~o O2kD4IkOTN<#Qy-UbLb%e diff --git a/tools/datasets/_metadata/tags_npm.parquet b/tools/datasets/_metadata/tags_npm.parquet deleted file mode 100644 index d53aff4a1b06955c4043c298c930178edae13db9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2598 zcmcJRe{54#6vyv-{R!)~cH6z_GY{#IH5G7lLzZI3cBSMc^XS6@H&$(D^5eeT}#d%G?^@$Y*#ebGQ@>5f~j z8IEhWYV*kgby?ha@`q!>OVl>SQn6t7%X{pOn=&O-?e({64SU>d2GYQ+*_({>aW&MUN=bsa^8 zMfO6w!f$bW$`Kw0F-~h*U@=s9#U+PkxqdCJo?f-0>Dkq9wXVI~)o6P$(DFv`o!-#Z zzVNKC+FScK_YTGeyQo2!OSDG8tZMf*Qv-{!otf;Q+Nv)dGa#hTLZ`c+Nd`Ao5yDH{ z_WZ=KngBJUwo%Y}B*V+B=SDjC#JpQF>R~DjVjNe(ISExOD4JI9b8QhB<`Bm|j&P#|ukk0l!h+qWk(tuf7bGwOQ$aF3@l-@jY-8{9g0pzXllC;t0s z{mjF)&Y`vbWrzKhgK5iZ*`O?|Ps2{}*0@ey)>z?SS&g*VMGdJ{*>^#*RH~ij(~OCs z&$n&6{m)cPw}V!WhSw;}_ri-mC)2IUBz*YQ9D*CdyE)QROVnao{=?pOe{iH2c=kJI zJvUhf9ZMQO!?eUjvr?dr*9hs}q1siRZ%W+w^6BC0{S65YQLWUGh8nc#wd11Z?juA8 z)=8vpKvzms8Zr=>Ozse7gdNyLn0U@LIKeV=B$=z!`$)ut=I1-x!RgP2qMRtLwUvt1k1d*N6&`zZr*V9wU*M~Cb 
z{$zhBlkQ3NvuOa`DKEs9s3UD{?bJZ1p@5e^9|goEqEQXwku#j^W8o`(xwc07>AMK& zhFN#zny{E43L$G;Ek59PitWy$W_H{}9mLwF71Pe(#6K;@cFB2^z);}kuYfStc zv*XBdBnLJ63~3qJK)xdd+(^v+C;nt7VcD zl@>}8v^CPqYOrtGg!ZM9)U5c`Lkk`-&5d)Gb!ZG%d%mceQ(+e@SI%^_+zRBOZavj{!HLIMRYidULWr$Jy;C>(e zdB|@^dGwN(UWq&);l#0jvD02(jJyv=-?u9Cyl)O>T^~v%en5@O|==w!iY0 i2NpXwNYecI#H_7u3wa|^Aq3y4yME-XnviPvhxk9Ii#j?0 diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index 549889f6d..9945bd07d 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -72,6 +72,7 @@ def extract_features(pkg: FlPackage, base_url: str, /) -> pl.DataFrame: "licenses", "hash", "description", + "path", ) return ( pl.LazyFrame(pkg["resources"]) @@ -82,6 +83,7 @@ def extract_features(pkg: FlPackage, base_url: str, /) -> pl.DataFrame: .select( DATASET_NAME, path_suffix("path").alias("suffix"), + col("path").alias("file_name"), ~cs.by_name(DATASET_NAME, EXCLUDE), *FEATURES, col("schema").is_not_null().alias("has_schema"), diff --git a/tools/datasets/github.py b/tools/datasets/github.py deleted file mode 100644 index a2956df28..000000000 --- a/tools/datasets/github.py +++ /dev/null @@ -1,490 +0,0 @@ -from __future__ import annotations - -import json -import os -import random -import sys -import time -import urllib.request -import warnings -from collections.abc import Iterable, Iterator, Mapping, Sequence -from itertools import islice -from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, ClassVar, Literal, TypeVar, cast - -import polars as pl -from polars import col - -from tools.datasets import semver -from tools.datasets.models import ( - GitHubRateLimitResources, - GitHubTag, - GitHubTree, - GitHubTreesResponse, - GitHubUrl, - ParsedRateLimit, - ParsedTag, - ParsedTree, - SemVerTag, -) - -if sys.version_info >= (3, 13): - from typing import is_typeddict -else: - from typing_extensions import is_typeddict - -if TYPE_CHECKING: - from collections.abc import MutableMapping - from email.message import Message - from urllib.request import OpenerDirector, Request - - from altair.datasets._typing import Extension - - if sys.version_info >= (3, 13): - from typing import TypeIs - else: - from typing_extensions import TypeIs - if sys.version_info >= (3, 11): - from typing import LiteralString - else: - from typing_extensions import LiteralString - if sys.version_info >= (3, 10): - from typing import TypeAlias - else: - from typing_extensions import TypeAlias - - _PathName: TypeAlias = Literal["dir", "tags", "trees"] - - -__all__ = ["GitHub"] - -_TD = TypeVar("_TD", bound=Mapping[str, Any]) - -_DATA = "data" - - -def is_ext_supported(suffix: str) -> TypeIs[Extension]: - return suffix in {".csv", ".json", ".tsv", ".arrow", ".parquet"} - - -def _is_str(obj: Any) -> TypeIs[str]: - return isinstance(obj, str) - - -class _ErrorHandler(urllib.request.BaseHandler): - """ - Adds `rate limit`_ info to a forbidden error. - - .. 
_rate limit: - https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28 - """ - - def http_error_default( - self, req: Request, fp: IO[bytes] | None, code: int, msg: str, hdrs: Message - ): - if code == 403 and (reset := hdrs.get("X-RateLimit-Reset", None)): - limit = hdrs.get("X-RateLimit-Limit", "") - remaining = hdrs.get("X-RateLimit-Remaining", "") - msg = ( - f"{msg}\n\nFailed to balance rate limit.\n" - f"{limit=}, {remaining=}\n" - f"Reset: {time.localtime(int(reset))!r}" - ) - raise urllib.request.HTTPError(req.full_url, code, msg, hdrs, fp) - - -class _GitHubRequestNamespace: - """ - Fetching resources from the `GitHub API`_. - - .. _GitHub API: - https://docs.github.com/en/rest/about-the-rest-api/about-the-rest-api?apiVersion=2022-11-28 - """ - - _ENV_VAR: LiteralString = "VEGA_GITHUB_TOKEN" - _TAGS_MAX_PAGE: Literal[100] = 100 - _VERSION: LiteralString = "2022-11-28" - _UNAUTH_RATE_LIMIT: Literal[60] = 60 - _TAGS_COST: Literal[1] = 1 - _TREES_COST: Literal[2] = 2 - _UNAUTH_DELAY: Literal[5_000] = 5_000 - """**ms** delay added between **unauthenticated** ``trees`` requests.""" - _AUTH_DELAY: Literal[500] = 500 - """**ms** delay added between **authenticated** ``trees`` requests.""" - _UNAUTH_TREES_LIMIT: Literal[10] = 10 - - def __init__(self, gh: GitHub, /) -> None: - self._gh = gh - - @property - def url(self) -> GitHubUrl: - return self._gh.url - - def rate_limit(self) -> GitHubRateLimitResources: - with self._gh._opener.open(self._request(self.url.RATE)) as response: - content: GitHubRateLimitResources = json.load(response)["resources"] - return content - - def delay(self, *, is_auth: bool) -> float: - ms = self._AUTH_DELAY if is_auth else self._UNAUTH_DELAY - return (ms + random.triangular()) / 1_000 - - def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: - if n < 1 or n > self._TAGS_MAX_PAGE: - raise ValueError(n) - req = self._request(f"{self.url.TAGS}?per_page={n}") - with self._gh._opener.open(req) as response: - content: list[GitHubTag] = json.load(response) - if warn_lower and len(content) < n: - earliest = response[-1]["name"] - n_response = len(content) - msg = f"Requested {n=} tags, but got {n_response}\n{earliest=}" - warnings.warn(msg, stacklevel=3) - return content - - def trees(self, tag: str | ParsedTag, /) -> GitHubTreesResponse: - """For a given ``tag``, perform **2x requests** to get directory metadata.""" - if _is_str(tag): - url = tag if tag.startswith(self.url.TREES) else f"{self.url.TREES}{tag}" - else: - url = tag["trees_url"] - with self._gh._opener.open(self._request(url)) as response: - content: GitHubTreesResponse = json.load(response) - query = (tree["url"] for tree in content["tree"] if tree["path"] == _DATA) - if data_url := next(query, None): - with self._gh._opener.open(self._request(data_url)) as response: - data_dir: GitHubTreesResponse = json.load(response) - return data_dir - else: - raise FileNotFoundError - - def _request(self, url: str, /, *, raw: bool = False) -> Request: - """ - Wrap a request url with a `personal access token`_ - if set as an env var. - - By default the endpoint returns json, specify raw to get blob data. - See `Media types`_. - - .. _personal access token: - https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens - .. 
_Media types: - https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types - """ - headers: MutableMapping[str, str] = {"X-GitHub-Api-Version": self._VERSION} - if tok := os.environ.get(self._ENV_VAR): - headers["Authorization"] = ( - tok if tok.startswith("Bearer ") else f"Bearer {tok}" - ) - if raw: - headers["Accept"] = "application/vnd.github.raw+json" - return urllib.request.Request(url, headers=headers) - - -class _GitHubParseNamespace: - """ - Transform responses into intermediate representations. - - Where relevant: - - Adding cheap to compute metadata - - Dropping information that we don't need for the task - """ - - def __init__(self, gh: GitHub, /) -> None: - self._gh = gh - - @property - def url(self) -> GitHubUrl: - return self._gh.url - - def rate_limit(self, rate_limit: GitHubRateLimitResources, /) -> ParsedRateLimit: - core = rate_limit["core"] - reset = core["reset"] - return ParsedRateLimit( - **core, - reset_time=time.localtime(reset), - is_limited=core["remaining"] == 0, - is_auth=core["limit"] > self._gh.req._UNAUTH_RATE_LIMIT, - ) - - def tag(self, tag: GitHubTag, /) -> ParsedTag: - sha = tag["commit"]["sha"] - return ParsedTag(tag=tag["name"], sha=sha, trees_url=f"{self.url.TREES}{sha}") - - def tags(self, tags: list[GitHubTag], /) -> list[ParsedTag]: - return [self.tag(t) for t in tags] - - def tree(self, tree: GitHubTree, tag: str, /) -> ParsedTree: - """For a single tree (file) convert to an IR with only relevant properties.""" - path = Path(tree["path"]) - return ParsedTree( - file_name=path.name, - dataset_name=path.stem, - suffix=path.suffix, - size=tree["size"], - sha=tree["sha"], - ext_supported=is_ext_supported(path.suffix), - tag=tag, - ) - - def trees(self, tree: GitHubTreesResponse, /, tag: str) -> list[ParsedTree]: - """For a tree response (directory of files) convert to an IR with only relevant properties.""" - return [self.tree(t, tag) for t in tree["tree"]] - - def tag_from_str(self, s: str, /) -> str: - # - Actual tag - # - Trees url (using ref name) - # - npm url (works w/o the `v` prefix) - trees_url = self.url.TREES - npm_url = self._gh._npm_cdn_url - if s.startswith("v"): - return s - elif s.startswith(trees_url): - return s.replace(trees_url, "") - elif s.startswith(npm_url): - s, _ = s.replace(npm_url, "").split("/") - return s if s.startswith("v") else f"v{s}" - else: - raise TypeError(s) - - -class GitHub: - """ - Primary interface with the GitHub API. - - Maintains up-to-date metadata, describing **every** available dataset across **all known** releases. - - - Uses `tags`_, `trees`_, `rate_limit`_ endpoints. - - Organizes distinct groups of operations into property accessor namespaces. - - .. _tags: - https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags - .. _trees: - https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree - .. 
_rate_limit: - https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user - """ - - _opener: ClassVar[OpenerDirector] = urllib.request.build_opener(_ErrorHandler) - - def __init__( - self, - out_dir_tools: Path, - out_dir_altair: Path, - name_tags: str, - name_trees: str, - *, - npm_cdn_url: LiteralString, - base_url: LiteralString = "https://api.github.com/", - org: LiteralString = "vega", - package: LiteralString = "vega-datasets", - ) -> None: - out_dir_tools.mkdir(exist_ok=True) - out_dir_altair.mkdir(exist_ok=True) - self._paths: dict[_PathName, Path] = { - "dir": out_dir_tools, - "tags": out_dir_tools / f"{name_tags}.parquet", - "trees": out_dir_altair / f"{name_trees}.parquet", - } - repo = f"{base_url}repos/{org}/{package}/" - self._url = GitHubUrl( - BASE=base_url, - BLOBS=f"{repo}git/blobs/", - RATE=f"{base_url}rate_limit", - REPO=repo, - TAGS=f"{repo}tags", - TREES=f"{repo}git/trees/", - ) - self._npm_cdn_url: LiteralString = npm_cdn_url - - @property - def req(self) -> _GitHubRequestNamespace: - return _GitHubRequestNamespace(self) - - @property - def parse(self) -> _GitHubParseNamespace: - return _GitHubParseNamespace(self) - - @property - def url(self) -> GitHubUrl: - return self._url - - def rate_limit(self, *, strict: bool = False) -> ParsedRateLimit: - limit = self.parse.rate_limit(self.req.rate_limit()) - if strict and limit["is_limited"]: - warnings.warn( - f"Reached rate limit:\n{limit!r}\n\n" - f"Try setting environment variable {self.req._ENV_VAR!r}", - stacklevel=2, - ) - return limit - - def delay(self, rate_limit: ParsedRateLimit | None = None, /) -> float: - """Return a delay time in seconds, corresponding with authentication status.""" - limit = rate_limit or self.rate_limit(strict=True) - return self.req.delay(is_auth=limit["is_auth"]) - - def tags( - self, - n_head: int | None = None, - *, - npm_tags: pl.DataFrame | pl.LazyFrame | None = None, - warn_lower: bool = False, - ) -> pl.DataFrame: - """ - Get release info, enhance with `SemVer`_ context. - - Parameters - ---------- - n_head - Limit to most recent releases. - npm_tags - Used to remove any github-only releases. - warn_lower - Emit a warning if fewer than ``n_head`` tags were returned. - - .. _SemVer: - https://semver.org/#semantic-versioning-200 - """ - tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) - frame = pl.DataFrame(self.parse.tags(tags)).pipe(semver.with_columns) - if npm_tags is not None: - return frame.lazy().join(npm_tags.lazy().select("tag"), on="tag").collect() - else: - return frame - - def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: - """Retrieve directory info for a given version ``tag``.""" - trees = self.req.trees(tag) - tag_v = self.parse.tag_from_str(tag) if _is_str(tag) else tag["tag"] - parsed = self.parse.trees(trees, tag=tag_v) - url = pl.concat_str( - pl.lit(self._npm_cdn_url), - col("tag"), - pl.lit(f"/{_DATA}/"), - col("file_name"), - ) - df = ( - pl.LazyFrame(parsed) - .with_columns( - name_collision=col("dataset_name").is_duplicated(), url_npm=url - ) - .collect() - ) - return df.select(*sorted(df.columns)) - - def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: - """ - Use known tags to discover and update missing trees metadata. - - Aims to stay well-within API rate limits, both for authenticated and unauthenticated users. - - Notes - ----- - Internally handles regenerating the ``tag`` enum. 
- """ - if gh_tags.is_empty(): - msg = f"Expected rows present in `gh_tags`, but got:\n{gh_tags!r}" - raise NotImplementedError(msg) - rate_limit = self.rate_limit(strict=True) - stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT - fp = self._paths["trees"] - if not fp.exists(): - print(f"Initializing {fp!s}") - result = self._trees_batched(_iter_rows(gh_tags, stop, SemVerTag)) - else: - trees = ( - pl.scan_parquet(fp).with_columns(col("tag").cast(pl.String)).collect() - ) - missing_trees = gh_tags.join( - trees.select(col("tag").unique()), on="tag", how="anti" - ) - if missing_trees.is_empty(): - print(f"Already up-to-date {fp!s}") - result = trees - else: - fresh = self._trees_batched(_iter_rows(missing_trees, stop, SemVerTag)) - result = pl.concat((trees, fresh)) - return ( - result.lazy() - .with_columns(col("tag").cast(semver.tag_enum(gh_tags))) - .sort("tag", descending=True) - .collect() - ) - - def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: - limit = self.rate_limit(strict=True) - npm_tag_only = npm_tags.lazy().select("tag") - fp = self._paths["tags"] - if not limit["is_auth"] and limit["remaining"] <= self.req._TAGS_COST: - return pl.scan_parquet(fp).join(npm_tag_only, on="tag").collect() - elif not fp.exists(): - print(f"Initializing {fp!s}") - tags = self.tags(npm_tags=npm_tag_only) - print(f"Collected {tags.height} new tags") - return tags - else: - print("Checking for new tags") - prev = pl.scan_parquet(fp) - latest = self.tags(1, npm_tags=npm_tag_only) - if latest.equals(prev.pipe(semver.sort).head(1).collect()): - print(f"Already up-to-date {fp!s}") - return prev.collect() - print(f"Refreshing {fp!s}") - prev_eager = prev.collect() - tags = ( - pl.concat((self.tags(npm_tags=npm_tag_only), prev_eager)) - .unique("sha") - .pipe(semver.sort) - ) - print(f"Collected {tags.height - prev_eager.height} new tags") - return tags - - def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: - rate_limit = self.rate_limit(strict=True) - if not isinstance(tags, Sequence): - tags = tuple(tags) - req = self.req - n = len(tags) - cost = req._TREES_COST * n - if rate_limit["remaining"] < cost: - raise NotImplementedError(rate_limit, cost) - print( - f"Collecting metadata for {n} missing releases.\n" - f"Using {self.delay(rate_limit):.2f}[ms] between requests ..." - ) - dfs: list[pl.DataFrame] = [] - for tag in tags: - time.sleep(self.delay(rate_limit)) - dfs.append(self.trees(tag)) - df = pl.concat(dfs) - print(f"Finished collection.\nFound {df.height} new rows") - return df - - -def _iter_rows(df: pl.DataFrame, stop: int | None, /, tp: type[_TD]) -> Iterator[_TD]: - """ - Wraps `pl.DataFrame.iter_rows`_ with typing to preserve key completions. - - Parameters - ---------- - df - Target dataframe. - stop - Passed to `itertools.islice`_. - tp - Static type representing a row/record. - - .. note:: - Performs a **very basic** runtime check on the type of ``tp`` (*not* ``df``). - - Primarily used to override ``dict[str, Any]`` when a *narrower* type is known. - - .. _itertools.islice: - https://docs.python.org/3/library/itertools.html#itertools.islice - .. 
_pl.DataFrame.iter_rows: - https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.iter_rows.html - """ - if not TYPE_CHECKING: - assert is_typeddict(tp) or issubclass(tp, Mapping) - - return cast("Iterator[_TD]", islice(df.iter_rows(named=True), stop)) diff --git a/tools/datasets/models.py b/tools/datasets/models.py index e2036b4ea..21d98050e 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -4,7 +4,7 @@ import sys from collections.abc import Mapping, Sequence -from typing import TYPE_CHECKING, Any, Literal, NamedTuple +from typing import TYPE_CHECKING, Literal, NamedTuple if sys.version_info >= (3, 14): from typing import TypedDict @@ -12,8 +12,6 @@ from typing_extensions import TypedDict if TYPE_CHECKING: - import time - if sys.version_info >= (3, 11): from typing import LiteralString, NotRequired, Required else: @@ -26,171 +24,12 @@ from altair.datasets._typing import Dataset, FlFieldStr -Map: TypeAlias = Mapping[str, Any] - - -class GitHubUrl(NamedTuple): - BASE: LiteralString - BLOBS: LiteralString - RATE: LiteralString - REPO: LiteralString - TAGS: LiteralString - TREES: LiteralString - class NpmUrl(NamedTuple): CDN: LiteralString - TAGS: LiteralString GH: LiteralString -class GitHubTag(TypedDict): - """ - A single release's metadata within the response of `List repository tags`_. - - .. _List repository tags: - https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags. - """ - - name: str - node_id: str - commit: dict[Literal["sha", "url"], str] - zipball_url: str - tarball_url: str - - -class ParsedTag(TypedDict): - tag: str - sha: str - trees_url: str - - -class SemVerTag(ParsedTag): - """ - Extends ``ParsedTag`` with `semantic versioning`_. - - These values are extracted via: - - tools.datasets.with_columns - - Describes a row in the dataframe returned by: - - tools.datasets.GitHub.tags - - .. _semantic versioning: - https://semver.org/ - """ - - major: int - minor: int - patch: int - pre_release: int | None - is_pre_release: bool - - -class GitHubTree(TypedDict): - """ - A single file's metadata within the response of `Get a tree`_. - - .. _Get a tree: - https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree - """ - - path: str - mode: str - type: str - sha: str - size: int - url: str - - -class GitHubTreesResponse(TypedDict): - """ - Response from `Get a tree`_. - - Describes directory metadata, with files stored in ``"tree"``. - - .. _Get a tree: - https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree - """ - - sha: str - url: str - tree: list[GitHubTree] - truncated: bool - - -class NpmVersion(TypedDict): - version: str - links: dict[Literal["self", "entrypoints", "stats"], str] - - -class NpmPackageMetadataResponse(TypedDict): - """ - Response from `Get package metadata`_. - - Using: - - headers={"Accept": "application/json"} - - .. _Get package metadata: - https://data.jsdelivr.com/v1/packages/npm/vega-datasets - """ - - type: str - name: str - tags: dict[Literal["canary", "next", "latest"], str] - versions: list[NpmVersion] - links: dict[Literal["stats"], str] - - -class ParsedTree(TypedDict): - file_name: str - dataset_name: str - suffix: str - size: int - sha: str - ext_supported: bool - tag: str - - -class GitHubRateLimit(TypedDict): - """ - An individual item in `Get rate limit status for the authenticated user`_. - - All categories share this schema. - - .. 
_Get rate limit status for the authenticated user: - https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user - """ - - limit: int - used: int - remaining: int - reset: int - - -class ParsedRateLimit(GitHubRateLimit): - reset_time: time.struct_time - is_limited: bool - is_auth: bool - - -class GitHubRateLimitResources(TypedDict, total=False): - """ - A subset of response from `Get rate limit status for the authenticated user`_. - - .. _Get rate limit status for the authenticated user: - https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user - """ - - core: Required[GitHubRateLimit] - search: Required[GitHubRateLimit] - graphql: GitHubRateLimit - integration_manifest: GitHubRateLimit - code_search: GitHubRateLimit - - ##################################################### # frictionless datapackage ##################################################### diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index 99d5fe5b0..7f61323c4 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -6,9 +6,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Literal -import polars as pl - -from tools.datasets import datapackage, semver +from tools.datasets import datapackage from tools.datasets.models import NpmUrl if TYPE_CHECKING: @@ -23,14 +21,9 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias - from altair.datasets._typing import Version - from tools.datasets.models import ( - FlPackage, - NpmPackageMetadataResponse, - ParsedPackage, - ) + from tools.datasets.models import FlPackage, ParsedPackage - BranchOrTag: TypeAlias = 'Literal["main"] | Version | LiteralString' # noqa: TC008 + BranchOrTag: TypeAlias = 'Literal["main"] | LiteralString' # noqa: TC008 __all__ = ["Npm"] @@ -44,21 +37,17 @@ class Npm: def __init__( self, output_dir: Path, - name_tags: str, *, jsdelivr: Literal["jsdelivr"] = "jsdelivr", npm: Literal["npm"] = "npm", package: LiteralString = "vega-datasets", - jsdelivr_version: LiteralString = "v1", ) -> None: output_dir.mkdir(exist_ok=True) - self._paths: dict[Literal["tags", "datapackage"], Path] = { - "tags": output_dir / f"{name_tags}.parquet", + self._paths: dict[Literal["datapackage"], Path] = { "datapackage": output_dir / "datapackage.json", } self._url: NpmUrl = NpmUrl( CDN=f"https://cdn.{jsdelivr}.net/{npm}/{package}@", - TAGS=f"https://data.{jsdelivr}.com/{jsdelivr_version}/packages/{npm}/{package}", GH=f"https://cdn.{jsdelivr}.net/gh/vega/{package}@", ) @@ -79,33 +68,6 @@ def dataset_base_url(self, version: BranchOrTag, /) -> LiteralString: def url(self) -> NpmUrl: return self._url - def tags(self) -> pl.DataFrame: - """ - Request, parse tags from `Get package metadata`_. - - Notes - ----- - - Ignores canary releases - - ``npm`` can accept either, but this endpoint returns without "v": - - {tag} - v{tag} - - .. 
_Get package metadata: - https://www.jsdelivr.com/docs/data.jsdelivr.com#get-/v1/packages/npm/-package- - """ - req = urllib.request.Request( - self.url.TAGS, headers={"Accept": "application/json"} - ) - with self._opener.open(req) as response: - content: NpmPackageMetadataResponse = json.load(response) - versions = [ - f"v{tag}" - for v in content["versions"] - if (tag := v["version"]) and semver.CANARY not in tag - ] - return pl.DataFrame({"tag": versions}).pipe(semver.with_columns) - def file_gh( self, branch_or_tag: BranchOrTag, diff --git a/tools/datasets/semver.py b/tools/datasets/semver.py deleted file mode 100644 index 788bbb2a2..000000000 --- a/tools/datasets/semver.py +++ /dev/null @@ -1,76 +0,0 @@ -""" -Parsing/transforming `semantic versioning`_ strings. - -.. _semantic versioning: - https://semver.org/ -""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, Literal - -import polars as pl -from polars import col - -if TYPE_CHECKING: - from typing import TypeVar - - _Frame = TypeVar("_Frame", pl.DataFrame, pl.LazyFrame) - -__all__ = ["CANARY", "sort", "with_columns"] - -_SEM_VER_FIELDS: tuple[ - Literal["major"], Literal["minor"], Literal["patch"], Literal["pre_release"] -] = "major", "minor", "patch", "pre_release" -CANARY: Literal["--canary"] = "--canary" - - -def with_columns(frame: _Frame, /, *, tag: str = "tag") -> _Frame: - """ - Extracts components of a `SemVer`_ string into sortable columns. - - .. _SemVer: - https://semver.org/#backusnaur-form-grammar-for-valid-semver-versions - """ - fields = col(_SEM_VER_FIELDS) - pattern = r"""(?x) - v?(?[[:digit:]]*)\. - (?[[:digit:]]*)\. - (?[[:digit:]]*) - (\-(next)?(beta)?\.)? - (?[[:digit:]]*)? - """ - sem_ver = col(tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) - ldf = ( - frame.lazy() - .with_columns(sem_ver) - .with_columns(pl.when(fields.str.len_chars() > 0).then(fields).cast(pl.Int64)) - .with_columns(is_pre_release=col("pre_release").is_not_null()) - ) - if isinstance(frame, pl.DataFrame): - return ldf.collect() - else: - return ldf - - -def tag_enum(frame: _Frame, /, *, tag: str = "tag") -> pl.Enum: - """Extract an **ascending** order ``pl.Enum`` from ``tag``.""" - return pl.Enum( - frame.lazy().pipe(sort, descending=False).select(tag).collect().get_column(tag) - ) - - -def sort(frame: _Frame, /, descending: bool = True) -> _Frame: - """ - Sort ``frame``, displaying in release order. - - Parameters - ---------- - descending - By default, **most recent** is first. - - Notes - ----- - Ensures pre release versions maintain order, always appearing before actual releases. 
- """ - return frame.sort(_SEM_VER_FIELDS, descending=descending, nulls_last=not descending) From d297d7ea1a8cc7cfa451f61356d7f4f4466062b4 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 14 Jan 2025 15:24:58 +0000 Subject: [PATCH 152/201] docs: Update `Metadata` example --- altair/datasets/_typing.py | 59 ++++++++++++++++++------------------- tools/datasets/__init__.py | 60 ++++++++++++++++++-------------------- 2 files changed, 57 insertions(+), 62 deletions(-) diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index 87d1ac366..c6daba45e 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -162,38 +162,35 @@ class Metadata(TypedDict, total=False): -------- ``Metadata`` keywords form constraints to filter a table like the below sample: - ### FIXME: NEEDS UPDATING TO DATAPACKAGE VERSION - ``` - shape: (2_879, 9) - ┌───────────┬──────────┬──────────┬──────────┬───┬────────┬─────────┬──────────┐ - │ dataset_n ┆ ext_supp ┆ file_nam ┆ name_col ┆ … ┆ suffix ┆ tag ┆ url_npm │ - │ a… ┆ or… ┆ e ┆ li… ┆ ┆ --- ┆ --- ┆ --- │ - │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ str ┆ enum ┆ str │ - │ str ┆ bool ┆ str ┆ bool ┆ ┆ ┆ ┆ │ - ╞═══════════╪══════════╪══════════╪══════════╪═══╪════════╪═════════╪══════════╡ - │ cars ┆ true ┆ cars.jso ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ - │ ┆ ┆ n ┆ ┆ ┆ ┆ ┆ cd… │ - │ flights-2 ┆ true ┆ flights- ┆ true ┆ … ┆ .arrow ┆ v1.31.1 ┆ https:// │ - │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ - │ flights-2 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v2.9.0 ┆ https:// │ - │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ - │ unemploym ┆ true ┆ unemploy ┆ false ┆ … ┆ .json ┆ v2.7.0 ┆ https:// │ - │ e… ┆ ┆ me… ┆ ┆ ┆ ┆ ┆ cd… │ - │ ffox ┆ false ┆ ffox.png ┆ false ┆ … ┆ .png ┆ v2.5.2 ┆ https:// │ - │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ cd… │ - │ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │ - │ flights-a ┆ true ┆ flights- ┆ false ┆ … ┆ .csv ┆ v1.18.0 ┆ https:// │ - │ i… ┆ ┆ ai… ┆ ┆ ┆ ┆ ┆ cd… │ - │ income ┆ true ┆ income.j ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ - │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ - │ burtin ┆ true ┆ burtin.j ┆ false ┆ … ┆ .json ┆ v2.8.0 ┆ https:// │ - │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ - │ flights-5 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v1.8.0 ┆ https:// │ - │ k ┆ ┆ 5k… ┆ ┆ ┆ ┆ ┆ cd… │ - │ wheat ┆ true ┆ wheat.js ┆ false ┆ … ┆ .json ┆ v1.18.0 ┆ https:// │ - │ ┆ ┆ on ┆ ┆ ┆ ┆ ┆ cd… │ - └───────────┴──────────┴──────────┴──────────┴───┴────────┴─────────┴──────────┘ + shape: (73, 13) + ┌────────────────┬────────┬────────────────┬───┬───────────────┬───────────────┐ + │ dataset_name ┆ suffix ┆ file_name ┆ … ┆ sha ┆ url │ + │ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ ┆ str ┆ str │ + ╞════════════════╪════════╪════════════════╪═══╪═══════════════╪═══════════════╡ + │ 7zip ┆ .png ┆ 7zip.png ┆ … ┆ 6586d6c00887c ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ d48850099c17… ┆ sdelivr.net/… │ + │ airports ┆ .csv ┆ airports.csv ┆ … ┆ 608ba6d51fa70 ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ 584c3fa1d31e… ┆ sdelivr.net/… │ + │ annual-precip ┆ .json ┆ annual-precip. 
┆ … ┆ 719e73406cfc0 ┆ https://cdn.j │ + │ ┆ ┆ json ┆ ┆ 8f16dda65151… ┆ sdelivr.net/… │ + │ anscombe ┆ .json ┆ anscombe.json ┆ … ┆ 11ae97090b626 ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ 3bdf0c866115… ┆ sdelivr.net/… │ + │ barley ┆ .json ┆ barley.json ┆ … ┆ 8dc50de2509b6 ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ e197ce95c24c… ┆ sdelivr.net/… │ + │ … ┆ … ┆ … ┆ … ┆ … ┆ … │ + │ weekly-weather ┆ .json ┆ weekly-weather ┆ … ┆ bd42a3e2403e7 ┆ https://cdn.j │ + │ ┆ ┆ .json ┆ ┆ ccd6baaa89f9… ┆ sdelivr.net/… │ + │ wheat ┆ .json ┆ wheat.json ┆ … ┆ cde46b43fc82f ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ 4c3c2a37ddcf… ┆ sdelivr.net/… │ + │ windvectors ┆ .csv ┆ windvectors.cs ┆ … ┆ ed686b0ba613a ┆ https://cdn.j │ + │ ┆ ┆ v ┆ ┆ bd59d09fcd94… ┆ sdelivr.net/… │ + │ world-110m ┆ .json ┆ world-110m.jso ┆ … ┆ a1ce852de6f27 ┆ https://cdn.j │ + │ ┆ ┆ n ┆ ┆ 13c94c0c2840… ┆ sdelivr.net/… │ + │ zipcodes ┆ .csv ┆ zipcodes.csv ┆ … ┆ d3df33e12be0d ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ 0544c95f1bd4… ┆ sdelivr.net/… │ + └────────────────┴────────┴────────────────┴───┴───────────────┴───────────────┘ ``` """ diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 131e15bac..26dc8439b 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -239,43 +239,41 @@ def generate_typing(self) -> None: ) import textwrap + # NOTE: Uses `pl.Config(fmt_str_lengths=25, tbl_cols=5, tbl_width_chars=80)` examples = f"""\ Examples -------- ``{METADATA_TD}`` keywords form constraints to filter a table like the below sample: - ### FIXME: NEEDS UPDATING TO DATAPACKAGE VERSION - ``` - shape: (2_879, 9) - ┌───────────┬──────────┬──────────┬──────────┬───┬────────┬─────────┬──────────┐ - │ dataset_n ┆ ext_supp ┆ file_nam ┆ name_col ┆ … ┆ suffix ┆ tag ┆ url_npm │ - │ a… ┆ or… ┆ e ┆ li… ┆ ┆ --- ┆ --- ┆ --- │ - │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ str ┆ enum ┆ str │ - │ str ┆ bool ┆ str ┆ bool ┆ ┆ ┆ ┆ │ - ╞═══════════╪══════════╪══════════╪══════════╪═══╪════════╪═════════╪══════════╡ - │ cars ┆ true ┆ cars.jso ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ - │ ┆ ┆ n ┆ ┆ ┆ ┆ ┆ cd… │ - │ flights-2 ┆ true ┆ flights- ┆ true ┆ … ┆ .arrow ┆ v1.31.1 ┆ https:// │ - │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ - │ flights-2 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v2.9.0 ┆ https:// │ - │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ - │ unemploym ┆ true ┆ unemploy ┆ false ┆ … ┆ .json ┆ v2.7.0 ┆ https:// │ - │ e… ┆ ┆ me… ┆ ┆ ┆ ┆ ┆ cd… │ - │ ffox ┆ false ┆ ffox.png ┆ false ┆ … ┆ .png ┆ v2.5.2 ┆ https:// │ - │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ cd… │ - │ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │ - │ flights-a ┆ true ┆ flights- ┆ false ┆ … ┆ .csv ┆ v1.18.0 ┆ https:// │ - │ i… ┆ ┆ ai… ┆ ┆ ┆ ┆ ┆ cd… │ - │ income ┆ true ┆ income.j ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ - │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ - │ burtin ┆ true ┆ burtin.j ┆ false ┆ … ┆ .json ┆ v2.8.0 ┆ https:// │ - │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ - │ flights-5 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v1.8.0 ┆ https:// │ - │ k ┆ ┆ 5k… ┆ ┆ ┆ ┆ ┆ cd… │ - │ wheat ┆ true ┆ wheat.js ┆ false ┆ … ┆ .json ┆ v1.18.0 ┆ https:// │ - │ ┆ ┆ on ┆ ┆ ┆ ┆ ┆ cd… │ - └───────────┴──────────┴──────────┴──────────┴───┴────────┴─────────┴──────────┘ + shape: (73, 13) + ┌────────────────┬────────┬────────────────┬───┬───────────────┬───────────────┐ + │ dataset_name ┆ suffix ┆ file_name ┆ … ┆ sha ┆ url │ + │ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ ┆ str ┆ str │ + ╞════════════════╪════════╪════════════════╪═══╪═══════════════╪═══════════════╡ + │ 7zip ┆ .png ┆ 7zip.png ┆ … ┆ 6586d6c00887c ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ d48850099c17… ┆ sdelivr.net/… │ + │ airports ┆ .csv ┆ airports.csv ┆ … ┆ 608ba6d51fa70 ┆ 
https://cdn.j │ + │ ┆ ┆ ┆ ┆ 584c3fa1d31e… ┆ sdelivr.net/… │ + │ annual-precip ┆ .json ┆ annual-precip. ┆ … ┆ 719e73406cfc0 ┆ https://cdn.j │ + │ ┆ ┆ json ┆ ┆ 8f16dda65151… ┆ sdelivr.net/… │ + │ anscombe ┆ .json ┆ anscombe.json ┆ … ┆ 11ae97090b626 ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ 3bdf0c866115… ┆ sdelivr.net/… │ + │ barley ┆ .json ┆ barley.json ┆ … ┆ 8dc50de2509b6 ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ e197ce95c24c… ┆ sdelivr.net/… │ + │ … ┆ … ┆ … ┆ … ┆ … ┆ … │ + │ weekly-weather ┆ .json ┆ weekly-weather ┆ … ┆ bd42a3e2403e7 ┆ https://cdn.j │ + │ ┆ ┆ .json ┆ ┆ ccd6baaa89f9… ┆ sdelivr.net/… │ + │ wheat ┆ .json ┆ wheat.json ┆ … ┆ cde46b43fc82f ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ 4c3c2a37ddcf… ┆ sdelivr.net/… │ + │ windvectors ┆ .csv ┆ windvectors.cs ┆ … ┆ ed686b0ba613a ┆ https://cdn.j │ + │ ┆ ┆ v ┆ ┆ bd59d09fcd94… ┆ sdelivr.net/… │ + │ world-110m ┆ .json ┆ world-110m.jso ┆ … ┆ a1ce852de6f27 ┆ https://cdn.j │ + │ ┆ ┆ n ┆ ┆ 13c94c0c2840… ┆ sdelivr.net/… │ + │ zipcodes ┆ .csv ┆ zipcodes.csv ┆ … ┆ d3df33e12be0d ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ 0544c95f1bd4… ┆ sdelivr.net/… │ + └────────────────┴────────┴────────────────┴───┴───────────────┴───────────────┘ ``` """ From 64b80ff6f707cd42d0583291dfe5a23b188f1579 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 14 Jan 2025 16:31:28 +0000 Subject: [PATCH 153/201] docs: Add missing descriptions to `Metadata` --- altair/datasets/_typing.py | 14 +++++++++----- tools/datasets/__init__.py | 7 +++++++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index c6daba45e..958db2300 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -127,17 +127,17 @@ class Metadata(TypedDict, total=False): bytes File size in *bytes*. is_image - _description_ + Only accessible via url. is_tabular Can be read as tabular data. is_geo - _description_ + `GeoJSON`_ format. is_topo - _description_ + `TopoJSON`_ format. is_spatial - _description_ + Any geospatial format. Only natively supported by ``polars``. is_json - _description_ + Not supported natively by ``pyarrow``. has_schema Data types available for improved ``pandas`` parsing. sha @@ -156,6 +156,10 @@ class Metadata(TypedDict, total=False): https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.name .. _Path.suffix: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. _GeoJSON: + https://en.wikipedia.org/wiki/GeoJSON + .. _TopoJSON: + https://en.wikipedia.org/wiki/GeoJSON#TopoJSON Examples diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 26dc8439b..7350ede7f 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -236,6 +236,8 @@ def generate_typing(self) -> None: f".. _Path.stem:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem\n" f".. _Path.name:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.name\n" f".. _Path.suffix:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix\n" + f".. _GeoJSON:\n{indent * 2}https://en.wikipedia.org/wiki/GeoJSON\n" + f".. 
_TopoJSON:\n{indent * 2}https://en.wikipedia.org/wiki/GeoJSON#TopoJSON\n" ) import textwrap @@ -283,6 +285,11 @@ def generate_typing(self) -> None: "file_name": "Equivalent to `Path.name`_.", "bytes": "File size in *bytes*.", "is_tabular": "Can be read as tabular data.", + "is_image": "Only accessible via url.", + "is_geo": "`GeoJSON`_ format.", + "is_topo": "`TopoJSON`_ format.", + "is_spatial": "Any geospatial format. Only natively supported by ``polars``.", + "is_json": "Not supported natively by ``pyarrow``.", "has_schema": "Data types available for improved ``pandas`` parsing.", "sha": sha, "url": "Remote url used to access dataset.", From a0f75852b4d7d88a1894f883b7f8cef9e368b917 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 14 Jan 2025 16:48:44 +0000 Subject: [PATCH 154/201] refactor: Renaming/reorganize in `tools/` Mainly removing `Fl` prefix, as there is no confusion now `models.py` is purely `frictionless` structures --- tools/datasets/datapackage.py | 8 +++--- tools/datasets/models.py | 48 ++++++++++++++--------------------- tools/datasets/npm.py | 13 ++++++---- 3 files changed, 31 insertions(+), 38 deletions(-) diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index 9945bd07d..5272170c2 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -22,7 +22,7 @@ from collections.abc import Iterable, Iterator, Mapping, Sequence from altair.datasets._typing import Dataset, FlFieldStr - from tools.datasets.models import FlPackage + from tools.datasets.models import Package __all__ = ["parse_package"] @@ -42,13 +42,13 @@ ) -def parse_package(pkg: FlPackage, base_url: str, /) -> ParsedPackage: +def parse_package(pkg: Package, base_url: str, /) -> ParsedPackage: return ParsedPackage( features=extract_features(pkg, base_url), schemas=extract_schemas(pkg) ) -def extract_schemas(pkg: FlPackage, /) -> Mapping[Dataset, Mapping[str, FlFieldStr]]: +def extract_schemas(pkg: Package, /) -> Mapping[Dataset, Mapping[str, FlFieldStr]]: """Reduce all datasets with schemas to a minimal mapping.""" m: Any = { Path(rsrc["path"]).stem: {f["name"]: f["type"] for f in s["fields"]} @@ -58,7 +58,7 @@ def extract_schemas(pkg: FlPackage, /) -> Mapping[Dataset, Mapping[str, FlFieldS return m -def extract_features(pkg: FlPackage, base_url: str, /) -> pl.DataFrame: +def extract_features(pkg: Package, base_url: str, /) -> pl.DataFrame: EXCLUDE = ( "name", "type", diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 21d98050e..f88a0b842 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -4,7 +4,7 @@ import sys from collections.abc import Mapping, Sequence -from typing import TYPE_CHECKING, Literal, NamedTuple +from typing import TYPE_CHECKING, Literal if sys.version_info >= (3, 14): from typing import TypedDict @@ -13,9 +13,9 @@ if TYPE_CHECKING: if sys.version_info >= (3, 11): - from typing import LiteralString, NotRequired, Required + from typing import NotRequired, Required else: - from typing_extensions import LiteralString, NotRequired, Required + from typing_extensions import NotRequired, Required if sys.version_info >= (3, 10): from typing import TypeAlias else: @@ -25,25 +25,15 @@ from altair.datasets._typing import Dataset, FlFieldStr -class NpmUrl(NamedTuple): - CDN: LiteralString - GH: LiteralString - - -##################################################### -# frictionless datapackage -##################################################### - - -FlCsvDialect: 
TypeAlias = Mapping[ +CsvDialect: TypeAlias = Mapping[ Literal["csv"], Mapping[Literal["delimiter"], Literal["\t"]] ] -FlJsonDialect: TypeAlias = Mapping[ +JsonDialect: TypeAlias = Mapping[ Literal[r"json"], Mapping[Literal["keyed"], Literal[True]] ] -class FlField(TypedDict): +class Field(TypedDict): """https://datapackage.org/standard/table-schema/#field.""" name: str @@ -51,33 +41,33 @@ class FlField(TypedDict): description: NotRequired[str] -class FlSchema(TypedDict): +class Schema(TypedDict): """https://datapackage.org/standard/table-schema/#properties.""" - fields: Sequence[FlField] + fields: Sequence[Field] -class FlSource(TypedDict, total=False): +class Source(TypedDict, total=False): title: str path: Required[str] email: str version: str -class FlLicense(TypedDict): +class License(TypedDict): name: str path: str title: NotRequired[str] -class FlResource(TypedDict): +class Resource(TypedDict): """https://datapackage.org/standard/data-resource/#properties.""" name: Dataset type: Literal["table", "file", r"json"] description: NotRequired[str] - licenses: NotRequired[Sequence[FlLicense]] - sources: NotRequired[Sequence[FlSource]] + licenses: NotRequired[Sequence[License]] + sources: NotRequired[Sequence[Source]] path: str scheme: Literal["file"] format: Literal[ @@ -96,8 +86,8 @@ class FlResource(TypedDict): encoding: NotRequired[Literal["utf-8"]] hash: str bytes: int - dialect: NotRequired[FlCsvDialect | FlJsonDialect] - schema: NotRequired[FlSchema] + dialect: NotRequired[CsvDialect | JsonDialect] + schema: NotRequired[Schema] class Contributor(TypedDict, total=False): @@ -110,7 +100,7 @@ class Contributor(TypedDict, total=False): organization: str -class FlPackage(TypedDict): +class Package(TypedDict): """ A subset of the `Data Package`_ standard. 
@@ -122,11 +112,11 @@ class FlPackage(TypedDict): version: str homepage: str description: str - licenses: Sequence[FlLicense] + licenses: Sequence[License] contributors: Sequence[Contributor] - sources: Sequence[FlSource] + sources: Sequence[Source] created: str - resources: Sequence[FlResource] + resources: Sequence[Resource] class ParsedPackage(TypedDict): diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index 7f61323c4..ea38eb971 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -4,10 +4,9 @@ import string import urllib.request from pathlib import Path -from typing import TYPE_CHECKING, Any, ClassVar, Literal +from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple from tools.datasets import datapackage -from tools.datasets.models import NpmUrl if TYPE_CHECKING: import sys @@ -21,7 +20,7 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias - from tools.datasets.models import FlPackage, ParsedPackage + from tools.datasets.models import Package, ParsedPackage BranchOrTag: TypeAlias = 'Literal["main"] | LiteralString' # noqa: TC008 @@ -29,6 +28,11 @@ __all__ = ["Npm"] +class NpmUrl(NamedTuple): + CDN: LiteralString + GH: LiteralString + + class Npm: """https://www.jsdelivr.com/docs/data.jsdelivr.com#overview.""" @@ -60,7 +64,6 @@ def dataset_base_url(self, version: BranchOrTag, /) -> LiteralString: - Encodes the endpoint at this stage - Use github if its the only option (since its slower otherwise) - npm only has releases/tags (not branches) - - So the column can be renamed ``"url_npm"`` -> ``"url"`` """ return f"{self.url.GH if is_branch(version) else self.url.CDN}{version}/data/" @@ -105,7 +108,7 @@ def file_gh( return read_fn(response) def datapackage(self, *, tag: LiteralString, frozen: bool = False) -> ParsedPackage: - pkg: FlPackage = ( + pkg: Package = ( json.loads(self._paths["datapackage"].read_text("utf-8")) if frozen else self.file_gh(tag, "datapackage.json") From 0df79b0a4baa4cce76516315cd4a91f13221ff1c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 14 Jan 2025 17:27:34 +0000 Subject: [PATCH 155/201] test: Skip `is_image` datasets --- tests/test_datasets.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 95a6fb0ad..923fb9fbc 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -617,18 +617,7 @@ def _dataset_params(*, skip: Container[str] = ()) -> Iterator[ParameterSet]: @datasets_debug @pytest.mark.parametrize( ("name", "suffix"), - list( - _dataset_params( - skip=( - "climate", - "graticule", - "sf-temps", - "iris", - "weball26", - "seattle-temps", - ) - ) - ), + list(_dataset_params(skip=("7zip", "ffox", "gimp"))), ) def test_all_datasets( polars_loader: Loader[pl.DataFrame, pl.LazyFrame], From ee0d381b4a4d37c2e436b8a73cd95f1b1a5f6f97 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 14 Jan 2025 18:08:19 +0000 Subject: [PATCH 156/201] refactor: Make caching **opt-out**, use `$XDG_CACHE_HOME` Caching is the more sensible default when considering a notebook environment Using a standardised path now also https://specifications.freedesktop.org/basedir-spec/latest/#variables --- altair/datasets/_cache.py | 24 +++++++++++++++++------- altair/datasets/_loader.py | 12 ++++++++---- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 
3e4beb82d..fdc8c3db8 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -217,9 +217,12 @@ def schema_cast(self, name: _Dataset, /) -> Iterator[nw.Expr]: class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): - """Optional caching of remote dataset requests.""" + """Opt-out caching of remote dataset requests.""" _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" + _XDG_CACHE: ClassVar[Path] = ( + Path(os.environ.get("XDG_CACHE_HOME", Path.home() / ".cache")) / "altair" + ).resolve() def __init__(self, reader: _Reader[IntoDataFrameT, IntoFrameT], /) -> None: self._rd: _Reader[IntoDataFrameT, IntoFrameT] = reader @@ -273,9 +276,13 @@ def path(self) -> Path: """ Returns path to datasets cache. - By default, this can be configured using the environment variable: + Defaults to (`XDG_CACHE_HOME`_): - "ALTAIR_DATASETS_DIR" + "$XDG_CACHE_HOME/altair/" + + But can be configured using the environment variable: + + "$ALTAIR_DATASETS_DIR" You can set this for the current session via: @@ -289,10 +296,13 @@ def path(self) -> Path: You can *later* disable caching via: >>> load.cache.path = None + + .. _XDG_CACHE_HOME: + https://specifications.freedesktop.org/basedir-spec/latest/#variables """ self._ensure_active() - fp = Path(os.environ[self._ENV_VAR]) - fp.mkdir(exist_ok=True) + fp = Path(usr) if (usr := os.environ.get(self._ENV_VAR)) else self._XDG_CACHE + fp.mkdir(parents=True, exist_ok=True) return fp @path.setter @@ -300,7 +310,7 @@ def path(self, source: StrPath | None, /) -> None: if source is not None: os.environ[self._ENV_VAR] = str(Path(source).resolve()) else: - os.environ.pop(self._ENV_VAR, None) + os.environ[self._ENV_VAR] = "" def __iter__(self) -> Iterator[Path]: yield from self.path.iterdir() @@ -316,7 +326,7 @@ def is_active(self) -> bool: return not self.is_not_active() def is_not_active(self) -> bool: - return os.environ.get(self._ENV_VAR) is None + return os.environ.get(self._ENV_VAR) == "" def is_empty(self) -> bool: """Cache is active, but no files are stored in ``self.path``.""" diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 63bd5f3f7..ef1cf46d3 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -29,7 +29,7 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): """ - Load examples **remotely** from `vega-datasets`_, with *optional* caching. + Load examples **remotely** from `vega-datasets`_, with caching. A new ``Loader`` must be initialized by specifying a backend: @@ -280,11 +280,11 @@ def url( @property def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: """ - Optional caching of remote dataset requests. + Caching of remote dataset requests. - Enable caching: + Configure cache path: - self.cache.path = ... + self.cache.path = "..." 
Download the latest datasets *ahead-of-time*: @@ -293,6 +293,10 @@ def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: Remove all downloaded datasets: self.cache.clear() + + Disable caching: + + self.cache.path = None """ return self._reader.cache From 138ede601ef35f136e3e54c34e3cf001c2679c6b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 15 Jan 2025 12:45:12 +0000 Subject: [PATCH 157/201] refactor(typing): Add `_iter_results` helper --- altair/datasets/_cache.py | 17 +++++++++++++++-- altair/datasets/_readers.py | 6 +++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index fdc8c3db8..c3ca65848 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -3,7 +3,7 @@ import os import sys from pathlib import Path -from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, get_args +from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, cast, get_args import narwhals.stable.v1 as nw from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT @@ -21,6 +21,8 @@ from _typeshed import StrPath from narwhals.stable.v1.dtypes import DType + from altair.datasets._typing import Metadata + if sys.version_info >= (3, 11): from typing import LiteralString else: @@ -75,6 +77,17 @@ } +def _iter_results(df: nw.DataFrame[Any], /) -> Iterator[Metadata]: + """ + Yield rows from ``df``, where each represents a dataset. + + See Also + -------- + ``altair.datasets._typing.Metadata`` + """ + yield from cast("Iterator[Metadata]", df.iter_rows(named=True)) + + class CompressedCache(Protocol[_KT, _VT]): fp: Path _mapping: MutableMapping[_KT, _VT] @@ -263,7 +276,7 @@ def download_all(self) -> None: print("Already downloaded all datasets") return None print(f"Downloading {len(frame)} missing datasets...") - for row in frame.iter_rows(named=True): + for row in _iter_results(frame): fp: Path = self.path / (row["sha"] + row["suffix"]) with self._rd._opener.open(row["url"]) as f: fp.touch() diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index d0094f5ff..330f85642 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -31,7 +31,7 @@ import narwhals.stable.v1 as nw from narwhals.stable.v1.typing import IntoDataFrameT, IntoExpr, IntoFrameT -from altair.datasets._cache import DatasetCache +from altair.datasets._cache import DatasetCache, _iter_results from altair.datasets._typing import EXTENSION_SUFFIXES, Metadata, is_ext_read if TYPE_CHECKING: @@ -147,10 +147,10 @@ def dataset( **kwds: Any, ) -> IntoDataFrameT: df = self.query(**_extract_constraints(name, suffix)) - result = next(df.iter_rows(named=True)) + result = next(_iter_results(df)) url = result["url"] fn = self.read_fn(url) - if default_kwds := self._schema_kwds(result): # type: ignore + if default_kwds := self._schema_kwds(result): kwds = default_kwds | kwds if kwds else default_kwds if self.cache.is_active(): From 1a4f1c10c52f74d51a4e9ef78fc5d8c21cbded84 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 15 Jan 2025 13:05:49 +0000 Subject: [PATCH 158/201] feat(DRAFT): Replace `UrlCache` w/ `CsvCache` Now that only a single version is supported, it is possible to mitigate the `pandas` case w/o `.parquet` support (https://github.com/vega/altair/pull/3631#issuecomment-2480832609) This commit adds the file and some tools needed to implement this - but I'll need to follow up with some more changes to 
integrate this into `_Reader` --- altair/datasets/__init__.py | 4 +- altair/datasets/_cache.py | 59 +++++++++++++--------- altair/datasets/_metadata/metadata.csv.gz | Bin 0 -> 3577 bytes altair/datasets/_metadata/url.csv.gz | Bin 858 -> 0 bytes tests/test_datasets.py | 6 +-- tools/datasets/__init__.py | 16 +++--- 6 files changed, 45 insertions(+), 40 deletions(-) create mode 100644 altair/datasets/_metadata/metadata.csv.gz delete mode 100644 altair/datasets/_metadata/url.csv.gz diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 6095dd404..4986f671d 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -80,9 +80,9 @@ def url( url = load.url(name, suffix) except AltairDatasetsError: - from altair.datasets._cache import url_cache + from altair.datasets._cache import csv_cache - url = url_cache[name] + url = csv_cache.url(name) return url diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index c3ca65848..8b14e3660 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -37,15 +37,16 @@ _Dataset: TypeAlias = "Dataset | LiteralString" # noqa: TC008 _FlSchema: TypeAlias = Mapping[str, FlFieldStr] -__all__ = ["DatasetCache", "UrlCache", "url_cache"] +__all__ = ["DatasetCache"] _KT = TypeVar("_KT") _VT = TypeVar("_VT") _T = TypeVar("_T") -_URL: Final[Path] = Path(__file__).parent / "_metadata" / "url.csv.gz" -_SCHEMA: Final[Path] = Path(__file__).parent / "_metadata" / "schemas.json.gz" +_METADATA_DIR: Final[Path] = Path(__file__).parent / "_metadata" +_SCHEMA: Final[Path] = _METADATA_DIR / "schemas.json.gz" +_CSV: Final[Path] = _METADATA_DIR / "metadata.csv.gz" _FIELD_TO_DTYPE: Mapping[FlFieldStr, type[DType]] = { "integer": nw.Int64, @@ -109,19 +110,23 @@ def get(self, key: _KT, default: _T, /) -> _VT | _T: return self._mapping.get(key, default) -class UrlCache(CompressedCache[_KT, _VT]): +class CsvCache(CompressedCache["_Dataset", "Metadata"]): """ - `csv`_, `gzip`_ -based, lazy url lookup. + `csv`_, `gzip`_ -based, lazy metadata lookup. - Operates on a subset of available datasets: - - Excludes `.parquet`, which `cannot be read via url`_ + Used as a fallback for 2 scenarios: + + 1. ``url(...)`` when no optional dependencies are installed. + 2. ``(Loader|load)(...)`` when the backend is missing* ``.parquet`` support. + + Notes + ----- + *All backends *can* support ``.parquet``, but ``pandas`` requires an optional dependency. .. _csv: https://docs.python.org/3/library/csv.html .. _gzip: https://docs.python.org/3/library/gzip.html - .. 
_cannot be read via url: - https://github.com/vega/vega/issues/3961 """ def __init__( @@ -129,12 +134,10 @@ def __init__( fp: Path, /, *, - columns: tuple[str, str], - tp: type[MutableMapping[_KT, _VT]] = dict["_KT", "_VT"], + tp: type[MutableMapping[_Dataset, Metadata]] = dict["_Dataset", "Metadata"], ) -> None: self.fp: Path = fp - self.columns: tuple[str, str] = columns - self._mapping: MutableMapping[_KT, _VT] = tp() + self._mapping: MutableMapping[_Dataset, Metadata] = tp() def read(self) -> Any: import csv @@ -143,24 +146,32 @@ def read(self) -> Any: b_lines = f.readlines() reader = csv.reader((bs.decode() for bs in b_lines), dialect=csv.unix_dialect) header = tuple(next(reader)) - if header != self.columns: - msg = f"Expected header to match {self.columns!r},\nbut got: {header!r}" - raise ValueError(msg) - return dict(reader) - - def __getitem__(self, key: _KT, /) -> _VT: - if url := self.get(key, None): - return url + return {row[0]: dict(zip(header, row)) for row in reader} + def __getitem__(self, key: _Dataset, /) -> Metadata: + if result := self.get(key, None): + return result from altair.datasets._typing import Dataset if key in get_args(Dataset): - msg = f"{key!r} cannot be loaded via url." + msg = f"{key!r} cannot be loaded via {type(self).__name__!r}." raise TypeError(msg) else: msg = f"{key!r} does not refer to a known dataset." raise TypeError(msg) + def url(self, name: _Dataset, /) -> str: + if result := self.get(name, None): + return result["url"] + from altair.datasets._typing import Dataset + + if name in get_args(Dataset): + msg = f"{name!r} cannot be loaded via url." + raise TypeError(msg) + else: + msg = f"{name!r} does not refer to a known dataset." + raise TypeError(msg) + class SchemaCache(CompressedCache["_Dataset", "_FlSchema"]): """ @@ -359,7 +370,5 @@ def _ensure_active(self) -> None: raise ValueError(msg) -url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache( - _URL, columns=("dataset_name", "url") -) schema_cache = SchemaCache(_SCHEMA) +csv_cache = CsvCache(_CSV) diff --git a/altair/datasets/_metadata/metadata.csv.gz b/altair/datasets/_metadata/metadata.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..422e10cd48f3b181b47faac540f91c111ec767b7 GIT binary patch literal 3577 zcmVWbiwFn+00002|7~S-VPs)+VJ>5Hb^xtf%XZs17Toh&><-?KSU;C)nqU^>!4jvu%5A`gdCYyBS&t>4IH zp?p2c;h{f#>|f|f#=()wX+!&Q_$0%5W!Rm6A9i%?X)_LgKMdQ4XTn;2mKS=)S zeRw_|cZYv`_)vNK+wTXKP5*iS+g6Sro}NE^mM1#@c;pND!#`iB-+mZpA7)@3_50nh zKOP=_tB21I^Xxn^TG`+yNI%4W{xGOiD6rR!qVmaA+eow3QnK^bT4lVq$)>*#`W9VB z+wF-qkGs9paktOU;%@D-WI`}k5E6fq({0bKD zK!VGm4lhNnG%nN2Oh;vZ{1DE|=O0>=&#&$Thun!GsfbxB2MRdcDzNQ7e@$2Hhkt&*C(MA4C3lor;aKFqN9$9{W7Jx->=65ZmR=0wR*A>zn|T+5ZQ&*GFR zpwLv5s91?xgZYZAtF)iDc^poAtdz@UaUEel1}(}tDNQX)6~n>in6g1|mlB%8p7Y-a zd_~sP!93PsTV;FPqqPF-ISt<}Uy?I<7Z;~yNm2V+*id3swav;XineiaY^-jnFzSt^ z6*Rtu5|mDu_i}v<^ZYW+Xo!oeqf)5nK9|@usqCUh7@<-k36gP&Bxz-WM6-s&ExNAm z`xl`Z`FS!$yiUs_-f)9xsX-g%H5uK+8fR=I7d!>SzKPOhMV8(m>$pGA;VAp5qF<*| z$Rwv0hs>y4gi}C8Pf-;}u*_K~#F|~RjHHl)QnNzE+@b4u-+%5Q5HDLi&8BMZf?m4c z*0Q2jeQX7p?j@kaSVPtpv%`0HvKl^m z&az96qL<7`#CtLZdLStd5k)3fQ&-gW8ClohMrM6@9#1)`0u=S)q_hU1tXhhmB}Iyy zT6G#)ZPZ;BDlw-L$_ltwWEa4XEIZk(z z8&8~yjXv`lD)#`d9E^HLVqBWKR6=Mimkcf%cndRXvF4jiG?q=zy17(I;5iIKM3`Sd z;|lSH$9o!|X7>au5*Bw`108QvaRCych2V;yjB1CaWL@$Gc03|uq|z@d>Xe*`esxk2iPEtJNd zWm55w&8p;v#$b~QCc}fTC*a=@n9dh2BI3>Bc`bM!&2lLL%vPhiW(`wbTeU4&WCijY zd7ULf_!Io%dLlO%yEY8RLYvL}ET-w)lvRY=S{zFXf75f$00$dz-cP9RpHvBz^YqgEKlH}JYS;LpiAMa?B~O^k~Z 
zSD@UKX$&K#&;`Vz(7@TSwHZ<4Mjz%;I9HU-8)#iWe4$6#NQDEg{qgH%Z|(=xVW?0gwRsI3U}Z(;0yAIhw=$Kj;pkXml>0b9sy(|EpP z6-*fP0-KbABt*G#aAO>`4@p)?>npNuLy!A@nBP9kv-1RAaCxiEPGEK!av`zF-nED= zhnX=UK|aG0G>NRzisOb`Y~5zq^058aupdrO&+~rYFHZdyyJ&v6+c(@et8tUPZ;hLH z9V|sCjtYB;%ncgF(YI_?1pYT5PF(&y9$*bU-;=qnPG%EPyjVz!foF&MQMHvy>_yk) zbCCcV(gbTLq|L7-a}RN1^WRgE|MXkAXLDVhTpFrNT>5NuF&cIT8(?8SvMO2yl}uqI zOD41wPsBy1wRG+=g3j>iwEOsk2ezA&Y{)y5~YqF}uN*u4x} z+ShBfnx~gpSbnmb;Vv7w6v?65RutCMP^*a)5f))Tb#R8;3goxYfcx|7+0kV(?YPZ3 zr{%K3CK%~9p1nKtuZsE#cMyHn$zj`3sdcS>jjg-^*KLRW3#5_Z3HWtBpLIbGEIwJW z%32kqn$ZB|R^clZ>?xdYejHgs%=7x=@Y?DuqiIr{#CSpymO+G+p#eb=aSoYuAIP+EjTl++pp4_&9w zzmTs8sk*|pdE|HI*`V;tCWLj*5{xRPZ56%@z{8Eu1qU# zVB8=B!WI|F6ZZb%W(dw~_IYgGx)oNoR9$r%h0oc}% zxr29u|08k<@=o@LkI$z)j2ms|dnkYLrTO*ua#LgraR}0~)+O5-oF4N=H>1#g6zW+j zNUWBB1<&t7?hZo7<3=98;P>-*vN!U0vv^)}$>`+=GpgAH!CJU#=%nh!Gkj9Bwq$l3 z3#zs#@hixE1LIpVkI%ztzxn!j>7R#ha(nZg^H;1MkA)X=W$#_k#5Kq9`kxBA1q5sf z8F|?vT2?`Iv5_lwq&dSOgz|;&9-seoA=B)6q5ei~ki~NeO zI~<4lX^yA!c)ptEmhZQR6RM&M1ek`HDUdVG7guD|GKd@j1#9Yxv&&a#-HH40idvno zrCsNfq;X;KTLKsRc+J_uJA@;R!;y~iSZUWE&s}_e4EO6(B&D#}n&oN@ zjEq-4)s(Z-rdrj|HVj4d4@bHrrPf%pZ-eYU4;u)uVPZ1P=1Y99ui_V+Q{m}pk9OBe z^(}C*Z~NW2w?+CZqspO6|+lDExOUqTGpGnt^-Gn8*a zAUTT$)7HGUyH$l>L_R?+UBRC>`hIafBuskw-WPU;EXt0u5^Lp92(Bx)Y{#nuRR<^d znfw|O?=W^>o-vy_jN|#DSz2tcDvJxnS*SUKM_?;xMCRHg=f>xkN{m!rac$rgSoa0Z z;Iq`@FyH>XS(rix6>#}$Q{j+8QH6rGG@haGc;v>C#JfT%6pdR5gVL{auCR5CnXHj@=snBL&){_J;k;MvE{ zvx`AcI@$CSR>v|toM=--L2RS;XWuk3Au8pG*_EiYDl5R>!t4GIPe~^$7d8L@%yPriUB(f+jYm?16g8h zHL~dG;jaDl)60(I6dMrw(k2G@iI!!GisYU<@JGXYcnnrQeECqrW9R+gzV7!zn%%3D zpw(!12K>J5_9L`B|6#Jkx^F#KW}m+8Q}k{y?E_bKK7yC%1&V400k{V@fvP5yx*^t_-ZLDoww=?gn3*bc`BPATkr%g$JYe&XGg`{q*<>F z1xxY865JBfO3~H9XC}}HFy3;nkdWRDl^=(|5Wg$R6P}5hvQ<5B0+-{9C6(jO17$8} zBHymGYE-5ra0Mvo0k^*U9XRO)Ss+C{o$$~7T#C*m@T9pVl$9J_6G|VMgaM2MBS?$o z)|$ppZRZ_(I-Du^j9jbFhb`Eotldr6wYmkmGG8_@Ush(_BI$62@*8?W^bOaVpjM$I z+w!4Tajn_=!I@ZztOKrn$CQa1Fl^vogLyhy)e{wcuIa&&@;;u>nVV@hecZP74ARql*(; zo;D~$n6*i$p631(Z#?cBvA?yPa(213>KLdDr9_qZ$Lcn{OYnZT0!1Sip=I%T131T`uz|UBYNGK-MR-yiTCyk70=*a|(GN8Ul$2L>5^{=v0TW#UNTU}3016M7O8@`> diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 923fb9fbc..a65b96bd7 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -268,13 +268,13 @@ def test_url(name: Dataset) -> None: def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: import altair.datasets - from altair.datasets._cache import url_cache + from altair.datasets._cache import csv_cache monkeypatch.setitem(sys.modules, "polars", None) monkeypatch.setitem(sys.modules, "pandas", None) monkeypatch.setitem(sys.modules, "pyarrow", None) - assert url_cache._mapping == {} + assert csv_cache._mapping == {} with contextlib.suppress(AltairDatasetsError): monkeypatch.delattr(altair.datasets._loader, "load", raising=False) @@ -283,7 +283,7 @@ def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: assert match_url("jobs", url("jobs")) - assert url_cache._mapping != {} + assert csv_cache._mapping != {} assert match_url("cars", url("cars")) assert match_url("stocks", url("stocks")) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 7350ede7f..534bf6b9c 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -40,7 +40,7 @@ _PathAlias: TypeAlias = Literal[ "typing", - "url", + "metadata-csv", "metadata", "schemas", ] @@ -82,11 +82,12 @@ def __init__( out_dir_tools.mkdir(exist_ok=True) kwds_npm = kwds_npm or {} self._npm: Npm = Npm(out_dir_tools, **kwds_npm) + METADATA = "metadata" self.paths = 
types.MappingProxyType["_PathAlias", Path]( { "typing": out_fp_typing, - "url": out_dir_altair / "url.csv.gz", - "metadata": out_dir_altair / "metadata.parquet", + "metadata-csv": out_dir_altair / f"{METADATA}.csv.gz", + "metadata": out_dir_altair / f"{METADATA}.parquet", "schemas": out_dir_altair / "schemas.json.gz", } ) @@ -120,18 +121,13 @@ def refresh( package = self.npm.datapackage(tag=tag, frozen=frozen) self.write_parquet(package["features"], self.paths["metadata"]) self.write_json_gzip(package["schemas"], self.paths["schemas"]) - # FIXME: 2-Part replacement - # - [x] Switch source to `"metadata"` + refresh (easy) - # - [ ] Rewriting `UrlCache` to operate on result rows (difficult) - urls_min = ( + metadata_min = ( package["features"] .lazy() .filter(~(col("suffix").is_in((".parquet", ".arrow")))) - .select("dataset_name", "url") .sort("dataset_name") - .collect() ) - self.write_csv_gzip(urls_min, self.paths["url"]) + self.write_csv_gzip(metadata_min, self.paths["metadata-csv"]) if include_typing: self.generate_typing() From 32fd0f9444cbc12630d6b4d27ed3ebfdb0e7ac67 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 16 Jan 2025 13:39:32 +0000 Subject: [PATCH 159/201] refactor: Misc reworking caching - Made paths a `ClassVar` - Removed unused `SchemaCache` methods - Replace `_FIELD_TO_DTYPE` w/ `_DTYPE_TO_FIELD` - Only one variant is ever used Use a `SchemaCache` instance per-`pandas`-based reader - Make fallback `csv_cache` initialization lazy - Only going to use the global when no dependencies found - Otherwise, instance-per-reader --- altair/datasets/_cache.py | 74 ++++++++++++++++--------------------- altair/datasets/_readers.py | 10 +++-- 2 files changed, 37 insertions(+), 47 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 8b14e3660..89ed16858 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -37,7 +37,7 @@ _Dataset: TypeAlias = "Dataset | LiteralString" # noqa: TC008 _FlSchema: TypeAlias = Mapping[str, FlFieldStr] -__all__ = ["DatasetCache"] +__all__ = ["CsvCache", "DatasetCache", "SchemaCache", "csv_cache"] _KT = TypeVar("_KT") @@ -45,25 +45,23 @@ _T = TypeVar("_T") _METADATA_DIR: Final[Path] = Path(__file__).parent / "_metadata" -_SCHEMA: Final[Path] = _METADATA_DIR / "schemas.json.gz" -_CSV: Final[Path] = _METADATA_DIR / "metadata.csv.gz" - -_FIELD_TO_DTYPE: Mapping[FlFieldStr, type[DType]] = { - "integer": nw.Int64, - "number": nw.Float64, - "boolean": nw.Boolean, - "string": nw.String, - "object": nw.Struct, - "array": nw.List, - "date": nw.Date, - "datetime": nw.Datetime, - # "time": nw.Time, (Not Implemented, but we don't have any cases using it anyway) - "duration": nw.Duration, + +_DTYPE_TO_FIELD: Mapping[type[DType], FlFieldStr] = { + nw.Int64: "integer", + nw.Float64: "number", + nw.Boolean: "boolean", + nw.String: "string", + nw.Struct: "object", + nw.List: "array", + nw.Date: "date", + nw.Datetime: "datetime", + nw.Duration: "duration", + # nw.Time: "time" (Not Implemented, but we don't have any cases using it anyway) } """ -Similar to an inverted `pl.datatypes.convert.dtype_to_ffiname`_. +Similar to `pl.datatypes.convert.dtype_to_ffiname`_. -But using the string repr of ``frictionless`` `Field Types`_ to `narwhals.dtypes`_. +But using `narwhals.dtypes`_ to the string repr of ``frictionless`` `Field Types`_. .. 
_pl.datatypes.convert.dtype_to_ffiname: https://github.com/pola-rs/polars/blob/85d078c066860e012f5e7e611558e6382b811b82/py-polars/polars/datatypes/convert.py#L139-L165 @@ -73,10 +71,6 @@ https://narwhals-dev.github.io/narwhals/api-reference/dtypes/ """ -_DTYPE_TO_FIELD: Mapping[type[DType], FlFieldStr] = { - v: k for k, v in _FIELD_TO_DTYPE.items() -} - def _iter_results(df: nw.DataFrame[Any], /) -> Iterator[Metadata]: """ @@ -129,14 +123,13 @@ class CsvCache(CompressedCache["_Dataset", "Metadata"]): https://docs.python.org/3/library/gzip.html """ + fp = _METADATA_DIR / "metadata.csv.gz" + def __init__( self, - fp: Path, - /, *, tp: type[MutableMapping[_Dataset, Metadata]] = dict["_Dataset", "Metadata"], ) -> None: - self.fp: Path = fp self._mapping: MutableMapping[_Dataset, Metadata] = tp() def read(self) -> Any: @@ -189,14 +182,13 @@ class SchemaCache(CompressedCache["_Dataset", "_FlSchema"]): https://github.com/vega/vega-datasets/pull/631 """ + fp = _METADATA_DIR / "schemas.json.gz" + def __init__( self, - fp: Path, - /, *, tp: type[MutableMapping[_Dataset, _FlSchema]] = dict["_Dataset", "_FlSchema"], ) -> None: - self.fp: Path = fp self._mapping: MutableMapping[_Dataset, _FlSchema] = tp() def read(self) -> Any: @@ -225,20 +217,6 @@ def by_dtype(self, name: _Dataset, *dtypes: type[DType]) -> list[str]: else: return list(match) - def schema(self, name: _Dataset, /) -> Mapping[str, DType]: - return { - column: _FIELD_TO_DTYPE[tp_str]() for column, tp_str in self[name].items() - } - - def schema_cast(self, name: _Dataset, /) -> Iterator[nw.Expr]: - """ - Can be passed directly to `.with_columns(...). - - BUG: `cars` doesnt work in either pandas backend - """ - for column, dtype in self.schema(name).items(): - yield nw.col(column).cast(dtype) - class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): """Opt-out caching of remote dataset requests.""" @@ -370,5 +348,15 @@ def _ensure_active(self) -> None: raise ValueError(msg) -schema_cache = SchemaCache(_SCHEMA) -csv_cache = CsvCache(_CSV) +csv_cache: CsvCache + + +def __getattr__(name): + if name == "csv_cache": + global csv_cache + csv_cache = CsvCache() + return csv_cache + + else: + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 330f85642..d69b50e1d 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -31,7 +31,7 @@ import narwhals.stable.v1 as nw from narwhals.stable.v1.typing import IntoDataFrameT, IntoExpr, IntoFrameT -from altair.datasets._cache import DatasetCache, _iter_results +from altair.datasets._cache import CsvCache, DatasetCache, SchemaCache, _iter_results from altair.datasets._typing import EXTENSION_SUFFIXES, Metadata, is_ext_read if TYPE_CHECKING: @@ -252,12 +252,12 @@ class _PandasReaderBase(_Reader["pd.DataFrame", "pd.DataFrame"], Protocol): - https://pandas.pydata.org/docs/reference/api/pandas.read_json.html """ - def _schema_kwds(self, result: Metadata, /) -> dict[str, Any]: - from altair.datasets._cache import schema_cache + _schema_cache: SchemaCache + def _schema_kwds(self, result: Metadata, /) -> dict[str, Any]: name: Any = result["dataset_name"] suffix = result["suffix"] - if cols := schema_cache.by_dtype(name, nw.Date, nw.Datetime): + if cols := self._schema_cache.by_dtype(name, nw.Date, nw.Datetime): if suffix == ".json": return {"convert_dates": cols} elif suffix in {".csv", ".tsv"}: @@ -278,6 +278,7 @@ def __init__(self, name: _Pandas, /) -> None: ".parquet": 
pd.read_parquet, } self._scan_fn = {".parquet": pd.read_parquet} + self._schema_cache = SchemaCache() class _PandasPyArrowReader(_PandasReaderBase): @@ -296,6 +297,7 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: ".parquet": partial(pd.read_parquet, dtype_backend=_pa), } self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend=_pa)} + self._schema_cache = SchemaCache() def _pl_read_json_roundtrip(source: Path | IOBase, /, **kwds: Any) -> pl.DataFrame: From a1839df416e56814fa7f74afead758377f38550b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 16 Jan 2025 13:42:20 +0000 Subject: [PATCH 160/201] chore: Include `.parquet` in `metadata.csv.gz` - Readable via url w/ `vegafusion` installed - Currently no cases where a dataset has both `.parquet` and another extension --- altair/datasets/_metadata/metadata.csv.gz | Bin 3577 -> 3632 bytes tools/datasets/__init__.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/altair/datasets/_metadata/metadata.csv.gz b/altair/datasets/_metadata/metadata.csv.gz index 422e10cd48f3b181b47faac540f91c111ec767b7..30793abc86eee5f4255edba76dd1d9b739e8d66e 100644 GIT binary patch delta 3501 zcmV;e4N~&?8?YRZ8Gp%=+&GrJ^II}Y*q1(yO<(&s_p%4PhfJ0tgVC`_t^WFPMv$>& zN>x-*CbLWc5t7`&x445~I?_Rozi;VHo(`wh`u{w&ej}HK^6e;xr~dGJ|3)t|4vth# z8`_V<7a7hg!|we1u%ly7n{oKZVc0&s64vUgywTI)m7Y%fO@9~v+wY!!+ihQ-#?g6T zbI4D}{Yjo0Z4Q6+Gx!uZRHagim0e0+shq2hj+`k)hZu=P)u-3vad-H~r%#o)zx{Dw z+4Nubzis9C>E-p)7kQ!ck7vG+Km5-d_1jP5?9&Xaqkg{|_Q%81Z}sryX`Y=YMk^cq z1nEz)pFa#L6=4eOb)%?!a@97{Y_*i^ytP&t?`^W_?}NTa*U@%+qRsPeFLm7Q^RswZ zyDXUy%$3BS;=c@|&JwLtp|+}0(}9_S*Iru^t+f^A2y%V}iw_{d1C#) zvVZ=Qb^#v<@33`+_8a*&lbr!24u`BO{hklUeg9den7><-&;cU{e?-`o6Tirl`T-_? ze#16_K0Kg4Oc9<`%!{LivdUsy3ptT3*y9>(tX4_QUZUv8ElLY(Q6Fa5`(wYoq8=wx zVTo?>PIIDUs1R}FLayb?*k^Id6i{d?N>r@Gt-*Xn)>Yci+dK}ZJyyzPv$&41AA=U< zoRp@PrHbKTb4=MFxJwDmVbA&R1HK}E>*`>h>#(h|J?_z3f%Tk*AC@o4nY@dOQ?sO~ zeJyM#v8vi;WfVo*xHvXew^SJQ#?lHJ-$MyXC(L`fK8AUInPxP^#nn+M)N`LpY?@Sd z(IbpdDUk%pI7O1QGC`tQ!{HuXSNHv!(2V>%nIc}NbVLA2DMjq;j|ZeoppGd7Y7 zo`PZDMCr03OYe|%+#l$0l>JoEuhS`Hl2eOAW>hZ1DWIaKs0t)l<}4Iq&8}HSQb<9m zS)pPc&~?1;zw{7@mo1)VQ#E%%FWqlzSy8J#wt`Ie5>R5SBnlN$5gG{fC8M+E6|nE1 zbwc~&>%UL*`5NMPD^tNSI;U%YGx|W|2K_P@cGaPzLiJ|$wfg2vs_4Y6FxrEw3fs|G z{e1%O5q7N&|GCKYadaN&7))BO(^be&XW1o3(M#qe;yoDyJ&+WKh$54#sVnOGjI3*L zBeOm{kEfhe0g8HYQd)yhRxL%(k|IS;tvU^@HtH@5m6%frWd+Vx!NzhRP$rD+i-KkQkSyE|m~k%O!)0 z2HwJqTCDkI6OColvu-YbRT6j(!w?bX7tpvuyyNka#^>2R!HR^%-PS6|7=LRtvSSn<@grthtnvzDwqe$8M zE1ttU0`?8nl=lBV!JFLHu1ngwY?U#KTXC*V@FzUk8Udj0PPs2}7!=WoLkEi5(ZqRLF3#G9En^ZhJfGWA6%h{xY$e7eg?=>xtZ9?AkCK3vD*@vzVr{ z!cY;`adBcOxR(@v!63(yqYea>lm!i%eaX7Gf>t1O0)}wdqC-B$#Wg5Lv zi{)La#WdBRhhWdO5#Lme=zx|hjiN6^l&VNCI@J63BDpK&`2Uz z&IK(3${U7%VLT)>YK04a2d|q0{+z5+)LatR#JD(d1u|5b#&DGiT|g`f4Qv`)n-Mi` zG?gBOb4A&_gVy!KS9+F>RM`32AHQ8T{{8A@_r2ANci%t(+0=~XNInX{r{*L#Eu*%; z&X+NS+KNE`9>(sEq0FXX98R(nspS@*7KY3=jfYi#R>4t1{IE$WNJ5k=2W!bu`;cUX zPq-rMHuSvjhxt{?JUdTdY=1)cIDvC#$c4lPvZqCCIh>gZ30?sHrb%R#R-B#OW9v4< zmWS>C4*TKs@;dML-{Ho6!MZ-x!9@KRY7t%6FXFp?z`T8by)qSIPBj~GE`_5?hSPj{n36xk3>rp(E)PEwtd-UkhT4&KaRC&O+d4;(H_(;KAa5~#-9 zLZ()1YYDoiCT4^h%mK6m6sw?KIUnn45;uI^TYr!}mCZGzZVR)rw$UtqXG*E41|ybT zjm{v)a}yf`wk=z$v$Lv=Pi95IdIzw38@9Bs*J?FSFSD@xm^#BMH*zVGL$$3atf`?^ z6DcBpEW%6c;0(7F$nT*6_m|s2*kv;9xXn4I<+8#i4XERb1c&}rQD5N>qR%=xY&$Bo zuGO!xm3QE}?XZ7?G%`E^zs~1_JLrMM$3#|HtAbQB8lc=Ne5HcDVN1T5T=VE}oLO;X z`iQO@iX6^T=!|umO)X#)j7xtO^K!j7KR}8G(vQ#uWwKkA6T@R|tuFXx;F- zsWW5@FC!oA;)91=;Lfd}+oGs5i-^g^i!oVcZH;w-sv^6h|6Xx*JK4UR`t2Oo^K{C_ 
zsP*DT>vb%Ft-)W4uILhc$eL_KeE9MH3*#K0)Kdv{hqFGdOv=D zFl!*=Zk_h8U*3&6$7x-Q4y8pnL`kht@X&P%{R{bukg6+e%MXBdyP@A+-?m>S)A?0K zAzpl@s{?6lyGl8jW_=DC@f7jL5RZ*l=gPF=I>{X}K;HDnz9M()Fuss1r}2gAjl*|8 zoian2u=s@35g~JPBb!Zy19J(*zQGKC5NltQb~ftfa$dvg$5=sgciK!Zx+iAy!hEwJ zEGH9^YH*W5VvJUgUCbe}jILH26Y=nxG6=x7hRg%JJN&;Rmmu$CfB60Nw1;t{?R=@` zFTOOt%U^DaY#|OoTGqN`TZ7YM-sol&+K)m#O9hG5@~`0eL&)7h=y=@7^H=Xiq+$>@M5m)y$hPS<~ZJER6)0ZfGr^-FIz;*DyWWJei_&AVD1h(`2UIF zIJ_LD*`GMNR8mZet;A|ngb9Ryb251593o{i_AlgRg`!+RV+M9XJHGb1hkTx2Vojqi zr=PeK1CON8Sy_1ln3_St`S4$P6>8aB?n91 zDzt$Z7hSLn-^E~>!x~Nx@Ve7@(co>yexA%}W9;=t(_bD^wT8`?5D|Ex8P-yghgvhJ zSp{XPX^P})T=Dkz4%-O&K5fSO^sscPSTVWfhvq3}U&P1Sw2X&;%}(HTovWw-#TrM@ z|8T-<&K5o(9BCYmbd={xyZ(6Y;`3v8T%RH-g~iq^S8HHoyz;50oSin+s)n{8yIT`SeMz{SD^py7ERsW76Ah?O*6 zQH$@%8fvunsnQN1r8=qe)F;acSwSgP3?kk#q!568=VZO?Gw=jhcD&X?h zrothGq6!6VX*@&W@yLxc7str7ElgpfQ9IPqE z_6-YqK-3m)y{c*wq9}V6Dj6Pc&dG}VSnu$4|Lu2m;MvE{vx`AcI@$CSR>v|toM=-- bL2RS;XWulF;0+B7!|VPZIj%aRA_Bj{07dAVxsL>OB4x== z(o8ca=hy(GBvs&jRDobR(m{?NxAYKDy3N2rR0^$x$5Z1nNoC!kyuoHcs?F?hktzdP^w1A+2AKg zKg53iFsM{0uwU1WqVmaA+eow3QnK^bT4lVq$)>*#`W9VB+wF-qkGs9paktOU;%@D- zWI`}k5E6fq({0bKDK!VGm4lhNnG%nN2Oh;vZ z{F8VA9|&)-b%pjD`8t!I0VNKDtSkMV566A~Nv4>;S(DNMBL}}n*p(AM$&>s6CV&2l zZ32C`Lw%ScJgJx$M+;?@#kdx7B3rP>HP~3KlA66l(UDt}7S^IZ%&_;zetShdPNu>V z-Qu0*M9EMg;>d+u%ayUu;*=?%&{UMDSczMM`HHNow4b+m98PYOBo{md z!@h~qWkr_WAnUk4(BUZisiI$}Q^+Kz7KhBJT!d3VMNd%`NU+RVD8!mwvy7yWf>N_W z#oVFmc;A2SArLQHJk6$R?t)&r-`29CR()&*neHW^#8^obDx@Mb5b8@tXU!{M-$3hx z_Q&V{o#@jw#BWxnf@5?}*MDa8j>aqW%Usx1hms1_o7vaun=`4R6Su->53VX~M`QK( z3A{zvwKn{5k?G^;JkT+iv|OjFkfF}9OOB$K%t^$1G6s4eDGm`uCRbBe)b$xz*WgBG zeRv*EIjI5^_2Q(o2BEB4ik>A!ikwuWK5 z|G_ofm0H}OR4D>G!9)bLu3d%kh_+Z%^~kp3^Ty=zN-FPRf>;0ad^|jA_329Jx-yC1 zjLN*2N`%U?V{K6m=1x+>bAaP;0i`AzPn?R4KJywX_W-XPjCx06T$;L6LTD|Q3@#dY z3o~l5=9^74mQByPxqnni;5iIKM3`Sd;|lSH$9o!|X7>au5*Bw`108QvaRCych2V;y zjB1CaWL@$Gc03|uq|z@d>Xe*`esxk2iPEtJNdWm55w&8p;v#$b~QCV#_&uP5N&5SY#vE+XR1 z;(0B2AI)+p0nApTx@HYiUR$*-S!4zB8+n~2LiiK>;(8)C7`rwM$3mOU{4A#F+>}*> z+gcn;3W7gHFvzjwr~^S*WI<13U$QQ)6qm@aaEN#Cx+mJb^eDIc3H)__nMQBaVnvi{ zF-6o!;%>oupXaJW!nDpcz-_C^gnU9Gx|*x9CJu8ye0{m zR>oaNQwvVVN@1k52JZ~5d)f@hg~y|en|y}-x^S^F*rMg-N0S3I2y0&v71^rDm2;_1 zAXcHV$8$BKRv5=O@VYtR&&fJP%_VV7jEfUjpxl&c3?rt{1;nDzz}c|18Bya#ALdaw zSCq{gXn$Qle4$6#NQDEg{qgH%Z|(=xVW?0gwR zsI3U}Z(;0yAIhw=$Kj;pkXml>0b9sy(|EpP6-*fP0-KbABt*G#aAO>`4@p)?>npNu zLy!A@nBP9kv-1RAaCxiEPGEK!av`zF-nED=hkuzdAwfRF5;Tdd(u(7TTWsBC*z&Ob z*RUT>PtWsy-!D%67Q1MExZ5|}IID4!y>E@1cpWT7D2@tyiOdZe#nHEHRs{YxAWmHV zJsw~UJ>QeLu1;nXQM_14i-Bi{`cbu&O6*0~5?hS4@9Fw6xk3>rp(E)PEws|qlQSKcq>R7EUcpO2HwnJC&O+dcN{KD(@U4- z5~#-9;F(*ottIH5nwSx4FbB{MP^^M_<$rvvt4X}#>t6bU?5S+7A@#a2D{C9g^4E-% zifTlZU5(Bl$8!@Kgfx_$)!A9q#wW9)V7&p@y$oC0*K4(!rjjg-^*KLRW3#5_Z3HWtBpLIbGEIwJW%32kqn$ZB|R^clZ>$tJ}%;>C|uM zxSpp|Hb$)%H(IY_32Y7iQglU^;6v7ABjUr4S4JS85!GQuCAkCF?KX65h!^<7qwM!| zeL4F6gINO^_u6Uy`hC}^bDY+-=uldOLzL7S1rJ@P(7%wc2&uZlwtNR@w|^V@?ezuq zWip*#WfbDY$Dlfp#1cy+E!D{f%iAOqx0f9xxAw+`bwymA`f zLB4YM=BHC;C=(VRDmo%$Zf<0=sc>K}q1ZQ=0b=co(#}TRT+VA)eIF}m?oONO-SEV0 zUYK9Thvj5KQVni0NQ}|yv44v>M3&LjYGWcEUQ-4E*w&D_gLi}fBXSAyPWFe7&!;_% z8*S%%D1Y&#`Stg5Q)CNq2-33FCEFUD9`i;wqtJd7>RBpCtd@TT&+kI+4noJ{MjpT5 z_w#tNH}ZJ1cwTeK=;a49s@Vj=TDWTHr0T^pd{VQvWOf`2s7R#ha(nZg^H;1MkA)X=W$#_k#5Kq9`kxBA1q5sf8F|?vT2?`Iv5_lwq&VS0v8^F{I65i)bq2{dcuMrAjTZ{aPuR9!v`e}}*^LW0R=9cfb zhZCxz3j~;km?@Am%@)G~-10R?O7inGgCXx)kX@rqiVucck*lcaHB@mmM%bM#t8 zPP(G!G%6-2swiC6H!ChiNP?VJD1du#-DxW?yUpSRiK~`3amN#KvduFTShNdpyx|V%qhXn zs^nnFTZJ|d#xBSpN#q5jtSeusdu-OT`u5%R? 
zpjhJw`X5es&Dp{`gd>f^k&g0MY1bdmU3`8F_v=$6rGK#4n&oN@jEq-4)s(Z-rdrj| zHVj4d4@bHrrPf%pZ-eYU4;u)uVPZ1P=1Y99ui_V+Q{m}pk9OBe^(}C*Z~NW2w?+CZqspO6|+lDExOUqTGpGnt^-Gn8*aAUTT$)7HGUyH$l> zL_R?+U4Ox!H~M~YJ|s+f`Q8_HhAhgCvJz|MPzbIow`|9&162nn_?i3~67Mi}U!F0W zIgI1^qFGvOuqulS#aXC1gGXR1Xhi1PB<{9 diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 534bf6b9c..37d487da0 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -124,7 +124,7 @@ def refresh( metadata_min = ( package["features"] .lazy() - .filter(~(col("suffix").is_in((".parquet", ".arrow")))) + .filter(col("suffix") != ".arrow") .sort("dataset_name") ) self.write_csv_gzip(metadata_min, self.paths["metadata-csv"]) From 2db8dafddb07ec82d8c3e6110b1f2303a37bbbc9 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 16 Jan 2025 13:47:47 +0000 Subject: [PATCH 161/201] feat: Extend `_extract_suffix` to support `Metadata` Most subsequent changes are operating on this `TypedDict` directly, as it provides richer info for error handling --- altair/datasets/_readers.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index d69b50e1d..921b5ef7d 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -65,6 +65,7 @@ from altair.datasets._typing import Dataset, Extension, Metadata from altair.vegalite.v5.schema._typing import OneOrSeq + _IntoSuffix: TypeAlias = "StrPath | Metadata" # noqa: TC008 _ExtensionScan: TypeAlias = Literal[".parquet"] _T = TypeVar("_T") @@ -129,10 +130,10 @@ class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() - def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: + def read_fn(self, source: _IntoSuffix, /) -> Callable[..., IntoDataFrameT]: return self._read_fn[_extract_suffix(source, is_ext_read)] - def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: + def scan_fn(self, source: _IntoSuffix, /) -> Callable[..., IntoFrameT]: return self._scan_fn[_extract_suffix(source, is_ext_scan)] def _schema_kwds(self, result: Metadata, /) -> dict[str, Any]: @@ -443,8 +444,10 @@ def _extract_constraints( return constraints -def _extract_suffix(source: StrPath, guard: Callable[..., TypeIs[_T]], /) -> _T: - suffix: Any = Path(source).suffix +def _extract_suffix(source: _IntoSuffix, guard: Callable[..., TypeIs[_T]], /) -> _T: + suffix: Any = ( + Path(source).suffix if not isinstance(source, Mapping) else source["suffix"] + ) if guard(suffix): return suffix else: From c265e1d0536dd6a4d0fcd12344b15b4f6c515e3f Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 16 Jan 2025 13:50:39 +0000 Subject: [PATCH 162/201] refactor(typing): Simplify `Dataset` import --- altair/datasets/_cache.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 89ed16858..f0309a350 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -7,6 +7,7 @@ import narwhals.stable.v1 as nw from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT +from altair.datasets._typing import Dataset if sys.version_info >= (3, 12): from typing import Protocol @@ -32,7 +33,7 @@ else: from typing_extensions import TypeAlias from altair.datasets._readers import _Reader - from altair.datasets._typing import Dataset, FlFieldStr + from 
altair.datasets._typing import FlFieldStr _Dataset: TypeAlias = "Dataset | LiteralString" # noqa: TC008 _FlSchema: TypeAlias = Mapping[str, FlFieldStr] @@ -144,7 +145,6 @@ def read(self) -> Any: def __getitem__(self, key: _Dataset, /) -> Metadata: if result := self.get(key, None): return result - from altair.datasets._typing import Dataset if key in get_args(Dataset): msg = f"{key!r} cannot be loaded via {type(self).__name__!r}." @@ -156,7 +156,6 @@ def __getitem__(self, key: _Dataset, /) -> Metadata: def url(self, name: _Dataset, /) -> str: if result := self.get(name, None): return result["url"] - from altair.datasets._typing import Dataset if name in get_args(Dataset): msg = f"{name!r} cannot be loaded via url." From 5503e0b835e3eed32a32b975b0ff6f97de36d1e6 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 16 Jan 2025 13:53:34 +0000 Subject: [PATCH 163/201] fix: Convert `str` to correct types in `CsvCache` --- altair/datasets/_cache.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index f0309a350..eef5a86ee 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -15,7 +15,14 @@ from typing_extensions import Protocol if TYPE_CHECKING: - from collections.abc import Iterator, Mapping, MutableMapping + from collections.abc import ( + Iterable, + Iterator, + Mapping, + MutableMapping, + MutableSequence, + Sequence, + ) from io import IOBase from typing import Any, Final @@ -140,7 +147,19 @@ def read(self) -> Any: b_lines = f.readlines() reader = csv.reader((bs.decode() for bs in b_lines), dialect=csv.unix_dialect) header = tuple(next(reader)) - return {row[0]: dict(zip(header, row)) for row in reader} + return {row[0]: dict(self._convert_row(header, row)) for row in reader} + + def _convert_row( + self, header: Iterable[str], row: Iterable[str], / + ) -> Iterator[tuple[str, Any]]: + map_tf = {"true": True, "false": False} + for col, value in zip(header, row): + if col.startswith(("is_", "has_")): + yield col, map_tf[value] + elif col == "bytes": + yield col, int(value) + else: + yield col, value def __getitem__(self, key: _Dataset, /) -> Metadata: if result := self.get(key, None): From 3c7c5716e28ea3cbe863bf0d0a503786e09820c1 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 16 Jan 2025 14:04:54 +0000 Subject: [PATCH 164/201] feat: Support `pandas` w/o a `.parquet` reader --- altair/datasets/_cache.py | 23 +++++++++++++++++++++-- altair/datasets/_readers.py | 37 ++++++++++++++++++++++++++++++++----- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index eef5a86ee..98b97e35d 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -2,6 +2,7 @@ import os import sys +from collections import defaultdict from pathlib import Path from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, cast, get_args @@ -107,9 +108,13 @@ def __exit__(self, *args) -> None: return def get(self, key: _KT, default: _T, /) -> _VT | _T: + return self.mapping.get(key, default) + + @property + def mapping(self) -> MutableMapping[_KT, _VT]: if not self._mapping: self._mapping.update(self.read()) - return self._mapping.get(key, default) + return self._mapping class CsvCache(CompressedCache["_Dataset", "Metadata"]): @@ -139,6 +144,7 @@ def __init__( tp: type[MutableMapping[_Dataset, Metadata]] = dict["_Dataset", 
"Metadata"], ) -> None: self._mapping: MutableMapping[_Dataset, Metadata] = tp() + self._rotated: MutableMapping[str, MutableSequence[Any]] = defaultdict(list) def read(self) -> Any: import csv @@ -161,6 +167,19 @@ def _convert_row( else: yield col, value + @property + def rotated(self) -> Mapping[str, Sequence[Any]]: + """Columnar view.""" + if not self._rotated: + for record in self.mapping.values(): + for k, v in record.items(): + self._rotated[k].append(v) + return self._rotated + + def metadata(self, ns: Any, /) -> nw.LazyFrame: + data: Any = self.rotated + return nw.maybe_convert_dtypes(nw.from_dict(data, native_namespace=ns).lazy()) + def __getitem__(self, key: _Dataset, /) -> Metadata: if result := self.get(key, None): return result @@ -274,7 +293,7 @@ def download_all(self) -> None: stems = tuple(fp.stem for fp in self) predicates = (~(nw.col("sha").is_in(stems)),) if stems else () frame = ( - self._rd._scan_metadata(predicates, is_image=False) + self._rd._scan_metadata(*predicates, is_image=False) .select("sha", "suffix", "url") .unique("sha") .collect() diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 921b5ef7d..176439c0d 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -207,10 +207,13 @@ def query( def _scan_metadata( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] ) -> nw.LazyFrame: - frame = nw.from_native(self.scan_fn(_METADATA)(_METADATA)).lazy() if predicates or constraints: - return frame.filter(*predicates, **constraints) - return frame + return self._metadata.filter(*predicates, **constraints) + return self._metadata + + @property + def _metadata(self) -> nw.LazyFrame: + return nw.from_native(self.scan_fn(_METADATA)(_METADATA)).lazy() @property def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: @@ -279,8 +282,18 @@ def __init__(self, name: _Pandas, /) -> None: ".parquet": pd.read_parquet, } self._scan_fn = {".parquet": pd.read_parquet} + self._supports_parquet: bool = is_available( + "pyarrow", "fastparquet", require_all=False + ) + self._csv_cache = CsvCache() self._schema_cache = SchemaCache() + @property + def _metadata(self) -> nw.LazyFrame: + if self._supports_parquet: + return super()._metadata + return self._csv_cache.metadata(nw.dependencies.get_pandas()) + class _PandasPyArrowReader(_PandasReaderBase): def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: @@ -459,10 +472,24 @@ def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]: return suffix == ".parquet" -def is_available(pkg_names: str | Iterable[str], *more_pkg_names: str) -> bool: +def is_available( + pkg_names: str | Iterable[str], *more_pkg_names: str, require_all: bool = True +) -> bool: + """ + Check for importable package(s), without raising on failure. + + Parameters + ---------- + pkg_names, more_pkg_names + One or more packages. + require_all + * ``True`` every package. + * ``False`` at least one package. 
+ """ pkgs_names = pkg_names if not isinstance(pkg_names, str) else (pkg_names,) names = chain(pkgs_names, more_pkg_names) - return all(find_spec(name) is not None for name in names) + fn = all if require_all else any + return fn(find_spec(name) is not None for name in names) def infer_backend( From c23805d25027682e55ac84aeff1a7b311315fe79 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 16 Jan 2025 14:07:39 +0000 Subject: [PATCH 165/201] refactor: Reduce repetition w/ `_Reader._download` --- altair/datasets/_cache.py | 5 +---- altair/datasets/_readers.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 98b97e35d..36d965f2e 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -303,10 +303,7 @@ def download_all(self) -> None: return None print(f"Downloading {len(frame)} missing datasets...") for row in _iter_results(frame): - fp: Path = self.path / (row["sha"] + row["suffix"]) - with self._rd._opener.open(row["url"]) as f: - fp.touch() - fp.write_bytes(f.read()) + self._rd._download(row["url"], self.path / (row["sha"] + row["suffix"])) print("Finished downloads") return None diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 176439c0d..88d917ab4 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -156,13 +156,9 @@ def dataset( if self.cache.is_active(): fp = self.cache.path / (result["sha"] + result["suffix"]) - if fp.exists() and fp.stat().st_size: - return fn(fp, **kwds) - else: - with self._opener.open(url) as f: - fp.touch() - fp.write_bytes(f.read()) - return fn(fp, **kwds) + if not (fp.exists() and fp.stat().st_size): + self._download(url, fp) + return fn(fp, **kwds) else: with self._opener.open(url) as f: return fn(f, **kwds) @@ -215,6 +211,11 @@ def _scan_metadata( def _metadata(self) -> nw.LazyFrame: return nw.from_native(self.scan_fn(_METADATA)(_METADATA)).lazy() + def _download(self, url: str, fp: Path, /) -> None: + with self._opener.open(url) as f: + fp.touch() + fp.write_bytes(f.read()) + @property def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: return DatasetCache(self) From 056f96d4de55fc98f7f4f6e9e61650e5c8d62e25 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 16 Jan 2025 14:20:34 +0000 Subject: [PATCH 166/201] feat(DRAFT): `Metadata`-based error handling - Adds `_exceptions.py` with some initial cases - Renaming `result` -> `meta` - Reduced the complexity of `_PyArrowReader` - Generally, trying to avoid exceptions from 3rd parties - to allow suggesting an alternate path that may work --- altair/datasets/__init__.py | 2 +- altair/datasets/_cache.py | 19 +++-- altair/datasets/_exceptions.py | 72 ++++++++++++++++ altair/datasets/_readers.py | 146 ++++++++++++++++----------------- tests/test_datasets.py | 11 ++- 5 files changed, 163 insertions(+), 87 deletions(-) create mode 100644 altair/datasets/_exceptions.py diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 4986f671d..cc6a07d32 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -73,7 +73,7 @@ def url( - https://github.com/vega/altair/discussions/3150#discussioncomment-11280516 - https://github.com/vega/altair/pull/3631#discussion_r1846662053 """ - from altair.datasets._readers import AltairDatasetsError + from altair.datasets._exceptions import AltairDatasetsError try: from 
altair.datasets._loader import load diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 36d965f2e..79fc9c50b 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -3,11 +3,14 @@ import os import sys from collections import defaultdict +from importlib.util import find_spec from pathlib import Path from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, cast, get_args import narwhals.stable.v1 as nw from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT + +from altair.datasets._exceptions import AltairDatasetsError from altair.datasets._typing import Dataset if sys.version_info >= (3, 12): @@ -81,7 +84,7 @@ """ -def _iter_results(df: nw.DataFrame[Any], /) -> Iterator[Metadata]: +def _iter_metadata(df: nw.DataFrame[Any], /) -> Iterator[Metadata]: """ Yield rows from ``df``, where each represents a dataset. @@ -181,8 +184,8 @@ def metadata(self, ns: Any, /) -> nw.LazyFrame: return nw.maybe_convert_dtypes(nw.from_dict(data, native_namespace=ns).lazy()) def __getitem__(self, key: _Dataset, /) -> Metadata: - if result := self.get(key, None): - return result + if meta := self.get(key, None): + return meta if key in get_args(Dataset): msg = f"{key!r} cannot be loaded via {type(self).__name__!r}." @@ -192,8 +195,10 @@ def __getitem__(self, key: _Dataset, /) -> Metadata: raise TypeError(msg) def url(self, name: _Dataset, /) -> str: - if result := self.get(name, None): - return result["url"] + if meta := self.get(name, None): + if meta["suffix"] == ".parquet" and not find_spec("vegafusion"): + raise AltairDatasetsError.url_parquet(meta) + return meta["url"] if name in get_args(Dataset): msg = f"{name!r} cannot be loaded via url." @@ -302,8 +307,8 @@ def download_all(self) -> None: print("Already downloaded all datasets") return None print(f"Downloading {len(frame)} missing datasets...") - for row in _iter_results(frame): - self._rd._download(row["url"], self.path / (row["sha"] + row["suffix"])) + for meta in _iter_metadata(frame): + self._rd._download(meta["url"], self.path / (meta["sha"] + meta["suffix"])) print("Finished downloads") return None diff --git a/altair/datasets/_exceptions.py b/altair/datasets/_exceptions.py new file mode 100644 index 000000000..488470709 --- /dev/null +++ b/altair/datasets/_exceptions.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Sequence + + from altair.datasets._readers import _Backend + from altair.datasets._typing import Metadata + + +class AltairDatasetsError(Exception): + # TODO: Rename, try to reduce verbosity of message, link to vegafusion? 
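    # A rough sketch of the intended call-site, mirroring the guards added to
    # ``_cache.py`` / ``_readers.py`` in this commit (``meta`` is a ``Metadata`` row):
    #
    #     if meta["suffix"] == ".parquet" and not find_spec("vegafusion"):
    #         raise AltairDatasetsError.url_parquet(meta)
    #
    # which is expected to surface to users roughly as:
    #
    #     >>> url("flights-3m")  # doctest: +SKIP
    #     AltairDatasetsError: ... '.parquet' datasets require `vegafusion` ...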
+ @classmethod + def url_parquet(cls, meta: Metadata, /) -> AltairDatasetsError: + name = meta["file_name"] + msg = ( + f"Currently unable to load {name!r} via url, as '.parquet' datasets require `vegafusion`.\n" + "See upstream issue for details: https://github.com/vega/vega/issues/3961" + ) + return cls(msg) + + @classmethod + def from_priority(cls, priority: Sequence[_Backend], /) -> AltairDatasetsError: + msg = f"Found no supported backend, searched:\n{priority!r}" + return cls(msg) + + +def module_not_found( + backend_name: str, reqs: str | tuple[str, ...], missing: str +) -> ModuleNotFoundError: + if isinstance(reqs, tuple): + depends = ", ".join(f"{req!r}" for req in reqs) + " packages" + else: + depends = f"{reqs!r} package" + msg = ( + f"Backend {backend_name!r} requires the {depends}, but {missing!r} could not be found.\n" + f"This can be installed with pip using:\n" + f" pip install {missing}\n" + f"Or with conda using:\n" + f" conda install -c conda-forge {missing}" + ) + return ModuleNotFoundError(msg, name=missing) + + +# TODO: Give more direct help (e.g. url("7zip")) +def image(meta: Metadata): + name = meta["file_name"] + ext = meta["suffix"] + msg = ( + f"Unable to load {name!r} as tabular data.\n" + f"{ext!r} datasets are only compatible with `url(...)` or `Loader.url(...)`." + ) + return AltairDatasetsError(msg) + + +# TODO: Pass in `meta` +def geospatial(backend_name: str) -> NotImplementedError: + msg = _suggest_supported( + f"Geospatial data is not supported natively by {backend_name!r}." + ) + return NotImplementedError(msg) + + +# TODO: Pass in `meta` +def non_tabular_json(backend_name: str) -> NotImplementedError: + msg = _suggest_supported(f"Non-tabular json is not supported {backend_name!r}.") + return NotImplementedError(msg) + + +def _suggest_supported(msg: str) -> str: + return f"{msg}\nTry installing `polars` or using `Loader.url(...)` instead." diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 88d917ab4..11cc473a9 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -10,7 +10,7 @@ from __future__ import annotations import urllib.request -from collections.abc import Iterable, Mapping, Sequence +from collections.abc import Callable, Iterable, Mapping, Sequence from functools import partial from importlib import import_module from importlib.util import find_spec @@ -19,7 +19,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ClassVar, Final, Literal, @@ -31,11 +30,11 @@ import narwhals.stable.v1 as nw from narwhals.stable.v1.typing import IntoDataFrameT, IntoExpr, IntoFrameT -from altair.datasets._cache import CsvCache, DatasetCache, SchemaCache, _iter_results +from altair.datasets import _exceptions as _ds_exc +from altair.datasets._cache import CsvCache, DatasetCache, SchemaCache, _iter_metadata from altair.datasets._typing import EXTENSION_SUFFIXES, Metadata, is_ext_read if TYPE_CHECKING: - import json # noqa: F401 import sys from io import IOBase from urllib.request import OpenerDirector @@ -84,14 +83,11 @@ _Backend: TypeAlias = Literal[_Polars, _PandasAny, _PyArrow] -__all__ = ["backend"] +__all__ = ["backend", "infer_backend"] _METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" -class AltairDatasetsError(Exception): ... - - class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): """ Describes basic IO for remote & local tabular resources. 
@@ -136,10 +132,16 @@ def read_fn(self, source: _IntoSuffix, /) -> Callable[..., IntoDataFrameT]: def scan_fn(self, source: _IntoSuffix, /) -> Callable[..., IntoFrameT]: return self._scan_fn[_extract_suffix(source, is_ext_scan)] - def _schema_kwds(self, result: Metadata, /) -> dict[str, Any]: + def _schema_kwds(self, meta: Metadata, /) -> dict[str, Any]: """Hook to provide additional schema metadata on read.""" return {} + def _maybe_fn(self, meta: Metadata, /) -> Callable[..., IntoDataFrameT]: + """Backend specific tweaks/errors/warnings, based on ``Metadata``.""" + if meta["is_image"]: + raise _ds_exc.image(meta) + return self.read_fn(meta) + def dataset( self, name: Dataset | LiteralString, @@ -148,14 +150,14 @@ def dataset( **kwds: Any, ) -> IntoDataFrameT: df = self.query(**_extract_constraints(name, suffix)) - result = next(_iter_results(df)) - url = result["url"] - fn = self.read_fn(url) - if default_kwds := self._schema_kwds(result): + meta = next(_iter_metadata(df)) + fn = self._maybe_fn(meta) + url = meta["url"] + if default_kwds := self._schema_kwds(meta): kwds = default_kwds | kwds if kwds else default_kwds if self.cache.is_active(): - fp = self.cache.path / (result["sha"] + result["suffix"]) + fp = self.cache.path / (meta["sha"] + meta["suffix"]) if not (fp.exists() and fp.stat().st_size): self._download(url, fp) return fn(fp, **kwds) @@ -170,7 +172,10 @@ def url( /, ) -> str: frame = self.query(**_extract_constraints(name, suffix)) - url = frame.item(0, "url") + meta = next(_iter_metadata(frame)) + if meta["suffix"] == ".parquet" and not is_available("vegafusion"): + raise _ds_exc.AltairDatasetsError.url_parquet(meta) + url = meta["url"] if isinstance(url, str): return url else: @@ -223,21 +228,7 @@ def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: def _import(self, name: str, /) -> Any: if spec := find_spec(name): return import_module(spec.name) - else: - reqs = _requirements(self._name) # type: ignore[call-overload] - if isinstance(reqs, tuple): - depends = ", ".join(f"{req!r}" for req in reqs) + " packages" - else: - depends = f"{reqs!r} package" - - msg = ( - f"Backend {self._name!r} requires the {depends}, but {name!r} could not be found.\n" - f"This can be installed with pip using:\n" - f" pip install {name}\n" - f"Or with conda using:\n" - f" conda install -c conda-forge {name}" - ) - raise ModuleNotFoundError(msg, name=name) + raise _ds_exc.module_not_found(self._name, _requirements(self._name), name) # type: ignore[call-overload] def __repr__(self) -> str: return f"Reader[{self._name}]" @@ -259,15 +250,21 @@ class _PandasReaderBase(_Reader["pd.DataFrame", "pd.DataFrame"], Protocol): _schema_cache: SchemaCache - def _schema_kwds(self, result: Metadata, /) -> dict[str, Any]: - name: Any = result["dataset_name"] - suffix = result["suffix"] + def _schema_kwds(self, meta: Metadata, /) -> dict[str, Any]: + name: Any = meta["dataset_name"] + suffix = meta["suffix"] if cols := self._schema_cache.by_dtype(name, nw.Date, nw.Datetime): if suffix == ".json": return {"convert_dates": cols} elif suffix in {".csv", ".tsv"}: return {"parse_dates": cols} - return super()._schema_kwds(result) + return super()._schema_kwds(meta) + + def _maybe_fn(self, meta: Metadata, /) -> Callable[..., pd.DataFrame]: + fn = super()._maybe_fn(meta) + if meta["is_spatial"]: + raise _ds_exc.geospatial(self._name) + return fn class _PandasReader(_PandasReaderBase): @@ -378,51 +375,49 @@ class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): 
https://arrow.apache.org/docs/python/json.html#reading-json-files """ + def _maybe_fn(self, meta: Metadata, /) -> Callable[..., pa.Table]: + fn = super()._maybe_fn(meta) + if fn is self._read_json_polars: + return fn + elif meta["is_json"]: + if meta["is_tabular"]: + return self._read_json_tabular + elif meta["is_spatial"]: + raise _ds_exc.geospatial(self._name) + else: + raise _ds_exc.non_tabular_json(self._name) + else: + return fn + + def _read_json_tabular(self, source: Any, /, **kwds: Any) -> pa.Table: + import json + + if not isinstance(source, Path): + obj = json.load(source) + else: + with Path(source).open(encoding="utf-8") as f: + obj = json.load(f) + pa = nw.dependencies.get_pyarrow() + return pa.Table.from_pylist(obj) + + def _read_json_polars(self, source: Any, /, **kwds: Any) -> pa.Table: + return _pl_read_json_roundtrip(source).to_arrow() + def __init__(self, name: _PyArrow, /) -> None: self._name = _requirements(name) if not TYPE_CHECKING: - pa = self._import(self._name) - pa_csv = self._import(f"{self._name}.csv") - pa_feather = self._import(f"{self._name}.feather") - pa_parquet = self._import(f"{self._name}.parquet") - pa_read_csv = pa_csv.read_csv - pa_read_feather = pa_feather.read_table - pa_read_parquet = pa_parquet.read_table - - # HACK: Multiple alternatives to `pyarrow.json.read_json` - # ------------------------------------------------------- - # NOTE: Prefer `polars` since it is zero-copy and fast (1) - if find_spec("polars") is not None: - - def pa_read_json(source: StrPath, /, **kwds) -> pa.Table: - return _pl_read_json_roundtrip(source).to_arrow() + pa = self._import(self._name) # noqa: F841 + pa_read_csv = self._import(f"{self._name}.csv").read_csv + pa_read_feather = self._import(f"{self._name}.feather").read_table + pa_read_parquet = self._import(f"{self._name}.parquet").read_table + # NOTE: Prefer `polars` since it is zero-copy and fast + if find_spec("polars") is not None: + pa_read_json = self._read_json_polars else: - # NOTE: Convert inline from stdlib json (2) - import json - - pa_json = self._import(f"{self._name}.json") - - def pa_read_json(source: Any, /, **kwds) -> pa.Table: - if not isinstance(source, Path): - obj = json.load(source) - else: - with Path(source).open(encoding="utf-8") as f: - obj = json.load(f) - # NOTE: Common case of {"values": [{...}]}, missing the `"values"` keys - if isinstance(obj, Sequence) and isinstance(obj[0], Mapping): - return pa.Table.from_pylist(obj) - elif isinstance(obj, Mapping) and "type" in obj: - msg = ( - "Inferred file as geojson, unsupported by pyarrow.\n" - "Try installing `polars` or using `Loader.url(...)` instead." 
- ) - raise NotImplementedError(msg) - else: - # NOTE: Almost certainly will fail on read as of `v2.9.0` - return pa_json.read_json(source) - - # Stubs suggest using a dataclass, but no way to construct it + pa_read_json = self._import(f"{self._name}.json").read_json + + # NOTE: Stubs suggest using a dataclass, but no way to construct it tab_sep: Any = {"delimiter": "\t"} self._read_fn = { @@ -512,8 +507,7 @@ def infer_backend( it = (backend(name) for name in priority if is_available(_requirements(name))) if reader := next(it, None): return reader - msg = f"Found no supported backend, searched:\n{priority!r}" - raise AltairDatasetsError(msg) + raise _ds_exc.AltairDatasetsError.from_priority(priority) @overload diff --git a/tests/test_datasets.py b/tests/test_datasets.py index a65b96bd7..10f030cfa 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -16,7 +16,7 @@ from narwhals.stable.v1 import dependencies as nw_dep from altair.datasets import Loader, url -from altair.datasets._readers import AltairDatasetsError +from altair.datasets._exceptions import AltairDatasetsError from altair.datasets._typing import Dataset, Extension, Metadata, is_ext_read from tests import skip_requires_pyarrow, slow @@ -296,8 +296,13 @@ def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: assert match_url("flights-10k", url("flights-10k")) assert match_url("flights-200k", url("flights-200k")) - with pytest.raises(TypeError, match="cannot be loaded via url"): - url("flights-3m") + if find_spec("vegafusion"): + assert match_url("flights-3m", url("flights-3m")) + + with monkeypatch.context() as mp: + mp.setitem(sys.modules, "vegafusion", None) + with pytest.raises(AltairDatasetsError, match=r".parquet.+require.+vegafusion"): + url("flights-3m") with pytest.raises( TypeError, match="'fake data' does not refer to a known dataset" From e168948b6239f07a16d1e9f20b4a4c58cbea7ab4 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 16 Jan 2025 15:04:09 +0000 Subject: [PATCH 167/201] chore(ruff): Remove unused `0.9.2` ignores Related #3771 https://github.com/vega/altair/actions/runs/12810882256/job/35718940621?pr=3631 --- altair/datasets/_cache.py | 2 +- altair/datasets/_readers.py | 2 +- tools/datasets/npm.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 79fc9c50b..cbeb8f01f 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -46,7 +46,7 @@ from altair.datasets._readers import _Reader from altair.datasets._typing import FlFieldStr - _Dataset: TypeAlias = "Dataset | LiteralString" # noqa: TC008 + _Dataset: TypeAlias = "Dataset | LiteralString" _FlSchema: TypeAlias = Mapping[str, FlFieldStr] __all__ = ["CsvCache", "DatasetCache", "SchemaCache", "csv_cache"] diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 11cc473a9..55ab96851 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -64,7 +64,7 @@ from altair.datasets._typing import Dataset, Extension, Metadata from altair.vegalite.v5.schema._typing import OneOrSeq - _IntoSuffix: TypeAlias = "StrPath | Metadata" # noqa: TC008 + _IntoSuffix: TypeAlias = "StrPath | Metadata" _ExtensionScan: TypeAlias = Literal[".parquet"] _T = TypeVar("_T") diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index ea38eb971..95856d4fc 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -22,7 +22,7 @@ from typing_extensions import TypeAlias 
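# The tabular-JSON path added to ``_PyArrowReader`` above amounts to stdlib
# ``json`` plus ``pyarrow.Table.from_pylist``; a minimal standalone sketch,
# assuming the source is a flat list of records (the file path is illustrative):
#
#     import json
#     import pyarrow as pa
#
#     with open("cars.json", encoding="utf-8") as f:
#         table = pa.Table.from_pylist(json.load(f))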
from tools.datasets.models import Package, ParsedPackage - BranchOrTag: TypeAlias = 'Literal["main"] | LiteralString' # noqa: TC008 + BranchOrTag: TypeAlias = 'Literal["main"] | LiteralString' __all__ = ["Npm"] From 7fd1f4d5ce8450d2a611d2cb461983aab03643aa Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 17 Jan 2025 17:15:29 +0000 Subject: [PATCH 168/201] refactor: clean up, standardize `_exceptions.py` --- altair/datasets/_cache.py | 2 +- altair/datasets/_exceptions.py | 63 +++++++++++++++++++++------------- altair/datasets/_readers.py | 8 ++--- 3 files changed, 45 insertions(+), 28 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index cbeb8f01f..08016d622 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -197,7 +197,7 @@ def __getitem__(self, key: _Dataset, /) -> Metadata: def url(self, name: _Dataset, /) -> str: if meta := self.get(name, None): if meta["suffix"] == ".parquet" and not find_spec("vegafusion"): - raise AltairDatasetsError.url_parquet(meta) + raise AltairDatasetsError.from_url(meta) return meta["url"] if name in get_args(Dataset): diff --git a/altair/datasets/_exceptions.py b/altair/datasets/_exceptions.py index 488470709..36dba27ef 100644 --- a/altair/datasets/_exceptions.py +++ b/altair/datasets/_exceptions.py @@ -10,14 +10,20 @@ class AltairDatasetsError(Exception): - # TODO: Rename, try to reduce verbosity of message, link to vegafusion? @classmethod - def url_parquet(cls, meta: Metadata, /) -> AltairDatasetsError: - name = meta["file_name"] - msg = ( - f"Currently unable to load {name!r} via url, as '.parquet' datasets require `vegafusion`.\n" - "See upstream issue for details: https://github.com/vega/vega/issues/3961" - ) + def from_url(cls, meta: Metadata, /) -> AltairDatasetsError: + if meta["suffix"] == ".parquet": + msg = ( + f"{_failed_url(meta)}" + f"{meta['suffix']!r} datasets require `vegafusion`.\n" + "See upstream issue for details: https://github.com/vega/vega/issues/3961" + ) + else: + msg = ( + f"{cls.from_url.__qualname__}() called for " + f"unimplemented extension: {meta['suffix']}\n\n{meta!r}" + ) + raise NotImplementedError(msg) return cls(msg) @classmethod @@ -43,30 +49,41 @@ def module_not_found( return ModuleNotFoundError(msg, name=missing) -# TODO: Give more direct help (e.g. url("7zip")) -def image(meta: Metadata): - name = meta["file_name"] - ext = meta["suffix"] - msg = ( - f"Unable to load {name!r} as tabular data.\n" - f"{ext!r} datasets are only compatible with `url(...)` or `Loader.url(...)`." - ) +def image(meta: Metadata, /) -> AltairDatasetsError: + msg = f"{_failed_tabular(meta)}\n{_suggest_url(meta)}" return AltairDatasetsError(msg) -# TODO: Pass in `meta` -def geospatial(backend_name: str) -> NotImplementedError: - msg = _suggest_supported( +def geospatial(meta: Metadata, backend_name: str) -> NotImplementedError: + msg = ( + f"{_failed_tabular(meta)}" f"Geospatial data is not supported natively by {backend_name!r}." + f"{_suggest_url(meta, 'polars')}" ) return NotImplementedError(msg) -# TODO: Pass in `meta` -def non_tabular_json(backend_name: str) -> NotImplementedError: - msg = _suggest_supported(f"Non-tabular json is not supported {backend_name!r}.") +def non_tabular_json(meta: Metadata, backend_name: str) -> NotImplementedError: + msg = ( + f"{_failed_tabular(meta)}" + f"Non-tabular json is not supported natively by {backend_name!r}." 
+ f"{_suggest_url(meta, 'polars')}" + ) return NotImplementedError(msg) -def _suggest_supported(msg: str) -> str: - return f"{msg}\nTry installing `polars` or using `Loader.url(...)` instead." +def _failed_url(meta: Metadata, /) -> str: + return f"Unable to load {meta['file_name']!r} via url.\n" + + +def _failed_tabular(meta: Metadata, /) -> str: + return f"Unable to load {meta['file_name']!r} as tabular data.\n" + + +def _suggest_url(meta: Metadata, install_other: str | None = None) -> str: + other = f" installing `{install_other}` or" if install_other else "" + return ( + f"\n\nInstead, try{other}:\n\n" + " from altair.datasets import url\n" + f" url({meta['dataset_name']!r})" + ) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 55ab96851..c0587653a 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -174,7 +174,7 @@ def url( frame = self.query(**_extract_constraints(name, suffix)) meta = next(_iter_metadata(frame)) if meta["suffix"] == ".parquet" and not is_available("vegafusion"): - raise _ds_exc.AltairDatasetsError.url_parquet(meta) + raise _ds_exc.AltairDatasetsError.from_url(meta) url = meta["url"] if isinstance(url, str): return url @@ -263,7 +263,7 @@ def _schema_kwds(self, meta: Metadata, /) -> dict[str, Any]: def _maybe_fn(self, meta: Metadata, /) -> Callable[..., pd.DataFrame]: fn = super()._maybe_fn(meta) if meta["is_spatial"]: - raise _ds_exc.geospatial(self._name) + raise _ds_exc.geospatial(meta, self._name) return fn @@ -383,9 +383,9 @@ def _maybe_fn(self, meta: Metadata, /) -> Callable[..., pa.Table]: if meta["is_tabular"]: return self._read_json_tabular elif meta["is_spatial"]: - raise _ds_exc.geospatial(self._name) + raise _ds_exc.geospatial(meta, self._name) else: - raise _ds_exc.non_tabular_json(self._name) + raise _ds_exc.non_tabular_json(meta, self._name) else: return fn From 5dc227e72abd07c0afe705e8275f9a7052996941 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 17 Jan 2025 17:22:12 +0000 Subject: [PATCH 169/201] test: Refactor decorators, test new errors --- tests/test_datasets.py | 101 ++++++++++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 37 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 10f030cfa..3ccdba273 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -38,6 +38,12 @@ from altair.vegalite.v5.schema._typing import OneOrSeq from tests import MarksType + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + PolarsLoader: TypeAlias = Loader[pl.DataFrame, pl.LazyFrame] + CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" @@ -51,23 +57,34 @@ class DatasetSpec(TypedDict, total=False): requires_pyarrow: pytest.MarkDecorator = skip_requires_pyarrow() -backends: pytest.MarkDecorator = pytest.mark.parametrize( - "backend", - [ - "polars", - pytest.param( - "pandas", - marks=pytest.mark.xfail( - find_spec("pyarrow") is None, - reason=( - "`pandas` supports backends other than `pyarrow` for `.parquet`.\n" - "However, none of these are currently an `altair` dependency." - ), +_b_params = { + "polars": pytest.param("polars"), + "pandas": pytest.param( + "pandas", + marks=pytest.mark.xfail( + find_spec("pyarrow") is None, + reason=( + "`pandas` supports backends other than `pyarrow` for `.parquet`.\n" + "However, none of these are currently an `altair` dependency." 
), ), - pytest.param("pandas[pyarrow]", marks=requires_pyarrow), - pytest.param("pyarrow", marks=requires_pyarrow), - ], + ), + "pandas[pyarrow]": pytest.param("pandas[pyarrow]", marks=requires_pyarrow), + "pyarrow": pytest.param("pyarrow", marks=requires_pyarrow), +} + +backends: pytest.MarkDecorator = pytest.mark.parametrize("backend", _b_params.values()) +backends_no_polars: pytest.MarkDecorator = pytest.mark.parametrize( + "backend", [v for k, v in _b_params.items() if k != "polars"] +) +backends_pandas_any: pytest.MarkDecorator = pytest.mark.parametrize( + "backend", [v for k, v in _b_params.items() if "pandas" in k] +) +backends_single: pytest.MarkDecorator = pytest.mark.parametrize( + "backend", [v for k, v in _b_params.items() if "[" not in k] +) +backends_multi: pytest.MarkDecorator = pytest.mark.parametrize( + "backend", [v for k, v in _b_params.items() if "[" in k] ) datasets_debug: pytest.MarkDecorator = pytest.mark.datasets_debug() @@ -97,14 +114,30 @@ def is_flaky_datasets(request: pytest.FixtureRequest) -> bool: @pytest.fixture(scope="session") -def polars_loader( - tmp_path_factory: pytest.TempPathFactory, -) -> Loader[pl.DataFrame, pl.LazyFrame]: +def polars_loader(tmp_path_factory: pytest.TempPathFactory) -> PolarsLoader: data = Loader.from_backend("polars") data.cache.path = tmp_path_factory.mktemp("loader-cache-polars") return data +@pytest.fixture( + params=("earthquakes", "londonBoroughs", "londonTubeLines", "us-10m", "world-110m") +) +def spatial_datasets(request: pytest.FixtureRequest) -> Dataset: + return request.param + + +@backends_no_polars +def test_spatial(spatial_datasets, backend: _Backend) -> None: + load = Loader.from_backend(backend) + pattern = re.compile( + rf"{spatial_datasets}.+geospatial.+native.+{re.escape(backend)}.+url", + flags=re.DOTALL | re.IGNORECASE, + ) + with pytest.raises(NotImplementedError, match=pattern): + load(spatial_datasets) + + @pytest.fixture def metadata_columns() -> frozenset[str]: """ @@ -321,13 +354,10 @@ def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None assert set(nw_frame.columns) == {"symbol", "date", "price"} -@backends +@backends_single def test_missing_dependency_single( backend: _Backend, monkeypatch: pytest.MonkeyPatch ) -> None: - if backend == "pandas[pyarrow]": - pytest.skip("Testing single dependency backends only") - monkeypatch.setitem(sys.modules, backend, None) with pytest.raises( @@ -340,7 +370,7 @@ def test_missing_dependency_single( Loader.from_backend(backend) -@pytest.mark.parametrize("backend", ["pandas[pyarrow]"]) +@backends_multi @skip_requires_pyarrow def test_missing_dependency_multi( backend: _Backend, monkeypatch: pytest.MonkeyPatch @@ -597,9 +627,7 @@ def test_pyarrow_read_json( ], ) def test_polars_read_json_roundtrip( - polars_loader: Loader[pl.DataFrame, pl.LazyFrame], - spec: DatasetSpec, - column: str, + polars_loader: PolarsLoader, spec: DatasetSpec, column: str ) -> None: frame = polars_loader(spec["name"], ".json") tp = frame.schema.to_python()[column] @@ -620,18 +648,17 @@ def _dataset_params(*, skip: Container[str] = ()) -> Iterator[ParameterSet]: @slow @datasets_debug -@pytest.mark.parametrize( - ("name", "suffix"), - list(_dataset_params(skip=("7zip", "ffox", "gimp"))), -) +@pytest.mark.parametrize(("name", "suffix"), list(_dataset_params())) def test_all_datasets( - polars_loader: Loader[pl.DataFrame, pl.LazyFrame], - name: Dataset, - suffix: Extension, + polars_loader: PolarsLoader, name: Dataset, suffix: Extension ) -> None: """Ensure all annotated 
datasets can be loaded with the most reliable backend.""" - frame = polars_loader(name, suffix) - assert nw_dep.is_polars_dataframe(frame) + if name in {"7zip", "ffox", "gimp"}: + with pytest.raises(AltairDatasetsError, match=rf"{name}.+tabular"): + polars_loader(name, suffix) + else: + frame = polars_loader(name, suffix) + assert nw_dep.is_polars_dataframe(frame) def _raise_exception(e: type[Exception], *args: Any, **kwds: Any): @@ -686,7 +713,7 @@ def test_metadata_columns(backend: _Backend, metadata_columns: frozenset[str]) - @skip_requires_pyarrow -@pytest.mark.parametrize("backend", ["pandas", "pandas[pyarrow]"]) +@backends_pandas_any @pytest.mark.parametrize( ("name", "columns"), [ @@ -709,7 +736,7 @@ def test_pandas_date_parse( backend: _PandasAny, name: Dataset, columns: OneOrSeq[str], - polars_loader: Loader[pl.DataFrame, pl.LazyFrame], + polars_loader: PolarsLoader, ) -> None: """ Ensure schema defaults are correctly parsed. From ba01af12e064398dc89a7966f2849c1b1d03ff6c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 17 Jan 2025 18:24:20 +0000 Subject: [PATCH 170/201] docs: Replace outdated docs - Using `load` instead of `data` - Don't mention multi-versions, as that was dropped --- altair/datasets/_loader.py | 34 +++++++++++++++++----------------- altair/datasets/_readers.py | 4 +++- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index ef1cf46d3..0bb91aa1f 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -35,8 +35,8 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): from altair.datasets import Loader - data = Loader.from_backend("polars") - >>> data # doctest: +SKIP + load = Loader.from_backend("polars") + >>> load # doctest: +SKIP Loader[polars] .. 
_vega-datasets: @@ -94,24 +94,24 @@ def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: from altair.datasets import Loader - data = Loader.from_backend("polars") - cars = data("cars") + load = Loader.from_backend("polars") + cars = load("cars") >>> type(cars) # doctest: +SKIP polars.dataframe.frame.DataFrame Using ``pandas``: - data = Loader.from_backend("pandas") - cars = data("cars") + load = Loader.from_backend("pandas") + cars = load("cars") >>> type(cars) # doctest: +SKIP pandas.core.frame.DataFrame Using ``pandas``, backed by ``pyarrow`` dtypes: - data = Loader.from_backend("pandas[pyarrow]") - cars = data("cars") + load = Loader.from_backend("pandas[pyarrow]") + cars = load("cars") >>> type(cars) # doctest: +SKIP pandas.core.frame.DataFrame @@ -165,8 +165,8 @@ def __call__( from altair.datasets import Loader - data = Loader.from_backend("polars") - source = data("iowa-electricity") + load = Loader.from_backend("polars") + source = load("iowa-electricity") >>> source.columns # doctest: +SKIP ['year', 'source', 'net_generation'] @@ -193,8 +193,8 @@ def __call__( Using ``pandas``: - data = Loader.from_backend("pandas") - source = data("iowa-electricity") + load = Loader.from_backend("pandas") + source = load("iowa-electricity") >>> source.columns # doctest: +SKIP Index(['year', 'source', 'net_generation'], dtype='object') @@ -217,8 +217,8 @@ def __call__( Using ``pyarrow``: - data = Loader.from_backend("pyarrow") - source = data("iowa-electricity") + load = Loader.from_backend("pyarrow") + source = load("iowa-electricity") >>> source.column_names # doctest: +SKIP ['year', 'source', 'net_generation'] @@ -266,13 +266,13 @@ def url( import altair as alt from altair.datasets import Loader - data = Loader.from_backend("polars") - >>> data.url("cars") # doctest: +SKIP + load = Loader.from_backend("polars") + >>> load.url("cars") # doctest: +SKIP 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.11.0/data/cars.json' We can pass the result directly to a chart: - url = data.url("cars") + url = load.url("cars") alt.Chart(url).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q") """ return self._reader.url(name, suffix) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index c0587653a..f76cc5a0a 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -186,7 +186,7 @@ def query( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] ) -> nw.DataFrame[IntoDataFrameT]: """ - Query multi-version trees metadata. + Query a tabular version of `vega-datasets/datapackage.json`_. Applies a filter, erroring out when no results would be returned. @@ -194,6 +194,8 @@ def query( ----- Arguments correspond to those seen in `pl.LazyFrame.filter`_. + .. _vega-datasets/datapackage.json: + https://github.com/vega/vega-datasets/blob/main/datapackage.json .. 
_pl.LazyFrame.filter: https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html """ From 63f4be0232d5d818edaa9e7c67dcfa76e9057dda Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 18 Jan 2025 21:21:36 +0000 Subject: [PATCH 171/201] refactor: Clean up `tools.datasets` - `Application.generate_typing` now mostly populated by `DataPackage` methods - Docs are defined alongside expressions - Factored out repetitive code into `spell_literal_alias` - `Metadata` examples table is now generated inside the doc --- tools/datasets/__init__.py | 179 ++++---------------- tools/datasets/datapackage.py | 300 ++++++++++++++++++++++++---------- tools/datasets/npm.py | 20 +-- tools/schemapi/utils.py | 20 +++ 4 files changed, 275 insertions(+), 244 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 37d487da0..faf5e8d96 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -22,28 +22,28 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Literal -import polars as pl -from polars import col - from tools.codemod import ruff from tools.datasets.npm import Npm +from tools.fs import REPO_ROOT from tools.schemapi import utils if TYPE_CHECKING: import sys from collections.abc import Mapping + import polars as pl + + from tools.datasets import datapackage + if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias _PathAlias: TypeAlias = Literal[ - "typing", - "metadata-csv", - "metadata", - "schemas", + "typing", "metadata-csv", "metadata", "schemas", "datapackage" ] + PathMap: TypeAlias = Mapping[_PathAlias, Path] __all__ = ["app"] @@ -63,8 +63,6 @@ class Application: Directories to store ``.parquet`` metadata files. out_fp_typing Path to write metadata-derived typing module. - kwds_npm - Arguments passed to corresponding constructor. 
See Also -------- @@ -72,16 +70,9 @@ class Application: """ def __init__( - self, - out_dir_tools: Path, - out_dir_altair: Path, - out_fp_typing: Path, - *, - kwds_npm: Mapping[str, Any] | None = None, + self, out_dir_tools: Path, out_dir_altair: Path, out_fp_typing: Path ) -> None: out_dir_tools.mkdir(exist_ok=True) - kwds_npm = kwds_npm or {} - self._npm: Npm = Npm(out_dir_tools, **kwds_npm) METADATA = "metadata" self.paths = types.MappingProxyType["_PathAlias", Path]( { @@ -89,8 +80,10 @@ def __init__( "metadata-csv": out_dir_altair / f"{METADATA}.csv.gz", "metadata": out_dir_altair / f"{METADATA}.parquet", "schemas": out_dir_altair / "schemas.json.gz", + "datapackage": out_dir_tools / "datapackage.json", } ) + self._npm: Npm = Npm(self.paths) @property def npm(self) -> Npm: @@ -118,20 +111,15 @@ def refresh( https://github.com/vega/vega-datasets/issues/654 """ print("Syncing datasets ...") - package = self.npm.datapackage(tag=tag, frozen=frozen) - self.write_parquet(package["features"], self.paths["metadata"]) - self.write_json_gzip(package["schemas"], self.paths["schemas"]) - metadata_min = ( - package["features"] - .lazy() - .filter(col("suffix") != ".arrow") - .sort("dataset_name") - ) - self.write_csv_gzip(metadata_min, self.paths["metadata-csv"]) + dpkg = self.npm.datapackage(tag=tag, frozen=frozen) + self.write_parquet(dpkg.core, self.paths["metadata"]) + self.write_json_gzip(dpkg.schemas(), self.paths["schemas"]) + self.write_csv_gzip(dpkg.metadata_csv(), self.paths["metadata-csv"]) + print("Finished updating datasets.") if include_typing: - self.generate_typing() - return package["features"] + self.generate_typing(dpkg) + return dpkg.core.collect() def reset(self) -> None: """Remove all metadata files.""" @@ -140,10 +128,14 @@ def reset(self) -> None: def read(self, name: _PathAlias, /) -> pl.DataFrame: """Read existing metadata from file.""" + import polars as pl + return pl.read_parquet(self.paths[name]) def scan(self, name: _PathAlias, /) -> pl.LazyFrame: """Scan existing metadata from file.""" + import polars as pl + return pl.scan_parquet(self.paths[name]) def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: @@ -190,114 +182,16 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None df = frame.lazy().collect() df.write_parquet(fp, compression="zstd", compression_level=17) - def generate_typing(self) -> None: - from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT - - dpkg = self.scan("metadata") - metadata_schema = dpkg.collect_schema().to_python() - - DATASET_NAME = "dataset_name" - names = ( - dpkg.unique(DATASET_NAME) - .select(DATASET_NAME) - .sort(DATASET_NAME) - .collect() - .to_series() - ) + def generate_typing(self, dpkg: datapackage.DataPackage) -> None: indent = " " * 4 NAME = "Dataset" EXT = "Extension" - EXT_TYPES = tuple( - dpkg.filter(is_image=False) - .select(col("suffix").unique().sort()) - .collect() - .to_series() - .to_list() - ) + EXT_TYPES = dpkg.extensions() EXTENSION_SUFFIXES = "EXTENSION_SUFFIXES" EXTENSION_TYPE_TP = ( f"tuple[{', '.join(f'Literal[{el!r}]' for el in EXT_TYPES)}]" ) EXTENSION_GUARD = "is_ext_read" - METADATA_TD = "Metadata" - DESCRIPTION_DEFAULT = "_description_" - NOTE_SEP = f"\n\n{indent * 2}.. note::\n{indent * 3}" - - sha = ( - f"Unique hash for the dataset.{NOTE_SEP}" - f"E.g. if the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{indent * 3}" - f"then this value would remain stable." - ) - links = ( - f".. 
_Path.stem:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem\n" - f".. _Path.name:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.name\n" - f".. _Path.suffix:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix\n" - f".. _GeoJSON:\n{indent * 2}https://en.wikipedia.org/wiki/GeoJSON\n" - f".. _TopoJSON:\n{indent * 2}https://en.wikipedia.org/wiki/GeoJSON#TopoJSON\n" - ) - import textwrap - - # NOTE: Uses `pl.Config(fmt_str_lengths=25, tbl_cols=5, tbl_width_chars=80)` - examples = f"""\ - Examples - -------- - ``{METADATA_TD}`` keywords form constraints to filter a table like the below sample: - - ``` - shape: (73, 13) - ┌────────────────┬────────┬────────────────┬───┬───────────────┬───────────────┐ - │ dataset_name ┆ suffix ┆ file_name ┆ … ┆ sha ┆ url │ - │ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- │ - │ str ┆ str ┆ str ┆ ┆ str ┆ str │ - ╞════════════════╪════════╪════════════════╪═══╪═══════════════╪═══════════════╡ - │ 7zip ┆ .png ┆ 7zip.png ┆ … ┆ 6586d6c00887c ┆ https://cdn.j │ - │ ┆ ┆ ┆ ┆ d48850099c17… ┆ sdelivr.net/… │ - │ airports ┆ .csv ┆ airports.csv ┆ … ┆ 608ba6d51fa70 ┆ https://cdn.j │ - │ ┆ ┆ ┆ ┆ 584c3fa1d31e… ┆ sdelivr.net/… │ - │ annual-precip ┆ .json ┆ annual-precip. ┆ … ┆ 719e73406cfc0 ┆ https://cdn.j │ - │ ┆ ┆ json ┆ ┆ 8f16dda65151… ┆ sdelivr.net/… │ - │ anscombe ┆ .json ┆ anscombe.json ┆ … ┆ 11ae97090b626 ┆ https://cdn.j │ - │ ┆ ┆ ┆ ┆ 3bdf0c866115… ┆ sdelivr.net/… │ - │ barley ┆ .json ┆ barley.json ┆ … ┆ 8dc50de2509b6 ┆ https://cdn.j │ - │ ┆ ┆ ┆ ┆ e197ce95c24c… ┆ sdelivr.net/… │ - │ … ┆ … ┆ … ┆ … ┆ … ┆ … │ - │ weekly-weather ┆ .json ┆ weekly-weather ┆ … ┆ bd42a3e2403e7 ┆ https://cdn.j │ - │ ┆ ┆ .json ┆ ┆ ccd6baaa89f9… ┆ sdelivr.net/… │ - │ wheat ┆ .json ┆ wheat.json ┆ … ┆ cde46b43fc82f ┆ https://cdn.j │ - │ ┆ ┆ ┆ ┆ 4c3c2a37ddcf… ┆ sdelivr.net/… │ - │ windvectors ┆ .csv ┆ windvectors.cs ┆ … ┆ ed686b0ba613a ┆ https://cdn.j │ - │ ┆ ┆ v ┆ ┆ bd59d09fcd94… ┆ sdelivr.net/… │ - │ world-110m ┆ .json ┆ world-110m.jso ┆ … ┆ a1ce852de6f27 ┆ https://cdn.j │ - │ ┆ ┆ n ┆ ┆ 13c94c0c2840… ┆ sdelivr.net/… │ - │ zipcodes ┆ .csv ┆ zipcodes.csv ┆ … ┆ d3df33e12be0d ┆ https://cdn.j │ - │ ┆ ┆ ┆ ┆ 0544c95f1bd4… ┆ sdelivr.net/… │ - └────────────────┴────────┴────────────────┴───┴───────────────┴───────────────┘ - ``` - """ - - descriptions: dict[str, str] = { - "dataset_name": "Name of the dataset/`Path.stem`_.", - "suffix": "File extension/`Path.suffix`_.", - "file_name": "Equivalent to `Path.name`_.", - "bytes": "File size in *bytes*.", - "is_tabular": "Can be read as tabular data.", - "is_image": "Only accessible via url.", - "is_geo": "`GeoJSON`_ format.", - "is_topo": "`TopoJSON`_ format.", - "is_spatial": "Any geospatial format. 
Only natively supported by ``polars``.", - "is_json": "Not supported natively by ``pyarrow``.", - "has_schema": "Data types available for improved ``pandas`` parsing.", - "sha": sha, - "url": "Remote url used to access dataset.", - } - metadata_doc = ( - f"\n{indent}".join( - f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" - for param in metadata_schema - ) - + f"\n\n{links}\n\n" - f"{textwrap.indent(textwrap.dedent(examples), indent)}" - ) FIELD = "FlFieldStr" FIELD_TYPES = ( @@ -322,23 +216,14 @@ def generate_typing(self) -> None: utils.import_typing_extensions((3, 13), "TypeIs"), utils.import_typing_extensions((3, 10), "TypeAlias"), "\n", - f"__all__ = {[NAME, EXT, METADATA_TD, EXTENSION_GUARD, EXTENSION_SUFFIXES]}\n\n" - f"{NAME}: TypeAlias = {utils.spell_literal(names)}", - f"{EXT}: TypeAlias = {utils.spell_literal(EXT_TYPES)}", + f"__all__ = {[NAME, EXT, dpkg._NAME_TYPED_DICT, EXTENSION_GUARD, EXTENSION_SUFFIXES]}\n", + utils.spell_literal_alias(NAME, dpkg.dataset_names()), + utils.spell_literal_alias(EXT, EXT_TYPES), f"{EXTENSION_SUFFIXES}: {EXTENSION_TYPE_TP} = {EXT_TYPES!r}", f"def {EXTENSION_GUARD}(suffix: Any) -> TypeIs[{EXT}]:\n" f"{indent}return suffix in set({EXT_TYPES!r})\n", - UNIVERSAL_TYPED_DICT.format( - name=METADATA_TD, - metaclass_kwds=", total=False", - td_args=f"\n{indent}".join( - f"{param}: {tp.__name__}" for param, tp in metadata_schema.items() - ), - summary="Full schema for ``metadata.parquet``.", - doc=metadata_doc, - comment="", - ), - f"{FIELD}: TypeAlias = {utils.spell_literal(FIELD_TYPES)}\n" + dpkg.typed_dict(), + utils.spell_literal_alias(FIELD, FIELD_TYPES), '"""\n' "String representation of `frictionless`_ `Field Types`_.\n\n" f".. _frictionless:\n{indent}https://github.com/frictionlessdata/frictionless-py\n" @@ -348,15 +233,9 @@ def generate_typing(self) -> None: ruff.write_lint_format(self.paths["typing"], contents) -_alt_datasets = Path(__file__).parent.parent.parent / "altair" / "datasets" +_alt_datasets = REPO_ROOT / "altair" / "datasets" app = Application( Path(__file__).parent / "_metadata", _alt_datasets / "_metadata", _alt_datasets / "_typing.py", ) - - -# This is the tag in http://github.com/vega/vega-datasets from -# which the datasets in this repository are sourced. 
-_OLD_SOURCE_TAG = "v1.29.0" # 5 years ago -_CURRENT_SOURCE_TAG = "v2.9.0" diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index 5272170c2..9747bdb71 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -7,92 +7,189 @@ from __future__ import annotations +import textwrap from collections import deque +from functools import cached_property from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, ClassVar, Literal import polars as pl from polars import col -from polars import selectors as cs -from tools.datasets.models import ParsedPackage from tools.schemapi import utils if TYPE_CHECKING: from collections.abc import Iterable, Iterator, Mapping, Sequence from altair.datasets._typing import Dataset, FlFieldStr - from tools.datasets.models import Package + from tools.datasets.models import Package, Resource -__all__ = ["parse_package"] +__all__ = ["DataPackage"] +INDENT = " " * 4 -DATASET_NAME: Literal["dataset_name"] = "dataset_name" -# # NOTE: Flag columns -# Storing these instead of the full **56KB** `datapackage.json` -FEATURES: Sequence[pl.Expr] = ( - (col("format") == "png").alias("is_image"), - (col("type") == "table").alias("is_tabular"), - (col("format") == "geojson").alias("is_geo"), - (col("format") == "topojson").alias("is_topo"), - col("format").is_in(("geojson", "topojson")).alias("is_spatial"), - (col("format").str.contains("json")).alias("is_json"), -) +class Column: + def __init__(self, name: str, expr: pl.Expr, /, doc: str = "_description_") -> None: + self._name: str = name + self._expr: pl.Expr = expr + self._doc: str = doc + @property + def expr(self) -> pl.Expr: + return self._expr.alias(self._name) -def parse_package(pkg: Package, base_url: str, /) -> ParsedPackage: - return ParsedPackage( - features=extract_features(pkg, base_url), schemas=extract_schemas(pkg) - ) + @property + def doc(self) -> str: + return f"{self._name}\n{INDENT * 2}{self._doc}" + def is_feature(self) -> bool: + return self._name.startswith("is_") -def extract_schemas(pkg: Package, /) -> Mapping[Dataset, Mapping[str, FlFieldStr]]: - """Reduce all datasets with schemas to a minimal mapping.""" - m: Any = { - Path(rsrc["path"]).stem: {f["name"]: f["type"] for f in s["fields"]} - for rsrc in pkg["resources"] - if (s := rsrc.get("schema")) - } - return m - - -def extract_features(pkg: Package, base_url: str, /) -> pl.DataFrame: - EXCLUDE = ( - "name", - "type", - "format", - "scheme", - "mediatype", - "encoding", - "dialect", - "schema", - "sources", - "licenses", - "hash", - "description", - "path", - ) - return ( - pl.LazyFrame(pkg["resources"]) - .with_columns( - path_stem("path").alias(DATASET_NAME), - cs.exclude("name"), + +class DataPackage: + NAME: ClassVar[Literal["dataset_name"]] = "dataset_name" + """ + Main user-facing column name. 
+ + - Does not include file extension + - Preserves case of original file name + """ + + sort_by: str | Sequence[str] = "dataset_name", "bytes" + """Key(s) used to ensure output is deterministic.""" + + _NAME_TYPED_DICT: ClassVar[Literal["Metadata"]] = "Metadata" + _columns: ClassVar[Sequence[Column]] + _links: ClassVar[Sequence[str]] + + def __init__(self, pkg: Package, base_url: str, path: Path, /) -> None: + self._pkg: Package = pkg + self._base_url: str = base_url + self._path: Path = path + + @classmethod + def with_columns(cls, *columns: Column) -> type[DataPackage]: + cls._columns = columns + return cls + + @classmethod + def with_links(cls, *links: str) -> type[DataPackage]: + cls._links = links + return cls + + @property + def columns(self) -> Iterator[Column]: + yield from self._columns + yield self._url + + @cached_property + def core(self) -> pl.LazyFrame: + """A minimal, tabular view of ``datapackage.json``.""" + return pl.LazyFrame(self._resources).select(self._exprs).sort(self.sort_by) + + def schemas(self) -> Mapping[Dataset, Mapping[str, FlFieldStr]]: + """Reduce all datasets with schemas to a minimal mapping.""" + m: Any = { + Path(rsrc["path"]).stem: {f["name"]: f["type"] for f in s["fields"]} + for rsrc in self._resources + if (s := rsrc.get("schema")) + } + return m + + def dataset_names(self) -> Iterable[str]: + return self.core.select(col(self.NAME).unique().sort()).collect().to_series() + + def extensions(self) -> tuple[str, ...]: + return tuple( + self.core.filter(is_image=False) + .select(col("suffix").unique().sort()) + .collect() + .to_series() + .to_list() ) - .select( - DATASET_NAME, - path_suffix("path").alias("suffix"), - col("path").alias("file_name"), - ~cs.by_name(DATASET_NAME, EXCLUDE), - *FEATURES, - col("schema").is_not_null().alias("has_schema"), - col("hash").str.split(":").list.last().alias("sha"), - pl.concat_str(pl.lit(base_url), "path").alias("url"), + + # TODO: Collect, then raise if cannot guarantee uniqueness + def metadata_csv(self) -> pl.LazyFrame: + """Variant with duplicate dataset names removed.""" + return self.core.filter(col("suffix") != ".arrow").sort(self.NAME) + + def typed_dict(self) -> str: + from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT + + return UNIVERSAL_TYPED_DICT.format( + name=self._NAME_TYPED_DICT, + metaclass_kwds=", total=False", + td_args=self._metadata_td_args, + summary=f"Full schema for ``{self._path.name}``.", + doc=self._metadata_doc, + comment="", ) - .sort(DATASET_NAME, "bytes") - .collect() - ) + + @property + def _exprs(self) -> Iterator[pl.Expr]: + return (column.expr for column in self.columns) + + @property + def _docs(self) -> Iterator[str]: + return (column.doc for column in self.columns) + + @property + def _resources(self) -> Sequence[Resource]: + return self._pkg["resources"] + + @property + def _metadata_doc(self) -> str: + NLINDENT = f"\n{INDENT}" + return ( + f"{NLINDENT.join(self._docs)}\n\n{''.join(self._links)}\n" + f"{textwrap.indent(self._metadata_examples, INDENT)}" + f"{INDENT}" + ) + + @property + def _metadata_examples(self) -> str: + with pl.Config(fmt_str_lengths=25, tbl_cols=5, tbl_width_chars=80): + table = repr(self.core.collect()) + return ( + f"\nExamples" + f"\n--------\n" + f"``{self._NAME_TYPED_DICT}`` keywords form constraints to filter a table like the below sample:\n\n" + f"```\n{table}\n```\n" + ) + + @property + def _metadata_td_args(self) -> str: + schema = self.core.collect_schema().to_python() + return f"\n{INDENT}".join(f"{p}: {tp.__name__}" for p, tp in 
schema.items()) + + @property + def _url(self) -> Column: + expr = pl.concat_str(pl.lit(self._base_url), "path") + return Column("url", expr, "Remote url used to access dataset.") + + def features_typing(self, frame: pl.LazyFrame | pl.DataFrame, /) -> Iterator[str]: + """ + Current plan is to use type aliases in overloads. + + - ``Tabular`` can be treated interchangeably + - ``Image`` can only work with ``url`` + - ``(Spatial|Geo|Topo)`` can be read with ``polars`` + - A future version may implement dedicated support https://github.com/vega/altair/pull/3631#discussion_r1845931955 + - ``Json`` should warn when using the ``pyarrow`` backend + """ + guards = deque[str]() + ldf = frame.lazy() + for column in self.columns: + if not column.is_feature(): + continue + guard_name = column._name + alias_name = guard_name.removeprefix("is_").capitalize() + members = ldf.filter(guard_name).select(self.NAME).collect().to_series() + guards.append(guard_literal(alias_name, guard_name, members)) + yield utils.spell_literal_alias(alias_name, members) + yield from guards def path_stem(column: str | pl.Expr, /) -> pl.Expr: @@ -119,30 +216,65 @@ def path_suffix(column: str | pl.Expr, /) -> pl.Expr: return path.str.tail(path.str.reverse().str.find(r"\.") + 1) -def features_typing(frame: pl.LazyFrame | pl.DataFrame, /) -> Iterator[str]: - """ - Current plan is to use type aliases in overloads. - - - ``Tabular`` can be treated interchangeably - - ``Image`` can only work with ``url`` - - ``(Spatial|Geo|Topo)`` can be read with ``polars`` - - A future version may implement dedicated support https://github.com/vega/altair/pull/3631#discussion_r1845931955 - - ``Json`` should warn when using the ``pyarrow`` backend - """ - guards = deque[str]() - ldf = frame.lazy() - for feat in FEATURES: - guard_name = feat.meta.output_name() - alias_name = guard_name.removeprefix("is_").capitalize() - members = ldf.filter(guard_name).select(DATASET_NAME).collect().to_series() - guards.append(guard_literal(alias_name, guard_name, members)) - yield f"{alias_name}: TypeAlias = {utils.spell_literal(members)}" - yield from guards - - def guard_literal(alias_name: str, guard_name: str, members: Iterable[str], /) -> str: """Type narrowing function, all members must be literal strings.""" return ( f"def {guard_name}(obj: Any) -> TypeIs[{alias_name}]:\n" f" return obj in set({sorted(set(members))!r})\n" ) + + +PATHLIB = "https://docs.python.org/3/library/pathlib.html" +GEOJSON = "https://en.wikipedia.org/wiki/GeoJSON" + + +def link(name: str, url: str, /) -> str: + return f"{INDENT}.. _{name}:\n{INDENT * 2}{url}\n" + + +def note(s: str, /) -> str: + return f"\n\n{INDENT * 2}.. note::\n{INDENT * 3}{s}" + + +fmt = col("format") +DataPackage.with_columns( + Column("dataset_name", path_stem("path"), "Name of the dataset/`Path.stem`_."), + Column("suffix", path_suffix("path"), "File extension/`Path.suffix`_."), + Column("file_name", col("path"), "Equivalent to `Path.name`_."), + Column("bytes", col("bytes"), "File size in *bytes*."), + Column("is_image", fmt == "png", "Only accessible via url."), + Column("is_tabular", col("type") == "table", "Can be read as tabular data."), + Column("is_geo", fmt == "geojson", "`GeoJSON`_ format."), + Column("is_topo", fmt == "topojson", "`TopoJSON`_ format."), + Column( + "is_spatial", + fmt.is_in(("geojson", "topojson")), + "Any geospatial format. Only natively supported by ``polars``.", + ), + Column( + "is_json", fmt.str.contains("json"), "Not supported natively by ``pyarrow``." 
+ ), + Column( + "has_schema", + col("schema").is_not_null(), + "Data types available for improved ``pandas`` parsing.", + ), + Column( + "sha", + col("hash").str.split(":").list.last(), + doc=( + "Unique hash for the dataset." + + note( + f"E.g. if the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{INDENT * 3}" + f"then this value would remain stable." + ) + ), + ), +) +DataPackage.with_links( + link("Path.stem", f"{PATHLIB}#pathlib.PurePath.stem"), + link("Path.name", f"{PATHLIB}#pathlib.PurePath.name"), + link("Path.suffix", f"{PATHLIB}#pathlib.PurePath.suffix"), + link("GeoJSON", GEOJSON), + link("TopoJSON", f"{GEOJSON}#TopoJSON"), +) diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index 95856d4fc..40116cb05 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -20,7 +20,9 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias - from tools.datasets.models import Package, ParsedPackage + from tools.datasets import PathMap + from tools.datasets.datapackage import DataPackage + from tools.datasets.models import Package BranchOrTag: TypeAlias = 'Literal["main"] | LiteralString' @@ -40,16 +42,13 @@ class Npm: def __init__( self, - output_dir: Path, + paths: PathMap, *, jsdelivr: Literal["jsdelivr"] = "jsdelivr", npm: Literal["npm"] = "npm", package: LiteralString = "vega-datasets", ) -> None: - output_dir.mkdir(exist_ok=True) - self._paths: dict[Literal["datapackage"], Path] = { - "datapackage": output_dir / "datapackage.json", - } + self.paths: PathMap = paths self._url: NpmUrl = NpmUrl( CDN=f"https://cdn.{jsdelivr}.net/{npm}/{package}@", GH=f"https://cdn.{jsdelivr}.net/gh/vega/{package}@", @@ -107,14 +106,15 @@ def file_gh( with self._opener.open(req) as response: return read_fn(response) - def datapackage(self, *, tag: LiteralString, frozen: bool = False) -> ParsedPackage: + def datapackage(self, *, tag: LiteralString, frozen: bool = False) -> DataPackage: pkg: Package = ( - json.loads(self._paths["datapackage"].read_text("utf-8")) + json.loads(self.paths["datapackage"].read_text("utf-8")) if frozen else self.file_gh(tag, "datapackage.json") ) - - return datapackage.parse_package(pkg, self.dataset_base_url(tag)) + return datapackage.DataPackage( + pkg, self.dataset_base_url(tag), self.paths["metadata"] + ) def is_branch(s: BranchOrTag, /) -> bool: diff --git a/tools/schemapi/utils.py b/tools/schemapi/utils.py index a9426f15c..3d4b2d347 100644 --- a/tools/schemapi/utils.py +++ b/tools/schemapi/utils.py @@ -1227,6 +1227,26 @@ def spell_literal(it: Iterable[str], /, *, quote: bool = True) -> str: return f"Literal[{', '.join(it_el)}]" +def spell_literal_alias( + alias_name: str, members: Iterable[str], /, *, quote: bool = True +) -> str: + """ + Wraps ``utils.spell_literal`` as a ``TypeAlias``. + + Examples + -------- + >>> spell_literal_alias("Animals", ("Dog", "Cat", "Fish")) + "Animals: TypeAlias = Literal['Dog', 'Cat', 'Fish']" + + >>> spell_literal_alias("Digits", "0123456789") + "Digits: TypeAlias = Literal['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']" + + >>> spell_literal_alias("LessThanFive", (repr(i) for i in range(5))) + "LessThanFive: TypeAlias = Literal['0', '1', '2', '3', '4']" + """ + return f"{alias_name}: TypeAlias = {spell_literal(members, quote=quote)}" + + def maybe_rewrap_literal(it: Iterable[str], /) -> Iterator[str]: """ Where `it` may contain one or more `"enum"`, `"const"`, flatten to a single `Literal[...]`. 
From 7433eb81fe22acf62588016e240f0997ef6df908 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 20 Jan 2025 17:09:49 +0000 Subject: [PATCH 172/201] test: `test_datasets` overhaul - Eliminated all flaky tests - Mocking more of the internals that is safer to run in parallel - Split out non-threadsafe tests with `@no_xdist` - Huge performance improvement for the slower tests - Added some helper functions (`is_*`) where common patterns were identified - **Removed skipping from native `pandas` backend** - Confirms that its now safe without `pyarrow` installed --- altair/datasets/_readers.py | 24 +- pyproject.toml | 21 +- tests/__init__.py | 10 + tests/test_datasets.py | 556 ++++++++++++++++++------------------ 4 files changed, 308 insertions(+), 303 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index f76cc5a0a..0a18c1e61 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -60,6 +60,7 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias + from packaging.requirements import Requirement from altair.datasets._typing import Dataset, Extension, Metadata from altair.vegalite.v5.schema._typing import OneOrSeq @@ -379,7 +380,7 @@ class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): def _maybe_fn(self, meta: Metadata, /) -> Callable[..., pa.Table]: fn = super()._maybe_fn(meta) - if fn is self._read_json_polars: + if fn == self._read_json_polars: return fn elif meta["is_json"]: if meta["is_tabular"]: @@ -550,7 +551,7 @@ def _requirements(s: _ConcreteT, /) -> _ConcreteT: ... def _requirements(s: Literal["pandas[pyarrow]"], /) -> tuple[_Pandas, _PyArrow]: ... -def _requirements(s: _Backend, /): +def _requirements(s: Any, /) -> Any: concrete: set[Literal[_Polars, _Pandas, _PyArrow]] = {"polars", "pandas", "pyarrow"} if s in concrete: return s @@ -559,12 +560,13 @@ def _requirements(s: _Backend, /): req = Requirement(s) supports_extras: set[Literal[_Pandas]] = {"pandas"} - if req.name in supports_extras: - name = req.name - if (extras := req.extras) and extras == {"pyarrow"}: - extra = "pyarrow" - return name, extra - else: - raise NotImplementedError(s) - else: - raise NotImplementedError(s) + if req.name in supports_extras and req.extras == {"pyarrow"}: + return req.name, "pyarrow" + return _requirements_unknown(req) + + +def _requirements_unknown(req: Requirement | str, /) -> Any: + from packaging.requirements import Requirement + + req = Requirement(req) if isinstance(req, str) else req + return (req.name, *req.extras) diff --git a/pyproject.toml b/pyproject.toml index 5ac95f190..03e33cc36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -262,16 +262,18 @@ cwd = "." 
[tool.taskipy.tasks] lint = "ruff check" format = "ruff format --diff --check" +ruff-check = "task lint && task format" ruff-fix = "task lint && ruff format" type-check = "mypy altair tests" -pytest = "pytest" -test = "task lint && task format && task type-check && task pytest" -test-fast = "task ruff-fix && pytest -m \"not slow\"" -test-slow = "task ruff-fix && pytest -m \"slow\"" -test-datasets = "task ruff-fix && pytest tests -k test_datasets -m \"\"" -test-min = "task lint && task format && task type-check && hatch test --python 3.9" -test-all = "task lint && task format && task type-check && hatch test --all" +pytest-serial = "pytest -m \"no_xdist\" --numprocesses=1" +pytest = "pytest && task pytest-serial" +test = "task ruff-check && task type-check && task pytest" +test-fast = "task ruff-fix && pytest -m \"not slow and not datasets_debug and not no_xdist\"" +test-slow = "task ruff-fix && pytest -m \"slow and not datasets_debug and not no_xdist\"" +test-datasets = "task ruff-fix && pytest tests -k test_datasets -m \"not no_xdist\" && task pytest-serial" +test-min = "task ruff-check && task type-check && hatch test --python 3.9" +test-all = "task ruff-check && task type-check && hatch test --all" generate-schema-wrapper = "mypy tools && python tools/generate_schema_wrapper.py && task test" @@ -303,12 +305,13 @@ addopts = [ "tests", "altair", "tools", - "-m not datasets_debug", + "-m not datasets_debug and not no_xdist", ] # https://docs.pytest.org/en/stable/how-to/mark.html#registering-marks markers = [ "slow: Label tests as slow (deselect with '-m \"not slow\"')", - "datasets_debug: Disabled by default due to high number of requests" + "datasets_debug: Disabled by default due to high number of requests", + "no_xdist: Unsafe to run in parallel" ] [tool.mypy] diff --git a/tests/__init__.py b/tests/__init__.py index 5d78dce0d..80c27fc2c 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -60,6 +60,16 @@ def windows_has_tzdata() -> bool: >>> hatch run test-slow --durations=25 # doctest: +SKIP """ +no_xdist: pytest.MarkDecorator = pytest.mark.no_xdist() +""" +Custom ``pytest.mark`` decorator. + +Each marked test will run **serially**, after all other selected tests. + +.. tip:: + Use as a last resort when a test depends on manipulating global state. +""" + skip_requires_ipython: pytest.MarkDecorator = pytest.mark.skipif( find_spec("IPython") is None, reason="`IPython` not installed." 
) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 3ccdba273..b212d79ce 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,6 +1,5 @@ from __future__ import annotations -import contextlib import datetime as dt import re import sys @@ -15,18 +14,14 @@ from narwhals.stable import v1 as nw from narwhals.stable.v1 import dependencies as nw_dep -from altair.datasets import Loader, url +from altair.datasets import Loader from altair.datasets._exceptions import AltairDatasetsError from altair.datasets._typing import Dataset, Extension, Metadata, is_ext_read -from tests import skip_requires_pyarrow, slow - -if sys.version_info >= (3, 14): - from typing import TypedDict -else: - from typing_extensions import TypedDict +from tests import no_xdist, skip_requires_pyarrow +from tools import fs if TYPE_CHECKING: - from collections.abc import Container, Iterator + from collections.abc import Callable, Container, Iterator, Mapping from pathlib import Path from typing import Literal @@ -34,7 +29,7 @@ import polars as pl from _pytest.mark.structures import ParameterSet - from altair.datasets._readers import _Backend, _PandasAny, _Polars + from altair.datasets._readers import _Backend, _PandasAny, _Polars, _PyArrow from altair.vegalite.v5.schema._typing import OneOrSeq from tests import MarksType @@ -45,46 +40,24 @@ PolarsLoader: TypeAlias = Loader[pl.DataFrame, pl.LazyFrame] CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" - - -class DatasetSpec(TypedDict, total=False): - """Exceptional cases which cannot rely on defaults.""" - - name: Dataset - suffix: Extension - marks: MarksType - - -requires_pyarrow: pytest.MarkDecorator = skip_requires_pyarrow() - -_b_params = { +_backend_params: Mapping[_Backend, ParameterSet] = { "polars": pytest.param("polars"), - "pandas": pytest.param( - "pandas", - marks=pytest.mark.xfail( - find_spec("pyarrow") is None, - reason=( - "`pandas` supports backends other than `pyarrow` for `.parquet`.\n" - "However, none of these are currently an `altair` dependency." 
- ), - ), - ), - "pandas[pyarrow]": pytest.param("pandas[pyarrow]", marks=requires_pyarrow), - "pyarrow": pytest.param("pyarrow", marks=requires_pyarrow), + "pandas": pytest.param("pandas"), + "pandas[pyarrow]": pytest.param("pandas[pyarrow]", marks=skip_requires_pyarrow()), + "pyarrow": pytest.param("pyarrow", marks=skip_requires_pyarrow()), } -backends: pytest.MarkDecorator = pytest.mark.parametrize("backend", _b_params.values()) +backends: pytest.MarkDecorator = pytest.mark.parametrize( + "backend", _backend_params.values() +) backends_no_polars: pytest.MarkDecorator = pytest.mark.parametrize( - "backend", [v for k, v in _b_params.items() if k != "polars"] + "backend", [v for k, v in _backend_params.items() if k != "polars"] ) backends_pandas_any: pytest.MarkDecorator = pytest.mark.parametrize( - "backend", [v for k, v in _b_params.items() if "pandas" in k] -) -backends_single: pytest.MarkDecorator = pytest.mark.parametrize( - "backend", [v for k, v in _b_params.items() if "[" not in k] + "backend", [v for k, v in _backend_params.items() if "pandas" in k] ) -backends_multi: pytest.MarkDecorator = pytest.mark.parametrize( - "backend", [v for k, v in _b_params.items() if "[" in k] +backends_pyarrow: pytest.MarkDecorator = pytest.mark.parametrize( + "backend", [v for k, v in _backend_params.items() if k == "pyarrow"] ) datasets_debug: pytest.MarkDecorator = pytest.mark.datasets_debug() @@ -100,24 +73,12 @@ class DatasetSpec(TypedDict, total=False): """ -@pytest.fixture -def is_flaky_datasets(request: pytest.FixtureRequest) -> bool: - mark_filter = request.config.getoption("-m", None) # pyright: ignore[reportArgumentType] - if mark_filter is None: - return False - elif mark_filter == "": - return True - elif isinstance(mark_filter, str): - return False - else: - raise TypeError(mark_filter) - - @pytest.fixture(scope="session") -def polars_loader(tmp_path_factory: pytest.TempPathFactory) -> PolarsLoader: - data = Loader.from_backend("polars") - data.cache.path = tmp_path_factory.mktemp("loader-cache-polars") - return data +def polars_loader() -> PolarsLoader: + load = Loader.from_backend("polars") + if load.cache.is_not_active(): + load.cache.path = load.cache._XDG_CACHE + return load @pytest.fixture( @@ -127,17 +88,6 @@ def spatial_datasets(request: pytest.FixtureRequest) -> Dataset: return request.param -@backends_no_polars -def test_spatial(spatial_datasets, backend: _Backend) -> None: - load = Loader.from_backend(backend) - pattern = re.compile( - rf"{spatial_datasets}.+geospatial.+native.+{re.escape(backend)}.+url", - flags=re.DOTALL | re.IGNORECASE, - ) - with pytest.raises(NotImplementedError, match=pattern): - load(spatial_datasets) - - @pytest.fixture def metadata_columns() -> frozenset[str]: """ @@ -158,25 +108,65 @@ def metadata_columns() -> frozenset[str]: ) -def match_url(name: Dataset, url: str) -> bool: +def is_frame_backend(frame: Any, backend: _Backend, /) -> bool: + pandas_any: set[_PandasAny] = {"pandas", "pandas[pyarrow]"} + if backend in pandas_any: + return nw_dep.is_pandas_dataframe(frame) + elif backend == "pyarrow": + return nw_dep.is_pyarrow_table(frame) + elif backend == "polars": + return nw_dep.is_polars_dataframe(frame) + else: + raise TypeError(backend) + + +def is_loader_backend(loader: Loader[Any, Any], backend: _Backend, /) -> bool: + return repr(loader) == f"{type(loader).__name__}[{backend}]" + + +def is_url(name: Dataset, fn_url: Callable[..., str], /) -> bool: pattern = rf".+/vega-datasets@.+/data/{name}\..+" + url = fn_url(name) return re.match(pattern, 
url) is not None +def is_polars_backed_pyarrow(loader: Loader[Any, Any], /) -> bool: + """ + User requested ``pyarrow``, but also has ``polars`` installed. + + Notes + ----- + - Currently, defers to ``polars`` only for ``.json``. + """ + return bool( + is_loader_backend(loader, "pyarrow") + and (fn := getattr(loader._reader, "_read_json_polars", None)) + and fn == loader._reader.read_fn("dummy.json") + ) + + +@backends +def test_metadata_columns(backend: _Backend, metadata_columns: frozenset[str]) -> None: + """Ensure all backends will query the same column names.""" + load = Loader.from_backend(backend) + schema_columns = load._reader._scan_metadata().collect().columns + assert set(schema_columns) == metadata_columns + + @backends def test_loader_from_backend(backend: _Backend) -> None: - data = Loader.from_backend(backend) - assert data._reader._name == backend + load = Loader.from_backend(backend) + assert is_loader_backend(load, backend) @backends def test_loader_url(backend: _Backend) -> None: - data = Loader.from_backend(backend) - dataset_name: Dataset = "volcano" - assert match_url(dataset_name, data.url(dataset_name)) + load = Loader.from_backend(backend) + assert is_url("volcano", load.url) -def test_load(monkeypatch: pytest.MonkeyPatch) -> None: +@no_xdist +def test_load_infer_priority(monkeypatch: pytest.MonkeyPatch) -> None: """ Inferring the best backend available. @@ -187,7 +177,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: import altair.datasets._loader from altair.datasets import load - assert load._reader._name == "polars" + assert is_loader_backend(load, "polars") monkeypatch.delattr(altair.datasets._loader, "load", raising=False) monkeypatch.setitem(sys.modules, "polars", None) @@ -196,20 +186,20 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: if find_spec("pyarrow") is None: # NOTE: We can end the test early for the CI job that removes `pyarrow` - assert load._reader._name == "pandas" + assert is_loader_backend(load, "pandas") monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pandas", None) with pytest.raises(AltairDatasetsError, match=r"no.+backend"): from altair.datasets import load else: - assert load._reader._name == "pandas[pyarrow]" + assert is_loader_backend(load, "pandas[pyarrow]") monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pyarrow", None) from altair.datasets import load - assert load._reader._name == "pandas" + assert is_loader_backend(load, "pandas") monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pandas", None) @@ -217,7 +207,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setitem(sys.modules, "pyarrow", import_module("pyarrow")) from altair.datasets import load - assert load._reader._name == "pyarrow" + assert is_loader_backend(load, "pyarrow") monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pyarrow", None) @@ -225,40 +215,22 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: from altair.datasets import load -# HACK: Using a fixture to get a command line option -# https://docs.pytest.org/en/stable/example/simple.html#pass-different-values-to-a-test-function-depending-on-command-line-options -@pytest.mark.xfail( - is_flaky_datasets, # type: ignore - reason=( - "'pandas[pyarrow]' seems to break locally when running:\n" - ">>> pytest -p no:randomly -n logical tests -k test_datasets -m ''\n\n" - "Possibly related:\n" - " 
https://github.com/modin-project/modin/issues/951\n" - " https://github.com/pandas-dev/pandas/blob/1c986d6213904fd7d9acc5622dc91d029d3f1218/pandas/io/parquet.py#L164\n" - " https://github.com/pandas-dev/pandas/blob/1c986d6213904fd7d9acc5622dc91d029d3f1218/pandas/io/parquet.py#L257\n" - ), - raises=AttributeError, -) -@requires_pyarrow -def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: +@backends +def test_load_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: import altair.datasets._loader monkeypatch.delattr(altair.datasets._loader, "load", raising=False) from altair.datasets import load - assert load._reader._name == "polars" + assert is_loader_backend(load, "polars") default = load("cars") - df_pyarrow = load("cars", backend="pyarrow") - df_pandas = load("cars", backend="pandas[pyarrow]") + df = load("cars", backend=backend) default_2 = load("cars") - df_polars = load("cars", backend="polars") assert nw_dep.is_polars_dataframe(default) - assert nw_dep.is_pyarrow_table(df_pyarrow) - assert nw_dep.is_pandas_dataframe(df_pandas) + assert is_frame_backend(df, backend) assert nw_dep.is_polars_dataframe(default_2) - assert nw_dep.is_polars_dataframe(df_polars) @pytest.mark.parametrize( @@ -296,41 +268,36 @@ def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: def test_url(name: Dataset) -> None: from altair.datasets import url - assert match_url(name, url(name)) + assert is_url(name, url) def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: - import altair.datasets from altair.datasets._cache import csv_cache + from altair.datasets._readers import infer_backend - monkeypatch.setitem(sys.modules, "polars", None) - monkeypatch.setitem(sys.modules, "pandas", None) - monkeypatch.setitem(sys.modules, "pyarrow", None) + priority: Any = ("fake_mod_1", "fake_mod_2", "fake_mod_3", "fake_mod_4") assert csv_cache._mapping == {} - - with contextlib.suppress(AltairDatasetsError): - monkeypatch.delattr(altair.datasets._loader, "load", raising=False) with pytest.raises(AltairDatasetsError): - from altair.datasets import load as load - - assert match_url("jobs", url("jobs")) + infer_backend(priority=priority) + url = csv_cache.url + assert is_url("jobs", url) assert csv_cache._mapping != {} - assert match_url("cars", url("cars")) - assert match_url("stocks", url("stocks")) - assert match_url("countries", url("countries")) - assert match_url("crimea", url("crimea")) - assert match_url("disasters", url("disasters")) - assert match_url("driving", url("driving")) - assert match_url("earthquakes", url("earthquakes")) - assert match_url("flare", url("flare")) - assert match_url("flights-10k", url("flights-10k")) - assert match_url("flights-200k", url("flights-200k")) + assert is_url("cars", url) + assert is_url("stocks", url) + assert is_url("countries", url) + assert is_url("crimea", url) + assert is_url("disasters", url) + assert is_url("driving", url) + assert is_url("earthquakes", url) + assert is_url("flare", url) + assert is_url("flights-10k", url) + assert is_url("flights-200k", url) if find_spec("vegafusion"): - assert match_url("flights-3m", url("flights-3m")) + assert is_url("flights-3m", url) with monkeypatch.context() as mp: mp.setitem(sys.modules, "vegafusion", None) @@ -344,51 +311,14 @@ def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: @backends -def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.delenv(CACHE_ENV_VAR, raising=False) - - data = Loader.from_backend(backend) - frame = 
data("stocks", ".csv") +def test_loader_call(backend: _Backend) -> None: + load = Loader.from_backend(backend) + frame = load("stocks", ".csv") assert nw_dep.is_into_dataframe(frame) nw_frame = nw.from_native(frame) assert set(nw_frame.columns) == {"symbol", "date", "price"} -@backends_single -def test_missing_dependency_single( - backend: _Backend, monkeypatch: pytest.MonkeyPatch -) -> None: - monkeypatch.setitem(sys.modules, backend, None) - - with pytest.raises( - ModuleNotFoundError, - match=re.compile( - rf"{backend}.+requires.+{backend}.+but.+{backend}.+not.+found.+pip install {backend}", - flags=re.DOTALL, - ), - ): - Loader.from_backend(backend) - - -@backends_multi -@skip_requires_pyarrow -def test_missing_dependency_multi( - backend: _Backend, monkeypatch: pytest.MonkeyPatch -) -> None: - secondary = "pyarrow" - primary = backend.removesuffix(f"[{secondary}]") - monkeypatch.setitem(sys.modules, secondary, None) - - with pytest.raises( - ModuleNotFoundError, - match=re.compile( - rf"{re.escape(backend)}.+requires.+'{primary}', '{secondary}'.+but.+{secondary}.+not.+found.+pip install {secondary}", - flags=re.DOTALL, - ), - ): - Loader.from_backend(backend) - - @backends def test_dataset_not_found(backend: _Backend) -> None: """ @@ -396,7 +326,7 @@ def test_dataset_not_found(backend: _Backend) -> None: ``Loader.url`` is used since it doesn't require a remote connection. """ - data = Loader.from_backend(backend) + load = Loader.from_backend(backend) real_name: Literal["disasters"] = "disasters" invalid_name: Literal["fake name"] = "fake name" invalid_suffix: Literal["fake suffix"] = "fake suffix" @@ -411,7 +341,7 @@ def test_dataset_not_found(backend: _Backend) -> None: ERR_NO_RESULT, match=re.compile(rf"{MSG_NO_RESULT}.+{NAME}.+{invalid_name}", re.DOTALL), ): - data.url(invalid_name) + load.url(invalid_name) with pytest.raises( TypeError, @@ -420,7 +350,7 @@ def test_dataset_not_found(backend: _Backend) -> None: re.DOTALL, ), ): - data.url(real_name, invalid_suffix) # type: ignore[arg-type] + load.url(real_name, invalid_suffix) # type: ignore[arg-type] with pytest.raises( ERR_NO_RESULT, @@ -429,7 +359,44 @@ def test_dataset_not_found(backend: _Backend) -> None: re.DOTALL, ), ): - data.url(real_name, incorrect_suffix) + load.url(real_name, incorrect_suffix) + + +def test_reader_missing_dependencies() -> None: + from packaging.requirements import Requirement + + from altair.datasets._readers import _Reader + + class MissingDeps(_Reader): + def __init__(self, name) -> None: + self._name = name + reqs = Requirement(name) + for req in (reqs.name, *reqs.extras): + self._import(req) + + self._read_fn = {} + self._scan_fn = {} + + fake_name = "not_a_real_package" + real_name = "altair" + fake_extra = "AnotherFakePackage" + backend = f"{real_name}[{fake_extra}]" + with pytest.raises( + ModuleNotFoundError, + match=re.compile( + rf"{fake_name}.+requires.+{fake_name}.+but.+{fake_name}.+not.+found.+pip install {fake_name}", + flags=re.DOTALL, + ), + ): + MissingDeps(fake_name) + with pytest.raises( + ModuleNotFoundError, + match=re.compile( + rf"{re.escape(backend)}.+requires.+'{real_name}', '{fake_extra}'.+but.+{fake_extra}.+not.+found.+pip install {fake_extra}", + flags=re.DOTALL, + ), + ): + MissingDeps(backend) @backends @@ -451,97 +418,112 @@ def test_reader_cache( monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) - data = Loader.from_backend(backend) - assert data.cache.is_active() - cache_dir = data.cache.path + load = Loader.from_backend(backend) + assert load.cache.is_active() + 
cache_dir = load.cache.path assert cache_dir == tmp_path - assert tuple(data.cache) == () + assert tuple(load.cache) == () # smallest csvs - lookup_groups = data("lookup_groups") - data("lookup_people") - data("iowa-electricity") - data("global-temp") + lookup_groups = load("lookup_groups") + load("lookup_people") + load("iowa-electricity") + load("global-temp") - cached_paths = tuple(data.cache) + cached_paths = tuple(load.cache) assert len(cached_paths) == 4 if nw_dep.is_polars_dataframe(lookup_groups): left, right = ( lookup_groups, - cast("pl.DataFrame", data("lookup_groups", ".csv")), + cast("pl.DataFrame", load("lookup_groups", ".csv")), ) else: left, right = ( pl.DataFrame(lookup_groups), - pl.DataFrame(data("lookup_groups", ".csv")), + pl.DataFrame(load("lookup_groups", ".csv")), ) assert_frame_equal(left, right) - assert len(tuple(data.cache)) == 4 - assert cached_paths == tuple(data.cache) + assert len(tuple(load.cache)) == 4 + assert cached_paths == tuple(load.cache) - data("iowa-electricity", ".csv") - data("global-temp", ".csv") - data("global-temp.csv") + load("iowa-electricity", ".csv") + load("global-temp", ".csv") + load("global-temp.csv") - assert len(tuple(data.cache)) == 4 - assert cached_paths == tuple(data.cache) + assert len(tuple(load.cache)) == 4 + assert cached_paths == tuple(load.cache) - data("lookup_people") - data("lookup_people.csv") - data("lookup_people", ".csv") - data("lookup_people") + load("lookup_people") + load("lookup_people.csv") + load("lookup_people", ".csv") + load("lookup_people") - assert len(tuple(data.cache)) == 4 - assert cached_paths == tuple(data.cache) + assert len(tuple(load.cache)) == 4 + assert cached_paths == tuple(load.cache) -@slow @datasets_debug @backends def test_reader_cache_exhaustive( - backend: _Backend, monkeypatch: pytest.MonkeyPatch, tmp_path: Path + backend: _Backend, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + polars_loader: PolarsLoader, ) -> None: """ Fully populate and then purge the cache for all backends. 
- Does not attempt to read the files - Checking we can support pre-downloading and safely deleting + + Notes + ----- + - Requests work the same for all backends + - The logic for detecting the cache contents uses ``narhwals`` + - Here, we're testing that these ``narwhals`` ops are consistent + - `DatasetCache.download_all` is expensive for CI, so aiming for it to run at most once + - 34-45s per call (4x backends) """ + polars_loader.cache.download_all() + CLONED: Path = tmp_path / "clone" + fs.mkdir(CLONED) + fs.copytree(polars_loader.cache.path, CLONED) + monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) - data = Loader.from_backend(backend) - assert data.cache.is_active() - cache_dir = data.cache.path + load = Loader.from_backend(backend) + assert load.cache.is_active() + cache_dir = load.cache.path assert cache_dir == tmp_path - assert tuple(data.cache) == () + assert tuple(load.cache) == (CLONED,) - data.cache.download_all() - cached_paths = tuple(data.cache) + load.cache.path = CLONED + cached_paths = tuple(load.cache) assert cached_paths != () # NOTE: Approximating all datasets downloaded assert len(cached_paths) >= 40 assert all( bool(fp.exists() and is_ext_read(fp.suffix) and fp.stat().st_size) - for fp in data.cache + for fp in load.cache ) # NOTE: Confirm this is a no-op - data.cache.download_all() - assert len(cached_paths) == len(tuple(data.cache)) + load.cache.download_all() + assert len(cached_paths) == len(tuple(load.cache)) # NOTE: Ensure unrelated files in the directory are not removed dummy: Path = tmp_path / "dummy.json" dummy.touch(exist_ok=False) - data.cache.clear() + load.cache.clear() remaining = tuple(tmp_path.iterdir()) - assert len(remaining) == 1 - assert remaining[0] == dummy - dummy.unlink() + assert set(remaining) == {dummy, CLONED} + fs.rm(dummy, CLONED) +@no_xdist def test_reader_cache_disable(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: from altair.datasets import load @@ -572,68 +554,66 @@ def test_reader_cache_disable(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - assert not load.cache.is_empty() -movies_fail: ParameterSet = pytest.param( - "movies", - marks=pytest.mark.xfail( - reason="Only working for `polars`.\n" - "`pyarrow` isn't happy with the mixed `int`/`str` column." - ), -) -earthquakes_fail: ParameterSet = pytest.param( - "earthquakes", - marks=pytest.mark.xfail( - reason="Only working for `polars`.\nGeoJSON fails on native `pyarrow`" - ), -) - - +# TODO: Investigate adding schemas for `pyarrow`. @pytest.mark.parametrize( - "name", + ("name", "fallback"), [ - "cars", - movies_fail, - "wheat", - "barley", - "gapminder", - "income", - "burtin", - earthquakes_fail, + ("cars", "polars"), + ("movies", "polars"), + ("wheat", "polars"), + ("barley", "polars"), + ("gapminder", "polars"), + ("income", "polars"), + ("burtin", "polars"), + ("cars", None), + pytest.param( + "movies", + None, + marks=pytest.mark.xfail( + True, + raises=TypeError, + reason=( + "msg: `Expected bytes, got a 'int' object`\n" + "Isn't happy with the mixed `int`/`str` column." 
+ ), + strict=True, + ), + ), + ("wheat", None), + ("barley", None), + ("gapminder", None), + ("income", None), + ("burtin", None), ], ) -@pytest.mark.parametrize("fallback", ["polars", None]) -@skip_requires_pyarrow +@backends_pyarrow def test_pyarrow_read_json( - fallback: _Polars | None, name: Dataset, monkeypatch: pytest.MonkeyPatch + backend: _PyArrow, + fallback: _Polars | None, + name: Dataset, + monkeypatch: pytest.MonkeyPatch, ) -> None: - monkeypatch.delenv(CACHE_ENV_VAR, raising=False) - monkeypatch.delitem(sys.modules, "pandas", raising=False) if fallback is None: monkeypatch.setitem(sys.modules, "polars", None) - - data = Loader.from_backend("pyarrow") - - data(name, ".json") + load = Loader.from_backend(backend) + assert load(name, ".json") -@pytest.mark.parametrize( - ("spec", "column"), - [ - (DatasetSpec(name="cars"), "Year"), - (DatasetSpec(name="unemployment-across-industries"), "date"), - (DatasetSpec(name="flights-10k"), "date"), - (DatasetSpec(name="football"), "date"), - (DatasetSpec(name="crimea"), "date"), - (DatasetSpec(name="ohlc"), "date"), - ], -) -def test_polars_read_json_roundtrip( - polars_loader: PolarsLoader, spec: DatasetSpec, column: str -) -> None: - frame = polars_loader(spec["name"], ".json") - tp = frame.schema.to_python()[column] - assert tp is dt.date or issubclass(tp, dt.date) +@backends_no_polars +def test_spatial(spatial_datasets, backend: _Backend) -> None: + load = Loader.from_backend(backend) + if is_polars_backed_pyarrow(load): + assert nw_dep.is_pyarrow_table(load(spatial_datasets)) + else: + pattern = re.compile( + rf"{spatial_datasets}.+geospatial.+native.+{re.escape(backend)}.+try.+polars.+url", + flags=re.DOTALL | re.IGNORECASE, + ) + with pytest.raises(NotImplementedError, match=pattern): + load(spatial_datasets) +# TODO: Adapt into something useful or simplify into just param name def _dataset_params(*, skip: Container[str] = ()) -> Iterator[ParameterSet]: """Temp way of excluding datasets that were removed.""" names: tuple[Dataset, ...] 
= get_args(Dataset) @@ -646,9 +626,8 @@ def _dataset_params(*, skip: Container[str] = ()) -> Iterator[ParameterSet]: yield pytest.param(*args, marks=marks) -@slow -@datasets_debug @pytest.mark.parametrize(("name", "suffix"), list(_dataset_params())) +@datasets_debug def test_all_datasets( polars_loader: PolarsLoader, name: Dataset, suffix: Extension ) -> None: @@ -668,51 +647,62 @@ def _raise_exception(e: type[Exception], *args: Any, **kwds: Any): def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: from polars.testing import assert_frame_equal - data = Loader.from_backend("polars") - data.cache.path = tmp_path + load = Loader.from_backend("polars") + load.cache.path = tmp_path - data("londonCentroids") - data("stocks") - data("driving") + load("londonCentroids") + load("stocks") + load("driving") cached_paths = tuple(tmp_path.iterdir()) assert len(cached_paths) == 3 raiser = partial(_raise_exception, URLError) with monkeypatch.context() as mp: - mp.setattr(data._reader._opener, "open", raiser) + mp.setattr(load._reader._opener, "open", raiser) # Existing cache entries don't trigger an error - data("londonCentroids") - data("stocks") - data("driving") + load("londonCentroids") + load("stocks") + load("driving") # Mocking cache-miss without remote conn with pytest.raises(URLError): - data("birdstrikes") + load("birdstrikes") assert len(tuple(tmp_path.iterdir())) == 3 # Now we can get a cache-hit - frame = data("birdstrikes") + frame = load("birdstrikes") assert nw_dep.is_polars_dataframe(frame) assert len(tuple(tmp_path.iterdir())) == 4 with monkeypatch.context() as mp: - mp.setattr(data._reader._opener, "open", raiser) + mp.setattr(load._reader._opener, "open", raiser) # Here, the remote conn isn't considered - we already have the file - frame_from_cache = data("birdstrikes") + frame_from_cache = load("birdstrikes") assert len(tuple(tmp_path.iterdir())) == 4 assert_frame_equal(frame, frame_from_cache) -@backends -def test_metadata_columns(backend: _Backend, metadata_columns: frozenset[str]) -> None: - """Ensure all backends will query the same column names.""" - data = Loader.from_backend(backend) - schema_columns = data._reader._scan_metadata().collect().columns - assert set(schema_columns) == metadata_columns +@pytest.mark.parametrize( + ("name", "column"), + [ + ("cars", "Year"), + ("unemployment-across-industries", "date"), + ("flights-10k", "date"), + ("football", "date"), + ("crimea", "date"), + ("ohlc", "date"), + ], +) +def test_polars_date_read_json_roundtrip( + polars_loader: PolarsLoader, name: Dataset, column: str +) -> None: + """Ensure ``date`` columns are inferred using the roundtrip json -> csv method.""" + frame = polars_loader(name, ".json") + tp = frame.schema.to_python()[column] + assert tp is dt.date or issubclass(tp, dt.date) -@skip_requires_pyarrow @backends_pandas_any @pytest.mark.parametrize( ("name", "columns"), From d64dbee607006108ca617b1e4f5f6240c79c0727 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 21 Jan 2025 18:42:27 +0000 Subject: [PATCH 173/201] refactor: Reuse `tools.fs` more, fix `app.(read|scan)` Using only `.parquet` was relevant in earlier versions that produced multiple `.parquet` files Now these methods safely handle all formats in use --- tools/datasets/__init__.py | 40 +++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index faf5e8d96..64940ebc1 
100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -22,9 +22,9 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Literal +from tools import fs from tools.codemod import ruff from tools.datasets.npm import Npm -from tools.fs import REPO_ROOT from tools.schemapi import utils if TYPE_CHECKING: @@ -60,7 +60,7 @@ class Application: Parameters ---------- out_dir_tools, out_dir_altair - Directories to store ``.parquet`` metadata files. + Directories to store metadata files. out_fp_typing Path to write metadata-derived typing module. @@ -72,7 +72,7 @@ class Application: def __init__( self, out_dir_tools: Path, out_dir_altair: Path, out_fp_typing: Path ) -> None: - out_dir_tools.mkdir(exist_ok=True) + fs.mkdir(out_dir_tools) METADATA = "metadata" self.paths = types.MappingProxyType["_PathAlias", Path]( { @@ -102,7 +102,7 @@ def refresh( include_typing Regenerate ``altair.datasets._typing``. frozen - Don't perform any requests or attempt to check for new versions. + Don't perform any requests. .. note:: **Temporary** measure to work from ``main`` until `vega-datasets@3`_. @@ -123,20 +123,28 @@ def refresh( def reset(self) -> None: """Remove all metadata files.""" - for fp in self.paths.values(): - fp.unlink(missing_ok=True) + fs.rm(*self.paths.values()) def read(self, name: _PathAlias, /) -> pl.DataFrame: """Read existing metadata from file.""" - import polars as pl - - return pl.read_parquet(self.paths[name]) + return self.scan(name).collect() def scan(self, name: _PathAlias, /) -> pl.LazyFrame: """Scan existing metadata from file.""" import polars as pl - return pl.scan_parquet(self.paths[name]) + fp = self.paths[name] + if fp.suffix == ".parquet": + return pl.scan_parquet(fp) + elif ".csv" in fp.suffixes: + return pl.scan_csv(fp) + elif ".json" in fp.suffixes: + return pl.read_json(fp).lazy() + else: + msg = ( + f"Unable to read {fp.name!r} as tabular data.\nSuffixes: {fp.suffixes}" + ) + raise NotImplementedError(msg) def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: """ @@ -152,8 +160,7 @@ def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> Non """ if fp.suffix != ".gz": fp = fp.with_suffix(".csv.gz") - if not fp.exists(): - fp.touch() + fp.touch() df = frame.lazy().collect() buf = BytesIO() with gzip.GzipFile(fp, mode="wb", mtime=0) as f: @@ -169,16 +176,13 @@ def write_json_gzip(self, obj: Any, fp: Path, /) -> None: """ if fp.suffix != ".gz": fp = fp.with_suffix(".json.gz") - if not fp.exists(): - fp.touch() - + fp.touch() with gzip.GzipFile(fp, mode="wb", mtime=0) as f: f.write(json.dumps(obj).encode()) def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: """Write ``frame`` to ``fp``, with some extra safety.""" - if not fp.exists(): - fp.touch() + fp.touch() df = frame.lazy().collect() df.write_parquet(fp, compression="zstd", compression_level=17) @@ -233,7 +237,7 @@ def generate_typing(self, dpkg: datapackage.DataPackage) -> None: ruff.write_lint_format(self.paths["typing"], contents) -_alt_datasets = REPO_ROOT / "altair" / "datasets" +_alt_datasets = fs.REPO_ROOT / "altair" / "datasets" app = Application( Path(__file__).parent / "_metadata", _alt_datasets / "_metadata", From 0c72435e09ab6263c48bb566c733deb11f927bd9 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 22 Jan 2025 12:30:07 +0000 Subject: [PATCH 174/201] feat(typing): Set `"polars"` as default in `Loader.from_backend` Without a default, I found 
that VSCode was always suggesting the **last** overload first (`"pyarrow"`) This is a bad suggestion, as it provides the *worst native* experience. The default now aligns with the backend providing the *best native* experience --- altair/datasets/_loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 0bb91aa1f..8417e2d6a 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -48,7 +48,7 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): @overload @classmethod def from_backend( - cls, backend_name: Literal["polars"], / + cls, backend_name: Literal["polars"] = ..., / ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... @overload @@ -64,7 +64,7 @@ def from_backend( ) -> Loader[pa.Table, pa.Table]: ... @classmethod - def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: + def from_backend(cls, backend_name: _Backend = "polars", /) -> Loader[Any, Any]: """ Initialize a new loader, with the specified backend. From 8e4c168c6942ace581f7a03026ae0271afdd9871 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 22 Jan 2025 12:54:29 +0000 Subject: [PATCH 175/201] docs: Adds module-level doc to `altair.datasets` - Multiple **brief** examples, for a taste of the public API - See (#3763) - Refs to everywhere a first-time user may need help from - Also aligned the (`Loader`|`load`) docs w/ eachother and the new phrasing here --- altair/datasets/__init__.py | 65 ++++++++++++++++++++++++++++++++++++- altair/datasets/_loader.py | 2 +- 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index cc6a07d32..01dc35212 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -1,3 +1,64 @@ +""" +Load example datasets **remotely** from `vega-datasets`_. + +Provides over **70+** datasets, used throughout our `Example Gallery`_. + +You can learn more about each dataset at `datapackage.md`_. + +Examples +-------- +Load a dataset as a ``DataFrame``/``Table``:: + + from altair.datasets import load + + load("cars") + +.. note:: + Requires installation of either `polars`_, `pandas`_, or `pyarrow`_. + +Get the remote address of a dataset and use directly in a :class:`altair.Chart`:: + + import altair as alt + from altair.datasets import url + + source = url("co2-concentration") + alt.Chart(source).mark_line(tooltip=True).encode(x="Date:T", y="CO2:Q") + +.. note:: + Works without any additional dependencies. + +For greater control over the backend library use:: + + from altair.datasets import Loader + + load = Loader.from_backend("polars") + load("penguins") + load.url("penguins") + +This method also provides *precise* Tab completions on the returned object:: + + load("cars"). + # bottom_k + # drop + # drop_in_place + # drop_nans + # dtypes + # ... + +.. _vega-datasets: + https://github.com/vega/vega-datasets +.. _Example Gallery: + https://altair-viz.github.io/gallery/index.html#example-gallery +.. _datapackage.md: + https://github.com/vega/vega-datasets/blob/main/datapackage.md +.. _polars: + https://docs.pola.rs/user-guide/installation/ +.. _pandas: + https://pandas.pydata.org/docs/getting_started/install.html +.. 
_pyarrow: + https://arrow.apache.org/docs/python/install.html +""" + from __future__ import annotations from typing import TYPE_CHECKING @@ -22,7 +83,9 @@ load: _Load[Any, Any] """ -For full IDE completions, instead use: +Get a remote dataset and load as tabular data. + +For full Tab completions, instead use: from altair.datasets import Loader load = Loader.from_backend("polars") diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 8417e2d6a..6c359edb2 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -29,7 +29,7 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): """ - Load examples **remotely** from `vega-datasets`_, with caching. + Load example datasets **remotely** from `vega-datasets`_, with caching. A new ``Loader`` must be initialized by specifying a backend: From 106f8bb40ecd59a18606c29fadd97094cf75c968 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 22 Jan 2025 14:37:08 +0000 Subject: [PATCH 176/201] test: Clean up `test_datasets` - Reduce superfluous docs - Format/reorganize remaining docs - Follow up on some comments Misc style changes --- tests/test_datasets.py | 177 +++++++++++------------------------------ 1 file changed, 48 insertions(+), 129 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index b212d79ce..0855b73af 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -16,12 +16,12 @@ from altair.datasets import Loader from altair.datasets._exceptions import AltairDatasetsError -from altair.datasets._typing import Dataset, Extension, Metadata, is_ext_read +from altair.datasets._typing import Dataset, Metadata, is_ext_read from tests import no_xdist, skip_requires_pyarrow from tools import fs if TYPE_CHECKING: - from collections.abc import Callable, Container, Iterator, Mapping + from collections.abc import Callable, Mapping from pathlib import Path from typing import Literal @@ -31,7 +31,6 @@ from altair.datasets._readers import _Backend, _PandasAny, _Polars, _PyArrow from altair.vegalite.v5.schema._typing import OneOrSeq - from tests import MarksType if sys.version_info >= (3, 10): from typing import TypeAlias @@ -39,7 +38,18 @@ from typing_extensions import TypeAlias PolarsLoader: TypeAlias = Loader[pl.DataFrame, pl.LazyFrame] -CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" +datasets_debug: pytest.MarkDecorator = pytest.mark.datasets_debug() +""" +Custom ``pytest.mark`` decorator. + +Use for more exhaustive tests that require many requests. + +**Disabled** by default in ``pyproject.toml``: + + [tool.pytest.ini_options] + addopts = ... +""" + _backend_params: Mapping[_Backend, ParameterSet] = { "polars": pytest.param("polars"), "pandas": pytest.param("pandas"), @@ -60,52 +70,26 @@ "backend", [v for k, v in _backend_params.items() if k == "pyarrow"] ) -datasets_debug: pytest.MarkDecorator = pytest.mark.datasets_debug() -""" -Custom ``pytest.mark`` decorator. - -Use for more exhaustive tests that require many requests. - -**Disabled** by default in ``pyproject.toml``: +datasets_all: pytest.MarkDecorator = pytest.mark.parametrize("name", get_args(Dataset)) +datasets_spatial: pytest.MarkDecorator = pytest.mark.parametrize( + "name", ["earthquakes", "londonBoroughs", "londonTubeLines", "us-10m", "world-110m"] +) - [tool.pytest.ini_options] - addopts = ... 
-""" +CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" @pytest.fixture(scope="session") def polars_loader() -> PolarsLoader: + """Fastest and **most reliable** backend.""" load = Loader.from_backend("polars") if load.cache.is_not_active(): load.cache.path = load.cache._XDG_CACHE return load -@pytest.fixture( - params=("earthquakes", "londonBoroughs", "londonTubeLines", "us-10m", "world-110m") -) -def spatial_datasets(request: pytest.FixtureRequest) -> Dataset: - return request.param - - @pytest.fixture def metadata_columns() -> frozenset[str]: - """ - Returns all defined keys ``Metadata`` (``TypedDict``). - - Note - ---- - - ``# type: ignore``(s) are to fix a false positive. - - Should be recognised by this stub `typing_extensions.pyi`_ - - .. _typing_extensions.pyi: - https://github.com/python/typeshed/blob/51d0f0194c27347ab7d0083bd7b11210a09fef75/stdlib/typing_extensions.pyi#L222-L229 - """ - return Metadata.__required_keys__.union( - Metadata.__optional_keys__, - Metadata.__readonly_keys__, # type: ignore[attr-defined] - Metadata.__mutable_keys__, # type: ignore[attr-defined] - ) + return Metadata.__required_keys__.union(Metadata.__optional_keys__) def is_frame_backend(frame: Any, backend: _Backend, /) -> bool: @@ -131,13 +115,8 @@ def is_url(name: Dataset, fn_url: Callable[..., str], /) -> bool: def is_polars_backed_pyarrow(loader: Loader[Any, Any], /) -> bool: - """ - User requested ``pyarrow``, but also has ``polars`` installed. - - Notes - ----- - - Currently, defers to ``polars`` only for ``.json``. - """ + """User requested ``pyarrow``, but also has ``polars`` installed.""" + # NOTE: Would prefer if there was a *less* private method to test this. return bool( is_loader_backend(loader, "pyarrow") and (fn := getattr(loader._reader, "_read_json_polars", None)) @@ -168,18 +147,17 @@ def test_loader_url(backend: _Backend) -> None: @no_xdist def test_load_infer_priority(monkeypatch: pytest.MonkeyPatch) -> None: """ - Inferring the best backend available. - - Based on the following order: + Ensure the **most reliable**, available backend is selected. 
- priority: Sequence[_Backend] = "polars", "pandas[pyarrow]", "pandas", "pyarrow" + See Also + -------- + ``altair.datasets._readers.infer_backend`` """ import altair.datasets._loader from altair.datasets import load assert is_loader_backend(load, "polars") monkeypatch.delattr(altair.datasets._loader, "load", raising=False) - monkeypatch.setitem(sys.modules, "polars", None) from altair.datasets import load @@ -194,14 +172,12 @@ def test_load_infer_priority(monkeypatch: pytest.MonkeyPatch) -> None: else: assert is_loader_backend(load, "pandas[pyarrow]") monkeypatch.delattr(altair.datasets._loader, "load") - monkeypatch.setitem(sys.modules, "pyarrow", None) from altair.datasets import load assert is_loader_backend(load, "pandas") monkeypatch.delattr(altair.datasets._loader, "load") - monkeypatch.setitem(sys.modules, "pandas", None) monkeypatch.delitem(sys.modules, "pyarrow") monkeypatch.setitem(sys.modules, "pyarrow", import_module("pyarrow")) @@ -223,11 +199,9 @@ def test_load_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: from altair.datasets import load assert is_loader_backend(load, "polars") - default = load("cars") df = load("cars", backend=backend) default_2 = load("cars") - assert nw_dep.is_polars_dataframe(default) assert is_frame_backend(df, backend) assert nw_dep.is_polars_dataframe(default_2) @@ -276,7 +250,6 @@ def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: from altair.datasets._readers import infer_backend priority: Any = ("fake_mod_1", "fake_mod_2", "fake_mod_3", "fake_mod_4") - assert csv_cache._mapping == {} with pytest.raises(AltairDatasetsError): infer_backend(priority=priority) @@ -284,7 +257,6 @@ def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: url = csv_cache.url assert is_url("jobs", url) assert csv_cache._mapping != {} - assert is_url("cars", url) assert is_url("stocks", url) assert is_url("countries", url) @@ -295,7 +267,6 @@ def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: assert is_url("flare", url) assert is_url("flights-10k", url) assert is_url("flights-200k", url) - if find_spec("vegafusion"): assert is_url("flights-3m", url) @@ -303,7 +274,6 @@ def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: mp.setitem(sys.modules, "vegafusion", None) with pytest.raises(AltairDatasetsError, match=r".parquet.+require.+vegafusion"): url("flights-3m") - with pytest.raises( TypeError, match="'fake data' does not refer to a known dataset" ): @@ -321,17 +291,12 @@ def test_loader_call(backend: _Backend) -> None: @backends def test_dataset_not_found(backend: _Backend) -> None: - """ - Various queries that should **always raise** due to non-existent dataset. - - ``Loader.url`` is used since it doesn't require a remote connection. 
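A usage sketch of the backend-free ``url`` path exercised above (the version pinned in the returned address may differ)::

    from altair.datasets import url

    url("cars")
    # 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.11.0/data/cars.json'

    # ".parquet" resources additionally require `vegafusion`; without it,
    # asking for their url raises AltairDatasetsError.
    url("flights-3m")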
- """ + """Various queries that should **always raise** due to non-existent dataset.""" load = Loader.from_backend(backend) real_name: Literal["disasters"] = "disasters" invalid_name: Literal["fake name"] = "fake name" invalid_suffix: Literal["fake suffix"] = "fake suffix" incorrect_suffix: Literal[".json"] = ".json" - ERR_NO_RESULT = ValueError MSG_NO_RESULT = "Found no results for" NAME = "dataset_name" @@ -342,7 +307,6 @@ def test_dataset_not_found(backend: _Backend) -> None: match=re.compile(rf"{MSG_NO_RESULT}.+{NAME}.+{invalid_name}", re.DOTALL), ): load.url(invalid_name) - with pytest.raises( TypeError, match=re.compile( @@ -351,7 +315,6 @@ def test_dataset_not_found(backend: _Backend) -> None: ), ): load.url(real_name, invalid_suffix) # type: ignore[arg-type] - with pytest.raises( ERR_NO_RESULT, match=re.compile( @@ -403,26 +366,15 @@ def __init__(self, name) -> None: def test_reader_cache( backend: _Backend, monkeypatch: pytest.MonkeyPatch, tmp_path: Path ) -> None: - """ - Using a sample of the smallest datasets, make *"requests"* that are all caught by prior hits. - - Note - ---- - `tmp_path`_ is a built-in fixture. - - .. _tmp_path: - https://docs.pytest.org/en/stable/getting-started.html#request-a-unique-temporary-directory-for-functional-tests - """ + """Ensure cache hits avoid network activity.""" import polars as pl from polars.testing import assert_frame_equal monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) - load = Loader.from_backend(backend) assert load.cache.is_active() cache_dir = load.cache.path assert cache_dir == tmp_path - assert tuple(load.cache) == () # smallest csvs @@ -430,7 +382,6 @@ def test_reader_cache( load("lookup_people") load("iowa-electricity") load("global-temp") - cached_paths = tuple(load.cache) assert len(cached_paths) == 4 @@ -448,19 +399,15 @@ def test_reader_cache( assert_frame_equal(left, right) assert len(tuple(load.cache)) == 4 assert cached_paths == tuple(load.cache) - load("iowa-electricity", ".csv") load("global-temp", ".csv") load("global-temp.csv") - assert len(tuple(load.cache)) == 4 assert cached_paths == tuple(load.cache) - load("lookup_people") load("lookup_people.csv") load("lookup_people", ".csv") load("lookup_people") - assert len(tuple(load.cache)) == 4 assert cached_paths == tuple(load.cache) @@ -476,15 +423,14 @@ def test_reader_cache_exhaustive( """ Fully populate and then purge the cache for all backends. 
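Outside the test-suite, the same cache controls look roughly like this (the directory is illustrative)::

    import os
    from pathlib import Path

    os.environ["ALTAIR_DATASETS_DIR"] = str(Path.home() / ".altair_cache")

    from altair.datasets import load

    assert load.cache.is_active()
    load.cache.download_all()  # prefetch every dataset; ~35-45s, needs a connection
    load("cars")               # later calls resolve from disk
    load.cache.clear()         # delete all cached datasets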
- - Does not attempt to read the files - - Checking we can support pre-downloading and safely deleting - Notes ----- - - Requests work the same for all backends - - The logic for detecting the cache contents uses ``narhwals`` - - Here, we're testing that these ``narwhals`` ops are consistent - - `DatasetCache.download_all` is expensive for CI, so aiming for it to run at most once + - Does not attempt to read the files + - Checking we can support pre-downloading and safely deleting + - Requests work the same for all backends + - The logic for detecting the cache contents uses ``narhwals`` + - Here, we're testing that these ``narwhals`` ops are consistent + - `DatasetCache.download_all` is expensive for CI, so aiming for it to run **at most once** - 34-45s per call (4x backends) """ polars_loader.cache.download_all() @@ -498,13 +444,12 @@ def test_reader_cache_exhaustive( cache_dir = load.cache.path assert cache_dir == tmp_path assert tuple(load.cache) == (CLONED,) - load.cache.path = CLONED cached_paths = tuple(load.cache) assert cached_paths != () # NOTE: Approximating all datasets downloaded - assert len(cached_paths) >= 40 + assert len(cached_paths) >= 70 assert all( bool(fp.exists() and is_ext_read(fp.suffix) and fp.stat().st_size) for fp in load.cache @@ -528,16 +473,13 @@ def test_reader_cache_disable(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - from altair.datasets import load monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) - assert load.cache.is_active() assert load.cache.path == tmp_path assert load.cache.is_empty() load("cars") assert not load.cache.is_empty() - - # RELATED: https://github.com/python/mypy/issues/3004 + # ISSUE: https://github.com/python/mypy/issues/3004 load.cache.path = None # type: ignore[assignment] - assert load.cache.is_not_active() with pytest.raises( ValueError, @@ -546,9 +488,7 @@ def test_reader_cache_disable(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - ), ): tuple(load.cache) - load.cache.path = tmp_path - assert load.cache.is_active() assert load.cache.path == tmp_path assert not load.cache.is_empty() @@ -599,44 +539,29 @@ def test_pyarrow_read_json( assert load(name, ".json") +@datasets_spatial @backends_no_polars -def test_spatial(spatial_datasets, backend: _Backend) -> None: +def test_spatial(backend: _Backend, name: Dataset) -> None: load = Loader.from_backend(backend) if is_polars_backed_pyarrow(load): - assert nw_dep.is_pyarrow_table(load(spatial_datasets)) + assert nw_dep.is_pyarrow_table(load(name)) else: pattern = re.compile( - rf"{spatial_datasets}.+geospatial.+native.+{re.escape(backend)}.+try.+polars.+url", + rf"{name}.+geospatial.+native.+{re.escape(backend)}.+try.+polars.+url", flags=re.DOTALL | re.IGNORECASE, ) with pytest.raises(NotImplementedError, match=pattern): - load(spatial_datasets) - + load(name) -# TODO: Adapt into something useful or simplify into just param name -def _dataset_params(*, skip: Container[str] = ()) -> Iterator[ParameterSet]: - """Temp way of excluding datasets that were removed.""" - names: tuple[Dataset, ...] 
= get_args(Dataset) - args: tuple[Dataset, Extension | None] - for name in names: - marks: MarksType = () - if name in skip: - continue - args = name, None - yield pytest.param(*args, marks=marks) - -@pytest.mark.parametrize(("name", "suffix"), list(_dataset_params())) +@datasets_all @datasets_debug -def test_all_datasets( - polars_loader: PolarsLoader, name: Dataset, suffix: Extension -) -> None: - """Ensure all annotated datasets can be loaded with the most reliable backend.""" +def test_all_datasets(polars_loader: PolarsLoader, name: Dataset) -> None: if name in {"7zip", "ffox", "gimp"}: with pytest.raises(AltairDatasetsError, match=rf"{name}.+tabular"): - polars_loader(name, suffix) + polars_loader(name) else: - frame = polars_loader(name, suffix) + frame = polars_loader(name) assert nw_dep.is_polars_dataframe(frame) @@ -649,14 +574,11 @@ def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - load = Loader.from_backend("polars") load.cache.path = tmp_path - load("londonCentroids") load("stocks") load("driving") - cached_paths = tuple(tmp_path.iterdir()) assert len(cached_paths) == 3 - raiser = partial(_raise_exception, URLError) with monkeypatch.context() as mp: mp.setattr(load._reader._opener, "open", raiser) @@ -679,7 +601,6 @@ def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - # Here, the remote conn isn't considered - we already have the file frame_from_cache = load("birdstrikes") assert len(tuple(tmp_path.iterdir())) == 4 - assert_frame_equal(frame, frame_from_cache) @@ -731,12 +652,12 @@ def test_pandas_date_parse( """ Ensure schema defaults are correctly parsed. - NOTE: + Notes + ----- - Depends on ``frictionless`` being able to detect the date/datetime columns. - Not all format strings work """ date_columns: list[str] = [columns] if isinstance(columns, str) else list(columns) - load = Loader.from_backend(backend) url = load.url(name) kwds: dict[str, Any] = ( @@ -745,10 +666,8 @@ def test_pandas_date_parse( else {"parse_dates": date_columns} ) kwds_empty: dict[str, Any] = {k: [] for k in kwds} - df_schema_derived: pd.DataFrame = load(name) nw_schema = nw.from_native(df_schema_derived).schema - df_manually_specified: pd.DataFrame = load(name, **kwds) df_dates_empty: pd.DataFrame = load(name, **kwds_empty) From c3c2edaa64be37547da3f467e453af1ee8c4ba60 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 22 Jan 2025 17:42:11 +0000 Subject: [PATCH 177/201] docs: Make `sphinx` happy with docs These changes are very minor in VSCode, but fix a lot of rendering issues on the website --- altair/datasets/__init__.py | 23 ++++------- altair/datasets/_cache.py | 21 +++++----- altair/datasets/_loader.py | 78 ++++++++++++++++++------------------- doc/user_guide/api.rst | 16 ++++++++ tools/generate_api_docs.py | 19 +++++++++ 5 files changed, 92 insertions(+), 65 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 01dc35212..3c61eda0b 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -1,5 +1,5 @@ """ -Load example datasets **remotely** from `vega-datasets`_. +Load example datasets *remotely* from `vega-datasets`_. Provides over **70+** datasets, used throughout our `Example Gallery`_. @@ -85,24 +85,18 @@ """ Get a remote dataset and load as tabular data. 
-For full Tab completions, instead use: +For full Tab completions, instead use:: from altair.datasets import Loader load = Loader.from_backend("polars") cars = load("cars") movies = load("movies") -Alternatively, specify ``backend`` during a call: +Alternatively, specify ``backend`` during a call:: from altair.datasets import load cars = load("cars", backend="polars") movies = load("movies", backend="polars") - -Related -------- -- https://github.com/vega/altair/pull/3631#issuecomment-2480832609 -- https://github.com/vega/altair/pull/3631#discussion_r1847111064 -- https://github.com/vega/altair/pull/3631#discussion_r1847176465 """ @@ -124,17 +118,14 @@ def url( .. note:: Only needed if ``name`` is available in multiple formats. + Returns + ------- + ``str`` + .. _Path.stem: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem .. _Path.suffix: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - - Related - ------- - - https://github.com/vega/altair/pull/3631#issuecomment-2484826592 - - https://github.com/vega/altair/pull/3631#issuecomment-2480832711 - - https://github.com/vega/altair/discussions/3150#discussioncomment-11280516 - - https://github.com/vega/altair/pull/3631#discussion_r1846662053 """ from altair.datasets._exceptions import AltairDatasetsError diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 08016d622..a415a8380 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -317,26 +317,27 @@ def path(self) -> Path: """ Returns path to datasets cache. - Defaults to (`XDG_CACHE_HOME`_): + Defaults to (`XDG_CACHE_HOME`_):: "$XDG_CACHE_HOME/altair/" - But can be configured using the environment variable: + But can be configured using the environment variable:: "$ALTAIR_DATASETS_DIR" - You can set this for the current session via: + You can set this for the current session via:: - >>> from pathlib import Path - >>> from altair.datasets import load - >>> load.cache.path = Path.home() / ".altair_cache" + from pathlib import Path + from altair.datasets import load - >>> load.cache.path.relative_to(Path.home()).as_posix() - '.altair_cache' + load.cache.path = Path.home() / ".altair_cache" - You can *later* disable caching via: + load.cache.path.relative_to(Path.home()).as_posix() + ".altair_cache" - >>> load.cache.path = None + You can *later* disable caching via:: + + load.cache.path = None .. _XDG_CACHE_HOME: https://specifications.freedesktop.org/basedir-spec/latest/#variables diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 6c359edb2..8f13ab2de 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -29,14 +29,14 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): """ - Load example datasets **remotely** from `vega-datasets`_, with caching. + Load example datasets *remotely* from `vega-datasets`_, with caching. - A new ``Loader`` must be initialized by specifying a backend: + A new ``Loader`` must be initialized by specifying a backend:: from altair.datasets import Loader load = Loader.from_backend("polars") - >>> load # doctest: +SKIP + load Loader[polars] .. _vega-datasets: @@ -81,42 +81,35 @@ def from_backend(cls, backend_name: _Backend = "polars", /) -> Loader[Any, Any]: .. warning:: Most datasets use a `JSON format not supported`_ by ``pyarrow`` - .. _polars defaults: - https://docs.pola.rs/api/python/stable/reference/io.html - .. _pandas defaults: - https://pandas.pydata.org/docs/reference/io.html - .. 
_JSON format not supported: - https://arrow.apache.org/docs/python/json.html#reading-json-files - Examples -------- - Using ``polars``: + Using ``polars``:: from altair.datasets import Loader load = Loader.from_backend("polars") cars = load("cars") - >>> type(cars) # doctest: +SKIP + type(cars) polars.dataframe.frame.DataFrame - Using ``pandas``: + Using ``pandas``:: load = Loader.from_backend("pandas") cars = load("cars") - >>> type(cars) # doctest: +SKIP + type(cars) pandas.core.frame.DataFrame - Using ``pandas``, backed by ``pyarrow`` dtypes: + Using ``pandas``, backed by ``pyarrow`` dtypes:: load = Loader.from_backend("pandas[pyarrow]") cars = load("cars") - >>> type(cars) # doctest: +SKIP + type(cars) pandas.core.frame.DataFrame - >>> cars.dtypes # doctest: +SKIP + cars.dtypes Name string[pyarrow] Miles_per_Gallon double[pyarrow] Cylinders int64[pyarrow] @@ -127,6 +120,13 @@ def from_backend(cls, backend_name: _Backend = "polars", /) -> Loader[Any, Any]: Year timestamp[ns][pyarrow] Origin string[pyarrow] dtype: object + + .. _polars defaults: + https://docs.pola.rs/api/python/stable/reference/io.html + .. _pandas defaults: + https://pandas.pydata.org/docs/reference/io.html + .. _JSON format not supported: + https://arrow.apache.org/docs/python/json.html#reading-json-files """ obj = Loader.__new__(Loader) obj._reader = backend(backend_name) @@ -154,24 +154,19 @@ def __call__( **kwds Arguments passed to the underlying read function. - .. _Path.stem: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem - .. _Path.suffix: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - Examples -------- - Using ``polars``: + Using ``polars``:: from altair.datasets import Loader load = Loader.from_backend("polars") source = load("iowa-electricity") - >>> source.columns # doctest: +SKIP + source.columns ['year', 'source', 'net_generation'] - >>> source # doctest: +SKIP + source shape: (51, 3) ┌────────────┬──────────────┬────────────────┐ │ year ┆ source ┆ net_generation │ @@ -191,15 +186,15 @@ def __call__( │ 2017-01-01 ┆ Renewables ┆ 21933 │ └────────────┴──────────────┴────────────────┘ - Using ``pandas``: + Using ``pandas``:: load = Loader.from_backend("pandas") source = load("iowa-electricity") - >>> source.columns # doctest: +SKIP + source.columns Index(['year', 'source', 'net_generation'], dtype='object') - >>> source # doctest: +SKIP + source year source net_generation 0 2001-01-01 Fossil Fuels 35361 1 2002-01-01 Fossil Fuels 35991 @@ -215,15 +210,15 @@ def __call__( [51 rows x 3 columns] - Using ``pyarrow``: + Using ``pyarrow``:: load = Loader.from_backend("pyarrow") source = load("iowa-electricity") - >>> source.column_names # doctest: +SKIP + source.column_names ['year', 'source', 'net_generation'] - >>> source # doctest: +SKIP + source pyarrow.Table year: date32[day] source: string @@ -232,6 +227,11 @@ def __call__( year: [[2001-01-01,2002-01-01,2003-01-01,2004-01-01,2005-01-01,...,2013-01-01,2014-01-01,2015-01-01,2016-01-01,2017-01-01]] source: [["Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels",...,"Renewables","Renewables","Renewables","Renewables","Renewables"]] net_generation: [[35361,35991,36234,36205,36883,...,16476,17452,19091,21241,21933]] + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. 
_Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix """ return self._reader.dataset(name, suffix, **kwds) @@ -261,16 +261,16 @@ def url( Examples -------- - The returned url will always point to an accessible dataset: + The returned url will always point to an accessible dataset:: import altair as alt from altair.datasets import Loader load = Loader.from_backend("polars") - >>> load.url("cars") # doctest: +SKIP - 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.11.0/data/cars.json' + load.url("cars") + "https://cdn.jsdelivr.net/npm/vega-datasets@v2.11.0/data/cars.json" - We can pass the result directly to a chart: + We can pass the result directly to a chart:: url = load.url("cars") alt.Chart(url).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q") @@ -282,19 +282,19 @@ def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: """ Caching of remote dataset requests. - Configure cache path: + Configure cache path:: self.cache.path = "..." - Download the latest datasets *ahead-of-time*: + Download the latest datasets *ahead-of-time*:: self.cache.download_all() - Remove all downloaded datasets: + Remove all downloaded datasets:: self.cache.clear() - Disable caching: + Disable caching:: self.cache.path = None """ diff --git a/doc/user_guide/api.rst b/doc/user_guide/api.rst index 5793f0ae8..336c29d54 100644 --- a/doc/user_guide/api.rst +++ b/doc/user_guide/api.rst @@ -791,5 +791,21 @@ Typing Optional is_chart_type +.. _api-datasets: + +Datasets +-------- +.. currentmodule:: altair.datasets + +.. autosummary:: + :toctree: generated/datasets/ + :nosignatures: + + Loader + load + url + .. _Generic: https://typing.readthedocs.io/en/latest/spec/generics.html#generics +.. _vega-datasets: + https://github.com/vega/vega-datasets diff --git a/tools/generate_api_docs.py b/tools/generate_api_docs.py index 55c68729e..babd3d3eb 100644 --- a/tools/generate_api_docs.py +++ b/tools/generate_api_docs.py @@ -110,8 +110,22 @@ {typing_objects} +.. _api-datasets: + +Datasets +-------- +.. currentmodule:: altair.datasets + +.. autosummary:: + :toctree: generated/datasets/ + :nosignatures: + + {datasets_objects} + .. _Generic: https://typing.readthedocs.io/en/latest/spec/generics.html#generics +.. _vega-datasets: + https://github.com/vega/vega-datasets """ @@ -171,6 +185,10 @@ def theme() -> list[str]: return sort_3 +def datasets() -> list[str]: + return alt.datasets.__all__ + + def lowlevel_wrappers() -> list[str]: objects = sorted(iter_objects(alt.schema.core, restrict_to_subclass=alt.SchemaBase)) # The names of these two classes are also used for classes in alt.channels. 
Due to @@ -194,6 +212,7 @@ def write_api_file() -> None: api_classes=sep.join(api_classes()), typing_objects=sep.join(type_hints()), theme_objects=sep.join(theme()), + datasets_objects=sep.join(datasets()), ), encoding="utf-8", ) From d3b3ef2afed2fb1bff4fdcb099787191e04a8b15 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 25 Jan 2025 20:52:35 +0000 Subject: [PATCH 178/201] refactor: Add `find_spec` fastpath to `is_available` Have a lot of changes locally that use `find_spec`, but would prefer a single name assoicated with this action The actual spec is never relevant for this usage --- altair/datasets/_readers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 0a18c1e61..a1f66dee1 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -485,6 +485,8 @@ def is_available( * ``True`` every package. * ``False`` at least one package. """ + if not more_pkg_names and isinstance(pkg_names, str): + return find_spec(pkg_names) is not None pkgs_names = pkg_names if not isinstance(pkg_names, str) else (pkg_names,) names = chain(pkgs_names, more_pkg_names) fn = all if require_all else any From b606a7d6e6980865930d8bb8cb720d6340855782 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 29 Jan 2025 15:05:47 +0000 Subject: [PATCH 179/201] feat(DRAFT): Private API overhaul **Public API is unchanged** Core changes are to simplify testing and extension: - `_readers.py` -> `_reader.py` - w/ two new support modules `_constraints`, and `_readimpl` - Functions (`BaseImpl`) are declared with what they support (`include`) and restrictions (`exclude`) on that subset - Transforms a lot of the imperative logic into set operations - Greatly improved `pyarrow` support - Utilize schema - Provides additional fallback `.json` implementations - `_stdlib_read_json_to_arrow` finally resolves `"movies.json"` issue --- altair/datasets/_cache.py | 106 +++++- altair/datasets/_constraints.py | 115 +++++++ altair/datasets/_exceptions.py | 78 +++-- altair/datasets/_loader.py | 40 ++- altair/datasets/_reader.py | 540 ++++++++++++++++++++++++++++++ altair/datasets/_readers.py | 574 -------------------------------- altair/datasets/_readimpl.py | 414 +++++++++++++++++++++++ tests/test_datasets.py | 79 ++--- 8 files changed, 1260 insertions(+), 686 deletions(-) create mode 100644 altair/datasets/_constraints.py create mode 100644 altair/datasets/_reader.py delete mode 100644 altair/datasets/_readers.py create mode 100644 altair/datasets/_readimpl.py diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index a415a8380..9abe09726 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -5,10 +5,9 @@ from collections import defaultdict from importlib.util import find_spec from pathlib import Path -from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, cast, get_args +from typing import TYPE_CHECKING, ClassVar, TypeVar, cast, get_args import narwhals.stable.v1 as nw -from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT from altair.datasets._exceptions import AltairDatasetsError from altair.datasets._typing import Dataset @@ -29,12 +28,18 @@ ) from io import IOBase from typing import Any, Final + from urllib.request import OpenerDirector from _typeshed import StrPath from narwhals.stable.v1.dtypes import DType + from narwhals.stable.v1.typing import IntoExpr from altair.datasets._typing import 
Metadata + if sys.version_info >= (3, 12): + from typing import Unpack + else: + from typing_extensions import Unpack if sys.version_info >= (3, 11): from typing import LiteralString else: @@ -43,8 +48,8 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias - from altair.datasets._readers import _Reader from altair.datasets._typing import FlFieldStr + from altair.vegalite.v5.schema._typing import OneOrSeq _Dataset: TypeAlias = "Dataset | LiteralString" _FlSchema: TypeAlias = Mapping[str, FlFieldStr] @@ -83,6 +88,10 @@ https://narwhals-dev.github.io/narwhals/api-reference/dtypes/ """ +_FIELD_TO_DTYPE: Mapping[FlFieldStr, type[DType]] = { + v: k for k, v in _DTYPE_TO_FIELD.items() +} + def _iter_metadata(df: nw.DataFrame[Any], /) -> Iterator[Metadata]: """ @@ -179,10 +188,7 @@ def rotated(self) -> Mapping[str, Sequence[Any]]: self._rotated[k].append(v) return self._rotated - def metadata(self, ns: Any, /) -> nw.LazyFrame: - data: Any = self.rotated - return nw.maybe_convert_dtypes(nw.from_dict(data, native_namespace=ns).lazy()) - + # TODO: Evaluate which errors are now obsolete def __getitem__(self, key: _Dataset, /) -> Metadata: if meta := self.get(key, None): return meta @@ -194,6 +200,7 @@ def __getitem__(self, key: _Dataset, /) -> Metadata: msg = f"{key!r} does not refer to a known dataset." raise TypeError(msg) + # TODO: Evaluate which errors are now obsolete def url(self, name: _Dataset, /) -> str: if meta := self.get(name, None): if meta["suffix"] == ".parquet" and not find_spec("vegafusion"): @@ -207,6 +214,9 @@ def url(self, name: _Dataset, /) -> str: msg = f"{name!r} does not refer to a known dataset." raise TypeError(msg) + def __repr__(self) -> str: + return f"<{type(self).__name__}: {'COLLECTED' if self._mapping else 'READY'}>" + class SchemaCache(CompressedCache["_Dataset", "_FlSchema"]): """ @@ -230,8 +240,10 @@ def __init__( self, *, tp: type[MutableMapping[_Dataset, _FlSchema]] = dict["_Dataset", "_FlSchema"], + implementation: nw.Implementation = nw.Implementation.UNKNOWN, ) -> None: self._mapping: MutableMapping[_Dataset, _FlSchema] = tp() + self._implementation: nw.Implementation = implementation def read(self) -> Any: import json @@ -259,8 +271,63 @@ def by_dtype(self, name: _Dataset, *dtypes: type[DType]) -> list[str]: else: return list(match) + def is_active(self) -> bool: + return self._implementation in { + nw.Implementation.PANDAS, + nw.Implementation.PYARROW, + nw.Implementation.MODIN, + nw.Implementation.PYARROW, + } + + def schema_kwds(self, meta: Metadata, /) -> dict[str, Any]: + name: Any = meta["dataset_name"] + impl = self._implementation + if (impl.is_pandas_like() or impl.is_pyarrow()) and (self[name]): + suffix = meta["suffix"] + if impl.is_pandas_like(): + if cols := self.by_dtype(name, nw.Date, nw.Datetime): + if suffix == ".json": + return {"convert_dates": cols} + elif suffix in {".csv", ".tsv"}: + return {"parse_dates": cols} + else: + schema = self.schema_pyarrow(name) + if suffix in {".csv", ".tsv"}: + from pyarrow.csv import ConvertOptions + + return {"convert_options": ConvertOptions(column_types=schema)} # pyright: ignore[reportCallIssue] + elif suffix == ".parquet": + return {"schema": schema} + + return {} + + def schema(self, name: _Dataset, /) -> Mapping[str, DType]: + return { + column: _FIELD_TO_DTYPE[tp_str]() for column, tp_str in self[name].items() + } + + # TODO: Open an issue in ``narwhals`` to try and get a public api for type conversion + def schema_pyarrow(self, name: _Dataset, /): + schema = 
self.schema(name) + if schema: + from narwhals._arrow.utils import narwhals_to_native_dtype + from narwhals.utils import Version -class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): + m = {k: narwhals_to_native_dtype(v, Version.V1) for k, v in schema.items()} + else: + m = {} + return nw.dependencies.get_pyarrow().schema(m) + + +class _SupportsScanMetadata(Protocol): + _opener: ClassVar[OpenerDirector] + + def _scan_metadata( + self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] + ) -> nw.LazyFrame: ... + + +class DatasetCache: """Opt-out caching of remote dataset requests.""" _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" @@ -268,8 +335,8 @@ class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): Path(os.environ.get("XDG_CACHE_HOME", Path.home() / ".cache")) / "altair" ).resolve() - def __init__(self, reader: _Reader[IntoDataFrameT, IntoFrameT], /) -> None: - self._rd: _Reader[IntoDataFrameT, IntoFrameT] = reader + def __init__(self, reader: _SupportsScanMetadata, /) -> None: + self._rd: _SupportsScanMetadata = reader def clear(self) -> None: """Delete all previously cached datasets.""" @@ -308,10 +375,24 @@ def download_all(self) -> None: return None print(f"Downloading {len(frame)} missing datasets...") for meta in _iter_metadata(frame): - self._rd._download(meta["url"], self.path / (meta["sha"] + meta["suffix"])) + self._download_one(meta["url"], self.path_meta(meta)) print("Finished downloads") return None + def _maybe_download(self, meta: Metadata, /) -> Path: + fp = self.path_meta(meta) + return ( + fp + if (fp.exists() and fp.stat().st_size) + else self._download_one(meta["url"], fp) + ) + + def _download_one(self, url: str, fp: Path, /) -> Path: + with self._rd._opener.open(url) as f: + fp.touch() + fp.write_bytes(f.read()) + return fp + @property def path(self) -> Path: """ @@ -354,6 +435,9 @@ def path(self, source: StrPath | None, /) -> None: else: os.environ[self._ENV_VAR] = "" + def path_meta(self, meta: Metadata, /) -> Path: + return self.path / (meta["sha"] + meta["suffix"]) + def __iter__(self) -> Iterator[Path]: yield from self.path.iterdir() diff --git a/altair/datasets/_constraints.py b/altair/datasets/_constraints.py new file mode 100644 index 000000000..e5eaa3b97 --- /dev/null +++ b/altair/datasets/_constraints.py @@ -0,0 +1,115 @@ +"""Set-like guards for matching metadata to an implementation.""" + +from __future__ import annotations + +from collections.abc import Set +from itertools import chain +from typing import TYPE_CHECKING, Any + +from narwhals.stable import v1 as nw + +if TYPE_CHECKING: + import sys + from collections.abc import Iterable, Iterator + + from altair.datasets._typing import Metadata + + if sys.version_info >= (3, 12): + from typing import Unpack + else: + from typing_extensions import Unpack + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + +__all__ = [ + "Items", + "MetaIs", + "is_arrow", + "is_csv", + "is_json", + "is_meta", + "is_not_tabular", + "is_parquet", + "is_spatial", + "is_tsv", +] + +Items: TypeAlias = Set[tuple[str, Any]] + + +class MetaIs(Set[tuple[str, Any]]): + _requires: frozenset[tuple[str, Any]] + + def __init__(self, kwds: frozenset[tuple[str, Any]], /) -> None: + object.__setattr__(self, "_requires", kwds) + + @classmethod + def from_metadata(cls, meta: Metadata, /) -> MetaIs: + return cls(frozenset(meta.items())) + + def to_metadata(self) -> Metadata: + if TYPE_CHECKING: + + def collect(**kwds: Unpack[Metadata]) -> Metadata: 
+ return kwds + + return collect(**dict(self)) + return dict(self) + + def to_expr(self) -> nw.Expr: + return nw.all_horizontal(nw.col(name) == val for name, val in self) + + def isdisjoint(self, other: Iterable[Any]) -> bool: + return super().isdisjoint(other) + + def issubset(self, other: Iterable[Any]) -> bool: + return self._requires.issubset(other) + + def __call__(self, meta: Items, /) -> bool: + return self._requires <= meta + + def __hash__(self) -> int: + return hash(self._requires) + + def __contains__(self, x: object) -> bool: + return self._requires.__contains__(x) + + def __iter__(self) -> Iterator[tuple[str, Any]]: + yield from self._requires + + def __len__(self) -> int: + return self._requires.__len__() + + def __setattr__(self, name: str, value: Any): + msg = ( + f"{type(self).__name__!r} is immutable.\n" + f"Could not assign self.{name} = {value}" + ) + raise TypeError(msg) + + def __repr__(self) -> str: + items = dict(self) + if not items: + contents = "" + elif suffix := items.pop("suffix", None): + contents = ", ".join( + chain([f"'*{suffix}'"], (f"{k}={v!r}" for k, v in items.items())) + ) + else: + contents = ", ".join(f"{k}={v!r}" for k, v in items.items()) + return f"is_meta({contents})" + + +def is_meta(**kwds: Unpack[Metadata]) -> MetaIs: + return MetaIs.from_metadata(kwds) + + +is_csv = is_meta(suffix=".csv") +is_json = is_meta(suffix=".json") +is_tsv = is_meta(suffix=".tsv") +is_arrow = is_meta(suffix=".arrow") +is_parquet = is_meta(suffix=".parquet") +is_spatial = is_meta(is_spatial=True) +is_not_tabular = is_meta(is_tabular=False) diff --git a/altair/datasets/_exceptions.py b/altair/datasets/_exceptions.py index 36dba27ef..2f9c13d45 100644 --- a/altair/datasets/_exceptions.py +++ b/altair/datasets/_exceptions.py @@ -5,7 +5,7 @@ if TYPE_CHECKING: from collections.abc import Sequence - from altair.datasets._readers import _Backend + from altair.datasets._reader import _Backend from altair.datasets._typing import Metadata @@ -26,6 +26,19 @@ def from_url(cls, meta: Metadata, /) -> AltairDatasetsError: raise NotImplementedError(msg) return cls(msg) + @classmethod + def from_tabular(cls, meta: Metadata, backend_name: str, /) -> AltairDatasetsError: + install_other = None + mid = "\n" + if not meta["is_image"] and not meta["is_tabular"]: + install_other = "polars" + if meta["is_spatial"]: + mid = f"Geospatial data is not supported natively by {backend_name!r}." + elif meta["is_json"]: + mid = f"Non-tabular json is not supported natively by {backend_name!r}." 
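A small interactive sketch of how the set-like guards added in ``_constraints.py`` behave (the metadata dict is illustrative, not a real datapackage entry)::

    from altair.datasets._constraints import is_csv, is_meta

    meta = {"dataset_name": "example", "suffix": ".csv", "is_spatial": False}
    is_csv(meta.items())                                    # True
    is_meta(suffix=".csv", is_spatial=False)(meta.items())  # True
    is_meta(suffix=".json")(meta.items())                   # False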
+ msg = f"{_failed_tabular(meta)}{mid}{_suggest_url(meta, install_other)}" + return cls(msg) + @classmethod def from_priority(cls, priority: Sequence[_Backend], /) -> AltairDatasetsError: msg = f"Found no supported backend, searched:\n{priority!r}" @@ -33,12 +46,12 @@ def from_priority(cls, priority: Sequence[_Backend], /) -> AltairDatasetsError: def module_not_found( - backend_name: str, reqs: str | tuple[str, ...], missing: str + backend_name: str, reqs: Sequence[str], missing: str ) -> ModuleNotFoundError: - if isinstance(reqs, tuple): - depends = ", ".join(f"{req!r}" for req in reqs) + " packages" + if len(reqs) == 1: + depends = f"{reqs[0]!r} package" else: - depends = f"{reqs!r} package" + depends = ", ".join(f"{req!r}" for req in reqs) + " packages" msg = ( f"Backend {backend_name!r} requires the {depends}, but {missing!r} could not be found.\n" f"This can be installed with pip using:\n" @@ -49,29 +62,6 @@ def module_not_found( return ModuleNotFoundError(msg, name=missing) -def image(meta: Metadata, /) -> AltairDatasetsError: - msg = f"{_failed_tabular(meta)}\n{_suggest_url(meta)}" - return AltairDatasetsError(msg) - - -def geospatial(meta: Metadata, backend_name: str) -> NotImplementedError: - msg = ( - f"{_failed_tabular(meta)}" - f"Geospatial data is not supported natively by {backend_name!r}." - f"{_suggest_url(meta, 'polars')}" - ) - return NotImplementedError(msg) - - -def non_tabular_json(meta: Metadata, backend_name: str) -> NotImplementedError: - msg = ( - f"{_failed_tabular(meta)}" - f"Non-tabular json is not supported natively by {backend_name!r}." - f"{_suggest_url(meta, 'polars')}" - ) - return NotImplementedError(msg) - - def _failed_url(meta: Metadata, /) -> str: return f"Unable to load {meta['file_name']!r} via url.\n" @@ -87,3 +77,35 @@ def _suggest_url(meta: Metadata, install_other: str | None = None) -> str: " from altair.datasets import url\n" f" url({meta['dataset_name']!r})" ) + + +# TODO: +# - Use `AltairDatasetsError` +# - Remove notes from doc +# - Improve message and how data is selected +def implementation_not_found(meta: Metadata, /) -> NotImplementedError: + """ + Search finished without finding a *declared* incompatibility. + + Notes + ----- + - New kind of error + - Previously, every backend had a function assigned + - But they might not all work + - Now, only things that are known to be widely safe are added + - Should probably suggest using a pre-defined backend that supports everything + - What can reach here? 
+ - `is_image` (all) + - `"pandas"` (using inference wont trigger these) + - `.arrow` (w/o `pyarrow`) + - `.parquet` (w/o either `pyarrow` or `fastparquet`) + """ + INDENT = " " * 4 + record = f",\n{INDENT}".join( + f"{k}={v!r}" + for k, v in meta.items() + if not (k.startswith(("is_", "sha", "bytes", "has_"))) + or (v is True and k.startswith("is_")) + ) + msg = f"Found no implementation that supports:\n{INDENT}{record}" + return NotImplementedError(msg) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 8f13ab2de..9b55daf70 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -2,9 +2,10 @@ from typing import TYPE_CHECKING, Generic, final, overload -from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT +from narwhals.stable.v1.typing import IntoDataFrameT -from altair.datasets._readers import _Reader, backend +from altair.datasets import _reader +from altair.datasets._reader import IntoFrameT if TYPE_CHECKING: import sys @@ -13,14 +14,16 @@ import pandas as pd import polars as pl import pyarrow as pa + from narwhals.stable import v1 as nw from altair.datasets._cache import DatasetCache + from altair.datasets._reader import Reader if sys.version_info >= (3, 11): - from typing import LiteralString + from typing import LiteralString, Self else: - from typing_extensions import LiteralString - from altair.datasets._readers import _Backend + from typing_extensions import LiteralString, Self + from altair.datasets._reader import _Backend from altair.datasets._typing import Dataset, Extension @@ -43,7 +46,7 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): https://github.com/vega/vega-datasets """ - _reader: _Reader[IntoDataFrameT, IntoFrameT] + _reader: Reader[IntoDataFrameT, IntoFrameT] @overload @classmethod @@ -55,16 +58,18 @@ def from_backend( @classmethod def from_backend( cls, backend_name: Literal["pandas", "pandas[pyarrow]"], / - ) -> Loader[pd.DataFrame, pd.DataFrame]: ... + ) -> Loader[pd.DataFrame, nw.LazyFrame]: ... @overload @classmethod def from_backend( cls, backend_name: Literal["pyarrow"], / - ) -> Loader[pa.Table, pa.Table]: ... + ) -> Loader[pa.Table, nw.LazyFrame]: ... @classmethod - def from_backend(cls, backend_name: _Backend = "polars", /) -> Loader[Any, Any]: + def from_backend( + cls: type[Loader[Any, Any]], backend_name: _Backend = "polars", / + ) -> Loader[Any, Any]: """ Initialize a new loader, with the specified backend. @@ -128,8 +133,12 @@ def from_backend(cls, backend_name: _Backend = "polars", /) -> Loader[Any, Any]: .. _JSON format not supported: https://arrow.apache.org/docs/python/json.html#reading-json-files """ - obj = Loader.__new__(Loader) - obj._reader = backend(backend_name) + return cls.from_reader(_reader._from_backend(backend_name)) + + @classmethod + def from_reader(cls, reader: Reader[IntoDataFrameT, IntoFrameT], /) -> Self: + obj = cls.__new__(cls) + obj._reader = reader return obj def __call__( @@ -278,7 +287,7 @@ def url( return self._reader.url(name, suffix) @property - def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: + def cache(self) -> DatasetCache: """ Caching of remote dataset requests. 
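For reference, the new hook makes the two construction paths interchangeable; a sketch using the private helper shown above (internal API, subject to change)::

    from altair.datasets import Loader, _reader

    # equivalent to Loader.from_backend("polars"), routed through from_reader
    load = Loader.from_reader(_reader._from_backend("polars"))
    load("cars")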
@@ -361,12 +370,9 @@ def __call__( def __getattr__(name): if name == "load": - from altair.datasets._readers import infer_backend - - reader = infer_backend() + reader = _reader.infer_backend() global load - load = _Load.__new__(_Load) - load._reader = reader + load = _Load.from_reader(reader) return load else: msg = f"module {__name__!r} has no attribute {name!r}" diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py new file mode 100644 index 000000000..eacc516ba --- /dev/null +++ b/altair/datasets/_reader.py @@ -0,0 +1,540 @@ +""" +Backend for ``alt.datasets.Loader``. + +Notes +----- +Extending would be more ergonomic if `read`, `scan`, `_constraints` were available under a single export:: + + from altair.datasets import ext, reader + import polars as pl + + impls = ( + ext.read(pl.read_parquet, ext.is_parquet), + ext.read(pl.read_csv, ext.is_csv), + ext.read(pl.read_json, ext.is_json), + ) + user_reader = reader(impls) + user_reader.dataset("airports") +""" + +from __future__ import annotations + +from collections import Counter +from collections.abc import Mapping +from importlib import import_module +from importlib.util import find_spec +from itertools import chain +from pathlib import Path +from typing import TYPE_CHECKING, Any, ClassVar, Generic, Literal, cast, overload +from urllib.request import build_opener as _build_opener + +from narwhals.stable import v1 as nw +from narwhals.stable.v1.typing import IntoDataFrameT, IntoExpr +from packaging.requirements import Requirement + +from altair.datasets import _readimpl +from altair.datasets._cache import CsvCache, DatasetCache, SchemaCache, _iter_metadata +from altair.datasets._constraints import is_parquet +from altair.datasets._exceptions import ( + AltairDatasetsError, + implementation_not_found, + module_not_found, +) +from altair.datasets._readimpl import IntoFrameT, is_available +from altair.datasets._typing import EXTENSION_SUFFIXES + +if TYPE_CHECKING: + import sys + from collections.abc import Callable, Sequence + from urllib.request import OpenerDirector + + import pandas as pd + import polars as pl + import pyarrow as pa + + from altair.datasets._readimpl import BaseImpl, R, ReadImpl, ScanImpl + from altair.datasets._typing import Dataset, Extension, Metadata + from altair.vegalite.v5.schema._typing import OneOrSeq + + if sys.version_info >= (3, 13): + from typing import TypeIs, TypeVar + else: + from typing_extensions import TypeIs, TypeVar + if sys.version_info >= (3, 12): + from typing import Unpack + else: + from typing_extensions import Unpack + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + _Polars: TypeAlias = Literal["polars"] + _Pandas: TypeAlias = Literal["pandas"] + _PyArrow: TypeAlias = Literal["pyarrow"] + _PandasAny: TypeAlias = Literal[_Pandas, "pandas[pyarrow]"] + _Backend: TypeAlias = Literal[_Polars, _PandasAny, _PyArrow] + _CuDF: TypeAlias = Literal["cudf"] + _Dask: TypeAlias = Literal["dask"] + _DuckDB: TypeAlias = Literal["duckdb"] + _Ibis: TypeAlias = Literal["ibis"] + _PySpark: TypeAlias = Literal["pyspark"] + _NwSupport: TypeAlias = Literal[ + _Polars, _Pandas, _PyArrow, _CuDF, _Dask, _DuckDB, _Ibis, _PySpark + ] + _NwSupportT = TypeVar( + "_NwSupportT", + _Polars, + _Pandas, + _PyArrow, + _CuDF, + _Dask, + _DuckDB, + _Ibis, + _PySpark, + ) + + +class Reader(Generic[IntoDataFrameT, IntoFrameT]): + 
""" + Modular file reader, targeting remote & local tabular resources. + + .. warning:: + Use ``reader(...)`` instead of instantiating ``Reader`` directly. + """ + + # TODO: Docs + _read: Sequence[ReadImpl[IntoDataFrameT]] + """Eager file read functions.""" + + # TODO: Docs + _scan: Sequence[ScanImpl[IntoFrameT]] + """ + *Optionally*-lazy file read/scan functions. + + Used exclusively for ``metadata.parquet``. + + Currently ``"polars"`` is the only lazy option. + All others defer to the eager variant. + """ + + _name: str + """ + Used in error messages, repr and matching ``@overload``(s). + + Otherwise, has no concrete meaning. + """ + + _implementation: nw.Implementation + """ + Corresponding `narwhals implementation`_. + + .. _narwhals implementation: + https://github.com/narwhals-dev/narwhals/blob/9b6a355530ea46c590d5a6d1d0567be59c0b5742/narwhals/utils.py#L61-L290 + """ + + _opener: ClassVar[OpenerDirector] = _build_opener() + _metadata_path: ClassVar[Path] = ( + Path(__file__).parent / "_metadata" / "metadata.parquet" + ) + + def __init__( + self, + read: Sequence[ReadImpl[IntoDataFrameT]], + scan: Sequence[ScanImpl[IntoFrameT]], + name: str, + implementation: nw.Implementation, + ) -> None: + self._read = read + self._scan = scan + self._name = name + self._implementation = implementation + self._schema_cache = SchemaCache(implementation=implementation) + + # TODO: Finish working on presentation + # - The contents of both are functional + def profile(self, mode: Literal["any", "each"]): + """ + Describe which datasets/groups are supported. + + Focusing on actual datasets, rather than describing wrapped functions (repr) + + .. note:: + Having this public to make testing easier (``tests.test_datasets.is_polars_backed_pyarrow``) + """ + if mode == "any": + relevant_columns = set( + chain.from_iterable(impl._relevant_columns for impl in self._read) + ) + frame = self._scan_metadata().select("dataset_name", *relevant_columns) + it = (impl._include_expr for impl in self._read) + # BUG: ``narwhals`` raises a ``ValueError`` when ``__invert__``-ing a previously used Expr? 
+ # - Can't reproduce trivially + # - Doesnt seem to be related to genexp + inc_expr = nw.any_horizontal(*it) + include = _dataset_names(frame, inc_expr) + exclude = _dataset_names(frame, ~nw.col("dataset_name").is_in(include)) + return {"include": include, "exclude": exclude} + elif mode == "each": + # FIXME: Rough draft of how to group results + # - Don't really want a nested dict + m = {} + frame = self._scan_metadata() + for impl in self._read: + name = impl._contents + m[name] = {"include": _dataset_names(frame, impl._include_expr)} + if impl.exclude: + m[name].update(exclude=_dataset_names(frame, impl._exclude_expr)) + return m + else: + msg = f"Unexpected {mode=}" + raise TypeError(msg) + + def __repr__(self) -> str: + from textwrap import indent + + PREFIX = " " * 4 + NL = "\n" + body = f"read\n{indent(NL.join(el._contents for el in self._read), PREFIX)}" + if self._scan: + body += ( + f"\nscan\n{indent(NL.join(el._contents for el in self._scan), PREFIX)}" + ) + return f"Reader[{self._name}] {self._implementation!r}\n{body}" + + def read_fn(self, meta: Metadata, /) -> Callable[..., IntoDataFrameT]: + return self._solve(meta, self._read) + + def scan_fn(self, meta: Metadata | Path | str, /) -> Callable[..., IntoFrameT]: + meta = meta if isinstance(meta, Mapping) else {"suffix": _into_suffix(meta)} + return self._solve(meta, self._scan) + + @property + def cache(self) -> DatasetCache: + return DatasetCache(self) + + def dataset( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + **kwds: Any, + ) -> IntoDataFrameT: + frame = self._query(name, suffix) + meta = next(_iter_metadata(frame)) + fn = self.read_fn(meta) + fn_kwds = self._merge_kwds(meta, kwds) + if self.cache.is_active(): + fp = self.cache._maybe_download(meta) + return fn(fp, **fn_kwds) + else: + with self._opener.open(meta["url"]) as f: + return fn(f, **fn_kwds) + + def url( + self, name: Dataset | LiteralString, suffix: Extension | None = None, / + ) -> str: + frame = self._query(name, suffix) + meta = next(_iter_metadata(frame)) + if is_parquet(meta.items()) and not is_available("vegafusion"): + raise AltairDatasetsError.from_url(meta) + url = meta["url"] + if isinstance(url, str): + return url + else: + msg = f"Expected 'str' but got {type(url).__name__!r}\nfrom {url!r}." + raise TypeError(msg) + + def _query( + self, name: Dataset | LiteralString, suffix: Extension | None = None, / + ) -> nw.DataFrame[IntoDataFrameT]: + """ + Query a tabular version of `vega-datasets/datapackage.json`_. + + Applies a filter, erroring out when no results would be returned. + + .. _vega-datasets/datapackage.json: + https://github.com/vega/vega-datasets/blob/main/datapackage.json + """ + constraints = _into_constraints(name, suffix) + frame = self._scan_metadata(**constraints).collect() + if not frame.is_empty(): + return frame + else: + msg = f"Found no results for:\n {constraints!r}" + raise ValueError(msg) + + # TODO: Docs + def _merge_kwds(self, meta: Metadata, kwds: dict[str, Any], /) -> Mapping[str, Any]: + """ + Hook to utilize ``meta`` to extend ``kwds`` with known helpful defaults. + + - User provided arguments have a higher precedence. 
+ - The keywords for schemas vary between libraries + - pandas is internally inconsistent + - By default, returns unchanged + """ + if self._schema_cache.is_active() and ( + schema := self._schema_cache.schema_kwds(meta) + ): + kwds = schema | kwds if kwds else schema + return kwds + + @property + def _metadata_frame(self) -> nw.LazyFrame: + fp = self._metadata_path + return nw.from_native(self.scan_fn(fp)(fp)).lazy() + + def _scan_metadata( + self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] + ) -> nw.LazyFrame: + if predicates or constraints: + return self._metadata_frame.filter(*predicates, **constraints) + return self._metadata_frame + + # TODO: Docs + def _solve( + self, meta: Metadata, impls: Sequence[BaseImpl[R]], / + ) -> Callable[..., R]: + """ + Return the first function meeting constraints of meta. + + Notes + ----- + - Iterate over impls + - Each one can either match or signal an error + - An error blocks any additional checking + - Both include & exclude + - Uses ``ItemsView`` to support set ops + - `meta` isn't iterated over + - Leaves the door open for caching the search space + """ + items = meta.items() + it = (some for impl in impls if (some := impl.unwrap_or(items))) + if fn_or_err := next(it, None): + if _is_err(fn_or_err): + raise fn_or_err.from_tabular(meta, self._name) + return fn_or_err + if meta["is_image"]: + raise AltairDatasetsError.from_tabular(meta, self._name) + raise implementation_not_found(meta) + + +# TODO: Review after finishing `profile` +# NOTE: Temp helper function for `Reader.profile` +def _dataset_names( + frame: nw.LazyFrame, + *predicates: OneOrSeq[IntoExpr], + **constraints: Unpack[Metadata], +): + return ( + frame.filter(*predicates, **constraints) + .select("dataset_name") + .collect() + .get_column("dataset_name") + .to_list() + ) + + +class _NoParquetReader(Reader[IntoDataFrameT, IntoFrameT]): + def __repr__(self) -> str: + return f"{super().__repr__()}\ncsv_cache\n {self.csv_cache!r}" + + @property + def csv_cache(self) -> CsvCache: + if not hasattr(self, "_csv_cache"): + self._csv_cache = CsvCache() + return self._csv_cache + + @property + def _metadata_frame(self) -> nw.LazyFrame: + ns = self._implementation.to_native_namespace() + data = cast("dict[str, Any]", self.csv_cache.rotated) + return nw.maybe_convert_dtypes(nw.from_dict(data, native_namespace=ns)).lazy() + + +@overload +def reader( + read_fns: Sequence[ReadImpl[IntoDataFrameT]], + scan_fns: tuple[()] = ..., + *, + name: str | None = ..., + implementation: nw.Implementation = ..., +) -> Reader[IntoDataFrameT, nw.LazyFrame]: ... + + +@overload +def reader( + read_fns: Sequence[ReadImpl[IntoDataFrameT]], + scan_fns: Sequence[ScanImpl[IntoFrameT]], + *, + name: str | None = ..., + implementation: nw.Implementation = ..., +) -> Reader[IntoDataFrameT, IntoFrameT]: ... 
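# --- Editorial aside (not part of this patch) ---------------------------------
# The overloads above give the public shape of ``reader``. A rough sketch of
# wiring one up by hand, assuming ``polars`` is installed and using the private
# helpers introduced elsewhere in this patch (subject to change):
import polars as pl

from altair.datasets._constraints import is_csv, is_parquet
from altair.datasets._reader import reader
from altair.datasets._readimpl import read

impls = (
    read(pl.read_csv, is_csv, try_parse_dates=True),  # eager ``.csv`` reader
    read(pl.read_parquet, is_parquet),  # also reused to scan ``metadata.parquet``
)
custom = reader(impls)  # ``name``/``implementation`` are inferred from the functions
airports = custom.dataset("airports")  # -> pl.DataFrame
# -------------------------------------------------------------------------------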
+ + +def reader( + read_fns: Sequence[ReadImpl[IntoDataFrameT]], + scan_fns: Sequence[ScanImpl[IntoFrameT]] = (), + *, + name: str | None = None, + implementation: nw.Implementation = nw.Implementation.UNKNOWN, +) -> Reader[IntoDataFrameT, IntoFrameT] | Reader[IntoDataFrameT, nw.LazyFrame]: + name = name or Counter(el._inferred_package for el in read_fns).most_common(1)[0][0] + if implementation is nw.Implementation.UNKNOWN: + implementation = _into_implementation(Requirement(name)) + if scan_fns: + return Reader(read_fns, scan_fns, name, implementation) + if stolen := _steal_eager_parquet(read_fns): + return Reader(read_fns, stolen, name, implementation) + else: + return _NoParquetReader[IntoDataFrameT](read_fns, (), name, implementation) + + +def infer_backend( + *, priority: Sequence[_Backend] = ("polars", "pandas[pyarrow]", "pandas", "pyarrow") +) -> Reader[Any, Any]: + """ + Return the first available reader in order of `priority`. + + Notes + ----- + - ``"polars"``: can natively load every dataset (including ``(Geo|Topo)JSON``) + - ``"pandas[pyarrow]"``: can load *most* datasets, guarantees ``.parquet`` support + - ``"pandas"``: supports ``.parquet``, if `fastparquet`_ is installed + - ``"pyarrow"``: least reliable + + .. _fastparquet: + https://github.com/dask/fastparquet + """ + it = (_from_backend(name) for name in priority if is_available(_requirements(name))) + if reader := next(it, None): + return reader + raise AltairDatasetsError.from_priority(priority) + + +@overload +def _from_backend(name: _Polars, /) -> Reader[pl.DataFrame, pl.LazyFrame]: ... +@overload +def _from_backend(name: _PandasAny, /) -> Reader[pd.DataFrame, nw.LazyFrame]: ... +@overload +def _from_backend(name: _PyArrow, /) -> Reader[pa.Table, nw.LazyFrame]: ... + + +# FIXME: The order this is defined in makes splitting the module complicated +# - Can't use a classmethod, since some result in a subclass used +def _from_backend(name: _Backend, /) -> Reader[Any, Any]: + """ + Reader initialization dispatcher. + + FIXME: Works, but defining these in mixed shape functions seems off. + """ + if not _is_backend(name): + msg = f"Unknown backend {name!r}" + raise TypeError(msg) + implementation = _into_implementation(name) + if name == "polars": + rd, sc = _readimpl.pl_only() + return reader(rd, sc, name=name, implementation=implementation) + elif name == "pandas[pyarrow]": + return reader(_readimpl.pd_pyarrow(), name=name, implementation=implementation) + elif name == "pandas": + return reader(_readimpl.pd_only(), name=name, implementation=implementation) + elif name == "pyarrow": + return reader(_readimpl.pa_any(), name=name, implementation=implementation) + + +def _is_backend(obj: Any) -> TypeIs[_Backend]: + return obj in {"polars", "pandas", "pandas[pyarrow]", "pyarrow"} + + +def _is_err(obj: Any) -> TypeIs[type[AltairDatasetsError]]: + return obj is AltairDatasetsError + + +def _into_constraints( + name: Dataset | LiteralString, suffix: Extension | None, / +) -> Metadata: + """Transform args into a mapping to column names.""" + m: Metadata = {} + if "." 
in name: + m["file_name"] = name + elif suffix is None: + m["dataset_name"] = name + elif suffix.startswith("."): + m = {"dataset_name": name, "suffix": suffix} + else: + msg = ( + f"Expected 'suffix' to be one of {EXTENSION_SUFFIXES!r},\n" + f"but got: {suffix!r}" + ) + raise TypeError(msg) + return m + + +def _into_implementation( + backend: _NwSupport | _PandasAny | Requirement, / +) -> nw.Implementation: + primary = _import_guarded(backend) + mapping: Mapping[LiteralString, nw.Implementation] = { + "polars": nw.Implementation.POLARS, + "pandas": nw.Implementation.PANDAS, + "pyarrow": nw.Implementation.PYARROW, + "cudf": nw.Implementation.CUDF, + "dask": nw.Implementation.DASK, + "duckdb": nw.Implementation.DUCKDB, + "ibis": nw.Implementation.IBIS, + "pyspark": nw.Implementation.PYSPARK, + } + if impl := mapping.get(primary): + return impl + msg = f"Package {primary!r} is not supported by `narhwals`." + raise ValueError(msg) + + +def _into_suffix(obj: Path | str, /) -> Any: + if isinstance(obj, Path): + return obj.suffix + elif isinstance(obj, str): + return obj + else: + msg = f"Unexpected type {type(obj).__name__!r}" + raise TypeError(msg) + + +def _steal_eager_parquet( + read_fns: Sequence[ReadImpl[IntoDataFrameT]], / +) -> Sequence[ScanImpl[nw.LazyFrame]] | None: + if convertable := next((rd for rd in read_fns if rd.include <= is_parquet), None): + return (convertable.to_scan_impl(),) + return None + + +@overload +def _import_guarded(req: _PandasAny, /) -> _Pandas: ... + + +@overload +def _import_guarded(req: _NwSupportT, /) -> _NwSupportT: ... + + +@overload +def _import_guarded(req: Requirement, /) -> LiteralString: ... + + +def _import_guarded(req: Any, /) -> LiteralString: + requires = _requirements(req) + for name in requires: + if spec := find_spec(name): + import_module(spec.name) + else: + raise module_not_found(str(req), requires, missing=name) + return requires[0] + + +def _requirements(req: Requirement | str, /) -> tuple[Any, ...]: + req = Requirement(req) if isinstance(req, str) else req + return (req.name, *req.extras) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py deleted file mode 100644 index a1f66dee1..000000000 --- a/altair/datasets/_readers.py +++ /dev/null @@ -1,574 +0,0 @@ -""" -Backends for ``alt.datasets.Loader``. - -- Interfacing with the cached metadata. 
- - But not updating it -- Performing requests from those urls -- Dispatching read function on file extension -""" - -from __future__ import annotations - -import urllib.request -from collections.abc import Callable, Iterable, Mapping, Sequence -from functools import partial -from importlib import import_module -from importlib.util import find_spec -from itertools import chain -from pathlib import Path -from typing import ( - TYPE_CHECKING, - Any, - ClassVar, - Final, - Literal, - Protocol, - TypeVar, - overload, -) - -import narwhals.stable.v1 as nw -from narwhals.stable.v1.typing import IntoDataFrameT, IntoExpr, IntoFrameT - -from altair.datasets import _exceptions as _ds_exc -from altair.datasets._cache import CsvCache, DatasetCache, SchemaCache, _iter_metadata -from altair.datasets._typing import EXTENSION_SUFFIXES, Metadata, is_ext_read - -if TYPE_CHECKING: - import sys - from io import IOBase - from urllib.request import OpenerDirector - - import pandas as pd - import polars as pl - import pyarrow as pa - from _typeshed import StrPath - from pyarrow.csv import read_csv as pa_read_csv # noqa: F401 - from pyarrow.feather import read_table as pa_read_feather # noqa: F401 - from pyarrow.json import read_json as pa_read_json # noqa: F401 - from pyarrow.parquet import read_table as pa_read_parquet # noqa: F401 - - if sys.version_info >= (3, 13): - from typing import TypeIs, Unpack - else: - from typing_extensions import TypeIs, Unpack - if sys.version_info >= (3, 11): - from typing import LiteralString - else: - from typing_extensions import LiteralString - if sys.version_info >= (3, 10): - from typing import TypeAlias - else: - from typing_extensions import TypeAlias - from packaging.requirements import Requirement - - from altair.datasets._typing import Dataset, Extension, Metadata - from altair.vegalite.v5.schema._typing import OneOrSeq - - _IntoSuffix: TypeAlias = "StrPath | Metadata" - _ExtensionScan: TypeAlias = Literal[".parquet"] - _T = TypeVar("_T") - - # NOTE: Using a constrained instead of bound `TypeVar` - # error: Incompatible return value type (got "DataFrame[Any] | LazyFrame[Any]", expected "FrameT") [return-value] - # - https://typing.readthedocs.io/en/latest/spec/generics.html#introduction - # - https://typing.readthedocs.io/en/latest/spec/generics.html#type-variables-with-an-upper-bound - # https://github.com/narwhals-dev/narwhals/blob/21b8436567de3631c584ef67632317ad70ae5de0/narwhals/typing.py#L59 - FrameT = TypeVar("FrameT", nw.DataFrame[Any], nw.LazyFrame) - - _Polars: TypeAlias = Literal["polars"] - _Pandas: TypeAlias = Literal["pandas"] - _PyArrow: TypeAlias = Literal["pyarrow"] - _ConcreteT = TypeVar("_ConcreteT", _Polars, _Pandas, _PyArrow) - _PandasAny: TypeAlias = Literal[_Pandas, "pandas[pyarrow]"] - _Backend: TypeAlias = Literal[_Polars, _PandasAny, _PyArrow] - - -__all__ = ["backend", "infer_backend"] - -_METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" - - -class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): - """ - Describes basic IO for remote & local tabular resources. - - Subclassing this protocol directly will provide a *mostly* complete implementation. - - Each of the following must be explicitly assigned: - - _Reader._read_fn - _Reader._scan_fn - _Reader._name - """ - - _read_fn: Mapping[Extension, Callable[..., IntoDataFrameT]] - """ - Eager file read functions. - - Each corresponds to a known file extension within ``vega-datasets``. 
- """ - - _scan_fn: Mapping[_ExtensionScan, Callable[..., IntoFrameT]] - """ - *Optionally*-lazy file read/scan functions. - - Used exclusively for ``metadata.parquet``. - - Currently ``"polars"`` is the only lazy option. - """ - - _name: LiteralString - """ - Used in error messages, repr and matching ``@overload``(s). - - Otherwise, has no concrete meaning. - """ - - _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() - - def read_fn(self, source: _IntoSuffix, /) -> Callable[..., IntoDataFrameT]: - return self._read_fn[_extract_suffix(source, is_ext_read)] - - def scan_fn(self, source: _IntoSuffix, /) -> Callable[..., IntoFrameT]: - return self._scan_fn[_extract_suffix(source, is_ext_scan)] - - def _schema_kwds(self, meta: Metadata, /) -> dict[str, Any]: - """Hook to provide additional schema metadata on read.""" - return {} - - def _maybe_fn(self, meta: Metadata, /) -> Callable[..., IntoDataFrameT]: - """Backend specific tweaks/errors/warnings, based on ``Metadata``.""" - if meta["is_image"]: - raise _ds_exc.image(meta) - return self.read_fn(meta) - - def dataset( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - **kwds: Any, - ) -> IntoDataFrameT: - df = self.query(**_extract_constraints(name, suffix)) - meta = next(_iter_metadata(df)) - fn = self._maybe_fn(meta) - url = meta["url"] - if default_kwds := self._schema_kwds(meta): - kwds = default_kwds | kwds if kwds else default_kwds - - if self.cache.is_active(): - fp = self.cache.path / (meta["sha"] + meta["suffix"]) - if not (fp.exists() and fp.stat().st_size): - self._download(url, fp) - return fn(fp, **kwds) - else: - with self._opener.open(url) as f: - return fn(f, **kwds) - - def url( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - ) -> str: - frame = self.query(**_extract_constraints(name, suffix)) - meta = next(_iter_metadata(frame)) - if meta["suffix"] == ".parquet" and not is_available("vegafusion"): - raise _ds_exc.AltairDatasetsError.from_url(meta) - url = meta["url"] - if isinstance(url, str): - return url - else: - msg = f"Expected 'str' but got {type(url).__name__!r}\nfrom {url!r}." - raise TypeError(msg) - - def query( - self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] - ) -> nw.DataFrame[IntoDataFrameT]: - """ - Query a tabular version of `vega-datasets/datapackage.json`_. - - Applies a filter, erroring out when no results would be returned. - - Notes - ----- - Arguments correspond to those seen in `pl.LazyFrame.filter`_. - - .. _vega-datasets/datapackage.json: - https://github.com/vega/vega-datasets/blob/main/datapackage.json - .. 
_pl.LazyFrame.filter: - https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html - """ - frame = self._scan_metadata(*predicates, **constraints).collect() - if not frame.is_empty(): - return frame - else: - terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) - msg = f"Found no results for:\n {terms}" - raise ValueError(msg) - - def _scan_metadata( - self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] - ) -> nw.LazyFrame: - if predicates or constraints: - return self._metadata.filter(*predicates, **constraints) - return self._metadata - - @property - def _metadata(self) -> nw.LazyFrame: - return nw.from_native(self.scan_fn(_METADATA)(_METADATA)).lazy() - - def _download(self, url: str, fp: Path, /) -> None: - with self._opener.open(url) as f: - fp.touch() - fp.write_bytes(f.read()) - - @property - def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: - return DatasetCache(self) - - def _import(self, name: str, /) -> Any: - if spec := find_spec(name): - return import_module(spec.name) - raise _ds_exc.module_not_found(self._name, _requirements(self._name), name) # type: ignore[call-overload] - - def __repr__(self) -> str: - return f"Reader[{self._name}]" - - def __init__(self, name: LiteralString, /) -> None: ... - - -class _PandasReaderBase(_Reader["pd.DataFrame", "pd.DataFrame"], Protocol): - """ - Provides temporal column names as keyword arguments on read. - - Related - ------- - - https://github.com/vega/altair/pull/3631#issuecomment-2480816377 - - https://github.com/vega/vega-datasets/pull/631 - - https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html - - https://pandas.pydata.org/docs/reference/api/pandas.read_json.html - """ - - _schema_cache: SchemaCache - - def _schema_kwds(self, meta: Metadata, /) -> dict[str, Any]: - name: Any = meta["dataset_name"] - suffix = meta["suffix"] - if cols := self._schema_cache.by_dtype(name, nw.Date, nw.Datetime): - if suffix == ".json": - return {"convert_dates": cols} - elif suffix in {".csv", ".tsv"}: - return {"parse_dates": cols} - return super()._schema_kwds(meta) - - def _maybe_fn(self, meta: Metadata, /) -> Callable[..., pd.DataFrame]: - fn = super()._maybe_fn(meta) - if meta["is_spatial"]: - raise _ds_exc.geospatial(meta, self._name) - return fn - - -class _PandasReader(_PandasReaderBase): - def __init__(self, name: _Pandas, /) -> None: - self._name = _requirements(name) - if not TYPE_CHECKING: - pd = self._import(self._name) - self._read_fn = { - ".csv": pd.read_csv, - ".json": pd.read_json, - ".tsv": partial["pd.DataFrame"](pd.read_csv, sep="\t"), - ".arrow": pd.read_feather, - ".parquet": pd.read_parquet, - } - self._scan_fn = {".parquet": pd.read_parquet} - self._supports_parquet: bool = is_available( - "pyarrow", "fastparquet", require_all=False - ) - self._csv_cache = CsvCache() - self._schema_cache = SchemaCache() - - @property - def _metadata(self) -> nw.LazyFrame: - if self._supports_parquet: - return super()._metadata - return self._csv_cache.metadata(nw.dependencies.get_pandas()) - - -class _PandasPyArrowReader(_PandasReaderBase): - def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: - _pd, _pa = _requirements(name) - self._name = name - if not TYPE_CHECKING: - pd = self._import(_pd) - pa = self._import(_pa) # noqa: F841 - - self._read_fn = { - ".csv": partial["pd.DataFrame"](pd.read_csv, dtype_backend=_pa), - ".json": partial["pd.DataFrame"](pd.read_json, dtype_backend=_pa), - ".tsv": partial["pd.DataFrame"](pd.read_csv, sep="\t", 
dtype_backend=_pa), - ".arrow": partial(pd.read_feather, dtype_backend=_pa), - ".parquet": partial(pd.read_parquet, dtype_backend=_pa), - } - self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend=_pa)} - self._schema_cache = SchemaCache() - - -def _pl_read_json_roundtrip(source: Path | IOBase, /, **kwds: Any) -> pl.DataFrame: - """ - Try to utilize better date parsing available in `pl.read_csv`_. - - `pl.read_json`_ has few options when compared to `pl.read_csv`_. - - Chaining the two together - *where possible* - is still usually faster than `pandas.read_json`_. - - .. _pl.read_json: - https://docs.pola.rs/api/python/stable/reference/api/polars.read_json.html - .. _pl.read_csv: - https://docs.pola.rs/api/python/stable/reference/api/polars.read_csv.html - .. _pandas.read_json: - https://pandas.pydata.org/docs/reference/api/pandas.read_json.html - """ - from io import BytesIO - - import polars as pl - - df = pl.read_json(source, **kwds) - if any(tp.is_nested() for tp in df.schema.dtypes()): - # NOTE: Inferred as `(Geo|Topo)JSON`, which wouldn't be supported by `read_csv` - return df - buf = BytesIO() - df.write_csv(buf) - if kwds: - SHARED_KWDS = {"schema", "schema_overrides", "infer_schema_length"} - kwds = {k: v for k, v in kwds.items() if k in SHARED_KWDS} - return pl.read_csv(buf, try_parse_dates=True, **kwds) - - -class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - def __init__(self, name: _Polars, /) -> None: - self._name = _requirements(name) - if not TYPE_CHECKING: - pl = self._import(self._name) - self._read_fn = { - ".csv": partial(pl.read_csv, try_parse_dates=True), - ".json": _pl_read_json_roundtrip, - ".tsv": partial(pl.read_csv, separator="\t", try_parse_dates=True), - ".arrow": pl.read_ipc, - ".parquet": pl.read_parquet, - } - self._scan_fn = {".parquet": pl.scan_parquet} - - -class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): - """ - Reader backed by `pyarrow.Table`_. - - Warning - ------- - **JSON**: Only supports `line-delimited`_ JSON. - Likely to raise the following error: - - ArrowInvalid: JSON parse error: Column() changed from object to array in row 0 - - .. _pyarrow.Table: - https://arrow.apache.org/docs/python/generated/pyarrow.Table.html - .. 
_line-delimited: - https://arrow.apache.org/docs/python/json.html#reading-json-files - """ - - def _maybe_fn(self, meta: Metadata, /) -> Callable[..., pa.Table]: - fn = super()._maybe_fn(meta) - if fn == self._read_json_polars: - return fn - elif meta["is_json"]: - if meta["is_tabular"]: - return self._read_json_tabular - elif meta["is_spatial"]: - raise _ds_exc.geospatial(meta, self._name) - else: - raise _ds_exc.non_tabular_json(meta, self._name) - else: - return fn - - def _read_json_tabular(self, source: Any, /, **kwds: Any) -> pa.Table: - import json - - if not isinstance(source, Path): - obj = json.load(source) - else: - with Path(source).open(encoding="utf-8") as f: - obj = json.load(f) - pa = nw.dependencies.get_pyarrow() - return pa.Table.from_pylist(obj) - - def _read_json_polars(self, source: Any, /, **kwds: Any) -> pa.Table: - return _pl_read_json_roundtrip(source).to_arrow() - - def __init__(self, name: _PyArrow, /) -> None: - self._name = _requirements(name) - if not TYPE_CHECKING: - pa = self._import(self._name) # noqa: F841 - pa_read_csv = self._import(f"{self._name}.csv").read_csv - pa_read_feather = self._import(f"{self._name}.feather").read_table - pa_read_parquet = self._import(f"{self._name}.parquet").read_table - - # NOTE: Prefer `polars` since it is zero-copy and fast - if find_spec("polars") is not None: - pa_read_json = self._read_json_polars - else: - pa_read_json = self._import(f"{self._name}.json").read_json - - # NOTE: Stubs suggest using a dataclass, but no way to construct it - tab_sep: Any = {"delimiter": "\t"} - - self._read_fn = { - ".csv": pa_read_csv, - ".json": pa_read_json, - ".tsv": partial(pa_read_csv, parse_options=tab_sep), - ".arrow": pa_read_feather, - ".parquet": pa_read_parquet, - } - self._scan_fn = {".parquet": pa_read_parquet} - - -def _extract_constraints( - name: Dataset | LiteralString, suffix: Extension | None, / -) -> Metadata: - """Transform args into a mapping to column names.""" - constraints: Metadata = {} - if name.endswith(EXTENSION_SUFFIXES): - fp = Path(name) - constraints["dataset_name"] = fp.stem - constraints["suffix"] = fp.suffix - return constraints - elif suffix is not None: - if not is_ext_read(suffix): - msg = ( - f"Expected 'suffix' to be one of {EXTENSION_SUFFIXES!r},\n" - f"but got: {suffix!r}" - ) - raise TypeError(msg) - else: - constraints["suffix"] = suffix - constraints["dataset_name"] = name - return constraints - - -def _extract_suffix(source: _IntoSuffix, guard: Callable[..., TypeIs[_T]], /) -> _T: - suffix: Any = ( - Path(source).suffix if not isinstance(source, Mapping) else source["suffix"] - ) - if guard(suffix): - return suffix - else: - msg = f"Unexpected file extension {suffix!r}, from:\n{source}" - raise TypeError(msg) - - -def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]: - return suffix == ".parquet" - - -def is_available( - pkg_names: str | Iterable[str], *more_pkg_names: str, require_all: bool = True -) -> bool: - """ - Check for importable package(s), without raising on failure. - - Parameters - ---------- - pkg_names, more_pkg_names - One or more packages. - require_all - * ``True`` every package. - * ``False`` at least one package. 
- """ - if not more_pkg_names and isinstance(pkg_names, str): - return find_spec(pkg_names) is not None - pkgs_names = pkg_names if not isinstance(pkg_names, str) else (pkg_names,) - names = chain(pkgs_names, more_pkg_names) - fn = all if require_all else any - return fn(find_spec(name) is not None for name in names) - - -def infer_backend( - *, priority: Sequence[_Backend] = ("polars", "pandas[pyarrow]", "pandas", "pyarrow") -) -> _Reader[Any, Any]: - """ - Return the first available reader in order of `priority`. - - Notes - ----- - - ``"polars"``: can natively load every dataset (including ``(Geo|Topo)JSON``) - - ``"pandas[pyarrow]"``: can load *most* datasets, guarantees ``.parquet`` support - - ``"pandas"``: supports ``.parquet``, if `fastparquet`_ is installed - - ``"pyarrow"``: least reliable - - .. _fastparquet: - https://github.com/dask/fastparquet - """ - it = (backend(name) for name in priority if is_available(_requirements(name))) - if reader := next(it, None): - return reader - raise _ds_exc.AltairDatasetsError.from_priority(priority) - - -@overload -def backend(name: _Polars, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... - - -@overload -def backend(name: _PandasAny, /) -> _Reader[pd.DataFrame, pd.DataFrame]: ... - - -@overload -def backend(name: _PyArrow, /) -> _Reader[pa.Table, pa.Table]: ... - - -def backend(name: _Backend, /) -> _Reader[Any, Any]: - """Reader initialization dispatcher.""" - if name == "polars": - return _PolarsReader(name) - elif name == "pandas[pyarrow]": - return _PandasPyArrowReader(name) - elif name == "pandas": - return _PandasReader(name) - elif name == "pyarrow": - return _PyArrowReader(name) - elif name in {"ibis", "cudf", "dask", "modin"}: - msg = "Supported by ``narwhals``, not investigated yet" - raise NotImplementedError(msg) - else: - msg = f"Unknown backend {name!r}" - raise TypeError(msg) - - -@overload -def _requirements(s: _ConcreteT, /) -> _ConcreteT: ... - - -@overload -def _requirements(s: Literal["pandas[pyarrow]"], /) -> tuple[_Pandas, _PyArrow]: ... 
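# --- Editorial aside (not part of this patch) ---------------------------------
# Both the removed ``_requirements`` below and its replacement in ``_reader.py``
# lean on ``packaging.requirements.Requirement`` to split an extras specifier
# into importable package names, roughly:
from packaging.requirements import Requirement

req = Requirement("pandas[pyarrow]")
assert req.name == "pandas"
assert req.extras == {"pyarrow"}
# the new helper returns ``(req.name, *req.extras)`` -> ("pandas", "pyarrow")
# -------------------------------------------------------------------------------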
- - -def _requirements(s: Any, /) -> Any: - concrete: set[Literal[_Polars, _Pandas, _PyArrow]] = {"polars", "pandas", "pyarrow"} - if s in concrete: - return s - else: - from packaging.requirements import Requirement - - req = Requirement(s) - supports_extras: set[Literal[_Pandas]] = {"pandas"} - if req.name in supports_extras and req.extras == {"pyarrow"}: - return req.name, "pyarrow" - return _requirements_unknown(req) - - -def _requirements_unknown(req: Requirement | str, /) -> Any: - from packaging.requirements import Requirement - - req = Requirement(req) if isinstance(req, str) else req - return (req.name, *req.extras) diff --git a/altair/datasets/_readimpl.py b/altair/datasets/_readimpl.py new file mode 100644 index 000000000..119352db5 --- /dev/null +++ b/altair/datasets/_readimpl.py @@ -0,0 +1,414 @@ +"""Individual read functions and siuations they support.""" + +from __future__ import annotations + +import sys +from enum import Enum +from functools import partial, wraps +from importlib.util import find_spec +from itertools import chain +from operator import itemgetter +from pathlib import Path +from typing import TYPE_CHECKING, Any, Generic, Literal + +from narwhals.stable import v1 as nw +from narwhals.stable.v1.dependencies import get_pandas, get_polars +from narwhals.stable.v1.typing import IntoDataFrameT + +from altair.datasets._constraints import ( + is_arrow, + is_csv, + is_json, + is_meta, + is_not_tabular, + is_parquet, + is_spatial, + is_tsv, +) +from altair.datasets._exceptions import AltairDatasetsError + +if sys.version_info >= (3, 13): + from typing import TypeVar +else: + from typing_extensions import TypeVar + +if TYPE_CHECKING: + from collections.abc import Callable, Iterable, Iterator, Sequence + from io import IOBase + from types import ModuleType + + import pandas as pd + import polars as pl + import pyarrow as pa + from narwhals.stable.v1 import typing as nwt + + from altair.datasets._constraints import Items, MetaIs + +__all__ = ["is_available", "pa_any", "pd_only", "pd_pyarrow", "pl_only", "read", "scan"] + +R = TypeVar("R") +IntoFrameT = TypeVar( + "IntoFrameT", + bound="nwt.NativeFrame | nw.DataFrame[Any] | nw.LazyFrame | nwt.DataFrameLike", + default=nw.LazyFrame, +) + + +class Skip(Enum): + """Falsy sentinel.""" + + skip = 0 + + def __bool__(self) -> Literal[False]: + return False + + def __repr__(self) -> Literal[""]: + return "" + + +class BaseImpl(Generic[R]): + fn: Callable[..., R] + """Wrapped read function.""" + include: MetaIs + """Passing this makes ``fn`` a candidate.""" + exclude: MetaIs + """Passing this overrides ``include``, transforming into an error.""" + + def __init__( + self, + fn: Callable[..., R], + include: MetaIs, + exclude: MetaIs | None, + kwds: dict[str, Any], + /, + ) -> None: + exclude = exclude or self._exclude_none() + if not include.isdisjoint(exclude): + intersection = ", ".join(f"{k}={v!r}" for k, v in include & exclude) + msg = f"Constraints overlap at: `{intersection}`\ninclude={include!r}\nexclude={exclude!r}" + raise TypeError(msg) + object.__setattr__(self, "fn", partial(fn, **kwds) if kwds else fn) + object.__setattr__(self, "include", include) + object.__setattr__(self, "exclude", exclude) + + # TODO: Consider renaming + # NOTE: + # - Fn means call it + # - Err means raise it + # - Skip means its safe to check other impls + def unwrap_or( + self, meta: Items, / + ) -> Callable[..., R] | type[AltairDatasetsError] | Skip: + if self.include.issubset(meta): + return self.fn if self.exclude.isdisjoint(meta) else 
AltairDatasetsError + return Skip.skip + + @classmethod + def _exclude_none(cls) -> MetaIs: + return is_meta() + + def __setattr__(self, name: str, value: Any): + msg = ( + f"{type(self).__name__!r} is immutable.\n" + f"Could not assign self.{name} = {value}" + ) + raise TypeError(msg) + + @property + def _inferred_package(self) -> str: + return _root_package_name(_unwrap_partial(self.fn), "UNKNOWN") + + def __repr__(self) -> str: + tp_name = f"{type(self).__name__}[{self._inferred_package}?]" + return f"{tp_name}({self._contents})" + + # TODO: Consider renaming + @property + def _contents(self) -> str: + if isinstance(self.fn, partial): + fn = _unwrap_partial(self.fn) + it = (f"{k}={v!r}" for k, v in self.fn.keywords.items()) + fn_repr = f"{fn.__name__}(..., {', '.join(it)})" + else: + fn_repr = f"{self.fn.__name__}(...)" + if self.exclude: + params = f"include={self.include!r}, exclude={self.exclude!r}" + else: + params = repr(self.include) + return f"{fn_repr}, {params}" + + @property + def _relevant_columns(self) -> Iterator[str]: + name = itemgetter(0) + yield from (name(obj) for obj in chain(self.include, self.exclude)) + + @property + def _include_expr(self) -> nw.Expr: + return ( + self.include.to_expr() & ~self.exclude.to_expr() + if self.exclude + else self.include.to_expr() + ) + + @property + def _exclude_expr(self) -> nw.Expr: + if self.exclude: + return self.include.to_expr() & self.exclude.to_expr() + msg = f"Unable to generate an exclude expression without setting exclude\n\n{self!r}" + raise TypeError(msg) + + +def _unwrap_partial(fn: Any, /) -> Any: + # NOTE: ``functools._unwrap_partial`` + func = fn + while isinstance(func, partial): + func = func.func + return func + + +class ScanImpl(BaseImpl[IntoFrameT]): ... + + +class ReadImpl(BaseImpl[IntoDataFrameT]): + def to_scan_impl(self) -> ScanImpl[nw.LazyFrame]: + return ScanImpl(_into_scan_fn(self.fn), self.include, self.exclude, {}) + + +def _into_scan_fn(fn: Callable[..., IntoDataFrameT], /) -> Callable[..., nw.LazyFrame]: + @wraps(_unwrap_partial(fn)) + def wrapper(*args: Any, **kwds: Any) -> nw.LazyFrame: + return nw.from_native(fn(*args, **kwds)).lazy() + + return wrapper + + +def _root_package_name(obj: Any, default: str, /) -> str: + # NOTE: Defers importing `inspect`, if we can get the module name + if hasattr(obj, "__module__"): + return obj.__module__.split(".")[0] + else: + from inspect import getmodule + + module = getmodule(obj) + if module and (pkg := module.__package__): + return pkg.split(".")[0] + return default + + +def is_available( + pkg_names: str | Iterable[str], *more_pkg_names: str, require_all: bool = True +) -> bool: + """ + Check for importable package(s), without raising on failure. + + Parameters + ---------- + pkg_names, more_pkg_names + One or more packages. + require_all + * ``True`` every package. + * ``False`` at least one package. 
+ """ + if not more_pkg_names and isinstance(pkg_names, str): + return find_spec(pkg_names) is not None + pkgs_names = pkg_names if not isinstance(pkg_names, str) else (pkg_names,) + names = chain(pkgs_names, more_pkg_names) + fn = all if require_all else any + return fn(find_spec(name) is not None for name in names) + + +def read( + fn: Callable[..., IntoDataFrameT], + /, + include: MetaIs, + exclude: MetaIs | None = None, + **kwds: Any, +) -> ReadImpl[IntoDataFrameT]: + return ReadImpl(fn, include, exclude, kwds) + + +def scan( + fn: Callable[..., IntoFrameT], + /, + include: MetaIs, + exclude: MetaIs | None = None, + **kwds: Any, +) -> ScanImpl[IntoFrameT]: + return ScanImpl(fn, include, exclude, kwds) + + +def pl_only() -> tuple[ + Sequence[ReadImpl[pl.DataFrame]], Sequence[ScanImpl[pl.LazyFrame]] +]: + import polars as pl + + read_fns = ( + read(pl.read_csv, is_csv, try_parse_dates=True), + read(_pl_read_json_roundtrip(get_polars()), is_json), + read(pl.read_csv, is_tsv, separator="\t", try_parse_dates=True), + read(pl.read_ipc, is_arrow), + read(pl.read_parquet, is_parquet), + ) + scan_fns = (scan(pl.scan_parquet, is_parquet),) + return read_fns, scan_fns + + +def pd_only() -> Sequence[ReadImpl[pd.DataFrame]]: + import pandas as pd + + opt: Sequence[ReadImpl[pd.DataFrame]] + if is_available("pyarrow"): + opt = read(pd.read_feather, is_arrow), read(pd.read_parquet, is_parquet) + elif is_available("fastparquet"): + opt = (read(pd.read_parquet, is_parquet),) + else: + opt = () + return ( + read(pd.read_csv, is_csv), + read(_pd_read_json(get_pandas()), is_json, exclude=is_spatial), + read(pd.read_csv, is_tsv, sep="\t"), + *opt, + ) + + +def pd_pyarrow() -> Sequence[ReadImpl[pd.DataFrame]]: + import pandas as pd + + kwds: dict[str, Any] = {"dtype_backend": "pyarrow"} + return ( + read(pd.read_csv, is_csv, **kwds), + read(_pd_read_json(get_pandas()), is_json, exclude=is_spatial, **kwds), + read(pd.read_csv, is_tsv, sep="\t", **kwds), + read(pd.read_feather, is_arrow, **kwds), + read(pd.read_parquet, is_parquet, **kwds), + ) + + +def pa_any() -> Sequence[ReadImpl[pa.Table]]: + from pyarrow import csv, feather, parquet + + return ( + read(csv.read_csv, is_csv), + _pa_read_json_impl(), + read(csv.read_csv, is_tsv, parse_options={"delimiter": "\t"}), + read(feather.read_table, is_arrow), + read(parquet.read_table, is_parquet), + ) + + +def _pa_read_json_impl() -> ReadImpl[pa.Table]: + """ + Mitigating ``pyarrow``'s `line-delimited`_ JSON requirement. + + .. 
_line-delimited: + https://arrow.apache.org/docs/python/json.html#reading-json-files + """ + if is_available("polars"): + return read(_pl_read_json_roundtrip_to_arrow(get_polars()), is_json) + elif is_available("pandas"): + return read(_pd_read_json_to_arrow(get_pandas()), is_json, exclude=is_spatial) + return read(_stdlib_read_json_to_arrow, is_json, exclude=is_not_tabular) + + +def _pd_read_json(ns: ModuleType, /) -> Callable[..., pd.DataFrame]: + @wraps(ns.read_json) + def fn(source: Path | Any, /, **kwds: Any) -> pd.DataFrame: + return _pd_fix_dtypes_nw(ns.read_json(source, **kwds), **kwds).to_native() + + return fn + + +def _pd_fix_dtypes_nw( + df: pd.DataFrame, /, *, dtype_backend: Any = None, **kwds: Any +) -> nw.DataFrame[pd.DataFrame]: + kwds = {"dtype_backend": dtype_backend} if dtype_backend else {} + return ( + df.convert_dtypes(**kwds) + .pipe(nw.from_native, eager_only=True) + .with_columns(nw.selectors.by_dtype(nw.Object).cast(nw.String)) + ) + + +def _pd_read_json_to_arrow(ns: ModuleType, /) -> Callable[..., pa.Table]: + @wraps(ns.read_json) + def fn(source: Path | Any, /, *, schema: Any = None, **kwds: Any) -> pa.Table: + """``schema`` is only here to swallow the ``SchemaCache`` if used.""" + return ( + ns.read_json(source, **kwds) + .pipe(_pd_fix_dtypes_nw, dtype_backend="pyarrow") + .to_arrow() + ) + + return fn + + +def _pl_read_json_roundtrip(ns: ModuleType, /) -> Callable[..., pl.DataFrame]: + """ + Try to utilize better date parsing available in `pl.read_csv`_. + + `pl.read_json`_ has few options when compared to `pl.read_csv`_. + + Chaining the two together - *where possible* - is still usually faster than `pandas.read_json`_. + + .. _pl.read_json: + https://docs.pola.rs/api/python/stable/reference/api/polars.read_json.html + .. _pl.read_csv: + https://docs.pola.rs/api/python/stable/reference/api/polars.read_csv.html + .. 
_pandas.read_json: + https://pandas.pydata.org/docs/reference/api/pandas.read_json.html + """ + from io import BytesIO + + @wraps(ns.read_json) + def fn(source: Path | IOBase, /, **kwds: Any) -> pl.DataFrame: + df = ns.read_json(source, **kwds) + if any(tp.is_nested() for tp in df.schema.dtypes()): + return df + buf = BytesIO() + df.write_csv(buf) + if kwds: + SHARED_KWDS = {"schema", "schema_overrides", "infer_schema_length"} + kwds = {k: v for k, v in kwds.items() if k in SHARED_KWDS} + return ns.read_csv(buf, try_parse_dates=True, **kwds) + + return fn + + +def _pl_read_json_roundtrip_to_arrow(ns: ModuleType, /) -> Callable[..., pa.Table]: + eager = _pl_read_json_roundtrip(ns) + + @wraps(ns.read_json) + def fn(source: Path | IOBase, /, **kwds: Any) -> pa.Table: + return eager(source).to_arrow() + + return fn + + +def _stdlib_read_json(source: Path | Any, /) -> Any: + import json + + if not isinstance(source, Path): + return json.load(source) + else: + with Path(source).open(encoding="utf-8") as f: + return json.load(f) + + +def _stdlib_read_json_to_arrow(source: Path | Any, /, **kwds: Any) -> pa.Table: + import pyarrow as pa + + rows: list[dict[str, Any]] = _stdlib_read_json(source) + try: + return pa.Table.from_pylist(rows, **kwds) + except TypeError: + import csv + import io + + from pyarrow import csv as pa_csv + + with io.StringIO() as f: + writer = csv.DictWriter(f, rows[0].keys(), dialect=csv.unix_dialect) + writer.writeheader() + writer.writerows(rows) + with io.BytesIO(f.getvalue().encode()) as f2: + return pa_csv.read_csv(f2) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 0855b73af..3765fa69b 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -29,7 +29,7 @@ import polars as pl from _pytest.mark.structures import ParameterSet - from altair.datasets._readers import _Backend, _PandasAny, _Polars, _PyArrow + from altair.datasets._reader import _Backend, _PandasAny, _Polars, _PyArrow from altair.vegalite.v5.schema._typing import OneOrSeq if sys.version_info >= (3, 10): @@ -117,11 +117,14 @@ def is_url(name: Dataset, fn_url: Callable[..., str], /) -> bool: def is_polars_backed_pyarrow(loader: Loader[Any, Any], /) -> bool: """User requested ``pyarrow``, but also has ``polars`` installed.""" # NOTE: Would prefer if there was a *less* private method to test this. 
- return bool( - is_loader_backend(loader, "pyarrow") - and (fn := getattr(loader._reader, "_read_json_polars", None)) - and fn == loader._reader.read_fn("dummy.json") - ) + from altair.datasets._constraints import is_meta + + if is_loader_backend(loader, "pyarrow"): + items = is_meta(suffix=".json", is_spatial=True) + impls = loader._reader._read + it = (some for impl in impls if (some := impl.unwrap_or(items))) + return callable(next(it, None)) + return False @backends @@ -151,7 +154,7 @@ def test_load_infer_priority(monkeypatch: pytest.MonkeyPatch) -> None: See Also -------- - ``altair.datasets._readers.infer_backend`` + ``altair.datasets._reader.infer_backend`` """ import altair.datasets._loader from altair.datasets import load @@ -247,7 +250,7 @@ def test_url(name: Dataset) -> None: def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: from altair.datasets._cache import csv_cache - from altair.datasets._readers import infer_backend + from altair.datasets._reader import infer_backend priority: Any = ("fake_mod_1", "fake_mod_2", "fake_mod_3", "fake_mod_4") assert csv_cache._mapping == {} @@ -318,7 +321,7 @@ def test_dataset_not_found(backend: _Backend) -> None: with pytest.raises( ERR_NO_RESULT, match=re.compile( - rf"{MSG_NO_RESULT}.+{SUFFIX}.+{incorrect_suffix}.+{NAME}.+{real_name}", + rf"{MSG_NO_RESULT}.+{NAME}.+{real_name}.+{SUFFIX}.+{incorrect_suffix}", re.DOTALL, ), ): @@ -326,19 +329,7 @@ def test_dataset_not_found(backend: _Backend) -> None: def test_reader_missing_dependencies() -> None: - from packaging.requirements import Requirement - - from altair.datasets._readers import _Reader - - class MissingDeps(_Reader): - def __init__(self, name) -> None: - self._name = name - reqs = Requirement(name) - for req in (reqs.name, *reqs.extras): - self._import(req) - - self._read_fn = {} - self._scan_fn = {} + from altair.datasets._reader import _import_guarded fake_name = "not_a_real_package" real_name = "altair" @@ -351,7 +342,7 @@ def __init__(self, name) -> None: flags=re.DOTALL, ), ): - MissingDeps(fake_name) + _import_guarded(fake_name) # type: ignore with pytest.raises( ModuleNotFoundError, match=re.compile( @@ -359,7 +350,7 @@ def __init__(self, name) -> None: flags=re.DOTALL, ), ): - MissingDeps(backend) + _import_guarded(backend) # type: ignore @backends @@ -494,38 +485,10 @@ def test_reader_cache_disable(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - assert not load.cache.is_empty() -# TODO: Investigate adding schemas for `pyarrow`. @pytest.mark.parametrize( - ("name", "fallback"), - [ - ("cars", "polars"), - ("movies", "polars"), - ("wheat", "polars"), - ("barley", "polars"), - ("gapminder", "polars"), - ("income", "polars"), - ("burtin", "polars"), - ("cars", None), - pytest.param( - "movies", - None, - marks=pytest.mark.xfail( - True, - raises=TypeError, - reason=( - "msg: `Expected bytes, got a 'int' object`\n" - "Isn't happy with the mixed `int`/`str` column." 
- ), - strict=True, - ), - ), - ("wheat", None), - ("barley", None), - ("gapminder", None), - ("income", None), - ("burtin", None), - ], + "name", ["cars", "movies", "wheat", "barley", "gapminder", "income", "burtin"] ) +@pytest.mark.parametrize("fallback", ["polars", None]) @backends_pyarrow def test_pyarrow_read_json( backend: _PyArrow, @@ -550,7 +513,7 @@ def test_spatial(backend: _Backend, name: Dataset) -> None: rf"{name}.+geospatial.+native.+{re.escape(backend)}.+try.+polars.+url", flags=re.DOTALL | re.IGNORECASE, ) - with pytest.raises(NotImplementedError, match=pattern): + with pytest.raises(AltairDatasetsError, match=pattern): load(name) @@ -558,7 +521,11 @@ def test_spatial(backend: _Backend, name: Dataset) -> None: @datasets_debug def test_all_datasets(polars_loader: PolarsLoader, name: Dataset) -> None: if name in {"7zip", "ffox", "gimp"}: - with pytest.raises(AltairDatasetsError, match=rf"{name}.+tabular"): + pattern = re.compile( + rf"Unable to load.+{name}.png.+as tabular data", + flags=re.DOTALL | re.IGNORECASE, + ) + with pytest.raises((AltairDatasetsError, NotImplementedError), match=pattern): polars_loader(name) else: frame = polars_loader(name) From 2203972ed49a97c4398116310a3ef2d607d23614 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 29 Jan 2025 16:26:04 +0000 Subject: [PATCH 180/201] refactor: Simplify obsolete paths in `CsvCache` They were an artifact of *previously* using multiple `vega-dataset` versions in `.paquet` - but only the most recent in `.csv.gz` Currently both store the same range of names, so this error handling never triggered --- altair/datasets/_cache.py | 31 ++++++++----------------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 9abe09726..13dca2f23 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -5,12 +5,11 @@ from collections import defaultdict from importlib.util import find_spec from pathlib import Path -from typing import TYPE_CHECKING, ClassVar, TypeVar, cast, get_args +from typing import TYPE_CHECKING, ClassVar, TypeVar, cast import narwhals.stable.v1 as nw from altair.datasets._exceptions import AltairDatasetsError -from altair.datasets._typing import Dataset if sys.version_info >= (3, 12): from typing import Protocol @@ -34,7 +33,7 @@ from narwhals.stable.v1.dtypes import DType from narwhals.stable.v1.typing import IntoExpr - from altair.datasets._typing import Metadata + from altair.datasets._typing import Dataset, Metadata if sys.version_info >= (3, 12): from typing import Unpack @@ -188,31 +187,17 @@ def rotated(self) -> Mapping[str, Sequence[Any]]: self._rotated[k].append(v) return self._rotated - # TODO: Evaluate which errors are now obsolete def __getitem__(self, key: _Dataset, /) -> Metadata: if meta := self.get(key, None): return meta + msg = f"{key!r} does not refer to a known dataset." + raise TypeError(msg) - if key in get_args(Dataset): - msg = f"{key!r} cannot be loaded via {type(self).__name__!r}." - raise TypeError(msg) - else: - msg = f"{key!r} does not refer to a known dataset." - raise TypeError(msg) - - # TODO: Evaluate which errors are now obsolete def url(self, name: _Dataset, /) -> str: - if meta := self.get(name, None): - if meta["suffix"] == ".parquet" and not find_spec("vegafusion"): - raise AltairDatasetsError.from_url(meta) - return meta["url"] - - if name in get_args(Dataset): - msg = f"{name!r} cannot be loaded via url." 
- raise TypeError(msg) - else: - msg = f"{name!r} does not refer to a known dataset." - raise TypeError(msg) + meta = self[name] + if meta["suffix"] == ".parquet" and not find_spec("vegafusion"): + raise AltairDatasetsError.from_url(meta) + return meta["url"] def __repr__(self) -> str: return f"<{type(self).__name__}: {'COLLECTED' if self._mapping else 'READY'}>" From e68ab89810e6d7aaa7e9ca3b19461b6603866454 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 30 Jan 2025 13:37:03 +0000 Subject: [PATCH 181/201] chore: add workaround for `narwhals` bug Opened (https://github.com/narwhals-dev/narwhals/issues/1897) Marking (https://github.com/vega/altair/pull/3631#discussion_r1934313255) as resolved --- altair/datasets/_constraints.py | 20 +++++++++++++++++++- altair/datasets/_reader.py | 10 ++++------ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/altair/datasets/_constraints.py b/altair/datasets/_constraints.py index e5eaa3b97..fbfd9cbc8 100644 --- a/altair/datasets/_constraints.py +++ b/altair/datasets/_constraints.py @@ -2,6 +2,8 @@ from __future__ import annotations +import functools +import operator from collections.abc import Set from itertools import chain from typing import TYPE_CHECKING, Any @@ -59,7 +61,23 @@ def collect(**kwds: Unpack[Metadata]) -> Metadata: return dict(self) def to_expr(self) -> nw.Expr: - return nw.all_horizontal(nw.col(name) == val for name, val in self) + """ + Convert constraint into a narhwals expression. + + Notes + ----- + Workaround for `issue`_ is performing the reduction with ``stdlib`` + + .. _issue: + https://github.com/narwhals-dev/narwhals/issues/1897 + .. _discussion: + https://github.com/vega/altair/pull/3631#discussion_r1934313255 + """ + if not self: + msg = f"Unable to convert an empty set to an expression:\n\n{self!r}" + raise TypeError(msg) + exprs = (nw.col(name) == val for name, val in self) + return functools.reduce(operator.and_, exprs) def isdisjoint(self, other: Iterable[Any]) -> bool: return super().isdisjoint(other) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index eacc516ba..b8cc6b859 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -171,13 +171,11 @@ def profile(self, mode: Literal["any", "each"]): ) frame = self._scan_metadata().select("dataset_name", *relevant_columns) it = (impl._include_expr for impl in self._read) - # BUG: ``narwhals`` raises a ``ValueError`` when ``__invert__``-ing a previously used Expr? 
- # - Can't reproduce trivially - # - Doesnt seem to be related to genexp inc_expr = nw.any_horizontal(*it) - include = _dataset_names(frame, inc_expr) - exclude = _dataset_names(frame, ~nw.col("dataset_name").is_in(include)) - return {"include": include, "exclude": exclude} + return { + "include": _dataset_names(frame, inc_expr), + "exclude": _dataset_names(frame, ~inc_expr), + } elif mode == "each": # FIXME: Rough draft of how to group results # - Don't really want a nested dict From 576a9b40da3bbf27656ff2e3f4c896d0ff3b2d9e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 30 Jan 2025 14:49:02 +0000 Subject: [PATCH 182/201] feat(typing): replace `(Read|Scan)Impl` classes with aliases - Shorter names `Read`, `Scan` - The single unique method is now `into_scan` - There was no real need to have concrete classes when they behave the same as parent --- altair/datasets/_reader.py | 26 ++++----- altair/datasets/_readimpl.py | 107 +++++++++++++++++------------------ 2 files changed, 66 insertions(+), 67 deletions(-) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index b8cc6b859..3d7d2d87f 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -52,7 +52,7 @@ import polars as pl import pyarrow as pa - from altair.datasets._readimpl import BaseImpl, R, ReadImpl, ScanImpl + from altair.datasets._readimpl import BaseImpl, R, Read, Scan from altair.datasets._typing import Dataset, Extension, Metadata from altair.vegalite.v5.schema._typing import OneOrSeq @@ -107,11 +107,11 @@ class Reader(Generic[IntoDataFrameT, IntoFrameT]): """ # TODO: Docs - _read: Sequence[ReadImpl[IntoDataFrameT]] + _read: Sequence[Read[IntoDataFrameT]] """Eager file read functions.""" # TODO: Docs - _scan: Sequence[ScanImpl[IntoFrameT]] + _scan: Sequence[Scan[IntoFrameT]] """ *Optionally*-lazy file read/scan functions. 
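# --- Editorial aside (not part of this patch) ---------------------------------
# After this commit, ``Read``/``Scan`` are parameterised aliases of ``BaseImpl``
# rather than subclasses, so the annotations below are interchangeable
# (a sketch assuming Python >= 3.12, or ``typing_extensions.TypeAliasType``):
from collections.abc import Sequence

import polars as pl

from altair.datasets._readimpl import BaseImpl, Read

eager: Sequence[Read[pl.DataFrame]]  # alias form, as used by ``Reader._read``
expanded: Sequence[BaseImpl[pl.DataFrame]]  # equivalent spelled-out form
# -------------------------------------------------------------------------------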
@@ -143,8 +143,8 @@ class Reader(Generic[IntoDataFrameT, IntoFrameT]): def __init__( self, - read: Sequence[ReadImpl[IntoDataFrameT]], - scan: Sequence[ScanImpl[IntoFrameT]], + read: Sequence[Read[IntoDataFrameT]], + scan: Sequence[Scan[IntoFrameT]], name: str, implementation: nw.Implementation, ) -> None: @@ -356,7 +356,7 @@ def _metadata_frame(self) -> nw.LazyFrame: @overload def reader( - read_fns: Sequence[ReadImpl[IntoDataFrameT]], + read_fns: Sequence[Read[IntoDataFrameT]], scan_fns: tuple[()] = ..., *, name: str | None = ..., @@ -366,8 +366,8 @@ def reader( @overload def reader( - read_fns: Sequence[ReadImpl[IntoDataFrameT]], - scan_fns: Sequence[ScanImpl[IntoFrameT]], + read_fns: Sequence[Read[IntoDataFrameT]], + scan_fns: Sequence[Scan[IntoFrameT]], *, name: str | None = ..., implementation: nw.Implementation = ..., @@ -375,8 +375,8 @@ def reader( def reader( - read_fns: Sequence[ReadImpl[IntoDataFrameT]], - scan_fns: Sequence[ScanImpl[IntoFrameT]] = (), + read_fns: Sequence[Read[IntoDataFrameT]], + scan_fns: Sequence[Scan[IntoFrameT]] = (), *, name: str | None = None, implementation: nw.Implementation = nw.Implementation.UNKNOWN, @@ -504,10 +504,10 @@ def _into_suffix(obj: Path | str, /) -> Any: def _steal_eager_parquet( - read_fns: Sequence[ReadImpl[IntoDataFrameT]], / -) -> Sequence[ScanImpl[nw.LazyFrame]] | None: + read_fns: Sequence[Read[IntoDataFrameT]], / +) -> Sequence[Scan[nw.LazyFrame]] | None: if convertable := next((rd for rd in read_fns if rd.include <= is_parquet), None): - return (convertable.to_scan_impl(),) + return (_readimpl.into_scan(convertable),) return None diff --git a/altair/datasets/_readimpl.py b/altair/datasets/_readimpl.py index 119352db5..fc9c77110 100644 --- a/altair/datasets/_readimpl.py +++ b/altair/datasets/_readimpl.py @@ -31,6 +31,10 @@ from typing import TypeVar else: from typing_extensions import TypeVar +if sys.version_info >= (3, 12): + from typing import TypeAliasType +else: + from typing_extensions import TypeAliasType if TYPE_CHECKING: from collections.abc import Callable, Iterable, Iterator, Sequence @@ -46,12 +50,14 @@ __all__ = ["is_available", "pa_any", "pd_only", "pd_pyarrow", "pl_only", "read", "scan"] -R = TypeVar("R") +R = TypeVar("R", bound="nwt.IntoFrame") IntoFrameT = TypeVar( "IntoFrameT", bound="nwt.NativeFrame | nw.DataFrame[Any] | nw.LazyFrame | nwt.DataFrameLike", default=nw.LazyFrame, ) +Scan = TypeAliasType("Scan", "BaseImpl[IntoFrameT]", type_params=(IntoFrameT,)) +Read = TypeAliasType("Read", "BaseImpl[IntoDataFrameT]", type_params=(IntoDataFrameT,)) class Skip(Enum): @@ -158,41 +164,35 @@ def _exclude_expr(self) -> nw.Expr: raise TypeError(msg) -def _unwrap_partial(fn: Any, /) -> Any: - # NOTE: ``functools._unwrap_partial`` - func = fn - while isinstance(func, partial): - func = func.func - return func - - -class ScanImpl(BaseImpl[IntoFrameT]): ... 
- - -class ReadImpl(BaseImpl[IntoDataFrameT]): - def to_scan_impl(self) -> ScanImpl[nw.LazyFrame]: - return ScanImpl(_into_scan_fn(self.fn), self.include, self.exclude, {}) +def read( + fn: Callable[..., IntoDataFrameT], + /, + include: MetaIs, + exclude: MetaIs | None = None, + **kwds: Any, +) -> Read[IntoDataFrameT]: + return BaseImpl(fn, include, exclude, kwds) -def _into_scan_fn(fn: Callable[..., IntoDataFrameT], /) -> Callable[..., nw.LazyFrame]: - @wraps(_unwrap_partial(fn)) - def wrapper(*args: Any, **kwds: Any) -> nw.LazyFrame: - return nw.from_native(fn(*args, **kwds)).lazy() +def scan( + fn: Callable[..., IntoFrameT], + /, + include: MetaIs, + exclude: MetaIs | None = None, + **kwds: Any, +) -> Scan[IntoFrameT]: + return BaseImpl(fn, include, exclude, kwds) - return wrapper +def into_scan(impl: Read[IntoDataFrameT], /) -> Scan[nw.LazyFrame]: + def scan_fn(fn: Callable[..., IntoDataFrameT], /) -> Callable[..., nw.LazyFrame]: + @wraps(_unwrap_partial(fn)) + def wrapper(*args: Any, **kwds: Any) -> nw.LazyFrame: + return nw.from_native(fn(*args, **kwds)).lazy() -def _root_package_name(obj: Any, default: str, /) -> str: - # NOTE: Defers importing `inspect`, if we can get the module name - if hasattr(obj, "__module__"): - return obj.__module__.split(".")[0] - else: - from inspect import getmodule + return wrapper - module = getmodule(obj) - if module and (pkg := module.__package__): - return pkg.split(".")[0] - return default + return BaseImpl(scan_fn(impl.fn), impl.include, impl.exclude, {}) def is_available( @@ -217,29 +217,28 @@ def is_available( return fn(find_spec(name) is not None for name in names) -def read( - fn: Callable[..., IntoDataFrameT], - /, - include: MetaIs, - exclude: MetaIs | None = None, - **kwds: Any, -) -> ReadImpl[IntoDataFrameT]: - return ReadImpl(fn, include, exclude, kwds) +def _root_package_name(obj: Any, default: str, /) -> str: + # NOTE: Defers importing `inspect`, if we can get the module name + if hasattr(obj, "__module__"): + return obj.__module__.split(".")[0] + else: + from inspect import getmodule + module = getmodule(obj) + if module and (pkg := module.__package__): + return pkg.split(".")[0] + return default -def scan( - fn: Callable[..., IntoFrameT], - /, - include: MetaIs, - exclude: MetaIs | None = None, - **kwds: Any, -) -> ScanImpl[IntoFrameT]: - return ScanImpl(fn, include, exclude, kwds) + +def _unwrap_partial(fn: Any, /) -> Any: + # NOTE: ``functools._unwrap_partial`` + func = fn + while isinstance(func, partial): + func = func.func + return func -def pl_only() -> tuple[ - Sequence[ReadImpl[pl.DataFrame]], Sequence[ScanImpl[pl.LazyFrame]] -]: +def pl_only() -> tuple[Sequence[Read[pl.DataFrame]], Sequence[Scan[pl.LazyFrame]]]: import polars as pl read_fns = ( @@ -253,10 +252,10 @@ def pl_only() -> tuple[ return read_fns, scan_fns -def pd_only() -> Sequence[ReadImpl[pd.DataFrame]]: +def pd_only() -> Sequence[Read[pd.DataFrame]]: import pandas as pd - opt: Sequence[ReadImpl[pd.DataFrame]] + opt: Sequence[Read[pd.DataFrame]] if is_available("pyarrow"): opt = read(pd.read_feather, is_arrow), read(pd.read_parquet, is_parquet) elif is_available("fastparquet"): @@ -271,7 +270,7 @@ def pd_only() -> Sequence[ReadImpl[pd.DataFrame]]: ) -def pd_pyarrow() -> Sequence[ReadImpl[pd.DataFrame]]: +def pd_pyarrow() -> Sequence[Read[pd.DataFrame]]: import pandas as pd kwds: dict[str, Any] = {"dtype_backend": "pyarrow"} @@ -284,7 +283,7 @@ def pd_pyarrow() -> Sequence[ReadImpl[pd.DataFrame]]: ) -def pa_any() -> Sequence[ReadImpl[pa.Table]]: +def pa_any() -> 
Sequence[Read[pa.Table]]: from pyarrow import csv, feather, parquet return ( @@ -296,7 +295,7 @@ def pa_any() -> Sequence[ReadImpl[pa.Table]]: ) -def _pa_read_json_impl() -> ReadImpl[pa.Table]: +def _pa_read_json_impl() -> Read[pa.Table]: """ Mitigating ``pyarrow``'s `line-delimited`_ JSON requirement. From 91562d55d1b7120cd065b8ad20893de73700b3e6 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 30 Jan 2025 16:02:41 +0000 Subject: [PATCH 183/201] feat: Rename, docs `unwrap_or` -> `unwrap_or_skip` --- altair/datasets/_reader.py | 2 +- altair/datasets/_readimpl.py | 22 ++++++++++++++++------ tests/test_datasets.py | 2 +- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index 3d7d2d87f..46a7f5620 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -311,7 +311,7 @@ def _solve( - Leaves the door open for caching the search space """ items = meta.items() - it = (some for impl in impls if (some := impl.unwrap_or(items))) + it = (some for impl in impls if (some := impl.unwrap_or_skip(items))) if fn_or_err := next(it, None): if _is_err(fn_or_err): raise fn_or_err.from_tabular(meta, self._name) diff --git a/altair/datasets/_readimpl.py b/altair/datasets/_readimpl.py index fc9c77110..f964da253 100644 --- a/altair/datasets/_readimpl.py +++ b/altair/datasets/_readimpl.py @@ -97,14 +97,24 @@ def __init__( object.__setattr__(self, "include", include) object.__setattr__(self, "exclude", exclude) - # TODO: Consider renaming - # NOTE: - # - Fn means call it - # - Err means raise it - # - Skip means its safe to check other impls - def unwrap_or( + def unwrap_or_skip( self, meta: Items, / ) -> Callable[..., R] | type[AltairDatasetsError] | Skip: + """ + Indicate an action to take for a dataset. 
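# --- [editor's sketch] ----------------------------------------------------------
# The outcomes enumerated just below, exercised directly. Assumes `is_csv`
# constrains only `suffix == ".csv"` and that the tiny mapping here is a heavily
# trimmed stand-in for a real `Metadata` row.
import pandas as pd
from altair.datasets._constraints import is_csv
from altair.datasets._readimpl import Skip, read

impl = read(pd.read_csv, is_csv)
impl.unwrap_or_skip({"suffix": ".csv"}.items())   # -> the read function (supported)
impl.unwrap_or_skip({"suffix": ".json"}.items())  # -> Skip.skip (no overlap, try other impls)
# a non-empty `exclude` overlapping the row would instead return `AltairDatasetsError`
# --- end sketch -----------------------------------------------------------------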
+ + **Supports** dataset, use this function:: + + Callable[..., R] + + Has explicitly marked as **not supported**:: + + type[AltairDatasetsError] + + No relevant constraints overlap, safe to check others:: + + Skip + """ if self.include.issubset(meta): return self.fn if self.exclude.isdisjoint(meta) else AltairDatasetsError return Skip.skip diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 3765fa69b..8acefa0e6 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -122,7 +122,7 @@ def is_polars_backed_pyarrow(loader: Loader[Any, Any], /) -> bool: if is_loader_backend(loader, "pyarrow"): items = is_meta(suffix=".json", is_spatial=True) impls = loader._reader._read - it = (some for impl in impls if (some := impl.unwrap_or(items))) + it = (some for impl in impls if (some := impl.unwrap_or_skip(items))) return callable(next(it, None)) return False From 1628cbd6c3ff642996d22ac15822854f1017173c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 30 Jan 2025 16:31:27 +0000 Subject: [PATCH 184/201] refactor: Replace `._contents` w/ `.__str__()` Inspired by https://github.com/pypa/packaging/blob/8510bd9d3bab5571974202ec85f6ef7b0359bfaf/src/packaging/requirements.py#L67-L71 --- altair/datasets/_reader.py | 8 +++----- altair/datasets/_readimpl.py | 17 ++++++----------- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index 46a7f5620..c06fdc9cc 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -182,7 +182,7 @@ def profile(self, mode: Literal["any", "each"]): m = {} frame = self._scan_metadata() for impl in self._read: - name = impl._contents + name = str(impl) m[name] = {"include": _dataset_names(frame, impl._include_expr)} if impl.exclude: m[name].update(exclude=_dataset_names(frame, impl._exclude_expr)) @@ -196,11 +196,9 @@ def __repr__(self) -> str: PREFIX = " " * 4 NL = "\n" - body = f"read\n{indent(NL.join(el._contents for el in self._read), PREFIX)}" + body = f"read\n{indent(NL.join(str(el) for el in self._read), PREFIX)}" if self._scan: - body += ( - f"\nscan\n{indent(NL.join(el._contents for el in self._scan), PREFIX)}" - ) + body += f"\nscan\n{indent(NL.join(str(el) for el in self._scan), PREFIX)}" return f"Reader[{self._name}] {self._implementation!r}\n{body}" def read_fn(self, meta: Metadata, /) -> Callable[..., IntoDataFrameT]: diff --git a/altair/datasets/_readimpl.py b/altair/datasets/_readimpl.py index f964da253..4969a25f9 100644 --- a/altair/datasets/_readimpl.py +++ b/altair/datasets/_readimpl.py @@ -136,22 +136,17 @@ def _inferred_package(self) -> str: def __repr__(self) -> str: tp_name = f"{type(self).__name__}[{self._inferred_package}?]" - return f"{tp_name}({self._contents})" + return f"{tp_name}({self})" - # TODO: Consider renaming - @property - def _contents(self) -> str: + def __str__(self) -> str: if isinstance(self.fn, partial): fn = _unwrap_partial(self.fn) - it = (f"{k}={v!r}" for k, v in self.fn.keywords.items()) - fn_repr = f"{fn.__name__}(..., {', '.join(it)})" + kwds = self.fn.keywords.items() + fn_repr = f"{fn.__name__}(..., {', '.join(f'{k}={v!r}' for k, v in kwds)})" else: fn_repr = f"{self.fn.__name__}(...)" - if self.exclude: - params = f"include={self.include!r}, exclude={self.exclude!r}" - else: - params = repr(self.include) - return f"{fn_repr}, {params}" + inc, exc = self.include, self.exclude + return f"{fn_repr}, {f'include={inc!r}, exclude={exc!r}' if exc else repr(inc)}" @property 
def _relevant_columns(self) -> Iterator[str]: From cbd04e33cfb38a646862bb0c5b7bc2c2d1ce815b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 30 Jan 2025 17:01:39 +0000 Subject: [PATCH 185/201] fix: Use correct type for `pyarrow.csv.read_csv` Resolves: ```py File ../altair/.venv/Lib/site-packages/pyarrow/csv.pyx:1258, in pyarrow._csv.read_csv() TypeError: Cannot convert dict to pyarrow._csv.ParseOptions ``` --- altair/datasets/_readimpl.py | 2 +- tests/test_datasets.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/altair/datasets/_readimpl.py b/altair/datasets/_readimpl.py index 4969a25f9..0278d48b5 100644 --- a/altair/datasets/_readimpl.py +++ b/altair/datasets/_readimpl.py @@ -294,7 +294,7 @@ def pa_any() -> Sequence[Read[pa.Table]]: return ( read(csv.read_csv, is_csv), _pa_read_json_impl(), - read(csv.read_csv, is_tsv, parse_options={"delimiter": "\t"}), + read(csv.read_csv, is_tsv, parse_options=csv.ParseOptions(delimiter="\t")), # pyright: ignore[reportCallIssue] read(feather.read_table, is_arrow), read(parquet.read_table, is_parquet), ) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 8acefa0e6..60b4a9cfb 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -517,6 +517,12 @@ def test_spatial(backend: _Backend, name: Dataset) -> None: load(name) +@backends +def test_tsv(backend: _Backend) -> None: + load = Loader.from_backend(backend) + is_frame_backend(load("unemployment", ".tsv"), backend) + + @datasets_all @datasets_debug def test_all_datasets(polars_loader: PolarsLoader, name: Dataset) -> None: From c0a92a618469fb44c843b927bd3d3276a2732d7b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 30 Jan 2025 18:16:12 +0000 Subject: [PATCH 186/201] docs: Add docs for `Read`, `Scan`, `BaseImpl` --- altair/datasets/_reader.py | 11 +---------- altair/datasets/_readimpl.py | 33 +++++++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index c06fdc9cc..2162d910c 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -106,20 +106,11 @@ class Reader(Generic[IntoDataFrameT, IntoFrameT]): Use ``reader(...)`` instead of instantiating ``Reader`` directly. """ - # TODO: Docs _read: Sequence[Read[IntoDataFrameT]] """Eager file read functions.""" - # TODO: Docs _scan: Sequence[Scan[IntoFrameT]] - """ - *Optionally*-lazy file read/scan functions. - - Used exclusively for ``metadata.parquet``. - - Currently ``"polars"`` is the only lazy option. - All others defer to the eager variant. - """ + """Lazy file read functions.""" _name: str """ diff --git a/altair/datasets/_readimpl.py b/altair/datasets/_readimpl.py index 0278d48b5..cc4c01e07 100644 --- a/altair/datasets/_readimpl.py +++ b/altair/datasets/_readimpl.py @@ -56,8 +56,11 @@ bound="nwt.NativeFrame | nw.DataFrame[Any] | nw.LazyFrame | nwt.DataFrameLike", default=nw.LazyFrame, ) -Scan = TypeAliasType("Scan", "BaseImpl[IntoFrameT]", type_params=(IntoFrameT,)) Read = TypeAliasType("Read", "BaseImpl[IntoDataFrameT]", type_params=(IntoDataFrameT,)) +"""An *eager* file read function.""" + +Scan = TypeAliasType("Scan", "BaseImpl[IntoFrameT]", type_params=(IntoFrameT,)) +"""A *lazy* file read function.""" class Skip(Enum): @@ -73,12 +76,33 @@ def __repr__(self) -> Literal[""]: class BaseImpl(Generic[R]): + """ + A function wrapped with dataset support constraints. 
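# --- [editor's note] --------------------------------------------------------------
# The failure fixed in PATCH 185 above, reproduced in isolation: `pyarrow.csv.read_csv`
# expects a `ParseOptions` instance, not a plain dict (assumes pyarrow is installed).
from io import BytesIO
from pyarrow import csv

tsv = BytesIO(b"year\trate\n2000\t3.9\n")
# csv.read_csv(tsv, parse_options={"delimiter": "\t"})
#   TypeError: Cannot convert dict to pyarrow._csv.ParseOptions
table = csv.read_csv(tsv, parse_options=csv.ParseOptions(delimiter="\t"))
assert table.column_names == ["year", "rate"]
# --- end note ---------------------------------------------------------------------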
+ + The ``include``, ``exclude`` properties form a `NIMPLY gate`_ (`Material nonimplication`_). + + Examples + -------- + For some dataset ``D``, we can use ``fn`` if:: + + impl: BaseImpl + impl.include(D) and not impl.exclude(D) + + + .. _NIMPLY gate: + https://en.m.wikipedia.org/wiki/NIMPLY_gate + .. _Material nonimplication: + https://en.m.wikipedia.org/wiki/Material_nonimplication#Truth_table + """ + fn: Callable[..., R] - """Wrapped read function.""" + """Wrapped read/scan function.""" + include: MetaIs - """Passing this makes ``fn`` a candidate.""" + """Constraint indicating ``fn`` **supports** reading a dataset.""" + exclude: MetaIs - """Passing this overrides ``include``, transforming into an error.""" + """Constraint *subsetting* ``include`` to mark **non-support**.""" def __init__( self, @@ -121,6 +145,7 @@ def unwrap_or_skip( @classmethod def _exclude_none(cls) -> MetaIs: + """Represents the empty set.""" return is_meta() def __setattr__(self, name: str, value: Any): From 2b8bf5e5459ac800711535f6cdb833d065bf0909 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 30 Jan 2025 19:01:49 +0000 Subject: [PATCH 187/201] docs: Clean up `_merge_kwds`, `_solve` --- altair/datasets/_reader.py | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index 2162d910c..4075598ec 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -254,15 +254,11 @@ def _query( msg = f"Found no results for:\n {constraints!r}" raise ValueError(msg) - # TODO: Docs def _merge_kwds(self, meta: Metadata, kwds: dict[str, Any], /) -> Mapping[str, Any]: """ - Hook to utilize ``meta`` to extend ``kwds`` with known helpful defaults. + Extend user-provided arguments with dataset & library-specfic defaults. - - User provided arguments have a higher precedence. - - The keywords for schemas vary between libraries - - pandas is internally inconsistent - - By default, returns unchanged + .. important:: User-provided arguments have a higher precedence. """ if self._schema_cache.is_active() and ( schema := self._schema_cache.schema_kwds(meta) @@ -282,22 +278,15 @@ def _scan_metadata( return self._metadata_frame.filter(*predicates, **constraints) return self._metadata_frame - # TODO: Docs def _solve( self, meta: Metadata, impls: Sequence[BaseImpl[R]], / ) -> Callable[..., R]: """ - Return the first function meeting constraints of meta. - - Notes - ----- - - Iterate over impls - - Each one can either match or signal an error - - An error blocks any additional checking - - Both include & exclude - - Uses ``ItemsView`` to support set ops - - `meta` isn't iterated over - - Leaves the door open for caching the search space + Return the first function that satisfies dataset constraints. 
+ + See Also + -------- + ``altair.datasets._readimpl.BaseImpl.unwrap_or_skip`` """ items = meta.items() it = (some for impl in impls if (some := impl.unwrap_or_skip(items))) From 755ab4f560af13f9268e905cf70783c34b30b1d7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 30 Jan 2025 19:45:00 +0000 Subject: [PATCH 188/201] refactor(typing): Include all suffixes in `Extension` Also simplifies and removes outdated `Extension`-related tooling --- altair/datasets/_reader.py | 7 +++++-- altair/datasets/_typing.py | 22 +++------------------- tests/test_datasets.py | 7 ++----- tools/datasets/__init__.py | 17 +++-------------- tools/datasets/datapackage.py | 3 +-- 5 files changed, 14 insertions(+), 42 deletions(-) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index 4075598ec..309080823 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -41,7 +41,6 @@ module_not_found, ) from altair.datasets._readimpl import IntoFrameT, is_available -from altair.datasets._typing import EXTENSION_SUFFIXES if TYPE_CHECKING: import sys @@ -443,8 +442,12 @@ def _into_constraints( elif suffix.startswith("."): m = {"dataset_name": name, "suffix": suffix} else: + from typing import get_args + + from altair.datasets._typing import Extension + msg = ( - f"Expected 'suffix' to be one of {EXTENSION_SUFFIXES!r},\n" + f"Expected 'suffix' to be one of {get_args(Extension)!r},\n" f"but got: {suffix!r}" ) raise TypeError(msg) diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index 958db2300..7c524f2ec 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -4,25 +4,20 @@ from __future__ import annotations import sys -from typing import Any, Literal +from typing import Literal if sys.version_info >= (3, 14): from typing import TypedDict else: from typing_extensions import TypedDict -if sys.version_info >= (3, 13): - from typing import TypeIs -else: - from typing_extensions import TypeIs - if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias -__all__ = ["EXTENSION_SUFFIXES", "Dataset", "Extension", "Metadata", "is_ext_read"] +__all__ = ["Dataset", "Extension", "Metadata"] Dataset: TypeAlias = Literal[ "7zip", @@ -98,18 +93,7 @@ "world-110m", "zipcodes", ] -Extension: TypeAlias = Literal[".arrow", ".csv", ".json", ".parquet", ".tsv"] -EXTENSION_SUFFIXES: tuple[ - Literal[".arrow"], - Literal[".csv"], - Literal[".json"], - Literal[".parquet"], - Literal[".tsv"], -] = (".arrow", ".csv", ".json", ".parquet", ".tsv") - - -def is_ext_read(suffix: Any) -> TypeIs[Extension]: - return suffix in {".arrow", ".csv", ".json", ".parquet", ".tsv"} +Extension: TypeAlias = Literal[".arrow", ".csv", ".json", ".parquet", ".png", ".tsv"] class Metadata(TypedDict, total=False): diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 60b4a9cfb..429f4b16a 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -16,7 +16,7 @@ from altair.datasets import Loader from altair.datasets._exceptions import AltairDatasetsError -from altair.datasets._typing import Dataset, Metadata, is_ext_read +from altair.datasets._typing import Dataset, Metadata from tests import no_xdist, skip_requires_pyarrow from tools import fs @@ -441,10 +441,7 @@ def test_reader_cache_exhaustive( # NOTE: Approximating all datasets downloaded assert len(cached_paths) >= 70 - assert all( - bool(fp.exists() and is_ext_read(fp.suffix) and fp.stat().st_size) - for fp in load.cache 
- ) + assert all(bool(fp.exists() and fp.stat().st_size) for fp in load.cache) # NOTE: Confirm this is a no-op load.cache.download_all() assert len(cached_paths) == len(tuple(load.cache)) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 64940ebc1..6c8c75fe5 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -190,13 +190,6 @@ def generate_typing(self, dpkg: datapackage.DataPackage) -> None: indent = " " * 4 NAME = "Dataset" EXT = "Extension" - EXT_TYPES = dpkg.extensions() - EXTENSION_SUFFIXES = "EXTENSION_SUFFIXES" - EXTENSION_TYPE_TP = ( - f"tuple[{', '.join(f'Literal[{el!r}]' for el in EXT_TYPES)}]" - ) - EXTENSION_GUARD = "is_ext_read" - FIELD = "FlFieldStr" FIELD_TYPES = ( "integer", @@ -215,17 +208,13 @@ def generate_typing(self, dpkg: datapackage.DataPackage) -> None: f"{HEADER_COMMENT}", "from __future__ import annotations\n", "import sys", - "from typing import Any, Literal, TYPE_CHECKING", + "from typing import Literal, TYPE_CHECKING", utils.import_typing_extensions((3, 14), "TypedDict"), - utils.import_typing_extensions((3, 13), "TypeIs"), utils.import_typing_extensions((3, 10), "TypeAlias"), "\n", - f"__all__ = {[NAME, EXT, dpkg._NAME_TYPED_DICT, EXTENSION_GUARD, EXTENSION_SUFFIXES]}\n", + f"__all__ = {[NAME, EXT, dpkg._NAME_TYPED_DICT]}\n", utils.spell_literal_alias(NAME, dpkg.dataset_names()), - utils.spell_literal_alias(EXT, EXT_TYPES), - f"{EXTENSION_SUFFIXES}: {EXTENSION_TYPE_TP} = {EXT_TYPES!r}", - f"def {EXTENSION_GUARD}(suffix: Any) -> TypeIs[{EXT}]:\n" - f"{indent}return suffix in set({EXT_TYPES!r})\n", + utils.spell_literal_alias(EXT, dpkg.extensions()), dpkg.typed_dict(), utils.spell_literal_alias(FIELD, FIELD_TYPES), '"""\n' diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index 9747bdb71..ec707c0da 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -103,8 +103,7 @@ def dataset_names(self) -> Iterable[str]: def extensions(self) -> tuple[str, ...]: return tuple( - self.core.filter(is_image=False) - .select(col("suffix").unique().sort()) + self.core.select(col("suffix").unique().sort()) .collect() .to_series() .to_list() From 0ba3d677ab91092380f4fa5388766e866d5be924 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 31 Jan 2025 15:05:02 +0000 Subject: [PATCH 189/201] feat: Finish `Reader.profile` - Reduced the scope a bit, now just un/supported - Added `pprint` option - Finished docs, including example pointing to use `url(...)` --- altair/datasets/_reader.py | 100 +++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 44 deletions(-) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index 309080823..195607fe5 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -96,6 +96,26 @@ _PySpark, ) +_SupportProfile: TypeAlias = Mapping[ + Literal["supported", "unsupported"], "Sequence[Dataset]" +] +""" +Dataset support varies between backends and available dependencies. 
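# --- [editor's note] --------------------------------------------------------------
# Quick illustration of what replaces `EXTENSION_SUFFIXES` after PATCH 188: the
# suffixes are recovered from the `Extension` Literal on demand.
from typing import get_args
from altair.datasets._typing import Extension

assert set(get_args(Extension)) == {".arrow", ".csv", ".json", ".parquet", ".png", ".tsv"}
# --- end note ---------------------------------------------------------------------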
+ +Any name listed in ``"unsupported"`` will raise an error on:: + + from altair.datasets import load + + load("7zip") + +Instead, they can be loaded via:: + + import altair as alt + from altair.datasets import url + + alt.Chart(url("7zip")) +""" + class Reader(Generic[IntoDataFrameT, IntoFrameT]): """ @@ -144,43 +164,6 @@ def __init__( self._implementation = implementation self._schema_cache = SchemaCache(implementation=implementation) - # TODO: Finish working on presentation - # - The contents of both are functional - def profile(self, mode: Literal["any", "each"]): - """ - Describe which datasets/groups are supported. - - Focusing on actual datasets, rather than describing wrapped functions (repr) - - .. note:: - Having this public to make testing easier (``tests.test_datasets.is_polars_backed_pyarrow``) - """ - if mode == "any": - relevant_columns = set( - chain.from_iterable(impl._relevant_columns for impl in self._read) - ) - frame = self._scan_metadata().select("dataset_name", *relevant_columns) - it = (impl._include_expr for impl in self._read) - inc_expr = nw.any_horizontal(*it) - return { - "include": _dataset_names(frame, inc_expr), - "exclude": _dataset_names(frame, ~inc_expr), - } - elif mode == "each": - # FIXME: Rough draft of how to group results - # - Don't really want a nested dict - m = {} - frame = self._scan_metadata() - for impl in self._read: - name = str(impl) - m[name] = {"include": _dataset_names(frame, impl._include_expr)} - if impl.exclude: - m[name].update(exclude=_dataset_names(frame, impl._exclude_expr)) - return m - else: - msg = f"Unexpected {mode=}" - raise TypeError(msg) - def __repr__(self) -> str: from textwrap import indent @@ -234,6 +217,38 @@ def url( msg = f"Expected 'str' but got {type(url).__name__!r}\nfrom {url!r}." raise TypeError(msg) + @overload + def profile(self, *, show: Literal[False] = ...) -> _SupportProfile: ... + + @overload + def profile(self, *, show: Literal[True]) -> None: ... + + def profile(self, *, show: bool = False) -> _SupportProfile | None: + """ + Describe which datasets can be loaded as tabular data. + + Parameters + ---------- + show + Print a densely formatted repr *instead of* returning a mapping. 
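# --- [editor's sketch] --------------------------------------------------------------
# Typical use of the finished `profile`; the output below is abridged and the exact
# split depends on the chosen backend and which optional packages are installed.
from altair.datasets import Loader

load = Loader.from_backend("pyarrow")
load._reader.profile(show=True)
# {'unsupported': ['7zip', ...],
#  'supported': ['airports', 'cars', ...]}
# --- end sketch -----------------------------------------------------------------------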
+ """ + relevant_columns = set( + chain.from_iterable(impl._relevant_columns for impl in self._read) + ) + frame = self._scan_metadata().select("dataset_name", *relevant_columns) + it = (impl._include_expr for impl in self._read) + inc_expr = nw.any_horizontal(*it) + result: _SupportProfile = { + "unsupported": _dataset_names(frame, ~inc_expr), + "supported": _dataset_names(frame, inc_expr), + } + if show: + import pprint + + pprint.pprint(result, compact=True, sort_dicts=False) + return None + return result + def _query( self, name: Dataset | LiteralString, suffix: Extension | None = None, / ) -> nw.DataFrame[IntoDataFrameT]: @@ -298,15 +313,12 @@ def _solve( raise implementation_not_found(meta) -# TODO: Review after finishing `profile` -# NOTE: Temp helper function for `Reader.profile` def _dataset_names( - frame: nw.LazyFrame, - *predicates: OneOrSeq[IntoExpr], - **constraints: Unpack[Metadata], -): + frame: nw.LazyFrame, *predicates: OneOrSeq[IntoExpr] +) -> Sequence[Dataset]: + # NOTE: helper function for `Reader.profile` return ( - frame.filter(*predicates, **constraints) + frame.filter(*predicates) .select("dataset_name") .collect() .get_column("dataset_name") From 845b3eec47a8c1125ce4b2e1977e1c995eea6d61 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 31 Jan 2025 15:21:15 +0000 Subject: [PATCH 190/201] test: Use `Reader.profile` in `is_polars_backed_pyarrow` --- tests/test_datasets.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 429f4b16a..2bef2ed70 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -115,16 +115,15 @@ def is_url(name: Dataset, fn_url: Callable[..., str], /) -> bool: def is_polars_backed_pyarrow(loader: Loader[Any, Any], /) -> bool: - """User requested ``pyarrow``, but also has ``polars`` installed.""" - # NOTE: Would prefer if there was a *less* private method to test this. - from altair.datasets._constraints import is_meta - - if is_loader_backend(loader, "pyarrow"): - items = is_meta(suffix=".json", is_spatial=True) - impls = loader._reader._read - it = (some for impl in impls if (some := impl.unwrap_or_skip(items))) - return callable(next(it, None)) - return False + """ + User requested ``pyarrow``, but also has ``polars`` installed. + + Both support nested datatypes, which are required for spatial json. 
+ """ + return ( + is_loader_backend(loader, "pyarrow") + and "earthquakes" in loader._reader.profile()["supported"] + ) @backends From 869d2161bde45d59582c687e574fbfe0f7efe776 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 31 Jan 2025 18:47:20 +0000 Subject: [PATCH 191/201] feat: Clean up, add tests for new exceptions --- altair/datasets/_exceptions.py | 68 ++++++++++++++++------------------ altair/datasets/_reader.py | 10 +---- tests/test_datasets.py | 29 ++++++++++++++- 3 files changed, 62 insertions(+), 45 deletions(-) diff --git a/altair/datasets/_exceptions.py b/altair/datasets/_exceptions.py index 2f9c13d45..3b377f657 100644 --- a/altair/datasets/_exceptions.py +++ b/altair/datasets/_exceptions.py @@ -28,16 +28,25 @@ def from_url(cls, meta: Metadata, /) -> AltairDatasetsError: @classmethod def from_tabular(cls, meta: Metadata, backend_name: str, /) -> AltairDatasetsError: - install_other = None - mid = "\n" - if not meta["is_image"] and not meta["is_tabular"]: - install_other = "polars" - if meta["is_spatial"]: - mid = f"Geospatial data is not supported natively by {backend_name!r}." - elif meta["is_json"]: - mid = f"Non-tabular json is not supported natively by {backend_name!r}." - msg = f"{_failed_tabular(meta)}{mid}{_suggest_url(meta, install_other)}" - return cls(msg) + if meta["is_image"]: + reason = "Image data is non-tabular." + return cls(f"{_failed_tabular(meta)}{reason}{_suggest_url(meta)}") + elif not meta["is_tabular"] or meta["suffix"] in {".arrow", ".parquet"}: + if meta["suffix"] in {".arrow", ".parquet"}: + install: tuple[str, ...] = "pyarrow", "polars" + what = f"{meta['suffix']!r}" + else: + install = ("polars",) + if meta["is_spatial"]: + what = "Geospatial data" + elif meta["is_json"]: + what = "Non-tabular json" + else: + what = f"{meta['file_name']!r}" + reason = _why(what, backend_name) + return cls(f"{_failed_tabular(meta)}{reason}{_suggest_url(meta, *install)}") + else: + return cls(_implementation_not_found(meta)) @classmethod def from_priority(cls, priority: Sequence[_Backend], /) -> AltairDatasetsError: @@ -70,36 +79,24 @@ def _failed_tabular(meta: Metadata, /) -> str: return f"Unable to load {meta['file_name']!r} as tabular data.\n" -def _suggest_url(meta: Metadata, install_other: str | None = None) -> str: - other = f" installing `{install_other}` or" if install_other else "" +def _why(what: str, backend_name: str, /) -> str: + return f"{what} is not supported natively by {backend_name!r}." + + +def _suggest_url(meta: Metadata, *install_other: str) -> str: + other = "" + if install_other: + others = " or ".join(f"`{other}`" for other in install_other) + other = f" installing {others}, or use" return ( - f"\n\nInstead, try{other}:\n\n" + f"\n\nInstead, try{other}:\n" " from altair.datasets import url\n" f" url({meta['dataset_name']!r})" ) -# TODO: -# - Use `AltairDatasetsError` -# - Remove notes from doc -# - Improve message and how data is selected -def implementation_not_found(meta: Metadata, /) -> NotImplementedError: - """ - Search finished without finding a *declared* incompatibility. - - Notes - ----- - - New kind of error - - Previously, every backend had a function assigned - - But they might not all work - - Now, only things that are known to be widely safe are added - - Should probably suggest using a pre-defined backend that supports everything - - What can reach here? 
- - `is_image` (all) - - `"pandas"` (using inference wont trigger these) - - `.arrow` (w/o `pyarrow`) - - `.parquet` (w/o either `pyarrow` or `fastparquet`) - """ +def _implementation_not_found(meta: Metadata, /) -> str: + """Search finished without finding a *declared* incompatibility.""" INDENT = " " * 4 record = f",\n{INDENT}".join( f"{k}={v!r}" @@ -107,5 +104,4 @@ def implementation_not_found(meta: Metadata, /) -> NotImplementedError: if not (k.startswith(("is_", "sha", "bytes", "has_"))) or (v is True and k.startswith("is_")) ) - msg = f"Found no implementation that supports:\n{INDENT}{record}" - return NotImplementedError(msg) + return f"Found no implementation that supports:\n{INDENT}{record}" diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index 195607fe5..cacb903f2 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -35,11 +35,7 @@ from altair.datasets import _readimpl from altair.datasets._cache import CsvCache, DatasetCache, SchemaCache, _iter_metadata from altair.datasets._constraints import is_parquet -from altair.datasets._exceptions import ( - AltairDatasetsError, - implementation_not_found, - module_not_found, -) +from altair.datasets._exceptions import AltairDatasetsError, module_not_found from altair.datasets._readimpl import IntoFrameT, is_available if TYPE_CHECKING: @@ -308,9 +304,7 @@ def _solve( if _is_err(fn_or_err): raise fn_or_err.from_tabular(meta, self._name) return fn_or_err - if meta["is_image"]: - raise AltairDatasetsError.from_tabular(meta, self._name) - raise implementation_not_found(meta) + raise AltairDatasetsError.from_tabular(meta, self._name) def _dataset_names( diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 2bef2ed70..81ee5e3f3 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -352,6 +352,33 @@ def test_reader_missing_dependencies() -> None: _import_guarded(backend) # type: ignore +def test_reader_missing_implementation() -> None: + from altair.datasets._constraints import is_csv + from altair.datasets._reader import reader + from altair.datasets._readimpl import read + + def func(*args, **kwds) -> pd.DataFrame: + if TYPE_CHECKING: + return pd.DataFrame() + + name = "pandas" + rd = reader((read(func, is_csv),), name=name) + with pytest.raises( + AltairDatasetsError, + match=re.compile(rf"Unable.+parquet.+native.+{name}", flags=re.DOTALL), + ): + rd.dataset("flights-3m") + with pytest.raises( + AltairDatasetsError, + match=re.compile(r"Found no.+support.+flights.+json", flags=re.DOTALL), + ): + rd.dataset("flights-2k") + with pytest.raises( + AltairDatasetsError, match=re.compile(r"Image data is non-tabular") + ): + rd.dataset("7zip") + + @backends def test_reader_cache( backend: _Backend, monkeypatch: pytest.MonkeyPatch, tmp_path: Path @@ -527,7 +554,7 @@ def test_all_datasets(polars_loader: PolarsLoader, name: Dataset) -> None: rf"Unable to load.+{name}.png.+as tabular data", flags=re.DOTALL | re.IGNORECASE, ) - with pytest.raises((AltairDatasetsError, NotImplementedError), match=pattern): + with pytest.raises(AltairDatasetsError, match=pattern): polars_loader(name) else: frame = polars_loader(name) From 7bb6f9e7a8c98152f17a2ec9c57141cc58171148 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 1 Feb 2025 12:19:58 +0000 Subject: [PATCH 192/201] feat: Adds `Reader.open_markdown` - Will be even more useful after merging https://github.com/vega/vega-datasets/pull/663 - Thinking this is a fair tradeoff vs inlining the 
descriptions into `altair` - All the info is available and it is quicker than manually searching the headings in a browser --- altair/datasets/_reader.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index cacb903f2..ec1f00ba5 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -213,6 +213,36 @@ def url( msg = f"Expected 'str' but got {type(url).__name__!r}\nfrom {url!r}." raise TypeError(msg) + # TODO: (Multiple) + # - Settle on a better name + # - Add method to `Loader` + # - Move docs to `Loader.{new name}` + def open_markdown(self, name: Dataset, /) -> None: + """ + Learn more about a dataset, opening `vega-datasets/datapackage.md`_ with the default browser. + + Additional info *may* include: `description`_, `schema`_, `sources`_, `licenses`_. + + .. _vega-datasets/datapackage.md: + https://github.com/vega/vega-datasets/blob/main/datapackage.md + .. _description: + https://datapackage.org/standard/data-resource/#description + .. _schema: + https://datapackage.org/standard/table-schema/#schema + .. _sources: + https://datapackage.org/standard/data-package/#sources + .. _licenses: + https://datapackage.org/standard/data-package/#licenses + """ + import webbrowser + + from altair.utils import VERSIONS + + ref = self._query(name).get_column("file_name").item(0).replace(".", "") + tag = VERSIONS["vega-datasets"] + url = f"https://github.com/vega/vega-datasets/blob/{tag}/datapackage.md#{ref}" + webbrowser.open(url) + @overload def profile(self, *, show: Literal[False] = ...) -> _SupportProfile: ... From 760eb66a2f96acf5e6a0f4271a163cb066639ae1 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 1 Feb 2025 20:55:02 +0000 Subject: [PATCH 193/201] docs: fix typo Resolves https://github.com/vega/altair/pull/3631#discussion_r1937938282 --- altair/datasets/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 3c61eda0b..efdd85c3c 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -1,7 +1,7 @@ """ Load example datasets *remotely* from `vega-datasets`_. -Provides over **70+** datasets, used throughout our `Example Gallery`_. +Provides **70+** datasets, used throughout our `Example Gallery`_. You can learn more about each dataset at `datapackage.md`_. From cc6d7573fc453c70a01753166a754de5f961a888 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 3 Feb 2025 21:55:42 +0000 Subject: [PATCH 194/201] fix: fix typo in error message https://github.com/vega/altair/pull/3631#discussion_r1938474543 --- altair/datasets/_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index ec1f00ba5..f75a523d4 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -506,7 +506,7 @@ def _into_implementation( } if impl := mapping.get(primary): return impl - msg = f"Package {primary!r} is not supported by `narhwals`." + msg = f"Package {primary!r} is not supported by `narwhals`." 
raise ValueError(msg) From 6c93eb01bbcea3edd9afa69a90efc93f4d5d4364 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 5 Feb 2025 18:27:45 +0000 Subject: [PATCH 195/201] refactor: utilize narwhals fix https://github.com/narwhals-dev/narwhals/pull/1934 --- altair/datasets/_constraints.py | 18 ++---------------- altair/datasets/_reader.py | 3 +-- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/altair/datasets/_constraints.py b/altair/datasets/_constraints.py index fbfd9cbc8..395a9d906 100644 --- a/altair/datasets/_constraints.py +++ b/altair/datasets/_constraints.py @@ -2,8 +2,6 @@ from __future__ import annotations -import functools -import operator from collections.abc import Set from itertools import chain from typing import TYPE_CHECKING, Any @@ -61,23 +59,11 @@ def collect(**kwds: Unpack[Metadata]) -> Metadata: return dict(self) def to_expr(self) -> nw.Expr: - """ - Convert constraint into a narhwals expression. - - Notes - ----- - Workaround for `issue`_ is performing the reduction with ``stdlib`` - - .. _issue: - https://github.com/narwhals-dev/narwhals/issues/1897 - .. _discussion: - https://github.com/vega/altair/pull/3631#discussion_r1934313255 - """ + """Convert constraint into a narwhals expression.""" if not self: msg = f"Unable to convert an empty set to an expression:\n\n{self!r}" raise TypeError(msg) - exprs = (nw.col(name) == val for name, val in self) - return functools.reduce(operator.and_, exprs) + return nw.all_horizontal(nw.col(name) == val for name, val in self) def isdisjoint(self, other: Iterable[Any]) -> bool: return super().isdisjoint(other) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index f75a523d4..8be37d365 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -262,8 +262,7 @@ def profile(self, *, show: bool = False) -> _SupportProfile | None: chain.from_iterable(impl._relevant_columns for impl in self._read) ) frame = self._scan_metadata().select("dataset_name", *relevant_columns) - it = (impl._include_expr for impl in self._read) - inc_expr = nw.any_horizontal(*it) + inc_expr = nw.any_horizontal(impl._include_expr for impl in self._read) result: _SupportProfile = { "unsupported": _dataset_names(frame, ~inc_expr), "supported": _dataset_names(frame, inc_expr), From 790ff10deb678e2d6d6bb7de3627df5b1e66b646 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 5 Feb 2025 18:52:27 +0000 Subject: [PATCH 196/201] refactor: utilize `nw.Implementation.from_backend` See https://github.com/narwhals-dev/narwhals/issues/1888 --- altair/datasets/_reader.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index 8be37d365..8fbaf657d 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -361,9 +361,9 @@ def csv_cache(self) -> CsvCache: @property def _metadata_frame(self) -> nw.LazyFrame: - ns = self._implementation.to_native_namespace() data = cast("dict[str, Any]", self.csv_cache.rotated) - return nw.maybe_convert_dtypes(nw.from_dict(data, native_namespace=ns)).lazy() + impl = self._implementation + return nw.maybe_convert_dtypes(nw.from_dict(data, backend=impl)).lazy() @overload @@ -493,17 +493,8 @@ def _into_implementation( backend: _NwSupport | _PandasAny | Requirement, / ) -> nw.Implementation: primary = _import_guarded(backend) - mapping: Mapping[LiteralString, nw.Implementation] = { - 
"polars": nw.Implementation.POLARS, - "pandas": nw.Implementation.PANDAS, - "pyarrow": nw.Implementation.PYARROW, - "cudf": nw.Implementation.CUDF, - "dask": nw.Implementation.DASK, - "duckdb": nw.Implementation.DUCKDB, - "ibis": nw.Implementation.IBIS, - "pyspark": nw.Implementation.PYSPARK, - } - if impl := mapping.get(primary): + impl = nw.Implementation.from_backend(primary) + if impl is not nw.Implementation.UNKNOWN: return impl msg = f"Package {primary!r} is not supported by `narwhals`." raise ValueError(msg) From 8e538480b7298d7ad9df8d98f987cc0b0352e244 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 5 Feb 2025 19:25:02 +0000 Subject: [PATCH 197/201] feat(typing): utilize `nw.LazyFrame` working `TypeVar` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Possible since https://github.com/narwhals-dev/narwhals/pull/1930 @MarcoGorelli if you're interested what that PR did (besides fix warnings 😉) --- altair/datasets/_cache.py | 2 +- altair/datasets/_loader.py | 5 ++--- altair/datasets/_reader.py | 19 +++++++++++-------- altair/datasets/_readimpl.py | 14 ++++++++------ 4 files changed, 22 insertions(+), 18 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 13dca2f23..eb22cc36e 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -309,7 +309,7 @@ class _SupportsScanMetadata(Protocol): def _scan_metadata( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] - ) -> nw.LazyFrame: ... + ) -> nw.LazyFrame[Any]: ... class DatasetCache: diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 9b55daf70..d1db0fb9d 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -14,7 +14,6 @@ import pandas as pd import polars as pl import pyarrow as pa - from narwhals.stable import v1 as nw from altair.datasets._cache import DatasetCache from altair.datasets._reader import Reader @@ -58,13 +57,13 @@ def from_backend( @classmethod def from_backend( cls, backend_name: Literal["pandas", "pandas[pyarrow]"], / - ) -> Loader[pd.DataFrame, nw.LazyFrame]: ... + ) -> Loader[pd.DataFrame, pd.DataFrame]: ... @overload @classmethod def from_backend( cls, backend_name: Literal["pyarrow"], / - ) -> Loader[pa.Table, nw.LazyFrame]: ... + ) -> Loader[pa.Table, pa.Table]: ... 
@classmethod def from_backend( diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index 8fbaf657d..4f974fef0 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -306,13 +306,13 @@ def _merge_kwds(self, meta: Metadata, kwds: dict[str, Any], /) -> Mapping[str, A return kwds @property - def _metadata_frame(self) -> nw.LazyFrame: + def _metadata_frame(self) -> nw.LazyFrame[IntoFrameT]: fp = self._metadata_path return nw.from_native(self.scan_fn(fp)(fp)).lazy() def _scan_metadata( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] - ) -> nw.LazyFrame: + ) -> nw.LazyFrame[IntoFrameT]: if predicates or constraints: return self._metadata_frame.filter(*predicates, **constraints) return self._metadata_frame @@ -360,7 +360,7 @@ def csv_cache(self) -> CsvCache: return self._csv_cache @property - def _metadata_frame(self) -> nw.LazyFrame: + def _metadata_frame(self) -> nw.LazyFrame[IntoFrameT]: data = cast("dict[str, Any]", self.csv_cache.rotated) impl = self._implementation return nw.maybe_convert_dtypes(nw.from_dict(data, backend=impl)).lazy() @@ -373,7 +373,7 @@ def reader( *, name: str | None = ..., implementation: nw.Implementation = ..., -) -> Reader[IntoDataFrameT, nw.LazyFrame]: ... +) -> Reader[IntoDataFrameT, nw.LazyFrame[IntoDataFrameT]]: ... @overload @@ -392,7 +392,10 @@ def reader( *, name: str | None = None, implementation: nw.Implementation = nw.Implementation.UNKNOWN, -) -> Reader[IntoDataFrameT, IntoFrameT] | Reader[IntoDataFrameT, nw.LazyFrame]: +) -> ( + Reader[IntoDataFrameT, IntoFrameT] + | Reader[IntoDataFrameT, nw.LazyFrame[IntoDataFrameT]] +): name = name or Counter(el._inferred_package for el in read_fns).most_common(1)[0][0] if implementation is nw.Implementation.UNKNOWN: implementation = _into_implementation(Requirement(name)) @@ -429,9 +432,9 @@ def infer_backend( @overload def _from_backend(name: _Polars, /) -> Reader[pl.DataFrame, pl.LazyFrame]: ... @overload -def _from_backend(name: _PandasAny, /) -> Reader[pd.DataFrame, nw.LazyFrame]: ... +def _from_backend(name: _PandasAny, /) -> Reader[pd.DataFrame, pd.DataFrame]: ... @overload -def _from_backend(name: _PyArrow, /) -> Reader[pa.Table, nw.LazyFrame]: ... +def _from_backend(name: _PyArrow, /) -> Reader[pa.Table, pa.Table]: ... 
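# --- [editor's note] --------------------------------------------------------------
# The narwhals helper adopted in PATCH 196, checked in isolation (assumes a
# narwhals release that ships `Implementation.from_backend`).
from narwhals.stable import v1 as nw

assert nw.Implementation.from_backend("polars") is nw.Implementation.POLARS
assert nw.Implementation.from_backend("pandas") is nw.Implementation.PANDAS
# `_into_implementation` treats the `UNKNOWN` fallback as unsupported and raises.
# --- end note ---------------------------------------------------------------------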
# FIXME: The order this is defined in makes splitting the module complicated @@ -512,7 +515,7 @@ def _into_suffix(obj: Path | str, /) -> Any: def _steal_eager_parquet( read_fns: Sequence[Read[IntoDataFrameT]], / -) -> Sequence[Scan[nw.LazyFrame]] | None: +) -> Sequence[Scan[nw.LazyFrame[IntoDataFrameT]]] | None: if convertable := next((rd for rd in read_fns if rd.include <= is_parquet), None): return (_readimpl.into_scan(convertable),) return None diff --git a/altair/datasets/_readimpl.py b/altair/datasets/_readimpl.py index cc4c01e07..1a5840167 100644 --- a/altair/datasets/_readimpl.py +++ b/altair/datasets/_readimpl.py @@ -53,8 +53,8 @@ R = TypeVar("R", bound="nwt.IntoFrame") IntoFrameT = TypeVar( "IntoFrameT", - bound="nwt.NativeFrame | nw.DataFrame[Any] | nw.LazyFrame | nwt.DataFrameLike", - default=nw.LazyFrame, + bound="nwt.NativeFrame | nw.DataFrame[Any] | nw.LazyFrame[Any] | nwt.DataFrameLike", + default=nw.LazyFrame[Any], ) Read = TypeAliasType("Read", "BaseImpl[IntoDataFrameT]", type_params=(IntoDataFrameT,)) """An *eager* file read function.""" @@ -214,15 +214,17 @@ def scan( return BaseImpl(fn, include, exclude, kwds) -def into_scan(impl: Read[IntoDataFrameT], /) -> Scan[nw.LazyFrame]: - def scan_fn(fn: Callable[..., IntoDataFrameT], /) -> Callable[..., nw.LazyFrame]: +def into_scan(impl: Read[IntoDataFrameT], /) -> Scan[nw.LazyFrame[IntoDataFrameT]]: + def scan_fn( + fn: Callable[..., IntoDataFrameT], / + ) -> Callable[..., nw.LazyFrame[IntoDataFrameT]]: @wraps(_unwrap_partial(fn)) - def wrapper(*args: Any, **kwds: Any) -> nw.LazyFrame: + def wrapper(*args: Any, **kwds: Any) -> nw.LazyFrame[IntoDataFrameT]: return nw.from_native(fn(*args, **kwds)).lazy() return wrapper - return BaseImpl(scan_fn(impl.fn), impl.include, impl.exclude, {}) + return scan(scan_fn(impl.fn), impl.include, impl.exclude) def is_available( From 2c3b44dab018a3b9e3386ef21be3ba058a9c8ff6 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 6 Feb 2025 19:06:29 +0000 Subject: [PATCH 198/201] docs: Show less data in examples --- altair/datasets/_loader.py | 58 +++++++++++++------------------------- 1 file changed, 19 insertions(+), 39 deletions(-) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index d1db0fb9d..cc72fb950 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -108,21 +108,15 @@ def from_backend( Using ``pandas``, backed by ``pyarrow`` dtypes:: load = Loader.from_backend("pandas[pyarrow]") - cars = load("cars") + co2 = load("co2") - type(cars) + type(co2) pandas.core.frame.DataFrame - cars.dtypes - Name string[pyarrow] - Miles_per_Gallon double[pyarrow] - Cylinders int64[pyarrow] - Displacement double[pyarrow] - Horsepower int64[pyarrow] - Weight_in_lbs int64[pyarrow] - Acceleration double[pyarrow] - Year timestamp[ns][pyarrow] - Origin string[pyarrow] + co2.dtypes + Date datetime64[ns] + CO2 double[pyarrow] + adjusted CO2 double[pyarrow] dtype: object .. 
_polars defaults: @@ -174,8 +168,8 @@ def __call__( source.columns ['year', 'source', 'net_generation'] - source - shape: (51, 3) + source.head(5) + shape: (5, 3) ┌────────────┬──────────────┬────────────────┐ │ year ┆ source ┆ net_generation │ │ --- ┆ --- ┆ --- │ @@ -186,12 +180,6 @@ def __call__( │ 2003-01-01 ┆ Fossil Fuels ┆ 36234 │ │ 2004-01-01 ┆ Fossil Fuels ┆ 36205 │ │ 2005-01-01 ┆ Fossil Fuels ┆ 36883 │ - │ … ┆ … ┆ … │ - │ 2013-01-01 ┆ Renewables ┆ 16476 │ - │ 2014-01-01 ┆ Renewables ┆ 17452 │ - │ 2015-01-01 ┆ Renewables ┆ 19091 │ - │ 2016-01-01 ┆ Renewables ┆ 21241 │ - │ 2017-01-01 ┆ Renewables ┆ 21933 │ └────────────┴──────────────┴────────────────┘ Using ``pandas``:: @@ -202,21 +190,13 @@ def __call__( source.columns Index(['year', 'source', 'net_generation'], dtype='object') - source - year source net_generation - 0 2001-01-01 Fossil Fuels 35361 - 1 2002-01-01 Fossil Fuels 35991 - 2 2003-01-01 Fossil Fuels 36234 - 3 2004-01-01 Fossil Fuels 36205 - 4 2005-01-01 Fossil Fuels 36883 - .. ... ... ... - 46 2013-01-01 Renewables 16476 - 47 2014-01-01 Renewables 17452 - 48 2015-01-01 Renewables 19091 - 49 2016-01-01 Renewables 21241 - 50 2017-01-01 Renewables 21933 - - [51 rows x 3 columns] + source.head(5) + year source net_generation + 0 2001-01-01 Fossil Fuels 35361 + 1 2002-01-01 Fossil Fuels 35991 + 2 2003-01-01 Fossil Fuels 36234 + 3 2004-01-01 Fossil Fuels 36205 + 4 2005-01-01 Fossil Fuels 36883 Using ``pyarrow``:: @@ -226,15 +206,15 @@ def __call__( source.column_names ['year', 'source', 'net_generation'] - source + source.slice(0, 5) pyarrow.Table year: date32[day] source: string net_generation: int64 ---- - year: [[2001-01-01,2002-01-01,2003-01-01,2004-01-01,2005-01-01,...,2013-01-01,2014-01-01,2015-01-01,2016-01-01,2017-01-01]] - source: [["Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels",...,"Renewables","Renewables","Renewables","Renewables","Renewables"]] - net_generation: [[35361,35991,36234,36205,36883,...,16476,17452,19091,21241,21933]] + year: [[2001-01-01,2002-01-01,2003-01-01,2004-01-01,2005-01-01]] + source: [["Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels"]] + net_generation: [[35361,35991,36234,36205,36883]] .. 
_Path.stem: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem From 51a967aef63e6c934b1b227cc9214776c3a5c699 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 7 Feb 2025 17:13:59 +0000 Subject: [PATCH 199/201] feat: Update for `vega-datasets@3.0.0-alpha.1` Made possible via https://github.com/vega/vega-datasets/pull/681 - Removes temp files - Removes some outdated apis - Remove test based on removed `"points"` dataset --- altair/datasets/_metadata/metadata.csv.gz | Bin 3632 -> 3595 bytes altair/datasets/_metadata/metadata.parquet | Bin 9208 -> 9174 bytes altair/datasets/_metadata/schemas.json.gz | Bin 2471 -> 2461 bytes altair/datasets/_typing.py | 3 +- altair/utils/schemapi.py | 2 +- pyproject.toml | 6 +- tests/test_datasets.py | 1 - tools/datasets/__init__.py | 61 +++++---------------- tools/datasets/_metadata/datapackage.json | 1 - tools/datasets/models.py | 8 --- tools/datasets/npm.py | 40 ++++++-------- tools/generate_schema_wrapper.py | 2 +- 12 files changed, 37 insertions(+), 87 deletions(-) delete mode 100644 tools/datasets/_metadata/datapackage.json diff --git a/altair/datasets/_metadata/metadata.csv.gz b/altair/datasets/_metadata/metadata.csv.gz index 30793abc86eee5f4255edba76dd1d9b739e8d66e..50fef1d82d1f766438ec99a64c1849996887fdee 100644 GIT binary patch literal 3595 zcmV+m4)pOKiwFn+00002|7~S-VPs)+VJ>5Hb^xtf$(9?r5xm#8G$rhdPk#2*N8j}l z$V4|sY!=N$vi$XqBCEJ`Yc!=-eUSikYbFB7B?1MeBOT=UWlOK}bU3xP{_j&;Z{)I2 zejMfSv_5=UztXb|gCmvGhW5koS^D!zzdQeZ*wJxKn_>9Jq2E5e5Z3C8ywcO*g`Q6P zO&9;S-aUQVZJ(cp(RpBV$WO=pNuC;Q4uAD$@F{SpN~IJlyOg|AIaeJWIa7)bF%pZa zPcO&g?(nzIpDS-a{c&K~tiSC)ZRPlRyLDPBh0Rx_*stA z{+}n8vCZ20*|Fn`PKf(rQXAr=c~>c?DgpV-#;9gOCT@Bu`WwIz`QZWjVT}5uVxAp5 zlvNhvTF8lP!EV=JW3@_Z_7X)$Zc$qJ5a@>q`~JAzUZD?@vFt=Qd(SyhGE|5-av|4p zW$d^(WeRvS6(uTG;uZtH!s{ySrwt#5<8CYEvYB0F*bjk>vQJ7=%TmQ~usNn|5b&jh z=CJ#`5cL&aS9`On{kF>XxJNSv;B$VyTfSs%@-EKK*^;96wXmVYs%o2+Q50?C;@DW- zQejjhOG~-@3@Uh?Fkj2{K&JU+n$bdM7f_{8_kAw0X;RrmkElX1MPel56iL#`1c_!b zk$Z?;-LGGTCiv&cm<2k`&jP~@f~N*;l-Fc*6KkBYkzDW;4ErWZm!;`@2e0GxfeuI6 zkCp#A9U~|?H9LX^)gzn&Du0TqK*MFuLLt`dnq?%46cnPR3g`i1$NTlSHDu;xv&Y$3 z^P<0`oGy7V7b0$^v?p7FW#8rhoX{;8qcnh;@t^e;u_79`;K*wOx zeBG}?hP=xzIf`B~ClT+-7|4jEI7F17TuohC|0j4|gT|n{ zFtmsHC@xo=cVymk`8xSB_?s}hKO6W0qlybq7A*u<1eI1hEG6rbH}L%tStK7uwU zMDBQGKRf%gF`DaeOA(`iSwk6+kW>*{Q_?5}C}FmK*onMhVP8>jY5$)S49{)tx}>ek zRv9z9dFSc`hmsU&l(g(ohsD=WYIQiBnl-dwMPc|LIyb0SM*i)&`0-pE`uP0tO3vp7 z;U~6G8d|nV#lusmk{ep0O)8iiefl6G{>{0J_cIqY^=9$B7QB!1k3|8v)o3YMOGfTj z+mc1@qm-e9Wr?sx1cUuSZ0>+|t?!S8Hk;{G#_1$XRD?5~oh%BLC`Hi7vE--&L1AS< zOJ`rQF0LT-$S<|j4;Z^=+P$vPWA-D1>-;i}-l*C7Fx6t3YS2j;QEsvo`05tntyXir3x%Ywr0>^$tz>gH+_M9 zQqJZekTRa?hjDw4w|ic{j`zmH=sZY9>-qYzHszIPjz)?%-lrB?)P-TdZ3vc|Yo;Z$ z^u}pI2f)XdQyCB9Jf7~1{cl{84F+8W#~f0u)+8a*%CP_F>A~q(DGa<8gP!np&zm0k zu{z4I-Dm8#g^T0EX3Z~7njG-Kb@+-PLBok$ITy4Nh z?ds(myw$VM!9YFP)QsgwK8m$DH7B`g86*kLpbQb!mInXNfOdcMWilbda1^)5IXC;# zGGw-CxGA#=h84nzO-ex$B40T;SB~0;BujnA6<)WY)qdSiAAP3Tc>?qN8=A?9K#4<0m*rLjN4@1N${UGPLy(Qtj(1q*|g zRnaP_WC|mJGNJW&A}%_82&YG&;M9LT?Y=zY=yp@$+%H|SXa03_bOSFS{C@LGtG&HV6!1wpM3nRU6J| zX*qj?vU}~fw6E7%Hcc);HxrSR|Z0sOby zE!<@??!C=9r`i5OY|_9xeCTj!b`>QUZX^1vlfwa^nCn{oV%K_yu-o?gSI8>EBl_!n zx;li6n0-BEm9;8JHG`|lt-`h{cqX>wo5?i~rpcM5*SC)lyS~WbEThhV$Jy8_2H81R ze=roF$-=6D#c+FLl@TbgWL#135$GF4b*cWi$Jg~Qn>yjf@G|nz&c5cz#d5ZS{*2&<`t)p5{BPU~89C@sQ( zN@|UQhcHxVc1UDISY7Ic{se8e>o@D;x*|DhKYW%fr{UY`^{{^a(=qCl3A1lK9T8F^ zH?rB(nw?81t_>ocSo@;1vr#vfm*&c^0E6r9v>CsCkMyR6>ES_`&nP6-*rLzH7_A;! 
zhr@vxU9C1IlGaTboNHUm=mFy$i7$v#=q%YEzPz0FuvWC4-r@YkFHK+h=Q|c#h(nN; zwJzD#;Peoax*3IXuTV8oLD07RVPyXlcXtpv9yhZ3j=x`qJGz0)o5k~*OGeMXQ&G(( z$j8D}L#tFTo?&8|wI#FTSWqBEiC@a_CqUmaTD|nA{pQE&(zFggCHUrd&c9;yaNj$d zFnjOd@ww(WypU8uw}AL9AtP5?M2je>L0mqT@K1nuhaC+2NOKroZp7?w+(arVCdF1_ zH7dfU!PXd@K8Hxz4E+MRS|RP0a+%OwAP--=?s1>zm(bHbhx8jyXkcIz+8HZvfK@Y? zc%L(cnzO=xjZiMzhrqvr><-7iex1VWJf7a{x%oTqVN z_TEQ?-DxYYyG{S&73Tlx(-B*Kzj9u%F53B}i#1i!xef)daka&nsB+2y=?!xb*(RUT z;@2B*Q9ck^QQh~4!)m?d)8YO_)~|nQdaCHrQD#dCj$wEtQ8TQjqMX$5fDOz88Ujd6g)rQVonLJQzZxc+$z+B7#Cfz3{%9Qo5Nya z4;Z`C@C4y?B5h-X6sb?{{`B&m8W?#g|+O!O}$WCBMovSDf#TrMDL9%A>PGVj%;HqiX> zHxx>gxCSI(jlwZ)(8?q%h7k*`o%R|e-Te_Wid`;wl23Wc2^ zi=w2g#9BENLP+J7?eM6e>fi)_Ccl`;JD}b77tAIi!+3gpmu9=R%Hl$C7IM(wFl+^v z$XuJ`-1z(%iBSWV=E4nK_Z>~+o77`JJ(ay%7$XTqa{k9t;gCX6g@U#;+lC+zF^Y6{TFvbiq$~2lu%K5&ZEJQ_Rg(}@*{e{= z@bCanmOkcs1G4+ydRKd%Z1Oa_7!IYAO~2uvEW;g&HboS4J9IDG&B literal 3632 zcmV-04$tu)iwFn+00002|7~S-VPs)+VJ>5Hb^xtf$&%bSmb~*@GE3N(K8;OZ`#ATq z2fT+&mLh}Eu}H1{`f)~(v1CeBR8c0gOaKv*+`+fFgJ3$+L5{y~=}n#vr`G!aJhgr! zmxc1}D2J#1@O%G8FES2}R8AY(kHZ%k&MU+2{QIz@V^5oL_{U+`KD`pw>Z`ob)8Un# zPWw$4|J(1Le%ozdp2pF6U~|Y%$Nfp38f^}L^)vVsI8>!lij`eTUa6d`j*gruMTZ!P zMb)R*<8gQR$EQz~x4->yVA=Fv_P=fA`03^K(-(Q6^N(k~kU#v-8}-{yMwv7bK-DisRsb)%?!a@97{Y_*i^ytP&t?`^W_?}NTa z*U@%+qRsPeFLm7Q^RswZyDXUy%$3BS;=c@|&JwLtp|+}0(}9_S*Iru^t+f^A2y%V} ziw_{d1C#)vVZ;%&dcW?T9f3cvMOM03pZ6$2wM9<&XF%M8BsCCHR$iK zb%pjD`8L76OfKW(@SiLMow$0%V!X;Fh|aM}_SLwWQ_G4oYbx5N714f&tSkMV566A~ zS*Do3TR6|PG1i*AIPP^xMQ2rWsme%;rIse160)ELjbud(*%sEoe?-`o6TirD-u3h3 zGPYS;zc^TY(FrkLOlo6bH18_qR3#v7*%;MK$iz*r0Di+Zfj&H-K1>mwRLqN`g|fYf&F&*!yF@y`ml`Q(=j2@lJE1WT+5vr#)86WwW@BupfgK<(!nJmZgf}U~^2_Ah=5j&0){^ z?*qOf>*`>h>#(h|J?_z3f%Tk*AC@o4nY@dOQ?sO~eJyM#v8vi;WfVo*xHvXew^SJQ z#?lHJ-$MyXC(L`fK8AUInPxP^#nn+M)N`LpY?@Sd(IbpdDUk%pI7O1QGC`tQ!{HuX zSNHv!(2V>%nIc}NbVLA2DMjq;j|ZeooyHj)dTf??l8>9QhA?~rxeALwwD{Z!Gf z(NI|Jtp<*7;b-eGt^bm-bEuLmmHFrTT z-EV7IQL8?-f=u@kP-3hk3Kdcj8VL0zqqF7}uiUeVYj7j8K0J@7oKyjddT~-(gHTp2MbDBV zMNX|c4XrlnE(?{IQwe1S+$*vR;AfVdY+1JTRt!IWa1D2*7B?tWioi}V5kakMS7AJ& zEf!ThvaR^MF}b{w%14;s)xW$R56@bCz7o2wOyW1AGB2hQp|b2)Ta<&jlhp7W;5b}B zsmaC@r(&beyoSmnz$*u%K9CrfrY@BbTFWJaiw54pj9RSuW)qEN)3a_aRT6j(!w?bX z7tpvuyyNka#^>2R!HR^%-PS6|7=LRtvSSn<@grthtnvzDwqe$8ME1ttU0`?8nl=lBV!JFLHu1ngwY?U#K zTXC*V@Fn5MJBP!ZN~abhUAmlVMu$C9HC1eKHp4Vrz) zy10T^BfrA#KEUfmMe{-FGQ58 zSnS`&{)p}d{`pmCb9{a7w{>`vOEmr9!}Dh;sO3k7RI0!dWNStpmAo<*t{9*+jLAjj?RN*v|cX%YExc$=4hmN<9%wOMO_%?*oI)axn^22D^7l9 zV8D8OJ(X$u&*S-s*8jvw)o7koaLggWy_+OtS{Zj89XdE2D}|BL8oV>K?qxF|7oLwY zZt@xSZQYK04a2d|q0{+z5+)LatR z#JD(d1u|5b#&DGiT|g`f4Qv`)n-Mi`G?gBOb4A&_gVy!KS9+F>RM`32AHQ8T{{8A@ z_r2ANci%t(+0=~XNInX{r{*L#Eu*%;&X+NS+KNE`9>(sEq0FXX98R(nspS@*7KY3= zjfYiM!BIl|ut_OMLX;~9YspdjkYt5VxFYK|^t|tf`Bln1J5OM2e?s>-fpcfbg~SH3 zr$uZzoS6v;UI6~4No18)oSocb>o&udhwc9k`{DHRI`8-6;?!@ki{?*v`;HrDHEy!^ zt#K2tgQW5yd$vGp67rO#6t}3V>Ra>dV z9`ZB!TqJ;oG{G7QY4dBzJVKn<{NJg_zxu5_vbnBKmJ`(_E`2t-7!7ZR4Y2T1Srx5< zN~SQ9B@hS~`yyL1*}U+Wr252ezA&)gc{5Nv;!2Ypk6s2 z>uM4=eBE1rkUf>nHKc9}v$D3)EPrQ8si+1cmR*g`AjfkP8w9p3TdT9Ps*O)(MZtOp zuzMS}w6E7{HBT?Iu>6=h!zwp&DUw6AtthOip;i+qA}qp7>);Hx70Bvb%Ft-)W4uILhc$eL_KeE9MH z3*#K0)Kdv{hqFGdOv< zUzBz>>gIA@!|KOaL34N7OfR}8X7j>)vmh)d6Ow9hlR;vPR*zlGA+n6FRvQ!X@R~9R zz_y0W1H3!@zay6*?__`Y{q?kmaii^gspc=fG{4JVZi;Ln4nbPhx@22}(_`M~W)#|w zLOn|biPiG2;Q2$y-9hMh+{p7+{C*uz_C_A>7SC%g8NK{qMm3utSPNGTom9PehEHnN zmduW0LDd!|eg(PjV0=&J`E@w$H{YHw{qyi$ZtuQx{)*M(vG8K9?7a(`xaK(CWmG}8 zfPgI_BQIM-%POdjTz(nX?_ll@JNW;J;W)e;rrDo3x>Qn3imk+IRD=nHb251593o{i z_AlgRg`!+RV+M9XJHGb1hkTx2Vojqir=PeK1CON8Sy_1ln3_St`y4By^l 
z3;0KD^!>_t!MbRd7bn(K$38!nw8qsIXQIjw1ju5TkjOUql-AtLcn|P_!!yw8cz+z4bc2*?^OWrE9ffyHEungbDV4A}kP7m<9(|FO~ZN`3{%xPon z^+wZQ9#XZ2&6f}nc%d2AQj&*SGpJbwWvXe4L#4#`^TIbg5V|x#frE zDP~{9$J(@vhs{plb)BoI0L2 zELUq_WW4gJrktHN)vAWJVJM=1IMO94wZ@u#A7uAs*g${{6O(B+-$=Z@ieGR}g{P-I z+FdKvx4^~11)$-1AE_{+jfj;rUQvth(RB|Z@m?Hi1EC&<W@w-Zoc#2{CBTWP)bT zP`(L)%8fvunsnQN1r8=qe)F;acSwSgP3 z?kk$X7pcc#zRG&HFog~(;PTg|!XbsC3I%OxJVW8}$c-}?d1X=fbwX?ND^Awu86I!W$%^|}@9=g1?RRzH z*~iYai$PF2+4K`u$1*&eXj4Q%Y@_yP-!w8ID&>jUm8i5TE5P5w>;4}(t~#USHUI$4 CB{VSr diff --git a/altair/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet index 293c93a975929f24b4ccd533d4cb475df654bf9a..840a44a53e4ea0947667311432f299cf492ddf9b 100644 GIT binary patch delta 5472 zcmb_gc|6qLyZ?+~j4>F?j2XtRiLtMhgrw}d%Ffu0eQ86+64}F)6xkw6WKEJ5LzZMo ziv=z)=bZDr&w1WwQS*hSfYE*v86+*T6oGsP z)k7wMkN^OP+?4umj{+;O0xh{VDFT8Proa)v2!Ki0XG46YAuS3_A6F{+^PLz^zF#?2 z(sbcU6^Xmie{D|c+Q=DP>ATbJGgj{^jXS5>Lyulbff4y;)Li_g^2I~7$L{RJGbq>62PsUFsI80$(kTD62weI zP0RinVi$jAr&cfABNhIR*#)uxI!%*e?72iQ9yvVDu=VA7Eng@x=4!(f#rVFwPBNkS zLMFLco%2+<@kN=p7JIIWkJGl=?HSH1XO-kj(lYb5U~6ExFWmi+*|70Qk(zRq_|e=O zt$k1sG1O1`-O$&dovY=(JfdvKtKpAG%M&js%d&AY>j$eB%#+gdibUvH&*+`;Wcg*V<15fTUU)iH@g zror?!QroPKleA16K=hEU0Dr)%q;Y*YE&pI)4+HfBpnZU~`^0=fgem8pU}74RHT*DO z4wwK~5-81sw`*Z&fAx|RyZpWELM7{dhH8xXN4s?gOM@Z`{IqD*(wj_b-7rb?Pn4MY z@BiuuNw&yru^l`U^PD_jV)1CQ0NxfeOXPh4*4;?w5kF~b5<08WzvB#Pf(h!uUsw(=U;&YJT` zt?%pLlF_tgDSNeCvkGXv+pgnA8dqn|@BHiR8l4={WpX9ll){9QH6Ajn z^(HGbHbszkVgGcxg9)|Kn5b0oJ%Yp=4Hwb@^H9ugpx`w<4cE(*4Q``gAO);*?@;;7BqN>a4>K*fD?0RhK&EEzKD}Z}YlO(Ak)i{TB+LdP)d>5xOg738+Nl<%jOus6!5c;F*3W%QciL|5nK} zxVU>d+j$eboPSn*7#dXv$AytG^aAKG2IK(vw;ERi?VtxV$R_kwAa(K$dNHs?GeCYz zF9_-(gP>>`%_4*)-#ZpWJxg-VLPrH(e7?T@bksR78j4y*2ko}3wg^4JjsA~>rsjx6 z(*B|)ox`a>=YXI2#W14g>2NjVuiRV=)PVd**$saC!O#a0;lL)pK>$dN$_6S?0n82n zfq+98r!})@cJ9q_$o`F)#j|}sk0^q_;~*_<$C;|0&P!!Yx&6bJSftynNN#t zhteV+$RO*`zm=Ii;{|CTsA>_#Tu}fPCZQ8cJN%mnu$P`7BK60WV4{po<4~O+%e}=D zN;{W3HawI+#>kUiB}W`hy5e0Jb-^!HAWxNpu8(eB<%zVxT^TD^-<*2VFtJ;=J&BPp zn-rO68UVQk?N@!@=O*ZMZm!UKS?69())jQ6A6gXjno)wHvuI^##8{wddSFc!j}i(O z`Vt(})pRDuJhR1-H_vOfl9nDyjTxNZ>lhF^Hh*(`Jjdbe!ivgNfbKlM3Jj}~#vvDy z=$fAw;t{=+K89lIN`K_MzB|5A!~>h9q?LSO(l3s<))73x-q4;>RY%dP!e8N8O1ExG z>$*5hkn?P4s+*dm(EI3LmrvJ%woTpTzT;ep6|e4GrsYnNHt-jjYddvezLpZD>NtWM z+?TYAkD*I`b*ehSA?~J?*zFfhW28%s3T-2{yg1w0x1S$o7wSWVx^W@HA9XJ_v*q?M z)pRg~H+yDt*L%g2FGkdVz%08*l{^Z>w-1{==6<P_i?Ex=6f4%PA3)n0%$GbJfu428fXN*bdlb6Ig zdMd8@vP(V>4G|m7R-Qm*yEh#zAAZIi7Vy- z+8o71p-U6dp5~A7xsR1Pn*v)#K-U=lB%@Riw`(5GqFgf|su`C)i@$25|d@e_6J zy=zd1gvLksnJL=#`-KsDwY(+K_~-kwR2_B32xGVOqFG8f^304`K4d>sJZ4>7!WaL@ zHEMSfr)u{++dv`~zi7B}d1wY*n7*3n2~O=@<=Cskd7Riqx16SQ-s^BRLVx@VPXeMU zh4p~8)3~hI{L0maHQT|)T}&3z1D2<vD%nilrKw~kxZZ?0HHP6f|?Tw6GJ266E+MbkL>!+qsy}A= zCO1w0DC*KPMu|xBGh32O$jaGj-q(9-KLwUsVvb8vbYtEW>wYsR5`co#a}aDAb*u>W zWWI`vO8oaW?lM+T7@(G{ml3oF03iV8F)r>HV22JC6ti0!+*_rgmTbZafRdkr;8Zu@ zQfu@c!O3y~J3;DMztwM(dstomXVZY)Y8t=+ZiJs#L7P0Pmm60hp-g_sYEGi|#XVNY zXNQ>YNxQYq~1bINdMrdPH}#obUkRFv8R!j#ShQ^5PWqf4I1- zx+`OM2bz?|M~lz`0$85vvf?OE^y1aQL7k8GwBaw+c`uHMATRnz-{GJ&dG}?Vx=vb9 z{0b6(?Xx`tWN7Ez=$pMYAD#S*?2jM&zBsbf7QJGdpvY)cVY%hTAx3yYoV#n$pb^vm=v)V zze~EZbpV3p+-@Rv?!h*Zz&zIaNa+Eq1|A|YH(qU^**&kfy%PZ4_#%FHXpiI z1p^7r2=Cwv-?OgYJ~$<%AsovV3zv=xeB!;DpV0D=WiQ+KdhYwIN8je{Po6GHF%LIx z!IY$2!yE^P38&g4Z-E5e%s9?KxgyrrpNB$7GKW}=TEN+=ANQ0O(7uWX{KfWo9@2J` zyi41}HdxBmQccJNHEnv+Mz_fJDdlLDxTkf!vc^vv?$c9!KA_j>PdOYtR_7VbKF_Q< zM|y%>5<9`BT|=*5dH{CVQ$;$wXI%46@D#itzGX-Aep58p%HD;$r@D;9vREu{ 
z(5Zip8SW0xiqhI$moEyQ)!gr`JY9yirsLd3q0J7L+GFaFjZhdlZ(lE|Kuu0kT2fkq;OXl|kd%?y7?uih_jUAfat@Gmq{_`h8#wiUh1*31O8R=cQt7|( zQt{u^@ox=3_?W-vPwu|7Lw#exjq7Z5WB;cd`Tmk)bu=dd1o|-@>A$%CT!zi*V9-DE z`dfUm9fq3}!+@Lyp{NUh!J<~-)OUklb}a@OLG#P5;72ZRj;9GBiNA(>gtyEj+?LtA zbIV*6*)r?#TjsRbmKk%18M(aqmA6Nb%Vc`4FAT{b3>GthTZWNWxwy8!O;ArNXCxZ+ zzy_%Z;If$iIavSg0|e^E27cZKDGT80xc|ZbLCvvjY%zRWagp)#Ok${(jUYv~$U^|G zMDTBNi<|Uz!~L5rQU$<;31c^!}ThS zN_M60z-0oGde7rtT5bdU|8rXNe??RPKoxD|_R;})j|Mj6<;&~Q-HvQhOQFDo7N L8v_7rJgfc}7rzt& delta 5537 zcmb_gc|4SB|9&3(ZY<9%vdmbvVuY1I3*$88?$4UWWfUd)! zd6s=p1_%KFAk`%Q`5>B}zylbH;xaL!&bEL9UwIOK=pvDQn)&nvZF_!JzRlhfe{=G?xd=$R`@FQ*p!aavDm6dft7Wm+#zvNG!Z&G&7JF>_sXd>hEQ-)XNQ#i%|^S4;Wv8(~JK zijtjsPK>x-$P7^<>2#Ql zY)qTnnw+V31yC#$5I)2UPhqD{(z!F4LkBldM)YWsq&m2XqK6Vojjc+JGn7nhO3{}5 zCQ0HZlu~gM8D>70*(RzZK|Zfp%S)z5aq{INnp_82jWHX7&LJ_?FDJj+9W7k+{c`v2WkP%Uavov z#aJU+!@YAjojDuL4t!t{aHW=E;IwW6 z!Rj{oW)k0J-3w=(dV`H-7n5J4wiyXK;J16c$yGL?q~P&0Pk|7Y4@;7WQ4i9Qh-s z^uxA$i61R+h#AN5`oeJSMVm=M(#z}P0z+{x_2o#lvU%oE$rwl5_?oqdHq2*{z_CyQ6Cs3o?+0$xtl{2MJdhT+QF;Vl*c(GT#~6J^ENO!Tg7bN z{{2}F-Wh|0%G0~KDwDQhdqgsYb(zCXF84gN-9`}!ZE$NEjK+L2;%7NeG24#cgV7#rDrapw-H5Bc@woA>3(12?=HCkG43m!J%`ER8?-tkZ1;Faz|zV9>D5 zEn(W+1OVf5n48EnH>`m>@6R!l{C82+NY?un-Ts};yM-JSwkj55!TzUWu}#Gc-%6Iz z?RYSO_?Oyq?C?Y;>;ko(MTkihQrcJv6j+fko+My%B>(|%H$YuwK?%_c`~ch+J0rGH z-XMqsY1(LC{{e%xz*b1>*Kn&}NIoDHBM9Hb7E1gItG@*cL~S&C&XO(aUn|aMHIC>X+S$Wq@)Op=sn# z(=dfySpkq~vO#dwmC=inkf~JzL0x#Zhe2Qf7zjRX#cYFH$66*QbWvt*XluQ{)RkVO zt>mIvpmd^`-a6E9aH9U+tHwG-b4C642_@54u1`t|^j4^nJYI5e+1NY3IxanS1d}p( z#3I;xXc~@L^;{D$BhS~b-#5ecA?{z~6I0<5lQ08%`;<@oijnpLZ$A|CF5vW~$GZ(m9pXwx? z;twm(!bvp7YLi0hau%PfU0arTloaIYJ0$<8-OfeGX8uj&lw_oTpK!NPbk_}vTYT-? z6q~Go!JQ(z9+h;UtXt+qJQa|H)Gljg)&^c}bIU=}@z*C}RMoBE_DB=EvWsFORG){# zpR6=}<5Q0!RfS!br>niPnfO>i_4ZSpy z&t0iY9MU8^`T6ZF5pqf6%l4FsT_tx`Bv(cyJ#0@`KN>ozn!g%D86K;AG_zMCX3>Pv zeo`bvaHi{y3#Nu6j+xZnF3nKM-4-MwJlH$(?8*+=!lVZ&!+KT*877$D9nPWd4Z z#ibi?eeBfr>4a-2qW$2F9lOTldPXDM9mw0PXYRK{oui@UsU~T2sAq*;L_*aml@GoK zVd2_rBtC5-0VOcP@84NAo*vM%Mt$~ZW|!aci^j7(cKPqmTexA6S}W$ZN+&Wsnq^;2 z!OpcKO3L_rKlLmY?{O=Sema#T&pmLl^xW_ zu`+Q|w)g2jdDfc=*;l@}QL^ZWN&B-X9{mNk2&u{x4}M3pM(^rQ{P zojFa6ZZG2^I6aWnr)q>Vi;;Z~bzAV}UAdLK^Ygk??X}UPZ_M=Im+dB>No7t8{C+0l zLICT8y@M~@Tsra7(ly6J%qiMm!X14%VnONvcPeQonZ^08V*99?}$O+30n<~~^zAlBZm>zy;S4zMo2a0K&o ze|F~`z3v4htKihpnhqOA`}&go@;jB2cc>!?%{7Ph^&B$bHG5az3HQR`-0rv_8HBzkG09(R+)W!b`36H zlCbD4iH*w@ned$D&SKwHfjGb@hy!( zdh>D&3#eNJ)K*P1GpE69uu! 
za#UwtXHtee)X)CY8h?}j5gZ6T?}oAIMdHq6vwP!Wqd2?!`Es;k z@fCHB!#=H?ncI7><%J0MIFfqn=F!JARk#)37+t%TJ=F31ty=mGf`)0&6Vxra56Ae` zR!OwZ;G#oO2F~|K#wE|GEJyEIHmSu|E4}&@jLc$YsUP2W`{iRQR$vs{J!EOo_B4-A zkV!b-P{bV_ZA!++wsP}&)$OLEeED-W=794_4tvUV?K5fvNUq-NYIP*%d&ku52aV-x= zmaF3E^s8-{A}U{&T4xof@S`(8tO?>(@BYfe5$l0s#p@y(3l;PJx8qcqqN2--_qoB` z;*(>}nGvJJVXRh=s%W4CoXi3vPk$jD5!r*xi@cp&%Bh1SNqyk#F1_}o{w*Th?0rv5GUgC7{i6S~BBI zrrY8NlfDz; z;;)Pr%9xA`C6zGqw%!_vIZT*~-Fq|rfxWlzWpQPGgW~r=E*%n8okALA;&o!7YM~(8 z@Qs)A4FVjLxrXksHI=z*$Pegv1!2jUm_Go)wExZ@(1!$lJ0Mb=;FH|gmAxB>v;V6=fnN)}Xo0|k;MZns|8>@1E#GVg3i(wC z>7Vkq_7v?*PV>wtL3kD`6h*gTpJ{OpCsYo8i)T{|z^5551@UftqKffG9!K1U8GEdPhKD>pcg8 z1YaOPNO+Yiqyk{uxdgxaxBvA510L@|!_T-viU4+T+YkIN{2%wmD0BSiH);eE*LHZ} z21gbSG6b+6MFpY%=Eyp>ZR5JMkv@)tv;pjpxFCgF=O_uu&v$7`!D_oesu%Y`4x3(hag0iXNR=^&J4kaHxg&RCQ?j)4C3Ezs*XG1 z&pl}a5bp-jc6jVVr!D#auSKodOdqZ@^-P?--PnCgoV{&kE|)Y(Z+lM~TB0OwC{jaG zalAYE?*%EzmH_HDeQ5V74nP7R2m&8qe>-8qG}<_O^7rK1iC~VwhZF0JQ02+rk`u*h zKBZ*B?WYv$n8S;7tDHgWBy+;Gd0492ddp3qYE@$LE6|y;nSX}=f?*Hb2yUn1{_X!2 z%JH%j^am2KWk&LS*uEEB=2NxL3?$-{>^Uu`c z3YXllC!XBsoXhEgk43>Vmt54|Ux_Y~oYkyEQht%5tXy){dpY-i$HL)*6pAOe4L8hb zGu?e_L@8AAKz}dku40yZrA1IMWquYim!jawgZRqlSPjp}wLFTNcPBy4-DvBQTUT-H z;=s7)4TNHh^Qaw@$zKo)bn`s9zIaIBa8F8y2tF(b*VN`EcU}1C)?_@{fD1}(_Q^(f z;Wn1nEy;!@H1PWsw%rbk;vz;1B2q3&uCjfSQ44ww-G6e~8qV6-mYx&Ql7$x-RnxQR zrbiHFTd;cG##^;Ww1Mj_76=a(+rEOF|(w zpA(zUiOuK4=5u88<3Gs?a)xtqhI4X;6*zTS+cDEai?LxrLf59`XamAB-D)4=cCkEtn%u#e%wiZnC5pic}KxT{}0{of7q$}raY*B zyM!Sjk%yzT+d@T&W~3ZmyCrfeUGvm&R!_-n$Mz9&SPk8}A^FO$CydvSAeuT~wzlG;9vOujqD zMNt4Ci6{@__w=4;Tr|#3UH531vVdi5S{PlY7pB?U??C#eM&thJ|=!bGSIq zP5pNq`^b@x0??94v%YmUz1@K)E>Qq?F1=PB%I7V7+@Ls%n-uv*=*a96edfp)GQ@A4 z@pnUz&)GQ=Q&FEZNV~l%~XoAwPYkENdahZ@Y4ifH9zhj0&%Cn?TXv0WGV1OZ%4~i98-Lum4gM<{Pam!`-5IZQiFf;EuXCwq&S%$lFTE!XSE3}w6saL8 zuQwV0_kxtPD*@ESeQ5I(2Ot3u1c49Gemh~pG}<_O^7rK1iC~VwA1Brsp~{oLB`1p2 zd`iiL+fOOhF^3oFRyl*#N#=xW^RQI4^_H7J)vCngSD-UxGk*>L3x+*#Bez?Kpg)j+Ei;nu!}h)4GM}n_W*`xtWY3vFQmvSrEeIAS_y2w3o!2fp7bd?EUjZjG ztZ>NR{gvn<$yv=xB;^+=%E~2Iy_a+U?^rneA%)_}ZNm+7 z+Dv!f8c_|#s<2-7|Wbzlp0^K}Mt}h-EINX!cA%YJJ!Zo#d$z2ydx-}V3HsFF%n|-p; zUAT?qbxX2g2@U*yg>ARPqPU3Bf{2uhlB;Z=WYmJ5Lw~niwuZAdwx#Dpv}EB0M%DBz zy6F*w*%qvxxA9gj5^dmmi^ajcBMHm(hciNXM<_oN%6mfjKq%*gvLcin*?e|-dP*ouLOCOpcZBjYp}Z%Q4}@|~ zC@VrClS={`e@kNX*(ollA(eE+wEO!HT8Im6c*bmu^~_|8;ls8yPN*RAl4TC?141@; zT!~Cb85WIz!Gklrak}1TnDZNtQ9l`0r%+vm@`5KSK5^Arbk>ZS?w}Mc+z82S3Pn8q z#3aBXTnzVMYtD^D^ebT-$ueGZ<))_nLmSH*y(1O(e>3+5KNTvKTTAN$mmWw6jZQjv zegGgHvcTulXNyd$42(I*_;!M^%jWiMDx2kBTI-;qPdTp#EGc@hl>#UnPWq?7t>6#< zvInapxCy`|{W!_(k>Xnj`&4Zv^Czw7AQhb^L&%%WQ?Br|-G z^M*qVt8BPV8!!{v?U?DI#n`YQp=(oev;kq6ZnY0_yI7t+O>W|N6aeFFPu=dAA_i$3 z>0!Ht1ao*M5!H~dd);r>Gg1z(-4Z#Ku6gP>tEXhPWBUj>tcGsgkbLFW6UJ*u5KSGb*`nf1x@sX5 zB=+ukM~4Ro{#bsucbbzM-L&%Hn$cpzOX~519o?LOX1Ay@ne@d9F9pqrGFv!er6!mo zQ(V<LDtRWnnam;x2{8bOcSte)yudT}6REv#|um0e@uLqk%`wI6XERAc4pr zBB6us-WgccZQa0q2p=x#HS|wT6tIrszvC)8Ubt47SEXFKsQ?YaRdC}67p_H@e+TJr zpGIcn9pj=X0J=n!he3UM&oeF>XQxMfG)!4QJvJ?juG5RwRRCYYokl?VNK3x94{a(2fv;KGupaE~c1clW&TbAJ~fC&$BEQY{aQ;(A_jcM(O~ z6ogp*LJ&@mVrFEg@HmCyKD4XWfOnZ1<|XthC^VnzfDTeosTr%7SY)A21BzMfJO=f> z8dQ0?Uvy>D&UWUFho-GT@58qt9=sFAPN*DZLZmoaGeJux8sQ?7O0O!d*-MOy=eJ0^ zryP`y(|@>a-k+W>`mY54y9&ZK;dmd z+Wh$7zoBy1H4u1%0>4Wga@B%m9^yi`pSSin)U}b%`>{zQ%K>I>B~QaCFxG+w^H12$u$-sP7kob$ z>yjkY2k8W<1-nW^s|4**8g@-D2*5BCGR8r|{pokia7cNUbQEnE-UtjZgz}*g%e;k9 zOn+qpiXQj|>QBUMxQznRChYaAK>OA!2Fh?>cmszZINI8c!sgaWE__1nR%+|B<}iK2 zA0p;Nl)l8O;nU0X>U;pFS;bUIaumFPP_q^p3tGDeFLB16rj3~&6-_^CyiQWJml;FE zKe%e_;9lGuR2Z0JDF7evrxUO1IdsJb=QX`?bQ9*g^wW~rFno#D|C|Qt{VE#&6Oc|R e{3nh4Amr`8>H4pJIySlI*Z%-?tB3`rCIA3g6Wq4| 
diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index 7c524f2ec..3357ddf3b 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -69,7 +69,6 @@ "ohlc", "penguins", "platformer-terrain", - "points", "political-contributions", "population", "population_engineers_hurricanes", @@ -151,7 +150,7 @@ class Metadata(TypedDict, total=False): ``Metadata`` keywords form constraints to filter a table like the below sample: ``` - shape: (73, 13) + shape: (72, 13) ┌────────────────┬────────┬────────────────┬───┬───────────────┬───────────────┐ │ dataset_name ┆ suffix ┆ file_name ┆ … ┆ sha ┆ url │ │ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- │ diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index b1395d4fc..d75cdb593 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -1684,7 +1684,7 @@ def with_property_setters(cls: type[TSchemaBase]) -> type[TSchemaBase]: ], str, ] = { - "vega-datasets": "main", + "vega-datasets": "3.0.0-alpha.1", "vega-embed": "6", "vega-lite": "v5.21.0", "vegafusion": "1.6.6", diff --git a/pyproject.toml b/pyproject.toml index 9fa203f37..b9edc7ea2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,9 +104,9 @@ doc = [ [tool.altair.vega] # Minimum/exact versions, for projects under the `vega` organization -vega-datasets = "main" # https://github.com/vega/vega-datasets -vega-embed = "6" # https://github.com/vega/vega-embed -vega-lite = "v5.21.0" # https://github.com/vega/vega-lite +vega-datasets = "3.0.0-alpha.1" # https://github.com/vega/vega-datasets +vega-embed = "6" # https://github.com/vega/vega-embed +vega-lite = "v5.21.0" # https://github.com/vega/vega-lite [tool.hatch] build = { include = ["/altair"], artifacts = ["altair/jupyter/js/index.js"] } diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 81ee5e3f3..f112cacb8 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -227,7 +227,6 @@ def test_load_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: "ohlc", "penguins", "platformer-terrain", - "points", "political-contributions", "population", "population_engineers_hurricanes", diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 6c8c75fe5..a7c1d06c4 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -20,7 +20,7 @@ import types from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, ClassVar, Literal from tools import fs from tools.codemod import ruff @@ -40,9 +40,7 @@ else: from typing_extensions import TypeAlias - _PathAlias: TypeAlias = Literal[ - "typing", "metadata-csv", "metadata", "schemas", "datapackage" - ] + _PathAlias: TypeAlias = Literal["typing", "metadata-csv", "metadata", "schemas"] PathMap: TypeAlias = Mapping[_PathAlias, Path] __all__ = ["app"] @@ -54,33 +52,19 @@ class Application: - """ - Top-level context. - - Parameters - ---------- - out_dir_tools, out_dir_altair - Directories to store metadata files. - out_fp_typing - Path to write metadata-derived typing module. 
- - See Also - -------- - - tools.datasets.npm.Npm - """ - - def __init__( - self, out_dir_tools: Path, out_dir_altair: Path, out_fp_typing: Path - ) -> None: - fs.mkdir(out_dir_tools) + """Top-level context.""" + + OUT_DIR: ClassVar[Path] = fs.REPO_ROOT / "altair" / "datasets" + + def __init__(self) -> None: METADATA = "metadata" + out_meta = self.OUT_DIR / "_metadata" self.paths = types.MappingProxyType["_PathAlias", Path]( { - "typing": out_fp_typing, - "metadata-csv": out_dir_altair / f"{METADATA}.csv.gz", - "metadata": out_dir_altair / f"{METADATA}.parquet", - "schemas": out_dir_altair / "schemas.json.gz", - "datapackage": out_dir_tools / "datapackage.json", + "typing": self.OUT_DIR / "_typing.py", + "metadata-csv": out_meta / f"{METADATA}.csv.gz", + "metadata": out_meta / f"{METADATA}.parquet", + "schemas": out_meta / "schemas.json.gz", } ) self._npm: Npm = Npm(self.paths) @@ -89,9 +73,7 @@ def __init__( def npm(self) -> Npm: return self._npm - def refresh( - self, tag: Any, /, *, include_typing: bool = False, frozen: bool = False - ) -> pl.DataFrame: + def refresh(self, tag: Any, /, *, include_typing: bool = False) -> pl.DataFrame: """ Update and sync all dataset metadata files. @@ -101,17 +83,9 @@ def refresh( Branch or release version to build against. include_typing Regenerate ``altair.datasets._typing``. - frozen - Don't perform any requests. - - .. note:: - **Temporary** measure to work from ``main`` until `vega-datasets@3`_. - - .. _vega-datasets@3: - https://github.com/vega/vega-datasets/issues/654 """ print("Syncing datasets ...") - dpkg = self.npm.datapackage(tag=tag, frozen=frozen) + dpkg = self.npm.datapackage(tag=tag) self.write_parquet(dpkg.core, self.paths["metadata"]) self.write_json_gzip(dpkg.schemas(), self.paths["schemas"]) self.write_csv_gzip(dpkg.metadata_csv(), self.paths["metadata-csv"]) @@ -226,9 +200,4 @@ def generate_typing(self, dpkg: datapackage.DataPackage) -> None: ruff.write_lint_format(self.paths["typing"], contents) -_alt_datasets = fs.REPO_ROOT / "altair" / "datasets" -app = Application( - Path(__file__).parent / "_metadata", - _alt_datasets / "_metadata", - _alt_datasets / "_typing.py", -) +app = Application() diff --git a/tools/datasets/_metadata/datapackage.json b/tools/datasets/_metadata/datapackage.json deleted file mode 100644 index df9d40e85..000000000 --- a/tools/datasets/_metadata/datapackage.json +++ /dev/null @@ -1 +0,0 @@ -{"name": "vega-datasets", "description": "Common repository for example datasets used by Vega related projects. \nBSD-3-Clause license applies only to package code and infrastructure. Users should verify their use of datasets \ncomplies with the license terms of the original sources. 
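The `tools/datasets/__init__.py` changes above replace the three-path constructor with a parameter-free `Application`, deriving every output location from `Application.OUT_DIR` and dropping the `frozen`/local-datapackage workaround. A minimal usage sketch of the updated refresh flow, assuming the refactored module shown in this patch (the tag string is illustrative and not taken from the diff):

```python
# Hypothetical maintenance run against the new upstream release.
# `app` is the module-level Application() instance added in this patch;
# the exact tag string accepted by Npm.datapackage() is an assumption here,
# and running this needs the repo's dev environment (polars, network access).
from tools.datasets import app

core = app.refresh("v3.0.0-alpha.1", include_typing=True)

# refresh() is annotated to return a polars DataFrame; per the hunk above it
# also rewrites metadata.parquet, metadata.csv.gz and schemas.json.gz under
# Application.OUT_DIR / "_metadata", plus altair/datasets/_typing.py when
# include_typing=True.
print(core.shape)
```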
Dataset license information, where included, \nis a reference starting point only and is provided without any warranty of accuracy or completeness.\n", "homepage": "http://github.com/vega/vega-datasets.git", "licenses": [{"name": "BSD-3-Clause", "path": "https://opensource.org/license/bsd-3-clause", "title": "The 3-Clause BSD License"}], "contributors": [{"title": "UW Interactive Data Lab", "path": "http://idl.cs.washington.edu"}, {"title": "vega-datasets contributors", "path": "https://github.com/vega/vega-datasets/graphs/contributors"}], "version": "2.11.0", "created": "2025-01-12T14:23:04.938086+00:00", "resources": [{"name": "7zip.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "7zip.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "hash": "sha1:6586d6c00887cd48850099c174a42bb1677ade0c", "bytes": 3969}, {"name": "airports.csv", "type": "table", "path": "airports.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:608ba6d51fa70584c3fa1d31eb94533302553838", "bytes": 210365, "schema": {"fields": [{"name": "iata", "type": "string"}, {"name": "name", "type": "string"}, {"name": "city", "type": "string"}, {"name": "state", "type": "string"}, {"name": "country", "type": "string"}, {"name": "latitude", "type": "number"}, {"name": "longitude", "type": "number"}]}}, {"name": "annual-precip.json", "type": "json", "description": "A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell.", "sources": [{"title": "Climate Forecast System Version 2", "path": "https://www.ncdc.noaa.gov/data-access/model-data/model-datasets/climate-forecast-system-version2-cfsv2"}], "path": "annual-precip.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:719e73406cfc08f16dda651513ae1113edd75845", "bytes": 266265}, {"name": "anscombe.json", "type": "table", "description": "Graphs in Statistical Analysis, F. J. Anscombe, The American Statistician.", "path": "anscombe.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:11ae97090b6263bdf0c8661156a44a5b782e0787", "bytes": 1703, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Series", "type": "string"}, {"name": "X", "type": "integer"}, {"name": "Y", "type": "number"}]}}, {"name": "barley.json", "type": "table", "description": "The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites.\n\nIt was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper \"Statistical Determination of Barley Varietal Adaption\".\n\nR.A. 
Fisher's popularized its use in the field of statistics when he included it in his book \"The Design of Experiments\".\n\nSince then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s.\n", "sources": [{"title": "The Design of Experiments Reference", "path": "https://en.wikipedia.org/wiki/The_Design_of_Experiments"}, {"title": "Trellis Charts Paper", "path": "http://ml.stat.purdue.edu/stat695t/writings/TrellisDesignControl.pdf"}], "path": "barley.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:8dc50de2509b6e197ce95c24c98f90d9d1ab138c", "bytes": 8487, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "yield", "type": "number"}, {"name": "variety", "type": "string"}, {"name": "year", "type": "integer"}, {"name": "site", "type": "string"}]}}, {"name": "birdstrikes.csv", "type": "table", "description": "Records of reported wildlife strikes received by the U.S. FAA", "sources": [{"title": "FAA Wildlife Strike Database", "path": "http://wildlife.faa.gov"}], "path": "birdstrikes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:1b8b190c9bc02ef7bcbfe5a8a70f61b1616d3f6c", "bytes": 1223329, "schema": {"fields": [{"name": "Airport Name", "type": "string"}, {"name": "Aircraft Make Model", "type": "string"}, {"name": "Effect Amount of damage", "type": "string"}, {"name": "Flight Date", "type": "date"}, {"name": "Aircraft Airline Operator", "type": "string"}, {"name": "Origin State", "type": "string"}, {"name": "Phase of flight", "type": "string"}, {"name": "Wildlife Size", "type": "string"}, {"name": "Wildlife Species", "type": "string"}, {"name": "Time of day", "type": "string"}, {"name": "Cost Other", "type": "integer"}, {"name": "Cost Repair", "type": "integer"}, {"name": "Cost Total $", "type": "integer"}, {"name": "Speed IAS in knots", "type": "integer"}]}}, {"name": "budget.json", "type": "table", "description": "Historical and forecasted federal revenue/receipts produced in 2016 by the U.S. 
Office of Management and Budget.", "sources": [{"title": "Office of Management and Budget - Budget FY 2016 - Receipts", "path": "https://www.govinfo.gov/app/details/BUDGET-2016-DB/BUDGET-2016-DB-3"}], "path": "budget.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:5b18c08b28fb782f54ca98ce6a1dd220f269adf1", "bytes": 391353, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Source Category Code", "type": "integer"}, {"name": "Source category name", "type": "string"}, {"name": "Source subcategory", "type": "integer"}, {"name": "Source subcategory name", "type": "string"}, {"name": "Agency code", "type": "integer"}, {"name": "Agency name", "type": "string"}, {"name": "Bureau code", "type": "integer"}, {"name": "Bureau name", "type": "string"}, {"name": "Account code", "type": "integer"}, {"name": "Account name", "type": "string"}, {"name": "Treasury Agency code", "type": "integer"}, {"name": "On- or off-budget", "type": "string"}, {"name": "1962", "type": "string"}, {"name": "1963", "type": "string"}, {"name": "1964", "type": "string"}, {"name": "1965", "type": "string"}, {"name": "1966", "type": "string"}, {"name": "1967", "type": "string"}, {"name": "1968", "type": "string"}, {"name": "1969", "type": "string"}, {"name": "1970", "type": "string"}, {"name": "1971", "type": "string"}, {"name": "1972", "type": "string"}, {"name": "1973", "type": "string"}, {"name": "1974", "type": "string"}, {"name": "1975", "type": "string"}, {"name": "1976", "type": "string"}, {"name": "TQ", "type": "string"}, {"name": "1977", "type": "string"}, {"name": "1978", "type": "string"}, {"name": "1979", "type": "string"}, {"name": "1980", "type": "string"}, {"name": "1981", "type": "string"}, {"name": "1982", "type": "string"}, {"name": "1983", "type": "string"}, {"name": "1984", "type": "string"}, {"name": "1985", "type": "string"}, {"name": "1986", "type": "string"}, {"name": "1987", "type": "string"}, {"name": "1988", "type": "string"}, {"name": "1989", "type": "string"}, {"name": "1990", "type": "string"}, {"name": "1991", "type": "string"}, {"name": "1992", "type": "string"}, {"name": "1993", "type": "string"}, {"name": "1994", "type": "string"}, {"name": "1995", "type": "string"}, {"name": "1996", "type": "string"}, {"name": "1997", "type": "string"}, {"name": "1998", "type": "string"}, {"name": "1999", "type": "string"}, {"name": "2000", "type": "string"}, {"name": "2001", "type": "string"}, {"name": "2002", "type": "string"}, {"name": "2003", "type": "string"}, {"name": "2004", "type": "string"}, {"name": "2005", "type": "string"}, {"name": "2006", "type": "string"}, {"name": "2007", "type": "string"}, {"name": "2008", "type": "string"}, {"name": "2009", "type": "string"}, {"name": "2010", "type": "string"}, {"name": "2011", "type": "string"}, {"name": "2012", "type": "string"}, {"name": "2013", "type": "string"}, {"name": "2014", "type": "string"}, {"name": "2015", "type": "string"}, {"name": "2016", "type": "string"}, {"name": "2017", "type": "string"}, {"name": "2018", "type": "string"}, {"name": "2019", "type": "string"}, {"name": "2020", "type": "string"}]}}, {"name": "budgets.json", "type": "table", "path": "budgets.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:8a909e24f698a3b0f6c637c30ec95e7e17df7ef6", "bytes": 18079, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "budgetYear", "type": "integer"}, {"name": "forecastYear", "type": "integer"}, 
{"name": "value", "type": "number"}]}}, {"name": "burtin.json", "type": "table", "description": "The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine.\n\nThe dataset compares the performance of three antibiotics against 16 different bacteria.\n\nNumerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness.\n\nThe dataset was featured as an example in the Protovis project, a precursor to D3.js.\n\nAs noted in the Protovis example, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 \u03bcg/ml, and an exaggeration of some values for penicillin\".\n\nThe vega-datsets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together.\n\nThe caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) \nreads as follows:\n\n> ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin\n>\n>\n> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in red and gram- in blue) with their sensitivities to penicillin, and streptomycin.\n>\n> The effectiveness of the antibiotics is expressed as the highest dilution in \u03bc/ml. which inhibits the test organism.\n>\n> High dilutions are toward the periphery; consequently the length of the colored bar is proportional to the effectiveness.\n>\n> It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. vulgaris, S. schottmuelleri and M. tuberculosis.\n>\n> Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to neomycin, although the majority of these are sensitive to neomycin.\n>\n> It also inhibits actinomycetes, but is inactive against viruses and fungi. 
Its mode of action is not understood.\n", "sources": [{"title": "Scope Magazine", "path": "https://graphicdesignarchives.org/projects/scope-magazine-vol-iii-5/"}, {"title": "Protovis Antibiotics Example", "path": "https://mbostock.github.io/protovis/ex/antibiotics-burtin.html"}], "path": "burtin.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d8a82abaad7dba4f9cd8cee402ba3bf07e70d0e4", "bytes": 2743, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Bacteria", "type": "string"}, {"name": "Penicillin", "type": "number"}, {"name": "Streptomycin", "type": "number"}, {"name": "Neomycin", "type": "number"}, {"name": "Gram_Staining", "type": "string"}, {"name": "Genus", "type": "string"}]}}, {"name": "cars.json", "type": "table", "description": "Collection of car specifications and performance metrics from various automobile manufacturers.", "sources": [{"title": "StatLib Datasets Archive", "path": "http://lib.stat.cmu.edu/datasets/"}], "path": "cars.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:1d56d3fa6da01af9ece2d6397892fe5bb6f47c3d", "bytes": 100492, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Name", "type": "string"}, {"name": "Miles_per_Gallon", "type": "integer"}, {"name": "Cylinders", "type": "integer"}, {"name": "Displacement", "type": "number"}, {"name": "Horsepower", "type": "integer"}, {"name": "Weight_in_lbs", "type": "integer"}, {"name": "Acceleration", "type": "number"}, {"name": "Year", "type": "date"}, {"name": "Origin", "type": "string"}]}}, {"name": "co2-concentration.csv", "type": "table", "description": "Scripps CO2 program data ut modified to only include date, CO2, seasonally adjusted CO2. \nOnly includes rows with valid data.", "sources": [{"title": "Scripps CO2 Program", "path": "https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record"}], "path": "co2-concentration.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:b8715cbd2a8d0c139020a73fdb4d231f8bde193a", "bytes": 18547, "schema": {"fields": [{"name": "Date", "type": "date"}, {"name": "CO2", "type": "number"}, {"name": "adjusted CO2", "type": "number"}]}}, {"name": "countries.json", "type": "table", "description": "This dataset combines key demographic indicators (life expectancy at birth and\nfertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year\nintervals. It includes both current values and adjacent time period values (previous and next)\nfor each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) \nnotes that its philosophy is to fill data gaps with estimates and use current\ngeographic boundaries for historical data. 
Gapminder states that it aims to \"show people the\nbig picture\" rather than support detailed numeric analysis.", "licenses": [{"title": "Creative Commons Attribution 4.0 International", "path": "https://www.gapminder.org/free-material/"}], "sources": [{"title": "Gapminder Foundation - Life Expectancy", "path": "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Fertility", "path": "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676", "version": "14"}], "path": "countries.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:0070959b7f1a09475baa5099098240ae81026e72", "bytes": 99457, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "_comment", "type": "string"}, {"name": "year", "type": "integer", "description": "Years from 1955 to 2000 at 5-year intervals"}, {"name": "fertility", "type": "number", "description": "Fertility rate (average number of children per woman) for the given year"}, {"name": "life_expect", "type": "number", "description": "Life expectancy in years for the given year"}, {"name": "n_fertility", "type": "number", "description": "Fertility rate for the next 5-year interval"}, {"name": "n_life_expect", "type": "number", "description": "Life expectancy for the next 5-year interval"}, {"name": "country", "type": "string", "description": "Name of the country"}]}}, {"name": "crimea.json", "type": "table", "description": "This dataset, which informed Florence Nightingale's groundbreaking work in public health, details \nmonthly mortality rates from British military hospitals during the Crimean War (1854-1856). \n\nNightingale credits Dr. William Farr for compiling the data from the 1858 [Medical and Surgical \nHistory of the British Army](http://resource.nlm.nih.gov/62510370R). The dataset categorizes \ndeaths into \"zymotic\" diseases (preventable infectious diseases), wounds/injuries, and other causes. \nCovering the period from April 1854 to March 1856, the dataset includes monthly army strength \nalongside mortality figures. Nightingale transformed this data into her now-famous [polar area \ndiagrams](https://iiif.lib.harvard.edu/manifests/view/drs:7420433$25i). \n\nThe annual mortality rates plotted in the chart can be calculated from the dataset using the formula \n> (Deaths × 1000 × 12) ÷ Army Size. \n\nAs [The Lancet](https://pmc.ncbi.nlm.nih.gov/articles/PMC7252134/) argued in 2020, Nightingale's \ninnovative visualizations proved that \"far more men died of disease, infection, and exposure \nthan in battle\u2014a fact that shocked the British nation.\" Her work also vividly illustrated \nthe dramatic impact of sanitary reforms, particularly in reducing preventable deaths.", "sources": [{"title": "Nightingale, Florence. A contribution to the sanitary history of the British army during the late war with Russia. London : John W. Parker and Son, 1859. Table II. 
Table showing the Estimated Average Monthly Strength of the Army; and the Deaths and Annual Rate of Mortality per 1,000 in each month, from April 1854, to March 1856 (inclusive), in the Hospitals of the Army in the East.\n", "path": "https://nrs.lib.harvard.edu/urn-3:hms.count:1177146?n=21"}], "path": "crimea.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d2df500c612051a21fe324237a465a62d5fe01b6", "bytes": 2183, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date", "description": "First day of each month during the observation period, in ISO 8601 format (YYYY-MM-DD)"}, {"name": "wounds", "type": "integer", "description": "Deaths from \"Wounds and Injuries\" which comprised: Luxatio (dislocation), Sub-Luxatio (partial dislocation), Vulnus Sclopitorum (gunshot wounds), Vulnus Incisum (incised wounds), Contusio (bruising), Fractura (fractures), Ambustio (burns) and Concussio-Cerebri (brain concussion)\n"}, {"name": "other", "type": "integer", "description": "Deaths from All Other Causes"}, {"name": "disease", "type": "integer", "description": "Deaths from Zymotic Diseases (preventable infectious diseases)"}, {"name": "army_size", "type": "integer", "description": "Estimated Average Monthly Strength of the Army"}]}}, {"name": "disasters.csv", "type": "table", "description": "Annual number of deaths from disasters.", "sources": [{"title": "Our World in Data - Natural Catastrophes", "path": "https://ourworldindata.org/natural-catastrophes"}], "path": "disasters.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0584ed86190870b0089d9ea67c94f3dd3feb0ec8", "bytes": 18840, "schema": {"fields": [{"name": "Entity", "type": "string"}, {"name": "Year", "type": "integer"}, {"name": "Deaths", "type": "integer"}]}}, {"name": "driving.json", "type": "table", "sources": [{"title": "New York Times", "path": "https://archive.nytimes.com/www.nytimes.com/imagepages/2010/05/02/business/02metrics.html"}], "path": "driving.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:33d0afc57fb1005e69cd3e8a6c77a26670d91979", "bytes": 3461, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "side", "type": "string"}, {"name": "year", "type": "integer"}, {"name": "miles", "type": "integer"}, {"name": "gas", "type": "number"}]}}, {"name": "earthquakes.json", "type": "json", "description": "Earthquake data retrieved Feb 6, 2018", "sources": [{"title": "USGS Earthquake Feed", "path": "https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson"}], "path": "earthquakes.json", "scheme": "file", "format": "geojson", "mediatype": "text/geojson", "encoding": "utf-8", "hash": "sha1:ed4c47436c09d5cc5f428c233fbd8074c0346fd0", "bytes": 1219853}, {"name": "ffox.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "ffox.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "hash": "sha1:0691709484a75e9d8ee55a22b1980d67d239c2c4", "bytes": 17628}, {"name": "flare-dependencies.json", "type": "table", "path": "flare-dependencies.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:10bbe538daaa34014cd5173b331f7d3c10bfda49", "bytes": 34600, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "source", "type": "integer"}, {"name": "target", "type": "integer"}]}}, {"name": 
"flare.json", "type": "table", "path": "flare.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d232ea60f875de87a7d8fc414876e19356a98b6b", "bytes": 20638, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "id", "type": "integer"}, {"name": "name", "type": "string"}]}}, {"name": "flights-10k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-10k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:769a34f3d0442be8f356651463fe925ad8b3759d", "bytes": 892400, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-200k.arrow", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-200k.arrow", "scheme": "file", "format": ".arrow", "mediatype": "application/vnd.apache.arrow.file", "hash": "sha1:74f6b3cf8b779e3ff204be2f5a9762763d50a095", "bytes": 1600864, "schema": {"fields": [{"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "time", "type": "number"}]}}, {"name": "flights-200k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-200k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4722e02637cf5f38ad9ea5d1f48cae7872dce22d", "bytes": 9863892, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "time", "type": "number"}]}}, {"name": "flights-20k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-20k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:20c920b46db4f664bed3e1420b8348527cd7c41e", "bytes": 1784867, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-2k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. 
Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-2k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d9221dc7cd477209bf87e680be3c881d8fee53cd", "bytes": 178495, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-3m.parquet", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-3m.parquet", "scheme": "file", "format": "parquet", "mediatype": "application/parquet", "hash": "sha1:9c4e0b480a1a60954a7e5c6bcc43e1c91a73caaa", "bytes": 13493022, "schema": {"fields": [{"name": "date", "type": "datetime"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-5k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-5k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:8459fa09e3ba8197928b5dba0b9f5cc380629758", "bytes": 446167, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-airport.csv", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-airport.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0ba03114891e97cfc3f83d9e3569259e7f07af7b", "bytes": 65572, "schema": {"fields": [{"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}, {"name": "count", "type": "integer"}]}}, {"name": "football.json", "type": "table", "description": "Football match outcomes across multiple divisions from 2013 to 2017, part of a\nlarger dataset from OpenFootball. 
The subset was made such that there are records for all five\nchosen divisions over the time period.", "sources": [{"title": "OpenFootball", "path": "https://github.com/openfootball/football.json"}], "path": "football.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d07898748997b9716ae699e9c2d5b91b4bb48a51", "bytes": 1207180, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "division", "type": "string"}, {"name": "home_team", "type": "string"}, {"name": "away_team", "type": "string"}, {"name": "home_score", "type": "integer"}, {"name": "away_score", "type": "integer"}]}}, {"name": "gapminder-health-income.csv", "type": "table", "description": "Per-capita income, life expectancy, population and regional grouping. Dataset does not specify \nthe reference year for the data. Gapminder historical data is subject to revisions.\n\nGapminder (v30, 2023) defines per-capita income as follows:\n>\"This is real GDP per capita (gross domestic product per person adjusted for inflation) \n>converted to international dollars using purchasing power parity rates. An international dollar \n>has the same purchasing power over GDP as the U.S. dollar has in the United States.\"\n", "licenses": [{"title": "Creative Commons Attribution 4.0 International", "path": "https://www.gapminder.org/free-material/"}], "sources": [{"title": "Gapminder Foundation", "path": "https://www.gapminder.org"}, {"title": "Gapminder GDP Per Capita Data", "path": "https://docs.google.com/spreadsheets/d/1i5AEui3WZNZqh7MQ4AKkJuCz4rRxGR_pw_9gtbcBOqQ/edit?gid=501532268#gid=501532268"}], "path": "gapminder-health-income.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:abce37a932917085023a345b1a004396e9355ac3", "bytes": 8605, "schema": {"fields": [{"name": "country", "type": "string"}, {"name": "income", "type": "integer"}, {"name": "health", "type": "number"}, {"name": "population", "type": "integer"}, {"name": "region", "type": "string"}]}}, {"name": "gapminder.json", "type": "table", "description": "This dataset combines key demographic indicators (life expectancy at birth, \npopulation, and fertility rate measured as babies per woman) for various countries from 1955 \nto 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable \ngrouping countries. Gapminder's data documentation notes that its philosophy is to fill data \ngaps with estimates and use current geographic boundaries for historical data. Gapminder \nstates that it aims to \"show people the big picture\" rather than support detailed numeric \nanalysis.\n\nNotes:\n1. Country Selection: The set of countries in this file matches the version of this dataset \n originally added to this collection in 2015. The specific criteria for country selection \n in that version are not known. Data for Aruba are no longer available in the new version. \n Hong Kong has been revised to Hong Kong, China in the new version.\n\n2. Data Precision: The precision of float values may have changed from the original version. \n These changes reflect the most recent source data used for each indicator.\n\n3. Regional Groupings: The 'cluster' column represents a regional mapping of countries \n corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. 
To \n preserve continuity with previous versions of this dataset, we have retained the column \n name 'cluster' instead of renaming it to 'six_regions'. The six regions represented are: \n `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`.", "sources": [{"title": "Gapminder Foundation - Life Expectancy (Data)", "path": "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundatio - Life Expectancy (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd004/"}, {"title": "Gapminder Foundation - Population (Data)", "path": "https://docs.google.com/spreadsheets/d/1c1luQNdpH90tNbMIeU7jD__59wQ0bdIGRFpbMm8ZBTk/edit?gid=176703676#gid=176703676", "version": "7"}, {"title": "Gapminder Foundation - Population (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd003/"}, {"title": "Gapminder Foundation - Fertility (Data)", "path": "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Fertility Documentation (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd008/"}, {"title": "Gapminder Foundation - Data Geographies (Data)", "path": "https://docs.google.com/spreadsheets/d/1qHalit8sXC0R8oVXibc2wa2gY7bkwGzOybEMTWp-08o/edit?gid=1597424158#gid=1597424158", "version": "2"}, {"title": "Gapminder Foundation - Data Geographies (Documentation)", "path": "https://www.gapminder.org/data/geo/"}, {"title": "Gapminder Data Documentation", "path": "https://www.gapminder.org/data/documentation/"}], "path": "gapminder.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:8cb2f0fc23ce612e5f0c7bbe3dcac57f6764b7b3", "bytes": 75201, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer", "description": "Years from 1955 to 2005 at 5-year intervals"}, {"name": "country", "type": "string", "description": "Name of the country"}, {"name": "cluster", "type": "integer", "description": "A categorical variable (values 0-5) grouping countries by region"}, {"name": "pop", "type": "integer", "description": "Population of the country"}, {"name": "life_expect", "type": "number", "description": "Life expectancy in years"}, {"name": "fertility", "type": "number", "description": "Fertility rate (average number of children per woman"}]}}, {"name": "gimp.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "gimp.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "hash": "sha1:cf0505dd72eb52558f6f71bd6f43663df4f2f82c", "bytes": 8211}, {"name": "github.csv", "type": "table", "description": "Generated using `/scripts/github.py`.", "path": "github.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:18547064dd687c328ea2fb5023cae6417ca6f050", "bytes": 21059, "schema": {"fields": [{"name": "time", "type": "string"}, {"name": "count", "type": "integer"}]}}, {"name": "global-temp.csv", "type": "table", "description": "Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023.", "sources": [{"title": "NASA Goddard Institute for Space Studies", "path": "https://data.giss.nasa.gov/gistemp/"}], "path": 
"global-temp.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:01a4f05ed45ce939307dcd9bc4e75ed5cd1ab202", "bytes": 1663, "schema": {"fields": [{"name": "year", "type": "integer"}, {"name": "temp", "type": "number"}]}}, {"name": "income.json", "type": "table", "path": "income.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:ebfd02fd584009ee391bfc5d33972e4c94f507ab", "bytes": 72771, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "region", "type": "string"}, {"name": "id", "type": "integer"}, {"name": "pct", "type": "number"}, {"name": "total", "type": "integer"}, {"name": "group", "type": "string"}]}}, {"name": "iowa-electricity.csv", "type": "table", "description": "The state of Iowa has dramatically increased its production of renewable \nwind power in recent years. This file contains the annual net generation of electricity in \nthe state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. \nIt is useful for illustrating stacked area charts.", "sources": [{"title": "U.S. Energy Information Administration", "path": "https://www.eia.gov/beta/electricity/data/browser/#/topic/0?agg=2,0,1&fuel=vvg&geo=00000g&sec=g&linechart=ELEC.GEN.OTH-IA-99.A~ELEC.GEN.COW-IA-99.A~ELEC.GEN.PEL-IA-99.A~ELEC.GEN.PC-IA-99.A~ELEC.GEN.NG-IA-99.A~~ELEC.GEN.NUC-IA-99.A~ELEC.GEN.HYC-IA-99.A~ELEC.GEN.AOR-IA-99.A~ELEC.GEN.HPS-IA-99.A~&columnchart=ELEC.GEN.ALL-IA-99.A&map=ELEC.GEN.ALL-IA-99.A&freq=A&start=2001&end=2017&ctype=linechart<ype=pin&tab=overview&maptype=0&rse=0&pin="}], "path": "iowa-electricity.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:214238f23d7a57e3398f4e9f1e87e61abb23cafc", "bytes": 1531, "schema": {"fields": [{"name": "year", "type": "date"}, {"name": "source", "type": "string"}, {"name": "net_generation", "type": "integer"}]}}, {"name": "jobs.json", "type": "table", "description": "U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790.\n\nOriginally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Vi\u00e9gas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). \nThe dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/).\n\nData is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of IPUMS USA, according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions).\n\nIPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, stating:\n>We're excited to hear that this dataset made its way to this repository and is being used by students for data visualization. 
We allow for these types of redistributions of summary data so long as the underlying microdata records are not shared.\n\nThis dataset contains only summary statistics and does not include any underlying microdata records.\n\n1. This dataset represents summary data. The underlying microdata records are not included.\n2. Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) \n(person weight) variable as an expansion factor when working with IPUMS USA extracts.\n3. Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly.\n\nWhen using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml).\nThe organization requests use of the following citation for this json file:\n\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0\n", "sources": [{"title": "IPUMS USA", "path": "https://usa.ipums.org/usa/", "version": "6.0"}], "path": "jobs.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:69d386f47305f4d8fd2886e805004fbdd71568e9", "bytes": 936649, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "job", "type": "string", "description": "The occupation title"}, {"name": "sex", "type": "string", "description": "Sex (men/women)"}, {"name": "year", "type": "integer", "description": "Census year"}, {"name": "count", "type": "integer", "description": "Number of individuals in the occupation"}, {"name": "perc", "type": "number", "description": "Percentage of the workforce in the occupation"}]}}, {"name": "la-riots.csv", "type": "table", "description": "More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles \nfor five days starting on April 29, 1992. This file contains metadata about each person, including the geographic \ncoordinates of their death. Compiled and published by the Los Angeles Times Data Desk.", "sources": [{"title": "LA Riots Deaths, Los Angeles Times Data Desk", "path": "http://spreadsheets.latimes.com/la-riots-deaths/"}], "path": "la-riots.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:94ee8ad8198d2954f77e3a98268d8b1f7fe7d086", "bytes": 7432, "schema": {"fields": [{"name": "first_name", "type": "string"}, {"name": "last_name", "type": "string"}, {"name": "age", "type": "integer"}, {"name": "gender", "type": "string"}, {"name": "race", "type": "string"}, {"name": "death_date", "type": "date"}, {"name": "address", "type": "string"}, {"name": "neighborhood", "type": "string"}, {"name": "type", "type": "string"}, {"name": "longitude", "type": "number"}, {"name": "latitude", "type": "number"}]}}, {"name": "londonboroughs.json", "type": "json", "description": "Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. 
\nOriginal data \"contains National Statistics data \u00a9 Crown copyright and database right (2015)\" \nand \"Contains Ordnance Survey data \u00a9 Crown copyright and database right [2015].", "sources": [{"title": "Statistical GIS Boundary Files, London Datastore", "path": "https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london"}], "path": "londonBoroughs.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "hash": "sha1:d90805055ffdfe5163a7655c4847dc61df45f92b", "bytes": 14732}, {"name": "londoncentroids.json", "type": "table", "description": "Calculated from `londonBoroughs.json` using [`d3.geoCentroid`](https://d3js.org/d3-geo/math#geoCentroid).", "path": "londonCentroids.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:2e24c01140cfbcad5e1c859be6df4efebca2fbf5", "bytes": 2339, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "cx", "type": "number"}, {"name": "cy", "type": "number"}]}}, {"name": "londontubelines.json", "type": "json", "description": "Selected rail lines simplified from source.", "sources": [{"title": "London Tube Data", "path": "https://github.com/oobrien/vis/tree/master/tube/data"}], "path": "londonTubeLines.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "hash": "sha1:1b21ea5339320090b106082bd9d39a1055aadb18", "bytes": 80097}, {"name": "lookup_groups.csv", "type": "table", "path": "lookup_groups.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:741df36729a9d84d18ec42f23a386b53e7e3c428", "bytes": 77, "schema": {"fields": [{"name": "group", "type": "integer"}, {"name": "person", "type": "string"}]}}, {"name": "lookup_people.csv", "type": "table", "path": "lookup_people.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:c79f69afb3ff81a0c8ddc01f5cf2f078e288457c", "bytes": 125, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "integer"}, {"name": "height", "type": "integer"}]}}, {"name": "miserables.json", "type": "json", "path": "miserables.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:a8b0faaa94c7425c49fe36ea1a93319430fec426", "bytes": 12372}, {"name": "monarchs.json", "type": "table", "description": "A chronological list of English and British monarchs from Elizabeth I through George IV.\nEach entry includes:\n\nThe dataset contains two intentional inaccuracies to maintain compatibility with \nthe [Wheat and Wages](https://vega.github.io/vega/examples/wheat-and-wages/) example visualization:\n1. the start date for the reign of Elizabeth I is shown as 1565, instead of 1558;\n2. the end date for the reign of George IV is shown as 1820, instead of 1830.\nThese discrepancies align the `monarchs.json` dataset with the start and end dates of the `wheat.json` dataset used i the visualization.\nThe entry \"W&M\" represents the joint reign of William III and Mary II. While the dataset shows their reign as 1689-1702, \nthe official Web site of the British royal family indicates that Mary II's reign ended in 1694, though William III continued to rule until 1702.\nThe `commonwealth` field is used to flag the period from 1649 to 1660, which includes the Commonwealth of England, the Protectorate, \nand the period leading to the Restoration. 
While historically more accurate to call this the \"interregnum,\" the field name of `commonwealth` \nfrom the original dataset is retained for backwards compatibility.\nThe dataset was revised in Aug. 2024. James II's reign now ends in 1688 (previously 1689).\nSource data has been verified against the kings & queens and interregnum pages of the official website of the British royal family (retrieved in Aug. 2024).\nContent on the site is protected by Crown Copyright. \nUnder the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most \nCrown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/).", "sources": [{"title": "The Royal Family - Kings & Queens", "path": "https://www.royal.uk/kings-and-queens-1066"}, {"title": "The Royal Family - Interregnum", "path": "https://www.royal.uk/interregnum-1649-1660"}], "path": "monarchs.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:921dfa487a4198cfe78f743aa0aa87ad921642df", "bytes": 683, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string", "description": "The ruler's name or identifier (e.g., \"W&M\" for William and Mary, \"Cromwell\" for the period of interregnum)"}, {"name": "start", "type": "integer", "description": "The year their rule began"}, {"name": "end", "type": "integer", "description": "The year their rule ended"}, {"name": "index", "type": "integer", "description": "A zero-based sequential number assigned to each entry, representing the chronological order of rulers"}]}}, {"name": "movies.json", "type": "table", "description": "The dataset has well known and intentionally included errors. 
\nThis dataset is provided for instructional purposes, including the need to reckon with dirty data.", "path": "movies.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:e38178f99454568c5160fc759184a1a1471cc558", "bytes": 1399981, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Title", "type": "string"}, {"name": "US Gross", "type": "integer"}, {"name": "Worldwide Gross", "type": "integer"}, {"name": "US DVD Sales", "type": "integer"}, {"name": "Production Budget", "type": "integer"}, {"name": "Release Date", "type": "string"}, {"name": "MPAA Rating", "type": "string"}, {"name": "Running Time min", "type": "integer"}, {"name": "Distributor", "type": "string"}, {"name": "Source", "type": "string"}, {"name": "Major Genre", "type": "string"}, {"name": "Creative Type", "type": "string"}, {"name": "Director", "type": "string"}, {"name": "Rotten Tomatoes Rating", "type": "integer"}, {"name": "IMDB Rating", "type": "number"}, {"name": "IMDB Votes", "type": "integer"}]}}, {"name": "normal-2d.json", "type": "table", "path": "normal-2d.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4303306ec275209fcba008cbd3a5f29c9e612424", "bytes": 34398, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "u", "type": "number"}, {"name": "v", "type": "number"}]}}, {"name": "obesity.json", "type": "table", "path": "obesity.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:6da8129ed0b0333c88302e153824b06f7859aac9", "bytes": 2202, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "id", "type": "integer"}, {"name": "rate", "type": "number"}, {"name": "state", "type": "string"}]}}, {"name": "ohlc.json", "type": "table", "description": "This dataset contains the performance of the Chicago Board Options Exchange \n[Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/%5EVIX#overview))\nin the summer of 2009.", "sources": [{"title": "Yahoo Finance VIX Data", "path": "https://finance.yahoo.com/chart/%5EVIX"}], "path": "ohlc.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:9b3d93e8479d3ddeee29b5e22909132346ac0a3b", "bytes": 5737, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "open", "type": "number"}, {"name": "high", "type": "number"}, {"name": "low", "type": "number"}, {"name": "close", "type": "number"}, {"name": "signal", "type": "string"}, {"name": "ret", "type": "number"}]}}, {"name": "penguins.json", "type": "table", "description": "Palmer Archipelago (Antarctica) penguin data collected and made available by \n[Dr. 
Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) \nand the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research \nNetwork](https://lternet.edu/).", "sources": [{"title": "Palmer Station Antarctica LTER", "path": "https://pal.lternet.edu/"}, {"title": "Allison Horst's Penguins Repository", "path": "https://github.com/allisonhorst/penguins"}], "path": "penguins.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:517b6d3267174b1b65691a37cbd59c1739155866", "bytes": 67119, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Species", "type": "string"}, {"name": "Island", "type": "string"}, {"name": "Beak Length (mm)", "type": "number"}, {"name": "Beak Depth (mm)", "type": "number"}, {"name": "Flipper Length (mm)", "type": "integer"}, {"name": "Body Mass (g)", "type": "integer"}, {"name": "Sex", "type": "string"}]}}, {"name": "platformer-terrain.json", "type": "table", "description": "Assets from the video game Celeste.", "sources": [{"title": "Celeste Game", "path": "http://www.celestegame.com/"}], "path": "platformer-terrain.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:01df4411cb16bf758fe8ffa6529507419189edc2", "bytes": 1424097, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "x", "type": "integer"}, {"name": "y", "type": "integer"}, {"name": "lumosity", "type": "number"}, {"name": "saturation", "type": "integer"}, {"name": "name", "type": "string"}, {"name": "id", "type": "string"}, {"name": "color", "type": "string"}, {"name": "key", "type": "string"}]}}, {"name": "points.json", "type": "table", "path": "points.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4716a117308962f3596179d7d7d2ad729a19cda7", "bytes": 4926, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "x", "type": "number"}, {"name": "y", "type": "number"}]}}, {"name": "political-contributions.json", "type": "table", "description": "Summary financial information on contributions to candidates for U.S. \nelections. An updated version of this dataset is available from the \"all candidates\" files \n(in pipe-delimited format) on the bulk data download page of the U.S. Federal Election \nCommission, or, alternatively, via OpenFEC. Information on each of the 25 columns is \navailable from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/).\nThe sample dataset in `political-contributions.json` contains 58 records with dates from 2015.\n\nFEC data is subject to the commission's:\n- [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/)\n- [Privacy and Security Policy](https://www.fec.gov/about/privacy-and-security-policy/)\n- [Acceptable Use Policy](https://github.com/fecgov/FEC/blob/master/ACCEPTABLE-USE-POLICY.md)\n\nAdditionally, the FEC's GitHub [repository](https://github.com/fecgov/FEC) states:\n> This project is in the public domain within the United States, and we waive worldwide \n> copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/)\n> dedication. Read more on our license page.\n> A few restrictions limit the way you can use FEC data. For example, you can't use \n> contributor lists for commercial purposes or to solicit donations. 
Learn more on \n> [FEC.gov](https://www.fec.gov/).", "sources": [{"title": "Federal Election Commission Bulk Data", "path": "https://www.fec.gov/data/browse-data/?tab=bulk-data"}, {"title": "OpenFEC API", "path": "https://api.open.fec.gov/developers/"}], "path": "political-contributions.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4aa2e19fa392cc9448aa8ffbdad15b014371f499", "bytes": 50265, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Candidate_Identification", "type": "string"}, {"name": "Candidate_Name", "type": "string"}, {"name": "Incumbent_Challenger_Status", "type": "string"}, {"name": "Party_Code", "type": "integer"}, {"name": "Party_Affiliation", "type": "string"}, {"name": "Total_Receipts", "type": "number"}, {"name": "Transfers_from_Authorized_Committees", "type": "integer"}, {"name": "Total_Disbursements", "type": "number"}, {"name": "Transfers_to_Authorized_Committees", "type": "number"}, {"name": "Beginning_Cash", "type": "number"}, {"name": "Ending_Cash", "type": "number"}, {"name": "Contributions_from_Candidate", "type": "number"}, {"name": "Loans_from_Candidate", "type": "integer"}, {"name": "Other_Loans", "type": "integer"}, {"name": "Candidate_Loan_Repayments", "type": "number"}, {"name": "Other_Loan_Repayments", "type": "integer"}, {"name": "Debts_Owed_By", "type": "number"}, {"name": "Total_Individual_Contributions", "type": "integer"}, {"name": "Candidate_State", "type": "string"}, {"name": "Candidate_District", "type": "integer"}, {"name": "Contributions_from_Other_Political_Committees", "type": "integer"}, {"name": "Contributions_from_Party_Committees", "type": "integer"}, {"name": "Coverage_End_Date", "type": "string"}, {"name": "Refunds_to_Individuals", "type": "integer"}, {"name": "Refunds_to_Committees", "type": "integer"}]}}, {"name": "population.json", "type": "table", "description": "United States population statistics by sex and age group across decades between 1850 and 2000. \nThe dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census \nmicrodata\" from as early as 1790.\n\nIPUMS updates and revises datasets over time, which may result in discrepancies between this \ndataset and current IPUMS data. Details on data revisions are available here.\n\nWhen using this dataset, please refer to IPUMS USA terms of use. The organization requests the \nuse of the following citation for this json file:\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated \nPublic Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. \nhttp://doi.org/10.18128/D010.V6.0\n", "sources": [{"title": "IPUMS USA", "path": "https://usa.ipums.org/usa/"}], "path": "population.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:680fd336e777314198450721c31227a11f02411f", "bytes": 27665, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer", "description": "Four-digit year of the survey"}, {"name": "age", "type": "integer", "description": "Age group in 5-year intervals (0=0-4, 5=5-9, 10=10-14, ..., 90=90+)"}, {"name": "sex", "type": "integer", "description": "Sex (1=men, 2=women)"}, {"name": "people", "type": "integer", "description": "Number of individuals (IPUMS PERWT)"}]}}, {"name": "population_engineers_hurricanes.csv", "type": "table", "description": "Per-state data on population, number of engineers, and hurricanes. 
Used in Vega-Lite example,\n[Three Choropleths Representing Disjoint Data from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html)", "sources": [{"title": "Bureau of Labor Statistics", "path": "https://www.bls.gov/oes/tables.htm"}, {"title": "American Community Survey", "path": "https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_07_3YR_S1901&prodType=table"}, {"title": "NOAA National Climatic Data Center", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/records"}], "path": "population_engineers_hurricanes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:3bad66ef911b93c641edc21f2034302348bffaf9", "bytes": 1852, "schema": {"fields": [{"name": "state", "type": "string"}, {"name": "id", "type": "integer"}, {"name": "population", "type": "integer"}, {"name": "engineers", "type": "number"}, {"name": "hurricanes", "type": "integer"}]}}, {"name": "seattle-weather-hourly-normals.csv", "type": "table", "description": "Hourly weather normals with metric units. The 1981-2010 Climate Normals are \nNCDC's three-decade averages of climatological variables, including temperature and \nprecipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf).\nWe only included temperature, wind, and pressure \nand updated the format to be easier to parse.", "sources": [{"title": "NOAA National Climatic Data Center (NCDC)", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/normals"}], "path": "seattle-weather-hourly-normals.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:d55461adc9742bb061f6072b694aaf73e8b529db", "bytes": 311148, "schema": {"fields": [{"name": "date", "type": "datetime"}, {"name": "pressure", "type": "number"}, {"name": "temperature", "type": "number"}, {"name": "wind", "type": "number"}]}}, {"name": "seattle-weather.csv", "type": "table", "description": "Daily weather records with metric units. Transformed using `/scripts/weather.py`. \nThe categorical \"weather\" field is synthesized from multiple fields in the original dataset. 
\nThis data is intended for instructional purposes.", "sources": [{"title": "NOAA National Climatic Data Center", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/records"}], "path": "seattle-weather.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0f38b53bdc1c42c5e5d484f33b9d4d7b229e0e59", "bytes": 48219, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "precipitation", "type": "number"}, {"name": "temp_max", "type": "number"}, {"name": "temp_min", "type": "number"}, {"name": "wind", "type": "number"}, {"name": "weather", "type": "string"}]}}, {"name": "sp500-2000.csv", "type": "table", "description": "S&P 500 index values from 2000 to 2020.", "sources": [{"title": "Yahoo Finance", "path": "https://finance.yahoo.com/quote/%5EDJI/history/"}], "path": "sp500-2000.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:b82f20656d0521801db7c5599a6c990415a8aaff", "bytes": 415968, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "open", "type": "number"}, {"name": "high", "type": "number"}, {"name": "low", "type": "number"}, {"name": "close", "type": "number"}, {"name": "adjclose", "type": "number"}, {"name": "volume", "type": "integer"}]}}, {"name": "sp500.csv", "type": "table", "path": "sp500.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0eb287fb7c207f4ed392821d67a92267180fc8cf", "bytes": 2305, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "price", "type": "number"}]}}, {"name": "stocks.csv", "type": "table", "path": "stocks.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:58e2ce1bed01eeebe29f5b4be32344aaec5532c0", "bytes": 12245, "schema": {"fields": [{"name": "symbol", "type": "string"}, {"name": "date", "type": "string"}, {"name": "price", "type": "number"}]}}, {"name": "udistrict.json", "type": "table", "path": "udistrict.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:65675107d81c19ffab260ac1f235f3e477fe8982", "bytes": 6460, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "key", "type": "string"}, {"name": "lat", "type": "number"}]}}, {"name": "unemployment-across-industries.json", "type": "table", "description": "Industry-level unemployment statistics from the Current Population Survey \n(CPS), published monthly by the U.S. Bureau of Labor Statistics. Includes unemployed persons \nand unemployment rate across 11 private industries, as well as agricultural, government, and \nself-employed workers. Covers January 2000 through February 2010. Industry classification \nfollows format of CPS Table A-31.\n\nThe dataset can be replicated using the BLS API. For more, see the `scripts` folder of this \nrepository.\n\nThe BLS Web site states:\n> \"Users of the public API should cite the date that data were accessed or retrieved using \n> the API. Users must clearly state that \"BLS.gov cannot vouch for the data or analyses \n> derived from these data after the data have been retrieved from BLS.gov.\" The BLS.gov logo \n> may not be used by persons who are not BLS employees or on products (including web pages) \n> that are not BLS-sponsored.\"\n\nSee full BLS [terms of service](https://www.bls.gov/developers/termsOfService.htm).", "sources": [{"title": "U.S. 
Census Bureau Current Population Survey", "path": "https://www.census.gov/programs-surveys/cps.html"}, {"title": "BLS LAUS Data Tools", "path": "https://www.bls.gov/lau/data.htm"}, {"title": "Bureau of Labor Statistics Table A-31", "path": "https://www.bls.gov/web/empsit/cpseea31.htm"}], "path": "unemployment-across-industries.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4d769356c95c40a9807a7d048ab81aa56ae77df0", "bytes": 185641, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "series", "type": "string", "description": "Industry name"}, {"name": "year", "type": "integer", "description": "Year (2000-2010)"}, {"name": "month", "type": "integer", "description": "Month (1-12)"}, {"name": "count", "type": "integer", "description": "Number of unemployed persons (in thousands)"}, {"name": "rate", "type": "number", "description": "Unemployment rate (percentage)"}, {"name": "date", "type": "datetime", "description": "ISO 8601-formatted date string (e.g., \"2000-01-01T08:00:00.000Z\")"}]}}, {"name": "unemployment.tsv", "type": "table", "description": "This dataset contains county-level unemployment rates in the United States, with data generally\nconsistent with levels reported in 2009. The dataset is structured as tab-separated values.\nThe unemployment rate represents the number of unemployed persons as a percentage of the labor\nforce. According to the Bureau of Labor Statistics (BLS) glossary:\n\nUnemployed persons (Current Population Survey) [are] persons aged 16 years and older who had\nno employment during the reference week, were available for work, except for temporary\nillness, and had made specific efforts to find employment sometime during the 4-week period\nending with the reference week. Persons who were waiting to be recalled to a job from which\nthey had been laid off need not have been looking for work to be classified as unemployed.\n\nThis dataset is derived from the [Local Area Unemployment Statistics (LAUS)](https://www.bls.gov/lau/) program, \na federal-state cooperative effort overseen by the Bureau of Labor Statistics (BLS). \nThe LAUS program produces monthly and annual employment, unemployment, and labor force data for census regions and divisions,\nstates, counties, metropolitan areas, and many cities and towns.\n\nFor the most up-to-date LAUS data:\n1. **Monthly and Annual Data Downloads**:\n- Visit the [LAUS Data Tools](https://www.bls.gov/lau/data.htm) page for [monthly](https://www.bls.gov/lau/tables.htm#mcounty) \nand [annual](https://www.bls.gov/lau/tables.htm#cntyaa) county data.\n2. 
**BLS Public Data API**:\n- The BLS provides an API for developers to access various datasets, including LAUS data.\n- To use the API for LAUS data, refer to the [LAUS Series ID Formats](https://www.bls.gov/help/hlpforma.htm#LA) to construct your query.\n- API documentation and examples are available on the BLS Developers page.\n\nWhen using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm).", "sources": [{"title": "BLS Developers API", "path": "https://www.bls.gov/developers/"}, {"title": "BLS Handbook of Methods", "path": "https://www.bls.gov/opub/hom/lau/home.htm"}], "path": "unemployment.tsv", "scheme": "file", "format": "tsv", "mediatype": "text/tsv", "encoding": "utf-8", "hash": "sha1:d1aca19c4821fdc3b4270989661a1787d38588d0", "bytes": 34739, "dialect": {"csv": {"delimiter": "\t"}}, "schema": {"fields": [{"name": "id", "type": "integer", "description": "The combined state and county FIPS code"}, {"name": "rate", "type": "number", "description": "The unemployment rate for the county"}]}}, {"name": "uniform-2d.json", "type": "table", "path": "uniform-2d.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:c6120dd8887a0841a9fcc31e247463dbd3d0a996", "bytes": 34217, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "u", "type": "number"}, {"name": "v", "type": "number"}]}}, {"name": "us-10m.json", "type": "json", "path": "us-10m.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "hash": "sha1:ff7a7e679c46f2d1eb85cc92521b990f1a7a5c7a", "bytes": 642361}, {"name": "us-employment.csv", "type": "table", "description": "In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job \nlosses across the United States. The downturn in employment, and the slow recovery in hiring that \nfollowed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau \nof Labor Statistics.\n\nThis file contains the monthly employment total in a variety of job categories from January 2006 \nthrough December 2015. The numbers are seasonally adjusted and reported in thousands. The data \nwere downloaded on Nov. 11, 2018, and reformatted for use in this library.\n\nTotals are included for the [22 \"supersectors\"](https://download.bls.gov/pub/time.series/ce/ce.supersector)\ntracked by the BLS. The \"nonfarm\" total is the category typically used by \neconomists and journalists as a stand-in for the country's employment total.\n\nA calculated \"nonfarm_change\" column has been appended with the month-to-month change in that \nsupersector's employment. It is useful for illustrating how to make bar charts that report both \nnegative and positive values.\n", "sources": [{"title": "U.S. 
Bureau of Labor Statistics Current Employment Statistics", "path": "https://www.bls.gov/ces/"}], "path": "us-employment.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:8795be57cf1e004f4ecba44cab2b324a074330df", "bytes": 17841, "schema": {"fields": [{"name": "month", "type": "date"}, {"name": "nonfarm", "type": "integer"}, {"name": "private", "type": "integer"}, {"name": "goods_producing", "type": "integer"}, {"name": "service_providing", "type": "integer"}, {"name": "private_service_providing", "type": "integer"}, {"name": "mining_and_logging", "type": "integer"}, {"name": "construction", "type": "integer"}, {"name": "manufacturing", "type": "integer"}, {"name": "durable_goods", "type": "integer"}, {"name": "nondurable_goods", "type": "integer"}, {"name": "trade_transportation_utilties", "type": "integer"}, {"name": "wholesale_trade", "type": "number"}, {"name": "retail_trade", "type": "number"}, {"name": "transportation_and_warehousing", "type": "number"}, {"name": "utilities", "type": "number"}, {"name": "information", "type": "integer"}, {"name": "financial_activities", "type": "integer"}, {"name": "professional_and_business_services", "type": "integer"}, {"name": "education_and_health_services", "type": "integer"}, {"name": "leisure_and_hospitality", "type": "integer"}, {"name": "other_services", "type": "integer"}, {"name": "government", "type": "integer"}, {"name": "nonfarm_change", "type": "integer"}]}}, {"name": "us-state-capitals.json", "type": "table", "path": "us-state-capitals.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:9c3211c5058c899412c30f5992a77c54a1b80066", "bytes": 3869, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "lon", "type": "number"}, {"name": "lat", "type": "number"}, {"name": "state", "type": "string"}, {"name": "city", "type": "string"}]}}, {"name": "volcano.json", "type": "json", "description": "Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. \nThis data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a \ntopographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate.", "sources": [{"title": "R Datasets", "path": "https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.html"}], "path": "volcano.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:841151dbfbc5f6db3e19904557abd7a7aad0efd2", "bytes": 21167}, {"name": "weather.csv", "type": "table", "description": "NOAA data transformed using `/scripts/weather.py`. Categorical \"weather\" field synthesized \nfrom multiple fields in the original dataset. 
This data is intended for instructional purposes.", "sources": [{"title": "NOAA Climate Data Online", "path": "http://www.ncdc.noaa.gov/cdo-web/datatools/findstation"}], "path": "weather.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0e7e853f4c5b67615da261d5d343824a43510f50", "bytes": 121417, "schema": {"fields": [{"name": "location", "type": "string"}, {"name": "date", "type": "date"}, {"name": "precipitation", "type": "number"}, {"name": "temp_max", "type": "number"}, {"name": "temp_min", "type": "number"}, {"name": "wind", "type": "number"}, {"name": "weather", "type": "string"}]}}, {"name": "weekly-weather.json", "type": "json", "description": "Instructional dataset showing actual and predicted temperature data.\n\n> [!IMPORTANT]\n> Named `weather.json` in previous versions (`v1.4.0` - `v2.11.0`).\n", "path": "weekly-weather.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:bd42a3e2403e7ccd6baaa89f93e7f0c164e0c185", "bytes": 1281}, {"name": "wheat.json", "type": "table", "description": "In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair),\na Scottish engineer who is often credited as the founder of statistical graphics, \npublished an elegant chart on the price of wheat. It plots 250 years of prices alongside \nweekly wages and the reigning monarch. He intended to demonstrate that:\n> \"never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time.\"\n", "sources": [{"title": "1822 Playfair Chart", "path": "http://dh101.humanities.ucla.edu/wp-content/uploads/2014/08/Vis_2.jpg"}], "path": "wheat.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:cde46b43fc82f4c3c2a37ddcfe99fd5f4d8d8791", "bytes": 2085, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer"}, {"name": "wheat", "type": "number"}, {"name": "wages", "type": "number"}]}}, {"name": "windvectors.csv", "type": "table", "description": "Simulated wind patterns over northwestern Europe.", "path": "windvectors.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:ed686b0ba613abd59d09fcd946b5030a918b8154", "bytes": 129253, "schema": {"fields": [{"name": "longitude", "type": "number"}, {"name": "latitude", "type": "number"}, {"name": "dir", "type": "integer"}, {"name": "dirCat", "type": "integer"}, {"name": "speed", "type": "number"}]}}, {"name": "world-110m.json", "type": "json", "path": "world-110m.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "hash": "sha1:a1ce852de6f2713c94c0c284039506ca2d4f3dee", "bytes": 119410}, {"name": "zipcodes.csv", "type": "table", "description": "GeoNames.org", "sources": [{"title": "GeoNames", "path": "https://www.geonames.org"}], "path": "zipcodes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:d3df33e12be0d0544c95f1bd47005add4b7010be", "bytes": 2018388, "schema": {"fields": [{"name": "zip_code", "type": "integer"}, {"name": "latitude", "type": "number"}, {"name": "longitude", "type": "number"}, {"name": "city", "type": "string"}, {"name": "state", "type": "string"}, {"name": "county", "type": "string"}]}}]} \ No newline at end of file diff --git a/tools/datasets/models.py b/tools/datasets/models.py index f88a0b842..ee1af8953 100644 --- 
a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -20,7 +20,6 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias - import polars as pl from altair.datasets._typing import Dataset, FlFieldStr @@ -117,10 +116,3 @@ class Package(TypedDict): sources: Sequence[Source] created: str resources: Sequence[Resource] - - -class ParsedPackage(TypedDict): - """Minimal representations to write to disk.""" - - features: pl.DataFrame - schemas: Mapping[Dataset, Mapping[str, FlFieldStr]] diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index 40116cb05..a10e13a64 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -5,6 +5,7 @@ import urllib.request from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple +from urllib.request import Request from tools.datasets import datapackage @@ -22,7 +23,6 @@ from typing_extensions import TypeAlias from tools.datasets import PathMap from tools.datasets.datapackage import DataPackage - from tools.datasets.models import Package BranchOrTag: TypeAlias = 'Literal["main"] | LiteralString' @@ -54,30 +54,25 @@ def __init__( GH=f"https://cdn.{jsdelivr}.net/gh/vega/{package}@", ) - def dataset_base_url(self, version: BranchOrTag, /) -> LiteralString: - """ - Common url prefix for all datasets derived from ``version``. + def _prefix(self, version: BranchOrTag, /) -> LiteralString: + return f"{self.url.GH if is_branch(version) else self.url.CDN}{version}/" - Notes - ----- - - Encodes the endpoint at this stage - - Use github if its the only option (since its slower otherwise) - - npm only has releases/tags (not branches) - """ - return f"{self.url.GH if is_branch(version) else self.url.CDN}{version}/data/" + def dataset_base_url(self, version: BranchOrTag, /) -> LiteralString: + """Common url prefix for all datasets derived from ``version``.""" + return f"{self._prefix(version)}data/" @property def url(self) -> NpmUrl: return self._url - def file_gh( + def file( self, branch_or_tag: BranchOrTag, path: str, /, ) -> Any: """ - Request a file from the `jsdelivr GitHub`_ endpoint. + Request a file from `jsdelivr` `npm`_ or `GitHub`_ endpoints. Parameters ---------- @@ -86,7 +81,9 @@ def file_gh( path Relative filepath from the root of the repo. - .. _jsdelivr GitHub: + .. _npm: + https://www.jsdelivr.com/documentation#id-npm + .. _GitHub: https://www.jsdelivr.com/documentation#id-github .. 
_branches: https://github.com/vega/vega-datasets/branches @@ -100,20 +97,15 @@ def file_gh( read_fn = json.load else: raise NotImplementedError(path, suffix) - req = urllib.request.Request( - f"{self.url.GH}{branch_or_tag}/{path}", headers=headers - ) + req = Request(f"{self._prefix(branch_or_tag)}{path}", headers=headers) with self._opener.open(req) as response: return read_fn(response) - def datapackage(self, *, tag: LiteralString, frozen: bool = False) -> DataPackage: - pkg: Package = ( - json.loads(self.paths["datapackage"].read_text("utf-8")) - if frozen - else self.file_gh(tag, "datapackage.json") - ) + def datapackage(self, *, tag: LiteralString) -> DataPackage: return datapackage.DataPackage( - pkg, self.dataset_base_url(tag), self.paths["metadata"] + self.file(tag, "datapackage.json"), + self.dataset_base_url(tag), + self.paths["metadata"], ) diff --git a/tools/generate_schema_wrapper.py b/tools/generate_schema_wrapper.py index 4ccb3f670..92c6f101d 100644 --- a/tools/generate_schema_wrapper.py +++ b/tools/generate_schema_wrapper.py @@ -1405,7 +1405,7 @@ def main() -> None: copy_schemapi_util() vegalite_main(args.skip_download) write_expr_module(VERSIONS.vlc_vega, output=EXPR_FILE, header=HEADER_COMMENT) - datasets.app.refresh(VERSIONS["vega-datasets"], include_typing=True, frozen=True) + datasets.app.refresh(VERSIONS["vega-datasets"], include_typing=True) # The modules below are imported after the generation of the new schema files # as these modules import Altair. This allows them to use the new changes From a776e2fd5a74dd21917b884e2b418ade18773895 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 10 Feb 2025 13:11:54 +0000 Subject: [PATCH 200/201] refactor: replace `SchemaCache.schema_pyarrow` -> `nw.Schema.to_arrow` Related - https://github.com/narwhals-dev/narwhals/pull/1924 - https://github.com/vega/altair/pull/3631#discussion_r1937953187 --- altair/datasets/_cache.py | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index eb22cc36e..5459f0b16 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -264,19 +264,22 @@ def is_active(self) -> bool: nw.Implementation.PYARROW, } + def schema(self, name: _Dataset, /) -> nw.Schema: + it = ((col, _FIELD_TO_DTYPE[tp_str]()) for col, tp_str in self[name].items()) + return nw.Schema(it) + def schema_kwds(self, meta: Metadata, /) -> dict[str, Any]: name: Any = meta["dataset_name"] - impl = self._implementation - if (impl.is_pandas_like() or impl.is_pyarrow()) and (self[name]): + if self.is_active() and (self[name]): suffix = meta["suffix"] - if impl.is_pandas_like(): + if self._implementation.is_pandas_like(): if cols := self.by_dtype(name, nw.Date, nw.Datetime): if suffix == ".json": return {"convert_dates": cols} elif suffix in {".csv", ".tsv"}: return {"parse_dates": cols} else: - schema = self.schema_pyarrow(name) + schema = self.schema(name).to_arrow() if suffix in {".csv", ".tsv"}: from pyarrow.csv import ConvertOptions @@ -286,23 +289,6 @@ def schema_kwds(self, meta: Metadata, /) -> dict[str, Any]: return {} - def schema(self, name: _Dataset, /) -> Mapping[str, DType]: - return { - column: _FIELD_TO_DTYPE[tp_str]() for column, tp_str in self[name].items() - } - - # TODO: Open an issue in ``narwhals`` to try and get a public api for type conversion - def schema_pyarrow(self, name: _Dataset, /): - schema = self.schema(name) - if schema: - from 
narwhals._arrow.utils import narwhals_to_native_dtype - from narwhals.utils import Version - - m = {k: narwhals_to_native_dtype(v, Version.V1) for k, v in schema.items()} - else: - m = {} - return nw.dependencies.get_pyarrow().schema(m) - class _SupportsScanMetadata(Protocol): _opener: ClassVar[OpenerDirector] From ddda22c50f7265728dcec26afec02d6d0dbda189 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 10 Feb 2025 14:10:51 +0000 Subject: [PATCH 201/201] feat(typing): Properly annotate `dataset_name`, `suffix` Makes more sense following (755ab4f560af13f9268e905cf70783c34b30b1d7) --- altair/datasets/_typing.py | 9 +++++++-- tools/datasets/__init__.py | 1 + tools/datasets/datapackage.py | 24 ++++++++++++++++++++---- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index 3357ddf3b..a60f38687 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -11,6 +11,11 @@ else: from typing_extensions import TypedDict +if sys.version_info >= (3, 11): + from typing import LiteralString +else: + from typing_extensions import LiteralString + if sys.version_info >= (3, 10): from typing import TypeAlias else: @@ -181,8 +186,8 @@ class Metadata(TypedDict, total=False): ``` """ - dataset_name: str - suffix: str + dataset_name: Dataset | LiteralString + suffix: Extension file_name: str bytes: int is_image: bool diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index a7c1d06c4..a41392d9d 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -184,6 +184,7 @@ def generate_typing(self, dpkg: datapackage.DataPackage) -> None: "import sys", "from typing import Literal, TYPE_CHECKING", utils.import_typing_extensions((3, 14), "TypedDict"), + utils.import_typing_extensions((3, 11), "LiteralString"), utils.import_typing_extensions((3, 10), "TypeAlias"), "\n", f"__all__ = {[NAME, EXT, dpkg._NAME_TYPED_DICT]}\n", diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index ec707c0da..9ea6a8c8d 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -31,10 +31,13 @@ class Column: - def __init__(self, name: str, expr: pl.Expr, /, doc: str = "_description_") -> None: + def __init__( + self, name: str, expr: pl.Expr, /, doc: str = "_description_", tp_str: str = "" + ) -> None: self._name: str = name self._expr: pl.Expr = expr self._doc: str = doc + self._tp_str: str = tp_str @property def expr(self) -> pl.Expr: @@ -161,7 +164,10 @@ def _metadata_examples(self) -> str: @property def _metadata_td_args(self) -> str: schema = self.core.collect_schema().to_python() - return f"\n{INDENT}".join(f"{p}: {tp.__name__}" for p, tp in schema.items()) + return f"\n{INDENT}".join( + f"{column._name}: {column._tp_str or tp.__name__}" + for column, tp in zip(self.columns, schema.values()) + ) @property def _url(self) -> Column: @@ -237,8 +243,18 @@ def note(s: str, /) -> str: fmt = col("format") DataPackage.with_columns( - Column("dataset_name", path_stem("path"), "Name of the dataset/`Path.stem`_."), - Column("suffix", path_suffix("path"), "File extension/`Path.suffix`_."), + Column( + "dataset_name", + path_stem("path"), + "Name of the dataset/`Path.stem`_.", + tp_str="Dataset | LiteralString", + ), + Column( + "suffix", + path_suffix("path"), + "File extension/`Path.suffix`_.", + tp_str="Extension", + ), Column("file_name", col("path"), "Equivalent to `Path.name`_."), Column("bytes", col("bytes"), "File size in 
*bytes*."), Column("is_image", fmt == "png", "Only accessible via url."),