From 7933771fdfd8590c892935b23e2bf3816100db36 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 2 Oct 2024 22:13:46 +0100 Subject: [PATCH 001/201] wip --- tools/vendor_datasets.py | 660 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 660 insertions(+) create mode 100644 tools/vendor_datasets.py diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py new file mode 100644 index 000000000..003e55062 --- /dev/null +++ b/tools/vendor_datasets.py @@ -0,0 +1,660 @@ +from __future__ import annotations + +import json +import pkgutil +import sys +import textwrap +from functools import partial +from io import BytesIO +from pathlib import Path +from typing import Any, Iterable, Literal, cast +from urllib.request import urlopen + +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +import pandas as pd +import polars as pl + +# This is the tag in http://github.com/vega/vega-datasets from +# which the datasets in this repository are sourced. +SOURCE_TAG = "v1.29.0" # 5 years ago +CURRENT_TAG = "v2.9.0" +USE_TAG = CURRENT_TAG + +BASE_URL = f"https://cdn.jsdelivr.net/npm/vega-datasets@{USE_TAG}/data/" + +ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv"] + + +def _load_dataset_info() -> dict[str, dict[str, Any]]: + """ + Loads dataset info from three package files. + + vega_datasets/datasets.json + vega_datasets/dataset_info.json + vega_datasets/local_datasets.json + + It returns a dictionary with dataset information. + """ + + def load_json(path: str) -> dict[str, Any]: + raw = pkgutil.get_data("vega_datasets", path) + if raw is None: + msg = f"Cannot locate package path vega_datasets:{path}" + raise ValueError(msg) + return json.loads(raw.decode()) + + info = load_json("datasets.json") + descriptions = load_json("dataset_info.json") + local_datasets = load_json("local_datasets.json") + + for name in info: + info[name]["is_local"] = name in local_datasets + for name in descriptions: + info[name].update(descriptions[name]) + + return info + + +class Dataset: + """Class to load a particular dataset by name.""" + + _instance_doc = """Loader for the {name} dataset. + + {data_description} + + {bundle_info} + Dataset source: {url} + + Usage + ----- + + >>> from vega_datasets import data + >>> {methodname} = data.{methodname}() + >>> type({methodname}) + {return_type} + + Equivalently, you can use + + >>> {methodname} = data('{name}') + + To get the raw dataset rather than the dataframe, use + + >>> data_bytes = data.{methodname}.raw() + >>> type(data_bytes) + bytes + + To find the dataset url, use + + >>> data.{methodname}.url + '{url}' + {additional_docs} + Attributes + ---------- + filename : string + The filename in which the dataset is stored + url : string + The full URL of the dataset at http://vega.github.io + format : string + The format of the dataset: usually one of {{'csv', 'tsv', 'json'}} + pkg_filename : string + The path to the local dataset within the vega_datasets package + is_local : bool + True if the dataset is available locally in the package + filepath : string + If is_local is True, the local file path to the dataset. 
+ + {reference_info} + """ + _additional_docs = "" + _reference_info = """ + For information on this dataset, see https://github.com/vega/vega-datasets/ + """ + base_url = "https://cdn.jsdelivr.net/npm/vega-datasets@" + SOURCE_TAG + "/data/" + _dataset_info = _load_dataset_info() + _pd_read_kwds: dict[str, Any] = {} + _return_type = pd.DataFrame + name: str + + @classmethod + def init(cls, name: str) -> Dataset: + """Return an instance of this class or an appropriate subclass.""" + clsdict = { + subcls.name: subcls + for subcls in cls.__subclasses__() + if hasattr(subcls, "name") + } + return clsdict.get(name, cls)(name) + + def __init__(self, name: str): + info = self._infodict(name) + self.name = name + self.methodname = name.replace("-", "_") + self.filename = info["filename"] + self.url = self.base_url + info["filename"] + self.format = info["format"] + self.pkg_filename = "_data/" + self.filename + self.is_local = info["is_local"] + self.description = info.get("description", None) + self.references = info.get("references", None) + self.__doc__ = self._make_docstring() + + @classmethod + def list_datasets(cls) -> list[str]: + """Return a list of names of available datasets.""" + return sorted(cls._dataset_info.keys()) + + @classmethod + def list_local_datasets(cls) -> list[str]: + return sorted( + name for name, info in cls._dataset_info.items() if info["is_local"] + ) + + @classmethod + def _infodict(cls, name: str) -> dict[str, str]: + """Load the info dictionary for the given name.""" + info = cls._dataset_info.get(name, None) + if info is None: + msg = ( + f"No such dataset {name} exists, " + "use list_datasets() to get a list " + "of available datasets." + ) + raise ValueError(msg) + return info + + def raw(self, use_local: bool = True) -> bytes: + """Load the raw dataset from remote URL or local file.""" + if use_local and self.is_local: + out = pkgutil.get_data("vega_datasets", self.pkg_filename) + if out is not None: + return out + msg = f"Cannot locate package path vega_datasets:{self.pkg_filename}" + raise ValueError(msg) + else: + return urlopen(self.url).read() + + def __call__(self, use_local: bool = True, **kwargs) -> pd.DataFrame: + """Load and parse the dataset from remote URL or local file.""" + datasource = BytesIO(self.raw(use_local=use_local)) + + kwds = self._pd_read_kwds.copy() + kwds.update(kwargs) + + if self.format == "json": + return pd.read_json(datasource, **kwds) + elif self.format == "csv": + return pd.read_csv(datasource, **kwds) + elif self.format == "tsv": + kwds.setdefault("sep", "\t") + return pd.read_csv(datasource, **kwds) + else: + msg = ( + f"Unrecognized file format: {self.format}. " + "Valid options are ['json', 'csv', 'tsv']." + ) + raise ValueError(msg) + + @property + def filepath(self) -> str: + if not self.is_local: + msg = "filepath is only valid for local datasets" + raise ValueError(msg) + else: + return str((Path(__file__).parent / "_data" / self.filename).resolve()) + + def _make_docstring(self) -> str: + info = self._infodict(self.name) + + # construct, indent, and line-wrap dataset description + description = info.get("description", "") + if not description: + description = ( + "This dataset is described at " "https://github.com/vega/vega-datasets/" + ) + wrapper = textwrap.TextWrapper( + width=70, initial_indent="", subsequent_indent=4 * " " + ) + description = "\n".join(wrapper.wrap(description)) + + # construct, indent, and join references + reflist: Iterable[str] = info.get("references", []) + reflist = (f".. 
[{i + 1}] " + ref for i, ref in enumerate(reflist)) + wrapper = textwrap.TextWrapper( + width=70, initial_indent=4 * " ", subsequent_indent=7 * " " + ) + reflist = ("\n".join(wrapper.wrap(ref)) for ref in reflist) + references: str = "\n\n".join(reflist) + if references.strip(): + references = "References\n ----------\n" + references + + # add information about bundling of data + if self.is_local: + bundle_info = ( + "This dataset is bundled with vega_datasets; " + "it can be loaded without web access." + ) + else: + bundle_info = ( + "This dataset is not bundled with vega_datasets; " + "it requires web access to load." + ) + + return self._instance_doc.format( + additional_docs=self._additional_docs, + data_description=description, + reference_info=references, + bundle_info=bundle_info, + return_type=self._return_type, + **self.__dict__, + ) + + +def getattr_to_df(name: str, /) -> pl.DataFrame: + """Subset of what `Dataset` does.""" + js_name = name.replace("_", "-") + file_name = DATASETS_JSON[js_name]["filename"] + suffix = Path(file_name).suffix + if suffix in {".csv", ".json", ".tsv"}: + extension = cast(ExtSupported, suffix) + else: + raise NotImplementedError(suffix, file_name) + + url = f"{BASE_URL}{file_name}" + with urlopen(url) as f: + content = ext_fn(extension)(f) + return content + + +class DSet: + def __init__(self, name: str, /) -> None: + self.name: str = name + js_name = name.replace("_", "-") + file_name = DATASETS_JSON[js_name]["filename"] + suffix = Path(file_name).suffix + self.extension: ExtSupported + if suffix in {".csv", ".json", ".tsv"}: + self.extension = cast(ExtSupported, suffix) + else: + raise NotImplementedError(suffix, file_name) + + self.url: str = f"{BASE_URL}{file_name}" + + def __call__(self, **kwds: Any) -> pl.DataFrame: + with urlopen(self.url) as f: + content = ext_fn(self.extension, **kwds)(f) + return content + + def __repr__(self) -> str: + return ( + f"{type(self).__name__}(\n " + f"name={self.name!r},\n " + f"url={self.url!r}\n" + ")" + ) + + +def ext_fn(ext: ExtSupported, /): + """Very basic mapping to `polars` eager functions.""" + if ext == ".csv": + return pl.read_csv + elif ext == ".json": + return pl.read_json + elif ext == ".tsv": + return partial(pl.read_csv, separator="\t") + else: + raise + + +DATASET_NAMES_USED = [ + "airports", + "anscombe", + "barley", + "cars", + "co2_concentration", + "countries", + "disasters", + "driving", + "earthquakes", + "flights_2k", + "flights_5k", + "flights_airport", + "gapminder_health_income", + "github", + "income", + "iowa_electricity", + "iris", + "jobs", + "londonBoroughs", + "londonCentroids", + "londonTubeLines", + "monarchs", + "movies", + "normal_2d", + "ohlc", + "population", + "population_engineers_hurricanes", + "seattle_weather", + "sp500", + "stocks", + "unemployment", + "unemployment_across_industries", + "us_10m", + "us_employment", + "us_state_capitals", + "us_unemployment", + "wheat", + "windvectors", + "world_110m", + "zipcodes", +] + +DATASETS_JSON = { + # "7zip": {"filename": "7zip.png", "format": "png"}, + "airports": {"filename": "airports.csv", "format": "csv"}, + "annual-precip": {"filename": "annual-precip.json", "format": "json"}, + "anscombe": {"filename": "anscombe.json", "format": "json"}, + "barley": {"filename": "barley.json", "format": "json"}, + "birdstrikes": {"filename": "birdstrikes.json", "format": "json"}, + "budget": {"filename": "budget.json", "format": "json"}, + "budgets": {"filename": "budgets.json", "format": "json"}, + "burtin": {"filename": "burtin.json", 
"format": "json"}, + "cars": {"filename": "cars.json", "format": "json"}, + "climate": {"filename": "climate.json", "format": "json"}, + "co2-concentration": {"filename": "co2-concentration.csv", "format": "csv"}, + "countries": {"filename": "countries.json", "format": "json"}, + "crimea": {"filename": "crimea.json", "format": "json"}, + "disasters": {"filename": "disasters.csv", "format": "csv"}, + "driving": {"filename": "driving.json", "format": "json"}, + "earthquakes": {"filename": "earthquakes.json", "format": "json"}, + # "ffox": {"filename": "ffox.png", "format": "png"}, + "flare": {"filename": "flare.json", "format": "json"}, + "flare-dependencies": {"filename": "flare-dependencies.json", "format": "json"}, + "flights-10k": {"filename": "flights-10k.json", "format": "json"}, + "flights-200k": {"filename": "flights-200k.json", "format": "json"}, + "flights-20k": {"filename": "flights-20k.json", "format": "json"}, + "flights-2k": {"filename": "flights-2k.json", "format": "json"}, + "flights-3m": {"filename": "flights-3m.csv", "format": "csv"}, + "flights-5k": {"filename": "flights-5k.json", "format": "json"}, + "flights-airport": {"filename": "flights-airport.csv", "format": "csv"}, + "gapminder": {"filename": "gapminder.json", "format": "json"}, + "gapminder-health-income": { + "filename": "gapminder-health-income.csv", + "format": "csv", + }, + # "gimp": {"filename": "gimp.png", "format": "png"}, + "github": {"filename": "github.csv", "format": "csv"}, + "graticule": {"filename": "graticule.json", "format": "json"}, + "income": {"filename": "income.json", "format": "json"}, + "iowa-electricity": {"filename": "iowa-electricity.csv", "format": "csv"}, + "iris": {"filename": "iris.json", "format": "json"}, + "jobs": {"filename": "jobs.json", "format": "json"}, + "la-riots": {"filename": "la-riots.csv", "format": "csv"}, + "londonBoroughs": {"filename": "londonBoroughs.json", "format": "json"}, + "londonCentroids": {"filename": "londonCentroids.json", "format": "json"}, + "londonTubeLines": {"filename": "londonTubeLines.json", "format": "json"}, + "lookup_groups": {"filename": "lookup_groups.csv", "format": "csv"}, + "lookup_people": {"filename": "lookup_people.csv", "format": "csv"}, + "miserables": {"filename": "miserables.json", "format": "json"}, + "monarchs": {"filename": "monarchs.json", "format": "json"}, + "movies": {"filename": "movies.json", "format": "json"}, + "normal-2d": {"filename": "normal-2d.json", "format": "json"}, + "obesity": {"filename": "obesity.json", "format": "json"}, + "ohlc": {"filename": "ohlc.json", "format": "json"}, + "points": {"filename": "points.json", "format": "json"}, + "population": {"filename": "population.json", "format": "json"}, + "population_engineers_hurricanes": { + "filename": "population_engineers_hurricanes.csv", + "format": "csv", + }, + "seattle-temps": {"filename": "seattle-temps.csv", "format": "csv"}, + "seattle-weather": {"filename": "seattle-weather.csv", "format": "csv"}, + "sf-temps": {"filename": "sf-temps.csv", "format": "csv"}, + "sp500": {"filename": "sp500.csv", "format": "csv"}, + "stocks": {"filename": "stocks.csv", "format": "csv"}, + "udistrict": {"filename": "udistrict.json", "format": "json"}, + "unemployment": {"filename": "unemployment.tsv", "format": "tsv"}, + "unemployment-across-industries": { + "filename": "unemployment-across-industries.json", + "format": "json", + }, + "uniform-2d": {"filename": "uniform-2d.json", "format": "json"}, + "us-10m": {"filename": "us-10m.json", "format": "json"}, + "us-employment": 
{"filename": "us-employment.csv", "format": "csv"}, + "us-state-capitals": {"filename": "us-state-capitals.json", "format": "json"}, + "volcano": {"filename": "volcano.json", "format": "json"}, + "weather": {"filename": "weather.json", "format": "json"}, + "weball26": {"filename": "weball26.json", "format": "json"}, + "wheat": {"filename": "wheat.json", "format": "json"}, + "windvectors": {"filename": "windvectors.csv", "format": "csv"}, + "world-110m": {"filename": "world-110m.json", "format": "json"}, + "zipcodes": {"filename": "zipcodes.csv", "format": "csv"}, +} + + +class Stocks(Dataset): + name = "stocks" + _additional_docs = """ + For convenience, the stocks dataset supports pivoted output using the + optional `pivoted` keyword. If pivoted is set to True, each company's + price history will be returned in a separate column: + + >>> df = data.stocks() # not pivoted + >>> df.head(3) + symbol date price + 0 MSFT 2000-01-01 39.81 + 1 MSFT 2000-02-01 36.35 + 2 MSFT 2000-03-01 43.22 + + >>> df_pivoted = data.stocks(pivoted=True) + >>> df_pivoted.head() + symbol AAPL AMZN GOOG IBM MSFT + date + 2000-01-01 25.94 64.56 NaN 100.52 39.81 + 2000-02-01 28.66 68.87 NaN 92.11 36.35 + 2000-03-01 33.95 67.00 NaN 106.11 43.22 + """ + _pd_read_kwds = {"parse_dates": ["date"]} + + def __call__(self, pivoted=False, use_local=True, **kwargs): + """ + Load and parse the dataset from remote URL or local file. + + Parameters + ---------- + pivoted : boolean, default False + If True, then pivot data so that each stock is in its own column. + use_local : boolean + If True (default), then attempt to load the dataset locally. If + False or if the dataset is not available locally, then load the + data from an external URL. + **kwargs : + additional keyword arguments are passed to data parser (usually + pd.read_csv or pd.read_json, depending on the format of the data + source) + + Returns + ------- + data : DataFrame + parsed data + """ + __doc__ = super().__call__.__doc__ # noqa:F841 + data = super().__call__(use_local=use_local, **kwargs) + if pivoted: + data = data.pivot(index="date", columns="symbol", values="price") + return data + + +class Cars(Dataset): + name = "cars" + _pd_read_kwds = {"convert_dates": ["Year"]} + + +class Climate(Dataset): + name = "climate" + _pd_read_kwds = {"convert_dates": ["DATE"]} + + +class Github(Dataset): + name = "github" + _pd_read_kwds = {"parse_dates": ["time"]} + + +class IowaElectricity(Dataset): + name = "iowa-electricity" + _pd_read_kwds = {"parse_dates": ["year"]} + + +class LARiots(Dataset): + name = "la-riots" + _pd_read_kwds = {"parse_dates": ["death_date"]} + + +class Miserables(Dataset): + name = "miserables" + _return_type = tuple + _additional_docs = """ + The miserables data contains two dataframes, ``nodes`` and ``links``, + both of which are returned from this function. 
+ """ + + def __call__(self, use_local=True, **kwargs): + __doc__ = super().__call__.__doc__ # noqa:F841 + dct = json.loads(self.raw(use_local=use_local).decode(), **kwargs) + nodes = pd.DataFrame.from_records(dct["nodes"], index="index") + links = pd.DataFrame.from_records(dct["links"]) + return nodes, links + + +class SeattleTemps(Dataset): + name = "seattle-temps" + _pd_read_kwds = {"parse_dates": ["date"]} + + +class SeattleWeather(Dataset): + name = "seattle-weather" + _pd_read_kwds = {"parse_dates": ["date"]} + + +class SFTemps(Dataset): + name = "sf-temps" + _pd_read_kwds = {"parse_dates": ["date"]} + + +class Sp500(Dataset): + name = "sp500" + _pd_read_kwds = {"parse_dates": ["date"]} + + +class UnemploymentAcrossIndustries(Dataset): + name = "unemployment-across-industries" + _pd_read_kwds = {"convert_dates": ["date"]} + + +class US_10M(Dataset): + name = "us-10m" + _return_type = dict + _additional_docs = """ + The us-10m dataset is a TopoJSON file, with a structure that is not + suitable for storage in a dataframe. For this reason, the loader returns + a simple Python dictionary. + """ + + def __call__(self, use_local=True, **kwargs): + __doc__ = super().__call__.__doc__ # noqa:F841 + return json.loads(self.raw(use_local=use_local).decode(), **kwargs) + + +class World_110M(Dataset): + name = "world-110m" + _return_type = dict + _additional_docs = """ + The world-100m dataset is a TopoJSON file, with a structure that is not + suitable for storage in a dataframe. For this reason, the loader returns + a simple Python dictionary. + """ + + def __call__(self, use_local=True, **kwargs): + __doc__ = super().__call__.__doc__ # noqa:F841 + return json.loads(self.raw(use_local=use_local).decode(), **kwargs) + + +class ZIPCodes(Dataset): + name = "zipcodes" + _pd_read_kwds = {"dtype": {"zip_code": "object"}} + + +class DataLoader: + """ + Load a dataset from a local file or remote URL. + + There are two ways to call this; for example to load the iris dataset, you + can call this object and pass the dataset name by string: + + >>> from vega_datasets import data + >>> df = data("iris") + + or you can call the associated named method: + + >>> df = data.iris() + + Optionally, additional parameters can be passed to either of these + + Optional parameters + ------------------- + return_raw : boolean + If True, then return the raw string or bytes. + If False (default), then return a pandas dataframe. + use_local : boolean + If True (default), then attempt to load the dataset locally. If + False or if the dataset is not available locally, then load the + data from an external URL. + **kwargs : + additional keyword arguments are passed to the pandas parsing function, + either ``read_csv()`` or ``read_json()`` depending on the data format. 
+ """ + + _datasets = {name.replace("-", "_"): name for name in Dataset.list_datasets()} + + def list_datasets(self): + return Dataset.list_datasets() + + def __call__(self, name, return_raw=False, use_local=True, **kwargs): + loader = getattr(self, name.replace("-", "_")) + if return_raw: + return loader.raw(use_local=use_local, **kwargs) + else: + return loader(use_local=use_local, **kwargs) + + def __getattr__(self, dataset_name): + if dataset_name in self._datasets: + return Dataset.init(self._datasets[dataset_name]) + else: + msg = f"No dataset named '{dataset_name}'" + raise AttributeError(msg) + + def __dir__(self): + return list(self._datasets.keys()) + + +class LocalDataLoader(DataLoader): + _datasets = {name.replace("-", "_"): name for name in Dataset.list_local_datasets()} + + def list_datasets(self): + return Dataset.list_local_datasets() + + def __getattr__(self, dataset_name): + if dataset_name in self._datasets: + return Dataset.init(self._datasets[dataset_name]) + elif dataset_name in DataLoader._datasets: + msg = ( + f"'{dataset_name}' dataset is not available locally. To " + f"download it, use ``vega_datasets.data.{dataset_name}()" + ) + raise ValueError(msg) + else: + msg = f"No dataset named '{dataset_name}'" + raise AttributeError(msg) From b30081e9de975bed60247c65b477012d68b4e132 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 4 Oct 2024 18:33:06 +0100 Subject: [PATCH 002/201] feat(DRAFT): Minimal reimplementation --- tools/vendor_datasets.py | 478 ++------------------------------------- 1 file changed, 17 insertions(+), 461 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 003e55062..4a435c253 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -1,13 +1,9 @@ from __future__ import annotations -import json -import pkgutil import sys -import textwrap -from functools import partial -from io import BytesIO +from functools import cached_property, partial from pathlib import Path -from typing import Any, Iterable, Literal, cast +from typing import Any, Literal, cast from urllib.request import urlopen if sys.version_info >= (3, 10): @@ -15,7 +11,6 @@ else: from typing_extensions import TypeAlias -import pandas as pd import polars as pl # This is the tag in http://github.com/vega/vega-datasets from @@ -29,247 +24,7 @@ ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv"] -def _load_dataset_info() -> dict[str, dict[str, Any]]: - """ - Loads dataset info from three package files. - - vega_datasets/datasets.json - vega_datasets/dataset_info.json - vega_datasets/local_datasets.json - - It returns a dictionary with dataset information. - """ - - def load_json(path: str) -> dict[str, Any]: - raw = pkgutil.get_data("vega_datasets", path) - if raw is None: - msg = f"Cannot locate package path vega_datasets:{path}" - raise ValueError(msg) - return json.loads(raw.decode()) - - info = load_json("datasets.json") - descriptions = load_json("dataset_info.json") - local_datasets = load_json("local_datasets.json") - - for name in info: - info[name]["is_local"] = name in local_datasets - for name in descriptions: - info[name].update(descriptions[name]) - - return info - - class Dataset: - """Class to load a particular dataset by name.""" - - _instance_doc = """Loader for the {name} dataset. 
- - {data_description} - - {bundle_info} - Dataset source: {url} - - Usage - ----- - - >>> from vega_datasets import data - >>> {methodname} = data.{methodname}() - >>> type({methodname}) - {return_type} - - Equivalently, you can use - - >>> {methodname} = data('{name}') - - To get the raw dataset rather than the dataframe, use - - >>> data_bytes = data.{methodname}.raw() - >>> type(data_bytes) - bytes - - To find the dataset url, use - - >>> data.{methodname}.url - '{url}' - {additional_docs} - Attributes - ---------- - filename : string - The filename in which the dataset is stored - url : string - The full URL of the dataset at http://vega.github.io - format : string - The format of the dataset: usually one of {{'csv', 'tsv', 'json'}} - pkg_filename : string - The path to the local dataset within the vega_datasets package - is_local : bool - True if the dataset is available locally in the package - filepath : string - If is_local is True, the local file path to the dataset. - - {reference_info} - """ - _additional_docs = "" - _reference_info = """ - For information on this dataset, see https://github.com/vega/vega-datasets/ - """ - base_url = "https://cdn.jsdelivr.net/npm/vega-datasets@" + SOURCE_TAG + "/data/" - _dataset_info = _load_dataset_info() - _pd_read_kwds: dict[str, Any] = {} - _return_type = pd.DataFrame - name: str - - @classmethod - def init(cls, name: str) -> Dataset: - """Return an instance of this class or an appropriate subclass.""" - clsdict = { - subcls.name: subcls - for subcls in cls.__subclasses__() - if hasattr(subcls, "name") - } - return clsdict.get(name, cls)(name) - - def __init__(self, name: str): - info = self._infodict(name) - self.name = name - self.methodname = name.replace("-", "_") - self.filename = info["filename"] - self.url = self.base_url + info["filename"] - self.format = info["format"] - self.pkg_filename = "_data/" + self.filename - self.is_local = info["is_local"] - self.description = info.get("description", None) - self.references = info.get("references", None) - self.__doc__ = self._make_docstring() - - @classmethod - def list_datasets(cls) -> list[str]: - """Return a list of names of available datasets.""" - return sorted(cls._dataset_info.keys()) - - @classmethod - def list_local_datasets(cls) -> list[str]: - return sorted( - name for name, info in cls._dataset_info.items() if info["is_local"] - ) - - @classmethod - def _infodict(cls, name: str) -> dict[str, str]: - """Load the info dictionary for the given name.""" - info = cls._dataset_info.get(name, None) - if info is None: - msg = ( - f"No such dataset {name} exists, " - "use list_datasets() to get a list " - "of available datasets." 
- ) - raise ValueError(msg) - return info - - def raw(self, use_local: bool = True) -> bytes: - """Load the raw dataset from remote URL or local file.""" - if use_local and self.is_local: - out = pkgutil.get_data("vega_datasets", self.pkg_filename) - if out is not None: - return out - msg = f"Cannot locate package path vega_datasets:{self.pkg_filename}" - raise ValueError(msg) - else: - return urlopen(self.url).read() - - def __call__(self, use_local: bool = True, **kwargs) -> pd.DataFrame: - """Load and parse the dataset from remote URL or local file.""" - datasource = BytesIO(self.raw(use_local=use_local)) - - kwds = self._pd_read_kwds.copy() - kwds.update(kwargs) - - if self.format == "json": - return pd.read_json(datasource, **kwds) - elif self.format == "csv": - return pd.read_csv(datasource, **kwds) - elif self.format == "tsv": - kwds.setdefault("sep", "\t") - return pd.read_csv(datasource, **kwds) - else: - msg = ( - f"Unrecognized file format: {self.format}. " - "Valid options are ['json', 'csv', 'tsv']." - ) - raise ValueError(msg) - - @property - def filepath(self) -> str: - if not self.is_local: - msg = "filepath is only valid for local datasets" - raise ValueError(msg) - else: - return str((Path(__file__).parent / "_data" / self.filename).resolve()) - - def _make_docstring(self) -> str: - info = self._infodict(self.name) - - # construct, indent, and line-wrap dataset description - description = info.get("description", "") - if not description: - description = ( - "This dataset is described at " "https://github.com/vega/vega-datasets/" - ) - wrapper = textwrap.TextWrapper( - width=70, initial_indent="", subsequent_indent=4 * " " - ) - description = "\n".join(wrapper.wrap(description)) - - # construct, indent, and join references - reflist: Iterable[str] = info.get("references", []) - reflist = (f".. [{i + 1}] " + ref for i, ref in enumerate(reflist)) - wrapper = textwrap.TextWrapper( - width=70, initial_indent=4 * " ", subsequent_indent=7 * " " - ) - reflist = ("\n".join(wrapper.wrap(ref)) for ref in reflist) - references: str = "\n\n".join(reflist) - if references.strip(): - references = "References\n ----------\n" + references - - # add information about bundling of data - if self.is_local: - bundle_info = ( - "This dataset is bundled with vega_datasets; " - "it can be loaded without web access." - ) - else: - bundle_info = ( - "This dataset is not bundled with vega_datasets; " - "it requires web access to load." - ) - - return self._instance_doc.format( - additional_docs=self._additional_docs, - data_description=description, - reference_info=references, - bundle_info=bundle_info, - return_type=self._return_type, - **self.__dict__, - ) - - -def getattr_to_df(name: str, /) -> pl.DataFrame: - """Subset of what `Dataset` does.""" - js_name = name.replace("_", "-") - file_name = DATASETS_JSON[js_name]["filename"] - suffix = Path(file_name).suffix - if suffix in {".csv", ".json", ".tsv"}: - extension = cast(ExtSupported, suffix) - else: - raise NotImplementedError(suffix, file_name) - - url = f"{BASE_URL}{file_name}" - with urlopen(url) as f: - content = ext_fn(extension)(f) - return content - - -class DSet: def __init__(self, name: str, /) -> None: self.name: str = name js_name = name.replace("_", "-") @@ -435,226 +190,27 @@ def ext_fn(ext: ExtSupported, /): } -class Stocks(Dataset): - name = "stocks" - _additional_docs = """ - For convenience, the stocks dataset supports pivoted output using the - optional `pivoted` keyword. 
If pivoted is set to True, each company's - price history will be returned in a separate column: - - >>> df = data.stocks() # not pivoted - >>> df.head(3) - symbol date price - 0 MSFT 2000-01-01 39.81 - 1 MSFT 2000-02-01 36.35 - 2 MSFT 2000-03-01 43.22 - - >>> df_pivoted = data.stocks(pivoted=True) - >>> df_pivoted.head() - symbol AAPL AMZN GOOG IBM MSFT - date - 2000-01-01 25.94 64.56 NaN 100.52 39.81 - 2000-02-01 28.66 68.87 NaN 92.11 36.35 - 2000-03-01 33.95 67.00 NaN 106.11 43.22 - """ - _pd_read_kwds = {"parse_dates": ["date"]} - - def __call__(self, pivoted=False, use_local=True, **kwargs): - """ - Load and parse the dataset from remote URL or local file. - - Parameters - ---------- - pivoted : boolean, default False - If True, then pivot data so that each stock is in its own column. - use_local : boolean - If True (default), then attempt to load the dataset locally. If - False or if the dataset is not available locally, then load the - data from an external URL. - **kwargs : - additional keyword arguments are passed to data parser (usually - pd.read_csv or pd.read_json, depending on the format of the data - source) - - Returns - ------- - data : DataFrame - parsed data - """ - __doc__ = super().__call__.__doc__ # noqa:F841 - data = super().__call__(use_local=use_local, **kwargs) - if pivoted: - data = data.pivot(index="date", columns="symbol", values="price") - return data - - -class Cars(Dataset): - name = "cars" - _pd_read_kwds = {"convert_dates": ["Year"]} - - -class Climate(Dataset): - name = "climate" - _pd_read_kwds = {"convert_dates": ["DATE"]} - - -class Github(Dataset): - name = "github" - _pd_read_kwds = {"parse_dates": ["time"]} - - -class IowaElectricity(Dataset): - name = "iowa-electricity" - _pd_read_kwds = {"parse_dates": ["year"]} - - -class LARiots(Dataset): - name = "la-riots" - _pd_read_kwds = {"parse_dates": ["death_date"]} - - -class Miserables(Dataset): - name = "miserables" - _return_type = tuple - _additional_docs = """ - The miserables data contains two dataframes, ``nodes`` and ``links``, - both of which are returned from this function. - """ - - def __call__(self, use_local=True, **kwargs): - __doc__ = super().__call__.__doc__ # noqa:F841 - dct = json.loads(self.raw(use_local=use_local).decode(), **kwargs) - nodes = pd.DataFrame.from_records(dct["nodes"], index="index") - links = pd.DataFrame.from_records(dct["links"]) - return nodes, links - - -class SeattleTemps(Dataset): - name = "seattle-temps" - _pd_read_kwds = {"parse_dates": ["date"]} - - -class SeattleWeather(Dataset): - name = "seattle-weather" - _pd_read_kwds = {"parse_dates": ["date"]} - - -class SFTemps(Dataset): - name = "sf-temps" - _pd_read_kwds = {"parse_dates": ["date"]} - - -class Sp500(Dataset): - name = "sp500" - _pd_read_kwds = {"parse_dates": ["date"]} - - -class UnemploymentAcrossIndustries(Dataset): - name = "unemployment-across-industries" - _pd_read_kwds = {"convert_dates": ["date"]} - - -class US_10M(Dataset): - name = "us-10m" - _return_type = dict - _additional_docs = """ - The us-10m dataset is a TopoJSON file, with a structure that is not - suitable for storage in a dataframe. For this reason, the loader returns - a simple Python dictionary. 
- """ - - def __call__(self, use_local=True, **kwargs): - __doc__ = super().__call__.__doc__ # noqa:F841 - return json.loads(self.raw(use_local=use_local).decode(), **kwargs) - - -class World_110M(Dataset): - name = "world-110m" - _return_type = dict - _additional_docs = """ - The world-100m dataset is a TopoJSON file, with a structure that is not - suitable for storage in a dataframe. For this reason, the loader returns - a simple Python dictionary. - """ - - def __call__(self, use_local=True, **kwargs): - __doc__ = super().__call__.__doc__ # noqa:F841 - return json.loads(self.raw(use_local=use_local).decode(), **kwargs) - - -class ZIPCodes(Dataset): - name = "zipcodes" - _pd_read_kwds = {"dtype": {"zip_code": "object"}} - - class DataLoader: - """ - Load a dataset from a local file or remote URL. + @cached_property + def _dataset_names(self) -> list[str]: + return sorted(DATASETS_JSON) - There are two ways to call this; for example to load the iris dataset, you - can call this object and pass the dataset name by string: + @cached_property + def _py_js_names(self) -> dict[str, str]: + return {name.replace("-", "_"): name for name in self._dataset_names} - >>> from vega_datasets import data - >>> df = data("iris") + def list_datasets(self) -> list[str]: + return list(self._py_js_names) - or you can call the associated named method: - - >>> df = data.iris() - - Optionally, additional parameters can be passed to either of these - - Optional parameters - ------------------- - return_raw : boolean - If True, then return the raw string or bytes. - If False (default), then return a pandas dataframe. - use_local : boolean - If True (default), then attempt to load the dataset locally. If - False or if the dataset is not available locally, then load the - data from an external URL. - **kwargs : - additional keyword arguments are passed to the pandas parsing function, - either ``read_csv()`` or ``read_json()`` depending on the data format. - """ - - _datasets = {name.replace("-", "_"): name for name in Dataset.list_datasets()} - - def list_datasets(self): - return Dataset.list_datasets() - - def __call__(self, name, return_raw=False, use_local=True, **kwargs): - loader = getattr(self, name.replace("-", "_")) - if return_raw: - return loader.raw(use_local=use_local, **kwargs) + def __getattr__(self, name: str) -> Dataset: + if name in self._py_js_names: + return Dataset(self._py_js_names[name]) else: - return loader(use_local=use_local, **kwargs) - - def __getattr__(self, dataset_name): - if dataset_name in self._datasets: - return Dataset.init(self._datasets[dataset_name]) - else: - msg = f"No dataset named '{dataset_name}'" + msg = f"No dataset named '{name}'" raise AttributeError(msg) - def __dir__(self): - return list(self._datasets.keys()) - + def __dir__(self) -> list[str]: + return self.list_datasets() -class LocalDataLoader(DataLoader): - _datasets = {name.replace("-", "_"): name for name in Dataset.list_local_datasets()} - def list_datasets(self): - return Dataset.list_local_datasets() - - def __getattr__(self, dataset_name): - if dataset_name in self._datasets: - return Dataset.init(self._datasets[dataset_name]) - elif dataset_name in DataLoader._datasets: - msg = ( - f"'{dataset_name}' dataset is not available locally. 
To " - f"download it, use ``vega_datasets.data.{dataset_name}()" - ) - raise ValueError(msg) - else: - msg = f"No dataset named '{dataset_name}'" - raise AttributeError(msg) +data = DataLoader() From 279586b17dc766382b7a06e5874983e704789bf9 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 4 Oct 2024 19:26:09 +0100 Subject: [PATCH 003/201] refactor: Make version accessible via `data.source_tag` - Allow quickly switching between version tags https://github.com/vega/altair/discussions/3150#discussioncomment-6719752 --- tools/vendor_datasets.py | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 4a435c253..a50297420 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -3,7 +3,7 @@ import sys from functools import cached_property, partial from pathlib import Path -from typing import Any, Literal, cast +from typing import Any, ClassVar, Literal, cast from urllib.request import urlopen if sys.version_info >= (3, 10): @@ -15,20 +15,25 @@ # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. -SOURCE_TAG = "v1.29.0" # 5 years ago -CURRENT_TAG = "v2.9.0" -USE_TAG = CURRENT_TAG +_OLD_SOURCE_TAG = "v1.29.0" # 5 years ago +_CURRENT_SOURCE_TAG = "v2.9.0" + + +def _py_to_js(s: str, /): + return s.replace("_", "-") + + +def _js_to_py(s: str, /): + return s.replace("-", "_") -BASE_URL = f"https://cdn.jsdelivr.net/npm/vega-datasets@{USE_TAG}/data/" ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv"] class Dataset: - def __init__(self, name: str, /) -> None: + def __init__(self, name: str, /, base_url: str) -> None: self.name: str = name - js_name = name.replace("_", "-") - file_name = DATASETS_JSON[js_name]["filename"] + file_name = DATASETS_JSON[_py_to_js(name)]["filename"] suffix = Path(file_name).suffix self.extension: ExtSupported if suffix in {".csv", ".json", ".tsv"}: @@ -36,7 +41,7 @@ def __init__(self, name: str, /) -> None: else: raise NotImplementedError(suffix, file_name) - self.url: str = f"{BASE_URL}{file_name}" + self.url: str = f"{base_url}{file_name}" def __call__(self, **kwds: Any) -> pl.DataFrame: with urlopen(self.url) as f: @@ -191,22 +196,29 @@ def ext_fn(ext: ExtSupported, /): class DataLoader: + source_tag: ClassVar[str] = "v2.9.0" + _base_url_fmt: str = "https://cdn.jsdelivr.net/npm/vega-datasets@{0}/data/" + + @property + def base_url(self) -> str: + return self._base_url_fmt.format(self.source_tag) + @cached_property def _dataset_names(self) -> list[str]: return sorted(DATASETS_JSON) @cached_property def _py_js_names(self) -> dict[str, str]: - return {name.replace("-", "_"): name for name in self._dataset_names} + return {_js_to_py(name): name for name in self._dataset_names} def list_datasets(self) -> list[str]: return list(self._py_js_names) def __getattr__(self, name: str) -> Dataset: if name in self._py_js_names: - return Dataset(self._py_js_names[name]) + return Dataset(self._py_js_names[name], self.base_url) else: - msg = f"No dataset named '{name}'" + msg = f"No dataset named {name!r}" raise AttributeError(msg) def __dir__(self) -> list[str]: From 32150ad6b4b1f79b05be988bcf359e172ea017bf Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 4 Oct 2024 19:47:09 +0100 Subject: [PATCH 004/201] refactor: `ext_fn` -> `Dataset.read_fn` --- tools/vendor_datasets.py | 45 
++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index a50297420..e79ad6010 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -3,9 +3,13 @@ import sys from functools import cached_property, partial from pathlib import Path -from typing import Any, ClassVar, Literal, cast +from typing import Any, Callable, ClassVar, Literal from urllib.request import urlopen +if sys.version_info >= (3, 13): + from typing import TypeIs +else: + from typing_extensions import TypeIs if sys.version_info >= (3, 10): from typing import TypeAlias else: @@ -18,6 +22,12 @@ _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago _CURRENT_SOURCE_TAG = "v2.9.0" +ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv"] + + +def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: + return suffix in {".csv", ".json", ".tsv"} + def _py_to_js(s: str, /): return s.replace("_", "-") @@ -27,17 +37,19 @@ def _js_to_py(s: str, /): return s.replace("-", "_") -ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv"] - - class Dataset: + read_fn: ClassVar[dict[ExtSupported, Callable[..., pl.DataFrame]]] = { + ".csv": pl.read_csv, + ".json": pl.read_json, + ".tsv": partial(pl.read_csv, separator="\t"), + } + def __init__(self, name: str, /, base_url: str) -> None: self.name: str = name file_name = DATASETS_JSON[_py_to_js(name)]["filename"] suffix = Path(file_name).suffix - self.extension: ExtSupported - if suffix in {".csv", ".json", ".tsv"}: - self.extension = cast(ExtSupported, suffix) + if is_ext_supported(suffix): + self.extension: ExtSupported = suffix else: raise NotImplementedError(suffix, file_name) @@ -45,7 +57,8 @@ def __init__(self, name: str, /, base_url: str) -> None: def __call__(self, **kwds: Any) -> pl.DataFrame: with urlopen(self.url) as f: - content = ext_fn(self.extension, **kwds)(f) + fn = self.read_fn[self.extension] + content = fn(f, **kwds) return content def __repr__(self) -> str: @@ -57,19 +70,7 @@ def __repr__(self) -> str: ) -def ext_fn(ext: ExtSupported, /): - """Very basic mapping to `polars` eager functions.""" - if ext == ".csv": - return pl.read_csv - elif ext == ".json": - return pl.read_json - elif ext == ".tsv": - return partial(pl.read_csv, separator="\t") - else: - raise - - -DATASET_NAMES_USED = [ +DATASET_NAMES_USED = ( "airports", "anscombe", "barley", @@ -110,7 +111,7 @@ def ext_fn(ext: ExtSupported, /): "windvectors", "world_110m", "zipcodes", -] +) DATASETS_JSON = { # "7zip": {"filename": "7zip.png", "format": "png"}, From f1d18a2d3baee9edbb9d17146c90b73a29d7905b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 4 Oct 2024 19:47:57 +0100 Subject: [PATCH 005/201] docs: Add trailing docs to long literals --- tools/vendor_datasets.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index e79ad6010..5b0f25fe8 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -112,6 +112,8 @@ def __repr__(self) -> str: "world_110m", "zipcodes", ) +"""Every name that is referenced in *at least* one example/test.""" + DATASETS_JSON = { # "7zip": {"filename": "7zip.png", "format": "png"}, @@ -194,6 +196,13 @@ def __repr__(self) -> str: "world-110m": {"filename": "world-110m.json", "format": "json"}, "zipcodes": {"filename": "zipcodes.csv", "format": "csv"}, } +"""Inlined `datasets.json`_. + +- Excluding images + +.. 
_datasets.json: + https://github.com/altair-viz/vega_datasets/blob/136e850447b49031f04baa137ce5c37a6678bbb1/vega_datasets/datasets.json +""" class DataLoader: From 4d3c5509f1e656adc08015f5456fe3f5671c7ecd Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 4 Oct 2024 19:51:24 +0100 Subject: [PATCH 006/201] docs: Add module-level doc --- tools/vendor_datasets.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 5b0f25fe8..08c3094e7 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -1,3 +1,10 @@ +""" +Adapted from `altair-viz/vega_datasets`_. + +.. _altair-viz/vega_datasets: + https://github.com/altair-viz/vega_datasets +""" + from __future__ import annotations import sys From 3a284a5ea97ebe0ef500c9911eaeddebe88ad741 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 7 Oct 2024 17:05:34 +0100 Subject: [PATCH 007/201] feat: Adds `.arrow` support To support [flights-200k.arrow](https://github.com/vega/vega-datasets/blob/f637f85f6a16f4b551b9e2eb669599cc21d77e69/data/flights-200k.arrow) --- tools/vendor_datasets.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 08c3094e7..26e1207c4 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -8,6 +8,7 @@ from __future__ import annotations import sys +import tempfile from functools import cached_property, partial from pathlib import Path from typing import Any, Callable, ClassVar, Literal @@ -29,11 +30,11 @@ _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago _CURRENT_SOURCE_TAG = "v2.9.0" -ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv"] +ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"] def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: - return suffix in {".csv", ".json", ".tsv"} + return suffix in {".csv", ".json", ".tsv", ".arrow"} def _py_to_js(s: str, /): @@ -49,6 +50,7 @@ class Dataset: ".csv": pl.read_csv, ".json": pl.read_json, ".tsv": partial(pl.read_csv, separator="\t"), + ".arrow": partial(pl.read_ipc, use_pyarrow=True), } def __init__(self, name: str, /, base_url: str) -> None: @@ -63,9 +65,10 @@ def __init__(self, name: str, /, base_url: str) -> None: self.url: str = f"{base_url}{file_name}" def __call__(self, **kwds: Any) -> pl.DataFrame: - with urlopen(self.url) as f: - fn = self.read_fn[self.extension] - content = fn(f, **kwds) + fn = self.read_fn[self.extension] + with tempfile.NamedTemporaryFile() as tmp, urlopen(self.url) as f: + tmp.write(f.read()) + content = fn(tmp, **kwds) return content def __repr__(self) -> str: From 22a50396822dc48d4ed63bae3c8837dc28dab6ad Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 7 Oct 2024 19:46:40 +0100 Subject: [PATCH 008/201] feat: Add support for caching metadata --- .../_vega_datasets_data/metadata-schema.json | 12 ++ tools/_vega_datasets_data/metadata.parquet | Bin 0 -> 9100 bytes tools/vendor_datasets.py | 121 +++++++++++++++++- 3 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 tools/_vega_datasets_data/metadata-schema.json create mode 100644 tools/_vega_datasets_data/metadata.parquet diff --git a/tools/_vega_datasets_data/metadata-schema.json b/tools/_vega_datasets_data/metadata-schema.json new file mode 100644 index 000000000..2b5b9d955 --- /dev/null +++ 
b/tools/_vega_datasets_data/metadata-schema.json @@ -0,0 +1,12 @@ +{ + "ext_supported": "bool", + "file_name": "str", + "name_collision": "bool", + "name_js": "str", + "name_py": "str", + "size": "int", + "suffix": "str", + "tag": "str", + "url_github": "str", + "url_npm": "str" +} \ No newline at end of file diff --git a/tools/_vega_datasets_data/metadata.parquet b/tools/_vega_datasets_data/metadata.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1ab0fb17143528da9cd460e84a0fb18a9f1d5b73 GIT binary patch literal 9100 zcmds7c|4SB`+o*87`tc8ls(4IkZnYaonv3JhMBP>%V0=J*=eyCB9e}>CKM+vgwlc{ zOS@2(N>SDp^?RN{r*qDG&Zpno`^WqH&6wxjuj{(6`+ME@bzct~qMb4bfD)j(JeZ9D znc=UXIv|ZeU=RTGk#1007Uj(X07j^gjX#F! zSh$Zjh(c#0(C+A998&vtG7yJkG+4_<=K-woKt=h7P^jKMG(`$+CkCkj6%|8*{Z_ep zB``(s186Z%oM_e~0JZ;lG_}Ayo4ww_DYyX1BrSmm3OWG>;-&{_JTZmI2@DQ^Ghhb@ z_GtLFo?Gv4ul1}qf}|dl^N~4szAnaIp(>G-WGq?xak=pQhw41+o5cb1TC>4Pi+ldn zm+PlC$Uc@#rY?$x`Mjg`Yv5 z6{4W49DY>iPJ+ih)$Fu^9r4FW`W3V@>M7QYlu2#cI?_oq6t@>j^+}pQ7jc|7>vM^T zOlgzn!6?V~mYf+t(mTE4-ugR;v99YQu|HMp|4H88^*2?cf?@mfWEU z@>u^yafQJrpJg|4NR-T+Bnv7>N0i%bX;#9VKhS*Y>N8i%w@=vmsEr((?uR|QR5O9S zuGHkN$akkNd7d(wDi)Mn$5-8AU)8RuX_P&+6+R z=;Iko3i4UO`#2h%$&Twnz>xo2R6|C912zA_%=q%ZnHdr`0)Uax3i_?u87IunlR^y) z^r!h#gI5@PphIwj9DlU5VQz2%`A17bu2~v}3AVB|3&S&}@c_BvBndrQSUW!LY~#uP z9Q{Ts?qBvOc(U|q<+sf40`X;~`IT==LRp*uSHK>4G~Bv={e8{C>2}}QY|G)a>4_%J z;=r8c$Ee)dvoW>`RSqQPY5QZ(m@>L|q~}g84iG-*y`gaH^LPtyZDuEgQVsa*pFnN&&60_DkTv3H z7G*uFH7+@t{>p8iO@6>!v<-G6CGFBJukJQv`Z0lE^c>S(^Ln6j6)F%Ibu5f&}zHhqK&$Uy;CJiFAd)E|4w`^5ipU~!Gg zh9JkIRbpE>9&m`sJ*@}sOH>d%$VgLVL#d^fe=wu#W zzWa;&;hxFwt~c39&`sQ?gST% z#lOG}f^cj8lQl0thFevP7| z(hhjk(~8C3SBFvA_b*Xt_#G;9AXLgUomfytiXm0F!Qa54n!%uc!=g-{QC+=U{Q8m3_%9gOr2s-i_R2cNGxXdhFJ2op8J7kD?!_A;nk* zD94-hN>tvN8p>v~j-60BVEV=Ca5mzG!=Zqz5%UKEPYYk9U3e&u+uzqA-dyIb4vP46txCpA^= zEv{}WiRnf+yZ3ry7?c?hCB2TH~2sn)%_?heQv6j8zF zTi~1xTloE4m7{L@yMJX-Cltp8Tv21hax;}N15#oomAp+$>xzwm-`G<3BWR{T zTq5EYGjP-_iG!nv_lY4OopRjfc{azsIAMsvV~Gt98X06G9r$lCO8|(}0Nx@opj8sV zCkhzY#V{KPB0jAHZkcl;g@9DTM%EEc1SHE|zyV*yaK;_C9s9b?J_Xz%#2_K29&h1G6M|fQh$op0rv0cDdh33gUdT;)L>>fLkbI{M$((Y zX*;3)!V2`FFAVpZx5%UjHDH)mI`oO$f+8Lg5^DdWkjyJhnALPmOnBfbY#=B4n2wQVR14q0*Fg`uLIP4+U>hIEm&HPE&&V zD3JrHWSWw%uLeoOM_q$JQC0U<@mBUBYpPKw1Qj(EWio+)_f;lokklv|B+O6nz~eRW z1T_L#1EPi}sA{N@NhCEqM4h0iqKYT^Xe#4X)O|EmR#oIBB&eneL^Z`?gkoiKI5*+< zO)XX{@3-D5;%lmJcOiQ!C>x`OsYm1>c0u3qOJdjC&H4ni@IHS1T|%3mTCvMLmoPAl zk!Gu5pW1tpR2W5Ryfds9aed%;Ubc`x+oTYSQQ0^un5RCb<7}_tP2tzpPv>%?i8`B= z$TjO3X9+hNxeHcW|D)%X@FI3Ly+&8ljicYW{Y_n%! 
zt>@ihW!oLIdwOrY2Eh*PgfZKC>fA<=!vPQOeX!FGOiqi$Xb5{O4WIGPVdi5+5SsJa z64O$DapI9X|9U4cj}tE+`y-(j%1Z>p-rcvw#PThS+=1#saM$R zH=l}V%jZdU?oPq9mN2?XU|Y^`iej-xhS>|h zD9c$>dzC7Ig&g-Sd4@rs<;>nDM?r|GwE3OMWJS}|VSO$~d z`Y%f^#$>f$=Q165WCM)M`I1Ak4Ju6TRFMxhxXCpQw!h48Rl7OnA&m2wyW;VA_akro z`@lA-ea+!!@;q!$AGAnSjT!IeOyb!<8tT}yD4kz&kDA)&?XKYC%iGe{cA}*4X-8eo z&`p*AA9mih?u3d&DPNtC#l%7+{}F$uelj5?xXb9x^5ssO0V|`7Glw2J%2n4XepH+G z8mbKH5M|3~{-o)+uIXrK7x{!`!}`ZfLzl-yhUV_x>$_r23>SRN?s7P{eN#pWV|(vM z9M#!^!&pX`^Ou7npXPcVbLz%_>9jDY7Aigq}T?m@)5(+is#5Jy0>3X z&kIHOWnMp%vq&(PxX|Gz_3V`!!y<@2mSoRH{<2I(qcIN=zux`of!@&WCHvuN-k9oX zFS(0Orz!ZRX0cGWr#pC@uDs86_vszkJZku?|E;B`&TTPy8Mc%Jx%?O735yu5L%FPq z<~wX2B;OvtJ9*wilJ#O8!r;;i66L;I%xT?7k(c<;Wl`OMlZ-eap4-^UM|HXCzY_%`Vh-%zIBxC@%}2iHf=sK09lRpAYpXa_9oHn+n+Lu>_ij7<6wDlNVdy8RxvOl>$MM_po!uLr(?_#G{=)rpP8;eB& zc*j=9>d8KKi!|~toW|Ii0DSM)n!=&&9?l!PUd6DnFC;x!$KGn-J5}kZ`ob-s`1$L% zyl>?0Jl~%BIWCNZo7e9gmT0)|iF)>Br20mSY^9kIj-^}m`QuzY>>*Axd0$AMwzl8G zMXkI^_Nwl*GUp3)=)7>OPh!rfGaPwAgH8vnbFU&y<7 z;Q$V?|IxcJSo1C@-ocQQ=Is;czcWlR*e6^mI3(zgj)mTe&!Yr88cJ({26{+O_unM) z=SZLmzk&e3JP{;`4}Em?s;EedhNa}KvHWT|4NEicF@uUNzb>5T%m4Chqqul{f{icW z3wQz?ogW{xROC7q%ozS^=e)en<3sdd=b?p9I~u&R)wGU;ub1<4UP$k#xV`hh$m68_ z?rLAUYe2cX7dvG36D7p`ISb56hl|rQ|NK;q`s}@NLux%%l&9Pi%HwMVwSnImowGIk82_*i znth7E{#w@r3S5xO=W@1vmc8USb;r8sO#bsM$LeOkJ{jiF@qxY(H8YgkY}5`z+q6zp zSF6?i+^$fq#S6t8i;AB!tN31Or=EJ!;u1Q!`-z2%M<-8pQD#%S5L>>}DMVIP2dyrQ zO(hc@5WFv^*R??#mss&#;Lu)M^`NWx*N)b2p+giLI+S}8x$Umf!5E@=7Qd!7b}*x) zpH&Bkll*jJF4gGNQ?5rdcii`?YpCL5%0~?KGs(&3{5by-LWl%6D=!!+ZsS})L1GdC zv8AsX4@DBTZ6EFHxrAp5TXxv1A!RQ%lhP=Un9hwjI+)TctMlnI;giU+o+8DYaipeta9}m z`o_brZCW#Wb+me0~@5|pg8sAs1Z33%x(2F&!AhsH}QxH<UgQpn* z+;NX@vtKilm8t(1o|WiQSnzs3@FW9(F3n?r3>k++7dY_4TOkHTw6n8z(xOqe`vj3N zH(8Y6kEW^tkw`RxenickPl@p5^gsm68%>DQK3|y z0vAo-YAd2Xk!WBGbtgK~Y1l$#Lpm{k`iM39M4|>vZ#SJDotO#SPBe0a9=1e%Gnf#R z8o~Ub54|6eXcGks4bg*|;40ItyUE6c2#N{*r|B8MQmoJmCOO+ty`7->Skw8@UTe62cyq14`{y^iG6`gj*7%1_HjXKiYw>>A`6;sa5o z>o1Bf5B*^RvnCqB>iP%$IY5#T=slrc(8mv6El5s3Sl`h6twMZ);3fDEt&fS?cCyn} zUy5nK&(9xT?^SxFV9URw2dm*9=Kni>3}LOU(sNNE(8wl%Q6wiq1hjrMc>OG(=2h^y zz*(Xlq%e3<{UCk8sqG3R4C#$Tq7vOcAl5{pIb7u$6i9R3N{DcAQVxX6e}nfQmls|y wldY=W#)K$uXZ=X>CUd;2lPwXzFxyZANntc>HRMzL!yj-OdfRUU{U6T%06c=}+W-In literal 0 HcmV?d00001 diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 26e1207c4..871ac14af 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -7,11 +7,12 @@ from __future__ import annotations +import json import sys import tempfile from functools import cached_property, partial from pathlib import Path -from typing import Any, Callable, ClassVar, Literal +from typing import Any, Callable, ClassVar, Literal, TypedDict from urllib.request import urlopen if sys.version_info >= (3, 13): @@ -25,12 +26,130 @@ import polars as pl + +class GitHubTree(TypedDict): + path: str + mode: str + type: str + sha: str + size: int + url: str + + +class GitHubTreeResponse(TypedDict): + sha: str + url: str + tree: list[GitHubTree] + truncated: bool + + +class GitHubBlobResponse(TypedDict): + content: str + sha: str + node_id: str + size: int | None + encoding: str + url: str + + +class ParsedTree(TypedDict): + file_name: str + name_js: str + name_py: str + suffix: str + size: int + url: str + ext_supported: bool + + +class ParsedTreeResponse(TypedDict): + tag: str + url: str + tree: list[ParsedTree] + + +_GITHUB_TREE_BASE_URL = "https://api.github.com/repos/vega/vega-datasets/git/trees/" +_NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" +_SUB_DIR = "data" + + +def request_trees(tag: str, /) -> 
GitHubTreeResponse: + with urlopen(f"{_GITHUB_TREE_BASE_URL}{tag}") as response: + content: GitHubTreeResponse = json.load(response) + query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) + if data_url := next(query, None): + with urlopen(data_url) as response: + data_dir: GitHubTreeResponse = json.load(response) + return data_dir + else: + raise FileNotFoundError + + +def parse_github_tree(tree: GitHubTree, /) -> ParsedTree: + path = Path(tree["path"]) + return ParsedTree( + file_name=path.name, + name_js=path.stem, + name_py=_js_to_py(path.stem), + suffix=path.suffix, + size=tree["size"], + url=tree["url"], + ext_supported=is_ext_supported(path.suffix), + ) + + +def parse_github_tree_response( + tree: GitHubTreeResponse, /, tag: str +) -> ParsedTreeResponse: + return ParsedTreeResponse( + tag=tag, url=tree["url"], tree=[parse_github_tree(t) for t in tree["tree"]] + ) + + +def request_trees_to_df(tag: str, /) -> pl.DataFrame: + response = request_trees(tag) + parsed = parse_github_tree_response(response, tag=tag) + df = ( + pl.DataFrame(parsed["tree"]) + .lazy() + .rename({"url": "url_github"}) + .with_columns(name_collision=pl.col("name_py").is_duplicated(), tag=pl.lit(tag)) + .with_columns( + url_npm=pl.concat_str( + pl.lit(_NPM_BASE_URL), + pl.col("tag"), + pl.lit(f"/{_SUB_DIR}/"), + pl.col("file_name"), + ) + ) + .collect() + ) + return df.select(*sorted(df.columns)) + + +def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> None: + metadata = request_trees_to_df(tag) + if not fp.exists(): + fp.touch() + metadata.write_parquet(fp, compression="zstd", compression_level=17) + if write_schema: + schema = {name: tp.__name__ for name, tp in metadata.schema.to_python().items()} + fp_schema = fp.with_name(f"{fp.stem}-schema.json") + if not fp_schema.exists(): + fp_schema.touch() + with fp_schema.open("w") as f: + json.dump(schema, f, indent=2) + + # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. 
_OLD_SOURCE_TAG = "v1.29.0" # 5 years ago _CURRENT_SOURCE_TAG = "v2.9.0" ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"] +""" +- `'flights-200k.(arrow|json)'` key collison using stem +""" def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: From a618ffc6450922f602391b5511edda37b2fe325c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 7 Oct 2024 21:49:43 +0100 Subject: [PATCH 009/201] feat: Support env var `VEGA_GITHUB_TOKEN` Not required for these requests, but may be helpful to avoid limits --- tools/vendor_datasets.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 871ac14af..259999fa0 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -8,12 +8,13 @@ from __future__ import annotations import json +import os import sys import tempfile from functools import cached_property, partial from pathlib import Path from typing import Any, Callable, ClassVar, Literal, TypedDict -from urllib.request import urlopen +from urllib.request import Request, urlopen if sys.version_info >= (3, 13): from typing import TypeIs @@ -73,8 +74,15 @@ class ParsedTreeResponse(TypedDict): _SUB_DIR = "data" +def request_github(url: str, /) -> Request: + headers = {} + if tok := os.environ.get("VEGA_GITHUB_TOKEN"): + headers["Authorization"] = tok + return Request(url, headers=headers) + + def request_trees(tag: str, /) -> GitHubTreeResponse: - with urlopen(f"{_GITHUB_TREE_BASE_URL}{tag}") as response: + with urlopen(request_github(f"{_GITHUB_TREE_BASE_URL}{tag}")) as response: content: GitHubTreeResponse = json.load(response) query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) if data_url := next(query, None): From 17923404866003e27a510be793ab65c290d8802a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 7 Oct 2024 21:51:45 +0100 Subject: [PATCH 010/201] feat: Add support for multi-version metadata As an example, for comparing against the most recent I've added the 5 most recent --- .../metadata_v2.5.4-v2.9.0.parquet | Bin 0 -> 11354 bytes tools/vendor_datasets.py | 11 +++++++++++ 2 files changed, 11 insertions(+) create mode 100644 tools/_vega_datasets_data/metadata_v2.5.4-v2.9.0.parquet diff --git a/tools/_vega_datasets_data/metadata_v2.5.4-v2.9.0.parquet b/tools/_vega_datasets_data/metadata_v2.5.4-v2.9.0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5626093db560b805b33261bdc5f6b7754ab3451d GIT binary patch literal 11354 zcmeHtc|25Y*#8;p*w+~|_H`0s>`S(>FJl@)!h^xs85ts>$et}(s;3enl|2N!-mSkyBTJWA3R8PppmljUo(yIm2w_ zV1x892FjnH8XygUARqvENMBZ15tV0y0FP3$Bcz#Ltq<^af_m3 z;{_(ioFzo7>L7@|Kf4OCajFkb|7k);BV};VUc_FQ-#vBZuwh4~iH8la3%~=&D863x z(p#7KeNDAS`@#rnfw`Y)@|J22?A&%SlyJy|zWVcu_S;Y8`N+AC54DCQ{X<1(RP5z< z?`6rI<8HaSowMP>viddqH0@a#oU$l?dZL72&GkGchwWnIq-}s&mBkmkT5Q=k&7-$G zhDk#c+S4LdLPHJ}sA(iBN9&Ovd07&j*rRL-6+U5j>xDsIu)3o>Ez;8PZIWAw%M1$9{Urs*Rvqx^Wy#=Mlob)*F^35m#o*otx$2xCrREkC(j`zheUYhor zfVB5jN+kwIbce6STU!l!;IbiYA}zN=q{ZvZdZUh26}3Lc0Jlsc^B<6M+lE9?ViGCv-1S#4iASE5^#X^t4TpML4_l4mvSrDgBzrsMP!7J%{fm zwMW0|tw{c?9_A9`(ZjIsZOPZy>C;~j9~hfCKs4mh!#>ke@xZx>2i8d6U=NZfku2{) z-iv^$gGu?|An(noYYCiU1kSQ=V1)6Hg9qL8r4j7O@$St&8p2u0o(AU~yS0|RD^xW_olJIm>Nl{S* zCvzgLc%cKDKZ8+H{5W1Fj|Ejc6s&ccLt2vmh(#%nOo?NSVd!vED+2Q8p1g@{ z5cL=)`H4s+VDE;lqb7-kOBgH$-bnZbsbJa;E52_XRg2cos*ofF(?+6@!jvR)2ea- z_U^7?=sm+3Pg09yYXeWc>arSuCbo)txII}Db#y3$E%eNm(AxrzfDLe%6?R+qk1;pa 
zG=Kh{kAzF+o$$r>kMC@gWVaL7)Z$)#opr_DWwLT;{?b2r^_EBRiHV~tm0A+82fCB7 z_nyD_vl`j&e~n?ayZl`q5$QH0RwKuv-)$iEQNc#Een2JYw1fR!#?_PQxKrb4UEW=* z@u|~l8W52g#d5Av%{4b!2sC)luu;!!;UWHqlkSh??%qBVc~6-`RGW*1=B<%%il@z@ znQ+$40Z++WiUUQ=&$=5HO`Zn86$Ehthn7QPJZcYUirvU5=o-7(nb|$tZkm1(IkAH; z>XPbVx$`XWi%sctbgd5vy*l7OgOs_&Ev#e7++rs(bqst-? z?i9&545y%7oxt`3^|AOhj_BQa3>`OeAlB0QCc%&E*+frvoAtB$r*i9?A9<@{OF!zf z;uR&gqx;xQpw+Qgoaqdc-R#?g6d@PQyU5x`gXyrF(A|GJsnnFi{3G>Z5}>0MvNpGk zN2ctzG{23P(2NOs&+C)G&@bW9AikvCWDG5<%om~+u6ne+`6l`Lse&VEfWSy_VD-4K z|InTLx$Ha(H)?L}he7+(Zi_DM-rtRjFOQXWF5BH6d&uY}4bQrB`0$f+fNnYm1RcdO ze0K`zj6nRaoWgITngUCczdMEBMl~;cim}Asox*oiGxBn$n@J5Y<=Rv@Y?R`lln5Q(?{)O2BjGU@Dx3ZVgBY47^#@_Bd>$82o7&& zLyhoI!1y?SNVD6kCwCs@+2!;;Ibfr|pm3JNT7f)rThfv{v)hvnf7SSlfY zgeAjIu&kxPl0{IPa~48{rNN?^AO)7K7R9DMw0*R7WdN#WPN`3Q+ns>%lP*; zF|Xp*P}l^I?cB$A$OZ|;z!## z^RFC9M&hwhSm6ckhGqH!6W}UyS~7%e2E@igIvIdt z<_TF|opl@6JDw6nK1vMkd^rx068}IMl281}OY|Mut5PCog`f?c((rjfGBt1kGI7j)X-fh*&56dB{-Y~aX%<> z*|FwcA(VSw_mbJ&B)Ot#esS8Sboc1cJE!{(KjJZI5xA)kI*c1{F&Q9Ps+5dG-d=cJ z2V*?^c56d)N1D@2^}DF4>Cc?eO}09O`%KKhxO*Bs_-gE(U+6c-Oti9QFz6Jo;pw+b55Fi@@$%fEmL1?oebl+5^4JCN5>$wbpzlWeI zbt!uOO(3Z0`)F9a;9tB%q;Q)$z<)=d9~rDJCDd1D>>{}+C%d>7AwD2!?#p2NkVU!* zk_Y|?vsmkqc_spMuQZ1oyvBV8;$2RmmN= zs{^;YG&(u^L{j5sY4#WRAkK{(t5DX!|FP%tREcLt0y?q-(|2C;kW>8Yf;avuT(upa zcdAEAx#I8R(jIH1wfBByX^_ZU38uQvWYADyZ!EV(WAHR_s?(s&rqUSd(S=O3(y}iGI6Q=DEgGMy6&6XmfKjgoX|p&^5cTBosA})mD-FQPV=QzTJDv=qo3hk1+V3 z2|JnHS64`IIARD=e1FXaF3{p8^XXasOO;YL*~VR^9_WVomDFl|j^u&NKS-EeB3W9r zH*as--Sh6r zrZZ$yS z$~-0I6juVF7vB>!@z{d&;?U(e^kS8p0-17ZrrpFQ8P&U6$X-1BRww|c_uv#o*FZE7x=G=1GX|;1cJ(qE=u2be_#RXZ2P;veje)T%m zHm`1n$W7~gE`CPJA|AOwO5yCqX5%y}@-uCf92|zU8qg02I|9rPsq(U!oAlHNW?T#3 zIR(=q4tUl2@sA`u=9lCt5H&&-J@&cJ^?+Sfm0MeL%IbW+qCvi%$LmBLI%|9(@{)5( zD%@iDjVY%GtG)g4%q?cCIxx@LCA_Nb`S~|rkvlCoEN>E(ur&wpC7dOxk)msISI$pZ z*sVC&>dLn6xxT#bO8!{T3w-l|;YX|m9XqixxGNBI-`q37 zePB0%4&6yElvD~k4*Rmgbj|^8)9>%SFCs&qlv9vgIUlrTq^vsWI*NwM%&cwd) z?az%yAIw|1>U4?8ZDme6BzyeDyF@tU@a7ig|r? 
z`;zDMl;VnTMP%fSNH5s&=XF9O5eJ;GXYqqI>ZS(L*N@kunuzt{wdV!^) z@RVI;Ke4I`nm?uNde=}#O(!?=UT%%KS4-H;C;UmiukTk4OHFo?c9vb-BR07s_ja#g z>pX(#X>#I&q%AUHl3q_AaDTk(wkNgLqT}N!f3#h*UHS9?3pU-|lHCMZ>5uMPt2{a4 zmHN+K;)(CerBC!q4%J3wm z*?`l>8|oA0F7H7KR0tsklgJ8tiQWY2X`3e@ zlt3nik`=%@3hn_Ucd~+)mpVb6sHU#zp`zxc*HHEF&{R@YQgqkUM0+U`)CsB{ z>IB5Ucm^7+j@DGwbXNzZp*2<1Row{$RWvAFQ$tAwO(1G0qLtK$>Pj07;{R!4yBYIj z4hsozT_4JIqVy^e0)HWaio;Nt(!Ust|5*bL&gYyGTG(qCm0ALu5pl!K3GLw-1d?%2 zVu0`75cwcts6tS1;9uvm{yGa)pv-NB%~%;VkgoTCCIqCg>q35xzgzeaWr5P!D7k}H zTFPizW%MY|TAum&&YUY|miv#^5g!)$qP86bwDu`y^+MygeHF6-;7BMC1_S`ulBLqw z`)>$w72_k`tL5p}wp#4c@MpQcz-8}JxJAZmWqiWFvy42cQYI00i2Z4A@xCVy3PR+R zEV0$9i#G}sF6w_haaG)3sgt3w5X-U%tM$%w&%*J{ym^r;P%t<{tHU2N?3oRW9*{g@ z#tYDyNOGI&;{z5L=9BgpJ%W*YeKQm8k(|~_aWGon)Gafz9Vho@@f3SU^c%>8P1yBq zFFg_x9Z}LD$~txI-7|I`2A+U)qeZtuql0zZ{n5vHnXv5A%UpHyw|gB#2G3>Mc+b;3 ze-!`zWzj*x8;MA7X2=`dEbLJd>pYLlg!bOE&c)u$MGsb#69n!JH+GcKHp(cp4mUSD zN9B!V_IxI*zjS!wn*ZkdO2qMNO(D7074AQ!C&fFslC8aC3bj`7IUd>ShR@EIlxa@v z4Q3}N$e!O{vCy`+Y{D38DoskzvYr_F;9YS`nm)bR(ZAyI;J1^K^4|`|6SN8Eo;V$W zENPGyNLo7A^St?EZBOhb>u7%X;;{4c>q6uB2i18@F|?40mW!eRB&^QG9N&+UFP8&X z#&x;e9rX;o__YynQDsFHvB_!4W|(ZrZV0!EfjB2mx&XIZFo90&3@1&pTf9C7${oiE zo@f#P!m`kZ+;;gpI&nKg4rL#<%@EF3X&#++5jvR|;ofNaRZlWr+3*pjms>R_u(;#I z?UdjP1bJ7%a2X+jmbq5cr!#L}DG5h>l;3^ij+)dXiDa`C3wJSDAC;^b7u5T=uAbN3 z1l}39SU)^I_F61%FDh2B+OLMoszcU-10KY48ex3QO$T>)ckZao>+Y(ff832>TEg%3 zdo8jt-+3>fb!uA?%3AzoqYa>0|E=?id9uZrkhA?1oMG$dV*u9_C77qmQ=mxrhMqv7CB~ zHvC9I4vw$Va-?Y|-N?$C%Q=;2R{i^sK7y+GqnCLc^lpkT6}ca-Qx|o8;JJq4>lP-U5@_>Z!i=|`j`#txKJ~=K$ zDl*ge?29*bH@mWc0)`m9dE5kJ|D$NV zp~47(msK5^7WMuyb6;f$tH(v|;4*oXjwEC6-bzX|XjPB2=|01xR@QhuYUQ!v4DY+n zv*f$p`~{JPjQ7_pC#Shc7L^?iS*!Uk%7;45lG@2>`CeiQPmOh1Tvgs--d+A=rSnm& zXZ|?5l>L_0$06E)3JNl~o$Lr3GOU#Tt&pZ>v>ymEJ(D>jMF%$USd3Ng+UR`|`J?CieEiE!={-L}J&z7Wh45b@;Nca@or%za)#5nT|Hk#!&YpwCo zwDeDB=2A|z$k5MJA3G>fLd?qTX)R#qezR@(Mt9szy;mQDm}I|>?(WOkR`^tsOEKgC ziRbSAiVt~qcHNKqM#7yck%^`PZHBb)*jgQe_UB@^p-)H+t87X2w@kG*c==lTeuM4w zi9JGeX@>4+(;o8C?BKI$h85!wvk*NAn%SdsPUn$GYftZiygB3W`)*uG{HW@s;}3@< zgvuI^BJtLA7hf(uXJ(V=&Na~!pJ)q9xh1}rIS$9q#x^85jop$_dAR-4LW=bv-m64g z@uB z%un`Om-wH2F908;wl9yTs&iKCyE>c3U2po!-A(vBRY(sgL=8aoDEutsw{~OTiRqM{ z#3xXN)M*O-m*VU2(^M&qpcGvIbx-HJD!&_F+ z?Z~9(cL6`ltaYpY-}ycTpQIXK3^YI~Er41)2twq4&1wo?r7EA83V{{T0Ptg(^dQnv zQ267$;0_v65W$MEvDvXpi|pY;3?v}#Fe*?E4HZKS24f8VFdBQobc8kK7EDvKEHM}| z2CM*HtueBAusl4RQf`jHcyH7rfmZ_$O45{)jl&pVFoxD(b&MTVhBcTrq6+h+w%C-9 z!KhQ@?W4-03NxjYV~p*NpWi=^s9J(*G-jWMQ}>lgO31vS&8)&y&T zhc{)kKs~)F_6GZp3nm6qhTy+xd`wk++;`b~d6@bC`~FkLdqW-}$m*ZuQOxj*{(ql8 zMii@U$lI-?Np?34h$QUN3MgX#_1i*mJIpAtsYLJUTM ziVyJ77>ora%P}y3>}anUwtJUi044no^ZobrrHq%Uy^5!aW~8TsLAblJ1=?|!H3mR1 Z;7I|55HhkHBptu_2RfhtfCs-S{tx(skB9&O literal 0 HcmV?d00001 diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 259999fa0..61c701e1e 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -135,6 +135,17 @@ def request_trees_to_df(tag: str, /) -> pl.DataFrame: return df.select(*sorted(df.columns)) +def request_trees_to_df_batched(*tags: str, delay: int = 5) -> pl.DataFrame: + import random + import time + + dfs: list[pl.DataFrame] = [] + for tag in tags: + time.sleep(delay + random.triangular()) + dfs.append(request_trees_to_df(tag)) + return pl.concat(dfs) + + def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> None: metadata = request_trees_to_df(tag) if not fp.exists(): From fa2c9e73c1e09e9721a2e095e4715e1dfac9939c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 8 Oct 2024 21:58:33 +0100 
Subject: [PATCH 011/201] refactor: Renaming, docs, reorganize --- tools/vendor_datasets.py | 146 ++++++++++++++++++++++++++++++--------- 1 file changed, 113 insertions(+), 33 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 61c701e1e..296a5f590 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -9,26 +9,51 @@ import json import os -import sys import tempfile +import warnings from functools import cached_property, partial from pathlib import Path -from typing import Any, Callable, ClassVar, Literal, TypedDict +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypedDict, TypeVar from urllib.request import Request, urlopen -if sys.version_info >= (3, 13): - from typing import TypeIs -else: - from typing_extensions import TypeIs -if sys.version_info >= (3, 10): - from typing import TypeAlias -else: - from typing_extensions import TypeAlias - import polars as pl +if TYPE_CHECKING: + import sys + + if sys.version_info >= (3, 13): + from typing import TypeIs + else: + from typing_extensions import TypeIs + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + from tools.schemapi.utils import OneOrSeq + + _T = TypeVar("_T") + _Guard: TypeAlias = Callable[[Any], TypeIs[_T]] + +_GITHUB_URL = "https://api.github.com/" +_GITHUB_VEGA_DATASETS_URL = f"{_GITHUB_URL}repos/vega/vega-datasets/" +_GITHUB_TREES_URL = f"{_GITHUB_VEGA_DATASETS_URL}git/trees/" +_NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" +_SUB_DIR = "data" + +def _is_str(obj: Any) -> TypeIs[str]: + return isinstance(obj, str) + + + class GitHubTree(TypedDict): + """ + A single file's metadata within the response of `Get a tree`_. + + .. _Get a tree: + https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree + """ + path: str mode: str type: str @@ -37,7 +62,16 @@ class GitHubTree(TypedDict): url: str -class GitHubTreeResponse(TypedDict): +class GitHubTreesResponse(TypedDict): + """ + Response from `Get a tree`_. + + Describes directory metadata, with files stored in ``"tree"``. + + .. _Get a tree: + https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree + """ + sha: str url: str tree: list[GitHubTree] @@ -45,6 +79,15 @@ class GitHubTreeResponse(TypedDict): class GitHubBlobResponse(TypedDict): + """ + Response from `Get a blob`_. + + Obtained by following ``GitHubTree["url"]``. + + .. _Get a blob: + https://docs.github.com/en/rest/git/blobs?apiVersion=2022-11-28#get-a-blob + """ + content: str sha: str node_id: str @@ -63,37 +106,55 @@ class ParsedTree(TypedDict): ext_supported: bool -class ParsedTreeResponse(TypedDict): +class ParsedTreesResponse(TypedDict): tag: str url: str tree: list[ParsedTree] -_GITHUB_TREE_BASE_URL = "https://api.github.com/repos/vega/vega-datasets/git/trees/" -_NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" -_SUB_DIR = "data" +def _request_github(url: str, /, *, raw: bool = False) -> Request: + """ + Wrap a request url with a `personal access token`_ - if set as an env var. + By default the endpoint returns json, specify raw to get blob data. + See `Media types`_. -def request_github(url: str, /) -> Request: + .. _personal access token: + https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens + .. 
_Media types: + https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types + """ headers = {} if tok := os.environ.get("VEGA_GITHUB_TOKEN"): headers["Authorization"] = tok + if raw: + headers["Accept"] = "application/vnd.github.raw+json" return Request(url, headers=headers) -def request_trees(tag: str, /) -> GitHubTreeResponse: - with urlopen(request_github(f"{_GITHUB_TREE_BASE_URL}{tag}")) as response: - content: GitHubTreeResponse = json.load(response) +def _request_trees(tag: str | Any, /) -> GitHubTreesResponse: + """ + For a given ``tag``, perform 2x requests to get directory metadata. + + Returns response unchanged - but with annotations. + """ + if _is_str(tag): + url = tag if tag.startswith(_GITHUB_TREES_URL) else f"{_GITHUB_TREES_URL}{tag}" + else: + url = tag["trees_url"] + with urlopen(_request_github(url)) as response: + content: GitHubTreesResponse = json.load(response) query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) if data_url := next(query, None): with urlopen(data_url) as response: - data_dir: GitHubTreeResponse = json.load(response) + data_dir: GitHubTreesResponse = json.load(response) return data_dir else: raise FileNotFoundError -def parse_github_tree(tree: GitHubTree, /) -> ParsedTree: +def _parse_tree(tree: GitHubTree, /) -> ParsedTree: + """For a single tree (file) convert to an IR with only relevant properties.""" path = Path(tree["path"]) return ParsedTree( file_name=path.name, @@ -106,17 +167,18 @@ def parse_github_tree(tree: GitHubTree, /) -> ParsedTree: ) -def parse_github_tree_response( - tree: GitHubTreeResponse, /, tag: str -) -> ParsedTreeResponse: - return ParsedTreeResponse( - tag=tag, url=tree["url"], tree=[parse_github_tree(t) for t in tree["tree"]] +def _parse_trees_response( + tree: GitHubTreesResponse, /, tag: str +) -> ParsedTreesResponse: + """For a tree response (directory of files) convert to an IR with only relevant properties.""" + return ParsedTreesResponse( + tag=tag, url=tree["url"], tree=[_parse_tree(t) for t in tree["tree"]] ) def request_trees_to_df(tag: str, /) -> pl.DataFrame: - response = request_trees(tag) - parsed = parse_github_tree_response(response, tag=tag) + response = _request_trees(tag) + parsed = _parse_trees_response(response, tag=tag) df = ( pl.DataFrame(parsed["tree"]) .lazy() @@ -146,13 +208,21 @@ def request_trees_to_df_batched(*tags: str, delay: int = 5) -> pl.DataFrame: return pl.concat(dfs) -def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> None: - metadata = request_trees_to_df(tag) +def _write_parquet( + frame: pl.DataFrame | pl.LazyFrame, fp: Path, /, *, write_schema: bool +) -> None: + """ + Write ``frame`` to ``fp``, with some extra safety. + + When ``write_schema``, an addtional ``...-schema.json`` file is produced + that describes the metadata columns. 
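+
+    For example, writing ``metadata.parquet`` would also produce a sibling
+    ``metadata-schema.json``, mapping each column name to its Python type name.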
+ """ if not fp.exists(): fp.touch() - metadata.write_parquet(fp, compression="zstd", compression_level=17) + df = frame.lazy().collect() + df.write_parquet(fp, compression="zstd", compression_level=17) if write_schema: - schema = {name: tp.__name__ for name, tp in metadata.schema.to_python().items()} + schema = {name: tp.__name__ for name, tp in df.schema.to_python().items()} fp_schema = fp.with_name(f"{fp.stem}-schema.json") if not fp_schema.exists(): fp_schema.touch() @@ -160,6 +230,16 @@ def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> Non json.dump(schema, f, indent=2) +def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> None: + """ + Retrieve directory info for a given version ``tag``, writing to ``fp``. + + When ``write_schema``, an addtional ``...-schema.json`` file is produced + that describes the metadata columns. + """ + metadata = request_trees_to_df(tag) + _write_parquet(metadata, fp, write_schema=write_schema) + # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago From 24cd7d7d9752d7424f9b8e37436d032f31bc54c1 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 8 Oct 2024 22:02:13 +0100 Subject: [PATCH 012/201] feat: Support collecting release tags See https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags --- tools/vendor_datasets.py | 74 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 296a5f590..0604df780 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -36,14 +36,32 @@ _GITHUB_URL = "https://api.github.com/" _GITHUB_VEGA_DATASETS_URL = f"{_GITHUB_URL}repos/vega/vega-datasets/" +_GITHUB_TAGS_URL = f"{_GITHUB_VEGA_DATASETS_URL}tags" _GITHUB_TREES_URL = f"{_GITHUB_VEGA_DATASETS_URL}git/trees/" _NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" _SUB_DIR = "data" +_TAGS_MAX_PAGE: Literal[100] = 100 +_SEM_VER_FIELDS: tuple[ + Literal["major"], Literal["minor"], Literal["patch"], Literal["pre_release"] +] = "major", "minor", "patch", "pre_release" + def _is_str(obj: Any) -> TypeIs[str]: return isinstance(obj, str) +class GitHubTag(TypedDict): + name: str + node_id: str + commit: dict[Literal["sha", "url"], str] + zipball_url: str + tarball_url: str + + +class ParsedTag(TypedDict): + tag: str + sha: str + trees_url: str class GitHubTree(TypedDict): @@ -153,6 +171,55 @@ def _request_trees(tag: str | Any, /) -> GitHubTreesResponse: raise FileNotFoundError +def _request_tags(n: int = 30, *, warn_lower: bool) -> list[GitHubTag]: + """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" + if n < 1 or n > _TAGS_MAX_PAGE: + raise ValueError(n) + with urlopen(_request_github(f"{_GITHUB_TAGS_URL}?per_page={n}")) as response: + content: list[GitHubTag] = json.load(response) + if warn_lower and len(content) < n: + earliest = response[-1]["name"] + n_response = len(content) + msg = f"Requested {n=} tags, but got {n_response}\n" f"{earliest=}" + warnings.warn(msg, stacklevel=3) + return content + + +def _parse_tag(tag: GitHubTag, /) -> ParsedTag: + sha = tag["commit"]["sha"] + return ParsedTag(tag=tag["name"], sha=sha, trees_url=f"{_GITHUB_TREES_URL}{sha}") + + +def _with_sem_ver(df: pl.DataFrame, *, col_tag: str = "tag") -> pl.DataFrame: + """ + Extracts components of a `SemVer`_ string 
into sortable columns. + + .. _SemVer: + https://semver.org/#backusnaur-form-grammar-for-valid-semver-versions + """ + fields = pl.col(_SEM_VER_FIELDS) + pattern = r"""(?x) + v(?[[:digit:]]*)\. + (?[[:digit:]]*)\. + (?[[:digit:]]*) + (\-next\.)? + (?[[:digit:]]*)? + """ + sem_ver = pl.col(col_tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) + return ( + df.lazy() + .with_columns(sem_ver) + .with_columns(pl.when(fields.str.len_chars() > 0).then(fields).cast(pl.Int64)) + .with_columns(is_pre_release=pl.col("pre_release").is_not_null()) + .collect() + ) + + +def request_tags_to_df(n_head: int | None, *, warn_lower: bool = False) -> pl.DataFrame: + response = _request_tags(n=n_head or _TAGS_MAX_PAGE, warn_lower=warn_lower) + return pl.DataFrame([_parse_tag(tag) for tag in response]).pipe(_with_sem_ver) + + def _parse_tree(tree: GitHubTree, /) -> ParsedTree: """For a single tree (file) convert to an IR with only relevant properties.""" path = Path(tree["path"]) @@ -240,6 +307,13 @@ def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> Non metadata = request_trees_to_df(tag) _write_parquet(metadata, fp, write_schema=write_schema) + +def collect_tags( + n_head: int | None, fp: Path, *, warn_lower: bool = False, write_schema: bool = True +): + tags = request_tags_to_df(n_head, warn_lower=warn_lower) + _write_parquet(tags, fp, write_schema=write_schema) + # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago From 7dd461ff536205b5e07c62b2a4e09ab1e4bf5612 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 8 Oct 2024 22:05:32 +0100 Subject: [PATCH 013/201] feat: Adds `refresh_tags` - Basic mechanism for discovering new versions - Tries to minimise number of and total size of requests --- tools/_vega_datasets_data/tags-schema.json | 10 ++++++++ tools/_vega_datasets_data/tags.parquet | Bin 0 -> 6210 bytes tools/vendor_datasets.py | 26 +++++++++++++++++++++ 3 files changed, 36 insertions(+) create mode 100644 tools/_vega_datasets_data/tags-schema.json create mode 100644 tools/_vega_datasets_data/tags.parquet diff --git a/tools/_vega_datasets_data/tags-schema.json b/tools/_vega_datasets_data/tags-schema.json new file mode 100644 index 000000000..80f248a66 --- /dev/null +++ b/tools/_vega_datasets_data/tags-schema.json @@ -0,0 +1,10 @@ +{ + "tag": "str", + "sha": "str", + "trees_url": "str", + "major": "int", + "minor": "int", + "patch": "int", + "pre_release": "int", + "is_pre_release": "bool" +} \ No newline at end of file diff --git a/tools/_vega_datasets_data/tags.parquet b/tools/_vega_datasets_data/tags.parquet new file mode 100644 index 0000000000000000000000000000000000000000..dc0ff652ed261eebeed70ead42c0f7352ea4e8c3 GIT binary patch literal 6210 zcmds63pi9;``;oFdYNbQk}%r{U;3o$qm;^Z)+O|M~V~_IlTT*Lv6Qec!cyYp>0U zWy63WI1vxWVKrbUAd0{u6kSM7WS5pEQA9*I>nywL4i0T0cNwb{sg%0Bh)s3;^pobhe%i6lsF0)=)saaFDoSWb_P6?EratJ1OkR3!b@jlc0tVsqD?z7=TnwlH@>#-M3B(EM#v>hAD&Fv$fPpjj<=_m$SAMCv>+9$- z=-O^N%en5_`b=FtEw|-5I!qlsrYnQ1t;5rEXKr_O)6yZS(Y3g)dJL{RkH>IzqwBgd z+_vlUv~}EEwe{V!-Q8T7JUyni)^@Gw(7X_tFAs$WPDm1S;XEPb3w8>{`%YPv6tAA@ z8G)WcLr_YQA#VV2;^-}39HTfeG*kE+)sdB|FLmwLXuCe%07RdQYdKJ|0e{sRHs zK5D5=1daF@vC|gzUccl#tEgbU+G{sZ+FPS`-12duTvmgOSuJa^hsTO_bY{HuZRJBH zHf4q_rnXL=OCox68Q!0j><5mWy}7@mC09o*q~L?D%e_>+(2t)_ayk1~7gpPQSRHx4 ze?+YDcyU3|GtG^bstGRCiB3f&5mo1NJL2kH6%OOY$-c2?oASF}+y0o{<1D+QXJ2F0 zi1|C9D$-l2jz_JfoZ|yko{6!zY 
zMSIy{k2G*~gP*o)S9b2W+2Hgv_sI`zYg5EI(9_{lereafRV(7G!cz%OcQtD&117QJ z;c-cajxQW?%bNFk$B<1`?vilBhff;JmYnMvLQJ!L!WNO&OWkYPJswb6LkPY3c17r> zY`2;m=l1!x&-7WPpBXtwGbDbut6|q#lZ;-@xUA>c666Zs<^0_C2xhy9JiN0JHsQPslfx!zH3za24=wWzc@uh1ct zeg*e7Jwf@>jg;>Vn@=o%)KQhZVChw*kt<YM)Ti1UHpg$~09XyrhBDYd< zfkNKL-9g;W=5FF>R#b{3=fhQd_9v;Ahb=CzTUPNoyj|mzxpUL<{vsW7(dNyHOX4Cd z``BV{h_AJ9UZK0v>*|xrC6gX!9qi035A7f=XpFsWXlweYFL?W%EZetnk?k9twk%#K z>9nRgyT&(Sb?eoX%}Xwmq_1x@?R|el{Pv_?MW5cpKqax*`rR3$x7jP3aRI8GY6%6Z z0iD^k!oJ4Ec^?LeKU05uQM7ipMj!YZT`L7 zlA0@AA@^#HR!Oe@0l#-%5bUNW`i!)LVi86`}8+?sia{)u{WVzUP$!nKoko_ShT z;n{^Ec6KecHbRn{a`D00Lss$3%6B(jHFFa&j6&i$f-+7CufW1{-wTy3x`UDbh%q@q zh=!gNU7+|*9G#CPU`J`!9)gu`m3#4_Q2TBg1euvY@rDX#XE-XzlgsC*3I$|@NUat^ zN$NksK7i*cE#OMxS3KwH864~%WI&^F{k=3jyn;PLTs3)qKD0o0f4?BwE_V+u|F+DH z8_W%I4-TS%HniYCclRLLoEh?OjPX~w)seVY5jq;uD<$J^VQJ7JFXffz*JD&Tyut0L zvbsZ2vB8dUA#uzr=mmtdypw*|d_GcH_oSyOVfCe47jDk;!WV6|Tip&cHn^sYC^$+8 z|I<9J^iIYK`w= zh(B3gP=-l(` zgG&?iwL`xu2#EJx97?>8uLy`YrVr=c5hw|WH=f39h-ST5rK~%c%gP7|vH_aXeP+xM( zNAi_#tsj<&o|3K9yS>ADQ`_F9`V&xLF9$po&@R;#bR>av2?+u&AF9H^B zxVtMMI8{Wo>Em!NU}4oOpZd>?=Cps;yzxy!K4IK8^|^)LXtfMy&C?ev?^~&;^-Oh# zQP-~?lh}!rM;4B&l$tWB*4-4*sOXofS$@Nd^RJ18$L*yZpTJ+fn%b#yr~gBk=z-?T z8^@faA9r0BI&Fq8ii{TpEZjvL&6HETkTrJOzU*^CXlves{I}havO4=c582a5qa8<& z=Z>Z){#vINP?q+fLUDahRj}+~dR)iQBlk6CC?QJ)EaoLeJkE^lRN$HB8n=l{8NcQ< zN%rM0_i9uBo-Oz0b!TUDT*|Yj8EIW5rt(jETb3SrU|Jin(d^&}QPOj{RY=sRUP03O zq)VpvTsFL_%aVUlBlG0dxbn@T+x%K?M1SWsx!a7L7Vs_}5OPD55piZxz5wm zDkK`RlL>2Zd-2OwmMs!1!C2!JC#I$*<@QWsxLG8L_#yk8W6#>|$1+?uOCXK&q+bg) zZzIt);zxhH@pCmIA+ew9O5eFRw-Xz<8H>eDQ?)3c_gq2(z4?Sa#zDrUsfmkF z!u}f+_y3DVVH_+ED|0|0cwLnztb-sd3lps{HHm}39)iGX=ng`dJ^|pdpe1^d@R%zi{+*AT%D@P43(prlX=|x|=CDl*2{ZAs_T3fUmZdpb4THeC ztZfTvmP=l~0+%icBQ;p+ID_xfw*~<&S;3kdSZvxOd9@G-iiyF-GKf>3-`tX$!k~fH zfQ`Qfj1%DeBVeR_I?zI*`1*C05YD7t9U?%KSuywL2J<}mF@K6dx-rzFI(&fNG710^ zU=jz$&H%*ALJ$l3WBTpXW`g%eRA~Bz0cK$n*bst*W@07oAL#B9=knPk- zChEPw2$_f_&zxY%B*B>$SU@0%ubcB3k4&2LtJ^H|e}g^d`hr9z{f$B9N>Q_1n&AR6 zX=Yi^R!=WEpq^eLf1;kApnsyCHiW9}zGMpnkx3Ygpgje_!yQL4vPB}*rXYjeCa^I?Ez_QDWA1wrauWrT!9s0}>CUq87q$�JYJ< zw~@&F-Nx*>J2$>ZCQ*-4*(jRF1{AQA=gj5EVKdORoee8ONPaX1aiZdpWQH`l{7Dil z8=DRG20=VecONdPUWA5TS{|k>7Rwy`So*s_Jn|xW z<11M#8h^fmlmi3kBc7J0=bz2Lx#L5pB%AJF&tpL(5i37$ZeS4k0NC=r`3I@Y10TP@ He-Hi#`swv< literal 0 HcmV?d00001 diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 0604df780..86deec8ee 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -314,6 +314,32 @@ def collect_tags( tags = request_tags_to_df(n_head, warn_lower=warn_lower) _write_parquet(tags, fp, write_schema=write_schema) + +def refresh_tags(fp: Path, *, limit_new: int = 10) -> pl.DataFrame: + if fp.exists(): + print("Checking for new tags") + prev = pl.read_parquet(fp) + prev_latest = prev.sort(_SEM_VER_FIELDS, descending=True).head(1) + curr_latest = request_tags_to_df(1) + if curr_latest.equals(prev_latest): + print(f"Already up-to-date {fp!s}") + return prev + else: + # Work out how far behind? + print(f"Refreshing {fp!s}") + fresh = ( + pl.concat((request_tags_to_df(limit_new), prev), how="vertical") + .unique("sha") + .sort(_SEM_VER_FIELDS, descending=True) + ) + _write_parquet(fresh, fp, write_schema=True) + print(f"Collected {fresh.height - prev.height} new tags") + return fresh + else: + print(f"Initializing {fp!s}") + collect_tags(None, fp) + return pl.read_parquet(fp) + # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. 
_OLD_SOURCE_TAG = "v1.29.0" # 5 years ago From 9768495f9974173ecb6b835464174a7b0bea808b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 8 Oct 2024 22:20:03 +0100 Subject: [PATCH 014/201] feat(DRAFT): Adds `url_from` Experimenting with querying the url cache w/ expressions --- tools/vendor_datasets.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 86deec8ee..65802d130 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -13,7 +13,7 @@ import warnings from functools import cached_property, partial from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypedDict, TypeVar +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypedDict from urllib.request import Request, urlopen import polars as pl @@ -31,8 +31,9 @@ from typing_extensions import TypeAlias from tools.schemapi.utils import OneOrSeq - _T = TypeVar("_T") - _Guard: TypeAlias = Callable[[Any], TypeIs[_T]] + +_ItemSlice: TypeAlias = "tuple[int | None, int | str | None]" +"""Query result scalar selection.""" _GITHUB_URL = "https://api.github.com/" _GITHUB_VEGA_DATASETS_URL = f"{_GITHUB_URL}repos/vega/vega-datasets/" @@ -340,6 +341,28 @@ def refresh_tags(fp: Path, *, limit_new: int = 10) -> pl.DataFrame: collect_tags(None, fp) return pl.read_parquet(fp) + +def url_from( + fp: Path, + *predicates: OneOrSeq[str | pl.Expr], + item: _ItemSlice = (0, "url_npm"), + **constraints: Any, +) -> str: + """Querying multi-version trees metadata for `npm` url to fetch.""" + if fp.suffix != ".parquet": + raise NotImplementedError(fp.suffix) + items = pl.scan_parquet(fp).filter(*predicates, **constraints).collect() + if items.is_empty(): + msg = f"Found no results for:\n" f"{predicates!r}\n{constraints!r}" + raise NotImplementedError(msg) + r = items.item(*item) + if _is_str(r): + return r + else: + msg = f"Expected 'str' but got {type(r).__name__!r} from {r!r}." + raise TypeError(msg) + + # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. 
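# A hypothetical query against the cached multi-version metadata (the path and
# the filter values below are illustrative only):
#
#   url_from(
#       Path("tools/_vega_datasets_data/metadata_v2.5.4-v2.9.0.parquet"),
#       pl.col("name_js") == "cars",
#       tag="v2.9.0",
#   )
#
# With the default ``item=(0, "url_npm")`` this returns the first matching
# jsDelivr URL, e.g. for ``cars.json``.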
_OLD_SOURCE_TAG = "v1.29.0" # 5 years ago From c38c235fec976be66f7298f484e83828f2edf8a0 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 8 Oct 2024 22:31:21 +0100 Subject: [PATCH 015/201] fix: Wrap all requests with auth --- tools/vendor_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 65802d130..c5ad91459 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -165,7 +165,7 @@ def _request_trees(tag: str | Any, /) -> GitHubTreesResponse: content: GitHubTreesResponse = json.load(response) query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) if data_url := next(query, None): - with urlopen(data_url) as response: + with urlopen(_request_github(data_url)) as response: data_dir: GitHubTreesResponse = json.load(response) return data_dir else: From a22cc8a2d8231d0ac56117c3cd2fc56a2cffe762 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 9 Oct 2024 21:16:45 +0100 Subject: [PATCH 016/201] chore: Remove `DATASET_NAMES_USED` --- tools/vendor_datasets.py | 45 ---------------------------------------- 1 file changed, 45 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index c5ad91459..167c55590 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -421,51 +421,6 @@ def __repr__(self) -> str: ) -DATASET_NAMES_USED = ( - "airports", - "anscombe", - "barley", - "cars", - "co2_concentration", - "countries", - "disasters", - "driving", - "earthquakes", - "flights_2k", - "flights_5k", - "flights_airport", - "gapminder_health_income", - "github", - "income", - "iowa_electricity", - "iris", - "jobs", - "londonBoroughs", - "londonCentroids", - "londonTubeLines", - "monarchs", - "movies", - "normal_2d", - "ohlc", - "population", - "population_engineers_hurricanes", - "seattle_weather", - "sp500", - "stocks", - "unemployment", - "unemployment_across_industries", - "us_10m", - "us_employment", - "us_state_capitals", - "us_unemployment", - "wheat", - "windvectors", - "world_110m", - "zipcodes", -) -"""Every name that is referenced in *at least* one example/test.""" - - DATASETS_JSON = { # "7zip": {"filename": "7zip.png", "format": "png"}, "airports": {"filename": "airports.csv", "format": "csv"}, From 1181860ca6fa4abcd8662b0c9f5de2257b041b76 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 11 Oct 2024 17:01:24 +0100 Subject: [PATCH 017/201] feat: Major `GitHub` rewrite, handle rate limiting - `metadata_full.parquet` stores **all known** file metadata - `GitHub.refresh()` to maintain integrity in a safe manner - Roughly 3000 rows - Single release: **9kb** vs 46 releases: **21kb** --- .../metadata_full-schema.json | 12 + .../_vega_datasets_data/metadata_full.parquet | Bin 0 -> 21362 bytes tools/vendor_datasets.py | 668 +++++++++++++----- 3 files changed, 488 insertions(+), 192 deletions(-) create mode 100644 tools/_vega_datasets_data/metadata_full-schema.json create mode 100644 tools/_vega_datasets_data/metadata_full.parquet diff --git a/tools/_vega_datasets_data/metadata_full-schema.json b/tools/_vega_datasets_data/metadata_full-schema.json new file mode 100644 index 000000000..2b5b9d955 --- /dev/null +++ b/tools/_vega_datasets_data/metadata_full-schema.json @@ -0,0 +1,12 @@ +{ + "ext_supported": "bool", + "file_name": "str", + "name_collision": "bool", + "name_js": "str", + 
"name_py": "str", + "size": "int", + "suffix": "str", + "tag": "str", + "url_github": "str", + "url_npm": "str" +} \ No newline at end of file diff --git a/tools/_vega_datasets_data/metadata_full.parquet b/tools/_vega_datasets_data/metadata_full.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7a4e691cb414735738f276950d79e8c72c5f4b48 GIT binary patch literal 21362 zcmeIa1y~i`{x-Zf-Mu&6-JQ}6(hVZr(xHGzcXvytA}NhD(j|&?hltWj!?!o;<8#jW zpXYnN_q^YAz2A48VKXzUe)F3(Yu)#nd)ADaga#)F0GfaYeFCz8lG#BB&_5DB00R(6 z00cl5hI;mE6B~dwEI=9#77|Wo2R?)(Am7~HbZ7w8lqi6(Hxx=ZK=_+wl@4g^vi!cR zw4^BfjUO12#MgnhjKjvl1Vp@9_G9Rv0pe1au)3?K3UI@s>%;awM3W<|amXJ|l$y{$ zSrD=_@zPRz*9`&?f)EG^&@DaPjomz)ot<3WEiFK3;AD1iGy=E?h7L*iw}&7&P!0j; z0!0Z_00jMV7!&?E2#`u;O2v`jy4VORu1v1?pqiCVeebV^+D+}mbQ9sf&le{9^(z{y z9>4&g325o;N<)QnbD+KurSa$Eo=tpvdhCOt+t6(fGkpHBqbyas)>$$aX6DL~%_}%3 zH2uqIshdgA+%{ZZo!+^{EH574eQ$@46zh>|%OgUnFsy~uU-CSK$>KlpMAOf9Q(DeD1K}0Rue>^HSV=F z7| zR%@)fS;}`sS2w!*RyF43`B5oq7P^6jBK4AwkkFN0;*yeAg$6$J}V z2UTdVp6W8_0nhPZoW`wOw{_BdT^#ggyB+q=dHNY{DOmg_zQ$2YwHWZEE<#E;B4yw%gmoxbHiD&UFaj`>h6!fD>OdJP9}Rh08B(-vd7 zhuexF_rfi^^mzEt;-b2E7f)erMD~UoPr-Zo_sbf-kiOW-2&zTD>x)>w`XZXQt+Tn4 zg{2#tx!Yp`Bz{Pc&DqiVS7SFpentd%&G@r-B7TGS_csw|AixL@HtVtxAd3&TkSy0@ zOOLkd+3^(3i?@+x%1|jww(GNNHmXjymwh?w*p%(qzkBZ6nddCeLKG)2F3v%LhlVIE zS!2ROf+;V}m9NEuuOwTy>Ml&KCeK5aqC~HySZ`Ku$gZcvSNOr6-%zD-{hN=NiJAZn zI7W_!f`R}GB@~t_Uyp+*7LkT3LyajFm9B8sSttjSfrcnSu>_Z?*r3Ls5}$=GYtpWP zn5}gF(7TMb=678V*3d=D@|jfDWc9~LJ;i2*bWpEPMNKsrv}m|Mg#8?95f z=-NVBwS5T;>ZYu*5vBqHyZqHSNU)Im0w8f z4_ZVfwZ@>kWU&^r&MNoTaXnXK{1ExU^$j_P0F7i62L%Hy)XMx&}sZS@I8qk z2T5=ywE!}&9OP{2a<}B`%?54Ozxhhr93rtnTA77ytQK=>^j4dQn~>4I+>U1^^@m8> z!q|_O*!g|yfzU#d6l&58QjC~wI2=;S60ZOrGOq;&Y)F+(2)h=5a{$01r{BgN8YKFOVi0}y%(v<}CF1LA6biMMF6SICMsgI$b{w5XHjr?+37`C= zOAM%!#^z4;_O@=ePL4m^J%9r*0O=}8|B)~&u|avrN}$Z&gsBV5ivyUZHmBrpnZqPf zz+tX!PhCvSD#1pJkVq|exDB!~%|AQbYcn#?1sDUg0qM5ttXx`1ms+Ml%qfou$4~R? 
zyavGs5Z2TRW4>p3%NMwLwQH<2jI1uub@^_qv9qu+vyk@kp@*!1+GQppUsRG)x0QU= zSH-I={cTP(?#^1NKvM$$3Vpq`Cj9{AI<>*;C_Ka)ftW1%)DN`*V%CM4P6R>g+?IZl zGVVmt{&6Qn)ACa}e$^=exF!LIg6(ey5;1meHEsl<`f6{sG7vFXuy%oF5qfmEiUF2u z{#WGm)Ic)7vH-PP%;%;A2>ox87r!Q@)_l<38S&vQ(%c2u%-hMTN!^ZpYNUd-UXI$ld z>QU9Lj-6y*gk>14#vIz=ma3;<6^w6G9Z}lk#(Dk1(q{LWsYXD*ZulA&I}~5|lNW+(7FimZvt*s~(@HreVgN z`L3ZC4<$p&Y!V4c;19OK#KBXHlr8#69y<22nscepL4_oxn;s7g3^>NWGoo;WLL$HD z_vk}-8Iu8zD{QSyUX6jVhEUn|!5W0q$9r6~Xc!nZ$kiAuFp45aZ4~EmMAs9ke$gp2 zUm>Hz{qtlz2rc<17Bra%0BL3Da*7y=4f?Dkx2oAJ6e2ZVXxo>(W_4+Xj2Bkj6AO~< zKv~EcJ!^Mdq>yJ?V`dXh$HK;<9SW-^%OG8bD2>>8X%dc3nK_)C&)7Q zg$_)xF$n{w)qlO}b9_v|DN(y)&OwbtvM zwBiK05eG}io*F%p!YGA;wNy63jB|v(lx7o!j$xqqYMxO*d~5RL@fsEO4pKEO(0NlI zZ5Ig(0X{8z)?{=@Eep&n_qbOttwaGf@f~Z4uVGKO+G~%>J=3hZb}o*5Z|y&*3Goi_ z?c9D6(><_1KHVpBDANB+XM@}%K`=l2pD~{ng87AJloSbbEcP;!Abs`H@7pEFKzJCA zgOppm6>DG^if?QJH!BqLwSHs1L=a>0BLXPq+y0N3ulyVH&7qht$yW9(1~#j2kl&QM z=;LGRo6(A6f>1(}X38!OH1P^;fq#$rXKP0pwPN{HEoicxa^C37MYiMjW)uN1hS_|R z$KGdovh~d~f@nP~Wk$y%u!x}WZwF=KOgMu@c~maqH*rbv08GF1f53bR$gR>Cm7RdY zm(}6(SF6D%8or+cZ|UywRS}c(R$2|^GM!nuf%V^okrM0R7ByymTZf!_Zf<*UIwL;*j9)R{1Bu_fS2%qK}8HCQb4pIHS($eP#FMt2GHVo3G>PjWE2X>M6w05BLG)hK%%$+ z{a$Q)VMO~5xCd>36+HwXGT;NdFd^dyn7S}Pd5GXBa*Ygt6thZ0VGam{2hbvIo&jCo zX^oI!H^4L50~63*zyOr^Nfz(`?jOa>1Yz4b0g0TjEg}GfSll`qxTO}r$w%O)vz`MO zTU}T;U)&NDfHMgU2__&F6vP5_K?Kax!=yTs&!(lY09y4O5$t?$7oOh9o&ei2EDiD% zOssCq=*mo#T`SRR|5t4vO`A>N#yKe;a#&!8%pO517An`YVE)rk7g9}-@ht#AfCM@B zU(214YJ&Wn3?R+%SHs=^1*s+xBGu$u{G(LEBJx|R>9hE2sb&x&)u?g(kZO3Ce@Hc{ zf0AmM~>8 zcFu!8zrvvUUZI_Z5Nv(qFE3uG_(Qx&vbqy*#)`M8(IZNJiZ>$;Q1PZ##{nYVqzEvZ znTUJRVTQx;yeOiabZ>TOws>tY{bCfN;I!db%Y3<0aK`;*Q+_Hq0%#kKmRh&B!=Rk?93E4(#P(xm-VCTnRALKR<)p0y&$09i>gp&{*VTiQX z4ZTI*#oGuiG#GLjus-_mhn|x&2+I$QVPYb;wGm>UnQ+f-yq^tP*lJzbDcfq;Oq}f2 zf5|6SmEK>rA6WWB-ASK^s5_|Euja1X%t&BY=<*i;eX?`z zUrOD!G|o)N^}<2`s4C~2Hn{4aOn<&3-?uOX)qi^R50L`0{Fw+bmr#RnSqm+pnviv0;n5d}A#I9In&gZ+*tV`W56hd- zkx>Y~gtAl^AbjI06gZupGspq*w}?9+YWhzGk`k2R?vX#iS$hSOsZJ+#7L&Wm27d_H zuY94j2xLdJ`679Q^d=B)d|gi(Spw64A~fquHpPkt ztX5*uboWAGwSR)EEiz)TeN*1p>hKEkA;p$^*Qd{82I4y__KDd9!FI6C^r>C$(m^;_ zrSG)3IT-Ogi>ixAFij=G0MXaW#Ga6jcDI5%7GpGwgTKX?H| z_geK1p8qFEfDFp5K#ocvv|o=cD7Za3Zq*i2nrot{7d=ni`x!_uMx z_;7`3ogIX(WI9&kY)JPGu9shOC00>H@G$k^nlU6McLha$YyG53jlu=s0&oJx8DBfU ze?3o@Df>vwCHH08!w?o=NZ z$yGEwmiV){P>qa(eJr0%n_{5sX?!%V>bSI*#j}KstRtjdWltHW1>;XV*LC*H#wB(> z(>x7jgo|`!b@a^RtfOJJrQuDh!7&UsPi=_lL(yFXt#-|651i&U5bR_Wg|`X|7;Uca?N1UC;W zD_hSWY~6$lu12K0u!J^;f4V-9hH!-l`VLeA8UAVtCO3c>SO8Za23Ga1yEeg^hlA@d z5Ap7S4Xk5kP$EiYoGpgp=kHIILPS$5uG>$#zui~3l(To@mdktF-LdnTPPNj(8wTo!NLUes*lrs~N-9%WTZ@q()7~4iKZ#YIEEgPQmi%6I z#+!*WYkpmxB}$W9ZcQ(1Pa+#>`m{6hQ9EYk%f$xsXCOWfwN)h4RoA$Tv1VnY7FmLk zfoP4zmiYXmMtws3{N%MaXOdHxK4RU>0CX*{* z(zkQJe#gw&bNBh+9vnJ-HlccW{y7%)%v^OvrwKM%H9R~5qGKN79H0y+me&c8fML(` z2f!JcpbI!t_y@qi&`bmd0?=?sNJuEe{W}q&0_YgjMuweR;mn6XgHP(Fi}8_h*$eoo zd%5sI9JEM_&xzIj@+?Es3Mp!6U%k04i=?DErr9YyE6d?4wHd2dF*e?gu$+veMR*>b zX-MFxnw;#cE@*iXccXXusFnq%yD^whN8uf^^|zx0MT6O)(4@Nj29a`FbRAS48bZPK zdcnOV<>&Ylj9Gk1+I0s@W<3-sXK<8U*nP53WAhq9kg2x@1&tNoL}RyH5o+>(0ZU5& zU#4#cmf4683f*up%Dz;iJU)aQ?4VCqY9Mfjo! 
zj6BaDE)DCaIbwcO;#Gvq^%DOZrQn$%j>lmYO|!Lh=jYr{#<}U*ww2$7FxzGq@RO6r zKy%EZWzt%0jJa@bmYHSO>iv|WulGz`^3^;KwtM(9)IE1oo<6O}G|1hpEc0BNH?j8X z8&Yn6N@6T(HpX*xSQp)EAhVN-N2S#O!>Ek+Rso}!Ot4CkF?6vy@qtKoj%_7f+EcG| z&&gBo!)^T}osHRG)hnN!L;{COP=QN_Fy1{8H5kMpCiL0WnX)~iM2_qD?}^$0UR56* zp2A;`E8DRha9b-sucdXb%`s!Rj~vx7ltSzn@>L*lV*lRjwt*&F8{M~!o%G-nldf#$ z+A{oSMH!c=A8RT)FfuaUCx;zoBqp6s6N_>biK;a9p{Trzd1+G5!1y-Em%2f+sGTLM zxjp7cVp;PflHB(+>)v&%g?{;Te)LU*avf}I@CB_+RGh#f)#>_E_LrE zNg&(PGpyAkpUnqeW&OpaM+wbbyn}{KY$8qp$0Mry%Uryf-dQY$1o&n*TF3<6;WMPv zZ)5c2pLrITrKY>!rZyQPz?L$+boW_qHYi7_Gz%Af)A=6Ba8M`tF)0G>^?u+K{wV6B znTngzY7=Am=PB@*+BGjj#iiO!cinsz9~M08^CF2#5K|ko(ik^Dpf4A>>O1&KKIVP( zvLH2hz+!5IsoeoNx!i_1>bz_8=pcJ`ccNLEoWtGg5DvUoY)};UHQT_wMHk)n9{zW; zntQ8>5Bemvw|D&>ACu{694oZUtAx!x9LRB2G03nrN)Z?XskRn{v)7U~d$7&3RRnv2 zi^~Yydh0(kgb8`Lm!!ry>4vd8DDOJrW+{@)*_|DS4qvARNzN z!e>^rx{sf@c)+npFV{k@@Ldj_VPRuzHAj)or`dTH?1w8`{``%EsfA!J_U}ns52jD$ z3E6t)U*jr!?kh}Ysy)*j$gHCCTU;u2MWCvz%1rAh!s^yxZaJOnT{V20Zm{(NtjCW} ztyhbbg)YeZt-<(7Q$_XqgzWWFuvZqR#kf4Tc};1OI0~)etXqcZq?a^U6q`TKx}XBhf~!g~S5{K8~sJp!Uc z+~x=!w#q3T*CdnQIg^6)+r0!U1{3IVu>g8(6@%iXN}5lc1)8Z>8U7sP~VH9=^f zs~LyA=(>`oDG9Qk>74!lESi|vxVt;M39+-AI@_{Y+q&C$n6a5VIk3B0Iyi1_7Zl*);V`un;N;-qv*hRc zb+h$fq30i`=KQ}b^nCk6=rQ{x^w@pJ`2H&8@98{_P@Shi$I-WwB2|Fd?jLlXS*Xr4 zH$4Q^d3qr_&)5GOou?J5^UNDGLvOcbyGSlX=lSx7&eL_L z^9*nE+@)=;LY9;Nz6cPfu2YoWudh3ua~57$?SG3~s3me1)X z_I8PM@F7Fa)ezOE6}CYYkB0aghHxtuYX0NSwlSPiUxR&_Ivf%-Kw*d&ntS-|Va~w3 zIEf4(nh$%G-srIe<<$!MgR@AP9-LHo44ksF7%%HI{830gf^8y$~K(2V^8T! z5+Q&jJ`!Ozg-IFkMC)C{t4AoXMkpw>;bB??c}o(O=QL!&!Vkq;$7YduU>g_(`NHg? z48tv*g$laxEkK376gz!*UHTI|WRfsreR*G719#hNWr7MAw5 zk6qauE#282ogMxWsKtMX2kcPbX8$z_h*v5Kd~>vG2iGU$BlmY>G<|#qbR685=$%PSr_7FdAS+ zT%Cy{*`^_BTkj6zEpLs6%%<>N6it7~s$8x(w3|^ol|Hx2;%$g}C&)r+<%WFt!D}*P zF3H_vAS*@jSYMH7tfc&PNn$LQw1NcLZgWm5>^!KjC$uGOn(@{7TsPjUes4PLkfZ*U zRb`l0$`2e8r%IO8n3y%}`mG(w>>~n&)m`z1fc198kaNX?VW5X>V1W4E3WtKpLpzz8 z=DM^v2HO2ETlkruy_1p3u0Xj(Wci%j>7_7~q5NT*TBXO}fo9CiLe_DC(5El4PLYc> zlPi|`8!ywgMv02uOx&tOJ=NEfbbK7fD#yOq4aLpnN&9@nn|F;*r2RHIEgwNT^MaB3 zYh!uSY(a&Mlya7`@rQ{-2mQ2c%`lG;d8K~D$xKu!Z`bJ=)Z|PyK}%Xi#-V7RDcg8E z8hJ^DXM3Zci5jM4H-fZ1JWQtK1#H<~FzJ2BYDr{6xP>fxe_k-Y{0MPz|J%h4q3|J- z@SH4y^jjF%e05F$HfO+K{D8Qjp$%R4Y55XnkxDaor=y_>UnuTz}P%yVKA?;O@6>8VukJ$be)+0q}>k9vm1r0FntJfPx|b5`+OzH0=XC zak3LyFP&9SbJ!!3HKk*j_fcizgm|k6P8{i?)%zxa@XO}j!Z|g@iYtC>1KHxi5htR_ z7}U2zGX&2pj(o3!xG03aMOoxj;~(90sms86`V3rF)?_;>;9Rn{iCl`lvC?ojgV)zZ zNcf((JuLQwba^@rEI9Yf*S+^VH}v?LbA9|aw;24v73=C=e`&w&p?S=E`0k{>Q{qnz z4BPqL^tHE2Qx9g3_YcO<2g1|J=ZlWAJgMErs_7ft68DP+Rd>s+E@^rAOlb2QPFs*R9(ghVu7)LA7h8G|Lz}Jh(kkojjaC+V3$v6@(cpZ_f}rHBL=0dmVN# zF4U2;9O>({gb{C)mUS!smP!mC6c^t91pVXCZ0pn$K5;4B^e^}AKkgTP#=XT>Da}zh zXGHmGBx-Rp35Zslz9*Qi=_h(XMb{Of3FP8~i+ZUK??mpa`?i1RGj8%WBHWSDp7{6U zq=Fk0^@=H}w@LV8amcYF=HmgiAG;$XwLZZXlOv8!Stm#(B?Z%4ya-`vO$MUG4m4`Y zbzztHcDLHTTH`FT#p?G9rIL*N!)n=}Ftx?@5GF4@bl34<6 z!o;-^mzqSK1-=S@jvN&m(W27;IsPbryk4(g>N?V)SM%zFNs-N~ZUn8N@e_}Cg+a3e z@*ub5(ahnnoG7|>pdvu79X5PeFnqFsO2B8W#qvtsom`Fl3x)gU7|2uDj-2pz?EOAx zNqa8aweGlTh`HFMdxG4B`)ZM02^ieJmqPK#C5ltgupAJ@wXC?~VfLO_^^b;(nWyFF7K$P&?@ zfy$z%(3no~GWpUaSZ)sGyy!-bcq6$@jE$34x4OyTqlDrhtn1L6cq+LM*UOAH?{V>_{QHa7yc+^#eeFr7L|jgikB4}CGv%oo5>&f{d$O~{ zc$4W2)7WE$@We9wa2)Sr5<7Hy!dxk~VnwJm35ur(B_B}Kb1iiHjEA-!NRPr(axN03 z3Q&0(iT64;iLX_x5FQILBxA7pBpPvBq&LML!m>q{#-v&K)^>G^4Aia7rAJ$xs-Y;!qCPXTs~Argja6i@qZP6*ow-0YR#0??WL!w zkbUs}lKL67qNiBV(w^8r^7F2 z-4pl3kd$t7ne$f8kmN;g-Y!PsEkO(4)pT&7uB4GKnH__uMH|hzktgpn z7eCid)=e4d^CzBkUW$+X(~k{@3Jlw1HW8Q7O^IsWm5t+I_G4^#7WbtTzIf-i3VXQ-FA)-6s}w-8`;E1bc)zi8o(Rr?&EK=qcV7v+6~V!gL6(5)URG z-q^`~5@MVjBJ3GPq_+UWf!2cY0+8UfIw3k$Rr4 
ztS~*V6kSe~7t0X0iQ^m+>m@;?1smdoE7^oD+T@phK;JzEe5zuPNbyFSGr6EII(+6 zxvWZY0;F_1+zX|N!wNTYpY|i|Z%AR(M2KFHNh8>UuO&}%b=wQcmxZ%2ltwVKf%!9e zT5{9IYo753)8XPO5&$DcFJFT?-I#zM6{{hy?d`4^L$(i#dVS&}v27~&W{ z?Q|1+sGH=g5bH$(?h!l^2`%%UhOZ zk$6GXG}Jl7SfVFo2~Y2#EYAyG%nh8{9~ay7y~LhTkj>VItL|NVTVA&6C4D&D`USGIh+@vVzt{gg>906VL1zUC=`3Ir^NZKbt z5SCCsZkp7er3SlO29_yP99(p*li?lse53Y$_IA~+?(>`O>TE%FrW4CGC#k*k>iEYo z;GbPrO)u{uTE?Z@a}ftZ z;EHF{x#A{Wg>`7bS$M5u&=lOQGovjc(`DjSgK$OjH(hBnH zK3&8e{q1VFM&W!h7`|3DE)(-ypPu0s_0gr26n|%UOBt(K`C1rXHjA{dmtXY=v9Ne_ zJbQz{4}4}_GG>ZD+WXC5iI~q66FVve%ZLyrRhYD#4!j0V>CtRf75EI-Ukq_OE;4_n zHWRIiO1H=KL?3DW((?VP{C*m_XY18QY|JdeI|>ifM|=a9x<`LkGvt-3IWeg zFJ8&di|D(FL|#mhgK7Ggt3cjQXy>e~x>HneyRvzhbzTBZU)m(ox0W$ctnp|+QMgB> zeQ0cdt#uL-zM}O2cRAhL!%=&v%uV)1rs&Y^f+E|Z+3*6c6F!c5bv)E4xES0Wjl*^n(@BJ8gDi@e7Zw^<7V24D-Eu8DhG_;YhY|i8;f7=!|g1+R6i{HuCSI8rAk4jn7!BHR_`O)i~+Ld zS_FNwn`_1`2#CX0%O$YD6gsHJEH*51sL_QD7%F&{ z5W_DaOstZWjh8FG+GS2fbFA8vb{q$CwGfRs5Re_5pj506_~w11Ry&LN0y)=$3e>LO zUJJ|KC&&3rwXN=?aCLmNCyeUhP)fL2*_B^7>$eW+G=5U4wiqSutI<4`%P3*280=#= z{x)`@u$(70H_c8`z?Ya=lNuyORX!W2>@*L7J@_iZq?h1kNe0=*7MjmOtpQ#IAOjuEYeKKC3$5 z^iL{{VRZ0mWH|WuzIuvEl7y<>EY=b!uM3?vdAwV+{ zERXHq!i3!@1j^=ikE0;;Nn*q|v=0rSO zc~-kyH=;OO=AV2=i?t+Lj#UTc=Azoii3m0+!b6*0tODwIB zgxhBbpv2fd(rJoUH!;)GbrIjjc-GYpFNsz!r`x6|iX6<6xM$OeHmM-f-o;E@%%yPV!WZW?qI{q-QAo&30s1k%9%h;o?MSs~c?d2i`=4cP(aLQ6q@5kI$8-N(@9XE>zQ{d2z4xNupzX3!Axgm#M`Ii@t=Ao%uFMd|05NBL zN;g_5{FLK*KX$9|yQ*T>7uMdaIFrn|ks&YHY;SW=_#uzpzBP+kuWqb45YQYm{W9U~ zdCz3zE%4A^$dJJ!PDynt>!2p@IXYRDyqiJ=1${m#tt5hU9&BZDtPpvXtZjJk6Cm%} z8X^JJ2$ETNB2eyz;<@Q!9#eKARywBvrIJ5^Oymr8;xfg? zQBGH~V=8|Y1b=|Gen!h%xixnWQ9OxrMR5xmc58X%*}ZoU-iEPDU1=UHq;$}@h!qed z+Q1$;X~hc2KFRkHFR~>Z1+d{u={dLI98rBkq>vG+L`pI#ArwVLB0Ut!4`bjenb z<;+3$2I)QEtw2dHnp!VqL%OW85%2>728M;m25P9mjuL=FQKnOhL>5HHwl|Rh08l^Z+9XjCa`KuhV zQrWy-6Pt*;B`IFerEqEc8u@yHVHw`S{4F=|&3pa=Oj7PO=T@mF>K9#yT-P?8=;0N% z!GVNBGl!WEq(^qS-)7rndoOK&h8e{pK@Y69sMqzE*WC))3S2xZq|X}ATQ_J7kmS4# z3X(TsUcCZ*{0c)$gVVe+)L0q1W=2eCTZAxNiH{t+$}*N#fg+Yul@`=793F~oUV;Z> z<6m%1>0=bxj^enrvySTR92GtEVLyNU;blX>WVOec~pU}@AmyY zD~q30uek@OFLd|J^eJ8q8yT;=mAl+NoNB{!vKpPwe~gwQV7YXWAgt6+>*MxevO>5Z z>gt;2;YX@ZXuZbj`^2K)aq{9}{Ts?fz{i05aba;iLPfnYO;jZHr0<$U7W}e-Cpw

*1L@E^lrhJ;~e%ORkvV6Zr`B(p%srH zkEFJ-rg20+HuCqhZHs=tAZg-G*o#Hr#A^Y09cy%9VdKalWA!x7KpGazcROp2ehk?J z4ch^y!-q6gTu)G4IIDy}-rgn?GQQ|KTsYXT8E7#m{njwI3#p7K>^Bt4M<(+(5Cwgd z*vnC0%4_A>zeetay+ZhEPf%o#oD*EV`J~sIfL%J&G-RPSPXJ$0x-;Fv z)^;kc>RsBTzCUJc^a!K7*^r-u+-6CUwHnTp$l#mp;sMn6UYvVP-mKz?eTY&!hcv-H zS!cB5Y4T^eqRKda=rnWUh^dbB(&&9sexR3YhZ|3=1X6HI8M?$Q#@?}SS2huvF3UZi zUriV0biA)fsVAZx?&%p1zRpTT2EbHPP zW|_iROztbX@88o?Vu%wB@{S~meRl8^ROv)redbGB2R;Sfb{T6BBBGeWMjRR2QHxuA zmb^_Rf0K+T>jm@?$R@o-W;VB07q0t^upZ*Ck?T}HOkAWO0@K96Z|uhq7VOc1RT-a@Gw<0X>uE0^lh80Z#F%qBLNZF~Je(b}*$^-w3e{(@$Yk zm`OIqV#ui+9>}?ae>Qu3phH6N0)8c*#jq{5C+mEx)OxR7V_|A={1dt=)z%xD^8hln z_q@|_w3}LoX3_2Kd7-*lz6tiO3|KWf`k(pZZrs1*e3*EemrDwx#}DWJlr<#RnykmO zYDTaKuPNcf*zK1}MJKvzw~4N9!N?%>>7tYUSW1!G(x=XmDj*&Ci2&yqJ%^No4 zAe+gNYqN`NYy5$%y^)OM`55V&(l*eZb>QurEhz_8%vMwq=`J%hnPSk&!AKq9b5IuS zdl5_vbT8Ge6Vs<0-@{KcNal(2>!KU1{H*NnBQh+f;P;juj30w^p9W>Gh!m`JGwy_V zGm8q6FlJjF@-`O;#IFhowNm;ui47KE8VbJ$S@2$K(C`z`cs!$+aek;tah{(eYo$Ve z5b7f=rs(@%uZJvdc@r_(oLPk*PyDdv3T9UNU~Wz9^w=|N&4Q%<>|iy^^_Jwh()UC? z`wL@r;Pdao<1c_tXAwN(mXoMjE@rWuzAO*uBl*{N)G3R6r*^X7qSiz5IEOVlY)`g1 zw<{2qV6VSmbt*ku{J zqm-_X3_U5qQ5Y+^G-3rkl3#`*KxbVaGWeY0+UNVR((3dy0q@#K~l&n}w7Ku;|N!M$$M&Gaq(JQ*JPtB`Q`u z4Od|;rlO?GdHZmqMN3Z$<0A)@N#r^7E*88te2=uv_?whp3yc)5KFES8N2=}$?caHJ zgCr4`{Pap$XACJ{)NORw_7{|oXkp=g&8zE7*Iq(T zI?x)J@?Fke;Wep6kBjA&MtQeKTrbypA#~?5+^UO`L_Cm=2_!G{y#aE@&z(fTLArby zXOC-#*TFXBzV7aH;1i568_e)={@zAw1!u7v9FE#F;VkK)$2$y=XM(f<$oFaxp^pS1 z&%b^=4-6e81`pWNbyowS4+Q^Dd0<9}4MPa|un*)PG7pvz3;^6k{P9T~&PkH|$>5n&JASHjziT_i{pHtQe3GQY{|0&_u)MG+|pR;@al=dGy zZ2h~_Itc%hxg&q~iSD0H=|6rnN=Wd(qs9I7hy;uTDuRJD762g$I1dAqi1Bm3k`%N6 z{JCGo2?JWlao6L4*&wt!u-=Gp&%2%A#uOI&_N4OB7qiz=KQCxVf@JY>y7DO8KK30#5sHY zii3m1-Mz)}qsY%W{J&D6C7`p;{~Z3)RDeH6|AK>`ageW95t0yA&{e>YK&ticf8Po` zR0abAgV%ClKvV#_S`v~!rXc)2@gIouw?0h*UV)e+00-y6fMfx5k>tOe`KS8&Vex-+IFlTy|%qApg!t zyd}y1(n~nt&O0kJ5G$epx)%<=UHQ9jZ-8g+ETk5|fVcp3UtE7%_`9zsfDi603_vUh z0O-~#fWJ-rscrqJ*Z+S`GXaM~rq&RF{R?40C9sfY`5e+o{y)U3;DS5%OG;oMI0*pf z^pHb(I~@}EAQ1)+3Yr9H5*iw+xkB_!M=AQ>QAbqQuQNcxi}&~ya}3G1KvoFH3Cb7)Ww8m%HBDIp=H4#_T|eP=@* z5|+L*W_wrSmwgEdeyBaKJ9~G=*B8e$KU z1dWo{@iLPY^e~s>_>b*LLB054&(Ty*!^uJyaz3he$8&>v00}`(_0CmES;#KL2~DUU zkgdkg`cQ^uft;@Z#7TCD|BxhTlq83~uK9oL-`%0*RX7bC44f??R`2T1`_7-cl^pcY z5;D;0`iuJ0f_Nr)mlKi;vRFf}7Q|0$XnjM@U&Yzd0eT7kH(eh&UK=xA9V>JBNB{Zx zL$CKwd!~-c|IQw?8vf$^f43iLXs!LUr_UwmW+rFvZK^Bi0l9u|(CeoJN&eaR^r5>F z8juQuUQ}yHeL=VCKbnN}U7~~p`(6Kl99lv`5gKLSVDDz2Bj};8%V`e{|C{#y_x*)l yFF73^3t2&L3q8pvX55M#2D<7J00IOxCwo&@H^K_Y$m1`5fDy= (3, 13): from typing import TypeIs else: from typing_extensions import TypeIs + if sys.version_info >= (3, 11): + from typing import LiteralString, Required + else: + from typing_extensions import LiteralString, Required if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias from tools.schemapi.utils import OneOrSeq + _Frame = TypeVar("_Frame", pl.DataFrame, pl.LazyFrame) + _PathName: TypeAlias = Literal["dir", "tags", "trees"] + -_ItemSlice: TypeAlias = "tuple[int | None, int | str | None]" +_ItemSlice: TypeAlias = ( + "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" +) """Query result scalar selection.""" -_GITHUB_URL = "https://api.github.com/" -_GITHUB_VEGA_DATASETS_URL = f"{_GITHUB_URL}repos/vega/vega-datasets/" -_GITHUB_TAGS_URL = f"{_GITHUB_VEGA_DATASETS_URL}tags" -_GITHUB_TREES_URL = f"{_GITHUB_VEGA_DATASETS_URL}git/trees/" _NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" _SUB_DIR = "data" -_TAGS_MAX_PAGE: Literal[100] = 100 _SEM_VER_FIELDS: tuple[ Literal["major"], 
Literal["minor"], Literal["patch"], Literal["pre_release"] ] = "major", "minor", "patch", "pre_release" @@ -51,6 +74,14 @@ def _is_str(obj: Any) -> TypeIs[str]: return isinstance(obj, str) +class GitHubUrl(NamedTuple): + BASE: LiteralString + RATE: LiteralString + REPO: LiteralString + TAGS: LiteralString + TREES: LiteralString + + class GitHubTag(TypedDict): name: str node_id: str @@ -65,6 +96,14 @@ class ParsedTag(TypedDict): trees_url: str +class ReParsedTag(ParsedTag): + major: int + minor: int + patch: int + pre_release: int | None + is_pre_release: bool + + class GitHubTree(TypedDict): """ A single file's metadata within the response of `Get a tree`_. @@ -97,24 +136,6 @@ class GitHubTreesResponse(TypedDict): truncated: bool -class GitHubBlobResponse(TypedDict): - """ - Response from `Get a blob`_. - - Obtained by following ``GitHubTree["url"]``. - - .. _Get a blob: - https://docs.github.com/en/rest/git/blobs?apiVersion=2022-11-28#get-a-blob - """ - - content: str - sha: str - node_id: str - size: int | None - encoding: str - url: str - - class ParsedTree(TypedDict): file_name: str name_js: str @@ -123,6 +144,11 @@ class ParsedTree(TypedDict): size: int url: str ext_supported: bool + tag: str + + +class QueryTree(ParsedTree, total=False): + name_js: Required[str] class ParsedTreesResponse(TypedDict): @@ -131,64 +157,442 @@ class ParsedTreesResponse(TypedDict): tree: list[ParsedTree] -def _request_github(url: str, /, *, raw: bool = False) -> Request: +class GitHubRateLimit(TypedDict): + limit: int + used: int + remaining: int + reset: int + + +class ParsedRateLimit(GitHubRateLimit): + reset_time: time.struct_time + is_limited: bool + is_auth: bool + + +class GitHubRateLimitResources(TypedDict, total=False): + """ + A subset of response from `Get rate limit status for the authenticated user`_. + + .. _Get rate limit status for the authenticated user: + https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user + """ + + core: Required[GitHubRateLimit] + search: Required[GitHubRateLimit] + graphql: GitHubRateLimit + integration_manifest: GitHubRateLimit + code_search: GitHubRateLimit + + +class _ErrorHandler(urllib.request.BaseHandler): + """ + Adds `rate limit`_ info to a forbidden error. + + .. _rate limit: + https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28 + """ + + def http_error_default( + self, req: Request, fp: IO[bytes] | None, code: int, msg: str, hdrs: Message + ): + if code == 403 and (reset := hdrs.get("X-RateLimit-Reset", None)): + limit = hdrs.get("X-RateLimit-Limit", "") + remaining = hdrs.get("X-RateLimit-Remaining", "") + msg = ( + f"{msg}\n\nFailed to balance rate limit.\n" + f"{limit=}, {remaining=}\n" + f"Reset: {time.localtime(int(reset))!r}" + ) + raise urllib.request.HTTPError(req.full_url, code, msg, hdrs, fp) + + +class _GitHubRequestNamespace: + """ + Fetching resources from the `GitHub API`_. + + .. _GitHub API: + https://docs.github.com/en/rest/about-the-rest-api/about-the-rest-api?apiVersion=2022-11-28 """ - Wrap a request url with a `personal access token`_ - if set as an env var. - By default the endpoint returns json, specify raw to get blob data. - See `Media types`_. 
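+    # The class constants below mirror GitHub's documented behaviour: an
+    # unauthenticated client is capped at 60 core requests per hour, hence the
+    # longer delay and the small trees limit. An illustrative way to lift the
+    # cap is exporting a token before running, e.g.
+    #   VEGA_GITHUB_TOKEN=<personal access token> python tools/vendor_datasets.py
+    # (the command line is a sketch; only the env var name comes from this module).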
+ _ENV_VAR: LiteralString = "VEGA_GITHUB_TOKEN" + _TAGS_MAX_PAGE: Literal[100] = 100 + _VERSION: LiteralString = "2022-11-28" + _UNAUTH_RATE_LIMIT: Literal[60] = 60 + _TAGS_COST: Literal[1] = 1 + _TREES_COST: Literal[2] = 2 + _UNAUTH_DELAY: Literal[5] = 5 + _AUTH_DELAY: Literal[1] = 1 + _UNAUTH_TREES_LIMIT: Literal[10] = 10 + + def __init__(self, gh: _GitHub, /) -> None: + self._gh = gh + + @property + def url(self) -> GitHubUrl: + return self._gh.url + + def rate_limit(self) -> GitHubRateLimitResources: + """https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user.""" + with self._gh._opener.open(self._request(self.url.RATE)) as response: + content: GitHubRateLimitResources = json.load(response)["resources"] + return content + + def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: + """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" + if n < 1 or n > self._TAGS_MAX_PAGE: + raise ValueError(n) + req = self._request(f"{self.url.TAGS}?per_page={n}") + with self._gh._opener.open(req) as response: + content: list[GitHubTag] = json.load(response) + if warn_lower and len(content) < n: + earliest = response[-1]["name"] + n_response = len(content) + msg = f"Requested {n=} tags, but got {n_response}\n" f"{earliest=}" + warnings.warn(msg, stacklevel=3) + return content + + def trees(self, tag: str | ParsedTag, /) -> GitHubTreesResponse: + """ + For a given ``tag``, perform **2x requests** to get directory metadata. + + Returns response unchanged - but with annotations. + """ + if _is_str(tag): + url = tag if tag.startswith(self.url.TREES) else f"{self.url.TREES}{tag}" + else: + url = tag["trees_url"] + with self._gh._opener.open(self._request(url)) as response: + content: GitHubTreesResponse = json.load(response) + query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) + if data_url := next(query, None): + with self._gh._opener.open(self._request(data_url)) as response: + data_dir: GitHubTreesResponse = json.load(response) + return data_dir + else: + raise FileNotFoundError + + def _request(self, url: str, /, *, raw: bool = False) -> Request: + """ + Wrap a request url with a `personal access token`_ - if set as an env var. + + By default the endpoint returns json, specify raw to get blob data. + See `Media types`_. - .. _personal access token: + .. _personal access token: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens - .. _Media types: + .. _Media types: https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types + """ + headers: MutableMapping[str, str] = {"X-GitHub-Api-Version": self._VERSION} + if tok := os.environ.get(self._ENV_VAR): + headers["Authorization"] = ( + tok if tok.startswith("Bearer ") else f"Bearer {tok}" + ) + if raw: + headers["Accept"] = "application/vnd.github.raw+json" + return urllib.request.Request(url, headers=headers) + + +class _GitHubParseNamespace: + """ + Transform responses into intermediate representations. 
+ + Where relevant: + - Adding cheap to compute metadata + - Dropping information that we don't need for the task """ - headers = {} - if tok := os.environ.get("VEGA_GITHUB_TOKEN"): - headers["Authorization"] = tok - if raw: - headers["Accept"] = "application/vnd.github.raw+json" - return Request(url, headers=headers) + def __init__(self, gh: _GitHub, /) -> None: + self._gh = gh -def _request_trees(tag: str | Any, /) -> GitHubTreesResponse: + @property + def url(self) -> GitHubUrl: + return self._gh.url + + def rate_limit(self, rate_limit: GitHubRateLimitResources, /) -> ParsedRateLimit: + core = rate_limit["core"] + reset = core["reset"] + return ParsedRateLimit( + **core, + reset_time=time.localtime(reset), + is_limited=core["remaining"] == 0, + is_auth=core["limit"] > self._gh.req._UNAUTH_RATE_LIMIT, + ) + + def tag(self, tag: GitHubTag, /) -> ParsedTag: + sha = tag["commit"]["sha"] + return ParsedTag(tag=tag["name"], sha=sha, trees_url=f"{self.url.TREES}{sha}") + + def tags(self, tags: list[GitHubTag], /) -> list[ParsedTag]: + return [self.tag(t) for t in tags] + + def tree(self, tree: GitHubTree, tag: str, /) -> ParsedTree: + """For a single tree (file) convert to an IR with only relevant properties.""" + path = Path(tree["path"]) + return ParsedTree( + file_name=path.name, + name_js=path.stem, + name_py=_js_to_py(path.stem), + suffix=path.suffix, + size=tree["size"], + url=tree["url"], + ext_supported=is_ext_supported(path.suffix), + tag=tag, + ) + + def trees(self, tree: GitHubTreesResponse, /, tag: str) -> list[ParsedTree]: + """For a tree response (directory of files) convert to an IR with only relevant properties.""" + return [self.tree(t, tag) for t in tree["tree"]] + + +class _GitHubQueryNamespace: + """**WIP** Interfacing with the cached metadata.""" + + def __init__(self, gh: _GitHub, /) -> None: + self._gh = gh + + @property + def paths(self) -> dict[_PathName, Path]: + return self._gh._paths + + def url_from( + self, + *predicates: OneOrSeq[str | pl.Expr], + item: _ItemSlice = (0, "url_npm"), + **constraints: Any, + ) -> str: + """Querying multi-version trees metadata for `npm` url to fetch.""" + fp = self.paths["trees"] + if fp.suffix != ".parquet": + raise NotImplementedError(fp.suffix) + items = pl.scan_parquet(fp).filter(*predicates, **constraints).collect() + if items.is_empty(): + msg = f"Found no results for:\n" f"{predicates!r}\n{constraints!r}" + raise NotImplementedError(msg) + r = items.item(*item) + if _is_str(r): + return r + else: + msg = f"Expected 'str' but got {type(r).__name__!r} from {r!r}." + raise TypeError(msg) + + +class _GitHub: """ - For a given ``tag``, perform 2x requests to get directory metadata. + Primary interface with the GitHub API. + + Maintains up-to-date metadata, describing **every** available dataset across **all known** releases. + + - Uses `tags`_, `trees`_, `rate_limit`_ endpoints. + - Organizes distinct groups of operations into property accessor namespaces. + + + .. _tags: + https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags + .. _trees: + https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree + .. _rate_limit: + https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user - Returns response unchanged - but with annotations. 
""" - if _is_str(tag): - url = tag if tag.startswith(_GITHUB_TREES_URL) else f"{_GITHUB_TREES_URL}{tag}" - else: - url = tag["trees_url"] - with urlopen(_request_github(url)) as response: - content: GitHubTreesResponse = json.load(response) - query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) - if data_url := next(query, None): - with urlopen(_request_github(data_url)) as response: - data_dir: GitHubTreesResponse = json.load(response) - return data_dir - else: - raise FileNotFoundError + _opener: ClassVar[OpenerDirector] = urllib.request.build_opener(_ErrorHandler) + + def __init__( + self, + output_dir: Path, + name_tags: str, + name_trees: str, + *, + write_schema: bool, + base_url: LiteralString = "https://api.github.com/", + ) -> None: + # When ``write_schema``, addtional ``...-schema.json`` file(s) are produced + # that describes column types - in a non-binary format. + self._write_schema: bool = write_schema + output_dir.mkdir(exist_ok=True) + self._paths: dict[_PathName, Path] = { + "dir": output_dir, + "tags": output_dir / f"{name_tags}.parquet", + "trees": output_dir / f"{name_trees}.parquet", + } + repo = f"{base_url}repos/vega/vega-datasets/" + self._url = GitHubUrl( + BASE=base_url, + RATE=f"{base_url}rate_limit", + REPO=repo, + TAGS=f"{repo}tags", + TREES=f"{repo}git/trees/", + ) -def _request_tags(n: int = 30, *, warn_lower: bool) -> list[GitHubTag]: - """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" - if n < 1 or n > _TAGS_MAX_PAGE: - raise ValueError(n) - with urlopen(_request_github(f"{_GITHUB_TAGS_URL}?per_page={n}")) as response: - content: list[GitHubTag] = json.load(response) - if warn_lower and len(content) < n: - earliest = response[-1]["name"] - n_response = len(content) - msg = f"Requested {n=} tags, but got {n_response}\n" f"{earliest=}" - warnings.warn(msg, stacklevel=3) - return content + @property + def req(self) -> _GitHubRequestNamespace: + return _GitHubRequestNamespace(self) + @property + def parse(self) -> _GitHubParseNamespace: + return _GitHubParseNamespace(self) -def _parse_tag(tag: GitHubTag, /) -> ParsedTag: - sha = tag["commit"]["sha"] - return ParsedTag(tag=tag["name"], sha=sha, trees_url=f"{_GITHUB_TREES_URL}{sha}") + @property + def query(self) -> _GitHubQueryNamespace: + return _GitHubQueryNamespace(self) + + @property + def url(self) -> GitHubUrl: + return self._url + + def rate_limit(self) -> ParsedRateLimit: + return self.parse.rate_limit(self.req.rate_limit()) + + def tags(self, n_head: int | None, *, warn_lower: bool = False) -> pl.DataFrame: + tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) + return pl.DataFrame(self.parse.tags(tags)).pipe(_with_sem_ver) + + def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: + """Retrieve directory info for a given version ``tag``.""" + trees = self.req.trees(tag) + tag_v = _tag_from(tag) if _is_str(tag) else tag["tag"] + parsed = self.parse.trees(trees, tag=tag_v) + df = ( + pl.DataFrame(parsed) + .lazy() + .rename({"url": "url_github"}) + .with_columns(name_collision=pl.col("name_py").is_duplicated()) + .with_columns( + url_npm=pl.concat_str( + pl.lit(_NPM_BASE_URL), + pl.col("tag"), + pl.lit(f"/{_SUB_DIR}/"), + pl.col("file_name"), + ) + ) + .collect() + ) + return df.select(*sorted(df.columns)) + + def refresh( + self, fp_tags: Path | None = None, fp_trees: Path | None = None + ) -> pl.DataFrame: + """ + Use known tags to discover and update missing trees metadata. 
+ + Aims to stay well-within API rate limits, both for authenticated ad unauthenticated users. + """ + rate_limit = self.rate_limit() + if rate_limit["is_limited"]: + raise NotImplementedError(rate_limit) + fp_tags = fp_tags or self._paths["tags"] + fp_trees = fp_trees or self._paths["trees"] + IS_AUTH = rate_limit["is_auth"] + UNAUTH_LIMIT = self.req._UNAUTH_TREES_LIMIT + + tags = ( + self._refresh_tags(fp_tags) + if IS_AUTH or rate_limit["remaining"] > UNAUTH_LIMIT + else pl.read_parquet(fp_tags) + ) + trees = pl.read_parquet(fp_trees) + + missing_trees = tags.join( + trees.select(pl.col("tag").unique()), on="tag", how="anti" + ) + if missing_trees.is_empty(): + print(f"Already up-to-date {fp_trees!s}") + return trees + else: + missing = ( + ReParsedTag(**row) + for row in islice( + missing_trees.iter_rows(named=True), + None if IS_AUTH else UNAUTH_LIMIT, + ) + ) + fresh_rows = self._trees_batched(missing) + print( + f"Finished collection.\n" + f"Writing {fresh_rows.height} new rows to {fp_trees!s}" + ) + refreshed = pl.concat((trees, fresh_rows)).pipe(_sort_sem_ver) + _write_parquet(refreshed, fp_trees, write_schema=self._write_schema) + return refreshed + + def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: + rate_limit = self.rate_limit() + if rate_limit["is_limited"]: + raise NotImplementedError(rate_limit) + elif not isinstance(tags, Sequence): + tags = tuple(tags) + req = self.req + n = len(tags) + cost = req._TREES_COST * n + if rate_limit["remaining"] < cost: + raise NotImplementedError(rate_limit, cost) + delay_secs = req._AUTH_DELAY if rate_limit["is_auth"] else req._UNAUTH_DELAY + print( + f"Collecting metadata for {n} missing releases.\n" + f"Using {delay_secs=} between requests ..." + ) + dfs: list[pl.DataFrame] = [] + for tag in tags: + time.sleep(delay_secs + random.triangular()) + dfs.append(self.trees(tag)) + return pl.concat(dfs) + + def _refresh_tags( + self, fp: Path | None = None, *, limit_new: int | None = None + ) -> pl.DataFrame: + n_new_tags: int = 0 + fp = fp or self._paths["tags"] + if not fp.exists(): + print(f"Initializing {fp!s}") + tags = self.tags(limit_new) + n_new_tags = tags.height + else: + print("Checking for new tags") + prev = pl.scan_parquet(fp) + curr_latest = self.tags(1) + if curr_latest.equals(prev.pipe(_sort_sem_ver).head(1).collect()): + print(f"Already up-to-date {fp!s}") + return prev.collect() + else: + print(f"Refreshing {fp!s}") + prev_eager = prev.collect() + tags = ( + pl.concat((self.tags(limit_new), prev_eager), how="vertical") + .unique("sha") + .pipe(_sort_sem_ver) + ) + n_new_tags = tags.height - prev_eager.height + print(f"Collected {n_new_tags} new tags") + _write_parquet(tags, fp, write_schema=self._write_schema) + return tags + + +GitHub = _GitHub( + Path(__file__).parent / "_vega_datasets_data", + name_trees="metadata_full", + name_tags="tags", + write_schema=True, +) + +####################################################################################### + + +def _tag_from(s: str, /) -> str: + # - Actual tag + # - Trees url (using ref name) + # - npm url (works w/o the `v` prefix) + trees_url = GitHub.url.TREES + if s.startswith("v"): + return s + elif s.startswith(trees_url): + return s.replace(trees_url, "") + elif s.startswith(_NPM_BASE_URL): + s, _ = s.replace(_NPM_BASE_URL, "").split("/") + return s if s.startswith("v") else f"v{s}" + else: + raise TypeError(s) def _with_sem_ver(df: pl.DataFrame, *, col_tag: str = "tag") -> pl.DataFrame: @@ -216,64 +620,9 @@ def _with_sem_ver(df: 
pl.DataFrame, *, col_tag: str = "tag") -> pl.DataFrame: ) -def request_tags_to_df(n_head: int | None, *, warn_lower: bool = False) -> pl.DataFrame: - response = _request_tags(n=n_head or _TAGS_MAX_PAGE, warn_lower=warn_lower) - return pl.DataFrame([_parse_tag(tag) for tag in response]).pipe(_with_sem_ver) - - -def _parse_tree(tree: GitHubTree, /) -> ParsedTree: - """For a single tree (file) convert to an IR with only relevant properties.""" - path = Path(tree["path"]) - return ParsedTree( - file_name=path.name, - name_js=path.stem, - name_py=_js_to_py(path.stem), - suffix=path.suffix, - size=tree["size"], - url=tree["url"], - ext_supported=is_ext_supported(path.suffix), - ) - - -def _parse_trees_response( - tree: GitHubTreesResponse, /, tag: str -) -> ParsedTreesResponse: - """For a tree response (directory of files) convert to an IR with only relevant properties.""" - return ParsedTreesResponse( - tag=tag, url=tree["url"], tree=[_parse_tree(t) for t in tree["tree"]] - ) - - -def request_trees_to_df(tag: str, /) -> pl.DataFrame: - response = _request_trees(tag) - parsed = _parse_trees_response(response, tag=tag) - df = ( - pl.DataFrame(parsed["tree"]) - .lazy() - .rename({"url": "url_github"}) - .with_columns(name_collision=pl.col("name_py").is_duplicated(), tag=pl.lit(tag)) - .with_columns( - url_npm=pl.concat_str( - pl.lit(_NPM_BASE_URL), - pl.col("tag"), - pl.lit(f"/{_SUB_DIR}/"), - pl.col("file_name"), - ) - ) - .collect() - ) - return df.select(*sorted(df.columns)) - - -def request_trees_to_df_batched(*tags: str, delay: int = 5) -> pl.DataFrame: - import random - import time - - dfs: list[pl.DataFrame] = [] - for tag in tags: - time.sleep(delay + random.triangular()) - dfs.append(request_trees_to_df(tag)) - return pl.concat(dfs) +def _sort_sem_ver(frame: _Frame, /) -> _Frame: + """Sort ``frame``, displaying in descending release order.""" + return frame.sort(_SEM_VER_FIELDS, descending=True) def _write_parquet( @@ -298,71 +647,6 @@ def _write_parquet( json.dump(schema, f, indent=2) -def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> None: - """ - Retrieve directory info for a given version ``tag``, writing to ``fp``. - - When ``write_schema``, an addtional ``...-schema.json`` file is produced - that describes the metadata columns. - """ - metadata = request_trees_to_df(tag) - _write_parquet(metadata, fp, write_schema=write_schema) - - -def collect_tags( - n_head: int | None, fp: Path, *, warn_lower: bool = False, write_schema: bool = True -): - tags = request_tags_to_df(n_head, warn_lower=warn_lower) - _write_parquet(tags, fp, write_schema=write_schema) - - -def refresh_tags(fp: Path, *, limit_new: int = 10) -> pl.DataFrame: - if fp.exists(): - print("Checking for new tags") - prev = pl.read_parquet(fp) - prev_latest = prev.sort(_SEM_VER_FIELDS, descending=True).head(1) - curr_latest = request_tags_to_df(1) - if curr_latest.equals(prev_latest): - print(f"Already up-to-date {fp!s}") - return prev - else: - # Work out how far behind? 
- print(f"Refreshing {fp!s}") - fresh = ( - pl.concat((request_tags_to_df(limit_new), prev), how="vertical") - .unique("sha") - .sort(_SEM_VER_FIELDS, descending=True) - ) - _write_parquet(fresh, fp, write_schema=True) - print(f"Collected {fresh.height - prev.height} new tags") - return fresh - else: - print(f"Initializing {fp!s}") - collect_tags(None, fp) - return pl.read_parquet(fp) - - -def url_from( - fp: Path, - *predicates: OneOrSeq[str | pl.Expr], - item: _ItemSlice = (0, "url_npm"), - **constraints: Any, -) -> str: - """Querying multi-version trees metadata for `npm` url to fetch.""" - if fp.suffix != ".parquet": - raise NotImplementedError(fp.suffix) - items = pl.scan_parquet(fp).filter(*predicates, **constraints).collect() - if items.is_empty(): - msg = f"Found no results for:\n" f"{predicates!r}\n{constraints!r}" - raise NotImplementedError(msg) - r = items.item(*item) - if _is_str(r): - return r - else: - msg = f"Expected 'str' but got {type(r).__name__!r} from {r!r}." - raise TypeError(msg) - - # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago From 31eeb2042a6cfae6c2ca95874797b61e339e41d8 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 11 Oct 2024 17:26:47 +0100 Subject: [PATCH 018/201] feat(DRAFT): Partial implement `data("name")` --- tools/vendor_datasets.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 6bb0d0216..d02ef5130 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -824,5 +824,38 @@ def __getattr__(self, name: str) -> Dataset: def __dir__(self) -> list[str]: return self.list_datasets() + # BUG: # 1.6.0 exists on GH but not npm? + def __call__( + self, + name: str, + ext: ExtSupported | None = None, + /, + tag: LiteralString | Literal["latest"] | None = None, + ): + """ + **WIP** Will be using this *instead of* attribute access. + + - Original supports this as well + - Will only be using the actual (js_name) + - Some have hyphens, others underscores + """ + constraints: dict[Literal["tag", "suffix"], str] = {} + if tag == "latest": + raise NotImplementedError(tag) + elif tag is not None: + constraints["tag"] = tag + if name.endswith(get_args(ExtSupported)): + name, suffix = name.rsplit(".", maxsplit=1) + suffix = "." 
+ suffix + else: + suffix = ext + if suffix is not None: + if not is_ext_supported(suffix): + raise TypeError(suffix) + else: + constraints["suffix"] = suffix + q = QueryTree(name_js=name, **constraints) + return GitHub.query.url_from(**q) + data = DataLoader() From 511a8455f9caa285a7220bf989f6d607a704f070 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 11 Oct 2024 17:28:01 +0100 Subject: [PATCH 019/201] fix(typing): Resolve some `mypy` errors --- tools/vendor_datasets.py | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index d02ef5130..2c0f47a90 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -10,6 +10,7 @@ import json import os import random +import sys import tempfile import time import urllib.request @@ -24,18 +25,23 @@ Callable, ClassVar, Iterable, + Iterator, Literal, NamedTuple, Sequence, - TypedDict, + cast, get_args, ) from urllib.request import urlopen import polars as pl +if sys.version_info >= (3, 14): + from typing import TypedDict +else: + from typing_extensions import TypedDict + if TYPE_CHECKING: - import sys from email.message import Message from typing import MutableMapping, TypeVar from urllib.request import OpenerDirector, Request @@ -147,8 +153,15 @@ class ParsedTree(TypedDict): tag: str -class QueryTree(ParsedTree, total=False): +class QueryTree(TypedDict, total=False): + file_name: str name_js: Required[str] + name_py: str + suffix: str + size: int + url: str + ext_supported: bool + tag: str class ParsedTreesResponse(TypedDict): @@ -501,13 +514,10 @@ def refresh( print(f"Already up-to-date {fp_trees!s}") return trees else: - missing = ( - ReParsedTag(**row) - for row in islice( - missing_trees.iter_rows(named=True), - None if IS_AUTH else UNAUTH_LIMIT, - ) + it = islice( + missing_trees.iter_rows(named=True), None if IS_AUTH else UNAUTH_LIMIT ) + missing = cast("Iterator[ReParsedTag]", it) fresh_rows = self._trees_batched(missing) print( f"Finished collection.\n" @@ -847,14 +857,16 @@ def __call__( if name.endswith(get_args(ExtSupported)): name, suffix = name.rsplit(".", maxsplit=1) suffix = "." 
+ suffix - else: - suffix = ext - if suffix is not None: if not is_ext_supported(suffix): raise TypeError(suffix) else: constraints["suffix"] = suffix - q = QueryTree(name_js=name, **constraints) + elif ext is not None: + if not is_ext_supported(ext): + raise TypeError(ext) + else: + constraints["suffix"] = ext + q = QueryTree(name_js=name, **constraints) # type: ignore[typeddict-item] return GitHub.query.url_from(**q) From a770ba9247300809cc18c6a6863cb38c0c7819f5 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 24 Oct 2024 09:27:35 +0100 Subject: [PATCH 020/201] fix(ruff): Apply `3.8` fixes https://github.com/vega/altair/actions/runs/11495437283/job/31994955413 --- tools/vendor_datasets.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 2c0f47a90..dc31cc61e 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -15,6 +15,7 @@ import time import urllib.request import warnings +from collections.abc import Iterable, Iterator, Sequence from functools import cached_property, partial from itertools import islice from pathlib import Path @@ -24,11 +25,8 @@ Any, Callable, ClassVar, - Iterable, - Iterator, Literal, NamedTuple, - Sequence, cast, get_args, ) @@ -42,8 +40,9 @@ from typing_extensions import TypedDict if TYPE_CHECKING: + from collections.abc import MutableMapping from email.message import Message - from typing import MutableMapping, TypeVar + from typing import TypeVar from urllib.request import OpenerDirector, Request if sys.version_info >= (3, 13): From 686a48599f86cffb49549d72e697c88aa4440d45 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 24 Oct 2024 09:31:28 +0100 Subject: [PATCH 021/201] docs(typing): Add `WorkInProgress` marker to `data(...)` - Still undecided exactly how this functionality should work - Need to resolve `npm` tags != `gh` tags issue as well --- tools/vendor_datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index dc31cc61e..ad8debbc5 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -61,6 +61,7 @@ _Frame = TypeVar("_Frame", pl.DataFrame, pl.LazyFrame) _PathName: TypeAlias = Literal["dir", "tags", "trees"] + WorkInProgress: TypeAlias = Any _ItemSlice: TypeAlias = ( @@ -840,7 +841,7 @@ def __call__( ext: ExtSupported | None = None, /, tag: LiteralString | Literal["latest"] | None = None, - ): + ) -> WorkInProgress: """ **WIP** Will be using this *instead of* attribute access. 
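For reference, the suffix handling performed by the draft ``data(...)`` call in the patches above can be read as the standalone sketch below. It is illustrative only and not part of any patch in this series; the helper name is invented, and ``ExtSupported`` is assumed to be the ``Literal[".csv", ".json", ".tsv"]`` alias defined near the top of ``vendor_datasets.py``.

from __future__ import annotations

from typing import Literal, get_args

ExtSupported = Literal[".csv", ".json", ".tsv"]


def split_dataset_request(name: str, ext: str | None = None) -> tuple[str, str | None]:
    """Return ``(name_js, suffix)``, preferring a suffix embedded in ``name``."""
    if name.endswith(get_args(ExtSupported)):
        # "cars.json" -> ("cars", ".json"); an embedded suffix wins over ``ext``.
        stem, end = name.rsplit(".", maxsplit=1)
        return stem, f".{end}"
    if ext is not None and ext not in get_args(ExtSupported):
        # Mirrors the ``TypeError`` raised in ``DataLoader.__call__`` for unsupported extensions.
        raise TypeError(ext)
    return name, ext


# split_dataset_request("cars.json")          -> ("cars", ".json")
# split_dataset_request("flights-2k", ".csv") -> ("flights-2k", ".csv")
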
From 0bbf2e9ec2ff2f1d79b4d4f68128625daab2d947 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 5 Nov 2024 19:42:18 +0000 Subject: [PATCH 022/201] feat(DRAFT): Add a source for available `npm` versions --- .../_vega_datasets_data/tags_npm-schema.json | 9 +++ tools/_vega_datasets_data/tags_npm.parquet | Bin 0 -> 3114 bytes tools/vendor_datasets.py | 56 +++++++++++++++++- 3 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 tools/_vega_datasets_data/tags_npm-schema.json create mode 100644 tools/_vega_datasets_data/tags_npm.parquet diff --git a/tools/_vega_datasets_data/tags_npm-schema.json b/tools/_vega_datasets_data/tags_npm-schema.json new file mode 100644 index 000000000..8de9881a0 --- /dev/null +++ b/tools/_vega_datasets_data/tags_npm-schema.json @@ -0,0 +1,9 @@ +{ + "tag": "str", + "major": "int", + "minor": "int", + "patch": "int", + "pre_release": "int", + "is_pre_release": "bool", + "v_tag": "str" +} \ No newline at end of file diff --git a/tools/_vega_datasets_data/tags_npm.parquet b/tools/_vega_datasets_data/tags_npm.parquet new file mode 100644 index 0000000000000000000000000000000000000000..38be9c271c7638490835298d6fff114c9e921a7c GIT binary patch literal 3114 zcmcIndrVVT96q-%Dim#X=XkHF6Hz;u2nCUqh3PI2d03=WMIJM}+O?1(q!28~W<*>z zgU?i>lWc@Jooqvw>9UZ>7*j^ns4+U3n^QM3G`<-7V;ZO3@7%Un1QyKZO?%Jp+}}Ch z?>pZ;=WA-8PN}`NwMf6SB`dMBtx24H;>-SBa}0yOvR&I#4rYah zZpfNk+*0s>T#UMH`63h}T!HJEFd(SJGAvWapb%+Pcx? z!7V^CCC7@Z3N2Vc9OUN!vQ46q$Bj}BL<_>R_0V`rE7K7Y@yz;}uE!Dtj0p}IMp#aH z(TkNc?ttE@+Cc$1FcmruhKXVpBP{U{k(L#fmfHm?Ee!dZ(F{r`mr;>`$$%J}^mySA zqxRvQ4*Ct8MU|?K;?ZX+* z@bF~io`5z2W9@pc&QjG4^|G|kkDbOIHkjr+AO5noKhIKk@7Q0L(>%3q?moY^-)C7R zt3Au=kdosfjnhw9)@bEmSp_&sj}=3!(ryH?#NvkdV5RH+ck9+Y`f~=ATZhwzlb2F5 zaE&CdzV4M|j_ z#37=(oe0%$5O0j1sd7CywdnD{)qEF2scNv!hpH(=E^{*-h3b?&2>C&F!t3}xqeEgT zx@PxBPZ$PN*v`39ws^0U8 z<5L$S4$C-pVo7EG(*?6hsitodmBj-CLj3KIKO1Hz9gJCmin>+G<~vROeXgPfcb4q2 zt{{(hEZty~9XJFYl6vKvwnKS(QfaWid~9{{n6EM%Lt1BF^h- z9P4&Dke?tVWQM$hV~1?OH|^O;2-4o;G)CIHiN{EL_asf5+_f1%Wf-6FUZsGrTRTrq zO-_sOX$CkWPE8I=ajhqG%JFgU<1GQW$s6|t;!Q*GBwXqlJ}?EJ6CPs`*O(%)mzP)J zi3ls^hy;(GgY;@BfhEv)TctOq{^E&+BkYcS4-1?_nDx|#c0;>gl?R+SI5k(N*)35& z)6WBGS0fP3}sEEhl@a4ft0=(TEIM#7!oLhI}S08%QQuV4<|lq&f(zrwt!V*QXUi$O==_ zC>4Ov(|&PEnk{>Nnyn}~%UT?@!p*-{kV3do;($sk{tN%*P$X6u38Ub$@Clhzc#B7$ zc&jOSX*taZ`uyMc$&nRBWwABS5#L(qi~k??i?D@KjMm&W{A-An8Om*i_DZe}ymaIT Mt%ctS1N=+)2T| TypeIs[str]: @@ -142,6 +144,30 @@ class GitHubTreesResponse(TypedDict): truncated: bool +class NpmVersion(TypedDict): + version: str + links: dict[Literal["self", "entrypoints", "stats"], str] + + +class NpmPackageMetadataResponse(TypedDict): + """ + Response from `Get package metadata`_. + + Using: + + headers={"Accept": "application/json"} + + .. _Get package metadata: + https://data.jsdelivr.com/v1/packages/npm/vega-datasets + """ + + type: str + name: str + tags: dict[Literal["canary", "next", "latest"], str] + versions: list[NpmVersion] + links: dict[Literal["stats"], str] + + class ParsedTree(TypedDict): file_name: str name_js: str @@ -589,6 +615,31 @@ def _refresh_tags( ####################################################################################### +def _npm_metadata(*args: WorkInProgress) -> pl.DataFrame: + """ + Request, parse npm tags metadata. 
+ + Notes + ----- + - Ignores canary releases + - Github tag is stored as `"v_tag"` + - npm tag is `"tag"` + """ + req = urllib.request.Request( + _NPM_METADATA_URL, headers={"Accept": "application/json"} + ) + with urllib.request.urlopen(req) as response: + content: NpmPackageMetadataResponse = json.load(response) + versions = [ + v["version"] for v in content["versions"] if _CANARY not in v["version"] + ] + return ( + pl.DataFrame({"tag": versions}) + .pipe(_with_sem_ver) + .with_columns(v_tag=pl.concat_str(pl.lit("v"), pl.col("tag"))) + ) + + def _tag_from(s: str, /) -> str: # - Actual tag # - Trees url (using ref name) @@ -614,10 +665,10 @@ def _with_sem_ver(df: pl.DataFrame, *, col_tag: str = "tag") -> pl.DataFrame: """ fields = pl.col(_SEM_VER_FIELDS) pattern = r"""(?x) - v(?[[:digit:]]*)\. + v?(?[[:digit:]]*)\. (?[[:digit:]]*)\. (?[[:digit:]]*) - (\-next\.)? + (\-(next)?(beta)?\.)? (?[[:digit:]]*)? """ sem_ver = pl.col(col_tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) @@ -835,6 +886,7 @@ def __dir__(self) -> list[str]: return self.list_datasets() # BUG: # 1.6.0 exists on GH but not npm? + # https://www.jsdelivr.com/docs/data.jsdelivr.com#overview def __call__( self, name: str, From 9c386e26515b23b0bccbe5505ed9c9bbcb05b96c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 10:41:23 +0000 Subject: [PATCH 023/201] refactor: Bake `"v"` prefix into `tags_npm` --- .../_vega_datasets_data/tags_npm-schema.json | 3 +-- tools/_vega_datasets_data/tags_npm.parquet | Bin 3114 -> 2596 bytes tools/vendor_datasets.py | 16 ++++++++-------- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/tools/_vega_datasets_data/tags_npm-schema.json b/tools/_vega_datasets_data/tags_npm-schema.json index 8de9881a0..90ea9d52e 100644 --- a/tools/_vega_datasets_data/tags_npm-schema.json +++ b/tools/_vega_datasets_data/tags_npm-schema.json @@ -4,6 +4,5 @@ "minor": "int", "patch": "int", "pre_release": "int", - "is_pre_release": "bool", - "v_tag": "str" + "is_pre_release": "bool" } \ No newline at end of file diff --git a/tools/_vega_datasets_data/tags_npm.parquet b/tools/_vega_datasets_data/tags_npm.parquet index 38be9c271c7638490835298d6fff114c9e921a7c..d2e9a34b78eef3da66b7b70e82ed4a6dcf0a5502 100644 GIT binary patch delta 834 zcmaJ=O=uHA6rN2s+ikKT*6a+sWoxmAP!RK1tyP-f#G0f<+a{?^Q@kV_n^HFho7#pb z>dAwkmjMwefL^$SLDORvT8sv*Yo{0f@bQldvtI^=uU|8!~Wf9@7UIt804b^?+64Tq#I=0g_{S* zmW_a;-O#N-OEj>epGMLVUHgAK zkzgbo3?oDys8}OgJ$M8K+LD3A1GlSMT&gYG<7f$_Ed=b=bsu@IVnTlO;5Z6=>o%C> z= zT|XW`+pJ6O{z(>yF0?l+p{7W+Ksd~nbld(0uS~1 ze6l&8VIgljoZkettsV`Hv#!)gV_u;81HCs404xK d$s3f$Vn~ywYgTc&u3H8|e|_kah!FfZzW{02$58+P delta 1411 zcmbtUZAep57(REqyAyLZt8=<{-9QV6LfNKPla_IuwiT9}MAix=x}-Od)|-dc|W+fdr?#j zIdQ2O^6*$9yrwFKK?nfIxVLJuvW?rqgVN(EDYw2h+zZ-ntk_%nJU0>^i3v{{^}^N5 zeUIhv*oL8!_iZ}|@RM-KuDdsb-v(C-o!%F%(cVd4-&2Fs=jk~8?1G5{Y1STn|J%Dl z7c?$SEv#&R)ZJy#n?9YcF1B@EaToUYM%13icfOoYcTRj3V`m-L-I`~wCObB2DY~xnY3iVKcg1iP~XtJ~NvP`0I_G!h!ck|F0-k4zh%}gv@t9pXBk^m=? 
z*W3ejSZgI|Dv|W4I@Ga5EBQcmm--YJCLh)Hx{>)Tnd3x`j^H+rIE$uf_rWS3OhQ34 zoP;j%#%n1oL};4g)i#;4%%&_808)xd5{0RoBx=Y${vz4Q%e!PWpn?M&qJWZ&ZfRi4 zv>Hx;kK*Qzc`y>!M|AMyPklbDn+_je2wy2?sFKoXlBFZY3%m@Iq}4#sOQJg7#UF+gZvjOmD57ozbAd4`$RxrU;|8#8e*>b|!F985J&EunE&d`YysY#jeC* z>};V)7Mf3849Zcyr^ddqo`ooQWwQYp?wkM%Ss8YJizla`x!j%CT)sW%-{INn$C#W+ zaD|E)iPy2&@5$NhueQ5uz1d|`q%BJQieO~430Cu8IJeOz@{~v#MUM~LTx=-5>81*5 zYwTMC%%e1=;D=^R>Hi4q=7#G1dCfBk{(&E2(Kx<_e{^6s*H-;bFncR=Ub8~!D6L&t ceh9AzqTmeps~QhTN9Z>>`vHeF0I)yr4{sECDF6Tf diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 342575da9..abc90629f 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -622,8 +622,10 @@ def _npm_metadata(*args: WorkInProgress) -> pl.DataFrame: Notes ----- - Ignores canary releases - - Github tag is stored as `"v_tag"` - - npm tag is `"tag"` + - ``npm`` can accept either, but this endpoint returns without "v": + + {tag} + v{tag} """ req = urllib.request.Request( _NPM_METADATA_URL, headers={"Accept": "application/json"} @@ -631,13 +633,11 @@ def _npm_metadata(*args: WorkInProgress) -> pl.DataFrame: with urllib.request.urlopen(req) as response: content: NpmPackageMetadataResponse = json.load(response) versions = [ - v["version"] for v in content["versions"] if _CANARY not in v["version"] + f"v{version}" + for v in content["versions"] + if (version := v["version"]) and _CANARY not in version ] - return ( - pl.DataFrame({"tag": versions}) - .pipe(_with_sem_ver) - .with_columns(v_tag=pl.concat_str(pl.lit("v"), pl.col("tag"))) - ) + return pl.DataFrame({"tag": versions}).pipe(_with_sem_ver) def _tag_from(s: str, /) -> str: From 1937f2b74df00d2a649ec52879e90ef2fe469cbc Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 11:31:53 +0000 Subject: [PATCH 024/201] refactor: Move `_npm_metadata` into a class --- tools/vendor_datasets.py | 87 +++++++++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 23 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index abc90629f..ed094d0c0 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -90,6 +90,11 @@ class GitHubUrl(NamedTuple): TREES: LiteralString +class NpmUrl(NamedTuple): + CDN: LiteralString + TAGS: LiteralString + + class GitHubTag(TypedDict): name: str node_id: str @@ -446,6 +451,8 @@ def __init__( *, write_schema: bool, base_url: LiteralString = "https://api.github.com/", + org: LiteralString = "vega", + package: LiteralString = "vega-datasets", ) -> None: # When ``write_schema``, addtional ``...-schema.json`` file(s) are produced # that describes column types - in a non-binary format. @@ -456,7 +463,7 @@ def __init__( "tags": output_dir / f"{name_tags}.parquet", "trees": output_dir / f"{name_trees}.parquet", } - repo = f"{base_url}repos/vega/vega-datasets/" + repo = f"{base_url}repos/{org}/{package}/" self._url = GitHubUrl( BASE=base_url, RATE=f"{base_url}rate_limit", @@ -605,8 +612,10 @@ def _refresh_tags( return tags +_root_dir: Path = Path(__file__).parent + GitHub = _GitHub( - Path(__file__).parent / "_vega_datasets_data", + _root_dir / "_vega_datasets_data", name_trees="metadata_full", name_tags="tags", write_schema=True, @@ -615,29 +624,61 @@ def _refresh_tags( ####################################################################################### -def _npm_metadata(*args: WorkInProgress) -> pl.DataFrame: - """ - Request, parse npm tags metadata. 
+class _Npm: + def __init__( + self, + output_dir: Path, + name_tags: str, + *, + write_schema: bool, + jsdelivr: Literal["jsdelivr"] = "jsdelivr", + npm: Literal["npm"] = "npm", + package: LiteralString = "vega-datasets", + jsdelivr_version: LiteralString = "v1", + ) -> None: + self._write_schema: bool = write_schema + output_dir.mkdir(exist_ok=True) + self._paths: dict[Literal["tags"], Path] = { + "tags": output_dir / f"{name_tags}.parquet" + } + self._url: NpmUrl = NpmUrl( + CDN=f"https://cdn.{jsdelivr}.net/{npm}/{package}@", + TAGS=f"https://data.{jsdelivr}.com/{jsdelivr_version}/packages/{npm}/{package}", + ) - Notes - ----- - - Ignores canary releases - - ``npm`` can accept either, but this endpoint returns without "v": + @property + def url(self) -> NpmUrl: + return self._url - {tag} - v{tag} - """ - req = urllib.request.Request( - _NPM_METADATA_URL, headers={"Accept": "application/json"} - ) - with urllib.request.urlopen(req) as response: - content: NpmPackageMetadataResponse = json.load(response) - versions = [ - f"v{version}" - for v in content["versions"] - if (version := v["version"]) and _CANARY not in version - ] - return pl.DataFrame({"tag": versions}).pipe(_with_sem_ver) + def tags(self) -> pl.DataFrame: + """ + Request, parse tags from `Get package metadata`_. + + Notes + ----- + - Ignores canary releases + - ``npm`` can accept either, but this endpoint returns without "v": + + {tag} + v{tag} + + .. _Get package metadata: + https://www.jsdelivr.com/docs/data.jsdelivr.com#get-/v1/packages/npm/-package- + """ + req = urllib.request.Request( + self.url.TAGS, headers={"Accept": "application/json"} + ) + with urllib.request.urlopen(req) as response: + content: NpmPackageMetadataResponse = json.load(response) + versions = [ + f"v{tag}" + for v in content["versions"] + if (tag := v["version"]) and _CANARY not in tag + ] + return pl.DataFrame({"tag": versions}).pipe(_with_sem_ver) + + +Npm = _Npm(_root_dir / "_vega_datasets_data", name_tags="tags_npm", write_schema=True) def _tag_from(s: str, /) -> str: From 66fa6d15cd967a25752e35814a6c3f03ea771487 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 11:35:11 +0000 Subject: [PATCH 025/201] chore: Remove unused, add todo --- tools/vendor_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index ed094d0c0..048ff8771 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -70,7 +70,6 @@ """Query result scalar selection.""" _NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" -_NPM_METADATA_URL = "https://data.jsdelivr.com/v1/packages/npm/vega-datasets" _SUB_DIR = "data" _SEM_VER_FIELDS: tuple[ Literal["major"], Literal["minor"], Literal["patch"], Literal["pre_release"] @@ -595,6 +594,7 @@ def _refresh_tags( print("Checking for new tags") prev = pl.scan_parquet(fp) curr_latest = self.tags(1) + # TODO: Needs a hook for `_npm_metadata()` if curr_latest.equals(prev.pipe(_sort_sem_ver).head(1).collect()): print(f"Already up-to-date {fp!s}") return prev.collect() From 21b2edd0ee1c55ab09e8a31535a3a15f5ab55720 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:19:24 +0000 Subject: [PATCH 026/201] feat: Adds `app` context for github<->npm --- tools/_vega_datasets_data/tags.parquet | Bin 6210 -> 6200 bytes tools/vendor_datasets.py | 222 ++++++++++++++----------- 2 files changed, 127 insertions(+), 95 deletions(-) 
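Before the diff below, a quick orientation: the request performed by ``_Npm.tags()`` (added earlier in this series) reduces to the following sketch. It is illustrative only; the endpoint and ``Accept`` header come from the docstrings above, the canary check is assumed to be a plain substring test (the module uses a ``_CANARY`` constant for this), and the real method returns a polars DataFrame piped through ``_with_sem_ver`` rather than a list.

import json
import urllib.request

TAGS_URL = "https://data.jsdelivr.com/v1/packages/npm/vega-datasets"


def npm_version_tags(url: str = TAGS_URL) -> list[str]:
    """Fetch non-canary vega-datasets versions, prefixed with "v" to line up with GitHub tags."""
    req = urllib.request.Request(url, headers={"Accept": "application/json"})
    with urllib.request.urlopen(req) as response:
        content = json.load(response)
    return [
        f"v{v['version']}"
        for v in content["versions"]
        if "canary" not in v["version"]  # assumed value of the ``_CANARY`` check
    ]
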
diff --git a/tools/_vega_datasets_data/tags.parquet b/tools/_vega_datasets_data/tags.parquet index dc0ff652ed261eebeed70ead42c0f7352ea4e8c3..1cd7b957b3ce87cfa8e80b05548f72c9133b5d1c 100644 GIT binary patch delta 3091 zcmc&%do)!07eB`ghGxbL&M`B_GmY06GhRLDW+?*0Aj+?jG;+I< zaMJ^IM-wU1NECW_Cz0~#Mfg#-x!ve@_|?rH-QW5>{`j4>&RTnY*Jq#a-fQpu*`ITh zS-})}u-CI7G*qMvje{JB3jqLNQf4{D>j0urfcgqNK{6cWMe2|4b!)WI9v%-uU1Xb9 zdoJz3-T!^(ON$pu#f$)J7CT?_L$8+4gpTf|)?ZFtjy6(+0LTa0N=Kp~WDxa4`gA@B zC?ElGK_#4{K6W{u>RLeO3^;fgxp9TQK4(HVz1YJuIUEq28&ouN4q?f*fD(aBLPhbn zKvqAW4lq&LVZ~U?V3L$~uox^Rg9!jg+T&EH6F)cYMM^jwz$_>ehnI*7 zQAQ$CT1itxEnl^(1Ly$SfW)}9&q5uBR|+%^*rB(dp7#Oje02Er{hr4w>KD=JaD#3^ zorj07%`3CetC1-sAujmijXf#%clV!P9{*vxX$iaiVE01%$H5T|thiVjxyx**aW6I7 zFgY-9C7x2*sW#Y1rE#~Uw%^IKy{RfJeZ&jtm zJ+G79#?I>fa6Tlrko<0wE8Y9j#Us{H0FboaHjyJeyECWNCsppXG_*2%$ZLmmiHN(Y zC@Q5cbEH}2#e@EdfS+x&Jnv*l7TXxc-yR12lQIj}*)MzFojck&wEz3nr|@j|Reyx$ z7*!c--uxrb;Qi9Rv4Z@v4H{Tu-Y;2^gaZ_-xQ2Y;8zvNcA?PyE`PC|d?<1$3!F_R# zS(6xTJ)HY#G5_%Mj@H6$=X5DMtC~_A|3AV5 zYUo=3LvzA4lJ9|M-0vHmo)UC_ZnO^ER#T@L80hSN%VDbcu7Y?BlW8=axvJ9bP3hE< zt#FI(uYSz_^R`Rb1LnlDqILZ|<4WO*oboVnuQEksx-$ON79A@)^y}KH@u$`gxFdvZ zORSGXgbyCOJ(~Zd!p!jY1N5Qa)aD<&A zjFpvSm)A70du0`Inl;O`FFFpM$v+R59eb#yWy%UzXV=d>95yZb`_-ZNV9NED(&09T@4dm)H5DXUd}82G{rCeKX2ay!G;Lscad`me=!R z5+VBv@jVP_t+-|Dx{=Wg|DK>^(Bd3%gkY0Qb_qJfx}zr1Y6bN_ssZSo8``KJ2h_|J z3>63pdq92t1fupR#>8A1NF@S*e_m0GACaN@@}Jd|MmT83^+Sf0_8N<7+Xv`XE|aLn#4&;xV>_jhn(otTR+ikwBu(_ z+O*-kX#U!cwsy#=DAOH(13TTR+o$*<#iv77A;=eFYOz1s%y1yl78>p3n#(VqtPij`D(nU)p<3d1Q1JEgbK~mahxaL3u4tx z#uZTnBOW*`bKHwkL{mNTP)Y<|ULG5TLb$H0D7U=#-h^4@_c~?_uWL62t9QIip4_s1 z*Co!0gQfv%x~LT5je&Owmh8sXb0<&j4qY2FJW)$8T)SWY?CB{xcl9S4s?YXFkNlB4 z7Sqzau~NJt8l_$gF{rv6_N2u@ziC%;TDp>P zYXfuTkH7WoLKU2+LU6Qxbhb`}KNSA2S zgMAH$e&4BbsJ_x`Vw2|J!`rA|mSfKUoAf{viT=OzVEwP@p+&U++XNw;Ll9x!Q+FYR zBtBvjYYCM3nIC0Z`%rAGv;5PiDNqknTQ6Y6vq{ckoxL?YR>^}>5if+kuTH>QT5A8k zB{7Z9qLY+rr!Cz`x+)%+Y|I?m5Vt6Hdi?jip%N7#6h`_-M}*9>$GMfTM3W{HYtMTJ za>Ycfg&`LNjX)-{us`p2#Zdm%u!vc30SH%sv_>%xtLub<{5TNvi-gga*@D1Cjgrcd zYb;gAah$Q|CsCoNi+t1OH4CCR2F_@SvQ_{%jzddvNGcr24-p>X2CT(0E~tpRecTCS zkW6seM*w?%yij(`-E@B3#nM7`tCX3uIe`Rk`y9wMG+d-W8(}$!o3SLZ8;*{|9*eWE zmW|Vk;J1o164!>K}wJk)s7=3 zru`8vWT$MV9qv^SKoy`2xRV z3my4lzg_qzWKfJJW(l}tD)|uYE{ggeiZQs$2_nG-_Ru1L)(y?^{?<;hgvV#l-S$v5 z2++pOzrz2rmF}Cvsm_Sc@ukoi6hKR2QRfu#8{I^q!+FjS3|Bis-vP8No4N3B*PITQ z&s^|v)}hO8j>BJ+*&Dt;<8X~VbHnJOuMYFVBZU4D!Vn=pQaIZkbBb<)L25Spkp8p) z0De9bt9ZIjO}uy{uLf*?|T1oUwRRPF>B0`PA;VJ-~=k29q*%~%x1SQ!9j H{sjI31)iZf delta 3230 zcmai%3p7-F7r@V%85(9VgLBMikVk`%NAe~ODvydNVm!tpkLwZTk}i`+5-E&VqDIM^ zyoxKYh%5=ohe95=yi*=UCB7NA+r8_4_xrxH);VXdb@tx>z5i?P-+!MPrDi4AeE%_J z2!LYHP!vKQqOwFzh?A5ALWyM@bSD(piUg93aCqjs%sitMrgZ9IlflNTGj_xQJ(+@? 
zrH%`cNw#0_?OvTwywOh&j9;h;i=UyVeQurPRxXhaJ#L?-(rE_ToQhV+$B*^<&xw>l zxym>ksLbh-qMSAWv}tj*{sdMgDm>A9d^}tA(X7>qf?kqL`V9@+7-0y)5_bZeL;~_G z%^kwRRCo{#_aPF$5wQR?EC!|GwgP6nLOIi&7C3o!03iYEym%&*gOs{&C0-bN+q|Uk z;O5XgFb&KAi3K_?lQ7jxeb4p?iOCs9H#mGzwl3aX#MrykO8lpb=I9vNH-)2c}6AlqhG@Hs@&19r}F@5hLu!ht4z90~+i6it4KuCP1t*z;@Z z6<_Mj&)ub@wWODirW1?WswHnkvt3PWYR+h)-C5ta|60YW?!rY*CZd z40oLDOL4)jTRcx0ZoO2qG=*wSRf((Zb3zVBI(qZ7Zk_kB2*ND;rS|42w##}p(&HIX zLKi8)qw=W6sjv2`RsCLF&D3e;+g~Ua2Czeb$H}j`r30}Cw4=;Jl5o_QyX&g_HxQVR zsQ7C)1!i2+w=H|k9IIxEhUmP0+iWOWJTLiVP|bRyHW zF3X{Z=lR`n^OU>uR~2>eyMAu|*_5uII%c*a;;|$OKlL?Pn$Ic4k2glb?kctlu_L-m zaoU87>cwG`ueihru65n=(>v`oTWk6EiCqsZvLNWT3RcUl8}s@+G|)1RWH^TgM$lVt zxCgKDq_@RTip0wbH#2rm=SdAQUQe{lUQfVH&}%qSnBH{P->&P67+0BBiqre}QW{X7 zQZ7l=K1)2W({^jmXkT>#zuY6q`G(*nNvw2o)$T;N`^i&(rPn1%Q@0t8V*7ac#dE%# zJ45Sl8^kZB(-UpYK0mUieC2zeZ2a)BLdCa`9{Jlw4y}783icXtwb@9BMunP;Q@B6k zmsL=nK|iN7HpQ3o#=l9w+MiZV59%ZEw?sbIu{0PRKYOY--SSgZSkFY}2Ju7QSh ztC8>ErDseEb>!0CT47gV)HObBX|9s1HQzh!_1J#~BaW7C9=!;dwl{2C2gsMcMGM zC|6|+RpEc<$0Aoh-N_bzQ3A*a9ODx{ZET{|zJt#*$8ShCfA*+rhLr5dfNpUw#qOmwQLUq+`(F=BQ5J){z7NHz3Y{hX6M9taa%0Mzl<2Ay{X#P?H#Ay zTqx+YpRSW;aU)MGjm?ex(8|gzavG^6&pj!;28Udfm24I%jJRH++kol9loy8k zw^F3>(L~=fEO2B`QDCQoKqbYFbT=7y$$qK4Ax zqhm~F(^wPFioP9SG=>o*>_})5kkJ4@y)tgW(@sDHl2fWDA_BznlSqPP0M4oDLO6h6 zWTMIR#=<&=IYuSye2OOSuQdt2xW)a^04* zLo2?!5v0o39w18l?argiXH`0Vp4a7=j+pDJeh5y(2x!{gvm9(eKNz=nRo8Wwd~^pn z)q(M!@@_oiZj|r~M%Y~Fj;V6yF4L1}QnRGxxwcM)n)K#Q@Ak{-dGr*Hk%%V|U52Cn z7ZM$ZJ)#(kh-7+LG#9IH2k?t&ViNb$m!4afeY+IY zku#F}X)sJ=Z=A<9Yem9hU&c-5VoJ=5Mj8Jy#z=+4k)i6dBFRcoeKVtOhYZ;*%;&$| zB|h{`T3Elhivd%w6T_#sY}U#5J-l@kioo4PH4O{m_`cW1&v??S~bqt_&x?_GjU{JU7l6QW*ZL z(X1A3rt{8Ri27Qe8u&sjSTlb+A?=#PntKZrGpwrmuiFonzD!8$jF#^>Kx$ijwyCIB zJ*;^6j`|6nf?u4w*Bl$?mclFj_&ey3d~PH?{o|l_mj|vh#HSGFu9#EZoh|p}eb0jr zvy#agenfx`0)b-n_V%r@ zfa`34o7)-3zyR;97GE-hmjuZ%jBx%g##@XuNM-O~6(p!2Ce)Z^Hsx5%4B zkPhs}QUI#mOfp_?VDK1M7E;U=9OFavKrE)1K@KdUFzT=hx>S&Zhb!CCMD}j-v33Hu z{=R;-!NpE4OQm^=WhG+SPWVeip1$8Bs^x&|5abm`gMctYpGTgRf)9^^0u@AZp_>ky z04V(?SpWx+{*ujU+F2Kmt!$=2cn~CKFrM&eb5kMQ))=Q|#V{9ywYGBFfjJ2P9IOO6 z^dTJpK>k=Cb2`AyF~H5+jdsRu%Tp2v7lY(sb)DbMxmcNrLQ3q+&hv`Nu`>I^2?P)! z)_BJ2dB*YYKCIwV6!1SKfsyLe#_R0e*RC-y=Kf5=xgeyD6{LZ{do5Q3O_2o>PxcNW3{5Oybnh^LED zi6r6`@-Tw*<6!vPNJ)@|mnaYn*J(o<067QxQ}G{95XMmfPMH;Lr~w4XU*vu$|JRD_ z-o-S42ylfil#2w&vC5<$yZ?{%D9i)b7_g!l)Q9u{av39AK#+?H|5+W|>0NAhsr%WV zm(>2^?LX>auZ|iZgGRA7hJ6vxzrWc}c#`d-Z_7uF=AS _GitHubQueryNamespace: def url(self) -> GitHubUrl: return self._url - def rate_limit(self) -> ParsedRateLimit: - return self.parse.rate_limit(self.req.rate_limit()) + def rate_limit(self, *, strict: bool = False) -> ParsedRateLimit: + limit = self.parse.rate_limit(self.req.rate_limit()) + if strict and limit["is_limited"]: + raise NotImplementedError(limit) + return limit - def tags(self, n_head: int | None, *, warn_lower: bool = False) -> pl.DataFrame: + def tags( + self, n_head: int | None = None, *, warn_lower: bool = False + ) -> pl.DataFrame: tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) return pl.DataFrame(self.parse.tags(tags)).pipe(_with_sem_ver) @@ -516,48 +521,65 @@ def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: ) return df.select(*sorted(df.columns)) - def refresh( - self, fp_tags: Path | None = None, fp_trees: Path | None = None - ) -> pl.DataFrame: + def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: """ Use known tags to discover and update missing trees metadata. Aims to stay well-within API rate limits, both for authenticated ad unauthenticated users. 
""" - rate_limit = self.rate_limit() - if rate_limit["is_limited"]: - raise NotImplementedError(rate_limit) - fp_tags = fp_tags or self._paths["tags"] - fp_trees = fp_trees or self._paths["trees"] - IS_AUTH = rate_limit["is_auth"] - UNAUTH_LIMIT = self.req._UNAUTH_TREES_LIMIT - - tags = ( - self._refresh_tags(fp_tags) - if IS_AUTH or rate_limit["remaining"] > UNAUTH_LIMIT - else pl.read_parquet(fp_tags) - ) - trees = pl.read_parquet(fp_trees) - - missing_trees = tags.join( + rate_limit = self.rate_limit(strict=True) + fp = self._paths["trees"] + trees = pl.read_parquet(fp) + missing_trees = gh_tags.join( trees.select(pl.col("tag").unique()), on="tag", how="anti" ) if missing_trees.is_empty(): - print(f"Already up-to-date {fp_trees!s}") + print(f"Already up-to-date {fp!s}") return trees else: - it = islice( - missing_trees.iter_rows(named=True), None if IS_AUTH else UNAUTH_LIMIT - ) + stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT + it = islice(missing_trees.iter_rows(named=True), stop) missing = cast("Iterator[ReParsedTag]", it) fresh_rows = self._trees_batched(missing) print( f"Finished collection.\n" - f"Writing {fresh_rows.height} new rows to {fp_trees!s}" + f"Writing {fresh_rows.height} new rows to {fp!s}" + ) + return pl.concat((trees, fresh_rows)).pipe(_sort_sem_ver) + + def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: + limit = self.rate_limit(strict=True) + npm_tag_only = npm_tags.lazy().select("tag") + fp = self._paths["tags"] + if not limit["is_auth"] and limit["remaining"] <= self.req._TAGS_COST: + return ( + pl.scan_parquet(fp).join(npm_tag_only, on="tag", how="inner").collect() + ) + elif not fp.exists(): + print(f"Initializing {fp!s}") + tags = ( + self.tags().lazy().join(npm_tag_only, on="tag", how="inner").collect() + ) + print(f"Collected {tags.height} new tags") + return tags + else: + print("Checking for new tags") + prev = pl.scan_parquet(fp) + latest = ( + self.tags(1).lazy().join(npm_tag_only, on="tag", how="inner").collect() + ) + if latest.equals(prev.pipe(_sort_sem_ver).head(1).collect()): + print(f"Already up-to-date {fp!s}") + return prev.collect() + print(f"Refreshing {fp!s}") + prev_eager = prev.collect() + tags = ( + pl.concat((self.tags(), prev_eager), how="vertical") + .unique("sha") + .pipe(_sort_sem_ver) ) - refreshed = pl.concat((trees, fresh_rows)).pipe(_sort_sem_ver) - _write_parquet(refreshed, fp_trees, write_schema=self._write_schema) - return refreshed + print(f"Collected {tags.height - prev_eager.height} new tags") + return tags def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: rate_limit = self.rate_limit() @@ -581,45 +603,6 @@ def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: dfs.append(self.trees(tag)) return pl.concat(dfs) - def _refresh_tags( - self, fp: Path | None = None, *, limit_new: int | None = None - ) -> pl.DataFrame: - n_new_tags: int = 0 - fp = fp or self._paths["tags"] - if not fp.exists(): - print(f"Initializing {fp!s}") - tags = self.tags(limit_new) - n_new_tags = tags.height - else: - print("Checking for new tags") - prev = pl.scan_parquet(fp) - curr_latest = self.tags(1) - # TODO: Needs a hook for `_npm_metadata()` - if curr_latest.equals(prev.pipe(_sort_sem_ver).head(1).collect()): - print(f"Already up-to-date {fp!s}") - return prev.collect() - else: - print(f"Refreshing {fp!s}") - prev_eager = prev.collect() - tags = ( - pl.concat((self.tags(limit_new), prev_eager), how="vertical") - .unique("sha") - .pipe(_sort_sem_ver) - ) - 
n_new_tags = tags.height - prev_eager.height - print(f"Collected {n_new_tags} new tags") - _write_parquet(tags, fp, write_schema=self._write_schema) - return tags - - -_root_dir: Path = Path(__file__).parent - -GitHub = _GitHub( - _root_dir / "_vega_datasets_data", - name_trees="metadata_full", - name_tags="tags", - write_schema=True, -) ####################################################################################### @@ -678,14 +661,85 @@ def tags(self) -> pl.DataFrame: return pl.DataFrame({"tag": versions}).pipe(_with_sem_ver) -Npm = _Npm(_root_dir / "_vega_datasets_data", name_tags="tags_npm", write_schema=True) +class Application: + """ + Top-level context. + + When ``write_schema``, addtional ``...-schema.json`` files are produced + that describes the metadata columns. + """ + + def __init__( + self, + output_dir: Path, + *, + write_schema: bool, + trees_gh: str = "metadata_full", + tags_gh: str = "tags", + tags_npm: str = "tags_npm", + kwds_gh: Mapping[str, Any] | None = None, + kwds_npm: Mapping[str, Any] | None = None, + ) -> None: + output_dir.mkdir(exist_ok=True) + kwds_gh = kwds_gh or {} + kwds_npm = kwds_npm or {} + self._write_schema: bool = write_schema + self._github: _GitHub = _GitHub( + output_dir, + name_tags=tags_gh, + name_trees=trees_gh, + write_schema=write_schema, + **kwds_gh, + ) + self._npm: _Npm = _Npm( + output_dir, + name_tags=tags_npm, + write_schema=write_schema, + **kwds_npm, + ) + + @property + def github(self) -> _GitHub: + return self._github + + @property + def npm(self) -> _Npm: + return self._npm + + def refresh(self) -> pl.DataFrame: + npm_tags = self.npm.tags() + self.write_parquet(npm_tags, self.npm._paths["tags"]) + + gh_tags = self.github.refresh_tags(npm_tags) + self.write_parquet(gh_tags, self.github._paths["tags"]) + + gh_trees = self.github.refresh_trees(gh_tags) + self.write_parquet(gh_trees, self.github._paths["trees"]) + return gh_trees + + def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: + """Write ``frame`` to ``fp``, with some extra safety.""" + if not fp.exists(): + fp.touch() + df = frame.lazy().collect() + df.write_parquet(fp, compression="zstd", compression_level=17) + if self._write_schema: + schema = {name: tp.__name__ for name, tp in df.schema.to_python().items()} + fp_schema = fp.with_name(f"{fp.stem}-schema.json") + if not fp_schema.exists(): + fp_schema.touch() + with fp_schema.open("w") as f: + json.dump(schema, f, indent=2) + + +app = Application(Path(__file__).parent / "_vega_datasets_data", write_schema=True) def _tag_from(s: str, /) -> str: # - Actual tag # - Trees url (using ref name) # - npm url (works w/o the `v` prefix) - trees_url = GitHub.url.TREES + trees_url = app.github.url.TREES if s.startswith("v"): return s elif s.startswith(trees_url): @@ -727,28 +781,6 @@ def _sort_sem_ver(frame: _Frame, /) -> _Frame: return frame.sort(_SEM_VER_FIELDS, descending=True) -def _write_parquet( - frame: pl.DataFrame | pl.LazyFrame, fp: Path, /, *, write_schema: bool -) -> None: - """ - Write ``frame`` to ``fp``, with some extra safety. - - When ``write_schema``, an addtional ``...-schema.json`` file is produced - that describes the metadata columns. 
- """ - if not fp.exists(): - fp.touch() - df = frame.lazy().collect() - df.write_parquet(fp, compression="zstd", compression_level=17) - if write_schema: - schema = {name: tp.__name__ for name, tp in df.schema.to_python().items()} - fp_schema = fp.with_name(f"{fp.stem}-schema.json") - if not fp_schema.exists(): - fp_schema.touch() - with fp_schema.open("w") as f: - json.dump(schema, f, indent=2) - - # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago @@ -960,7 +992,7 @@ def __call__( else: constraints["suffix"] = ext q = QueryTree(name_js=name, **constraints) # type: ignore[typeddict-item] - return GitHub.query.url_from(**q) + return app.github.query.url_from(**q) data = DataLoader() From 6527305cc5d82f54c529faafeceb90ca301b1e73 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:24:36 +0000 Subject: [PATCH 027/201] fix: Invalidate old trees --- .../_vega_datasets_data/metadata_full.parquet | Bin 21362 -> 20768 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/tools/_vega_datasets_data/metadata_full.parquet b/tools/_vega_datasets_data/metadata_full.parquet index 7a4e691cb414735738f276950d79e8c72c5f4b48..071e4bd6cf68fcc17952c5057858fa29399c9415 100644 GIT binary patch delta 9809 zcmdUVby!qi+wKG~bPpXvH_|22-6aBwGz!w)Lzi?bNH>BYsC0LiB1$)cG%|xQ9E|Vx zzUO@3`ObCzIDedV?Q8F~)?WA8&ph|D?maW>IVlC^7Xk@&WcB!=0B9OIbO*={-RDXs zB!{a3?t=jaXkY*Uu+IgQ1_A(Bs2fy=9#BUE1_YyE8pJ?!)UsBG*01hml|v;3IVp4$ zob2|V$nBjD(q&>^ZXg7;3}*)t!gSacFuIOMF&k`_=VhmSu*+;r=o&H3!T=ScIub z2}oo6YagT|H5r>EHGoa9o*Io11c#*zJr$#!C=(YTGLd}olIdeCOHsnyPa+AJEKEm% zT|X02APIvd;e}5VueApgdS{~ntldmyK%-qDO*oy@3V=PrwlNd^EnuMy2# zJYvafxA}fXr{r|Bz^G!Uwzp{8N~~_pllatGqB2Kd^ejI<=#uTUv9%2t-WFgC?9n>~ z^&&%~<$dPq$vVE?M}H}gu_C8~FSD9&Zr6|3s{y{3*^`m7{f^?T&`QO$UNb)hYcIc? 
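The binary change below replaces the cached trees metadata; for context, the anti-join that ``_GitHub.refresh_trees`` (added in the previous patch) uses to decide which releases still need collecting behaves as in this toy sketch. The data here is made up for illustration and is not taken from the series.

import polars as pl

gh_tags = pl.DataFrame({"tag": ["v2.9.0", "v2.8.1", "v2.8.0"]})
trees = pl.DataFrame({"tag": ["v2.8.1", "v2.8.0"], "file_name": ["cars.json", "flights-2k.json"]})

# Keep only the ``gh_tags`` rows whose ``tag`` has no match in ``trees``:
# here that is the single "v2.9.0" row, i.e. the release whose directory
# listing still has to be requested from the GitHub trees endpoint.
missing_trees = gh_tags.join(trees.select(pl.col("tag").unique()), on="tag", how="anti")
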
zlE3jR?f6zh*OhvCC@B<|QW$_iNJ)WZYv*8N=49b$^T#P0(qt~{o z<49DkTBt?9ovP2gl3X z5xLTb5_+q%l3^GC>M??yT`=Na;>)>($w8tr(l=snsl)LulW+PrXaqu1x4UC;(yRgFD8EPpai(jU`^x7?{1gM!#HW}YlE0;92WbqB((*5;A z3|0eK?0*P^{C1YT`(OIR$lA$R3sWsL$m)0S{{Qd|{Qt3UTzvoWO*0wmpvbQX!u*fW zra%)-{YXL&G^jr(p#VxP_ye6V0QLbSx8Vf<*p(CN(XS8?b`wlS4kR?sFJok`iQ^9o z#njt79x$<1GFrUXxD~it1?j~9tsv@9mfQGyM*}G4`6vEe8#35o>kR84?ui z0C1_cA@0aW0}Wm^1F$N?eNKaIXRR@Pd!96@%Swja)qvh_G1=bl{`|2(A^4=zcRG$b+86AeNE6KAi_QXb^8JFV5sSmEXQY>q%*QZ5XZ1xg zCD?PuA$0^pv2T=+ITizo_E%ld-0l4S3d@oZ*%~S9H=t&!HKjS$Fd_6U`frzwQPRU% zz=7BR+i3~5Bre*Uny5&-}}h*qan z`W|U6cc(Yxi}iE2Q}IueThuHKSoHAO&&^(?Lrf2!D_E5%8`TYSU7jwfer9w4W_?4B zOFlGsHOWHf$&#?d_0h+;Ji7Jmu#ynm+s8>OgTPfbTkc!zk9WA~EGtY(;2Xp7mC=?t zzWhG94x3rS3(px4>m6O}*10HiY~ z1|~ci5INp7z^k}m&&YVOKaV1AIqVRrLrp_}`rc%6>f`#;h$2=b| zt95hKG_Dhtb?-nbdA(|hX)=kJZPt0)z$Dp&`~T8-40lgkTRWdWt004zEE2-{-s;yX zK;!lB@u0v+mDI$9L(!m>5afHL@IyQe^HrdLNdpT%mOJvq5=nF4v=$H__jDl$0{4Hbk2$cK`qej&j1rbs@e^NWr!;4hKcw%2_qgJ`j zS;K(yK5wt-)d;dh{Iz(Fc${xQlLPVrD(zOXhm?LVx-sL2x?S> z^r|!h2Nfc{(r~x!6)R2k#gZ)Gufmd2Qh+@yp8RoM5;oZvG}hc_^;0xR3|&PI&VCx# zYKh?jE!tNVo9on1yE*2^{LNLz*mj-n(%O47_;-uj*_2{gpvFz0rw1ZjiE3rS6_uQ} zz5R&FUe_9*FD!}y$=?U7-RHe5SbE*-shJ=Bex3c5lVH~QKXPLIpX4O@_La8%TS>bA zO-bBx^KX@;o~y|5my%Fb8r=1pmw;*hhmrsn=Fx<~XJBh^@H3uN1bPG%cDe^5Ehk>9 z21v-snEz1|w=9qk$UBSku}>Xee}Q=zU|U=COqn;-5wO66{UUe-ovJxWQr7UP`uYno zGVy{zAjM~w^hGd`*Heba*y3T5e+C`}2am9{%%g5-9j;j^Td(qHWq4hcd<w9vPtORVM8=ubPrY_7Eq0 z?MC!Cl_#C?+szH;^%$TR@CGnZ!0DhVUSERXtkR?)6wyqdX3(!KIH>p>7QxTrHvS{- z3YY)H^Jz&bM4sZ21IPQ1DKqu5u^c9eF}PT8T?#vLtou*)Mo5>p#v9_NGE>1wCOT2$ku6%C~+2@(pbigbHT zxf$TEP^R9<=BQ5`)ku>b(&_;}b|PRGeRz~8PJ@W!J5{Z0KY1PFh#9D6-AwPItuQI)whMpfc&?*2w9OLslhFu-xV zW_;g%IOz*aF<_4Lt6NeE+xgTdl^EL3@JbG*?;k1}=L=symDkMHH2XN2;%JBZts9A^K_w+QMQ4zD_ ztrRf&nBAPhi}8!ZgMTk%`+r&jg8x%V?cjx^cJiY0xhoZ!^2(SDL@Ho_nYvtcVf?xw z^ZA1+){marf*89Q^))3V;`sY1tbU!sazj&p@Y2B{*eM>L^|g#mor@9N?}YD9 z!>rtOoXo2x?8kS%aK8Gi6IH=E@wVU`E3WfTXx(_f__y+VN*lhv6QBJJzh5Ka{N|Y1 zksy66*V@XO?-orm3X`Ig%qyFm8F82+=#2UmM#i-o9*!Tw61sphM@gVB z;|las_zPE%{Oy*uimMBxK$9i>dtCoB8s2pSdH$n3%@|KW000C50L-cTcbMS85C9Ao z5)1%<0Id+!)%qRhafJ;OnQ=AM)Nw^ga1Ij(pNmqj*o`mDp#m_ZeGQa zn)<pTNp}3`%=6CU&Jwlxec7tauhz*_T?2r417tRw$D#^%T6Ke&db$8s8d+)b>`EdJdD1 zU!7c{DQ=?Q3eMf=Gx$~hku+26&x6U`{p!1Rd6TUm5}+^fbJ2aOi7rx!m2} zj3uyQ2f>&+ydU)j<$IQPSf+lHj^J1|T9kTqt(wP+VK7!4@v*Lhc*AJHGOmyd+F7{p zL9b}ivZU|x({?opO1x^dGs3!bHCe?mCQhh@{2Yg(JX<&i@}wY5p;{6rbStp|O0Kxq zAGHaYZt~TsPA05@F=Gd%bb5G3ugO9`x)jMOve7?+%f7gHnXmAQ!YhBrM5qy6?P@4^ zn1LFiGu7R6!>p-w&L%p^(Gt|$H;Ba2+9!F*5;P$YBmPe4iNDG8J*30RQ-#rw!jkbp zphTtTqPerdW`fO+)Irs=<2Z)X9IQv2i#4=@Q*CnX<=eXW?rJ{<(n~i$Wcqu#svK_7 zDastR@XW&EmXsv<7>pJxKuJVO_>rp6;4A-W+;0MV#o(QnE6>EB_pt6N;_&Vo<|rgr z@qE=bD2NuH!rXcvQ^~j*5fj8MdO_?-v(`%gUYA}xFaL77_1@28L%LECDmk;4J|E-~ ztT^Q&%?b}f^12Us8-8>=|6mqTe#%l-#0fT)Z{Ei ztvy{1YrP)ry9|?Mkv&H6YUi+?=jL~$IbI=|3xlWw#-ULR+(6D1o3$rBsHa&Fjo?bE zwrBQ93Z|bkWiLLgg5y6>eYR9Blr=t+#~&WhSni!D$?Naek#vz#zZ0FCc1>Sf9KZsn zW5mOAe&O_FjYnGH6rFu^CjJ~BmfHv8%V8k&HQZN1(omPcpFdGm#wDH+fwa0?^>Oue z4BfSB-oZbS+kLu|alCPMZC85FaG~0|u^c0cWE=1OvmPAVo9M|RM*ba%5LBL=Q+$We z8fH3CrgqeKZ0#3shL&+|G|kL%X(v_SzR{gQ1j{Nksd)?mW|{M|IB%wq!8RCKN{-LY zcehu|pHtN2x4IbUp&kYRBos-MMUT>__iU!Fl3cuwvCaz-cI9{1v%gqao+-};=?AcT zz|DMV&abv!c=4?^xq>3Lz)mR+m>k)oZ7pL4$PcMZ*ln6~8o=-)WdKlE!Ft-rF@@ERo) zeIDNEH}9e5;WMrul~52{R1g070$yTgv}_?DwR!C4_Tn`>3|01ewaLoazAj`SW^fb( zVTSq5-^OcpH5G!eIPvMlqDV&{X7ayg#gFQ(jvPc0TCb&<<(0I7ge&`r8@FI?`m-TSI>LOQY7Q5qER5T?jw7C(_7 zuwu(Z_~eeLHWuxK@3%>Q10s(DbMN;;@oOzv`{N#u<#}0doC|K!L;}1=)dPi&#E6W) z5~6Tq10K&O$oZUfc4SG?iot=(?_`ier7eA{49C?(#&n^$t$Mad2ghxcVrS&~vxzt) 
zajtXJ5W&hN9FG3U_2F9WJJ%_y|(sRF^dhP#pJOqt;COmN%r?7pC$uSAJe z9W3~a!sLm{F`_YS){C$7Gs@2~WhRhYTm=Ipi4sR~2AIpVbBR2f>W|1%=XNKaOt)C0 zxXPUs$I4Jko8FGZZ{Ft+6TWg>T{5Q2c7jC1TmT;*<`$>DHS{T=d~44nPf)?r?(1H* zD@ksvK4Xl0%KJ#cNdfMkr=XF~(=4GgArZyaxT%UHs-2HW3CX$Kpnm*``Xjrku`1zP z9YgDkG3L9ob8=q3jl&7)fI4j}MP*5cy1quFqqEdHKI~He z#5iic7)1jV=kN!IH*X}Z=GqDa6{b&{qlfTA<`|?WYMxExMR>r)3rjZLI?@Dt(Z5yi zRUZI3LUUivH5c0}Rqsq#SUg?ze3zZru$by8T3jxEZ6WJ%b^T2S{-Ds&mZNc+pJ?p5 z-trx3()(B?-;?M{5aXxP%1jUIB2HA_mbPLGA*EJ{FhfGcgYB1UF%C|}*wPbG_#x2E zlSxCjay^x!e0+8I9Inn{X7I_buE+Wb>DOrePj=_12z?N8(MJaL?q?Z7{ThWxqjCR! zMd+8F>@Rh)Q>rkhT}v);wq^2cAfMO@`Ax;$9F@A#arEhGa7}zpefl~3%28)(ulO!a z$ed@UqTO*^E`6&Fop1(TiT^kaAR~UbJ0vV)?Rt7X`Jp9zIWV0`kD@@1e&+?mB)%92G8NN-$))NBoN2qlvS0%DO9RN(}SJ-FIF3k`VD5 z9p5K((eGcwGbjXJ*|P`nw6WdrvugO$CUJaEJ2N+3Mo z>{8u+s;&ELM7@yDKbJkn=f=%`=qoxjx5(saHqV87H8Ay#5HQWpL%eh%OCLS8p{xp5 z*ceOqaU#A*%r0Y=?0nrar{XVxD3EfxaJ-Thcq=)Qb&owXY$1FL?JgZ39sZ6-L~RgC z^3f617F~S;b;pecKi?x5oq*XC z&;!wuqCqm{49~=9zr1GE;$DzzOxLE$3U4;}P?5gLT90d}yUMCk6jR&HPPlbca*2^s zOVfD%fhQJaO0ctMv3bF12O!-@mFGF=S(h^v(vW zzUsYdez17An{_sI6!%wajK0-R+ z)ZTb?VcPP6a)~0yHcfGT8BiZ912V<2b>t z=9<2)%Y2~?Z{&`(^$c-JXu0D1@q>XKpXuf&t{5}zb85@5hD++|k|!j)uXg(iz7UJ# z?kAmUg%Jv0vD&<)6dqPwS9ugf@^F@m@Vh*}>`lCg4(c8NwFzmH1}G&nkOXtR7J+}& z3JIqs!Y?*&;#o$EU!np~0;mALel$LbW>!sMR-F#Hy}Wn)`s-M?z-CsJNdf$6IeyJh z!5sYi`|{1NvA4(XQwlT3@9~bmT0?)k{`7q9Hj?4fVMoI+G7m=FV&SRGQC8*kV|(?! zXiVgkjwN^`I!Y@(AWYCQELaAHIg53LbqU!OYR_n0-l#9Dg{*xqBHUdoc-N%HElD-j z5*1F@vof(y+4Hkt>gPt|s)$&-=4m4F;q_X!%44_d43l!UWgO$1iqAjAuH{|wqI$H2 z&?Xwjk`V~!oo_+xZg6VKB>^=`#3M~*IyQ-~E(m4G8`8k#=r`+S8zXB!>?v4X749Sl z^tp@6nAG?uu)e7s=|j88&ZKBxsKnt*AC6LJ1O#rV-ao*{s07=?KMWHwB)P}J-$vAI zrAje0@rs0hZKsWK<(CbvC zIm;xbKxv-J`?#+-CL!3X>r?X5h$7_%ocZz0HNXkcqIm8EOTF??1cRGPxR`k?ni|m| z|BU)rd=zpS2!osU6m6ONN?y{ES|ZW#(3q7>0R9m{L{CABR^6PHB1AVEuDcgK$^ zgw-@1L6cd66UD{bI`j*v`cwZajuX+71)`eA6&x@@R}=xl2;(N*iIRz&l_BGBxBd(U zm_P7&lnPJFwSD=80jDNYzH=a9y9{?^le%>kgh0hb35dy?mkH#bJKNT*O^w~zv~$H< zwFxkpuA7>;{WGe_4tdUZ?sg?A-ham9M{71qFM z6S1|)-)u?sg~h=K21I+mOCZyH$aAC^TD0=ia;U~h%<%8w{h+#s)jg@|KGr2VgpV$H zmjtcaAAdRI`vOkl3|a-lQ$ z2CF-;?k75kBjJW!T8H$V?VM;w+^Ylv2bp`~(8BBQ)Rw#cC-)0|QxHqS_`;HK)-4(_ zH03?{>TelcGhuUiNF9%*d}jY7&kg$YIJOq)(>c!_j%1pS^A~9jd=qaG4+!(sWZT$t z{Q@>koI9E4Vcy*1>1V+nv5#q+W3{;t$275@j5-fYXeV`xq!KXCqhEYY7AhlPaRS>- z4}i^js%_hfVPY=shX|{7Svem#C$YQ{AKeN4WAeN>ZT7>LemZIyW(-%@9nT?a>r)(? z+4Hu?J6ZFl%lLUD0YVuj2m_Dc&+v)I8|!!(QtKXA?Gz zoH?iPh0rmO!FmcXiY9)M=o3}UOwC1#I zn4Ia_ZxDIy)~>#?Ox}^Z(|2)a7h^{-NNNyj#KL;~Y@EGkgy`{zqc)5{x5Mn*o)m9E z=G4s9&C}yDvc1mNPw~q@l8l>1E9o05Z)M1(D4r3R5(|HZj8H03re;y^=mhao@D8W? 
zHb(@Tf*!K0$$gWnr~d(5&*;$$NTm3&_>zi<^(DTvA)lN&XF3En8cRQL$<(pqJ2SIE z-o4wGXpMtuJvCC7vxaBf!d`tgHyK?#Cm$-8Z15AzPz?t^xm}&JIQ4{;h}0{Hxxe z56vR@+uv>tMWO$#zjlOlhl`L)1cdI8z*&`!tX2`vSJV` z=D($Nkd@|vpk!q286YSlfOU(S=AT@L|1w!6RTKhqCTqRLP>`+MVw3=^y>vAHm#HLV zv%m4BnZKX8BK)ss{_pZD$l?3HA%nLN5ddqb=Kl)$KcyYqW*`6nWa0V0>_+{WWYm)8 zzu7w1{GGTEd0KHx5dc6TfvNes^6NyF^Lv_xED%%*z}jm6FI)c0z28)RO;WCg-;k4A zhyZ{U;ruVif8Op)ouVNlCL8??D1HHgiUC;9%7HZh)yCX7|HkbwDzb&!VWr?(`cgr^ mX8u2fUjilSH!AcM2wDsRkfn9d-ezdAVv>!<)LUp1q5Ti6Q?g$G delta 10242 zcmeHt_g7TUvhNH744EO!kYUIrVI zFiZSET2KS`-}OO%psP95>*=szSOC7b#&WPGN8Y;TE9(LK)gM8s_WO7O)Fjci4vB`m z>GA8Wa-K5gyBeFmPgy_4vKJ+OIwvdWT@C#uq>R*4L4m&y^u14by`3cED z+mAu=9J;LN%@8V%BNwSo9hl`-=S~0Y=!YTF#3vpR(0FZaS>zg(qFL9_o|Sux9K-m1 zfetER?Dkn1QAvX)gR zLuzq3WvPPb$r=v^(0x^=8e75?GZHdjF*rxAmP1n|g}cq8&hNb{YMjYjh%aJCj}uR8 z1gZ5AcCnl zzA~XLDX8nqh|T}M@%~?or%UlK<6R&)#wD+fCkR%pPo|&-%e&w@22jjDzJ{x7awv!C zGcA$Ax=7=zo)Ua}Ab%PBvh|bI8%yjlVP-QSTe66m1MoHQT_Z~~>F z`YXrZ2y?K#t~wOSGj^DjXn3>1!Nk9m6;8olWcLek9`+U;Y+oxfLUypFX4OMlJKvgB zG!f$yP;W=_Ro=t>AyOoDJT-eS7cAMV=nORoo(N?CsLLsKn0&}rUSd7H&7pFVkhd&= zvk%%4GHTjmu?lDJ#08t`&i6A*!<<%^wwY~11)=ua$_IFFL$SwJ@2cV}kp_=PW^d;p zm-HcqDd?%L`J$TOCms&?&~T@w{L$s1CH#Hlx>x7dtHO2I#R?2{5meL2aVVQ24h@r6iA zH8MtwrC0p7P;zG|-X7GK3K>z#NlN*x2H#2U+K7C0E|IeEwUN1`0cqMMe( z6S%G(+tYDI61z)3*evJWDy8KiWIAH{)b>LBND`*W?J^QBZKD|XC}5<#YV?x{LdaL^ z4l-@M`;4AW;?>lt?5VXNA5utN4D}C>{6`$+!56XAB489QU#qr6e15Gi;Z`JCtQZy* zmz9A9qXB<)dm=CIHHXc~Dj+4mI&DaR5k$3AZrq`P-RH!9pXmjwa$cGc?1>4`u^5eh_J(U|?*{y9Z@HBh_+*)pg3BPHgGCF1)hv1;Aq={O|h z>>aP8JUxyzITL|9JLl7K})h<{0%voAHbjyc?v0 zM?3z;ElJ)?pvVIPYKR8KY6uZ=!#uNp9T+DxPN7vXST6O*#DcC76K-5D=mdtw5DLE;%-=kFssMt&+v?#r2VhnKX&B;;E0>Rg0DTj?;R_ zs;7?G;*D|drTLg2c;fGW^qYv7MSFSoXQ!bKO;EI>r4%Mc!Wa(!;9$tONzK1T1PBEdfw>>Ba!LJ-k zKp+4F!{Xw+{?S-qDu1ng>D0l^rv^60?Bw^1`d&0k@&>+FQ^~0$z(D!7Qak*#2D5H` zoMP4KzIWdsVR}CMuu$gfcA{(SLhZzooyppZjP()P5>HFdYI)z=tLQrc&ZAYM+m3@t zv-zq4pD5=%l2h1!OiXFUFibz^X8r!UqG?96u+m=TR`xB6kK-xMCK)*fQQi@nH~XLy zPYG50J*K7!Q=bY*+p(j#2jc@K9g-c{G?ldicSg2o8zDVEhjal90Z>8pmX@C z@kzFKev=5glOKT|=f@WE#4R%8k)2h?RegsADYz{RA`=c7NbXlOHMQsHS`J|`(2|jf z7k)==^Xn3p8-j)kFakJuDPB{eCK{mzIwoeW#V35P#Kr57HlBJemM_K}M>l7A5~lT{ zDtN}~3Vz^Y{?e|9=zr+9x)#g7n!ld@i|P8V+yRqFz%D)Lv@|ly?)Gr^&3ENd>=-iQ zqasa@Qz;dBVrdfdb@n;*85ZP00dsfF_RN11#w;u>*u@L~MCr3m{}~3Z*M2U97mEDb z&i=z4f+3ANDo^!e=foyLwi?7Yp#Qw8HpfMa{G7j4?p*%u1AXTc91~;QCqGj*h7^1NhQ+B2 zg!F7+kXO%XUgY7W@ah;QuLSYY19owvudL5_-zMP$P8YiOnC@#M*G54XXC3C|S|7hYKcw<{-A-l3^R zlkgLVZN@_CK6S;$8h(Y8FhNHqA3jk*qr*9EpGR=Dq=E?&`(GPqbdpu{bhS9VT#-EN z-Ry;=MxIj$7Q^CvhBePP+9;==`m(q7`O=cS!e-+R2GPFIk!8F)KFzwOTZZ2-B@`8q zlr)q_lRHS705=SY+wH#te3zVoN@6jYTGCzK4E~2MYZ%d^k znXADfK)c}D&qOsMnS8egNMFU<_j|K^e?sodMHjB&;MkG(`=YR!eoeS%>d4cfsN6V? 
zHZTgL(FTbgl8&BeWRVJ3X|}t#?Zu?Ww2k!oG79&Vb!4K!jDFY!giG7<1TKxobR%pO z&b^*!oOvx5JC-64eLHE$f867QQKl83IFUuvLif`&M4$RMo=saZC!<1*9vz-9@E?ig zhLm(?Wt2G2_HM-tQI)XO9q^@#E4WE-wv_qeR?0T{L_MghUN)o@pQq$+E;ex|Q>$uplAnlV*;2ic`GSKWwuhcWrp9zmo65mqq6%B$*5qwK$p)8#M1E=qQ~RsnOlhA~&kryU@?r>B!qe$*;X#rstj3A-4a7;mIlQY=yljd;c zWnOY#X)Dk=EBS8waeA|+R9yshq+eyZDLxP~1uUBfqH6;vpOLy*k%$@IQ1m5HR(6|J zIjNMQ_4L^0J{P1<$rHBCyhp!gR7zRRRn3tY)44(yjjj`g0P&n_XgXME+N$;i`Z?2zw)TDb7&&Pl$(nCTZ(7mw_rNk_pXaGq^AnfBYQgjM zzMk$v=#f#cLfGY-F+*C&0qF$gTw%T5(vQK%lTJkP4+7gID9 zvJ~x;ga;LR=%MTgQ}XDW_K^$r1!W(Jza4_URtv_Y`4df8xy_13NmO-MvUUmFP@tdv zsiD-EE7DdAY}rmamt+jd4pF^3h>EA~#*5A{PB75yuFsK?O)SS&xbZoU|7MaD1HiE< zI$xGDq;;k7bvM@OiUCVcj`lgDDy|daQu!#)xUGn3Q8tI<<}hgvxL`WpP-Dts#W#6x zDq2xT31rUg7j*CK<2?7oeW7NB(MTiTV4uv<;1VV)c2lXFc8rbWrsSrNJ3JN4jL|__ zi2#SFBbeHd)M8A@VXVsH_70s#B5+>v(PuhMx7NzIZH9ZUOD$1Ar)QCl*cwP=v5B16=xB$TIN1PNR zc^xgwxB)EQ7*(}6h1q@Uw=MeoDox9HY2VHA0PJxi+^*vwIFtZH-!>kB`C z>ET{>q2;%bduN@ijFfu;KUn<(-z|IAZ@ulR$&nW1IkH=ERoTg`Nq&@|h>aYWWL{4M-tqV4+8*(JqO5bzXL8Oo;3eOtQ`A4c z=?2iDg4iVrjkBDS~P-Nn>YOU-nnMlCIyyo^5nFI>>M`jYP!t$Ju{FFX00Xy~fZ@lZ_he z*Rll)SQ0IIB9=CJ0o_B*w?{swr3Jg;*r`33(X7J?60psPdhz`(4GboB$G0a8aX(w$zt|mu=uC#1E?r#83-evnzT^KO*>TFQdWr25#)mpwfF4KwB zmw32kyZPrug?0w-Ju?B|leV&s&Vi$mr*OFaP1;{}6jczaD{@n_9TgoRc&0 zl#4x^WJ0j@Emp(*eb{FoJm^Z(q3FyJC*5h zB6`X2KE-0DzqgCgV7aII^A5itd(xL&#-z@4Q9KdYBwICJy&!l15W|tDqjt=%JE3k$ z?*HOgFo%3#@g3{Ic^+x6?(?X=W80_NQ5oaeM3*0H4J7TFonvb&6*Yj)ne7{RC3r+7Bf=C!> zdFi#sdh*B(me6MT?Y1{eQK`?&fo#+4^nSbmjVUGdK z+Wj8$NO*k`QNx$^Xsgby%v`$N73u!dsM#Dh#SUxn53sQ}z`gy)o6M&aHeIP3Aw+fh z{$_K{Gtb-{m+>!A`}Jh`76%SNa#IIIVlm{gDoM0Ajt2`kpsHs!9CluTxSX{r61(F5 zPHEJL>b}PWdBaRGi9^tYDC1myC++(%26@ISR8Vq?0ofy{Xx-b9r*9Neq}-I(fo@N$ z4hmI7jcWx67`xHP-JCI+yx2?8QFeZNzx=1xN?M#Qk`a}&)9m>wK>iI0{GrGK{DOae#*t$7JwVhv1_5=NT zKQg6*jIjxIC`Ps+OV)4>UL)wCc!;y zgadV3>yL_-$40uNSj+~~qOHp>fN1_7?W!4)3@WV&H$^W-^7+o=WMK(phn|wHf-+IL zKK55{09O_83)1WfeR}iifHguvHZyQHm5waKIh6patbkJj!$yy&Lei&=(ww90k= zed4Gmy7Oe2L$m0#ri%|&x%cW!J*S4c$$Jjbi!meJ`DV3z%;NyD+(74B`DKfpGle{g zQR&8id30t!Jb(1Ku!QOb*+rPOqjE7Kx1_QMAj8ON?zP*I)i_U8k%Eo`tM`O`bxk%@ zi}uX#TybXYLf+5a2W#I)p0wM=T1#4(V$Q1;=7=7q=f`|I@O#^fJ$mtzc z)Rk7KdS2HU0dB7>mQI|a#&30w-S`&K^< zWNebXc`6LLKf00JaJTc|)B7B-B-Pcoc{w-aJ_)`lWp8b>2wmT^AZpKZm|?^dOdu{i zC9#)b3im;(+*%~c6lS%u9g@=4V!IFOq9(`PJcLpeW z`2rXl$DD#MQJ^U_8!T8*a@Ei)(NXMuWR?D3`VWC$K8BySPs7f z?+446a(O4+)Sb-UtIdB#%vi1Isa1*OEMQ<)##POSRHY`$FjcENM2Gu;#aC9K)GWhz z*3l_ojVt6ctA%`?oD^85@IB_6!PIK8(`44Y@B_|q6I3{r5=CV=u!~o%&)$G4qJ@`G>-doM#ihU-@9)2h5>&Y`*qcvlXLDC5q)xGi9Jm@L zN~!x41Sl3e(2Rfts8sH{wUQsO{D30Wvx;%#GMDnYdGOm#O{XP=+NzF|dS zJPAWzGH;A@Z~Y$IFwV7zV{7wH6#VvsWFaYo=!#p5iqGw{&V7+f`woCOy3!#$lxA>x z|LJ|z;cueva_n>bKW}VdjZo4Pht}A>G7i=>UXNH0T{tP?%H!N^>}Bce1ZyU%A&WN-EX>;ax1Cn{wN<1Q}Q z%s#PvCF-%by-O#L7-K3KGPz=20DTJ4PKrwEmMQL0Yhs~)#qhpKZXU=1AKmfoiZvk! z`Z3&#fvH*A&v%IdX)>m!8{pNhJo!>=Uu5#O~Hxd@9viy^$!6vLf0q1JF zuLS{CIZ~#|_^y)x5W^Z`iqR-zTYLRP`FuR&Y83xT-HN+@H!Uqyt2=lPcz56Zu=e6f zAz5`(d&-dMF=IebS>M`5layX?HTB&bfu2QT*Z1=lLZ=WjmM-bWyxswm{Rd*wV#-Dq z1{N`WWcWWbHf;NVFjcw^$k_sTyg^Fy&O>5ZS&P^~i`yB1a3~ue>HEzU7l12=x^W}q zcxa!kTEvIYUAS5X?(c6ot`l;?v_#EvOo46*1qFFGYn??Z~usaF|T0 zexEU65=;st#t(CQSq}ov8ed9_AL@}$$_>2TDCsBs;3vG(Ho20Q|+u{@UXQQfU-rT%8KGqwfVbs+wr57B!yDEoaIK zyJ(}B@5&iP`}!s$F0<3|L0C0N7QcMPIC_#OWlKQmA8UgyqOh>{IH#@GTXHOg(E6z? 
zjK^!q_`?TIW)el(0rBA!g)L`aX`K$j<-j2Jdc-jp(`lhk10}G6#2i>SvMSncDPz)^ z-ljs;{lEcIISd$lUYm!vW$U+aS0jS;^ITsI(G_dSVKs5__!WGPzMePKP07JpUpc-$ z44B2)Y`k^)*IvOMYu zwu7z}IAGyTs*{<+y*u>O&vBLt_)J?9yR%Q%%O38u>CaCNjD026Wm$jAb{fK{_d$Fr ziT#V=zIA+CTYjW*cF+?i4=z~k9g{7|B;d+xJNM)Gyow*(yMk_>#3E%m-dG1X`aEeA;1SY zQd?p_-OeMc$3j1&kF)6K=nCrN8y^45gW6cAT)HhP81H}yU!Ubxw?6qxj1NYGaEZ@H6mYlNWb0TXhV3T z68AIYavRohGjK)7Kf20RB`$xp+x5W(kLPFZxP@pBH{d5lZyync!NHVN=QTias`LGm zigAD%Cj<3C6hD(Hw33tgjO}Hbm11&x$xEvKuELAO0P$wfyz;7H`I)q9p30)8!l`O= z*o2D6q%Idg+_o5XDJU*VR74>)X{R_9&|P63*L7aI+J8b>z+^yWs%m{B>A~xj2qFOs ze5;vVjq0eV!`!uu_^8q-%Gbb`9TXHvm&(ZSrHJ_ngm`hlT$S8>`r~F<+7-zgrOIVr z(`AV5WSokPcrQh~eA#hPa%}%CdQG9bXY+n*K?g?Gl1;Z7q=t)@?`LCG;MH_S_H71U z;VC7hKEAkhX9DXY*kA0?zOd+Rzb1EVyZ#7Qw#0JRI81!h#x}txqNr#9c(c69bLl7J z(@xaLQ{aB`lCnuJeoP^+EY80z=5n#l54S6i3v*kZKIXn^LMT&F&=puXdG;s<`+Dn_ z3wL|8`|WKo?;3xb3iTn0vL}rmlk9o@P|Ho>id?8JLpED=@X;pMbux+_bX^Js{gsPy zadZBYh>9GcL-gZ1adg#yWzq7yarJp>YFM@@B#7($3(fVvUIzF)_;0_Q8{R?uw;!+c zpZ=90JO%c5JoF*F9QSWO-wi&D^|${u3k*La|NC$~7rfsIt*QAQ>$>QQUJJ>7ojH!j zMHFLk;QdM*gTd6l>MRh0DmYh_<0(0f{Nf?vx16w07UQB<(xieK8cuURBO z9K$;FXeA2Dzr{3v>Oo*A1b_K&row{XOv!fi|JT9<7^361$n>>{Jc#4DGr9vsiMpQp zf60|2rhkj17h=IhKpfj1^#9Hz{*TFFDzUxa{QcMbQXr0%O7MRK{@2vwh!`UJI!#H7 z2rj~cmqM;>_zdfp=>I8D{u3v(h{E53rKMQ#Vk{6Mx0~s@q>cj*v6oU$dW#JEzW_M8 BVU_>@ From 336eeca4d273ae756b57b234601f681ff30dcd7d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:30:21 +0000 Subject: [PATCH 028/201] chore: Remove early test files# --- tools/_vega_datasets_data/metadata-schema.json | 12 ------------ tools/_vega_datasets_data/metadata.parquet | Bin 9100 -> 0 bytes .../metadata_v2.5.4-v2.9.0.parquet | Bin 11354 -> 0 bytes 3 files changed, 12 deletions(-) delete mode 100644 tools/_vega_datasets_data/metadata-schema.json delete mode 100644 tools/_vega_datasets_data/metadata.parquet delete mode 100644 tools/_vega_datasets_data/metadata_v2.5.4-v2.9.0.parquet diff --git a/tools/_vega_datasets_data/metadata-schema.json b/tools/_vega_datasets_data/metadata-schema.json deleted file mode 100644 index 2b5b9d955..000000000 --- a/tools/_vega_datasets_data/metadata-schema.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "ext_supported": "bool", - "file_name": "str", - "name_collision": "bool", - "name_js": "str", - "name_py": "str", - "size": "int", - "suffix": "str", - "tag": "str", - "url_github": "str", - "url_npm": "str" -} \ No newline at end of file diff --git a/tools/_vega_datasets_data/metadata.parquet b/tools/_vega_datasets_data/metadata.parquet deleted file mode 100644 index 1ab0fb17143528da9cd460e84a0fb18a9f1d5b73..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9100 zcmds7c|4SB`+o*87`tc8ls(4IkZnYaonv3JhMBP>%V0=J*=eyCB9e}>CKM+vgwlc{ zOS@2(N>SDp^?RN{r*qDG&Zpno`^WqH&6wxjuj{(6`+ME@bzct~qMb4bfD)j(JeZ9D znc=UXIv|ZeU=RTGk#1007Uj(X07j^gjX#F! 
[GIT binary patch data omitted]

diff --git a/tools/_vega_datasets_data/metadata_v2.5.4-v2.9.0.parquet b/tools/_vega_datasets_data/metadata_v2.5.4-v2.9.0.parquet
deleted file mode 100644
index 5626093db560b805b33261bdc5f6b7754ab3451d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 11354
[GIT binary patch data omitted]
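For reference, the regenerated metadata files above can be inspected locally with polars (already used throughout this series); a minimal sketch, assuming the repository root as the working directory and the pre-rename file names used at this point in the series:

    import polars as pl

    # Vendored multi-version dataset metadata written by tools/vendor_datasets.py
    df = pl.read_parquet("tools/_vega_datasets_data/metadata_full.parquet")
    print(df.schema)  # column types are also mirrored in metadata_full-schema.json
    print(df.head())
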
From 225be0a15520d166bddd162307ff1b82f2552bf7 Mon Sep 17 00:00:00 2001
From: dangotbanned <125183946+dangotbanned@users.noreply.github.com>
Date: Wed, 6 Nov 2024 14:33:20 +0000
Subject: [PATCH 029/201] refactor: Rename `metadata_full` -> `metadata`

Suffix was only added due to *now-removed* test files
---
 ...tadata_full-schema.json => metadata-schema.json} |   0
 .../{metadata_full.parquet => metadata.parquet}     | Bin
 tools/vendor_datasets.py                            |   2 +-
 3 files changed, 1 insertion(+), 1 deletion(-)
 rename tools/_vega_datasets_data/{metadata_full-schema.json => metadata-schema.json} (100%)
 rename tools/_vega_datasets_data/{metadata_full.parquet => metadata.parquet} (100%)

diff --git a/tools/_vega_datasets_data/metadata_full-schema.json b/tools/_vega_datasets_data/metadata-schema.json
similarity index 100%
rename from tools/_vega_datasets_data/metadata_full-schema.json
rename to
tools/_vega_datasets_data/metadata-schema.json diff --git a/tools/_vega_datasets_data/metadata_full.parquet b/tools/_vega_datasets_data/metadata.parquet similarity index 100% rename from tools/_vega_datasets_data/metadata_full.parquet rename to tools/_vega_datasets_data/metadata.parquet diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 208834ebf..45fa27614 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -674,7 +674,7 @@ def __init__( output_dir: Path, *, write_schema: bool, - trees_gh: str = "metadata_full", + trees_gh: str = "metadata", tags_gh: str = "tags", tags_npm: str = "tags_npm", kwds_gh: Mapping[str, Any] | None = None, From e91baab65642dd9b81020b88f50314943d5b15c4 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:42:16 +0000 Subject: [PATCH 030/201] refactor: `tools.vendor_datasets` -> `tools.datasets` package Will be following up with some more splitting into composite modules --- tools/{vendor_datasets.py => datasets/__init__.py} | 2 +- .../_metadata}/metadata-schema.json | 0 .../_metadata}/metadata.parquet | Bin .../_metadata}/tags-schema.json | 0 .../_metadata}/tags.parquet | Bin .../_metadata}/tags_npm-schema.json | 0 .../_metadata}/tags_npm.parquet | Bin 7 files changed, 1 insertion(+), 1 deletion(-) rename tools/{vendor_datasets.py => datasets/__init__.py} (99%) rename tools/{_vega_datasets_data => datasets/_metadata}/metadata-schema.json (100%) rename tools/{_vega_datasets_data => datasets/_metadata}/metadata.parquet (100%) rename tools/{_vega_datasets_data => datasets/_metadata}/tags-schema.json (100%) rename tools/{_vega_datasets_data => datasets/_metadata}/tags.parquet (100%) rename tools/{_vega_datasets_data => datasets/_metadata}/tags_npm-schema.json (100%) rename tools/{_vega_datasets_data => datasets/_metadata}/tags_npm.parquet (100%) diff --git a/tools/vendor_datasets.py b/tools/datasets/__init__.py similarity index 99% rename from tools/vendor_datasets.py rename to tools/datasets/__init__.py index 45fa27614..e27f011f0 100644 --- a/tools/vendor_datasets.py +++ b/tools/datasets/__init__.py @@ -732,7 +732,7 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None json.dump(schema, f, indent=2) -app = Application(Path(__file__).parent / "_vega_datasets_data", write_schema=True) +app = Application(Path(__file__).parent / "_metadata", write_schema=True) def _tag_from(s: str, /) -> str: diff --git a/tools/_vega_datasets_data/metadata-schema.json b/tools/datasets/_metadata/metadata-schema.json similarity index 100% rename from tools/_vega_datasets_data/metadata-schema.json rename to tools/datasets/_metadata/metadata-schema.json diff --git a/tools/_vega_datasets_data/metadata.parquet b/tools/datasets/_metadata/metadata.parquet similarity index 100% rename from tools/_vega_datasets_data/metadata.parquet rename to tools/datasets/_metadata/metadata.parquet diff --git a/tools/_vega_datasets_data/tags-schema.json b/tools/datasets/_metadata/tags-schema.json similarity index 100% rename from tools/_vega_datasets_data/tags-schema.json rename to tools/datasets/_metadata/tags-schema.json diff --git a/tools/_vega_datasets_data/tags.parquet b/tools/datasets/_metadata/tags.parquet similarity index 100% rename from tools/_vega_datasets_data/tags.parquet rename to tools/datasets/_metadata/tags.parquet diff --git a/tools/_vega_datasets_data/tags_npm-schema.json b/tools/datasets/_metadata/tags_npm-schema.json similarity index 100% rename from 
tools/_vega_datasets_data/tags_npm-schema.json rename to tools/datasets/_metadata/tags_npm-schema.json diff --git a/tools/_vega_datasets_data/tags_npm.parquet b/tools/datasets/_metadata/tags_npm.parquet similarity index 100% rename from tools/_vega_datasets_data/tags_npm.parquet rename to tools/datasets/_metadata/tags_npm.parquet From 7782925b3291a8d3b6ff38b5572e3e47c06ebed3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:55:10 +0000 Subject: [PATCH 031/201] refactor: Move `TypedDict`, `NamedTuple`(s) -> `datasets.models` --- tools/datasets/__init__.py | 187 ++++--------------------------------- tools/datasets/models.py | 166 ++++++++++++++++++++++++++++++++ 2 files changed, 186 insertions(+), 167 deletions(-) create mode 100644 tools/datasets/models.py diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index e27f011f0..2b87ded3b 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -10,7 +10,6 @@ import json import os import random -import sys import tempfile import time import urllib.request @@ -19,27 +18,28 @@ from functools import cached_property, partial from itertools import islice from pathlib import Path -from typing import ( - IO, - TYPE_CHECKING, - Any, - Callable, - ClassVar, - Literal, - NamedTuple, - cast, - get_args, -) +from typing import IO, TYPE_CHECKING, Any, Callable, ClassVar, Literal, cast, get_args from urllib.request import urlopen import polars as pl -if sys.version_info >= (3, 14): - from typing import TypedDict -else: - from typing_extensions import TypedDict +from tools.datasets.models import ( + GitHubRateLimitResources, + GitHubTag, + GitHubTree, + GitHubTreesResponse, + GitHubUrl, + NpmPackageMetadataResponse, + NpmUrl, + ParsedRateLimit, + ParsedTag, + ParsedTree, + QueryTree, + ReParsedTag, +) if TYPE_CHECKING: + import sys from collections.abc import Mapping, MutableMapping from email.message import Message from typing import TypeVar @@ -50,9 +50,9 @@ else: from typing_extensions import TypeIs if sys.version_info >= (3, 11): - from typing import LiteralString, Required + from typing import LiteralString else: - from typing_extensions import LiteralString, Required + from typing_extensions import LiteralString if sys.version_info >= (3, 10): from typing import TypeAlias else: @@ -81,153 +81,6 @@ def _is_str(obj: Any) -> TypeIs[str]: return isinstance(obj, str) -class GitHubUrl(NamedTuple): - BASE: LiteralString - RATE: LiteralString - REPO: LiteralString - TAGS: LiteralString - TREES: LiteralString - - -class NpmUrl(NamedTuple): - CDN: LiteralString - TAGS: LiteralString - - -class GitHubTag(TypedDict): - name: str - node_id: str - commit: dict[Literal["sha", "url"], str] - zipball_url: str - tarball_url: str - - -class ParsedTag(TypedDict): - tag: str - sha: str - trees_url: str - - -class ReParsedTag(ParsedTag): - major: int - minor: int - patch: int - pre_release: int | None - is_pre_release: bool - - -class GitHubTree(TypedDict): - """ - A single file's metadata within the response of `Get a tree`_. - - .. _Get a tree: - https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree - """ - - path: str - mode: str - type: str - sha: str - size: int - url: str - - -class GitHubTreesResponse(TypedDict): - """ - Response from `Get a tree`_. - - Describes directory metadata, with files stored in ``"tree"``. - - .. 
_Get a tree: - https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree - """ - - sha: str - url: str - tree: list[GitHubTree] - truncated: bool - - -class NpmVersion(TypedDict): - version: str - links: dict[Literal["self", "entrypoints", "stats"], str] - - -class NpmPackageMetadataResponse(TypedDict): - """ - Response from `Get package metadata`_. - - Using: - - headers={"Accept": "application/json"} - - .. _Get package metadata: - https://data.jsdelivr.com/v1/packages/npm/vega-datasets - """ - - type: str - name: str - tags: dict[Literal["canary", "next", "latest"], str] - versions: list[NpmVersion] - links: dict[Literal["stats"], str] - - -class ParsedTree(TypedDict): - file_name: str - name_js: str - name_py: str - suffix: str - size: int - url: str - ext_supported: bool - tag: str - - -class QueryTree(TypedDict, total=False): - file_name: str - name_js: Required[str] - name_py: str - suffix: str - size: int - url: str - ext_supported: bool - tag: str - - -class ParsedTreesResponse(TypedDict): - tag: str - url: str - tree: list[ParsedTree] - - -class GitHubRateLimit(TypedDict): - limit: int - used: int - remaining: int - reset: int - - -class ParsedRateLimit(GitHubRateLimit): - reset_time: time.struct_time - is_limited: bool - is_auth: bool - - -class GitHubRateLimitResources(TypedDict, total=False): - """ - A subset of response from `Get rate limit status for the authenticated user`_. - - .. _Get rate limit status for the authenticated user: - https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user - """ - - core: Required[GitHubRateLimit] - search: Required[GitHubRateLimit] - graphql: GitHubRateLimit - integration_manifest: GitHubRateLimit - code_search: GitHubRateLimit - - class _ErrorHandler(urllib.request.BaseHandler): """ Adds `rate limit`_ info to a forbidden error. @@ -608,6 +461,8 @@ def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: class _Npm: + """https://www.jsdelivr.com/docs/data.jsdelivr.com#overview.""" + def __init__( self, output_dir: Path, @@ -958,8 +813,6 @@ def __getattr__(self, name: str) -> Dataset: def __dir__(self) -> list[str]: return self.list_datasets() - # BUG: # 1.6.0 exists on GH but not npm? 
- # https://www.jsdelivr.com/docs/data.jsdelivr.com#overview def __call__( self, name: str, diff --git a/tools/datasets/models.py b/tools/datasets/models.py new file mode 100644 index 000000000..5a6598fed --- /dev/null +++ b/tools/datasets/models.py @@ -0,0 +1,166 @@ +"""API-related data structures.""" + +from __future__ import annotations + +import sys +from typing import TYPE_CHECKING, Literal, NamedTuple + +if sys.version_info >= (3, 14): + from typing import TypedDict +else: + from typing_extensions import TypedDict + +if TYPE_CHECKING: + import time + + if sys.version_info >= (3, 11): + from typing import LiteralString, Required + else: + from typing_extensions import LiteralString, Required + + +class GitHubUrl(NamedTuple): + BASE: LiteralString + RATE: LiteralString + REPO: LiteralString + TAGS: LiteralString + TREES: LiteralString + + +class NpmUrl(NamedTuple): + CDN: LiteralString + TAGS: LiteralString + + +class GitHubTag(TypedDict): + name: str + node_id: str + commit: dict[Literal["sha", "url"], str] + zipball_url: str + tarball_url: str + + +class ParsedTag(TypedDict): + tag: str + sha: str + trees_url: str + + +class ReParsedTag(ParsedTag): + major: int + minor: int + patch: int + pre_release: int | None + is_pre_release: bool + + +class GitHubTree(TypedDict): + """ + A single file's metadata within the response of `Get a tree`_. + + .. _Get a tree: + https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree + """ + + path: str + mode: str + type: str + sha: str + size: int + url: str + + +class GitHubTreesResponse(TypedDict): + """ + Response from `Get a tree`_. + + Describes directory metadata, with files stored in ``"tree"``. + + .. _Get a tree: + https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree + """ + + sha: str + url: str + tree: list[GitHubTree] + truncated: bool + + +class NpmVersion(TypedDict): + version: str + links: dict[Literal["self", "entrypoints", "stats"], str] + + +class NpmPackageMetadataResponse(TypedDict): + """ + Response from `Get package metadata`_. + + Using: + + headers={"Accept": "application/json"} + + .. _Get package metadata: + https://data.jsdelivr.com/v1/packages/npm/vega-datasets + """ + + type: str + name: str + tags: dict[Literal["canary", "next", "latest"], str] + versions: list[NpmVersion] + links: dict[Literal["stats"], str] + + +class ParsedTree(TypedDict): + file_name: str + name_js: str + name_py: str + suffix: str + size: int + url: str + ext_supported: bool + tag: str + + +class QueryTree(TypedDict, total=False): + file_name: str + name_js: Required[str] + name_py: str + suffix: str + size: int + url: str + ext_supported: bool + tag: str + + +class ParsedTreesResponse(TypedDict): + tag: str + url: str + tree: list[ParsedTree] + + +class GitHubRateLimit(TypedDict): + limit: int + used: int + remaining: int + reset: int + + +class ParsedRateLimit(GitHubRateLimit): + reset_time: time.struct_time + is_limited: bool + is_auth: bool + + +class GitHubRateLimitResources(TypedDict, total=False): + """ + A subset of response from `Get rate limit status for the authenticated user`_. + + .. 
_Get rate limit status for the authenticated user: + https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user + """ + + core: Required[GitHubRateLimit] + search: Required[GitHubRateLimit] + graphql: GitHubRateLimit + integration_manifest: GitHubRateLimit + code_search: GitHubRateLimit From bc86ca18101e9e688caec7ea5e66afc2810ef993 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:16:05 +0000 Subject: [PATCH 032/201] refactor: Move, rename `semver`-related tools --- tools/datasets/__init__.py | 55 ++++++------------------------------ tools/datasets/semver.py | 57 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 46 deletions(-) create mode 100644 tools/datasets/semver.py diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 2b87ded3b..ce61dbbe7 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -23,6 +23,7 @@ import polars as pl +from tools.datasets import semver from tools.datasets.models import ( GitHubRateLimitResources, GitHubTag, @@ -42,7 +43,6 @@ import sys from collections.abc import Mapping, MutableMapping from email.message import Message - from typing import TypeVar from urllib.request import OpenerDirector, Request if sys.version_info >= (3, 13): @@ -59,7 +59,6 @@ from typing_extensions import TypeAlias from tools.schemapi.utils import OneOrSeq - _Frame = TypeVar("_Frame", pl.DataFrame, pl.LazyFrame) _PathName: TypeAlias = Literal["dir", "tags", "trees"] WorkInProgress: TypeAlias = Any @@ -71,10 +70,6 @@ _NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" _SUB_DIR = "data" -_SEM_VER_FIELDS: tuple[ - Literal["major"], Literal["minor"], Literal["patch"], Literal["pre_release"] -] = "major", "minor", "patch", "pre_release" -_CANARY: Literal["--canary"] = "--canary" def _is_str(obj: Any) -> TypeIs[str]: @@ -350,7 +345,7 @@ def tags( self, n_head: int | None = None, *, warn_lower: bool = False ) -> pl.DataFrame: tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) - return pl.DataFrame(self.parse.tags(tags)).pipe(_with_sem_ver) + return pl.DataFrame(self.parse.tags(tags)).pipe(semver.with_columns) def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: """Retrieve directory info for a given version ``tag``.""" @@ -398,7 +393,7 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: f"Finished collection.\n" f"Writing {fresh_rows.height} new rows to {fp!s}" ) - return pl.concat((trees, fresh_rows)).pipe(_sort_sem_ver) + return pl.concat((trees, fresh_rows)).pipe(semver.sort) def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: limit = self.rate_limit(strict=True) @@ -421,7 +416,7 @@ def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: latest = ( self.tags(1).lazy().join(npm_tag_only, on="tag", how="inner").collect() ) - if latest.equals(prev.pipe(_sort_sem_ver).head(1).collect()): + if latest.equals(prev.pipe(semver.sort).head(1).collect()): print(f"Already up-to-date {fp!s}") return prev.collect() print(f"Refreshing {fp!s}") @@ -429,16 +424,14 @@ def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: tags = ( pl.concat((self.tags(), prev_eager), how="vertical") .unique("sha") - .pipe(_sort_sem_ver) + .pipe(semver.sort) ) print(f"Collected {tags.height - prev_eager.height} new tags") return tags def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: - rate_limit = 
self.rate_limit() - if rate_limit["is_limited"]: - raise NotImplementedError(rate_limit) - elif not isinstance(tags, Sequence): + rate_limit = self.rate_limit(strict=True) + if not isinstance(tags, Sequence): tags = tuple(tags) req = self.req n = len(tags) @@ -511,9 +504,9 @@ def tags(self) -> pl.DataFrame: versions = [ f"v{tag}" for v in content["versions"] - if (tag := v["version"]) and _CANARY not in tag + if (tag := v["version"]) and semver.CANARY not in tag ] - return pl.DataFrame({"tag": versions}).pipe(_with_sem_ver) + return pl.DataFrame({"tag": versions}).pipe(semver.with_columns) class Application: @@ -606,36 +599,6 @@ def _tag_from(s: str, /) -> str: raise TypeError(s) -def _with_sem_ver(df: pl.DataFrame, *, col_tag: str = "tag") -> pl.DataFrame: - """ - Extracts components of a `SemVer`_ string into sortable columns. - - .. _SemVer: - https://semver.org/#backusnaur-form-grammar-for-valid-semver-versions - """ - fields = pl.col(_SEM_VER_FIELDS) - pattern = r"""(?x) - v?(?[[:digit:]]*)\. - (?[[:digit:]]*)\. - (?[[:digit:]]*) - (\-(next)?(beta)?\.)? - (?[[:digit:]]*)? - """ - sem_ver = pl.col(col_tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) - return ( - df.lazy() - .with_columns(sem_ver) - .with_columns(pl.when(fields.str.len_chars() > 0).then(fields).cast(pl.Int64)) - .with_columns(is_pre_release=pl.col("pre_release").is_not_null()) - .collect() - ) - - -def _sort_sem_ver(frame: _Frame, /) -> _Frame: - """Sort ``frame``, displaying in descending release order.""" - return frame.sort(_SEM_VER_FIELDS, descending=True) - - # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago diff --git a/tools/datasets/semver.py b/tools/datasets/semver.py new file mode 100644 index 000000000..cb4c6c799 --- /dev/null +++ b/tools/datasets/semver.py @@ -0,0 +1,57 @@ +""" +Parsing/transforming semantic versioning strings. + +.. _semantic versioning: + https://semver.org/ +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal + +import polars as pl + +if TYPE_CHECKING: + from typing import TypeVar + + _Frame = TypeVar("_Frame", pl.DataFrame, pl.LazyFrame) + +__all__ = ["CANARY", "sort", "with_columns"] + +_SEM_VER_FIELDS: tuple[ + Literal["major"], Literal["minor"], Literal["patch"], Literal["pre_release"] +] = "major", "minor", "patch", "pre_release" +CANARY: Literal["--canary"] = "--canary" + + +def with_columns(frame: _Frame, /, *, col_tag: str = "tag") -> _Frame: + """ + Extracts components of a `SemVer`_ string into sortable columns. + + .. _SemVer: + https://semver.org/#backusnaur-form-grammar-for-valid-semver-versions + """ + fields = pl.col(_SEM_VER_FIELDS) + pattern = r"""(?x) + v?(?[[:digit:]]*)\. + (?[[:digit:]]*)\. + (?[[:digit:]]*) + (\-(next)?(beta)?\.)? + (?[[:digit:]]*)? 
+ """ + sem_ver = pl.col(col_tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) + ldf = ( + frame.lazy() + .with_columns(sem_ver) + .with_columns(pl.when(fields.str.len_chars() > 0).then(fields).cast(pl.Int64)) + .with_columns(is_pre_release=pl.col("pre_release").is_not_null()) + ) + if isinstance(frame, pl.DataFrame): + return ldf.collect() + else: + return ldf + + +def sort(frame: _Frame, /) -> _Frame: + """Sort ``frame``, displaying in descending release order.""" + return frame.sort(_SEM_VER_FIELDS, descending=True) From a6f56452df200ef2049aa3203e79f1d70005a198 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:19:13 +0000 Subject: [PATCH 033/201] refactor: Remove `write_schema` from `_Npm`, `_GitHub` Handled in `Application` now --- tools/datasets/__init__.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index ce61dbbe7..e26472c2f 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -296,14 +296,10 @@ def __init__( name_tags: str, name_trees: str, *, - write_schema: bool, base_url: LiteralString = "https://api.github.com/", org: LiteralString = "vega", package: LiteralString = "vega-datasets", ) -> None: - # When ``write_schema``, addtional ``...-schema.json`` file(s) are produced - # that describes column types - in a non-binary format. - self._write_schema: bool = write_schema output_dir.mkdir(exist_ok=True) self._paths: dict[_PathName, Path] = { "dir": output_dir, @@ -461,13 +457,11 @@ def __init__( output_dir: Path, name_tags: str, *, - write_schema: bool, jsdelivr: Literal["jsdelivr"] = "jsdelivr", npm: Literal["npm"] = "npm", package: LiteralString = "vega-datasets", jsdelivr_version: LiteralString = "v1", ) -> None: - self._write_schema: bool = write_schema output_dir.mkdir(exist_ok=True) self._paths: dict[Literal["tags"], Path] = { "tags": output_dir / f"{name_tags}.parquet" @@ -533,18 +527,9 @@ def __init__( kwds_npm = kwds_npm or {} self._write_schema: bool = write_schema self._github: _GitHub = _GitHub( - output_dir, - name_tags=tags_gh, - name_trees=trees_gh, - write_schema=write_schema, - **kwds_gh, - ) - self._npm: _Npm = _Npm( - output_dir, - name_tags=tags_npm, - write_schema=write_schema, - **kwds_npm, + output_dir, name_tags=tags_gh, name_trees=trees_gh, **kwds_gh ) + self._npm: _Npm = _Npm(output_dir, name_tags=tags_npm, **kwds_npm) @property def github(self) -> _GitHub: From 07a8342c95544fbbacff808f8d4d3868a1215a2c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 16:00:12 +0000 Subject: [PATCH 034/201] refactor: Rename, split `_Npm`, `_GitHub` into own modules `tools.datasets.npm` will later be performing the requests that are in `Dataset.__call__` currently --- tools/datasets/__init__.py | 497 +------------------------------------ tools/datasets/github.py | 455 +++++++++++++++++++++++++++++++++ tools/datasets/npm.py | 76 ++++++ 3 files changed, 541 insertions(+), 487 deletions(-) create mode 100644 tools/datasets/github.py create mode 100644 tools/datasets/npm.py diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index e26472c2f..bcbe725a1 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -8,42 +8,21 @@ from __future__ import annotations import json -import os -import random import tempfile -import time -import urllib.request -import warnings -from 
collections.abc import Iterable, Iterator, Sequence from functools import cached_property, partial -from itertools import islice from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, Callable, ClassVar, Literal, cast, get_args +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, get_args from urllib.request import urlopen import polars as pl -from tools.datasets import semver -from tools.datasets.models import ( - GitHubRateLimitResources, - GitHubTag, - GitHubTree, - GitHubTreesResponse, - GitHubUrl, - NpmPackageMetadataResponse, - NpmUrl, - ParsedRateLimit, - ParsedTag, - ParsedTree, - QueryTree, - ReParsedTag, -) +from tools.datasets.github import GitHub +from tools.datasets.models import QueryTree +from tools.datasets.npm import Npm if TYPE_CHECKING: import sys - from collections.abc import Mapping, MutableMapping - from email.message import Message - from urllib.request import OpenerDirector, Request + from collections.abc import Mapping if sys.version_info >= (3, 13): from typing import TypeIs @@ -57,450 +36,10 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias - from tools.schemapi.utils import OneOrSeq - _PathName: TypeAlias = Literal["dir", "tags", "trees"] WorkInProgress: TypeAlias = Any - -_ItemSlice: TypeAlias = ( - "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" -) -"""Query result scalar selection.""" - -_NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" -_SUB_DIR = "data" - - -def _is_str(obj: Any) -> TypeIs[str]: - return isinstance(obj, str) - - -class _ErrorHandler(urllib.request.BaseHandler): - """ - Adds `rate limit`_ info to a forbidden error. - - .. _rate limit: - https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28 - """ - - def http_error_default( - self, req: Request, fp: IO[bytes] | None, code: int, msg: str, hdrs: Message - ): - if code == 403 and (reset := hdrs.get("X-RateLimit-Reset", None)): - limit = hdrs.get("X-RateLimit-Limit", "") - remaining = hdrs.get("X-RateLimit-Remaining", "") - msg = ( - f"{msg}\n\nFailed to balance rate limit.\n" - f"{limit=}, {remaining=}\n" - f"Reset: {time.localtime(int(reset))!r}" - ) - raise urllib.request.HTTPError(req.full_url, code, msg, hdrs, fp) - - -class _GitHubRequestNamespace: - """ - Fetching resources from the `GitHub API`_. - - .. 
_GitHub API: - https://docs.github.com/en/rest/about-the-rest-api/about-the-rest-api?apiVersion=2022-11-28 - """ - - _ENV_VAR: LiteralString = "VEGA_GITHUB_TOKEN" - _TAGS_MAX_PAGE: Literal[100] = 100 - _VERSION: LiteralString = "2022-11-28" - _UNAUTH_RATE_LIMIT: Literal[60] = 60 - _TAGS_COST: Literal[1] = 1 - _TREES_COST: Literal[2] = 2 - _UNAUTH_DELAY: Literal[5] = 5 - _AUTH_DELAY: Literal[1] = 1 - _UNAUTH_TREES_LIMIT: Literal[10] = 10 - - def __init__(self, gh: _GitHub, /) -> None: - self._gh = gh - - @property - def url(self) -> GitHubUrl: - return self._gh.url - - def rate_limit(self) -> GitHubRateLimitResources: - """https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user.""" - with self._gh._opener.open(self._request(self.url.RATE)) as response: - content: GitHubRateLimitResources = json.load(response)["resources"] - return content - - def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: - """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" - if n < 1 or n > self._TAGS_MAX_PAGE: - raise ValueError(n) - req = self._request(f"{self.url.TAGS}?per_page={n}") - with self._gh._opener.open(req) as response: - content: list[GitHubTag] = json.load(response) - if warn_lower and len(content) < n: - earliest = response[-1]["name"] - n_response = len(content) - msg = f"Requested {n=} tags, but got {n_response}\n" f"{earliest=}" - warnings.warn(msg, stacklevel=3) - return content - - def trees(self, tag: str | ParsedTag, /) -> GitHubTreesResponse: - """ - For a given ``tag``, perform **2x requests** to get directory metadata. - - Returns response unchanged - but with annotations. - """ - if _is_str(tag): - url = tag if tag.startswith(self.url.TREES) else f"{self.url.TREES}{tag}" - else: - url = tag["trees_url"] - with self._gh._opener.open(self._request(url)) as response: - content: GitHubTreesResponse = json.load(response) - query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) - if data_url := next(query, None): - with self._gh._opener.open(self._request(data_url)) as response: - data_dir: GitHubTreesResponse = json.load(response) - return data_dir - else: - raise FileNotFoundError - - def _request(self, url: str, /, *, raw: bool = False) -> Request: - """ - Wrap a request url with a `personal access token`_ - if set as an env var. - - By default the endpoint returns json, specify raw to get blob data. - See `Media types`_. - - .. _personal access token: - https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens - .. _Media types: - https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types - """ - headers: MutableMapping[str, str] = {"X-GitHub-Api-Version": self._VERSION} - if tok := os.environ.get(self._ENV_VAR): - headers["Authorization"] = ( - tok if tok.startswith("Bearer ") else f"Bearer {tok}" - ) - if raw: - headers["Accept"] = "application/vnd.github.raw+json" - return urllib.request.Request(url, headers=headers) - - -class _GitHubParseNamespace: - """ - Transform responses into intermediate representations. 
- - Where relevant: - - Adding cheap to compute metadata - - Dropping information that we don't need for the task - """ - - def __init__(self, gh: _GitHub, /) -> None: - self._gh = gh - - @property - def url(self) -> GitHubUrl: - return self._gh.url - - def rate_limit(self, rate_limit: GitHubRateLimitResources, /) -> ParsedRateLimit: - core = rate_limit["core"] - reset = core["reset"] - return ParsedRateLimit( - **core, - reset_time=time.localtime(reset), - is_limited=core["remaining"] == 0, - is_auth=core["limit"] > self._gh.req._UNAUTH_RATE_LIMIT, - ) - - def tag(self, tag: GitHubTag, /) -> ParsedTag: - sha = tag["commit"]["sha"] - return ParsedTag(tag=tag["name"], sha=sha, trees_url=f"{self.url.TREES}{sha}") - - def tags(self, tags: list[GitHubTag], /) -> list[ParsedTag]: - return [self.tag(t) for t in tags] - - def tree(self, tree: GitHubTree, tag: str, /) -> ParsedTree: - """For a single tree (file) convert to an IR with only relevant properties.""" - path = Path(tree["path"]) - return ParsedTree( - file_name=path.name, - name_js=path.stem, - name_py=_js_to_py(path.stem), - suffix=path.suffix, - size=tree["size"], - url=tree["url"], - ext_supported=is_ext_supported(path.suffix), - tag=tag, - ) - - def trees(self, tree: GitHubTreesResponse, /, tag: str) -> list[ParsedTree]: - """For a tree response (directory of files) convert to an IR with only relevant properties.""" - return [self.tree(t, tag) for t in tree["tree"]] - - -class _GitHubQueryNamespace: - """**WIP** Interfacing with the cached metadata.""" - - def __init__(self, gh: _GitHub, /) -> None: - self._gh = gh - - @property - def paths(self) -> dict[_PathName, Path]: - return self._gh._paths - - def url_from( - self, - *predicates: OneOrSeq[str | pl.Expr], - item: _ItemSlice = (0, "url_npm"), - **constraints: Any, - ) -> str: - """Querying multi-version trees metadata for `npm` url to fetch.""" - fp = self.paths["trees"] - if fp.suffix != ".parquet": - raise NotImplementedError(fp.suffix) - items = pl.scan_parquet(fp).filter(*predicates, **constraints).collect() - if items.is_empty(): - msg = f"Found no results for:\n" f"{predicates!r}\n{constraints!r}" - raise NotImplementedError(msg) - r = items.item(*item) - if _is_str(r): - return r - else: - msg = f"Expected 'str' but got {type(r).__name__!r} from {r!r}." - raise TypeError(msg) - - -class _GitHub: - """ - Primary interface with the GitHub API. - - Maintains up-to-date metadata, describing **every** available dataset across **all known** releases. - - - Uses `tags`_, `trees`_, `rate_limit`_ endpoints. - - Organizes distinct groups of operations into property accessor namespaces. - - - .. _tags: - https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags - .. _trees: - https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree - .. 
_rate_limit: - https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user - - """ - - _opener: ClassVar[OpenerDirector] = urllib.request.build_opener(_ErrorHandler) - - def __init__( - self, - output_dir: Path, - name_tags: str, - name_trees: str, - *, - base_url: LiteralString = "https://api.github.com/", - org: LiteralString = "vega", - package: LiteralString = "vega-datasets", - ) -> None: - output_dir.mkdir(exist_ok=True) - self._paths: dict[_PathName, Path] = { - "dir": output_dir, - "tags": output_dir / f"{name_tags}.parquet", - "trees": output_dir / f"{name_trees}.parquet", - } - repo = f"{base_url}repos/{org}/{package}/" - self._url = GitHubUrl( - BASE=base_url, - RATE=f"{base_url}rate_limit", - REPO=repo, - TAGS=f"{repo}tags", - TREES=f"{repo}git/trees/", - ) - - @property - def req(self) -> _GitHubRequestNamespace: - return _GitHubRequestNamespace(self) - - @property - def parse(self) -> _GitHubParseNamespace: - return _GitHubParseNamespace(self) - - @property - def query(self) -> _GitHubQueryNamespace: - return _GitHubQueryNamespace(self) - - @property - def url(self) -> GitHubUrl: - return self._url - - def rate_limit(self, *, strict: bool = False) -> ParsedRateLimit: - limit = self.parse.rate_limit(self.req.rate_limit()) - if strict and limit["is_limited"]: - raise NotImplementedError(limit) - return limit - - def tags( - self, n_head: int | None = None, *, warn_lower: bool = False - ) -> pl.DataFrame: - tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) - return pl.DataFrame(self.parse.tags(tags)).pipe(semver.with_columns) - - def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: - """Retrieve directory info for a given version ``tag``.""" - trees = self.req.trees(tag) - tag_v = _tag_from(tag) if _is_str(tag) else tag["tag"] - parsed = self.parse.trees(trees, tag=tag_v) - df = ( - pl.DataFrame(parsed) - .lazy() - .rename({"url": "url_github"}) - .with_columns(name_collision=pl.col("name_py").is_duplicated()) - .with_columns( - url_npm=pl.concat_str( - pl.lit(_NPM_BASE_URL), - pl.col("tag"), - pl.lit(f"/{_SUB_DIR}/"), - pl.col("file_name"), - ) - ) - .collect() - ) - return df.select(*sorted(df.columns)) - - def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: - """ - Use known tags to discover and update missing trees metadata. - - Aims to stay well-within API rate limits, both for authenticated ad unauthenticated users. 
- """ - rate_limit = self.rate_limit(strict=True) - fp = self._paths["trees"] - trees = pl.read_parquet(fp) - missing_trees = gh_tags.join( - trees.select(pl.col("tag").unique()), on="tag", how="anti" - ) - if missing_trees.is_empty(): - print(f"Already up-to-date {fp!s}") - return trees - else: - stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT - it = islice(missing_trees.iter_rows(named=True), stop) - missing = cast("Iterator[ReParsedTag]", it) - fresh_rows = self._trees_batched(missing) - print( - f"Finished collection.\n" - f"Writing {fresh_rows.height} new rows to {fp!s}" - ) - return pl.concat((trees, fresh_rows)).pipe(semver.sort) - - def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: - limit = self.rate_limit(strict=True) - npm_tag_only = npm_tags.lazy().select("tag") - fp = self._paths["tags"] - if not limit["is_auth"] and limit["remaining"] <= self.req._TAGS_COST: - return ( - pl.scan_parquet(fp).join(npm_tag_only, on="tag", how="inner").collect() - ) - elif not fp.exists(): - print(f"Initializing {fp!s}") - tags = ( - self.tags().lazy().join(npm_tag_only, on="tag", how="inner").collect() - ) - print(f"Collected {tags.height} new tags") - return tags - else: - print("Checking for new tags") - prev = pl.scan_parquet(fp) - latest = ( - self.tags(1).lazy().join(npm_tag_only, on="tag", how="inner").collect() - ) - if latest.equals(prev.pipe(semver.sort).head(1).collect()): - print(f"Already up-to-date {fp!s}") - return prev.collect() - print(f"Refreshing {fp!s}") - prev_eager = prev.collect() - tags = ( - pl.concat((self.tags(), prev_eager), how="vertical") - .unique("sha") - .pipe(semver.sort) - ) - print(f"Collected {tags.height - prev_eager.height} new tags") - return tags - - def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: - rate_limit = self.rate_limit(strict=True) - if not isinstance(tags, Sequence): - tags = tuple(tags) - req = self.req - n = len(tags) - cost = req._TREES_COST * n - if rate_limit["remaining"] < cost: - raise NotImplementedError(rate_limit, cost) - delay_secs = req._AUTH_DELAY if rate_limit["is_auth"] else req._UNAUTH_DELAY - print( - f"Collecting metadata for {n} missing releases.\n" - f"Using {delay_secs=} between requests ..." - ) - dfs: list[pl.DataFrame] = [] - for tag in tags: - time.sleep(delay_secs + random.triangular()) - dfs.append(self.trees(tag)) - return pl.concat(dfs) - - -####################################################################################### - - -class _Npm: - """https://www.jsdelivr.com/docs/data.jsdelivr.com#overview.""" - - def __init__( - self, - output_dir: Path, - name_tags: str, - *, - jsdelivr: Literal["jsdelivr"] = "jsdelivr", - npm: Literal["npm"] = "npm", - package: LiteralString = "vega-datasets", - jsdelivr_version: LiteralString = "v1", - ) -> None: - output_dir.mkdir(exist_ok=True) - self._paths: dict[Literal["tags"], Path] = { - "tags": output_dir / f"{name_tags}.parquet" - } - self._url: NpmUrl = NpmUrl( - CDN=f"https://cdn.{jsdelivr}.net/{npm}/{package}@", - TAGS=f"https://data.{jsdelivr}.com/{jsdelivr_version}/packages/{npm}/{package}", - ) - - @property - def url(self) -> NpmUrl: - return self._url - - def tags(self) -> pl.DataFrame: - """ - Request, parse tags from `Get package metadata`_. - - Notes - ----- - - Ignores canary releases - - ``npm`` can accept either, but this endpoint returns without "v": - - {tag} - v{tag} - - .. 
_Get package metadata: - https://www.jsdelivr.com/docs/data.jsdelivr.com#get-/v1/packages/npm/-package- - """ - req = urllib.request.Request( - self.url.TAGS, headers={"Accept": "application/json"} - ) - with urllib.request.urlopen(req) as response: - content: NpmPackageMetadataResponse = json.load(response) - versions = [ - f"v{tag}" - for v in content["versions"] - if (tag := v["version"]) and semver.CANARY not in tag - ] - return pl.DataFrame({"tag": versions}).pipe(semver.with_columns) +__all__ = ["app", "data"] class Application: @@ -526,17 +65,17 @@ def __init__( kwds_gh = kwds_gh or {} kwds_npm = kwds_npm or {} self._write_schema: bool = write_schema - self._github: _GitHub = _GitHub( + self._github: GitHub = GitHub( output_dir, name_tags=tags_gh, name_trees=trees_gh, **kwds_gh ) - self._npm: _Npm = _Npm(output_dir, name_tags=tags_npm, **kwds_npm) + self._npm: Npm = Npm(output_dir, name_tags=tags_npm, **kwds_npm) @property - def github(self) -> _GitHub: + def github(self) -> GitHub: return self._github @property - def npm(self) -> _Npm: + def npm(self) -> Npm: return self._npm def refresh(self) -> pl.DataFrame: @@ -568,22 +107,6 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None app = Application(Path(__file__).parent / "_metadata", write_schema=True) -def _tag_from(s: str, /) -> str: - # - Actual tag - # - Trees url (using ref name) - # - npm url (works w/o the `v` prefix) - trees_url = app.github.url.TREES - if s.startswith("v"): - return s - elif s.startswith(trees_url): - return s.replace(trees_url, "") - elif s.startswith(_NPM_BASE_URL): - s, _ = s.replace(_NPM_BASE_URL, "").split("/") - return s if s.startswith("v") else f"v{s}" - else: - raise TypeError(s) - - # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. 
_OLD_SOURCE_TAG = "v1.29.0" # 5 years ago diff --git a/tools/datasets/github.py b/tools/datasets/github.py new file mode 100644 index 000000000..e245b91b1 --- /dev/null +++ b/tools/datasets/github.py @@ -0,0 +1,455 @@ +from __future__ import annotations + +import json +import os +import random +import time +import urllib.request +import warnings +from collections.abc import Iterable, Iterator, Sequence +from itertools import islice +from pathlib import Path +from typing import IO, TYPE_CHECKING, Any, ClassVar, Literal, cast + +import polars as pl + +from tools.datasets import semver +from tools.datasets.models import ( + GitHubRateLimitResources, + GitHubTag, + GitHubTree, + GitHubTreesResponse, + GitHubUrl, + ParsedRateLimit, + ParsedTag, + ParsedTree, +) + +if TYPE_CHECKING: + import sys + from collections.abc import MutableMapping + from email.message import Message + from urllib.request import OpenerDirector, Request + + from tools.datasets import ExtSupported + from tools.datasets.models import ReParsedTag + from tools.schemapi.utils import OneOrSeq + + if sys.version_info >= (3, 13): + from typing import TypeIs + else: + from typing_extensions import TypeIs + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + + _PathName: TypeAlias = Literal["dir", "tags", "trees"] + +__all__ = ["GitHub"] + +_ItemSlice: TypeAlias = ( + "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" +) +"""Query result scalar selection.""" + +# TODO: Work on where these should live/be accessed +_NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" +_SUB_DIR = "data" + + +def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: + return suffix in {".csv", ".json", ".tsv", ".arrow"} + + +def _is_str(obj: Any) -> TypeIs[str]: + return isinstance(obj, str) + + +class _ErrorHandler(urllib.request.BaseHandler): + """ + Adds `rate limit`_ info to a forbidden error. + + .. _rate limit: + https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28 + """ + + def http_error_default( + self, req: Request, fp: IO[bytes] | None, code: int, msg: str, hdrs: Message + ): + if code == 403 and (reset := hdrs.get("X-RateLimit-Reset", None)): + limit = hdrs.get("X-RateLimit-Limit", "") + remaining = hdrs.get("X-RateLimit-Remaining", "") + msg = ( + f"{msg}\n\nFailed to balance rate limit.\n" + f"{limit=}, {remaining=}\n" + f"Reset: {time.localtime(int(reset))!r}" + ) + raise urllib.request.HTTPError(req.full_url, code, msg, hdrs, fp) + + +class _GitHubRequestNamespace: + """ + Fetching resources from the `GitHub API`_. + + .. 
_GitHub API: + https://docs.github.com/en/rest/about-the-rest-api/about-the-rest-api?apiVersion=2022-11-28 + """ + + _ENV_VAR: LiteralString = "VEGA_GITHUB_TOKEN" + _TAGS_MAX_PAGE: Literal[100] = 100 + _VERSION: LiteralString = "2022-11-28" + _UNAUTH_RATE_LIMIT: Literal[60] = 60 + _TAGS_COST: Literal[1] = 1 + _TREES_COST: Literal[2] = 2 + _UNAUTH_DELAY: Literal[5] = 5 + _AUTH_DELAY: Literal[1] = 1 + _UNAUTH_TREES_LIMIT: Literal[10] = 10 + + def __init__(self, gh: GitHub, /) -> None: + self._gh = gh + + @property + def url(self) -> GitHubUrl: + return self._gh.url + + def rate_limit(self) -> GitHubRateLimitResources: + """https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user.""" + with self._gh._opener.open(self._request(self.url.RATE)) as response: + content: GitHubRateLimitResources = json.load(response)["resources"] + return content + + def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: + """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" + if n < 1 or n > self._TAGS_MAX_PAGE: + raise ValueError(n) + req = self._request(f"{self.url.TAGS}?per_page={n}") + with self._gh._opener.open(req) as response: + content: list[GitHubTag] = json.load(response) + if warn_lower and len(content) < n: + earliest = response[-1]["name"] + n_response = len(content) + msg = f"Requested {n=} tags, but got {n_response}\n" f"{earliest=}" + warnings.warn(msg, stacklevel=3) + return content + + def trees(self, tag: str | ParsedTag, /) -> GitHubTreesResponse: + """ + For a given ``tag``, perform **2x requests** to get directory metadata. + + Returns response unchanged - but with annotations. + """ + if _is_str(tag): + url = tag if tag.startswith(self.url.TREES) else f"{self.url.TREES}{tag}" + else: + url = tag["trees_url"] + with self._gh._opener.open(self._request(url)) as response: + content: GitHubTreesResponse = json.load(response) + query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) + if data_url := next(query, None): + with self._gh._opener.open(self._request(data_url)) as response: + data_dir: GitHubTreesResponse = json.load(response) + return data_dir + else: + raise FileNotFoundError + + def _request(self, url: str, /, *, raw: bool = False) -> Request: + """ + Wrap a request url with a `personal access token`_ - if set as an env var. + + By default the endpoint returns json, specify raw to get blob data. + See `Media types`_. + + .. _personal access token: + https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens + .. _Media types: + https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types + """ + headers: MutableMapping[str, str] = {"X-GitHub-Api-Version": self._VERSION} + if tok := os.environ.get(self._ENV_VAR): + headers["Authorization"] = ( + tok if tok.startswith("Bearer ") else f"Bearer {tok}" + ) + if raw: + headers["Accept"] = "application/vnd.github.raw+json" + return urllib.request.Request(url, headers=headers) + + +class _GitHubParseNamespace: + """ + Transform responses into intermediate representations. 
+ + Where relevant: + - Adding cheap to compute metadata + - Dropping information that we don't need for the task + """ + + def __init__(self, gh: GitHub, /) -> None: + self._gh = gh + + @property + def url(self) -> GitHubUrl: + return self._gh.url + + def rate_limit(self, rate_limit: GitHubRateLimitResources, /) -> ParsedRateLimit: + core = rate_limit["core"] + reset = core["reset"] + return ParsedRateLimit( + **core, + reset_time=time.localtime(reset), + is_limited=core["remaining"] == 0, + is_auth=core["limit"] > self._gh.req._UNAUTH_RATE_LIMIT, + ) + + def tag(self, tag: GitHubTag, /) -> ParsedTag: + sha = tag["commit"]["sha"] + return ParsedTag(tag=tag["name"], sha=sha, trees_url=f"{self.url.TREES}{sha}") + + def tags(self, tags: list[GitHubTag], /) -> list[ParsedTag]: + return [self.tag(t) for t in tags] + + def tree(self, tree: GitHubTree, tag: str, /) -> ParsedTree: + """For a single tree (file) convert to an IR with only relevant properties.""" + path = Path(tree["path"]) + return ParsedTree( + file_name=path.name, + name_js=path.stem, + name_py=path.stem.replace("-", "_"), + suffix=path.suffix, + size=tree["size"], + url=tree["url"], + ext_supported=is_ext_supported(path.suffix), + tag=tag, + ) + + def trees(self, tree: GitHubTreesResponse, /, tag: str) -> list[ParsedTree]: + """For a tree response (directory of files) convert to an IR with only relevant properties.""" + return [self.tree(t, tag) for t in tree["tree"]] + + def tag_from_str(self, s: str, /) -> str: + # - Actual tag + # - Trees url (using ref name) + # - npm url (works w/o the `v` prefix) + trees_url = self.url.TREES + if s.startswith("v"): + return s + elif s.startswith(trees_url): + return s.replace(trees_url, "") + elif s.startswith(_NPM_BASE_URL): + s, _ = s.replace(_NPM_BASE_URL, "").split("/") + return s if s.startswith("v") else f"v{s}" + else: + raise TypeError(s) + + +class _GitHubQueryNamespace: + """**WIP** Interfacing with the cached metadata.""" + + def __init__(self, gh: GitHub, /) -> None: + self._gh = gh + + @property + def paths(self) -> dict[_PathName, Path]: + return self._gh._paths + + def url_from( + self, + *predicates: OneOrSeq[str | pl.Expr], + item: _ItemSlice = (0, "url_npm"), + **constraints: Any, + ) -> str: + """Querying multi-version trees metadata for `npm` url to fetch.""" + fp = self.paths["trees"] + if fp.suffix != ".parquet": + raise NotImplementedError(fp.suffix) + items = pl.scan_parquet(fp).filter(*predicates, **constraints).collect() + if items.is_empty(): + msg = f"Found no results for:\n" f"{predicates!r}\n{constraints!r}" + raise NotImplementedError(msg) + r = items.item(*item) + if _is_str(r): + return r + else: + msg = f"Expected 'str' but got {type(r).__name__!r} from {r!r}." + raise TypeError(msg) + + +class GitHub: + """ + Primary interface with the GitHub API. + + Maintains up-to-date metadata, describing **every** available dataset across **all known** releases. + + - Uses `tags`_, `trees`_, `rate_limit`_ endpoints. + - Organizes distinct groups of operations into property accessor namespaces. + + + .. _tags: + https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags + .. _trees: + https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree + .. 
_rate_limit: + https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user + + """ + + _opener: ClassVar[OpenerDirector] = urllib.request.build_opener(_ErrorHandler) + + def __init__( + self, + output_dir: Path, + name_tags: str, + name_trees: str, + *, + base_url: LiteralString = "https://api.github.com/", + org: LiteralString = "vega", + package: LiteralString = "vega-datasets", + ) -> None: + output_dir.mkdir(exist_ok=True) + self._paths: dict[_PathName, Path] = { + "dir": output_dir, + "tags": output_dir / f"{name_tags}.parquet", + "trees": output_dir / f"{name_trees}.parquet", + } + repo = f"{base_url}repos/{org}/{package}/" + self._url = GitHubUrl( + BASE=base_url, + RATE=f"{base_url}rate_limit", + REPO=repo, + TAGS=f"{repo}tags", + TREES=f"{repo}git/trees/", + ) + + @property + def req(self) -> _GitHubRequestNamespace: + return _GitHubRequestNamespace(self) + + @property + def parse(self) -> _GitHubParseNamespace: + return _GitHubParseNamespace(self) + + @property + def query(self) -> _GitHubQueryNamespace: + return _GitHubQueryNamespace(self) + + @property + def url(self) -> GitHubUrl: + return self._url + + def rate_limit(self, *, strict: bool = False) -> ParsedRateLimit: + limit = self.parse.rate_limit(self.req.rate_limit()) + if strict and limit["is_limited"]: + raise NotImplementedError(limit) + return limit + + def tags( + self, n_head: int | None = None, *, warn_lower: bool = False + ) -> pl.DataFrame: + tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) + return pl.DataFrame(self.parse.tags(tags)).pipe(semver.with_columns) + + def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: + """Retrieve directory info for a given version ``tag``.""" + trees = self.req.trees(tag) + tag_v = self.parse.tag_from_str(tag) if _is_str(tag) else tag["tag"] + parsed = self.parse.trees(trees, tag=tag_v) + df = ( + pl.DataFrame(parsed) + .lazy() + .rename({"url": "url_github"}) + .with_columns(name_collision=pl.col("name_py").is_duplicated()) + .with_columns( + url_npm=pl.concat_str( + pl.lit(_NPM_BASE_URL), + pl.col("tag"), + pl.lit(f"/{_SUB_DIR}/"), + pl.col("file_name"), + ) + ) + .collect() + ) + return df.select(*sorted(df.columns)) + + def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: + """ + Use known tags to discover and update missing trees metadata. + + Aims to stay well-within API rate limits, both for authenticated ad unauthenticated users. 
+ """ + rate_limit = self.rate_limit(strict=True) + fp = self._paths["trees"] + trees = pl.read_parquet(fp) + missing_trees = gh_tags.join( + trees.select(pl.col("tag").unique()), on="tag", how="anti" + ) + if missing_trees.is_empty(): + print(f"Already up-to-date {fp!s}") + return trees + else: + stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT + it = islice(missing_trees.iter_rows(named=True), stop) + missing = cast("Iterator[ReParsedTag]", it) + fresh_rows = self._trees_batched(missing) + print( + f"Finished collection.\n" + f"Writing {fresh_rows.height} new rows to {fp!s}" + ) + return pl.concat((trees, fresh_rows)).pipe(semver.sort) + + def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: + limit = self.rate_limit(strict=True) + npm_tag_only = npm_tags.lazy().select("tag") + fp = self._paths["tags"] + if not limit["is_auth"] and limit["remaining"] <= self.req._TAGS_COST: + return ( + pl.scan_parquet(fp).join(npm_tag_only, on="tag", how="inner").collect() + ) + elif not fp.exists(): + print(f"Initializing {fp!s}") + tags = ( + self.tags().lazy().join(npm_tag_only, on="tag", how="inner").collect() + ) + print(f"Collected {tags.height} new tags") + return tags + else: + print("Checking for new tags") + prev = pl.scan_parquet(fp) + latest = ( + self.tags(1).lazy().join(npm_tag_only, on="tag", how="inner").collect() + ) + if latest.equals(prev.pipe(semver.sort).head(1).collect()): + print(f"Already up-to-date {fp!s}") + return prev.collect() + print(f"Refreshing {fp!s}") + prev_eager = prev.collect() + tags = ( + pl.concat((self.tags(), prev_eager), how="vertical") + .unique("sha") + .pipe(semver.sort) + ) + print(f"Collected {tags.height - prev_eager.height} new tags") + return tags + + def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: + rate_limit = self.rate_limit(strict=True) + if not isinstance(tags, Sequence): + tags = tuple(tags) + req = self.req + n = len(tags) + cost = req._TREES_COST * n + if rate_limit["remaining"] < cost: + raise NotImplementedError(rate_limit, cost) + delay_secs = req._AUTH_DELAY if rate_limit["is_auth"] else req._UNAUTH_DELAY + print( + f"Collecting metadata for {n} missing releases.\n" + f"Using {delay_secs=} between requests ..." 
+ ) + dfs: list[pl.DataFrame] = [] + for tag in tags: + time.sleep(delay_secs + random.triangular()) + dfs.append(self.trees(tag)) + return pl.concat(dfs) diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py new file mode 100644 index 000000000..bdc20f83b --- /dev/null +++ b/tools/datasets/npm.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +import json +import urllib.request +from typing import TYPE_CHECKING, Literal + +import polars as pl + +from tools.datasets import semver +from tools.datasets.models import NpmUrl + +if TYPE_CHECKING: + import sys + from pathlib import Path + + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + from tools.datasets.models import NpmPackageMetadataResponse + +__all__ = ["Npm"] + + +class Npm: + """https://www.jsdelivr.com/docs/data.jsdelivr.com#overview.""" + + def __init__( + self, + output_dir: Path, + name_tags: str, + *, + jsdelivr: Literal["jsdelivr"] = "jsdelivr", + npm: Literal["npm"] = "npm", + package: LiteralString = "vega-datasets", + jsdelivr_version: LiteralString = "v1", + ) -> None: + output_dir.mkdir(exist_ok=True) + self._paths: dict[Literal["tags"], Path] = { + "tags": output_dir / f"{name_tags}.parquet" + } + self._url: NpmUrl = NpmUrl( + CDN=f"https://cdn.{jsdelivr}.net/{npm}/{package}@", + TAGS=f"https://data.{jsdelivr}.com/{jsdelivr_version}/packages/{npm}/{package}", + ) + + @property + def url(self) -> NpmUrl: + return self._url + + def tags(self) -> pl.DataFrame: + """ + Request, parse tags from `Get package metadata`_. + + Notes + ----- + - Ignores canary releases + - ``npm`` can accept either, but this endpoint returns without "v": + + {tag} + v{tag} + + .. _Get package metadata: + https://www.jsdelivr.com/docs/data.jsdelivr.com#get-/v1/packages/npm/-package- + """ + req = urllib.request.Request( + self.url.TAGS, headers={"Accept": "application/json"} + ) + with urllib.request.urlopen(req) as response: + content: NpmPackageMetadataResponse = json.load(response) + versions = [ + f"v{tag}" + for v in content["versions"] + if (tag := v["version"]) and semver.CANARY not in tag + ] + return pl.DataFrame({"tag": versions}).pipe(semver.with_columns) From b89e6dc31691cdeb2c33811c27db92c70ded7940 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 16:21:28 +0000 Subject: [PATCH 035/201] refactor: Move `DataLoader.__call__` -> `DataLoader.url()` -`data.name()` -> `data(name)` - `data.name.url` -> `data.url(name)` --- tools/datasets/__init__.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index bcbe725a1..c9114aa01 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -284,20 +284,13 @@ def __getattr__(self, name: str) -> Dataset: def __dir__(self) -> list[str]: return self.list_datasets() - def __call__( + def url( self, name: str, ext: ExtSupported | None = None, /, tag: LiteralString | Literal["latest"] | None = None, - ) -> WorkInProgress: - """ - **WIP** Will be using this *instead of* attribute access. 
- - - Original supports this as well - - Will only be using the actual (js_name) - - Some have hyphens, others underscores - """ + ) -> str: constraints: dict[Literal["tag", "suffix"], str] = {} if tag == "latest": raise NotImplementedError(tag) @@ -318,5 +311,21 @@ def __call__( q = QueryTree(name_js=name, **constraints) # type: ignore[typeddict-item] return app.github.query.url_from(**q) + def __call__( + self, + name: str, + ext: ExtSupported | None = None, + /, + tag: LiteralString | Literal["latest"] | None = None, + ) -> WorkInProgress: + """ + **WIP** Will be using this *instead of* attribute access. + + - Original supports this as well + - Will only be using the actual (js_name) + - Some have hyphens, others underscores + """ + return self.url(name, ext, tag=tag) + data = DataLoader() From 7b0fe294fabe3a562cf7d291951f1bd0da3e2b93 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 17:53:59 +0000 Subject: [PATCH 036/201] feat(typing): Generate annotations based on known datasets --- tools/datasets/__init__.py | 62 +++++++++++++++++ tools/datasets/_typing.py | 137 +++++++++++++++++++++++++++++++++++++ 2 files changed, 199 insertions(+) create mode 100644 tools/datasets/_typing.py diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index c9114aa01..bf5b7f187 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -16,9 +16,11 @@ import polars as pl +from tools.codemod import ruff from tools.datasets.github import GitHub from tools.datasets.models import QueryTree from tools.datasets.npm import Npm +from tools.schemapi import utils if TYPE_CHECKING: import sys @@ -37,10 +39,17 @@ else: from typing_extensions import TypeAlias + _PathAlias: TypeAlias = Literal["npm_tags", "gh_tags", "gh_trees"] + WorkInProgress: TypeAlias = Any __all__ = ["app", "data"] +HEADER_COMMENT = """\ +# The contents of this file are automatically written by +# tools/datasets.__init__.py. Do not modify directly. 
+""" + class Application: """ @@ -78,6 +87,14 @@ def github(self) -> GitHub: def npm(self) -> Npm: return self._npm + @property + def _aliases(self) -> dict[_PathAlias, Path]: + return { + "npm_tags": self.npm._paths["tags"], + "gh_tags": self.github._paths["tags"], + "gh_trees": self.github._paths["trees"], + } + def refresh(self) -> pl.DataFrame: npm_tags = self.npm.tags() self.write_parquet(npm_tags, self.npm._paths["tags"]) @@ -89,6 +106,21 @@ def refresh(self) -> pl.DataFrame: self.write_parquet(gh_trees, self.github._paths["trees"]) return gh_trees + def read(self, name: _PathAlias, /) -> pl.DataFrame: + """Read existing metadata from file.""" + return pl.read_parquet(self._from_alias(name)) + + def scan(self, name: _PathAlias, /) -> pl.LazyFrame: + """Scan existing metadata from file.""" + return pl.scan_parquet(self._from_alias(name)) + + def _from_alias(self, name: _PathAlias, /) -> Path: + if name not in {"npm_tags", "gh_tags", "gh_trees"}: + msg = f'Expected one of {["npm_tags", "gh_tags", "gh_trees"]!r}, but got: {name!r}' + raise TypeError(msg) + else: + return self._aliases[name] + def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: """Write ``frame`` to ``fp``, with some extra safety.""" if not fp.exists(): @@ -118,6 +150,36 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None """ +def generate_datasets_typing(application: Application, output: Path, /) -> None: + app = application + tags = app.scan("gh_tags").select("tag").collect().to_series() + names = ( + app.scan("gh_trees") + .filter("ext_supported") + .unique("name_js") + .select("name_js") + .sort("name_js") + .collect() + .to_series() + ) + NAME = "DatasetName" + TAG = "VersionTag" + EXT = "Extension" + contents = ( + f"{HEADER_COMMENT}", + "from __future__ import annotations\n", + "import sys", + "from typing import Literal, TYPE_CHECKING", + utils.import_typing_extensions((3, 10), "TypeAlias"), + "\n", + f"__all__ = {[NAME, TAG, EXT]}\n\n" + f"{NAME}: TypeAlias = {utils.spell_literal(names)}", + f"{TAG}: TypeAlias = {utils.spell_literal(tags)}", + f'{EXT}: TypeAlias = {utils.spell_literal([".csv", ".json", ".tsv", ".arrow"])}', + ) + ruff.write_lint_format(output, contents) + + def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: return suffix in {".csv", ".json", ".tsv", ".arrow"} diff --git a/tools/datasets/_typing.py b/tools/datasets/_typing.py new file mode 100644 index 000000000..9414aaab4 --- /dev/null +++ b/tools/datasets/_typing.py @@ -0,0 +1,137 @@ +# The contents of this file are automatically written by +# tools/datasets.__init__.py. Do not modify directly. 
+ +from __future__ import annotations + +import sys +from typing import Literal + +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + + +__all__ = ["DatasetName", "Extension", "VersionTag"] + +DatasetName: TypeAlias = Literal[ + "airports", + "annual-precip", + "anscombe", + "barley", + "birdstrikes", + "budget", + "budgets", + "burtin", + "cars", + "climate", + "co2-concentration", + "countries", + "crimea", + "disasters", + "driving", + "earthquakes", + "flare", + "flare-dependencies", + "flights-10k", + "flights-200k", + "flights-20k", + "flights-2k", + "flights-3m", + "flights-5k", + "flights-airport", + "football", + "gapminder", + "gapminder-health-income", + "github", + "global-temp", + "graticule", + "income", + "iowa-electricity", + "iris", + "jobs", + "la-riots", + "londonBoroughs", + "londonCentroids", + "londonTubeLines", + "lookup_groups", + "lookup_people", + "miserables", + "monarchs", + "movies", + "normal-2d", + "obesity", + "ohlc", + "penguins", + "platformer-terrain", + "points", + "political-contributions", + "population", + "population_engineers_hurricanes", + "seattle-temps", + "seattle-weather", + "seattle-weather-hourly-normals", + "sf-temps", + "sp500", + "sp500-2000", + "stocks", + "udistrict", + "unemployment", + "unemployment-across-industries", + "uniform-2d", + "us-10m", + "us-employment", + "us-state-capitals", + "volcano", + "weather", + "weball26", + "wheat", + "windvectors", + "world-110m", + "zipcodes", +] +VersionTag: TypeAlias = Literal[ + "v2.9.0", + "v2.8.1", + "v2.8.0", + "v2.7.0", + "v2.5.4", + "v2.5.3", + "v2.5.3-next.0", + "v2.5.2", + "v2.5.2-next.0", + "v2.5.1", + "v2.5.1-next.0", + "v2.5.0", + "v2.5.0-next.0", + "v2.4.0", + "v2.3.1", + "v2.3.0", + "v2.1.0", + "v2.0.0", + "v1.31.1", + "v1.31.0", + "v1.30.4", + "v1.30.3", + "v1.30.2", + "v1.30.1", + "v1.29.0", + "v1.24.0", + "v1.22.0", + "v1.21.1", + "v1.21.0", + "v1.20.0", + "v1.19.0", + "v1.18.0", + "v1.17.0", + "v1.16.0", + "v1.15.0", + "v1.14.0", + "v1.12.0", + "v1.11.0", + "v1.10.0", + "v1.8.0", + "v1.7.0", + "v1.5.0", +] +Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"] From 572d069842ea80c085db22cf90aee7286e5a4bfd Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 18:02:42 +0000 Subject: [PATCH 037/201] refactor(typing): Utilize `datasets._typing` --- tools/datasets/__init__.py | 28 ++++++++++++---------------- tools/datasets/github.py | 4 ++-- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index bf5b7f187..a92aeb2fc 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -11,7 +11,7 @@ import tempfile from functools import cached_property, partial from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, get_args +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal from urllib.request import urlopen import polars as pl @@ -38,6 +38,7 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias + from tools.datasets._typing import DatasetName, Extension, VersionTag _PathAlias: TypeAlias = Literal["npm_tags", "gh_tags", "gh_trees"] @@ -144,11 +145,6 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago _CURRENT_SOURCE_TAG = "v2.9.0" -ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"] -""" -- 
`'flights-200k.(arrow|json)'` key collison using stem -""" - def generate_datasets_typing(application: Application, output: Path, /) -> None: app = application @@ -180,7 +176,7 @@ def generate_datasets_typing(application: Application, output: Path, /) -> None: ruff.write_lint_format(output, contents) -def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: +def is_ext_supported(suffix: str) -> TypeIs[Extension]: return suffix in {".csv", ".json", ".tsv", ".arrow"} @@ -193,7 +189,7 @@ def _js_to_py(s: str, /): class Dataset: - read_fn: ClassVar[dict[ExtSupported, Callable[..., pl.DataFrame]]] = { + read_fn: ClassVar[dict[Extension, Callable[..., pl.DataFrame]]] = { ".csv": pl.read_csv, ".json": pl.read_json, ".tsv": partial(pl.read_csv, separator="\t"), @@ -205,7 +201,7 @@ def __init__(self, name: str, /, base_url: str) -> None: file_name = DATASETS_JSON[_py_to_js(name)]["filename"] suffix = Path(file_name).suffix if is_ext_supported(suffix): - self.extension: ExtSupported = suffix + self.extension: Extension = suffix else: raise NotImplementedError(suffix, file_name) @@ -348,17 +344,17 @@ def __dir__(self) -> list[str]: def url( self, - name: str, - ext: ExtSupported | None = None, + name: DatasetName | LiteralString, + ext: Extension | None = None, /, - tag: LiteralString | Literal["latest"] | None = None, + tag: VersionTag | Literal["latest"] | None = None, ) -> str: constraints: dict[Literal["tag", "suffix"], str] = {} if tag == "latest": raise NotImplementedError(tag) elif tag is not None: constraints["tag"] = tag - if name.endswith(get_args(ExtSupported)): + if name.endswith((".csv", ".json", ".tsv", ".arrow")): name, suffix = name.rsplit(".", maxsplit=1) suffix = "." + suffix if not is_ext_supported(suffix): @@ -375,10 +371,10 @@ def url( def __call__( self, - name: str, - ext: ExtSupported | None = None, + name: DatasetName | LiteralString, + ext: Extension | None = None, /, - tag: LiteralString | Literal["latest"] | None = None, + tag: VersionTag | Literal["latest"] | None = None, ) -> WorkInProgress: """ **WIP** Will be using this *instead of* attribute access. 
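As a usage sketch of the typed signature introduced here (the tag is one entry from `VersionTag`, and both spellings should resolve to the same jsDelivr address):

    from tools.datasets import data

    data.url("cars", ".json", tag="v2.9.0")
    data.url("cars.json", tag="v2.9.0")
    # -> "https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json"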
diff --git a/tools/datasets/github.py b/tools/datasets/github.py index e245b91b1..fc0a899f2 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -31,7 +31,7 @@ from email.message import Message from urllib.request import OpenerDirector, Request - from tools.datasets import ExtSupported + from tools.datasets._typing import Extension from tools.datasets.models import ReParsedTag from tools.schemapi.utils import OneOrSeq @@ -62,7 +62,7 @@ _SUB_DIR = "data" -def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: +def is_ext_supported(suffix: str) -> TypeIs[Extension]: return suffix in {".csv", ".json", ".tsv", ".arrow"} From 07dcc0baaf955d10c65b68c65165c86bc2cb9ddb Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 20:16:07 +0000 Subject: [PATCH 038/201] feat: Adds `Npm.dataset` for remote reading] --- tools/datasets/__init__.py | 5 ++-- tools/datasets/npm.py | 55 +++++++++++++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index a92aeb2fc..b1a5b8550 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -375,7 +375,8 @@ def __call__( ext: Extension | None = None, /, tag: VersionTag | Literal["latest"] | None = None, - ) -> WorkInProgress: + **kwds: Any, + ) -> pl.DataFrame: """ **WIP** Will be using this *instead of* attribute access. @@ -383,7 +384,7 @@ def __call__( - Will only be using the actual (js_name) - Some have hyphens, others underscores """ - return self.url(name, ext, tag=tag) + return app.npm.dataset(self.url(name, ext, tag=tag), **kwds) data = DataLoader() diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index bdc20f83b..589db4660 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -2,7 +2,9 @@ import json import urllib.request -from typing import TYPE_CHECKING, Literal +from functools import partial +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal import polars as pl @@ -11,20 +13,43 @@ if TYPE_CHECKING: import sys - from pathlib import Path + from urllib.request import OpenerDirector + if sys.version_info >= (3, 13): + from typing import TypeIs + else: + from typing_extensions import TypeIs if sys.version_info >= (3, 11): from typing import LiteralString else: from typing_extensions import LiteralString + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + from tools.datasets._typing import Extension from tools.datasets.models import NpmPackageMetadataResponse + ReadFn: TypeAlias = Callable[..., pl.DataFrame] + __all__ = ["Npm"] +def is_ext_supported(suffix: str) -> TypeIs[Extension]: + return suffix in {".csv", ".json", ".tsv", ".arrow"} + + class Npm: """https://www.jsdelivr.com/docs/data.jsdelivr.com#overview.""" + _read_fn: ClassVar[dict[Extension, ReadFn]] = { + ".csv": pl.read_csv, + ".json": pl.read_json, + ".tsv": partial(pl.read_csv, separator="\t"), + ".arrow": partial(pl.read_ipc, use_pyarrow=True), + } + _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() + def __init__( self, output_dir: Path, @@ -48,6 +73,30 @@ def __init__( def url(self) -> NpmUrl: return self._url + @classmethod + def reader_from(cls, url: str, /) -> ReadFn: + suffix = Path(url).suffix + if is_ext_supported(suffix): + return cls._read_fn[suffix] + else: + msg = f"Unexpected file extension {suffix!r}, from:\n{url}" + raise NotImplementedError(msg) + 
+ def dataset(self, url: str, /, **kwds: Any) -> pl.DataFrame: + """ + Fetch a remote dataset. + + Parameters + ---------- + url + Full path to a known dataset. + **kwds + Arguments passed to the underlying read function. + """ + fn = self.reader_from(url) + with self._opener.open(url) as f: + return fn(f.read(), **kwds) + def tags(self) -> pl.DataFrame: """ Request, parse tags from `Get package metadata`_. @@ -66,7 +115,7 @@ def tags(self) -> pl.DataFrame: req = urllib.request.Request( self.url.TAGS, headers={"Accept": "application/json"} ) - with urllib.request.urlopen(req) as response: + with self._opener.open(req) as response: content: NpmPackageMetadataResponse = json.load(response) versions = [ f"v{tag}" From d8f37918b130d7f89defcb6f1104268db1997420 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 20:24:38 +0000 Subject: [PATCH 039/201] refactor: Remove dead code --- tools/datasets/__init__.py | 173 ++----------------------------------- 1 file changed, 6 insertions(+), 167 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index b1a5b8550..ab1af8d4b 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -8,11 +8,8 @@ from __future__ import annotations import json -import tempfile -from functools import cached_property, partial from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal -from urllib.request import urlopen +from typing import TYPE_CHECKING, Any, Literal import polars as pl @@ -180,167 +177,9 @@ def is_ext_supported(suffix: str) -> TypeIs[Extension]: return suffix in {".csv", ".json", ".tsv", ".arrow"} -def _py_to_js(s: str, /): - return s.replace("_", "-") - - -def _js_to_py(s: str, /): - return s.replace("-", "_") - - -class Dataset: - read_fn: ClassVar[dict[Extension, Callable[..., pl.DataFrame]]] = { - ".csv": pl.read_csv, - ".json": pl.read_json, - ".tsv": partial(pl.read_csv, separator="\t"), - ".arrow": partial(pl.read_ipc, use_pyarrow=True), - } - - def __init__(self, name: str, /, base_url: str) -> None: - self.name: str = name - file_name = DATASETS_JSON[_py_to_js(name)]["filename"] - suffix = Path(file_name).suffix - if is_ext_supported(suffix): - self.extension: Extension = suffix - else: - raise NotImplementedError(suffix, file_name) - - self.url: str = f"{base_url}{file_name}" - - def __call__(self, **kwds: Any) -> pl.DataFrame: - fn = self.read_fn[self.extension] - with tempfile.NamedTemporaryFile() as tmp, urlopen(self.url) as f: - tmp.write(f.read()) - content = fn(tmp, **kwds) - return content - - def __repr__(self) -> str: - return ( - f"{type(self).__name__}(\n " - f"name={self.name!r},\n " - f"url={self.url!r}\n" - ")" - ) - - -DATASETS_JSON = { - # "7zip": {"filename": "7zip.png", "format": "png"}, - "airports": {"filename": "airports.csv", "format": "csv"}, - "annual-precip": {"filename": "annual-precip.json", "format": "json"}, - "anscombe": {"filename": "anscombe.json", "format": "json"}, - "barley": {"filename": "barley.json", "format": "json"}, - "birdstrikes": {"filename": "birdstrikes.json", "format": "json"}, - "budget": {"filename": "budget.json", "format": "json"}, - "budgets": {"filename": "budgets.json", "format": "json"}, - "burtin": {"filename": "burtin.json", "format": "json"}, - "cars": {"filename": "cars.json", "format": "json"}, - "climate": {"filename": "climate.json", "format": "json"}, - "co2-concentration": {"filename": "co2-concentration.csv", "format": "csv"}, - "countries": 
{"filename": "countries.json", "format": "json"}, - "crimea": {"filename": "crimea.json", "format": "json"}, - "disasters": {"filename": "disasters.csv", "format": "csv"}, - "driving": {"filename": "driving.json", "format": "json"}, - "earthquakes": {"filename": "earthquakes.json", "format": "json"}, - # "ffox": {"filename": "ffox.png", "format": "png"}, - "flare": {"filename": "flare.json", "format": "json"}, - "flare-dependencies": {"filename": "flare-dependencies.json", "format": "json"}, - "flights-10k": {"filename": "flights-10k.json", "format": "json"}, - "flights-200k": {"filename": "flights-200k.json", "format": "json"}, - "flights-20k": {"filename": "flights-20k.json", "format": "json"}, - "flights-2k": {"filename": "flights-2k.json", "format": "json"}, - "flights-3m": {"filename": "flights-3m.csv", "format": "csv"}, - "flights-5k": {"filename": "flights-5k.json", "format": "json"}, - "flights-airport": {"filename": "flights-airport.csv", "format": "csv"}, - "gapminder": {"filename": "gapminder.json", "format": "json"}, - "gapminder-health-income": { - "filename": "gapminder-health-income.csv", - "format": "csv", - }, - # "gimp": {"filename": "gimp.png", "format": "png"}, - "github": {"filename": "github.csv", "format": "csv"}, - "graticule": {"filename": "graticule.json", "format": "json"}, - "income": {"filename": "income.json", "format": "json"}, - "iowa-electricity": {"filename": "iowa-electricity.csv", "format": "csv"}, - "iris": {"filename": "iris.json", "format": "json"}, - "jobs": {"filename": "jobs.json", "format": "json"}, - "la-riots": {"filename": "la-riots.csv", "format": "csv"}, - "londonBoroughs": {"filename": "londonBoroughs.json", "format": "json"}, - "londonCentroids": {"filename": "londonCentroids.json", "format": "json"}, - "londonTubeLines": {"filename": "londonTubeLines.json", "format": "json"}, - "lookup_groups": {"filename": "lookup_groups.csv", "format": "csv"}, - "lookup_people": {"filename": "lookup_people.csv", "format": "csv"}, - "miserables": {"filename": "miserables.json", "format": "json"}, - "monarchs": {"filename": "monarchs.json", "format": "json"}, - "movies": {"filename": "movies.json", "format": "json"}, - "normal-2d": {"filename": "normal-2d.json", "format": "json"}, - "obesity": {"filename": "obesity.json", "format": "json"}, - "ohlc": {"filename": "ohlc.json", "format": "json"}, - "points": {"filename": "points.json", "format": "json"}, - "population": {"filename": "population.json", "format": "json"}, - "population_engineers_hurricanes": { - "filename": "population_engineers_hurricanes.csv", - "format": "csv", - }, - "seattle-temps": {"filename": "seattle-temps.csv", "format": "csv"}, - "seattle-weather": {"filename": "seattle-weather.csv", "format": "csv"}, - "sf-temps": {"filename": "sf-temps.csv", "format": "csv"}, - "sp500": {"filename": "sp500.csv", "format": "csv"}, - "stocks": {"filename": "stocks.csv", "format": "csv"}, - "udistrict": {"filename": "udistrict.json", "format": "json"}, - "unemployment": {"filename": "unemployment.tsv", "format": "tsv"}, - "unemployment-across-industries": { - "filename": "unemployment-across-industries.json", - "format": "json", - }, - "uniform-2d": {"filename": "uniform-2d.json", "format": "json"}, - "us-10m": {"filename": "us-10m.json", "format": "json"}, - "us-employment": {"filename": "us-employment.csv", "format": "csv"}, - "us-state-capitals": {"filename": "us-state-capitals.json", "format": "json"}, - "volcano": {"filename": "volcano.json", "format": "json"}, - "weather": {"filename": 
"weather.json", "format": "json"}, - "weball26": {"filename": "weball26.json", "format": "json"}, - "wheat": {"filename": "wheat.json", "format": "json"}, - "windvectors": {"filename": "windvectors.csv", "format": "csv"}, - "world-110m": {"filename": "world-110m.json", "format": "json"}, - "zipcodes": {"filename": "zipcodes.csv", "format": "csv"}, -} -"""Inlined `datasets.json`_. - -- Excluding images - -.. _datasets.json: - https://github.com/altair-viz/vega_datasets/blob/136e850447b49031f04baa137ce5c37a6678bbb1/vega_datasets/datasets.json -""" - - class DataLoader: - source_tag: ClassVar[str] = "v2.9.0" - _base_url_fmt: str = "https://cdn.jsdelivr.net/npm/vega-datasets@{0}/data/" - - @property - def base_url(self) -> str: - return self._base_url_fmt.format(self.source_tag) - - @cached_property - def _dataset_names(self) -> list[str]: - return sorted(DATASETS_JSON) - - @cached_property - def _py_js_names(self) -> dict[str, str]: - return {_js_to_py(name): name for name in self._dataset_names} - - def list_datasets(self) -> list[str]: - return list(self._py_js_names) - - def __getattr__(self, name: str) -> Dataset: - if name in self._py_js_names: - return Dataset(self._py_js_names[name], self.base_url) - else: - msg = f"No dataset named {name!r}" - raise AttributeError(msg) - - def __dir__(self) -> list[str]: - return self.list_datasets() + def __init__(self, application: Application, /) -> None: + self._app: Application = application def url( self, @@ -367,7 +206,7 @@ def url( else: constraints["suffix"] = ext q = QueryTree(name_js=name, **constraints) # type: ignore[typeddict-item] - return app.github.query.url_from(**q) + return self._app.github.query.url_from(**q) def __call__( self, @@ -384,7 +223,7 @@ def __call__( - Will only be using the actual (js_name) - Some have hyphens, others underscores """ - return app.npm.dataset(self.url(name, ext, tag=tag), **kwds) + return self._app.npm.dataset(self.url(name, ext, tag=tag), **kwds) -data = DataLoader() +data = DataLoader(app) From 4642a238971edea66b4bd5f5e3636a287de2db96 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 11:26:34 +0000 Subject: [PATCH 040/201] refactor: Replace `name_js`, `name_py` with `dataset_name` Since we're just using strings, there is no need for 2 forms of the name. 
The legacy package needed this for `__getattr__` access with valid identifiers --- tools/datasets/__init__.py | 9 +++++---- tools/datasets/_metadata/metadata-schema.json | 3 +-- tools/datasets/_metadata/metadata.parquet | Bin 20768 -> 19087 bytes tools/datasets/github.py | 5 ++--- tools/datasets/models.py | 6 ++---- 5 files changed, 10 insertions(+), 13 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index ab1af8d4b..8217ab355 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -146,12 +146,13 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None def generate_datasets_typing(application: Application, output: Path, /) -> None: app = application tags = app.scan("gh_tags").select("tag").collect().to_series() + DATASET_NAME = "dataset_name" names = ( app.scan("gh_trees") .filter("ext_supported") - .unique("name_js") - .select("name_js") - .sort("name_js") + .unique(DATASET_NAME) + .select(DATASET_NAME) + .sort(DATASET_NAME) .collect() .to_series() ) @@ -205,7 +206,7 @@ def url( raise TypeError(ext) else: constraints["suffix"] = ext - q = QueryTree(name_js=name, **constraints) # type: ignore[typeddict-item] + q = QueryTree(dataset_name=name, **constraints) # type: ignore[typeddict-item] return self._app.github.query.url_from(**q) def __call__( diff --git a/tools/datasets/_metadata/metadata-schema.json b/tools/datasets/_metadata/metadata-schema.json index 2b5b9d955..d3da3f86d 100644 --- a/tools/datasets/_metadata/metadata-schema.json +++ b/tools/datasets/_metadata/metadata-schema.json @@ -1,9 +1,8 @@ { + "dataset_name": "str", "ext_supported": "bool", "file_name": "str", "name_collision": "bool", - "name_js": "str", - "name_py": "str", "size": "int", "suffix": "str", "tag": "str", diff --git a/tools/datasets/_metadata/metadata.parquet b/tools/datasets/_metadata/metadata.parquet index 071e4bd6cf68fcc17952c5057858fa29399c9415..97f235546beb0c56abede1cb419eab4afb89dd9c 100644 GIT binary patch delta 1026 zcmY*YUr3Wt6u;lsHnmnR^8b=EMyZU$h`A zfmROA%D`J=t`dG@yEKL5`Eu|zSN?JNX#WC3(=VTHlHcJb>FApGa6Eq zGrwuvR9^%194BxfohY=+H@vN>CqIgW^atT1$IrB-A85QBKYEmGYenfjTEiCpi8tL@ zp4P}7OmOw2VfQ6+tw~N~%_U@^X)C$gWJ+&uzQ+we&x6ZK1OQNLQKlHC_^kiB3_jbO z^~X){Gne(J>)}*>)}OP(WqH=W>V_{?au9*rbX5lk0)UPfXXsn zpgmBnHK7c&X#pS}3nq^($Mw_VGo*z|bv00^)Qq~uT;DF04XGB+^H4x@e>hkJ#P@3L z*kq|LQ@q+S0MJi~u}B#^fjC)3{MH?NGaANF4xQ=^#zS2RLom@VsL*{0{7S(FAf}FxM>d@})5eaJ z8IprZL1FOs^~DEhnb#6Cb%uvVAb#>_$x?~>-zOBbK&pt0ON#Qxw;rJLh;hS%Bt8b@ZW$xoG?sM^yHryd}k6 zwyKYjJ@x~}ZcJA<4r5$}n=rP8%@~`V*hwwh+o^*at&HSih61F)(U@;xe#m1>c#PEp z0Vg@=z=f@Au~2WQB~s}Pla$@O5f4zn;)?hN7){^8Xu&FX%omHAFg}!EY7VntBQw}5 zS*MkF9XfrTgMt-OgbI@=HYS2nsXAK67_IS9jZ~DHcY0}#UXW@9V$DgPyY6(rIGFHz z+Is^=6S1~87O?x8WB!i8C{-Q_bR1ik+RkeFJdp}dReNR7V(t(2xS~EBUQVU3F#n`@5T3{Awh1N1REd`OEQlui%ZB&u? z6QkH&WGMci5F`ugW>uz9r*rWGigV@t3stg%oN6PLw) zmPN0Nt}o`=d!Fud7Isks7oQGgZJ3diKPswu`25L|`B#0^;&ab#JnB5#(z4SO`Rv0A zxuE^iEsUlMU5Uo{*4wM5xUO{!an}bgJrRCR$dPqqi~GJdtW`TRyIyP#@^*>0G)-%` zb!4rypk<+Dq;GJ?5$V2&lN{w)mCbiXYhh2)opg*?yvr`SK4cqdRvtTXN5VNjv;Cu? 
zZiY9x@y~&F+4O*}*5-&6GFKhghWXhW*Jh6T?KW~*mrmFJ@;m+O#-{Hhip1vr`&a)y zc)xa7Tm86Zl%FIz96CAS(6TFgr;jYNQidBF!a{X{t(`k>$Hq*sYwr%q&4;dA_r)nM zhIFnY`~z=g7Yo<*`S)bpY#2N|swt^Nr(fV*oH%ZB4g?u+Vo$O2eM&Kb=q+* z4dFm=BzmFJ*S{I|v>yv%kfhxWBlQEoCiv$hQp2mj;YrIsRs}Zp?-#F8v|Z1fEAb~ z4f}tiS(GV>E;qMkHvxMKqw2p2gWoJZo9f;~1ffRba%in~fqstFv^?Kz_ZGtLQYE&YHi_@-$NMa_G zWJ69Q1&Ki`B^mHiCq-pJOyYJRU5Eh{LJb8-EKHVBmT%66#w$&#gyp$<-7-^Z@mMyb z)8jLis>iZbA?It^R4`i ParsedTree: path = Path(tree["path"]) return ParsedTree( file_name=path.name, - name_js=path.stem, - name_py=path.stem.replace("-", "_"), + dataset_name=path.stem, suffix=path.suffix, size=tree["size"], url=tree["url"], @@ -361,7 +360,7 @@ def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: pl.DataFrame(parsed) .lazy() .rename({"url": "url_github"}) - .with_columns(name_collision=pl.col("name_py").is_duplicated()) + .with_columns(name_collision=pl.col("dataset_name").is_duplicated()) .with_columns( url_npm=pl.concat_str( pl.lit(_NPM_BASE_URL), diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 5a6598fed..0271d09de 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -112,8 +112,7 @@ class NpmPackageMetadataResponse(TypedDict): class ParsedTree(TypedDict): file_name: str - name_js: str - name_py: str + dataset_name: str suffix: str size: int url: str @@ -123,8 +122,7 @@ class ParsedTree(TypedDict): class QueryTree(TypedDict, total=False): file_name: str - name_js: Required[str] - name_py: str + dataset_name: Required[str] suffix: str size: int url: str From 65f87fc2e99b49b781844993a6e45489ed648a65 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 11:28:20 +0000 Subject: [PATCH 041/201] fix: Remove invalid `semver.sort` op I think this was added in error, since the schema of the file never had `semver` columns Only noticed the bug when doing a full rebuild --- tools/datasets/github.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 33d7289af..9b6671646 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -397,7 +397,7 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: f"Finished collection.\n" f"Writing {fresh_rows.height} new rows to {fp!s}" ) - return pl.concat((trees, fresh_rows)).pipe(semver.sort) + return pl.concat((trees, fresh_rows)) def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: limit = self.rate_limit(strict=True) From 6349b0f255fab9df3173b5b75c660056317dfe82 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 13:08:04 +0000 Subject: [PATCH 042/201] fix: Add missing init path for `refresh_trees` --- tools/datasets/github.py | 81 ++++++++++++++++++++++++++++++---------- 1 file changed, 61 insertions(+), 20 deletions(-) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 9b6671646..cb9d74751 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -3,13 +3,14 @@ import json import os import random +import sys import time import urllib.request import warnings -from collections.abc import Iterable, Iterator, Sequence +from collections.abc import Iterable, Iterator, Mapping, Sequence from itertools import islice from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, ClassVar, Literal, cast +from typing import IO, TYPE_CHECKING, Any, ClassVar, Literal, TypeVar, cast import polars as pl @@ -23,16 +24,20 @@ 
ParsedRateLimit, ParsedTag, ParsedTree, + ReParsedTag, ) +if sys.version_info >= (3, 13): + from typing import is_typeddict +else: + from typing_extensions import is_typeddict + if TYPE_CHECKING: - import sys from collections.abc import MutableMapping from email.message import Message from urllib.request import OpenerDirector, Request from tools.datasets._typing import Extension - from tools.datasets.models import ReParsedTag from tools.schemapi.utils import OneOrSeq if sys.version_info >= (3, 13): @@ -50,8 +55,11 @@ _PathName: TypeAlias = Literal["dir", "tags", "trees"] + __all__ = ["GitHub"] +_TD = TypeVar("_TD", bound=Mapping[str, Any]) + _ItemSlice: TypeAlias = ( "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" ) @@ -379,25 +387,27 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: Aims to stay well-within API rate limits, both for authenticated ad unauthenticated users. """ + if gh_tags.is_empty(): + msg = f"Expected rows present in `gh_tags`, but got:\n{gh_tags!r}" + raise NotImplementedError(msg) rate_limit = self.rate_limit(strict=True) + stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT fp = self._paths["trees"] - trees = pl.read_parquet(fp) - missing_trees = gh_tags.join( - trees.select(pl.col("tag").unique()), on="tag", how="anti" - ) - if missing_trees.is_empty(): - print(f"Already up-to-date {fp!s}") - return trees + TP = ReParsedTag + if not fp.exists(): + print(f"Initializing {fp!s}") + return self._trees_batched(_iter_rows(gh_tags, stop, TP)) else: - stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT - it = islice(missing_trees.iter_rows(named=True), stop) - missing = cast("Iterator[ReParsedTag]", it) - fresh_rows = self._trees_batched(missing) - print( - f"Finished collection.\n" - f"Writing {fresh_rows.height} new rows to {fp!s}" + trees = pl.read_parquet(fp) + missing_trees = gh_tags.join( + trees.select(pl.col("tag").unique()), on="tag", how="anti" ) - return pl.concat((trees, fresh_rows)) + if missing_trees.is_empty(): + print(f"Already up-to-date {fp!s}") + return trees + else: + fresh = self._trees_batched(_iter_rows(missing_trees, stop, TP)) + return pl.concat((trees, fresh)) def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: limit = self.rate_limit(strict=True) @@ -451,4 +461,35 @@ def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: for tag in tags: time.sleep(delay_secs + random.triangular()) dfs.append(self.trees(tag)) - return pl.concat(dfs) + df = pl.concat(dfs) + print(f"Finished collection.\n" f"Found {df.height} new rows") + return df + + +def _iter_rows(df: pl.DataFrame, stop: int | None, /, tp: type[_TD]) -> Iterator[_TD]: + """ + Wraps `pl.DataFrame.iter_rows`_ with typing to preserve key completions. + + Parameters + ---------- + df + Target dataframe. + stop + Passed to `itertools.islice`_. + tp + Static type representing a row/record. + + .. note:: + Performs a **very basic** runtime check on the type of ``tp`` (*not* ``df``). + + Primarily used to override ``dict[str, Any]`` when a *narrower* type is known. + + .. _itertools.islice: + https://docs.python.org/3/library/itertools.html#itertools.islice + .. 
_pl.DataFrame.iter_rows: + https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.iter_rows.html + """ + if not TYPE_CHECKING: + assert is_typeddict(tp) or issubclass(tp, Mapping) + + return cast(Iterator[_TD], islice(df.iter_rows(named=True), stop)) From f1d610c528e81c12381114b2fafea13d53267bab Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 15:41:54 +0000 Subject: [PATCH 043/201] refactor: Move public interface to `_io` Temporary home, see module docstring --- tools/datasets/__init__.py | 47 ++-------- tools/datasets/_io.py | 178 +++++++++++++++++++++++++++++++++++++ tools/datasets/github.py | 42 --------- tools/datasets/models.py | 10 --- tools/datasets/npm.py | 49 +--------- 5 files changed, 188 insertions(+), 138 deletions(-) create mode 100644 tools/datasets/_io.py diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 8217ab355..3adc2321b 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -14,8 +14,8 @@ import polars as pl from tools.codemod import ruff +from tools.datasets._io import Reader from tools.datasets.github import GitHub -from tools.datasets.models import QueryTree from tools.datasets.npm import Npm from tools.schemapi import utils @@ -23,10 +23,6 @@ import sys from collections.abc import Mapping - if sys.version_info >= (3, 13): - from typing import TypeIs - else: - from typing_extensions import TypeIs if sys.version_info >= (3, 11): from typing import LiteralString else: @@ -174,13 +170,9 @@ def generate_datasets_typing(application: Application, output: Path, /) -> None: ruff.write_lint_format(output, contents) -def is_ext_supported(suffix: str) -> TypeIs[Extension]: - return suffix in {".csv", ".json", ".tsv", ".arrow"} - - class DataLoader: - def __init__(self, application: Application, /) -> None: - self._app: Application = application + def __init__(self, metadata: Path, /) -> None: + self._reader = Reader(metadata) def url( self, @@ -189,25 +181,8 @@ def url( /, tag: VersionTag | Literal["latest"] | None = None, ) -> str: - constraints: dict[Literal["tag", "suffix"], str] = {} - if tag == "latest": - raise NotImplementedError(tag) - elif tag is not None: - constraints["tag"] = tag - if name.endswith((".csv", ".json", ".tsv", ".arrow")): - name, suffix = name.rsplit(".", maxsplit=1) - suffix = "." + suffix - if not is_ext_supported(suffix): - raise TypeError(suffix) - else: - constraints["suffix"] = suffix - elif ext is not None: - if not is_ext_supported(ext): - raise TypeError(ext) - else: - constraints["suffix"] = ext - q = QueryTree(dataset_name=name, **constraints) # type: ignore[typeddict-item] - return self._app.github.query.url_from(**q) + """Return the address of a remote dataset.""" + return self._reader.url(name, ext, tag=tag) def __call__( self, @@ -217,14 +192,8 @@ def __call__( tag: VersionTag | Literal["latest"] | None = None, **kwds: Any, ) -> pl.DataFrame: - """ - **WIP** Will be using this *instead of* attribute access. 
- - - Original supports this as well - - Will only be using the actual (js_name) - - Some have hyphens, others underscores - """ - return self._app.npm.dataset(self.url(name, ext, tag=tag), **kwds) + """Get a remote dataset and load as tabular data.""" + return self._reader.dataset(self.url(name, ext, tag=tag), **kwds) -data = DataLoader(app) +data = DataLoader(app._from_alias("gh_trees")) diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py new file mode 100644 index 000000000..4a6dce431 --- /dev/null +++ b/tools/datasets/_io.py @@ -0,0 +1,178 @@ +""" +Will be part of the public ``alt.datasets`` subpackage. + +- Interfacing with the cached metadata. + - But not updating it +- Performing requests from those urls +- Dispatching read function on file extension + +Note +---- +- Building with ``polars`` first, then will work backwards with ``narwhals``. + - Since ``narwhals`` is a subset of ``polars`` +""" + +from __future__ import annotations + +import urllib.request +from functools import partial +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypeVar + +import polars as pl + +if TYPE_CHECKING: + import sys + from urllib.request import OpenerDirector + + from _typeshed import StrPath + + if sys.version_info >= (3, 13): + from typing import TypeIs + else: + from typing_extensions import TypeIs + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + from narwhals import typing as nw_typing # noqa: F401 + + from tools.datasets._typing import DatasetName, Extension, VersionTag + from tools.schemapi.utils import OneOrSeq + + _ExtensionScan: TypeAlias = Literal[".parquet"] + + ReadFn: TypeAlias = Callable[..., pl.DataFrame] + ScanFn: TypeAlias = Callable[..., pl.LazyFrame] + _T = TypeVar("_T") + +__all__ = ["Reader"] + +_ItemSlice: TypeAlias = ( + "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" +) +"""Query result scalar selection.""" + + +class Reader: + _read_fn: ClassVar[dict[Extension, ReadFn]] = { + ".csv": pl.read_csv, + ".json": pl.read_json, + ".tsv": partial(pl.read_csv, separator="\t"), + ".arrow": partial(pl.read_ipc, use_pyarrow=True), + } + _scan_fn: ClassVar[dict[_ExtensionScan, ScanFn]] = {".parquet": pl.scan_parquet} + _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() + + def __init__(self, fp_trees: Path, /) -> None: + self._fp_trees: Path = fp_trees + + @classmethod + def reader_from(cls, source: StrPath, /) -> ReadFn: + suffix = validate_suffix(source, is_ext_supported) + return cls._read_fn[suffix] + + @classmethod + def scanner_from(cls, source: StrPath, /) -> ScanFn: + suffix = validate_suffix(source, is_ext_scan) + return cls._scan_fn[suffix] + + def url( + self, + name: DatasetName | LiteralString, + ext: Extension | None = None, + /, + tag: VersionTag | Literal["latest"] | None = None, + ) -> str: + constraints: dict[str, str] = {} + if tag == "latest": + raise NotImplementedError(tag) + elif tag is not None: + constraints["tag"] = tag + # NOTE: Probably need to remove/move this + if name.endswith((".csv", ".json", ".tsv", ".arrow")): + name, suffix = name.rsplit(".", maxsplit=1) + suffix = "." 
+ suffix + if not is_ext_supported(suffix): + raise TypeError(suffix) + else: + constraints["suffix"] = suffix + elif ext is not None: + if not is_ext_supported(ext): + raise TypeError(ext) + else: + constraints["suffix"] = ext + return self._url_from(item=(0, "url_npm"), dataset_name=name, **constraints) + + def _url_from( + self, + *predicates: OneOrSeq[str | pl.Expr], + item: _ItemSlice = (0, "url_npm"), + **constraints: Any, + ) -> str: + r""" + Querying multi-version trees metadata for `npm` url to fetch. + + Parameters + ---------- + \*predicates, \*\*constraints + Passed directly to `pl.LazyFrame.filter`_. + item + Scalar selection args for `pl.DataFrame.item`_. + + .. _pl.LazyFrame.filter: + https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html + .. _pl.DataFrame.item: + https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.item.html + """ + source = self._fp_trees + fn = self.scanner_from(self._fp_trees) + results = fn(source).filter(*predicates, **constraints).collect() + if not results.is_empty(): + url = results.item(*item) + if isinstance(url, str): + return url + else: + msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." + raise TypeError(msg) + else: + terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) + msg = f"Found no results for:\n{terms}" + raise NotImplementedError(msg) + + def dataset(self, url: str, /, **kwds: Any) -> pl.DataFrame: + """ + Fetch a remote dataset. + + Parameters + ---------- + url + Full path to a known dataset. + **kwds + Arguments passed to the underlying read function. + """ + fn = self.reader_from(url) + with self._opener.open(url) as f: + return fn(f.read(), **kwds) + + +def validate_suffix(source: StrPath, guard: Callable[..., TypeIs[_T]], /) -> _T: + suffix: Any = Path(source).suffix + if guard(suffix): + return suffix + else: + msg = f"Unexpected file extension {suffix!r}, from:\n{source}" + raise TypeError(msg) + + +def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]: + return suffix == ".parquet" + + +def is_ext_supported(suffix: Any) -> TypeIs[Extension]: + return suffix in {".csv", ".json", ".tsv", ".arrow"} diff --git a/tools/datasets/github.py b/tools/datasets/github.py index cb9d74751..951221765 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -38,7 +38,6 @@ from urllib.request import OpenerDirector, Request from tools.datasets._typing import Extension - from tools.schemapi.utils import OneOrSeq if sys.version_info >= (3, 13): from typing import TypeIs @@ -60,10 +59,6 @@ _TD = TypeVar("_TD", bound=Mapping[str, Any]) -_ItemSlice: TypeAlias = ( - "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" -) -"""Query result scalar selection.""" # TODO: Work on where these should live/be accessed _NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" @@ -253,38 +248,6 @@ def tag_from_str(self, s: str, /) -> str: raise TypeError(s) -class _GitHubQueryNamespace: - """**WIP** Interfacing with the cached metadata.""" - - def __init__(self, gh: GitHub, /) -> None: - self._gh = gh - - @property - def paths(self) -> dict[_PathName, Path]: - return self._gh._paths - - def url_from( - self, - *predicates: OneOrSeq[str | pl.Expr], - item: _ItemSlice = (0, "url_npm"), - **constraints: Any, - ) -> str: - """Querying multi-version trees metadata for `npm` url to fetch.""" - fp = self.paths["trees"] - if fp.suffix != ".parquet": - raise NotImplementedError(fp.suffix) - items = pl.scan_parquet(fp).filter(*predicates, 
**constraints).collect() - if items.is_empty(): - msg = f"Found no results for:\n" f"{predicates!r}\n{constraints!r}" - raise NotImplementedError(msg) - r = items.item(*item) - if _is_str(r): - return r - else: - msg = f"Expected 'str' but got {type(r).__name__!r} from {r!r}." - raise TypeError(msg) - - class GitHub: """ Primary interface with the GitHub API. @@ -294,7 +257,6 @@ class GitHub: - Uses `tags`_, `trees`_, `rate_limit`_ endpoints. - Organizes distinct groups of operations into property accessor namespaces. - .. _tags: https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags .. _trees: @@ -339,10 +301,6 @@ def req(self) -> _GitHubRequestNamespace: def parse(self) -> _GitHubParseNamespace: return _GitHubParseNamespace(self) - @property - def query(self) -> _GitHubQueryNamespace: - return _GitHubQueryNamespace(self) - @property def url(self) -> GitHubUrl: return self._url diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 0271d09de..6ea7992ae 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -120,16 +120,6 @@ class ParsedTree(TypedDict): tag: str -class QueryTree(TypedDict, total=False): - file_name: str - dataset_name: Required[str] - suffix: str - size: int - url: str - ext_supported: bool - tag: str - - class ParsedTreesResponse(TypedDict): tag: str url: str diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index 589db4660..a5f068082 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -2,9 +2,7 @@ import json import urllib.request -from functools import partial -from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal +from typing import TYPE_CHECKING, ClassVar, Literal import polars as pl @@ -13,41 +11,22 @@ if TYPE_CHECKING: import sys + from pathlib import Path from urllib.request import OpenerDirector - if sys.version_info >= (3, 13): - from typing import TypeIs - else: - from typing_extensions import TypeIs if sys.version_info >= (3, 11): from typing import LiteralString else: from typing_extensions import LiteralString - if sys.version_info >= (3, 10): - from typing import TypeAlias - else: - from typing_extensions import TypeAlias - from tools.datasets._typing import Extension from tools.datasets.models import NpmPackageMetadataResponse - ReadFn: TypeAlias = Callable[..., pl.DataFrame] __all__ = ["Npm"] -def is_ext_supported(suffix: str) -> TypeIs[Extension]: - return suffix in {".csv", ".json", ".tsv", ".arrow"} - - class Npm: """https://www.jsdelivr.com/docs/data.jsdelivr.com#overview.""" - _read_fn: ClassVar[dict[Extension, ReadFn]] = { - ".csv": pl.read_csv, - ".json": pl.read_json, - ".tsv": partial(pl.read_csv, separator="\t"), - ".arrow": partial(pl.read_ipc, use_pyarrow=True), - } _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() def __init__( @@ -73,30 +52,6 @@ def __init__( def url(self) -> NpmUrl: return self._url - @classmethod - def reader_from(cls, url: str, /) -> ReadFn: - suffix = Path(url).suffix - if is_ext_supported(suffix): - return cls._read_fn[suffix] - else: - msg = f"Unexpected file extension {suffix!r}, from:\n{url}" - raise NotImplementedError(msg) - - def dataset(self, url: str, /, **kwds: Any) -> pl.DataFrame: - """ - Fetch a remote dataset. - - Parameters - ---------- - url - Full path to a known dataset. - **kwds - Arguments passed to the underlying read function. 
- """ - fn = self.reader_from(url) - with self._opener.open(url) as f: - return fn(f.read(), **kwds) - def tags(self) -> pl.DataFrame: """ Request, parse tags from `Get package metadata`_. From c4ef112e0d21872807126c51a62cd144d535dccc Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 15:43:16 +0000 Subject: [PATCH 044/201] refactor(perf): Don't recreate path mapping on every attribute access --- tools/datasets/__init__.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 3adc2321b..47575278c 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -8,6 +8,7 @@ from __future__ import annotations import json +import types from pathlib import Path from typing import TYPE_CHECKING, Any, Literal @@ -72,6 +73,13 @@ def __init__( output_dir, name_tags=tags_gh, name_trees=trees_gh, **kwds_gh ) self._npm: Npm = Npm(output_dir, name_tags=tags_npm, **kwds_npm) + self._paths = types.MappingProxyType["_PathAlias", Path]( + { + "npm_tags": self.npm._paths["tags"], + "gh_tags": self.github._paths["tags"], + "gh_trees": self.github._paths["trees"], + } + ) @property def github(self) -> GitHub: @@ -81,23 +89,15 @@ def github(self) -> GitHub: def npm(self) -> Npm: return self._npm - @property - def _aliases(self) -> dict[_PathAlias, Path]: - return { - "npm_tags": self.npm._paths["tags"], - "gh_tags": self.github._paths["tags"], - "gh_trees": self.github._paths["trees"], - } - def refresh(self) -> pl.DataFrame: npm_tags = self.npm.tags() - self.write_parquet(npm_tags, self.npm._paths["tags"]) + self.write_parquet(npm_tags, self._paths["npm_tags"]) gh_tags = self.github.refresh_tags(npm_tags) - self.write_parquet(gh_tags, self.github._paths["tags"]) + self.write_parquet(gh_tags, self._paths["gh_tags"]) gh_trees = self.github.refresh_trees(gh_tags) - self.write_parquet(gh_trees, self.github._paths["trees"]) + self.write_parquet(gh_trees, self._paths["gh_trees"]) return gh_trees def read(self, name: _PathAlias, /) -> pl.DataFrame: @@ -113,7 +113,7 @@ def _from_alias(self, name: _PathAlias, /) -> Path: msg = f'Expected one of {["npm_tags", "gh_tags", "gh_trees"]!r}, but got: {name!r}' raise TypeError(msg) else: - return self._aliases[name] + return self._paths[name] def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: """Write ``frame`` to ``fp``, with some extra safety.""" From eb876ebc945776b2f7524ad6e7774347dd7d45ac Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 16:58:30 +0000 Subject: [PATCH 045/201] refactor: Split `Reader._url_from` into `url`, `_query` - Much more generic now in what it can be used for - For the caching, I'll need more columns than just `"url_npm"` - `"url_github" contains a hash --- tools/datasets/_io.py | 89 ++++++++++++++++++++++------------------ tools/datasets/models.py | 14 +++++++ 2 files changed, 62 insertions(+), 41 deletions(-) diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index 4a6dce431..812a9eeb0 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -28,14 +28,13 @@ from _typeshed import StrPath if sys.version_info >= (3, 13): - from typing import TypeIs + from typing import TypeIs, Unpack else: - from typing_extensions import TypeIs + from typing_extensions import TypeIs, Unpack if sys.version_info >= (3, 11): from typing import LiteralString else: from 
typing_extensions import LiteralString - if sys.version_info >= (3, 10): from typing import TypeAlias else: @@ -43,6 +42,7 @@ from narwhals import typing as nw_typing # noqa: F401 from tools.datasets._typing import DatasetName, Extension, VersionTag + from tools.datasets.models import Metadata from tools.schemapi.utils import OneOrSeq _ExtensionScan: TypeAlias = Literal[".parquet"] @@ -56,7 +56,12 @@ _ItemSlice: TypeAlias = ( "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" ) -"""Query result scalar selection.""" +""" +Scalar selection args for `pl.DataFrame.item`_. + +.. _pl.DataFrame.item: + https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.item.html +""" class Reader: @@ -89,57 +94,34 @@ def url( /, tag: VersionTag | Literal["latest"] | None = None, ) -> str: - constraints: dict[str, str] = {} - if tag == "latest": - raise NotImplementedError(tag) - elif tag is not None: - constraints["tag"] = tag - # NOTE: Probably need to remove/move this - if name.endswith((".csv", ".json", ".tsv", ".arrow")): - name, suffix = name.rsplit(".", maxsplit=1) - suffix = "." + suffix - if not is_ext_supported(suffix): - raise TypeError(suffix) - else: - constraints["suffix"] = suffix - elif ext is not None: - if not is_ext_supported(ext): - raise TypeError(ext) - else: - constraints["suffix"] = ext - return self._url_from(item=(0, "url_npm"), dataset_name=name, **constraints) - - def _url_from( - self, - *predicates: OneOrSeq[str | pl.Expr], - item: _ItemSlice = (0, "url_npm"), - **constraints: Any, - ) -> str: + df = self._query(**validate_constraints(name, ext, tag)) + item: _ItemSlice = (0, "url_npm") + url = df.item(*item) + if isinstance(url, str): + return url + else: + msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." + raise TypeError(msg) + + def _query( + self, *predicates: OneOrSeq[str | pl.Expr], **constraints: Unpack[Metadata] + ) -> pl.DataFrame: r""" - Querying multi-version trees metadata for `npm` url to fetch. + Query multi-version trees metadata. Parameters ---------- \*predicates, \*\*constraints Passed directly to `pl.LazyFrame.filter`_. - item - Scalar selection args for `pl.DataFrame.item`_. .. _pl.LazyFrame.filter: https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html - .. _pl.DataFrame.item: - https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.item.html """ source = self._fp_trees fn = self.scanner_from(self._fp_trees) results = fn(source).filter(*predicates, **constraints).collect() if not results.is_empty(): - url = results.item(*item) - if isinstance(url, str): - return url - else: - msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." 
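A small sketch of what `validate_constraints` (added further down in this diff) is expected to hand to `_query`; the argument values are illustrative only:

    validate_constraints("cars.json", None, "v2.9.0")
    # -> {"tag": "v2.9.0", "dataset_name": "cars", "suffix": ".json"}
    validate_constraints("cars", ".json", None)
    # -> {"suffix": ".json", "dataset_name": "cars"}
    validate_constraints("cars", None, "latest")
    # -> raises NotImplementedError (not supported yet)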
- raise TypeError(msg) + return results else: terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) msg = f"Found no results for:\n{terms}" @@ -161,6 +143,31 @@ def dataset(self, url: str, /, **kwds: Any) -> pl.DataFrame: return fn(f.read(), **kwds) +def validate_constraints( + name: DatasetName | LiteralString, + ext: Extension | None, + tag: VersionTag | Literal["latest"] | None, + /, +) -> Metadata: + constraints: Metadata = {} + if tag == "latest": + raise NotImplementedError(tag) + elif tag is not None: + constraints["tag"] = tag + if name.endswith((".csv", ".json", ".tsv", ".arrow")): + fp = Path(name) + constraints["dataset_name"] = fp.stem + constraints["suffix"] = fp.suffix + return constraints + elif ext is not None: + if not is_ext_supported(ext): + raise TypeError(ext) + else: + constraints["suffix"] = ext + constraints["dataset_name"] = name + return constraints + + def validate_suffix(source: StrPath, guard: Callable[..., TypeIs[_T]], /) -> _T: suffix: Any = Path(source).suffix if guard(suffix): diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 6ea7992ae..fa0972035 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -126,6 +126,20 @@ class ParsedTreesResponse(TypedDict): tree: list[ParsedTree] +class Metadata(TypedDict, total=False): + """Full schema for `metadata.parquet`.""" + + dataset_name: str + ext_supported: bool + file_name: str + name_collision: bool + size: int + suffix: str + tag: str + url_github: str + url_npm: str + + class GitHubRateLimit(TypedDict): limit: int used: int From 661a3851034c39c1c8249a7426ae33821f802f14 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 17:01:41 +0000 Subject: [PATCH 046/201] feat(DRAFT): Adds `GitHubUrl.BLOBS` - Common prefix to all rows in `metadata[url_github]` - Stripping this leaves only `sha` - For **2800** rows, there are only **109** unique hashes, so these can be used to reduce cache size --- tools/datasets/github.py | 1 + tools/datasets/models.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 951221765..4f15140e3 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -287,6 +287,7 @@ def __init__( repo = f"{base_url}repos/{org}/{package}/" self._url = GitHubUrl( BASE=base_url, + BLOBS=f"{repo}git/blobs/", RATE=f"{base_url}rate_limit", REPO=repo, TAGS=f"{repo}tags", diff --git a/tools/datasets/models.py b/tools/datasets/models.py index fa0972035..2bca343aa 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -21,6 +21,7 @@ class GitHubUrl(NamedTuple): BASE: LiteralString + BLOBS: LiteralString RATE: LiteralString REPO: LiteralString TAGS: LiteralString From 22dcb17868246c0d79796e3e65c1419442c11c61 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 18:31:36 +0000 Subject: [PATCH 047/201] feat: Store `sha` instead of `github_url` Related 661a3851034c39c1c8249a7426ae33821f802f14 --- tools/datasets/_io.py | 13 +------------ tools/datasets/_metadata/metadata-schema.json | 2 +- tools/datasets/_metadata/metadata.parquet | Bin 19087 -> 18495 bytes tools/datasets/github.py | 3 +-- tools/datasets/models.py | 4 ++-- 5 files changed, 5 insertions(+), 17 deletions(-) diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index 812a9eeb0..e27bbcb7a 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -53,16 +53,6 @@ __all__ = ["Reader"] 
-_ItemSlice: TypeAlias = ( - "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" -) -""" -Scalar selection args for `pl.DataFrame.item`_. - -.. _pl.DataFrame.item: - https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.item.html -""" - class Reader: _read_fn: ClassVar[dict[Extension, ReadFn]] = { @@ -95,8 +85,7 @@ def url( tag: VersionTag | Literal["latest"] | None = None, ) -> str: df = self._query(**validate_constraints(name, ext, tag)) - item: _ItemSlice = (0, "url_npm") - url = df.item(*item) + url = df.item(0, "url_npm") if isinstance(url, str): return url else: diff --git a/tools/datasets/_metadata/metadata-schema.json b/tools/datasets/_metadata/metadata-schema.json index d3da3f86d..53d9978b3 100644 --- a/tools/datasets/_metadata/metadata-schema.json +++ b/tools/datasets/_metadata/metadata-schema.json @@ -3,9 +3,9 @@ "ext_supported": "bool", "file_name": "str", "name_collision": "bool", + "sha": "str", "size": "int", "suffix": "str", "tag": "str", - "url_github": "str", "url_npm": "str" } \ No newline at end of file diff --git a/tools/datasets/_metadata/metadata.parquet b/tools/datasets/_metadata/metadata.parquet index 97f235546beb0c56abede1cb419eab4afb89dd9c..8bf0e17e3673d2b7cfbbe1ddba345f492d12e674 100644 GIT binary patch delta 3023 zcmaJ?c{G%L8=kT6>&zI8H)NNwO^qc@GnTPNwm~I>$&!6gCJZ8BGV<6$XsGBFvZX>V zs;`tnNq9pfiiqr6-!o3%Io~%Ol0x$i$7W-jz`7PPKfn7(ceuT-*y z4@2P)2t=((d&jF)+{F^IDLSMmQg@S6iH^Loe>S3vEbu{%LmhnbQ`4l{)@X~!komO+ZxXUC_y85Ttzsl0PGf?7{Z*n09tbLisw^AAVzU~i8w{3=+nXNO~7MP?N0--@WD z7eDpN(P{>akmu+^kOcW=Dp?!xND}?P<$!Zk`>r-mVcAh#?XK?<40V#ySdT}(r;=^! znxVl@eaVhH+P*FNjP8%*}84eo7#a&m;Gsop2 zBNYQ9$So$v9XrjZa=zj=eLmK{sJG-BlQ`q1Y^7Q}@RCT^2%u=n9(#Rhc(o!A>*PhM`8NwHxCnS@fo55ti8zv1SR2T)S)3NF+dKNQFezqU$ zPKa@HS9|9PvRF{%+4ZgpXqJm1LI}b3;r~XVZ$dtmpS&{hr3QA+$eyeabh9Oz7T^uNE6%fy1Y|T6Ou2_FF zFHCLZj*CL!yCGvFqW-G5sg?6LJ*!^ez{^l|2?$S^NwZWWJX5mF}-R*tSnmqgOb5^IjLJHw9a9 zY?5?q$VC?yxv~7Cz%2geAcl#-pIKoYQQ|GNVWD?(mDU)1Zab1Yvyu%OJp-Q8iP)m8 zr0o6&gbPo3b1$c3rvR@STgfKeH-jgs@8u4lVFIGNvFj@O0-SmQe+>V863;<v40DUge%<`JLeyBl^*`Kw}ug#Xn$a)Bgv5Ddt7_dQh!aW z*2kSLr>9?~E;^~Nth#U)8|!5mR~(iBFRqX!bp#%FWbp1-1f9P8E}x=1c>nm3nG5Nj zi(L!x)Z6s@1DXRyIeoZ$R$(1OOwq%#U*yBv6~bj7HYuW)yGapzaJlyV$CmBBk1pRJ z4$)cJisNBnmMRZoewGmY`W%unvSnaRi24nEqd!Sjw%GT*!^)zv#<44Yo7Al< zXl*nX1-+*MBkkwuC#;pqs(nH;le6M-TU6-rl(5O0^KBn-*B9gkq+~8dyBczxTK%EW zIknM!f60MqV?VW@N(sz<5O#_-$}xSdbo4PL5VLZ~y+sH8WV=d2XIbo?u?3@CT(C;i z-uab;en_Xp+Dor-7$rAz!I-4!(%~Z1cU!7JCQ)-Jy}zc*i}mYN*d{B6EL=433x@EV z6Rc{eAHSgyJs{@)92*puS^LPQvDSd;W3r!KR@_rj(j;`u@xpq>rx9H7$~wthi~s72 zTi0A-M9yJ!_a(2D!0z%7 zU{k6$0+$BA$7%S&2Udc`j8hfeux^F46I8BJ%A}NCHHj1HuL2gNQ+of(``A#Ra-zlade! 
zM@?gg8yJyKNO12|1poOECh$5hBK(CTIcz2s?H1x>d;gsT} z`jg>8h&fF}GZit8QRzQ)_z*{RPyvuz)PVc z{D`H!XzwPlXlkI7p2#lD+uDQFpGCoiqz^)P#H9B`{%KU260E(_7O)iDmCAM^6a@@R z$oyw`;)sWP`PFSG{0Ky4E)5FAo5`ty(_{~k3mthr63K%mM?{34gFq1dAXWy^xC4dj zK~(NAp}=2eGMfKq;2?Z%5B>#&8$eXb)<9)doZSEJAk@Mfdnt*{T(S#=S8zZO6E{%c z_Eq>09~Pi6_VkfLct=Njk`dL%pAt+)F7s%!4@L)=wJ=lg!TaIyx_EOu-WpH9;|W&y z<9NIo+qc7m#}p4Z5KH;Z*tRdxA_Q==BGBDIhd}csc?6T)z-oIMNN0dyL#)^r0gQn+ zn;4{V!h?+9+Y!$W{H6sZbl6RRTHpb*d!=XD9e@<OF5>2(N z*+7rrAgYI}VU#;bcTeu0=n!V?)`+ecUkgK;uN&cXnoIAE%Y$g4qm*GNf(lA|3HsKX_v`)e-MiLa`|NL@?|%E7ebzl)X&kAU9Cg*a4b~#2 zd!SD>KpsO(Lxb1^2lT2%L*^ke2QS9tn&eQA%{A!uYi0l7jrwc4%?Gnwba8{slMA1ggw>foKEb*Tu@a{v|pmyW^5^ zi|;!kGAy`fiG5^3O(sD#{$R0Jopp`RpRG~j{U@9Jb4a0K_&t;HWa7eshny02Wz16} z2_La-vZ$M@D}0x31j&UviDsWtA4Sd;JFoCf-F$h5r+8`RPO2RoP;$UFFd8cnWx+K> z-*2(RP1Tv%@p7@lR`cYTfx*d~qjUP&l!*^befIL^XSI2R*6NS*cg!vIK94O^Wo*ab zzjO#F7T5=*!Aox>+joixg=%_yuV(JD3pS~GQ~zvE&81({EODkqzh@E}QOH2coKrdB zUT4$VeN9(cN_)ubbBg6irPgD4gRiw~>bdYS`etRGo@8OAyNrn!)+B)%(VgJg$zN0T zwwZn&p@FxagmX{&r{)aZwuHBtpda-o*}iQ{ySCgy#-Ofcyzf{uct%~HRXZu29$jKn ze&rZnOT2RXeAc$)>GUs}5uN6YJiR7|V@ordU29TrzUTr9`Y=>So7wMS{5GisWqhTE$h3a>h`T zm9z77yxT~s6pUFAm+OS~Ii8W>XQSozJ#~w;>Um2+=x$3i#?kC4r~8-X%jT4^!5FaN zTC;AY6t5$p}6dgFf~_V&QZ+=Yk0zVx#=L<+{$*?hn)Uo~Gt2 z!lO2aUCO8OM_Qr{#~g2Rjg$_J_td2&C!x)YZLg3kCk}r6vbyBmN$!9DVsIS`NNOfM zvAS@{S7m|8jcy&Z8YtaZ?~;?l-{LKKvV9sU;*HPTSTOvkR^}q#7He@-{`SJAfK8-N z+_O~C&M2jJ@u2-(gvT`r->y9@k9{DyEZZEy=EP{G~G+gu?d~a zjEJbtr4-K9RD_Pdc5!F+4O(_aV4d}7LuwnJ8j|i&jAyg<9kOqR%3AKbZ^n21fL5)! zY|Pub^pm>zS3PQ^uSA4phmNdFm?iwTA8;l;a(n z9IaT`pU%W@rhQF!hzzU!!#iTn<`c``l$KQ8Eemc*1>U+!!yV&HY#z)M@wqgwnLfOz z-PqpW>cKd1zolCS`Pro>U;b7F>U?R=de$HH)m?l!InOg1#C-^;&jN8%oL!eI|sXl~^%3p)he%y}YvS93Oc zynJMnCNQ~t?&Ha@iU-%rmoMKYs1G=`D(U)0t~@%v zI6+X)4bM|>LZfI~_MGVOxKUAw`^hBJ^P$DGtZe34;jC6?*sgM1Rp7aa+muSK8d{uQ zNB48M(|}_}uqaG;b1~`}>M^(HX!TZkor|++LFOKQhx)1*eZx-bT)?@vr;5+_g<%sf zAF&=fWBY^xld06*=v$gUI2687Rh$*w@A~YKT&Fi@MkPZ&@mtU1<)!?wx#8PJ2l0Vn zpLQd$3)d;7sq^`iz_t^-9@?m{w0iBy^pkxC4%2f?&~cv|v)`wbROouNg)`+^Q~xwX>Fx8vph zutPPqxmUVM1@1b^x2?XqH|cagn=*9+Ng|>oNVnj5yjtpCnw>*htLy$8HrX7H4$D(< zePXIguP+DnnOsulV*xqTkuW2qo*=Qvy{vL9Hc#w=p;`Lu>{s-(RAPEAR$*>B@#gf$ zyS3+vV`fw64~%VG&hwEmIt!84i8=??dNuX-tI}az9+sJ1o7j=BM}t#sOPWgGga}XM zh~9g$MoLz!8R8N=-DW7R;9*Bn9D*k}x)HzVG-kG@P~vWcS$@l-MZQwI+u-;3;RQ(p z$-uNb1K|$Z{N+i@$w?_(GV_@>xW`LkB1&f3elD$6wUQsnV|=V%!BlhlaNX-A{|}aJ ztu&Eu+EbicC4D5F{;8ItBMy|y7xu{ehjO%kjvm)76Xk;%Q4g0+(kXGrbkdJ5buv24tSRB+k zbWEQ=4ezD*r-dCA%SocX@}(O4(~jV#;dKxjauUtzGIH%v(o zxwI2~!-^h)NN1^H!G}D0oXEI1cJ9_TC~lz1U$is0cM-}?X*>ol8i@S34ez&U-GQc9 zkRC+3)DDZ_!Kg!E0vI$bAn42)kI)|*oh5{f*(-h3g1yhXl>$Qp(J&>dzrSw?1cDsi zVJMA+A_x%acO)>}aQ}~MHIj*(*}-12FinW`o68*kq5ns1|KDEvNAH8^Bci|DR>H`O z@$98ciGm`^c0r_vHaM{0u&(HjPD|EAX0zb^$i_=hL@5-4tcww4oeXI>GJS<(AA8f$ zJbG&J5D;k$ z;C3*&s-ri>+e-}qi~`saj ParsedTree: dataset_name=path.stem, suffix=path.suffix, size=tree["size"], - url=tree["url"], + sha=tree["sha"], ext_supported=is_ext_supported(path.suffix), tag=tag, ) @@ -326,7 +326,6 @@ def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: df = ( pl.DataFrame(parsed) .lazy() - .rename({"url": "url_github"}) .with_columns(name_collision=pl.col("dataset_name").is_duplicated()) .with_columns( url_npm=pl.concat_str( diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 2bca343aa..556aafa1a 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -116,7 +116,7 @@ class ParsedTree(TypedDict): dataset_name: str suffix: str size: int - url: str + sha: str ext_supported: bool tag: str @@ -134,10 +134,10 @@ class Metadata(TypedDict, 
total=False): ext_supported: bool file_name: str name_collision: bool + sha: str size: int suffix: str tag: str - url_github: str url_npm: str From 669df027cef9d857f2207c77279281a8a42a03d6 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 19:07:54 +0000 Subject: [PATCH 048/201] feat(perf): Adds caching to `ALTAIR_DATASETS_DIR` --- tools/datasets/__init__.py | 2 +- tools/datasets/_io.py | 51 ++++++++++++++++++++++++++++++++------ 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 47575278c..de98cd281 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -193,7 +193,7 @@ def __call__( **kwds: Any, ) -> pl.DataFrame: """Get a remote dataset and load as tabular data.""" - return self._reader.dataset(self.url(name, ext, tag=tag), **kwds) + return self._reader.dataset(name, ext, tag=tag, **kwds) data = DataLoader(app._from_alias("gh_trees")) diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index e27bbcb7a..228bb9ce1 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -14,10 +14,11 @@ from __future__ import annotations +import os import urllib.request from functools import partial from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypeVar +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypeVar, cast import polars as pl @@ -63,10 +64,25 @@ class Reader: } _scan_fn: ClassVar[dict[_ExtensionScan, ScanFn]] = {".parquet": pl.scan_parquet} _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() + _ENV_VAR: LiteralString = "ALTAIR_DATASETS_DIR" def __init__(self, fp_trees: Path, /) -> None: self._fp_trees: Path = fp_trees + @property + def _datasets_dir(self) -> Path | None: # type: ignore[return] + """ + Returns path to datasets cache, if possible. + + Requires opt-in via environment variable:: + + Reader._ENV_VAR + """ + if _dir := os.environ.get(self._ENV_VAR): + datasets_dir = Path(_dir) + datasets_dir.mkdir(exist_ok=True) + return datasets_dir + @classmethod def reader_from(cls, source: StrPath, /) -> ReadFn: suffix = validate_suffix(source, is_ext_supported) @@ -116,20 +132,41 @@ def _query( msg = f"Found no results for:\n{terms}" raise NotImplementedError(msg) - def dataset(self, url: str, /, **kwds: Any) -> pl.DataFrame: + def dataset( + self, + name: DatasetName | LiteralString, + ext: Extension | None = None, + /, + tag: VersionTag | Literal["latest"] | None = None, + **kwds: Any, + ) -> pl.DataFrame: """ - Fetch a remote dataset. + Fetch a remote dataset, attempt caching if possible. Parameters ---------- - url - Full path to a known dataset. + name, ext, tag + TODO **kwds Arguments passed to the underlying read function. 
""" + df = self._query(**validate_constraints(name, ext, tag)) + result = cast("Metadata", df.row(0, named=True)) + url = result["url_npm"] fn = self.reader_from(url) - with self._opener.open(url) as f: - return fn(f.read(), **kwds) + + if cache := self._datasets_dir: + fp = cache / (result["sha"] + result["suffix"]) + if fp.exists(): + return fn(fp, **kwds) + else: + fp.touch() + with self._opener.open(url) as f: + fp.write_bytes(f.read()) + return fn(fp, **kwds) + else: + with self._opener.open(url) as f: + return fn(f.read(), **kwds) def validate_constraints( From 20514100497595b52bd14e55dec0b139b4d1578a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 23:01:06 +0000 Subject: [PATCH 049/201] feat(DRAFT): Adds initial generic backends --- tools/datasets/__init__.py | 4 +- tools/datasets/_io.py | 200 +++++++++++++++++++++++++++---------- 2 files changed, 151 insertions(+), 53 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index de98cd281..96932b9af 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -15,7 +15,7 @@ import polars as pl from tools.codemod import ruff -from tools.datasets._io import Reader +from tools.datasets._io import get_backend from tools.datasets.github import GitHub from tools.datasets.npm import Npm from tools.schemapi import utils @@ -172,7 +172,7 @@ def generate_datasets_typing(application: Application, output: Path, /) -> None: class DataLoader: def __init__(self, metadata: Path, /) -> None: - self._reader = Reader(metadata) + self._reader = get_backend("polars")(metadata) def url( self, diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index 228bb9ce1..2074def12 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -17,10 +17,25 @@ import os import urllib.request from functools import partial +from itertools import chain, islice from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypeVar, cast - +from typing import ( + TYPE_CHECKING, + Any, + Callable, + ClassVar, + Generic, + Literal, + Protocol, + TypeVar, + cast, + overload, +) + +import narwhals.stable.v1 as nw +import pandas as pd import polars as pl +from narwhals.typing import IntoDataFrameT, IntoExpr, IntoFrameT if TYPE_CHECKING: import sys @@ -40,34 +55,30 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias - from narwhals import typing as nw_typing # noqa: F401 from tools.datasets._typing import DatasetName, Extension, VersionTag from tools.datasets.models import Metadata from tools.schemapi.utils import OneOrSeq _ExtensionScan: TypeAlias = Literal[".parquet"] - - ReadFn: TypeAlias = Callable[..., pl.DataFrame] - ScanFn: TypeAlias = Callable[..., pl.LazyFrame] _T = TypeVar("_T") -__all__ = ["Reader"] +__all__ = ["get_backend"] -class Reader: - _read_fn: ClassVar[dict[Extension, ReadFn]] = { - ".csv": pl.read_csv, - ".json": pl.read_json, - ".tsv": partial(pl.read_csv, separator="\t"), - ".arrow": partial(pl.read_ipc, use_pyarrow=True), - } - _scan_fn: ClassVar[dict[_ExtensionScan, ScanFn]] = {".parquet": pl.scan_parquet} - _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() - _ENV_VAR: LiteralString = "ALTAIR_DATASETS_DIR" - def __init__(self, fp_trees: Path, /) -> None: - self._fp_trees: Path = fp_trees +class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): + """ + Common functionality between backends. 
+ + Trying to use ``narwhals`` as much as possible + """ + + _read_fn: dict[Extension, Callable[..., IntoDataFrameT]] + _scan_fn: dict[_ExtensionScan, Callable[..., IntoFrameT]] + _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() + _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" + _metadata: Path @property def _datasets_dir(self) -> Path | None: # type: ignore[return] @@ -83,15 +94,13 @@ def _datasets_dir(self) -> Path | None: # type: ignore[return] datasets_dir.mkdir(exist_ok=True) return datasets_dir - @classmethod - def reader_from(cls, source: StrPath, /) -> ReadFn: + def reader_from(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: suffix = validate_suffix(source, is_ext_supported) - return cls._read_fn[suffix] + return self._read_fn[suffix] - @classmethod - def scanner_from(cls, source: StrPath, /) -> ScanFn: + def scanner_from(self, source: StrPath, /) -> Callable[..., IntoFrameT]: suffix = validate_suffix(source, is_ext_scan) - return cls._scan_fn[suffix] + return self._scan_fn[suffix] def url( self, @@ -108,30 +117,6 @@ def url( msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." raise TypeError(msg) - def _query( - self, *predicates: OneOrSeq[str | pl.Expr], **constraints: Unpack[Metadata] - ) -> pl.DataFrame: - r""" - Query multi-version trees metadata. - - Parameters - ---------- - \*predicates, \*\*constraints - Passed directly to `pl.LazyFrame.filter`_. - - .. _pl.LazyFrame.filter: - https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html - """ - source = self._fp_trees - fn = self.scanner_from(self._fp_trees) - results = fn(source).filter(*predicates, **constraints).collect() - if not results.is_empty(): - return results - else: - terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) - msg = f"Found no results for:\n{terms}" - raise NotImplementedError(msg) - def dataset( self, name: DatasetName | LiteralString, @@ -139,7 +124,7 @@ def dataset( /, tag: VersionTag | Literal["latest"] | None = None, **kwds: Any, - ) -> pl.DataFrame: + ) -> IntoDataFrameT: """ Fetch a remote dataset, attempt caching if possible. @@ -151,7 +136,8 @@ def dataset( Arguments passed to the underlying read function. """ df = self._query(**validate_constraints(name, ext, tag)) - result = cast("Metadata", df.row(0, named=True)) + it = islice(df.iter_rows(named=True), 1) + result = cast("Metadata", next(it)) url = result["url_npm"] fn = self.reader_from(url) @@ -168,6 +154,91 @@ def dataset( with self._opener.open(url) as f: return fn(f.read(), **kwds) + def _query( + self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] + ) -> nw.DataFrame[IntoDataFrameT]: + r""" + Query multi-version trees metadata. + + Parameters + ---------- + \*predicates, \*\*constraints + Passed directly to `pl.LazyFrame.filter`_. + + .. 
_pl.LazyFrame.filter: + https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html + """ + source = self._metadata + fn = self.scanner_from(source) + frame = nw.from_native(fn(source), pass_through=False) + result = frame.filter(_filter_reduce(predicates, constraints)) + df: nw.DataFrame[Any] = ( + result.collect() if isinstance(result, nw.LazyFrame) else result + ) + if not df.is_empty(): + return df + else: + terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) + msg = f"Found no results for:\n{terms}" + raise NotImplementedError(msg) + + +class _PandasPyArrowReader(_Reader["pd.DataFrame", "pd.DataFrame"]): + _read_fn = { + ".csv": cast( + partial["pd.DataFrame"], partial(pd.read_csv, dtype_backend="pyarrow") + ), + ".json": cast( + partial["pd.DataFrame"], partial(pd.read_json, dtype_backend="pyarrow") + ), + ".tsv": cast( + partial["pd.DataFrame"], + partial(pd.read_csv, sep="\t", dtype_backend="pyarrow"), + ), + ".arrow": partial(pd.read_feather, dtype_backend="pyarrow"), + } + _scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend="pyarrow")} + + def __init__(self, metadata: Path, /) -> None: + self._metadata = metadata + + +class _PandasReader(_Reader["pd.DataFrame", "pd.DataFrame"]): + _read_fn = { + ".csv": pd.read_csv, + ".json": pd.read_json, + ".tsv": cast(partial["pd.DataFrame"], partial(pd.read_csv, sep="\t")), + ".arrow": pd.read_feather, + } + _scan_fn = {".parquet": pd.read_parquet} + + def __init__(self, metadata: Path, /) -> None: + self._metadata = metadata + + +class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): + _read_fn = { + ".csv": pl.read_csv, + ".json": pl.read_json, + ".tsv": partial(pl.read_csv, separator="\t"), + ".arrow": partial(pl.read_ipc, use_pyarrow=True), + } + _scan_fn = {".parquet": pl.scan_parquet} + + def __init__(self, metadata: Path, /) -> None: + self._metadata = metadata + + +def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw.Expr: + """ + ``narwhals`` only accepts ``filter(*predicates)`. + + Manually converts the constraints into ``==`` + """ + return nw.all_horizontal( + chain(predicates, (nw.col(name) == v for name, v in constraints.items())) + ) + def validate_constraints( name: DatasetName | LiteralString, @@ -209,3 +280,30 @@ def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]: def is_ext_supported(suffix: Any) -> TypeIs[Extension]: return suffix in {".csv", ".json", ".tsv", ".arrow"} + + +@overload +def get_backend(backend: Literal["polars"], /) -> type[_PolarsReader]: ... +@overload +def get_backend(backend: Literal["pandas"], /) -> type[_PandasReader]: ... +@overload +def get_backend( + backend: Literal["pandas[pyarrow]"], / +) -> type[_PandasPyArrowReader]: ... 
+def get_backend( + backend: Literal["polars", "pandas", "pandas[pyarrow]"], / +) -> type[_PolarsReader] | type[_PandasPyArrowReader] | type[_PandasReader]: + if backend == "polars": + return _PolarsReader + elif backend == "pandas[pyarrow]": + return _PandasPyArrowReader + elif backend == "pandas": + return _PandasReader + elif backend in {"pyarrow", "duckdb"}: + msg = "Included in ``dev``, not investigated yet" + raise NotImplementedError(msg) + elif backend in {"ibis", "cudf", "dask", "modin"}: + msg = "Supported by ``narwhals``, not investigated yet" + raise NotImplementedError(msg) + else: + raise TypeError(backend) From 0ea4e21348bcc7cf799cec11c72f19e06e1c8a49 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 10:35:11 +0000 Subject: [PATCH 050/201] feat: Generate and move `Metadata` (`TypedDict`) to `datasets._typing` --- tools/datasets/__init__.py | 47 +++++++++++++++++++++++++++++++- tools/datasets/_io.py | 3 +-- tools/datasets/_typing.py | 55 +++++++++++++++++++++++++++++++++++++- tools/datasets/models.py | 14 ---------- 4 files changed, 101 insertions(+), 18 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 96932b9af..b569e55d0 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -140,8 +140,12 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None def generate_datasets_typing(application: Application, output: Path, /) -> None: + from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT + app = application tags = app.scan("gh_tags").select("tag").collect().to_series() + metadata_schema = app.scan("gh_trees").collect_schema().to_python() + DATASET_NAME = "dataset_name" names = ( app.scan("gh_trees") @@ -152,20 +156,61 @@ def generate_datasets_typing(application: Application, output: Path, /) -> None: .collect() .to_series() ) + indent = " " * 4 NAME = "DatasetName" TAG = "VersionTag" EXT = "Extension" + METADATA_TD = "Metadata" + DESCRIPTION_DEFAULT = "_description_" + NOTE_SEP = f"\n\n{indent * 2}" f".. note::\n{indent * 3}" + + name_collision = ( + f"Dataset is available via multiple ``suffix``(s).{NOTE_SEP}" + "Requires specifying a preference in calls to ``data(ext=...)``." + ) + sha = ( + f"Unique hash for the dataset.{NOTE_SEP}" + f"If the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{indent * 3}" + f"then all ``tag``(s) in this range would **share** this value." 
+ ) + descriptions: dict[str, str] = { + "dataset_name": "Equivalent to ``Pathlib.Path.stem``.", + "ext_supported": "Dataset can be read as tabular data.", + "file_name": "Equivalent to ``Pathlib.Path.name``.", + "name_collision": name_collision, + "sha": sha, + "size": "File size (*bytes*).", + "suffix": f"File extension.{NOTE_SEP}Equivalent to ``Pathlib.Path.suffix``", + "tag": "``vega-datasets`` release version.", + "url_npm": "Remote url used to access dataset.", + } + metadata_doc = f"\n{indent}".join( + f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" + for param in metadata_schema + ) + contents = ( f"{HEADER_COMMENT}", "from __future__ import annotations\n", "import sys", "from typing import Literal, TYPE_CHECKING", + utils.import_typing_extensions((3, 14), "TypedDict"), utils.import_typing_extensions((3, 10), "TypeAlias"), "\n", - f"__all__ = {[NAME, TAG, EXT]}\n\n" + f"__all__ = {[NAME, TAG, EXT, METADATA_TD]}\n\n" f"{NAME}: TypeAlias = {utils.spell_literal(names)}", f"{TAG}: TypeAlias = {utils.spell_literal(tags)}", f'{EXT}: TypeAlias = {utils.spell_literal([".csv", ".json", ".tsv", ".arrow"])}', + UNIVERSAL_TYPED_DICT.format( + name=METADATA_TD, + metaclass_kwds=", total=False", + td_args=f"\n{indent}".join( + f"{param}: {tp.__name__}" for param, tp in metadata_schema.items() + ), + summary="Full schema for ``metadata.parquet``.", + doc=metadata_doc, + comment="", + ), ) ruff.write_lint_format(output, contents) diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index 2074def12..14159218d 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -56,8 +56,7 @@ else: from typing_extensions import TypeAlias - from tools.datasets._typing import DatasetName, Extension, VersionTag - from tools.datasets.models import Metadata + from tools.datasets._typing import DatasetName, Extension, Metadata, VersionTag from tools.schemapi.utils import OneOrSeq _ExtensionScan: TypeAlias = Literal[".parquet"] diff --git a/tools/datasets/_typing.py b/tools/datasets/_typing.py index 9414aaab4..0a86bc6ba 100644 --- a/tools/datasets/_typing.py +++ b/tools/datasets/_typing.py @@ -6,13 +6,18 @@ import sys from typing import Literal +if sys.version_info >= (3, 14): + from typing import TypedDict +else: + from typing_extensions import TypedDict + if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias -__all__ = ["DatasetName", "Extension", "VersionTag"] +__all__ = ["DatasetName", "Extension", "Metadata", "VersionTag"] DatasetName: TypeAlias = Literal[ "airports", @@ -135,3 +140,51 @@ "v1.5.0", ] Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"] + + +class Metadata(TypedDict, total=False): + """ + Full schema for ``metadata.parquet``. + + Parameters + ---------- + dataset_name + Equivalent to ``Pathlib.Path.stem``. + ext_supported + Dataset can be read as tabular data. + file_name + Equivalent to ``Pathlib.Path.name``. + name_collision + Dataset is available via multiple ``suffix``(s). + + .. note:: + Requires specifying a preference in calls to ``data(ext=...)``. + sha + Unique hash for the dataset. + + .. note:: + If the dataset did *not* change between ``v1.0.0``-``v2.0.0``; + + then all ``tag``(s) in this range would **share** this value. + size + File size (*bytes*). + suffix + File extension. + + .. note:: + Equivalent to ``Pathlib.Path.suffix`` + tag + ``vega-datasets`` release version. + url_npm + Remote url used to access dataset. 
+ """ + + dataset_name: str + ext_supported: bool + file_name: str + name_collision: bool + sha: str + size: int + suffix: str + tag: str + url_npm: str diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 556aafa1a..044447707 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -127,20 +127,6 @@ class ParsedTreesResponse(TypedDict): tree: list[ParsedTree] -class Metadata(TypedDict, total=False): - """Full schema for `metadata.parquet`.""" - - dataset_name: str - ext_supported: bool - file_name: str - name_collision: bool - sha: str - size: int - suffix: str - tag: str - url_npm: str - - class GitHubRateLimit(TypedDict): limit: int used: int From a2e9baa5ddd825efedd26d3aa3a3dfe5630d4e07 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 13:30:55 +0000 Subject: [PATCH 051/201] feat: Adds optional backends, `polars[pyarrow]`, `with_backend` --- tools/datasets/__init__.py | 48 +++++++++++-- tools/datasets/_io.py | 137 +++++++++++++++++++++++-------------- 2 files changed, 127 insertions(+), 58 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index b569e55d0..864829cf6 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -10,9 +10,10 @@ import json import types from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, Generic, Literal, overload import polars as pl +from narwhals.typing import IntoDataFrameT, IntoFrameT from tools.codemod import ruff from tools.datasets._io import get_backend @@ -24,6 +25,8 @@ import sys from collections.abc import Mapping + import pandas as pd + if sys.version_info >= (3, 11): from typing import LiteralString else: @@ -32,6 +35,7 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias + from tools.datasets._io import _Backend, _Reader from tools.datasets._typing import DatasetName, Extension, VersionTag _PathAlias: TypeAlias = Literal["npm_tags", "gh_tags", "gh_trees"] @@ -215,9 +219,8 @@ def generate_datasets_typing(application: Application, output: Path, /) -> None: ruff.write_lint_format(output, contents) -class DataLoader: - def __init__(self, metadata: Path, /) -> None: - self._reader = get_backend("polars")(metadata) +class DataLoader(Generic[IntoDataFrameT, IntoFrameT]): + _reader: _Reader[IntoDataFrameT, IntoFrameT] def url( self, @@ -236,9 +239,40 @@ def __call__( /, tag: VersionTag | Literal["latest"] | None = None, **kwds: Any, - ) -> pl.DataFrame: + ) -> IntoDataFrameT: """Get a remote dataset and load as tabular data.""" return self._reader.dataset(name, ext, tag=tag, **kwds) - -data = DataLoader(app._from_alias("gh_trees")) + @overload + @classmethod + def with_backend( + cls, backend: Literal["polars", "polars[pyarrow]"], / + ) -> DataLoader[pl.DataFrame, pl.LazyFrame]: ... + + @overload + @classmethod + def with_backend( + cls, backend: Literal["pandas", "pandas[pyarrow]"], / + ) -> DataLoader[pd.DataFrame, pd.DataFrame]: ... + + @classmethod + def with_backend(cls, backend: _Backend, /) -> DataLoader[Any, Any]: + """ + Initialize a new loader, using the specified backend. + + Parameters + ---------- + backend + DataFrame package/config used to return data. 
+ + * *polars*: _ + * *polars[pyarrow]*: Using ``use_pyarrow=True`` + * *pandas*: _ + * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` + """ + obj = DataLoader.__new__(DataLoader) + obj._reader = get_backend(backend) + return obj + + +data = DataLoader.with_backend("polars") diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index 14159218d..9bdb6e5e9 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -17,6 +17,8 @@ import os import urllib.request from functools import partial +from importlib import import_module +from importlib.util import find_spec from itertools import chain, islice from pathlib import Path from typing import ( @@ -33,14 +35,15 @@ ) import narwhals.stable.v1 as nw -import pandas as pd -import polars as pl from narwhals.typing import IntoDataFrameT, IntoExpr, IntoFrameT if TYPE_CHECKING: import sys from urllib.request import OpenerDirector + import pandas as pd + import polars as pl + import pyarrow as pa # noqa: F401 from _typeshed import StrPath if sys.version_info >= (3, 13): @@ -61,6 +64,9 @@ _ExtensionScan: TypeAlias = Literal[".parquet"] _T = TypeVar("_T") + _Backend: TypeAlias = Literal[ + "polars", "pandas", "pandas[pyarrow]", "polars[pyarrow]" + ] __all__ = ["get_backend"] @@ -77,7 +83,7 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): _scan_fn: dict[_ExtensionScan, Callable[..., IntoFrameT]] _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" - _metadata: Path + _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" @property def _datasets_dir(self) -> Path | None: # type: ignore[return] @@ -181,51 +187,76 @@ def _query( msg = f"Found no results for:\n{terms}" raise NotImplementedError(msg) + def _import(self, name: str, /) -> Any: + if spec := find_spec(name): + return import_module(spec.name) + else: + msg = f"{type(self).__name__!r} requires missing dependency {name!r}." + raise ModuleNotFoundError(msg, name=name) + + def __init__(self, *specs: str) -> None: ... 
+ class _PandasPyArrowReader(_Reader["pd.DataFrame", "pd.DataFrame"]): - _read_fn = { - ".csv": cast( - partial["pd.DataFrame"], partial(pd.read_csv, dtype_backend="pyarrow") - ), - ".json": cast( - partial["pd.DataFrame"], partial(pd.read_json, dtype_backend="pyarrow") - ), - ".tsv": cast( - partial["pd.DataFrame"], - partial(pd.read_csv, sep="\t", dtype_backend="pyarrow"), - ), - ".arrow": partial(pd.read_feather, dtype_backend="pyarrow"), - } - _scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend="pyarrow")} - - def __init__(self, metadata: Path, /) -> None: - self._metadata = metadata + def __init__(self, _pd: str, _pa: str, /) -> None: + if not TYPE_CHECKING: + pd = self._import(_pd) + pa = self._import(_pa) # noqa: F841 + + self._read_fn = { + ".csv": cast( + partial["pd.DataFrame"], partial(pd.read_csv, dtype_backend="pyarrow") + ), + ".json": cast( + partial["pd.DataFrame"], partial(pd.read_json, dtype_backend="pyarrow") + ), + ".tsv": cast( + partial["pd.DataFrame"], + partial(pd.read_csv, sep="\t", dtype_backend="pyarrow"), + ), + ".arrow": partial(pd.read_feather, dtype_backend="pyarrow"), + } + self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend="pyarrow")} class _PandasReader(_Reader["pd.DataFrame", "pd.DataFrame"]): - _read_fn = { - ".csv": pd.read_csv, - ".json": pd.read_json, - ".tsv": cast(partial["pd.DataFrame"], partial(pd.read_csv, sep="\t")), - ".arrow": pd.read_feather, - } - _scan_fn = {".parquet": pd.read_parquet} - - def __init__(self, metadata: Path, /) -> None: - self._metadata = metadata + def __init__(self, _pd: str, /) -> None: + if not TYPE_CHECKING: + pd = self._import(_pd) + self._read_fn = { + ".csv": pd.read_csv, + ".json": pd.read_json, + ".tsv": cast(partial["pd.DataFrame"], partial(pd.read_csv, sep="\t")), + ".arrow": pd.read_feather, + } + self._scan_fn = {".parquet": pd.read_parquet} class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - _read_fn = { - ".csv": pl.read_csv, - ".json": pl.read_json, - ".tsv": partial(pl.read_csv, separator="\t"), - ".arrow": partial(pl.read_ipc, use_pyarrow=True), - } - _scan_fn = {".parquet": pl.scan_parquet} - - def __init__(self, metadata: Path, /) -> None: - self._metadata = metadata + def __init__(self, _pl: str, /) -> None: + if not TYPE_CHECKING: + pl = self._import(_pl) + self._read_fn = { + ".csv": pl.read_csv, + ".json": pl.read_json, + ".tsv": partial(pl.read_csv, separator="\t"), + ".arrow": pl.read_ipc, + } + self._scan_fn = {".parquet": pl.scan_parquet} + + +class _PolarsPyArrowReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): + def __init__(self, _pl: str, _pa: str, /) -> None: + if not TYPE_CHECKING: + pl = self._import(_pl) + pa = self._import(_pa) # noqa: F841 + self._read_fn = { + ".csv": partial(pl.read_csv, use_pyarrow=True), + ".json": pl.read_json, + ".tsv": partial(pl.read_csv, separator="\t", use_pyarrow=True), + ".arrow": partial(pl.read_ipc, use_pyarrow=True), + } + self._scan_fn = {".parquet": pl.scan_parquet} def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw.Expr: @@ -281,23 +312,27 @@ def is_ext_supported(suffix: Any) -> TypeIs[Extension]: return suffix in {".csv", ".json", ".tsv", ".arrow"} -@overload -def get_backend(backend: Literal["polars"], /) -> type[_PolarsReader]: ... -@overload -def get_backend(backend: Literal["pandas"], /) -> type[_PandasReader]: ... @overload def get_backend( - backend: Literal["pandas[pyarrow]"], / -) -> type[_PandasPyArrowReader]: ... 
+ backend: Literal["polars", "polars[pyarrow]"], / +) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... + + +@overload def get_backend( - backend: Literal["polars", "pandas", "pandas[pyarrow]"], / -) -> type[_PolarsReader] | type[_PandasPyArrowReader] | type[_PandasReader]: + backend: Literal["pandas", "pandas[pyarrow]"], / +) -> _Reader[pd.DataFrame, pd.DataFrame]: ... + + +def get_backend(backend: _Backend, /) -> _Reader[Any, Any]: if backend == "polars": - return _PolarsReader + return _PolarsReader("polars") + elif backend == "polars[pyarrow]": + return _PolarsPyArrowReader("polars", "pyarrow") elif backend == "pandas[pyarrow]": - return _PandasPyArrowReader + return _PandasPyArrowReader("pandas", "pyarrow") elif backend == "pandas": - return _PandasReader + return _PandasReader("pandas") elif backend in {"pyarrow", "duckdb"}: msg = "Included in ``dev``, not investigated yet" raise NotImplementedError(msg) From c8a1429064d20a1ed89e7723363c52779b5650cc Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 15:19:10 +0000 Subject: [PATCH 052/201] feat: Adds `pyarrow` backend --- tools/datasets/__init__.py | 7 +++++ tools/datasets/_io.py | 59 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 864829cf6..3c1c8b13d 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -26,6 +26,7 @@ from collections.abc import Mapping import pandas as pd + import pyarrow as pa if sys.version_info >= (3, 11): from typing import LiteralString @@ -255,6 +256,12 @@ def with_backend( cls, backend: Literal["pandas", "pandas[pyarrow]"], / ) -> DataLoader[pd.DataFrame, pd.DataFrame]: ... + @overload + @classmethod + def with_backend( + cls, backend: Literal["pyarrow"], / + ) -> DataLoader[pa.Table, pa.Table]: ... + @classmethod def with_backend(cls, backend: _Backend, /) -> DataLoader[Any, Any]: """ diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index 9bdb6e5e9..a75d0bd17 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -43,8 +43,12 @@ import pandas as pd import polars as pl - import pyarrow as pa # noqa: F401 + import pyarrow as pa from _typeshed import StrPath + from pyarrow.csv import read_csv as pa_read_csv # noqa: F401 + from pyarrow.feather import read_table as pa_read_feather # noqa: F401 + from pyarrow.json import read_json as pa_read_json # noqa: F401 + from pyarrow.parquet import read_table as pa_read_parquet # noqa: F401 if sys.version_info >= (3, 13): from typing import TypeIs, Unpack @@ -65,7 +69,7 @@ _ExtensionScan: TypeAlias = Literal[".parquet"] _T = TypeVar("_T") _Backend: TypeAlias = Literal[ - "polars", "pandas", "pandas[pyarrow]", "polars[pyarrow]" + "polars", "pandas", "pandas[pyarrow]", "polars[pyarrow]", "pyarrow" ] @@ -259,6 +263,49 @@ def __init__(self, _pl: str, _pa: str, /) -> None: self._scan_fn = {".parquet": pl.scan_parquet} +class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): + """ + Reader backed by `pyarrow.Table`_. + + Warning + ------- + **JSON**: Only supports `line-delimited`_ JSON. + Likely to raise the following error: + + ArrowInvalid: JSON parse error: Column() changed from object to array in row 0 + + .. _pyarrow.Table: + https://arrow.apache.org/docs/python/generated/pyarrow.Table.html + .. 
_line-delimited: + https://arrow.apache.org/docs/python/json.html#reading-json-files + """ + + def __init__(self, _pa: str, /) -> None: + if not TYPE_CHECKING: + pa = self._import(_pa) # noqa: F841 + pa_csv = self._import(f"{_pa}.csv") + pa_feather = self._import(f"{_pa}.feather") + pa_json = self._import(f"{_pa}.json") + pa_parquet = self._import(f"{_pa}.parquet") + + pa_read_csv = pa_csv.read_csv + pa_read_feather = pa_feather.read_table + pa_read_json = pa_json.read_json + pa_read_parquet = pa_parquet.read_table + + # opt1 = ParseOptions(delimiter="\t") # type: ignore + # Stubs suggest using a dataclass, but no way to construct it + opt2: Any = {"delimiter": "\t"} + + self._read_fn = { + ".csv": pa_read_csv, + ".json": pa_read_json, + ".tsv": partial(pa_read_csv, parse_options=opt2), + ".arrow": pa_read_feather, + } + self._scan_fn = {".parquet": pa_read_parquet} + + def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw.Expr: """ ``narwhals`` only accepts ``filter(*predicates)`. @@ -324,6 +371,10 @@ def get_backend( ) -> _Reader[pd.DataFrame, pd.DataFrame]: ... +@overload +def get_backend(backend: Literal["pyarrow"], /) -> _Reader[pa.Table, pa.Table]: ... + + def get_backend(backend: _Backend, /) -> _Reader[Any, Any]: if backend == "polars": return _PolarsReader("polars") @@ -333,7 +384,9 @@ def get_backend(backend: _Backend, /) -> _Reader[Any, Any]: return _PandasPyArrowReader("pandas", "pyarrow") elif backend == "pandas": return _PandasReader("pandas") - elif backend in {"pyarrow", "duckdb"}: + elif backend == "pyarrow": + return _PyArrowReader("pyarrow") + elif backend == "duckdb": msg = "Included in ``dev``, not investigated yet" raise NotImplementedError(msg) elif backend in {"ibis", "cudf", "dask", "modin"}: From 279fea952007d83bd99e6cba1dfb79ca1a8ff70a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 15:19:52 +0000 Subject: [PATCH 053/201] docs: Update `.with_backend()` --- tools/datasets/__init__.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 3c1c8b13d..6592d5d93 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -272,10 +272,21 @@ def with_backend(cls, backend: _Backend, /) -> DataLoader[Any, Any]: backend DataFrame package/config used to return data. - * *polars*: _ + * *polars*: Using `polars defaults`_ * *polars[pyarrow]*: Using ``use_pyarrow=True`` - * *pandas*: _ + * *pandas*: Using `pandas defaults`_. * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` + * *pyarrow*: (*Experimental*) + + .. warning:: + Most datasets use a `JSON format not supported`_ by ``pyarrow`` + + .. _polars defaults: + https://docs.pola.rs/api/python/stable/reference/io.html + .. _pandas defaults: + https://pandas.pydata.org/docs/reference/io.html + .. 
_JSON format not supported: + https://arrow.apache.org/docs/python/json.html#reading-json-files """ obj = DataLoader.__new__(DataLoader) obj._reader = get_backend(backend) From 7d6c7ca2dce60c30b3c5e0107f9a496a17cb9695 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 16:17:40 +0000 Subject: [PATCH 054/201] chore: Remove `duckdb` comment Not planning to support this anymore, requires `fsspec` which isn't in `dev` ``` InvalidInputException Traceback (most recent call last) Cell In[6], line 5 3 with duck._reader._opener.open(url) as f: 4 fn = duck._reader._read_fn['.json'] ----> 5 thing = fn(f.read()) InvalidInputException: Invalid Input Error: This operation could not be completed because required module 'fsspec' is not installed" ``` --- tools/datasets/_io.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index a75d0bd17..7989ae282 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -386,9 +386,6 @@ def get_backend(backend: _Backend, /) -> _Reader[Any, Any]: return _PandasReader("pandas") elif backend == "pyarrow": return _PyArrowReader("pyarrow") - elif backend == "duckdb": - msg = "Included in ``dev``, not investigated yet" - raise NotImplementedError(msg) elif backend in {"ibis", "cudf", "dask", "modin"}: msg = "Supported by ``narwhals``, not investigated yet" raise NotImplementedError(msg) From 0bb4210b5aa5ff22c345946a8e73a432373529ff Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 16:21:09 +0000 Subject: [PATCH 055/201] ci(typing): Add `pyarrow-stubs` to `dev` dependencies Will put this in another PR, but need it here for IDE support --- pyproject.toml | 1 + tests/utils/test_utils.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ae15a8a4b..4132f0a25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ dev = [ "duckdb>=1.0", "ipython[kernel]", "pandas>=1.1.3", + "pyarrow-stubs", "pytest", "pytest-cov", "pytest-xdist[psutil]~=3.5", diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index c3b329cf0..36ed1b097 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -137,10 +137,11 @@ def test_sanitize_pyarrow_table_columns() -> None: ) # Create pyarrow table with explicit schema so that date32 type is preserved + # error: Argument 1 to "schema" has incompatible type "list[object]"; expected "Iterable[Field[Any]] | Iterable[tuple[str, DataType]] | Mapping[str, DataType]" [arg-type] pa_table = pa.Table.from_pandas( df, pa.schema( - [ + ( pa.field("s", pa.string()), pa.field("f", pa.float64()), pa.field("i", pa.int64()), @@ -148,7 +149,7 @@ def test_sanitize_pyarrow_table_columns() -> None: pa.field("d", pa.date32()), pa.field("c", pa.dictionary(pa.int8(), pa.string())), pa.field("p", pa.timestamp("ns", tz="UTC")), - ] + ) ), ) sanitized = sanitize_narwhals_dataframe(nw.from_native(pa_table, eager_only=True)) From 89844253a51de27d4dac0590b013fb4f5361dd35 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 16:30:24 +0000 Subject: [PATCH 056/201] refactor: `generate_datasets_typing` -> `Application.generate_typing` --- tools/datasets/__init__.py | 150 ++++++++++++++++++------------------- 1 file changed, 74 insertions(+), 76 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 
6592d5d93..645775fb4 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -134,6 +134,80 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None with fp_schema.open("w") as f: json.dump(schema, f, indent=2) + def generate_typing(self, output: Path, /) -> None: + from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT + + tags = self.scan("gh_tags").select("tag").collect().to_series() + metadata_schema = self.scan("gh_trees").collect_schema().to_python() + + DATASET_NAME = "dataset_name" + names = ( + self.scan("gh_trees") + .filter("ext_supported") + .unique(DATASET_NAME) + .select(DATASET_NAME) + .sort(DATASET_NAME) + .collect() + .to_series() + ) + indent = " " * 4 + NAME = "DatasetName" + TAG = "VersionTag" + EXT = "Extension" + METADATA_TD = "Metadata" + DESCRIPTION_DEFAULT = "_description_" + NOTE_SEP = f"\n\n{indent * 2}" f".. note::\n{indent * 3}" + + name_collision = ( + f"Dataset is available via multiple ``suffix``(s).{NOTE_SEP}" + "Requires specifying a preference in calls to ``data(ext=...)``." + ) + sha = ( + f"Unique hash for the dataset.{NOTE_SEP}" + f"If the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{indent * 3}" + f"then all ``tag``(s) in this range would **share** this value." + ) + descriptions: dict[str, str] = { + "dataset_name": "Equivalent to ``Pathlib.Path.stem``.", + "ext_supported": "Dataset can be read as tabular data.", + "file_name": "Equivalent to ``Pathlib.Path.name``.", + "name_collision": name_collision, + "sha": sha, + "size": "File size (*bytes*).", + "suffix": f"File extension.{NOTE_SEP}Equivalent to ``Pathlib.Path.suffix``", + "tag": "``vega-datasets`` release version.", + "url_npm": "Remote url used to access dataset.", + } + metadata_doc = f"\n{indent}".join( + f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" + for param in metadata_schema + ) + + contents = ( + f"{HEADER_COMMENT}", + "from __future__ import annotations\n", + "import sys", + "from typing import Literal, TYPE_CHECKING", + utils.import_typing_extensions((3, 14), "TypedDict"), + utils.import_typing_extensions((3, 10), "TypeAlias"), + "\n", + f"__all__ = {[NAME, TAG, EXT, METADATA_TD]}\n\n" + f"{NAME}: TypeAlias = {utils.spell_literal(names)}", + f"{TAG}: TypeAlias = {utils.spell_literal(tags)}", + f'{EXT}: TypeAlias = {utils.spell_literal([".csv", ".json", ".tsv", ".arrow"])}', + UNIVERSAL_TYPED_DICT.format( + name=METADATA_TD, + metaclass_kwds=", total=False", + td_args=f"\n{indent}".join( + f"{param}: {tp.__name__}" for param, tp in metadata_schema.items() + ), + summary="Full schema for ``metadata.parquet``.", + doc=metadata_doc, + comment="", + ), + ) + ruff.write_lint_format(output, contents) + app = Application(Path(__file__).parent / "_metadata", write_schema=True) @@ -144,82 +218,6 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None _CURRENT_SOURCE_TAG = "v2.9.0" -def generate_datasets_typing(application: Application, output: Path, /) -> None: - from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT - - app = application - tags = app.scan("gh_tags").select("tag").collect().to_series() - metadata_schema = app.scan("gh_trees").collect_schema().to_python() - - DATASET_NAME = "dataset_name" - names = ( - app.scan("gh_trees") - .filter("ext_supported") - .unique(DATASET_NAME) - .select(DATASET_NAME) - .sort(DATASET_NAME) - .collect() - .to_series() - ) - indent = " " * 4 - NAME = "DatasetName" - TAG = "VersionTag" - EXT = "Extension" - METADATA_TD = 
"Metadata" - DESCRIPTION_DEFAULT = "_description_" - NOTE_SEP = f"\n\n{indent * 2}" f".. note::\n{indent * 3}" - - name_collision = ( - f"Dataset is available via multiple ``suffix``(s).{NOTE_SEP}" - "Requires specifying a preference in calls to ``data(ext=...)``." - ) - sha = ( - f"Unique hash for the dataset.{NOTE_SEP}" - f"If the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{indent * 3}" - f"then all ``tag``(s) in this range would **share** this value." - ) - descriptions: dict[str, str] = { - "dataset_name": "Equivalent to ``Pathlib.Path.stem``.", - "ext_supported": "Dataset can be read as tabular data.", - "file_name": "Equivalent to ``Pathlib.Path.name``.", - "name_collision": name_collision, - "sha": sha, - "size": "File size (*bytes*).", - "suffix": f"File extension.{NOTE_SEP}Equivalent to ``Pathlib.Path.suffix``", - "tag": "``vega-datasets`` release version.", - "url_npm": "Remote url used to access dataset.", - } - metadata_doc = f"\n{indent}".join( - f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" - for param in metadata_schema - ) - - contents = ( - f"{HEADER_COMMENT}", - "from __future__ import annotations\n", - "import sys", - "from typing import Literal, TYPE_CHECKING", - utils.import_typing_extensions((3, 14), "TypedDict"), - utils.import_typing_extensions((3, 10), "TypeAlias"), - "\n", - f"__all__ = {[NAME, TAG, EXT, METADATA_TD]}\n\n" - f"{NAME}: TypeAlias = {utils.spell_literal(names)}", - f"{TAG}: TypeAlias = {utils.spell_literal(tags)}", - f'{EXT}: TypeAlias = {utils.spell_literal([".csv", ".json", ".tsv", ".arrow"])}', - UNIVERSAL_TYPED_DICT.format( - name=METADATA_TD, - metaclass_kwds=", total=False", - td_args=f"\n{indent}".join( - f"{param}: {tp.__name__}" for param, tp in metadata_schema.items() - ), - summary="Full schema for ``metadata.parquet``.", - doc=metadata_doc, - comment="", - ), - ) - ruff.write_lint_format(output, contents) - - class DataLoader(Generic[IntoDataFrameT, IntoFrameT]): _reader: _Reader[IntoDataFrameT, IntoFrameT] From 9d062c8c8e030d4ea6b1288cf9e93692c60c78a0 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 18:22:21 +0000 Subject: [PATCH 057/201] refactor: Split `datasets` into public/private packages - `tools.datasets`: Building & updating metadata file(s), generating annotations - `altair.datasets`: Consuming metadata, remote & cached dataset management --- altair/__init__.py | 3 +- altair/datasets/__init__.py | 117 ++++++++++++++++++ .../datasets/_metadata/metadata.parquet | Bin .../_io.py => altair/datasets/_readers.py | 11 +- {tools => altair}/datasets/_typing.py | 0 tools/datasets/__init__.py | 112 +++-------------- tools/datasets/_metadata/metadata-schema.json | 11 -- tools/datasets/github.py | 14 ++- 8 files changed, 146 insertions(+), 122 deletions(-) create mode 100644 altair/datasets/__init__.py rename {tools => altair}/datasets/_metadata/metadata.parquet (100%) rename tools/datasets/_io.py => altair/datasets/_readers.py (97%) rename {tools => altair}/datasets/_typing.py (100%) delete mode 100644 tools/datasets/_metadata/metadata-schema.json diff --git a/altair/__init__.py b/altair/__init__.py index d4e20f02f..d0d23dbaf 100644 --- a/altair/__init__.py +++ b/altair/__init__.py @@ -603,6 +603,7 @@ "core", "data", "data_transformers", + "datasets", "datum", "default_data_transformer", "display", @@ -653,7 +654,7 @@ def __dir__(): from altair.jupyter import JupyterChart from altair.expr import expr from altair.utils import 
AltairDeprecationWarning, parse_shorthand, Undefined -from altair import typing +from altair import typing, datasets def load_ipython_extension(ipython): diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py new file mode 100644 index 000000000..15c8069f9 --- /dev/null +++ b/altair/datasets/__init__.py @@ -0,0 +1,117 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Generic, overload + +from narwhals.typing import IntoDataFrameT, IntoFrameT + +from altair.datasets._readers import _Reader, get_backend + +if TYPE_CHECKING: + import sys + from typing import Any, Literal + + import pandas as pd + import polars as pl + import pyarrow as pa + + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + from altair.datasets._readers import _Backend + from altair.datasets._typing import DatasetName, Extension, VersionTag + +__all__ = ["Loader", "data"] + + +class Loader(Generic[IntoDataFrameT, IntoFrameT]): + _reader: _Reader[IntoDataFrameT, IntoFrameT] + + def url( + self, + name: DatasetName | LiteralString, + ext: Extension | None = None, + /, + tag: VersionTag | Literal["latest"] | None = None, + ) -> str: + """Return the address of a remote dataset.""" + return self._reader.url(name, ext, tag=tag) + + def __call__( + self, + name: DatasetName | LiteralString, + ext: Extension | None = None, + /, + tag: VersionTag | Literal["latest"] | None = None, + **kwds: Any, + ) -> IntoDataFrameT: + """Get a remote dataset and load as tabular data.""" + return self._reader.dataset(name, ext, tag=tag, **kwds) + + def __repr__(self) -> str: + return f"{type(self).__name__}[{type(self._reader).__name__}]" + + @overload + @classmethod + def with_backend( + cls, backend: Literal["polars", "polars[pyarrow]"], / + ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... + + @overload + @classmethod + def with_backend( + cls, backend: Literal["pandas", "pandas[pyarrow]"], / + ) -> Loader[pd.DataFrame, pd.DataFrame]: ... + + @overload + @classmethod + def with_backend( + cls, backend: Literal["pyarrow"], / + ) -> Loader[pa.Table, pa.Table]: ... + + @classmethod + def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: + """ + Initialize a new loader, using the specified backend. + + Parameters + ---------- + backend + DataFrame package/config used to return data. + + * *polars*: Using `polars defaults`_ + * *polars[pyarrow]*: Using ``use_pyarrow=True`` + * *pandas*: Using `pandas defaults`_. + * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` + * *pyarrow*: (*Experimental*) + + .. warning:: + Most datasets use a `JSON format not supported`_ by ``pyarrow`` + + .. _polars defaults: + https://docs.pola.rs/api/python/stable/reference/io.html + .. _pandas defaults: + https://pandas.pydata.org/docs/reference/io.html + .. 
_JSON format not supported: + https://arrow.apache.org/docs/python/json.html#reading-json-files + """ + obj = Loader.__new__(Loader) + obj._reader = get_backend(backend) + return obj + + +def __getattr__(name): + if name == "data": + global data + data = Loader.with_backend("pandas") + from altair.utils.deprecation import deprecated_warn + + deprecated_warn( + "Added only for backwards compatibility with `altair-viz/vega_datasets`.", + version="5.5.0", + alternative="altair.datasets.Loader.with_backend(...)", + stacklevel=3, + ) + return data + else: + raise AttributeError(name) diff --git a/tools/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet similarity index 100% rename from tools/datasets/_metadata/metadata.parquet rename to altair/datasets/_metadata/metadata.parquet diff --git a/tools/datasets/_io.py b/altair/datasets/_readers.py similarity index 97% rename from tools/datasets/_io.py rename to altair/datasets/_readers.py index 7989ae282..cbb02cd00 100644 --- a/tools/datasets/_io.py +++ b/altair/datasets/_readers.py @@ -1,15 +1,10 @@ """ -Will be part of the public ``alt.datasets`` subpackage. +Backends for ``alt.datasets.Loader``. - Interfacing with the cached metadata. - But not updating it - Performing requests from those urls - Dispatching read function on file extension - -Note ----- -- Building with ``polars`` first, then will work backwards with ``narwhals``. - - Since ``narwhals`` is a subset of ``polars`` """ from __future__ import annotations @@ -63,8 +58,8 @@ else: from typing_extensions import TypeAlias - from tools.datasets._typing import DatasetName, Extension, Metadata, VersionTag - from tools.schemapi.utils import OneOrSeq + from altair.datasets._typing import DatasetName, Extension, Metadata, VersionTag + from altair.vegalite.v5.schema._typing import OneOrSeq _ExtensionScan: TypeAlias = Literal[".parquet"] _T = TypeVar("_T") diff --git a/tools/datasets/_typing.py b/altair/datasets/_typing.py similarity index 100% rename from tools/datasets/_typing.py rename to altair/datasets/_typing.py diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 645775fb4..d9b00d9a5 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -10,13 +10,11 @@ import json import types from pathlib import Path -from typing import TYPE_CHECKING, Any, Generic, Literal, overload +from typing import TYPE_CHECKING, Any, Literal import polars as pl -from narwhals.typing import IntoDataFrameT, IntoFrameT from tools.codemod import ruff -from tools.datasets._io import get_backend from tools.datasets.github import GitHub from tools.datasets.npm import Npm from tools.schemapi import utils @@ -25,25 +23,14 @@ import sys from collections.abc import Mapping - import pandas as pd - import pyarrow as pa - - if sys.version_info >= (3, 11): - from typing import LiteralString - else: - from typing_extensions import LiteralString if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias - from tools.datasets._io import _Backend, _Reader - from tools.datasets._typing import DatasetName, Extension, VersionTag _PathAlias: TypeAlias = Literal["npm_tags", "gh_tags", "gh_trees"] - WorkInProgress: TypeAlias = Any - -__all__ = ["app", "data"] +__all__ = ["app"] HEADER_COMMENT = """\ # The contents of this file are automatically written by @@ -61,7 +48,8 @@ class Application: def __init__( self, - output_dir: Path, + out_dir_tools: Path, + out_dir_altair: Path, *, write_schema: bool, trees_gh: str = 
"metadata", @@ -70,14 +58,18 @@ def __init__( kwds_gh: Mapping[str, Any] | None = None, kwds_npm: Mapping[str, Any] | None = None, ) -> None: - output_dir.mkdir(exist_ok=True) + out_dir_tools.mkdir(exist_ok=True) kwds_gh = kwds_gh or {} kwds_npm = kwds_npm or {} self._write_schema: bool = write_schema self._github: GitHub = GitHub( - output_dir, name_tags=tags_gh, name_trees=trees_gh, **kwds_gh + out_dir_tools, + out_dir_altair, + name_tags=tags_gh, + name_trees=trees_gh, + **kwds_gh, ) - self._npm: Npm = Npm(output_dir, name_tags=tags_npm, **kwds_npm) + self._npm: Npm = Npm(out_dir_tools, name_tags=tags_npm, **kwds_npm) self._paths = types.MappingProxyType["_PathAlias", Path]( { "npm_tags": self.npm._paths["tags"], @@ -209,86 +201,14 @@ def generate_typing(self, output: Path, /) -> None: ruff.write_lint_format(output, contents) -app = Application(Path(__file__).parent / "_metadata", write_schema=True) +app = Application( + Path(__file__).parent / "_metadata", + Path(__file__).parent.parent.parent / "altair" / "datasets" / "_metadata", + write_schema=False, +) # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago _CURRENT_SOURCE_TAG = "v2.9.0" - - -class DataLoader(Generic[IntoDataFrameT, IntoFrameT]): - _reader: _Reader[IntoDataFrameT, IntoFrameT] - - def url( - self, - name: DatasetName | LiteralString, - ext: Extension | None = None, - /, - tag: VersionTag | Literal["latest"] | None = None, - ) -> str: - """Return the address of a remote dataset.""" - return self._reader.url(name, ext, tag=tag) - - def __call__( - self, - name: DatasetName | LiteralString, - ext: Extension | None = None, - /, - tag: VersionTag | Literal["latest"] | None = None, - **kwds: Any, - ) -> IntoDataFrameT: - """Get a remote dataset and load as tabular data.""" - return self._reader.dataset(name, ext, tag=tag, **kwds) - - @overload - @classmethod - def with_backend( - cls, backend: Literal["polars", "polars[pyarrow]"], / - ) -> DataLoader[pl.DataFrame, pl.LazyFrame]: ... - - @overload - @classmethod - def with_backend( - cls, backend: Literal["pandas", "pandas[pyarrow]"], / - ) -> DataLoader[pd.DataFrame, pd.DataFrame]: ... - - @overload - @classmethod - def with_backend( - cls, backend: Literal["pyarrow"], / - ) -> DataLoader[pa.Table, pa.Table]: ... - - @classmethod - def with_backend(cls, backend: _Backend, /) -> DataLoader[Any, Any]: - """ - Initialize a new loader, using the specified backend. - - Parameters - ---------- - backend - DataFrame package/config used to return data. - - * *polars*: Using `polars defaults`_ - * *polars[pyarrow]*: Using ``use_pyarrow=True`` - * *pandas*: Using `pandas defaults`_. - * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` - * *pyarrow*: (*Experimental*) - - .. warning:: - Most datasets use a `JSON format not supported`_ by ``pyarrow`` - - .. _polars defaults: - https://docs.pola.rs/api/python/stable/reference/io.html - .. _pandas defaults: - https://pandas.pydata.org/docs/reference/io.html - .. 
_JSON format not supported: - https://arrow.apache.org/docs/python/json.html#reading-json-files - """ - obj = DataLoader.__new__(DataLoader) - obj._reader = get_backend(backend) - return obj - - -data = DataLoader.with_backend("polars") diff --git a/tools/datasets/_metadata/metadata-schema.json b/tools/datasets/_metadata/metadata-schema.json deleted file mode 100644 index 53d9978b3..000000000 --- a/tools/datasets/_metadata/metadata-schema.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "dataset_name": "str", - "ext_supported": "bool", - "file_name": "str", - "name_collision": "bool", - "sha": "str", - "size": "int", - "suffix": "str", - "tag": "str", - "url_npm": "str" -} \ No newline at end of file diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 0238aab69..8b58e8690 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -37,7 +37,7 @@ from email.message import Message from urllib.request import OpenerDirector, Request - from tools.datasets._typing import Extension + from altair.datasets._typing import Extension if sys.version_info >= (3, 13): from typing import TypeIs @@ -270,7 +270,8 @@ class GitHub: def __init__( self, - output_dir: Path, + out_dir_tools: Path, + out_dir_altair: Path, name_tags: str, name_trees: str, *, @@ -278,11 +279,12 @@ def __init__( org: LiteralString = "vega", package: LiteralString = "vega-datasets", ) -> None: - output_dir.mkdir(exist_ok=True) + out_dir_tools.mkdir(exist_ok=True) + out_dir_altair.mkdir(exist_ok=True) self._paths: dict[_PathName, Path] = { - "dir": output_dir, - "tags": output_dir / f"{name_tags}.parquet", - "trees": output_dir / f"{name_trees}.parquet", + "dir": out_dir_tools, + "tags": out_dir_tools / f"{name_tags}.parquet", + "trees": out_dir_altair / f"{name_trees}.parquet", } repo = f"{base_url}repos/{org}/{package}/" self._url = GitHubUrl( From a17d674303558f0989b2aaac835efa3d04de80cc Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 18:44:57 +0000 Subject: [PATCH 058/201] refactor: Provide `npm` url to `GitHub(...)` --- tools/datasets/__init__.py | 3 ++- tools/datasets/github.py | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index d9b00d9a5..6319bd65e 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -62,14 +62,15 @@ def __init__( kwds_gh = kwds_gh or {} kwds_npm = kwds_npm or {} self._write_schema: bool = write_schema + self._npm: Npm = Npm(out_dir_tools, name_tags=tags_npm, **kwds_npm) self._github: GitHub = GitHub( out_dir_tools, out_dir_altair, name_tags=tags_gh, name_trees=trees_gh, + npm_cdn_url=self._npm.url.CDN, **kwds_gh, ) - self._npm: Npm = Npm(out_dir_tools, name_tags=tags_npm, **kwds_npm) self._paths = types.MappingProxyType["_PathAlias", Path]( { "npm_tags": self.npm._paths["tags"], diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 8b58e8690..c2d7141aa 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -59,10 +59,7 @@ _TD = TypeVar("_TD", bound=Mapping[str, Any]) - -# TODO: Work on where these should live/be accessed -_NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" -_SUB_DIR = "data" +_DATA = "data" def is_ext_supported(suffix: str) -> TypeIs[Extension]: @@ -152,7 +149,7 @@ def trees(self, tag: str | ParsedTag, /) -> GitHubTreesResponse: url = tag["trees_url"] with self._gh._opener.open(self._request(url)) as response: content: GitHubTreesResponse = 
json.load(response) - query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) + query = (tree["url"] for tree in content["tree"] if tree["path"] == _DATA) if data_url := next(query, None): with self._gh._opener.open(self._request(data_url)) as response: data_dir: GitHubTreesResponse = json.load(response) @@ -237,12 +234,13 @@ def tag_from_str(self, s: str, /) -> str: # - Trees url (using ref name) # - npm url (works w/o the `v` prefix) trees_url = self.url.TREES + npm_url = self._gh._npm_cdn_url if s.startswith("v"): return s elif s.startswith(trees_url): return s.replace(trees_url, "") - elif s.startswith(_NPM_BASE_URL): - s, _ = s.replace(_NPM_BASE_URL, "").split("/") + elif s.startswith(npm_url): + s, _ = s.replace(npm_url, "").split("/") return s if s.startswith("v") else f"v{s}" else: raise TypeError(s) @@ -275,6 +273,7 @@ def __init__( name_tags: str, name_trees: str, *, + npm_cdn_url: LiteralString, base_url: LiteralString = "https://api.github.com/", org: LiteralString = "vega", package: LiteralString = "vega-datasets", @@ -295,6 +294,7 @@ def __init__( TAGS=f"{repo}tags", TREES=f"{repo}git/trees/", ) + self._npm_cdn_url: LiteralString = npm_cdn_url @property def req(self) -> _GitHubRequestNamespace: @@ -331,9 +331,9 @@ def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: .with_columns(name_collision=pl.col("dataset_name").is_duplicated()) .with_columns( url_npm=pl.concat_str( - pl.lit(_NPM_BASE_URL), + pl.lit(self._npm_cdn_url), pl.col("tag"), - pl.lit(f"/{_SUB_DIR}/"), + pl.lit(f"/{_DATA}/"), pl.col("file_name"), ) ) @@ -345,7 +345,7 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: """ Use known tags to discover and update missing trees metadata. - Aims to stay well-within API rate limits, both for authenticated ad unauthenticated users. + Aims to stay well-within API rate limits, both for authenticated and unauthenticated users. 
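As an aside on the ``url_npm`` column assembled in this hunk: it is just the CDN root, the release tag, the ``data/`` directory and the file name concatenated. A minimal sketch of the same ``pl.concat_str`` construction, using a hypothetical one-row frame in place of the real ``trees`` output and assuming the jsDelivr root passed in as ``npm_cdn_url``:

import polars as pl

# Stand-in for one row of the frame built in ``GitHub.trees`` (not part of the patch).
files = pl.DataFrame({"tag": ["v2.9.0"], "file_name": ["cars.json"]})
# Assumed CDN root, matching the value handed over as ``npm_cdn_url``.
NPM_CDN_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@"

urls = files.with_columns(
    url_npm=pl.concat_str(
        pl.lit(NPM_CDN_URL), pl.col("tag"), pl.lit("/data/"), pl.col("file_name")
    )
)
print(urls.item(0, "url_npm"))
# https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json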
""" if gh_tags.is_empty(): msg = f"Expected rows present in `gh_tags`, but got:\n{gh_tags!r}" From 69a619caeaa803599dfc080ed7b3b34f0ca10386 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 18:55:42 +0000 Subject: [PATCH 059/201] refactor: Rename `ext` -> `suffix` --- altair/datasets/__init__.py | 8 ++++---- altair/datasets/_readers.py | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 15c8069f9..0db434979 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -30,23 +30,23 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): def url( self, name: DatasetName | LiteralString, - ext: Extension | None = None, + suffix: Extension | None = None, /, tag: VersionTag | Literal["latest"] | None = None, ) -> str: """Return the address of a remote dataset.""" - return self._reader.url(name, ext, tag=tag) + return self._reader.url(name, suffix, tag=tag) def __call__( self, name: DatasetName | LiteralString, - ext: Extension | None = None, + suffix: Extension | None = None, /, tag: VersionTag | Literal["latest"] | None = None, **kwds: Any, ) -> IntoDataFrameT: """Get a remote dataset and load as tabular data.""" - return self._reader.dataset(name, ext, tag=tag, **kwds) + return self._reader.dataset(name, suffix, tag=tag, **kwds) def __repr__(self) -> str: return f"{type(self).__name__}[{type(self._reader).__name__}]" diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index cbb02cd00..cebbe1526 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -109,11 +109,11 @@ def scanner_from(self, source: StrPath, /) -> Callable[..., IntoFrameT]: def url( self, name: DatasetName | LiteralString, - ext: Extension | None = None, + suffix: Extension | None = None, /, tag: VersionTag | Literal["latest"] | None = None, ) -> str: - df = self._query(**validate_constraints(name, ext, tag)) + df = self._query(**validate_constraints(name, suffix, tag)) url = df.item(0, "url_npm") if isinstance(url, str): return url @@ -124,7 +124,7 @@ def url( def dataset( self, name: DatasetName | LiteralString, - ext: Extension | None = None, + suffix: Extension | None = None, /, tag: VersionTag | Literal["latest"] | None = None, **kwds: Any, @@ -134,12 +134,12 @@ def dataset( Parameters ---------- - name, ext, tag + name, suffix, tag TODO **kwds Arguments passed to the underlying read function. """ - df = self._query(**validate_constraints(name, ext, tag)) + df = self._query(**validate_constraints(name, suffix, tag)) it = islice(df.iter_rows(named=True), 1) result = cast("Metadata", next(it)) url = result["url_npm"] @@ -314,7 +314,7 @@ def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw. 
def validate_constraints( name: DatasetName | LiteralString, - ext: Extension | None, + suffix: Extension | None, tag: VersionTag | Literal["latest"] | None, /, ) -> Metadata: @@ -328,11 +328,11 @@ def validate_constraints( constraints["dataset_name"] = fp.stem constraints["suffix"] = fp.suffix return constraints - elif ext is not None: - if not is_ext_supported(ext): - raise TypeError(ext) + elif suffix is not None: + if not is_ext_supported(suffix): + raise TypeError(suffix) else: - constraints["suffix"] = ext + constraints["suffix"] = suffix constraints["dataset_name"] = name return constraints From a259b1070e1a2dcd356992bc6fd95982cf6b9ef6 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 18:57:31 +0000 Subject: [PATCH 060/201] refactor: Remove unimplemented `tag="latest"` Since `metadata.parquet` is sorted, this was already the behavior when not providing a tag --- altair/datasets/__init__.py | 4 ++-- altair/datasets/_readers.py | 10 ++++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 0db434979..c2ccee2fe 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -32,7 +32,7 @@ def url( name: DatasetName | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | Literal["latest"] | None = None, + tag: VersionTag | None = None, ) -> str: """Return the address of a remote dataset.""" return self._reader.url(name, suffix, tag=tag) @@ -42,7 +42,7 @@ def __call__( name: DatasetName | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | Literal["latest"] | None = None, + tag: VersionTag | None = None, **kwds: Any, ) -> IntoDataFrameT: """Get a remote dataset and load as tabular data.""" diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index cebbe1526..b344bd67a 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -111,7 +111,7 @@ def url( name: DatasetName | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | Literal["latest"] | None = None, + tag: VersionTag | None = None, ) -> str: df = self._query(**validate_constraints(name, suffix, tag)) url = df.item(0, "url_npm") @@ -126,7 +126,7 @@ def dataset( name: DatasetName | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | Literal["latest"] | None = None, + tag: VersionTag | None = None, **kwds: Any, ) -> IntoDataFrameT: """ @@ -315,13 +315,11 @@ def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw. 
def validate_constraints( name: DatasetName | LiteralString, suffix: Extension | None, - tag: VersionTag | Literal["latest"] | None, + tag: VersionTag | None, /, ) -> Metadata: constraints: Metadata = {} - if tag == "latest": - raise NotImplementedError(tag) - elif tag is not None: + if tag is not None: constraints["tag"] = tag if name.endswith((".csv", ".json", ".tsv", ".arrow")): fp = Path(name) From 88968c8bf188f5c6817fac2edf3c0b8a44602ec3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 11:56:01 +0000 Subject: [PATCH 061/201] feat: Rename `_datasets_dir`, make configurable, add docs Still on the fence about `Loader.cache_dir` vs `Loader.cache` --- altair/datasets/__init__.py | 31 +++++++++++++++++++++++++++++++ altair/datasets/_readers.py | 10 +++++----- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index c2ccee2fe..c89163a48 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -8,11 +8,13 @@ if TYPE_CHECKING: import sys + from pathlib import Path from typing import Any, Literal import pandas as pd import polars as pl import pyarrow as pa + from _typeshed import StrPath if sys.version_info >= (3, 11): from typing import LiteralString @@ -99,6 +101,35 @@ def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: obj._reader = get_backend(backend) return obj + @property + def cache_dir(self) -> Path | None: + """ + Returns path to datasets cache. + + By default, this can be configured using the environment variable: + + "ALTAIR_DATASETS_DIR" + + You *may* also set this directly, but the value will **not** persist between sessions: + + from pathlib import Path + + from altair.datasets import Loader + + data = Loader.with_backend("polars") + data.cache_dir = Path.home() / ".altair_cache" + + data.cache_dir.relative_to(Path.home()).as_posix() + '.altair_cache' + """ + return self._reader._cache + + @cache_dir.setter + def cache_dir(self, source: StrPath, /) -> None: + import os + + os.environ[self._reader._ENV_VAR] = str(source) + def __getattr__(name): if name == "data": diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index b344bd67a..673e2e6d1 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -85,7 +85,7 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" @property - def _datasets_dir(self) -> Path | None: # type: ignore[return] + def _cache(self) -> Path | None: # type: ignore[return] """ Returns path to datasets cache, if possible. 
@@ -94,9 +94,9 @@ def _datasets_dir(self) -> Path | None: # type: ignore[return] Reader._ENV_VAR """ if _dir := os.environ.get(self._ENV_VAR): - datasets_dir = Path(_dir) - datasets_dir.mkdir(exist_ok=True) - return datasets_dir + cache_dir = Path(_dir) + cache_dir.mkdir(exist_ok=True) + return cache_dir def reader_from(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: suffix = validate_suffix(source, is_ext_supported) @@ -145,7 +145,7 @@ def dataset( url = result["url_npm"] fn = self.reader_from(url) - if cache := self._datasets_dir: + if cache := self._cache: fp = cache / (result["sha"] + result["suffix"]) if fp.exists(): return fn(fp, **kwds) From b98730887d0392ac0a2fbb5d226f5013862201c3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 12:13:25 +0000 Subject: [PATCH 062/201] docs: Adds examples to `Loader.with_backend` --- altair/datasets/__init__.py | 49 ++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index c89163a48..4bcf768b6 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -27,6 +27,13 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): + """ + Load examples **remotely** from `vega-datasets`_, with *optional* caching. + + .. _vega-datasets: + https://github.com/vega/vega-datasets + """ + _reader: _Reader[IntoDataFrameT, IntoFrameT] def url( @@ -74,7 +81,7 @@ def with_backend( @classmethod def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: """ - Initialize a new loader, using the specified backend. + Initialize a new loader, with the specified backend. Parameters ---------- @@ -96,6 +103,46 @@ def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: https://pandas.pydata.org/docs/reference/io.html .. 
_JSON format not supported: https://arrow.apache.org/docs/python/json.html#reading-json-files + + Examples + -------- + Using ``polars``: + + from altair.datasets import Loader + + data = Loader.with_backend("polars") + cars = data("cars") + + type(cars) + polars.dataframe.frame.DataFrame + + Using ``pandas``: + + data = Loader.with_backend("pandas") + cars = data("cars") + + type(cars) + pandas.core.frame.DataFrame + + Using ``pandas``, backed by ``pyarrow`` dtypes: + + data = Loader.with_backend("pandas[pyarrow]") + cars = data("cars", tag="v1.29.0") + + type(cars) + pandas.core.frame.DataFrame + + cars.dtypes + Name string[pyarrow] + Miles_per_Gallon double[pyarrow] + Cylinders int64[pyarrow] + Displacement double[pyarrow] + Horsepower int64[pyarrow] + Weight_in_lbs int64[pyarrow] + Acceleration double[pyarrow] + Year string[pyarrow] + Origin string[pyarrow] + dtype: object """ obj = Loader.__new__(Loader) obj._reader = get_backend(backend) From 4a2a2e068f85d118244ceda09350cf3690781227 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 14:59:40 +0000 Subject: [PATCH 063/201] refactor: Clean up requirements -> imports --- altair/datasets/_readers.py | 100 ++++++++++++++++++++++++++---------- 1 file changed, 72 insertions(+), 28 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 673e2e6d1..78ee784a6 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -63,9 +63,14 @@ _ExtensionScan: TypeAlias = Literal[".parquet"] _T = TypeVar("_T") - _Backend: TypeAlias = Literal[ - "polars", "pandas", "pandas[pyarrow]", "polars[pyarrow]", "pyarrow" - ] + + _Polars: TypeAlias = Literal["polars"] + _Pandas: TypeAlias = Literal["pandas"] + _PyArrow: TypeAlias = Literal["pyarrow"] + _ConcreteT = TypeVar("_ConcreteT", _Polars, _Pandas, _PyArrow) + _PolarsAny: TypeAlias = Literal[_Polars, "polars[pyarrow]"] + _PandasAny: TypeAlias = Literal[_Pandas, "pandas[pyarrow]"] + _Backend: TypeAlias = Literal[_PolarsAny, _PandasAny, _PyArrow] __all__ = ["get_backend"] @@ -80,6 +85,7 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): _read_fn: dict[Extension, Callable[..., IntoDataFrameT]] _scan_fn: dict[_ExtensionScan, Callable[..., IntoFrameT]] + _name: LiteralString _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" @@ -193,11 +199,16 @@ def _import(self, name: str, /) -> Any: msg = f"{type(self).__name__!r} requires missing dependency {name!r}." raise ModuleNotFoundError(msg, name=name) - def __init__(self, *specs: str) -> None: ... + def __repr__(self) -> str: + return f"Reader[{self._name}]" + + def __init__(self, name: LiteralString, /) -> None: ... 
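Before the concrete readers that follow, a short sketch (not part of the patch) of the suffix-to-reader dispatch they populate: each backend fills ``_read_fn`` with plain read functions, and ``.tsv`` is simply ``read_csv`` with a pre-bound separator via ``functools.partial``.

from functools import partial
from pathlib import Path

import pandas as pd

# Illustrative mapping only; the real ones live on the ``_Reader`` subclasses.
read_fn = {
    ".csv": pd.read_csv,
    ".tsv": partial(pd.read_csv, sep="\t"),
    ".json": pd.read_json,
}

def read(url: str) -> pd.DataFrame:
    # Dispatch on the file extension, mirroring ``_Reader.read_fn``.
    return read_fn[Path(url).suffix](url)

# read("https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json")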
class _PandasPyArrowReader(_Reader["pd.DataFrame", "pd.DataFrame"]): - def __init__(self, _pd: str, _pa: str, /) -> None: + def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: + _pd, _pa = _requirements(name) + self._name = name if not TYPE_CHECKING: pd = self._import(_pd) pa = self._import(_pa) # noqa: F841 @@ -219,9 +230,10 @@ def __init__(self, _pd: str, _pa: str, /) -> None: class _PandasReader(_Reader["pd.DataFrame", "pd.DataFrame"]): - def __init__(self, _pd: str, /) -> None: + def __init__(self, name: _Pandas, /) -> None: + self._name = _requirements(name) if not TYPE_CHECKING: - pd = self._import(_pd) + pd = self._import(self._name) self._read_fn = { ".csv": pd.read_csv, ".json": pd.read_json, @@ -232,9 +244,10 @@ def __init__(self, _pd: str, /) -> None: class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - def __init__(self, _pl: str, /) -> None: + def __init__(self, name: _Polars, /) -> None: + self._name = _requirements(name) if not TYPE_CHECKING: - pl = self._import(_pl) + pl = self._import(self._name) self._read_fn = { ".csv": pl.read_csv, ".json": pl.read_json, @@ -245,7 +258,9 @@ def __init__(self, _pl: str, /) -> None: class _PolarsPyArrowReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - def __init__(self, _pl: str, _pa: str, /) -> None: + def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: + _pl, _pa = _requirements(name) + self._name = name if not TYPE_CHECKING: pl = self._import(_pl) pa = self._import(_pa) # noqa: F841 @@ -275,13 +290,14 @@ class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): https://arrow.apache.org/docs/python/json.html#reading-json-files """ - def __init__(self, _pa: str, /) -> None: + def __init__(self, name: _PyArrow, /) -> None: + self._name = _requirements(name) if not TYPE_CHECKING: - pa = self._import(_pa) # noqa: F841 - pa_csv = self._import(f"{_pa}.csv") - pa_feather = self._import(f"{_pa}.feather") - pa_json = self._import(f"{_pa}.json") - pa_parquet = self._import(f"{_pa}.parquet") + pa = self._import(self._name) # noqa: F841 + pa_csv = self._import(f"{self._name}.csv") + pa_feather = self._import(f"{self._name}.feather") + pa_json = self._import(f"{self._name}.json") + pa_parquet = self._import(f"{self._name}.parquet") pa_read_csv = pa_csv.read_csv pa_read_feather = pa_feather.read_table @@ -353,34 +369,62 @@ def is_ext_supported(suffix: Any) -> TypeIs[Extension]: @overload -def get_backend( - backend: Literal["polars", "polars[pyarrow]"], / -) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... +def get_backend(backend: _PolarsAny, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... @overload -def get_backend( - backend: Literal["pandas", "pandas[pyarrow]"], / -) -> _Reader[pd.DataFrame, pd.DataFrame]: ... +def get_backend(backend: _PandasAny, /) -> _Reader[pd.DataFrame, pd.DataFrame]: ... @overload -def get_backend(backend: Literal["pyarrow"], /) -> _Reader[pa.Table, pa.Table]: ... +def get_backend(backend: _PyArrow, /) -> _Reader[pa.Table, pa.Table]: ... 
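The readers above defer their requirement parsing to ``_requirements(name)``, defined later in this diff, which leans on ``packaging.requirements.Requirement`` to split a spec like ``"pandas[pyarrow]"`` into a base package plus an extra. A quick standalone illustration:

from packaging.requirements import Requirement

req = Requirement("pandas[pyarrow]")
print(req.name)    # pandas
print(req.extras)  # {'pyarrow'}

# A bare backend name has no extras to resolve.
print(Requirement("polars").extras)  # set()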
def get_backend(backend: _Backend, /) -> _Reader[Any, Any]: if backend == "polars": - return _PolarsReader("polars") + return _PolarsReader(backend) elif backend == "polars[pyarrow]": - return _PolarsPyArrowReader("polars", "pyarrow") + return _PolarsPyArrowReader(backend) elif backend == "pandas[pyarrow]": - return _PandasPyArrowReader("pandas", "pyarrow") + return _PandasPyArrowReader(backend) elif backend == "pandas": - return _PandasReader("pandas") + return _PandasReader(backend) elif backend == "pyarrow": - return _PyArrowReader("pyarrow") + return _PyArrowReader(backend) elif backend in {"ibis", "cudf", "dask", "modin"}: msg = "Supported by ``narwhals``, not investigated yet" raise NotImplementedError(msg) else: raise TypeError(backend) + + +@overload +def _requirements(s: _ConcreteT, /) -> _ConcreteT: ... + + +@overload +def _requirements(s: Literal["pandas[pyarrow]"], /) -> tuple[_Pandas, _PyArrow]: ... + + +@overload +def _requirements(s: Literal["polars[pyarrow]"], /) -> tuple[_Polars, _PyArrow]: ... + + +def _requirements(s: _Backend, /): + concrete: set[Literal[_Polars, _Pandas, _PyArrow]] = {"polars", "pandas", "pyarrow"} + if s in concrete: + return s + else: + from packaging.requirements import Requirement + + req = Requirement(s) + supports_extras: set[Literal[_Polars, _Pandas]] = {"polars", "pandas"} + if req.name in supports_extras: + name = req.name + if (extras := req.extras) and extras == {"pyarrow"}: + extra = "pyarrow" + return name, extra + else: + raise NotImplementedError(s) + else: + raise NotImplementedError(s) From e6dd27e6fb680b965e7d698a636d47a389c3e7df Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 15:03:17 +0000 Subject: [PATCH 064/201] docs: Add basic example to `Loader` class Also incorporates changes from previous commit into `__repr__` 4a2a2e068f85d118244ceda09350cf3690781227 --- altair/datasets/__init__.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 4bcf768b6..6d7a922d3 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -30,12 +30,20 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): """ Load examples **remotely** from `vega-datasets`_, with *optional* caching. + A new ``Loader`` must be initialized by specifying a backend: + + from altair.datasets import Loader + + data = Loader.with_backend("polars") + Loader[polars] + .. 
_vega-datasets: https://github.com/vega/vega-datasets """ _reader: _Reader[IntoDataFrameT, IntoFrameT] + # TODO: docs (parameters, examples) def url( self, name: DatasetName | LiteralString, @@ -46,6 +54,7 @@ def url( """Return the address of a remote dataset.""" return self._reader.url(name, suffix, tag=tag) + # TODO: docs (parameters, examples) def __call__( self, name: DatasetName | LiteralString, @@ -58,7 +67,7 @@ def __call__( return self._reader.dataset(name, suffix, tag=tag, **kwds) def __repr__(self) -> str: - return f"{type(self).__name__}[{type(self._reader).__name__}]" + return f"{type(self).__name__}[{self._reader._name}]" @overload @classmethod From 2a7bc4f5bbcfea11e416453fa00abbee11ad8c5b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 15:50:51 +0000 Subject: [PATCH 065/201] refactor: Reorder `alt.datasets` module --- altair/datasets/__init__.py | 52 ++++++++++----------- altair/datasets/_readers.py | 92 ++++++++++++++++++------------------- 2 files changed, 72 insertions(+), 72 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 6d7a922d3..260258882 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -43,32 +43,6 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): _reader: _Reader[IntoDataFrameT, IntoFrameT] - # TODO: docs (parameters, examples) - def url( - self, - name: DatasetName | LiteralString, - suffix: Extension | None = None, - /, - tag: VersionTag | None = None, - ) -> str: - """Return the address of a remote dataset.""" - return self._reader.url(name, suffix, tag=tag) - - # TODO: docs (parameters, examples) - def __call__( - self, - name: DatasetName | LiteralString, - suffix: Extension | None = None, - /, - tag: VersionTag | None = None, - **kwds: Any, - ) -> IntoDataFrameT: - """Get a remote dataset and load as tabular data.""" - return self._reader.dataset(name, suffix, tag=tag, **kwds) - - def __repr__(self) -> str: - return f"{type(self).__name__}[{self._reader._name}]" - @overload @classmethod def with_backend( @@ -157,6 +131,29 @@ def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: obj._reader = get_backend(backend) return obj + # TODO: docs (parameters, examples) + def __call__( + self, + name: DatasetName | LiteralString, + suffix: Extension | None = None, + /, + tag: VersionTag | None = None, + **kwds: Any, + ) -> IntoDataFrameT: + """Get a remote dataset and load as tabular data.""" + return self._reader.dataset(name, suffix, tag=tag, **kwds) + + # TODO: docs (parameters, examples) + def url( + self, + name: DatasetName | LiteralString, + suffix: Extension | None = None, + /, + tag: VersionTag | None = None, + ) -> str: + """Return the address of a remote dataset.""" + return self._reader.url(name, suffix, tag=tag) + @property def cache_dir(self) -> Path | None: """ @@ -186,6 +183,9 @@ def cache_dir(self, source: StrPath, /) -> None: os.environ[self._reader._ENV_VAR] = str(source) + def __repr__(self) -> str: + return f"{type(self).__name__}[{self._reader._name}]" + def __getattr__(name): if name == "data": diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 78ee784a6..53a18b2d6 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -86,24 +86,10 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): _read_fn: dict[Extension, Callable[..., IntoDataFrameT]] _scan_fn: dict[_ExtensionScan, Callable[..., IntoFrameT]] _name: LiteralString - _opener: 
ClassVar[OpenerDirector] = urllib.request.build_opener() _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" + _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" - @property - def _cache(self) -> Path | None: # type: ignore[return] - """ - Returns path to datasets cache, if possible. - - Requires opt-in via environment variable:: - - Reader._ENV_VAR - """ - if _dir := os.environ.get(self._ENV_VAR): - cache_dir = Path(_dir) - cache_dir.mkdir(exist_ok=True) - return cache_dir - def reader_from(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: suffix = validate_suffix(source, is_ext_supported) return self._read_fn[suffix] @@ -112,21 +98,6 @@ def scanner_from(self, source: StrPath, /) -> Callable[..., IntoFrameT]: suffix = validate_suffix(source, is_ext_scan) return self._scan_fn[suffix] - def url( - self, - name: DatasetName | LiteralString, - suffix: Extension | None = None, - /, - tag: VersionTag | None = None, - ) -> str: - df = self._query(**validate_constraints(name, suffix, tag)) - url = df.item(0, "url_npm") - if isinstance(url, str): - return url - else: - msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." - raise TypeError(msg) - def dataset( self, name: DatasetName | LiteralString, @@ -145,7 +116,7 @@ def dataset( **kwds Arguments passed to the underlying read function. """ - df = self._query(**validate_constraints(name, suffix, tag)) + df = self.query(**validate_constraints(name, suffix, tag)) it = islice(df.iter_rows(named=True), 1) result = cast("Metadata", next(it)) url = result["url_npm"] @@ -164,7 +135,22 @@ def dataset( with self._opener.open(url) as f: return fn(f.read(), **kwds) - def _query( + def url( + self, + name: DatasetName | LiteralString, + suffix: Extension | None = None, + /, + tag: VersionTag | None = None, + ) -> str: + df = self.query(**validate_constraints(name, suffix, tag)) + url = df.item(0, "url_npm") + if isinstance(url, str): + return url + else: + msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." + raise TypeError(msg) + + def query( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] ) -> nw.DataFrame[IntoDataFrameT]: r""" @@ -192,6 +178,20 @@ def _query( msg = f"Found no results for:\n{terms}" raise NotImplementedError(msg) + @property + def _cache(self) -> Path | None: # type: ignore[return] + """ + Returns path to datasets cache, if possible. + + Requires opt-in via environment variable:: + + Reader._ENV_VAR + """ + if _dir := os.environ.get(self._ENV_VAR): + cache_dir = Path(_dir) + cache_dir.mkdir(exist_ok=True) + return cache_dir + def _import(self, name: str, /) -> Any: if spec := find_spec(name): return import_module(spec.name) @@ -205,6 +205,20 @@ def __repr__(self) -> str: def __init__(self, name: LiteralString, /) -> None: ... 
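A sketch of the ``query`` path above, outside the patch: scan the metadata file, wrap it with ``narwhals``, filter on the constraint columns, then collect. The local file path here is an assumption; the column names follow ``metadata-schema.json``.

import polars as pl
from narwhals.stable import v1 as nw

# Assumes a local copy of ``metadata.parquet`` with ``dataset_name``/``suffix``/``url_npm`` columns.
frame = nw.from_native(pl.scan_parquet("metadata.parquet"))
result = frame.filter(
    (nw.col("dataset_name") == "cars") & (nw.col("suffix") == ".json")
)
df = result.collect()  # ``query`` collects only when the backend scan is lazy
print(df.get_column("url_npm").to_list())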
+class _PandasReader(_Reader["pd.DataFrame", "pd.DataFrame"]): + def __init__(self, name: _Pandas, /) -> None: + self._name = _requirements(name) + if not TYPE_CHECKING: + pd = self._import(self._name) + self._read_fn = { + ".csv": pd.read_csv, + ".json": pd.read_json, + ".tsv": cast(partial["pd.DataFrame"], partial(pd.read_csv, sep="\t")), + ".arrow": pd.read_feather, + } + self._scan_fn = {".parquet": pd.read_parquet} + + class _PandasPyArrowReader(_Reader["pd.DataFrame", "pd.DataFrame"]): def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: _pd, _pa = _requirements(name) @@ -229,20 +243,6 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend="pyarrow")} -class _PandasReader(_Reader["pd.DataFrame", "pd.DataFrame"]): - def __init__(self, name: _Pandas, /) -> None: - self._name = _requirements(name) - if not TYPE_CHECKING: - pd = self._import(self._name) - self._read_fn = { - ".csv": pd.read_csv, - ".json": pd.read_json, - ".tsv": cast(partial["pd.DataFrame"], partial(pd.read_csv, sep="\t")), - ".arrow": pd.read_feather, - } - self._scan_fn = {".parquet": pd.read_parquet} - - class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): def __init__(self, name: _Polars, /) -> None: self._name = _requirements(name) From c572180ebc7d876714a38688c53f7e4af87abd93 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 16:59:10 +0000 Subject: [PATCH 066/201] docs: Fill out `Loader.url` --- altair/datasets/__init__.py | 40 +++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 260258882..b7f87bdaa 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -143,7 +143,6 @@ def __call__( """Get a remote dataset and load as tabular data.""" return self._reader.dataset(name, suffix, tag=tag, **kwds) - # TODO: docs (parameters, examples) def url( self, name: DatasetName | LiteralString, @@ -151,7 +150,44 @@ def url( /, tag: VersionTag | None = None, ) -> str: - """Return the address of a remote dataset.""" + """ + Return the address of a remote dataset. + + Parameters + ---------- + name + Name of the dataset/`stem`_ of filename. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + tag + `vega-datasets release`_ version. + + .. _stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. 
_vega-datasets release: + https://github.com/vega/vega-datasets/releases + + Examples + -------- + The returned url will always point to an accessible dataset: + + import altair as alt + from altair.datasets import Loader + + data = Loader.with_backend("polars") + data.url("cars", tag="v2.9.0") + 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json' + + We can pass the result directly to a chart: + + url = data.url("cars", tag="v2.9.0") + alt.Chart(url).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q") + """ return self._reader.url(name, suffix, tag=tag) @property From 9ab9463007a8509c25cc69665ba995f42e84792d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 18:06:03 +0000 Subject: [PATCH 067/201] feat: Adds `_Reader._read_metadata` --- altair/datasets/_readers.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 53a18b2d6..ea8d7088c 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -166,7 +166,7 @@ def query( """ source = self._metadata fn = self.scanner_from(source) - frame = nw.from_native(fn(source), pass_through=False) + frame = nw.from_native(fn(source)) result = frame.filter(_filter_reduce(predicates, constraints)) df: nw.DataFrame[Any] = ( result.collect() if isinstance(result, nw.LazyFrame) else result @@ -178,6 +178,19 @@ def query( msg = f"Found no results for:\n{terms}" raise NotImplementedError(msg) + def _read_metadata(self) -> IntoDataFrameT: + """ + Return the full contents of ``metadata.parquet``. + + Effectively an eager read, no filters. + """ + fn = self.scanner_from(self._metadata) + frame = nw.from_native(fn(self._metadata)) + df: nw.DataFrame[Any] = ( + frame.collect() if isinstance(frame, nw.LazyFrame) else frame + ) + return df.to_native() + @property def _cache(self) -> Path | None: # type: ignore[return] """ From dd3edd66e2eb38be3c73f0ad0411e738f2f81495 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 20:43:53 +0000 Subject: [PATCH 068/201] refactor: Rename `(reader|scanner_from()` -> `(read|scan)_fn()` --- altair/datasets/_readers.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index ea8d7088c..afa1d2f54 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -90,11 +90,11 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" - def reader_from(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: - suffix = validate_suffix(source, is_ext_supported) + def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: + suffix = validate_suffix(source, is_ext_read) return self._read_fn[suffix] - def scanner_from(self, source: StrPath, /) -> Callable[..., IntoFrameT]: + def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: suffix = validate_suffix(source, is_ext_scan) return self._scan_fn[suffix] @@ -120,7 +120,7 @@ def dataset( it = islice(df.iter_rows(named=True), 1) result = cast("Metadata", next(it)) url = result["url_npm"] - fn = self.reader_from(url) + fn = self.read_fn(url) if cache := self._cache: fp = cache / (result["sha"] + result["suffix"]) @@ -165,7 +165,7 @@ def query( 
https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html """ source = self._metadata - fn = self.scanner_from(source) + fn = self.scan_fn(source) frame = nw.from_native(fn(source)) result = frame.filter(_filter_reduce(predicates, constraints)) df: nw.DataFrame[Any] = ( @@ -184,7 +184,7 @@ def _read_metadata(self) -> IntoDataFrameT: Effectively an eager read, no filters. """ - fn = self.scanner_from(self._metadata) + fn = self.scan_fn(self._metadata) frame = nw.from_native(fn(self._metadata)) df: nw.DataFrame[Any] = ( frame.collect() if isinstance(frame, nw.LazyFrame) else frame @@ -356,7 +356,7 @@ def validate_constraints( constraints["suffix"] = fp.suffix return constraints elif suffix is not None: - if not is_ext_supported(suffix): + if not is_ext_read(suffix): raise TypeError(suffix) else: constraints["suffix"] = suffix @@ -377,7 +377,7 @@ def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]: return suffix == ".parquet" -def is_ext_supported(suffix: Any) -> TypeIs[Extension]: +def is_ext_read(suffix: Any) -> TypeIs[Extension]: return suffix in {".csv", ".json", ".tsv", ".arrow"} From 146cb50c60d0839cf56552b00472f768ec58001c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 21:29:44 +0000 Subject: [PATCH 069/201] refactor(typing): Replace some explicit casts --- altair/datasets/_readers.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index afa1d2f54..78e330047 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -226,7 +226,7 @@ def __init__(self, name: _Pandas, /) -> None: self._read_fn = { ".csv": pd.read_csv, ".json": pd.read_json, - ".tsv": cast(partial["pd.DataFrame"], partial(pd.read_csv, sep="\t")), + ".tsv": partial["pd.DataFrame"](pd.read_csv, sep="\t"), ".arrow": pd.read_feather, } self._scan_fn = {".parquet": pd.read_parquet} @@ -241,19 +241,12 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: pa = self._import(_pa) # noqa: F841 self._read_fn = { - ".csv": cast( - partial["pd.DataFrame"], partial(pd.read_csv, dtype_backend="pyarrow") - ), - ".json": cast( - partial["pd.DataFrame"], partial(pd.read_json, dtype_backend="pyarrow") - ), - ".tsv": cast( - partial["pd.DataFrame"], - partial(pd.read_csv, sep="\t", dtype_backend="pyarrow"), - ), - ".arrow": partial(pd.read_feather, dtype_backend="pyarrow"), + ".csv": partial["pd.DataFrame"](pd.read_csv, dtype_backend=_pa), + ".json": partial["pd.DataFrame"](pd.read_json, dtype_backend=_pa), + ".tsv": partial["pd.DataFrame"](pd.read_csv, sep="\t", dtype_backend=_pa), + ".arrow": partial(pd.read_feather, dtype_backend=_pa), } - self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend="pyarrow")} + self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend=_pa)} class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): From 94ad0d1b879f43359dbead2b796db540531a2504 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 12:51:27 +0000 Subject: [PATCH 070/201] refactor: Shorten and document request delays --- tools/datasets/github.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index c2d7141aa..2d0d16fca 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -106,8 +106,10 @@ class 
_GitHubRequestNamespace: _UNAUTH_RATE_LIMIT: Literal[60] = 60 _TAGS_COST: Literal[1] = 1 _TREES_COST: Literal[2] = 2 - _UNAUTH_DELAY: Literal[5] = 5 - _AUTH_DELAY: Literal[1] = 1 + _UNAUTH_DELAY: Literal[5_000] = 5_000 + """**ms** delay added between **unauthenticated** ``trees`` requests.""" + _AUTH_DELAY: Literal[500] = 500 + """**ms** delay added between **authenticated** ``trees`` requests.""" _UNAUTH_TREES_LIMIT: Literal[10] = 10 def __init__(self, gh: GitHub, /) -> None: @@ -123,6 +125,10 @@ def rate_limit(self) -> GitHubRateLimitResources: content: GitHubRateLimitResources = json.load(response)["resources"] return content + def delay(self, *, is_auth: bool) -> float: + ms = self._AUTH_DELAY if is_auth else self._UNAUTH_DELAY + return (ms + random.triangular()) / 1_000 + def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" if n < 1 or n > self._TAGS_MAX_PAGE: @@ -314,6 +320,11 @@ def rate_limit(self, *, strict: bool = False) -> ParsedRateLimit: raise NotImplementedError(limit) return limit + def delay(self, rate_limit: ParsedRateLimit | None = None, /) -> float: + """Return a delay time in seconds, corresponding with authentication status.""" + limit = rate_limit or self.rate_limit(strict=True) + return self.req.delay(is_auth=limit["is_auth"]) + def tags( self, n_head: int | None = None, *, warn_lower: bool = False ) -> pl.DataFrame: @@ -412,14 +423,13 @@ def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: cost = req._TREES_COST * n if rate_limit["remaining"] < cost: raise NotImplementedError(rate_limit, cost) - delay_secs = req._AUTH_DELAY if rate_limit["is_auth"] else req._UNAUTH_DELAY print( f"Collecting metadata for {n} missing releases.\n" - f"Using {delay_secs=} between requests ..." + f"Using {self.delay(rate_limit)}[ms] between requests ..." 
) dfs: list[pl.DataFrame] = [] for tag in tags: - time.sleep(delay_secs + random.triangular()) + time.sleep(self.delay(rate_limit)) dfs.append(self.trees(tag)) df = pl.concat(dfs) print(f"Finished collection.\n" f"Found {df.height} new rows") From 409338397ebb9ff2ec7abb146394f17702762b08 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 17:26:01 +0000 Subject: [PATCH 071/201] feat(DRAFT): Make `[tag]` a `pl.Enum` --- altair/datasets/_metadata/metadata.parquet | Bin 18495 -> 18641 bytes tools/datasets/__init__.py | 6 +++++ tools/datasets/github.py | 17 ++++++++++--- tools/datasets/semver.py | 28 ++++++++++++++++++--- 4 files changed, 44 insertions(+), 7 deletions(-) diff --git a/altair/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet index 8bf0e17e3673d2b7cfbbe1ddba345f492d12e674..5e7b3bd06439ace1d5eb8387efecde322cca91d7 100644 GIT binary patch delta 725 zcmdl#f$`!*#trL~>c24hh=zzVFfeFr)&HAtm7$fH;gbuWz=qc)p%RZu@A{rHk$(Ll zgLz$*ldAAX^S8QgF~)a3?RlNFaBlbpHYyy|qy~%HsrG>;kfK|T&s{mGC;j*9AbCgGm3!xAi?7p6yzUnRh*oWnwu!m!l4fa*^?K$I4e1O zI;J~120D5=ItG+GIXXHU06C7Sj;=r^i0@K7nZq@{J{P1V*AWOj%YlSbK8P6!BAkHQ zfux(0r=xp1P#ci%4mPeFWJahXh=dZB0YF14K>8hhL4-4y4g}F)MM0saVIe@H9MTNQo%?5#hETE|tFav@90J_G}#R%kJ aFbBj>2l+2>^K92SOdRGM3=9E|L52YRdh{Ux delta 516 zcmcaOk#YY7#trL~>OV00h=zzVFfeFr)&HAtm7$cGLCBR)V8iQ@P>DyScYRNpNWcD& z!Mx5pQZquZmnD1Qmf`LI%bA`Z;$0qC> zM3)%~2rwER%bIGq;Y$4XYcF)4v#r)kF*bBuDDvxW#3xy~O_vtW-%`q?!aCVdIm{zH zYgww-s>fRE6uGy}f4y_=>7TnT*I3T6?5V8Tne#LJ-<1DP^nbeloACef=Eud4&;Qu- zL;GK+{iFIH(*IiRAKL%m{?};#V2kzM$>){j7;kNUr_3nD*thwEZ8sC&y&gu_FmcNXmMht4J`Wa<^{|8T3~3yAOQ2n5cKlet{uCri3IY_@cpz{Ihfoq-|1F~|@AOW4Er diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 6319bd65e..f318f292e 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -88,6 +88,7 @@ def npm(self) -> Npm: return self._npm def refresh(self) -> pl.DataFrame: + """Update and sync all metadata files.""" npm_tags = self.npm.tags() self.write_parquet(npm_tags, self._paths["npm_tags"]) @@ -98,6 +99,11 @@ def refresh(self) -> pl.DataFrame: self.write_parquet(gh_trees, self._paths["gh_trees"]) return gh_trees + def reset(self) -> None: + """Remove all metadata files.""" + for fp in self._paths.values(): + fp.unlink(missing_ok=True) + def read(self, name: _PathAlias, /) -> pl.DataFrame: """Read existing metadata from file.""" return pl.read_parquet(self._from_alias(name)) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 2d0d16fca..6bde876ae 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -357,6 +357,10 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: Use known tags to discover and update missing trees metadata. Aims to stay well-within API rate limits, both for authenticated and unauthenticated users. + + Notes + ----- + Internally handles regenerating the ``tag`` enum. 
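A compact illustration (not part of the patch) of why ``tag`` becomes a ``pl.Enum``: the categories are the known tags in ascending release order, so sorts and comparisons follow release order rather than plain string order. The tag values here are hardcoded stand-ins for the ``semver``-sorted ``gh_tags`` column.

import polars as pl

releases = ["v1.5.0", "v1.29.0", "v2.9.0"]  # assumed ascending release order
tag_enum = pl.Enum(releases)

df = pl.DataFrame({"tag": ["v2.9.0", "v1.5.0", "v1.29.0"]}).with_columns(
    pl.col("tag").cast(tag_enum)
)
# Most recent release first; a plain string sort would rank "v1.5.0" above "v1.29.0".
print(df.sort("tag", descending=True))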
""" if gh_tags.is_empty(): msg = f"Expected rows present in `gh_tags`, but got:\n{gh_tags!r}" @@ -367,18 +371,23 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: TP = ReParsedTag if not fp.exists(): print(f"Initializing {fp!s}") - return self._trees_batched(_iter_rows(gh_tags, stop, TP)) + result = self._trees_batched(_iter_rows(gh_tags, stop, TP)) else: - trees = pl.read_parquet(fp) + trees = ( + pl.scan_parquet(fp) + .with_columns(pl.col("tag").cast(pl.String)) + .collect() + ) missing_trees = gh_tags.join( trees.select(pl.col("tag").unique()), on="tag", how="anti" ) if missing_trees.is_empty(): print(f"Already up-to-date {fp!s}") - return trees + result = trees else: fresh = self._trees_batched(_iter_rows(missing_trees, stop, TP)) - return pl.concat((trees, fresh)) + result = pl.concat((trees, fresh)) + return result.with_columns(pl.col("tag").cast(semver.tag_enum(gh_tags))) def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: limit = self.rate_limit(strict=True) diff --git a/tools/datasets/semver.py b/tools/datasets/semver.py index cb4c6c799..57f6d509f 100644 --- a/tools/datasets/semver.py +++ b/tools/datasets/semver.py @@ -52,6 +52,28 @@ def with_columns(frame: _Frame, /, *, col_tag: str = "tag") -> _Frame: return ldf -def sort(frame: _Frame, /) -> _Frame: - """Sort ``frame``, displaying in descending release order.""" - return frame.sort(_SEM_VER_FIELDS, descending=True) +def tag_enum(frame: _Frame, /, *, col_tag: str = "tag") -> pl.Enum: + """Extract an **ascending** order ``pl.Enum`` from ``col_tag``.""" + return pl.Enum( + frame.lazy() + .pipe(sort, descending=False) + .select(col_tag) + .collect() + .get_column(col_tag) + ) + + +def sort(frame: _Frame, /, descending: bool = True) -> _Frame: + """ + Sort ``frame``, displaying in release order. + + Parameters + ---------- + descending + By default, **most recent** is first. + + Notes + ----- + Ensures pre release versions maintain order, always appearing before actual releases. + """ + return frame.sort(_SEM_VER_FIELDS, descending=descending, nulls_last=not descending) From 76cdd45af0e1dc7ac632899b3618c199be5291ee Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 19:12:55 +0000 Subject: [PATCH 072/201] fix: Handle `pyarrow` scalars conversion --- altair/datasets/_readers.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 78e330047..3b122df10 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -147,8 +147,15 @@ def url( if isinstance(url, str): return url else: - msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." - raise TypeError(msg) + converted = nw.to_py_scalar(url) + if isinstance(converted, str): + return converted + else: + msg = ( + f"Expected 'str' but got {type(converted).__name__!r}\n" + f"from {converted!r}." 
+ ) + raise TypeError(msg) def query( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] From bb7bc171a7005fd63f39b3d949902f4d553801f0 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 19:15:52 +0000 Subject: [PATCH 073/201] test: Adds `test_datasets` Initially quite basic, need to add more parameterize and test caching --- tests/test_datasets.py | 45 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 tests/test_datasets.py diff --git a/tests/test_datasets.py b/tests/test_datasets.py new file mode 100644 index 000000000..a15fb9411 --- /dev/null +++ b/tests/test_datasets.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +import re +from typing import TYPE_CHECKING + +import pytest +from narwhals.dependencies import is_into_dataframe +from narwhals.stable import v1 as nw + +import altair as alt # noqa: F401 +from altair.datasets import Loader + +if TYPE_CHECKING: + from altair.datasets._readers import _Backend + +backends = pytest.mark.parametrize( + "backend", ["polars", "polars[pyarrow]", "pandas", "pandas[pyarrow]", "pyarrow"] +) + + +@backends +def test_loader_with_backend(backend: _Backend) -> None: + data = Loader.with_backend(backend) + assert data._reader._name == backend + + +@backends +def test_loader_url(backend: _Backend) -> None: + data = Loader.with_backend(backend) + dataset_name = "volcano" + pattern = re.compile( + rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{dataset_name}\..+" + ) + url = data.url(dataset_name) + assert isinstance(url, str) + assert pattern.match(url) is not None + + +@backends +def test_loader_call(backend: _Backend) -> None: + data = Loader.with_backend(backend) + frame = data("stocks", ".csv") + assert is_into_dataframe(frame) + nw_frame = nw.from_native(frame) + assert set(nw_frame.columns) == {"symbol", "date", "price"} From ebc1bfaa0b35e554da15bab7dd7d7e2a95f17e63 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 19:31:53 +0000 Subject: [PATCH 074/201] fix(DRAFT): hotfix `pyarrow` read --- altair/datasets/_readers.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 3b122df10..f58fcd56d 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -98,6 +98,10 @@ def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: suffix = validate_suffix(source, is_ext_scan) return self._scan_fn[suffix] + def _response_hook(self, f): + # HACK: pyarrow wants the file obj + return f.read() + def dataset( self, name: DatasetName | LiteralString, @@ -133,7 +137,7 @@ def dataset( return fn(fp, **kwds) else: with self._opener.open(url) as f: - return fn(f.read(), **kwds) + return fn(self._response_hook(f), **kwds) def url( self, @@ -329,6 +333,9 @@ def __init__(self, name: _PyArrow, /) -> None: } self._scan_fn = {".parquet": pa_read_parquet} + def _response_hook(self, f): + return f + def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw.Expr: """ From fe0ae88201cc699b32ee1e9c07b602d9d7a8d439 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 20:56:22 +0000 Subject: [PATCH 075/201] fix(DRAFT): Treat `polars` as exception, invalidate cache Possibly fix https://github.com/vega/altair/actions/runs/11768349827/job/32778071725?pr=3631 --- altair/datasets/_readers.py 
| 13 ++++++++----- tests/test_datasets.py | 1 + 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index f58fcd56d..eea9f18db 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -99,8 +99,8 @@ def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: return self._scan_fn[suffix] def _response_hook(self, f): - # HACK: pyarrow wants the file obj - return f.read() + # HACK: `pyarrow` + `pandas` wants the file obj + return f def dataset( self, @@ -273,6 +273,9 @@ def __init__(self, name: _Polars, /) -> None: } self._scan_fn = {".parquet": pl.scan_parquet} + def _response_hook(self, f): + return f.read() + class _PolarsPyArrowReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: @@ -289,6 +292,9 @@ def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: } self._scan_fn = {".parquet": pl.scan_parquet} + def _response_hook(self, f): + return f.read() + class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): """ @@ -333,9 +339,6 @@ def __init__(self, name: _PyArrow, /) -> None: } self._scan_fn = {".parquet": pa_read_parquet} - def _response_hook(self, f): - return f - def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw.Expr: """ diff --git a/tests/test_datasets.py b/tests/test_datasets.py index a15fb9411..c37bc0046 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -39,6 +39,7 @@ def test_loader_url(backend: _Backend) -> None: @backends def test_loader_call(backend: _Backend) -> None: data = Loader.with_backend(backend) + data.cache_dir = "" frame = data("stocks", ".csv") assert is_into_dataframe(frame) nw_frame = nw.from_native(frame) From 7089f2af693c6db2025ee265f31ec4ef228dd8c3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 21:11:07 +0000 Subject: [PATCH 076/201] test: Skip `pyarrow` tests on `3.9` Forgot that this gets uninstalled in CI https://github.com/vega/altair/actions/runs/11768424121/job/32778234026?pr=3631 --- tests/test_datasets.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index c37bc0046..ec2f9014f 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -9,12 +9,15 @@ import altair as alt # noqa: F401 from altair.datasets import Loader +from tests import skip_requires_pyarrow if TYPE_CHECKING: from altair.datasets._readers import _Backend -backends = pytest.mark.parametrize( - "backend", ["polars", "polars[pyarrow]", "pandas", "pandas[pyarrow]", "pyarrow"] +backends = skip_requires_pyarrow( + pytest.mark.parametrize( + "backend", ["polars", "polars[pyarrow]", "pandas", "pandas[pyarrow]", "pyarrow"] + ) ) @@ -39,7 +42,7 @@ def test_loader_url(backend: _Backend) -> None: @backends def test_loader_call(backend: _Backend) -> None: data = Loader.with_backend(backend) - data.cache_dir = "" + data.cache_dir = "" # type: ignore[assignment] frame = data("stocks", ".csv") assert is_into_dataframe(frame) nw_frame = nw.from_native(frame) From e1290d4384d4926c24f22a3a23f103e284cfbe1e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 13:50:54 +0000 Subject: [PATCH 077/201] refactor: Tidy up changes from last 4 commits - Rename and properly document "file-like object" handling - Also made a bit clearer what is being called and when - Use a more granular 
approach to skipping in `@backends` - Previously, everything was skipped regardless of whether it required `pyarrow` - Now, `polars`, `pandas` **always** run - with `pandas` expected to fail - I had to clean up `skip_requires_pyarrow` to make it compatible with `pytest.param` - It has a runtime check for if `MarkDecorator`, instead of just a callable https://github.com/vega/altair/pull/3631/commits/bb7bc171a7005fd63f39b3d949902f4d553801f0, https://github.com/vega/altair/pull/3631/commits/ebc1bfaa0b35e554da15bab7dd7d7e2a95f17e63, https://github.com/vega/altair/pull/3631/commits/fe0ae88201cc699b32ee1e9c07b602d9d7a8d439, https://github.com/vega/altair/pull/3631/commits/7089f2af693c6db2025ee265f31ec4ef228dd8c3 --- altair/datasets/_readers.py | 33 ++++++++++++++++++++++----------- tests/__init__.py | 31 +++++++++++++++++++------------ tests/test_datasets.py | 26 ++++++++++++++++++++++---- 3 files changed, 63 insertions(+), 27 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index eea9f18db..a3435d231 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -12,6 +12,7 @@ import os import urllib.request from functools import partial +from http.client import HTTPResponse from importlib import import_module from importlib.util import find_spec from itertools import chain, islice @@ -76,6 +77,10 @@ __all__ = ["get_backend"] +def _identity(_: _T, /) -> _T: + return _ + + class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): """ Common functionality between backends. @@ -88,6 +93,18 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): _name: LiteralString _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() + _response: ClassVar[staticmethod[[HTTPResponse], Any]] = staticmethod(_identity) + """ + Backends that do not support `file-like objects`_, must override with conversion. + + Used only for **remote** files, as *cached* files use a `pathlib.Path`_. + + .. _file-like objects: + https://docs.python.org/3/glossary.html#term-file-object + .. 
_pathlib.Path: + https://docs.python.org/3/library/pathlib.html#pathlib.Path + """ + _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: @@ -98,10 +115,6 @@ def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: suffix = validate_suffix(source, is_ext_scan) return self._scan_fn[suffix] - def _response_hook(self, f): - # HACK: `pyarrow` + `pandas` wants the file obj - return f - def dataset( self, name: DatasetName | LiteralString, @@ -137,7 +150,7 @@ def dataset( return fn(fp, **kwds) else: with self._opener.open(url) as f: - return fn(self._response_hook(f), **kwds) + return fn(self._response(f), **kwds) def url( self, @@ -261,6 +274,8 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): + _response = staticmethod(HTTPResponse.read) + def __init__(self, name: _Polars, /) -> None: self._name = _requirements(name) if not TYPE_CHECKING: @@ -273,11 +288,10 @@ def __init__(self, name: _Polars, /) -> None: } self._scan_fn = {".parquet": pl.scan_parquet} - def _response_hook(self, f): - return f.read() - class _PolarsPyArrowReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): + _response = staticmethod(HTTPResponse.read) + def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: _pl, _pa = _requirements(name) self._name = name @@ -292,9 +306,6 @@ def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: } self._scan_fn = {".parquet": pl.scan_parquet} - def _response_hook(self, f): - return f.read() - class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): """ diff --git a/tests/__init__.py b/tests/__init__.py index 617cfca80..17a33e91e 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -5,14 +5,14 @@ import sys from importlib.util import find_spec from pathlib import Path -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, overload import pytest from tests import examples_arguments_syntax, examples_methods_syntax if TYPE_CHECKING: - from collections.abc import Callable, Collection, Iterator, Mapping + from collections.abc import Collection, Iterator, Mapping from re import Pattern if sys.version_info >= (3, 11): @@ -20,6 +20,7 @@ else: from typing_extensions import TypeAlias from _pytest.mark import ParameterSet + from _pytest.mark.structures import Markable MarksType: TypeAlias = ( "pytest.MarkDecorator | Collection[pytest.MarkDecorator | pytest.Mark]" @@ -96,9 +97,21 @@ def windows_has_tzdata() -> bool: """ +@overload def skip_requires_pyarrow( - fn: Callable[..., Any] | None = None, /, *, requires_tzdata: bool = False -) -> Callable[..., Any]: + fn: None = ..., /, *, requires_tzdata: bool = ... +) -> pytest.MarkDecorator: ... + + +@overload +def skip_requires_pyarrow( + fn: Markable, /, *, requires_tzdata: bool = ... +) -> Markable: ... + + +def skip_requires_pyarrow( + fn: Markable | None = None, /, *, requires_tzdata: bool = False +) -> pytest.MarkDecorator | Markable: """ ``pytest.mark.skipif`` decorator. @@ -109,7 +122,7 @@ def skip_requires_pyarrow( https://github.com/vega/altair/issues/3050 .. _pyarrow: - https://pypi.org/project/pyarrow/ + https://pypi.org/project/pyarrow/ """ composed = pytest.mark.skipif( find_spec("pyarrow") is None, reason="`pyarrow` not installed." 
@@ -120,13 +133,7 @@ def skip_requires_pyarrow( reason="Timezone database is not installed on Windows", )(composed) - def wrap(test_fn: Callable[..., Any], /) -> Callable[..., Any]: - return composed(test_fn) - - if fn is None: - return wrap - else: - return wrap(fn) + return composed if fn is None else composed(fn) def id_func_str_only(val) -> str: diff --git a/tests/test_datasets.py b/tests/test_datasets.py index ec2f9014f..7a4ab51f1 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,6 +1,7 @@ from __future__ import annotations import re +from importlib.util import find_spec from typing import TYPE_CHECKING import pytest @@ -14,10 +15,27 @@ if TYPE_CHECKING: from altair.datasets._readers import _Backend -backends = skip_requires_pyarrow( - pytest.mark.parametrize( - "backend", ["polars", "polars[pyarrow]", "pandas", "pandas[pyarrow]", "pyarrow"] - ) + +requires_pyarrow = skip_requires_pyarrow() + +backends = pytest.mark.parametrize( + "backend", + [ + "polars", + pytest.param( + "pandas", + marks=pytest.mark.xfail( + find_spec("pyarrow") is None, + reason=( + "`pandas` supports backends other than `pyarrow` for `.parquet`.\n" + "However, none of these are currently an `altair` dependency." + ), + ), + ), + pytest.param("polars[pyarrow]", marks=requires_pyarrow), + pytest.param("pandas[pyarrow]", marks=requires_pyarrow), + pytest.param("pyarrow", marks=requires_pyarrow), + ], ) From 9d88e1bbb20b6b24bc3cefc40c62108e259edf65 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 14:37:21 +0000 Subject: [PATCH 078/201] refactor: Rework `_readers.py` - Moved `_Reader._metadata` -> module-level constant `_METADATA`. - It was never modified and is based on the relative directory of this module - Generally improved the readability with more method-chaining (less assignment) - Renamed, improved doc `_filter_reduce` -> `_parse_predicates_constraints` --- altair/datasets/_readers.py | 55 ++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index a3435d231..b2f41af89 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -22,6 +22,7 @@ Any, Callable, ClassVar, + Final, Generic, Literal, Protocol, @@ -76,6 +77,8 @@ __all__ = ["get_backend"] +_METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" + def _identity(_: _T, /) -> _T: return _ @@ -105,8 +108,6 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): https://docs.python.org/3/library/pathlib.html#pathlib.Path """ - _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" - def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: suffix = validate_suffix(source, is_ext_read) return self._read_fn[suffix] @@ -159,20 +160,13 @@ def url( /, tag: VersionTag | None = None, ) -> str: - df = self.query(**validate_constraints(name, suffix, tag)) - url = df.item(0, "url_npm") + frame = self.query(**validate_constraints(name, suffix, tag)) + url = nw.to_py_scalar(frame.item(0, "url_npm")) if isinstance(url, str): return url else: - converted = nw.to_py_scalar(url) - if isinstance(converted, str): - return converted - else: - msg = ( - f"Expected 'str' but got {type(converted).__name__!r}\n" - f"from {converted!r}." - ) - raise TypeError(msg) + msg = f"Expected 'str' but got {type(url).__name__!r}\n" f"from {url!r}." 
+ raise TypeError(msg) def query( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] @@ -188,15 +182,14 @@ def query( .. _pl.LazyFrame.filter: https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html """ - source = self._metadata - fn = self.scan_fn(source) - frame = nw.from_native(fn(source)) - result = frame.filter(_filter_reduce(predicates, constraints)) - df: nw.DataFrame[Any] = ( - result.collect() if isinstance(result, nw.LazyFrame) else result + frame = ( + nw.from_native(self.scan_fn(_METADATA)(_METADATA)) + .filter(_parse_predicates_constraints(predicates, constraints)) + .lazy() + .collect() ) - if not df.is_empty(): - return df + if not frame.is_empty(): + return frame else: terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) msg = f"Found no results for:\n{terms}" @@ -208,12 +201,12 @@ def _read_metadata(self) -> IntoDataFrameT: Effectively an eager read, no filters. """ - fn = self.scan_fn(self._metadata) - frame = nw.from_native(fn(self._metadata)) - df: nw.DataFrame[Any] = ( - frame.collect() if isinstance(frame, nw.LazyFrame) else frame + return ( + nw.from_native(self.scan_fn(_METADATA)(_METADATA)) + .lazy() + .collect() + .to_native() ) - return df.to_native() @property def _cache(self) -> Path | None: # type: ignore[return] @@ -351,11 +344,15 @@ def __init__(self, name: _PyArrow, /) -> None: self._scan_fn = {".parquet": pa_read_parquet} -def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw.Expr: +def _parse_predicates_constraints( + predicates: tuple[Any, ...], constraints: Metadata, / +) -> nw.Expr: """ - ``narwhals`` only accepts ``filter(*predicates)`. + ``narwhals`` only accepts ``filter(*predicates)``. + + So we convert each item in ``**constraints`` here as:: - Manually converts the constraints into ``==`` + col("column_name") == literal_value """ return nw.all_horizontal( chain(predicates, (nw.col(name) == v for name, v in constraints.items())) From 60d39f5f7f175f94b2511b221ee2fd1760eacb9e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 16:40:12 +0000 Subject: [PATCH 079/201] test: Adds tests for missing dependencies --- altair/datasets/_readers.py | 14 ++++++++++- tests/test_datasets.py | 48 +++++++++++++++++++++++++++++++++++-- 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index b2f41af89..20b308aed 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -226,7 +226,19 @@ def _import(self, name: str, /) -> Any: if spec := find_spec(name): return import_module(spec.name) else: - msg = f"{type(self).__name__!r} requires missing dependency {name!r}." 
+ reqs = _requirements(self._name) # type: ignore[call-overload] + if isinstance(reqs, tuple): + depends = ", ".join(f"{req!r}" for req in reqs) + " packages" + else: + depends = f"{reqs!r} package" + + msg = ( + f"Backend {self._name!r} requires the {depends}, but {name!r} could not be found.\n" + f"This can be installed with pip using:\n" + f" pip install {name}\n" + f"Or with conda using:\n" + f" conda install -c conda-forge {name}" + ) raise ModuleNotFoundError(msg, name=name) def __repr__(self) -> str: diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 7a4ab51f1..de932137f 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,6 +1,7 @@ from __future__ import annotations import re +import sys from importlib.util import find_spec from typing import TYPE_CHECKING @@ -13,8 +14,12 @@ from tests import skip_requires_pyarrow if TYPE_CHECKING: + from typing import Literal + from altair.datasets._readers import _Backend +CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" + requires_pyarrow = skip_requires_pyarrow() @@ -58,10 +63,49 @@ def test_loader_url(backend: _Backend) -> None: @backends -def test_loader_call(backend: _Backend) -> None: +def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv(CACHE_ENV_VAR, raising=False) + data = Loader.with_backend(backend) - data.cache_dir = "" # type: ignore[assignment] frame = data("stocks", ".csv") assert is_into_dataframe(frame) nw_frame = nw.from_native(frame) assert set(nw_frame.columns) == {"symbol", "date", "price"} + + +@backends +def test_missing_dependency_single( + backend: _Backend, monkeypatch: pytest.MonkeyPatch +) -> None: + if backend in {"polars[pyarrow]", "pandas[pyarrow]"}: + pytest.skip("Testing single dependency backends only") + + monkeypatch.setitem(sys.modules, backend, None) + + with pytest.raises( + ModuleNotFoundError, + match=re.compile( + rf"{backend}.+requires.+{backend}.+but.+{backend}.+not.+found.+pip install {backend}", + flags=re.DOTALL, + ), + ): + Loader.with_backend(backend) + + +@pytest.mark.parametrize("backend", ["polars[pyarrow]", "pandas[pyarrow]"]) +@skip_requires_pyarrow +def test_missing_dependency_multi( + backend: _Backend, monkeypatch: pytest.MonkeyPatch +) -> None: + secondary = "pyarrow" + primary = backend.removesuffix(f"[{secondary}]") + monkeypatch.setitem(sys.modules, secondary, None) + + with pytest.raises( + ModuleNotFoundError, + match=re.compile( + rf"{re.escape(backend)}.+requires.+'{primary}', '{secondary}'.+but.+{secondary}.+not.+found.+pip install {secondary}", + flags=re.DOTALL, + ), + ): + Loader.with_backend(backend) From d6f0e45a3ade1fd9ca08e22b2ae9f6710eabd496 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 18:36:28 +0000 Subject: [PATCH 080/201] test: Adds `test_dataset_not_found` --- altair/datasets/_readers.py | 10 ++-- tests/test_datasets.py | 95 +++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 4 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 20b308aed..ebd996d65 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -192,8 +192,8 @@ def query( return frame else: terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) - msg = f"Found no results for:\n{terms}" - raise NotImplementedError(msg) + msg = f"Found no results for:\n {terms}" + raise ValueError(msg) def _read_metadata(self) -> IntoDataFrameT: """ @@ -378,16 
+378,18 @@ def validate_constraints( /, ) -> Metadata: constraints: Metadata = {} + suffixes = ".csv", ".json", ".tsv", ".arrow" if tag is not None: constraints["tag"] = tag - if name.endswith((".csv", ".json", ".tsv", ".arrow")): + if name.endswith(suffixes): fp = Path(name) constraints["dataset_name"] = fp.stem constraints["suffix"] = fp.suffix return constraints elif suffix is not None: if not is_ext_read(suffix): - raise TypeError(suffix) + msg = f"Expected 'suffix' to be one of {suffixes!r},\nbut got: {suffix!r}" + raise TypeError(msg) else: constraints["suffix"] = suffix constraints["dataset_name"] = name diff --git a/tests/test_datasets.py b/tests/test_datasets.py index de932137f..cf26fc0f8 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -109,3 +109,98 @@ def test_missing_dependency_multi( ), ): Loader.with_backend(backend) + + +@backends +def test_dataset_not_found(backend: _Backend) -> None: + """ + Various queries that should **always raise** due to non-existent dataset. + + ``Loader.url`` is used since it doesn't require a remote connection. + """ + import polars as pl + + data = Loader.with_backend(backend) + real_name: Literal["disasters"] = "disasters" + real_suffix: Literal[".csv"] = ".csv" + real_tag: Literal["v1.14.0"] = "v1.14.0" + + invalid_name: Literal["fake name"] = "fake name" + invalid_suffix: Literal["fake suffix"] = "fake suffix" + invalid_tag: Literal["fake tag"] = "fake tag" + + incorrect_suffix: Literal[".json"] = ".json" + incorrect_tag: Literal["v1.5.0"] = "v1.5.0" + + ERR_NO_RESULT = ValueError + # NOTE: ``polars`` enforces enums stricter than other packages. + # Rather than returning an empty dataframe, filtering on a value + # *outside* of the enum range raises an internal error. + ERR_NO_RESULT_OR_ENUM = (ERR_NO_RESULT, pl.exceptions.InvalidOperationError) + + MSG_NO_RESULT = "Found no results for" + NAME = "dataset_name" + SUFFIX = "suffix" + TAG = "tag" + + with pytest.raises( + ERR_NO_RESULT, + match=re.compile(rf"{MSG_NO_RESULT}.+{NAME}.+{invalid_name}", re.DOTALL), + ): + data.url(invalid_name) + + with pytest.raises( + TypeError, + match=re.compile( + rf"Expected '{SUFFIX}' to be one of.+\(.+\).+but got.+{invalid_suffix}", + re.DOTALL, + ), + ): + data.url(real_name, invalid_suffix) # type: ignore[arg-type] + + with pytest.raises( + ERR_NO_RESULT_OR_ENUM, + match=re.compile(rf"{invalid_tag}", re.DOTALL), + ): + data.url(real_name, tag=invalid_tag) # type: ignore[arg-type] + + with pytest.raises( + ERR_NO_RESULT_OR_ENUM, + match=re.compile(rf"{invalid_tag}", re.DOTALL), + ): + data.url(real_name, real_suffix, tag=invalid_tag) # type: ignore[arg-type] + + with pytest.raises( + ERR_NO_RESULT, + match=re.compile( + rf"{MSG_NO_RESULT}.+{TAG}.+{incorrect_tag}.+{SUFFIX}.+{real_suffix}.+{NAME}.+{real_name}", + re.DOTALL, + ), + ): + data.url(real_name, real_suffix, tag=incorrect_tag) + + with pytest.raises( + ERR_NO_RESULT, + match=re.compile( + rf"{MSG_NO_RESULT}.+{SUFFIX}.+{incorrect_suffix}.+{NAME}.+{real_name}", + re.DOTALL, + ), + ): + data.url(real_name, incorrect_suffix) + + with pytest.raises( + ERR_NO_RESULT, + match=re.compile( + rf"{MSG_NO_RESULT}.+{TAG}.+{real_tag}.+{SUFFIX}.+{incorrect_suffix}.+{NAME}.+{real_name}", + re.DOTALL, + ), + ): + data.url(real_name, incorrect_suffix, tag=real_tag) + + with pytest.raises( + ERR_NO_RESULT, + match=re.compile( + rf"{MSG_NO_RESULT}.+{TAG}.+{incorrect_tag}.+{NAME}.+{real_name}", re.DOTALL + ), + ): + data.url(real_name, tag=incorrect_tag) From b7d57a0b497de6bc824f3e2600894cc75f5ad413 
Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 19:44:28 +0000 Subject: [PATCH 081/201] test: Adds `test_reader_cache` --- tests/test_datasets.py | 74 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index cf26fc0f8..b3cd1ab8c 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -3,10 +3,10 @@ import re import sys from importlib.util import find_spec -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import pytest -from narwhals.dependencies import is_into_dataframe +from narwhals.dependencies import is_into_dataframe, is_polars_dataframe from narwhals.stable import v1 as nw import altair as alt # noqa: F401 @@ -14,6 +14,7 @@ from tests import skip_requires_pyarrow if TYPE_CHECKING: + from pathlib import Path from typing import Literal from altair.datasets._readers import _Backend @@ -204,3 +205,72 @@ def test_dataset_not_found(backend: _Backend) -> None: ), ): data.url(real_name, tag=incorrect_tag) + + +@backends +def test_reader_cache( + backend: _Backend, monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + """ + Using a sample of the smallest datasets, make *"requests"* that are all caught by prior hits. + + Note + ---- + `tmp_path`_ is a built-in fixture. + + .. _tmp_path: + https://docs.pytest.org/en/stable/getting-started.html#request-a-unique-temporary-directory-for-functional-tests + """ + import polars as pl + from polars.testing import assert_frame_equal + + monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) + + data = Loader.with_backend(backend) + cache_dir = data.cache_dir + assert cache_dir is not None + assert cache_dir == tmp_path + + assert tuple(cache_dir.iterdir()) == () + + # smallest csvs + lookup_groups = data("lookup_groups", tag="v2.5.3") + data("lookup_people", tag="v2.4.0") + data("iowa-electricity", tag="v2.3.1") + data("global-temp", tag="v2.9.0") + + cached_paths = tuple(cache_dir.iterdir()) + assert len(cached_paths) == 4 + + if is_polars_dataframe(lookup_groups): + left, right = ( + lookup_groups, + cast(pl.DataFrame, data("lookup_groups", tag="v2.5.3")), + ) + else: + left, right = ( + pl.DataFrame(lookup_groups), + pl.DataFrame(data("lookup_groups", tag="v2.5.3")), + ) + + assert_frame_equal(left, right) + assert len(tuple(cache_dir.iterdir())) == 4 + assert cached_paths == tuple(cache_dir.iterdir()) + + data("iowa-electricity", tag="v1.30.2") + data("global-temp", tag="v2.8.1") + data("global-temp", tag="v2.8.0") + + assert len(tuple(cache_dir.iterdir())) == 4 + assert cached_paths == tuple(cache_dir.iterdir()) + + data("lookup_people", tag="v1.10.0") + data("lookup_people", tag="v1.11.0") + data("lookup_people", tag="v1.20.0") + data("lookup_people", tag="v1.21.0") + data("lookup_people", tag="v2.1.0") + data("lookup_people", tag="v2.3.0") + data("lookup_people", tag="v2.5.0-next.0") + + assert len(tuple(cache_dir.iterdir())) == 4 + assert cached_paths == tuple(cache_dir.iterdir()) From b70aef883721ce1ce905e1ec8e82938eb4859257 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 21:23:43 +0000 Subject: [PATCH 082/201] docs: Finish `_Reader`, fill parameters of `Loader.__call__` Still need examples for `Loader.__call__` --- altair/datasets/__init__.py | 31 +++++++++++++++++++--- altair/datasets/_readers.py | 52 +++++++++++++++++++++++-------------- 2 files changed, 60 
insertions(+), 23 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index b7f87bdaa..4260314d1 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -131,7 +131,7 @@ def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: obj._reader = get_backend(backend) return obj - # TODO: docs (parameters, examples) + # TODO: docs (examples) def __call__( self, name: DatasetName | LiteralString, @@ -140,7 +140,30 @@ def __call__( tag: VersionTag | None = None, **kwds: Any, ) -> IntoDataFrameT: - """Get a remote dataset and load as tabular data.""" + """ + Get a remote dataset and load as tabular data. + + Parameters + ---------- + name + Name of the dataset/`stem`_ of file name. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + tag + Version identifier for a `vega-datasets release`_. + **kwds + Arguments passed to the underlying read function. + + .. _stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. _vega-datasets release: + https://github.com/vega/vega-datasets/releases + """ return self._reader.dataset(name, suffix, tag=tag, **kwds) def url( @@ -156,14 +179,14 @@ def url( Parameters ---------- name - Name of the dataset/`stem`_ of filename. + Name of the dataset/`stem`_ of file name. suffix File extension/`Path.suffix`_. .. note:: Only needed if ``name`` is available in multiple formats. tag - `vega-datasets release`_ version. + Version identifier for a `vega-datasets release`_. .. _stem: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index ebd996d65..fe8f8212f 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -23,7 +23,6 @@ Callable, ClassVar, Final, - Generic, Literal, Protocol, TypeVar, @@ -84,16 +83,42 @@ def _identity(_: _T, /) -> _T: return _ -class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): +class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): """ - Common functionality between backends. + Describes basic IO for remote & local tabular resources. - Trying to use ``narwhals`` as much as possible + Subclassing this protocol directly will provide a *mostly* complete implementation. + + Each of the following must be explicitly assigned: + + _Reader._read_fn + _Reader._scan_fn + _Reader._name """ _read_fn: dict[Extension, Callable[..., IntoDataFrameT]] + """ + Eager file read functions. + + Each corresponds to a known file extension within ``vega-datasets``. + """ + _scan_fn: dict[_ExtensionScan, Callable[..., IntoFrameT]] + """ + *Optionally*-lazy file read/scan functions. + + Used exclusively for ``metadata.parquet``. + + Currently ``polars`` backends are the only lazy options. + """ + _name: LiteralString + """ + Used in error messages, repr and matching ``@overload``(s). + + Otherwise, has no concrete meaning. + """ + _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() _response: ClassVar[staticmethod[[HTTPResponse], Any]] = staticmethod(_identity) @@ -124,16 +149,6 @@ def dataset( tag: VersionTag | None = None, **kwds: Any, ) -> IntoDataFrameT: - """ - Fetch a remote dataset, attempt caching if possible. - - Parameters - ---------- - name, suffix, tag - TODO - **kwds - Arguments passed to the underlying read function. 
- """ df = self.query(**validate_constraints(name, suffix, tag)) it = islice(df.iter_rows(named=True), 1) result = cast("Metadata", next(it)) @@ -171,13 +186,12 @@ def url( def query( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] ) -> nw.DataFrame[IntoDataFrameT]: - r""" + """ Query multi-version trees metadata. - Parameters - ---------- - \*predicates, \*\*constraints - Passed directly to `pl.LazyFrame.filter`_. + Notes + ----- + Arguments correspond to those seen in `pl.LazyFrame.filter`_. .. _pl.LazyFrame.filter: https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html From 403b7874f360fc2f1734de538e81a91e4c4ddffe Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 21:48:21 +0000 Subject: [PATCH 083/201] refactor: Rename `backend` -> `backend_name`, `get_backend` -> `backend` `get_` was the wrong term since it isn't a free operation --- altair/datasets/__init__.py | 14 ++++++------- altair/datasets/_readers.py | 40 +++++++++++++++++++------------------ 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 4260314d1..b6f983754 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -4,7 +4,7 @@ from narwhals.typing import IntoDataFrameT, IntoFrameT -from altair.datasets._readers import _Reader, get_backend +from altair.datasets._readers import _Reader, backend if TYPE_CHECKING: import sys @@ -46,29 +46,29 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): @overload @classmethod def with_backend( - cls, backend: Literal["polars", "polars[pyarrow]"], / + cls, backend_name: Literal["polars", "polars[pyarrow]"], / ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... @overload @classmethod def with_backend( - cls, backend: Literal["pandas", "pandas[pyarrow]"], / + cls, backend_name: Literal["pandas", "pandas[pyarrow]"], / ) -> Loader[pd.DataFrame, pd.DataFrame]: ... @overload @classmethod def with_backend( - cls, backend: Literal["pyarrow"], / + cls, backend_name: Literal["pyarrow"], / ) -> Loader[pa.Table, pa.Table]: ... @classmethod - def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: + def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: """ Initialize a new loader, with the specified backend. Parameters ---------- - backend + backend_name DataFrame package/config used to return data. * *polars*: Using `polars defaults`_ @@ -128,7 +128,7 @@ def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: dtype: object """ obj = Loader.__new__(Loader) - obj._reader = get_backend(backend) + obj._reader = backend(backend_name) return obj # TODO: docs (examples) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index fe8f8212f..9645d0bb2 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -74,7 +74,7 @@ _Backend: TypeAlias = Literal[_PolarsAny, _PandasAny, _PyArrow] -__all__ = ["get_backend"] +__all__ = ["backend"] _METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" @@ -428,33 +428,35 @@ def is_ext_read(suffix: Any) -> TypeIs[Extension]: @overload -def get_backend(backend: _PolarsAny, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... +def backend(name: _PolarsAny, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... @overload -def get_backend(backend: _PandasAny, /) -> _Reader[pd.DataFrame, pd.DataFrame]: ... 
+def backend(name: _PandasAny, /) -> _Reader[pd.DataFrame, pd.DataFrame]: ... @overload -def get_backend(backend: _PyArrow, /) -> _Reader[pa.Table, pa.Table]: ... - - -def get_backend(backend: _Backend, /) -> _Reader[Any, Any]: - if backend == "polars": - return _PolarsReader(backend) - elif backend == "polars[pyarrow]": - return _PolarsPyArrowReader(backend) - elif backend == "pandas[pyarrow]": - return _PandasPyArrowReader(backend) - elif backend == "pandas": - return _PandasReader(backend) - elif backend == "pyarrow": - return _PyArrowReader(backend) - elif backend in {"ibis", "cudf", "dask", "modin"}: +def backend(name: _PyArrow, /) -> _Reader[pa.Table, pa.Table]: ... + + +def backend(name: _Backend, /) -> _Reader[Any, Any]: + """Reader initialization dispatcher.""" + if name == "polars": + return _PolarsReader(name) + elif name == "polars[pyarrow]": + return _PolarsPyArrowReader(name) + elif name == "pandas[pyarrow]": + return _PandasPyArrowReader(name) + elif name == "pandas": + return _PandasReader(name) + elif name == "pyarrow": + return _PyArrowReader(name) + elif name in {"ibis", "cudf", "dask", "modin"}: msg = "Supported by ``narwhals``, not investigated yet" raise NotImplementedError(msg) else: - raise TypeError(backend) + msg = f"Unknown backend {name!r}" + raise TypeError(msg) @overload From 3fbc759233fdf0203a2f8685245152732f57276a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 00:04:49 +0000 Subject: [PATCH 084/201] fix(DRAFT): Add multiple fallbacks for `pyarrow` JSON --- altair/datasets/_readers.py | 62 ++++++++++++++++++++++++++++++++----- tests/test_datasets.py | 40 +++++++++++++++++++++++- 2 files changed, 94 insertions(+), 8 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 9645d0bb2..0f30e58b9 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -11,6 +11,7 @@ import os import urllib.request +from collections.abc import Mapping, Sequence from functools import partial from http.client import HTTPResponse from importlib import import_module @@ -34,6 +35,7 @@ from narwhals.typing import IntoDataFrameT, IntoExpr, IntoFrameT if TYPE_CHECKING: + import json # noqa: F401 import sys from urllib.request import OpenerDirector @@ -346,25 +348,71 @@ class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): def __init__(self, name: _PyArrow, /) -> None: self._name = _requirements(name) if not TYPE_CHECKING: - pa = self._import(self._name) # noqa: F841 + pa = self._import(self._name) pa_csv = self._import(f"{self._name}.csv") pa_feather = self._import(f"{self._name}.feather") - pa_json = self._import(f"{self._name}.json") pa_parquet = self._import(f"{self._name}.parquet") - pa_read_csv = pa_csv.read_csv pa_read_feather = pa_feather.read_table - pa_read_json = pa_json.read_json pa_read_parquet = pa_parquet.read_table - # opt1 = ParseOptions(delimiter="\t") # type: ignore + # HACK: Multiple alternatives to `pyarrow.json.read_json` + # ------------------------------------------------------- + # NOTE: Prefer `polars` since it is zero-copy and fast (1) + if find_spec("polars") is not None: + import polars as pl + + def pa_read_json(source: StrPath, /, **kwds) -> pa.Table: + return pl.read_json(source).to_arrow() + + else: + import json + + def stdlib_read_json(source: Any, /, **kwds) -> pa.Table: + if not isinstance(source, (Path)): + obj = json.load(source) + else: + with Path(source).open(encoding="utf-8") as f: + obj = json.load(f) + # Very naive check, 
but still less likely to fail + if isinstance(obj, Sequence) and isinstance(obj[0], Mapping): + return pa.Table.from_pylist(obj) + else: + # NOTE: Almost certainly will fail on read as of `v2.9.0` + pa_json = self._import(f"{self._name}.json") + return pa_json.read_json(source) + + # NOTE: Use `pandas` as a slower fallback (2) + if find_spec("pandas") is not None: + import pandas as pd + + def pa_read_json(source: StrPath, /, **kwds) -> pa.Table: + try: + table = ( + nw.from_native( + pd.read_json( + source, dtype_backend="pyarrow" + ).convert_dtypes(dtype_backend="pyarrow") + ) + .with_columns( + nw.selectors.by_dtype(nw.Object).cast(nw.String) + ) + .to_arrow() + ) + except ValueError: + table = stdlib_read_json(source) + return table + else: + # NOTE: Convert inline from stdlib json (3) + pa_read_json = stdlib_read_json + # Stubs suggest using a dataclass, but no way to construct it - opt2: Any = {"delimiter": "\t"} + tab_sep: Any = {"delimiter": "\t"} self._read_fn = { ".csv": pa_read_csv, ".json": pa_read_json, - ".tsv": partial(pa_read_csv, parse_options=opt2), + ".tsv": partial(pa_read_csv, parse_options=tab_sep), ".arrow": pa_read_feather, } self._scan_fn = {".parquet": pa_read_parquet} diff --git a/tests/test_datasets.py b/tests/test_datasets.py index b3cd1ab8c..e39497fb4 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -17,7 +17,8 @@ from pathlib import Path from typing import Literal - from altair.datasets._readers import _Backend + from altair.datasets._readers import _Backend, _Pandas, _Polars + from altair.datasets._typing import DatasetName CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" @@ -274,3 +275,40 @@ def test_reader_cache( assert len(tuple(cache_dir.iterdir())) == 4 assert cached_paths == tuple(cache_dir.iterdir()) + + +@pytest.mark.parametrize( + "dataset", + [ + "cars", + "movies", + "wheat", + "barley", + "gapminder", + "income", + "burtin", + pytest.param( + "earthquakes", + marks=pytest.mark.xfail( + reason="GeoJSON seems to not work with pandas -> pyarrow" + ), + ), + ], +) +@pytest.mark.parametrize("fallback", ["polars", "pandas", None]) +@skip_requires_pyarrow +def test_pyarrow_read_json( + fallback: _Polars | _Pandas | None, + dataset: DatasetName, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setenv(CACHE_ENV_VAR, "") + + if fallback == "polars" or fallback is None: + monkeypatch.delitem(sys.modules, "pandas", raising=False) + elif fallback == "pandas" or fallback is None: + monkeypatch.setitem(sys.modules, "polars", None) + + data = Loader.with_backend("pyarrow") + + data(dataset, ".json") From 4f5b4de6d894a1297bd2edfaecb72c5eefa48bc7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 13:56:07 +0000 Subject: [PATCH 085/201] test: Remove `pandas` fallback for `pyarrow` There are enough alternatives here, it only added complexity --- altair/datasets/_readers.py | 40 ++++++++++---------------------- tests/test_datasets.py | 46 ++++++++++++++++++++++--------------- 2 files changed, 40 insertions(+), 46 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 0f30e58b9..2e20fd375 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -366,46 +366,30 @@ def pa_read_json(source: StrPath, /, **kwds) -> pa.Table: return pl.read_json(source).to_arrow() else: + # NOTE: Convert inline from stdlib json (2) import json - def stdlib_read_json(source: Any, /, **kwds) -> pa.Table: - if not 
isinstance(source, (Path)): + pa_json = self._import(f"{self._name}.json") + + def pa_read_json(source: Any, /, **kwds) -> pa.Table: + if not isinstance(source, Path): obj = json.load(source) else: with Path(source).open(encoding="utf-8") as f: obj = json.load(f) - # Very naive check, but still less likely to fail + # NOTE: Common case of {"values": [{...}]}, missing the `"values"` keys if isinstance(obj, Sequence) and isinstance(obj[0], Mapping): return pa.Table.from_pylist(obj) + elif isinstance(obj, Mapping) and "type" in obj: + msg = ( + "Inferred file as geojson, unsupported by pyarrow.\n" + "Try installing `polars` or using `Loader.url(...)` instead." + ) + raise NotImplementedError(msg) else: # NOTE: Almost certainly will fail on read as of `v2.9.0` - pa_json = self._import(f"{self._name}.json") return pa_json.read_json(source) - # NOTE: Use `pandas` as a slower fallback (2) - if find_spec("pandas") is not None: - import pandas as pd - - def pa_read_json(source: StrPath, /, **kwds) -> pa.Table: - try: - table = ( - nw.from_native( - pd.read_json( - source, dtype_backend="pyarrow" - ).convert_dtypes(dtype_backend="pyarrow") - ) - .with_columns( - nw.selectors.by_dtype(nw.Object).cast(nw.String) - ) - .to_arrow() - ) - except ValueError: - table = stdlib_read_json(source) - return table - else: - # NOTE: Convert inline from stdlib json (3) - pa_read_json = stdlib_read_json - # Stubs suggest using a dataclass, but no way to construct it tab_sep: Any = {"delimiter": "\t"} diff --git a/tests/test_datasets.py b/tests/test_datasets.py index e39497fb4..01167cf10 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -11,21 +11,23 @@ import altair as alt # noqa: F401 from altair.datasets import Loader -from tests import skip_requires_pyarrow +from altair.datasets._typing import DatasetName if TYPE_CHECKING: from pathlib import Path from typing import Literal - from altair.datasets._readers import _Backend, _Pandas, _Polars - from altair.datasets._typing import DatasetName + import polars as pl + from _pytest.mark.structures import ParameterSet + + from altair.datasets._readers import _Backend, _Polars CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" -requires_pyarrow = skip_requires_pyarrow() +requires_pyarrow: pytest.MarkDecorator = skip_requires_pyarrow() -backends = pytest.mark.parametrize( +backends: pytest.MarkDecorator = pytest.mark.parametrize( "backend", [ "polars", @@ -277,36 +279,44 @@ def test_reader_cache( assert cached_paths == tuple(cache_dir.iterdir()) +movies_fail: ParameterSet = pytest.param( + "movies", + marks=pytest.mark.xfail( + reason="Only working for `polars`.\n" + "`pyarrow` isn't happy with the mixed `int`/`str` column." 
+ ), +) +earthquakes_fail: ParameterSet = pytest.param( + "earthquakes", + marks=pytest.mark.xfail( + reason="Only working for `polars`.\n" "GeoJSON fails on native `pyarrow`" + ), +) + + @pytest.mark.parametrize( "dataset", [ "cars", - "movies", + movies_fail, "wheat", "barley", "gapminder", "income", "burtin", - pytest.param( - "earthquakes", - marks=pytest.mark.xfail( - reason="GeoJSON seems to not work with pandas -> pyarrow" - ), - ), + earthquakes_fail, ], ) -@pytest.mark.parametrize("fallback", ["polars", "pandas", None]) +@pytest.mark.parametrize("fallback", ["polars", None]) @skip_requires_pyarrow def test_pyarrow_read_json( - fallback: _Polars | _Pandas | None, + fallback: _Polars | None, dataset: DatasetName, monkeypatch: pytest.MonkeyPatch, ) -> None: monkeypatch.setenv(CACHE_ENV_VAR, "") - - if fallback == "polars" or fallback is None: - monkeypatch.delitem(sys.modules, "pandas", raising=False) - elif fallback == "pandas" or fallback is None: + monkeypatch.delitem(sys.modules, "pandas", raising=False) + if fallback is None: monkeypatch.setitem(sys.modules, "polars", None) data = Loader.with_backend("pyarrow") From 69a72b6e32625687223987d04e3c3f925421c1ab Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 13:59:05 +0000 Subject: [PATCH 086/201] test: Adds `test_all_datasets` Disabled by default, since there are 74 datasets --- pyproject.toml | 8 ++++++-- tests/test_datasets.py | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4132f0a25..2297ca2ea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -420,10 +420,14 @@ docstring-code-line-length = 88 # They contain examples which are being executed by the # test_examples tests. norecursedirs = ["tests/examples_arguments_syntax", "tests/examples_methods_syntax"] -addopts = ["--numprocesses=logical"] +addopts = [ + "--numprocesses=logical", + "-m not datasets_debug" +] # https://docs.pytest.org/en/stable/how-to/mark.html#registering-marks markers = [ - "slow: Label tests as slow (deselect with '-m \"not slow\"')" + "slow: Label tests as slow (deselect with '-m \"not slow\"')", + "datasets_debug: Disabled by default due to high number of requests" ] [tool.mypy] diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 01167cf10..d3f7625cd 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -3,15 +3,15 @@ import re import sys from importlib.util import find_spec -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, cast, get_args import pytest from narwhals.dependencies import is_into_dataframe, is_polars_dataframe from narwhals.stable import v1 as nw -import altair as alt # noqa: F401 from altair.datasets import Loader from altair.datasets._typing import DatasetName +from tests import skip_requires_pyarrow, slow if TYPE_CHECKING: from pathlib import Path @@ -47,6 +47,27 @@ ], ) +datasets_debug: pytest.MarkDecorator = slow(pytest.mark.datasets_debug) +""" +Custom ``pytest.mark`` decorator. + +Use for more exhaustive tests that require many requests. + +**Disabled** by default in ``pyproject.toml``: + + [tool.pytest.ini_options] + addopts = ... 
+""" + + +@pytest.fixture(scope="session") +def polars_loader( + tmp_path_factory: pytest.TempPathFactory, +) -> Loader[pl.DataFrame, pl.LazyFrame]: + data = Loader.with_backend("polars") + data.cache_dir = tmp_path_factory.mktemp("loader-cache-polars") + return data + @backends def test_loader_with_backend(backend: _Backend) -> None: @@ -322,3 +343,13 @@ def test_pyarrow_read_json( data = Loader.with_backend("pyarrow") data(dataset, ".json") + + +@datasets_debug +@pytest.mark.parametrize("name", get_args(DatasetName)) +def test_all_datasets( + name: DatasetName, polars_loader: Loader[pl.DataFrame, pl.LazyFrame] +) -> None: + """Ensure all annotated datasets can be loaded with the most reliable backend.""" + frame = polars_loader(name) + assert is_polars_dataframe(frame) From 08101cc33aa1d08f25323ea1de161c6863f30ceb Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 14:07:42 +0000 Subject: [PATCH 087/201] refactor: Remove `_Reader._response` Can't reproduce the original issue that led to adding this. All backends are supporting `HTTPResponse` directly --- altair/datasets/_readers.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 2e20fd375..65df737e8 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -13,7 +13,6 @@ import urllib.request from collections.abc import Mapping, Sequence from functools import partial -from http.client import HTTPResponse from importlib import import_module from importlib.util import find_spec from itertools import chain, islice @@ -81,10 +80,6 @@ _METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" -def _identity(_: _T, /) -> _T: - return _ - - class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): """ Describes basic IO for remote & local tabular resources. @@ -123,17 +118,6 @@ class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() - _response: ClassVar[staticmethod[[HTTPResponse], Any]] = staticmethod(_identity) - """ - Backends that do not support `file-like objects`_, must override with conversion. - - Used only for **remote** files, as *cached* files use a `pathlib.Path`_. - - .. _file-like objects: - https://docs.python.org/3/glossary.html#term-file-object - .. 
_pathlib.Path: - https://docs.python.org/3/library/pathlib.html#pathlib.Path - """ def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: suffix = validate_suffix(source, is_ext_read) @@ -168,7 +152,7 @@ def dataset( return fn(fp, **kwds) else: with self._opener.open(url) as f: - return fn(self._response(f), **kwds) + return fn(f, **kwds) def url( self, @@ -295,8 +279,6 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - _response = staticmethod(HTTPResponse.read) - def __init__(self, name: _Polars, /) -> None: self._name = _requirements(name) if not TYPE_CHECKING: @@ -311,8 +293,6 @@ def __init__(self, name: _Polars, /) -> None: class _PolarsPyArrowReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - _response = staticmethod(HTTPResponse.read) - def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: _pl, _pa = _requirements(name) self._name = name From 90428a625bc3928684018d57861f608574812fd8 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 15:49:33 +0000 Subject: [PATCH 088/201] fix: Correctly handle no remote connection Previously, `Path.touch()` appeared to be a cache-hit - despite being an empty file. - Fixes that bug - Adds tests --- altair/datasets/_readers.py | 4 ++-- tests/test_datasets.py | 47 ++++++++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 65df737e8..57b290c32 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -143,11 +143,11 @@ def dataset( if cache := self._cache: fp = cache / (result["sha"] + result["suffix"]) - if fp.exists(): + if fp.exists() and fp.stat().st_size: return fn(fp, **kwds) else: - fp.touch() with self._opener.open(url) as f: + fp.touch() fp.write_bytes(f.read()) return fn(fp, **kwds) else: diff --git a/tests/test_datasets.py b/tests/test_datasets.py index d3f7625cd..1b866cf58 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -2,8 +2,10 @@ import re import sys +from functools import partial from importlib.util import find_spec -from typing import TYPE_CHECKING, cast, get_args +from typing import TYPE_CHECKING, Any, cast, get_args +from urllib.error import URLError import pytest from narwhals.dependencies import is_into_dataframe, is_polars_dataframe @@ -353,3 +355,46 @@ def test_all_datasets( """Ensure all annotated datasets can be loaded with the most reliable backend.""" frame = polars_loader(name) assert is_polars_dataframe(frame) + + +def _raise_exception(e: type[Exception], *args: Any, **kwds: Any): + raise e(*args, **kwds) + + +def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + from polars.testing import assert_frame_equal + + data = Loader.with_backend("polars") + data.cache_dir = tmp_path + + data("londonCentroids") + data("stocks") + data("driving") + + cached_paths = tuple(tmp_path.iterdir()) + assert len(cached_paths) == 3 + + raiser = partial(_raise_exception, URLError) + with monkeypatch.context() as mp: + mp.setattr(data._reader._opener, "open", raiser) + # Existing cache entries don't trigger an error + data("londonCentroids") + data("stocks") + data("driving") + # Mocking cache-miss without remote conn + with pytest.raises(URLError): + data("birdstrikes") + assert len(tuple(tmp_path.iterdir())) == 3 + + # Now we can get a cache-hit + frame = data("birdstrikes") + assert 
is_polars_dataframe(frame) + assert len(tuple(tmp_path.iterdir())) == 4 + + with monkeypatch.context() as mp: + mp.setattr(data._reader._opener, "open", raiser) + # Here, the remote conn isn't considered - we already have the file + frame_from_cache = data("birdstrikes") + assert len(tuple(tmp_path.iterdir())) == 4 + + assert_frame_equal(frame, frame_from_cache) From 8ad78c174933c9b728f30db653354da6aff64f23 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:26:41 +0000 Subject: [PATCH 089/201] docs: Align `_typing.Metadata` and `Loader.(url|__call__)` descriptions Related https://github.com/vega/altair/commit/c572180ebc7d876714a38688c53f7e4af87abd93 --- altair/datasets/__init__.py | 8 ++++---- altair/datasets/_typing.py | 24 +++++++++++++++--------- tools/datasets/__init__.py | 28 +++++++++++++++++++--------- 3 files changed, 38 insertions(+), 22 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index b6f983754..d6acbf4c2 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -146,7 +146,7 @@ def __call__( Parameters ---------- name - Name of the dataset/`stem`_ of file name. + Name of the dataset/`Path.stem`_. suffix File extension/`Path.suffix`_. @@ -157,7 +157,7 @@ def __call__( **kwds Arguments passed to the underlying read function. - .. _stem: + .. _Path.stem: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem .. _Path.suffix: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix @@ -179,7 +179,7 @@ def url( Parameters ---------- name - Name of the dataset/`stem`_ of file name. + Name of the dataset/`Path.stem`_. suffix File extension/`Path.suffix`_. @@ -188,7 +188,7 @@ def url( tag Version identifier for a `vega-datasets release`_. - .. _stem: + .. _Path.stem: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem .. _Path.suffix: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index 0a86bc6ba..ed9ca99a6 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -149,16 +149,16 @@ class Metadata(TypedDict, total=False): Parameters ---------- dataset_name - Equivalent to ``Pathlib.Path.stem``. + Name of the dataset/`Path.stem`_. ext_supported Dataset can be read as tabular data. file_name - Equivalent to ``Pathlib.Path.name``. + Equivalent to `Path.name`_. name_collision - Dataset is available via multiple ``suffix``(s). + Dataset is available via multiple formats. .. note:: - Requires specifying a preference in calls to ``data(ext=...)``. + Requires specifying a preference in calls to ``data(name, suffix=...)`` sha Unique hash for the dataset. @@ -169,14 +169,20 @@ class Metadata(TypedDict, total=False): size File size (*bytes*). suffix - File extension. - - .. note:: - Equivalent to ``Pathlib.Path.suffix`` + File extension/`Path.suffix`_. tag - ``vega-datasets`` release version. + Version identifier for a `vega-datasets release`_. url_npm Remote url used to access dataset. + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.name: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.name + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. 
_vega-datasets release: + https://github.com/vega/vega-datasets/releases """ dataset_name: str diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index f318f292e..5e2ca1dd7 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -158,28 +158,38 @@ def generate_typing(self, output: Path, /) -> None: NOTE_SEP = f"\n\n{indent * 2}" f".. note::\n{indent * 3}" name_collision = ( - f"Dataset is available via multiple ``suffix``(s).{NOTE_SEP}" - "Requires specifying a preference in calls to ``data(ext=...)``." + f"Dataset is available via multiple formats.{NOTE_SEP}" + "Requires specifying a preference in calls to ``data(name, suffix=...)``" ) sha = ( f"Unique hash for the dataset.{NOTE_SEP}" f"If the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{indent * 3}" f"then all ``tag``(s) in this range would **share** this value." ) + links = ( + f".. _Path.stem:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem\n" + f".. _Path.name:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.name\n" + f".. _Path.suffix:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix\n" + f".. _vega-datasets release:\n{indent * 2}https://github.com/vega/vega-datasets/releases" + ) + descriptions: dict[str, str] = { - "dataset_name": "Equivalent to ``Pathlib.Path.stem``.", + "dataset_name": "Name of the dataset/`Path.stem`_.", "ext_supported": "Dataset can be read as tabular data.", - "file_name": "Equivalent to ``Pathlib.Path.name``.", + "file_name": "Equivalent to `Path.name`_.", "name_collision": name_collision, "sha": sha, "size": "File size (*bytes*).", - "suffix": f"File extension.{NOTE_SEP}Equivalent to ``Pathlib.Path.suffix``", - "tag": "``vega-datasets`` release version.", + "suffix": "File extension/`Path.suffix`_.", + "tag": "Version identifier for a `vega-datasets release`_.", "url_npm": "Remote url used to access dataset.", } - metadata_doc = f"\n{indent}".join( - f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" - for param in metadata_schema + metadata_doc = ( + f"\n{indent}".join( + f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" + for param in metadata_schema + ) + + f"\n\n{links}" ) contents = ( From e6504546f89831930168e6bcaa7150f690ef4709 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 18:14:32 +0000 Subject: [PATCH 090/201] feat: Update to `v2.10.0`, fix tag inconsistency - Noticed one branch that missed the join to `npm` - Moved the join to `.tags()` and added a doc - https://github.com/vega/vega-datasets/releases/tag/v2.10.0 --- altair/datasets/_metadata/metadata.parquet | Bin 18641 -> 19128 bytes altair/datasets/_typing.py | 1 + tools/datasets/_metadata/tags.parquet | Bin 6200 -> 6247 bytes tools/datasets/_metadata/tags_npm.parquet | Bin 2596 -> 2597 bytes tools/datasets/github.py | 43 ++++++++++++++------- 5 files changed, 31 insertions(+), 13 deletions(-) diff --git a/altair/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet index 5e7b3bd06439ace1d5eb8387efecde322cca91d7..969f64b18f44b812f11e0e1f34a58c6b592c994a 100644 GIT binary patch delta 7562 zcmb7J1z1#1zdpNkFR=?t=Mo|zNP{%ef`BLu0*Z7fQfEOLq*IWRk`NF@8l+1>KtPag zr9)EoBL3_9?!Di0@7a0I&Y5TC-PzwebDsB|-=|nebt0tP7D~{@2_Glsg`e@pJ40Du zbNCEMKo5ll0Kge91n*O2HdGr520*eYe9Q&EL>g31>ZmZ^Q07uh!Yg4hGK%9TjVotu 
diff --git a/tools/datasets/github.py b/tools/datasets/github.py
index 6bde876ae..3e57cd469 100644
--- a/tools/datasets/github.py
+++ b/tools/datasets/github.py
@@ -326,10 +326,33 @@ def delay(self, rate_limit: ParsedRateLimit | None = None, /) ->
float: return self.req.delay(is_auth=limit["is_auth"]) def tags( - self, n_head: int | None = None, *, warn_lower: bool = False + self, + n_head: int | None = None, + *, + npm_tags: pl.DataFrame | pl.LazyFrame | None = None, + warn_lower: bool = False, ) -> pl.DataFrame: + """ + Get release info, enhance with `SemVer`_ context. + + Parameters + ---------- + n_head + Limit to most recent releases. + npm_tags + Used to remove any github-only releases. + warn_lower + Emit a warning if fewer than ``n_head`` tags were returned. + + .. _SemVer: + https://semver.org/#semantic-versioning-200 + """ tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) - return pl.DataFrame(self.parse.tags(tags)).pipe(semver.with_columns) + frame = pl.DataFrame(self.parse.tags(tags)).pipe(semver.with_columns) + if npm_tags is not None: + return frame.lazy().join(npm_tags.lazy().select("tag"), on="tag").collect() + else: + return frame def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: """Retrieve directory info for a given version ``tag``.""" @@ -394,29 +417,23 @@ def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: npm_tag_only = npm_tags.lazy().select("tag") fp = self._paths["tags"] if not limit["is_auth"] and limit["remaining"] <= self.req._TAGS_COST: - return ( - pl.scan_parquet(fp).join(npm_tag_only, on="tag", how="inner").collect() - ) + return pl.scan_parquet(fp).join(npm_tag_only, on="tag").collect() elif not fp.exists(): print(f"Initializing {fp!s}") - tags = ( - self.tags().lazy().join(npm_tag_only, on="tag", how="inner").collect() - ) + tags = self.tags(npm_tags=npm_tag_only) print(f"Collected {tags.height} new tags") return tags else: print("Checking for new tags") prev = pl.scan_parquet(fp) - latest = ( - self.tags(1).lazy().join(npm_tag_only, on="tag", how="inner").collect() - ) + latest = self.tags(1, npm_tags=npm_tag_only) if latest.equals(prev.pipe(semver.sort).head(1).collect()): print(f"Already up-to-date {fp!s}") return prev.collect() print(f"Refreshing {fp!s}") prev_eager = prev.collect() tags = ( - pl.concat((self.tags(), prev_eager), how="vertical") + pl.concat((self.tags(npm_tags=npm_tag_only), prev_eager)) .unique("sha") .pipe(semver.sort) ) @@ -434,7 +451,7 @@ def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: raise NotImplementedError(rate_limit, cost) print( f"Collecting metadata for {n} missing releases.\n" - f"Using {self.delay(rate_limit)}[ms] between requests ..." + f"Using {self.delay(rate_limit):.2f}[ms] between requests ..." ) dfs: list[pl.DataFrame] = [] for tag in tags: From 72296b0e630dad0d2d7c397c6e4887d74c537846 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 18:25:13 +0000 Subject: [PATCH 091/201] refactor: Tidying up `tools.datasets` --- tools/datasets/github.py | 31 ++++++++++++++----------------- tools/datasets/semver.py | 19 ++++++++----------- 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 3e57cd469..385ac1079 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -13,6 +13,7 @@ from typing import IO, TYPE_CHECKING, Any, ClassVar, Literal, TypeVar, cast import polars as pl +from polars import col from tools.datasets import semver from tools.datasets.models import ( @@ -171,9 +172,9 @@ def _request(self, url: str, /, *, raw: bool = False) -> Request: See `Media types`_. .. 
_personal access token: - https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens + https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens .. _Media types: - https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types + https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types """ headers: MutableMapping[str, str] = {"X-GitHub-Api-Version": self._VERSION} if tok := os.environ.get(self._ENV_VAR): @@ -267,7 +268,6 @@ class GitHub: https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree .. _rate_limit: https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user - """ _opener: ClassVar[OpenerDirector] = urllib.request.build_opener(_ErrorHandler) @@ -359,17 +359,16 @@ def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: trees = self.req.trees(tag) tag_v = self.parse.tag_from_str(tag) if _is_str(tag) else tag["tag"] parsed = self.parse.trees(trees, tag=tag_v) + url = pl.concat_str( + pl.lit(self._npm_cdn_url), + col("tag"), + pl.lit(f"/{_DATA}/"), + col("file_name"), + ) df = ( - pl.DataFrame(parsed) - .lazy() - .with_columns(name_collision=pl.col("dataset_name").is_duplicated()) + pl.LazyFrame(parsed) .with_columns( - url_npm=pl.concat_str( - pl.lit(self._npm_cdn_url), - pl.col("tag"), - pl.lit(f"/{_DATA}/"), - pl.col("file_name"), - ) + name_collision=col("dataset_name").is_duplicated(), url_npm=url ) .collect() ) @@ -397,12 +396,10 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: result = self._trees_batched(_iter_rows(gh_tags, stop, TP)) else: trees = ( - pl.scan_parquet(fp) - .with_columns(pl.col("tag").cast(pl.String)) - .collect() + pl.scan_parquet(fp).with_columns(col("tag").cast(pl.String)).collect() ) missing_trees = gh_tags.join( - trees.select(pl.col("tag").unique()), on="tag", how="anti" + trees.select(col("tag").unique()), on="tag", how="anti" ) if missing_trees.is_empty(): print(f"Already up-to-date {fp!s}") @@ -410,7 +407,7 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: else: fresh = self._trees_batched(_iter_rows(missing_trees, stop, TP)) result = pl.concat((trees, fresh)) - return result.with_columns(pl.col("tag").cast(semver.tag_enum(gh_tags))) + return result.with_columns(col("tag").cast(semver.tag_enum(gh_tags))) def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: limit = self.rate_limit(strict=True) diff --git a/tools/datasets/semver.py b/tools/datasets/semver.py index 57f6d509f..f18e1e992 100644 --- a/tools/datasets/semver.py +++ b/tools/datasets/semver.py @@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Literal import polars as pl +from polars import col if TYPE_CHECKING: from typing import TypeVar @@ -24,14 +25,14 @@ CANARY: Literal["--canary"] = "--canary" -def with_columns(frame: _Frame, /, *, col_tag: str = "tag") -> _Frame: +def with_columns(frame: _Frame, /, *, tag: str = "tag") -> _Frame: """ Extracts components of a `SemVer`_ string into sortable columns. .. _SemVer: https://semver.org/#backusnaur-form-grammar-for-valid-semver-versions """ - fields = pl.col(_SEM_VER_FIELDS) + fields = col(_SEM_VER_FIELDS) pattern = r"""(?x) v?(?[[:digit:]]*)\. (?[[:digit:]]*)\. @@ -39,12 +40,12 @@ def with_columns(frame: _Frame, /, *, col_tag: str = "tag") -> _Frame: (\-(next)?(beta)?\.)? 
(?[[:digit:]]*)? """ - sem_ver = pl.col(col_tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) + sem_ver = col(tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) ldf = ( frame.lazy() .with_columns(sem_ver) .with_columns(pl.when(fields.str.len_chars() > 0).then(fields).cast(pl.Int64)) - .with_columns(is_pre_release=pl.col("pre_release").is_not_null()) + .with_columns(is_pre_release=col("pre_release").is_not_null()) ) if isinstance(frame, pl.DataFrame): return ldf.collect() @@ -52,14 +53,10 @@ def with_columns(frame: _Frame, /, *, col_tag: str = "tag") -> _Frame: return ldf -def tag_enum(frame: _Frame, /, *, col_tag: str = "tag") -> pl.Enum: - """Extract an **ascending** order ``pl.Enum`` from ``col_tag``.""" +def tag_enum(frame: _Frame, /, *, tag: str = "tag") -> pl.Enum: + """Extract an **ascending** order ``pl.Enum`` from ``tag``.""" return pl.Enum( - frame.lazy() - .pipe(sort, descending=False) - .select(col_tag) - .collect() - .get_column(col_tag) + frame.lazy().pipe(sort, descending=False).select(tag).collect().get_column(tag) ) From ca1b500c220a5ef7042bac75070d679696923cc8 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 18:57:38 +0000 Subject: [PATCH 092/201] revert: Remove tags schema files --- tools/datasets/_metadata/tags-schema.json | 10 ---------- tools/datasets/_metadata/tags_npm-schema.json | 8 -------- 2 files changed, 18 deletions(-) delete mode 100644 tools/datasets/_metadata/tags-schema.json delete mode 100644 tools/datasets/_metadata/tags_npm-schema.json diff --git a/tools/datasets/_metadata/tags-schema.json b/tools/datasets/_metadata/tags-schema.json deleted file mode 100644 index 80f248a66..000000000 --- a/tools/datasets/_metadata/tags-schema.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "tag": "str", - "sha": "str", - "trees_url": "str", - "major": "int", - "minor": "int", - "patch": "int", - "pre_release": "int", - "is_pre_release": "bool" -} \ No newline at end of file diff --git a/tools/datasets/_metadata/tags_npm-schema.json b/tools/datasets/_metadata/tags_npm-schema.json deleted file mode 100644 index 90ea9d52e..000000000 --- a/tools/datasets/_metadata/tags_npm-schema.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "tag": "str", - "major": "int", - "minor": "int", - "patch": "int", - "pre_release": "int", - "is_pre_release": "bool" -} \ No newline at end of file From 5bd70d11bce05e75ffce42274ffe5307aaf5cf21 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 19:21:24 +0000 Subject: [PATCH 093/201] ci: Introduce `datasets` refresh to `generate_schema_wrapper` Unrelated to schema, but needs to hook in somewhere --- tools/datasets/__init__.py | 21 ++++++++++++++++++--- tools/generate_schema_wrapper.py | 3 +++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 5e2ca1dd7..b0730bd32 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -50,6 +50,7 @@ def __init__( self, out_dir_tools: Path, out_dir_altair: Path, + out_fp_typing: Path, *, write_schema: bool, trees_gh: str = "metadata", @@ -78,6 +79,7 @@ def __init__( "gh_trees": self.github._paths["trees"], } ) + self._fp_typing: Path = out_fp_typing @property def github(self) -> GitHub: @@ -87,8 +89,16 @@ def github(self) -> GitHub: def npm(self) -> Npm: return self._npm - def refresh(self) -> pl.DataFrame: - """Update and sync all metadata files.""" + def refresh(self, *, 
include_typing: bool = False) -> pl.DataFrame: + """ + Update and sync all dataset metadata files. + + Parameters + ---------- + include_typing + Regenerate ``altair.datasets._typing``. + """ + print("Syncing datasets ...") npm_tags = self.npm.tags() self.write_parquet(npm_tags, self._paths["npm_tags"]) @@ -97,6 +107,9 @@ def refresh(self) -> pl.DataFrame: gh_trees = self.github.refresh_trees(gh_tags) self.write_parquet(gh_trees, self._paths["gh_trees"]) + + if include_typing: + self.generate_typing(self._fp_typing) return gh_trees def reset(self) -> None: @@ -218,9 +231,11 @@ def generate_typing(self, output: Path, /) -> None: ruff.write_lint_format(output, contents) +_alt_datasets = Path(__file__).parent.parent.parent / "altair" / "datasets" app = Application( Path(__file__).parent / "_metadata", - Path(__file__).parent.parent.parent / "altair" / "datasets" / "_metadata", + _alt_datasets / "_metadata", + _alt_datasets / "_typing.py", write_schema=False, ) diff --git a/tools/generate_schema_wrapper.py b/tools/generate_schema_wrapper.py index e024c2ca1..39b672082 100644 --- a/tools/generate_schema_wrapper.py +++ b/tools/generate_schema_wrapper.py @@ -1373,6 +1373,8 @@ def generate_encoding_artifacts( def main() -> None: + from tools import datasets + parser = argparse.ArgumentParser( prog="generate_schema_wrapper.py", description="Generate the Altair package." ) @@ -1387,6 +1389,7 @@ def main() -> None: output=EXPR_FILE, header=HEADER_COMMENT, ) + datasets.app.refresh(include_typing=True) # The modules below are imported after the generation of the new schema files # as these modules import Altair. This allows them to use the new changes From 012f98b9516ddb05dfb6888e802f3d0c894f206f Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 19:34:28 +0000 Subject: [PATCH 094/201] docs: Add `tools.datasets.Application` doc --- tools/datasets/__init__.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index b0730bd32..f66c22795 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -42,8 +42,27 @@ class Application: """ Top-level context. - When ``write_schema``, addtional ``...-schema.json`` files are produced - that describes the metadata columns. + Parameters + ---------- + out_dir_tools, out_dir_altair + Directories to store ``.parquet`` metadata files. + out_fp_typing + Path to write metadata-derived typing module. + write_schema + Produce addtional ``...-schema.json`` files that describe table columns. + trees_gh + ``GitHub.trees`` metadata file name. + tags_gh + ``GitHub.tags`` metadata file name. + tags_npm + ``Npm.tags`` metadata file name. + kwds_gh, kwds_npm + Arguments passed to corresponding constructor. 
+ + See Also + -------- + - tools.datasets.github.GitHub + - tools.datasets.npm.Npm """ def __init__( From 5e677c05447e177a5bcd78086a2f080584b731e9 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 20:10:19 +0000 Subject: [PATCH 095/201] revert: Remove comment --- tests/utils/test_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index 36ed1b097..2e8ae1214 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -137,7 +137,6 @@ def test_sanitize_pyarrow_table_columns() -> None: ) # Create pyarrow table with explicit schema so that date32 type is preserved - # error: Argument 1 to "schema" has incompatible type "list[object]"; expected "Iterable[Field[Any]] | Iterable[tuple[str, DataType]] | Mapping[str, DataType]" [arg-type] pa_table = pa.Table.from_pandas( df, pa.schema( From a99d2c924786f3a2585f2f84bc4641002f9bafce Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 10:44:10 +0000 Subject: [PATCH 096/201] docs: Add a table preview to `Metadata` --- altair/datasets/_typing.py | 36 ++++++++++++++++++++++++++++++++ tools/datasets/__init__.py | 42 +++++++++++++++++++++++++++++++++++++- 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index 270ac9ab8..c13f847c0 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -184,6 +184,42 @@ class Metadata(TypedDict, total=False): https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix .. _vega-datasets release: https://github.com/vega/vega-datasets/releases + + Examples + -------- + ``Metadata`` keywords form constraints to filter a table like the below sample: + + ``` + shape: (2_879, 9) + ┌───────────┬──────────┬──────────┬──────────┬───┬────────┬─────────┬──────────┐ + │ dataset_n ┆ ext_supp ┆ file_nam ┆ name_col ┆ … ┆ suffix ┆ tag ┆ url_npm │ + │ a… ┆ or… ┆ e ┆ li… ┆ ┆ --- ┆ --- ┆ --- │ + │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ str ┆ enum ┆ str │ + │ str ┆ bool ┆ str ┆ bool ┆ ┆ ┆ ┆ │ + ╞═══════════╪══════════╪══════════╪══════════╪═══╪════════╪═════════╪══════════╡ + │ cars ┆ true ┆ cars.jso ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ + │ ┆ ┆ n ┆ ┆ ┆ ┆ ┆ cd… │ + │ flights-2 ┆ true ┆ flights- ┆ true ┆ … ┆ .arrow ┆ v1.31.1 ┆ https:// │ + │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ + │ flights-2 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v2.9.0 ┆ https:// │ + │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ + │ unemploym ┆ true ┆ unemploy ┆ false ┆ … ┆ .json ┆ v2.7.0 ┆ https:// │ + │ e… ┆ ┆ me… ┆ ┆ ┆ ┆ ┆ cd… │ + │ ffox ┆ false ┆ ffox.png ┆ false ┆ … ┆ .png ┆ v2.5.2 ┆ https:// │ + │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ cd… │ + │ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │ + │ flights-a ┆ true ┆ flights- ┆ false ┆ … ┆ .csv ┆ v1.18.0 ┆ https:// │ + │ i… ┆ ┆ ai… ┆ ┆ ┆ ┆ ┆ cd… │ + │ income ┆ true ┆ income.j ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ + │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ + │ burtin ┆ true ┆ burtin.j ┆ false ┆ … ┆ .json ┆ v2.8.0 ┆ https:// │ + │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ + │ flights-5 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v1.8.0 ┆ https:// │ + │ k ┆ ┆ 5k… ┆ ┆ ┆ ┆ ┆ cd… │ + │ wheat ┆ true ┆ wheat.js ┆ false ┆ … ┆ .json ┆ v1.18.0 ┆ https:// │ + │ ┆ ┆ on ┆ ┆ ┆ ┆ ┆ cd… │ + └───────────┴──────────┴──────────┴──────────┴───┴────────┴─────────┴──────────┘ + ``` """ dataset_name: str diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index f66c22795..44c766850 100644 --- a/tools/datasets/__init__.py +++ 
b/tools/datasets/__init__.py @@ -204,6 +204,45 @@ def generate_typing(self, output: Path, /) -> None: f".. _Path.suffix:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix\n" f".. _vega-datasets release:\n{indent * 2}https://github.com/vega/vega-datasets/releases" ) + import textwrap + + examples = f"""\ + Examples + -------- + ``{METADATA_TD}`` keywords form constraints to filter a table like the below sample: + + ``` + shape: (2_879, 9) + ┌───────────┬──────────┬──────────┬──────────┬───┬────────┬─────────┬──────────┐ + │ dataset_n ┆ ext_supp ┆ file_nam ┆ name_col ┆ … ┆ suffix ┆ tag ┆ url_npm │ + │ a… ┆ or… ┆ e ┆ li… ┆ ┆ --- ┆ --- ┆ --- │ + │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ str ┆ enum ┆ str │ + │ str ┆ bool ┆ str ┆ bool ┆ ┆ ┆ ┆ │ + ╞═══════════╪══════════╪══════════╪══════════╪═══╪════════╪═════════╪══════════╡ + │ cars ┆ true ┆ cars.jso ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ + │ ┆ ┆ n ┆ ┆ ┆ ┆ ┆ cd… │ + │ flights-2 ┆ true ┆ flights- ┆ true ┆ … ┆ .arrow ┆ v1.31.1 ┆ https:// │ + │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ + │ flights-2 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v2.9.0 ┆ https:// │ + │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ + │ unemploym ┆ true ┆ unemploy ┆ false ┆ … ┆ .json ┆ v2.7.0 ┆ https:// │ + │ e… ┆ ┆ me… ┆ ┆ ┆ ┆ ┆ cd… │ + │ ffox ┆ false ┆ ffox.png ┆ false ┆ … ┆ .png ┆ v2.5.2 ┆ https:// │ + │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ cd… │ + │ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │ + │ flights-a ┆ true ┆ flights- ┆ false ┆ … ┆ .csv ┆ v1.18.0 ┆ https:// │ + │ i… ┆ ┆ ai… ┆ ┆ ┆ ┆ ┆ cd… │ + │ income ┆ true ┆ income.j ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ + │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ + │ burtin ┆ true ┆ burtin.j ┆ false ┆ … ┆ .json ┆ v2.8.0 ┆ https:// │ + │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ + │ flights-5 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v1.8.0 ┆ https:// │ + │ k ┆ ┆ 5k… ┆ ┆ ┆ ┆ ┆ cd… │ + │ wheat ┆ true ┆ wheat.js ┆ false ┆ … ┆ .json ┆ v1.18.0 ┆ https:// │ + │ ┆ ┆ on ┆ ┆ ┆ ┆ ┆ cd… │ + └───────────┴──────────┴──────────┴──────────┴───┴────────┴─────────┴──────────┘ + ``` + """ descriptions: dict[str, str] = { "dataset_name": "Name of the dataset/`Path.stem`_.", @@ -221,7 +260,8 @@ def generate_typing(self, output: Path, /) -> None: f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" for param in metadata_schema ) - + f"\n\n{links}" + + f"\n\n{links}\n\n" + f"{textwrap.indent(textwrap.dedent(examples), indent)}" ) contents = ( From 7e6da39db8f9bbb691c5a734b2ed96e953fe35f4 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 11:49:30 +0000 Subject: [PATCH 097/201] docs: Add examples for `Loader.__call__` --- altair/datasets/__init__.py | 88 ++++++++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 7 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index d6acbf4c2..d3a93cfa7 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -35,6 +35,7 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): from altair.datasets import Loader data = Loader.with_backend("polars") + >>> data # doctest: +SKIP Loader[polars] .. 
_vega-datasets: @@ -96,7 +97,7 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: data = Loader.with_backend("polars") cars = data("cars") - type(cars) + >>> type(cars) # doctest: +SKIP polars.dataframe.frame.DataFrame Using ``pandas``: @@ -104,7 +105,7 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: data = Loader.with_backend("pandas") cars = data("cars") - type(cars) + >>> type(cars) # doctest: +SKIP pandas.core.frame.DataFrame Using ``pandas``, backed by ``pyarrow`` dtypes: @@ -112,10 +113,10 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: data = Loader.with_backend("pandas[pyarrow]") cars = data("cars", tag="v1.29.0") - type(cars) + >>> type(cars) # doctest: +SKIP pandas.core.frame.DataFrame - cars.dtypes + >>> cars.dtypes # doctest: +SKIP Name string[pyarrow] Miles_per_Gallon double[pyarrow] Cylinders int64[pyarrow] @@ -131,7 +132,6 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: obj._reader = backend(backend_name) return obj - # TODO: docs (examples) def __call__( self, name: DatasetName | LiteralString, @@ -163,6 +163,80 @@ def __call__( https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix .. _vega-datasets release: https://github.com/vega/vega-datasets/releases + + Examples + -------- + Using ``polars``: + + from altair.datasets import Loader + + data = Loader.with_backend("polars") + source = data("stocks", tag="v2.10.0") + + >>> source.columns # doctest: +SKIP + ['symbol', 'date', 'price'] + + >>> source # doctest: +SKIP + shape: (560, 3) + ┌────────┬────────────┬────────┐ + │ symbol ┆ date ┆ price │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ f64 │ + ╞════════╪════════════╪════════╡ + │ MSFT ┆ Jan 1 2000 ┆ 39.81 │ + │ MSFT ┆ Feb 1 2000 ┆ 36.35 │ + │ MSFT ┆ Mar 1 2000 ┆ 43.22 │ + │ MSFT ┆ Apr 1 2000 ┆ 28.37 │ + │ MSFT ┆ May 1 2000 ┆ 25.45 │ + │ … ┆ … ┆ … │ + │ AAPL ┆ Nov 1 2009 ┆ 199.91 │ + │ AAPL ┆ Dec 1 2009 ┆ 210.73 │ + │ AAPL ┆ Jan 1 2010 ┆ 192.06 │ + │ AAPL ┆ Feb 1 2010 ┆ 204.62 │ + │ AAPL ┆ Mar 1 2010 ┆ 223.02 │ + └────────┴────────────┴────────┘ + + Using ``pandas``: + + data = Loader.with_backend("pandas") + source = data("stocks", tag="v2.10.0") + + >>> source.columns # doctest: +SKIP + Index(['symbol', 'date', 'price'], dtype='object') + + >>> source # doctest: +SKIP + symbol date price + 0 MSFT Jan 1 2000 39.81 + 1 MSFT Feb 1 2000 36.35 + 2 MSFT Mar 1 2000 43.22 + 3 MSFT Apr 1 2000 28.37 + 4 MSFT May 1 2000 25.45 + .. ... ... ... 
+ 555 AAPL Nov 1 2009 199.91 + 556 AAPL Dec 1 2009 210.73 + 557 AAPL Jan 1 2010 192.06 + 558 AAPL Feb 1 2010 204.62 + 559 AAPL Mar 1 2010 223.02 + + [560 rows x 3 columns] + + Using ``pyarrow``: + + data = Loader.with_backend("pyarrow") + source = data("stocks", tag="v2.10.0") + + >>> source.column_names # doctest: +SKIP + ['symbol', 'date', 'price'] + + >>> source # doctest: +SKIP + pyarrow.Table + symbol: string + date: string + price: double + ---- + symbol: [["MSFT","MSFT","MSFT","MSFT","MSFT",...,"AAPL","AAPL","AAPL","AAPL","AAPL"]] + date: [["Jan 1 2000","Feb 1 2000","Mar 1 2000","Apr 1 2000","May 1 2000",...,"Nov 1 2009","Dec 1 2009","Jan 1 2010","Feb 1 2010","Mar 1 2010"]] + price: [[39.81,36.35,43.22,28.37,25.45,...,199.91,210.73,192.06,204.62,223.02]] """ return self._reader.dataset(name, suffix, tag=tag, **kwds) @@ -203,7 +277,7 @@ def url( from altair.datasets import Loader data = Loader.with_backend("polars") - data.url("cars", tag="v2.9.0") + >>> data.url("cars", tag="v2.9.0") # doctest: +SKIP 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json' We can pass the result directly to a chart: @@ -231,7 +305,7 @@ def cache_dir(self) -> Path | None: data = Loader.with_backend("polars") data.cache_dir = Path.home() / ".altair_cache" - data.cache_dir.relative_to(Path.home()).as_posix() + >>> data.cache_dir.relative_to(Path.home()).as_posix() # doctest: +SKIP '.altair_cache' """ return self._reader._cache From b49e679e58729930513a54d13f039039bc9a0837 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 12:02:43 +0000 Subject: [PATCH 098/201] refactor: Rename `DatasetName` -> `Dataset`, `VersionTag` -> `Version` --- altair/datasets/__init__.py | 10 +++++----- altair/datasets/_readers.py | 15 ++++++--------- altair/datasets/_typing.py | 6 +++--- tests/test_datasets.py | 10 ++++------ tools/datasets/__init__.py | 4 ++-- 5 files changed, 20 insertions(+), 25 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index d3a93cfa7..3760a4f2a 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -21,7 +21,7 @@ else: from typing_extensions import LiteralString from altair.datasets._readers import _Backend - from altair.datasets._typing import DatasetName, Extension, VersionTag + from altair.datasets._typing import Dataset, Extension, Version __all__ = ["Loader", "data"] @@ -134,10 +134,10 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: def __call__( self, - name: DatasetName | LiteralString, + name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | None = None, + tag: Version | None = None, **kwds: Any, ) -> IntoDataFrameT: """ @@ -242,10 +242,10 @@ def __call__( def url( self, - name: DatasetName | LiteralString, + name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | None = None, + tag: Version | None = None, ) -> str: """ Return the address of a remote dataset. 
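(Illustrative sketch, not part of the patch: how the renamed ``Dataset``/``Version`` aliases
are intended to pair with ``Loader`` after this commit. It assumes the ``polars`` backend and
the ``"cars"``/``"v2.10.0"`` values shown in the docstring examples above, and that the aliases
remain importable from the private ``altair.datasets._typing`` module.)

    from altair.datasets import Loader
    from altair.datasets._typing import Dataset, Version

    # Annotating with the generated Literal aliases lets a type checker flag
    # misspelled dataset names or unknown release tags before any request is made.
    name: Dataset = "cars"
    tag: Version = "v2.10.0"

    data = Loader.with_backend("polars")
    cars = data(name, tag=tag)     # load the dataset as a polars DataFrame
    url = data.url(name, tag=tag)  # or only resolve the remote CDN address
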
diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 57b290c32..9b0e7007c 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -60,7 +60,7 @@ else: from typing_extensions import TypeAlias - from altair.datasets._typing import DatasetName, Extension, Metadata, VersionTag + from altair.datasets._typing import Dataset, Extension, Metadata, Version from altair.vegalite.v5.schema._typing import OneOrSeq _ExtensionScan: TypeAlias = Literal[".parquet"] @@ -129,10 +129,10 @@ def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: def dataset( self, - name: DatasetName | LiteralString, + name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | None = None, + tag: Version | None = None, **kwds: Any, ) -> IntoDataFrameT: df = self.query(**validate_constraints(name, suffix, tag)) @@ -156,10 +156,10 @@ def dataset( def url( self, - name: DatasetName | LiteralString, + name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | None = None, + tag: Version | None = None, ) -> str: frame = self.query(**validate_constraints(name, suffix, tag)) url = nw.to_py_scalar(frame.item(0, "url_npm")) @@ -398,10 +398,7 @@ def _parse_predicates_constraints( def validate_constraints( - name: DatasetName | LiteralString, - suffix: Extension | None, - tag: VersionTag | None, - /, + name: Dataset | LiteralString, suffix: Extension | None, tag: Version | None, / ) -> Metadata: constraints: Metadata = {} suffixes = ".csv", ".json", ".tsv", ".arrow" diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index c13f847c0..e9546d2b1 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -17,9 +17,9 @@ from typing_extensions import TypeAlias -__all__ = ["DatasetName", "Extension", "Metadata", "VersionTag"] +__all__ = ["Dataset", "Extension", "Metadata", "Version"] -DatasetName: TypeAlias = Literal[ +Dataset: TypeAlias = Literal[ "airports", "annual-precip", "anscombe", @@ -95,7 +95,7 @@ "world-110m", "zipcodes", ] -VersionTag: TypeAlias = Literal[ +Version: TypeAlias = Literal[ "v2.10.0", "v2.9.0", "v2.8.1", diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 1b866cf58..6d349dc9b 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -12,7 +12,7 @@ from narwhals.stable import v1 as nw from altair.datasets import Loader -from altair.datasets._typing import DatasetName +from altair.datasets._typing import Dataset from tests import skip_requires_pyarrow, slow if TYPE_CHECKING: @@ -333,9 +333,7 @@ def test_reader_cache( @pytest.mark.parametrize("fallback", ["polars", None]) @skip_requires_pyarrow def test_pyarrow_read_json( - fallback: _Polars | None, - dataset: DatasetName, - monkeypatch: pytest.MonkeyPatch, + fallback: _Polars | None, dataset: Dataset, monkeypatch: pytest.MonkeyPatch ) -> None: monkeypatch.setenv(CACHE_ENV_VAR, "") monkeypatch.delitem(sys.modules, "pandas", raising=False) @@ -348,9 +346,9 @@ def test_pyarrow_read_json( @datasets_debug -@pytest.mark.parametrize("name", get_args(DatasetName)) +@pytest.mark.parametrize("name", get_args(Dataset)) def test_all_datasets( - name: DatasetName, polars_loader: Loader[pl.DataFrame, pl.LazyFrame] + name: Dataset, polars_loader: Loader[pl.DataFrame, pl.LazyFrame] ) -> None: """Ensure all annotated datasets can be loaded with the most reliable backend.""" frame = polars_loader(name) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 44c766850..c1c7e0655 100644 --- 
a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -182,8 +182,8 @@ def generate_typing(self, output: Path, /) -> None: .to_series() ) indent = " " * 4 - NAME = "DatasetName" - TAG = "VersionTag" + NAME = "Dataset" + TAG = "Version" EXT = "Extension" METADATA_TD = "Metadata" DESCRIPTION_DEFAULT = "_description_" From 7a14394093cba4b78613f0afe0754a8d0886d966 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 12:18:49 +0000 Subject: [PATCH 099/201] fix: Ensure latest `[tag]` appears first When updating from `v2.9.0` -> `v2.10.0`, new tags were appended to the bottom. This invalidated an assumption in `Loader.(dataset|url)` that the first result is the latest --- altair/datasets/_metadata/metadata.parquet | Bin 19128 -> 18921 bytes tools/datasets/github.py | 7 ++++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/altair/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet index 969f64b18f44b812f11e0e1f34a58c6b592c994a..d47c4ebed0528df5c68dedf307f03f66fec5e63f 100644 GIT binary patch delta 12802 zcma)?2Uru|y6)2m9YP5Z2oNAt5vkI9?;VjQy$MPeX$id}5I{hY-m8F$i1aSKD2Pgz zA_#~gO}J72d+&3edu~~oJekZ|Yi2U@{_?%wtkseU+`<7H+(E!K+-(pKfhq!k=Elv_ zBXP~3A+Wvm7d zT~f>{FkXX;R;$ma%Tbg`ZvcLy}YYVx|vJiiIWiBDqt~ zd)NJ>ZN`6sq0vg8lbn0D0u{hYU0w?4lmvp}OrbDUlAjS6o~WkKYnWPKU_?uj)8lP2?D}OI2J;Gl{v~clAVIZrc8A$!NDUEpom#qXmEG|v5lZpGAgR$ zu@lj_;-j|)IR>F61eIkBTDGC}f{EUdR3oRbEb@Xc1hznf;2|rhD9oEO@rtU)@Hti@ z&hZQ@i3SKL4S~N9f}qjbBJNAj{laY9RO2t{FP{4)VGd@=+$X=PU6cicPy3vdZ1MUK2}2oM7Uj36+&31+Y#EDB_WC5_Hf;(chZ zJgH=-Nv-%Xgqlv|Wa+_#0H~GqYgdKSMOb0>=LfdVtwe5bZ4Z8u0@z~w0azj!%7FU= zwCiAmsM%?8UU7=aUez*%K}Q?t^<^`>v;uV&3&e#(Ss;%=Tsuw0&_xjNF)or=kbsjB zpu&sSxIT`MBnu}8X2np%tFBTJ|8GLhNLY^;a=rdIWrN02-w1zO-yQzKD*o0fOB&mM zROR#9wh>Y5{OaGT#8VNd@>|M^Zg?qc|CaI}CI1#B0x!xFLFJ;)j?KA;M4uPjCuoyb}=fW{}#}C`lS2Vf{47WmqwHziiey~#lCVR zwL((ZV)TCSXNvfVqJuaH1PUp|gUSCdS>q&_$72!HV(R}!T3fAJ^6oh17q*Ly%0h-6 z7+zwlV8BQb*K3r`X6XBgk^xCw*vA0NcxvLTF@qF9)P8Q0^~D4GlXQiQNXwj0WClrJ zA$d&#t(0isJLD?1v77Vymd~i+Kis_GES`P*!X@ktUgz#9c%3VaQc3;MdGGIk2>gYe zMpl~fV-Rhh9uQq+(HTv1?@ON?`0^``S@E~jO9=g*+S&x%x?|_*1Sg}r;HO(rpvyvVtue~r9uOl5;Q)9u1YY(V0}LZT7XJ+xr4(}ZF^YZMDp->lR;${ zw=qp0-vS4OS&XD4PL`|~Iea_w=jm6BJ(K?sn9N*vwamgWVZko7^Fe6PCdN9J zwMpV58}n3}5bL&8bI#a)8%>tkF*@8rR7$(oN1scv zpkFSOhDIh!y|(gc(G6Mld~wcwkX|z|e`u<-@V&CuB0_!Or0?369^_vC=ktPvcU9Gq z37&gDLsvF3cN9Y$Jy^99+*tcXj11ewJms8QRYwuVk~8vS_JYH73&xMKYvwQmyzVF0 z_Fn7nbEyxnkZ=rD`30;w_%PJ@wbQ;BtvEF*(BZM#aYSgSIUx*9yxe_dF>T9wb% z`UNhoB|Rg>A7hU9B4TL9l6ZP2 zLgJ`AJk6m11=vA-f(11@PFJMK1V#(Q$nliGN)uYsFwi>U0XEYg4u&UusFc;dao^XF zdmOPO7t`%!p)9KqF8Oi`Wux5u&Xg&jwAe;cJ127Zxw~l`m;Vt(Otgn%uK{&t>xkB` z3!eIp*t>_LN_6#RpEiityX-;&zFi4P^uE>EX&Ru#wZ3?1Y8am8$q;(+vvJN96Ir=- zkCm@nh`@4E)=|o0$bT<3j>J#-aoEm2+;5q@a{4&aEx>ey(ppYdhAZvXme?qVZtBg~ z6_i6=8|f8^7|tPC4v^WU%H?Tqjh64TclVb@Kerc@_|;9=94K-Yt>~Vhv#(66>^_j9 zI4JZdOeBr&j+AF@A4=-BtpBj!z&$&2YIRSwPh{7#p{c`uooi7x)W4w69o073dOgT6CyA%iX=0$_QlrrE|$nAB)??J7K=^w>u=Qasgy~0 zF3|7Y)sBZ``0tFS!5ZtS-}Tm$-D#V7{o<_rkXdC}sb^x0FSg(fi=0Y}%&ikMmTm8? 
z$`Hpo9bIAiI_+`GC7HwNvllwaWuLf6>HDLzjL`gP4$ z!VqMSi)yA0LO0w?`C1a#PQQbUIlMS64d44S=zQ-W)w?Bnt;VkDG`~9{l+E9aPR69K zjuAwD5>q>V;D0av2)^N66UbJDkHYL7_)#Me!YI>fmXR#Wfh-uZ~t*suXkPG zdrsAmwB5m4-}wML&R*XDa1c?(hkA)PT8oL0dAWu@ai-Fg9kqlXngODGyKdcLL0^bS zmEEJCydI^VB)CHn==;b{(w*(lUf!+UDfB_cvT&QO^kme}C$zQ1v*OV;Cm8o{?{-7^ zTu>{gRrgP{N!UD|6$RJpBFCJPDxK=w!~K1QA~+Kp=2MN0@1FU`z9EsO@SNGx&3=JU zK1vc5p$nEGE-@&|kbQ`t%Tsq;rtt;`Ip^kvwh*s~XP|SQ+1dO?M7;5*D8j^Jg8?iTs~>FSV+Wq|Iu5MlZA0APmJSF9x)5;;ZX5| zym`g*qzAKeE88`&)4=#lA-kipcVZqGpyB8)fYCZ~19|xTjhD44%3fKU(@-}^ZM9@%?p!&hbZz&j2GpB}>iBz_!7TX&Eo^|jd?+zJSX;}jh;V>Wh{IY)q z6$-))AyHIfP>fq4T*{$9odaM*00=uEPQe=J^$s72{eA108&aFnC=0+* z@VCpQCL>f)dTJ|*q|{JJFEPo`b~eVff;e#245VyhVD`d5Va60fG`pq|-!NxW<-AwH z6G|R5W13bO4PR*jE}nq}L$Co%Gmt@$4}huD(53M8^!XVN34?J$TIysC&S~OIBh{N% z-;vVcMqR}g$RAGc2Ffl{GSL1PQ+Mi%*9I~Xj$V&K8 zF7M1C=4Fh^+&|8{qUE>qUhiEjMq7mfDauuq*;B`w zIyR~RKE69I@g5wrL+#SXK@DSoRgCpW7(SEBQIgHStr5kXV2O9+&`&Xc9XT11$%ftI z3XA-4u_$JGG!2n-y$;t{Laet#j7WgV2cIEV*Zu2{U*D3{b^mcQ88JTh*QFbJ<6U}T z*Me8rx@c7r&Fn}}e+oJIm2yLJIRr&N*^iJh%p8@J&E5^YRmMxm=ZcL; z=E@@b-_Bj)3QG^rjyv=HTpAx=)@0{z|2}6O+RjXv;>~P`YT(L1G=U|AI!$SPh`tjH z_G53KOdNRRXp+9(tp(jREYm$Ew~^^|(wV#t5)YyBBkJ$5M)pgPnPC-KvcsCccW6V& zAOtjkjwhE-dNi3O8t^Thg2Kb6OAz8Pk>EEe$Z+2K1e~cg<_S>td0p%#effus81j^2 zC8q))ghHafIT#K6(mr4IkbEkERV*A8{~w1B_I0)U>#KL}(KU#3J;&8D*}_{0bl`9Q z9f&jh!Qv=PHY4gk1Uz>HY;=gDE+loK%z^862f zJ?p*`7a!c{XaPVhUYlz zpQk)y3G3-2uGf)e8#MZs5?5q$p6Jgy{&$XfD2G;R%ms#$oDUuG2xI<9nN4y37lwgV zc!mwP(*LH+y*Hm)r6WdhTPHfo?AuN6|1kW+{of2f?Y;SV=zEBgbF5JCQ7af^f+x6W zit{hQ@lEzG(IUCr(}y`RDl=4-m_H9+xL*7w`2Cmb^0qKM!A*iFg-k*fc0TLr)HtcN z=Xij1gFk+@Kl(#&g-WURUwUy*bjDp6sSW?3H}}JDdfn<5`2V7d&zrG)lpj@0S{*CP z*r@!ghCF|x+lwcc0ah@qEKZdvfVnR$oA0Kf!tC>#+p^bp{+&r1tQS6t(|U-%hU20t z%=F*mr1rjW;?cbWb{6zsz>{mg>;LHW;yoQlKO^$&o|(Vo>fXVV8~^N`XV{`px*yFX z{%3cVrRdh2rKFJwASj?C-$S}*exv)7nZ~8h0fDi`<9pcaqvTaA zWVS9k-UnIa*Y+dL|4<8vL<#^84PGU%3Pe!Y-s6|dv9XSD!Fcl*HHW{#?y$g? 
zjKC&CBnwJkZ<=+MG{7nl!uPN(J+oLU8i3Q!569BR1b`;EC%uvzHktVG6NQN-kEtZ4 zxSL?F2S_aM{Rm-SODRt%yOVd`4^ji?^if5!1_>WiD5Js>)3pKlohg$xVcl}M+muzITy!;@>)nZRpP0?g-7{dmPT{Ap+II$d zUad#pa>XX^JkcogbJxH2ZexV6LjgB}dX91>}8F1Y#@EZ2El!zes&k{xFnUh|m z<2LK@{^L}ZZvrV3O6Y|lO zkq3C0T(}c?;|j!Oqo*{DM$F!QC-=3;o8((J$aN(zbZHl*H5p?`0N~PGMO&Cy_0a5g zODfQ;OSId9!JKpRmaxVA9D9lXIP@;&It@j8(iW4iqD5`<)x5n;MmwYT{Le7pyRG|U zR+-nY@i)jmwEIQqb}4L0p3{Ck;g-{jgpViHIQLI7&*NsfJ*D1qeQgU;LTZ{RSKXI= zR@Kn&G~A>(x25Ie`*=+D&Q`Z9#ciKiwnB9(-k*2PLeg2pjkAi^p>J*GkR;^v#F&@} zQnkhcL+qdxlFO#cfIoLwQK@%?cc3%D!uPPLrf7D=L64En#xc1%rk-Mcc;H^#a867R zeAMJ(Jfg$Z|D~^pRfnd3izMGq+3B!%o072y#_VYt;fJ$szXC1trXAdCJT9|so(?uk zV>>H^ZnBGT3LB-OHGOT>(29(?yBO-ueb*#51qp~E5iAB^M??#XMzI+X^%HfnW=A+w z`M{7>6$w?mHGEni$~&eid~>aQz>yygrZ^b|l%=_a>cW<5wFV4mP!}=nN6eppL-(Fr zJ7A?u$^mSsT;rn{_0&GX=;re48b%?vY&fxE!Fi2F?&HaXt}ur((<0Z~m}k^Y#4fdg zT&i>KD;f%x&-{(<^wE~MMiA67>RhM4L8;lcUbVKK^0Hgu^VDoqy0&%Sd?3*)&>*@@ z)$PqEzK6)6 z>@19xw^)zZ$GM0GdyofzG&N>yumW|;<|J?`4v${NrMeqhSXI5q^>D8V|{=sqYP%-S&Vc>3UNxZ z&a-r+FC(LNzlWg>-P7i|&384?J_4Og7}4vi*Zoq|uIdAwHtt&jfMHrnRkNvNVDe-O zLS&jm0x;Jwwz#RKpsOq)AcXCz*hhdHxYRUrDX8PpA;l052}PxS2gnr&8Sny_Oi5uD z?Ijqf62dx&Ee2moVxVu?yT`>|PK2T3ot{ljy)g`*dZiOvLa!aG`)>2c(#9UA9ETil z2#X)jiC9-+|6PcF?Az5Y`76h7$_(pz*~(tzvM%Px-q7%0PAYVT-(pJk3sOB9oT^@< znHDd9pk7&)lYT|CVMupsd5UzCqx7EI>o)CYSa`e1WR0LIlnx_PHn_deLmbuVZE2G0 zo{Fj{6Wk|ylz#QZ$Cw-pSfvH{Ryi&6`|};BxRx84-guIpdMU-%Um%wmAus8)+S;Gh$xe%Q^YFd0O}|Lq9r#n+f))!aB4-> z@3MV~FNEO{6?Va7-g;y&l7Q5MY!D6zHwkq_KHG5=-~!ss$dS4znod^QP#SU2qH%}3 zO{7HWP-h}8Z(z+QcdM1c?Ae5VPxY@{tVt&SehOxaPk3aXy1~g6%>RuzR57Sn87`>q zKy7ZM(fiz+0i2%KiY@B7BjHPweCG+#24&Y43N;PapY}R2Wz+siysz&rg-|q4h^%KqDuI z2l1E-9rRbDi63XQd(XbBt%tGxLN{FgU>)I%*YXQ$<ncm>{ zi?GXouzI=vKnGg|e?utUHz6z8pxiXTy8l7LT-3yFA4T;Vg8tNT=)zNHXd3mROPDr@ z;Ka)u<>dcl-PP|Zl6Az6#_pLN_i~1pkAgZpJa{4a;nA~a?!lOuEss~^GX(?BcK~Jv z1|wp2oV+)sUm4lX2F`OmTMbgq>%T_dIi(ZYyf0ucfU+Q%1MMzt`+iTVuet70UQafc zH0?;FKO$iBJo(ATOiw7;utbKFwB~cq)KqT?tB+5Ewuf9Ji}dCHzL?_a;q%Ddt3iX#u)WuGmZfA%wETH~Es z&znAVtpY0gJsO2+%Qhfz4ty*4>bu5*w|2hR3H#9w=iLlHJ-QmMgsn-7+QsvY(@yJ` z?c_HIi`ddzMl{ClO`EdbSAo~VD-w)r%cN)%`zr!=?9VE#F(kYa@Pa(Pvd_}=6*F!Y z9R99{n+ERIhIZ6a6a-X3gr*Aa0MpHg6BC*3ZzgOvI+d$q-9Sg~>{HC9!vL)_k@+v1 z4TFL?r0iuM>rBeN$S|<5VZY6W4=j2hSMA{mu91{|slt13sYDDle7A6HXl zQsvGHpo+1in3R4ewhNM#r3MABACHrJ=5CV73H5MuZe!`f9}W5en0W!m7qtN(v_a@uhuq*Ld3hWqyo+MetnDm|w5ROcLMm9$plF5>q%(51RNV`zb1dF})M7@pu4xFS94Y1;$6rJMR{Z&&AkAgMq}_Pcl4+L%+E2MaeANd}S!$5!01$az)T1w7VJ^pjcKb zS~!;7Tvyih{P>Cr?_NB;L-U@|0t+uuQ9v|^h8?t!Q#!~gJlj(tSpTn(_raYTzv zypm}hUaJtPk{joh_GazdftkS}Sz&=!y!jK1rY}VTzF(nJmik&tJ?mVrDV)>2?!6KSKan71A&o2wT48mF9i~oAsKuFXpS&;?*_IDi2M)85*g@jx@`R)A-(V*Wlc zFGn}ul9ZliL43>L)nr97{#Twr)uJ|Y20$`3#mYQDQ$jlRhTTNPN>rVoxtjJTVUY2u z`cn|SNO_=tPBb(&oYCHO?|~cLGJsDD4L^N6Va;rTlslv%WpXYkGcwB4SZ+Ef z|Md++kh|7R_uRo;ctPuI_2M~z!6jR3d%{5`{fS}C%6^)#q``gb%^&&2fmc~1EGoHmL0mXTj!yogcob7Q>4lxmDs6Ap=SZ~!}XDrQ+s%J3i z5By&Doj`MqdgqI6r|5Uj3$bfX8tK{@%*U%wFvCbNt)?UKA~uTksdULT>LS!3gGLvw z0!7Ne#<1z(k6>X-=@Rx9CDTjb22Xhjr26zdFzf=FdB0NOLsq&EAJ$B2+#>p4Zs4M81E|8aZO*VQv zLv_NN34Rrb-&|I7=1B~!e^&YJxb9ctMAOzG^NIoNHi#Vkuzl=n$?(9|#769jc=-kB zCvUuu`XVl@A~~~MmlPZl=kE9iZ}ZD`K(G4()o;j_%3f`p)6B|HkVaTDVYB;l|M^< z43>1!xICugjn{hZZ6Y7(*&zKbf-3VImzN$UQoCepEc4JH#@>r*aLG31lu0)dQ{L@; z-^&?VHqxx{8%0B+-=W4R9IZL^H)_y%*$HprD=7c@{yZ<|p8{6?V=eJ@r(h~7s^1X`-xK5U zC*1rAS-cAA?Ee(%zai|OSVTz-g%AON;8=WT`F~n~V&7NbM!(Rw;{%hf(tm!tx%M5Cch5diA1eo>vrbYPJV{dypil z0eS>8t|>Kb z9@8t*5kp^mH9CDRarg>is>aS*`bt{(+AMj3scN^S-b$yHjsLB|iYb%N&oQZA179-4 z-ykUAJCdnxquIX~z8Ri&(D6MwBFg?2+;TDI^TO&8-chn5C z*+3L~L|zobZTHc7m!wrqtvqDbNyn2DpEwBRDoJzE?VA$AunzB48HVu*RYEBR)`3?K 
zZAu~5ElTHI9?)X^u?!_0mQiy`7ID~_LX0;HA@oDp>ao< z?qDm5z0P`W_Lny;1R}+YfcbXNXSsuosny4vQpL#6fbilCiR{N@-5dmlrW>w7$Vu#H zZg4;%U<=4bZSB(x3dHcr8Cv}s`#~!jr;9708dUhar%kZZ@`$STMys2jv}8;@3);}3 z&yZ!pVAD`(CK*Lb#{)_1R1PWbMMNM|cD|tR3nOn3nxz?IKga1=N$Sh^X&ZbaV~h{- z$-=e^`oXIoP*?+advs~0D%2Hb>KhaK*%%Fz^{1pm{NKk>%3|Wt&y&1cHo6|zee;Mj z+9(CT-A4r!&5f{uQoi^J{>qNaXRjMHRq~C~Q<3BrZq+Vg-KM^`$KN-jT)gFMCifu% z^s6$}XJ4ri!_5a9_=IxDxUCNM4Kc`khiTUS*bB4zz)*7r)rDIEgyWuciIa>FO-Q)j z<<1%R3oK>GG{7Jfdi{zrgTHB7f=cgAD1KnuAU}!@uQ$KHSpjhsx+7J>u3Z$ac^jw{ zEpQAMT&78>f$ALcdxAp<-!9j3D>j-VkYDEag>T5_qTPKYdx7Uvdxk7r*e<(Q64_kW z5Z>3)IZLKEZ1{LYo=}WQ-rX16qI9!~(Hw_qe$XZ4nZdMpi7Oh+VqKpw1-imNaN}BlQAy*)R8GXl30d3W?Gr1?-V31vP{WglXb|9cfD>nidfy8~DrTq|Rgi)bCM)kw-R>iy%~( z%Q0cHb-+OH zIGL3@@}Pid_+vTFe+!yRQQn8}MSX-fEWlTOt7J;iDuH4`!8g5W08dl%Nrp8|z zC_kz;^z+J*k5(m*-BKvbcQZ`aU5eY(jh2CE#T9=7h+Z10FRz#cE1Yk>y|U%rC7qw5 z4xKI6G)PdlB{@((Hq#RbEk$Y{8#c7P8Dki-C2Lb*AZbc6aZnJU-NXhLl`$w{R=Fp` zXF7TrIE7HYD-ZRmQo-?!QMm;?xeS7YDkQQ?Wl>WT?1L~3|3+hAbPOk(RJu4SSD(=+ zj!{T&Mudc1Rf**(+J5&7@Gy#>L+Bwd7RpfivXmHDt?@MOJz!ag2Lh$5wMX<97ALZd z>44g}FV_vnzN+W=%l~>!^i#_Lb%S)pa`4L0SJ%9Ip7#T9@X>nNlGC2%3ZQR7MySU8 z1{YK~y(GqKRH-2x$uTKJC|#QhP`bXyZYh$J-OWVMf(SoDl@oxUAxh})<8kYbm*?Lh z?Cb_cx^K>0D;Yo;92I9`laoR<5YtRfhaaFuAnA7K;g3Q8tY2`#&;PakNEE&T{Ac?I zYj^_HKkH=y@O{dE*5l%U@G9s(U+U6;@QHuz34bv7H|O6qa1f$K1LAY{t{@oBqGdXr)Vu z{@rQ?--;c;b&Bg!W`X`^<}D>%*`MmDOCVegz%_@1sj#sB5A$N_+W#~g#y7+700i z7x4G#Lj?Q+{TQGEc0N9L?*RaGF)<)GBi$yxgD`;W0%xa+{5u@RAYAaLCN}~EmjH0h ooZ%1t|8q+j>01Bvo5J@i0RiZ+SZ4gZWG*7Qr0-34)U+V~3;G_F8~^|S delta 13034 zcmd^_byOVByXJ?%T?co!5G*(ZcZU#yy9EmtEcD>6gF|qa1VXT&!6CRya8C$Ma9EPB z{_eedcK7eC(`TlutGaro`uWT}Z}qzigKJ2FtFi;b&BI=TnGq>SK-!G3CRB3RM{qyD z#z*-+)n2xI&RK9VX6ED;Dv|3f&rS*l_Zh0rNsfcs9WdrBLXF!WBw4DBqe9lG97y*X_tH0**iDIPll{P&yizbyivjtW`#E^~R z0-o4<0wACpk1BufarsIzge^<+BA7OLdxuNOqhcds;$X zq8%gyHLa6204^8L*Mw^>N(ZP})R3cAmt~etMy{aI*H3til~6_79?a~TNMp~Qm>hPE zCE2XcUC&7k8k`6D>hqXr?@}j{5jZ=03c7)yGS?AwO%n4tZCwuRdxRv)>MHU^lMn{` zn2)p@(vDDM6GUXIFl2;x^H73m&2e53x&i0Q&?$&8%Bdf1bo|qRi*YL13@0Rqqol3vX4&A(Ytv8 z1fB^Q`5L6iHe(P9R|Pys8kd@ysjm`=7nfR)XBlyJ?z=tCDY&}m5i-(*CfZl;$#zX` z^^Oh*hy;Q}LX2tY>1OQe?(FR3;$~?9!63Y2CtN}#%mFj55sC1DnBm}v;Xoh|1OZZ0 zi!=&`OMr!-aw6W6fTWopM8B!aRG(eI`7rY}ix0Q;{q{9n;4jIF(EfMD|B~!)Vfsq` zBfd@V&lWs-Lmt1?`%Ao`U3o`g4~mB=qEi6W#MMc$!n!n5X*66dKBk}i z99+UCCdbQ_S;s|1!9Y>e{afWc^mZ}w3=^nHnw5a=qm(Ee*QKmqEKAXT^ppmJ6)5AO z?a&-va-8r&PN}@Df1yjoDL^k7$w|$Oi7ahaIv7_X$gMZz<+Js`@&QpdF-~F#JZ-hM zRf*tZbys6&`5?mFt&)T>yWs_%w#|a{{6D!$R+23jbQ-x0m?Q@t)E|^i_n`fL8v%-u zVnJ?PJ(09@x~T|;{Fs?*++Vtf{{F#eTGf)FR+nbhO8H0j+|W?%`F3$WJjL%}*I>!7 z4tSures?%jPGsaQ4v-TLToeaIkV~K$a%j9%D5$EzrOR~rY86CV+iUg}q zX9IYdWNCf7lEU^TOQ6r#1Sa1;YkOuWSjQ(}o*r8wVkixv<7}hTVy#*Nx;Ku`AvRbf zP;?1FP;g*CAqhVY59X->mcC-vj9K3}v{+GTx5q@6CN7?H{!8AzGhZ?0ma!zH0CtN( zju1t{xCEoXq(P`cfK)svpIn_7WC;e1cVa+Bw?QBbaU2cWHZy6ONoocY9=v##wh<=~ zkbkOwYqa;;^??7fvb;NNdHZ~x#6@}m8;x4M{M99A*K)N0sdPG6qk#Oyiq`eQ9TF`(f-)Zz?|tmCXM1TW0bz>Zk0I z>-kw-kowb9gbuQ)Ke1(Aq(7O-l_Yh$&W@CCUoBc2-XHNMGcWo6cQ0wKWF zsNqn+0gP3Ex;%{{#GURQ!vH@_(N%Be>E5 zofmr>&o2nqTcQ5tdx^>eZTIDQV7dj)tD6H$pn42RtN%buTh$z`F3+dW?PJ3CC2U;C zPKdzm>?+$%2bpH^5!=LGSt`{hUvE!k+P3kR>2m4RHQpLj34tRF8btSs2|lt26?frx zXW=iBw(Q=Yv}X*xkxH5c7qR1lJoi z0`P54o|J*)Z?TQB72rp3H7C0j+i&7%0JerZv{v8p&@O zD4_b-wxaIiu!_uw$`{Pvrf3`Y&kJhs%H<$(K3~xZIUn0}!e230k}fV&O>^KOo+Zeb z#;i4MQrK1um{8=>NEN*L{BCdFfK7Vc55tSu!#f87EAbiWhfY{Lo+nVRA-3ZaVRY1a z;OANlq_DG12_=p_tyXaUJQs^CLbgTys+Hyy*{51sj?;nXE+`O+Ryx&Fl^;u|xpH%P z3A8KD&hm_nUME?S)??}ckqM*(wS*rod4$TJD=70%kIpk-lL`BtSzo!H$7A7#Qf(Ah z5jHof3y7pT-w19pm3;&Bc(S#EUX7!+Xi#Yq@M37@ydxz0fhWAMcBs9x&2>vz>FwR( 
z{fvgY+jgPV6VO+rqXiV2E)lhnZIXX(6NxZkq)^z8vP8WCxThmIhE;m<>wj?K$~ z`eXH2CACdDS4+EuMrT9sCGd7y_EMXr9q$?6iv~`PuA5Rk&QlzqT39wJ9~Zq8TTOcq zkH@hC@BE9&3V~%(%z^N8wr8K8VT=_MrIPxy%|(pA>om!`H*~(r^D@QEn7St*+(R#| z`LL49>M@05)5mZBCgy#U(#Q7#vF4(5Ku+dRK~^n>s;19Hs-x1wIard zXLWKYLUsi5xIV%rNg>)^AR@43N+{oxkLYF-Ef1y)#ck{1$#jWK~F!!1(9 z)tSMgQ5AsVS2>ExkXaqu^LyQ5Gs1(X^ZHJC1j?xfxcja5ozXM1rWz3uTyh5GF)k%q za@Ka27f9*OiXJDFx|$~npB7X@=MDR^oK^MGY$a3MYIM4rfu^B>{Bc{Ava$%0YOYq+ zSD9>C9tya|7Q!yy4eAVaQI9egp_E{Tb2cp6U_xHwI&Ax+A=y&8rg*bA2^7(aKIzWU zOF|tTaeBGdPBqeQ{u%H~U#)aw^!l1-)zY0_EYn#wU^q>v@~*Zp$-I<2Q=xg;nttQ% zlRZEZ*3fEa1l%P^qUcxdEWN}JbtN&kODoqiSu}_;>8G{l+nAhO@LOE{&=Dg=cIJb> z<Ddh+bpb=)tsne^mKyD zvWl;@2hP)V^Cxu;LTu8WyI8?h1uCYs0(8uJNsE&wqdIFsv}IdQHk~36?Iuu0I-Wef zjz@_fmDp$e{_KZHCieh8w-9UQ$LF%Syq$se4fH?S{a1xr5)!Q~g(s&Z(iRjYI~_Z} z#Pjd2MK}N(2L{s-ubRDHr;UA~OK6f_ro2RQr{5F^#RROm{6$Aj@_IA7?^Sm=hC(dj z9tpXY;EG7*;P?iJ->jrzQMbZ@pde~62myow;=G58D+htXM8F^@90&}3_+MVZ27z%< zKsYGWAkaL{!-3zWy`-j+#J?Ro__2a0G6V$aVuD1$+==WFHz`28j~^_m8f^lyv}CRY zFFCfnG*4yg5;tOl^VR=d%)Zm0a0>FY7zwK~O2ov~fIlKYg_>Wn9O?G> zNqmU(MD3<`zfEn~S^8gA*(|&!J=%5mGRW>H*3ZhM45T3IW2n4%zTzLDAk6BwV;(Nt zro#*``qMEFI{tFZbvllI?bOKvY<4E%o{ZRG2)tzlG-Gaa4viKc=cfmTfCt~)g=a1E zTw;{_UI(l_{H1ZM6F}<)!j= z|1i&xngy2@D$X#5;RsG@5g4))nDp7Yfw$OroQ=?4odLI=!^_RTO*D|zAIJ|E&B{t; zYa{gdV$|(IDHF29*u1t|VpYGHFxI2b#{ZNv?MumFMDZV%I(6~EQj39%aChOSJ5Y?HNjWl z!iOx(EA@1|^M%(D&aDwQhT3B^5L)Ba=Wv3!-xxx{Pb8b+a`LeeWHT^U`8};dS%R#c zLo1Jii7?H{(fQ<^@BE~I4cb|x&x@LU0D_Tf&Qvfxz6AqsR&TXDegp>8ADw7URJ^?q zX%Ysh3;tyN0vaEnc>>?XV1WDO(DE|K6Q`!k#Lb!9R~u6Z*6#>$ z?=|5SA7cf-2+W^Im+T=ptiptRKvafI|Kp%_!Eew&K`=>jDJ0g{+64Vbun2N*7&3(s zZ*PhY!Y_91N^mr(R1vx^?CJZL=9U{K{{reF;P;1hyd$ zOs^9^Fg^5o_Yzu6*Ss{s#>o8(>&@R-xBd%j&8*Z1tN~ZHe9K>0?~{~C3s3xwHR{89 zSqaa7#aivZV=c!j%|419hY6ZFR$gOamH?%-a)|qr(kv0@kT`G$Ew9Y&YZ8yFVCiq7 zKPK%FAheHLxw#Fw>yxR%3}`Bh15leY#mmtZ7*7@PL}Jb383{;0@I)LRe4tPz%n~Lx zdvy1a4^&i@gD}Rx zifBv`ptz7e7&An*8)#*_HsQ-)dZK^6m7; zpy-w5Jld!WODQ-m5EqCOq@qLKQa3_jl#zVB(Ndmj7;N8al%sEh>h|&2@Vs@2UABgD ztc%rrNSsn)<>+C7DS$ZIox64j8Q+@*Ym!kvG2?x82;VX@kgKwj(=%Pmqe4EXM^DNo z+7a>0Xe|3&wS&+!MppUE*E;J_nd%N+%qKj#gCx6Vc5P{+7YuaiZNeh_OkYA9a#U#C zWv=x?2JjBKVpx|?>s^qnms*g(j|i_G<=1_BpFzm!6|R}XI2HCo^f||7#dY#m(^tR* z5i>QXEHBC@VN-653;q=Xu7Iu?PQG?b<&O4J%(TK39UKz43+H$$s_D9#fv}e`k*Iu0 z9rx(nkMSFyA@xj?_@BCZGANcf?vAfE&C*OTB@V2_iR<`^cyTGQIuJFsbH%ahOo$-| zU=IgH$QfYpIIE1q-8|frU&XMhB)-L=e`#+4^D<#i71;Oyqvv{5M~KpW@3h9G&nh#1 z546d^4S~8q??EHEtoAB`HH8H! 
zrHBpeS?Z<|CTb)iSoC7UF;F##0b3%}XV%}p5G-#Eh0G=KT@_4EWxQLiIP#uVKbO9= z%Xm^3IWEY~VC9N_H03oGG7ob*>&r+|Jb9r?I$Tuxu_z%%NLoRL@_S=eGW-IBt}EnI z=rl{^rCujNWv@3gF5G9TpiqAFk9-cb3aT02QtB__b3A;vwmv79^x-qei34%sm3k}> ze;>h6e}acMe?e|CZVAr7)1DGBwSh z+Nr0n6*K!a<0MaPXD`Mn;&yCo#qz~QdCJxhX`!o$>wB?Ojdhrgufy=W;XS*7xOr+B z-x-1hm$(F`v$1LUaEjS77J%+YeQCp7UWJX6N`{K@)M$dkifpO9n-jfhgYz+c>*|z7B;q`$OMI>n0Q}f}Xj%n@q?H*m9I{>rQ2SO5i~H zbrbjhT?qI7sB!cBmje7#ca_aIY5s{PMXU#I=6{Jxka=bx86{Y#z?T>&l=l#CFt7V4|2vUfjB z8^xtZ`AO(kHd&LSP`)72YL)6%DZ4pL1;%`an>dVnrTcU5pO5x?YPfrdeV&x!?*V@t ze`+38{e~Dp0_V@|_V4b%=X>PHpCySeFuA!wa9|LK-aH%u4sI)#<^_lV3MY>T0)asn zaLgz+)DQ6n2!scI2sog~AZlv=hbkCE-EaUpym?D~5P+3ZM%=os*6rHEOnX?k9VmXb zK^F?#6Xh>H$>h?cY0dGxD8N?R@i<3qZtFfF3zO7n2~D8eFkU6?#E!}|emcr7u5ugi z#tR~C^L{LH ze&ewf<@3J7&3lKrJRyQ{VEgd$NP6#MuDcv?(VXK2USvD=dVUdbD)`b*$NR=rzyl<4 z#r-3T*EiZozEQOY)m>nf!p7UytIQ!A<%eMAcO3O|X6=^@7aQv18B==~X-*@D^6{5Y zJqu~wbytjI(KpPmvT4R|JyYGopjxQLJv08r#Cr=Pw67A_eXd4Do<#mu;NnS2!+@Bh5ENuuJ1!g(|)%_fn&}5pcjmhV}60Efs_GRT47Jjd1nzS2$_nUBMN|2z?Vo0cO@^u%d}w+DfQgFzwOd#jMm zfuucD5Ij}ghPR>n)hsHH4bq+me682aA2loNp0#OJ6e7W_R6ND0PEwSR8KPr_m`csC z$Vf2-v0NVKCQ4U|pa*QleuNOoeD4X}L>_PSQm>51c}Go;=9kd!<{q{t0hw~nmylti zGLT3+PtTElOYD)eqsQBTsCd&49iSmYRv+tVyr)-DJ!9e@Woh#J)ZKRppsMcSr!)AC zaEA-l@>=`ojf-B|uh>WrPCXHc_5;VtM)GG(LXCMo87P4(Cx_8>##tB-Sr^|?@Qk%c zww7#bV7V&(>`f}#0269`&r)D<2}_V;sY=N!XiA8a3P)-(2Neb<1RW~y_PzBP$N0wm zy#RhEedVP9L=;t62AyLU(8-jJFMqV4rkxumIEJ!S7hXoY8XWG&&VP>guu9oXRi{BE zn4NPq-Yj~3q(fQ6M=EKY?pZGxW6mlWVw|_{pWSi5@$qL{WW8~4xevH9sVC#L8o?qS zC9Ui8&}YMIc(zwQ$s}Ci?dmplNI9R^qXRm5n(@M(QlYhdKP|JD1a>$8vtywnSASUY{<0lCyk$r(CtyPXg#+YTODPKbATF`*Ok*=A- z2~0|Lksr5(9AU!*ixdfp6ii}q6_^K{S(fG;BKfX?HOgw3G8DL31T+Gfw(){Gyt0{C zNdt0`_x-PXNQxNcfEzGT_wLGKBAu=8D%zU8A*s@>>soE;l{T06o`41v3vc5Q9tL~P1B9uBMi(4vKqli^R!&=@!_6gqo*$X- zFeH?Q9gcOya?QH)&L$OE8Cpi#^3pC&+Hf{S;=FzpKDwTC)=VK!;>D>H)h!B_GB-~ z$qIo;YXvzB0B?d1x!Kjco2|R8U)c834%V^cuFX#J(Z=bmZIP(X*Gh|q5~NW4ZOpot zo#S)Y9JfFQeb97#n9%jLLh3wfkW&%ag;h zX?%@S<|fhKk6l77ky2$HUY_l4ua-m-zsqTM*4Dh!2Yf+7GWc@*he_k#Eyr%+oIQ?E z&+;y8ON7@m=1t8`9Xf7Nv=4`h zb88Fp)3@mU4lz?AJg15x*;p|Cb_Y3KP*p5T(MWQ}OvmfkjISprc3Y27$Al*;lDW+1 z0%5M|z;eVF9NZ+Ui~bOLVM9v+(gDgQdX1*;(n1l(+m~ja89{vcJEhk%UeUrjCjE3< zQr=YW?Wy#=8wYD{?36tQarh&H8ocM+6x}?BwL)WZBl2tDuhR-`b(c-K#Ws(;UD7Im zz`Np#)kbqCyK4X5@V-H$3uBaTK9(Mnt1#pX!1UO&3zawtae&U}mJutot1_hT4##3W z=|gw#xBC+k#HiT#oy)zJj=;rm7PRIcc+9U@3x-Y8A(dLCbK}(pM4h|r}3Ko^eb{HKukI6-0}75dyD9T zGH^^Vve;JS=c`XG!)6>k62rY>MTcYkm|qQ*V#Mp)D3>;_+rG)`I^O8DrqsP*HzO$y zhI^;n`$Rt9*MmxLcn<||Uo7C3F6|w*?_}CT)?bc9UE*aqh4y2wEC6)uphR>K+Hew_=rvE^+HShP=wV&%g5!~(rT4No2haTSo4l*Z9c-oT zaQdciO>jnOJd%0kupQPVxJ&Lo%$UGHx`%YcVi{eH**KzCcJ8}1W^G?a`=UkY8pG6BzS?nf2vu<}aF`v1eVh$n z=Sl8%ivY`!ihd$iE6-?BNzD#WdL-R(Q_^D-y~+3yaK!7X?v2Y*>MS5jeVODIy<%%H za+*X^eM3fqWD>Gl`&ml_S@2%n%bGH*t|FP3`>h?b-=h|$doD(0ABs2@KE?M*PtpzX zu-R{N7TLte7*6%?R9Na~0l%azEpgdC)AemXfy9lN9lwFXpm!rCta{8nwR|zT+&#xr z)%vU1yZozfyc&+qZehWWEPQ!ky{p*$=teHA9s4HHF4%J}_0o9oAw7F9J&~e3Bl^Kd zhDj@~y&e(k$2I1~HbNI}0JB(!w?gyY8g3Wz?Dewy@XWCBPc0dH;8T-$w#uiOAdio9 zU0<;2=H`$+)I*94Y%EF2gQPcxk~|&p&SNu*=|$Qrnr7sEgf4Q$9M2tYo^pQ>`I0Ki z91!?5XbVA@l9Li^$1S+Z?@q+vkaUYzc{j^PuH6s6@eZBKlV@q!%d#^GXQTsJp*12l z*Y_|LyT4`faOc!L7!c#6lDVML;{F!KJJNnCVr-K)_q-4_$(|EJrPq1}uhw0|eM{e5 ze_Fme*S(I&Z1^on-dz38y^rR0Rl1n}t$1_vFwe~}Eyr!F2ucv{N#ql9W`WQwz~-do zKnjkHW?{#L94p5Siik~b2pS4+QTRAv7v2{a_Bk){TH(ylK}+GMQ3 z(9u|Bl+O>Z>R`s%IxM_G%B&)9IIDjYac79V^WAjwoYfXYumO|kl;pMj$pqfSM-Ic+ z+kG^i+PHl#t@K!ji5ayF-?B(0a z8`TeLCIG^Taz6-O;b7{Arx-ULOW^$Th4h zTFfgOf)arWr$No({T6H{MmiEA7>58N);g42dGxb1CtM>%v=UhiAHC&za=<#CQ;W_; 
zz#R1P>j91Jqrgq4B0+RV>YJIyg>~_Hil7Ge2rGAgM1_`7d+yL0F9__FrmPE-SNp4>26ejs@=AhRxS;D@h2$%gYoic8`?nos>s z6m++VEF%YE)MS8|JkN!XD>qvDgdr9e&Fk_ZfxaIMCIgXyC_!y^4`C&hCA~tz!$$b< zGG7NcEYvwHV6kcaIv7r^ApWcRYcr(uzx*Qz#lFA)CrIa^`+YLKLe1}PvI!WP=CuCu zSPaCZaes_!|HlZZ!KhNWG$>fXL8h1Y@S&$AokQPYUI-_IBxwrA_&BmFzF-tEDj49$ zL3!AD134iML#P;zRF8I?2y_9)6bO5QmGGyE49xH zYorlU0+`QA;&t<<%Rk>rNH z?Bz#VLRNbe--dC3^Kjz+8qc1a7xDt<; z_R?$ZLo&i_s}wvGzacCtYqEE_E2e0dtX>9LVwUv^+vC$OSHxc37&BfrY>bPC&4}85 zC~{7rMa_y#Nnx&gCS&rg3kYE{fF8@Q#ACoAm7&`;I=R!S#JS!*rAW2NzT>z`ednlAScJrFWjUEn<12B6%Sgo%@~YtS#i0zE05OjAp#pq} z!`iX{jxa@AiAA@v@Fyh}0?5$8IyF9HqB`}U(&GR#1UwrBd9U3dNr0EMx1>nCXOgHt z_hUnBN<$yL;R>v_r!3DB?$5BFh`EB<_kmxue9WXehtY76o))3#`6}Y_D%D zehz!n$#YfTEc%ryabLNX!kQp3x)ZrO+!sbuIm(oPa`FmprigWe3tK&JX}Ms@oWvV@ zL!A8pB$%wSG~miubpTMUtrBrP%QC!ZIP3y(gNb8hP z(sES#wv;x>2Ak!QSbR`wY~-i9XEd>1P`tA{#xu$qO4~JclW%WB$f{|6X@=zL7hYSID7tA%H{Q@`3)Rq|G z)?vq2IzleP%ol_E6YJ2~((wBL?F$7gaZSh%k+3f^RZ_d3>3zge4Sm$e3#s1qH3C;z zCRa{HFR{rIn*?$77LHOUt1>$S_#@x|v!5)}Ey>t@wjoZ@k;k76&V?rhG9Tmf7jMdN zCD@IPUp~JXzhDB|+c+5Z#a*SnGb(*-Hzi(~U$>cf%Bw`a-4oyBg$qNZQH8#P+H3*Fn#!8}Y{Hv8->X z%KCa&sZwtjF#u`!|FN=i;xWf4-f$Hv5r(Y9b~qDw&a#`OmBZm9%(fXnNeeXO=A48MQi?Bjli&u-HC zNJwZniK=@%7UUE^vehE}x}Yr*c^BKeerNUO`K8UJ(}yxo>#NtkMmdk$CYHa~6wIb> zArQ0fw7+lKeX{!Qz^yJ54yr;TIH=P;P%!D7a5<||FqRcxt zN%Xb36lev2D)xI~VwSe_W#*Vo>^P#H>q5boqK!#dgKvqjW3npY$b*e#h0Di%HsN

| ztV$by*O$B=Oaiu}1AVVDXWVeZWg#v?7cmEL!jH0}KKFYZ)%^);hDPDW`;V(dRudgd zsJ)V|sC@<|n0|iVX{oA(G1!y%J_U76^&d6`?>lC1$(CwAPQgy()rO8f zCAd=q_{k>7%wU;ex9EuOfW2m#IP&{Pfp5j-BR!%&FRWcx-t>{T z#RZAArFt?w%qLwZx~0gp^Nru=`a8kz@*Y2=8y3Z0p)r4+wGwsnxzkO396?2-4QEua zb@}r0{n`lV_3N*G_iqj4Z&WkevewpWQhBAr$ZW+>D0GWojEPEWgkZ)k6a6rzHp9)j zzXt4rC=J$nUhJ;9d0ZpB*|J!Ru*i~<`=&yrar0z{GS&r{|4HtsDbU>nZ5@Q~Kq}8Y zSikGpc-G*ZkJc&?qHi;pjcIgJczNYU5^vM?v9jxPnPQc*;_^5Q`GTamUgV!n zdNLgW7IopqQpqbg@)=>_LaKDmaVMpT9uqX!Un&hYDa6lO*;ls)}2|3{!5hMrv=l%r)2O0fqq=#!lF6@4_K)@sw@(3IQ3JB>y`bo>-kQF$@eLCLHhwL-;_90i@*rZ~5zl6~9l127n>7AjWSz zGcu3$Lw|9sN=9}ilce@#2eU&#=}C@K%r@kb$y3jg(8`+~p_F%aWGG)zsJ_@8Hl zPSOzEf`<;IgdxFT2sel^83soVyQe4l<8dJ|ZtL&HhKEK05aaRPL*xH^t4RrGfA>6m a=ve>;5&pa pl.DataFrame: else: fresh = self._trees_batched(_iter_rows(missing_trees, stop, TP)) result = pl.concat((trees, fresh)) - return result.with_columns(col("tag").cast(semver.tag_enum(gh_tags))) + return ( + result.lazy() + .with_columns(col("tag").cast(semver.tag_enum(gh_tags))) + .sort("tag", descending=True) + .collect() + ) def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: limit = self.rate_limit(strict=True) From 99f823eda9cc51189d3de53c298c9ac861306441 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 12:49:10 +0000 Subject: [PATCH 100/201] refactor: Misc `models.py` updates - Remove unused `ParsedTreesResponse` - Align more of the doc style - Rename `ReParsedTag` -> `SemVerTag` --- tools/datasets/github.py | 15 ++++----------- tools/datasets/models.py | 37 +++++++++++++++++++++++++++++++------ tools/datasets/semver.py | 2 +- 3 files changed, 36 insertions(+), 18 deletions(-) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index fe8a0ab33..921fdfc75 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -25,7 +25,7 @@ ParsedRateLimit, ParsedTag, ParsedTree, - ReParsedTag, + SemVerTag, ) if sys.version_info >= (3, 13): @@ -121,7 +121,6 @@ def url(self) -> GitHubUrl: return self._gh.url def rate_limit(self) -> GitHubRateLimitResources: - """https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user.""" with self._gh._opener.open(self._request(self.url.RATE)) as response: content: GitHubRateLimitResources = json.load(response)["resources"] return content @@ -131,7 +130,6 @@ def delay(self, *, is_auth: bool) -> float: return (ms + random.triangular()) / 1_000 def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: - """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" if n < 1 or n > self._TAGS_MAX_PAGE: raise ValueError(n) req = self._request(f"{self.url.TAGS}?per_page={n}") @@ -145,11 +143,7 @@ def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: return content def trees(self, tag: str | ParsedTag, /) -> GitHubTreesResponse: - """ - For a given ``tag``, perform **2x requests** to get directory metadata. - - Returns response unchanged - but with annotations. 
- """ + """For a given ``tag``, perform **2x requests** to get directory metadata.""" if _is_str(tag): url = tag if tag.startswith(self.url.TREES) else f"{self.url.TREES}{tag}" else: @@ -390,10 +384,9 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: rate_limit = self.rate_limit(strict=True) stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT fp = self._paths["trees"] - TP = ReParsedTag if not fp.exists(): print(f"Initializing {fp!s}") - result = self._trees_batched(_iter_rows(gh_tags, stop, TP)) + result = self._trees_batched(_iter_rows(gh_tags, stop, SemVerTag)) else: trees = ( pl.scan_parquet(fp).with_columns(col("tag").cast(pl.String)).collect() @@ -405,7 +398,7 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: print(f"Already up-to-date {fp!s}") result = trees else: - fresh = self._trees_batched(_iter_rows(missing_trees, stop, TP)) + fresh = self._trees_batched(_iter_rows(missing_trees, stop, SemVerTag)) result = pl.concat((trees, fresh)) return ( result.lazy() diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 044447707..449c412ef 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -34,6 +34,13 @@ class NpmUrl(NamedTuple): class GitHubTag(TypedDict): + """ + A single release's metadata within the response of `List repository tags`_. + + .. _List repository tags: + https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags. + """ + name: str node_id: str commit: dict[Literal["sha", "url"], str] @@ -47,7 +54,22 @@ class ParsedTag(TypedDict): trees_url: str -class ReParsedTag(ParsedTag): +class SemVerTag(ParsedTag): + """ + Extends ``ParsedTag`` with `semantic versioning`_. + + These values are extracted via: + + tools.datasets.with_columns + + Describes a row in the dataframe returned by: + + tools.datasets.GitHub.tags + + .. _semantic versioning: + https://semver.org/ + """ + major: int minor: int patch: int @@ -121,13 +143,16 @@ class ParsedTree(TypedDict): tag: str -class ParsedTreesResponse(TypedDict): - tag: str - url: str - tree: list[ParsedTree] +class GitHubRateLimit(TypedDict): + """ + An individual item in `Get rate limit status for the authenticated user`_. + All categories share this schema. + + .. _Get rate limit status for the authenticated user: + https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user + """ -class GitHubRateLimit(TypedDict): limit: int used: int remaining: int diff --git a/tools/datasets/semver.py b/tools/datasets/semver.py index f18e1e992..788bbb2a2 100644 --- a/tools/datasets/semver.py +++ b/tools/datasets/semver.py @@ -1,5 +1,5 @@ """ -Parsing/transforming semantic versioning strings. +Parsing/transforming `semantic versioning`_ strings. .. _semantic versioning: https://semver.org/ From dcef1d984b79cf622f418b7e6ecb72214656e62a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 13:22:44 +0000 Subject: [PATCH 101/201] docs: Update `tools.datasets.__init__.py` --- tools/datasets/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index c1c7e0655..c8e67c394 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -1,6 +1,14 @@ """ -Adapted from `altair-viz/vega_datasets`_. +Metadata generation from `vega/vega-datasets`_. +Inspired by `altair-viz/vega_datasets`_. 
+ +The core interface of this package is provided by:: + + tools.datasets.app + +.. _vega/vega-datasets: + https://github.com/vega/vega-datasets .. _altair-viz/vega_datasets: https://github.com/altair-viz/vega_datasets """ From 173f3d6f5c43a0f248502240c8f1bf6ca7536415 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 14:55:31 +0000 Subject: [PATCH 102/201] test: Fix `@datasets_debug` selection Wasn't being recognised by `-m not datasets_debug` and always ran --- pyproject.toml | 4 ++++ tests/test_datasets.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2297ca2ea..e7ce8ca7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -141,6 +141,10 @@ test-slow = [ "ruff check .", "ruff format .", "pytest -p no:randomly -n logical --numprocesses=logical --doctest-modules tests altair tools -m \"slow\" {args}" ] +test-datasets = [ + "ruff check .", "ruff format .", + "pytest -p no:randomly -n logical tests -k test_datasets -m \"\" {args}" +] [tool.hatch.envs.hatch-test] # https://hatch.pypa.io/latest/tutorials/testing/overview/ diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 6d349dc9b..fa2543ced 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -49,7 +49,7 @@ ], ) -datasets_debug: pytest.MarkDecorator = slow(pytest.mark.datasets_debug) +datasets_debug: pytest.MarkDecorator = pytest.mark.datasets_debug() """ Custom ``pytest.mark`` decorator. @@ -345,6 +345,7 @@ def test_pyarrow_read_json( data(dataset, ".json") +@slow @datasets_debug @pytest.mark.parametrize("name", get_args(Dataset)) def test_all_datasets( From 3f5a805b34d22727e93d4eb4dad27874e68461f0 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 15:25:56 +0000 Subject: [PATCH 103/201] test: Add support for overrides in `test_all_datasets` https://github.com/vega/vega-datasets/issues/627 --- tests/test_datasets.py | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index fa2543ced..fc61caf8c 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -4,7 +4,7 @@ import sys from functools import partial from importlib.util import find_spec -from typing import TYPE_CHECKING, Any, cast, get_args +from typing import TYPE_CHECKING, Any, TypedDict, cast, get_args from urllib.error import URLError import pytest @@ -12,10 +12,11 @@ from narwhals.stable import v1 as nw from altair.datasets import Loader -from altair.datasets._typing import Dataset +from altair.datasets._typing import Dataset, Extension, Version from tests import skip_requires_pyarrow, slow if TYPE_CHECKING: + from collections.abc import Iterator, Mapping from pathlib import Path from typing import Literal @@ -23,6 +24,7 @@ from _pytest.mark.structures import ParameterSet from altair.datasets._readers import _Backend, _Polars + from tests import MarksType CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" @@ -345,14 +347,43 @@ def test_pyarrow_read_json( data(dataset, ".json") +class DatasetSpec(TypedDict, total=False): + """Exceptional cases which cannot rely on defaults.""" + + suffix: Extension + tag: Version + marks: MarksType + + +def _dataset_params(overrides: Mapping[Dataset, DatasetSpec]) -> Iterator[ParameterSet]: + """https://github.com/vega/vega-datasets/issues/627.""" + names: tuple[Dataset, ...] 
= get_args(Dataset) + args: tuple[Dataset, Extension | None, Version | None] + for name in names: + marks: MarksType = () + if name in overrides: + el = overrides[name] + args = name, el.get("suffix"), el.get("tag") + marks = el.get("marks", ()) + else: + args = name, None, None + yield pytest.param(*args, marks=marks) + + @slow @datasets_debug -@pytest.mark.parametrize("name", get_args(Dataset)) +@pytest.mark.parametrize( + ("name", "suffix", "tag"), + list(_dataset_params({"flights-3m": DatasetSpec(tag="v2.9.0")})), +) def test_all_datasets( - name: Dataset, polars_loader: Loader[pl.DataFrame, pl.LazyFrame] + polars_loader: Loader[pl.DataFrame, pl.LazyFrame], + name: Dataset, + suffix: Extension, + tag: Version, ) -> None: """Ensure all annotated datasets can be loaded with the most reliable backend.""" - frame = polars_loader(name) + frame = polars_loader(name, suffix, tag=tag) assert is_polars_dataframe(frame) From 4fc84469c4d69331bd4c1f5bf30c63b396c99b4d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 17:54:43 +0000 Subject: [PATCH 104/201] test: Adds `test_metadata_columns` --- tests/test_datasets.py | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index fc61caf8c..205a0d958 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -4,7 +4,7 @@ import sys from functools import partial from importlib.util import find_spec -from typing import TYPE_CHECKING, Any, TypedDict, cast, get_args +from typing import TYPE_CHECKING, Any, cast, get_args from urllib.error import URLError import pytest @@ -12,9 +12,15 @@ from narwhals.stable import v1 as nw from altair.datasets import Loader -from altair.datasets._typing import Dataset, Extension, Version +from altair.datasets._readers import _METADATA +from altair.datasets._typing import Dataset, Extension, Metadata, Version from tests import skip_requires_pyarrow, slow +if sys.version_info >= (3, 14): + from typing import TypedDict +else: + from typing_extensions import TypedDict + if TYPE_CHECKING: from collections.abc import Iterator, Mapping from pathlib import Path @@ -73,6 +79,26 @@ def polars_loader( return data +@pytest.fixture +def metadata_columns() -> frozenset[str]: + """ + Returns all defined keys ``Metadata`` (``TypedDict``). + + Note + ---- + - ``# type: ignore``(s) are to fix a false positive. + - Should be recognised by this stub `typing_extensions.pyi`_ + + .. 
_typing_extensions.pyi: + https://github.com/python/typeshed/blob/51d0f0194c27347ab7d0083bd7b11210a09fef75/stdlib/typing_extensions.pyi#L222-L229 + """ + return Metadata.__required_keys__.union( + Metadata.__optional_keys__, + Metadata.__readonly_keys__, # type: ignore[attr-defined] + Metadata.__mutable_keys__, # type: ignore[attr-defined] + ) + + @backends def test_loader_with_backend(backend: _Backend) -> None: data = Loader.with_backend(backend) @@ -428,3 +454,13 @@ def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - assert len(tuple(tmp_path.iterdir())) == 4 assert_frame_equal(frame, frame_from_cache) + + +@backends +def test_metadata_columns(backend: _Backend, metadata_columns: frozenset[str]) -> None: + """Ensure all backends will query the same column names.""" + data = Loader.with_backend(backend) + fn = data._reader.scan_fn(_METADATA) + native = fn(_METADATA) + schema_columns = nw.from_native(native).lazy().collect().columns + assert set(schema_columns) == metadata_columns From 9e9deeb95668d2c4e7d30311e85a8f9f6acdc88c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 18:13:52 +0000 Subject: [PATCH 105/201] fix: Warn instead of raise for hit rate limit There should be enough handling elsewhere to stop requesting https://github.com/vega/altair/actions/runs/11823002117/job/32941324941#step:8:102 --- tools/datasets/github.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 921fdfc75..6f55c1d52 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -311,7 +311,11 @@ def url(self) -> GitHubUrl: def rate_limit(self, *, strict: bool = False) -> ParsedRateLimit: limit = self.parse.rate_limit(self.req.rate_limit()) if strict and limit["is_limited"]: - raise NotImplementedError(limit) + warnings.warn( + f"Reached rate limit:\n{limit!r}\n\n" + f"Try setting environment variable {self.req._ENV_VAR!r}", + stacklevel=2, + ) return limit def delay(self, rate_limit: ParsedRateLimit | None = None, /) -> float: From fa5bea8b25f55cc5bba32c1ae8963a89f66481ee Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 16 Nov 2024 20:33:13 +0000 Subject: [PATCH 106/201] feat: Update for `v2.11.0` https://github.com/vega/vega-datasets/releases/tag/v2.11.0 Includes support for `.parquet` following: - https://github.com/vega/vega-datasets/pull/628 - https://github.com/vega/vega-datasets/issues/627 --- altair/datasets/_metadata/metadata.parquet | Bin 18921 -> 18777 bytes altair/datasets/_readers.py | 19 ++++++++++------ altair/datasets/_typing.py | 24 ++++++++++++++++++--- pyproject.toml | 2 ++ tests/test_datasets.py | 2 +- tools/datasets/__init__.py | 13 ++++++++--- tools/datasets/_metadata/tags.parquet | Bin 6247 -> 6290 bytes tools/datasets/_metadata/tags_npm.parquet | Bin 2597 -> 2599 bytes tools/datasets/github.py | 2 +- 9 files changed, 47 insertions(+), 15 deletions(-) diff --git a/altair/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet index d47c4ebed0528df5c68dedf307f03f66fec5e63f..3eaa28ca39d5ab0230c23b9bb1799d78ffd64eb4 100644 GIT binary patch delta 13381 zcmb8W2V7H4_cfY?00}je(2>x4?;0qnVO002O$01$trsRYCTL>&R6m01mjeUTMLye6Y35&ia}7O(t!*L<&9kAKC% zi|%1=Z~}wDILK~K1vW}T)!xG}Wi|?e3cEHNkv!t6l>}aiE01$rRcq%wr9Y&+J^C4@ zSmLS9M+XFUhv^A1z~ajsdq3SvE!Hz=pw)4tYV2i73^4Foc>YAj6ZT;FV!a^-s^qY)1W@FwAvljeJJxs`|s 
zPj(e*qT=$+c>#Na1}mXl?65Tw3^2y7ZH(ib_9TfMMde%-7X6M8_2RS|Q85Dwb$!B2 zTM`9zE@gkYl4~Gv5}gf>t;P5Y%Y>)JFX67@z|S0x&DE3Sfe_pRMr+i9U0))ONh ze3ICgy;VIO5xUlQN&*vBO9`Xrz+33Tha}{MQHCl`Ms=#%nZ#Jk07fF{My+Nf>S2Qj zI~*1t4X?9|U491gJk!$Sk}EdELOh~8EK!8Qk_8L`A698?_}_%-&2;+lBdgyohNzv^ z|HzR74C!eBQ=OD7-a-NS%^4y#$mm*hCoU}{Re)+yhN_+#{SlYWxgHn347@NQINrq{ zg9LyJ2&4gGZ3w{CBKXtB3?PdGFi?h3b;(N5o`8x|Fq}Gf-fJHDvOh1!umKg9ezNg7 zewmO*29d#jEnP{n+S|X0dGuXPNVEgu{Lb^_s~`YOXNCueL)+ppp^6Vo`PY-Wj2YWaX6}DSDxKu?Tj%u1~|Mmh7hL6y|Qwg=<<=C2kI>nBA0taxg z-eN(Zy_4s%DjsFD++vwyhYteYhC!d;W;TAH_?FaIl-nTH#5ljU2osv|iQAJtQC%5n4K@Xq!;7#f$ zYxlg85DKa27zhrtm9pLMuo46v=JEZk#&!n+x&H+>v3O>IYMxXQSzBefejKB)4u8Ys zBYH!PlP~kZmx6@>=Lw|@ymA+N9n{3WL&h~GlNJ(}S=o)}>=24qw^t4HXF-E2ebWcey<{x`>oF2l>r6uH|Cp-?W+Bi4`+>-mHjfv@4$y z0nNMKLh7fPv25kNhmFWrB;xKLo{9RYM?O^CX$}p!!`=^ zWP>TI9C=9z=rHOKq_6WT-{D5BOxX0yB&T#dREoyc$Ti4Gx452Ub%Q+_sZ{Rt#aPcO z@7F_{9ak$Yl1{ZuhWl~aY##XRJUNO6eW+^`S8 zRwd8M56jAX&V0C10Q8((Hwm<1zh*mi=&D@G)~>8x2fGit zow2u@reKro^&=`iD-vhn-qBi1Rv_grwM1X<%6~I4s>p zj_Run0<;gmv@80=_Bl=5QwD)>dM0!JlDx(gQGCI_s}pH9 zK4N_3h?-ZaL(=>BL?rPO%R~?@8y;OzHt&?|B$J?pV${W=^c@#3s$wZjT?me@qV-N8 zTxFnUATR45lD2PtcZBz9L9lJ3`p-JDmOkM!PA~RiU0RKrhUkWC({SDM%}{ z*hG(-Et*^G6LK0?b+HcTmN}pnQ2pHgQFyD^;wTwC%w9mTK((=>pMe}ejie3R27+;j zs9qryv~ME3q&pLaS&Q=~T@<*@RPPX4fM2(S2X7Sg_INEr%<%5$#8E)f{7EYJ5E5m$ zFGHcmIUqO@tvCP=MoR;Abar*P>2B-h@SAaykW`;(IZEl#SBMc;`@rzx|7|T`0f>VD z-b#ez_1nIMHw>z~R-Kv4bMm}hL~zl;#QZpE40q9W#Iz_ZlZW9_%>y}7CH_w)kJE)e z;?Rb#|I6fk$ju{ zOK8o{Gg|MDC}{Ccba8;NmubAh_T8T`C&3YS1s7sG4tK>#`Ae zl!dxCB_EmB|NZ_K)J#4XE(M2a)bssw&lPe1)+aeGD%|d|iXizfpGxjBhUT2T_Ql)% z2&?R*F0DYjkdTKPwWN*QRRWQ6>OAnX85wiRyy_oVh(|=r=}h*Mic8AMW}kw1B}#C8 zi%5ciTB>=Y!hSF}d7*(@M!YIOW#||Dg(?MeP@TNrRP3tg32LV=F1c4-*p<}G>($q8 zzn1s@b&q@S@OT((nXUGOr=*`k72iT4X>;{4mee9fza)59STYRSsc$;TxXKp}D5rah zZ5?|LFG(CmKmG9eL7!IRmD}T{P`9b7Coa${j2qN1%CC~m)8-#Vy5FpQM8fl=!*P)3OLU*M}>quWL-|9_q`wU7skpQ`p%#Mlo8-q(vk{ zCf+7AU&O_|RO&r)%=EKi@2$N#Ta*V>v@X4zVr*-THM6-flbG|il3S307F`{cle&oa zz#X6Yj(i!xVnYcxj#|g1j{X=$oYL^wATSHi~Qmz#O^Q{Ngz`f)u!n$P-S`#70$MDMqqRnu=&E1hBCwL2Nw zBQMTtOF^vfh0cELJ2Xt-84mBu8C}0c@T4?o$hSFtdx4Y;z8NF+b9RY$vxnZG>X3J8 z2b06}EI9Cv zwQQC2cbs<9W0M&>&qRN=eOHf-?(%Tk2QAX9{D#hq>e2JyM^d7OF|P z2~c!++{wF6c|RWJdRMgBVtMToC&z@z0jRfD!7;Af5A0H*0F&=L>g zkf(8RXRpVZXpa1+T0;RN|CMWcAy;6~ZTN_?^h3T} zovIkRli+%0eSb}iTbs3rh1=-vKhn*5xz153IZ*Dq5=4&wC42+xjh|gZkk9aDT9rvfv9jhdik><+}9Q$sMMvHzJAS8~-S}Nv~0?qU*8p%9i52kaOlj2%`u1 z-aUcQBD1MnSJ@e#T45hlV0;r+)xFsWFQ~UJ(JLz=m0%l58RW;xJ3^oS3w4LOkat^0 z3fIjZ5_o41nUD&{Z%yK!Utf%QT-+OWWp;USz!nfewW@f0^(H+< z)?)X<<5fsU1bDo!VF;=s*VT0cc8Eg`b_q8y8#ceg{1wG_)yBXKGEJ0wRiS{|sVfeu zKzmKn{Wb<{^F(DE$p!Z;1cd{ua>t~ia-uE$S0RfWg2hn+yw@ib*q%cR#1jE-V(|S| zh(aWL>0=3nW%VK=-{^XruAC?LlWyD)d%xPKsMp{KD;LN8M<(LmQA$=WlztFF?S$Zyh$-p-y(pzWQwZ;D{nJ7kIvWeq2j`x1!_yHq37RPQ( z0LuRs$E2Xd3tvcpHF^yTVWHm;*5-nbs^m_%O6mLugr)yL_%jy5Ilm#yF`lqi`qD|F zpz=Q;T=2g`*!BX#+P@);zTNMwf8kqA9YLM=i;8u|IlUX7W_8;KnD9|Ms5XU5$UYbu z{!au0QU8Hp*bOX#KV=Mmt}_!DNy2+~99+$;?>FLn3EX-gDDkg^+OQZ# zVO=zAgLqM!pBHu1t77ReDTJX$jK9cl_eV-XC1sf~tuk|NSQsBtjlQ~oF{$pyRp-XU6HHn4rVKcm3A`16Fx#cYgdV)xVmSiZvu8m7H z&vWXiCdFv%(ItveSQsyb6>x5DsjuyfT%tHj<$o;AWRn_L*P6-aa?nwQ? 
zbaWd0^}Q=8D%ZIh0eE|$IFy1dcfo3%MOaW4zYsTy8q^Z=k`#8gl}^YGR7_MvuUj_f zm`qh-)OCUjW-NH+8^hce3~Xc_1V{#iN03$dz9Z5m!0j||f;jh<99-|ku`y%&6^3WV zP(D;o4PjJBa{kV>v(-6QnN5sI5RXK}bHl-ZTs*LkbKoEE?o5KJoQh-G`x*+np$5SM z5`3|*M+4Eh$SxBW&!O;8Z8*u_y)}e<3lLBmNk~@gMwB|r|I|{^xa{16Jv;|QZ+af= zb~4WI-cWK(EhH>)Ov=?#CVwxPru=vu2#6lV#R9mU41SDzxtj?4yPF6W@V)1YW@3S% ztZ||V0qfI>oMAdLqC)44?q6qK)YR@+5iw_3w)Rt)CxJZ_j<{JFxs`0`8~mC51x4yV<_wFjB_KZ3qm12_IegJVJny8#cAjmOt**n{;=kyqJwN2 zm+4t=zj5e095~_LTaYD2Ott=%!tI(y_e$+C1!K#mz_(xD#|0Vt4a632#h4;7-XYJu zy)B@mX7hS06LA#c+)T}xA+K**n7#HtPmiNBwU`{+Ht%EgHdxbsLHMX@*RZt%m64jZ z&s1J#FtL74beDY3kc1PVpsGCdlB89n&{8>Q@jY#<3XZ?c5Ys#V^=rxg^a_@Dt&Z}K zlPbx-{RD|5KH_+*zv@4DBh-2r`iz^HQ!|@nu69)7F^Hz&MMWwZlZJVeLI7^{qNV!x zaEBO9_A9t&f@Z-Y6@N(`ZLUms;hGmyP|f0VEXM;ceDllMp}`Zd_4f9-3jkovVFn-m z1E>u*`)d$3+kgt}4sbsS4D0n9(=7Xa&>0b<-#&1{*$Y|1a2kn_p{B6=+v2_a~<5ymj^g0A}85SSy>wfG4lCD-H}6u8-%H50n$qa!tiN>iktT zMMQ2ZjIpL752wP-y`5{_UmO~5^GQqAo$+pmIV418w)Dj4+lUVM|VZYGzN!{ z(Y^*B4?>qfO-4pmiDpqrpyW1St7I47eYFc9(ZV$X89$+`hde$Y+V>2j$LI8dM*QK4IQY0VeTGq(V z>L?zGjfWB|p>qpQ*v;`BiI<#3;rxYU$`z_@tbGY{<+r{Vea?N-Kl(myNx^N{Q(Cp- zsb2qj-+DBv?j^bGrotK`Okq!%C8a#CZ)_+}F;T@_k>eYpaC}Mg29m5dKrd2(h-D}1 z%Kbrb*Qa~et^k3jdK9SwUq>}FSX=_QJxZPu)wbs(j@uzzE%Lpe`nKf^NOW-rNq?#9 zHENJ1Wfymt)VTh>NT7yM%h?O5%Q?gX-I$$vP8|b8X>(?F6Um?fs_7GEj_XdL!x8Ig zaLa`08+z}wHrchE*hTTlWH()R`uml8tBGoIE0D*{lO)$qtKCN!;iaXOZic@|yILhu zYBH&+ruoE8p1EhGEjLuY%&;euK5{;uERB#RurtXI%=qqRnIi7yNAc5l$#9^cLyB`D z(UD{s#SQO>dZqE)Ir)^2%{)b?U9bcr=IBwCvu9hNU*+pbfpn2>49jp7J#A(tAj*z_ zjP`V~E0(4z-s`fW<7i0NQ)sX!;K9W|{~r?yT%2pLQw!_8w3c8&HiH9H$TM9LI5h>OeXYVXWFt$i7B zemXkIHg0hDfozBfiH9Vw?Bq_!%Fyfx2dxV@5aPDsoDa|AWP9~4`p%lOmVU*p4ZuXV}Cx?Cy+OZ;l@ zAz?t;8e2P3X1$8{D*eY9;kKU?6QO!0eV;R2kGT6=S9B_$MA!t%#QUkmI@j!g7W<3h znfdZ#bw7ljXe>ZINxOe_fg)zaW&+jfqPlzceI_au&EIOw-I4vk8$;Id#FCg*R>TW+ zmC2_2-GewN(?ss%yHwq;OMsDt2Nnv$jyjl%^yTQ$#w~N-7;SuDEv1g?VO?Yl*;z;d zTXc%%f}nC<@?xGpv$0%hNAeZt5EVUVCZx9&-sYRI)bZT2XOxTwkf1wMrla4+vx^zR zqklptthrwZ32Kr%wG#`NrZsX=ubzgYT-6z6;|WczOxFY&g*4g?7J1NG2@2o+j3p_@cCyP7}j7L&S<=@CuOoRlB=_T zo50y%s3I~I_7YYHLx5kN0XfZ27TD%vgJsug&)QeIiq21=D3=N2oyE0*DIK+dp5dmi zo}%xUZl%Xs){oN9yY4m)b+JE_d2hUP+MILSE_H4VdAv&|^!PTb@aMTRou;87OFn|T`(@L@H`Vx04;#hT093Tv|F$D9{zN3F;DQyZl6OsDG5 z!Lpk?a?*Wa@yJcdi! zy8PgLa;n_)rHAf@XNIB+X2*6xTTEU%^w}}!_;$rYa%T+cF1Jh9az{p~2BG13_!`{s z#nyB0bfv}NC&g14bmYD_3$G^mKoH>Pcap2S zp-&xma(_u?fy2Ys-}9|q0t&wsptini z>E3_CBiG+v_#<%6ZG~Dlq%{^zuA8u65*pN(QqBp@R%cAk6}gx3^ULI@lj)t0G>no~ zyGg@M5-^gx)5??2o)PzYF}h?135KL?<}7VFUmvV_IR+=4FA&#Ew`n_OHj-2sr@@Ts2HYYwvg75{vSj_`WY2eFup1?A?b0NljTlA4g3F5vQpl(V8(UIn+mX=Ka+2r7 zH~7wg-NACJ@=*S^A5_e{vL2w;5pAR+wu*Dvp*? 
zjhQ)xf{aqL3yhsi#h>9>G~{V%5_sh52p9@127U`6kBC|%i-=NSR)D;o4`J}!)R^Fz z=QVsS{-jZW&qq}bmCF_2U^AC$#?F5~k1nX>RR$7IfE;rUlz-b&%8DiUuszX>%k zcWEiQjU=R;MynIJ&1q1B`8&OWajc9~ru4J>5)Lb;>oi>PjU4Th4iH4dd_s?T9imXr z%h~XKKkP(u^Y`y@ede;EQtpYf*)COGWQTN;gDk`;r@|-IR(;3f_y~#Eh#Hw$;+K+7 z7b=!p=xdW5jar)=oEIAoy#Nix?FNin=(AC}ep8o57cq3|yZvq1)+v!EcFw{3YAeDj zFP)Srw1i)W-sI0Os>nV{UknxfSwDx8Pi6j+``tRr#h3Ka4$xCCtC9~WG2Ew5-uRKx z`aL5n3t{rl&^`%wVS;@IdpiT>Z}sOM(&91fw>f&81ZkCU%TgV>el46Gz4g%B;$>`l z-|_34{?bvY9hz&Ao$&F_S=+{Aqi5-^(QBnIGOQ`6jY9*suL9;sf1KRu06|;QQ4)i7 zB`vvZfo#dcXS9S{qDfuB$<`mA2a-@*RgmgS&)@Ca@~D5p!@ZVUhOsXzs|>`e*01=U zXmdWTMh*%V0yu7p)7t1LGBW}K?wy+|vv*cS$>PoF7|_Jn0&tiCVgMe%rBwibU^COd z2J=sjx+vkEb84isX{3{rW5x0ROFH~xV@!njbbL0&3dK?BsbPQZ?&ihaE^ozlij0b; z{C3eOPOlIV_VxSg|N9-4&X)Fvb+|7**`VY0{DK$as+dd4= zM`oQzV&f#dbVvGYVi0rG5d~fGjmm_w$yr2(#dA@X29(ZCJSL&A!T~bf%+`ZwuV+`@ zxyf9ay5%ddrJS?ez@URp@>&Pd7n-iWGO#xCBE-mOTKnXkEqzT4ej(ffZdG!9mF`>q zC=ye-`St37MVsW1bJKi&FnEgZ6iEJJQdQEpdSwy}V|)XD&Ko)MT~A?M)I@ z&@@eGD>#nP-(x@0EwkyEu4Ua(xyEFWR)oGDDrNJY2pMHI*#K@>4l6`i#{6-j57G}`yprSVARekSAj_Q3MJvOn|DnN@|61kejcWPH0dKN+~UXJby zYNNzX@4nXa4}eKf#>aDM-u#}^OZuj~xY_w(i7P^wBw@teU~;VZ5Xgq0Xya1ejm!XQ zC8@-FQr>%jgQ@81CDPLeU3&$HCI`uDB43*)D5fxTQEDgJ_Ce@J1c(#86&en2iL_6E z-;gi*X~!ao*VI7c61X_FTg)sg94dvgVAReWsX}G|e1@1MIvvtgP40w8U`h23Fk9Yg zC=^~!i@V*`sIgqs>fyzE@nONFGNC4`NFb*St}xzAO4k!`zKLT=Xr~ zGM&=f)HZL*eka4o4*yv#x}?g(#&SM>6X-Q6k&!Ule3g6v?^2vsU}4U^te;2eA5mqu z!nm)oK9;Or6MD$$!d&}Fh@XWY<1|`rOfseaWuVcN;>GCnHguPvmnDay;!2Xp8^niI zcYz>yPXf}KHeCDEhOUQ=E-=Ubxt6|4aigN4!TtL#@*IVE%~CY8j-{rHF$2*sMkhlJ z>j{SvodbcS?+$5@JZ^J*zN;#~_GnOKr3sN5=ph){To*9OAr61?br;() zB$`7sA}YyOhWeIr)%X|9z@@CiInO33$*%HN$c-xa#Ib%vOw{FZus)Q|R~SY;)q0`v zg<{HDdum+@&$f~)P`KI0&^i5X!#ssvoxxTnSq?6ZH^wbWHDJNceg#xP8Af{|aO3S2 zu$O|cE8^7I(+UbDjC4(u07>Xi8>=9=D@UYk+E~41s>0JL8PsW@Sz(&QS1@;MERdWd z+2RaHSmZp-Iei?ux6tH>?;(n_e=Te8Yo=LLf>Cxn`O8JY2EFO!z%g9_DRJhKp=CmJ zcN%x2Nh4HUlsJYvN4+c>T6gE|Cm}PVH!42Hq{HsyblD=4$i;2r!f^E<`y%nE$A#*f zN$GyY>4uA^kHXn9Y1D<{iNZ687#>b6W@tN1v)=i2GfvTpXoC)d?$trvIjVb--Q$qI zcaq5$k>H!H0mb%VPH(zFW>7bbFgAUO8{3DaROd#42B><|H4nP{T^NG?AbV{fP8|c_%PbT2s6;9WkpQ^ZmEDjdz=##V z0-y%`L-MGbVIv&!F;G6ILbPpJJ(pH*JRt`VH@EdTu8wK zHcyaH)*B|x&+(Hzdbv1;;06s4rkQ$4#qkGin3f(3>;bVppg{N=J1ph=bTNxy*|(Bn zs*1zIhFAGG@9;#jxQ1G0HT zz8s;X-#zD7`y{t}BIj`J-N~+Jhdu3}t1{mK2P<$#%q`o?mX_^1AO^#sRc03{VQbh% z{vZt)@?6>qAr$5eWyIa&u|TAml6;Q$7HBw*anNH^CZi>$Qpk`a)Ca~{zcnYpUDSe! zDH{`{>QfFLW#Jh^TcDH`;{t$VSuzWctxyy@;1z-+CRj{4S!?EpviJHO1B#L{oR3@z zHxJBxxyP z-UThg@*k2ree-?sg+jN*?1$3)(wq7g{L;>Hl9$(%Z$e`#vKbuao31a4i8B`p5y!m) zMq69XnEUOe@^X#Z#!@hGQ6E~+eM7I_LY3QF2sIp+!7%Zm0@pv3Y(9v~G8-99MrZEC zF+X775r8+69CMas$hf7&9sq~ws_yBn`-;kQ_D$*2iT9z`V<%lyD?QE(IhC%&M{sew z?1)x?ecb()N`$W&G`G_ydGq`#r@dmh+Um<0w+VV3j(fA@+mc3ig8WDnMH(019%_}Q zUuK?K-;ncz?x5(*bAJ3wTUa@MJ)|E!qG6Eq6sF&&BLQr3cXyTdGo#MXPhG?jLgu4>XD4DF`#Ocn<_`!nAWZ47&t-eP5_EF8IPFOp)J$~EAa1u&NknvwH!#ZTu- zJpf6S=G3JmBv&jogQxKTlej26yaVaBQSjwf`|nNm-{^S}*8bzh`>}?L2iym>Iu1cg7`AJ#72pBvMu-Wz*cmxC)Lo1Rp_gnDcF8CgEb%*`Fc>{! 
z<3DV1KkjdWgtb@e$t}*!5|hWlzOi1on_35?95%i?qWBcUZ`BV}c@UL!TEatJ+@K+- zmQWJ?=EL|~Otaiyg|fPno)sjy_1ywF3J0xwA`f9d2pswf$k-m52@RBlmqAL=5d*|z zM|lwY2hnd(K}v2j8pC#tDCX-FLh_FMyW+95{YvF`>f&j{JQKja215tT1~)0fzcl&o z7fUEbg24CyyHOIz;-@l}5W51xf5@-m?~pF@t=~gB?ERgqz4?bp!Puc8I)UA_hPP;O z$WiLBZxj&F`D27ls=TDVA?H*ana$PhoL=RCoFsiBTB(FOhjme|0XvO}&B$91!4HkL zV_dg}8H929*8Tl+xR}a?6O?eSSKJO4(_i0Jo;d7oQN?o`@Kf^NT0_gEo&-yAuoH|$ ziZ|WH7afV+7krR-jY}^aBB4 z5}$;mz6NK}h)?-yc~gdl0?EGv{4=_XvB_6!!nbntjH5NfSR5#4pSZn$!jBzcz1*Q@ zQamM(3l2FK83Ihw4IyN`HF`s=I zZ9yL@D5I0um7B8UYk{Z(Vk#^@&@xa})4+>Bf69{_@E(#s`%Gz z9((}yr+k(Ggm>Zn`Ouny{qXlr*f!y_wio|_gJ|p2!QLJozHoAEM1vhfLu7C3i|$e^ zB}f&ZWuPtKXr*TOzkeD)jiQ>tMqg;Cl70Z;5&(_?I)?v$C-0iXBgt@az*OnKKVV8#ba^f2 z=VgOCp8q!Ze;jA{_cVhU)sxE_FR?W`fdGzgaX^OuHH`W{MNz57F7K0Y4us1AIG&}U zb(P_S{}F0Kdnm*C*0I;nQ4L|Q5y3WCBZq1=nu(timp(TImdT~s*3R*(X4G@o8p p_-$kVA54snD(|v-V;Bft4gyd`%`;+uVZi~R+Bs}`prQ%>e*if<3Wopy delta 13569 zcma)@2RIyW+xE9vomCbq%3{@s=)L#eiD=PVh%SQI)w}3Hh~8^R5D~pcCkc^g(L)d- zTKG2Q|2)t89`EThuk$WDmjc{M1=c%)@O#~M#Eq__Ce3yqT zDlrzPXxU$uIR!sL|Cu>&7IE2Uxlnnv$!J6VTwTt$v2SBXoz&;a+ZlB^>{iNEbvZbn zA+(&$RtO?Sn4UR7Aarm-<(>nKB8mbRpMX_~MSsXgzQpjl@pa)MPB)FyVA%q6k{CBF zFeFY{wpTAHUss=!AgjmIaLz-V0mYF0m>}HZQICQz=Uft7bJN zp6U$JQE95_slSLF4IiUA&eRmcelCCaNHsoSy*t>caJ z3?mykgJlrsf5EW=>IM#(Lj_@;q;c0>hc7Uq(RSw;F%*DDehB;pR{#W!&=7E0g6FIuXtXW@`(8Yld$+fPzaBC-ox^0`HQWYDJ9d zh${6b4lRBm4txptyM`d4)T(*_AuvE60;8Is2m8RnLFO33hzxno$2JO+^44nPa-V|8 zsRT}!9!>CoS{c7~mD^ti7i6wKvT|s}bAD%a@PiP*6zL1V;H6@qj3DHBY5|lMjs3a> z9I6};+YYs)Fn|LHqyb_pARGdJN&`P5(1aHP;n?>aFK+&{BE`vt12UMcnR%1FJ~hKj zYSUi$GS4Yy%wTj0yII`1E@Loz!coe*yF@ok0Hy!~Kx=2$V?v0x8_StA3L`8qm(u^^ zFo@(YUq}p??RM4XWI^Ri3HcgXg-; zE)@m$GCfL8hnZIk8yp4Dl=M4#KMSKV9DlGr6w(_Dqp7EKYB}ekLW4DNm`_!7m1kBO z2lv%xsnO+X&qHmDIpf3pCCxf5{Ny5y&LZ6td$90ur-c;>DNi9I{j8g_=MQh zPMu{TNL>8FBxS+Ha$p3aSn2+ta7-wG$FP5pCasG$lF;Lj`Aw}{R=p-qNfHO*{VH;Jc;;B&rN z@97lCT*r)x`s4YKTIj~O&j;79kW(&8^YwGBb5{6zFPRH`yt=749zFc&lb&psU{OO% z&Z@Uy;SBrCqJA(O0LRL04)HIT;4nB84yD6ZccRozdJ z5VdCNGX`l8AzSJJ0}nAy4=`Dmn}&D*OrETnDhTFG-6EhxhYmRj=Ok`WnCa+aak`nQ^Ty|!vV)HuBQml`YBO!_Ngl^Q7LEQ%!8=nI|nkVKLVh5r&`1Xhes zc@+xR?Ve{J9*=DKwLRU#B_fDYbQ+DEJ_wQrhfZM?Jmar|qf_upekmcX_Q0Waq~hqm zgmW!r(q(L3K*q{LHJlg8L5!~axk}7^{3oW1iOf`z85mM*C96wI65Xqx$)xAKKuUwABQNNq zfu%h&bWok4<3ZG{w@G|Dd8Ob%6DKFur7c$OBBL5n)OyJ~37k^#+Td9?7Q zFV9iSG})0t`rxoc!v`L# zyj*BoV_Y~I&v!VuEPDN&O}#682)_LF zT;`*H0)6d`Qd7OydF!OkM-KxwEut7pr3G{4&PBp2_9C3NinGHMkw$LC-##UE_NYxe z(q}dZn64$}T9yG7`qfMoJ8%d{sAta=dqkJxUW$h$bmNfhKH!}5bSS!yc8%2@b+#mH zK=;*6Dh@Jy(fM@5bgC&O!DsWCSD$DeY;1n`(8FT4iT+DfeMks_iV5E1z!9G@H~9&n zZYyPnwC#5hMCl!)!!39vlzV;D*(CEiWqipfx&-mp=3XsYLCbE<7wiWq)dO>fM)LDV z6*Z=zDg&o|H@CDQ_xsl`(E0Q4E33j{-S&PwT-iw9kqfeOWz>juX6zTx*J~GYlXhrT z8buh0&B%<|@D5YW8$8LZo*m$HIlZ~}T4$e4Wq5^vWvJ4}Z`Ia|rq-vOvU#-pAaSL? 
zhmf)D2C-a?;z-ixz@dI&ejW=YE-Q=X)aaI!v^3}DA=~{%rToJ*1+;u+HnNc5CC^Nu z4qg4qQOO4%KGC-CV|MpLBPqt>IeI68qRCv{OrQW+*g;*aDLHehmO#}>6RU1%VGZ{> zuE?7nZ-+XC4f2Pb6Kj1GCM}O$lBJuxT_(3Ig^G90XXE<5hHKn|6B^~#UB!j4ap;b` z@puKX9fl9xZ+urmjgO|dSt!EN%TEpGWySlHdrc3WZy))%2>Fb)=Ji#1EEDPwwN-Xf z{#3ZkY3zmeojV&b_LGlTp}Eo7<}~+o($YNi#O%)=U7Gy3+rUlU-5icER-3|i{QfOj z-E?j^GPSXwNveXC*qBfdg%JDvLhS2;Rxb4f=M&*nqsILOn;s-O{RG0qPu8a#n3&%b zfzPr&Bz8yrbv#+?1;RjNhe}ZnSe3aJTJ9t}B!0=usa&PYba((6>R`{*Y+|U4OU_z>N=v_4@c#x>WmjQ z@R+--gZ#c-4~p}=)7fd{r_Q#vcx9v)lI%wF@bX8)tW#LU>U~D8GCmx$Nhv#V(;?rz zsAvKog`D7>eYnpuamDmWy0f3r2&sj%lq6g7oh_kJ7Of=o?bqd`LtPsw<#A}%At@G+ z@s;A$S#Pzv_lx%rmPXgx^NW3ICoB)-SPNIQPEncHrxkY}iIW@@_~OP9Ms$bCFt!iH zcbnCHoVR^Db7p>DsZU_nt-i6tW{qu8D)A;GpCj$%$!%qunnE890x~YM5YRKdPeLf` zGp1eED$wC>>R0kz^qdIlonhv4D}&&BqT#95%&dy zO>SCs4?X>`!gun*Xl2LmTrpeyw5G3Ct!<;~E;Z3#rKhY{_QNhH--U`$6GY)sB+8Us zC>mMZUH6XV)fO)h*;a@Rjg%vpd23T-VdH2<_mtqLWa(m&5n8A2RvnjOsi@mL^`5&% z3?$8WXEYhsP)Gj0w~pvu+sy0c^Rh#F#bx=Pi7~FI{5K5JiY=0NPK_D1J+~@?>}oZ& z_^E3(#?6)_52w$YH4{n~*a)fnBQj--K#>g0egn~-f;LRwmt$gY+w^OhtOO&7atf=b z4jyi}lyJ4gGO3;&fectYSk3f4_|j+|br5Rb5xiM#-FTMQ9r}>T*O*Guu&y!d%G5U^i=n!*kGW2+iSf45O)sqL{|>LCNbj`zQNDQVo4 zV#72I2ZPjB5KF2v0U`4kn7P28pEeN}#jK{!HO$&rG3Sy0l#~NSo2*_+!BrC5keWfj zz_fw&T#|_2o(ok1UrU*raTV1a$E74sP-UIQPuEvUm1AT;haO^7IiAKxeg?*o~SQh&QOHGNY(o-D0V%I#CMxdQW!;JnvstY!0iGy0EZd)0f}f#7?P{JW@wKwAH6 zV_?wR2ePQjckF-K-o?)g&H0~XbIu&Tj8vTcYuw6Pei^qVH&Gd@dW{@-2QN}L7gw>+ z<)QKWBs>XtvpK&haU1c5Urgi5kKg{S(y&Gc{iiWLRH{3^K}&A^*S!_4{Bmz0&C6K# z#^Y)H%e^&N$>xibepY;YW>;OyMCQlEb?+tC#O;tf_OXz|Xkg`IJ)(Lqq_gCu^6sjJ z)5n@&4IOkL@{ge-!ZX~k&M7y|8y5hC+gT`jb#4Y!3 zX@91D$WI6J(|>6vEaXtW5k_B8X!FbYiC$;u0a~-Cf0#|?;>wup{N?{mlpe&}oQ9uqa=r5v3R25zb&g=I-glfoql;;p^QRkh@-~)(NqtWT(C6q%cS%h|CAC zzsG{EUzErgBgc>#+!)<*4ZKPfl9~_1|dL z^S|Tk%{+|7)#MMZS-M;q8vI3b9-04vtIHo;*M2pT{@|*a1AoJsHR&p<`5(9n{lWDc z7T1hFxMFM2TFQu%FkR^eg^ro%b;+!!I&ba%E3O=*Lo3xLJVWsghj#ygYU=+3RbV9+ zRlTj0KMkwi+t18X5TmJEr&|WN|G_d?(1rf4zo(KF=dCsbe^j^R4EPZ|NU+pd3wSp(n>W{Iva9mK4p86Tv6tOh zny6KMo|sI;gCK#990hgH{6h2xJ%wYREy4mz>0z&znuDNK>4pM*MYj3BDCN@`n@O%5 zN}qiG3sP!&i&A@Oj89%nk!=Ngr%D zL@+P^^|oW!}%bT-`h;w4a#iqnfskk*CAz8j;LPL%Lqyn<*5(Z0KMexhy4;FGP!mmB^@y4(47`0Ra(jaIh&gk_^6p^E;&uF*BkS zz&8j1Y-uu_N!RJG&;yl~u@Z?m;WO02vlqbwQyfGqkq1wXOK3Tsk*&Q0)h!H1f`kBq z*gu}`{(7ashWBS;J$Df@rWU3(UM&7-P9eG{hNx8KjMSLv>EL;2d` za_B4P7DmA}>CHWE?oVHDk(1i0-`Z(=$IPG;#&ze+!inAu-M*U}t`T$vvrTX%U_(3l z$9dqgA$64n?fh_C_;bw^1DECzQL)evM>{W3RF{9XR*?JnF*r%3MD_*i%9VuZaPon~0 zYL^1VuRhT!a;gSgHcS-oSHl^?zv`X^|&tT3WmW#^q`(kfK9PD&THFm(Bd^nGOD5zdnTZ$KGTE8 zyYMP1yC??7GA(Q}fL{RNmw);UI9Qp{!7spol_)yKBq`|CuSKDqVaRat?NtYOie4Sf z@w4D4YAP?uv~&AzJ;BZdx!W@B9+1TYbe-WQj{uJdkA$?0w!&p{$EwV!!DV{K8u~%P zgWS`*mhP81=#9No^kqJJ^Cq^%RE*woZ++OB(3;z7(Gmd0mLKrDhlA=@7YEqK+VM{Y zt0th@gs^2iN<}JC6JyIdTSE(Pu=`!L&sj6kt{Py#b9#WVv;R%ZU2y?k@b5L8$}KCU zP}BLX>xWNM8Q!sk*N}Y8dKAg^bbc=ro@e2kmMwzZ0B!&~0I7jkV8-`n=+qmHtjkf# zrS)e&KYrs>x+(TyPw#GNY7JQ>N|EpIO@pnZ_U$G!;zb^XwVeZZzZ$3HcT(D7L}+Bv zAO*r7^|n`rp0OxG-xxN0W7Rva_PPBk&A2iVNY3Mjj1g@H|uVMWiC8)bvM;-yacso90e(t?6bv5f?bN3sM@6F~kB=lbr-xt}$w( zn5~x-p&3^w=Xu>(hv!@TrgO8*#c1DgsJk!)NqhX(HE%i7n&&ri_cm#*^*?aGK!e>| z_s7iBg>Q1#OFg#!iR*mDZ$_NeE*yKuzB%^OX;rGrf@D+lTXr|`cWhtV0_5q`j1?*$ zNWG}6@3$Xrl$+gBxA)E&le)LnEk$zI>n&4(3K{1QcjKTG1`&gdqBZC{%UL=!0Wmdx zWGJC>L%tqn(43CVvP+jcdst4scZ74GGuG7mu(7)E?TD>5EtRERLRDlP$=vY3{pjJW z$N>1L;pKQ}hm-G1ZvpcTHQyF7t{+mBunNf)mluahw| z)V9(KMps!LTgh3?r4Ay!XQaqA+sXwT`EF~JDk+OJGci?~-*T!_g#mSIL#KW4x$|ym z-FI#Wtdxq|g7p-tz2w55*@WufUVdFo%jcX4$Ct~$s8-F+nT+iUwk<^)6*}F0LEeb( zSmV#8H0!dWDr@$_SN~ogWwBEzPA#pbF!e1`wYIg&)wRTz-LmUbZ^KhGEc)jB@m_%j 
zQKd@GZx*;7(+wqDsHI80T)Rq7ymq~lZ|5?mg;d^Bv#kEDp+2s?HwCuudGnOT)7dgj zc$mk~k~@QE#dRkyHg0VQJ%N8g8pu~)KA%m~z896C_FdXPu_-;^VG1)f%d5wcFVQVE z#>(iO`LVJVixHb>N5Mc>;=oVrP~Ee$f{3&?)iI{SuPse7_KF5WBYVQ~OdIu%E+$1M zcgg1PiWE%tx~G>;-ZRALE@Li*B~pm53(|_|Zcx%BCHcm8O)(khpaDv>lG_eKgfrrZ zGlDgarDGjQN#zGUG;OG!Hn(lA8*w(Fs07^5UT^L0mx9)nAE`7_zr_M*rp1-i8jA-e zPq!d=M)3py6IBD#+v>7f3ZguGn6C1D1h}3}SuLA{JURtZ1Yr@Cli#<6T!#<=FM$c9 zB*qaQy#9(ojDr|-5%@|B`i{AKTuAV1d{pF;FQQlp*|Skgf6Sy^p*onw zV?=9ylErnq4eep@eVn|^l#Di@((prmY&hJ)P)HXMZuo2pUy&F<9*%>zp?6>p&t;ZKCFQAo(B@gayJ*KpvXMbP^7@gm%-iBrOW25S7%IgdVi0 z-Xm@kD3(9eoQTdHSoO-@Y9%p#F`?5_^)nk|h)(C;Pn_c7AK53bw|4?_f5U$$7f_@C z=T)&KH_=z^ZStf6r{uO`3VZH}dgCSBdy2O~+O>s5PQ$fX)Ksd!9f8;$WCNdM_+#p2 z({0J_QQQdeQ;X=lE=TKiNbiC-6|=2Qqy9d3N$kF#Q16T)4TBlkv@iX|nS36!?pO$@ zkG8jc6mzMG`l>(i{hV^|#gXz_Fyl{De0erm2K!A8_~>xduA?&7qjs*(C!5!9XUdM8d4z^JCO8j7)q# z9cVe2GczO<*>MWp`DYi+kAqCy%v4yobkXv``gT`r4RxMpyF1(H8OIT{*7|YOHJ8iK zkAWv0Yu~xMA1^x!F6WI07VUEu9FRnu3JrJMwLioiy5XyK@795}OLG)DMW>pnjRDXQ zMGnt*BU!Qztag8=UZZ9#XeWSw%g7vTWMl-#K0QP1|3AxPobY`~B#Ck;+F4RF%!3A3 znJ<%Dt80lnO&FkTEY!4&8^fFG)~{!ZIK+AE_M5tl-B#b}$vN7wYf$0VN0*G`k}z5;(G z`yPR&Us2ut5=S=O$=Yqo!N<#-k50(XNNKkZSB(PHq z7Mq|9*#gu>b0)}e@;_}@RX(^36Qjy)ZZ|uJA%=?yFd%`?k4Uai=Y9Bm$C^K1S9N56 zPPPhEKA{M3+`7JY>gM~AFlD&qY7CH_cIcoKc!kfGJ zDFbj$Jxq}HzE9Vjd~VP&j#yJzzp!RsPV?}RRe^^D&Idj|M!$IB5;(Kv`igiaf1qgx zV63Y-HHrH+-E^iT!t(-E^U zr^U;5;#;_dOerlRs^d0BjTs**!D}Jqu?97z;*@dy<$gP8oAU|_0!~qQelAz(x&(Fk zjI$|=uhZeCu8W19HMuwm4p{)Mk*o{AXfyQGP;&d5A=9l+g{mlL(6I~i6ur?fK>b`` z?#pKVAa52SbLppA!_qI3Gz>lMw9Wl`Ec$7BST{kK-3{f26*Fr^sV^U zlw#`lw>${PXjR<{o=C;J-?};qKBuI=#hf@mLcB3H6&b#I>KTsv=25hOA4QPpb%gO%PuSL& zzgRZT%usg{&yF7GX}o!)Vei=h?#T2y-SaR#A}K<{w~U2B5{@JkfkLm{knf_7fRoe@ z{qC`15E^&6{WAgVbTqUHwrKIB?r&=qR?7M&cqACZ1@(5(R(--X*BD1G z?&M^5`SCLcsH1weHzWm4X=?9;fmkj0r|@eDCc-9rV6i4QFEIx<=@eLph)zBrDHq7A z@Dyx0@>os~BA(O4pdn~Vgi=xd3~_o~&KJQnR|qm=nohy9w^#f#AKFPQcZ+S3+WKaU zrdEEM)irO_cakos`FRS6nvJ%N0u70>nWT9XjcUI1MoKPQex=Fh5YiI0cS2AjG`bpS zpcqDTN;sPjq%}Hlr++;D^CSQ&aGU{yv!{JqO_S2Um^%`eU63Xxf$VW(S`#mIz3Cd( z+?URHm$cgu;*Unql&GCWS;2>;u6d;#5k;qkQcW|0C%t3?U)_&l-Y!_Z_A_=@WmFN9Z&Rh}ZeVH>=+cjCBu*3i3S~uJ2y~OO?=gb0$yKyh&mK4l@X{sIkec z3_G=WNp*#chhAc%dwC5hvI=wGrCnto!JZA?(n(ZzbIE@9PS;rVdz504gw41ro+#+E zbFyp(7XuTcuBmKRxgkK9*9CFGL2P&NS;IHIgovIh7U9bAs@n#p%QG^lTp#E0C12Ex z*gkpX%#8$XOH4(8M{q6^bL!ns`!zG<=tV1+|68VQ~rqGE)-=Dgjg9Ni16;+Z{UAv*OWp z@8Ux1L73A)(_Y%HS@sKVp-0b2Z~^#V6PyOEp^Xbl90cws&Y75KnS2TrS#6_v;S%1B)|3W_SIG6h?NU^Z%p7Ex8j`kN*V)D=*WC77HseVJ0!9pw0DdP zZr-c*QE8i+elt{ZFnx-a2wx?n{It8`i4wketJp~Aa85n!&KXu9buCSU&Ag`s%$-B# z%a9j|AzU;li|lygu~u^z&r9NMfX)_9rAe0KOIJhjU7}TnIcNZF@8vYOc$*YGW!Ozu z*6sPg!vR`4@?7@&!7cu+Fl9<`OWvd*ow<97EP!g7l_I}l3dqiWw&?Dx-1HHZMDcSm zkRCt}U;Qg{}f1 z1UZ59unbXwiOO5y03pCL`Yl1WxB(^dgS~FJ;z2o`sI<9VxE-o_5tcZHizG*f*aL=w zCk;FjpqQ@Ps(^<-<&Q$2irZv)ZR103BZ|*(%-0mWyIZ2AsbQ-M7g~OY~aF)%?yC?<|v;_3SQ><3+!(&L`f>`L$38 zwHgx)XSt)RlF%xakX8FLXod%!AZSYz60#kdJy;W*oP^vEylnF|8ZTKiqb529x98&X zW;_F&7l?1N@qdz{li$k-C}*-k^7in@v?9~PH{GLqGi`d>A$N+qD^yP=IH>Ka_z!N% zH*TI#%TeJ&o4@LxHHjX+f*2_?GnTxP;J^8nIMzt1+e~|<)7;Yc&R{wA`L8!6ef57y z6LSlvnCn=ws*Pg*e#mA>@BXw$&1HSaqNW8av8* znM@#(Jvt#ZwHyM4)sykT`%K`zqodbiwEVZ0#Cm|PF z_up9b`#p5nI7q(s>~ZRhIS)}TvTXLg?g|j0}pu{geygxIe^O$P~jpV(fwH3wK`CBy#8Mn#r?{W9dC=_iu7)yT) z1^ui@^4gbgK(lkf1{RPmXy?y^eM2;oM=-UT?|Z@KA8D%3p;~Y=0Dtt;E)jweya`b! 
zclNAtAAfWSO}Z{Zw%4aHJ@A{F8K~s`gxp7_4dUa7kUEnGo8=HEzI)=u%o>FuYIlM1 z5j-bw-ero!YN+NRw;MPJ_uX<0yIg|_g6_-QKL0JLY?OTp*Ow|jdH_2ji+v0j7-h)sbF z+1;>y1zmC?t2j}T<4-xCB*IRnnBgy)DGy^lhAa`!lbr6Sw@xIRe$_}m|KyRAl^3Nk z;e*Z=P5Bz%20QMZ9h^Ynk2t6vvhFgpX^(Z);i}GtE^?4{Etrz)wjJmc~8>z$Cs#k9_-=r zNJKSA_ptX9$;L?#d#ADR_Xf+1jn5^90!q~BkfL9Ehh92lm>LQ zPZAi}!w&NChCh|18b0YGoAe|1rkmg;X#+Z4qO^K?3MlL(J>{jCZrfE{3kLS-aRfkuB*xu4)em@k(8r4Aq!#9=C`rThutIRvAYPW0;A-i*-7!Ro?~!U_>M~#=fd#`O5rsd5!l&9eImz#cc5U@mHtZ`)&{XZ*ft2SP@g6W%HnJLq^EP zduw1hE3>m9uPGjO;Dr@{odJsL>*adqo`>6? zp_j~hTB>gjY%6KM$FNeZWffD^*c2O~GF3qIb7Cs&gi-_@)ebd02lRLQk`;dO&*#U2 z@D^x^SJgWGk KMp7kp$o~QO3ASPY diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 9b0e7007c..cd9ef157f 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -33,6 +33,8 @@ import narwhals.stable.v1 as nw from narwhals.typing import IntoDataFrameT, IntoExpr, IntoFrameT +from altair.datasets._typing import EXTENSION_SUFFIXES, is_ext_read + if TYPE_CHECKING: import json # noqa: F401 import sys @@ -257,6 +259,7 @@ def __init__(self, name: _Pandas, /) -> None: ".json": pd.read_json, ".tsv": partial["pd.DataFrame"](pd.read_csv, sep="\t"), ".arrow": pd.read_feather, + ".parquet": pd.read_parquet, } self._scan_fn = {".parquet": pd.read_parquet} @@ -274,6 +277,7 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: ".json": partial["pd.DataFrame"](pd.read_json, dtype_backend=_pa), ".tsv": partial["pd.DataFrame"](pd.read_csv, sep="\t", dtype_backend=_pa), ".arrow": partial(pd.read_feather, dtype_backend=_pa), + ".parquet": partial(pd.read_parquet, dtype_backend=_pa), } self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend=_pa)} @@ -288,6 +292,7 @@ def __init__(self, name: _Polars, /) -> None: ".json": pl.read_json, ".tsv": partial(pl.read_csv, separator="\t"), ".arrow": pl.read_ipc, + ".parquet": pl.read_parquet, } self._scan_fn = {".parquet": pl.scan_parquet} @@ -304,6 +309,7 @@ def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: ".json": pl.read_json, ".tsv": partial(pl.read_csv, separator="\t", use_pyarrow=True), ".arrow": partial(pl.read_ipc, use_pyarrow=True), + ".parquet": partial(pl.read_parquet, use_pyarrow=True), } self._scan_fn = {".parquet": pl.scan_parquet} @@ -378,6 +384,7 @@ def pa_read_json(source: Any, /, **kwds) -> pa.Table: ".json": pa_read_json, ".tsv": partial(pa_read_csv, parse_options=tab_sep), ".arrow": pa_read_feather, + ".parquet": pa_read_parquet, } self._scan_fn = {".parquet": pa_read_parquet} @@ -401,17 +408,19 @@ def validate_constraints( name: Dataset | LiteralString, suffix: Extension | None, tag: Version | None, / ) -> Metadata: constraints: Metadata = {} - suffixes = ".csv", ".json", ".tsv", ".arrow" if tag is not None: constraints["tag"] = tag - if name.endswith(suffixes): + if name.endswith(EXTENSION_SUFFIXES): fp = Path(name) constraints["dataset_name"] = fp.stem constraints["suffix"] = fp.suffix return constraints elif suffix is not None: if not is_ext_read(suffix): - msg = f"Expected 'suffix' to be one of {suffixes!r},\nbut got: {suffix!r}" + msg = ( + f"Expected 'suffix' to be one of {EXTENSION_SUFFIXES!r},\n" + f"but got: {suffix!r}" + ) raise TypeError(msg) else: constraints["suffix"] = suffix @@ -432,10 +441,6 @@ def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]: return suffix == ".parquet" -def is_ext_read(suffix: Any) -> TypeIs[Extension]: - return suffix in {".csv", ".json", ".tsv", ".arrow"} - - @overload def backend(name: _PolarsAny, /) -> _Reader[pl.DataFrame, 
pl.LazyFrame]: ... diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index e9546d2b1..cdaa57322 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -4,20 +4,32 @@ from __future__ import annotations import sys -from typing import Literal +from typing import Any, Literal if sys.version_info >= (3, 14): from typing import TypedDict else: from typing_extensions import TypedDict +if sys.version_info >= (3, 13): + from typing import TypeIs +else: + from typing_extensions import TypeIs + if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias -__all__ = ["Dataset", "Extension", "Metadata", "Version"] +__all__ = [ + "EXTENSION_SUFFIXES", + "Dataset", + "Extension", + "Metadata", + "Version", + "is_ext_read", +] Dataset: TypeAlias = Literal[ "airports", @@ -96,6 +108,7 @@ "zipcodes", ] Version: TypeAlias = Literal[ + "v2.11.0", "v2.10.0", "v2.9.0", "v2.8.1", @@ -140,7 +153,12 @@ "v1.7.0", "v1.5.0", ] -Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"] +Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow", ".parquet"] +EXTENSION_SUFFIXES = (".csv", ".json", ".tsv", ".arrow", ".parquet") + + +def is_ext_read(suffix: Any) -> TypeIs[Extension]: + return suffix in {".csv", ".json", ".tsv", ".arrow", ".parquet"} class Metadata(TypedDict, total=False): diff --git a/pyproject.toml b/pyproject.toml index a3f99b7e9..43370cf7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -250,6 +250,8 @@ extend-safe-fixes=[ "ANN204", # unnecessary-dict-comprehension-for-iterable "C420", + # unnecessary-literal-set + "C405" ] # https://docs.astral.sh/ruff/preview/#using-rules-that-are-in-preview diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 205a0d958..e325147b2 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -400,7 +400,7 @@ def _dataset_params(overrides: Mapping[Dataset, DatasetSpec]) -> Iterator[Parame @datasets_debug @pytest.mark.parametrize( ("name", "suffix", "tag"), - list(_dataset_params({"flights-3m": DatasetSpec(tag="v2.9.0")})), + list(_dataset_params({"flights-3m": DatasetSpec(tag="v2.11.0")})), ) def test_all_datasets( polars_loader: Loader[pl.DataFrame, pl.LazyFrame], diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index c8e67c394..3702028ac 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -193,6 +193,9 @@ def generate_typing(self, output: Path, /) -> None: NAME = "Dataset" TAG = "Version" EXT = "Extension" + EXTENSION_TYPES = ".csv", ".json", ".tsv", ".arrow", ".parquet" + EXTENSION_SUFFIXES = "EXTENSION_SUFFIXES" + EXTENSION_GUARD = "is_ext_read" METADATA_TD = "Metadata" DESCRIPTION_DEFAULT = "_description_" NOTE_SEP = f"\n\n{indent * 2}" f".. 
note::\n{indent * 3}" @@ -276,14 +279,18 @@ def generate_typing(self, output: Path, /) -> None: f"{HEADER_COMMENT}", "from __future__ import annotations\n", "import sys", - "from typing import Literal, TYPE_CHECKING", + "from typing import Any, Literal, TYPE_CHECKING", utils.import_typing_extensions((3, 14), "TypedDict"), + utils.import_typing_extensions((3, 13), "TypeIs"), utils.import_typing_extensions((3, 10), "TypeAlias"), "\n", - f"__all__ = {[NAME, TAG, EXT, METADATA_TD]}\n\n" + f"__all__ = {[NAME, TAG, EXT, METADATA_TD, EXTENSION_GUARD, EXTENSION_SUFFIXES]}\n\n" f"{NAME}: TypeAlias = {utils.spell_literal(names)}", f"{TAG}: TypeAlias = {utils.spell_literal(tags)}", - f'{EXT}: TypeAlias = {utils.spell_literal([".csv", ".json", ".tsv", ".arrow"])}', + f"{EXT}: TypeAlias = {utils.spell_literal(EXTENSION_TYPES)}", + f"{EXTENSION_SUFFIXES} = {EXTENSION_TYPES!r}", + f"def {EXTENSION_GUARD}(suffix: Any) -> TypeIs[{EXT}]:\n" + f"{indent}return suffix in set({EXTENSION_TYPES!r})\n", UNIVERSAL_TYPED_DICT.format( name=METADATA_TD, metaclass_kwds=", total=False", diff --git a/tools/datasets/_metadata/tags.parquet b/tools/datasets/_metadata/tags.parquet index b932af7c5de7eaa7decace6422fd8191fcecb3c0..f8ed6f54e46e03902d48eed24ad595faeeae94fc 100644 GIT binary patch delta 2269 zcmai#2UHX37RP5uXd#doGDCuaP_vX!1w@DFaDDAmQ3sN+Su)v6L zA_};GC_JSZkS0Z11f|JS5EVg%=#!!-_z+%#tjm`3-pn~O_sqHX|K0iS@4NT4X{s4@ zr~Pg-2o3eiLLb3x5DNkTK<}2x7q2%!A_`!}r%-9Tvpn8XOKO|j+IGjz)1dv;+-?xq zWLSW&94RL!LweqLN-W)tjAD&84HWg&p73yYDXl&+LD|x{sCIRGb z0pbM?(;-$$x&g>aRnS(sv2BC_Vab+&44zCvg|dAiEX;y4Qs$}LY;~vzw=$z-86R~( z_IbhLQZxgr)SSV@aw7~f29b`wA+T(CTFh_ z)`?{Ei)D5l?P_`ac)RkpB(|$XyLTaxj+^cxs_>nu)*S4<#kRD$SJ&TbS(IUE{D|nVwJ(Whh$gUf>w27@l{dP*0i4X;dJ>sx7v6OXH*JRHH%qtx{b_J zgU=6_jL*!wYP}Y?pB@mrm^2FSHte_&Q>biJo0a6BS9<2cp}yGkoc--p%&wRb{Zp>I zS5J-2Ti7a*hwM$ph5G$zvvX~4_a!!t>E~DeIJ_g6GpN?HSv8aRrcim_@F}CI`e?m{ z6LGKN$vt2G6>4v%`}pz)nCkW*E)KugWK@9m`H%MM2WXMYm3Y2lPE^4M-Qt@Jo$i37 zCnIJHnREGd@#k6c>g-R=2$jpW&dL49neyWseoBg+I+DgvxitTIHYiz6^I?{`U&S>? 
zfhUgdSl(86+=+23U$23*_3pUm`B|(_MG>#}qY-(ilJP$4$90psWwG7b9woB1lXJTg zdQz(@b@qfDZm%>=eMXk6)+0II;)&00pA%*7Z%j;{)wrs@MzYnM@*KLZ+envg(y_d^iHrLvO z2=96x%rSm@EsS5qbXh{M;n~1Q_468hp(@Fp9!g+m3|m#~^;ZpxGWaEAu&_k%PtqM9 zhYy&t!Y87*?g+u#;*~|?6NF>m|J)ZNLC{%_ z-QL@GEZkb)%~_WbG_iIv2ta+3R+Pbdb=8wOsDA@{>#10;;$TmJs`CuJ!G{$i~G(o&`u*m z1}BU$({Dyj?$U~LFDolQ_;wn1)}5usX&APfcQX6G(cLBL?AoVoAdS9fbb)bR&IrSQnbOa zAUq@k$xy6zB3vSDvou^Z6g&VxT1<#Eu*VtzP~S382n_b|4EFK&VTbrENu`2t5lH)J z&G;HFFPHsB$=S{M+)v6)JQx!HbD6( zv6?yM+s^$h1FA}J-zEr#r|qE)08MC#bSo>11y>YTl`E167BeLd=d4%g{k5D7D?m>U zE#5WBMUwmamOSAYN3q7cC5EdUlp!YrB`oNHe8k{E;fFMAqHGynhq delta 2132 zcmZvddpwkB8^@o=Fb!tL44%gflbmLZnP$vr9TUbWWRb(v`-W!P9Lphx(pH8^X{{PO zs0t z$Mfex%FvZ1&|A6LD8Tu8?lPKx{A|~zxdSg*?(Ro)pITQA&bTcB z2q2?^&@|MjcLJ*55(w$swoQk*>nBUt`~#uS0~synB-&Cx#_YQ6X}m`L$TQ>f*4s@~ zAOKnqMK6c=xkniwA7QE+>#)6t>>(`K5Z+4Sd8E`DI$4W?MtCFjm2$k zMxS2_R7JnQ;p<9KCL&Tj`)0zgLbdz-fZ{9Pc%gfL;DO;sY>PJGg_9nzOC3p#kxdIJ zW$mcF**Sbg$G@Oi{Q6W1Bex_~ptgnZ1|3sa^zxglPf6}>h`rS7>ZF0jx7$wcoJ!x@ zCaQZQ2`-4MwKMlO-g}RJr)c<(-BD{EJs5eHxu46@KPd|>;)Th~8<^N#RmEks&4Fe` zl+=QW4L@d+3RCKG}#G%`!LeZ zXB>-+a$l7~?mS$>RA$*ACO$%yVM#}SIs8LeF!}Pxp@E%L0N}ZPtSXj1yfw){=t?~z zP|JbSo^t$Oy=yKeWF=Em9V|ZZOE}+WnZ>Sqk5C4-XC~#LO|Q?4D$iwE_e}h$$2myc z$9E9~%~^)@`g%%DFJI5RyTZ`$+FZOkgV+D5y=_IhU0&qLaF>c@j6JGjObySZW5(}N zvK=E@D&q{PeDUi0dtNL zr-A@MAPE`AbXBVZP8fjF!hG@>F$Ap=^A}@!Uow^$DV%34#{eE6B7T~|3eK*%LWP9I zoEZ_Q27qpCFCI!`j$TpjOmoe>d4QPLXX2@8VKiqPb9&@PX&rj4dGzD4t>qom8xtc1n+4VROpS!CS%ja{^=%`^{_U8JKNR%}1HBvVGSd>}?z>o5J{mQuW*FpUnz<1z+9DYj zzjm&P76jM#dS(ds|M4KYKI+AErG0nI*|q`crjJK(x0zkF3DsIwmyf4zEp6C&B>YaI zVSRANMd$9`0+YB=23zDU-t%*$7rn&#;ac@jTgSBY4`MV5&50buHl}X}Sc4j9j>m~B zI#{QiG?f|@3WwJw#5<+qNlt0$Vl)mb9}xl+bd(Vy*ucp){`JdE2vsxv|b|<^>1cMVqf5(w%r+^7b?}InJ|+mJ{_>TN0qFPW+{+ zJZ8taOP#U&OoX8NP)VPSOsP6)um8}O|0b8Iwp|_khc{2PCx7t84HVzDHo7waZZ zfpIJcKB$6wK?=YaW)Pg)2}WtV&YYL_nT2KaSV{w(%M||>Rf~#UsCH2qu2Q0PS|I5- zMWGuwmN_3psX76%{z?Xbk`G7#;6-i33zNQ!oNwa=2;T>3k1XwxZX5|&i8oZ^Lj(v7 zp($2ANI0H~N@G|dhwyqhKAZ=_H6ZPTjlFzFir1GPa~Ho%P2R7Z5*3B)6J8g?hXdho zTC3eT1C8H6&wwSWMvd-*?7 z)ue>5u*8kY30vo{=nHdM@USB5?JB4Y1<(e$ltpfQ-CHL7$pwPpLLTG*(5AjwobWaK z$V@ojRer+2yk2U(NdNz?AtBnj_!@=$Cuc?ZE?WwciziupVHO=Dzxw$_`WyiOEFXk0 wd~ZIjVkHHc!;|TMn`dtY&_R#{%9G#$g)H9%G6LYbT`-@9fxEfq2#^@{Ul5T0H~;_u diff --git a/tools/datasets/_metadata/tags_npm.parquet b/tools/datasets/_metadata/tags_npm.parquet index acd04f2c79bb6936ef8776b2a066caa0c99d7515..dac952f9fa86a2de165343eb808b376195022c29 100644 GIT binary patch delta 815 zcmZvaUr19?9LIm>?Eaa5rkn0}xHm7DQn$5oZIPp3h#{;nDVYQYJvbIL?T@3xhdM%x zhzcRb@iC#K(HleCRQx!U{5c3aLiDK{&KV(iNC`M&tt$<&d2s=HXh9Xwy=&Z%sH+68foI>aT~*3OB`12iKVfJhWQN?@W76sy8j zCOMKyuluL$3$fo-$HX!x?*sKrtgFUBN6&Y-vZIIpbBV#{(*a7y} zs>+5sNZNye!FAc_RH+%KCSAR?Y0=>_>t^fc4 delta 776 zcmZvZUr3Wt7{=dozJJ@?bZdJL-?vRHp|5S^+9XGXg(CeSx~SzvAVexyS*{TxaVm(> zMMMPabg30G(F^IKKSPAV!kdC3qHYWdZ&V7j3!#gc^nGi_pj{kzIQ)3t=Y8JEz(k*s3Vv*{iTS~Y%dQn9+oEi{!rR~a%*iUJnn31-5Im=PDK+J-tJv9( zz-|RlXGI_G0drOjSJhhMP{9~`6hOvSeRu%O6=yYG;|dmDwB4t-*!Zlw3jeQm@lXxb zzqje{WqPANEM7@^t*ZSVwq%1-e%tn|@M@tQm4+MaB?+ TypeIs[Extension]: - return suffix in {".csv", ".json", ".tsv", ".arrow"} + return suffix in {".csv", ".json", ".tsv", ".arrow", ".parquet"} def _is_str(obj: Any) -> TypeIs[str]: From 95582df0847c84c61b41a349887a4a1b703477cb Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 16 Nov 2024 20:35:45 +0000 Subject: [PATCH 107/201] feat: Always use `pl.read_csv(try_parse_dates=True)` Related https://github.com/vega/altair/pull/3631#issuecomment-2480670438 --- 
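A minimal sketch (not part of the patch itself) of what `try_parse_dates=True`
changes, using a made-up two-row CSV; the column names and values below are
illustrative only:

    import io

    import polars as pl

    buf = io.BytesIO(b"symbol,date,price\nA,2000-01-01,39.81\nB,2000-02-01,36.35\n")
    plain = pl.read_csv(buf)  # "date" is inferred as a plain string column
    buf.seek(0)
    parsed = pl.read_csv(buf, try_parse_dates=True)  # "date" comes back as pl.Date
    assert plain.schema["date"] == pl.String
    assert parsed.schema["date"] == pl.Date

The same keyword is applied to both the ".csv" and ".tsv" readers in the diff
below, so date-like columns arrive already typed instead of as strings.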
altair/datasets/_readers.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index cd9ef157f..54edb909e 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -288,9 +288,9 @@ def __init__(self, name: _Polars, /) -> None: if not TYPE_CHECKING: pl = self._import(self._name) self._read_fn = { - ".csv": pl.read_csv, + ".csv": partial(pl.read_csv, try_parse_dates=True), ".json": pl.read_json, - ".tsv": partial(pl.read_csv, separator="\t"), + ".tsv": partial(pl.read_csv, separator="\t", try_parse_dates=True), ".arrow": pl.read_ipc, ".parquet": pl.read_parquet, } @@ -305,9 +305,11 @@ def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: pl = self._import(_pl) pa = self._import(_pa) # noqa: F841 self._read_fn = { - ".csv": partial(pl.read_csv, use_pyarrow=True), + ".csv": partial(pl.read_csv, use_pyarrow=True, try_parse_dates=True), ".json": pl.read_json, - ".tsv": partial(pl.read_csv, separator="\t", use_pyarrow=True), + ".tsv": partial( + pl.read_csv, separator="\t", use_pyarrow=True, try_parse_dates=True + ), ".arrow": partial(pl.read_ipc, use_pyarrow=True), ".parquet": partial(pl.read_parquet, use_pyarrow=True), } From dc4a23013d39b88b2047c8408b902081a30aec96 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 16 Nov 2024 21:46:07 +0000 Subject: [PATCH 108/201] feat: Adds `_pl_read_json_roundtrip` First mentioned in https://github.com/vega/altair/pull/3631#issuecomment-2480670438 Addresses most of the `polars` part of https://github.com/vega/altair/pull/3631#issuecomment-2479333070 --- altair/datasets/_readers.py | 36 ++++++++++++++++++++++++++++++-- tests/test_datasets.py | 41 +++++++++++++++++++++++++++++-------- 2 files changed, 66 insertions(+), 11 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 54edb909e..e55d28359 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -38,6 +38,7 @@ if TYPE_CHECKING: import json # noqa: F401 import sys + from io import IOBase from urllib.request import OpenerDirector import pandas as pd @@ -282,6 +283,37 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend=_pa)} +def _pl_read_json_roundtrip(source: Path | IOBase, /, **kwds: Any) -> pl.DataFrame: + """ + Try to utilize better date parsing available in `pl.read_csv`_. + + `pl.read_json`_ has few options when compared to `pl.read_csv`_. + + Chaining the two together - *where possible* - is still usually faster than `pandas.read_json`_. + + .. _pl.read_json: + https://docs.pola.rs/api/python/stable/reference/api/polars.read_json.html + .. _pl.read_csv: + https://docs.pola.rs/api/python/stable/reference/api/polars.read_csv.html + .. 
_pandas.read_json: + https://pandas.pydata.org/docs/reference/api/pandas.read_json.html + """ + from io import BytesIO + + import polars as pl + + df = pl.read_json(source, **kwds) + if any(tp.is_nested() for tp in df.schema.dtypes()): + # NOTE: Inferred as `(Geo|Topo)JSON`, which wouldn't be supported by `read_csv` + return df + buf = BytesIO() + df.write_csv(buf) + if kwds: + SHARED_KWDS = {"schema", "schema_overrides", "infer_schema_length"} + kwds = {k: v for k, v in kwds.items() if k in SHARED_KWDS} + return pl.read_csv(buf, try_parse_dates=True, **kwds) + + class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): def __init__(self, name: _Polars, /) -> None: self._name = _requirements(name) @@ -289,7 +321,7 @@ def __init__(self, name: _Polars, /) -> None: pl = self._import(self._name) self._read_fn = { ".csv": partial(pl.read_csv, try_parse_dates=True), - ".json": pl.read_json, + ".json": _pl_read_json_roundtrip, ".tsv": partial(pl.read_csv, separator="\t", try_parse_dates=True), ".arrow": pl.read_ipc, ".parquet": pl.read_parquet, @@ -306,7 +338,7 @@ def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: pa = self._import(_pa) # noqa: F841 self._read_fn = { ".csv": partial(pl.read_csv, use_pyarrow=True, try_parse_dates=True), - ".json": pl.read_json, + ".json": _pl_read_json_roundtrip, ".tsv": partial( pl.read_csv, separator="\t", use_pyarrow=True, try_parse_dates=True ), diff --git a/tests/test_datasets.py b/tests/test_datasets.py index e325147b2..221666c35 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,5 +1,6 @@ from __future__ import annotations +import datetime as dt import re import sys from functools import partial @@ -35,6 +36,15 @@ CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" +class DatasetSpec(TypedDict, total=False): + """Exceptional cases which cannot rely on defaults.""" + + name: Dataset + suffix: Extension + tag: Version + marks: MarksType + + requires_pyarrow: pytest.MarkDecorator = skip_requires_pyarrow() backends: pytest.MarkDecorator = pytest.mark.parametrize( @@ -346,7 +356,7 @@ def test_reader_cache( @pytest.mark.parametrize( - "dataset", + "name", [ "cars", movies_fail, @@ -361,7 +371,7 @@ def test_reader_cache( @pytest.mark.parametrize("fallback", ["polars", None]) @skip_requires_pyarrow def test_pyarrow_read_json( - fallback: _Polars | None, dataset: Dataset, monkeypatch: pytest.MonkeyPatch + fallback: _Polars | None, name: Dataset, monkeypatch: pytest.MonkeyPatch ) -> None: monkeypatch.setenv(CACHE_ENV_VAR, "") monkeypatch.delitem(sys.modules, "pandas", raising=False) @@ -370,15 +380,28 @@ def test_pyarrow_read_json( data = Loader.with_backend("pyarrow") - data(dataset, ".json") + data(name, ".json") -class DatasetSpec(TypedDict, total=False): - """Exceptional cases which cannot rely on defaults.""" - - suffix: Extension - tag: Version - marks: MarksType +@pytest.mark.parametrize( + ("spec", "column"), + [ + (DatasetSpec(name="cars", tag="v2.11.0"), "Year"), + (DatasetSpec(name="unemployment-across-industries", tag="v2.11.0"), "date"), + (DatasetSpec(name="flights-10k", tag="v2.11.0"), "date"), + (DatasetSpec(name="football", tag="v2.11.0"), "date"), + (DatasetSpec(name="crimea", tag="v2.11.0"), "date"), + (DatasetSpec(name="ohlc", tag="v2.11.0"), "date"), + ], +) +def test_polars_read_json_roundtrip( + polars_loader: Loader[pl.DataFrame, pl.LazyFrame], + spec: DatasetSpec, + column: str, +) -> None: + frame = polars_loader(spec["name"], ".json", tag=spec["tag"]) + tp = 
frame.schema.to_python()[column] + assert tp is dt.date or issubclass(tp, dt.date) def _dataset_params(overrides: Mapping[Dataset, DatasetSpec]) -> Iterator[ParameterSet]: From 7ddb2a8c1e8ec6477cfc646c385e0b168f2fd330 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 17 Nov 2024 19:28:43 +0000 Subject: [PATCH 109/201] feat(DRAFT): Adds infer-based `altair.datasets.load` Requested by @joelostblom in: https://github.com/vega/altair/pull/3631#issuecomment-2480832609 https://github.com/vega/altair/pull/3631#issuecomment-2479333070 --- altair/datasets/__init__.py | 35 +++++++++++++++--------- altair/datasets/_readers.py | 32 +++++++++++++++++++++- tests/test_datasets.py | 54 +++++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 13 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 3760a4f2a..4545d36b0 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -23,7 +23,7 @@ from altair.datasets._readers import _Backend from altair.datasets._typing import Dataset, Extension, Version -__all__ = ["Loader", "data"] +__all__ = ["Loader", "load"] class Loader(Generic[IntoDataFrameT, IntoFrameT]): @@ -320,18 +320,29 @@ def __repr__(self) -> str: return f"{type(self).__name__}[{self._reader._name}]" +load: Loader[Any, Any] + + def __getattr__(name): - if name == "data": - global data - data = Loader.with_backend("pandas") - from altair.utils.deprecation import deprecated_warn - - deprecated_warn( - "Added only for backwards compatibility with `altair-viz/vega_datasets`.", - version="5.5.0", - alternative="altair.datasets.Loader.with_backend(...)", + if name == "load": + import warnings + + from altair.datasets._readers import infer_backend + + reader = infer_backend() + global load + load = Loader.__new__(Loader) + load._reader = reader + + warnings.warn( + "For full IDE completions, instead use:\n\n" + " from altair.datasets import Loader\n" + " load = Loader.with_backend(...)\n\n" + "Related: https://github.com/vega/altair/pull/3631#issuecomment-2480832609", + UserWarning, stacklevel=3, ) - return data + return load else: - raise AttributeError(name) + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index e55d28359..953401bae 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -11,7 +11,7 @@ import os import urllib.request -from collections.abc import Mapping, Sequence +from collections.abc import Iterable, Mapping, Sequence from functools import partial from importlib import import_module from importlib.util import find_spec @@ -475,6 +475,36 @@ def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]: return suffix == ".parquet" +def is_available(pkg_names: str | Iterable[str], *more_pkg_names: str) -> bool: + pkgs_names = pkg_names if not isinstance(pkg_names, str) else (pkg_names,) + names = chain(pkgs_names, more_pkg_names) + return all(find_spec(name) is not None for name in names) + + +def infer_backend( + *, priority: Sequence[_Backend] = ("polars", "pandas[pyarrow]", "pandas", "pyarrow") +) -> _Reader[Any, Any]: + """ + Return the first available reader in order of `priority`. 
+ + Notes + ----- + - ``"polars"``: can natively load every dataset (including ``(Geo|Topo)JSON``) + - ``"pandas[pyarrow]"``: can load *most* datasets, guarantees ``.parquet`` support + - ``"pandas"``: supports ``.parquet``, if `fastparquet`_ is installed + - ``"pyarrow"``: least reliable + + .. _fastparquet: + https://github.com/dask/fastparquet + + """ + it = (backend(name) for name in priority if is_available(_requirements(name))) + if reader := next(it, None): + return reader + msg = f"Found no supported backend, searched:\n" f"{priority!r}" + raise NotImplementedError(msg) + + @overload def backend(name: _PolarsAny, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 221666c35..f903d500a 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -3,7 +3,9 @@ import datetime as dt import re import sys +import warnings from functools import partial +from importlib import import_module from importlib.util import find_spec from typing import TYPE_CHECKING, Any, cast, get_args from urllib.error import URLError @@ -127,6 +129,58 @@ def test_loader_url(backend: _Backend) -> None: assert pattern.match(url) is not None +def test_load(monkeypatch: pytest.MonkeyPatch) -> None: + """ + Inferring the best backend available. + + Based on the following order: + + priority: Sequence[_Backend] = "polars", "pandas[pyarrow]", "pandas", "pyarrow" + """ + import altair.datasets + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=UserWarning) + from altair.datasets import load + + assert load._reader._name == "polars" + monkeypatch.delattr(altair.datasets, "load") + + monkeypatch.setitem(sys.modules, "polars", None) + + from altair.datasets import load + + if find_spec("pyarrow") is None: + # NOTE: We can end the test early for the CI job that removes `pyarrow` + assert load._reader._name == "pandas" + monkeypatch.delattr(altair.datasets, "load") + monkeypatch.setitem(sys.modules, "pandas", None) + with pytest.raises(NotImplementedError, match="no.+backend"): + from altair.datasets import load + else: + assert load._reader._name == "pandas[pyarrow]" + monkeypatch.delattr(altair.datasets, "load") + + monkeypatch.setitem(sys.modules, "pyarrow", None) + + from altair.datasets import load + + assert load._reader._name == "pandas" + monkeypatch.delattr(altair.datasets, "load") + + monkeypatch.setitem(sys.modules, "pandas", None) + monkeypatch.delitem(sys.modules, "pyarrow") + monkeypatch.setitem(sys.modules, "pyarrow", import_module("pyarrow")) + from altair.datasets import load + + assert load._reader._name == "pyarrow" + monkeypatch.delattr(altair.datasets, "load") + monkeypatch.setitem(sys.modules, "pyarrow", None) + + with pytest.raises(NotImplementedError, match="no.+backend"): + from altair.datasets import load + + @backends def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv(CACHE_ENV_VAR, raising=False) From 9544d9b68e1e6c1786d823cdd9ef3e961497cfa3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 18 Nov 2024 21:39:24 +0000 Subject: [PATCH 110/201] refactor: Rename `Loader.with_backend` -> `Loader.from_backend` https://github.com/vega/altair/pull/3631#discussion_r1847157544 --- altair/datasets/__init__.py | 28 ++++++++++++++-------------- tests/test_datasets.py | 24 ++++++++++++------------ 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/altair/datasets/__init__.py 
b/altair/datasets/__init__.py index 4545d36b0..d01ef6f60 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -34,7 +34,7 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): from altair.datasets import Loader - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") >>> data # doctest: +SKIP Loader[polars] @@ -46,24 +46,24 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): @overload @classmethod - def with_backend( + def from_backend( cls, backend_name: Literal["polars", "polars[pyarrow]"], / ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... @overload @classmethod - def with_backend( + def from_backend( cls, backend_name: Literal["pandas", "pandas[pyarrow]"], / ) -> Loader[pd.DataFrame, pd.DataFrame]: ... @overload @classmethod - def with_backend( + def from_backend( cls, backend_name: Literal["pyarrow"], / ) -> Loader[pa.Table, pa.Table]: ... @classmethod - def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: + def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: """ Initialize a new loader, with the specified backend. @@ -94,7 +94,7 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: from altair.datasets import Loader - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") cars = data("cars") >>> type(cars) # doctest: +SKIP @@ -102,7 +102,7 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: Using ``pandas``: - data = Loader.with_backend("pandas") + data = Loader.from_backend("pandas") cars = data("cars") >>> type(cars) # doctest: +SKIP @@ -110,7 +110,7 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: Using ``pandas``, backed by ``pyarrow`` dtypes: - data = Loader.with_backend("pandas[pyarrow]") + data = Loader.from_backend("pandas[pyarrow]") cars = data("cars", tag="v1.29.0") >>> type(cars) # doctest: +SKIP @@ -170,7 +170,7 @@ def __call__( from altair.datasets import Loader - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") source = data("stocks", tag="v2.10.0") >>> source.columns # doctest: +SKIP @@ -198,7 +198,7 @@ def __call__( Using ``pandas``: - data = Loader.with_backend("pandas") + data = Loader.from_backend("pandas") source = data("stocks", tag="v2.10.0") >>> source.columns # doctest: +SKIP @@ -222,7 +222,7 @@ def __call__( Using ``pyarrow``: - data = Loader.with_backend("pyarrow") + data = Loader.from_backend("pyarrow") source = data("stocks", tag="v2.10.0") >>> source.column_names # doctest: +SKIP @@ -276,7 +276,7 @@ def url( import altair as alt from altair.datasets import Loader - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") >>> data.url("cars", tag="v2.9.0") # doctest: +SKIP 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json' @@ -302,7 +302,7 @@ def cache_dir(self) -> Path | None: from altair.datasets import Loader - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") data.cache_dir = Path.home() / ".altair_cache" >>> data.cache_dir.relative_to(Path.home()).as_posix() # doctest: +SKIP @@ -337,7 +337,7 @@ def __getattr__(name): warnings.warn( "For full IDE completions, instead use:\n\n" " from altair.datasets import Loader\n" - " load = Loader.with_backend(...)\n\n" + " load = Loader.from_backend(...)\n\n" "Related: https://github.com/vega/altair/pull/3631#issuecomment-2480832609", UserWarning, stacklevel=3, diff --git a/tests/test_datasets.py b/tests/test_datasets.py index f903d500a..0d2deae7f 100644 --- 
a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -86,7 +86,7 @@ class DatasetSpec(TypedDict, total=False): def polars_loader( tmp_path_factory: pytest.TempPathFactory, ) -> Loader[pl.DataFrame, pl.LazyFrame]: - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") data.cache_dir = tmp_path_factory.mktemp("loader-cache-polars") return data @@ -112,14 +112,14 @@ def metadata_columns() -> frozenset[str]: @backends -def test_loader_with_backend(backend: _Backend) -> None: - data = Loader.with_backend(backend) +def test_loader_from_backend(backend: _Backend) -> None: + data = Loader.from_backend(backend) assert data._reader._name == backend @backends def test_loader_url(backend: _Backend) -> None: - data = Loader.with_backend(backend) + data = Loader.from_backend(backend) dataset_name = "volcano" pattern = re.compile( rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{dataset_name}\..+" @@ -185,7 +185,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv(CACHE_ENV_VAR, raising=False) - data = Loader.with_backend(backend) + data = Loader.from_backend(backend) frame = data("stocks", ".csv") assert is_into_dataframe(frame) nw_frame = nw.from_native(frame) @@ -208,7 +208,7 @@ def test_missing_dependency_single( flags=re.DOTALL, ), ): - Loader.with_backend(backend) + Loader.from_backend(backend) @pytest.mark.parametrize("backend", ["polars[pyarrow]", "pandas[pyarrow]"]) @@ -227,7 +227,7 @@ def test_missing_dependency_multi( flags=re.DOTALL, ), ): - Loader.with_backend(backend) + Loader.from_backend(backend) @backends @@ -239,7 +239,7 @@ def test_dataset_not_found(backend: _Backend) -> None: """ import polars as pl - data = Loader.with_backend(backend) + data = Loader.from_backend(backend) real_name: Literal["disasters"] = "disasters" real_suffix: Literal[".csv"] = ".csv" real_tag: Literal["v1.14.0"] = "v1.14.0" @@ -344,7 +344,7 @@ def test_reader_cache( monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) - data = Loader.with_backend(backend) + data = Loader.from_backend(backend) cache_dir = data.cache_dir assert cache_dir is not None assert cache_dir == tmp_path @@ -432,7 +432,7 @@ def test_pyarrow_read_json( if fallback is None: monkeypatch.setitem(sys.modules, "polars", None) - data = Loader.with_backend("pyarrow") + data = Loader.from_backend("pyarrow") data(name, ".json") @@ -497,7 +497,7 @@ def _raise_exception(e: type[Exception], *args: Any, **kwds: Any): def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: from polars.testing import assert_frame_equal - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") data.cache_dir = tmp_path data("londonCentroids") @@ -536,7 +536,7 @@ def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - @backends def test_metadata_columns(backend: _Backend, metadata_columns: frozenset[str]) -> None: """Ensure all backends will query the same column names.""" - data = Loader.with_backend(backend) + data = Loader.from_backend(backend) fn = data._reader.scan_fn(_METADATA) native = fn(_METADATA) schema_columns = nw.from_native(native).lazy().collect().columns From 7b3a89e5b5374eb391b7ae73ace219327069f979 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 18 Nov 2024 21:52:47 +0000 Subject: [PATCH 111/201] feat(DRAFT): Add optional `backend` parameter for `load(...)` Requested by @jonmmease 
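A minimal sketch of the intended call-style (illustrative only; it mirrors the overloads and the test added below, nothing beyond them):

    from altair.datasets import load

    cars = load("cars")                           # backend inferred by priority
    tbl = load("cars", backend="pyarrow")         # force a specific backend
    df = load("cars", backend="pandas[pyarrow]")  # pandas with pyarrow dtypes
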
https://github.com/vega/altair/pull/3631#discussion_r1847111064 https://github.com/vega/altair/pull/3631#discussion_r1847176465 --- altair/datasets/__init__.py | 94 +++++++++++++++++++++++++++++++------ tests/test_datasets.py | 81 ++++++++++++++++++++------------ 2 files changed, 132 insertions(+), 43 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index d01ef6f60..26fd39b20 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Generic, overload +from typing import TYPE_CHECKING, Generic, final, overload from narwhals.typing import IntoDataFrameT, IntoFrameT @@ -320,28 +320,94 @@ def __repr__(self) -> str: return f"{type(self).__name__}[{self._reader._name}]" -load: Loader[Any, Any] +@final +class _Load(Loader[IntoDataFrameT, IntoFrameT]): + @overload + def __call__( # pyright: ignore[reportOverlappingOverload] + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: None = ..., + **kwds: Any, + ) -> IntoDataFrameT: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["polars", "polars[pyarrow]"] = ..., + **kwds: Any, + ) -> pl.DataFrame: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["pandas", "pandas[pyarrow]"] = ..., + **kwds: Any, + ) -> pd.DataFrame: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["pyarrow"] = ..., + **kwds: Any, + ) -> pa.Table: ... 
+ def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, + backend: _Backend | None = None, + **kwds: Any, + ) -> IntoDataFrameT | pl.DataFrame | pd.DataFrame | pa.Table: + if backend is None: + return super().__call__(name, suffix, tag, **kwds) + else: + return self.from_backend(backend)(name, suffix, tag=tag, **kwds) + + +load: _Load[Any, Any] +""" +For full IDE completions, instead use: + + from altair.datasets import Loader + load = Loader.from_backend("polars") + cars = load("cars") + movies = load("movies") + +Alternatively, specify ``backend`` during a call: + + from altair.datasets import load + cars = load("cars", backend="polars") + movies = load("movies", backend="polars") + +Related +------- +- https://github.com/vega/altair/pull/3631#issuecomment-2480832609 +- https://github.com/vega/altair/pull/3631#discussion_r1847111064 +- https://github.com/vega/altair/pull/3631#discussion_r1847176465 +""" def __getattr__(name): if name == "load": - import warnings - from altair.datasets._readers import infer_backend reader = infer_backend() global load - load = Loader.__new__(Loader) + load = _Load.__new__(_Load) load._reader = reader - - warnings.warn( - "For full IDE completions, instead use:\n\n" - " from altair.datasets import Loader\n" - " load = Loader.from_backend(...)\n\n" - "Related: https://github.com/vega/altair/pull/3631#issuecomment-2480832609", - UserWarning, - stacklevel=3, - ) return load else: msg = f"module {__name__!r} has no attribute {name!r}" diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 0d2deae7f..3d986ec75 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -3,7 +3,6 @@ import datetime as dt import re import sys -import warnings from functools import partial from importlib import import_module from importlib.util import find_spec @@ -11,7 +10,12 @@ from urllib.error import URLError import pytest -from narwhals.dependencies import is_into_dataframe, is_polars_dataframe +from narwhals.dependencies import ( + is_into_dataframe, + is_pandas_dataframe, + is_polars_dataframe, + is_pyarrow_table, +) from narwhals.stable import v1 as nw from altair.datasets import Loader @@ -138,47 +142,66 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: priority: Sequence[_Backend] = "polars", "pandas[pyarrow]", "pandas", "pyarrow" """ import altair.datasets + from altair.datasets import load - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) - from altair.datasets import load + assert load._reader._name == "polars" + monkeypatch.delattr(altair.datasets, "load") + + monkeypatch.setitem(sys.modules, "polars", None) - assert load._reader._name == "polars" + from altair.datasets import load + + if find_spec("pyarrow") is None: + # NOTE: We can end the test early for the CI job that removes `pyarrow` + assert load._reader._name == "pandas" + monkeypatch.delattr(altair.datasets, "load") + monkeypatch.setitem(sys.modules, "pandas", None) + with pytest.raises(NotImplementedError, match="no.+backend"): + from altair.datasets import load + else: + assert load._reader._name == "pandas[pyarrow]" monkeypatch.delattr(altair.datasets, "load") - monkeypatch.setitem(sys.modules, "polars", None) + monkeypatch.setitem(sys.modules, "pyarrow", None) from altair.datasets import load - if find_spec("pyarrow") is None: - # NOTE: We can end the test early for the CI job that removes `pyarrow` - assert load._reader._name == "pandas" - 
monkeypatch.delattr(altair.datasets, "load") - monkeypatch.setitem(sys.modules, "pandas", None) - with pytest.raises(NotImplementedError, match="no.+backend"): - from altair.datasets import load - else: - assert load._reader._name == "pandas[pyarrow]" - monkeypatch.delattr(altair.datasets, "load") + assert load._reader._name == "pandas" + monkeypatch.delattr(altair.datasets, "load") + + monkeypatch.setitem(sys.modules, "pandas", None) + monkeypatch.delitem(sys.modules, "pyarrow") + monkeypatch.setitem(sys.modules, "pyarrow", import_module("pyarrow")) + from altair.datasets import load - monkeypatch.setitem(sys.modules, "pyarrow", None) + assert load._reader._name == "pyarrow" + monkeypatch.delattr(altair.datasets, "load") + monkeypatch.setitem(sys.modules, "pyarrow", None) + with pytest.raises(NotImplementedError, match="no.+backend"): from altair.datasets import load - assert load._reader._name == "pandas" - monkeypatch.delattr(altair.datasets, "load") - monkeypatch.setitem(sys.modules, "pandas", None) - monkeypatch.delitem(sys.modules, "pyarrow") - monkeypatch.setitem(sys.modules, "pyarrow", import_module("pyarrow")) - from altair.datasets import load +@requires_pyarrow +def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: + import altair.datasets + + monkeypatch.delattr(altair.datasets, "load", raising=False) + + load = altair.datasets.load + assert load._reader._name == "polars" - assert load._reader._name == "pyarrow" - monkeypatch.delattr(altair.datasets, "load") - monkeypatch.setitem(sys.modules, "pyarrow", None) + default = load("cars") + df_pyarrow = load("cars", backend="pyarrow") + df_pandas = load("cars", backend="pandas[pyarrow]") + default_2 = load("cars") + df_polars = load("cars", backend="polars") - with pytest.raises(NotImplementedError, match="no.+backend"): - from altair.datasets import load + assert is_polars_dataframe(default) + assert is_pyarrow_table(df_pyarrow) + assert is_pandas_dataframe(df_pandas) + assert is_polars_dataframe(default_2) + assert is_polars_dataframe(df_polars) @backends From c835c131282cc189b9bc4cc91bef2492c0b2dd36 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 20 Nov 2024 13:25:27 +0000 Subject: [PATCH 112/201] feat(DRAFT): Adds `altair.datasets.url` A dataframe package is still required currently,. Can later be adapted to fit the requirements of (https://github.com/vega/altair/pull/3631#discussion_r1846662053). 
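A rough usage sketch (illustrative; the address shown is just an example of the jsdelivr pattern, not a pinned value):

    import altair as alt
    from altair.datasets import url

    source = url("cars")  # e.g. 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json'
    alt.Chart(source).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q")
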
Related: - https://github.com/vega/altair/pull/3631#issuecomment-2484826592 - https://github.com/vega/altair/pull/3631#issuecomment-2480832711 - https://github.com/vega/altair/discussions/3150#discussioncomment-11280516 @mattijn, @joelostblom --- altair/datasets/__init__.py | 415 ++++-------------------------------- altair/datasets/_loader.py | 394 ++++++++++++++++++++++++++++++++++ tests/test_datasets.py | 59 ++++- 3 files changed, 491 insertions(+), 377 deletions(-) create mode 100644 altair/datasets/_loader.py diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 26fd39b20..ac7ac9f06 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -1,380 +1,23 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Generic, final, overload +from typing import TYPE_CHECKING -from narwhals.typing import IntoDataFrameT, IntoFrameT - -from altair.datasets._readers import _Reader, backend +from altair.datasets._loader import Loader if TYPE_CHECKING: import sys - from pathlib import Path - from typing import Any, Literal - - import pandas as pd - import polars as pl - import pyarrow as pa - from _typeshed import StrPath + from typing import Any if sys.version_info >= (3, 11): from typing import LiteralString else: from typing_extensions import LiteralString - from altair.datasets._readers import _Backend - from altair.datasets._typing import Dataset, Extension, Version - -__all__ = ["Loader", "load"] - - -class Loader(Generic[IntoDataFrameT, IntoFrameT]): - """ - Load examples **remotely** from `vega-datasets`_, with *optional* caching. - - A new ``Loader`` must be initialized by specifying a backend: - - from altair.datasets import Loader - - data = Loader.from_backend("polars") - >>> data # doctest: +SKIP - Loader[polars] - - .. _vega-datasets: - https://github.com/vega/vega-datasets - """ - - _reader: _Reader[IntoDataFrameT, IntoFrameT] - - @overload - @classmethod - def from_backend( - cls, backend_name: Literal["polars", "polars[pyarrow]"], / - ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... - - @overload - @classmethod - def from_backend( - cls, backend_name: Literal["pandas", "pandas[pyarrow]"], / - ) -> Loader[pd.DataFrame, pd.DataFrame]: ... - - @overload - @classmethod - def from_backend( - cls, backend_name: Literal["pyarrow"], / - ) -> Loader[pa.Table, pa.Table]: ... - - @classmethod - def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: - """ - Initialize a new loader, with the specified backend. - - Parameters - ---------- - backend_name - DataFrame package/config used to return data. - - * *polars*: Using `polars defaults`_ - * *polars[pyarrow]*: Using ``use_pyarrow=True`` - * *pandas*: Using `pandas defaults`_. - * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` - * *pyarrow*: (*Experimental*) - - .. warning:: - Most datasets use a `JSON format not supported`_ by ``pyarrow`` - - .. _polars defaults: - https://docs.pola.rs/api/python/stable/reference/io.html - .. _pandas defaults: - https://pandas.pydata.org/docs/reference/io.html - .. 
_JSON format not supported: - https://arrow.apache.org/docs/python/json.html#reading-json-files - - Examples - -------- - Using ``polars``: - - from altair.datasets import Loader - - data = Loader.from_backend("polars") - cars = data("cars") - - >>> type(cars) # doctest: +SKIP - polars.dataframe.frame.DataFrame - - Using ``pandas``: - - data = Loader.from_backend("pandas") - cars = data("cars") - - >>> type(cars) # doctest: +SKIP - pandas.core.frame.DataFrame - - Using ``pandas``, backed by ``pyarrow`` dtypes: - - data = Loader.from_backend("pandas[pyarrow]") - cars = data("cars", tag="v1.29.0") - - >>> type(cars) # doctest: +SKIP - pandas.core.frame.DataFrame - - >>> cars.dtypes # doctest: +SKIP - Name string[pyarrow] - Miles_per_Gallon double[pyarrow] - Cylinders int64[pyarrow] - Displacement double[pyarrow] - Horsepower int64[pyarrow] - Weight_in_lbs int64[pyarrow] - Acceleration double[pyarrow] - Year string[pyarrow] - Origin string[pyarrow] - dtype: object - """ - obj = Loader.__new__(Loader) - obj._reader = backend(backend_name) - return obj - - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - tag: Version | None = None, - **kwds: Any, - ) -> IntoDataFrameT: - """ - Get a remote dataset and load as tabular data. - - Parameters - ---------- - name - Name of the dataset/`Path.stem`_. - suffix - File extension/`Path.suffix`_. - - .. note:: - Only needed if ``name`` is available in multiple formats. - tag - Version identifier for a `vega-datasets release`_. - **kwds - Arguments passed to the underlying read function. - - .. _Path.stem: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem - .. _Path.suffix: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - .. _vega-datasets release: - https://github.com/vega/vega-datasets/releases - - Examples - -------- - Using ``polars``: - - from altair.datasets import Loader - data = Loader.from_backend("polars") - source = data("stocks", tag="v2.10.0") - - >>> source.columns # doctest: +SKIP - ['symbol', 'date', 'price'] - - >>> source # doctest: +SKIP - shape: (560, 3) - ┌────────┬────────────┬────────┐ - │ symbol ┆ date ┆ price │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ f64 │ - ╞════════╪════════════╪════════╡ - │ MSFT ┆ Jan 1 2000 ┆ 39.81 │ - │ MSFT ┆ Feb 1 2000 ┆ 36.35 │ - │ MSFT ┆ Mar 1 2000 ┆ 43.22 │ - │ MSFT ┆ Apr 1 2000 ┆ 28.37 │ - │ MSFT ┆ May 1 2000 ┆ 25.45 │ - │ … ┆ … ┆ … │ - │ AAPL ┆ Nov 1 2009 ┆ 199.91 │ - │ AAPL ┆ Dec 1 2009 ┆ 210.73 │ - │ AAPL ┆ Jan 1 2010 ┆ 192.06 │ - │ AAPL ┆ Feb 1 2010 ┆ 204.62 │ - │ AAPL ┆ Mar 1 2010 ┆ 223.02 │ - └────────┴────────────┴────────┘ - - Using ``pandas``: - - data = Loader.from_backend("pandas") - source = data("stocks", tag="v2.10.0") - - >>> source.columns # doctest: +SKIP - Index(['symbol', 'date', 'price'], dtype='object') - - >>> source # doctest: +SKIP - symbol date price - 0 MSFT Jan 1 2000 39.81 - 1 MSFT Feb 1 2000 36.35 - 2 MSFT Mar 1 2000 43.22 - 3 MSFT Apr 1 2000 28.37 - 4 MSFT May 1 2000 25.45 - .. ... ... ... 
- 555 AAPL Nov 1 2009 199.91 - 556 AAPL Dec 1 2009 210.73 - 557 AAPL Jan 1 2010 192.06 - 558 AAPL Feb 1 2010 204.62 - 559 AAPL Mar 1 2010 223.02 - - [560 rows x 3 columns] - - Using ``pyarrow``: - - data = Loader.from_backend("pyarrow") - source = data("stocks", tag="v2.10.0") - - >>> source.column_names # doctest: +SKIP - ['symbol', 'date', 'price'] - - >>> source # doctest: +SKIP - pyarrow.Table - symbol: string - date: string - price: double - ---- - symbol: [["MSFT","MSFT","MSFT","MSFT","MSFT",...,"AAPL","AAPL","AAPL","AAPL","AAPL"]] - date: [["Jan 1 2000","Feb 1 2000","Mar 1 2000","Apr 1 2000","May 1 2000",...,"Nov 1 2009","Dec 1 2009","Jan 1 2010","Feb 1 2010","Mar 1 2010"]] - price: [[39.81,36.35,43.22,28.37,25.45,...,199.91,210.73,192.06,204.62,223.02]] - """ - return self._reader.dataset(name, suffix, tag=tag, **kwds) - - def url( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - tag: Version | None = None, - ) -> str: - """ - Return the address of a remote dataset. - - Parameters - ---------- - name - Name of the dataset/`Path.stem`_. - suffix - File extension/`Path.suffix`_. - - .. note:: - Only needed if ``name`` is available in multiple formats. - tag - Version identifier for a `vega-datasets release`_. - - .. _Path.stem: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem - .. _Path.suffix: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - .. _vega-datasets release: - https://github.com/vega/vega-datasets/releases - - Examples - -------- - The returned url will always point to an accessible dataset: - - import altair as alt - from altair.datasets import Loader - - data = Loader.from_backend("polars") - >>> data.url("cars", tag="v2.9.0") # doctest: +SKIP - 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json' - - We can pass the result directly to a chart: - - url = data.url("cars", tag="v2.9.0") - alt.Chart(url).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q") - """ - return self._reader.url(name, suffix, tag=tag) - - @property - def cache_dir(self) -> Path | None: - """ - Returns path to datasets cache. - - By default, this can be configured using the environment variable: - - "ALTAIR_DATASETS_DIR" - - You *may* also set this directly, but the value will **not** persist between sessions: - - from pathlib import Path - - from altair.datasets import Loader - - data = Loader.from_backend("polars") - data.cache_dir = Path.home() / ".altair_cache" - - >>> data.cache_dir.relative_to(Path.home()).as_posix() # doctest: +SKIP - '.altair_cache' - """ - return self._reader._cache - - @cache_dir.setter - def cache_dir(self, source: StrPath, /) -> None: - import os - - os.environ[self._reader._ENV_VAR] = str(source) - - def __repr__(self) -> str: - return f"{type(self).__name__}[{self._reader._name}]" + from altair.datasets._loader import _Load + from altair.datasets._typing import Dataset, Extension, Version -@final -class _Load(Loader[IntoDataFrameT, IntoFrameT]): - @overload - def __call__( # pyright: ignore[reportOverlappingOverload] - self, - name: Dataset | LiteralString, - suffix: Extension | None = ..., - /, - tag: Version | None = ..., - backend: None = ..., - **kwds: Any, - ) -> IntoDataFrameT: ... - @overload - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = ..., - /, - tag: Version | None = ..., - backend: Literal["polars", "polars[pyarrow]"] = ..., - **kwds: Any, - ) -> pl.DataFrame: ... 
- @overload - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = ..., - /, - tag: Version | None = ..., - backend: Literal["pandas", "pandas[pyarrow]"] = ..., - **kwds: Any, - ) -> pd.DataFrame: ... - @overload - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = ..., - /, - tag: Version | None = ..., - backend: Literal["pyarrow"] = ..., - **kwds: Any, - ) -> pa.Table: ... - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - tag: Version | None = None, - backend: _Backend | None = None, - **kwds: Any, - ) -> IntoDataFrameT | pl.DataFrame | pd.DataFrame | pa.Table: - if backend is None: - return super().__call__(name, suffix, tag, **kwds) - else: - return self.from_backend(backend)(name, suffix, tag=tag, **kwds) +__all__ = ["Loader", "load", "url"] load: _Load[Any, Any] @@ -400,14 +43,50 @@ def __call__( """ +def url( + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, +) -> str: + """ + Return the address of a remote dataset. + + Parameters + ---------- + name + Name of the dataset/`Path.stem`_. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + tag + Version identifier for a `vega-datasets release`_. + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. _vega-datasets release: + https://github.com/vega/vega-datasets/releases + + Related + ------- + - https://github.com/vega/altair/pull/3631#issuecomment-2484826592 + - https://github.com/vega/altair/pull/3631#issuecomment-2480832711 + - https://github.com/vega/altair/discussions/3150#discussioncomment-11280516 + - https://github.com/vega/altair/pull/3631#discussion_r1846662053 + """ + from altair.datasets._loader import load + + return load.url(name, suffix, tag=tag) + + def __getattr__(name): if name == "load": - from altair.datasets._readers import infer_backend + from altair.datasets._loader import load - reader = infer_backend() - global load - load = _Load.__new__(_Load) - load._reader = reader return load else: msg = f"module {__name__!r} has no attribute {name!r}" diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py new file mode 100644 index 000000000..3c2a0ee21 --- /dev/null +++ b/altair/datasets/_loader.py @@ -0,0 +1,394 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Generic, final, overload + +from narwhals.typing import IntoDataFrameT, IntoFrameT + +from altair.datasets._readers import _Reader, backend + +if TYPE_CHECKING: + import sys + from pathlib import Path + from typing import Any, Literal + + import pandas as pd + import polars as pl + import pyarrow as pa + from _typeshed import StrPath + + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + from altair.datasets._readers import _Backend + from altair.datasets._typing import Dataset, Extension, Version + +__all__ = ["Loader", "load"] + + +class Loader(Generic[IntoDataFrameT, IntoFrameT]): + """ + Load examples **remotely** from `vega-datasets`_, with *optional* caching. + + A new ``Loader`` must be initialized by specifying a backend: + + from altair.datasets import Loader + + data = Loader.from_backend("polars") + >>> data # doctest: +SKIP + Loader[polars] + + .. 
_vega-datasets: + https://github.com/vega/vega-datasets + """ + + _reader: _Reader[IntoDataFrameT, IntoFrameT] + + @overload + @classmethod + def from_backend( + cls, backend_name: Literal["polars", "polars[pyarrow]"], / + ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... + + @overload + @classmethod + def from_backend( + cls, backend_name: Literal["pandas", "pandas[pyarrow]"], / + ) -> Loader[pd.DataFrame, pd.DataFrame]: ... + + @overload + @classmethod + def from_backend( + cls, backend_name: Literal["pyarrow"], / + ) -> Loader[pa.Table, pa.Table]: ... + + @classmethod + def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: + """ + Initialize a new loader, with the specified backend. + + Parameters + ---------- + backend_name + DataFrame package/config used to return data. + + * *polars*: Using `polars defaults`_ + * *polars[pyarrow]*: Using ``use_pyarrow=True`` + * *pandas*: Using `pandas defaults`_. + * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` + * *pyarrow*: (*Experimental*) + + .. warning:: + Most datasets use a `JSON format not supported`_ by ``pyarrow`` + + .. _polars defaults: + https://docs.pola.rs/api/python/stable/reference/io.html + .. _pandas defaults: + https://pandas.pydata.org/docs/reference/io.html + .. _JSON format not supported: + https://arrow.apache.org/docs/python/json.html#reading-json-files + + Examples + -------- + Using ``polars``: + + from altair.datasets import Loader + + data = Loader.from_backend("polars") + cars = data("cars") + + >>> type(cars) # doctest: +SKIP + polars.dataframe.frame.DataFrame + + Using ``pandas``: + + data = Loader.from_backend("pandas") + cars = data("cars") + + >>> type(cars) # doctest: +SKIP + pandas.core.frame.DataFrame + + Using ``pandas``, backed by ``pyarrow`` dtypes: + + data = Loader.from_backend("pandas[pyarrow]") + cars = data("cars", tag="v1.29.0") + + >>> type(cars) # doctest: +SKIP + pandas.core.frame.DataFrame + + >>> cars.dtypes # doctest: +SKIP + Name string[pyarrow] + Miles_per_Gallon double[pyarrow] + Cylinders int64[pyarrow] + Displacement double[pyarrow] + Horsepower int64[pyarrow] + Weight_in_lbs int64[pyarrow] + Acceleration double[pyarrow] + Year string[pyarrow] + Origin string[pyarrow] + dtype: object + """ + obj = Loader.__new__(Loader) + obj._reader = backend(backend_name) + return obj + + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, + **kwds: Any, + ) -> IntoDataFrameT: + """ + Get a remote dataset and load as tabular data. + + Parameters + ---------- + name + Name of the dataset/`Path.stem`_. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + tag + Version identifier for a `vega-datasets release`_. + **kwds + Arguments passed to the underlying read function. + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. 
_vega-datasets release: + https://github.com/vega/vega-datasets/releases + + Examples + -------- + Using ``polars``: + + from altair.datasets import Loader + + data = Loader.from_backend("polars") + source = data("stocks", tag="v2.10.0") + + >>> source.columns # doctest: +SKIP + ['symbol', 'date', 'price'] + + >>> source # doctest: +SKIP + shape: (560, 3) + ┌────────┬────────────┬────────┐ + │ symbol ┆ date ┆ price │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ f64 │ + ╞════════╪════════════╪════════╡ + │ MSFT ┆ Jan 1 2000 ┆ 39.81 │ + │ MSFT ┆ Feb 1 2000 ┆ 36.35 │ + │ MSFT ┆ Mar 1 2000 ┆ 43.22 │ + │ MSFT ┆ Apr 1 2000 ┆ 28.37 │ + │ MSFT ┆ May 1 2000 ┆ 25.45 │ + │ … ┆ … ┆ … │ + │ AAPL ┆ Nov 1 2009 ┆ 199.91 │ + │ AAPL ┆ Dec 1 2009 ┆ 210.73 │ + │ AAPL ┆ Jan 1 2010 ┆ 192.06 │ + │ AAPL ┆ Feb 1 2010 ┆ 204.62 │ + │ AAPL ┆ Mar 1 2010 ┆ 223.02 │ + └────────┴────────────┴────────┘ + + Using ``pandas``: + + data = Loader.from_backend("pandas") + source = data("stocks", tag="v2.10.0") + + >>> source.columns # doctest: +SKIP + Index(['symbol', 'date', 'price'], dtype='object') + + >>> source # doctest: +SKIP + symbol date price + 0 MSFT Jan 1 2000 39.81 + 1 MSFT Feb 1 2000 36.35 + 2 MSFT Mar 1 2000 43.22 + 3 MSFT Apr 1 2000 28.37 + 4 MSFT May 1 2000 25.45 + .. ... ... ... + 555 AAPL Nov 1 2009 199.91 + 556 AAPL Dec 1 2009 210.73 + 557 AAPL Jan 1 2010 192.06 + 558 AAPL Feb 1 2010 204.62 + 559 AAPL Mar 1 2010 223.02 + + [560 rows x 3 columns] + + Using ``pyarrow``: + + data = Loader.from_backend("pyarrow") + source = data("stocks", tag="v2.10.0") + + >>> source.column_names # doctest: +SKIP + ['symbol', 'date', 'price'] + + >>> source # doctest: +SKIP + pyarrow.Table + symbol: string + date: string + price: double + ---- + symbol: [["MSFT","MSFT","MSFT","MSFT","MSFT",...,"AAPL","AAPL","AAPL","AAPL","AAPL"]] + date: [["Jan 1 2000","Feb 1 2000","Mar 1 2000","Apr 1 2000","May 1 2000",...,"Nov 1 2009","Dec 1 2009","Jan 1 2010","Feb 1 2010","Mar 1 2010"]] + price: [[39.81,36.35,43.22,28.37,25.45,...,199.91,210.73,192.06,204.62,223.02]] + """ + return self._reader.dataset(name, suffix, tag=tag, **kwds) + + def url( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, + ) -> str: + """ + Return the address of a remote dataset. + + Parameters + ---------- + name + Name of the dataset/`Path.stem`_. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + tag + Version identifier for a `vega-datasets release`_. + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. _vega-datasets release: + https://github.com/vega/vega-datasets/releases + + Examples + -------- + The returned url will always point to an accessible dataset: + + import altair as alt + from altair.datasets import Loader + + data = Loader.from_backend("polars") + >>> data.url("cars", tag="v2.9.0") # doctest: +SKIP + 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json' + + We can pass the result directly to a chart: + + url = data.url("cars", tag="v2.9.0") + alt.Chart(url).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q") + """ + return self._reader.url(name, suffix, tag=tag) + + @property + def cache_dir(self) -> Path | None: + """ + Returns path to datasets cache. 
+ + By default, this can be configured using the environment variable: + + "ALTAIR_DATASETS_DIR" + + You *may* also set this directly, but the value will **not** persist between sessions: + + from pathlib import Path + + from altair.datasets import Loader + + data = Loader.from_backend("polars") + data.cache_dir = Path.home() / ".altair_cache" + + >>> data.cache_dir.relative_to(Path.home()).as_posix() # doctest: +SKIP + '.altair_cache' + """ + return self._reader._cache + + @cache_dir.setter + def cache_dir(self, source: StrPath, /) -> None: + import os + + os.environ[self._reader._ENV_VAR] = str(source) + + def __repr__(self) -> str: + return f"{type(self).__name__}[{self._reader._name}]" + + +@final +class _Load(Loader[IntoDataFrameT, IntoFrameT]): + @overload + def __call__( # pyright: ignore[reportOverlappingOverload] + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: None = ..., + **kwds: Any, + ) -> IntoDataFrameT: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["polars", "polars[pyarrow]"] = ..., + **kwds: Any, + ) -> pl.DataFrame: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["pandas", "pandas[pyarrow]"] = ..., + **kwds: Any, + ) -> pd.DataFrame: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["pyarrow"] = ..., + **kwds: Any, + ) -> pa.Table: ... + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, + backend: _Backend | None = None, + **kwds: Any, + ) -> IntoDataFrameT | pl.DataFrame | pd.DataFrame | pa.Table: + if backend is None: + return super().__call__(name, suffix, tag, **kwds) + else: + return self.from_backend(backend)(name, suffix, tag=tag, **kwds) + + +load: _Load[Any, Any] + + +def __getattr__(name): + if name == "load": + from altair.datasets._readers import infer_backend + + reader = infer_backend() + global load + load = _Load.__new__(_Load) + load._reader = reader + return load + else: + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 3d986ec75..6de691ff2 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -141,11 +141,11 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: priority: Sequence[_Backend] = "polars", "pandas[pyarrow]", "pandas", "pyarrow" """ - import altair.datasets + import altair.datasets._loader from altair.datasets import load assert load._reader._name == "polars" - monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load", raising=False) monkeypatch.setitem(sys.modules, "polars", None) @@ -154,20 +154,20 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: if find_spec("pyarrow") is None: # NOTE: We can end the test early for the CI job that removes `pyarrow` assert load._reader._name == "pandas" - monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pandas", None) with pytest.raises(NotImplementedError, match="no.+backend"): from altair.datasets import load else: assert load._reader._name == "pandas[pyarrow]" - 
monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pyarrow", None) from altair.datasets import load assert load._reader._name == "pandas" - monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pandas", None) monkeypatch.delitem(sys.modules, "pyarrow") @@ -175,7 +175,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: from altair.datasets import load assert load._reader._name == "pyarrow" - monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pyarrow", None) with pytest.raises(NotImplementedError, match="no.+backend"): @@ -184,11 +184,11 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: @requires_pyarrow def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: - import altair.datasets + import altair.datasets._loader - monkeypatch.delattr(altair.datasets, "load", raising=False) + monkeypatch.delattr(altair.datasets._loader, "load", raising=False) + from altair.datasets import load - load = altair.datasets.load assert load._reader._name == "polars" default = load("cars") @@ -204,6 +204,47 @@ def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: assert is_polars_dataframe(df_polars) +@pytest.mark.parametrize( + "name", + [ + "jobs", + "la-riots", + "londonBoroughs", + "londonCentroids", + "londonTubeLines", + "lookup_groups", + "lookup_people", + "miserables", + "monarchs", + "movies", + "normal-2d", + "obesity", + "ohlc", + "penguins", + "platformer-terrain", + "points", + "political-contributions", + "population", + "population_engineers_hurricanes", + "seattle-temps", + "seattle-weather", + "seattle-weather-hourly-normals", + "sf-temps", + "sp500", + "sp500-2000", + "stocks", + "udistrict", + ], +) +def test_url(name: Dataset) -> None: + from altair.datasets import url + + pattern = re.compile(rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{name}\..+") + result = url(name) + assert isinstance(result, str) + assert pattern.match(result) is not None + + @backends def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv(CACHE_ENV_VAR, raising=False) From 0817ff8503f728a4bc0c8d160abaab311f829fd7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 20 Nov 2024 21:46:22 +0000 Subject: [PATCH 113/201] feat: Support `url(...)` without dependencies https://github.com/vega/altair/pull/3631#discussion_r1846662053, https://github.com/vega/altair/pull/3631#issuecomment-2488621316, https://github.com/vega/altair/pull/3631#issuecomment-2481977891 --- altair/datasets/__init__.py | 13 ++++- altair/datasets/_loader.py | 77 +++++++++++++++++++++++++-- altair/datasets/_metadata/url.csv.gz | Bin 0 -> 855 bytes altair/datasets/_readers.py | 5 +- tests/test_datasets.py | 70 +++++++++++++++++++----- tools/datasets/__init__.py | 23 ++++++++ 6 files changed, 168 insertions(+), 20 deletions(-) create mode 100644 altair/datasets/_metadata/url.csv.gz diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index ac7ac9f06..e426ca467 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -78,9 +78,18 @@ def url( - https://github.com/vega/altair/discussions/3150#discussioncomment-11280516 - https://github.com/vega/altair/pull/3631#discussion_r1846662053 """ - from altair.datasets._loader import load + from 
altair.datasets._readers import AltairDatasetsError - return load.url(name, suffix, tag=tag) + try: + from altair.datasets._loader import load + + url = load.url(name, suffix, tag=tag) + except AltairDatasetsError: + from altair.datasets._loader import url_cache + + url = url_cache[name] + + return url def __getattr__(name): diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 3c2a0ee21..5d8c1ec8b 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Generic, final, overload +from pathlib import Path +from typing import TYPE_CHECKING, Generic, TypeVar, final, get_args, overload from narwhals.typing import IntoDataFrameT, IntoFrameT @@ -8,8 +9,8 @@ if TYPE_CHECKING: import sys - from pathlib import Path - from typing import Any, Literal + from collections.abc import MutableMapping + from typing import Any, Final, Literal import pandas as pd import polars as pl @@ -23,8 +24,15 @@ from altair.datasets._readers import _Backend from altair.datasets._typing import Dataset, Extension, Version + __all__ = ["Loader", "load"] +_KT = TypeVar("_KT") +_VT = TypeVar("_VT") +_T = TypeVar("_T") + +_URL: Final[Path] = Path(__file__).parent / "_metadata" / "url.csv.gz" + class Loader(Generic[IntoDataFrameT, IntoFrameT]): """ @@ -377,6 +385,69 @@ def __call__( return self.from_backend(backend)(name, suffix, tag=tag, **kwds) +class UrlCache(Generic[_KT, _VT]): + """ + `csv`_, `gzip`_ -based, lazy url lookup. + + Operates on a subset of available datasets: + - Only the latest version + - Excludes `.parquet`, which `cannot be read via url`_ + - Name collisions are pre-resolved + - Only provide the smallest (e.g. ``weather.json`` instead of ``weather.csv``) + + .. _csv: + https://docs.python.org/3/library/csv.html + .. _gzip: + https://docs.python.org/3/library/gzip.html + .. _cannot be read via url: + https://github.com/vega/vega/issues/3961 + """ + + def __init__( + self, + fp: Path, + /, + *, + columns: tuple[str, str] = ("dataset_name", "url_npm"), + tp: type[MutableMapping[_KT, _VT]] = dict["_KT", "_VT"], + ) -> None: + self.fp: Path = fp + self.columns: tuple[str, str] = columns + self._mapping: MutableMapping[_KT, _VT] = tp() + + def read(self) -> Any: + import csv + import gzip + + with gzip.open(self.fp, mode="rb") as f: + b_lines = f.readlines() + reader = csv.reader((bs.decode() for bs in b_lines), dialect=csv.unix_dialect) + header = tuple(next(reader)) + if header != self.columns: + msg = f"Expected header to match {self.columns!r},\n" f"but got: {header!r}" + raise ValueError(msg) + return dict(reader) + + def __getitem__(self, key: _KT, /) -> _VT: + if url := self.get(key, None): + return url + + from altair.datasets._typing import Dataset + + if key in get_args(Dataset): + msg = f"{key!r} cannot be loaded via url." + raise TypeError(msg) + else: + msg = f"{key!r} does not refer to a known dataset." 
+ raise TypeError(msg) + + def get(self, key: _KT, default: _T) -> _VT | _T: + if not self._mapping: + self._mapping.update(self.read()) + return self._mapping.get(key, default) + + +url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL) load: _Load[Any, Any] diff --git a/altair/datasets/_metadata/url.csv.gz b/altair/datasets/_metadata/url.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..3580606d7cca77cefee4c5bd2b48134f9fac22d9 GIT binary patch literal 855 zcmV-d1E~BTiwFn-B0gsV|8;U~E@N|c0Ik?dQ`;~Q0N^{n1vw!zOlNxPwAUVc&&ZNu zi^y7452x_!tCyXSP&%W{2?*?`_3GgjoO3H3_r`tQY#7(w zi{nDc*>+m^P5g_^ECxz=iFM!RUHA0VZ8zzIO$zRe9v-N)2CR3@(gJkM%@0)TKov1o zFhp|ilo$y*!j8ez3xrvK!u8ZD@!E`)@JdO`owxER+G}`W zEtvIEBdio&C`N62QYpAHupLh2qjt z=LMpUtB{|STRBTTv}+~4Bqyl#OjpT<8rlR6jb?__SlKys|TI+ysb(2Ji^3oO1m3l7I%_CtIcgP|{!TI~FZ z5nzH6z(o`Ca%eSoP~I-#4E#o3^u+CDCVsCkDHGIC#d&IkW>6RrX~Y| zRj;Hh`SzhdXFnSGUPBezJa4zDciy(MD{&TaSaCeCBciT3JWC;7FH^hF-VLupS%yK! z7D>VD6yKbLG7HYdW|IepyP<#1-VS}2fjXZmq-8pJFh~EHsEMX~-qgJq{nRH8>u8dgNo+|G_y6nVA$Qi?D~SmD^hzYb99B`0-a+w4v7E$To`#? hPGA)$PnhL%CQ6!b(lFqNy}1B6!M}FHK(GfF008u|rd9v| literal 0 HcmV?d00001 diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 953401bae..e93fb55e1 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -83,6 +83,9 @@ _METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" +class AltairDatasetsError(Exception): ... + + class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): """ Describes basic IO for remote & local tabular resources. @@ -502,7 +505,7 @@ def infer_backend( if reader := next(it, None): return reader msg = f"Found no supported backend, searched:\n" f"{priority!r}" - raise NotImplementedError(msg) + raise AltairDatasetsError(msg) @overload diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 6de691ff2..e5d1f1d3f 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,5 +1,6 @@ from __future__ import annotations +import contextlib import datetime as dt import re import sys @@ -18,8 +19,8 @@ ) from narwhals.stable import v1 as nw -from altair.datasets import Loader -from altair.datasets._readers import _METADATA +from altair.datasets import Loader, url +from altair.datasets._readers import _METADATA, AltairDatasetsError from altair.datasets._typing import Dataset, Extension, Metadata, Version from tests import skip_requires_pyarrow, slow @@ -115,6 +116,13 @@ def metadata_columns() -> frozenset[str]: ) +def match_url(name: Dataset, url: str) -> bool: + return ( + re.match(rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{name}\..+", url) + is not None + ) + + @backends def test_loader_from_backend(backend: _Backend) -> None: data = Loader.from_backend(backend) @@ -124,13 +132,8 @@ def test_loader_from_backend(backend: _Backend) -> None: @backends def test_loader_url(backend: _Backend) -> None: data = Loader.from_backend(backend) - dataset_name = "volcano" - pattern = re.compile( - rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{dataset_name}\..+" - ) - url = data.url(dataset_name) - assert isinstance(url, str) - assert pattern.match(url) is not None + dataset_name: Dataset = "volcano" + assert match_url(dataset_name, data.url(dataset_name)) def test_load(monkeypatch: pytest.MonkeyPatch) -> None: @@ -178,7 +181,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pyarrow", None) - with pytest.raises(NotImplementedError, match="no.+backend"): + 
with pytest.raises(AltairDatasetsError, match="no.+backend"): from altair.datasets import load @@ -239,10 +242,49 @@ def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: def test_url(name: Dataset) -> None: from altair.datasets import url - pattern = re.compile(rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{name}\..+") - result = url(name) - assert isinstance(result, str) - assert pattern.match(result) is not None + assert match_url(name, url(name)) + + +def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: + import altair.datasets + from altair.datasets._loader import url_cache + + monkeypatch.setitem(sys.modules, "polars", None) + monkeypatch.setitem(sys.modules, "pandas", None) + monkeypatch.setitem(sys.modules, "pyarrow", None) + + assert url_cache._mapping == {} + + with contextlib.suppress(AltairDatasetsError): + monkeypatch.delattr(altair.datasets._loader, "load", raising=False) + with pytest.raises(AltairDatasetsError): + from altair.datasets import load as load + + assert match_url("jobs", url("jobs")) + + assert url_cache._mapping != {} + + assert match_url("cars", url("cars")) + assert match_url("stocks", url("stocks")) + assert match_url("countries", url("countries")) + assert match_url("crimea", url("crimea")) + assert match_url("disasters", url("disasters")) + assert match_url("driving", url("driving")) + assert match_url("earthquakes", url("earthquakes")) + assert match_url("flare", url("flare")) + assert match_url("flights-10k", url("flights-10k")) + assert match_url("flights-200k", url("flights-200k")) + + with pytest.raises(TypeError, match="cannot be loaded via url"): + url("climate") + + with pytest.raises(TypeError, match="cannot be loaded via url"): + url("flights-3m") + + with pytest.raises( + TypeError, match="'fake data' does not refer to a known dataset" + ): + url("fake data") @backends diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 3702028ac..ae4d0b583 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -15,12 +15,15 @@ from __future__ import annotations +import gzip import json import types +from io import BytesIO from pathlib import Path from typing import TYPE_CHECKING, Any, Literal import polars as pl +from polars import col from tools.codemod import ruff from tools.datasets.github import GitHub @@ -107,6 +110,7 @@ def __init__( } ) self._fp_typing: Path = out_fp_typing + self._fp_url: Path = out_dir_altair / "url.csv.gz" @property def github(self) -> GitHub: @@ -135,6 +139,14 @@ def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: gh_trees = self.github.refresh_trees(gh_tags) self.write_parquet(gh_trees, self._paths["gh_trees"]) + npm_urls_min = ( + gh_trees.lazy() + .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") + .filter(col("size") == col("size").min().over("dataset_name")) + .select("dataset_name", "url_npm") + ) + self.write_csv_gzip(npm_urls_min, self._fp_url) + if include_typing: self.generate_typing(self._fp_typing) return gh_trees @@ -159,6 +171,17 @@ def _from_alias(self, name: _PathAlias, /) -> Path: else: return self._paths[name] + def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: + if fp.suffix != ".gz": + fp = fp.with_suffix(".csv.gz") + if not fp.exists(): + fp.touch() + df = frame.lazy().collect() + buf = BytesIO() + with gzip.open(fp, mode="wb") as f: + df.write_csv(buf) + f.write(buf.getbuffer()) + def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: """Write ``frame`` 
to ``fp``, with some extra safety.""" if not fp.exists(): From e01fdd727b2bbfa389e995d126506d647d60ea9f Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 20 Nov 2024 21:52:32 +0000 Subject: [PATCH 114/201] fix(DRAFT): Don't generate csv on refresh https://github.com/vega/altair/actions/runs/11942284568/job/33288974210?pr=3631 --- altair/datasets/_metadata/url.csv.gz | Bin 855 -> 855 bytes tools/datasets/__init__.py | 21 +++++++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/altair/datasets/_metadata/url.csv.gz b/altair/datasets/_metadata/url.csv.gz index 3580606d7cca77cefee4c5bd2b48134f9fac22d9..49a227404cc162e9177aee307c297cfffd4869a1 100644 GIT binary patch delta 15 Wcmcc4cAbq)zMF$1Dsm%RATt0aRs=Z! delta 15 Wcmcc4cAbq)zMF%CQE4MvATt0XEd!|l diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index ae4d0b583..398c06f84 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -120,7 +120,9 @@ def github(self) -> GitHub: def npm(self) -> Npm: return self._npm - def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: + def refresh( + self, *, include_typing: bool = False, include_csv: bool = False + ) -> pl.DataFrame: """ Update and sync all dataset metadata files. @@ -139,13 +141,16 @@ def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: gh_trees = self.github.refresh_trees(gh_tags) self.write_parquet(gh_trees, self._paths["gh_trees"]) - npm_urls_min = ( - gh_trees.lazy() - .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") - .filter(col("size") == col("size").min().over("dataset_name")) - .select("dataset_name", "url_npm") - ) - self.write_csv_gzip(npm_urls_min, self._fp_url) + if include_csv: + # BUG: Non-deterministic + # https://github.com/vega/altair/actions/runs/11942284568/job/33288974210?pr=3631 + npm_urls_min = ( + gh_trees.lazy() + .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") + .filter(col("size") == col("size").min().over("dataset_name")) + .select("dataset_name", "url_npm") + ) + self.write_csv_gzip(npm_urls_min, self._fp_url) if include_typing: self.generate_typing(self._fp_typing) From 0c5195e92d428033b311b784b30c69f5ebeac6ee Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 20 Nov 2024 21:57:19 +0000 Subject: [PATCH 115/201] test: Replace rogue `NotImplementedError` https://github.com/vega/altair/actions/runs/11942364658/job/33289235198?pr=3631 --- tests/test_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index e5d1f1d3f..a4bbe40c4 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -159,7 +159,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: assert load._reader._name == "pandas" monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pandas", None) - with pytest.raises(NotImplementedError, match="no.+backend"): + with pytest.raises(AltairDatasetsError, match="no.+backend"): from altair.datasets import load else: assert load._reader._name == "pandas[pyarrow]" From 5595d905c29a89d6388b12b46caa016e9cd91d27 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 21 Nov 2024 10:50:12 +0000 Subject: [PATCH 116/201] fix: Omit `.gz` last modification time header Previously was creating a diff on every refresh, since the current time updated. 
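A standalone sketch (not part of this patch) of why pinning ``mtime`` makes the compressed bytes reproducible:

    import gzip
    from io import BytesIO

    def compress_deterministic(data: bytes) -> bytes:
        buf = BytesIO()
        # mtime=0 fixes the 4-byte MTIME field in the gzip header, so the
        # same input always yields byte-identical output (no diff on refresh).
        with gzip.GzipFile(fileobj=buf, mode="wb", mtime=0) as f:
            f.write(data)
        return buf.getvalue()

    assert compress_deterministic(b"a,b\n1,2\n") == compress_deterministic(b"a,b\n1,2\n")
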
https://docs.python.org/3/library/gzip.html#gzip.GzipFile.mtime https://github.com/vega/altair/actions/runs/11942284568/job/33288974210?pr=3631 --- altair/datasets/_metadata/url.csv.gz | Bin 855 -> 855 bytes tools/datasets/__init__.py | 23 +++++++++-------------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/altair/datasets/_metadata/url.csv.gz b/altair/datasets/_metadata/url.csv.gz index 49a227404cc162e9177aee307c297cfffd4869a1..07cb52ec1c834808609b204ed2ffe0b4cd83f62e 100644 GIT binary patch delta 17 Xcmcc4cAbqwzMF%C0SGp7_%j0lCyxV& delta 17 Ycmcc4cAbqwzMF$1D$*`}BZogT05C-a-~a#s diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 398c06f84..a3690f65f 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -120,9 +120,7 @@ def github(self) -> GitHub: def npm(self) -> Npm: return self._npm - def refresh( - self, *, include_typing: bool = False, include_csv: bool = False - ) -> pl.DataFrame: + def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: """ Update and sync all dataset metadata files. @@ -141,16 +139,13 @@ def refresh( gh_trees = self.github.refresh_trees(gh_tags) self.write_parquet(gh_trees, self._paths["gh_trees"]) - if include_csv: - # BUG: Non-deterministic - # https://github.com/vega/altair/actions/runs/11942284568/job/33288974210?pr=3631 - npm_urls_min = ( - gh_trees.lazy() - .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") - .filter(col("size") == col("size").min().over("dataset_name")) - .select("dataset_name", "url_npm") - ) - self.write_csv_gzip(npm_urls_min, self._fp_url) + npm_urls_min = ( + gh_trees.lazy() + .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") + .filter(col("size") == col("size").min().over("dataset_name")) + .select("dataset_name", "url_npm") + ) + self.write_csv_gzip(npm_urls_min, self._fp_url) if include_typing: self.generate_typing(self._fp_typing) @@ -183,7 +178,7 @@ def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> Non fp.touch() df = frame.lazy().collect() buf = BytesIO() - with gzip.open(fp, mode="wb") as f: + with gzip.GzipFile(fp, mode="wb", mtime=0) as f: df.write_csv(buf) f.write(buf.getbuffer()) From 9f621519ac4eb84e506632d81e6b794e55eee00c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 21 Nov 2024 11:01:02 +0000 Subject: [PATCH 117/201] docs: Add doc for `Application.write_csv_gzip` --- tools/datasets/__init__.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index a3690f65f..26955e9c0 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -172,6 +172,17 @@ def _from_alias(self, name: _PathAlias, /) -> Path: return self._paths[name] def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: + """ + Write ``frame`` as a `gzip`_ compressed `csv`_ file. + + - *Much smaller* than a regular ``.csv``. + - Still readable using ``stdlib`` modules. + + .. _gzip: + https://docs.python.org/3/library/gzip.html + .. 
_csv: + https://docs.python.org/3/library/csv.html + """ if fp.suffix != ".gz": fp = fp.with_suffix(".csv.gz") if not fp.exists(): From 1bd455206d5898800ae87d7c22cafba05c9c012e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 21 Nov 2024 12:34:02 +0000 Subject: [PATCH 118/201] revert: Remove `"polars[pyarrow]" backend Partially related to https://github.com/vega/altair/pull/3631#issuecomment-2484826592 After some thought, this backend didn't add support for any unique dependency configs. I've only ever used `use_pyarrow=True` for `pl.DataFrame.write_parquet` to resolve an issue with invalid headers in `"polars<1.0.0;>=0.19.0"` --- altair/datasets/_loader.py | 5 ++--- altair/datasets/_readers.py | 32 +++----------------------------- tests/test_datasets.py | 5 ++--- 3 files changed, 7 insertions(+), 35 deletions(-) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 5d8c1ec8b..3e31aea2e 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -55,7 +55,7 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): @overload @classmethod def from_backend( - cls, backend_name: Literal["polars", "polars[pyarrow]"], / + cls, backend_name: Literal["polars"], / ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... @overload @@ -81,7 +81,6 @@ def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: DataFrame package/config used to return data. * *polars*: Using `polars defaults`_ - * *polars[pyarrow]*: Using ``use_pyarrow=True`` * *pandas*: Using `pandas defaults`_. * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` * *pyarrow*: (*Experimental*) @@ -347,7 +346,7 @@ def __call__( suffix: Extension | None = ..., /, tag: Version | None = ..., - backend: Literal["polars", "polars[pyarrow]"] = ..., + backend: Literal["polars"] = ..., **kwds: Any, ) -> pl.DataFrame: ... @overload diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index e93fb55e1..f7b8aecf5 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -73,9 +73,8 @@ _Pandas: TypeAlias = Literal["pandas"] _PyArrow: TypeAlias = Literal["pyarrow"] _ConcreteT = TypeVar("_ConcreteT", _Polars, _Pandas, _PyArrow) - _PolarsAny: TypeAlias = Literal[_Polars, "polars[pyarrow]"] _PandasAny: TypeAlias = Literal[_Pandas, "pandas[pyarrow]"] - _Backend: TypeAlias = Literal[_PolarsAny, _PandasAny, _PyArrow] + _Backend: TypeAlias = Literal[_Polars, _PandasAny, _PyArrow] __all__ = ["backend"] @@ -332,25 +331,6 @@ def __init__(self, name: _Polars, /) -> None: self._scan_fn = {".parquet": pl.scan_parquet} -class _PolarsPyArrowReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: - _pl, _pa = _requirements(name) - self._name = name - if not TYPE_CHECKING: - pl = self._import(_pl) - pa = self._import(_pa) # noqa: F841 - self._read_fn = { - ".csv": partial(pl.read_csv, use_pyarrow=True, try_parse_dates=True), - ".json": _pl_read_json_roundtrip, - ".tsv": partial( - pl.read_csv, separator="\t", use_pyarrow=True, try_parse_dates=True - ), - ".arrow": partial(pl.read_ipc, use_pyarrow=True), - ".parquet": partial(pl.read_parquet, use_pyarrow=True), - } - self._scan_fn = {".parquet": pl.scan_parquet} - - class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): """ Reader backed by `pyarrow.Table`_. @@ -509,7 +489,7 @@ def infer_backend( @overload -def backend(name: _PolarsAny, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... 
+def backend(name: _Polars, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... @overload @@ -524,8 +504,6 @@ def backend(name: _Backend, /) -> _Reader[Any, Any]: """Reader initialization dispatcher.""" if name == "polars": return _PolarsReader(name) - elif name == "polars[pyarrow]": - return _PolarsPyArrowReader(name) elif name == "pandas[pyarrow]": return _PandasPyArrowReader(name) elif name == "pandas": @@ -548,10 +526,6 @@ def _requirements(s: _ConcreteT, /) -> _ConcreteT: ... def _requirements(s: Literal["pandas[pyarrow]"], /) -> tuple[_Pandas, _PyArrow]: ... -@overload -def _requirements(s: Literal["polars[pyarrow]"], /) -> tuple[_Polars, _PyArrow]: ... - - def _requirements(s: _Backend, /): concrete: set[Literal[_Polars, _Pandas, _PyArrow]] = {"polars", "pandas", "pyarrow"} if s in concrete: @@ -560,7 +534,7 @@ def _requirements(s: _Backend, /): from packaging.requirements import Requirement req = Requirement(s) - supports_extras: set[Literal[_Polars, _Pandas]] = {"polars", "pandas"} + supports_extras: set[Literal[_Pandas]] = {"pandas"} if req.name in supports_extras: name = req.name if (extras := req.extras) and extras == {"pyarrow"}: diff --git a/tests/test_datasets.py b/tests/test_datasets.py index a4bbe40c4..e31f7990e 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -68,7 +68,6 @@ class DatasetSpec(TypedDict, total=False): ), ), ), - pytest.param("polars[pyarrow]", marks=requires_pyarrow), pytest.param("pandas[pyarrow]", marks=requires_pyarrow), pytest.param("pyarrow", marks=requires_pyarrow), ], @@ -302,7 +301,7 @@ def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None def test_missing_dependency_single( backend: _Backend, monkeypatch: pytest.MonkeyPatch ) -> None: - if backend in {"polars[pyarrow]", "pandas[pyarrow]"}: + if backend == "pandas[pyarrow]": pytest.skip("Testing single dependency backends only") monkeypatch.setitem(sys.modules, backend, None) @@ -317,7 +316,7 @@ def test_missing_dependency_single( Loader.from_backend(backend) -@pytest.mark.parametrize("backend", ["polars[pyarrow]", "pandas[pyarrow]"]) +@pytest.mark.parametrize("backend", ["pandas[pyarrow]"]) @skip_requires_pyarrow def test_missing_dependency_multi( backend: _Backend, monkeypatch: pytest.MonkeyPatch From 11da9c8f584e466a02a021ef8e93b895145fb333 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 21 Nov 2024 12:41:10 +0000 Subject: [PATCH 119/201] test: Add a complex `xfail` for `test_load_call` Doesn't happen in CI, still unclear why the import within `pandas` breaks under these conditions. 
Have tried multiple combinations of `pytest.MonkeyPatch`, hard imports, but had no luck in fixing the bug --- tests/test_datasets.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index e31f7990e..50ece0a26 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -86,6 +86,19 @@ class DatasetSpec(TypedDict, total=False): """ +@pytest.fixture +def is_flaky_datasets(request: pytest.FixtureRequest) -> bool: + mark_filter = request.config.getoption("-m", None) # pyright: ignore[reportArgumentType] + if mark_filter is None: + return False + elif mark_filter == "": + return True + elif isinstance(mark_filter, str): + return False + else: + raise TypeError(mark_filter) + + @pytest.fixture(scope="session") def polars_loader( tmp_path_factory: pytest.TempPathFactory, @@ -184,6 +197,20 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: from altair.datasets import load +# HACK: Using a fixture to get a command line option +# https://docs.pytest.org/en/stable/example/simple.html#pass-different-values-to-a-test-function-depending-on-command-line-options +@pytest.mark.xfail( + is_flaky_datasets, # type: ignore + reason=( + "'pandas[pyarrow]' seems to break locally when running:\n" + ">>> pytest -p no:randomly -n logical tests -k test_datasets -m ''\n\n" + "Possibly related:\n" + " https://github.com/modin-project/modin/issues/951\n" + " https://github.com/pandas-dev/pandas/blob/1c986d6213904fd7d9acc5622dc91d029d3f1218/pandas/io/parquet.py#L164\n" + " https://github.com/pandas-dev/pandas/blob/1c986d6213904fd7d9acc5622dc91d029d3f1218/pandas/io/parquet.py#L257\n" + ), + raises=AttributeError, +) @requires_pyarrow def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: import altair.datasets._loader From 694ada0ad496ecd0e07f49ff97e0c5c0753a6085 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:46:48 +0000 Subject: [PATCH 120/201] refactor: Renaming/recomposing `_readers.py` The next commits benefit from having functionality decoupled from `_Reader.query`. Mainly, keeping things lazy and not raising a user-facing error --- altair/datasets/_readers.py | 68 +++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index f7b8aecf5..2c8d53820 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -69,6 +69,13 @@ _ExtensionScan: TypeAlias = Literal[".parquet"] _T = TypeVar("_T") + # NOTE: Using a constrained instead of bound `TypeVar` + # error: Incompatible return value type (got "DataFrame[Any] | LazyFrame[Any]", expected "FrameT") [return-value] + # - https://typing.readthedocs.io/en/latest/spec/generics.html#introduction + # - https://typing.readthedocs.io/en/latest/spec/generics.html#type-variables-with-an-upper-bound + # https://github.com/narwhals-dev/narwhals/blob/21b8436567de3631c584ef67632317ad70ae5de0/narwhals/typing.py#L59 + FrameT = TypeVar("FrameT", nw.DataFrame[Any], nw.LazyFrame) + _Polars: TypeAlias = Literal["polars"] _Pandas: TypeAlias = Literal["pandas"] _PyArrow: TypeAlias = Literal["pyarrow"] @@ -111,7 +118,7 @@ class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): Used exclusively for ``metadata.parquet``. - Currently ``polars`` backends are the only lazy options. + Currently ``"polars"`` is the only lazy option. 
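As an aside on the constrained ``TypeVar`` added above in this diff, the two flavours behave roughly as follows; this is an illustrative sketch, not part of the patch::

    from typing import Any, TypeVar

    import narwhals.stable.v1 as nw

    # constrained: FrameT resolves to exactly one of the listed types, so a
    # function returning the same kind of frame it was given type-checks cleanly
    FrameT = TypeVar("FrameT", nw.DataFrame[Any], nw.LazyFrame)

    # bound: the return expression is typed as the whole union, which is what
    # triggers the "DataFrame[Any] | LazyFrame[Any]" error quoted in the comment
    FrameBoundT = TypeVar("FrameBoundT", bound="nw.DataFrame[Any] | nw.LazyFrame")
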
""" _name: LiteralString @@ -125,12 +132,10 @@ class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: - suffix = validate_suffix(source, is_ext_read) - return self._read_fn[suffix] + return self._read_fn[_extract_suffix(source, is_ext_read)] def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: - suffix = validate_suffix(source, is_ext_scan) - return self._scan_fn[suffix] + return self._scan_fn[_extract_suffix(source, is_ext_scan)] def dataset( self, @@ -140,7 +145,7 @@ def dataset( tag: Version | None = None, **kwds: Any, ) -> IntoDataFrameT: - df = self.query(**validate_constraints(name, suffix, tag)) + df = self.query(**_extract_constraints(name, suffix, tag)) it = islice(df.iter_rows(named=True), 1) result = cast("Metadata", next(it)) url = result["url_npm"] @@ -166,7 +171,7 @@ def url( /, tag: Version | None = None, ) -> str: - frame = self.query(**validate_constraints(name, suffix, tag)) + frame = self.query(**_extract_constraints(name, suffix, tag)) url = nw.to_py_scalar(frame.item(0, "url_npm")) if isinstance(url, str): return url @@ -180,6 +185,8 @@ def query( """ Query multi-version trees metadata. + Applies a filter, erroring out when no results would be returned. + Notes ----- Arguments correspond to those seen in `pl.LazyFrame.filter`_. @@ -187,12 +194,7 @@ def query( .. _pl.LazyFrame.filter: https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html """ - frame = ( - nw.from_native(self.scan_fn(_METADATA)(_METADATA)) - .filter(_parse_predicates_constraints(predicates, constraints)) - .lazy() - .collect() - ) + frame = self._scan_metadata(*predicates, **constraints).collect() if not frame.is_empty(): return frame else: @@ -200,18 +202,13 @@ def query( msg = f"Found no results for:\n {terms}" raise ValueError(msg) - def _read_metadata(self) -> IntoDataFrameT: - """ - Return the full contents of ``metadata.parquet``. - - Effectively an eager read, no filters. - """ - return ( - nw.from_native(self.scan_fn(_METADATA)(_METADATA)) - .lazy() - .collect() - .to_native() - ) + def _scan_metadata( + self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] + ) -> nw.LazyFrame: + frame = nw.from_native(self.scan_fn(_METADATA)(_METADATA)).lazy() + if predicates or constraints: + return _filter(frame, *predicates, **constraints) + return frame @property def _cache(self) -> Path | None: # type: ignore[return] @@ -406,24 +403,30 @@ def pa_read_json(source: Any, /, **kwds) -> pa.Table: self._scan_fn = {".parquet": pa_read_parquet} -def _parse_predicates_constraints( - predicates: tuple[Any, ...], constraints: Metadata, / -) -> nw.Expr: +def _filter( + frame: FrameT, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] +) -> FrameT: """ ``narwhals`` only accepts ``filter(*predicates)``. 
So we convert each item in ``**constraints`` here as:: col("column_name") == literal_value + + - https://github.com/narwhals-dev/narwhals/issues/1383 + - https://github.com/narwhals-dev/narwhals/pull/1417 """ - return nw.all_horizontal( - chain(predicates, (nw.col(name) == v for name, v in constraints.items())) + return frame.filter( + nw.all_horizontal( + *chain(predicates, (nw.col(name) == v for name, v in constraints.items())) + ) ) -def validate_constraints( +def _extract_constraints( name: Dataset | LiteralString, suffix: Extension | None, tag: Version | None, / ) -> Metadata: + """Transform args into a mapping to column names.""" constraints: Metadata = {} if tag is not None: constraints["tag"] = tag @@ -445,7 +448,7 @@ def validate_constraints( return constraints -def validate_suffix(source: StrPath, guard: Callable[..., TypeIs[_T]], /) -> _T: +def _extract_suffix(source: StrPath, guard: Callable[..., TypeIs[_T]], /) -> _T: suffix: Any = Path(source).suffix if guard(suffix): return suffix @@ -479,7 +482,6 @@ def infer_backend( .. _fastparquet: https://github.com/dask/fastparquet - """ it = (backend(name) for name in priority if is_available(_requirements(name))) if reader := next(it, None): From 6f41c7e5b830bff1e901ecbe1fcec862f72c4683 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:49:46 +0000 Subject: [PATCH 121/201] build: Generate `VERSION_LATEST` Simplifies logic that relies on enum/categoricals that may not be recognised as ordered --- altair/datasets/_typing.py | 10 +++++++++- tools/datasets/__init__.py | 10 ++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index cdaa57322..0b681b834 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -24,6 +24,7 @@ __all__ = [ "EXTENSION_SUFFIXES", + "VERSION_LATEST", "Dataset", "Extension", "Metadata", @@ -154,7 +155,14 @@ "v1.5.0", ] Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow", ".parquet"] -EXTENSION_SUFFIXES = (".csv", ".json", ".tsv", ".arrow", ".parquet") +VERSION_LATEST: Literal["v2.11.0"] = "v2.11.0" +EXTENSION_SUFFIXES: tuple[ + Literal[".csv"], + Literal[".json"], + Literal[".tsv"], + Literal[".arrow"], + Literal[".parquet"], +] = (".csv", ".json", ".tsv", ".arrow", ".parquet") def is_ext_read(suffix: Any) -> TypeIs[Extension]: diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 26955e9c0..1402a9c7b 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -226,9 +226,14 @@ def generate_typing(self, output: Path, /) -> None: indent = " " * 4 NAME = "Dataset" TAG = "Version" + LATEST = "VERSION_LATEST" + LATEST_TAG = f"{tags.first()!r}" EXT = "Extension" EXTENSION_TYPES = ".csv", ".json", ".tsv", ".arrow", ".parquet" EXTENSION_SUFFIXES = "EXTENSION_SUFFIXES" + EXTENSION_TYPE_TP = ( + f"tuple[{', '.join(f'Literal[{el!r}]' for el in EXTENSION_TYPES)}]" + ) EXTENSION_GUARD = "is_ext_read" METADATA_TD = "Metadata" DESCRIPTION_DEFAULT = "_description_" @@ -318,11 +323,12 @@ def generate_typing(self, output: Path, /) -> None: utils.import_typing_extensions((3, 13), "TypeIs"), utils.import_typing_extensions((3, 10), "TypeAlias"), "\n", - f"__all__ = {[NAME, TAG, EXT, METADATA_TD, EXTENSION_GUARD, EXTENSION_SUFFIXES]}\n\n" + f"__all__ = {[NAME, TAG, EXT, METADATA_TD, EXTENSION_GUARD, EXTENSION_SUFFIXES, LATEST]}\n\n" f"{NAME}: TypeAlias = {utils.spell_literal(names)}", f"{TAG}: TypeAlias = 
{utils.spell_literal(tags)}", f"{EXT}: TypeAlias = {utils.spell_literal(EXTENSION_TYPES)}", - f"{EXTENSION_SUFFIXES} = {EXTENSION_TYPES!r}", + f"{LATEST}: Literal[{LATEST_TAG}] = {LATEST_TAG}", + f"{EXTENSION_SUFFIXES}: {EXTENSION_TYPE_TP} = {EXTENSION_TYPES!r}", f"def {EXTENSION_GUARD}(suffix: Any) -> TypeIs[{EXT}]:\n" f"{indent}return suffix in set({EXTENSION_TYPES!r})\n", UNIVERSAL_TYPED_DICT.format( From 88d06a64ac8a21350314b5300fbd7142d57e13cf Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:54:16 +0000 Subject: [PATCH 122/201] feat: Adds `_cache.py` for `UrlCache`, `DatasetCache` Docs to follow --- altair/datasets/__init__.py | 2 +- altair/datasets/_cache.py | 226 ++++++++++++++++++++++++++++++++++++ altair/datasets/_loader.py | 110 ++---------------- altair/datasets/_readers.py | 21 +--- tests/test_datasets.py | 75 +++++++++--- 5 files changed, 304 insertions(+), 130 deletions(-) create mode 100644 altair/datasets/_cache.py diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index e426ca467..70d01eacc 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -85,7 +85,7 @@ def url( url = load.url(name, suffix, tag=tag) except AltairDatasetsError: - from altair.datasets._loader import url_cache + from altair.datasets._cache import url_cache url = url_cache[name] diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py new file mode 100644 index 000000000..9239911fd --- /dev/null +++ b/altair/datasets/_cache.py @@ -0,0 +1,226 @@ +from __future__ import annotations + +import os +from pathlib import Path +from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, get_args + +import narwhals.stable.v1 as nw +from narwhals.dependencies import get_pyarrow +from narwhals.typing import IntoDataFrameT, IntoFrameT + +from altair.datasets._typing import VERSION_LATEST + +if TYPE_CHECKING: + import sys + from collections.abc import Iterator, MutableMapping + from typing import Any, Final + + from _typeshed import StrPath + + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + from altair.datasets._readers import _Reader + from altair.datasets._typing import Dataset + +__all__ = ["DatasetCache", "UrlCache", "url_cache"] + + +_KT = TypeVar("_KT") +_VT = TypeVar("_VT") +_T = TypeVar("_T") + +_URL: Final[Path] = Path(__file__).parent / "_metadata" / "url.csv.gz" + + +class UrlCache(Generic[_KT, _VT]): + """ + `csv`_, `gzip`_ -based, lazy url lookup. + + Operates on a subset of available datasets: + - Only the latest version + - Excludes `.parquet`, which `cannot be read via url`_ + - Name collisions are pre-resolved + - Only provide the smallest (e.g. ``weather.json`` instead of ``weather.csv``) + + .. _csv: + https://docs.python.org/3/library/csv.html + .. _gzip: + https://docs.python.org/3/library/gzip.html + .. 
_cannot be read via url: + https://github.com/vega/vega/issues/3961 + """ + + def __init__( + self, + fp: Path, + /, + *, + columns: tuple[str, str] = ("dataset_name", "url_npm"), + tp: type[MutableMapping[_KT, _VT]] = dict["_KT", "_VT"], + ) -> None: + self.fp: Path = fp + self.columns: tuple[str, str] = columns + self._mapping: MutableMapping[_KT, _VT] = tp() + + def read(self) -> Any: + import csv + import gzip + + with gzip.open(self.fp, mode="rb") as f: + b_lines = f.readlines() + reader = csv.reader((bs.decode() for bs in b_lines), dialect=csv.unix_dialect) + header = tuple(next(reader)) + if header != self.columns: + msg = f"Expected header to match {self.columns!r},\n" f"but got: {header!r}" + raise ValueError(msg) + return dict(reader) + + def __getitem__(self, key: _KT, /) -> _VT: + if url := self.get(key, None): + return url + + from altair.datasets._typing import Dataset + + if key in get_args(Dataset): + msg = f"{key!r} cannot be loaded via url." + raise TypeError(msg) + else: + msg = f"{key!r} does not refer to a known dataset." + raise TypeError(msg) + + def get(self, key: _KT, default: _T) -> _VT | _T: + if not self._mapping: + self._mapping.update(self.read()) + return self._mapping.get(key, default) + + +class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): + _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" + + def __init__(self, reader: _Reader[IntoDataFrameT, IntoFrameT], /) -> None: + self._rd: _Reader[IntoDataFrameT, IntoFrameT] = reader + + def download_all(self) -> None: + """ + Download any missing datasets for latest version. + + ``v2.11.0`` stats + ----------------- + - **66** items + - **27.8** MB + - Only 1 file > 2 MB + """ + stems = tuple(fp.stem for fp in self) + latest = nw.col("tag") == nw.lit(VERSION_LATEST) + predicates = (~(nw.col("sha").is_in(stems)), latest) if stems else (latest,) + frame = ( + self._rd._scan_metadata( + *predicates, ext_supported=True, name_collision=False + ) + .select("sha", "suffix", "url_npm") + .unique("sha") + .collect() + ) + if frame.is_empty(): + print("Already downloaded all datasets") + return None + print(f"Downloading {len(frame)} missing datasets...") + for row in frame.iter_rows(named=True): + fp: Path = self.path / (row["sha"] + row["suffix"]) + with self._rd._opener.open(row["url_npm"]) as f: + fp.touch() + fp.write_bytes(f.read()) + print("Finished downloads") + return None + + def clear(self) -> None: + # unlink all matching sha + # stricter than `__iter__` + # - to avoid deleting unrelated files in dir + self.ensure_active() + if self.is_empty(): + return None + ser = ( + self._rd._scan_metadata() + .select("sha", "suffix") + .unique("sha") + .select(nw.concat_str("sha", "suffix").alias("sha_suffix")) + .collect() + .get_column("sha_suffix") + ) + names = set[str]( + ser.to_list() if nw.get_native_namespace(ser) is get_pyarrow() else ser + ) + for fp in self: + if fp.name in names: + fp.unlink() + + def __iter__(self) -> Iterator[Path]: + yield from self.path.iterdir() + + def __repr__(self): + name = type(self).__name__ + if self.is_not_active(): + return f"{name}" + else: + return f"{name}<{self.path.as_posix()!r}>" + + def is_active(self) -> bool: + return not self.is_not_active() + + def is_not_active(self) -> bool: + return os.environ.get(self._ENV_VAR) is None + + def is_empty(self) -> bool: + """Cache is active, but no files in the directory.""" + return next(iter(self), None) is None + + def ensure_active(self) -> None: + # Fail fast when the cache op is later + # Otherwise, just get the error 
from `self.path` + if self.is_not_active(): + msg = ( + f"Cache is unset.\n" + f"To enable dataset caching, set the environment variable:\n" + f" {self._ENV_VAR!r}\n\n" + f"You can set this for the current session via:\n" + f" from pathlib import Path\n" + f" from altair.datasets import load\n\n" + f" load.cache.path = Path.home() / '.altair_cache'" + ) + raise ValueError(msg) + + @property + def path(self) -> Path: + """ + Returns path to datasets cache. + + By default, this can be configured using the environment variable: + + "ALTAIR_DATASETS_DIR" + + You can set this for the current session via: + + >>> from pathlib import Path + >>> from altair.datasets import load + >>> load.cache.path = Path.home() / ".altair_cache" + + >>> load.cache.path.relative_to(Path.home()).as_posix() + '.altair_cache' + """ + self.ensure_active() + fp = Path(os.environ[self._ENV_VAR]) + fp.mkdir(exist_ok=True) + return fp + + @path.setter + def path(self, source: StrPath | None, /) -> None: + if source is not None: + os.environ[self._ENV_VAR] = str(Path(source).resolve()) + else: + os.environ.pop(self._ENV_VAR, None) + + +url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 3e31aea2e..ac56aa892 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -1,7 +1,6 @@ from __future__ import annotations -from pathlib import Path -from typing import TYPE_CHECKING, Generic, TypeVar, final, get_args, overload +from typing import TYPE_CHECKING, Generic, final, overload from narwhals.typing import IntoDataFrameT, IntoFrameT @@ -9,13 +8,13 @@ if TYPE_CHECKING: import sys - from collections.abc import MutableMapping - from typing import Any, Final, Literal + from typing import Any, Literal import pandas as pd import polars as pl import pyarrow as pa - from _typeshed import StrPath + + from altair.datasets._cache import DatasetCache if sys.version_info >= (3, 11): from typing import LiteralString @@ -27,12 +26,6 @@ __all__ = ["Loader", "load"] -_KT = TypeVar("_KT") -_VT = TypeVar("_VT") -_T = TypeVar("_T") - -_URL: Final[Path] = Path(__file__).parent / "_metadata" / "url.csv.gz" - class Loader(Generic[IntoDataFrameT, IntoFrameT]): """ @@ -294,34 +287,18 @@ def url( """ return self._reader.url(name, suffix, tag=tag) + # TODO: Examples for tasklist @property - def cache_dir(self) -> Path | None: + def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: """ - Returns path to datasets cache. - - By default, this can be configured using the environment variable: - - "ALTAIR_DATASETS_DIR" - - You *may* also set this directly, but the value will **not** persist between sessions: - - from pathlib import Path - - from altair.datasets import Loader - - data = Loader.from_backend("polars") - data.cache_dir = Path.home() / ".altair_cache" + Dataset caching. 
- >>> data.cache_dir.relative_to(Path.home()).as_posix() # doctest: +SKIP - '.altair_cache' + - [x] Enable via 2 examples + - [ ] Disable after enabling (self.cache.path = None) + - [ ] Pre-download missing + - [ ] Clear entire cache """ - return self._reader._cache - - @cache_dir.setter - def cache_dir(self, source: StrPath, /) -> None: - import os - - os.environ[self._reader._ENV_VAR] = str(source) + return self._reader.cache def __repr__(self) -> str: return f"{type(self).__name__}[{self._reader._name}]" @@ -384,69 +361,6 @@ def __call__( return self.from_backend(backend)(name, suffix, tag=tag, **kwds) -class UrlCache(Generic[_KT, _VT]): - """ - `csv`_, `gzip`_ -based, lazy url lookup. - - Operates on a subset of available datasets: - - Only the latest version - - Excludes `.parquet`, which `cannot be read via url`_ - - Name collisions are pre-resolved - - Only provide the smallest (e.g. ``weather.json`` instead of ``weather.csv``) - - .. _csv: - https://docs.python.org/3/library/csv.html - .. _gzip: - https://docs.python.org/3/library/gzip.html - .. _cannot be read via url: - https://github.com/vega/vega/issues/3961 - """ - - def __init__( - self, - fp: Path, - /, - *, - columns: tuple[str, str] = ("dataset_name", "url_npm"), - tp: type[MutableMapping[_KT, _VT]] = dict["_KT", "_VT"], - ) -> None: - self.fp: Path = fp - self.columns: tuple[str, str] = columns - self._mapping: MutableMapping[_KT, _VT] = tp() - - def read(self) -> Any: - import csv - import gzip - - with gzip.open(self.fp, mode="rb") as f: - b_lines = f.readlines() - reader = csv.reader((bs.decode() for bs in b_lines), dialect=csv.unix_dialect) - header = tuple(next(reader)) - if header != self.columns: - msg = f"Expected header to match {self.columns!r},\n" f"but got: {header!r}" - raise ValueError(msg) - return dict(reader) - - def __getitem__(self, key: _KT, /) -> _VT: - if url := self.get(key, None): - return url - - from altair.datasets._typing import Dataset - - if key in get_args(Dataset): - msg = f"{key!r} cannot be loaded via url." - raise TypeError(msg) - else: - msg = f"{key!r} does not refer to a known dataset." - raise TypeError(msg) - - def get(self, key: _KT, default: _T) -> _VT | _T: - if not self._mapping: - self._mapping.update(self.read()) - return self._mapping.get(key, default) - - -url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL) load: _Load[Any, Any] diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 2c8d53820..e7c97b9d1 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -9,7 +9,6 @@ from __future__ import annotations -import os import urllib.request from collections.abc import Iterable, Mapping, Sequence from functools import partial @@ -33,6 +32,7 @@ import narwhals.stable.v1 as nw from narwhals.typing import IntoDataFrameT, IntoExpr, IntoFrameT +from altair.datasets._cache import DatasetCache from altair.datasets._typing import EXTENSION_SUFFIXES, is_ext_read if TYPE_CHECKING: @@ -128,7 +128,6 @@ class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): Otherwise, has no concrete meaning. 
""" - _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: @@ -151,8 +150,8 @@ def dataset( url = result["url_npm"] fn = self.read_fn(url) - if cache := self._cache: - fp = cache / (result["sha"] + result["suffix"]) + if self.cache.is_active(): + fp = self.cache.path / (result["sha"] + result["suffix"]) if fp.exists() and fp.stat().st_size: return fn(fp, **kwds) else: @@ -211,18 +210,8 @@ def _scan_metadata( return frame @property - def _cache(self) -> Path | None: # type: ignore[return] - """ - Returns path to datasets cache, if possible. - - Requires opt-in via environment variable:: - - Reader._ENV_VAR - """ - if _dir := os.environ.get(self._ENV_VAR): - cache_dir = Path(_dir) - cache_dir.mkdir(exist_ok=True) - return cache_dir + def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: + return DatasetCache(self) def _import(self, name: str, /) -> Any: if spec := find_spec(name): diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 50ece0a26..1d0990abf 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -7,6 +7,7 @@ from functools import partial from importlib import import_module from importlib.util import find_spec +from pathlib import Path from typing import TYPE_CHECKING, Any, cast, get_args from urllib.error import URLError @@ -21,7 +22,7 @@ from altair.datasets import Loader, url from altair.datasets._readers import _METADATA, AltairDatasetsError -from altair.datasets._typing import Dataset, Extension, Metadata, Version +from altair.datasets._typing import Dataset, Extension, Metadata, Version, is_ext_read from tests import skip_requires_pyarrow, slow if sys.version_info >= (3, 14): @@ -104,7 +105,7 @@ def polars_loader( tmp_path_factory: pytest.TempPathFactory, ) -> Loader[pl.DataFrame, pl.LazyFrame]: data = Loader.from_backend("polars") - data.cache_dir = tmp_path_factory.mktemp("loader-cache-polars") + data.cache.path = tmp_path_factory.mktemp("loader-cache-polars") return data @@ -273,7 +274,7 @@ def test_url(name: Dataset) -> None: def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: import altair.datasets - from altair.datasets._loader import url_cache + from altair.datasets._cache import url_cache monkeypatch.setitem(sys.modules, "polars", None) monkeypatch.setitem(sys.modules, "pandas", None) @@ -477,11 +478,11 @@ def test_reader_cache( monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) data = Loader.from_backend(backend) - cache_dir = data.cache_dir - assert cache_dir is not None + assert data.cache.is_active() + cache_dir = data.cache.path assert cache_dir == tmp_path - assert tuple(cache_dir.iterdir()) == () + assert tuple(data.cache) == () # smallest csvs lookup_groups = data("lookup_groups", tag="v2.5.3") @@ -489,7 +490,7 @@ def test_reader_cache( data("iowa-electricity", tag="v2.3.1") data("global-temp", tag="v2.9.0") - cached_paths = tuple(cache_dir.iterdir()) + cached_paths = tuple(data.cache) assert len(cached_paths) == 4 if is_polars_dataframe(lookup_groups): @@ -504,15 +505,15 @@ def test_reader_cache( ) assert_frame_equal(left, right) - assert len(tuple(cache_dir.iterdir())) == 4 - assert cached_paths == tuple(cache_dir.iterdir()) + assert len(tuple(data.cache)) == 4 + assert cached_paths == tuple(data.cache) data("iowa-electricity", tag="v1.30.2") data("global-temp", tag="v2.8.1") data("global-temp", tag="v2.8.0") - assert len(tuple(cache_dir.iterdir())) == 4 - assert 
cached_paths == tuple(cache_dir.iterdir()) + assert len(tuple(data.cache)) == 4 + assert cached_paths == tuple(data.cache) data("lookup_people", tag="v1.10.0") data("lookup_people", tag="v1.11.0") @@ -522,8 +523,52 @@ def test_reader_cache( data("lookup_people", tag="v2.3.0") data("lookup_people", tag="v2.5.0-next.0") - assert len(tuple(cache_dir.iterdir())) == 4 - assert cached_paths == tuple(cache_dir.iterdir()) + assert len(tuple(data.cache)) == 4 + assert cached_paths == tuple(data.cache) + + +@slow +@datasets_debug +@backends +def test_reader_cache_exhaustive( + backend: _Backend, monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + """ + Fully populate and then purge the cache for all backends. + + - Does not attempt to read the files + - Checking we can support pre-downloading and safely deleting + """ + monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) + data = Loader.from_backend(backend) + assert data.cache.is_active() + cache_dir = data.cache.path + assert cache_dir == tmp_path + assert tuple(data.cache) == () + + data.cache.download_all() + cached_paths = tuple(data.cache) + assert cached_paths != () + + # NOTE: Approximating all datasets downloaded + assert len(cached_paths) >= 40 + assert all( + bool(fp.exists() and is_ext_read(fp.suffix) and fp.stat().st_size) + for fp in data.cache + ) + # NOTE: Confirm this is a no-op + data.cache.download_all() + assert len(cached_paths) == len(tuple(data.cache)) + + # NOTE: Ensure unrelated files in the directory are not removed + dummy: Path = tmp_path / "dummy.json" + dummy.touch(exist_ok=False) + data.cache.clear() + + remaining = tuple(tmp_path.iterdir()) + assert len(remaining) == 1 + assert remaining[0] == dummy + dummy.unlink() movies_fail: ParameterSet = pytest.param( @@ -559,7 +604,7 @@ def test_reader_cache( def test_pyarrow_read_json( fallback: _Polars | None, name: Dataset, monkeypatch: pytest.MonkeyPatch ) -> None: - monkeypatch.setenv(CACHE_ENV_VAR, "") + monkeypatch.delenv(CACHE_ENV_VAR, raising=False) monkeypatch.delitem(sys.modules, "pandas", raising=False) if fallback is None: monkeypatch.setitem(sys.modules, "polars", None) @@ -630,7 +675,7 @@ def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - from polars.testing import assert_frame_equal data = Loader.from_backend("polars") - data.cache_dir = tmp_path + data.cache.path = tmp_path data("londonCentroids") data("stocks") From f21b52b6c932c517383de02087f75228af0f7a28 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 22 Nov 2024 17:59:09 +0000 Subject: [PATCH 123/201] ci(ruff): Ignore `0.8.0` violations https://github.com/vega/altair/discussions/3687#discussioncomment-11351453 --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c43e00504..e398dfb6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -378,7 +378,9 @@ ignore = [ # doc-line-too-long "W505", # Any as annotation - "ANN401" + "ANN401", + # 0.8.0 + "RUF039", "RUF200" ] # https://docs.astral.sh/ruff/settings/#lintpydocstyle pydocstyle={ convention="numpy" } From e7974d90c78a38c06d7e19aeeb54e32179948022 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 22 Nov 2024 19:53:01 +0000 Subject: [PATCH 124/201] fix: Use stable `narwhals` imports https://github.com/narwhals-dev/narwhals/issues/1426, https://github.com/vega/altair/pull/3693#discussion_r1854513083 --- altair/datasets/_cache.py | 8 +++++--- 
altair/datasets/_loader.py | 2 +- altair/datasets/_readers.py | 2 +- tests/test_datasets.py | 25 ++++++++++--------------- 4 files changed, 17 insertions(+), 20 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 9239911fd..0166c50e8 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -5,8 +5,8 @@ from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, get_args import narwhals.stable.v1 as nw -from narwhals.dependencies import get_pyarrow -from narwhals.typing import IntoDataFrameT, IntoFrameT +from narwhals.stable.v1 import dependencies as nw_dep +from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT from altair.datasets._typing import VERSION_LATEST @@ -151,7 +151,9 @@ def clear(self) -> None: .get_column("sha_suffix") ) names = set[str]( - ser.to_list() if nw.get_native_namespace(ser) is get_pyarrow() else ser + ser.to_list() + if nw.get_native_namespace(ser) is nw_dep.get_pyarrow() + else ser ) for fp in self: if fp.name in names: diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index ac56aa892..5be85e60a 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Generic, final, overload -from narwhals.typing import IntoDataFrameT, IntoFrameT +from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT from altair.datasets._readers import _Reader, backend diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index e7c97b9d1..5adcf3751 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -30,7 +30,7 @@ ) import narwhals.stable.v1 as nw -from narwhals.typing import IntoDataFrameT, IntoExpr, IntoFrameT +from narwhals.stable.v1.typing import IntoDataFrameT, IntoExpr, IntoFrameT from altair.datasets._cache import DatasetCache from altair.datasets._typing import EXTENSION_SUFFIXES, is_ext_read diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 1d0990abf..20515069b 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -12,13 +12,8 @@ from urllib.error import URLError import pytest -from narwhals.dependencies import ( - is_into_dataframe, - is_pandas_dataframe, - is_polars_dataframe, - is_pyarrow_table, -) from narwhals.stable import v1 as nw +from narwhals.stable.v1 import dependencies as nw_dep from altair.datasets import Loader, url from altair.datasets._readers import _METADATA, AltairDatasetsError @@ -227,11 +222,11 @@ def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: default_2 = load("cars") df_polars = load("cars", backend="polars") - assert is_polars_dataframe(default) - assert is_pyarrow_table(df_pyarrow) - assert is_pandas_dataframe(df_pandas) - assert is_polars_dataframe(default_2) - assert is_polars_dataframe(df_polars) + assert nw_dep.is_polars_dataframe(default) + assert nw_dep.is_pyarrow_table(df_pyarrow) + assert nw_dep.is_pandas_dataframe(df_pandas) + assert nw_dep.is_polars_dataframe(default_2) + assert nw_dep.is_polars_dataframe(df_polars) @pytest.mark.parametrize( @@ -320,7 +315,7 @@ def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None data = Loader.from_backend(backend) frame = data("stocks", ".csv") - assert is_into_dataframe(frame) + assert nw_dep.is_into_dataframe(frame) nw_frame = nw.from_native(frame) assert set(nw_frame.columns) == {"symbol", "date", "price"} @@ -493,7 +488,7 @@ def test_reader_cache( cached_paths = tuple(data.cache) assert len(cached_paths) == 4 - if 
is_polars_dataframe(lookup_groups): + if nw_dep.is_polars_dataframe(lookup_groups): left, right = ( lookup_groups, cast(pl.DataFrame, data("lookup_groups", tag="v2.5.3")), @@ -664,7 +659,7 @@ def test_all_datasets( ) -> None: """Ensure all annotated datasets can be loaded with the most reliable backend.""" frame = polars_loader(name, suffix, tag=tag) - assert is_polars_dataframe(frame) + assert nw_dep.is_polars_dataframe(frame) def _raise_exception(e: type[Exception], *args: Any, **kwds: Any): @@ -698,7 +693,7 @@ def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - # Now we can get a cache-hit frame = data("birdstrikes") - assert is_polars_dataframe(frame) + assert nw_dep.is_polars_dataframe(frame) assert len(tuple(tmp_path.iterdir())) == 4 with monkeypatch.context() as mp: From c907dc500504cdff8e2342f488fb679cd2108975 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 24 Nov 2024 13:52:44 +0000 Subject: [PATCH 125/201] revert(ruff): Ignore `0.8.0` violations f21b52b6c932c517383de02087f75228af0f7a28 --- pyproject.toml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a44b4459e..c353b9b9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -377,9 +377,7 @@ ignore = [ # doc-line-too-long "W505", # Any as annotation - "ANN401", - # 0.8.0 - "RUF039", "RUF200" + "ANN401" ] # https://docs.astral.sh/ruff/settings/#lintpydocstyle pydocstyle={ convention="numpy" } From a3b38c49836c850681c41c797865351bddfccbb7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 24 Nov 2024 13:58:53 +0000 Subject: [PATCH 126/201] revert: Remove `_readers._filter` Feature has been adopted upstream in https://github.com/narwhals-dev/narwhals/pull/1417 --- altair/datasets/_readers.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 5adcf3751..354a45532 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -206,7 +206,7 @@ def _scan_metadata( ) -> nw.LazyFrame: frame = nw.from_native(self.scan_fn(_METADATA)(_METADATA)).lazy() if predicates or constraints: - return _filter(frame, *predicates, **constraints) + return frame.filter(*predicates, **constraints) return frame @property @@ -392,26 +392,6 @@ def pa_read_json(source: Any, /, **kwds) -> pa.Table: self._scan_fn = {".parquet": pa_read_parquet} -def _filter( - frame: FrameT, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] -) -> FrameT: - """ - ``narwhals`` only accepts ``filter(*predicates)``. 
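Since the keyword-constraint form has been adopted upstream (the ``narwhals`` PR linked above), the helper being deleted here is no longer needed and the call site reduces to a plain ``filter``; a minimal sketch, assuming a polars-backed frame::

    import narwhals.stable.v1 as nw
    import polars as pl

    lf = nw.from_native(
        pl.LazyFrame({"dataset_name": ["cars", "stocks"], "suffix": [".json", ".csv"]})
    )

    # keyword constraints are shorthand for nw.col("dataset_name") == "cars"
    lf.filter(dataset_name="cars").collect()
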
- - So we convert each item in ``**constraints`` here as:: - - col("column_name") == literal_value - - - https://github.com/narwhals-dev/narwhals/issues/1383 - - https://github.com/narwhals-dev/narwhals/pull/1417 - """ - return frame.filter( - nw.all_horizontal( - *chain(predicates, (nw.col(name) == v for name, v in constraints.items())) - ) - ) - - def _extract_constraints( name: Dataset | LiteralString, suffix: Extension | None, tag: Version | None, / ) -> Metadata: From a6c5096ddab82fd4682006f90158b71b0f3aa479 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 24 Nov 2024 14:43:11 +0000 Subject: [PATCH 127/201] feat: Adds example and tests for disabling caching --- altair/datasets/_cache.py | 4 ++++ altair/datasets/_loader.py | 2 +- tests/test_datasets.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 0166c50e8..f801a26d1 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -211,6 +211,10 @@ def path(self) -> Path: >>> load.cache.path.relative_to(Path.home()).as_posix() '.altair_cache' + + You can *later* disable caching via: + + >>> load.cache.path = None """ self.ensure_active() fp = Path(os.environ[self._ENV_VAR]) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 5be85e60a..111af950b 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -294,7 +294,7 @@ def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: Dataset caching. - [x] Enable via 2 examples - - [ ] Disable after enabling (self.cache.path = None) + - [x] Disable after enabling (self.cache.path = None) - [ ] Pre-download missing - [ ] Clear entire cache """ diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 20515069b..5d2b93c2d 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -566,6 +566,36 @@ def test_reader_cache_exhaustive( dummy.unlink() +def test_reader_cache_disable(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + from altair.datasets import load + + monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) + + assert load.cache.is_active() + assert load.cache.path == tmp_path + assert load.cache.is_empty() + load("cars") + assert not load.cache.is_empty() + + # RELATED: https://github.com/python/mypy/issues/3004 + load.cache.path = None # type: ignore[assignment] + + assert load.cache.is_not_active() + with pytest.raises( + ValueError, + match=re.compile( + rf"Cache.+unset.+{CACHE_ENV_VAR}.+\.cache\.path =", flags=re.DOTALL + ), + ): + tuple(load.cache) + + load.cache.path = tmp_path + + assert load.cache.is_active() + assert load.cache.path == tmp_path + assert not load.cache.is_empty() + + movies_fail: ParameterSet = pytest.param( "movies", marks=pytest.mark.xfail( From 71423eadfe63a767c2b591f743b3a36272d59c7d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 24 Nov 2024 15:11:41 +0000 Subject: [PATCH 128/201] refactor: Tidy up `DatasetCache` --- altair/datasets/_cache.py | 124 ++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 67 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index f801a26d1..f9e3c683a 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, get_args import narwhals.stable.v1 as nw -from narwhals.stable.v1 import dependencies as 
nw_dep +from narwhals.stable.v1.dependencies import get_pyarrow from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT from altair.datasets._typing import VERSION_LATEST @@ -102,22 +102,38 @@ class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): def __init__(self, reader: _Reader[IntoDataFrameT, IntoFrameT], /) -> None: self._rd: _Reader[IntoDataFrameT, IntoFrameT] = reader + def clear(self) -> None: + """Delete all previously cached datasets.""" + self._ensure_active() + if self.is_empty(): + return None + ser = ( + self._rd._scan_metadata() + .select("sha", "suffix") + .unique("sha") + .select(nw.concat_str("sha", "suffix").alias("sha_suffix")) + .collect() + .get_column("sha_suffix") + ) + names = set[str]( + ser.to_list() if nw.get_native_namespace(ser) is get_pyarrow() else ser + ) + for fp in self: + if fp.name in names: + fp.unlink() + def download_all(self) -> None: """ Download any missing datasets for latest version. - ``v2.11.0`` stats - ----------------- - - **66** items - - **27.8** MB - - Only 1 file > 2 MB + Requires **30-50MB** of disk-space. """ stems = tuple(fp.stem for fp in self) latest = nw.col("tag") == nw.lit(VERSION_LATEST) predicates = (~(nw.col("sha").is_in(stems)), latest) if stems else (latest,) frame = ( self._rd._scan_metadata( - *predicates, ext_supported=True, name_collision=False + predicates, ext_supported=True, name_collision=False ) .select("sha", "suffix", "url_npm") .unique("sha") @@ -135,65 +151,6 @@ def download_all(self) -> None: print("Finished downloads") return None - def clear(self) -> None: - # unlink all matching sha - # stricter than `__iter__` - # - to avoid deleting unrelated files in dir - self.ensure_active() - if self.is_empty(): - return None - ser = ( - self._rd._scan_metadata() - .select("sha", "suffix") - .unique("sha") - .select(nw.concat_str("sha", "suffix").alias("sha_suffix")) - .collect() - .get_column("sha_suffix") - ) - names = set[str]( - ser.to_list() - if nw.get_native_namespace(ser) is nw_dep.get_pyarrow() - else ser - ) - for fp in self: - if fp.name in names: - fp.unlink() - - def __iter__(self) -> Iterator[Path]: - yield from self.path.iterdir() - - def __repr__(self): - name = type(self).__name__ - if self.is_not_active(): - return f"{name}" - else: - return f"{name}<{self.path.as_posix()!r}>" - - def is_active(self) -> bool: - return not self.is_not_active() - - def is_not_active(self) -> bool: - return os.environ.get(self._ENV_VAR) is None - - def is_empty(self) -> bool: - """Cache is active, but no files in the directory.""" - return next(iter(self), None) is None - - def ensure_active(self) -> None: - # Fail fast when the cache op is later - # Otherwise, just get the error from `self.path` - if self.is_not_active(): - msg = ( - f"Cache is unset.\n" - f"To enable dataset caching, set the environment variable:\n" - f" {self._ENV_VAR!r}\n\n" - f"You can set this for the current session via:\n" - f" from pathlib import Path\n" - f" from altair.datasets import load\n\n" - f" load.cache.path = Path.home() / '.altair_cache'" - ) - raise ValueError(msg) - @property def path(self) -> Path: """ @@ -216,7 +173,7 @@ def path(self) -> Path: >>> load.cache.path = None """ - self.ensure_active() + self._ensure_active() fp = Path(os.environ[self._ENV_VAR]) fp.mkdir(exist_ok=True) return fp @@ -228,5 +185,38 @@ def path(self, source: StrPath | None, /) -> None: else: os.environ.pop(self._ENV_VAR, None) + def __iter__(self) -> Iterator[Path]: + yield from self.path.iterdir() + + def __repr__(self) -> str: + name = 
type(self).__name__ + if self.is_not_active(): + return f"{name}" + else: + return f"{name}<{self.path.as_posix()!r}>" + + def is_active(self) -> bool: + return not self.is_not_active() + + def is_not_active(self) -> bool: + return os.environ.get(self._ENV_VAR) is None + + def is_empty(self) -> bool: + """Cache is active, but no files are stored in ``self.path``.""" + return next(iter(self), None) is None + + def _ensure_active(self) -> None: + if self.is_not_active(): + msg = ( + f"Cache is unset.\n" + f"To enable dataset caching, set the environment variable:\n" + f" {self._ENV_VAR!r}\n\n" + f"You can set this for the current session via:\n" + f" from pathlib import Path\n" + f" from altair.datasets import load\n\n" + f" load.cache.path = Path.home() / '.altair_cache'" + ) + raise ValueError(msg) + url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL) From 7dd9c18a6eef4c15baa91540ef887c30e38bff04 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 24 Nov 2024 15:25:13 +0000 Subject: [PATCH 129/201] docs: Finish `Loader.cache` Not using doctest style here, none of these return anything but I want them hinted at --- altair/datasets/_cache.py | 2 ++ altair/datasets/_loader.py | 18 ++++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index f9e3c683a..ce058c561 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -97,6 +97,8 @@ def get(self, key: _KT, default: _T) -> _VT | _T: class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): + """Optional caching of remote dataset requests.""" + _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" def __init__(self, reader: _Reader[IntoDataFrameT, IntoFrameT], /) -> None: diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 111af950b..ce2559aed 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -287,16 +287,22 @@ def url( """ return self._reader.url(name, suffix, tag=tag) - # TODO: Examples for tasklist @property def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: """ - Dataset caching. + Optional caching of remote dataset requests. - - [x] Enable via 2 examples - - [x] Disable after enabling (self.cache.path = None) - - [ ] Pre-download missing - - [ ] Clear entire cache + Enable caching: + + self.cache.path = ... + + Download the latest datasets *ahead-of-time*: + + self.cache.download_all() + + Remove all downloaded datasets: + + self.cache.clear() """ return self._reader.cache From a982759715061c436ea93aea8234cd04dfca4657 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 24 Nov 2024 17:26:20 +0000 Subject: [PATCH 130/201] refactor(typing): Use `Mapping` instead of `dict` Mutability is not needed. Also see https://github.com/vega/altair/pull/3573 --- altair/datasets/_readers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 354a45532..9228c5531 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -105,14 +105,14 @@ class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): _Reader._name """ - _read_fn: dict[Extension, Callable[..., IntoDataFrameT]] + _read_fn: Mapping[Extension, Callable[..., IntoDataFrameT]] """ Eager file read functions. Each corresponds to a known file extension within ``vega-datasets``. 
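For a concrete picture of this mapping, the eager ``polars`` variant seen earlier in the series looks roughly like the sketch below; the real reader substitutes a custom JSON round-trip helper for ``pl.read_json``, so treat this as illustrative only::

    from functools import partial

    import polars as pl

    _read_fn = {
        ".csv": pl.read_csv,
        ".json": pl.read_json,
        ".tsv": partial(pl.read_csv, separator="\t"),
        ".arrow": pl.read_ipc,
        ".parquet": pl.read_parquet,
    }
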
""" - _scan_fn: dict[_ExtensionScan, Callable[..., IntoFrameT]] + _scan_fn: Mapping[_ExtensionScan, Callable[..., IntoFrameT]] """ *Optionally*-lazy file read/scan functions. From d20e9c11071898bb3f418fda22bf3f915ff949e8 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 30 Nov 2024 14:44:42 +0000 Subject: [PATCH 131/201] perf: Use `to_list()` for all backends https://github.com/narwhals-dev/narwhals/issues/1443#issuecomment-2508957161, https://github.com/narwhals-dev/narwhals/issues/1443#issuecomment-2508928135, https://github.com/narwhals-dev/narwhals/issues/1443#issuecomment-2508981618 --- altair/datasets/_cache.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index ce058c561..edca990d6 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -5,7 +5,6 @@ from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, get_args import narwhals.stable.v1 as nw -from narwhals.stable.v1.dependencies import get_pyarrow from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT from altair.datasets._typing import VERSION_LATEST @@ -117,9 +116,7 @@ def clear(self) -> None: .collect() .get_column("sha_suffix") ) - names = set[str]( - ser.to_list() if nw.get_native_namespace(ser) is get_pyarrow() else ser - ) + names = set[str](ser.to_list()) for fp in self: if fp.name in names: fp.unlink() From 909e7d05e57718b2f634a7e6781cb4e58a835837 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Dec 2024 15:38:12 +0000 Subject: [PATCH 132/201] feat(DRAFT): Utilize `datapackage` schemas in `pandas` backends Provides a generalized solution to `pd.read_(csv|json)` requiring the names of date columns to attempt parsing. 
cc @joelostblom The solution is possible in large part to https://github.com/vega/vega-datasets/pull/631 https://github.com/vega/altair/pull/3631#issuecomment-2480816377 --- altair/datasets/_cache.py | 149 ++++++++++++++++-- .../_metadata/datapackage_schemas.json.gz | Bin 0 -> 2490 bytes altair/datasets/_readers.py | 37 ++++- altair/datasets/_typing.py | 22 +++ tests/test_datasets.py | 68 +++++++- tools/datasets/__init__.py | 85 +++++++--- tools/datasets/datapackage.py | 133 ++++++++++++++++ tools/datasets/models.py | 94 ++++++++++- tools/datasets/npm.py | 54 ++++++- 9 files changed, 600 insertions(+), 42 deletions(-) create mode 100644 altair/datasets/_metadata/datapackage_schemas.json.gz create mode 100644 tools/datasets/datapackage.py diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index edca990d6..22c652bf3 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import sys from pathlib import Path from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, get_args @@ -9,19 +10,32 @@ from altair.datasets._typing import VERSION_LATEST +if sys.version_info >= (3, 12): + from typing import Protocol +else: + from typing_extensions import Protocol + if TYPE_CHECKING: - import sys - from collections.abc import Iterator, MutableMapping + from collections.abc import Iterator, Mapping, MutableMapping + from io import IOBase from typing import Any, Final from _typeshed import StrPath + from narwhals.stable.v1.dtypes import DType if sys.version_info >= (3, 11): from typing import LiteralString else: from typing_extensions import LiteralString + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias from altair.datasets._readers import _Reader - from altair.datasets._typing import Dataset + from altair.datasets._typing import Dataset, FlFieldStr + + _Dataset: TypeAlias = "Dataset | LiteralString" + _FlSchema: TypeAlias = Mapping[str, FlFieldStr] __all__ = ["DatasetCache", "UrlCache", "url_cache"] @@ -31,9 +45,62 @@ _T = TypeVar("_T") _URL: Final[Path] = Path(__file__).parent / "_metadata" / "url.csv.gz" +_SCHEMA: Final[Path] = ( + Path(__file__).parent / "_metadata" / "datapackage_schemas.json.gz" +) + +_FIELD_TO_DTYPE: Mapping[FlFieldStr, type[DType]] = { + "integer": nw.Int64, + "number": nw.Float64, + "boolean": nw.Boolean, + "string": nw.String, + "object": nw.Struct, + "array": nw.List, + "date": nw.Date, + "datetime": nw.Datetime, + # "time": nw.Time, (Not Implemented, but we don't have any cases using it anyway) + "duration": nw.Duration, +} +""" +Similar to an inverted `pl.datatypes.convert.dtype_to_ffiname`_. + +But using the string repr of ``frictionless`` `Field Types`_ to `narwhals.dtypes`_. + +.. _pl.datatypes.convert.dtype_to_ffiname: + https://github.com/pola-rs/polars/blob/85d078c066860e012f5e7e611558e6382b811b82/py-polars/polars/datatypes/convert.py#L139-L165 +.. _Field Types: + https://datapackage.org/standard/table-schema/#field-types +.. _narwhals.dtypes: + https://narwhals-dev.github.io/narwhals/api-reference/dtypes/ +""" + +_DTYPE_TO_FIELD: Mapping[type[DType], FlFieldStr] = { + v: k for k, v in _FIELD_TO_DTYPE.items() +} + + +class CompressedCache(Protocol[_KT, _VT]): + fp: Path + _mapping: MutableMapping[_KT, _VT] + + def read(self) -> Any: ... + def __getitem__(self, key: _KT, /) -> _VT: ... 
+ + def __enter__(self) -> IOBase: + import gzip + + return gzip.open(self.fp, mode="rb").__enter__() + def __exit__(self, *args) -> None: + return -class UrlCache(Generic[_KT, _VT]): + def get(self, key: _KT, default: _T, /) -> _VT | _T: + if not self._mapping: + self._mapping.update(self.read()) + return self._mapping.get(key, default) + + +class UrlCache(CompressedCache[_KT, _VT]): """ `csv`_, `gzip`_ -based, lazy url lookup. @@ -65,9 +132,8 @@ def __init__( def read(self) -> Any: import csv - import gzip - with gzip.open(self.fp, mode="rb") as f: + with self as f: b_lines = f.readlines() reader = csv.reader((bs.decode() for bs in b_lines), dialect=csv.unix_dialect) header = tuple(next(reader)) @@ -89,10 +155,72 @@ def __getitem__(self, key: _KT, /) -> _VT: msg = f"{key!r} does not refer to a known dataset." raise TypeError(msg) - def get(self, key: _KT, default: _T) -> _VT | _T: - if not self._mapping: - self._mapping.update(self.read()) - return self._mapping.get(key, default) + +class SchemaCache(CompressedCache["_Dataset", "_FlSchema"]): + """ + `json`_, `gzip`_ -based, lazy schema lookup. + + - Primarily benefits ``pandas``, which needs some help identifying **temporal** columns. + - Utilizes `data package`_ schema types. + - All methods return falsy containers instead of exceptions + + .. _json: + https://docs.python.org/3/library/json.html + .. _gzip: + https://docs.python.org/3/library/gzip.html + .. _data package: + https://github.com/vega/vega-datasets/pull/631 + """ + + def __init__( + self, + fp: Path, + /, + *, + tp: type[MutableMapping[_Dataset, _FlSchema]] = dict["_Dataset", "_FlSchema"], + ) -> None: + self.fp: Path = fp + self._mapping: MutableMapping[_Dataset, _FlSchema] = tp() + + def read(self) -> Any: + import json + + with self as f: + return json.load(f) + + def __getitem__(self, key: _Dataset, /) -> _FlSchema: + return self.get(key, {}) + + def by_dtype(self, name: _Dataset, *dtypes: type[DType]) -> list[str]: + """ + Return column names specfied in ``name``'s schema. + + Parameters + ---------- + name + Dataset name. + *dtypes + Optionally, only return columns matching the given data type(s). + """ + if (match := self[name]) and dtypes: + include = {_DTYPE_TO_FIELD[tp] for tp in dtypes} + return [col for col, tp_str in match.items() if tp_str in include] + else: + return list(match) + + def schema(self, name: _Dataset, /) -> Mapping[str, DType]: + return { + column: _FIELD_TO_DTYPE[tp_str]() for column, tp_str in self[name].items() + } + + def schema_cast(self, name: _Dataset, /) -> Iterator[nw.Expr]: + """ + Can be passed directly to `.with_columns(...). + + BUG: `cars` doesnt work in either pandas backend + """ + for column, dtype in self.schema(name).items(): + yield nw.col(column).cast(dtype) class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): @@ -219,3 +347,4 @@ def _ensure_active(self) -> None: url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL) +schema_cache = SchemaCache(_SCHEMA) diff --git a/altair/datasets/_metadata/datapackage_schemas.json.gz b/altair/datasets/_metadata/datapackage_schemas.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..537dcd28ba9377319523683299cb1773ddf40e79 GIT binary patch literal 2490 zcmV;r2}SlFiwFn+00002|72lwVQ^t%Yhh<)Uvpz$tz$R-L!ItIS||(ffKCc2x9p&f;3z=l4IH%W+WdN6

XuiARkfAhqQ;6aWN zvyuC)aAS!);apf|9XB@6{-~V5biPnJSDd-ZVjh{YnFf{ur8NhZ?3B`f5ScF{DT0u# zaiUk=CCh**!KMW6xB`Po84iu)UpvDar|T}mnBRB|`pK|5T{|X}Cp@|06IZQ8I+AQ= zgm%4(3Sq2E6|0eNNEQWClY;~0+)PgyhFvZ9Do%qlj+~O^x#8AguM}mq(WcTmpJH6s zU@o;Pg==#@Rfu<61oeW~`l&bK%gKXq64CniI{9R@wMf3t+Q@t>avsIPM^}GbC2J-* zM%@{mx0#QhZ4-MA^IKIP5K0e6Pj>A_K`C z7H3EG?DTQM`72y*C(uaL=Cw`WtfSpKzzhBQy6m=N8S%%Xmy z8;)F{0zXFP(OaZqsV&Y2qEt+x){Hw+F+o}Op!vi1IJ>nk+H}cE=a}|zB-hEeQPw3Hd`@^iCp@1Mp3f1_55JQWM22%B!#R;*L1b7E85Tr_ z1(9JvWLOXx7DVq0!t;XgydXR;3C~Ny^OEqqBs?z(&r8DdlJLAFJTD2)OT;rY<);KP zCy+A&c}E~W6Uci4`9L7&1hOCy!t>p^5d3|e7p0>qhCD1kXrH-?3mJ3;XN<>3d#G)L3m3x zjr&9KdtN6qrC$=nLh%1Q-VRN8X>{B4nIrrp?+wNtX%zd9`rnaP02pm}#%v7r#AJ(i zGsEhuCkjae?e-(fBk~jx*+WKK_xN%qw#o5~_1nKU4r#5O1ukY8^OltRB@tN1MC%VqR3yQXut#T*W-p- z=UsBctWAC@F>c+zUX29!Q#8Iu?gfB9X{2L*yLJ92C&0SQ4McZXp2M&xa{m1pSkO0# zl36^Z?}o`)#hG*!4BVNn`yn^#jC=fa)b-y^MLoJfH&Creh}!0T=+0sTWCK~4ADKGm zKYmxd5hnM49|ym<=rch=8xz1}h7}GR?vW=q;5%p=^RXyk&Q31sFtF(&$yq%t5)Oxc z38E%=)wCxLKcrARxee=kyvgdfmr113-Btei2`=q*IYoXJGWV;1WDT9iF;py~!`oDl z1Ub%1TbJBM9ybEQ7kv@Mgwme;4achAIVaZ_>jZ>*RN9e(R%^vfRvp{rgCiJ6-gfYk z-G|0AraTG}N|$(d^mPd@SDY6T2kWl9jeHOCHF^}=uex@Ap|mQPsqu}FK4{xkY7Kos z6J9Fmo1Nw62_nzqp|PO&foA%>b9H!_P9f2!QkNw?07C)D*guxyZ<%Tfmbuo@&+^u= zjpS*N0&jh_`WD(5mh;s4KHJA`|6mAhnRJ3A&26P2lQJ33geQPX-7Lb}$CZN{@-9TZ8unpEIxV7-DKLxjM{KWv-*$aQb!3hq$ z_9^|?S;>WeLF`Xz8+wPKwBe`BV Callable[..., IntoDataFrameT]: def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: return self._scan_fn[_extract_suffix(source, is_ext_scan)] + def _schema_kwds(self, result: Metadata, /) -> dict[str, Any]: + """Hook to provide additional schema metadata on read.""" + return {} + def dataset( self, name: Dataset | LiteralString, @@ -149,6 +153,8 @@ def dataset( result = cast("Metadata", next(it)) url = result["url_npm"] fn = self.read_fn(url) + if default_kwds := self._schema_kwds(result): + kwds = default_kwds | kwds if kwds else default_kwds if self.cache.is_active(): fp = self.cache.path / (result["sha"] + result["suffix"]) @@ -238,7 +244,32 @@ def __repr__(self) -> str: def __init__(self, name: LiteralString, /) -> None: ... -class _PandasReader(_Reader["pd.DataFrame", "pd.DataFrame"]): +class _PandasReaderBase(_Reader["pd.DataFrame", "pd.DataFrame"], Protocol): + """ + Provides temporal column names as keyword arguments on read. 
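    A rough illustration of the keyword arguments this mixin is expected to inject,
    using dataset/column names from the bundled schemas; the ``reader`` instance is
    hypothetical, so the examples are skipped rather than executed:

        >>> reader._schema_kwds({"dataset_name": "cars", "suffix": ".json"})  # doctest: +SKIP
        {'convert_dates': ['Year']}
        >>> reader._schema_kwds({"dataset_name": "co2-concentration", "suffix": ".csv"})  # doctest: +SKIP
        {'parse_dates': ['Date']}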
+ + Related + ------- + - https://github.com/vega/altair/pull/3631#issuecomment-2480816377 + - https://github.com/vega/vega-datasets/pull/631 + - https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html + - https://pandas.pydata.org/docs/reference/api/pandas.read_json.html + """ + + def _schema_kwds(self, result: Metadata, /) -> dict[str, Any]: + from altair.datasets._cache import schema_cache + + name: Any = result["dataset_name"] + suffix = result["suffix"] + if cols := schema_cache.by_dtype(name, nw.Date, nw.Datetime): + if suffix == ".json": + return {"convert_dates": cols} + elif suffix in {".csv", ".tsv"}: + return {"parse_dates": cols} + return super()._schema_kwds(result) + + +class _PandasReader(_PandasReaderBase): def __init__(self, name: _Pandas, /) -> None: self._name = _requirements(name) if not TYPE_CHECKING: @@ -253,7 +284,7 @@ def __init__(self, name: _Pandas, /) -> None: self._scan_fn = {".parquet": pd.read_parquet} -class _PandasPyArrowReader(_Reader["pd.DataFrame", "pd.DataFrame"]): +class _PandasPyArrowReader(_PandasReaderBase): def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: _pd, _pa = _requirements(name) self._name = name diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index 0b681b834..c83c6066e 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -257,3 +257,25 @@ class Metadata(TypedDict, total=False): suffix: str tag: str url_npm: str + + +FlFieldStr: TypeAlias = Literal[ + "integer", + "number", + "boolean", + "string", + "object", + "array", + "date", + "datetime", + "time", + "duration", +] +""" +String representation of `frictionless`_ `Field Types`_. + +.. _frictionless: + https://github.com/frictionlessdata/frictionless-py +.. _Field Types: + https://datapackage.org/standard/table-schema/#field-types +""" diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 5d2b93c2d..9d91c275e 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -30,10 +30,12 @@ from pathlib import Path from typing import Literal + import pandas as pd import polars as pl from _pytest.mark.structures import ParameterSet - from altair.datasets._readers import _Backend, _Polars + from altair.datasets._readers import _Backend, _PandasAny, _Polars + from altair.vegalite.v5.schema._typing import OneOrSeq from tests import MarksType CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" @@ -743,3 +745,67 @@ def test_metadata_columns(backend: _Backend, metadata_columns: frozenset[str]) - native = fn(_METADATA) schema_columns = nw.from_native(native).lazy().collect().columns assert set(schema_columns) == metadata_columns + + +@skip_requires_pyarrow +@pytest.mark.parametrize("backend", ["pandas", "pandas[pyarrow]"]) +@pytest.mark.parametrize( + ("name", "columns"), + [ + ("birdstrikes", "Flight Date"), + ("cars", "Year"), + ("co2-concentration", "Date"), + ("crimea", "date"), + ("football", "date"), + ("iowa-electricity", "year"), + ("la-riots", "death_date"), + ("ohlc", "date"), + ("seattle-weather-hourly-normals", "date"), + ("seattle-weather", "date"), + ("sp500-2000", "date"), + ("unemployment-across-industries", "date"), + ("us-employment", "month"), + ], +) +def test_pandas_date_parse( + backend: _PandasAny, + name: Dataset, + columns: OneOrSeq[str], + polars_loader: Loader[pl.DataFrame, pl.LazyFrame], +) -> None: + """ + Ensure schema defaults are correctly parsed. + + NOTE: + - Depends on ``frictionless`` being able to detect the date/datetime columns. 
+ - Not all format strings work + """ + date_columns: list[str] = [columns] if isinstance(columns, str) else list(columns) + + load = Loader.from_backend(backend) + url = load.url(name) + kwds: dict[str, Any] = ( + {"convert_dates": date_columns} + if url.endswith(".json") + else {"parse_dates": date_columns} + ) + kwds_empty: dict[str, Any] = {k: [] for k in kwds} + + df_schema_derived: pd.DataFrame = load(name) + nw_schema = nw.from_native(df_schema_derived).schema + + df_manually_specified: pd.DataFrame = load(name, **kwds) + df_dates_empty: pd.DataFrame = load(name, **kwds_empty) + + assert set(date_columns).issubset(nw_schema) + for column in date_columns: + assert nw_schema[column] in {nw.Date, nw.Datetime} + + assert nw_schema == nw.from_native(df_manually_specified).schema + assert nw_schema != nw.from_native(df_dates_empty).schema + + # NOTE: Checking `polars` infers the same[1] as what `pandas` needs a hint for + # [1] Doesn't need to be exact, just recognise as *some kind* of date/datetime + pl_schema: pl.Schema = polars_loader(name).schema + for column in date_columns: + assert pl_schema[column].is_temporal() diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 1402a9c7b..66c31e6f6 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -39,7 +39,15 @@ else: from typing_extensions import TypeAlias - _PathAlias: TypeAlias = Literal["npm_tags", "gh_tags", "gh_trees"] + _PathAlias: TypeAlias = Literal[ + "npm_tags", + "gh_tags", + "gh_trees", + "typing", + "url", + "dpkg_features", + "dpkg_schemas", + ] __all__ = ["app"] @@ -102,15 +110,17 @@ def __init__( npm_cdn_url=self._npm.url.CDN, **kwds_gh, ) - self._paths = types.MappingProxyType["_PathAlias", Path]( + self.paths = types.MappingProxyType["_PathAlias", Path]( { "npm_tags": self.npm._paths["tags"], "gh_tags": self.github._paths["tags"], "gh_trees": self.github._paths["trees"], + "typing": out_fp_typing, + "url": out_dir_altair / "url.csv.gz", + "dpkg_features": out_dir_altair / "datapackage_features.parquet", + "dpkg_schemas": out_dir_altair / "datapackage_schemas.json.gz", } ) - self._fp_typing: Path = out_fp_typing - self._fp_url: Path = out_dir_altair / "url.csv.gz" @property def github(self) -> GitHub: @@ -131,13 +141,13 @@ def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: """ print("Syncing datasets ...") npm_tags = self.npm.tags() - self.write_parquet(npm_tags, self._paths["npm_tags"]) + self.write_parquet(npm_tags, self.paths["npm_tags"]) gh_tags = self.github.refresh_tags(npm_tags) - self.write_parquet(gh_tags, self._paths["gh_tags"]) + self.write_parquet(gh_tags, self.paths["gh_tags"]) gh_trees = self.github.refresh_trees(gh_tags) - self.write_parquet(gh_trees, self._paths["gh_trees"]) + self.write_parquet(gh_trees, self.paths["gh_trees"]) npm_urls_min = ( gh_trees.lazy() @@ -145,31 +155,29 @@ def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: .filter(col("size") == col("size").min().over("dataset_name")) .select("dataset_name", "url_npm") ) - self.write_csv_gzip(npm_urls_min, self._fp_url) + self.write_csv_gzip(npm_urls_min, self.paths["url"]) + + package = self.npm.datapackage() + # TODO: Re-enable after deciding on how best to utilize + # self.write_parquet(package["features"], self.paths["dpkg_features"]) + self.write_json_gzip(package["schemas"], self.paths["dpkg_schemas"]) if include_typing: - self.generate_typing(self._fp_typing) + self.generate_typing() return gh_trees def reset(self) -> None: """Remove all metadata files.""" - for 
fp in self._paths.values(): + for fp in self.paths.values(): fp.unlink(missing_ok=True) def read(self, name: _PathAlias, /) -> pl.DataFrame: """Read existing metadata from file.""" - return pl.read_parquet(self._from_alias(name)) + return pl.read_parquet(self.paths[name]) def scan(self, name: _PathAlias, /) -> pl.LazyFrame: """Scan existing metadata from file.""" - return pl.scan_parquet(self._from_alias(name)) - - def _from_alias(self, name: _PathAlias, /) -> Path: - if name not in {"npm_tags", "gh_tags", "gh_trees"}: - msg = f'Expected one of {["npm_tags", "gh_tags", "gh_trees"]!r}, but got: {name!r}' - raise TypeError(msg) - else: - return self._paths[name] + return pl.scan_parquet(self.paths[name]) def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: """ @@ -193,6 +201,21 @@ def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> Non df.write_csv(buf) f.write(buf.getbuffer()) + def write_json_gzip(self, obj: Any, fp: Path, /) -> None: + """ + Write ``obj`` as a `gzip`_ compressed ``json`` file. + + .. _gzip: + https://docs.python.org/3/library/gzip.html + """ + if fp.suffix != ".gz": + fp = fp.with_suffix(".json.gz") + if not fp.exists(): + fp.touch() + + with gzip.GzipFile(fp, mode="wb", mtime=0) as f: + f.write(json.dumps(obj).encode()) + def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: """Write ``frame`` to ``fp``, with some extra safety.""" if not fp.exists(): @@ -207,7 +230,7 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None with fp_schema.open("w") as f: json.dump(schema, f, indent=2) - def generate_typing(self, output: Path, /) -> None: + def generate_typing(self) -> None: from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT tags = self.scan("gh_tags").select("tag").collect().to_series() @@ -314,6 +337,20 @@ def generate_typing(self, output: Path, /) -> None: f"{textwrap.indent(textwrap.dedent(examples), indent)}" ) + FIELD = "FlFieldStr" + FIELD_TYPES = ( + "integer", + "number", + "boolean", + "string", + "object", + "array", + "date", + "datetime", + "time", + "duration", + ) + contents = ( f"{HEADER_COMMENT}", "from __future__ import annotations\n", @@ -341,8 +378,14 @@ def generate_typing(self, output: Path, /) -> None: doc=metadata_doc, comment="", ), + f"{FIELD}: TypeAlias = {utils.spell_literal(FIELD_TYPES)}\n" + '"""\n' + "String representation of `frictionless`_ `Field Types`_.\n\n" + f".. _frictionless:\n{indent}https://github.com/frictionlessdata/frictionless-py\n" + f".. _Field Types:\n{indent}https://datapackage.org/standard/table-schema/#field-types\n" + '"""\n', ) - ruff.write_lint_format(output, contents) + ruff.write_lint_format(self.paths["typing"], contents) _alt_datasets = Path(__file__).parent.parent.parent / "altair" / "datasets" diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py new file mode 100644 index 000000000..da1f8375e --- /dev/null +++ b/tools/datasets/datapackage.py @@ -0,0 +1,133 @@ +""" +``frictionless`` `datapackage`_ parsing. + +.. 
_datapackage: + https://datapackage.org/ +""" + +from __future__ import annotations + +from collections import deque +from pathlib import Path +from typing import TYPE_CHECKING, Any, Literal, get_args + +import polars as pl +from polars import col +from polars import selectors as cs + +from tools.datasets.models import ParsedPackage +from tools.schemapi import utils + +if TYPE_CHECKING: + from collections.abc import Iterable, Iterator, Mapping, Sequence + + from altair.datasets._typing import Dataset, FlFieldStr + from tools.datasets.models import FlPackage + + +__all__ = ["parse_package"] + + +DATASET_NAME: Literal["dataset_name"] = "dataset_name" + +# # NOTE: Flag columns +# Storing these instead of the full **56KB** `datapackage.json` +FEATURES: Sequence[pl.Expr] = ( + (col("format") == "png").alias("is_image"), + (col("type") == "table").alias("is_tabular"), + (col("format") == "geojson").alias("is_geo"), + (col("format") == "topojson").alias("is_topo"), + col("format").is_in(("geojson", "topojson")).alias("is_spatial"), + (col("format").str.contains("json")).alias("is_json"), +) + + +def parse_package(pkg: FlPackage, /) -> ParsedPackage: + return ParsedPackage(features=extract_features(pkg), schemas=extract_schemas(pkg)) + + +def extract_schemas(pkg: FlPackage, /) -> Mapping[Dataset, Mapping[str, FlFieldStr]]: + """Reduce all datasets with schemas to a minimal mapping.""" + m: Any = { + Path(rsrc["path"]).stem: {f["name"]: f["type"] for f in s["fields"]} + for rsrc in pkg["resources"] + if (s := rsrc.get("schema")) + } + return m + + +def extract_features(pkg: FlPackage, /) -> pl.DataFrame: + # NOTE: `is_name_collision` != `GitHub.trees`/`Metadata.name_collision` + # - This only considers latest version + # - Those others are based on whatever tag the tree refers to + # https://github.com/vega/vega-datasets/issues/633 + EXCLUDE = ( + "name", + "type", + "format", + "scheme", + "mediatype", + "encoding", + "dialect", + "schema", + ) + return ( + pl.LazyFrame(pkg["resources"]) + .with_columns( + path_stem("path").alias(DATASET_NAME), + cs.exclude("name"), + col("name").is_duplicated().alias("is_name_collision"), + ) + .select( + DATASET_NAME, + path_suffix("path").alias("suffix"), + ~cs.by_name(DATASET_NAME, EXCLUDE), + *FEATURES, + col("schema").is_not_null().alias("has_schema"), + ) + .collect() + ) + + +def path_stem(column: str | pl.Expr, /) -> pl.Expr: + """ + The final path component, minus its last suffix. + + Needed since `Resource.name`_ must be lowercase. + + .. _Resource.name: + https://specs.frictionlessdata.io/data-resource/#name + """ + path = col(column) if isinstance(column, str) else column + rfind = (path.str.len_bytes() - 1) - path.str.reverse().str.find(r"\.") + return path.str.head(rfind) + + +def path_suffix(column: str | pl.Expr, /) -> pl.Expr: + """ + The final component's last suffix. + + This includes the leading period. For example: '.txt'. 
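    A minimal sketch of both path helpers against a typical resource path
    (illustrative only, hence skipped):

        >>> import polars as pl  # doctest: +SKIP
        >>> frame = pl.DataFrame({"path": ["cars.json"]})  # doctest: +SKIP
        >>> frame.select(stem=path_stem("path"), suffix=path_suffix("path")).row(0)  # doctest: +SKIP
        ('cars', '.json')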
+ """ + path = col(column) if isinstance(column, str) else column + return path.str.tail(path.str.reverse().str.find(r"\.") + 1) + + +def features_typing(frame: pl.LazyFrame | pl.DataFrame, /) -> Iterator[str]: + guards = deque[str]() + ldf = frame.lazy() + for feat in FEATURES: + guard_name = feat.meta.output_name() + alias_name = guard_name.removeprefix("is_").capitalize() + members = ldf.filter(guard_name).select(DATASET_NAME).collect().to_series() + guards.append(guard_literal(alias_name, guard_name, members)) + yield f"{alias_name}: TypeAlias = {utils.spell_literal(members)}" + yield from guards + + +def guard_literal(alias_name: str, guard_name: str, members: Iterable[str], /) -> str: + """Type narrowing function, all members must be literal strings.""" + return ( + f"def {guard_name}(obj: Any) -> TypeIs[{alias_name}]:\n" + f" return obj in set({sorted(set(members))!r})\n" + ) diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 449c412ef..a454ed30c 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -3,7 +3,8 @@ from __future__ import annotations import sys -from typing import TYPE_CHECKING, Literal, NamedTuple +from collections.abc import Mapping, Sequence +from typing import TYPE_CHECKING, Any, Literal, NamedTuple if sys.version_info >= (3, 14): from typing import TypedDict @@ -14,9 +15,18 @@ import time if sys.version_info >= (3, 11): - from typing import LiteralString, Required + from typing import LiteralString, NotRequired, Required else: - from typing_extensions import LiteralString, Required + from typing_extensions import LiteralString, NotRequired, Required + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + import polars as pl + + from altair.datasets._typing import Dataset, FlFieldStr + +Map: TypeAlias = Mapping[str, Any] class GitHubUrl(NamedTuple): @@ -31,6 +41,7 @@ class GitHubUrl(NamedTuple): class NpmUrl(NamedTuple): CDN: LiteralString TAGS: LiteralString + GH: LiteralString class GitHubTag(TypedDict): @@ -178,3 +189,80 @@ class GitHubRateLimitResources(TypedDict, total=False): graphql: GitHubRateLimit integration_manifest: GitHubRateLimit code_search: GitHubRateLimit + + +##################################################### +# frictionless datapackage +##################################################### + + +FlCsvDialect: TypeAlias = Mapping[ + Literal["csv"], Mapping[Literal["delimiter"], Literal["\t"]] +] +FlJsonDialect: TypeAlias = Mapping[ + Literal[r"json"], Mapping[Literal["keyed"], Literal[True]] +] + + +class FlField(TypedDict): + """https://datapackage.org/standard/table-schema/#field.""" + + name: str + type: FlFieldStr + + +class FlSchema(TypedDict): + """https://datapackage.org/standard/table-schema/#properties.""" + + fields: Sequence[FlField] + + +class FlResource(TypedDict): + """https://datapackage.org/standard/data-resource/#properties.""" + + name: Dataset + type: Literal["table", "file", r"json"] + path: str + format: Literal[ + "arrow", "csv", "geojson", r"json", "parquet", "png", "topojson", "tsv" + ] + mediatype: Literal[ + "application/parquet", + "application/vnd.apache.arrow.file", + "image/png", + "text/csv", + "text/tsv", + r"text/json", + "text/geojson", + "text/topojson", + ] + schema: NotRequired[FlSchema] + scheme: Literal["file"] + dialect: NotRequired[FlCsvDialect | FlJsonDialect] + encoding: NotRequired[Literal["utf-8"]] + + +class FlPackage(TypedDict): + """ + A subset of the `Data Package`_ standard. + + .. 
_Data Package: + https://datapackage.org/standard/data-package/#properties + """ + + name: Literal["vega-datasets"] + version: str + homepage: str + description: str + licenses: Sequence[Map] + contributors: Sequence[Map] + sources: Sequence[Map] + created: str + resources: Sequence[FlResource] + + +class ParsedPackage(TypedDict): + """Minimal representations to write to disk.""" + + features: pl.DataFrame + schemas: Mapping[Dataset, Mapping[str, FlFieldStr]] diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index a5f068082..f71037d5c 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -2,23 +2,28 @@ import json import urllib.request -from typing import TYPE_CHECKING, ClassVar, Literal +from pathlib import Path +from typing import TYPE_CHECKING, Any, ClassVar, Literal import polars as pl -from tools.datasets import semver +from tools.datasets import datapackage, semver from tools.datasets.models import NpmUrl if TYPE_CHECKING: import sys - from pathlib import Path from urllib.request import OpenerDirector if sys.version_info >= (3, 11): from typing import LiteralString else: from typing_extensions import LiteralString - from tools.datasets.models import NpmPackageMetadataResponse + from altair.datasets._typing import Version + from tools.datasets.models import ( + FlPackage, + NpmPackageMetadataResponse, + ParsedPackage, + ) __all__ = ["Npm"] @@ -46,6 +51,7 @@ def __init__( self._url: NpmUrl = NpmUrl( CDN=f"https://cdn.{jsdelivr}.net/{npm}/{package}@", TAGS=f"https://data.{jsdelivr}.com/{jsdelivr_version}/packages/{npm}/{package}", + GH=f"https://cdn.{jsdelivr}.net/gh/vega/{package}@", ) @property @@ -78,3 +84,43 @@ def tags(self) -> pl.DataFrame: if (tag := v["version"]) and semver.CANARY not in tag ] return pl.DataFrame({"tag": versions}).pipe(semver.with_columns) + + def file_gh( + self, + branch_or_tag: Literal["main"] | Version | LiteralString, + path: str, + /, + ) -> Any: + """ + Request a file from the `jsdelivr GitHub`_ endpoint. + + Parameters + ---------- + branch_or_tag + Version of the file, see `branches`_ and `tags`_. + path + Relative filepath from the root of the repo. + + .. _jsdelivr GitHub: + https://www.jsdelivr.com/documentation#id-github + .. _branches: + https://github.com/vega/vega-datasets/branches + .. 
_tags: + https://github.com/vega/vega-datasets/tags + """ + path = path.lstrip("./") + suffix = Path(path).suffix + if suffix == ".json": + headers = {"Accept": "application/json"} + read_fn = json.load + else: + raise NotImplementedError(path, suffix) + req = urllib.request.Request( + f"{self.url.GH}{branch_or_tag}/{path}", headers=headers + ) + with self._opener.open(req) as response: + return read_fn(response) + + def datapackage(self, *, tag: LiteralString | None = None) -> ParsedPackage: + pkg: FlPackage = self.file_gh(tag or "main", "datapackage.json") + return datapackage.parse_package(pkg) From 9274284a16962c55df1faff2db20ec1e0d55313f Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Dec 2024 16:08:48 +0000 Subject: [PATCH 133/201] refactor(ruff): Apply `TC006` fixes in new code Related https://github.com/vega/altair/pull/3706 --- tests/test_datasets.py | 2 +- tools/datasets/datapackage.py | 2 +- tools/datasets/github.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 9d91c275e..f9dd4c5a3 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -493,7 +493,7 @@ def test_reader_cache( if nw_dep.is_polars_dataframe(lookup_groups): left, right = ( lookup_groups, - cast(pl.DataFrame, data("lookup_groups", tag="v2.5.3")), + cast("pl.DataFrame", data("lookup_groups", tag="v2.5.3")), ) else: left, right = ( diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index da1f8375e..deb63fbb9 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -9,7 +9,7 @@ from collections import deque from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal, get_args +from typing import TYPE_CHECKING, Any, Literal import polars as pl from polars import col diff --git a/tools/datasets/github.py b/tools/datasets/github.py index b9b156c60..406eca3dc 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -487,4 +487,4 @@ def _iter_rows(df: pl.DataFrame, stop: int | None, /, tp: type[_TD]) -> Iterator if not TYPE_CHECKING: assert is_typeddict(tp) or issubclass(tp, Mapping) - return cast(Iterator[_TD], islice(df.iter_rows(named=True), stop)) + return cast("Iterator[_TD]", islice(df.iter_rows(named=True), stop)) From 8e232b8d38d39c2832e64f5b959482585c4cc4e3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Dec 2024 17:01:45 +0000 Subject: [PATCH 134/201] docs(DRAFT): Add notes on `datapackage.features_typing` --- tools/datasets/datapackage.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index deb63fbb9..445974795 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -114,6 +114,15 @@ def path_suffix(column: str | pl.Expr, /) -> pl.Expr: def features_typing(frame: pl.LazyFrame | pl.DataFrame, /) -> Iterator[str]: + """ + Current plan is to use type aliases in overloads. 
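    Each feature flag is expected to expand into an alias + guard pair roughly like
    the following (member names are illustrative; the shape mirrors ``guard_literal``):

        Image: TypeAlias = Literal["7zip", "ffox", "gimp"]

        def is_image(obj: Any) -> TypeIs[Image]:
            return obj in set(['7zip', 'ffox', 'gimp'])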
+ + - ``Tabular`` can be treated interchangeably + - ``Image`` can only work with ``url`` + - ``(Spatial|Geo|Topo)`` can be read with ``polars`` + - A future version may implement dedicated support https://github.com/vega/altair/pull/3631#discussion_r1845931955 + - ``Json`` should warn when using the ``pyarrow`` backend + """ guards = deque[str]() ldf = frame.lazy() for feat in FEATURES: From 93308958fbf40873fc4023d6b20e1e81bc97d5ab Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Dec 2024 18:16:04 +0000 Subject: [PATCH 135/201] docs: Update `Loader.from_backend` example w/ dtypes Related https://github.com/vega/altair/pull/3631/commits/909e7d05e57718b2f634a7e6781cb4e58a835837 --- altair/datasets/_loader.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index ce2559aed..f9190f789 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -117,15 +117,15 @@ def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: pandas.core.frame.DataFrame >>> cars.dtypes # doctest: +SKIP - Name string[pyarrow] - Miles_per_Gallon double[pyarrow] - Cylinders int64[pyarrow] - Displacement double[pyarrow] - Horsepower int64[pyarrow] - Weight_in_lbs int64[pyarrow] - Acceleration double[pyarrow] - Year string[pyarrow] - Origin string[pyarrow] + Name string[pyarrow] + Miles_per_Gallon double[pyarrow] + Cylinders int64[pyarrow] + Displacement double[pyarrow] + Horsepower int64[pyarrow] + Weight_in_lbs int64[pyarrow] + Acceleration double[pyarrow] + Year timestamp[ns][pyarrow] + Origin string[pyarrow] dtype: object """ obj = Loader.__new__(Loader) From caf534da20f9b96187283d67a458f17c0b0346bb Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Dec 2024 18:27:33 +0000 Subject: [PATCH 136/201] feat: Use `_pl_read_json_roundtrip` instead of `pl.read_json` for `pyarrow` Provides better dtype inference --- altair/datasets/_readers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 5b9829b9e..e2607acbc 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -380,10 +380,9 @@ def __init__(self, name: _PyArrow, /) -> None: # ------------------------------------------------------- # NOTE: Prefer `polars` since it is zero-copy and fast (1) if find_spec("polars") is not None: - import polars as pl def pa_read_json(source: StrPath, /, **kwds) -> pa.Table: - return pl.read_json(source).to_arrow() + return _pl_read_json_roundtrip(source).to_arrow() else: # NOTE: Convert inline from stdlib json (2) From 75bf2bad9d5d8f59c6084f1f58686085409f604c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Dec 2024 18:57:03 +0000 Subject: [PATCH 137/201] docs: Replace example dataset Switching to one with a timestamp that `frictionless` recognises https://github.com/vega/vega-datasets/blob/8745f5c61ba951fe057a42562b8b88604b4a3735/datapackage.json#L2674-L2689 https://github.com/vega/vega-datasets/blob/8745f5c61ba951fe057a42562b8b88604b4a3735/datapackage.json#L45-L57 --- altair/datasets/_loader.py | 88 +++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index f9190f789..2b8a2cd95 100644 --- a/altair/datasets/_loader.py +++ 
b/altair/datasets/_loader.py @@ -171,72 +171,72 @@ def __call__( from altair.datasets import Loader data = Loader.from_backend("polars") - source = data("stocks", tag="v2.10.0") + source = data("iowa-electricity", tag="v2.10.0") >>> source.columns # doctest: +SKIP - ['symbol', 'date', 'price'] + ['year', 'source', 'net_generation'] >>> source # doctest: +SKIP - shape: (560, 3) - ┌────────┬────────────┬────────┐ - │ symbol ┆ date ┆ price │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ f64 │ - ╞════════╪════════════╪════════╡ - │ MSFT ┆ Jan 1 2000 ┆ 39.81 │ - │ MSFT ┆ Feb 1 2000 ┆ 36.35 │ - │ MSFT ┆ Mar 1 2000 ┆ 43.22 │ - │ MSFT ┆ Apr 1 2000 ┆ 28.37 │ - │ MSFT ┆ May 1 2000 ┆ 25.45 │ - │ … ┆ … ┆ … │ - │ AAPL ┆ Nov 1 2009 ┆ 199.91 │ - │ AAPL ┆ Dec 1 2009 ┆ 210.73 │ - │ AAPL ┆ Jan 1 2010 ┆ 192.06 │ - │ AAPL ┆ Feb 1 2010 ┆ 204.62 │ - │ AAPL ┆ Mar 1 2010 ┆ 223.02 │ - └────────┴────────────┴────────┘ + shape: (51, 3) + ┌────────────┬──────────────┬────────────────┐ + │ year ┆ source ┆ net_generation │ + │ --- ┆ --- ┆ --- │ + │ date ┆ str ┆ i64 │ + ╞════════════╪══════════════╪════════════════╡ + │ 2001-01-01 ┆ Fossil Fuels ┆ 35361 │ + │ 2002-01-01 ┆ Fossil Fuels ┆ 35991 │ + │ 2003-01-01 ┆ Fossil Fuels ┆ 36234 │ + │ 2004-01-01 ┆ Fossil Fuels ┆ 36205 │ + │ 2005-01-01 ┆ Fossil Fuels ┆ 36883 │ + │ … ┆ … ┆ … │ + │ 2013-01-01 ┆ Renewables ┆ 16476 │ + │ 2014-01-01 ┆ Renewables ┆ 17452 │ + │ 2015-01-01 ┆ Renewables ┆ 19091 │ + │ 2016-01-01 ┆ Renewables ┆ 21241 │ + │ 2017-01-01 ┆ Renewables ┆ 21933 │ + └────────────┴──────────────┴────────────────┘ Using ``pandas``: data = Loader.from_backend("pandas") - source = data("stocks", tag="v2.10.0") + source = data("iowa-electricity", tag="v2.10.0") >>> source.columns # doctest: +SKIP - Index(['symbol', 'date', 'price'], dtype='object') + Index(['year', 'source', 'net_generation'], dtype='object') >>> source # doctest: +SKIP - symbol date price - 0 MSFT Jan 1 2000 39.81 - 1 MSFT Feb 1 2000 36.35 - 2 MSFT Mar 1 2000 43.22 - 3 MSFT Apr 1 2000 28.37 - 4 MSFT May 1 2000 25.45 - .. ... ... ... - 555 AAPL Nov 1 2009 199.91 - 556 AAPL Dec 1 2009 210.73 - 557 AAPL Jan 1 2010 192.06 - 558 AAPL Feb 1 2010 204.62 - 559 AAPL Mar 1 2010 223.02 - - [560 rows x 3 columns] + year source net_generation + 0 2001-01-01 Fossil Fuels 35361 + 1 2002-01-01 Fossil Fuels 35991 + 2 2003-01-01 Fossil Fuels 36234 + 3 2004-01-01 Fossil Fuels 36205 + 4 2005-01-01 Fossil Fuels 36883 + .. ... ... ... 
+ 46 2013-01-01 Renewables 16476 + 47 2014-01-01 Renewables 17452 + 48 2015-01-01 Renewables 19091 + 49 2016-01-01 Renewables 21241 + 50 2017-01-01 Renewables 21933 + + [51 rows x 3 columns] Using ``pyarrow``: data = Loader.from_backend("pyarrow") - source = data("stocks", tag="v2.10.0") + source = data("iowa-electricity", tag="v2.10.0") >>> source.column_names # doctest: +SKIP - ['symbol', 'date', 'price'] + ['year', 'source', 'net_generation'] >>> source # doctest: +SKIP pyarrow.Table - symbol: string - date: string - price: double + year: date32[day] + source: string + net_generation: int64 ---- - symbol: [["MSFT","MSFT","MSFT","MSFT","MSFT",...,"AAPL","AAPL","AAPL","AAPL","AAPL"]] - date: [["Jan 1 2000","Feb 1 2000","Mar 1 2000","Apr 1 2000","May 1 2000",...,"Nov 1 2009","Dec 1 2009","Jan 1 2010","Feb 1 2010","Mar 1 2010"]] - price: [[39.81,36.35,43.22,28.37,25.45,...,199.91,210.73,192.06,204.62,223.02]] + year: [[2001-01-01,2002-01-01,2003-01-01,2004-01-01,2005-01-01,...,2013-01-01,2014-01-01,2015-01-01,2016-01-01,2017-01-01]] + source: [["Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels",...,"Renewables","Renewables","Renewables","Renewables","Renewables"]] + net_generation: [[35361,35991,36234,36205,36883,...,16476,17452,19091,21241,21933]] """ return self._reader.dataset(name, suffix, tag=tag, **kwds) From d4930e7e91f2518c98edc917e7c8ceec6787e517 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 20 Dec 2024 22:10:52 +0000 Subject: [PATCH 138/201] fix(ruff): resolve `RUF043` warnings https://github.com/vega/altair/actions/runs/12439154550/job/34732432411?pr=3631 --- tests/test_datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index f9dd4c5a3..33779efa8 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -169,7 +169,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: assert load._reader._name == "pandas" monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pandas", None) - with pytest.raises(AltairDatasetsError, match="no.+backend"): + with pytest.raises(AltairDatasetsError, match=r"no.+backend"): from altair.datasets import load else: assert load._reader._name == "pandas[pyarrow]" @@ -191,7 +191,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pyarrow", None) - with pytest.raises(AltairDatasetsError, match="no.+backend"): + with pytest.raises(AltairDatasetsError, match=r"no.+backend"): from altair.datasets import load From 5a31333c9ff425f623134d03bab0aacd8c62c74e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 20 Dec 2024 22:12:51 +0000 Subject: [PATCH 139/201] build: run `generate-schema-wrapper` https://github.com/vega/altair/actions/runs/12439184312/job/34732516789?pr=3631 --- .../_metadata/datapackage_schemas.json.gz | Bin 2490 -> 2473 bytes altair/datasets/_metadata/metadata.parquet | Bin 18777 -> 19296 bytes tools/datasets/_metadata/tags.parquet | Bin 6290 -> 6289 bytes tools/datasets/_metadata/tags_npm.parquet | Bin 2599 -> 2598 bytes 4 files changed, 0 insertions(+), 0 deletions(-) diff --git a/altair/datasets/_metadata/datapackage_schemas.json.gz b/altair/datasets/_metadata/datapackage_schemas.json.gz index 537dcd28ba9377319523683299cb1773ddf40e79..34433838d4623a280939f19d83e067930800a66f 100644 GIT binary 
patch delta 2463 zcmV;Q31IfR6R8uBB!608bKAHP|0*0$AFeaijOAQnsc*KNOFeTwyS97jJ#iosBoTuE z2LLTA8GrXKKvE)i!HfIQ<|)ns*ne2;{&uNvCrp?|8)r}co_sqI%rW@m#5yBXdGfd9 zM6sGrDVcElDaAVG@Fv|VXV5;$oN#R(wyL(?aucXpm6-erbbqF7rs01i@v>v|2V$^gM)G|)z875PQ?btsIATlooEgN`iivH3u`s#+?-TF5cG0;o`HlDr zRx-m1m)x)?p4{l1%jt%XMZq(dT-4rQi7t|y)vQEZevzWATyoWWIrsmLnZqAaD4yIl z+%Tuj^z^L}rGHS#1HGiXidpW37D2#t@w1S*6a`Nn#8*DI)bNa4%cH1yX9;rdMq8KM zx{6~LhlP9IKq$sm9<^gK`3qVC**s6KFCG#w?n&to!G{gun%cbNt_vUCnv5qKa6zff zKH2Cl+{XO6E!nVz27bT8w%cJ-oW*EEgv&+ARklwus((ShLXTXwhO;(yq+f|h$-)ba zis_f=p+{h5ThMx5$6K{Xw1Mj_76=a(+rE zOF|(!pA((WiO%Ol=X0d<<3Gs`V#7JH;hfm8A~vju4J%^9irBCsHmryZE8_PR(RoF5 zUJ;$wM1SWs(RodDUK5?yMCUcpc};X)6P?#Y=QYy#?DX`MP?m&pMkwzH=IGm|Zb58Ku_p#sZGmN~!=2-)0mB{Csp*nczv1`p2g#_4*W;g#QbjQYv2I)&;g zlxI8<@rkR}B3m#UnEd5j9R&WRa*@M*)+yvl~@)tlZ$XP0#IRQ?ZJ40b^P}l$j zVnN#+*}a1UIrg}YV4_@DD8?LP8X&_H;C%>vRE~i0$n&`3%vA&)nQFG$Cf!&X@O-~@Y*fmQ|X$gjdyXPGp5**gY{JXQ$+}r4;l?T_1HXB}2k012t<^eRjMU}~< zFG_eTNJf;|0ud`IK^@_$b{ljVGQoE)x0D+0cHwy4%$d-yjpdBIP5V9KD1QK_+y1^z%|DE{)@ z8b(~}E#xLcZj)?bgzgb(A&hk?b)RHWazDNqdx!UdH}*ZtNV0({o#JQ9^4xH1v4Uc{Y zYJarhx{-5%OI=*?VS+V|m0BxivTDu@NaO|w16&oN_^k0lIX@4`=219_82{@b`DC=k zVCl0qGT#Bzimt+;tKYAZ2S!oWozZ!l`PYzK9;Pm%^?P_`ew24bD%j24#YJ)tca2+< z`&RjWB?;5IS}596z}9U`rhkbKkq3LjzG#y`vt!W0#z0JqXP!Kgu)Gq`%@OYR+BvS0 zM_n_gxgBXf((U@@>TmnyD)HLGB!LG5AWcEvUjuxA zGHh7i(7&LA7^BIy)|T{fmbuo%00YeoBF7b3;!u6!xb15+@U%w-kCJhEY&1XukwHX4 zr`)|W(5gGSf%_0XTz}GQ=%1V@U?0VQ$6a)caIG>gO1X4X0UCs>;KmOqT#GLM4$|K~ zjm*Nkg^Qv9=n@eg2J-1W&$wuuotpY+n6iL+Y+4vyrx&fO0N8%zImoGN0neLcw~ifT zoNa4tm}1QGbb6lSI79(R`|Or(m_?84Bls=4uo>S(r~JXm(|?`6Y8Krw9+UfyJ`Ef3 z5a)1lel_)9J@yfkj||Y3Nn5{lHoe_}CoWN_cP_nF9`xre{JBAK7FQ|qjp(@8m4wWZ zPc+1Do$_}>kk8+In+$b0$QHm9N6RPVD z36DJ9h#6bEKr#+C41uGxDwwJ98#dyZ){hbzJDWBE_EP{1UVMi8^6QY{=z7mH{ocMh ze6mO(!+)nzmnGc*=mhSm>pbvTrrLsK@W|=09AMT~@-$e1u@*F#f3oZh%X#X2!S{o) zE=fXtkWP?Vu&Xq*O3*H);nehk01PuBV=IWcw|>VAhm>bYN707ijlcjyC?6WJ%v%V> zR3@P4fp4JxMAU}cC?IXZS-%UUZ@plk4EKc(aDNDbqpsa3Y;LdQ!YAZzr?x(84$~+6 z!C{U?=}W8{KCw)%&IfRsRZNv6N5KmSHEWTvAhmn&5@+mb+L-zA(Db9m>m*fso-suH zgR904?#0bPg<)~b1>ghzbmC<_hpzbGyf=<+!hDy0LNXhMFVXs+%^;Y#WPPdJHZ;8|G1=wt|ac+Bk5@?B%*ifWE zQgO6F{`(Cn*_LMLHutc-H+zN}&i5BzPK4gF8J9eBMr6WuC;v=dPCCvE{5Z*(<0t<} zPONTC#y!NsF;#k);w~+@;)Xe)mB$I?c*)JlD_|Slv~q_z5P#eqffKCc2x9p&f;3z= zl4IH%W+WdN6

XuiARkfAhqQ;6aWNvyuC)aAS!);apf|9XB@6{-~V5biPnJSDd-Z zVjh{YnFf{ur8NhZ?3B`f5ScF{DT0u#aiUk=CCh**!KMW6xB`Po84iu)UpvDar|T}m znBRB|`pK|5U4J_!lqWp7;uBY`MLLpfW`uUViV9(@OBJh;Z%7seQ1I-g=(*I+KSDuruvK2?Z!TLkrj*ZQe9;>*c{a1zn_ z_d5Axw6#dS&)Ud*D{>yi!bewsTqSEJIY!+Xowu2fpMT`C%}eeO{vJY>S)N=mhd8;p zySPa1K?ZEi`&M}qCl8|LNi7uiW8sf(TBngc){)@9VPCaLz&{K^tPI#nJn`g#sQXG7 z$TFDvy>^bPLcHk9fPjxw;%-4z>!2KeXc{09(cDM(VES$_1l|91%5x>P9v` zWUZ8gZ+|Ebxgf1t1F4az!T8n@l@rpF>^Da2sZPb1Trm+NGhi(iF2?@9)ve00-)L8H zoS`7Cd0)Y#0qDSwXImd#+{`Mlv+QO{o0>^Wu5`dikj)2OYf#WQ(6%oH1hEw=M_Mi< z#Hh0~CQPGEZyjw!*}LR8>@8S)uf_u+1IZm0XMaca?DTQM`gyW6glCb&gONmQ`O63NVyT~#~z=zi7K<*1_oo=xwMM5tc-VkS*XoQR0;e#!$ z*>env%R7oc57}QjW+}v!*;Tj{nwOobzS%_;9NH8)^hqO+5Cu>9a|6CJMX2=17og>1 z-}J0?h+#0NDPo|g<-IdIuexnw_rr_!X4h7~3>|I)wTYM7bjeER znD%fa+Z~}M98TDFvpWTa(@96QPw3H zd`@^iCp@1Mp3f1_55JQWM22%B!#R;*L1b7E85Tr_1(9JvWLOXx7DVq0!hiFE@Vp>A zFA2{}!t;{wyd*p?3C~Ny^OEqqBs?z(&r8HJH07rRGAEET0(nOuKNHA%0{K87=LE7K z5W@4E@H{6x&k4_S!t;!aOP;ehd2`WC9+Q9wHHE8mK# zT==SyUPD22qJVRBU_6|n1AoP}%6vIeE?reHQNV-_v&9hX(%#!;~%jmZ31ZPA^(lK^6Ox=WtJ53+Fi54E5w-p4qf8xcF}8 zC{3r2Q#_o%cWcH?Y%y9G^LqVB&}=`~#SB=Q!1u8^O_Sz6Og&;i-6(=yyMM)cM* zf46h;1MC%VqR3yQXut#T*W-p-=YL&t!>mnyDlu-|zg~?5 z_)|2#NA3lHKWU_6e!F%4CnvzV%nd|$Sf0bMD02S&8CcLaiIQ17rSFExS;d)j6%5>& zuKOW3>Wq8*bkz0VPDMSsK{rsXNQm0zedx|&17rhPm>-!s=RbZ|yb&h%e;)_Gxac!M zLK_pnWQG+E9DnYSCpX|bXdCmfC}7S`F6uC_=_1KlJuDIqhkgm7CV17fCk{WPP&~N} z>wCP(>bI9kq|)70{`mT%4 z5ygbkp8O5Rs^2*$*B9#qgnLxlk%Crh#Y|Qm+vS5J7=K6JcJPwjhsHCeJPHv?mw0#d zbqOz5oEH)Y>#n?wd=K(9dKBBQx^{k{v?`dX@r{r^Xxmn54ShiqUMlIEo#o~UBG2QY zv7q>YX8OHzb$FOgA z^VIo1+keMy|6mAhnRJ3A&26P2lQJ33g zeQPX-7Lb}$CZN{@-9TZ8unpEIxV7-DKLxjM{KWv-*$aQb!3hq$_9^|?S;>WeLF`Xz z8+wPKwBe`BV$XiXhwHQLy2 zSr-9^(>Buv2TM3l5aVL>{)gA1uX@ubIQW6`+oYL^w+OM7I24f#H;MUS5_&;TBcwSD zO@-88qM7gakL{=jT&$$Ot0!6RhWYXthFWm^*u u+$lo1x?{TDA;%*YbQ!o6v`%M_!)ruWZ?ue$f8XfKUjGZA$V*s_CIA2g4#rLZ diff --git a/altair/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet index 3eaa28ca39d5ab0230c23b9bb1799d78ffd64eb4..633815d1ff27144689ab23aacff171a20598293e 100644 GIT binary patch delta 1870 zcmaJ=eN0BvKJMx5QLuO#T-r-US5m3#;2TPQ?@5Aj)C#6v0UHXx%sAm|HB`DpS=9X&kIMttAsbs&EB-`^xXZGuVQaB zcbpqvK6|2N4>2fDBZ_A~(49Ka-f^C+e&d_l&5`-%?o59lERdZ~O``~9Awen?8Li>| z=Dtu*Pj_#B>oF>w9x0|LNP0LnL%vUfQ6Z4=IIj1?qw96b2$?)RYtf%E{mNr>gLzqX zD_57=J^at=7gyHaT3_D2*1xu@xH99*I(g*y`q`6*s}7H=QsjgoK2nwrUs z<3T;O1mbwKnYx)B#}i@d+RiwB@H(KT$@t~tF?2~N&Bd}|3b0*&Nk%E?m;VNoo=|_1 zqe}Gif1{SEFDP(*@g1Ol5#uLOd_JKbDpLv1X5?)f-B&40?oiq2!Bbd$0kLL6y;Q!b zo|x8F!_rIT&xiu!>!SP};U2RO%nCM@XE%LGWUe7*3J~i1uL6}&nV*xGZWjHe$PfJs zs4CQ$FlEdG^TL9W&R#@B4n2bi9f~`R;wB4~;tMFV{}Q@e%1`T;C}+#y69`r6Ug=)x{spkHTgDAm>m zpsudO9j*=dLROQz{8)eu)fr0%xQNJyty0_|vHO#lllX8FpC~7F;*$Z=1a-*(UXqMQ z#U+ZsI2d*)A|}JrL5bqi;cL-UHl||< zS#cv9d<5C?^bl`HJR8}-&XCop4S8JAc1=l%&nhbcc(KfOh`^aeAO+HwT1$a?F3_#9U z!znCO7ZowsRKsaKot~an1V!hF=qGhHBIjJjfrKh>Fy3)=+Mv;nbK%_R)NnnmolcA% znKMoNf7Ni^ZoBX=-%Bc)Cm(iQ>HXQH+1N`;vRk zx#v6I`R=*zr#Il0*Wu&_rgyS{orjy)Asx%mPh$dA5ONTZRrP^>Wq%OVt^;1X3eD7- z)VF1$CnhT0?LWGH>#F!{&X5PpI1bG zNZh`o2lC<|1CFXn>nn52=S#DSsmRqU_;N|Ju1>UpbSu2r(BRz{d)^-|b55;1P+I@) zvws*SjI$$kXJ^fi@yY&y*EDG=kWq8Wj&MBE)Y2J?MN`qv4z>^t=}=z*%0Yf8irt5U>{TC&m5NvX1+VAHDK2<-bl!9DrXmOZ@j4kmFNtnVD1bhcGP5M}DFpo8qfo=j)ir=osX2Knnu)9qec6P@5I>is zoQkhb`Q`M-7SG!rt(YHx6ff6xb$4-$E}TqurU5`d6J#;^ znjou5LlN3F(u=8;ef>1COcGlGKP3wG(tqF2g3d_EbA2*)Oa@SY?h*1Y@CvlJ1lQ9i ze6Jhc2c*IoGTpFz1{Q(yXj_=tcO}sjn@E|69aB~U6vb}G&3-_q)^WLksP qbn|AHAruhb6a=+2y^zykXbkR&k-Sujac!+=+*PSCD*&j+UgJOY5TF16 diff --git a/tools/datasets/_metadata/tags.parquet b/tools/datasets/_metadata/tags.parquet 
index f8ed6f54e46e03902d48eed24ad595faeeae94fc..189dbbcae0b49d624a63d54b76e6ca9ce9425e3c 100644 GIT binary patch delta 255 zcmbPaIMHy!c|IQrQ7s+O2vG)6Hc>ViGX@PtMhQj+Akf&V|5t%0k%55;$cbQN2n7j> zGDxa2ut`erWfsR56s5)&rRJn27N?5xh((C`i6v~7<#%LaOqd)mc!sURMN~rU(qu!S z8ny)>X4mA+Lf_eTfS4yHZxFU;yWk=!CiZUfS7AH0XCTovlPyK;*#3c-uO`=uxcEwAQoH;}+ZL>7LBNJoVkHMeEdeq6CT|q_&b9}{JTrNn zusz!q7f~^>Pm@0j+p)a@iEfx|E@H>lViGX@PtMhQj+Akf&V|5t&pkb!{-$f;pu2z|f~ z5*B5URApe3l;F!OjxQ)mjW0^gNlh$H73C4D5epJane5Nw^XW0RRYpV5?EY>9}dnAoSuW^8tByFk(#CRYKOmq5&SllQT?_`L!#*9eQ+F{nKf wkil=4+7Eskgp Date: Sat, 21 Dec 2024 16:37:08 +0000 Subject: [PATCH 140/201] chore: update schemas Changes from https://github.com/vega/vega-datasets/pull/648 Currently pinned on `main` until `v3.0.0` introduces `datapackage.json` https://github.com/vega/vega-datasets/tree/main --- .../_metadata/datapackage_schemas.json.gz | Bin 2473 -> 2483 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/altair/datasets/_metadata/datapackage_schemas.json.gz b/altair/datasets/_metadata/datapackage_schemas.json.gz index 34433838d4623a280939f19d83e067930800a66f..2655d6baca5f11208703bd123308b16b171b0f53 100644 GIT binary patch delta 2472 zcmV;Z30L;16SEVLC4W(K+_(+?D;iH9t~1>kuXBla`)03msb|h-*LE+xCk)%))}G7lfNY= ziq(8d$%NZaDb_KE7wJ|xgVssrglqG#RJHY%n?Tj7#N=0?Gk;|>4gU*m5p z4RhK|ci$RO3V)S6&`Y|jnB`t+5fn_BpM}h&D0uQ9zVbO%!!vR%kD}(?Nsx0l+PdV{ zRUErGFz$H+p%~*lYR6>q7sLYHJWsAK9uhd*lhPrA4-3LIwRy>17e2Z*8BaFgf>N7( zve8|*jpcPqvSA4g{C9Xv#O8Bi^Et8k9NGN%PqKoX;hdb|oSb1r&afh9SdlZV$Qf4T3@dVm74iFu*t{Y( zuZYcSVt@0R*t{k-uZhiTV)L5Vye2lUiOp+b^BUQFc6xeBC`&>)Bb0Z9@-v~lCzKC_ za!x2KLLoLUiOox5^OD%SBsMRJ%}ZkQlGwZ?HZO_IOJeicDK4lXm2}0l`}+@Chzo6a z#%zrB%w&tZS?w}Mc+z82S3Pn8q#3aBXTnzVMYtD^D^ebT-$ueGZ<))_nLmSH* zy(1O(Gxr5Q6)KfmOX~xd9!Lm{PC9sg03aQ*z~|Fvi%hEwj5)~oc7n0X=JsqVo8@0x z>wlo4PdTp#EGc@hl>#UnPWq?7t>6#Ks^vJWh;>=Y9HswY< zcai)Jz;DjK>*(5tEuxjoqFXE^GklTrhJQm0t8BPV8!!{v?U?DI#n`YQp=(oev;kq6 zZnY0_yI7t+O>W|N6aeFFPu=dAA_i$3>0!Ht1ao*M5!H~dd);r>2!_nGpp`t`HQVy@(5;>KwdFnW;r)0Kc`+o>I ztcGsgkbLFW6UJ*u5KSGb*`nf1x@sX5B=+ukM~4Ro{#bsucbbzM-L&%Hn$cpzOX~51 z9o?LOX1Ay@ne@d9F9pqrGFv!er6!moT-9!aEkh>w&gGU;!`&_%uba6e^t`d2k+*3d zNF3!i;FR0oBRSuu<1%1$+aQk%y?LDtRWnnam;x2{8bOcSte)yud zT}6RE(0Gc!e7A-X*ZLE3lOeZBwlG5XNVO2gx|F(4vM9M9>x{j_`@kFfo@FH2K$TAM zvt@a1xV6|rkUqBBR9fd#j@vib%E;I)KBIf>R_9u6JgCzmw=;p0y&!nBCx2q}Q`@8= zEIJ{F2$?$Gl84%FZMbgaT;Ng{SA3Xkjbo+OikYmMI|d|jgM$IC3Q>I4c%hP?2W0ao zoJ4~E^^klr+G4QuSsR(}0BS|Ku<7detK@-Elyzrx-e&$aB$tP2%!q#v&&-ebu1E#D zxx2VX?%}R+%W~f;-^V0jihoxNMSBfcx^2mn@geeHZ`c=Y5@>=9Vr&e=w0P#pBMHkZ z0bL%Eey^S5DtXj3bDG}y!oMRXU#_!s3f_0TPH6+ioJlt4|Cd4Rs^_JTq zUy*cZdKVwJGlc&7Z1vJ=y^$Pu~JbgCUH7<=zk=jtZaqq;xszM zy|$7mltV5#drE$Vt*%Obs_VZ_H3#k$bj~s6FH$t{s`2jeBva@6aVt|2W29e{6EODqlaoi3z8f4m|fk(|a zJvJI3fyf{tp@Z(;8Gl&SZQa0q2p=x#HS|wT6tIrszvC)8Ubt47SEXFKsQ?YaRdC}6 z7p_H@e+TJrpGIcn9pj=X0J=n!he3UM&oeF>XQxMfG)!4QJvJ?juG5RwRRCYYokl?VNK3x94{a(2fv;KGupaE~c1clW&Ta~B^c$HQAvEf0(0 zdR}pN5k=b+gnwB6LJ&@mVrFEg@HmCyKD4XWfOnZ1<|XthC^VnzfDTeosTr%7SY)A2 z1BzMfJO=f>8dQ0?Uvy>D&UWUFho-GT@58qt9=sFAPN*DZLZmoaGeJux8sQ?7O0O!d z*-MOy=eJ0^ryP`y)3|NkpPnxIuLS?Q3eNH`5$dys+J8j?CYUrcqiEntF^acNXHR_` z?dz>ZK;dmd+Wh$7zoBy1H4u1%0>4Wga@B%m9^yi`pSSin)U}b%`>{zQ%K>I>B~QaCFxG+w^H12$ zu$-sP7kob$>yjkY2k8W<1-nW^s|4**8g@-D2*5BCGR8r|{pokia7cNUbQEnE-UtjZ zgz}*g%e;k9Ol1O!9{2|8PsD7vjRMjp?DeZa`+wFe2Fh?>cmszZINI8c!sgaWE__1n zR%+|B<}iK2A0p;Nl)l8O;nU0X>U;pFS;bUIaumFPP_q^p3tGDeFLB16rj3~&6-_^C zyiQWJml;FEKe%e_;9lGuR2Z0JDF7evrxUO1IdsJb=e==s6Xv`0(~{XRe2LcooCfLr mDj^#G6Oc|R{3nh4Amr`8>H4pJIySlI*Z%-?tB3`rCIA559?s?f delta 2462 
zcmV;P31RlL6R8uBC4XOY+qe<`DjZKAt~1q)N^Z?>FEJ##+0wtMM4aUc>T5rY5+ z04*ySfA=mxQX+T3i~G>#Db51ee^~7PcByYCOqfO+XHWi~d^-`$G5F)eIwMqh^0(wf zv6@dQnQ;3l#X9EjCfzD$&_2nWaBUv8s4uL*!84a!)ZSl-E|Q$ptVCRXk)o_za@Bh|_y3NW!yi&8p4>Lv zFsIG*^sN!4P=Cn-y`;N}S?+}vLBMqJvyizI1y3HtS3bAY@Qhr`qo{dj33Bd6TbJCr ziendtg?rvWD8^PEwPP~*3t9r%JWsAK9uhF_N$C*5hYjJH+Pvhh3m@H@j3*m#L8;9? z+2}6Z#{9Z1*|3EMe!s)E+hJ3j#b`r>%SFjmwofvuL4UtOk6gBfvo?05Ux`S`!V8Rw z>6hrCM_^`K(0X3STeV2Ef$J?62ltLREY}~-2<082{7fkC3FQN!oD<55P}Z1oeo81y zLLoYz6P?eA&gVqubENa*KgkYa!#T0xoY=4;HmryZD`LZn*svlttcVRO;`bHNc|~+y z5uMjW=YKWPc};X)6P?#Y=QYuJO>|xpo!3O?HPZR)^z@WamV|OfDDMd6XF_>TC?5#r zoKRMTLUdjdotH%CCDD0FbY2pjmqh0!(RoR9UJ{*`MCY?pTu?(Q>56Ih_aC$n7uxWQ z+1S!ElP!i1+txUt0?SL5IlvDH+1zm@G9hKyG=BmH563W~xmEU-b`pK|5h3YDl zXFL({iL2HkTQg=lLn&Ig5t7>!ig@~oNq|MT816yWoEwYiSHd=uWxVFfO;!7cHkLPf zM=I`T?hAe@R4TWY)(0*4# zLE9YJy@LZe_PCBjeF%M2j)3vV^SI*7RRkX8Mm%>B`wc*D&cEx> z+J`5imCT~sDkL*}k@JQ_tg39dHXE=JI)Ck$=^?~evB05gM{;xkVU}*U4`sVpo<2=H zel!I@H``P9I;Mz18b@?EZXuBzo`{llKx_^#73k z{)buJH{U`0+c^x0hddmu-4-fGH1p)}+AZNz>6)jGvwBKqJGPII!*1xx4Yoe)Cw~jC zAwe{Cs9KAPGwG^@P>{g8=N%mq9M;GDyR*~W+vuj12iJ@?8(vb6AN1(v0W`ZsmC2+p zN_ZSsnrt$&B8JeGyg$ceiY8qg608T;Xj-gXrQ{y<|W{_@=# zMqKMHpJY*TKfW1zhxdUu_C3o;vVkg{;%Ce9+;D5Lhai1y zwW+kurz>vXU@ar#wfHRVwcDL*wejFhi`>oxPWFOe(4Mf-Pi>Qiu;`c^B7bD+cuO8? zf3)Gck#m7dU0m^Df;EnnS}SI9aO6-vQK$uEL?K->;GfMp4$C(RrKs*N|KurY@uPdw6Dkly^lc*v;L=MRE^!ja!rZ zR{4G<3Dde-DB4rN)@@6siGL4~2YbW5Xp=y*W6;9JKun8go;;GUyb{pO5$^ZeIj)jN zT{EY-9cez&?fT~GaAfd>O1O+nvZ1AKro zY*^pWzo3H{qsg|`mh^F!xz@x01I-K~#}!%PP<`UK?Q1mfv_}Pxl5u)$G(ZB8K}15Q z+`TiWm=F1=PB^ye-7xj}IjS1Iz1=(yOGgv^mo zG{kS6@^?d!&)=2W)C!PTpmOO=fOli5h=Y^lI_&7NlZ%MU0Y>LZy#oa{|ZBr0p z`3pffJ&Kv(oqxjP6pH)Mu37`$Wonq0(5s-(e69mJa7CqNEMlUOg*pu=X0h`a)c0yo z<>h|Sl}$U_nKvGqwg$Zq--dYbP8N1TrfnHfa`SAU98ymdNz>f>l%Z#4o6Z)4Kt z#|QrnmAkHizy}ogUFwjl7A*4+7rOntwa4MDjeN$BO&TFDRB!0qC>SGAiWPeks_PC3 zk38Op8C$zRG7dHjfupo4n5pp_HsYDqj}jU?n>GRVQveKJe1`k->yY8-de1Zc-o83~ zvPdDrr+-qHCEWn%1n#NpJn&hj+Ja^9$my{hVAfXhG+2SL7BrZDvg{1YdFp(@_k*!6 zNkV;)PLNu#t2DGq&@QFn)bxS?3^O5PD~P$be#Z=llxImt(T3rTzyL!i9~!aDTL{He zCZOnnZ=n7})P~zAAZ@}~zYC;qyBvpH!F+}`> ztHuuQ#mzy5VR6g_-~;}2;$=ODuK3`*H;!(?e3yPgG8=|3(fXgwAh};f Date: Sun, 22 Dec 2024 12:40:22 +0000 Subject: [PATCH 141/201] feat(typing): Update `frictionless` model hierarchy - Adds some incomplete types for fields (`sources`, `licenses`) - Misc changes from https://github.com/vega/vega-datasets/pull/651, https://github.com/vega/vega-datasets/pull/643 --- tools/datasets/models.py | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/tools/datasets/models.py b/tools/datasets/models.py index a454ed30c..f8414f739 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -209,6 +209,7 @@ class FlField(TypedDict): name: str type: FlFieldStr + description: NotRequired[str] class FlSchema(TypedDict): @@ -217,12 +218,29 @@ class FlSchema(TypedDict): fields: Sequence[FlField] +class FlSource(TypedDict, total=False): + title: str + path: Required[str] + email: str + version: str + + +class FlLicense(TypedDict): + name: str + path: str + title: NotRequired[str] + + class FlResource(TypedDict): """https://datapackage.org/standard/data-resource/#properties.""" name: Dataset type: Literal["table", "file", r"json"] + description: NotRequired[str] + licenses: NotRequired[Sequence[FlLicense]] + sources: NotRequired[Sequence[FlSource]] path: str + scheme: Literal["file"] format: Literal[ "arrow", "csv", "geojson", r"json", "parquet", "png", "topojson", "tsv" ] @@ -236,10 +254,20 @@ class 
FlResource(TypedDict): "text/geojson", "text/topojson", ] - schema: NotRequired[FlSchema] - scheme: Literal["file"] - dialect: NotRequired[FlCsvDialect | FlJsonDialect] encoding: NotRequired[Literal["utf-8"]] + bytes: int + dialect: NotRequired[FlCsvDialect | FlJsonDialect] + schema: NotRequired[FlSchema] + + +class Contributor(TypedDict, total=False): + title: str + givenName: str + familyName: str + path: str + email: str + roles: Sequence[str] + organization: str class FlPackage(TypedDict): @@ -254,9 +282,9 @@ class FlPackage(TypedDict): version: str homepage: str description: str - licenses: Sequence[Map] - contributors: Sequence[Map] - sources: Sequence[Map] + licenses: Sequence[FlLicense] + contributors: Sequence[Contributor] + sources: Sequence[FlSource] created: str resources: Sequence[FlResource] From fdffed0a15be3967c6b9513787fd40feb59c9cdc Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 12 Jan 2025 12:08:31 +0000 Subject: [PATCH 142/201] chore: Freeze all metadata Mainly for `datapackage.json`, which is now temporarily stored un-transformed Using version (https://github.com/vega/vega-datasets/commit/7c2e67f6e7ba69b00e7cb1473503518942385d11) --- tools/datasets/__init__.py | 50 +++++++++++++++-------- tools/datasets/_metadata/datapackage.json | 1 + tools/datasets/npm.py | 15 +++++-- tools/generate_schema_wrapper.py | 2 +- 4 files changed, 45 insertions(+), 23 deletions(-) create mode 100644 tools/datasets/_metadata/datapackage.json diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 395119dd7..c30c43867 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -130,7 +130,9 @@ def github(self) -> GitHub: def npm(self) -> Npm: return self._npm - def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: + def refresh( + self, *, include_typing: bool = False, frozen: bool = False + ) -> pl.DataFrame: """ Update and sync all dataset metadata files. @@ -138,26 +140,38 @@ def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: ---------- include_typing Regenerate ``altair.datasets._typing``. - """ - print("Syncing datasets ...") - npm_tags = self.npm.tags() - self.write_parquet(npm_tags, self.paths["npm_tags"]) - - gh_tags = self.github.refresh_tags(npm_tags) - self.write_parquet(gh_tags, self.paths["gh_tags"]) + frozen + Don't perform any requests or attempt to check for new versions. - gh_trees = self.github.refresh_trees(gh_tags) - self.write_parquet(gh_trees, self.paths["gh_trees"]) + .. note:: + **Temporary** measure to work from ``main`` until `vega-datasets@3`_. - npm_urls_min = ( - gh_trees.lazy() - .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") - .filter(col("size") == col("size").min().over("dataset_name")) - .select("dataset_name", "url_npm") - ) - self.write_csv_gzip(npm_urls_min, self.paths["url"]) + .. 
_vega-datasets@3: + https://github.com/vega/vega-datasets/issues/654 + """ + if not frozen: + print("Syncing datasets ...") + npm_tags = self.npm.tags() + self.write_parquet(npm_tags, self.paths["npm_tags"]) + + gh_tags = self.github.refresh_tags(npm_tags) + self.write_parquet(gh_tags, self.paths["gh_tags"]) + + gh_trees = self.github.refresh_trees(gh_tags) + self.write_parquet(gh_trees, self.paths["gh_trees"]) + + npm_urls_min = ( + gh_trees.lazy() + .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") + .filter(col("size") == col("size").min().over("dataset_name")) + .select("dataset_name", "url_npm") + ) + self.write_csv_gzip(npm_urls_min, self.paths["url"]) + else: + print("Reusing frozen metadata ...") + gh_trees = pl.read_parquet(self.paths["gh_trees"]) - package = self.npm.datapackage() + package = self.npm.datapackage(frozen=frozen) # TODO: Re-enable after deciding on how best to utilize # self.write_parquet(package["features"], self.paths["dpkg_features"]) self.write_json_gzip(package["schemas"], self.paths["dpkg_schemas"]) diff --git a/tools/datasets/_metadata/datapackage.json b/tools/datasets/_metadata/datapackage.json new file mode 100644 index 000000000..dbb2e51dc --- /dev/null +++ b/tools/datasets/_metadata/datapackage.json @@ -0,0 +1 @@ +{"name": "vega-datasets", "description": "Common repository for example datasets used by Vega related projects. \nBSD-3-Clause license applies only to package code and infrastructure. Users should verify their use of datasets \ncomplies with the license terms of the original sources. Dataset license information, where included, \nis a reference starting point only and is provided without any warranty of accuracy or completeness.\n", "homepage": "http://github.com/vega/vega-datasets.git", "licenses": [{"name": "BSD-3-Clause", "path": "https://opensource.org/license/bsd-3-clause", "title": "The 3-Clause BSD License"}], "contributors": [{"title": "UW Interactive Data Lab", "path": "http://idl.cs.washington.edu"}, {"title": "vega-datasets contributors", "path": "https://github.com/vega/vega-datasets/graphs/contributors"}], "version": "2.11.0", "created": "2024-12-31T18:32:26.970186+00:00", "resources": [{"name": "7zip.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "7zip.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "bytes": 3969}, {"name": "airports.csv", "type": "table", "path": "airports.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 210365, "schema": {"fields": [{"name": "iata", "type": "string"}, {"name": "name", "type": "string"}, {"name": "city", "type": "string"}, {"name": "state", "type": "string"}, {"name": "country", "type": "string"}, {"name": "latitude", "type": "number"}, {"name": "longitude", "type": "number"}]}}, {"name": "annual-precip.json", "type": "json", "description": "A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell.", "sources": [{"title": "Climate Forecast System Version 2", "path": "https://www.ncdc.noaa.gov/data-access/model-data/model-datasets/climate-forecast-system-version2-cfsv2"}], "path": "annual-precip.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 266265}, {"name": "anscombe.json", "type": "table", "description": "Graphs in Statistical Analysis, F. J. 
Anscombe, The American Statistician.", "path": "anscombe.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1703, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Series", "type": "string"}, {"name": "X", "type": "integer"}, {"name": "Y", "type": "number"}]}}, {"name": "barley.json", "type": "table", "description": "The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites.\n\nIt was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper \"Statistical Determination of Barley Varietal Adaption\".\n\nR.A. Fisher's popularized its use in the field of statistics when he included it in his book \"The Design of Experiments\".\n\nSince then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s.\n", "sources": [{"title": "The Design of Experiments Reference", "path": "https://en.wikipedia.org/wiki/The_Design_of_Experiments"}, {"title": "Trellis Charts Paper", "path": "http://ml.stat.purdue.edu/stat695t/writings/TrellisDesignControl.pdf"}], "path": "barley.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 8487, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "yield", "type": "number"}, {"name": "variety", "type": "string"}, {"name": "year", "type": "integer"}, {"name": "site", "type": "string"}]}}, {"name": "birdstrikes.csv", "type": "table", "description": "Records of reported wildlife strikes received by the U.S. FAA", "sources": [{"title": "FAA Wildlife Strike Database", "path": "http://wildlife.faa.gov"}], "path": "birdstrikes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 1223329, "schema": {"fields": [{"name": "Airport Name", "type": "string"}, {"name": "Aircraft Make Model", "type": "string"}, {"name": "Effect Amount of damage", "type": "string"}, {"name": "Flight Date", "type": "date"}, {"name": "Aircraft Airline Operator", "type": "string"}, {"name": "Origin State", "type": "string"}, {"name": "Phase of flight", "type": "string"}, {"name": "Wildlife Size", "type": "string"}, {"name": "Wildlife Species", "type": "string"}, {"name": "Time of day", "type": "string"}, {"name": "Cost Other", "type": "integer"}, {"name": "Cost Repair", "type": "integer"}, {"name": "Cost Total $", "type": "integer"}, {"name": "Speed IAS in knots", "type": "integer"}]}}, {"name": "budget.json", "type": "table", "description": "Historical and forecasted federal revenue/receipts produced in 2016 by the U.S. 
Office of Management and Budget.", "sources": [{"title": "Office of Management and Budget - Budget FY 2016 - Receipts", "path": "https://www.govinfo.gov/app/details/BUDGET-2016-DB/BUDGET-2016-DB-3"}], "path": "budget.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 391353, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Source Category Code", "type": "integer"}, {"name": "Source category name", "type": "string"}, {"name": "Source subcategory", "type": "integer"}, {"name": "Source subcategory name", "type": "string"}, {"name": "Agency code", "type": "integer"}, {"name": "Agency name", "type": "string"}, {"name": "Bureau code", "type": "integer"}, {"name": "Bureau name", "type": "string"}, {"name": "Account code", "type": "integer"}, {"name": "Account name", "type": "string"}, {"name": "Treasury Agency code", "type": "integer"}, {"name": "On- or off-budget", "type": "string"}, {"name": "1962", "type": "string"}, {"name": "1963", "type": "string"}, {"name": "1964", "type": "string"}, {"name": "1965", "type": "string"}, {"name": "1966", "type": "string"}, {"name": "1967", "type": "string"}, {"name": "1968", "type": "string"}, {"name": "1969", "type": "string"}, {"name": "1970", "type": "string"}, {"name": "1971", "type": "string"}, {"name": "1972", "type": "string"}, {"name": "1973", "type": "string"}, {"name": "1974", "type": "string"}, {"name": "1975", "type": "string"}, {"name": "1976", "type": "string"}, {"name": "TQ", "type": "string"}, {"name": "1977", "type": "string"}, {"name": "1978", "type": "string"}, {"name": "1979", "type": "string"}, {"name": "1980", "type": "string"}, {"name": "1981", "type": "string"}, {"name": "1982", "type": "string"}, {"name": "1983", "type": "string"}, {"name": "1984", "type": "string"}, {"name": "1985", "type": "string"}, {"name": "1986", "type": "string"}, {"name": "1987", "type": "string"}, {"name": "1988", "type": "string"}, {"name": "1989", "type": "string"}, {"name": "1990", "type": "string"}, {"name": "1991", "type": "string"}, {"name": "1992", "type": "string"}, {"name": "1993", "type": "string"}, {"name": "1994", "type": "string"}, {"name": "1995", "type": "string"}, {"name": "1996", "type": "string"}, {"name": "1997", "type": "string"}, {"name": "1998", "type": "string"}, {"name": "1999", "type": "string"}, {"name": "2000", "type": "string"}, {"name": "2001", "type": "string"}, {"name": "2002", "type": "string"}, {"name": "2003", "type": "string"}, {"name": "2004", "type": "string"}, {"name": "2005", "type": "string"}, {"name": "2006", "type": "string"}, {"name": "2007", "type": "string"}, {"name": "2008", "type": "string"}, {"name": "2009", "type": "string"}, {"name": "2010", "type": "string"}, {"name": "2011", "type": "string"}, {"name": "2012", "type": "string"}, {"name": "2013", "type": "string"}, {"name": "2014", "type": "string"}, {"name": "2015", "type": "string"}, {"name": "2016", "type": "string"}, {"name": "2017", "type": "string"}, {"name": "2018", "type": "string"}, {"name": "2019", "type": "string"}, {"name": "2020", "type": "string"}]}}, {"name": "budgets.json", "type": "table", "path": "budgets.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 18079, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "budgetYear", "type": "integer"}, {"name": "forecastYear", "type": "integer"}, {"name": "value", "type": "number"}]}}, {"name": "burtin.json", "type": "table", "description": "The burtin.json dataset 
is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine.\n\nThe dataset compares the performance of three antibiotics against 16 different bacteria.\n\nNumerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness.\n\nThe dataset was featured as an example in the Protovis project, a precursor to D3.js.\n\nAs noted in the Protovis example, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 \u03bcg/ml, and an exaggeration of some values for penicillin\".\n\nThe vega-datasets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together.\n\nThe caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) \nreads as follows:\n\n> ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin\n>\n>\n> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in red and gram- in blue) with their sensitivities to penicillin, and streptomycin.\n>\n> The effectiveness of the antibiotics is expressed as the highest dilution in \u03bc/ml. which inhibits the test organism.\n>\n> High dilutions are toward the periphery; consequently the length of the colored bar is proportional to the effectiveness.\n>\n> It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. vulgaris, S. schottmuelleri and M. tuberculosis.\n>\n> Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to neomycin, although the majority of these are sensitive to neomycin.\n>\n> It also inhibits actinomycetes, but is inactive against viruses and fungi. 
Its mode of action is not understood.\n", "sources": [{"title": "Scope Magazine", "path": "https://graphicdesignarchives.org/projects/scope-magazine-vol-iii-5/"}, {"title": "Protovis Antibiotics Example", "path": "https://mbostock.github.io/protovis/ex/antibiotics-burtin.html"}], "path": "burtin.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2743, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Bacteria", "type": "string"}, {"name": "Penicillin", "type": "number"}, {"name": "Streptomycin", "type": "number"}, {"name": "Neomycin", "type": "number"}, {"name": "Gram_Staining", "type": "string"}, {"name": "Genus", "type": "string"}]}}, {"name": "cars.json", "type": "table", "description": "Collection of car specifications and performance metrics from various automobile manufacturers.", "sources": [{"title": "StatLib Datasets Archive", "path": "http://lib.stat.cmu.edu/datasets/"}], "path": "cars.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 100492, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Name", "type": "string"}, {"name": "Miles_per_Gallon", "type": "integer"}, {"name": "Cylinders", "type": "integer"}, {"name": "Displacement", "type": "number"}, {"name": "Horsepower", "type": "integer"}, {"name": "Weight_in_lbs", "type": "integer"}, {"name": "Acceleration", "type": "number"}, {"name": "Year", "type": "date"}, {"name": "Origin", "type": "string"}]}}, {"name": "co2-concentration.csv", "type": "table", "description": "Scripps CO2 program data, but modified to only include date, CO2, seasonally adjusted CO2. \nOnly includes rows with valid data.", "sources": [{"title": "Scripps CO2 Program", "path": "https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record"}], "path": "co2-concentration.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 18547, "schema": {"fields": [{"name": "Date", "type": "date"}, {"name": "CO2", "type": "number"}, {"name": "adjusted CO2", "type": "number"}]}}, {"name": "countries.json", "type": "table", "description": "This dataset combines key demographic indicators (life expectancy at birth and\nfertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year\nintervals. It includes both current values and adjacent time period values (previous and next)\nfor each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) \nnotes that its philosophy is to fill data gaps with estimates and use current\ngeographic boundaries for historical data. 
Gapminder states that it aims to \"show people the\nbig picture\" rather than support detailed numeric analysis.", "licenses": [{"title": "Creative Commons Attribution 4.0 International", "path": "https://www.gapminder.org/free-material/"}], "sources": [{"title": "Gapminder Foundation - Life Expectancy", "path": "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Fertility", "path": "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676", "version": "14"}], "path": "countries.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 99457, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "_comment", "type": "string"}, {"name": "year", "type": "integer", "description": "Years from 1955 to 2000 at 5-year intervals"}, {"name": "fertility", "type": "number", "description": "Fertility rate (average number of children per woman) for the given year"}, {"name": "life_expect", "type": "number", "description": "Life expectancy in years for the given year"}, {"name": "n_fertility", "type": "number", "description": "Fertility rate for the next 5-year interval"}, {"name": "n_life_expect", "type": "number", "description": "Life expectancy for the next 5-year interval"}, {"name": "country", "type": "string", "description": "Name of the country"}]}}, {"name": "crimea.json", "type": "table", "description": "This dataset, which informed Florence Nightingale's groundbreaking work in public health, details \nmonthly mortality rates from British military hospitals during the Crimean War (1854-1856). \n\nNightingale credits Dr. William Farr for compiling the data from the 1858 [Medical and Surgical \nHistory of the British Army](http://resource.nlm.nih.gov/62510370R). The dataset categorizes \ndeaths into \"zymotic\" diseases (preventable infectious diseases), wounds/injuries, and other causes. \nCovering the period from April 1854 to March 1856, the dataset includes monthly army strength \nalongside mortality figures. Nightingale transformed this data into her now-famous [polar area \ndiagrams](https://iiif.lib.harvard.edu/manifests/view/drs:7420433$25i). \n\nThe annual mortality rates plotted in the chart can be calculated from the dataset using the formula \n> (Deaths × 1000 × 12) ÷ Army Size. \n\nAs [The Lancet](https://pmc.ncbi.nlm.nih.gov/articles/PMC7252134/) argued in 2020, Nightingale's \ninnovative visualizations proved that \"far more men died of disease, infection, and exposure \nthan in battle\u2014a fact that shocked the British nation.\" Her work also vividly illustrated \nthe dramatic impact of sanitary reforms, particularly in reducing preventable deaths.", "sources": [{"title": "Nightingale, Florence. A contribution to the sanitary history of the British army during the late war with Russia. London : John W. Parker and Son, 1859. Table II. 
Table showing the Estimated Average Monthly Strength of the Army; and the Deaths and Annual Rate of Mortality per 1,000 in each month, from April 1854, to March 1856 (inclusive), in the Hospitals of the Army in the East.\n", "path": "https://nrs.lib.harvard.edu/urn-3:hms.count:1177146?n=21"}], "path": "crimea.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2183, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date", "description": "First day of each month during the observation period, in ISO 8601 format (YYYY-MM-DD)"}, {"name": "wounds", "type": "integer", "description": "Deaths from \"Wounds and Injuries\" which comprised: Luxatio (dislocation), Sub-Luxatio (partial dislocation), Vulnus Sclopitorum (gunshot wounds), Vulnus Incisum (incised wounds), Contusio (bruising), Fractura (fractures), Ambustio (burns) and Concussio-Cerebri (brain concussion)\n"}, {"name": "other", "type": "integer", "description": "Deaths from All Other Causes"}, {"name": "disease", "type": "integer", "description": "Deaths from Zymotic Diseases (preventable infectious diseases)"}, {"name": "army_size", "type": "integer", "description": "Estimated Average Monthly Strength of the Army"}]}}, {"name": "disasters.csv", "type": "table", "description": "Annual number of deaths from disasters.", "sources": [{"title": "Our World in Data - Natural Catastrophes", "path": "https://ourworldindata.org/natural-catastrophes"}], "path": "disasters.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 18840, "schema": {"fields": [{"name": "Entity", "type": "string"}, {"name": "Year", "type": "integer"}, {"name": "Deaths", "type": "integer"}]}}, {"name": "driving.json", "type": "table", "sources": [{"title": "New York Times", "path": "https://archive.nytimes.com/www.nytimes.com/imagepages/2010/05/02/business/02metrics.html"}], "path": "driving.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 3461, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "side", "type": "string"}, {"name": "year", "type": "integer"}, {"name": "miles", "type": "integer"}, {"name": "gas", "type": "number"}]}}, {"name": "earthquakes.json", "type": "json", "description": "Earthquake data retrieved Feb 6, 2018", "sources": [{"title": "USGS Earthquake Feed", "path": "https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson"}], "path": "earthquakes.json", "scheme": "file", "format": "geojson", "mediatype": "text/geojson", "encoding": "utf-8", "bytes": 1219853}, {"name": "ffox.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "ffox.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "bytes": 17628}, {"name": "flare-dependencies.json", "type": "table", "path": "flare-dependencies.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 34600, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "source", "type": "integer"}, {"name": "target", "type": "integer"}]}}, {"name": "flare.json", "type": "table", "path": "flare.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 20638, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "id", "type": "integer"}, {"name": "name", "type": "string"}]}}, {"name": "flights-10k.json", "type": "table", 
"description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-10k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 892400, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-200k.arrow", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-200k.arrow", "scheme": "file", "format": ".arrow", "mediatype": "application/vnd.apache.arrow.file", "bytes": 1600864, "schema": {"fields": [{"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "time", "type": "number"}]}}, {"name": "flights-200k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-200k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 9863892, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "time", "type": "number"}]}}, {"name": "flights-20k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-20k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1784867, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-2k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-2k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 178495, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-3m.parquet", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. 
Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-3m.parquet", "scheme": "file", "format": "parquet", "mediatype": "application/parquet", "bytes": 13493022, "schema": {"fields": [{"name": "date", "type": "datetime"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-5k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-5k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 446167, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-airport.csv", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-airport.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 65572, "schema": {"fields": [{"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}, {"name": "count", "type": "integer"}]}}, {"name": "football.json", "type": "table", "description": "Football match outcomes across multiple divisions from 2013 to 2017, part of a\nlarger dataset from OpenFootball. The subset was made such that there are records for all five\nchosen divisions over the time period.", "sources": [{"title": "OpenFootball", "path": "https://github.com/openfootball/football.json"}], "path": "football.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1207180, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "division", "type": "string"}, {"name": "home_team", "type": "string"}, {"name": "away_team", "type": "string"}, {"name": "home_score", "type": "integer"}, {"name": "away_score", "type": "integer"}]}}, {"name": "gapminder-health-income.csv", "type": "table", "description": "Per-capita income, life expectancy, population and regional grouping. Dataset does not specify \nthe reference year for the data. Gapminder historical data is subject to revisions.\n\nGapminder (v30, 2023) defines per-capita income as follows:\n>\"This is real GDP per capita (gross domestic product per person adjusted for inflation) \n>converted to international dollars using purchasing power parity rates. An international dollar \n>has the same purchasing power over GDP as the U.S. 
dollar has in the United States.\"\n", "licenses": [{"title": "Creative Commons Attribution 4.0 International", "path": "https://www.gapminder.org/free-material/"}], "sources": [{"title": "Gapminder Foundation", "path": "https://www.gapminder.org"}, {"title": "Gapminder GDP Per Capita Data", "path": "https://docs.google.com/spreadsheets/d/1i5AEui3WZNZqh7MQ4AKkJuCz4rRxGR_pw_9gtbcBOqQ/edit?gid=501532268#gid=501532268"}], "path": "gapminder-health-income.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 8605, "schema": {"fields": [{"name": "country", "type": "string"}, {"name": "income", "type": "integer"}, {"name": "health", "type": "number"}, {"name": "population", "type": "integer"}, {"name": "region", "type": "string"}]}}, {"name": "gapminder.json", "type": "table", "description": "This dataset combines key demographic indicators (life expectancy at birth, \npopulation, and fertility rate measured as babies per woman) for various countries from 1955 \nto 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable \ngrouping countries. Gapminder's data documentation notes that its philosophy is to fill data \ngaps with estimates and use current geographic boundaries for historical data. Gapminder \nstates that it aims to \"show people the big picture\" rather than support detailed numeric \nanalysis.\n\nNotes:\n1. Country Selection: The set of countries in this file matches the version of this dataset \n originally added to this collection in 2015. The specific criteria for country selection \n in that version are not known. Data for Aruba are no longer available in the new version. \n Hong Kong has been revised to Hong Kong, China in the new version.\n\n2. Data Precision: The precision of float values may have changed from the original version. \n These changes reflect the most recent source data used for each indicator.\n\n3. Regional Groupings: The 'cluster' column represents a regional mapping of countries \n corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. To \n preserve continuity with previous versions of this dataset, we have retained the column \n name 'cluster' instead of renaming it to 'six_regions'. 
The six regions represented are: \n `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`.", "sources": [{"title": "Gapminder Foundation - Life Expectancy (Data)", "path": "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Life Expectancy (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd004/"}, {"title": "Gapminder Foundation - Population (Data)", "path": "https://docs.google.com/spreadsheets/d/1c1luQNdpH90tNbMIeU7jD__59wQ0bdIGRFpbMm8ZBTk/edit?gid=176703676#gid=176703676", "version": "7"}, {"title": "Gapminder Foundation - Population (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd003/"}, {"title": "Gapminder Foundation - Fertility (Data)", "path": "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Fertility (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd008/"}, {"title": "Gapminder Foundation - Data Geographies (Data)", "path": "https://docs.google.com/spreadsheets/d/1qHalit8sXC0R8oVXibc2wa2gY7bkwGzOybEMTWp-08o/edit?gid=1597424158#gid=1597424158", "version": "2"}, {"title": "Gapminder Foundation - Data Geographies (Documentation)", "path": "https://www.gapminder.org/data/geo/"}, {"title": "Gapminder Data Documentation", "path": "https://www.gapminder.org/data/documentation/"}], "path": "gapminder.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 75201, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer", "description": "Years from 1955 to 2005 at 5-year intervals"}, {"name": "country", "type": "string", "description": "Name of the country"}, {"name": "cluster", "type": "integer", "description": "A categorical variable (values 0-5) grouping countries by region"}, {"name": "pop", "type": "integer", "description": "Population of the country"}, {"name": "life_expect", "type": "number", "description": "Life expectancy in years"}, {"name": "fertility", "type": "number", "description": "Fertility rate (average number of children per woman)"}]}}, {"name": "gimp.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "gimp.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "bytes": 8211}, {"name": "github.csv", "type": "table", "description": "Generated using `/scripts/github.py`.", "path": "github.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 21059, "schema": {"fields": [{"name": "time", "type": "string"}, {"name": "count", "type": "integer"}]}}, {"name": "global-temp.csv", "type": "table", "description": "Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023.", "sources": [{"title": "NASA Goddard Institute for Space Studies", "path": "https://data.giss.nasa.gov/gistemp/"}], "path": "global-temp.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 1663, "schema": {"fields": [{"name": "year", "type": "integer"}, {"name": "temp", "type": "number"}]}}, {"name": "income.json", "type": "table", "path": "income.json", "scheme": "file", "format": "json", "mediatype": "text/json", 
"encoding": "utf-8", "bytes": 72771, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "region", "type": "string"}, {"name": "id", "type": "integer"}, {"name": "pct", "type": "number"}, {"name": "total", "type": "integer"}, {"name": "group", "type": "string"}]}}, {"name": "iowa-electricity.csv", "type": "table", "description": "The state of Iowa has dramatically increased its production of renewable \nwind power in recent years. This file contains the annual net generation of electricity in \nthe state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. \nIt is useful for illustrating stacked area charts.", "sources": [{"title": "U.S. Energy Information Administration", "path": "https://www.eia.gov/beta/electricity/data/browser/#/topic/0?agg=2,0,1&fuel=vvg&geo=00000g&sec=g&linechart=ELEC.GEN.OTH-IA-99.A~ELEC.GEN.COW-IA-99.A~ELEC.GEN.PEL-IA-99.A~ELEC.GEN.PC-IA-99.A~ELEC.GEN.NG-IA-99.A~~ELEC.GEN.NUC-IA-99.A~ELEC.GEN.HYC-IA-99.A~ELEC.GEN.AOR-IA-99.A~ELEC.GEN.HPS-IA-99.A~&columnchart=ELEC.GEN.ALL-IA-99.A&map=ELEC.GEN.ALL-IA-99.A&freq=A&start=2001&end=2017&ctype=linechart<ype=pin&tab=overview&maptype=0&rse=0&pin="}], "path": "iowa-electricity.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 1531, "schema": {"fields": [{"name": "year", "type": "date"}, {"name": "source", "type": "string"}, {"name": "net_generation", "type": "integer"}]}}, {"name": "jobs.json", "type": "table", "description": "U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790.\n\nOriginally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Vi\u00e9gas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). \nThe dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/).\n\nData is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of IPUMS USA, according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions).\n\nIPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, stating:\n>We're excited to hear that this dataset made its way to this repository and is being used by students for data visualization. We allow for these types of redistributions of summary data so long as the underlying microdata records are not shared.\n\nThis dataset contains only summary statistics and does not include any underlying microdata records.\n\n1. This dataset represents summary data. The underlying microdata records are not included.\n2. Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) \n(person weight) variable as an expansion factor when working with IPUMS USA extracts.\n3. 
Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly.\n\nWhen using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml).\nThe organization requests use of the following citation for this json file:\n\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0\n", "sources": [{"title": "IPUMS USA", "path": "https://usa.ipums.org/usa/", "version": "6.0"}], "path": "jobs.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 936649, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "job", "type": "string", "description": "The occupation title"}, {"name": "sex", "type": "string", "description": "Sex (men/women)"}, {"name": "year", "type": "integer", "description": "Census year"}, {"name": "count", "type": "integer", "description": "Number of individuals in the occupation"}, {"name": "perc", "type": "number", "description": "Percentage of the workforce in the occupation"}]}}, {"name": "la-riots.csv", "type": "table", "description": "More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles \nfor five days starting on April 29, 1992. This file contains metadata about each person, including the geographic \ncoordinates of their death. Compiled and published by the Los Angeles Times Data Desk.", "sources": [{"title": "LA Riots Deaths, Los Angeles Times Data Desk", "path": "http://spreadsheets.latimes.com/la-riots-deaths/"}], "path": "la-riots.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 7432, "schema": {"fields": [{"name": "first_name", "type": "string"}, {"name": "last_name", "type": "string"}, {"name": "age", "type": "integer"}, {"name": "gender", "type": "string"}, {"name": "race", "type": "string"}, {"name": "death_date", "type": "date"}, {"name": "address", "type": "string"}, {"name": "neighborhood", "type": "string"}, {"name": "type", "type": "string"}, {"name": "longitude", "type": "number"}, {"name": "latitude", "type": "number"}]}}, {"name": "londonboroughs.json", "type": "json", "description": "Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. 
\nOriginal data \"contains National Statistics data \u00a9 Crown copyright and database right (2015)\" \nand \"Contains Ordnance Survey data \u00a9 Crown copyright and database right [2015]\".", "sources": [{"title": "Statistical GIS Boundary Files, London Datastore", "path": "https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london"}], "path": "londonBoroughs.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "bytes": 14732}, {"name": "londoncentroids.json", "type": "table", "description": "Calculated from `londonBoroughs.json` using [`d3.geoCentroid`](https://d3js.org/d3-geo/math#geoCentroid).", "path": "londonCentroids.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2339, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "cx", "type": "number"}, {"name": "cy", "type": "number"}]}}, {"name": "londontubelines.json", "type": "json", "description": "Selected rail lines simplified from source.", "sources": [{"title": "London Tube Data", "path": "https://github.com/oobrien/vis/tree/master/tube/data"}], "path": "londonTubeLines.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "bytes": 80097}, {"name": "lookup_groups.csv", "type": "table", "path": "lookup_groups.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 77, "schema": {"fields": [{"name": "group", "type": "integer"}, {"name": "person", "type": "string"}]}}, {"name": "lookup_people.csv", "type": "table", "path": "lookup_people.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 125, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "integer"}, {"name": "height", "type": "integer"}]}}, {"name": "miserables.json", "type": "json", "path": "miserables.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 12372}, {"name": "monarchs.json", "type": "table", "description": "A chronological list of English and British monarchs from Elizabeth I through George IV.\nEach entry includes the monarch's name, the years their reign began and ended, and a sequential index giving the chronological order of rulers.\n\nThe dataset contains two intentional inaccuracies to maintain compatibility with \nthe [Wheat and Wages](https://vega.github.io/vega/examples/wheat-and-wages/) example visualization:\n1. the start date for the reign of Elizabeth I is shown as 1565, instead of 1558;\n2. the end date for the reign of George IV is shown as 1820, instead of 1830.\nThese discrepancies align the `monarchs.json` dataset with the start and end dates of the `wheat.json` dataset used in the visualization.\nThe entry \"W&M\" represents the joint reign of William III and Mary II. While the dataset shows their reign as 1689-1702, \nthe official Web site of the British royal family indicates that Mary II's reign ended in 1694, though William III continued to rule until 1702.\nThe `commonwealth` field is used to flag the period from 1649 to 1660, which includes the Commonwealth of England, the Protectorate, \nand the period leading to the Restoration. While historically more accurate to call this the \"interregnum,\" the field name of `commonwealth` \nfrom the original dataset is retained for backwards compatibility.\nThe dataset was revised in Aug. 2024. 
James II's reign now ends in 1688 (previously 1689).\nSource data has been verified against the kings & queens and interregnum pages of the official website of the British royal family (retrieved in Aug. 2024).\nContent on the site is protected by Crown Copyright. \nUnder the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most \nCrown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/).", "sources": [{"title": "The Royal Family - Kings & Queens", "path": "https://www.royal.uk/kings-and-queens-1066"}, {"title": "The Royal Family - Interregnum", "path": "https://www.royal.uk/interregnum-1649-1660"}], "path": "monarchs.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 683, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string", "description": "The ruler's name or identifier (e.g., \"W&M\" for William and Mary, \"Cromwell\" for the period of interregnum)"}, {"name": "start", "type": "integer", "description": "The year their rule began"}, {"name": "end", "type": "integer", "description": "The year their rule ended"}, {"name": "index", "type": "integer", "description": "A zero-based sequential number assigned to each entry, representing the chronological order of rulers"}]}}, {"name": "movies.json", "type": "table", "description": "The dataset has well known and intentionally included errors. \nThis dataset is provided for instructional purposes, including the need to reckon with dirty data.", "path": "movies.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1399981, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Title", "type": "string"}, {"name": "US Gross", "type": "integer"}, {"name": "Worldwide Gross", "type": "integer"}, {"name": "US DVD Sales", "type": "integer"}, {"name": "Production Budget", "type": "integer"}, {"name": "Release Date", "type": "string"}, {"name": "MPAA Rating", "type": "string"}, {"name": "Running Time min", "type": "integer"}, {"name": "Distributor", "type": "string"}, {"name": "Source", "type": "string"}, {"name": "Major Genre", "type": "string"}, {"name": "Creative Type", "type": "string"}, {"name": "Director", "type": "string"}, {"name": "Rotten Tomatoes Rating", "type": "integer"}, {"name": "IMDB Rating", "type": "number"}, {"name": "IMDB Votes", "type": "integer"}]}}, {"name": "normal-2d.json", "type": "table", "path": "normal-2d.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 34398, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "u", "type": "number"}, {"name": "v", "type": "number"}]}}, {"name": "obesity.json", "type": "table", "path": "obesity.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2202, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "id", "type": "integer"}, {"name": "rate", "type": "number"}, {"name": "state", "type": "string"}]}}, {"name": "ohlc.json", "type": "table", "description": "This dataset contains the performance of the Chicago Board Options Exchange \n[Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/%5EVIX#overview))\nin the summer of 2009.", "sources": [{"title": 
"Yahoo Finance VIX Data", "path": "https://finance.yahoo.com/chart/%5EVIX"}], "path": "ohlc.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 5737, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "open", "type": "number"}, {"name": "high", "type": "number"}, {"name": "low", "type": "number"}, {"name": "close", "type": "number"}, {"name": "signal", "type": "string"}, {"name": "ret", "type": "number"}]}}, {"name": "penguins.json", "type": "table", "description": "Palmer Archipelago (Antarctica) penguin data collected and made available by \n[Dr. Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) \nand the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research \nNetwork](https://lternet.edu/).", "sources": [{"title": "Palmer Station Antarctica LTER", "path": "https://pal.lternet.edu/"}, {"title": "Allison Horst's Penguins Repository", "path": "https://github.com/allisonhorst/penguins"}], "path": "penguins.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 67119, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Species", "type": "string"}, {"name": "Island", "type": "string"}, {"name": "Beak Length (mm)", "type": "number"}, {"name": "Beak Depth (mm)", "type": "number"}, {"name": "Flipper Length (mm)", "type": "integer"}, {"name": "Body Mass (g)", "type": "integer"}, {"name": "Sex", "type": "string"}]}}, {"name": "platformer-terrain.json", "type": "table", "description": "Assets from the video game Celeste.", "sources": [{"title": "Celeste Game", "path": "http://www.celestegame.com/"}], "path": "platformer-terrain.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1424097, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "x", "type": "integer"}, {"name": "y", "type": "integer"}, {"name": "lumosity", "type": "number"}, {"name": "saturation", "type": "integer"}, {"name": "name", "type": "string"}, {"name": "id", "type": "string"}, {"name": "color", "type": "string"}, {"name": "key", "type": "string"}]}}, {"name": "points.json", "type": "table", "path": "points.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 4926, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "x", "type": "number"}, {"name": "y", "type": "number"}]}}, {"name": "political-contributions.json", "type": "table", "description": "Summary financial information on contributions to candidates for U.S. \nelections. An updated version of this datset is available from the \"all candidates\" files \n(in pipe-delimited format) on the bulk data download page of the U.S. Federal Election \nCommission, or, alternatively, via OpenFEC. 
Information on each of the 25 columns is \navailable from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/).\nThe sample dataset in `political-contributions.json` contains 58 records with dates from 2015.\n\nFEC data is subject to the commission's:\n- [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/)\n- [Privacy and Security Policy](https://www.fec.gov/about/privacy-and-security-policy/)\n- [Acceptable Use Policy](https://github.com/fecgov/FEC/blob/master/ACCEPTABLE-USE-POLICY.md)\n\nAdditionally, the FEC's Github [repository](https://github.com/fecgov/FEC) states:\n> This project is in the public domain within the United States, and we waive worldwide \n> copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/)\n> dedication. Read more on our license page.\n> A few restrictions limit the way you can use FEC data. For example, you can't use \n> contributor lists for commercial purposes or to solicit donations. Learn more on \n> [FEC.gov](https://www.fec.gov/).", "sources": [{"title": "Federal Election Commission Bulk Data", "path": "https://www.fec.gov/data/browse-data/?tab=bulk-data"}, {"title": "OpenFEC API", "path": "https://api.open.fec.gov/developers/"}], "path": "political-contributions.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 50265, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Candidate_Identification", "type": "string"}, {"name": "Candidate_Name", "type": "string"}, {"name": "Incumbent_Challenger_Status", "type": "string"}, {"name": "Party_Code", "type": "integer"}, {"name": "Party_Affiliation", "type": "string"}, {"name": "Total_Receipts", "type": "number"}, {"name": "Transfers_from_Authorized_Committees", "type": "integer"}, {"name": "Total_Disbursements", "type": "number"}, {"name": "Transfers_to_Authorized_Committees", "type": "number"}, {"name": "Beginning_Cash", "type": "number"}, {"name": "Ending_Cash", "type": "number"}, {"name": "Contributions_from_Candidate", "type": "number"}, {"name": "Loans_from_Candidate", "type": "integer"}, {"name": "Other_Loans", "type": "integer"}, {"name": "Candidate_Loan_Repayments", "type": "number"}, {"name": "Other_Loan_Repayments", "type": "integer"}, {"name": "Debts_Owed_By", "type": "number"}, {"name": "Total_Individual_Contributions", "type": "integer"}, {"name": "Candidate_State", "type": "string"}, {"name": "Candidate_District", "type": "integer"}, {"name": "Contributions_from_Other_Political_Committees", "type": "integer"}, {"name": "Contributions_from_Party_Committees", "type": "integer"}, {"name": "Coverage_End_Date", "type": "string"}, {"name": "Refunds_to_Individuals", "type": "integer"}, {"name": "Refunds_to_Committees", "type": "integer"}]}}, {"name": "population.json", "type": "table", "description": "United States population statistics by sex and age group across decades between 1850 and 2000. \nThe dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census \nmicrodata\" from as early as 1790.\n\nIPUMS updates and revises datasets over time, which may result in discrepancies between this \ndataset and current IPUMS data. Details on data revisions are available here.\n\nWhen using this dataset, please refer to IPUMS USA terms of use. 
The organization requests the \nuse of the following citation for this json file:\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated \nPublic Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. \nhttp://doi.org/10.18128/D010.V6.0\n", "sources": [{"title": "IPUMS USA", "path": "https://usa.ipums.org/usa/"}], "path": "population.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 27665, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer", "description": "Four-digit year of the survey"}, {"name": "age", "type": "integer", "description": "Age group in 5-year intervals (0=0-4, 5=5-9, 10=10-14, ..., 90=90+)"}, {"name": "sex", "type": "integer", "description": "Sex (1=men, 2=women)"}, {"name": "people", "type": "integer", "description": "Number of individuals (IPUMS PERWT)"}]}}, {"name": "population_engineers_hurricanes.csv", "type": "table", "description": "Per-state data on population, number of engineers, and hurricanes. Used in Vega-Lite example,\n[Three Choropleths Representing Disjoint Data from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html)", "sources": [{"title": "Bureau of Labor Statistics", "path": "https://www.bls.gov/oes/tables.htm"}, {"title": "American Community Survey", "path": "https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_07_3YR_S1901&prodType=table"}, {"title": "NOAA National Climatic Data Center", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/records"}], "path": "population_engineers_hurricanes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 1852, "schema": {"fields": [{"name": "state", "type": "string"}, {"name": "id", "type": "integer"}, {"name": "population", "type": "integer"}, {"name": "engineers", "type": "number"}, {"name": "hurricanes", "type": "integer"}]}}, {"name": "seattle-weather-hourly-normals.csv", "type": "table", "description": "Hourly weather normals with metric units. The 1981-2010 Climate Normals are \nNCDC's three-decade averages of climatological variables, including temperature and \nprecipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf).\nWe only included temperature, wind, and pressure \nand updated the format to be easier to parse.", "sources": [{"title": "NOAA National Climatic Data Center (NCDC)", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/normals"}], "path": "seattle-weather-hourly-normals.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 311148, "schema": {"fields": [{"name": "date", "type": "datetime"}, {"name": "pressure", "type": "number"}, {"name": "temperature", "type": "number"}, {"name": "wind", "type": "number"}]}}, {"name": "seattle-weather.csv", "type": "table", "description": "Daily weather records with metric units. Transformed using `/scripts/weather.py`. \nThe categorical \"weather\" field is synthesized from multiple fields in the original dataset. 
\nThis data is intended for instructional purposes.", "sources": [{"title": "NOAA National Climatic Data Center", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/records"}], "path": "seattle-weather.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 48219, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "precipitation", "type": "number"}, {"name": "temp_max", "type": "number"}, {"name": "temp_min", "type": "number"}, {"name": "wind", "type": "number"}, {"name": "weather", "type": "string"}]}}, {"name": "sp500-2000.csv", "type": "table", "description": "S&P 500 index values from 2000 to 2020.", "sources": [{"title": "Yahoo Finance", "path": "https://finance.yahoo.com/quote/%5EDJI/history/"}], "path": "sp500-2000.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 415968, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "open", "type": "number"}, {"name": "high", "type": "number"}, {"name": "low", "type": "number"}, {"name": "close", "type": "number"}, {"name": "adjclose", "type": "number"}, {"name": "volume", "type": "integer"}]}}, {"name": "sp500.csv", "type": "table", "path": "sp500.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 2305, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "price", "type": "number"}]}}, {"name": "stocks.csv", "type": "table", "path": "stocks.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 12245, "schema": {"fields": [{"name": "symbol", "type": "string"}, {"name": "date", "type": "string"}, {"name": "price", "type": "number"}]}}, {"name": "udistrict.json", "type": "table", "path": "udistrict.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 6460, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "key", "type": "string"}, {"name": "lat", "type": "number"}]}}, {"name": "unemployment-across-industries.json", "type": "table", "description": "Industry-level unemployment statistics from the Current Population Survey \n(CPS), published monthly by the U.S. Bureau of Labor Statistics. Includes unemployed persons \nand unemployment rate across 11 private industries, as well as agricultural, government, and \nself-employed workers. Covers January 2000 through February 2010. Industry classification \nfollows format of CPS Table A-31.\n\nThe dataset can be replicated using the BLS API. For more, see the `scripts` folder of this \nrepository.\n\nThe BLS Web site states:\n> \"Users of the public API should cite the date that data were accessed or retrieved using \n> the API. Users must clearly state that \"BLS.gov cannot vouch for the data or analyses \n> derived from these data after the data have been retrieved from BLS.gov.\" The BLS.gov logo \n> may not be used by persons who are not BLS employees or on products (including web pages) \n> that are not BLS-sponsored.\"\n\nSee full BLS [terms of service](https://www.bls.gov/developers/termsOfService.htm).", "sources": [{"title": "U.S. 
Census Bureau Current Population Survey", "path": "https://www.census.gov/programs-surveys/cps.html"}, {"title": "BLS LAUS Data Tools", "path": "https://www.bls.gov/lau/data.htm"}, {"title": "Bureau of Labor Statistics Table A-31", "path": "https://www.bls.gov/web/empsit/cpseea31.htm"}], "path": "unemployment-across-industries.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 185641, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "series", "type": "string", "description": "Industry name"}, {"name": "year", "type": "integer", "description": "Year (2000-2010)"}, {"name": "month", "type": "integer", "description": "Month (1-12)"}, {"name": "count", "type": "integer", "description": "Number of unemployed persons (in thousands)"}, {"name": "rate", "type": "number", "description": "Unemployment rate (percentage)"}, {"name": "date", "type": "datetime", "description": "ISO 8601-formatted date string (e.g., \"2000-01-01T08:00:00.000Z\")"}]}}, {"name": "unemployment.tsv", "type": "table", "description": "This dataset contains county-level unemployment rates in the United States, with data generally\nconsistent with levels reported in 2009. The dataset is structured as tab-separated values.\nThe unemployment rate represents the number of unemployed persons as a percentage of the labor\nforce. According to the Bureau of Labor Statistics (BLS) glossary:\n\nUnemployed persons (Current Population Survey) [are] persons aged 16 years and older who had\nno employment during the reference week, were available for work, except for temporary\nillness, and had made specific efforts to find employment sometime during the 4-week period\nending with the reference week. Persons who were waiting to be recalled to a job from which\nthey had been laid off need not have been looking for work to be classified as unemployed.\n\nThis dataset is derived from the [Local Area Unemployment Statistics (LAUS)](https://www.bls.gov/lau/) program, \na federal-state cooperative effort overseen by the Bureau of Labor Statistics (BLS). \nThe LAUS program produces monthly and annual employment, unemployment, and labor force data for census regions and divisions,\nstates, counties, metropolitan areas, and many cities and towns.\n\nFor the most up-to-date LAUS data:\n1. **Monthly and Annual Data Downloads**:\n- Visit the [LAUS Data Tools](https://www.bls.gov/lau/data.htm) page for [monthly](https://www.bls.gov/lau/tables.htm#mcounty) \nand [annual](https://www.bls.gov/lau/tables.htm#cntyaa) county data.\n2. 
**BLS Public Data API**:\n- The BLS provides an API for developers to access various datasets, including LAUS data.\n- To use the API for LAUS data, refer to the [LAUS Series ID Formats](https://www.bls.gov/help/hlpforma.htm#LA) to construct your query.\n- API documentation and examples are available on the BLS Developers page.\n\nWhen using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm).", "sources": [{"title": "BLS Developers API", "path": "https://www.bls.gov/developers/"}, {"title": "BLS Handbook of Methods", "path": "https://www.bls.gov/opub/hom/lau/home.htm"}], "path": "unemployment.tsv", "scheme": "file", "format": "tsv", "mediatype": "text/tsv", "encoding": "utf-8", "bytes": 34739, "dialect": {"csv": {"delimiter": "\t"}}, "schema": {"fields": [{"name": "id", "type": "integer", "description": "The combined state and county FIPS code"}, {"name": "rate", "type": "number", "description": "The unemployment rate for the county"}]}}, {"name": "uniform-2d.json", "type": "table", "path": "uniform-2d.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 34217, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "u", "type": "number"}, {"name": "v", "type": "number"}]}}, {"name": "us-10m.json", "type": "json", "path": "us-10m.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "bytes": 642361}, {"name": "us-employment.csv", "type": "table", "description": "In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job \nlosses across the United States. The downturn in employment, and the slow recovery in hiring that \nfollowed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau \nof Labor Statistics.\n\nThis file contains the monthly employment total in a variety of job categories from January 2006 \nthrough December 2015. The numbers are seasonally adjusted and reported in thousands. The data \nwere downloaded on Nov. 11, 2018, and reformatted for use in this library.\n\nTotals are included for the [22 \"supersectors\"](https://download.bls.gov/pub/time.series/ce/ce.supersector)\ntracked by the BLS. The \"nonfarm\" total is the category typically used by \neconomists and journalists as a stand-in for the country's employment total.\n\nA calculated \"nonfarm_change\" column has been appended with the month-to-month change in that \nsupersector's employment. It is useful for illustrating how to make bar charts that report both \nnegative and positive values.\n", "sources": [{"title": "U.S. 
Bureau of Labor Statistics Current Employment Statistics", "path": "https://www.bls.gov/ces/"}], "path": "us-employment.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 17841, "schema": {"fields": [{"name": "month", "type": "date"}, {"name": "nonfarm", "type": "integer"}, {"name": "private", "type": "integer"}, {"name": "goods_producing", "type": "integer"}, {"name": "service_providing", "type": "integer"}, {"name": "private_service_providing", "type": "integer"}, {"name": "mining_and_logging", "type": "integer"}, {"name": "construction", "type": "integer"}, {"name": "manufacturing", "type": "integer"}, {"name": "durable_goods", "type": "integer"}, {"name": "nondurable_goods", "type": "integer"}, {"name": "trade_transportation_utilties", "type": "integer"}, {"name": "wholesale_trade", "type": "number"}, {"name": "retail_trade", "type": "number"}, {"name": "transportation_and_warehousing", "type": "number"}, {"name": "utilities", "type": "number"}, {"name": "information", "type": "integer"}, {"name": "financial_activities", "type": "integer"}, {"name": "professional_and_business_services", "type": "integer"}, {"name": "education_and_health_services", "type": "integer"}, {"name": "leisure_and_hospitality", "type": "integer"}, {"name": "other_services", "type": "integer"}, {"name": "government", "type": "integer"}, {"name": "nonfarm_change", "type": "integer"}]}}, {"name": "us-state-capitals.json", "type": "table", "path": "us-state-capitals.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 3869, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "lon", "type": "number"}, {"name": "lat", "type": "number"}, {"name": "state", "type": "string"}, {"name": "city", "type": "string"}]}}, {"name": "volcano.json", "type": "json", "description": "Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. \nThis data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a \ntopographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate.", "sources": [{"title": "R Datasets", "path": "https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.html"}], "path": "volcano.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 21167}, {"name": "weather.csv", "type": "table", "description": "NOAA data transformed using `/scripts/weather.py`. Categorical \"weather\" field synthesized \nfrom multiple fields in the original dataset. 
This data is intended for instructional purposes.", "sources": [{"title": "NOAA Climate Data Online", "path": "http://www.ncdc.noaa.gov/cdo-web/datatools/findstation"}], "path": "weather.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 121417, "schema": {"fields": [{"name": "location", "type": "string"}, {"name": "date", "type": "date"}, {"name": "precipitation", "type": "number"}, {"name": "temp_max", "type": "number"}, {"name": "temp_min", "type": "number"}, {"name": "wind", "type": "number"}, {"name": "weather", "type": "string"}]}}, {"name": "weekly-weather.json", "type": "json", "description": "Instructional dataset showing actual and predicted temperature data.\n\n> [!IMPORTANT]\n> Named `weather.json` in previous versions (`v1.4.0` - `v2.11.0`).\n", "path": "weekly-weather.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1281}, {"name": "wheat.json", "type": "table", "description": "In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair),\na Scottish engineer who is often credited as the founder of statistical graphics, \npublished an elegant chart on the price of wheat. It plots 250 years of prices alongside \nweekly wages and the reigning monarch. He intended to demonstrate that:\n> \"never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time.\"\n", "sources": [{"title": "1822 Playfair Chart", "path": "http://dh101.humanities.ucla.edu/wp-content/uploads/2014/08/Vis_2.jpg"}], "path": "wheat.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2085, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer"}, {"name": "wheat", "type": "number"}, {"name": "wages", "type": "number"}]}}, {"name": "windvectors.csv", "type": "table", "description": "Simulated wind patterns over northwestern Europe.", "path": "windvectors.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 129253, "schema": {"fields": [{"name": "longitude", "type": "number"}, {"name": "latitude", "type": "number"}, {"name": "dir", "type": "integer"}, {"name": "dirCat", "type": "integer"}, {"name": "speed", "type": "number"}]}}, {"name": "world-110m.json", "type": "json", "path": "world-110m.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "bytes": 119410}, {"name": "zipcodes.csv", "type": "table", "description": "GeoNames.org", "sources": [{"title": "GeoNames", "path": "https://www.geonames.org"}], "path": "zipcodes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 2018388, "schema": {"fields": [{"name": "zip_code", "type": "integer"}, {"name": "latitude", "type": "number"}, {"name": "longitude", "type": "number"}, {"name": "city", "type": "string"}, {"name": "state", "type": "string"}, {"name": "county", "type": "string"}]}}]} \ No newline at end of file diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index f71037d5c..fd2aa848d 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -45,8 +45,9 @@ def __init__( jsdelivr_version: LiteralString = "v1", ) -> None: output_dir.mkdir(exist_ok=True) - self._paths: dict[Literal["tags"], Path] = { - "tags": output_dir / f"{name_tags}.parquet" + self._paths: dict[Literal["tags", "datapackage"], Path] = { + "tags": output_dir / f"{name_tags}.parquet", + 
"datapackage": output_dir / "datapackage.json", } self._url: NpmUrl = NpmUrl( CDN=f"https://cdn.{jsdelivr}.net/{npm}/{package}@", @@ -121,6 +122,12 @@ def file_gh( with self._opener.open(req) as response: return read_fn(response) - def datapackage(self, *, tag: LiteralString | None = None) -> ParsedPackage: - pkg: FlPackage = self.file_gh(tag or "main", "datapackage.json") + def datapackage( + self, *, tag: LiteralString | None = None, frozen: bool = False + ) -> ParsedPackage: + pkg: FlPackage = ( + json.loads(self._paths["datapackage"].read_text("utf-8")) + if frozen + else self.file_gh(tag or "main", "datapackage.json") + ) return datapackage.parse_package(pkg) diff --git a/tools/generate_schema_wrapper.py b/tools/generate_schema_wrapper.py index 27ef56f97..3177b56cf 100644 --- a/tools/generate_schema_wrapper.py +++ b/tools/generate_schema_wrapper.py @@ -1394,7 +1394,7 @@ def main() -> None: copy_schemapi_util() vegalite_main(args.skip_download) write_expr_module(VERSIONS.vlc_vega, output=EXPR_FILE, header=HEADER_COMMENT) - datasets.app.refresh(include_typing=True) + datasets.app.refresh(include_typing=True, frozen=True) # The modules below are imported after the generation of the new schema files # as these modules import Altair. This allows them to use the new changes From e259fbabfc38c3803de0a952f7e2b081a22a3ba3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 13 Jan 2025 12:46:46 +0000 Subject: [PATCH 143/201] feat: Support and extract `hash` from `datapackage.json` Related https://github.com/vega/vega-datasets/pull/665 --- tools/datasets/_metadata/datapackage.json | 2 +- tools/datasets/datapackage.py | 5 +++++ tools/datasets/models.py | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/datasets/_metadata/datapackage.json b/tools/datasets/_metadata/datapackage.json index dbb2e51dc..df9d40e85 100644 --- a/tools/datasets/_metadata/datapackage.json +++ b/tools/datasets/_metadata/datapackage.json @@ -1 +1 @@ -{"name": "vega-datasets", "description": "Common repository for example datasets used by Vega related projects. \nBSD-3-Clause license applies only to package code and infrastructure. Users should verify their use of datasets \ncomplies with the license terms of the original sources. 
Dataset license information, where included, \nis a reference starting point only and is provided without any warranty of accuracy or completeness.\n", "homepage": "http://github.com/vega/vega-datasets.git", "licenses": [{"name": "BSD-3-Clause", "path": "https://opensource.org/license/bsd-3-clause", "title": "The 3-Clause BSD License"}], "contributors": [{"title": "UW Interactive Data Lab", "path": "http://idl.cs.washington.edu"}, {"title": "vega-datasets contributors", "path": "https://github.com/vega/vega-datasets/graphs/contributors"}], "version": "2.11.0", "created": "2024-12-31T18:32:26.970186+00:00", "resources": [{"name": "7zip.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "7zip.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "bytes": 3969}, {"name": "airports.csv", "type": "table", "path": "airports.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 210365, "schema": {"fields": [{"name": "iata", "type": "string"}, {"name": "name", "type": "string"}, {"name": "city", "type": "string"}, {"name": "state", "type": "string"}, {"name": "country", "type": "string"}, {"name": "latitude", "type": "number"}, {"name": "longitude", "type": "number"}]}}, {"name": "annual-precip.json", "type": "json", "description": "A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell.", "sources": [{"title": "Climate Forecast System Version 2", "path": "https://www.ncdc.noaa.gov/data-access/model-data/model-datasets/climate-forecast-system-version2-cfsv2"}], "path": "annual-precip.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 266265}, {"name": "anscombe.json", "type": "table", "description": "Graphs in Statistical Analysis, F. J. Anscombe, The American Statistician.", "path": "anscombe.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1703, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Series", "type": "string"}, {"name": "X", "type": "integer"}, {"name": "Y", "type": "number"}]}}, {"name": "barley.json", "type": "table", "description": "The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites.\n\nIt was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper \"Statistical Determination of Barley Varietal Adaption\".\n\nR.A. 
Fisher's popularized its use in the field of statistics when he included it in his book \"The Design of Experiments\".\n\nSince then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s.\n", "sources": [{"title": "The Design of Experiments Reference", "path": "https://en.wikipedia.org/wiki/The_Design_of_Experiments"}, {"title": "Trellis Charts Paper", "path": "http://ml.stat.purdue.edu/stat695t/writings/TrellisDesignControl.pdf"}], "path": "barley.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 8487, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "yield", "type": "number"}, {"name": "variety", "type": "string"}, {"name": "year", "type": "integer"}, {"name": "site", "type": "string"}]}}, {"name": "birdstrikes.csv", "type": "table", "description": "Records of reported wildlife strikes received by the U.S. FAA", "sources": [{"title": "FAA Wildlife Strike Database", "path": "http://wildlife.faa.gov"}], "path": "birdstrikes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 1223329, "schema": {"fields": [{"name": "Airport Name", "type": "string"}, {"name": "Aircraft Make Model", "type": "string"}, {"name": "Effect Amount of damage", "type": "string"}, {"name": "Flight Date", "type": "date"}, {"name": "Aircraft Airline Operator", "type": "string"}, {"name": "Origin State", "type": "string"}, {"name": "Phase of flight", "type": "string"}, {"name": "Wildlife Size", "type": "string"}, {"name": "Wildlife Species", "type": "string"}, {"name": "Time of day", "type": "string"}, {"name": "Cost Other", "type": "integer"}, {"name": "Cost Repair", "type": "integer"}, {"name": "Cost Total $", "type": "integer"}, {"name": "Speed IAS in knots", "type": "integer"}]}}, {"name": "budget.json", "type": "table", "description": "Historical and forecasted federal revenue/receipts produced in 2016 by the U.S. 
Office of Management and Budget.", "sources": [{"title": "Office of Management and Budget - Budget FY 2016 - Receipts", "path": "https://www.govinfo.gov/app/details/BUDGET-2016-DB/BUDGET-2016-DB-3"}], "path": "budget.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 391353, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Source Category Code", "type": "integer"}, {"name": "Source category name", "type": "string"}, {"name": "Source subcategory", "type": "integer"}, {"name": "Source subcategory name", "type": "string"}, {"name": "Agency code", "type": "integer"}, {"name": "Agency name", "type": "string"}, {"name": "Bureau code", "type": "integer"}, {"name": "Bureau name", "type": "string"}, {"name": "Account code", "type": "integer"}, {"name": "Account name", "type": "string"}, {"name": "Treasury Agency code", "type": "integer"}, {"name": "On- or off-budget", "type": "string"}, {"name": "1962", "type": "string"}, {"name": "1963", "type": "string"}, {"name": "1964", "type": "string"}, {"name": "1965", "type": "string"}, {"name": "1966", "type": "string"}, {"name": "1967", "type": "string"}, {"name": "1968", "type": "string"}, {"name": "1969", "type": "string"}, {"name": "1970", "type": "string"}, {"name": "1971", "type": "string"}, {"name": "1972", "type": "string"}, {"name": "1973", "type": "string"}, {"name": "1974", "type": "string"}, {"name": "1975", "type": "string"}, {"name": "1976", "type": "string"}, {"name": "TQ", "type": "string"}, {"name": "1977", "type": "string"}, {"name": "1978", "type": "string"}, {"name": "1979", "type": "string"}, {"name": "1980", "type": "string"}, {"name": "1981", "type": "string"}, {"name": "1982", "type": "string"}, {"name": "1983", "type": "string"}, {"name": "1984", "type": "string"}, {"name": "1985", "type": "string"}, {"name": "1986", "type": "string"}, {"name": "1987", "type": "string"}, {"name": "1988", "type": "string"}, {"name": "1989", "type": "string"}, {"name": "1990", "type": "string"}, {"name": "1991", "type": "string"}, {"name": "1992", "type": "string"}, {"name": "1993", "type": "string"}, {"name": "1994", "type": "string"}, {"name": "1995", "type": "string"}, {"name": "1996", "type": "string"}, {"name": "1997", "type": "string"}, {"name": "1998", "type": "string"}, {"name": "1999", "type": "string"}, {"name": "2000", "type": "string"}, {"name": "2001", "type": "string"}, {"name": "2002", "type": "string"}, {"name": "2003", "type": "string"}, {"name": "2004", "type": "string"}, {"name": "2005", "type": "string"}, {"name": "2006", "type": "string"}, {"name": "2007", "type": "string"}, {"name": "2008", "type": "string"}, {"name": "2009", "type": "string"}, {"name": "2010", "type": "string"}, {"name": "2011", "type": "string"}, {"name": "2012", "type": "string"}, {"name": "2013", "type": "string"}, {"name": "2014", "type": "string"}, {"name": "2015", "type": "string"}, {"name": "2016", "type": "string"}, {"name": "2017", "type": "string"}, {"name": "2018", "type": "string"}, {"name": "2019", "type": "string"}, {"name": "2020", "type": "string"}]}}, {"name": "budgets.json", "type": "table", "path": "budgets.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 18079, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "budgetYear", "type": "integer"}, {"name": "forecastYear", "type": "integer"}, {"name": "value", "type": "number"}]}}, {"name": "burtin.json", "type": "table", "description": "The burtin.json dataset 
is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine.\n\nThe dataset compares the performance of three antibiotics against 16 different bacteria.\n\nNumerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness.\n\nThe dataset was featured as an example in the Protovis project, a precursor to D3.js.\n\nAs noted in the Protovis example, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 \u03bcg/ml, and an exaggeration of some values for penicillin\".\n\nThe vega-datsets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together.\n\nThe caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) \nreads as follows:\n\n> ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin\n>\n>\n> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in red and gram- in blue) with their sensitivities to penicillin, and streptomycin.\n>\n> The effectiveness of the antibiotics is expressed as the highest dilution in \u03bc/ml. which inhibits the test organism.\n>\n> High dilutions are toward the periphery; consequently the length of the colored bar is proportional to the effectiveness.\n>\n> It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. vulgaris, S. schottmuelleri and M. tuberculosis.\n>\n> Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to neomycin, although the majority of these are sensitive to neomycin.\n>\n> It also inhibits actinomycetes, but is inactive against viruses and fungi. 
Its mode of action is not understood.\n", "sources": [{"title": "Scope Magazine", "path": "https://graphicdesignarchives.org/projects/scope-magazine-vol-iii-5/"}, {"title": "Protovis Antibiotics Example", "path": "https://mbostock.github.io/protovis/ex/antibiotics-burtin.html"}], "path": "burtin.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2743, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Bacteria", "type": "string"}, {"name": "Penicillin", "type": "number"}, {"name": "Streptomycin", "type": "number"}, {"name": "Neomycin", "type": "number"}, {"name": "Gram_Staining", "type": "string"}, {"name": "Genus", "type": "string"}]}}, {"name": "cars.json", "type": "table", "description": "Collection of car specifications and performance metrics from various automobile manufacturers.", "sources": [{"title": "StatLib Datasets Archive", "path": "http://lib.stat.cmu.edu/datasets/"}], "path": "cars.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 100492, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Name", "type": "string"}, {"name": "Miles_per_Gallon", "type": "integer"}, {"name": "Cylinders", "type": "integer"}, {"name": "Displacement", "type": "number"}, {"name": "Horsepower", "type": "integer"}, {"name": "Weight_in_lbs", "type": "integer"}, {"name": "Acceleration", "type": "number"}, {"name": "Year", "type": "date"}, {"name": "Origin", "type": "string"}]}}, {"name": "co2-concentration.csv", "type": "table", "description": "Scripps CO2 program data ut modified to only include date, CO2, seasonally adjusted CO2. \nOnly includes rows with valid data.", "sources": [{"title": "Scripps CO2 Program", "path": "https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record"}], "path": "co2-concentration.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 18547, "schema": {"fields": [{"name": "Date", "type": "date"}, {"name": "CO2", "type": "number"}, {"name": "adjusted CO2", "type": "number"}]}}, {"name": "countries.json", "type": "table", "description": "This dataset combines key demographic indicators (life expectancy at birth and\nfertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year\nintervals. It includes both current values and adjacent time period values (previous and next)\nfor each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) \nnotes that its philosophy is to fill data gaps with estimates and use current\ngeographic boundaries for historical data. 
Gapminder states that it aims to \"show people the\nbig picture\" rather than support detailed numeric analysis.", "licenses": [{"title": "Creative Commons Attribution 4.0 International", "path": "https://www.gapminder.org/free-material/"}], "sources": [{"title": "Gapminder Foundation - Life Expectancy", "path": "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Fertility", "path": "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676", "version": "14"}], "path": "countries.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 99457, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "_comment", "type": "string"}, {"name": "year", "type": "integer", "description": "Years from 1955 to 2000 at 5-year intervals"}, {"name": "fertility", "type": "number", "description": "Fertility rate (average number of children per woman) for the given year"}, {"name": "life_expect", "type": "number", "description": "Life expectancy in years for the given year"}, {"name": "n_fertility", "type": "number", "description": "Fertility rate for the next 5-year interval"}, {"name": "n_life_expect", "type": "number", "description": "Life expectancy for the next 5-year interval"}, {"name": "country", "type": "string", "description": "Name of the country"}]}}, {"name": "crimea.json", "type": "table", "description": "This dataset, which informed Florence Nightingale's groundbreaking work in public health, details \nmonthly mortality rates from British military hospitals during the Crimean War (1854-1856). \n\nNightingale credits Dr. William Farr for compiling the data from the 1858 [Medical and Surgical \nHistory of the British Army](http://resource.nlm.nih.gov/62510370R). The dataset categorizes \ndeaths into \"zymotic\" diseases (preventable infectious diseases), wounds/injuries, and other causes. \nCovering the period from April 1854 to March 1856, the dataset includes monthly army strength \nalongside mortality figures. Nightingale transformed this data into her now-famous [polar area \ndiagrams](https://iiif.lib.harvard.edu/manifests/view/drs:7420433$25i). \n\nThe annual mortality rates plotted in the chart can be calculated from the dataset using the formula \n> (Deaths × 1000 × 12) ÷ Army Size. \n\nAs [The Lancet](https://pmc.ncbi.nlm.nih.gov/articles/PMC7252134/) argued in 2020, Nightingale's \ninnovative visualizations proved that \"far more men died of disease, infection, and exposure \nthan in battle\u2014a fact that shocked the British nation.\" Her work also vividly illustrated \nthe dramatic impact of sanitary reforms, particularly in reducing preventable deaths.", "sources": [{"title": "Nightingale, Florence. A contribution to the sanitary history of the British army during the late war with Russia. London : John W. Parker and Son, 1859. Table II. 
Table showing the Estimated Average Monthly Strength of the Army; and the Deaths and Annual Rate of Mortality per 1,000 in each month, from April 1854, to March 1856 (inclusive), in the Hospitals of the Army in the East.\n", "path": "https://nrs.lib.harvard.edu/urn-3:hms.count:1177146?n=21"}], "path": "crimea.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2183, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date", "description": "First day of each month during the observation period, in ISO 8601 format (YYYY-MM-DD)"}, {"name": "wounds", "type": "integer", "description": "Deaths from \"Wounds and Injuries\" which comprised: Luxatio (dislocation), Sub-Luxatio (partial dislocation), Vulnus Sclopitorum (gunshot wounds), Vulnus Incisum (incised wounds), Contusio (bruising), Fractura (fractures), Ambustio (burns) and Concussio-Cerebri (brain concussion)\n"}, {"name": "other", "type": "integer", "description": "Deaths from All Other Causes"}, {"name": "disease", "type": "integer", "description": "Deaths from Zymotic Diseases (preventable infectious diseases)"}, {"name": "army_size", "type": "integer", "description": "Estimated Average Monthly Strength of the Army"}]}}, {"name": "disasters.csv", "type": "table", "description": "Annual number of deaths from disasters.", "sources": [{"title": "Our World in Data - Natural Catastrophes", "path": "https://ourworldindata.org/natural-catastrophes"}], "path": "disasters.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 18840, "schema": {"fields": [{"name": "Entity", "type": "string"}, {"name": "Year", "type": "integer"}, {"name": "Deaths", "type": "integer"}]}}, {"name": "driving.json", "type": "table", "sources": [{"title": "New York Times", "path": "https://archive.nytimes.com/www.nytimes.com/imagepages/2010/05/02/business/02metrics.html"}], "path": "driving.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 3461, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "side", "type": "string"}, {"name": "year", "type": "integer"}, {"name": "miles", "type": "integer"}, {"name": "gas", "type": "number"}]}}, {"name": "earthquakes.json", "type": "json", "description": "Earthquake data retrieved Feb 6, 2018", "sources": [{"title": "USGS Earthquake Feed", "path": "https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson"}], "path": "earthquakes.json", "scheme": "file", "format": "geojson", "mediatype": "text/geojson", "encoding": "utf-8", "bytes": 1219853}, {"name": "ffox.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "ffox.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "bytes": 17628}, {"name": "flare-dependencies.json", "type": "table", "path": "flare-dependencies.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 34600, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "source", "type": "integer"}, {"name": "target", "type": "integer"}]}}, {"name": "flare.json", "type": "table", "path": "flare.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 20638, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "id", "type": "integer"}, {"name": "name", "type": "string"}]}}, {"name": "flights-10k.json", "type": "table", 
"description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-10k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 892400, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-200k.arrow", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-200k.arrow", "scheme": "file", "format": ".arrow", "mediatype": "application/vnd.apache.arrow.file", "bytes": 1600864, "schema": {"fields": [{"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "time", "type": "number"}]}}, {"name": "flights-200k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-200k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 9863892, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "time", "type": "number"}]}}, {"name": "flights-20k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-20k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1784867, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-2k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-2k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 178495, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-3m.parquet", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. 
Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-3m.parquet", "scheme": "file", "format": "parquet", "mediatype": "application/parquet", "bytes": 13493022, "schema": {"fields": [{"name": "date", "type": "datetime"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-5k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-5k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 446167, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-airport.csv", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-airport.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 65572, "schema": {"fields": [{"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}, {"name": "count", "type": "integer"}]}}, {"name": "football.json", "type": "table", "description": "Football match outcomes across multiple divisions from 2013 to 2017, part of a\nlarger dataset from OpenFootball. The subset was made such that there are records for all five\nchosen divisions over the time period.", "sources": [{"title": "OpenFootball", "path": "https://github.com/openfootball/football.json"}], "path": "football.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1207180, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "division", "type": "string"}, {"name": "home_team", "type": "string"}, {"name": "away_team", "type": "string"}, {"name": "home_score", "type": "integer"}, {"name": "away_score", "type": "integer"}]}}, {"name": "gapminder-health-income.csv", "type": "table", "description": "Per-capita income, life expectancy, population and regional grouping. Dataset does not specify \nthe reference year for the data. Gapminder historical data is subject to revisions.\n\nGapminder (v30, 2023) defines per-capita income as follows:\n>\"This is real GDP per capita (gross domestic product per person adjusted for inflation) \n>converted to international dollars using purchasing power parity rates. An international dollar \n>has the same purchasing power over GDP as the U.S. 
dollar has in the United States.\"\n", "licenses": [{"title": "Creative Commons Attribution 4.0 International", "path": "https://www.gapminder.org/free-material/"}], "sources": [{"title": "Gapminder Foundation", "path": "https://www.gapminder.org"}, {"title": "Gapminder GDP Per Capita Data", "path": "https://docs.google.com/spreadsheets/d/1i5AEui3WZNZqh7MQ4AKkJuCz4rRxGR_pw_9gtbcBOqQ/edit?gid=501532268#gid=501532268"}], "path": "gapminder-health-income.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 8605, "schema": {"fields": [{"name": "country", "type": "string"}, {"name": "income", "type": "integer"}, {"name": "health", "type": "number"}, {"name": "population", "type": "integer"}, {"name": "region", "type": "string"}]}}, {"name": "gapminder.json", "type": "table", "description": "This dataset combines key demographic indicators (life expectancy at birth, \npopulation, and fertility rate measured as babies per woman) for various countries from 1955 \nto 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable \ngrouping countries. Gapminder's data documentation notes that its philosophy is to fill data \ngaps with estimates and use current geographic boundaries for historical data. Gapminder \nstates that it aims to \"show people the big picture\" rather than support detailed numeric \nanalysis.\n\nNotes:\n1. Country Selection: The set of countries in this file matches the version of this dataset \n originally added to this collection in 2015. The specific criteria for country selection \n in that version are not known. Data for Aruba are no longer available in the new version. \n Hong Kong has been revised to Hong Kong, China in the new version.\n\n2. Data Precision: The precision of float values may have changed from the original version. \n These changes reflect the most recent source data used for each indicator.\n\n3. Regional Groupings: The 'cluster' column represents a regional mapping of countries \n corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. To \n preserve continuity with previous versions of this dataset, we have retained the column \n name 'cluster' instead of renaming it to 'six_regions'. 
The six regions represented are: \n `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`.", "sources": [{"title": "Gapminder Foundation - Life Expectancy (Data)", "path": "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundatio - Life Expectancy (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd004/"}, {"title": "Gapminder Foundation - Population (Data)", "path": "https://docs.google.com/spreadsheets/d/1c1luQNdpH90tNbMIeU7jD__59wQ0bdIGRFpbMm8ZBTk/edit?gid=176703676#gid=176703676", "version": "7"}, {"title": "Gapminder Foundation - Population (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd003/"}, {"title": "Gapminder Foundation - Fertility (Data)", "path": "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Fertility Documentation (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd008/"}, {"title": "Gapminder Foundation - Data Geographies (Data)", "path": "https://docs.google.com/spreadsheets/d/1qHalit8sXC0R8oVXibc2wa2gY7bkwGzOybEMTWp-08o/edit?gid=1597424158#gid=1597424158", "version": "2"}, {"title": "Gapminder Foundation - Data Geographies (Documentation)", "path": "https://www.gapminder.org/data/geo/"}, {"title": "Gapminder Data Documentation", "path": "https://www.gapminder.org/data/documentation/"}], "path": "gapminder.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 75201, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer", "description": "Years from 1955 to 2005 at 5-year intervals"}, {"name": "country", "type": "string", "description": "Name of the country"}, {"name": "cluster", "type": "integer", "description": "A categorical variable (values 0-5) grouping countries by region"}, {"name": "pop", "type": "integer", "description": "Population of the country"}, {"name": "life_expect", "type": "number", "description": "Life expectancy in years"}, {"name": "fertility", "type": "number", "description": "Fertility rate (average number of children per woman"}]}}, {"name": "gimp.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "gimp.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "bytes": 8211}, {"name": "github.csv", "type": "table", "description": "Generated using `/scripts/github.py`.", "path": "github.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 21059, "schema": {"fields": [{"name": "time", "type": "string"}, {"name": "count", "type": "integer"}]}}, {"name": "global-temp.csv", "type": "table", "description": "Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023.", "sources": [{"title": "NASA Goddard Institute for Space Studies", "path": "https://data.giss.nasa.gov/gistemp/"}], "path": "global-temp.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 1663, "schema": {"fields": [{"name": "year", "type": "integer"}, {"name": "temp", "type": "number"}]}}, {"name": "income.json", "type": "table", "path": "income.json", "scheme": "file", "format": "json", "mediatype": "text/json", 
"encoding": "utf-8", "bytes": 72771, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "region", "type": "string"}, {"name": "id", "type": "integer"}, {"name": "pct", "type": "number"}, {"name": "total", "type": "integer"}, {"name": "group", "type": "string"}]}}, {"name": "iowa-electricity.csv", "type": "table", "description": "The state of Iowa has dramatically increased its production of renewable \nwind power in recent years. This file contains the annual net generation of electricity in \nthe state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. \nIt is useful for illustrating stacked area charts.", "sources": [{"title": "U.S. Energy Information Administration", "path": "https://www.eia.gov/beta/electricity/data/browser/#/topic/0?agg=2,0,1&fuel=vvg&geo=00000g&sec=g&linechart=ELEC.GEN.OTH-IA-99.A~ELEC.GEN.COW-IA-99.A~ELEC.GEN.PEL-IA-99.A~ELEC.GEN.PC-IA-99.A~ELEC.GEN.NG-IA-99.A~~ELEC.GEN.NUC-IA-99.A~ELEC.GEN.HYC-IA-99.A~ELEC.GEN.AOR-IA-99.A~ELEC.GEN.HPS-IA-99.A~&columnchart=ELEC.GEN.ALL-IA-99.A&map=ELEC.GEN.ALL-IA-99.A&freq=A&start=2001&end=2017&ctype=linechart<ype=pin&tab=overview&maptype=0&rse=0&pin="}], "path": "iowa-electricity.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 1531, "schema": {"fields": [{"name": "year", "type": "date"}, {"name": "source", "type": "string"}, {"name": "net_generation", "type": "integer"}]}}, {"name": "jobs.json", "type": "table", "description": "U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790.\n\nOriginally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Vi\u00e9gas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). \nThe dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/).\n\nData is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of IPUMS USA, according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions).\n\nIPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, stating:\n>We're excited to hear that this dataset made its way to this repository and is being used by students for data visualization. We allow for these types of redistributions of summary data so long as the underlying microdata records are not shared.\n\nThis dataset contains only summary statistics and does not include any underlying microdata records.\n\n1. This dataset represents summary data. The underlying microdata records are not included.\n2. Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) \n(person weight) variable as an expansion factor when working with IPUMS USA extracts.\n3. 
Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly.\n\nWhen using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml).\nThe organization requests use of the following citation for this json file:\n\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0\n", "sources": [{"title": "IPUMS USA", "path": "https://usa.ipums.org/usa/", "version": "6.0"}], "path": "jobs.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 936649, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "job", "type": "string", "description": "The occupation title"}, {"name": "sex", "type": "string", "description": "Sex (men/women)"}, {"name": "year", "type": "integer", "description": "Census year"}, {"name": "count", "type": "integer", "description": "Number of individuals in the occupation"}, {"name": "perc", "type": "number", "description": "Percentage of the workforce in the occupation"}]}}, {"name": "la-riots.csv", "type": "table", "description": "More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles \nfor five days starting on April 29, 1992. This file contains metadata about each person, including the geographic \ncoordinates of their death. Compiled and published by the Los Angeles Times Data Desk.", "sources": [{"title": "LA Riots Deaths, Los Angeles Times Data Desk", "path": "http://spreadsheets.latimes.com/la-riots-deaths/"}], "path": "la-riots.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 7432, "schema": {"fields": [{"name": "first_name", "type": "string"}, {"name": "last_name", "type": "string"}, {"name": "age", "type": "integer"}, {"name": "gender", "type": "string"}, {"name": "race", "type": "string"}, {"name": "death_date", "type": "date"}, {"name": "address", "type": "string"}, {"name": "neighborhood", "type": "string"}, {"name": "type", "type": "string"}, {"name": "longitude", "type": "number"}, {"name": "latitude", "type": "number"}]}}, {"name": "londonboroughs.json", "type": "json", "description": "Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. 
\nOriginal data \"contains National Statistics data \u00a9 Crown copyright and database right (2015)\" \nand \"Contains Ordnance Survey data \u00a9 Crown copyright and database right [2015].", "sources": [{"title": "Statistical GIS Boundary Files, London Datastore", "path": "https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london"}], "path": "londonBoroughs.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "bytes": 14732}, {"name": "londoncentroids.json", "type": "table", "description": "Calculated from `londonBoroughs.json` using [`d3.geoCentroid`](https://d3js.org/d3-geo/math#geoCentroid).", "path": "londonCentroids.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2339, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "cx", "type": "number"}, {"name": "cy", "type": "number"}]}}, {"name": "londontubelines.json", "type": "json", "description": "Selected rail lines simplified from source.", "sources": [{"title": "London Tube Data", "path": "https://github.com/oobrien/vis/tree/master/tube/data"}], "path": "londonTubeLines.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "bytes": 80097}, {"name": "lookup_groups.csv", "type": "table", "path": "lookup_groups.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 77, "schema": {"fields": [{"name": "group", "type": "integer"}, {"name": "person", "type": "string"}]}}, {"name": "lookup_people.csv", "type": "table", "path": "lookup_people.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 125, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "integer"}, {"name": "height", "type": "integer"}]}}, {"name": "miserables.json", "type": "json", "path": "miserables.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 12372}, {"name": "monarchs.json", "type": "table", "description": "A chronological list of English and British monarchs from Elizabeth I through George IV.\nEach entry includes:\n\nThe dataset contains two intentional inaccuracies to maintain compatibility with \nthe [Wheat and Wages](https://vega.github.io/vega/examples/wheat-and-wages/) example visualization:\n1. the start date for the reign of Elizabeth I is shown as 1565, instead of 1558;\n2. the end date for the reign of George IV is shown as 1820, instead of 1830.\nThese discrepancies align the `monarchs.json` dataset with the start and end dates of the `wheat.json` dataset used i the visualization.\nThe entry \"W&M\" represents the joint reign of William III and Mary II. While the dataset shows their reign as 1689-1702, \nthe official Web site of the British royal family indicates that Mary II's reign ended in 1694, though William III continued to rule until 1702.\nThe `commonwealth` field is used to flag the period from 1649 to 1660, which includes the Commonwealth of England, the Protectorate, \nand the period leading to the Restoration. While historically more accurate to call this the \"interregnum,\" the field name of `commonwealth` \nfrom the original dataset is retained for backwards compatibility.\nThe dataset was revised in Aug. 2024. 
James II's reign now ends in 1688 (previously 1689).\nSource data has been verified against the kings & queens and interregnum pages of the official website of the British royal family (retrieved in Aug. 2024).\nContent on the site is protected by Crown Copyright. \nUnder the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most \nCrown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/).", "sources": [{"title": "The Royal Family - Kings & Queens", "path": "https://www.royal.uk/kings-and-queens-1066"}, {"title": "The Royal Family - Interregnum", "path": "https://www.royal.uk/interregnum-1649-1660"}], "path": "monarchs.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 683, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string", "description": "The ruler's name or identifier (e.g., \"W&M\" for William and Mary, \"Cromwell\" for the period of interregnum)"}, {"name": "start", "type": "integer", "description": "The year their rule began"}, {"name": "end", "type": "integer", "description": "The year their rule ended"}, {"name": "index", "type": "integer", "description": "A zero-based sequential number assigned to each entry, representing the chronological order of rulers"}]}}, {"name": "movies.json", "type": "table", "description": "The dataset has well known and intentionally included errors. \nThis dataset is provided for instructional purposes, including the need to reckon with dirty data.", "path": "movies.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1399981, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Title", "type": "string"}, {"name": "US Gross", "type": "integer"}, {"name": "Worldwide Gross", "type": "integer"}, {"name": "US DVD Sales", "type": "integer"}, {"name": "Production Budget", "type": "integer"}, {"name": "Release Date", "type": "string"}, {"name": "MPAA Rating", "type": "string"}, {"name": "Running Time min", "type": "integer"}, {"name": "Distributor", "type": "string"}, {"name": "Source", "type": "string"}, {"name": "Major Genre", "type": "string"}, {"name": "Creative Type", "type": "string"}, {"name": "Director", "type": "string"}, {"name": "Rotten Tomatoes Rating", "type": "integer"}, {"name": "IMDB Rating", "type": "number"}, {"name": "IMDB Votes", "type": "integer"}]}}, {"name": "normal-2d.json", "type": "table", "path": "normal-2d.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 34398, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "u", "type": "number"}, {"name": "v", "type": "number"}]}}, {"name": "obesity.json", "type": "table", "path": "obesity.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2202, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "id", "type": "integer"}, {"name": "rate", "type": "number"}, {"name": "state", "type": "string"}]}}, {"name": "ohlc.json", "type": "table", "description": "This dataset contains the performance of the Chicago Board Options Exchange \n[Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/%5EVIX#overview))\nin the summer of 2009.", "sources": [{"title": 
"Yahoo Finance VIX Data", "path": "https://finance.yahoo.com/chart/%5EVIX"}], "path": "ohlc.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 5737, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "open", "type": "number"}, {"name": "high", "type": "number"}, {"name": "low", "type": "number"}, {"name": "close", "type": "number"}, {"name": "signal", "type": "string"}, {"name": "ret", "type": "number"}]}}, {"name": "penguins.json", "type": "table", "description": "Palmer Archipelago (Antarctica) penguin data collected and made available by \n[Dr. Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) \nand the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research \nNetwork](https://lternet.edu/).", "sources": [{"title": "Palmer Station Antarctica LTER", "path": "https://pal.lternet.edu/"}, {"title": "Allison Horst's Penguins Repository", "path": "https://github.com/allisonhorst/penguins"}], "path": "penguins.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 67119, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Species", "type": "string"}, {"name": "Island", "type": "string"}, {"name": "Beak Length (mm)", "type": "number"}, {"name": "Beak Depth (mm)", "type": "number"}, {"name": "Flipper Length (mm)", "type": "integer"}, {"name": "Body Mass (g)", "type": "integer"}, {"name": "Sex", "type": "string"}]}}, {"name": "platformer-terrain.json", "type": "table", "description": "Assets from the video game Celeste.", "sources": [{"title": "Celeste Game", "path": "http://www.celestegame.com/"}], "path": "platformer-terrain.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1424097, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "x", "type": "integer"}, {"name": "y", "type": "integer"}, {"name": "lumosity", "type": "number"}, {"name": "saturation", "type": "integer"}, {"name": "name", "type": "string"}, {"name": "id", "type": "string"}, {"name": "color", "type": "string"}, {"name": "key", "type": "string"}]}}, {"name": "points.json", "type": "table", "path": "points.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 4926, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "x", "type": "number"}, {"name": "y", "type": "number"}]}}, {"name": "political-contributions.json", "type": "table", "description": "Summary financial information on contributions to candidates for U.S. \nelections. An updated version of this datset is available from the \"all candidates\" files \n(in pipe-delimited format) on the bulk data download page of the U.S. Federal Election \nCommission, or, alternatively, via OpenFEC. 
Information on each of the 25 columns is \navailable from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/).\nThe sample dataset in `political-contributions.json` contains 58 records with dates from 2015.\n\nFEC data is subject to the commission's:\n- [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/)\n- [Privacy and Security Policy](https://www.fec.gov/about/privacy-and-security-policy/)\n- [Acceptable Use Policy](https://github.com/fecgov/FEC/blob/master/ACCEPTABLE-USE-POLICY.md)\n\nAdditionally, the FEC's Github [repository](https://github.com/fecgov/FEC) states:\n> This project is in the public domain within the United States, and we waive worldwide \n> copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/)\n> dedication. Read more on our license page.\n> A few restrictions limit the way you can use FEC data. For example, you can't use \n> contributor lists for commercial purposes or to solicit donations. Learn more on \n> [FEC.gov](https://www.fec.gov/).", "sources": [{"title": "Federal Election Commission Bulk Data", "path": "https://www.fec.gov/data/browse-data/?tab=bulk-data"}, {"title": "OpenFEC API", "path": "https://api.open.fec.gov/developers/"}], "path": "political-contributions.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 50265, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Candidate_Identification", "type": "string"}, {"name": "Candidate_Name", "type": "string"}, {"name": "Incumbent_Challenger_Status", "type": "string"}, {"name": "Party_Code", "type": "integer"}, {"name": "Party_Affiliation", "type": "string"}, {"name": "Total_Receipts", "type": "number"}, {"name": "Transfers_from_Authorized_Committees", "type": "integer"}, {"name": "Total_Disbursements", "type": "number"}, {"name": "Transfers_to_Authorized_Committees", "type": "number"}, {"name": "Beginning_Cash", "type": "number"}, {"name": "Ending_Cash", "type": "number"}, {"name": "Contributions_from_Candidate", "type": "number"}, {"name": "Loans_from_Candidate", "type": "integer"}, {"name": "Other_Loans", "type": "integer"}, {"name": "Candidate_Loan_Repayments", "type": "number"}, {"name": "Other_Loan_Repayments", "type": "integer"}, {"name": "Debts_Owed_By", "type": "number"}, {"name": "Total_Individual_Contributions", "type": "integer"}, {"name": "Candidate_State", "type": "string"}, {"name": "Candidate_District", "type": "integer"}, {"name": "Contributions_from_Other_Political_Committees", "type": "integer"}, {"name": "Contributions_from_Party_Committees", "type": "integer"}, {"name": "Coverage_End_Date", "type": "string"}, {"name": "Refunds_to_Individuals", "type": "integer"}, {"name": "Refunds_to_Committees", "type": "integer"}]}}, {"name": "population.json", "type": "table", "description": "United States population statistics by sex and age group across decades between 1850 and 2000. \nThe dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census \nmicrodata\" from as early as 1790.\n\nIPUMS updates and revises datasets over time, which may result in discrepancies between this \ndataset and current IPUMS data. Details on data revisions are available here.\n\nWhen using this dataset, please refer to IPUMS USA terms of use. 
The organization requests the \nuse of the following citation for this json file:\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated \nPublic Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. \nhttp://doi.org/10.18128/D010.V6.0\n", "sources": [{"title": "IPUMS USA", "path": "https://usa.ipums.org/usa/"}], "path": "population.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 27665, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer", "description": "Four-digit year of the survey"}, {"name": "age", "type": "integer", "description": "Age group in 5-year intervals (0=0-4, 5=5-9, 10=10-14, ..., 90=90+)"}, {"name": "sex", "type": "integer", "description": "Sex (1=men, 2=women)"}, {"name": "people", "type": "integer", "description": "Number of individuals (IPUMS PERWT)"}]}}, {"name": "population_engineers_hurricanes.csv", "type": "table", "description": "Per-state data on population, number of engineers, and hurricanes. Used in Vega-Lite example,\n[Three Choropleths Representing Disjoint Data from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html)", "sources": [{"title": "Bureau of Labor Statistics", "path": "https://www.bls.gov/oes/tables.htm"}, {"title": "American Community Survey", "path": "https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_07_3YR_S1901&prodType=table"}, {"title": "NOAA National Climatic Data Center", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/records"}], "path": "population_engineers_hurricanes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 1852, "schema": {"fields": [{"name": "state", "type": "string"}, {"name": "id", "type": "integer"}, {"name": "population", "type": "integer"}, {"name": "engineers", "type": "number"}, {"name": "hurricanes", "type": "integer"}]}}, {"name": "seattle-weather-hourly-normals.csv", "type": "table", "description": "Hourly weather normals with metric units. The 1981-2010 Climate Normals are \nNCDC's three-decade averages of climatological variables, including temperature and \nprecipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf).\nWe only included temperature, wind, and pressure \nand updated the format to be easier to parse.", "sources": [{"title": "NOAA National Climatic Data Center (NCDC)", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/normals"}], "path": "seattle-weather-hourly-normals.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 311148, "schema": {"fields": [{"name": "date", "type": "datetime"}, {"name": "pressure", "type": "number"}, {"name": "temperature", "type": "number"}, {"name": "wind", "type": "number"}]}}, {"name": "seattle-weather.csv", "type": "table", "description": "Daily weather records with metric units. Transformed using `/scripts/weather.py`. \nThe categorical \"weather\" field is synthesized from multiple fields in the original dataset. 
\nThis data is intended for instructional purposes.", "sources": [{"title": "NOAA National Climatic Data Center", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/records"}], "path": "seattle-weather.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 48219, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "precipitation", "type": "number"}, {"name": "temp_max", "type": "number"}, {"name": "temp_min", "type": "number"}, {"name": "wind", "type": "number"}, {"name": "weather", "type": "string"}]}}, {"name": "sp500-2000.csv", "type": "table", "description": "S&P 500 index values from 2000 to 2020.", "sources": [{"title": "Yahoo Finance", "path": "https://finance.yahoo.com/quote/%5EDJI/history/"}], "path": "sp500-2000.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 415968, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "open", "type": "number"}, {"name": "high", "type": "number"}, {"name": "low", "type": "number"}, {"name": "close", "type": "number"}, {"name": "adjclose", "type": "number"}, {"name": "volume", "type": "integer"}]}}, {"name": "sp500.csv", "type": "table", "path": "sp500.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 2305, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "price", "type": "number"}]}}, {"name": "stocks.csv", "type": "table", "path": "stocks.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 12245, "schema": {"fields": [{"name": "symbol", "type": "string"}, {"name": "date", "type": "string"}, {"name": "price", "type": "number"}]}}, {"name": "udistrict.json", "type": "table", "path": "udistrict.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 6460, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "key", "type": "string"}, {"name": "lat", "type": "number"}]}}, {"name": "unemployment-across-industries.json", "type": "table", "description": "Industry-level unemployment statistics from the Current Population Survey \n(CPS), published monthly by the U.S. Bureau of Labor Statistics. Includes unemployed persons \nand unemployment rate across 11 private industries, as well as agricultural, government, and \nself-employed workers. Covers January 2000 through February 2010. Industry classification \nfollows format of CPS Table A-31.\n\nThe dataset can be replicated using the BLS API. For more, see the `scripts` folder of this \nrepository.\n\nThe BLS Web site states:\n> \"Users of the public API should cite the date that data were accessed or retrieved using \n> the API. Users must clearly state that \"BLS.gov cannot vouch for the data or analyses \n> derived from these data after the data have been retrieved from BLS.gov.\" The BLS.gov logo \n> may not be used by persons who are not BLS employees or on products (including web pages) \n> that are not BLS-sponsored.\"\n\nSee full BLS [terms of service](https://www.bls.gov/developers/termsOfService.htm).", "sources": [{"title": "U.S. 
Census Bureau Current Population Survey", "path": "https://www.census.gov/programs-surveys/cps.html"}, {"title": "BLS LAUS Data Tools", "path": "https://www.bls.gov/lau/data.htm"}, {"title": "Bureau of Labor Statistics Table A-31", "path": "https://www.bls.gov/web/empsit/cpseea31.htm"}], "path": "unemployment-across-industries.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 185641, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "series", "type": "string", "description": "Industry name"}, {"name": "year", "type": "integer", "description": "Year (2000-2010)"}, {"name": "month", "type": "integer", "description": "Month (1-12)"}, {"name": "count", "type": "integer", "description": "Number of unemployed persons (in thousands)"}, {"name": "rate", "type": "number", "description": "Unemployment rate (percentage)"}, {"name": "date", "type": "datetime", "description": "ISO 8601-formatted date string (e.g., \"2000-01-01T08:00:00.000Z\")"}]}}, {"name": "unemployment.tsv", "type": "table", "description": "This dataset contains county-level unemployment rates in the United States, with data generally\nconsistent with levels reported in 2009. The dataset is structured as tab-separated values.\nThe unemployment rate represents the number of unemployed persons as a percentage of the labor\nforce. According to the Bureau of Labor Statistics (BLS) glossary:\n\nUnemployed persons (Current Population Survey) [are] persons aged 16 years and older who had\nno employment during the reference week, were available for work, except for temporary\nillness, and had made specific efforts to find employment sometime during the 4-week period\nending with the reference week. Persons who were waiting to be recalled to a job from which\nthey had been laid off need not have been looking for work to be classified as unemployed.\n\nThis dataset is derived from the [Local Area Unemployment Statistics (LAUS)](https://www.bls.gov/lau/) program, \na federal-state cooperative effort overseen by the Bureau of Labor Statistics (BLS). \nThe LAUS program produces monthly and annual employment, unemployment, and labor force data for census regions and divisions,\nstates, counties, metropolitan areas, and many cities and towns.\n\nFor the most up-to-date LAUS data:\n1. **Monthly and Annual Data Downloads**:\n- Visit the [LAUS Data Tools](https://www.bls.gov/lau/data.htm) page for [monthly](https://www.bls.gov/lau/tables.htm#mcounty) \nand [annual](https://www.bls.gov/lau/tables.htm#cntyaa) county data.\n2. 
**BLS Public Data API**:\n- The BLS provides an API for developers to access various datasets, including LAUS data.\n- To use the API for LAUS data, refer to the [LAUS Series ID Formats](https://www.bls.gov/help/hlpforma.htm#LA) to construct your query.\n- API documentation and examples are available on the BLS Developers page.\n\nWhen using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm).", "sources": [{"title": "BLS Developers API", "path": "https://www.bls.gov/developers/"}, {"title": "BLS Handbook of Methods", "path": "https://www.bls.gov/opub/hom/lau/home.htm"}], "path": "unemployment.tsv", "scheme": "file", "format": "tsv", "mediatype": "text/tsv", "encoding": "utf-8", "bytes": 34739, "dialect": {"csv": {"delimiter": "\t"}}, "schema": {"fields": [{"name": "id", "type": "integer", "description": "The combined state and county FIPS code"}, {"name": "rate", "type": "number", "description": "The unemployment rate for the county"}]}}, {"name": "uniform-2d.json", "type": "table", "path": "uniform-2d.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 34217, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "u", "type": "number"}, {"name": "v", "type": "number"}]}}, {"name": "us-10m.json", "type": "json", "path": "us-10m.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "bytes": 642361}, {"name": "us-employment.csv", "type": "table", "description": "In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job \nlosses across the United States. The downturn in employment, and the slow recovery in hiring that \nfollowed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau \nof Labor Statistics.\n\nThis file contains the monthly employment total in a variety of job categories from January 2006 \nthrough December 2015. The numbers are seasonally adjusted and reported in thousands. The data \nwere downloaded on Nov. 11, 2018, and reformatted for use in this library.\n\nTotals are included for the [22 \"supersectors\"](https://download.bls.gov/pub/time.series/ce/ce.supersector)\ntracked by the BLS. The \"nonfarm\" total is the category typically used by \neconomists and journalists as a stand-in for the country's employment total.\n\nA calculated \"nonfarm_change\" column has been appended with the month-to-month change in that \nsupersector's employment. It is useful for illustrating how to make bar charts that report both \nnegative and positive values.\n", "sources": [{"title": "U.S. 
Bureau of Labor Statistics Current Employment Statistics", "path": "https://www.bls.gov/ces/"}], "path": "us-employment.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 17841, "schema": {"fields": [{"name": "month", "type": "date"}, {"name": "nonfarm", "type": "integer"}, {"name": "private", "type": "integer"}, {"name": "goods_producing", "type": "integer"}, {"name": "service_providing", "type": "integer"}, {"name": "private_service_providing", "type": "integer"}, {"name": "mining_and_logging", "type": "integer"}, {"name": "construction", "type": "integer"}, {"name": "manufacturing", "type": "integer"}, {"name": "durable_goods", "type": "integer"}, {"name": "nondurable_goods", "type": "integer"}, {"name": "trade_transportation_utilties", "type": "integer"}, {"name": "wholesale_trade", "type": "number"}, {"name": "retail_trade", "type": "number"}, {"name": "transportation_and_warehousing", "type": "number"}, {"name": "utilities", "type": "number"}, {"name": "information", "type": "integer"}, {"name": "financial_activities", "type": "integer"}, {"name": "professional_and_business_services", "type": "integer"}, {"name": "education_and_health_services", "type": "integer"}, {"name": "leisure_and_hospitality", "type": "integer"}, {"name": "other_services", "type": "integer"}, {"name": "government", "type": "integer"}, {"name": "nonfarm_change", "type": "integer"}]}}, {"name": "us-state-capitals.json", "type": "table", "path": "us-state-capitals.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 3869, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "lon", "type": "number"}, {"name": "lat", "type": "number"}, {"name": "state", "type": "string"}, {"name": "city", "type": "string"}]}}, {"name": "volcano.json", "type": "json", "description": "Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. \nThis data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a \ntopographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate.", "sources": [{"title": "R Datasets", "path": "https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.html"}], "path": "volcano.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 21167}, {"name": "weather.csv", "type": "table", "description": "NOAA data transformed using `/scripts/weather.py`. Categorical \"weather\" field synthesized \nfrom multiple fields in the original dataset. 
This data is intended for instructional purposes.", "sources": [{"title": "NOAA Climate Data Online", "path": "http://www.ncdc.noaa.gov/cdo-web/datatools/findstation"}], "path": "weather.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 121417, "schema": {"fields": [{"name": "location", "type": "string"}, {"name": "date", "type": "date"}, {"name": "precipitation", "type": "number"}, {"name": "temp_max", "type": "number"}, {"name": "temp_min", "type": "number"}, {"name": "wind", "type": "number"}, {"name": "weather", "type": "string"}]}}, {"name": "weekly-weather.json", "type": "json", "description": "Instructional dataset showing actual and predicted temperature data.\n\n> [!IMPORTANT]\n> Named `weather.json` in previous versions (`v1.4.0` - `v2.11.0`).\n", "path": "weekly-weather.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 1281}, {"name": "wheat.json", "type": "table", "description": "In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair),\na Scottish engineer who is often credited as the founder of statistical graphics, \npublished an elegant chart on the price of wheat. It plots 250 years of prices alongside \nweekly wages and the reigning monarch. He intended to demonstrate that:\n> \"never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time.\"\n", "sources": [{"title": "1822 Playfair Chart", "path": "http://dh101.humanities.ucla.edu/wp-content/uploads/2014/08/Vis_2.jpg"}], "path": "wheat.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "bytes": 2085, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer"}, {"name": "wheat", "type": "number"}, {"name": "wages", "type": "number"}]}}, {"name": "windvectors.csv", "type": "table", "description": "Simulated wind patterns over northwestern Europe.", "path": "windvectors.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 129253, "schema": {"fields": [{"name": "longitude", "type": "number"}, {"name": "latitude", "type": "number"}, {"name": "dir", "type": "integer"}, {"name": "dirCat", "type": "integer"}, {"name": "speed", "type": "number"}]}}, {"name": "world-110m.json", "type": "json", "path": "world-110m.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "bytes": 119410}, {"name": "zipcodes.csv", "type": "table", "description": "GeoNames.org", "sources": [{"title": "GeoNames", "path": "https://www.geonames.org"}], "path": "zipcodes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "bytes": 2018388, "schema": {"fields": [{"name": "zip_code", "type": "integer"}, {"name": "latitude", "type": "number"}, {"name": "longitude", "type": "number"}, {"name": "city", "type": "string"}, {"name": "state", "type": "string"}, {"name": "county", "type": "string"}]}}]} \ No newline at end of file +{"name": "vega-datasets", "description": "Common repository for example datasets used by Vega related projects. \nBSD-3-Clause license applies only to package code and infrastructure. Users should verify their use of datasets \ncomplies with the license terms of the original sources. 
Dataset license information, where included, \nis a reference starting point only and is provided without any warranty of accuracy or completeness.\n", "homepage": "http://github.com/vega/vega-datasets.git", "licenses": [{"name": "BSD-3-Clause", "path": "https://opensource.org/license/bsd-3-clause", "title": "The 3-Clause BSD License"}], "contributors": [{"title": "UW Interactive Data Lab", "path": "http://idl.cs.washington.edu"}, {"title": "vega-datasets contributors", "path": "https://github.com/vega/vega-datasets/graphs/contributors"}], "version": "2.11.0", "created": "2025-01-12T14:23:04.938086+00:00", "resources": [{"name": "7zip.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "7zip.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "hash": "sha1:6586d6c00887cd48850099c174a42bb1677ade0c", "bytes": 3969}, {"name": "airports.csv", "type": "table", "path": "airports.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:608ba6d51fa70584c3fa1d31eb94533302553838", "bytes": 210365, "schema": {"fields": [{"name": "iata", "type": "string"}, {"name": "name", "type": "string"}, {"name": "city", "type": "string"}, {"name": "state", "type": "string"}, {"name": "country", "type": "string"}, {"name": "latitude", "type": "number"}, {"name": "longitude", "type": "number"}]}}, {"name": "annual-precip.json", "type": "json", "description": "A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell.", "sources": [{"title": "Climate Forecast System Version 2", "path": "https://www.ncdc.noaa.gov/data-access/model-data/model-datasets/climate-forecast-system-version2-cfsv2"}], "path": "annual-precip.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:719e73406cfc08f16dda651513ae1113edd75845", "bytes": 266265}, {"name": "anscombe.json", "type": "table", "description": "Graphs in Statistical Analysis, F. J. Anscombe, The American Statistician.", "path": "anscombe.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:11ae97090b6263bdf0c8661156a44a5b782e0787", "bytes": 1703, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Series", "type": "string"}, {"name": "X", "type": "integer"}, {"name": "Y", "type": "number"}]}}, {"name": "barley.json", "type": "table", "description": "The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites.\n\nIt was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper \"Statistical Determination of Barley Varietal Adaption\".\n\nR.A. 
Fisher popularized its use in the field of statistics when he included it in his book \"The Design of Experiments\".\n\nSince then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s.\n", "sources": [{"title": "The Design of Experiments Reference", "path": "https://en.wikipedia.org/wiki/The_Design_of_Experiments"}, {"title": "Trellis Charts Paper", "path": "http://ml.stat.purdue.edu/stat695t/writings/TrellisDesignControl.pdf"}], "path": "barley.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:8dc50de2509b6e197ce95c24c98f90d9d1ab138c", "bytes": 8487, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "yield", "type": "number"}, {"name": "variety", "type": "string"}, {"name": "year", "type": "integer"}, {"name": "site", "type": "string"}]}}, {"name": "birdstrikes.csv", "type": "table", "description": "Records of reported wildlife strikes received by the U.S. FAA", "sources": [{"title": "FAA Wildlife Strike Database", "path": "http://wildlife.faa.gov"}], "path": "birdstrikes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:1b8b190c9bc02ef7bcbfe5a8a70f61b1616d3f6c", "bytes": 1223329, "schema": {"fields": [{"name": "Airport Name", "type": "string"}, {"name": "Aircraft Make Model", "type": "string"}, {"name": "Effect Amount of damage", "type": "string"}, {"name": "Flight Date", "type": "date"}, {"name": "Aircraft Airline Operator", "type": "string"}, {"name": "Origin State", "type": "string"}, {"name": "Phase of flight", "type": "string"}, {"name": "Wildlife Size", "type": "string"}, {"name": "Wildlife Species", "type": "string"}, {"name": "Time of day", "type": "string"}, {"name": "Cost Other", "type": "integer"}, {"name": "Cost Repair", "type": "integer"}, {"name": "Cost Total $", "type": "integer"}, {"name": "Speed IAS in knots", "type": "integer"}]}}, {"name": "budget.json", "type": "table", "description": "Historical and forecasted federal revenue/receipts produced in 2016 by the U.S. 
Office of Management and Budget.", "sources": [{"title": "Office of Management and Budget - Budget FY 2016 - Receipts", "path": "https://www.govinfo.gov/app/details/BUDGET-2016-DB/BUDGET-2016-DB-3"}], "path": "budget.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:5b18c08b28fb782f54ca98ce6a1dd220f269adf1", "bytes": 391353, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Source Category Code", "type": "integer"}, {"name": "Source category name", "type": "string"}, {"name": "Source subcategory", "type": "integer"}, {"name": "Source subcategory name", "type": "string"}, {"name": "Agency code", "type": "integer"}, {"name": "Agency name", "type": "string"}, {"name": "Bureau code", "type": "integer"}, {"name": "Bureau name", "type": "string"}, {"name": "Account code", "type": "integer"}, {"name": "Account name", "type": "string"}, {"name": "Treasury Agency code", "type": "integer"}, {"name": "On- or off-budget", "type": "string"}, {"name": "1962", "type": "string"}, {"name": "1963", "type": "string"}, {"name": "1964", "type": "string"}, {"name": "1965", "type": "string"}, {"name": "1966", "type": "string"}, {"name": "1967", "type": "string"}, {"name": "1968", "type": "string"}, {"name": "1969", "type": "string"}, {"name": "1970", "type": "string"}, {"name": "1971", "type": "string"}, {"name": "1972", "type": "string"}, {"name": "1973", "type": "string"}, {"name": "1974", "type": "string"}, {"name": "1975", "type": "string"}, {"name": "1976", "type": "string"}, {"name": "TQ", "type": "string"}, {"name": "1977", "type": "string"}, {"name": "1978", "type": "string"}, {"name": "1979", "type": "string"}, {"name": "1980", "type": "string"}, {"name": "1981", "type": "string"}, {"name": "1982", "type": "string"}, {"name": "1983", "type": "string"}, {"name": "1984", "type": "string"}, {"name": "1985", "type": "string"}, {"name": "1986", "type": "string"}, {"name": "1987", "type": "string"}, {"name": "1988", "type": "string"}, {"name": "1989", "type": "string"}, {"name": "1990", "type": "string"}, {"name": "1991", "type": "string"}, {"name": "1992", "type": "string"}, {"name": "1993", "type": "string"}, {"name": "1994", "type": "string"}, {"name": "1995", "type": "string"}, {"name": "1996", "type": "string"}, {"name": "1997", "type": "string"}, {"name": "1998", "type": "string"}, {"name": "1999", "type": "string"}, {"name": "2000", "type": "string"}, {"name": "2001", "type": "string"}, {"name": "2002", "type": "string"}, {"name": "2003", "type": "string"}, {"name": "2004", "type": "string"}, {"name": "2005", "type": "string"}, {"name": "2006", "type": "string"}, {"name": "2007", "type": "string"}, {"name": "2008", "type": "string"}, {"name": "2009", "type": "string"}, {"name": "2010", "type": "string"}, {"name": "2011", "type": "string"}, {"name": "2012", "type": "string"}, {"name": "2013", "type": "string"}, {"name": "2014", "type": "string"}, {"name": "2015", "type": "string"}, {"name": "2016", "type": "string"}, {"name": "2017", "type": "string"}, {"name": "2018", "type": "string"}, {"name": "2019", "type": "string"}, {"name": "2020", "type": "string"}]}}, {"name": "budgets.json", "type": "table", "path": "budgets.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:8a909e24f698a3b0f6c637c30ec95e7e17df7ef6", "bytes": 18079, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "budgetYear", "type": "integer"}, {"name": "forecastYear", "type": "integer"}, 
{"name": "value", "type": "number"}]}}, {"name": "burtin.json", "type": "table", "description": "The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine.\n\nThe dataset compares the performance of three antibiotics against 16 different bacteria.\n\nNumerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness.\n\nThe dataset was featured as an example in the Protovis project, a precursor to D3.js.\n\nAs noted in the Protovis example, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 \u03bcg/ml, and an exaggeration of some values for penicillin\".\n\nThe vega-datsets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together.\n\nThe caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) \nreads as follows:\n\n> ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin\n>\n>\n> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in red and gram- in blue) with their sensitivities to penicillin, and streptomycin.\n>\n> The effectiveness of the antibiotics is expressed as the highest dilution in \u03bc/ml. which inhibits the test organism.\n>\n> High dilutions are toward the periphery; consequently the length of the colored bar is proportional to the effectiveness.\n>\n> It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. vulgaris, S. schottmuelleri and M. tuberculosis.\n>\n> Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to neomycin, although the majority of these are sensitive to neomycin.\n>\n> It also inhibits actinomycetes, but is inactive against viruses and fungi. 
Its mode of action is not understood.\n", "sources": [{"title": "Scope Magazine", "path": "https://graphicdesignarchives.org/projects/scope-magazine-vol-iii-5/"}, {"title": "Protovis Antibiotics Example", "path": "https://mbostock.github.io/protovis/ex/antibiotics-burtin.html"}], "path": "burtin.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d8a82abaad7dba4f9cd8cee402ba3bf07e70d0e4", "bytes": 2743, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Bacteria", "type": "string"}, {"name": "Penicillin", "type": "number"}, {"name": "Streptomycin", "type": "number"}, {"name": "Neomycin", "type": "number"}, {"name": "Gram_Staining", "type": "string"}, {"name": "Genus", "type": "string"}]}}, {"name": "cars.json", "type": "table", "description": "Collection of car specifications and performance metrics from various automobile manufacturers.", "sources": [{"title": "StatLib Datasets Archive", "path": "http://lib.stat.cmu.edu/datasets/"}], "path": "cars.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:1d56d3fa6da01af9ece2d6397892fe5bb6f47c3d", "bytes": 100492, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Name", "type": "string"}, {"name": "Miles_per_Gallon", "type": "integer"}, {"name": "Cylinders", "type": "integer"}, {"name": "Displacement", "type": "number"}, {"name": "Horsepower", "type": "integer"}, {"name": "Weight_in_lbs", "type": "integer"}, {"name": "Acceleration", "type": "number"}, {"name": "Year", "type": "date"}, {"name": "Origin", "type": "string"}]}}, {"name": "co2-concentration.csv", "type": "table", "description": "Scripps CO2 program data, modified to only include date, CO2, and seasonally adjusted CO2. \nOnly includes rows with valid data.", "sources": [{"title": "Scripps CO2 Program", "path": "https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record"}], "path": "co2-concentration.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:b8715cbd2a8d0c139020a73fdb4d231f8bde193a", "bytes": 18547, "schema": {"fields": [{"name": "Date", "type": "date"}, {"name": "CO2", "type": "number"}, {"name": "adjusted CO2", "type": "number"}]}}, {"name": "countries.json", "type": "table", "description": "This dataset combines key demographic indicators (life expectancy at birth and\nfertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year\nintervals. It includes both current values and adjacent time period values (previous and next)\nfor each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) \nnotes that its philosophy is to fill data gaps with estimates and use current\ngeographic boundaries for historical data. 
Gapminder states that it aims to \"show people the\nbig picture\" rather than support detailed numeric analysis.", "licenses": [{"title": "Creative Commons Attribution 4.0 International", "path": "https://www.gapminder.org/free-material/"}], "sources": [{"title": "Gapminder Foundation - Life Expectancy", "path": "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Fertility", "path": "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676", "version": "14"}], "path": "countries.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:0070959b7f1a09475baa5099098240ae81026e72", "bytes": 99457, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "_comment", "type": "string"}, {"name": "year", "type": "integer", "description": "Years from 1955 to 2000 at 5-year intervals"}, {"name": "fertility", "type": "number", "description": "Fertility rate (average number of children per woman) for the given year"}, {"name": "life_expect", "type": "number", "description": "Life expectancy in years for the given year"}, {"name": "n_fertility", "type": "number", "description": "Fertility rate for the next 5-year interval"}, {"name": "n_life_expect", "type": "number", "description": "Life expectancy for the next 5-year interval"}, {"name": "country", "type": "string", "description": "Name of the country"}]}}, {"name": "crimea.json", "type": "table", "description": "This dataset, which informed Florence Nightingale's groundbreaking work in public health, details \nmonthly mortality rates from British military hospitals during the Crimean War (1854-1856). \n\nNightingale credits Dr. William Farr for compiling the data from the 1858 [Medical and Surgical \nHistory of the British Army](http://resource.nlm.nih.gov/62510370R). The dataset categorizes \ndeaths into \"zymotic\" diseases (preventable infectious diseases), wounds/injuries, and other causes. \nCovering the period from April 1854 to March 1856, the dataset includes monthly army strength \nalongside mortality figures. Nightingale transformed this data into her now-famous [polar area \ndiagrams](https://iiif.lib.harvard.edu/manifests/view/drs:7420433$25i). \n\nThe annual mortality rates plotted in the chart can be calculated from the dataset using the formula \n> (Deaths × 1000 × 12) ÷ Army Size. \n\nAs [The Lancet](https://pmc.ncbi.nlm.nih.gov/articles/PMC7252134/) argued in 2020, Nightingale's \ninnovative visualizations proved that \"far more men died of disease, infection, and exposure \nthan in battle\u2014a fact that shocked the British nation.\" Her work also vividly illustrated \nthe dramatic impact of sanitary reforms, particularly in reducing preventable deaths.", "sources": [{"title": "Nightingale, Florence. A contribution to the sanitary history of the British army during the late war with Russia. London : John W. Parker and Son, 1859. Table II. 
Table showing the Estimated Average Monthly Strength of the Army; and the Deaths and Annual Rate of Mortality per 1,000 in each month, from April 1854, to March 1856 (inclusive), in the Hospitals of the Army in the East.\n", "path": "https://nrs.lib.harvard.edu/urn-3:hms.count:1177146?n=21"}], "path": "crimea.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d2df500c612051a21fe324237a465a62d5fe01b6", "bytes": 2183, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date", "description": "First day of each month during the observation period, in ISO 8601 format (YYYY-MM-DD)"}, {"name": "wounds", "type": "integer", "description": "Deaths from \"Wounds and Injuries\" which comprised: Luxatio (dislocation), Sub-Luxatio (partial dislocation), Vulnus Sclopitorum (gunshot wounds), Vulnus Incisum (incised wounds), Contusio (bruising), Fractura (fractures), Ambustio (burns) and Concussio-Cerebri (brain concussion)\n"}, {"name": "other", "type": "integer", "description": "Deaths from All Other Causes"}, {"name": "disease", "type": "integer", "description": "Deaths from Zymotic Diseases (preventable infectious diseases)"}, {"name": "army_size", "type": "integer", "description": "Estimated Average Monthly Strength of the Army"}]}}, {"name": "disasters.csv", "type": "table", "description": "Annual number of deaths from disasters.", "sources": [{"title": "Our World in Data - Natural Catastrophes", "path": "https://ourworldindata.org/natural-catastrophes"}], "path": "disasters.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0584ed86190870b0089d9ea67c94f3dd3feb0ec8", "bytes": 18840, "schema": {"fields": [{"name": "Entity", "type": "string"}, {"name": "Year", "type": "integer"}, {"name": "Deaths", "type": "integer"}]}}, {"name": "driving.json", "type": "table", "sources": [{"title": "New York Times", "path": "https://archive.nytimes.com/www.nytimes.com/imagepages/2010/05/02/business/02metrics.html"}], "path": "driving.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:33d0afc57fb1005e69cd3e8a6c77a26670d91979", "bytes": 3461, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "side", "type": "string"}, {"name": "year", "type": "integer"}, {"name": "miles", "type": "integer"}, {"name": "gas", "type": "number"}]}}, {"name": "earthquakes.json", "type": "json", "description": "Earthquake data retrieved Feb 6, 2018", "sources": [{"title": "USGS Earthquake Feed", "path": "https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson"}], "path": "earthquakes.json", "scheme": "file", "format": "geojson", "mediatype": "text/geojson", "encoding": "utf-8", "hash": "sha1:ed4c47436c09d5cc5f428c233fbd8074c0346fd0", "bytes": 1219853}, {"name": "ffox.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "ffox.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "hash": "sha1:0691709484a75e9d8ee55a22b1980d67d239c2c4", "bytes": 17628}, {"name": "flare-dependencies.json", "type": "table", "path": "flare-dependencies.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:10bbe538daaa34014cd5173b331f7d3c10bfda49", "bytes": 34600, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "source", "type": "integer"}, {"name": "target", "type": "integer"}]}}, {"name": 
"flare.json", "type": "table", "path": "flare.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d232ea60f875de87a7d8fc414876e19356a98b6b", "bytes": 20638, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "id", "type": "integer"}, {"name": "name", "type": "string"}]}}, {"name": "flights-10k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-10k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:769a34f3d0442be8f356651463fe925ad8b3759d", "bytes": 892400, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-200k.arrow", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-200k.arrow", "scheme": "file", "format": ".arrow", "mediatype": "application/vnd.apache.arrow.file", "hash": "sha1:74f6b3cf8b779e3ff204be2f5a9762763d50a095", "bytes": 1600864, "schema": {"fields": [{"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "time", "type": "number"}]}}, {"name": "flights-200k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-200k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4722e02637cf5f38ad9ea5d1f48cae7872dce22d", "bytes": 9863892, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "time", "type": "number"}]}}, {"name": "flights-20k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-20k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:20c920b46db4f664bed3e1420b8348527cd7c41e", "bytes": 1784867, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-2k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. 
Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-2k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d9221dc7cd477209bf87e680be3c881d8fee53cd", "bytes": 178495, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-3m.parquet", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-3m.parquet", "scheme": "file", "format": "parquet", "mediatype": "application/parquet", "hash": "sha1:9c4e0b480a1a60954a7e5c6bcc43e1c91a73caaa", "bytes": 13493022, "schema": {"fields": [{"name": "date", "type": "datetime"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-5k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-5k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:8459fa09e3ba8197928b5dba0b9f5cc380629758", "bytes": 446167, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-airport.csv", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-airport.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0ba03114891e97cfc3f83d9e3569259e7f07af7b", "bytes": 65572, "schema": {"fields": [{"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}, {"name": "count", "type": "integer"}]}}, {"name": "football.json", "type": "table", "description": "Football match outcomes across multiple divisions from 2013 to 2017, part of a\nlarger dataset from OpenFootball. 
The subset was made such that there are records for all five\nchosen divisions over the time period.", "sources": [{"title": "OpenFootball", "path": "https://github.com/openfootball/football.json"}], "path": "football.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d07898748997b9716ae699e9c2d5b91b4bb48a51", "bytes": 1207180, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "division", "type": "string"}, {"name": "home_team", "type": "string"}, {"name": "away_team", "type": "string"}, {"name": "home_score", "type": "integer"}, {"name": "away_score", "type": "integer"}]}}, {"name": "gapminder-health-income.csv", "type": "table", "description": "Per-capita income, life expectancy, population and regional grouping. Dataset does not specify \nthe reference year for the data. Gapminder historical data is subject to revisions.\n\nGapminder (v30, 2023) defines per-capita income as follows:\n>\"This is real GDP per capita (gross domestic product per person adjusted for inflation) \n>converted to international dollars using purchasing power parity rates. An international dollar \n>has the same purchasing power over GDP as the U.S. dollar has in the United States.\"\n", "licenses": [{"title": "Creative Commons Attribution 4.0 International", "path": "https://www.gapminder.org/free-material/"}], "sources": [{"title": "Gapminder Foundation", "path": "https://www.gapminder.org"}, {"title": "Gapminder GDP Per Capita Data", "path": "https://docs.google.com/spreadsheets/d/1i5AEui3WZNZqh7MQ4AKkJuCz4rRxGR_pw_9gtbcBOqQ/edit?gid=501532268#gid=501532268"}], "path": "gapminder-health-income.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:abce37a932917085023a345b1a004396e9355ac3", "bytes": 8605, "schema": {"fields": [{"name": "country", "type": "string"}, {"name": "income", "type": "integer"}, {"name": "health", "type": "number"}, {"name": "population", "type": "integer"}, {"name": "region", "type": "string"}]}}, {"name": "gapminder.json", "type": "table", "description": "This dataset combines key demographic indicators (life expectancy at birth, \npopulation, and fertility rate measured as babies per woman) for various countries from 1955 \nto 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable \ngrouping countries. Gapminder's data documentation notes that its philosophy is to fill data \ngaps with estimates and use current geographic boundaries for historical data. Gapminder \nstates that it aims to \"show people the big picture\" rather than support detailed numeric \nanalysis.\n\nNotes:\n1. Country Selection: The set of countries in this file matches the version of this dataset \n originally added to this collection in 2015. The specific criteria for country selection \n in that version are not known. Data for Aruba are no longer available in the new version. \n Hong Kong has been revised to Hong Kong, China in the new version.\n\n2. Data Precision: The precision of float values may have changed from the original version. \n These changes reflect the most recent source data used for each indicator.\n\n3. Regional Groupings: The 'cluster' column represents a regional mapping of countries \n corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. 
To \n   preserve continuity with previous versions of this dataset, we have retained the column \n   name 'cluster' instead of renaming it to 'six_regions'. The six regions represented are: \n   `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`.", "sources": [{"title": "Gapminder Foundation - Life Expectancy (Data)", "path": "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Life Expectancy (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd004/"}, {"title": "Gapminder Foundation - Population (Data)", "path": "https://docs.google.com/spreadsheets/d/1c1luQNdpH90tNbMIeU7jD__59wQ0bdIGRFpbMm8ZBTk/edit?gid=176703676#gid=176703676", "version": "7"}, {"title": "Gapminder Foundation - Population (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd003/"}, {"title": "Gapminder Foundation - Fertility (Data)", "path": "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Fertility Documentation (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd008/"}, {"title": "Gapminder Foundation - Data Geographies (Data)", "path": "https://docs.google.com/spreadsheets/d/1qHalit8sXC0R8oVXibc2wa2gY7bkwGzOybEMTWp-08o/edit?gid=1597424158#gid=1597424158", "version": "2"}, {"title": "Gapminder Foundation - Data Geographies (Documentation)", "path": "https://www.gapminder.org/data/geo/"}, {"title": "Gapminder Data Documentation", "path": "https://www.gapminder.org/data/documentation/"}], "path": "gapminder.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:8cb2f0fc23ce612e5f0c7bbe3dcac57f6764b7b3", "bytes": 75201, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer", "description": "Years from 1955 to 2005 at 5-year intervals"}, {"name": "country", "type": "string", "description": "Name of the country"}, {"name": "cluster", "type": "integer", "description": "A categorical variable (values 0-5) grouping countries by region"}, {"name": "pop", "type": "integer", "description": "Population of the country"}, {"name": "life_expect", "type": "number", "description": "Life expectancy in years"}, {"name": "fertility", "type": "number", "description": "Fertility rate (average number of children per woman)"}]}}, {"name": "gimp.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "gimp.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "hash": "sha1:cf0505dd72eb52558f6f71bd6f43663df4f2f82c", "bytes": 8211}, {"name": "github.csv", "type": "table", "description": "Generated using `/scripts/github.py`.", "path": "github.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:18547064dd687c328ea2fb5023cae6417ca6f050", "bytes": 21059, "schema": {"fields": [{"name": "time", "type": "string"}, {"name": "count", "type": "integer"}]}}, {"name": "global-temp.csv", "type": "table", "description": "Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023.", "sources": [{"title": "NASA Goddard Institute for Space Studies", "path": "https://data.giss.nasa.gov/gistemp/"}], "path": 
"global-temp.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:01a4f05ed45ce939307dcd9bc4e75ed5cd1ab202", "bytes": 1663, "schema": {"fields": [{"name": "year", "type": "integer"}, {"name": "temp", "type": "number"}]}}, {"name": "income.json", "type": "table", "path": "income.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:ebfd02fd584009ee391bfc5d33972e4c94f507ab", "bytes": 72771, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "region", "type": "string"}, {"name": "id", "type": "integer"}, {"name": "pct", "type": "number"}, {"name": "total", "type": "integer"}, {"name": "group", "type": "string"}]}}, {"name": "iowa-electricity.csv", "type": "table", "description": "The state of Iowa has dramatically increased its production of renewable \nwind power in recent years. This file contains the annual net generation of electricity in \nthe state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. \nIt is useful for illustrating stacked area charts.", "sources": [{"title": "U.S. Energy Information Administration", "path": "https://www.eia.gov/beta/electricity/data/browser/#/topic/0?agg=2,0,1&fuel=vvg&geo=00000g&sec=g&linechart=ELEC.GEN.OTH-IA-99.A~ELEC.GEN.COW-IA-99.A~ELEC.GEN.PEL-IA-99.A~ELEC.GEN.PC-IA-99.A~ELEC.GEN.NG-IA-99.A~~ELEC.GEN.NUC-IA-99.A~ELEC.GEN.HYC-IA-99.A~ELEC.GEN.AOR-IA-99.A~ELEC.GEN.HPS-IA-99.A~&columnchart=ELEC.GEN.ALL-IA-99.A&map=ELEC.GEN.ALL-IA-99.A&freq=A&start=2001&end=2017&ctype=linechart<ype=pin&tab=overview&maptype=0&rse=0&pin="}], "path": "iowa-electricity.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:214238f23d7a57e3398f4e9f1e87e61abb23cafc", "bytes": 1531, "schema": {"fields": [{"name": "year", "type": "date"}, {"name": "source", "type": "string"}, {"name": "net_generation", "type": "integer"}]}}, {"name": "jobs.json", "type": "table", "description": "U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790.\n\nOriginally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Vi\u00e9gas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). \nThe dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/).\n\nData is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of IPUMS USA, according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions).\n\nIPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, stating:\n>We're excited to hear that this dataset made its way to this repository and is being used by students for data visualization. 
We allow for these types of redistributions of summary data so long as the underlying microdata records are not shared.\n\nThis dataset contains only summary statistics and does not include any underlying microdata records.\n\n1. This dataset represents summary data. The underlying microdata records are not included.\n2. Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) \n(person weight) variable as an expansion factor when working with IPUMS USA extracts.\n3. Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly.\n\nWhen using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml).\nThe organization requests use of the following citation for this json file:\n\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0\n", "sources": [{"title": "IPUMS USA", "path": "https://usa.ipums.org/usa/", "version": "6.0"}], "path": "jobs.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:69d386f47305f4d8fd2886e805004fbdd71568e9", "bytes": 936649, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "job", "type": "string", "description": "The occupation title"}, {"name": "sex", "type": "string", "description": "Sex (men/women)"}, {"name": "year", "type": "integer", "description": "Census year"}, {"name": "count", "type": "integer", "description": "Number of individuals in the occupation"}, {"name": "perc", "type": "number", "description": "Percentage of the workforce in the occupation"}]}}, {"name": "la-riots.csv", "type": "table", "description": "More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles \nfor five days starting on April 29, 1992. This file contains metadata about each person, including the geographic \ncoordinates of their death. Compiled and published by the Los Angeles Times Data Desk.", "sources": [{"title": "LA Riots Deaths, Los Angeles Times Data Desk", "path": "http://spreadsheets.latimes.com/la-riots-deaths/"}], "path": "la-riots.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:94ee8ad8198d2954f77e3a98268d8b1f7fe7d086", "bytes": 7432, "schema": {"fields": [{"name": "first_name", "type": "string"}, {"name": "last_name", "type": "string"}, {"name": "age", "type": "integer"}, {"name": "gender", "type": "string"}, {"name": "race", "type": "string"}, {"name": "death_date", "type": "date"}, {"name": "address", "type": "string"}, {"name": "neighborhood", "type": "string"}, {"name": "type", "type": "string"}, {"name": "longitude", "type": "number"}, {"name": "latitude", "type": "number"}]}}, {"name": "londonboroughs.json", "type": "json", "description": "Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. 
\nOriginal data \"contains National Statistics data \u00a9 Crown copyright and database right (2015)\" \nand \"Contains Ordnance Survey data \u00a9 Crown copyright and database right [2015].", "sources": [{"title": "Statistical GIS Boundary Files, London Datastore", "path": "https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london"}], "path": "londonBoroughs.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "hash": "sha1:d90805055ffdfe5163a7655c4847dc61df45f92b", "bytes": 14732}, {"name": "londoncentroids.json", "type": "table", "description": "Calculated from `londonBoroughs.json` using [`d3.geoCentroid`](https://d3js.org/d3-geo/math#geoCentroid).", "path": "londonCentroids.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:2e24c01140cfbcad5e1c859be6df4efebca2fbf5", "bytes": 2339, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "cx", "type": "number"}, {"name": "cy", "type": "number"}]}}, {"name": "londontubelines.json", "type": "json", "description": "Selected rail lines simplified from source.", "sources": [{"title": "London Tube Data", "path": "https://github.com/oobrien/vis/tree/master/tube/data"}], "path": "londonTubeLines.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "hash": "sha1:1b21ea5339320090b106082bd9d39a1055aadb18", "bytes": 80097}, {"name": "lookup_groups.csv", "type": "table", "path": "lookup_groups.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:741df36729a9d84d18ec42f23a386b53e7e3c428", "bytes": 77, "schema": {"fields": [{"name": "group", "type": "integer"}, {"name": "person", "type": "string"}]}}, {"name": "lookup_people.csv", "type": "table", "path": "lookup_people.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:c79f69afb3ff81a0c8ddc01f5cf2f078e288457c", "bytes": 125, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "integer"}, {"name": "height", "type": "integer"}]}}, {"name": "miserables.json", "type": "json", "path": "miserables.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:a8b0faaa94c7425c49fe36ea1a93319430fec426", "bytes": 12372}, {"name": "monarchs.json", "type": "table", "description": "A chronological list of English and British monarchs from Elizabeth I through George IV.\nEach entry includes:\n\nThe dataset contains two intentional inaccuracies to maintain compatibility with \nthe [Wheat and Wages](https://vega.github.io/vega/examples/wheat-and-wages/) example visualization:\n1. the start date for the reign of Elizabeth I is shown as 1565, instead of 1558;\n2. the end date for the reign of George IV is shown as 1820, instead of 1830.\nThese discrepancies align the `monarchs.json` dataset with the start and end dates of the `wheat.json` dataset used i the visualization.\nThe entry \"W&M\" represents the joint reign of William III and Mary II. While the dataset shows their reign as 1689-1702, \nthe official Web site of the British royal family indicates that Mary II's reign ended in 1694, though William III continued to rule until 1702.\nThe `commonwealth` field is used to flag the period from 1649 to 1660, which includes the Commonwealth of England, the Protectorate, \nand the period leading to the Restoration. 
While historically more accurate to call this the \"interregnum,\" the field name of `commonwealth` \nfrom the original dataset is retained for backwards compatibility.\nThe dataset was revised in Aug. 2024. James II's reign now ends in 1688 (previously 1689).\nSource data has been verified against the kings & queens and interregnum pages of the official website of the British royal family (retrieved in Aug. 2024).\nContent on the site is protected by Crown Copyright. \nUnder the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most \nCrown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/).", "sources": [{"title": "The Royal Family - Kings & Queens", "path": "https://www.royal.uk/kings-and-queens-1066"}, {"title": "The Royal Family - Interregnum", "path": "https://www.royal.uk/interregnum-1649-1660"}], "path": "monarchs.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:921dfa487a4198cfe78f743aa0aa87ad921642df", "bytes": 683, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string", "description": "The ruler's name or identifier (e.g., \"W&M\" for William and Mary, \"Cromwell\" for the period of interregnum)"}, {"name": "start", "type": "integer", "description": "The year their rule began"}, {"name": "end", "type": "integer", "description": "The year their rule ended"}, {"name": "index", "type": "integer", "description": "A zero-based sequential number assigned to each entry, representing the chronological order of rulers"}]}}, {"name": "movies.json", "type": "table", "description": "The dataset has well known and intentionally included errors. 
\nThis dataset is provided for instructional purposes, including the need to reckon with dirty data.", "path": "movies.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:e38178f99454568c5160fc759184a1a1471cc558", "bytes": 1399981, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Title", "type": "string"}, {"name": "US Gross", "type": "integer"}, {"name": "Worldwide Gross", "type": "integer"}, {"name": "US DVD Sales", "type": "integer"}, {"name": "Production Budget", "type": "integer"}, {"name": "Release Date", "type": "string"}, {"name": "MPAA Rating", "type": "string"}, {"name": "Running Time min", "type": "integer"}, {"name": "Distributor", "type": "string"}, {"name": "Source", "type": "string"}, {"name": "Major Genre", "type": "string"}, {"name": "Creative Type", "type": "string"}, {"name": "Director", "type": "string"}, {"name": "Rotten Tomatoes Rating", "type": "integer"}, {"name": "IMDB Rating", "type": "number"}, {"name": "IMDB Votes", "type": "integer"}]}}, {"name": "normal-2d.json", "type": "table", "path": "normal-2d.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4303306ec275209fcba008cbd3a5f29c9e612424", "bytes": 34398, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "u", "type": "number"}, {"name": "v", "type": "number"}]}}, {"name": "obesity.json", "type": "table", "path": "obesity.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:6da8129ed0b0333c88302e153824b06f7859aac9", "bytes": 2202, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "id", "type": "integer"}, {"name": "rate", "type": "number"}, {"name": "state", "type": "string"}]}}, {"name": "ohlc.json", "type": "table", "description": "This dataset contains the performance of the Chicago Board Options Exchange \n[Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/%5EVIX#overview))\nin the summer of 2009.", "sources": [{"title": "Yahoo Finance VIX Data", "path": "https://finance.yahoo.com/chart/%5EVIX"}], "path": "ohlc.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:9b3d93e8479d3ddeee29b5e22909132346ac0a3b", "bytes": 5737, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "open", "type": "number"}, {"name": "high", "type": "number"}, {"name": "low", "type": "number"}, {"name": "close", "type": "number"}, {"name": "signal", "type": "string"}, {"name": "ret", "type": "number"}]}}, {"name": "penguins.json", "type": "table", "description": "Palmer Archipelago (Antarctica) penguin data collected and made available by \n[Dr. 
Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) \nand the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research \nNetwork](https://lternet.edu/).", "sources": [{"title": "Palmer Station Antarctica LTER", "path": "https://pal.lternet.edu/"}, {"title": "Allison Horst's Penguins Repository", "path": "https://github.com/allisonhorst/penguins"}], "path": "penguins.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:517b6d3267174b1b65691a37cbd59c1739155866", "bytes": 67119, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Species", "type": "string"}, {"name": "Island", "type": "string"}, {"name": "Beak Length (mm)", "type": "number"}, {"name": "Beak Depth (mm)", "type": "number"}, {"name": "Flipper Length (mm)", "type": "integer"}, {"name": "Body Mass (g)", "type": "integer"}, {"name": "Sex", "type": "string"}]}}, {"name": "platformer-terrain.json", "type": "table", "description": "Assets from the video game Celeste.", "sources": [{"title": "Celeste Game", "path": "http://www.celestegame.com/"}], "path": "platformer-terrain.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:01df4411cb16bf758fe8ffa6529507419189edc2", "bytes": 1424097, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "x", "type": "integer"}, {"name": "y", "type": "integer"}, {"name": "lumosity", "type": "number"}, {"name": "saturation", "type": "integer"}, {"name": "name", "type": "string"}, {"name": "id", "type": "string"}, {"name": "color", "type": "string"}, {"name": "key", "type": "string"}]}}, {"name": "points.json", "type": "table", "path": "points.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4716a117308962f3596179d7d7d2ad729a19cda7", "bytes": 4926, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "x", "type": "number"}, {"name": "y", "type": "number"}]}}, {"name": "political-contributions.json", "type": "table", "description": "Summary financial information on contributions to candidates for U.S. \nelections. An updated version of this datset is available from the \"all candidates\" files \n(in pipe-delimited format) on the bulk data download page of the U.S. Federal Election \nCommission, or, alternatively, via OpenFEC. Information on each of the 25 columns is \navailable from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/).\nThe sample dataset in `political-contributions.json` contains 58 records with dates from 2015.\n\nFEC data is subject to the commission's:\n- [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/)\n- [Privacy and Security Policy](https://www.fec.gov/about/privacy-and-security-policy/)\n- [Acceptable Use Policy](https://github.com/fecgov/FEC/blob/master/ACCEPTABLE-USE-POLICY.md)\n\nAdditionally, the FEC's Github [repository](https://github.com/fecgov/FEC) states:\n> This project is in the public domain within the United States, and we waive worldwide \n> copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/)\n> dedication. Read more on our license page.\n> A few restrictions limit the way you can use FEC data. For example, you can't use \n> contributor lists for commercial purposes or to solicit donations. 
Learn more on \n> [FEC.gov](https://www.fec.gov/).", "sources": [{"title": "Federal Election Commission Bulk Data", "path": "https://www.fec.gov/data/browse-data/?tab=bulk-data"}, {"title": "OpenFEC API", "path": "https://api.open.fec.gov/developers/"}], "path": "political-contributions.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4aa2e19fa392cc9448aa8ffbdad15b014371f499", "bytes": 50265, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Candidate_Identification", "type": "string"}, {"name": "Candidate_Name", "type": "string"}, {"name": "Incumbent_Challenger_Status", "type": "string"}, {"name": "Party_Code", "type": "integer"}, {"name": "Party_Affiliation", "type": "string"}, {"name": "Total_Receipts", "type": "number"}, {"name": "Transfers_from_Authorized_Committees", "type": "integer"}, {"name": "Total_Disbursements", "type": "number"}, {"name": "Transfers_to_Authorized_Committees", "type": "number"}, {"name": "Beginning_Cash", "type": "number"}, {"name": "Ending_Cash", "type": "number"}, {"name": "Contributions_from_Candidate", "type": "number"}, {"name": "Loans_from_Candidate", "type": "integer"}, {"name": "Other_Loans", "type": "integer"}, {"name": "Candidate_Loan_Repayments", "type": "number"}, {"name": "Other_Loan_Repayments", "type": "integer"}, {"name": "Debts_Owed_By", "type": "number"}, {"name": "Total_Individual_Contributions", "type": "integer"}, {"name": "Candidate_State", "type": "string"}, {"name": "Candidate_District", "type": "integer"}, {"name": "Contributions_from_Other_Political_Committees", "type": "integer"}, {"name": "Contributions_from_Party_Committees", "type": "integer"}, {"name": "Coverage_End_Date", "type": "string"}, {"name": "Refunds_to_Individuals", "type": "integer"}, {"name": "Refunds_to_Committees", "type": "integer"}]}}, {"name": "population.json", "type": "table", "description": "United States population statistics by sex and age group across decades between 1850 and 2000. \nThe dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census \nmicrodata\" from as early as 1790.\n\nIPUMS updates and revises datasets over time, which may result in discrepancies between this \ndataset and current IPUMS data. Details on data revisions are available here.\n\nWhen using this dataset, please refer to IPUMS USA terms of use. The organization requests the \nuse of the following citation for this json file:\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated \nPublic Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. \nhttp://doi.org/10.18128/D010.V6.0\n", "sources": [{"title": "IPUMS USA", "path": "https://usa.ipums.org/usa/"}], "path": "population.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:680fd336e777314198450721c31227a11f02411f", "bytes": 27665, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer", "description": "Four-digit year of the survey"}, {"name": "age", "type": "integer", "description": "Age group in 5-year intervals (0=0-4, 5=5-9, 10=10-14, ..., 90=90+)"}, {"name": "sex", "type": "integer", "description": "Sex (1=men, 2=women)"}, {"name": "people", "type": "integer", "description": "Number of individuals (IPUMS PERWT)"}]}}, {"name": "population_engineers_hurricanes.csv", "type": "table", "description": "Per-state data on population, number of engineers, and hurricanes. 
Used in Vega-Lite example,\n[Three Choropleths Representing Disjoint Data from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html)", "sources": [{"title": "Bureau of Labor Statistics", "path": "https://www.bls.gov/oes/tables.htm"}, {"title": "American Community Survey", "path": "https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_07_3YR_S1901&prodType=table"}, {"title": "NOAA National Climatic Data Center", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/records"}], "path": "population_engineers_hurricanes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:3bad66ef911b93c641edc21f2034302348bffaf9", "bytes": 1852, "schema": {"fields": [{"name": "state", "type": "string"}, {"name": "id", "type": "integer"}, {"name": "population", "type": "integer"}, {"name": "engineers", "type": "number"}, {"name": "hurricanes", "type": "integer"}]}}, {"name": "seattle-weather-hourly-normals.csv", "type": "table", "description": "Hourly weather normals with metric units. The 1981-2010 Climate Normals are \nNCDC's three-decade averages of climatological variables, including temperature and \nprecipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf).\nWe only included temperature, wind, and pressure \nand updated the format to be easier to parse.", "sources": [{"title": "NOAA National Climatic Data Center (NCDC)", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/normals"}], "path": "seattle-weather-hourly-normals.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:d55461adc9742bb061f6072b694aaf73e8b529db", "bytes": 311148, "schema": {"fields": [{"name": "date", "type": "datetime"}, {"name": "pressure", "type": "number"}, {"name": "temperature", "type": "number"}, {"name": "wind", "type": "number"}]}}, {"name": "seattle-weather.csv", "type": "table", "description": "Daily weather records with metric units. Transformed using `/scripts/weather.py`. \nThe categorical \"weather\" field is synthesized from multiple fields in the original dataset. 
\nThis data is intended for instructional purposes.", "sources": [{"title": "NOAA National Climatic Data Center", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/records"}], "path": "seattle-weather.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0f38b53bdc1c42c5e5d484f33b9d4d7b229e0e59", "bytes": 48219, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "precipitation", "type": "number"}, {"name": "temp_max", "type": "number"}, {"name": "temp_min", "type": "number"}, {"name": "wind", "type": "number"}, {"name": "weather", "type": "string"}]}}, {"name": "sp500-2000.csv", "type": "table", "description": "S&P 500 index values from 2000 to 2020.", "sources": [{"title": "Yahoo Finance", "path": "https://finance.yahoo.com/quote/%5EDJI/history/"}], "path": "sp500-2000.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:b82f20656d0521801db7c5599a6c990415a8aaff", "bytes": 415968, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "open", "type": "number"}, {"name": "high", "type": "number"}, {"name": "low", "type": "number"}, {"name": "close", "type": "number"}, {"name": "adjclose", "type": "number"}, {"name": "volume", "type": "integer"}]}}, {"name": "sp500.csv", "type": "table", "path": "sp500.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0eb287fb7c207f4ed392821d67a92267180fc8cf", "bytes": 2305, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "price", "type": "number"}]}}, {"name": "stocks.csv", "type": "table", "path": "stocks.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:58e2ce1bed01eeebe29f5b4be32344aaec5532c0", "bytes": 12245, "schema": {"fields": [{"name": "symbol", "type": "string"}, {"name": "date", "type": "string"}, {"name": "price", "type": "number"}]}}, {"name": "udistrict.json", "type": "table", "path": "udistrict.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:65675107d81c19ffab260ac1f235f3e477fe8982", "bytes": 6460, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "key", "type": "string"}, {"name": "lat", "type": "number"}]}}, {"name": "unemployment-across-industries.json", "type": "table", "description": "Industry-level unemployment statistics from the Current Population Survey \n(CPS), published monthly by the U.S. Bureau of Labor Statistics. Includes unemployed persons \nand unemployment rate across 11 private industries, as well as agricultural, government, and \nself-employed workers. Covers January 2000 through February 2010. Industry classification \nfollows format of CPS Table A-31.\n\nThe dataset can be replicated using the BLS API. For more, see the `scripts` folder of this \nrepository.\n\nThe BLS Web site states:\n> \"Users of the public API should cite the date that data were accessed or retrieved using \n> the API. Users must clearly state that \"BLS.gov cannot vouch for the data or analyses \n> derived from these data after the data have been retrieved from BLS.gov.\" The BLS.gov logo \n> may not be used by persons who are not BLS employees or on products (including web pages) \n> that are not BLS-sponsored.\"\n\nSee full BLS [terms of service](https://www.bls.gov/developers/termsOfService.htm).", "sources": [{"title": "U.S. 
Census Bureau Current Population Survey", "path": "https://www.census.gov/programs-surveys/cps.html"}, {"title": "BLS LAUS Data Tools", "path": "https://www.bls.gov/lau/data.htm"}, {"title": "Bureau of Labor Statistics Table A-31", "path": "https://www.bls.gov/web/empsit/cpseea31.htm"}], "path": "unemployment-across-industries.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4d769356c95c40a9807a7d048ab81aa56ae77df0", "bytes": 185641, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "series", "type": "string", "description": "Industry name"}, {"name": "year", "type": "integer", "description": "Year (2000-2010)"}, {"name": "month", "type": "integer", "description": "Month (1-12)"}, {"name": "count", "type": "integer", "description": "Number of unemployed persons (in thousands)"}, {"name": "rate", "type": "number", "description": "Unemployment rate (percentage)"}, {"name": "date", "type": "datetime", "description": "ISO 8601-formatted date string (e.g., \"2000-01-01T08:00:00.000Z\")"}]}}, {"name": "unemployment.tsv", "type": "table", "description": "This dataset contains county-level unemployment rates in the United States, with data generally\nconsistent with levels reported in 2009. The dataset is structured as tab-separated values.\nThe unemployment rate represents the number of unemployed persons as a percentage of the labor\nforce. According to the Bureau of Labor Statistics (BLS) glossary:\n\nUnemployed persons (Current Population Survey) [are] persons aged 16 years and older who had\nno employment during the reference week, were available for work, except for temporary\nillness, and had made specific efforts to find employment sometime during the 4-week period\nending with the reference week. Persons who were waiting to be recalled to a job from which\nthey had been laid off need not have been looking for work to be classified as unemployed.\n\nThis dataset is derived from the [Local Area Unemployment Statistics (LAUS)](https://www.bls.gov/lau/) program, \na federal-state cooperative effort overseen by the Bureau of Labor Statistics (BLS). \nThe LAUS program produces monthly and annual employment, unemployment, and labor force data for census regions and divisions,\nstates, counties, metropolitan areas, and many cities and towns.\n\nFor the most up-to-date LAUS data:\n1. **Monthly and Annual Data Downloads**:\n- Visit the [LAUS Data Tools](https://www.bls.gov/lau/data.htm) page for [monthly](https://www.bls.gov/lau/tables.htm#mcounty) \nand [annual](https://www.bls.gov/lau/tables.htm#cntyaa) county data.\n2. 
**BLS Public Data API**:\n- The BLS provides an API for developers to access various datasets, including LAUS data.\n- To use the API for LAUS data, refer to the [LAUS Series ID Formats](https://www.bls.gov/help/hlpforma.htm#LA) to construct your query.\n- API documentation and examples are available on the BLS Developers page.\n\nWhen using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm).", "sources": [{"title": "BLS Developers API", "path": "https://www.bls.gov/developers/"}, {"title": "BLS Handbook of Methods", "path": "https://www.bls.gov/opub/hom/lau/home.htm"}], "path": "unemployment.tsv", "scheme": "file", "format": "tsv", "mediatype": "text/tsv", "encoding": "utf-8", "hash": "sha1:d1aca19c4821fdc3b4270989661a1787d38588d0", "bytes": 34739, "dialect": {"csv": {"delimiter": "\t"}}, "schema": {"fields": [{"name": "id", "type": "integer", "description": "The combined state and county FIPS code"}, {"name": "rate", "type": "number", "description": "The unemployment rate for the county"}]}}, {"name": "uniform-2d.json", "type": "table", "path": "uniform-2d.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:c6120dd8887a0841a9fcc31e247463dbd3d0a996", "bytes": 34217, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "u", "type": "number"}, {"name": "v", "type": "number"}]}}, {"name": "us-10m.json", "type": "json", "path": "us-10m.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "hash": "sha1:ff7a7e679c46f2d1eb85cc92521b990f1a7a5c7a", "bytes": 642361}, {"name": "us-employment.csv", "type": "table", "description": "In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job \nlosses across the United States. The downturn in employment, and the slow recovery in hiring that \nfollowed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau \nof Labor Statistics.\n\nThis file contains the monthly employment total in a variety of job categories from January 2006 \nthrough December 2015. The numbers are seasonally adjusted and reported in thousands. The data \nwere downloaded on Nov. 11, 2018, and reformatted for use in this library.\n\nTotals are included for the [22 \"supersectors\"](https://download.bls.gov/pub/time.series/ce/ce.supersector)\ntracked by the BLS. The \"nonfarm\" total is the category typically used by \neconomists and journalists as a stand-in for the country's employment total.\n\nA calculated \"nonfarm_change\" column has been appended with the month-to-month change in that \nsupersector's employment. It is useful for illustrating how to make bar charts that report both \nnegative and positive values.\n", "sources": [{"title": "U.S. 
Bureau of Labor Statistics Current Employment Statistics", "path": "https://www.bls.gov/ces/"}], "path": "us-employment.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:8795be57cf1e004f4ecba44cab2b324a074330df", "bytes": 17841, "schema": {"fields": [{"name": "month", "type": "date"}, {"name": "nonfarm", "type": "integer"}, {"name": "private", "type": "integer"}, {"name": "goods_producing", "type": "integer"}, {"name": "service_providing", "type": "integer"}, {"name": "private_service_providing", "type": "integer"}, {"name": "mining_and_logging", "type": "integer"}, {"name": "construction", "type": "integer"}, {"name": "manufacturing", "type": "integer"}, {"name": "durable_goods", "type": "integer"}, {"name": "nondurable_goods", "type": "integer"}, {"name": "trade_transportation_utilties", "type": "integer"}, {"name": "wholesale_trade", "type": "number"}, {"name": "retail_trade", "type": "number"}, {"name": "transportation_and_warehousing", "type": "number"}, {"name": "utilities", "type": "number"}, {"name": "information", "type": "integer"}, {"name": "financial_activities", "type": "integer"}, {"name": "professional_and_business_services", "type": "integer"}, {"name": "education_and_health_services", "type": "integer"}, {"name": "leisure_and_hospitality", "type": "integer"}, {"name": "other_services", "type": "integer"}, {"name": "government", "type": "integer"}, {"name": "nonfarm_change", "type": "integer"}]}}, {"name": "us-state-capitals.json", "type": "table", "path": "us-state-capitals.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:9c3211c5058c899412c30f5992a77c54a1b80066", "bytes": 3869, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "lon", "type": "number"}, {"name": "lat", "type": "number"}, {"name": "state", "type": "string"}, {"name": "city", "type": "string"}]}}, {"name": "volcano.json", "type": "json", "description": "Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. \nThis data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a \ntopographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate.", "sources": [{"title": "R Datasets", "path": "https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.html"}], "path": "volcano.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:841151dbfbc5f6db3e19904557abd7a7aad0efd2", "bytes": 21167}, {"name": "weather.csv", "type": "table", "description": "NOAA data transformed using `/scripts/weather.py`. Categorical \"weather\" field synthesized \nfrom multiple fields in the original dataset. 
This data is intended for instructional purposes.", "sources": [{"title": "NOAA Climate Data Online", "path": "http://www.ncdc.noaa.gov/cdo-web/datatools/findstation"}], "path": "weather.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0e7e853f4c5b67615da261d5d343824a43510f50", "bytes": 121417, "schema": {"fields": [{"name": "location", "type": "string"}, {"name": "date", "type": "date"}, {"name": "precipitation", "type": "number"}, {"name": "temp_max", "type": "number"}, {"name": "temp_min", "type": "number"}, {"name": "wind", "type": "number"}, {"name": "weather", "type": "string"}]}}, {"name": "weekly-weather.json", "type": "json", "description": "Instructional dataset showing actual and predicted temperature data.\n\n> [!IMPORTANT]\n> Named `weather.json` in previous versions (`v1.4.0` - `v2.11.0`).\n", "path": "weekly-weather.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:bd42a3e2403e7ccd6baaa89f93e7f0c164e0c185", "bytes": 1281}, {"name": "wheat.json", "type": "table", "description": "In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair),\na Scottish engineer who is often credited as the founder of statistical graphics, \npublished an elegant chart on the price of wheat. It plots 250 years of prices alongside \nweekly wages and the reigning monarch. He intended to demonstrate that:\n> \"never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time.\"\n", "sources": [{"title": "1822 Playfair Chart", "path": "http://dh101.humanities.ucla.edu/wp-content/uploads/2014/08/Vis_2.jpg"}], "path": "wheat.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:cde46b43fc82f4c3c2a37ddcfe99fd5f4d8d8791", "bytes": 2085, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer"}, {"name": "wheat", "type": "number"}, {"name": "wages", "type": "number"}]}}, {"name": "windvectors.csv", "type": "table", "description": "Simulated wind patterns over northwestern Europe.", "path": "windvectors.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:ed686b0ba613abd59d09fcd946b5030a918b8154", "bytes": 129253, "schema": {"fields": [{"name": "longitude", "type": "number"}, {"name": "latitude", "type": "number"}, {"name": "dir", "type": "integer"}, {"name": "dirCat", "type": "integer"}, {"name": "speed", "type": "number"}]}}, {"name": "world-110m.json", "type": "json", "path": "world-110m.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "hash": "sha1:a1ce852de6f2713c94c0c284039506ca2d4f3dee", "bytes": 119410}, {"name": "zipcodes.csv", "type": "table", "description": "GeoNames.org", "sources": [{"title": "GeoNames", "path": "https://www.geonames.org"}], "path": "zipcodes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:d3df33e12be0d0544c95f1bd47005add4b7010be", "bytes": 2018388, "schema": {"fields": [{"name": "zip_code", "type": "integer"}, {"name": "latitude", "type": "number"}, {"name": "longitude", "type": "number"}, {"name": "city", "type": "string"}, {"name": "state", "type": "string"}, {"name": "county", "type": "string"}]}}]} \ No newline at end of file diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index 445974795..ac6ae7087 100644 --- 
a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -70,6 +70,10 @@ def extract_features(pkg: FlPackage, /) -> pl.DataFrame: "encoding", "dialect", "schema", + "sources", + "licenses", + "hash", + "description", ) return ( pl.LazyFrame(pkg["resources"]) @@ -84,6 +88,7 @@ def extract_features(pkg: FlPackage, /) -> pl.DataFrame: ~cs.by_name(DATASET_NAME, EXCLUDE), *FEATURES, col("schema").is_not_null().alias("has_schema"), + col("hash").str.split(":").list.last().alias("sha"), ) .collect() ) diff --git a/tools/datasets/models.py b/tools/datasets/models.py index f8414f739..e2036b4ea 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -255,6 +255,7 @@ class FlResource(TypedDict): "text/topojson", ] encoding: NotRequired[Literal["utf-8"]] + hash: str bytes: int dialect: NotRequired[FlCsvDialect | FlJsonDialect] schema: NotRequired[FlSchema] From 3fa7cacca21ae9f440619aac8df2ce81a606a8c7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 13 Jan 2025 14:21:54 +0000 Subject: [PATCH 144/201] feat: Build dataset url with `datapackage.json` New column deviates from original approach, to support working from `main` https://github.com/vega/altair/blob/e259fbabfc38c3803de0a952f7e2b081a22a3ba3/altair/datasets/_readers.py#L154 --- tools/datasets/datapackage.py | 9 ++++++--- tools/datasets/npm.py | 32 +++++++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index ac6ae7087..49baf7a32 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -42,8 +42,10 @@ ) -def parse_package(pkg: FlPackage, /) -> ParsedPackage: - return ParsedPackage(features=extract_features(pkg), schemas=extract_schemas(pkg)) +def parse_package(pkg: FlPackage, base_url: str, /) -> ParsedPackage: + return ParsedPackage( + features=extract_features(pkg, base_url), schemas=extract_schemas(pkg) + ) def extract_schemas(pkg: FlPackage, /) -> Mapping[Dataset, Mapping[str, FlFieldStr]]: @@ -56,7 +58,7 @@ def extract_schemas(pkg: FlPackage, /) -> Mapping[Dataset, Mapping[str, FlFieldS return m -def extract_features(pkg: FlPackage, /) -> pl.DataFrame: +def extract_features(pkg: FlPackage, base_url: str, /) -> pl.DataFrame: # NOTE: `is_name_collision` != `GitHub.trees`/`Metadata.name_collision` # - This only considers latest version # - Those others are based on whatever tag the tree refers to @@ -89,6 +91,7 @@ def extract_features(pkg: FlPackage, /) -> pl.DataFrame: *FEATURES, col("schema").is_not_null().alias("has_schema"), col("hash").str.split(":").list.last().alias("sha"), + pl.concat_str(pl.lit(base_url), "path").alias("url"), ) .collect() ) diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index fd2aa848d..8f9182c45 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +import string import urllib.request from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Literal @@ -18,6 +19,10 @@ from typing import LiteralString else: from typing_extensions import LiteralString + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias from altair.datasets._typing import Version from tools.datasets.models import ( FlPackage, @@ -25,6 +30,8 @@ ParsedPackage, ) + BranchOrTag: TypeAlias = 'Literal["main"] | Version | LiteralString' # noqa: TC008 + __all__ = ["Npm"] @@ -55,6 +62,19 @@ def 
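# A minimal, standalone sketch of the two derived columns added to `extract_features`
# above. The resource `path`/`hash` values are copied from the `datapackage.json`
# shown earlier in this patch, and the base url mirrors the jsdelivr npm CDN layout
# used in the docstrings later in the series (the pinned version is illustrative).
# This is not the real `extract_features` signature, only the column expressions.
import polars as pl

base_url = "https://cdn.jsdelivr.net/npm/vega-datasets@v2.11.0/data/"
resources = pl.LazyFrame(
    {
        "path": ["sp500.csv", "stocks.csv"],
        "hash": [
            "sha1:0eb287fb7c207f4ed392821d67a92267180fc8cf",
            "sha1:58e2ce1bed01eeebe29f5b4be32344aaec5532c0",
        ],
    }
)
print(
    resources.select(
        # "sha1:<digest>" -> "<digest>", as stored in the new "sha" column
        pl.col("hash").str.split(":").list.last().alias("sha"),
        # prefix every resource path with the version-pinned base url
        pl.concat_str(pl.lit(base_url), pl.col("path")).alias("url"),
    ).collect()
)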
__init__( GH=f"https://cdn.{jsdelivr}.net/gh/vega/{package}@", ) + def dataset_base_url(self, version: BranchOrTag, /) -> LiteralString: + """ + Common url prefix for all datasets derived from ``version``. + + Notes + ----- + - Encodes the endpoint at this stage + - Use github if its the only option (since its slower otherwise) + - npm only has releases/tags (not branches) + - So the column can be renamed ``"url_npm"`` -> ``"url"`` + """ + return f"{self.url.GH if is_branch(version) else self.url.CDN}{version}/data/" + @property def url(self) -> NpmUrl: return self._url @@ -88,7 +108,7 @@ def tags(self) -> pl.DataFrame: def file_gh( self, - branch_or_tag: Literal["main"] | Version | LiteralString, + branch_or_tag: BranchOrTag, path: str, /, ) -> Any: @@ -125,9 +145,15 @@ def file_gh( def datapackage( self, *, tag: LiteralString | None = None, frozen: bool = False ) -> ParsedPackage: + tag = tag or "main" pkg: FlPackage = ( json.loads(self._paths["datapackage"].read_text("utf-8")) if frozen - else self.file_gh(tag or "main", "datapackage.json") + else self.file_gh(tag, "datapackage.json") ) - return datapackage.parse_package(pkg) + + return datapackage.parse_package(pkg, self.dataset_base_url(tag)) + + +def is_branch(s: BranchOrTag, /) -> bool: + return s == "main" or not (s.startswith(tuple("v" + string.digits))) From 34b869e7d1420287887796a2612fe12af133cf3b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 13 Jan 2025 15:02:28 +0000 Subject: [PATCH 145/201] revert: Removes `is_name_collision` Not relevant following upstream change https://github.com/vega/vega-datasets/issues/633 --- tools/datasets/datapackage.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index 49baf7a32..2ff40c32b 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -59,10 +59,6 @@ def extract_schemas(pkg: FlPackage, /) -> Mapping[Dataset, Mapping[str, FlFieldS def extract_features(pkg: FlPackage, base_url: str, /) -> pl.DataFrame: - # NOTE: `is_name_collision` != `GitHub.trees`/`Metadata.name_collision` - # - This only considers latest version - # - Those others are based on whatever tag the tree refers to - # https://github.com/vega/vega-datasets/issues/633 EXCLUDE = ( "name", "type", @@ -82,7 +78,6 @@ def extract_features(pkg: FlPackage, base_url: str, /) -> pl.DataFrame: .with_columns( path_stem("path").alias(DATASET_NAME), cs.exclude("name"), - col("name").is_duplicated().alias("is_name_collision"), ) .select( DATASET_NAME, From 5af370162945c41efcddb55059623afea2bc098b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 13 Jan 2025 15:13:03 +0000 Subject: [PATCH 146/201] build: Re-enable and generate `datapackage_features.parquet` Eventually, will replace `metadata.parquet` - But for a single version (current) only - Paired with a **limited** `.csv.gz` version, to support cases where `.parquet` reading is not available (`pandas` w/o (`pyarrow`|`fastparquet`)) --- .../_metadata/datapackage_features.parquet | Bin 0 -> 9189 bytes tools/datasets/__init__.py | 3 +-- tools/datasets/datapackage.py | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 altair/datasets/_metadata/datapackage_features.parquet diff --git a/altair/datasets/_metadata/datapackage_features.parquet b/altair/datasets/_metadata/datapackage_features.parquet new file mode 100644 index 
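# A minimal sketch of the endpoint choice made by `dataset_base_url`/`is_branch`
# above: npm only publishes releases/tags, so a branch name such as "main" falls
# back to the slower GitHub-backed jsdelivr endpoint. The GH prefix below assumes
# `package == "vega-datasets"`; the CDN prefix matches the urls quoted in the
# docstrings later in this series.
import string

CDN = "https://cdn.jsdelivr.net/npm/vega-datasets@"
GH = "https://cdn.jsdelivr.net/gh/vega/vega-datasets@"


def is_branch(s: str) -> bool:
    # treat anything that does not start with "v" or a digit as a branch name
    return s == "main" or not s.startswith(tuple("v" + string.digits))


def dataset_base_url(version: str) -> str:
    return f"{GH if is_branch(version) else CDN}{version}/data/"


assert dataset_base_url("v2.11.0") == CDN + "v2.11.0/data/"
assert dataset_base_url("main") == GH + "main/data/"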
0000000000000000000000000000000000000000..c76395167255bd0c1b8c374e51ce292a7c51de48 GIT binary patch literal 9189 zcmd5?2{={V`rq3;&-*w>;+PIf=IQuM$4tnSvFJFCc|M|0<}pLYhz6n%8B&obLxeO) ziAwTmP%0^Z@2H>``q*B>~{_C?|s)=?^>zMO%Dj!SNT~I!&-lZuY5@-UVE%fp{sw>swV^pGi>m$5@u0U| z4UU>KPklq4vv}BOTE7>!(L2}9IB%3mOqU(_NbkfLz3&z-*qMEO*NqBg-WG$o0uKT0 zUBW1u$rN?2-m>v5b!^m$-el5o{WDiAXeC>UD38*cMx>loiyg@f#7^Qu*lP;qO zy3&}D%H5wqU)H$0vq(!w-_Kz224lsuIQk0*awWr$OmlTy45N*kv*6C<=sB*O^+%Z6 z(9FJ*9epVO_K2$YqKip1!dZHCfFbmb7RRT^V9NX)ai|x$bm?EB%pc)=y%_}eM zjPI9vN7LhipAZhns&EpGH)OoJDTn5D=k>{F&GpvW5EnI@q7S<8v+TOi zG8X>Od(d8-zF@I*5*68IWKI_X;EhxREJn8WmVvi7xDx-BbWIT*wANdMG-2 zhqeToEjiw`-BVRyf3llSiH#Jmsln+-A?kt_MvB6Drs6gxXI#&0ld(5dK6cwzWt;iA zKj$K~c37y2qS6hQ+aMN<5b|h>Jhbj|{P(vO&}}E>s#bjzRrr4#Cmy z5A8CASXM(Ht8xegggcoy_nXnrTtwxogoU-dI+YHYgLTb|mkN8bs~1J8Ob1;=Z(Do? zobXJ3-5$OME&6i%~_i< z{-_t1M)|HKywH=bl`1rc8c$p(EZ3xF`xL}0MLBG`_0{ujC(o#?JL$kD-O z4sXWZ$=E;Cr07OfKKFj_Rq=Wg{+ND8tCEj_pGP9LCrNxgtc0xq2lDMXNiTO9O%Szq zo9~?3zM)y8O6%580qtj67Kz+ch;U%;4qNI>$l@&Z<3*Q-N#^L3iuxU{JXM&|qrG6X z=zje(tlBKXnC!CsY@4`L&lIvg6e!Ko*zhJWHBZ@W+V0&k7On~Xq^g73Y*i`iaF+$M z1a#=b_s#c3_A?cXA1mec4N5c)jEK>;{a`Ae5b4AGAr5<8weM~)>*yD;iCX+wDg22f zuHL@8*pWWt3%0TOv(X>6J0^VIGjd1!wCpB{xjuo=_&kflsyqjE&ODxcyz$Lmv(eo3 ze%O<)HcUKuWZppX`DYe6-5=fv+3quu%nzrT#63X|w-b*LG*Kz<7Ya<$z$WT|=(5JE zDfu;g6tTum|DYP}N%quHQ@!Hzr8h`?CmJk8BE#8v@Vf`W_tZZ>CRkRF3273%Ul@?1 zLORgjV|B;;ljEY&Ub-O7y>n`p=F$D0dK(JOD{{RrdpA4R75EyQ9Pw`|_V0P|aqp$0 zfksk%nOBzE)^-+6ATh*qd0$ndQG+bFqwL7O`b&@C9k!nl8mInYhZ%uN^ne>n3_H-O zM2+E~IY9@A3zbxHHP6!?F#ygnr6q$DBOoRoOkw~Ej1yQ{4{_bs0}fkAB+VRR-QkBG zip>{MU#HbfCGh9K@HIM708I1YY8L|!2?5+#U8^WMU4HO0EzoPi#;_hp!;3IJQU%Qb zU=Ls;dy=YV8|0h_9Axl9$cJ_AR{ntPj_1eP{*83lDth- znE>Di80+mZLSKy6Al1I6&$XDbu0b;n zS#wjzUXjaa8TG!nnWXkgD9cM-*)8Quh zmrnVgxdVGH3@I)h(7@{b1cGP~@pj2m&J~!ygV8NKUeHG@qy@N-vMA1 zpAN?NAP0QI9>V=DHVl0f0HUMUu;~%s0|LJTPm}mvaMzK-kTEO{|!8G<8OlhWK}(hu=gZSvLEpq{w1mJ@oAs}d1xH{!)cPn ztRx3)IAg=hR##3f8ih`;?ho!HGTjXZ1HeG=NsG`rly#hCYEozRM31J%yR)5{M>Q3< zs~0QoDWSFwH5j;Gf9qvqoxHic-n*pI@$;9)#QC}_Roq{&u-Vvcd%0V3BuFT2EXX3n z``S24=(EQHpIQ8L{o;_BOE2%xX>Ji^HW9HM5heQej8QKby-VJCdS7udxa=w#7{6!B z_1y&=A?bWqudrBeZ|Sr6c&{;SpM}XA=g5+&(Yb{BTVQ3Wr=lSs(uRJ<#MiAe2pd&s#ZL=Nc#>N++T%2}X+{~lvC*|hxdN+6X2?-UN zab#^h#eC)&H?L!Hn?3z@7Pf#^=7vAYcXkwVCfW}qq1#KS9RUE#P&paql7Fipw&t00 zVMhDp`41aAs5G$VTA{79+xKWcXIbb|Y}wGxzFo6bAm_^UcL$$j9e1;pncmnODV;Jp zZvq4dTl@lV6z?pqJmJh-+`rRZ~P_TWg>gNe;zv9rdsc4LA&`6oJWY!^Dkl0YBT z)+R|)#oih$C@|1{=jr)PQioG+rw!^_=~q-kl4j^`W43YueC^KG2KNH@-y!aP0PNg-DsP@ePE50B7f;?|^)F?@G;{5#4= zGXpL!Pz@{GD>%$`sz4MVzjM%o|N5 zOIqG)4+)~>Up%f5y;FAPgW%Ts4Ta*yT9*yy8u9v+Q#(I>Qrn`#_a(mTDECdQUFXY` zY%AV;65;Dj!IyjEr1shD*&HA3o*mNR)f!or-QLl6r#GR5Ey8s(Uv-J6k$iBsALV{) zky`By$s*;Pt+!@0yY65M=V^rGIWHSTWE>5xsLUGg#Rr(Qrg5HMz>rj?g_2{L1_~aV z*fd%UW@yEtNp%fW?1vX6x;$ zp7)f_ItXR_dHMS3R+=!8B&Nk%ZSM2x-I6#h@q5u_idvU51``e)q{Ot9vyq){OX=mP zMbL}H`?_^ma21}vn)+o?qW0o&*o>Jj>YVM^Q;Do`z9WYsPX;jFx3l*}nM)>r{BY61 zjXq8DYlMR@OB~oH6y)^4o71j!Hh&xW=A%C*^d1hqGIP=B6Oxi-N{ZW9IQb;*16plg zKBKgWx5eGWz9-k8H9Ctip05XWTPBI5VW+6$T491yvO!<4TDKEt-E|+$Fm=3nntP<8 zYTdz)#iiZGC&D{5yDA##Dl2Nl(#{$;)pgIgJ$owmMW7};yfxe%Dx7FsKej8%$+S#1 zo?+Wf`LAWq%5Ipud!9kH6@3(o*E?A)-oJ;s^HSEtD&rsmw7d>-q^i! 
z=hR1wV%~*!i@L`RtOKkoP6i1**^<|BL$~V{nvwru*r|3KTD$tvEi$r7shjXeK zJ>BP>+<6>a3jW?HpUHKq`VQsv++SYzO{G6cQ0k~&JW;p9C3$-thq_z$DMx+XwlnYX zZLg$nwoRyKysu65w#w=yC=AS>6|?9rjY}vHyzeo|p3AII$?HfbUfH_nI{%*CL*p#P z_vV!I8TsjL@dr3stPkAZP;ymmfZDu5hym0&%E6_#E1u&q{iOqJ*fu!f1Q=o%;LK~5 zmm?+_5PWG*1E^F)z;|sXU0D|w7&tGh2OO;h!FWI$a7h})Qyop>a2YlJ`P-O;ySoZe zg{-WCCn+ks%e%>uT~(DxB)q(myqqf@FY7KxR3R#nRETH^Sy>fXyb|731+J6DE2=2D z5{XK(a38#?yrL|TtSTofuS`~vhdud^z0Iv=?$i{|m3#7(5-OP=7cGt4l>d>zAaC(@ zP#^OT_wE&*2Ph!)grBf%5GCV;S|_nA(KG)Jl_Ul2BC_jc|{ zVv$ian+Lle#2NFx?A}u48MzoJvc!{BPWxX^P@#*CsVFgZ<#0_*jXiEgi5BHx zw1QNG1MN}q3>@*}U!#Hq3FyMT*Hg<_wa^lCtX*XnAJ@O(jWBx`T_9>u8%9l0r#Pr? zy^YH-Fv(z8XSw~X9Lr(an%OYsmrQq*_+P~^3Ud_}U5GzAcngm=8*KK+?Z=1@OngAS zzWs@)H=y)<(e?$W?nip7COvP~j@p{TO|=g~2uG8vS!0?5S}n)L_aDk~ot-o8-6J|W zAJ=(i;X-Pc#}{7W*Mb4f`O!d{G0rA)sO*oEIJxefz7ZGhN(#snyoD0a4YZ3m@>JaBE19?LyampQD)H!-ybIpmE>_*KUaefTP9#({6l5Cgc`@C<$C5qO z&^5B4JarL0hfS0d5RZ*L`SZg<-!mj6kfJ6dLvr(%_NKUzeZ4}1rTxhvG9I2Xp=1x@ z($}UHR*j8h>& zPQ@=@K^XKeUj^R{;_t^L2L11ej}@o*QMrszG!?>#V)#+{tWX8GB2QqT{j2qpSSWc_ zzAFOxu5`&2TN8(HYhs$jn%FJ5CMIrN6E8`viP45oGW81Ds2%ifNrtE}(?&GG0RbTp z9{{n4QT@oGivgzmu)9kQD@^qFRZA4wY6T6fyiCi$IW{b>@g~c~?}(1&iWO#Jxv)wl zeCxZC`+sXjlJ%A5Z`t*-YsmYV^*BO!raI@>a>Hn()Br4Y(#z}R73;^7;FhBHLTo|ev0lDD< zX#%(|QGPfye~IKmb-6<*RK^ZS7{Ep2*3`aR)Nee;59}3c6j7{OQoJO-rudI(Dbx(2 zG`OTRFR`Yy8YJIL{lGV&5|l0a}rzLv}^f2$z)yO;2nzzvahg8o+c%aWjDOn?>Pr#|8j{8{Qp zAXtRMN~$mlYyhb;vfb@U(4)YWGNwqk<<1U>NU(zYz##e_us!5yi8#3G4S^uDG(OnG zN)<`o)041e37>-WTk5Q{_@?We6vMj%} z9y-W+82X0WD_Dh+3|0R=9y0X^p1hl{D~$bj{nK5=Q?T-M@VBKn*uyLQx7Qm1)d?iY z;R9Xm4E!CJw$t+ZY=K*Th(}A<-alAxJQ9yf+Y9Nnq-ugpJsi>oNZG;PSJu%U3Aw*m z|6h-HI8F%!lA(bt5fT1A9ugx2l7g+Dqo1$hzX5M)Upd15!TZS9%>cjC!A{PXK+w>{tGPJB$Ee0sjY|{{sw;My>z= literal 0 HcmV?d00001 diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index c30c43867..a4ef8f833 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -172,8 +172,7 @@ def refresh( gh_trees = pl.read_parquet(self.paths["gh_trees"]) package = self.npm.datapackage(frozen=frozen) - # TODO: Re-enable after deciding on how best to utilize - # self.write_parquet(package["features"], self.paths["dpkg_features"]) + self.write_parquet(package["features"], self.paths["dpkg_features"]) self.write_json_gzip(package["schemas"], self.paths["dpkg_schemas"]) if include_typing: diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index 2ff40c32b..549889f6d 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -88,6 +88,7 @@ def extract_features(pkg: FlPackage, base_url: str, /) -> pl.DataFrame: col("hash").str.split(":").list.last().alias("sha"), pl.concat_str(pl.lit(base_url), "path").alias("url"), ) + .sort(DATASET_NAME, "bytes") .collect() ) From c3139f11a017b62b53c62a06a560d6e32379e164 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:00:30 +0000 Subject: [PATCH 147/201] feat: add temp `_Reader.*_dpkg` methods - Will be replacing the non-suffixed versions - Need to do this gradually as `tag` will likely be dropped - Breaking most of the tests --- altair/datasets/_readers.py | 68 +++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index efd1af7ce..37357ae5a 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -87,6 +87,9 @@ __all__ = ["backend"] _METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" +_DATAPACKAGE: 
Final[Path] = ( + Path(__file__).parent / "_metadata" / "datapackage_features.parquet" +) class AltairDatasetsError(Exception): ... @@ -215,6 +218,71 @@ def _scan_metadata( return frame.filter(*predicates, **constraints) return frame + def dataset_dpkg( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, + **kwds: Any, + ) -> IntoDataFrameT: + df = self.query_dpkg(**_extract_constraints(name, suffix, tag)) + result = next(df.iter_rows(named=True)) + url = result["url"] + fn = self.read_fn(url) + if default_kwds := self._schema_kwds(result): # type: ignore + kwds = default_kwds | kwds if kwds else default_kwds + + if self.cache.is_active(): + fp = self.cache.path / (result["sha"] + result["suffix"]) + if fp.exists() and fp.stat().st_size: + return fn(fp, **kwds) + else: + with self._opener.open(url) as f: + fp.touch() + fp.write_bytes(f.read()) + return fn(fp, **kwds) + else: + with self._opener.open(url) as f: + return fn(f, **kwds) + + def url_dpkg( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, + ) -> str: + frame = self.query_dpkg(**_extract_constraints(name, suffix, tag)) + url = frame.item(0, "url") + if isinstance(url, str): + return url + else: + msg = f"Expected 'str' but got {type(url).__name__!r}\nfrom {url!r}." + raise TypeError(msg) + + def query_dpkg( + self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] + ) -> nw.DataFrame[IntoDataFrameT]: + frame = self._scan_dpkg(*predicates, **constraints).collect() + if not frame.is_empty(): + return frame + else: + terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) + msg = f"Found no results for:\n {terms}" + raise ValueError(msg) + + def _scan_dpkg( + self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] + ) -> nw.LazyFrame: + if "tag" in constraints: + msg = f"{_DATAPACKAGE.name!r} only supports the latest version, but got: {constraints.get('tag')!r}" + raise NotImplementedError(msg) + frame = nw.from_native(self.scan_fn(_DATAPACKAGE)(_DATAPACKAGE)).lazy() + if predicates or constraints: + return frame.filter(*predicates, **constraints) + return frame + @property def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: return DatasetCache(self) From 6035b39ac69b70330373e3c5a58838fb60fd1e88 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 13 Jan 2025 18:40:36 +0000 Subject: [PATCH 148/201] test: Remove/replace all `tag` based tests --- tests/test_datasets.py | 143 +++++++++++++---------------------------- 1 file changed, 46 insertions(+), 97 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index b35efa60e..66353b9e4 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -17,7 +17,7 @@ from altair.datasets import Loader, url from altair.datasets._readers import _METADATA, AltairDatasetsError -from altair.datasets._typing import Dataset, Extension, Metadata, Version, is_ext_read +from altair.datasets._typing import Dataset, Extension, Metadata, is_ext_read from tests import skip_requires_pyarrow, slow if sys.version_info >= (3, 14): @@ -26,7 +26,7 @@ from typing_extensions import TypedDict if TYPE_CHECKING: - from collections.abc import Iterator, Mapping + from collections.abc import Container, Iterator from pathlib import Path from typing import Literal @@ -46,7 +46,6 @@ class DatasetSpec(TypedDict, total=False): name: Dataset suffix: Extension - tag: Version marks: 
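# A rough sketch of the cache lookup performed by `dataset_dpkg` above: cached
# files are keyed by the resource's content hash plus its suffix, so a dataset
# whose bytes are unchanged across versions is downloaded at most once. `row`
# stands in for the record queried from `datapackage_features.parquet`, and
# `read_fn` for the backend-specific read function; both are simplifications of
# the reader internals.
from pathlib import Path
from urllib.request import urlopen


def read_cached(row: dict, cache_dir: Path, read_fn):
    fp = cache_dir / (row["sha"] + row["suffix"])
    if fp.exists() and fp.stat().st_size:
        return read_fn(fp)  # cache hit: parse the previously downloaded bytes
    with urlopen(row["url"]) as f:  # cache miss: fetch once, then parse from disk
        fp.write_bytes(f.read())
    return read_fn(fp)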
MarksType @@ -127,10 +126,8 @@ def metadata_columns() -> frozenset[str]: def match_url(name: Dataset, url: str) -> bool: - return ( - re.match(rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{name}\..+", url) - is not None - ) + pattern = rf".+/vega-datasets@.+/data/{name}\..+" + return re.match(pattern, url) is not None @backends @@ -253,10 +250,10 @@ def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: "political-contributions", "population", "population_engineers_hurricanes", - "seattle-temps", + "unemployment", "seattle-weather", "seattle-weather-hourly-normals", - "sf-temps", + "gapminder-health-income", "sp500", "sp500-2000", "stocks", @@ -367,30 +364,16 @@ def test_dataset_not_found(backend: _Backend) -> None: ``Loader.url`` is used since it doesn't require a remote connection. """ - import polars as pl - data = Loader.from_backend(backend) real_name: Literal["disasters"] = "disasters" - real_suffix: Literal[".csv"] = ".csv" - real_tag: Literal["v1.14.0"] = "v1.14.0" - invalid_name: Literal["fake name"] = "fake name" invalid_suffix: Literal["fake suffix"] = "fake suffix" - invalid_tag: Literal["fake tag"] = "fake tag" - incorrect_suffix: Literal[".json"] = ".json" - incorrect_tag: Literal["v1.5.0"] = "v1.5.0" ERR_NO_RESULT = ValueError - # NOTE: ``polars`` enforces enums stricter than other packages. - # Rather than returning an empty dataframe, filtering on a value - # *outside* of the enum range raises an internal error. - ERR_NO_RESULT_OR_ENUM = (ERR_NO_RESULT, pl.exceptions.InvalidOperationError) - MSG_NO_RESULT = "Found no results for" NAME = "dataset_name" SUFFIX = "suffix" - TAG = "tag" with pytest.raises( ERR_NO_RESULT, @@ -407,27 +390,6 @@ def test_dataset_not_found(backend: _Backend) -> None: ): data.url(real_name, invalid_suffix) # type: ignore[arg-type] - with pytest.raises( - ERR_NO_RESULT_OR_ENUM, - match=re.compile(rf"{invalid_tag}", re.DOTALL), - ): - data.url(real_name, tag=invalid_tag) # type: ignore[arg-type] - - with pytest.raises( - ERR_NO_RESULT_OR_ENUM, - match=re.compile(rf"{invalid_tag}", re.DOTALL), - ): - data.url(real_name, real_suffix, tag=invalid_tag) # type: ignore[arg-type] - - with pytest.raises( - ERR_NO_RESULT, - match=re.compile( - rf"{MSG_NO_RESULT}.+{TAG}.+{incorrect_tag}.+{SUFFIX}.+{real_suffix}.+{NAME}.+{real_name}", - re.DOTALL, - ), - ): - data.url(real_name, real_suffix, tag=incorrect_tag) - with pytest.raises( ERR_NO_RESULT, match=re.compile( @@ -437,23 +399,6 @@ def test_dataset_not_found(backend: _Backend) -> None: ): data.url(real_name, incorrect_suffix) - with pytest.raises( - ERR_NO_RESULT, - match=re.compile( - rf"{MSG_NO_RESULT}.+{TAG}.+{real_tag}.+{SUFFIX}.+{incorrect_suffix}.+{NAME}.+{real_name}", - re.DOTALL, - ), - ): - data.url(real_name, incorrect_suffix, tag=real_tag) - - with pytest.raises( - ERR_NO_RESULT, - match=re.compile( - rf"{MSG_NO_RESULT}.+{TAG}.+{incorrect_tag}.+{NAME}.+{real_name}", re.DOTALL - ), - ): - data.url(real_name, tag=incorrect_tag) - @backends def test_reader_cache( @@ -482,10 +427,10 @@ def test_reader_cache( assert tuple(data.cache) == () # smallest csvs - lookup_groups = data("lookup_groups", tag="v2.5.3") - data("lookup_people", tag="v2.4.0") - data("iowa-electricity", tag="v2.3.1") - data("global-temp", tag="v2.9.0") + lookup_groups = data("lookup_groups") + data("lookup_people") + data("iowa-electricity") + data("global-temp") cached_paths = tuple(data.cache) assert len(cached_paths) == 4 @@ -493,32 +438,29 @@ def test_reader_cache( if nw_dep.is_polars_dataframe(lookup_groups): left, right = 
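# The relaxed `match_url` pattern above accepts either endpoint style produced by
# `dataset_base_url`; a quick check with illustrative urls:
import re

pattern = r".+/vega-datasets@.+/data/cars\..+"
assert re.match(pattern, "https://cdn.jsdelivr.net/npm/vega-datasets@v2.11.0/data/cars.json")
assert re.match(pattern, "https://cdn.jsdelivr.net/gh/vega/vega-datasets@main/data/cars.json")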
( lookup_groups, - cast("pl.DataFrame", data("lookup_groups", tag="v2.5.3")), + cast("pl.DataFrame", data("lookup_groups", ".csv")), ) else: left, right = ( pl.DataFrame(lookup_groups), - pl.DataFrame(data("lookup_groups", tag="v2.5.3")), + pl.DataFrame(data("lookup_groups", ".csv")), ) assert_frame_equal(left, right) assert len(tuple(data.cache)) == 4 assert cached_paths == tuple(data.cache) - data("iowa-electricity", tag="v1.30.2") - data("global-temp", tag="v2.8.1") - data("global-temp", tag="v2.8.0") + data("iowa-electricity", ".csv") + data("global-temp", ".csv") + data("global-temp.csv") assert len(tuple(data.cache)) == 4 assert cached_paths == tuple(data.cache) - data("lookup_people", tag="v1.10.0") - data("lookup_people", tag="v1.11.0") - data("lookup_people", tag="v1.20.0") - data("lookup_people", tag="v1.21.0") - data("lookup_people", tag="v2.1.0") - data("lookup_people", tag="v2.3.0") - data("lookup_people", tag="v2.5.0-next.0") + data("lookup_people") + data("lookup_people.csv") + data("lookup_people", ".csv") + data("lookup_people") assert len(tuple(data.cache)) == 4 assert cached_paths == tuple(data.cache) @@ -644,12 +586,12 @@ def test_pyarrow_read_json( @pytest.mark.parametrize( ("spec", "column"), [ - (DatasetSpec(name="cars", tag="v2.11.0"), "Year"), - (DatasetSpec(name="unemployment-across-industries", tag="v2.11.0"), "date"), - (DatasetSpec(name="flights-10k", tag="v2.11.0"), "date"), - (DatasetSpec(name="football", tag="v2.11.0"), "date"), - (DatasetSpec(name="crimea", tag="v2.11.0"), "date"), - (DatasetSpec(name="ohlc", tag="v2.11.0"), "date"), + (DatasetSpec(name="cars"), "Year"), + (DatasetSpec(name="unemployment-across-industries"), "date"), + (DatasetSpec(name="flights-10k"), "date"), + (DatasetSpec(name="football"), "date"), + (DatasetSpec(name="crimea"), "date"), + (DatasetSpec(name="ohlc"), "date"), ], ) def test_polars_read_json_roundtrip( @@ -657,40 +599,47 @@ def test_polars_read_json_roundtrip( spec: DatasetSpec, column: str, ) -> None: - frame = polars_loader(spec["name"], ".json", tag=spec["tag"]) + frame = polars_loader(spec["name"], ".json") tp = frame.schema.to_python()[column] assert tp is dt.date or issubclass(tp, dt.date) -def _dataset_params(overrides: Mapping[Dataset, DatasetSpec]) -> Iterator[ParameterSet]: - """https://github.com/vega/vega-datasets/issues/627.""" +def _dataset_params(*, skip: Container[str] = ()) -> Iterator[ParameterSet]: + """Temp way of excluding datasets that were removed.""" names: tuple[Dataset, ...] 
= get_args(Dataset) - args: tuple[Dataset, Extension | None, Version | None] + args: tuple[Dataset, Extension | None] for name in names: marks: MarksType = () - if name in overrides: - el = overrides[name] - args = name, el.get("suffix"), el.get("tag") - marks = el.get("marks", ()) - else: - args = name, None, None + if name in skip: + continue + args = name, None yield pytest.param(*args, marks=marks) @slow @datasets_debug @pytest.mark.parametrize( - ("name", "suffix", "tag"), - list(_dataset_params({"flights-3m": DatasetSpec(tag="v2.11.0")})), + ("name", "suffix"), + list( + _dataset_params( + skip=( + "climate", + "graticule", + "sf-temps", + "iris", + "weball26", + "seattle-temps", + ) + ) + ), ) def test_all_datasets( polars_loader: Loader[pl.DataFrame, pl.LazyFrame], name: Dataset, suffix: Extension, - tag: Version, ) -> None: """Ensure all annotated datasets can be loaded with the most reliable backend.""" - frame = polars_loader(name, suffix, tag=tag) + frame = polars_loader(name, suffix) assert nw_dep.is_polars_dataframe(frame) From 5d8b6db6774421bd232dfebd2db25e731ab2c89c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 13 Jan 2025 19:46:07 +0000 Subject: [PATCH 149/201] revert: Remove all `tag` based features --- altair/datasets/__init__.py | 9 +--- altair/datasets/_cache.py | 16 ++----- altair/datasets/_loader.py | 39 +++++----------- altair/datasets/_readers.py | 89 ++++--------------------------------- 4 files changed, 27 insertions(+), 126 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 70d01eacc..6095dd404 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -14,7 +14,7 @@ from typing_extensions import LiteralString from altair.datasets._loader import _Load - from altair.datasets._typing import Dataset, Extension, Version + from altair.datasets._typing import Dataset, Extension __all__ = ["Loader", "load", "url"] @@ -47,7 +47,6 @@ def url( name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: Version | None = None, ) -> str: """ Return the address of a remote dataset. @@ -61,15 +60,11 @@ def url( .. note:: Only needed if ``name`` is available in multiple formats. - tag - Version identifier for a `vega-datasets release`_. .. _Path.stem: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem .. _Path.suffix: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - .. _vega-datasets release: - https://github.com/vega/vega-datasets/releases Related ------- @@ -83,7 +78,7 @@ def url( try: from altair.datasets._loader import load - url = load.url(name, suffix, tag=tag) + url = load.url(name, suffix) except AltairDatasetsError: from altair.datasets._cache import url_cache diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index c247988d6..0cbb7a251 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -8,8 +8,6 @@ import narwhals.stable.v1 as nw from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT -from altair.datasets._typing import VERSION_LATEST - if sys.version_info >= (3, 12): from typing import Protocol else: @@ -105,10 +103,7 @@ class UrlCache(CompressedCache[_KT, _VT]): `csv`_, `gzip`_ -based, lazy url lookup. Operates on a subset of available datasets: - - Only the latest version - Excludes `.parquet`, which `cannot be read via url`_ - - Name collisions are pre-resolved - - Only provide the smallest (e.g. ``weather.json`` instead of ``weather.csv``) .. 
_csv: https://docs.python.org/3/library/csv.html @@ -256,13 +251,10 @@ def download_all(self) -> None: Requires **30-50MB** of disk-space. """ stems = tuple(fp.stem for fp in self) - latest = nw.col("tag") == nw.lit(VERSION_LATEST) - predicates = (~(nw.col("sha").is_in(stems)), latest) if stems else (latest,) + predicates = (~(nw.col("sha").is_in(stems)),) if stems else () frame = ( - self._rd._scan_metadata( - predicates, ext_supported=True, name_collision=False - ) - .select("sha", "suffix", "url_npm") + self._rd._scan_metadata(predicates, is_image=False) # type: ignore + .select("sha", "suffix", "url") .unique("sha") .collect() ) @@ -272,7 +264,7 @@ def download_all(self) -> None: print(f"Downloading {len(frame)} missing datasets...") for row in frame.iter_rows(named=True): fp: Path = self.path / (row["sha"] + row["suffix"]) - with self._rd._opener.open(row["url_npm"]) as f: + with self._rd._opener.open(row["url"]) as f: fp.touch() fp.write_bytes(f.read()) print("Finished downloads") diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 2b8a2cd95..63bd5f3f7 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -21,7 +21,7 @@ else: from typing_extensions import LiteralString from altair.datasets._readers import _Backend - from altair.datasets._typing import Dataset, Extension, Version + from altair.datasets._typing import Dataset, Extension __all__ = ["Loader", "load"] @@ -111,7 +111,7 @@ def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: Using ``pandas``, backed by ``pyarrow`` dtypes: data = Loader.from_backend("pandas[pyarrow]") - cars = data("cars", tag="v1.29.0") + cars = data("cars") >>> type(cars) # doctest: +SKIP pandas.core.frame.DataFrame @@ -137,7 +137,6 @@ def __call__( name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: Version | None = None, **kwds: Any, ) -> IntoDataFrameT: """ @@ -152,8 +151,6 @@ def __call__( .. note:: Only needed if ``name`` is available in multiple formats. - tag - Version identifier for a `vega-datasets release`_. **kwds Arguments passed to the underlying read function. @@ -161,8 +158,6 @@ def __call__( https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem .. _Path.suffix: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - .. 
_vega-datasets release: - https://github.com/vega/vega-datasets/releases Examples -------- @@ -171,7 +166,7 @@ def __call__( from altair.datasets import Loader data = Loader.from_backend("polars") - source = data("iowa-electricity", tag="v2.10.0") + source = data("iowa-electricity") >>> source.columns # doctest: +SKIP ['year', 'source', 'net_generation'] @@ -199,7 +194,7 @@ def __call__( Using ``pandas``: data = Loader.from_backend("pandas") - source = data("iowa-electricity", tag="v2.10.0") + source = data("iowa-electricity") >>> source.columns # doctest: +SKIP Index(['year', 'source', 'net_generation'], dtype='object') @@ -223,7 +218,7 @@ def __call__( Using ``pyarrow``: data = Loader.from_backend("pyarrow") - source = data("iowa-electricity", tag="v2.10.0") + source = data("iowa-electricity") >>> source.column_names # doctest: +SKIP ['year', 'source', 'net_generation'] @@ -238,14 +233,13 @@ def __call__( source: [["Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels",...,"Renewables","Renewables","Renewables","Renewables","Renewables"]] net_generation: [[35361,35991,36234,36205,36883,...,16476,17452,19091,21241,21933]] """ - return self._reader.dataset(name, suffix, tag=tag, **kwds) + return self._reader.dataset(name, suffix, **kwds) def url( self, name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: Version | None = None, ) -> str: """ Return the address of a remote dataset. @@ -259,15 +253,11 @@ def url( .. note:: Only needed if ``name`` is available in multiple formats. - tag - Version identifier for a `vega-datasets release`_. .. _Path.stem: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem .. _Path.suffix: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - .. _vega-datasets release: - https://github.com/vega/vega-datasets/releases Examples -------- @@ -277,15 +267,15 @@ def url( from altair.datasets import Loader data = Loader.from_backend("polars") - >>> data.url("cars", tag="v2.9.0") # doctest: +SKIP - 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json' + >>> data.url("cars") # doctest: +SKIP + 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.11.0/data/cars.json' We can pass the result directly to a chart: - url = data.url("cars", tag="v2.9.0") + url = data.url("cars") alt.Chart(url).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q") """ - return self._reader.url(name, suffix, tag=tag) + return self._reader.url(name, suffix) @property def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: @@ -318,7 +308,6 @@ def __call__( # pyright: ignore[reportOverlappingOverload] name: Dataset | LiteralString, suffix: Extension | None = ..., /, - tag: Version | None = ..., backend: None = ..., **kwds: Any, ) -> IntoDataFrameT: ... @@ -328,7 +317,6 @@ def __call__( name: Dataset | LiteralString, suffix: Extension | None = ..., /, - tag: Version | None = ..., backend: Literal["polars"] = ..., **kwds: Any, ) -> pl.DataFrame: ... @@ -338,7 +326,6 @@ def __call__( name: Dataset | LiteralString, suffix: Extension | None = ..., /, - tag: Version | None = ..., backend: Literal["pandas", "pandas[pyarrow]"] = ..., **kwds: Any, ) -> pd.DataFrame: ... @@ -348,7 +335,6 @@ def __call__( name: Dataset | LiteralString, suffix: Extension | None = ..., /, - tag: Version | None = ..., backend: Literal["pyarrow"] = ..., **kwds: Any, ) -> pa.Table: ... 
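Taken together, the hunks above and below leave ``Loader.__call__`` / ``Loader.url`` with only a ``name`` and an optional ``suffix``. A minimal sketch of the resulting call surface, assuming the in-development ``altair.datasets`` module from this series (names are taken from the surrounding docstrings, not an exhaustive API):

    from altair.datasets import Loader

    data = Loader.from_backend("polars")
    cars = data("cars")                   # was: data("cars", tag="v2.9.0")
    cars_url = data.url("cars", ".json")  # suffix only needed when a name exists in several formats

After the revert, the vega-datasets version is pinned when the metadata is generated (see the following patches) rather than selected per call.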
@@ -357,14 +343,13 @@ def __call__( name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: Version | None = None, backend: _Backend | None = None, **kwds: Any, ) -> IntoDataFrameT | pl.DataFrame | pd.DataFrame | pa.Table: if backend is None: - return super().__call__(name, suffix, tag, **kwds) + return super().__call__(name, suffix, **kwds) else: - return self.from_backend(backend)(name, suffix, tag=tag, **kwds) + return self.from_backend(backend)(name, suffix, **kwds) load: _Load[Any, Any] diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 37357ae5a..6ac13695e 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -14,7 +14,7 @@ from functools import partial from importlib import import_module from importlib.util import find_spec -from itertools import chain, islice +from itertools import chain from pathlib import Path from typing import ( TYPE_CHECKING, @@ -25,7 +25,6 @@ Literal, Protocol, TypeVar, - cast, overload, ) @@ -63,7 +62,7 @@ else: from typing_extensions import TypeAlias - from altair.datasets._typing import Dataset, Extension, Metadata, Version + from altair.datasets._typing import Dataset, Extension, Metadata from altair.vegalite.v5.schema._typing import OneOrSeq _ExtensionScan: TypeAlias = Literal[".parquet"] @@ -148,15 +147,13 @@ def dataset( name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: Version | None = None, **kwds: Any, ) -> IntoDataFrameT: - df = self.query(**_extract_constraints(name, suffix, tag)) - it = islice(df.iter_rows(named=True), 1) - result = cast("Metadata", next(it)) - url = result["url_npm"] + df = self.query(**_extract_constraints(name, suffix)) + result = next(df.iter_rows(named=True)) + url = result["url"] fn = self.read_fn(url) - if default_kwds := self._schema_kwds(result): + if default_kwds := self._schema_kwds(result): # type: ignore kwds = default_kwds | kwds if kwds else default_kwds if self.cache.is_active(): @@ -177,10 +174,9 @@ def url( name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: Version | None = None, ) -> str: - frame = self.query(**_extract_constraints(name, suffix, tag)) - url = nw.to_py_scalar(frame.item(0, "url_npm")) + frame = self.query(**_extract_constraints(name, suffix)) + url = frame.item(0, "url") if isinstance(url, str): return url else: @@ -213,71 +209,6 @@ def query( def _scan_metadata( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] ) -> nw.LazyFrame: - frame = nw.from_native(self.scan_fn(_METADATA)(_METADATA)).lazy() - if predicates or constraints: - return frame.filter(*predicates, **constraints) - return frame - - def dataset_dpkg( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - tag: Version | None = None, - **kwds: Any, - ) -> IntoDataFrameT: - df = self.query_dpkg(**_extract_constraints(name, suffix, tag)) - result = next(df.iter_rows(named=True)) - url = result["url"] - fn = self.read_fn(url) - if default_kwds := self._schema_kwds(result): # type: ignore - kwds = default_kwds | kwds if kwds else default_kwds - - if self.cache.is_active(): - fp = self.cache.path / (result["sha"] + result["suffix"]) - if fp.exists() and fp.stat().st_size: - return fn(fp, **kwds) - else: - with self._opener.open(url) as f: - fp.touch() - fp.write_bytes(f.read()) - return fn(fp, **kwds) - else: - with self._opener.open(url) as f: - return fn(f, **kwds) - - def url_dpkg( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - tag: 
Version | None = None, - ) -> str: - frame = self.query_dpkg(**_extract_constraints(name, suffix, tag)) - url = frame.item(0, "url") - if isinstance(url, str): - return url - else: - msg = f"Expected 'str' but got {type(url).__name__!r}\nfrom {url!r}." - raise TypeError(msg) - - def query_dpkg( - self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] - ) -> nw.DataFrame[IntoDataFrameT]: - frame = self._scan_dpkg(*predicates, **constraints).collect() - if not frame.is_empty(): - return frame - else: - terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) - msg = f"Found no results for:\n {terms}" - raise ValueError(msg) - - def _scan_dpkg( - self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] - ) -> nw.LazyFrame: - if "tag" in constraints: - msg = f"{_DATAPACKAGE.name!r} only supports the latest version, but got: {constraints.get('tag')!r}" - raise NotImplementedError(msg) frame = nw.from_native(self.scan_fn(_DATAPACKAGE)(_DATAPACKAGE)).lazy() if predicates or constraints: return frame.filter(*predicates, **constraints) @@ -491,12 +422,10 @@ def pa_read_json(source: Any, /, **kwds) -> pa.Table: def _extract_constraints( - name: Dataset | LiteralString, suffix: Extension | None, tag: Version | None, / + name: Dataset | LiteralString, suffix: Extension | None, / ) -> Metadata: """Transform args into a mapping to column names.""" constraints: Metadata = {} - if tag is not None: - constraints["tag"] = tag if name.endswith(EXTENSION_SUFFIXES): fp = Path(name) constraints["dataset_name"] = fp.stem From df26bc23de09102175d4afee25255bf354c19760 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 14 Jan 2025 11:49:39 +0000 Subject: [PATCH 150/201] feat: Source version from `tool.altair.vega.vega-datasets` --- altair/utils/schemapi.py | 2 +- pyproject.toml | 2 +- tools/datasets/__init__.py | 6 ++++-- tools/datasets/npm.py | 5 +---- tools/generate_schema_wrapper.py | 2 +- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 3a49b928d..a6e5464d8 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -1684,7 +1684,7 @@ def with_property_setters(cls: type[TSchemaBase]) -> type[TSchemaBase]: ], str, ] = { - "vega-datasets": "v2.11.0", + "vega-datasets": "main", "vega-embed": "6", "vega-lite": "v5.21.0", "vegafusion": "1.6.6", diff --git a/pyproject.toml b/pyproject.toml index 7fba9a9ee..c582fd1b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -102,7 +102,7 @@ doc = [ [tool.altair.vega] # Minimum/exact versions, for projects under the `vega` organization -vega-datasets = "v2.11.0" # https://github.com/vega/vega-datasets +vega-datasets = "main" # https://github.com/vega/vega-datasets vega-embed = "6" # https://github.com/vega/vega-embed vega-lite = "v5.21.0" # https://github.com/vega/vega-lite diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index a4ef8f833..c9f35ae7f 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -131,13 +131,15 @@ def npm(self) -> Npm: return self._npm def refresh( - self, *, include_typing: bool = False, frozen: bool = False + self, tag: Any, /, *, include_typing: bool = False, frozen: bool = False ) -> pl.DataFrame: """ Update and sync all dataset metadata files. Parameters ---------- + tag + Branch or release version to build against. include_typing Regenerate ``altair.datasets._typing``. 
frozen @@ -171,7 +173,7 @@ def refresh( print("Reusing frozen metadata ...") gh_trees = pl.read_parquet(self.paths["gh_trees"]) - package = self.npm.datapackage(frozen=frozen) + package = self.npm.datapackage(tag=tag, frozen=frozen) self.write_parquet(package["features"], self.paths["dpkg_features"]) self.write_json_gzip(package["schemas"], self.paths["dpkg_schemas"]) diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index 8f9182c45..99d5fe5b0 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -142,10 +142,7 @@ def file_gh( with self._opener.open(req) as response: return read_fn(response) - def datapackage( - self, *, tag: LiteralString | None = None, frozen: bool = False - ) -> ParsedPackage: - tag = tag or "main" + def datapackage(self, *, tag: LiteralString, frozen: bool = False) -> ParsedPackage: pkg: FlPackage = ( json.loads(self._paths["datapackage"].read_text("utf-8")) if frozen diff --git a/tools/generate_schema_wrapper.py b/tools/generate_schema_wrapper.py index 3177b56cf..94ad19faf 100644 --- a/tools/generate_schema_wrapper.py +++ b/tools/generate_schema_wrapper.py @@ -1394,7 +1394,7 @@ def main() -> None: copy_schemapi_util() vegalite_main(args.skip_download) write_expr_module(VERSIONS.vlc_vega, output=EXPR_FILE, header=HEADER_COMMENT) - datasets.app.refresh(include_typing=True, frozen=True) + datasets.app.refresh(VERSIONS["vega-datasets"], include_typing=True, frozen=True) # The modules below are imported after the generation of the new schema files # as these modules import Altair. This allows them to use the new changes From 9f23ccdaca6fa3b9e2a5e6bef40dbed4fb8f0ddd Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 14 Jan 2025 14:08:22 +0000 Subject: [PATCH 151/201] refactor(DRAFT): Migrate to `datapackage.json` only Major switch from multiple github/npm endpoints -> a single file. 
Was Only possible following https://github.com/vega/vega-datasets/pull/665 Still need to rewrite/fill out the `Metadata` doc, then moving onto features --- altair/datasets/_cache.py | 12 +- .../_metadata/datapackage_features.parquet | Bin 9189 -> 0 bytes altair/datasets/_metadata/metadata.parquet | Bin 19296 -> 9208 bytes ...ackage_schemas.json.gz => schemas.json.gz} | Bin 2483 -> 2471 bytes altair/datasets/_metadata/url.csv.gz | Bin 855 -> 858 bytes altair/datasets/_readers.py | 5 +- altair/datasets/_typing.py | 135 ++--- tests/test_datasets.py | 9 +- tools/datasets/__init__.py | 149 ++---- tools/datasets/_metadata/tags.parquet | Bin 6289 -> 0 bytes tools/datasets/_metadata/tags_npm.parquet | Bin 2598 -> 0 bytes tools/datasets/datapackage.py | 2 + tools/datasets/github.py | 490 ------------------ tools/datasets/models.py | 163 +----- tools/datasets/npm.py | 46 +- tools/datasets/semver.py | 76 --- 16 files changed, 106 insertions(+), 981 deletions(-) delete mode 100644 altair/datasets/_metadata/datapackage_features.parquet rename altair/datasets/_metadata/{datapackage_schemas.json.gz => schemas.json.gz} (88%) delete mode 100644 tools/datasets/_metadata/tags.parquet delete mode 100644 tools/datasets/_metadata/tags_npm.parquet delete mode 100644 tools/datasets/github.py delete mode 100644 tools/datasets/semver.py diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 0cbb7a251..3e4beb82d 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -43,9 +43,7 @@ _T = TypeVar("_T") _URL: Final[Path] = Path(__file__).parent / "_metadata" / "url.csv.gz" -_SCHEMA: Final[Path] = ( - Path(__file__).parent / "_metadata" / "datapackage_schemas.json.gz" -) +_SCHEMA: Final[Path] = Path(__file__).parent / "_metadata" / "schemas.json.gz" _FIELD_TO_DTYPE: Mapping[FlFieldStr, type[DType]] = { "integer": nw.Int64, @@ -118,7 +116,7 @@ def __init__( fp: Path, /, *, - columns: tuple[str, str] = ("dataset_name", "url_npm"), + columns: tuple[str, str], tp: type[MutableMapping[_KT, _VT]] = dict["_KT", "_VT"], ) -> None: self.fp: Path = fp @@ -253,7 +251,7 @@ def download_all(self) -> None: stems = tuple(fp.stem for fp in self) predicates = (~(nw.col("sha").is_in(stems)),) if stems else () frame = ( - self._rd._scan_metadata(predicates, is_image=False) # type: ignore + self._rd._scan_metadata(predicates, is_image=False) .select("sha", "suffix", "url") .unique("sha") .collect() @@ -338,5 +336,7 @@ def _ensure_active(self) -> None: raise ValueError(msg) -url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL) +url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache( + _URL, columns=("dataset_name", "url") +) schema_cache = SchemaCache(_SCHEMA) diff --git a/altair/datasets/_metadata/datapackage_features.parquet b/altair/datasets/_metadata/datapackage_features.parquet deleted file mode 100644 index c76395167255bd0c1b8c374e51ce292a7c51de48..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9189 zcmd5?2{={V`rq3;&-*w>;+PIf=IQuM$4tnSvFJFCc|M|0<}pLYhz6n%8B&obLxeO) ziAwTmP%0^Z@2H>``q*B>~{_C?|s)=?^>zMO%Dj!SNT~I!&-lZuY5@-UVE%fp{sw>swV^pGi>m$5@u0U| z4UU>KPklq4vv}BOTE7>!(L2}9IB%3mOqU(_NbkfLz3&z-*qMEO*NqBg-WG$o0uKT0 zUBW1u$rN?2-m>v5b!^m$-el5o{WDiAXeC>UD38*cMx>loiyg@f#7^Qu*lP;qO zy3&}D%H5wqU)H$0vq(!w-_Kz224lsuIQk0*awWr$OmlTy45N*kv*6C<=sB*O^+%Z6 z(9FJ*9epVO_K2$YqKip1!dZHCfFbmb7RRT^V9NX)ai|x$bm?EB%pc)=y%_}eM zjPI9vN7LhipAZhns&EpGH)OoJDTn5D=k>{F&GpvW5EnI@q7S<8v+TOi zG8X>Od(d8-zF@I*5*68IWKI_X;EhxREJn8WmVvi7xDx-BbWIT*wANdMG-2 
zhqeToEjiw`-BVRyf3llSiH#Jmsln+-A?kt_MvB6Drs6gxXI#&0ld(5dK6cwzWt;iA zKj$K~c37y2qS6hQ+aMN<5b|h>Jhbj|{P(vO&}}E>s#bjzRrr4#Cmy z5A8CASXM(Ht8xegggcoy_nXnrTtwxogoU-dI+YHYgLTb|mkN8bs~1J8Ob1;=Z(Do? zobXJ3-5$OME&6i%~_i< z{-_t1M)|HKywH=bl`1rc8c$p(EZ3xF`xL}0MLBG`_0{ujC(o#?JL$kD-O z4sXWZ$=E;Cr07OfKKFj_Rq=Wg{+ND8tCEj_pGP9LCrNxgtc0xq2lDMXNiTO9O%Szq zo9~?3zM)y8O6%580qtj67Kz+ch;U%;4qNI>$l@&Z<3*Q-N#^L3iuxU{JXM&|qrG6X z=zje(tlBKXnC!CsY@4`L&lIvg6e!Ko*zhJWHBZ@W+V0&k7On~Xq^g73Y*i`iaF+$M z1a#=b_s#c3_A?cXA1mec4N5c)jEK>;{a`Ae5b4AGAr5<8weM~)>*yD;iCX+wDg22f zuHL@8*pWWt3%0TOv(X>6J0^VIGjd1!wCpB{xjuo=_&kflsyqjE&ODxcyz$Lmv(eo3 ze%O<)HcUKuWZppX`DYe6-5=fv+3quu%nzrT#63X|w-b*LG*Kz<7Ya<$z$WT|=(5JE zDfu;g6tTum|DYP}N%quHQ@!Hzr8h`?CmJk8BE#8v@Vf`W_tZZ>CRkRF3273%Ul@?1 zLORgjV|B;;ljEY&Ub-O7y>n`p=F$D0dK(JOD{{RrdpA4R75EyQ9Pw`|_V0P|aqp$0 zfksk%nOBzE)^-+6ATh*qd0$ndQG+bFqwL7O`b&@C9k!nl8mInYhZ%uN^ne>n3_H-O zM2+E~IY9@A3zbxHHP6!?F#ygnr6q$DBOoRoOkw~Ej1yQ{4{_bs0}fkAB+VRR-QkBG zip>{MU#HbfCGh9K@HIM708I1YY8L|!2?5+#U8^WMU4HO0EzoPi#;_hp!;3IJQU%Qb zU=Ls;dy=YV8|0h_9Axl9$cJ_AR{ntPj_1eP{*83lDth- znE>Di80+mZLSKy6Al1I6&$XDbu0b;n zS#wjzUXjaa8TG!nnWXkgD9cM-*)8Quh zmrnVgxdVGH3@I)h(7@{b1cGP~@pj2m&J~!ygV8NKUeHG@qy@N-vMA1 zpAN?NAP0QI9>V=DHVl0f0HUMUu;~%s0|LJTPm}mvaMzK-kTEO{|!8G<8OlhWK}(hu=gZSvLEpq{w1mJ@oAs}d1xH{!)cPn ztRx3)IAg=hR##3f8ih`;?ho!HGTjXZ1HeG=NsG`rly#hCYEozRM31J%yR)5{M>Q3< zs~0QoDWSFwH5j;Gf9qvqoxHic-n*pI@$;9)#QC}_Roq{&u-Vvcd%0V3BuFT2EXX3n z``S24=(EQHpIQ8L{o;_BOE2%xX>Ji^HW9HM5heQej8QKby-VJCdS7udxa=w#7{6!B z_1y&=A?bWqudrBeZ|Sr6c&{;SpM}XA=g5+&(Yb{BTVQ3Wr=lSs(uRJ<#MiAe2pd&s#ZL=Nc#>N++T%2}X+{~lvC*|hxdN+6X2?-UN zab#^h#eC)&H?L!Hn?3z@7Pf#^=7vAYcXkwVCfW}qq1#KS9RUE#P&paql7Fipw&t00 zVMhDp`41aAs5G$VTA{79+xKWcXIbb|Y}wGxzFo6bAm_^UcL$$j9e1;pncmnODV;Jp zZvq4dTl@lV6z?pqJmJh-+`rRZ~P_TWg>gNe;zv9rdsc4LA&`6oJWY!^Dkl0YBT z)+R|)#oih$C@|1{=jr)PQioG+rw!^_=~q-kl4j^`W43YueC^KG2KNH@-y!aP0PNg-DsP@ePE50B7f;?|^)F?@G;{5#4= zGXpL!Pz@{GD>%$`sz4MVzjM%o|N5 zOIqG)4+)~>Up%f5y;FAPgW%Ts4Ta*yT9*yy8u9v+Q#(I>Qrn`#_a(mTDECdQUFXY` zY%AV;65;Dj!IyjEr1shD*&HA3o*mNR)f!or-QLl6r#GR5Ey8s(Uv-J6k$iBsALV{) zky`By$s*;Pt+!@0yY65M=V^rGIWHSTWE>5xsLUGg#Rr(Qrg5HMz>rj?g_2{L1_~aV z*fd%UW@yEtNp%fW?1vX6x;$ zp7)f_ItXR_dHMS3R+=!8B&Nk%ZSM2x-I6#h@q5u_idvU51``e)q{Ot9vyq){OX=mP zMbL}H`?_^ma21}vn)+o?qW0o&*o>Jj>YVM^Q;Do`z9WYsPX;jFx3l*}nM)>r{BY61 zjXq8DYlMR@OB~oH6y)^4o71j!Hh&xW=A%C*^d1hqGIP=B6Oxi-N{ZW9IQb;*16plg zKBKgWx5eGWz9-k8H9Ctip05XWTPBI5VW+6$T491yvO!<4TDKEt-E|+$Fm=3nntP<8 zYTdz)#iiZGC&D{5yDA##Dl2Nl(#{$;)pgIgJ$owmMW7};yfxe%Dx7FsKej8%$+S#1 zo?+Wf`LAWq%5Ipud!9kH6@3(o*E?A)-oJ;s^HSEtD&rsmw7d>-q^i! z=hR1wV%~*!i@L`RtOKkoP6i1**^<|BL$~V{nvwru*r|3KTD$tvEi$r7shjXeK zJ>BP>+<6>a3jW?HpUHKq`VQsv++SYzO{G6cQ0k~&JW;p9C3$-thq_z$DMx+XwlnYX zZLg$nwoRyKysu65w#w=yC=AS>6|?9rjY}vHyzeo|p3AII$?HfbUfH_nI{%*CL*p#P z_vV!I8TsjL@dr3stPkAZP;ymmfZDu5hym0&%E6_#E1u&q{iOqJ*fu!f1Q=o%;LK~5 zmm?+_5PWG*1E^F)z;|sXU0D|w7&tGh2OO;h!FWI$a7h})Qyop>a2YlJ`P-O;ySoZe zg{-WCCn+ks%e%>uT~(DxB)q(myqqf@FY7KxR3R#nRETH^Sy>fXyb|731+J6DE2=2D z5{XK(a38#?yrL|TtSTofuS`~vhdud^z0Iv=?$i{|m3#7(5-OP=7cGt4l>d>zAaC(@ zP#^OT_wE&*2Ph!)grBf%5GCV;S|_nA(KG)Jl_Ul2BC_jc|{ zVv$ian+Lle#2NFx?A}u48MzoJvc!{BPWxX^P@#*CsVFgZ<#0_*jXiEgi5BHx zw1QNG1MN}q3>@*}U!#Hq3FyMT*Hg<_wa^lCtX*XnAJ@O(jWBx`T_9>u8%9l0r#Pr? 
zy^YH-Fv(z8XSw~X9Lr(an%OYsmrQq*_+P~^3Ud_}U5GzAcngm=8*KK+?Z=1@OngAS zzWs@)H=y)<(e?$W?nip7COvP~j@p{TO|=g~2uG8vS!0?5S}n)L_aDk~ot-o8-6J|W zAJ=(i;X-Pc#}{7W*Mb4f`O!d{G0rA)sO*oEIJxefz7ZGhN(#snyoD0a4YZ3m@>JaBE19?LyampQD)H!-ybIpmE>_*KUaefTP9#({6l5Cgc`@C<$C5qO z&^5B4JarL0hfS0d5RZ*L`SZg<-!mj6kfJ6dLvr(%_NKUzeZ4}1rTxhvG9I2Xp=1x@ z($}UHR*j8h>& zPQ@=@K^XKeUj^R{;_t^L2L11ej}@o*QMrszG!?>#V)#+{tWX8GB2QqT{j2qpSSWc_ zzAFOxu5`&2TN8(HYhs$jn%FJ5CMIrN6E8`viP45oGW81Ds2%ifNrtE}(?&GG0RbTp z9{{n4QT@oGivgzmu)9kQD@^qFRZA4wY6T6fyiCi$IW{b>@g~c~?}(1&iWO#Jxv)wl zeCxZC`+sXjlJ%A5Z`t*-YsmYV^*BO!raI@>a>Hn()Br4Y(#z}R73;^7;FhBHLTo|ev0lDD< zX#%(|QGPfye~IKmb-6<*RK^ZS7{Ep2*3`aR)Nee;59}3c6j7{OQoJO-rudI(Dbx(2 zG`OTRFR`Yy8YJIL{lGV&5|l0a}rzLv}^f2$z)yO;2nzzvahg8o+c%aWjDOn?>Pr#|8j{8{Qp zAXtRMN~$mlYyhb;vfb@U(4)YWGNwqk<<1U>NU(zYz##e_us!5yi8#3G4S^uDG(OnG zN)<`o)041e37>-WTk5Q{_@?We6vMj%} z9y-W+82X0WD_Dh+3|0R=9y0X^p1hl{D~$bj{nK5=Q?T-M@VBKn*uyLQx7Qm1)d?iY z;R9Xm4E!CJw$t+ZY=K*Th(}A<-alAxJQ9yf+Y9Nnq-ugpJsi>oNZG;PSJu%U3Aw*m z|6h-HI8F%!lA(bt5fT1A9ugx2l7g+Dqo1$hzX5M)Upd15!TZS9%>cjC!A{PXK+w>{tGPJB$Ee0sjY|{{sw;My>z= diff --git a/altair/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet index 633815d1ff27144689ab23aacff171a20598293e..293c93a975929f24b4ccd533d4cb475df654bf9a 100644 GIT binary patch literal 9208 zcmd5?2{={V`rq3;&-*w>;+PIf=IQuM$4tnSvFIF*Df1D9GLM<3NC{CWp;RQw5FsiJ zK0Zl44N4`YRMLNMWBs4Ld%J!2-skQ|d#~yJz3*D3^h@Z9eOJ^m(t@9fpKXtn02nzG5BiE1}l|FDhua z1NML=;0b#YNFtHye=5%$#!<*I{&rIij`GX<Y52-TJ zywkV6Z+@O?WAV9w>>zMm%Dk1aNT}t)?(v5=)chUt!TgO}eP;n0)TXA$EULDnCj#Gg zo^{lmdFm7VoW;XN)B3%*jo!sJ#(AR*Vw!CKM|vm5sDpQKLC)+OJ8xAg^EMmI6?h0} z?-fSTOeU*y^^}iisbeEg_9VHU&_93Of>yG*i1H|{aahV(^@uXvo5u@X0XkulT6t}e zIf6w)X&HQW3SGyf=iPUs9IO&IVr;p~FCHF}lVlIIdN`{?K69H=DsRxm7fNXx>7Yw5 zg045Dr*QYB)0a2w>nPF^()Tr3yv11gESCQApL8uOtg0ew~1EPcI_>}<(hA^)HSARD$1x=#@Cs(`+yKNs8Kn-WIL69Db zirS?ufo4mJBei*|3S=g^>6F??@tPW(dlalLXknx%oM$R-V{)E!eus>`sq*oAJ}NuR zFa9|fp|#6GRTPzGxLgOZV3Ytl4h)*bjTlU%kb_!3P&C!D(5Z^}&4}dwe>-+B0NC#6w&~(zopdkOy)y|9K!PU-FLrLXER^>*SipDnO zSW6z`WKrYna#3R&<1Ar)IK9>>I?5h*cdxz6-6^I*U82Ipu^|2Ve;VN4q@$oRCR1b< zR@ZVOjRp^pAWRg(Qby9B_gfnWg^TVs&(;MsfE`}1-&e#~^EQWj=dB*zZE&__DGt{D=o^w4cou_s}ncDBm+oyVddU@_M9SkqRU1^S)d3#w#=I;#$8-1S?7Z z#`q&&Tpi&Xh<~9cT`N^+4mF&-Tv(w=&GspfSBg?>y8YGj9jDH#Y&hk>C*+`z?Zs!j zAr^1O-oe;+w^7lJs$%Z_fg9rWCj8NTj#i}~13nLj?@W~VTC9Yv1pD*tIbC1wHJTu5 z?K9sqwR2OGMzz+Ry9KnLX;~z4Qy{{jx%+G>Gr^0q)Q=Zk&Q3B%C0Ew(BJosX%1U~` zDA7#)^Q_t|!kFyxOtvjts^<$?9}1LZX>58EkdmitHf{IrI1AT=eq!}uZMN#<4Y+H9 zSpqurVF%}XBl?&M#*dfr`UEBz2ZTp!+kP;WkB`{R{2>;5Q?>W^Al8vDViUFa3sU%# ziCjIszhj4cjW65Azsc)`FQi217;(+ z8-1~-NH$D7d1T%I@}*}MIb9##2-zMqk<1UHnZ!Lo5491G5j0WB?-vS8Qo%-Q|EThY z8_D@Kd=#;U4!^(}?Me2O5mUV*m&$Ir@|`?uDH0LJ&V%1K2)?KOahPCP4HMF?@OoiD zLWOjIpU3Ko`6uC`%3iu4)xBeCuVzW6r{1PQ^U7TBYu-)Hbp<{Kr-uC+kN9=J_;}!I zNq~_QU&i(2y0w+X=;`&Ro;ANJ{2#r&Jx6F({C3?V(C7K;* zQKH6h(43?L#Dz+$xtiu_j~M_LnNpKLiV+YU2PQHA1;+8LtVg*<^?+gviNu+sth;>C zcVqHJ)Hi4~Q3?FnKQurm3V^A*x!T0Qqe1{TR@W+$PM072ObhgwurX`|Qt={;k5oZ3 z0N4-M$eyCA*#S8x0EZd8P%?Djqf3w`H{j68>7!2XblK+$4af+L+tb~X0n{<)5+(0Z zRmB7Ne#UxzU?Y32mGKA&(gI=(kLSUlL7N)_@TdcTQ6&dSaGagvEPLUf!^TB#qiWEM zcP%>o+8ebu+ROcT78zi}mxR#_uwAPUa-8^Q0sog~kY**|(~|ZDlPSxg@dPs}i4OOQ ze<{@e$Q;;mVMuW)N&~B#6bPb0#M&iGIg^3a=nMj&_Hr*E03!y_VbBa-6c;aFqQ}a< z;)wFWO^6`6Rv`Wr^iPqCS6n`Sjob|f>Sc5$7(JLs+T%kETJ8;kUo7^E_(0UM?EtWf zPY2_Bkp0)O$8f)j4MUdzKvdKkHa%jyf5123`4Ybh4jcYGI5IGl0AjEg(P!QGTav$s zKl;ZjzUK>q&n|ptyf?+)Zyh{y^Dlz`U{*beu=TE*lg@}yxb=_94M4J7HARd zJur?E`s}g5XBIbIzj)WorHA+KIc^bUHW9I1;idZajFB%Gy-VMDdS7=jxJD8Uh}*yI 
z=Dq@s;55D)*IBH0HuqY5{G%cDpoPgB=ZMm&kppUJ$)b-u?@p0(I(#Qiu{S@_NpMqL4^lpZ5ZpOX+aM;G&!xqnx642m!`R8f8@T``(sq_Rwz5Pv2DdU5y70@zYl)2 zQuj$n3q`94IM0vQc;(S?GlKZmL+x>Csax$hH#fWp;o`L8;$|M%I4L)e*So#1S4gPP zj3aCN8RqlPxOp9qwA$0}WMT7fVLtm?#h&&;&IJ4ZM08szwIcvv8LB2jUGnb~#MC@< zE=+Hmy!2snJCz33Tq~r7cISTW=PV1oip`ta*mr8S2;^M9`R?$OtP^gwGSi!zBBYZ? z=Iyda70PaX68|(L?!NtS&4Yo1Dn*}TvImE&A53f&icWHYW9{OL4p3R`%f=zkt$BUmpZ6xrC)g(k~Bki8M9Rg;A;oRY#V1hPLEDDcfWZe z(pH$Dbs*o;v_53F*ie|NhC(A<(`a8u=fSb?mif00J-a4{Y`)$WEzO|G^@;jyZnc4lpgNk)$znD7>Zqg z_ZEd=>1#s~uJ&;otS9caLhVB#m1)N5Q<$g49Tf8EL&|eL`eEUkOx&6jGKTLyk6(Mm zNQVElg`B4kCKP<?Y>@|IO_@0Ly$-tJre z2!C$qUXz}Sy|178we}XbgYU9xzaG@*w&8gzKPn9s1nJLe#NDsptxD#-UBpQ$WZrBl zS=#(o`>r5b{>9@;(fj4+KL~EG-&81WtaZ(Bt^u!4IkV@}C$()ld|%=^OSo@i?K)m2 zXIt^+yArCFPGF#AtiavsHQikH}am-?eez1c>) z>gU~MvkpS(e_p$Ju7xI4B#~+HPOJO8dY2?lOZ<4Wh{4^yIBE7-_R_oVc4 z)WYdS;(XjXEw~CV-AMVeC{cT5D0Ie57j@Bg?5RZ7IN!0O5vTkaNA2u=Q09_JA3t1i zaHCJv{2K1y!x9U&3I#en@aD8@na$rpzWwO83B8ARUzxdR^a@EyG9|`tE}VQ4`vI+X zFrQJ{#M|Qcgx)7NpEWp(FB z**JzBx8+;!^fa(fWcBJ9FuP_64ZOxyGuMgVPNfVU3n@{=*hOc_FKB0uh5MA4@1wi+0fe6mu-`gRZ7`{N0S@R zcBWM#_N%mZ3g(_Z0I`OKOMbi9xb`OmU z6rbBu&gbQ)cf=jyXtq8yx~cSrT0gaUr4R$CQ^LWew>OUCG5ys;Y}i)#)(J2~Gr-rc zSzeClBtY<`Jq@5z84mBY8Fb~HTwuVwtR7HO3xaWgHsF#tf~P7;yl6EVoaJ7Cr2j%oy^b0aG>_TaB{qt+vzI$xBbHcI=PZY-K)s@-hXAG}g$s1^U_C__Mo2+JX?J?$t^xR$^)z6Xen6pt~ zvinZ<9*9qwQl5_{%p2DdYZPC83`XbDGt`fm-hA;W2gf&r>m0DOXn9h|%}*y#WFY8@ zjW&rR#x}EadDUrCWqU^qG)jbK=BI> zrp75LXZs_{wvQI>yL~-t61GVzZ$hQe*IDI&cidsQMH|vN{6W(xk){YW<;%jx?B34Z zsT?*BgJP72o-rHNgSkvl$aA{lqgXS zMk`1~IKUni$G{Oc{xvdCkbo{ca5JTxRSPXK$J$wb<#GKR-f**bQ3av~wV~7$b&7-P z_ItQ=1Cw-y4VF7E$gvdD*35=7zht_v#Q!RqQJAZ+=yF`i;2k{PY_Q1>mx&SYpZI`! zeeV-duV3l;qU{S#-H-HEje6d!?X@+fek6cqjI+@kD*x>iPOfW@Px$5EB?V*(-a-kNhKe^Ck8DgX zqvvYAaX-eBJQch3dd59FZ-I-VN<8|d?}B!=iJflWs8%6bClaC>0x}JDznDJD$C5pD zwsUwvdFl#!4x1n+ARZHQ>c_!C-!nKkfTAWNA~#TDeDLJE6wdai6v#A?AN*2r!&fm}SBg zzH+V(Fno)HS+u396+#higYb*rRID*4Fb%pYhfpftHd(!z!_cc2du`K(YSxRRd0K$)xMudqyw8EVh;$!>uLrCDxL+B-fIeo7a-PQftW+Lnw)Q1#Qv}dbgB@s4&w;G(rCU z!4Mw+v4~N9Po;|iru?wEOBE}0^tVOpJlbk04UDkN&A=CJIK4)mEN8!AJC+Mp=!xab zDxa|4c7*^~uSRa{E7j{)>}C6qL&T~XNJo$rzmkW455VwaMsc~!{A(QGzxa~@>J=G6 zqfR?R$^fpFZ5`%kNqSU*J1jrp49Nqy*$wOR>qEFq6#Ntwi7H0Ga$O(;0QX*)ANnO$ z40Qnk8F7I$0o;HnKkTGGMR%bF+#wXIU>76|;8JjFV&BZ{I^*#jkA<2<1TQTK_KL3w z{$qj)^$8IgT@s3sToYRLlyz0#F;A#W4_In(N$T>ZHK~72DWOgvf+a+xa|@f-1QB)r zfGa{>Mx@%8q%Le(lVV!;&~-b%BY{wZh~VUsV7AG+Q~Vdt!>DH-uoondAowbR``zgq z8~)cfOjML7YgOC$;Bmx{I$2o!=YfdGFk37ak8`p{6M-k3n}SZODVd|e4}u>w+T;sH0; zF+-|#k-V9u4xG}5#R*o+m~b9ZNg%i*uO&6hZxsZ8-xB^3xFJ$c(BBTfED1Ws1XvLM z>Ld2R&r&-A!6FP6QiV}q1xS&R?LHDgj{+CUm?F)V8#^FLf)(5b2GRF`^&wwN#K2W+ z2n3m>{=q6%id^MAJqg>E@F_^UrN%l-rs*IbSgAFl7yeqUjE5Q04i;BM^dl+=1b;ZM zF^PWWAtaB#jR()~GM=k~j;EWUk0%Mv z|HXVD)ACdEp@Ynap--5-f>nsCq3YkqLxvu~lXvqW!PtM-KHXJ31uIVnKU<1}Jv_sI zd%h7+od8!kd;rPLz|V1MIW5o6Hn`?Hf3$@4{e$_&BmTIwypUE)q9(}D!!B)rGim@RE*5!9DN-F$o~esrKn+O t1^cH#n4`UwJIU7|7#?2@08MA%5C2$1!HmKuv!D3`CK&<10{%Zg{|BD>P{#lO literal 19296 zcmdVC2Urx%(my)8EMbXDSTd4xT5?drE;;9nNS2&If*?8PoF!)iAPR~|jw%urMI{I* zK|#Q*+*#09&w0=JzVqGh-uwLTJh;;{)l*&FU0uJfu5R_wh9XE9@+vv90^IOJFudBpOVpQQ1G|)uuWResug+ihcM2Al&- zfI?RTK~ZCZuf}#U{CVzllOHSFL&m6ghd=HcC-9$kI+BcD`?hsXdjf7C-S>5R{Z?;v zb#XQ4*Rz4D()wSE@g-Aoy!)NiukC1lE3w!RAr|XxQ;N2ujclt_a+Ouloa2K8C=HKd1 z8*ib1cfAOoQ~mmhoU~SS_u&>wP_8&qAq6{(;>{JQDZKmG!{;69boLZ*}XJ zxF1+Iq-M1a?^wggspj1Et5+x{b!YS6Upq2&!we-kCEyr7CEi=t3pOapQn&~W7Un&7 zAiuyFd+8lK`%s@MHxoKiCz*Bng6u&`N=%-hb^J%;2ic#PAvt5PXf!FKuZvU!)PhG* z)pq`oEKA{(`cqGi~N9?6!YTR>*c5-gz`Z#5V*c-BPY@k`_uk9Yaq8I&!GU(NK 
zSDsM?p@cfymVd|W{wEREire}haCE!PykT$3-sxOWUGtHwSbl0dFypqI)z2^C(VgH$ z_O2^ZeLoJpy{sB8h0lokLRW|cko`Q!3K9WirVtW=#6#jSN&-9tp{_m- z-i}WGv@j_!`AhFu;S%(KBM%PvoGApsU&W1#v{mGxB~6UA?{kqpm6_;>sWcX#(N=9= z4miWCuP)A(r_F7w*c97Ugw&%sut&x4%6&6)C$W}3vOqDWKcoPp;VUNIMZOhldYz(wL2Nx{rc zu-r>BS}MYX!76eDf|vkz0FtFIKua=$Ly@J;lbgI2J?0`=n!bTUn50^h{U%nk(X=Ih z3pZuV`9|?h{KT7}?(%o|#DsB5_EQ7|1bDI0%aQ6l)T!~SkR)|3D$OCkK1Xo|-I0Ar zoHD!NSOBKhnBR2bFnY*Q%<@6dz*@j*o5_zjcyax#tu8Pa3Y5s_5wJn$wsvqImn zl}^XiYQ^t1)3uGSkQfZBe-xzBR+@!EZ76!m=L`$^SS+T|T&VDV{LCH;y0XSanXgE< zA=holZOB=?)-&nMOEMlRQD0*Yo?NOcPg3h{2>Qlh)-%n;q0sbX-0J0EMYe2s3@k3Y zioDTLw$iW(*^Dfi%Du^XQK3SAl#x0uvUjCHU7v>+tQKM1eLHDBG72iqWK$1Sqo?_u zrCuZZ*BOy;&E#Avd@91C0bYBt;^~`%U^|p|)+8E>mg)1(CC?>4aM655A-@Vr%&9^{ zTv79SQOMD-)f!aFn7AROx&cw19jT%^ zk8+TFv5o?2=Nf9Dy>5&XvcrSYK36hH=sUpzRu|2Bxrty~Zz<4zMg!R1Sb;JmZJr0|{ zhVPf$9YIElVza7)%2T!^C_XVhwpbz&sZvIv=db8&1?M95=X(MLF%6HG!qq-s|5hjs zjOuGaGF_A`9|I*faz$-1am`3#Itc&|K}SpMXcu7T?-X#(%g)mYNlcw7M7<jFFM<%48p=w^lLs=(9KHLTbEn zPdRMOFPgczUU|9S-TGnUh1JX%^wj0SSsjGs)=r4L%G5dc%QbIG!tAJkNCHAYXs}T< zhTp}#4o@xZ{=&J?MeFt6XCBZxTtXy3#|hxk(ULd?2b}W{^zrfb3vhBgk-i0wIu*** zK#LXsr(7V2jng7CiL{Y5zd?)>_Yetiu{*G_#_p3pF`7) zXlBCeJyO)4W6Set4$o(}=Zqct<4qf+Y4<;nM1|Mh9ljW8CvErcuC*|9uUO!F1K4N; z1Q{Q2Q!3_PP%n~6VVdi$)(_$p*O7C~{$gj^h~Ci&x|3s5AA>DfbMs zX$Ps>6S<8$oCK5)&aduTQF2Zh6U#pR&XX5I#L3p0!4k)^G0^p1gN}*!<59NUZsqM; z>MJRytc!^&&f!Jy!?XKiJEC{PwD`Tkz6E?^BkCaSVa$d9^cooIAYUT)o7~ zuSWzeHqI0^EzWhBx+jR6-noHpv#*&Eg)VwHU<|U%**1%xA;uMIQ*htyeHIt`o~S+< z8m7Qim&B5_KHz!(#g?r0=Z_wJiX<*Eb_%?a)>d^ixwD*L|DLOM)3gxTx-%I>_YQ?~u{Y3X;b zD8=Dw6d4v~Tk57-pXE%$DAl-pFwwUz{&B^2+rwImv_~zEF(Xl%!yBKIuMllGc&WF> zf3TR#D{b>$LHPylkhol!jj_%n97==W$1@$z-TYb**#UghRKnwwp-J8c!*8`(A3T>~ zb9a!wq?F}EdkbYi_GJ3eOaoU@wh@ z1ZAeSat!YB5A)fWuZ7(eZWqp3xhT3wTv76cRP_E+WR#tC8WI)g7Zf%Rke76zdqY`;IEj5l`Xu&}>rlIgSs$U-S z!*mzt1Uy&%2|S5^fv1pze-JGJ7C?o3yw3j`o~&>=2Edr_zXDOLxlYBJclnCU-{!i@ z=K3)ncwvp|*5`=c3M?LEsQ(>AjRguE{}T+B=Acef^5tuZ{wEj=LijK{#os{` zIl)=D!j2DYpv;0LZcS2&;e<8Gz{f#Jyohyi>+J7M_2uay=AS8Vj zS7O`v3%&ftTsk&LitjR`q&ZY+5&7VZq|ew{{S~7S-IWjBX#a#km#JW7C={oEy6_jI zYRnKPmHwU}a8}QiOSwe!E@jdRp2~{sMX?Bw;44QrKFA!E0jC?JNmDDt1X{LydSV=Y z=>@5#VJ|AM(}J(Kc(JwM1O$c5si*TxCwj$5!$?s*G%DkG*yre-*ccRx-FA5-k9LMSPza6&rtRwv4LkR`GDP5p#q z?lVE4P*Ejk@w;i+jV^;kmda5a;?yv$hze7E8jd(#@i&-RT-Bu(90v=a8EClUcr~h1 zd})H50pTcwE>&&2JH$u<&|v5yw;(VaV(JGhirVK`d}Vr)$Ji^1r`;5J&E1JCVUIeZ zE^d?#^!u(5nB%$YBvKM&1(DXhWs$7Ly&nNLDTMxovT$ct52rsMHWfqto{lS<0Si+9 zcD$ZoZNqW{;FcRpo&e}n%i;L{=Poqh+h z2!){UB@uG+ay5)F0zewp%<;+eJ;mJF5I=`L*S@_>lwjB|TqVU$2F^Kndw96|yLx;5 z1+1?iT#=Ih2&{>E$g5o1$o1dAI!NKE4OpJK!a;hy7Kh^!Z5sQQ-+&$A6IG}hR253a zW_%OMt3x%cL0WrC=9(sN{PXMCR&u4~u2;DV!a@pzr^2q$Tq>7vRVml`G&orxl!T7O zNbxC5Vrua&YwJ)odt@L2W_dE%J|OPS!K_vll0#O6T0*m&&0f1P9T{JKPmrtefH6IG zH>S{^316%nhNzOx;wZN>L)Ma{^~&_yqgLG^lNirdp)V5eY+tx}4zmzu^n8_$ zL%cE^5=l`lv9xpB2;I-W6g;C4Q#|xFsmW!#Z9enAOJ#;tQX(02TxOWx_A&@wz?I&N3; z@6TkNiPcd!K+^acFzDQTH zALDhd`6?;j&ATr@CS5jb!{5tl4Hw|83#Xk_SNS}D<8$E~YKimOYx+mhDSbRv9(tbk zZT8Id4QuzhKg#!&F!HAGr*bq;FJxtm*>z5THe8gf9zLDSKqh=~_og_kgF-|p{T(O6 zv(e4~WMgFDb6vI-O+&-F=Uf?mlhjcUjY#2*ZqbQ+uWsw7$ouvt&lI@L3@z=>pU!y> zr`MGht3oL4v#MV2u%iP2w6w;~vT(fjk7J)ng|>>7#eFib{Dh7B zHVF?ozeH9}q;OyN5uRfU_t==Nnz`$y{;cadP0Dk4!6F}ZlkB-88XO({io$~BAoy@D zfA%})D_X)7TsF$ys)l)K~dj@izx>Kd@ zWj#F)DJQC!wTNZOCAv_HtzK6bR=i1mhxwPe>&KHOYm&%4o+9Ny@?{@NXq4nQ21L_QslS#aGkGbhT+-~x*hQ~;tEkr zwKzd;59z|Sq;oyznh-jh+f2kn3!{mZ9DIRAd0)O8*jyrexn2?{U~{E=l3Y1@;M4Xi zvrp8kJ&{q(+qv4~_l}yYpzP03U%tI_YMsI}8ha;itm{B{vnp&f;70b=3o>%#>v-w! 
zPnP*#_cIvQ@A1!UHx)A9z8D&;&313vy;!Z7r9ISuIXh;O(}3nkGdoNb*776i$6=C1 z-n!eh{oaxX;b@IF)|GcxVlh0LpT=!pWD7OtQ1Bz~LDR(90wAjJ4P--kGZ|Lw=;sb5rKOh9qF- zWRY@LNiW|1$eWmM6qaw-<`=v^CHp9SUP>m_kLQ}ni>*5wh0V^7m58)*D8~d|W_FT1 zw$otEqY7YYqavy1Fj-18r5WUKF7oI*94U%L_qJ?jCWpm9O zbqwwXddv9b;^un5l@szgokJ9&)CV^LqCVBg%-^9LXAmb&Z z$K%yEOGyh#D)aOW<~!IgjL0yRyK8f|2~o6i!}DDCJwZ+i{rOfYHOy#_rZwd~(at@? zk1yU{es}>9M?|7@x|uOrDC`qr`TfN;tB{6~+WPldU7`|W)}2Cf-ckGa_1?V{l_(A^ z5wNDEvHxKVrwvVf!o)s2l^8dhz|H5JL?>mJ^wkOFlia2`^0BNruW5-d{`Cxzc3N+q zhl=ZCJJi&XkaYM|_x1P(o0kZQKE%yhSnTQ}@3;|&zhby4Kl1oF=To!Nk=F7o1^tK4 zhLe_DD!hEdjJ{W7$18Q2${N>J8C?2#7aoM!YCPb6<8_+J?E-69zl^I>9o+86LrdiT zTS@*5&3cK(CO+rE;ZIG2zE^{va;jX++8Mp#zYt`{Ip`mR6H1V~cuhKy%JR|01^Mek zQVg}}+p5Wj8bRWGJFfi_p<4t*N^WuGcOOtp;d+q<`g=naoR_-dJ(PxPre_M9T6FKo9^hHXu>)EX3zA z=)DE{UuC+UDG0H>3;>Z3Vitg0QE`w6AW$qEjBkYiEC7VD!5D-63d#n+dK=oWfk;VK zB0R=+;wOG1`^QCA9%pAMJ1HkgDH#WGNoP?<5hr_T2?qxmQ3+8Idl?y&vxuFPorHsw z9W4(EC54iakg=Bn-=Sp0r6la_>?BZN9vNv-ag?2tv%6~x%(xA3p;nRZc%hbuH;=RZ&8&yC!5{0q8W#Uzo%gqo*}o4i)K;7< z)aK5NXD^|I0u)e`yk~_QY zRyo9}s?;?9ew|T!rK;Rz&GCQdc2emTQJG0&fSytbRdZZ|`QqUuP&G|5Si5Af`iNj`h7sJo&p`Oc>ZUs7~)a%LqcLy!VFJc{2%wdZe z&+%F>h0|=qMhr<(+fUzg+IIr$v=#xyI)0|AVtU>=Y zG^ri*PvQ|SONpg&*j|hT}Dv|%tFGqEAUs2cm8a~5#I#GTu(|IAv^FuN+ zYI4j1vZrOex*Tr=6!Jd=m)BoRws1X69+4nyS!aB`eeAySz3Bwq*J-<}(hBXxk4NDf zD(BVmNsHz>@hXY^(?jK7Zxe_$XOUp&NY$;n93{(&{b%>b(v7EEkGYPjy$Byl{o0w(F$#i4Gu2f@|>m0eP?Kr7LItUUemee%7>O<`nbFp?3#E#-Ul3t;w zjs#QEQTUyyyo>8g@z*K_B2PV8SsJzjqN!h@4^E$Bpv+(DdvWa*!QE)sRYe8$1>J;~0)3niN)|xL@DiR9BjhVC zFs4aLzyzequ)~+7AvypM4j7}Fp;xStHt|3)fh)Bz9%T78Ql1j97$o;NgYR(GdjilHC-VT_ssb>PT{F~p3W4APoCIs1Ab!)v_fL%!nDSjPCclZx z=K;wgZh`{ZQjCjKK)_=Kdl{S_5g=Ox=e8VxmrmKv0ef*9ICuS~7LA3<41 z2>38U8OeZrWQ+jB7aw@W4bAsqUMR>D0Pa{UhC#gW0Jn>jpxFih_XdO^^X>u(qMz(h zd`4Md^9`AWq-Dc^^cX*#6_1XfkK|%a22$-@Ct&jSpTL9wloN(!LrVa*!msN%TFeUf zWCp1I4+euGFzA=^KU{E(TLZxWbpi(3V9-wmyWptv2N=lwH(>A{1cSn#QM<|HwJPj_ zW8EKMQ2JkkfgKhMv`@ew=lqbL0d~Q0W*_b%SYDws$?ey6E5FY%#8iOBNwqynQttBj z*uTR9#90sv-sFzGZ!s4dPsMwBaIt~aAaLCE7z%ch@BRe^79bShc!tV{-cf3h^GsQb zg0S8Oprwr^t5!1)*rrq=pOGIfg-OUlV4IGWP$VA0^2eb)&ZfaT%E zzJ9-?qh6mt-;_=iVa)V{;`~sI6kJMeW6E^OQ0{tU?e*y3;8JcRJw{T zPaA;u7MgfT(T=zDRf}b0SiT^Nmj>Dqf1eC-p_3kE53L|BXV9x&a89GHH10jbg)x;r z2#9ALya;Jy9|5F7qN2&`1D+CV6XNz*v=g`vRPO2y;MkgT{D{OeXRH~$PD8+?nCkkK zXM3|}p{{_WDOn`MZYx3@Uz+q^_andgzCRfq!W%Krsv+<%<#!h`~X18DJt?EL(^FMuZM*gDzZ#1R4OvuN%}9a%Bd5tG-) z`FoAc+t$Ra`Bt4nloqKlZ^eD>t}FHza_~{q2RKrI6d(=weAQDOnbH`Qd`&Np6JYdS z-t&^+Llj2x9PeJ>qUy!jrk798ZND|_9HpbVn%%6zMj#*r0E!5@we-ce!g1bwiz+5? 
z_9yeMZM_7P5)U6eK}^}n7y5D=IbWoId+F8fmT)rjMg7&OL`n%><{Nq8 zkIq?|KMFd^PNX-poPM}vG05&`xTbxN=xY6rQRm&<%&d3JH7$ly>qo>FDBc>8awAw?09!&*2 zr2^F}8j5^-vM~SGrWL#(Q6U*26fJ&`h>!?M2=!CLSfT=P0E$HVQz)}|904ZBMtnR1 z`eYh%a`L(yHkD*5UPF#LPKlidI{>K`t})c)CjE7SYrDkne27G}iID?CLc`aF0~bf) z7Dw|I$7&h*)h6yTib(lOT=18<o5;k2u)-g(KUD-r_0U(JgXV0Zqc zZAS^IpTf8bvya>M?~;Nw8jWE<7M}aZxW1B0K!A_Gtgx_yqnD7IzoV0fYmlFims5bS zmyf4#kkffP!Jpd){1s1XFN}Q_{)Y&p;{SxnNzEigoPt0;>r-Y5eVowDWpfMR^{%`{+}vH5GHK7^VOipLE1;`zShANlOy)z& zfN`q~8K;EPw1)1pa-l{hEmvQR9``64eB;T?9h!KEHg{ehv1|^YnmuLiyzUY)7QLQ@ zv`U^mtN&E%HK(=KQXmZjrgVwv&%MpqdfjB$<$(Uo=hc zz|5OUZhg`p$}eh2wr_fpj;X1r=O11hHoaFW$q>cYU~raAlwH&?lS9KlD=m{iz&`mk zgd(y+J&{!r6$`N1vOr;M>Ii5Y3#&ds9fAH=h_b}?{JM$++JIKFiF7=X2}u$9OqI{E z+g*e2U;?Gr*pNxtUt5Joqbz399^6*a^f^YN9@EQKuF07MF@E4MWNEB}=G3oE9;Bkx zGa-XzI5kxbKDyEr)jMfxQf6EcV&aLSi8pz^h6XH1r<@0q~KFIbx-l=QL9H1~!V z6Gp}d{=f?@G&cb>9VCd7ZJj#X3=2ryObJ=k#V;e%682mq99}+il`s;wbyMj`I<(vo z<8*3fN_X*;5L0?*8Wa7RD`Fr|luII`StPr2+@x`PgO6C5@+?V3O-r-;!mZ}}(MO*r zra2}JFI<)j7bW$U;+LD=4xis~eE7IckyVCun2cwH?!1(1!n%UoS&Ew)Wj1#{ zyfl&3ZoK$`@V>($kI~j@@(~f8;P>$;bxQf0jGse#v@AX_{=48>A+f+=u1Yz#fW)q(#3o00+;=aSD zY=Pd@#SbGOFl4^OTQ*t2G}5p zNdZ4RI7x4ZpJH#O(E948=5a7lsI$S^VFfTa z8k32*k7z-#!0vy6a9bR{;8;w!D7Q}crF*rv{OB{>ZOUYOX>E8$M=hj(to@^p*t2Da z>;$Xp6AX(UJ8h%AoVR74nQVW)QFz`yb72j0utSczcAnj80`}w!lR{;4u^{UuiTLxG zi>%x|NkI{eWOd}ilMumB+4^f#d)<4Vv#Rkd+2*e5arxIA@Z`vpRgF`W&CU z%%IWcc?~_u4rMQgi-I}a3oiveCvPryQ_XCxGhIpgz8GIfCYVSaXEQe$&m&_~vwJi> zQ)71DTW`ZB7wy)xZTCW3TtPeH_5t+ZeC><0p7;yAZoMmabE`CnjEvb^v`BVkieV*QV-d%mAU zs^m?`zn#UCmDO*3mD!DUa?d81d)dXBQ)yVcPF`I@0zgNIDJxVaAOO>9g_6!Q)$4^f zh=E-KxC1QRk7a_Km_Bm%q&pQvs7|^U{DpvBn)LyNm!+$nk71E9E$q#jx zeG6UiT&2+q?@Y*{&`W+{8WA>_Uc(J9P-jXj5)ID%{$YB;#mxOBEt8b>PU={@B!cw9 ztn&2j+av?NOm2B$!r@u33zs)tbw?WSKSYu(mP%-5+jbqW8cV56(&A0SOFDMxLLA79 z?fJR2tk{3JI0~E{>65T=YgZ^>naDw0tSK)|C#N21>qw{T#=u|IklrCVEARzUg8-&e z$tg2sCLnp6ldri8pHL7p=H5Qt4kiE9MZbpNU-wJIedfTJoe0mcbRr#<8Y^o$B{`L5 zFAVHS^!KQIT8gYJNjwU576eQj_LzVoI(CUXI#!WYk>Jr{IHS*NjVZoGexpYcH`{~+ z{8i;I74d{P*)C+7a|&jJpMB;lTOIYqIgL9`Wbwg=$I|$jXp+qva}kCXZXM<4F+^0e zIqHO-3mP;q!5-g>IM&80GX@2N$$NFPEgBy9#?FqZyDY>c0;p^JPO*1tIPe3WZ^!To z4!z}h&tpri+@EN>Ri(PbNzg+9wUnTmiJDe>74R@ofQXobxQ&%9X*uoli`tbAhUPS9 zol-H_U+L9E)?F8EjiLs*-^w1%8@qq zg^p-bEVcJyn$62Qp`MIlKiOL)v&d0EZP22l#+vjevXuZ)U%gJzq1oBR9 z@4s0;jm^;!vn_3f;&LAppO}ROrO$$nZEUC>xh5fI9h;=$D6WA|85x<(f{N>vffB%? zkT5EkeAJT3JDkw|$=Y(*^oEJho1)hn8w}5zEz%pq*0=l_UH8p<_QfYj`RVr!*2D=c zF6~p&SDdX&uAY9vl52TKjICAY93C?&vTT@KFRybq&iD4Ir=GH>W*h>9HkAukS{Ze6 zQhnDU3}t5P4-9RLeTkZ6HEsNhzbt=Dxp+^c1JZ*fV2Vw+tv+A0wieqXdprz>U=#PFvh0&p*#lYuJ5H4 z!~QPhUDVFY#d4J-au(tT;3y<`JLJ*QvNl84K z=e`yWkj>Rp+;F{8>A@mGnmq1hIQ_6<55mDh*~O#06O#+kN>xenp$fi?(^T6#K&)>7 zJ@WvFqkt-CVjfu}qtjV=sI*h;1_>C(g-BBT6kGQkM7yV8a}-N~+6fqvH8tp@BrcBK zCM(-2mrB_a*!BXMVqOSxo`fwfo1nLW!Ud1eiux(=q^8pd6;(rry9FM;d9`WHjxqF! 
z6sHiN%S*fSwxS+Qkw}FM@oEUz0%{%Ek zQ|tEHY6JvL;b*8t$0O}ZR7wE;Fl2u+#)d9R`?D>5KL>qi zq2nDb1C@$4w2@&(hMNLcS@8{N+9%FcW=rwIaR??CBMqA=r%Ih&q13NVSp>zr7We|E zRerpsC9g`3(a0G^kT3K?Qk@d>XF3wN#OGheBr`pMv=+ig>VmcJ;A_!---%QY4|0 z2D2tAEWCB&(zadfezNsZ*;I_`wD9~$O_Ebh?zWZ~?(qT%Mhqfmk@kooF=wF6bYI{K z_t0AY+mCtXvB|~-Nfh^&gj@AzS3)1^0c0e3%SKkoaeZ05ZKiE-bup57-a_^2ICzWu z<2NXC<2e<76S6Tc3i<-kY0T1=Nm-P7m}9v_?6orW*Qwco71>5hpRY!7n zi5agDngL3RiDG<4s?(!zURSt>)J%Taex~aiP*Q%L_>sVEAH*Ew4=d%XW?= z)!BFWBb?2lI4z98hbPS7hQv%#Vj)A)rCOghU(v{t~yL3A`{^OMh}*I%1A~*RZm>1wCcg7ELt`p=h8iMR34K z`Uv~ref%(iZ#c?2Q1>(JGHY^ONr;UoZyt>BE{6{kl|RR<`95j7U%v>)2-d0rZMs3T ztm6EQE>cUM4RM*o04Npt#ED2h`n;4+xDrrDIaAN&ZOgBGP`r02@4D4e9x}dvT4MOK zYk^(ZoLFO5 z!QPbZPx!z@g)!?Hh9-bkX?KF<6t~z)c#4W_z&;hx5$%g5bZUdd>Q?ylyoKC;?#2bT z)cqNufe+=6XL?%QxJX|S=SA>nrnE0mu4-TY)Kz2mrDwuV_p#E};6{T+n6%L(S(S=y z9^TVi{CUfR_^*p^u93#ntBT`YRdX;F(pd4i09`?pT%mCJ6mahYrCx{mH>Jhpx$7PH zRXsJN_s=MwgU8nvFgh)^>n@2)u$G}n5}!iiY^>%j0^erx^Gw(!P%`t->{-%(%6a8b z<7kO$J*Y-BB}EA7KCgUzIWgaSd?GC;Z#$9oG9#Z5vW@hByE<3aGb>>iGDcq?tiK*0 zrocToV?Zx4n6sWR?WS7i{l$n|=~PlQ53k#{SS`%oD{#3|CMh$>y7`xabMqt6`82!3 z+DKU>X2nr7-71r3*f}4tkP4fZ5A?v#=E>ZJN>>%Oq$j7X_UJys8xe zn`Wg`DlA$R66p<$h?s`5{XCcjoU8TFw(;9Z5JQVbM^57?aJY8A}hOui40f4u?V=@rjZEdUTD&Hd8^$(TIB{k=*w5R^fnhNMWi0 zF`aaBi_^Lo&#=A5)aw|BE3hlZTk#&7V~irW0_#CRg*?nPBFRcPy0zz{#0}PWl&ALk zI#lsIhXa*@HrH}wGY>CHb8!+rjFD(Rk1sZ!@J{%0$`Q^|1o2P}x}AXFVJ+1I&TwZb z@+%q@d2Jsvb-GGRrR;yqlRYN{rtwKhuQ%c>84D<%u4&KJP$d0by`Oc%)=Z&66S-Na zZxW{&$>v1$@G7Azp^#npc$Bgf33`|Dw%z&Izqnu{E6yA@KD?s zYyFhM1E0c>Z!Q#$Vi&V|`YkY2^lQrP1_t6gj7I@kfvt^4e$R~ezaou?H#wFg4P9mm zLU^6@eV(vXmJ3hSb>~J`t*;xeCl=eZVK%%7x4!MFU8oXHdY!lFRzeL%Cyo#A;RpJ8 zoCD(sCjpAYcn0bxc3krX#}SJEQvwfk4RhlvKQ~?Lt8HR@QV|un8+t~SPBwK*P;&d^Y#uvQh=d0PUt_L zq2&a$8L_zlknoS^Q%^GgJ____&VM>_bCMOiiRxD-7|8B8oPI+3894n##Y%w<_Wk+% zYkd8D2cGPuW27tP>ZD=(D?a!?Jp4=j2n>7DQpbOTASD5=8}y7|`~OeQ;Znao{=OUh zo(a^K|OlRC-w+}lm|N}3uN=_jQ4*WWv5;O zdC*h$gFHk*QS$%b@gIkhslOcaH~@J_16)@>AM^PCjlrkRjm5S=OC%Io0|lsepEH5u zhKqoDWe-AywJx;8XhTCiGg*HJ7bj0U+Eij;>?N+Ef<~jU|IpGw;B&A6HVHl-ztTdZ z{n1KjG07XCivYsK3&`%L+X%3)?neIX~MbL>p|AleWp(kmRKpiEKKudEM)bH=XcYo52{srCmkMCXo1>NlrddR<^hhqCqBM@wV)KB@u zewTm#FZsc?5B^D)0O?{UGOJ+g`YR3U!m-w&(W){A#;P)^#vZ1+<|39DV^vdCljE^~ z{@3G2-yT#t>0>1YRZa~%t}tk2X{Zxox$2>>ow1hW7D delta 31 lcmZ23yjfUGzMF%C0SK7>rzDmn79=KTC#I*yZ{(WG2>^)t33vbi diff --git a/altair/datasets/_metadata/url.csv.gz b/altair/datasets/_metadata/url.csv.gz index 07cb52ec1c834808609b204ed2ffe0b4cd83f62e..7de1154e5be7e104ad07d9b3a7bce639375c8c67 100644 GIT binary patch delta 850 zcmV-Y1Fihm2HFOY6@PErFbn|Tdw)xnqy>rrI}O`)$K3;2Vr(_C=;`6E{q@t!j^h*? 
z5c<+42Kb4VWr~XAo;&bI!+UrPRzH0CP{U*A{ouat_d=T8tCOJBXmXv!PGH{v1ua7H z8g4bb->;SUYAGw7x9SXpd0ZfQDw`l%@B}Z%*97xtN5$NvS+5KQOYy}L+!E4C(bd6c zCeR2l-g2*yklqcIABVsYzbncUo{5^WRXuP5m*a~imE+C>WiDqT->$T3RHh|x1t{qO zx4!!wIOzphAb&+Yo$$~7T#C*m@T9pVl$9J_6G|VMgaM2MBS?$o)|$ppZRZ_(I-Du^ zj9jbFhb`Eotldr6wYmkmGG8_@Ush(_BI$62@*8?W^bOaVpjM$I+w!4Tajn_=!I@Zz ztOKrn$CQa1Fl^vogLyhy)e{UP>@p9e#4boisZ&^IEI%KG zZ=8V!5jQ2FJfRs{OF7k7YzXx)MVB-+XI3GldzplsmPRAdd`F9+?I`G#Q&Y{?ex%L; zl_cXcwA6ov27W37WPOGv9nWDnw())jlD%N+gy$;*;GlsF7be5Ka$tF*DN}JV$}IF~ zxD}LjF@Fh@HDghL=mw8M|$xqP_<4A0u`f+6I-4(C_|XFNvNLY z{uFOK?i;bcwVZNxxwYyTs0^h_ytN$I#tU?Aw7yS<@R%Yrb0Z3rZ$4mF3&_+jlsn3N#lDTw^IVTPiTCyk7 c0=*a|(GN8Ul$2L>5^{=v0TW#UNTU}307+|?ng9R* delta 847 zcmV-V1F-zs2G<6V6@OFPFc1LXJHG`vAu~*8dg`>-9(&Kol46UE>V)5GI*G?%)n1$F;meyekE8d-ecfyr*WHWbLdn^7TKG-;i_I(s zN_dHN-{xKS^we!P>4QxQ?u;HDsI&&Gcg@lQbj-~URO>($F@IJtL~`$x7zsMUj=|Im zgjs;X_0%fy+KiR(N=eI|xAG|3Yj|PgGWLNF!nbodyr!4GcSy@EnDtU4tQ1}-Ms0gi zDY!DQBMdYX+FR-+v?q5X<(Xg*;?EK>x2sSSwsI1*UCZHx;?hy)1)`3tke`)XIZHjX zYbCxUC#dz~?|(os+v36~=-~i=nza<1kGzATRuC?X&|GMSncQ2dCYnCSsS6~B(L3_A zJtDZ(c#rP3cTf}3QETCJ;ogJ2PWBGw75R)OpRdUJHblGC$@}sE;Crf6J6eH~>OjpT z<8rlR6jb?6n zo5=a7<|N8iS!Plv?cki4kCwz`8}r?j)VC zv{#5aoPUW@f(bOW46z~>VBuy=<50p2rNK^d|L=ISwh9G=UO3EVukjTd6>FDbaA9JH zGz6*JE4BcYQ`8^gQ^&VX>|ZTK+)8e#Is{0KIFw5GLMiDbi&GSD81V${L%LvK><_(V zFcTE+E_5cY$$gxKc&x-L!+s1lz6&Vktoo0ekADIusP{^cQ~WxnCIf<1ucR#b_Mty# zKN|U7LlxURZ?~y;-nO$VaTak{aXhdiqOLhSOCeq_Q@haK4X;93hC#;`Nx|zB-<*Up z3(m!6lLv>pp?}Wa4tu8dgNo+|G_y6nVA$Qi?D~SmD^hzYb99B`0-a+w4v7E$To`#?PGA)$PnhL% ZCQ6!b(lFqNy}1B6!M}FHK(GfF007c1q}l)g diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 6ac13695e..d0094f5ff 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -86,9 +86,6 @@ __all__ = ["backend"] _METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" -_DATAPACKAGE: Final[Path] = ( - Path(__file__).parent / "_metadata" / "datapackage_features.parquet" -) class AltairDatasetsError(Exception): ... 
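The hunk below repoints ``_scan_metadata`` at the single ``metadata.parquet``. As a rough standalone equivalent (an illustrative sketch, not code from this patch), the same lookup can be expressed directly in ``polars`` against the bundled file, using the column names from the reworked ``Metadata`` TypedDict:

    import polars as pl

    # assumes the repository layout used in this series
    metadata = pl.scan_parquet("altair/datasets/_metadata/metadata.parquet")
    result = (
        metadata.filter(
            (pl.col("dataset_name") == "cars") & (pl.col("suffix") == ".json")
        )
        .select("dataset_name", "suffix", "sha", "url")
        .collect()
    )
    cars_url = result.item(0, "url")  # what the reader's url()/dataset() methods resolve to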
@@ -209,7 +206,7 @@ def query( def _scan_metadata( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] ) -> nw.LazyFrame: - frame = nw.from_native(self.scan_fn(_DATAPACKAGE)(_DATAPACKAGE)).lazy() + frame = nw.from_native(self.scan_fn(_METADATA)(_METADATA)).lazy() if predicates or constraints: return frame.filter(*predicates, **constraints) return frame diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index c83c6066e..87d1ac366 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -22,17 +22,10 @@ from typing_extensions import TypeAlias -__all__ = [ - "EXTENSION_SUFFIXES", - "VERSION_LATEST", - "Dataset", - "Extension", - "Metadata", - "Version", - "is_ext_read", -] +__all__ = ["EXTENSION_SUFFIXES", "Dataset", "Extension", "Metadata", "is_ext_read"] Dataset: TypeAlias = Literal[ + "7zip", "airports", "annual-precip", "anscombe", @@ -42,13 +35,13 @@ "budgets", "burtin", "cars", - "climate", "co2-concentration", "countries", "crimea", "disasters", "driving", "earthquakes", + "ffox", "flare", "flare-dependencies", "flights-10k", @@ -61,12 +54,11 @@ "football", "gapminder", "gapminder-health-income", + "gimp", "github", "global-temp", - "graticule", "income", "iowa-electricity", - "iris", "jobs", "la-riots", "londonBoroughs", @@ -86,10 +78,8 @@ "political-contributions", "population", "population_engineers_hurricanes", - "seattle-temps", "seattle-weather", "seattle-weather-hourly-normals", - "sf-temps", "sp500", "sp500-2000", "stocks", @@ -102,71 +92,24 @@ "us-state-capitals", "volcano", "weather", - "weball26", + "weekly-weather", "wheat", "windvectors", "world-110m", "zipcodes", ] -Version: TypeAlias = Literal[ - "v2.11.0", - "v2.10.0", - "v2.9.0", - "v2.8.1", - "v2.8.0", - "v2.7.0", - "v2.5.4", - "v2.5.3", - "v2.5.3-next.0", - "v2.5.2", - "v2.5.2-next.0", - "v2.5.1", - "v2.5.1-next.0", - "v2.5.0", - "v2.5.0-next.0", - "v2.4.0", - "v2.3.1", - "v2.3.0", - "v2.1.0", - "v2.0.0", - "v1.31.1", - "v1.31.0", - "v1.30.4", - "v1.30.3", - "v1.30.2", - "v1.30.1", - "v1.29.0", - "v1.24.0", - "v1.22.0", - "v1.21.1", - "v1.21.0", - "v1.20.0", - "v1.19.0", - "v1.18.0", - "v1.17.0", - "v1.16.0", - "v1.15.0", - "v1.14.0", - "v1.12.0", - "v1.11.0", - "v1.10.0", - "v1.8.0", - "v1.7.0", - "v1.5.0", -] -Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow", ".parquet"] -VERSION_LATEST: Literal["v2.11.0"] = "v2.11.0" +Extension: TypeAlias = Literal[".arrow", ".csv", ".json", ".parquet", ".tsv"] EXTENSION_SUFFIXES: tuple[ + Literal[".arrow"], Literal[".csv"], Literal[".json"], - Literal[".tsv"], - Literal[".arrow"], Literal[".parquet"], -] = (".csv", ".json", ".tsv", ".arrow", ".parquet") + Literal[".tsv"], +] = (".arrow", ".csv", ".json", ".parquet", ".tsv") def is_ext_read(suffix: Any) -> TypeIs[Extension]: - return suffix in {".csv", ".json", ".tsv", ".arrow", ".parquet"} + return suffix in {".arrow", ".csv", ".json", ".parquet", ".tsv"} class Metadata(TypedDict, total=False): @@ -177,29 +120,34 @@ class Metadata(TypedDict, total=False): ---------- dataset_name Name of the dataset/`Path.stem`_. - ext_supported - Dataset can be read as tabular data. + suffix + File extension/`Path.suffix`_. file_name Equivalent to `Path.name`_. - name_collision - Dataset is available via multiple formats. - - .. note:: - Requires specifying a preference in calls to ``data(name, suffix=...)`` + bytes + File size in *bytes*. + is_image + _description_ + is_tabular + Can be read as tabular data. 
+ is_geo + _description_ + is_topo + _description_ + is_spatial + _description_ + is_json + _description_ + has_schema + Data types available for improved ``pandas`` parsing. sha Unique hash for the dataset. .. note:: - If the dataset did *not* change between ``v1.0.0``-``v2.0.0``; + E.g. if the dataset did *not* change between ``v1.0.0``-``v2.0.0``; - then all ``tag``(s) in this range would **share** this value. - size - File size (*bytes*). - suffix - File extension/`Path.suffix`_. - tag - Version identifier for a `vega-datasets release`_. - url_npm + then this value would remain stable. + url Remote url used to access dataset. .. _Path.stem: @@ -208,13 +156,14 @@ class Metadata(TypedDict, total=False): https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.name .. _Path.suffix: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - .. _vega-datasets release: - https://github.com/vega/vega-datasets/releases + Examples -------- ``Metadata`` keywords form constraints to filter a table like the below sample: + ### FIXME: NEEDS UPDATING TO DATAPACKAGE VERSION + ``` shape: (2_879, 9) ┌───────────┬──────────┬──────────┬──────────┬───┬────────┬─────────┬──────────┐ @@ -249,14 +198,18 @@ class Metadata(TypedDict, total=False): """ dataset_name: str - ext_supported: bool + suffix: str file_name: str - name_collision: bool + bytes: int + is_image: bool + is_tabular: bool + is_geo: bool + is_topo: bool + is_spatial: bool + is_json: bool + has_schema: bool sha: str - size: int - suffix: str - tag: str - url_npm: str + url: str FlFieldStr: TypeAlias = Literal[ diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 66353b9e4..95a6fb0ad 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -16,7 +16,7 @@ from narwhals.stable.v1 import dependencies as nw_dep from altair.datasets import Loader, url -from altair.datasets._readers import _METADATA, AltairDatasetsError +from altair.datasets._readers import AltairDatasetsError from altair.datasets._typing import Dataset, Extension, Metadata, is_ext_read from tests import skip_requires_pyarrow, slow @@ -296,9 +296,6 @@ def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: assert match_url("flights-10k", url("flights-10k")) assert match_url("flights-200k", url("flights-200k")) - with pytest.raises(TypeError, match="cannot be loaded via url"): - url("climate") - with pytest.raises(TypeError, match="cannot be loaded via url"): url("flights-3m") @@ -690,9 +687,7 @@ def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - def test_metadata_columns(backend: _Backend, metadata_columns: frozenset[str]) -> None: """Ensure all backends will query the same column names.""" data = Loader.from_backend(backend) - fn = data._reader.scan_fn(_METADATA) - native = fn(_METADATA) - schema_columns = nw.from_native(native).lazy().collect().columns + schema_columns = data._reader._scan_metadata().collect().columns assert set(schema_columns) == metadata_columns diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index c9f35ae7f..131e15bac 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -26,7 +26,6 @@ from polars import col from tools.codemod import ruff -from tools.datasets.github import GitHub from tools.datasets.npm import Npm from tools.schemapi import utils @@ -40,13 +39,10 @@ from typing_extensions import TypeAlias _PathAlias: TypeAlias = Literal[ - "npm_tags", - "gh_tags", - "gh_trees", "typing", "url", - "dpkg_features", - "dpkg_schemas", + 
"metadata", + "schemas", ] __all__ = ["app"] @@ -67,20 +63,11 @@ class Application: Directories to store ``.parquet`` metadata files. out_fp_typing Path to write metadata-derived typing module. - write_schema - Produce addtional ``...-schema.json`` files that describe table columns. - trees_gh - ``GitHub.trees`` metadata file name. - tags_gh - ``GitHub.tags`` metadata file name. - tags_npm - ``Npm.tags`` metadata file name. - kwds_gh, kwds_npm + kwds_npm Arguments passed to corresponding constructor. See Also -------- - - tools.datasets.github.GitHub - tools.datasets.npm.Npm """ @@ -90,42 +77,20 @@ def __init__( out_dir_altair: Path, out_fp_typing: Path, *, - write_schema: bool, - trees_gh: str = "metadata", - tags_gh: str = "tags", - tags_npm: str = "tags_npm", - kwds_gh: Mapping[str, Any] | None = None, kwds_npm: Mapping[str, Any] | None = None, ) -> None: out_dir_tools.mkdir(exist_ok=True) - kwds_gh = kwds_gh or {} kwds_npm = kwds_npm or {} - self._write_schema: bool = write_schema - self._npm: Npm = Npm(out_dir_tools, name_tags=tags_npm, **kwds_npm) - self._github: GitHub = GitHub( - out_dir_tools, - out_dir_altair, - name_tags=tags_gh, - name_trees=trees_gh, - npm_cdn_url=self._npm.url.CDN, - **kwds_gh, - ) + self._npm: Npm = Npm(out_dir_tools, **kwds_npm) self.paths = types.MappingProxyType["_PathAlias", Path]( { - "npm_tags": self.npm._paths["tags"], - "gh_tags": self.github._paths["tags"], - "gh_trees": self.github._paths["trees"], "typing": out_fp_typing, "url": out_dir_altair / "url.csv.gz", - "dpkg_features": out_dir_altair / "datapackage_features.parquet", - "dpkg_schemas": out_dir_altair / "datapackage_schemas.json.gz", + "metadata": out_dir_altair / "metadata.parquet", + "schemas": out_dir_altair / "schemas.json.gz", } ) - @property - def github(self) -> GitHub: - return self._github - @property def npm(self) -> Npm: return self._npm @@ -151,35 +116,26 @@ def refresh( .. 
_vega-datasets@3: https://github.com/vega/vega-datasets/issues/654 """ - if not frozen: - print("Syncing datasets ...") - npm_tags = self.npm.tags() - self.write_parquet(npm_tags, self.paths["npm_tags"]) - - gh_tags = self.github.refresh_tags(npm_tags) - self.write_parquet(gh_tags, self.paths["gh_tags"]) - - gh_trees = self.github.refresh_trees(gh_tags) - self.write_parquet(gh_trees, self.paths["gh_trees"]) - - npm_urls_min = ( - gh_trees.lazy() - .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") - .filter(col("size") == col("size").min().over("dataset_name")) - .select("dataset_name", "url_npm") - ) - self.write_csv_gzip(npm_urls_min, self.paths["url"]) - else: - print("Reusing frozen metadata ...") - gh_trees = pl.read_parquet(self.paths["gh_trees"]) - + print("Syncing datasets ...") package = self.npm.datapackage(tag=tag, frozen=frozen) - self.write_parquet(package["features"], self.paths["dpkg_features"]) - self.write_json_gzip(package["schemas"], self.paths["dpkg_schemas"]) + self.write_parquet(package["features"], self.paths["metadata"]) + self.write_json_gzip(package["schemas"], self.paths["schemas"]) + # FIXME: 2-Part replacement + # - [x] Switch source to `"metadata"` + refresh (easy) + # - [ ] Rewriting `UrlCache` to operate on result rows (difficult) + urls_min = ( + package["features"] + .lazy() + .filter(~(col("suffix").is_in((".parquet", ".arrow")))) + .select("dataset_name", "url") + .sort("dataset_name") + .collect() + ) + self.write_csv_gzip(urls_min, self.paths["url"]) if include_typing: self.generate_typing() - return gh_trees + return package["features"] def reset(self) -> None: """Remove all metadata files.""" @@ -237,25 +193,16 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None fp.touch() df = frame.lazy().collect() df.write_parquet(fp, compression="zstd", compression_level=17) - if self._write_schema: - schema = {name: tp.__name__ for name, tp in df.schema.to_python().items()} - fp_schema = fp.with_name(f"{fp.stem}-schema.json") - if not fp_schema.exists(): - fp_schema.touch() - with fp_schema.open("w") as f: - json.dump(schema, f, indent=2) def generate_typing(self) -> None: from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT - tags = self.scan("gh_tags").select("tag").collect().to_series() - metadata_schema = self.scan("gh_trees").collect_schema().to_python() + dpkg = self.scan("metadata") + metadata_schema = dpkg.collect_schema().to_python() DATASET_NAME = "dataset_name" names = ( - self.scan("gh_trees") - .filter("ext_supported") - .unique(DATASET_NAME) + dpkg.unique(DATASET_NAME) .select(DATASET_NAME) .sort(DATASET_NAME) .collect() @@ -263,34 +210,32 @@ def generate_typing(self) -> None: ) indent = " " * 4 NAME = "Dataset" - TAG = "Version" - LATEST = "VERSION_LATEST" - LATEST_TAG = f"{tags.first()!r}" EXT = "Extension" - EXTENSION_TYPES = ".csv", ".json", ".tsv", ".arrow", ".parquet" + EXT_TYPES = tuple( + dpkg.filter(is_image=False) + .select(col("suffix").unique().sort()) + .collect() + .to_series() + .to_list() + ) EXTENSION_SUFFIXES = "EXTENSION_SUFFIXES" EXTENSION_TYPE_TP = ( - f"tuple[{', '.join(f'Literal[{el!r}]' for el in EXTENSION_TYPES)}]" + f"tuple[{', '.join(f'Literal[{el!r}]' for el in EXT_TYPES)}]" ) EXTENSION_GUARD = "is_ext_read" METADATA_TD = "Metadata" DESCRIPTION_DEFAULT = "_description_" NOTE_SEP = f"\n\n{indent * 2}.. 
note::\n{indent * 3}" - name_collision = ( - f"Dataset is available via multiple formats.{NOTE_SEP}" - "Requires specifying a preference in calls to ``data(name, suffix=...)``" - ) sha = ( f"Unique hash for the dataset.{NOTE_SEP}" - f"If the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{indent * 3}" - f"then all ``tag``(s) in this range would **share** this value." + f"E.g. if the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{indent * 3}" + f"then this value would remain stable." ) links = ( f".. _Path.stem:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem\n" f".. _Path.name:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.name\n" f".. _Path.suffix:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix\n" - f".. _vega-datasets release:\n{indent * 2}https://github.com/vega/vega-datasets/releases" ) import textwrap @@ -299,6 +244,8 @@ def generate_typing(self) -> None: -------- ``{METADATA_TD}`` keywords form constraints to filter a table like the below sample: + ### FIXME: NEEDS UPDATING TO DATAPACKAGE VERSION + ``` shape: (2_879, 9) ┌───────────┬──────────┬──────────┬──────────┬───┬────────┬─────────┬──────────┐ @@ -334,14 +281,13 @@ def generate_typing(self) -> None: descriptions: dict[str, str] = { "dataset_name": "Name of the dataset/`Path.stem`_.", - "ext_supported": "Dataset can be read as tabular data.", + "suffix": "File extension/`Path.suffix`_.", "file_name": "Equivalent to `Path.name`_.", - "name_collision": name_collision, + "bytes": "File size in *bytes*.", + "is_tabular": "Can be read as tabular data.", + "has_schema": "Data types available for improved ``pandas`` parsing.", "sha": sha, - "size": "File size (*bytes*).", - "suffix": "File extension/`Path.suffix`_.", - "tag": "Version identifier for a `vega-datasets release`_.", - "url_npm": "Remote url used to access dataset.", + "url": "Remote url used to access dataset.", } metadata_doc = ( f"\n{indent}".join( @@ -375,14 +321,12 @@ def generate_typing(self) -> None: utils.import_typing_extensions((3, 13), "TypeIs"), utils.import_typing_extensions((3, 10), "TypeAlias"), "\n", - f"__all__ = {[NAME, TAG, EXT, METADATA_TD, EXTENSION_GUARD, EXTENSION_SUFFIXES, LATEST]}\n\n" + f"__all__ = {[NAME, EXT, METADATA_TD, EXTENSION_GUARD, EXTENSION_SUFFIXES]}\n\n" f"{NAME}: TypeAlias = {utils.spell_literal(names)}", - f"{TAG}: TypeAlias = {utils.spell_literal(tags)}", - f"{EXT}: TypeAlias = {utils.spell_literal(EXTENSION_TYPES)}", - f"{LATEST}: Literal[{LATEST_TAG}] = {LATEST_TAG}", - f"{EXTENSION_SUFFIXES}: {EXTENSION_TYPE_TP} = {EXTENSION_TYPES!r}", + f"{EXT}: TypeAlias = {utils.spell_literal(EXT_TYPES)}", + f"{EXTENSION_SUFFIXES}: {EXTENSION_TYPE_TP} = {EXT_TYPES!r}", f"def {EXTENSION_GUARD}(suffix: Any) -> TypeIs[{EXT}]:\n" - f"{indent}return suffix in set({EXTENSION_TYPES!r})\n", + f"{indent}return suffix in set({EXT_TYPES!r})\n", UNIVERSAL_TYPED_DICT.format( name=METADATA_TD, metaclass_kwds=", total=False", @@ -408,7 +352,6 @@ def generate_typing(self) -> None: Path(__file__).parent / "_metadata", _alt_datasets / "_metadata", _alt_datasets / "_typing.py", - write_schema=False, ) diff --git a/tools/datasets/_metadata/tags.parquet b/tools/datasets/_metadata/tags.parquet deleted file mode 100644 index 189dbbcae0b49d624a63d54b76e6ca9ce9425e3c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6289 zcmds62|Sct+rNj&mSw_yo6*QVAbIrA0j2=qVI$ 
z*@=`bq#{ypo3v=tf>wHp@7!a_^FHFP<-;WKoD2FH0zR5xkQ4aAVh!7L(~O`uU=#P~D2Th`?ca zc{eu@o>ANxd8GP?>r2(b>I==y%VR%N;MG-8?KsGKRxY99Q3)vp*5MM!vHH#=QC3g= z?flN_5LXwwqNG7sK{{faj0`Od zX+-f&bVEALkTwQTO`nZ}L_mdw6O);*CT-0o+a1-ne`M*PwZw_HhL!EN`e}rx&Q33% z!~Bz{Kszi=GKItlB%)|A+Z&cfSV+1g(jrQcu>9sEASRhJ;C#8sF-Z~Q&47A+^<`xX zj*DtV5ep431=X{qZr+2sz-Rh)FFHOcE3=N@vGR7%)5?5b*>LsLGqV`f?K$!mz2eyg zVvhb-8*klOs$jp9?PPM1o2RHR_vWgiQiiQ6gD<^mq&aoy$;r1Grbhf^W|z;oD`{mt zR}Mu+@rl+I7r?JNMTW zK7aq&N#kjm%eLEP5Bm+m+Gkxp6_cl6R()ir@A0Bv_pj}YP0Lz+v66W;#(3dFMQ6|9 z^n(!@{Fc=A1Fv%rxh!3Ax~a<`B;)o5xu#9h2I2x%Lq86RR;zN2fYbHXu<6=!;_8Nq z*&#*t8A;EJ9-D2mX_yxBrS8%~SLM>Tg3|`Q8`#5FmVbIT#X|d4dElP@qYe^%2jcI4 z3_qWgaKr72Yx-1`l7q{1)-~5HmNi)4-TB_FoxhE+XqkM>tUK{(X00Bc=&pi+c7etr z>9?Nl25WkePODu_RjbpRW8cs2;v^Q!^l8*TN2gotn%kPH^sx0-z?UmSFCxkEQo(crDLJ;LB$t&3ZG})v#hW3uQ9Pv zbd%Y-^2|mn$z3xy>>_AGi0(NbKhwfoVL_z_uF~* z5q{{0LH*ZL&Jul6b|~^qtC5?-{z{ww#cc z90#(C%+^QVhSq;{Hz#!WAGl_dYtxnX;ih1isL*hT_50`iONzr5$EXfB`%3i>mPw{7 z1Z(1-i0|&_*Kf4m5W1ukT@?~hFV+&^x=*DpfY|+cgcE$}ONU7)_xfUP!hVGKHk2qV`$NQ*FM+fx!E6oOysDXv}b&D3m#sUyVi=Dia7>pJhobfCj@k+rC9N zIy{D{Ih|(WWopdknJ_HO%#FN^O-(IK%`G@|wuvd%oM*9?<7H$@)S(%%Ip%aWkISWV zyl7?|y4P9;*TmF|W5Vz<;dyZ^xaJlnMr)0Jp0Nacwve%&sUZpSlma|PS^p8Bfb~*r z;6}n^)=MN@QG+r9)a2mkS-my*mT2qUmlCJ#>CjpzZ=}&@HUd3?dZ6@{yYa1I4^LP} zehSphM>~^lo6ou96LgUt9!eqI)O*|#OFp0xw>HyOR=MhiTu%#`;^dv!*_>%wCxyQC zJ6EDzAD7v!x}EE0$-GcKThP+r;gx0h@(?2xibdiI$H_$#cQ3L!Wd;-zD0UqUw!{t27Ez%^L!wh;l!+j_HDW`>6_&1)D;E@tw0&ih6T1gO zYtqzsKAo*?)nz$V8;98YJ94a2R&+#8pDEcV8Q*`-wK<~gV-=tOkL&n)$(y;4$#F*$ zq&YI?aU zqVB0~b~#{3`PQ;I$M%{qbZ@wbe*p|R(4|C8?u&D#pE^0uBtG-V9^Yl5Nln*1nas^2 z4-)jyTf+ZB8!<8)h4)1O@b@HoeZeSD~lT)P~>j zK~6L2{2*Y<9GfonSEpwVde{>-yC=UXHdGiE*okZ1?JLbcP*!^h?W(B=SFD{k{nY%P z?5q<=Y3e_flyvD{j%E*OJN<{F2F^T=^ii%ZTxa)KA*^xa)0#B7gAscjhVwk>Dx+i> z>p4o6>eUglYucl>mBj@QsJ22SD-n(o!WvIa?9eODD1EtTxtpHL-dcjhH_(VP@dYWOe~D;X;Ea(I(1A>hf+#YSYwj zQ7DHW92_8+O{KB}dcCz{fRmLYqTvX^vPpmkgt%uYZf7iFy$Z-Bk!Mp|9>k)V8c*@wd_GLzRMAoK#&NE z+p0b?jEBH^9)WfKbe@;M4(-@Ql)#M&*d2T4P@C!shJp$Uhtt8)AQ(S(VxQv2UgsYq z4B}B7Qh}ok)36|Z(-J@uEegBa!?X!RHKsV|O|6(@N+n%te!tAj6(do{SJ(`$<}TI_ zwpX{p7lrKi!_w?K_CPd~a60l~|Bqh@=gPk%+QB z0*`;4Y$@VNNuPuHK4yY6O@uV@5=tVDy_NzK2m}drKYSh}5r6pXHp%=?P{xlk9*Ou9 zll&+zO>$|107%4%XL+)EY#~EHJ^p}vM?JngzoQ;EgsB}yvxNgzabWinrjLvq6oJcd zAd?V@jC@jn&x(x6(Yo+4{Fr=}4u_73876SSc=yj{aLTxxYcbI!eGcr2^@c@A2kr1d z{{F!*0fMEqMT9>R z(h;5I0Q8$TWBP?C>OIy^SPH9KurUi!CQ#5ochn5)u?XwWT!M+10$mmgj4%Y42#kP6 z;dmxifYB?kc|f^*%)W3Oidr+kASxISR3_M<0508PGuO{~E%*xDTx>$!z%O+U0KW5u`$hPeEzrX2eVyFH*vA3 z{DPv z8<@sQ$I+Bn3zaBJ6|Ps1a$o>!;u`V!-&=n_jt^UsOq#0;7lnvo4*tIEAOUFy*!8~o O2kD4IkOTN<#Qy-UbLb%e diff --git a/tools/datasets/_metadata/tags_npm.parquet b/tools/datasets/_metadata/tags_npm.parquet deleted file mode 100644 index d53aff4a1b06955c4043c298c930178edae13db9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2598 zcmcJRe{54#6vyv-{R!)~cH6z_GY{#IH5G7lLzZI3cBSMc^XS6@H&$(D^5eeT}#d%G?^@$Y*#ebGQ@>5f~j z8IEhWYV*kgby?ha@`q!>OVl>SQn6t7%X{pOn=&O-?e({64SU>d2GYQ+*_({>aW&MUN=bsa^8 zMfO6w!f$bW$`Kw0F-~h*U@=s9#U+PkxqdCJo?f-0>Dkq9wXVI~)o6P$(DFv`o!-#Z zzVNKC+FScK_YTGeyQo2!OSDG8tZMf*Qv-{!otf;Q+Nv)dGa#hTLZ`c+Nd`Ao5yDH{ z_WZ=KngBJUwo%Y}B*V+B=SDjC#JpQF>R~DjVjNe(ISExOD4JI9b8QhB<`Bm|j&P#|ukk0l!h+qWk(tuf7bGwOQ$aF3@l-@jY-8{9g0pzXllC;t0s z{mjF)&Y`vbWrzKhgK5iZ*`O?|Ps2{}*0@ey)>z?SS&g*VMGdJ{*>^#*RH~ij(~OCs z&$n&6{m)cPw}V!WhSw;}_ri-mC)2IUBz*YQ9D*CdyE)QROVnao{=?pOe{iH2c=kJI zJvUhf9ZMQO!?eUjvr?dr*9hs}q1siRZ%W+w^6BC0{S65YQLWUGh8nc#wd11Z?juA8 z)=8vpKvzms8Zr=>Ozse7gdNyLn0U@LIKeV=B$=z!`$)ut=I1-x!RgP2qMRtLwUvt1k1d*N6&`zZr*V9wU*M~Cb 
z{$zhBlkQ3NvuOa`DKEs9s3UD{?bJZ1p@5e^9|goEqEQXwku#j^W8o`(xwc07>AMK& zhFN#zny{E43L$G;Ek59PitWy$W_H{}9mLwF71Pe(#6K;@cFB2^z);}kuYfStc zv*XBdBnLJ63~3qJK)xdd+(^v+C;nt7VcD zl@>}8v^CPqYOrtGg!ZM9)U5c`Lkk`-&5d)Gb!ZG%d%mceQ(+e@SI%^_+zRBOZavj{!HLIMRYidULWr$Jy;C>(e zdB|@^dGwN(UWq&);l#0jvD02(jJyv=-?u9Cyl)O>T^~v%en5@O|==w!iY0 i2NpXwNYecI#H_7u3wa|^Aq3y4yME-XnviPvhxk9Ii#j?0 diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index 549889f6d..9945bd07d 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -72,6 +72,7 @@ def extract_features(pkg: FlPackage, base_url: str, /) -> pl.DataFrame: "licenses", "hash", "description", + "path", ) return ( pl.LazyFrame(pkg["resources"]) @@ -82,6 +83,7 @@ def extract_features(pkg: FlPackage, base_url: str, /) -> pl.DataFrame: .select( DATASET_NAME, path_suffix("path").alias("suffix"), + col("path").alias("file_name"), ~cs.by_name(DATASET_NAME, EXCLUDE), *FEATURES, col("schema").is_not_null().alias("has_schema"), diff --git a/tools/datasets/github.py b/tools/datasets/github.py deleted file mode 100644 index a2956df28..000000000 --- a/tools/datasets/github.py +++ /dev/null @@ -1,490 +0,0 @@ -from __future__ import annotations - -import json -import os -import random -import sys -import time -import urllib.request -import warnings -from collections.abc import Iterable, Iterator, Mapping, Sequence -from itertools import islice -from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, ClassVar, Literal, TypeVar, cast - -import polars as pl -from polars import col - -from tools.datasets import semver -from tools.datasets.models import ( - GitHubRateLimitResources, - GitHubTag, - GitHubTree, - GitHubTreesResponse, - GitHubUrl, - ParsedRateLimit, - ParsedTag, - ParsedTree, - SemVerTag, -) - -if sys.version_info >= (3, 13): - from typing import is_typeddict -else: - from typing_extensions import is_typeddict - -if TYPE_CHECKING: - from collections.abc import MutableMapping - from email.message import Message - from urllib.request import OpenerDirector, Request - - from altair.datasets._typing import Extension - - if sys.version_info >= (3, 13): - from typing import TypeIs - else: - from typing_extensions import TypeIs - if sys.version_info >= (3, 11): - from typing import LiteralString - else: - from typing_extensions import LiteralString - if sys.version_info >= (3, 10): - from typing import TypeAlias - else: - from typing_extensions import TypeAlias - - _PathName: TypeAlias = Literal["dir", "tags", "trees"] - - -__all__ = ["GitHub"] - -_TD = TypeVar("_TD", bound=Mapping[str, Any]) - -_DATA = "data" - - -def is_ext_supported(suffix: str) -> TypeIs[Extension]: - return suffix in {".csv", ".json", ".tsv", ".arrow", ".parquet"} - - -def _is_str(obj: Any) -> TypeIs[str]: - return isinstance(obj, str) - - -class _ErrorHandler(urllib.request.BaseHandler): - """ - Adds `rate limit`_ info to a forbidden error. - - .. 
_rate limit: - https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28 - """ - - def http_error_default( - self, req: Request, fp: IO[bytes] | None, code: int, msg: str, hdrs: Message - ): - if code == 403 and (reset := hdrs.get("X-RateLimit-Reset", None)): - limit = hdrs.get("X-RateLimit-Limit", "") - remaining = hdrs.get("X-RateLimit-Remaining", "") - msg = ( - f"{msg}\n\nFailed to balance rate limit.\n" - f"{limit=}, {remaining=}\n" - f"Reset: {time.localtime(int(reset))!r}" - ) - raise urllib.request.HTTPError(req.full_url, code, msg, hdrs, fp) - - -class _GitHubRequestNamespace: - """ - Fetching resources from the `GitHub API`_. - - .. _GitHub API: - https://docs.github.com/en/rest/about-the-rest-api/about-the-rest-api?apiVersion=2022-11-28 - """ - - _ENV_VAR: LiteralString = "VEGA_GITHUB_TOKEN" - _TAGS_MAX_PAGE: Literal[100] = 100 - _VERSION: LiteralString = "2022-11-28" - _UNAUTH_RATE_LIMIT: Literal[60] = 60 - _TAGS_COST: Literal[1] = 1 - _TREES_COST: Literal[2] = 2 - _UNAUTH_DELAY: Literal[5_000] = 5_000 - """**ms** delay added between **unauthenticated** ``trees`` requests.""" - _AUTH_DELAY: Literal[500] = 500 - """**ms** delay added between **authenticated** ``trees`` requests.""" - _UNAUTH_TREES_LIMIT: Literal[10] = 10 - - def __init__(self, gh: GitHub, /) -> None: - self._gh = gh - - @property - def url(self) -> GitHubUrl: - return self._gh.url - - def rate_limit(self) -> GitHubRateLimitResources: - with self._gh._opener.open(self._request(self.url.RATE)) as response: - content: GitHubRateLimitResources = json.load(response)["resources"] - return content - - def delay(self, *, is_auth: bool) -> float: - ms = self._AUTH_DELAY if is_auth else self._UNAUTH_DELAY - return (ms + random.triangular()) / 1_000 - - def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: - if n < 1 or n > self._TAGS_MAX_PAGE: - raise ValueError(n) - req = self._request(f"{self.url.TAGS}?per_page={n}") - with self._gh._opener.open(req) as response: - content: list[GitHubTag] = json.load(response) - if warn_lower and len(content) < n: - earliest = response[-1]["name"] - n_response = len(content) - msg = f"Requested {n=} tags, but got {n_response}\n{earliest=}" - warnings.warn(msg, stacklevel=3) - return content - - def trees(self, tag: str | ParsedTag, /) -> GitHubTreesResponse: - """For a given ``tag``, perform **2x requests** to get directory metadata.""" - if _is_str(tag): - url = tag if tag.startswith(self.url.TREES) else f"{self.url.TREES}{tag}" - else: - url = tag["trees_url"] - with self._gh._opener.open(self._request(url)) as response: - content: GitHubTreesResponse = json.load(response) - query = (tree["url"] for tree in content["tree"] if tree["path"] == _DATA) - if data_url := next(query, None): - with self._gh._opener.open(self._request(data_url)) as response: - data_dir: GitHubTreesResponse = json.load(response) - return data_dir - else: - raise FileNotFoundError - - def _request(self, url: str, /, *, raw: bool = False) -> Request: - """ - Wrap a request url with a `personal access token`_ - if set as an env var. - - By default the endpoint returns json, specify raw to get blob data. - See `Media types`_. - - .. _personal access token: - https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens - .. 
_Media types: - https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types - """ - headers: MutableMapping[str, str] = {"X-GitHub-Api-Version": self._VERSION} - if tok := os.environ.get(self._ENV_VAR): - headers["Authorization"] = ( - tok if tok.startswith("Bearer ") else f"Bearer {tok}" - ) - if raw: - headers["Accept"] = "application/vnd.github.raw+json" - return urllib.request.Request(url, headers=headers) - - -class _GitHubParseNamespace: - """ - Transform responses into intermediate representations. - - Where relevant: - - Adding cheap to compute metadata - - Dropping information that we don't need for the task - """ - - def __init__(self, gh: GitHub, /) -> None: - self._gh = gh - - @property - def url(self) -> GitHubUrl: - return self._gh.url - - def rate_limit(self, rate_limit: GitHubRateLimitResources, /) -> ParsedRateLimit: - core = rate_limit["core"] - reset = core["reset"] - return ParsedRateLimit( - **core, - reset_time=time.localtime(reset), - is_limited=core["remaining"] == 0, - is_auth=core["limit"] > self._gh.req._UNAUTH_RATE_LIMIT, - ) - - def tag(self, tag: GitHubTag, /) -> ParsedTag: - sha = tag["commit"]["sha"] - return ParsedTag(tag=tag["name"], sha=sha, trees_url=f"{self.url.TREES}{sha}") - - def tags(self, tags: list[GitHubTag], /) -> list[ParsedTag]: - return [self.tag(t) for t in tags] - - def tree(self, tree: GitHubTree, tag: str, /) -> ParsedTree: - """For a single tree (file) convert to an IR with only relevant properties.""" - path = Path(tree["path"]) - return ParsedTree( - file_name=path.name, - dataset_name=path.stem, - suffix=path.suffix, - size=tree["size"], - sha=tree["sha"], - ext_supported=is_ext_supported(path.suffix), - tag=tag, - ) - - def trees(self, tree: GitHubTreesResponse, /, tag: str) -> list[ParsedTree]: - """For a tree response (directory of files) convert to an IR with only relevant properties.""" - return [self.tree(t, tag) for t in tree["tree"]] - - def tag_from_str(self, s: str, /) -> str: - # - Actual tag - # - Trees url (using ref name) - # - npm url (works w/o the `v` prefix) - trees_url = self.url.TREES - npm_url = self._gh._npm_cdn_url - if s.startswith("v"): - return s - elif s.startswith(trees_url): - return s.replace(trees_url, "") - elif s.startswith(npm_url): - s, _ = s.replace(npm_url, "").split("/") - return s if s.startswith("v") else f"v{s}" - else: - raise TypeError(s) - - -class GitHub: - """ - Primary interface with the GitHub API. - - Maintains up-to-date metadata, describing **every** available dataset across **all known** releases. - - - Uses `tags`_, `trees`_, `rate_limit`_ endpoints. - - Organizes distinct groups of operations into property accessor namespaces. - - .. _tags: - https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags - .. _trees: - https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree - .. 
_rate_limit: - https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user - """ - - _opener: ClassVar[OpenerDirector] = urllib.request.build_opener(_ErrorHandler) - - def __init__( - self, - out_dir_tools: Path, - out_dir_altair: Path, - name_tags: str, - name_trees: str, - *, - npm_cdn_url: LiteralString, - base_url: LiteralString = "https://api.github.com/", - org: LiteralString = "vega", - package: LiteralString = "vega-datasets", - ) -> None: - out_dir_tools.mkdir(exist_ok=True) - out_dir_altair.mkdir(exist_ok=True) - self._paths: dict[_PathName, Path] = { - "dir": out_dir_tools, - "tags": out_dir_tools / f"{name_tags}.parquet", - "trees": out_dir_altair / f"{name_trees}.parquet", - } - repo = f"{base_url}repos/{org}/{package}/" - self._url = GitHubUrl( - BASE=base_url, - BLOBS=f"{repo}git/blobs/", - RATE=f"{base_url}rate_limit", - REPO=repo, - TAGS=f"{repo}tags", - TREES=f"{repo}git/trees/", - ) - self._npm_cdn_url: LiteralString = npm_cdn_url - - @property - def req(self) -> _GitHubRequestNamespace: - return _GitHubRequestNamespace(self) - - @property - def parse(self) -> _GitHubParseNamespace: - return _GitHubParseNamespace(self) - - @property - def url(self) -> GitHubUrl: - return self._url - - def rate_limit(self, *, strict: bool = False) -> ParsedRateLimit: - limit = self.parse.rate_limit(self.req.rate_limit()) - if strict and limit["is_limited"]: - warnings.warn( - f"Reached rate limit:\n{limit!r}\n\n" - f"Try setting environment variable {self.req._ENV_VAR!r}", - stacklevel=2, - ) - return limit - - def delay(self, rate_limit: ParsedRateLimit | None = None, /) -> float: - """Return a delay time in seconds, corresponding with authentication status.""" - limit = rate_limit or self.rate_limit(strict=True) - return self.req.delay(is_auth=limit["is_auth"]) - - def tags( - self, - n_head: int | None = None, - *, - npm_tags: pl.DataFrame | pl.LazyFrame | None = None, - warn_lower: bool = False, - ) -> pl.DataFrame: - """ - Get release info, enhance with `SemVer`_ context. - - Parameters - ---------- - n_head - Limit to most recent releases. - npm_tags - Used to remove any github-only releases. - warn_lower - Emit a warning if fewer than ``n_head`` tags were returned. - - .. _SemVer: - https://semver.org/#semantic-versioning-200 - """ - tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) - frame = pl.DataFrame(self.parse.tags(tags)).pipe(semver.with_columns) - if npm_tags is not None: - return frame.lazy().join(npm_tags.lazy().select("tag"), on="tag").collect() - else: - return frame - - def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: - """Retrieve directory info for a given version ``tag``.""" - trees = self.req.trees(tag) - tag_v = self.parse.tag_from_str(tag) if _is_str(tag) else tag["tag"] - parsed = self.parse.trees(trees, tag=tag_v) - url = pl.concat_str( - pl.lit(self._npm_cdn_url), - col("tag"), - pl.lit(f"/{_DATA}/"), - col("file_name"), - ) - df = ( - pl.LazyFrame(parsed) - .with_columns( - name_collision=col("dataset_name").is_duplicated(), url_npm=url - ) - .collect() - ) - return df.select(*sorted(df.columns)) - - def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: - """ - Use known tags to discover and update missing trees metadata. - - Aims to stay well-within API rate limits, both for authenticated and unauthenticated users. - - Notes - ----- - Internally handles regenerating the ``tag`` enum. 
- """ - if gh_tags.is_empty(): - msg = f"Expected rows present in `gh_tags`, but got:\n{gh_tags!r}" - raise NotImplementedError(msg) - rate_limit = self.rate_limit(strict=True) - stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT - fp = self._paths["trees"] - if not fp.exists(): - print(f"Initializing {fp!s}") - result = self._trees_batched(_iter_rows(gh_tags, stop, SemVerTag)) - else: - trees = ( - pl.scan_parquet(fp).with_columns(col("tag").cast(pl.String)).collect() - ) - missing_trees = gh_tags.join( - trees.select(col("tag").unique()), on="tag", how="anti" - ) - if missing_trees.is_empty(): - print(f"Already up-to-date {fp!s}") - result = trees - else: - fresh = self._trees_batched(_iter_rows(missing_trees, stop, SemVerTag)) - result = pl.concat((trees, fresh)) - return ( - result.lazy() - .with_columns(col("tag").cast(semver.tag_enum(gh_tags))) - .sort("tag", descending=True) - .collect() - ) - - def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: - limit = self.rate_limit(strict=True) - npm_tag_only = npm_tags.lazy().select("tag") - fp = self._paths["tags"] - if not limit["is_auth"] and limit["remaining"] <= self.req._TAGS_COST: - return pl.scan_parquet(fp).join(npm_tag_only, on="tag").collect() - elif not fp.exists(): - print(f"Initializing {fp!s}") - tags = self.tags(npm_tags=npm_tag_only) - print(f"Collected {tags.height} new tags") - return tags - else: - print("Checking for new tags") - prev = pl.scan_parquet(fp) - latest = self.tags(1, npm_tags=npm_tag_only) - if latest.equals(prev.pipe(semver.sort).head(1).collect()): - print(f"Already up-to-date {fp!s}") - return prev.collect() - print(f"Refreshing {fp!s}") - prev_eager = prev.collect() - tags = ( - pl.concat((self.tags(npm_tags=npm_tag_only), prev_eager)) - .unique("sha") - .pipe(semver.sort) - ) - print(f"Collected {tags.height - prev_eager.height} new tags") - return tags - - def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: - rate_limit = self.rate_limit(strict=True) - if not isinstance(tags, Sequence): - tags = tuple(tags) - req = self.req - n = len(tags) - cost = req._TREES_COST * n - if rate_limit["remaining"] < cost: - raise NotImplementedError(rate_limit, cost) - print( - f"Collecting metadata for {n} missing releases.\n" - f"Using {self.delay(rate_limit):.2f}[ms] between requests ..." - ) - dfs: list[pl.DataFrame] = [] - for tag in tags: - time.sleep(self.delay(rate_limit)) - dfs.append(self.trees(tag)) - df = pl.concat(dfs) - print(f"Finished collection.\nFound {df.height} new rows") - return df - - -def _iter_rows(df: pl.DataFrame, stop: int | None, /, tp: type[_TD]) -> Iterator[_TD]: - """ - Wraps `pl.DataFrame.iter_rows`_ with typing to preserve key completions. - - Parameters - ---------- - df - Target dataframe. - stop - Passed to `itertools.islice`_. - tp - Static type representing a row/record. - - .. note:: - Performs a **very basic** runtime check on the type of ``tp`` (*not* ``df``). - - Primarily used to override ``dict[str, Any]`` when a *narrower* type is known. - - .. _itertools.islice: - https://docs.python.org/3/library/itertools.html#itertools.islice - .. 
_pl.DataFrame.iter_rows: - https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.iter_rows.html - """ - if not TYPE_CHECKING: - assert is_typeddict(tp) or issubclass(tp, Mapping) - - return cast("Iterator[_TD]", islice(df.iter_rows(named=True), stop)) diff --git a/tools/datasets/models.py b/tools/datasets/models.py index e2036b4ea..21d98050e 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -4,7 +4,7 @@ import sys from collections.abc import Mapping, Sequence -from typing import TYPE_CHECKING, Any, Literal, NamedTuple +from typing import TYPE_CHECKING, Literal, NamedTuple if sys.version_info >= (3, 14): from typing import TypedDict @@ -12,8 +12,6 @@ from typing_extensions import TypedDict if TYPE_CHECKING: - import time - if sys.version_info >= (3, 11): from typing import LiteralString, NotRequired, Required else: @@ -26,171 +24,12 @@ from altair.datasets._typing import Dataset, FlFieldStr -Map: TypeAlias = Mapping[str, Any] - - -class GitHubUrl(NamedTuple): - BASE: LiteralString - BLOBS: LiteralString - RATE: LiteralString - REPO: LiteralString - TAGS: LiteralString - TREES: LiteralString - class NpmUrl(NamedTuple): CDN: LiteralString - TAGS: LiteralString GH: LiteralString -class GitHubTag(TypedDict): - """ - A single release's metadata within the response of `List repository tags`_. - - .. _List repository tags: - https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags. - """ - - name: str - node_id: str - commit: dict[Literal["sha", "url"], str] - zipball_url: str - tarball_url: str - - -class ParsedTag(TypedDict): - tag: str - sha: str - trees_url: str - - -class SemVerTag(ParsedTag): - """ - Extends ``ParsedTag`` with `semantic versioning`_. - - These values are extracted via: - - tools.datasets.with_columns - - Describes a row in the dataframe returned by: - - tools.datasets.GitHub.tags - - .. _semantic versioning: - https://semver.org/ - """ - - major: int - minor: int - patch: int - pre_release: int | None - is_pre_release: bool - - -class GitHubTree(TypedDict): - """ - A single file's metadata within the response of `Get a tree`_. - - .. _Get a tree: - https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree - """ - - path: str - mode: str - type: str - sha: str - size: int - url: str - - -class GitHubTreesResponse(TypedDict): - """ - Response from `Get a tree`_. - - Describes directory metadata, with files stored in ``"tree"``. - - .. _Get a tree: - https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree - """ - - sha: str - url: str - tree: list[GitHubTree] - truncated: bool - - -class NpmVersion(TypedDict): - version: str - links: dict[Literal["self", "entrypoints", "stats"], str] - - -class NpmPackageMetadataResponse(TypedDict): - """ - Response from `Get package metadata`_. - - Using: - - headers={"Accept": "application/json"} - - .. _Get package metadata: - https://data.jsdelivr.com/v1/packages/npm/vega-datasets - """ - - type: str - name: str - tags: dict[Literal["canary", "next", "latest"], str] - versions: list[NpmVersion] - links: dict[Literal["stats"], str] - - -class ParsedTree(TypedDict): - file_name: str - dataset_name: str - suffix: str - size: int - sha: str - ext_supported: bool - tag: str - - -class GitHubRateLimit(TypedDict): - """ - An individual item in `Get rate limit status for the authenticated user`_. - - All categories share this schema. - - .. 
_Get rate limit status for the authenticated user: - https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user - """ - - limit: int - used: int - remaining: int - reset: int - - -class ParsedRateLimit(GitHubRateLimit): - reset_time: time.struct_time - is_limited: bool - is_auth: bool - - -class GitHubRateLimitResources(TypedDict, total=False): - """ - A subset of response from `Get rate limit status for the authenticated user`_. - - .. _Get rate limit status for the authenticated user: - https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user - """ - - core: Required[GitHubRateLimit] - search: Required[GitHubRateLimit] - graphql: GitHubRateLimit - integration_manifest: GitHubRateLimit - code_search: GitHubRateLimit - - ##################################################### # frictionless datapackage ##################################################### diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index 99d5fe5b0..7f61323c4 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -6,9 +6,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Literal -import polars as pl - -from tools.datasets import datapackage, semver +from tools.datasets import datapackage from tools.datasets.models import NpmUrl if TYPE_CHECKING: @@ -23,14 +21,9 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias - from altair.datasets._typing import Version - from tools.datasets.models import ( - FlPackage, - NpmPackageMetadataResponse, - ParsedPackage, - ) + from tools.datasets.models import FlPackage, ParsedPackage - BranchOrTag: TypeAlias = 'Literal["main"] | Version | LiteralString' # noqa: TC008 + BranchOrTag: TypeAlias = 'Literal["main"] | LiteralString' # noqa: TC008 __all__ = ["Npm"] @@ -44,21 +37,17 @@ class Npm: def __init__( self, output_dir: Path, - name_tags: str, *, jsdelivr: Literal["jsdelivr"] = "jsdelivr", npm: Literal["npm"] = "npm", package: LiteralString = "vega-datasets", - jsdelivr_version: LiteralString = "v1", ) -> None: output_dir.mkdir(exist_ok=True) - self._paths: dict[Literal["tags", "datapackage"], Path] = { - "tags": output_dir / f"{name_tags}.parquet", + self._paths: dict[Literal["datapackage"], Path] = { "datapackage": output_dir / "datapackage.json", } self._url: NpmUrl = NpmUrl( CDN=f"https://cdn.{jsdelivr}.net/{npm}/{package}@", - TAGS=f"https://data.{jsdelivr}.com/{jsdelivr_version}/packages/{npm}/{package}", GH=f"https://cdn.{jsdelivr}.net/gh/vega/{package}@", ) @@ -79,33 +68,6 @@ def dataset_base_url(self, version: BranchOrTag, /) -> LiteralString: def url(self) -> NpmUrl: return self._url - def tags(self) -> pl.DataFrame: - """ - Request, parse tags from `Get package metadata`_. - - Notes - ----- - - Ignores canary releases - - ``npm`` can accept either, but this endpoint returns without "v": - - {tag} - v{tag} - - .. 
_Get package metadata: - https://www.jsdelivr.com/docs/data.jsdelivr.com#get-/v1/packages/npm/-package- - """ - req = urllib.request.Request( - self.url.TAGS, headers={"Accept": "application/json"} - ) - with self._opener.open(req) as response: - content: NpmPackageMetadataResponse = json.load(response) - versions = [ - f"v{tag}" - for v in content["versions"] - if (tag := v["version"]) and semver.CANARY not in tag - ] - return pl.DataFrame({"tag": versions}).pipe(semver.with_columns) - def file_gh( self, branch_or_tag: BranchOrTag, diff --git a/tools/datasets/semver.py b/tools/datasets/semver.py deleted file mode 100644 index 788bbb2a2..000000000 --- a/tools/datasets/semver.py +++ /dev/null @@ -1,76 +0,0 @@ -""" -Parsing/transforming `semantic versioning`_ strings. - -.. _semantic versioning: - https://semver.org/ -""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, Literal - -import polars as pl -from polars import col - -if TYPE_CHECKING: - from typing import TypeVar - - _Frame = TypeVar("_Frame", pl.DataFrame, pl.LazyFrame) - -__all__ = ["CANARY", "sort", "with_columns"] - -_SEM_VER_FIELDS: tuple[ - Literal["major"], Literal["minor"], Literal["patch"], Literal["pre_release"] -] = "major", "minor", "patch", "pre_release" -CANARY: Literal["--canary"] = "--canary" - - -def with_columns(frame: _Frame, /, *, tag: str = "tag") -> _Frame: - """ - Extracts components of a `SemVer`_ string into sortable columns. - - .. _SemVer: - https://semver.org/#backusnaur-form-grammar-for-valid-semver-versions - """ - fields = col(_SEM_VER_FIELDS) - pattern = r"""(?x) - v?(?[[:digit:]]*)\. - (?[[:digit:]]*)\. - (?[[:digit:]]*) - (\-(next)?(beta)?\.)? - (?[[:digit:]]*)? - """ - sem_ver = col(tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) - ldf = ( - frame.lazy() - .with_columns(sem_ver) - .with_columns(pl.when(fields.str.len_chars() > 0).then(fields).cast(pl.Int64)) - .with_columns(is_pre_release=col("pre_release").is_not_null()) - ) - if isinstance(frame, pl.DataFrame): - return ldf.collect() - else: - return ldf - - -def tag_enum(frame: _Frame, /, *, tag: str = "tag") -> pl.Enum: - """Extract an **ascending** order ``pl.Enum`` from ``tag``.""" - return pl.Enum( - frame.lazy().pipe(sort, descending=False).select(tag).collect().get_column(tag) - ) - - -def sort(frame: _Frame, /, descending: bool = True) -> _Frame: - """ - Sort ``frame``, displaying in release order. - - Parameters - ---------- - descending - By default, **most recent** is first. - - Notes - ----- - Ensures pre release versions maintain order, always appearing before actual releases. 
- """ - return frame.sort(_SEM_VER_FIELDS, descending=descending, nulls_last=not descending) From d297d7ea1a8cc7cfa451f61356d7f4f4466062b4 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 14 Jan 2025 15:24:58 +0000 Subject: [PATCH 152/201] docs: Update `Metadata` example --- altair/datasets/_typing.py | 59 ++++++++++++++++++------------------- tools/datasets/__init__.py | 60 ++++++++++++++++++-------------------- 2 files changed, 57 insertions(+), 62 deletions(-) diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index 87d1ac366..c6daba45e 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -162,38 +162,35 @@ class Metadata(TypedDict, total=False): -------- ``Metadata`` keywords form constraints to filter a table like the below sample: - ### FIXME: NEEDS UPDATING TO DATAPACKAGE VERSION - ``` - shape: (2_879, 9) - ┌───────────┬──────────┬──────────┬──────────┬───┬────────┬─────────┬──────────┐ - │ dataset_n ┆ ext_supp ┆ file_nam ┆ name_col ┆ … ┆ suffix ┆ tag ┆ url_npm │ - │ a… ┆ or… ┆ e ┆ li… ┆ ┆ --- ┆ --- ┆ --- │ - │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ str ┆ enum ┆ str │ - │ str ┆ bool ┆ str ┆ bool ┆ ┆ ┆ ┆ │ - ╞═══════════╪══════════╪══════════╪══════════╪═══╪════════╪═════════╪══════════╡ - │ cars ┆ true ┆ cars.jso ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ - │ ┆ ┆ n ┆ ┆ ┆ ┆ ┆ cd… │ - │ flights-2 ┆ true ┆ flights- ┆ true ┆ … ┆ .arrow ┆ v1.31.1 ┆ https:// │ - │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ - │ flights-2 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v2.9.0 ┆ https:// │ - │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ - │ unemploym ┆ true ┆ unemploy ┆ false ┆ … ┆ .json ┆ v2.7.0 ┆ https:// │ - │ e… ┆ ┆ me… ┆ ┆ ┆ ┆ ┆ cd… │ - │ ffox ┆ false ┆ ffox.png ┆ false ┆ … ┆ .png ┆ v2.5.2 ┆ https:// │ - │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ cd… │ - │ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │ - │ flights-a ┆ true ┆ flights- ┆ false ┆ … ┆ .csv ┆ v1.18.0 ┆ https:// │ - │ i… ┆ ┆ ai… ┆ ┆ ┆ ┆ ┆ cd… │ - │ income ┆ true ┆ income.j ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ - │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ - │ burtin ┆ true ┆ burtin.j ┆ false ┆ … ┆ .json ┆ v2.8.0 ┆ https:// │ - │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ - │ flights-5 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v1.8.0 ┆ https:// │ - │ k ┆ ┆ 5k… ┆ ┆ ┆ ┆ ┆ cd… │ - │ wheat ┆ true ┆ wheat.js ┆ false ┆ … ┆ .json ┆ v1.18.0 ┆ https:// │ - │ ┆ ┆ on ┆ ┆ ┆ ┆ ┆ cd… │ - └───────────┴──────────┴──────────┴──────────┴───┴────────┴─────────┴──────────┘ + shape: (73, 13) + ┌────────────────┬────────┬────────────────┬───┬───────────────┬───────────────┐ + │ dataset_name ┆ suffix ┆ file_name ┆ … ┆ sha ┆ url │ + │ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ ┆ str ┆ str │ + ╞════════════════╪════════╪════════════════╪═══╪═══════════════╪═══════════════╡ + │ 7zip ┆ .png ┆ 7zip.png ┆ … ┆ 6586d6c00887c ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ d48850099c17… ┆ sdelivr.net/… │ + │ airports ┆ .csv ┆ airports.csv ┆ … ┆ 608ba6d51fa70 ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ 584c3fa1d31e… ┆ sdelivr.net/… │ + │ annual-precip ┆ .json ┆ annual-precip. 
┆ … ┆ 719e73406cfc0 ┆ https://cdn.j │ + │ ┆ ┆ json ┆ ┆ 8f16dda65151… ┆ sdelivr.net/… │ + │ anscombe ┆ .json ┆ anscombe.json ┆ … ┆ 11ae97090b626 ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ 3bdf0c866115… ┆ sdelivr.net/… │ + │ barley ┆ .json ┆ barley.json ┆ … ┆ 8dc50de2509b6 ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ e197ce95c24c… ┆ sdelivr.net/… │ + │ … ┆ … ┆ … ┆ … ┆ … ┆ … │ + │ weekly-weather ┆ .json ┆ weekly-weather ┆ … ┆ bd42a3e2403e7 ┆ https://cdn.j │ + │ ┆ ┆ .json ┆ ┆ ccd6baaa89f9… ┆ sdelivr.net/… │ + │ wheat ┆ .json ┆ wheat.json ┆ … ┆ cde46b43fc82f ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ 4c3c2a37ddcf… ┆ sdelivr.net/… │ + │ windvectors ┆ .csv ┆ windvectors.cs ┆ … ┆ ed686b0ba613a ┆ https://cdn.j │ + │ ┆ ┆ v ┆ ┆ bd59d09fcd94… ┆ sdelivr.net/… │ + │ world-110m ┆ .json ┆ world-110m.jso ┆ … ┆ a1ce852de6f27 ┆ https://cdn.j │ + │ ┆ ┆ n ┆ ┆ 13c94c0c2840… ┆ sdelivr.net/… │ + │ zipcodes ┆ .csv ┆ zipcodes.csv ┆ … ┆ d3df33e12be0d ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ 0544c95f1bd4… ┆ sdelivr.net/… │ + └────────────────┴────────┴────────────────┴───┴───────────────┴───────────────┘ ``` """ diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 131e15bac..26dc8439b 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -239,43 +239,41 @@ def generate_typing(self) -> None: ) import textwrap + # NOTE: Uses `pl.Config(fmt_str_lengths=25, tbl_cols=5, tbl_width_chars=80)` examples = f"""\ Examples -------- ``{METADATA_TD}`` keywords form constraints to filter a table like the below sample: - ### FIXME: NEEDS UPDATING TO DATAPACKAGE VERSION - ``` - shape: (2_879, 9) - ┌───────────┬──────────┬──────────┬──────────┬───┬────────┬─────────┬──────────┐ - │ dataset_n ┆ ext_supp ┆ file_nam ┆ name_col ┆ … ┆ suffix ┆ tag ┆ url_npm │ - │ a… ┆ or… ┆ e ┆ li… ┆ ┆ --- ┆ --- ┆ --- │ - │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ str ┆ enum ┆ str │ - │ str ┆ bool ┆ str ┆ bool ┆ ┆ ┆ ┆ │ - ╞═══════════╪══════════╪══════════╪══════════╪═══╪════════╪═════════╪══════════╡ - │ cars ┆ true ┆ cars.jso ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ - │ ┆ ┆ n ┆ ┆ ┆ ┆ ┆ cd… │ - │ flights-2 ┆ true ┆ flights- ┆ true ┆ … ┆ .arrow ┆ v1.31.1 ┆ https:// │ - │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ - │ flights-2 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v2.9.0 ┆ https:// │ - │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ - │ unemploym ┆ true ┆ unemploy ┆ false ┆ … ┆ .json ┆ v2.7.0 ┆ https:// │ - │ e… ┆ ┆ me… ┆ ┆ ┆ ┆ ┆ cd… │ - │ ffox ┆ false ┆ ffox.png ┆ false ┆ … ┆ .png ┆ v2.5.2 ┆ https:// │ - │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ cd… │ - │ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │ - │ flights-a ┆ true ┆ flights- ┆ false ┆ … ┆ .csv ┆ v1.18.0 ┆ https:// │ - │ i… ┆ ┆ ai… ┆ ┆ ┆ ┆ ┆ cd… │ - │ income ┆ true ┆ income.j ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ - │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ - │ burtin ┆ true ┆ burtin.j ┆ false ┆ … ┆ .json ┆ v2.8.0 ┆ https:// │ - │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ - │ flights-5 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v1.8.0 ┆ https:// │ - │ k ┆ ┆ 5k… ┆ ┆ ┆ ┆ ┆ cd… │ - │ wheat ┆ true ┆ wheat.js ┆ false ┆ … ┆ .json ┆ v1.18.0 ┆ https:// │ - │ ┆ ┆ on ┆ ┆ ┆ ┆ ┆ cd… │ - └───────────┴──────────┴──────────┴──────────┴───┴────────┴─────────┴──────────┘ + shape: (73, 13) + ┌────────────────┬────────┬────────────────┬───┬───────────────┬───────────────┐ + │ dataset_name ┆ suffix ┆ file_name ┆ … ┆ sha ┆ url │ + │ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ ┆ str ┆ str │ + ╞════════════════╪════════╪════════════════╪═══╪═══════════════╪═══════════════╡ + │ 7zip ┆ .png ┆ 7zip.png ┆ … ┆ 6586d6c00887c ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ d48850099c17… ┆ sdelivr.net/… │ + │ airports ┆ .csv ┆ airports.csv ┆ … ┆ 608ba6d51fa70 ┆ 
https://cdn.j │ + │ ┆ ┆ ┆ ┆ 584c3fa1d31e… ┆ sdelivr.net/… │ + │ annual-precip ┆ .json ┆ annual-precip. ┆ … ┆ 719e73406cfc0 ┆ https://cdn.j │ + │ ┆ ┆ json ┆ ┆ 8f16dda65151… ┆ sdelivr.net/… │ + │ anscombe ┆ .json ┆ anscombe.json ┆ … ┆ 11ae97090b626 ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ 3bdf0c866115… ┆ sdelivr.net/… │ + │ barley ┆ .json ┆ barley.json ┆ … ┆ 8dc50de2509b6 ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ e197ce95c24c… ┆ sdelivr.net/… │ + │ … ┆ … ┆ … ┆ … ┆ … ┆ … │ + │ weekly-weather ┆ .json ┆ weekly-weather ┆ … ┆ bd42a3e2403e7 ┆ https://cdn.j │ + │ ┆ ┆ .json ┆ ┆ ccd6baaa89f9… ┆ sdelivr.net/… │ + │ wheat ┆ .json ┆ wheat.json ┆ … ┆ cde46b43fc82f ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ 4c3c2a37ddcf… ┆ sdelivr.net/… │ + │ windvectors ┆ .csv ┆ windvectors.cs ┆ … ┆ ed686b0ba613a ┆ https://cdn.j │ + │ ┆ ┆ v ┆ ┆ bd59d09fcd94… ┆ sdelivr.net/… │ + │ world-110m ┆ .json ┆ world-110m.jso ┆ … ┆ a1ce852de6f27 ┆ https://cdn.j │ + │ ┆ ┆ n ┆ ┆ 13c94c0c2840… ┆ sdelivr.net/… │ + │ zipcodes ┆ .csv ┆ zipcodes.csv ┆ … ┆ d3df33e12be0d ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ 0544c95f1bd4… ┆ sdelivr.net/… │ + └────────────────┴────────┴────────────────┴───┴───────────────┴───────────────┘ ``` """ From 64b80ff6f707cd42d0583291dfe5a23b188f1579 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 14 Jan 2025 16:31:28 +0000 Subject: [PATCH 153/201] docs: Add missing descriptions to `Metadata` --- altair/datasets/_typing.py | 14 +++++++++----- tools/datasets/__init__.py | 7 +++++++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index c6daba45e..958db2300 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -127,17 +127,17 @@ class Metadata(TypedDict, total=False): bytes File size in *bytes*. is_image - _description_ + Only accessible via url. is_tabular Can be read as tabular data. is_geo - _description_ + `GeoJSON`_ format. is_topo - _description_ + `TopoJSON`_ format. is_spatial - _description_ + Any geospatial format. Only natively supported by ``polars``. is_json - _description_ + Not supported natively by ``pyarrow``. has_schema Data types available for improved ``pandas`` parsing. sha @@ -156,6 +156,10 @@ class Metadata(TypedDict, total=False): https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.name .. _Path.suffix: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. _GeoJSON: + https://en.wikipedia.org/wiki/GeoJSON + .. _TopoJSON: + https://en.wikipedia.org/wiki/GeoJSON#TopoJSON Examples diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 26dc8439b..7350ede7f 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -236,6 +236,8 @@ def generate_typing(self) -> None: f".. _Path.stem:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem\n" f".. _Path.name:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.name\n" f".. _Path.suffix:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix\n" + f".. _GeoJSON:\n{indent * 2}https://en.wikipedia.org/wiki/GeoJSON\n" + f".. 
_TopoJSON:\n{indent * 2}https://en.wikipedia.org/wiki/GeoJSON#TopoJSON\n" ) import textwrap @@ -283,6 +285,11 @@ def generate_typing(self) -> None: "file_name": "Equivalent to `Path.name`_.", "bytes": "File size in *bytes*.", "is_tabular": "Can be read as tabular data.", + "is_image": "Only accessible via url.", + "is_geo": "`GeoJSON`_ format.", + "is_topo": "`TopoJSON`_ format.", + "is_spatial": "Any geospatial format. Only natively supported by ``polars``.", + "is_json": "Not supported natively by ``pyarrow``.", "has_schema": "Data types available for improved ``pandas`` parsing.", "sha": sha, "url": "Remote url used to access dataset.", From a0f75852b4d7d88a1894f883b7f8cef9e368b917 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 14 Jan 2025 16:48:44 +0000 Subject: [PATCH 154/201] refactor: Renaming/reorganize in `tools/` Mainly removing `Fl` prefix, as there is no confusion now `models.py` is purely `frictionless` structures --- tools/datasets/datapackage.py | 8 +++--- tools/datasets/models.py | 48 ++++++++++++++--------------------- tools/datasets/npm.py | 13 ++++++---- 3 files changed, 31 insertions(+), 38 deletions(-) diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index 9945bd07d..5272170c2 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -22,7 +22,7 @@ from collections.abc import Iterable, Iterator, Mapping, Sequence from altair.datasets._typing import Dataset, FlFieldStr - from tools.datasets.models import FlPackage + from tools.datasets.models import Package __all__ = ["parse_package"] @@ -42,13 +42,13 @@ ) -def parse_package(pkg: FlPackage, base_url: str, /) -> ParsedPackage: +def parse_package(pkg: Package, base_url: str, /) -> ParsedPackage: return ParsedPackage( features=extract_features(pkg, base_url), schemas=extract_schemas(pkg) ) -def extract_schemas(pkg: FlPackage, /) -> Mapping[Dataset, Mapping[str, FlFieldStr]]: +def extract_schemas(pkg: Package, /) -> Mapping[Dataset, Mapping[str, FlFieldStr]]: """Reduce all datasets with schemas to a minimal mapping.""" m: Any = { Path(rsrc["path"]).stem: {f["name"]: f["type"] for f in s["fields"]} @@ -58,7 +58,7 @@ def extract_schemas(pkg: FlPackage, /) -> Mapping[Dataset, Mapping[str, FlFieldS return m -def extract_features(pkg: FlPackage, base_url: str, /) -> pl.DataFrame: +def extract_features(pkg: Package, base_url: str, /) -> pl.DataFrame: EXCLUDE = ( "name", "type", diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 21d98050e..f88a0b842 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -4,7 +4,7 @@ import sys from collections.abc import Mapping, Sequence -from typing import TYPE_CHECKING, Literal, NamedTuple +from typing import TYPE_CHECKING, Literal if sys.version_info >= (3, 14): from typing import TypedDict @@ -13,9 +13,9 @@ if TYPE_CHECKING: if sys.version_info >= (3, 11): - from typing import LiteralString, NotRequired, Required + from typing import NotRequired, Required else: - from typing_extensions import LiteralString, NotRequired, Required + from typing_extensions import NotRequired, Required if sys.version_info >= (3, 10): from typing import TypeAlias else: @@ -25,25 +25,15 @@ from altair.datasets._typing import Dataset, FlFieldStr -class NpmUrl(NamedTuple): - CDN: LiteralString - GH: LiteralString - - -##################################################### -# frictionless datapackage -##################################################### - - -FlCsvDialect: 
TypeAlias = Mapping[ +CsvDialect: TypeAlias = Mapping[ Literal["csv"], Mapping[Literal["delimiter"], Literal["\t"]] ] -FlJsonDialect: TypeAlias = Mapping[ +JsonDialect: TypeAlias = Mapping[ Literal[r"json"], Mapping[Literal["keyed"], Literal[True]] ] -class FlField(TypedDict): +class Field(TypedDict): """https://datapackage.org/standard/table-schema/#field.""" name: str @@ -51,33 +41,33 @@ class FlField(TypedDict): description: NotRequired[str] -class FlSchema(TypedDict): +class Schema(TypedDict): """https://datapackage.org/standard/table-schema/#properties.""" - fields: Sequence[FlField] + fields: Sequence[Field] -class FlSource(TypedDict, total=False): +class Source(TypedDict, total=False): title: str path: Required[str] email: str version: str -class FlLicense(TypedDict): +class License(TypedDict): name: str path: str title: NotRequired[str] -class FlResource(TypedDict): +class Resource(TypedDict): """https://datapackage.org/standard/data-resource/#properties.""" name: Dataset type: Literal["table", "file", r"json"] description: NotRequired[str] - licenses: NotRequired[Sequence[FlLicense]] - sources: NotRequired[Sequence[FlSource]] + licenses: NotRequired[Sequence[License]] + sources: NotRequired[Sequence[Source]] path: str scheme: Literal["file"] format: Literal[ @@ -96,8 +86,8 @@ class FlResource(TypedDict): encoding: NotRequired[Literal["utf-8"]] hash: str bytes: int - dialect: NotRequired[FlCsvDialect | FlJsonDialect] - schema: NotRequired[FlSchema] + dialect: NotRequired[CsvDialect | JsonDialect] + schema: NotRequired[Schema] class Contributor(TypedDict, total=False): @@ -110,7 +100,7 @@ class Contributor(TypedDict, total=False): organization: str -class FlPackage(TypedDict): +class Package(TypedDict): """ A subset of the `Data Package`_ standard. 
@@ -122,11 +112,11 @@ class FlPackage(TypedDict): version: str homepage: str description: str - licenses: Sequence[FlLicense] + licenses: Sequence[License] contributors: Sequence[Contributor] - sources: Sequence[FlSource] + sources: Sequence[Source] created: str - resources: Sequence[FlResource] + resources: Sequence[Resource] class ParsedPackage(TypedDict): diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index 7f61323c4..ea38eb971 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -4,10 +4,9 @@ import string import urllib.request from pathlib import Path -from typing import TYPE_CHECKING, Any, ClassVar, Literal +from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple from tools.datasets import datapackage -from tools.datasets.models import NpmUrl if TYPE_CHECKING: import sys @@ -21,7 +20,7 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias - from tools.datasets.models import FlPackage, ParsedPackage + from tools.datasets.models import Package, ParsedPackage BranchOrTag: TypeAlias = 'Literal["main"] | LiteralString' # noqa: TC008 @@ -29,6 +28,11 @@ __all__ = ["Npm"] +class NpmUrl(NamedTuple): + CDN: LiteralString + GH: LiteralString + + class Npm: """https://www.jsdelivr.com/docs/data.jsdelivr.com#overview.""" @@ -60,7 +64,6 @@ def dataset_base_url(self, version: BranchOrTag, /) -> LiteralString: - Encodes the endpoint at this stage - Use github if its the only option (since its slower otherwise) - npm only has releases/tags (not branches) - - So the column can be renamed ``"url_npm"`` -> ``"url"`` """ return f"{self.url.GH if is_branch(version) else self.url.CDN}{version}/data/" @@ -105,7 +108,7 @@ def file_gh( return read_fn(response) def datapackage(self, *, tag: LiteralString, frozen: bool = False) -> ParsedPackage: - pkg: FlPackage = ( + pkg: Package = ( json.loads(self._paths["datapackage"].read_text("utf-8")) if frozen else self.file_gh(tag, "datapackage.json") From 0df79b0a4baa4cce76516315cd4a91f13221ff1c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 14 Jan 2025 17:27:34 +0000 Subject: [PATCH 155/201] test: Skip `is_image` datasets --- tests/test_datasets.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 95a6fb0ad..923fb9fbc 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -617,18 +617,7 @@ def _dataset_params(*, skip: Container[str] = ()) -> Iterator[ParameterSet]: @datasets_debug @pytest.mark.parametrize( ("name", "suffix"), - list( - _dataset_params( - skip=( - "climate", - "graticule", - "sf-temps", - "iris", - "weball26", - "seattle-temps", - ) - ) - ), + list(_dataset_params(skip=("7zip", "ffox", "gimp"))), ) def test_all_datasets( polars_loader: Loader[pl.DataFrame, pl.LazyFrame], From ee0d381b4a4d37c2e436b8a73cd95f1b1a5f6f97 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 14 Jan 2025 18:08:19 +0000 Subject: [PATCH 156/201] refactor: Make caching **opt-out**, use `$XDG_CACHE_HOME` Caching is the more sensible default when considering a notebook environment Using a standardised path now also https://specifications.freedesktop.org/basedir-spec/latest/#variables --- altair/datasets/_cache.py | 24 +++++++++++++++++------- altair/datasets/_loader.py | 12 ++++++++---- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 
3e4beb82d..fdc8c3db8 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -217,9 +217,12 @@ def schema_cast(self, name: _Dataset, /) -> Iterator[nw.Expr]: class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): - """Optional caching of remote dataset requests.""" + """Opt-out caching of remote dataset requests.""" _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" + _XDG_CACHE: ClassVar[Path] = ( + Path(os.environ.get("XDG_CACHE_HOME", Path.home() / ".cache")) / "altair" + ).resolve() def __init__(self, reader: _Reader[IntoDataFrameT, IntoFrameT], /) -> None: self._rd: _Reader[IntoDataFrameT, IntoFrameT] = reader @@ -273,9 +276,13 @@ def path(self) -> Path: """ Returns path to datasets cache. - By default, this can be configured using the environment variable: + Defaults to (`XDG_CACHE_HOME`_): - "ALTAIR_DATASETS_DIR" + "$XDG_CACHE_HOME/altair/" + + But can be configured using the environment variable: + + "$ALTAIR_DATASETS_DIR" You can set this for the current session via: @@ -289,10 +296,13 @@ def path(self) -> Path: You can *later* disable caching via: >>> load.cache.path = None + + .. _XDG_CACHE_HOME: + https://specifications.freedesktop.org/basedir-spec/latest/#variables """ self._ensure_active() - fp = Path(os.environ[self._ENV_VAR]) - fp.mkdir(exist_ok=True) + fp = Path(usr) if (usr := os.environ.get(self._ENV_VAR)) else self._XDG_CACHE + fp.mkdir(parents=True, exist_ok=True) return fp @path.setter @@ -300,7 +310,7 @@ def path(self, source: StrPath | None, /) -> None: if source is not None: os.environ[self._ENV_VAR] = str(Path(source).resolve()) else: - os.environ.pop(self._ENV_VAR, None) + os.environ[self._ENV_VAR] = "" def __iter__(self) -> Iterator[Path]: yield from self.path.iterdir() @@ -316,7 +326,7 @@ def is_active(self) -> bool: return not self.is_not_active() def is_not_active(self) -> bool: - return os.environ.get(self._ENV_VAR) is None + return os.environ.get(self._ENV_VAR) == "" def is_empty(self) -> bool: """Cache is active, but no files are stored in ``self.path``.""" diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 63bd5f3f7..ef1cf46d3 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -29,7 +29,7 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): """ - Load examples **remotely** from `vega-datasets`_, with *optional* caching. + Load examples **remotely** from `vega-datasets`_, with caching. A new ``Loader`` must be initialized by specifying a backend: @@ -280,11 +280,11 @@ def url( @property def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: """ - Optional caching of remote dataset requests. + Caching of remote dataset requests. - Enable caching: + Configure cache path: - self.cache.path = ... + self.cache.path = "..." 
Download the latest datasets *ahead-of-time*: @@ -293,6 +293,10 @@ def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: Remove all downloaded datasets: self.cache.clear() + + Disable caching: + + self.cache.path = None """ return self._reader.cache From 138ede601ef35f136e3e54c34e3cf001c2679c6b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 15 Jan 2025 12:45:12 +0000 Subject: [PATCH 157/201] refactor(typing): Add `_iter_results` helper --- altair/datasets/_cache.py | 17 +++++++++++++++-- altair/datasets/_readers.py | 6 +++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index fdc8c3db8..c3ca65848 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -3,7 +3,7 @@ import os import sys from pathlib import Path -from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, get_args +from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, cast, get_args import narwhals.stable.v1 as nw from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT @@ -21,6 +21,8 @@ from _typeshed import StrPath from narwhals.stable.v1.dtypes import DType + from altair.datasets._typing import Metadata + if sys.version_info >= (3, 11): from typing import LiteralString else: @@ -75,6 +77,17 @@ } +def _iter_results(df: nw.DataFrame[Any], /) -> Iterator[Metadata]: + """ + Yield rows from ``df``, where each represents a dataset. + + See Also + -------- + ``altair.datasets._typing.Metadata`` + """ + yield from cast("Iterator[Metadata]", df.iter_rows(named=True)) + + class CompressedCache(Protocol[_KT, _VT]): fp: Path _mapping: MutableMapping[_KT, _VT] @@ -263,7 +276,7 @@ def download_all(self) -> None: print("Already downloaded all datasets") return None print(f"Downloading {len(frame)} missing datasets...") - for row in frame.iter_rows(named=True): + for row in _iter_results(frame): fp: Path = self.path / (row["sha"] + row["suffix"]) with self._rd._opener.open(row["url"]) as f: fp.touch() diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index d0094f5ff..330f85642 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -31,7 +31,7 @@ import narwhals.stable.v1 as nw from narwhals.stable.v1.typing import IntoDataFrameT, IntoExpr, IntoFrameT -from altair.datasets._cache import DatasetCache +from altair.datasets._cache import DatasetCache, _iter_results from altair.datasets._typing import EXTENSION_SUFFIXES, Metadata, is_ext_read if TYPE_CHECKING: @@ -147,10 +147,10 @@ def dataset( **kwds: Any, ) -> IntoDataFrameT: df = self.query(**_extract_constraints(name, suffix)) - result = next(df.iter_rows(named=True)) + result = next(_iter_results(df)) url = result["url"] fn = self.read_fn(url) - if default_kwds := self._schema_kwds(result): # type: ignore + if default_kwds := self._schema_kwds(result): kwds = default_kwds | kwds if kwds else default_kwds if self.cache.is_active(): From 1a4f1c10c52f74d51a4e9ef78fc5d8c21cbded84 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 15 Jan 2025 13:05:49 +0000 Subject: [PATCH 158/201] feat(DRAFT): Replace `UrlCache` w/ `CsvCache` Now that only a single version is supported, it is possible to mitigate the `pandas` case w/o `.parquet` support (https://github.com/vega/altair/pull/3631#issuecomment-2480832609) This commit adds the file and some tools needed to implement this - but I'll need to follow up with some more changes to 
integrate this into `_Reader` --- altair/datasets/__init__.py | 4 +- altair/datasets/_cache.py | 59 +++++++++++++--------- altair/datasets/_metadata/metadata.csv.gz | Bin 0 -> 3577 bytes altair/datasets/_metadata/url.csv.gz | Bin 858 -> 0 bytes tests/test_datasets.py | 6 +-- tools/datasets/__init__.py | 16 +++--- 6 files changed, 45 insertions(+), 40 deletions(-) create mode 100644 altair/datasets/_metadata/metadata.csv.gz delete mode 100644 altair/datasets/_metadata/url.csv.gz diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 6095dd404..4986f671d 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -80,9 +80,9 @@ def url( url = load.url(name, suffix) except AltairDatasetsError: - from altair.datasets._cache import url_cache + from altair.datasets._cache import csv_cache - url = url_cache[name] + url = csv_cache.url(name) return url diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index c3ca65848..8b14e3660 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -37,15 +37,16 @@ _Dataset: TypeAlias = "Dataset | LiteralString" # noqa: TC008 _FlSchema: TypeAlias = Mapping[str, FlFieldStr] -__all__ = ["DatasetCache", "UrlCache", "url_cache"] +__all__ = ["DatasetCache"] _KT = TypeVar("_KT") _VT = TypeVar("_VT") _T = TypeVar("_T") -_URL: Final[Path] = Path(__file__).parent / "_metadata" / "url.csv.gz" -_SCHEMA: Final[Path] = Path(__file__).parent / "_metadata" / "schemas.json.gz" +_METADATA_DIR: Final[Path] = Path(__file__).parent / "_metadata" +_SCHEMA: Final[Path] = _METADATA_DIR / "schemas.json.gz" +_CSV: Final[Path] = _METADATA_DIR / "metadata.csv.gz" _FIELD_TO_DTYPE: Mapping[FlFieldStr, type[DType]] = { "integer": nw.Int64, @@ -109,19 +110,23 @@ def get(self, key: _KT, default: _T, /) -> _VT | _T: return self._mapping.get(key, default) -class UrlCache(CompressedCache[_KT, _VT]): +class CsvCache(CompressedCache["_Dataset", "Metadata"]): """ - `csv`_, `gzip`_ -based, lazy url lookup. + `csv`_, `gzip`_ -based, lazy metadata lookup. - Operates on a subset of available datasets: - - Excludes `.parquet`, which `cannot be read via url`_ + Used as a fallback for 2 scenarios: + + 1. ``url(...)`` when no optional dependencies are installed. + 2. ``(Loader|load)(...)`` when the backend is missing* ``.parquet`` support. + + Notes + ----- + *All backends *can* support ``.parquet``, but ``pandas`` requires an optional dependency. .. _csv: https://docs.python.org/3/library/csv.html .. _gzip: https://docs.python.org/3/library/gzip.html - .. 
_cannot be read via url: - https://github.com/vega/vega/issues/3961 """ def __init__( @@ -129,12 +134,10 @@ def __init__( fp: Path, /, *, - columns: tuple[str, str], - tp: type[MutableMapping[_KT, _VT]] = dict["_KT", "_VT"], + tp: type[MutableMapping[_Dataset, Metadata]] = dict["_Dataset", "Metadata"], ) -> None: self.fp: Path = fp - self.columns: tuple[str, str] = columns - self._mapping: MutableMapping[_KT, _VT] = tp() + self._mapping: MutableMapping[_Dataset, Metadata] = tp() def read(self) -> Any: import csv @@ -143,24 +146,32 @@ def read(self) -> Any: b_lines = f.readlines() reader = csv.reader((bs.decode() for bs in b_lines), dialect=csv.unix_dialect) header = tuple(next(reader)) - if header != self.columns: - msg = f"Expected header to match {self.columns!r},\nbut got: {header!r}" - raise ValueError(msg) - return dict(reader) - - def __getitem__(self, key: _KT, /) -> _VT: - if url := self.get(key, None): - return url + return {row[0]: dict(zip(header, row)) for row in reader} + def __getitem__(self, key: _Dataset, /) -> Metadata: + if result := self.get(key, None): + return result from altair.datasets._typing import Dataset if key in get_args(Dataset): - msg = f"{key!r} cannot be loaded via url." + msg = f"{key!r} cannot be loaded via {type(self).__name__!r}." raise TypeError(msg) else: msg = f"{key!r} does not refer to a known dataset." raise TypeError(msg) + def url(self, name: _Dataset, /) -> str: + if result := self.get(name, None): + return result["url"] + from altair.datasets._typing import Dataset + + if name in get_args(Dataset): + msg = f"{name!r} cannot be loaded via url." + raise TypeError(msg) + else: + msg = f"{name!r} does not refer to a known dataset." + raise TypeError(msg) + class SchemaCache(CompressedCache["_Dataset", "_FlSchema"]): """ @@ -359,7 +370,5 @@ def _ensure_active(self) -> None: raise ValueError(msg) -url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache( - _URL, columns=("dataset_name", "url") -) schema_cache = SchemaCache(_SCHEMA) +csv_cache = CsvCache(_CSV) diff --git a/altair/datasets/_metadata/metadata.csv.gz b/altair/datasets/_metadata/metadata.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..422e10cd48f3b181b47faac540f91c111ec767b7 GIT binary patch literal 3577 zcmVWbiwFn+00002|7~S-VPs)+VJ>5Hb^xtf%XZs17Toh&><-?KSU;C)nqU^>!4jvu%5A`gdCYyBS&t>4IH zp?p2c;h{f#>|f|f#=()wX+!&Q_$0%5W!Rm6A9i%?X)_LgKMdQ4XTn;2mKS=)S zeRw_|cZYv`_)vNK+wTXKP5*iS+g6Sro}NE^mM1#@c;pND!#`iB-+mZpA7)@3_50nh zKOP=_tB21I^Xxn^TG`+yNI%4W{xGOiD6rR!qVmaA+eow3QnK^bT4lVq$)>*#`W9VB z+wF-qkGs9paktOU;%@D-WI`}k5E6fq({0bKD zK!VGm4lhNnG%nN2Oh;vZ{1DE|=O0>=&#&$Thun!GsfbxB2MRdcDzNQ7e@$2Hhkt&*C(MA4C3lor;aKFqN9$9{W7Jx->=65ZmR=0wR*A>zn|T+5ZQ&*GFR zpwLv5s91?xgZYZAtF)iDc^poAtdz@UaUEel1}(}tDNQX)6~n>in6g1|mlB%8p7Y-a zd_~sP!93PsTV;FPqqPF-ISt<}Uy?I<7Z;~yNm2V+*id3swav;XineiaY^-jnFzSt^ z6*Rtu5|mDu_i}v<^ZYW+Xo!oeqf)5nK9|@usqCUh7@<-k36gP&Bxz-WM6-s&ExNAm z`xl`Z`FS!$yiUs_-f)9xsX-g%H5uK+8fR=I7d!>SzKPOhMV8(m>$pGA;VAp5qF<*| z$Rwv0hs>y4gi}C8Pf-;}u*_K~#F|~RjHHl)QnNzE+@b4u-+%5Q5HDLi&8BMZf?m4c z*0Q2jeQX7p?j@kaSVPtpv%`0HvKl^m z&az96qL<7`#CtLZdLStd5k)3fQ&-gW8ClohMrM6@9#1)`0u=S)q_hU1tXhhmB}Iyy zT6G#)ZPZ;BDlw-L$_ltwWEa4XEIZk(z z8&8~yjXv`lD)#`d9E^HLVqBWKR6=Mimkcf%cndRXvF4jiG?q=zy17(I;5iIKM3`Sd z;|lSH$9o!|X7>au5*Bw`108QvaRCych2V;yjB1CaWL@$Gc03|uq|z@d>Xe*`esxk2iPEtJNd zWm55w&8p;v#$b~QCc}fTC*a=@n9dh2BI3>Bc`bM!&2lLL%vPhiW(`wbTeU4&WCijY zd7ULf_!Io%dLlO%yEY8RLYvL}ET-w)lvRY=S{zFXf75f$00$dz-cP9RpHvBz^YqgEKlH}JYS;LpiAMa?B~O^k~Z 
zSD@UKX$&K#&;`Vz(7@TSwHZ<4Mjz%;I9HU-8)#iWe4$6#NQDEg{qgH%Z|(=xVW?0gwRsI3U}Z(;0yAIhw=$Kj;pkXml>0b9sy(|EpP z6-*fP0-KbABt*G#aAO>`4@p)?>npNuLy!A@nBP9kv-1RAaCxiEPGEK!av`zF-nED= zhnX=UK|aG0G>NRzisOb`Y~5zq^058aupdrO&+~rYFHZdyyJ&v6+c(@et8tUPZ;hLH z9V|sCjtYB;%ncgF(YI_?1pYT5PF(&y9$*bU-;=qnPG%EPyjVz!foF&MQMHvy>_yk) zbCCcV(gbTLq|L7-a}RN1^WRgE|MXkAXLDVhTpFrNT>5NuF&cIT8(?8SvMO2yl}uqI zOD41wPsBy1wRG+=g3j>iwEOsk2ezA&Y{)y5~YqF}uN*u4x} z+ShBfnx~gpSbnmb;Vv7w6v?65RutCMP^*a)5f))Tb#R8;3goxYfcx|7+0kV(?YPZ3 zr{%K3CK%~9p1nKtuZsE#cMyHn$zj`3sdcS>jjg-^*KLRW3#5_Z3HWtBpLIbGEIwJW z%32kqn$ZB|R^clZ>?xdYejHgs%=7x=@Y?DuqiIr{#CSpymO+G+p#eb=aSoYuAIP+EjTl++pp4_&9w zzmTs8sk*|pdE|HI*`V;tCWLj*5{xRPZ56%@z{8Eu1qU# zVB8=B!WI|F6ZZb%W(dw~_IYgGx)oNoR9$r%h0oc}% zxr29u|08k<@=o@LkI$z)j2ms|dnkYLrTO*ua#LgraR}0~)+O5-oF4N=H>1#g6zW+j zNUWBB1<&t7?hZo7<3=98;P>-*vN!U0vv^)}$>`+=GpgAH!CJU#=%nh!Gkj9Bwq$l3 z3#zs#@hixE1LIpVkI%ztzxn!j>7R#ha(nZg^H;1MkA)X=W$#_k#5Kq9`kxBA1q5sf z8F|?vT2?`Iv5_lwq&dSOgz|;&9-seoA=B)6q5ei~ki~NeO zI~<4lX^yA!c)ptEmhZQR6RM&M1ek`HDUdVG7guD|GKd@j1#9Yxv&&a#-HH40idvno zrCsNfq;X;KTLKsRc+J_uJA@;R!;y~iSZUWE&s}_e4EO6(B&D#}n&oN@ zjEq-4)s(Z-rdrj|HVj4d4@bHrrPf%pZ-eYU4;u)uVPZ1P=1Y99ui_V+Q{m}pk9OBe z^(}C*Z~NW2w?+CZqspO6|+lDExOUqTGpGnt^-Gn8*a zAUTT$)7HGUyH$l>L_R?+UBRC>`hIafBuskw-WPU;EXt0u5^Lp92(Bx)Y{#nuRR<^d znfw|O?=W^>o-vy_jN|#DSz2tcDvJxnS*SUKM_?;xMCRHg=f>xkN{m!rac$rgSoa0Z z;Iq`@FyH>XS(rix6>#}$Q{j+8QH6rGG@haGc;v>C#JfT%6pdR5gVL{auCR5CnXHj@=snBL&){_J;k;MvE{ zvx`AcI@$CSR>v|toM=--L2RS;XWuk3Au8pG*_EiYDl5R>!t4GIPe~^$7d8L@%yPriUB(f+jYm?16g8h zHL~dG;jaDl)60(I6dMrw(k2G@iI!!GisYU<@JGXYcnnrQeECqrW9R+gzV7!zn%%3D zpw(!12K>J5_9L`B|6#Jkx^F#KW}m+8Q}k{y?E_bKK7yC%1&V400k{V@fvP5yx*^t_-ZLDoww=?gn3*bc`BPATkr%g$JYe&XGg`{q*<>F z1xxY865JBfO3~H9XC}}HFy3;nkdWRDl^=(|5Wg$R6P}5hvQ<5B0+-{9C6(jO17$8} zBHymGYE-5ra0Mvo0k^*U9XRO)Ss+C{o$$~7T#C*m@T9pVl$9J_6G|VMgaM2MBS?$o z)|$ppZRZ_(I-Du^j9jbFhb`Eotldr6wYmkmGG8_@Ush(_BI$62@*8?W^bOaVpjM$I z+w!4Tajn_=!I@ZztOKrn$CQa1Fl^vogLyhy)e{wcuIa&&@;;u>nVV@hecZP74ARql*(; zo;D~$n6*i$p631(Z#?cBvA?yPa(213>KLdDr9_qZ$Lcn{OYnZT0!1Sip=I%T131T`uz|UBYNGK-MR-yiTCyk70=*a|(GN8Ul$2L>5^{=v0TW#UNTU}3016M7O8@`> diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 923fb9fbc..a65b96bd7 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -268,13 +268,13 @@ def test_url(name: Dataset) -> None: def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: import altair.datasets - from altair.datasets._cache import url_cache + from altair.datasets._cache import csv_cache monkeypatch.setitem(sys.modules, "polars", None) monkeypatch.setitem(sys.modules, "pandas", None) monkeypatch.setitem(sys.modules, "pyarrow", None) - assert url_cache._mapping == {} + assert csv_cache._mapping == {} with contextlib.suppress(AltairDatasetsError): monkeypatch.delattr(altair.datasets._loader, "load", raising=False) @@ -283,7 +283,7 @@ def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: assert match_url("jobs", url("jobs")) - assert url_cache._mapping != {} + assert csv_cache._mapping != {} assert match_url("cars", url("cars")) assert match_url("stocks", url("stocks")) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 7350ede7f..534bf6b9c 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -40,7 +40,7 @@ _PathAlias: TypeAlias = Literal[ "typing", - "url", + "metadata-csv", "metadata", "schemas", ] @@ -82,11 +82,12 @@ def __init__( out_dir_tools.mkdir(exist_ok=True) kwds_npm = kwds_npm or {} self._npm: Npm = Npm(out_dir_tools, **kwds_npm) + METADATA = "metadata" self.paths = 
types.MappingProxyType["_PathAlias", Path]( { "typing": out_fp_typing, - "url": out_dir_altair / "url.csv.gz", - "metadata": out_dir_altair / "metadata.parquet", + "metadata-csv": out_dir_altair / f"{METADATA}.csv.gz", + "metadata": out_dir_altair / f"{METADATA}.parquet", "schemas": out_dir_altair / "schemas.json.gz", } ) @@ -120,18 +121,13 @@ def refresh( package = self.npm.datapackage(tag=tag, frozen=frozen) self.write_parquet(package["features"], self.paths["metadata"]) self.write_json_gzip(package["schemas"], self.paths["schemas"]) - # FIXME: 2-Part replacement - # - [x] Switch source to `"metadata"` + refresh (easy) - # - [ ] Rewriting `UrlCache` to operate on result rows (difficult) - urls_min = ( + metadata_min = ( package["features"] .lazy() .filter(~(col("suffix").is_in((".parquet", ".arrow")))) - .select("dataset_name", "url") .sort("dataset_name") - .collect() ) - self.write_csv_gzip(urls_min, self.paths["url"]) + self.write_csv_gzip(metadata_min, self.paths["metadata-csv"]) if include_typing: self.generate_typing() From 32fd0f9444cbc12630d6b4d27ed3ebfdb0e7ac67 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 16 Jan 2025 13:39:32 +0000 Subject: [PATCH 159/201] refactor: Misc reworking caching - Made paths a `ClassVar` - Removed unused `SchemaCache` methods - Replace `_FIELD_TO_DTYPE` w/ `_DTYPE_TO_FIELD` - Only one variant is ever used Use a `SchemaCache` instance per-`pandas`-based reader - Make fallback `csv_cache` initialization lazy - Only going to use the global when no dependencies found - Otherwise, instance-per-reader --- altair/datasets/_cache.py | 74 ++++++++++++++++--------------------- altair/datasets/_readers.py | 10 +++-- 2 files changed, 37 insertions(+), 47 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 8b14e3660..89ed16858 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -37,7 +37,7 @@ _Dataset: TypeAlias = "Dataset | LiteralString" # noqa: TC008 _FlSchema: TypeAlias = Mapping[str, FlFieldStr] -__all__ = ["DatasetCache"] +__all__ = ["CsvCache", "DatasetCache", "SchemaCache", "csv_cache"] _KT = TypeVar("_KT") @@ -45,25 +45,23 @@ _T = TypeVar("_T") _METADATA_DIR: Final[Path] = Path(__file__).parent / "_metadata" -_SCHEMA: Final[Path] = _METADATA_DIR / "schemas.json.gz" -_CSV: Final[Path] = _METADATA_DIR / "metadata.csv.gz" - -_FIELD_TO_DTYPE: Mapping[FlFieldStr, type[DType]] = { - "integer": nw.Int64, - "number": nw.Float64, - "boolean": nw.Boolean, - "string": nw.String, - "object": nw.Struct, - "array": nw.List, - "date": nw.Date, - "datetime": nw.Datetime, - # "time": nw.Time, (Not Implemented, but we don't have any cases using it anyway) - "duration": nw.Duration, + +_DTYPE_TO_FIELD: Mapping[type[DType], FlFieldStr] = { + nw.Int64: "integer", + nw.Float64: "number", + nw.Boolean: "boolean", + nw.String: "string", + nw.Struct: "object", + nw.List: "array", + nw.Date: "date", + nw.Datetime: "datetime", + nw.Duration: "duration", + # nw.Time: "time" (Not Implemented, but we don't have any cases using it anyway) } """ -Similar to an inverted `pl.datatypes.convert.dtype_to_ffiname`_. +Similar to `pl.datatypes.convert.dtype_to_ffiname`_. -But using the string repr of ``frictionless`` `Field Types`_ to `narwhals.dtypes`_. +But using `narwhals.dtypes`_ to the string repr of ``frictionless`` `Field Types`_. .. 
_pl.datatypes.convert.dtype_to_ffiname: https://github.com/pola-rs/polars/blob/85d078c066860e012f5e7e611558e6382b811b82/py-polars/polars/datatypes/convert.py#L139-L165 @@ -73,10 +71,6 @@ https://narwhals-dev.github.io/narwhals/api-reference/dtypes/ """ -_DTYPE_TO_FIELD: Mapping[type[DType], FlFieldStr] = { - v: k for k, v in _FIELD_TO_DTYPE.items() -} - def _iter_results(df: nw.DataFrame[Any], /) -> Iterator[Metadata]: """ @@ -129,14 +123,13 @@ class CsvCache(CompressedCache["_Dataset", "Metadata"]): https://docs.python.org/3/library/gzip.html """ + fp = _METADATA_DIR / "metadata.csv.gz" + def __init__( self, - fp: Path, - /, *, tp: type[MutableMapping[_Dataset, Metadata]] = dict["_Dataset", "Metadata"], ) -> None: - self.fp: Path = fp self._mapping: MutableMapping[_Dataset, Metadata] = tp() def read(self) -> Any: @@ -189,14 +182,13 @@ class SchemaCache(CompressedCache["_Dataset", "_FlSchema"]): https://github.com/vega/vega-datasets/pull/631 """ + fp = _METADATA_DIR / "schemas.json.gz" + def __init__( self, - fp: Path, - /, *, tp: type[MutableMapping[_Dataset, _FlSchema]] = dict["_Dataset", "_FlSchema"], ) -> None: - self.fp: Path = fp self._mapping: MutableMapping[_Dataset, _FlSchema] = tp() def read(self) -> Any: @@ -225,20 +217,6 @@ def by_dtype(self, name: _Dataset, *dtypes: type[DType]) -> list[str]: else: return list(match) - def schema(self, name: _Dataset, /) -> Mapping[str, DType]: - return { - column: _FIELD_TO_DTYPE[tp_str]() for column, tp_str in self[name].items() - } - - def schema_cast(self, name: _Dataset, /) -> Iterator[nw.Expr]: - """ - Can be passed directly to `.with_columns(...). - - BUG: `cars` doesnt work in either pandas backend - """ - for column, dtype in self.schema(name).items(): - yield nw.col(column).cast(dtype) - class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): """Opt-out caching of remote dataset requests.""" @@ -370,5 +348,15 @@ def _ensure_active(self) -> None: raise ValueError(msg) -schema_cache = SchemaCache(_SCHEMA) -csv_cache = CsvCache(_CSV) +csv_cache: CsvCache + + +def __getattr__(name): + if name == "csv_cache": + global csv_cache + csv_cache = CsvCache() + return csv_cache + + else: + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 330f85642..d69b50e1d 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -31,7 +31,7 @@ import narwhals.stable.v1 as nw from narwhals.stable.v1.typing import IntoDataFrameT, IntoExpr, IntoFrameT -from altair.datasets._cache import DatasetCache, _iter_results +from altair.datasets._cache import CsvCache, DatasetCache, SchemaCache, _iter_results from altair.datasets._typing import EXTENSION_SUFFIXES, Metadata, is_ext_read if TYPE_CHECKING: @@ -252,12 +252,12 @@ class _PandasReaderBase(_Reader["pd.DataFrame", "pd.DataFrame"], Protocol): - https://pandas.pydata.org/docs/reference/api/pandas.read_json.html """ - def _schema_kwds(self, result: Metadata, /) -> dict[str, Any]: - from altair.datasets._cache import schema_cache + _schema_cache: SchemaCache + def _schema_kwds(self, result: Metadata, /) -> dict[str, Any]: name: Any = result["dataset_name"] suffix = result["suffix"] - if cols := schema_cache.by_dtype(name, nw.Date, nw.Datetime): + if cols := self._schema_cache.by_dtype(name, nw.Date, nw.Datetime): if suffix == ".json": return {"convert_dates": cols} elif suffix in {".csv", ".tsv"}: @@ -278,6 +278,7 @@ def __init__(self, name: _Pandas, /) -> None: ".parquet": 
pd.read_parquet, } self._scan_fn = {".parquet": pd.read_parquet} + self._schema_cache = SchemaCache() class _PandasPyArrowReader(_PandasReaderBase): @@ -296,6 +297,7 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: ".parquet": partial(pd.read_parquet, dtype_backend=_pa), } self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend=_pa)} + self._schema_cache = SchemaCache() def _pl_read_json_roundtrip(source: Path | IOBase, /, **kwds: Any) -> pl.DataFrame: From a1839df416e56814fa7f74afead758377f38550b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 16 Jan 2025 13:42:20 +0000 Subject: [PATCH 160/201] chore: Include `.parquet` in `metadata.csv.gz` - Readable via url w/ `vegafusion` installed - Currently no cases where a dataset has both `.parquet` and another extension --- altair/datasets/_metadata/metadata.csv.gz | Bin 3577 -> 3632 bytes tools/datasets/__init__.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/altair/datasets/_metadata/metadata.csv.gz b/altair/datasets/_metadata/metadata.csv.gz index 422e10cd48f3b181b47faac540f91c111ec767b7..30793abc86eee5f4255edba76dd1d9b739e8d66e 100644 GIT binary patch delta 3501 zcmV;e4N~&?8?YRZ8Gp%=+&GrJ^II}Y*q1(yO<(&s_p%4PhfJ0tgVC`_t^WFPMv$>& zN>x-*CbLWc5t7`&x445~I?_Rozi;VHo(`wh`u{w&ej}HK^6e;xr~dGJ|3)t|4vth# z8`_V<7a7hg!|we1u%ly7n{oKZVc0&s64vUgywTI)m7Y%fO@9~v+wY!!+ihQ-#?g6T zbI4D}{Yjo0Z4Q6+Gx!uZRHagim0e0+shq2hj+`k)hZu=P)u-3vad-H~r%#o)zx{Dw z+4Nubzis9C>E-p)7kQ!ck7vG+Km5-d_1jP5?9&Xaqkg{|_Q%81Z}sryX`Y=YMk^cq z1nEz)pFa#L6=4eOb)%?!a@97{Y_*i^ytP&t?`^W_?}NTa*U@%+qRsPeFLm7Q^RswZ zyDXUy%$3BS;=c@|&JwLtp|+}0(}9_S*Iru^t+f^A2y%V}iw_{d1C#) zvVZ=Qb^#v<@33`+_8a*&lbr!24u`BO{hklUeg9den7><-&;cU{e?-`o6Tirl`T-_? ze#16_K0Kg4Oc9<`%!{LivdUsy3ptT3*y9>(tX4_QUZUv8ElLY(Q6Fa5`(wYoq8=wx zVTo?>PIIDUs1R}FLayb?*k^Id6i{d?N>r@Gt-*Xn)>Yci+dK}ZJyyzPv$&41AA=U< zoRp@PrHbKTb4=MFxJwDmVbA&R1HK}E>*`>h>#(h|J?_z3f%Tk*AC@o4nY@dOQ?sO~ zeJyM#v8vi;WfVo*xHvXew^SJQ#?lHJ-$MyXC(L`fK8AUInPxP^#nn+M)N`LpY?@Sd z(IbpdDUk%pI7O1QGC`tQ!{HuXSNHv!(2V>%nIc}NbVLA2DMjq;j|ZeoppGd7Y7 zo`PZDMCr03OYe|%+#l$0l>JoEuhS`Hl2eOAW>hZ1DWIaKs0t)l<}4Iq&8}HSQb<9m zS)pPc&~?1;zw{7@mo1)VQ#E%%FWqlzSy8J#wt`Ie5>R5SBnlN$5gG{fC8M+E6|nE1 zbwc~&>%UL*`5NMPD^tNSI;U%YGx|W|2K_P@cGaPzLiJ|$wfg2vs_4Y6FxrEw3fs|G z{e1%O5q7N&|GCKYadaN&7))BO(^be&XW1o3(M#qe;yoDyJ&+WKh$54#sVnOGjI3*L zBeOm{kEfhe0g8HYQd)yhRxL%(k|IS;tvU^@HtH@5m6%frWd+Vx!NzhRP$rD+i-KkQkSyE|m~k%O!)0 z2HwJqTCDkI6OColvu-YbRT6j(!w?bX7tpvuyyNka#^>2R!HR^%-PS6|7=LRtvSSn<@grthtnvzDwqe$8M zE1ttU0`?8nl=lBV!JFLHu1ngwY?U#KTXC*V@FzUk8Udj0PPs2}7!=WoLkEi5(ZqRLF3#G9En^ZhJfGWA6%h{xY$e7eg?=>xtZ9?AkCK3vD*@vzVr{ z!cY;`adBcOxR(@v!63(yqYea>lm!i%eaX7Gf>t1O0)}wdqC-B$#Wg5Lv zi{)La#WdBRhhWdO5#Lme=zx|hjiN6^l&VNCI@J63BDpK&`2Uz z&IK(3${U7%VLT)>YK04a2d|q0{+z5+)LatR#JD(d1u|5b#&DGiT|g`f4Qv`)n-Mi` zG?gBOb4A&_gVy!KS9+F>RM`32AHQ8T{{8A@_r2ANci%t(+0=~XNInX{r{*L#Eu*%; z&X+NS+KNE`9>(sEq0FXX98R(nspS@*7KY3=jfYi#R>4t1{IE$WNJ5k=2W!bu`;cUX zPq-rMHuSvjhxt{?JUdTdY=1)cIDvC#$c4lPvZqCCIh>gZ30?sHrb%R#R-B#OW9v4< zmWS>C4*TKs@;dML-{Ho6!MZ-x!9@KRY7t%6FXFp?z`T8by)qSIPBj~GE`_5?hSPj{n36xk3>rp(E)PEwtd-UkhT4&KaRC&O+d4;(H_(;KAa5~#-9 zLZ()1YYDoiCT4^h%mK6m6sw?KIUnn45;uI^TYr!}mCZGzZVR)rw$UtqXG*E41|ybT zjm{v)a}yf`wk=z$v$Lv=Pi95IdIzw38@9Bs*J?FSFSD@xm^#BMH*zVGL$$3atf`?^ z6DcBpEW%6c;0(7F$nT*6_m|s2*kv;9xXn4I<+8#i4XERb1c&}rQD5N>qR%=xY&$Bo zuGO!xm3QE}?XZ7?G%`E^zs~1_JLrMM$3#|HtAbQB8lc=Ne5HcDVN1T5T=VE}oLO;X z`iQO@iX6^T=!|umO)X#)j7xtO^K!j7KR}8G(vQ#uWwKkA6T@R|tuFXx;F- zsWW5@FC!oA;)91=;Lfd}+oGs5i-^g^i!oVcZH;w-sv^6h|6Xx*JK4UR`t2Oo^K{C_ 
zsP*DT>vb%Ft-)W4uILhc$eL_KeE9MH3*#K0)Kdv{hqFGdOv=D zFl!*=Zk_h8U*3&6$7x-Q4y8pnL`kht@X&P%{R{bukg6+e%MXBdyP@A+-?m>S)A?0K zAzpl@s{?6lyGl8jW_=DC@f7jL5RZ*l=gPF=I>{X}K;HDnz9M()Fuss1r}2gAjl*|8 zoian2u=s@35g~JPBb!Zy19J(*zQGKC5NltQb~ftfa$dvg$5=sgciK!Zx+iAy!hEwJ zEGH9^YH*W5VvJUgUCbe}jILH26Y=nxG6=x7hRg%JJN&;Rmmu$CfB60Nw1;t{?R=@` zFTOOt%U^DaY#|OoTGqN`TZ7YM-sol&+K)m#O9hG5@~`0eL&)7h=y=@7^H=Xiq+$>@M5m)y$hPS<~ZJER6)0ZfGr^-FIz;*DyWWJei_&AVD1h(`2UIF zIJ_LD*`GMNR8mZet;A|ngb9Ryb251593o{i_AlgRg`!+RV+M9XJHGb1hkTx2Vojqi zr=PeK1CON8Sy_1ln3_St`S4$P6>8aB?n91 zDzt$Z7hSLn-^E~>!x~Nx@Ve7@(co>yexA%}W9;=t(_bD^wT8`?5D|Ex8P-yghgvhJ zSp{XPX^P})T=Dkz4%-O&K5fSO^sscPSTVWfhvq3}U&P1Sw2X&;%}(HTovWw-#TrM@ z|8T-<&K5o(9BCYmbd={xyZ(6Y;`3v8T%RH-g~iq^S8HHoyz;50oSin+s)n{8yIT`SeMz{SD^py7ERsW76Ah?O*6 zQH$@%8fvunsnQN1r8=qe)F;acSwSgP3?kk#q!568=VZO?Gw=jhcD&X?h zrothGq6!6VX*@&W@yLxc7str7ElgpfQ9IPqE z_6-YqK-3m)y{c*wq9}V6Dj6Pc&dG}VSnu$4|Lu2m;MvE{vx`AcI@$CSR>v|toM=-- bL2RS;XWulF;0+B7!|VPZIj%aRA_Bj{07dAVxsL>OB4x== z(o8ca=hy(GBvs&jRDobR(m{?NxAYKDy3N2rR0^$x$5Z1nNoC!kyuoHcs?F?hktzdP^w1A+2AKg zKg53iFsM{0uwU1WqVmaA+eow3QnK^bT4lVq$)>*#`W9VB+wF-qkGs9paktOU;%@D- zWI`}k5E6fq({0bKDK!VGm4lhNnG%nN2Oh;vZ z{F8VA9|&)-b%pjD`8t!I0VNKDtSkMV566A~Nv4>;S(DNMBL}}n*p(AM$&>s6CV&2l zZ32C`Lw%ScJgJx$M+;?@#kdx7B3rP>HP~3KlA66l(UDt}7S^IZ%&_;zetShdPNu>V z-Qu0*M9EMg;>d+u%ayUu;*=?%&{UMDSczMM`HHNow4b+m98PYOBo{md z!@h~qWkr_WAnUk4(BUZisiI$}Q^+Kz7KhBJT!d3VMNd%`NU+RVD8!mwvy7yWf>N_W z#oVFmc;A2SArLQHJk6$R?t)&r-`29CR()&*neHW^#8^obDx@Mb5b8@tXU!{M-$3hx z_Q&V{o#@jw#BWxnf@5?}*MDa8j>aqW%Usx1hms1_o7vaun=`4R6Su->53VX~M`QK( z3A{zvwKn{5k?G^;JkT+iv|OjFkfF}9OOB$K%t^$1G6s4eDGm`uCRbBe)b$xz*WgBG zeRv*EIjI5^_2Q(o2BEB4ik>A!ikwuWK5 z|G_ofm0H}OR4D>G!9)bLu3d%kh_+Z%^~kp3^Ty=zN-FPRf>;0ad^|jA_329Jx-yC1 zjLN*2N`%U?V{K6m=1x+>bAaP;0i`AzPn?R4KJywX_W-XPjCx06T$;L6LTD|Q3@#dY z3o~l5=9^74mQByPxqnni;5iIKM3`Sd;|lSH$9o!|X7>au5*Bw`108QvaRCych2V;y zjB1CaWL@$Gc03|uq|z@d>Xe*`esxk2iPEtJNdWm55w&8p;v#$b~QCV#_&uP5N&5SY#vE+XR1 z;(0B2AI)+p0nApTx@HYiUR$*-S!4zB8+n~2LiiK>;(8)C7`rwM$3mOU{4A#F+>}*> z+gcn;3W7gHFvzjwr~^S*WI<13U$QQ)6qm@aaEN#Cx+mJb^eDIc3H)__nMQBaVnvi{ zF-6o!;%>oupXaJW!nDpcz-_C^gnU9Gx|*x9CJu8ye0{m zR>oaNQwvVVN@1k52JZ~5d)f@hg~y|en|y}-x^S^F*rMg-N0S3I2y0&v71^rDm2;_1 zAXcHV$8$BKRv5=O@VYtR&&fJP%_VV7jEfUjpxl&c3?rt{1;nDzz}c|18Bya#ALdaw zSCq{gXn$Qle4$6#NQDEg{qgH%Z|(=xVW?0gwR zsI3U}Z(;0yAIhw=$Kj;pkXml>0b9sy(|EpP6-*fP0-KbABt*G#aAO>`4@p)?>npNu zLy!A@nBP9kv-1RAaCxiEPGEK!av`zF-nED=hkuzdAwfRF5;Tdd(u(7TTWsBC*z&Ob z*RUT>PtWsy-!D%67Q1MExZ5|}IID4!y>E@1cpWT7D2@tyiOdZe#nHEHRs{YxAWmHV zJsw~UJ>QeLu1;nXQM_14i-Bi{`cbu&O6*0~5?hS4@9Fw6xk3>rp(E)PEws|qlQSKcq>R7EUcpO2HwnJC&O+dcN{KD(@U4- z5~#-9;F(*ottIH5nwSx4FbB{MP^^M_<$rvvt4X}#>t6bU?5S+7A@#a2D{C9g^4E-% zifTlZU5(Bl$8!@Kgfx_$)!A9q#wW9)V7&p@y$oC0*K4(!rjjg-^*KLRW3#5_Z3HWtBpLIbGEIwJW%32kqn$ZB|R^clZ>$tJ}%;>C|uM zxSpp|Hb$)%H(IY_32Y7iQglU^;6v7ABjUr4S4JS85!GQuCAkCF?KX65h!^<7qwM!| zeL4F6gINO^_u6Uy`hC}^bDY+-=uldOLzL7S1rJ@P(7%wc2&uZlwtNR@w|^V@?ezuq zWip*#WfbDY$Dlfp#1cy+E!D{f%iAOqx0f9xxAw+`bwymA`f zLB4YM=BHC;C=(VRDmo%$Zf<0=sc>K}q1ZQ=0b=co(#}TRT+VA)eIF}m?oONO-SEV0 zUYK9Thvj5KQVni0NQ}|yv44v>M3&LjYGWcEUQ-4E*w&D_gLi}fBXSAyPWFe7&!;_% z8*S%%D1Y&#`Stg5Q)CNq2-33FCEFUD9`i;wqtJd7>RBpCtd@TT&+kI+4noJ{MjpT5 z_w#tNH}ZJ1cwTeK=;a49s@Vj=TDWTHr0T^pd{VQvWOf`2s7R#ha(nZg^H;1MkA)X=W$#_k#5Kq9`kxBA1q5sf8F|?vT2?`Iv5_lwq&VS0v8^F{I65i)bq2{dcuMrAjTZ{aPuR9!v`e}}*^LW0R=9cfb zhZCxz3j~;km?@Am%@)G~-10R?O7inGgCXx)kX@rqiVucck*lcaHB@mmM%bM#t8 zPP(G!G%6-2swiC6H!ChiNP?VJD1du#-DxW?yUpSRiK~`3amN#KvduFTShNdpyx|V%qhXn zs^nnFTZJ|d#xBSpN#q5jtSeusdu-OT`u5%R? 
zpjhJw`X5es&Dp{`gd>f^k&g0MY1bdmU3`8F_v=$6rGK#4n&oN@jEq-4)s(Z-rdrj| zHVj4d4@bHrrPf%pZ-eYU4;u)uVPZ1P=1Y99ui_V+Q{m}pk9OBe^(}C*Z~NW2w?+CZqspO6|+lDExOUqTGpGnt^-Gn8*aAUTT$)7HGUyH$l> zL_R?+U4Ox!H~M~YJ|s+f`Q8_HhAhgCvJz|MPzbIow`|9&162nn_?i3~67Mi}U!F0W zIgI1^qFGvOuqulS#aXC1gGXR1Xhi1PB<{9 diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 534bf6b9c..37d487da0 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -124,7 +124,7 @@ def refresh( metadata_min = ( package["features"] .lazy() - .filter(~(col("suffix").is_in((".parquet", ".arrow")))) + .filter(col("suffix") != ".arrow") .sort("dataset_name") ) self.write_csv_gzip(metadata_min, self.paths["metadata-csv"]) From 2db8dafddb07ec82d8c3e6110b1f2303a37bbbc9 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 16 Jan 2025 13:47:47 +0000 Subject: [PATCH 161/201] feat: Extend `_extract_suffix` to support `Metadata` Most subsequent changes are operating on this `TypedDict` directly, as it provides richer info for error handling --- altair/datasets/_readers.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index d69b50e1d..921b5ef7d 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -65,6 +65,7 @@ from altair.datasets._typing import Dataset, Extension, Metadata from altair.vegalite.v5.schema._typing import OneOrSeq + _IntoSuffix: TypeAlias = "StrPath | Metadata" # noqa: TC008 _ExtensionScan: TypeAlias = Literal[".parquet"] _T = TypeVar("_T") @@ -129,10 +130,10 @@ class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() - def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: + def read_fn(self, source: _IntoSuffix, /) -> Callable[..., IntoDataFrameT]: return self._read_fn[_extract_suffix(source, is_ext_read)] - def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: + def scan_fn(self, source: _IntoSuffix, /) -> Callable[..., IntoFrameT]: return self._scan_fn[_extract_suffix(source, is_ext_scan)] def _schema_kwds(self, result: Metadata, /) -> dict[str, Any]: @@ -443,8 +444,10 @@ def _extract_constraints( return constraints -def _extract_suffix(source: StrPath, guard: Callable[..., TypeIs[_T]], /) -> _T: - suffix: Any = Path(source).suffix +def _extract_suffix(source: _IntoSuffix, guard: Callable[..., TypeIs[_T]], /) -> _T: + suffix: Any = ( + Path(source).suffix if not isinstance(source, Mapping) else source["suffix"] + ) if guard(suffix): return suffix else: From c265e1d0536dd6a4d0fcd12344b15b4f6c515e3f Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 16 Jan 2025 13:50:39 +0000 Subject: [PATCH 162/201] refactor(typing): Simplify `Dataset` import --- altair/datasets/_cache.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 89ed16858..f0309a350 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -7,6 +7,7 @@ import narwhals.stable.v1 as nw from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT +from altair.datasets._typing import Dataset if sys.version_info >= (3, 12): from typing import Protocol @@ -32,7 +33,7 @@ else: from typing_extensions import TypeAlias from altair.datasets._readers import _Reader - from altair.datasets._typing import Dataset, FlFieldStr + from 
altair.datasets._typing import FlFieldStr _Dataset: TypeAlias = "Dataset | LiteralString" # noqa: TC008 _FlSchema: TypeAlias = Mapping[str, FlFieldStr] @@ -144,7 +145,6 @@ def read(self) -> Any: def __getitem__(self, key: _Dataset, /) -> Metadata: if result := self.get(key, None): return result - from altair.datasets._typing import Dataset if key in get_args(Dataset): msg = f"{key!r} cannot be loaded via {type(self).__name__!r}." @@ -156,7 +156,6 @@ def __getitem__(self, key: _Dataset, /) -> Metadata: def url(self, name: _Dataset, /) -> str: if result := self.get(name, None): return result["url"] - from altair.datasets._typing import Dataset if name in get_args(Dataset): msg = f"{name!r} cannot be loaded via url." From 5503e0b835e3eed32a32b975b0ff6f97de36d1e6 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 16 Jan 2025 13:53:34 +0000 Subject: [PATCH 163/201] fix: Convert `str` to correct types in `CsvCache` --- altair/datasets/_cache.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index f0309a350..eef5a86ee 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -15,7 +15,14 @@ from typing_extensions import Protocol if TYPE_CHECKING: - from collections.abc import Iterator, Mapping, MutableMapping + from collections.abc import ( + Iterable, + Iterator, + Mapping, + MutableMapping, + MutableSequence, + Sequence, + ) from io import IOBase from typing import Any, Final @@ -140,7 +147,19 @@ def read(self) -> Any: b_lines = f.readlines() reader = csv.reader((bs.decode() for bs in b_lines), dialect=csv.unix_dialect) header = tuple(next(reader)) - return {row[0]: dict(zip(header, row)) for row in reader} + return {row[0]: dict(self._convert_row(header, row)) for row in reader} + + def _convert_row( + self, header: Iterable[str], row: Iterable[str], / + ) -> Iterator[tuple[str, Any]]: + map_tf = {"true": True, "false": False} + for col, value in zip(header, row): + if col.startswith(("is_", "has_")): + yield col, map_tf[value] + elif col == "bytes": + yield col, int(value) + else: + yield col, value def __getitem__(self, key: _Dataset, /) -> Metadata: if result := self.get(key, None): From 3c7c5716e28ea3cbe863bf0d0a503786e09820c1 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 16 Jan 2025 14:04:54 +0000 Subject: [PATCH 164/201] feat: Support `pandas` w/o a `.parquet` reader --- altair/datasets/_cache.py | 23 +++++++++++++++++++++-- altair/datasets/_readers.py | 37 ++++++++++++++++++++++++++++++++----- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index eef5a86ee..98b97e35d 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -2,6 +2,7 @@ import os import sys +from collections import defaultdict from pathlib import Path from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, cast, get_args @@ -107,9 +108,13 @@ def __exit__(self, *args) -> None: return def get(self, key: _KT, default: _T, /) -> _VT | _T: + return self.mapping.get(key, default) + + @property + def mapping(self) -> MutableMapping[_KT, _VT]: if not self._mapping: self._mapping.update(self.read()) - return self._mapping.get(key, default) + return self._mapping class CsvCache(CompressedCache["_Dataset", "Metadata"]): @@ -139,6 +144,7 @@ def __init__( tp: type[MutableMapping[_Dataset, Metadata]] = dict["_Dataset", 
"Metadata"], ) -> None: self._mapping: MutableMapping[_Dataset, Metadata] = tp() + self._rotated: MutableMapping[str, MutableSequence[Any]] = defaultdict(list) def read(self) -> Any: import csv @@ -161,6 +167,19 @@ def _convert_row( else: yield col, value + @property + def rotated(self) -> Mapping[str, Sequence[Any]]: + """Columnar view.""" + if not self._rotated: + for record in self.mapping.values(): + for k, v in record.items(): + self._rotated[k].append(v) + return self._rotated + + def metadata(self, ns: Any, /) -> nw.LazyFrame: + data: Any = self.rotated + return nw.maybe_convert_dtypes(nw.from_dict(data, native_namespace=ns).lazy()) + def __getitem__(self, key: _Dataset, /) -> Metadata: if result := self.get(key, None): return result @@ -274,7 +293,7 @@ def download_all(self) -> None: stems = tuple(fp.stem for fp in self) predicates = (~(nw.col("sha").is_in(stems)),) if stems else () frame = ( - self._rd._scan_metadata(predicates, is_image=False) + self._rd._scan_metadata(*predicates, is_image=False) .select("sha", "suffix", "url") .unique("sha") .collect() diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 921b5ef7d..176439c0d 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -207,10 +207,13 @@ def query( def _scan_metadata( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] ) -> nw.LazyFrame: - frame = nw.from_native(self.scan_fn(_METADATA)(_METADATA)).lazy() if predicates or constraints: - return frame.filter(*predicates, **constraints) - return frame + return self._metadata.filter(*predicates, **constraints) + return self._metadata + + @property + def _metadata(self) -> nw.LazyFrame: + return nw.from_native(self.scan_fn(_METADATA)(_METADATA)).lazy() @property def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: @@ -279,8 +282,18 @@ def __init__(self, name: _Pandas, /) -> None: ".parquet": pd.read_parquet, } self._scan_fn = {".parquet": pd.read_parquet} + self._supports_parquet: bool = is_available( + "pyarrow", "fastparquet", require_all=False + ) + self._csv_cache = CsvCache() self._schema_cache = SchemaCache() + @property + def _metadata(self) -> nw.LazyFrame: + if self._supports_parquet: + return super()._metadata + return self._csv_cache.metadata(nw.dependencies.get_pandas()) + class _PandasPyArrowReader(_PandasReaderBase): def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: @@ -459,10 +472,24 @@ def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]: return suffix == ".parquet" -def is_available(pkg_names: str | Iterable[str], *more_pkg_names: str) -> bool: +def is_available( + pkg_names: str | Iterable[str], *more_pkg_names: str, require_all: bool = True +) -> bool: + """ + Check for importable package(s), without raising on failure. + + Parameters + ---------- + pkg_names, more_pkg_names + One or more packages. + require_all + * ``True`` every package. + * ``False`` at least one package. 
+ """ pkgs_names = pkg_names if not isinstance(pkg_names, str) else (pkg_names,) names = chain(pkgs_names, more_pkg_names) - return all(find_spec(name) is not None for name in names) + fn = all if require_all else any + return fn(find_spec(name) is not None for name in names) def infer_backend( From c23805d25027682e55ac84aeff1a7b311315fe79 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 16 Jan 2025 14:07:39 +0000 Subject: [PATCH 165/201] refactor: Reduce repetition w/ `_Reader._download` --- altair/datasets/_cache.py | 5 +---- altair/datasets/_readers.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 98b97e35d..36d965f2e 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -303,10 +303,7 @@ def download_all(self) -> None: return None print(f"Downloading {len(frame)} missing datasets...") for row in _iter_results(frame): - fp: Path = self.path / (row["sha"] + row["suffix"]) - with self._rd._opener.open(row["url"]) as f: - fp.touch() - fp.write_bytes(f.read()) + self._rd._download(row["url"], self.path / (row["sha"] + row["suffix"])) print("Finished downloads") return None diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 176439c0d..88d917ab4 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -156,13 +156,9 @@ def dataset( if self.cache.is_active(): fp = self.cache.path / (result["sha"] + result["suffix"]) - if fp.exists() and fp.stat().st_size: - return fn(fp, **kwds) - else: - with self._opener.open(url) as f: - fp.touch() - fp.write_bytes(f.read()) - return fn(fp, **kwds) + if not (fp.exists() and fp.stat().st_size): + self._download(url, fp) + return fn(fp, **kwds) else: with self._opener.open(url) as f: return fn(f, **kwds) @@ -215,6 +211,11 @@ def _scan_metadata( def _metadata(self) -> nw.LazyFrame: return nw.from_native(self.scan_fn(_METADATA)(_METADATA)).lazy() + def _download(self, url: str, fp: Path, /) -> None: + with self._opener.open(url) as f: + fp.touch() + fp.write_bytes(f.read()) + @property def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: return DatasetCache(self) From 056f96d4de55fc98f7f4f6e9e61650e5c8d62e25 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 16 Jan 2025 14:20:34 +0000 Subject: [PATCH 166/201] feat(DRAFT): `Metadata`-based error handling - Adds `_exceptions.py` with some initial cases - Renaming `result` -> `meta` - Reduced the complexity of `_PyArrowReader` - Generally, trying to avoid exceptions from 3rd parties - to allow suggesting an alternate path that may work --- altair/datasets/__init__.py | 2 +- altair/datasets/_cache.py | 19 +++-- altair/datasets/_exceptions.py | 72 ++++++++++++++++ altair/datasets/_readers.py | 146 ++++++++++++++++----------------- tests/test_datasets.py | 11 ++- 5 files changed, 163 insertions(+), 87 deletions(-) create mode 100644 altair/datasets/_exceptions.py diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 4986f671d..cc6a07d32 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -73,7 +73,7 @@ def url( - https://github.com/vega/altair/discussions/3150#discussioncomment-11280516 - https://github.com/vega/altair/pull/3631#discussion_r1846662053 """ - from altair.datasets._readers import AltairDatasetsError + from altair.datasets._exceptions import AltairDatasetsError try: from 
altair.datasets._loader import load diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 36d965f2e..79fc9c50b 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -3,11 +3,14 @@ import os import sys from collections import defaultdict +from importlib.util import find_spec from pathlib import Path from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, cast, get_args import narwhals.stable.v1 as nw from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT + +from altair.datasets._exceptions import AltairDatasetsError from altair.datasets._typing import Dataset if sys.version_info >= (3, 12): @@ -81,7 +84,7 @@ """ -def _iter_results(df: nw.DataFrame[Any], /) -> Iterator[Metadata]: +def _iter_metadata(df: nw.DataFrame[Any], /) -> Iterator[Metadata]: """ Yield rows from ``df``, where each represents a dataset. @@ -181,8 +184,8 @@ def metadata(self, ns: Any, /) -> nw.LazyFrame: return nw.maybe_convert_dtypes(nw.from_dict(data, native_namespace=ns).lazy()) def __getitem__(self, key: _Dataset, /) -> Metadata: - if result := self.get(key, None): - return result + if meta := self.get(key, None): + return meta if key in get_args(Dataset): msg = f"{key!r} cannot be loaded via {type(self).__name__!r}." @@ -192,8 +195,10 @@ def __getitem__(self, key: _Dataset, /) -> Metadata: raise TypeError(msg) def url(self, name: _Dataset, /) -> str: - if result := self.get(name, None): - return result["url"] + if meta := self.get(name, None): + if meta["suffix"] == ".parquet" and not find_spec("vegafusion"): + raise AltairDatasetsError.url_parquet(meta) + return meta["url"] if name in get_args(Dataset): msg = f"{name!r} cannot be loaded via url." @@ -302,8 +307,8 @@ def download_all(self) -> None: print("Already downloaded all datasets") return None print(f"Downloading {len(frame)} missing datasets...") - for row in _iter_results(frame): - self._rd._download(row["url"], self.path / (row["sha"] + row["suffix"])) + for meta in _iter_metadata(frame): + self._rd._download(meta["url"], self.path / (meta["sha"] + meta["suffix"])) print("Finished downloads") return None diff --git a/altair/datasets/_exceptions.py b/altair/datasets/_exceptions.py new file mode 100644 index 000000000..488470709 --- /dev/null +++ b/altair/datasets/_exceptions.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Sequence + + from altair.datasets._readers import _Backend + from altair.datasets._typing import Metadata + + +class AltairDatasetsError(Exception): + # TODO: Rename, try to reduce verbosity of message, link to vegafusion? 
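    # A rough sketch of the intended call-site, mirroring the guards added to
    # ``_cache.py`` / ``_readers.py`` in this commit (``meta`` is a ``Metadata`` row):
    #
    #     if meta["suffix"] == ".parquet" and not find_spec("vegafusion"):
    #         raise AltairDatasetsError.url_parquet(meta)
    #
    # which is expected to surface to users roughly as:
    #
    #     >>> url("flights-3m")  # doctest: +SKIP
    #     AltairDatasetsError: ... '.parquet' datasets require `vegafusion` ...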
+ @classmethod + def url_parquet(cls, meta: Metadata, /) -> AltairDatasetsError: + name = meta["file_name"] + msg = ( + f"Currently unable to load {name!r} via url, as '.parquet' datasets require `vegafusion`.\n" + "See upstream issue for details: https://github.com/vega/vega/issues/3961" + ) + return cls(msg) + + @classmethod + def from_priority(cls, priority: Sequence[_Backend], /) -> AltairDatasetsError: + msg = f"Found no supported backend, searched:\n{priority!r}" + return cls(msg) + + +def module_not_found( + backend_name: str, reqs: str | tuple[str, ...], missing: str +) -> ModuleNotFoundError: + if isinstance(reqs, tuple): + depends = ", ".join(f"{req!r}" for req in reqs) + " packages" + else: + depends = f"{reqs!r} package" + msg = ( + f"Backend {backend_name!r} requires the {depends}, but {missing!r} could not be found.\n" + f"This can be installed with pip using:\n" + f" pip install {missing}\n" + f"Or with conda using:\n" + f" conda install -c conda-forge {missing}" + ) + return ModuleNotFoundError(msg, name=missing) + + +# TODO: Give more direct help (e.g. url("7zip")) +def image(meta: Metadata): + name = meta["file_name"] + ext = meta["suffix"] + msg = ( + f"Unable to load {name!r} as tabular data.\n" + f"{ext!r} datasets are only compatible with `url(...)` or `Loader.url(...)`." + ) + return AltairDatasetsError(msg) + + +# TODO: Pass in `meta` +def geospatial(backend_name: str) -> NotImplementedError: + msg = _suggest_supported( + f"Geospatial data is not supported natively by {backend_name!r}." + ) + return NotImplementedError(msg) + + +# TODO: Pass in `meta` +def non_tabular_json(backend_name: str) -> NotImplementedError: + msg = _suggest_supported(f"Non-tabular json is not supported {backend_name!r}.") + return NotImplementedError(msg) + + +def _suggest_supported(msg: str) -> str: + return f"{msg}\nTry installing `polars` or using `Loader.url(...)` instead." diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 88d917ab4..11cc473a9 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -10,7 +10,7 @@ from __future__ import annotations import urllib.request -from collections.abc import Iterable, Mapping, Sequence +from collections.abc import Callable, Iterable, Mapping, Sequence from functools import partial from importlib import import_module from importlib.util import find_spec @@ -19,7 +19,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ClassVar, Final, Literal, @@ -31,11 +30,11 @@ import narwhals.stable.v1 as nw from narwhals.stable.v1.typing import IntoDataFrameT, IntoExpr, IntoFrameT -from altair.datasets._cache import CsvCache, DatasetCache, SchemaCache, _iter_results +from altair.datasets import _exceptions as _ds_exc +from altair.datasets._cache import CsvCache, DatasetCache, SchemaCache, _iter_metadata from altair.datasets._typing import EXTENSION_SUFFIXES, Metadata, is_ext_read if TYPE_CHECKING: - import json # noqa: F401 import sys from io import IOBase from urllib.request import OpenerDirector @@ -84,14 +83,11 @@ _Backend: TypeAlias = Literal[_Polars, _PandasAny, _PyArrow] -__all__ = ["backend"] +__all__ = ["backend", "infer_backend"] _METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" -class AltairDatasetsError(Exception): ... - - class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): """ Describes basic IO for remote & local tabular resources. 
@@ -136,10 +132,16 @@ def read_fn(self, source: _IntoSuffix, /) -> Callable[..., IntoDataFrameT]: def scan_fn(self, source: _IntoSuffix, /) -> Callable[..., IntoFrameT]: return self._scan_fn[_extract_suffix(source, is_ext_scan)] - def _schema_kwds(self, result: Metadata, /) -> dict[str, Any]: + def _schema_kwds(self, meta: Metadata, /) -> dict[str, Any]: """Hook to provide additional schema metadata on read.""" return {} + def _maybe_fn(self, meta: Metadata, /) -> Callable[..., IntoDataFrameT]: + """Backend specific tweaks/errors/warnings, based on ``Metadata``.""" + if meta["is_image"]: + raise _ds_exc.image(meta) + return self.read_fn(meta) + def dataset( self, name: Dataset | LiteralString, @@ -148,14 +150,14 @@ def dataset( **kwds: Any, ) -> IntoDataFrameT: df = self.query(**_extract_constraints(name, suffix)) - result = next(_iter_results(df)) - url = result["url"] - fn = self.read_fn(url) - if default_kwds := self._schema_kwds(result): + meta = next(_iter_metadata(df)) + fn = self._maybe_fn(meta) + url = meta["url"] + if default_kwds := self._schema_kwds(meta): kwds = default_kwds | kwds if kwds else default_kwds if self.cache.is_active(): - fp = self.cache.path / (result["sha"] + result["suffix"]) + fp = self.cache.path / (meta["sha"] + meta["suffix"]) if not (fp.exists() and fp.stat().st_size): self._download(url, fp) return fn(fp, **kwds) @@ -170,7 +172,10 @@ def url( /, ) -> str: frame = self.query(**_extract_constraints(name, suffix)) - url = frame.item(0, "url") + meta = next(_iter_metadata(frame)) + if meta["suffix"] == ".parquet" and not is_available("vegafusion"): + raise _ds_exc.AltairDatasetsError.url_parquet(meta) + url = meta["url"] if isinstance(url, str): return url else: @@ -223,21 +228,7 @@ def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: def _import(self, name: str, /) -> Any: if spec := find_spec(name): return import_module(spec.name) - else: - reqs = _requirements(self._name) # type: ignore[call-overload] - if isinstance(reqs, tuple): - depends = ", ".join(f"{req!r}" for req in reqs) + " packages" - else: - depends = f"{reqs!r} package" - - msg = ( - f"Backend {self._name!r} requires the {depends}, but {name!r} could not be found.\n" - f"This can be installed with pip using:\n" - f" pip install {name}\n" - f"Or with conda using:\n" - f" conda install -c conda-forge {name}" - ) - raise ModuleNotFoundError(msg, name=name) + raise _ds_exc.module_not_found(self._name, _requirements(self._name), name) # type: ignore[call-overload] def __repr__(self) -> str: return f"Reader[{self._name}]" @@ -259,15 +250,21 @@ class _PandasReaderBase(_Reader["pd.DataFrame", "pd.DataFrame"], Protocol): _schema_cache: SchemaCache - def _schema_kwds(self, result: Metadata, /) -> dict[str, Any]: - name: Any = result["dataset_name"] - suffix = result["suffix"] + def _schema_kwds(self, meta: Metadata, /) -> dict[str, Any]: + name: Any = meta["dataset_name"] + suffix = meta["suffix"] if cols := self._schema_cache.by_dtype(name, nw.Date, nw.Datetime): if suffix == ".json": return {"convert_dates": cols} elif suffix in {".csv", ".tsv"}: return {"parse_dates": cols} - return super()._schema_kwds(result) + return super()._schema_kwds(meta) + + def _maybe_fn(self, meta: Metadata, /) -> Callable[..., pd.DataFrame]: + fn = super()._maybe_fn(meta) + if meta["is_spatial"]: + raise _ds_exc.geospatial(self._name) + return fn class _PandasReader(_PandasReaderBase): @@ -378,51 +375,49 @@ class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): 
https://arrow.apache.org/docs/python/json.html#reading-json-files """ + def _maybe_fn(self, meta: Metadata, /) -> Callable[..., pa.Table]: + fn = super()._maybe_fn(meta) + if fn is self._read_json_polars: + return fn + elif meta["is_json"]: + if meta["is_tabular"]: + return self._read_json_tabular + elif meta["is_spatial"]: + raise _ds_exc.geospatial(self._name) + else: + raise _ds_exc.non_tabular_json(self._name) + else: + return fn + + def _read_json_tabular(self, source: Any, /, **kwds: Any) -> pa.Table: + import json + + if not isinstance(source, Path): + obj = json.load(source) + else: + with Path(source).open(encoding="utf-8") as f: + obj = json.load(f) + pa = nw.dependencies.get_pyarrow() + return pa.Table.from_pylist(obj) + + def _read_json_polars(self, source: Any, /, **kwds: Any) -> pa.Table: + return _pl_read_json_roundtrip(source).to_arrow() + def __init__(self, name: _PyArrow, /) -> None: self._name = _requirements(name) if not TYPE_CHECKING: - pa = self._import(self._name) - pa_csv = self._import(f"{self._name}.csv") - pa_feather = self._import(f"{self._name}.feather") - pa_parquet = self._import(f"{self._name}.parquet") - pa_read_csv = pa_csv.read_csv - pa_read_feather = pa_feather.read_table - pa_read_parquet = pa_parquet.read_table - - # HACK: Multiple alternatives to `pyarrow.json.read_json` - # ------------------------------------------------------- - # NOTE: Prefer `polars` since it is zero-copy and fast (1) - if find_spec("polars") is not None: - - def pa_read_json(source: StrPath, /, **kwds) -> pa.Table: - return _pl_read_json_roundtrip(source).to_arrow() + pa = self._import(self._name) # noqa: F841 + pa_read_csv = self._import(f"{self._name}.csv").read_csv + pa_read_feather = self._import(f"{self._name}.feather").read_table + pa_read_parquet = self._import(f"{self._name}.parquet").read_table + # NOTE: Prefer `polars` since it is zero-copy and fast + if find_spec("polars") is not None: + pa_read_json = self._read_json_polars else: - # NOTE: Convert inline from stdlib json (2) - import json - - pa_json = self._import(f"{self._name}.json") - - def pa_read_json(source: Any, /, **kwds) -> pa.Table: - if not isinstance(source, Path): - obj = json.load(source) - else: - with Path(source).open(encoding="utf-8") as f: - obj = json.load(f) - # NOTE: Common case of {"values": [{...}]}, missing the `"values"` keys - if isinstance(obj, Sequence) and isinstance(obj[0], Mapping): - return pa.Table.from_pylist(obj) - elif isinstance(obj, Mapping) and "type" in obj: - msg = ( - "Inferred file as geojson, unsupported by pyarrow.\n" - "Try installing `polars` or using `Loader.url(...)` instead." 
- ) - raise NotImplementedError(msg) - else: - # NOTE: Almost certainly will fail on read as of `v2.9.0` - return pa_json.read_json(source) - - # Stubs suggest using a dataclass, but no way to construct it + pa_read_json = self._import(f"{self._name}.json").read_json + + # NOTE: Stubs suggest using a dataclass, but no way to construct it tab_sep: Any = {"delimiter": "\t"} self._read_fn = { @@ -512,8 +507,7 @@ def infer_backend( it = (backend(name) for name in priority if is_available(_requirements(name))) if reader := next(it, None): return reader - msg = f"Found no supported backend, searched:\n{priority!r}" - raise AltairDatasetsError(msg) + raise _ds_exc.AltairDatasetsError.from_priority(priority) @overload diff --git a/tests/test_datasets.py b/tests/test_datasets.py index a65b96bd7..10f030cfa 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -16,7 +16,7 @@ from narwhals.stable.v1 import dependencies as nw_dep from altair.datasets import Loader, url -from altair.datasets._readers import AltairDatasetsError +from altair.datasets._exceptions import AltairDatasetsError from altair.datasets._typing import Dataset, Extension, Metadata, is_ext_read from tests import skip_requires_pyarrow, slow @@ -296,8 +296,13 @@ def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: assert match_url("flights-10k", url("flights-10k")) assert match_url("flights-200k", url("flights-200k")) - with pytest.raises(TypeError, match="cannot be loaded via url"): - url("flights-3m") + if find_spec("vegafusion"): + assert match_url("flights-3m", url("flights-3m")) + + with monkeypatch.context() as mp: + mp.setitem(sys.modules, "vegafusion", None) + with pytest.raises(AltairDatasetsError, match=r".parquet.+require.+vegafusion"): + url("flights-3m") with pytest.raises( TypeError, match="'fake data' does not refer to a known dataset" From e168948b6239f07a16d1e9f20b4a4c58cbea7ab4 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 16 Jan 2025 15:04:09 +0000 Subject: [PATCH 167/201] chore(ruff): Remove unused `0.9.2` ignores Related #3771 https://github.com/vega/altair/actions/runs/12810882256/job/35718940621?pr=3631 --- altair/datasets/_cache.py | 2 +- altair/datasets/_readers.py | 2 +- tools/datasets/npm.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 79fc9c50b..cbeb8f01f 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -46,7 +46,7 @@ from altair.datasets._readers import _Reader from altair.datasets._typing import FlFieldStr - _Dataset: TypeAlias = "Dataset | LiteralString" # noqa: TC008 + _Dataset: TypeAlias = "Dataset | LiteralString" _FlSchema: TypeAlias = Mapping[str, FlFieldStr] __all__ = ["CsvCache", "DatasetCache", "SchemaCache", "csv_cache"] diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 11cc473a9..55ab96851 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -64,7 +64,7 @@ from altair.datasets._typing import Dataset, Extension, Metadata from altair.vegalite.v5.schema._typing import OneOrSeq - _IntoSuffix: TypeAlias = "StrPath | Metadata" # noqa: TC008 + _IntoSuffix: TypeAlias = "StrPath | Metadata" _ExtensionScan: TypeAlias = Literal[".parquet"] _T = TypeVar("_T") diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index ea38eb971..95856d4fc 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -22,7 +22,7 @@ from typing_extensions import TypeAlias 
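# The tabular-JSON path added to ``_PyArrowReader`` above amounts to stdlib
# ``json`` plus ``pyarrow.Table.from_pylist``; a minimal standalone sketch,
# assuming the source is a flat list of records (the file path is illustrative):
#
#     import json
#     import pyarrow as pa
#
#     with open("cars.json", encoding="utf-8") as f:
#         table = pa.Table.from_pylist(json.load(f))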
from tools.datasets.models import Package, ParsedPackage - BranchOrTag: TypeAlias = 'Literal["main"] | LiteralString' # noqa: TC008 + BranchOrTag: TypeAlias = 'Literal["main"] | LiteralString' __all__ = ["Npm"] From 7fd1f4d5ce8450d2a611d2cb461983aab03643aa Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 17 Jan 2025 17:15:29 +0000 Subject: [PATCH 168/201] refactor: clean up, standardize `_exceptions.py` --- altair/datasets/_cache.py | 2 +- altair/datasets/_exceptions.py | 63 +++++++++++++++++++++------------- altair/datasets/_readers.py | 8 ++--- 3 files changed, 45 insertions(+), 28 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index cbeb8f01f..08016d622 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -197,7 +197,7 @@ def __getitem__(self, key: _Dataset, /) -> Metadata: def url(self, name: _Dataset, /) -> str: if meta := self.get(name, None): if meta["suffix"] == ".parquet" and not find_spec("vegafusion"): - raise AltairDatasetsError.url_parquet(meta) + raise AltairDatasetsError.from_url(meta) return meta["url"] if name in get_args(Dataset): diff --git a/altair/datasets/_exceptions.py b/altair/datasets/_exceptions.py index 488470709..36dba27ef 100644 --- a/altair/datasets/_exceptions.py +++ b/altair/datasets/_exceptions.py @@ -10,14 +10,20 @@ class AltairDatasetsError(Exception): - # TODO: Rename, try to reduce verbosity of message, link to vegafusion? @classmethod - def url_parquet(cls, meta: Metadata, /) -> AltairDatasetsError: - name = meta["file_name"] - msg = ( - f"Currently unable to load {name!r} via url, as '.parquet' datasets require `vegafusion`.\n" - "See upstream issue for details: https://github.com/vega/vega/issues/3961" - ) + def from_url(cls, meta: Metadata, /) -> AltairDatasetsError: + if meta["suffix"] == ".parquet": + msg = ( + f"{_failed_url(meta)}" + f"{meta['suffix']!r} datasets require `vegafusion`.\n" + "See upstream issue for details: https://github.com/vega/vega/issues/3961" + ) + else: + msg = ( + f"{cls.from_url.__qualname__}() called for " + f"unimplemented extension: {meta['suffix']}\n\n{meta!r}" + ) + raise NotImplementedError(msg) return cls(msg) @classmethod @@ -43,30 +49,41 @@ def module_not_found( return ModuleNotFoundError(msg, name=missing) -# TODO: Give more direct help (e.g. url("7zip")) -def image(meta: Metadata): - name = meta["file_name"] - ext = meta["suffix"] - msg = ( - f"Unable to load {name!r} as tabular data.\n" - f"{ext!r} datasets are only compatible with `url(...)` or `Loader.url(...)`." - ) +def image(meta: Metadata, /) -> AltairDatasetsError: + msg = f"{_failed_tabular(meta)}\n{_suggest_url(meta)}" return AltairDatasetsError(msg) -# TODO: Pass in `meta` -def geospatial(backend_name: str) -> NotImplementedError: - msg = _suggest_supported( +def geospatial(meta: Metadata, backend_name: str) -> NotImplementedError: + msg = ( + f"{_failed_tabular(meta)}" f"Geospatial data is not supported natively by {backend_name!r}." + f"{_suggest_url(meta, 'polars')}" ) return NotImplementedError(msg) -# TODO: Pass in `meta` -def non_tabular_json(backend_name: str) -> NotImplementedError: - msg = _suggest_supported(f"Non-tabular json is not supported {backend_name!r}.") +def non_tabular_json(meta: Metadata, backend_name: str) -> NotImplementedError: + msg = ( + f"{_failed_tabular(meta)}" + f"Non-tabular json is not supported natively by {backend_name!r}." 
+ f"{_suggest_url(meta, 'polars')}" + ) return NotImplementedError(msg) -def _suggest_supported(msg: str) -> str: - return f"{msg}\nTry installing `polars` or using `Loader.url(...)` instead." +def _failed_url(meta: Metadata, /) -> str: + return f"Unable to load {meta['file_name']!r} via url.\n" + + +def _failed_tabular(meta: Metadata, /) -> str: + return f"Unable to load {meta['file_name']!r} as tabular data.\n" + + +def _suggest_url(meta: Metadata, install_other: str | None = None) -> str: + other = f" installing `{install_other}` or" if install_other else "" + return ( + f"\n\nInstead, try{other}:\n\n" + " from altair.datasets import url\n" + f" url({meta['dataset_name']!r})" + ) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 55ab96851..c0587653a 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -174,7 +174,7 @@ def url( frame = self.query(**_extract_constraints(name, suffix)) meta = next(_iter_metadata(frame)) if meta["suffix"] == ".parquet" and not is_available("vegafusion"): - raise _ds_exc.AltairDatasetsError.url_parquet(meta) + raise _ds_exc.AltairDatasetsError.from_url(meta) url = meta["url"] if isinstance(url, str): return url @@ -263,7 +263,7 @@ def _schema_kwds(self, meta: Metadata, /) -> dict[str, Any]: def _maybe_fn(self, meta: Metadata, /) -> Callable[..., pd.DataFrame]: fn = super()._maybe_fn(meta) if meta["is_spatial"]: - raise _ds_exc.geospatial(self._name) + raise _ds_exc.geospatial(meta, self._name) return fn @@ -383,9 +383,9 @@ def _maybe_fn(self, meta: Metadata, /) -> Callable[..., pa.Table]: if meta["is_tabular"]: return self._read_json_tabular elif meta["is_spatial"]: - raise _ds_exc.geospatial(self._name) + raise _ds_exc.geospatial(meta, self._name) else: - raise _ds_exc.non_tabular_json(self._name) + raise _ds_exc.non_tabular_json(meta, self._name) else: return fn From 5dc227e72abd07c0afe705e8275f9a7052996941 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 17 Jan 2025 17:22:12 +0000 Subject: [PATCH 169/201] test: Refactor decorators, test new errors --- tests/test_datasets.py | 101 ++++++++++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 37 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 10f030cfa..3ccdba273 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -38,6 +38,12 @@ from altair.vegalite.v5.schema._typing import OneOrSeq from tests import MarksType + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + PolarsLoader: TypeAlias = Loader[pl.DataFrame, pl.LazyFrame] + CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" @@ -51,23 +57,34 @@ class DatasetSpec(TypedDict, total=False): requires_pyarrow: pytest.MarkDecorator = skip_requires_pyarrow() -backends: pytest.MarkDecorator = pytest.mark.parametrize( - "backend", - [ - "polars", - pytest.param( - "pandas", - marks=pytest.mark.xfail( - find_spec("pyarrow") is None, - reason=( - "`pandas` supports backends other than `pyarrow` for `.parquet`.\n" - "However, none of these are currently an `altair` dependency." - ), +_b_params = { + "polars": pytest.param("polars"), + "pandas": pytest.param( + "pandas", + marks=pytest.mark.xfail( + find_spec("pyarrow") is None, + reason=( + "`pandas` supports backends other than `pyarrow` for `.parquet`.\n" + "However, none of these are currently an `altair` dependency." 
), ), - pytest.param("pandas[pyarrow]", marks=requires_pyarrow), - pytest.param("pyarrow", marks=requires_pyarrow), - ], + ), + "pandas[pyarrow]": pytest.param("pandas[pyarrow]", marks=requires_pyarrow), + "pyarrow": pytest.param("pyarrow", marks=requires_pyarrow), +} + +backends: pytest.MarkDecorator = pytest.mark.parametrize("backend", _b_params.values()) +backends_no_polars: pytest.MarkDecorator = pytest.mark.parametrize( + "backend", [v for k, v in _b_params.items() if k != "polars"] +) +backends_pandas_any: pytest.MarkDecorator = pytest.mark.parametrize( + "backend", [v for k, v in _b_params.items() if "pandas" in k] +) +backends_single: pytest.MarkDecorator = pytest.mark.parametrize( + "backend", [v for k, v in _b_params.items() if "[" not in k] +) +backends_multi: pytest.MarkDecorator = pytest.mark.parametrize( + "backend", [v for k, v in _b_params.items() if "[" in k] ) datasets_debug: pytest.MarkDecorator = pytest.mark.datasets_debug() @@ -97,14 +114,30 @@ def is_flaky_datasets(request: pytest.FixtureRequest) -> bool: @pytest.fixture(scope="session") -def polars_loader( - tmp_path_factory: pytest.TempPathFactory, -) -> Loader[pl.DataFrame, pl.LazyFrame]: +def polars_loader(tmp_path_factory: pytest.TempPathFactory) -> PolarsLoader: data = Loader.from_backend("polars") data.cache.path = tmp_path_factory.mktemp("loader-cache-polars") return data +@pytest.fixture( + params=("earthquakes", "londonBoroughs", "londonTubeLines", "us-10m", "world-110m") +) +def spatial_datasets(request: pytest.FixtureRequest) -> Dataset: + return request.param + + +@backends_no_polars +def test_spatial(spatial_datasets, backend: _Backend) -> None: + load = Loader.from_backend(backend) + pattern = re.compile( + rf"{spatial_datasets}.+geospatial.+native.+{re.escape(backend)}.+url", + flags=re.DOTALL | re.IGNORECASE, + ) + with pytest.raises(NotImplementedError, match=pattern): + load(spatial_datasets) + + @pytest.fixture def metadata_columns() -> frozenset[str]: """ @@ -321,13 +354,10 @@ def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None assert set(nw_frame.columns) == {"symbol", "date", "price"} -@backends +@backends_single def test_missing_dependency_single( backend: _Backend, monkeypatch: pytest.MonkeyPatch ) -> None: - if backend == "pandas[pyarrow]": - pytest.skip("Testing single dependency backends only") - monkeypatch.setitem(sys.modules, backend, None) with pytest.raises( @@ -340,7 +370,7 @@ def test_missing_dependency_single( Loader.from_backend(backend) -@pytest.mark.parametrize("backend", ["pandas[pyarrow]"]) +@backends_multi @skip_requires_pyarrow def test_missing_dependency_multi( backend: _Backend, monkeypatch: pytest.MonkeyPatch @@ -597,9 +627,7 @@ def test_pyarrow_read_json( ], ) def test_polars_read_json_roundtrip( - polars_loader: Loader[pl.DataFrame, pl.LazyFrame], - spec: DatasetSpec, - column: str, + polars_loader: PolarsLoader, spec: DatasetSpec, column: str ) -> None: frame = polars_loader(spec["name"], ".json") tp = frame.schema.to_python()[column] @@ -620,18 +648,17 @@ def _dataset_params(*, skip: Container[str] = ()) -> Iterator[ParameterSet]: @slow @datasets_debug -@pytest.mark.parametrize( - ("name", "suffix"), - list(_dataset_params(skip=("7zip", "ffox", "gimp"))), -) +@pytest.mark.parametrize(("name", "suffix"), list(_dataset_params())) def test_all_datasets( - polars_loader: Loader[pl.DataFrame, pl.LazyFrame], - name: Dataset, - suffix: Extension, + polars_loader: PolarsLoader, name: Dataset, suffix: Extension ) -> None: """Ensure all annotated 
datasets can be loaded with the most reliable backend.""" - frame = polars_loader(name, suffix) - assert nw_dep.is_polars_dataframe(frame) + if name in {"7zip", "ffox", "gimp"}: + with pytest.raises(AltairDatasetsError, match=rf"{name}.+tabular"): + polars_loader(name, suffix) + else: + frame = polars_loader(name, suffix) + assert nw_dep.is_polars_dataframe(frame) def _raise_exception(e: type[Exception], *args: Any, **kwds: Any): @@ -686,7 +713,7 @@ def test_metadata_columns(backend: _Backend, metadata_columns: frozenset[str]) - @skip_requires_pyarrow -@pytest.mark.parametrize("backend", ["pandas", "pandas[pyarrow]"]) +@backends_pandas_any @pytest.mark.parametrize( ("name", "columns"), [ @@ -709,7 +736,7 @@ def test_pandas_date_parse( backend: _PandasAny, name: Dataset, columns: OneOrSeq[str], - polars_loader: Loader[pl.DataFrame, pl.LazyFrame], + polars_loader: PolarsLoader, ) -> None: """ Ensure schema defaults are correctly parsed. From ba01af12e064398dc89a7966f2849c1b1d03ff6c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 17 Jan 2025 18:24:20 +0000 Subject: [PATCH 170/201] docs: Replace outdated docs - Using `load` instead of `data` - Don't mention multi-versions, as that was dropped --- altair/datasets/_loader.py | 34 +++++++++++++++++----------------- altair/datasets/_readers.py | 4 +++- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index ef1cf46d3..0bb91aa1f 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -35,8 +35,8 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): from altair.datasets import Loader - data = Loader.from_backend("polars") - >>> data # doctest: +SKIP + load = Loader.from_backend("polars") + >>> load # doctest: +SKIP Loader[polars] .. 
_vega-datasets: @@ -94,24 +94,24 @@ def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: from altair.datasets import Loader - data = Loader.from_backend("polars") - cars = data("cars") + load = Loader.from_backend("polars") + cars = load("cars") >>> type(cars) # doctest: +SKIP polars.dataframe.frame.DataFrame Using ``pandas``: - data = Loader.from_backend("pandas") - cars = data("cars") + load = Loader.from_backend("pandas") + cars = load("cars") >>> type(cars) # doctest: +SKIP pandas.core.frame.DataFrame Using ``pandas``, backed by ``pyarrow`` dtypes: - data = Loader.from_backend("pandas[pyarrow]") - cars = data("cars") + load = Loader.from_backend("pandas[pyarrow]") + cars = load("cars") >>> type(cars) # doctest: +SKIP pandas.core.frame.DataFrame @@ -165,8 +165,8 @@ def __call__( from altair.datasets import Loader - data = Loader.from_backend("polars") - source = data("iowa-electricity") + load = Loader.from_backend("polars") + source = load("iowa-electricity") >>> source.columns # doctest: +SKIP ['year', 'source', 'net_generation'] @@ -193,8 +193,8 @@ def __call__( Using ``pandas``: - data = Loader.from_backend("pandas") - source = data("iowa-electricity") + load = Loader.from_backend("pandas") + source = load("iowa-electricity") >>> source.columns # doctest: +SKIP Index(['year', 'source', 'net_generation'], dtype='object') @@ -217,8 +217,8 @@ def __call__( Using ``pyarrow``: - data = Loader.from_backend("pyarrow") - source = data("iowa-electricity") + load = Loader.from_backend("pyarrow") + source = load("iowa-electricity") >>> source.column_names # doctest: +SKIP ['year', 'source', 'net_generation'] @@ -266,13 +266,13 @@ def url( import altair as alt from altair.datasets import Loader - data = Loader.from_backend("polars") - >>> data.url("cars") # doctest: +SKIP + load = Loader.from_backend("polars") + >>> load.url("cars") # doctest: +SKIP 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.11.0/data/cars.json' We can pass the result directly to a chart: - url = data.url("cars") + url = load.url("cars") alt.Chart(url).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q") """ return self._reader.url(name, suffix) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index c0587653a..f76cc5a0a 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -186,7 +186,7 @@ def query( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] ) -> nw.DataFrame[IntoDataFrameT]: """ - Query multi-version trees metadata. + Query a tabular version of `vega-datasets/datapackage.json`_. Applies a filter, erroring out when no results would be returned. @@ -194,6 +194,8 @@ def query( ----- Arguments correspond to those seen in `pl.LazyFrame.filter`_. + .. _vega-datasets/datapackage.json: + https://github.com/vega/vega-datasets/blob/main/datapackage.json .. 
_pl.LazyFrame.filter: https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html """ From 63f4be0232d5d818edaa9e7c67dcfa76e9057dda Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 18 Jan 2025 21:21:36 +0000 Subject: [PATCH 171/201] refactor: Clean up `tools.datasets` - `Application.generate_typing` now mostly populated by `DataPackage` methods - Docs are defined alongside expressions - Factored out repetitive code into `spell_literal_alias` - `Metadata` examples table is now generated inside the doc --- tools/datasets/__init__.py | 179 ++++---------------- tools/datasets/datapackage.py | 300 ++++++++++++++++++++++++---------- tools/datasets/npm.py | 20 +-- tools/schemapi/utils.py | 20 +++ 4 files changed, 275 insertions(+), 244 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 37d487da0..faf5e8d96 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -22,28 +22,28 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Literal -import polars as pl -from polars import col - from tools.codemod import ruff from tools.datasets.npm import Npm +from tools.fs import REPO_ROOT from tools.schemapi import utils if TYPE_CHECKING: import sys from collections.abc import Mapping + import polars as pl + + from tools.datasets import datapackage + if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias _PathAlias: TypeAlias = Literal[ - "typing", - "metadata-csv", - "metadata", - "schemas", + "typing", "metadata-csv", "metadata", "schemas", "datapackage" ] + PathMap: TypeAlias = Mapping[_PathAlias, Path] __all__ = ["app"] @@ -63,8 +63,6 @@ class Application: Directories to store ``.parquet`` metadata files. out_fp_typing Path to write metadata-derived typing module. - kwds_npm - Arguments passed to corresponding constructor. 
See Also -------- @@ -72,16 +70,9 @@ class Application: """ def __init__( - self, - out_dir_tools: Path, - out_dir_altair: Path, - out_fp_typing: Path, - *, - kwds_npm: Mapping[str, Any] | None = None, + self, out_dir_tools: Path, out_dir_altair: Path, out_fp_typing: Path ) -> None: out_dir_tools.mkdir(exist_ok=True) - kwds_npm = kwds_npm or {} - self._npm: Npm = Npm(out_dir_tools, **kwds_npm) METADATA = "metadata" self.paths = types.MappingProxyType["_PathAlias", Path]( { @@ -89,8 +80,10 @@ def __init__( "metadata-csv": out_dir_altair / f"{METADATA}.csv.gz", "metadata": out_dir_altair / f"{METADATA}.parquet", "schemas": out_dir_altair / "schemas.json.gz", + "datapackage": out_dir_tools / "datapackage.json", } ) + self._npm: Npm = Npm(self.paths) @property def npm(self) -> Npm: @@ -118,20 +111,15 @@ def refresh( https://github.com/vega/vega-datasets/issues/654 """ print("Syncing datasets ...") - package = self.npm.datapackage(tag=tag, frozen=frozen) - self.write_parquet(package["features"], self.paths["metadata"]) - self.write_json_gzip(package["schemas"], self.paths["schemas"]) - metadata_min = ( - package["features"] - .lazy() - .filter(col("suffix") != ".arrow") - .sort("dataset_name") - ) - self.write_csv_gzip(metadata_min, self.paths["metadata-csv"]) + dpkg = self.npm.datapackage(tag=tag, frozen=frozen) + self.write_parquet(dpkg.core, self.paths["metadata"]) + self.write_json_gzip(dpkg.schemas(), self.paths["schemas"]) + self.write_csv_gzip(dpkg.metadata_csv(), self.paths["metadata-csv"]) + print("Finished updating datasets.") if include_typing: - self.generate_typing() - return package["features"] + self.generate_typing(dpkg) + return dpkg.core.collect() def reset(self) -> None: """Remove all metadata files.""" @@ -140,10 +128,14 @@ def reset(self) -> None: def read(self, name: _PathAlias, /) -> pl.DataFrame: """Read existing metadata from file.""" + import polars as pl + return pl.read_parquet(self.paths[name]) def scan(self, name: _PathAlias, /) -> pl.LazyFrame: """Scan existing metadata from file.""" + import polars as pl + return pl.scan_parquet(self.paths[name]) def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: @@ -190,114 +182,16 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None df = frame.lazy().collect() df.write_parquet(fp, compression="zstd", compression_level=17) - def generate_typing(self) -> None: - from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT - - dpkg = self.scan("metadata") - metadata_schema = dpkg.collect_schema().to_python() - - DATASET_NAME = "dataset_name" - names = ( - dpkg.unique(DATASET_NAME) - .select(DATASET_NAME) - .sort(DATASET_NAME) - .collect() - .to_series() - ) + def generate_typing(self, dpkg: datapackage.DataPackage) -> None: indent = " " * 4 NAME = "Dataset" EXT = "Extension" - EXT_TYPES = tuple( - dpkg.filter(is_image=False) - .select(col("suffix").unique().sort()) - .collect() - .to_series() - .to_list() - ) + EXT_TYPES = dpkg.extensions() EXTENSION_SUFFIXES = "EXTENSION_SUFFIXES" EXTENSION_TYPE_TP = ( f"tuple[{', '.join(f'Literal[{el!r}]' for el in EXT_TYPES)}]" ) EXTENSION_GUARD = "is_ext_read" - METADATA_TD = "Metadata" - DESCRIPTION_DEFAULT = "_description_" - NOTE_SEP = f"\n\n{indent * 2}.. note::\n{indent * 3}" - - sha = ( - f"Unique hash for the dataset.{NOTE_SEP}" - f"E.g. if the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{indent * 3}" - f"then this value would remain stable." - ) - links = ( - f".. 
_Path.stem:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem\n" - f".. _Path.name:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.name\n" - f".. _Path.suffix:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix\n" - f".. _GeoJSON:\n{indent * 2}https://en.wikipedia.org/wiki/GeoJSON\n" - f".. _TopoJSON:\n{indent * 2}https://en.wikipedia.org/wiki/GeoJSON#TopoJSON\n" - ) - import textwrap - - # NOTE: Uses `pl.Config(fmt_str_lengths=25, tbl_cols=5, tbl_width_chars=80)` - examples = f"""\ - Examples - -------- - ``{METADATA_TD}`` keywords form constraints to filter a table like the below sample: - - ``` - shape: (73, 13) - ┌────────────────┬────────┬────────────────┬───┬───────────────┬───────────────┐ - │ dataset_name ┆ suffix ┆ file_name ┆ … ┆ sha ┆ url │ - │ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- │ - │ str ┆ str ┆ str ┆ ┆ str ┆ str │ - ╞════════════════╪════════╪════════════════╪═══╪═══════════════╪═══════════════╡ - │ 7zip ┆ .png ┆ 7zip.png ┆ … ┆ 6586d6c00887c ┆ https://cdn.j │ - │ ┆ ┆ ┆ ┆ d48850099c17… ┆ sdelivr.net/… │ - │ airports ┆ .csv ┆ airports.csv ┆ … ┆ 608ba6d51fa70 ┆ https://cdn.j │ - │ ┆ ┆ ┆ ┆ 584c3fa1d31e… ┆ sdelivr.net/… │ - │ annual-precip ┆ .json ┆ annual-precip. ┆ … ┆ 719e73406cfc0 ┆ https://cdn.j │ - │ ┆ ┆ json ┆ ┆ 8f16dda65151… ┆ sdelivr.net/… │ - │ anscombe ┆ .json ┆ anscombe.json ┆ … ┆ 11ae97090b626 ┆ https://cdn.j │ - │ ┆ ┆ ┆ ┆ 3bdf0c866115… ┆ sdelivr.net/… │ - │ barley ┆ .json ┆ barley.json ┆ … ┆ 8dc50de2509b6 ┆ https://cdn.j │ - │ ┆ ┆ ┆ ┆ e197ce95c24c… ┆ sdelivr.net/… │ - │ … ┆ … ┆ … ┆ … ┆ … ┆ … │ - │ weekly-weather ┆ .json ┆ weekly-weather ┆ … ┆ bd42a3e2403e7 ┆ https://cdn.j │ - │ ┆ ┆ .json ┆ ┆ ccd6baaa89f9… ┆ sdelivr.net/… │ - │ wheat ┆ .json ┆ wheat.json ┆ … ┆ cde46b43fc82f ┆ https://cdn.j │ - │ ┆ ┆ ┆ ┆ 4c3c2a37ddcf… ┆ sdelivr.net/… │ - │ windvectors ┆ .csv ┆ windvectors.cs ┆ … ┆ ed686b0ba613a ┆ https://cdn.j │ - │ ┆ ┆ v ┆ ┆ bd59d09fcd94… ┆ sdelivr.net/… │ - │ world-110m ┆ .json ┆ world-110m.jso ┆ … ┆ a1ce852de6f27 ┆ https://cdn.j │ - │ ┆ ┆ n ┆ ┆ 13c94c0c2840… ┆ sdelivr.net/… │ - │ zipcodes ┆ .csv ┆ zipcodes.csv ┆ … ┆ d3df33e12be0d ┆ https://cdn.j │ - │ ┆ ┆ ┆ ┆ 0544c95f1bd4… ┆ sdelivr.net/… │ - └────────────────┴────────┴────────────────┴───┴───────────────┴───────────────┘ - ``` - """ - - descriptions: dict[str, str] = { - "dataset_name": "Name of the dataset/`Path.stem`_.", - "suffix": "File extension/`Path.suffix`_.", - "file_name": "Equivalent to `Path.name`_.", - "bytes": "File size in *bytes*.", - "is_tabular": "Can be read as tabular data.", - "is_image": "Only accessible via url.", - "is_geo": "`GeoJSON`_ format.", - "is_topo": "`TopoJSON`_ format.", - "is_spatial": "Any geospatial format. 
Only natively supported by ``polars``.", - "is_json": "Not supported natively by ``pyarrow``.", - "has_schema": "Data types available for improved ``pandas`` parsing.", - "sha": sha, - "url": "Remote url used to access dataset.", - } - metadata_doc = ( - f"\n{indent}".join( - f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" - for param in metadata_schema - ) - + f"\n\n{links}\n\n" - f"{textwrap.indent(textwrap.dedent(examples), indent)}" - ) FIELD = "FlFieldStr" FIELD_TYPES = ( @@ -322,23 +216,14 @@ def generate_typing(self) -> None: utils.import_typing_extensions((3, 13), "TypeIs"), utils.import_typing_extensions((3, 10), "TypeAlias"), "\n", - f"__all__ = {[NAME, EXT, METADATA_TD, EXTENSION_GUARD, EXTENSION_SUFFIXES]}\n\n" - f"{NAME}: TypeAlias = {utils.spell_literal(names)}", - f"{EXT}: TypeAlias = {utils.spell_literal(EXT_TYPES)}", + f"__all__ = {[NAME, EXT, dpkg._NAME_TYPED_DICT, EXTENSION_GUARD, EXTENSION_SUFFIXES]}\n", + utils.spell_literal_alias(NAME, dpkg.dataset_names()), + utils.spell_literal_alias(EXT, EXT_TYPES), f"{EXTENSION_SUFFIXES}: {EXTENSION_TYPE_TP} = {EXT_TYPES!r}", f"def {EXTENSION_GUARD}(suffix: Any) -> TypeIs[{EXT}]:\n" f"{indent}return suffix in set({EXT_TYPES!r})\n", - UNIVERSAL_TYPED_DICT.format( - name=METADATA_TD, - metaclass_kwds=", total=False", - td_args=f"\n{indent}".join( - f"{param}: {tp.__name__}" for param, tp in metadata_schema.items() - ), - summary="Full schema for ``metadata.parquet``.", - doc=metadata_doc, - comment="", - ), - f"{FIELD}: TypeAlias = {utils.spell_literal(FIELD_TYPES)}\n" + dpkg.typed_dict(), + utils.spell_literal_alias(FIELD, FIELD_TYPES), '"""\n' "String representation of `frictionless`_ `Field Types`_.\n\n" f".. _frictionless:\n{indent}https://github.com/frictionlessdata/frictionless-py\n" @@ -348,15 +233,9 @@ def generate_typing(self) -> None: ruff.write_lint_format(self.paths["typing"], contents) -_alt_datasets = Path(__file__).parent.parent.parent / "altair" / "datasets" +_alt_datasets = REPO_ROOT / "altair" / "datasets" app = Application( Path(__file__).parent / "_metadata", _alt_datasets / "_metadata", _alt_datasets / "_typing.py", ) - - -# This is the tag in http://github.com/vega/vega-datasets from -# which the datasets in this repository are sourced. 
-_OLD_SOURCE_TAG = "v1.29.0" # 5 years ago -_CURRENT_SOURCE_TAG = "v2.9.0" diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index 5272170c2..9747bdb71 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -7,92 +7,189 @@ from __future__ import annotations +import textwrap from collections import deque +from functools import cached_property from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, ClassVar, Literal import polars as pl from polars import col -from polars import selectors as cs -from tools.datasets.models import ParsedPackage from tools.schemapi import utils if TYPE_CHECKING: from collections.abc import Iterable, Iterator, Mapping, Sequence from altair.datasets._typing import Dataset, FlFieldStr - from tools.datasets.models import Package + from tools.datasets.models import Package, Resource -__all__ = ["parse_package"] +__all__ = ["DataPackage"] +INDENT = " " * 4 -DATASET_NAME: Literal["dataset_name"] = "dataset_name" -# # NOTE: Flag columns -# Storing these instead of the full **56KB** `datapackage.json` -FEATURES: Sequence[pl.Expr] = ( - (col("format") == "png").alias("is_image"), - (col("type") == "table").alias("is_tabular"), - (col("format") == "geojson").alias("is_geo"), - (col("format") == "topojson").alias("is_topo"), - col("format").is_in(("geojson", "topojson")).alias("is_spatial"), - (col("format").str.contains("json")).alias("is_json"), -) +class Column: + def __init__(self, name: str, expr: pl.Expr, /, doc: str = "_description_") -> None: + self._name: str = name + self._expr: pl.Expr = expr + self._doc: str = doc + @property + def expr(self) -> pl.Expr: + return self._expr.alias(self._name) -def parse_package(pkg: Package, base_url: str, /) -> ParsedPackage: - return ParsedPackage( - features=extract_features(pkg, base_url), schemas=extract_schemas(pkg) - ) + @property + def doc(self) -> str: + return f"{self._name}\n{INDENT * 2}{self._doc}" + def is_feature(self) -> bool: + return self._name.startswith("is_") -def extract_schemas(pkg: Package, /) -> Mapping[Dataset, Mapping[str, FlFieldStr]]: - """Reduce all datasets with schemas to a minimal mapping.""" - m: Any = { - Path(rsrc["path"]).stem: {f["name"]: f["type"] for f in s["fields"]} - for rsrc in pkg["resources"] - if (s := rsrc.get("schema")) - } - return m - - -def extract_features(pkg: Package, base_url: str, /) -> pl.DataFrame: - EXCLUDE = ( - "name", - "type", - "format", - "scheme", - "mediatype", - "encoding", - "dialect", - "schema", - "sources", - "licenses", - "hash", - "description", - "path", - ) - return ( - pl.LazyFrame(pkg["resources"]) - .with_columns( - path_stem("path").alias(DATASET_NAME), - cs.exclude("name"), + +class DataPackage: + NAME: ClassVar[Literal["dataset_name"]] = "dataset_name" + """ + Main user-facing column name. 
+ + - Does not include file extension + - Preserves case of original file name + """ + + sort_by: str | Sequence[str] = "dataset_name", "bytes" + """Key(s) used to ensure output is deterministic.""" + + _NAME_TYPED_DICT: ClassVar[Literal["Metadata"]] = "Metadata" + _columns: ClassVar[Sequence[Column]] + _links: ClassVar[Sequence[str]] + + def __init__(self, pkg: Package, base_url: str, path: Path, /) -> None: + self._pkg: Package = pkg + self._base_url: str = base_url + self._path: Path = path + + @classmethod + def with_columns(cls, *columns: Column) -> type[DataPackage]: + cls._columns = columns + return cls + + @classmethod + def with_links(cls, *links: str) -> type[DataPackage]: + cls._links = links + return cls + + @property + def columns(self) -> Iterator[Column]: + yield from self._columns + yield self._url + + @cached_property + def core(self) -> pl.LazyFrame: + """A minimal, tabular view of ``datapackage.json``.""" + return pl.LazyFrame(self._resources).select(self._exprs).sort(self.sort_by) + + def schemas(self) -> Mapping[Dataset, Mapping[str, FlFieldStr]]: + """Reduce all datasets with schemas to a minimal mapping.""" + m: Any = { + Path(rsrc["path"]).stem: {f["name"]: f["type"] for f in s["fields"]} + for rsrc in self._resources + if (s := rsrc.get("schema")) + } + return m + + def dataset_names(self) -> Iterable[str]: + return self.core.select(col(self.NAME).unique().sort()).collect().to_series() + + def extensions(self) -> tuple[str, ...]: + return tuple( + self.core.filter(is_image=False) + .select(col("suffix").unique().sort()) + .collect() + .to_series() + .to_list() ) - .select( - DATASET_NAME, - path_suffix("path").alias("suffix"), - col("path").alias("file_name"), - ~cs.by_name(DATASET_NAME, EXCLUDE), - *FEATURES, - col("schema").is_not_null().alias("has_schema"), - col("hash").str.split(":").list.last().alias("sha"), - pl.concat_str(pl.lit(base_url), "path").alias("url"), + + # TODO: Collect, then raise if cannot guarantee uniqueness + def metadata_csv(self) -> pl.LazyFrame: + """Variant with duplicate dataset names removed.""" + return self.core.filter(col("suffix") != ".arrow").sort(self.NAME) + + def typed_dict(self) -> str: + from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT + + return UNIVERSAL_TYPED_DICT.format( + name=self._NAME_TYPED_DICT, + metaclass_kwds=", total=False", + td_args=self._metadata_td_args, + summary=f"Full schema for ``{self._path.name}``.", + doc=self._metadata_doc, + comment="", ) - .sort(DATASET_NAME, "bytes") - .collect() - ) + + @property + def _exprs(self) -> Iterator[pl.Expr]: + return (column.expr for column in self.columns) + + @property + def _docs(self) -> Iterator[str]: + return (column.doc for column in self.columns) + + @property + def _resources(self) -> Sequence[Resource]: + return self._pkg["resources"] + + @property + def _metadata_doc(self) -> str: + NLINDENT = f"\n{INDENT}" + return ( + f"{NLINDENT.join(self._docs)}\n\n{''.join(self._links)}\n" + f"{textwrap.indent(self._metadata_examples, INDENT)}" + f"{INDENT}" + ) + + @property + def _metadata_examples(self) -> str: + with pl.Config(fmt_str_lengths=25, tbl_cols=5, tbl_width_chars=80): + table = repr(self.core.collect()) + return ( + f"\nExamples" + f"\n--------\n" + f"``{self._NAME_TYPED_DICT}`` keywords form constraints to filter a table like the below sample:\n\n" + f"```\n{table}\n```\n" + ) + + @property + def _metadata_td_args(self) -> str: + schema = self.core.collect_schema().to_python() + return f"\n{INDENT}".join(f"{p}: {tp.__name__}" for p, tp in 
schema.items()) + + @property + def _url(self) -> Column: + expr = pl.concat_str(pl.lit(self._base_url), "path") + return Column("url", expr, "Remote url used to access dataset.") + + def features_typing(self, frame: pl.LazyFrame | pl.DataFrame, /) -> Iterator[str]: + """ + Current plan is to use type aliases in overloads. + + - ``Tabular`` can be treated interchangeably + - ``Image`` can only work with ``url`` + - ``(Spatial|Geo|Topo)`` can be read with ``polars`` + - A future version may implement dedicated support https://github.com/vega/altair/pull/3631#discussion_r1845931955 + - ``Json`` should warn when using the ``pyarrow`` backend + """ + guards = deque[str]() + ldf = frame.lazy() + for column in self.columns: + if not column.is_feature(): + continue + guard_name = column._name + alias_name = guard_name.removeprefix("is_").capitalize() + members = ldf.filter(guard_name).select(self.NAME).collect().to_series() + guards.append(guard_literal(alias_name, guard_name, members)) + yield utils.spell_literal_alias(alias_name, members) + yield from guards def path_stem(column: str | pl.Expr, /) -> pl.Expr: @@ -119,30 +216,65 @@ def path_suffix(column: str | pl.Expr, /) -> pl.Expr: return path.str.tail(path.str.reverse().str.find(r"\.") + 1) -def features_typing(frame: pl.LazyFrame | pl.DataFrame, /) -> Iterator[str]: - """ - Current plan is to use type aliases in overloads. - - - ``Tabular`` can be treated interchangeably - - ``Image`` can only work with ``url`` - - ``(Spatial|Geo|Topo)`` can be read with ``polars`` - - A future version may implement dedicated support https://github.com/vega/altair/pull/3631#discussion_r1845931955 - - ``Json`` should warn when using the ``pyarrow`` backend - """ - guards = deque[str]() - ldf = frame.lazy() - for feat in FEATURES: - guard_name = feat.meta.output_name() - alias_name = guard_name.removeprefix("is_").capitalize() - members = ldf.filter(guard_name).select(DATASET_NAME).collect().to_series() - guards.append(guard_literal(alias_name, guard_name, members)) - yield f"{alias_name}: TypeAlias = {utils.spell_literal(members)}" - yield from guards - - def guard_literal(alias_name: str, guard_name: str, members: Iterable[str], /) -> str: """Type narrowing function, all members must be literal strings.""" return ( f"def {guard_name}(obj: Any) -> TypeIs[{alias_name}]:\n" f" return obj in set({sorted(set(members))!r})\n" ) + + +PATHLIB = "https://docs.python.org/3/library/pathlib.html" +GEOJSON = "https://en.wikipedia.org/wiki/GeoJSON" + + +def link(name: str, url: str, /) -> str: + return f"{INDENT}.. _{name}:\n{INDENT * 2}{url}\n" + + +def note(s: str, /) -> str: + return f"\n\n{INDENT * 2}.. note::\n{INDENT * 3}{s}" + + +fmt = col("format") +DataPackage.with_columns( + Column("dataset_name", path_stem("path"), "Name of the dataset/`Path.stem`_."), + Column("suffix", path_suffix("path"), "File extension/`Path.suffix`_."), + Column("file_name", col("path"), "Equivalent to `Path.name`_."), + Column("bytes", col("bytes"), "File size in *bytes*."), + Column("is_image", fmt == "png", "Only accessible via url."), + Column("is_tabular", col("type") == "table", "Can be read as tabular data."), + Column("is_geo", fmt == "geojson", "`GeoJSON`_ format."), + Column("is_topo", fmt == "topojson", "`TopoJSON`_ format."), + Column( + "is_spatial", + fmt.is_in(("geojson", "topojson")), + "Any geospatial format. Only natively supported by ``polars``.", + ), + Column( + "is_json", fmt.str.contains("json"), "Not supported natively by ``pyarrow``." 
+ ), + Column( + "has_schema", + col("schema").is_not_null(), + "Data types available for improved ``pandas`` parsing.", + ), + Column( + "sha", + col("hash").str.split(":").list.last(), + doc=( + "Unique hash for the dataset." + + note( + f"E.g. if the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{INDENT * 3}" + f"then this value would remain stable." + ) + ), + ), +) +DataPackage.with_links( + link("Path.stem", f"{PATHLIB}#pathlib.PurePath.stem"), + link("Path.name", f"{PATHLIB}#pathlib.PurePath.name"), + link("Path.suffix", f"{PATHLIB}#pathlib.PurePath.suffix"), + link("GeoJSON", GEOJSON), + link("TopoJSON", f"{GEOJSON}#TopoJSON"), +) diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index 95856d4fc..40116cb05 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -20,7 +20,9 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias - from tools.datasets.models import Package, ParsedPackage + from tools.datasets import PathMap + from tools.datasets.datapackage import DataPackage + from tools.datasets.models import Package BranchOrTag: TypeAlias = 'Literal["main"] | LiteralString' @@ -40,16 +42,13 @@ class Npm: def __init__( self, - output_dir: Path, + paths: PathMap, *, jsdelivr: Literal["jsdelivr"] = "jsdelivr", npm: Literal["npm"] = "npm", package: LiteralString = "vega-datasets", ) -> None: - output_dir.mkdir(exist_ok=True) - self._paths: dict[Literal["datapackage"], Path] = { - "datapackage": output_dir / "datapackage.json", - } + self.paths: PathMap = paths self._url: NpmUrl = NpmUrl( CDN=f"https://cdn.{jsdelivr}.net/{npm}/{package}@", GH=f"https://cdn.{jsdelivr}.net/gh/vega/{package}@", @@ -107,14 +106,15 @@ def file_gh( with self._opener.open(req) as response: return read_fn(response) - def datapackage(self, *, tag: LiteralString, frozen: bool = False) -> ParsedPackage: + def datapackage(self, *, tag: LiteralString, frozen: bool = False) -> DataPackage: pkg: Package = ( - json.loads(self._paths["datapackage"].read_text("utf-8")) + json.loads(self.paths["datapackage"].read_text("utf-8")) if frozen else self.file_gh(tag, "datapackage.json") ) - - return datapackage.parse_package(pkg, self.dataset_base_url(tag)) + return datapackage.DataPackage( + pkg, self.dataset_base_url(tag), self.paths["metadata"] + ) def is_branch(s: BranchOrTag, /) -> bool: diff --git a/tools/schemapi/utils.py b/tools/schemapi/utils.py index a9426f15c..3d4b2d347 100644 --- a/tools/schemapi/utils.py +++ b/tools/schemapi/utils.py @@ -1227,6 +1227,26 @@ def spell_literal(it: Iterable[str], /, *, quote: bool = True) -> str: return f"Literal[{', '.join(it_el)}]" +def spell_literal_alias( + alias_name: str, members: Iterable[str], /, *, quote: bool = True +) -> str: + """ + Wraps ``utils.spell_literal`` as a ``TypeAlias``. + + Examples + -------- + >>> spell_literal_alias("Animals", ("Dog", "Cat", "Fish")) + "Animals: TypeAlias = Literal['Dog', 'Cat', 'Fish']" + + >>> spell_literal_alias("Digits", "0123456789") + "Digits: TypeAlias = Literal['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']" + + >>> spell_literal_alias("LessThanFive", (repr(i) for i in range(5))) + "LessThanFive: TypeAlias = Literal['0', '1', '2', '3', '4']" + """ + return f"{alias_name}: TypeAlias = {spell_literal(members, quote=quote)}" + + def maybe_rewrap_literal(it: Iterable[str], /) -> Iterator[str]: """ Where `it` may contain one or more `"enum"`, `"const"`, flatten to a single `Literal[...]`. 
From 7433eb81fe22acf62588016e240f0997ef6df908 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 20 Jan 2025 17:09:49 +0000 Subject: [PATCH 172/201] test: `test_datasets` overhaul - Eliminated all flaky tests - Mocking more of the internals that is safer to run in parallel - Split out non-threadsafe tests with `@no_xdist` - Huge performance improvement for the slower tests - Added some helper functions (`is_*`) where common patterns were identified - **Removed skipping from native `pandas` backend** - Confirms that its now safe without `pyarrow` installed --- altair/datasets/_readers.py | 24 +- pyproject.toml | 21 +- tests/__init__.py | 10 + tests/test_datasets.py | 556 ++++++++++++++++++------------------ 4 files changed, 308 insertions(+), 303 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index f76cc5a0a..0a18c1e61 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -60,6 +60,7 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias + from packaging.requirements import Requirement from altair.datasets._typing import Dataset, Extension, Metadata from altair.vegalite.v5.schema._typing import OneOrSeq @@ -379,7 +380,7 @@ class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): def _maybe_fn(self, meta: Metadata, /) -> Callable[..., pa.Table]: fn = super()._maybe_fn(meta) - if fn is self._read_json_polars: + if fn == self._read_json_polars: return fn elif meta["is_json"]: if meta["is_tabular"]: @@ -550,7 +551,7 @@ def _requirements(s: _ConcreteT, /) -> _ConcreteT: ... def _requirements(s: Literal["pandas[pyarrow]"], /) -> tuple[_Pandas, _PyArrow]: ... -def _requirements(s: _Backend, /): +def _requirements(s: Any, /) -> Any: concrete: set[Literal[_Polars, _Pandas, _PyArrow]] = {"polars", "pandas", "pyarrow"} if s in concrete: return s @@ -559,12 +560,13 @@ def _requirements(s: _Backend, /): req = Requirement(s) supports_extras: set[Literal[_Pandas]] = {"pandas"} - if req.name in supports_extras: - name = req.name - if (extras := req.extras) and extras == {"pyarrow"}: - extra = "pyarrow" - return name, extra - else: - raise NotImplementedError(s) - else: - raise NotImplementedError(s) + if req.name in supports_extras and req.extras == {"pyarrow"}: + return req.name, "pyarrow" + return _requirements_unknown(req) + + +def _requirements_unknown(req: Requirement | str, /) -> Any: + from packaging.requirements import Requirement + + req = Requirement(req) if isinstance(req, str) else req + return (req.name, *req.extras) diff --git a/pyproject.toml b/pyproject.toml index 5ac95f190..03e33cc36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -262,16 +262,18 @@ cwd = "." 
[tool.taskipy.tasks] lint = "ruff check" format = "ruff format --diff --check" +ruff-check = "task lint && task format" ruff-fix = "task lint && ruff format" type-check = "mypy altair tests" -pytest = "pytest" -test = "task lint && task format && task type-check && task pytest" -test-fast = "task ruff-fix && pytest -m \"not slow\"" -test-slow = "task ruff-fix && pytest -m \"slow\"" -test-datasets = "task ruff-fix && pytest tests -k test_datasets -m \"\"" -test-min = "task lint && task format && task type-check && hatch test --python 3.9" -test-all = "task lint && task format && task type-check && hatch test --all" +pytest-serial = "pytest -m \"no_xdist\" --numprocesses=1" +pytest = "pytest && task pytest-serial" +test = "task ruff-check && task type-check && task pytest" +test-fast = "task ruff-fix && pytest -m \"not slow and not datasets_debug and not no_xdist\"" +test-slow = "task ruff-fix && pytest -m \"slow and not datasets_debug and not no_xdist\"" +test-datasets = "task ruff-fix && pytest tests -k test_datasets -m \"not no_xdist\" && task pytest-serial" +test-min = "task ruff-check && task type-check && hatch test --python 3.9" +test-all = "task ruff-check && task type-check && hatch test --all" generate-schema-wrapper = "mypy tools && python tools/generate_schema_wrapper.py && task test" @@ -303,12 +305,13 @@ addopts = [ "tests", "altair", "tools", - "-m not datasets_debug", + "-m not datasets_debug and not no_xdist", ] # https://docs.pytest.org/en/stable/how-to/mark.html#registering-marks markers = [ "slow: Label tests as slow (deselect with '-m \"not slow\"')", - "datasets_debug: Disabled by default due to high number of requests" + "datasets_debug: Disabled by default due to high number of requests", + "no_xdist: Unsafe to run in parallel" ] [tool.mypy] diff --git a/tests/__init__.py b/tests/__init__.py index 5d78dce0d..80c27fc2c 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -60,6 +60,16 @@ def windows_has_tzdata() -> bool: >>> hatch run test-slow --durations=25 # doctest: +SKIP """ +no_xdist: pytest.MarkDecorator = pytest.mark.no_xdist() +""" +Custom ``pytest.mark`` decorator. + +Each marked test will run **serially**, after all other selected tests. + +.. tip:: + Use as a last resort when a test depends on manipulating global state. +""" + skip_requires_ipython: pytest.MarkDecorator = pytest.mark.skipif( find_spec("IPython") is None, reason="`IPython` not installed." 
) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 3ccdba273..b212d79ce 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,6 +1,5 @@ from __future__ import annotations -import contextlib import datetime as dt import re import sys @@ -15,18 +14,14 @@ from narwhals.stable import v1 as nw from narwhals.stable.v1 import dependencies as nw_dep -from altair.datasets import Loader, url +from altair.datasets import Loader from altair.datasets._exceptions import AltairDatasetsError from altair.datasets._typing import Dataset, Extension, Metadata, is_ext_read -from tests import skip_requires_pyarrow, slow - -if sys.version_info >= (3, 14): - from typing import TypedDict -else: - from typing_extensions import TypedDict +from tests import no_xdist, skip_requires_pyarrow +from tools import fs if TYPE_CHECKING: - from collections.abc import Container, Iterator + from collections.abc import Callable, Container, Iterator, Mapping from pathlib import Path from typing import Literal @@ -34,7 +29,7 @@ import polars as pl from _pytest.mark.structures import ParameterSet - from altair.datasets._readers import _Backend, _PandasAny, _Polars + from altair.datasets._readers import _Backend, _PandasAny, _Polars, _PyArrow from altair.vegalite.v5.schema._typing import OneOrSeq from tests import MarksType @@ -45,46 +40,24 @@ PolarsLoader: TypeAlias = Loader[pl.DataFrame, pl.LazyFrame] CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" - - -class DatasetSpec(TypedDict, total=False): - """Exceptional cases which cannot rely on defaults.""" - - name: Dataset - suffix: Extension - marks: MarksType - - -requires_pyarrow: pytest.MarkDecorator = skip_requires_pyarrow() - -_b_params = { +_backend_params: Mapping[_Backend, ParameterSet] = { "polars": pytest.param("polars"), - "pandas": pytest.param( - "pandas", - marks=pytest.mark.xfail( - find_spec("pyarrow") is None, - reason=( - "`pandas` supports backends other than `pyarrow` for `.parquet`.\n" - "However, none of these are currently an `altair` dependency." 
- ), - ), - ), - "pandas[pyarrow]": pytest.param("pandas[pyarrow]", marks=requires_pyarrow), - "pyarrow": pytest.param("pyarrow", marks=requires_pyarrow), + "pandas": pytest.param("pandas"), + "pandas[pyarrow]": pytest.param("pandas[pyarrow]", marks=skip_requires_pyarrow()), + "pyarrow": pytest.param("pyarrow", marks=skip_requires_pyarrow()), } -backends: pytest.MarkDecorator = pytest.mark.parametrize("backend", _b_params.values()) +backends: pytest.MarkDecorator = pytest.mark.parametrize( + "backend", _backend_params.values() +) backends_no_polars: pytest.MarkDecorator = pytest.mark.parametrize( - "backend", [v for k, v in _b_params.items() if k != "polars"] + "backend", [v for k, v in _backend_params.items() if k != "polars"] ) backends_pandas_any: pytest.MarkDecorator = pytest.mark.parametrize( - "backend", [v for k, v in _b_params.items() if "pandas" in k] -) -backends_single: pytest.MarkDecorator = pytest.mark.parametrize( - "backend", [v for k, v in _b_params.items() if "[" not in k] + "backend", [v for k, v in _backend_params.items() if "pandas" in k] ) -backends_multi: pytest.MarkDecorator = pytest.mark.parametrize( - "backend", [v for k, v in _b_params.items() if "[" in k] +backends_pyarrow: pytest.MarkDecorator = pytest.mark.parametrize( + "backend", [v for k, v in _backend_params.items() if k == "pyarrow"] ) datasets_debug: pytest.MarkDecorator = pytest.mark.datasets_debug() @@ -100,24 +73,12 @@ class DatasetSpec(TypedDict, total=False): """ -@pytest.fixture -def is_flaky_datasets(request: pytest.FixtureRequest) -> bool: - mark_filter = request.config.getoption("-m", None) # pyright: ignore[reportArgumentType] - if mark_filter is None: - return False - elif mark_filter == "": - return True - elif isinstance(mark_filter, str): - return False - else: - raise TypeError(mark_filter) - - @pytest.fixture(scope="session") -def polars_loader(tmp_path_factory: pytest.TempPathFactory) -> PolarsLoader: - data = Loader.from_backend("polars") - data.cache.path = tmp_path_factory.mktemp("loader-cache-polars") - return data +def polars_loader() -> PolarsLoader: + load = Loader.from_backend("polars") + if load.cache.is_not_active(): + load.cache.path = load.cache._XDG_CACHE + return load @pytest.fixture( @@ -127,17 +88,6 @@ def spatial_datasets(request: pytest.FixtureRequest) -> Dataset: return request.param -@backends_no_polars -def test_spatial(spatial_datasets, backend: _Backend) -> None: - load = Loader.from_backend(backend) - pattern = re.compile( - rf"{spatial_datasets}.+geospatial.+native.+{re.escape(backend)}.+url", - flags=re.DOTALL | re.IGNORECASE, - ) - with pytest.raises(NotImplementedError, match=pattern): - load(spatial_datasets) - - @pytest.fixture def metadata_columns() -> frozenset[str]: """ @@ -158,25 +108,65 @@ def metadata_columns() -> frozenset[str]: ) -def match_url(name: Dataset, url: str) -> bool: +def is_frame_backend(frame: Any, backend: _Backend, /) -> bool: + pandas_any: set[_PandasAny] = {"pandas", "pandas[pyarrow]"} + if backend in pandas_any: + return nw_dep.is_pandas_dataframe(frame) + elif backend == "pyarrow": + return nw_dep.is_pyarrow_table(frame) + elif backend == "polars": + return nw_dep.is_polars_dataframe(frame) + else: + raise TypeError(backend) + + +def is_loader_backend(loader: Loader[Any, Any], backend: _Backend, /) -> bool: + return repr(loader) == f"{type(loader).__name__}[{backend}]" + + +def is_url(name: Dataset, fn_url: Callable[..., str], /) -> bool: pattern = rf".+/vega-datasets@.+/data/{name}\..+" + url = fn_url(name) return re.match(pattern, 
url) is not None +def is_polars_backed_pyarrow(loader: Loader[Any, Any], /) -> bool: + """ + User requested ``pyarrow``, but also has ``polars`` installed. + + Notes + ----- + - Currently, defers to ``polars`` only for ``.json``. + """ + return bool( + is_loader_backend(loader, "pyarrow") + and (fn := getattr(loader._reader, "_read_json_polars", None)) + and fn == loader._reader.read_fn("dummy.json") + ) + + +@backends +def test_metadata_columns(backend: _Backend, metadata_columns: frozenset[str]) -> None: + """Ensure all backends will query the same column names.""" + load = Loader.from_backend(backend) + schema_columns = load._reader._scan_metadata().collect().columns + assert set(schema_columns) == metadata_columns + + @backends def test_loader_from_backend(backend: _Backend) -> None: - data = Loader.from_backend(backend) - assert data._reader._name == backend + load = Loader.from_backend(backend) + assert is_loader_backend(load, backend) @backends def test_loader_url(backend: _Backend) -> None: - data = Loader.from_backend(backend) - dataset_name: Dataset = "volcano" - assert match_url(dataset_name, data.url(dataset_name)) + load = Loader.from_backend(backend) + assert is_url("volcano", load.url) -def test_load(monkeypatch: pytest.MonkeyPatch) -> None: +@no_xdist +def test_load_infer_priority(monkeypatch: pytest.MonkeyPatch) -> None: """ Inferring the best backend available. @@ -187,7 +177,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: import altair.datasets._loader from altair.datasets import load - assert load._reader._name == "polars" + assert is_loader_backend(load, "polars") monkeypatch.delattr(altair.datasets._loader, "load", raising=False) monkeypatch.setitem(sys.modules, "polars", None) @@ -196,20 +186,20 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: if find_spec("pyarrow") is None: # NOTE: We can end the test early for the CI job that removes `pyarrow` - assert load._reader._name == "pandas" + assert is_loader_backend(load, "pandas") monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pandas", None) with pytest.raises(AltairDatasetsError, match=r"no.+backend"): from altair.datasets import load else: - assert load._reader._name == "pandas[pyarrow]" + assert is_loader_backend(load, "pandas[pyarrow]") monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pyarrow", None) from altair.datasets import load - assert load._reader._name == "pandas" + assert is_loader_backend(load, "pandas") monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pandas", None) @@ -217,7 +207,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setitem(sys.modules, "pyarrow", import_module("pyarrow")) from altair.datasets import load - assert load._reader._name == "pyarrow" + assert is_loader_backend(load, "pyarrow") monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pyarrow", None) @@ -225,40 +215,22 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: from altair.datasets import load -# HACK: Using a fixture to get a command line option -# https://docs.pytest.org/en/stable/example/simple.html#pass-different-values-to-a-test-function-depending-on-command-line-options -@pytest.mark.xfail( - is_flaky_datasets, # type: ignore - reason=( - "'pandas[pyarrow]' seems to break locally when running:\n" - ">>> pytest -p no:randomly -n logical tests -k test_datasets -m ''\n\n" - "Possibly related:\n" - " 
https://github.com/modin-project/modin/issues/951\n" - " https://github.com/pandas-dev/pandas/blob/1c986d6213904fd7d9acc5622dc91d029d3f1218/pandas/io/parquet.py#L164\n" - " https://github.com/pandas-dev/pandas/blob/1c986d6213904fd7d9acc5622dc91d029d3f1218/pandas/io/parquet.py#L257\n" - ), - raises=AttributeError, -) -@requires_pyarrow -def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: +@backends +def test_load_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: import altair.datasets._loader monkeypatch.delattr(altair.datasets._loader, "load", raising=False) from altair.datasets import load - assert load._reader._name == "polars" + assert is_loader_backend(load, "polars") default = load("cars") - df_pyarrow = load("cars", backend="pyarrow") - df_pandas = load("cars", backend="pandas[pyarrow]") + df = load("cars", backend=backend) default_2 = load("cars") - df_polars = load("cars", backend="polars") assert nw_dep.is_polars_dataframe(default) - assert nw_dep.is_pyarrow_table(df_pyarrow) - assert nw_dep.is_pandas_dataframe(df_pandas) + assert is_frame_backend(df, backend) assert nw_dep.is_polars_dataframe(default_2) - assert nw_dep.is_polars_dataframe(df_polars) @pytest.mark.parametrize( @@ -296,41 +268,36 @@ def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: def test_url(name: Dataset) -> None: from altair.datasets import url - assert match_url(name, url(name)) + assert is_url(name, url) def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: - import altair.datasets from altair.datasets._cache import csv_cache + from altair.datasets._readers import infer_backend - monkeypatch.setitem(sys.modules, "polars", None) - monkeypatch.setitem(sys.modules, "pandas", None) - monkeypatch.setitem(sys.modules, "pyarrow", None) + priority: Any = ("fake_mod_1", "fake_mod_2", "fake_mod_3", "fake_mod_4") assert csv_cache._mapping == {} - - with contextlib.suppress(AltairDatasetsError): - monkeypatch.delattr(altair.datasets._loader, "load", raising=False) with pytest.raises(AltairDatasetsError): - from altair.datasets import load as load - - assert match_url("jobs", url("jobs")) + infer_backend(priority=priority) + url = csv_cache.url + assert is_url("jobs", url) assert csv_cache._mapping != {} - assert match_url("cars", url("cars")) - assert match_url("stocks", url("stocks")) - assert match_url("countries", url("countries")) - assert match_url("crimea", url("crimea")) - assert match_url("disasters", url("disasters")) - assert match_url("driving", url("driving")) - assert match_url("earthquakes", url("earthquakes")) - assert match_url("flare", url("flare")) - assert match_url("flights-10k", url("flights-10k")) - assert match_url("flights-200k", url("flights-200k")) + assert is_url("cars", url) + assert is_url("stocks", url) + assert is_url("countries", url) + assert is_url("crimea", url) + assert is_url("disasters", url) + assert is_url("driving", url) + assert is_url("earthquakes", url) + assert is_url("flare", url) + assert is_url("flights-10k", url) + assert is_url("flights-200k", url) if find_spec("vegafusion"): - assert match_url("flights-3m", url("flights-3m")) + assert is_url("flights-3m", url) with monkeypatch.context() as mp: mp.setitem(sys.modules, "vegafusion", None) @@ -344,51 +311,14 @@ def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: @backends -def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.delenv(CACHE_ENV_VAR, raising=False) - - data = Loader.from_backend(backend) - frame = 
data("stocks", ".csv") +def test_loader_call(backend: _Backend) -> None: + load = Loader.from_backend(backend) + frame = load("stocks", ".csv") assert nw_dep.is_into_dataframe(frame) nw_frame = nw.from_native(frame) assert set(nw_frame.columns) == {"symbol", "date", "price"} -@backends_single -def test_missing_dependency_single( - backend: _Backend, monkeypatch: pytest.MonkeyPatch -) -> None: - monkeypatch.setitem(sys.modules, backend, None) - - with pytest.raises( - ModuleNotFoundError, - match=re.compile( - rf"{backend}.+requires.+{backend}.+but.+{backend}.+not.+found.+pip install {backend}", - flags=re.DOTALL, - ), - ): - Loader.from_backend(backend) - - -@backends_multi -@skip_requires_pyarrow -def test_missing_dependency_multi( - backend: _Backend, monkeypatch: pytest.MonkeyPatch -) -> None: - secondary = "pyarrow" - primary = backend.removesuffix(f"[{secondary}]") - monkeypatch.setitem(sys.modules, secondary, None) - - with pytest.raises( - ModuleNotFoundError, - match=re.compile( - rf"{re.escape(backend)}.+requires.+'{primary}', '{secondary}'.+but.+{secondary}.+not.+found.+pip install {secondary}", - flags=re.DOTALL, - ), - ): - Loader.from_backend(backend) - - @backends def test_dataset_not_found(backend: _Backend) -> None: """ @@ -396,7 +326,7 @@ def test_dataset_not_found(backend: _Backend) -> None: ``Loader.url`` is used since it doesn't require a remote connection. """ - data = Loader.from_backend(backend) + load = Loader.from_backend(backend) real_name: Literal["disasters"] = "disasters" invalid_name: Literal["fake name"] = "fake name" invalid_suffix: Literal["fake suffix"] = "fake suffix" @@ -411,7 +341,7 @@ def test_dataset_not_found(backend: _Backend) -> None: ERR_NO_RESULT, match=re.compile(rf"{MSG_NO_RESULT}.+{NAME}.+{invalid_name}", re.DOTALL), ): - data.url(invalid_name) + load.url(invalid_name) with pytest.raises( TypeError, @@ -420,7 +350,7 @@ def test_dataset_not_found(backend: _Backend) -> None: re.DOTALL, ), ): - data.url(real_name, invalid_suffix) # type: ignore[arg-type] + load.url(real_name, invalid_suffix) # type: ignore[arg-type] with pytest.raises( ERR_NO_RESULT, @@ -429,7 +359,44 @@ def test_dataset_not_found(backend: _Backend) -> None: re.DOTALL, ), ): - data.url(real_name, incorrect_suffix) + load.url(real_name, incorrect_suffix) + + +def test_reader_missing_dependencies() -> None: + from packaging.requirements import Requirement + + from altair.datasets._readers import _Reader + + class MissingDeps(_Reader): + def __init__(self, name) -> None: + self._name = name + reqs = Requirement(name) + for req in (reqs.name, *reqs.extras): + self._import(req) + + self._read_fn = {} + self._scan_fn = {} + + fake_name = "not_a_real_package" + real_name = "altair" + fake_extra = "AnotherFakePackage" + backend = f"{real_name}[{fake_extra}]" + with pytest.raises( + ModuleNotFoundError, + match=re.compile( + rf"{fake_name}.+requires.+{fake_name}.+but.+{fake_name}.+not.+found.+pip install {fake_name}", + flags=re.DOTALL, + ), + ): + MissingDeps(fake_name) + with pytest.raises( + ModuleNotFoundError, + match=re.compile( + rf"{re.escape(backend)}.+requires.+'{real_name}', '{fake_extra}'.+but.+{fake_extra}.+not.+found.+pip install {fake_extra}", + flags=re.DOTALL, + ), + ): + MissingDeps(backend) @backends @@ -451,97 +418,112 @@ def test_reader_cache( monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) - data = Loader.from_backend(backend) - assert data.cache.is_active() - cache_dir = data.cache.path + load = Loader.from_backend(backend) + assert load.cache.is_active() + 
cache_dir = load.cache.path assert cache_dir == tmp_path - assert tuple(data.cache) == () + assert tuple(load.cache) == () # smallest csvs - lookup_groups = data("lookup_groups") - data("lookup_people") - data("iowa-electricity") - data("global-temp") + lookup_groups = load("lookup_groups") + load("lookup_people") + load("iowa-electricity") + load("global-temp") - cached_paths = tuple(data.cache) + cached_paths = tuple(load.cache) assert len(cached_paths) == 4 if nw_dep.is_polars_dataframe(lookup_groups): left, right = ( lookup_groups, - cast("pl.DataFrame", data("lookup_groups", ".csv")), + cast("pl.DataFrame", load("lookup_groups", ".csv")), ) else: left, right = ( pl.DataFrame(lookup_groups), - pl.DataFrame(data("lookup_groups", ".csv")), + pl.DataFrame(load("lookup_groups", ".csv")), ) assert_frame_equal(left, right) - assert len(tuple(data.cache)) == 4 - assert cached_paths == tuple(data.cache) + assert len(tuple(load.cache)) == 4 + assert cached_paths == tuple(load.cache) - data("iowa-electricity", ".csv") - data("global-temp", ".csv") - data("global-temp.csv") + load("iowa-electricity", ".csv") + load("global-temp", ".csv") + load("global-temp.csv") - assert len(tuple(data.cache)) == 4 - assert cached_paths == tuple(data.cache) + assert len(tuple(load.cache)) == 4 + assert cached_paths == tuple(load.cache) - data("lookup_people") - data("lookup_people.csv") - data("lookup_people", ".csv") - data("lookup_people") + load("lookup_people") + load("lookup_people.csv") + load("lookup_people", ".csv") + load("lookup_people") - assert len(tuple(data.cache)) == 4 - assert cached_paths == tuple(data.cache) + assert len(tuple(load.cache)) == 4 + assert cached_paths == tuple(load.cache) -@slow @datasets_debug @backends def test_reader_cache_exhaustive( - backend: _Backend, monkeypatch: pytest.MonkeyPatch, tmp_path: Path + backend: _Backend, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + polars_loader: PolarsLoader, ) -> None: """ Fully populate and then purge the cache for all backends. 
- Does not attempt to read the files - Checking we can support pre-downloading and safely deleting + + Notes + ----- + - Requests work the same for all backends + - The logic for detecting the cache contents uses ``narhwals`` + - Here, we're testing that these ``narwhals`` ops are consistent + - `DatasetCache.download_all` is expensive for CI, so aiming for it to run at most once + - 34-45s per call (4x backends) """ + polars_loader.cache.download_all() + CLONED: Path = tmp_path / "clone" + fs.mkdir(CLONED) + fs.copytree(polars_loader.cache.path, CLONED) + monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) - data = Loader.from_backend(backend) - assert data.cache.is_active() - cache_dir = data.cache.path + load = Loader.from_backend(backend) + assert load.cache.is_active() + cache_dir = load.cache.path assert cache_dir == tmp_path - assert tuple(data.cache) == () + assert tuple(load.cache) == (CLONED,) - data.cache.download_all() - cached_paths = tuple(data.cache) + load.cache.path = CLONED + cached_paths = tuple(load.cache) assert cached_paths != () # NOTE: Approximating all datasets downloaded assert len(cached_paths) >= 40 assert all( bool(fp.exists() and is_ext_read(fp.suffix) and fp.stat().st_size) - for fp in data.cache + for fp in load.cache ) # NOTE: Confirm this is a no-op - data.cache.download_all() - assert len(cached_paths) == len(tuple(data.cache)) + load.cache.download_all() + assert len(cached_paths) == len(tuple(load.cache)) # NOTE: Ensure unrelated files in the directory are not removed dummy: Path = tmp_path / "dummy.json" dummy.touch(exist_ok=False) - data.cache.clear() + load.cache.clear() remaining = tuple(tmp_path.iterdir()) - assert len(remaining) == 1 - assert remaining[0] == dummy - dummy.unlink() + assert set(remaining) == {dummy, CLONED} + fs.rm(dummy, CLONED) +@no_xdist def test_reader_cache_disable(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: from altair.datasets import load @@ -572,68 +554,66 @@ def test_reader_cache_disable(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - assert not load.cache.is_empty() -movies_fail: ParameterSet = pytest.param( - "movies", - marks=pytest.mark.xfail( - reason="Only working for `polars`.\n" - "`pyarrow` isn't happy with the mixed `int`/`str` column." - ), -) -earthquakes_fail: ParameterSet = pytest.param( - "earthquakes", - marks=pytest.mark.xfail( - reason="Only working for `polars`.\nGeoJSON fails on native `pyarrow`" - ), -) - - +# TODO: Investigate adding schemas for `pyarrow`. @pytest.mark.parametrize( - "name", + ("name", "fallback"), [ - "cars", - movies_fail, - "wheat", - "barley", - "gapminder", - "income", - "burtin", - earthquakes_fail, + ("cars", "polars"), + ("movies", "polars"), + ("wheat", "polars"), + ("barley", "polars"), + ("gapminder", "polars"), + ("income", "polars"), + ("burtin", "polars"), + ("cars", None), + pytest.param( + "movies", + None, + marks=pytest.mark.xfail( + True, + raises=TypeError, + reason=( + "msg: `Expected bytes, got a 'int' object`\n" + "Isn't happy with the mixed `int`/`str` column." 
+ ), + strict=True, + ), + ), + ("wheat", None), + ("barley", None), + ("gapminder", None), + ("income", None), + ("burtin", None), ], ) -@pytest.mark.parametrize("fallback", ["polars", None]) -@skip_requires_pyarrow +@backends_pyarrow def test_pyarrow_read_json( - fallback: _Polars | None, name: Dataset, monkeypatch: pytest.MonkeyPatch + backend: _PyArrow, + fallback: _Polars | None, + name: Dataset, + monkeypatch: pytest.MonkeyPatch, ) -> None: - monkeypatch.delenv(CACHE_ENV_VAR, raising=False) - monkeypatch.delitem(sys.modules, "pandas", raising=False) if fallback is None: monkeypatch.setitem(sys.modules, "polars", None) - - data = Loader.from_backend("pyarrow") - - data(name, ".json") + load = Loader.from_backend(backend) + assert load(name, ".json") -@pytest.mark.parametrize( - ("spec", "column"), - [ - (DatasetSpec(name="cars"), "Year"), - (DatasetSpec(name="unemployment-across-industries"), "date"), - (DatasetSpec(name="flights-10k"), "date"), - (DatasetSpec(name="football"), "date"), - (DatasetSpec(name="crimea"), "date"), - (DatasetSpec(name="ohlc"), "date"), - ], -) -def test_polars_read_json_roundtrip( - polars_loader: PolarsLoader, spec: DatasetSpec, column: str -) -> None: - frame = polars_loader(spec["name"], ".json") - tp = frame.schema.to_python()[column] - assert tp is dt.date or issubclass(tp, dt.date) +@backends_no_polars +def test_spatial(spatial_datasets, backend: _Backend) -> None: + load = Loader.from_backend(backend) + if is_polars_backed_pyarrow(load): + assert nw_dep.is_pyarrow_table(load(spatial_datasets)) + else: + pattern = re.compile( + rf"{spatial_datasets}.+geospatial.+native.+{re.escape(backend)}.+try.+polars.+url", + flags=re.DOTALL | re.IGNORECASE, + ) + with pytest.raises(NotImplementedError, match=pattern): + load(spatial_datasets) +# TODO: Adapt into something useful or simplify into just param name def _dataset_params(*, skip: Container[str] = ()) -> Iterator[ParameterSet]: """Temp way of excluding datasets that were removed.""" names: tuple[Dataset, ...] 
= get_args(Dataset) @@ -646,9 +626,8 @@ def _dataset_params(*, skip: Container[str] = ()) -> Iterator[ParameterSet]: yield pytest.param(*args, marks=marks) -@slow -@datasets_debug @pytest.mark.parametrize(("name", "suffix"), list(_dataset_params())) +@datasets_debug def test_all_datasets( polars_loader: PolarsLoader, name: Dataset, suffix: Extension ) -> None: @@ -668,51 +647,62 @@ def _raise_exception(e: type[Exception], *args: Any, **kwds: Any): def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: from polars.testing import assert_frame_equal - data = Loader.from_backend("polars") - data.cache.path = tmp_path + load = Loader.from_backend("polars") + load.cache.path = tmp_path - data("londonCentroids") - data("stocks") - data("driving") + load("londonCentroids") + load("stocks") + load("driving") cached_paths = tuple(tmp_path.iterdir()) assert len(cached_paths) == 3 raiser = partial(_raise_exception, URLError) with monkeypatch.context() as mp: - mp.setattr(data._reader._opener, "open", raiser) + mp.setattr(load._reader._opener, "open", raiser) # Existing cache entries don't trigger an error - data("londonCentroids") - data("stocks") - data("driving") + load("londonCentroids") + load("stocks") + load("driving") # Mocking cache-miss without remote conn with pytest.raises(URLError): - data("birdstrikes") + load("birdstrikes") assert len(tuple(tmp_path.iterdir())) == 3 # Now we can get a cache-hit - frame = data("birdstrikes") + frame = load("birdstrikes") assert nw_dep.is_polars_dataframe(frame) assert len(tuple(tmp_path.iterdir())) == 4 with monkeypatch.context() as mp: - mp.setattr(data._reader._opener, "open", raiser) + mp.setattr(load._reader._opener, "open", raiser) # Here, the remote conn isn't considered - we already have the file - frame_from_cache = data("birdstrikes") + frame_from_cache = load("birdstrikes") assert len(tuple(tmp_path.iterdir())) == 4 assert_frame_equal(frame, frame_from_cache) -@backends -def test_metadata_columns(backend: _Backend, metadata_columns: frozenset[str]) -> None: - """Ensure all backends will query the same column names.""" - data = Loader.from_backend(backend) - schema_columns = data._reader._scan_metadata().collect().columns - assert set(schema_columns) == metadata_columns +@pytest.mark.parametrize( + ("name", "column"), + [ + ("cars", "Year"), + ("unemployment-across-industries", "date"), + ("flights-10k", "date"), + ("football", "date"), + ("crimea", "date"), + ("ohlc", "date"), + ], +) +def test_polars_date_read_json_roundtrip( + polars_loader: PolarsLoader, name: Dataset, column: str +) -> None: + """Ensure ``date`` columns are inferred using the roundtrip json -> csv method.""" + frame = polars_loader(name, ".json") + tp = frame.schema.to_python()[column] + assert tp is dt.date or issubclass(tp, dt.date) -@skip_requires_pyarrow @backends_pandas_any @pytest.mark.parametrize( ("name", "columns"), From d64dbee607006108ca617b1e4f5f6240c79c0727 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 21 Jan 2025 18:42:27 +0000 Subject: [PATCH 173/201] refactor: Reuse `tools.fs` more, fix `app.(read|scan)` Using only `.parquet` was relevant in earlier versions that produced multiple `.parquet` files Now these methods safely handle all formats in use --- tools/datasets/__init__.py | 40 +++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index faf5e8d96..64940ebc1 
100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -22,9 +22,9 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Literal +from tools import fs from tools.codemod import ruff from tools.datasets.npm import Npm -from tools.fs import REPO_ROOT from tools.schemapi import utils if TYPE_CHECKING: @@ -60,7 +60,7 @@ class Application: Parameters ---------- out_dir_tools, out_dir_altair - Directories to store ``.parquet`` metadata files. + Directories to store metadata files. out_fp_typing Path to write metadata-derived typing module. @@ -72,7 +72,7 @@ class Application: def __init__( self, out_dir_tools: Path, out_dir_altair: Path, out_fp_typing: Path ) -> None: - out_dir_tools.mkdir(exist_ok=True) + fs.mkdir(out_dir_tools) METADATA = "metadata" self.paths = types.MappingProxyType["_PathAlias", Path]( { @@ -102,7 +102,7 @@ def refresh( include_typing Regenerate ``altair.datasets._typing``. frozen - Don't perform any requests or attempt to check for new versions. + Don't perform any requests. .. note:: **Temporary** measure to work from ``main`` until `vega-datasets@3`_. @@ -123,20 +123,28 @@ def refresh( def reset(self) -> None: """Remove all metadata files.""" - for fp in self.paths.values(): - fp.unlink(missing_ok=True) + fs.rm(*self.paths.values()) def read(self, name: _PathAlias, /) -> pl.DataFrame: """Read existing metadata from file.""" - import polars as pl - - return pl.read_parquet(self.paths[name]) + return self.scan(name).collect() def scan(self, name: _PathAlias, /) -> pl.LazyFrame: """Scan existing metadata from file.""" import polars as pl - return pl.scan_parquet(self.paths[name]) + fp = self.paths[name] + if fp.suffix == ".parquet": + return pl.scan_parquet(fp) + elif ".csv" in fp.suffixes: + return pl.scan_csv(fp) + elif ".json" in fp.suffixes: + return pl.read_json(fp).lazy() + else: + msg = ( + f"Unable to read {fp.name!r} as tabular data.\nSuffixes: {fp.suffixes}" + ) + raise NotImplementedError(msg) def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: """ @@ -152,8 +160,7 @@ def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> Non """ if fp.suffix != ".gz": fp = fp.with_suffix(".csv.gz") - if not fp.exists(): - fp.touch() + fp.touch() df = frame.lazy().collect() buf = BytesIO() with gzip.GzipFile(fp, mode="wb", mtime=0) as f: @@ -169,16 +176,13 @@ def write_json_gzip(self, obj: Any, fp: Path, /) -> None: """ if fp.suffix != ".gz": fp = fp.with_suffix(".json.gz") - if not fp.exists(): - fp.touch() - + fp.touch() with gzip.GzipFile(fp, mode="wb", mtime=0) as f: f.write(json.dumps(obj).encode()) def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: """Write ``frame`` to ``fp``, with some extra safety.""" - if not fp.exists(): - fp.touch() + fp.touch() df = frame.lazy().collect() df.write_parquet(fp, compression="zstd", compression_level=17) @@ -233,7 +237,7 @@ def generate_typing(self, dpkg: datapackage.DataPackage) -> None: ruff.write_lint_format(self.paths["typing"], contents) -_alt_datasets = REPO_ROOT / "altair" / "datasets" +_alt_datasets = fs.REPO_ROOT / "altair" / "datasets" app = Application( Path(__file__).parent / "_metadata", _alt_datasets / "_metadata", From 0c72435e09ab6263c48bb566c733deb11f927bd9 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 22 Jan 2025 12:30:07 +0000 Subject: [PATCH 174/201] feat(typing): Set `"polars"` as default in `Loader.from_backend` Without a default, I found 
that VSCode was always suggesting the **last** overload first (`"pyarrow"`) This is a bad suggestion, as it provides the *worst native* experience. The default now aligns with the backend providing the *best native* experience --- altair/datasets/_loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 0bb91aa1f..8417e2d6a 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -48,7 +48,7 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): @overload @classmethod def from_backend( - cls, backend_name: Literal["polars"], / + cls, backend_name: Literal["polars"] = ..., / ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... @overload @@ -64,7 +64,7 @@ def from_backend( ) -> Loader[pa.Table, pa.Table]: ... @classmethod - def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: + def from_backend(cls, backend_name: _Backend = "polars", /) -> Loader[Any, Any]: """ Initialize a new loader, with the specified backend. From 8e4c168c6942ace581f7a03026ae0271afdd9871 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 22 Jan 2025 12:54:29 +0000 Subject: [PATCH 175/201] docs: Adds module-level doc to `altair.datasets` - Multiple **brief** examples, for a taste of the public API - See (#3763) - Refs to everywhere a first-time user may need help from - Also aligned the (`Loader`|`load`) docs w/ eachother and the new phrasing here --- altair/datasets/__init__.py | 65 ++++++++++++++++++++++++++++++++++++- altair/datasets/_loader.py | 2 +- 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index cc6a07d32..01dc35212 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -1,3 +1,64 @@ +""" +Load example datasets **remotely** from `vega-datasets`_. + +Provides over **70+** datasets, used throughout our `Example Gallery`_. + +You can learn more about each dataset at `datapackage.md`_. + +Examples +-------- +Load a dataset as a ``DataFrame``/``Table``:: + + from altair.datasets import load + + load("cars") + +.. note:: + Requires installation of either `polars`_, `pandas`_, or `pyarrow`_. + +Get the remote address of a dataset and use directly in a :class:`altair.Chart`:: + + import altair as alt + from altair.datasets import url + + source = url("co2-concentration") + alt.Chart(source).mark_line(tooltip=True).encode(x="Date:T", y="CO2:Q") + +.. note:: + Works without any additional dependencies. + +For greater control over the backend library use:: + + from altair.datasets import Loader + + load = Loader.from_backend("polars") + load("penguins") + load.url("penguins") + +This method also provides *precise* Tab completions on the returned object:: + + load("cars"). + # bottom_k + # drop + # drop_in_place + # drop_nans + # dtypes + # ... + +.. _vega-datasets: + https://github.com/vega/vega-datasets +.. _Example Gallery: + https://altair-viz.github.io/gallery/index.html#example-gallery +.. _datapackage.md: + https://github.com/vega/vega-datasets/blob/main/datapackage.md +.. _polars: + https://docs.pola.rs/user-guide/installation/ +.. _pandas: + https://pandas.pydata.org/docs/getting_started/install.html +.. 
_pyarrow: + https://arrow.apache.org/docs/python/install.html +""" + from __future__ import annotations from typing import TYPE_CHECKING @@ -22,7 +83,9 @@ load: _Load[Any, Any] """ -For full IDE completions, instead use: +Get a remote dataset and load as tabular data. + +For full Tab completions, instead use: from altair.datasets import Loader load = Loader.from_backend("polars") diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 8417e2d6a..6c359edb2 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -29,7 +29,7 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): """ - Load examples **remotely** from `vega-datasets`_, with caching. + Load example datasets **remotely** from `vega-datasets`_, with caching. A new ``Loader`` must be initialized by specifying a backend: From 106f8bb40ecd59a18606c29fadd97094cf75c968 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 22 Jan 2025 14:37:08 +0000 Subject: [PATCH 176/201] test: Clean up `test_datasets` - Reduce superfluous docs - Format/reorganize remaining docs - Follow up on some comments Misc style changes --- tests/test_datasets.py | 177 +++++++++++------------------------------ 1 file changed, 48 insertions(+), 129 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index b212d79ce..0855b73af 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -16,12 +16,12 @@ from altair.datasets import Loader from altair.datasets._exceptions import AltairDatasetsError -from altair.datasets._typing import Dataset, Extension, Metadata, is_ext_read +from altair.datasets._typing import Dataset, Metadata, is_ext_read from tests import no_xdist, skip_requires_pyarrow from tools import fs if TYPE_CHECKING: - from collections.abc import Callable, Container, Iterator, Mapping + from collections.abc import Callable, Mapping from pathlib import Path from typing import Literal @@ -31,7 +31,6 @@ from altair.datasets._readers import _Backend, _PandasAny, _Polars, _PyArrow from altair.vegalite.v5.schema._typing import OneOrSeq - from tests import MarksType if sys.version_info >= (3, 10): from typing import TypeAlias @@ -39,7 +38,18 @@ from typing_extensions import TypeAlias PolarsLoader: TypeAlias = Loader[pl.DataFrame, pl.LazyFrame] -CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" +datasets_debug: pytest.MarkDecorator = pytest.mark.datasets_debug() +""" +Custom ``pytest.mark`` decorator. + +Use for more exhaustive tests that require many requests. + +**Disabled** by default in ``pyproject.toml``: + + [tool.pytest.ini_options] + addopts = ... +""" + _backend_params: Mapping[_Backend, ParameterSet] = { "polars": pytest.param("polars"), "pandas": pytest.param("pandas"), @@ -60,52 +70,26 @@ "backend", [v for k, v in _backend_params.items() if k == "pyarrow"] ) -datasets_debug: pytest.MarkDecorator = pytest.mark.datasets_debug() -""" -Custom ``pytest.mark`` decorator. - -Use for more exhaustive tests that require many requests. - -**Disabled** by default in ``pyproject.toml``: +datasets_all: pytest.MarkDecorator = pytest.mark.parametrize("name", get_args(Dataset)) +datasets_spatial: pytest.MarkDecorator = pytest.mark.parametrize( + "name", ["earthquakes", "londonBoroughs", "londonTubeLines", "us-10m", "world-110m"] +) - [tool.pytest.ini_options] - addopts = ... 
-""" +CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" @pytest.fixture(scope="session") def polars_loader() -> PolarsLoader: + """Fastest and **most reliable** backend.""" load = Loader.from_backend("polars") if load.cache.is_not_active(): load.cache.path = load.cache._XDG_CACHE return load -@pytest.fixture( - params=("earthquakes", "londonBoroughs", "londonTubeLines", "us-10m", "world-110m") -) -def spatial_datasets(request: pytest.FixtureRequest) -> Dataset: - return request.param - - @pytest.fixture def metadata_columns() -> frozenset[str]: - """ - Returns all defined keys ``Metadata`` (``TypedDict``). - - Note - ---- - - ``# type: ignore``(s) are to fix a false positive. - - Should be recognised by this stub `typing_extensions.pyi`_ - - .. _typing_extensions.pyi: - https://github.com/python/typeshed/blob/51d0f0194c27347ab7d0083bd7b11210a09fef75/stdlib/typing_extensions.pyi#L222-L229 - """ - return Metadata.__required_keys__.union( - Metadata.__optional_keys__, - Metadata.__readonly_keys__, # type: ignore[attr-defined] - Metadata.__mutable_keys__, # type: ignore[attr-defined] - ) + return Metadata.__required_keys__.union(Metadata.__optional_keys__) def is_frame_backend(frame: Any, backend: _Backend, /) -> bool: @@ -131,13 +115,8 @@ def is_url(name: Dataset, fn_url: Callable[..., str], /) -> bool: def is_polars_backed_pyarrow(loader: Loader[Any, Any], /) -> bool: - """ - User requested ``pyarrow``, but also has ``polars`` installed. - - Notes - ----- - - Currently, defers to ``polars`` only for ``.json``. - """ + """User requested ``pyarrow``, but also has ``polars`` installed.""" + # NOTE: Would prefer if there was a *less* private method to test this. return bool( is_loader_backend(loader, "pyarrow") and (fn := getattr(loader._reader, "_read_json_polars", None)) @@ -168,18 +147,17 @@ def test_loader_url(backend: _Backend) -> None: @no_xdist def test_load_infer_priority(monkeypatch: pytest.MonkeyPatch) -> None: """ - Inferring the best backend available. - - Based on the following order: + Ensure the **most reliable**, available backend is selected. 
- priority: Sequence[_Backend] = "polars", "pandas[pyarrow]", "pandas", "pyarrow" + See Also + -------- + ``altair.datasets._readers.infer_backend`` """ import altair.datasets._loader from altair.datasets import load assert is_loader_backend(load, "polars") monkeypatch.delattr(altair.datasets._loader, "load", raising=False) - monkeypatch.setitem(sys.modules, "polars", None) from altair.datasets import load @@ -194,14 +172,12 @@ def test_load_infer_priority(monkeypatch: pytest.MonkeyPatch) -> None: else: assert is_loader_backend(load, "pandas[pyarrow]") monkeypatch.delattr(altair.datasets._loader, "load") - monkeypatch.setitem(sys.modules, "pyarrow", None) from altair.datasets import load assert is_loader_backend(load, "pandas") monkeypatch.delattr(altair.datasets._loader, "load") - monkeypatch.setitem(sys.modules, "pandas", None) monkeypatch.delitem(sys.modules, "pyarrow") monkeypatch.setitem(sys.modules, "pyarrow", import_module("pyarrow")) @@ -223,11 +199,9 @@ def test_load_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: from altair.datasets import load assert is_loader_backend(load, "polars") - default = load("cars") df = load("cars", backend=backend) default_2 = load("cars") - assert nw_dep.is_polars_dataframe(default) assert is_frame_backend(df, backend) assert nw_dep.is_polars_dataframe(default_2) @@ -276,7 +250,6 @@ def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: from altair.datasets._readers import infer_backend priority: Any = ("fake_mod_1", "fake_mod_2", "fake_mod_3", "fake_mod_4") - assert csv_cache._mapping == {} with pytest.raises(AltairDatasetsError): infer_backend(priority=priority) @@ -284,7 +257,6 @@ def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: url = csv_cache.url assert is_url("jobs", url) assert csv_cache._mapping != {} - assert is_url("cars", url) assert is_url("stocks", url) assert is_url("countries", url) @@ -295,7 +267,6 @@ def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: assert is_url("flare", url) assert is_url("flights-10k", url) assert is_url("flights-200k", url) - if find_spec("vegafusion"): assert is_url("flights-3m", url) @@ -303,7 +274,6 @@ def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: mp.setitem(sys.modules, "vegafusion", None) with pytest.raises(AltairDatasetsError, match=r".parquet.+require.+vegafusion"): url("flights-3m") - with pytest.raises( TypeError, match="'fake data' does not refer to a known dataset" ): @@ -321,17 +291,12 @@ def test_loader_call(backend: _Backend) -> None: @backends def test_dataset_not_found(backend: _Backend) -> None: - """ - Various queries that should **always raise** due to non-existent dataset. - - ``Loader.url`` is used since it doesn't require a remote connection. 
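A usage sketch of the backend-free ``url`` path exercised above (the version pinned in the returned address may differ)::

    from altair.datasets import url

    url("cars")
    # 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.11.0/data/cars.json'

    # ".parquet" resources additionally require `vegafusion`; without it,
    # asking for their url raises AltairDatasetsError.
    url("flights-3m")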
- """ + """Various queries that should **always raise** due to non-existent dataset.""" load = Loader.from_backend(backend) real_name: Literal["disasters"] = "disasters" invalid_name: Literal["fake name"] = "fake name" invalid_suffix: Literal["fake suffix"] = "fake suffix" incorrect_suffix: Literal[".json"] = ".json" - ERR_NO_RESULT = ValueError MSG_NO_RESULT = "Found no results for" NAME = "dataset_name" @@ -342,7 +307,6 @@ def test_dataset_not_found(backend: _Backend) -> None: match=re.compile(rf"{MSG_NO_RESULT}.+{NAME}.+{invalid_name}", re.DOTALL), ): load.url(invalid_name) - with pytest.raises( TypeError, match=re.compile( @@ -351,7 +315,6 @@ def test_dataset_not_found(backend: _Backend) -> None: ), ): load.url(real_name, invalid_suffix) # type: ignore[arg-type] - with pytest.raises( ERR_NO_RESULT, match=re.compile( @@ -403,26 +366,15 @@ def __init__(self, name) -> None: def test_reader_cache( backend: _Backend, monkeypatch: pytest.MonkeyPatch, tmp_path: Path ) -> None: - """ - Using a sample of the smallest datasets, make *"requests"* that are all caught by prior hits. - - Note - ---- - `tmp_path`_ is a built-in fixture. - - .. _tmp_path: - https://docs.pytest.org/en/stable/getting-started.html#request-a-unique-temporary-directory-for-functional-tests - """ + """Ensure cache hits avoid network activity.""" import polars as pl from polars.testing import assert_frame_equal monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) - load = Loader.from_backend(backend) assert load.cache.is_active() cache_dir = load.cache.path assert cache_dir == tmp_path - assert tuple(load.cache) == () # smallest csvs @@ -430,7 +382,6 @@ def test_reader_cache( load("lookup_people") load("iowa-electricity") load("global-temp") - cached_paths = tuple(load.cache) assert len(cached_paths) == 4 @@ -448,19 +399,15 @@ def test_reader_cache( assert_frame_equal(left, right) assert len(tuple(load.cache)) == 4 assert cached_paths == tuple(load.cache) - load("iowa-electricity", ".csv") load("global-temp", ".csv") load("global-temp.csv") - assert len(tuple(load.cache)) == 4 assert cached_paths == tuple(load.cache) - load("lookup_people") load("lookup_people.csv") load("lookup_people", ".csv") load("lookup_people") - assert len(tuple(load.cache)) == 4 assert cached_paths == tuple(load.cache) @@ -476,15 +423,14 @@ def test_reader_cache_exhaustive( """ Fully populate and then purge the cache for all backends. 
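Outside the test-suite, the same cache controls look roughly like this (the directory is illustrative)::

    import os
    from pathlib import Path

    os.environ["ALTAIR_DATASETS_DIR"] = str(Path.home() / ".altair_cache")

    from altair.datasets import load

    assert load.cache.is_active()
    load.cache.download_all()  # prefetch every dataset; ~35-45s, needs a connection
    load("cars")               # later calls resolve from disk
    load.cache.clear()         # delete all cached datasets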
- - Does not attempt to read the files - - Checking we can support pre-downloading and safely deleting - Notes ----- - - Requests work the same for all backends - - The logic for detecting the cache contents uses ``narhwals`` - - Here, we're testing that these ``narwhals`` ops are consistent - - `DatasetCache.download_all` is expensive for CI, so aiming for it to run at most once + - Does not attempt to read the files + - Checking we can support pre-downloading and safely deleting + - Requests work the same for all backends + - The logic for detecting the cache contents uses ``narhwals`` + - Here, we're testing that these ``narwhals`` ops are consistent + - `DatasetCache.download_all` is expensive for CI, so aiming for it to run **at most once** - 34-45s per call (4x backends) """ polars_loader.cache.download_all() @@ -498,13 +444,12 @@ def test_reader_cache_exhaustive( cache_dir = load.cache.path assert cache_dir == tmp_path assert tuple(load.cache) == (CLONED,) - load.cache.path = CLONED cached_paths = tuple(load.cache) assert cached_paths != () # NOTE: Approximating all datasets downloaded - assert len(cached_paths) >= 40 + assert len(cached_paths) >= 70 assert all( bool(fp.exists() and is_ext_read(fp.suffix) and fp.stat().st_size) for fp in load.cache @@ -528,16 +473,13 @@ def test_reader_cache_disable(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - from altair.datasets import load monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) - assert load.cache.is_active() assert load.cache.path == tmp_path assert load.cache.is_empty() load("cars") assert not load.cache.is_empty() - - # RELATED: https://github.com/python/mypy/issues/3004 + # ISSUE: https://github.com/python/mypy/issues/3004 load.cache.path = None # type: ignore[assignment] - assert load.cache.is_not_active() with pytest.raises( ValueError, @@ -546,9 +488,7 @@ def test_reader_cache_disable(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - ), ): tuple(load.cache) - load.cache.path = tmp_path - assert load.cache.is_active() assert load.cache.path == tmp_path assert not load.cache.is_empty() @@ -599,44 +539,29 @@ def test_pyarrow_read_json( assert load(name, ".json") +@datasets_spatial @backends_no_polars -def test_spatial(spatial_datasets, backend: _Backend) -> None: +def test_spatial(backend: _Backend, name: Dataset) -> None: load = Loader.from_backend(backend) if is_polars_backed_pyarrow(load): - assert nw_dep.is_pyarrow_table(load(spatial_datasets)) + assert nw_dep.is_pyarrow_table(load(name)) else: pattern = re.compile( - rf"{spatial_datasets}.+geospatial.+native.+{re.escape(backend)}.+try.+polars.+url", + rf"{name}.+geospatial.+native.+{re.escape(backend)}.+try.+polars.+url", flags=re.DOTALL | re.IGNORECASE, ) with pytest.raises(NotImplementedError, match=pattern): - load(spatial_datasets) - + load(name) -# TODO: Adapt into something useful or simplify into just param name -def _dataset_params(*, skip: Container[str] = ()) -> Iterator[ParameterSet]: - """Temp way of excluding datasets that were removed.""" - names: tuple[Dataset, ...] 
= get_args(Dataset) - args: tuple[Dataset, Extension | None] - for name in names: - marks: MarksType = () - if name in skip: - continue - args = name, None - yield pytest.param(*args, marks=marks) - -@pytest.mark.parametrize(("name", "suffix"), list(_dataset_params())) +@datasets_all @datasets_debug -def test_all_datasets( - polars_loader: PolarsLoader, name: Dataset, suffix: Extension -) -> None: - """Ensure all annotated datasets can be loaded with the most reliable backend.""" +def test_all_datasets(polars_loader: PolarsLoader, name: Dataset) -> None: if name in {"7zip", "ffox", "gimp"}: with pytest.raises(AltairDatasetsError, match=rf"{name}.+tabular"): - polars_loader(name, suffix) + polars_loader(name) else: - frame = polars_loader(name, suffix) + frame = polars_loader(name) assert nw_dep.is_polars_dataframe(frame) @@ -649,14 +574,11 @@ def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - load = Loader.from_backend("polars") load.cache.path = tmp_path - load("londonCentroids") load("stocks") load("driving") - cached_paths = tuple(tmp_path.iterdir()) assert len(cached_paths) == 3 - raiser = partial(_raise_exception, URLError) with monkeypatch.context() as mp: mp.setattr(load._reader._opener, "open", raiser) @@ -679,7 +601,6 @@ def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - # Here, the remote conn isn't considered - we already have the file frame_from_cache = load("birdstrikes") assert len(tuple(tmp_path.iterdir())) == 4 - assert_frame_equal(frame, frame_from_cache) @@ -731,12 +652,12 @@ def test_pandas_date_parse( """ Ensure schema defaults are correctly parsed. - NOTE: + Notes + ----- - Depends on ``frictionless`` being able to detect the date/datetime columns. - Not all format strings work """ date_columns: list[str] = [columns] if isinstance(columns, str) else list(columns) - load = Loader.from_backend(backend) url = load.url(name) kwds: dict[str, Any] = ( @@ -745,10 +666,8 @@ def test_pandas_date_parse( else {"parse_dates": date_columns} ) kwds_empty: dict[str, Any] = {k: [] for k in kwds} - df_schema_derived: pd.DataFrame = load(name) nw_schema = nw.from_native(df_schema_derived).schema - df_manually_specified: pd.DataFrame = load(name, **kwds) df_dates_empty: pd.DataFrame = load(name, **kwds_empty) From c3c2edaa64be37547da3f467e453af1ee8c4ba60 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 22 Jan 2025 17:42:11 +0000 Subject: [PATCH 177/201] docs: Make `sphinx` happy with docs These changes are very minor in VSCode, but fix a lot of rendering issues on the website --- altair/datasets/__init__.py | 23 ++++------- altair/datasets/_cache.py | 21 +++++----- altair/datasets/_loader.py | 78 ++++++++++++++++++------------------- doc/user_guide/api.rst | 16 ++++++++ tools/generate_api_docs.py | 19 +++++++++ 5 files changed, 92 insertions(+), 65 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 01dc35212..3c61eda0b 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -1,5 +1,5 @@ """ -Load example datasets **remotely** from `vega-datasets`_. +Load example datasets *remotely* from `vega-datasets`_. Provides over **70+** datasets, used throughout our `Example Gallery`_. @@ -85,24 +85,18 @@ """ Get a remote dataset and load as tabular data. 
-For full Tab completions, instead use: +For full Tab completions, instead use:: from altair.datasets import Loader load = Loader.from_backend("polars") cars = load("cars") movies = load("movies") -Alternatively, specify ``backend`` during a call: +Alternatively, specify ``backend`` during a call:: from altair.datasets import load cars = load("cars", backend="polars") movies = load("movies", backend="polars") - -Related -------- -- https://github.com/vega/altair/pull/3631#issuecomment-2480832609 -- https://github.com/vega/altair/pull/3631#discussion_r1847111064 -- https://github.com/vega/altair/pull/3631#discussion_r1847176465 """ @@ -124,17 +118,14 @@ def url( .. note:: Only needed if ``name`` is available in multiple formats. + Returns + ------- + ``str`` + .. _Path.stem: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem .. _Path.suffix: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - - Related - ------- - - https://github.com/vega/altair/pull/3631#issuecomment-2484826592 - - https://github.com/vega/altair/pull/3631#issuecomment-2480832711 - - https://github.com/vega/altair/discussions/3150#discussioncomment-11280516 - - https://github.com/vega/altair/pull/3631#discussion_r1846662053 """ from altair.datasets._exceptions import AltairDatasetsError diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 08016d622..a415a8380 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -317,26 +317,27 @@ def path(self) -> Path: """ Returns path to datasets cache. - Defaults to (`XDG_CACHE_HOME`_): + Defaults to (`XDG_CACHE_HOME`_):: "$XDG_CACHE_HOME/altair/" - But can be configured using the environment variable: + But can be configured using the environment variable:: "$ALTAIR_DATASETS_DIR" - You can set this for the current session via: + You can set this for the current session via:: - >>> from pathlib import Path - >>> from altair.datasets import load - >>> load.cache.path = Path.home() / ".altair_cache" + from pathlib import Path + from altair.datasets import load - >>> load.cache.path.relative_to(Path.home()).as_posix() - '.altair_cache' + load.cache.path = Path.home() / ".altair_cache" - You can *later* disable caching via: + load.cache.path.relative_to(Path.home()).as_posix() + ".altair_cache" - >>> load.cache.path = None + You can *later* disable caching via:: + + load.cache.path = None .. _XDG_CACHE_HOME: https://specifications.freedesktop.org/basedir-spec/latest/#variables diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 6c359edb2..8f13ab2de 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -29,14 +29,14 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): """ - Load example datasets **remotely** from `vega-datasets`_, with caching. + Load example datasets *remotely* from `vega-datasets`_, with caching. - A new ``Loader`` must be initialized by specifying a backend: + A new ``Loader`` must be initialized by specifying a backend:: from altair.datasets import Loader load = Loader.from_backend("polars") - >>> load # doctest: +SKIP + load Loader[polars] .. _vega-datasets: @@ -81,42 +81,35 @@ def from_backend(cls, backend_name: _Backend = "polars", /) -> Loader[Any, Any]: .. warning:: Most datasets use a `JSON format not supported`_ by ``pyarrow`` - .. _polars defaults: - https://docs.pola.rs/api/python/stable/reference/io.html - .. _pandas defaults: - https://pandas.pydata.org/docs/reference/io.html - .. 
_JSON format not supported: - https://arrow.apache.org/docs/python/json.html#reading-json-files - Examples -------- - Using ``polars``: + Using ``polars``:: from altair.datasets import Loader load = Loader.from_backend("polars") cars = load("cars") - >>> type(cars) # doctest: +SKIP + type(cars) polars.dataframe.frame.DataFrame - Using ``pandas``: + Using ``pandas``:: load = Loader.from_backend("pandas") cars = load("cars") - >>> type(cars) # doctest: +SKIP + type(cars) pandas.core.frame.DataFrame - Using ``pandas``, backed by ``pyarrow`` dtypes: + Using ``pandas``, backed by ``pyarrow`` dtypes:: load = Loader.from_backend("pandas[pyarrow]") cars = load("cars") - >>> type(cars) # doctest: +SKIP + type(cars) pandas.core.frame.DataFrame - >>> cars.dtypes # doctest: +SKIP + cars.dtypes Name string[pyarrow] Miles_per_Gallon double[pyarrow] Cylinders int64[pyarrow] @@ -127,6 +120,13 @@ def from_backend(cls, backend_name: _Backend = "polars", /) -> Loader[Any, Any]: Year timestamp[ns][pyarrow] Origin string[pyarrow] dtype: object + + .. _polars defaults: + https://docs.pola.rs/api/python/stable/reference/io.html + .. _pandas defaults: + https://pandas.pydata.org/docs/reference/io.html + .. _JSON format not supported: + https://arrow.apache.org/docs/python/json.html#reading-json-files """ obj = Loader.__new__(Loader) obj._reader = backend(backend_name) @@ -154,24 +154,19 @@ def __call__( **kwds Arguments passed to the underlying read function. - .. _Path.stem: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem - .. _Path.suffix: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - Examples -------- - Using ``polars``: + Using ``polars``:: from altair.datasets import Loader load = Loader.from_backend("polars") source = load("iowa-electricity") - >>> source.columns # doctest: +SKIP + source.columns ['year', 'source', 'net_generation'] - >>> source # doctest: +SKIP + source shape: (51, 3) ┌────────────┬──────────────┬────────────────┐ │ year ┆ source ┆ net_generation │ @@ -191,15 +186,15 @@ def __call__( │ 2017-01-01 ┆ Renewables ┆ 21933 │ └────────────┴──────────────┴────────────────┘ - Using ``pandas``: + Using ``pandas``:: load = Loader.from_backend("pandas") source = load("iowa-electricity") - >>> source.columns # doctest: +SKIP + source.columns Index(['year', 'source', 'net_generation'], dtype='object') - >>> source # doctest: +SKIP + source year source net_generation 0 2001-01-01 Fossil Fuels 35361 1 2002-01-01 Fossil Fuels 35991 @@ -215,15 +210,15 @@ def __call__( [51 rows x 3 columns] - Using ``pyarrow``: + Using ``pyarrow``:: load = Loader.from_backend("pyarrow") source = load("iowa-electricity") - >>> source.column_names # doctest: +SKIP + source.column_names ['year', 'source', 'net_generation'] - >>> source # doctest: +SKIP + source pyarrow.Table year: date32[day] source: string @@ -232,6 +227,11 @@ def __call__( year: [[2001-01-01,2002-01-01,2003-01-01,2004-01-01,2005-01-01,...,2013-01-01,2014-01-01,2015-01-01,2016-01-01,2017-01-01]] source: [["Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels",...,"Renewables","Renewables","Renewables","Renewables","Renewables"]] net_generation: [[35361,35991,36234,36205,36883,...,16476,17452,19091,21241,21933]] + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. 
_Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix """ return self._reader.dataset(name, suffix, **kwds) @@ -261,16 +261,16 @@ def url( Examples -------- - The returned url will always point to an accessible dataset: + The returned url will always point to an accessible dataset:: import altair as alt from altair.datasets import Loader load = Loader.from_backend("polars") - >>> load.url("cars") # doctest: +SKIP - 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.11.0/data/cars.json' + load.url("cars") + "https://cdn.jsdelivr.net/npm/vega-datasets@v2.11.0/data/cars.json" - We can pass the result directly to a chart: + We can pass the result directly to a chart:: url = load.url("cars") alt.Chart(url).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q") @@ -282,19 +282,19 @@ def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: """ Caching of remote dataset requests. - Configure cache path: + Configure cache path:: self.cache.path = "..." - Download the latest datasets *ahead-of-time*: + Download the latest datasets *ahead-of-time*:: self.cache.download_all() - Remove all downloaded datasets: + Remove all downloaded datasets:: self.cache.clear() - Disable caching: + Disable caching:: self.cache.path = None """ diff --git a/doc/user_guide/api.rst b/doc/user_guide/api.rst index 5793f0ae8..336c29d54 100644 --- a/doc/user_guide/api.rst +++ b/doc/user_guide/api.rst @@ -791,5 +791,21 @@ Typing Optional is_chart_type +.. _api-datasets: + +Datasets +-------- +.. currentmodule:: altair.datasets + +.. autosummary:: + :toctree: generated/datasets/ + :nosignatures: + + Loader + load + url + .. _Generic: https://typing.readthedocs.io/en/latest/spec/generics.html#generics +.. _vega-datasets: + https://github.com/vega/vega-datasets diff --git a/tools/generate_api_docs.py b/tools/generate_api_docs.py index 55c68729e..babd3d3eb 100644 --- a/tools/generate_api_docs.py +++ b/tools/generate_api_docs.py @@ -110,8 +110,22 @@ {typing_objects} +.. _api-datasets: + +Datasets +-------- +.. currentmodule:: altair.datasets + +.. autosummary:: + :toctree: generated/datasets/ + :nosignatures: + + {datasets_objects} + .. _Generic: https://typing.readthedocs.io/en/latest/spec/generics.html#generics +.. _vega-datasets: + https://github.com/vega/vega-datasets """ @@ -171,6 +185,10 @@ def theme() -> list[str]: return sort_3 +def datasets() -> list[str]: + return alt.datasets.__all__ + + def lowlevel_wrappers() -> list[str]: objects = sorted(iter_objects(alt.schema.core, restrict_to_subclass=alt.SchemaBase)) # The names of these two classes are also used for classes in alt.channels. 
Due to @@ -194,6 +212,7 @@ def write_api_file() -> None: api_classes=sep.join(api_classes()), typing_objects=sep.join(type_hints()), theme_objects=sep.join(theme()), + datasets_objects=sep.join(datasets()), ), encoding="utf-8", ) From d3b3ef2afed2fb1bff4fdcb099787191e04a8b15 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 25 Jan 2025 20:52:35 +0000 Subject: [PATCH 178/201] refactor: Add `find_spec` fastpath to `is_available` Have a lot of changes locally that use `find_spec`, but would prefer a single name assoicated with this action The actual spec is never relevant for this usage --- altair/datasets/_readers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 0a18c1e61..a1f66dee1 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -485,6 +485,8 @@ def is_available( * ``True`` every package. * ``False`` at least one package. """ + if not more_pkg_names and isinstance(pkg_names, str): + return find_spec(pkg_names) is not None pkgs_names = pkg_names if not isinstance(pkg_names, str) else (pkg_names,) names = chain(pkgs_names, more_pkg_names) fn = all if require_all else any From b606a7d6e6980865930d8bb8cb720d6340855782 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 29 Jan 2025 15:05:47 +0000 Subject: [PATCH 179/201] feat(DRAFT): Private API overhaul **Public API is unchanged** Core changes are to simplify testing and extension: - `_readers.py` -> `_reader.py` - w/ two new support modules `_constraints`, and `_readimpl` - Functions (`BaseImpl`) are declared with what they support (`include`) and restrictions (`exclude`) on that subset - Transforms a lot of the imperative logic into set operations - Greatly improved `pyarrow` support - Utilize schema - Provides additional fallback `.json` implementations - `_stdlib_read_json_to_arrow` finally resolves `"movies.json"` issue --- altair/datasets/_cache.py | 106 +++++- altair/datasets/_constraints.py | 115 +++++++ altair/datasets/_exceptions.py | 78 +++-- altair/datasets/_loader.py | 40 ++- altair/datasets/_reader.py | 540 ++++++++++++++++++++++++++++++ altair/datasets/_readers.py | 574 -------------------------------- altair/datasets/_readimpl.py | 414 +++++++++++++++++++++++ tests/test_datasets.py | 79 ++--- 8 files changed, 1260 insertions(+), 686 deletions(-) create mode 100644 altair/datasets/_constraints.py create mode 100644 altair/datasets/_reader.py delete mode 100644 altair/datasets/_readers.py create mode 100644 altair/datasets/_readimpl.py diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index a415a8380..9abe09726 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -5,10 +5,9 @@ from collections import defaultdict from importlib.util import find_spec from pathlib import Path -from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, cast, get_args +from typing import TYPE_CHECKING, ClassVar, TypeVar, cast, get_args import narwhals.stable.v1 as nw -from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT from altair.datasets._exceptions import AltairDatasetsError from altair.datasets._typing import Dataset @@ -29,12 +28,18 @@ ) from io import IOBase from typing import Any, Final + from urllib.request import OpenerDirector from _typeshed import StrPath from narwhals.stable.v1.dtypes import DType + from narwhals.stable.v1.typing import IntoExpr from altair.datasets._typing import 
Metadata + if sys.version_info >= (3, 12): + from typing import Unpack + else: + from typing_extensions import Unpack if sys.version_info >= (3, 11): from typing import LiteralString else: @@ -43,8 +48,8 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias - from altair.datasets._readers import _Reader from altair.datasets._typing import FlFieldStr + from altair.vegalite.v5.schema._typing import OneOrSeq _Dataset: TypeAlias = "Dataset | LiteralString" _FlSchema: TypeAlias = Mapping[str, FlFieldStr] @@ -83,6 +88,10 @@ https://narwhals-dev.github.io/narwhals/api-reference/dtypes/ """ +_FIELD_TO_DTYPE: Mapping[FlFieldStr, type[DType]] = { + v: k for k, v in _DTYPE_TO_FIELD.items() +} + def _iter_metadata(df: nw.DataFrame[Any], /) -> Iterator[Metadata]: """ @@ -179,10 +188,7 @@ def rotated(self) -> Mapping[str, Sequence[Any]]: self._rotated[k].append(v) return self._rotated - def metadata(self, ns: Any, /) -> nw.LazyFrame: - data: Any = self.rotated - return nw.maybe_convert_dtypes(nw.from_dict(data, native_namespace=ns).lazy()) - + # TODO: Evaluate which errors are now obsolete def __getitem__(self, key: _Dataset, /) -> Metadata: if meta := self.get(key, None): return meta @@ -194,6 +200,7 @@ def __getitem__(self, key: _Dataset, /) -> Metadata: msg = f"{key!r} does not refer to a known dataset." raise TypeError(msg) + # TODO: Evaluate which errors are now obsolete def url(self, name: _Dataset, /) -> str: if meta := self.get(name, None): if meta["suffix"] == ".parquet" and not find_spec("vegafusion"): @@ -207,6 +214,9 @@ def url(self, name: _Dataset, /) -> str: msg = f"{name!r} does not refer to a known dataset." raise TypeError(msg) + def __repr__(self) -> str: + return f"<{type(self).__name__}: {'COLLECTED' if self._mapping else 'READY'}>" + class SchemaCache(CompressedCache["_Dataset", "_FlSchema"]): """ @@ -230,8 +240,10 @@ def __init__( self, *, tp: type[MutableMapping[_Dataset, _FlSchema]] = dict["_Dataset", "_FlSchema"], + implementation: nw.Implementation = nw.Implementation.UNKNOWN, ) -> None: self._mapping: MutableMapping[_Dataset, _FlSchema] = tp() + self._implementation: nw.Implementation = implementation def read(self) -> Any: import json @@ -259,8 +271,63 @@ def by_dtype(self, name: _Dataset, *dtypes: type[DType]) -> list[str]: else: return list(match) + def is_active(self) -> bool: + return self._implementation in { + nw.Implementation.PANDAS, + nw.Implementation.PYARROW, + nw.Implementation.MODIN, + nw.Implementation.PYARROW, + } + + def schema_kwds(self, meta: Metadata, /) -> dict[str, Any]: + name: Any = meta["dataset_name"] + impl = self._implementation + if (impl.is_pandas_like() or impl.is_pyarrow()) and (self[name]): + suffix = meta["suffix"] + if impl.is_pandas_like(): + if cols := self.by_dtype(name, nw.Date, nw.Datetime): + if suffix == ".json": + return {"convert_dates": cols} + elif suffix in {".csv", ".tsv"}: + return {"parse_dates": cols} + else: + schema = self.schema_pyarrow(name) + if suffix in {".csv", ".tsv"}: + from pyarrow.csv import ConvertOptions + + return {"convert_options": ConvertOptions(column_types=schema)} # pyright: ignore[reportCallIssue] + elif suffix == ".parquet": + return {"schema": schema} + + return {} + + def schema(self, name: _Dataset, /) -> Mapping[str, DType]: + return { + column: _FIELD_TO_DTYPE[tp_str]() for column, tp_str in self[name].items() + } + + # TODO: Open an issue in ``narwhals`` to try and get a public api for type conversion + def schema_pyarrow(self, name: _Dataset, /): + schema = 
self.schema(name) + if schema: + from narwhals._arrow.utils import narwhals_to_native_dtype + from narwhals.utils import Version -class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): + m = {k: narwhals_to_native_dtype(v, Version.V1) for k, v in schema.items()} + else: + m = {} + return nw.dependencies.get_pyarrow().schema(m) + + +class _SupportsScanMetadata(Protocol): + _opener: ClassVar[OpenerDirector] + + def _scan_metadata( + self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] + ) -> nw.LazyFrame: ... + + +class DatasetCache: """Opt-out caching of remote dataset requests.""" _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" @@ -268,8 +335,8 @@ class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): Path(os.environ.get("XDG_CACHE_HOME", Path.home() / ".cache")) / "altair" ).resolve() - def __init__(self, reader: _Reader[IntoDataFrameT, IntoFrameT], /) -> None: - self._rd: _Reader[IntoDataFrameT, IntoFrameT] = reader + def __init__(self, reader: _SupportsScanMetadata, /) -> None: + self._rd: _SupportsScanMetadata = reader def clear(self) -> None: """Delete all previously cached datasets.""" @@ -308,10 +375,24 @@ def download_all(self) -> None: return None print(f"Downloading {len(frame)} missing datasets...") for meta in _iter_metadata(frame): - self._rd._download(meta["url"], self.path / (meta["sha"] + meta["suffix"])) + self._download_one(meta["url"], self.path_meta(meta)) print("Finished downloads") return None + def _maybe_download(self, meta: Metadata, /) -> Path: + fp = self.path_meta(meta) + return ( + fp + if (fp.exists() and fp.stat().st_size) + else self._download_one(meta["url"], fp) + ) + + def _download_one(self, url: str, fp: Path, /) -> Path: + with self._rd._opener.open(url) as f: + fp.touch() + fp.write_bytes(f.read()) + return fp + @property def path(self) -> Path: """ @@ -354,6 +435,9 @@ def path(self, source: StrPath | None, /) -> None: else: os.environ[self._ENV_VAR] = "" + def path_meta(self, meta: Metadata, /) -> Path: + return self.path / (meta["sha"] + meta["suffix"]) + def __iter__(self) -> Iterator[Path]: yield from self.path.iterdir() diff --git a/altair/datasets/_constraints.py b/altair/datasets/_constraints.py new file mode 100644 index 000000000..e5eaa3b97 --- /dev/null +++ b/altair/datasets/_constraints.py @@ -0,0 +1,115 @@ +"""Set-like guards for matching metadata to an implementation.""" + +from __future__ import annotations + +from collections.abc import Set +from itertools import chain +from typing import TYPE_CHECKING, Any + +from narwhals.stable import v1 as nw + +if TYPE_CHECKING: + import sys + from collections.abc import Iterable, Iterator + + from altair.datasets._typing import Metadata + + if sys.version_info >= (3, 12): + from typing import Unpack + else: + from typing_extensions import Unpack + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + +__all__ = [ + "Items", + "MetaIs", + "is_arrow", + "is_csv", + "is_json", + "is_meta", + "is_not_tabular", + "is_parquet", + "is_spatial", + "is_tsv", +] + +Items: TypeAlias = Set[tuple[str, Any]] + + +class MetaIs(Set[tuple[str, Any]]): + _requires: frozenset[tuple[str, Any]] + + def __init__(self, kwds: frozenset[tuple[str, Any]], /) -> None: + object.__setattr__(self, "_requires", kwds) + + @classmethod + def from_metadata(cls, meta: Metadata, /) -> MetaIs: + return cls(frozenset(meta.items())) + + def to_metadata(self) -> Metadata: + if TYPE_CHECKING: + + def collect(**kwds: Unpack[Metadata]) -> Metadata: 
+ return kwds + + return collect(**dict(self)) + return dict(self) + + def to_expr(self) -> nw.Expr: + return nw.all_horizontal(nw.col(name) == val for name, val in self) + + def isdisjoint(self, other: Iterable[Any]) -> bool: + return super().isdisjoint(other) + + def issubset(self, other: Iterable[Any]) -> bool: + return self._requires.issubset(other) + + def __call__(self, meta: Items, /) -> bool: + return self._requires <= meta + + def __hash__(self) -> int: + return hash(self._requires) + + def __contains__(self, x: object) -> bool: + return self._requires.__contains__(x) + + def __iter__(self) -> Iterator[tuple[str, Any]]: + yield from self._requires + + def __len__(self) -> int: + return self._requires.__len__() + + def __setattr__(self, name: str, value: Any): + msg = ( + f"{type(self).__name__!r} is immutable.\n" + f"Could not assign self.{name} = {value}" + ) + raise TypeError(msg) + + def __repr__(self) -> str: + items = dict(self) + if not items: + contents = "" + elif suffix := items.pop("suffix", None): + contents = ", ".join( + chain([f"'*{suffix}'"], (f"{k}={v!r}" for k, v in items.items())) + ) + else: + contents = ", ".join(f"{k}={v!r}" for k, v in items.items()) + return f"is_meta({contents})" + + +def is_meta(**kwds: Unpack[Metadata]) -> MetaIs: + return MetaIs.from_metadata(kwds) + + +is_csv = is_meta(suffix=".csv") +is_json = is_meta(suffix=".json") +is_tsv = is_meta(suffix=".tsv") +is_arrow = is_meta(suffix=".arrow") +is_parquet = is_meta(suffix=".parquet") +is_spatial = is_meta(is_spatial=True) +is_not_tabular = is_meta(is_tabular=False) diff --git a/altair/datasets/_exceptions.py b/altair/datasets/_exceptions.py index 36dba27ef..2f9c13d45 100644 --- a/altair/datasets/_exceptions.py +++ b/altair/datasets/_exceptions.py @@ -5,7 +5,7 @@ if TYPE_CHECKING: from collections.abc import Sequence - from altair.datasets._readers import _Backend + from altair.datasets._reader import _Backend from altair.datasets._typing import Metadata @@ -26,6 +26,19 @@ def from_url(cls, meta: Metadata, /) -> AltairDatasetsError: raise NotImplementedError(msg) return cls(msg) + @classmethod + def from_tabular(cls, meta: Metadata, backend_name: str, /) -> AltairDatasetsError: + install_other = None + mid = "\n" + if not meta["is_image"] and not meta["is_tabular"]: + install_other = "polars" + if meta["is_spatial"]: + mid = f"Geospatial data is not supported natively by {backend_name!r}." + elif meta["is_json"]: + mid = f"Non-tabular json is not supported natively by {backend_name!r}." 
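A small interactive sketch of how the set-like guards added in ``_constraints.py`` behave (the metadata dict is illustrative, not a real datapackage entry)::

    from altair.datasets._constraints import is_csv, is_meta

    meta = {"dataset_name": "example", "suffix": ".csv", "is_spatial": False}
    is_csv(meta.items())                                    # True
    is_meta(suffix=".csv", is_spatial=False)(meta.items())  # True
    is_meta(suffix=".json")(meta.items())                   # False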
+ msg = f"{_failed_tabular(meta)}{mid}{_suggest_url(meta, install_other)}" + return cls(msg) + @classmethod def from_priority(cls, priority: Sequence[_Backend], /) -> AltairDatasetsError: msg = f"Found no supported backend, searched:\n{priority!r}" @@ -33,12 +46,12 @@ def from_priority(cls, priority: Sequence[_Backend], /) -> AltairDatasetsError: def module_not_found( - backend_name: str, reqs: str | tuple[str, ...], missing: str + backend_name: str, reqs: Sequence[str], missing: str ) -> ModuleNotFoundError: - if isinstance(reqs, tuple): - depends = ", ".join(f"{req!r}" for req in reqs) + " packages" + if len(reqs) == 1: + depends = f"{reqs[0]!r} package" else: - depends = f"{reqs!r} package" + depends = ", ".join(f"{req!r}" for req in reqs) + " packages" msg = ( f"Backend {backend_name!r} requires the {depends}, but {missing!r} could not be found.\n" f"This can be installed with pip using:\n" @@ -49,29 +62,6 @@ def module_not_found( return ModuleNotFoundError(msg, name=missing) -def image(meta: Metadata, /) -> AltairDatasetsError: - msg = f"{_failed_tabular(meta)}\n{_suggest_url(meta)}" - return AltairDatasetsError(msg) - - -def geospatial(meta: Metadata, backend_name: str) -> NotImplementedError: - msg = ( - f"{_failed_tabular(meta)}" - f"Geospatial data is not supported natively by {backend_name!r}." - f"{_suggest_url(meta, 'polars')}" - ) - return NotImplementedError(msg) - - -def non_tabular_json(meta: Metadata, backend_name: str) -> NotImplementedError: - msg = ( - f"{_failed_tabular(meta)}" - f"Non-tabular json is not supported natively by {backend_name!r}." - f"{_suggest_url(meta, 'polars')}" - ) - return NotImplementedError(msg) - - def _failed_url(meta: Metadata, /) -> str: return f"Unable to load {meta['file_name']!r} via url.\n" @@ -87,3 +77,35 @@ def _suggest_url(meta: Metadata, install_other: str | None = None) -> str: " from altair.datasets import url\n" f" url({meta['dataset_name']!r})" ) + + +# TODO: +# - Use `AltairDatasetsError` +# - Remove notes from doc +# - Improve message and how data is selected +def implementation_not_found(meta: Metadata, /) -> NotImplementedError: + """ + Search finished without finding a *declared* incompatibility. + + Notes + ----- + - New kind of error + - Previously, every backend had a function assigned + - But they might not all work + - Now, only things that are known to be widely safe are added + - Should probably suggest using a pre-defined backend that supports everything + - What can reach here? 
+ - `is_image` (all) + - `"pandas"` (using inference wont trigger these) + - `.arrow` (w/o `pyarrow`) + - `.parquet` (w/o either `pyarrow` or `fastparquet`) + """ + INDENT = " " * 4 + record = f",\n{INDENT}".join( + f"{k}={v!r}" + for k, v in meta.items() + if not (k.startswith(("is_", "sha", "bytes", "has_"))) + or (v is True and k.startswith("is_")) + ) + msg = f"Found no implementation that supports:\n{INDENT}{record}" + return NotImplementedError(msg) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 8f13ab2de..9b55daf70 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -2,9 +2,10 @@ from typing import TYPE_CHECKING, Generic, final, overload -from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT +from narwhals.stable.v1.typing import IntoDataFrameT -from altair.datasets._readers import _Reader, backend +from altair.datasets import _reader +from altair.datasets._reader import IntoFrameT if TYPE_CHECKING: import sys @@ -13,14 +14,16 @@ import pandas as pd import polars as pl import pyarrow as pa + from narwhals.stable import v1 as nw from altair.datasets._cache import DatasetCache + from altair.datasets._reader import Reader if sys.version_info >= (3, 11): - from typing import LiteralString + from typing import LiteralString, Self else: - from typing_extensions import LiteralString - from altair.datasets._readers import _Backend + from typing_extensions import LiteralString, Self + from altair.datasets._reader import _Backend from altair.datasets._typing import Dataset, Extension @@ -43,7 +46,7 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): https://github.com/vega/vega-datasets """ - _reader: _Reader[IntoDataFrameT, IntoFrameT] + _reader: Reader[IntoDataFrameT, IntoFrameT] @overload @classmethod @@ -55,16 +58,18 @@ def from_backend( @classmethod def from_backend( cls, backend_name: Literal["pandas", "pandas[pyarrow]"], / - ) -> Loader[pd.DataFrame, pd.DataFrame]: ... + ) -> Loader[pd.DataFrame, nw.LazyFrame]: ... @overload @classmethod def from_backend( cls, backend_name: Literal["pyarrow"], / - ) -> Loader[pa.Table, pa.Table]: ... + ) -> Loader[pa.Table, nw.LazyFrame]: ... @classmethod - def from_backend(cls, backend_name: _Backend = "polars", /) -> Loader[Any, Any]: + def from_backend( + cls: type[Loader[Any, Any]], backend_name: _Backend = "polars", / + ) -> Loader[Any, Any]: """ Initialize a new loader, with the specified backend. @@ -128,8 +133,12 @@ def from_backend(cls, backend_name: _Backend = "polars", /) -> Loader[Any, Any]: .. _JSON format not supported: https://arrow.apache.org/docs/python/json.html#reading-json-files """ - obj = Loader.__new__(Loader) - obj._reader = backend(backend_name) + return cls.from_reader(_reader._from_backend(backend_name)) + + @classmethod + def from_reader(cls, reader: Reader[IntoDataFrameT, IntoFrameT], /) -> Self: + obj = cls.__new__(cls) + obj._reader = reader return obj def __call__( @@ -278,7 +287,7 @@ def url( return self._reader.url(name, suffix) @property - def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: + def cache(self) -> DatasetCache: """ Caching of remote dataset requests. 
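For reference, the new hook makes the two construction paths interchangeable; a sketch using the private helper shown above (internal API, subject to change)::

    from altair.datasets import Loader, _reader

    # equivalent to Loader.from_backend("polars"), routed through from_reader
    load = Loader.from_reader(_reader._from_backend("polars"))
    load("cars")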
@@ -361,12 +370,9 @@ def __call__( def __getattr__(name): if name == "load": - from altair.datasets._readers import infer_backend - - reader = infer_backend() + reader = _reader.infer_backend() global load - load = _Load.__new__(_Load) - load._reader = reader + load = _Load.from_reader(reader) return load else: msg = f"module {__name__!r} has no attribute {name!r}" diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py new file mode 100644 index 000000000..eacc516ba --- /dev/null +++ b/altair/datasets/_reader.py @@ -0,0 +1,540 @@ +""" +Backend for ``alt.datasets.Loader``. + +Notes +----- +Extending would be more ergonomic if `read`, `scan`, `_constraints` were available under a single export:: + + from altair.datasets import ext, reader + import polars as pl + + impls = ( + ext.read(pl.read_parquet, ext.is_parquet), + ext.read(pl.read_csv, ext.is_csv), + ext.read(pl.read_json, ext.is_json), + ) + user_reader = reader(impls) + user_reader.dataset("airports") +""" + +from __future__ import annotations + +from collections import Counter +from collections.abc import Mapping +from importlib import import_module +from importlib.util import find_spec +from itertools import chain +from pathlib import Path +from typing import TYPE_CHECKING, Any, ClassVar, Generic, Literal, cast, overload +from urllib.request import build_opener as _build_opener + +from narwhals.stable import v1 as nw +from narwhals.stable.v1.typing import IntoDataFrameT, IntoExpr +from packaging.requirements import Requirement + +from altair.datasets import _readimpl +from altair.datasets._cache import CsvCache, DatasetCache, SchemaCache, _iter_metadata +from altair.datasets._constraints import is_parquet +from altair.datasets._exceptions import ( + AltairDatasetsError, + implementation_not_found, + module_not_found, +) +from altair.datasets._readimpl import IntoFrameT, is_available +from altair.datasets._typing import EXTENSION_SUFFIXES + +if TYPE_CHECKING: + import sys + from collections.abc import Callable, Sequence + from urllib.request import OpenerDirector + + import pandas as pd + import polars as pl + import pyarrow as pa + + from altair.datasets._readimpl import BaseImpl, R, ReadImpl, ScanImpl + from altair.datasets._typing import Dataset, Extension, Metadata + from altair.vegalite.v5.schema._typing import OneOrSeq + + if sys.version_info >= (3, 13): + from typing import TypeIs, TypeVar + else: + from typing_extensions import TypeIs, TypeVar + if sys.version_info >= (3, 12): + from typing import Unpack + else: + from typing_extensions import Unpack + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + _Polars: TypeAlias = Literal["polars"] + _Pandas: TypeAlias = Literal["pandas"] + _PyArrow: TypeAlias = Literal["pyarrow"] + _PandasAny: TypeAlias = Literal[_Pandas, "pandas[pyarrow]"] + _Backend: TypeAlias = Literal[_Polars, _PandasAny, _PyArrow] + _CuDF: TypeAlias = Literal["cudf"] + _Dask: TypeAlias = Literal["dask"] + _DuckDB: TypeAlias = Literal["duckdb"] + _Ibis: TypeAlias = Literal["ibis"] + _PySpark: TypeAlias = Literal["pyspark"] + _NwSupport: TypeAlias = Literal[ + _Polars, _Pandas, _PyArrow, _CuDF, _Dask, _DuckDB, _Ibis, _PySpark + ] + _NwSupportT = TypeVar( + "_NwSupportT", + _Polars, + _Pandas, + _PyArrow, + _CuDF, + _Dask, + _DuckDB, + _Ibis, + _PySpark, + ) + + +class Reader(Generic[IntoDataFrameT, IntoFrameT]): + 
""" + Modular file reader, targeting remote & local tabular resources. + + .. warning:: + Use ``reader(...)`` instead of instantiating ``Reader`` directly. + """ + + # TODO: Docs + _read: Sequence[ReadImpl[IntoDataFrameT]] + """Eager file read functions.""" + + # TODO: Docs + _scan: Sequence[ScanImpl[IntoFrameT]] + """ + *Optionally*-lazy file read/scan functions. + + Used exclusively for ``metadata.parquet``. + + Currently ``"polars"`` is the only lazy option. + All others defer to the eager variant. + """ + + _name: str + """ + Used in error messages, repr and matching ``@overload``(s). + + Otherwise, has no concrete meaning. + """ + + _implementation: nw.Implementation + """ + Corresponding `narwhals implementation`_. + + .. _narwhals implementation: + https://github.com/narwhals-dev/narwhals/blob/9b6a355530ea46c590d5a6d1d0567be59c0b5742/narwhals/utils.py#L61-L290 + """ + + _opener: ClassVar[OpenerDirector] = _build_opener() + _metadata_path: ClassVar[Path] = ( + Path(__file__).parent / "_metadata" / "metadata.parquet" + ) + + def __init__( + self, + read: Sequence[ReadImpl[IntoDataFrameT]], + scan: Sequence[ScanImpl[IntoFrameT]], + name: str, + implementation: nw.Implementation, + ) -> None: + self._read = read + self._scan = scan + self._name = name + self._implementation = implementation + self._schema_cache = SchemaCache(implementation=implementation) + + # TODO: Finish working on presentation + # - The contents of both are functional + def profile(self, mode: Literal["any", "each"]): + """ + Describe which datasets/groups are supported. + + Focusing on actual datasets, rather than describing wrapped functions (repr) + + .. note:: + Having this public to make testing easier (``tests.test_datasets.is_polars_backed_pyarrow``) + """ + if mode == "any": + relevant_columns = set( + chain.from_iterable(impl._relevant_columns for impl in self._read) + ) + frame = self._scan_metadata().select("dataset_name", *relevant_columns) + it = (impl._include_expr for impl in self._read) + # BUG: ``narwhals`` raises a ``ValueError`` when ``__invert__``-ing a previously used Expr? 
+ # - Can't reproduce trivially + # - Doesnt seem to be related to genexp + inc_expr = nw.any_horizontal(*it) + include = _dataset_names(frame, inc_expr) + exclude = _dataset_names(frame, ~nw.col("dataset_name").is_in(include)) + return {"include": include, "exclude": exclude} + elif mode == "each": + # FIXME: Rough draft of how to group results + # - Don't really want a nested dict + m = {} + frame = self._scan_metadata() + for impl in self._read: + name = impl._contents + m[name] = {"include": _dataset_names(frame, impl._include_expr)} + if impl.exclude: + m[name].update(exclude=_dataset_names(frame, impl._exclude_expr)) + return m + else: + msg = f"Unexpected {mode=}" + raise TypeError(msg) + + def __repr__(self) -> str: + from textwrap import indent + + PREFIX = " " * 4 + NL = "\n" + body = f"read\n{indent(NL.join(el._contents for el in self._read), PREFIX)}" + if self._scan: + body += ( + f"\nscan\n{indent(NL.join(el._contents for el in self._scan), PREFIX)}" + ) + return f"Reader[{self._name}] {self._implementation!r}\n{body}" + + def read_fn(self, meta: Metadata, /) -> Callable[..., IntoDataFrameT]: + return self._solve(meta, self._read) + + def scan_fn(self, meta: Metadata | Path | str, /) -> Callable[..., IntoFrameT]: + meta = meta if isinstance(meta, Mapping) else {"suffix": _into_suffix(meta)} + return self._solve(meta, self._scan) + + @property + def cache(self) -> DatasetCache: + return DatasetCache(self) + + def dataset( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + **kwds: Any, + ) -> IntoDataFrameT: + frame = self._query(name, suffix) + meta = next(_iter_metadata(frame)) + fn = self.read_fn(meta) + fn_kwds = self._merge_kwds(meta, kwds) + if self.cache.is_active(): + fp = self.cache._maybe_download(meta) + return fn(fp, **fn_kwds) + else: + with self._opener.open(meta["url"]) as f: + return fn(f, **fn_kwds) + + def url( + self, name: Dataset | LiteralString, suffix: Extension | None = None, / + ) -> str: + frame = self._query(name, suffix) + meta = next(_iter_metadata(frame)) + if is_parquet(meta.items()) and not is_available("vegafusion"): + raise AltairDatasetsError.from_url(meta) + url = meta["url"] + if isinstance(url, str): + return url + else: + msg = f"Expected 'str' but got {type(url).__name__!r}\nfrom {url!r}." + raise TypeError(msg) + + def _query( + self, name: Dataset | LiteralString, suffix: Extension | None = None, / + ) -> nw.DataFrame[IntoDataFrameT]: + """ + Query a tabular version of `vega-datasets/datapackage.json`_. + + Applies a filter, erroring out when no results would be returned. + + .. _vega-datasets/datapackage.json: + https://github.com/vega/vega-datasets/blob/main/datapackage.json + """ + constraints = _into_constraints(name, suffix) + frame = self._scan_metadata(**constraints).collect() + if not frame.is_empty(): + return frame + else: + msg = f"Found no results for:\n {constraints!r}" + raise ValueError(msg) + + # TODO: Docs + def _merge_kwds(self, meta: Metadata, kwds: dict[str, Any], /) -> Mapping[str, Any]: + """ + Hook to utilize ``meta`` to extend ``kwds`` with known helpful defaults. + + - User provided arguments have a higher precedence. 
+ - The keywords for schemas vary between libraries + - pandas is internally inconsistent + - By default, returns unchanged + """ + if self._schema_cache.is_active() and ( + schema := self._schema_cache.schema_kwds(meta) + ): + kwds = schema | kwds if kwds else schema + return kwds + + @property + def _metadata_frame(self) -> nw.LazyFrame: + fp = self._metadata_path + return nw.from_native(self.scan_fn(fp)(fp)).lazy() + + def _scan_metadata( + self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] + ) -> nw.LazyFrame: + if predicates or constraints: + return self._metadata_frame.filter(*predicates, **constraints) + return self._metadata_frame + + # TODO: Docs + def _solve( + self, meta: Metadata, impls: Sequence[BaseImpl[R]], / + ) -> Callable[..., R]: + """ + Return the first function meeting constraints of meta. + + Notes + ----- + - Iterate over impls + - Each one can either match or signal an error + - An error blocks any additional checking + - Both include & exclude + - Uses ``ItemsView`` to support set ops + - `meta` isn't iterated over + - Leaves the door open for caching the search space + """ + items = meta.items() + it = (some for impl in impls if (some := impl.unwrap_or(items))) + if fn_or_err := next(it, None): + if _is_err(fn_or_err): + raise fn_or_err.from_tabular(meta, self._name) + return fn_or_err + if meta["is_image"]: + raise AltairDatasetsError.from_tabular(meta, self._name) + raise implementation_not_found(meta) + + +# TODO: Review after finishing `profile` +# NOTE: Temp helper function for `Reader.profile` +def _dataset_names( + frame: nw.LazyFrame, + *predicates: OneOrSeq[IntoExpr], + **constraints: Unpack[Metadata], +): + return ( + frame.filter(*predicates, **constraints) + .select("dataset_name") + .collect() + .get_column("dataset_name") + .to_list() + ) + + +class _NoParquetReader(Reader[IntoDataFrameT, IntoFrameT]): + def __repr__(self) -> str: + return f"{super().__repr__()}\ncsv_cache\n {self.csv_cache!r}" + + @property + def csv_cache(self) -> CsvCache: + if not hasattr(self, "_csv_cache"): + self._csv_cache = CsvCache() + return self._csv_cache + + @property + def _metadata_frame(self) -> nw.LazyFrame: + ns = self._implementation.to_native_namespace() + data = cast("dict[str, Any]", self.csv_cache.rotated) + return nw.maybe_convert_dtypes(nw.from_dict(data, native_namespace=ns)).lazy() + + +@overload +def reader( + read_fns: Sequence[ReadImpl[IntoDataFrameT]], + scan_fns: tuple[()] = ..., + *, + name: str | None = ..., + implementation: nw.Implementation = ..., +) -> Reader[IntoDataFrameT, nw.LazyFrame]: ... + + +@overload +def reader( + read_fns: Sequence[ReadImpl[IntoDataFrameT]], + scan_fns: Sequence[ScanImpl[IntoFrameT]], + *, + name: str | None = ..., + implementation: nw.Implementation = ..., +) -> Reader[IntoDataFrameT, IntoFrameT]: ... 
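# --- Editorial aside (not part of this patch) ---------------------------------
# The overloads above give the public shape of ``reader``. A rough sketch of
# wiring one up by hand, assuming ``polars`` is installed and using the private
# helpers introduced elsewhere in this patch (subject to change):
import polars as pl

from altair.datasets._constraints import is_csv, is_parquet
from altair.datasets._reader import reader
from altair.datasets._readimpl import read

impls = (
    read(pl.read_csv, is_csv, try_parse_dates=True),  # eager ``.csv`` reader
    read(pl.read_parquet, is_parquet),  # also reused to scan ``metadata.parquet``
)
custom = reader(impls)  # ``name``/``implementation`` are inferred from the functions
airports = custom.dataset("airports")  # -> pl.DataFrame
# -------------------------------------------------------------------------------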
+ + +def reader( + read_fns: Sequence[ReadImpl[IntoDataFrameT]], + scan_fns: Sequence[ScanImpl[IntoFrameT]] = (), + *, + name: str | None = None, + implementation: nw.Implementation = nw.Implementation.UNKNOWN, +) -> Reader[IntoDataFrameT, IntoFrameT] | Reader[IntoDataFrameT, nw.LazyFrame]: + name = name or Counter(el._inferred_package for el in read_fns).most_common(1)[0][0] + if implementation is nw.Implementation.UNKNOWN: + implementation = _into_implementation(Requirement(name)) + if scan_fns: + return Reader(read_fns, scan_fns, name, implementation) + if stolen := _steal_eager_parquet(read_fns): + return Reader(read_fns, stolen, name, implementation) + else: + return _NoParquetReader[IntoDataFrameT](read_fns, (), name, implementation) + + +def infer_backend( + *, priority: Sequence[_Backend] = ("polars", "pandas[pyarrow]", "pandas", "pyarrow") +) -> Reader[Any, Any]: + """ + Return the first available reader in order of `priority`. + + Notes + ----- + - ``"polars"``: can natively load every dataset (including ``(Geo|Topo)JSON``) + - ``"pandas[pyarrow]"``: can load *most* datasets, guarantees ``.parquet`` support + - ``"pandas"``: supports ``.parquet``, if `fastparquet`_ is installed + - ``"pyarrow"``: least reliable + + .. _fastparquet: + https://github.com/dask/fastparquet + """ + it = (_from_backend(name) for name in priority if is_available(_requirements(name))) + if reader := next(it, None): + return reader + raise AltairDatasetsError.from_priority(priority) + + +@overload +def _from_backend(name: _Polars, /) -> Reader[pl.DataFrame, pl.LazyFrame]: ... +@overload +def _from_backend(name: _PandasAny, /) -> Reader[pd.DataFrame, nw.LazyFrame]: ... +@overload +def _from_backend(name: _PyArrow, /) -> Reader[pa.Table, nw.LazyFrame]: ... + + +# FIXME: The order this is defined in makes splitting the module complicated +# - Can't use a classmethod, since some result in a subclass used +def _from_backend(name: _Backend, /) -> Reader[Any, Any]: + """ + Reader initialization dispatcher. + + FIXME: Works, but defining these in mixed shape functions seems off. + """ + if not _is_backend(name): + msg = f"Unknown backend {name!r}" + raise TypeError(msg) + implementation = _into_implementation(name) + if name == "polars": + rd, sc = _readimpl.pl_only() + return reader(rd, sc, name=name, implementation=implementation) + elif name == "pandas[pyarrow]": + return reader(_readimpl.pd_pyarrow(), name=name, implementation=implementation) + elif name == "pandas": + return reader(_readimpl.pd_only(), name=name, implementation=implementation) + elif name == "pyarrow": + return reader(_readimpl.pa_any(), name=name, implementation=implementation) + + +def _is_backend(obj: Any) -> TypeIs[_Backend]: + return obj in {"polars", "pandas", "pandas[pyarrow]", "pyarrow"} + + +def _is_err(obj: Any) -> TypeIs[type[AltairDatasetsError]]: + return obj is AltairDatasetsError + + +def _into_constraints( + name: Dataset | LiteralString, suffix: Extension | None, / +) -> Metadata: + """Transform args into a mapping to column names.""" + m: Metadata = {} + if "." 
in name: + m["file_name"] = name + elif suffix is None: + m["dataset_name"] = name + elif suffix.startswith("."): + m = {"dataset_name": name, "suffix": suffix} + else: + msg = ( + f"Expected 'suffix' to be one of {EXTENSION_SUFFIXES!r},\n" + f"but got: {suffix!r}" + ) + raise TypeError(msg) + return m + + +def _into_implementation( + backend: _NwSupport | _PandasAny | Requirement, / +) -> nw.Implementation: + primary = _import_guarded(backend) + mapping: Mapping[LiteralString, nw.Implementation] = { + "polars": nw.Implementation.POLARS, + "pandas": nw.Implementation.PANDAS, + "pyarrow": nw.Implementation.PYARROW, + "cudf": nw.Implementation.CUDF, + "dask": nw.Implementation.DASK, + "duckdb": nw.Implementation.DUCKDB, + "ibis": nw.Implementation.IBIS, + "pyspark": nw.Implementation.PYSPARK, + } + if impl := mapping.get(primary): + return impl + msg = f"Package {primary!r} is not supported by `narhwals`." + raise ValueError(msg) + + +def _into_suffix(obj: Path | str, /) -> Any: + if isinstance(obj, Path): + return obj.suffix + elif isinstance(obj, str): + return obj + else: + msg = f"Unexpected type {type(obj).__name__!r}" + raise TypeError(msg) + + +def _steal_eager_parquet( + read_fns: Sequence[ReadImpl[IntoDataFrameT]], / +) -> Sequence[ScanImpl[nw.LazyFrame]] | None: + if convertable := next((rd for rd in read_fns if rd.include <= is_parquet), None): + return (convertable.to_scan_impl(),) + return None + + +@overload +def _import_guarded(req: _PandasAny, /) -> _Pandas: ... + + +@overload +def _import_guarded(req: _NwSupportT, /) -> _NwSupportT: ... + + +@overload +def _import_guarded(req: Requirement, /) -> LiteralString: ... + + +def _import_guarded(req: Any, /) -> LiteralString: + requires = _requirements(req) + for name in requires: + if spec := find_spec(name): + import_module(spec.name) + else: + raise module_not_found(str(req), requires, missing=name) + return requires[0] + + +def _requirements(req: Requirement | str, /) -> tuple[Any, ...]: + req = Requirement(req) if isinstance(req, str) else req + return (req.name, *req.extras) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py deleted file mode 100644 index a1f66dee1..000000000 --- a/altair/datasets/_readers.py +++ /dev/null @@ -1,574 +0,0 @@ -""" -Backends for ``alt.datasets.Loader``. - -- Interfacing with the cached metadata. 
- - But not updating it -- Performing requests from those urls -- Dispatching read function on file extension -""" - -from __future__ import annotations - -import urllib.request -from collections.abc import Callable, Iterable, Mapping, Sequence -from functools import partial -from importlib import import_module -from importlib.util import find_spec -from itertools import chain -from pathlib import Path -from typing import ( - TYPE_CHECKING, - Any, - ClassVar, - Final, - Literal, - Protocol, - TypeVar, - overload, -) - -import narwhals.stable.v1 as nw -from narwhals.stable.v1.typing import IntoDataFrameT, IntoExpr, IntoFrameT - -from altair.datasets import _exceptions as _ds_exc -from altair.datasets._cache import CsvCache, DatasetCache, SchemaCache, _iter_metadata -from altair.datasets._typing import EXTENSION_SUFFIXES, Metadata, is_ext_read - -if TYPE_CHECKING: - import sys - from io import IOBase - from urllib.request import OpenerDirector - - import pandas as pd - import polars as pl - import pyarrow as pa - from _typeshed import StrPath - from pyarrow.csv import read_csv as pa_read_csv # noqa: F401 - from pyarrow.feather import read_table as pa_read_feather # noqa: F401 - from pyarrow.json import read_json as pa_read_json # noqa: F401 - from pyarrow.parquet import read_table as pa_read_parquet # noqa: F401 - - if sys.version_info >= (3, 13): - from typing import TypeIs, Unpack - else: - from typing_extensions import TypeIs, Unpack - if sys.version_info >= (3, 11): - from typing import LiteralString - else: - from typing_extensions import LiteralString - if sys.version_info >= (3, 10): - from typing import TypeAlias - else: - from typing_extensions import TypeAlias - from packaging.requirements import Requirement - - from altair.datasets._typing import Dataset, Extension, Metadata - from altair.vegalite.v5.schema._typing import OneOrSeq - - _IntoSuffix: TypeAlias = "StrPath | Metadata" - _ExtensionScan: TypeAlias = Literal[".parquet"] - _T = TypeVar("_T") - - # NOTE: Using a constrained instead of bound `TypeVar` - # error: Incompatible return value type (got "DataFrame[Any] | LazyFrame[Any]", expected "FrameT") [return-value] - # - https://typing.readthedocs.io/en/latest/spec/generics.html#introduction - # - https://typing.readthedocs.io/en/latest/spec/generics.html#type-variables-with-an-upper-bound - # https://github.com/narwhals-dev/narwhals/blob/21b8436567de3631c584ef67632317ad70ae5de0/narwhals/typing.py#L59 - FrameT = TypeVar("FrameT", nw.DataFrame[Any], nw.LazyFrame) - - _Polars: TypeAlias = Literal["polars"] - _Pandas: TypeAlias = Literal["pandas"] - _PyArrow: TypeAlias = Literal["pyarrow"] - _ConcreteT = TypeVar("_ConcreteT", _Polars, _Pandas, _PyArrow) - _PandasAny: TypeAlias = Literal[_Pandas, "pandas[pyarrow]"] - _Backend: TypeAlias = Literal[_Polars, _PandasAny, _PyArrow] - - -__all__ = ["backend", "infer_backend"] - -_METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" - - -class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): - """ - Describes basic IO for remote & local tabular resources. - - Subclassing this protocol directly will provide a *mostly* complete implementation. - - Each of the following must be explicitly assigned: - - _Reader._read_fn - _Reader._scan_fn - _Reader._name - """ - - _read_fn: Mapping[Extension, Callable[..., IntoDataFrameT]] - """ - Eager file read functions. - - Each corresponds to a known file extension within ``vega-datasets``. 
- """ - - _scan_fn: Mapping[_ExtensionScan, Callable[..., IntoFrameT]] - """ - *Optionally*-lazy file read/scan functions. - - Used exclusively for ``metadata.parquet``. - - Currently ``"polars"`` is the only lazy option. - """ - - _name: LiteralString - """ - Used in error messages, repr and matching ``@overload``(s). - - Otherwise, has no concrete meaning. - """ - - _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() - - def read_fn(self, source: _IntoSuffix, /) -> Callable[..., IntoDataFrameT]: - return self._read_fn[_extract_suffix(source, is_ext_read)] - - def scan_fn(self, source: _IntoSuffix, /) -> Callable[..., IntoFrameT]: - return self._scan_fn[_extract_suffix(source, is_ext_scan)] - - def _schema_kwds(self, meta: Metadata, /) -> dict[str, Any]: - """Hook to provide additional schema metadata on read.""" - return {} - - def _maybe_fn(self, meta: Metadata, /) -> Callable[..., IntoDataFrameT]: - """Backend specific tweaks/errors/warnings, based on ``Metadata``.""" - if meta["is_image"]: - raise _ds_exc.image(meta) - return self.read_fn(meta) - - def dataset( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - **kwds: Any, - ) -> IntoDataFrameT: - df = self.query(**_extract_constraints(name, suffix)) - meta = next(_iter_metadata(df)) - fn = self._maybe_fn(meta) - url = meta["url"] - if default_kwds := self._schema_kwds(meta): - kwds = default_kwds | kwds if kwds else default_kwds - - if self.cache.is_active(): - fp = self.cache.path / (meta["sha"] + meta["suffix"]) - if not (fp.exists() and fp.stat().st_size): - self._download(url, fp) - return fn(fp, **kwds) - else: - with self._opener.open(url) as f: - return fn(f, **kwds) - - def url( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - ) -> str: - frame = self.query(**_extract_constraints(name, suffix)) - meta = next(_iter_metadata(frame)) - if meta["suffix"] == ".parquet" and not is_available("vegafusion"): - raise _ds_exc.AltairDatasetsError.from_url(meta) - url = meta["url"] - if isinstance(url, str): - return url - else: - msg = f"Expected 'str' but got {type(url).__name__!r}\nfrom {url!r}." - raise TypeError(msg) - - def query( - self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] - ) -> nw.DataFrame[IntoDataFrameT]: - """ - Query a tabular version of `vega-datasets/datapackage.json`_. - - Applies a filter, erroring out when no results would be returned. - - Notes - ----- - Arguments correspond to those seen in `pl.LazyFrame.filter`_. - - .. _vega-datasets/datapackage.json: - https://github.com/vega/vega-datasets/blob/main/datapackage.json - .. 
_pl.LazyFrame.filter: - https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html - """ - frame = self._scan_metadata(*predicates, **constraints).collect() - if not frame.is_empty(): - return frame - else: - terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) - msg = f"Found no results for:\n {terms}" - raise ValueError(msg) - - def _scan_metadata( - self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] - ) -> nw.LazyFrame: - if predicates or constraints: - return self._metadata.filter(*predicates, **constraints) - return self._metadata - - @property - def _metadata(self) -> nw.LazyFrame: - return nw.from_native(self.scan_fn(_METADATA)(_METADATA)).lazy() - - def _download(self, url: str, fp: Path, /) -> None: - with self._opener.open(url) as f: - fp.touch() - fp.write_bytes(f.read()) - - @property - def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: - return DatasetCache(self) - - def _import(self, name: str, /) -> Any: - if spec := find_spec(name): - return import_module(spec.name) - raise _ds_exc.module_not_found(self._name, _requirements(self._name), name) # type: ignore[call-overload] - - def __repr__(self) -> str: - return f"Reader[{self._name}]" - - def __init__(self, name: LiteralString, /) -> None: ... - - -class _PandasReaderBase(_Reader["pd.DataFrame", "pd.DataFrame"], Protocol): - """ - Provides temporal column names as keyword arguments on read. - - Related - ------- - - https://github.com/vega/altair/pull/3631#issuecomment-2480816377 - - https://github.com/vega/vega-datasets/pull/631 - - https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html - - https://pandas.pydata.org/docs/reference/api/pandas.read_json.html - """ - - _schema_cache: SchemaCache - - def _schema_kwds(self, meta: Metadata, /) -> dict[str, Any]: - name: Any = meta["dataset_name"] - suffix = meta["suffix"] - if cols := self._schema_cache.by_dtype(name, nw.Date, nw.Datetime): - if suffix == ".json": - return {"convert_dates": cols} - elif suffix in {".csv", ".tsv"}: - return {"parse_dates": cols} - return super()._schema_kwds(meta) - - def _maybe_fn(self, meta: Metadata, /) -> Callable[..., pd.DataFrame]: - fn = super()._maybe_fn(meta) - if meta["is_spatial"]: - raise _ds_exc.geospatial(meta, self._name) - return fn - - -class _PandasReader(_PandasReaderBase): - def __init__(self, name: _Pandas, /) -> None: - self._name = _requirements(name) - if not TYPE_CHECKING: - pd = self._import(self._name) - self._read_fn = { - ".csv": pd.read_csv, - ".json": pd.read_json, - ".tsv": partial["pd.DataFrame"](pd.read_csv, sep="\t"), - ".arrow": pd.read_feather, - ".parquet": pd.read_parquet, - } - self._scan_fn = {".parquet": pd.read_parquet} - self._supports_parquet: bool = is_available( - "pyarrow", "fastparquet", require_all=False - ) - self._csv_cache = CsvCache() - self._schema_cache = SchemaCache() - - @property - def _metadata(self) -> nw.LazyFrame: - if self._supports_parquet: - return super()._metadata - return self._csv_cache.metadata(nw.dependencies.get_pandas()) - - -class _PandasPyArrowReader(_PandasReaderBase): - def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: - _pd, _pa = _requirements(name) - self._name = name - if not TYPE_CHECKING: - pd = self._import(_pd) - pa = self._import(_pa) # noqa: F841 - - self._read_fn = { - ".csv": partial["pd.DataFrame"](pd.read_csv, dtype_backend=_pa), - ".json": partial["pd.DataFrame"](pd.read_json, dtype_backend=_pa), - ".tsv": partial["pd.DataFrame"](pd.read_csv, sep="\t", 
dtype_backend=_pa), - ".arrow": partial(pd.read_feather, dtype_backend=_pa), - ".parquet": partial(pd.read_parquet, dtype_backend=_pa), - } - self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend=_pa)} - self._schema_cache = SchemaCache() - - -def _pl_read_json_roundtrip(source: Path | IOBase, /, **kwds: Any) -> pl.DataFrame: - """ - Try to utilize better date parsing available in `pl.read_csv`_. - - `pl.read_json`_ has few options when compared to `pl.read_csv`_. - - Chaining the two together - *where possible* - is still usually faster than `pandas.read_json`_. - - .. _pl.read_json: - https://docs.pola.rs/api/python/stable/reference/api/polars.read_json.html - .. _pl.read_csv: - https://docs.pola.rs/api/python/stable/reference/api/polars.read_csv.html - .. _pandas.read_json: - https://pandas.pydata.org/docs/reference/api/pandas.read_json.html - """ - from io import BytesIO - - import polars as pl - - df = pl.read_json(source, **kwds) - if any(tp.is_nested() for tp in df.schema.dtypes()): - # NOTE: Inferred as `(Geo|Topo)JSON`, which wouldn't be supported by `read_csv` - return df - buf = BytesIO() - df.write_csv(buf) - if kwds: - SHARED_KWDS = {"schema", "schema_overrides", "infer_schema_length"} - kwds = {k: v for k, v in kwds.items() if k in SHARED_KWDS} - return pl.read_csv(buf, try_parse_dates=True, **kwds) - - -class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - def __init__(self, name: _Polars, /) -> None: - self._name = _requirements(name) - if not TYPE_CHECKING: - pl = self._import(self._name) - self._read_fn = { - ".csv": partial(pl.read_csv, try_parse_dates=True), - ".json": _pl_read_json_roundtrip, - ".tsv": partial(pl.read_csv, separator="\t", try_parse_dates=True), - ".arrow": pl.read_ipc, - ".parquet": pl.read_parquet, - } - self._scan_fn = {".parquet": pl.scan_parquet} - - -class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): - """ - Reader backed by `pyarrow.Table`_. - - Warning - ------- - **JSON**: Only supports `line-delimited`_ JSON. - Likely to raise the following error: - - ArrowInvalid: JSON parse error: Column() changed from object to array in row 0 - - .. _pyarrow.Table: - https://arrow.apache.org/docs/python/generated/pyarrow.Table.html - .. 
_line-delimited: - https://arrow.apache.org/docs/python/json.html#reading-json-files - """ - - def _maybe_fn(self, meta: Metadata, /) -> Callable[..., pa.Table]: - fn = super()._maybe_fn(meta) - if fn == self._read_json_polars: - return fn - elif meta["is_json"]: - if meta["is_tabular"]: - return self._read_json_tabular - elif meta["is_spatial"]: - raise _ds_exc.geospatial(meta, self._name) - else: - raise _ds_exc.non_tabular_json(meta, self._name) - else: - return fn - - def _read_json_tabular(self, source: Any, /, **kwds: Any) -> pa.Table: - import json - - if not isinstance(source, Path): - obj = json.load(source) - else: - with Path(source).open(encoding="utf-8") as f: - obj = json.load(f) - pa = nw.dependencies.get_pyarrow() - return pa.Table.from_pylist(obj) - - def _read_json_polars(self, source: Any, /, **kwds: Any) -> pa.Table: - return _pl_read_json_roundtrip(source).to_arrow() - - def __init__(self, name: _PyArrow, /) -> None: - self._name = _requirements(name) - if not TYPE_CHECKING: - pa = self._import(self._name) # noqa: F841 - pa_read_csv = self._import(f"{self._name}.csv").read_csv - pa_read_feather = self._import(f"{self._name}.feather").read_table - pa_read_parquet = self._import(f"{self._name}.parquet").read_table - - # NOTE: Prefer `polars` since it is zero-copy and fast - if find_spec("polars") is not None: - pa_read_json = self._read_json_polars - else: - pa_read_json = self._import(f"{self._name}.json").read_json - - # NOTE: Stubs suggest using a dataclass, but no way to construct it - tab_sep: Any = {"delimiter": "\t"} - - self._read_fn = { - ".csv": pa_read_csv, - ".json": pa_read_json, - ".tsv": partial(pa_read_csv, parse_options=tab_sep), - ".arrow": pa_read_feather, - ".parquet": pa_read_parquet, - } - self._scan_fn = {".parquet": pa_read_parquet} - - -def _extract_constraints( - name: Dataset | LiteralString, suffix: Extension | None, / -) -> Metadata: - """Transform args into a mapping to column names.""" - constraints: Metadata = {} - if name.endswith(EXTENSION_SUFFIXES): - fp = Path(name) - constraints["dataset_name"] = fp.stem - constraints["suffix"] = fp.suffix - return constraints - elif suffix is not None: - if not is_ext_read(suffix): - msg = ( - f"Expected 'suffix' to be one of {EXTENSION_SUFFIXES!r},\n" - f"but got: {suffix!r}" - ) - raise TypeError(msg) - else: - constraints["suffix"] = suffix - constraints["dataset_name"] = name - return constraints - - -def _extract_suffix(source: _IntoSuffix, guard: Callable[..., TypeIs[_T]], /) -> _T: - suffix: Any = ( - Path(source).suffix if not isinstance(source, Mapping) else source["suffix"] - ) - if guard(suffix): - return suffix - else: - msg = f"Unexpected file extension {suffix!r}, from:\n{source}" - raise TypeError(msg) - - -def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]: - return suffix == ".parquet" - - -def is_available( - pkg_names: str | Iterable[str], *more_pkg_names: str, require_all: bool = True -) -> bool: - """ - Check for importable package(s), without raising on failure. - - Parameters - ---------- - pkg_names, more_pkg_names - One or more packages. - require_all - * ``True`` every package. - * ``False`` at least one package. 
- """ - if not more_pkg_names and isinstance(pkg_names, str): - return find_spec(pkg_names) is not None - pkgs_names = pkg_names if not isinstance(pkg_names, str) else (pkg_names,) - names = chain(pkgs_names, more_pkg_names) - fn = all if require_all else any - return fn(find_spec(name) is not None for name in names) - - -def infer_backend( - *, priority: Sequence[_Backend] = ("polars", "pandas[pyarrow]", "pandas", "pyarrow") -) -> _Reader[Any, Any]: - """ - Return the first available reader in order of `priority`. - - Notes - ----- - - ``"polars"``: can natively load every dataset (including ``(Geo|Topo)JSON``) - - ``"pandas[pyarrow]"``: can load *most* datasets, guarantees ``.parquet`` support - - ``"pandas"``: supports ``.parquet``, if `fastparquet`_ is installed - - ``"pyarrow"``: least reliable - - .. _fastparquet: - https://github.com/dask/fastparquet - """ - it = (backend(name) for name in priority if is_available(_requirements(name))) - if reader := next(it, None): - return reader - raise _ds_exc.AltairDatasetsError.from_priority(priority) - - -@overload -def backend(name: _Polars, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... - - -@overload -def backend(name: _PandasAny, /) -> _Reader[pd.DataFrame, pd.DataFrame]: ... - - -@overload -def backend(name: _PyArrow, /) -> _Reader[pa.Table, pa.Table]: ... - - -def backend(name: _Backend, /) -> _Reader[Any, Any]: - """Reader initialization dispatcher.""" - if name == "polars": - return _PolarsReader(name) - elif name == "pandas[pyarrow]": - return _PandasPyArrowReader(name) - elif name == "pandas": - return _PandasReader(name) - elif name == "pyarrow": - return _PyArrowReader(name) - elif name in {"ibis", "cudf", "dask", "modin"}: - msg = "Supported by ``narwhals``, not investigated yet" - raise NotImplementedError(msg) - else: - msg = f"Unknown backend {name!r}" - raise TypeError(msg) - - -@overload -def _requirements(s: _ConcreteT, /) -> _ConcreteT: ... - - -@overload -def _requirements(s: Literal["pandas[pyarrow]"], /) -> tuple[_Pandas, _PyArrow]: ... 
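# --- Editorial aside (not part of this patch) ---------------------------------
# Both the removed ``_requirements`` below and its replacement in ``_reader.py``
# lean on ``packaging.requirements.Requirement`` to split an extras specifier
# into importable package names, roughly:
from packaging.requirements import Requirement

req = Requirement("pandas[pyarrow]")
assert req.name == "pandas"
assert req.extras == {"pyarrow"}
# the new helper returns ``(req.name, *req.extras)`` -> ("pandas", "pyarrow")
# -------------------------------------------------------------------------------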
- - -def _requirements(s: Any, /) -> Any: - concrete: set[Literal[_Polars, _Pandas, _PyArrow]] = {"polars", "pandas", "pyarrow"} - if s in concrete: - return s - else: - from packaging.requirements import Requirement - - req = Requirement(s) - supports_extras: set[Literal[_Pandas]] = {"pandas"} - if req.name in supports_extras and req.extras == {"pyarrow"}: - return req.name, "pyarrow" - return _requirements_unknown(req) - - -def _requirements_unknown(req: Requirement | str, /) -> Any: - from packaging.requirements import Requirement - - req = Requirement(req) if isinstance(req, str) else req - return (req.name, *req.extras) diff --git a/altair/datasets/_readimpl.py b/altair/datasets/_readimpl.py new file mode 100644 index 000000000..119352db5 --- /dev/null +++ b/altair/datasets/_readimpl.py @@ -0,0 +1,414 @@ +"""Individual read functions and siuations they support.""" + +from __future__ import annotations + +import sys +from enum import Enum +from functools import partial, wraps +from importlib.util import find_spec +from itertools import chain +from operator import itemgetter +from pathlib import Path +from typing import TYPE_CHECKING, Any, Generic, Literal + +from narwhals.stable import v1 as nw +from narwhals.stable.v1.dependencies import get_pandas, get_polars +from narwhals.stable.v1.typing import IntoDataFrameT + +from altair.datasets._constraints import ( + is_arrow, + is_csv, + is_json, + is_meta, + is_not_tabular, + is_parquet, + is_spatial, + is_tsv, +) +from altair.datasets._exceptions import AltairDatasetsError + +if sys.version_info >= (3, 13): + from typing import TypeVar +else: + from typing_extensions import TypeVar + +if TYPE_CHECKING: + from collections.abc import Callable, Iterable, Iterator, Sequence + from io import IOBase + from types import ModuleType + + import pandas as pd + import polars as pl + import pyarrow as pa + from narwhals.stable.v1 import typing as nwt + + from altair.datasets._constraints import Items, MetaIs + +__all__ = ["is_available", "pa_any", "pd_only", "pd_pyarrow", "pl_only", "read", "scan"] + +R = TypeVar("R") +IntoFrameT = TypeVar( + "IntoFrameT", + bound="nwt.NativeFrame | nw.DataFrame[Any] | nw.LazyFrame | nwt.DataFrameLike", + default=nw.LazyFrame, +) + + +class Skip(Enum): + """Falsy sentinel.""" + + skip = 0 + + def __bool__(self) -> Literal[False]: + return False + + def __repr__(self) -> Literal[""]: + return "" + + +class BaseImpl(Generic[R]): + fn: Callable[..., R] + """Wrapped read function.""" + include: MetaIs + """Passing this makes ``fn`` a candidate.""" + exclude: MetaIs + """Passing this overrides ``include``, transforming into an error.""" + + def __init__( + self, + fn: Callable[..., R], + include: MetaIs, + exclude: MetaIs | None, + kwds: dict[str, Any], + /, + ) -> None: + exclude = exclude or self._exclude_none() + if not include.isdisjoint(exclude): + intersection = ", ".join(f"{k}={v!r}" for k, v in include & exclude) + msg = f"Constraints overlap at: `{intersection}`\ninclude={include!r}\nexclude={exclude!r}" + raise TypeError(msg) + object.__setattr__(self, "fn", partial(fn, **kwds) if kwds else fn) + object.__setattr__(self, "include", include) + object.__setattr__(self, "exclude", exclude) + + # TODO: Consider renaming + # NOTE: + # - Fn means call it + # - Err means raise it + # - Skip means its safe to check other impls + def unwrap_or( + self, meta: Items, / + ) -> Callable[..., R] | type[AltairDatasetsError] | Skip: + if self.include.issubset(meta): + return self.fn if self.exclude.isdisjoint(meta) else 
AltairDatasetsError + return Skip.skip + + @classmethod + def _exclude_none(cls) -> MetaIs: + return is_meta() + + def __setattr__(self, name: str, value: Any): + msg = ( + f"{type(self).__name__!r} is immutable.\n" + f"Could not assign self.{name} = {value}" + ) + raise TypeError(msg) + + @property + def _inferred_package(self) -> str: + return _root_package_name(_unwrap_partial(self.fn), "UNKNOWN") + + def __repr__(self) -> str: + tp_name = f"{type(self).__name__}[{self._inferred_package}?]" + return f"{tp_name}({self._contents})" + + # TODO: Consider renaming + @property + def _contents(self) -> str: + if isinstance(self.fn, partial): + fn = _unwrap_partial(self.fn) + it = (f"{k}={v!r}" for k, v in self.fn.keywords.items()) + fn_repr = f"{fn.__name__}(..., {', '.join(it)})" + else: + fn_repr = f"{self.fn.__name__}(...)" + if self.exclude: + params = f"include={self.include!r}, exclude={self.exclude!r}" + else: + params = repr(self.include) + return f"{fn_repr}, {params}" + + @property + def _relevant_columns(self) -> Iterator[str]: + name = itemgetter(0) + yield from (name(obj) for obj in chain(self.include, self.exclude)) + + @property + def _include_expr(self) -> nw.Expr: + return ( + self.include.to_expr() & ~self.exclude.to_expr() + if self.exclude + else self.include.to_expr() + ) + + @property + def _exclude_expr(self) -> nw.Expr: + if self.exclude: + return self.include.to_expr() & self.exclude.to_expr() + msg = f"Unable to generate an exclude expression without setting exclude\n\n{self!r}" + raise TypeError(msg) + + +def _unwrap_partial(fn: Any, /) -> Any: + # NOTE: ``functools._unwrap_partial`` + func = fn + while isinstance(func, partial): + func = func.func + return func + + +class ScanImpl(BaseImpl[IntoFrameT]): ... + + +class ReadImpl(BaseImpl[IntoDataFrameT]): + def to_scan_impl(self) -> ScanImpl[nw.LazyFrame]: + return ScanImpl(_into_scan_fn(self.fn), self.include, self.exclude, {}) + + +def _into_scan_fn(fn: Callable[..., IntoDataFrameT], /) -> Callable[..., nw.LazyFrame]: + @wraps(_unwrap_partial(fn)) + def wrapper(*args: Any, **kwds: Any) -> nw.LazyFrame: + return nw.from_native(fn(*args, **kwds)).lazy() + + return wrapper + + +def _root_package_name(obj: Any, default: str, /) -> str: + # NOTE: Defers importing `inspect`, if we can get the module name + if hasattr(obj, "__module__"): + return obj.__module__.split(".")[0] + else: + from inspect import getmodule + + module = getmodule(obj) + if module and (pkg := module.__package__): + return pkg.split(".")[0] + return default + + +def is_available( + pkg_names: str | Iterable[str], *more_pkg_names: str, require_all: bool = True +) -> bool: + """ + Check for importable package(s), without raising on failure. + + Parameters + ---------- + pkg_names, more_pkg_names + One or more packages. + require_all + * ``True`` every package. + * ``False`` at least one package. 
+ """ + if not more_pkg_names and isinstance(pkg_names, str): + return find_spec(pkg_names) is not None + pkgs_names = pkg_names if not isinstance(pkg_names, str) else (pkg_names,) + names = chain(pkgs_names, more_pkg_names) + fn = all if require_all else any + return fn(find_spec(name) is not None for name in names) + + +def read( + fn: Callable[..., IntoDataFrameT], + /, + include: MetaIs, + exclude: MetaIs | None = None, + **kwds: Any, +) -> ReadImpl[IntoDataFrameT]: + return ReadImpl(fn, include, exclude, kwds) + + +def scan( + fn: Callable[..., IntoFrameT], + /, + include: MetaIs, + exclude: MetaIs | None = None, + **kwds: Any, +) -> ScanImpl[IntoFrameT]: + return ScanImpl(fn, include, exclude, kwds) + + +def pl_only() -> tuple[ + Sequence[ReadImpl[pl.DataFrame]], Sequence[ScanImpl[pl.LazyFrame]] +]: + import polars as pl + + read_fns = ( + read(pl.read_csv, is_csv, try_parse_dates=True), + read(_pl_read_json_roundtrip(get_polars()), is_json), + read(pl.read_csv, is_tsv, separator="\t", try_parse_dates=True), + read(pl.read_ipc, is_arrow), + read(pl.read_parquet, is_parquet), + ) + scan_fns = (scan(pl.scan_parquet, is_parquet),) + return read_fns, scan_fns + + +def pd_only() -> Sequence[ReadImpl[pd.DataFrame]]: + import pandas as pd + + opt: Sequence[ReadImpl[pd.DataFrame]] + if is_available("pyarrow"): + opt = read(pd.read_feather, is_arrow), read(pd.read_parquet, is_parquet) + elif is_available("fastparquet"): + opt = (read(pd.read_parquet, is_parquet),) + else: + opt = () + return ( + read(pd.read_csv, is_csv), + read(_pd_read_json(get_pandas()), is_json, exclude=is_spatial), + read(pd.read_csv, is_tsv, sep="\t"), + *opt, + ) + + +def pd_pyarrow() -> Sequence[ReadImpl[pd.DataFrame]]: + import pandas as pd + + kwds: dict[str, Any] = {"dtype_backend": "pyarrow"} + return ( + read(pd.read_csv, is_csv, **kwds), + read(_pd_read_json(get_pandas()), is_json, exclude=is_spatial, **kwds), + read(pd.read_csv, is_tsv, sep="\t", **kwds), + read(pd.read_feather, is_arrow, **kwds), + read(pd.read_parquet, is_parquet, **kwds), + ) + + +def pa_any() -> Sequence[ReadImpl[pa.Table]]: + from pyarrow import csv, feather, parquet + + return ( + read(csv.read_csv, is_csv), + _pa_read_json_impl(), + read(csv.read_csv, is_tsv, parse_options={"delimiter": "\t"}), + read(feather.read_table, is_arrow), + read(parquet.read_table, is_parquet), + ) + + +def _pa_read_json_impl() -> ReadImpl[pa.Table]: + """ + Mitigating ``pyarrow``'s `line-delimited`_ JSON requirement. + + .. 
_line-delimited: + https://arrow.apache.org/docs/python/json.html#reading-json-files + """ + if is_available("polars"): + return read(_pl_read_json_roundtrip_to_arrow(get_polars()), is_json) + elif is_available("pandas"): + return read(_pd_read_json_to_arrow(get_pandas()), is_json, exclude=is_spatial) + return read(_stdlib_read_json_to_arrow, is_json, exclude=is_not_tabular) + + +def _pd_read_json(ns: ModuleType, /) -> Callable[..., pd.DataFrame]: + @wraps(ns.read_json) + def fn(source: Path | Any, /, **kwds: Any) -> pd.DataFrame: + return _pd_fix_dtypes_nw(ns.read_json(source, **kwds), **kwds).to_native() + + return fn + + +def _pd_fix_dtypes_nw( + df: pd.DataFrame, /, *, dtype_backend: Any = None, **kwds: Any +) -> nw.DataFrame[pd.DataFrame]: + kwds = {"dtype_backend": dtype_backend} if dtype_backend else {} + return ( + df.convert_dtypes(**kwds) + .pipe(nw.from_native, eager_only=True) + .with_columns(nw.selectors.by_dtype(nw.Object).cast(nw.String)) + ) + + +def _pd_read_json_to_arrow(ns: ModuleType, /) -> Callable[..., pa.Table]: + @wraps(ns.read_json) + def fn(source: Path | Any, /, *, schema: Any = None, **kwds: Any) -> pa.Table: + """``schema`` is only here to swallow the ``SchemaCache`` if used.""" + return ( + ns.read_json(source, **kwds) + .pipe(_pd_fix_dtypes_nw, dtype_backend="pyarrow") + .to_arrow() + ) + + return fn + + +def _pl_read_json_roundtrip(ns: ModuleType, /) -> Callable[..., pl.DataFrame]: + """ + Try to utilize better date parsing available in `pl.read_csv`_. + + `pl.read_json`_ has few options when compared to `pl.read_csv`_. + + Chaining the two together - *where possible* - is still usually faster than `pandas.read_json`_. + + .. _pl.read_json: + https://docs.pola.rs/api/python/stable/reference/api/polars.read_json.html + .. _pl.read_csv: + https://docs.pola.rs/api/python/stable/reference/api/polars.read_csv.html + .. 
_pandas.read_json: + https://pandas.pydata.org/docs/reference/api/pandas.read_json.html + """ + from io import BytesIO + + @wraps(ns.read_json) + def fn(source: Path | IOBase, /, **kwds: Any) -> pl.DataFrame: + df = ns.read_json(source, **kwds) + if any(tp.is_nested() for tp in df.schema.dtypes()): + return df + buf = BytesIO() + df.write_csv(buf) + if kwds: + SHARED_KWDS = {"schema", "schema_overrides", "infer_schema_length"} + kwds = {k: v for k, v in kwds.items() if k in SHARED_KWDS} + return ns.read_csv(buf, try_parse_dates=True, **kwds) + + return fn + + +def _pl_read_json_roundtrip_to_arrow(ns: ModuleType, /) -> Callable[..., pa.Table]: + eager = _pl_read_json_roundtrip(ns) + + @wraps(ns.read_json) + def fn(source: Path | IOBase, /, **kwds: Any) -> pa.Table: + return eager(source).to_arrow() + + return fn + + +def _stdlib_read_json(source: Path | Any, /) -> Any: + import json + + if not isinstance(source, Path): + return json.load(source) + else: + with Path(source).open(encoding="utf-8") as f: + return json.load(f) + + +def _stdlib_read_json_to_arrow(source: Path | Any, /, **kwds: Any) -> pa.Table: + import pyarrow as pa + + rows: list[dict[str, Any]] = _stdlib_read_json(source) + try: + return pa.Table.from_pylist(rows, **kwds) + except TypeError: + import csv + import io + + from pyarrow import csv as pa_csv + + with io.StringIO() as f: + writer = csv.DictWriter(f, rows[0].keys(), dialect=csv.unix_dialect) + writer.writeheader() + writer.writerows(rows) + with io.BytesIO(f.getvalue().encode()) as f2: + return pa_csv.read_csv(f2) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 0855b73af..3765fa69b 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -29,7 +29,7 @@ import polars as pl from _pytest.mark.structures import ParameterSet - from altair.datasets._readers import _Backend, _PandasAny, _Polars, _PyArrow + from altair.datasets._reader import _Backend, _PandasAny, _Polars, _PyArrow from altair.vegalite.v5.schema._typing import OneOrSeq if sys.version_info >= (3, 10): @@ -117,11 +117,14 @@ def is_url(name: Dataset, fn_url: Callable[..., str], /) -> bool: def is_polars_backed_pyarrow(loader: Loader[Any, Any], /) -> bool: """User requested ``pyarrow``, but also has ``polars`` installed.""" # NOTE: Would prefer if there was a *less* private method to test this. 
- return bool( - is_loader_backend(loader, "pyarrow") - and (fn := getattr(loader._reader, "_read_json_polars", None)) - and fn == loader._reader.read_fn("dummy.json") - ) + from altair.datasets._constraints import is_meta + + if is_loader_backend(loader, "pyarrow"): + items = is_meta(suffix=".json", is_spatial=True) + impls = loader._reader._read + it = (some for impl in impls if (some := impl.unwrap_or(items))) + return callable(next(it, None)) + return False @backends @@ -151,7 +154,7 @@ def test_load_infer_priority(monkeypatch: pytest.MonkeyPatch) -> None: See Also -------- - ``altair.datasets._readers.infer_backend`` + ``altair.datasets._reader.infer_backend`` """ import altair.datasets._loader from altair.datasets import load @@ -247,7 +250,7 @@ def test_url(name: Dataset) -> None: def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: from altair.datasets._cache import csv_cache - from altair.datasets._readers import infer_backend + from altair.datasets._reader import infer_backend priority: Any = ("fake_mod_1", "fake_mod_2", "fake_mod_3", "fake_mod_4") assert csv_cache._mapping == {} @@ -318,7 +321,7 @@ def test_dataset_not_found(backend: _Backend) -> None: with pytest.raises( ERR_NO_RESULT, match=re.compile( - rf"{MSG_NO_RESULT}.+{SUFFIX}.+{incorrect_suffix}.+{NAME}.+{real_name}", + rf"{MSG_NO_RESULT}.+{NAME}.+{real_name}.+{SUFFIX}.+{incorrect_suffix}", re.DOTALL, ), ): @@ -326,19 +329,7 @@ def test_dataset_not_found(backend: _Backend) -> None: def test_reader_missing_dependencies() -> None: - from packaging.requirements import Requirement - - from altair.datasets._readers import _Reader - - class MissingDeps(_Reader): - def __init__(self, name) -> None: - self._name = name - reqs = Requirement(name) - for req in (reqs.name, *reqs.extras): - self._import(req) - - self._read_fn = {} - self._scan_fn = {} + from altair.datasets._reader import _import_guarded fake_name = "not_a_real_package" real_name = "altair" @@ -351,7 +342,7 @@ def __init__(self, name) -> None: flags=re.DOTALL, ), ): - MissingDeps(fake_name) + _import_guarded(fake_name) # type: ignore with pytest.raises( ModuleNotFoundError, match=re.compile( @@ -359,7 +350,7 @@ def __init__(self, name) -> None: flags=re.DOTALL, ), ): - MissingDeps(backend) + _import_guarded(backend) # type: ignore @backends @@ -494,38 +485,10 @@ def test_reader_cache_disable(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - assert not load.cache.is_empty() -# TODO: Investigate adding schemas for `pyarrow`. @pytest.mark.parametrize( - ("name", "fallback"), - [ - ("cars", "polars"), - ("movies", "polars"), - ("wheat", "polars"), - ("barley", "polars"), - ("gapminder", "polars"), - ("income", "polars"), - ("burtin", "polars"), - ("cars", None), - pytest.param( - "movies", - None, - marks=pytest.mark.xfail( - True, - raises=TypeError, - reason=( - "msg: `Expected bytes, got a 'int' object`\n" - "Isn't happy with the mixed `int`/`str` column." 
- ), - strict=True, - ), - ), - ("wheat", None), - ("barley", None), - ("gapminder", None), - ("income", None), - ("burtin", None), - ], + "name", ["cars", "movies", "wheat", "barley", "gapminder", "income", "burtin"] ) +@pytest.mark.parametrize("fallback", ["polars", None]) @backends_pyarrow def test_pyarrow_read_json( backend: _PyArrow, @@ -550,7 +513,7 @@ def test_spatial(backend: _Backend, name: Dataset) -> None: rf"{name}.+geospatial.+native.+{re.escape(backend)}.+try.+polars.+url", flags=re.DOTALL | re.IGNORECASE, ) - with pytest.raises(NotImplementedError, match=pattern): + with pytest.raises(AltairDatasetsError, match=pattern): load(name) @@ -558,7 +521,11 @@ def test_spatial(backend: _Backend, name: Dataset) -> None: @datasets_debug def test_all_datasets(polars_loader: PolarsLoader, name: Dataset) -> None: if name in {"7zip", "ffox", "gimp"}: - with pytest.raises(AltairDatasetsError, match=rf"{name}.+tabular"): + pattern = re.compile( + rf"Unable to load.+{name}.png.+as tabular data", + flags=re.DOTALL | re.IGNORECASE, + ) + with pytest.raises((AltairDatasetsError, NotImplementedError), match=pattern): polars_loader(name) else: frame = polars_loader(name) From 2203972ed49a97c4398116310a3ef2d607d23614 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 29 Jan 2025 16:26:04 +0000 Subject: [PATCH 180/201] refactor: Simplify obsolete paths in `CsvCache` They were an artifact of *previously* using multiple `vega-dataset` versions in `.paquet` - but only the most recent in `.csv.gz` Currently both store the same range of names, so this error handling never triggered --- altair/datasets/_cache.py | 31 ++++++++----------------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 9abe09726..13dca2f23 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -5,12 +5,11 @@ from collections import defaultdict from importlib.util import find_spec from pathlib import Path -from typing import TYPE_CHECKING, ClassVar, TypeVar, cast, get_args +from typing import TYPE_CHECKING, ClassVar, TypeVar, cast import narwhals.stable.v1 as nw from altair.datasets._exceptions import AltairDatasetsError -from altair.datasets._typing import Dataset if sys.version_info >= (3, 12): from typing import Protocol @@ -34,7 +33,7 @@ from narwhals.stable.v1.dtypes import DType from narwhals.stable.v1.typing import IntoExpr - from altair.datasets._typing import Metadata + from altair.datasets._typing import Dataset, Metadata if sys.version_info >= (3, 12): from typing import Unpack @@ -188,31 +187,17 @@ def rotated(self) -> Mapping[str, Sequence[Any]]: self._rotated[k].append(v) return self._rotated - # TODO: Evaluate which errors are now obsolete def __getitem__(self, key: _Dataset, /) -> Metadata: if meta := self.get(key, None): return meta + msg = f"{key!r} does not refer to a known dataset." + raise TypeError(msg) - if key in get_args(Dataset): - msg = f"{key!r} cannot be loaded via {type(self).__name__!r}." - raise TypeError(msg) - else: - msg = f"{key!r} does not refer to a known dataset." - raise TypeError(msg) - - # TODO: Evaluate which errors are now obsolete def url(self, name: _Dataset, /) -> str: - if meta := self.get(name, None): - if meta["suffix"] == ".parquet" and not find_spec("vegafusion"): - raise AltairDatasetsError.from_url(meta) - return meta["url"] - - if name in get_args(Dataset): - msg = f"{name!r} cannot be loaded via url." 
- raise TypeError(msg) - else: - msg = f"{name!r} does not refer to a known dataset." - raise TypeError(msg) + meta = self[name] + if meta["suffix"] == ".parquet" and not find_spec("vegafusion"): + raise AltairDatasetsError.from_url(meta) + return meta["url"] def __repr__(self) -> str: return f"<{type(self).__name__}: {'COLLECTED' if self._mapping else 'READY'}>" From e68ab89810e6d7aaa7e9ca3b19461b6603866454 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 30 Jan 2025 13:37:03 +0000 Subject: [PATCH 181/201] chore: add workaround for `narwhals` bug Opened (https://github.com/narwhals-dev/narwhals/issues/1897) Marking (https://github.com/vega/altair/pull/3631#discussion_r1934313255) as resolved --- altair/datasets/_constraints.py | 20 +++++++++++++++++++- altair/datasets/_reader.py | 10 ++++------ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/altair/datasets/_constraints.py b/altair/datasets/_constraints.py index e5eaa3b97..fbfd9cbc8 100644 --- a/altair/datasets/_constraints.py +++ b/altair/datasets/_constraints.py @@ -2,6 +2,8 @@ from __future__ import annotations +import functools +import operator from collections.abc import Set from itertools import chain from typing import TYPE_CHECKING, Any @@ -59,7 +61,23 @@ def collect(**kwds: Unpack[Metadata]) -> Metadata: return dict(self) def to_expr(self) -> nw.Expr: - return nw.all_horizontal(nw.col(name) == val for name, val in self) + """ + Convert constraint into a narhwals expression. + + Notes + ----- + Workaround for `issue`_ is performing the reduction with ``stdlib`` + + .. _issue: + https://github.com/narwhals-dev/narwhals/issues/1897 + .. _discussion: + https://github.com/vega/altair/pull/3631#discussion_r1934313255 + """ + if not self: + msg = f"Unable to convert an empty set to an expression:\n\n{self!r}" + raise TypeError(msg) + exprs = (nw.col(name) == val for name, val in self) + return functools.reduce(operator.and_, exprs) def isdisjoint(self, other: Iterable[Any]) -> bool: return super().isdisjoint(other) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index eacc516ba..b8cc6b859 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -171,13 +171,11 @@ def profile(self, mode: Literal["any", "each"]): ) frame = self._scan_metadata().select("dataset_name", *relevant_columns) it = (impl._include_expr for impl in self._read) - # BUG: ``narwhals`` raises a ``ValueError`` when ``__invert__``-ing a previously used Expr? 
- # - Can't reproduce trivially - # - Doesnt seem to be related to genexp inc_expr = nw.any_horizontal(*it) - include = _dataset_names(frame, inc_expr) - exclude = _dataset_names(frame, ~nw.col("dataset_name").is_in(include)) - return {"include": include, "exclude": exclude} + return { + "include": _dataset_names(frame, inc_expr), + "exclude": _dataset_names(frame, ~inc_expr), + } elif mode == "each": # FIXME: Rough draft of how to group results # - Don't really want a nested dict From 576a9b40da3bbf27656ff2e3f4c896d0ff3b2d9e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 30 Jan 2025 14:49:02 +0000 Subject: [PATCH 182/201] feat(typing): replace `(Read|Scan)Impl` classes with aliases - Shorter names `Read`, `Scan` - The single unique method is now `into_scan` - There was no real need to have concrete classes when they behave the same as parent --- altair/datasets/_reader.py | 26 ++++----- altair/datasets/_readimpl.py | 107 +++++++++++++++++------------------ 2 files changed, 66 insertions(+), 67 deletions(-) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index b8cc6b859..3d7d2d87f 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -52,7 +52,7 @@ import polars as pl import pyarrow as pa - from altair.datasets._readimpl import BaseImpl, R, ReadImpl, ScanImpl + from altair.datasets._readimpl import BaseImpl, R, Read, Scan from altair.datasets._typing import Dataset, Extension, Metadata from altair.vegalite.v5.schema._typing import OneOrSeq @@ -107,11 +107,11 @@ class Reader(Generic[IntoDataFrameT, IntoFrameT]): """ # TODO: Docs - _read: Sequence[ReadImpl[IntoDataFrameT]] + _read: Sequence[Read[IntoDataFrameT]] """Eager file read functions.""" # TODO: Docs - _scan: Sequence[ScanImpl[IntoFrameT]] + _scan: Sequence[Scan[IntoFrameT]] """ *Optionally*-lazy file read/scan functions. 
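# --- Editorial aside (not part of this patch) ---------------------------------
# After this commit, ``Read``/``Scan`` are parameterised aliases of ``BaseImpl``
# rather than subclasses, so the annotations below are interchangeable
# (a sketch assuming Python >= 3.12, or ``typing_extensions.TypeAliasType``):
from collections.abc import Sequence

import polars as pl

from altair.datasets._readimpl import BaseImpl, Read

eager: Sequence[Read[pl.DataFrame]]  # alias form, as used by ``Reader._read``
expanded: Sequence[BaseImpl[pl.DataFrame]]  # equivalent spelled-out form
# -------------------------------------------------------------------------------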
@@ -143,8 +143,8 @@ class Reader(Generic[IntoDataFrameT, IntoFrameT]): def __init__( self, - read: Sequence[ReadImpl[IntoDataFrameT]], - scan: Sequence[ScanImpl[IntoFrameT]], + read: Sequence[Read[IntoDataFrameT]], + scan: Sequence[Scan[IntoFrameT]], name: str, implementation: nw.Implementation, ) -> None: @@ -356,7 +356,7 @@ def _metadata_frame(self) -> nw.LazyFrame: @overload def reader( - read_fns: Sequence[ReadImpl[IntoDataFrameT]], + read_fns: Sequence[Read[IntoDataFrameT]], scan_fns: tuple[()] = ..., *, name: str | None = ..., @@ -366,8 +366,8 @@ def reader( @overload def reader( - read_fns: Sequence[ReadImpl[IntoDataFrameT]], - scan_fns: Sequence[ScanImpl[IntoFrameT]], + read_fns: Sequence[Read[IntoDataFrameT]], + scan_fns: Sequence[Scan[IntoFrameT]], *, name: str | None = ..., implementation: nw.Implementation = ..., @@ -375,8 +375,8 @@ def reader( def reader( - read_fns: Sequence[ReadImpl[IntoDataFrameT]], - scan_fns: Sequence[ScanImpl[IntoFrameT]] = (), + read_fns: Sequence[Read[IntoDataFrameT]], + scan_fns: Sequence[Scan[IntoFrameT]] = (), *, name: str | None = None, implementation: nw.Implementation = nw.Implementation.UNKNOWN, @@ -504,10 +504,10 @@ def _into_suffix(obj: Path | str, /) -> Any: def _steal_eager_parquet( - read_fns: Sequence[ReadImpl[IntoDataFrameT]], / -) -> Sequence[ScanImpl[nw.LazyFrame]] | None: + read_fns: Sequence[Read[IntoDataFrameT]], / +) -> Sequence[Scan[nw.LazyFrame]] | None: if convertable := next((rd for rd in read_fns if rd.include <= is_parquet), None): - return (convertable.to_scan_impl(),) + return (_readimpl.into_scan(convertable),) return None diff --git a/altair/datasets/_readimpl.py b/altair/datasets/_readimpl.py index 119352db5..fc9c77110 100644 --- a/altair/datasets/_readimpl.py +++ b/altair/datasets/_readimpl.py @@ -31,6 +31,10 @@ from typing import TypeVar else: from typing_extensions import TypeVar +if sys.version_info >= (3, 12): + from typing import TypeAliasType +else: + from typing_extensions import TypeAliasType if TYPE_CHECKING: from collections.abc import Callable, Iterable, Iterator, Sequence @@ -46,12 +50,14 @@ __all__ = ["is_available", "pa_any", "pd_only", "pd_pyarrow", "pl_only", "read", "scan"] -R = TypeVar("R") +R = TypeVar("R", bound="nwt.IntoFrame") IntoFrameT = TypeVar( "IntoFrameT", bound="nwt.NativeFrame | nw.DataFrame[Any] | nw.LazyFrame | nwt.DataFrameLike", default=nw.LazyFrame, ) +Scan = TypeAliasType("Scan", "BaseImpl[IntoFrameT]", type_params=(IntoFrameT,)) +Read = TypeAliasType("Read", "BaseImpl[IntoDataFrameT]", type_params=(IntoDataFrameT,)) class Skip(Enum): @@ -158,41 +164,35 @@ def _exclude_expr(self) -> nw.Expr: raise TypeError(msg) -def _unwrap_partial(fn: Any, /) -> Any: - # NOTE: ``functools._unwrap_partial`` - func = fn - while isinstance(func, partial): - func = func.func - return func - - -class ScanImpl(BaseImpl[IntoFrameT]): ... 
- - -class ReadImpl(BaseImpl[IntoDataFrameT]): - def to_scan_impl(self) -> ScanImpl[nw.LazyFrame]: - return ScanImpl(_into_scan_fn(self.fn), self.include, self.exclude, {}) +def read( + fn: Callable[..., IntoDataFrameT], + /, + include: MetaIs, + exclude: MetaIs | None = None, + **kwds: Any, +) -> Read[IntoDataFrameT]: + return BaseImpl(fn, include, exclude, kwds) -def _into_scan_fn(fn: Callable[..., IntoDataFrameT], /) -> Callable[..., nw.LazyFrame]: - @wraps(_unwrap_partial(fn)) - def wrapper(*args: Any, **kwds: Any) -> nw.LazyFrame: - return nw.from_native(fn(*args, **kwds)).lazy() +def scan( + fn: Callable[..., IntoFrameT], + /, + include: MetaIs, + exclude: MetaIs | None = None, + **kwds: Any, +) -> Scan[IntoFrameT]: + return BaseImpl(fn, include, exclude, kwds) - return wrapper +def into_scan(impl: Read[IntoDataFrameT], /) -> Scan[nw.LazyFrame]: + def scan_fn(fn: Callable[..., IntoDataFrameT], /) -> Callable[..., nw.LazyFrame]: + @wraps(_unwrap_partial(fn)) + def wrapper(*args: Any, **kwds: Any) -> nw.LazyFrame: + return nw.from_native(fn(*args, **kwds)).lazy() -def _root_package_name(obj: Any, default: str, /) -> str: - # NOTE: Defers importing `inspect`, if we can get the module name - if hasattr(obj, "__module__"): - return obj.__module__.split(".")[0] - else: - from inspect import getmodule + return wrapper - module = getmodule(obj) - if module and (pkg := module.__package__): - return pkg.split(".")[0] - return default + return BaseImpl(scan_fn(impl.fn), impl.include, impl.exclude, {}) def is_available( @@ -217,29 +217,28 @@ def is_available( return fn(find_spec(name) is not None for name in names) -def read( - fn: Callable[..., IntoDataFrameT], - /, - include: MetaIs, - exclude: MetaIs | None = None, - **kwds: Any, -) -> ReadImpl[IntoDataFrameT]: - return ReadImpl(fn, include, exclude, kwds) +def _root_package_name(obj: Any, default: str, /) -> str: + # NOTE: Defers importing `inspect`, if we can get the module name + if hasattr(obj, "__module__"): + return obj.__module__.split(".")[0] + else: + from inspect import getmodule + module = getmodule(obj) + if module and (pkg := module.__package__): + return pkg.split(".")[0] + return default -def scan( - fn: Callable[..., IntoFrameT], - /, - include: MetaIs, - exclude: MetaIs | None = None, - **kwds: Any, -) -> ScanImpl[IntoFrameT]: - return ScanImpl(fn, include, exclude, kwds) + +def _unwrap_partial(fn: Any, /) -> Any: + # NOTE: ``functools._unwrap_partial`` + func = fn + while isinstance(func, partial): + func = func.func + return func -def pl_only() -> tuple[ - Sequence[ReadImpl[pl.DataFrame]], Sequence[ScanImpl[pl.LazyFrame]] -]: +def pl_only() -> tuple[Sequence[Read[pl.DataFrame]], Sequence[Scan[pl.LazyFrame]]]: import polars as pl read_fns = ( @@ -253,10 +252,10 @@ def pl_only() -> tuple[ return read_fns, scan_fns -def pd_only() -> Sequence[ReadImpl[pd.DataFrame]]: +def pd_only() -> Sequence[Read[pd.DataFrame]]: import pandas as pd - opt: Sequence[ReadImpl[pd.DataFrame]] + opt: Sequence[Read[pd.DataFrame]] if is_available("pyarrow"): opt = read(pd.read_feather, is_arrow), read(pd.read_parquet, is_parquet) elif is_available("fastparquet"): @@ -271,7 +270,7 @@ def pd_only() -> Sequence[ReadImpl[pd.DataFrame]]: ) -def pd_pyarrow() -> Sequence[ReadImpl[pd.DataFrame]]: +def pd_pyarrow() -> Sequence[Read[pd.DataFrame]]: import pandas as pd kwds: dict[str, Any] = {"dtype_backend": "pyarrow"} @@ -284,7 +283,7 @@ def pd_pyarrow() -> Sequence[ReadImpl[pd.DataFrame]]: ) -def pa_any() -> Sequence[ReadImpl[pa.Table]]: +def pa_any() -> 
Sequence[Read[pa.Table]]: from pyarrow import csv, feather, parquet return ( @@ -296,7 +295,7 @@ def pa_any() -> Sequence[ReadImpl[pa.Table]]: ) -def _pa_read_json_impl() -> ReadImpl[pa.Table]: +def _pa_read_json_impl() -> Read[pa.Table]: """ Mitigating ``pyarrow``'s `line-delimited`_ JSON requirement. From 91562d55d1b7120cd065b8ad20893de73700b3e6 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 30 Jan 2025 16:02:41 +0000 Subject: [PATCH 183/201] feat: Rename, docs `unwrap_or` -> `unwrap_or_skip` --- altair/datasets/_reader.py | 2 +- altair/datasets/_readimpl.py | 22 ++++++++++++++++------ tests/test_datasets.py | 2 +- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index 3d7d2d87f..46a7f5620 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -311,7 +311,7 @@ def _solve( - Leaves the door open for caching the search space """ items = meta.items() - it = (some for impl in impls if (some := impl.unwrap_or(items))) + it = (some for impl in impls if (some := impl.unwrap_or_skip(items))) if fn_or_err := next(it, None): if _is_err(fn_or_err): raise fn_or_err.from_tabular(meta, self._name) diff --git a/altair/datasets/_readimpl.py b/altair/datasets/_readimpl.py index fc9c77110..f964da253 100644 --- a/altair/datasets/_readimpl.py +++ b/altair/datasets/_readimpl.py @@ -97,14 +97,24 @@ def __init__( object.__setattr__(self, "include", include) object.__setattr__(self, "exclude", exclude) - # TODO: Consider renaming - # NOTE: - # - Fn means call it - # - Err means raise it - # - Skip means its safe to check other impls - def unwrap_or( + def unwrap_or_skip( self, meta: Items, / ) -> Callable[..., R] | type[AltairDatasetsError] | Skip: + """ + Indicate an action to take for a dataset. 
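# --- [editor's sketch] ----------------------------------------------------------
# The outcomes enumerated just below, exercised directly. Assumes `is_csv`
# constrains only `suffix == ".csv"` and that the tiny mapping here is a heavily
# trimmed stand-in for a real `Metadata` row.
import pandas as pd
from altair.datasets._constraints import is_csv
from altair.datasets._readimpl import Skip, read

impl = read(pd.read_csv, is_csv)
impl.unwrap_or_skip({"suffix": ".csv"}.items())   # -> the read function (supported)
impl.unwrap_or_skip({"suffix": ".json"}.items())  # -> Skip.skip (no overlap, try other impls)
# a non-empty `exclude` overlapping the row would instead return `AltairDatasetsError`
# --- end sketch -----------------------------------------------------------------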
+ + **Supports** dataset, use this function:: + + Callable[..., R] + + Has explicitly marked as **not supported**:: + + type[AltairDatasetsError] + + No relevant constraints overlap, safe to check others:: + + Skip + """ if self.include.issubset(meta): return self.fn if self.exclude.isdisjoint(meta) else AltairDatasetsError return Skip.skip diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 3765fa69b..8acefa0e6 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -122,7 +122,7 @@ def is_polars_backed_pyarrow(loader: Loader[Any, Any], /) -> bool: if is_loader_backend(loader, "pyarrow"): items = is_meta(suffix=".json", is_spatial=True) impls = loader._reader._read - it = (some for impl in impls if (some := impl.unwrap_or(items))) + it = (some for impl in impls if (some := impl.unwrap_or_skip(items))) return callable(next(it, None)) return False From 1628cbd6c3ff642996d22ac15822854f1017173c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 30 Jan 2025 16:31:27 +0000 Subject: [PATCH 184/201] refactor: Replace `._contents` w/ `.__str__()` Inspired by https://github.com/pypa/packaging/blob/8510bd9d3bab5571974202ec85f6ef7b0359bfaf/src/packaging/requirements.py#L67-L71 --- altair/datasets/_reader.py | 8 +++----- altair/datasets/_readimpl.py | 17 ++++++----------- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index 46a7f5620..c06fdc9cc 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -182,7 +182,7 @@ def profile(self, mode: Literal["any", "each"]): m = {} frame = self._scan_metadata() for impl in self._read: - name = impl._contents + name = str(impl) m[name] = {"include": _dataset_names(frame, impl._include_expr)} if impl.exclude: m[name].update(exclude=_dataset_names(frame, impl._exclude_expr)) @@ -196,11 +196,9 @@ def __repr__(self) -> str: PREFIX = " " * 4 NL = "\n" - body = f"read\n{indent(NL.join(el._contents for el in self._read), PREFIX)}" + body = f"read\n{indent(NL.join(str(el) for el in self._read), PREFIX)}" if self._scan: - body += ( - f"\nscan\n{indent(NL.join(el._contents for el in self._scan), PREFIX)}" - ) + body += f"\nscan\n{indent(NL.join(str(el) for el in self._scan), PREFIX)}" return f"Reader[{self._name}] {self._implementation!r}\n{body}" def read_fn(self, meta: Metadata, /) -> Callable[..., IntoDataFrameT]: diff --git a/altair/datasets/_readimpl.py b/altair/datasets/_readimpl.py index f964da253..4969a25f9 100644 --- a/altair/datasets/_readimpl.py +++ b/altair/datasets/_readimpl.py @@ -136,22 +136,17 @@ def _inferred_package(self) -> str: def __repr__(self) -> str: tp_name = f"{type(self).__name__}[{self._inferred_package}?]" - return f"{tp_name}({self._contents})" + return f"{tp_name}({self})" - # TODO: Consider renaming - @property - def _contents(self) -> str: + def __str__(self) -> str: if isinstance(self.fn, partial): fn = _unwrap_partial(self.fn) - it = (f"{k}={v!r}" for k, v in self.fn.keywords.items()) - fn_repr = f"{fn.__name__}(..., {', '.join(it)})" + kwds = self.fn.keywords.items() + fn_repr = f"{fn.__name__}(..., {', '.join(f'{k}={v!r}' for k, v in kwds)})" else: fn_repr = f"{self.fn.__name__}(...)" - if self.exclude: - params = f"include={self.include!r}, exclude={self.exclude!r}" - else: - params = repr(self.include) - return f"{fn_repr}, {params}" + inc, exc = self.include, self.exclude + return f"{fn_repr}, {f'include={inc!r}, exclude={exc!r}' if exc else repr(inc)}" @property 
def _relevant_columns(self) -> Iterator[str]: From cbd04e33cfb38a646862bb0c5b7bc2c2d1ce815b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 30 Jan 2025 17:01:39 +0000 Subject: [PATCH 185/201] fix: Use correct type for `pyarrow.csv.read_csv` Resolves: ```py File ../altair/.venv/Lib/site-packages/pyarrow/csv.pyx:1258, in pyarrow._csv.read_csv() TypeError: Cannot convert dict to pyarrow._csv.ParseOptions ``` --- altair/datasets/_readimpl.py | 2 +- tests/test_datasets.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/altair/datasets/_readimpl.py b/altair/datasets/_readimpl.py index 4969a25f9..0278d48b5 100644 --- a/altair/datasets/_readimpl.py +++ b/altair/datasets/_readimpl.py @@ -294,7 +294,7 @@ def pa_any() -> Sequence[Read[pa.Table]]: return ( read(csv.read_csv, is_csv), _pa_read_json_impl(), - read(csv.read_csv, is_tsv, parse_options={"delimiter": "\t"}), + read(csv.read_csv, is_tsv, parse_options=csv.ParseOptions(delimiter="\t")), # pyright: ignore[reportCallIssue] read(feather.read_table, is_arrow), read(parquet.read_table, is_parquet), ) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 8acefa0e6..60b4a9cfb 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -517,6 +517,12 @@ def test_spatial(backend: _Backend, name: Dataset) -> None: load(name) +@backends +def test_tsv(backend: _Backend) -> None: + load = Loader.from_backend(backend) + is_frame_backend(load("unemployment", ".tsv"), backend) + + @datasets_all @datasets_debug def test_all_datasets(polars_loader: PolarsLoader, name: Dataset) -> None: From c0a92a618469fb44c843b927bd3d3276a2732d7b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 30 Jan 2025 18:16:12 +0000 Subject: [PATCH 186/201] docs: Add docs for `Read`, `Scan`, `BaseImpl` --- altair/datasets/_reader.py | 11 +---------- altair/datasets/_readimpl.py | 33 +++++++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index c06fdc9cc..2162d910c 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -106,20 +106,11 @@ class Reader(Generic[IntoDataFrameT, IntoFrameT]): Use ``reader(...)`` instead of instantiating ``Reader`` directly. """ - # TODO: Docs _read: Sequence[Read[IntoDataFrameT]] """Eager file read functions.""" - # TODO: Docs _scan: Sequence[Scan[IntoFrameT]] - """ - *Optionally*-lazy file read/scan functions. - - Used exclusively for ``metadata.parquet``. - - Currently ``"polars"`` is the only lazy option. - All others defer to the eager variant. - """ + """Lazy file read functions.""" _name: str """ diff --git a/altair/datasets/_readimpl.py b/altair/datasets/_readimpl.py index 0278d48b5..cc4c01e07 100644 --- a/altair/datasets/_readimpl.py +++ b/altair/datasets/_readimpl.py @@ -56,8 +56,11 @@ bound="nwt.NativeFrame | nw.DataFrame[Any] | nw.LazyFrame | nwt.DataFrameLike", default=nw.LazyFrame, ) -Scan = TypeAliasType("Scan", "BaseImpl[IntoFrameT]", type_params=(IntoFrameT,)) Read = TypeAliasType("Read", "BaseImpl[IntoDataFrameT]", type_params=(IntoDataFrameT,)) +"""An *eager* file read function.""" + +Scan = TypeAliasType("Scan", "BaseImpl[IntoFrameT]", type_params=(IntoFrameT,)) +"""A *lazy* file read function.""" class Skip(Enum): @@ -73,12 +76,33 @@ def __repr__(self) -> Literal[""]: class BaseImpl(Generic[R]): + """ + A function wrapped with dataset support constraints. 
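# --- [editor's note] --------------------------------------------------------------
# The failure fixed in PATCH 185 above, reproduced in isolation: `pyarrow.csv.read_csv`
# expects a `ParseOptions` instance, not a plain dict (assumes pyarrow is installed).
from io import BytesIO
from pyarrow import csv

tsv = BytesIO(b"year\trate\n2000\t3.9\n")
# csv.read_csv(tsv, parse_options={"delimiter": "\t"})
#   TypeError: Cannot convert dict to pyarrow._csv.ParseOptions
table = csv.read_csv(tsv, parse_options=csv.ParseOptions(delimiter="\t"))
assert table.column_names == ["year", "rate"]
# --- end note ---------------------------------------------------------------------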
+ + The ``include``, ``exclude`` properties form a `NIMPLY gate`_ (`Material nonimplication`_). + + Examples + -------- + For some dataset ``D``, we can use ``fn`` if:: + + impl: BaseImpl + impl.include(D) and not impl.exclude(D) + + + .. _NIMPLY gate: + https://en.m.wikipedia.org/wiki/NIMPLY_gate + .. _Material nonimplication: + https://en.m.wikipedia.org/wiki/Material_nonimplication#Truth_table + """ + fn: Callable[..., R] - """Wrapped read function.""" + """Wrapped read/scan function.""" + include: MetaIs - """Passing this makes ``fn`` a candidate.""" + """Constraint indicating ``fn`` **supports** reading a dataset.""" + exclude: MetaIs - """Passing this overrides ``include``, transforming into an error.""" + """Constraint *subsetting* ``include`` to mark **non-support**.""" def __init__( self, @@ -121,6 +145,7 @@ def unwrap_or_skip( @classmethod def _exclude_none(cls) -> MetaIs: + """Represents the empty set.""" return is_meta() def __setattr__(self, name: str, value: Any): From 2b8bf5e5459ac800711535f6cdb833d065bf0909 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 30 Jan 2025 19:01:49 +0000 Subject: [PATCH 187/201] docs: Clean up `_merge_kwds`, `_solve` --- altair/datasets/_reader.py | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index 2162d910c..4075598ec 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -254,15 +254,11 @@ def _query( msg = f"Found no results for:\n {constraints!r}" raise ValueError(msg) - # TODO: Docs def _merge_kwds(self, meta: Metadata, kwds: dict[str, Any], /) -> Mapping[str, Any]: """ - Hook to utilize ``meta`` to extend ``kwds`` with known helpful defaults. + Extend user-provided arguments with dataset & library-specfic defaults. - - User provided arguments have a higher precedence. - - The keywords for schemas vary between libraries - - pandas is internally inconsistent - - By default, returns unchanged + .. important:: User-provided arguments have a higher precedence. """ if self._schema_cache.is_active() and ( schema := self._schema_cache.schema_kwds(meta) @@ -282,22 +278,15 @@ def _scan_metadata( return self._metadata_frame.filter(*predicates, **constraints) return self._metadata_frame - # TODO: Docs def _solve( self, meta: Metadata, impls: Sequence[BaseImpl[R]], / ) -> Callable[..., R]: """ - Return the first function meeting constraints of meta. - - Notes - ----- - - Iterate over impls - - Each one can either match or signal an error - - An error blocks any additional checking - - Both include & exclude - - Uses ``ItemsView`` to support set ops - - `meta` isn't iterated over - - Leaves the door open for caching the search space + Return the first function that satisfies dataset constraints. 
+ + See Also + -------- + ``altair.datasets._readimpl.BaseImpl.unwrap_or_skip`` """ items = meta.items() it = (some for impl in impls if (some := impl.unwrap_or_skip(items))) From 755ab4f560af13f9268e905cf70783c34b30b1d7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 30 Jan 2025 19:45:00 +0000 Subject: [PATCH 188/201] refactor(typing): Include all suffixes in `Extension` Also simplifies and removes outdated `Extension`-related tooling --- altair/datasets/_reader.py | 7 +++++-- altair/datasets/_typing.py | 22 +++------------------- tests/test_datasets.py | 7 ++----- tools/datasets/__init__.py | 17 +++-------------- tools/datasets/datapackage.py | 3 +-- 5 files changed, 14 insertions(+), 42 deletions(-) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index 4075598ec..309080823 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -41,7 +41,6 @@ module_not_found, ) from altair.datasets._readimpl import IntoFrameT, is_available -from altair.datasets._typing import EXTENSION_SUFFIXES if TYPE_CHECKING: import sys @@ -443,8 +442,12 @@ def _into_constraints( elif suffix.startswith("."): m = {"dataset_name": name, "suffix": suffix} else: + from typing import get_args + + from altair.datasets._typing import Extension + msg = ( - f"Expected 'suffix' to be one of {EXTENSION_SUFFIXES!r},\n" + f"Expected 'suffix' to be one of {get_args(Extension)!r},\n" f"but got: {suffix!r}" ) raise TypeError(msg) diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index 958db2300..7c524f2ec 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -4,25 +4,20 @@ from __future__ import annotations import sys -from typing import Any, Literal +from typing import Literal if sys.version_info >= (3, 14): from typing import TypedDict else: from typing_extensions import TypedDict -if sys.version_info >= (3, 13): - from typing import TypeIs -else: - from typing_extensions import TypeIs - if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias -__all__ = ["EXTENSION_SUFFIXES", "Dataset", "Extension", "Metadata", "is_ext_read"] +__all__ = ["Dataset", "Extension", "Metadata"] Dataset: TypeAlias = Literal[ "7zip", @@ -98,18 +93,7 @@ "world-110m", "zipcodes", ] -Extension: TypeAlias = Literal[".arrow", ".csv", ".json", ".parquet", ".tsv"] -EXTENSION_SUFFIXES: tuple[ - Literal[".arrow"], - Literal[".csv"], - Literal[".json"], - Literal[".parquet"], - Literal[".tsv"], -] = (".arrow", ".csv", ".json", ".parquet", ".tsv") - - -def is_ext_read(suffix: Any) -> TypeIs[Extension]: - return suffix in {".arrow", ".csv", ".json", ".parquet", ".tsv"} +Extension: TypeAlias = Literal[".arrow", ".csv", ".json", ".parquet", ".png", ".tsv"] class Metadata(TypedDict, total=False): diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 60b4a9cfb..429f4b16a 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -16,7 +16,7 @@ from altair.datasets import Loader from altair.datasets._exceptions import AltairDatasetsError -from altair.datasets._typing import Dataset, Metadata, is_ext_read +from altair.datasets._typing import Dataset, Metadata from tests import no_xdist, skip_requires_pyarrow from tools import fs @@ -441,10 +441,7 @@ def test_reader_cache_exhaustive( # NOTE: Approximating all datasets downloaded assert len(cached_paths) >= 70 - assert all( - bool(fp.exists() and is_ext_read(fp.suffix) and fp.stat().st_size) - for fp in load.cache 
- ) + assert all(bool(fp.exists() and fp.stat().st_size) for fp in load.cache) # NOTE: Confirm this is a no-op load.cache.download_all() assert len(cached_paths) == len(tuple(load.cache)) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 64940ebc1..6c8c75fe5 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -190,13 +190,6 @@ def generate_typing(self, dpkg: datapackage.DataPackage) -> None: indent = " " * 4 NAME = "Dataset" EXT = "Extension" - EXT_TYPES = dpkg.extensions() - EXTENSION_SUFFIXES = "EXTENSION_SUFFIXES" - EXTENSION_TYPE_TP = ( - f"tuple[{', '.join(f'Literal[{el!r}]' for el in EXT_TYPES)}]" - ) - EXTENSION_GUARD = "is_ext_read" - FIELD = "FlFieldStr" FIELD_TYPES = ( "integer", @@ -215,17 +208,13 @@ def generate_typing(self, dpkg: datapackage.DataPackage) -> None: f"{HEADER_COMMENT}", "from __future__ import annotations\n", "import sys", - "from typing import Any, Literal, TYPE_CHECKING", + "from typing import Literal, TYPE_CHECKING", utils.import_typing_extensions((3, 14), "TypedDict"), - utils.import_typing_extensions((3, 13), "TypeIs"), utils.import_typing_extensions((3, 10), "TypeAlias"), "\n", - f"__all__ = {[NAME, EXT, dpkg._NAME_TYPED_DICT, EXTENSION_GUARD, EXTENSION_SUFFIXES]}\n", + f"__all__ = {[NAME, EXT, dpkg._NAME_TYPED_DICT]}\n", utils.spell_literal_alias(NAME, dpkg.dataset_names()), - utils.spell_literal_alias(EXT, EXT_TYPES), - f"{EXTENSION_SUFFIXES}: {EXTENSION_TYPE_TP} = {EXT_TYPES!r}", - f"def {EXTENSION_GUARD}(suffix: Any) -> TypeIs[{EXT}]:\n" - f"{indent}return suffix in set({EXT_TYPES!r})\n", + utils.spell_literal_alias(EXT, dpkg.extensions()), dpkg.typed_dict(), utils.spell_literal_alias(FIELD, FIELD_TYPES), '"""\n' diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index 9747bdb71..ec707c0da 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -103,8 +103,7 @@ def dataset_names(self) -> Iterable[str]: def extensions(self) -> tuple[str, ...]: return tuple( - self.core.filter(is_image=False) - .select(col("suffix").unique().sort()) + self.core.select(col("suffix").unique().sort()) .collect() .to_series() .to_list() From 0ba3d677ab91092380f4fa5388766e866d5be924 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 31 Jan 2025 15:05:02 +0000 Subject: [PATCH 189/201] feat: Finish `Reader.profile` - Reduced the scope a bit, now just un/supported - Added `pprint` option - Finished docs, including example pointing to use `url(...)` --- altair/datasets/_reader.py | 100 +++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 44 deletions(-) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index 309080823..195607fe5 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -96,6 +96,26 @@ _PySpark, ) +_SupportProfile: TypeAlias = Mapping[ + Literal["supported", "unsupported"], "Sequence[Dataset]" +] +""" +Dataset support varies between backends and available dependencies. 
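# --- [editor's note] --------------------------------------------------------------
# Quick illustration of what replaces `EXTENSION_SUFFIXES` after PATCH 188: the
# suffixes are recovered from the `Extension` Literal on demand.
from typing import get_args
from altair.datasets._typing import Extension

assert set(get_args(Extension)) == {".arrow", ".csv", ".json", ".parquet", ".png", ".tsv"}
# --- end note ---------------------------------------------------------------------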
+ +Any name listed in ``"unsupported"`` will raise an error on:: + + from altair.datasets import load + + load("7zip") + +Instead, they can be loaded via:: + + import altair as alt + from altair.datasets import url + + alt.Chart(url("7zip")) +""" + class Reader(Generic[IntoDataFrameT, IntoFrameT]): """ @@ -144,43 +164,6 @@ def __init__( self._implementation = implementation self._schema_cache = SchemaCache(implementation=implementation) - # TODO: Finish working on presentation - # - The contents of both are functional - def profile(self, mode: Literal["any", "each"]): - """ - Describe which datasets/groups are supported. - - Focusing on actual datasets, rather than describing wrapped functions (repr) - - .. note:: - Having this public to make testing easier (``tests.test_datasets.is_polars_backed_pyarrow``) - """ - if mode == "any": - relevant_columns = set( - chain.from_iterable(impl._relevant_columns for impl in self._read) - ) - frame = self._scan_metadata().select("dataset_name", *relevant_columns) - it = (impl._include_expr for impl in self._read) - inc_expr = nw.any_horizontal(*it) - return { - "include": _dataset_names(frame, inc_expr), - "exclude": _dataset_names(frame, ~inc_expr), - } - elif mode == "each": - # FIXME: Rough draft of how to group results - # - Don't really want a nested dict - m = {} - frame = self._scan_metadata() - for impl in self._read: - name = str(impl) - m[name] = {"include": _dataset_names(frame, impl._include_expr)} - if impl.exclude: - m[name].update(exclude=_dataset_names(frame, impl._exclude_expr)) - return m - else: - msg = f"Unexpected {mode=}" - raise TypeError(msg) - def __repr__(self) -> str: from textwrap import indent @@ -234,6 +217,38 @@ def url( msg = f"Expected 'str' but got {type(url).__name__!r}\nfrom {url!r}." raise TypeError(msg) + @overload + def profile(self, *, show: Literal[False] = ...) -> _SupportProfile: ... + + @overload + def profile(self, *, show: Literal[True]) -> None: ... + + def profile(self, *, show: bool = False) -> _SupportProfile | None: + """ + Describe which datasets can be loaded as tabular data. + + Parameters + ---------- + show + Print a densely formatted repr *instead of* returning a mapping. 
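# --- [editor's sketch] --------------------------------------------------------------
# Typical use of the finished `profile`; the output below is abridged and the exact
# split depends on the chosen backend and which optional packages are installed.
from altair.datasets import Loader

load = Loader.from_backend("pyarrow")
load._reader.profile(show=True)
# {'unsupported': ['7zip', ...],
#  'supported': ['airports', 'cars', ...]}
# --- end sketch -----------------------------------------------------------------------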
+ """ + relevant_columns = set( + chain.from_iterable(impl._relevant_columns for impl in self._read) + ) + frame = self._scan_metadata().select("dataset_name", *relevant_columns) + it = (impl._include_expr for impl in self._read) + inc_expr = nw.any_horizontal(*it) + result: _SupportProfile = { + "unsupported": _dataset_names(frame, ~inc_expr), + "supported": _dataset_names(frame, inc_expr), + } + if show: + import pprint + + pprint.pprint(result, compact=True, sort_dicts=False) + return None + return result + def _query( self, name: Dataset | LiteralString, suffix: Extension | None = None, / ) -> nw.DataFrame[IntoDataFrameT]: @@ -298,15 +313,12 @@ def _solve( raise implementation_not_found(meta) -# TODO: Review after finishing `profile` -# NOTE: Temp helper function for `Reader.profile` def _dataset_names( - frame: nw.LazyFrame, - *predicates: OneOrSeq[IntoExpr], - **constraints: Unpack[Metadata], -): + frame: nw.LazyFrame, *predicates: OneOrSeq[IntoExpr] +) -> Sequence[Dataset]: + # NOTE: helper function for `Reader.profile` return ( - frame.filter(*predicates, **constraints) + frame.filter(*predicates) .select("dataset_name") .collect() .get_column("dataset_name") From 845b3eec47a8c1125ce4b2e1977e1c995eea6d61 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 31 Jan 2025 15:21:15 +0000 Subject: [PATCH 190/201] test: Use `Reader.profile` in `is_polars_backed_pyarrow` --- tests/test_datasets.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 429f4b16a..2bef2ed70 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -115,16 +115,15 @@ def is_url(name: Dataset, fn_url: Callable[..., str], /) -> bool: def is_polars_backed_pyarrow(loader: Loader[Any, Any], /) -> bool: - """User requested ``pyarrow``, but also has ``polars`` installed.""" - # NOTE: Would prefer if there was a *less* private method to test this. - from altair.datasets._constraints import is_meta - - if is_loader_backend(loader, "pyarrow"): - items = is_meta(suffix=".json", is_spatial=True) - impls = loader._reader._read - it = (some for impl in impls if (some := impl.unwrap_or_skip(items))) - return callable(next(it, None)) - return False + """ + User requested ``pyarrow``, but also has ``polars`` installed. + + Both support nested datatypes, which are required for spatial json. 
+ """ + return ( + is_loader_backend(loader, "pyarrow") + and "earthquakes" in loader._reader.profile()["supported"] + ) @backends From 869d2161bde45d59582c687e574fbfe0f7efe776 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 31 Jan 2025 18:47:20 +0000 Subject: [PATCH 191/201] feat: Clean up, add tests for new exceptions --- altair/datasets/_exceptions.py | 68 ++++++++++++++++------------------ altair/datasets/_reader.py | 10 +---- tests/test_datasets.py | 29 ++++++++++++++- 3 files changed, 62 insertions(+), 45 deletions(-) diff --git a/altair/datasets/_exceptions.py b/altair/datasets/_exceptions.py index 2f9c13d45..3b377f657 100644 --- a/altair/datasets/_exceptions.py +++ b/altair/datasets/_exceptions.py @@ -28,16 +28,25 @@ def from_url(cls, meta: Metadata, /) -> AltairDatasetsError: @classmethod def from_tabular(cls, meta: Metadata, backend_name: str, /) -> AltairDatasetsError: - install_other = None - mid = "\n" - if not meta["is_image"] and not meta["is_tabular"]: - install_other = "polars" - if meta["is_spatial"]: - mid = f"Geospatial data is not supported natively by {backend_name!r}." - elif meta["is_json"]: - mid = f"Non-tabular json is not supported natively by {backend_name!r}." - msg = f"{_failed_tabular(meta)}{mid}{_suggest_url(meta, install_other)}" - return cls(msg) + if meta["is_image"]: + reason = "Image data is non-tabular." + return cls(f"{_failed_tabular(meta)}{reason}{_suggest_url(meta)}") + elif not meta["is_tabular"] or meta["suffix"] in {".arrow", ".parquet"}: + if meta["suffix"] in {".arrow", ".parquet"}: + install: tuple[str, ...] = "pyarrow", "polars" + what = f"{meta['suffix']!r}" + else: + install = ("polars",) + if meta["is_spatial"]: + what = "Geospatial data" + elif meta["is_json"]: + what = "Non-tabular json" + else: + what = f"{meta['file_name']!r}" + reason = _why(what, backend_name) + return cls(f"{_failed_tabular(meta)}{reason}{_suggest_url(meta, *install)}") + else: + return cls(_implementation_not_found(meta)) @classmethod def from_priority(cls, priority: Sequence[_Backend], /) -> AltairDatasetsError: @@ -70,36 +79,24 @@ def _failed_tabular(meta: Metadata, /) -> str: return f"Unable to load {meta['file_name']!r} as tabular data.\n" -def _suggest_url(meta: Metadata, install_other: str | None = None) -> str: - other = f" installing `{install_other}` or" if install_other else "" +def _why(what: str, backend_name: str, /) -> str: + return f"{what} is not supported natively by {backend_name!r}." + + +def _suggest_url(meta: Metadata, *install_other: str) -> str: + other = "" + if install_other: + others = " or ".join(f"`{other}`" for other in install_other) + other = f" installing {others}, or use" return ( - f"\n\nInstead, try{other}:\n\n" + f"\n\nInstead, try{other}:\n" " from altair.datasets import url\n" f" url({meta['dataset_name']!r})" ) -# TODO: -# - Use `AltairDatasetsError` -# - Remove notes from doc -# - Improve message and how data is selected -def implementation_not_found(meta: Metadata, /) -> NotImplementedError: - """ - Search finished without finding a *declared* incompatibility. - - Notes - ----- - - New kind of error - - Previously, every backend had a function assigned - - But they might not all work - - Now, only things that are known to be widely safe are added - - Should probably suggest using a pre-defined backend that supports everything - - What can reach here? 
- - `is_image` (all) - - `"pandas"` (using inference wont trigger these) - - `.arrow` (w/o `pyarrow`) - - `.parquet` (w/o either `pyarrow` or `fastparquet`) - """ +def _implementation_not_found(meta: Metadata, /) -> str: + """Search finished without finding a *declared* incompatibility.""" INDENT = " " * 4 record = f",\n{INDENT}".join( f"{k}={v!r}" @@ -107,5 +104,4 @@ def implementation_not_found(meta: Metadata, /) -> NotImplementedError: if not (k.startswith(("is_", "sha", "bytes", "has_"))) or (v is True and k.startswith("is_")) ) - msg = f"Found no implementation that supports:\n{INDENT}{record}" - return NotImplementedError(msg) + return f"Found no implementation that supports:\n{INDENT}{record}" diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index 195607fe5..cacb903f2 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -35,11 +35,7 @@ from altair.datasets import _readimpl from altair.datasets._cache import CsvCache, DatasetCache, SchemaCache, _iter_metadata from altair.datasets._constraints import is_parquet -from altair.datasets._exceptions import ( - AltairDatasetsError, - implementation_not_found, - module_not_found, -) +from altair.datasets._exceptions import AltairDatasetsError, module_not_found from altair.datasets._readimpl import IntoFrameT, is_available if TYPE_CHECKING: @@ -308,9 +304,7 @@ def _solve( if _is_err(fn_or_err): raise fn_or_err.from_tabular(meta, self._name) return fn_or_err - if meta["is_image"]: - raise AltairDatasetsError.from_tabular(meta, self._name) - raise implementation_not_found(meta) + raise AltairDatasetsError.from_tabular(meta, self._name) def _dataset_names( diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 2bef2ed70..81ee5e3f3 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -352,6 +352,33 @@ def test_reader_missing_dependencies() -> None: _import_guarded(backend) # type: ignore +def test_reader_missing_implementation() -> None: + from altair.datasets._constraints import is_csv + from altair.datasets._reader import reader + from altair.datasets._readimpl import read + + def func(*args, **kwds) -> pd.DataFrame: + if TYPE_CHECKING: + return pd.DataFrame() + + name = "pandas" + rd = reader((read(func, is_csv),), name=name) + with pytest.raises( + AltairDatasetsError, + match=re.compile(rf"Unable.+parquet.+native.+{name}", flags=re.DOTALL), + ): + rd.dataset("flights-3m") + with pytest.raises( + AltairDatasetsError, + match=re.compile(r"Found no.+support.+flights.+json", flags=re.DOTALL), + ): + rd.dataset("flights-2k") + with pytest.raises( + AltairDatasetsError, match=re.compile(r"Image data is non-tabular") + ): + rd.dataset("7zip") + + @backends def test_reader_cache( backend: _Backend, monkeypatch: pytest.MonkeyPatch, tmp_path: Path @@ -527,7 +554,7 @@ def test_all_datasets(polars_loader: PolarsLoader, name: Dataset) -> None: rf"Unable to load.+{name}.png.+as tabular data", flags=re.DOTALL | re.IGNORECASE, ) - with pytest.raises((AltairDatasetsError, NotImplementedError), match=pattern): + with pytest.raises(AltairDatasetsError, match=pattern): polars_loader(name) else: frame = polars_loader(name) From 7bb6f9e7a8c98152f17a2ec9c57141cc58171148 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 1 Feb 2025 12:19:58 +0000 Subject: [PATCH 192/201] feat: Adds `Reader.open_markdown` - Will be even more useful after merging https://github.com/vega/vega-datasets/pull/663 - Thinking this is a fair tradeoff vs inlining the 
descriptions into `altair` - All the info is available and it is quicker than manually searching the headings in a browser --- altair/datasets/_reader.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index cacb903f2..ec1f00ba5 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -213,6 +213,36 @@ def url( msg = f"Expected 'str' but got {type(url).__name__!r}\nfrom {url!r}." raise TypeError(msg) + # TODO: (Multiple) + # - Settle on a better name + # - Add method to `Loader` + # - Move docs to `Loader.{new name}` + def open_markdown(self, name: Dataset, /) -> None: + """ + Learn more about a dataset, opening `vega-datasets/datapackage.md`_ with the default browser. + + Additional info *may* include: `description`_, `schema`_, `sources`_, `licenses`_. + + .. _vega-datasets/datapackage.md: + https://github.com/vega/vega-datasets/blob/main/datapackage.md + .. _description: + https://datapackage.org/standard/data-resource/#description + .. _schema: + https://datapackage.org/standard/table-schema/#schema + .. _sources: + https://datapackage.org/standard/data-package/#sources + .. _licenses: + https://datapackage.org/standard/data-package/#licenses + """ + import webbrowser + + from altair.utils import VERSIONS + + ref = self._query(name).get_column("file_name").item(0).replace(".", "") + tag = VERSIONS["vega-datasets"] + url = f"https://github.com/vega/vega-datasets/blob/{tag}/datapackage.md#{ref}" + webbrowser.open(url) + @overload def profile(self, *, show: Literal[False] = ...) -> _SupportProfile: ... From 760eb66a2f96acf5e6a0f4271a163cb066639ae1 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 1 Feb 2025 20:55:02 +0000 Subject: [PATCH 193/201] docs: fix typo Resolves https://github.com/vega/altair/pull/3631#discussion_r1937938282 --- altair/datasets/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 3c61eda0b..efdd85c3c 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -1,7 +1,7 @@ """ Load example datasets *remotely* from `vega-datasets`_. -Provides over **70+** datasets, used throughout our `Example Gallery`_. +Provides **70+** datasets, used throughout our `Example Gallery`_. You can learn more about each dataset at `datapackage.md`_. From cc6d7573fc453c70a01753166a754de5f961a888 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 3 Feb 2025 21:55:42 +0000 Subject: [PATCH 194/201] fix: fix typo in error message https://github.com/vega/altair/pull/3631#discussion_r1938474543 --- altair/datasets/_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index ec1f00ba5..f75a523d4 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -506,7 +506,7 @@ def _into_implementation( } if impl := mapping.get(primary): return impl - msg = f"Package {primary!r} is not supported by `narhwals`." + msg = f"Package {primary!r} is not supported by `narwhals`." 
raise ValueError(msg) From 6c93eb01bbcea3edd9afa69a90efc93f4d5d4364 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 5 Feb 2025 18:27:45 +0000 Subject: [PATCH 195/201] refactor: utilize narwhals fix https://github.com/narwhals-dev/narwhals/pull/1934 --- altair/datasets/_constraints.py | 18 ++---------------- altair/datasets/_reader.py | 3 +-- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/altair/datasets/_constraints.py b/altair/datasets/_constraints.py index fbfd9cbc8..395a9d906 100644 --- a/altair/datasets/_constraints.py +++ b/altair/datasets/_constraints.py @@ -2,8 +2,6 @@ from __future__ import annotations -import functools -import operator from collections.abc import Set from itertools import chain from typing import TYPE_CHECKING, Any @@ -61,23 +59,11 @@ def collect(**kwds: Unpack[Metadata]) -> Metadata: return dict(self) def to_expr(self) -> nw.Expr: - """ - Convert constraint into a narhwals expression. - - Notes - ----- - Workaround for `issue`_ is performing the reduction with ``stdlib`` - - .. _issue: - https://github.com/narwhals-dev/narwhals/issues/1897 - .. _discussion: - https://github.com/vega/altair/pull/3631#discussion_r1934313255 - """ + """Convert constraint into a narwhals expression.""" if not self: msg = f"Unable to convert an empty set to an expression:\n\n{self!r}" raise TypeError(msg) - exprs = (nw.col(name) == val for name, val in self) - return functools.reduce(operator.and_, exprs) + return nw.all_horizontal(nw.col(name) == val for name, val in self) def isdisjoint(self, other: Iterable[Any]) -> bool: return super().isdisjoint(other) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index f75a523d4..8be37d365 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -262,8 +262,7 @@ def profile(self, *, show: bool = False) -> _SupportProfile | None: chain.from_iterable(impl._relevant_columns for impl in self._read) ) frame = self._scan_metadata().select("dataset_name", *relevant_columns) - it = (impl._include_expr for impl in self._read) - inc_expr = nw.any_horizontal(*it) + inc_expr = nw.any_horizontal(impl._include_expr for impl in self._read) result: _SupportProfile = { "unsupported": _dataset_names(frame, ~inc_expr), "supported": _dataset_names(frame, inc_expr), From 790ff10deb678e2d6d6bb7de3627df5b1e66b646 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 5 Feb 2025 18:52:27 +0000 Subject: [PATCH 196/201] refactor: utilize `nw.Implementation.from_backend` See https://github.com/narwhals-dev/narwhals/issues/1888 --- altair/datasets/_reader.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index 8be37d365..8fbaf657d 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -361,9 +361,9 @@ def csv_cache(self) -> CsvCache: @property def _metadata_frame(self) -> nw.LazyFrame: - ns = self._implementation.to_native_namespace() data = cast("dict[str, Any]", self.csv_cache.rotated) - return nw.maybe_convert_dtypes(nw.from_dict(data, native_namespace=ns)).lazy() + impl = self._implementation + return nw.maybe_convert_dtypes(nw.from_dict(data, backend=impl)).lazy() @overload @@ -493,17 +493,8 @@ def _into_implementation( backend: _NwSupport | _PandasAny | Requirement, / ) -> nw.Implementation: primary = _import_guarded(backend) - mapping: Mapping[LiteralString, nw.Implementation] = { - 
"polars": nw.Implementation.POLARS, - "pandas": nw.Implementation.PANDAS, - "pyarrow": nw.Implementation.PYARROW, - "cudf": nw.Implementation.CUDF, - "dask": nw.Implementation.DASK, - "duckdb": nw.Implementation.DUCKDB, - "ibis": nw.Implementation.IBIS, - "pyspark": nw.Implementation.PYSPARK, - } - if impl := mapping.get(primary): + impl = nw.Implementation.from_backend(primary) + if impl is not nw.Implementation.UNKNOWN: return impl msg = f"Package {primary!r} is not supported by `narwhals`." raise ValueError(msg) From 8e538480b7298d7ad9df8d98f987cc0b0352e244 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 5 Feb 2025 19:25:02 +0000 Subject: [PATCH 197/201] feat(typing): utilize `nw.LazyFrame` working `TypeVar` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Possible since https://github.com/narwhals-dev/narwhals/pull/1930 @MarcoGorelli if you're interested what that PR did (besides fix warnings 😉) --- altair/datasets/_cache.py | 2 +- altair/datasets/_loader.py | 5 ++--- altair/datasets/_reader.py | 19 +++++++++++-------- altair/datasets/_readimpl.py | 14 ++++++++------ 4 files changed, 22 insertions(+), 18 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 13dca2f23..eb22cc36e 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -309,7 +309,7 @@ class _SupportsScanMetadata(Protocol): def _scan_metadata( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] - ) -> nw.LazyFrame: ... + ) -> nw.LazyFrame[Any]: ... class DatasetCache: diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 9b55daf70..d1db0fb9d 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -14,7 +14,6 @@ import pandas as pd import polars as pl import pyarrow as pa - from narwhals.stable import v1 as nw from altair.datasets._cache import DatasetCache from altair.datasets._reader import Reader @@ -58,13 +57,13 @@ def from_backend( @classmethod def from_backend( cls, backend_name: Literal["pandas", "pandas[pyarrow]"], / - ) -> Loader[pd.DataFrame, nw.LazyFrame]: ... + ) -> Loader[pd.DataFrame, pd.DataFrame]: ... @overload @classmethod def from_backend( cls, backend_name: Literal["pyarrow"], / - ) -> Loader[pa.Table, nw.LazyFrame]: ... + ) -> Loader[pa.Table, pa.Table]: ... 
@classmethod def from_backend( diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py index 8fbaf657d..4f974fef0 100644 --- a/altair/datasets/_reader.py +++ b/altair/datasets/_reader.py @@ -306,13 +306,13 @@ def _merge_kwds(self, meta: Metadata, kwds: dict[str, Any], /) -> Mapping[str, A return kwds @property - def _metadata_frame(self) -> nw.LazyFrame: + def _metadata_frame(self) -> nw.LazyFrame[IntoFrameT]: fp = self._metadata_path return nw.from_native(self.scan_fn(fp)(fp)).lazy() def _scan_metadata( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] - ) -> nw.LazyFrame: + ) -> nw.LazyFrame[IntoFrameT]: if predicates or constraints: return self._metadata_frame.filter(*predicates, **constraints) return self._metadata_frame @@ -360,7 +360,7 @@ def csv_cache(self) -> CsvCache: return self._csv_cache @property - def _metadata_frame(self) -> nw.LazyFrame: + def _metadata_frame(self) -> nw.LazyFrame[IntoFrameT]: data = cast("dict[str, Any]", self.csv_cache.rotated) impl = self._implementation return nw.maybe_convert_dtypes(nw.from_dict(data, backend=impl)).lazy() @@ -373,7 +373,7 @@ def reader( *, name: str | None = ..., implementation: nw.Implementation = ..., -) -> Reader[IntoDataFrameT, nw.LazyFrame]: ... +) -> Reader[IntoDataFrameT, nw.LazyFrame[IntoDataFrameT]]: ... @overload @@ -392,7 +392,10 @@ def reader( *, name: str | None = None, implementation: nw.Implementation = nw.Implementation.UNKNOWN, -) -> Reader[IntoDataFrameT, IntoFrameT] | Reader[IntoDataFrameT, nw.LazyFrame]: +) -> ( + Reader[IntoDataFrameT, IntoFrameT] + | Reader[IntoDataFrameT, nw.LazyFrame[IntoDataFrameT]] +): name = name or Counter(el._inferred_package for el in read_fns).most_common(1)[0][0] if implementation is nw.Implementation.UNKNOWN: implementation = _into_implementation(Requirement(name)) @@ -429,9 +432,9 @@ def infer_backend( @overload def _from_backend(name: _Polars, /) -> Reader[pl.DataFrame, pl.LazyFrame]: ... @overload -def _from_backend(name: _PandasAny, /) -> Reader[pd.DataFrame, nw.LazyFrame]: ... +def _from_backend(name: _PandasAny, /) -> Reader[pd.DataFrame, pd.DataFrame]: ... @overload -def _from_backend(name: _PyArrow, /) -> Reader[pa.Table, nw.LazyFrame]: ... +def _from_backend(name: _PyArrow, /) -> Reader[pa.Table, pa.Table]: ... 
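# --- [editor's note] --------------------------------------------------------------
# The narwhals helper adopted in PATCH 196, checked in isolation (assumes a
# narwhals release that ships `Implementation.from_backend`).
from narwhals.stable import v1 as nw

assert nw.Implementation.from_backend("polars") is nw.Implementation.POLARS
assert nw.Implementation.from_backend("pandas") is nw.Implementation.PANDAS
# `_into_implementation` treats the `UNKNOWN` fallback as unsupported and raises.
# --- end note ---------------------------------------------------------------------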
# FIXME: The order this is defined in makes splitting the module complicated @@ -512,7 +515,7 @@ def _into_suffix(obj: Path | str, /) -> Any: def _steal_eager_parquet( read_fns: Sequence[Read[IntoDataFrameT]], / -) -> Sequence[Scan[nw.LazyFrame]] | None: +) -> Sequence[Scan[nw.LazyFrame[IntoDataFrameT]]] | None: if convertable := next((rd for rd in read_fns if rd.include <= is_parquet), None): return (_readimpl.into_scan(convertable),) return None diff --git a/altair/datasets/_readimpl.py b/altair/datasets/_readimpl.py index cc4c01e07..1a5840167 100644 --- a/altair/datasets/_readimpl.py +++ b/altair/datasets/_readimpl.py @@ -53,8 +53,8 @@ R = TypeVar("R", bound="nwt.IntoFrame") IntoFrameT = TypeVar( "IntoFrameT", - bound="nwt.NativeFrame | nw.DataFrame[Any] | nw.LazyFrame | nwt.DataFrameLike", - default=nw.LazyFrame, + bound="nwt.NativeFrame | nw.DataFrame[Any] | nw.LazyFrame[Any] | nwt.DataFrameLike", + default=nw.LazyFrame[Any], ) Read = TypeAliasType("Read", "BaseImpl[IntoDataFrameT]", type_params=(IntoDataFrameT,)) """An *eager* file read function.""" @@ -214,15 +214,17 @@ def scan( return BaseImpl(fn, include, exclude, kwds) -def into_scan(impl: Read[IntoDataFrameT], /) -> Scan[nw.LazyFrame]: - def scan_fn(fn: Callable[..., IntoDataFrameT], /) -> Callable[..., nw.LazyFrame]: +def into_scan(impl: Read[IntoDataFrameT], /) -> Scan[nw.LazyFrame[IntoDataFrameT]]: + def scan_fn( + fn: Callable[..., IntoDataFrameT], / + ) -> Callable[..., nw.LazyFrame[IntoDataFrameT]]: @wraps(_unwrap_partial(fn)) - def wrapper(*args: Any, **kwds: Any) -> nw.LazyFrame: + def wrapper(*args: Any, **kwds: Any) -> nw.LazyFrame[IntoDataFrameT]: return nw.from_native(fn(*args, **kwds)).lazy() return wrapper - return BaseImpl(scan_fn(impl.fn), impl.include, impl.exclude, {}) + return scan(scan_fn(impl.fn), impl.include, impl.exclude) def is_available( From 2c3b44dab018a3b9e3386ef21be3ba058a9c8ff6 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 6 Feb 2025 19:06:29 +0000 Subject: [PATCH 198/201] docs: Show less data in examples --- altair/datasets/_loader.py | 58 +++++++++++++------------------------- 1 file changed, 19 insertions(+), 39 deletions(-) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index d1db0fb9d..cc72fb950 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -108,21 +108,15 @@ def from_backend( Using ``pandas``, backed by ``pyarrow`` dtypes:: load = Loader.from_backend("pandas[pyarrow]") - cars = load("cars") + co2 = load("co2") - type(cars) + type(co2) pandas.core.frame.DataFrame - cars.dtypes - Name string[pyarrow] - Miles_per_Gallon double[pyarrow] - Cylinders int64[pyarrow] - Displacement double[pyarrow] - Horsepower int64[pyarrow] - Weight_in_lbs int64[pyarrow] - Acceleration double[pyarrow] - Year timestamp[ns][pyarrow] - Origin string[pyarrow] + co2.dtypes + Date datetime64[ns] + CO2 double[pyarrow] + adjusted CO2 double[pyarrow] dtype: object .. 
_polars defaults: @@ -174,8 +168,8 @@ def __call__( source.columns ['year', 'source', 'net_generation'] - source - shape: (51, 3) + source.head(5) + shape: (5, 3) ┌────────────┬──────────────┬────────────────┐ │ year ┆ source ┆ net_generation │ │ --- ┆ --- ┆ --- │ @@ -186,12 +180,6 @@ def __call__( │ 2003-01-01 ┆ Fossil Fuels ┆ 36234 │ │ 2004-01-01 ┆ Fossil Fuels ┆ 36205 │ │ 2005-01-01 ┆ Fossil Fuels ┆ 36883 │ - │ … ┆ … ┆ … │ - │ 2013-01-01 ┆ Renewables ┆ 16476 │ - │ 2014-01-01 ┆ Renewables ┆ 17452 │ - │ 2015-01-01 ┆ Renewables ┆ 19091 │ - │ 2016-01-01 ┆ Renewables ┆ 21241 │ - │ 2017-01-01 ┆ Renewables ┆ 21933 │ └────────────┴──────────────┴────────────────┘ Using ``pandas``:: @@ -202,21 +190,13 @@ def __call__( source.columns Index(['year', 'source', 'net_generation'], dtype='object') - source - year source net_generation - 0 2001-01-01 Fossil Fuels 35361 - 1 2002-01-01 Fossil Fuels 35991 - 2 2003-01-01 Fossil Fuels 36234 - 3 2004-01-01 Fossil Fuels 36205 - 4 2005-01-01 Fossil Fuels 36883 - .. ... ... ... - 46 2013-01-01 Renewables 16476 - 47 2014-01-01 Renewables 17452 - 48 2015-01-01 Renewables 19091 - 49 2016-01-01 Renewables 21241 - 50 2017-01-01 Renewables 21933 - - [51 rows x 3 columns] + source.head(5) + year source net_generation + 0 2001-01-01 Fossil Fuels 35361 + 1 2002-01-01 Fossil Fuels 35991 + 2 2003-01-01 Fossil Fuels 36234 + 3 2004-01-01 Fossil Fuels 36205 + 4 2005-01-01 Fossil Fuels 36883 Using ``pyarrow``:: @@ -226,15 +206,15 @@ def __call__( source.column_names ['year', 'source', 'net_generation'] - source + source.slice(0, 5) pyarrow.Table year: date32[day] source: string net_generation: int64 ---- - year: [[2001-01-01,2002-01-01,2003-01-01,2004-01-01,2005-01-01,...,2013-01-01,2014-01-01,2015-01-01,2016-01-01,2017-01-01]] - source: [["Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels",...,"Renewables","Renewables","Renewables","Renewables","Renewables"]] - net_generation: [[35361,35991,36234,36205,36883,...,16476,17452,19091,21241,21933]] + year: [[2001-01-01,2002-01-01,2003-01-01,2004-01-01,2005-01-01]] + source: [["Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels"]] + net_generation: [[35361,35991,36234,36205,36883]] .. 
_Path.stem: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem From 51a967aef63e6c934b1b227cc9214776c3a5c699 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 7 Feb 2025 17:13:59 +0000 Subject: [PATCH 199/201] feat: Update for `vega-datasets@3.0.0-alpha.1` Made possible via https://github.com/vega/vega-datasets/pull/681 - Removes temp files - Removes some outdated apis - Remove test based on removed `"points"` dataset --- altair/datasets/_metadata/metadata.csv.gz | Bin 3632 -> 3595 bytes altair/datasets/_metadata/metadata.parquet | Bin 9208 -> 9174 bytes altair/datasets/_metadata/schemas.json.gz | Bin 2471 -> 2461 bytes altair/datasets/_typing.py | 3 +- altair/utils/schemapi.py | 2 +- pyproject.toml | 6 +- tests/test_datasets.py | 1 - tools/datasets/__init__.py | 61 +++++---------------- tools/datasets/_metadata/datapackage.json | 1 - tools/datasets/models.py | 8 --- tools/datasets/npm.py | 40 ++++++-------- tools/generate_schema_wrapper.py | 2 +- 12 files changed, 37 insertions(+), 87 deletions(-) delete mode 100644 tools/datasets/_metadata/datapackage.json diff --git a/altair/datasets/_metadata/metadata.csv.gz b/altair/datasets/_metadata/metadata.csv.gz index 30793abc86eee5f4255edba76dd1d9b739e8d66e..50fef1d82d1f766438ec99a64c1849996887fdee 100644 GIT binary patch literal 3595 zcmV+m4)pOKiwFn+00002|7~S-VPs)+VJ>5Hb^xtf$(9?r5xm#8G$rhdPk#2*N8j}l z$V4|sY!=N$vi$XqBCEJ`Yc!=-eUSikYbFB7B?1MeBOT=UWlOK}bU3xP{_j&;Z{)I2 zejMfSv_5=UztXb|gCmvGhW5koS^D!zzdQeZ*wJxKn_>9Jq2E5e5Z3C8ywcO*g`Q6P zO&9;S-aUQVZJ(cp(RpBV$WO=pNuC;Q4uAD$@F{SpN~IJlyOg|AIaeJWIa7)bF%pZa zPcO&g?(nzIpDS-a{c&K~tiSC)ZRPlRyLDPBh0Rx_*stA z{+}n8vCZ20*|Fn`PKf(rQXAr=c~>c?DgpV-#;9gOCT@Bu`WwIz`QZWjVT}5uVxAp5 zlvNhvTF8lP!EV=JW3@_Z_7X)$Zc$qJ5a@>q`~JAzUZD?@vFt=Qd(SyhGE|5-av|4p zW$d^(WeRvS6(uTG;uZtH!s{ySrwt#5<8CYEvYB0F*bjk>vQJ7=%TmQ~usNn|5b&jh z=CJ#`5cL&aS9`On{kF>XxJNSv;B$VyTfSs%@-EKK*^;96wXmVYs%o2+Q50?C;@DW- zQejjhOG~-@3@Uh?Fkj2{K&JU+n$bdM7f_{8_kAw0X;RrmkElX1MPel56iL#`1c_!b zk$Z?;-LGGTCiv&cm<2k`&jP~@f~N*;l-Fc*6KkBYkzDW;4ErWZm!;`@2e0GxfeuI6 zkCp#A9U~|?H9LX^)gzn&Du0TqK*MFuLLt`dnq?%46cnPR3g`i1$NTlSHDu;xv&Y$3 z^P<0`oGy7V7b0$^v?p7FW#8rhoX{;8qcnh;@t^e;u_79`;K*wOx zeBG}?hP=xzIf`B~ClT+-7|4jEI7F17TuohC|0j4|gT|n{ zFtmsHC@xo=cVymk`8xSB_?s}hKO6W0qlybq7A*u<1eI1hEG6rbH}L%tStK7uwU zMDBQGKRf%gF`DaeOA(`iSwk6+kW>*{Q_?5}C}FmK*onMhVP8>jY5$)S49{)tx}>ek zRv9z9dFSc`hmsU&l(g(ohsD=WYIQiBnl-dwMPc|LIyb0SM*i)&`0-pE`uP0tO3vp7 z;U~6G8d|nV#lusmk{ep0O)8iiefl6G{>{0J_cIqY^=9$B7QB!1k3|8v)o3YMOGfTj z+mc1@qm-e9Wr?sx1cUuSZ0>+|t?!S8Hk;{G#_1$XRD?5~oh%BLC`Hi7vE--&L1AS< zOJ`rQF0LT-$S<|j4;Z^=+P$vPWA-D1>-;i}-l*C7Fx6t3YS2j;QEsvo`05tntyXir3x%Ywr0>^$tz>gH+_M9 zQqJZekTRa?hjDw4w|ic{j`zmH=sZY9>-qYzHszIPjz)?%-lrB?)P-TdZ3vc|Yo;Z$ z^u}pI2f)XdQyCB9Jf7~1{cl{84F+8W#~f0u)+8a*%CP_F>A~q(DGa<8gP!np&zm0k zu{z4I-Dm8#g^T0EX3Z~7njG-Kb@+-PLBok$ITy4Nh z?ds(myw$VM!9YFP)QsgwK8m$DH7B`g86*kLpbQb!mInXNfOdcMWilbda1^)5IXC;# zGGw-CxGA#=h84nzO-ex$B40T;SB~0;BujnA6<)WY)qdSiAAP3Tc>?qN8=A?9K#4<0m*rLjN4@1N${UGPLy(Qtj(1q*|g zRnaP_WC|mJGNJW&A}%_82&YG&;M9LT?Y=zY=yp@$+%H|SXa03_bOSFS{C@LGtG&HV6!1wpM3nRU6J| zX*qj?vU}~fw6E7%Hcc);HxrSR|Z0sOby zE!<@??!C=9r`i5OY|_9xeCTj!b`>QUZX^1vlfwa^nCn{oV%K_yu-o?gSI8>EBl_!n zx;li6n0-BEm9;8JHG`|lt-`h{cqX>wo5?i~rpcM5*SC)lyS~WbEThhV$Jy8_2H81R ze=roF$-=6D#c+FLl@TbgWL#135$GF4b*cWi$Jg~Qn>yjf@G|nz&c5cz#d5ZS{*2&<`t)p5{BPU~89C@sQ( zN@|UQhcHxVc1UDISY7Ic{se8e>o@D;x*|DhKYW%fr{UY`^{{^a(=qCl3A1lK9T8F^ zH?rB(nw?81t_>ocSo@;1vr#vfm*&c^0E6r9v>CsCkMyR6>ES_`&nP6-*rLzH7_A;! 
zhr@vxU9C1IlGaTboNHUm=mFy$i7$v#=q%YEzPz0FuvWC4-r@YkFHK+h=Q|c#h(nN; zwJzD#;Peoax*3IXuTV8oLD07RVPyXlcXtpv9yhZ3j=x`qJGz0)o5k~*OGeMXQ&G(( z$j8D}L#tFTo?&8|wI#FTSWqBEiC@a_CqUmaTD|nA{pQE&(zFggCHUrd&c9;yaNj$d zFnjOd@ww(WypU8uw}AL9AtP5?M2je>L0mqT@K1nuhaC+2NOKroZp7?w+(arVCdF1_ zH7dfU!PXd@K8Hxz4E+MRS|RP0a+%OwAP--=?s1>zm(bHbhx8jyXkcIz+8HZvfK@Y? zc%L(cnzO=xjZiMzhrqvr><-7iex1VWJf7a{x%oTqVN z_TEQ?-DxYYyG{S&73Tlx(-B*Kzj9u%F53B}i#1i!xef)daka&nsB+2y=?!xb*(RUT z;@2B*Q9ck^QQh~4!)m?d)8YO_)~|nQdaCHrQD#dCj$wEtQ8TQjqMX$5fDOz88Ujd6g)rQVonLJQzZxc+$z+B7#Cfz3{%9Qo5Nya z4;Z`C@C4y?B5h-X6sb?{{`B&m8W?#g|+O!O}$WCBMovSDf#TrMDL9%A>PGVj%;HqiX> zHxx>gxCSI(jlwZ)(8?q%h7k*`o%R|e-Te_Wid`;wl23Wc2^ zi=w2g#9BENLP+J7?eM6e>fi)_Ccl`;JD}b77tAIi!+3gpmu9=R%Hl$C7IM(wFl+^v z$XuJ`-1z(%iBSWV=E4nK_Z>~+o77`JJ(ay%7$XTqa{k9t;gCX6g@U#;+lC+zF^Y6{TFvbiq$~2lu%K5&ZEJQ_Rg(}@*{e{= z@bCanmOkcs1G4+ydRKd%Z1Oa_7!IYAO~2uvEW;g&HboS4J9IDG&B literal 3632 zcmV-04$tu)iwFn+00002|7~S-VPs)+VJ>5Hb^xtf$&%bSmb~*@GE3N(K8;OZ`#ATq z2fT+&mLh}Eu}H1{`f)~(v1CeBR8c0gOaKv*+`+fFgJ3$+L5{y~=}n#vr`G!aJhgr! zmxc1}D2J#1@O%G8FES2}R8AY(kHZ%k&MU+2{QIz@V^5oL_{U+`KD`pw>Z`ob)8Un# zPWw$4|J(1Le%ozdp2pF6U~|Y%$Nfp38f^}L^)vVsI8>!lij`eTUa6d`j*gruMTZ!P zMb)R*<8gQR$EQz~x4->yVA=Fv_P=fA`03^K(-(Q6^N(k~kU#v-8}-{yMwv7bK-DisRsb)%?!a@97{Y_*i^ytP&t?`^W_?}NTa z*U@%+qRsPeFLm7Q^RswZyDXUy%$3BS;=c@|&JwLtp|+}0(}9_S*Iru^t+f^A2y%V} ziw_{d1C#)vVZ;%&dcW?T9f3cvMOM03pZ6$2wM9<&XF%M8BsCCHR$iK zb%pjD`8L76OfKW(@SiLMow$0%V!X;Fh|aM}_SLwWQ_G4oYbx5N714f&tSkMV566A~ zS*Do3TR6|PG1i*AIPP^xMQ2rWsme%;rIse160)ELjbud(*%sEoe?-`o6TirD-u3h3 zGPYS;zc^TY(FrkLOlo6bH18_qR3#v7*%;MK$iz*r0Di+Zfj&H-K1>mwRLqN`g|fYf&F&*!yF@y`ml`Q(=j2@lJE1WT+5vr#)86WwW@BupfgK<(!nJmZgf}U~^2_Ah=5j&0){^ z?*qOf>*`>h>#(h|J?_z3f%Tk*AC@o4nY@dOQ?sO~eJyM#v8vi;WfVo*xHvXew^SJQ z#?lHJ-$MyXC(L`fK8AUInPxP^#nn+M)N`LpY?@Sd(IbpdDUk%pI7O1QGC`tQ!{HuX zSNHv!(2V>%nIc}NbVLA2DMjq;j|ZeooyHj)dTf??l8>9QhA?~rxeALwwD{Z!Gf z(NI|Jtp<*7;b-eGt^bm-bEuLmmHFrTT z-EV7IQL8?-f=u@kP-3hk3Kdcj8VL0zqqF7}uiUeVYj7j8K0J@7oKyjddT~-(gHTp2MbDBV zMNX|c4XrlnE(?{IQwe1S+$*vR;AfVdY+1JTRt!IWa1D2*7B?tWioi}V5kakMS7AJ& zEf!ThvaR^MF}b{w%14;s)xW$R56@bCz7o2wOyW1AGB2hQp|b2)Ta<&jlhp7W;5b}B zsmaC@r(&beyoSmnz$*u%K9CrfrY@BbTFWJaiw54pj9RSuW)qEN)3a_aRT6j(!w?bX z7tpvuyyNka#^>2R!HR^%-PS6|7=LRtvSSn<@grthtnvzDwqe$8ME1ttU0`?8nl=lBV!JFLHu1ngwY?U#K zTXC*V@Fn5MJBP!ZN~abhUAmlVMu$C9HC1eKHp4Vrz) zy10T^BfrA#KEUfmMe{-FGQ58 zSnS`&{)p}d{`pmCb9{a7w{>`vOEmr9!}Dh;sO3k7RI0!dWNStpmAo<*t{9*+jLAjj?RN*v|cX%YExc$=4hmN<9%wOMO_%?*oI)axn^22D^7l9 zV8D8OJ(X$u&*S-s*8jvw)o7koaLggWy_+OtS{Zj89XdE2D}|BL8oV>K?qxF|7oLwY zZt@xSZQYK04a2d|q0{+z5+)LatR z#JD(d1u|5b#&DGiT|g`f4Qv`)n-Mi`G?gBOb4A&_gVy!KS9+F>RM`32AHQ8T{{8A@ z_r2ANci%t(+0=~XNInX{r{*L#Eu*%;&X+NS+KNE`9>(sEq0FXX98R(nspS@*7KY3= zjfYiM!BIl|ut_OMLX;~9YspdjkYt5VxFYK|^t|tf`Bln1J5OM2e?s>-fpcfbg~SH3 zr$uZzoS6v;UI6~4No18)oSocb>o&udhwc9k`{DHRI`8-6;?!@ki{?*v`;HrDHEy!^ zt#K2tgQW5yd$vGp67rO#6t}3V>Ra>dV z9`ZB!TqJ;oG{G7QY4dBzJVKn<{NJg_zxu5_vbnBKmJ`(_E`2t-7!7ZR4Y2T1Srx5< zN~SQ9B@hS~`yyL1*}U+Wr252ezA&)gc{5Nv;!2Ypk6s2 z>uM4=eBE1rkUf>nHKc9}v$D3)EPrQ8si+1cmR*g`AjfkP8w9p3TdT9Ps*O)(MZtOp zuzMS}w6E7{HBT?Iu>6=h!zwp&DUw6AtthOip;i+qA}qp7>);Hx70Bvb%Ft-)W4uILhc$eL_KeE9MH z3*#K0)Kdv{hqFGdOv< zUzBz>>gIA@!|KOaL34N7OfR}8X7j>)vmh)d6Ow9hlR;vPR*zlGA+n6FRvQ!X@R~9R zz_y0W1H3!@zay6*?__`Y{q?kmaii^gspc=fG{4JVZi;Ln4nbPhx@22}(_`M~W)#|w zLOn|biPiG2;Q2$y-9hMh+{p7+{C*uz_C_A>7SC%g8NK{qMm3utSPNGTom9PehEHnN zmduW0LDd!|eg(PjV0=&J`E@w$H{YHw{qyi$ZtuQx{)*M(vG8K9?7a(`xaK(CWmG}8 zfPgI_BQIM-%POdjTz(nX?_ll@JNW;J;W)e;rrDo3x>Qn3imk+IRD=nHb251593o{i z_AlgRg`!+RV+M9XJHGb1hkTx2Vojqir=PeK1CON8Sy_1ln3_St`y4By^l 
z3;0KD^!>_t!MbRd7bn(K$38!nw8qsIXQIjw1ju5TkjOUql-AtLcn|P_!!yw8cz+z4bc2*?^OWrE9ffyHEungbDV4A}kP7m<9(|FO~ZN`3{%xPon z^+wZQ9#XZ2&6f}nc%d2AQj&*SGpJbwWvXe4L#4#`^TIbg5V|x#frE zDP~{9$J(@vhs{plb)BoI0L2 zELUq_WW4gJrktHN)vAWJVJM=1IMO94wZ@u#A7uAs*g${{6O(B+-$=Z@ieGR}g{P-I z+FdKvx4^~11)$-1AE_{+jfj;rUQvth(RB|Z@m?Hi1EC&<W@w-Zoc#2{CBTWP)bT zP`(L)%8fvunsnQN1r8=qe)F;acSwSgP3 z?kk$X7pcc#zRG&HFog~(;PTg|!XbsC3I%OxJVW8}$c-}?d1X=fbwX?ND^Awu86I!W$%^|}@9=g1?RRzH z*~iYai$PF2+4K`u$1*&eXj4Q%Y@_yP-!w8ID&>jUm8i5TE5P5w>;4}(t~#USHUI$4 CB{VSr diff --git a/altair/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet index 293c93a975929f24b4ccd533d4cb475df654bf9a..840a44a53e4ea0947667311432f299cf492ddf9b 100644 GIT binary patch delta 5472 zcmb_gc|6qLyZ?+~j4>F?j2XtRiLtMhgrw}d%Ffu0eQ86+64}F)6xkw6WKEJ5LzZMo ziv=z)=bZDr&w1WwQS*hSfYE*v86+*T6oGsP z)k7wMkN^OP+?4umj{+;O0xh{VDFT8Proa)v2!Ki0XG46YAuS3_A6F{+^PLz^zF#?2 z(sbcU6^Xmie{D|c+Q=DP>ATbJGgj{^jXS5>Lyulbff4y;)Li_g^2I~7$L{RJGbq>62PsUFsI80$(kTD62weI zP0RinVi$jAr&cfABNhIR*#)uxI!%*e?72iQ9yvVDu=VA7Eng@x=4!(f#rVFwPBNkS zLMFLco%2+<@kN=p7JIIWkJGl=?HSH1XO-kj(lYb5U~6ExFWmi+*|70Qk(zRq_|e=O zt$k1sG1O1`-O$&dovY=(JfdvKtKpAG%M&js%d&AY>j$eB%#+gdibUvH&*+`;Wcg*V<15fTUU)iH@g zror?!QroPKleA16K=hEU0Dr)%q;Y*YE&pI)4+HfBpnZU~`^0=fgem8pU}74RHT*DO z4wwK~5-81sw`*Z&fAx|RyZpWELM7{dhH8xXN4s?gOM@Z`{IqD*(wj_b-7rb?Pn4MY z@BiuuNw&yru^l`U^PD_jV)1CQ0NxfeOXPh4*4;?w5kF~b5<08WzvB#Pf(h!uUsw(=U;&YJT` zt?%pLlF_tgDSNeCvkGXv+pgnA8dqn|@BHiR8l4={WpX9ll){9QH6Ajn z^(HGbHbszkVgGcxg9)|Kn5b0oJ%Yp=4Hwb@^H9ugpx`w<4cE(*4Q``gAO);*?@;;7BqN>a4>K*fD?0RhK&EEzKD}Z}YlO(Ak)i{TB+LdP)d>5xOg738+Nl<%jOus6!5c;F*3W%QciL|5nK} zxVU>d+j$eboPSn*7#dXv$AytG^aAKG2IK(vw;ERi?VtxV$R_kwAa(K$dNHs?GeCYz zF9_-(gP>>`%_4*)-#ZpWJxg-VLPrH(e7?T@bksR78j4y*2ko}3wg^4JjsA~>rsjx6 z(*B|)ox`a>=YXI2#W14g>2NjVuiRV=)PVd**$saC!O#a0;lL)pK>$dN$_6S?0n82n zfq+98r!})@cJ9q_$o`F)#j|}sk0^q_;~*_<$C;|0&P!!Yx&6bJSftynNN#t zhteV+$RO*`zm=Ii;{|CTsA>_#Tu}fPCZQ8cJN%mnu$P`7BK60WV4{po<4~O+%e}=D zN;{W3HawI+#>kUiB}W`hy5e0Jb-^!HAWxNpu8(eB<%zVxT^TD^-<*2VFtJ;=J&BPp zn-rO68UVQk?N@!@=O*ZMZm!UKS?69())jQ6A6gXjno)wHvuI^##8{wddSFc!j}i(O z`Vt(})pRDuJhR1-H_vOfl9nDyjTxNZ>lhF^Hh*(`Jjdbe!ivgNfbKlM3Jj}~#vvDy z=$fAw;t{=+K89lIN`K_MzB|5A!~>h9q?LSO(l3s<))73x-q4;>RY%dP!e8N8O1ExG z>$*5hkn?P4s+*dm(EI3LmrvJ%woTpTzT;ep6|e4GrsYnNHt-jjYddvezLpZD>NtWM z+?TYAkD*I`b*ehSA?~J?*zFfhW28%s3T-2{yg1w0x1S$o7wSWVx^W@HA9XJ_v*q?M z)pRg~H+yDt*L%g2FGkdVz%08*l{^Z>w-1{==6<P_i?Ex=6f4%PA3)n0%$GbJfu428fXN*bdlb6Ig zdMd8@vP(V>4G|m7R-Qm*yEh#zAAZIi7Vy- z+8o71p-U6dp5~A7xsR1Pn*v)#K-U=lB%@Riw`(5GqFgf|su`C)i@$25|d@e_6J zy=zd1gvLksnJL=#`-KsDwY(+K_~-kwR2_B32xGVOqFG8f^304`K4d>sJZ4>7!WaL@ zHEMSfr)u{++dv`~zi7B}d1wY*n7*3n2~O=@<=Cskd7Riqx16SQ-s^BRLVx@VPXeMU zh4p~8)3~hI{L0maHQT|)T}&3z1D2<vD%nilrKw~kxZZ?0HHP6f|?Tw6GJ266E+MbkL>!+qsy}A= zCO1w0DC*KPMu|xBGh32O$jaGj-q(9-KLwUsVvb8vbYtEW>wYsR5`co#a}aDAb*u>W zWWI`vO8oaW?lM+T7@(G{ml3oF03iV8F)r>HV22JC6ti0!+*_rgmTbZafRdkr;8Zu@ zQfu@c!O3y~J3;DMztwM(dstomXVZY)Y8t=+ZiJs#L7P0Pmm60hp-g_sYEGi|#XVNY zXNQ>YNxQYq~1bINdMrdPH}#obUkRFv8R!j#ShQ^5PWqf4I1- zx+`OM2bz?|M~lz`0$85vvf?OE^y1aQL7k8GwBaw+c`uHMATRnz-{GJ&dG}?Vx=vb9 z{0b6(?Xx`tWN7Ez=$pMYAD#S*?2jM&zBsbf7QJGdpvY)cVY%hTAx3yYoV#n$pb^vm=v)V zze~EZbpV3p+-@Rv?!h*Zz&zIaNa+Eq1|A|YH(qU^**&kfy%PZ4_#%FHXpiI z1p^7r2=Cwv-?OgYJ~$<%AsovV3zv=xeB!;DpV0D=WiQ+KdhYwIN8je{Po6GHF%LIx z!IY$2!yE^P38&g4Z-E5e%s9?KxgyrrpNB$7GKW}=TEN+=ANQ0O(7uWX{KfWo9@2J` zyi41}HdxBmQccJNHEnv+Mz_fJDdlLDxTkf!vc^vv?$c9!KA_j>PdOYtR_7VbKF_Q< zM|y%>5<9`BT|=*5dH{CVQ$;$wXI%46@D#itzGX-Aep58p%HD;$r@D;9vREu{ 
z(5Zip8SW0xiqhI$moEyQ)!gr`JY9yirsLd3q0J7L+GFaFjZhdlZ(lE|Kuu0kT2fkq;OXl|kd%?y7?uih_jUAfat@Gmq{_`h8#wiUh1*31O8R=cQt7|( zQt{u^@ox=3_?W-vPwu|7Lw#exjq7Z5WB;cd`Tmk)bu=dd1o|-@>A$%CT!zi*V9-DE z`dfUm9fq3}!+@Lyp{NUh!J<~-)OUklb}a@OLG#P5;72ZRj;9GBiNA(>gtyEj+?LtA zbIV*6*)r?#TjsRbmKk%18M(aqmA6Nb%Vc`4FAT{b3>GthTZWNWxwy8!O;ArNXCxZ+ zzy_%Z;If$iIavSg0|e^E27cZKDGT80xc|ZbLCvvjY%zRWagp)#Ok${(jUYv~$U^|G zMDTBNi<|Uz!~L5rQU$<;31c^!}ThS zN_M60z-0oGde7rtT5bdU|8rXNe??RPKoxD|_R;})j|Mj6<;&~Q-HvQhOQFDo7N L8v_7rJgfc}7rzt& delta 5537 zcmb_gc|4SB|9&3(ZY<9%vdmbvVuY1I3*$88?$4UWWfUd)! zd6s=p1_%KFAk`%Q`5>B}zylbH;xaL!&bEL9UwIOK=pvDQn)&nvZF_!JzRlhfe{=G?xd=$R`@FQ*p!aavDm6dft7Wm+#zvNG!Z&G&7JF>_sXd>hEQ-)XNQ#i%|^S4;Wv8(~JK zijtjsPK>x-$P7^<>2#Ql zY)qTnnw+V31yC#$5I)2UPhqD{(z!F4LkBldM)YWsq&m2XqK6Vojjc+JGn7nhO3{}5 zCQ0HZlu~gM8D>70*(RzZK|Zfp%S)z5aq{INnp_82jWHX7&LJ_?FDJj+9W7k+{c`v2WkP%Uavov z#aJU+!@YAjojDuL4t!t{aHW=E;IwW6 z!Rj{oW)k0J-3w=(dV`H-7n5J4wiyXK;J16c$yGL?q~P&0Pk|7Y4@;7WQ4i9Qh-s z^uxA$i61R+h#AN5`oeJSMVm=M(#z}P0z+{x_2o#lvU%oE$rwl5_?oqdHq2*{z_CyQ6Cs3o?+0$xtl{2MJdhT+QF;Vl*c(GT#~6J^ENO!Tg7bN z{{2}F-Wh|0%G0~KDwDQhdqgsYb(zCXF84gN-9`}!ZE$NEjK+L2;%7NeG24#cgV7#rDrapw-H5Bc@woA>3(12?=HCkG43m!J%`ER8?-tkZ1;Faz|zV9>D5 zEn(W+1OVf5n48EnH>`m>@6R!l{C82+NY?un-Ts};yM-JSwkj55!TzUWu}#Gc-%6Iz z?RYSO_?Oyq?C?Y;>;ko(MTkihQrcJv6j+fko+My%B>(|%H$YuwK?%_c`~ch+J0rGH z-XMqsY1(LC{{e%xz*b1>*Kn&}NIoDHBM9Hb7E1gItG@*cL~S&C&XO(aUn|aMHIC>X+S$Wq@)Op=sn# z(=dfySpkq~vO#dwmC=inkf~JzL0x#Zhe2Qf7zjRX#cYFH$66*QbWvt*XluQ{)RkVO zt>mIvpmd^`-a6E9aH9U+tHwG-b4C642_@54u1`t|^j4^nJYI5e+1NY3IxanS1d}p( z#3I;xXc~@L^;{D$BhS~b-#5ecA?{z~6I0<5lQ08%`;<@oijnpLZ$A|CF5vW~$GZ(m9pXwx? z;twm(!bvp7YLi0hau%PfU0arTloaIYJ0$<8-OfeGX8uj&lw_oTpK!NPbk_}vTYT-? z6q~Go!JQ(z9+h;UtXt+qJQa|H)Gljg)&^c}bIU=}@z*C}RMoBE_DB=EvWsFORG){# zpR6=}<5Q0!RfS!br>niPnfO>i_4ZSpy z&t0iY9MU8^`T6ZF5pqf6%l4FsT_tx`Bv(cyJ#0@`KN>ozn!g%D86K;AG_zMCX3>Pv zeo`bvaHi{y3#Nu6j+xZnF3nKM-4-MwJlH$(?8*+=!lVZ&!+KT*877$D9nPWd4Z z#ibi?eeBfr>4a-2qW$2F9lOTldPXDM9mw0PXYRK{oui@UsU~T2sAq*;L_*aml@GoK zVd2_rBtC5-0VOcP@84NAo*vM%Mt$~ZW|!aci^j7(cKPqmTexA6S}W$ZN+&Wsnq^;2 z!OpcKO3L_rKlLmY?{O=Sema#T&pmLl^xW_ zu`+Q|w)g2jdDfc=*;l@}QL^ZWN&B-X9{mNk2&u{x4}M3pM(^rQ{P zojFa6ZZG2^I6aWnr)q>Vi;;Z~bzAV}UAdLK^Ygk??X}UPZ_M=Im+dB>No7t8{C+0l zLICT8y@M~@Tsra7(ly6J%qiMm!X14%VnONvcPeQonZ^08V*99?}$O+30n<~~^zAlBZm>zy;S4zMo2a0K&o ze|F~`z3v4htKihpnhqOA`}&go@;jB2cc>!?%{7Ph^&B$bHG5az3HQR`-0rv_8HBzkG09(R+)W!b`36H zlCbD4iH*w@ned$D&SKwHfjGb@hy!( zdh>D&3#eNJ)K*P1GpE69uu! 
za#UwtXHtee)X)CY8h?}j5gZ6T?}oAIMdHq6vwP!Wqd2?!`Es;k z@fCHB!#=H?ncI7><%J0MIFfqn=F!JARk#)37+t%TJ=F31ty=mGf`)0&6Vxra56Ae` zR!OwZ;G#oO2F~|K#wE|GEJyEIHmSu|E4}&@jLc$YsUP2W`{iRQR$vs{J!EOo_B4-A zkV!b-P{bV_ZA!++wsP}&)$OLEeED-W=794_4tvUV?K5fvNUq-NYIP*%d&ku52aV-x= zmaF3E^s8-{A}U{&T4xof@S`(8tO?>(@BYfe5$l0s#p@y(3l;PJx8qcqqN2--_qoB` z;*(>}nGvJJVXRh=s%W4CoXi3vPk$jD5!r*xi@cp&%Bh1SNqyk#F1_}o{w*Th?0rv5GUgC7{i6S~BBI zrrY8NlfDz; z;;)Pr%9xA`C6zGqw%!_vIZT*~-Fq|rfxWlzWpQPGgW~r=E*%n8okALA;&o!7YM~(8 z@Qs)A4FVjLxrXksHI=z*$Pegv1!2jUm_Go)wExZ@(1!$lJ0Mb=;FH|gmAxB>v;V6=fnN)}Xo0|k;MZns|8>@1E#GVg3i(wC z>7Vkq_7v?*PV>wtL3kD`6h*gTpJ{OpCsYo8i)T{|z^5551@UftqKffG9!K1U8GEdPhKD>pcg8 z1YaOPNO+Yiqyk{uxdgxaxBvA510L@|!_T-viU4+T+YkIN{2%wmD0BSiH);eE*LHZ} z21gbSG6b+6MFpY%=Eyp>ZR5JMkv@)tv;pjpxFCgF=O_uu&v$7`!D_oesu%Y`4x3(hag0iXNR=^&J4kaHxg&RCQ?j)4C3Ezs*XG1 z&pl}a5bp-jc6jVVr!D#auSKodOdqZ@^-P?--PnCgoV{&kE|)Y(Z+lM~TB0OwC{jaG zalAYE?*%EzmH_HDeQ5V74nP7R2m&8qe>-8qG}<_O^7rK1iC~VwhZF0JQ02+rk`u*h zKBZ*B?WYv$n8S;7tDHgWBy+;Gd0492ddp3qYE@$LE6|y;nSX}=f?*Hb2yUn1{_X!2 z%JH%j^am2KWk&LS*uEEB=2NxL3?$-{>^Uu`c z3YXllC!XBsoXhEgk43>Vmt54|Ux_Y~oYkyEQht%5tXy){dpY-i$HL)*6pAOe4L8hb zGu?e_L@8AAKz}dku40yZrA1IMWquYim!jawgZRqlSPjp}wLFTNcPBy4-DvBQTUT-H z;=s7)4TNHh^Qaw@$zKo)bn`s9zIaIBa8F8y2tF(b*VN`EcU}1C)?_@{fD1}(_Q^(f z;Wn1nEy;!@H1PWsw%rbk;vz;1B2q3&uCjfSQ44ww-G6e~8qV6-mYx&Ql7$x-RnxQR zrbiHFTd;cG##^;Ww1Mj_76=a(+rEOF|(w zpA(zUiOuK4=5u88<3Gs?a)xtqhI4X;6*zTS+cDEai?LxrLf59`XamAB-D)4=cCkEtn%u#e%wiZnC5pic}KxT{}0{of7q$}raY*B zyM!Sjk%yzT+d@T&W~3ZmyCrfeUGvm&R!_-n$Mz9&SPk8}A^FO$CydvSAeuT~wzlG;9vOujqD zMNt4Ci6{@__w=4;Tr|#3UH531vVdi5S{PlY7pB?U??C#eM&thJ|=!bGSIq zP5pNq`^b@x0??94v%YmUz1@K)E>Qq?F1=PB%I7V7+@Ls%n-uv*=*a96edfp)GQ@A4 z@pnUz&)GQ=Q&FEZNV~l%~XoAwPYkENdahZ@Y4ifH9zhj0&%Cn?TXv0WGV1OZ%4~i98-Lum4gM<{Pam!`-5IZQiFf;EuXCwq&S%$lFTE!XSE3}w6saL8 zuQwV0_kxtPD*@ESeQ5I(2Ot3u1c49Gemh~pG}<_O^7rK1iC~VwA1Brsp~{oLB`1p2 zd`iiL+fOOhF^3oFRyl*#N#=xW^RQI4^_H7J)vCngSD-UxGk*>L3x+*#Bez?Kpg)j+Ei;nu!}h)4GM}n_W*`xtWY3vFQmvSrEeIAS_y2w3o!2fp7bd?EUjZjG ztZ>NR{gvn<$yv=xB;^+=%E~2Iy_a+U?^rneA%)_}ZNm+7 z+Dv!f8c_|#s<2-7|Wbzlp0^K}Mt}h-EINX!cA%YJJ!Zo#d$z2ydx-}V3HsFF%n|-p; zUAT?qbxX2g2@U*yg>ARPqPU3Bf{2uhlB;Z=WYmJ5Lw~niwuZAdwx#Dpv}EB0M%DBz zy6F*w*%qvxxA9gj5^dmmi^ajcBMHm(hciNXM<_oN%6mfjKq%*gvLcin*?e|-dP*ouLOCOpcZBjYp}Z%Q4}@|~ zC@VrClS={`e@kNX*(ollA(eE+wEO!HT8Im6c*bmu^~_|8;ls8yPN*RAl4TC?141@; zT!~Cb85WIz!Gklrak}1TnDZNtQ9l`0r%+vm@`5KSK5^Arbk>ZS?w}Mc+z82S3Pn8q z#3aBXTnzVMYtD^D^ebT-$ueGZ<))_nLmSH*y(1O(e>3+5KNTvKTTAN$mmWw6jZQjv zegGgHvcTulXNyd$42(I*_;!M^%jWiMDx2kBTI-;qPdTp#EGc@hl>#UnPWq?7t>6#< zvInapxCy`|{W!_(k>Xnj`&4Zv^Czw7AQhb^L&%%WQ?Br|-G z^M*qVt8BPV8!!{v?U?DI#n`YQp=(oev;kq6ZnY0_yI7t+O>W|N6aeFFPu=dAA_i$3 z>0!Ht1ao*M5!H~dd);r>Gg1z(-4Z#Ku6gP>tEXhPWBUj>tcGsgkbLFW6UJ*u5KSGb*`nf1x@sX5 zB=+ukM~4Ro{#bsucbbzM-L&%Hn$cpzOX~519o?LOX1Ay@ne@d9F9pqrGFv!er6!mo zQ(V<LDtRWnnam;x2{8bOcSte)yudT}6REv#|um0e@uLqk%`wI6XERAc4pr zBB6us-WgccZQa0q2p=x#HS|wT6tIrszvC)8Ubt47SEXFKsQ?YaRdC}67p_H@e+TJr zpGIcn9pj=X0J=n!he3UM&oeF>XQxMfG)!4QJvJ?juG5RwRRCYYokl?VNK3x94{a(2fv;KGupaE~c1clW&TbAJ~fC&$BEQY{aQ;(A_jcM(O~ z6ogp*LJ&@mVrFEg@HmCyKD4XWfOnZ1<|XthC^VnzfDTeosTr%7SY)A21BzMfJO=f> z8dQ0?Uvy>D&UWUFho-GT@58qt9=sFAPN*DZLZmoaGeJux8sQ?7O0O!d*-MOy=eJ0^ zryP`y(|@>a-k+W>`mY54y9&ZK;dmd z+Wh$7zoBy1H4u1%0>4Wga@B%m9^yi`pSSin)U}b%`>{zQ%K>I>B~QaCFxG+w^H12$u$-sP7kob$ z>yjkY2k8W<1-nW^s|4**8g@-D2*5BCGR8r|{pokia7cNUbQEnE-UtjZgz}*g%e;k9 zOn+qpiXQj|>QBUMxQznRChYaAK>OA!2Fh?>cmszZINI8c!sgaWE__1nR%+|B<}iK2 zA0p;Nl)l8O;nU0X>U;pFS;bUIaumFPP_q^p3tGDeFLB16rj3~&6-_^CyiQWJml;FE zKe%e_;9lGuR2Z0JDF7evrxUO1IdsJb=QX`?bQ9*g^wW~rFno#D|C|Qt{VE#&6Oc|R e{3nh4Amr`8>H4pJIySlI*Z%-?tB3`rCIA3g6Wq4| 
diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index 7c524f2ec..3357ddf3b 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -69,7 +69,6 @@ "ohlc", "penguins", "platformer-terrain", - "points", "political-contributions", "population", "population_engineers_hurricanes", @@ -151,7 +150,7 @@ class Metadata(TypedDict, total=False): ``Metadata`` keywords form constraints to filter a table like the below sample: ``` - shape: (73, 13) + shape: (72, 13) ┌────────────────┬────────┬────────────────┬───┬───────────────┬───────────────┐ │ dataset_name ┆ suffix ┆ file_name ┆ … ┆ sha ┆ url │ │ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- │ diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index b1395d4fc..d75cdb593 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -1684,7 +1684,7 @@ def with_property_setters(cls: type[TSchemaBase]) -> type[TSchemaBase]: ], str, ] = { - "vega-datasets": "main", + "vega-datasets": "3.0.0-alpha.1", "vega-embed": "6", "vega-lite": "v5.21.0", "vegafusion": "1.6.6", diff --git a/pyproject.toml b/pyproject.toml index 9fa203f37..b9edc7ea2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,9 +104,9 @@ doc = [ [tool.altair.vega] # Minimum/exact versions, for projects under the `vega` organization -vega-datasets = "main" # https://github.com/vega/vega-datasets -vega-embed = "6" # https://github.com/vega/vega-embed -vega-lite = "v5.21.0" # https://github.com/vega/vega-lite +vega-datasets = "3.0.0-alpha.1" # https://github.com/vega/vega-datasets +vega-embed = "6" # https://github.com/vega/vega-embed +vega-lite = "v5.21.0" # https://github.com/vega/vega-lite [tool.hatch] build = { include = ["/altair"], artifacts = ["altair/jupyter/js/index.js"] } diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 81ee5e3f3..f112cacb8 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -227,7 +227,6 @@ def test_load_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: "ohlc", "penguins", "platformer-terrain", - "points", "political-contributions", "population", "population_engineers_hurricanes", diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 6c8c75fe5..a7c1d06c4 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -20,7 +20,7 @@ import types from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, ClassVar, Literal from tools import fs from tools.codemod import ruff @@ -40,9 +40,7 @@ else: from typing_extensions import TypeAlias - _PathAlias: TypeAlias = Literal[ - "typing", "metadata-csv", "metadata", "schemas", "datapackage" - ] + _PathAlias: TypeAlias = Literal["typing", "metadata-csv", "metadata", "schemas"] PathMap: TypeAlias = Mapping[_PathAlias, Path] __all__ = ["app"] @@ -54,33 +52,19 @@ class Application: - """ - Top-level context. - - Parameters - ---------- - out_dir_tools, out_dir_altair - Directories to store metadata files. - out_fp_typing - Path to write metadata-derived typing module. 
- - See Also - -------- - - tools.datasets.npm.Npm - """ - - def __init__( - self, out_dir_tools: Path, out_dir_altair: Path, out_fp_typing: Path - ) -> None: - fs.mkdir(out_dir_tools) + """Top-level context.""" + + OUT_DIR: ClassVar[Path] = fs.REPO_ROOT / "altair" / "datasets" + + def __init__(self) -> None: METADATA = "metadata" + out_meta = self.OUT_DIR / "_metadata" self.paths = types.MappingProxyType["_PathAlias", Path]( { - "typing": out_fp_typing, - "metadata-csv": out_dir_altair / f"{METADATA}.csv.gz", - "metadata": out_dir_altair / f"{METADATA}.parquet", - "schemas": out_dir_altair / "schemas.json.gz", - "datapackage": out_dir_tools / "datapackage.json", + "typing": self.OUT_DIR / "_typing.py", + "metadata-csv": out_meta / f"{METADATA}.csv.gz", + "metadata": out_meta / f"{METADATA}.parquet", + "schemas": out_meta / "schemas.json.gz", } ) self._npm: Npm = Npm(self.paths) @@ -89,9 +73,7 @@ def __init__( def npm(self) -> Npm: return self._npm - def refresh( - self, tag: Any, /, *, include_typing: bool = False, frozen: bool = False - ) -> pl.DataFrame: + def refresh(self, tag: Any, /, *, include_typing: bool = False) -> pl.DataFrame: """ Update and sync all dataset metadata files. @@ -101,17 +83,9 @@ def refresh( Branch or release version to build against. include_typing Regenerate ``altair.datasets._typing``. - frozen - Don't perform any requests. - - .. note:: - **Temporary** measure to work from ``main`` until `vega-datasets@3`_. - - .. _vega-datasets@3: - https://github.com/vega/vega-datasets/issues/654 """ print("Syncing datasets ...") - dpkg = self.npm.datapackage(tag=tag, frozen=frozen) + dpkg = self.npm.datapackage(tag=tag) self.write_parquet(dpkg.core, self.paths["metadata"]) self.write_json_gzip(dpkg.schemas(), self.paths["schemas"]) self.write_csv_gzip(dpkg.metadata_csv(), self.paths["metadata-csv"]) @@ -226,9 +200,4 @@ def generate_typing(self, dpkg: datapackage.DataPackage) -> None: ruff.write_lint_format(self.paths["typing"], contents) -_alt_datasets = fs.REPO_ROOT / "altair" / "datasets" -app = Application( - Path(__file__).parent / "_metadata", - _alt_datasets / "_metadata", - _alt_datasets / "_typing.py", -) +app = Application() diff --git a/tools/datasets/_metadata/datapackage.json b/tools/datasets/_metadata/datapackage.json deleted file mode 100644 index df9d40e85..000000000 --- a/tools/datasets/_metadata/datapackage.json +++ /dev/null @@ -1 +0,0 @@ -{"name": "vega-datasets", "description": "Common repository for example datasets used by Vega related projects. \nBSD-3-Clause license applies only to package code and infrastructure. Users should verify their use of datasets \ncomplies with the license terms of the original sources. 
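The `tools/datasets/__init__.py` changes above replace the three-path constructor with a parameter-free `Application`, deriving every output location from `Application.OUT_DIR` and dropping the `frozen`/local-datapackage workaround. A minimal usage sketch of the updated refresh flow, assuming the refactored module shown in this patch (the tag string is illustrative and not taken from the diff):

```python
# Hypothetical maintenance run against the new upstream release.
# `app` is the module-level Application() instance added in this patch;
# the exact tag string accepted by Npm.datapackage() is an assumption here,
# and running this needs the repo's dev environment (polars, network access).
from tools.datasets import app

core = app.refresh("v3.0.0-alpha.1", include_typing=True)

# refresh() is annotated to return a polars DataFrame; per the hunk above it
# also rewrites metadata.parquet, metadata.csv.gz and schemas.json.gz under
# Application.OUT_DIR / "_metadata", plus altair/datasets/_typing.py when
# include_typing=True.
print(core.shape)
```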
Dataset license information, where included, \nis a reference starting point only and is provided without any warranty of accuracy or completeness.\n", "homepage": "http://github.com/vega/vega-datasets.git", "licenses": [{"name": "BSD-3-Clause", "path": "https://opensource.org/license/bsd-3-clause", "title": "The 3-Clause BSD License"}], "contributors": [{"title": "UW Interactive Data Lab", "path": "http://idl.cs.washington.edu"}, {"title": "vega-datasets contributors", "path": "https://github.com/vega/vega-datasets/graphs/contributors"}], "version": "2.11.0", "created": "2025-01-12T14:23:04.938086+00:00", "resources": [{"name": "7zip.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "7zip.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "hash": "sha1:6586d6c00887cd48850099c174a42bb1677ade0c", "bytes": 3969}, {"name": "airports.csv", "type": "table", "path": "airports.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:608ba6d51fa70584c3fa1d31eb94533302553838", "bytes": 210365, "schema": {"fields": [{"name": "iata", "type": "string"}, {"name": "name", "type": "string"}, {"name": "city", "type": "string"}, {"name": "state", "type": "string"}, {"name": "country", "type": "string"}, {"name": "latitude", "type": "number"}, {"name": "longitude", "type": "number"}]}}, {"name": "annual-precip.json", "type": "json", "description": "A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell.", "sources": [{"title": "Climate Forecast System Version 2", "path": "https://www.ncdc.noaa.gov/data-access/model-data/model-datasets/climate-forecast-system-version2-cfsv2"}], "path": "annual-precip.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:719e73406cfc08f16dda651513ae1113edd75845", "bytes": 266265}, {"name": "anscombe.json", "type": "table", "description": "Graphs in Statistical Analysis, F. J. Anscombe, The American Statistician.", "path": "anscombe.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:11ae97090b6263bdf0c8661156a44a5b782e0787", "bytes": 1703, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Series", "type": "string"}, {"name": "X", "type": "integer"}, {"name": "Y", "type": "number"}]}}, {"name": "barley.json", "type": "table", "description": "The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites.\n\nIt was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper \"Statistical Determination of Barley Varietal Adaption\".\n\nR.A. 
Fisher's popularized its use in the field of statistics when he included it in his book \"The Design of Experiments\".\n\nSince then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s.\n", "sources": [{"title": "The Design of Experiments Reference", "path": "https://en.wikipedia.org/wiki/The_Design_of_Experiments"}, {"title": "Trellis Charts Paper", "path": "http://ml.stat.purdue.edu/stat695t/writings/TrellisDesignControl.pdf"}], "path": "barley.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:8dc50de2509b6e197ce95c24c98f90d9d1ab138c", "bytes": 8487, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "yield", "type": "number"}, {"name": "variety", "type": "string"}, {"name": "year", "type": "integer"}, {"name": "site", "type": "string"}]}}, {"name": "birdstrikes.csv", "type": "table", "description": "Records of reported wildlife strikes received by the U.S. FAA", "sources": [{"title": "FAA Wildlife Strike Database", "path": "http://wildlife.faa.gov"}], "path": "birdstrikes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:1b8b190c9bc02ef7bcbfe5a8a70f61b1616d3f6c", "bytes": 1223329, "schema": {"fields": [{"name": "Airport Name", "type": "string"}, {"name": "Aircraft Make Model", "type": "string"}, {"name": "Effect Amount of damage", "type": "string"}, {"name": "Flight Date", "type": "date"}, {"name": "Aircraft Airline Operator", "type": "string"}, {"name": "Origin State", "type": "string"}, {"name": "Phase of flight", "type": "string"}, {"name": "Wildlife Size", "type": "string"}, {"name": "Wildlife Species", "type": "string"}, {"name": "Time of day", "type": "string"}, {"name": "Cost Other", "type": "integer"}, {"name": "Cost Repair", "type": "integer"}, {"name": "Cost Total $", "type": "integer"}, {"name": "Speed IAS in knots", "type": "integer"}]}}, {"name": "budget.json", "type": "table", "description": "Historical and forecasted federal revenue/receipts produced in 2016 by the U.S. 
Office of Management and Budget.", "sources": [{"title": "Office of Management and Budget - Budget FY 2016 - Receipts", "path": "https://www.govinfo.gov/app/details/BUDGET-2016-DB/BUDGET-2016-DB-3"}], "path": "budget.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:5b18c08b28fb782f54ca98ce6a1dd220f269adf1", "bytes": 391353, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Source Category Code", "type": "integer"}, {"name": "Source category name", "type": "string"}, {"name": "Source subcategory", "type": "integer"}, {"name": "Source subcategory name", "type": "string"}, {"name": "Agency code", "type": "integer"}, {"name": "Agency name", "type": "string"}, {"name": "Bureau code", "type": "integer"}, {"name": "Bureau name", "type": "string"}, {"name": "Account code", "type": "integer"}, {"name": "Account name", "type": "string"}, {"name": "Treasury Agency code", "type": "integer"}, {"name": "On- or off-budget", "type": "string"}, {"name": "1962", "type": "string"}, {"name": "1963", "type": "string"}, {"name": "1964", "type": "string"}, {"name": "1965", "type": "string"}, {"name": "1966", "type": "string"}, {"name": "1967", "type": "string"}, {"name": "1968", "type": "string"}, {"name": "1969", "type": "string"}, {"name": "1970", "type": "string"}, {"name": "1971", "type": "string"}, {"name": "1972", "type": "string"}, {"name": "1973", "type": "string"}, {"name": "1974", "type": "string"}, {"name": "1975", "type": "string"}, {"name": "1976", "type": "string"}, {"name": "TQ", "type": "string"}, {"name": "1977", "type": "string"}, {"name": "1978", "type": "string"}, {"name": "1979", "type": "string"}, {"name": "1980", "type": "string"}, {"name": "1981", "type": "string"}, {"name": "1982", "type": "string"}, {"name": "1983", "type": "string"}, {"name": "1984", "type": "string"}, {"name": "1985", "type": "string"}, {"name": "1986", "type": "string"}, {"name": "1987", "type": "string"}, {"name": "1988", "type": "string"}, {"name": "1989", "type": "string"}, {"name": "1990", "type": "string"}, {"name": "1991", "type": "string"}, {"name": "1992", "type": "string"}, {"name": "1993", "type": "string"}, {"name": "1994", "type": "string"}, {"name": "1995", "type": "string"}, {"name": "1996", "type": "string"}, {"name": "1997", "type": "string"}, {"name": "1998", "type": "string"}, {"name": "1999", "type": "string"}, {"name": "2000", "type": "string"}, {"name": "2001", "type": "string"}, {"name": "2002", "type": "string"}, {"name": "2003", "type": "string"}, {"name": "2004", "type": "string"}, {"name": "2005", "type": "string"}, {"name": "2006", "type": "string"}, {"name": "2007", "type": "string"}, {"name": "2008", "type": "string"}, {"name": "2009", "type": "string"}, {"name": "2010", "type": "string"}, {"name": "2011", "type": "string"}, {"name": "2012", "type": "string"}, {"name": "2013", "type": "string"}, {"name": "2014", "type": "string"}, {"name": "2015", "type": "string"}, {"name": "2016", "type": "string"}, {"name": "2017", "type": "string"}, {"name": "2018", "type": "string"}, {"name": "2019", "type": "string"}, {"name": "2020", "type": "string"}]}}, {"name": "budgets.json", "type": "table", "path": "budgets.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:8a909e24f698a3b0f6c637c30ec95e7e17df7ef6", "bytes": 18079, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "budgetYear", "type": "integer"}, {"name": "forecastYear", "type": "integer"}, 
{"name": "value", "type": "number"}]}}, {"name": "burtin.json", "type": "table", "description": "The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine.\n\nThe dataset compares the performance of three antibiotics against 16 different bacteria.\n\nNumerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness.\n\nThe dataset was featured as an example in the Protovis project, a precursor to D3.js.\n\nAs noted in the Protovis example, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 \u03bcg/ml, and an exaggeration of some values for penicillin\".\n\nThe vega-datsets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together.\n\nThe caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) \nreads as follows:\n\n> ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin\n>\n>\n> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in red and gram- in blue) with their sensitivities to penicillin, and streptomycin.\n>\n> The effectiveness of the antibiotics is expressed as the highest dilution in \u03bc/ml. which inhibits the test organism.\n>\n> High dilutions are toward the periphery; consequently the length of the colored bar is proportional to the effectiveness.\n>\n> It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. vulgaris, S. schottmuelleri and M. tuberculosis.\n>\n> Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to neomycin, although the majority of these are sensitive to neomycin.\n>\n> It also inhibits actinomycetes, but is inactive against viruses and fungi. 
Its mode of action is not understood.\n", "sources": [{"title": "Scope Magazine", "path": "https://graphicdesignarchives.org/projects/scope-magazine-vol-iii-5/"}, {"title": "Protovis Antibiotics Example", "path": "https://mbostock.github.io/protovis/ex/antibiotics-burtin.html"}], "path": "burtin.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d8a82abaad7dba4f9cd8cee402ba3bf07e70d0e4", "bytes": 2743, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Bacteria", "type": "string"}, {"name": "Penicillin", "type": "number"}, {"name": "Streptomycin", "type": "number"}, {"name": "Neomycin", "type": "number"}, {"name": "Gram_Staining", "type": "string"}, {"name": "Genus", "type": "string"}]}}, {"name": "cars.json", "type": "table", "description": "Collection of car specifications and performance metrics from various automobile manufacturers.", "sources": [{"title": "StatLib Datasets Archive", "path": "http://lib.stat.cmu.edu/datasets/"}], "path": "cars.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:1d56d3fa6da01af9ece2d6397892fe5bb6f47c3d", "bytes": 100492, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Name", "type": "string"}, {"name": "Miles_per_Gallon", "type": "integer"}, {"name": "Cylinders", "type": "integer"}, {"name": "Displacement", "type": "number"}, {"name": "Horsepower", "type": "integer"}, {"name": "Weight_in_lbs", "type": "integer"}, {"name": "Acceleration", "type": "number"}, {"name": "Year", "type": "date"}, {"name": "Origin", "type": "string"}]}}, {"name": "co2-concentration.csv", "type": "table", "description": "Scripps CO2 program data ut modified to only include date, CO2, seasonally adjusted CO2. \nOnly includes rows with valid data.", "sources": [{"title": "Scripps CO2 Program", "path": "https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record"}], "path": "co2-concentration.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:b8715cbd2a8d0c139020a73fdb4d231f8bde193a", "bytes": 18547, "schema": {"fields": [{"name": "Date", "type": "date"}, {"name": "CO2", "type": "number"}, {"name": "adjusted CO2", "type": "number"}]}}, {"name": "countries.json", "type": "table", "description": "This dataset combines key demographic indicators (life expectancy at birth and\nfertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year\nintervals. It includes both current values and adjacent time period values (previous and next)\nfor each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) \nnotes that its philosophy is to fill data gaps with estimates and use current\ngeographic boundaries for historical data. 
Gapminder states that it aims to \"show people the\nbig picture\" rather than support detailed numeric analysis.", "licenses": [{"title": "Creative Commons Attribution 4.0 International", "path": "https://www.gapminder.org/free-material/"}], "sources": [{"title": "Gapminder Foundation - Life Expectancy", "path": "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Fertility", "path": "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676", "version": "14"}], "path": "countries.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:0070959b7f1a09475baa5099098240ae81026e72", "bytes": 99457, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "_comment", "type": "string"}, {"name": "year", "type": "integer", "description": "Years from 1955 to 2000 at 5-year intervals"}, {"name": "fertility", "type": "number", "description": "Fertility rate (average number of children per woman) for the given year"}, {"name": "life_expect", "type": "number", "description": "Life expectancy in years for the given year"}, {"name": "n_fertility", "type": "number", "description": "Fertility rate for the next 5-year interval"}, {"name": "n_life_expect", "type": "number", "description": "Life expectancy for the next 5-year interval"}, {"name": "country", "type": "string", "description": "Name of the country"}]}}, {"name": "crimea.json", "type": "table", "description": "This dataset, which informed Florence Nightingale's groundbreaking work in public health, details \nmonthly mortality rates from British military hospitals during the Crimean War (1854-1856). \n\nNightingale credits Dr. William Farr for compiling the data from the 1858 [Medical and Surgical \nHistory of the British Army](http://resource.nlm.nih.gov/62510370R). The dataset categorizes \ndeaths into \"zymotic\" diseases (preventable infectious diseases), wounds/injuries, and other causes. \nCovering the period from April 1854 to March 1856, the dataset includes monthly army strength \nalongside mortality figures. Nightingale transformed this data into her now-famous [polar area \ndiagrams](https://iiif.lib.harvard.edu/manifests/view/drs:7420433$25i). \n\nThe annual mortality rates plotted in the chart can be calculated from the dataset using the formula \n> (Deaths × 1000 × 12) ÷ Army Size. \n\nAs [The Lancet](https://pmc.ncbi.nlm.nih.gov/articles/PMC7252134/) argued in 2020, Nightingale's \ninnovative visualizations proved that \"far more men died of disease, infection, and exposure \nthan in battle\u2014a fact that shocked the British nation.\" Her work also vividly illustrated \nthe dramatic impact of sanitary reforms, particularly in reducing preventable deaths.", "sources": [{"title": "Nightingale, Florence. A contribution to the sanitary history of the British army during the late war with Russia. London : John W. Parker and Son, 1859. Table II. 
Table showing the Estimated Average Monthly Strength of the Army; and the Deaths and Annual Rate of Mortality per 1,000 in each month, from April 1854, to March 1856 (inclusive), in the Hospitals of the Army in the East.\n", "path": "https://nrs.lib.harvard.edu/urn-3:hms.count:1177146?n=21"}], "path": "crimea.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d2df500c612051a21fe324237a465a62d5fe01b6", "bytes": 2183, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date", "description": "First day of each month during the observation period, in ISO 8601 format (YYYY-MM-DD)"}, {"name": "wounds", "type": "integer", "description": "Deaths from \"Wounds and Injuries\" which comprised: Luxatio (dislocation), Sub-Luxatio (partial dislocation), Vulnus Sclopitorum (gunshot wounds), Vulnus Incisum (incised wounds), Contusio (bruising), Fractura (fractures), Ambustio (burns) and Concussio-Cerebri (brain concussion)\n"}, {"name": "other", "type": "integer", "description": "Deaths from All Other Causes"}, {"name": "disease", "type": "integer", "description": "Deaths from Zymotic Diseases (preventable infectious diseases)"}, {"name": "army_size", "type": "integer", "description": "Estimated Average Monthly Strength of the Army"}]}}, {"name": "disasters.csv", "type": "table", "description": "Annual number of deaths from disasters.", "sources": [{"title": "Our World in Data - Natural Catastrophes", "path": "https://ourworldindata.org/natural-catastrophes"}], "path": "disasters.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0584ed86190870b0089d9ea67c94f3dd3feb0ec8", "bytes": 18840, "schema": {"fields": [{"name": "Entity", "type": "string"}, {"name": "Year", "type": "integer"}, {"name": "Deaths", "type": "integer"}]}}, {"name": "driving.json", "type": "table", "sources": [{"title": "New York Times", "path": "https://archive.nytimes.com/www.nytimes.com/imagepages/2010/05/02/business/02metrics.html"}], "path": "driving.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:33d0afc57fb1005e69cd3e8a6c77a26670d91979", "bytes": 3461, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "side", "type": "string"}, {"name": "year", "type": "integer"}, {"name": "miles", "type": "integer"}, {"name": "gas", "type": "number"}]}}, {"name": "earthquakes.json", "type": "json", "description": "Earthquake data retrieved Feb 6, 2018", "sources": [{"title": "USGS Earthquake Feed", "path": "https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson"}], "path": "earthquakes.json", "scheme": "file", "format": "geojson", "mediatype": "text/geojson", "encoding": "utf-8", "hash": "sha1:ed4c47436c09d5cc5f428c233fbd8074c0346fd0", "bytes": 1219853}, {"name": "ffox.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "ffox.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "hash": "sha1:0691709484a75e9d8ee55a22b1980d67d239c2c4", "bytes": 17628}, {"name": "flare-dependencies.json", "type": "table", "path": "flare-dependencies.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:10bbe538daaa34014cd5173b331f7d3c10bfda49", "bytes": 34600, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "source", "type": "integer"}, {"name": "target", "type": "integer"}]}}, {"name": 
"flare.json", "type": "table", "path": "flare.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d232ea60f875de87a7d8fc414876e19356a98b6b", "bytes": 20638, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "id", "type": "integer"}, {"name": "name", "type": "string"}]}}, {"name": "flights-10k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-10k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:769a34f3d0442be8f356651463fe925ad8b3759d", "bytes": 892400, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-200k.arrow", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-200k.arrow", "scheme": "file", "format": ".arrow", "mediatype": "application/vnd.apache.arrow.file", "hash": "sha1:74f6b3cf8b779e3ff204be2f5a9762763d50a095", "bytes": 1600864, "schema": {"fields": [{"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "time", "type": "number"}]}}, {"name": "flights-200k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-200k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4722e02637cf5f38ad9ea5d1f48cae7872dce22d", "bytes": 9863892, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "time", "type": "number"}]}}, {"name": "flights-20k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-20k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:20c920b46db4f664bed3e1420b8348527cd7c41e", "bytes": 1784867, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-2k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. 
Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-2k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d9221dc7cd477209bf87e680be3c881d8fee53cd", "bytes": 178495, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-3m.parquet", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-3m.parquet", "scheme": "file", "format": "parquet", "mediatype": "application/parquet", "hash": "sha1:9c4e0b480a1a60954a7e5c6bcc43e1c91a73caaa", "bytes": 13493022, "schema": {"fields": [{"name": "date", "type": "datetime"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-5k.json", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-5k.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:8459fa09e3ba8197928b5dba0b9f5cc380629758", "bytes": 446167, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "delay", "type": "integer"}, {"name": "distance", "type": "integer"}, {"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}]}}, {"name": "flights-airport.csv", "type": "table", "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", "sources": [{"title": "U.S. Bureau of Transportation Statistics", "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr"}], "path": "flights-airport.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0ba03114891e97cfc3f83d9e3569259e7f07af7b", "bytes": 65572, "schema": {"fields": [{"name": "origin", "type": "string"}, {"name": "destination", "type": "string"}, {"name": "count", "type": "integer"}]}}, {"name": "football.json", "type": "table", "description": "Football match outcomes across multiple divisions from 2013 to 2017, part of a\nlarger dataset from OpenFootball. 
The subset was made such that there are records for all five\nchosen divisions over the time period.", "sources": [{"title": "OpenFootball", "path": "https://github.com/openfootball/football.json"}], "path": "football.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:d07898748997b9716ae699e9c2d5b91b4bb48a51", "bytes": 1207180, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "division", "type": "string"}, {"name": "home_team", "type": "string"}, {"name": "away_team", "type": "string"}, {"name": "home_score", "type": "integer"}, {"name": "away_score", "type": "integer"}]}}, {"name": "gapminder-health-income.csv", "type": "table", "description": "Per-capita income, life expectancy, population and regional grouping. Dataset does not specify \nthe reference year for the data. Gapminder historical data is subject to revisions.\n\nGapminder (v30, 2023) defines per-capita income as follows:\n>\"This is real GDP per capita (gross domestic product per person adjusted for inflation) \n>converted to international dollars using purchasing power parity rates. An international dollar \n>has the same purchasing power over GDP as the U.S. dollar has in the United States.\"\n", "licenses": [{"title": "Creative Commons Attribution 4.0 International", "path": "https://www.gapminder.org/free-material/"}], "sources": [{"title": "Gapminder Foundation", "path": "https://www.gapminder.org"}, {"title": "Gapminder GDP Per Capita Data", "path": "https://docs.google.com/spreadsheets/d/1i5AEui3WZNZqh7MQ4AKkJuCz4rRxGR_pw_9gtbcBOqQ/edit?gid=501532268#gid=501532268"}], "path": "gapminder-health-income.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:abce37a932917085023a345b1a004396e9355ac3", "bytes": 8605, "schema": {"fields": [{"name": "country", "type": "string"}, {"name": "income", "type": "integer"}, {"name": "health", "type": "number"}, {"name": "population", "type": "integer"}, {"name": "region", "type": "string"}]}}, {"name": "gapminder.json", "type": "table", "description": "This dataset combines key demographic indicators (life expectancy at birth, \npopulation, and fertility rate measured as babies per woman) for various countries from 1955 \nto 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable \ngrouping countries. Gapminder's data documentation notes that its philosophy is to fill data \ngaps with estimates and use current geographic boundaries for historical data. Gapminder \nstates that it aims to \"show people the big picture\" rather than support detailed numeric \nanalysis.\n\nNotes:\n1. Country Selection: The set of countries in this file matches the version of this dataset \n originally added to this collection in 2015. The specific criteria for country selection \n in that version are not known. Data for Aruba are no longer available in the new version. \n Hong Kong has been revised to Hong Kong, China in the new version.\n\n2. Data Precision: The precision of float values may have changed from the original version. \n These changes reflect the most recent source data used for each indicator.\n\n3. Regional Groupings: The 'cluster' column represents a regional mapping of countries \n corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. 
To \n preserve continuity with previous versions of this dataset, we have retained the column \n name 'cluster' instead of renaming it to 'six_regions'. The six regions represented are: \n `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`.", "sources": [{"title": "Gapminder Foundation - Life Expectancy (Data)", "path": "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundatio - Life Expectancy (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd004/"}, {"title": "Gapminder Foundation - Population (Data)", "path": "https://docs.google.com/spreadsheets/d/1c1luQNdpH90tNbMIeU7jD__59wQ0bdIGRFpbMm8ZBTk/edit?gid=176703676#gid=176703676", "version": "7"}, {"title": "Gapminder Foundation - Population (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd003/"}, {"title": "Gapminder Foundation - Fertility (Data)", "path": "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676", "version": "14"}, {"title": "Gapminder Foundation - Fertility Documentation (Documentation)", "path": "https://www.gapminder.org/data/documentation/gd008/"}, {"title": "Gapminder Foundation - Data Geographies (Data)", "path": "https://docs.google.com/spreadsheets/d/1qHalit8sXC0R8oVXibc2wa2gY7bkwGzOybEMTWp-08o/edit?gid=1597424158#gid=1597424158", "version": "2"}, {"title": "Gapminder Foundation - Data Geographies (Documentation)", "path": "https://www.gapminder.org/data/geo/"}, {"title": "Gapminder Data Documentation", "path": "https://www.gapminder.org/data/documentation/"}], "path": "gapminder.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:8cb2f0fc23ce612e5f0c7bbe3dcac57f6764b7b3", "bytes": 75201, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer", "description": "Years from 1955 to 2005 at 5-year intervals"}, {"name": "country", "type": "string", "description": "Name of the country"}, {"name": "cluster", "type": "integer", "description": "A categorical variable (values 0-5) grouping countries by region"}, {"name": "pop", "type": "integer", "description": "Population of the country"}, {"name": "life_expect", "type": "number", "description": "Life expectancy in years"}, {"name": "fertility", "type": "number", "description": "Fertility rate (average number of children per woman"}]}}, {"name": "gimp.png", "type": "file", "description": "Application icons from open-source software projects.", "path": "gimp.png", "scheme": "file", "format": "png", "mediatype": "image/png", "encoding": "utf-8", "hash": "sha1:cf0505dd72eb52558f6f71bd6f43663df4f2f82c", "bytes": 8211}, {"name": "github.csv", "type": "table", "description": "Generated using `/scripts/github.py`.", "path": "github.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:18547064dd687c328ea2fb5023cae6417ca6f050", "bytes": 21059, "schema": {"fields": [{"name": "time", "type": "string"}, {"name": "count", "type": "integer"}]}}, {"name": "global-temp.csv", "type": "table", "description": "Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023.", "sources": [{"title": "NASA Goddard Institute for Space Studies", "path": "https://data.giss.nasa.gov/gistemp/"}], "path": 
"global-temp.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:01a4f05ed45ce939307dcd9bc4e75ed5cd1ab202", "bytes": 1663, "schema": {"fields": [{"name": "year", "type": "integer"}, {"name": "temp", "type": "number"}]}}, {"name": "income.json", "type": "table", "path": "income.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:ebfd02fd584009ee391bfc5d33972e4c94f507ab", "bytes": 72771, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "region", "type": "string"}, {"name": "id", "type": "integer"}, {"name": "pct", "type": "number"}, {"name": "total", "type": "integer"}, {"name": "group", "type": "string"}]}}, {"name": "iowa-electricity.csv", "type": "table", "description": "The state of Iowa has dramatically increased its production of renewable \nwind power in recent years. This file contains the annual net generation of electricity in \nthe state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. \nIt is useful for illustrating stacked area charts.", "sources": [{"title": "U.S. Energy Information Administration", "path": "https://www.eia.gov/beta/electricity/data/browser/#/topic/0?agg=2,0,1&fuel=vvg&geo=00000g&sec=g&linechart=ELEC.GEN.OTH-IA-99.A~ELEC.GEN.COW-IA-99.A~ELEC.GEN.PEL-IA-99.A~ELEC.GEN.PC-IA-99.A~ELEC.GEN.NG-IA-99.A~~ELEC.GEN.NUC-IA-99.A~ELEC.GEN.HYC-IA-99.A~ELEC.GEN.AOR-IA-99.A~ELEC.GEN.HPS-IA-99.A~&columnchart=ELEC.GEN.ALL-IA-99.A&map=ELEC.GEN.ALL-IA-99.A&freq=A&start=2001&end=2017&ctype=linechart<ype=pin&tab=overview&maptype=0&rse=0&pin="}], "path": "iowa-electricity.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:214238f23d7a57e3398f4e9f1e87e61abb23cafc", "bytes": 1531, "schema": {"fields": [{"name": "year", "type": "date"}, {"name": "source", "type": "string"}, {"name": "net_generation", "type": "integer"}]}}, {"name": "jobs.json", "type": "table", "description": "U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790.\n\nOriginally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Vi\u00e9gas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). \nThe dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/).\n\nData is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of IPUMS USA, according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions).\n\nIPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, stating:\n>We're excited to hear that this dataset made its way to this repository and is being used by students for data visualization. 
We allow for these types of redistributions of summary data so long as the underlying microdata records are not shared.\n\nThis dataset contains only summary statistics and does not include any underlying microdata records.\n\n1. This dataset represents summary data. The underlying microdata records are not included.\n2. Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) \n(person weight) variable as an expansion factor when working with IPUMS USA extracts.\n3. Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly.\n\nWhen using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml).\nThe organization requests use of the following citation for this json file:\n\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0\n", "sources": [{"title": "IPUMS USA", "path": "https://usa.ipums.org/usa/", "version": "6.0"}], "path": "jobs.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:69d386f47305f4d8fd2886e805004fbdd71568e9", "bytes": 936649, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "job", "type": "string", "description": "The occupation title"}, {"name": "sex", "type": "string", "description": "Sex (men/women)"}, {"name": "year", "type": "integer", "description": "Census year"}, {"name": "count", "type": "integer", "description": "Number of individuals in the occupation"}, {"name": "perc", "type": "number", "description": "Percentage of the workforce in the occupation"}]}}, {"name": "la-riots.csv", "type": "table", "description": "More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles \nfor five days starting on April 29, 1992. This file contains metadata about each person, including the geographic \ncoordinates of their death. Compiled and published by the Los Angeles Times Data Desk.", "sources": [{"title": "LA Riots Deaths, Los Angeles Times Data Desk", "path": "http://spreadsheets.latimes.com/la-riots-deaths/"}], "path": "la-riots.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:94ee8ad8198d2954f77e3a98268d8b1f7fe7d086", "bytes": 7432, "schema": {"fields": [{"name": "first_name", "type": "string"}, {"name": "last_name", "type": "string"}, {"name": "age", "type": "integer"}, {"name": "gender", "type": "string"}, {"name": "race", "type": "string"}, {"name": "death_date", "type": "date"}, {"name": "address", "type": "string"}, {"name": "neighborhood", "type": "string"}, {"name": "type", "type": "string"}, {"name": "longitude", "type": "number"}, {"name": "latitude", "type": "number"}]}}, {"name": "londonboroughs.json", "type": "json", "description": "Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. 
\nOriginal data \"contains National Statistics data \u00a9 Crown copyright and database right (2015)\" \nand \"Contains Ordnance Survey data \u00a9 Crown copyright and database right [2015].", "sources": [{"title": "Statistical GIS Boundary Files, London Datastore", "path": "https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london"}], "path": "londonBoroughs.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "hash": "sha1:d90805055ffdfe5163a7655c4847dc61df45f92b", "bytes": 14732}, {"name": "londoncentroids.json", "type": "table", "description": "Calculated from `londonBoroughs.json` using [`d3.geoCentroid`](https://d3js.org/d3-geo/math#geoCentroid).", "path": "londonCentroids.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:2e24c01140cfbcad5e1c859be6df4efebca2fbf5", "bytes": 2339, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "cx", "type": "number"}, {"name": "cy", "type": "number"}]}}, {"name": "londontubelines.json", "type": "json", "description": "Selected rail lines simplified from source.", "sources": [{"title": "London Tube Data", "path": "https://github.com/oobrien/vis/tree/master/tube/data"}], "path": "londonTubeLines.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "hash": "sha1:1b21ea5339320090b106082bd9d39a1055aadb18", "bytes": 80097}, {"name": "lookup_groups.csv", "type": "table", "path": "lookup_groups.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:741df36729a9d84d18ec42f23a386b53e7e3c428", "bytes": 77, "schema": {"fields": [{"name": "group", "type": "integer"}, {"name": "person", "type": "string"}]}}, {"name": "lookup_people.csv", "type": "table", "path": "lookup_people.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:c79f69afb3ff81a0c8ddc01f5cf2f078e288457c", "bytes": 125, "schema": {"fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "integer"}, {"name": "height", "type": "integer"}]}}, {"name": "miserables.json", "type": "json", "path": "miserables.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:a8b0faaa94c7425c49fe36ea1a93319430fec426", "bytes": 12372}, {"name": "monarchs.json", "type": "table", "description": "A chronological list of English and British monarchs from Elizabeth I through George IV.\nEach entry includes:\n\nThe dataset contains two intentional inaccuracies to maintain compatibility with \nthe [Wheat and Wages](https://vega.github.io/vega/examples/wheat-and-wages/) example visualization:\n1. the start date for the reign of Elizabeth I is shown as 1565, instead of 1558;\n2. the end date for the reign of George IV is shown as 1820, instead of 1830.\nThese discrepancies align the `monarchs.json` dataset with the start and end dates of the `wheat.json` dataset used i the visualization.\nThe entry \"W&M\" represents the joint reign of William III and Mary II. While the dataset shows their reign as 1689-1702, \nthe official Web site of the British royal family indicates that Mary II's reign ended in 1694, though William III continued to rule until 1702.\nThe `commonwealth` field is used to flag the period from 1649 to 1660, which includes the Commonwealth of England, the Protectorate, \nand the period leading to the Restoration. 
While historically more accurate to call this the \"interregnum,\" the field name of `commonwealth` \nfrom the original dataset is retained for backwards compatibility.\nThe dataset was revised in Aug. 2024. James II's reign now ends in 1688 (previously 1689).\nSource data has been verified against the kings & queens and interregnum pages of the official website of the British royal family (retrieved in Aug. 2024).\nContent on the site is protected by Crown Copyright. \nUnder the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most \nCrown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/).", "sources": [{"title": "The Royal Family - Kings & Queens", "path": "https://www.royal.uk/kings-and-queens-1066"}, {"title": "The Royal Family - Interregnum", "path": "https://www.royal.uk/interregnum-1649-1660"}], "path": "monarchs.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:921dfa487a4198cfe78f743aa0aa87ad921642df", "bytes": 683, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "name", "type": "string", "description": "The ruler's name or identifier (e.g., \"W&M\" for William and Mary, \"Cromwell\" for the period of interregnum)"}, {"name": "start", "type": "integer", "description": "The year their rule began"}, {"name": "end", "type": "integer", "description": "The year their rule ended"}, {"name": "index", "type": "integer", "description": "A zero-based sequential number assigned to each entry, representing the chronological order of rulers"}]}}, {"name": "movies.json", "type": "table", "description": "The dataset has well known and intentionally included errors. 
\nThis dataset is provided for instructional purposes, including the need to reckon with dirty data.", "path": "movies.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:e38178f99454568c5160fc759184a1a1471cc558", "bytes": 1399981, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Title", "type": "string"}, {"name": "US Gross", "type": "integer"}, {"name": "Worldwide Gross", "type": "integer"}, {"name": "US DVD Sales", "type": "integer"}, {"name": "Production Budget", "type": "integer"}, {"name": "Release Date", "type": "string"}, {"name": "MPAA Rating", "type": "string"}, {"name": "Running Time min", "type": "integer"}, {"name": "Distributor", "type": "string"}, {"name": "Source", "type": "string"}, {"name": "Major Genre", "type": "string"}, {"name": "Creative Type", "type": "string"}, {"name": "Director", "type": "string"}, {"name": "Rotten Tomatoes Rating", "type": "integer"}, {"name": "IMDB Rating", "type": "number"}, {"name": "IMDB Votes", "type": "integer"}]}}, {"name": "normal-2d.json", "type": "table", "path": "normal-2d.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4303306ec275209fcba008cbd3a5f29c9e612424", "bytes": 34398, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "u", "type": "number"}, {"name": "v", "type": "number"}]}}, {"name": "obesity.json", "type": "table", "path": "obesity.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:6da8129ed0b0333c88302e153824b06f7859aac9", "bytes": 2202, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "id", "type": "integer"}, {"name": "rate", "type": "number"}, {"name": "state", "type": "string"}]}}, {"name": "ohlc.json", "type": "table", "description": "This dataset contains the performance of the Chicago Board Options Exchange \n[Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/%5EVIX#overview))\nin the summer of 2009.", "sources": [{"title": "Yahoo Finance VIX Data", "path": "https://finance.yahoo.com/chart/%5EVIX"}], "path": "ohlc.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:9b3d93e8479d3ddeee29b5e22909132346ac0a3b", "bytes": 5737, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "open", "type": "number"}, {"name": "high", "type": "number"}, {"name": "low", "type": "number"}, {"name": "close", "type": "number"}, {"name": "signal", "type": "string"}, {"name": "ret", "type": "number"}]}}, {"name": "penguins.json", "type": "table", "description": "Palmer Archipelago (Antarctica) penguin data collected and made available by \n[Dr. 
Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) \nand the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research \nNetwork](https://lternet.edu/).", "sources": [{"title": "Palmer Station Antarctica LTER", "path": "https://pal.lternet.edu/"}, {"title": "Allison Horst's Penguins Repository", "path": "https://github.com/allisonhorst/penguins"}], "path": "penguins.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:517b6d3267174b1b65691a37cbd59c1739155866", "bytes": 67119, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Species", "type": "string"}, {"name": "Island", "type": "string"}, {"name": "Beak Length (mm)", "type": "number"}, {"name": "Beak Depth (mm)", "type": "number"}, {"name": "Flipper Length (mm)", "type": "integer"}, {"name": "Body Mass (g)", "type": "integer"}, {"name": "Sex", "type": "string"}]}}, {"name": "platformer-terrain.json", "type": "table", "description": "Assets from the video game Celeste.", "sources": [{"title": "Celeste Game", "path": "http://www.celestegame.com/"}], "path": "platformer-terrain.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:01df4411cb16bf758fe8ffa6529507419189edc2", "bytes": 1424097, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "x", "type": "integer"}, {"name": "y", "type": "integer"}, {"name": "lumosity", "type": "number"}, {"name": "saturation", "type": "integer"}, {"name": "name", "type": "string"}, {"name": "id", "type": "string"}, {"name": "color", "type": "string"}, {"name": "key", "type": "string"}]}}, {"name": "points.json", "type": "table", "path": "points.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4716a117308962f3596179d7d7d2ad729a19cda7", "bytes": 4926, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "x", "type": "number"}, {"name": "y", "type": "number"}]}}, {"name": "political-contributions.json", "type": "table", "description": "Summary financial information on contributions to candidates for U.S. \nelections. An updated version of this dataset is available from the \"all candidates\" files \n(in pipe-delimited format) on the bulk data download page of the U.S. Federal Election \nCommission, or, alternatively, via OpenFEC. Information on each of the 25 columns is \navailable from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/).\nThe sample dataset in `political-contributions.json` contains 58 records with dates from 2015.\n\nFEC data is subject to the commission's:\n- [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/)\n- [Privacy and Security Policy](https://www.fec.gov/about/privacy-and-security-policy/)\n- [Acceptable Use Policy](https://github.com/fecgov/FEC/blob/master/ACCEPTABLE-USE-POLICY.md)\n\nAdditionally, the FEC's GitHub [repository](https://github.com/fecgov/FEC) states:\n> This project is in the public domain within the United States, and we waive worldwide \n> copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/)\n> dedication. Read more on our license page.\n> A few restrictions limit the way you can use FEC data. For example, you can't use \n> contributor lists for commercial purposes or to solicit donations. 
Learn more on \n> [FEC.gov](https://www.fec.gov/).", "sources": [{"title": "Federal Election Commission Bulk Data", "path": "https://www.fec.gov/data/browse-data/?tab=bulk-data"}, {"title": "OpenFEC API", "path": "https://api.open.fec.gov/developers/"}], "path": "political-contributions.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4aa2e19fa392cc9448aa8ffbdad15b014371f499", "bytes": 50265, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "Candidate_Identification", "type": "string"}, {"name": "Candidate_Name", "type": "string"}, {"name": "Incumbent_Challenger_Status", "type": "string"}, {"name": "Party_Code", "type": "integer"}, {"name": "Party_Affiliation", "type": "string"}, {"name": "Total_Receipts", "type": "number"}, {"name": "Transfers_from_Authorized_Committees", "type": "integer"}, {"name": "Total_Disbursements", "type": "number"}, {"name": "Transfers_to_Authorized_Committees", "type": "number"}, {"name": "Beginning_Cash", "type": "number"}, {"name": "Ending_Cash", "type": "number"}, {"name": "Contributions_from_Candidate", "type": "number"}, {"name": "Loans_from_Candidate", "type": "integer"}, {"name": "Other_Loans", "type": "integer"}, {"name": "Candidate_Loan_Repayments", "type": "number"}, {"name": "Other_Loan_Repayments", "type": "integer"}, {"name": "Debts_Owed_By", "type": "number"}, {"name": "Total_Individual_Contributions", "type": "integer"}, {"name": "Candidate_State", "type": "string"}, {"name": "Candidate_District", "type": "integer"}, {"name": "Contributions_from_Other_Political_Committees", "type": "integer"}, {"name": "Contributions_from_Party_Committees", "type": "integer"}, {"name": "Coverage_End_Date", "type": "string"}, {"name": "Refunds_to_Individuals", "type": "integer"}, {"name": "Refunds_to_Committees", "type": "integer"}]}}, {"name": "population.json", "type": "table", "description": "United States population statistics by sex and age group across decades between 1850 and 2000. \nThe dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census \nmicrodata\" from as early as 1790.\n\nIPUMS updates and revises datasets over time, which may result in discrepancies between this \ndataset and current IPUMS data. Details on data revisions are available here.\n\nWhen using this dataset, please refer to IPUMS USA terms of use. The organization requests the \nuse of the following citation for this json file:\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated \nPublic Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. \nhttp://doi.org/10.18128/D010.V6.0\n", "sources": [{"title": "IPUMS USA", "path": "https://usa.ipums.org/usa/"}], "path": "population.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:680fd336e777314198450721c31227a11f02411f", "bytes": 27665, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer", "description": "Four-digit year of the survey"}, {"name": "age", "type": "integer", "description": "Age group in 5-year intervals (0=0-4, 5=5-9, 10=10-14, ..., 90=90+)"}, {"name": "sex", "type": "integer", "description": "Sex (1=men, 2=women)"}, {"name": "people", "type": "integer", "description": "Number of individuals (IPUMS PERWT)"}]}}, {"name": "population_engineers_hurricanes.csv", "type": "table", "description": "Per-state data on population, number of engineers, and hurricanes. 
Used in Vega-Lite example,\n[Three Choropleths Representing Disjoint Data from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html)", "sources": [{"title": "Bureau of Labor Statistics", "path": "https://www.bls.gov/oes/tables.htm"}, {"title": "American Community Survey", "path": "https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_07_3YR_S1901&prodType=table"}, {"title": "NOAA National Climatic Data Center", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/records"}], "path": "population_engineers_hurricanes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:3bad66ef911b93c641edc21f2034302348bffaf9", "bytes": 1852, "schema": {"fields": [{"name": "state", "type": "string"}, {"name": "id", "type": "integer"}, {"name": "population", "type": "integer"}, {"name": "engineers", "type": "number"}, {"name": "hurricanes", "type": "integer"}]}}, {"name": "seattle-weather-hourly-normals.csv", "type": "table", "description": "Hourly weather normals with metric units. The 1981-2010 Climate Normals are \nNCDC's three-decade averages of climatological variables, including temperature and \nprecipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf).\nWe only included temperature, wind, and pressure \nand updated the format to be easier to parse.", "sources": [{"title": "NOAA National Climatic Data Center (NCDC)", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/normals"}], "path": "seattle-weather-hourly-normals.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:d55461adc9742bb061f6072b694aaf73e8b529db", "bytes": 311148, "schema": {"fields": [{"name": "date", "type": "datetime"}, {"name": "pressure", "type": "number"}, {"name": "temperature", "type": "number"}, {"name": "wind", "type": "number"}]}}, {"name": "seattle-weather.csv", "type": "table", "description": "Daily weather records with metric units. Transformed using `/scripts/weather.py`. \nThe categorical \"weather\" field is synthesized from multiple fields in the original dataset. 
\nThis data is intended for instructional purposes.", "sources": [{"title": "NOAA National Climatic Data Center", "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/records"}], "path": "seattle-weather.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0f38b53bdc1c42c5e5d484f33b9d4d7b229e0e59", "bytes": 48219, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "precipitation", "type": "number"}, {"name": "temp_max", "type": "number"}, {"name": "temp_min", "type": "number"}, {"name": "wind", "type": "number"}, {"name": "weather", "type": "string"}]}}, {"name": "sp500-2000.csv", "type": "table", "description": "S&P 500 index values from 2000 to 2020.", "sources": [{"title": "Yahoo Finance", "path": "https://finance.yahoo.com/quote/%5EDJI/history/"}], "path": "sp500-2000.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:b82f20656d0521801db7c5599a6c990415a8aaff", "bytes": 415968, "schema": {"fields": [{"name": "date", "type": "date"}, {"name": "open", "type": "number"}, {"name": "high", "type": "number"}, {"name": "low", "type": "number"}, {"name": "close", "type": "number"}, {"name": "adjclose", "type": "number"}, {"name": "volume", "type": "integer"}]}}, {"name": "sp500.csv", "type": "table", "path": "sp500.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0eb287fb7c207f4ed392821d67a92267180fc8cf", "bytes": 2305, "schema": {"fields": [{"name": "date", "type": "string"}, {"name": "price", "type": "number"}]}}, {"name": "stocks.csv", "type": "table", "path": "stocks.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:58e2ce1bed01eeebe29f5b4be32344aaec5532c0", "bytes": 12245, "schema": {"fields": [{"name": "symbol", "type": "string"}, {"name": "date", "type": "string"}, {"name": "price", "type": "number"}]}}, {"name": "udistrict.json", "type": "table", "path": "udistrict.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:65675107d81c19ffab260ac1f235f3e477fe8982", "bytes": 6460, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "key", "type": "string"}, {"name": "lat", "type": "number"}]}}, {"name": "unemployment-across-industries.json", "type": "table", "description": "Industry-level unemployment statistics from the Current Population Survey \n(CPS), published monthly by the U.S. Bureau of Labor Statistics. Includes unemployed persons \nand unemployment rate across 11 private industries, as well as agricultural, government, and \nself-employed workers. Covers January 2000 through February 2010. Industry classification \nfollows format of CPS Table A-31.\n\nThe dataset can be replicated using the BLS API. For more, see the `scripts` folder of this \nrepository.\n\nThe BLS Web site states:\n> \"Users of the public API should cite the date that data were accessed or retrieved using \n> the API. Users must clearly state that \"BLS.gov cannot vouch for the data or analyses \n> derived from these data after the data have been retrieved from BLS.gov.\" The BLS.gov logo \n> may not be used by persons who are not BLS employees or on products (including web pages) \n> that are not BLS-sponsored.\"\n\nSee full BLS [terms of service](https://www.bls.gov/developers/termsOfService.htm).", "sources": [{"title": "U.S. 
Census Bureau Current Population Survey", "path": "https://www.census.gov/programs-surveys/cps.html"}, {"title": "BLS LAUS Data Tools", "path": "https://www.bls.gov/lau/data.htm"}, {"title": "Bureau of Labor Statistics Table A-31", "path": "https://www.bls.gov/web/empsit/cpseea31.htm"}], "path": "unemployment-across-industries.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:4d769356c95c40a9807a7d048ab81aa56ae77df0", "bytes": 185641, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "series", "type": "string", "description": "Industry name"}, {"name": "year", "type": "integer", "description": "Year (2000-2010)"}, {"name": "month", "type": "integer", "description": "Month (1-12)"}, {"name": "count", "type": "integer", "description": "Number of unemployed persons (in thousands)"}, {"name": "rate", "type": "number", "description": "Unemployment rate (percentage)"}, {"name": "date", "type": "datetime", "description": "ISO 8601-formatted date string (e.g., \"2000-01-01T08:00:00.000Z\")"}]}}, {"name": "unemployment.tsv", "type": "table", "description": "This dataset contains county-level unemployment rates in the United States, with data generally\nconsistent with levels reported in 2009. The dataset is structured as tab-separated values.\nThe unemployment rate represents the number of unemployed persons as a percentage of the labor\nforce. According to the Bureau of Labor Statistics (BLS) glossary:\n\nUnemployed persons (Current Population Survey) [are] persons aged 16 years and older who had\nno employment during the reference week, were available for work, except for temporary\nillness, and had made specific efforts to find employment sometime during the 4-week period\nending with the reference week. Persons who were waiting to be recalled to a job from which\nthey had been laid off need not have been looking for work to be classified as unemployed.\n\nThis dataset is derived from the [Local Area Unemployment Statistics (LAUS)](https://www.bls.gov/lau/) program, \na federal-state cooperative effort overseen by the Bureau of Labor Statistics (BLS). \nThe LAUS program produces monthly and annual employment, unemployment, and labor force data for census regions and divisions,\nstates, counties, metropolitan areas, and many cities and towns.\n\nFor the most up-to-date LAUS data:\n1. **Monthly and Annual Data Downloads**:\n- Visit the [LAUS Data Tools](https://www.bls.gov/lau/data.htm) page for [monthly](https://www.bls.gov/lau/tables.htm#mcounty) \nand [annual](https://www.bls.gov/lau/tables.htm#cntyaa) county data.\n2. 
**BLS Public Data API**:\n- The BLS provides an API for developers to access various datasets, including LAUS data.\n- To use the API for LAUS data, refer to the [LAUS Series ID Formats](https://www.bls.gov/help/hlpforma.htm#LA) to construct your query.\n- API documentation and examples are available on the BLS Developers page.\n\nWhen using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm).", "sources": [{"title": "BLS Developers API", "path": "https://www.bls.gov/developers/"}, {"title": "BLS Handbook of Methods", "path": "https://www.bls.gov/opub/hom/lau/home.htm"}], "path": "unemployment.tsv", "scheme": "file", "format": "tsv", "mediatype": "text/tsv", "encoding": "utf-8", "hash": "sha1:d1aca19c4821fdc3b4270989661a1787d38588d0", "bytes": 34739, "dialect": {"csv": {"delimiter": "\t"}}, "schema": {"fields": [{"name": "id", "type": "integer", "description": "The combined state and county FIPS code"}, {"name": "rate", "type": "number", "description": "The unemployment rate for the county"}]}}, {"name": "uniform-2d.json", "type": "table", "path": "uniform-2d.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:c6120dd8887a0841a9fcc31e247463dbd3d0a996", "bytes": 34217, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "u", "type": "number"}, {"name": "v", "type": "number"}]}}, {"name": "us-10m.json", "type": "json", "path": "us-10m.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "hash": "sha1:ff7a7e679c46f2d1eb85cc92521b990f1a7a5c7a", "bytes": 642361}, {"name": "us-employment.csv", "type": "table", "description": "In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job \nlosses across the United States. The downturn in employment, and the slow recovery in hiring that \nfollowed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau \nof Labor Statistics.\n\nThis file contains the monthly employment total in a variety of job categories from January 2006 \nthrough December 2015. The numbers are seasonally adjusted and reported in thousands. The data \nwere downloaded on Nov. 11, 2018, and reformatted for use in this library.\n\nTotals are included for the [22 \"supersectors\"](https://download.bls.gov/pub/time.series/ce/ce.supersector)\ntracked by the BLS. The \"nonfarm\" total is the category typically used by \neconomists and journalists as a stand-in for the country's employment total.\n\nA calculated \"nonfarm_change\" column has been appended with the month-to-month change in that \nsupersector's employment. It is useful for illustrating how to make bar charts that report both \nnegative and positive values.\n", "sources": [{"title": "U.S. 
Bureau of Labor Statistics Current Employment Statistics", "path": "https://www.bls.gov/ces/"}], "path": "us-employment.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:8795be57cf1e004f4ecba44cab2b324a074330df", "bytes": 17841, "schema": {"fields": [{"name": "month", "type": "date"}, {"name": "nonfarm", "type": "integer"}, {"name": "private", "type": "integer"}, {"name": "goods_producing", "type": "integer"}, {"name": "service_providing", "type": "integer"}, {"name": "private_service_providing", "type": "integer"}, {"name": "mining_and_logging", "type": "integer"}, {"name": "construction", "type": "integer"}, {"name": "manufacturing", "type": "integer"}, {"name": "durable_goods", "type": "integer"}, {"name": "nondurable_goods", "type": "integer"}, {"name": "trade_transportation_utilties", "type": "integer"}, {"name": "wholesale_trade", "type": "number"}, {"name": "retail_trade", "type": "number"}, {"name": "transportation_and_warehousing", "type": "number"}, {"name": "utilities", "type": "number"}, {"name": "information", "type": "integer"}, {"name": "financial_activities", "type": "integer"}, {"name": "professional_and_business_services", "type": "integer"}, {"name": "education_and_health_services", "type": "integer"}, {"name": "leisure_and_hospitality", "type": "integer"}, {"name": "other_services", "type": "integer"}, {"name": "government", "type": "integer"}, {"name": "nonfarm_change", "type": "integer"}]}}, {"name": "us-state-capitals.json", "type": "table", "path": "us-state-capitals.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:9c3211c5058c899412c30f5992a77c54a1b80066", "bytes": 3869, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "lon", "type": "number"}, {"name": "lat", "type": "number"}, {"name": "state", "type": "string"}, {"name": "city", "type": "string"}]}}, {"name": "volcano.json", "type": "json", "description": "Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. \nThis data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a \ntopographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate.", "sources": [{"title": "R Datasets", "path": "https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.html"}], "path": "volcano.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:841151dbfbc5f6db3e19904557abd7a7aad0efd2", "bytes": 21167}, {"name": "weather.csv", "type": "table", "description": "NOAA data transformed using `/scripts/weather.py`. Categorical \"weather\" field synthesized \nfrom multiple fields in the original dataset. 
This data is intended for instructional purposes.", "sources": [{"title": "NOAA Climate Data Online", "path": "http://www.ncdc.noaa.gov/cdo-web/datatools/findstation"}], "path": "weather.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:0e7e853f4c5b67615da261d5d343824a43510f50", "bytes": 121417, "schema": {"fields": [{"name": "location", "type": "string"}, {"name": "date", "type": "date"}, {"name": "precipitation", "type": "number"}, {"name": "temp_max", "type": "number"}, {"name": "temp_min", "type": "number"}, {"name": "wind", "type": "number"}, {"name": "weather", "type": "string"}]}}, {"name": "weekly-weather.json", "type": "json", "description": "Instructional dataset showing actual and predicted temperature data.\n\n> [!IMPORTANT]\n> Named `weather.json` in previous versions (`v1.4.0` - `v2.11.0`).\n", "path": "weekly-weather.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:bd42a3e2403e7ccd6baaa89f93e7f0c164e0c185", "bytes": 1281}, {"name": "wheat.json", "type": "table", "description": "In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair),\na Scottish engineer who is often credited as the founder of statistical graphics, \npublished an elegant chart on the price of wheat. It plots 250 years of prices alongside \nweekly wages and the reigning monarch. He intended to demonstrate that:\n> \"never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time.\"\n", "sources": [{"title": "1822 Playfair Chart", "path": "http://dh101.humanities.ucla.edu/wp-content/uploads/2014/08/Vis_2.jpg"}], "path": "wheat.json", "scheme": "file", "format": "json", "mediatype": "text/json", "encoding": "utf-8", "hash": "sha1:cde46b43fc82f4c3c2a37ddcfe99fd5f4d8d8791", "bytes": 2085, "dialect": {"json": {"keyed": true}}, "schema": {"fields": [{"name": "year", "type": "integer"}, {"name": "wheat", "type": "number"}, {"name": "wages", "type": "number"}]}}, {"name": "windvectors.csv", "type": "table", "description": "Simulated wind patterns over northwestern Europe.", "path": "windvectors.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:ed686b0ba613abd59d09fcd946b5030a918b8154", "bytes": 129253, "schema": {"fields": [{"name": "longitude", "type": "number"}, {"name": "latitude", "type": "number"}, {"name": "dir", "type": "integer"}, {"name": "dirCat", "type": "integer"}, {"name": "speed", "type": "number"}]}}, {"name": "world-110m.json", "type": "json", "path": "world-110m.json", "scheme": "file", "format": "topojson", "mediatype": "text/topojson", "encoding": "utf-8", "hash": "sha1:a1ce852de6f2713c94c0c284039506ca2d4f3dee", "bytes": 119410}, {"name": "zipcodes.csv", "type": "table", "description": "GeoNames.org", "sources": [{"title": "GeoNames", "path": "https://www.geonames.org"}], "path": "zipcodes.csv", "scheme": "file", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", "hash": "sha1:d3df33e12be0d0544c95f1bd47005add4b7010be", "bytes": 2018388, "schema": {"fields": [{"name": "zip_code", "type": "integer"}, {"name": "latitude", "type": "number"}, {"name": "longitude", "type": "number"}, {"name": "city", "type": "string"}, {"name": "state", "type": "string"}, {"name": "county", "type": "string"}]}}]} \ No newline at end of file diff --git a/tools/datasets/models.py b/tools/datasets/models.py index f88a0b842..ee1af8953 100644 --- 
a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -20,7 +20,6 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias - import polars as pl from altair.datasets._typing import Dataset, FlFieldStr @@ -117,10 +116,3 @@ class Package(TypedDict): sources: Sequence[Source] created: str resources: Sequence[Resource] - - -class ParsedPackage(TypedDict): - """Minimal representations to write to disk.""" - - features: pl.DataFrame - schemas: Mapping[Dataset, Mapping[str, FlFieldStr]] diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index 40116cb05..a10e13a64 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -5,6 +5,7 @@ import urllib.request from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple +from urllib.request import Request from tools.datasets import datapackage @@ -22,7 +23,6 @@ from typing_extensions import TypeAlias from tools.datasets import PathMap from tools.datasets.datapackage import DataPackage - from tools.datasets.models import Package BranchOrTag: TypeAlias = 'Literal["main"] | LiteralString' @@ -54,30 +54,25 @@ def __init__( GH=f"https://cdn.{jsdelivr}.net/gh/vega/{package}@", ) - def dataset_base_url(self, version: BranchOrTag, /) -> LiteralString: - """ - Common url prefix for all datasets derived from ``version``. + def _prefix(self, version: BranchOrTag, /) -> LiteralString: + return f"{self.url.GH if is_branch(version) else self.url.CDN}{version}/" - Notes - ----- - - Encodes the endpoint at this stage - - Use github if its the only option (since its slower otherwise) - - npm only has releases/tags (not branches) - """ - return f"{self.url.GH if is_branch(version) else self.url.CDN}{version}/data/" + def dataset_base_url(self, version: BranchOrTag, /) -> LiteralString: + """Common url prefix for all datasets derived from ``version``.""" + return f"{self._prefix(version)}data/" @property def url(self) -> NpmUrl: return self._url - def file_gh( + def file( self, branch_or_tag: BranchOrTag, path: str, /, ) -> Any: """ - Request a file from the `jsdelivr GitHub`_ endpoint. + Request a file from `jsdelivr` `npm`_ or `GitHub`_ endpoints. Parameters ---------- @@ -86,7 +81,9 @@ def file_gh( path Relative filepath from the root of the repo. - .. _jsdelivr GitHub: + .. _npm: + https://www.jsdelivr.com/documentation#id-npm + .. _GitHub: https://www.jsdelivr.com/documentation#id-github .. 
_branches: https://github.com/vega/vega-datasets/branches @@ -100,20 +97,15 @@ def file_gh( read_fn = json.load else: raise NotImplementedError(path, suffix) - req = urllib.request.Request( - f"{self.url.GH}{branch_or_tag}/{path}", headers=headers - ) + req = Request(f"{self._prefix(branch_or_tag)}{path}", headers=headers) with self._opener.open(req) as response: return read_fn(response) - def datapackage(self, *, tag: LiteralString, frozen: bool = False) -> DataPackage: - pkg: Package = ( - json.loads(self.paths["datapackage"].read_text("utf-8")) - if frozen - else self.file_gh(tag, "datapackage.json") - ) + def datapackage(self, *, tag: LiteralString) -> DataPackage: return datapackage.DataPackage( - pkg, self.dataset_base_url(tag), self.paths["metadata"] + self.file(tag, "datapackage.json"), + self.dataset_base_url(tag), + self.paths["metadata"], ) diff --git a/tools/generate_schema_wrapper.py b/tools/generate_schema_wrapper.py index 4ccb3f670..92c6f101d 100644 --- a/tools/generate_schema_wrapper.py +++ b/tools/generate_schema_wrapper.py @@ -1405,7 +1405,7 @@ def main() -> None: copy_schemapi_util() vegalite_main(args.skip_download) write_expr_module(VERSIONS.vlc_vega, output=EXPR_FILE, header=HEADER_COMMENT) - datasets.app.refresh(VERSIONS["vega-datasets"], include_typing=True, frozen=True) + datasets.app.refresh(VERSIONS["vega-datasets"], include_typing=True) # The modules below are imported after the generation of the new schema files # as these modules import Altair. This allows them to use the new changes From a776e2fd5a74dd21917b884e2b418ade18773895 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 10 Feb 2025 13:11:54 +0000 Subject: [PATCH 200/201] refactor: replace `SchemaCache.schema_pyarrow` -> `nw.Schema.to_arrow` Related - https://github.com/narwhals-dev/narwhals/pull/1924 - https://github.com/vega/altair/pull/3631#discussion_r1937953187 --- altair/datasets/_cache.py | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index eb22cc36e..5459f0b16 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -264,19 +264,22 @@ def is_active(self) -> bool: nw.Implementation.PYARROW, } + def schema(self, name: _Dataset, /) -> nw.Schema: + it = ((col, _FIELD_TO_DTYPE[tp_str]()) for col, tp_str in self[name].items()) + return nw.Schema(it) + def schema_kwds(self, meta: Metadata, /) -> dict[str, Any]: name: Any = meta["dataset_name"] - impl = self._implementation - if (impl.is_pandas_like() or impl.is_pyarrow()) and (self[name]): + if self.is_active() and (self[name]): suffix = meta["suffix"] - if impl.is_pandas_like(): + if self._implementation.is_pandas_like(): if cols := self.by_dtype(name, nw.Date, nw.Datetime): if suffix == ".json": return {"convert_dates": cols} elif suffix in {".csv", ".tsv"}: return {"parse_dates": cols} else: - schema = self.schema_pyarrow(name) + schema = self.schema(name).to_arrow() if suffix in {".csv", ".tsv"}: from pyarrow.csv import ConvertOptions @@ -286,23 +289,6 @@ def schema_kwds(self, meta: Metadata, /) -> dict[str, Any]: return {} - def schema(self, name: _Dataset, /) -> Mapping[str, DType]: - return { - column: _FIELD_TO_DTYPE[tp_str]() for column, tp_str in self[name].items() - } - - # TODO: Open an issue in ``narwhals`` to try and get a public api for type conversion - def schema_pyarrow(self, name: _Dataset, /): - schema = self.schema(name) - if schema: - from 
narwhals._arrow.utils import narwhals_to_native_dtype - from narwhals.utils import Version - - m = {k: narwhals_to_native_dtype(v, Version.V1) for k, v in schema.items()} - else: - m = {} - return nw.dependencies.get_pyarrow().schema(m) - class _SupportsScanMetadata(Protocol): _opener: ClassVar[OpenerDirector] From ddda22c50f7265728dcec26afec02d6d0dbda189 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 10 Feb 2025 14:10:51 +0000 Subject: [PATCH 201/201] feat(typing): Properly annotate `dataset_name`, `suffix` Makes more sense following (755ab4f560af13f9268e905cf70783c34b30b1d7) --- altair/datasets/_typing.py | 9 +++++++-- tools/datasets/__init__.py | 1 + tools/datasets/datapackage.py | 24 ++++++++++++++++++++---- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index 3357ddf3b..a60f38687 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -11,6 +11,11 @@ else: from typing_extensions import TypedDict +if sys.version_info >= (3, 11): + from typing import LiteralString +else: + from typing_extensions import LiteralString + if sys.version_info >= (3, 10): from typing import TypeAlias else: @@ -181,8 +186,8 @@ class Metadata(TypedDict, total=False): ``` """ - dataset_name: str - suffix: str + dataset_name: Dataset | LiteralString + suffix: Extension file_name: str bytes: int is_image: bool diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index a7c1d06c4..a41392d9d 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -184,6 +184,7 @@ def generate_typing(self, dpkg: datapackage.DataPackage) -> None: "import sys", "from typing import Literal, TYPE_CHECKING", utils.import_typing_extensions((3, 14), "TypedDict"), + utils.import_typing_extensions((3, 11), "LiteralString"), utils.import_typing_extensions((3, 10), "TypeAlias"), "\n", f"__all__ = {[NAME, EXT, dpkg._NAME_TYPED_DICT]}\n", diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index ec707c0da..9ea6a8c8d 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -31,10 +31,13 @@ class Column: - def __init__(self, name: str, expr: pl.Expr, /, doc: str = "_description_") -> None: + def __init__( + self, name: str, expr: pl.Expr, /, doc: str = "_description_", tp_str: str = "" + ) -> None: self._name: str = name self._expr: pl.Expr = expr self._doc: str = doc + self._tp_str: str = tp_str @property def expr(self) -> pl.Expr: @@ -161,7 +164,10 @@ def _metadata_examples(self) -> str: @property def _metadata_td_args(self) -> str: schema = self.core.collect_schema().to_python() - return f"\n{INDENT}".join(f"{p}: {tp.__name__}" for p, tp in schema.items()) + return f"\n{INDENT}".join( + f"{column._name}: {column._tp_str or tp.__name__}" + for column, tp in zip(self.columns, schema.values()) + ) @property def _url(self) -> Column: @@ -237,8 +243,18 @@ def note(s: str, /) -> str: fmt = col("format") DataPackage.with_columns( - Column("dataset_name", path_stem("path"), "Name of the dataset/`Path.stem`_."), - Column("suffix", path_suffix("path"), "File extension/`Path.suffix`_."), + Column( + "dataset_name", + path_stem("path"), + "Name of the dataset/`Path.stem`_.", + tp_str="Dataset | LiteralString", + ), + Column( + "suffix", + path_suffix("path"), + "File extension/`Path.suffix`_.", + tp_str="Extension", + ), Column("file_name", col("path"), "Equivalent to `Path.name`_."), Column("bytes", col("bytes"), "File size in 
*bytes*."), Column("is_image", fmt == "png", "Only accessible via url."),