diff --git a/altair/__init__.py b/altair/__init__.py index c2d05e464..d75de0105 100644 --- a/altair/__init__.py +++ b/altair/__init__.py @@ -603,6 +603,7 @@ "core", "data", "data_transformers", + "datasets", "datum", "default_data_transformer", "display", @@ -651,7 +652,7 @@ def __dir__(): from altair.jupyter import JupyterChart from altair.expr import expr from altair.utils import AltairDeprecationWarning, parse_shorthand, Undefined -from altair import typing, theme +from altair import datasets, theme, typing def load_ipython_extension(ipython): diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py new file mode 100644 index 000000000..efdd85c3c --- /dev/null +++ b/altair/datasets/__init__.py @@ -0,0 +1,151 @@ +""" +Load example datasets *remotely* from `vega-datasets`_. + +Provides **70+** datasets, used throughout our `Example Gallery`_. + +You can learn more about each dataset at `datapackage.md`_. + +Examples +-------- +Load a dataset as a ``DataFrame``/``Table``:: + + from altair.datasets import load + + load("cars") + +.. note:: + Requires installation of either `polars`_, `pandas`_, or `pyarrow`_. + +Get the remote address of a dataset and use directly in a :class:`altair.Chart`:: + + import altair as alt + from altair.datasets import url + + source = url("co2-concentration") + alt.Chart(source).mark_line(tooltip=True).encode(x="Date:T", y="CO2:Q") + +.. note:: + Works without any additional dependencies. + +For greater control over the backend library use:: + + from altair.datasets import Loader + + load = Loader.from_backend("polars") + load("penguins") + load.url("penguins") + +This method also provides *precise* Tab completions on the returned object:: + + load("cars"). + # bottom_k + # drop + # drop_in_place + # drop_nans + # dtypes + # ... + +.. _vega-datasets: + https://github.com/vega/vega-datasets +.. _Example Gallery: + https://altair-viz.github.io/gallery/index.html#example-gallery +.. _datapackage.md: + https://github.com/vega/vega-datasets/blob/main/datapackage.md +.. _polars: + https://docs.pola.rs/user-guide/installation/ +.. _pandas: + https://pandas.pydata.org/docs/getting_started/install.html +.. _pyarrow: + https://arrow.apache.org/docs/python/install.html +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from altair.datasets._loader import Loader + +if TYPE_CHECKING: + import sys + from typing import Any + + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + + from altair.datasets._loader import _Load + from altair.datasets._typing import Dataset, Extension + + +__all__ = ["Loader", "load", "url"] + + +load: _Load[Any, Any] +""" +Get a remote dataset and load as tabular data. + +For full Tab completions, instead use:: + + from altair.datasets import Loader + load = Loader.from_backend("polars") + cars = load("cars") + movies = load("movies") + +Alternatively, specify ``backend`` during a call:: + + from altair.datasets import load + cars = load("cars", backend="polars") + movies = load("movies", backend="polars") +""" + + +def url( + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, +) -> str: + """ + Return the address of a remote dataset. + + Parameters + ---------- + name + Name of the dataset/`Path.stem`_. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + + Returns + ------- + ``str`` + + .. 
_Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + """ + from altair.datasets._exceptions import AltairDatasetsError + + try: + from altair.datasets._loader import load + + url = load.url(name, suffix) + except AltairDatasetsError: + from altair.datasets._cache import csv_cache + + url = csv_cache.url(name) + + return url + + +def __getattr__(name): + if name == "load": + from altair.datasets._loader import load + + return load + else: + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py new file mode 100644 index 000000000..eb22cc36e --- /dev/null +++ b/altair/datasets/_cache.py @@ -0,0 +1,471 @@ +from __future__ import annotations + +import os +import sys +from collections import defaultdict +from importlib.util import find_spec +from pathlib import Path +from typing import TYPE_CHECKING, ClassVar, TypeVar, cast + +import narwhals.stable.v1 as nw + +from altair.datasets._exceptions import AltairDatasetsError + +if sys.version_info >= (3, 12): + from typing import Protocol +else: + from typing_extensions import Protocol + +if TYPE_CHECKING: + from collections.abc import ( + Iterable, + Iterator, + Mapping, + MutableMapping, + MutableSequence, + Sequence, + ) + from io import IOBase + from typing import Any, Final + from urllib.request import OpenerDirector + + from _typeshed import StrPath + from narwhals.stable.v1.dtypes import DType + from narwhals.stable.v1.typing import IntoExpr + + from altair.datasets._typing import Dataset, Metadata + + if sys.version_info >= (3, 12): + from typing import Unpack + else: + from typing_extensions import Unpack + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + from altair.datasets._typing import FlFieldStr + from altair.vegalite.v5.schema._typing import OneOrSeq + + _Dataset: TypeAlias = "Dataset | LiteralString" + _FlSchema: TypeAlias = Mapping[str, FlFieldStr] + +__all__ = ["CsvCache", "DatasetCache", "SchemaCache", "csv_cache"] + + +_KT = TypeVar("_KT") +_VT = TypeVar("_VT") +_T = TypeVar("_T") + +_METADATA_DIR: Final[Path] = Path(__file__).parent / "_metadata" + +_DTYPE_TO_FIELD: Mapping[type[DType], FlFieldStr] = { + nw.Int64: "integer", + nw.Float64: "number", + nw.Boolean: "boolean", + nw.String: "string", + nw.Struct: "object", + nw.List: "array", + nw.Date: "date", + nw.Datetime: "datetime", + nw.Duration: "duration", + # nw.Time: "time" (Not Implemented, but we don't have any cases using it anyway) +} +""" +Similar to `pl.datatypes.convert.dtype_to_ffiname`_. + +But using `narwhals.dtypes`_ to the string repr of ``frictionless`` `Field Types`_. + +.. _pl.datatypes.convert.dtype_to_ffiname: + https://github.com/pola-rs/polars/blob/85d078c066860e012f5e7e611558e6382b811b82/py-polars/polars/datatypes/convert.py#L139-L165 +.. _Field Types: + https://datapackage.org/standard/table-schema/#field-types +.. _narwhals.dtypes: + https://narwhals-dev.github.io/narwhals/api-reference/dtypes/ +""" + +_FIELD_TO_DTYPE: Mapping[FlFieldStr, type[DType]] = { + v: k for k, v in _DTYPE_TO_FIELD.items() +} + + +def _iter_metadata(df: nw.DataFrame[Any], /) -> Iterator[Metadata]: + """ + Yield rows from ``df``, where each represents a dataset. 
+ + See Also + -------- + ``altair.datasets._typing.Metadata`` + """ + yield from cast("Iterator[Metadata]", df.iter_rows(named=True)) + + +class CompressedCache(Protocol[_KT, _VT]): + fp: Path + _mapping: MutableMapping[_KT, _VT] + + def read(self) -> Any: ... + def __getitem__(self, key: _KT, /) -> _VT: ... + + def __enter__(self) -> IOBase: + import gzip + + return gzip.open(self.fp, mode="rb").__enter__() + + def __exit__(self, *args) -> None: + return + + def get(self, key: _KT, default: _T, /) -> _VT | _T: + return self.mapping.get(key, default) + + @property + def mapping(self) -> MutableMapping[_KT, _VT]: + if not self._mapping: + self._mapping.update(self.read()) + return self._mapping + + +class CsvCache(CompressedCache["_Dataset", "Metadata"]): + """ + `csv`_, `gzip`_ -based, lazy metadata lookup. + + Used as a fallback for 2 scenarios: + + 1. ``url(...)`` when no optional dependencies are installed. + 2. ``(Loader|load)(...)`` when the backend is missing* ``.parquet`` support. + + Notes + ----- + *All backends *can* support ``.parquet``, but ``pandas`` requires an optional dependency. + + .. _csv: + https://docs.python.org/3/library/csv.html + .. _gzip: + https://docs.python.org/3/library/gzip.html + """ + + fp = _METADATA_DIR / "metadata.csv.gz" + + def __init__( + self, + *, + tp: type[MutableMapping[_Dataset, Metadata]] = dict["_Dataset", "Metadata"], + ) -> None: + self._mapping: MutableMapping[_Dataset, Metadata] = tp() + self._rotated: MutableMapping[str, MutableSequence[Any]] = defaultdict(list) + + def read(self) -> Any: + import csv + + with self as f: + b_lines = f.readlines() + reader = csv.reader((bs.decode() for bs in b_lines), dialect=csv.unix_dialect) + header = tuple(next(reader)) + return {row[0]: dict(self._convert_row(header, row)) for row in reader} + + def _convert_row( + self, header: Iterable[str], row: Iterable[str], / + ) -> Iterator[tuple[str, Any]]: + map_tf = {"true": True, "false": False} + for col, value in zip(header, row): + if col.startswith(("is_", "has_")): + yield col, map_tf[value] + elif col == "bytes": + yield col, int(value) + else: + yield col, value + + @property + def rotated(self) -> Mapping[str, Sequence[Any]]: + """Columnar view.""" + if not self._rotated: + for record in self.mapping.values(): + for k, v in record.items(): + self._rotated[k].append(v) + return self._rotated + + def __getitem__(self, key: _Dataset, /) -> Metadata: + if meta := self.get(key, None): + return meta + msg = f"{key!r} does not refer to a known dataset." + raise TypeError(msg) + + def url(self, name: _Dataset, /) -> str: + meta = self[name] + if meta["suffix"] == ".parquet" and not find_spec("vegafusion"): + raise AltairDatasetsError.from_url(meta) + return meta["url"] + + def __repr__(self) -> str: + return f"<{type(self).__name__}: {'COLLECTED' if self._mapping else 'READY'}>" + + +class SchemaCache(CompressedCache["_Dataset", "_FlSchema"]): + """ + `json`_, `gzip`_ -based, lazy schema lookup. + + - Primarily benefits ``pandas``, which needs some help identifying **temporal** columns. + - Utilizes `data package`_ schema types. + - All methods return falsy containers instead of exceptions + + .. _json: + https://docs.python.org/3/library/json.html + .. _gzip: + https://docs.python.org/3/library/gzip.html + .. 
_data package:
+        https://github.com/vega/vega-datasets/pull/631
+    """
+
+    fp = _METADATA_DIR / "schemas.json.gz"
+
+    def __init__(
+        self,
+        *,
+        tp: type[MutableMapping[_Dataset, _FlSchema]] = dict["_Dataset", "_FlSchema"],
+        implementation: nw.Implementation = nw.Implementation.UNKNOWN,
+    ) -> None:
+        self._mapping: MutableMapping[_Dataset, _FlSchema] = tp()
+        self._implementation: nw.Implementation = implementation
+
+    def read(self) -> Any:
+        import json
+
+        with self as f:
+            return json.load(f)
+
+    def __getitem__(self, key: _Dataset, /) -> _FlSchema:
+        return self.get(key, {})
+
+    def by_dtype(self, name: _Dataset, *dtypes: type[DType]) -> list[str]:
+        """
+        Return column names specified in ``name``'s schema.
+
+        Parameters
+        ----------
+        name
+            Dataset name.
+        *dtypes
+            Optionally, only return columns matching the given data type(s).
+        """
+        if (match := self[name]) and dtypes:
+            include = {_DTYPE_TO_FIELD[tp] for tp in dtypes}
+            return [col for col, tp_str in match.items() if tp_str in include]
+        else:
+            return list(match)
+
+    def is_active(self) -> bool:
+        return self._implementation in {
+            nw.Implementation.PANDAS,
+            nw.Implementation.PYARROW,
+            nw.Implementation.MODIN,
+        }
+
+    def schema_kwds(self, meta: Metadata, /) -> dict[str, Any]:
+        name: Any = meta["dataset_name"]
+        impl = self._implementation
+        if (impl.is_pandas_like() or impl.is_pyarrow()) and (self[name]):
+            suffix = meta["suffix"]
+            if impl.is_pandas_like():
+                if cols := self.by_dtype(name, nw.Date, nw.Datetime):
+                    if suffix == ".json":
+                        return {"convert_dates": cols}
+                    elif suffix in {".csv", ".tsv"}:
+                        return {"parse_dates": cols}
+            else:
+                schema = self.schema_pyarrow(name)
+                if suffix in {".csv", ".tsv"}:
+                    from pyarrow.csv import ConvertOptions
+
+                    return {"convert_options": ConvertOptions(column_types=schema)}  # pyright: ignore[reportCallIssue]
+                elif suffix == ".parquet":
+                    return {"schema": schema}
+
+        return {}
+
+    def schema(self, name: _Dataset, /) -> Mapping[str, DType]:
+        return {
+            column: _FIELD_TO_DTYPE[tp_str]() for column, tp_str in self[name].items()
+        }
+
+    # TODO: Open an issue in ``narwhals`` to try and get a public api for type conversion
+    def schema_pyarrow(self, name: _Dataset, /):
+        schema = self.schema(name)
+        if schema:
+            from narwhals._arrow.utils import narwhals_to_native_dtype
+            from narwhals.utils import Version
+
+            m = {k: narwhals_to_native_dtype(v, Version.V1) for k, v in schema.items()}
+        else:
+            m = {}
+        return nw.dependencies.get_pyarrow().schema(m)
+
+
+class _SupportsScanMetadata(Protocol):
+    _opener: ClassVar[OpenerDirector]
+
+    def _scan_metadata(
+        self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata]
+    ) -> nw.LazyFrame[Any]: ...
+ + +class DatasetCache: + """Opt-out caching of remote dataset requests.""" + + _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" + _XDG_CACHE: ClassVar[Path] = ( + Path(os.environ.get("XDG_CACHE_HOME", Path.home() / ".cache")) / "altair" + ).resolve() + + def __init__(self, reader: _SupportsScanMetadata, /) -> None: + self._rd: _SupportsScanMetadata = reader + + def clear(self) -> None: + """Delete all previously cached datasets.""" + self._ensure_active() + if self.is_empty(): + return None + ser = ( + self._rd._scan_metadata() + .select("sha", "suffix") + .unique("sha") + .select(nw.concat_str("sha", "suffix").alias("sha_suffix")) + .collect() + .get_column("sha_suffix") + ) + names = set[str](ser.to_list()) + for fp in self: + if fp.name in names: + fp.unlink() + + def download_all(self) -> None: + """ + Download any missing datasets for latest version. + + Requires **30-50MB** of disk-space. + """ + stems = tuple(fp.stem for fp in self) + predicates = (~(nw.col("sha").is_in(stems)),) if stems else () + frame = ( + self._rd._scan_metadata(*predicates, is_image=False) + .select("sha", "suffix", "url") + .unique("sha") + .collect() + ) + if frame.is_empty(): + print("Already downloaded all datasets") + return None + print(f"Downloading {len(frame)} missing datasets...") + for meta in _iter_metadata(frame): + self._download_one(meta["url"], self.path_meta(meta)) + print("Finished downloads") + return None + + def _maybe_download(self, meta: Metadata, /) -> Path: + fp = self.path_meta(meta) + return ( + fp + if (fp.exists() and fp.stat().st_size) + else self._download_one(meta["url"], fp) + ) + + def _download_one(self, url: str, fp: Path, /) -> Path: + with self._rd._opener.open(url) as f: + fp.touch() + fp.write_bytes(f.read()) + return fp + + @property + def path(self) -> Path: + """ + Returns path to datasets cache. + + Defaults to (`XDG_CACHE_HOME`_):: + + "$XDG_CACHE_HOME/altair/" + + But can be configured using the environment variable:: + + "$ALTAIR_DATASETS_DIR" + + You can set this for the current session via:: + + from pathlib import Path + from altair.datasets import load + + load.cache.path = Path.home() / ".altair_cache" + + load.cache.path.relative_to(Path.home()).as_posix() + ".altair_cache" + + You can *later* disable caching via:: + + load.cache.path = None + + .. 
_XDG_CACHE_HOME: + https://specifications.freedesktop.org/basedir-spec/latest/#variables + """ + self._ensure_active() + fp = Path(usr) if (usr := os.environ.get(self._ENV_VAR)) else self._XDG_CACHE + fp.mkdir(parents=True, exist_ok=True) + return fp + + @path.setter + def path(self, source: StrPath | None, /) -> None: + if source is not None: + os.environ[self._ENV_VAR] = str(Path(source).resolve()) + else: + os.environ[self._ENV_VAR] = "" + + def path_meta(self, meta: Metadata, /) -> Path: + return self.path / (meta["sha"] + meta["suffix"]) + + def __iter__(self) -> Iterator[Path]: + yield from self.path.iterdir() + + def __repr__(self) -> str: + name = type(self).__name__ + if self.is_not_active(): + return f"{name}" + else: + return f"{name}<{self.path.as_posix()!r}>" + + def is_active(self) -> bool: + return not self.is_not_active() + + def is_not_active(self) -> bool: + return os.environ.get(self._ENV_VAR) == "" + + def is_empty(self) -> bool: + """Cache is active, but no files are stored in ``self.path``.""" + return next(iter(self), None) is None + + def _ensure_active(self) -> None: + if self.is_not_active(): + msg = ( + f"Cache is unset.\n" + f"To enable dataset caching, set the environment variable:\n" + f" {self._ENV_VAR!r}\n\n" + f"You can set this for the current session via:\n" + f" from pathlib import Path\n" + f" from altair.datasets import load\n\n" + f" load.cache.path = Path.home() / '.altair_cache'" + ) + raise ValueError(msg) + + +csv_cache: CsvCache + + +def __getattr__(name): + if name == "csv_cache": + global csv_cache + csv_cache = CsvCache() + return csv_cache + + else: + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) diff --git a/altair/datasets/_constraints.py b/altair/datasets/_constraints.py new file mode 100644 index 000000000..395a9d906 --- /dev/null +++ b/altair/datasets/_constraints.py @@ -0,0 +1,119 @@ +"""Set-like guards for matching metadata to an implementation.""" + +from __future__ import annotations + +from collections.abc import Set +from itertools import chain +from typing import TYPE_CHECKING, Any + +from narwhals.stable import v1 as nw + +if TYPE_CHECKING: + import sys + from collections.abc import Iterable, Iterator + + from altair.datasets._typing import Metadata + + if sys.version_info >= (3, 12): + from typing import Unpack + else: + from typing_extensions import Unpack + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + +__all__ = [ + "Items", + "MetaIs", + "is_arrow", + "is_csv", + "is_json", + "is_meta", + "is_not_tabular", + "is_parquet", + "is_spatial", + "is_tsv", +] + +Items: TypeAlias = Set[tuple[str, Any]] + + +class MetaIs(Set[tuple[str, Any]]): + _requires: frozenset[tuple[str, Any]] + + def __init__(self, kwds: frozenset[tuple[str, Any]], /) -> None: + object.__setattr__(self, "_requires", kwds) + + @classmethod + def from_metadata(cls, meta: Metadata, /) -> MetaIs: + return cls(frozenset(meta.items())) + + def to_metadata(self) -> Metadata: + if TYPE_CHECKING: + + def collect(**kwds: Unpack[Metadata]) -> Metadata: + return kwds + + return collect(**dict(self)) + return dict(self) + + def to_expr(self) -> nw.Expr: + """Convert constraint into a narwhals expression.""" + if not self: + msg = f"Unable to convert an empty set to an expression:\n\n{self!r}" + raise TypeError(msg) + return nw.all_horizontal(nw.col(name) == val for name, val in self) + + def isdisjoint(self, other: Iterable[Any]) -> bool: + return 
super().isdisjoint(other) + + def issubset(self, other: Iterable[Any]) -> bool: + return self._requires.issubset(other) + + def __call__(self, meta: Items, /) -> bool: + return self._requires <= meta + + def __hash__(self) -> int: + return hash(self._requires) + + def __contains__(self, x: object) -> bool: + return self._requires.__contains__(x) + + def __iter__(self) -> Iterator[tuple[str, Any]]: + yield from self._requires + + def __len__(self) -> int: + return self._requires.__len__() + + def __setattr__(self, name: str, value: Any): + msg = ( + f"{type(self).__name__!r} is immutable.\n" + f"Could not assign self.{name} = {value}" + ) + raise TypeError(msg) + + def __repr__(self) -> str: + items = dict(self) + if not items: + contents = "" + elif suffix := items.pop("suffix", None): + contents = ", ".join( + chain([f"'*{suffix}'"], (f"{k}={v!r}" for k, v in items.items())) + ) + else: + contents = ", ".join(f"{k}={v!r}" for k, v in items.items()) + return f"is_meta({contents})" + + +def is_meta(**kwds: Unpack[Metadata]) -> MetaIs: + return MetaIs.from_metadata(kwds) + + +is_csv = is_meta(suffix=".csv") +is_json = is_meta(suffix=".json") +is_tsv = is_meta(suffix=".tsv") +is_arrow = is_meta(suffix=".arrow") +is_parquet = is_meta(suffix=".parquet") +is_spatial = is_meta(is_spatial=True) +is_not_tabular = is_meta(is_tabular=False) diff --git a/altair/datasets/_exceptions.py b/altair/datasets/_exceptions.py new file mode 100644 index 000000000..3b377f657 --- /dev/null +++ b/altair/datasets/_exceptions.py @@ -0,0 +1,107 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Sequence + + from altair.datasets._reader import _Backend + from altair.datasets._typing import Metadata + + +class AltairDatasetsError(Exception): + @classmethod + def from_url(cls, meta: Metadata, /) -> AltairDatasetsError: + if meta["suffix"] == ".parquet": + msg = ( + f"{_failed_url(meta)}" + f"{meta['suffix']!r} datasets require `vegafusion`.\n" + "See upstream issue for details: https://github.com/vega/vega/issues/3961" + ) + else: + msg = ( + f"{cls.from_url.__qualname__}() called for " + f"unimplemented extension: {meta['suffix']}\n\n{meta!r}" + ) + raise NotImplementedError(msg) + return cls(msg) + + @classmethod + def from_tabular(cls, meta: Metadata, backend_name: str, /) -> AltairDatasetsError: + if meta["is_image"]: + reason = "Image data is non-tabular." + return cls(f"{_failed_tabular(meta)}{reason}{_suggest_url(meta)}") + elif not meta["is_tabular"] or meta["suffix"] in {".arrow", ".parquet"}: + if meta["suffix"] in {".arrow", ".parquet"}: + install: tuple[str, ...] 
= "pyarrow", "polars" + what = f"{meta['suffix']!r}" + else: + install = ("polars",) + if meta["is_spatial"]: + what = "Geospatial data" + elif meta["is_json"]: + what = "Non-tabular json" + else: + what = f"{meta['file_name']!r}" + reason = _why(what, backend_name) + return cls(f"{_failed_tabular(meta)}{reason}{_suggest_url(meta, *install)}") + else: + return cls(_implementation_not_found(meta)) + + @classmethod + def from_priority(cls, priority: Sequence[_Backend], /) -> AltairDatasetsError: + msg = f"Found no supported backend, searched:\n{priority!r}" + return cls(msg) + + +def module_not_found( + backend_name: str, reqs: Sequence[str], missing: str +) -> ModuleNotFoundError: + if len(reqs) == 1: + depends = f"{reqs[0]!r} package" + else: + depends = ", ".join(f"{req!r}" for req in reqs) + " packages" + msg = ( + f"Backend {backend_name!r} requires the {depends}, but {missing!r} could not be found.\n" + f"This can be installed with pip using:\n" + f" pip install {missing}\n" + f"Or with conda using:\n" + f" conda install -c conda-forge {missing}" + ) + return ModuleNotFoundError(msg, name=missing) + + +def _failed_url(meta: Metadata, /) -> str: + return f"Unable to load {meta['file_name']!r} via url.\n" + + +def _failed_tabular(meta: Metadata, /) -> str: + return f"Unable to load {meta['file_name']!r} as tabular data.\n" + + +def _why(what: str, backend_name: str, /) -> str: + return f"{what} is not supported natively by {backend_name!r}." + + +def _suggest_url(meta: Metadata, *install_other: str) -> str: + other = "" + if install_other: + others = " or ".join(f"`{other}`" for other in install_other) + other = f" installing {others}, or use" + return ( + f"\n\nInstead, try{other}:\n" + " from altair.datasets import url\n" + f" url({meta['dataset_name']!r})" + ) + + +def _implementation_not_found(meta: Metadata, /) -> str: + """Search finished without finding a *declared* incompatibility.""" + INDENT = " " * 4 + record = f",\n{INDENT}".join( + f"{k}={v!r}" + for k, v in meta.items() + if not (k.startswith(("is_", "sha", "bytes", "has_"))) + or (v is True and k.startswith("is_")) + ) + return f"Found no implementation that supports:\n{INDENT}{record}" diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py new file mode 100644 index 000000000..cc72fb950 --- /dev/null +++ b/altair/datasets/_loader.py @@ -0,0 +1,358 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Generic, final, overload + +from narwhals.stable.v1.typing import IntoDataFrameT + +from altair.datasets import _reader +from altair.datasets._reader import IntoFrameT + +if TYPE_CHECKING: + import sys + from typing import Any, Literal + + import pandas as pd + import polars as pl + import pyarrow as pa + + from altair.datasets._cache import DatasetCache + from altair.datasets._reader import Reader + + if sys.version_info >= (3, 11): + from typing import LiteralString, Self + else: + from typing_extensions import LiteralString, Self + from altair.datasets._reader import _Backend + from altair.datasets._typing import Dataset, Extension + + +__all__ = ["Loader", "load"] + + +class Loader(Generic[IntoDataFrameT, IntoFrameT]): + """ + Load example datasets *remotely* from `vega-datasets`_, with caching. + + A new ``Loader`` must be initialized by specifying a backend:: + + from altair.datasets import Loader + + load = Loader.from_backend("polars") + load + Loader[polars] + + .. 
_vega-datasets: + https://github.com/vega/vega-datasets + """ + + _reader: Reader[IntoDataFrameT, IntoFrameT] + + @overload + @classmethod + def from_backend( + cls, backend_name: Literal["polars"] = ..., / + ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... + + @overload + @classmethod + def from_backend( + cls, backend_name: Literal["pandas", "pandas[pyarrow]"], / + ) -> Loader[pd.DataFrame, pd.DataFrame]: ... + + @overload + @classmethod + def from_backend( + cls, backend_name: Literal["pyarrow"], / + ) -> Loader[pa.Table, pa.Table]: ... + + @classmethod + def from_backend( + cls: type[Loader[Any, Any]], backend_name: _Backend = "polars", / + ) -> Loader[Any, Any]: + """ + Initialize a new loader, with the specified backend. + + Parameters + ---------- + backend_name + DataFrame package/config used to return data. + + * *polars*: Using `polars defaults`_ + * *pandas*: Using `pandas defaults`_. + * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` + * *pyarrow*: (*Experimental*) + + .. warning:: + Most datasets use a `JSON format not supported`_ by ``pyarrow`` + + Examples + -------- + Using ``polars``:: + + from altair.datasets import Loader + + load = Loader.from_backend("polars") + cars = load("cars") + + type(cars) + polars.dataframe.frame.DataFrame + + Using ``pandas``:: + + load = Loader.from_backend("pandas") + cars = load("cars") + + type(cars) + pandas.core.frame.DataFrame + + Using ``pandas``, backed by ``pyarrow`` dtypes:: + + load = Loader.from_backend("pandas[pyarrow]") + co2 = load("co2") + + type(co2) + pandas.core.frame.DataFrame + + co2.dtypes + Date datetime64[ns] + CO2 double[pyarrow] + adjusted CO2 double[pyarrow] + dtype: object + + .. _polars defaults: + https://docs.pola.rs/api/python/stable/reference/io.html + .. _pandas defaults: + https://pandas.pydata.org/docs/reference/io.html + .. _JSON format not supported: + https://arrow.apache.org/docs/python/json.html#reading-json-files + """ + return cls.from_reader(_reader._from_backend(backend_name)) + + @classmethod + def from_reader(cls, reader: Reader[IntoDataFrameT, IntoFrameT], /) -> Self: + obj = cls.__new__(cls) + obj._reader = reader + return obj + + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + **kwds: Any, + ) -> IntoDataFrameT: + """ + Get a remote dataset and load as tabular data. + + Parameters + ---------- + name + Name of the dataset/`Path.stem`_. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + **kwds + Arguments passed to the underlying read function. 
+ + Examples + -------- + Using ``polars``:: + + from altair.datasets import Loader + + load = Loader.from_backend("polars") + source = load("iowa-electricity") + + source.columns + ['year', 'source', 'net_generation'] + + source.head(5) + shape: (5, 3) + ┌────────────┬──────────────┬────────────────┐ + │ year ┆ source ┆ net_generation │ + │ --- ┆ --- ┆ --- │ + │ date ┆ str ┆ i64 │ + ╞════════════╪══════════════╪════════════════╡ + │ 2001-01-01 ┆ Fossil Fuels ┆ 35361 │ + │ 2002-01-01 ┆ Fossil Fuels ┆ 35991 │ + │ 2003-01-01 ┆ Fossil Fuels ┆ 36234 │ + │ 2004-01-01 ┆ Fossil Fuels ┆ 36205 │ + │ 2005-01-01 ┆ Fossil Fuels ┆ 36883 │ + └────────────┴──────────────┴────────────────┘ + + Using ``pandas``:: + + load = Loader.from_backend("pandas") + source = load("iowa-electricity") + + source.columns + Index(['year', 'source', 'net_generation'], dtype='object') + + source.head(5) + year source net_generation + 0 2001-01-01 Fossil Fuels 35361 + 1 2002-01-01 Fossil Fuels 35991 + 2 2003-01-01 Fossil Fuels 36234 + 3 2004-01-01 Fossil Fuels 36205 + 4 2005-01-01 Fossil Fuels 36883 + + Using ``pyarrow``:: + + load = Loader.from_backend("pyarrow") + source = load("iowa-electricity") + + source.column_names + ['year', 'source', 'net_generation'] + + source.slice(0, 5) + pyarrow.Table + year: date32[day] + source: string + net_generation: int64 + ---- + year: [[2001-01-01,2002-01-01,2003-01-01,2004-01-01,2005-01-01]] + source: [["Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels"]] + net_generation: [[35361,35991,36234,36205,36883]] + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + """ + return self._reader.dataset(name, suffix, **kwds) + + def url( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + ) -> str: + """ + Return the address of a remote dataset. + + Parameters + ---------- + name + Name of the dataset/`Path.stem`_. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + + Examples + -------- + The returned url will always point to an accessible dataset:: + + import altair as alt + from altair.datasets import Loader + + load = Loader.from_backend("polars") + load.url("cars") + "https://cdn.jsdelivr.net/npm/vega-datasets@v2.11.0/data/cars.json" + + We can pass the result directly to a chart:: + + url = load.url("cars") + alt.Chart(url).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q") + """ + return self._reader.url(name, suffix) + + @property + def cache(self) -> DatasetCache: + """ + Caching of remote dataset requests. + + Configure cache path:: + + self.cache.path = "..." + + Download the latest datasets *ahead-of-time*:: + + self.cache.download_all() + + Remove all downloaded datasets:: + + self.cache.clear() + + Disable caching:: + + self.cache.path = None + """ + return self._reader.cache + + def __repr__(self) -> str: + return f"{type(self).__name__}[{self._reader._name}]" + + +@final +class _Load(Loader[IntoDataFrameT, IntoFrameT]): + @overload + def __call__( # pyright: ignore[reportOverlappingOverload] + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + backend: None = ..., + **kwds: Any, + ) -> IntoDataFrameT: ... 
+ @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + backend: Literal["polars"] = ..., + **kwds: Any, + ) -> pl.DataFrame: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + backend: Literal["pandas", "pandas[pyarrow]"] = ..., + **kwds: Any, + ) -> pd.DataFrame: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + backend: Literal["pyarrow"] = ..., + **kwds: Any, + ) -> pa.Table: ... + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + backend: _Backend | None = None, + **kwds: Any, + ) -> IntoDataFrameT | pl.DataFrame | pd.DataFrame | pa.Table: + if backend is None: + return super().__call__(name, suffix, **kwds) + else: + return self.from_backend(backend)(name, suffix, **kwds) + + +load: _Load[Any, Any] + + +def __getattr__(name): + if name == "load": + reader = _reader.infer_backend() + global load + load = _Load.from_reader(reader) + return load + else: + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) diff --git a/altair/datasets/_metadata/metadata.csv.gz b/altair/datasets/_metadata/metadata.csv.gz new file mode 100644 index 000000000..50fef1d82 Binary files /dev/null and b/altair/datasets/_metadata/metadata.csv.gz differ diff --git a/altair/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet new file mode 100644 index 000000000..840a44a53 Binary files /dev/null and b/altair/datasets/_metadata/metadata.parquet differ diff --git a/altair/datasets/_metadata/schemas.json.gz b/altair/datasets/_metadata/schemas.json.gz new file mode 100644 index 000000000..8a593ec26 Binary files /dev/null and b/altair/datasets/_metadata/schemas.json.gz differ diff --git a/altair/datasets/_reader.py b/altair/datasets/_reader.py new file mode 100644 index 000000000..4f974fef0 --- /dev/null +++ b/altair/datasets/_reader.py @@ -0,0 +1,548 @@ +""" +Backend for ``alt.datasets.Loader``. 
+ +Notes +----- +Extending would be more ergonomic if `read`, `scan`, `_constraints` were available under a single export:: + + from altair.datasets import ext, reader + import polars as pl + + impls = ( + ext.read(pl.read_parquet, ext.is_parquet), + ext.read(pl.read_csv, ext.is_csv), + ext.read(pl.read_json, ext.is_json), + ) + user_reader = reader(impls) + user_reader.dataset("airports") +""" + +from __future__ import annotations + +from collections import Counter +from collections.abc import Mapping +from importlib import import_module +from importlib.util import find_spec +from itertools import chain +from pathlib import Path +from typing import TYPE_CHECKING, Any, ClassVar, Generic, Literal, cast, overload +from urllib.request import build_opener as _build_opener + +from narwhals.stable import v1 as nw +from narwhals.stable.v1.typing import IntoDataFrameT, IntoExpr +from packaging.requirements import Requirement + +from altair.datasets import _readimpl +from altair.datasets._cache import CsvCache, DatasetCache, SchemaCache, _iter_metadata +from altair.datasets._constraints import is_parquet +from altair.datasets._exceptions import AltairDatasetsError, module_not_found +from altair.datasets._readimpl import IntoFrameT, is_available + +if TYPE_CHECKING: + import sys + from collections.abc import Callable, Sequence + from urllib.request import OpenerDirector + + import pandas as pd + import polars as pl + import pyarrow as pa + + from altair.datasets._readimpl import BaseImpl, R, Read, Scan + from altair.datasets._typing import Dataset, Extension, Metadata + from altair.vegalite.v5.schema._typing import OneOrSeq + + if sys.version_info >= (3, 13): + from typing import TypeIs, TypeVar + else: + from typing_extensions import TypeIs, TypeVar + if sys.version_info >= (3, 12): + from typing import Unpack + else: + from typing_extensions import Unpack + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + _Polars: TypeAlias = Literal["polars"] + _Pandas: TypeAlias = Literal["pandas"] + _PyArrow: TypeAlias = Literal["pyarrow"] + _PandasAny: TypeAlias = Literal[_Pandas, "pandas[pyarrow]"] + _Backend: TypeAlias = Literal[_Polars, _PandasAny, _PyArrow] + _CuDF: TypeAlias = Literal["cudf"] + _Dask: TypeAlias = Literal["dask"] + _DuckDB: TypeAlias = Literal["duckdb"] + _Ibis: TypeAlias = Literal["ibis"] + _PySpark: TypeAlias = Literal["pyspark"] + _NwSupport: TypeAlias = Literal[ + _Polars, _Pandas, _PyArrow, _CuDF, _Dask, _DuckDB, _Ibis, _PySpark + ] + _NwSupportT = TypeVar( + "_NwSupportT", + _Polars, + _Pandas, + _PyArrow, + _CuDF, + _Dask, + _DuckDB, + _Ibis, + _PySpark, + ) + +_SupportProfile: TypeAlias = Mapping[ + Literal["supported", "unsupported"], "Sequence[Dataset]" +] +""" +Dataset support varies between backends and available dependencies. + +Any name listed in ``"unsupported"`` will raise an error on:: + + from altair.datasets import load + + load("7zip") + +Instead, they can be loaded via:: + + import altair as alt + from altair.datasets import url + + alt.Chart(url("7zip")) +""" + + +class Reader(Generic[IntoDataFrameT, IntoFrameT]): + """ + Modular file reader, targeting remote & local tabular resources. + + .. warning:: + Use ``reader(...)`` instead of instantiating ``Reader`` directly. 
+ """ + + _read: Sequence[Read[IntoDataFrameT]] + """Eager file read functions.""" + + _scan: Sequence[Scan[IntoFrameT]] + """Lazy file read functions.""" + + _name: str + """ + Used in error messages, repr and matching ``@overload``(s). + + Otherwise, has no concrete meaning. + """ + + _implementation: nw.Implementation + """ + Corresponding `narwhals implementation`_. + + .. _narwhals implementation: + https://github.com/narwhals-dev/narwhals/blob/9b6a355530ea46c590d5a6d1d0567be59c0b5742/narwhals/utils.py#L61-L290 + """ + + _opener: ClassVar[OpenerDirector] = _build_opener() + _metadata_path: ClassVar[Path] = ( + Path(__file__).parent / "_metadata" / "metadata.parquet" + ) + + def __init__( + self, + read: Sequence[Read[IntoDataFrameT]], + scan: Sequence[Scan[IntoFrameT]], + name: str, + implementation: nw.Implementation, + ) -> None: + self._read = read + self._scan = scan + self._name = name + self._implementation = implementation + self._schema_cache = SchemaCache(implementation=implementation) + + def __repr__(self) -> str: + from textwrap import indent + + PREFIX = " " * 4 + NL = "\n" + body = f"read\n{indent(NL.join(str(el) for el in self._read), PREFIX)}" + if self._scan: + body += f"\nscan\n{indent(NL.join(str(el) for el in self._scan), PREFIX)}" + return f"Reader[{self._name}] {self._implementation!r}\n{body}" + + def read_fn(self, meta: Metadata, /) -> Callable[..., IntoDataFrameT]: + return self._solve(meta, self._read) + + def scan_fn(self, meta: Metadata | Path | str, /) -> Callable[..., IntoFrameT]: + meta = meta if isinstance(meta, Mapping) else {"suffix": _into_suffix(meta)} + return self._solve(meta, self._scan) + + @property + def cache(self) -> DatasetCache: + return DatasetCache(self) + + def dataset( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + **kwds: Any, + ) -> IntoDataFrameT: + frame = self._query(name, suffix) + meta = next(_iter_metadata(frame)) + fn = self.read_fn(meta) + fn_kwds = self._merge_kwds(meta, kwds) + if self.cache.is_active(): + fp = self.cache._maybe_download(meta) + return fn(fp, **fn_kwds) + else: + with self._opener.open(meta["url"]) as f: + return fn(f, **fn_kwds) + + def url( + self, name: Dataset | LiteralString, suffix: Extension | None = None, / + ) -> str: + frame = self._query(name, suffix) + meta = next(_iter_metadata(frame)) + if is_parquet(meta.items()) and not is_available("vegafusion"): + raise AltairDatasetsError.from_url(meta) + url = meta["url"] + if isinstance(url, str): + return url + else: + msg = f"Expected 'str' but got {type(url).__name__!r}\nfrom {url!r}." + raise TypeError(msg) + + # TODO: (Multiple) + # - Settle on a better name + # - Add method to `Loader` + # - Move docs to `Loader.{new name}` + def open_markdown(self, name: Dataset, /) -> None: + """ + Learn more about a dataset, opening `vega-datasets/datapackage.md`_ with the default browser. + + Additional info *may* include: `description`_, `schema`_, `sources`_, `licenses`_. + + .. _vega-datasets/datapackage.md: + https://github.com/vega/vega-datasets/blob/main/datapackage.md + .. _description: + https://datapackage.org/standard/data-resource/#description + .. _schema: + https://datapackage.org/standard/table-schema/#schema + .. _sources: + https://datapackage.org/standard/data-package/#sources + .. 
_licenses:
+            https://datapackage.org/standard/data-package/#licenses
+        """
+        import webbrowser
+
+        from altair.utils import VERSIONS
+
+        ref = self._query(name).get_column("file_name").item(0).replace(".", "")
+        tag = VERSIONS["vega-datasets"]
+        url = f"https://github.com/vega/vega-datasets/blob/{tag}/datapackage.md#{ref}"
+        webbrowser.open(url)
+
+    @overload
+    def profile(self, *, show: Literal[False] = ...) -> _SupportProfile: ...
+
+    @overload
+    def profile(self, *, show: Literal[True]) -> None: ...
+
+    def profile(self, *, show: bool = False) -> _SupportProfile | None:
+        """
+        Describe which datasets can be loaded as tabular data.
+
+        Parameters
+        ----------
+        show
+            Print a densely formatted repr *instead of* returning a mapping.
+        """
+        relevant_columns = set(
+            chain.from_iterable(impl._relevant_columns for impl in self._read)
+        )
+        frame = self._scan_metadata().select("dataset_name", *relevant_columns)
+        inc_expr = nw.any_horizontal(impl._include_expr for impl in self._read)
+        result: _SupportProfile = {
+            "unsupported": _dataset_names(frame, ~inc_expr),
+            "supported": _dataset_names(frame, inc_expr),
+        }
+        if show:
+            import pprint
+
+            pprint.pprint(result, compact=True, sort_dicts=False)
+            return None
+        return result
+
+    def _query(
+        self, name: Dataset | LiteralString, suffix: Extension | None = None, /
+    ) -> nw.DataFrame[IntoDataFrameT]:
+        """
+        Query a tabular version of `vega-datasets/datapackage.json`_.
+
+        Applies a filter, erroring out when no results would be returned.
+
+        .. _vega-datasets/datapackage.json:
+            https://github.com/vega/vega-datasets/blob/main/datapackage.json
+        """
+        constraints = _into_constraints(name, suffix)
+        frame = self._scan_metadata(**constraints).collect()
+        if not frame.is_empty():
+            return frame
+        else:
+            msg = f"Found no results for:\n    {constraints!r}"
+            raise ValueError(msg)
+
+    def _merge_kwds(self, meta: Metadata, kwds: dict[str, Any], /) -> Mapping[str, Any]:
+        """
+        Extend user-provided arguments with dataset & library-specific defaults.
+
+        .. important:: User-provided arguments have a higher precedence.
+        """
+        if self._schema_cache.is_active() and (
+            schema := self._schema_cache.schema_kwds(meta)
+        ):
+            kwds = schema | kwds if kwds else schema
+        return kwds
+
+    @property
+    def _metadata_frame(self) -> nw.LazyFrame[IntoFrameT]:
+        fp = self._metadata_path
+        return nw.from_native(self.scan_fn(fp)(fp)).lazy()
+
+    def _scan_metadata(
+        self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata]
+    ) -> nw.LazyFrame[IntoFrameT]:
+        if predicates or constraints:
+            return self._metadata_frame.filter(*predicates, **constraints)
+        return self._metadata_frame
+
+    def _solve(
+        self, meta: Metadata, impls: Sequence[BaseImpl[R]], /
+    ) -> Callable[..., R]:
+        """
+        Return the first function that satisfies dataset constraints.
+ + See Also + -------- + ``altair.datasets._readimpl.BaseImpl.unwrap_or_skip`` + """ + items = meta.items() + it = (some for impl in impls if (some := impl.unwrap_or_skip(items))) + if fn_or_err := next(it, None): + if _is_err(fn_or_err): + raise fn_or_err.from_tabular(meta, self._name) + return fn_or_err + raise AltairDatasetsError.from_tabular(meta, self._name) + + +def _dataset_names( + frame: nw.LazyFrame, *predicates: OneOrSeq[IntoExpr] +) -> Sequence[Dataset]: + # NOTE: helper function for `Reader.profile` + return ( + frame.filter(*predicates) + .select("dataset_name") + .collect() + .get_column("dataset_name") + .to_list() + ) + + +class _NoParquetReader(Reader[IntoDataFrameT, IntoFrameT]): + def __repr__(self) -> str: + return f"{super().__repr__()}\ncsv_cache\n {self.csv_cache!r}" + + @property + def csv_cache(self) -> CsvCache: + if not hasattr(self, "_csv_cache"): + self._csv_cache = CsvCache() + return self._csv_cache + + @property + def _metadata_frame(self) -> nw.LazyFrame[IntoFrameT]: + data = cast("dict[str, Any]", self.csv_cache.rotated) + impl = self._implementation + return nw.maybe_convert_dtypes(nw.from_dict(data, backend=impl)).lazy() + + +@overload +def reader( + read_fns: Sequence[Read[IntoDataFrameT]], + scan_fns: tuple[()] = ..., + *, + name: str | None = ..., + implementation: nw.Implementation = ..., +) -> Reader[IntoDataFrameT, nw.LazyFrame[IntoDataFrameT]]: ... + + +@overload +def reader( + read_fns: Sequence[Read[IntoDataFrameT]], + scan_fns: Sequence[Scan[IntoFrameT]], + *, + name: str | None = ..., + implementation: nw.Implementation = ..., +) -> Reader[IntoDataFrameT, IntoFrameT]: ... + + +def reader( + read_fns: Sequence[Read[IntoDataFrameT]], + scan_fns: Sequence[Scan[IntoFrameT]] = (), + *, + name: str | None = None, + implementation: nw.Implementation = nw.Implementation.UNKNOWN, +) -> ( + Reader[IntoDataFrameT, IntoFrameT] + | Reader[IntoDataFrameT, nw.LazyFrame[IntoDataFrameT]] +): + name = name or Counter(el._inferred_package for el in read_fns).most_common(1)[0][0] + if implementation is nw.Implementation.UNKNOWN: + implementation = _into_implementation(Requirement(name)) + if scan_fns: + return Reader(read_fns, scan_fns, name, implementation) + if stolen := _steal_eager_parquet(read_fns): + return Reader(read_fns, stolen, name, implementation) + else: + return _NoParquetReader[IntoDataFrameT](read_fns, (), name, implementation) + + +def infer_backend( + *, priority: Sequence[_Backend] = ("polars", "pandas[pyarrow]", "pandas", "pyarrow") +) -> Reader[Any, Any]: + """ + Return the first available reader in order of `priority`. + + Notes + ----- + - ``"polars"``: can natively load every dataset (including ``(Geo|Topo)JSON``) + - ``"pandas[pyarrow]"``: can load *most* datasets, guarantees ``.parquet`` support + - ``"pandas"``: supports ``.parquet``, if `fastparquet`_ is installed + - ``"pyarrow"``: least reliable + + .. _fastparquet: + https://github.com/dask/fastparquet + """ + it = (_from_backend(name) for name in priority if is_available(_requirements(name))) + if reader := next(it, None): + return reader + raise AltairDatasetsError.from_priority(priority) + + +@overload +def _from_backend(name: _Polars, /) -> Reader[pl.DataFrame, pl.LazyFrame]: ... +@overload +def _from_backend(name: _PandasAny, /) -> Reader[pd.DataFrame, pd.DataFrame]: ... +@overload +def _from_backend(name: _PyArrow, /) -> Reader[pa.Table, pa.Table]: ... 
+ + +# FIXME: The order this is defined in makes splitting the module complicated +# - Can't use a classmethod, since some result in a subclass used +def _from_backend(name: _Backend, /) -> Reader[Any, Any]: + """ + Reader initialization dispatcher. + + FIXME: Works, but defining these in mixed shape functions seems off. + """ + if not _is_backend(name): + msg = f"Unknown backend {name!r}" + raise TypeError(msg) + implementation = _into_implementation(name) + if name == "polars": + rd, sc = _readimpl.pl_only() + return reader(rd, sc, name=name, implementation=implementation) + elif name == "pandas[pyarrow]": + return reader(_readimpl.pd_pyarrow(), name=name, implementation=implementation) + elif name == "pandas": + return reader(_readimpl.pd_only(), name=name, implementation=implementation) + elif name == "pyarrow": + return reader(_readimpl.pa_any(), name=name, implementation=implementation) + + +def _is_backend(obj: Any) -> TypeIs[_Backend]: + return obj in {"polars", "pandas", "pandas[pyarrow]", "pyarrow"} + + +def _is_err(obj: Any) -> TypeIs[type[AltairDatasetsError]]: + return obj is AltairDatasetsError + + +def _into_constraints( + name: Dataset | LiteralString, suffix: Extension | None, / +) -> Metadata: + """Transform args into a mapping to column names.""" + m: Metadata = {} + if "." in name: + m["file_name"] = name + elif suffix is None: + m["dataset_name"] = name + elif suffix.startswith("."): + m = {"dataset_name": name, "suffix": suffix} + else: + from typing import get_args + + from altair.datasets._typing import Extension + + msg = ( + f"Expected 'suffix' to be one of {get_args(Extension)!r},\n" + f"but got: {suffix!r}" + ) + raise TypeError(msg) + return m + + +def _into_implementation( + backend: _NwSupport | _PandasAny | Requirement, / +) -> nw.Implementation: + primary = _import_guarded(backend) + impl = nw.Implementation.from_backend(primary) + if impl is not nw.Implementation.UNKNOWN: + return impl + msg = f"Package {primary!r} is not supported by `narwhals`." + raise ValueError(msg) + + +def _into_suffix(obj: Path | str, /) -> Any: + if isinstance(obj, Path): + return obj.suffix + elif isinstance(obj, str): + return obj + else: + msg = f"Unexpected type {type(obj).__name__!r}" + raise TypeError(msg) + + +def _steal_eager_parquet( + read_fns: Sequence[Read[IntoDataFrameT]], / +) -> Sequence[Scan[nw.LazyFrame[IntoDataFrameT]]] | None: + if convertable := next((rd for rd in read_fns if rd.include <= is_parquet), None): + return (_readimpl.into_scan(convertable),) + return None + + +@overload +def _import_guarded(req: _PandasAny, /) -> _Pandas: ... + + +@overload +def _import_guarded(req: _NwSupportT, /) -> _NwSupportT: ... + + +@overload +def _import_guarded(req: Requirement, /) -> LiteralString: ... 
+ + +def _import_guarded(req: Any, /) -> LiteralString: + requires = _requirements(req) + for name in requires: + if spec := find_spec(name): + import_module(spec.name) + else: + raise module_not_found(str(req), requires, missing=name) + return requires[0] + + +def _requirements(req: Requirement | str, /) -> tuple[Any, ...]: + req = Requirement(req) if isinstance(req, str) else req + return (req.name, *req.extras) diff --git a/altair/datasets/_readimpl.py b/altair/datasets/_readimpl.py new file mode 100644 index 000000000..1a5840167 --- /dev/null +++ b/altair/datasets/_readimpl.py @@ -0,0 +1,445 @@ +"""Individual read functions and siuations they support.""" + +from __future__ import annotations + +import sys +from enum import Enum +from functools import partial, wraps +from importlib.util import find_spec +from itertools import chain +from operator import itemgetter +from pathlib import Path +from typing import TYPE_CHECKING, Any, Generic, Literal + +from narwhals.stable import v1 as nw +from narwhals.stable.v1.dependencies import get_pandas, get_polars +from narwhals.stable.v1.typing import IntoDataFrameT + +from altair.datasets._constraints import ( + is_arrow, + is_csv, + is_json, + is_meta, + is_not_tabular, + is_parquet, + is_spatial, + is_tsv, +) +from altair.datasets._exceptions import AltairDatasetsError + +if sys.version_info >= (3, 13): + from typing import TypeVar +else: + from typing_extensions import TypeVar +if sys.version_info >= (3, 12): + from typing import TypeAliasType +else: + from typing_extensions import TypeAliasType + +if TYPE_CHECKING: + from collections.abc import Callable, Iterable, Iterator, Sequence + from io import IOBase + from types import ModuleType + + import pandas as pd + import polars as pl + import pyarrow as pa + from narwhals.stable.v1 import typing as nwt + + from altair.datasets._constraints import Items, MetaIs + +__all__ = ["is_available", "pa_any", "pd_only", "pd_pyarrow", "pl_only", "read", "scan"] + +R = TypeVar("R", bound="nwt.IntoFrame") +IntoFrameT = TypeVar( + "IntoFrameT", + bound="nwt.NativeFrame | nw.DataFrame[Any] | nw.LazyFrame[Any] | nwt.DataFrameLike", + default=nw.LazyFrame[Any], +) +Read = TypeAliasType("Read", "BaseImpl[IntoDataFrameT]", type_params=(IntoDataFrameT,)) +"""An *eager* file read function.""" + +Scan = TypeAliasType("Scan", "BaseImpl[IntoFrameT]", type_params=(IntoFrameT,)) +"""A *lazy* file read function.""" + + +class Skip(Enum): + """Falsy sentinel.""" + + skip = 0 + + def __bool__(self) -> Literal[False]: + return False + + def __repr__(self) -> Literal[""]: + return "" + + +class BaseImpl(Generic[R]): + """ + A function wrapped with dataset support constraints. + + The ``include``, ``exclude`` properties form a `NIMPLY gate`_ (`Material nonimplication`_). + + Examples + -------- + For some dataset ``D``, we can use ``fn`` if:: + + impl: BaseImpl + impl.include(D) and not impl.exclude(D) + + + .. _NIMPLY gate: + https://en.m.wikipedia.org/wiki/NIMPLY_gate + .. 
_Material nonimplication: + https://en.m.wikipedia.org/wiki/Material_nonimplication#Truth_table + """ + + fn: Callable[..., R] + """Wrapped read/scan function.""" + + include: MetaIs + """Constraint indicating ``fn`` **supports** reading a dataset.""" + + exclude: MetaIs + """Constraint *subsetting* ``include`` to mark **non-support**.""" + + def __init__( + self, + fn: Callable[..., R], + include: MetaIs, + exclude: MetaIs | None, + kwds: dict[str, Any], + /, + ) -> None: + exclude = exclude or self._exclude_none() + if not include.isdisjoint(exclude): + intersection = ", ".join(f"{k}={v!r}" for k, v in include & exclude) + msg = f"Constraints overlap at: `{intersection}`\ninclude={include!r}\nexclude={exclude!r}" + raise TypeError(msg) + object.__setattr__(self, "fn", partial(fn, **kwds) if kwds else fn) + object.__setattr__(self, "include", include) + object.__setattr__(self, "exclude", exclude) + + def unwrap_or_skip( + self, meta: Items, / + ) -> Callable[..., R] | type[AltairDatasetsError] | Skip: + """ + Indicate an action to take for a dataset. + + **Supports** dataset, use this function:: + + Callable[..., R] + + Has explicitly marked as **not supported**:: + + type[AltairDatasetsError] + + No relevant constraints overlap, safe to check others:: + + Skip + """ + if self.include.issubset(meta): + return self.fn if self.exclude.isdisjoint(meta) else AltairDatasetsError + return Skip.skip + + @classmethod + def _exclude_none(cls) -> MetaIs: + """Represents the empty set.""" + return is_meta() + + def __setattr__(self, name: str, value: Any): + msg = ( + f"{type(self).__name__!r} is immutable.\n" + f"Could not assign self.{name} = {value}" + ) + raise TypeError(msg) + + @property + def _inferred_package(self) -> str: + return _root_package_name(_unwrap_partial(self.fn), "UNKNOWN") + + def __repr__(self) -> str: + tp_name = f"{type(self).__name__}[{self._inferred_package}?]" + return f"{tp_name}({self})" + + def __str__(self) -> str: + if isinstance(self.fn, partial): + fn = _unwrap_partial(self.fn) + kwds = self.fn.keywords.items() + fn_repr = f"{fn.__name__}(..., {', '.join(f'{k}={v!r}' for k, v in kwds)})" + else: + fn_repr = f"{self.fn.__name__}(...)" + inc, exc = self.include, self.exclude + return f"{fn_repr}, {f'include={inc!r}, exclude={exc!r}' if exc else repr(inc)}" + + @property + def _relevant_columns(self) -> Iterator[str]: + name = itemgetter(0) + yield from (name(obj) for obj in chain(self.include, self.exclude)) + + @property + def _include_expr(self) -> nw.Expr: + return ( + self.include.to_expr() & ~self.exclude.to_expr() + if self.exclude + else self.include.to_expr() + ) + + @property + def _exclude_expr(self) -> nw.Expr: + if self.exclude: + return self.include.to_expr() & self.exclude.to_expr() + msg = f"Unable to generate an exclude expression without setting exclude\n\n{self!r}" + raise TypeError(msg) + + +def read( + fn: Callable[..., IntoDataFrameT], + /, + include: MetaIs, + exclude: MetaIs | None = None, + **kwds: Any, +) -> Read[IntoDataFrameT]: + return BaseImpl(fn, include, exclude, kwds) + + +def scan( + fn: Callable[..., IntoFrameT], + /, + include: MetaIs, + exclude: MetaIs | None = None, + **kwds: Any, +) -> Scan[IntoFrameT]: + return BaseImpl(fn, include, exclude, kwds) + + +def into_scan(impl: Read[IntoDataFrameT], /) -> Scan[nw.LazyFrame[IntoDataFrameT]]: + def scan_fn( + fn: Callable[..., IntoDataFrameT], / + ) -> Callable[..., nw.LazyFrame[IntoDataFrameT]]: + @wraps(_unwrap_partial(fn)) + def wrapper(*args: Any, **kwds: Any) -> 
nw.LazyFrame[IntoDataFrameT]: + return nw.from_native(fn(*args, **kwds)).lazy() + + return wrapper + + return scan(scan_fn(impl.fn), impl.include, impl.exclude) + + +def is_available( + pkg_names: str | Iterable[str], *more_pkg_names: str, require_all: bool = True +) -> bool: + """ + Check for importable package(s), without raising on failure. + + Parameters + ---------- + pkg_names, more_pkg_names + One or more packages. + require_all + * ``True`` every package. + * ``False`` at least one package. + """ + if not more_pkg_names and isinstance(pkg_names, str): + return find_spec(pkg_names) is not None + pkgs_names = pkg_names if not isinstance(pkg_names, str) else (pkg_names,) + names = chain(pkgs_names, more_pkg_names) + fn = all if require_all else any + return fn(find_spec(name) is not None for name in names) + + +def _root_package_name(obj: Any, default: str, /) -> str: + # NOTE: Defers importing `inspect`, if we can get the module name + if hasattr(obj, "__module__"): + return obj.__module__.split(".")[0] + else: + from inspect import getmodule + + module = getmodule(obj) + if module and (pkg := module.__package__): + return pkg.split(".")[0] + return default + + +def _unwrap_partial(fn: Any, /) -> Any: + # NOTE: ``functools._unwrap_partial`` + func = fn + while isinstance(func, partial): + func = func.func + return func + + +def pl_only() -> tuple[Sequence[Read[pl.DataFrame]], Sequence[Scan[pl.LazyFrame]]]: + import polars as pl + + read_fns = ( + read(pl.read_csv, is_csv, try_parse_dates=True), + read(_pl_read_json_roundtrip(get_polars()), is_json), + read(pl.read_csv, is_tsv, separator="\t", try_parse_dates=True), + read(pl.read_ipc, is_arrow), + read(pl.read_parquet, is_parquet), + ) + scan_fns = (scan(pl.scan_parquet, is_parquet),) + return read_fns, scan_fns + + +def pd_only() -> Sequence[Read[pd.DataFrame]]: + import pandas as pd + + opt: Sequence[Read[pd.DataFrame]] + if is_available("pyarrow"): + opt = read(pd.read_feather, is_arrow), read(pd.read_parquet, is_parquet) + elif is_available("fastparquet"): + opt = (read(pd.read_parquet, is_parquet),) + else: + opt = () + return ( + read(pd.read_csv, is_csv), + read(_pd_read_json(get_pandas()), is_json, exclude=is_spatial), + read(pd.read_csv, is_tsv, sep="\t"), + *opt, + ) + + +def pd_pyarrow() -> Sequence[Read[pd.DataFrame]]: + import pandas as pd + + kwds: dict[str, Any] = {"dtype_backend": "pyarrow"} + return ( + read(pd.read_csv, is_csv, **kwds), + read(_pd_read_json(get_pandas()), is_json, exclude=is_spatial, **kwds), + read(pd.read_csv, is_tsv, sep="\t", **kwds), + read(pd.read_feather, is_arrow, **kwds), + read(pd.read_parquet, is_parquet, **kwds), + ) + + +def pa_any() -> Sequence[Read[pa.Table]]: + from pyarrow import csv, feather, parquet + + return ( + read(csv.read_csv, is_csv), + _pa_read_json_impl(), + read(csv.read_csv, is_tsv, parse_options=csv.ParseOptions(delimiter="\t")), # pyright: ignore[reportCallIssue] + read(feather.read_table, is_arrow), + read(parquet.read_table, is_parquet), + ) + + +def _pa_read_json_impl() -> Read[pa.Table]: + """ + Mitigating ``pyarrow``'s `line-delimited`_ JSON requirement. + + .. 
_line-delimited: + https://arrow.apache.org/docs/python/json.html#reading-json-files + """ + if is_available("polars"): + return read(_pl_read_json_roundtrip_to_arrow(get_polars()), is_json) + elif is_available("pandas"): + return read(_pd_read_json_to_arrow(get_pandas()), is_json, exclude=is_spatial) + return read(_stdlib_read_json_to_arrow, is_json, exclude=is_not_tabular) + + +def _pd_read_json(ns: ModuleType, /) -> Callable[..., pd.DataFrame]: + @wraps(ns.read_json) + def fn(source: Path | Any, /, **kwds: Any) -> pd.DataFrame: + return _pd_fix_dtypes_nw(ns.read_json(source, **kwds), **kwds).to_native() + + return fn + + +def _pd_fix_dtypes_nw( + df: pd.DataFrame, /, *, dtype_backend: Any = None, **kwds: Any +) -> nw.DataFrame[pd.DataFrame]: + kwds = {"dtype_backend": dtype_backend} if dtype_backend else {} + return ( + df.convert_dtypes(**kwds) + .pipe(nw.from_native, eager_only=True) + .with_columns(nw.selectors.by_dtype(nw.Object).cast(nw.String)) + ) + + +def _pd_read_json_to_arrow(ns: ModuleType, /) -> Callable[..., pa.Table]: + @wraps(ns.read_json) + def fn(source: Path | Any, /, *, schema: Any = None, **kwds: Any) -> pa.Table: + """``schema`` is only here to swallow the ``SchemaCache`` if used.""" + return ( + ns.read_json(source, **kwds) + .pipe(_pd_fix_dtypes_nw, dtype_backend="pyarrow") + .to_arrow() + ) + + return fn + + +def _pl_read_json_roundtrip(ns: ModuleType, /) -> Callable[..., pl.DataFrame]: + """ + Try to utilize better date parsing available in `pl.read_csv`_. + + `pl.read_json`_ has few options when compared to `pl.read_csv`_. + + Chaining the two together - *where possible* - is still usually faster than `pandas.read_json`_. + + .. _pl.read_json: + https://docs.pola.rs/api/python/stable/reference/api/polars.read_json.html + .. _pl.read_csv: + https://docs.pola.rs/api/python/stable/reference/api/polars.read_csv.html + .. 
_pandas.read_json: + https://pandas.pydata.org/docs/reference/api/pandas.read_json.html + """ + from io import BytesIO + + @wraps(ns.read_json) + def fn(source: Path | IOBase, /, **kwds: Any) -> pl.DataFrame: + df = ns.read_json(source, **kwds) + if any(tp.is_nested() for tp in df.schema.dtypes()): + return df + buf = BytesIO() + df.write_csv(buf) + if kwds: + SHARED_KWDS = {"schema", "schema_overrides", "infer_schema_length"} + kwds = {k: v for k, v in kwds.items() if k in SHARED_KWDS} + return ns.read_csv(buf, try_parse_dates=True, **kwds) + + return fn + + +def _pl_read_json_roundtrip_to_arrow(ns: ModuleType, /) -> Callable[..., pa.Table]: + eager = _pl_read_json_roundtrip(ns) + + @wraps(ns.read_json) + def fn(source: Path | IOBase, /, **kwds: Any) -> pa.Table: + return eager(source).to_arrow() + + return fn + + +def _stdlib_read_json(source: Path | Any, /) -> Any: + import json + + if not isinstance(source, Path): + return json.load(source) + else: + with Path(source).open(encoding="utf-8") as f: + return json.load(f) + + +def _stdlib_read_json_to_arrow(source: Path | Any, /, **kwds: Any) -> pa.Table: + import pyarrow as pa + + rows: list[dict[str, Any]] = _stdlib_read_json(source) + try: + return pa.Table.from_pylist(rows, **kwds) + except TypeError: + import csv + import io + + from pyarrow import csv as pa_csv + + with io.StringIO() as f: + writer = csv.DictWriter(f, rows[0].keys(), dialect=csv.unix_dialect) + writer.writeheader() + writer.writerows(rows) + with io.BytesIO(f.getvalue().encode()) as f2: + return pa_csv.read_csv(f2) diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py new file mode 100644 index 000000000..3357ddf3b --- /dev/null +++ b/altair/datasets/_typing.py @@ -0,0 +1,218 @@ +# The contents of this file are automatically written by +# tools/datasets.__init__.py. Do not modify directly. 
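The JSON-to-CSV roundtrip that ``_pl_read_json_roundtrip`` describes above is easy to reproduce in isolation. A minimal standalone sketch of the same idea, assuming ``polars`` is installed and the dataset has no nested columns (the function name below is illustrative, not part of the package):

```
import io

import polars as pl


def read_json_with_dates(source: str) -> pl.DataFrame:
    """Re-parse a JSON dataset through CSV so date-like strings gain temporal dtypes."""
    df = pl.read_json(source)
    if any(tp.is_nested() for tp in df.schema.dtypes()):
        # Nested columns would not survive a CSV roundtrip; keep the original frame.
        return df
    buf = io.BytesIO()
    df.write_csv(buf)
    buf.seek(0)
    return pl.read_csv(buf, try_parse_dates=True)
```

The detour through an in-memory CSV buffer exists only because ``pl.read_json`` leaves date-like strings unparsed, while ``pl.read_csv(..., try_parse_dates=True)`` infers them; for nested data the roundtrip is skipped, matching the helper above.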
+ +from __future__ import annotations + +import sys +from typing import Literal + +if sys.version_info >= (3, 14): + from typing import TypedDict +else: + from typing_extensions import TypedDict + +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + + +__all__ = ["Dataset", "Extension", "Metadata"] + +Dataset: TypeAlias = Literal[ + "7zip", + "airports", + "annual-precip", + "anscombe", + "barley", + "birdstrikes", + "budget", + "budgets", + "burtin", + "cars", + "co2-concentration", + "countries", + "crimea", + "disasters", + "driving", + "earthquakes", + "ffox", + "flare", + "flare-dependencies", + "flights-10k", + "flights-200k", + "flights-20k", + "flights-2k", + "flights-3m", + "flights-5k", + "flights-airport", + "football", + "gapminder", + "gapminder-health-income", + "gimp", + "github", + "global-temp", + "income", + "iowa-electricity", + "jobs", + "la-riots", + "londonBoroughs", + "londonCentroids", + "londonTubeLines", + "lookup_groups", + "lookup_people", + "miserables", + "monarchs", + "movies", + "normal-2d", + "obesity", + "ohlc", + "penguins", + "platformer-terrain", + "political-contributions", + "population", + "population_engineers_hurricanes", + "seattle-weather", + "seattle-weather-hourly-normals", + "sp500", + "sp500-2000", + "stocks", + "udistrict", + "unemployment", + "unemployment-across-industries", + "uniform-2d", + "us-10m", + "us-employment", + "us-state-capitals", + "volcano", + "weather", + "weekly-weather", + "wheat", + "windvectors", + "world-110m", + "zipcodes", +] +Extension: TypeAlias = Literal[".arrow", ".csv", ".json", ".parquet", ".png", ".tsv"] + + +class Metadata(TypedDict, total=False): + """ + Full schema for ``metadata.parquet``. + + Parameters + ---------- + dataset_name + Name of the dataset/`Path.stem`_. + suffix + File extension/`Path.suffix`_. + file_name + Equivalent to `Path.name`_. + bytes + File size in *bytes*. + is_image + Only accessible via url. + is_tabular + Can be read as tabular data. + is_geo + `GeoJSON`_ format. + is_topo + `TopoJSON`_ format. + is_spatial + Any geospatial format. Only natively supported by ``polars``. + is_json + Not supported natively by ``pyarrow``. + has_schema + Data types available for improved ``pandas`` parsing. + sha + Unique hash for the dataset. + + .. note:: + E.g. if the dataset did *not* change between ``v1.0.0``-``v2.0.0``; + + then this value would remain stable. + url + Remote url used to access dataset. + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.name: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.name + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. _GeoJSON: + https://en.wikipedia.org/wiki/GeoJSON + .. 
_TopoJSON: + https://en.wikipedia.org/wiki/GeoJSON#TopoJSON + + + Examples + -------- + ``Metadata`` keywords form constraints to filter a table like the below sample: + + ``` + shape: (72, 13) + ┌────────────────┬────────┬────────────────┬───┬───────────────┬───────────────┐ + │ dataset_name ┆ suffix ┆ file_name ┆ … ┆ sha ┆ url │ + │ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ ┆ str ┆ str │ + ╞════════════════╪════════╪════════════════╪═══╪═══════════════╪═══════════════╡ + │ 7zip ┆ .png ┆ 7zip.png ┆ … ┆ 6586d6c00887c ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ d48850099c17… ┆ sdelivr.net/… │ + │ airports ┆ .csv ┆ airports.csv ┆ … ┆ 608ba6d51fa70 ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ 584c3fa1d31e… ┆ sdelivr.net/… │ + │ annual-precip ┆ .json ┆ annual-precip. ┆ … ┆ 719e73406cfc0 ┆ https://cdn.j │ + │ ┆ ┆ json ┆ ┆ 8f16dda65151… ┆ sdelivr.net/… │ + │ anscombe ┆ .json ┆ anscombe.json ┆ … ┆ 11ae97090b626 ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ 3bdf0c866115… ┆ sdelivr.net/… │ + │ barley ┆ .json ┆ barley.json ┆ … ┆ 8dc50de2509b6 ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ e197ce95c24c… ┆ sdelivr.net/… │ + │ … ┆ … ┆ … ┆ … ┆ … ┆ … │ + │ weekly-weather ┆ .json ┆ weekly-weather ┆ … ┆ bd42a3e2403e7 ┆ https://cdn.j │ + │ ┆ ┆ .json ┆ ┆ ccd6baaa89f9… ┆ sdelivr.net/… │ + │ wheat ┆ .json ┆ wheat.json ┆ … ┆ cde46b43fc82f ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ 4c3c2a37ddcf… ┆ sdelivr.net/… │ + │ windvectors ┆ .csv ┆ windvectors.cs ┆ … ┆ ed686b0ba613a ┆ https://cdn.j │ + │ ┆ ┆ v ┆ ┆ bd59d09fcd94… ┆ sdelivr.net/… │ + │ world-110m ┆ .json ┆ world-110m.jso ┆ … ┆ a1ce852de6f27 ┆ https://cdn.j │ + │ ┆ ┆ n ┆ ┆ 13c94c0c2840… ┆ sdelivr.net/… │ + │ zipcodes ┆ .csv ┆ zipcodes.csv ┆ … ┆ d3df33e12be0d ┆ https://cdn.j │ + │ ┆ ┆ ┆ ┆ 0544c95f1bd4… ┆ sdelivr.net/… │ + └────────────────┴────────┴────────────────┴───┴───────────────┴───────────────┘ + ``` + """ + + dataset_name: str + suffix: str + file_name: str + bytes: int + is_image: bool + is_tabular: bool + is_geo: bool + is_topo: bool + is_spatial: bool + is_json: bool + has_schema: bool + sha: str + url: str + + +FlFieldStr: TypeAlias = Literal[ + "integer", + "number", + "boolean", + "string", + "object", + "array", + "date", + "datetime", + "time", + "duration", +] +""" +String representation of `frictionless`_ `Field Types`_. + +.. _frictionless: + https://github.com/frictionlessdata/frictionless-py +.. _Field Types: + https://datapackage.org/standard/table-schema/#field-types +""" diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index e7708f0d7..d75cdb593 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -1684,7 +1684,7 @@ def with_property_setters(cls: type[TSchemaBase]) -> type[TSchemaBase]: ], str, ] = { - "vega-datasets": "v2.11.0", + "vega-datasets": "3.0.0-alpha.1", "vega-embed": "6", "vega-lite": "v5.21.0", "vegafusion": "1.6.6", diff --git a/doc/user_guide/api.rst b/doc/user_guide/api.rst index 5793f0ae8..336c29d54 100644 --- a/doc/user_guide/api.rst +++ b/doc/user_guide/api.rst @@ -791,5 +791,21 @@ Typing Optional is_chart_type +.. _api-datasets: + +Datasets +-------- +.. currentmodule:: altair.datasets + +.. autosummary:: + :toctree: generated/datasets/ + :nosignatures: + + Loader + load + url + .. _Generic: https://typing.readthedocs.io/en/latest/spec/generics.html#generics +.. 
_vega-datasets: + https://github.com/vega/vega-datasets diff --git a/pyproject.toml b/pyproject.toml index 7fadb3049..b9edc7ea2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,9 +104,9 @@ doc = [ [tool.altair.vega] # Minimum/exact versions, for projects under the `vega` organization -vega-datasets = "v2.11.0" # https://github.com/vega/vega-datasets -vega-embed = "6" # https://github.com/vega/vega-embed -vega-lite = "v5.21.0" # https://github.com/vega/vega-lite +vega-datasets = "3.0.0-alpha.1" # https://github.com/vega/vega-datasets +vega-embed = "6" # https://github.com/vega/vega-embed +vega-lite = "v5.21.0" # https://github.com/vega/vega-lite [tool.hatch] build = { include = ["/altair"], artifacts = ["altair/jupyter/js/index.js"] } @@ -137,7 +137,8 @@ target-version = "py39" [tool.ruff.lint] extend-safe-fixes = [ # https://docs.astral.sh/ruff/settings/#lint_extend-safe-fixes - "ANN204", # missing-return-type-special-method + "ANN204", # missing-return-type-special-method + "C405", # unnecessary-literal-set "C419", # unnecessary-comprehension-in-call "C420", # unnecessary-dict-comprehension-for-iterable "D200", # fits-on-one-line @@ -261,15 +262,19 @@ cwd = "." [tool.taskipy.tasks] lint = "ruff check" format = "ruff format --diff --check" +ruff-check = "task lint && task format" ruff-fix = "task lint && ruff format" type-check = "mypy altair tests" -pytest = "pytest" -test = "task lint && task format && task type-check && task pytest" -test-fast = "task ruff-fix && pytest -m \"not slow\"" -test-slow = "task ruff-fix && pytest -m \"slow\"" -test-min = "task lint && task format && task type-check && hatch test --python 3.9" -test-all = "task lint && task format && task type-check && hatch test --all" +pytest-serial = "pytest -m \"no_xdist\" --numprocesses=1" +pytest = "pytest && task pytest-serial" +test = "task ruff-check && task type-check && task pytest" +test-fast = "task ruff-fix && pytest -m \"not slow and not datasets_debug and not no_xdist\"" +test-slow = "task ruff-fix && pytest -m \"slow and not datasets_debug and not no_xdist\"" +test-datasets = "task ruff-fix && pytest tests -k test_datasets -m \"not no_xdist\" && task pytest-serial" +test-min = "task ruff-check && task type-check && hatch test --python 3.9" +test-all = "task ruff-check && task type-check && hatch test --all" + generate-schema-wrapper = "mypy tools && python tools/generate_schema_wrapper.py && task test" update-init-file = "python tools/update_init_file.py && task ruff-fix" @@ -294,10 +299,19 @@ publish = "task build && uv publish" # They contain examples which are being executed by the # test_examples tests. 
norecursedirs = ["tests/examples_arguments_syntax", "tests/examples_methods_syntax"] -addopts = ["--numprocesses=logical", "--doctest-modules", "tests", "altair", "tools"] +addopts = [ + "--numprocesses=logical", + "--doctest-modules", + "tests", + "altair", + "tools", + "-m not datasets_debug and not no_xdist", +] # https://docs.pytest.org/en/stable/how-to/mark.html#registering-marks markers = [ - "slow: Label tests as slow (deselect with '-m \"not slow\"')" + "slow: Label tests as slow (deselect with '-m \"not slow\"')", + "datasets_debug: Disabled by default due to high number of requests", + "no_xdist: Unsafe to run in parallel" ] [tool.mypy] diff --git a/tests/__init__.py b/tests/__init__.py index 5d78dce0d..80c27fc2c 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -60,6 +60,16 @@ def windows_has_tzdata() -> bool: >>> hatch run test-slow --durations=25 # doctest: +SKIP """ +no_xdist: pytest.MarkDecorator = pytest.mark.no_xdist() +""" +Custom ``pytest.mark`` decorator. + +Each marked test will run **serially**, after all other selected tests. + +.. tip:: + Use as a last resort when a test depends on manipulating global state. +""" + skip_requires_ipython: pytest.MarkDecorator = pytest.mark.skipif( find_spec("IPython") is None, reason="`IPython` not installed." ) diff --git a/tests/test_datasets.py b/tests/test_datasets.py new file mode 100644 index 000000000..f112cacb8 --- /dev/null +++ b/tests/test_datasets.py @@ -0,0 +1,680 @@ +from __future__ import annotations + +import datetime as dt +import re +import sys +from functools import partial +from importlib import import_module +from importlib.util import find_spec +from pathlib import Path +from typing import TYPE_CHECKING, Any, cast, get_args +from urllib.error import URLError + +import pytest +from narwhals.stable import v1 as nw +from narwhals.stable.v1 import dependencies as nw_dep + +from altair.datasets import Loader +from altair.datasets._exceptions import AltairDatasetsError +from altair.datasets._typing import Dataset, Metadata +from tests import no_xdist, skip_requires_pyarrow +from tools import fs + +if TYPE_CHECKING: + from collections.abc import Callable, Mapping + from pathlib import Path + from typing import Literal + + import pandas as pd + import polars as pl + from _pytest.mark.structures import ParameterSet + + from altair.datasets._reader import _Backend, _PandasAny, _Polars, _PyArrow + from altair.vegalite.v5.schema._typing import OneOrSeq + + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + PolarsLoader: TypeAlias = Loader[pl.DataFrame, pl.LazyFrame] + +datasets_debug: pytest.MarkDecorator = pytest.mark.datasets_debug() +""" +Custom ``pytest.mark`` decorator. + +Use for more exhaustive tests that require many requests. + +**Disabled** by default in ``pyproject.toml``: + + [tool.pytest.ini_options] + addopts = ... 
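+
+    To run them, override the default marker filter on the command line, e.g. ``pytest -m datasets_debug``.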
+""" + +_backend_params: Mapping[_Backend, ParameterSet] = { + "polars": pytest.param("polars"), + "pandas": pytest.param("pandas"), + "pandas[pyarrow]": pytest.param("pandas[pyarrow]", marks=skip_requires_pyarrow()), + "pyarrow": pytest.param("pyarrow", marks=skip_requires_pyarrow()), +} + +backends: pytest.MarkDecorator = pytest.mark.parametrize( + "backend", _backend_params.values() +) +backends_no_polars: pytest.MarkDecorator = pytest.mark.parametrize( + "backend", [v for k, v in _backend_params.items() if k != "polars"] +) +backends_pandas_any: pytest.MarkDecorator = pytest.mark.parametrize( + "backend", [v for k, v in _backend_params.items() if "pandas" in k] +) +backends_pyarrow: pytest.MarkDecorator = pytest.mark.parametrize( + "backend", [v for k, v in _backend_params.items() if k == "pyarrow"] +) + +datasets_all: pytest.MarkDecorator = pytest.mark.parametrize("name", get_args(Dataset)) +datasets_spatial: pytest.MarkDecorator = pytest.mark.parametrize( + "name", ["earthquakes", "londonBoroughs", "londonTubeLines", "us-10m", "world-110m"] +) + +CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" + + +@pytest.fixture(scope="session") +def polars_loader() -> PolarsLoader: + """Fastest and **most reliable** backend.""" + load = Loader.from_backend("polars") + if load.cache.is_not_active(): + load.cache.path = load.cache._XDG_CACHE + return load + + +@pytest.fixture +def metadata_columns() -> frozenset[str]: + return Metadata.__required_keys__.union(Metadata.__optional_keys__) + + +def is_frame_backend(frame: Any, backend: _Backend, /) -> bool: + pandas_any: set[_PandasAny] = {"pandas", "pandas[pyarrow]"} + if backend in pandas_any: + return nw_dep.is_pandas_dataframe(frame) + elif backend == "pyarrow": + return nw_dep.is_pyarrow_table(frame) + elif backend == "polars": + return nw_dep.is_polars_dataframe(frame) + else: + raise TypeError(backend) + + +def is_loader_backend(loader: Loader[Any, Any], backend: _Backend, /) -> bool: + return repr(loader) == f"{type(loader).__name__}[{backend}]" + + +def is_url(name: Dataset, fn_url: Callable[..., str], /) -> bool: + pattern = rf".+/vega-datasets@.+/data/{name}\..+" + url = fn_url(name) + return re.match(pattern, url) is not None + + +def is_polars_backed_pyarrow(loader: Loader[Any, Any], /) -> bool: + """ + User requested ``pyarrow``, but also has ``polars`` installed. + + Both support nested datatypes, which are required for spatial json. + """ + return ( + is_loader_backend(loader, "pyarrow") + and "earthquakes" in loader._reader.profile()["supported"] + ) + + +@backends +def test_metadata_columns(backend: _Backend, metadata_columns: frozenset[str]) -> None: + """Ensure all backends will query the same column names.""" + load = Loader.from_backend(backend) + schema_columns = load._reader._scan_metadata().collect().columns + assert set(schema_columns) == metadata_columns + + +@backends +def test_loader_from_backend(backend: _Backend) -> None: + load = Loader.from_backend(backend) + assert is_loader_backend(load, backend) + + +@backends +def test_loader_url(backend: _Backend) -> None: + load = Loader.from_backend(backend) + assert is_url("volcano", load.url) + + +@no_xdist +def test_load_infer_priority(monkeypatch: pytest.MonkeyPatch) -> None: + """ + Ensure the **most reliable**, available backend is selected. 
+ + See Also + -------- + ``altair.datasets._reader.infer_backend`` + """ + import altair.datasets._loader + from altair.datasets import load + + assert is_loader_backend(load, "polars") + monkeypatch.delattr(altair.datasets._loader, "load", raising=False) + monkeypatch.setitem(sys.modules, "polars", None) + + from altair.datasets import load + + if find_spec("pyarrow") is None: + # NOTE: We can end the test early for the CI job that removes `pyarrow` + assert is_loader_backend(load, "pandas") + monkeypatch.delattr(altair.datasets._loader, "load") + monkeypatch.setitem(sys.modules, "pandas", None) + with pytest.raises(AltairDatasetsError, match=r"no.+backend"): + from altair.datasets import load + else: + assert is_loader_backend(load, "pandas[pyarrow]") + monkeypatch.delattr(altair.datasets._loader, "load") + monkeypatch.setitem(sys.modules, "pyarrow", None) + + from altair.datasets import load + + assert is_loader_backend(load, "pandas") + monkeypatch.delattr(altair.datasets._loader, "load") + monkeypatch.setitem(sys.modules, "pandas", None) + monkeypatch.delitem(sys.modules, "pyarrow") + monkeypatch.setitem(sys.modules, "pyarrow", import_module("pyarrow")) + from altair.datasets import load + + assert is_loader_backend(load, "pyarrow") + monkeypatch.delattr(altair.datasets._loader, "load") + monkeypatch.setitem(sys.modules, "pyarrow", None) + + with pytest.raises(AltairDatasetsError, match=r"no.+backend"): + from altair.datasets import load + + +@backends +def test_load_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: + import altair.datasets._loader + + monkeypatch.delattr(altair.datasets._loader, "load", raising=False) + from altair.datasets import load + + assert is_loader_backend(load, "polars") + default = load("cars") + df = load("cars", backend=backend) + default_2 = load("cars") + assert nw_dep.is_polars_dataframe(default) + assert is_frame_backend(df, backend) + assert nw_dep.is_polars_dataframe(default_2) + + +@pytest.mark.parametrize( + "name", + [ + "jobs", + "la-riots", + "londonBoroughs", + "londonCentroids", + "londonTubeLines", + "lookup_groups", + "lookup_people", + "miserables", + "monarchs", + "movies", + "normal-2d", + "obesity", + "ohlc", + "penguins", + "platformer-terrain", + "political-contributions", + "population", + "population_engineers_hurricanes", + "unemployment", + "seattle-weather", + "seattle-weather-hourly-normals", + "gapminder-health-income", + "sp500", + "sp500-2000", + "stocks", + "udistrict", + ], +) +def test_url(name: Dataset) -> None: + from altair.datasets import url + + assert is_url(name, url) + + +def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: + from altair.datasets._cache import csv_cache + from altair.datasets._reader import infer_backend + + priority: Any = ("fake_mod_1", "fake_mod_2", "fake_mod_3", "fake_mod_4") + assert csv_cache._mapping == {} + with pytest.raises(AltairDatasetsError): + infer_backend(priority=priority) + + url = csv_cache.url + assert is_url("jobs", url) + assert csv_cache._mapping != {} + assert is_url("cars", url) + assert is_url("stocks", url) + assert is_url("countries", url) + assert is_url("crimea", url) + assert is_url("disasters", url) + assert is_url("driving", url) + assert is_url("earthquakes", url) + assert is_url("flare", url) + assert is_url("flights-10k", url) + assert is_url("flights-200k", url) + if find_spec("vegafusion"): + assert is_url("flights-3m", url) + + with monkeypatch.context() as mp: + mp.setitem(sys.modules, "vegafusion", None) + with 
pytest.raises(AltairDatasetsError, match=r".parquet.+require.+vegafusion"): + url("flights-3m") + with pytest.raises( + TypeError, match="'fake data' does not refer to a known dataset" + ): + url("fake data") + + +@backends +def test_loader_call(backend: _Backend) -> None: + load = Loader.from_backend(backend) + frame = load("stocks", ".csv") + assert nw_dep.is_into_dataframe(frame) + nw_frame = nw.from_native(frame) + assert set(nw_frame.columns) == {"symbol", "date", "price"} + + +@backends +def test_dataset_not_found(backend: _Backend) -> None: + """Various queries that should **always raise** due to non-existent dataset.""" + load = Loader.from_backend(backend) + real_name: Literal["disasters"] = "disasters" + invalid_name: Literal["fake name"] = "fake name" + invalid_suffix: Literal["fake suffix"] = "fake suffix" + incorrect_suffix: Literal[".json"] = ".json" + ERR_NO_RESULT = ValueError + MSG_NO_RESULT = "Found no results for" + NAME = "dataset_name" + SUFFIX = "suffix" + + with pytest.raises( + ERR_NO_RESULT, + match=re.compile(rf"{MSG_NO_RESULT}.+{NAME}.+{invalid_name}", re.DOTALL), + ): + load.url(invalid_name) + with pytest.raises( + TypeError, + match=re.compile( + rf"Expected '{SUFFIX}' to be one of.+\(.+\).+but got.+{invalid_suffix}", + re.DOTALL, + ), + ): + load.url(real_name, invalid_suffix) # type: ignore[arg-type] + with pytest.raises( + ERR_NO_RESULT, + match=re.compile( + rf"{MSG_NO_RESULT}.+{NAME}.+{real_name}.+{SUFFIX}.+{incorrect_suffix}", + re.DOTALL, + ), + ): + load.url(real_name, incorrect_suffix) + + +def test_reader_missing_dependencies() -> None: + from altair.datasets._reader import _import_guarded + + fake_name = "not_a_real_package" + real_name = "altair" + fake_extra = "AnotherFakePackage" + backend = f"{real_name}[{fake_extra}]" + with pytest.raises( + ModuleNotFoundError, + match=re.compile( + rf"{fake_name}.+requires.+{fake_name}.+but.+{fake_name}.+not.+found.+pip install {fake_name}", + flags=re.DOTALL, + ), + ): + _import_guarded(fake_name) # type: ignore + with pytest.raises( + ModuleNotFoundError, + match=re.compile( + rf"{re.escape(backend)}.+requires.+'{real_name}', '{fake_extra}'.+but.+{fake_extra}.+not.+found.+pip install {fake_extra}", + flags=re.DOTALL, + ), + ): + _import_guarded(backend) # type: ignore + + +def test_reader_missing_implementation() -> None: + from altair.datasets._constraints import is_csv + from altair.datasets._reader import reader + from altair.datasets._readimpl import read + + def func(*args, **kwds) -> pd.DataFrame: + if TYPE_CHECKING: + return pd.DataFrame() + + name = "pandas" + rd = reader((read(func, is_csv),), name=name) + with pytest.raises( + AltairDatasetsError, + match=re.compile(rf"Unable.+parquet.+native.+{name}", flags=re.DOTALL), + ): + rd.dataset("flights-3m") + with pytest.raises( + AltairDatasetsError, + match=re.compile(r"Found no.+support.+flights.+json", flags=re.DOTALL), + ): + rd.dataset("flights-2k") + with pytest.raises( + AltairDatasetsError, match=re.compile(r"Image data is non-tabular") + ): + rd.dataset("7zip") + + +@backends +def test_reader_cache( + backend: _Backend, monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + """Ensure cache hits avoid network activity.""" + import polars as pl + from polars.testing import assert_frame_equal + + monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) + load = Loader.from_backend(backend) + assert load.cache.is_active() + cache_dir = load.cache.path + assert cache_dir == tmp_path + assert tuple(load.cache) == () + + # smallest csvs + 
lookup_groups = load("lookup_groups") + load("lookup_people") + load("iowa-electricity") + load("global-temp") + cached_paths = tuple(load.cache) + assert len(cached_paths) == 4 + + if nw_dep.is_polars_dataframe(lookup_groups): + left, right = ( + lookup_groups, + cast("pl.DataFrame", load("lookup_groups", ".csv")), + ) + else: + left, right = ( + pl.DataFrame(lookup_groups), + pl.DataFrame(load("lookup_groups", ".csv")), + ) + + assert_frame_equal(left, right) + assert len(tuple(load.cache)) == 4 + assert cached_paths == tuple(load.cache) + load("iowa-electricity", ".csv") + load("global-temp", ".csv") + load("global-temp.csv") + assert len(tuple(load.cache)) == 4 + assert cached_paths == tuple(load.cache) + load("lookup_people") + load("lookup_people.csv") + load("lookup_people", ".csv") + load("lookup_people") + assert len(tuple(load.cache)) == 4 + assert cached_paths == tuple(load.cache) + + +@datasets_debug +@backends +def test_reader_cache_exhaustive( + backend: _Backend, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + polars_loader: PolarsLoader, +) -> None: + """ + Fully populate and then purge the cache for all backends. + + Notes + ----- + - Does not attempt to read the files + - Checking we can support pre-downloading and safely deleting + - Requests work the same for all backends + - The logic for detecting the cache contents uses ``narhwals`` + - Here, we're testing that these ``narwhals`` ops are consistent + - `DatasetCache.download_all` is expensive for CI, so aiming for it to run **at most once** + - 34-45s per call (4x backends) + """ + polars_loader.cache.download_all() + CLONED: Path = tmp_path / "clone" + fs.mkdir(CLONED) + fs.copytree(polars_loader.cache.path, CLONED) + + monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) + load = Loader.from_backend(backend) + assert load.cache.is_active() + cache_dir = load.cache.path + assert cache_dir == tmp_path + assert tuple(load.cache) == (CLONED,) + load.cache.path = CLONED + cached_paths = tuple(load.cache) + assert cached_paths != () + + # NOTE: Approximating all datasets downloaded + assert len(cached_paths) >= 70 + assert all(bool(fp.exists() and fp.stat().st_size) for fp in load.cache) + # NOTE: Confirm this is a no-op + load.cache.download_all() + assert len(cached_paths) == len(tuple(load.cache)) + + # NOTE: Ensure unrelated files in the directory are not removed + dummy: Path = tmp_path / "dummy.json" + dummy.touch(exist_ok=False) + load.cache.clear() + + remaining = tuple(tmp_path.iterdir()) + assert set(remaining) == {dummy, CLONED} + fs.rm(dummy, CLONED) + + +@no_xdist +def test_reader_cache_disable(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + from altair.datasets import load + + monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) + assert load.cache.is_active() + assert load.cache.path == tmp_path + assert load.cache.is_empty() + load("cars") + assert not load.cache.is_empty() + # ISSUE: https://github.com/python/mypy/issues/3004 + load.cache.path = None # type: ignore[assignment] + assert load.cache.is_not_active() + with pytest.raises( + ValueError, + match=re.compile( + rf"Cache.+unset.+{CACHE_ENV_VAR}.+\.cache\.path =", flags=re.DOTALL + ), + ): + tuple(load.cache) + load.cache.path = tmp_path + assert load.cache.is_active() + assert load.cache.path == tmp_path + assert not load.cache.is_empty() + + +@pytest.mark.parametrize( + "name", ["cars", "movies", "wheat", "barley", "gapminder", "income", "burtin"] +) +@pytest.mark.parametrize("fallback", ["polars", None]) +@backends_pyarrow +def 
test_pyarrow_read_json( + backend: _PyArrow, + fallback: _Polars | None, + name: Dataset, + monkeypatch: pytest.MonkeyPatch, +) -> None: + if fallback is None: + monkeypatch.setitem(sys.modules, "polars", None) + load = Loader.from_backend(backend) + assert load(name, ".json") + + +@datasets_spatial +@backends_no_polars +def test_spatial(backend: _Backend, name: Dataset) -> None: + load = Loader.from_backend(backend) + if is_polars_backed_pyarrow(load): + assert nw_dep.is_pyarrow_table(load(name)) + else: + pattern = re.compile( + rf"{name}.+geospatial.+native.+{re.escape(backend)}.+try.+polars.+url", + flags=re.DOTALL | re.IGNORECASE, + ) + with pytest.raises(AltairDatasetsError, match=pattern): + load(name) + + +@backends +def test_tsv(backend: _Backend) -> None: + load = Loader.from_backend(backend) + is_frame_backend(load("unemployment", ".tsv"), backend) + + +@datasets_all +@datasets_debug +def test_all_datasets(polars_loader: PolarsLoader, name: Dataset) -> None: + if name in {"7zip", "ffox", "gimp"}: + pattern = re.compile( + rf"Unable to load.+{name}.png.+as tabular data", + flags=re.DOTALL | re.IGNORECASE, + ) + with pytest.raises(AltairDatasetsError, match=pattern): + polars_loader(name) + else: + frame = polars_loader(name) + assert nw_dep.is_polars_dataframe(frame) + + +def _raise_exception(e: type[Exception], *args: Any, **kwds: Any): + raise e(*args, **kwds) + + +def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + from polars.testing import assert_frame_equal + + load = Loader.from_backend("polars") + load.cache.path = tmp_path + load("londonCentroids") + load("stocks") + load("driving") + cached_paths = tuple(tmp_path.iterdir()) + assert len(cached_paths) == 3 + raiser = partial(_raise_exception, URLError) + with monkeypatch.context() as mp: + mp.setattr(load._reader._opener, "open", raiser) + # Existing cache entries don't trigger an error + load("londonCentroids") + load("stocks") + load("driving") + # Mocking cache-miss without remote conn + with pytest.raises(URLError): + load("birdstrikes") + assert len(tuple(tmp_path.iterdir())) == 3 + + # Now we can get a cache-hit + frame = load("birdstrikes") + assert nw_dep.is_polars_dataframe(frame) + assert len(tuple(tmp_path.iterdir())) == 4 + + with monkeypatch.context() as mp: + mp.setattr(load._reader._opener, "open", raiser) + # Here, the remote conn isn't considered - we already have the file + frame_from_cache = load("birdstrikes") + assert len(tuple(tmp_path.iterdir())) == 4 + assert_frame_equal(frame, frame_from_cache) + + +@pytest.mark.parametrize( + ("name", "column"), + [ + ("cars", "Year"), + ("unemployment-across-industries", "date"), + ("flights-10k", "date"), + ("football", "date"), + ("crimea", "date"), + ("ohlc", "date"), + ], +) +def test_polars_date_read_json_roundtrip( + polars_loader: PolarsLoader, name: Dataset, column: str +) -> None: + """Ensure ``date`` columns are inferred using the roundtrip json -> csv method.""" + frame = polars_loader(name, ".json") + tp = frame.schema.to_python()[column] + assert tp is dt.date or issubclass(tp, dt.date) + + +@backends_pandas_any +@pytest.mark.parametrize( + ("name", "columns"), + [ + ("birdstrikes", "Flight Date"), + ("cars", "Year"), + ("co2-concentration", "Date"), + ("crimea", "date"), + ("football", "date"), + ("iowa-electricity", "year"), + ("la-riots", "death_date"), + ("ohlc", "date"), + ("seattle-weather-hourly-normals", "date"), + ("seattle-weather", "date"), + ("sp500-2000", "date"), + 
("unemployment-across-industries", "date"), + ("us-employment", "month"), + ], +) +def test_pandas_date_parse( + backend: _PandasAny, + name: Dataset, + columns: OneOrSeq[str], + polars_loader: PolarsLoader, +) -> None: + """ + Ensure schema defaults are correctly parsed. + + Notes + ----- + - Depends on ``frictionless`` being able to detect the date/datetime columns. + - Not all format strings work + """ + date_columns: list[str] = [columns] if isinstance(columns, str) else list(columns) + load = Loader.from_backend(backend) + url = load.url(name) + kwds: dict[str, Any] = ( + {"convert_dates": date_columns} + if url.endswith(".json") + else {"parse_dates": date_columns} + ) + kwds_empty: dict[str, Any] = {k: [] for k in kwds} + df_schema_derived: pd.DataFrame = load(name) + nw_schema = nw.from_native(df_schema_derived).schema + df_manually_specified: pd.DataFrame = load(name, **kwds) + df_dates_empty: pd.DataFrame = load(name, **kwds_empty) + + assert set(date_columns).issubset(nw_schema) + for column in date_columns: + assert nw_schema[column] in {nw.Date, nw.Datetime} + + assert nw_schema == nw.from_native(df_manually_specified).schema + assert nw_schema != nw.from_native(df_dates_empty).schema + + # NOTE: Checking `polars` infers the same[1] as what `pandas` needs a hint for + # [1] Doesn't need to be exact, just recognise as *some kind* of date/datetime + pl_schema: pl.Schema = polars_loader(name).schema + for column in date_columns: + assert pl_schema[column].is_temporal() diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py new file mode 100644 index 000000000..a7c1d06c4 --- /dev/null +++ b/tools/datasets/__init__.py @@ -0,0 +1,203 @@ +""" +Metadata generation from `vega/vega-datasets`_. + +Inspired by `altair-viz/vega_datasets`_. + +The core interface of this package is provided by:: + + tools.datasets.app + +.. _vega/vega-datasets: + https://github.com/vega/vega-datasets +.. _altair-viz/vega_datasets: + https://github.com/altair-viz/vega_datasets +""" + +from __future__ import annotations + +import gzip +import json +import types +from io import BytesIO +from pathlib import Path +from typing import TYPE_CHECKING, Any, ClassVar, Literal + +from tools import fs +from tools.codemod import ruff +from tools.datasets.npm import Npm +from tools.schemapi import utils + +if TYPE_CHECKING: + import sys + from collections.abc import Mapping + + import polars as pl + + from tools.datasets import datapackage + + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + + _PathAlias: TypeAlias = Literal["typing", "metadata-csv", "metadata", "schemas"] + PathMap: TypeAlias = Mapping[_PathAlias, Path] + +__all__ = ["app"] + +HEADER_COMMENT = """\ +# The contents of this file are automatically written by +# tools/datasets.__init__.py. Do not modify directly. 
+""" + + +class Application: + """Top-level context.""" + + OUT_DIR: ClassVar[Path] = fs.REPO_ROOT / "altair" / "datasets" + + def __init__(self) -> None: + METADATA = "metadata" + out_meta = self.OUT_DIR / "_metadata" + self.paths = types.MappingProxyType["_PathAlias", Path]( + { + "typing": self.OUT_DIR / "_typing.py", + "metadata-csv": out_meta / f"{METADATA}.csv.gz", + "metadata": out_meta / f"{METADATA}.parquet", + "schemas": out_meta / "schemas.json.gz", + } + ) + self._npm: Npm = Npm(self.paths) + + @property + def npm(self) -> Npm: + return self._npm + + def refresh(self, tag: Any, /, *, include_typing: bool = False) -> pl.DataFrame: + """ + Update and sync all dataset metadata files. + + Parameters + ---------- + tag + Branch or release version to build against. + include_typing + Regenerate ``altair.datasets._typing``. + """ + print("Syncing datasets ...") + dpkg = self.npm.datapackage(tag=tag) + self.write_parquet(dpkg.core, self.paths["metadata"]) + self.write_json_gzip(dpkg.schemas(), self.paths["schemas"]) + self.write_csv_gzip(dpkg.metadata_csv(), self.paths["metadata-csv"]) + print("Finished updating datasets.") + + if include_typing: + self.generate_typing(dpkg) + return dpkg.core.collect() + + def reset(self) -> None: + """Remove all metadata files.""" + fs.rm(*self.paths.values()) + + def read(self, name: _PathAlias, /) -> pl.DataFrame: + """Read existing metadata from file.""" + return self.scan(name).collect() + + def scan(self, name: _PathAlias, /) -> pl.LazyFrame: + """Scan existing metadata from file.""" + import polars as pl + + fp = self.paths[name] + if fp.suffix == ".parquet": + return pl.scan_parquet(fp) + elif ".csv" in fp.suffixes: + return pl.scan_csv(fp) + elif ".json" in fp.suffixes: + return pl.read_json(fp).lazy() + else: + msg = ( + f"Unable to read {fp.name!r} as tabular data.\nSuffixes: {fp.suffixes}" + ) + raise NotImplementedError(msg) + + def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: + """ + Write ``frame`` as a `gzip`_ compressed `csv`_ file. + + - *Much smaller* than a regular ``.csv``. + - Still readable using ``stdlib`` modules. + + .. _gzip: + https://docs.python.org/3/library/gzip.html + .. _csv: + https://docs.python.org/3/library/csv.html + """ + if fp.suffix != ".gz": + fp = fp.with_suffix(".csv.gz") + fp.touch() + df = frame.lazy().collect() + buf = BytesIO() + with gzip.GzipFile(fp, mode="wb", mtime=0) as f: + df.write_csv(buf) + f.write(buf.getbuffer()) + + def write_json_gzip(self, obj: Any, fp: Path, /) -> None: + """ + Write ``obj`` as a `gzip`_ compressed ``json`` file. + + .. 
_gzip: + https://docs.python.org/3/library/gzip.html + """ + if fp.suffix != ".gz": + fp = fp.with_suffix(".json.gz") + fp.touch() + with gzip.GzipFile(fp, mode="wb", mtime=0) as f: + f.write(json.dumps(obj).encode()) + + def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: + """Write ``frame`` to ``fp``, with some extra safety.""" + fp.touch() + df = frame.lazy().collect() + df.write_parquet(fp, compression="zstd", compression_level=17) + + def generate_typing(self, dpkg: datapackage.DataPackage) -> None: + indent = " " * 4 + NAME = "Dataset" + EXT = "Extension" + FIELD = "FlFieldStr" + FIELD_TYPES = ( + "integer", + "number", + "boolean", + "string", + "object", + "array", + "date", + "datetime", + "time", + "duration", + ) + + contents = ( + f"{HEADER_COMMENT}", + "from __future__ import annotations\n", + "import sys", + "from typing import Literal, TYPE_CHECKING", + utils.import_typing_extensions((3, 14), "TypedDict"), + utils.import_typing_extensions((3, 10), "TypeAlias"), + "\n", + f"__all__ = {[NAME, EXT, dpkg._NAME_TYPED_DICT]}\n", + utils.spell_literal_alias(NAME, dpkg.dataset_names()), + utils.spell_literal_alias(EXT, dpkg.extensions()), + dpkg.typed_dict(), + utils.spell_literal_alias(FIELD, FIELD_TYPES), + '"""\n' + "String representation of `frictionless`_ `Field Types`_.\n\n" + f".. _frictionless:\n{indent}https://github.com/frictionlessdata/frictionless-py\n" + f".. _Field Types:\n{indent}https://datapackage.org/standard/table-schema/#field-types\n" + '"""\n', + ) + ruff.write_lint_format(self.paths["typing"], contents) + + +app = Application() diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py new file mode 100644 index 000000000..ec707c0da --- /dev/null +++ b/tools/datasets/datapackage.py @@ -0,0 +1,279 @@ +""" +``frictionless`` `datapackage`_ parsing. + +.. _datapackage: + https://datapackage.org/ +""" + +from __future__ import annotations + +import textwrap +from collections import deque +from functools import cached_property +from pathlib import Path +from typing import TYPE_CHECKING, Any, ClassVar, Literal + +import polars as pl +from polars import col + +from tools.schemapi import utils + +if TYPE_CHECKING: + from collections.abc import Iterable, Iterator, Mapping, Sequence + + from altair.datasets._typing import Dataset, FlFieldStr + from tools.datasets.models import Package, Resource + + +__all__ = ["DataPackage"] + +INDENT = " " * 4 + + +class Column: + def __init__(self, name: str, expr: pl.Expr, /, doc: str = "_description_") -> None: + self._name: str = name + self._expr: pl.Expr = expr + self._doc: str = doc + + @property + def expr(self) -> pl.Expr: + return self._expr.alias(self._name) + + @property + def doc(self) -> str: + return f"{self._name}\n{INDENT * 2}{self._doc}" + + def is_feature(self) -> bool: + return self._name.startswith("is_") + + +class DataPackage: + NAME: ClassVar[Literal["dataset_name"]] = "dataset_name" + """ + Main user-facing column name. 
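Stepping back to ``Application.write_csv_gzip`` above: its claim that the gzipped CSV stays readable with only the standard library can be checked with a short sketch. The path is the one wired up as ``"metadata-csv"`` in ``Application.__init__``; this assumes the metadata files have already been generated:

```
import csv
import gzip
from pathlib import Path

# Location configured as "metadata-csv" in Application.__init__.
fp = Path("altair") / "datasets" / "_metadata" / "metadata.csv.gz"

with gzip.open(fp, mode="rt", encoding="utf-8", newline="") as f:
    rows = list(csv.DictReader(f))

# Each row carries the same keys as the ``Metadata`` TypedDict (all values as strings here).
print(rows[0]["dataset_name"], rows[0]["url"])
```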
+ + - Does not include file extension + - Preserves case of original file name + """ + + sort_by: str | Sequence[str] = "dataset_name", "bytes" + """Key(s) used to ensure output is deterministic.""" + + _NAME_TYPED_DICT: ClassVar[Literal["Metadata"]] = "Metadata" + _columns: ClassVar[Sequence[Column]] + _links: ClassVar[Sequence[str]] + + def __init__(self, pkg: Package, base_url: str, path: Path, /) -> None: + self._pkg: Package = pkg + self._base_url: str = base_url + self._path: Path = path + + @classmethod + def with_columns(cls, *columns: Column) -> type[DataPackage]: + cls._columns = columns + return cls + + @classmethod + def with_links(cls, *links: str) -> type[DataPackage]: + cls._links = links + return cls + + @property + def columns(self) -> Iterator[Column]: + yield from self._columns + yield self._url + + @cached_property + def core(self) -> pl.LazyFrame: + """A minimal, tabular view of ``datapackage.json``.""" + return pl.LazyFrame(self._resources).select(self._exprs).sort(self.sort_by) + + def schemas(self) -> Mapping[Dataset, Mapping[str, FlFieldStr]]: + """Reduce all datasets with schemas to a minimal mapping.""" + m: Any = { + Path(rsrc["path"]).stem: {f["name"]: f["type"] for f in s["fields"]} + for rsrc in self._resources + if (s := rsrc.get("schema")) + } + return m + + def dataset_names(self) -> Iterable[str]: + return self.core.select(col(self.NAME).unique().sort()).collect().to_series() + + def extensions(self) -> tuple[str, ...]: + return tuple( + self.core.select(col("suffix").unique().sort()) + .collect() + .to_series() + .to_list() + ) + + # TODO: Collect, then raise if cannot guarantee uniqueness + def metadata_csv(self) -> pl.LazyFrame: + """Variant with duplicate dataset names removed.""" + return self.core.filter(col("suffix") != ".arrow").sort(self.NAME) + + def typed_dict(self) -> str: + from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT + + return UNIVERSAL_TYPED_DICT.format( + name=self._NAME_TYPED_DICT, + metaclass_kwds=", total=False", + td_args=self._metadata_td_args, + summary=f"Full schema for ``{self._path.name}``.", + doc=self._metadata_doc, + comment="", + ) + + @property + def _exprs(self) -> Iterator[pl.Expr]: + return (column.expr for column in self.columns) + + @property + def _docs(self) -> Iterator[str]: + return (column.doc for column in self.columns) + + @property + def _resources(self) -> Sequence[Resource]: + return self._pkg["resources"] + + @property + def _metadata_doc(self) -> str: + NLINDENT = f"\n{INDENT}" + return ( + f"{NLINDENT.join(self._docs)}\n\n{''.join(self._links)}\n" + f"{textwrap.indent(self._metadata_examples, INDENT)}" + f"{INDENT}" + ) + + @property + def _metadata_examples(self) -> str: + with pl.Config(fmt_str_lengths=25, tbl_cols=5, tbl_width_chars=80): + table = repr(self.core.collect()) + return ( + f"\nExamples" + f"\n--------\n" + f"``{self._NAME_TYPED_DICT}`` keywords form constraints to filter a table like the below sample:\n\n" + f"```\n{table}\n```\n" + ) + + @property + def _metadata_td_args(self) -> str: + schema = self.core.collect_schema().to_python() + return f"\n{INDENT}".join(f"{p}: {tp.__name__}" for p, tp in schema.items()) + + @property + def _url(self) -> Column: + expr = pl.concat_str(pl.lit(self._base_url), "path") + return Column("url", expr, "Remote url used to access dataset.") + + def features_typing(self, frame: pl.LazyFrame | pl.DataFrame, /) -> Iterator[str]: + """ + Current plan is to use type aliases in overloads. 
+ + - ``Tabular`` can be treated interchangeably + - ``Image`` can only work with ``url`` + - ``(Spatial|Geo|Topo)`` can be read with ``polars`` + - A future version may implement dedicated support https://github.com/vega/altair/pull/3631#discussion_r1845931955 + - ``Json`` should warn when using the ``pyarrow`` backend + """ + guards = deque[str]() + ldf = frame.lazy() + for column in self.columns: + if not column.is_feature(): + continue + guard_name = column._name + alias_name = guard_name.removeprefix("is_").capitalize() + members = ldf.filter(guard_name).select(self.NAME).collect().to_series() + guards.append(guard_literal(alias_name, guard_name, members)) + yield utils.spell_literal_alias(alias_name, members) + yield from guards + + +def path_stem(column: str | pl.Expr, /) -> pl.Expr: + """ + The final path component, minus its last suffix. + + Needed since `Resource.name`_ must be lowercase. + + .. _Resource.name: + https://specs.frictionlessdata.io/data-resource/#name + """ + path = col(column) if isinstance(column, str) else column + rfind = (path.str.len_bytes() - 1) - path.str.reverse().str.find(r"\.") + return path.str.head(rfind) + + +def path_suffix(column: str | pl.Expr, /) -> pl.Expr: + """ + The final component's last suffix. + + This includes the leading period. For example: '.txt'. + """ + path = col(column) if isinstance(column, str) else column + return path.str.tail(path.str.reverse().str.find(r"\.") + 1) + + +def guard_literal(alias_name: str, guard_name: str, members: Iterable[str], /) -> str: + """Type narrowing function, all members must be literal strings.""" + return ( + f"def {guard_name}(obj: Any) -> TypeIs[{alias_name}]:\n" + f" return obj in set({sorted(set(members))!r})\n" + ) + + +PATHLIB = "https://docs.python.org/3/library/pathlib.html" +GEOJSON = "https://en.wikipedia.org/wiki/GeoJSON" + + +def link(name: str, url: str, /) -> str: + return f"{INDENT}.. _{name}:\n{INDENT * 2}{url}\n" + + +def note(s: str, /) -> str: + return f"\n\n{INDENT * 2}.. note::\n{INDENT * 3}{s}" + + +fmt = col("format") +DataPackage.with_columns( + Column("dataset_name", path_stem("path"), "Name of the dataset/`Path.stem`_."), + Column("suffix", path_suffix("path"), "File extension/`Path.suffix`_."), + Column("file_name", col("path"), "Equivalent to `Path.name`_."), + Column("bytes", col("bytes"), "File size in *bytes*."), + Column("is_image", fmt == "png", "Only accessible via url."), + Column("is_tabular", col("type") == "table", "Can be read as tabular data."), + Column("is_geo", fmt == "geojson", "`GeoJSON`_ format."), + Column("is_topo", fmt == "topojson", "`TopoJSON`_ format."), + Column( + "is_spatial", + fmt.is_in(("geojson", "topojson")), + "Any geospatial format. Only natively supported by ``polars``.", + ), + Column( + "is_json", fmt.str.contains("json"), "Not supported natively by ``pyarrow``." + ), + Column( + "has_schema", + col("schema").is_not_null(), + "Data types available for improved ``pandas`` parsing.", + ), + Column( + "sha", + col("hash").str.split(":").list.last(), + doc=( + "Unique hash for the dataset." + + note( + f"E.g. if the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{INDENT * 3}" + f"then this value would remain stable." 
+ ) + ), + ), +) +DataPackage.with_links( + link("Path.stem", f"{PATHLIB}#pathlib.PurePath.stem"), + link("Path.name", f"{PATHLIB}#pathlib.PurePath.name"), + link("Path.suffix", f"{PATHLIB}#pathlib.PurePath.suffix"), + link("GeoJSON", GEOJSON), + link("TopoJSON", f"{GEOJSON}#TopoJSON"), +) diff --git a/tools/datasets/models.py b/tools/datasets/models.py new file mode 100644 index 000000000..ee1af8953 --- /dev/null +++ b/tools/datasets/models.py @@ -0,0 +1,118 @@ +"""API-related data structures.""" + +from __future__ import annotations + +import sys +from collections.abc import Mapping, Sequence +from typing import TYPE_CHECKING, Literal + +if sys.version_info >= (3, 14): + from typing import TypedDict +else: + from typing_extensions import TypedDict + +if TYPE_CHECKING: + if sys.version_info >= (3, 11): + from typing import NotRequired, Required + else: + from typing_extensions import NotRequired, Required + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + + from altair.datasets._typing import Dataset, FlFieldStr + + +CsvDialect: TypeAlias = Mapping[ + Literal["csv"], Mapping[Literal["delimiter"], Literal["\t"]] +] +JsonDialect: TypeAlias = Mapping[ + Literal[r"json"], Mapping[Literal["keyed"], Literal[True]] +] + + +class Field(TypedDict): + """https://datapackage.org/standard/table-schema/#field.""" + + name: str + type: FlFieldStr + description: NotRequired[str] + + +class Schema(TypedDict): + """https://datapackage.org/standard/table-schema/#properties.""" + + fields: Sequence[Field] + + +class Source(TypedDict, total=False): + title: str + path: Required[str] + email: str + version: str + + +class License(TypedDict): + name: str + path: str + title: NotRequired[str] + + +class Resource(TypedDict): + """https://datapackage.org/standard/data-resource/#properties.""" + + name: Dataset + type: Literal["table", "file", r"json"] + description: NotRequired[str] + licenses: NotRequired[Sequence[License]] + sources: NotRequired[Sequence[Source]] + path: str + scheme: Literal["file"] + format: Literal[ + "arrow", "csv", "geojson", r"json", "parquet", "png", "topojson", "tsv" + ] + mediatype: Literal[ + "application/parquet", + "application/vnd.apache.arrow.file", + "image/png", + "text/csv", + "text/tsv", + r"text/json", + "text/geojson", + "text/topojson", + ] + encoding: NotRequired[Literal["utf-8"]] + hash: str + bytes: int + dialect: NotRequired[CsvDialect | JsonDialect] + schema: NotRequired[Schema] + + +class Contributor(TypedDict, total=False): + title: str + givenName: str + familyName: str + path: str + email: str + roles: Sequence[str] + organization: str + + +class Package(TypedDict): + """ + A subset of the `Data Package`_ standard. + + .. 
_Data Package: + https://datapackage.org/standard/data-package/#properties + """ + + name: Literal["vega-datasets"] + version: str + homepage: str + description: str + licenses: Sequence[License] + contributors: Sequence[Contributor] + sources: Sequence[Source] + created: str + resources: Sequence[Resource] diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py new file mode 100644 index 000000000..a10e13a64 --- /dev/null +++ b/tools/datasets/npm.py @@ -0,0 +1,113 @@ +from __future__ import annotations + +import json +import string +import urllib.request +from pathlib import Path +from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple +from urllib.request import Request + +from tools.datasets import datapackage + +if TYPE_CHECKING: + import sys + from urllib.request import OpenerDirector + + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + from tools.datasets import PathMap + from tools.datasets.datapackage import DataPackage + + BranchOrTag: TypeAlias = 'Literal["main"] | LiteralString' + + +__all__ = ["Npm"] + + +class NpmUrl(NamedTuple): + CDN: LiteralString + GH: LiteralString + + +class Npm: + """https://www.jsdelivr.com/docs/data.jsdelivr.com#overview.""" + + _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() + + def __init__( + self, + paths: PathMap, + *, + jsdelivr: Literal["jsdelivr"] = "jsdelivr", + npm: Literal["npm"] = "npm", + package: LiteralString = "vega-datasets", + ) -> None: + self.paths: PathMap = paths + self._url: NpmUrl = NpmUrl( + CDN=f"https://cdn.{jsdelivr}.net/{npm}/{package}@", + GH=f"https://cdn.{jsdelivr}.net/gh/vega/{package}@", + ) + + def _prefix(self, version: BranchOrTag, /) -> LiteralString: + return f"{self.url.GH if is_branch(version) else self.url.CDN}{version}/" + + def dataset_base_url(self, version: BranchOrTag, /) -> LiteralString: + """Common url prefix for all datasets derived from ``version``.""" + return f"{self._prefix(version)}data/" + + @property + def url(self) -> NpmUrl: + return self._url + + def file( + self, + branch_or_tag: BranchOrTag, + path: str, + /, + ) -> Any: + """ + Request a file from `jsdelivr` `npm`_ or `GitHub`_ endpoints. + + Parameters + ---------- + branch_or_tag + Version of the file, see `branches`_ and `tags`_. + path + Relative filepath from the root of the repo. + + .. _npm: + https://www.jsdelivr.com/documentation#id-npm + .. _GitHub: + https://www.jsdelivr.com/documentation#id-github + .. _branches: + https://github.com/vega/vega-datasets/branches + .. 
_tags: + https://github.com/vega/vega-datasets/tags + """ + path = path.lstrip("./") + suffix = Path(path).suffix + if suffix == ".json": + headers = {"Accept": "application/json"} + read_fn = json.load + else: + raise NotImplementedError(path, suffix) + req = Request(f"{self._prefix(branch_or_tag)}{path}", headers=headers) + with self._opener.open(req) as response: + return read_fn(response) + + def datapackage(self, *, tag: LiteralString) -> DataPackage: + return datapackage.DataPackage( + self.file(tag, "datapackage.json"), + self.dataset_base_url(tag), + self.paths["metadata"], + ) + + +def is_branch(s: BranchOrTag, /) -> bool: + return s == "main" or not (s.startswith(tuple("v" + string.digits))) diff --git a/tools/generate_api_docs.py b/tools/generate_api_docs.py index 55c68729e..babd3d3eb 100644 --- a/tools/generate_api_docs.py +++ b/tools/generate_api_docs.py @@ -110,8 +110,22 @@ {typing_objects} +.. _api-datasets: + +Datasets +-------- +.. currentmodule:: altair.datasets + +.. autosummary:: + :toctree: generated/datasets/ + :nosignatures: + + {datasets_objects} + .. _Generic: https://typing.readthedocs.io/en/latest/spec/generics.html#generics +.. _vega-datasets: + https://github.com/vega/vega-datasets """ @@ -171,6 +185,10 @@ def theme() -> list[str]: return sort_3 +def datasets() -> list[str]: + return alt.datasets.__all__ + + def lowlevel_wrappers() -> list[str]: objects = sorted(iter_objects(alt.schema.core, restrict_to_subclass=alt.SchemaBase)) # The names of these two classes are also used for classes in alt.channels. Due to @@ -194,6 +212,7 @@ def write_api_file() -> None: api_classes=sep.join(api_classes()), typing_objects=sep.join(type_hints()), theme_objects=sep.join(theme()), + datasets_objects=sep.join(datasets()), ), encoding="utf-8", ) diff --git a/tools/generate_schema_wrapper.py b/tools/generate_schema_wrapper.py index 0bb36d628..92c6f101d 100644 --- a/tools/generate_schema_wrapper.py +++ b/tools/generate_schema_wrapper.py @@ -1392,6 +1392,8 @@ def generate_encoding_artifacts( def main() -> None: + from tools import datasets + parser = argparse.ArgumentParser( prog="generate_schema_wrapper.py", description="Generate the Altair package." ) @@ -1403,6 +1405,7 @@ def main() -> None: copy_schemapi_util() vegalite_main(args.skip_download) write_expr_module(VERSIONS.vlc_vega, output=EXPR_FILE, header=HEADER_COMMENT) + datasets.app.refresh(VERSIONS["vega-datasets"], include_typing=True) # The modules below are imported after the generation of the new schema files # as these modules import Altair. This allows them to use the new changes diff --git a/tools/schemapi/utils.py b/tools/schemapi/utils.py index eb44f9c01..ecb58ee5c 100644 --- a/tools/schemapi/utils.py +++ b/tools/schemapi/utils.py @@ -1227,6 +1227,26 @@ def spell_literal(it: Iterable[str], /, *, quote: bool = True) -> str: return f"Literal[{', '.join(it_el)}]" +def spell_literal_alias( + alias_name: str, members: Iterable[str], /, *, quote: bool = True +) -> str: + """ + Wraps ``utils.spell_literal`` as a ``TypeAlias``. 
+ + Examples + -------- + >>> spell_literal_alias("Animals", ("Dog", "Cat", "Fish")) + "Animals: TypeAlias = Literal['Dog', 'Cat', 'Fish']" + + >>> spell_literal_alias("Digits", "0123456789") + "Digits: TypeAlias = Literal['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']" + + >>> spell_literal_alias("LessThanFive", (repr(i) for i in range(5))) + "LessThanFive: TypeAlias = Literal['0', '1', '2', '3', '4']" + """ + return f"{alias_name}: TypeAlias = {spell_literal(members, quote=quote)}" + + def maybe_rewrap_literal(it: Iterable[str], /) -> Iterator[str]: """ Where `it` may contain one or more `"enum"`, `"const"`, flatten to a single `Literal[...]`.