From 7eecafe041ce46c95d7024df22eb7731688bc59b Mon Sep 17 00:00:00 2001 From: Favyen Bastani Date: Mon, 25 Aug 2025 14:59:00 -0700 Subject: [PATCH 1/2] Add EuroCrops data source and refresh the data source documentation. --- docs/DatasetConfig.md | 362 ++++++++++++++++++++++++-- rslearn/data_sources/copernicus.py | 10 +- rslearn/data_sources/eurocrops.py | 247 ++++++++++++++++++ rslearn/data_sources/openstreetmap.py | 6 +- 4 files changed, 600 insertions(+), 25 deletions(-) create mode 100644 rslearn/data_sources/eurocrops.py diff --git a/docs/DatasetConfig.md b/docs/DatasetConfig.md index f39883e6..24be41e0 100644 --- a/docs/DatasetConfig.md +++ b/docs/DatasetConfig.md @@ -301,7 +301,7 @@ The data source specification looks like this: // The query configuration specifies how items should be matched to windows. It is // optional, and the values below are defaults. "query_config": { - // The space mode must be "MOSAIC" (default), "CONTAINS", or "INTERSECTS". + // The space mode must be "MOSAIC" (default), "CONTAINS", "INTERSECTS", or "COMPOSITE". "space_mode": "MOSAIC", // The time mode must be "WITHIN" (default), "BEFORE", or "AFTER". "time_mode": "WITHIN", @@ -457,6 +457,10 @@ S3 bucket maintained by USGS. It includes Tier 1/2 scenes but not Real-Time scen https://aws.amazon.com/marketplace/pp/prodview-ivr4jeq6flk7u for details about the bucket. +This data source supports direct materialization: if the "ingest" flag is set false, +then ingestion will be skipped and windows will be directly populated from windowed +reads of the underlying cloud-optimized GeoTIFFs on S3. + The additional data source configuration looks like this: ```jsonc @@ -564,6 +568,31 @@ Available bands: - G (from TCI asset; derived from B03) - B (from TCI asset; derived from B02) +### rslearn.data_sources.aws_sentinel1.Sentinel1 + +This data source is for Sentinel-1 GRD imagery on AWS. It uses the sentinel-s1-l1c S3 +bucket maintained by Sinergise. See +https://aws.amazon.com/marketplace/pp/prodview-uxrsbvhd35ifw for details about the +bucket. + +Although other Sentinel-1 scenes are available, the data source currently only supports +the GRD IW DV scenes (vv+vh bands). It uses the Copernicus API for metadata search +(prepare step). + +The additional data source configuration looks like this: + +```jsonc +{ + // Optional orbit direction to filter by, either "ASCENDING" or "DESCENDING". The + // default is to not filter (so both types of scenes are included/mixed). + "orbit_direction": null +} +``` + +Available bands: +- vv +- vh + ### rslearn.data_sources.climate_data_store.ERA5LandMonthlyMeans This data source is for ingesting ERA5 land monthly averaged data from the Copernicus Climate Data Store. @@ -586,6 +615,159 @@ The additional data source configuration looks like this: } ``` +### rslearn.data_sources.copernicus.Copernicus + +This data source is for images from the ESA Copernicus OData API. See +https://documentation.dataspace.copernicus.eu/APIs/OData.html for details about the API +and how to get an access token. + +The additional data source configuration looks like this: + +```jsonc +{ + // Required dictionary mapping from a filename or glob string of an asset inside the + // product zip file, to the list of bands that the asset contains. An example for + // Sentinel-2 images is shown. + "glob_to_bands": { + "*/GRANULE/*/IMG_DATA/*_B01.jp2": ["B01"], + "*/GRANULE/*/IMG_DATA/*_TCI.jp2": ["R", "G", "B"] + }, + // Optional API access token. 
See https://documentation.dataspace.copernicus.eu/APIs/OData.html
+  // for how to get a token. If not set, it is read from the environment variable
+  // COPERNICUS_ACCESS_TOKEN. If that environment variable doesn't exist, then we
+  // attempt to read the username/password from COPERNICUS_USERNAME and
+  // COPERNICUS_PASSWORD (this is useful since access tokens are only valid for an hour).
+  "access_token": null,
+  // Optional query filter string to include when searching for items. This will be
+  // appended to other name, geographic, and sensing time filters where applicable. For
+  // example, "Collection/Name eq 'SENTINEL-2'". See the API documentation for more
+  // examples.
+  "query_filter": null,
+  // Optional order by string to include when searching for items. For example,
+  // "ContentDate/Start asc". See the API documentation for more examples.
+  "order_by": null,
+  // Optional product attribute to sort the returned products by. If set, attributes
+  // will be expanded when listing products. Note that order_by sorts via the API,
+  // which offers limited options, while sort_by is applied locally after the API
+  // call.
+  "sort_by": null,
+  // If sort_by is set, sort in descending order instead of ascending order.
+  "sort_desc": false,
+  // Timeout for requests in seconds.
+  "timeout": 10
+}
+```
+
+### rslearn.data_sources.copernicus.Sentinel2
+
+This data source is for Sentinel-2 images from the ESA Copernicus OData API.
+
+The additional data source configuration looks like this:
+
+```jsonc
+{
+  // Required product type, either "L1C" or "L2A".
+  "product_type": "L1C",
+  // Flag (default false) to harmonize pixel values across different processing
+  // baselines (recommended). See
+  // https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S2_SR_HARMONIZED
+  "harmonize": false,
+  // See rslearn.data_sources.copernicus.Copernicus for details about the configuration
+  // options below.
+  "access_token": null,
+  "order_by": null,
+  "sort_by": null,
+  "sort_desc": false,
+  "timeout": 10
+}
+```
+
+Available bands:
+- B01
+- B02
+- B03
+- B04
+- B05
+- B06
+- B07
+- B08
+- B09
+- B11
+- B12
+- B8A
+- TCI
+- B10 (L1C only)
+- AOT (L2A only)
+- WVP (L2A only)
+- SCL (L2A only)
+
+### rslearn.data_sources.copernicus.Sentinel1
+
+This data source is for Sentinel-1 images from the ESA Copernicus OData API. Currently
+only IW GRDH VV+VH products are supported, even though all Sentinel-1 scenes are
+available in the data source.
+
+The additional data source configuration looks like this:
+
+```jsonc
+{
+  // Required product type, must be "IW_GRDH".
+  "product_type": "IW_GRDH",
+  // Required polarisation, must be "VV_VH".
+  "polarisation": "VV_VH",
+  // Optional orbit direction to filter by, either "ASCENDING" or "DESCENDING". The
+  // default is to not filter (so both types of scenes are included/mixed).
+  "orbit_direction": null,
+  // See rslearn.data_sources.copernicus.Copernicus for details about the configuration
+  // options below.
+  "access_token": null,
+  "order_by": null,
+  "sort_by": null,
+  "sort_desc": false,
+  "timeout": 10
+}
+```
+
+### rslearn.data_sources.earthdata_srtm.SRTM
+
+Elevation data from the Shuttle Radar Topography Mission via NASA Earthdata.
+
+A NASA Earthdata account is needed, see https://urs.earthdata.nasa.gov/.
+
+```jsonc
+{
+  // Earthdata account username. It can also be set via the NASA_EARTHDATA_USERNAME
+  // environment variable.
+  "username": null,
+  // Earthdata account password. 
It can also be set via the NASA_EARTHDATA_PASSWORD
+  // environment variable.
+  "password": null,
+  // Timeout for requests.
+  "timeout_seconds": 10,
+}
+```
+
+The data source should be configured with a single band set containing a single band.
+The band name can be set arbitrarily, but "srtm" or "elevation" is suggested. The data
+type of the band should be set to int16 to match the source data.
+
+### rslearn.data_sources.eurocrops.EuroCrops
+
+This data source is for EuroCrops vector data (v11).
+
+See https://zenodo.org/records/14094196 for details.
+
+While the source data is split into country-level files, this data source uses one item
+per year for simplicity. So each item corresponds to all of the country-level files for
+that year.
+
+Note that the RO_ny.zip file is not used.
+
+There is no data-source-specific configuration.
+
+The vector features should have `EC_hcat_c` and `EC_hcat_n` properties indicating the
+HCAT category code and name respectively.
+
 ### rslearn.data_sources.gcp_public_data.Sentinel2
 
 This data source is for Sentinel-2 data on Google Cloud Storage.
@@ -645,7 +827,112 @@ Available bands:
 
 ### rslearn.data_sources.google_earth_engine.GEE
 
-This data source is still experimental.
+This data source is for ingesting images from Google Earth Engine (GEE).
+
+It must be configured with the name of an ee.ImageCollection on GEE. Each ee.Image in
+the ee.ImageCollection is treated as a different data source item. A Cloud Storage
+bucket is also required to store the intermediate outputs from GEE export jobs.
+
+During the prepare stage, it will first export the metadata (geometry and time range)
+of all ee.Image objects in the ee.ImageCollection. Then it will use this to build an
+rtree from which prepare requests can be satisfied.
+
+During the ingest stage, it will start export jobs to export images to the bucket. Each
+worker will start one job and poll until it finishes before proceeding to the next
+ee.Image to export. After the export finishes, the resulting GeoTIFF(s) are read and
+processed into the tile store. Note that export jobs can take several minutes to
+complete depending on the size of the image.
+
+This data source supports direct materialization, which can greatly speed up
+materialization for sparse windows. Whereas exporting a 10Kx10K image may take 5000
+EECU-seconds (and potentially several minutes), exporting a 256x256 image should take
+only a few seconds.
+
+```jsonc
+{
+  // Required name of the ee.ImageCollection, e.g. "COPERNICUS/S1_GRD".
+  "collection_name": "COPERNICUS/S1_GRD",
+  // Required name of the GCS bucket used to store intermediate outputs from export
+  // jobs. You could set up lifecycle rules on this bucket to delete outputs after 1
+  // day.
+  "gcs_bucket_name": "...",
+  // Required service account name.
+  "service_account_name": "...",
+  // Required path to a local file containing the service account credentials.
+  "service_account_credentials": "/etc/credentials/gee_credentials.json",
+  // Required directory to store the rtree index over the exported ee.Image metadata.
+  "index_cache_dir": "cache/gee",
+  // Optional filters to apply on the ee.ImageCollection. See Sentinel-1 example below.
+  // Currently only equality filters are supported.
+  "filters": null
+}
+```
+
+The available bands depend on the chosen ee.ImageCollection. Here is an example layer
+configuration for Sentinel-1. 
The filters match only ee.Image objects where the
+"transmitterReceiverPolarisation" attribute is ["VV", "VH"] and the "instrumentMode"
+attribute is "IW".
+
+```json
+{
+    "sentinel1": {
+        "band_sets": [
+            {
+                "bands": [
+                    "VV",
+                    "VH"
+                ],
+                "dtype": "uint16",
+                "format": "geotiff"
+            }
+        ],
+        "data_source": {
+            "collection_name": "COPERNICUS/S1_GRD",
+            "dtype": "float32",
+            "filters": [
+                [
+                    "transmitterReceiverPolarisation",
+                    [
+                        "VV",
+                        "VH"
+                    ]
+                ],
+                [
+                    "instrumentMode",
+                    "IW"
+                ]
+            ],
+            "gcs_bucket_name": "YOUR_BUCKET_NAME",
+            "index_cache_dir": "cache/sentinel1_index",
+            "name": "rslearn.data_sources.google_earth_engine.GEE",
+            "query_config": {
+                "max_matches": 1
+            },
+            "service_account_credentials": "/etc/credentials/gee_credentials.json",
+            "service_account_name": "YOUR_SERVICE_ACCOUNT_NAME"
+        },
+        "type": "raster"
+    }
+}
+```
+
+### rslearn.data_sources.google_earth_engine.GoogleSatelliteEmbeddings
+
+This data source is for Google Satellite Embeddings (AlphaEarth Embeddings) from Google
+Earth Engine. The embedding values are stored as unsigned 16-bit integers from 0 to
+16383, computed by multiplying the original [-1, 1] floating point values by 8192 and
+adding 8192.
+
+```jsonc
+{
+  // See rslearn.data_sources.google_earth_engine.GEE for details about these
+  // required configuration options.
+  "gcs_bucket_name": "...",
+  "service_account_name": "...",
+  "service_account_credentials": "/etc/credentials/gee_credentials.json",
+  "index_cache_dir": "cache/gee"
+}
+```
 
 ### rslearn.data_sources.local_files.LocalFiles
 
@@ -744,6 +1031,10 @@ This data source is still experimental.
 This data source is for raster data from Microsoft Planetary Computer. See their
 [Data Catalog](https://planetarycomputer.microsoft.com/catalog).
 
+This data source supports direct materialization: if the "ingest" flag is set false,
+then ingestion will be skipped and windows will be directly populated from windowed
+reads of the underlying cloud-optimized GeoTIFFs on Azure Blob Storage.
+
 ```jsonc
 {
   // Required collection name, e.g. "landsat-c2-l2" or "modis-17A2HGF-061".
@@ -768,6 +1059,7 @@ This data source is for raster data from Microsoft Planetary Computer. See their
 
 ### rslearn.data_sources.planetary_computer.Sentinel1
 
 Sentinel-1 radiometrically-terrain-corrected data on Microsoft Planetary Computer.
+Direct materialization is supported.
 
 It automatically determines the bands to download from the band sets, so all parameters
 are optional. The band names are "hh", "hv", "vv", and "vh" depending on the scene.
 
@@ -784,7 +1076,8 @@ are optional. The band names are "hh", "hv", "vv", and "vh" depending on the sce
 
 ### rslearn.data_sources.planetary_computer.Sentinel2
 
-Sentinel-2 L2A data on Microsoft Planetary Computer.
+Sentinel-2 L2A data on Microsoft Planetary Computer. Direct materialization is
+supported.
 
 The bands to download are determined from the band sets.
 
@@ -819,28 +1112,26 @@ Available bands:
 
 Note that B10 is not present in L2A.
 
-### rslearn.data_sources.earthdata_srtm.SRTM
+### rslearn.data_sources.usda_cdl.CDL
 
-Elevation data from the Shuttle Radar Topography Mission via NASA Earthdata.
+This data source is for the USDA Cropland Data Layer.
 
-A NASA Earthdata account is needed, see https://urs.earthdata.nasa.gov/.
+The GeoTIFF data will be downloaded from the USDA website. See
+https://www.nass.usda.gov/Research_and_Science/Cropland/SARS1a.php for details about
+the data.
+
+There is one GeoTIFF item per year from 2008. Each GeoTIFF spans the entire continental
+US, and has a single band. 
 ```jsonc
 {
-  // Earthdata account username. It can also be set via the NASA_EARTHDATA_USERNAME
-  // environment variable.
-  "username": null,
-  // Earthdata account password. It can also be set via the NASA_EARTHDATA_PASSWORD
-  // environment variable.
-  "password": null,
-  // Timeout for requests.
-  "timeout_seconds": 10,
+  // Optional timeout for HTTP requests.
+  "timeout_seconds": 10
 }
 ```
 
-The data source should be configured with a single band set containing a single band.
-The band name can be set arbitrarily, but "srtm" or "elevation" is suggested. The data
-type of the band should be set to int16 to match the source data.
+The data source yields one band, and the name will match whatever is configured in the
+band set. It should be uint8.
 
 ### rslearn.data_sources.usgs_landsat.LandsatOliTirs
 
@@ -873,6 +1164,43 @@ Available bands:
 - B10
 - B11
 
+### rslearn.data_sources.worldcover.WorldCover
+
+This data source is for the ESA WorldCover 2021 land cover map.
+
+For details about the land cover map, see https://worldcover2021.esa.int/.
+
+This data source downloads the 18 zip files that contain the map. They are then
+extracted, yielding 2,651 GeoTIFF files. These are then used with
+`rslearn.data_sources.local_files.LocalFiles` to implement the data source.
+
+```jsonc
+{
+  // Required local path to store the downloaded zip files and extracted GeoTIFFs.
+  "worldcover_dir": "cache/worldcover"
+}
+```
+
+Available bands:
+- B1 (uint8)
+
+### rslearn.data_sources.worldpop.WorldPop
+
+This data source is for world population data from worldpop.org.
+
+Currently, this only supports the WorldPop Constrained 2020 100 m Resolution dataset.
+See https://hub.worldpop.org/project/categories?id=3 for details.
+
+The data is split by country. We implement this with the LocalFiles data source for
+simplicity, but this means that all of the data must be downloaded first.
+
+```jsonc
+{
+  // Required local path to store the downloaded WorldPop data.
+  "worldpop_dir": "cache/worldpop"
+}
+```
+
 ### rslearn.data_sources.xyz_tiles.XyzTiles
 
 This data source is for web xyz image tiles (slippy tiles).
diff --git a/rslearn/data_sources/copernicus.py b/rslearn/data_sources/copernicus.py
index 42df515a..a1503898 100644
--- a/rslearn/data_sources/copernicus.py
+++ b/rslearn/data_sources/copernicus.py
@@ -319,7 +319,6 @@ def __init__(
             then we attempt to read the username/password from COPERNICUS_USERNAME and
             COPERNICUS_PASSWORD (this is useful since access tokens are only valid for
             an hour).
-            password: set API username/password instead of access token.
             query_filter: filter string to include when searching for items. This will
                 be appended to other name, geographic, and sensing time filters where
                 applicable. For example, "Collection/Name eq 'SENTINEL-2'". See the API
@@ -368,6 +367,7 @@ def from_config(config: RasterLayerConfig, ds_path: UPath) -> "Copernicus":
         "order_by",
         "sort_by",
         "sort_desc",
+        "timeout",
     ]
     for k in simple_optionals:
         if k in d:
@@ -709,6 +709,8 @@ class Sentinel2(Copernicus):
         "B12": ["B12"],
         "B8A": ["B8A"],
         "TCI": ["R", "G", "B"],
+        # L1C-only products.
+        "B10": ["B10"],
         # L2A-only products. 
"AOT": ["AOT"], "WVP": ["WVP"], @@ -809,17 +811,16 @@ def from_config(config: RasterLayerConfig, ds_path: UPath) -> "Sentinel2": kwargs: dict[str, Any] = dict( assets=list(needed_assets), + product_type=Sentinel2ProductType[d["product_type"]], ) - if "product_type" in d: - kwargs["product_type"] = Sentinel2ProductType(d["product_type"]) - simple_optionals = [ "harmonize", "access_token", "order_by", "sort_by", "sort_desc", + "timeout", ] for k in simple_optionals: if k in d: @@ -965,6 +966,7 @@ def from_config(config: RasterLayerConfig, ds_path: UPath) -> "Sentinel1": "order_by", "sort_by", "sort_desc", + "timeout", ] for k in simple_optionals: if k in d: diff --git a/rslearn/data_sources/eurocrops.py b/rslearn/data_sources/eurocrops.py new file mode 100644 index 00000000..23cd8fbd --- /dev/null +++ b/rslearn/data_sources/eurocrops.py @@ -0,0 +1,247 @@ +"""Data source for vector EuroCrops crop type data.""" + +import glob +import os +import tempfile +import zipfile +from datetime import UTC, datetime, timedelta +from typing import Any + +import fiona +import requests +from rasterio.crs import CRS +from upath import UPath + +from rslearn.config import QueryConfig, VectorLayerConfig +from rslearn.const import WGS84_PROJECTION +from rslearn.data_sources import DataSource, Item +from rslearn.data_sources.utils import match_candidate_items_to_window +from rslearn.log_utils import get_logger +from rslearn.tile_stores import TileStoreWithLayer +from rslearn.utils.feature import Feature +from rslearn.utils.geometry import Projection, STGeometry, get_global_geometry + +logger = get_logger(__name__) + + +class EuroCropsItem(Item): + """An item in the EuroCrops data source. + + For simplicity, we have just one item per year, so each item combines all of the + country-level files for that year. + """ + + def __init__(self, name: str, geometry: STGeometry, zip_fnames: list[str]): + """Creates a new EuroCropsItem. + + Args: + name: unique name of the item. It is just the year that this item + corresponds to. + geometry: the spatial and temporal extent of the item + zip_fnames: the filenames of the zip files that contain country-level crop + type data for this year. + """ + super().__init__(name, geometry) + self.zip_fnames = zip_fnames + + def serialize(self) -> dict: + """Serializes the item to a JSON-encodable dictionary.""" + d = super().serialize() + d["zip_fnames"] = self.zip_fnames + return d + + @staticmethod + def deserialize(d: dict) -> "EuroCropsItem": + """Deserializes an item from a JSON-decoded dictionary.""" + item = super(EuroCropsItem, EuroCropsItem).deserialize(d) + return EuroCropsItem( + name=item.name, geometry=item.geometry, zip_fnames=d["zip_fnames"] + ) + + +class EuroCrops(DataSource[EuroCropsItem]): + """A data source for EuroCrops vector data (v11). + + See https://zenodo.org/records/14094196 for details. + + While the source data is split into country-level files, this data source uses one + item per year for simplicity. So each item corresponds to all of the country-level + files for that year. + + Note that the RO_ny.zip file is not used. 
+ """ + + BASE_URL = "https://zenodo.org/records/14094196/files/" + FILENAMES_BY_YEAR = { + 2018: [ + "FR_2018.zip", + ], + 2019: [ + "DK_2019.zip", + ], + 2020: [ + "ES_NA_2020.zip", + "FI_2020.zip", + "HR_2020.zip", + "NL_2020.zip", + ], + 2021: [ + "AT_2021.zip", + "BE_VLG_2021.zip", + "BE_WAL_2021.zip", + "EE_2021.zip", + "LT_2021.zip", + "LV_2021.zip", + "PT_2021.zip", + "SE_2021.zip", + "SI_2021.zip", + "SK_2021.zip", + ], + 2023: [ + "CZ_2023.zip", + "DE_BB_2023.zip", + "DE_LS_2021.zip", + "DE_NRW_2021.zip", + "ES_2023.zip", + "IE_2023.zip", + ], + } + TIMEOUT = timedelta(seconds=10) + + @staticmethod + def from_config(config: VectorLayerConfig, ds_path: UPath) -> "EuroCrops": + """Creates a new EuroCrops instance from a configuration dictionary.""" + if config.data_source is None: + raise ValueError("data_source is required") + return EuroCrops() + + def _get_all_items(self) -> list[EuroCropsItem]: + """Get a list of all available items in the data source.""" + items: list[EuroCropsItem] = [] + for year, fnames in self.FILENAMES_BY_YEAR.items(): + items.append( + EuroCropsItem( + str(year), + get_global_geometry( + time_range=( + datetime(year, 1, 1, tzinfo=UTC), + datetime(year + 1, 1, 1, tzinfo=UTC), + ), + ), + fnames, + ) + ) + return items + + def get_items( + self, geometries: list[STGeometry], query_config: QueryConfig + ) -> list[list[list[EuroCropsItem]]]: + """Get a list of items in the data source intersecting the given geometries. + + Args: + geometries: the spatiotemporal geometries + query_config: the query configuration + + Returns: + List of groups of items that should be retrieved for each geometry. + """ + wgs84_geometries = [ + geometry.to_projection(WGS84_PROJECTION) for geometry in geometries + ] + all_items = self._get_all_items() + groups = [] + for geometry in wgs84_geometries: + cur_groups = match_candidate_items_to_window( + geometry, all_items, query_config + ) + groups.append(cur_groups) + return groups + + def deserialize_item(self, serialized_item: Any) -> EuroCropsItem: + """Deserializes an item from JSON-decoded data.""" + return EuroCropsItem.deserialize(serialized_item) + + def _extract_features(self, fname: str) -> list[Feature]: + """Download the given zip file, extract shapefile, and return list of features.""" + with tempfile.TemporaryDirectory() as tmp_dir: + # Download the zip file. + url = self.BASE_URL + fname + logger.debug(f"Downloading zip file from {url}") + response = requests.get( + url, + stream=True, + timeout=self.TIMEOUT.total_seconds(), + allow_redirects=False, + ) + response.raise_for_status() + zip_fname = os.path.join(tmp_dir, "data.zip") + with open(zip_fname, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + + # Extract all of the files and look for shapefile filename. + logger.debug(f"Extracting zip file {fname}") + with zipfile.ZipFile(zip_fname) as zip_f: + zip_f.extractall(path=tmp_dir) + + # The shapefiles can appear at any of three levels in the file hierarchy. + shp_fnames = ( + glob.glob("*.shp", root_dir=tmp_dir) + + glob.glob("*/*.shp", root_dir=tmp_dir) + + glob.glob("*/*/*.shp", root_dir=tmp_dir) + ) + if len(shp_fnames) == 0: + tmp_dir_fnames = os.listdir(tmp_dir) + raise ValueError( + f"expected {fname} to contain .shp file but none found (matches={shp_fnames}, ls={tmp_dir_fnames})" + ) + + # Load the features from the shapefile(s). 
+ features = [] + for shp_fname in shp_fnames: + logger.debug(f"Loading feature list from {shp_fname}") + with fiona.open(os.path.join(tmp_dir, shp_fname)) as src: + crs = CRS.from_wkt(src.crs.to_wkt()) + # Normal GeoJSON should have coordinates in CRS coordinates, i.e. it + # should be 1 projection unit/pixel. + projection = Projection(crs, 1, 1) + + for feat in src: + features.append( + Feature.from_geojson( + projection, + { + "type": "Feature", + "geometry": dict(feat.geometry), + "properties": dict(feat.properties), + }, + ) + ) + + return features + + def ingest( + self, + tile_store: TileStoreWithLayer, + items: list[EuroCropsItem], + geometries: list[list[STGeometry]], + ) -> None: + """Ingest items into the given tile store. + + Args: + tile_store: the tile store to ingest into + items: the items to ingest + geometries: a list of geometries needed for each item + """ + for item in items: + if tile_store.is_vector_ready(item.name): + continue + + # Get features across all shapefiles. + features: list[Feature] = [] + for fname in item.zip_fnames: + logger.debug(f"Getting features from {fname} for item {item.name}") + features.extend(self._extract_features(fname)) + + logger.debug(f"Writing features for {item.name} to the tile store") + tile_store.write_vector(item.name, features) diff --git a/rslearn/data_sources/openstreetmap.py b/rslearn/data_sources/openstreetmap.py index 3007e42a..1109dddb 100644 --- a/rslearn/data_sources/openstreetmap.py +++ b/rslearn/data_sources/openstreetmap.py @@ -1,4 +1,4 @@ -"""Data source for raster data on public Cloud Storage buckets.""" +"""Data source for OpenStreetMap vector features.""" import json import shutil @@ -392,7 +392,7 @@ def __init__( bounds_fname: UPath, categories: dict[str, Filter], ): - """Initialize a new Sentinel2 instance. + """Initialize a new OpenStreetMap instance. Args: config: the configuration of this layer. @@ -508,8 +508,6 @@ def ingest( items: the items to ingest geometries: a list of geometries needed for each item """ - item_names = [item.name for item in items] - item_names.sort() for cur_item, cur_geometries in zip(items, geometries): if tile_store.is_vector_ready(cur_item.name): continue From 82b2b397aec4bb0209760d8cb1ce9cc89e37f3d7 Mon Sep 17 00:00:00 2001 From: Favyen Bastani Date: Wed, 27 Aug 2025 09:01:32 -0700 Subject: [PATCH 2/2] Fix since some contain gpkg file --- rslearn/data_sources/eurocrops.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/rslearn/data_sources/eurocrops.py b/rslearn/data_sources/eurocrops.py index 23cd8fbd..1fd561fc 100644 --- a/rslearn/data_sources/eurocrops.py +++ b/rslearn/data_sources/eurocrops.py @@ -184,12 +184,11 @@ def _extract_features(self, fname: str) -> list[Feature]: with zipfile.ZipFile(zip_fname) as zip_f: zip_f.extractall(path=tmp_dir) - # The shapefiles can appear at any of three levels in the file hierarchy. - shp_fnames = ( - glob.glob("*.shp", root_dir=tmp_dir) - + glob.glob("*/*.shp", root_dir=tmp_dir) - + glob.glob("*/*/*.shp", root_dir=tmp_dir) - ) + # The shapefiles or geopackage files can appear at any level in the hierarchy. + # Most zip files contain one but some contain multiple (one per region). + shp_fnames = glob.glob( + "**/*.shp", root_dir=tmp_dir, recursive=True + ) + glob.glob("**/*.gpkg", root_dir=tmp_dir, recursive=True) if len(shp_fnames) == 0: tmp_dir_fnames = os.listdir(tmp_dir) raise ValueError(