diff --git a/datasets/naip/Dockerfile b/datasets/naip/Dockerfile index 5c227dd67..9874b5874 100644 --- a/datasets/naip/Dockerfile +++ b/datasets/naip/Dockerfile @@ -29,7 +29,7 @@ RUN curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/downloa ENV PATH /opt/conda/bin:$PATH ENV LD_LIBRARY_PATH /opt/conda/lib/:$LD_LIBRARY_PATH -RUN mamba install -y -c conda-forge python=3.8 gdal=3.3.3 pip setuptools cython numpy==1.21.5 +RUN mamba install -y -c conda-forge python=3.10 gdal=3.3.3 pip setuptools cython numpy==1.21.5 RUN python -m pip install --upgrade pip diff --git a/datasets/naip/Explore.ipynb b/datasets/naip/Explore.ipynb new file mode 100644 index 000000000..7250a0e81 --- /dev/null +++ b/datasets/naip/Explore.ipynb @@ -0,0 +1,124 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pystac_client\n", + "import planetary_computer\n", + "\n", + "catalog = pystac_client.Client.open(\n", + " \"https://planetarycomputer-test.microsoft.com/stac\",\n", + " modifier=planetary_computer.sign_inplace,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "time_range = \"2022-01-01/2022-12-31\"\n", + "search = catalog.search(collections=[\"naip\"], datetime=time_range)\n", + "items = search.item_collection()\n", + "len(items)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import contextily\n", + "import geopandas\n", + "\n", + "df = geopandas.GeoDataFrame.from_features(items.to_dict(), crs=\"epsg:4326\")\n", + "\n", + "ax = df[[\"geometry\", \"datetime\"]].plot(\n", + " facecolor=\"none\", figsize=(12, 8)\n", + ")\n", + "contextily.add_basemap(\n", + " ax, crs=df.crs.to_string(), source=contextily.providers.Esri.NatGeoWorldMap\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.head()\n", + "summary_table = df.groupby(\"naip:state\").size().reset_index(name=\"Count\")\n", + "summary_table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "# One day Alaska will be included in NAIP\n", + "states = [\n", + " {\n", + " \"code\": \"hi\",\n", + " \"name\": \"Hawaii\"\n", + " },\n", + " {\n", + " \"code\": \"pr\",\n", + " \"name\": \"Puerto Rico\"\n", + " },\n", + " {\n", + " \"code\": \"ak\",\n", + " \"name\": \"Alaska\"\n", + " },\n", + " {\n", + " \"code\": \"vi\",\n", + " \"name\": \"Virgin Islands\"\n", + " },\n", + "]\n", + "\n", + "fig, axs = plt.subplots(len(states), 1, figsize=(12, 8))\n", + "\n", + "for idx, state in enumerate(states):\n", + " stateDf = df[df[\"naip:state\"] == state[\"code\"]]\n", + " if stateDf.empty:\n", + " continue\n", + " merged_polygon = stateDf[\"geometry\"].unary_union\n", + " bounding_box = merged_polygon.bounds\n", + " stateDf.plot(ax=axs[idx]) # f\"{state} {bounding_box}\")\n", + " axs[idx].set_title(f\"{state['name']} {bounding_box}\")\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/datasets/naip/README.md b/datasets/naip/README.md index f5a1280c3..331af216c 100644 --- a/datasets/naip/README.md +++ b/datasets/naip/README.md @@ -13,3 +13,10 @@ $ pctasks dataset process-items --dataset datasets/naip/dataset.yaml test-2023-0 ```shell az acr build -r {the registry} --subscription {the subscription} -t pctasks-naip:latest -f datasets/naip/Dockerfile . ``` + +## Test + +The Azure credentials of the user running the tests will be used. +If found, [environment variables](https://github.com/Azure/azure-sdk-for-go/wiki/Set-up-Your-Environment-for-Authentication) +for service principal authentication will be used. +If the tests are failing due to storage authorization issues, make sure either you or your service principal has access to the storage account. \ No newline at end of file diff --git a/datasets/naip/collection/template.json b/datasets/naip/collection/template.json index ebc4f1730..f8e12c39c 100644 --- a/datasets/naip/collection/template.json +++ b/datasets/naip/collection/template.json @@ -117,6 +117,24 @@ 24.744, -66.951, 49.346 + ], + [ + -156.003, + 19.059, + -154.809, + 20.127 + ], + [ + -67.316, + 17.871, + -65.596, + 18.565 + ], + [ + -64.940, + 17.622, + -64.560, + 17.814 ] ] }, diff --git a/datasets/naip/dataset.yaml b/datasets/naip/dataset.yaml index 328b158a8..7c6cb4ef0 100644 --- a/datasets/naip/dataset.yaml +++ b/datasets/naip/dataset.yaml @@ -1,5 +1,5 @@ id: naip -image: ${{ args.registry }}/pctasks-naip:2023.4.26.0 +image: ${{ args.registry }}/pctasks-naip:latest args: - registry diff --git a/deployment/bin/get_tfvars b/deployment/bin/get_tfvars index 4fe9e6207..a9d987e83 100755 --- a/deployment/bin/get_tfvars +++ b/deployment/bin/get_tfvars @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 import argparse from subprocess import check_output diff --git a/pctasks/core/pctasks/core/storage/base.py b/pctasks/core/pctasks/core/storage/base.py index cf86e4067..36ff5bb76 100644 --- a/pctasks/core/pctasks/core/storage/base.py +++ b/pctasks/core/pctasks/core/storage/base.py @@ -286,7 +286,11 @@ def write_dict( text: The text to write. overwrite: Overwrite if file is already present. """ - self.write_bytes(file_path, orjson.dumps(d), overwrite=overwrite) + self.write_bytes( + file_path, + orjson.dumps(d, option=orjson.OPT_SERIALIZE_NUMPY), + overwrite=overwrite, + ) @abstractmethod def delete_folder(self, folder_path: str) -> None: diff --git a/pctasks/dataset/pctasks/dataset/items/task.py b/pctasks/dataset/pctasks/dataset/items/task.py index 22394e214..16cf66c14 100644 --- a/pctasks/dataset/pctasks/dataset/items/task.py +++ b/pctasks/dataset/pctasks/dataset/items/task.py @@ -265,7 +265,10 @@ def run( ) chunkset.write_chunk( items_chunk_id, - [orjson.dumps(item.to_dict()) for item in results], + [ + orjson.dumps(item.to_dict(), option=orjson.OPT_SERIALIZE_NUMPY) + for item in results + ], ) output = CreateItemsOutput( diff --git a/pctasks/ingest_task/pctasks/ingest_task/items.py b/pctasks/ingest_task/pctasks/ingest_task/items.py index 07309911c..98248df56 100644 --- a/pctasks/ingest_task/pctasks/ingest_task/items.py +++ b/pctasks/ingest_task/pctasks/ingest_task/items.py @@ -24,7 +24,7 @@ class IngestFailedException(Exception): def ingest_item(pgstac: PgSTAC, item: Dict[str, Any]) -> None: - pgstac.ingest_items([orjson.dumps(item)]) + pgstac.ingest_items([orjson.dumps(item, option=orjson.OPT_SERIALIZE_NUMPY)]) @dataclass diff --git a/pctasks/ingest_task/pctasks/ingest_task/pgstac.py b/pctasks/ingest_task/pctasks/ingest_task/pgstac.py index 546dbf532..11544913a 100644 --- a/pctasks/ingest_task/pctasks/ingest_task/pgstac.py +++ b/pctasks/ingest_task/pctasks/ingest_task/pgstac.py @@ -62,7 +62,13 @@ def ingest_collections( ) -> None: self._with_connection_retry( lambda: self.loader.load_collections( - iter([orjson.dumps(c) for c in collections]), insert_mode=mode + iter( + [ + orjson.dumps(c, option=orjson.OPT_SERIALIZE_NUMPY) + for c in collections + ] + ), + insert_mode=mode, ) ) diff --git a/pctasks/router/pctasks/router/handlers/forward.py b/pctasks/router/pctasks/router/handlers/forward.py index 4f35a1e2a..5e15ba26e 100644 --- a/pctasks/router/pctasks/router/handlers/forward.py +++ b/pctasks/router/pctasks/router/handlers/forward.py @@ -17,4 +17,4 @@ def handle(self, message: Dict[str, Any]) -> None: connection_string=settings.queues_connection_string, queue_name=self.get_queue_name(settings), ) as queue: - queue.send_message(orjson.dumps(message)) + queue.send_message(orjson.dumps(message, option=orjson.OPT_SERIALIZE_NUMPY)) diff --git a/pctasks/task/pctasks/task/common/write.py b/pctasks/task/pctasks/task/common/write.py index 259e2b102..e627570aa 100644 --- a/pctasks/task/pctasks/task/common/write.py +++ b/pctasks/task/pctasks/task/common/write.py @@ -30,7 +30,9 @@ def run(self, input: WriteInput, context: TaskContext) -> WriteOutput: if isinstance(input.content, str): storage.write_text(path, input.content) else: - storage.write_bytes(path, orjson.dumps(input.content)) + storage.write_bytes( + path, orjson.dumps(input.content, option=orjson.OPT_SERIALIZE_NUMPY) + ) return WriteOutput(uri=input.uri)