From c51f26d12c1dac4e614cb6345b7ff6045d14853b Mon Sep 17 00:00:00 2001 From: Beldine-Moturi Date: Mon, 30 Mar 2026 14:32:57 +0300 Subject: [PATCH 1/5] Update lulc tile matching to use mgrs library --- .../postprocess_generic_helper_functions.py | 92 +++++++++++++++---- pyproject.toml | 1 + 2 files changed, 74 insertions(+), 19 deletions(-) diff --git a/pipelines/components/postprocess-generic-single/postprocess_generic_helper_functions.py b/pipelines/components/postprocess-generic-single/postprocess_generic_helper_functions.py index 4ab8ad6..bf2a948 100644 --- a/pipelines/components/postprocess-generic-single/postprocess_generic_helper_functions.py +++ b/pipelines/components/postprocess-generic-single/postprocess_generic_helper_functions.py @@ -8,6 +8,7 @@ import json import rasterio import rioxarray +import mgrs import numpy as np import pandas as pd import xarray as xr @@ -353,30 +354,83 @@ def get_lulc_tile_for_input(input_image_path): # Setup output directory outputs_dir = "/".join(input_image_path.split("/")[:-1]) Path(outputs_dir).mkdir(parents=True, exist_ok=True) - # Read LULC shapefile - gdf = gpd.read_file(LULC_TILE_SHAPEFILE) # Get input image bbox bbox = rdo.get_raster_bbox(input_image_path) - tile_list = get_tiles_list([bbox[0], bbox[1], bbox[2], bbox[3]]) - # Prepare dataframe - df_tmp = pd.DataFrame({"file": [input_image_path]}) - df_tmp["geometry"] = box(bbox.bottom, bbox.left, bbox.right, bbox.top) - gdf_bbox = gpd.GeoDataFrame(df_tmp, geometry="geometry", crs="EPSG:4326") - # Join lulc shapefile dataframe with model input image dataframe - # to create a dataframe of lulc tile paths, model input image path - # and model input bbox. - join = gpd.sjoin(left_df=gdf, right_df=gdf_bbox, how="right", predicate="intersects") - # Iterate through the list of relavent tiles, then reproject as required. - for j in range(len(tile_list)): - tile_j_value = join[join["tile"] == tile_list[j]]["tile"].values[0] - file_j_value = join[join["tile"] == tile_list[j]]["file"].values[0] - if tile_j_value == "nan" or not join["tile"].values[j]: - continue + + m = mgrs.MGRS() + grid_zones = set() + + # Sample points across bbox to find all intersecting grid zones + lat_range = bbox.top - bbox.bottom + lon_range = bbox.right - bbox.left + lat_samples = max(3, min(10, int(lat_range / 0.5))) + lon_samples = max(3, min(10, int(lon_range / 0.5))) + + for lat in np.linspace(bbox.bottom, bbox.top, lat_samples): + for lon in np.linspace(bbox.left, bbox.right, lon_samples): + try: + # Get MGRS code for this point + mgrs_code = m.toMGRS(lat, lon, MGRSPrecision=0) + # Extract grid zone (first 3 characters: zone number + latitude band) + # Example: "37MBT12345678" -> "37M" + grid_zone = mgrs_code[:3] + grid_zones.add(grid_zone) + except Exception as e: + # Skip invalid coordinates (e.g., near poles) + continue + + if not grid_zones: + logger.error("Failed to calculate MGRS grid zones for input image") + return None + + # Find matching LULC tiles in the data directory + # Expected naming: [GridZone]_[StartDate]-[EndDate].tiff + # Example: 37N_20240101-20241231.tiff + tile_paths = [] + missing_zones = [] + + # Get all available LULC files + available_files = glob.glob(os.path.join(LULC_TILE_ROOT, "*.tif")) + available_files.extend(glob.glob(os.path.join(LULC_TILE_ROOT, "*.tiff"))) + + for grid_zone in sorted(grid_zones): + found = False + + # Find file matching this grid zone + for filepath in available_files: + filename = os.path.basename(filepath) + + # Check if filename starts with the grid zone + # Handles: 37N_20240101-20241231.tiff, 37N.tif, etc. + if filename.startswith(grid_zone): + tile_paths.append(filepath) + found = True + logger.info(f"Found LULC tile for zone {grid_zone}: {filename}") + break + + if not found: + missing_zones.append(grid_zone) + + if missing_zones: + logger.warning( + f"Missing LULC tiles for {len(missing_zones)} grid zones: {missing_zones}. " + f"Download from: " + f"https://lulctimeseries.blob.core.windows.net/lulctimeseriesv003/lc2024/" + ) + + if not tile_paths: + logger.error( + f"No LULC tiles found for calculated grid zones: {sorted(grid_zones)}. " + f"Please download required tiles to {LULC_TILE_ROOT}" + ) + return None + + for j, tile_path in enumerate(tile_paths): tmp_unique_id = str(uuid4()) # Match LULC tile to model input image output_tmp_path = rdo.match_raster_to_target( - input_file=LULC_TILE_ROOT + tile_j_value, - target_file=file_j_value, + input_file=tile_path, + target_file=input_image_path, output_suffix=f"_{tmp_unique_id}_padded", ) os.system(f"mv {output_tmp_path} {outputs_dir}/lulc_tile{str(j)}.tif") diff --git a/pyproject.toml b/pyproject.toml index 9c93a56..cf00425 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -85,6 +85,7 @@ dependencies = [ "typer>=0.20.0", "urllib3>=2.5.0", "uvicorn[standard]>=0.35.0", + "mgrs==1.5.4", ] [project.scripts] From fa88047aba46226d3f091842280f871171ef15de Mon Sep 17 00:00:00 2001 From: Beldine-Moturi Date: Mon, 30 Mar 2026 15:21:37 +0300 Subject: [PATCH 2/5] fix lulc file path error --- .../postprocess_generic_helper_functions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/components/postprocess-generic-single/postprocess_generic_helper_functions.py b/pipelines/components/postprocess-generic-single/postprocess_generic_helper_functions.py index bf2a948..12ced26 100644 --- a/pipelines/components/postprocess-generic-single/postprocess_generic_helper_functions.py +++ b/pipelines/components/postprocess-generic-single/postprocess_generic_helper_functions.py @@ -435,6 +435,7 @@ def get_lulc_tile_for_input(input_image_path): ) os.system(f"mv {output_tmp_path} {outputs_dir}/lulc_tile{str(j)}.tif") lulc_tiles = glob.glob(outputs_dir + "/lulc_tile*") + lulc_file_path = "" if len(lulc_tiles) > 1: print("mosaic") lulc_file_path = rdo.create_mosaic( From 08dc14ba0043feebae247a8d82f1e5f606f98276 Mon Sep 17 00:00:00 2001 From: Beldine-Moturi Date: Tue, 31 Mar 2026 12:23:56 +0300 Subject: [PATCH 3/5] run pre-commit --- .../postprocess_generic_helper_functions.py | 134 +++++++++++++----- 1 file changed, 97 insertions(+), 37 deletions(-) diff --git a/pipelines/components/postprocess-generic-single/postprocess_generic_helper_functions.py b/pipelines/components/postprocess-generic-single/postprocess_generic_helper_functions.py index bfae7dc..a8f0d1b 100644 --- a/pipelines/components/postprocess-generic-single/postprocess_generic_helper_functions.py +++ b/pipelines/components/postprocess-generic-single/postprocess_generic_helper_functions.py @@ -14,8 +14,8 @@ from zipfile import ZipFile import geopandas as gpd +import mgrs import numpy as np -import pandas as pd import rasterio import rioxarray import xarray as xr @@ -29,15 +29,20 @@ ) from rasterio import features from shapely.geometry import Polygon, box -import mgrs # LULC data location LULC_TILE_ROOT = os.environ.get("LULC_SHARED_DATA_ROOT", "/auxdata/lulc/lc2021/") LULC_TILE_SHAPEFILE = os.environ.get("LULC_TILE_SHAPEFILE", "/auxdata/lulc/tiles.shp") -LAND_POLYGON_PATH = os.environ.get("LAND_POLYGON_PATH", "/auxdata/general/land_polygons.shp") +LAND_POLYGON_PATH = os.environ.get( + "LAND_POLYGON_PATH", "/auxdata/general/land_polygons.shp" +) # Cloud mask -S2_CLOUD_MASK = [3, 8, 9] # Cloud shadows, Cloud medium probability, Cloud high probability +S2_CLOUD_MASK = [ + 3, + 8, + 9, +] # Cloud shadows, Cloud medium probability, Cloud high probability HLS_CLOUD_MASK = ["0", "0", "0", "0", "1", "1", "1", "1"] CLOUD_VALUE = 999 @@ -48,7 +53,9 @@ # Permanent water mask S2_PERMANENT_WATER_MASK = [6] # Permanent water using S2 SCL -S2_LULC_PERMANENT_WATER_MASK = [1] # https://www.arcgis.com/home/item.html?id=cfcb7609de5f478eb7666240902d4d3d +S2_LULC_PERMANENT_WATER_MASK = [ + 1 +] # https://www.arcgis.com/home/item.html?id=cfcb7609de5f478eb7666240902d4d3d S2_LULC_PERMANENT_WATER_BAND_INDEX = 0 HLS_PERMANENT_WATER_MASK = ["0", "0", "1", "0", "0", "0", "0", "0"] PERMANENT_WATER_VALUE = 997 @@ -81,13 +88,15 @@ def resize_image(image, input_image=None, bbox=None): def __resize_image(input_image): if input_image: get_bbox_cmd = f"""gdalinfo -json {input_image} | jq '(.cornerCoordinates.upperLeft[0] | tostring) + " " + (.cornerCoordinates.upperLeft[1] | tostring ) + " " + (.cornerCoordinates.lowerRight[0] | tostring) + " " + (.cornerCoordinates.lowerRight[1] | tostring )'""" - input_image_bbox = check_output([get_bbox_cmd], shell=True, text=True).replace('"', "").replace("\n", "") + input_image_bbox = ( + check_output([get_bbox_cmd], shell=True, text=True) + .replace('"', "") + .replace("\n", "") + ) os.system(f"cp {image} {image}-temp.tif") command = f"gdal_translate -projwin {input_image_bbox} -of GTiff {image}-temp.tif {image}" elif bbox: - command = ( - f"gdal_translate -projwin {bbox[0]} {bbox[3]} {bbox[2]} {bbox[1]} -of GTiff {image}-temp.tif {image}" - ) + command = f"gdal_translate -projwin {bbox[0]} {bbox[3]} {bbox[2]} {bbox[1]} -of GTiff {image}-temp.tif {image}" os.system(command) os.system(f"rm {image}-temp.tif") @@ -119,8 +128,16 @@ def read_json_with_retries(filepath, max_retries=5, base_delay=0.5): time.sleep(base_delay * (2 ** (attempt - 1))) -def hls_masking(xds, ids, fmask_index=6, mask_bits=["0", "0", "0", "0", "1", "1", "1", "1"], mask_to_value=999): - logger.info(f">>>>>>>>>>>> Performing mask for scenes {mask_bits} with masked values set to {mask_to_value}") +def hls_masking( + xds, + ids, + fmask_index=6, + mask_bits=["0", "0", "0", "0", "1", "1", "1", "1"], + mask_to_value=999, +): + logger.info( + f">>>>>>>>>>>> Performing mask for scenes {mask_bits} with masked values set to {mask_to_value}" + ) qVals = list(np.unique(ids[fmask_index].astype(int))) goodQuality = [] for v in qVals: @@ -161,7 +178,9 @@ def s2_masking(xds, ids, fmask_index, scene_classification, mask_to_value): logger.info( f">>>>>>>>>>>> Performing mask for scenes {scene_classification} with masked values set to {mask_to_value}" ) - mask = ids[fmask_index].values == scene_classification[0] # initalise the mask array + mask = ( + ids[fmask_index].values == scene_classification[0] + ) # initalise the mask array for scene_class in range(1, len(scene_classification)): mask = np.logical_or( mask, ids[fmask_index].values == scene_classification[scene_class] @@ -183,12 +202,19 @@ def mask_and_set_to(xds, ids, masking_type, inference_dict, i=0): mask_band = masking_config.get("band", None) if mask_band is not None: encoding_type = masking_config.get("encoding", None) - if encoding_type is not None and "bands" in inference_dict["model_input_data_spec"][i]: - model_input_data_spec_bands = inference_dict["model_input_data_spec"][i]["bands"] + if ( + encoding_type is not None + and "bands" in inference_dict["model_input_data_spec"][i] + ): + model_input_data_spec_bands = inference_dict["model_input_data_spec"][ + i + ]["bands"] if encoding_type == "sentinel2_lulc": mask_band_index = S2_LULC_PERMANENT_WATER_BAND_INDEX else: - mask_band_index = search_band_dict(model_input_data_spec_bands, "band_name", f"{mask_band}") + mask_band_index = search_band_dict( + model_input_data_spec_bands, "band_name", f"{mask_band}" + ) if mask_band_index is not None: mask_band_index = mask_band_index.get("index") else: @@ -279,7 +305,9 @@ def mask_from_url(mask_url: str, xds: object, buffer_size_m: float = 100): ccgdf["geometry"] = ccgdf.buffer(buffer_size_m) ccgdf.to_crs(crs=original_crs, inplace=True) - shapes = ((geom, value) for geom, value in zip(ccgdf.geometry, [90] * len(ccgdf.geometry))) + shapes = ( + (geom, value) for geom, value in zip(ccgdf.geometry, [90] * len(ccgdf.geometry)) + ) features.rasterize(shapes=shapes, fill=0, out=xds, transform=xds.rio.transform()) return xds @@ -306,11 +334,19 @@ def _make_rgb_single_file(model_input_original_image: str, i: int = 0): nodata_value = xds.attrs.get("_FillValue", -9999) xds.rio.write_nodata(nodata_value, inplace=True) - model_input_data_spec_bands = inference_dict["model_input_data_spec"][i]["bands"] + model_input_data_spec_bands = inference_dict["model_input_data_spec"][i][ + "bands" + ] logger.info(f"model_input_data_spec_bands: {model_input_data_spec_bands}") - red_band_index = int(search_band_dict(model_input_data_spec_bands, "RGB_band", "R").get("index")) - green_band_index = int(search_band_dict(model_input_data_spec_bands, "RGB_band", "G").get("index")) - blue_band_index = int(search_band_dict(model_input_data_spec_bands, "RGB_band", "B").get("index")) + red_band_index = int( + search_band_dict(model_input_data_spec_bands, "RGB_band", "R").get("index") + ) + green_band_index = int( + search_band_dict(model_input_data_spec_bands, "RGB_band", "G").get("index") + ) + blue_band_index = int( + search_band_dict(model_input_data_spec_bands, "RGB_band", "B").get("index") + ) rgb_indexes = [red_band_index, green_band_index, blue_band_index] rgb_xds = xds[rgb_indexes, :, :] @@ -366,7 +402,7 @@ def get_lulc_tile_for_input(input_image_path): lon_range = bbox.right - bbox.left lat_samples = max(3, min(10, int(lat_range / 0.5))) lon_samples = max(3, min(10, int(lon_range / 0.5))) - + for lat in np.linspace(bbox.bottom, bbox.top, lat_samples): for lon in np.linspace(bbox.left, bbox.right, lon_samples): try: @@ -376,10 +412,10 @@ def get_lulc_tile_for_input(input_image_path): # Example: "37MBT12345678" -> "37M" grid_zone = mgrs_code[:3] grid_zones.add(grid_zone) - except Exception as e: + except Exception: # Skip invalid coordinates (e.g., near poles) continue - + if not grid_zones: logger.error("Failed to calculate MGRS grid zones for input image") return None @@ -389,18 +425,18 @@ def get_lulc_tile_for_input(input_image_path): # Example: 37N_20240101-20241231.tiff tile_paths = [] missing_zones = [] - + # Get all available LULC files available_files = glob.glob(os.path.join(LULC_TILE_ROOT, "*.tif")) available_files.extend(glob.glob(os.path.join(LULC_TILE_ROOT, "*.tiff"))) - + for grid_zone in sorted(grid_zones): found = False - + # Find file matching this grid zone for filepath in available_files: filename = os.path.basename(filepath) - + # Check if filename starts with the grid zone # Handles: 37N_20240101-20241231.tiff, 37N.tif, etc. if filename.startswith(grid_zone): @@ -408,17 +444,17 @@ def get_lulc_tile_for_input(input_image_path): found = True logger.info(f"Found LULC tile for zone {grid_zone}: {filename}") break - + if not found: missing_zones.append(grid_zone) - + if missing_zones: logger.warning( f"Missing LULC tiles for {len(missing_zones)} grid zones: {missing_zones}. " f"Download from: " f"https://lulctimeseries.blob.core.windows.net/lulctimeseriesv003/lc2024/" ) - + if not tile_paths: logger.error( f"No LULC tiles found for calculated grid zones: {sorted(grid_zones)}. " @@ -455,7 +491,11 @@ def get_lulc_tile_for_input(input_image_path): def search_band_dict(bands_list, search_key, search_value): - band_dict_match = (band_dict for band_dict in bands_list if band_dict.get(f"{search_key}") == f"{search_value}") + band_dict_match = ( + band_dict + for band_dict in bands_list + if band_dict.get(f"{search_key}") == f"{search_value}" + ) band_dict = next(band_dict_match, None) return band_dict @@ -478,10 +518,21 @@ def zip_inference_data(task_dir): zip_location = f"{task_dir}/archive.zip" directory = Path(task_dir) - geoserver_supported_extensions = ("*.tif", "*.gpkg", "*.shp", "*.shx", "*.dbf", "*.cpg", "*.prj", "*.nc") + geoserver_supported_extensions = ( + "*.tif", + "*.gpkg", + "*.shp", + "*.shx", + "*.dbf", + "*.cpg", + "*.prj", + "*.nc", + ) with ZipFile(zip_location, mode="w") as archive: - for file_path in chain.from_iterable(directory.glob(ext) for ext in geoserver_supported_extensions): + for file_path in chain.from_iterable( + directory.glob(ext) for ext in geoserver_supported_extensions + ): archive.write(file_path, arcname=file_path.relative_to(directory)) @@ -528,12 +579,18 @@ def regularize_by_technique( threshold=0, unique_attribute_value=False, ) - regularization_technique_to_run = REGULARIZATION_TECHNIQUES.get(technique, adaptive_regularization) + regularization_technique_to_run = REGULARIZATION_TECHNIQUES.get( + technique, adaptive_regularization + ) - regularized_gdf = regularization_technique_to_run(building_polygons=rasterized_vector, **kwargs) + regularized_gdf = regularization_technique_to_run( + building_polygons=rasterized_vector, **kwargs + ) if attribute_name not in regularized_gdf.columns: - logger.warning(f" Regularized polgon lacks the attribute name {attribute_name}. Adding it back") + logger.warning( + f" Regularized polgon lacks the attribute name {attribute_name}. Adding it back" + ) regularized_gdf[attribute_name] = rasterized_vector[attribute_name] return regularized_gdf @@ -553,7 +610,10 @@ def geojson_to_tiff( # Rasterize using the existing raster's shape and transform rasterized_data = features.rasterize( - ((geom, value) for geom, value in zip(geodataframe.geometry, geodataframe[attribute_name])), + ( + (geom, value) + for geom, value in zip(geodataframe.geometry, geodataframe[attribute_name]) + ), # [(geom, val) for geom, val in zip(vector.geometry, vector["id"])], out_shape=raster.shape, transform=raster.transform, From 69cc68b5f02f174d66c650d5e55434af4dd4d3e6 Mon Sep 17 00:00:00 2001 From: Beldine-Moturi Date: Tue, 31 Mar 2026 16:48:32 +0300 Subject: [PATCH 4/5] update mgrs requirement --- pipelines/requirements.txt | 1 + pyproject.toml | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/requirements.txt b/pipelines/requirements.txt index 5d7c359..031986b 100644 --- a/pipelines/requirements.txt +++ b/pipelines/requirements.txt @@ -28,6 +28,7 @@ terrakit==0.1.3 tqdm wget xarray +mgrs==1.5.4 diff --git a/pyproject.toml b/pyproject.toml index cf00425..9c93a56 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -85,7 +85,6 @@ dependencies = [ "typer>=0.20.0", "urllib3>=2.5.0", "uvicorn[standard]>=0.35.0", - "mgrs==1.5.4", ] [project.scripts] From 182bebc6134107eb31d8f4eac466d111049d9b66 Mon Sep 17 00:00:00 2001 From: Beldine-Moturi Date: Mon, 13 Apr 2026 13:55:10 +0300 Subject: [PATCH 5/5] Update LULC_TILE_ROOT folder --- .../postprocess_generic_helper_functions.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/pipelines/components/postprocess-generic-single/postprocess_generic_helper_functions.py b/pipelines/components/postprocess-generic-single/postprocess_generic_helper_functions.py index a8f0d1b..49e1e9d 100644 --- a/pipelines/components/postprocess-generic-single/postprocess_generic_helper_functions.py +++ b/pipelines/components/postprocess-generic-single/postprocess_generic_helper_functions.py @@ -31,8 +31,7 @@ from shapely.geometry import Polygon, box # LULC data location -LULC_TILE_ROOT = os.environ.get("LULC_SHARED_DATA_ROOT", "/auxdata/lulc/lc2021/") -LULC_TILE_SHAPEFILE = os.environ.get("LULC_TILE_SHAPEFILE", "/auxdata/lulc/tiles.shp") +LULC_TILE_ROOT = os.environ.get("LULC_SHARED_DATA_ROOT", "/auxdata/lulc/") LAND_POLYGON_PATH = os.environ.get( "LAND_POLYGON_PATH", "/auxdata/general/land_polygons.shp" ) @@ -366,15 +365,6 @@ def _make_rgb_single_file(model_input_original_image: str, i: int = 0): else: return _make_rgb_single_file(model_input_original_image) - -def get_tiles_list(bbox): - ldf = gpd.read_file(LULC_TILE_SHAPEFILE) - polygon = box(*bbox) - ldf = ldf.clip(polygon) - tiles_list = list(ldf["tile"].values) - return tiles_list - - def get_lulc_tile_for_input(input_image_path): """ Find the file path for the LULC tile corresponding to a given input.