@@ -14,8 +14,8 @@
from zipfile import ZipFile

import geopandas as gpd
import mgrs
import numpy as np
import pandas as pd
import rasterio
import rioxarray
import xarray as xr
@@ -31,12 +31,17 @@
from shapely.geometry import Polygon, box

# LULC data location
LULC_TILE_ROOT = os.environ.get("LULC_SHARED_DATA_ROOT", "/auxdata/lulc/lc2021/")
LULC_TILE_SHAPEFILE = os.environ.get("LULC_TILE_SHAPEFILE", "/auxdata/lulc/tiles.shp")
LAND_POLYGON_PATH = os.environ.get("LAND_POLYGON_PATH", "/auxdata/general/land_polygons.shp")
LULC_TILE_ROOT = os.environ.get("LULC_SHARED_DATA_ROOT", "/auxdata/lulc/")
LAND_POLYGON_PATH = os.environ.get(
"LAND_POLYGON_PATH", "/auxdata/general/land_polygons.shp"
)

# Cloud mask
S2_CLOUD_MASK = [3, 8, 9] # Cloud shadows, Cloud medium probability, Cloud high probability
S2_CLOUD_MASK = [
3,
8,
9,
] # Cloud shadows, Cloud medium probability, Cloud high probability
HLS_CLOUD_MASK = ["0", "0", "0", "0", "1", "1", "1", "1"]
CLOUD_VALUE = 999
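As a quick illustration of how these constants combine (a sketch with illustrative names like data, not code from this diff): masking against S2_CLOUD_MASK is an element-wise membership test on the SCL band, with flagged pixels set to CLOUD_VALUE.

scl = ids[fmask_index].values  # Sentinel-2 SCL band as a numpy array
cloudy = np.isin(scl, S2_CLOUD_MASK)  # True for classes 3, 8 and 9
data = np.where(cloudy, CLOUD_VALUE, data)  # flagged pixels become 999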

@@ -47,7 +52,9 @@

# Permanent water mask
S2_PERMANENT_WATER_MASK = [6] # Permanent water using S2 SCL
S2_LULC_PERMANENT_WATER_MASK = [1] # https://www.arcgis.com/home/item.html?id=cfcb7609de5f478eb7666240902d4d3d
S2_LULC_PERMANENT_WATER_MASK = [
1
] # https://www.arcgis.com/home/item.html?id=cfcb7609de5f478eb7666240902d4d3d
S2_LULC_PERMANENT_WATER_BAND_INDEX = 0
HLS_PERMANENT_WATER_MASK = ["0", "0", "1", "0", "0", "0", "0", "0"]
PERMANENT_WATER_VALUE = 997
@@ -80,13 +87,15 @@ def resize_image(image, input_image=None, bbox=None):
def __resize_image(input_image):
if input_image:
get_bbox_cmd = f"""gdalinfo -json {input_image} | jq '(.cornerCoordinates.upperLeft[0] | tostring) + " " + (.cornerCoordinates.upperLeft[1] | tostring ) + " " + (.cornerCoordinates.lowerRight[0] | tostring) + " " + (.cornerCoordinates.lowerRight[1] | tostring )'"""
input_image_bbox = check_output([get_bbox_cmd], shell=True, text=True).replace('"', "").replace("\n", "")
input_image_bbox = (
check_output([get_bbox_cmd], shell=True, text=True)
.replace('"', "")
.replace("\n", "")
)
os.system(f"cp {image} {image}-temp.tif")
command = f"gdal_translate -projwin {input_image_bbox} -of GTiff {image}-temp.tif {image}"
elif bbox:
command = (
f"gdal_translate -projwin {bbox[0]} {bbox[3]} {bbox[2]} {bbox[1]} -of GTiff {image}-temp.tif {image}"
)
command = f"gdal_translate -projwin {bbox[0]} {bbox[3]} {bbox[2]} {bbox[1]} -of GTiff {image}-temp.tif {image}"

os.system(command)
os.system(f"rm {image}-temp.tif")
@@ -118,8 +127,16 @@ def read_json_with_retries(filepath, max_retries=5, base_delay=0.5):
time.sleep(base_delay * (2 ** (attempt - 1)))
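With the defaults shown above (max_retries=5, base_delay=0.5), this backoff sleeps for 0.5, 1, 2, 4 and 8 seconds on successive retries:

delays = [0.5 * 2 ** (a - 1) for a in range(1, 6)]  # [0.5, 1.0, 2.0, 4.0, 8.0]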


def hls_masking(xds, ids, fmask_index=6, mask_bits=["0", "0", "0", "0", "1", "1", "1", "1"], mask_to_value=999):
logger.info(f">>>>>>>>>>>> Performing mask for scenes {mask_bits} with masked values set to {mask_to_value}")
def hls_masking(
xds,
ids,
fmask_index=6,
mask_bits=["0", "0", "0", "0", "1", "1", "1", "1"],
mask_to_value=999,
):
logger.info(
f">>>>>>>>>>>> Performing mask for scenes {mask_bits} with masked values set to {mask_to_value}"
)
qVals = list(np.unique(ids[fmask_index].astype(int)))
goodQuality = []
for v in qVals:
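The body of this loop is collapsed in the diff view. A plausible reading of the 8-entry mask_bits default (an assumption about the hidden lines, not the verbatim code): each entry lines up with one bit of the Fmask value rendered MSB-first, and a value counts as good quality only when every position flagged "1" is unset. That layout would match HLS Fmask semantics (bits 3..0 = cloud shadow, adjacent to cloud, cloud, cirrus; bit 5 = water) for both HLS_CLOUD_MASK and HLS_PERMANENT_WATER_MASK above.

# Hypothetical sketch of the collapsed loop body:
bits = np.binary_repr(v, width=8)  # e.g. 68 -> "01000100"
if all(b == "0" for b, flag in zip(bits, mask_bits) if flag == "1"):
    goodQuality.append(v)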
@@ -160,7 +177,9 @@ def s2_masking(xds, ids, fmask_index, scene_classification, mask_to_value):
logger.info(
f">>>>>>>>>>>> Performing mask for scenes {scene_classification} with masked values set to {mask_to_value}"
)
mask = ids[fmask_index].values == scene_classification[0] # initalise the mask array
mask = (
ids[fmask_index].values == scene_classification[0]
) # initialise the mask array
for scene_class in range(1, len(scene_classification)):
mask = np.logical_or(
mask, ids[fmask_index].values == scene_classification[scene_class]
@@ -182,12 +201,19 @@ def mask_and_set_to(xds, ids, masking_type, inference_dict, i=0):
mask_band = masking_config.get("band", None)
if mask_band is not None:
encoding_type = masking_config.get("encoding", None)
if encoding_type is not None and "bands" in inference_dict["model_input_data_spec"][i]:
model_input_data_spec_bands = inference_dict["model_input_data_spec"][i]["bands"]
if (
encoding_type is not None
and "bands" in inference_dict["model_input_data_spec"][i]
):
model_input_data_spec_bands = inference_dict["model_input_data_spec"][
i
]["bands"]
if encoding_type == "sentinel2_lulc":
mask_band_index = S2_LULC_PERMANENT_WATER_BAND_INDEX
else:
mask_band_index = search_band_dict(model_input_data_spec_bands, "band_name", f"{mask_band}")
mask_band_index = search_band_dict(
model_input_data_spec_bands, "band_name", f"{mask_band}"
)
if mask_band_index is not None:
mask_band_index = mask_band_index.get("index")
else:
@@ -278,7 +304,9 @@ def mask_from_url(mask_url: str, xds: object, buffer_size_m: float = 100):
ccgdf["geometry"] = ccgdf.buffer(buffer_size_m)
ccgdf.to_crs(crs=original_crs, inplace=True)

shapes = ((geom, value) for geom, value in zip(ccgdf.geometry, [90] * len(ccgdf.geometry)))
shapes = (
(geom, value) for geom, value in zip(ccgdf.geometry, [90] * len(ccgdf.geometry))
)

features.rasterize(shapes=shapes, fill=0, out=xds, transform=xds.rio.transform())
return xds
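The buffer above is in metres, which only behaves as intended if ccgdf was reprojected to a projected CRS in the collapsed lines (buffering in a geographic CRS would operate in degrees). A typical pattern for that hidden step, assuming geopandas >= 0.9 (a sketch, not the verbatim code):

original_crs = ccgdf.crs
ccgdf = ccgdf.to_crs(ccgdf.estimate_utm_crs())  # metric CRS so buffer_size_m is in metres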
@@ -305,11 +333,19 @@ def _make_rgb_single_file(model_input_original_image: str, i: int = 0):
nodata_value = xds.attrs.get("_FillValue", -9999)
xds.rio.write_nodata(nodata_value, inplace=True)

model_input_data_spec_bands = inference_dict["model_input_data_spec"][i]["bands"]
model_input_data_spec_bands = inference_dict["model_input_data_spec"][i][
"bands"
]
logger.info(f"model_input_data_spec_bands: {model_input_data_spec_bands}")
red_band_index = int(search_band_dict(model_input_data_spec_bands, "RGB_band", "R").get("index"))
green_band_index = int(search_band_dict(model_input_data_spec_bands, "RGB_band", "G").get("index"))
blue_band_index = int(search_band_dict(model_input_data_spec_bands, "RGB_band", "B").get("index"))
red_band_index = int(
search_band_dict(model_input_data_spec_bands, "RGB_band", "R").get("index")
)
green_band_index = int(
search_band_dict(model_input_data_spec_bands, "RGB_band", "G").get("index")
)
blue_band_index = int(
search_band_dict(model_input_data_spec_bands, "RGB_band", "B").get("index")
)
rgb_indexes = [red_band_index, green_band_index, blue_band_index]

rgb_xds = xds[rgb_indexes, :, :]
@@ -329,15 +365,6 @@ def _make_rgb_single_file(model_input_original_image: str, i: int = 0):
else:
return _make_rgb_single_file(model_input_original_image)


def get_tiles_list(bbox):
ldf = gpd.read_file(LULC_TILE_SHAPEFILE)
polygon = box(*bbox)
ldf = ldf.clip(polygon)
tiles_list = list(ldf["tile"].values)
return tiles_list


def get_lulc_tile_for_input(input_image_path):
"""
Find the file path for the LULC tile corresponding to a given input.
@@ -354,34 +381,88 @@ def get_lulc_tile_for_input(input_image_path):
# Setup output directory
outputs_dir = "/".join(input_image_path.split("/")[:-1])
Path(outputs_dir).mkdir(parents=True, exist_ok=True)
# Read LULC shapefile
gdf = gpd.read_file(LULC_TILE_SHAPEFILE)
# Get input image bbox
bbox = rdo.get_raster_bbox(input_image_path)
tile_list = get_tiles_list([bbox[0], bbox[1], bbox[2], bbox[3]])
# Prepare dataframe
df_tmp = pd.DataFrame({"file": [input_image_path]})
df_tmp["geometry"] = box(bbox.bottom, bbox.left, bbox.right, bbox.top)
gdf_bbox = gpd.GeoDataFrame(df_tmp, geometry="geometry", crs="EPSG:4326")
# Join lulc shapefile dataframe with model input image dataframe
# to create a dataframe of lulc tile paths, model input image path
# and model input bbox.
join = gpd.sjoin(left_df=gdf, right_df=gdf_bbox, how="right", predicate="intersects")
# Iterate through the list of relavent tiles, then reproject as required.
for j in range(len(tile_list)):
tile_j_value = join[join["tile"] == tile_list[j]]["tile"].values[0]
file_j_value = join[join["tile"] == tile_list[j]]["file"].values[0]
if tile_j_value == "nan" or not join["tile"].values[j]:
continue

m = mgrs.MGRS()
grid_zones = set()

# Sample points across bbox to find all intersecting grid zones
lat_range = bbox.top - bbox.bottom
lon_range = bbox.right - bbox.left
lat_samples = max(3, min(10, int(lat_range / 0.5)))
lon_samples = max(3, min(10, int(lon_range / 0.5)))

for lat in np.linspace(bbox.bottom, bbox.top, lat_samples):
for lon in np.linspace(bbox.left, bbox.right, lon_samples):
try:
# Get MGRS code for this point
mgrs_code = m.toMGRS(lat, lon, MGRSPrecision=0)
# Extract grid zone (first 3 characters: zone number + latitude band)
# Example: "37MBT12345678" -> "37M"
grid_zone = mgrs_code[:3]
grid_zones.add(grid_zone)
except Exception:
# Skip invalid coordinates (e.g., near poles)
continue
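# Note: with MGRSPrecision=0 the mgrs library typically returns just the
# zone, latitude band and 100 km square ID (e.g. "37MBT"), so the [:3]
# slice yields the grid zone either way. The 0.5-degree step above clamps
# the sampling grid to between 3x3 and 10x10 points.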

if not grid_zones:
logger.error("Failed to calculate MGRS grid zones for input image")
return None

# Find matching LULC tiles in the data directory
# Expected naming: [GridZone]_[StartDate]-[EndDate].tiff
# Example: 37N_20240101-20241231.tiff
tile_paths = []
missing_zones = []

# Get all available LULC files
available_files = glob.glob(os.path.join(LULC_TILE_ROOT, "*.tif"))
available_files.extend(glob.glob(os.path.join(LULC_TILE_ROOT, "*.tiff")))

for grid_zone in sorted(grid_zones):
found = False

# Find file matching this grid zone
for filepath in available_files:
filename = os.path.basename(filepath)

# Check if filename starts with the grid zone
# Handles: 37N_20240101-20241231.tiff, 37N.tif, etc.
if filename.startswith(grid_zone):
tile_paths.append(filepath)
found = True
logger.info(f"Found LULC tile for zone {grid_zone}: {filename}")
break

if not found:
missing_zones.append(grid_zone)

if missing_zones:
logger.warning(
f"Missing LULC tiles for {len(missing_zones)} grid zones: {missing_zones}. "
f"Download from: "
f"https://lulctimeseries.blob.core.windows.net/lulctimeseriesv003/lc2024/"
)

if not tile_paths:
logger.error(
f"No LULC tiles found for calculated grid zones: {sorted(grid_zones)}. "
f"Please download required tiles to {LULC_TILE_ROOT}"
)
return None

for j, tile_path in enumerate(tile_paths):
tmp_unique_id = str(uuid4())
# Match LULC tile to model input image
output_tmp_path = rdo.match_raster_to_target(
input_file=LULC_TILE_ROOT + tile_j_value,
target_file=file_j_value,
input_file=tile_path,
target_file=input_image_path,
output_suffix=f"_{tmp_unique_id}_padded",
)
os.system(f"mv {output_tmp_path} {outputs_dir}/lulc_tile{str(j)}.tif")
lulc_tiles = glob.glob(outputs_dir + "/lulc_tile*")
lulc_file_path = ""
if len(lulc_tiles) > 1:
print("mosaic")
lulc_file_path = rdo.create_mosaic(
@@ -400,7 +481,11 @@ def get_lulc_tile_for_input(input_image_path):


def search_band_dict(bands_list, search_key, search_value):
band_dict_match = (band_dict for band_dict in bands_list if band_dict.get(f"{search_key}") == f"{search_value}")
band_dict_match = (
band_dict
for band_dict in bands_list
if band_dict.get(f"{search_key}") == f"{search_value}"
)
band_dict = next(band_dict_match, None)
return band_dict
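For reference, a usage sketch with a hypothetical band spec (the keys mirror those used by _make_rgb_single_file and mask_and_set_to above):

bands = [
    {"band_name": "B04", "RGB_band": "R", "index": 0},
    {"band_name": "B03", "RGB_band": "G", "index": 1},
]
search_band_dict(bands, "RGB_band", "R")  # -> {"band_name": "B04", "RGB_band": "R", "index": 0}
search_band_dict(bands, "band_name", "B08")  # -> None (no match)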

@@ -423,10 +508,21 @@ def zip_inference_data(task_dir):
zip_location = f"{task_dir}/archive.zip"

directory = Path(task_dir)
geoserver_supported_extensions = ("*.tif", "*.gpkg", "*.shp", "*.shx", "*.dbf", "*.cpg", "*.prj", "*.nc")
geoserver_supported_extensions = (
"*.tif",
"*.gpkg",
"*.shp",
"*.shx",
"*.dbf",
"*.cpg",
"*.prj",
"*.nc",
)

with ZipFile(zip_location, mode="w") as archive:
for file_path in chain.from_iterable(directory.glob(ext) for ext in geoserver_supported_extensions):
for file_path in chain.from_iterable(
directory.glob(ext) for ext in geoserver_supported_extensions
):
archive.write(file_path, arcname=file_path.relative_to(directory))
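One behavioural note on the loop above (an observation, not a change made by this diff): Path.glob with these patterns only matches files directly inside task_dir. If inference outputs can land in subdirectories, rglob would be needed instead:

for file_path in chain.from_iterable(
    directory.rglob(ext) for ext in geoserver_supported_extensions
):
    archive.write(file_path, arcname=file_path.relative_to(directory))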


@@ -473,12 +569,18 @@ def regularize_by_technique(
threshold=0,
unique_attribute_value=False,
)
regularization_technique_to_run = REGULARIZATION_TECHNIQUES.get(technique, adaptive_regularization)
regularization_technique_to_run = REGULARIZATION_TECHNIQUES.get(
technique, adaptive_regularization
)

regularized_gdf = regularization_technique_to_run(building_polygons=rasterized_vector, **kwargs)
regularized_gdf = regularization_technique_to_run(
building_polygons=rasterized_vector, **kwargs
)

if attribute_name not in regularized_gdf.columns:
logger.warning(f" Regularized polgon lacks the attribute name {attribute_name}. Adding it back")
logger.warning(
f" Regularized polgon lacks the attribute name {attribute_name}. Adding it back"
)
regularized_gdf[attribute_name] = rasterized_vector[attribute_name]

return regularized_gdf
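Because REGULARIZATION_TECHNIQUES.get silently falls back to adaptive_regularization, a misspelled technique name is easy to miss. A small guard along these lines (a suggestion, not part of the diff) would surface it:

if technique not in REGULARIZATION_TECHNIQUES:
    logger.warning(
        f"Unknown regularization technique '{technique}'; "
        "falling back to adaptive_regularization"
    )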
@@ -498,7 +600,10 @@ def geojson_to_tiff(

# Rasterize using the existing raster's shape and transform
rasterized_data = features.rasterize(
((geom, value) for geom, value in zip(geodataframe.geometry, geodataframe[attribute_name])),
(
(geom, value)
for geom, value in zip(geodataframe.geometry, geodataframe[attribute_name])
),
# [(geom, val) for geom, val in zip(vector.geometry, vector["id"])],
out_shape=raster.shape,
transform=raster.transform,
1 change: 1 addition & 0 deletions pipelines/requirements.txt
@@ -28,6 +28,7 @@ terrakit==0.1.3
tqdm
wget
xarray
mgrs==1.5.4