#######################################################################################
# PROJECT: SAWS
# FILE NAME: transformation.py
# AUTHOR: Caroline Adkins
# LAST UPDATED: June 2025
# This module contains several functions that can be used to transform datasets
# (e.g., converting between different resolutions)
#######################################################################################
#######################################################################################
import os
import pandas as pd
import geopandas as gpd
import numpy as np
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from project_utils import saws_crs, saws_dir, saws_scales, load_saws_gdf, scale_to_geo_unit_cols_dict
##########################################
### BUILD OVERLAP MATRIX
##########################################
def build_overlap_matrix(gdf1, geo_col1, gdf2, geo_col2, save = False, scale1 = None, scale2 = None):
"""
This function builds a dataframe whose columns represent each geographic
unit in one resolution, and rows represent each geographic unit in another
resolution. Cell values represent the area of overlap (in m²) between
the given column, row unit pair.
Arguments:
gdf1 : GeoDataFrame
Geodataframe containing polygons for the first resolution.
geo_col1 : str
Name of the column in gdf1 containing geographic unit identifiers (these units form the matrix columns).
gdf2 : GeoDataFrame
Geodataframe containing polygons for the second resolution.
geo_col2 : str
Name of the column in gdf2 containing geographic unit identifiers (these units form the matrix rows).
save : bool, optional
If True, the resulting overlap matrix will be saved as a .csv file in the
overlap_matrices/ folder of the dataset_builder package. Default is False.
scale1 : str, optional
Name of the first resolution scale (used for naming the output file).
Default is None.
scale2 : str, optional
Name of the second resolution scale (used for naming the output file).
Default is None.
Returns:
overlap_df : DataFrame
Output dataframe with areas of overlap between each pair of geographic
units.
"""
# sort gdfs using geo_col
gdf1 = gdf1.sort_values(by=geo_col1)
gdf2 = gdf2.sort_values(by=geo_col2)
# check to ensure crs matches
if (gdf1.crs != saws_crs) or (gdf2.crs != saws_crs):
raise ValueError('CRS of input geodataframes does not match expected CRS.')
# initialize an empty list to store results
overlap_data = []
# calculate total number of pairs to test
total_pairs = len(gdf1) * len(gdf2)
pair_counter = 0
print(f'\tGenerating overlap matrix. This may take a while depending on the size of the input geodataframes.', end='\n')
# loop through each pair of polygons
for unit1_ix, unit1_row in gdf1.iterrows():
for unit2_ix, unit2_row in gdf2.iterrows():
pair_counter += 1
print(f'\tProcessing pair {pair_counter} of {total_pairs} ({round(100*pair_counter/total_pairs,0)}% complete).', end='\r')
# compute intersection of polygons
intersection = unit1_row.geometry.intersection(unit2_row.geometry)
# calculate area of overlap
if intersection.is_empty:
overlap_area = 0
else:
overlap_area = intersection.area
overlap_data.append({
geo_col1: unit1_row[geo_col1],
geo_col2: unit2_row[geo_col2],
'overlap_area': overlap_area
})
print(f'\tProcessing pair {total_pairs} of {total_pairs} (100% complete). ', end='\n')
overlap_df = pd.DataFrame(overlap_data)
overlap_df = overlap_df.pivot(columns=geo_col1, index=geo_col2, values='overlap_area')
if save == True:
overlap_dir = os.path.abspath(os.path.join(saws_dir, 'dataset_builder', 'overlap_matrices'))
filename = f'{scale1}_x_{scale2}_overlap_matrix.csv'
filepath = os.path.abspath(os.path.join(overlap_dir, filename))
overlap_df.to_csv(filepath, index=True)
return overlap_df
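# The overlap matrix built above is the weighting structure used by the conversion functions
# below: rows are geo_col2 units, columns are geo_col1 units, and each cell is the intersection
# area in m². A minimal usage sketch; the scale codes ('county', 'huc8') and identifier columns
# ('geoid', 'huc8_id') are illustrative placeholders, not necessarily the codes defined in
# project_utils:
#
#   counties = load_saws_gdf('county', full_res=False)
#   watersheds = load_saws_gdf('huc8', full_res=False)
#   overlap = build_overlap_matrix(counties, 'geoid', watersheds, 'huc8_id',
#                                  save=True, scale1='county', scale2='huc8')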
##########################################
### LOAD OVERLAP MATRIX
##########################################
def load_overlap_matrix(scale1, scale2):
"""
This function loads the overlap matrix for two specified scales.
Listed order of scales does not matter.
Arguments:
scale1 : str
First desired scale for overlap matrix.
scale2 : str
Second desired scale for overlap matrix.
Returns:
overlap_df : pandas DataFrame
DataFrame containing the overlap matrix.
"""
# set overlap directory
overlap_dir = os.path.abspath(os.path.join(saws_dir, 'dataset_builder', 'overlap_matrices'))
# file could be stored under either name
filename1 = f'{scale1}_x_{scale2}_overlap_matrix.csv'
filename2 = f'{scale2}_x_{scale1}_overlap_matrix.csv'
# try loading file from first filename
try:
overlap_df = pd.read_csv(os.path.abspath(os.path.join(overlap_dir, filename1)), index_col=0, header=0)
except FileNotFoundError: # if that fails, try loading file from second filename
try:
overlap_df = pd.read_csv(os.path.abspath(os.path.join(overlap_dir, filename2)), index_col=0, header=0)
except FileNotFoundError: # if that fails, raise an error
raise FileNotFoundError(f'Overlap matrix for {scale1} and {scale2} does not exist in overlap_matrices/ folder.')
# if county is on index, file load will have converted geoids to int values, so convert back to strings and pad
if overlap_df.index.name == 'geoid':
overlap_df.index = overlap_df.index.astype(str).str.zfill(5)
return overlap_df
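# Hypothetical usage (scale codes are placeholders; the matrix must already exist on disk,
# saved under either argument order):
#
#   overlap = load_overlap_matrix('county', 'huc8')
#   # Orientation (which scale sits on rows vs. columns) follows whichever file was found,
#   # so downstream code should check overlap.index.name, as geo_convert does below.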
##########################################
### GEOSPATIAL CONVERSIONS
##########################################
def geo_convert(data, from_scale, from_gdf, from_geo_unit_col, to_scale, to_gdf, to_geo_unit_col, aggregation_dict, null_to_neg=False, save_matrix=True):
"""
This function converts the provided data from its native resolution (from_scale)
to a target resolution (to_scale) using a specified aggregation method.
Arguments:
data : DataFrame
The DataFrame containing the data that needs to be converted.
from_scale : str
Scale code for original resolution.
from_gdf : GeoDataFrame
Contains polygons for the resolution that data is currently in.
from_geo_unit_col : str
Column containing geospatial identifiers for data/from_gdf.
to_scale : str
Scale code for target resolution.
to_gdf : GeoDataFrame
Contains polygons for the target resolution.
to_geo_unit_col : str
Column containing geospatial identifiers for to_gdf.
aggregation_dict : dict
Dictionary whose keys are attributes in data that need to be converted and values
represent the method used to aggregate that attribute when converting ('sum' or 'mean')
null_to_neg : bool, optional
If True, null values in the data will be replaced with -9999 before conversion, and all
negative values will be converted back to NaN after conversion. Default is False. Should
not be set to True for datasets that legitimately contain negative values.
save_matrix : bool, optional
If True and the appropriate overlap matrix does not already exist, the generated overlap matrix
will be saved to the overlap_matrices/ folder of the dataset_builder package. Default is True.
Returns:
conv_data : DataFrame
The converted data.
"""
# create a dataframe to store results
attributes = list(aggregation_dict.keys())
data_cols = [col for col in data.columns if col.startswith(tuple(attributes))]
conv_data = pd.DataFrame(index=to_gdf[to_geo_unit_col], columns=data_cols) # index is geo units, columns are attribute columns (with params)
# load or create the overlap matrix
try:
overlap_matrix = load_overlap_matrix(scale1=from_scale, scale2=to_scale)
except FileNotFoundError:
overlap_matrix = build_overlap_matrix(from_gdf, from_geo_unit_col, to_gdf, to_geo_unit_col,
save = save_matrix, scale1=from_scale, scale2=to_scale)
# make sure matrix has the appropriate scale on columns vs index
geo_unit_col_on_index = overlap_matrix.index.name
if geo_unit_col_on_index != to_geo_unit_col:
overlap_matrix = overlap_matrix.T
# make sure data to convert is formatted properly (correct order of geographic units)
fmtd_data = pd.DataFrame(index=overlap_matrix.columns)
cols_to_merge = [from_geo_unit_col] + data_cols
fmtd_data = fmtd_data.merge(data[cols_to_merge], how='left',
left_index=True, right_on=from_geo_unit_col)
fmtd_data.set_index(from_geo_unit_col, inplace=True)
if null_to_neg == True:
# replace null attribute values with -9999
fmtd_data.fillna(-9999, inplace=True)
else:
# drop any rows with null attribute values
null_geo_units = fmtd_data[fmtd_data.isna().any(axis=1)].index
fmtd_data.drop(index=null_geo_units, inplace=True)
overlap_matrix.drop(columns=null_geo_units, inplace=True)
# drop any geographic units that aren't in the final dataset
overlap_matrix.drop(index=overlap_matrix.index.difference(conv_data.index), inplace=True)
# set up weighting matrices for each aggregation type:
# - sum_matrix normalizes each column (source unit) to sum to 1, so a source unit's value is
#   apportioned to target units in proportion to the share of its area that each one overlaps
# - mean_matrix normalizes each row (target unit) to sum to 1, so each target unit receives
#   an area-weighted average of the overlapping source values
sum_matrix = overlap_matrix.div(overlap_matrix.sum(axis=0), axis=1)
mean_matrix = overlap_matrix.div(overlap_matrix.sum(axis=1), axis=0)
# convert each attribute based on the aggregation method specified
for attribute, agg_method in aggregation_dict.items():
attribute_cols = [col for col in fmtd_data.columns if col.startswith(attribute)]
if agg_method == 'sum':
conv_data[attribute_cols] = np.matmul(sum_matrix.values, fmtd_data[attribute_cols].values)
elif agg_method == 'mean':
conv_data[attribute_cols] = np.matmul(mean_matrix.values, fmtd_data[attribute_cols].values)
else:
raise ValueError(f"Unsupported aggregation method: {agg_method}. Use 'sum' or 'mean'.")
# move index back to column
conv_data.reset_index(inplace=True, names=to_geo_unit_col)
# make sure attribute values are read as floats
conv_data[data_cols] = conv_data[data_cols].astype(float)
if null_to_neg == True:
# switch negative values back to NaN
conv_data[data_cols] = conv_data[data_cols].mask(conv_data[data_cols] < 0, np.nan)
return conv_data
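# Illustrative call, assuming hypothetical county-level data with a 'population' column
# (an extensive quantity, apportioned by area share via 'sum') and a 'pct_irrigated' column
# (an intensive quantity, combined as an area-weighted average via 'mean'):
#
#   converted = geo_convert(
#       data=county_df, from_scale='county', from_gdf=county_gdf, from_geo_unit_col='geoid',
#       to_scale='huc8', to_gdf=huc8_gdf, to_geo_unit_col='huc8_id',
#       aggregation_dict={'population': 'sum', 'pct_irrigated': 'mean'},
#   )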
##########################################
def convert_to_saws_resolutions(df, native_resolution, geo_unit_col, aggregation_dict, **kwargs):
"""
This function creates a dictionary of dataframes of the same data
in the different SAWS resolutions.
Arguments:
df : DataFrame
The DataFrame to be converted. If native_resolution is not a SAWS scale, this should be a
GeoDataFrame.
native_resolution : str
The resolution of the data in df.
geo_unit_col : str
Column containing geospatial identifiers for df.
aggregation_dict : dict
Dictionary whose keys are attributes in data that need to be converted and values
represent the method used to aggregate that attribute when converting ('sum' or 'mean').
**kwargs :
Additional keyword arguments that can be passed to the geo_convert function.
Returns:
df_dict : dict
Dictionary where keys are scale codes and values are DataFrames corresponding to each resolution.
"""
# create empty dictionary to store data
df_dict = {}
# if native_resolution is a SAWS scale, load the corresponding SAWS gdf
if native_resolution in saws_scales:
from_gdf = load_saws_gdf(native_resolution, full_res=False)
else:
from_gdf = df[[geo_unit_col, 'geometry']].copy()
df = df[[geo_unit_col] + [col for col in df.columns if col.startswith(tuple(aggregation_dict.keys()))]].copy()
# iterate through saws scales and convert data to each resolution
for scale in saws_scales:
if scale == native_resolution: # if scale is the native resolution, just copy the dataframe
df_dict[scale] = df.copy()
else:
df_dict[scale] = geo_convert(
data = df,
from_scale = native_resolution,
from_gdf = from_gdf,
from_geo_unit_col = geo_unit_col,
to_scale = scale,
to_gdf = load_saws_gdf(scale, full_res=False),
to_geo_unit_col = scale_to_geo_unit_cols_dict[scale],
aggregation_dict = aggregation_dict,
**kwargs
)
return df_dict
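# Minimal end-to-end sketch, run only when this module is executed directly. The input file,
# the 'huc8' scale code, and the 'huc8_id'/'population' column names are assumptions for
# illustration; they are not shipped with the package.
if __name__ == '__main__':
    # load a hypothetical source dataset and project it to the expected CRS
    example_gdf = gpd.read_file('example_data.shp').to_crs(saws_crs)
    # convert it to every SAWS resolution, summing the (hypothetical) 'population' attribute
    example_dict = convert_to_saws_resolutions(
        example_gdf,
        native_resolution='huc8',       # placeholder scale code
        geo_unit_col='huc8_id',         # placeholder identifier column
        aggregation_dict={'population': 'sum'},
    )
    for scale_code, scale_df in example_dict.items():
        print(scale_code, scale_df.shape)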