#######################################################################################
# PROJECT: SAWS
# FILE NAME: transformation.py
# AUTHOR: Caroline Adkins
# LAST UPDATED: June 2025
# This module contains several functions that can be used to transform datasets
# (e.g., converting between different resolutions)
#######################################################################################
#######################################################################################
import os
import pandas as pd
import geopandas as gpd
import numpy as np
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from project_utils import saws_crs, saws_dir, saws_scales, load_saws_gdf, scale_to_geo_unit_cols_dict
##########################################
### BUILD OVERLAP MATRIX
##########################################
def build_overlap_matrix(gdf1, geo_col1, gdf2, geo_col2, save = False, scale1 = None, scale2 = None):
"""
This function builds a dataframe whose columns represent each geographic
unit in one resolution, and rows represent each geographic unit in another
resolution. Cell values represent the area of overlap (in m²) between
the given column, row unit pair.
Arguments:
gdf1 : GeoDataFrame
Geodataframe containing polygons for the first resolution.
geo_col1 : str
Name of the column in gdf1 containing geographic unit identifiers (these units form the matrix columns).
gdf2 : GeoDataFrame
Geodataframe containing polygons for the second resolution.
geo_col2 : str
Name of the column in gdf2 containing geographic unit identifiers (these units form the matrix rows).
save : bool, optional
If True, the resulting overlap matrix will be saved as a .csv file in the
overlap_matrices/ folder of the dataset_builder package. Default is False.
scale1 : str, optional
Name of the first resolution scale (used for naming the output file).
Default is None.
scale2 : str, optional
Name of the second resolution scale (used for naming the output file).
Default is None.
Returns:
overlap_df : DataFrame
Output dataframe with areas of overlap between each pair of geographic
units.
"""
# sort gdfs using geo_col
gdf1 = gdf1.sort_values(by=geo_col1)
gdf2 = gdf2.sort_values(by=geo_col2)
# check to ensure crs matches
if (gdf1.crs != saws_crs) or (gdf2.crs != saws_crs):
raise ValueError('CRS of input geodataframes does not match expected CRS.')
# initialize an empty list to store results
overlap_data = []
# calculate total number of pairs to test
total_pairs = len(gdf1) * len(gdf2)
pair_counter = 0
print(f'\tGenerating overlap matrix. This may take a while depending on the size of the input geodataframes.', end='\n')
# loop through each pair of polygons
for unit1_ix, unit1_row in gdf1.iterrows():
for unit2_ix, unit2_row in gdf2.iterrows():
pair_counter += 1
print(f'\tProcessing pair {pair_counter} of {total_pairs} ({round(100*pair_counter/total_pairs,0)}% complete).', end='\r')
# compute intersection of polygons
intersection = unit1_row.geometry.intersection(unit2_row.geometry)
# calculate area of overlap
if intersection.is_empty:
overlap_area = 0
else:
overlap_area = intersection.area
overlap_data.append({
geo_col1: unit1_row[geo_col1],
geo_col2: unit2_row[geo_col2],
'overlap_area': overlap_area
})
print(f'\tProcessing pair {total_pairs} of {total_pairs} (100% complete). ', end='\n')
overlap_df = pd.DataFrame(overlap_data)
overlap_df = overlap_df.pivot(columns=geo_col1, index=geo_col2, values='overlap_area')
if save == True:
overlap_dir = os.path.abspath(os.path.join(saws_dir, 'dataset_builder', 'overlap_matrices'))
filename = f'{scale1}_x_{scale2}_overlap_matrix.csv'
filepath = os.path.abspath(os.path.join(overlap_dir, filename))
overlap_df.to_csv(filepath, index=True)
return overlap_df
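# The overlap matrix built above is the weighting structure used by the conversion functions
# below: rows are geo_col2 units, columns are geo_col1 units, and each cell is the intersection
# area in m². A minimal usage sketch; the scale codes ('county', 'huc8') and identifier columns
# ('geoid', 'huc8_id') are illustrative placeholders, not necessarily the codes defined in
# project_utils:
#
#   counties = load_saws_gdf('county', full_res=False)
#   watersheds = load_saws_gdf('huc8', full_res=False)
#   overlap = build_overlap_matrix(counties, 'geoid', watersheds, 'huc8_id',
#                                  save=True, scale1='county', scale2='huc8')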
##########################################
### LOAD OVERLAP MATRIX
##########################################
def load_overlap_matrix(scale1, scale2):
"""
This function loads the overlap matrix for two specified scales.
Listed order of scales does not matter.
Arguments:
scale1 : str
First desired scale for overlap matrix.
scale2 : str
Second desired scale for overlap matrix.
Returns:
overlap_df : pandas DataFrame
DataFrame containing the overlap matrix.
"""
# set overlap directory
overlap_dir = os.path.abspath(os.path.join(saws_dir, 'dataset_builder', 'overlap_matrices'))
# file could be stored under either name
filename1 = f'{scale1}_x_{scale2}_overlap_matrix.csv'
filename2 = f'{scale2}_x_{scale1}_overlap_matrix.csv'
# try loading file from first filename
try:
overlap_df = pd.read_csv(os.path.abspath(os.path.join(overlap_dir, filename1)), index_col=0, header=0)
except FileNotFoundError: # if that fails, try loading file from second filename
try:
overlap_df = pd.read_csv(os.path.abspath(os.path.join(overlap_dir, filename2)), index_col=0, header=0)
except FileNotFoundError: # if that fails, raise an error
raise FileNotFoundError(f'Overlap matrix for {scale1} and {scale2} does not exist in overlap_matrices/ folder.')
# if county is on index, file load will have converted geoids to int values, so convert back to strings and pad
if overlap_df.index.name == 'geoid':
overlap_df.index = overlap_df.index.astype(str).str.zfill(5)
return overlap_df
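# Hypothetical usage (scale codes are placeholders; the matrix must already exist on disk,
# saved under either argument order):
#
#   overlap = load_overlap_matrix('county', 'huc8')
#   # Orientation (which scale sits on rows vs. columns) follows whichever file was found,
#   # so downstream code should check overlap.index.name, as geo_convert does below.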
##########################################
### GEOSPATIAL CONVERSIONS
##########################################
def geo_convert(data, from_scale, from_gdf, from_geo_unit_col, to_scale, to_gdf, to_geo_unit_col, aggregation_dict, null_to_neg=False, save_matrix=True):
"""
This function converts the provided data from its native resolution (from_scale)
to a target resolution (to_scale) using a specified aggregation method.
Arguments:
data : DataFrame
The DataFrame containing the data that needs to be converted.
from_scale : str
Scale code for original resolution.
from_gdf : GeoDataFrame
Contains polygons for the resolution that data is currently in.
from_geo_unit_col : str
Column containing geospatial identifiers for data/from_gdf.
to_scale : str
Scale code for target resolution.
to_gdf : GeoDataFrame
Contains polygons for the target resolution.
to_geo_unit_col : str
Column containing geospatial identifiers for to_gdf.
aggregation_dict : dict
Dictionary whose keys are attributes in data that need to be converted and values
represent the method used to aggregate that attribute when converting ('sum' or 'mean')
null_to_neg : bool, optional
If True, null values in the data will be replaced with -9999 before conversion, and all
negative values will be converted back to NaN after conversion. Default is False. Should
not be set to True for datasets that legitimately contain negative values.
save_matrix : bool, optional
If True and the appropriate overlap matrix does not already exist, the generated overlap matrix
will be saved to the overlap_matrices/ folder of the dataset_builder package. Default is True.
Returns:
conv_data : DataFrame
The converted data.
"""
# create a dataframe to store results
attributes = list(aggregation_dict.keys())
data_cols = [col for col in data.columns if col.startswith(tuple(attributes))]
conv_data = pd.DataFrame(index=to_gdf[to_geo_unit_col], columns=data_cols) # index is geo units, columns are attribute columns (with params)
# load or create the overlap matrix
try:
overlap_matrix = load_overlap_matrix(scale1=from_scale, scale2=to_scale)
except FileNotFoundError:
overlap_matrix = build_overlap_matrix(from_gdf, from_geo_unit_col, to_gdf, to_geo_unit_col,
save = save_matrix, scale1=from_scale, scale2=to_scale)
# make sure matrix has the appropriate scale on columns vs index
geo_unit_col_on_index = overlap_matrix.index.name
if geo_unit_col_on_index != to_geo_unit_col:
overlap_matrix = overlap_matrix.T
# make sure data to convert is formatted properly (correct order of geographic units)
fmtd_data = pd.DataFrame(index=overlap_matrix.columns)
cols_to_merge = [from_geo_unit_col] + data_cols
fmtd_data = fmtd_data.merge(data[cols_to_merge], how='left',
left_index=True, right_on=from_geo_unit_col)
fmtd_data.set_index(from_geo_unit_col, inplace=True)
if null_to_neg == True:
# replace null attribute values with -9999
fmtd_data.fillna(-9999, inplace=True)
else:
# drop any rows with null attribute values
null_geo_units = fmtd_data[fmtd_data.isna().any(axis=1)].index
fmtd_data.drop(index=null_geo_units, inplace=True)
overlap_matrix.drop(columns=null_geo_units, inplace=True)
# drop any geographic units that aren't in the final dataset
overlap_matrix.drop(index=overlap_matrix.index.difference(conv_data.index), inplace=True)
# set up weighting matrices for each aggregation type:
# - sum_matrix normalizes each column (source unit) to sum to 1, so a source unit's value is
#   apportioned to target units in proportion to the share of its area that each one overlaps
# - mean_matrix normalizes each row (target unit) to sum to 1, so each target unit receives
#   an area-weighted average of the overlapping source values
sum_matrix = overlap_matrix.div(overlap_matrix.sum(axis=0), axis=1)
mean_matrix = overlap_matrix.div(overlap_matrix.sum(axis=1), axis=0)
# convert each attribute based on the aggregation method specified
for attribute, agg_method in aggregation_dict.items():
attribute_cols = [col for col in fmtd_data.columns if col.startswith(attribute)]
if agg_method == 'sum':
conv_data[attribute_cols] = np.matmul(sum_matrix.values, fmtd_data[attribute_cols].values)
elif agg_method == 'mean':
conv_data[attribute_cols] = np.matmul(mean_matrix.values, fmtd_data[attribute_cols].values)
else:
raise ValueError(f"Unsupported aggregation method: {agg_method}. Use 'sum' or 'mean'.")
# move index back to column
conv_data.reset_index(inplace=True, names=to_geo_unit_col)
# make sure attribute values are read as floats
conv_data[data_cols] = conv_data[data_cols].astype(float)
if null_to_neg == True:
# switch negative values back to NaN
conv_data[data_cols] = conv_data[data_cols].mask(conv_data[data_cols] < 0, np.nan)
return conv_data
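# Illustrative call, assuming hypothetical county-level data with a 'population' column
# (an extensive quantity, apportioned by area share via 'sum') and a 'pct_irrigated' column
# (an intensive quantity, combined as an area-weighted average via 'mean'):
#
#   converted = geo_convert(
#       data=county_df, from_scale='county', from_gdf=county_gdf, from_geo_unit_col='geoid',
#       to_scale='huc8', to_gdf=huc8_gdf, to_geo_unit_col='huc8_id',
#       aggregation_dict={'population': 'sum', 'pct_irrigated': 'mean'},
#   )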
##########################################
def convert_to_saws_resolutions(df, native_resolution, geo_unit_col, aggregation_dict, **kwargs):
"""
This function creates a dictionary of dataframes of the same data
in the different SAWS resolutions.
Arguments:
df : DataFrame
The DataFrame to be converted. If native_resolution is not a SAWS scale, this should be a
GeoDataFrame.
native_resolution : str
The resolution of the data in df.
geo_unit_col : str
Column containing geospatial identifiers for df.
aggregation_dict : dict
Dictionary whose keys are attributes in data that need to be converted and values
represent the method used to aggregate that attribute when converting ('sum' or 'mean').
**kwargs :
Additional keyword arguments that can be passed to the geo_convert function.
Returns:
df_dict : dict
Dictionary where keys are scale codes and values are DataFrames corresponding to each resolution.
"""
# create empty dictionary to store data
df_dict = {}
# if native_resolution is a SAWS scale, load the corresponding SAWS gdf
if native_resolution in saws_scales:
from_gdf = load_saws_gdf(native_resolution, full_res=False)
else:
from_gdf = df[[geo_unit_col, 'geometry']].copy()
df = df[[geo_unit_col] + [col for col in df.columns if col.startswith(tuple(aggregation_dict.keys()))]].copy()
# iterate through saws scales and convert data to each resolution
for scale in saws_scales:
if scale == native_resolution: # if scale is the native resolution, just copy the dataframe
df_dict[scale] = df.copy()
else:
df_dict[scale] = geo_convert(
data = df,
from_scale = native_resolution,
from_gdf = from_gdf,
from_geo_unit_col = geo_unit_col,
to_scale = scale,
to_gdf = load_saws_gdf(scale, full_res=False),
to_geo_unit_col = scale_to_geo_unit_cols_dict[scale],
aggregation_dict = aggregation_dict,
**kwargs
)
return df_dict
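# Minimal end-to-end sketch, run only when this module is executed directly. The input file,
# the 'huc8' scale code, and the 'huc8_id'/'population' column names are assumptions for
# illustration; they are not shipped with the package.
if __name__ == '__main__':
    # load a hypothetical source dataset and project it to the expected CRS
    example_gdf = gpd.read_file('example_data.shp').to_crs(saws_crs)
    # convert it to every SAWS resolution, summing the (hypothetical) 'population' attribute
    example_dict = convert_to_saws_resolutions(
        example_gdf,
        native_resolution='huc8',       # placeholder scale code
        geo_unit_col='huc8_id',         # placeholder identifier column
        aggregation_dict={'population': 'sum'},
    )
    for scale_code, scale_df in example_dict.items():
        print(scale_code, scale_df.shape)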