Skip to content

Commit 8222039

Browse files
frances-h and R-Palazzo authored
[DayZ Parameters] 'missing_values_proportion' should not be valid for any key columns (#2709)
Co-authored-by: R-Palazzo <[email protected]>
1 parent f7f4345 commit 8222039

File tree

3 files changed

53 additions and 17 deletions (lines changed)

3 files changed

53 additions and 17 deletions (lines changed)

sdv/single_table/dayz.py

Lines changed: 12 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -132,7 +132,7 @@ def _validate_categorical_parameters(column_parameters, column_table_msg):
132132
raise SynthesizerProcessingError(msg)
133133

134134

135-
def _validate_missing_value_parameters(column_parameters, column_table_msg):
135+
def _validate_missing_value_parameters(column_parameters, column_table_msg, is_key_column):
136136
missing_values_proportion = column_parameters['missing_values_proportion']
137137
if not _is_numerical(missing_values_proportion) or (
138138
missing_values_proportion < 0.0 or missing_values_proportion > 1.0
@@ -142,9 +142,15 @@ def _validate_missing_value_parameters(column_parameters, column_table_msg):
142142
'must be a float between 0.0 and 1.0.'
143143
)
144144
raise SynthesizerProcessingError(msg)
145+
elif is_key_column and missing_values_proportion != 0:
146+
msg = (
147+
f"Invalid 'missing_values_proportion' parameter for {column_table_msg}. Primary "
148+
"and alternate keys must have 'missing_values_proportion' parameter set to zero."
149+
)
150+
raise SynthesizerProcessingError(msg)
145151

146152

147-
def _validate_column_parameters(table, column, column_metadata, column_parameters):
153+
def _validate_column_parameters(table, column, column_metadata, column_parameters, is_key_column):
148154
column_table_msg = f"column '{column}' in table '{table}'"
149155
sdtype = column_metadata['sdtype']
150156
sdtype_parameters = SDTYPE_TO_PARAMETERS.get(sdtype, COLUMN_PARAMETER_KEYS)
@@ -165,7 +171,7 @@ def _validate_column_parameters(table, column, column_metadata, column_parameter
165171
_validate_categorical_parameters(column_parameters, column_table_msg)
166172

167173
if 'missing_values_proportion' in column_parameters:
168-
_validate_missing_value_parameters(column_parameters, column_table_msg)
174+
_validate_missing_value_parameters(column_parameters, column_table_msg, is_key_column)
169175

170176

171177
def _validate_table_parameters(table, table_metadata, table_parameters):
@@ -186,9 +192,11 @@ def _validate_table_parameters(table, table_metadata, table_parameters):
186192
)
187193
raise SynthesizerProcessingError(msg)
188194

195+
key_columns = table_metadata._get_primary_and_alternate_keys()
189196
for column, column_parameters in table_parameters.get('columns', {}).items():
197+
is_key_column = column in key_columns
190198
_validate_column_parameters(
191-
table, column, table_metadata.columns[column], column_parameters
199+
table, column, table_metadata.columns[column], column_parameters, is_key_column
192200
)
193201

194202

tests/unit/single_table/test__dayz_utils.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,20 +27,26 @@ def test_detect_column_parameter():
2727
"""Test the `detect_column_parameters` method."""
2828
# Setup
2929
data = pd.DataFrame({
30+
'pk': [0, 1, 2, 3],
3031
'num_col': [1.0, 2.5, 3.0, None],
3132
'cat_col': ['A', 'B', 'A', None],
3233
'date_col': ['2020-01-01', '2020-01-02', None, None],
3334
'date_col_2': ['2020 Jan 01', '2020 Jan 02', '2020 Jan 03', None],
35+
'alt_key': ['id0', 'id1', 'id2', 'id3'],
3436
})
3537
metadata = Metadata.load_from_dict({
3638
'tables': {
3739
'table_name': {
3840
'columns': {
41+
'pk': {'sdtype': 'id'},
3942
'num_col': {'sdtype': 'numerical'},
4043
'cat_col': {'sdtype': 'categorical'},
4144
'date_col': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
4245
'date_col_2': {'sdtype': 'datetime'},
43-
}
46+
'alt_key': {'sdtype': 'ssn'},
47+
},
48+
'primary_key': 'pk',
49+
'alternate_keys': ['alt_key'],
4450
}
4551
}
4652
})
@@ -50,6 +56,7 @@ def test_detect_column_parameter():
5056
# Assert
5157
assert result == {
5258
'columns': {
59+
'pk': {'missing_values_proportion': 0.0},
5360
'num_col': {
5461
'num_decimal_digits': 1,
5562
'min_value': 1.0,
@@ -70,6 +77,7 @@ def test_detect_column_parameter():
7077
'end_timestamp': '2020-01-03 00:00:00',
7178
'missing_values_proportion': 0.25,
7279
},
80+
'alt_key': {'missing_values_proportion': 0.0},
7381
}
7482
}
7583

tests/unit/single_table/test_dayz.py

Lines changed: 32 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,8 @@ def metadata():
2828
'categorical': {'sdtype': 'categorical'},
2929
'pii': {'sdtype': 'ssn'},
3030
'extra_column': {'sdtype': 'numerical'},
31-
}
31+
},
32+
'primary_key': 'id',
3233
}
3334
}
3435
})
@@ -97,21 +98,31 @@ def test__validate_column_parameter():
9798
column_metadata = {'sdtype': 'id'}
9899
bad_column_parameters = {'invalid_key': None}
99100
bad_missing_value = {'missing_values_proportion': 100}
101+
bad_key_missing_value = {'missing_values_proportion': 0.5}
100102

101103
# Run and Assert
102104
expected_bad_column_msg = re.escape(
103105
"The parameters for column 'column' in table 'table' contains unexpected "
104106
"key(s) 'invalid_key'."
105107
)
106108
with pytest.raises(SynthesizerProcessingError, match=expected_bad_column_msg):
107-
_validate_column_parameters('table', 'column', column_metadata, bad_column_parameters)
109+
_validate_column_parameters(
110+
'table', 'column', column_metadata, bad_column_parameters, False
111+
)
108112

109113
expected_bad_missing_value_msg = re.escape(
110114
"The 'missing_values_proportion' parameter for column 'column' in table 'table' "
111115
'must be a float between 0.0 and 1.0.'
112116
)
113117
with pytest.raises(SynthesizerProcessingError, match=expected_bad_missing_value_msg):
114-
_validate_column_parameters('table', 'column', column_metadata, bad_missing_value)
118+
_validate_column_parameters('table', 'column', column_metadata, bad_missing_value, False)
119+
120+
expected_missing_values_with_key_msg = re.escape(
121+
"Invalid 'missing_values_proportion' parameter for column 'column' in table 'table'. "
122+
"Primary and alternate keys must have 'missing_values_proportion' parameter set to zero."
123+
)
124+
with pytest.raises(SynthesizerProcessingError, match=expected_missing_values_with_key_msg):
125+
_validate_column_parameters('table', 'column', column_metadata, bad_key_missing_value, True)
115126

116127

117128
def test__validate_column_parameters_numerical():
@@ -127,21 +138,25 @@ def test__validate_column_parameters_numerical():
127138
"The 'min_value' parameter for column 'column' in table 'table' must be a float."
128139
)
129140
with pytest.raises(SynthesizerProcessingError, match=expected_bad_parameter_value_msg):
130-
_validate_column_parameters('table', 'column', column_metadata, bad_parameter_value)
141+
_validate_column_parameters('table', 'column', column_metadata, bad_parameter_value, False)
131142

132143
expected_bad_min_max_msg = re.escape(
133144
"Invalid parameters for column 'column' in table 'table'. The 'min_value' "
134145
"must be less than or equal to the 'max_value'"
135146
)
136147
with pytest.raises(SynthesizerProcessingError, match=expected_bad_min_max_msg):
137-
_validate_column_parameters('table', 'column', column_metadata, bad_min_max_combination)
148+
_validate_column_parameters(
149+
'table', 'column', column_metadata, bad_min_max_combination, False
150+
)
138151

139152
expected_bad_num_decimal_digits_msg = re.escape(
140153
"The 'num_decimal_digits' parameter for column 'column' in table 'table' must be an "
141154
'integer greater than or equal to zero.'
142155
)
143156
with pytest.raises(SynthesizerProcessingError, match=expected_bad_num_decimal_digits_msg):
144-
_validate_column_parameters('table', 'column', column_metadata, bad_num_decimal_digits)
157+
_validate_column_parameters(
158+
'table', 'column', column_metadata, bad_num_decimal_digits, False
159+
)
145160

146161

147162
def test__validate_column_parameters_datetime():
@@ -157,28 +172,32 @@ def test__validate_column_parameters_datetime():
157172
"The 'start_timestamp' parameter for column 'column' in table 'table' must be a string."
158173
)
159174
with pytest.raises(SynthesizerProcessingError, match=expected_bad_parameter_value_msg):
160-
_validate_column_parameters('table', 'column', column_metadata, bad_parameter_value)
175+
_validate_column_parameters('table', 'column', column_metadata, bad_parameter_value, False)
161176

162177
expected_bad_datetime_value_msg = re.escape(
163178
"The 'start_timestamp' parameter for column 'column' in table 'table' is not a valid "
164179
'datetime string or does not match the date time format (%d %b %Y).'
165180
)
166181
with pytest.raises(SynthesizerProcessingError, match=expected_bad_datetime_value_msg):
167-
_validate_column_parameters('table', 'column', column_metadata, bad_datetime_value)
182+
_validate_column_parameters('table', 'column', column_metadata, bad_datetime_value, False)
168183

169184
expected_bad_value_no_format_msg = re.escape(
170185
"The 'start_timestamp' parameter for column 'column' in table 'table' is not a "
171186
'valid datetime string.'
172187
)
173188
with pytest.raises(SynthesizerProcessingError, match=expected_bad_value_no_format_msg):
174-
_validate_column_parameters('table', 'column', {'sdtype': 'datetime'}, bad_datetime_value)
189+
_validate_column_parameters(
190+
'table', 'column', {'sdtype': 'datetime'}, bad_datetime_value, False
191+
)
175192

176193
expected_bad_start_end_msg = re.escape(
177194
"Invalid parameters for column 'column' in table 'table'. The 'start_timestamp' "
178195
"must be less than the 'end_timestamp'"
179196
)
180197
with pytest.raises(SynthesizerProcessingError, match=expected_bad_start_end_msg):
181-
_validate_column_parameters('table', 'column', column_metadata, bad_start_end_combination)
198+
_validate_column_parameters(
199+
'table', 'column', column_metadata, bad_start_end_combination, False
200+
)
182201

183202

184203
def test__validate_column_parameters_categorical():
@@ -192,7 +211,7 @@ def test__validate_column_parameters_categorical():
192211
"The 'category_values' parameter for column 'column' in table 'table' must be a list."
193212
)
194213
with pytest.raises(SynthesizerProcessingError, match=expected_msg):
195-
_validate_column_parameters('table', 'column', column_metadata, bad_category_values)
214+
_validate_column_parameters('table', 'column', column_metadata, bad_category_values, False)
196215

197216

198217
@patch('sdv.single_table.dayz._validate_column_parameters')
@@ -202,6 +221,7 @@ def test__validate_table_parameters(mock__validate_column_parameters, metadata,
202221
table_metadata = metadata.tables['table']
203222
bad_table_columns = {'columns': {'bad_column': {}}}
204223
bad_num_rows = {'num_rows': -1}
224+
keys = ['id']
205225

206226
# Run and Assert
207227
expected_bad_column_msg = re.escape(
@@ -222,7 +242,7 @@ def test__validate_table_parameters(mock__validate_column_parameters, metadata,
222242

223243
# Assert
224244
expected_calls = [
225-
call('table', col, table_metadata.columns[col], col_parameters)
245+
call('table', col, table_metadata.columns[col], col_parameters, col in keys)
226246
for col, col_parameters in dayz_parameters['tables']['table']['columns'].items()
227247
]
228248
mock__validate_column_parameters.assert_has_calls(expected_calls)

0 commit comments

Comments
 (0)