@@ -18,6 +18,79 @@ def detect_table_parameters(data):
1818 return {'num_rows' : len (data )}
1919
2020
21+ def _compute_missing_values_proportion (series ):
22+ """Compute missing value proportion with a safe fallback for empty series."""
23+ if len (series ) == 0 :
24+ return 0.0
25+
26+ value = float (series .isna ().mean ())
27+ return 0.0 if pd .isna (value ) else value
28+
29+
30+ def _detect_numerical_column_parameters (series ):
31+ """Detect numerical-specific parameters with fallbacks when undetectable.
32+
33+ Returns only keys that can be reliably detected (no None values).
34+ """
35+ params = {}
36+ non_null = series .dropna ()
37+ if non_null .empty :
38+ return params
39+
40+ try :
41+ num_decimal_digits = learn_rounding_digits (series )
42+ if isinstance (num_decimal_digits , int ) and num_decimal_digits >= 0 :
43+ params ['num_decimal_digits' ] = num_decimal_digits
44+ except Exception :
45+ pass
46+
47+ min_value = non_null .min ()
48+ max_value = non_null .max ()
49+ if not pd .isna (min_value ):
50+ params ['min_value' ] = min_value .item () if hasattr (min_value , 'item' ) else float (min_value )
51+ if not pd .isna (max_value ):
52+ params ['max_value' ] = max_value .item () if hasattr (max_value , 'item' ) else float (max_value )
53+
54+ return params
55+
56+
57+ def _detect_datetime_column_parameters (series , column_metadata ):
58+ """Detect datetime-specific parameters with fallbacks when undetectable.
59+
60+ Returns only keys that can be reliably detected (no None values).
61+ """
62+ params = {}
63+ datetime_format = column_metadata .get ('datetime_format' , None )
64+ if datetime_format :
65+ datetime_column = pd .to_datetime (series , format = datetime_format , errors = 'coerce' )
66+ else :
67+ datetime_column = pd .to_datetime (series , errors = 'coerce' )
68+
69+ non_na = datetime_column [~ pd .isna (datetime_column )]
70+ if non_na .empty :
71+ return params
72+
73+ start_dt = non_na .min ()
74+ end_dt = non_na .max ()
75+ if datetime_format :
76+ params ['start_timestamp' ] = start_dt .strftime (datetime_format )
77+ params ['end_timestamp' ] = end_dt .strftime (datetime_format )
78+ else :
79+ params ['start_timestamp' ] = start_dt .strftime ('%Y-%m-%d %H:%M:%S' )
80+ params ['end_timestamp' ] = end_dt .strftime ('%Y-%m-%d %H:%M:%S' )
81+
82+ return params
83+
84+
85+ def _detect_categorical_column_parameters (series ):
86+ """Detect categorical/boolean parameters."""
87+ categorical_values = series .dropna ().unique ()
88+ if len (categorical_values ) == 0 :
89+ return {}
90+
91+ return {'category_values' : categorical_values .tolist ()}
92+
93+
2194def detect_column_parameters (data , metadata , table_name ):
2295 """Detect all column-level Dayz parameters.
2396
@@ -37,46 +110,28 @@ def detect_column_parameters(data, metadata, table_name):
37110 table_metadata = metadata .tables [table_name ]
38111 column_parameters = {}
39112 for column_name , column_metadata in table_metadata .columns .items ():
40- column_parameters [column_name ] = {}
41113 sdtype = column_metadata ['sdtype' ]
114+ params = {}
42115 if sdtype == 'numerical' :
43- column_parameters [column_name ] = {
44- 'num_decimal_digits' : learn_rounding_digits (data [column_name ]),
45- 'min_value' : data [column_name ].min (),
46- 'max_value' : data [column_name ].max (),
47- }
116+ params .update (_detect_numerical_column_parameters (data [column_name ]))
48117 elif sdtype == 'datetime' :
49- datetime_format = column_metadata .get ('datetime_format' , None )
50- if datetime_format :
51- datetime_column = pd .to_datetime (
52- data [column_name ], format = datetime_format , errors = 'coerce'
53- )
54- start_timestamp = datetime_column .min ().strftime (datetime_format )
55- end_timestamp = datetime_column .max ().strftime (datetime_format )
56-
57- else :
58- datetime_column = pd .to_datetime (data [column_name ], errors = 'coerce' )
59- start_timestamp = str (datetime_column .min ())
60- end_timestamp = str (datetime_column .max ())
61-
62- column_parameters [column_name ] = {
63- 'start_timestamp' : start_timestamp ,
64- 'end_timestamp' : end_timestamp ,
65- }
118+ params .update (_detect_datetime_column_parameters (data [column_name ], column_metadata ))
66119 elif sdtype == 'categorical' :
67- column_parameters [column_name ] = {
68- 'category_values' : data [column_name ].dropna ().unique ().tolist ()
69- }
120+ params .update (_detect_categorical_column_parameters (data [column_name ]))
70121
71- column_parameters [column_name ]['missing_values_proportion' ] = float (
72- data [column_name ].isna ().mean ()
73- )
122+ params ['missing_values_proportion' ] = _compute_missing_values_proportion (data [column_name ])
123+ column_parameters [column_name ] = params
74124
75125 return {'columns' : column_parameters }
76126
77127
78128def create_parameters (data , metadata , output_filename ):
79129 """Detect and create a parameter dict for the DayZ model."""
130+ if len (data ) == 0 :
131+ raise ValueError ('Data is empty' )
132+ if len (metadata .tables ) == 0 :
133+ raise ValueError ('Metadata is empty' )
134+
80135 metadata .validate ()
81136 datas = data if isinstance (data , dict ) else {metadata ._get_single_table_name (): data }
82137 metadata .validate_data (datas )
0 commit comments