From f2b6a5c1a5bf9856f35cf506fe62747095940401 Mon Sep 17 00:00:00 2001
From: Coerulatus <marcomontagna21@gmail.com>
Date: Wed, 8 May 2024 16:40:46 +0000
Subject: [PATCH] substituted nan with means

---
 topobenchmarkx/io/load/us_county_demos.py | 24 ++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/topobenchmarkx/io/load/us_county_demos.py b/topobenchmarkx/io/load/us_county_demos.py
index 7c267393..f650adcb 100644
--- a/topobenchmarkx/io/load/us_county_demos.py
+++ b/topobenchmarkx/io/load/us_county_demos.py
@@ -36,7 +36,17 @@ def load_us_county_demos(path, year=2012, y_col="Election"):
         "BachelorRate",
         "UnemploymentRate",
     ]
-    # Drop rows with missing values
+    
+    # Select columns, replace ',' with '.' and convert to numeric
+    stat = stat.loc[:, keep_cols]
+    stat["MedianIncome"] = stat["MedianIncome"].replace(',','.', regex=True)
+    stat = stat.apply(pd.to_numeric, errors='coerce')
+    
+    # Replace NaN values in every column except FIPS with that column's mean
+    for column in stat.columns:
+        if column != "FIPS":
+            mean_value = stat[column].mean()
+            stat[column].fillna(mean_value, inplace=True)
     stat = stat[keep_cols].dropna()
 
     # Delete edges that are not present in stat df
@@ -103,12 +113,12 @@ def load_us_county_demos(path, year=2012, y_col="Election"):
 
     x_col = list(set(stat.columns).difference(set([y_col])))
 
-    stat["MedianIncome"] = (
-        stat["MedianIncome"]
-        .apply(lambda x: x.replace(",", ""))
-        .to_numpy()
-        .astype(float)
-    )
+    # stat["MedianIncome"] = (
+    #     stat["MedianIncome"]
+    #     .apply(lambda x: x.replace(",", ""))
+    #     .to_numpy()
+    #     .astype(float)
+    # )
 
     x = torch.tensor(stat[x_col].to_numpy(), dtype=torch.float32)
     y = torch.tensor(stat[y_col].to_numpy(), dtype=torch.float32)