-
Notifications
You must be signed in to change notification settings - Fork 0
/
json_ohe.py
41 lines (35 loc) · 1.53 KB
/
json_ohe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import json
# Preprocess the column to parse strings into dictionaries
def preprocess_column(column):
cleaned_data = []
for row in column.dropna():
cleaned_row = []
for item in row.split(','): # Split by commas
try:
# Replace single quotes with double quotes and strip whitespace
cleaned_item = item.strip().replace("'", '"')
# Parse as a dictionary
parsed_dict = json.loads(cleaned_item)
cleaned_row.append(parsed_dict)
except json.JSONDecodeError:
pass # Skip invalid entries
cleaned_data.append(cleaned_row)
return cleaned_data
# Step 1: Preprocess the column
filtered_df[column_name] = preprocess_column(filtered_df[column_name])
# Step 2: Extract all unique keys from the cleaned dictionaries
def extract_keys_from_preprocessed(column):
keys = set()
for row in column.dropna():
for dictionary in row: # Iterate over parsed dictionaries
if isinstance(dictionary, dict): # Ensure it's a dictionary
keys.update(dictionary.keys()) # Add keys to the set
return keys
unique_keys = extract_keys_from_preprocessed(filtered_df[column_name])
# Step 3: Create one-hot encoded columns for each key
for key in unique_keys:
filtered_df[f"{column_name}_{key}"] = filtered_df[column_name].apply(
lambda x: 1 if any(key in dictionary for dictionary in x if isinstance(dictionary, dict)) else 0
)
# Display the resulting DataFrame
print(filtered_df)