# Approach2_Generic_EDA.py
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
# Initialize the transformer model for generating embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
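# Note: all-MiniLM-L6-v2 maps sentences to 384-dimensional dense vectors and
# is downloaded from the Hugging Face Hub on first use.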
def perform_detailed_textual_eda(dataset_path, n_components=2):
    try:
        # Load dataset
        df = pd.read_csv(dataset_path)
    except Exception as e:
        return f"Error loading dataset: {e}"

    # Initialize a description string
    description = ""

    # Basic Information
    description += "--- Basic Information ---\n"
    description += f"Number of Rows: {df.shape[0]}\n"
    description += f"Number of Columns: {df.shape[1]}\n"
    description += "\nColumn Names:\n"
    description += ", ".join(df.columns.tolist()) + "\n\n"

    # Data Types
    description += "--- Data Types ---\n"
    description += df.dtypes.to_string() + "\n\n"

    # Missing Values
    description += "--- Missing Values ---\n"
    missing_values = df.isnull().sum()
    missing_percentage = (missing_values / len(df)) * 100
    missing_summary = pd.DataFrame({
        "Missing Values": missing_values,
        "Percentage": missing_percentage
    })
    description += missing_summary.to_string() + "\n\n"

    # Constant Columns
    constant_columns = [col for col in df.columns if df[col].nunique() == 1]
    description += "--- Constant Columns ---\n"
    if constant_columns:
        description += (
            f"The dataset contains {len(constant_columns)} constant column(s): "
            f"{', '.join(constant_columns)}\n\n"
        )
    else:
        description += "No constant columns detected.\n\n"

    # Duplicate Rows
    duplicate_rows = df.duplicated().sum()
    description += "--- Duplicate Rows ---\n"
    description += f"Number of duplicate rows: {duplicate_rows}\n\n"

    # Summary Statistics (Numeric Columns)
    description += "--- Summary Statistics (Numeric Columns) ---\n"
    numeric_summary = df.describe().T
    description += numeric_summary.to_string() + "\n\n"

    # Skewness and Kurtosis
    description += "--- Skewness and Kurtosis (Numeric Columns) ---\n"
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    for column in numeric_columns:
        skewness = df[column].skew()
        kurtosis = df[column].kurt()
        description += f"{column}: Skewness = {skewness:.2f}, Kurtosis = {kurtosis:.2f}\n"
    description += "\n"

    # Unique Values Per Column
    description += "--- Unique Values Per Column ---\n"
    unique_values = {col: df[col].nunique() for col in df.columns}
    for col, count in unique_values.items():
        description += f"{col}: {count} unique values\n"
    description += "\n"

    # Value Counts for Categorical Columns
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    description += "--- Value Counts (Categorical Columns) ---\n"
    for column in categorical_columns:
        description += f"\nColumn: {column}\n"
        description += df[column].value_counts().to_string() + "\n"
    description += "\n"

    # Correlation Analysis
    description += "--- Correlation Analysis (Numeric Columns) ---\n"
    if not numeric_columns.empty:
        correlation_matrix = df[numeric_columns].corr()
        description += "Correlation Matrix:\n"
        description += correlation_matrix.to_string() + "\n\n"

        # Significant Correlations (Above Threshold)
        # Compare against the absolute value so that strong negative
        # correlations are reported as well, matching the >|threshold| label.
        significant_threshold = 0.7
        significant_correlations = correlation_matrix[
            (correlation_matrix.abs() > significant_threshold) & (correlation_matrix != 1)
        ]
        description += f"Significant correlations (>|{significant_threshold}|):\n"
        description += significant_correlations.to_string() + "\n\n"
    else:
        description += "No numeric columns available for correlation analysis.\n\n"

    # Outlier Detection (Using the 1.5 * IQR rule)
    description += "--- Outlier Detection (Numeric Columns) ---\n"
    outlier_summary = {}
    for column in numeric_columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = ((df[column] < lower_bound) | (df[column] > upper_bound)).sum()
        outlier_summary[column] = outliers
    for column, outliers in outlier_summary.items():
        description += f"{column}: {outliers} outlier(s) detected\n"
    description += "\n"

    # One Hot Encoding for Categorical Columns
    description += "--- One Hot Encoding ---\n"
    if not categorical_columns.empty:
        # sparse_output=False (scikit-learn >= 1.2) replaces the removed
        # sparse=False argument and returns a dense array.
        encoder = OneHotEncoder(sparse_output=False)
        encoded_df = pd.DataFrame(encoder.fit_transform(df[categorical_columns]))
        encoded_df.columns = encoder.get_feature_names_out(categorical_columns)
        df = df.drop(columns=categorical_columns).reset_index(drop=True)
        df = pd.concat([df, encoded_df], axis=1)
        description += "One hot encoding applied to categorical columns.\n\n"
    else:
        description += "No categorical columns available for one hot encoding.\n\n"

    # PCA for Dimensionality Reduction
    description += "--- PCA for Dimensionality Reduction ---\n"
    # PCA cannot handle NaNs, so impute any remaining missing values with the
    # column median before projecting.
    df = df.fillna(df.median(numeric_only=True))
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(df)
    pca_df = pd.DataFrame(data=principal_components, columns=[f'PC{i+1}' for i in range(n_components)])
    description += (
        f"PCA applied with {n_components} components. "
        f"Explained variance ratio: {pca.explained_variance_ratio_}\n\n"
    )

    description += "Extended EDA Completed."
    return description
def generate_vector_embeddings(sentences):
    # Encode a list of strings into dense vectors with the loaded model.
    embeddings = model.encode(sentences)
    return embeddings
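# Note: SentenceTransformer.encode returns a single 1-D vector for a lone
# string and a 2-D array of shape (len(sentences), embedding_dim) for a list,
# so pass a list here to keep the batch dimension consistent.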
dataset_path = '/Users/ballu_macbookpro/Downloads/titanic.csv'
eda_description = perform_detailed_textual_eda(dataset_path)
print(eda_description)
# Generate vector embeddings for the individual lines of the EDA summary,
# skipping blank lines so we do not embed empty strings.
extracted_values = [line for line in eda_description.split('\n') if line.strip()]
embeddings = generate_vector_embeddings(extracted_values)
print("\n--- Vector Embeddings ---\n")
for value, embedding in zip(extracted_values, embeddings):
    print(f"Value: {value}\nEmbedding: {embedding}\n")