# a2_falcification_SVM.py
# %%
import numpy as np
import os
import pandas as pd
import sup_fun as spf
import pygimli as pg
import pybert as pb
from pygimli.physics import ert
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
# Load the data
saturation_maps = np.load("saturation_maps.npy")
resistivity_maps = np.load("resistivity_maps.npy")
total_water_contents = np.load("total_water_contents.npy")
data_dd_rhoa_array = np.load("data_dd.npy")
data_slm_rhoa_array = np.load("data_slm.npy")
apparent_resistivity = np.concatenate([data_dd_rhoa_array, data_slm_rhoa_array], axis=1)
print(f"Saturation Maps Shape: {saturation_maps.shape}")
print(f"Resistivity Maps Shape: {resistivity_maps.shape}")
print(f"Total Water Contents Shape: {total_water_contents.shape}")
print(f"DD Rhoa Array Shape: {data_dd_rhoa_array.shape}")
print(f"SLM Rhoa Array Shape: {data_slm_rhoa_array.shape}")
print(f"All Rhoa Array Shape: {apparent_resistivity.shape}")
# %% Build the measurement schemes and keep only the configurations with shared abmn, so a time-lapse comparison is possible
x_ini = 0.0
end_line = 142.0
ne = 72
scheme_slm_base = pb.createData(elecs=pg.utils.grange(start=x_ini, end=end_line, n=ne),
                                schemeName='slm')
scheme_dd_base = pb.createData(elecs=pg.utils.grange(start=x_ini, end=end_line, n=ne), schemeName='dd')
spf.dd_generation(scheme_dd_base, max_geom_factor=5000)
sensor_df_dd = pd.DataFrame(columns=['a','b','m','n'])
for ii in ['a','b','m','n']:
    sensor_df_dd[ii] = scheme_dd_base[ii]
sensor_df_slm = pd.DataFrame(columns=['a','b','m','n'])
for ii in ['a','b','m','n']:
    sensor_df_slm[ii] = scheme_slm_base[ii]
sensor_df = pd.concat([sensor_df_dd,sensor_df_slm], ignore_index=True)
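# Hypothetical consistency check (assumption, not part of the original workflow): the combined
# DD + SLM scheme should provide one row per column of the simulated apparent-resistivity array.
assert len(sensor_df) == apparent_resistivity.shape[1], "scheme/column count mismatch"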
#
# import time sequence dataframe file
with open('./process_data/time_sequence_data.pkl', 'rb') as f:
    BB11Z, BB11N, BB_long = pd.read_pickle(f)
BB11Z = BB11Z.dropna()
# %%
# Load the pickle files
BB11Z_05 = pd.read_pickle('BB11Z_05.pkl')
BB11Z_06 = pd.read_pickle('BB11Z_06.pkl')
BB11Z_07 = pd.read_pickle('BB11Z_07.pkl')
BB11Z_02 = pd.read_pickle('BB11Z_02.pkl')
BB11Z_02 = BB11Z_02.dropna()
# Select only the common columns [a, b, m, n]
sensor_common = sensor_df[['a', 'b', 'm', 'n']]
BB11Z_05_common = BB11Z_05[['a', 'b', 'm', 'n']]
BB11Z_06_common = BB11Z_06[['a', 'b', 'm', 'n']]
BB11Z_07_common = BB11Z_07[['a', 'b', 'm', 'n']]
BB11Z_02_common = BB11Z_02[['a', 'b', 'm', 'n']]
# Inner-merge across all DataFrames to keep only the rows with the same [a, b, m, n] values
merged_df = sensor_common.merge(BB11Z_05_common, on=['a', 'b', 'm', 'n'], how='inner')
merged_df = merged_df.merge(BB11Z_06_common, on=['a', 'b', 'm', 'n'], how='inner')
merged_df = merged_df.merge(BB11Z_07_common, on=['a', 'b', 'm', 'n'], how='inner')
merged_df = merged_df.merge(BB11Z_02_common, on=['a', 'b', 'm', 'n'], how='inner')
# Get the indexes from sensor_df for these matching rows
matching_indexes = sensor_df.reset_index().merge(merged_df, on=['a', 'b', 'm', 'n'], how='inner')['index']
# matching_indexes holds the positions of the shared configurations within sensor_df
# Filter the original DataFrames to keep only the rows with the common [a, b, m, n]
BB11Z_05_filtered = BB11Z_05.merge(merged_df, on=['a', 'b', 'm', 'n'], how='inner')
BB11Z_06_filtered = BB11Z_06.merge(merged_df, on=['a', 'b', 'm', 'n'], how='inner')
BB11Z_07_filtered = BB11Z_07.merge(merged_df, on=['a', 'b', 'm', 'n'], how='inner')
BB11Z_02_filtered = BB11Z_02.merge(merged_df, on=['a', 'b', 'm', 'n'], how='inner')
# Restrict the simulated apparent resistivities to the shared configurations and drop
# realisations containing NaNs; this array becomes the training data for the one-class SVM
sensor_df_filtered = apparent_resistivity[:, matching_indexes.values]
sensor_df_filtered = sensor_df_filtered[~np.isnan(sensor_df_filtered).any(axis=1)]
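# Optional sanity check (not in the original script): report how many configurations were
# matched across surveys and how many simulated realisations survive the NaN filter.
print(f"Shared abmn configurations: {len(matching_indexes)}")
print(f"Training realisations after NaN removal: {sensor_df_filtered.shape[0]}")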
# %%
# print(f'sam_mean={sensor_df_filtered.flatten().mean()}')
# print(f'sam_std={sensor_df_filtered.flatten().std()}')
# Compare the distribution of the simulated apparent resistivities (first 10 realisations)
# with the measured values from one of the field surveys.
fig1, ax1 = plt.subplots()
data = pd.Series(sensor_df_filtered[:10, :].flatten())
sns.histplot(data.dropna(), stat='density', ax=ax1)
sns.histplot(BB11Z_06_filtered['rhoa'], stat='density', ax=ax1)
ax1.set_xlim([0, 1000])
# %% Apply a one-class SVM for falsification: the simulated apparent resistivities serve as
# the training (inlier) distribution and the field surveys are the test samples.
X_train = sensor_df_filtered # Shape: (50000, 3661)
# Extract 'rhoa' values from each dataframe and convert to NumPy arrays
test_sample_1 = BB11Z_05_filtered['rhoa'].values # Shape: (3661,)
test_sample_2 = BB11Z_06_filtered['rhoa'].values # Shape: (3661,)
test_sample_3 = BB11Z_07_filtered['rhoa'].values # Shape: (3661,)
test_sample_4 = BB11Z_02_filtered['rhoa'].values # Shape: (3661,)
# Combine the test samples into a single array
X_test = np.vstack([test_sample_1, test_sample_2, test_sample_3, test_sample_4]) # Shape: (4, 3661)
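# Optional check (illustrative, not part of the original script): the stacked field samples
# must be free of NaNs, otherwise the scaling and SVM prediction below will fail.
print(f"NaNs in X_test: {np.isnan(X_test).sum()}")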
# * Scaling is crucial for SVM and PCA.
# * We’ll fit the scaler on the training data and transform both training and test data.
# Initialize the scaler
scaler = StandardScaler()
# Fit the scaler on the training data
X_train_scaled = scaler.fit_transform(X_train)
# Transform the test data
X_test_scaled = scaler.transform(X_test)
# * Apply Dimensionality Reduction
# Initialize PCA to retain 95% of the variance
pca = PCA(n_components=0.95)
# Fit PCA on the scaled training data
X_train_pca = pca.fit_transform(X_train_scaled)
# Transform the test data using the fitted PCA
X_test_pca = pca.transform(X_test_scaled)
# Check the reduced dimensions
print(f"Original number of features: {X_train.shape[1]}")
print(f"Reduced number of features after PCA: {X_train_pca.shape[1]}")
# %%
# Plot the data
import matplotlib.pyplot as plt
# You already have X_train_pca and X_test_pca, with 61 components
# We'll plot the first 5 principal components one by one, pairwise.
# Plot the first 5 components one by one (PC1 vs. PC2, PC1 vs. PC3, ..., PC4 vs. PC5)
test_samples_labels = ['BB11Z_05', 'BB11Z_06', 'BB11Z_07', 'BB11Z_02']
colors = ['red', 'green', 'orange', 'purple']
markers = ['o', 's', 'D', '^']
# Loop through the first 5 components and plot them pairwise
for i in range(5):
    for j in range(i+1, 5):
        plt.figure(figsize=(8, 6))
        # Plot the training data
        plt.scatter(X_train_pca[:, i], X_train_pca[:, j], c='lightblue', alpha=0.5, label='Training Data')
        # Plot the test data with different colors/markers
        for k, test_point in enumerate(X_test_pca):
            plt.scatter(test_point[i], test_point[j], color=colors[k], marker=markers[k], s=100, edgecolor='k', label=f'Test Sample {test_samples_labels[k]}')
        # Add labels and title
        plt.xlabel(f'Principal Component {i+1}')
        plt.ylabel(f'Principal Component {j+1}')
        plt.title(f'PC{i+1} vs PC{j+1}')
        plt.legend()
        plt.grid(True)
        # Show the plot
        plt.show()
# %%
# Train the one-class SVM model
# Initialize the One-Class SVM model
clf = OneClassSVM(gamma='auto')
# Fit the model on the PCA-transformed training data
clf.fit(X_train_pca)
# Predict whether each test sample is an inlier or an outlier
predictions = clf.predict(X_test_pca) # Returns 1 for inliers, -1 for outliers
# * Interpret and Output the Results
# Map predictions to meaningful labels
results = ['Inlier' if pred == 1 else 'Outlier' for pred in predictions]
# Create a DataFrame to display the results
test_samples = ['BB11Z_05', 'BB11Z_06', 'BB11Z_07', 'BB11Z_02']
output_df = pd.DataFrame({'Sample': test_samples, 'Prediction': results})
print(output_df)
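# A minimal extension (not part of the original script): decision_function returns a signed
# distance to the learned boundary, which is more informative than the binary label.
# Negative scores indicate stronger disagreement with the simulated training distribution;
# note the default nu=0.5 is rather loose, so tuning nu/gamma is a natural next step.
scores = clf.decision_function(X_test_pca)
for name, score in zip(test_samples, scores):
    print(f"{name}: decision score = {score:.3f}")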
# %%
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# Assume you already have your scaled data
# Run PCA on your scaled training data
pca = PCA().fit(X_train_scaled) # Fit PCA to capture all components
# Get explained variance ratio for each principal component
explained_variance_ratio = pca.explained_variance_ratio_
# Plot the explained variance (Scree Plot)
plt.figure(figsize=(10, 6))
plt.plot(np.cumsum(explained_variance_ratio), marker='o', linestyle='--', color='b')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Scree Plot: Cumulative Explained Variance by Principal Components')
plt.grid(True)
plt.show()
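# Optional follow-up (illustrative only): from the same fit, report how many components are
# needed to reach the 95% cumulative-variance threshold used earlier.
n_95 = int(np.argmax(np.cumsum(explained_variance_ratio) >= 0.95)) + 1
print(f"Components needed for 95% cumulative variance: {n_95}")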
# %%