# a2_falcification_SVM.py
# %%
import numpy as np
import os
import pandas as pd
import sup_fun as spf
import pygimli as pg
import pybert as pb
from pygimli.physics import ert
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
# Load the data
saturation_maps = np.load("saturation_maps.npy")
resistivity_maps = np.load("resistivity_maps.npy")
total_water_contents = np.load("total_water_contents.npy")
data_dd_rhoa_array = np.load("data_dd.npy")
data_slm_rhoa_array = np.load("data_slm.npy")
apparent_resistivity = np.concatenate([data_dd_rhoa_array, data_slm_rhoa_array], axis=1)
print(f"Saturation Maps Shape: {saturation_maps.shape}")
print(f"Resistivity Maps Shape: {resistivity_maps.shape}")
print(f"Total Water Contents Shape: {total_water_contents.shape}")
print(f"DD Rhoa Array Shape: {data_dd_rhoa_array.shape}")
print(f"SLM Rhoa Array Shape: {data_slm_rhoa_array.shape}")
print(f"All Rhoa Array Shape: {apparent_resistivity.shape}")
# %% Build the measurement schemes and keep only the configurations with shared abmn, so a time-lapse comparison is possible
x_ini = 0.0
end_line = 142.0
ne = 72
scheme_slm_base = pb.createData(elecs=pg.utils.grange(start=x_ini, end=end_line, n=ne),
                                schemeName='slm')
scheme_dd_base = pb.createData(elecs=pg.utils.grange(start=x_ini, end=end_line, n=ne), schemeName='dd')
spf.dd_generation(scheme_dd_base, max_geom_factor=5000)
sensor_df_dd = pd.DataFrame(columns=['a','b','m','n'])
for ii in ['a','b','m','n']:
    sensor_df_dd[ii] = scheme_dd_base[ii]
sensor_df_slm = pd.DataFrame(columns=['a','b','m','n'])
for ii in ['a','b','m','n']:
    sensor_df_slm[ii] = scheme_slm_base[ii]
sensor_df = pd.concat([sensor_df_dd,sensor_df_slm], ignore_index=True)
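# Hypothetical consistency check (assumption, not part of the original workflow): the combined
# DD + SLM scheme should provide one row per column of the simulated apparent-resistivity array.
assert len(sensor_df) == apparent_resistivity.shape[1], "scheme/column count mismatch"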
#
# import time sequence dataframe file
with open('./process_data/time_sequence_data.pkl', 'rb') as f:
    BB11Z, BB11N, BB_long = pd.read_pickle(f)
BB11Z = BB11Z.dropna()
# %%
# Load the pickle files
BB11Z_05 = pd.read_pickle('BB11Z_05.pkl')
BB11Z_06 = pd.read_pickle('BB11Z_06.pkl')
BB11Z_07 = pd.read_pickle('BB11Z_07.pkl')
BB11Z_02 = pd.read_pickle('BB11Z_02.pkl')
BB11Z_02 = BB11Z_02.dropna()
# Select only the common columns [a, b, m, n]
sensor_common = sensor_df[['a', 'b', 'm', 'n']]
BB11Z_05_common = BB11Z_05[['a', 'b', 'm', 'n']]
BB11Z_06_common = BB11Z_06[['a', 'b', 'm', 'n']]
BB11Z_07_common = BB11Z_07[['a', 'b', 'm', 'n']]
BB11Z_02_common = BB11Z_02[['a', 'b', 'm', 'n']]
# Inner-merge across all DataFrames to keep only the rows with the same [a, b, m, n] values
merged_df = sensor_common.merge(BB11Z_05_common, on=['a', 'b', 'm', 'n'], how='inner')
merged_df = merged_df.merge(BB11Z_06_common, on=['a', 'b', 'm', 'n'], how='inner')
merged_df = merged_df.merge(BB11Z_07_common, on=['a', 'b', 'm', 'n'], how='inner')
merged_df = merged_df.merge(BB11Z_02_common, on=['a', 'b', 'm', 'n'], how='inner')
# Get the indexes from sensor_df for these matching rows
matching_indexes = sensor_df.reset_index().merge(merged_df, on=['a', 'b', 'm', 'n'], how='inner')['index']
# matching_indexes holds the positions of the shared configurations within sensor_df
# Filter the original DataFrames to keep only the rows with the common [a, b, m, n]
BB11Z_05_filtered = BB11Z_05.merge(merged_df, on=['a', 'b', 'm', 'n'], how='inner')
BB11Z_06_filtered = BB11Z_06.merge(merged_df, on=['a', 'b', 'm', 'n'], how='inner')
BB11Z_07_filtered = BB11Z_07.merge(merged_df, on=['a', 'b', 'm', 'n'], how='inner')
BB11Z_02_filtered = BB11Z_02.merge(merged_df, on=['a', 'b', 'm', 'n'], how='inner')
# Restrict the simulated apparent resistivities to the shared configurations and drop
# realisations containing NaNs; this array becomes the training data for the one-class SVM
sensor_df_filtered = apparent_resistivity[:, matching_indexes.values]
sensor_df_filtered = sensor_df_filtered[~np.isnan(sensor_df_filtered).any(axis=1)]
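# Optional sanity check (not in the original script): report how many configurations were
# matched across surveys and how many simulated realisations survive the NaN filter.
print(f"Shared abmn configurations: {len(matching_indexes)}")
print(f"Training realisations after NaN removal: {sensor_df_filtered.shape[0]}")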
# %%
# print(f'sam_mean={sensor_df_filtered.flatten().mean()}')
# print(f'sam_std={sensor_df_filtered.flatten().std()}')
# Compare the distribution of the simulated apparent resistivities (first 10 realisations)
# with the measured values from one of the field surveys.
fig1, ax1 = plt.subplots()
data = pd.Series(sensor_df_filtered[:10, :].flatten())
sns.histplot(data.dropna(), stat='density', ax=ax1)
sns.histplot(BB11Z_06_filtered['rhoa'], stat='density', ax=ax1)
ax1.set_xlim([0, 1000])
# %% Apply a one-class SVM for falsification: the simulated apparent resistivities serve as
# the training (inlier) distribution and the field surveys are the test samples.
X_train = sensor_df_filtered # Shape: (50000, 3661)
# Extract 'rhoa' values from each dataframe and convert to NumPy arrays
test_sample_1 = BB11Z_05_filtered['rhoa'].values # Shape: (3661,)
test_sample_2 = BB11Z_06_filtered['rhoa'].values # Shape: (3661,)
test_sample_3 = BB11Z_07_filtered['rhoa'].values # Shape: (3661,)
test_sample_4 = BB11Z_02_filtered['rhoa'].values # Shape: (3661,)
# Combine the test samples into a single array
X_test = np.vstack([test_sample_1, test_sample_2, test_sample_3, test_sample_4]) # Shape: (4, 3661)
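# Optional check (illustrative, not part of the original script): the stacked field samples
# must be free of NaNs, otherwise the scaling and SVM prediction below will fail.
print(f"NaNs in X_test: {np.isnan(X_test).sum()}")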
# * Scaling is crucial for SVM and PCA.
# * We’ll fit the scaler on the training data and transform both training and test data.
# Initialize the scaler
scaler = StandardScaler()
# Fit the scaler on the training data
X_train_scaled = scaler.fit_transform(X_train)
# Transform the test data
X_test_scaled = scaler.transform(X_test)
# * Apply Dimensionality Reduction
# Initialize PCA to retain 95% of the variance
pca = PCA(n_components=0.95)
# Fit PCA on the scaled training data
X_train_pca = pca.fit_transform(X_train_scaled)
# Transform the test data using the fitted PCA
X_test_pca = pca.transform(X_test_scaled)
# Check the reduced dimensions
print(f"Original number of features: {X_train.shape[1]}")
print(f"Reduced number of features after PCA: {X_train_pca.shape[1]}")
# %%
# Plot the data
import matplotlib.pyplot as plt
# You already have X_train_pca and X_test_pca, with 61 components
# We'll plot the first 5 principal components one by one, pairwise.
# Plot the first 5 components one by one (PC1 vs. PC2, PC1 vs. PC3, ..., PC4 vs. PC5)
test_samples_labels = ['BB11Z_05', 'BB11Z_06', 'BB11Z_07', 'BB11Z_02']
colors = ['red', 'green', 'orange', 'purple']
markers = ['o', 's', 'D', '^']
# Loop through the first 5 components and plot them pairwise
for i in range(5):
    for j in range(i+1, 5):
        plt.figure(figsize=(8, 6))
        # Plot the training data
        plt.scatter(X_train_pca[:, i], X_train_pca[:, j], c='lightblue', alpha=0.5, label='Training Data')
        # Plot the test data with different colors/markers
        for k, test_point in enumerate(X_test_pca):
            plt.scatter(test_point[i], test_point[j], color=colors[k], marker=markers[k], s=100, edgecolor='k', label=f'Test Sample {test_samples_labels[k]}')
        # Add labels and title
        plt.xlabel(f'Principal Component {i+1}')
        plt.ylabel(f'Principal Component {j+1}')
        plt.title(f'PC{i+1} vs PC{j+1}')
        plt.legend()
        plt.grid(True)
        # Show the plot
        plt.show()
# %%
# Train the one-class SVM model
# Initialize the One-Class SVM model
clf = OneClassSVM(gamma='auto')
# Fit the model on the PCA-transformed training data
clf.fit(X_train_pca)
# Predict whether each test sample is an inlier or an outlier
predictions = clf.predict(X_test_pca) # Returns 1 for inliers, -1 for outliers
# * Interpret and Output the Results
# Map predictions to meaningful labels
results = ['Inlier' if pred == 1 else 'Outlier' for pred in predictions]
# Create a DataFrame to display the results
test_samples = ['BB11Z_05', 'BB11Z_06', 'BB11Z_07', 'BB11Z_02']
output_df = pd.DataFrame({'Sample': test_samples, 'Prediction': results})
print(output_df)
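# A minimal extension (not part of the original script): decision_function returns a signed
# distance to the learned boundary, which is more informative than the binary label.
# Negative scores indicate stronger disagreement with the simulated training distribution;
# note the default nu=0.5 is rather loose, so tuning nu/gamma is a natural next step.
scores = clf.decision_function(X_test_pca)
for name, score in zip(test_samples, scores):
    print(f"{name}: decision score = {score:.3f}")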
# %%
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# Assume you already have your scaled data
# Run PCA on your scaled training data
pca = PCA().fit(X_train_scaled) # Fit PCA to capture all components
# Get explained variance ratio for each principal component
explained_variance_ratio = pca.explained_variance_ratio_
# Plot the explained variance (Scree Plot)
plt.figure(figsize=(10, 6))
plt.plot(np.cumsum(explained_variance_ratio), marker='o', linestyle='--', color='b')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Scree Plot: Cumulative Explained Variance by Principal Components')
plt.grid(True)
plt.show()
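# Optional follow-up (illustrative only): from the same fit, report how many components are
# needed to reach the 95% cumulative-variance threshold used earlier.
n_95 = int(np.argmax(np.cumsum(explained_variance_ratio) >= 0.95)) + 1
print(f"Components needed for 95% cumulative variance: {n_95}")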
# %%