-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathe2e.py
71 lines (56 loc) · 3.17 KB
/
e2e.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
# Import custom transformers and utility classes
from transformations import LabelEncoderTransformer, StandardScalerTransformer
from trainer import ModelTrainer
from saver import ModelSaver
from test_train_split import DataSplitter
from dataUtils import load_data # Load data function from dataUtils.py
from dataAnalysis import data_overview # Import data overview function
def flatten_labels(labels):
    """Return *labels* in one-dimensional form.

    One-dimensional input (a Series, a 1-D array, a list, ...) is returned
    unchanged. Two-dimensional single-column input, such as an ``(n, 1)``
    array or a one-column DataFrame, is collapsed into a 1-D NumPy array so
    that it can be wrapped in a ``pd.Series`` afterwards.

    Args:
        labels: Target values as produced by the train/test splitter.

    Returns:
        ``labels`` itself when it is already 1-D, otherwise a 1-D NumPy
        array holding the values of its single column.

    Raises:
        ValueError: If ``labels`` is 2-D with more than one column, since
            flattening it would no longer line up with the feature rows.
    """
    if getattr(labels, "ndim", 1) <= 1:
        return labels
    values = pd.DataFrame(labels).to_numpy()
    if values.shape[1] != 1:
        raise ValueError(
            f"Expected a single target column, got shape {values.shape}"
        )
    return values.ravel()


def main():
    """Run the pipeline end to end: load, preprocess, split, train, save, evaluate."""
    # Load data configuration and dataset
    input_details = load_data()  # Load input details from configuration
    file_id = input_details['file_id']
    download_url = f"https://drive.google.com/uc?id={file_id}"
    df = pd.read_csv(download_url)  # Read data from the Google Drive URL

    # Step 1: Perform data overview and preprocessing
    df = data_overview(df)  # Apply data overview for basic EDA

    # Step 2: Create preprocessing pipeline (Label Encoding + Scaling)
    preprocessing_pipeline = Pipeline([
        ('label_encoder', LabelEncoderTransformer()),  # Label encoding stage
        ('scaling', StandardScalerTransformer()),  # Standard scaling stage
    ])

    # NOTE(review): the pipeline is fitted on the full dataset before the
    # train/test split, so test rows influence the fitted preprocessing.
    # Confirm this is intended.
    df_transformed = preprocessing_pipeline.fit_transform(df)

    # Split the dataset into training and testing sets
    splitter = DataSplitter(target_feature='fertilizer_name')
    X_train, X_test, y_train, y_test = splitter.fit_transform(df_transformed)

    # Flatten BEFORE building a Series: pd.Series rejects 2-D input and has
    # no .flatten() method, so flattening afterwards can never take effect.
    y_train = flatten_labels(y_train)
    y_test = flatten_labels(y_test)

    # Diagnostics on the training labels. The category conversion is kept
    # only so the printed summary stays the same as before.
    print('y_train data type:', y_train.dtype)
    y_train = pd.Series(y_train).astype('category')  # Convert to category type
    print('y_train after conversion to category:', y_train.dtype)
    print("Unique values in y_train:", y_train.unique())
    print("Value counts in y_train:", y_train.value_counts())
    print('Shape of y_train:', y_train.shape)
    print("Data type check for unique labels:", y_train.apply(type).unique())
    y_train = y_train.astype(int)  # Ensure it is treated as an integer
    print("y_train after conversion:", y_train.head())

    # Step 3: Initialize and train the model
    model_trainer = ModelTrainer(
        n_estimators=50,
        max_depth=10,
        min_samples_split=2,
        min_samples_leaf=1,
        criterion='gini',
        bootstrap=True,
    )
    model_trainer.fit(X_train, y_train)

    # Step 4: Save the model after training
    # NOTE(review): ModelSaver.transform is called with the test features and
    # the trained model; presumably it persists the model to `filename`.
    # Verify against saver.py.
    model_saver = ModelSaver(filename='final_rf_model.pkl')
    model_saver.transform(X_test, model_trainer.model)

    # Step 5: Make predictions using the test data
    y_pred = model_trainer.predict(X_test)
    print("Predictions:", y_pred)

    # Reverse label encoding for the predictions (encoded -> original labels)
    y_pred_df = pd.DataFrame(y_pred, columns=['fertilizer_name'])
    label_encoder = preprocessing_pipeline.named_steps['label_encoder']
    y_pred_original = label_encoder.inverse_transform(y_pred_df)
    print("Predicted Fertilizer Names:", y_pred_original)

    # Step 6: Evaluate model performance
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


if __name__ == "__main__":
    main()