# creditscoreeval.py

# -*- coding: utf-8 -*-
"""CreditScoreEval.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1LG2LjN-Z0K2IfeUDnrHd14Fp5iEArifa
"""

# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve

# Load the dataset (path under /content, as used by the Colab session this
# script was exported from).
data = pd.read_csv('/content/Preprocessed_Missing_dataset.csv')
# Display the first few rows of the dataset. A bare `data.head()` expression
# is only rendered inside a notebook cell; in a script it must be printed.
print(data.head())

# Display summary statistics
print(data.describe())

# Check data types and info. DataFrame.info() writes its report to stdout
# itself and returns None, so wrapping it in print() would emit a stray
# "None" line after the report.
data.info()

# Check for unique values in each column
print(data.nunique())

# Histogram grid, 30 bins per panel, on a 15x10 inch figure
data.hist(figsize=(15, 10), bins=30)
plt.tight_layout()
plt.show()

# Annotated heatmap of the pairwise correlations between numeric columns
numeric_data = data.select_dtypes(include='number')
correlations = numeric_data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlations, cmap='coolwarm', annot=True)
plt.show()

# Class balance of the target.
# Assuming 'Credit_Score' is the target column (adjust as needed).
# The Series is passed explicitly as `x=`: in current seaborn releases the
# first positional parameter of countplot is `data`, so a positional Series
# is not interpreted as the variable to count.
sns.countplot(x=data['Credit_Score'])
plt.title('Distribution of Credit Scores')
plt.show()

# Identify categorical columns (those pandas loaded with object dtype)
categorical_columns = data.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_columns)

from sklearn.preprocessing import LabelEncoder

# Columns that are always label-encoded, even when they were not loaded with
# object dtype. Replace with the actual categorical column names if needed.
explicit_categorical_columns = [
    'Occupation',
    'Type_of_Loan',
    'Credit_Mix',
    'Payment_of_Min_Amount',
    'Payment_Behaviour',
]

# Encode every column exactly once: the object-dtype columns first, then any
# explicitly listed column not already covered. Re-encoding an already
# encoded column yields the same integers, so a second pass is wasted work.
already_selected = set(categorical_columns)
columns_to_encode = list(categorical_columns) + [
    col for col in explicit_categorical_columns if col not in already_selected
]

# One fitted encoder per column, keyed by column name, so the integer codes
# can later be mapped back to the original labels with
# label_encoders[col].inverse_transform(...). Refitting a single shared
# encoder would keep only the mapping of the last column processed.
label_encoders = {}
for col in columns_to_encode:
    label_encoder = LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder

# Continue with your analysis and modeling
print(data.head())

# Separate the target column (y) from the remaining feature columns (X)
y = data['Credit_Score']
X = data.drop(columns=['Credit_Score'])

# Report both shapes as a sanity check on the split
print(f'Shape of X: {X.shape}')
print(f'Shape of y: {y.shape}')

# Hold out 30% of the rows for testing; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Initialize RandomForest Classifier
rf = RandomForestClassifier(random_state=42)

# Define hyperparameters grid for tuning.
# 'auto' is intentionally not listed under max_features: it has been removed
# from current scikit-learn releases (every fit using it fails parameter
# validation), and for classifiers it was only an alias of 'sqrt', so the
# search space is unchanged.
param_grid = {
    'n_estimators': [100, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 6, 10],
    'criterion': ['gini', 'entropy']
}

# Setup GridSearchCV for hyperparameter tuning: 5-fold cross-validation,
# scored by accuracy, using all available cores
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Print best parameters and best (mean cross-validated) score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best accuracy: {grid_search.best_score_}")

# Gradient Boosting base estimator with a fixed seed
gbc = GradientBoostingClassifier(random_state=42)

# Candidate hyperparameter values explored by the search
gbc_param_grid = dict(
    n_estimators=[100, 150],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 4],
)

# Exhaustive search scored by accuracy; 3 folds are used here (a reduced
# number of folds) and all cores run in parallel
gbc_grid_search = GridSearchCV(
    gbc,
    gbc_param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
)
gbc_grid_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated accuracy
print(f"Best parameters for GBC: {gbc_grid_search.best_params_}")
print(f"Best accuracy for GBC: {gbc_grid_search.best_score_}")

# Test-set predictions from the best RandomForest model found by the search
rf_best = grid_search.best_estimator_
y_pred_rf = rf_best.predict(X_test)

# Test-set predictions from the best GradientBoosting model found by the search
gbc_best = gbc_grid_search.best_estimator_
y_pred_gbc = gbc_best.predict(X_test)

# Accuracy, confusion matrix and per-class report for each model, RandomForest
# first and Gradient Boosting second
for accuracy_heading, predictions in (
    ("RandomForest Model Accuracy:", y_pred_rf),
    ("Gradient Boosting Model Accuracy:", y_pred_gbc),
):
    print(accuracy_heading, accuracy_score(y_test, predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    print("Classification Report:\n", classification_report(y_test, predictions))

from sklearn.metrics import auc, roc_auc_score, roc_curve

# Class-probability matrices (one column per class, in classes_ order),
# computed once and reused for both the aggregate score and every curve
rf_proba = rf_best.predict_proba(X_test)
gbc_proba = gbc_best.predict_proba(X_test)

# Calculate ROC-AUC using multi-class setting (one-vs-rest)
rf_roc_auc = roc_auc_score(y_test, rf_proba, multi_class='ovr')
gbc_roc_auc = roc_auc_score(y_test, gbc_proba, multi_class='ovr')

print(f"Random Forest ROC-AUC: {rf_roc_auc}")
print(f"Gradient Boosting ROC-AUC: {gbc_roc_auc}")

# Multi-class problem: draw one one-vs-rest ROC curve per class and model.
# Each legend entry reports the AUC of that specific curve rather than the
# aggregate score printed above, which would be identical on every line.
for i, rf_class in enumerate(rf_best.classes_):
    gbc_class = gbc_best.classes_[i]

    rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_proba[:, i], pos_label=rf_class)
    gbc_fpr, gbc_tpr, _ = roc_curve(y_test, gbc_proba[:, i], pos_label=gbc_class)

    rf_class_auc = auc(rf_fpr, rf_tpr)
    gbc_class_auc = auc(gbc_fpr, gbc_tpr)

    plt.plot(rf_fpr, rf_tpr, label=f'Random Forest (Class {rf_class}, AUC = {rf_class_auc:.2f})')
    plt.plot(gbc_fpr, gbc_tpr, label=f'Gradient Boosting (Class {gbc_class}, AUC = {gbc_class_auc:.2f})')

# Diagonal reference line
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Multi-Class)')
plt.legend(loc='lower right')
plt.show()

import pickle

# Output file name -> fitted model to serialize
model_files = {
    'random_forest_credit_score_model.pkl': rf_best,
    'gradient_boosting_credit_score_model.pkl': gbc_best,
}

# Save the best RandomForest and GradientBoosting models to the working directory
for model_filename, model in model_files.items():
    with open(model_filename, 'wb') as file:
        pickle.dump(model, file)

# Offer the saved files as browser downloads when running inside Colab.
# The google.colab package cannot be imported elsewhere; without this guard
# the script would crash here, after all the training has already completed.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; the model files were saved locally only.")
else:
    for model_filename in model_files:
        files.download(model_filename)