diff --git a/EDA/Exploratory_Data_Analysis.ipynb b/EDA/Exploratory_Data_Analysis.ipynb index 48941b8..5187afd 100644 --- a/EDA/Exploratory_Data_Analysis.ipynb +++ b/EDA/Exploratory_Data_Analysis.ipynb @@ -1,25 +1,32 @@ { "cells": [ { - "metadata": {}, "cell_type": "markdown", - "source": "# Exploratory Data Analysis", - "id": "8f0aa07b2205167f" + "id": "8f0aa07b2205167f", + "metadata": {}, + "source": [ + "# Exploratory Data Analysis" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "### Imports", - "id": "f81847639d887ca1" + "id": "f81847639d887ca1", + "metadata": {}, + "source": [ + "### Imports" + ] }, { + "cell_type": "code", + "execution_count": 151, + "id": "498a4928e6383010", "metadata": { "ExecuteTime": { "end_time": "2025-12-26T09:36:26.136230Z", "start_time": "2025-12-26T09:36:26.128397Z" } }, - "cell_type": "code", + "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", @@ -42,69 +49,42 @@ "\n", "\n", "from scipy.stats import pearsonr, ttest_ind" - ], - "id": "498a4928e6383010", - "outputs": [], - "execution_count": 210 + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "### General Information on Dataset and Domain", - "id": "4103c2fee6be983d" + "id": "4103c2fee6be983d", + "metadata": {}, + "source": [ + "### General Information on Dataset and Domain" + ] }, { + "cell_type": "markdown", + "id": "5d0d21a3e8b9425f", "metadata": { "ExecuteTime": { "end_time": "2025-12-02T05:13:28.828273Z", "start_time": "2025-12-02T05:13:28.820090Z" } }, - "cell_type": "markdown", - "source": "#### Reading the dataset and describing it", - "id": "5d0d21a3e8b9425f" + "source": [ + "#### Reading the dataset and describing it" + ] }, { + "cell_type": "code", + "execution_count": 152, + "id": "cd4d1f683291397c", "metadata": { "ExecuteTime": { "end_time": "2025-12-26T09:04:05.449957Z", "start_time": "2025-12-26T09:04:05.429759Z" } }, - "cell_type": "code", - "source": [ - "df = 
pd.read_csv(\"../Dataset/predictive_maintenance_final.csv\")\n", - "df.head(10)" - ], - "id": "cd4d1f683291397c", "outputs": [ { "data": { - "text/plain": [ - " Air temperature [K] Process temperature [K] Rotational speed [rpm] \\\n", - "0 300.8 310.3 1538 \n", - "1 296.3 307.3 1368 \n", - "2 298.6 309.1 1339 \n", - "3 302.4 311.1 1634 \n", - "4 297.9 307.7 1546 \n", - "5 301.6 310.8 1290 \n", - "6 298.2 308.2 1407 \n", - "7 296.6 307.4 1465 \n", - "8 298.6 309.6 1353 \n", - "9 301.0 310.7 1391 \n", - "\n", - " Torque [Nm] Tool wear [min] Type prediction \n", - "0 36.1 198 L 0 \n", - "1 49.5 10 M 0 \n", - "2 51.1 34 M 0 \n", - "3 34.2 184 L 0 \n", - "4 37.6 72 L 0 \n", - "5 65.3 63 M 0 \n", - "6 47.1 153 L 0 \n", - "7 47.8 187 L 0 \n", - "8 62.5 9 L 0 \n", - "9 45.0 123 L 0 " - ], "text/html": [ "
\n", "
RandomForestClassifier(class_weight='balanced', n_estimators=300,\n",
-       "                       random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + "
RandomForestClassifier(class_weight='balanced', n_estimators=300,\n",
+       "                       random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "RandomForestClassifier(class_weight='balanced', n_estimators=300,\n", + " random_state=42)" ] }, - "execution_count": 170, + "execution_count": 207, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 170 + "source": [ + "rf = RandomForestClassifier(n_estimators=300, random_state=42, class_weight=\"balanced\")\n", + "rf.fit(X_scaled, y)" + ] }, { + "cell_type": "code", + "execution_count": 208, + "id": "cd39f19539c5bc84", "metadata": { "ExecuteTime": { "end_time": "2025-12-26T09:05:40.242389Z", "start_time": "2025-12-26T09:05:40.217815Z" } }, - "cell_type": "code", - "source": [ - "importance = pd.Series(rf.feature_importances_, index=X.columns)\n", - "importance.sort_values(ascending=False)" - ], - "id": "cd39f19539c5bc84", "outputs": [ { "data": { @@ -3384,196 +3155,686 @@ "dtype: float64" ] }, - "execution_count": 171, + "execution_count": 208, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "importance = pd.Series(rf.feature_importances_, index=X.columns)\n", + "importance.sort_values(ascending=False)" + ] + }, + { + "cell_type": "markdown", + "id": "ee8a231b55731f28", + "metadata": {}, + "source": [ + "#### Dimensionality Reduction (PCA)" + ] + }, + { + "cell_type": "code", + "execution_count": 209, + "id": "98b889c77e5192ce", + "metadata": { + "ExecuteTime": { + "end_time": "2025-12-26T09:13:26.190729Z", + "start_time": "2025-12-26T09:13:26.157050Z" + } + }, + "outputs": [], + "source": [ + "pca = PCA(n_components=0.95)\n", + "X_pca = pca.fit_transform(X_scaled)" + ] + }, + { + "cell_type": "code", + "execution_count": 210, + "id": "e94df57cea8cc9ea", + "metadata": { + "ExecuteTime": { + "end_time": "2025-12-26T09:27:57.269466Z", + "start_time": "2025-12-26T09:27:57.254366Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((1631, 9), (1631, 6))" + ] + }, + "execution_count": 210, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"X_scaled.shape, X_pca.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 211, + "id": "f544b8c41a72e7d1", + "metadata": { + "ExecuteTime": { + "end_time": "2025-12-26T09:13:48.106001Z", + "start_time": "2025-12-26T09:13:48.093225Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0.40648903, 0.21730587, 0.03071707, ..., -0.64387164,\n", + " 0.44824634, -0.37284469],\n", + " [-1.74784264, -1.734916 , -1.10196523, ..., 1.5531046 ,\n", + " -0.85696036, 0.76480882],\n", + " [-0.64673979, -0.56358288, -1.2951875 , ..., 1.5531046 ,\n", + " -0.42189146, 0.8307588 ],\n", + " ...,\n", + " [-0.50311768, 0.08715775, -1.46842126, ..., 1.5531046 ,\n", + " -1.03098791, 1.40503396],\n", + " [-0.74248786, -1.34447163, 0.47712715, ..., -0.64387164,\n", + " 0.44824634, -0.91085622],\n", + " [ 0.83735536, -0.10806444, -0.28243627, ..., -0.64387164,\n", + " 1.66643925, -0.08601032]])" + ] + }, + "execution_count": 211, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_scaled" + ] + }, + { + "cell_type": "markdown", + "id": "596104fe6e2ff545", + "metadata": { + "ExecuteTime": { + "end_time": "2025-12-26T09:28:15.384097Z", + "start_time": "2025-12-26T09:28:15.375275Z" + } + }, + "source": [ + "### CLASS IMBALANCE HANDLING" + ] + }, + { + "cell_type": "code", + "execution_count": 212, + "id": "c6045a4f588d0241", + "metadata": { + "ExecuteTime": { + "end_time": "2025-12-26T09:14:11.421312Z", + "start_time": "2025-12-26T09:14:10.687626Z" + } + }, + "outputs": [], + "source": [ + "smote = SMOTE(random_state=42)\n", + "X_resampled, y_resampled = smote.fit_resample(X_scaled, y)" + ] + }, + { + "cell_type": "markdown", + "id": "fa0981b2418b4f7", + "metadata": {}, + "source": [ + "### MODELLING" + ] + }, + { + "cell_type": "markdown", + "id": "3e357681bf71c955", + "metadata": {}, + "source": [ + "#### Train-Test Split" + ] + }, + { + "cell_type": "code", + "execution_count": 213, + "id": "b8631c59c2a553bd", + "metadata": { + 
"ExecuteTime": { + "end_time": "2025-12-26T09:30:03.880257Z", + "start_time": "2025-12-26T09:30:03.869256Z" + } + }, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X_resampled,\n", + " y_resampled,\n", + " test_size=0.2,\n", + " stratify=y_resampled,\n", + " random_state=42\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "d01839c6fe960685", + "metadata": {}, + "source": [ + "#### Logistic Regression" + ] + }, + { + "cell_type": "code", + "execution_count": 214, + "id": "d65ee0bfffba9d3f", + "metadata": { + "ExecuteTime": { + "end_time": "2025-12-26T09:31:40.231855Z", + "start_time": "2025-12-26T09:31:40.210068Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "LogisticRegression()" + ] + }, + "execution_count": 214, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 171 + "source": [ + "lr = LogisticRegression(max_iter=100)\n", + "lr.fit(X_train, y_train)" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "#### Dimensionality Reduction (PCA)", - "id": "ee8a231b55731f28" - }, - { + "id": "dd0902593e98a341", "metadata": { "ExecuteTime": { - "end_time": "2025-12-26T09:13:26.190729Z", - "start_time": "2025-12-26T09:13:26.157050Z" + "end_time": "2025-12-26T09:32:29.987176Z", + "start_time": "2025-12-26T09:32:29.970700Z" } }, - "cell_type": "code", "source": [ - "pca = PCA(n_components=0.95)\n", - "X_pca = pca.fit_transform(X_scaled)" - ], - "id": "98b889c77e5192ce", - "outputs": [ - { - "data": { - "text/plain": [ - "((1631, 9), (1631, 6))" - ] - }, - "execution_count": 177, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 177 + "##### Classification report for Logistic Regression" + ] }, { - "metadata": { - "ExecuteTime": { - "end_time": "2025-12-26T09:27:57.269466Z", - "start_time": "2025-12-26T09:27:57.254366Z" - } - }, "cell_type": "code", - "source": "X_scaled.shape, X_pca.shape", - "id": "e94df57cea8cc9ea", - "outputs": [ - { - "data": { - "text/plain": [ - "((1631, 9), (1631, 6))" - ] - }, - "execution_count": 187, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 187 - }, - { + "execution_count": 215, + "id": "546fa570e262384d", "metadata": { "ExecuteTime": { - "end_time": "2025-12-26T09:13:48.106001Z", - "start_time": "2025-12-26T09:13:48.093225Z" + "end_time": "2025-12-26T09:31:57.971554Z", + "start_time": "2025-12-26T09:31:57.961027Z" } }, - "cell_type": "code", - "source": "X_scaled", - "id": "f544b8c41a72e7d1", "outputs": [ { - "data": { - "text/plain": [ - "array([[ 0.40648903, 0.21730587, 0.03071707, ..., -0.64387164,\n", - " 0.44824634, -0.37284469],\n", - " [-1.74784264, -1.734916 
, -1.10196523, ..., 1.5531046 ,\n", - " -0.85696036, 0.76480882],\n", - " [-0.64673979, -0.56358288, -1.2951875 , ..., 1.5531046 ,\n", - " -0.42189146, 0.8307588 ],\n", - " ...,\n", - " [-0.50311768, 0.08715775, -1.46842126, ..., 1.5531046 ,\n", - " -1.03098791, 1.40503396],\n", - " [-0.74248786, -1.34447163, 0.47712715, ..., -0.64387164,\n", - " 0.44824634, -0.91085622],\n", - " [ 0.83735536, -0.10806444, -0.28243627, ..., -0.64387164,\n", - " 1.66643925, -0.08601032]])" - ] - }, - "execution_count": 178, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.90 0.89 0.90 320\n", + " 1 0.89 0.90 0.90 319\n", + "\n", + " accuracy 0.90 639\n", + " macro avg 0.90 0.90 0.90 639\n", + "weighted avg 0.90 0.90 0.90 639\n", + "\n" + ] } ], - "execution_count": 178 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-12-26T09:28:15.384097Z", - "start_time": "2025-12-26T09:28:15.375275Z" - } - }, - "cell_type": "markdown", - "source": "### CLASS IMBALANCE HANDLING", - "id": "596104fe6e2ff545" - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-12-26T09:14:11.421312Z", - "start_time": "2025-12-26T09:14:10.687626Z" - } - }, - "cell_type": "code", "source": [ - "smote = SMOTE(random_state=42)\n", - "X_resampled, y_resampled = smote.fit_resample(X_scaled, y)" - ], - "id": "c6045a4f588d0241", - "outputs": [], - "execution_count": 179 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### MODELLING", - "id": "fa0981b2418b4f7" + "print(classification_report(y_test, lr.predict(X_test)))" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "#### Train-Test Split", - "id": "3e357681bf71c955" - }, - { + "id": "33f021631e5e978", "metadata": { "ExecuteTime": { - "end_time": "2025-12-26T09:30:03.880257Z", - "start_time": "2025-12-26T09:30:03.869256Z" + "end_time": "2025-12-26T09:32:59.464166Z", + "start_time": 
"2025-12-26T09:32:59.447053Z" } }, - "cell_type": "code", "source": [ - "X_train, X_test, y_train, y_test = train_test_split(\n", - " X_resampled,\n", - " y_resampled,\n", - " test_size=0.2,\n", - " stratify=y_resampled,\n", - " random_state=42\n", - ")" - ], - "id": "b8631c59c2a553bd", - "outputs": [], - "execution_count": 191 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### Logistic Regression", - "id": "d01839c6fe960685" + "#### Decision Tree" + ] }, { + "cell_type": "code", + "execution_count": 216, + "id": "ce222ad2e3a6462f", "metadata": { "ExecuteTime": { - "end_time": "2025-12-26T09:31:40.231855Z", - "start_time": "2025-12-26T09:31:40.210068Z" + "end_time": "2025-12-26T09:34:12.836678Z", + "start_time": "2025-12-26T09:34:12.808733Z" } }, - "cell_type": "code", - "source": [ - "lr = LogisticRegression(max_iter=100)\n", - "lr.fit(X_train, y_train)" - ], - "id": "d65ee0bfffba9d3f", "outputs": [ { "data": { - "text/plain": [ - "LogisticRegression()" - ], "text/html": [ - "
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + "
DecisionTreeClassifier(max_depth=5, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "DecisionTreeClassifier(max_depth=5, random_state=42)" ] }, - "execution_count": 198, + "execution_count": 216, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 198 + "source": [ + "dt = DecisionTreeClassifier(max_depth=5, random_state=42)\n", + "dt.fit(X_train, y_train)" + ] }, { + "cell_type": "markdown", + "id": "6e2adbf7a01fc741", "metadata": { "ExecuteTime": { - "end_time": "2025-12-26T09:32:29.987176Z", - "start_time": "2025-12-26T09:32:29.970700Z" + "end_time": "2025-12-26T09:33:53.527743Z", + "start_time": "2025-12-26T09:33:53.515504Z" } }, - "cell_type": "markdown", - "source": "##### Classification report for Logistic Regression", - "id": "dd0902593e98a341" + "source": [ + "##### Classification report for Decision Tree" + ] }, { + "cell_type": "code", + "execution_count": 217, + "id": "d00638cdfb10f43a", "metadata": { "ExecuteTime": { - "end_time": "2025-12-26T09:31:57.971554Z", - "start_time": "2025-12-26T09:31:57.961027Z" + "end_time": "2025-12-26T09:34:19.566567Z", + "start_time": "2025-12-26T09:34:19.542583Z" } }, - "cell_type": "code", - "source": "print(classification_report(y_test, lr.predict(X_test)))", - "id": "546fa570e262384d", "outputs": [ { "name": "stdout", @@ -4287,53 +4282,45 @@ "text": [ " precision recall f1-score support\n", "\n", - " 0 0.90 0.89 0.90 320\n", - " 1 0.89 0.90 0.90 319\n", + " 0 0.94 0.99 0.97 320\n", + " 1 0.99 0.94 0.96 319\n", "\n", - " accuracy 0.90 639\n", - " macro avg 0.90 0.90 0.90 639\n", - "weighted avg 0.90 0.90 0.90 639\n", + " accuracy 0.97 639\n", + " macro avg 0.97 0.97 0.97 639\n", + "weighted avg 0.97 0.97 0.97 639\n", "\n" ] } ], - "execution_count": 200 + "source": [ + "print(classification_report(y_test, dt.predict(X_test)))" + ] }, { - "metadata": { - "ExecuteTime": { - "end_time": "2025-12-26T09:32:59.464166Z", - "start_time": "2025-12-26T09:32:59.447053Z" - } - }, "cell_type": "markdown", - "source": "#### Decision Tree", - "id": "33f021631e5e978" 
+ "id": "4c288e1bee45716e", + "metadata": {}, + "source": [ + "#### Random Forest" + ] }, { + "cell_type": "code", + "execution_count": 218, + "id": "425fee66cb170b7a", "metadata": { "ExecuteTime": { - "end_time": "2025-12-26T09:34:12.836678Z", - "start_time": "2025-12-26T09:34:12.808733Z" + "end_time": "2025-12-26T09:34:51.693945Z", + "start_time": "2025-12-26T09:34:51.115670Z" } }, - "cell_type": "code", - "source": [ - "dt = DecisionTreeClassifier(max_depth=5, random_state=42)\n", - "dt.fit(X_train, y_train)" - ], - "id": "ce222ad2e3a6462f", "outputs": [ { "data": { - "text/plain": [ - "DecisionTreeClassifier(max_depth=5, random_state=42)" - ], "text/html": [ - "
DecisionTreeClassifier(max_depth=5, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + "
RandomForestClassifier(n_estimators=200, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "RandomForestClassifier(n_estimators=200, random_state=42)" ] }, - "execution_count": 205, + "execution_count": 218, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 205 + "source": [ + "rf = RandomForestClassifier(n_estimators=200, random_state=42)\n", + "rf.fit(X_train, y_train)" + ] }, { + "cell_type": "markdown", + "id": "4e9b3f09b33e37dc", "metadata": { "ExecuteTime": { - "end_time": "2025-12-26T09:33:53.527743Z", - "start_time": "2025-12-26T09:33:53.515504Z" + "end_time": "2025-12-26T09:35:28.367416Z", + "start_time": "2025-12-26T09:35:28.355095Z" } }, - "cell_type": "markdown", - "source": "##### Classification report for Decision Tree", - "id": "6e2adbf7a01fc741" + "source": [ + "##### Classification report for Random Forest" + ] }, { + "cell_type": "code", + "execution_count": 219, + "id": "a2c31c963a41cda7", "metadata": { "ExecuteTime": { - "end_time": "2025-12-26T09:34:19.566567Z", - "start_time": "2025-12-26T09:34:19.542583Z" + "end_time": "2025-12-26T09:35:05.872310Z", + "start_time": "2025-12-26T09:35:05.841344Z" } }, - "cell_type": "code", - "source": "print(classification_report(y_test, dt.predict(X_test)))", - "id": "d00638cdfb10f43a", "outputs": [ { "name": "stdout", @@ -5027,48 +4768,45 @@ "text": [ " precision recall f1-score support\n", "\n", - " 0 0.94 0.99 0.97 320\n", - " 1 0.99 0.94 0.96 319\n", + " 0 1.00 1.00 1.00 320\n", + " 1 1.00 1.00 1.00 319\n", "\n", - " accuracy 0.97 639\n", - " macro avg 0.97 0.97 0.97 639\n", - "weighted avg 0.97 0.97 0.97 639\n", + " accuracy 1.00 639\n", + " macro avg 1.00 1.00 1.00 639\n", + "weighted avg 1.00 1.00 1.00 639\n", "\n" ] } ], - "execution_count": 206 + "source": [ + "print(classification_report(y_test, rf.predict(X_test)))" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "#### Random Forest", - "id": "4c288e1bee45716e" + "id": "7123e9bad62a2ca5", + "metadata": {}, + "source": [ + "#### SVM" + ] }, { + "cell_type": "code", + 
"execution_count": 220, + "id": "e82b253e57afc7c3", "metadata": { "ExecuteTime": { - "end_time": "2025-12-26T09:34:51.693945Z", - "start_time": "2025-12-26T09:34:51.115670Z" + "end_time": "2025-12-26T09:36:44.636845Z", + "start_time": "2025-12-26T09:36:44.601182Z" } }, - "cell_type": "code", - "source": [ - "rf = RandomForestClassifier(n_estimators=200, random_state=42)\n", - "rf.fit(X_train, y_train)" - ], - "id": "425fee66cb170b7a", "outputs": [ { "data": { - "text/plain": [ - "RandomForestClassifier(n_estimators=200, random_state=42)" - ], "text/html": [ - "
RandomForestClassifier(n_estimators=200, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + "
SVC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "SVC()" ] }, - "execution_count": 207, + "execution_count": 220, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 207 + "source": [ + "svm = SVC(kernel='rbf')\n", + "svm.fit(X_train, y_train)" + ] }, { - "metadata": { - "ExecuteTime": { - "end_time": "2025-12-26T09:35:28.367416Z", - "start_time": "2025-12-26T09:35:28.355095Z" - } - }, "cell_type": "markdown", - "source": "##### Classification report for Random Forest", - "id": "4e9b3f09b33e37dc" + "id": "6545a490268fd00d", + "metadata": {}, + "source": [ + "##### Classification report for SVM" + ] }, { + "cell_type": "code", + "execution_count": 221, + "id": "d9097b2f27321b85", "metadata": { "ExecuteTime": { - "end_time": "2025-12-26T09:35:05.872310Z", - "start_time": "2025-12-26T09:35:05.841344Z" + "end_time": "2025-12-26T09:37:13.661171Z", + "start_time": "2025-12-26T09:37:13.634880Z" } }, - "cell_type": "code", - "source": "print(classification_report(y_test, rf.predict(X_test)))", - "id": "a2c31c963a41cda7", "outputs": [ { "name": "stdout", @@ -5822,48 +5249,40 @@ "text": [ " precision recall f1-score support\n", "\n", - " 0 1.00 1.00 1.00 320\n", - " 1 1.00 1.00 1.00 319\n", + " 0 1.00 0.99 0.99 320\n", + " 1 0.99 1.00 0.99 319\n", "\n", - " accuracy 1.00 639\n", - " macro avg 1.00 1.00 1.00 639\n", - "weighted avg 1.00 1.00 1.00 639\n", + " accuracy 0.99 639\n", + " macro avg 0.99 0.99 0.99 639\n", + "weighted avg 0.99 0.99 0.99 639\n", "\n" ] } ], - "execution_count": 208 + "source": [ + "print(classification_report(y_test, svm.predict(X_test)))" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "#### SVM", - "id": "7123e9bad62a2ca5" + "id": "5879e6d4", + "metadata": {}, + "source": [ + "### MODEL PIPELINE (SCALER + SMOTE + PCA + RANDOM FOREST)" + ] }, { - "metadata": { - "ExecuteTime": { - "end_time": "2025-12-26T09:36:44.636845Z", - "start_time": "2025-12-26T09:36:44.601182Z" - } - }, "cell_type": "code", - "source": [ - "svm = SVC(kernel='rbf')\n", - "svm.fit(X_train, 
y_train)" - ], - "id": "e82b253e57afc7c3", + "execution_count": 222, + "id": "22f1f052ed96d7d1", + "metadata": {}, "outputs": [ { "data": { - "text/plain": [ - "SVC()" - ], "text/html": [ - "
SVC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + "
Pipeline(steps=[('scaler', StandardScaler()), ('smote', SMOTE(random_state=42)),\n",
+       "                ('pca', PCA(n_components=0.95)),\n",
+       "                ('model',\n",
+       "                 RandomForestClassifier(class_weight='balanced',\n",
+       "                                        n_estimators=300, random_state=42))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "Pipeline(steps=[('scaler', StandardScaler()), ('smote', SMOTE(random_state=42)),\n", + " ('pca', PCA(n_components=0.95)),\n", + " ('model',\n", + " RandomForestClassifier(class_weight='balanced',\n", + " n_estimators=300, random_state=42))])" ] }, - "execution_count": 211, + "execution_count": 222, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 211 + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from imblearn.pipeline import Pipeline as ImbPipeline\n", + "from imblearn.over_sampling import SMOTE\n", + "\n", + "# Feature / target split (use your existing X, y if already created)\n", + "X = df_encoded.drop(columns='machine_failure')\n", + "y = df_encoded['machine_failure']\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, stratify=y, random_state=42\n", + ")\n", + "\n", + "# Pipeline definition\n", + "pipeline = ImbPipeline(steps=[\n", + " ('scaler', StandardScaler()),\n", + " ('smote', SMOTE(random_state=42)),\n", + " ('pca', PCA(n_components=0.95)),\n", + " ('model', RandomForestClassifier(\n", + " n_estimators=300,\n", + " random_state=42,\n", + " class_weight='balanced'\n", + " ))\n", + "])\n", + "\n", + "# FIT PIPELINE (this creates the variable!)\n", + "pipeline.fit(X_train, y_train)\n" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "##### Classification report for SVM", - "id": "6545a490268fd00d" + "id": "a72d4691", + "metadata": {}, + "source": [ + "### CROSS-VALIDATION" + ] }, { - "metadata": { - "ExecuteTime": { - "end_time": "2025-12-26T09:37:13.661171Z", - "start_time": "2025-12-26T09:37:13.634880Z" - } - }, "cell_type": "code", - "source": "print(classification_report(y_test, svm.predict(X_test)))", - "id": "d9097b2f27321b85", + "execution_count": 223, + 
"id": "dd670e3bbc6db468", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " precision recall f1-score support\n", - "\n", - " 0 1.00 0.99 0.99 320\n", - " 1 0.99 1.00 0.99 319\n", - "\n", - " accuracy 0.99 639\n", - " macro avg 0.99 0.99 0.99 639\n", - "weighted avg 0.99 0.99 0.99 639\n", - "\n" + "CV F1 scores: [0.66666667 0.42857143 0.5 0.63157895 0.66666667]\n", + "Mean CV F1: 0.5786967418546365\n" ] } ], - "execution_count": 213 - }, - { - "metadata": {}, - "cell_type": "code", - "outputs": [], - "execution_count": null, - "source": "", - "id": "22f1f052ed96d7d1" + "source": [ + "from sklearn.model_selection import StratifiedKFold, cross_val_score\n", + "\n", + "cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n", + "\n", + "cv_scores = cross_val_score(\n", + " pipeline,\n", + " X,\n", + " y,\n", + " cv=cv,\n", + " scoring='f1'\n", + ")\n", + "\n", + "print(\"CV F1 scores:\", cv_scores)\n", + "print(\"Mean CV F1:\", cv_scores.mean())\n" + ] }, { + "cell_type": "markdown", + "id": "1dc3e31d", "metadata": {}, - "cell_type": "code", - "outputs": [], - "execution_count": null, - "source": "", - "id": "dd670e3bbc6db468" + "source": [ + "### HYPERPARAMETER TUNING" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], - "execution_count": null, - "source": "", - "id": "6ba10612400c3ffc" - }, - { + "execution_count": 224, + "id": "6ba10612400c3ffc", "metadata": {}, - "cell_type": "code", - "outputs": [], - "execution_count": null, - "source": "", - "id": "2b286459a7af7ccf" + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best Params: {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 100}\n" + ] + } + ], + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "param_grid = {\n", + " 'model__n_estimators': [100, 300],\n", + " 'model__max_depth': [None, 10, 20],\n", + " 'model__min_samples_split': [2, 5]\n", + "}\n", + "\n", 
+ "grid = GridSearchCV(\n", + " pipeline,\n", + " param_grid,\n", + " scoring='f1',\n", + " cv=3,\n", + " n_jobs=-1\n", + ")\n", + "\n", + "grid.fit(X_train, y_train)\n", + "print(\"Best Params:\", grid.best_params_)\n" + ] }, { + "cell_type": "markdown", + "id": "bd63dbfb", "metadata": {}, - "cell_type": "code", - "outputs": [], - "execution_count": null, - "source": "", - "id": "b633f33419a941d3" + "source": [ + "### MODEL COMPARISON TABLE" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], - "execution_count": null, - "source": "", - "id": "68d163babd4a039a" - }, - { + "execution_count": 225, + "id": "2b286459a7af7ccf", "metadata": {}, - "cell_type": "code", - "outputs": [], - "execution_count": null, - "source": "", - "id": "f04f1e749c9fbc18" + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ModelF1 Score
0Logistic Regression0.90
1Decision Tree0.97
2Random Forest1.00
3SVM0.99
\n", + "
" + ], + "text/plain": [ + " Model F1 Score\n", + "0 Logistic Regression 0.90\n", + "1 Decision Tree 0.97\n", + "2 Random Forest 1.00\n", + "3 SVM 0.99" + ] + }, + "execution_count": 225, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_results = pd.DataFrame({\n", + " 'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'SVM'],\n", + " 'F1 Score': [0.90, 0.97, 1.00, 0.99]\n", + "})\n", + "\n", + "model_results\n" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.12.7" } }, "nbformat": 4,