diff --git a/ISSR_AI4MH_Yixing_Fan/pipeline/fetch_reddit.py b/ISSR_AI4MH_Yixing_Fan/pipeline/fetch_reddit.py
index 2aa1b2d..e72904c 100644
--- a/ISSR_AI4MH_Yixing_Fan/pipeline/fetch_reddit.py
+++ b/ISSR_AI4MH_Yixing_Fan/pipeline/fetch_reddit.py
@@ -31,7 +31,7 @@
     "anxiety attack", "feeling empty", "burnout", "mental exhaustion", "can't sleep",
     "constant worry", "feeling worthless", "no motivation", "social isolation",
     "panic disorder", "intrusive thoughts", "emotional pain", "mental fog", "dissociation", "feeling trapped",
-    "racing thoughts", "mood swings", "emotional breakdown", "mental health crisis", "therapy needed"
+    "racing thoughts", "mood swings", "emotional breakdown", "mental health crisis", "therapy needed", "feel like a burden", "nobody cares about me"
 ]
 
 subreddits = [
@@ -41,7 +41,7 @@
     "mentalhealthawareness", "bipolar", "traumatoolbox", "mentalhealthmemes",
     "mentalhealthart", "mentalhealthvideos", "mentalhealthresources", "mentalhealthadvice",
     "mentalhealthstories", "mentalhealthchat", "mentalhealthhelp", "mentalhealthcommunity",
-    "mentalhealthmatters", "mentalhealthwarriors"
+    "mentalhealthmatters", "mentalhealthwarriors", "healthanxiety", "KindVoice"
 ]
 
 # ========== Preprocessing function ==========
diff --git a/ISSR_AI4MH_Yixing_Fan/pipeline/test_model.py b/ISSR_AI4MH_Yixing_Fan/pipeline/test_model.py
index 9875f7b..0dddd44 100644
--- a/ISSR_AI4MH_Yixing_Fan/pipeline/test_model.py
+++ b/ISSR_AI4MH_Yixing_Fan/pipeline/test_model.py
@@ -2,7 +2,7 @@
 import torch
 from transformers import BertTokenizer, BertForSequenceClassification, Trainer
 from datasets import Dataset
-from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
 
 # 1. Load model and tokenizer
 model_path = "results/final_model"
@@ -29,9 +29,11 @@ def tokenize_function(examples):
 def compute_metrics(pred):
     labels = pred.label_ids
     preds = pred.predictions.argmax(-1)
+    probs = torch.softmax(torch.tensor(pred.predictions), dim=1)[:, 1].numpy()
+    roc_auc = roc_auc_score(labels, probs)
     precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
     acc = accuracy_score(labels, preds)
-    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}
+    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall, "roc_auc": roc_auc}
 
 trainer = Trainer(model=model, tokenizer=tokenizer)
 preds = trainer.predict(val_dataset)
diff --git a/ISSR_Communication_Analysis_Tool_Samuel_Kalu/app.py b/ISSR_Communication_Analysis_Tool_Samuel_Kalu/app.py
index 89a9358..1387814 100644
--- a/ISSR_Communication_Analysis_Tool_Samuel_Kalu/app.py
+++ b/ISSR_Communication_Analysis_Tool_Samuel_Kalu/app.py
@@ -8,6 +8,7 @@
 import base64
 import io
+import traceback
 import warnings
 
 import gradio as gr
@@ -21,12 +22,13 @@
 
 
 def get_image_base64(path):
+    if not os.path.exists(path):
+        raise FileNotFoundError(f"Image asset not found at '{path}'")
     with open(path, "rb") as image_file:
         encoded = base64.b64encode(image_file.read()).decode("utf-8")
     return f'<img src="data:image/png;base64,{encoded}">'
 
-
 def process_multiple_videos(
     folder_path=None,
     video_files=None,
@@ -61,7 +63,6 @@ def process_multiple_videos(
 
     if folder_path and os.path.isdir(folder_path):
         logger.info(f"Processing videos from folder: {folder_path}")
-        # Pass checkbox states to process_all_videos_from_path
         process_all_videos_from_path(
             folder_path,
             output_dir,
@@ -78,6 +79,16 @@ def process_multiple_videos(
             f"Finished processing videos from folder. 
Found {len(processed_csvs)} CSVs." ) + elif folder_path and not os.path.isdir(folder_path): + logger.warning(f"Provided folder path does not exist: '{folder_path}'") + return ( + f"Error: Folder path '{folder_path}' does not exist or is not a directory. Please check the path and try again.", + gr.update(visible=False, value=None), + gr.update(visible=False, value=None), + gr.update(visible=False, value=None), + gr.update(open=False), + ) + elif video_files: logger.info(f"Processing uploaded video files: {video_files}") for idx, video_path in enumerate(video_files): @@ -95,7 +106,6 @@ def process_multiple_videos( ) original_filename = os.path.basename(video_path) - # Ensure proper CSV filename generation from video filename output_csv_filename = os.path.splitext(original_filename)[0] + ".csv" output_csv_path = os.path.join(output_dir, output_csv_filename) @@ -104,8 +114,6 @@ def process_multiple_videos( ) try: - # Call the process_video function from src.pipeline - # Pass checkbox states to process_video process_video( video_path, output_csv_path, @@ -118,8 +126,6 @@ def process_multiple_videos( f"Successfully processed uploaded video: {original_filename}" ) except Exception as e: - import traceback - logger.error( f"Exception while processing {original_filename}: {str(e)}" ) @@ -147,10 +153,8 @@ def process_multiple_videos( if processed_csvs: try: - # Display the first processed CSV in the DataFrame df = pd.read_csv(processed_csvs[0]) logger.info(f"Displaying data from: {os.path.basename(processed_csvs[0])}") - # Generate plots and convert to base64 HTML plot_img = plot_speaker_charts(df) return ( f"Successfully processed {len(processed_csvs)} video(s). Displaying data from {os.path.basename(processed_csvs[0])}.", @@ -160,8 +164,6 @@ def process_multiple_videos( gr.update(open=True), ) except Exception as e: - import traceback - logger.error( f"Error reading processed CSV {os.path.basename(processed_csvs[0])}: {str(e)}" ) @@ -214,12 +216,18 @@ def create_interface(): } """ + logo_path = "assets/trip_lab_logo.png" + if not os.path.exists(logo_path): + logger.warning(f"Logo asset not found at '{logo_path}'. Displaying without logo.") + html_img = "" + else: + html_img = get_image_base64(logo_path) + with gr.Blocks( title="Driving Simulator Communication Analysis Tool", theme=gr.themes.Monochrome(), - css=custom_css, # Added custom CSS + css=custom_css, ) as demo: - html_img = get_image_base64("assets/trip_lab_logo.png") gr.HTML( f"""
@@ -239,12 +247,12 @@ def create_interface(): video_files = gr.File( label="Upload Videos (Optional)", file_count="multiple", - type="filepath", # Ensures a temporary file path is provided + type="filepath", file_types=[".mp4", ".mov", ".avi", ".webm"], ) with gr.Group(): - with gr.Row(): # Make checkboxes inline + with gr.Row(): checkbox_ner = gr.Checkbox( label="NER (Named Entity Recognition)", value=True, @@ -279,7 +287,7 @@ def create_interface(): download_csv = gr.File( label="Download Processed CSV", visible=False, - type="filepath", # This is for download, not upload + type="filepath", file_count="single", ) with gr.Accordion("Show Plots", open=True) as plot_accordion: @@ -301,4 +309,4 @@ def create_interface(): demo = create_interface() -demo.launch() \ No newline at end of file +demo.launch() diff --git a/ISSR_Communication_Analysis_Tool_Samuel_Kalu/src/sentiment_analysis.py b/ISSR_Communication_Analysis_Tool_Samuel_Kalu/src/sentiment_analysis.py index 4759d06..4d4a3e9 100644 --- a/ISSR_Communication_Analysis_Tool_Samuel_Kalu/src/sentiment_analysis.py +++ b/ISSR_Communication_Analysis_Tool_Samuel_Kalu/src/sentiment_analysis.py @@ -12,10 +12,20 @@ warnings.filterwarnings("ignore") analyzer = SentimentIntensityAnalyzer() +hf_analyzer = None -def analyze_sentiment(text): +def analyze_sentiment(text, use_hf=False): """Perform sentiment analysis using VADER and return a composite score from -1 to +1.""" + global hf_analyzer + if use_hf: + if hf_analyzer is None: + from transformers import pipeline + hf_analyzer = pipeline("sentiment-analysis") + result = hf_analyzer(text)[0] + label = result["label"].upper() + confidence = float(result["score"]) + return confidence if label == "POSITIVE" else -confidence + scores = analyzer.polarity_scores(text) - sentiment_score = scores["compound"] - return sentiment_score + return scores["compound"] diff --git a/ISSR_Improve_Accuracy_Mixed_Data_Shao_Jin/Regression_model.ipynb b/ISSR_Improve_Accuracy_Mixed_Data_Shao_Jin/Regression_model.ipynb index b08f2a2..469661a 100644 --- a/ISSR_Improve_Accuracy_Mixed_Data_Shao_Jin/Regression_model.ipynb +++ b/ISSR_Improve_Accuracy_Mixed_Data_Shao_Jin/Regression_model.ipynb @@ -1,470 +1,512 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] + "cells": [ + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-09T09:15:50.737507Z", + "iopub.status.busy": "2026-04-09T09:15:50.736973Z", + "iopub.status.idle": "2026-04-09T09:15:50.742628Z", + "shell.execute_reply": "2026-04-09T09:15:50.741562Z", + "shell.execute_reply.started": "2026-04-09T09:15:50.737478Z" }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" + "id": "WGaZ3nQL9lih", + "trusted": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split, GridSearchCV\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.ensemble import RandomForestRegressor, VotingRegressor\n", + "from xgboost import XGBRegressor\n", + "from lightgbm import LGBMRegressor\n", + "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n", + "import warnings\n", + "import logging" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s4kXMPdb-XCA" + }, + "source": [ + "Preprocess data as in 
https://colab.research.google.com/drive/1tdgzzgPODIzrQ47CG1PKw9Gi6WHrONWf?authuser=1#scrollTo=y0BGiw2ESzV5&line=84&uniqifier=1, and https://colab.research.google.com/drive/1tdgzzgPODIzrQ47CG1PKw9Gi6WHrONWf?authuser=1#scrollTo=D5EhRjxxK8p3&line=1&uniqifier=1." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-09T09:15:50.744547Z", + "iopub.status.busy": "2026-04-09T09:15:50.744186Z", + "iopub.status.idle": "2026-04-09T09:15:55.928641Z", + "shell.execute_reply": "2026-04-09T09:15:55.927894Z", + "shell.execute_reply.started": "2026-04-09T09:15:50.744500Z" }, - "language_info": { - "name": "python" - } + "id": "g5JvKnOY9zht", + "trusted": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "def merge_cr5(row):\n", + " cr5_cols = ['CR5A', 'CR5B', 'CR5C', 'CR5D', 'CR5E']\n", + " cr5_flags = [1, 2, 3, 4, 5]\n", + " cr5_values = [not pd.isna(row[col]) for col in cr5_cols] # generates a Boolean list indicating whether each column has a value\n", + "\n", + " if sum(cr5_values) > 1: # 7 represents two or more races\n", + " return 7\n", + " elif row['CR4'] in [2, 3, 4, 5] and any(cr5_values):\n", + " return 7\n", + " elif sum(cr5_values) == 1:\n", + " return cr5_flags[cr5_values.index(True)]\n", + " elif row['CR4'] in [2, 3, 4, 5]:\n", + " return 6\n", + " else:\n", + " return None\n", + "\n", + "# Preprocess 2012 data\n", + "data_2012 = pd.read_excel('/kaggle/input/datasets/krishnasimha/2012ms/Alabama12_ms.xlsx')\n", + "data_2012 = data_2012.copy()\n", + "data_2012['CR5'] = data_2012.apply(merge_cr5, axis=1)\n", + "selected_columns = ['CR2', 'CR3', 'CR5', 'CR7', 'CR11', 'CR12', 'CR13', 'CR21', 'CR22', 'CR23', 'CR54', 'CR56']\n", + "data_2012 = data_2012.loc[:, selected_columns]\n", + "\n", + "data_2014 = pd.read_excel('/kaggle/input/datasets/krishnasimha/2014ms/Alabama14_ms.xlsx')\n", + "data_2014 = data_2014.copy()\n", + "data_2014['CR5'] = data_2014.apply(merge_cr5, axis=1)\n", + "data_2014 = data_2014.loc[:, selected_columns]\n", + "\n", + "data_2016 = pd.read_csv('/kaggle/input/datasets/krishnasimha/2016regions/alabama16_6rigions_ms.sas7bdat.csv')\n", + "data_2016 = data_2016.copy()\n", + "data_2016['CR5'] = data_2016.apply(merge_cr5, axis=1)\n", + "data_2016 = data_2016.loc[:, selected_columns]\n", + "\n", + "# Load the 2024 input data\n", + "inputs_2024 = pd.read_csv('/kaggle/input/datasets/krishnasimha/2024dat/2024_sample_data_not_survey.csv')\n", + "required_columns = ['CR2', 'CR3', 'CR5']\n", + "# Ensure the 2024 input data columns match the required columns\n", + "conditions_2024 = inputs_2024.loc[:, required_columns]" + ] }, - "cells": [ - { - "cell_type": "code", - "source": [ - "from google.colab import drive\n", - "drive.mount('/content/drive/')" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ZcW0mvRV9yrc", - "outputId": "c34c34bb-176d-4445-aea2-e0b22287d14e" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Mounted at /content/drive/\n" - ] - } - ] + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-09T09:15:55.929979Z", + "iopub.status.busy": "2026-04-09T09:15:55.929645Z", + "iopub.status.idle": "2026-04-09T09:15:55.940102Z", + "shell.execute_reply": "2026-04-09T09:15:55.939402Z", + "shell.execute_reply.started": "2026-04-09T09:15:55.929954Z" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": 
"WGaZ3nQL9lih" - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from sklearn.model_selection import train_test_split, GridSearchCV\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.impute import SimpleImputer\n", - "from sklearn.ensemble import RandomForestRegressor, VotingRegressor\n", - "from xgboost import XGBRegressor\n", - "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n", - "import warnings\n", - "import logging" - ] + "id": "gkkquSgW-wqN", + "trusted": true + }, + "outputs": [], + "source": [ + "data_2012.fillna(0, inplace=True)\n", + "data_2014.fillna(0, inplace=True)\n", + "data_2016.fillna(0, inplace=True)\n", + "conditions_2024.fillna(0, inplace=True)\n", + "data_2012.insert(0, 'Year', 2012)\n", + "data_2014.insert(0, 'Year', 2014)\n", + "data_2016.insert(0, 'Year', 2016)\n", + "conditions_2024.insert(0, 'Year', 2024) # including year feature\n", + "\n", + "# Combine the datasets into one DataFrame\n", + "combined_data = pd.concat([data_2012, data_2014, data_2016], ignore_index=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "erXEnudcXCrJ" + }, + "source": [ + "I decided to use regression model for the time series data. I incorporate feature engineering in this process, so that enriching the dataset with valuable information that helps the model learn patterns more effectively. This is essential for time series data, where past behavior can strongly influence future predictions.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "markdown", - "source": [ - "Preprocess data as in https://colab.research.google.com/drive/1tdgzzgPODIzrQ47CG1PKw9Gi6WHrONWf?authuser=1#scrollTo=y0BGiw2ESzV5&line=84&uniqifier=1, and https://colab.research.google.com/drive/1tdgzzgPODIzrQ47CG1PKw9Gi6WHrONWf?authuser=1#scrollTo=D5EhRjxxK8p3&line=1&uniqifier=1." 
- ], - "metadata": { - "id": "s4kXMPdb-XCA" - } + "execution": { + "iopub.execute_input": "2026-04-09T09:15:55.942141Z", + "iopub.status.busy": "2026-04-09T09:15:55.941831Z", + "iopub.status.idle": "2026-04-09T09:15:55.979773Z", + "shell.execute_reply": "2026-04-09T09:15:55.979075Z", + "shell.execute_reply.started": "2026-04-09T09:15:55.942114Z" }, - { - "cell_type": "code", - "source": [ - "import pandas as pd\n", - "def merge_cr5(row):\n", - " cr5_cols = ['CR5A', 'CR5B', 'CR5C', 'CR5D', 'CR5E']\n", - " cr5_flags = [1, 2, 3, 4, 5]\n", - " cr5_values = [not pd.isna(row[col]) for col in cr5_cols] # generates a Boolean list indicating whether each column has a value\n", - "\n", - " if sum(cr5_values) > 1: # 7 represents two or more races\n", - " return 7\n", - " elif row['CR4'] in [2, 3, 4, 5] and any(cr5_values):\n", - " return 7\n", - " elif sum(cr5_values) == 1:\n", - " return cr5_flags[cr5_values.index(True)]\n", - " elif row['CR4'] in [2, 3, 4, 5]:\n", - " return 6\n", - " else:\n", - " return None\n", - "\n", - "# Preprocess 2012 data\n", - "data_2012 = pd.read_excel('/content/drive/My Drive/Alabama12_ms.xlsx')\n", - "data_2012 = data_2012.copy()\n", - "data_2012['CR5'] = data_2012.apply(merge_cr5, axis=1)\n", - "selected_columns = ['CR2', 'CR3', 'CR5', 'CR7', 'CR11', 'CR12', 'CR13', 'CR21', 'CR22', 'CR23', 'CR54', 'CR56']\n", - "data_2012 = data_2012.loc[:, selected_columns]\n", - "\n", - "data_2014 = pd.read_excel('/content/drive/My Drive/Alabama14_ms.xlsx')\n", - "data_2014 = data_2014.copy()\n", - "data_2014['CR5'] = data_2014.apply(merge_cr5, axis=1)\n", - "data_2014 = data_2014.loc[:, selected_columns]\n", - "\n", - "data_2016 = pd.read_csv('/content/drive/My Drive/alabama16_6rigions_ms.sas7bdat.csv')\n", - "data_2016 = data_2016.copy()\n", - "data_2016['CR5'] = data_2016.apply(merge_cr5, axis=1)\n", - "data_2016 = data_2016.loc[:, selected_columns]\n", - "\n", - "# Load the 2024 input data\n", - "inputs_2024 = pd.read_csv('/content/drive/My Drive/2024_sample_data_not_survey.csv')\n", - "required_columns = ['CR2', 'CR3', 'CR5']\n", - "# Ensure the 2024 input data columns match the required columns\n", - "conditions_2024 = inputs_2024.loc[:, required_columns]" - ], - "metadata": { - "id": "g5JvKnOY9zht" - }, - "execution_count": null, - "outputs": [] + "id": "ZTUzV9MUGxMX", + "outputId": "55c1ce79-f941-4e1d-84b7-bdf26e25094f", + "trusted": true + }, + "outputs": [], + "source": [ + "#Computes the accuracy of predictions by checking if the relative error between predicted and actual values is less than a specified threshold\n", + "def calculate_accuracy(y_true, y_pred, threshold=0.1):\n", + " return ((abs(y_true - y_pred) / y_true) < threshold).mean()\n", + "\n", + "# Creates lag features for specified columns, which represent previous time steps\n", + "def generate_lag_features(df, features, lags=[1]):\n", + " for lag in lags:\n", + " for feature in features:\n", + " df[f\"{feature}_lag{lag}\"] = df[feature].shift(lag)\n", + " return df\n", + "\n", + "#Computes moving averages for the specified columns, smoothing out fluctuations in the data\n", + "def add_moving_averages(df, features, windows=[3]):\n", + " for window in windows:\n", + " for feature in features:\n", + " df[f\"{feature}_ma{window}\"] = df[feature].rolling(window=window).mean()\n", + " return df\n", + "\n", + "# Adds difference features, which calculate the change between the current value and the previous value for each feature, helping to identify trends or shifts over time.\n", + "def add_diff_features(df, 
features):\n", + " for feature in features:\n", + " df[f\"{feature}_diff\"] = df[feature].diff()\n", + " return df\n", + "\n", + "#Generates ratio features by dividing one feature by another, highlighting the relationships between multiple features\n", + "def add_ratios(df, features):\n", + " if len(features) > 1:\n", + " for i in range(len(features)):\n", + " for j in range(i + 1, len(features)):\n", + " df[f\"{features[i]}_{features[j]}_ratio\"] = df[features[i]] / (df[features[j]] + 1e-9)\n", + " return df\n", + "\n", + "\n", + "def process_data(data, features):\n", + " data = generate_lag_features(data, features, lags=[1])\n", + " data = add_moving_averages(data, features, windows=[3])\n", + " data = add_diff_features(data, features)\n", + " data = add_ratios(data, features)\n", + " imputer = SimpleImputer(strategy='mean')\n", + " data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns, index=data.index)\n", + " return data_imputed\n", + "\n", + "# Make sure the data is all numeric types\n", + "historical_data = combined_data.apply(pd.to_numeric, errors=\"coerce\")\n", + "input_data = conditions_2024.apply(pd.to_numeric, errors=\"coerce\")\n", + "\n", + "# Feature generation and filling missing values\n", + "features = [\"CR2\", \"CR3\", \"CR5\"]\n", + "historical_data_imputed = process_data(historical_data, features)\n", + "\n", + "input_data_imputed = process_data(input_data, features)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-09T09:15:55.981120Z", + "iopub.status.busy": "2026-04-09T09:15:55.980790Z", + "iopub.status.idle": "2026-04-09T09:15:55.998874Z", + "shell.execute_reply": "2026-04-09T09:15:55.997947Z", + "shell.execute_reply.started": "2026-04-09T09:15:55.981087Z" }, - { - "cell_type": "code", - "source": [ - "data_2012.fillna(0, inplace=True)\n", - "data_2014.fillna(0, inplace=True)\n", - "data_2016.fillna(0, inplace=True)\n", - "conditions_2024.fillna(0, inplace=True)\n", - "data_2012.insert(0, 'Year', 2012)\n", - "data_2014.insert(0, 'Year', 2014)\n", - "data_2016.insert(0, 'Year', 2016)\n", - "conditions_2024.insert(0, 'Year', 2024) # including year feature\n", - "\n", - "# Combine the datasets into one DataFrame\n", - "combined_data = pd.concat([data_2012, data_2014, data_2016], ignore_index=True)" - ], - "metadata": { - "id": "gkkquSgW-wqN" - }, - "execution_count": null, - "outputs": [] + "id": "HffUBIp-IEbg", + "trusted": true + }, + "outputs": [], + "source": [ + "output_cols = {\"CR7\", \"CR11\", \"CR12\", \"CR13\", \"CR21\", \"CR22\", \"CR23\", \"CR54\", \"CR56\"}\n", + "\n", + "# Subtract targets from set and convert the feature to list\n", + "common_features_set = set(historical_data_imputed.columns) & set(input_data_imputed.columns)\n", + "common_features_list = list(common_features_set - output_cols)\n", + "\n", + "# split dataset\n", + "train_data, test_data = train_test_split(historical_data_imputed, test_size=0.2, random_state=42)\n", + "# Standardize dataset\n", + "scaler = StandardScaler()\n", + "X_train = pd.DataFrame(scaler.fit_transform(train_data[common_features_list]), columns=common_features_list)\n", + "X_test = pd.DataFrame(scaler.transform(test_data[common_features_list]), columns=common_features_list)\n", + "X_predict = pd.DataFrame(scaler.transform(input_data_imputed[common_features_list]), columns=common_features_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ChEJuwQqYHHJ" + }, + "source": [ + "Choose best model 
between RandomForestRegressor, XGBRegressor, and LGBMRegressor for each target column.\n",
    "\n",
    "XGBRegressor is based on the gradient boosting framework and captures complex relationships in the dataset.\n",
    "\n",
    "RandomForestRegressor uses an ensemble of decision trees to make predictions and handles large datasets well.\n",
    "\n",
    "LGBMRegressor (LightGBM) is a gradient boosting model that uses a leaf-wise tree growth strategy instead of depth-wise growth. It is faster to train, more memory-efficient, and often achieves better accuracy than XGBoost on tabular data with many engineered features, making it a well-justified addition to the model competition for the CR survey columns.\n",
    "\n",
    "However, I found that a single model struggles to capture the features of some columns (\"CR7\", \"CR11\", \"CR12\", \"CR22\", \"CR27\", \"CR56\"), so for those targets I combine the RandomForest and XGBoost models in a VotingRegressor."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-09T09:15:56.000439Z",
     "iopub.status.busy": "2026-04-09T09:15:56.000005Z",
     "iopub.status.idle": "2026-04-09T09:15:56.007982Z",
     "shell.execute_reply": "2026-04-09T09:15:56.007222Z",
     "shell.execute_reply.started": "2026-04-09T09:15:56.000413Z"
    },
    "id": "VJTXLRJhKHcf",
    "trusted": true
   },
   "outputs": [],
   "source": [
    "def train_and_predict_model(X_train, y_train, X_test, param_grid, model_type='randomforest'):\n",
    "    if model_type == 'randomforest':\n",
    "        model = RandomForestRegressor(random_state=42)\n",
    "    elif model_type == 'xgboost':\n",
    "        model = XGBRegressor(random_state=42, eval_metric='rmse')\n",
    "    elif model_type == 'lightgbm':\n",
    "        model = LGBMRegressor(random_state=42, verbose=-1)\n",
    "    else:\n",
    "        raise ValueError(f\"Unknown model_type: {model_type}\")\n",
    "\n",
    "    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=\"neg_mean_squared_error\",\n",
    "                               cv=5, n_jobs=-1)\n",
    "    grid_search.fit(X_train, y_train)\n",
    "    best_model = grid_search.best_estimator_\n",
    "    predictions = best_model.predict(X_test)\n",
    "    return best_model, predictions\n",
    "\n",
    "\n",
    "def special_processing(X_train, y_train, X_test, X_predict, target):\n",
    "    # Feature engineering and model optimization strategies\n",
    "    param_grid_rf_special = {\n",
    "        \"n_estimators\": [100, 200],\n",
    "        \"max_depth\": [8, 10],\n",
    "        \"min_samples_split\": [5, 10],\n",
    "        \"min_samples_leaf\": [2, 4]\n",
    "    }\n",
    "\n",
    "    best_model_rf, preds_rf = train_and_predict_model(X_train, y_train, X_test, param_grid_rf_special,\n",
    "                                                      model_type='randomforest')\n",
    "\n",
    "    param_grid_xgb_special = {\n",
    "        \"learning_rate\": [0.01, 0.05],\n",
    "        \"n_estimators\": [50, 100],\n",
    "        \"max_depth\": [4, 6]\n",
    "    }\n",
    "\n",
    "    best_model_xgb, preds_xgb = train_and_predict_model(X_train, y_train, X_test, param_grid_xgb_special,\n",
    "                                                        model_type='xgboost')\n",
    "\n",
    "    voting_model = VotingRegressor([('rf', best_model_rf), ('xgb', best_model_xgb)])\n",
    "    voting_model.fit(X_train, y_train)\n",
    "    preds_final = voting_model.predict(X_test)\n",
    "\n",
    "\n",
    "    final_preds = 
voting_model.predict(X_predict)\n", + "\n", + " return voting_model, final_preds" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "code", - "source": [ - "#Computes the accuracy of predictions by checking if the relative error between predicted and actual values is less than a specified threshold\n", - "def calculate_accuracy(y_true, y_pred, threshold=0.1):\n", - " return ((abs(y_true - y_pred) / y_true) < threshold).mean()\n", - "\n", - "# Creates lag features for specified columns, which represent previous time steps\n", - "def generate_lag_features(df, features, lags=[1]):\n", - " for lag in lags:\n", - " for feature in features:\n", - " df[f\"{feature}_lag{lag}\"] = df[feature].shift(lag)\n", - " return df\n", - "\n", - "#Computes moving averages for the specified columns, smoothing out fluctuations in the data\n", - "def add_moving_averages(df, features, windows=[3]):\n", - " for window in windows:\n", - " for feature in features:\n", - " df[f\"{feature}_ma{window}\"] = df[feature].rolling(window=window).mean()\n", - " return df\n", - "\n", - "# Adds difference features, which calculate the change between the current value and the previous value for each feature, helping to identify trends or shifts over time.\n", - "def add_diff_features(df, features):\n", - " for feature in features:\n", - " df[f\"{feature}_diff\"] = df[feature].diff()\n", - " return df\n", - "\n", - "#Generates ratio features by dividing one feature by another, highlighting the relationships between multiple features\n", - "def add_ratios(df, features):\n", - " if len(features) > 1:\n", - " for i in range(len(features)):\n", - " for j in range(i + 1, len(features)):\n", - " df[f\"{features[i]}_{features[j]}_ratio\"] = df[features[i]] / (df[features[j]] + 1e-9)\n", - " return df\n", - "\n", - "\n", - "def process_data(data, features):\n", - " data = generate_lag_features(data, features, lags=[1])\n", - " data = add_moving_averages(data, features, windows=[3])\n", - " data = add_diff_features(data, features)\n", - " data = add_ratios(data, features)\n", - " imputer = SimpleImputer(strategy='mean')\n", - " data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns, index=data.index)\n", - " return data_imputed\n", - "\n", - "# Make sure the data is all numeric types\n", - "historical_data = combined_data.apply(pd.to_numeric, errors=\"coerce\")\n", - "input_data = conditions_2024.apply(pd.to_numeric, errors=\"coerce\")\n", - "\n", - "# Feature generation and filling missing values\n", - "features = [\"CR2\", \"CR3\", \"CR5\"]\n", - "historical_data_imputed = process_data(historical_data, features)\n", - "\n", - "input_data_imputed = process_data(input_data, features)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ZTUzV9MUGxMX", - "outputId": "55c1ce79-f941-4e1d-84b7-bdf26e25094f" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - " Year CR2 CR3 CR5 CR2_lag1 CR3_lag1 CR5_lag1 CR2_ma3 CR3_ma3 \\\n", - "0 2024.0 2.0 3.0 1.0 1.522069 2.104084 4.056324 1.521801 2.103789 \n", - "1 2024.0 2.0 3.0 3.0 2.000000 3.000000 1.000000 1.521801 2.103789 \n", - "2 2024.0 1.0 2.0 3.0 2.000000 3.000000 3.000000 1.666667 2.666667 \n", - "3 2024.0 1.0 1.0 3.0 1.000000 2.000000 3.000000 1.333333 2.000000 \n", - "4 2024.0 1.0 1.0 0.0 1.000000 1.000000 3.000000 1.000000 1.333333 \n", - "... ... ... ... ... ... ... ... 
... ... \n", - "3032 2024.0 1.0 1.0 5.0 1.000000 3.000000 5.000000 1.333333 2.000000 \n", - "3033 2024.0 2.0 2.0 3.0 1.000000 1.000000 5.000000 1.333333 2.000000 \n", - "3034 2024.0 1.0 3.0 2.0 2.000000 2.000000 3.000000 1.333333 2.000000 \n", - "3035 2024.0 2.0 3.0 0.0 1.000000 3.000000 2.000000 1.666667 2.666667 \n", - "3036 2024.0 1.0 3.0 5.0 2.000000 3.000000 0.000000 1.333333 3.000000 \n", - "\n", - " CR5_ma3 CR2_diff CR3_diff CR5_diff CR2_CR3_ratio CR2_CR5_ratio \\\n", - "0 4.057661 -0.000329 0.0 0.001318 0.666667 2.000000e+00 \n", - "1 4.057661 0.000000 0.0 2.000000 0.666667 6.666667e-01 \n", - "2 2.333333 -1.000000 -1.0 0.000000 0.500000 3.333333e-01 \n", - "3 3.000000 0.000000 -1.0 0.000000 1.000000 3.333333e-01 \n", - "4 2.000000 0.000000 0.0 -3.000000 1.000000 1.000000e+09 \n", - "... ... ... ... ... ... ... \n", - "3032 5.000000 0.000000 -2.0 0.000000 1.000000 2.000000e-01 \n", - "3033 4.333333 1.000000 1.0 -2.000000 1.000000 6.666667e-01 \n", - "3034 3.333333 -1.000000 1.0 -1.000000 0.333333 5.000000e-01 \n", - "3035 1.666667 1.000000 0.0 -2.000000 0.666667 2.000000e+09 \n", - "3036 2.333333 -1.000000 0.0 5.000000 0.333333 2.000000e-01 \n", - "\n", - " CR3_CR5_ratio \n", - "0 3.000000e+00 \n", - "1 1.000000e+00 \n", - "2 6.666667e-01 \n", - "3 3.333333e-01 \n", - "4 1.000000e+09 \n", - "... ... \n", - "3032 2.000000e-01 \n", - "3033 6.666667e-01 \n", - "3034 1.500000e+00 \n", - "3035 3.000000e+09 \n", - "3036 6.000000e-01 \n", - "\n", - "[3037 rows x 16 columns]\n" - ] - } - ] + "execution": { + "iopub.execute_input": "2026-04-09T09:15:56.009425Z", + "iopub.status.busy": "2026-04-09T09:15:56.009083Z", + "iopub.status.idle": "2026-04-09T09:26:54.929523Z", + "shell.execute_reply": "2026-04-09T09:26:54.928778Z", + "shell.execute_reply.started": "2026-04-09T09:15:56.009398Z" }, + "id": "2jTGpkrCIww0", + "outputId": "5c986f56-2f12-454b-f6fa-147391c4e12e", + "trusted": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation Results:\n", + "CR54: MSE: 0.4820, MAE: 0.3985, R2: 0.0598, Accuracy: 0.5611\n", + "CR13: MSE: 0.8096, MAE: 0.3069, R2: 0.0210, Accuracy: 0.6293\n", + "CR23: MSE: 0.3487, MAE: 0.1950, R2: 0.0057, Accuracy: 0.6959\n", + "CR21: MSE: 0.1492, MAE: 0.2706, R2: 0.0247, Accuracy: 0.6108\n" + ] + } + ], + "source": [ + "validations = {}\n", + "rf_predictions = {}\n", + "param_grid_rf = {\n", + " \"n_estimators\": [50, 100, 150],\n", + " \"max_depth\": [4, 6, 8],\n", + " \"min_samples_split\": [2, 5, 10],\n", + " \"min_samples_leaf\": [1, 2, 4]\n", + "}\n", + "\n", + "param_grid_xgb = {\n", + " \"learning_rate\": [0.01, 0.05, 0.1],\n", + " \"n_estimators\": [50, 100, 150],\n", + " \"max_depth\": [3, 5, 7]\n", + "}\n", + "\n", + "param_grid_lgbm = {\n", + " \"learning_rate\": [0.01, 0.05, 0.1],\n", + " \"n_estimators\": [50, 100, 150],\n", + " \"max_depth\": [-1, 6, 8],\n", + " \"num_leaves\": [15, 31, 63]\n", + "}\n", + "\n", + "special_targets = {\"CR7\", \"CR11\", \"CR12\", \"CR22\", \"CR27\", \"CR56\"}\n", + "\n", + "model_list = [\n", + " ('RandomForest', RandomForestRegressor(random_state=42), param_grid_rf),\n", + " ('XGBoost', XGBRegressor(random_state=42, eval_metric='rmse'), param_grid_xgb),\n", + " ('LightGBM', LGBMRegressor(random_state=42, verbose=-1), param_grid_lgbm)\n", + "]\n", + "\n", + "for target in output_cols:\n", + " y_train = train_data[target]\n", + " y_test = test_data[target]\n", + "\n", + " try:\n", + " if target in special_targets:\n", + " logging.info(f\"Applying special processing for target: 
{target}\")\n", + " best_model, best_preds = special_processing(X_train, y_train, X_test, X_predict, target)\n", + " else:\n", + " best_model, best_preds, best_metrics = None, None, None\n", + "\n", + " for name, model, param_grid in model_list:\n", + " logging.info(f\"Training {name} model for target: {target}...\")\n", + " best_model_tmp, preds_tmp = train_and_predict_model(X_train, y_train, X_test, param_grid,\n", + " model_type=name.lower().replace('lightgbm', 'lightgbm'))\n", + "\n", + " mse = mean_squared_error(y_test, preds_tmp)\n", + " mae = mean_absolute_error(y_test, preds_tmp)\n", + " r2 = r2_score(y_test, preds_tmp)\n", + " accuracy = calculate_accuracy(y_test, preds_tmp)\n", + "\n", + " metrics = {\"MSE\": mse, \"MAE\": mae, \"R2\": r2, \"Accuracy\": accuracy}\n", + " logging.info(\n", + " f\"{name} results for {target} - MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}, Accuracy: {accuracy:.4f}\")\n", + "\n", + " if not best_metrics or mse < best_metrics[\"MSE\"]:\n", + " best_model, best_preds, best_metrics = best_model_tmp, preds_tmp, metrics\n", + "\n", + " validations[target] = best_metrics\n", + "\n", + " rf_predictions[target] = best_model.predict(X_predict)\n", + "\n", + " except Exception as e:\n", + " logging.error(f\"Error validating {target} with regression model: {e}\")\n", + " rf_predictions[target] = [None] * len(input_data)\n", + "\n", + "\n", + "\n", + "print(\"Validation Results:\")\n", + "for target, metrics in validations.items():\n", + " if target not in special_targets:\n", + " logging.info(\n", + " f\"{target}: MSE: {metrics['MSE']:.4f}, MAE: {metrics['MAE']:.4f}, R2: {metrics['R2']:.4f}, Accuracy: {metrics['Accuracy']:.4f}\")\n", + " print(\n", + " f\"{target}: MSE: {metrics['MSE']:.4f}, MAE: {metrics['MAE']:.4f}, R2: {metrics['R2']:.4f}, Accuracy: {metrics['Accuracy']:.4f}\")\n", + "\n", + "# Store prediction result\n", + "final_predictions = {\n", + " i: {target: int(round(rf_predictions.get(target, [None] * len(input_data))[i]))\n", + " for target in output_cols\n", + " if rf_predictions.get(target, [None] * len(input_data))[i] is not None}\n", + " for i in range(len(input_data))\n", + "}\n", + "\n", + "# Convert final_predictions to DataFrame\n", + "predicted_df = pd.DataFrame.from_dict(final_predictions, orient=\"index\", columns=list(output_cols))\n", + "\n", + "# Ensure the index aligns with the input data\n", + "predicted_df.index = input_data.index\n", + "\n", + "desired_order = ['CR7', 'CR11', 'CR12', 'CR13', 'CR21', 'CR22', 'CR23', 'CR54', 'CR56']\n", + "\n", + "# Reorder the columns of predicted_df\n", + "predicted_df = predicted_df[desired_order]\n", + "\n", + "# Concatenate inputs_2024 with the predictions\n", + "result = pd.concat([inputs_2024.reset_index(drop=True), predicted_df], axis=1)\n", + "\n", + "# Save to CSV file\n", + "result.to_csv('Regression_2024_data.csv', index=False)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kaggle": { + "accelerator": "none", + "dataSources": [ { - "cell_type": "code", - "source": [ - "output_cols = {\"CR7\", \"CR11\", \"CR12\", \"CR13\", \"CR21\", \"CR22\", \"CR23\", \"CR54\", \"CR56\"}\n", - "\n", - "# Subtract targets from set and convert the feature to list\n", - "common_features_set = set(historical_data_imputed.columns) & set(input_data_imputed.columns)\n", - "common_features_list = list(common_features_set - output_cols)\n", - "\n", - "# split dataset\n", - "train_data, test_data = train_test_split(historical_data_imputed, test_size=0.2, random_state=42)\n", - "# 
Standardize dataset\n", - "scaler = StandardScaler()\n", - "X_train = scaler.fit_transform(train_data[common_features_list])\n", - "X_test = scaler.transform(test_data[common_features_list])\n", - "X_predict = scaler.transform(input_data_imputed[common_features_list])" - ], - "metadata": { - "id": "HffUBIp-IEbg" - }, - "execution_count": null, - "outputs": [] + "databundleVersionId": 16550368, + "datasetId": 9994389, + "sourceId": 15616314, + "sourceType": "datasetVersion" }, { - "cell_type": "markdown", - "source": [ - "Choose best model between RandomForestRegressor and XGBRegressor to train each column.\n", - "\n", - "XGBRegressor is based on the gradient boosting framework to capture complex relationship in dataset.\n", - "\n", - "RandomForestRegressor is a regression model that uses an ensemble of decision trees to make predictions to handles large datasets.\n", - "\n", - "However, I find out some columns(\"CR7\", \"CR11\", \"CR12\", \"CR22\", \"CR27\", \"CR56\") are hard to capture features in single model, so I combine these two models together to train it." - ], - "metadata": { - "id": "ChEJuwQqYHHJ" - } + "databundleVersionId": 16550363, + "datasetId": 9994385, + "sourceId": 15616309, + "sourceType": "datasetVersion" }, { - "cell_type": "code", - "source": [ - "def train_and_predict_model(X_train, y_train, X_test, param_grid, model_type='randomforest'):\n", - " if model_type == 'randomforest':\n", - " model = RandomForestRegressor(random_state=42)\n", - " elif model_type == 'xgboost':\n", - " model = XGBRegressor(random_state=42, eval_metric='rmse')\n", - "\n", - " grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=\"neg_mean_squared_error\",\n", - " cv=5, n_jobs=-1)\n", - " grid_search.fit(X_train, y_train)\n", - " best_model = grid_search.best_estimator_\n", - " predictions = best_model.predict(X_test)\n", - " return best_model, predictions\n", - "\n", - "\n", - "def special_processing(X_train, y_train, X_test, X_predict, target):\n", - " # Feature engineering and model optimization strategies\n", - " param_grid_rf_special = {\n", - " \"n_estimators\": [100, 200],\n", - " \"max_depth\": [8, 10],\n", - " \"min_samples_split\": [5, 10],\n", - " \"min_samples_leaf\": [2, 4]\n", - " }\n", - "\n", - " best_model_rf, preds_rf = train_and_predict_model(X_train, y_train, X_test, param_grid_rf_special,\n", - " model_type='randomforest')\n", - "\n", - " param_grid_xgb_special = {\n", - " \"learning_rate\": [0.01, 0.05],\n", - " \"n_estimators\": [50, 100],\n", - " \"max_depth\": [4, 6]\n", - " }\n", - "\n", - " best_model_xgb, preds_xgb = train_and_predict_model(X_train, y_train, X_test, param_grid_xgb_special,\n", - " model_type='xgboost')\n", - "\n", - " voting_model = VotingRegressor([('rf', best_model_rf), ('xgb', best_model_xgb)])\n", - " voting_model.fit(X_train, y_train)\n", - " preds_final = voting_model.predict(X_test)\n", - "\n", - "\n", - " final_preds = voting_model.predict(X_predict)\n", - "\n", - " return voting_model, final_preds" - ], - "metadata": { - "id": "VJTXLRJhKHcf" - }, - "execution_count": null, - "outputs": [] + "databundleVersionId": 16550354, + "datasetId": 9994379, + "sourceId": 15616300, + "sourceType": "datasetVersion" }, { - "cell_type": "code", - "source": [ - "validations = {}\n", - "rf_predictions = {}\n", - "param_grid_rf = {\n", - " \"n_estimators\": [50, 100, 150],\n", - " \"max_depth\": [4, 6, 8],\n", - " \"min_samples_split\": [2, 5, 10],\n", - " \"min_samples_leaf\": [1, 2, 4]\n", - "}\n", - "\n", - "param_grid_xgb = {\n", - 
" \"learning_rate\": [0.01, 0.05, 0.1],\n", - " \"n_estimators\": [50, 100, 150],\n", - " \"max_depth\": [3, 5, 7]\n", - "}\n", - "\n", - "special_targets = {\"CR7\", \"CR11\", \"CR12\", \"CR22\", \"CR27\", \"CR56\"}\n", - "\n", - "model_list = [\n", - " ('RandomForest', RandomForestRegressor(random_state=42), param_grid_rf),\n", - " ('XGBoost', XGBRegressor(random_state=42, eval_metric='rmse'), param_grid_xgb)\n", - "]\n", - "\n", - "for target in output_cols:\n", - " y_train = train_data[target]\n", - " y_test = test_data[target]\n", - "\n", - " try:\n", - " if target in special_targets:\n", - " logging.info(f\"Applying special processing for target: {target}\")\n", - " best_model, best_preds = special_processing(X_train, y_train, X_test, X_predict, target)\n", - " else:\n", - " best_model, best_preds, best_metrics = None, None, None\n", - "\n", - " for name, model, param_grid in model_list:\n", - " logging.info(f\"Training {name} model for target: {target}...\")\n", - " best_model_tmp, preds_tmp = train_and_predict_model(X_train, y_train, X_test, param_grid,\n", - " model_type=name.lower())\n", - "\n", - " mse = mean_squared_error(y_test, preds_tmp)\n", - " mae = mean_absolute_error(y_test, preds_tmp)\n", - " r2 = r2_score(y_test, preds_tmp)\n", - " accuracy = calculate_accuracy(y_test, preds_tmp)\n", - "\n", - " metrics = {\"MSE\": mse, \"MAE\": mae, \"R2\": r2, \"Accuracy\": accuracy}\n", - " logging.info(\n", - " f\"{name} results for {target} - MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}, Accuracy: {accuracy:.4f}\")\n", - "\n", - " if not best_metrics or mse < best_metrics[\"MSE\"]:\n", - " best_model, best_preds, best_metrics = best_model_tmp, preds_tmp, metrics\n", - "\n", - " validations[target] = best_metrics\n", - "\n", - " rf_predictions[target] = best_model.predict(X_predict)\n", - "\n", - " except Exception as e:\n", - " logging.error(f\"Error validating {target} with regression model: {e}\")\n", - " rf_predictions[target] = [None] * len(input_data)\n", - "\n", - "\n", - "\n", - "print(\"Validation Results:\")\n", - "for target, metrics in validations.items():\n", - " if target not in special_targets:\n", - " logging.info(\n", - " f\"{target}: MSE: {metrics['MSE']:.4f}, MAE: {metrics['MAE']:.4f}, R2: {metrics['R2']:.4f}, Accuracy: {metrics['Accuracy']:.4f}\")\n", - " print(\n", - " f\"{target}: MSE: {metrics['MSE']:.4f}, MAE: {metrics['MAE']:.4f}, R2: {metrics['R2']:.4f}, Accuracy: {metrics['Accuracy']:.4f}\")\n", - "\n", - "# Store prediction result\n", - "final_predictions = {\n", - " i: {target: int(round(rf_predictions.get(target, [None] * len(input_data))[i]))\n", - " for target in output_cols\n", - " if rf_predictions.get(target, [None] * len(input_data))[i] is not None}\n", - " for i in range(len(input_data))\n", - "}\n", - "\n", - "# Convert final_predictions to DataFrame\n", - "predicted_df = pd.DataFrame.from_dict(final_predictions, orient=\"index\", columns=list(output_cols))\n", - "\n", - "# Ensure the index aligns with the input data\n", - "predicted_df.index = input_data.index\n", - "\n", - "desired_order = ['CR7', 'CR11', 'CR12', 'CR13', 'CR21', 'CR22', 'CR23', 'CR54', 'CR56']\n", - "\n", - "# Reorder the columns of predicted_df\n", - "predicted_df = predicted_df[desired_order]\n", - "\n", - "# Concatenate inputs_2024 with the predictions\n", - "result = pd.concat([inputs_2024.reset_index(drop=True), predicted_df], axis=1)\n", - "\n", - "# Save to CSV file\n", - "result.to_csv('Regression_2024_data.csv', index=False)" - ], - "metadata": { - "colab": { - 
"base_uri": "https://localhost:8080/" - }, - "id": "2jTGpkrCIww0", - "outputId": "5c986f56-2f12-454b-f6fa-147391c4e12e" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Validation Results:\n", - "CR21: MSE: 0.1496, MAE: 0.2700, R2: 0.0222, Accuracy: 0.6057\n", - "CR54: MSE: 0.4811, MAE: 0.3981, R2: 0.0615, Accuracy: 0.5560\n", - "CR13: MSE: 0.8109, MAE: 0.3047, R2: 0.0195, Accuracy: 0.6613\n", - "CR23: MSE: 0.3489, MAE: 0.1933, R2: 0.0051, Accuracy: 0.7607\n" - ] - } - ] + "databundleVersionId": 16550347, + "datasetId": 9994373, + "sourceId": 15616293, + "sourceType": "datasetVersion" } - ] -} \ No newline at end of file + ], + "dockerImageVersionId": 31328, + "isGpuEnabled": false, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/ISSR_MIE_Classifier/ISSRMIEclassifier/ml/models/enhanced_mie_system.py b/ISSR_MIE_Classifier/ISSRMIEclassifier/ml/models/enhanced_mie_system.py index 97f6707..6e72dc1 100644 --- a/ISSR_MIE_Classifier/ISSRMIEclassifier/ml/models/enhanced_mie_system.py +++ b/ISSR_MIE_Classifier/ISSRMIEclassifier/ml/models/enhanced_mie_system.py @@ -8,12 +8,16 @@ import re import logging import json +from transformers import pipeline class EnhancedMIESystem: - def __init__(self, ollama_url="http://localhost:11434"): + def __init__(self, ollama_url="http://localhost:11434",, use_distilbert=False): self.ollama_url = ollama_url self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2') self.sentiment_analyzer = SentimentIntensityAnalyzer() + self.use_distilbert = use_distilbert + if self.use_distilbert: + self.distilbert_sentiment = pipeline("sentiment-analysis") # Your existing death keywords self.death_keywords = { @@ -93,8 +97,12 @@ def extract_entities(self, text): } def analyze_sentiment_and_keywords(self, text): - """Your existing sentiment + death word analysis""" sentiment = self.sentiment_analyzer.polarity_scores(text) + if self.use_distilbert: + db_result = self.distilbert_sentiment(text[:512])[0] + distilbert_score = db_result['score'] if db_result['label'] == 'POSITIVE' else -db_result['score'] + else: + distilbert_score = None text_lower = text.lower() death_count = sum(1 for word in self.death_keywords if word in text_lower) @@ -109,7 +117,8 @@ def analyze_sentiment_and_keywords(self, text): 'mie_words': mie_count, 'is_negative': sentiment['compound'] < -0.3, 'has_death': death_count >= 1, - 'has_mie': mie_count >= 2 + 'has_mie': mie_count >= 2, + 'distilbert_sentiment': distilbert_score } def create_rag_embeddings(self, articles_df): diff --git a/ISSR_MIE_Classifier/ISSRMIEclassifier/nlp/preprocessing/text_processor.py b/ISSR_MIE_Classifier/ISSRMIEclassifier/nlp/preprocessing/text_processor.py index 1614dbd..9ee92f3 100644 --- a/ISSR_MIE_Classifier/ISSRMIEclassifier/nlp/preprocessing/text_processor.py +++ b/ISSR_MIE_Classifier/ISSRMIEclassifier/nlp/preprocessing/text_processor.py @@ -7,7 +7,7 @@ import nltk from nltk.tokenize import sent_tokenize, word_tokenize from nltk.corpus import stopwords -from nltk.stem import WordNetLemmatizer +from nltk.stem import 
WordNetLemmatizer, PorterStemmer try: nltk.data.find('tokenizers/punkt') @@ -25,6 +25,7 @@ def __init__(self, config: Dict[str, Any]): self.config = config self.stop_words = set(stopwords.words('english')) self.lemmatizer = WordNetLemmatizer() + self.stemmer = PorterStemmer() self.max_length = config["nlp"]["max_text_length"] def clean_text(self, text: str) -> str: @@ -61,6 +62,20 @@ def lemmatize_text(self, text: str) -> str: lemmatized_words = [self.lemmatizer.lemmatize(word) for word in words] return ' '.join(lemmatized_words) + def stem_text(self, text: str) -> str: + """Stem text using Porter Stemmer. + + Stemming is a faster, rule-based approach that strips word suffixes + (e.g. 'running' -> 'run', 'studies' -> 'studi'). It is intentionally + kept separate from lemmatization: applying both would let the stemmer + undo the linguistically precise output of the lemmatizer, so the two + methods should be treated as alternative normalisation strategies rather + than sequential steps. + """ + words = word_tokenize(text) + stemmed_words = [self.stemmer.stem(word) for word in words] + return ' '.join(stemmed_words) + def truncate_text(self, text: str) -> str: """Truncate text to maximum length""" if len(text) <= self.max_length: @@ -78,18 +93,37 @@ def truncate_text(self, text: str) -> str: return truncated.strip() - def preprocess(self, title: str, subject: str, text: str) -> str: - """Complete preprocessing pipeline""" + def preprocess(self, title: str, subject: str, text: str, stemming: bool = False) -> str: + """Complete preprocessing pipeline. + + Args: + title: Article title. + subject: Article subject / topic line. + text: Article body. + stemming: When False (default) the pipeline applies cleaning → + stopword removal → lemmatization → truncation. + When True, lemmatization is replaced by Porter Stemmer + (cleaning → stopword removal → stemming → truncation). + Combining both normalisation strategies is avoided because + the stemmer would aggressively re-stem the precise output + of the lemmatizer, degrading token quality rather than + improving it. + """ # Combine all text fields combined_text = f"{title} {subject} {text}" - - # Apply preprocessing steps + + # Shared early steps cleaned = self.clean_text(combined_text) no_stopwords = self.remove_stopwords(cleaned) - lemmatized = self.lemmatize_text(no_stopwords) - truncated = self.truncate_text(lemmatized) - - return truncated + + if stemming: + # Stemming-only path: faster, more aggressive normalisation + normalised = self.stem_text(no_stopwords) + else: + # Lemmatization-only path: linguistically precise normalisation + normalised = self.lemmatize_text(no_stopwords) + + return self.truncate_text(normalised) def chunk_text(self, text: str, chunk_size: int = None, overlap: int = None) -> List[str]: """Split text into chunks for RAG"""
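
A minimal usage sketch for the toggles these patches introduce, in case it helps when exercising them locally. This is illustrative only: the `TextProcessor` class name and both import paths are assumptions (the class name is not visible in the diff); the `stemming` and `use_hf` keyword arguments and the `config["nlp"]["max_text_length"]` key are taken directly from the patches.

    # Hypothetical driver script; module paths and the TextProcessor name are assumed.
    # Requires the NLTK data (punkt, stopwords, wordnet) that text_processor.py already checks for.
    from ISSRMIEclassifier.nlp.preprocessing.text_processor import TextProcessor  # assumed layout
    from src.sentiment_analysis import analyze_sentiment  # ISSR_Communication_Analysis_Tool package

    config = {"nlp": {"max_text_length": 512}}  # the only config key the processor reads in this diff
    processor = TextProcessor(config)

    text = "The studies were running late."

    # Default path: cleaning -> stopword removal -> lemmatization -> truncation
    lemmatized = processor.preprocess("Title", "Subject", text)

    # Alternative path: Porter stemming replaces lemmatization (never both,
    # per the docstring added in text_processor.py)
    stemmed = processor.preprocess("Title", "Subject", text, stemming=True)

    # VADER compound score by default; use_hf=True lazily loads the Hugging Face
    # sentiment pipeline and returns a signed confidence instead.
    vader_score = analyze_sentiment("I feel hopeful today.")
    hf_score = analyze_sentiment("I feel hopeful today.", use_hf=True)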