draft

dhavalsays · dhavalsays · commit 660136a83094 · 2025-01-16T13:30:45.000-05:00
diff --git a/.gitignore b/.gitignore
@@ -165,7 +165,7 @@ cython_debug/
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
 
 # PyPI configuration file
 .pypirc
diff --git a/README.md b/README.md
@@ -1,2 +1,80 @@
-# project-nlp-log-classification
-Log classification using hybrid classification framework
+
+# Log Classification With Hybrid Classification Framework
+
+This project implements a hybrid log classification system, combining three complementary approaches to handle varying levels of complexity in log patterns. The classification methods ensure flexibility and effectiveness in processing predictable, complex, and poorly-labeled data patterns.
+
+---
+
+## Classification Approaches
+
+1. **Regular Expression (Regex)**:
+   - Handles the simplest, most predictable patterns.
+   - Useful for patterns that are easily captured using predefined rules.
+
+2. **Sentence Transformer + Logistic Regression**:
+   - Manages complex patterns when there is sufficient training data.
+   - Utilizes embeddings generated by Sentence Transformers and applies Logistic Regression as the classification layer.
+
+3. **LLM (Large Language Model)**:
+   - Used for handling complex patterns when sufficient labeled training data is not available.
+   - Provides a fallback or complementary approach to the other methods.
+
+---
+
+## Folder Structure
+
+1. **`training/`**:
+   - Contains the code for training models using Sentence Transformer and Logistic Regression.
+   - Includes the code for regex-based classification.
+
+2. **`models/`**:
+   - Stores the saved models, including Sentence Transformer embeddings and the Logistic Regression model.
+
+3. **`resources/`**:
+   - This folder contains resource files such as test CSV files, output files, images, etc.
+
+4. **Root Directory**:
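+   - Contains the FastAPI server code (`server.py`), the classification entry point (`classify.py`), and the three processors (`processor_regex.py`, `processor_bert.py`, `processor_llm.py`).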
+
+---
+
+## Setup Instructions
+
+1. **Install Dependencies**:
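+   Make sure you have Python installed on your system. The LLM classifier (`processor_llm.py`) loads environment variables from a `.env` file, so put your Groq API key there (the Groq client reads `GROQ_API_KEY`). Install the required Python libraries by running the following command: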
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+2. **Run the FastAPI Server**:
+   To start the server, use the following command:
+
+   ```bash
+   uvicorn server:app --reload
+   ```
+
+   Once the server is running, you can access the API at:
+   - `http://127.0.0.1:8000/classify/` (Classification endpoint; accepts a `POST` request with a CSV file upload)
+   - `http://127.0.0.1:8000/docs` (Interactive Swagger documentation)
+   - `http://127.0.0.1:8000/redoc` (Alternative API documentation)
+
+---
+
+## Usage
+
+Upload a CSV file containing logs to the `POST /classify/` endpoint for classification. Ensure the file has the following columns:
+- `source`
+- `log_message`
+
+The output will be a CSV file with an additional column `target_label`, which represents the classified label for each log entry.
+
+---
+
+## Disclaimer
+
+**Copyright Reserved**:  
+@Codebasics Inc  
+@LearnerX Pvt Ltd  
+
+This project, including its code and resources, is intended solely for educational purposes and should not be used for any commercial purposes without proper authorization.
diff --git a/classify.py b/classify.py
@@ -0,0 +1,54 @@
+from processor_regex import classify_with_regex
+from processor_bert import classify_with_bert
+from processor_llm import classify_with_llm
+
+def classify(logs):
+    """Classify a batch of logs.
+
+    Args:
+        logs: Iterable of ``(source, log_message)`` pairs.
+
+    Returns:
+        A list holding one label per input pair, in input order.
+    """
+    return [classify_log(origin, message) for origin, message in logs]
+
+
+def classify_log(source, log_msg):
+    """Route a single log message to a classifier and return its label.
+
+    Messages whose source is "LegacyCRM" are sent to the LLM classifier.
+    Every other source is tried against the regex rules first; when those
+    return a falsy result, the BERT-based classifier decides instead.
+    """
+    if source == "LegacyCRM":
+        return classify_with_llm(log_msg)
+    # `or` falls through to the BERT classifier only when regex gave nothing.
+    return classify_with_regex(log_msg) or classify_with_bert(log_msg)
+
+def classify_csv(input_file, output_file="output.csv"):
+    """Classify every row of a CSV file and write the labelled copy to disk.
+
+    Args:
+        input_file: Path of a CSV with ``source`` and ``log_message`` columns.
+        output_file: Destination of the labelled CSV. Defaults to
+            ``"output.csv"``, the path that was previously hard-coded.
+
+    Returns:
+        The ``output_file`` value the result was written to.
+    """
+    import pandas as pd
+
+    df = pd.read_csv(input_file)
+
+    # One label per row, stored in a new column next to the inputs.
+    df["target_label"] = classify(list(zip(df["source"], df["log_message"])))
+
+    df.to_csv(output_file, index=False)
+    return output_file
+
+if __name__ == '__main__':
+    # Manual smoke run: classify a CSV and write the labelled result.
+    # NOTE(review): this commit adds the sample CSV as resources/test.csv --
+    # confirm that "test.csv" resolves from the directory this is run in.
+    classify_csv("test.csv")
+    # Alternative in-memory demo, currently disabled:
+    # logs = [
+    #     ("ModernCRM", "IP 192.168.133.114 blocked due to potential attack"),
+    #     ("BillingSystem", "User 12345 logged in."),
+    #     ("AnalyticsEngine", "File data_6957.csv uploaded successfully by user User265."),
+    #     ("AnalyticsEngine", "Backup completed successfully."),
+    #     ("ModernHR", "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1 RCODE  200 len: 1583 time: 0.1878400"),
+    #     ("ModernHR", "Admin access escalation detected for user 9429"),
+    #     ("LegacyCRM", "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active."),
+    #     ("LegacyCRM", "Invoice generation process aborted for order ID 8910 due to invalid tax calculation module."),
+    #     ("LegacyCRM", "The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' for improved functionality."),
+    #     ("LegacyCRM", " The 'ReportGenerator' module will be retired in version 4.0. Please migrate to the 'AdvancedAnalyticsSuite' by Dec 2025")
+    # ]
+    # labels = classify(logs)
+    #
+    # for log, label in zip(logs, labels):
+    #     print(log[0], "->", label)
+
+
diff --git a/models/log_classifier.joblib b/models/log_classifier.joblib
diff --git a/processor_bert.py b/processor_bert.py
@@ -0,0 +1,29 @@
+import joblib
+from sentence_transformers import SentenceTransformer
+
+# Both models are loaded once, when this module is imported.
+model_embedding = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight embedding model
+# Relative path: resolved against the process's current working directory.
+model_classification = joblib.load("models/log_classifier.joblib")
+
+
+def classify_with_bert(log_message, threshold=0.5):
+    """Classify a log message with sentence embeddings plus the saved classifier.
+
+    Args:
+        log_message: Text of the log entry.
+        threshold: Minimum probability the best class must reach; below it
+            the message is reported as "Unclassified". Defaults to 0.5.
+
+    Returns:
+        The predicted label, or "Unclassified" when the classifier is not
+        confident enough.
+    """
+    embeddings = model_embedding.encode([log_message])
+    probabilities = model_classification.predict_proba(embeddings)[0]
+    best = probabilities.argmax()
+    if probabilities[best] < threshold:
+        return "Unclassified"
+    # predict_proba columns follow classes_, so the label can be read from the
+    # probabilities already computed instead of running predict() again.
+    return model_classification.classes_[best]
+
+
+if __name__ == "__main__":
+    # Manual check: print the predicted label for a few sample messages.
+    sample_messages = (
+        "alpha.osapi_compute.wsgi.server - 12.10.11.1 - API returned 404 not found error",
+        "GET /v2/3454/servers/detail HTTP/1.1 RCODE   404 len: 1583 time: 0.1878400",
+        "System crashed due to drivers errors when restarting the server",
+        "Hey bro, chill ya!",
+        "Multiple login failures occurred on user 6454 account",
+        "Server A790 was restarted unexpectedly during the process of data transfer",
+    )
+    for message in sample_messages:
+        print(message, "->", classify_with_bert(message))
diff --git a/processor_llm.py b/processor_llm.py
@@ -0,0 +1,34 @@
+from dotenv import load_dotenv
+from groq import Groq
+
+# Load variables from a .env file into the environment before creating the client.
+load_dotenv()
+
+# Module-level Groq client used by classify_with_llm.
+groq = Groq()
+
+def classify_with_llm(log_msg):
+    """Classify a log message by prompting an LLM through the Groq client.
+
+    The prompt asks the model to answer with exactly one of
+    "Workflow Error", "Deprecation Warning" or "Unclassified".
+
+    Args:
+        log_msg: Text of the log entry.
+
+    Returns:
+        The model's answer with surrounding whitespace removed, or
+        "Unclassified" when the response carries no text.
+    """
+    prompt = f'''Classify the log message into one of these categories: 
+    (1) Workflow Error, (2) Deprecation Warning.
+    If you can't figure out a category, return "Unclassified". 
+    Only return the category name. No preamble. 
+    Log message: {log_msg}'''
+
+    chat_completion = groq.chat.completions.create(
+        messages=[{"role": "user", "content": prompt}],
+        model="llama-3.3-70b-versatile",
+        temperature=0.5
+    )
+
+    content = chat_completion.choices[0].message.content
+    # Strip stray whitespace/newlines so the label compares cleanly downstream;
+    # an empty or missing answer maps to the prompt's own fallback label.
+    return content.strip() if content else "Unclassified"
+
+
+if __name__ == "__main__":
+    # Manual check: send a few sample messages to the LLM and print each answer.
+    for sample in (
+        "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
+        "The 'ReportGenerator' module will be retired in version 4.0. Please migrate to the 'AdvancedAnalyticsSuite' by Dec 2025",
+        "System reboot initiated by user 12345.",
+    ):
+        print(classify_with_llm(sample))
diff --git a/processor_regex.py b/processor_regex.py
@@ -0,0 +1,23 @@
+import re
+# Ordered (pattern, label) rules, compiled once at import time.
+# Literal full stops are escaped so "." no longer matches any character.
+_REGEX_RULES = [
+    (re.compile(r"User User\d+ logged (in|out)\."), "User Action"),
+    (re.compile(r"Backup (started|ended) at .*"), "System Notification"),
+    (re.compile(r"Backup completed successfully\."), "System Notification"),
+    (re.compile(r"System updated to version .*"), "System Notification"),
+    (re.compile(r"File .* uploaded successfully by user .*"), "System Notification"),
+    (re.compile(r"Disk cleanup completed successfully\."), "System Notification"),
+    (re.compile(r"System reboot initiated by user .*"), "System Notification"),
+    (re.compile(r"Account with ID .* created by .*"), "User Action"),
+]
+
+
+def classify_with_regex(log_message):
+    """Classify a log message against a fixed list of regex rules.
+
+    Rules are tried in order and the first pattern found anywhere in the
+    message wins.
+
+    Args:
+        log_message: Text of the log entry.
+
+    Returns:
+        The label of the first matching rule, or None when no rule matches.
+    """
+    for pattern, label in _REGEX_RULES:
+        if pattern.search(log_message):
+            return label
+    return None
+
+if __name__ == "__main__":
+    # Manual check: two messages covered by the rules and one that is not.
+    for sample in (
+        "Backup completed successfully.",
+        "Account with ID 1234 created by User1.",
+        "Hey Bro, chill ya!",
+    ):
+        print(classify_with_regex(sample))
+    print(classify_with_regex("Hey Bro, chill ya!"))
+
+
diff --git a/resources/output.csv b/resources/output.csv
@@ -0,0 +1,11 @@
+source,log_message,target_label
+ModernCRM," ""IP 192.168.133.114 blocked due to potential attack""",Security Alert
+BillingSystem," ""User 12345 logged in.""",Security Alert
+AnalyticsEngine," ""File data_6957.csv uploaded successfully by user User265.""",System Notification
+AnalyticsEngine," ""Backup completed successfully.""",System Notification
+ModernHR," ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1 RCODE  200 len: 1583 time: 0.1878400""",HTTP Status
+ModernHR," ""Admin access escalation detected for user 9429""",Security Alert
+LegacyCRM," ""Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.""",Workflow Error
+LegacyCRM," ""Invoice generation process aborted for order ID 8910 due to invalid tax calculation module.""",Workflow Error
+LegacyCRM," ""The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' for improved functionality.""",Deprecation Warning
+LegacyCRM," ""The 'ReportGenerator' module will be retired in version 4.0. Please migrate to the 'AdvancedAnalyticsSuite' by Dec 2025""",Deprecation Warning
diff --git a/resources/test.csv b/resources/test.csv
@@ -0,0 +1,11 @@
+source,log_message
+ModernCRM, "IP 192.168.133.114 blocked due to potential attack"
+BillingSystem, "User 12345 logged in."
+AnalyticsEngine, "File data_6957.csv uploaded successfully by user User265."
+AnalyticsEngine, "Backup completed successfully."
+ModernHR, "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1 RCODE  200 len: 1583 time: 0.1878400"
+ModernHR, "Admin access escalation detected for user 9429"
+LegacyCRM, "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active."
+LegacyCRM, "Invoice generation process aborted for order ID 8910 due to invalid tax calculation module."
+LegacyCRM, "The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' for improved functionality."
+LegacyCRM, "The 'ReportGenerator' module will be retired in version 4.0. Please migrate to the 'AdvancedAnalyticsSuite' by Dec 2025"
diff --git a/server.py b/server.py
@@ -0,0 +1,36 @@
+import pandas as pd
+from fastapi import FastAPI, UploadFile, HTTPException
+from fastapi.responses import FileResponse
+
+from classify import classify
+
+app = FastAPI()
+
+@app.post("/classify/")
+async def classify_logs(file: UploadFile):
+    """Classify the logs in an uploaded CSV and return the labelled CSV.
+
+    The upload must be a ``.csv`` file with ``source`` and ``log_message``
+    columns. The response is the same data with a ``target_label`` column
+    added.
+
+    Raises:
+        HTTPException: 400 when the upload is not a CSV or lacks the required
+            columns; 500 when reading, classifying or writing fails.
+    """
+    # filename can be missing on an upload; treat that as "not a CSV".
+    if not file.filename or not file.filename.endswith('.csv'):
+        raise HTTPException(status_code=400, detail="File must be a CSV.")
+
+    try:
+        # Read the uploaded CSV
+        df = pd.read_csv(file.file)
+        if "source" not in df.columns or "log_message" not in df.columns:
+            raise HTTPException(status_code=400, detail="CSV must contain 'source' and 'log_message' columns.")
+
+        # Perform classification
+        df["target_label"] = classify(list(zip(df["source"], df["log_message"])))
+
+        print("Dataframe:",df.to_dict())
+
+        # Save the modified file
+        # NOTE(review): every request writes the same path, so concurrent
+        # uploads can overwrite each other's result.
+        output_file = "resources/output.csv"
+        df.to_csv(output_file, index=False)
+        print("File saved to output.csv")
+        return FileResponse(output_file, media_type='text/csv')
+    except HTTPException:
+        # Deliberate client errors (the 400 above) must not be turned into
+        # a 500 by the generic handler below.
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e)) from e
+    finally:
+        file.file.close()
+        # # Clean up if the file was saved
+        # if os.path.exists("output.csv"):
+        #     os.remove("output.csv")
diff --git a/training/dataset/synthetic_logs.csv b/training/dataset/synthetic_logs.csv
diff --git a/training/log_classification.ipynb b/training/log_classification.ipynb