Skip to content

Commit 660136a

Browse files
committed
draft
1 parent 373d86e commit 660136a

12 files changed

+4989
-3
lines changed

.gitignore

+1-1
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ cython_debug/
165165
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166166
# and can be added to the global gitignore or merged into this file. For a more nuclear
167167
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
168-
#.idea/
168+
.idea/
169169

170170
# PyPI configuration file
171171
.pypirc

README.md

+80-2
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,80 @@
1-
# project-nlp-log-classification
2-
Log classification using hybrid classification framework
1+
2+
# Log Classification With Hybrid Classification Framework
3+
4+
This project implements a hybrid log classification system, combining three complementary approaches to handle varying levels of complexity in log patterns. The classification methods ensure flexibility and effectiveness in processing predictable, complex, and poorly-labeled data patterns.
5+
6+
---
7+
8+
## Classification Approaches
9+
10+
1. **Regular Expression (Regex)**:
11+
- Handles the most simplified and predictable patterns.
12+
- Useful for patterns that are easily captured using predefined rules.
13+
14+
2. **Sentence Transformer + Logistic Regression**:
15+
- Manages complex patterns when there is sufficient training data.
16+
- Utilizes embeddings generated by Sentence Transformers and applies Logistic Regression as the classification layer.
17+
18+
3. **LLM (Large Language Models)**:
19+
- Used for handling complex patterns when sufficient labeled training data is not available.
20+
- Provides a fallback or complementary approach to the other methods.
21+
22+
---
23+
24+
## Folder Structure
25+
26+
1. **`training/`**:
27+
- Contains the code for training models using Sentence Transformer and Logistic Regression.
28+
- Includes the code for regex-based classification.
29+
30+
2. **`models/`**:
31+
- Stores the saved models, including Sentence Transformer embeddings and the Logistic Regression model.
32+
33+
3. **`resources/`**:
34+
- This folder contains resource files such as test CSV files, output files, images, etc.
35+
36+
4. **Root Directory**:
37+
- Contains the FastAPI server code (`server.py`).
38+
39+
---
40+
41+
## Setup Instructions
42+
43+
1. **Install Dependencies**:
44+
Make sure you have Python installed on your system. Install the required Python libraries by running the following command:
45+
46+
```bash
47+
pip install -r requirements.txt
48+
```
49+
50+
2. **Run the FastAPI Server**:
51+
To start the server, use the following command:
52+
53+
```bash
54+
uvicorn server:app --reload
55+
```
56+
57+
Once the server is running, you can access the API at:
58+
- `http://127.0.0.1:8000/classify/` (Classification endpoint; accepts `POST` requests with a CSV upload)
59+
- `http://127.0.0.1:8000/docs` (Interactive Swagger documentation)
60+
- `http://127.0.0.1:8000/redoc` (Alternative API documentation)
61+
62+
---
63+
64+
## Usage
65+
66+
Upload a CSV file containing logs to the `POST /classify/` endpoint (as the `file` form field) for classification. Ensure the file has the following columns:
67+
- `source`
68+
- `log_message`
69+
70+
The output will be a CSV file with an additional column `target_label`, which represents the classified label for each log entry.
71+
72+
---
73+
74+
## Disclaimer
75+
76+
**Copyrights Reserved**:
77+
@Codebasics Inc
78+
@LearnerX Pvt Ltd
79+
80+
This project, including its code and resources, is intended solely for educational purposes and should not be used for any commercial purposes without proper authorization.

classify.py

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
from processor_regex import classify_with_regex
2+
from processor_bert import classify_with_bert
3+
from processor_llm import classify_with_llm
4+
5+
def classify(logs):
    """Classify a batch of log records.

    Args:
        logs: Iterable of ``(source, log_message)`` pairs.

    Returns:
        A list with one label per input pair, in input order, each produced
        by ``classify_log``.
    """
    return [classify_log(source, log_msg) for source, log_msg in logs]
11+
12+
13+
def classify_log(source, log_msg):
    """Route one log record to a classifier and return its label.

    Records whose source is "LegacyCRM" go straight to the LLM classifier.
    Every other record is tried against the regex rules first; when that
    step yields a falsy result the embedding-based classifier decides.
    """
    if source == "LegacyCRM":
        return classify_with_llm(log_msg)

    regex_label = classify_with_regex(log_msg)
    return regex_label if regex_label else classify_with_bert(log_msg)
21+
22+
def classify_csv(input_file, output_file="output.csv"):
    """Classify every row of a CSV file and write a labelled copy to disk.

    Args:
        input_file: Path or file-like object of a CSV that has ``source``
            and ``log_message`` columns.
        output_file: Where the labelled CSV is written. Defaults to
            ``"output.csv"`` in the current working directory.

    Returns:
        ``output_file``, the location of the written CSV, which carries an
        additional ``target_label`` column.
    """
    import pandas as pd

    # skipinitialspace=True: rows written as `source, "message"` have a space
    # between the delimiter and the opening quote. Without this option the
    # quotes are not recognised, so the parsed message keeps a leading space
    # and literal quote characters, which then reach the classifiers.
    df = pd.read_csv(input_file, skipinitialspace=True)

    # Perform classification
    df["target_label"] = classify(list(zip(df["source"], df["log_message"])))

    # Save the modified file
    df.to_csv(output_file, index=False)

    return output_file
34+
35+
if __name__ == '__main__':
    # Script entry point: classify the rows of "test.csv", resolved against
    # the current working directory.
    # NOTE(review): this commit adds the sample file as resources/test.csv;
    # confirm which working directory this is meant to be run from.
    classify_csv("test.csv")
    # Inline sample kept for ad-hoc checks of classify() without a CSV file:
    # logs = [
    #     ("ModernCRM", "IP 192.168.133.114 blocked due to potential attack"),
    #     ("BillingSystem", "User 12345 logged in."),
    #     ("AnalyticsEngine", "File data_6957.csv uploaded successfully by user User265."),
    #     ("AnalyticsEngine", "Backup completed successfully."),
    #     ("ModernHR", "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1 RCODE 200 len: 1583 time: 0.1878400"),
    #     ("ModernHR", "Admin access escalation detected for user 9429"),
    #     ("LegacyCRM", "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active."),
    #     ("LegacyCRM", "Invoice generation process aborted for order ID 8910 due to invalid tax calculation module."),
    #     ("LegacyCRM", "The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' for improved functionality."),
    #     ("LegacyCRM", " The 'ReportGenerator' module will be retired in version 4.0. Please migrate to the 'AdvancedAnalyticsSuite' by Dec 2025")
    # ]
    # labels = classify(logs)
    #
    # for log, label in zip(logs, labels):
    #     print(log[0], "->", label)
53+
54+

models/log_classifier.joblib

16.1 KB
Binary file not shown.

processor_bert.py

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import joblib
2+
from sentence_transformers import SentenceTransformer
3+
4+
# Sentence embedding model, created once at import time and reused by every call.
model_embedding = SentenceTransformer('all-MiniLM-L6-v2') # Lightweight embedding model
5+
# Trained classifier loaded from disk at import time; the relative path is
# resolved against the current working directory.
# NOTE(review): joblib.load unpickles the file, so only load model files
# that come from a trusted source.
model_classification = joblib.load("models/log_classifier.joblib")
6+
7+
8+
def classify_with_bert(log_message, threshold=0.5):
    """Classify a log message from its sentence embedding.

    Args:
        log_message: The raw log message text.
        threshold: Minimum class probability required to accept the
            prediction. Defaults to 0.5.

    Returns:
        The most probable class label, or "Unclassified" when the highest
        class probability is below ``threshold``.
    """
    embeddings = model_embedding.encode([log_message])
    probabilities = model_classification.predict_proba(embeddings)[0]

    # Take the label from the probabilities that were just computed instead
    # of running a second inference pass with predict(); this also keeps the
    # returned label consistent with the confidence that was thresholded.
    best_index = probabilities.argmax()
    if probabilities[best_index] < threshold:
        return "Unclassified"

    return model_classification.classes_[best_index]
16+
17+
18+
if __name__ == "__main__":
    # Manual smoke test: print the predicted label for a few sample messages.
    sample_logs = (
        "alpha.osapi_compute.wsgi.server - 12.10.11.1 - API returned 404 not found error",
        "GET /v2/3454/servers/detail HTTP/1.1 RCODE 404 len: 1583 time: 0.1878400",
        "System crashed due to drivers errors when restarting the server",
        "Hey bro, chill ya!",
        "Multiple login failures occurred on user 6454 account",
        "Server A790 was restarted unexpectedly during the process of data transfer",
    )
    for message in sample_logs:
        print(message, "->", classify_with_bert(message))

processor_llm.py

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
from dotenv import load_dotenv
2+
from groq import Groq
3+
4+
# Load variables from a .env file into the process environment before the
# API client below is created.
load_dotenv()
5+
6+
# Module-level Groq API client shared by every classify_with_llm call.
# NOTE(review): presumably picks up its API key from the environment
# populated by load_dotenv() — confirm.
groq = Groq()
7+
8+
def classify_with_llm(log_msg):
    """Classify a log message with a Groq-hosted LLM.

    The model is asked to answer with exactly one of "Workflow Error",
    "Deprecation Warning" or "Unclassified".

    Args:
        log_msg: The raw log message text.

    Returns:
        The model's reply with surrounding whitespace removed, or
        "Unclassified" when the reply is missing or empty.
    """
    prompt = f'''Classify the log message into one of these categories:
    (1) Workflow Error, (2) Deprecation Warning.
    If you can't figure out a category, return "Unclassified".
    Only return the category name. No preamble.
    Log message: {log_msg}'''

    chat_completion = groq.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="llama-3.3-70b-versatile",
        temperature=0.5
    )

    # The reply is free text: trim stray whitespace/newlines so the label
    # compares cleanly downstream, and never hand back None or "".
    reply = chat_completion.choices[0].message.content
    label = (reply or "").strip()
    return label or "Unclassified"
27+
28+
29+
if __name__ == "__main__":
    # Manual smoke test: one message per expected category.
    sample_messages = (
        "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
        "The 'ReportGenerator' module will be retired in version 4.0. Please migrate to the 'AdvancedAnalyticsSuite' by Dec 2025",
        "System reboot initiated by user 12345.",
    )
    for message in sample_messages:
        print(classify_with_llm(message))

processor_regex.py

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import re
2+
# (pattern, label) rules, compiled once at import time and tried in order.
# Sentence-ending periods are escaped so they match a literal "." only.
_REGEX_RULES = tuple(
    (re.compile(pattern), label)
    for pattern, label in (
        (r"User User\d+ logged (in|out)\.", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        (r"Backup completed successfully\.", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"File .* uploaded successfully by user .*", "System Notification"),
        (r"Disk cleanup completed successfully\.", "System Notification"),
        (r"System reboot initiated by user .*", "System Notification"),
        (r"Account with ID .* created by .*", "User Action"),
    )
)


def classify_with_regex(log_message):
    """Classify a log message with fixed regular-expression rules.

    Args:
        log_message: The raw log message text.

    Returns:
        The label of the first rule whose pattern is found anywhere in
        ``log_message``, or ``None`` when no rule matches.
    """
    for pattern, label in _REGEX_RULES:
        if pattern.search(log_message):
            return label
    return None
17+
18+
if __name__ == "__main__":
    # Manual smoke test: two messages covered by the rules and one that is not.
    for sample in (
        "Backup completed successfully.",
        "Account with ID 1234 created by User1.",
        "Hey Bro, chill ya!",
    ):
        print(classify_with_regex(sample))
22+
23+

resources/output.csv

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
source,log_message,target_label
2+
ModernCRM," ""IP 192.168.133.114 blocked due to potential attack""",Security Alert
3+
BillingSystem," ""User 12345 logged in.""",Security Alert
4+
AnalyticsEngine," ""File data_6957.csv uploaded successfully by user User265.""",System Notification
5+
AnalyticsEngine," ""Backup completed successfully.""",System Notification
6+
ModernHR," ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1 RCODE 200 len: 1583 time: 0.1878400""",HTTP Status
7+
ModernHR," ""Admin access escalation detected for user 9429""",Security Alert
8+
LegacyCRM," ""Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.""",Workflow Error
9+
LegacyCRM," ""Invoice generation process aborted for order ID 8910 due to invalid tax calculation module.""",Workflow Error
10+
LegacyCRM," ""The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' for improved functionality.""",Deprecation Warning
11+
LegacyCRM," ""The 'ReportGenerator' module will be retired in version 4.0. Please migrate to the 'AdvancedAnalyticsSuite' by Dec 2025""",Deprecation Warning

resources/test.csv

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
source,log_message
2+
ModernCRM, "IP 192.168.133.114 blocked due to potential attack"
3+
BillingSystem, "User 12345 logged in."
4+
AnalyticsEngine, "File data_6957.csv uploaded successfully by user User265."
5+
AnalyticsEngine, "Backup completed successfully."
6+
ModernHR, "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1 RCODE 200 len: 1583 time: 0.1878400"
7+
ModernHR, "Admin access escalation detected for user 9429"
8+
LegacyCRM, "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active."
9+
LegacyCRM, "Invoice generation process aborted for order ID 8910 due to invalid tax calculation module."
10+
LegacyCRM, "The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' for improved functionality."
11+
LegacyCRM, "The 'ReportGenerator' module will be retired in version 4.0. Please migrate to the 'AdvancedAnalyticsSuite' by Dec 2025"

server.py

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import pandas as pd
2+
from fastapi import FastAPI, UploadFile, HTTPException
3+
from fastapi.responses import FileResponse
4+
5+
from classify import classify
6+
7+
# FastAPI application object; the route below is registered on it.
app = FastAPI()
8+
9+
@app.post("/classify/")
async def classify_logs(file: UploadFile):
    """Classify the logs in an uploaded CSV and return the labelled CSV.

    Args:
        file: Uploaded CSV with ``source`` and ``log_message`` columns.

    Returns:
        A ``FileResponse`` serving ``resources/output.csv``, i.e. the input
        rows plus a ``target_label`` column.

    Raises:
        HTTPException: 400 when the upload is not a ``.csv`` file or lacks a
            required column; 500 when reading, classifying or saving fails.
    """
    # The filename may be missing on an upload; treat that like a non-CSV file
    # instead of failing with an AttributeError.
    if not (file.filename or "").lower().endswith('.csv'):
        raise HTTPException(status_code=400, detail="File must be a CSV.")

    try:
        # Read the uploaded CSV. skipinitialspace=True lets rows written as
        # `source, "message"` parse with their quotes recognised, instead of
        # leaving a leading space and literal quotes inside the message.
        df = pd.read_csv(file.file, skipinitialspace=True)
        if "source" not in df.columns or "log_message" not in df.columns:
            raise HTTPException(status_code=400, detail="CSV must contain 'source' and 'log_message' columns.")

        # Perform classification
        df["target_label"] = classify(list(zip(df["source"], df["log_message"])))

        print("Dataframe:",df.to_dict())

        # Save the modified file
        output_file = "resources/output.csv"
        df.to_csv(output_file, index=False)
        print("File saved to output.csv")
        return FileResponse(output_file, media_type='text/csv')
    except HTTPException:
        # Deliberate HTTP errors (the 400 above) must reach the client
        # unchanged rather than being rewrapped as a 500 below.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        file.file.close()
        # NOTE: the output file is not deleted here because FileResponse
        # reads it after this handler has returned.

0 commit comments

Comments
 (0)