-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
15 changed files
with
13,269 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
**/__pycache__ | ||
**/.classpath | ||
**/.dockerignore | ||
**/.env | ||
**/.git | ||
**/.gitignore | ||
**/.project | ||
**/.settings | ||
**/.toolstarget | ||
**/.vs | ||
**/.vscode | ||
**/*.*proj.user | ||
**/*.dbmdl | ||
**/*.jfm | ||
**/bin | ||
**/charts | ||
**/docker-compose* | ||
**/compose* | ||
**/Dockerfile* | ||
**/node_modules | ||
**/npm-debug.log | ||
**/obj | ||
**/secrets.dev.yaml | ||
**/values.dev.yaml | ||
README.md |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
{ | ||
"configurations": [ | ||
{ | ||
"name": "Docker: Python - General", | ||
"type": "docker", | ||
"request": "launch", | ||
"preLaunchTask": "docker-run: debug", | ||
"python": { | ||
"pathMappings": [ | ||
{ | ||
"localRoot": "${workspaceFolder}", | ||
"remoteRoot": "/app" | ||
} | ||
], | ||
"projectType": "general" | ||
} | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
{ | ||
"version": "2.0.0", | ||
"tasks": [ | ||
{ | ||
"type": "docker-build", | ||
"label": "docker-build", | ||
"platform": "python", | ||
"dockerBuild": { | ||
"tag": "languagedetectorv02:latest", | ||
"dockerfile": "${workspaceFolder}/Dockerfile", | ||
"context": "${workspaceFolder}", | ||
"pull": true | ||
} | ||
}, | ||
{ | ||
"type": "docker-run", | ||
"label": "docker-run: debug", | ||
"dependsOn": [ | ||
"docker-build" | ||
], | ||
"python": { | ||
"file": "predict.py" | ||
} | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
FROM python:3.8 | ||
EXPOSE 5005 | ||
|
||
COPY requirements.txt . | ||
RUN python -m pip install --upgrade pip | ||
RUN python -m pip install -r requirements.txt | ||
|
||
WORKDIR /app | ||
COPY . /app | ||
|
||
CMD ["python", "predict.py"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
FROM language_detector | ||
EXPOSE 5005 | ||
|
||
COPY requirements.txt . | ||
RUN python -m pip install --upgrade pip | ||
RUN python -m pip install -r requirements.txt | ||
|
||
WORKDIR /app | ||
COPY . /app | ||
|
||
CMD ["python", "predict.py"] |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
import re | ||
import string | ||
|
||
class Cleaner:
    """Normalise raw text before it is vectorised for language detection.

    All configuration lives in class attributes, and ``clean`` is a static
    method, so it can be called either as ``Cleaner.clean(text)`` or on an
    instance (``Cleaner().clean(text)``).
    """

    # Extra punctuation/symbols (including Arabic marks such as "؛", "،", "؟")
    # that are stripped in addition to ASCII punctuation.
    arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
    english_punctuations = string.punctuation
    punctuations_list = arabic_punctuations + english_punctuations
    # Translation table that deletes every listed character in a single pass.
    translator = str.maketrans('', '', punctuations_list)
    # Raw string: "\S" inside a normal literal is an invalid escape sequence.
    url_pattern = re.compile(r'https?://\S+|www\.\S+')

    def __init__(self):
        pass

    @staticmethod
    def clean(sentance):
        """Return ``sentance`` lower-cased with URLs, punctuation and digits removed.

        Steps, in order: replace each literal ``[]`` pair with a space,
        lower-case, drop URLs, delete every character in
        ``punctuations_list``, delete ASCII digits, then strip leading and
        trailing whitespace.

        Args:
            sentance: The raw input string.

        Returns:
            The cleaned string (possibly empty).
        """
        # The original pattern "[[]]" parses as the set "[[]" followed by "]",
        # i.e. the literal two-character sequence "[]"; spell that out
        # explicitly to avoid the nested-set FutureWarning.
        text = re.sub(r'\[\]', ' ', sentance)
        text = text.lower()
        text = Cleaner.url_pattern.sub('', text)  # remove URLs
        text = text.translate(Cleaner.translator)  # remove punctuation
        text = re.sub('[0-9]', '', text)  # remove ASCII digits
        return text.strip()
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
#%% | ||
from tabnanny import verbose | ||
import pandas as pd | ||
import numpy as np | ||
import re | ||
import string | ||
|
||
from cleaner import Cleaner | ||
|
||
import pickle | ||
from sklearn.preprocessing import LabelEncoder | ||
from sklearn.feature_extraction.text import CountVectorizer | ||
from sklearn.model_selection import train_test_split | ||
from sklearn.naive_bayes import MultinomialNB | ||
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier | ||
from sklearn.svm import SVC | ||
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score | ||
|
||
#%% | ||
# loading dataset:
data_set = "data/languages.csv"
data = pd.read_csv(data_set)
print(data.head(10))
print(data["Language"].value_counts())

#%%
# Normalise every sample with the shared Cleaner so training and serving use
# the same preprocessing. ``clean`` takes no ``self``, so it is invoked on
# the class rather than as a bound method of the instance.
cleaner = Cleaner()
data['Text_clean'] = data['Text'].apply(Cleaner.clean)

#%%
# Construct X, y: 90/10 split, stratified on the language label so each
# language keeps the same proportion in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    data['Text_clean'],
    data["Language"],
    random_state=42,
    test_size=0.1,
    stratify=data["Language"].values,
)

#%%
# Bag-of-words features: the vocabulary is fitted on the training split only.
cv = CountVectorizer()
X_train_ids = cv.fit_transform(X_train).toarray()
X_test_ids = cv.transform(X_test).toarray()

# Integer-encode the language labels; the encoder is fitted on the training
# labels and reused for the test labels.
le = LabelEncoder()
y_train_ids = le.fit_transform(y_train)
y_test_ids = le.transform(y_test)
#%%
print(X_train_ids.shape, y_train_ids.shape)
print(X_test_ids.shape, y_test_ids.shape)
#%% | ||
def print_scores(name, model):
    """Print accuracy, F1, precision and recall of ``model`` on the test split.

    Predictions are made on the module-level ``X_test_ids`` and compared with
    ``y_test_ids``; F1, precision and recall use weighted averaging.

    Args:
        name: Label printed in the report header.
        model: A fitted estimator exposing ``predict``.
    """
    predicted = model.predict(X_test_ids)
    expected = y_test_ids
    weighted = {"average": 'weighted'}
    # (printed label, metric function, extra keyword arguments)
    report = (
        ("Accuracy: ", accuracy_score, {}),
        ("f1-score: ", f1_score, weighted),
        ("Precision: ", precision_score, weighted),
        ("Recall: ", recall_score, weighted),
    )
    print(f"{name} model scores:")
    for label, metric, options in report:
        print(label, metric(expected, predicted, **options))
    print("\n")
|
||
#%% | ||
# Each cell below fits one candidate classifier on the training split and
# prints its test-split scores; the string after each cell records the
# scores from a previous run.
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_ids, y_train_ids)
print_scores("Naive-Bayes", naive_bayes)
"""
Naive-Bayes model scores:
Accuracy: 0.970125786163522
f1-score: 0.9700389228748256
Precision: 0.971863757231902
Recall: 0.970125786163522
"""
#%%
bag_mod = BaggingClassifier(n_estimators=200)
bag_mod.fit(X_train_ids, y_train_ids)
print_scores("Bagging", bag_mod)
"""
Bagging model scores:
Accuracy: 0.9182389937106918
f1-score: 0.9172754850099429
Precision: 0.9235551618926304
Recall: 0.9182389937106918
"""
#%%
rf_mod = RandomForestClassifier(n_estimators=200)
rf_mod.fit(X_train_ids, y_train_ids)
print_scores("Random Forest", rf_mod)
"""
Random Forest model scores:
Accuracy: 0.940251572327044
f1-score: 0.9399414469372429
Precision: 0.9448486324921759
Recall: 0.940251572327044
"""
#%%
ada_mod = AdaBoostClassifier(n_estimators=300, learning_rate=0.2)
ada_mod.fit(X_train_ids, y_train_ids)
print_scores("AdaBoost", ada_mod)
"""
AdaBoost model scores:
Accuracy: 0.8244234800838575
f1-score: 0.8177240913789224
Precision: 0.8689477493622644
Recall: 0.8244234800838575
"""
#%%
# ``verbose`` is an estimator option set at construction time; ``fit`` does
# not accept it and would raise TypeError.
svm_mod = SVC(verbose=True)
svm_mod.fit(X_train_ids, y_train_ids)
print_scores("SVM Classifier", svm_mod)
"""
SVM Classifier model scores:
Accuracy: 0.899895178197065
f1-score: 0.8983032910057367
Precision: 0.9140863505429896
Recall: 0.899895178197065
"""
#%% | ||
def save_model(model):
    """Pickle the classifier, label encoder and vectorizer under ``models/``.

    Writes ``model`` to ``models/detector.sav``, the module-level label
    encoder ``le`` to ``models/map.sav`` and the module-level vectorizer
    ``cv`` to ``models/tokenizer.sav``.

    Args:
        model: The fitted classifier to persist.
    """
    artifacts = (
        ('models/detector.sav', model),
        ('models/map.sav', le),
        ('models/tokenizer.sav', cv),
    )
    for path, obj in artifacts:
        # ``with`` closes (and flushes) each file even if pickling fails.
        with open(path, 'wb') as fh:
            pickle.dump(obj, fh)


save_model(naive_bayes)
#%% | ||
def predict(text, model):
    """Clean ``text``, vectorise it and print the language ``model`` predicts.

    Uses the module-level vectorizer ``cv`` and label encoder ``le``.

    Args:
        text: Raw input sentence.
        model: A fitted classifier exposing ``predict``.
    """
    # ``clean`` takes no ``self``; call it on the class, not as a bound
    # method of an instance.
    text = Cleaner.clean(text)
    x = cv.transform([text]).toarray()
    lang = model.predict(x)
    lang = le.inverse_transform(lang)
    print("langauge:", lang[0])


text = "مين انت"
predict(text, naive_bayes)
|
||
|
||
# %% |
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
#%% | ||
import unicodedata | ||
from flask import Flask, request, jsonify | ||
from cleaner import Cleaner | ||
|
||
import pickle | ||
# from sklearn.preprocessing import LabelEncoder | ||
# from sklearn.feature_extraction.text import CountVectorizer | ||
# from sklearn.naive_bayes import MultinomialNB | ||
|
||
import warnings | ||
warnings.simplefilter("ignore") | ||
#%% | ||
def predict_model(text):
    """Return the language label the loaded model assigns to ``text``.

    Relies on the module-level ``cv`` (vectorizer), ``model`` (classifier)
    and ``le`` (label encoder).

    Args:
        text: Already-cleaned input string.

    Returns:
        The first (only) decoded label for the single input.
    """
    features = cv.transform([text]).toarray()
    encoded = model.predict(features)
    labels = le.inverse_transform(encoded)
    return labels[0]
|
||
|
||
app = Flask(__name__)

# Number of leading non-space characters inspected by the Arabic heuristic.
_ARABIC_SAMPLE_SIZE = 5


def _looks_arabic(text, sample_size=_ARABIC_SAMPLE_SIZE):
    """Return True when most of the leading characters of ``text`` are Arabic.

    Takes the first 10 characters of ``text``, drops spaces, keeps at most
    ``sample_size`` of what remains and checks whether more than half of
    those characters have a Unicode name whose first word is ``ARABIC``.
    An empty sample is never Arabic.
    """
    sample = text[:10].replace(" ", "")[:sample_size]
    if not sample:
        return False
    arabic_score = sum(
        1
        for char in sample
        # Unnamed characters (e.g. tab, newline) get "" instead of raising
        # ValueError.
        if unicodedata.name(char, "").partition(" ")[0] == "ARABIC"
    )
    # Compare against the characters actually inspected, not the longer
    # pre-truncation string, so a long all-Arabic prefix still qualifies.
    return arabic_score > len(sample) * 0.5


@app.route("/predict", methods=["POST"])
def predict():
    """Detect the language of the posted message.

    Expects a JSON body with ``"message"`` and ``"sender"`` keys. The message
    is cleaned, then labelled ``"Arabic"`` when the leading-character
    heuristic fires, otherwise by the loaded model. The label is translated
    through the module-level ``lang_map``.

    Returns:
        A JSON response ``{"sender": <sender>, "lang": <mapped code>}``.
    """
    payload = request.json
    text = Cleaner.clean(payload["message"])
    sender = payload["sender"]

    # The heuristic takes precedence over the model, so the model is only
    # consulted when the heuristic does not fire.
    lang = "Arabic" if _looks_arabic(text) else predict_model(text)

    # NOTE(review): a label missing from ``lang_map`` raises KeyError here;
    # confirm the map covers every label the model can emit.
    obj = {"sender": sender, "lang": lang_map[lang]}
    return jsonify(obj)
|
||
|
||
#%% | ||
if __name__ == "__main__":
    cleaner = Cleaner()

    # Load the artifacts the request handlers read as module globals.
    # NOTE: unpickling executes arbitrary code; only load files produced by
    # the project's own training script.
    with open("models/detector.sav", "rb") as fh:
        model = pickle.load(fh)
    with open("models/map.sav", "rb") as fh:
        le = pickle.load(fh)
    with open("models/tokenizer.sav", "rb") as fh:
        cv = pickle.load(fh)

    # Maps the label returned by the model to the short code sent to clients.
    # NOTE(review): keys presumably mirror the training labels verbatim
    # (including their spelling) — verify before "correcting" them.
    lang_map = {
        "English": "en",
        "Danish": "da",
        "French": "fr",
        "German": "ge",
        "Hindi": "hi",
        "Italian": "it",
        "Portugeese": "po",
        "Russian": "ru",
        "Spanish": "sp",
        "Sweedish": "sw",
        "Turkish": "tu",
        "Arabic": "ar",
    }

    # Listen on all interfaces; the port is passed as an integer.
    app.run(host="0.0.0.0", port=5005)
|
||
# %% |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
Flask==2.0.2 | ||
numpy==1.21.5 | ||
pandas==1.3.4 | ||
scikit_learn==1.0.2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
#%% | ||
import requests | ||
|
||
# Smoke test: post one sample message to the locally running service and
# print the raw response body.
host = 'http://localhost:5005/predict'
text = "اهلا"

# Bound the wait so the script fails fast when the service is unreachable;
# without a timeout the call can block indefinitely.
r = requests.post(host, json={'sender': 'amr', 'message': text}, timeout=10)

print(r.text)
# %% |