Skip to content

Commit

Permalink
initial version
Browse files Browse the repository at this point in the history
  • Loading branch information
Amr-YA committed Apr 5, 2022
1 parent 47b7706 commit 2432406
Show file tree
Hide file tree
Showing 15 changed files with 13,269 additions and 0 deletions.
25 changes: 25 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
**/__pycache__
**/.classpath
**/.dockerignore
**/.env
**/.git
**/.gitignore
**/.project
**/.settings
**/.toolstarget
**/.vs
**/.vscode
**/*.*proj.user
**/*.dbmdl
**/*.jfm
**/bin
**/charts
**/docker-compose*
**/compose*
**/Dockerfile*
**/node_modules
**/npm-debug.log
**/obj
**/secrets.dev.yaml
**/values.dev.yaml
README.md
19 changes: 19 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"configurations": [
{
"name": "Docker: Python - General",
"type": "docker",
"request": "launch",
"preLaunchTask": "docker-run: debug",
"python": {
"pathMappings": [
{
"localRoot": "${workspaceFolder}",
"remoteRoot": "/app"
}
],
"projectType": "general"
}
}
]
}
26 changes: 26 additions & 0 deletions .vscode/tasks.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"version": "2.0.0",
"tasks": [
{
"type": "docker-build",
"label": "docker-build",
"platform": "python",
"dockerBuild": {
"tag": "languagedetectorv02:latest",
"dockerfile": "${workspaceFolder}/Dockerfile",
"context": "${workspaceFolder}",
"pull": true
}
},
{
"type": "docker-run",
"label": "docker-run: debug",
"dependsOn": [
"docker-build"
],
"python": {
"file": "predict.py"
}
}
]
}
11 changes: 11 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Image for the language-detection Flask service started by predict.py.
FROM python:3.8
# Port that predict.py passes to app.run().
EXPOSE 5005

# Copy only the dependency list before installing, so the pip layers below can
# be reused from the build cache when just the application code changes.
COPY requirements.txt .
RUN python -m pip install --upgrade pip
RUN python -m pip install -r requirements.txt

# Copy the rest of the build context into /app and make it the working
# directory, so predict.py's relative "models/..." paths resolve there.
WORKDIR /app
COPY . /app

CMD ["python", "predict.py"]
11 changes: 11 additions & 0 deletions Dockerfile_up
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Variant of the main Dockerfile that starts from an existing image instead of
# the stock Python image; every other instruction is the same.
# NOTE(review): presumably `language_detector` is a previously built image of
# this project, making this a faster incremental rebuild - confirm.
FROM language_detector
# Port that predict.py passes to app.run().
EXPOSE 5005

# Re-install the requirements in case they changed since the base image was built.
COPY requirements.txt .
RUN python -m pip install --upgrade pip
RUN python -m pip install -r requirements.txt

# Overlay the current build context onto /app and run from there.
WORKDIR /app
COPY . /app

CMD ["python", "predict.py"]
Binary file added __pycache__/cleaner.cpython-39.pyc
Binary file not shown.
22 changes: 22 additions & 0 deletions cleaner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import re
import string

class Cleaner:
    """Normalise raw text before it is vectorised for language detection.

    All configuration lives in class attributes and ``clean`` is a static
    method, so it can be called on the class (``Cleaner.clean(text)``) or on
    an instance (``Cleaner().clean(text)``) with the same result.
    """

    # Extra punctuation seen in Arabic text, combined with ASCII punctuation.
    arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
    english_punctuations = string.punctuation
    punctuations_list = arabic_punctuations + english_punctuations
    # Translation table that deletes every character in punctuations_list.
    translator = str.maketrans('', '', punctuations_list)
    # Raw string so the regex escapes are not treated as (invalid) string escapes.
    url_pattern = re.compile(r'https?://\S+|www\.\S+')

    def __init__(self):
        pass

    @staticmethod
    def clean(sentance):
        """Return ``sentance`` lower-cased with URLs, punctuation and digits removed.

        Args:
            sentance: Raw input string.

        Returns:
            The cleaned string with leading/trailing whitespace stripped.
        """
        # Replace a literal "[]" pair with a space. This is what the previous
        # pattern r'[[]]' matched; it is spelled explicitly here to avoid the
        # "possible nested set" FutureWarning.
        text = re.sub(r'\[\]', ' ', sentance)
        text = text.lower()
        text = Cleaner.url_pattern.sub('', text)  # remove URLs
        text = text.translate(Cleaner.translator)  # remove punctuation
        text = re.sub('[0-9]', '', text)  # remove ASCII digits
        return text.strip()
12,908 changes: 12,908 additions & 0 deletions data/languages.csv

Large diffs are not rendered by default.

148 changes: 148 additions & 0 deletions detector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#%%
from tabnanny import verbose
import pandas as pd
import numpy as np
import re
import string

from cleaner import Cleaner

import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#%%
# Load the labelled corpus; the script uses its "Text" and "Language" columns.
data_set = "data/languages.csv"
data = pd.read_csv(data_set)
print(data.head(10))
print(data["Language"].value_counts())

#%%
# Normalise every sample with the shared Cleaner (the same function predict.py
# applies to incoming requests). `clean` takes only the text (no `self`), so it
# is referenced through the class rather than through an instance.
cleaner = Cleaner()
data['Text_clean'] = data['Text'].apply(Cleaner.clean)

#%%
# Construct X, y: hold out 10% of the rows for evaluation, stratified on the
# language label, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    data['Text_clean'],
    data["Language"],
    random_state=42,
    test_size=0.1,
    stratify=data["Language"].values,
)

#%%
# Bag-of-words features: fit the vocabulary on the training split only, then
# convert both splits to dense arrays.
cv = CountVectorizer()
X_train_ids = cv.fit_transform(X_train).toarray()
X_test_ids = cv.transform(X_test).toarray()

# Encode the language names as integer class ids.
le = LabelEncoder()
y_train_ids = le.fit_transform(y_train)
y_test_ids = le.transform(y_test)
#%%
print(X_train_ids.shape, y_train_ids.shape)
print(X_test_ids.shape, y_test_ids.shape)
#%%
def print_scores(name, model):
    """Print how ``model`` scores on the held-out test split.

    Predicts on the module-level ``X_test_ids`` and compares against
    ``y_test_ids``, printing accuracy followed by weighted F1, precision
    and recall.

    Args:
        name: Label used in the heading line.
        model: Fitted estimator exposing ``predict``.
    """
    predicted = model.predict(X_test_ids)
    expected = y_test_ids
    weighted = {"average": "weighted"}
    # (label, metric function, extra keyword arguments), in print order.
    report = (
        ("Accuracy: ", accuracy_score, {}),
        ("f1-score: ", f1_score, weighted),
        ("Precision: ", precision_score, weighted),
        ("Recall: ", recall_score, weighted),
    )
    print(f"{name} model scores:")
    for label, scorer, options in report:
        print(label, scorer(expected, predicted, **options))
    print("\n")

#%%
# Multinomial Naive Bayes on the bag-of-words counts. This is the model that
# is pickled by save_model() further down.
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_ids, y_train_ids)
print_scores("Naive-Bayes", naive_bayes)
# Scores noted in the original script:
#   Accuracy:  0.970125786163522
#   f1-score:  0.9700389228748256
#   Precision: 0.971863757231902
#   Recall:    0.970125786163522
#%%
bag_mod = BaggingClassifier(n_estimators=200)
bag_mod.fit(X_train_ids, y_train_ids)
print_scores("Bagging", bag_mod)
# Scores noted in the original script:
#   Accuracy:  0.9182389937106918
#   f1-score:  0.9172754850099429
#   Precision: 0.9235551618926304
#   Recall:    0.9182389937106918
#%%
rf_mod = RandomForestClassifier(n_estimators=200)
rf_mod.fit(X_train_ids, y_train_ids)
print_scores("Random Forest", rf_mod)
# Scores noted in the original script:
#   Accuracy:  0.940251572327044
#   f1-score:  0.9399414469372429
#   Precision: 0.9448486324921759
#   Recall:    0.940251572327044
#%%
ada_mod = AdaBoostClassifier(n_estimators=300, learning_rate=0.2)
ada_mod.fit(X_train_ids, y_train_ids)
print_scores("AdaBoost", ada_mod)
# Scores noted in the original script:
#   Accuracy:  0.8244234800838575
#   f1-score:  0.8177240913789224
#   Precision: 0.8689477493622644
#   Recall:    0.8244234800838575
#%%
# `verbose` is a constructor option of SVC; `fit` accepts only X, y and
# sample_weight, so passing `verbose` to `fit` raised TypeError.
svm_mod = SVC(verbose=True)
svm_mod.fit(X_train_ids, y_train_ids)
print_scores("SVM Classifier", svm_mod)
# Scores noted in the original script:
#   Accuracy:  0.899895178197065
#   f1-score:  0.8983032910057367
#   Precision: 0.9140863505429896
#   Recall:    0.899895178197065
#%%
def save_model(model):
    """Pickle the classifier together with the fitted encoder and vectorizer.

    Writes three files under ``models/`` (the same paths predict.py loads):
    ``detector.sav`` holds ``model``, ``map.sav`` holds the module-level
    label encoder ``le`` and ``tokenizer.sav`` holds the module-level
    vectorizer ``cv``.

    Args:
        model: Fitted estimator to persist.
    """
    artifacts = (
        ('models/detector.sav', model),
        ('models/map.sav', le),
        ('models/tokenizer.sav', cv),
    )
    for path, obj in artifacts:
        # Context manager guarantees the handle is flushed and closed even if
        # pickling fails part-way through.
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle)


save_model(naive_bayes)
#%%
def predict(text, model):
    """Print and return the language predicted for one raw string.

    The text is cleaned with ``Cleaner.clean``, vectorised with the
    module-level ``cv`` and the predicted class id is mapped back to a
    language name with the module-level ``le``.

    Args:
        text: Raw input string.
        model: Fitted estimator exposing ``predict``.

    Returns:
        The predicted language label.
    """
    # `clean` takes only the text (no `self`), so call it through the class.
    text = Cleaner.clean(text)
    x = cv.transform([text]).toarray()
    label = le.inverse_transform(model.predict(x))[0]
    print("language:", label)
    return label


text = "مين انت"
predict(text, naive_bayes)


# %%
Binary file added models/detector.sav
Binary file not shown.
Binary file added models/map.sav
Binary file not shown.
Binary file added models/tokenizer.sav
Binary file not shown.
85 changes: 85 additions & 0 deletions predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#%%
import unicodedata
from flask import Flask, request, jsonify
from cleaner import Cleaner

import pickle
# from sklearn.preprocessing import LabelEncoder
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.naive_bayes import MultinomialNB

import warnings
warnings.simplefilter("ignore")
#%%
def predict_model(text):
    """Return the language label the loaded classifier assigns to ``text``.

    Uses the module-level ``cv`` (vectorizer), ``model`` (classifier) and
    ``le`` (label encoder), which are loaded in the ``__main__`` block.

    Args:
        text: Already-cleaned input string.
    """
    features = cv.transform([text]).toarray()
    labels = le.inverse_transform(model.predict(features))
    return labels[0]


app = Flask(__name__)


def _looks_arabic(text, window=5):
    """Return True when most of the leading characters of ``text`` are Arabic.

    Spaces are dropped from the first 10 characters and up to ``window`` of
    the remaining characters are inspected. The score is compared against
    the number of characters actually inspected, so a long all-Arabic prefix
    is still recognised.
    """
    sample = text[:10].replace(" ", "")[:window]
    if not sample:
        return False
    arabic_score = sum(
        1
        for char in sample
        # The "" default keeps unnamed code points (e.g. control characters)
        # from raising ValueError.
        if unicodedata.name(char, "").startswith("ARABIC ")
    )
    return arabic_score > len(sample) * 0.5


@app.route("/predict", methods=["POST"])
def predict():
    """Detect the language of the ``message`` field of a JSON POST body.

    Request JSON: ``{"sender": <any>, "message": <str>}``.
    Response JSON: ``{"sender": <echoed sender>, "lang": <code>}``, where the
    code comes from ``lang_map``. Labels missing from ``lang_map`` are
    returned unchanged instead of failing the request. A body that is not a
    JSON object with a string ``message`` gets a 400 response.
    """
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or not isinstance(payload.get("message"), str):
        return jsonify({"error": "JSON body with a string 'message' field is required"}), 400

    sender = payload.get("sender")
    text = Cleaner.clean(payload["message"])

    # Script-based override: text that starts with mostly Arabic characters is
    # reported as Arabic without consulting the model.
    if _looks_arabic(text):
        lang = "Arabic"
    else:
        lang = predict_model(text)

    obj = {"sender": sender, "lang": lang_map.get(lang, lang)}
    return jsonify(obj)

"""
if len(text)<8 :
text2 = text.replace(" ","")
score = 0
for char in text2:
if unicodedata.name(char).split()[0] == "ARABIC":
score +=1
if score > len(text2)*0.5:
return("Arabic")
# else:
# return(predict_model(text))
# else:
# return(predict_model(text))
return(predict_model(text))
"""


#%%
if __name__ == "__main__":
    cleaner = Cleaner()

    # Load the artifacts written to these paths by detector.py's save_model().
    # Unpickling can execute arbitrary code, so these files must come from a
    # trusted source.
    with open("models/detector.sav", "rb") as model_file:
        model = pickle.load(model_file)
    with open("models/map.sav", "rb") as encoder_file:
        le = pickle.load(encoder_file)
    with open("models/tokenizer.sav", "rb") as vectorizer_file:
        cv = pickle.load(vectorizer_file)

    # Maps a predicted label to the short code returned by the API.
    # NOTE(review): the "Portugeese" / "Sweedish" spellings presumably mirror
    # the labels in data/languages.csv - verify before "correcting" them.
    # NOTE(review): several codes are not ISO 639-1 (German "de", Portuguese
    # "pt", Spanish "es", Swedish "sv", Turkish "tr"); left unchanged here
    # because API clients may depend on the current values.
    lang_map = {
        "English": "en",
        "Danish": "da",
        "French": "fr",
        "German": "ge",
        "Hindi": "hi",
        "Italian": "it",
        "Portugeese": "po",
        "Russian": "ru",
        "Spanish": "sp",
        "Sweedish": "sw",
        "Turkish": "tu",
        "Arabic": "ar",
    }

    # Listen on all interfaces; the port matches EXPOSE in the Dockerfile.
    app.run(host="0.0.0.0", port=5005)

# %%
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Flask==2.0.2
numpy==1.21.5
pandas==1.3.4
scikit_learn==1.0.2
10 changes: 10 additions & 0 deletions test_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#%%
import requests

# Manual smoke test: POST one Arabic sample to a locally running service and
# print the raw response body.
host = 'http://localhost:5005/predict'
text = "اهلا"

payload = {'sender': 'amr', 'message': text}
r = requests.post(host, json=payload)

print(r.text)
# %%

0 comments on commit 2432406

Please sign in to comment.