initial version

Amr-YA · Apr 5, 2022 · 2432406 · 2432406
1 parent 47b7706
commit 2432406
Show file tree

Hide file tree

Showing 15 changed files with 13,269 additions and 0 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,25 @@
+**/__pycache__
+**/.classpath
+**/.dockerignore
+**/.env
+**/.git
+**/.gitignore
+**/.project
+**/.settings
+**/.toolstarget
+**/.vs
+**/.vscode
+**/*.*proj.user
+**/*.dbmdl
+**/*.jfm
+**/bin
+**/charts
+**/docker-compose*
+**/compose*
+**/Dockerfile*
+**/node_modules
+**/npm-debug.log
+**/obj
+**/secrets.dev.yaml
+**/values.dev.yaml
+README.md
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -0,0 +1,19 @@
+{
+    "configurations": [
+        {
+            "name": "Docker: Python - General",
+            "type": "docker",
+            "request": "launch",
+            "preLaunchTask": "docker-run: debug",
+            "python": {
+                "pathMappings": [
+                    {
+                        "localRoot": "${workspaceFolder}",
+                        "remoteRoot": "/app"
+                    }
+                ],
+                "projectType": "general"
+            }
+        }
+    ]
+}
diff --git a/.vscode/tasks.json b/.vscode/tasks.json
@@ -0,0 +1,26 @@
+{
+	"version": "2.0.0",
+	"tasks": [
+		{
+			"type": "docker-build",
+			"label": "docker-build",
+			"platform": "python",
+			"dockerBuild": {
+				"tag": "languagedetectorv02:latest",
+				"dockerfile": "${workspaceFolder}/Dockerfile",
+				"context": "${workspaceFolder}",
+				"pull": true
+			}
+		},
+		{
+			"type": "docker-run",
+			"label": "docker-run: debug",
+			"dependsOn": [
+				"docker-build"
+			],
+			"python": {
+				"file": "predict.py"
+			}
+		}
+	]
+}
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,11 @@
+# Image that runs predict.py on the stock Python 3.8 base image.
+FROM python:3.8
+# predict.py starts its Flask server on port 5005.
+EXPOSE 5005
+
+# requirements.txt is copied and installed on its own, before the rest of the
+# source is added; presumably so the dependency layers can be reused when only
+# application code changes.
+COPY requirements.txt .
+RUN python -m pip install --upgrade pip
+RUN python -m pip install -r requirements.txt
+
+# Application code lives in /app, which is also the working directory for CMD.
+WORKDIR /app
+COPY . /app
+
+CMD ["python", "predict.py"]
diff --git a/Dockerfile_up b/Dockerfile_up
@@ -0,0 +1,11 @@
+# Same build steps as Dockerfile, but layered on an existing `language_detector`
+# image instead of python:3.8.
+# NOTE(review): presumably meant for quick rebuilds on top of a previously
+# built image - confirm where `language_detector` is produced.
+FROM language_detector
+# predict.py starts its Flask server on port 5005.
+EXPOSE 5005
+
+# Dependencies are (re)installed from requirements.txt before the source copy.
+COPY requirements.txt .
+RUN python -m pip install --upgrade pip
+RUN python -m pip install -r requirements.txt
+
+# Application code lives in /app, which is also the working directory for CMD.
+WORKDIR /app
+COPY . /app
+
+CMD ["python", "predict.py"]
diff --git a/__pycache__/cleaner.cpython-39.pyc b/__pycache__/cleaner.cpython-39.pyc
diff --git a/cleaner.py b/cleaner.py
@@ -0,0 +1,22 @@
+import re
+import string
+
+class Cleaner:
+    """Text normaliser shared by the training script and the prediction API.
+
+    ``clean`` is a static method, so both ``Cleaner.clean(text)`` and
+    ``Cleaner().clean(text)`` work.
+    """
+
+    # Arabic / typographic punctuation that string.punctuation does not cover.
+    arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
+    english_punctuations = string.punctuation
+    punctuations_list = arabic_punctuations + english_punctuations
+    # Translation table that deletes every punctuation character in one pass.
+    translator = str.maketrans('', '', punctuations_list)
+    # Raw string: "\S" and "\." are regex escapes, not Python string escapes.
+    url_pattern = re.compile(r'https?://\S+|www\.\S+')
+    # Matches the literal two-character sequence "[]". This is what the former
+    # pattern r'[[]]' matched, written without the ambiguous nested-set syntax
+    # that makes `re` emit a FutureWarning.
+    _empty_brackets_pattern = re.compile(r'\[\]')
+    _digit_pattern = re.compile('[0-9]')
+
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def clean(sentance):
+        """Return a normalised copy of ``sentance``.
+
+        Steps, in order: replace each literal "[]" with a space, lower-case,
+        drop URLs, delete punctuation, delete ASCII digits, strip surrounding
+        whitespace.
+
+        Args:
+            sentance: the raw input string.
+
+        Returns:
+            The cleaned string (possibly empty).
+        """
+        text = Cleaner._empty_brackets_pattern.sub(' ', sentance)
+        text = text.lower()
+        text = Cleaner.url_pattern.sub('', text)  # remove URLs
+        text = text.translate(Cleaner.translator)  # remove punctuation
+        text = Cleaner._digit_pattern.sub('', text)
+        return text.strip()
diff --git a/data/languages.csv b/data/languages.csv
diff --git a/detector.py b/detector.py
@@ -0,0 +1,148 @@
+#%%
+from tabnanny import verbose
+import pandas as pd
+import numpy as np
+import re
+import string
+
+from cleaner import Cleaner
+
+import pickle
+from sklearn.preprocessing import LabelEncoder
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
+from sklearn.svm import SVC
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+
+#%%
+# Load the labelled corpus; the code below reads its "Text" and "Language" columns.
+data_set = "data/languages.csv"
+data = pd.read_csv(data_set)
+print(data.head(10))
+print(data["Language"].value_counts())
+
+#%%
+# Normalise every sample with the shared Cleaner.
+# `clean` is called through the class rather than through the `cleaner`
+# instance: it is declared with a single parameter, so a bound-method call
+# would pass the instance as an extra argument and raise TypeError.
+cleaner = Cleaner()  # kept: later cells reference `cleaner`
+data['Text_clean'] = data['Text'].apply(Cleaner.clean)
+
+#%%
+# Hold out 10% of the cleaned samples for evaluation, stratified by language
+# so every class keeps its proportion in both splits.
+X_train, X_test, y_train, y_test = train_test_split(
+    data['Text_clean'],
+    data["Language"],
+    test_size=0.1,
+    random_state=42,
+    stratify=data["Language"].values,
+)
+
+#%%
+# Token-count features: the vocabulary is learned from the training split only
+# and then applied to both splits. Matrices are converted to dense arrays.
+cv = CountVectorizer()
+cv.fit(X_train)
+X_train_ids = cv.transform(X_train).toarray()
+X_test_ids = cv.transform(X_test).toarray()
+
+# Integer-encode the language names, again fitted on the training labels.
+le = LabelEncoder()
+le.fit(y_train)
+y_train_ids = le.transform(y_train)
+y_test_ids = le.transform(y_test)
+#%%
+# Shape check of features vs. labels for each split.
+print(f"{X_train_ids.shape} {y_train_ids.shape}")
+print(f"{X_test_ids.shape} {y_test_ids.shape}")
+#%%
+def print_scores(name, model):
+    """Print accuracy and weighted F1 / precision / recall of ``model``.
+
+    The model is evaluated on the module-level held-out split
+    (``X_test_ids`` / ``y_test_ids``).
+
+    Args:
+        name: label printed in the heading line.
+        model: a fitted estimator exposing ``predict``.
+    """
+    expected = y_test_ids
+    predicted = model.predict(X_test_ids)
+
+    print(f"{name} model scores:")
+    print("Accuracy: ", accuracy_score(expected, predicted))
+    # The remaining metrics all use the same weighted averaging over classes.
+    weighted_metrics = (
+        ("f1-score: ", f1_score),
+        ("Precision: ", precision_score),
+        ("Recall: ", recall_score),
+    )
+    for heading, metric in weighted_metrics:
+        print(heading, metric(expected, predicted, average='weighted'))
+    print("\n")
+
+#%%
+# Multinomial naive Bayes fitted on the token-count matrix. This is the model
+# that is later passed to save_model().
+# The string literal after each cell appears to record the scores printed by
+# an earlier run; it has no effect at runtime.
+naive_bayes = MultinomialNB()
+naive_bayes.fit(X_train_ids, y_train_ids)
+print_scores("Naive-Bayes", naive_bayes)
+"""
+Naive-Bayes model scores:
+Accuracy:  0.970125786163522
+f1-score:  0.9700389228748256
+Precision:  0.971863757231902
+Recall:  0.970125786163522
+"""
+#%%
+# Bagging ensemble with 200 estimators. No random_state is passed, so scores
+# may differ between runs.
+bag_mod = BaggingClassifier(n_estimators=200)
+bag_mod.fit(X_train_ids, y_train_ids)
+print_scores("Bagging", bag_mod)
+"""
+Bagging model scores:
+Accuracy:  0.9182389937106918
+f1-score:  0.9172754850099429
+Precision:  0.9235551618926304
+Recall:  0.9182389937106918
+"""
+#%%
+# Random forest with 200 trees (also unseeded).
+rf_mod = RandomForestClassifier(n_estimators=200)
+rf_mod.fit(X_train_ids, y_train_ids)
+print_scores("Random Forest", rf_mod)
+"""
+Random Forest model scores:
+Accuracy:  0.940251572327044
+f1-score:  0.9399414469372429
+Precision:  0.9448486324921759
+Recall:  0.940251572327044
+"""
+#%%
+# AdaBoost with 300 estimators and a learning rate of 0.2 (also unseeded).
+ada_mod = AdaBoostClassifier(n_estimators=300, learning_rate=0.2)
+ada_mod.fit(X_train_ids, y_train_ids)
+print_scores("AdaBoost", ada_mod)
+"""
+AdaBoost model scores:
+Accuracy:  0.8244234800838575
+f1-score:  0.8177240913789224
+Precision:  0.8689477493622644
+Recall:  0.8244234800838575
+"""
+#%%
+# Support-vector classifier with default hyper-parameters.
+# Verbosity is an SVC constructor option; fit() takes no `verbose` keyword, so
+# the former `fit(..., verbose=1)` call raised TypeError before training began.
+svm_mod = SVC(verbose=True)
+svm_mod.fit(X_train_ids, y_train_ids)
+print_scores("SVM Classifier", svm_mod)
+"""
+SVM Classifier model scores:
+Accuracy:  0.899895178197065
+f1-score:  0.8983032910057367
+Precision:  0.9140863505429896
+Recall:  0.899895178197065
+"""
+#%%
+def save_model(model):
+    """Pickle the classifier together with the fitted encoder and vectorizer.
+
+    Writes three files under ``models/``: ``detector.sav`` (``model``),
+    ``map.sav`` (the module-level LabelEncoder ``le``) and ``tokenizer.sav``
+    (the module-level CountVectorizer ``cv``).
+
+    Args:
+        model: the fitted estimator to persist.
+    """
+    artifacts = (
+        ('models/detector.sav', model),
+        ('models/map.sav', le),
+        ('models/tokenizer.sav', cv),
+    )
+    for path, obj in artifacts:
+        # Context manager: the handle is flushed and closed even if dump() fails.
+        with open(path, 'wb') as fh:
+            pickle.dump(obj, fh)
+
+save_model(naive_bayes)
+#%%
+def predict(text, model):
+    """Print the language ``model`` predicts for one raw string.
+
+    Args:
+        text: raw, uncleaned input text.
+        model: a fitted estimator trained on the ``cv`` features.
+    """
+    # Call `clean` through the class: it is declared with a single parameter,
+    # so `cleaner.clean(text)` on an instance would raise TypeError.
+    text = Cleaner.clean(text)
+    x = cv.transform([text]).toarray()
+    lang = le.inverse_transform(model.predict(x))
+    print("langauge:", lang[0])
+
+text = "مين انت"
+predict(text, naive_bayes)
+
+
+# %%
diff --git a/models/detector.sav b/models/detector.sav
diff --git a/models/map.sav b/models/map.sav
diff --git a/models/tokenizer.sav b/models/tokenizer.sav
diff --git a/predict.py b/predict.py
@@ -0,0 +1,85 @@
+#%%
+import unicodedata
+from flask import Flask, request, jsonify
+from cleaner import Cleaner
+
+import pickle
+# from sklearn.preprocessing import LabelEncoder
+# from sklearn.feature_extraction.text import CountVectorizer
+# from sklearn.naive_bayes import MultinomialNB
+
+import warnings
+warnings.simplefilter("ignore")
+#%%
+def predict_model(text):
+    """Return the language label predicted for an already-cleaned string.
+
+    Relies on the module-level ``cv`` (vectorizer), ``model`` (classifier) and
+    ``le`` (label encoder) loaded in the ``__main__`` block.
+    """
+    features = cv.transform([text]).toarray()
+    encoded = model.predict(features)
+    labels = le.inverse_transform(encoded)
+    return labels[0]
+
+
+app = Flask(__name__)
+def _is_mostly_arabic(text, sample_size=5):
+    """Return True when most of the leading characters of ``text`` are Arabic.
+
+    Looks at the first 10 characters with spaces removed, keeps at most
+    ``sample_size`` of them, and reports whether more than half of that sample
+    has a Unicode name starting with the word "ARABIC".
+    """
+    sample = text[:10].replace(" ", "")[:sample_size]
+    # name(char, "") avoids the ValueError raised for unnamed characters
+    # (e.g. control characters embedded in the message).
+    arabic_score = sum(
+        1 for char in sample
+        if unicodedata.name(char, "").split(" ", 1)[0] == "ARABIC"
+    )
+    # Compare against the characters actually inspected. The score used to be
+    # counted over 5 characters but compared with half of up to 10, so the
+    # override could never trigger for longer inputs.
+    return arabic_score > len(sample) * 0.5
+
+
+@app.route("/predict", methods=["POST"])
+def predict():
+    """Detect the language of the posted message.
+
+    Expects a JSON body with a string ``message`` and, optionally, ``sender``.
+    Responds with ``{"sender": ..., "lang": ...}`` where ``lang`` is the
+    ``lang_map`` code of the detected language, or the raw model label when the
+    label has no entry in ``lang_map``. Returns HTTP 400 when the body is not
+    JSON or ``message`` is missing or not a string.
+    """
+    payload = request.get_json(silent=True)
+    if not isinstance(payload, dict) or not isinstance(payload.get("message"), str):
+        return jsonify({"error": "JSON body with a string 'message' field is required"}), 400
+
+    sender = payload.get("sender")
+    text = Cleaner.clean(payload["message"])
+
+    # A leading run of Arabic script overrides the classifier, so the model is
+    # only consulted when the script heuristic does not decide.
+    if _is_mostly_arabic(text):
+        lang = "Arabic"
+    else:
+        lang = predict_model(text)
+
+    # .get(): the classifier may emit a label that lang_map does not list;
+    # fall back to the label itself instead of failing with KeyError.
+    obj = {"sender": sender, "lang": lang_map.get(lang, lang)}
+    return jsonify(obj)
+
+
+#%%
+if __name__ == "__main__":
+    # These names are bound only when the file is run as a script; the request
+    # handlers above read them as module globals.
+    cleaner = Cleaner()
+
+    # NOTE: unpickling executes arbitrary code - only load artifacts produced
+    # by this project's own training script.
+    with open("models/detector.sav", "rb") as fh:
+        model = pickle.load(fh)
+    with open("models/map.sav", "rb") as fh:
+        le = pickle.load(fh)
+    with open("models/tokenizer.sav", "rb") as fh:
+        cv = pickle.load(fh)
+
+    # Model label -> code returned to API clients.
+    # NOTE(review): "ge", "po", "sp", "sw" and "tu" are not the ISO 639-1 codes
+    # (de, pt, es, sv, tr). Left unchanged because clients may already depend
+    # on them - confirm before switching.
+    lang_map = {
+                "English": "en",
+                "Danish": "da",
+                "French": "fr",
+                "German": "ge",
+                "Hindi": "hi",
+                "Italian": "it",
+                "Portugeese": "po",
+                "Russian": "ru",
+                "Spanish": "sp",
+                "Sweedish": "sw",
+                "Turkish": "tu",
+                "Arabic": "ar"
+                }
+
+    # Listen on all interfaces; the port matches EXPOSE in the Dockerfile.
+    app.run(host="0.0.0.0", port=5005)
+
+# %%
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,4 @@
+Flask==2.0.2
+numpy==1.21.5
+pandas==1.3.4
+scikit_learn==1.0.2
diff --git a/test_api.py b/test_api.py
@@ -0,0 +1,10 @@
+#%%
+import requests
+
+# Manual smoke test: post one Arabic greeting to a locally running API and
+# print the raw response body.
+host = 'http://localhost:5005/predict'
+text = "اهلا"
+
+# requests has no default timeout; without one the script hangs indefinitely
+# if the server accepts the connection but never answers.
+r = requests.post(host, json={'sender': 'amr', 'message': text}, timeout=10)
+
+print(r.text)
+# %%