From abc0ae011b8c094978b2a6f0b2e22d78e9010ff6 Mon Sep 17 00:00:00 2001 From: "deshpande.jaidev@gmail.com" Date: Wed, 11 May 2022 13:15:13 +0530 Subject: [PATCH 1/8] ENH: Sentiment analysis with Huggingface transformers --- gramex/handlers/mlhandler.py | 13 ++++++- gramex/ml_api.py | 40 +++++++++++++++++++- gramex/transformers.py | 72 ++++++++++++++++++++++++++++++++++++ 3 files changed, 121 insertions(+), 4 deletions(-) create mode 100644 gramex/transformers.py diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index f76bf96f0..5b464ed23 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -96,7 +96,11 @@ def setup(cls, data=None, model={}, config_dir='', template=DEFAULT_TEMPLATE, ** cls.store.dump('class', mclass) cls.store.dump('params', model_params) if op.exists(cls.store.model_path): # If the pkl exists, load it - cls.model = get_model(cls.store.model_path, {}) + if op.isdir(cls.store.model_path): + mclass, wrapper = ml.search_modelclass(mclass) + cls.model = locate(wrapper).from_disk(mclass, cls.store.model_path) + else: + cls.model = get_model(cls.store.model_path, {}) elif data is not None: data = cls._filtercols(data) data = cls._filterrows(data) @@ -180,12 +184,12 @@ def _transform(self, data, **kwargs): return data def _predict(self, data=None, score_col=''): + self._check_model_path() metric = self.get_argument('_metric', False) if metric: scorer = get_scorer(metric) if data is None: data = self._parse_data(False) - self.model = get_model(self.store.model_path, {}) data = self._transform(data, drop_duplicates=False) try: target = data.pop(score_col) @@ -206,6 +210,11 @@ def _check_model_path(self): self.model = get_model(self.store.model_path, {}) except FileNotFoundError: raise HTTPError(NOT_FOUND, f'No model found at {self.store.model_path}') + except IsADirectoryError: + if not hasattr(self, "model"): + mclass = self.store.load('class') + mclass, wrapper = ml.search_modelclass(mclass) + self.model = locate(wrapper).from_disk(mclass, self.store.model_path) @coroutine def prepare(self): diff --git a/gramex/ml_api.py b/gramex/ml_api.py index b149680b6..4913838ab 100644 --- a/gramex/ml_api.py +++ b/gramex/ml_api.py @@ -43,8 +43,9 @@ ], "gramex.sm_api.StatsModel": [ "statsmodels.tsa.api", - "statsmodels.tsa.statespace.sarimax" + "statsmodels.tsa.statespace.sarimax", ], + "gramex.ml_api.HFTransformer": ["gramex.transformers"], } @@ -338,7 +339,9 @@ def _predict(self, X, **kwargs): try: y = self.model.predict(X, **kwargs) except RuntimeError: - y = self.model.predict(X[self.model['transform']._feature_names_in], **kwargs) + y = self.model.predict( + X[self.model["transform"]._feature_names_in], **kwargs + ) return y def predict( @@ -382,3 +385,36 @@ class SklearnTransformer(SklearnModel): def _predict(self, X, **kwargs): """Sklearn transformers don't have a "predict", they have a "transform".""" return self.model.transform(X, **kwargs) + + +class HFTransformer(SklearnModel): + def __init__(self, model, params=None, data=None, **kwargs): + self.model = model + if params is None: + params = {"text_col": "text", "target_col": "label"} + self.params = params + self.kwargs = kwargs + + @classmethod + def from_disk(cls, klass, path): + model = op.join(path, "model") + tokenizer = op.join(path, "tokenizer") + # lenc = op.join(path, "lenc.pkl") + return cls(klass(model, tokenizer)) # , lenc=lenc)) + + def fit( + self, + X: Union[pd.DataFrame, np.ndarray], + y: Union[pd.Series, np.ndarray], + model_path: str = "", + name: str = "", + 
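+        # Remaining keyword arguments are forwarded to the wrapped transformer's fit().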
**kwargs, + ): + text = X.squeeze("columns") + self.model.fit(text, y, model_path, **kwargs) + + def _predict( + self, X: Union[pd.DataFrame, np.ndarray], target_col: str = "", **kwargs + ): + text = X["text"] + return self.model.predict(text) diff --git a/gramex/transformers.py b/gramex/transformers.py new file mode 100644 index 000000000..32cf67345 --- /dev/null +++ b/gramex/transformers.py @@ -0,0 +1,72 @@ +import os.path as op + +import pandas as pd +import transformers as trf + +from datasets import Dataset +from gramex.config import app_log +from sklearn.metrics import roc_auc_score + + +DEFAULT_MODEL = DEFAULT_TOKENIZER = "distilbert-base-uncased-finetuned-sst-2-english" + + +def load_pretrained(klass, path, default, **kwargs): + if op.isdir(path): + try: + app_log.info(f"Attempting to load {klass.__name__} from {path}") + model = klass.from_pretrained(path, **kwargs) + except: # NOQA: E722 + app_log.info(f"Falling back to default {klass.__name__}: {default}.") + model = klass.from_pretrained(default, **kwargs) + else: + model = klass.from_pretrained(default, **kwargs) + return model + + +class SentimentAnalysis(object): + task = "sentiment-analysis" + + def __init__(self, model=DEFAULT_MODEL, tokenizer=DEFAULT_TOKENIZER, **kwargs): + self._model = model + self._tokenizer = tokenizer + self.model = load_pretrained( + trf.AutoModelForSequenceClassification, model, DEFAULT_MODEL + ) + self.tokenizer = load_pretrained( + trf.AutoTokenizer, tokenizer, DEFAULT_TOKENIZER + ) + self.pipeline = trf.pipeline(self.task, model=model, tokenizer=tokenizer) + + def fit(self, text, labels, model_path, **kwargs): + if pd.api.types.is_object_dtype(labels): + labels = labels.map(self.model.config.label2id.get) + ds = Dataset.from_dict({"text": text, "label": labels}) + tokenized = ds.map( + lambda x: self.tokenizer(x["text"], padding="max_length", truncation=True), + batched=True, + ) + train_args = trf.TrainingArguments(save_strategy="no", output_dir=model_path) + trainer = trf.Trainer( + model=self.model, train_dataset=tokenized, args=train_args + ) + trainer.train() + self.model.to('cpu') + self.model.save_pretrained(op.join(model_path, "model")) + self.tokenizer.save_pretrained(op.join(model_path, "tokenizer")) + self.pipeline = trf.pipeline( + self.task, + model=self.model, + tokenizer=self.tokenizer, + ) + + def predict(self, text, **kwargs): + text = text.tolist() + predictions = self.pipeline(text) + return [k["label"] for k in predictions] + + def score(self, X, y_true, **kwargs): + y_true = [self.model.config.label2id[x] for x in y_true] + y_pred = self.predict(X.squeeze("columns")) + y_pred = [self.model.config.label2id[x] for x in y_pred] + return roc_auc_score(y_true, y_pred) From 6c4d209f2db7cb4c4a5e88c46eee290299893c58 Mon Sep 17 00:00:00 2001 From: "deshpande.jaidev@gmail.com" Date: Mon, 16 May 2022 13:29:06 +0530 Subject: [PATCH 2/8] FIX: Models should deserialize themselves Sklearn models are saved as files, transformers are saved in directories. 
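Each wrapper now exposes a from_disk classmethod that knows how to load its own artifact.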
Earlier, the handler was doing disk I/O - this commit ensures that the wrapper classes perform the I/O for the models --- gramex/handlers/mlhandler.py | 8 ++------ gramex/ml_api.py | 14 +++++++++++--- gramex/sm_api.py | 8 ++++++++ gramex/transformers.py | 20 +++++++++++++------- 4 files changed, 34 insertions(+), 16 deletions(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index 5b464ed23..b1a9e662b 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -207,14 +207,10 @@ def _predict(self, data=None, score_col=''): def _check_model_path(self): try: - self.model = get_model(self.store.model_path, {}) + klass, wrapper = ml.search_modelclass(self.store.load('class')) + self.model = locate(wrapper).from_disk(self.store.model_path, klass=klass) except FileNotFoundError: raise HTTPError(NOT_FOUND, f'No model found at {self.store.model_path}') - except IsADirectoryError: - if not hasattr(self, "model"): - mclass = self.store.load('class') - mclass, wrapper = ml.search_modelclass(mclass) - self.model = locate(wrapper).from_disk(mclass, self.store.model_path) @coroutine def prepare(self): diff --git a/gramex/ml_api.py b/gramex/ml_api.py index 4913838ab..98d3380b0 100644 --- a/gramex/ml_api.py +++ b/gramex/ml_api.py @@ -278,6 +278,15 @@ def get_attributes(self, *args, **kwargs) -> dict: class SklearnModel(AbstractModel): """SklearnModel.""" + @classmethod + def from_disk(cls, path, **kwargs): + model = cache.open(path, joblib.load) + if isinstance(model, Pipeline): + _, wrapper = search_modelclass(model[-1].__class__.__name__) + else: + _, wrapper = search_modelclass(model.__class__.__name__) + return cls(model, params={}) + def __init__( self, model: Any, @@ -396,11 +405,10 @@ def __init__(self, model, params=None, data=None, **kwargs): self.kwargs = kwargs @classmethod - def from_disk(cls, klass, path): + def from_disk(cls, path, klass): model = op.join(path, "model") tokenizer = op.join(path, "tokenizer") - # lenc = op.join(path, "lenc.pkl") - return cls(klass(model, tokenizer)) # , lenc=lenc)) + return cls(klass(model, tokenizer)) def fit( self, diff --git a/gramex/sm_api.py b/gramex/sm_api.py index 8eb4f1e01..d838210f5 100644 --- a/gramex/sm_api.py +++ b/gramex/sm_api.py @@ -1,6 +1,8 @@ import pandas as pd import numpy as np +import joblib from gramex.config import app_log +from gramex import cache from statsmodels import api as sm from statsmodels.tsa.statespace.sarimax import SARIMAXResultsWrapper from sklearn.metrics import mean_absolute_error @@ -8,6 +10,12 @@ class StatsModel(AbstractModel): + + @classmethod + def from_disk(cls, path, **kwargs): + model = cache.open(path, joblib.load) + return cls(model, params={}) + def __init__(self, mclass, params, **kwargs): self.stl_kwargs = kwargs.pop("stl", False) if isinstance(mclass, SARIMAXResultsWrapper): diff --git a/gramex/transformers.py b/gramex/transformers.py index 32cf67345..12514dee8 100644 --- a/gramex/transformers.py +++ b/gramex/transformers.py @@ -5,6 +5,7 @@ from datasets import Dataset from gramex.config import app_log +from gramex import cache from sklearn.metrics import roc_auc_score @@ -15,18 +16,17 @@ def load_pretrained(klass, path, default, **kwargs): if op.isdir(path): try: app_log.info(f"Attempting to load {klass.__name__} from {path}") - model = klass.from_pretrained(path, **kwargs) + model = cache.open(path, klass.from_pretrained, **kwargs) except: # NOQA: E722 app_log.info(f"Falling back to default {klass.__name__}: {default}.") - model = 
klass.from_pretrained(default, **kwargs) + model = cache.open(default, klass.from_pretrained, **kwargs) else: + app_log.info(f"{path} not found on disk; loading default...") model = klass.from_pretrained(default, **kwargs) return model -class SentimentAnalysis(object): - task = "sentiment-analysis" - +class BaseTransformer(object): def __init__(self, model=DEFAULT_MODEL, tokenizer=DEFAULT_TOKENIZER, **kwargs): self._model = model self._tokenizer = tokenizer @@ -36,7 +36,13 @@ def __init__(self, model=DEFAULT_MODEL, tokenizer=DEFAULT_TOKENIZER, **kwargs): self.tokenizer = load_pretrained( trf.AutoTokenizer, tokenizer, DEFAULT_TOKENIZER ) - self.pipeline = trf.pipeline(self.task, model=model, tokenizer=tokenizer) + self.pipeline = trf.pipeline( + self.task, model=self.model, tokenizer=self.tokenizer + ) + + +class SentimentAnalysis(BaseTransformer): + task = "sentiment-analysis" def fit(self, text, labels, model_path, **kwargs): if pd.api.types.is_object_dtype(labels): @@ -51,7 +57,7 @@ def fit(self, text, labels, model_path, **kwargs): model=self.model, train_dataset=tokenized, args=train_args ) trainer.train() - self.model.to('cpu') + self.model.to("cpu") self.model.save_pretrained(op.join(model_path, "model")) self.tokenizer.save_pretrained(op.join(model_path, "tokenizer")) self.pipeline = trf.pipeline( From c4a121445411098768b416337e5ccc750403ec43 Mon Sep 17 00:00:00 2001 From: "deshpande.jaidev@gmail.com" Date: Mon, 16 May 2022 13:30:26 +0530 Subject: [PATCH 3/8] TST: Ensure basic trainability of sentiment analysis --- tests/gramex.yaml | 8 + tests/test_mlhandler.py | 728 +++++++++++++++++++++++++--------------- 2 files changed, 461 insertions(+), 275 deletions(-) diff --git a/tests/gramex.yaml b/tests/gramex.yaml index be7c65366..09285dc1f 100644 --- a/tests/gramex.yaml +++ b/tests/gramex.yaml @@ -1252,6 +1252,14 @@ url: order: [7, 1, 0] xsrf_cookies: false + mlhandler/huggingface/sentiment: + pattern: /sentiment + handler: MLHandler + kwargs: + model: + class: SentimentAnalysis + xsrf_cookies: false + capture: pattern: /capture handler: CaptureHandler diff --git a/tests/test_mlhandler.py b/tests/test_mlhandler.py index 03e0d57e3..3851f4deb 100644 --- a/tests/test_mlhandler.py +++ b/tests/test_mlhandler.py @@ -1,8 +1,11 @@ from io import BytesIO, StringIO import os from unittest import skipUnless +import logging +import warnings import joblib +import shutil import gramex from gramex.http import OK, NOT_FOUND import numpy as np @@ -17,62 +20,112 @@ from sklearn.tree import DecisionTreeClassifier from . 
import TestGramex, folder, tempfiles + try: from statsmodels.datasets.interest_inflation import load as infl_load + STATSMODELS_INSTALLED = True except ImportError: STATSMODELS_INSTALLED = False +try: + logging.getLogger("tensorflow").disabled = True + import transformers as trf # NOQA: F401 + + TRANSFORMERS_INSTALLED = True +except ImportError: + TRANSFORMERS_INSTALLED = False op = os.path +@skipUnless(TRANSFORMERS_INSTALLED, "Please install transformers to run these tests.") +class TestTransformers(TestGramex): + @classmethod + def tearDownClass(cls): + apps = ["mlhandler-huggingface-sentiment", "mlhanlder-huggingface-ner"] + paths = [ + op.join(gramex.config.variables["GRAMEXDATA"], "apps", "mlhandler", p) + for p in apps + ] + for path in paths: + if op.isdir(path): + shutil.rmtree(path) + + def test_blank_predictions(self): + """Ensure that the default model predicts something.""" + resp = self.get("/sentiment?text=This is bad.&text=This is good.", timeout=60) + self.assertEqual(resp.json(), ["NEGATIVE", "POSITIVE"]) + + def test_train(self): + """Train with some vague sentences.""" + warnings.warn("This test takes a LONG time. Leave while you can.") + df = pd.read_json("https://bit.ly/3NesHFs") + resp = self.get( + "/sentiment?_action=train&target_col=label", + method='post', + data=df.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + timeout=300, + ) + self.assertGreaterEqual(resp.json()['score'], 0.9) + + @skipUnless(STATSMODELS_INSTALLED, "Please install statsmodels to run these tests.") class TestStatsmodels(TestGramex): - @classmethod def setUpClass(cls): - cls.root = op.join(gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler') - paths = [op.join(cls.root, f) for f in [ - 'mlhandler-sarimax/config.json', - 'mlhandler-sarimax/data.h5', - 'mlhandler-sarimax/mlhandler-sarimax.pkl', - ]] + cls.root = op.join(gramex.config.variables["GRAMEXDATA"], "apps", "mlhandler") + paths = [ + op.join(cls.root, f) + for f in [ + "mlhandler-sarimax/config.json", + "mlhandler-sarimax/data.h5", + "mlhandler-sarimax/mlhandler-sarimax.pkl", + ] + ] for p in paths: tempfiles[p] = p def test_sarimax(self): - resp = self.get('/sarimax') + resp = self.get("/sarimax") self.assertEqual(resp.status_code, NOT_FOUND) # Create the model data = infl_load().data index = pd.to_datetime( - data[['year', 'quarter']].apply(lambda x: '%dQ%d' % (x.year, x.quarter), axis=1) - ) - data.drop(['year', 'quarter'], axis=1, inplace=True) - data['index'] = index - params = { - 'index_col': 'index', - 'target_col': 'R' - } - resp = self.get('/sarimax', data=params, method='put') + data[["year", "quarter"]].apply( + lambda x: "%dQ%d" % (x.year, x.quarter), axis=1 + ) + ) + data.drop(["year", "quarter"], axis=1, inplace=True) + data["index"] = index + params = {"index_col": "index", "target_col": "R"} + resp = self.get("/sarimax", data=params, method="put") self.assertEqual(resp.status_code, OK) # Train the model - resp = self.get('/sarimax?_action=append', method='post', - data=data.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) + resp = self.get( + "/sarimax?_action=append", + method="post", + data=data.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) self.assertEqual(resp.status_code, OK) - resp = self.get('/sarimax?_action=train', method='post') - self.assertLessEqual(resp.json()['score'], 0.01) + resp = self.get("/sarimax?_action=train", method="post") + self.assertLessEqual(resp.json()["score"], 0.01) # Get predictions - resp = 
self.get('/sarimax', method='post', - data=data[['index', 'Dp']].to_json(orient='records'), - headers={'Content-Type': 'application/json'}) + resp = self.get( + "/sarimax", + method="post", + data=data[["index", "Dp"]].to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) self.assertEqual(resp.status_code, OK) - self.assertLessEqual(mean_absolute_error(np.array(resp.json()), data['R']), 0.01) + self.assertLessEqual( + mean_absolute_error(np.array(resp.json()), data["R"]), 0.01 + ) class TestSklearn(TestGramex): @@ -80,130 +133,151 @@ class TestSklearn(TestGramex): @classmethod def setUpClass(cls): - cls.df = pd.read_csv(op.join(folder, '..', 'testlib', 'iris.csv'), encoding='utf8') - cls.root = op.join(gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler') - cls.model_path = op.join(cls.root, 'mlhandler-config', 'mlhandler-config.pkl') - paths = [op.join(cls.root, f) for f in [ - 'mlhandler-nopath/config.json', - 'mlhandler-nopath/data.h5', - 'mlhandler-blank/config.json', - 'mlhandler-blank/data.h5', - 'mlhandler-blank/mlhandler-blank.pkl', - 'mlhandler-config/config.json', - 'mlhandler-config/data.h5', - 'mlhandler-incr/config.json', - 'mlhandler-incr/data.h5', - 'mlhandler-incr/mlhandler-incr.pkl', - 'mlhandler-xform/config.json', - 'mlhandler-xform/data.h5', - 'mlhandler-xform/mlhandler-xform.pkl', - 'mlhandler-nopath/mlhandler-nopath.pkl', - 'mlhandler-badcol/config.json', - 'mlhandler-badcol/data.h5', - 'mlhandler-badcol/mlhandler-badcol.pkl', - 'mlhandler-decompositions/config.json', - 'mlhandler-decompositions/data.h5', - 'mlhandler-decompositions/mlhandler-decompositions.pkl', - 'mlhandler-pipeline/config.json', - 'mlhandler-pipeline/data.h5', - 'mlhandler-pipeline/mlhandler-pipeline.pkl' - ]] - paths += [op.join(folder, 'model.pkl')] + cls.df = pd.read_csv( + op.join(folder, "..", "testlib", "iris.csv"), encoding="utf8" + ) + cls.root = op.join(gramex.config.variables["GRAMEXDATA"], "apps", "mlhandler") + cls.model_path = op.join(cls.root, "mlhandler-config", "mlhandler-config.pkl") + paths = [ + op.join(cls.root, f) + for f in [ + "mlhandler-nopath/config.json", + "mlhandler-nopath/data.h5", + "mlhandler-blank/config.json", + "mlhandler-blank/data.h5", + "mlhandler-blank/mlhandler-blank.pkl", + "mlhandler-config/config.json", + "mlhandler-config/data.h5", + "mlhandler-incr/config.json", + "mlhandler-incr/data.h5", + "mlhandler-incr/mlhandler-incr.pkl", + "mlhandler-xform/config.json", + "mlhandler-xform/data.h5", + "mlhandler-xform/mlhandler-xform.pkl", + "mlhandler-nopath/mlhandler-nopath.pkl", + "mlhandler-badcol/config.json", + "mlhandler-badcol/data.h5", + "mlhandler-badcol/mlhandler-badcol.pkl", + "mlhandler-decompositions/config.json", + "mlhandler-decompositions/data.h5", + "mlhandler-decompositions/mlhandler-decompositions.pkl", + "mlhandler-pipeline/config.json", + "mlhandler-pipeline/data.h5", + "mlhandler-pipeline/mlhandler-pipeline.pkl", + ] + ] + paths += [op.join(folder, "model.pkl")] for p in paths: tempfiles[p] = p - circles = op.join(folder, 'circles.csv') + circles = op.join(folder, "circles.csv") tempfiles[circles] = circles def test_append(self): try: - r = self.get('/mlhandler?_action=append', method='post', - data=self.df.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) + r = self.get( + "/mlhandler?_action=append", + method="post", + data=self.df.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) self.assertEqual(r.status_code, OK) - df = 
pd.DataFrame.from_records(self.get('/mlhandler?_cache').json()) + df = pd.DataFrame.from_records(self.get("/mlhandler?_cache").json()) self.assertEqual(df.shape[0], 2 * self.df.shape[0]) finally: - self.get('/mlhandler?delete=cache', method='delete') - self.get('/mlhandler?_action=append', method='post', - data=self.df.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) + self.get("/mlhandler?delete=cache", method="delete") + self.get( + "/mlhandler?_action=append", + method="post", + data=self.df.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) def test_append_train(self): - df_train = self.df[self.df['species'] != 'virginica'] - df_append = self.df[self.df['species'] == 'virginica'] + df_train = self.df[self.df["species"] != "virginica"] + df_append = self.df[self.df["species"] == "virginica"] - resp = self.get('/mlincr?class=LogisticRegression&target_col=species', method='put') + resp = self.get( + "/mlincr?class=LogisticRegression&target_col=species", method="put" + ) self.assertEqual(resp.status_code, OK) resp = self.get( - '/mlincr?_action=append', method='post', - data=df_train.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) + "/mlincr?_action=append", + method="post", + data=df_train.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) self.assertEqual(resp.status_code, OK) - resp = self.get('/mlincr?_action=train', method='post') + resp = self.get("/mlincr?_action=train", method="post") self.assertEqual(resp.status_code, OK) resp = self.get( - '/mlincr?_action=score', method='post', - data=self.df.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) - org_score = resp.json()['score'] + "/mlincr?_action=score", + method="post", + data=self.df.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) + org_score = resp.json()["score"] resp = self.get( - '/mlincr?_action=append', method='post', - data=df_append.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) + "/mlincr?_action=append", + method="post", + data=df_append.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) self.assertEqual(resp.status_code, OK) - resp = self.get('/mlincr?_action=train', method='post') + resp = self.get("/mlincr?_action=train", method="post") self.assertEqual(resp.status_code, OK) resp = self.get( - '/mlincr?_action=score', method='post', - data=self.df.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) - new_score = resp.json()['score'] + "/mlincr?_action=score", + method="post", + data=self.df.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) + new_score = resp.json()["score"] # Score should improve by at least 30% self.assertGreaterEqual(new_score - org_score, 0.29) # NOQA: E912 def test_blank_slate(self): # Assert that a model doesn't have to exist - model_path = op.join(self.root, 'mlhandler-blank', 'mlhandler-blank.pkl') + model_path = op.join(self.root, "mlhandler-blank", "mlhandler-blank.pkl") self.assertFalse(op.exists(model_path)) - r = self.get('/mlblank?sepal_length=5.9&sepal_width=3&petal_length=5.1&petal_width=1.8') + r = self.get( + "/mlblank?sepal_length=5.9&sepal_width=3&petal_length=5.1&petal_width=1.8" + ) self.assertEqual(r.status_code, NOT_FOUND) # Post options in any order, randomly - r = self.get('/mlblank?target_col=species', method='put') + r = self.get("/mlblank?target_col=species", method="put") self.assertEqual(r.status_code, 
OK) - r = self.get('/mlblank?exclude=petal_width', method='put') + r = self.get("/mlblank?exclude=petal_width", method="put") self.assertEqual(r.status_code, OK) - r = self.get('/mlblank?nums=sepal_length&nums=sepal_width&nums=petal_length', - method='put') + r = self.get( + "/mlblank?nums=sepal_length&nums=sepal_width&nums=petal_length", + method="put", + ) self.assertEqual(r.status_code, OK) - r = self.get('/mlblank?class=LogisticRegression', method='put') + r = self.get("/mlblank?class=LogisticRegression", method="put") self.assertEqual(r.status_code, OK) # check the training opts - params = self.get('/mlblank?_params').json() + params = self.get("/mlblank?_params").json() self.assertDictEqual( - params['opts'], + params["opts"], { - 'target_col': 'species', - 'exclude': ['petal_width'], - 'nums': ['sepal_length', 'sepal_width', 'petal_length'], - 'include': [], - 'pipeline': True, - 'drop_duplicates': True, - 'dropna': True, - 'cats': [], - 'index_col': None - } + "target_col": "species", + "exclude": ["petal_width"], + "nums": ["sepal_length", "sepal_width", "petal_length"], + "include": [], + "pipeline": True, + "drop_duplicates": True, + "dropna": True, + "cats": [], + "index_col": None, + }, ) self.assertDictEqual( - params['params'], - { - 'class': 'LogisticRegression', - 'params': {} - } + params["params"], {"class": "LogisticRegression", "params": {}} ) def test_change_model(self): @@ -212,61 +286,66 @@ def test_change_model(self): try: # put a new model r = self.get( - '/mlhandler?class=DecisionTreeClassifier&criterion=entropy&splitter=random', - method='put') + "/mlhandler?class=DecisionTreeClassifier&criterion=entropy&splitter=random", + method="put", + ) self.assertEqual(r.status_code, OK) - r = self.get('/mlhandler?_params') - self.assertDictEqual(r.json()['params'], { - 'class': 'DecisionTreeClassifier', - 'params': { - 'criterion': 'entropy', - 'splitter': 'random' - } - }) + r = self.get("/mlhandler?_params") + self.assertDictEqual( + r.json()["params"], + { + "class": "DecisionTreeClassifier", + "params": {"criterion": "entropy", "splitter": "random"}, + }, + ) # Train the model on the cache - self.get('/mlhandler?_action=retrain&target_col=species', method='post') + self.get("/mlhandler?_action=retrain&target_col=species", method="post") model = joblib.load(self.model_path) self.assertIsInstance(model, DecisionTreeClassifier) - self.assertEqual(model.criterion, 'entropy') - self.assertEqual(model.splitter, 'random') - resp = self.get( - '/mlhandler?_action=score', method='post') - self.assertGreaterEqual(resp.json()['score'], 0.8) # NOQA: E912 + self.assertEqual(model.criterion, "entropy") + self.assertEqual(model.splitter, "random") + resp = self.get("/mlhandler?_action=score", method="post") + self.assertGreaterEqual(resp.json()["score"], 0.8) # NOQA: E912 finally: # restore the backup joblib.dump(clf, self.model_path) def test_clear_cache(self): try: - r = self.get('/mlhandler?delete=cache', method='delete') + r = self.get("/mlhandler?delete=cache", method="delete") self.assertEqual(r.status_code, OK) - self.assertListEqual(self.get('/mlhandler?_cache').json(), []) + self.assertListEqual(self.get("/mlhandler?_cache").json(), []) finally: - self.get('/mlhandler?_action=append', method='post', - data=self.df.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) + self.get( + "/mlhandler?_action=append", + method="post", + data=self.df.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) def test_default(self): # Check if 
model has been trained on iris, and exists at the right path. clf = joblib.load(self.model_path) self.assertIsInstance(clf, LogisticRegression) - score = clf.score(self.df[[c for c in self.df if c != 'species']], self.df['species']) + score = clf.score( + self.df[[c for c in self.df if c != "species"]], self.df["species"] + ) self.assertGreaterEqual(score, self.ACC_TOL) def test_delete(self): clf = joblib.load(self.model_path) try: - r = self.get('/mlhandler?delete=model', method='delete') + r = self.get("/mlhandler?delete=model", method="delete") self.assertEqual(r.status_code, OK) self.assertFalse(op.exists(self.model_path)) # check if the correct error message is shown - r = self.get('/mlhandler?_model') + r = self.get("/mlhandler?_model") self.assertEqual(r.status_code, NOT_FOUND) finally: joblib.dump(clf, self.model_path) def test_download(self): - r = self.get('/mlhandler?_download') + r = self.get("/mlhandler?_download") buff = BytesIO(r.content) buff.seek(0) clf = joblib.load(buff) @@ -274,158 +353,217 @@ def test_download(self): def test_filtercols(self): buff = StringIO() - self.df.to_csv(buff, index=False, encoding='utf8') + self.df.to_csv(buff, index=False, encoding="utf8") # Train excluding two columns: buff.seek(0) clf = joblib.load(self.model_path) try: - resp = self.get('/mlhandler?class=LogisticRegression&target_col=species' - '&exclude=sepal_width&exclude=petal_length', - method='put') - resp = self.get('/mlhandler?_action=retrain', - files={'file': ('iris.csv', buff.read())}, - method='post') - self.assertGreaterEqual(resp.json()['score'], 0.8) # NOQA: E912 - - r = self.get('/mlhandler?_cache').json() + resp = self.get( + "/mlhandler?class=LogisticRegression&target_col=species" + "&exclude=sepal_width&exclude=petal_length", + method="put", + ) + resp = self.get( + "/mlhandler?_action=retrain", + files={"file": ("iris.csv", buff.read())}, + method="post", + ) + self.assertGreaterEqual(resp.json()["score"], 0.8) # NOQA: E912 + + r = self.get("/mlhandler?_cache").json() # Check that the data still has all columns self.assertSetEqual( set(pd.DataFrame.from_records(r).columns), - {'sepal_length', 'petal_width', 'petal_length', 'sepal_width', 'species'}) + { + "sepal_length", + "petal_width", + "petal_length", + "sepal_width", + "species", + }, + ) # But the model has only two pipe = joblib.load(self.model_path) self.assertEqual(pipe.coef_.shape, (3, 2)) # Train including one column: buff.seek(0) - self.get('/mlhandler?include=sepal_width', method='put') - resp = self.get('/mlhandler?_action=retrain', - method='post', files={'file': ('iris.csv', buff.read())}) - self.assertGreaterEqual(resp.json()['score'], 0.5) + self.get("/mlhandler?include=sepal_width", method="put") + resp = self.get( + "/mlhandler?_action=retrain", + method="post", + files={"file": ("iris.csv", buff.read())}, + ) + self.assertGreaterEqual(resp.json()["score"], 0.5) # check coefficients shape pipe = joblib.load(self.model_path) self.assertEqual(pipe.coef_.shape, (3, 1)) finally: - self.get('/mlhandler?delete=opts&_opts=include&_opts=exclude', method='delete') + self.get( + "/mlhandler?delete=opts&_opts=include&_opts=exclude", method="delete" + ) joblib.dump(clf, self.model_path) - def test_get_bulk_predictions(self, target_col='species'): + def test_get_bulk_predictions(self, target_col="species"): df = self.df.drop_duplicates() - target = df.pop('species') - resp = self.get('/mlhandler', method='post', data=df.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) + target = 
df.pop("species") + resp = self.get( + "/mlhandler", + method="post", + data=df.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) out = pd.DataFrame.from_records(resp.json()) self.assertGreaterEqual(accuracy_score(target, out[target_col]), self.ACC_TOL) def test_get_bulk_score(self): resp = self.get( - '/mlhandler?_action=score', method='post', - data=self.df.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) - self.assertGreaterEqual(resp.json()['score'], self.ACC_TOL) + "/mlhandler?_action=score", + method="post", + data=self.df.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) + self.assertGreaterEqual(resp.json()["score"], self.ACC_TOL) resp = self.get( - '/mlhandler?_action=score&_metric=f1_weighted', method='post', - data=self.df.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) - self.assertGreaterEqual(resp.json()['score'], self.ACC_TOL) + "/mlhandler?_action=score&_metric=f1_weighted", + method="post", + data=self.df.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) + self.assertGreaterEqual(resp.json()["score"], self.ACC_TOL) def test_get_cache(self): - df = pd.DataFrame.from_records(self.get('/mlhandler?_cache=true').json()) + df = pd.DataFrame.from_records(self.get("/mlhandler?_cache=true").json()) pd.testing.assert_frame_equal(df, self.df) def test_get_model_params(self): - params = self.get('/mlhandler?_model').json() + params = self.get("/mlhandler?_model").json() self.assertDictEqual(LogisticRegression().get_params(), params) - def test_get_predictions(self, root="mlhandler", target_col='species'): + def test_get_predictions(self, root="mlhandler", target_col="species"): resp = self.get( - f'/{root}?sepal_length=5.9&sepal_width=3&petal_length=5.1&petal_width=1.8') - self.assertEqual(resp.json(), [ - {'sepal_length': 5.9, 'sepal_width': 3.0, - 'petal_length': 5.1, 'petal_width': 1.8, - target_col: 'virginica'} - ]) + f"/{root}?sepal_length=5.9&sepal_width=3&petal_length=5.1&petal_width=1.8" + ) + self.assertEqual( + resp.json(), + [ + { + "sepal_length": 5.9, + "sepal_width": 3.0, + "petal_length": 5.1, + "petal_width": 1.8, + target_col: "virginica", + } + ], + ) resp = self.get( - f'/{root}?sepal_width=3&petal_length=5.1&sepal_length=5.9&petal_width=1.8') - self.assertEqual(resp.json(), [ - {'sepal_length': 5.9, 'sepal_width': 3.0, - 'petal_length': 5.1, 'petal_width': 1.8, - target_col: 'virginica'} - ]) - req = f'/{root}?' + f"/{root}?sepal_width=3&petal_length=5.1&sepal_length=5.9&petal_width=1.8" + ) + self.assertEqual( + resp.json(), + [ + { + "sepal_length": 5.9, + "sepal_width": 3.0, + "petal_length": 5.1, + "petal_width": 1.8, + target_col: "virginica", + } + ], + ) + req = f"/{root}?" 
samples = [] target = [] - for row in self.df.sample(n=5).to_dict(orient='records'): - samples.extend([(col, value) for col, value in row.items() if col != 'species']) - target.append(row['species']) - params = '&'.join([f'{k}={v}' for k, v in samples]) + for row in self.df.sample(n=5).to_dict(orient="records"): + samples.extend( + [(col, value) for col, value in row.items() if col != "species"] + ) + target.append(row["species"]) + params = "&".join([f"{k}={v}" for k, v in samples]) resp = self.get(req + params) self.assertGreaterEqual( - accuracy_score([c[target_col] for c in resp.json()], target), 0.8) # NOQA: E912 + accuracy_score([c[target_col] for c in resp.json()], target), 0.8 + ) # NOQA: E912 def test_get_predictions_post_file(self): df = self.df.drop_duplicates() - target = df.pop('species') + target = df.pop("species") buff = StringIO() - df.to_csv(buff, index=False, encoding='utf8') + df.to_csv(buff, index=False, encoding="utf8") buff.seek(0) - resp = self.get('/mlhandler?_action=predict', - method='post', files={'file': ('iris.csv', buff)}) - pred = pd.DataFrame.from_records(resp.json())['species'] + resp = self.get( + "/mlhandler?_action=predict", + method="post", + files={"file": ("iris.csv", buff)}, + ) + pred = pd.DataFrame.from_records(resp.json())["species"] self.assertGreaterEqual(accuracy_score(target, pred), self.ACC_TOL) def test_get_predictions_duplicates(self): df = self.df.drop_duplicates() df = pd.concat([df, df], axis=0, ignore_index=True) - target = df.pop('species') + target = df.pop("species") buff = StringIO() - df.to_csv(buff, index=False, encoding='utf8') + df.to_csv(buff, index=False, encoding="utf8") buff.seek(0) - resp = self.get('/mlhandler?_action=predict', - method='post', files={'file': ('iris.csv', buff)}) - pred = pd.DataFrame.from_records(resp.json())['species'] + resp = self.get( + "/mlhandler?_action=predict", + method="post", + files={"file": ("iris.csv", buff)}, + ) + pred = pd.DataFrame.from_records(resp.json())["species"] self.assertGreaterEqual(accuracy_score(target, pred), self.ACC_TOL) def test_get_predictions_post_json_file(self): df = self.df.drop_duplicates() - target = df.pop('species') + target = df.pop("species") buff = StringIO() - df.to_json(buff, orient='records') + df.to_json(buff, orient="records") buff.seek(0) - resp = self.get('/mlhandler?_action=predict', - method='post', files={'file': ('iris.json', buff)}) - pred = pd.DataFrame.from_records(resp.json())['species'] + resp = self.get( + "/mlhandler?_action=predict", + method="post", + files={"file": ("iris.json", buff)}, + ) + pred = pd.DataFrame.from_records(resp.json())["species"] self.assertGreaterEqual(accuracy_score(target, pred), self.ACC_TOL) def test_model_default_path(self): clf = joblib.load( - op.join(self.root, - 'mlhandler-nopath', 'mlhandler-nopath.pkl')) + op.join(self.root, "mlhandler-nopath", "mlhandler-nopath.pkl") + ) self.assertIsInstance(clf, LogisticRegression) resp = self.get( - '/mlnopath?_action=score', method='post', - headers={'Content-Type': 'application/json'}) - self.assertGreaterEqual(resp.json()['score'], self.ACC_TOL) + "/mlnopath?_action=score", + method="post", + headers={"Content-Type": "application/json"}, + ) + self.assertGreaterEqual(resp.json()["score"], self.ACC_TOL) def test_post_after_delete_custom_model(self): org_clf = joblib.load(self.model_path) try: - r = self.get('/mlhandler?delete=model', method='delete') + r = self.get("/mlhandler?delete=model", method="delete") self.assertEqual(r.status_code, OK) 
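+            # The DELETE request should also have removed the pickle from disk.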
self.assertFalse(op.exists(self.model_path)) # recreate the model X, y = make_classification() # NOQA: N806 - xtrain, xtest, ytrain, ytest = train_test_split(X, y, stratify=y, test_size=0.25) + xtrain, xtest, ytrain, ytest = train_test_split( + X, y, stratify=y, test_size=0.25 + ) df = pd.DataFrame(xtrain) - df['target'] = ytrain - r = self.get('/mlhandler?class=GaussianNB', method='put') + df["target"] = ytrain + r = self.get("/mlhandler?class=GaussianNB", method="put") self.assertEqual(r.status_code, OK) - r = self.get('/mlhandler?_action=train&target_col=target', method='post', - data=df.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) + r = self.get( + "/mlhandler?_action=train&target_col=target", + method="post", + data=df.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) self.assertEqual(r.status_code, OK) - self.assertGreaterEqual(r.json()['score'], 0.8) # NOQA: E912 + self.assertGreaterEqual(r.json()["score"], 0.8) # NOQA: E912 clf = joblib.load(self.model_path) self.assertIsInstance(clf, GaussianNB) finally: @@ -434,101 +572,126 @@ def test_post_after_delete_custom_model(self): def test_post_after_delete_default_model(self): clf = joblib.load(self.model_path) try: - r = self.get('/mlhandler?delete=model', method='delete') + r = self.get("/mlhandler?delete=model", method="delete") self.assertEqual(r.status_code, OK) self.assertFalse(op.exists(self.model_path)) # recreate the model X, y = make_classification() # NOQA: N806 - xtrain, xtest, ytrain, ytest = train_test_split(X, y, stratify=y, test_size=0.25) + xtrain, xtest, ytrain, ytest = train_test_split( + X, y, stratify=y, test_size=0.25 + ) df = pd.DataFrame(xtrain) - df['target'] = ytrain - r = self.get('/mlhandler?_action=train&target_col=target', method='post', - data=df.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) + df["target"] = ytrain + r = self.get( + "/mlhandler?_action=train&target_col=target", + method="post", + data=df.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) self.assertEqual(r.status_code, OK) - self.assertGreaterEqual(r.json()['score'], 0.8) # NOQA: E912 + self.assertGreaterEqual(r.json()["score"], 0.8) # NOQA: E912 finally: joblib.dump(clf, self.model_path) def test_retrain(self): # Make some data x, y = make_classification() - xtrain, xtest, ytrain, ytest = train_test_split(x, y, stratify=y, test_size=0.25) + xtrain, xtest, ytrain, ytest = train_test_split( + x, y, stratify=y, test_size=0.25 + ) df = pd.DataFrame(xtrain) - df['target'] = ytrain + df["target"] = ytrain test_df = pd.DataFrame(xtest) - test_df['target'] = ytest + test_df["target"] = ytest clf = joblib.load(self.model_path) try: # clear the cache - resp = self.get('/mlhandler?delete=cache', method='delete') + resp = self.get("/mlhandler?delete=cache", method="delete") self.assertEqual(resp.status_code, OK) - resp = self.get('/mlhandler?_cache') + resp = self.get("/mlhandler?_cache") self.assertListEqual(resp.json(), []) # append new data, don't train - self.get('/mlhandler?_action=append', method='post', data=df.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) + self.get( + "/mlhandler?_action=append", + method="post", + data=df.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) # now, retrain - self.get('/mlhandler?_action=retrain&target_col=target', method='post') + self.get("/mlhandler?_action=retrain&target_col=target", method="post") # Check score against test dataset resp 
= self.get( - '/mlhandler?_action=score', method='post', - data=test_df.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) - self.assertGreaterEqual(resp.json()['score'], 0.6) # NOQA: E912 + "/mlhandler?_action=score", + method="post", + data=test_df.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) + self.assertGreaterEqual(resp.json()["score"], 0.6) # NOQA: E912 finally: # revert to the original cache - self.get('/mlhandler?delete=cache', method='delete') - self.get('/mlhandler?_action=append', method='post', - data=self.df.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) + self.get("/mlhandler?delete=cache", method="delete") + self.get( + "/mlhandler?_action=append", + method="post", + data=self.df.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) joblib.dump(clf, self.model_path) def test_single_line_train_fetch_model(self): - resp = self.get('/mlblank?class=DecisionTreeClassifier&target_col=species', - method='put') + resp = self.get( + "/mlblank?class=DecisionTreeClassifier&target_col=species", method="put" + ) self.assertEqual(resp.status_code, OK) # train line = StringIO() - self.df.head(1).to_csv(line, index=False, encoding='utf8') + self.df.head(1).to_csv(line, index=False, encoding="utf8") line.seek(0) - resp = self.get('/mlblank?_action=train', method='post', - files={'file': ('iris.csv', line.read())}) + resp = self.get( + "/mlblank?_action=train", + method="post", + files={"file": ("iris.csv", line.read())}, + ) self.assertEqual(resp.status_code, OK) - self.assertGreaterEqual(resp.json()['score'], 0.0) + self.assertGreaterEqual(resp.json()["score"], 0.0) # get the model - resp = self.get('/mlblank') + resp = self.get("/mlblank") self.assertEqual(resp.status_code, OK) # Assert that the model is a pipeline - model = joblib.load(op.join(self.root, 'mlhandler-blank', 'mlhandler-blank.pkl')) - self.assertIsInstance(model, Pipeline) - self.assertIsInstance(model.named_steps['transform'], ColumnTransformer) - self.assertIsInstance( - model.named_steps['DecisionTreeClassifier'], DecisionTreeClassifier + model = joblib.load( + op.join(self.root, "mlhandler-blank", "mlhandler-blank.pkl") ) + self.assertIsInstance(model, Pipeline) + self.assertIsInstance(model["transform"], ColumnTransformer) + self.assertIsInstance(model["DecisionTreeClassifier"], DecisionTreeClassifier) def test_template(self): """Check if viewing the template works fine.""" - r = self.get('/mlhandler') + r = self.get("/mlhandler") self.assertEqual(r.status_code, OK) # Try getting predictions - self.test_get_predictions(target_col='target') - self.test_get_bulk_predictions('target') + self.test_get_predictions(target_col="target") + self.test_get_bulk_predictions("target") def test_train(self): # backup the original model clf = joblib.load(self.model_path) X, y = make_classification() # NOQA: N806 - xtrain, xtest, ytrain, ytest = train_test_split(X, y, stratify=y, test_size=0.25) + xtrain, xtest, ytrain, ytest = train_test_split( + X, y, stratify=y, test_size=0.25 + ) df = pd.DataFrame(xtrain) - df['target'] = ytrain + df["target"] = ytrain try: - resp = self.get('/mlhandler?_action=train&target_col=target', method='post', - data=df.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) - self.assertGreaterEqual(resp.json()['score'], 0.8) # NOQA: E912 + resp = self.get( + "/mlhandler?_action=train&target_col=target", + method="post", + data=df.to_json(orient="records"), + 
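+                # Fresh synthetic data is posted as a JSON body, like a client would send.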
headers={"Content-Type": "application/json"}, + ) + self.assertGreaterEqual(resp.json()["score"], 0.8) # NOQA: E912 finally: joblib.dump(clf, self.model_path) # TODO: The target_col has to be reset to species for a correct teardown. @@ -536,50 +699,65 @@ def test_train(self): # Find an atomic way to reset configurations. def test_datatransform(self): - with open(op.join(op.dirname(__file__), 'circles.csv'), 'r', encoding='utf8') as fin: - resp = self.get('/mltransform?_action=score', method='post', - files={'file': ('circles.csv', fin.read())}) - self.assertEqual(resp.json()['score'], 1) + with open( + op.join(op.dirname(__file__), "circles.csv"), "r", encoding="utf8" + ) as fin: + resp = self.get( + "/mltransform?_action=score", + method="post", + files={"file": ("circles.csv", fin.read())}, + ) + self.assertEqual(resp.json()["score"], 1) def test_invalid_category(self): - self.test_get_predictions('mlhandlerbadcol') + self.test_get_predictions("mlhandlerbadcol") def test_pca(self): xtr, xts = train_test_split(self.df, random_state=12345) # Append the training data - r = self.get('/mldecompose?_action=append', method='post', - data=xtr.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) + r = self.get( + "/mldecompose?_action=append", + method="post", + data=xtr.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) self.assertEqual(r.status_code, OK) # Train on it and check attributes - r = self.get('/mldecompose?_action=train', method='post') + r = self.get("/mldecompose?_action=train", method="post") attributes = r.json() - sv1, sv2 = attributes['singular_values_'] + sv1, sv2 = attributes["singular_values_"] self.assertEqual(round(sv1), 20) # NOQA: E912 self.assertEqual(round(sv2), 5) # Check if test data is transformed - r = self.get('/mldecompose?_action=predict', method='post', - data=xts.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) + r = self.get( + "/mldecompose?_action=predict", + method="post", + data=xts.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) x_red = np.array(r.json()) self.assertEqual(x_red.shape, (xts.shape[0], 2)) # Check that the fitted attributes are available from GET ?_params too - params = self.get('/mldecompose?_params').json() - sv1, sv2 = params['attrs']['singular_values_'] + params = self.get("/mldecompose?_params").json() + sv1, sv2 = params["attrs"]["singular_values_"] self.assertEqual(round(sv1), 20) # NOQA: E912 self.assertEqual(round(sv2), 5) def test_pipeline_features(self): - df = pd.read_csv('actors.csv') # Send the df as is, no preprocessing - r = self.get('/mlpipe?_action=score', method='post', data=df.to_json(orient='records'), - headers={'Content-Type': 'application/json'}) - self.assertEqual(r.json()['score'], 1) + df = pd.read_csv("actors.csv") # Send the df as is, no preprocessing + r = self.get( + "/mlpipe?_action=score", + method="post", + data=df.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) + self.assertEqual(r.json()["score"], 1) # Inspect the model and check that the pipeline rules apply - buff = BytesIO(self.get('/mlpipe?_download').content) + buff = BytesIO(self.get("/mlpipe?_download").content) buff.seek(0) model = joblib.load(buff) self.assertEqual( - [(k[0], k[-1]) for k in model['transform'].transformers_], - [('ohe', ['category']), ('scaler', ['votes'])] + [(k[0], k[-1]) for k in model["transform"].transformers_], + [("ohe", ["category"]), ("scaler", ["votes"])], ) From 
ed46d77cb1ba4eb8e844fa118f8542eb208c6403 Mon Sep 17 00:00:00 2001 From: "deshpande.jaidev@gmail.com" Date: Wed, 1 Jun 2022 10:41:50 +0530 Subject: [PATCH 4/8] ENH: NER Entity Recognition with Huggingface transformers MLHandler now supports trainable named entity recognition. --- gramex/handlers/mlhandler.py | 11 +- gramex/ml_api.py | 6 +- gramex/transformers.py | 207 ++++++++++++++++++++++++++++++++--- tests/gramex.yaml | 8 ++ tests/test_mlhandler.py | 40 ++++++- 5 files changed, 248 insertions(+), 24 deletions(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index b1a9e662b..e0f2b34db 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -98,7 +98,7 @@ def setup(cls, data=None, model={}, config_dir='', template=DEFAULT_TEMPLATE, ** if op.exists(cls.store.model_path): # If the pkl exists, load it if op.isdir(cls.store.model_path): mclass, wrapper = ml.search_modelclass(mclass) - cls.model = locate(wrapper).from_disk(mclass, cls.store.model_path) + cls.model = locate(wrapper).from_disk(cls.store.model_path, mclass) else: cls.model = get_model(cls.store.model_path, {}) elif data is not None: @@ -134,7 +134,7 @@ def _parse_multipart_form_data(self): return pd.concat(dfs, axis=0) def _parse_application_json(self): - return pd.read_json(self.request.body.decode('utf8')) + return pd.read_json(self.request.body) def _parse_data(self, _cache=True, append=False): header = self.request.headers.get('Content-Type', '').split(';')[0] @@ -172,7 +172,12 @@ def _filterrows(cls, data, **kwargs): action = kwargs.get(method, cls.store.load(method, True)) if action: subset = action if isinstance(action, list) else None - data = getattr(data, method)(subset=subset) + try: + data = getattr(data, method)(subset=subset) + except TypeError as exc: + # The label column for an NER dataset is a nested list. + # Can't do drop_duplicates on that. + app_log.warning(exc) return data def _transform(self, data, **kwargs): diff --git a/gramex/ml_api.py b/gramex/ml_api.py index 98d3380b0..6493e9d28 100644 --- a/gramex/ml_api.py +++ b/gramex/ml_api.py @@ -368,7 +368,11 @@ def predict( """ p = self._predict(X, **kwargs) if target_col: - X[target_col] = p + try: + X[target_col] = p + except ValueError: + # This happens for NER: predictions of a single sample can be multiple entities. + X[target_col] = [p] return X return p diff --git a/gramex/transformers.py b/gramex/transformers.py index 12514dee8..52f043c6a 100644 --- a/gramex/transformers.py +++ b/gramex/transformers.py @@ -1,15 +1,105 @@ import os.path as op +from typing import List import pandas as pd +import spacy import transformers as trf -from datasets import Dataset +from datasets import Dataset, load_metric from gramex.config import app_log from gramex import cache from sklearn.metrics import roc_auc_score -DEFAULT_MODEL = DEFAULT_TOKENIZER = "distilbert-base-uncased-finetuned-sst-2-english" +def biluo2iob(tags: List[str]) -> List[str]: + """Convert BILOU tags to IOB tags. + + spaCy insists on BILOU tags, but most transformers models use IOB tags. + + Parameters + ---------- + tags : list + List of BILOU tags + + Returns + ------- + list + List of IOB tags. + + Example + ------- + >>> # "Joe R Biden is President of the United States ." 
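+    >>> # BILOU scheme: B=begin, I=inside, L=last, U=unit (single token), O=outside.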
+ >>> tags = ['B-PER', 'I-PER', 'L-PER', 'O', 'U-PER', 'O', 'O', 'B-LOC', 'L-LOC', 'O'] + >>> biluo2iob(tags) + ['B-PER', 'I-PER', 'I-PER', 'O', 'B-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'O'] + """ + # Replace L + tags = [t.replace("L-", "I-") for t in tags] + # Replace U + tags = [t.replace("U-", "B-") for t in tags] + return tags + + +def offsets2iob(text: spacy.tokens.Doc, entities: List[dict]) -> List[str]: + """Convert named entity offsets to a sequence of IOB tags. + + Parameters + ---------- + text : spacy.tokens.Doc + spaCy document of the original text + entities : list + Named entities present in the document as a list of dicts. + Each dict represents one named entity and must contain three keys: + 1. "start": the start offset of the entity + 2. "end": the end offset of the entity + 3. "label": the label of the entity + + Returns + ------- + list + A list of IOB tags for the document. + + Example + ------- + >>> import spacy + >>> nlp = load('en') + >>> doc = nlp('Narendra Modi is the PM of India.') + >>> entities = [{'start': 0, 'end': 13, 'label': 'PER'}, + ... {'start': 27, 'end': 32, 'label': 'LOC'}] + >>> offsets2iob(doc, entities) + ['B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'B-LOC', 'O'] + """ + entities = [(ent["start"], ent["end"], ent["label"]) for ent in entities] + tags = spacy.training.offsets_to_biluo_tags(text, entities) + return biluo2iob(tags) + + +def tokenize_and_align_labels(examples, tokenizer): + tokenized_inputs = tokenizer( + examples["text"], truncation=True, is_split_into_words=True + ) + + labels = [] + for i, label in enumerate(examples["ner_tags"]): + word_ids = tokenized_inputs.word_ids( + batch_index=i + ) # Map tokens to their respective word. + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: # Set the special tokens to -100. + if word_idx is None: + label_ids.append(-100) + elif ( + word_idx != previous_word_idx + ): # Only label the first token of a given word. 
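+                # The first sub-token carries the word's tag id; later pieces get -100, which the loss ignores.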
+ label_ids.append(label[word_idx]) + else: + label_ids.append(-100) + previous_word_idx = word_idx + labels.append(label_ids) + + tokenized_inputs["labels"] = labels + return tokenized_inputs def load_pretrained(klass, path, default, **kwargs): @@ -27,22 +117,41 @@ def load_pretrained(klass, path, default, **kwargs): class BaseTransformer(object): - def __init__(self, model=DEFAULT_MODEL, tokenizer=DEFAULT_TOKENIZER, **kwargs): + def __init__(self, model=None, tokenizer=None, **kwargs): + if model is None: + model = self.DEFAULT_MODEL + if tokenizer is None: + tokenizer = self.DEFAULT_TOKENIZER self._model = model self._tokenizer = tokenizer - self.model = load_pretrained( - trf.AutoModelForSequenceClassification, model, DEFAULT_MODEL - ) + self.model = load_pretrained(self.AUTO_CLASS, model, self.DEFAULT_MODEL) self.tokenizer = load_pretrained( - trf.AutoTokenizer, tokenizer, DEFAULT_TOKENIZER + trf.AutoTokenizer, tokenizer, self.DEFAULT_TOKENIZER ) + self.pipeline_kwargs = kwargs self.pipeline = trf.pipeline( - self.task, model=self.model, tokenizer=self.tokenizer + self.task, model=self.model, tokenizer=self.tokenizer, **kwargs + ) + + def post_train(self, model_path): + """Move the model to the CPU, save it with the tokenizer, recreate the pipeline.""" + self.model.to("cpu") + self.model.save_pretrained(op.join(model_path, "model")) + self.tokenizer.save_pretrained(op.join(model_path, "tokenizer")) + self.pipeline = trf.pipeline( + self.task, + model=self.model, + tokenizer=self.tokenizer, + **self.pipeline_kwargs, ) class SentimentAnalysis(BaseTransformer): task = "sentiment-analysis" + DEFAULT_MODEL = ( + DEFAULT_TOKENIZER + ) = "distilbert-base-uncased-finetuned-sst-2-english" + AUTO_CLASS = trf.AutoModelForSequenceClassification def fit(self, text, labels, model_path, **kwargs): if pd.api.types.is_object_dtype(labels): @@ -57,14 +166,7 @@ def fit(self, text, labels, model_path, **kwargs): model=self.model, train_dataset=tokenized, args=train_args ) trainer.train() - self.model.to("cpu") - self.model.save_pretrained(op.join(model_path, "model")) - self.tokenizer.save_pretrained(op.join(model_path, "tokenizer")) - self.pipeline = trf.pipeline( - self.task, - model=self.model, - tokenizer=self.tokenizer, - ) + self.post_train(model_path) def predict(self, text, **kwargs): text = text.tolist() @@ -76,3 +178,76 @@ def score(self, X, y_true, **kwargs): y_pred = self.predict(X.squeeze("columns")) y_pred = [self.model.config.label2id[x] for x in y_pred] return roc_auc_score(y_true, y_pred) + + +class NER(BaseTransformer): + task = "ner" + DEFAULT_TOKENIZER = ( + DEFAULT_MODEL + ) = "dbmdz/bert-large-cased-finetuned-conll03-english" + AUTO_CLASS = trf.AutoModelForTokenClassification + + def __init__(self, model=None, tokenizer=None, **kwargs): + self.nlp = spacy.blank("en") + super(NER, self).__init__( + model=model, tokenizer=tokenizer, aggregation_strategy="first", **kwargs + ) + + @property + def labels(self): + return set([k.split("-")[-1] for k in self.model.config.label2id]) + + def predict(self, text, **kwargs): + text = text.tolist() + return self.pipeline(text) + + def score(self, X, y_true, **kwargs): + try: + metric = load_metric("seqeval") + except ImportError: + app_log.error("Could not load the seqeval metric. 
Scoring not supported.") + return 0 + # Get references and predictions + X = X.squeeze("columns") + predictions = self.predict(X) + for pred in predictions: + for ent in pred: + ent.update({"label": ent.pop("entity_group")}) + preds = [] + refs = [] + for doc, pred, ref in zip(self.nlp.pipe(X.tolist()), predictions, y_true): + preds.append(offsets2iob(doc, pred)) + refs.append(offsets2iob(doc, ref)) + metrics = metric.compute(references=refs, predictions=preds) + return pd.DataFrame( + {k: v for k, v in metrics.items() if k in self.labels} + ).reset_index() + + def fit(self, text, labels, model_path, **kwargs): + texts = [] + ner_tags = [] + for doc, ents in zip(self.nlp.pipe(text.tolist()), labels): + texts.append([t.text for t in doc]) + ner_tags.append(offsets2iob(doc, ents)) + + label2id = self.model.config.label2id + ner_tags = [[label2id.get(k, 0) for k in tags] for tags in ner_tags] + + dataset = Dataset.from_dict({"text": texts, "ner_tags": ner_tags}) + tokenized = dataset.map( + lambda x: tokenize_and_align_labels(x, self.tokenizer), batched=True + ) + collator = trf.DataCollatorForTokenClassification(tokenizer=self.tokenizer) + args = trf.TrainingArguments( + save_strategy="no", output_dir=model_path, evaluation_strategy="epoch" + ) + trainer = trf.Trainer( + model=self.model, + args=args, + train_dataset=tokenized, + eval_dataset=tokenized, + tokenizer=self.tokenizer, + data_collator=collator, + ) + trainer.train() + self.post_train(model_path) diff --git a/tests/gramex.yaml b/tests/gramex.yaml index 09285dc1f..e900f4fb5 100644 --- a/tests/gramex.yaml +++ b/tests/gramex.yaml @@ -1260,6 +1260,14 @@ url: class: SentimentAnalysis xsrf_cookies: false + mlhandler/huggingface/ner: + pattern: /ner + handler: MLHandler + kwargs: + model: + class: NER + xsrf_cookies: false + capture: pattern: /capture handler: CaptureHandler diff --git a/tests/test_mlhandler.py b/tests/test_mlhandler.py index 3851f4deb..446ba4b4d 100644 --- a/tests/test_mlhandler.py +++ b/tests/test_mlhandler.py @@ -52,23 +52,55 @@ def tearDownClass(cls): if op.isdir(path): shutil.rmtree(path) - def test_blank_predictions(self): + def test_default_sentiment(self): """Ensure that the default model predicts something.""" resp = self.get("/sentiment?text=This is bad.&text=This is good.", timeout=60) self.assertEqual(resp.json(), ["NEGATIVE", "POSITIVE"]) - def test_train(self): + def test_train_sentiment(self): """Train with some vague sentences.""" warnings.warn("This test takes a LONG time. 
Leave while you can.") df = pd.read_json("https://bit.ly/3NesHFs") resp = self.get( "/sentiment?_action=train&target_col=label", - method='post', + method="post", + data=df.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + timeout=300, + ) + self.assertGreaterEqual(resp.json()["score"], 0.9) + + def test_default_ner(self): + """Ensure that the default model predicts something.""" + resp = self.get("/ner?text=Narendra Modi is the PM of India.", timeout=300) + labels = [c["labels"] for c in resp.json()] + ents = [[(r["word"], r["entity_group"]) for r in label] for label in labels] + self.assertListEqual(ents, [[("Narendra Modi", "PER"), ("India", "LOC")]]) + + resp = self.get( + "/ner?text=Narendra Modi is the PM of India.&text=Joe Biden is POTUS.", + timeout=300, + ) + labels = [c["labels"] for c in resp.json()] + ents = [[(r["word"], r["entity_group"]) for r in label] for label in labels] + self.assertListEqual( + ents, [[("Narendra Modi", "PER"), ("India", "LOC")], [("Joe Biden", "PER")]] + ) + + def test_train_ner(self): + warnings.warn("This test takes a LONG time. Leave while you can.") + df = pd.read_json("https://bit.ly/3wZYsf5") + resp = self.get( + "/ner?_action=train&target_col=labels", + method="post", data=df.to_json(orient="records"), headers={"Content-Type": "application/json"}, timeout=300, ) - self.assertGreaterEqual(resp.json()['score'], 0.9) + # Ensure that f1, precision and recall are > 0.6 for all NEs + metrics = pd.DataFrame.from_records(resp.json()["score"]).set_index("index") + metrics = metrics.drop(["number"], axis=0).mean(axis=1) + self.assertTrue((metrics > 0.6).all()) @skipUnless(STATSMODELS_INSTALLED, "Please install statsmodels to run these tests.") From c7c7b5020b853654531c6bce0e010d405e286796 Mon Sep 17 00:00:00 2001 From: "deshpande.jaidev@gmail.com" Date: Mon, 6 Jun 2022 16:43:08 +0530 Subject: [PATCH 5/8] ENH: Labelstudio support for MLHandler template The MLHandler template now has a UI for sentiment analysis with transformers. A labelstudio interface is added for fine-tuning the model with live annotations --- gramex/apps/mlhandler/sklearn.html | 317 ++++++++++++++++++++++ gramex/apps/mlhandler/template.html | 336 ++---------------------- gramex/apps/mlhandler/transformers.html | 178 +++++++++++++ gramex/handlers/mlhandler.py | 5 +- gramex/transformers.py | 22 +- tests/test_mlhandler.py | 12 +- 6 files changed, 541 insertions(+), 329 deletions(-) create mode 100644 gramex/apps/mlhandler/sklearn.html create mode 100644 gramex/apps/mlhandler/transformers.html diff --git a/gramex/apps/mlhandler/sklearn.html b/gramex/apps/mlhandler/sklearn.html new file mode 100644 index 000000000..cd885e05a --- /dev/null +++ b/gramex/apps/mlhandler/sklearn.html @@ -0,0 +1,317 @@ +{% set base = '.' 
%} +{% set columns = data.columns.tolist() %} +{% set CLASSIFICTION_MODELS = [ + 'LogisticRegression', + 'BernoulliNB', + 'Perceptron', + 'PassiveAggressiveClassifier', + 'SVC', + 'NuSVC', + 'LinearSVC', + 'KNeighborsClassifier', + 'GaussianNB', + 'DecisionTreeClassifier', + 'RandomForestClassifier', + 'MLPClassifier'] %} +{% set REGRESSION_MODELS = [ + 'LinearRegression', + 'PassiveAggressiveRegressor', + 'SVR', + 'NuSVR', + 'LinearSVR', + 'KNeighborsRegressor', + 'DecisionTreeRegressor', + 'RandomForestRegressor', + 'MLPRegressor'] %} +{% set tcol = handler.store.load('target_col', False) %} +{% set CLASSIFICTION_METRICS = { + 'Accuracy': 'accuracy', + 'Balanced Accuracy': 'balanced_accuracy', + 'ROC AUC': 'roc_auc', + 'F1 Score': 'f1_weighted' +}%} +{% set REGRESSION_METRICS = { + 'R2': 'r2', + 'Explained Variance': 'explained_variance', + 'Max Error': 'max_error', + 'Negative Mean Absolute Error': 'neg_mean_absolute_error', + 'Negative Mean Squared Error': 'neg_mean_squared_error', + 'Negative Root Mean Squared Error': 'neg_root_mean_squared_error' +}%} +
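Aside: every value in the metric maps above is a standard scikit-learn scorer name, so the selected option can be handed straight to sklearn. A minimal sketch, assuming nothing beyond stock scikit-learn:

    # Sketch only, not part of the patch: the metric strings above resolve
    # through sklearn's scorer registry, e.g. the "Balanced Accuracy" option.
    from sklearn.metrics import get_scorer

    scorer = get_scorer("balanced_accuracy")
    # scorer(fitted_estimator, X, y) then computes the chosen metric.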
[sklearn.html body: HTML markup lost in extraction; only bare +/- diff markers and text nodes survive ("Train the Model", "Results", "Your model scored", "Make Predictions"). The file lays out the training form with model-class and metric dropdowns built from the lists above, a score readout, and a prediction section.]
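For context, the form above drives the same JSON API that the test suite exercises. A hedged sketch of a prediction round-trip; the endpoint name, port and feature columns are assumptions, while the records-oriented payload and headers follow tests/test_mlhandler.py:

    # Sketch only: "/mymodel", the port and the columns are assumptions; the
    # JSON conventions mirror the POST requests in tests/test_mlhandler.py.
    import pandas as pd
    import requests

    rows = pd.DataFrame([{"sepal_length": 5.1, "sepal_width": 3.5}])
    resp = requests.post(
        "http://localhost:9988/mymodel?_action=predict",
        data=rows.to_json(orient="records"),
        headers={"Content-Type": "application/json"},
    )
    print(resp.json())  # input rows with the prediction column appended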
+ diff --git a/gramex/apps/mlhandler/template.html b/gramex/apps/mlhandler/template.html index d2b73ae97..fa666a0f4 100644 --- a/gramex/apps/mlhandler/template.html +++ b/gramex/apps/mlhandler/template.html @@ -8,6 +8,9 @@ + + + - {% set base = '.' %} - {% set columns = data.columns.tolist() %} - {% set CLASSIFICTION_MODELS = [ - 'LogisticRegression', - 'BernoulliNB', - 'Perceptron', - 'PassiveAggressiveClassifier', - 'SVC', - 'NuSVC', - 'LinearSVC', - 'KNeighborsClassifier', - 'GaussianNB', - 'DecisionTreeClassifier', - 'RandomForestClassifier', - 'MLPClassifier'] %} - {% set REGRESSION_MODELS = [ - 'LinearRegression', - 'PassiveAggressiveRegressor', - 'SVR', - 'NuSVR', - 'LinearSVR', - 'KNeighborsRegressor', - 'DecisionTreeRegressor', - 'RandomForestRegressor', - 'MLPRegressor'] %} - {% set tcol = handler.store.load('target_col', False) %} - {% set CLASSIFICTION_METRICS = { - 'Accuracy': 'accuracy', - 'Balanced Accuracy': 'balanced_accuracy', - 'ROC AUC': 'roc_auc', - 'F1 Score': 'f1_weighted' - }%} - {% set REGRESSION_METRICS = { - 'R2': 'r2', - 'Explained Variance': 'explained_variance', - 'Max Error': 'max_error', - 'Negative Mean Absolute Error': 'neg_mean_absolute_error', - 'Negative Mean Squared Error': 'neg_mean_squared_error', - 'Negative Root Mean Squared Error': 'neg_root_mean_squared_error' - }%} -
[template.html hunk body: markup lost in extraction. The removed lines are the {% set %} blocks and training-form markup that sklearn.html now carries; a few added lines (tags lost) restructure the page around the model-specific partials below.]
+    {% set wrappercls = handler.model.__class__.__name__ %}
+    {% if wrappercls == 'SklearnModel' %}
+      {% include sklearn.html %}
+    {% elif wrappercls == 'HFTransformer' %}
+      {% include transformers.html %}
+    {% end %}
+
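The relative {% include %} names resolve because the handler also overrides Tornado's template-path lookup (see the get_template_path hunk in gramex/handlers/mlhandler.py below). A condensed sketch; the class and path here are stand-ins, not the real handler:

    # Condensed from the mlhandler.py hunk below: Tornado resolves
    # {% include %} names against RequestHandler.get_template_path(), so
    # returning the active template's directory makes the partials loadable.
    import os.path as op

    class MLHandlerSketch:  # stand-in; the real class is the Tornado MLHandler
        template = "/path/to/gramex/apps/mlhandler/template.html"  # illustrative

        def get_template_path(self):
            return op.dirname(self.template)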
+
diff --git a/gramex/apps/mlhandler/transformers.html b/gramex/apps/mlhandler/transformers.html
new file mode 100644
index 000000000..a730607f8
--- /dev/null
+++ b/gramex/apps/mlhandler/transformers.html
@@ -0,0 +1,178 @@
[transformers.html body: markup lost in extraction. The surviving text nodes ("Download", "You have annotated 0 documents") indicate the annotation UI: a prediction view, a download link, an annotation counter, and a retrain control.]
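Whatever the widgets look like, the retrain step reduces to the same POST that the test suite issues. A sketch; host and port are assumptions, the URL arguments and payload mirror test_train_sentiment in tests/test_mlhandler.py:

    # Sketch only: host/port assumed; everything else follows the tests.
    import pandas as pd
    import requests

    annotated = pd.DataFrame({
        "text": ["This is bad.", "This is good."],
        "label": ["NEGATIVE", "POSITIVE"],
    })
    resp = requests.post(
        "http://localhost:9988/sentiment?_action=train&target_col=label",
        data=annotated.to_json(orient="records"),
        headers={"Content-Type": "application/json"},
    )
    print(resp.json()["score"])  # ROC AUC reported by SentimentAnalysis.score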
+ diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index b1a9e662b..3d5e6b21d 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -119,6 +119,9 @@ def setup(cls, data=None, model={}, config_dir='', template=DEFAULT_TEMPLATE, ** **cls.store.model_kwargs() ) + def get_template_path(self): + return op.dirname(self.template) + def _parse_multipart_form_data(self): dfs = [] for _, files in self.request.files.items(): @@ -134,7 +137,7 @@ def _parse_multipart_form_data(self): return pd.concat(dfs, axis=0) def _parse_application_json(self): - return pd.read_json(self.request.body.decode('utf8')) + return pd.read_json(self.request.body) def _parse_data(self, _cache=True, append=False): header = self.request.headers.get('Content-Type', '').split(';')[0] diff --git a/gramex/transformers.py b/gramex/transformers.py index 12514dee8..4be0d449e 100644 --- a/gramex/transformers.py +++ b/gramex/transformers.py @@ -10,6 +10,7 @@ DEFAULT_MODEL = DEFAULT_TOKENIZER = "distilbert-base-uncased-finetuned-sst-2-english" +_CACHE = {} def load_pretrained(klass, path, default, **kwargs): @@ -22,7 +23,11 @@ def load_pretrained(klass, path, default, **kwargs): model = cache.open(default, klass.from_pretrained, **kwargs) else: app_log.info(f"{path} not found on disk; loading default...") - model = klass.from_pretrained(default, **kwargs) + key = klass.__name__ + default + if key in _CACHE: + model = _CACHE[key] + else: + model = _CACHE[key] = klass.from_pretrained(default, **kwargs) return model @@ -44,6 +49,10 @@ def __init__(self, model=DEFAULT_MODEL, tokenizer=DEFAULT_TOKENIZER, **kwargs): class SentimentAnalysis(BaseTransformer): task = "sentiment-analysis" + @property + def labels(self): + return self.model.config.label2id.keys() + def fit(self, text, labels, model_path, **kwargs): if pd.api.types.is_object_dtype(labels): labels = labels.map(self.model.config.label2id.get) @@ -69,10 +78,15 @@ def fit(self, text, labels, model_path, **kwargs): def predict(self, text, **kwargs): text = text.tolist() predictions = self.pipeline(text) - return [k["label"] for k in predictions] + return [{"text": t, "label": p["label"]} for t, p in zip(text, predictions)] def score(self, X, y_true, **kwargs): y_true = [self.model.config.label2id[x] for x in y_true] - y_pred = self.predict(X.squeeze("columns")) + y_pred = [p["label"] for p in self.predict(X.squeeze("columns"))] y_pred = [self.model.config.label2id[x] for x in y_pred] - return roc_auc_score(y_true, y_pred) + try: + score = roc_auc_score(y_true, y_pred) + # Can't find roc_auc_scores for single samples, or when only one class is present. 
+ except ValueError: + score = 0 + return score diff --git a/tests/test_mlhandler.py b/tests/test_mlhandler.py index 3851f4deb..edf853a49 100644 --- a/tests/test_mlhandler.py +++ b/tests/test_mlhandler.py @@ -55,7 +55,13 @@ def tearDownClass(cls): def test_blank_predictions(self): """Ensure that the default model predicts something.""" resp = self.get("/sentiment?text=This is bad.&text=This is good.", timeout=60) - self.assertEqual(resp.json(), ["NEGATIVE", "POSITIVE"]) + self.assertEqual( + resp.json(), + [ + {"text": "This is bad.", "label": "NEGATIVE"}, + {"text": "This is good.", "label": "POSITIVE"}, + ], + ) def test_train(self): """Train with some vague sentences.""" @@ -63,12 +69,12 @@ def test_train(self): df = pd.read_json("https://bit.ly/3NesHFs") resp = self.get( "/sentiment?_action=train&target_col=label", - method='post', + method="post", data=df.to_json(orient="records"), headers={"Content-Type": "application/json"}, timeout=300, ) - self.assertGreaterEqual(resp.json()['score'], 0.9) + self.assertGreaterEqual(resp.json()["score"], 0.9) @skipUnless(STATSMODELS_INSTALLED, "Please install statsmodels to run these tests.") From 2b751634ae60119651936f5e8104cb7be0641202 Mon Sep 17 00:00:00 2001 From: "deshpande.jaidev@gmail.com" Date: Tue, 7 Jun 2022 13:54:43 +0530 Subject: [PATCH 6/8] ENH: MLHandler supports FBProphet for forecasting "Prophet" is available as a valid model class in MLHandler now. --- gramex/handlers/mlhandler.py | 2 +- gramex/ml_api.py | 3 +- gramex/{sm_api.py => timeseries.py} | 56 ++++++++++++++++++++++++-- setup.cfg | 2 +- tests/gramex.yaml | 8 ++++ tests/test_mlhandler.py | 62 ++++++++++++++++++++++++++++- 6 files changed, 125 insertions(+), 8 deletions(-) rename gramex/{sm_api.py => timeseries.py} (72%) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index 0240e72c3..c7f629f20 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -208,7 +208,7 @@ def _predict(self, data=None, score_col=''): # Set data in the same order as the transformer requests try: tcol = self.store.load('target_col', '_prediction') - data = self.model.predict(data, target_col=tcol) + data = self.model.predict(data, target_col=tcol, **self.args) except Exception as exc: app_log.exception(exc) return data diff --git a/gramex/ml_api.py b/gramex/ml_api.py index 6493e9d28..442414fba 100644 --- a/gramex/ml_api.py +++ b/gramex/ml_api.py @@ -41,10 +41,11 @@ "sklearn.decomposition", "gramex.ml", ], - "gramex.sm_api.StatsModel": [ + "gramex.timeseries.StatsModel": [ "statsmodels.tsa.api", "statsmodels.tsa.statespace.sarimax", ], + "gramex.timeseries.Prophet": ["prophet"], "gramex.ml_api.HFTransformer": ["gramex.transformers"], } diff --git a/gramex/sm_api.py b/gramex/timeseries.py similarity index 72% rename from gramex/sm_api.py rename to gramex/timeseries.py index d838210f5..a472fa019 100644 --- a/gramex/sm_api.py +++ b/gramex/timeseries.py @@ -1,6 +1,7 @@ import pandas as pd import numpy as np import joblib +from typing import Union from gramex.config import app_log from gramex import cache from statsmodels import api as sm @@ -10,7 +11,6 @@ class StatsModel(AbstractModel): - @classmethod def from_disk(cls, path, **kwargs): model = cache.open(path, joblib.load) @@ -51,8 +51,14 @@ def _get_stl(self, endog): return pd.Series(result, index=endog.index) def fit( - self, X, y=None, model_path=None, name=None, index_col=None, target_col=None, - **kwargs + self, + X, + y=None, + model_path=None, + name=None, + index_col=None, + target_col=None, + 
**kwargs, ): """Only a dataframe is accepted. Index and target columns are both expected to be in it.""" params = self.params.copy() @@ -106,3 +112,47 @@ def get_attributes(self): if not result: return {} return result.summary().as_html() + + +class Prophet(StatsModel): + def fit( + self, + X: Union[pd.DataFrame, np.ndarray], + y: Union[pd.Series, np.ndarray], + model_path: str = "", + name: str = "", + **kwargs, + ): + X["y"] = y + self.model = self.mclass.fit(X) + from prophet.serialize import model_to_json + + with open(model_path, "w") as fout: + fout.write(model_to_json(self.model)) + score = self.score(X[["ds"]], y) + return score + + @classmethod + def from_disk(cls, path, **kwargs): + from prophet.serialize import model_from_json + + with open(path, "r") as fin: + model = model_from_json(fin.read()) + return cls(model, params={}) + + def score(self, X, y_true, **kwargs): + return mean_absolute_error(y_true, self.model.predict(X)["yhat"]) + + def predict( + self, + X: Union[pd.DataFrame, np.ndarray] = None, + n_periods=None, + include_history=False, + **kwargs, + ): + if n_periods is not None: + future = self.mclass.make_future_dataframe( + periods=int(n_periods), include_history=include_history + ) + return self.mclass.predict(future) + return self.mclass.predict(X) diff --git a/setup.cfg b/setup.cfg index de2ecab31..189297afe 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,7 +19,7 @@ per-file-ignores = testlib/test_scale.py:E912 ; ML libraries use capital "X" as a function argument or a variable. That's OK gramex/ml_api.py:N803,N806 - gramex/sm_api.py:N803,N806 + gramex/timeseries.py:N803,N806 [nosetests] diff --git a/tests/gramex.yaml b/tests/gramex.yaml index 5eeba043f..308f63f38 100644 --- a/tests/gramex.yaml +++ b/tests/gramex.yaml @@ -1287,6 +1287,14 @@ url: class: NER xsrf_cookies: false + mlhandler/prophet: + pattern: /prophet + handler: MLHandler + kwargs: + model: + class: Prophet + xsrf_cookies: false + capture: pattern: /capture handler: CaptureHandler diff --git a/tests/test_mlhandler.py b/tests/test_mlhandler.py index 5716142e1..f67352ee7 100644 --- a/tests/test_mlhandler.py +++ b/tests/test_mlhandler.py @@ -21,24 +21,82 @@ from . 
import TestGramex, folder, tempfiles +STATSMODELS_INSTALLED = PROPHET_INSTALLED = TRANSFORMERS_INSTALLED = True + try: from statsmodels.datasets.interest_inflation import load as infl_load - STATSMODELS_INSTALLED = True except ImportError: STATSMODELS_INSTALLED = False try: logging.getLogger("tensorflow").disabled = True import transformers as trf # NOQA: F401 - TRANSFORMERS_INSTALLED = True except ImportError: TRANSFORMERS_INSTALLED = False +try: + import prophet # NOQA: F401 +except ImportError: + PROPHET_INSTALLED = False + op = os.path +@skipUnless(PROPHET_INSTALLED, "Please install Prophet to run these tests.") +class TestProphet(TestGramex): + @classmethod + def setUpClass(cls): + df = pd.read_csv( + "https://bit.ly/39d7Y6r", index_col="ds", parse_dates=["ds"] + ) # Peyton Manning dataset + train, test = df.loc[:"2014"], df.loc["2015":] + train, test = train.reset_index(), test.reset_index() + train["ds"] = train["ds"].astype(str) + test["ds"] = test["ds"].astype(str) + cls.train, cls.test = train, test + + @classmethod + def tearDownClass(cls): + + path = op.join( + gramex.config.variables["GRAMEXDATA"], + "apps", + "mlhandler", + "mlhandler-prophet", + ) + if op.isdir(path): + shutil.rmtree(path) + + def setUp(self): + resp = self.get( + "/prophet?_action=train&target_col=y", + method="post", + data=self.train.to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) + self.assertTrue(resp.json()["score"] < 0.4) + + def test_default(self): + + # Get predictions + resp = self.get( + "/prophet?_action=predict", + method="post", + data=self.test[["ds"]].to_json(orient="records"), + headers={"Content-Type": "application/json"}, + ) + yhat = pd.DataFrame.from_records(resp.json())["yhat"] + self.assertTrue(mean_absolute_error(self.test["y"], yhat) < 0.5) + + def test_forecast(self): + n_periods = self.test.shape[0] + resp = self.get(f"/prophet?_action=predict&n_periods={n_periods}") + yhat = pd.DataFrame.from_records(resp.json())["yhat"] + self.assertTrue(mean_absolute_error(self.test["y"], yhat) < 0.5) + + @skipUnless(TRANSFORMERS_INSTALLED, "Please install transformers to run these tests.") class TestTransformers(TestGramex): @classmethod From 97903b80b6f57ddf491b4bcf8b2c5ec7af833d3e Mon Sep 17 00:00:00 2001 From: "deshpande.jaidev@gmail.com" Date: Tue, 7 Jun 2022 14:26:05 +0530 Subject: [PATCH 7/8] BUG: Fix deserialization for Prophet models --- gramex/handlers/mlhandler.py | 7 ++++++- gramex/timeseries.py | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index c7f629f20..cff2f37ba 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -100,7 +100,12 @@ def setup(cls, data=None, model={}, config_dir='', template=DEFAULT_TEMPLATE, ** mclass, wrapper = ml.search_modelclass(mclass) cls.model = locate(wrapper).from_disk(cls.store.model_path, mclass) else: - cls.model = get_model(cls.store.model_path, {}) + try: + cls.model = get_model(cls.store.model_path, {}) + except Exception as err: + app_log.warning(err) + mclass, wrapper = ml.search_modelclass(mclass) + cls.model = locate(wrapper).from_disk(cls.store.model_path, mclass) elif data is not None: data = cls._filtercols(data) data = cls._filterrows(data) diff --git a/gramex/timeseries.py b/gramex/timeseries.py index a472fa019..e7e0b8abd 100644 --- a/gramex/timeseries.py +++ b/gramex/timeseries.py @@ -133,7 +133,7 @@ def fit( return score @classmethod - def from_disk(cls, path, **kwargs): + def from_disk(cls, 
path, *args, **kwargs): from prophet.serialize import model_from_json with open(path, "r") as fin: @@ -141,7 +141,7 @@ def from_disk(cls, path, **kwargs): return cls(model, params={}) def score(self, X, y_true, **kwargs): - return mean_absolute_error(y_true, self.model.predict(X)["yhat"]) + return mean_absolute_error(y_true, self.mclass.predict(X)["yhat"]) def predict( self, From 3fe8407f9a3f74604bb2e18be8e308314fa145b5 Mon Sep 17 00:00:00 2001 From: "deshpande.jaidev@gmail.com" Date: Fri, 10 Jun 2022 09:50:00 +0530 Subject: [PATCH 8/8] Revert "ENH: Labelstudio support for MLHandler template" This reverts commit c7c7b5020b853654531c6bce0e010d405e286796. --- gramex/apps/mlhandler/sklearn.html | 317 ---------------------- gramex/apps/mlhandler/template.html | 336 ++++++++++++++++++++++-- gramex/apps/mlhandler/transformers.html | 178 ------------- gramex/handlers/mlhandler.py | 5 +- gramex/transformers.py | 14 +- tests/test_mlhandler.py | 12 +- 6 files changed, 328 insertions(+), 534 deletions(-) delete mode 100644 gramex/apps/mlhandler/sklearn.html delete mode 100644 gramex/apps/mlhandler/transformers.html diff --git a/gramex/apps/mlhandler/sklearn.html b/gramex/apps/mlhandler/sklearn.html deleted file mode 100644 index cd885e05a..000000000 --- a/gramex/apps/mlhandler/sklearn.html +++ /dev/null @@ -1,317 +0,0 @@ -{% set base = '.' %} -{% set columns = data.columns.tolist() %} -{% set CLASSIFICTION_MODELS = [ - 'LogisticRegression', - 'BernoulliNB', - 'Perceptron', - 'PassiveAggressiveClassifier', - 'SVC', - 'NuSVC', - 'LinearSVC', - 'KNeighborsClassifier', - 'GaussianNB', - 'DecisionTreeClassifier', - 'RandomForestClassifier', - 'MLPClassifier'] %} -{% set REGRESSION_MODELS = [ - 'LinearRegression', - 'PassiveAggressiveRegressor', - 'SVR', - 'NuSVR', - 'LinearSVR', - 'KNeighborsRegressor', - 'DecisionTreeRegressor', - 'RandomForestRegressor', - 'MLPRegressor'] %} -{% set tcol = handler.store.load('target_col', False) %} -{% set CLASSIFICTION_METRICS = { - 'Accuracy': 'accuracy', - 'Balanced Accuracy': 'balanced_accuracy', - 'ROC AUC': 'roc_auc', - 'F1 Score': 'f1_weighted' -}%} -{% set REGRESSION_METRICS = { - 'R2': 'r2', - 'Explained Variance': 'explained_variance', - 'Max Error': 'max_error', - 'Negative Mean Absolute Error': 'neg_mean_absolute_error', - 'Negative Mean Squared Error': 'neg_mean_squared_error', - 'Negative Root Mean Squared Error': 'neg_root_mean_squared_error' -}%} -
[deleted sklearn.html body: markup lost in extraction; the training-form markup added in PATCH 5, removed line by line.]
-
diff --git a/gramex/apps/mlhandler/template.html b/gramex/apps/mlhandler/template.html
index fa666a0f4..d2b73ae97 100644
--- a/gramex/apps/mlhandler/template.html
+++ b/gramex/apps/mlhandler/template.html
[revert hunk bodies: markup lost in extraction; the head lines added in PATCH 5 are removed and the {% set %} model/metric blocks and training-form markup are restored to template.html verbatim (see the sklearn.html note above)]
@@ -59,17 +263,119 @@
-    {% set wrappercls = handler.model.__class__.__name__ %}
-    {% if wrappercls == 'SklearnModel' %}
-      {% include sklearn.html %}
-    {% elif wrappercls == 'HFTransformer' %}
-      {% include transformers.html %}
-    {% end %}
-
+
diff --git a/gramex/apps/mlhandler/transformers.html b/gramex/apps/mlhandler/transformers.html
deleted file mode 100644
index a730607f8..000000000
--- a/gramex/apps/mlhandler/transformers.html
+++ /dev/null
@@ -1,178 +0,0 @@
[deleted transformers.html body: markup lost in extraction; the annotation UI added in PATCH 5, removed line by line.]
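With the revert applied, /sentiment once again returns a bare list of labels, as the restored test below asserts. A round-trip sketch; host and port are assumptions:

    # Sketch of the restored behaviour; repeated "text" parameters follow
    # the restored test_default_sentiment below.
    import requests

    resp = requests.get(
        "http://localhost:9988/sentiment",
        params=[("text", "This is bad."), ("text", "This is good.")],
    )
    assert resp.json() == ["NEGATIVE", "POSITIVE"]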
- diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index cff2f37ba..12098e61a 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -124,9 +124,6 @@ def setup(cls, data=None, model={}, config_dir='', template=DEFAULT_TEMPLATE, ** **cls.store.model_kwargs() ) - def get_template_path(self): - return op.dirname(self.template) - def _parse_multipart_form_data(self): dfs = [] for _, files in self.request.files.items(): @@ -142,7 +139,7 @@ def _parse_multipart_form_data(self): return pd.concat(dfs, axis=0) def _parse_application_json(self): - return pd.read_json(self.request.body) + return pd.read_json(self.request.body.decode('utf8')) def _parse_data(self, _cache=True, append=False): header = self.request.headers.get('Content-Type', '').split(';')[0] diff --git a/gramex/transformers.py b/gramex/transformers.py index 968c2522f..007513059 100644 --- a/gramex/transformers.py +++ b/gramex/transformers.py @@ -114,11 +114,7 @@ def load_pretrained(klass, path, default, **kwargs): model = cache.open(default, klass.from_pretrained, **kwargs) else: app_log.info(f"{path} not found on disk; loading default...") - key = klass.__name__ + default - if key in _CACHE: - model = _CACHE[key] - else: - model = _CACHE[key] = klass.from_pretrained(default, **kwargs) + model = klass.from_pretrained(default, **kwargs) return model @@ -159,10 +155,6 @@ class SentimentAnalysis(BaseTransformer): ) = "distilbert-base-uncased-finetuned-sst-2-english" AUTO_CLASS = trf.AutoModelForSequenceClassification - @property - def labels(self): - return self.model.config.label2id.keys() - def fit(self, text, labels, model_path, **kwargs): if pd.api.types.is_object_dtype(labels): labels = labels.map(self.model.config.label2id.get) @@ -181,11 +173,11 @@ def fit(self, text, labels, model_path, **kwargs): def predict(self, text, **kwargs): text = text.tolist() predictions = self.pipeline(text) - return [{"text": t, "label": p["label"]} for t, p in zip(text, predictions)] + return [k["label"] for k in predictions] def score(self, X, y_true, **kwargs): y_true = [self.model.config.label2id[x] for x in y_true] - y_pred = [p["label"] for p in self.predict(X.squeeze("columns"))] + y_pred = self.predict(X.squeeze("columns")) y_pred = [self.model.config.label2id[x] for x in y_pred] try: score = roc_auc_score(y_true, y_pred) diff --git a/tests/test_mlhandler.py b/tests/test_mlhandler.py index f67352ee7..6e5016702 100644 --- a/tests/test_mlhandler.py +++ b/tests/test_mlhandler.py @@ -113,13 +113,7 @@ def tearDownClass(cls): def test_default_sentiment(self): """Ensure that the default model predicts something.""" resp = self.get("/sentiment?text=This is bad.&text=This is good.", timeout=60) - self.assertEqual( - resp.json(), - [ - {"text": "This is bad.", "label": "NEGATIVE"}, - {"text": "This is good.", "label": "POSITIVE"}, - ], - ) + self.assertEqual(resp.json(), ["NEGATIVE", "POSITIVE"]) def test_train_sentiment(self): """Train with some vague sentences.""" @@ -127,12 +121,12 @@ def test_train_sentiment(self): df = pd.read_json("https://bit.ly/3NesHFs") resp = self.get( "/sentiment?_action=train&target_col=label", - method="post", + method='post', data=df.to_json(orient="records"), headers={"Content-Type": "application/json"}, timeout=300, ) - self.assertGreaterEqual(resp.json()["score"], 0.9) + self.assertGreaterEqual(resp.json()['score'], 0.9) def test_default_ner(self): """Ensure that the default model predicts something."""