diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000000..528f30c71c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.dvc/plots/confusion.json b/.dvc/plots/confusion.json new file mode 100644 index 0000000000..af1b48d031 --- /dev/null +++ b/.dvc/plots/confusion.json @@ -0,0 +1,107 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "facet": { + "field": "rev", + "type": "nominal" + }, + "spec": { + "transform": [ + { + "aggregate": [ + { + "op": "count", + "as": "xy_count" + } + ], + "groupby": [ + "", + "" + ] + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "joinaggregate": [ + { + "op": "max", + "field": "xy_count", + "as": "max_count" + } + ], + "groupby": [] + }, + { + "calculate": "datum.xy_count / datum.max_count", + "as": "percent_of_max" + } + ], + "encoding": { + "x": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + }, + "y": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + } + }, + "layer": [ + { + "mark": "rect", + "width": 300, + "height": 300, + "encoding": { + "color": { + "field": "xy_count", + "type": "quantitative", + "title": "", + "scale": { + "domainMin": 0, + "nice": true + } + } + } + }, + { + "mark": "text", + "encoding": { + "text": { + "field": "xy_count", + "type": "quantitative" + }, + "color": { + "condition": { + "test": "datum.percent_of_max > 0.5", + "value": "white" + }, + "value": "black" + } + } + } + ] + } +} diff --git a/.dvc/plots/confusion_normalized.json b/.dvc/plots/confusion_normalized.json new file mode 100644 index 0000000000..1d38849f48 --- /dev/null +++ b/.dvc/plots/confusion_normalized.json @@ -0,0 +1,112 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "facet": { + "field": "rev", + "type": "nominal" + }, + "spec": { + "transform": [ + { + "aggregate": [ + { + "op": "count", + "as": "xy_count" + } + ], + "groupby": [ + "", + "" + ] + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "joinaggregate": [ + { + "op": "sum", + "field": "xy_count", + "as": "sum_y" + } + ], + "groupby": [ + "" + ] + }, + { + "calculate": "datum.xy_count / datum.sum_y", + "as": "percent_of_y" + } + ], + "encoding": { + "x": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + }, + "y": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + } + }, + "layer": [ + { + "mark": "rect", + "width": 300, + "height": 300, + "encoding": { + "color": { + "field": "percent_of_y", + "type": "quantitative", + "title": "", + "scale": { + "domain": [ + 0, + 1 + ] + } + } + } + }, + { + "mark": "text", + "encoding": { + "text": { + "field": "percent_of_y", + "type": "quantitative", + "format": ".2f" + }, + "color": { + "condition": { + "test": "datum.percent_of_y > 0.5", + "value": "white" + }, + "value": "black" + } + } + } + ] + } +} diff --git a/.dvc/plots/default.json b/.dvc/plots/default.json new file mode 100644 index 0000000000..9cf71ce0a2 --- /dev/null +++ b/.dvc/plots/default.json @@ -0,0 +1,31 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "width": 300, + "height": 300, + "mark": { + "type": "line" + }, + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + } +} diff --git a/.dvc/plots/linear.json b/.dvc/plots/linear.json new file mode 100644 index 0000000000..65549f9e01 --- /dev/null +++ b/.dvc/plots/linear.json @@ -0,0 +1,116 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "width": 300, + "height": 300, + "layer": [ + { + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + }, + "layer": [ + { + "mark": "line" + }, + { + "selection": { + "label": { + "type": "single", + "nearest": true, + "on": "mouseover", + "encodings": [ + "x" + ], + "empty": "none", + "clear": "mouseout" + } + }, + "mark": "point", + "encoding": { + "opacity": { + "condition": { + "selection": "label", + "value": 1 + }, + "value": 0 + } + } + } + ] + }, + { + "transform": [ + { + "filter": { + "selection": "label" + } + } + ], + "layer": [ + { + "mark": { + "type": "rule", + "color": "gray" + }, + "encoding": { + "x": { + "field": "", + "type": "quantitative" + } + } + }, + { + "encoding": { + "text": { + "type": "quantitative", + "field": "" + }, + "x": { + "field": "", + "type": "quantitative" + }, + "y": { + "field": "", + "type": "quantitative" + } + }, + "layer": [ + { + "mark": { + "type": "text", + "align": "left", + "dx": 5, + "dy": -5 + }, + "encoding": { + "color": { + "type": "nominal", + "field": "rev" + } + } + } + ] + } + ] + } + ] +} diff --git a/.dvc/plots/scatter.json b/.dvc/plots/scatter.json new file mode 100644 index 0000000000..9af9304c64 --- /dev/null +++ b/.dvc/plots/scatter.json @@ -0,0 +1,104 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "width": 300, + "height": 300, + "layer": [ + { + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + }, + "layer": [ + { + "mark": "point" + }, + { + "selection": { + "label": { + "type": "single", + "nearest": true, + "on": "mouseover", + "encodings": [ + "x" + ], + "empty": "none", + "clear": "mouseout" + } + }, + "mark": "point", + "encoding": { + "opacity": { + "condition": { + "selection": "label", + "value": 1 + }, + "value": 0 + } + } + } + ] + }, + { + "transform": [ + { + "filter": { + "selection": "label" + } + } + ], + "layer": [ + { + "encoding": { + "text": { + "type": "quantitative", + "field": "" + }, + "x": { + "field": "", + "type": "quantitative" + }, + "y": { + "field": "", + "type": "quantitative" + } + }, + "layer": [ + { + "mark": { + "type": "text", + "align": "left", + "dx": 5, + "dy": -5 + }, + "encoding": { + "color": { + "type": "nominal", + "field": "rev" + } + } + } + ] + } + ] + } + ] +} diff --git a/.dvc/plots/smooth.json b/.dvc/plots/smooth.json new file mode 100644 index 0000000000..d497ce75e9 --- /dev/null +++ b/.dvc/plots/smooth.json @@ -0,0 +1,39 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "mark": { + "type": "line" + }, + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + }, + "transform": [ + { + "loess": "", + "on": "", + "groupby": [ + "rev" + ], + "bandwidth": 0.3 + } + ] +} diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000000..5197305523 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/.gitignore b/.gitignore index 9b563d6636..3f3a481a34 100644 --- a/.gitignore +++ b/.gitignore @@ -138,3 +138,4 @@ dmypy.json # Pyre type checker .pyre/ /poetry.lock +/en_core_sci_scibert-0.4.0.tar.gz diff --git a/cv_py/__init__.py b/cv_py/__init__.py index d8ea7044a8..b9aba0e337 100644 --- a/cv_py/__init__.py +++ b/cv_py/__init__.py @@ -1,5 +1,18 @@ +import scispacy + __title__ = "cv-py" + +__scispacy_version__ = scispacy.__version__ + __compatible__ = { "cord19_cdcs": "~0.2.3", + "en_core_sci_sm": __scispacy_version__, + "en_core_sci_md": __scispacy_version__, + "en_core_sci_lg": __scispacy_version__, + "en_ner_craft_md": __scispacy_version__, + "en_ner_jnlpba_md": __scispacy_version__, + "en_ner_bc5cdr_md": __scispacy_version__, + "en_ner_bionlp13cg_md": __scispacy_version__, + "en_core_sci_scibert": __scispacy_version__ } __release__ = True diff --git a/cv_py/resources/datapackage.py b/cv_py/resources/datapackage.py index d963b65456..c5885d2863 100644 --- a/cv_py/resources/datapackage.py +++ b/cv_py/resources/datapackage.py @@ -5,20 +5,60 @@ import os import subprocess import sys -from cv_py import __compatible__ +from cv_py import __compatible__, __scispacy_version__ import argparse -import re from pathlib import Path import importlib import pkg_resources import dask.dataframe as dd import requests import semantic_version as sv - +import yaml +import re __all__ = ["load"] +def check_version(dvc_file): + with open(dvc_file, 'r') as yaml_dvc: + try: + dvc_dict = yaml.safe_load(yaml_dvc) + except yaml.YAMLError as exc: + print(exc) + + source_url = dvc_dict["deps"][0]["path"] + version_regex = re.compile(r'[0-9]+\.[0-9]+\.[0-9]+') + match = version_regex.search(source_url) + + return match.group() == __scispacy_version__ + + +def check_datapackage(datapackage): + """check whether the zipped data package already exists""" + path = f"{datapackage}-{__scispacy_version__}.tar.gz.dvc" + is_file = os.path.isfile(path) + version_is_correct = check_version(path) + if is_file and version_is_correct == __scispacy_version__: + print("File is already up-to-date!") + exit(0) + elif is_file and not version_is_correct: + return "update" + else: + return "import-url" + + +def check_dvc(): + """check whether or not dvc has been initiated in the current working directory (whether the .dvc config folder exists, more precisely)""" + path = './.dvc' + isdir = os.path.isdir(path) + return isdir + + +def init_dvc(): + cmd = ["dvc", "init"] + return subprocess.call(cmd, env=os.environ.copy()) + + def get_release_versions(proj_str): r = requests.get(f"https://api.github.com/repos/{proj_str}/tags").json() versions = [sv.Version(i["name"][1:]) for i in r if sv.validate(i["name"][1:])] @@ -40,33 +80,26 @@ def get_filename(datapackage="cord19_cdcs"): f"https://github.com/{repo}/releases/download/v{v}/cord19-cdcs-{v}.tar.gz" ) return fname - elif datapackage in [ # Sci-spaCy - "en_core_sci_sm", - "en_core_sci_md", - "en_core_sci_lg", - "en_ner_craft_md", - "en_ner_jnlpba_md", - "en_ner_bc5cdr_md", - "en_ner_bionlp13cg_md", - ]: + else: fname = ( - f"https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/{datapackage}-0.2.4.tar.gz" + f"https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v{__scispacy_version__}/{datapackage}-{__scispacy_version__}.tar.gz" ) return fname - else: # TODO other resources sources? - raise NotImplementedError + # TODO other resources sources? -def download_datapackage(datapackage, user_pip_args=None): +def download_datapackage(datapackage): download_url = get_filename(datapackage=datapackage) print(download_url) - pip_args = ["--no-cache-dir", "--upgrade"] - if user_pip_args: - pip_args.extend(user_pip_args) - cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url] + if not check_dvc(): + init_dvc() + + dvc_command = check_datapackage() + cmd = ["dvc", dvc_command, datapackage] return subprocess.call(cmd, env=os.environ.copy()) + def is_package(name): """Check if string maps to a package installed via pip. name (unicode): Name of package. @@ -92,7 +125,6 @@ def get_package_path(name): return Path(pkg.__file__).parent - def download_cli(): parser = argparse.ArgumentParser( @@ -101,12 +133,7 @@ def download_cli(): parser.add_argument( "--resource", "-r", type=str, help="Which resource to install?", ) - parser.add_argument( - "--pip-arg", - "-p", - action="append", - help="Argument to pass to pip (in addition to `--no-cache-dir`)", - ) + parser.add_argument( "--overwrite", dest="overwrite", @@ -124,7 +151,6 @@ def download_cli(): download_datapackage(args.resource, user_pip_args=args.pip_arg) - def load(datapackage="cord19_cdcs", fmt="parquet"): """Return a python container (e.g. Dataframe) stored in `datapackage`. """ diff --git a/pyproject.toml b/pyproject.toml index 83b43f99d3..6d4e3f9adc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,10 +20,12 @@ pyarrow = "^3.0.0" scikit-learn = "^0.24.1" dask = {extras = ["complete"], version = "^2.13.0"} +#necessary to download the correct scispacy language models +scispacy = "^0.4.4" + # A list of optional dependencies, choosable by module ## Spacy ecosystem textacy = { version = "^0.10.0", optional = true} -scispacy = { version = "^0.2.4", optional = true} ## flair ecosystem flair = { version = "^0.4.5", optional = true}