This repository was archived by the owner on Jun 1, 2023. It is now read-only.

Commit 5baf493

Merge branch 'master' of https://github.com/kevinlu1248/pyate

2 parents: 51843a7 + 28d6e4d

6 files changed (+20 −39 lines)

.deepsource.toml (+10 lines)

@@ -0,0 +1,10 @@
+version = 1
+
+test_patterns = ["**.py"]
+
+[[analyzers]]
+name = "python"
+enabled = true
+
+[analyzers.meta]
+runtime_version = "3.x.x"

README.md (+3 −3 lines)

@@ -20,7 +20,7 @@ pip install pyate https://github.com/explosion/spacy-models/releases/download/en
 ```
 
 ## :rocket: Quickstart
-To get started, simply call one of the implemented algorithms. According to Astrakhantsev 2016, `combo_basic` is the most precise of the five algorithms, though `basic` and `cvalue` is not too far behind (see Precision). The same study shows that PU-ATR and KeyConceptRel have higher precision than `combo_basic` but are not implemented and PU-ATR take significantly more time since it uses machine learning.
+To get started, simply call one of the implemented algorithms. According to Astrakhantsev 2016, `combo_basic` is the most precise of the five algorithms, though `basic` and `cvalues` is not too far behind (see Precision). The same study shows that PU-ATR and KeyConceptRel have higher precision than `combo_basic` but are not implemented and PU-ATR take significantly more time since it uses machine learning.
 ```python3
 from pyate import combo_basic
 
@@ -88,7 +88,7 @@ __init__(
 where `func` is essentially your term extracting algorithm that takes in a corpus (either a string or iterator of strings) and outputs a Pandas Series of term-value pairs of terms and their respective termhoods. `func` is by default `combo_basic`. `args` and `kwargs` are for you to overide default values for the function, which you can find by running `help` (might document later on).
 
 ### Summary of functions
-Each of `cvalue, basic, combo_basic, weirdness` and `term_extractor` take in a string or an iterator of strings and outputs a Pandas Series of term-value pairs, where higher values indicate higher chance of being a domain specific term. Furthermore, `weirdness` and `term_extractor` take a `general_corpus` key word argument which must be an iterator of strings which defaults to the General Corpus described below.
+Each of `cvalues, basic, combo_basic, weirdness` and `term_extractor` take in a string or an iterator of strings and outputs a Pandas Series of term-value pairs, where higher values indicate higher chance of being a domain specific term. Furthermore, `weirdness` and `term_extractor` take a `general_corpus` key word argument which must be an iterator of strings which defaults to the General Corpus described below.
 
 All functions only take the string of which you would like to extract terms from as the mandatory input (the `technical_corpus`), as well as other tweakable settings, including `general_corpus` (contrasting corpus for `weirdness` and `term_extractor`), `general_corpus_size`, `verbose` (whether to print a progress bar), `weights`, `smoothing`, `have_single_word` (whether to have a single word count as a phrase) and `threshold`. If you have not read the papers and are unfamiliar with the algorithms, I recommend just using the default settings. Again, use `help` to find the details regarding each algorithm since they are all different.
 
@@ -117,7 +117,7 @@ Here is the average precision of some of the implemented algorithms using the Av
 ## :stars: Motivation
 This project was planned to be a tool to be connected to a Google Chrome Extension that highlights and defines key terms that the reader probably does not know of. Furthermore, term extraction is an area where there is not a lot of focused research on in comparison to other areas of NLP and especially recently is not viewed to be very practical due to the more general tool of NER tagging. However, modern NER tagging usually incorporates some combination of memorized words and deep learning which are spatially and computationally heavy. Furthermore, to generalize an algorithm to recognize terms to the ever growing areas of medical and AI research, a list of memorized words will not do.
 
-Of the five implemented algorithms, none are expensive, in fact, the bottleneck of the space allocation and computation expense is from the spaCy model and spaCy POS tagging. This is because they most rely simply on POS patterns, word frequencies, and the existence of embedded term candidates. For example, the term candidate "breast cancer" implies that "malignant breast cancer" is probably not a term and simply a form of "breast cancer" that is "malignant" (implemented in C-Value).
+Of the five implemented algorithms, none are expensive, in fact, the bottleneck of the space allocation and computation expense is from the spaCy model and spaCy POS tagging. This is because they mostly rely simply on POS patterns, word frequencies, and the existence of embedded term candidates. For example, the term candidate "breast cancer" implies that "malignant breast cancer" is probably not a term and simply a form of "breast cancer" that is "malignant" (implemented in C-Value).
 
 ## :pushpin: Todo
 * Add PU-ATR algorithm since its precision is a lot higher, though more computationally expensive
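The embedded-candidate idea from the Motivation paragraph in this README hunk can be sketched in a few lines (a hypothetical illustration only: `nested_counts` is not part of pyate, and the real C-Value score also weighs candidate length and frequency):

```python
# Sketch of the embedded term candidate idea: a candidate that occurs
# mostly inside a longer candidate (e.g. "breast cancer" inside
# "malignant breast cancer") is discounted, per the C-Value discussion.
from collections import Counter

def nested_counts(candidates, documents):
    """Count total occurrences of each candidate and how many of those
    occurrences sit inside a longer candidate."""
    # For each candidate, the longer candidates that contain it.
    longer = {
        c: [d for d in candidates if c in d and c != d] for c in candidates
    }
    freq, nested = Counter(), Counter()
    for doc in documents:
        for cand in candidates:
            occurrences = doc.count(cand)
            embedded = sum(doc.count(sup) for sup in longer[cand])
            freq[cand] += occurrences
            # Each occurrence can be embedded at most once.
            nested[cand] += min(embedded, occurrences)
    return freq, nested

docs = ["breast cancer is common", "malignant breast cancer is rarer"]
freq, nested = nested_counts(
    ["breast cancer", "malignant breast cancer"], docs
)
assert freq["breast cancer"] == 2      # two occurrences in total
assert nested["breast cancer"] == 1    # one embedded in the longer term
assert nested["malignant breast cancer"] == 0
```

A candidate whose occurrences are almost all embedded is likely just a fragment of the longer term rather than a term in its own right, which is the signal C-Value exploits.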

src/pyate/cvalues.py (−1 line)

@@ -1,6 +1,5 @@
 # c_value
 
-import time
 import math
 from typing import List, Mapping
 

src/pyate/term_extraction.py (+7 −25 lines)

@@ -1,41 +1,23 @@
-# c_value
+# term_extraction.py
 
-import pickle
-import time
-import math
-from collections import Iterable
+import collections
+from collections import defaultdict
+import pkg_resources
 from multiprocessing import Pool
 from typing import Iterable, Union, Sequence, Callable
-from distutils.sysconfig import get_python_lib
 
 import spacy
 from spacy.matcher import Matcher
 from tqdm import tqdm
 import pandas as pd
-from collections import defaultdict
 import ahocorasick
 import numpy as np
-import pkg_resources
 
 start_ = 0
 tmp = 0
 doctime, matchertime = 0, 0
 Corpus = Union[str, Sequence[str]]
 
-# import glob
-# print(get_python_lib())
-# print(glob.glob("/home/kevin/PycharmProjects/pyate/venv/lib/python3.6/site-packages/*.csv"))
-
-
-def start():
-    global start_
-    start_ = time.time()
-
-
-def end():
-    global start_
-    print(time.time() - start_)
-
 
 class TermExtraction:
     nlp = spacy.load("en_core_web_sm", parser=False, entity=False)

@@ -140,7 +122,7 @@ def count_terms_from_documents(self, seperate: bool = False, verbose: bool = Fal
             self.__term_counts = pd.Series(self.count_terms_from_document(self.corpus))
             return self.__term_counts
         # elif type(self.corpus) is list or type(self.corpus) is pd.Series:
-        elif isinstance(self.corpus, Iterable):
+        elif isinstance(self.corpus, collections.Iterable):
             if seperate:
                 term_counters = []
             else:

@@ -224,10 +206,10 @@ def term_extraction_decoration(self, *args, **kwargs):
     wiki = pd.read_pickle(PATH_TO_GENERAL_DOMAIN)
     pmc = pd.read_pickle(PATH_TO_TECHNICAL_DOMAIN)
     vocab = ["Cutaneous melanoma", "cancer", "secondary clusters", "bio"]
-    start()
+    # start()
     print(
         TermExtraction(pmc[:100]).count_terms_from_documents(
             seperate=True, verbose=True
         )
     )
-    end()
+    # end()
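One detail worth flagging in the hunk above: `collections.Iterable` was deprecated in Python 3.3 and removed in Python 3.10, so on modern interpreters the check must import from `collections.abc`. A minimal sketch of the same str-vs-iterable branching (`is_corpus_iterable` is a hypothetical helper, not pyate API):

```python
# collections.Iterable no longer exists on Python 3.10+; the supported
# spelling is collections.abc.Iterable.
from collections.abc import Iterable

def is_corpus_iterable(corpus):
    # Strings are themselves iterable, so treat str as a single
    # document first, mirroring the branching in
    # count_terms_from_documents.
    return not isinstance(corpus, str) and isinstance(corpus, Iterable)

assert not is_corpus_iterable("a single technical document")
assert is_corpus_iterable(["doc one", "doc two"])
```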

src/pyate/term_extractor.py (−4 lines)

@@ -1,11 +1,7 @@
 # c_value
 
-import time
 import math
 from typing import Mapping, Sequence
-
-import spacy
-import pickle
 import pandas as pd
 import numpy as np
 
src/pyate/weirdness.py (−6 lines)

@@ -1,12 +1,6 @@
 # weirdness.py
 
-import pickle
-import time
-import math
-import json
 from typing import Mapping
-
-import spacy
 import pandas as pd
 
 from .term_extraction import TermExtraction, add_term_extraction_method, Corpus

0 commit comments