This repository was archived by the owner on Jun 1, 2023. It is now read-only.

Commit dec8e91: initial commit
Committed May 17, 2020 (0 parents)

8 files changed: +656 -0 lines
 

‎.gitignore

+1
@@ -0,0 +1 @@
__pycache__

‎__init__.py

+6
@@ -0,0 +1,6 @@
from pyate.term_extraction import TermExtraction
from pyate.basic import basic
from pyate.combo_basic import combo_basic
from pyate.cvalues import c_values
from pyate.term_extractor import term_extractor
from pyate.weirdness import weirdness
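
A quick usage sketch, assuming the six modules above resolve as a package named pyate on the import path (the sample sentence is illustrative only, and the flat intra-module imports in this commit may need adjusting before this works as an installed package):

    # hypothetical usage sketch, not part of this commit
    from pyate import combo_basic

    doc = "Cutaneous melanoma is the most common cancer of the skin."
    print(combo_basic(doc).sort_values(ascending=False).head())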

‎basic.py

+19
@@ -0,0 +1,19 @@
# basic
import numpy as np
from term_extraction import TermExtraction, add_term_extraction_method
from combo_basic import combo_basic


@add_term_extraction_method
def basic(technical_corpus, *args, **kwargs):
    # Basic is ComboBasic with the superset term switched off
    weights = np.array([1, 3.5, 0])
    return combo_basic(technical_corpus, *args, weights=weights, **kwargs)


if __name__ == "__main__":
    import pickle

    pkl = pickle.load(open("../data/pmc_testing.pkl", "rb"))
    print(len(pkl))
    corpus = pkl
    print(TermExtraction(pkl[0]).basic().sort_values(ascending=False).head(50))
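
Reading the weight vector (1, 3.5, 0) against the three columns that combo_basic.py builds (xlogx_score, times_subset, times_superset), basic scores a candidate term t roughly as

    Basic(t) = |t| * log f(t) + 3.5 * e_t

where |t| is the number of words in t, f(t) is its frequency, and e_t is the number of longer candidate terms that contain t; the times_superset column is zeroed out by the final weight.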

‎combo_basic.py

+114
@@ -0,0 +1,114 @@
# combo basic
import math
import time

import pandas as pd
import numpy as np
from term_extraction import TermExtraction, add_term_extraction_method

start_ = 0
tmp = 0


def start():
    global start_
    start_ = time.time()


def end():
    global start_
    print(time.time() - start_)


MAX_WORD_LENGTH = 6
THRESHOLD = 0


def helper_get_subsequences(s):
    # return every contiguous sub-phrase of s except s itself
    sequence = s.split()
    if len(sequence) <= 2:
        return []
    answer = []
    for left in range(len(sequence) + 1):
        for right in range(left + 1, len(sequence) + 1):
            if left == 0 and right == len(sequence):
                continue
            answer.append(" ".join(sequence[left:right]))
    return answer


@add_term_extraction_method
def combo_basic(
    technical_corpus,
    smoothing=0.01,
    verbose=False,
    have_single_word=False,
    technical_counts=None,
    weights=None,
):
    # TODO
    if technical_counts is None:
        technical_counts = (
            TermExtraction(technical_corpus)
            .count_terms_from_documents(verbose=verbose)
            .reindex()
        )

    order = sorted(
        list(technical_counts.keys()), key=TermExtraction.word_length, reverse=True
    )

    if not have_single_word:
        order = list(filter(lambda s: TermExtraction.word_length(s) > 1, order))

    technical_counts = technical_counts[order]

    df = pd.DataFrame(
        {
            "xlogx_score": technical_counts.reset_index()
            .apply(
                lambda s: math.log(TermExtraction.word_length(s["index"])) * s[0],
                axis=1,
            )
            .values,
            "times_subset": 0,
            "times_superset": 0,
        },
        index=technical_counts.index,
    )

    indices = set(technical_counts.index)

    def score_of_children(candidate):
        # NOTE: currently unused; the loop below performs the subset/superset counting
        df.at[candidate, "times_subset"] += 1
        if TermExtraction.word_length(candidate) == 1:
            return 1
        sm = 1
        for substring in helper_get_subsequences(candidate):
            if substring in indices:
                df.at[substring, "times_subset"] += 1
        return sm

    for index in technical_counts.index:
        for substring in helper_get_subsequences(index):
            if substring in indices:
                df.at[substring, "times_subset"] += 1
                df.at[index, "times_superset"] += 1

    if weights is None:
        weights = np.array([1, 0.75, 0.1])
    return df.apply(lambda s: s.values.dot(weights), axis=1)


if __name__ == "__main__":
    import pickle

    pkl = pickle.load(open("../data/pmc_testing.pkl", "rb"))
    print(len(pkl))
    corpus = pkl
    print(TermExtraction(pkl[0]).combo_basic().sort_values(ascending=False).head(50))
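
With the default weights (1, 0.75, 0.1), the dot product at the end of combo_basic corresponds to the score

    ComboBasic(t) = |t| * log f(t) + 0.75 * e_t + 0.1 * e't

where |t| is the word length of candidate t, f(t) its frequency, e_t the number of candidate terms that contain t (times_subset), and e't the number of candidate terms contained in t (times_superset).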

‎cvalues.py

+114
@@ -0,0 +1,114 @@
# c_value

import time
import math
from tqdm import tqdm
import pandas as pd
from term_extraction import TermExtraction, add_term_extraction_method

start_ = 0
tmp = 0


def start():
    global start_
    start_ = time.time()


def end():
    global start_
    print(time.time() - start_)


MAX_WORD_LENGTH = 6
THRESHOLD = 0


def helper_get_subsequences(s):
    # return every contiguous sub-phrase of s except s itself
    sequence = s.split()
    if len(sequence) <= 2:
        return []
    answer = []
    for left in range(len(sequence) + 1):
        for right in range(left + 1, len(sequence) + 1):
            if left == 0 and right == len(sequence):
                continue
            answer.append(" ".join(sequence[left:right]))
    return answer


@add_term_extraction_method
def c_values(
    technical_corpus,
    smoothing=0.01,
    verbose=False,
    have_single_word=False,
    technical_counts=None,
):
    if technical_counts is None:
        technical_counts = (
            TermExtraction(technical_corpus)
            .count_terms_from_documents(verbose=verbose)
            .reindex()
        )

    order = sorted(
        list(technical_counts.keys()), key=TermExtraction.word_length, reverse=True
    )

    if not have_single_word:
        order = list(filter(lambda s: TermExtraction.word_length(s) > 1, order))

    technical_counts = technical_counts[order]

    df = pd.DataFrame(
        {
            "frequency": technical_counts.values,
            "times_nested": technical_counts.values,
            "number_of_nested": 1,
            "has_been_evaluated": False,
        },
        index=technical_counts.index,
    )

    # print(df)
    output = []
    indices = set(df.index)

    iterator = tqdm(df.iterrows()) if verbose else df.iterrows()

    for candidate, row in iterator:
        f, t, n, h = row
        length = TermExtraction.word_length(candidate)
        if length == MAX_WORD_LENGTH:
            c_val = math.log(length + smoothing) * f
        else:
            c_val = math.log(length + smoothing) * f
            if h:
                c_val -= t / n
        if c_val >= THRESHOLD:
            output.append((candidate, c_val))
        nstart = time.time()  # TODO: optimize
        for substring in helper_get_subsequences(candidate):
            if substring in indices:
                df.loc[substring, "times_nested"] += 1
                df.loc[substring, "number_of_nested"] += f
                df.loc[substring, "has_been_evaluated"] = True
        global tmp
        tmp += time.time() - nstart

    srs = pd.Series(map(lambda s: s[1], output), index=map(lambda s: s[0], output))
    return srs.sort_values(ascending=False)


if __name__ == "__main__":
    import pickle

    pkl = pickle.load(open("../data/pmc_testing.pkl", "rb"))
    corpus = pkl
    print(list(TermExtraction(pkl[0]).c_values(verbose=True).index))
    print(pkl[0])
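
The main loop visits candidates longest-first and approximates the classic C-value measure: for a candidate t of |t| words and frequency f(t),

    C-value(t) = log(|t| + smoothing) * f(t)                          if t never occurs nested in a longer candidate
    C-value(t) = log(|t| + smoothing) * f(t) - t_nested / n_nested    otherwise

where the times_nested and number_of_nested columns accumulate, for each shorter candidate, statistics of the longer candidates that contain it; the smoothing term keeps single-word candidates from scoring exactly zero.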

‎term_extraction.py

+197
@@ -0,0 +1,197 @@
# term_extraction

import spacy
import time
from tqdm import tqdm
import pandas as pd
import numpy as np

# from pathos.multiprocessing import ProcessingPool as Pool
from multiprocessing import Pool
from spacy.matcher import Matcher
from collections import defaultdict
import ahocorasick

start_ = 0
tmp = 0
doctime, matchertime = 0, 0


def start():
    global start_
    start_ = time.time()


def end():
    global start_
    print(time.time() - start_)


class TermExtraction:
    # spaCy 2.x-style loading and Matcher callback API
    nlp = spacy.load("en_core_web_sm", parser=False, entity=False)
    matcher = Matcher(nlp.vocab)
    MAX_WORD_LENGTH = 6

    noun, adj, prep = (
        {"POS": "NOUN", "IS_PUNCT": False},
        {"POS": "ADJ", "IS_PUNCT": False},
        {"POS": "DET", "IS_PUNCT": False},  # named prep but currently matches determiners
    )

    patterns = [
        [adj],
        [{"POS": {"IN": ["ADJ", "NOUN"]}, "OP": "*", "IS_PUNCT": False}, noun],
        [
            {"POS": {"IN": ["ADJ", "NOUN"]}, "OP": "*", "IS_PUNCT": False},
            noun,
            prep,
            {"POS": {"IN": ["ADJ", "NOUN"]}, "OP": "*", "IS_PUNCT": False},
            noun,
        ],
    ]

    def __init__(self, corpus, vocab=None, patterns=patterns, do_parallelize=True):
        """
        If corpus is a string, find the vocab sequentially; if the corpus is an iterator,
        compute in parallel. If a vocab list is given, only count frequencies of terms in
        the vocab list; otherwise search using the patterns.

        TODO: do_parallelize and do_lower
        """
        self.corpus = corpus
        self.vocab = vocab
        self.patterns = patterns
        self.do_parallelize = do_parallelize

    @staticmethod
    def word_length(string):
        return string.count(" ") + 1

    @property
    def trie(self):
        if not hasattr(self, "_TermExtraction__trie"):
            self.__trie = ahocorasick.Automaton()
            for idx, key in enumerate(self.vocab):
                self.__trie.add_word(key, (idx, key))
            self.__trie.make_automaton()
        return self.__trie

    def count_terms_from_document(self, document):
        # for single documents
        term_counter = defaultdict(int)
        if self.vocab is None:

            def add_to_counter(matcher, doc, i, matches):
                match_id, start, end = matches[i]
                candidate = str(doc[start:end])
                if (
                    TermExtraction.word_length(candidate)
                    <= TermExtraction.MAX_WORD_LENGTH
                ):
                    term_counter[candidate] += 1

            for i, pattern in enumerate(self.patterns):
                TermExtraction.matcher.add("term{}".format(i), add_to_counter, pattern)

            doc = TermExtraction.nlp(document.lower(), disable=["parser", "ner"])
            matches = TermExtraction.matcher(doc)
        else:
            for end_index, (insert_order, original_value) in self.trie.iter(
                document.lower()
            ):
                term_counter[original_value] += 1
        return term_counter

    def count_terms_from_documents(self, seperate=False, verbose=False):
        if type(self.corpus) is str:
            term_counter = pd.Series(self.count_terms_from_document(self.corpus))
        elif type(self.corpus) is list or type(self.corpus) is pd.Series:
            if seperate:
                term_counters = []
            else:
                term_counter = pd.Series(dtype="int64")
            if verbose:
                pbar = tqdm(total=len(self.corpus))

            def callback(counter_list):
                if verbose:
                    pbar.update(1)
                if seperate:
                    term_counters.append(
                        (tuple(counter_list.keys()), tuple(counter_list.values()))
                    )
                else:
                    nonlocal term_counter
                    # print(tuple(counter_list.values()))
                    term_counter = term_counter.add(
                        pd.Series(
                            index=tuple(counter_list.keys()),
                            data=tuple(counter_list.values()),
                            dtype=np.int64,
                        ),
                        fill_value=0,
                    ).astype(np.int64)

            def error_callback(e):
                print(e)

            P = Pool()

            start_ = time.time()
            for document in self.corpus:
                P.apply_async(
                    self.count_terms_from_document,
                    [document],
                    callback=callback,
                    error_callback=error_callback,
                )
            P.close()
            P.join()
            print(time.time() - start_)

            P.terminate()
            if verbose:
                pbar.close()
            # print(term_counter)
        else:
            raise TypeError()

        if seperate:

            def counter_to_series(counter):
                return pd.Series(data=counter[1], index=counter[0], dtype="int8")

            return (
                pd.DataFrame(data=map(counter_to_series, term_counters))
                .fillna(0)
                .astype("int8")
                .T
            )
        else:
            return term_counter


def add_term_extraction_method(extractor):
    # attach the extractor as a TermExtraction method that receives the instance's counts
    def decorated(self, *args, **kwargs):
        return extractor(
            self.corpus,
            technical_counts=self.count_terms_from_documents(),
            *args,
            **kwargs
        )

    setattr(TermExtraction, extractor.__name__, decorated)
    return extractor


if __name__ == "__main__":
    PATH_TO_GENERAL_DOMAIN = "../data/wiki_testing.pkl"
    PATH_TO_TECHNICAL_DOMAIN = "../data/pmc_testing.pkl"
    wiki = pd.read_pickle(PATH_TO_GENERAL_DOMAIN)
    pmc = pd.read_pickle(PATH_TO_TECHNICAL_DOMAIN)
    # print(pmc, '\n', wiki)
    vocab = ["Cutaneous melanoma", "cancer", "secondary clusters", "bio"]
    start()
    print(
        TermExtraction(pmc[:100]).count_terms_from_documents(
            seperate=True, verbose=True
        )
    )
    end()
    # print(domain_pertinence(pmc[0], wiki[0]))
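
add_term_extraction_method is the glue between this module and the metric files: it attaches the decorated function to TermExtraction as a method that automatically receives the instance's term counts. A hypothetical illustration (frequency_score is made up for this sketch and is not part of the commit):

    from term_extraction import TermExtraction, add_term_extraction_method

    @add_term_extraction_method
    def frequency_score(technical_corpus, technical_counts=None, *args, **kwargs):
        # toy metric: rank candidates by raw frequency
        return technical_counts.sort_values(ascending=False)

    # after decoration it is available as a method:
    # TermExtraction("some technical text").frequency_score()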

‎term_extractor.py

+142
@@ -0,0 +1,142 @@
# term_extractor

import time
import math
import pandas as pd
import numpy as np
from term_extraction import TermExtraction, add_term_extraction_method

# from sklearn import preprocessing

start_ = 0
tmp = 0


def start():
    global start_
    start_ = time.time()


def end():
    global start_
    print(time.time() - start_)


def domain_pertinence(technical_corpus, general_corpus):
    # placeholder, see http://ceur-ws.org/Vol-1031/paper3.pdf
    return 1


def domain_consensus(technical_corpus, general_corpus):
    return 1


def lexical_cohesion(technical_corpus, general_corpus):
    return 1


@add_term_extraction_method
def term_extractor(
    technical_corpus, general_corpus, weights=None, verbose=False, technical_counts=None
):
    # reused initializations
    start()
    technical_counts_seperate = TermExtraction(
        technical_corpus
    ).count_terms_from_documents(True, verbose=verbose)
    end()

    start()
    technical_counts = technical_counts_seperate.sum(axis=1)
    XLOGX = lambda x: x * math.log(x) if x else 0

    # domain pertinence
    general_counts = TermExtraction(
        general_corpus, technical_counts.index
    ).count_terms_from_documents(verbose=verbose)
    general_counts /= general_counts.max()
    domain_pertinence = pd.DataFrame(
        data={
            "technical": technical_counts / technical_counts.sum(),
            "general": general_counts,
        }
    ).fillna(0)

    def domain_pertinence_function(s):
        tech, gen = s.iloc
        return tech / max(tech, gen)

    domain_pertinence = domain_pertinence.apply(domain_pertinence_function, axis=1)

    # domain consensus
    domain_consensus = technical_counts_seperate
    domain_consensus = domain_consensus.div(domain_consensus.sum(axis=0), axis=1)
    domain_consensus = domain_consensus.applymap(lambda x: -XLOGX(x))
    domain_consensus = domain_consensus.sum(axis=1)

    # lexical cohesion
    term_words = set(
        word for term in technical_counts_seperate.index for word in term.split()
    )
    term_counts = TermExtraction(
        technical_corpus, term_words
    ).count_terms_from_documents()
    lexical_cohesion = technical_counts

    def lexical_cohesion_function(row):
        word, freq = row.iloc
        # print(word, freq)
        return (
            TermExtraction.word_length(word)
            * XLOGX(freq)  # remove plus 1 later
            / sum(map(lambda s: term_counts.loc[s], word.split()))
        )

    lexical_cohesion = pd.Series(
        lexical_cohesion.reset_index().apply(lexical_cohesion_function, axis=1).values,
        index=lexical_cohesion.index,
    )

    # normalize each component before combining
    domain_pertinence /= domain_pertinence.max()
    domain_consensus /= domain_consensus.max()
    lexical_cohesion /= lexical_cohesion.max()

    df = pd.DataFrame(
        data={
            "domain_pertinence": domain_pertinence,
            "domain_consensus": domain_consensus,
            "lexical_cohesion": lexical_cohesion,
        }
    )

    if verbose:
        print(
            domain_pertinence.sort_values(ascending=False).head(10),
            "\n",
            domain_consensus.sort_values(ascending=False).head(10),
            "\n",
            lexical_cohesion.sort_values(ascending=False).head(10),
        )
    if weights is None:
        weights = np.array([1, 1, 1]) / 3
    end()
    return df.dot(weights)


if __name__ == "__main__":
    PATH_TO_GENERAL_DOMAIN = "../data/wiki_testing.pkl"
    PATH_TO_TECHNICAL_DOMAIN = "../data/pmc_testing.pkl"
    wiki = pd.read_pickle(PATH_TO_GENERAL_DOMAIN)
    pmc = pd.read_pickle(PATH_TO_TECHNICAL_DOMAIN)
    # term_extractor(pmc[:50], wiki[:250])

    # print(term_extractor(pmc, wiki).sort_values(ascending=False).head(50))
    print(
        term_extractor(pmc[:50], wiki[:1000], verbose=True)
        .sort_values(ascending=False)
        .head(50)
    )
    # print(pmc[0])
    # Syntactic Analysis: (PP NP PP CNP RVP NP PP)
    # POS: (PAJNNN AJN PNCNNWVANPJN)
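
The returned score is df.dot(weights), equal thirds by default, so each candidate t gets a weighted average of three signals, each rescaled by its column maximum before the dot product; roughly:

    domain_pertinence(t) = f_tech(t) / max(f_tech(t), f_gen(t))            (after scaling each count series)
    domain_consensus(t)  = -sum over documents d of p_d(t) * log p_d(t)    (entropy of t across technical documents)
    lexical_cohesion(t)  = |t| * f(t) * log f(t) / sum of counts of t's individual words

where f_tech and f_gen are counts in the technical and general corpora and |t| is the word length of t.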

‎weirdness.py

+63
@@ -0,0 +1,63 @@
# weirdness

import time
import pandas as pd
from term_extraction import TermExtraction, add_term_extraction_method

start_ = 0
tmp = 0


def start():
    global start_
    start_ = time.time()


def end():
    global start_
    print(time.time() - start_)


@add_term_extraction_method
def weirdness(technical_corpus, general_corpus, normalized=False, technical_counts=None):
    # http://ceur-ws.org/Vol-1031/paper3.pdf
    if technical_counts is None:
        technical_counts = TermExtraction(technical_corpus).count_terms_from_documents()
    general_counts = TermExtraction(
        general_corpus, technical_counts.index
    ).count_terms_from_documents()
    technical_word_count = TermExtraction.word_length("\n".join(technical_corpus))
    general_word_count = TermExtraction.word_length("\n".join(general_corpus))

    zero_division_preventer = pd.Series(index=technical_counts.index, data=1)
    general_counts += zero_division_preventer
    if normalized:
        return (
            technical_counts
            * general_word_count
            / general_counts
            / technical_word_count
        )
    else:
        return technical_counts / general_counts


if __name__ == "__main__":
    PATH_TO_GENERAL_DOMAIN = "../data/wiki_testing.pkl"
    PATH_TO_TECHNICAL_DOMAIN = "../data/pmc_testing.pkl"
    wiki = pd.read_pickle(PATH_TO_GENERAL_DOMAIN)
    pmc = pd.read_pickle(PATH_TO_TECHNICAL_DOMAIN)
    # print(pmc, '\n', wiki)
    pairdf = weirdness(pmc[:200], wiki[:500])
    print(pairdf.sort_values(ascending=False).head(50))
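
In formula form, with the +1 coming from zero_division_preventer, the function returns

    weirdness(t) = f_tech(t) / (f_gen(t) + 1)

and, when normalized=True,

    weirdness(t) = (f_tech(t) / N_tech) / ((f_gen(t) + 1) / N_gen)

where f_tech and f_gen are the candidate's counts in the technical and general corpora and N_tech, N_gen are the (approximate) word counts of each corpus joined into a single string.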
