This repository was archived by the owner on Jun 1, 2023. It is now read-only.

Commit dec8e91: initial commit
Committed May 17, 2020 (0 parents)

8 files changed: +656 -0 lines
 

‎.gitignore

+1
@@ -0,0 +1 @@
__pycache__

‎__init__.py

+6
@@ -0,0 +1,6 @@
from pyate.term_extraction import TermExtraction
from pyate.basic import basic
from pyate.combo_basic import combo_basic
from pyate.cvalues import c_values
from pyate.term_extractor import term_extractor
from pyate.weirdness import weirdness
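
A quick usage sketch, assuming the six modules above resolve as a package named pyate on the import path (the sample sentence is illustrative only, and the flat intra-module imports in this commit may need adjusting before this works as an installed package):

    # hypothetical usage sketch, not part of this commit
    from pyate import combo_basic

    doc = "Cutaneous melanoma is the most common cancer of the skin."
    print(combo_basic(doc).sort_values(ascending=False).head())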

‎basic.py

+19
@@ -0,0 +1,19 @@
# basic
import numpy as np
from term_extraction import TermExtraction, add_term_extraction_method
from combo_basic import combo_basic


@add_term_extraction_method
def basic(technical_corpus, *args, **kwargs):
    # Basic is ComboBasic with the superset term switched off
    weights = np.array([1, 3.5, 0])
    return combo_basic(technical_corpus, *args, weights=weights, **kwargs)


if __name__ == "__main__":
    import pickle

    pkl = pickle.load(open("../data/pmc_testing.pkl", "rb"))
    print(len(pkl))
    corpus = pkl
    print(TermExtraction(pkl[0]).basic().sort_values(ascending=False).head(50))
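
Reading the weight vector (1, 3.5, 0) against the three columns that combo_basic.py builds (xlogx_score, times_subset, times_superset), basic scores a candidate term t roughly as

    Basic(t) = |t| * log f(t) + 3.5 * e_t

where |t| is the number of words in t, f(t) is its frequency, and e_t is the number of longer candidate terms that contain t; the times_superset column is zeroed out by the final weight.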

‎combo_basic.py

+114
@@ -0,0 +1,114 @@
# combo basic
import math
import time

import pandas as pd
import numpy as np
from term_extraction import TermExtraction, add_term_extraction_method

start_ = 0
tmp = 0


def start():
    global start_
    start_ = time.time()


def end():
    global start_
    print(time.time() - start_)


MAX_WORD_LENGTH = 6
THRESHOLD = 0


def helper_get_subsequences(s):
    # return every contiguous sub-phrase of s except s itself
    sequence = s.split()
    if len(sequence) <= 2:
        return []
    answer = []
    for left in range(len(sequence) + 1):
        for right in range(left + 1, len(sequence) + 1):
            if left == 0 and right == len(sequence):
                continue
            answer.append(" ".join(sequence[left:right]))
    return answer


@add_term_extraction_method
def combo_basic(
    technical_corpus,
    smoothing=0.01,
    verbose=False,
    have_single_word=False,
    technical_counts=None,
    weights=None,
):
    # TODO
    if technical_counts is None:
        technical_counts = (
            TermExtraction(technical_corpus)
            .count_terms_from_documents(verbose=verbose)
            .reindex()
        )

    order = sorted(
        list(technical_counts.keys()), key=TermExtraction.word_length, reverse=True
    )

    if not have_single_word:
        order = list(filter(lambda s: TermExtraction.word_length(s) > 1, order))

    technical_counts = technical_counts[order]

    df = pd.DataFrame(
        {
            "xlogx_score": technical_counts.reset_index()
            .apply(
                lambda s: math.log(TermExtraction.word_length(s["index"])) * s[0],
                axis=1,
            )
            .values,
            "times_subset": 0,
            "times_superset": 0,
        },
        index=technical_counts.index,
    )

    indices = set(technical_counts.index)

    def score_of_children(candidate):
        # NOTE: currently unused; the loop below performs the subset/superset counting
        df.at[candidate, "times_subset"] += 1
        if TermExtraction.word_length(candidate) == 1:
            return 1
        sm = 1
        for substring in helper_get_subsequences(candidate):
            if substring in indices:
                df.at[substring, "times_subset"] += 1
        return sm

    for index in technical_counts.index:
        for substring in helper_get_subsequences(index):
            if substring in indices:
                df.at[substring, "times_subset"] += 1
                df.at[index, "times_superset"] += 1

    if weights is None:
        weights = np.array([1, 0.75, 0.1])
    return df.apply(lambda s: s.values.dot(weights), axis=1)


if __name__ == "__main__":
    import pickle

    pkl = pickle.load(open("../data/pmc_testing.pkl", "rb"))
    print(len(pkl))
    corpus = pkl
    print(TermExtraction(pkl[0]).combo_basic().sort_values(ascending=False).head(50))
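
With the default weights (1, 0.75, 0.1), the dot product at the end of combo_basic corresponds to the score

    ComboBasic(t) = |t| * log f(t) + 0.75 * e_t + 0.1 * e't

where |t| is the word length of candidate t, f(t) its frequency, e_t the number of candidate terms that contain t (times_subset), and e't the number of candidate terms contained in t (times_superset).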

‎cvalues.py

+114
@@ -0,0 +1,114 @@
# c_value

import time
import math
from tqdm import tqdm
import pandas as pd
from term_extraction import TermExtraction, add_term_extraction_method

start_ = 0
tmp = 0


def start():
    global start_
    start_ = time.time()


def end():
    global start_
    print(time.time() - start_)


MAX_WORD_LENGTH = 6
THRESHOLD = 0


def helper_get_subsequences(s):
    # return every contiguous sub-phrase of s except s itself
    sequence = s.split()
    if len(sequence) <= 2:
        return []
    answer = []
    for left in range(len(sequence) + 1):
        for right in range(left + 1, len(sequence) + 1):
            if left == 0 and right == len(sequence):
                continue
            answer.append(" ".join(sequence[left:right]))
    return answer


@add_term_extraction_method
def c_values(
    technical_corpus,
    smoothing=0.01,
    verbose=False,
    have_single_word=False,
    technical_counts=None,
):
    if technical_counts is None:
        technical_counts = (
            TermExtraction(technical_corpus)
            .count_terms_from_documents(verbose=verbose)
            .reindex()
        )

    order = sorted(
        list(technical_counts.keys()), key=TermExtraction.word_length, reverse=True
    )

    if not have_single_word:
        order = list(filter(lambda s: TermExtraction.word_length(s) > 1, order))

    technical_counts = technical_counts[order]

    df = pd.DataFrame(
        {
            "frequency": technical_counts.values,
            "times_nested": technical_counts.values,
            "number_of_nested": 1,
            "has_been_evaluated": False,
        },
        index=technical_counts.index,
    )

    # print(df)
    output = []
    indices = set(df.index)

    iterator = tqdm(df.iterrows()) if verbose else df.iterrows()

    for candidate, row in iterator:
        f, t, n, h = row
        length = TermExtraction.word_length(candidate)
        if length == MAX_WORD_LENGTH:
            c_val = math.log(length + smoothing) * f
        else:
            c_val = math.log(length + smoothing) * f
            if h:
                c_val -= t / n
        if c_val >= THRESHOLD:
            output.append((candidate, c_val))
        nstart = time.time()  # TODO: optimize
        for substring in helper_get_subsequences(candidate):
            if substring in indices:
                df.loc[substring, "times_nested"] += 1
                df.loc[substring, "number_of_nested"] += f
                df.loc[substring, "has_been_evaluated"] = True
        global tmp
        tmp += time.time() - nstart

    srs = pd.Series(map(lambda s: s[1], output), index=map(lambda s: s[0], output))
    return srs.sort_values(ascending=False)


if __name__ == "__main__":
    import pickle

    pkl = pickle.load(open("../data/pmc_testing.pkl", "rb"))
    corpus = pkl
    print(list(TermExtraction(pkl[0]).c_values(verbose=True).index))
    print(pkl[0])
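
The main loop visits candidates longest-first and approximates the classic C-value measure: for a candidate t of |t| words and frequency f(t),

    C-value(t) = log(|t| + smoothing) * f(t)                          if t never occurs nested in a longer candidate
    C-value(t) = log(|t| + smoothing) * f(t) - t_nested / n_nested    otherwise

where the times_nested and number_of_nested columns accumulate, for each shorter candidate, statistics of the longer candidates that contain it; the smoothing term keeps single-word candidates from scoring exactly zero.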

‎term_extraction.py

+197
@@ -0,0 +1,197 @@
# term_extraction

import spacy
import time
from tqdm import tqdm
import pandas as pd
import numpy as np

# from pathos.multiprocessing import ProcessingPool as Pool
from multiprocessing import Pool
from spacy.matcher import Matcher
from collections import defaultdict
import ahocorasick

start_ = 0
tmp = 0
doctime, matchertime = 0, 0


def start():
    global start_
    start_ = time.time()


def end():
    global start_
    print(time.time() - start_)


class TermExtraction:
    # spaCy 2.x-style loading and Matcher callback API
    nlp = spacy.load("en_core_web_sm", parser=False, entity=False)
    matcher = Matcher(nlp.vocab)
    MAX_WORD_LENGTH = 6

    noun, adj, prep = (
        {"POS": "NOUN", "IS_PUNCT": False},
        {"POS": "ADJ", "IS_PUNCT": False},
        {"POS": "DET", "IS_PUNCT": False},  # named prep but currently matches determiners
    )

    patterns = [
        [adj],
        [{"POS": {"IN": ["ADJ", "NOUN"]}, "OP": "*", "IS_PUNCT": False}, noun],
        [
            {"POS": {"IN": ["ADJ", "NOUN"]}, "OP": "*", "IS_PUNCT": False},
            noun,
            prep,
            {"POS": {"IN": ["ADJ", "NOUN"]}, "OP": "*", "IS_PUNCT": False},
            noun,
        ],
    ]

    def __init__(self, corpus, vocab=None, patterns=patterns, do_parallelize=True):
        """
        If corpus is a string, find the vocab sequentially; if the corpus is an iterator,
        compute in parallel. If a vocab list is given, only count frequencies of terms in
        the vocab list; otherwise search using the patterns.

        TODO: do_parallelize and do_lower
        """
        self.corpus = corpus
        self.vocab = vocab
        self.patterns = patterns
        self.do_parallelize = do_parallelize

    @staticmethod
    def word_length(string):
        return string.count(" ") + 1

    @property
    def trie(self):
        if not hasattr(self, "_TermExtraction__trie"):
            self.__trie = ahocorasick.Automaton()
            for idx, key in enumerate(self.vocab):
                self.__trie.add_word(key, (idx, key))
            self.__trie.make_automaton()
        return self.__trie

    def count_terms_from_document(self, document):
        # for single documents
        term_counter = defaultdict(int)
        if self.vocab is None:

            def add_to_counter(matcher, doc, i, matches):
                match_id, start, end = matches[i]
                candidate = str(doc[start:end])
                if (
                    TermExtraction.word_length(candidate)
                    <= TermExtraction.MAX_WORD_LENGTH
                ):
                    term_counter[candidate] += 1

            for i, pattern in enumerate(self.patterns):
                TermExtraction.matcher.add("term{}".format(i), add_to_counter, pattern)

            doc = TermExtraction.nlp(document.lower(), disable=["parser", "ner"])
            matches = TermExtraction.matcher(doc)
        else:
            for end_index, (insert_order, original_value) in self.trie.iter(
                document.lower()
            ):
                term_counter[original_value] += 1
        return term_counter

    def count_terms_from_documents(self, seperate=False, verbose=False):
        if type(self.corpus) is str:
            term_counter = pd.Series(self.count_terms_from_document(self.corpus))
        elif type(self.corpus) is list or type(self.corpus) is pd.Series:
            if seperate:
                term_counters = []
            else:
                term_counter = pd.Series(dtype="int64")
            if verbose:
                pbar = tqdm(total=len(self.corpus))

            def callback(counter_list):
                if verbose:
                    pbar.update(1)
                if seperate:
                    term_counters.append(
                        (tuple(counter_list.keys()), tuple(counter_list.values()))
                    )
                else:
                    nonlocal term_counter
                    # print(tuple(counter_list.values()))
                    term_counter = term_counter.add(
                        pd.Series(
                            index=tuple(counter_list.keys()),
                            data=tuple(counter_list.values()),
                            dtype=np.int64,
                        ),
                        fill_value=0,
                    ).astype(np.int64)

            def error_callback(e):
                print(e)

            P = Pool()

            start_ = time.time()
            for document in self.corpus:
                P.apply_async(
                    self.count_terms_from_document,
                    [document],
                    callback=callback,
                    error_callback=error_callback,
                )
            P.close()
            P.join()
            print(time.time() - start_)

            P.terminate()
            if verbose:
                pbar.close()
            # print(term_counter)
        else:
            raise TypeError()

        if seperate:

            def counter_to_series(counter):
                return pd.Series(data=counter[1], index=counter[0], dtype="int8")

            return (
                pd.DataFrame(data=map(counter_to_series, term_counters))
                .fillna(0)
                .astype("int8")
                .T
            )
        else:
            return term_counter


def add_term_extraction_method(extractor):
    # attach the extractor as a TermExtraction method that receives the instance's counts
    def decorated(self, *args, **kwargs):
        return extractor(
            self.corpus,
            technical_counts=self.count_terms_from_documents(),
            *args,
            **kwargs
        )

    setattr(TermExtraction, extractor.__name__, decorated)
    return extractor


if __name__ == "__main__":
    PATH_TO_GENERAL_DOMAIN = "../data/wiki_testing.pkl"
    PATH_TO_TECHNICAL_DOMAIN = "../data/pmc_testing.pkl"
    wiki = pd.read_pickle(PATH_TO_GENERAL_DOMAIN)
    pmc = pd.read_pickle(PATH_TO_TECHNICAL_DOMAIN)
    # print(pmc, '\n', wiki)
    vocab = ["Cutaneous melanoma", "cancer", "secondary clusters", "bio"]
    start()
    print(
        TermExtraction(pmc[:100]).count_terms_from_documents(
            seperate=True, verbose=True
        )
    )
    end()
    # print(domain_pertinence(pmc[0], wiki[0]))
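
add_term_extraction_method is the glue between this module and the metric files: it attaches the decorated function to TermExtraction as a method that automatically receives the instance's term counts. A hypothetical illustration (frequency_score is made up for this sketch and is not part of the commit):

    from term_extraction import TermExtraction, add_term_extraction_method

    @add_term_extraction_method
    def frequency_score(technical_corpus, technical_counts=None, *args, **kwargs):
        # toy metric: rank candidates by raw frequency
        return technical_counts.sort_values(ascending=False)

    # after decoration it is available as a method:
    # TermExtraction("some technical text").frequency_score()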

‎term_extractor.py

+142
@@ -0,0 +1,142 @@
# term_extractor

import time
import math
import pandas as pd
import numpy as np
from term_extraction import TermExtraction, add_term_extraction_method

# from sklearn import preprocessing

start_ = 0
tmp = 0


def start():
    global start_
    start_ = time.time()


def end():
    global start_
    print(time.time() - start_)


def domain_pertinence(technical_corpus, general_corpus):
    # placeholder, see http://ceur-ws.org/Vol-1031/paper3.pdf
    return 1


def domain_consensus(technical_corpus, general_corpus):
    return 1


def lexical_cohesion(technical_corpus, general_corpus):
    return 1


@add_term_extraction_method
def term_extractor(
    technical_corpus, general_corpus, weights=None, verbose=False, technical_counts=None
):
    # reused initializations
    start()
    technical_counts_seperate = TermExtraction(
        technical_corpus
    ).count_terms_from_documents(True, verbose=verbose)
    end()

    start()
    technical_counts = technical_counts_seperate.sum(axis=1)
    XLOGX = lambda x: x * math.log(x) if x else 0

    # domain pertinence
    general_counts = TermExtraction(
        general_corpus, technical_counts.index
    ).count_terms_from_documents(verbose=verbose)
    general_counts /= general_counts.max()
    domain_pertinence = pd.DataFrame(
        data={
            "technical": technical_counts / technical_counts.sum(),
            "general": general_counts,
        }
    ).fillna(0)

    def domain_pertinence_function(s):
        tech, gen = s.iloc
        return tech / max(tech, gen)

    domain_pertinence = domain_pertinence.apply(domain_pertinence_function, axis=1)

    # domain consensus
    domain_consensus = technical_counts_seperate
    domain_consensus = domain_consensus.div(domain_consensus.sum(axis=0), axis=1)
    domain_consensus = domain_consensus.applymap(lambda x: -XLOGX(x))
    domain_consensus = domain_consensus.sum(axis=1)

    # lexical cohesion
    term_words = set(
        word for term in technical_counts_seperate.index for word in term.split()
    )
    term_counts = TermExtraction(
        technical_corpus, term_words
    ).count_terms_from_documents()
    lexical_cohesion = technical_counts

    def lexical_cohesion_function(row):
        word, freq = row.iloc
        # print(word, freq)
        return (
            TermExtraction.word_length(word)
            * XLOGX(freq)  # remove plus 1 later
            / sum(map(lambda s: term_counts.loc[s], word.split()))
        )

    lexical_cohesion = pd.Series(
        lexical_cohesion.reset_index().apply(lexical_cohesion_function, axis=1).values,
        index=lexical_cohesion.index,
    )

    # normalize each component before combining
    domain_pertinence /= domain_pertinence.max()
    domain_consensus /= domain_consensus.max()
    lexical_cohesion /= lexical_cohesion.max()

    df = pd.DataFrame(
        data={
            "domain_pertinence": domain_pertinence,
            "domain_consensus": domain_consensus,
            "lexical_cohesion": lexical_cohesion,
        }
    )

    if verbose:
        print(
            domain_pertinence.sort_values(ascending=False).head(10),
            "\n",
            domain_consensus.sort_values(ascending=False).head(10),
            "\n",
            lexical_cohesion.sort_values(ascending=False).head(10),
        )
    if weights is None:
        weights = np.array([1, 1, 1]) / 3
    end()
    return df.dot(weights)


if __name__ == "__main__":
    PATH_TO_GENERAL_DOMAIN = "../data/wiki_testing.pkl"
    PATH_TO_TECHNICAL_DOMAIN = "../data/pmc_testing.pkl"
    wiki = pd.read_pickle(PATH_TO_GENERAL_DOMAIN)
    pmc = pd.read_pickle(PATH_TO_TECHNICAL_DOMAIN)
    # term_extractor(pmc[:50], wiki[:250])

    # print(term_extractor(pmc, wiki).sort_values(ascending=False).head(50))
    print(
        term_extractor(pmc[:50], wiki[:1000], verbose=True)
        .sort_values(ascending=False)
        .head(50)
    )
    # print(pmc[0])
    # Syntactic Analysis: (PP NP PP CNP RVP NP PP)
    # POS: (PAJNNN AJN PNCNNWVANPJN)
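
The returned score is df.dot(weights), equal thirds by default, so each candidate t gets a weighted average of three signals, each rescaled by its column maximum before the dot product; roughly:

    domain_pertinence(t) = f_tech(t) / max(f_tech(t), f_gen(t))            (after scaling each count series)
    domain_consensus(t)  = -sum over documents d of p_d(t) * log p_d(t)    (entropy of t across technical documents)
    lexical_cohesion(t)  = |t| * f(t) * log f(t) / sum of counts of t's individual words

where f_tech and f_gen are counts in the technical and general corpora and |t| is the word length of t.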

‎weirdness.py

+63
@@ -0,0 +1,63 @@
# weirdness

import time
import pandas as pd
from term_extraction import TermExtraction, add_term_extraction_method

start_ = 0
tmp = 0


def start():
    global start_
    start_ = time.time()


def end():
    global start_
    print(time.time() - start_)


@add_term_extraction_method
def weirdness(technical_corpus, general_corpus, normalized=False, technical_counts=None):
    # http://ceur-ws.org/Vol-1031/paper3.pdf
    if technical_counts is None:
        technical_counts = TermExtraction(technical_corpus).count_terms_from_documents()
    general_counts = TermExtraction(
        general_corpus, technical_counts.index
    ).count_terms_from_documents()
    technical_word_count = TermExtraction.word_length("\n".join(technical_corpus))
    general_word_count = TermExtraction.word_length("\n".join(general_corpus))

    zero_division_preventer = pd.Series(index=technical_counts.index, data=1)
    general_counts += zero_division_preventer
    if normalized:
        return (
            technical_counts
            * general_word_count
            / general_counts
            / technical_word_count
        )
    else:
        return technical_counts / general_counts


if __name__ == "__main__":
    PATH_TO_GENERAL_DOMAIN = "../data/wiki_testing.pkl"
    PATH_TO_TECHNICAL_DOMAIN = "../data/pmc_testing.pkl"
    wiki = pd.read_pickle(PATH_TO_GENERAL_DOMAIN)
    pmc = pd.read_pickle(PATH_TO_TECHNICAL_DOMAIN)
    # print(pmc, '\n', wiki)
    pairdf = weirdness(pmc[:200], wiki[:500])
    print(pairdf.sort_values(ascending=False).head(50))
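
In formula form, with the +1 coming from zero_division_preventer, the function returns

    weirdness(t) = f_tech(t) / (f_gen(t) + 1)

and, when normalized=True,

    weirdness(t) = (f_tech(t) / N_tech) / ((f_gen(t) + 1) / N_gen)

where f_tech and f_gen are the candidate's counts in the technical and general corpora and N_tech, N_gen are the (approximate) word counts of each corpus joined into a single string.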
