-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathserovar-search.py
75 lines (53 loc) · 1.9 KB
/
serovar-search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import re
from sys import argv
import numpy as np
import pandas as pd
from litsearch import PubMed, ask_email
# Compiled once at import time. Captures the single word that follows
# "Salmonella enterica serovar" (either capitalisation of "serovar").
_SEROVAR_PATTERN = re.compile(r"Salmonella enterica [Ss]erovar (\w+)")


def find_terms(text):
    """Return the serovar name mentioned in *text*, or NaN if there is none.

    Args:
        text: String to scan, e.g. a publication title. Anything that is not
            a ``str`` (such as the float NaN pandas uses for an empty CSV
            cell) is treated as "no serovar found".

    Returns:
        The first word following "Salmonella enterica serovar" /
        "Salmonella enterica Serovar" in *text*, or ``np.nan`` when the
        phrase does not occur.
    """
    # Guard first: re would raise TypeError on non-string input.
    if not isinstance(text, str):
        return np.nan
    found = _SEROVAR_PATTERN.search(text)
    if found is None:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    return found.group(1)
def search(df, search: str, top=10) -> pd.DataFrame:
    """Return the rows of *df* whose "Serovar" value matches *search*.

    The first character of *search* is matched in either case; the rest of
    the term is matched as written. The term is interpreted as a regular
    expression and may match anywhere inside the serovar name.

    Args:
        df: DataFrame with a string "Serovar" column plus the "Title",
            "Authors", "Journal/Book", "Publication Year" and "Times cited"
            columns.
        search: Serovar name (or regular-expression fragment) to look for.
            An empty string matches every row that has a serovar.
        top: Maximum number of rows to return. Any non-integer value
            (e.g. ``None``) returns all matching rows.

    Returns:
        A DataFrame restricted to the six reporting columns, containing the
        matching rows in their original order.
    """
    if search:
        head, tail = search[0], search[1:]
        pattern = f"[{head.upper()}{head.lower()}]{tail}"
    else:
        # Avoid IndexError on search[0]; an empty pattern matches any string.
        pattern = ""

    # na=False keeps rows with a missing serovar out of the result. A boolean
    # mask (rather than re-selecting by index label) also avoids duplicating
    # rows when index labels repeat.
    hits = df["Serovar"].str.contains(pattern, regex=True, na=False)
    result = df.loc[
        hits,
        ["Title", "Authors", "Journal/Book", "Publication Year", "Serovar", "Times cited"],
    ]

    if isinstance(top, int):
        return result.head(top)
    return result
def Serovars(df, top=10):
    """Print how often each serovar occurs, most frequent first.

    Args:
        df: DataFrame with a "Serovar" column.
        top: Number of leading entries of the frequency table to print.

    Returns:
        None. The table is written to standard output.
    """
    counts = df["Serovar"].value_counts()
    print(counts[:top])
def main(csv_file):
    """Build the serovar literature table from a CSV file.

    Reads *csv_file* with its first column as the index, removes the
    "PMCID", "NIHMS ID" and "First Author" columns, adds a "Serovar" column
    derived from each title via ``find_terms``, keeps only rows where a
    serovar was found, and passes the result through
    ``PubMed.add_times_cited``.

    Args:
        csv_file: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        Whatever ``PubMed.add_times_cited`` returns for the filtered table
        (presumably the table with citation counts added -- confirm against
        ``litsearch``).
    """
    unused_columns = ["PMCID", "NIHMS ID", "First Author"]
    table = pd.read_csv(csv_file, index_col=0).drop(columns=unused_columns)

    # Extract the serovar named in each title; titles without one give NaN.
    table["Serovar"] = table["Title"].apply(find_terms)

    # Discard publications whose title does not mention a serovar.
    table = table.dropna(subset=["Serovar"])

    return PubMed.add_times_cited(table)
if __name__ == "__main__":
    # Command-line entry point. Supported invocations:
    #   new <csv>          rebuild the table from <csv> and overwrite <csv>
    #   find <csv> <term>  print the top matches for serovar <term>
    #   summary <pmid>     call PubMed.long_summary for <pmid>
    #   serovars <csv>     print the most common serovars in <csv>
    usage = (
        "usage: serovar-search.py new <csv> | find <csv> <term> | "
        "summary <pmid> | serovars <csv>"
    )
    # Number of positional arguments each command needs after its own name.
    required_args = {"new": 1, "find": 2, "summary": 1, "serovars": 1}

    if len(argv) < 2:
        raise SystemExit(usage)
    command = argv[1]
    if command not in required_args:
        raise TypeError("No recognised argument given for first positional argument.")
    if len(argv) < 2 + required_args[command]:
        # Exit with a readable message instead of a bare IndexError.
        raise SystemExit(usage)

    if command == "new":
        fp = argv[2]
        ask_email()
        df = main(fp)
        # NOTE: the processed table replaces the input file in place.
        df.to_csv(fp)
    elif command == "find":
        fp = argv[2]
        df = pd.read_csv(fp, index_col=0)
        term = argv[3]
        print(search(df, term))
    elif command == "summary":
        ask_email()
        pmid = argv[2]
        # The return value is not used here; presumably long_summary does its
        # own output -- confirm against litsearch.
        PubMed.long_summary(pmid)
    else:  # command == "serovars"
        fp = argv[2]
        df = pd.read_csv(fp, index_col=0)
        # Previously the table was loaded but never reported.
        Serovars(df)