-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgcf_wrapper.py
More file actions
100 lines (74 loc) · 3.13 KB
/
gcf_wrapper.py
File metadata and controls
100 lines (74 loc) · 3.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
## Extracting the guidetree for each cognateset
'''
python3 gcf_wrapper.py gold_trees/walworthpolynesian.gold.tre computed/cognateswalworthpolynesian.tsv
'''
from lingpy import *
from gCF_Calc import *
from dendropy import Tree
from dendropy.calculate.treecompare import symmetric_difference
import sys
import numpy as np
from tqdm import tqdm
master_tree_file = sys.argv[1] #gold tree or the lexical tree
cog_file = sys.argv[2] #cognates file to be aligned
wl = Wordlist(cog_file)
trees = []
nr_perms = int(sys.argv[3])
for cogid, values in wl.get_etymdict(ref='cogids').items():
idxs = []
#print(cogid, values)
for idx in values:
if idx:
idxs += [idx[0]]
words, taxa = [], []
for idx in idxs:
word_idx = wl[idx, 'cogids'].index(cogid)
if wl[idx, 'doculect'] in taxa:
print(wl[idx, 'doculect'])
continue
words += [wl[idx, 'tokens'].n[word_idx]]
taxa += [wl[idx, 'doculect']]
if len(taxa) < 4: continue
#continue
mult = Multiple(words)
mult.align(method='progressive')
#print(mult.alm_matrix)
#print(mult.classes)
tree = str(mult.tree)
if "(" not in tree: continue
dp_tree = Tree.get_from_string(tree, "newick")
for i, (idx, vals) in enumerate(mult.int2ext.items()):
rep = ','.join([taxa[j] for j in vals])
to_rep = "".join(mult._classes[idx])
#print("To be replaced ", to_rep)
for x in dp_tree.leaf_nodes():
if x.taxon.label == to_rep:
x.taxon.label = rep
tree = dp_tree.as_string("newick").replace("_", "").replace("\n","").replace("\'", "")
dp_tree = Tree.get_from_string(tree, "newick")
nb_flag = False
for itn in dp_tree.nodes():
if itn.num_child_nodes() > 2:
nb_flag = True
break
if nb_flag: continue
print(tree)
trees.append(tree)
if master_tree_file.endswith(".nex"):
master_tree = Tree.get(path=master_tree_file, schema="nexus").as_string("newick").replace("\n", "")
elif master_tree_file.endswith(".nwk") or master_tree_file.endswith(".tre"):
master_tree = open(master_tree_file).read().replace("\n", "")
mean_gcfs = []
for i in tqdm(range(1, nr_perms+1)):
gcfs, mtree = gcf(master_tree, trees, seed=i)
_, __, gcf_scores, strng_gcf_scores= zip(*list(gcfs.values()))
print("Mean of gcf scores ", i, np.mean(gcf_scores), np.std(gcf_scores))
mean_gcfs.append(np.mean(gcf_scores))
mtree.write(schema="nexus",suppress_internal_node_labels=True, suppress_edge_lengths=True, path=master_tree_file+".gcf"+str(i),translate_tree_taxa=True)
if "mcc" in sys.argv:
mtr = Tree.get(path=master_tree_file, schema="nexus")
mtre = Tree.get(path=master_tree_file+".gcf"+str(i), schema="nexus")
for e1, e2 in zip(mtr.internal_edges(), mtre.internal_edges()):
print(e1.head_node.annotations.get_value("support"), e2.head_node.annotations.get_value("gcf"), [x.taxon.label for x in e1.head_node.leaf_nodes()], [x.taxon.label for x in e2.head_node.leaf_nodes()])
print()
print("Final ", np.mean(mean_gcfs))