Merge pull request #1 from pirovc/v0.08
V0.08
pirovc authored Nov 8, 2017
2 parents d6b48b9 + 1c53b56 commit 0c67508
Showing 16 changed files with 172 additions and 166 deletions.
62 changes: 13 additions & 49 deletions DUDes.py
@@ -28,20 +28,20 @@
import argparse, time, shelve
import multiprocessing as mp
from collections import defaultdict
from parse_sam import parse_sam
from Util import *
from SMap import SMap
from Refs import Refs
from TTree import TTree
from Ident import Ident
from Rep import Rep
from Bins import Bins
from Names import Names
from Ranks import Ranks
from dudes.parse_sam import parse_sam
from dudes.Util import *
from dudes.SMap import SMap
from dudes.Refs import Refs
from dudes.TTree import TTree
from dudes.Ident import Ident
from dudes.Rep import Rep
from dudes.Bins import Bins
from dudes.Names import Names
from dudes.Ranks import Ranks

def main():

version = 'v0.07'
version = 'v0.08'

total_tx = time.time()

@@ -58,7 +58,7 @@ def main():
global min_group_size
global bin_size

parser = argparse.ArgumentParser(prog='DUDes')
parser = argparse.ArgumentParser(prog='DUDes.py')
parser.add_argument('-s',required=True, metavar='<sam_file>', dest="sam_file", help="Alignment/mapping file in SAM format. DUDes does not depend on any specific read mapper, but it requires header information (@SQ SN:gi|556555098|ref|NC_022650.1| LN:55956) and mismatch information (check -i)")
parser.add_argument('-d',required=True, metavar='<database_file>', dest="database_file", help="Database file (output from DUDesDB [.npz])")
parser.add_argument('-i', metavar='<sam_format>', dest="sam_format", default="nm", help="SAM file format ['nm': sam file with standard cigar string plus NM flag (NM:i:[0-9]*) for mismatches count | 'ex': just the extended cigar string]. Default: 'nm'")
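For orientation, a hypothetical invocation based only on the options visible above (file names are placeholders; further options follow in the truncated part of this hunk):

python3 DUDes.py -s mapping.sam -d dudesdb.npz -i nm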
@@ -506,31 +506,7 @@ def perm_pval(c,t):
diff_random[n] = np.mean(combined[:norm_c_sub.size]) - np.mean(combined[norm_c_sub.size:])

pv = sum(np.greater_equal(diff_random,diff_obs)) / float(permutations)

# ## plots
# if plot:
# import matplotlib.pyplot as plt
# fig, ax = plt.subplots(nrows=1, ncols=2, sharex=False, sharey=False)
# ax[0].plot(norm_c,lw=2.5,color='blue',label='Y')
# ax[0].plot(norm_t,lw=2.5,color='red',label='X')
# ax[0].legend(fontsize=20)
# ax[0].axvline(cutoff-1, color='black', lw=3)
# #ax[0].text(cutoff-1+5, 100, int(cutoff), fontsize=20)
# #ax[0].set_xticks([])
# ax[0].set_yticks([])
# ax[0].set_xlabel('bins', fontsize=20)
# ax[0].set_ylabel('bin score', fontsize=20)
# ax[1].hist(diff_random,50,alpha=0.8,color='gray')
# ax[1].axvline(diff_obs, color='green', lw=3)
# ax[1].axvline(1-np.percentile(diff_random,cv), color='red', lw=3, ls="--")
# ax[1].set_xticks([])
# ax[1].set_yticks([])
# xlim = ax[1].get_xlim()
# ax[1].set_xlim(xlim[0]-100,xlim[1]+100)
# plt.tight_layout()
# plt.show()
# ## plots


return pv
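For context, a minimal self-contained sketch of the permutation test computed in perm_pval above; the function name, argument names, and RNG setup here are illustrative, not the original signature:

import numpy as np

def permutation_pvalue(scores_a, scores_b, permutations=1000, seed=None):
    # Observed difference of mean bin scores between the two groups
    diff_obs = np.mean(scores_a) - np.mean(scores_b)
    combined = np.concatenate([scores_a, scores_b])
    rng = np.random.default_rng(seed)
    diff_random = np.zeros(permutations)
    for n in range(permutations):
        rng.shuffle(combined)  # relabel the pooled scores at random
        diff_random[n] = np.mean(combined[:scores_a.size]) - np.mean(combined[scores_a.size:])
    # p-value: fraction of random relabelings at least as extreme as the observation
    return np.sum(np.greater_equal(diff_random, diff_obs)) / float(permutations)

# Example: two groups with similar means give a large p-value
# permutation_pvalue(np.array([5., 6., 7.]), np.array([5.5, 6.5, 7.5]))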


@@ -562,18 +538,6 @@ def findIndirectMatches(smap,direct_matches_smap_idx):

return indirect_matches_smap_idx

#def findIndirectMatches2(smap,direct_matches_smap_idx):
# ## Get best match from the direct matches (because sometimes the same read can match several references)
# direct_matches = smap.getSubSet(direct_matches_smap_idx)
# order_dm,index_dm = group_max(direct_matches.getCol('ReadID'),direct_matches.getCol('MatchScore'))
# max_match_score_direct_matches = defaultdict(np.int,zip(np.unique(direct_matches.getCol('ReadID')),direct_matches.getSubSet(order_dm).getSubSet(index_dm).getCol('MatchScore')))
# ## Compare match scores agains an
# ## array of max match scores for each readid (0 when read is not being analyzed) from previous calculation
# ## Use smap.smap directly for performance (instead of smap.getCol())
# indirect_matches_smap_idx = np.less_equal(smap.smap[:,2],[max_match_score_direct_matches[readid] for readid in smap.smap[:,3]])
# # Return indirect matches, ignoring the direct matches
# return np.logical_and(indirect_matches_smap_idx,~direct_matches_smap_idx)

def filterRefTax(smap,refs,ttree):

# References marked with -1 are the ones not found in the refids_lookup (on parse_sam)
126 changes: 72 additions & 54 deletions DUDesDB.py
@@ -25,26 +25,26 @@

import numpy as np
import argparse, subprocess
from Ranks import Ranks
from parse_names import parse_names
from dudes.Ranks import Ranks
from dudes.parse_names import parse_names
from collections import defaultdict
import multiprocessing as mp
import time, sys
import shelve
import pandas as pd

def main():

version = 'v0.07'
version = 'v0.08'

total_tx = time.time()

global nodes

parser = argparse.ArgumentParser(description='DUDesDB')
parser.add_argument('-m', metavar='<reference_mode>', dest="reference_mode", default="gi", help="'gi' uses the GI as the identifier (For headers like: >gi|158333233|ref|NC_009925.1|) [NCBI is phasing out sequence GI numbers in September 2016]. 'av' uses the accession.version as the identifier (for headers like: >NC_013791.2). Default: 'gi'")
parser.add_argument('-f', required=True, metavar='<fasta_file>', dest="fasta_file", help="Reference fasta file for header extraction [only]. Should be the same file used to generate the index database for the SAM file. Each sequence header should contain a identifier as defined in the reference mode.")
parser = argparse.ArgumentParser(prog='DUDesDB.py')
parser.add_argument('-m', metavar='<reference_mode>', dest="reference_mode", default="av", help="'gi' uses the GI as the identifier (For headers like: >gi|158333233|ref|NC_009925.1|) [NCBI is phasing out sequence GI numbers in September 2016]. 'av' uses the accession.version as the identifier (for headers like: >NC_013791.2). Default: 'av'")
parser.add_argument('-f', required=True, metavar='<fasta_files>', dest="fasta_files", nargs="*", help="Reference fasta file(s) for header extraction only, plain or gzipped - the same file used to generate the read mapping index. Each sequence header '>' should contain a identifier as defined in the reference mode.")
parser.add_argument('-g', required=True, metavar='<ref2tax_files>', dest="ref2tax_files", nargs="*", help="reference id to taxid file(s): 'gi_taxid_nucl.dmp[.gz]' --> 'gi' mode, '*.accession2taxid[.gz]' --> 'av' mode [from NCBI taxonomy database ftp://ftp.ncbi.nih.gov/pub/taxonomy/]")
parser.add_argument('-n', required=True, metavar='<nodes_file>', dest="nodes_file", help="nodes.dmp file [from NCBI taxonomy database ftp://ftp.ncbi.nih.gov/pub/taxonomy/]")
parser.add_argument('-g', required=True, metavar='<ref2tax_file>', dest="ref2tax_file", help="Reference identifier (GI or accesion.version) to taxonomic ID file: 'gi_taxid_nucl.dmp' --> 'gi' mode, 'accession2taxid/nucl_gb.dump' --> 'av' mode [from NCBI taxonomy database ftp://ftp.ncbi.nih.gov/pub/taxonomy/]")
parser.add_argument('-a', metavar='<names_file>', dest="names_file", help="names.dmp file [from NCBI taxonomy database ftp://ftp.ncbi.nih.gov/pub/taxonomy/]")
parser.add_argument('-o', metavar='<output_prefix>', dest="output_prefix", default="dudesdb", help="Output prefix. Default: dudesdb")
parser.add_argument('-t', metavar='<threads>', dest="threads", type=int, default=1, help="# of threads. Default: 1")
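For orientation, a hypothetical invocation in 'av' mode based only on the options listed above (all file names are placeholders):

python3 DUDesDB.py -m av -f refs.fasta.gz -g nucl_gb.accession2taxid.gz -n nodes.dmp -a names.dmp -o dudesdb -t 4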
@@ -60,34 +60,38 @@ def main():

# Load reference identifiers from fasta file (in a set for fast lookup)
# refids = [identifier]
sys.stdout.write("Extracting reference identifiers (%s) ..." % args.fasta_file)
sys.stdout.write("Extracting reference identifiers (%s) ..." % ",".join(args.fasta_files))
tx = time.time()
try: # LINUX - faster
try: # shell - faster
if args.reference_mode=="gi":
cmd = 'grep -o "gi|[0-9]*" ' + args.fasta_file + ' | sed "s/gi|//g"'
cmd = 'zgrep -h -o "^>gi|[0-9]*" ' + " ".join(args.fasta_files) + ' | sed "s/>gi|//g"'
else:
cmd = 'grep -o ">[A-Z0-9_\.]*" ' + args.fasta_file + ' | sed "s/>//g"'
cmd = 'zgrep -h -o "^>[A-Z0-9_\.]*" ' + " ".join(args.fasta_files) + ' | sed "s/>//g"'
out = subprocess.getoutput(cmd)
refids = set(l for l in out.split('\n') if l)
except: # general
import re
except: # python
import re, gzip
if args.reference_mode=="gi":
regex = re.compile('gi\|[0-9]*')
slice = 3
else:
regex = re.compile('>[A-Z0-9_\.]*')
slice = 1
refids = set()
for f in open(args.fasta_file,'r'):
if f[0]==">":
r = regex.search(f)
if r:
if len(r.group())>1: refids.add(r.group()[slice:])
sys.stdout.write(" Done. Elapsed time: " + str(time.time() - tx) + " seconds\n")

for file in args.fasta_files:
f = gzip.open(file, 'rt') if file.endswith(".gz") else open(file,'r')
for line in f:
if line[0]==">":
r = regex.search(line)
if r:
if len(r.group())>1: refids.add(r.group()[slice:])

if not refids:
print("No references found. Check the reference mode (-m) and the patter from your fasta file")
print("\n\tNo references found. Check the reference mode (-m) and your reference headers.")
return
else:
print("\n\t%d unique references found" % len(refids))
sys.stdout.write("Done. Elapsed time: " + str(time.time() - tx) + " seconds\n")
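A tiny illustration of the 'av' header pattern and the slice used in the Python fallback above (the header string is made up; the pattern is written as a raw string here):

import re
regex = re.compile(r'>[A-Z0-9_\.]*')
header = ">NC_013791.2 Example organism, complete genome"
r = regex.search(header)
print(r.group()[1:])  # prints 'NC_013791.2'; a slice of 1 drops the leading '>'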

# Load nodes.dmp
# nodes = [taxid,parent_taxid,rankid]
@@ -101,47 +105,61 @@ def main():
nodes.append([int(fields[0]),int(fields[1]),ranks[fields[2]]])
nodes = np.array(nodes)
sys.stdout.write(" Done. Elapsed time: " + str(time.time() - tx) + " seconds\n")

# Load refid 2 taxid file
# Verify if the entry is being used in the refids before and just output relevant rows
# refid_taxid = [refid,taxid]
sys.stdout.write("Loading taxids (%s) ..." % args.ref2tax_file)
sys.stdout.write("Loading taxids (%s) ..." % ",".join(args.ref2tax_files))
tx = time.time()
refids_lookup = defaultdict(lambda: len(refids_lookup)) #Auto-increase id dict
try: # pandas - faster
import pandas as pd
if args.reference_mode=="gi":
refid_taxid = np.array(pd.read_csv(args.ref2tax_file, sep='\t', header=None, dtype=int, converters={0:lambda x: refids_lookup[x] if x in refids else np.nan}).dropna(how='any'),dtype=int)
else:
refid_taxid = np.array(pd.read_csv(args.ref2tax_file, sep='\t', header=None, skiprows=1, usecols=[1,2], converters={1:lambda x: refids_lookup[x] if x in refids else np.nan}).dropna(how='any'),dtype=int)
except: # general
refid_taxid = []
if args.reference_mode=="gi":
for f in open(args.ref2tax_file):
fields = f.split('\t')
if fields[0] in refids:
refid_taxid.append([refids_lookup[fields[0]],int(fields[0])])
else:
with open(args.ref2tax_file,"r") as file:
next(file)
for f in file:
fields = f.split('\t')
if fields[1] in refids:
refid_taxid.append([refids_lookup[fields[1]],int(fields[2])])
refid_taxid = np.array(refid_taxid,dtype=int)
refid_taxid = []
if args.reference_mode=="gi":
for file in args.ref2tax_files:
refid_taxid.append(np.array(pd.read_csv(file, compression='gzip' if file.endswith(".gz") else None, sep='\t', header=None, dtype=int, converters={0:lambda x: refids_lookup[x] if x in refids else np.nan}).dropna(how='any'),dtype=int))
else:
for file in args.ref2tax_files:
refid_taxid.append(np.array(pd.read_csv(file, compression='gzip' if file.endswith(".gz") else None, sep='\t', header=None, skiprows=1, usecols=[1,2], converters={1:lambda x: refids_lookup[x] if x in refids else np.nan}).dropna(how='any'),dtype=int))
# Concatenate files together
refid_taxid = np.concatenate(refid_taxid, axis=0)
sys.stdout.write(" Done. Elapsed time: " + str(time.time() - tx) + " seconds\n")
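The refids_lookup trick above assigns consecutive integer ids to reference strings on first access; a tiny example (keys are made-up accessions):

from collections import defaultdict

refids_lookup = defaultdict(lambda: len(refids_lookup))  # auto-incrementing id dict
print(refids_lookup["NC_009925.1"])  # 0 - the first key seen gets id 0
print(refids_lookup["NC_013791.2"])  # 1 - a new key gets the next id
print(refids_lookup["NC_009925.1"])  # 0 - a repeated key keeps its original id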

# Output differences and verify the taxids on nodes file
sys.stdout.write("Parsing nodes and taxids ...")
tx = time.time()
unique_refs_used = np.unique(refid_taxid[:,0]).size
sys.stdout.write("\n\t%d references without entries on %s" % ((len(refids) - unique_refs_used,args.ref2tax_file)))
#print(np.setdiff1d(refids,refids_lookup.keys()))
sub_refid_taxid = refid_taxid[np.in1d(refid_taxid[:,1],nodes[:,0])]
sys.stdout.write("\n\t%d references without entries on %s\n" % (unique_refs_used - sub_refid_taxid[:,0].size,args.nodes_file))
#print(refid_taxid[~np.in1d(refid_taxid[:,1],nodes[:,0])])
sys.stdout.write(" Done. Elapsed time: " + str(time.time() - tx) + " seconds\n")
unique_refs_used = np.unique(refid_taxid[:,0]).size # found on the ref2tax files - should be already unique, just to make sure
if not unique_refs_used:
print("\n\tNo matches on taxonomy found. Check your reference to taxonomy file[s] (-g): ", ",".join(args.ref2tax_files))
return

seq_without_refid = len(refids) - unique_refs_used
sys.stdout.write("\n\tIgnoring %d (out of %d) sequences without entries on %s\n" % ((seq_without_refid, len(refids), ",".join(args.ref2tax_files))))
if seq_without_refid:
refids_nparr = np.array(list(refids))
print("\n".join(refids_nparr[~np.in1d(refids_nparr, list(refids_lookup.keys()))]))
del refids_nparr

refid_with_valid_taxid = np.in1d(refid_taxid[:,1],nodes[:,0])
seq_without_taxid = unique_refs_used - sum(refid_with_valid_taxid)
sys.stdout.write("\tIgnoring %d (out of %d) sequences without taxid on %s\n" % (seq_without_taxid, unique_refs_used, args.nodes_file))
if seq_without_taxid:
refids_lookup_rev = {v: k for k, v in refids_lookup.items()}
print("\n".join([refids_lookup_rev[r] + "\t" + str(t) for r,t in refid_taxid[~refid_with_valid_taxid]]))

# filter from lookup (rev) and overwrite current lookup
for r in refid_taxid[~refid_with_valid_taxid,0]:
del refids_lookup_rev[r]
refids_lookup = {v: k for k, v in refids_lookup_rev.items()}
del refids_lookup_rev
# filter out entries without taxid matches on nodes.dmp
refid_taxid = refid_taxid[refid_with_valid_taxid]

sys.stdout.write("Done. Elapsed time: " + str(time.time() - tx) + " seconds\n")

unique_refs_used_filtered = np.unique(refid_taxid[:,0]).size
if not unique_refs_used_filtered:
print("\n\tNo matches on nodes found. Check your file (-n): ", args.nodes_file)
return
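For clarity, a toy example of the np.in1d filter applied in the block above (data is made up):

import numpy as np

refid_taxid = np.array([[0, 100], [1, 200], [2, 999]])  # columns: internal refid, taxid
node_taxids = np.array([100, 200, 300])                  # taxids present in nodes.dmp
valid = np.in1d(refid_taxid[:, 1], node_taxids)          # True where the taxid is known
print(refid_taxid[valid])                                # keeps the rows with taxids 100 and 200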

# ------- MakeDB -----------
sys.stdout.write("Creating database ...")
tx = time.time()
@@ -157,7 +175,7 @@ def main():
pool = mp.Pool(args.threads)
paths = defaultdict(list)
res = {}
for taxid in np.unique(sub_refid_taxid[:,1]):
for taxid in np.unique(refid_taxid[:,1]):
res[taxid] = pool.apply_async(generatePath,args=(taxid,fixed_ranks_id,))
for taxid,r in res.items():
paths[taxid] = r.get()
@@ -167,7 +185,7 @@ def main():
no_rank_id = fixed_ranks.index('strain')
refid_nodes = []
res = []
for refid,taxid in sub_refid_taxid:
for refid,taxid in refid_taxid:
res.append(pool.apply_async(generateDB,args=(paths[taxid],refid,no_rank_after,no_rank_id,)))
for r in res:
refid_nodes.extend(r.get())
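The two loops above follow the same submit-then-collect multiprocessing pattern; a minimal sketch with a toy worker (generatePath and generateDB themselves are not shown in this hunk):

import multiprocessing as mp

def toy_path(taxid):
    # stand-in for a per-taxid lineage computation
    return [taxid, taxid // 10]

if __name__ == "__main__":
    taxids = [561, 562, 570]
    with mp.Pool(processes=2) as pool:
        res = {t: pool.apply_async(toy_path, args=(t,)) for t in taxids}
        paths = {t: r.get() for t, r in res.items()}  # .get() blocks until each result is ready
    print(paths)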
