Merge pull request #1 from pirovc/v0.08
V0.08
pirovc authored Nov 8, 2017
2 parents d6b48b9 + 1c53b56 commit 0c67508
Showing 16 changed files with 172 additions and 166 deletions.
62 changes: 13 additions & 49 deletions DUDes.py
@@ -28,20 +28,20 @@
import argparse, time, shelve
import multiprocessing as mp
from collections import defaultdict
from parse_sam import parse_sam
from Util import *
from SMap import SMap
from Refs import Refs
from TTree import TTree
from Ident import Ident
from Rep import Rep
from Bins import Bins
from Names import Names
from Ranks import Ranks
from dudes.parse_sam import parse_sam
from dudes.Util import *
from dudes.SMap import SMap
from dudes.Refs import Refs
from dudes.TTree import TTree
from dudes.Ident import Ident
from dudes.Rep import Rep
from dudes.Bins import Bins
from dudes.Names import Names
from dudes.Ranks import Ranks

def main():

version = 'v0.07'
version = 'v0.08'

total_tx = time.time()

@@ -58,7 +58,7 @@ def main():
global min_group_size
global bin_size

parser = argparse.ArgumentParser(prog='DUDes')
parser = argparse.ArgumentParser(prog='DUDes.py')
parser.add_argument('-s',required=True, metavar='<sam_file>', dest="sam_file", help="Alignment/mapping file in SAM format. DUDes does not depend on any specific read mapper, but it requires header information (@SQ SN:gi|556555098|ref|NC_022650.1| LN:55956) and mismatch information (check -i)")
parser.add_argument('-d',required=True, metavar='<database_file>', dest="database_file", help="Database file (output from DUDesDB [.npz])")
parser.add_argument('-i', metavar='<sam_format>', dest="sam_format", default="nm", help="SAM file format ['nm': sam file with standard cigar string plus NM flag (NM:i:[0-9]*) for mismatches count | 'ex': just the extended cigar string]. Default: 'nm'")
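For orientation, a hypothetical invocation based only on the options visible above (file names are placeholders; further options follow in the truncated part of this hunk):

python3 DUDes.py -s mapping.sam -d dudesdb.npz -i nm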
@@ -506,31 +506,7 @@ def perm_pval(c,t):
diff_random[n] = np.mean(combined[:norm_c_sub.size]) - np.mean(combined[norm_c_sub.size:])

pv = sum(np.greater_equal(diff_random,diff_obs)) / float(permutations)

# ## plots
# if plot:
# import matplotlib.pyplot as plt
# fig, ax = plt.subplots(nrows=1, ncols=2, sharex=False, sharey=False)
# ax[0].plot(norm_c,lw=2.5,color='blue',label='Y')
# ax[0].plot(norm_t,lw=2.5,color='red',label='X')
# ax[0].legend(fontsize=20)
# ax[0].axvline(cutoff-1, color='black', lw=3)
# #ax[0].text(cutoff-1+5, 100, int(cutoff), fontsize=20)
# #ax[0].set_xticks([])
# ax[0].set_yticks([])
# ax[0].set_xlabel('bins', fontsize=20)
# ax[0].set_ylabel('bin score', fontsize=20)
# ax[1].hist(diff_random,50,alpha=0.8,color='gray')
# ax[1].axvline(diff_obs, color='green', lw=3)
# ax[1].axvline(1-np.percentile(diff_random,cv), color='red', lw=3, ls="--")
# ax[1].set_xticks([])
# ax[1].set_yticks([])
# xlim = ax[1].get_xlim()
# ax[1].set_xlim(xlim[0]-100,xlim[1]+100)
# plt.tight_layout()
# plt.show()
# ## plots


return pv
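For context, a minimal self-contained sketch of the permutation test computed in perm_pval above; the function name, argument names, and RNG setup here are illustrative, not the original signature:

import numpy as np

def permutation_pvalue(scores_a, scores_b, permutations=1000, seed=None):
    # Observed difference of mean bin scores between the two groups
    diff_obs = np.mean(scores_a) - np.mean(scores_b)
    combined = np.concatenate([scores_a, scores_b])
    rng = np.random.default_rng(seed)
    diff_random = np.zeros(permutations)
    for n in range(permutations):
        rng.shuffle(combined)  # relabel the pooled scores at random
        diff_random[n] = np.mean(combined[:scores_a.size]) - np.mean(combined[scores_a.size:])
    # p-value: fraction of random relabelings at least as extreme as the observation
    return np.sum(np.greater_equal(diff_random, diff_obs)) / float(permutations)

# Example: two groups with similar means give a large p-value
# permutation_pvalue(np.array([5., 6., 7.]), np.array([5.5, 6.5, 7.5]))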


@@ -562,18 +538,6 @@ def findIndirectMatches(smap,direct_matches_smap_idx):

return indirect_matches_smap_idx

#def findIndirectMatches2(smap,direct_matches_smap_idx):
# ## Get best match from the direct matches (because sometimes the same read can match several references)
# direct_matches = smap.getSubSet(direct_matches_smap_idx)
# order_dm,index_dm = group_max(direct_matches.getCol('ReadID'),direct_matches.getCol('MatchScore'))
# max_match_score_direct_matches = defaultdict(np.int,zip(np.unique(direct_matches.getCol('ReadID')),direct_matches.getSubSet(order_dm).getSubSet(index_dm).getCol('MatchScore')))
# ## Compare match scores agains an
# ## array of max match scores for each readid (0 when read is not being analyzed) from previous calculation
# ## Use smap.smap directly for performance (instead of smap.getCol())
# indirect_matches_smap_idx = np.less_equal(smap.smap[:,2],[max_match_score_direct_matches[readid] for readid in smap.smap[:,3]])
# # Return indirect matches, ignoring the direct matches
# return np.logical_and(indirect_matches_smap_idx,~direct_matches_smap_idx)

def filterRefTax(smap,refs,ttree):

# References marked with -1 are the ones not found in the refids_lookup (on parse_sam)
126 changes: 72 additions & 54 deletions DUDesDB.py
@@ -25,26 +25,26 @@

import numpy as np
import argparse, subprocess
from Ranks import Ranks
from parse_names import parse_names
from dudes.Ranks import Ranks
from dudes.parse_names import parse_names
from collections import defaultdict
import multiprocessing as mp
import time, sys
import shelve
import pandas as pd

def main():

version = 'v0.07'
version = 'v0.08'

total_tx = time.time()

global nodes

parser = argparse.ArgumentParser(description='DUDesDB')
parser.add_argument('-m', metavar='<reference_mode>', dest="reference_mode", default="gi", help="'gi' uses the GI as the identifier (For headers like: >gi|158333233|ref|NC_009925.1|) [NCBI is phasing out sequence GI numbers in September 2016]. 'av' uses the accession.version as the identifier (for headers like: >NC_013791.2). Default: 'gi'")
parser.add_argument('-f', required=True, metavar='<fasta_file>', dest="fasta_file", help="Reference fasta file for header extraction [only]. Should be the same file used to generate the index database for the SAM file. Each sequence header should contain a identifier as defined in the reference mode.")
parser = argparse.ArgumentParser(prog='DUDesDB.py')
parser.add_argument('-m', metavar='<reference_mode>', dest="reference_mode", default="av", help="'gi' uses the GI as the identifier (For headers like: >gi|158333233|ref|NC_009925.1|) [NCBI is phasing out sequence GI numbers in September 2016]. 'av' uses the accession.version as the identifier (for headers like: >NC_013791.2). Default: 'av'")
parser.add_argument('-f', required=True, metavar='<fasta_files>', dest="fasta_files", nargs="*", help="Reference fasta file(s) for header extraction only, plain or gzipped - the same file used to generate the read mapping index. Each sequence header '>' should contain a identifier as defined in the reference mode.")
parser.add_argument('-g', required=True, metavar='<ref2tax_files>', dest="ref2tax_files", nargs="*", help="reference id to taxid file(s): 'gi_taxid_nucl.dmp[.gz]' --> 'gi' mode, '*.accession2taxid[.gz]' --> 'av' mode [from NCBI taxonomy database ftp://ftp.ncbi.nih.gov/pub/taxonomy/]")
parser.add_argument('-n', required=True, metavar='<nodes_file>', dest="nodes_file", help="nodes.dmp file [from NCBI taxonomy database ftp://ftp.ncbi.nih.gov/pub/taxonomy/]")
parser.add_argument('-g', required=True, metavar='<ref2tax_file>', dest="ref2tax_file", help="Reference identifier (GI or accesion.version) to taxonomic ID file: 'gi_taxid_nucl.dmp' --> 'gi' mode, 'accession2taxid/nucl_gb.dump' --> 'av' mode [from NCBI taxonomy database ftp://ftp.ncbi.nih.gov/pub/taxonomy/]")
parser.add_argument('-a', metavar='<names_file>', dest="names_file", help="names.dmp file [from NCBI taxonomy database ftp://ftp.ncbi.nih.gov/pub/taxonomy/]")
parser.add_argument('-o', metavar='<output_prefix>', dest="output_prefix", default="dudesdb", help="Output prefix. Default: dudesdb")
parser.add_argument('-t', metavar='<threads>', dest="threads", type=int, default=1, help="# of threads. Default: 1")
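For orientation, a hypothetical invocation in 'av' mode based only on the options listed above (all file names are placeholders):

python3 DUDesDB.py -m av -f refs.fasta.gz -g nucl_gb.accession2taxid.gz -n nodes.dmp -a names.dmp -o dudesdb -t 4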
@@ -60,34 +60,38 @@ def main():

# Load reference identifiers from fasta file (in a set for fast lookup)
# refids = [identifier]
sys.stdout.write("Extracting reference identifiers (%s) ..." % args.fasta_file)
sys.stdout.write("Extracting reference identifiers (%s) ..." % ",".join(args.fasta_files))
tx = time.time()
try: # LINUX - faster
try: # shell - faster
if args.reference_mode=="gi":
cmd = 'grep -o "gi|[0-9]*" ' + args.fasta_file + ' | sed "s/gi|//g"'
cmd = 'zgrep -h -o "^>gi|[0-9]*" ' + " ".join(args.fasta_files) + ' | sed "s/>gi|//g"'
else:
cmd = 'grep -o ">[A-Z0-9_\.]*" ' + args.fasta_file + ' | sed "s/>//g"'
cmd = 'zgrep -h -o "^>[A-Z0-9_\.]*" ' + " ".join(args.fasta_files) + ' | sed "s/>//g"'
out = subprocess.getoutput(cmd)
refids = set(l for l in out.split('\n') if l)
except: # general
import re
except: # python
import re, gzip
if args.reference_mode=="gi":
regex = re.compile('gi\|[0-9]*')
slice = 3
else:
regex = re.compile('>[A-Z0-9_\.]*')
slice = 1
refids = set()
for f in open(args.fasta_file,'r'):
if f[0]==">":
r = regex.search(f)
if r:
if len(r.group())>1: refids.add(r.group()[slice:])
sys.stdout.write(" Done. Elapsed time: " + str(time.time() - tx) + " seconds\n")

for file in args.fasta_files:
f = gzip.open(file, 'rt') if file.endswith(".gz") else open(file,'r')
for line in f:
if line[0]==">":
r = regex.search(line)
if r:
if len(r.group())>1: refids.add(r.group()[slice:])

if not refids:
print("No references found. Check the reference mode (-m) and the patter from your fasta file")
print("\n\tNo references found. Check the reference mode (-m) and your reference headers.")
return
else:
print("\n\t%d unique references found" % len(refids))
sys.stdout.write("Done. Elapsed time: " + str(time.time() - tx) + " seconds\n")
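A tiny illustration of the 'av' header pattern and the slice used in the Python fallback above (the header string is made up; the pattern is written as a raw string here):

import re
regex = re.compile(r'>[A-Z0-9_\.]*')
header = ">NC_013791.2 Example organism, complete genome"
r = regex.search(header)
print(r.group()[1:])  # prints 'NC_013791.2'; a slice of 1 drops the leading '>'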

# Load nodes.dmp
# nodes = [taxid,parent_taxid,rankid]
@@ -101,47 +105,61 @@ def main():
nodes.append([int(fields[0]),int(fields[1]),ranks[fields[2]]])
nodes = np.array(nodes)
sys.stdout.write(" Done. Elapsed time: " + str(time.time() - tx) + " seconds\n")

# Load refid 2 taxid file
# Verify if the entry is being used in the refids before and just output relevant rows
# refid_taxid = [refid,taxid]
sys.stdout.write("Loading taxids (%s) ..." % args.ref2tax_file)
sys.stdout.write("Loading taxids (%s) ..." % ",".join(args.ref2tax_files))
tx = time.time()
refids_lookup = defaultdict(lambda: len(refids_lookup)) #Auto-increase id dict
try: # pandas - faster
import pandas as pd
if args.reference_mode=="gi":
refid_taxid = np.array(pd.read_csv(args.ref2tax_file, sep='\t', header=None, dtype=int, converters={0:lambda x: refids_lookup[x] if x in refids else np.nan}).dropna(how='any'),dtype=int)
else:
refid_taxid = np.array(pd.read_csv(args.ref2tax_file, sep='\t', header=None, skiprows=1, usecols=[1,2], converters={1:lambda x: refids_lookup[x] if x in refids else np.nan}).dropna(how='any'),dtype=int)
except: # general
refid_taxid = []
if args.reference_mode=="gi":
for f in open(args.ref2tax_file):
fields = f.split('\t')
if fields[0] in refids:
refid_taxid.append([refids_lookup[fields[0]],int(fields[0])])
else:
with open(args.ref2tax_file,"r") as file:
next(file)
for f in file:
fields = f.split('\t')
if fields[1] in refids:
refid_taxid.append([refids_lookup[fields[1]],int(fields[2])])
refid_taxid = np.array(refid_taxid,dtype=int)
refid_taxid = []
if args.reference_mode=="gi":
for file in args.ref2tax_files:
refid_taxid.append(np.array(pd.read_csv(file, compression='gzip' if file.endswith(".gz") else None, sep='\t', header=None, dtype=int, converters={0:lambda x: refids_lookup[x] if x in refids else np.nan}).dropna(how='any'),dtype=int))
else:
for file in args.ref2tax_files:
refid_taxid.append(np.array(pd.read_csv(file, compression='gzip' if file.endswith(".gz") else None, sep='\t', header=None, skiprows=1, usecols=[1,2], converters={1:lambda x: refids_lookup[x] if x in refids else np.nan}).dropna(how='any'),dtype=int))
# Concatenate files together
refid_taxid = np.concatenate(refid_taxid, axis=0)
sys.stdout.write(" Done. Elapsed time: " + str(time.time() - tx) + " seconds\n")
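The refids_lookup trick above assigns consecutive integer ids to reference strings on first access; a tiny example (keys are made-up accessions):

from collections import defaultdict

refids_lookup = defaultdict(lambda: len(refids_lookup))  # auto-incrementing id dict
print(refids_lookup["NC_009925.1"])  # 0 - the first key seen gets id 0
print(refids_lookup["NC_013791.2"])  # 1 - a new key gets the next id
print(refids_lookup["NC_009925.1"])  # 0 - a repeated key keeps its original id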

# Output differences and verify the taxids on nodes file
sys.stdout.write("Parsing nodes and taxids ...")
tx = time.time()
unique_refs_used = np.unique(refid_taxid[:,0]).size
sys.stdout.write("\n\t%d references without entries on %s" % ((len(refids) - unique_refs_used,args.ref2tax_file)))
#print(np.setdiff1d(refids,refids_lookup.keys()))
sub_refid_taxid = refid_taxid[np.in1d(refid_taxid[:,1],nodes[:,0])]
sys.stdout.write("\n\t%d references without entries on %s\n" % (unique_refs_used - sub_refid_taxid[:,0].size,args.nodes_file))
#print(refid_taxid[~np.in1d(refid_taxid[:,1],nodes[:,0])])
sys.stdout.write(" Done. Elapsed time: " + str(time.time() - tx) + " seconds\n")
unique_refs_used = np.unique(refid_taxid[:,0]).size # found on the ref2tax files - should be already unique, just to make sure
if not unique_refs_used:
print("\n\tNo matches on taxonomy found. Check your reference to taxonomy file[s] (-g): ", ",".join(args.ref2tax_files))
return

seq_without_refid = len(refids) - unique_refs_used
sys.stdout.write("\n\tIgnoring %d (out of %d) sequences without entries on %s\n" % ((seq_without_refid, len(refids), ",".join(args.ref2tax_files))))
if seq_without_refid:
refids_nparr = np.array(list(refids))
print("\n".join(refids_nparr[~np.in1d(refids_nparr, list(refids_lookup.keys()))]))
del refids_nparr

refid_with_valid_taxid = np.in1d(refid_taxid[:,1],nodes[:,0])
seq_without_taxid = unique_refs_used - sum(refid_with_valid_taxid)
sys.stdout.write("\tIgnoring %d (out of %d) sequences without taxid on %s\n" % (seq_without_taxid, unique_refs_used, args.nodes_file))
if seq_without_taxid:
refids_lookup_rev = {v: k for k, v in refids_lookup.items()}
print("\n".join([refids_lookup_rev[r] + "\t" + str(t) for r,t in refid_taxid[~refid_with_valid_taxid]]))

# filter from lookup (rev) and overwrite current lookup
for r in refid_taxid[~refid_with_valid_taxid,0]:
del refids_lookup_rev[r]
refids_lookup = {v: k for k, v in refids_lookup_rev.items()}
del refids_lookup_rev
# filter out entries without taxid matches on nodes.dmp
refid_taxid = refid_taxid[refid_with_valid_taxid]

sys.stdout.write("Done. Elapsed time: " + str(time.time() - tx) + " seconds\n")

unique_refs_used_filtered = np.unique(refid_taxid[:,0]).size
if not unique_refs_used_filtered:
print("\n\tNo matches on nodes found. Check your file (-n): ", args.nodes_file)
return
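For clarity, a toy example of the np.in1d filter applied in the block above (data is made up):

import numpy as np

refid_taxid = np.array([[0, 100], [1, 200], [2, 999]])  # columns: internal refid, taxid
node_taxids = np.array([100, 200, 300])                  # taxids present in nodes.dmp
valid = np.in1d(refid_taxid[:, 1], node_taxids)          # True where the taxid is known
print(refid_taxid[valid])                                # keeps the rows with taxids 100 and 200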

# ------- MakeDB -----------
sys.stdout.write("Creating database ...")
tx = time.time()
@@ -157,7 +175,7 @@ def main():
pool = mp.Pool(args.threads)
paths = defaultdict(list)
res = {}
for taxid in np.unique(sub_refid_taxid[:,1]):
for taxid in np.unique(refid_taxid[:,1]):
res[taxid] = pool.apply_async(generatePath,args=(taxid,fixed_ranks_id,))
for taxid,r in res.items():
paths[taxid] = r.get()
@@ -167,7 +185,7 @@ def main():
no_rank_id = fixed_ranks.index('strain')
refid_nodes = []
res = []
for refid,taxid in sub_refid_taxid:
for refid,taxid in refid_taxid:
res.append(pool.apply_async(generateDB,args=(paths[taxid],refid,no_rank_after,no_rank_id,)))
for r in res:
refid_nodes.extend(r.get())
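The two loops above follow the same submit-then-collect multiprocessing pattern; a minimal sketch with a toy worker (generatePath and generateDB themselves are not shown in this hunk):

import multiprocessing as mp

def toy_path(taxid):
    # stand-in for a per-taxid lineage computation
    return [taxid, taxid // 10]

if __name__ == "__main__":
    taxids = [561, 562, 570]
    with mp.Pool(processes=2) as pool:
        res = {t: pool.apply_async(toy_path, args=(t,)) for t in taxids}
        paths = {t: r.get() for t, r in res.items()}  # .get() blocks until each result is ready
    print(paths)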
