Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions neusomatic/python/call.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import shutil
import pickle
import multiprocessing
import copy

import pysam
import numpy as np
Expand Down Expand Up @@ -69,6 +70,11 @@ def call_variants(net, call_loader, out_dir, model_tag, use_cuda):
for data in loader_:
(matrices, labels, var_pos_s, var_len_s,
non_transformed_matrices), (paths) = data

paths_ = copy.deepcopy(paths)
del paths
paths = paths_

matrices = Variable(matrices)
iii += 1
j += len(paths[0])
Expand Down
63 changes: 26 additions & 37 deletions neusomatic/python/filter_candidates.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# filter_candidates.py
# filter raw candidates extracted by 'scan_alignments.py' using min_af and other cut-offs
#-------------------------------------------------------------------------
import os
import argparse
import traceback
import logging
Expand All @@ -25,6 +26,16 @@ def filter_candidates(candidate_record):
thread_logger.info(
"---------------------Filter Candidates---------------------")

if dbsnp:
if not dbsnp.endswith("vcf.gz"):
thread_logger.error("Aborting!")
raise Exception(
"The dbSNP file should be a tabix indexed file with .vcf.gz format")
if not os.path.exists(dbsnp + ".tbi"):
thread_logger.error("Aborting!")
raise Exception(
"The dbSNP file should be a tabix indexed file with .vcf.gz format. No {}.tbi file exists.".format(dbsnp))

records = {}
with open(candidates_vcf) as v_f:
for line in skip_empty(v_f):
Expand Down Expand Up @@ -257,47 +268,25 @@ def filter_candidates(candidate_record):
final_records.append([chrom, pos - 1, ref, alt, line])
final_records = sorted(final_records, key=lambda x: x[0:2])
if dbsnp:
filtered_bed = get_tmp_file()
intervals = []
for x in enumerate(final_records):
intervals.append([x[1][0], int(x[1][1]), int(
x[1][1]) + 1, x[1][2], x[1][3], str(x[0])])
write_tsv_file(filtered_bed, intervals)
filtered_bed = bedtools_sort(
filtered_bed, run_logger=thread_logger)

dbsnp_tmp = get_tmp_file()
vcf_2_bed(dbsnp, dbsnp_tmp)
bedtools_sort(dbsnp_tmp, output_fn=dbsnp, run_logger=thread_logger)
non_in_dbsnp_1 = bedtools_window(
filtered_bed, dbsnp, args=" -w 0 -v", run_logger=thread_logger)
non_in_dbsnp_2 = bedtools_window(
filtered_bed, dbsnp, args=" -w 0", run_logger=thread_logger)

tmp_ = get_tmp_file()
with open(non_in_dbsnp_2) as i_f, open(tmp_, "w") as o_f:
for line in skip_empty(i_f):
x = line.strip().split()
if x[1]!=x[7] or x[3]!=x[9] or x[4]!=x[10]:
o_f.write(line)
non_in_dbsnp_2 = tmp_

non_in_dbsnp_ids = []
with open(non_in_dbsnp_1) as i_f:
for line in skip_empty(i_f):
x = line.strip().split("\t")
non_in_dbsnp_ids.append(int(x[5]))
with open(non_in_dbsnp_2) as i_f:
for line in skip_empty(i_f):
x = line.strip().split("\t")
non_in_dbsnp_ids.append(int(x[5]))
final_records = list(map(lambda x: x[1], filter(
lambda x: x[0] in non_in_dbsnp_ids, enumerate(final_records))))
dbsnp_tb = pysam.TabixFile(dbsnp)
with open(filtered_candidates_vcf, "w") as o_f:
o_f.write("{}\n".format(VCF_HEADER))
o_f.write(
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")
for record in final_records:
if dbsnp:
chrom, pos, ref, alt = record[0:4]
var_id = "-".join(map(str,[chrom, pos, ref, alt]))
region = "{}:{}-{}".format(chrom, pos, pos + 1)
dbsnp_vars = []
for x in dbsnp_tb.fetch(region=region):
chrom_, pos_, _, ref_, alts_ = x.strip().split("\t")[
0:5]
for alt_ in alts_.split(","):
dbsnp_var_id = "-".join(map(str,[chrom_, pos_, ref_, alt_]))
dbsnp_vars.append(dbsnp_var_id)
if var_id in dbsnp_vars:
continue
o_f.write(record[-1] + "\n")
return filtered_candidates_vcf

Expand All @@ -320,7 +309,7 @@ def filter_candidates(candidate_record):
parser.add_argument('--reference', type=str, help='reference fasta filename',
required=True)
parser.add_argument('--dbsnp_to_filter', type=str,
help='dbsnp vcf (will be used to filter candidate variants)', default=None)
help='dbsnp vcf.gz (will be used to filter candidate variants)', default=None)
parser.add_argument('--good_ao', type=float, help='good alternate count (ignores maf)',
default=10)
parser.add_argument('--min_ao', type=float,
Expand Down
73 changes: 42 additions & 31 deletions neusomatic/python/generate_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import numpy as np
import pysam
from scipy.misc import imresize
from PIL import Image

from split_bed import split_region
from utils import concatenate_vcfs, get_chromosomes_order, run_bedtools_cmd, vcf_2_bed, bedtools_sort, bedtools_window, bedtools_intersect, bedtools_slop, get_tmp_file, skip_empty
Expand Down Expand Up @@ -513,39 +513,43 @@ def prepare_info_matrices_tabix(ref_file, tumor_count_bed, normal_count_bed, rec
else:
col_pos_map = {i: int(round(v / float(ncols) * matrix_width))
for i, v in col_pos_map.items()}
tumor_count_matrix = imresize(
tumor_matrix, (5, matrix_width)).astype(int)
bq_tumor_count_matrix = imresize(
bq_tumor_matrix, (5, matrix_width)).astype(int)
mq_tumor_count_matrix = imresize(
mq_tumor_matrix, (5, matrix_width)).astype(int)
st_tumor_count_matrix = imresize(
st_tumor_matrix, (5, matrix_width)).astype(int)
lsc_tumor_count_matrix = imresize(
lsc_tumor_matrix, (5, matrix_width)).astype(int)
rsc_tumor_count_matrix = imresize(
rsc_tumor_matrix, (5, matrix_width)).astype(int)
tumor_count_matrix = np.array(Image.fromarray(
tumor_matrix).resize((matrix_width, 5), 2)).astype(int)
bq_tumor_count_matrix = np.array(Image.fromarray(
bq_tumor_matrix).resize((matrix_width, 5), 2)).astype(int)
mq_tumor_count_matrix = np.array(Image.fromarray(
mq_tumor_matrix).resize((matrix_width, 5), 2)).astype(int)
st_tumor_count_matrix = np.array(Image.fromarray(
st_tumor_matrix).resize((matrix_width, 5), 2)).astype(int)
lsc_tumor_count_matrix = np.array(Image.fromarray(
lsc_tumor_matrix).resize((matrix_width, 5), 2)).astype(int)
rsc_tumor_count_matrix = np.array(Image.fromarray(
rsc_tumor_matrix).resize((matrix_width, 5), 2)).astype(int)

tag_tumor_count_matrices = []
for iii in range(len(tag_tumor_matrices)):
tag_tumor_count_matrices.append(
imresize(tag_tumor_matrices[iii], (5, matrix_width)).astype(int))
normal_count_matrix = imresize(
normal_matrix, (5, matrix_width)).astype(int)
bq_normal_count_matrix = imresize(
bq_normal_matrix, (5, matrix_width)).astype(int)
mq_normal_count_matrix = imresize(
mq_normal_matrix, (5, matrix_width)).astype(int)
st_normal_count_matrix = imresize(
st_normal_matrix, (5, matrix_width)).astype(int)
lsc_normal_count_matrix = imresize(
lsc_normal_matrix, (5, matrix_width)).astype(int)
rsc_normal_count_matrix = imresize(
rsc_normal_matrix, (5, matrix_width)).astype(int)
np.array(Image.fromarray(tag_tumor_matrices[iii]).resize((matrix_width, 5), 2)).astype(int))

normal_count_matrix = np.array(Image.fromarray(
normal_matrix).resize((matrix_width, 5), 2)).astype(int)
bq_normal_count_matrix = np.array(Image.fromarray(
bq_normal_matrix).resize((matrix_width, 5), 2)).astype(int)
mq_normal_count_matrix = np.array(Image.fromarray(
mq_normal_matrix).resize((matrix_width, 5), 2)).astype(int)
st_normal_count_matrix = np.array(Image.fromarray(
st_normal_matrix).resize((matrix_width, 5), 2)).astype(int)
lsc_normal_count_matrix = np.array(Image.fromarray(
lsc_normal_matrix).resize((matrix_width, 5), 2)).astype(int)
rsc_normal_count_matrix = np.array(Image.fromarray(
rsc_normal_matrix).resize((matrix_width, 5), 2)).astype(int)

tag_normal_count_matrices = []
for iii in range(len(tag_normal_matrices)):
tag_normal_count_matrices.append(
imresize(tag_normal_matrices[iii], (5, matrix_width)).astype(int))
ref_count_matrix = imresize(ref_matrix, (5, matrix_width)).astype(int)
np.array(Image.fromarray(tag_normal_matrices[iii]).resize((matrix_width, 5), 2)).astype(int))
ref_count_matrix = np.array(Image.fromarray(
ref_matrix).resize((matrix_width, 5), 2)).astype(int)

if int(pos) + rcenter[0] not in col_pos_map:
center = min(col_pos_map.values()) + rcenter[0] - 1 + rcenter[1]
Expand Down Expand Up @@ -873,6 +877,7 @@ def find_records(input_record):
records = []
i = 0
anns = {}
fasta_file = pysam.Fastafile(ref_file)
if ensemble_bed:
with open(not_in_ensemble_bed) as ni_f:
for line in skip_empty(ni_f):
Expand Down Expand Up @@ -1033,8 +1038,16 @@ def find_records(input_record):
with open(split_truth_vcf_file, 'r') as vcf_reader:
for line in skip_empty(vcf_reader):
record = line.strip().split()
pos = int(record[1])
if len(record[3]) != len(record[4]) and min(len(record[3]), len(record[4])) > 0 and record[3][0] != record[4][0]:
if pos > 1:
l_base = fasta_file.fetch(
record[0], pos - 2, pos - 1).upper()
record[3] = l_base + record[3]
record[4] = l_base + record[4]
pos -= 1
truth_records.append(
[record[0], int(record[1]), record[3], record[4], str(i)])
[record[0], pos, record[3], record[4], str(i)])
i += 1

truth_bed = get_tmp_file()
Expand Down Expand Up @@ -1071,7 +1084,6 @@ def find_records(input_record):
record_center = {}

chroms_order = get_chromosomes_order(reference=ref_file)
fasta_file = pysam.Fastafile(ref_file)

good_records = {"INS": [], "DEL": [], "SNP": []}
vtype = {}
Expand Down Expand Up @@ -1299,7 +1311,6 @@ def find_records(input_record):
records_r = [records[x] for k, w in good_records.items() for x in w]

N_none = len(none_records_ids)
thread_logger.info("N_none: {} ".format(N_none))
none_records = list(map(lambda x: records[x], none_records_ids))
none_records = sorted(none_records, key=lambda x: [x[0], int(x[1])])

Expand Down
Loading