bioinform · msahraeian · Jul 26, 2019 · Aug 13, 2019 · Aug 13, 2019 · Jan 5, 2020
diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py
@@ -11,6 +11,7 @@
 import shutil
 import pickle
 import multiprocessing
+import copy
 
 import pysam
 import numpy as np
@@ -69,6 +70,11 @@ def call_variants(net, call_loader, out_dir, model_tag, use_cuda):
     for data in loader_:
         (matrices, labels, var_pos_s, var_len_s,
          non_transformed_matrices), (paths) = data
+
+        paths_ = copy.deepcopy(paths)
+        del paths
+        paths = paths_
+
         matrices = Variable(matrices)
         iii += 1
         j += len(paths[0])

diff --git a/neusomatic/python/filter_candidates.py b/neusomatic/python/filter_candidates.py
@@ -3,6 +3,7 @@
 # filter_candidates.py
 # filter raw candidates extracted by 'scan_alignments.py' using min_af and other cut-offs
 #-------------------------------------------------------------------------
+import os
 import argparse
 import traceback
 import logging
@@ -25,6 +26,16 @@ def filter_candidates(candidate_record):
         thread_logger.info(
             "---------------------Filter Candidates---------------------")
 
+        if dbsnp:
+            if not dbsnp.endswith("vcf.gz"):
+                thread_logger.error("Aborting!")
+                raise Exception(
+                    "The dbSNP file should be a tabix indexed file with .vcf.gz format")
+            if not os.path.exists(dbsnp + ".tbi"):
+                thread_logger.error("Aborting!")
+                raise Exception(
+                    "The dbSNP file should be a tabix indexed file with .vcf.gz format. No {}.tbi file exists.".format(dbsnp))
+
         records = {}
         with open(candidates_vcf) as v_f:
             for line in skip_empty(v_f):
@@ -257,47 +268,25 @@ def filter_candidates(candidate_record):
                 final_records.append([chrom, pos - 1, ref, alt, line])
         final_records = sorted(final_records, key=lambda x: x[0:2])
         if dbsnp:
-            filtered_bed = get_tmp_file()
-            intervals = []
-            for x in enumerate(final_records):
-                intervals.append([x[1][0], int(x[1][1]), int(
-                    x[1][1]) + 1, x[1][2], x[1][3], str(x[0])])
-            write_tsv_file(filtered_bed, intervals)
-            filtered_bed = bedtools_sort(
-                filtered_bed, run_logger=thread_logger)
-
-            dbsnp_tmp = get_tmp_file()
-            vcf_2_bed(dbsnp, dbsnp_tmp)
-            bedtools_sort(dbsnp_tmp, output_fn=dbsnp, run_logger=thread_logger)
-            non_in_dbsnp_1 = bedtools_window(
-                filtered_bed, dbsnp, args=" -w 0 -v", run_logger=thread_logger)
-            non_in_dbsnp_2 = bedtools_window(
-                filtered_bed, dbsnp, args=" -w 0", run_logger=thread_logger)
-
-            tmp_ = get_tmp_file()
-            with open(non_in_dbsnp_2) as i_f, open(tmp_, "w") as o_f:
-                for line in skip_empty(i_f):
-                    x = line.strip().split()
-                    if x[1]!=x[7] or x[3]!=x[9] or x[4]!=x[10]:
-                        o_f.write(line)
-            non_in_dbsnp_2 = tmp_
-
-            non_in_dbsnp_ids = []
-            with open(non_in_dbsnp_1) as i_f:
-                for line in skip_empty(i_f):
-                    x = line.strip().split("\t")
-                    non_in_dbsnp_ids.append(int(x[5]))
-            with open(non_in_dbsnp_2) as i_f:
-                for line in skip_empty(i_f):
-                    x = line.strip().split("\t")
-                    non_in_dbsnp_ids.append(int(x[5]))
-            final_records = list(map(lambda x: x[1], filter(
-                lambda x: x[0] in non_in_dbsnp_ids, enumerate(final_records))))
+            dbsnp_tb = pysam.TabixFile(dbsnp)
         with open(filtered_candidates_vcf, "w") as o_f:
             o_f.write("{}\n".format(VCF_HEADER))
             o_f.write(
                 "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")
             for record in final_records:
+                if dbsnp:
+                    chrom, pos, ref, alt = record[0:4]
+                    var_id = "-".join(map(str,[chrom, pos, ref, alt]))
+                    region = "{}:{}-{}".format(chrom, pos, pos + 1)
+                    dbsnp_vars = []
+                    for x in dbsnp_tb.fetch(region=region):
+                        chrom_, pos_, _, ref_, alts_ = x.strip().split("\t")[
+                            0:5]
+                        for alt_ in alts_.split(","):
+                            dbsnp_var_id = "-".join(map(str,[chrom_, pos_, ref_, alt_]))
+                            dbsnp_vars.append(dbsnp_var_id)
+                    if var_id in dbsnp_vars:
+                        continue
                 o_f.write(record[-1] + "\n")
         return filtered_candidates_vcf
 
@@ -320,7 +309,7 @@ def filter_candidates(candidate_record):
     parser.add_argument('--reference', type=str, help='reference fasta filename',
                         required=True)
     parser.add_argument('--dbsnp_to_filter', type=str,
-                        help='dbsnp vcf (will be used to filter candidate variants)', default=None)
+                        help='dbsnp vcf.gz (will be used to filter candidate variants)', default=None)
     parser.add_argument('--good_ao', type=float, help='good alternate count (ignores maf)',
                         default=10)
     parser.add_argument('--min_ao', type=float,

diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
@@ -17,7 +17,7 @@
 
 import numpy as np
 import pysam
-from scipy.misc import imresize
+from PIL import Image
 
 from split_bed import split_region
 from utils import concatenate_vcfs, get_chromosomes_order, run_bedtools_cmd, vcf_2_bed, bedtools_sort, bedtools_window, bedtools_intersect, bedtools_slop, get_tmp_file, skip_empty
@@ -513,39 +513,43 @@ def prepare_info_matrices_tabix(ref_file, tumor_count_bed, normal_count_bed, rec
     else:
         col_pos_map = {i: int(round(v / float(ncols) * matrix_width))
                        for i, v in col_pos_map.items()}
-        tumor_count_matrix = imresize(
-            tumor_matrix, (5, matrix_width)).astype(int)
-        bq_tumor_count_matrix = imresize(
-            bq_tumor_matrix, (5, matrix_width)).astype(int)
-        mq_tumor_count_matrix = imresize(
-            mq_tumor_matrix, (5, matrix_width)).astype(int)
-        st_tumor_count_matrix = imresize(
-            st_tumor_matrix, (5, matrix_width)).astype(int)
-        lsc_tumor_count_matrix = imresize(
-            lsc_tumor_matrix, (5, matrix_width)).astype(int)
-        rsc_tumor_count_matrix = imresize(
-            rsc_tumor_matrix, (5, matrix_width)).astype(int)
+        tumor_count_matrix = np.array(Image.fromarray(
+            tumor_matrix).resize((matrix_width, 5), 2)).astype(int)
+        bq_tumor_count_matrix = np.array(Image.fromarray(
+            bq_tumor_matrix).resize((matrix_width, 5), 2)).astype(int)
+        mq_tumor_count_matrix = np.array(Image.fromarray(
+            mq_tumor_matrix).resize((matrix_width, 5), 2)).astype(int)
+        st_tumor_count_matrix = np.array(Image.fromarray(
+            st_tumor_matrix).resize((matrix_width, 5), 2)).astype(int)
+        lsc_tumor_count_matrix = np.array(Image.fromarray(
+            lsc_tumor_matrix).resize((matrix_width, 5), 2)).astype(int)
+        rsc_tumor_count_matrix = np.array(Image.fromarray(
+            rsc_tumor_matrix).resize((matrix_width, 5), 2)).astype(int)
+
         tag_tumor_count_matrices = []
         for iii in range(len(tag_tumor_matrices)):
             tag_tumor_count_matrices.append(
-                imresize(tag_tumor_matrices[iii], (5, matrix_width)).astype(int))
-        normal_count_matrix = imresize(
-            normal_matrix, (5, matrix_width)).astype(int)
-        bq_normal_count_matrix = imresize(
-            bq_normal_matrix, (5, matrix_width)).astype(int)
-        mq_normal_count_matrix = imresize(
-            mq_normal_matrix, (5, matrix_width)).astype(int)
-        st_normal_count_matrix = imresize(
-            st_normal_matrix, (5, matrix_width)).astype(int)
-        lsc_normal_count_matrix = imresize(
-            lsc_normal_matrix, (5, matrix_width)).astype(int)
-        rsc_normal_count_matrix = imresize(
-            rsc_normal_matrix, (5, matrix_width)).astype(int)
+                np.array(Image.fromarray(tag_tumor_matrices[iii]).resize((matrix_width, 5), 2)).astype(int))
+
+        normal_count_matrix = np.array(Image.fromarray(
+            normal_matrix).resize((matrix_width, 5), 2)).astype(int)
+        bq_normal_count_matrix = np.array(Image.fromarray(
+            bq_normal_matrix).resize((matrix_width, 5), 2)).astype(int)
+        mq_normal_count_matrix = np.array(Image.fromarray(
+            mq_normal_matrix).resize((matrix_width, 5), 2)).astype(int)
+        st_normal_count_matrix = np.array(Image.fromarray(
+            st_normal_matrix).resize((matrix_width, 5), 2)).astype(int)
+        lsc_normal_count_matrix = np.array(Image.fromarray(
+            lsc_normal_matrix).resize((matrix_width, 5), 2)).astype(int)
+        rsc_normal_count_matrix = np.array(Image.fromarray(
+            rsc_normal_matrix).resize((matrix_width, 5), 2)).astype(int)
+
         tag_normal_count_matrices = []
         for iii in range(len(tag_normal_matrices)):
             tag_normal_count_matrices.append(
-                imresize(tag_normal_matrices[iii], (5, matrix_width)).astype(int))
-        ref_count_matrix = imresize(ref_matrix, (5, matrix_width)).astype(int)
+                np.array(Image.fromarray(tag_normal_matrices[iii]).resize((matrix_width, 5), 2)).astype(int))
+        ref_count_matrix = np.array(Image.fromarray(
+            ref_matrix).resize((matrix_width, 5), 2)).astype(int)
 
     if int(pos) + rcenter[0] not in col_pos_map:
         center = min(col_pos_map.values()) + rcenter[0] - 1 + rcenter[1]
@@ -873,6 +877,7 @@ def find_records(input_record):
         records = []
         i = 0
         anns = {}
+        fasta_file = pysam.Fastafile(ref_file)
         if ensemble_bed:
             with open(not_in_ensemble_bed) as ni_f:
                 for line in skip_empty(ni_f):
@@ -1033,8 +1038,16 @@ def find_records(input_record):
         with open(split_truth_vcf_file, 'r') as vcf_reader:
             for line in skip_empty(vcf_reader):
                 record = line.strip().split()
+                pos = int(record[1])
+                if len(record[3]) != len(record[4]) and min(len(record[3]), len(record[4])) > 0 and record[3][0] != record[4][0]:
+                    if pos > 1:
+                        l_base = fasta_file.fetch(
+                            record[0], pos - 2, pos - 1).upper()
+                        record[3] = l_base + record[3]
+                        record[4] = l_base + record[4]
+                        pos -= 1
                 truth_records.append(
-                    [record[0], int(record[1]), record[3], record[4], str(i)])
+                    [record[0], pos, record[3], record[4], str(i)])
                 i += 1
 
         truth_bed = get_tmp_file()
@@ -1071,7 +1084,6 @@ def find_records(input_record):
         record_center = {}
 
         chroms_order = get_chromosomes_order(reference=ref_file)
-        fasta_file = pysam.Fastafile(ref_file)
 
         good_records = {"INS": [], "DEL": [], "SNP": []}
         vtype = {}
@@ -1299,7 +1311,6 @@ def find_records(input_record):
         records_r = [records[x] for k, w in good_records.items() for x in w]
 
         N_none = len(none_records_ids)
-        thread_logger.info("N_none: {} ".format(N_none))
         none_records = list(map(lambda x: records[x], none_records_ids))
         none_records = sorted(none_records, key=lambda x: [x[0], int(x[1])])