From e7bc9342fbfb4869f01ce3072e6ebfd961c0d031 Mon Sep 17 00:00:00 2001
From: Darryl Nousome
Date: Fri, 8 Dec 2023 15:45:07 -0500
Subject: [PATCH] feat: added python cli for SV

---
 bin/add_gt_lofreq.sh                  |  65 +++
 bin/ascat.R                           |  47 +++
 bin/assess_significance.R             |  59 +++
 bin/freec_paired.pl                   |  43 ++
 bin/lofreq_convert.sh                 |  32 ++
 bin/makeGraph.R                       | 134 ++++++
 bin/make_freec_genome.pl              |  39 ++
 bin/run_sequenza.R                    |  65 +++
 bin/sexscript.sh                      |  18 +
 main.nf                               |   5 +-
 modules/local/copynumber.nf           | 425 +++++++++++++++----
 modules/local/structural_variant.nf   | 242 +++++++++--
 modules/local/variant_calling.nf      | 576 ++++++++++++++++++++------
 nextflow.config                       |  42 +-
 subworkflows/local/workflows.nf       | 487 +++++++++++++++-------
 subworkflows/local/workflows_tonly.nf | 242 +++++++----
 wgs-seek_bak                          | 501 ----------------------
 17 files changed, 2087 insertions(+), 935 deletions(-)
 create mode 100755 bin/add_gt_lofreq.sh
 create mode 100644 bin/ascat.R
 create mode 100644 bin/assess_significance.R
 create mode 100644 bin/freec_paired.pl
 create mode 100755 bin/lofreq_convert.sh
 create mode 100644 bin/makeGraph.R
 create mode 100644 bin/make_freec_genome.pl
 create mode 100644 bin/run_sequenza.R
 create mode 100755 bin/sexscript.sh
 delete mode 100755 wgs-seek_bak

diff --git a/bin/add_gt_lofreq.sh b/bin/add_gt_lofreq.sh
new file mode 100755
index 0000000..ac2b373
--- /dev/null
+++ b/bin/add_gt_lofreq.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+#Author: Dr Charles Foster http://github.com/charlesfoster
+
+i_flag=''
+g_flag=''
+h_flag=''
+n_flag=''
+o_flag=''
+
+print_usage() {
+  printf "Usage: bash add_gt_lofreq.sh -i in.vcf.gz [-g genotype] [-n sample_name] -o out.vcf.gz\n"
+  printf "Genotype defaults to 1 if not specified\n"
+  printf "Sample name guessed from infile name if not specified\n"
+}
+
+if [[ $# -eq 0 ]] ; then
+  print_usage
+  exit 1
+fi
+
+while getopts 'i:g:n:ho:' flag; do
+  case "${flag}" in
+    i) IN="${OPTARG}" ;;
+    n) NAME="${OPTARG}" ;;
+    g) GENOTYPE="${OPTARG}" ;;
+    h) print_usage
+       exit 1 ;;
+    o) OUT="${OPTARG}" ;;
+    *) print_usage
+       exit 1 ;;
+  esac
+done
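+
+# Example invocation (illustrative file names, not pipeline outputs):
+#   bash add_gt_lofreq.sh -i tumor.lofreq.vcf.gz -g 0/1 -n Tumor1 -o tumor.lofreq_withGT.vcf.gz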
+
+if [ ! -f ${IN} ]; then
+  printf "\nError: input file not found\n"
+  print_usage
+  exit 1
+fi
+
+if [ -z "${GENOTYPE}" ]
+  then
+    printf "\nNo genotype specified: setting to 1"
+    GENOTYPE=1
+fi
+
+if [ -z "${NAME}" ]
+  then
+    NAME=$(basename ${IN} | cut -f1 -d ".")
+    printf "\nNo name specified: guessed it to be ${NAME}"
+fi
+
+if [ -z "${OUT}" ]
+  then
+    OUT=$(echo ${IN} | sed "s/.vcf.gz/_withGT.vcf.gz/")
+    printf "\nNo outfile specified: setting to ${OUT}\n"
+fi
+
+gunzip -kc ${IN} | \
+sed -e '6i##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">' \
+-e "s|FILTER\tINFO|FILTER\tINFO\tFORMAT\t${NAME}|g" | \
+awk -F'\t' -v genotype=${GENOTYPE} -v OFS="\t" '/^[^#]/{ $9 = "GT"; $10 = genotype }1' | \
+bgzip -c > ${OUT}
+tabix -p vcf ${OUT}
+printf "VCF with artificial genotype written to ${OUT}\n"
+exit 0
diff --git a/bin/ascat.R b/bin/ascat.R
new file mode 100644
index 0000000..334afe2
--- /dev/null
+++ b/bin/ascat.R
@@ -0,0 +1,47 @@
+#!/usr/bin/env Rscript
+
+#######
+#
+#R Script for ASCAT
+#
+######
+library(ASCAT)
+library(RColorBrewer)
+
+args = commandArgs(trailingOnly=TRUE)
+tumor_bam=args[1]
+normal_bam=args[2]
+#sample names (assumed to be passed as the 3rd and 4th arguments)
+tumor_name=args[3]
+normal_name=args[4]
+
+genome="hg38"
+
+ascat.prepareHTS(
+  tumourseqfile = tumor_bam,
+  normalseqfile = normal_bam,
+  tumourname = tumor_name,
+  normalname = normal_name,
+  allelecounter_exe = "/PATH/TO/allelecounter",
+  alleles.prefix = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/ASCAT/G1000_alleles",
+  loci.prefix = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/ASCAT_G1000_loci",
+  gender = "XX",
+  genomeVersion = genome,
+  nthreads = 8,
+  tumourLogR_file = "Tumor_LogR.txt",
+  tumourBAF_file = "Tumor_BAF.txt",
+  normalLogR_file = "Germline_LogR.txt",
+  normalBAF_file = "Germline_BAF.txt")
+
+ascat.bc = ascat.loadData(Tumor_LogR_file = "Tumor_LogR.txt", Tumor_BAF_file = "Tumor_BAF.txt",
+  Germline_LogR_file = "Germline_LogR.txt", Germline_BAF_file = "Germline_BAF.txt", gender = 'XX', genomeVersion = genome)
+
+ascat.plotRawData(ascat.bc, img.prefix = "Before_correction_")
+ascat.bc = ascat.correctLogR(ascat.bc, GCcontentfile = "GC_file.txt", replictimingfile = "RT_file.txt")
+ascat.plotRawData(ascat.bc, img.prefix = "After_correction_")
+ascat.bc = ascat.aspcf(ascat.bc)
+ascat.plotSegmentedData(ascat.bc)
+ascat.output = ascat.runAscat(ascat.bc, gamma=1, write_segments = T)
+QC = ascat.metrics(ascat.bc,ascat.output)
+save(ascat.bc, ascat.output, QC, file = 'ASCAT_objects.Rdata')
+
+
+#####
\ No newline at end of file
diff --git a/bin/assess_significance.R b/bin/assess_significance.R
new file mode 100644
index 0000000..7f7216c
--- /dev/null
+++ b/bin/assess_significance.R
@@ -0,0 +1,59 @@
+#!/usr/bin/env Rscript
+
+library(rtracklayer)
+
+args <- commandArgs()
+
+dataTable <-read.table(args[5], header=TRUE);
+ratio<-data.frame(dataTable)
+
+dataTable <- read.table(args[4], header=FALSE)
+cnvs<- data.frame(dataTable)
+
+ratio$Ratio[which(ratio$Ratio==-1)]=NA
+
+cnvs.bed=GRanges(cnvs[,1],IRanges(cnvs[,2],cnvs[,3]))
+ratio.bed=GRanges(ratio$Chromosome,IRanges(ratio$Start,ratio$Start),score=ratio$Ratio)
+
+overlaps <- subsetByOverlaps(ratio.bed,cnvs.bed)
+normals <- setdiff(ratio.bed,cnvs.bed)
+normals <- subsetByOverlaps(ratio.bed,normals)
+
+#mu <- mean(score(normals),na.rm=TRUE)
+#sigma<- sd(score(normals),na.rm=TRUE)
+
+#hist(score(normals),n=500,xlim=c(0,2))
+#hist(log(score(normals)),n=500,xlim=c(-1,1))
+
+#shapiro.test(score(normals)[which(!is.na(score(normals)))][5001:10000])
+#qqnorm (score(normals)[which(!is.na(score(normals)))],ylim=(c(0,10)))
+#qqline(score(normals)[which(!is.na(score(normals)))], col = 2)
+
+#shapiro.test(log(score(normals))[which(!is.na(score(normals)))][5001:10000])
+#qqnorm (log(score(normals))[which(!is.na(score(normals)))],ylim=(c(-6,10)))
+#qqline(log(score(normals))[which(!is.na(score(normals)))], col = 2)
+
+numberOfCol=length(cnvs)
+wscore=c()
+kscore=c()
+for (i in c(1:length(cnvs[,1]))) {
+values <- score(subsetByOverlaps(ratio.bed,cnvs.bed[i]))
+resultw <- class(try(wilcox.test(values,score(normals)), silent = TRUE))
+ifelse(resultw == "try-error", wscore <- c(wscore, "NA"), wscore <- c(wscore, wilcox.test(values,score(normals))$p.value))
+resultks <- class(try(ks.test(values,score(normals)), silent = TRUE))
+ifelse(resultks == "try-error",kscore <- c(kscore, "NA"),kscore <- c(kscore, ks.test(values,score(normals))$p.value))
+}
+cnvs = cbind(cnvs, as.numeric(wscore), as.numeric(kscore))
+
+if (numberOfCol==7) {
+  names(cnvs)=c("chr","start","end","copy number","status","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")
+}
+if (numberOfCol==9) {
+  names(cnvs)=c("chr","start","end","copy number","status","genotype","uncertainty","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")
+}
+if (numberOfCol==11) {
+  names(cnvs)=c("chr","start","end","copy number","status","genotype","uncertainty","somatic/germline","precentageOfGermline","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")
+}
+write.table(cnvs, file=paste(args[4],".p.value.txt",sep=""),sep="\t",quote=F,row.names=F)
+
+
diff --git a/bin/freec_paired.pl b/bin/freec_paired.pl
new file mode 100644
index 0000000..161e24e
--- /dev/null
+++ b/bin/freec_paired.pl
@@ -0,0 +1,43 @@
+#!/usr/bin/perl -w
+use strict;
+use List::Util 'shuffle';
+
+#INPUT
+
+#my $mergedmaf = $ARGV[1] . '_out/oncotator_out/' . $ARGV[1] . '_merged.maf'; #to fix...
+#open C, ">$mergedmaf";
+
+my $outfile = $ARGV[0] . '/freec_genome_config.txt';
+my $chrLenFile = $ARGV[1];
+my $chrFiles = $ARGV[2];
+my $tumormateFile = $ARGV[3];
+my $controlmateFile = $ARGV[4];
+my $makePileup = $ARGV[5];
+my $fastaFile = $ARGV[6];
+my $SNPfile = $ARGV[7];
+
+open C, ">$outfile";
+
+print C '[general]' . "\n\n";
+
+print C "BedGraphOutput = TRUE\ndegree = 3\nforceGCcontentNormalization = 0\nminCNAlength = 1\nreadCountThreshold = 10\n";
+print C "chrLenFile = $chrLenFile\n";
+print C "ploidy = 2,3,4\nbreakPointThreshold = 0.8\nwindow = 50000\n";
+print C "chrFiles = $chrFiles\n";
+print C "minimalSubclonePresence = 20\nmaxThreads = 8\n";
+print C "outputDir = $ARGV[0]\n\n";
+
+print C '[sample]' . "\n\n";
+
+print C "mateFile = $tumormateFile\n";
+print C "inputFormat = BAM\nmateOrientation = FR\n\n";
+
+#control BAM goes in its own [control] section; [BAF] holds the pileup/SNP settings
+print C '[control]' . "\n\n";
+
+print C "mateFile = $controlmateFile\n";
+print C "inputFormat = BAM\nmateOrientation = FR\n\n";
+
+print C '[BAF]' . "\n\n";
+
+print C "makePileup = $makePileup\n";
+print C "fastaFile = $fastaFile\n";
+print C "minimalCoveragePerPosition = 20\nminimalQualityPerPosition = 20\n";
+print C "SNPfile = $SNPfile";
diff --git a/bin/lofreq_convert.sh b/bin/lofreq_convert.sh
new file mode 100755
index 0000000..1d5edda
--- /dev/null
+++ b/bin/lofreq_convert.sh
@@ -0,0 +1,32 @@
+INPUT_FILE="$1"
+TUMOR_NAME="$2"
+export TUMOR_NAME
+
+zcat "${INPUT_FILE}" \
+  | awk '($4=="A" || $4 == "C" || $4=="T" || $4=="G" || /^\#/)' \
+  | perl -ne 'print if /^#|^(chr)*[\dX]+\s.+/' \
+  | perl -ne 's/AF=/VAF=/g;s/ID=AF/ID=VAF/;print;' \
+  | perl -ne '
+    # Add 2 new rows to the description and 2 new columns in the header
+    if(/^#/){
+        if(/^##INFO=<ID=VAF/){
+            print;
+            print "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n";
+            print "##FORMAT=<ID=VAF,Number=1,Type=Float,Description=\"Variant allele frequency\">\n";
+        } elsif(/^#CHROM/){
+            chomp;
+            print "$_\tFORMAT\t$ENV{TUMOR_NAME}\n";
+        } else {
+            print;
+        }
+    } else {
+        # emit GT and VAF for the single tumor sample
+        chomp;
+        my ($vaf) = /VAF=([^;\t]+)/;
+        print "$_\tGT:VAF\t1:$vaf\n";
+    }' \
+  | bgzip -c
+# compressed VCF is written to stdout
diff --git a/bin/makeGraph.R b/bin/makeGraph.R
new file mode 100644
--- /dev/null
+++ b/bin/makeGraph.R
@@ -0,0 +1,134 @@
+args <- commandArgs()
+
+dataTable <-read.table(args[5], header=TRUE);
+ratio<-data.frame(dataTable)
+ploidy <- type.convert(args[4])
+
+png(filename = paste(args[5],".log2.png",sep = ""), width = 1180, height = 1180,
+    units = "px", pointsize = 20, bg = "white", res = NA)
+plot(1:10)
+op <- par(mfrow = c(5,5))
+
+for (i in c(1:22,'X','Y')) {
+ tt <- which(ratio$Chromosome==i)
+ if (length(tt)>0) {
+  plot(ratio$Start[tt],log2(ratio$Ratio[tt]),xlab = paste ("position, chr",i),ylab = "normalized copy number profile (log2)",pch = ".",col = colors()[88])
+  tt <- which(ratio$Chromosome==i & ratio$CopyNumber>ploidy )
+  points(ratio$Start[tt],log2(ratio$Ratio[tt]),pch = ".",col = colors()[136])
+
+  tt <- which(ratio$Chromosome==i & ratio$CopyNumber<ploidy & ratio$CopyNumber!= -1)
+  points(ratio$Start[tt],log2(ratio$Ratio[tt]),pch = ".",col = colors()[461])
+ }
+}
+dev.off()
+
+png(filename = paste(args[5],".png",sep = ""), width = 1180, height = 1180,
+    units = "px", pointsize = 20, bg = "white", res = NA)
+plot(1:10)
+op <- par(mfrow = c(5,5))
+
+maxLevelToPlot <- 3
+for (i in c(1:length(ratio$Ratio))) {
+ if (ratio$Ratio[i]>maxLevelToPlot) {
+  ratio$Ratio[i]=maxLevelToPlot;
+ }
+}
+
+
+for (i in c(1:22,'X','Y')) {
+ tt <- which(ratio$Chromosome==i)
+ if (length(tt)>0) {
+  plot(ratio$Start[tt],ratio$Ratio[tt]*ploidy,ylim = c(0,maxLevelToPlot*ploidy),xlab = paste ("position, chr",i),ylab = "normalized copy number profile",pch = ".",col = colors()[88])
+  tt <- which(ratio$Chromosome==i & ratio$CopyNumber>ploidy )
+  points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136])
+
+  tt <- which(ratio$Chromosome==i & ratio$Ratio==maxLevelToPlot & ratio$CopyNumber>ploidy)
+  points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136],cex=4)
+
+  tt <- which(ratio$Chromosome==i & ratio$CopyNumber<ploidy & ratio$CopyNumber!= -1)
+  points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[461])
+ }
+}
+dev.off()
+
+
+if (length(args)>5) {
+ dataTable <-read.table(args[6], header=TRUE);
+ BAF<-data.frame(dataTable)
+
+ png(filename = paste(args[6],".png",sep = ""), width = 1180, height = 1180,
+     units = "px", pointsize = 20, bg = "white", res = NA)
+ plot(1:10)
+ op <- par(mfrow = c(5,5))
+
+ for (i in c(1:22,'X','Y')) {
+  tt <- which(BAF$Chromosome==i)
+  if (length(tt)>0){
+   lBAF <-BAF[tt,]
+   plot(lBAF$Position,lBAF$BAF,ylim = c(-0.1,1.1),xlab = paste ("position, chr",i),ylab = "BAF",pch = ".",col = colors()[1])
+
+   tt <- which(lBAF$A==0.5)
+   points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[92])
+   tt <- which(lBAF$A!=0.5 & lBAF$A>=0)
+   points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[62])
+   tt <- 1
+   pres <- 1
+
+   if (length(lBAF$A)>4) {
+    for (j in c(2:(length(lBAF$A)-pres-1))) {
+     if (lBAF$A[j]==lBAF$A[j+pres]) {
+      tt[length(tt)+1] <- j
+     }
+    }
+    points(lBAF$Position[tt],lBAF$A[tt],pch = ".",col = colors()[24],cex=4)
+    points(lBAF$Position[tt],lBAF$B[tt],pch = ".",col = colors()[24],cex=4)
+   }
+
+   tt <- 1
+   pres <- 1
+   if (length(lBAF$FittedA)>4) {
+    for (j in c(2:(length(lBAF$FittedA)-pres-1))) {
+     if (lBAF$FittedA[j]==lBAF$FittedA[j+pres]) {
+      tt[length(tt)+1] <- j
+     }
+    }
+    points(lBAF$Position[tt],lBAF$FittedA[tt],pch = ".",col = colors()[463],cex=4)
+    points(lBAF$Position[tt],lBAF$FittedB[tt],pch = ".",col = colors()[463],cex=4)
+   }
+
+  }
+
+ }
+ dev.off()
+
+}
diff --git a/bin/make_freec_genome.pl b/bin/make_freec_genome.pl
new file mode 100644
index 0000000..28aec4c
--- /dev/null
+++ b/bin/make_freec_genome.pl
@@ -0,0 +1,39 @@
+#!/usr/bin/perl -w
+use strict;
+use List::Util 'shuffle';
+
+#INPUT
+
+#my $mergedmaf = $ARGV[1] . 
'_out/oncotator_out/' . $ARGV[1] . '_merged.maf'; #to fix... +#open C, ">$mergedmaf"; + +my $outfile = $ARGV[0] . '/freec_genome_config.txt'; +my $chrLenFile = $ARGV[1]; +my $chrFiles = $ARGV[2]; +my $tumormateFile = $ARGV[3]; +my $makePileup = $ARGV[4]; +my $fastaFile = $ARGV[5]; +my $SNPfile = $ARGV[6]; + +open C, ">$outfile"; + +print C '[general]' . "\n\n"; + +print C "BedGraphOutput = TRUE\ndegree = 3\nforceGCcontentNormalization = 0\nminCNAlength = 1\nreadCountThreshold = 10\n"; +print C "chrLenFile = $chrLenFile\n"; +print C "ploidy = 2,3,4\nbreakPointThreshold = 0.8\nwindow = 50000\n"; +print C "chrFiles = $chrFiles\n"; +print C "minimalSubclonePresence = 20\nmaxThreads = 4\n"; +print C "outputDir = $ARGV[0]\n\n"; + +print C '[sample]' . "\n\n"; + +print C "mateFile = $tumormateFile\n"; +print C "inputFormat = BAM\nmateOrientation = FR\n\n"; + +print C '[BAF]' . "\n\n"; + +print C "makePileup = $makePileup\n"; +print C "fastaFile = $fastaFile\n"; +print C "minimalCoveragePerPosition = 20\nminimalQualityPerPosition = 20\n"; +print C "SNPfile = $SNPfile"; diff --git a/bin/run_sequenza.R b/bin/run_sequenza.R new file mode 100644 index 0000000..3758b8a --- /dev/null +++ b/bin/run_sequenza.R @@ -0,0 +1,65 @@ + +args = commandArgs(trailingOnly=TRUE) + +library(sequenza) +Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 10) + + +if (length(args)==0) { + stop("Must provide a seqz file") +} else { + seqz_file = args[1] + if (! file.exists(seqz_file)) { + stop(paste0("Can't find this SEQZ output file: ", seqz_file)) + } +} + +if (length(args) > 1) { + out_dir = args[2] +} else { + out_dir = dirname(seqz_file) +} + +if (length(args) > 2) { + sampleid = args[3] +} else { + sampleid = gsub(".seqz.gz","",basename(seqz_file)) +} + +if (length(args) > 3) { + n_cores = as.numeric(args[4]) +} else { + n_cores = 1 +} + +if (is.na(n_cores)) { + n_cores = 1 +} + + +print(paste0("Using ",n_cores," cores...")) +date() +print("Extracting seqz data...") +seqzdata <- sequenza.extract(seqz_file, min.reads = 30, min.reads.normal= 10, parallel=n_cores) + +date() +print("Fitting model...") +CP.example <- sequenza.fit(seqzdata, mc.cores = n_cores) + +## Sequenza.extract seems to fail if too few mutations +num_mutations <- unlist(lapply(seqzdata$mutations, nrow)) +chrom_list <- names(num_mutations)[num_mutations > 3] +## But it might actually be segments, idk? 
+#num_segments <- unlist(lapply(seqzdata$segments, nrow)) +#chrom_list <- names(num_mutations)[num_segments > 1] + +not_included <- setdiff(names(num_mutations), chrom_list) +print("Printing results...") +if (length(not_included) > 0) { + print("Excluding these chromosomes because of too few mutations...") + print(not_included) +} +sequenza.results(sequenza.extract = seqzdata,cp.table = CP.example, sample.id = sampleid, out.dir=out_dir, chromosome.list=chrom_list) + +date() +print("Done") diff --git a/bin/sexscript.sh b/bin/sexscript.sh new file mode 100755 index 0000000..8e94555 --- /dev/null +++ b/bin/sexscript.sh @@ -0,0 +1,18 @@ + #!/bin/bash + + x_map=$(samtools idxstats $1 | grep "chrX\s" | cut -f 3) + x_len=$(samtools idxstats $1 | grep "chrX\s" | cut -f 2) + x_cov=$(echo "scale=3; ${x_map}/${x_len}" | bc) + + y_map=$(samtools idxstats $1 | grep "chrY\s" | cut -f 3) + y_len=$(samtools idxstats $1 | grep "chrY\s" | cut -f 2) + y_cov=$(echo "scale=3; ${y_map}/${y_len}" | bc) + + ratio=$(echo "scale=2; ${x_cov}/${y_cov}" | bc) + echo $ratio + + if (( $(echo "$ratio > 4.00" | bc -l) )); then + echo "F" + else + echo "M" + fi \ No newline at end of file diff --git a/main.nf b/main.nf index 31a753d..4babf3a 100644 --- a/main.nf +++ b/main.nf @@ -22,8 +22,9 @@ include {INPUT_PIPE;TRIM_ALIGN_PIPE; GERMLINE_PIPE;VARIANTCALL_PIPE;INPUT_BAMVC_PIPE;SV_PIPE; QC_PIPE} from "./subworkflows/local/workflows.nf" -include {INPUT_TONLY_PIPE;TRIM_ALIGN_TONLY_PIPE; - VARIANT_TONLY_PIPE;INPUT_TONLY_BAMVC_PIPE;QC_TONLY_PIPE} from "./subworkflows/local/workflows_tonly.nf" +include {INPUT_TONLY; INPUT_TONLY_BAM; + ALIGN_TONLY; + VC_TONLY; SV_TONLY; CNVhuman_tonly; CNVmouse_tonly; QC_TONLY } from "./subworkflows/local/workflows_tonly.nf" log.info """\ diff --git a/modules/local/copynumber.nf b/modules/local/copynumber.nf index 7618bc4..b15a8c2 100644 --- a/modules/local/copynumber.nf +++ b/modules/local/copynumber.nf @@ -1,39 +1,231 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 +GENOMEREF=file(params.genomes[params.genome].genome) +SEQUENZAGC=file(params.genomes[params.genome].SEQUENZAGC) +SEQUENZA_SCRIPT=params.script_sequenza + +if (params.genome=="mm10"){ +FREECLENGTHS=file(params.genomes[params.genome].FREEC.FREECLENGTHS) +FREECCHROMS=file(params.genomes[params.genome].FREEC.FREECCHROMS) +FREECPILEUP=file(params.genomes[params.genome].FREEC.FREECPILEUP) +FREECSNPS = file(params.genomes[params.genome].FREEC.FREECSNPS) +FREECTARGETS=file(params.genomes[params.genome].intervals) +FREECSCRIPT = params.script_freec +FREECPAIR_SCRIPT = params.script_freecpaired +FREECSIGNIFICANCE = params.freec_significance +FREECPLOT = params.freec_plot +} -GENOME=file(params.genome) GERMLINEHET="/data/SCLC-BRAINMETS/cn/copy_number/GermlineHetPon.38.vcf.gz" GCPROFILE='/data/SCLC-BRAINMETS/cn/copy_number/GC_profile.1000bp.38.cnp' DIPLODREG='/data/SCLC-BRAINMETS/cn/copy_number/DiploidRegions.38.bed.gz' ENSEMBLCACHE='/data/SCLC-BRAINMETS/cn/common/ensembl_data' DRIVERS='/data/SCLC-BRAINMETS/cn/common/DriverGenePanel.38.tsv' HOTSPOTS='/data/SCLC-BRAINMETS/cn/variants/KnownHotspots.somatic.38.vcf.gz' -cobalt="/data/SCLC-BRAINMETS/cn/cobalt_v1.14.jar" -amber="/data/SCLC-BRAINMETS/cn/amber-3.9.jar" -purple="/data/SCLC-BRAINMETS/cn/purple_v3.8.2.jar" - -date = new Date().format( 'yyyyMMdd' ) +//DBSNP_INDEL=file(params.genomes[params.genome].KNOWNINDELS) +//ascatR= outdir=file(params.output) +//mm10 Paired-Sequenza, FREEC-tumor only +process seqz_sequenza_bychr { + label 'process_low' -log.info """\ - W G S S E E K P I P E L I N E 
- ============================= - Samples: ${params.sample_sheet} - """ - .stripIndent() + input: + tuple val(pairid), val(tumorname), path(tumor), path(tumorbai), + val(normalname), path(normal), path(normalbai), val(chr) + + output: + tuple val(pairid), path("${tumorname}_${normalname}_${chr}.seqz.gz") + + script: + """ + sequenza-utils bam2seqz \ + -gc ${SEQUENZAGC} \ + -F $GENOMEREF \ + -C ${chr} \ + -n ${normal} \ + -t ${tumor} | gzip > "${tumorname}_${normalname}_${chr}.seqz.gz" + + """ + + stub: + """ + touch "${tumorname}_${normalname}_${chr}.seqz.gz" + """ +} + + + + +process sequenza { + label 'process_highcpu' + publishDir("${outdir}/cnv/sequenza", mode: 'copy') + + input: + tuple val(pairid), path(seqz) + + output: + tuple val(pairid), + path("${pairid}_alternative_solutions.txt"), + path("${pairid}_alternative_fit.pdf"), + path("${pairid}_model_fit.pdf"), + path("${pairid}_confints_CP.txt"), + path("${pairid}_CN_bars.pdf"), + path("${pairid}_genome_view.pdf"), + path("${pairid}_chromosome_view.pdf"), + path("${pairid}_mutations.txt"), + path("${pairid}_segments.txt"), + path("${pairid}_CP_contours.pdf"), + path("${pairid}_sequenza_cp_table.RData"), + path("${pairid}_chromosome_depths.pdf"), + path("${pairid}_gc_plots.pdf"), + path("${pairid}_sequenza_extract.RData") + + //samtools mpileup ${tumor} -f $GENOMEREF -Q 20 |gzip > ${tumorname}.mpileup.gz + //samtools mpileup ${normal} -f $GENOMEREF -Q 20 |gzip > ${normalname}.mpileup.gz + //sequenza-utils seqz_binning --seqz --window 50 -o ${sample}_bin50.seqz.gz + + shell: + ''' + + zcat !{seqz} | awk '{if (NR==1) {print $0} else {if ($1!="chromosome"){print $0}}}' |\ + sequenza-utils seqz_binning \ + -w 100 \ + -s - > !{pairid}.bin100.seqz + + Rscript !{SEQUENZA_SCRIPT} \ + !{pairid}.bin100.seqz \ + . \ + !{pairid} \ + !{task.cpus} + + ''' + + stub: + + """ + touch "${pairid}_alternative_solutions.txt" + touch "${pairid}_alternative_fit.pdf" + touch "${pairid}_model_fit.pdf" + touch "${pairid}_confints_CP.txt" + touch "${pairid}_CN_bars.pdf" + touch "${pairid}_genome_view.pdf" + touch "${pairid}_chromosome_view.pdf" + touch "${pairid}_mutations.txt" + touch "${pairid}_segments.txt" + touch "${pairid}_CP_contours.pdf" + touch "${pairid}_sequenza_cp_table.RData" + touch "${pairid}_chromosome_depths.pdf" + touch "${pairid}_gc_plots.pdf" + touch "${pairid}_sequenza_extract.RData" + + """ + +} + +process freec_paired { + label 'process_highcpu' + publishDir("${outdir}/cnv/freec", mode: 'copy') + + input: + tuple val(tumorname), path(tumor), path(tumorbai), val(normalname), path(normal), path(normalbai) + + shell: """ + + perl $FREECPAIR_SCRIPT \ + . 
\ + $FREECLENGTHS \ + $FREECCHROMS \ + ${tumor} \ + ${normal} \ + $FREECPILEUP \ + $GENOMEREF \ + $FREECSNPS \ + $FREECTARGETS + + freec -conf freec_genome_config.txt + + cat $FREECSIGNIFICANCE | \ + R --slave \ + --args ${tumor}_CNVs \ + ${tumor}_ratio.txt + + cat $FREECPLOT | \ + R --slave \ + --args 2 \ + ${tumorname}_vs_${normalname}.bam_ratio.txt \ + ${tumorname}_vs_${normalname}.bam_BAF.txt + + """ + + stub: + """ + touch ${tumorname}_vs_${normalname}.bam_CNVs.p.value.txt + touch ${tumorname}_vs_${normalname}.bam_ratio.txt + touch ${tumorname}_vs_${normalname}.bam_BAF.txt + touch ${tumorname}_vs_${normalname}.bam_ratio.txt.log2.png + touch ${tumorname}_vs_${normalname}.bam_ratio.txt.png + + """ +} + + +process freec { + label 'process_mid' + publishDir("${outdir}/cnv/freec", mode: 'copy') -//Purple -process amber { - module=["java/12.0.1","R/3.6.3"] - publishDir("${outdir}/amber", mode: 'copy') input: - tuple val(samplename), path("${samplename}.bqsr.bam"), path("${samplename}.bqsr.bam.bai") + tuple val(tumorname), path(tumor), path(tumorbai) + + shell: """ + + perl $FREECSCRIPT \ + . \ + $FREECLENGTHS \ + $FREECCHROMS \ + ${tumor} \ + $FREECPILEUP \ + $GENOMEREF \ + $FREECSNPS \ + $FREECTARGETS + + freec -conf freec_genome_config.txt + + cat $FREECSIGNIFICANCE | \ + R --slave \ + --args ${tumor}_CNVs \ + ${tumor}_ratio.txt + + cat $FREECPLOT | \ + R --slave \ + --args 2 \ + ${tumor}_ratio.txt \ + ${tumor}_BAF.txt + + """ + + stub: + """ + touch ${tumor}_CNVs.p.value.txt + touch ${tumor}_ratio.txt + touch ${tumor}_BAF.txt + touch ${tumor}_ratio.txt.log2.png + touch ${tumor}_ratio.txt.png + + """ +} + + +process amber_tonly { + label 'process_mid' + publishDir("${outdir}/cnv/amber", mode: 'copy') + + input: + tuple val(tumorname), path(tumor), path(tumorbai) + output: - tuple val(samplename), path("${samplename}_amber") + tuple val(tumorname), path("${tumorname}_amber") //path("${samplename}.amber.baf.tsv.gz"), //path("${samplename}.amber.baf.pcf"), //path("${samplename}.amber.qc") @@ -43,10 +235,10 @@ process amber { """ - java -Xmx32G -cp $amber com.hartwig.hmftools.amber.AmberApplication \ - -tumor ${samplename} -tumor_bam ${samplename}.bqsr.bam \ - -output_dir ${samplename}_amber \ - -threads 16 \ + java -Xmx32G -cp amber.jar com.hartwig.hmftools.amber.AmberApplication \ + -tumor ${tumorname} -tumor_bam ${tumor} \ + -output_dir ${tumorname}_amber \ + -threads $task.cpus \ -ref_genome_version V38 \ -loci $GERMLINEHET @@ -55,20 +247,57 @@ process amber { stub: """ - mkdir ${samplename}_amber - touch ${samplename}_amber/${samplename}.amber.baf.tsv.gz ${samplename}_amber/${samplename}.amber.baf.pcf ${samplename}_amber/${samplename}.amber.qc + mkdir ${tumorname}_amber + touch ${tumorname}_amber/${tumorname}.amber.baf.tsv.gz ${tumorname}_amber/${tumorname}.amber.baf.pcf ${tumorname}_amber/${tumorname}.amber.qc """ } -process cobalt { - module=["java/12.0.1","R/3.6.3"] - publishDir("${outdir}/cobalt", mode: 'copy') +process amber_tn { + label 'process_mid' + publishDir("${outdir}/cnv/amber", mode: 'copy') + + input: + tuple val(tumorname), path(tumor), path(tumorbai), + val(normalname), path(normal), path(normalbai) + + output: + tuple val(tumorname), path("${tumorname}_vs_${normalname}_amber") + //path("${samplename}.amber.baf.tsv.gz"), + //path("${samplename}.amber.baf.pcf"), + //path("${samplename}.amber.qc") + //path("${samplename}.amber.contamination.vcf.gz") Contamination maybe only with tumor + + script: + + """ + + java -Xmx32G -cp amber.jar 
com.hartwig.hmftools.amber.AmberApplication \
+    -tumor ${tumorname} -tumor_bam ${tumor} \
+    -reference ${normalname} -reference_bam ${normal} \
+    -output_dir ${tumorname}_vs_${normalname}_amber \
+    -threads $task.cpus \
+    -ref_genome_version V38 \
+    -loci $GERMLINEHET
+
+    """
+
+    stub:
+
+    """
+    mkdir ${tumorname}_vs_${normalname}_amber
+    touch ${tumorname}_vs_${normalname}_amber/${tumorname}.amber.baf.tsv.gz ${tumorname}_vs_${normalname}_amber/${tumorname}.amber.baf.pcf ${tumorname}_vs_${normalname}_amber/${tumorname}.amber.qc
+    """
+}
+
+process cobalt_tonly {
+    label "process_mid"
+    publishDir("${outdir}/cnv/cobalt", mode: 'copy')

     input:
-        tuple val(samplename), path("${samplename}.bqsr.bam"), path("${samplename}.bqsr.bam.bai")
+        tuple val(tumorname), path(tumor), path(tumorbai)

     output:
-        tuple val(samplename), path("${samplename}_cobalt")
+        tuple val(tumorname), path("${tumorname}_cobalt")
         //path("${samplename}/${samplename}.cobalt.ratio.tsv.gz"),
         //path("${samplename}/${samplename}.cobalt.ratio.pcf"),
         //path("${samplename}/${samplename}.cobalt.gc.median.tsv")
@@ -77,88 +306,142 @@

     script:

     """
-    java -jar -Xmx8G $cobalt \
-    -tumor ${samplename} -tumor_bam ${samplename}.bqsr.bam \
-    -output_dir ${samplename}_cobalt \
-    -threads 16 \
+    java -jar -Xmx8G cobalt.jar \
+    -tumor ${tumorname} -tumor_bam ${tumor} \
+    -output_dir ${tumorname}_cobalt \
+    -threads $task.cpus \
     -tumor_only_diploid_bed $DIPLODREG \
-    -gc_profile /data/SCLC-BRAINMETS/cn/copy_number/GC_profile.1000bp.38.cnp
+    -gc_profile $GCPROFILE

     """

     stub:

     """
-    mkdir ${samplename}_cobalt
-    touch ${samplename}_cobalt/${samplename}.cobalt.ratio.tsv.gz ${samplename}_cobalt/${samplename}.cobalt.ratio.pcf ${samplename}_cobalt/${samplename}.cobalt.gc.median.tsv
+    mkdir ${tumorname}_cobalt
+    touch ${tumorname}_cobalt/${tumorname}.cobalt.ratio.tsv.gz ${tumorname}_cobalt/${tumorname}.cobalt.ratio.pcf ${tumorname}_cobalt/${tumorname}.cobalt.gc.median.tsv
     """
 }

-process purple {
-    module=["java/12.0.1","R/3.6.3"]
-
-    publishDir("${outdir}/purple", mode: 'copy')
+process cobalt_tn {
+    label "process_mid"
+    publishDir("${outdir}/cnv/cobalt", mode: 'copy')

     input:
-        tuple val(samplename), path(cobaltin), path(amberin), path("${samplename}.tonly.final.mut2.vcf.gz")
+        tuple val(tumorname), path(tumor), path(tumorbai),
+        val(normalname), path(normal), path(normalbai)

     output:
-        tuple val(samplename), path("${samplename}")
+        tuple val(tumorname), path("${tumorname}_vs_${normalname}_cobalt")
+        //path("${samplename}/${samplename}.cobalt.ratio.tsv.gz"),
+        //path("${samplename}/${samplename}.cobalt.ratio.pcf"),
+        //path("${samplename}/${samplename}.cobalt.gc.median.tsv")

     script:

     """
-    java -jar $purple \
-    -tumor ${samplename} \
+    java -jar -Xmx8G cobalt.jar \
+    -tumor ${tumorname} -tumor_bam ${tumor} \
+    -reference ${normalname} -reference_bam ${normal} \
+    -output_dir ${tumorname}_vs_${normalname}_cobalt \
+    -threads $task.cpus \
+    -tumor_only_diploid_bed $DIPLODREG \
+    -gc_profile $GCPROFILE
+
+    """
+
+    stub:
+
+    """
+    mkdir ${tumorname}_vs_${normalname}_cobalt
+    touch ${tumorname}_vs_${normalname}_cobalt/${tumorname}.cobalt.ratio.tsv.gz ${tumorname}_vs_${normalname}_cobalt/${tumorname}.cobalt.ratio.pcf ${tumorname}_vs_${normalname}_cobalt/${tumorname}.cobalt.gc.median.tsv
+    """
+}
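+
+//PURPLE joins each tumor's AMBER and COBALT output directories with its
+//somatic VCF; e.g. (illustrative sample name) Tumor1_amber/ and
+//Tumor1_cobalt/ as touched in the stubs above.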
+
+
+process purple {
+    label 'process_mid'
+    publishDir("${outdir}/cnv/purple", mode: 'copy')
+
+    input:
+        tuple val(tumorname),
+        path(cobaltin),
+        path(amberin),
+        path(somaticvcf),
+        path(somaticvcfindex)
+
+    output:
+        tuple val(tumorname), path("${tumorname}")
+
+    script:
+
+    """
+    java -jar purple.jar \
+    -tumor ${tumorname} \
     -amber ${amberin} \
     -cobalt ${cobaltin} \
     -gc_profile $GCPROFILE \
     -ref_genome_version 38 \
-    -ref_genome $GENOME \
+    -ref_genome $GENOMEREF \
     -ensembl_data_dir $ENSEMBLCACHE \
-    -somatic_vcf ${samplename}.tonly.final.mut2.vcf.gz \
+    -somatic_vcf ${somaticvcf} \
     -driver_gene_panel $DRIVERS \
     -somatic_hotspots $HOTSPOTS \
-    -output_dir ${samplename}
-
+    -output_dir ${tumorname}
     """

     stub:

     """
-    mkdir ${samplename}
-    touch ${samplename}/${samplename}.purple.cnv.somatic.tsv ${samplename}/${samplename}.purple.cnv.gene.tsv ${samplename}/${samplename}.driver.catalog.somatic.tsv
+    mkdir ${tumorname}
+    touch ${tumorname}/${tumorname}.purple.cnv.somatic.tsv ${tumorname}/${tumorname}.purple.cnv.gene.tsv ${tumorname}/${tumorname}.driver.catalog.somatic.tsv
     """
 }

+/*
+process ascat_tn {
+    module=["java/12.0.1","R/3.6.3"]
+
+    publishDir("${outdir}/purple", mode: 'copy')
+
+    input:
+        tuple val(samplename), path(cobaltin), path(amberin), path("${samplename}.tonly.final.mut2.vcf.gz")
+
+    output:
+        tuple val(samplename), path("${samplename}")
+
+    script:
+
+    """
+    Rscript ${ascatR}
+    """
+
+    stub:
+
+    """
+    touch ${prefix}.after_correction.gc_rt.test.tumour.germline.png
+    touch ${prefix}.after_correction.gc_rt.test.tumour.tumour.png
+    touch ${prefix}.before_correction.test.tumour.germline.png
+    touch ${prefix}.before_correction.test.tumour.tumour.png
+    touch ${prefix}.cnvs.txt
+    touch ${prefix}.metrics.txt
+    touch ${prefix}.normal_alleleFrequencies_chr21.txt
+    touch ${prefix}.normal_alleleFrequencies_chr22.txt
+    touch ${prefix}.purityploidy.txt
+    touch ${prefix}.segments.txt
+    touch ${prefix}.tumour.ASPCF.png
+    touch ${prefix}.tumour.sunrise.png
+    touch ${prefix}.tumour_alleleFrequencies_chr21.txt
+    touch ${prefix}.tumour_alleleFrequencies_chr22.txt
+    touch ${prefix}.tumour_normalBAF.txt
+    touch ${prefix}.tumour_normalLogR.txt
+    touch ${prefix}.tumour_tumourBAF.txt
+    touch ${prefix}.tumour_tumourLogR.txt
+    """
+
+}
+*/
-
-//Workflow
-workflow {
-    sample_sheet=Channel.fromPath(params.sample_sheet, checkIfExists: true).view()
-    .ifEmpty { "sample sheet not found" }
-    .splitCsv(header:true, sep: "\t", strip:true)
-    .map { row -> tuple(
-        row.ID,
-        row.bam,
-        row.vcf
-    )
-    }
-
-    baminput=sample_sheet
-    .map{samplename,bam,vcf-> tuple(samplename,file(bam),file("${bam}.bai"))}
-
-    somaticinput=sample_sheet
-    .map{samplename,bam,vcf-> tuple(samplename,file(vcf))}
-
-    cobalt(baminput)
-    amber(baminput)
-
-    pre1=cobalt.out.join(amber.out)
-    pre1.join(somaticinput).view()| purple
-}
-
diff --git a/modules/local/structural_variant.nf b/modules/local/structural_variant.nf
index 7a92b3f..d807753 100644
--- a/modules/local/structural_variant.nf
+++ b/modules/local/structural_variant.nf
@@ -1,23 +1,18 @@
-GENOME=file(params.genome)
-GENOMEDICT=file(params.genomedict)
-WGSREGION=file(params.wgsregion)
-MILLSINDEL=file(params.millsindel) //Mills_and_1000G_gold_standard.indels.hg38.vcf.gz
-SHAPEITINDEL=file(params.shapeitindel) //ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz
-KGP=file(params.kgp) //1000G_phase1.snps.high_confidence.hg38.vcf.gz"
-DBSNP=file(params.dbsnp) //dbsnp_138.hg38.vcf.gz"
-DBSNP_INDEL=file(params.dbsnp_indel) //dbsnp_138.hg38.vcf.gz"
-BWAGENOME=file(params.bwagenome)
-GNOMAD=file(params.gnomad) //somatic-hg38-af-only-gnomad.hg38.vcf.gz
-PON=file(params.pon)
+GENOMEREF=file(params.genomes[params.genome].genome)
+GENOME=params.genome
+BWAGENOME=file(params.genomes[params.genome].bwagenome)
+DBSNP_INDEL=file(params.genomes[params.genome].KNOWNINDELS) outdir=file(params.output) process svaba_somatic { + label 'process_highcpu' + publishDir(path: "${outdir}/SV/svaba", mode: 'copy') input: - tuple val(tumorname), path(tumor), path(tumorbai),val(normalname), path(normal), path(normalbai) + tuple val(tumorname), path(tumor), path(tumorbai), val(normalname), path(normal), path(normalbai) output: tuple val(tumorname), @@ -46,14 +41,26 @@ process svaba_somatic { """ touch "${tumor.simpleName}.bps.txt.gz" touch "${tumor.simpleName}.contigs.bam" - touch "${tumor.simpleName}.discordants.txt.gz" + touch "${tumor.simpleName}.discordant.txt.gz" touch "${tumor.simpleName}.alignments.txt.gz" + touch "${tumor.simpleName}.svaba.germline.indel.vcf" + touch "${tumor.simpleName}.svaba.germline.sv.vcf" + touch "${tumor.simpleName}.svaba.somatic.indel.vcf" + touch "${tumor.simpleName}.svaba.somatic.sv.vcf" + touch "${tumor.simpleName}.svaba.unfiltered.germline.indel.vcf" + touch "${tumor.simpleName}.svaba.unfiltered.germline.sv.vcf" + touch "${tumor.simpleName}.svaba.unfiltered.somatic.indel.vcf" + touch "${tumor.simpleName}.svaba.unfiltered.somatic.sv.vcf" + touch "${tumor.simpleName}.log" + """ } + process manta_somatic { - //https://github.com/illumina/manta + + label 'process_highcpu' publishDir(path: "${outdir}/SV/manta", mode: 'copy') input: @@ -71,10 +78,10 @@ process manta_somatic { mkdir -p wd configManta.py \ - --normalBam=${normal} \ - --tumorBam=${tumor} \ - --referenceFasta=$GENOME \ - --runDir=wd + --normalBam=${normal} \ + --tumorBam=${tumor} \ + --referenceFasta=$GENOMEREF \ + --runDir=wd wd/runWorkflow.py -m local -j 10 -g 10 @@ -96,30 +103,215 @@ process manta_somatic { } -process annotsv_tn{ +process annotsv_tn { //AnnotSV for Manta/Svaba works with either vcf.gz or .vcf files //Requires bedtools,bcftools + module = ['annotsv/3.3.1'] publishDir(path: "${outdir}/SV/annotated", mode: 'copy') input: - tuple val(tumorname), path(somaticvcf) + tuple val(tumorname), path(somaticvcf), val(sv) output: tuple val(tumorname), - path("${tumor.simpleName}.tsv"), - path("${tumor.simpleName}.unannotated.tsv"), + path("${sv}/${tumorname}.tsv"), + path("${sv}/${tumorname}.unannotated.tsv") script: """ - AnnotSV -SVinputFile ${tumor}.vcf.gz -SVinputInfo 1 -outputFile ${tumor.simpleName} -outputDir . 
+    mkdir ${sv}
+
+    AnnotSV -SVinputFile ${somaticvcf} \
+    -genomeBuild $GENOME \
+    -SVinputInfo 1 -outputFile ${tumorname} \
+    -outputDir ${sv}
+
+    """
+
+    stub:
+    """
+    mkdir ${sv}
+
+    touch "${sv}/${tumorname}.tsv"
+    touch "${sv}/${tumorname}.unannotated.tsv"
+    """
+}
+
+
+process manta_tonly {
+    label 'process_highcpu'
+    publishDir(path: "${outdir}/SV/manta_tonly", mode: 'copy')
+
+    input:
+        tuple val(tumorname), path(tumor), path(tumorbai)
+
+    output:
+        tuple val(tumorname),
+        path("${tumor.simpleName}.candidateSV.vcf.gz"),
+        path("${tumor.simpleName}.candidateSmallIndels.vcf.gz"),
+        path("${tumor.simpleName}.tumorSV.vcf.gz")
+
+
+    script:
+    """
+    mkdir -p wd
+
+    configManta.py \
+        --tumorBam=${tumor} \
+        --referenceFasta=$GENOMEREF \
+        --runDir=wd
+
+    wd/runWorkflow.py -m local -j 10 -g 10
+
+    mv wd/results/variants/candidateSV.vcf.gz ${tumor.simpleName}.candidateSV.vcf.gz
+    mv wd/results/variants/candidateSmallIndels.vcf.gz ${tumor.simpleName}.candidateSmallIndels.vcf.gz
+    mv wd/results/variants/tumorSV.vcf.gz ${tumor.simpleName}.tumorSV.vcf.gz
+
+    """
+
+    stub:
+
+    """
+    touch ${tumor.simpleName}.candidateSV.vcf.gz
+    touch ${tumor.simpleName}.candidateSmallIndels.vcf.gz
+    touch ${tumor.simpleName}.tumorSV.vcf.gz
+
+    """
+}
+
+
+process svaba_tonly {
+    label 'process_highcpu'
+    publishDir(path: "${outdir}/SV/svaba_tonly", mode: 'copy')
+
+    input:
+        tuple val(tumorname), path(tumor), path(tumorbai)
+
+    output:
+        tuple val(tumorname),
+        path("${tumor.simpleName}.bps.txt.gz"),
+        path("${tumor.simpleName}.contigs.bam"),
+        path("${tumor.simpleName}.discordant.txt.gz"),
+        path("${tumor.simpleName}.alignments.txt.gz"),
+        path("${tumor.simpleName}.svaba.indel.vcf"),
+        path("${tumor.simpleName}.svaba.sv.vcf"),
+        path("${tumor.simpleName}.svaba.unfiltered.indel.vcf"),
+        path("${tumor.simpleName}.svaba.unfiltered.sv.vcf"),
+        path("${tumor.simpleName}.log")
+
+
+    script:
+    """
+    svaba run -t ${tumor} -p $task.cpus -D $DBSNP_INDEL -a ${tumor.simpleName} -G $BWAGENOME
+    """
+
+    stub:
+
+    """
+    touch "${tumor.simpleName}.bps.txt.gz"
+    touch "${tumor.simpleName}.contigs.bam"
+    touch "${tumor.simpleName}.discordant.txt.gz"
+    touch "${tumor.simpleName}.alignments.txt.gz"
+    touch "${tumor.simpleName}.svaba.indel.vcf"
+    touch "${tumor.simpleName}.svaba.sv.vcf"
+    touch "${tumor.simpleName}.svaba.unfiltered.indel.vcf"
+    touch "${tumor.simpleName}.svaba.unfiltered.sv.vcf"
+    touch "${tumor.simpleName}.log"
+
+    """
+}
+
+
+process gunzip {
+
+    input:
+        tuple val(tumorname),
+        path(vcf), val(sv)
+
+    output:
+        tuple val(tumorname),
+        path("${tumorname}.tumorSV.vcf"), val(sv)
+
+    script:
+    """
+    #-c keeps the input intact and streams the decompressed VCF to stdout
+    gunzip -c ${vcf} > ${tumorname}.tumorSV.vcf
+    """
+
+    stub:
+
+    """
+    touch ${tumorname}.tumorSV.vcf
+    """
+
+}
+
+
+process survivor_sv {
+    module = ['survivor']
+    publishDir(path: "${outdir}/SV/survivor", mode: 'copy')
+
+    input:
+        tuple val(tumorname),
+        path(vcfs), val(svs)
+
+    output:
+        tuple val(tumorname),
+        path("${tumorname}_merged.vcf"),
+        val("survivor")
+
+
+    script:
+        strin = vcfs.join("\\n")
+
+    """
+    echo -e '$strin' > filelistin
+    SURVIVOR merge filelistin 1000 2 1 1 1 30 ${tumorname}_merged.vcf
+    """
+
+    stub:
+        strin = vcfs.join("\\n")
+    """
+    echo -e '$strin' > filelistin
+    touch "${tumorname}_merged.vcf"
+    """
+}
+
+
+process annotsv_tonly {
+    //AnnotSV for Manta/Svaba works with either vcf.gz or .vcf files
+    //Requires bedtools,bcftools
+
+    module = ['annotsv/3.3.1']
+    publishDir(path: "${outdir}/SV/annotated_tonly", mode: 'copy')
+
input: + tuple val(tumorname), path(somaticvcf), val(sv) + + output: + tuple val(tumorname), + path("${sv}/${tumorname}.tsv"), + path("${sv}/${tumorname}.unannotated.tsv") + + + script: + """ + mkdir ${sv} + + AnnotSV -SVinputFile ${somaticvcf} \ + -genomeBuild $GENOME \ + -SVinputInfo 1 -outputFile ${tumorname} \ + -outputDir ${sv} + + """ + + stub: + """ + mkdir ${sv} + + touch "${sv}/${tumorname}.tsv" + touch "${sv}/${tumorname}.unannotated.tsv" + """ +} \ No newline at end of file diff --git a/modules/local/variant_calling.nf b/modules/local/variant_calling.nf index 807fa77..d7b3bf9 100644 --- a/modules/local/variant_calling.nf +++ b/modules/local/variant_calling.nf @@ -1,99 +1,106 @@ -GENOME=file(params.genome) -GENOMEDICT=file(params.genomedict) -WGSREGION=file(params.wgsregion) -MILLSINDEL=file(params.millsindel) //Mills_and_1000G_gold_standard.indels.hg38.vcf.gz -SHAPEITINDEL=file(params.shapeitindel) //ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz -KGP=file(params.kgp) //1000G_phase1.snps.high_confidence.hg38.vcf.gz" -DBSNP=file(params.dbsnp) //dbsnp_138.hg38.vcf.gz" -GNOMAD=file(params.gnomad) //somatic-hg38-af-only-gnomad.hg38.vcf.gz -PON=file(params.pon) -VEP_CACHEDIR=file(params.vep_cache) +GENOMEREF=file(params.genomes[params.genome].genome) +GENOMEFAI=file(params.genomes[params.genome].genomefai) +GENOMEDICT=file(params.genomes[params.genome].genomedict) +KGPGERMLINE=params.genomes[params.genome].kgp +DBSNP=file(params.genomes[params.genome].dbsnp) +GNOMADGERMLINE=params.genomes[params.genome].gnomad +PON=file(params.genomes[params.genome].pon) +VEPCACHEDIR=file(params.genomes[params.genome].vepcache) +VEPSPECIES=params.genomes[params.genome].vepspecies +VEPBUILD=params.genomes[params.genome].vepbuild +SOMATIC_FOREST=params.genomes[params.genome].octopus_sforest +GERMLINE_FOREST=params.genomes[params.genome].octopus_gforest +LOFREQ_CONVERT=params.lofreq_convert //Output outdir=file(params.output) process mutect2 { + label 'process_somaticcaller' + input: - tuple val(tumorname), path(tumor), path(tumorbai),val(normalname), path(normal), path(normalbai), path(bed) + tuple val(tumorname), path(tumor), path(tumorbai), + val(normalname), path(normal), path(normalbai), + path(bed) output: - tuple val(tumorname), - path("${tumor.simpleName}_${bed.simpleName}.mut2.vcf.gz"), - path("${tumor.simpleName}_${bed.simpleName}.f1r2.tar.gz"), - path("${tumor.simpleName}_${bed.simpleName}.mut2.vcf.gz.stats") + tuple val(tumorname), val(normalname), + path("${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.mut2.vcf.gz"), + path("${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.f1r2.tar.gz"), + path("${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.mut2.vcf.gz.stats") script: - """ gatk Mutect2 \ - --reference ${GENOME} \ + --reference $GENOMEREF \ --intervals ${bed} \ --input ${tumor} \ --input ${normal} \ --normal-sample ${normal.simpleName} \ --tumor-sample ${tumor.simpleName} \ - --germline-resource ${GNOMAD} \ + $GNOMADGERMLINE \ --panel-of-normals ${PON} \ - --output ${tumor.simpleName}_${bed.simpleName}.mut2.vcf.gz \ - --f1r2-tar-gz ${tumor.simpleName}_${bed.simpleName}.f1r2.tar.gz \ + --output ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.mut2.vcf.gz \ + --f1r2-tar-gz ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.f1r2.tar.gz \ --independent-mates """ stub: - """ - touch ${tumor.simpleName}_${bed.simpleName}.mut2.vcf.gz - touch ${tumor.simpleName}_${bed.simpleName}.f1r2.tar.gz - touch 
${tumor.simpleName}_${bed.simpleName}.mut2.vcf.gz.stats + touch ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.mut2.vcf.gz + touch ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.f1r2.tar.gz + touch ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.mut2.vcf.gz.stats """ } process pileup_paired_t { + label 'process_highmem' + input: - tuple val(tumorname), path(tumor), path(tumorbai),val(normalname), path(normal), path(normalbai), path(bed) + tuple val(tumorname), path(tumor), path(tumorbai), + val(normalname), path(normal), path(normalbai), path(bed) output: tuple val(tumorname), path("${tumor.simpleName}_${bed.simpleName}.tumor.pileup.table") script: - """ gatk --java-options -Xmx48g GetPileupSummaries \ -I ${tumor} \ - -V ${KGP} \ + -V $KGPGERMLINE \ -L ${bed} \ -O ${tumor.simpleName}_${bed.simpleName}.tumor.pileup.table """ stub: - """ touch ${tumor.simpleName}_${bed.simpleName}.tumor.pileup.table - """ } process pileup_paired_n { + label 'process_highmem' + input: - tuple val(tumorname), path(tumor), path(tumorbai),val(normalname), path(normal), path(normalbai), path(bed) + tuple val(tumorname), path(tumor), path(tumorbai), + val(normalname), path(normal), path(normalbai), path(bed) output: tuple val(tumorname), path("${tumor.simpleName}_${bed.simpleName}.normal.pileup.table") script: - """ gatk --java-options -Xmx48g GetPileupSummaries \ -I ${normal} \ - -V ${KGP} \ + -V $KGPGERMLINE \ -L ${bed} \ -O ${tumor.simpleName}_${bed.simpleName}.normal.pileup.table @@ -108,10 +115,10 @@ process pileup_paired_n { process contamination_paired { + label 'process_highmem' publishDir(path: "${outdir}/vcfs/mutect2", mode: 'copy') - //OUTPUT THE CONTAMINATION TABLE input: tuple val(tumorname), path(tumor_pileups), @@ -132,11 +139,11 @@ process contamination_paired { """ gatk GatherPileupSummaries \ - --sequence-dictionary ${GENOMEDICT} \ + --sequence-dictionary $GENOMEDICT \ -I ${alltumor} -O ${tumorname}_allpileups.table gatk GatherPileupSummaries \ - --sequence-dictionary ${GENOMEDICT} \ + --sequence-dictionary $GENOMEDICT \ -I ${allnormal} -O ${tumorname}_normal.allpileups.table gatk CalculateContamination \ @@ -161,7 +168,10 @@ process contamination_paired { } + process learnreadorientationmodel { + label 'process_highmem' + publishDir(path: "${outdir}/vcfs/mutect2", mode: 'copy') input: @@ -186,8 +196,9 @@ process learnreadorientationmodel { } - process mergemut2stats { + label 'process_low' + publishDir(path: "${outdir}/vcfs/mutect2", mode: 'copy') input: @@ -213,68 +224,70 @@ process mergemut2stats { } - - process mutect2filter { + label 'process_mid' publishDir(path: "${outdir}/vcfs/mutect2", mode: 'copy') input: - tuple val(sample), path(mutvcfs), path(stats), path(obs), path(pileups), path(normal_pileups),path(tumorcontamination),path(normalcontamination) + tuple val(tumor), val(normal),path(mutvcfs), path(stats), path(obs), + path(pileups), path(normal_pileups),path(tumorcontamination),path(normalcontamination) + output: - tuple val(sample), path("${sample}.mut2.marked.vcf.gz"), - path("${sample}.mut2.norm.vcf.gz"), path("${sample}.marked.vcf.gz.filteringStats.tsv") + tuple val("${tumor}_vs_${normal}"), + path("${tumor}_vs_${normal}.mut2.marked.vcf.gz"), + path("${tumor}_vs_${normal}.mut2.marked.vcf.gz.tbi"), + path("${tumor}_vs_${normal}.mut2.norm.vcf.gz"), path("${tumor}_vs_${normal}.mut2.norm.vcf.gz.tbi"), + path("${tumor}_vs_${normal}.mut2.marked.vcf.gz.filteringStats.tsv") script: - //Include the stats and concat ${mutvcfs} -Oz -o 
${sample}.concat.vcf.gz mut2in = mutvcfs.join(" -I ") - """ - gatk GatherVcfs -I ${mut2in} -O ${sample}.concat.vcf.gz - gatk IndexFeatureFile -I ${sample}.concat.vcf.gz + gatk SortVcf -I ${mut2in} -O ${tumor}_vs_${normal}.concat.vcf.gz --CREATE_INDEX gatk FilterMutectCalls \ - -R ${GENOME} \ - -V ${sample}.concat.vcf.gz \ + -R $GENOMEREF \ + -V ${tumor}_vs_${normal}.concat.vcf.gz \ --ob-priors ${obs} \ --contamination-table ${tumorcontamination} \ --stats ${stats} \ - -O ${sample}.mut2.marked.vcf.gz - - + -O ${tumor}_vs_${normal}.mut2.marked.vcf.gz gatk SelectVariants \ - -R ${GENOME} \ - --variant ${sample}.mut2.marked.vcf.gz \ + -R $GENOMEREF \ + --variant ${tumor}_vs_${normal}.mut2.marked.vcf.gz \ --exclude-filtered \ - --output ${sample}.mut2.final.vcf.gz + --output ${tumor}_vs_${normal}.mut2.final.vcf.gz - bcftools sort ${sample}.mut2.final.vcf.gz -@ 16 -Oz |\ - bcftools norm --threads $task.cpus --check-ref s -f $GENOME -O v |\ + bcftools sort ${tumor}_vs_${normal}.mut2.final.vcf.gz |\ + bcftools norm --threads $task.cpus --check-ref s -f $GENOMEREF -O v |\ awk '{{gsub(/\\y[W|K|Y|R|S|M]\\y/,"N",\$4); OFS = "\\t"; print}}' |\ - sed '/^\$/d' > ${sample}.mut2.norm.vcf.gz + sed '/^\$/d' | bcftools view - -Oz -o ${tumor}_vs_${normal}.mut2.norm.vcf.gz + bcftools index -t ${tumor}_vs_${normal}.mut2.norm.vcf.gz """ stub: """ - touch ${sample}.mut2.marked.vcf.gz - touch ${sample}.mut2.norm.vcf.gz - touch ${sample}.marked.vcf.gz.filteringStats.tsv + touch ${tumor}_vs_${normal}.mut2.marked.vcf.gz ${tumor}_vs_${normal}.mut2.marked.vcf.gz.tbi + touch ${tumor}_vs_${normal}.mut2.norm.vcf.gz ${tumor}_vs_${normal}.mut2.norm.vcf.gz.tbi + touch ${tumor}_vs_${normal}.mut2.marked.vcf.gz.filteringStats.tsv """ } - process strelka_tn { - + label 'process_highcpu' input: - tuple val(tumorname), path(tumor), path(tumorbai), val(normalname), path(normal), path(normalbai), path(bed) + tuple val(tumorname), path(tumor), path(tumorbai), + val(normalname), path(normal), path(normalbai), path(bed) output: - tuple val(tumorname), - path("${tumor.simpleName}_${bed.simpleName}.somatic.snvs.vcf.gz"), - path("${tumor.simpleName}_${bed.simpleName}.somatic.indels.vcf.gz") + tuple val(tumorname), val(normalname), + path("${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.somatic.snvs.vcf.gz"), + path("${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.somatic.snvs.vcf.gz.tbi"), + path("${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.somatic.indels.vcf.gz"), + path("${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.somatic.indels.vcf.gz.tbi") script: @@ -285,62 +298,80 @@ process strelka_tn { tabix ${bed}.gz configureStrelkaSomaticWorkflow.py \ - --ref=${GENOME} \ + --ref=$GENOMEREF \ --tumor=${tumor} \ --normal=${normal} \ --runDir=wd \ --callRegions ${bed}.gz ./wd/runWorkflow.py -m local -j $task.cpus - mv wd/results/variants/somatic.snvs.vcf.gz ${tumor.simpleName}_${bed.simpleName}.somatic.snvs.vcf.gz - mv wd/results/variants/somatic.indels.vcf.gz ${tumor.simpleName}_${bed.simpleName}.somatic.indels.vcf.gz + mv wd/results/variants/somatic.snvs.vcf.gz ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.somatic_temp.snvs.vcf.gz + mv wd/results/variants/somatic.indels.vcf.gz ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.somatic_temp.indels.vcf.gz + + printf "NORMAL\t${normalname}\nTUMOR\t${tumorname}\n" >sampname + + bcftools reheader -s sampname ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.somatic_temp.snvs.vcf.gz \ + | bcftools 
view -Oz -o ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.somatic.snvs.vcf.gz + bcftools reheader -s sampname ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.somatic_temp.indels.vcf.gz \ + | bcftools view -Oz -o ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.somatic.indels.vcf.gz + bcftools index -t ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.somatic.snvs.vcf.gz + bcftools index -t ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.somatic.indels.vcf.gz + """ stub: """ - touch ${tumor.simpleName}_${bed.simpleName}.somatic.snvs.vcf.gz - touch ${tumor.simpleName}_${bed.simpleName}.somatic.indels.vcf.gz + touch ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.somatic.snvs.vcf.gz ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.somatic.snvs.vcf.gz.tbi + touch ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.somatic.indels.vcf.gz ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.somatic.indels.vcf.gz.tbi """ - } process vardict_tn { - + label 'process_highcpu' + input: tuple val(tumorname), path(tumor), path(tumorbai), val(normalname), path(normal), path(normalbai), path(bed) output: - tuple val(tumorname), - path("${tumor.simpleName}_${bed.simpleName}.vardict.vcf") - + tuple val(tumorname), val(normalname), + path("${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.vardict.vcf.gz") + //bcbio notes of vardict filtering var2vcf_paired.pl -P 0.9 -m 4.25 -f 0.01 -M” and + //filtered with “((AF*DP < 6) && ((MQ < 55.0 && NM > 1.0) || (MQ < 60.0 && NM > 2.0) || (DP < 10) || (QUAL < 45)))” script: """ - VarDict -G $GENOME \ + bedtools makewindows -b ${bed} -w 50150 -s 50000 > temp_${bed} + + VarDict -G $GENOMEREF \ -f 0.01 \ --nosv \ - -b "${tumor}|${normal}" \ + -b "${tumor}|${normal}" --fisher \ -t -Q 20 -c 1 -S 2 -E 3 \ - ${bed} --th 6 \ - | teststrandbias.R \ + --th $task.cpus temp_${bed} \ | var2vcf_paired.pl \ -N "${tumor}|${normal}" \ -Q 20 \ -d 10 \ -v 6 \ -S \ - -f 0.05 > ${tumor.simpleName}_${bed.simpleName}.vardict.vcf + -f 0.05 > ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.vardict.vcf + + printf "${normal.Name}\t${normalname}\n${tumor.Name}\t${tumorname}\n" > sampname + + bcftools reheader -s sampname ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.vardict.vcf \ + | bcftools view -Oz -o ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.vardict.vcf.gz + """ stub: """ - touch ${tumor.simpleName}_${bed.simpleName}.vardict.vcf + touch ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.vardict.vcf.gz """ @@ -348,37 +379,185 @@ process vardict_tn { } - process varscan_tn { + label 'process_somaticcaller' + input: tuple val(tumorname), path(tumor), path(tumorbai), val(normalname), path(normal), path(normalbai), path(bed), - path(tumorpileup), path(normalpileup), path(tumor_con_table), path(normal_con_table) + val(tumor1), + path(tumorpileup), path(normalpileup), + path(tumor_con_table), path(normal_con_table) output: - tuple val(tumorname), - path("${tumor.simpleName}_${bed.simpleName}.varscan.vcf") + tuple val(tumorname), val(normalname), + path("${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.varscan.vcf.gz") shell: - ''' tumor_purity=$( echo "1-$(printf '%.6f' $(tail -n -1 !{tumor_con_table} | cut -f2 ))" | bc -l) normal_purity=$( echo "1-$(printf '%.6f' $(tail -n -1 !{normal_con_table} | cut -f2 ))" | bc -l) + dual_pileup="samtools mpileup -d 10000 -q 15 -Q 15 -f !{GENOMEREF} -l !{bed} 
!{normal} !{tumor}" varscan_opts="--strand-filter 1 --min-var-freq 0.01 --min-avg-qual 30 --somatic-p-value 0.05 --output-vcf 1 --normal-purity $normal_purity --tumor-purity $tumor_purity" - varscan somatic < samtools mpileup -d 10000 -q 15 -Q 15 -f !GENOME -l !{bed.simpleName} !{normal} !{rumor} !{tumor.simpleName}_{bed.simpleName}.vardict.vcf $varscan_opts --mpileup 1 + varscan_cmd="varscan somatic <($dual_pileup) !{tumor.simpleName}_vs_!{normal.simpleName}_!{bed.simpleName}.varscan.vcf $varscan_opts --mpileup 1" + eval "$varscan_cmd" + + awk '{{gsub(/\\y[W|K|Y|R|S|M]\\y/,"N",$4); OFS = "\\t"; print}}' !{tumor.simpleName}_vs_!{normal.simpleName}_!{bed.simpleName}.varscan.vcf.indel \ + | sed '/^$/d' | bcftools view - -Oz -o !{tumor.simpleName}_vs_!{normal.simpleName}_!{bed.simpleName}.varscan.indel_temp.vcf.gz + awk '{{gsub(/\\y[W|K|Y|R|S|M]\\y/,"N",$4); OFS = "\\t"; print}}' !{tumor.simpleName}_vs_!{normal.simpleName}_!{bed.simpleName}.varscan.vcf.snp \ + | sed '/^$/d' | bcftools view - -Oz -o !{tumor.simpleName}_vs_!{normal.simpleName}_!{bed.simpleName}.varscan.snp_temp.vcf.gz + + gatk SortVcf -I !{tumor.simpleName}_vs_!{normal.simpleName}_!{bed.simpleName}.varscan.snp_temp.vcf.gz \ + -I !{tumor.simpleName}_vs_!{normal.simpleName}_!{bed.simpleName}.varscan.indel_temp.vcf.gz \ + -R !{GENOMEREF} -SD !{GENOMEDICT} \ + -O !{tumor.simpleName}_vs_!{normal.simpleName}_!{bed.simpleName}_temp.varscan.vcf + + printf "NORMAL\t!{normalname}\nTUMOR\t!{tumorname}\n" > sampname + + bcftools reheader -s sampname !{tumor.simpleName}_vs_!{normal.simpleName}_!{bed.simpleName}_temp.varscan.vcf \ + | bcftools view -Oz -o !{tumor.simpleName}_vs_!{normal.simpleName}_!{bed.simpleName}.varscan.vcf.gz + ''' + stub: + """ + touch ${tumor.simpleName}_vs_${normal.simpleName}_${bed.simpleName}.varscan.vcf.gz + """ + +} + + +process octopus_tn { + //label 'process_highcpu' Using separate docker for octopus + + input: + tuple val(tumorname), path(tumor), path(tumorbai), + val(normalname), path(normal), path(normalbai), path(bed) + + + output: + tuple val("${tumorname}_vs_${normalname}"), + path("${tumorname}_vs_${normalname}_${bed.simpleName}.octopus.vcf.gz") + + script: + + """ + octopus -R $GENOMEREF -I ${normal} ${tumor} --normal-sample ${normalname} \ + -C cancer \ + --annotations AC AD DP -t ${bed} \ + --threads $task.cpus \ + $GERMLINE_FOREST \ + $SOMATIC_FOREST \ + --target-working-memory 64Gb \ + -o ${tumorname}_vs_${normalname}_${bed.simpleName}.octopus.vcf.gz + """ + stub: """ - touch ${tumor.simpleName}_${bed.simpleName}.varscan.vcf + touch "${tumorname}_vs_${normalname}_${bed.simpleName}.octopus.vcf.gz" + """ + +} + + +process lofreq_tn { + label 'process_somaticcaller' + + input: + tuple val(tumorname), path(tumor), path(tumorbai), + val(normalname), path(normal), path(normalbai), path(bed) + + + output: + tuple val(tumorname), val(normalname), + path("${tumorname}_vs_${normalname}_${bed.simpleName}_somatic_final.snvs.vcf.gz"), + path("${tumorname}_vs_${normalname}_${bed.simpleName}_somatic_final_minus-dbsnp.snvs.vcf.gz"), + path("${tumorname}_vs_${normalname}_${bed.simpleName}_somatic_final.indels.vcf.gz"), + path("${tumorname}_vs_${normalname}_${bed.simpleName}_somatic_final_minus-dbsnp.indels.vcf.gz"), + path("${tumorname}_vs_${normalname}_${bed.simpleName}_lofreq.vcf.gz"), + path("${tumorname}_vs_${normalname}_${bed.simpleName}_lofreq.vcf.gz.tbi") + + script: + """ + lofreq somatic -f $GENOMEREF -n ${normal} -t ${tumor} \ + -d $DBSNP \ + --threads $task.cpus \ + -l ${bed} \ + --call-indels \ + -o 
${tumorname}_vs_${normalname}_${bed.simpleName}_ + + bcftools concat ${tumorname}_vs_${normalname}_${bed.simpleName}_somatic_final_minus-dbsnp.snvs.vcf.gz \ + ${tumorname}_vs_${normalname}_${bed.simpleName}_somatic_final_minus-dbsnp.indels.vcf.gz --threads $task.cpus -Oz -o \ + ${tumorname}_vs_${normalname}_${bed.simpleName}_temp_lofreq.vcf.gz + + $LOFREQ_CONVERT -i ${tumorname}_vs_${normalname}_${bed.simpleName}_temp_lofreq.vcf.gz -g 1/0 \ + -n ${tumorname} -o ${tumorname}_vs_${normalname}_${bed.simpleName}_temp1_lofreq.vcf.gz + + bcftools view -h ${tumorname}_vs_${normalname}_${bed.simpleName}_temp1_lofreq.vcf.gz >temphead + + sed 's/^##FORMAT=/##FORMAT=/' temphead > temphead1 + bcftools reheader ${tumorname}_vs_${normalname}_${bed.simpleName}_temp1_lofreq.vcf.gz -h temphead1 |\ + bcftools view -Oz -o ${tumorname}_vs_${normalname}_${bed.simpleName}_lofreq.vcf.gz + + bcftools index -t ${tumorname}_vs_${normalname}_${bed.simpleName}_lofreq.vcf.gz + + """ + + stub: + + """ + touch "${tumorname}_vs_${normalname}_${bed.simpleName}_somatic_final.snvs.vcf.gz" + touch "${tumorname}_vs_${normalname}_${bed.simpleName}_somatic_final_minus-dbsnp.snvs.vcf.gz" + touch "${tumorname}_vs_${normalname}_${bed.simpleName}_somatic_final.indels.vcf.gz" + touch "${tumorname}_vs_${normalname}_${bed.simpleName}_somatic_final_minus-dbsnp.indels.vcf.gz" + touch "${tumorname}_vs_${normalname}_${bed.simpleName}_lofreq.vcf.gz" "${tumorname}_vs_${normalname}_${bed.simpleName}_lofreq.vcf.gz.tbi" + + """ +} -} -process combineVariants { +process muse_tn { + label 'process_somaticcaller' + input: + tuple val(tumorname), path(tumor), path(tumorbai), + val(normalname), path(normal), path(normalbai) + + + output: + tuple val(tumorname), val(normalname), + path("${tumorname}_vs_${normalname}.vcf.gz") + + script: + + """ + MuSE call -f $GENOMEREF -O ${tumorname}_vs_${normalname} -n $task.cpus $tumor $normal + MuSE sump -I ${tumorname}_vs_${normalname}.MuSE.txt \ + -O ${tumorname}_vs_${normalname}.vcf -n $task.cpus -D $DBSNP -G + + bcftools view ${tumorname}_vs_${normalname}.vcf -Oz -o ${tumorname}_vs_${normalname}_temp.vcf.gz + + printf "NORMAL\t${normalname}\nTUMOR\t${tumorname}\n" > sampname + + bcftools reheader -s sampname ${tumorname}_vs_${normalname}_temp.vcf.gz \ + | bcftools view -Oz -o ${tumorname}_vs_${normalname}.vcf.gz + + """ + + stub: + + """ + touch "${tumorname}_vs_${normalname}.vcf.gz" + """ + +} + + +process combineVariants { + label 'process_highmem' publishDir(path: "${outdir}/vcfs/", mode: 'copy') input: @@ -386,22 +565,79 @@ process combineVariants { output: tuple val(sample), - path("${vc}/${sample}.${vc}.marked.vcf.gz"), path("${vc}/${sample}.${vc}.norm.vcf.gz") + path("${vc}/${sample}.${vc}.marked.vcf.gz"), + path("${vc}/${sample}.${vc}.marked.vcf.gz.tbi"), + path("${vc}/${sample}.${vc}.norm.vcf.gz"), + path("${vc}/${sample}.${vc}.norm.vcf.gz.tbi") + + script: + vcfin = inputvcf.join(" -I ") + + """ + mkdir ${vc} + gatk --java-options "-Xmx48g" SortVcf \ + -O ${sample}.${vc}.marked.vcf.gz \ + -SD $GENOMEDICT \ + -I $vcfin + bcftools norm ${sample}.${vc}.marked.vcf.gz -m- --threads $task.cpus --check-ref s -f $GENOMEREF -O v |\ + awk '{{gsub(/\\y[W|K|Y|R|S|M]\\y/,"N",\$4); OFS = "\\t"; print}}' |\ + sed '/^\$/d' > ${sample}.${vc}.temp.vcf + + bcftools view ${sample}.${vc}.temp.vcf -f PASS -Oz -o ${vc}/${sample}.${vc}.norm.vcf.gz + + mv ${sample}.${vc}.marked.vcf.gz ${vc} + mv ${sample}.${vc}.marked.vcf.gz.tbi ${vc} + + bcftools index ${vc}/${sample}.${vc}.norm.vcf.gz -t + """ + + stub: + + """ + mkdir 
${vc} + touch ${vc}/${sample}.${vc}.marked.vcf.gz + touch ${vc}/${sample}.${vc}.norm.vcf.gz + touch ${vc}/${sample}.${vc}.marked.vcf.gz.tbi + touch ${vc}/${sample}.${vc}.norm.vcf.gz.tbi + """ + +} + + + + +process combineVariants_alternative { + label 'process_highmem' + publishDir(path: "${outdir}/vcfs/", mode: 'copy') + + input: + tuple val(sample), path(vcfs), path(vcfsindex), val(vc) + + output: + tuple val(sample), + path("${vc}/${sample}.${vc}.marked.vcf.gz"), + path("${vc}/${sample}.${vc}.marked.vcf.gz.tbi"), + path("${vc}/${sample}.${vc}.norm.vcf.gz"), + path("${vc}/${sample}.${vc}.norm.vcf.gz.tbi") script: - vcfin = inputvcf.join(" ") + vcfin = vcfs.join(" ") """ mkdir ${vc} - bcftools concat $vcfin -Oz -o ${sample}.${vc}.temp.vcf.gz - bcftools sort ${sample}.${vc}.temp.vcf.gz -Oz -o ${sample}.${vc}.marked.vcf.gz - bcftools norm ${sample}.${vc}.marked.vcf.gz --threads $task.cpus --check-ref s -f $GENOME -O v |\ + bcftools concat $vcfin -a -Oz -o ${sample}.${vc}.temp1.vcf.gz + bcftools reheader -f $GENOMEFAI ${sample}.${vc}.temp1.vcf.gz -o ${sample}.${vc}.temp.vcf + bcftools sort ${sample}.${vc}.temp.vcf -Oz -o ${sample}.${vc}.marked.vcf.gz + bcftools norm ${sample}.${vc}.marked.vcf.gz -m- --threads $task.cpus --check-ref s -f $GENOMEREF -O v |\ awk '{{gsub(/\\y[W|K|Y|R|S|M]\\y/,"N",\$4); OFS = "\\t"; print}}' |\ sed '/^\$/d' > ${sample}.${vc}.temp.vcf bcftools view ${sample}.${vc}.temp.vcf -f PASS -Oz -o ${vc}/${sample}.${vc}.norm.vcf.gz mv ${sample}.${vc}.marked.vcf.gz ${vc} + + bcftools index ${vc}/${sample}.${vc}.marked.vcf.gz -t + bcftools index ${vc}/${sample}.${vc}.norm.vcf.gz -t """ stub: @@ -410,7 +646,35 @@ process combineVariants { mkdir ${vc} touch ${vc}/${sample}.${vc}.marked.vcf.gz touch ${vc}/${sample}.${vc}.norm.vcf.gz + touch ${vc}/${sample}.${vc}.marked.vcf.gz.tbi + touch ${vc}/${sample}.${vc}.norm.vcf.gz.tbi + + """ + +} + + +process bcftools_index_octopus { + label 'process_low' + + input: + tuple val(tumor), + path(vcf) + + output: + tuple val(tumor), + path(vcf), + path("${vcf}.tbi") + script: + """ + bcftools index -t ${vcf} + """ + + stub: + """ + touch ${vcf} + touch ${vcf}.tbi """ } @@ -418,79 +682,158 @@ process combineVariants { process combineVariants_strelka { //Concat all somatic snvs/indels across all files, strelka separates snv/indels - + label 'process_mid' publishDir(path: "${outdir}/vcfs/strelka", mode: 'copy') input: - tuple val(sample), path(strelkasnvs), path(strelkaindels) + tuple val(sample), + path(strelkasnvs), path(snvindex), + path(strelkaindels), path(indelindex) output: - tuple val(sample), path("${sample}.strelka.vcf.gz"),path("${sample}.filtered.strelka.vcf.gz") + tuple val(sample), + path("${sample}.strelka.vcf.gz"), path("${sample}.strelka.vcf.gz.tbi"), + path("${sample}.filtered.strelka.vcf.gz"), path("${sample}.filtered.strelka.vcf.gz.tbi") script: - vcfin = strelkavcfs.join(" ") + vcfin = strelkasnvs.join(" ") indelsin = strelkaindels.join(" ") """ - bcftools concat $vcfin $indelsin -Oz -o ${sample}.temp.strelka.vcf.gz - bcftools sort ${sample}.temp.strelka.vcf.gz -Oz -o ${sample}.strelka.vcf.gz + bcftools concat $vcfin $indelsin --threads $task.cpus -Oz -o ${sample}.temp.strelka.vcf.gz -a + bcftools norm ${sample}.temp.strelka.vcf.gz -m- --threads $task.cpus --check-ref s -f $GENOMEREF -O v |\ + awk '{{gsub(/\\y[W|K|Y|R|S|M]\\y/,"N",\$4); OFS = "\\t"; print}}' |\ + sed '/^\$/d' > ${sample}.temp1.strelka.vcf.gz + + bcftools sort ${sample}.temp1.strelka.vcf.gz -Oz -o ${sample}.strelka.vcf.gz bcftools view 
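--version > /dev/null 2>&1 || true  # (editorial no-op so the note below can sit mid-script)
+
+    # NOTE (editorial): `bcftools concat -a` (--allow-overlaps) is what lets the
+    # per-interval SNV and indel chunks be merged even though their coordinate
+    # ranges interleave. Also note ${sample}.temp1.strelka.vcf.gz is written by
+    # sed as plain text despite the .gz suffix; htslib detects the format from
+    # content rather than the file name, so the sort above still reads it.
+
     bcftools view 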
${sample}.strelka.vcf.gz --threads $task.cpus -f PASS -Oz -o ${sample}.filtered.strelka.vcf.gz + bcftools index ${sample}.strelka.vcf.gz -t + bcftools index ${sample}.filtered.strelka.vcf.gz -t """ stub: """ - touch ${sample}.strelka.vcf.gz - touch ${sample}.filtered.strelka.vcf.gz + touch ${sample}.strelka.vcf.gz ${sample}.strelka.vcf.gz.tbi + touch ${sample}.filtered.strelka.vcf.gz ${sample}.filtered.strelka.vcf.gz.tbi + + """ + +} + + +process somaticcombine { + label 'process_mid' + publishDir(path: "${outdir}/vcfs/combined", mode: 'copy') + + input: + tuple val(tumorsample), val(normal), + val(callers), + path(vcfs), path(vcfindex) + + output: + tuple val(tumorsample), val(normal), + path("${tumorsample}_vs_${normal}_combined.vcf.gz"), + path("${tumorsample}_vs_${normal}_combined.vcf.gz.tbi") + + script: + vcfin1=[callers, vcfs].transpose().collect { a, b -> a + " " + b } + vcfin2="-V:" + vcfin1.join(" -V:") """ + java -jar \$DISCVRSeq_JAR MergeVcfsAndGenotypes \ + -R $GENOMEREF \ + --genotypeMergeOption PRIORITIZE \ + --priority_list mutect2,strelka,octopus,muse,lofreq,vardict,varscan \ + --filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED \ + -O ${tumorsample}_vs_${normal}_combined.vcf.gz \ + $vcfin2 + """ + + stub: + vcfin1=[callers, vcfs].transpose().collect { a, b -> a + " " + b } + vcfin2="-V:" + vcfin1.join(" -V:") + """ + touch ${tumorsample}_vs_${normal}_combined.vcf.gz + touch ${tumorsample}_vs_${normal}_combined.vcf.gz.tbi + """ } + process annotvep_tn { publishDir(path: "${outdir}/mafs/", mode: 'copy') input: tuple val(tumorsample), val(normalsample), - val(vc), path(tumorvcf) + val(vc), path(tumorvcf), path(vcfindex) output: - path("paired/${vc}/${tumorsample}.maf") + path("paired/${vc}/${tumorsample}_vs_${normalsample}.maf") shell: - """ - - zcat !{tumorvcf}.vcf.gz > !{tumorvcf}.vcf + ''' + VCF_SAMPLE_IDS=($(bcftools query -l !{tumorvcf})) + TID_IDX=0 + NID_IDX="" + VCF_NID="" + NORM_VCF_ID_ARG="" + NSAMPLES=${#VCF_SAMPLE_IDS[@]} + if [ $NSAMPLES -gt 1 ]; then + # Assign tumor, normal IDs + # Look through column names and + # see if they match provided IDs + for (( i = 0; i < $NSAMPLES; i++ )); do + echo "${VCF_SAMPLE_IDS[$i]}" + if [ "${VCF_SAMPLE_IDS[$i]}" == !{tumorsample} ]; then + TID_IDX=$i + fi + + if [ "${VCF_SAMPLE_IDS[$i]}" == !{normalsample} ]; then + NID_IDX=$i + fi + done + + if [ ! 
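-z "$NID_IDX" ]; then :; fi  # (editorial no-op; the real check continues below)
+
+    # NOTE (editorial): a minimal sketch of the ID matching above, assuming
+    # `bcftools query -l` lists sample columns in VCF column order. For a VCF
+    # whose columns are TUMOR01 and NORMAL01 (hypothetical names), the loop
+    # resolves:
+    #
+    #   VCF_SAMPLE_IDS=(TUMOR01 NORMAL01)
+    #   TID_IDX=0                                   # tumor column index
+    #   NID_IDX=1                                   # normal column index
+    #   NORM_VCF_ID_ARG="--vcf-normal-id NORMAL01"  # handed to vcf2maf below
+    #
+    # so vcf2maf maps genotype columns correctly even when a caller emits the
+    # normal column first.
+        if [ ! 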
-z $NID_IDX ]; then + VCF_NID=${VCF_SAMPLE_IDS[$NID_IDX]} + NORM_VCF_ID_ARG="--vcf-normal-id $VCF_NID" + fi + fi + VCF_TID=${VCF_SAMPLE_IDS[$TID_IDX]} + + zcat !{tumorvcf} > !{tumorvcf.baseName} + + mkdir -p paired/!{vc} vcf2maf.pl \ - --vep-forks 16 --input-vcf ${tumorvcf}.vcf \ - --output-maf !{vc}/!{tumorsample}.maf \ + --vep-forks !{task.cpus} --input-vcf !{tumorvcf.baseName} \ + --output-maf paired/!{vc}/!{tumorsample}_vs_!{normalsample}.maf \ --tumor-id !{tumorsample} \ --normal-id !{normalsample} \ --vep-path /opt/vep/src/ensembl-vep \ - --vep-data $VEP_CACHEDIR \ - --ncbi-build GRCh38 --species homo_sapiens --ref-fasta !{GENOME} + --vep-data !{VEPCACHEDIR} \ + --ncbi-build !{VEPBUILD} --species !{VEPSPECIES} --ref-fasta !{GENOMEREF} \ + --vep-overwrite - """ + ''' stub: """ mkdir -p paired/${vc} - touch paired/${vc}/${tumorsample}.maf + touch paired/${vc}/${tumorsample}_vs_${normalsample}.maf """ } - - process combinemafs_tn { + label 'process_low' publishDir(path: "${outdir}/mafs/paired", mode: 'copy') input: @@ -514,3 +857,4 @@ process combinemafs_tn { """ } + diff --git a/nextflow.config b/nextflow.config index d076bf4..987ca4b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,4 +1,15 @@ +manifest { + name = "CCBR/LOGAN" + author = "CCR Collaborative Bioinformatics Resource" + homePage = "https://github.com/CCBR/LOGAN" + description = "one-line description of LOGAN goes here" + mainScript = "main.nf" +} + +includeConfig 'conf/hg38.config' +includeConfig 'conf/mm10.config' + params { // TODO create a separate genome config, with genome index dir that can change depending on platform. see https://github.com/CCBR/CHAMPAGNE/blob/main/conf/genomes.config genome = "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/bwamem2/Homo_sapiens_assembly38.fasta" // file(params.genome) genomedict= "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/genome/Homo_sapiens_assembly38.dict" @@ -133,7 +144,8 @@ profiles { cacheDir = "$PWD/singularity" envWhitelist='https_proxy,http_proxy,ftp_proxy,DISPLAY,SLURM_JOBID' // TODO refactor to no longer need bind mounts. These paths also only work on biowulf - runOptions = '-B /data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/,/data/nousomedr/,/data/CCBR/projects/,/vf/users/,/gpfs/,/fdb' + runOptions = '-B /gs10,/gs11,/gs12,/gs9,/spin1,/data/CCBR_Pipeliner/,/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/,/data/nousomedr/,/data/CCBR/projects/,/vf/users,/gpfs,/fdb' + } } @@ -344,6 +356,34 @@ env { JULIA_DEPOT_PATH = "/usr/local/share/julia" } +// Capture exit codes from upstream processes when piping +process.shell = ['/bin/bash', '-euo', 'pipefail'] + + //Container options + singularity { + enabled = true + autoMounts = true + cacheDir = "$PWD/singularity" + envWhitelist='https_proxy,http_proxy,ftp_proxy,DISPLAY,SLURM_JOBID' + runOptions = '-B /gs10,/gs11,/gs12,/gs9,/spin1,/data/CCBR_Pipeliner/,/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/,/data/nousomedr/,/data/CCBR/projects/,/vf/users,/gpfs,/fdb' + } + + } +} + +includeConfig 'conf/genomes.config' +includeConfig 'conf/containers.config' + +// Export these variables to prevent local Python/R libraries from conflicting with those in the container +// The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. +// See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. 
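
+// A hedged usage sketch (editorial): with the manifest, genome includes, and
+// container settings above, a typical FASTQ-based launch might look like the
+// following. The profile and flag names are inferred from params referenced in
+// this patch (fastq_input, sample_sheet, genome), so treat them as assumptions:
+//
+//   nextflow run main.nf -profile biowulf \
+//       --genome hg38 \
+//       --fastq_input "fastqs/*_R{1,2}.fastq.gz" \
+//       --sample_sheet pairs.tsv
+//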
+env {
+    PYTHONNOUSERSITE = 1
+    R_PROFILE_USER = "/.Rprofile"
+    R_ENVIRON_USER = "/.Renviron"
+    JULIA_DEPOT_PATH = "/usr/local/share/julia"
+}
+
 // Capture exit codes from upstream processes when piping
 process.shell = ['/bin/bash', '-euo', 'pipefail']
diff --git a/subworkflows/local/workflows.nf b/subworkflows/local/workflows.nf
index d40e6ba..436d677 100644
--- a/subworkflows/local/workflows.nf
+++ b/subworkflows/local/workflows.nf
@@ -1,40 +1,58 @@
-//All Worksflows in One Place
-// TODO split subworkflows out into one per file
-
-// TODO: this line should be moved to within a subworkflow or the main workflow
-intervalbedin = Channel.fromPath(params.intervals,checkIfExists: true,type: 'file')
+//All Workflows in One Place
+intervalbedin = Channel.fromPath(params.genomes[params.genome].intervals,checkIfExists: true,type: 'file')
 
 include {fc_lane; fastq_screen;kraken;qualimap_bamqc;fastqc;
     samtools_flagstats;vcftools;collectvariantcallmetrics;
     bcftools_stats;gatk_varianteval;
     snpeff;
-    somalier_extract;somalier_analysis;multiqc} from '../../modules/local/qc.nf'
+    somalier_extract;somalier_analysis_human;somalier_analysis_mouse;
+    multiqc} from '../../modules/local/qc.nf'
+
+include {fastp; bwamem2; //indelrealign;
+    bqsr; gatherbqsr; applybqsr; samtoolsindex} from '../../modules/local/trim_align.nf'
+
 include {deepvariant_step1;deepvariant_step2;deepvariant_step3;
     deepvariant_combined;glnexus} from '../../modules/local/germline.nf'
-include {fastp; bwamem2; //indelrealign;
-    bqsr; gatherbqsr; applybqsr; samtoolsindex} from '../../modules/local/trim_align.nf'
+
 include {mutect2; mutect2filter; pileup_paired_t; pileup_paired_n;
     contamination_paired; learnreadorientationmodel;mergemut2stats;
     strelka_tn; combineVariants_strelka;
-    varscan_tn; vardict_tn;
-    combineVariants as combineVariants_vardict; combineVariants as combineVariants_varscan;
-    combineVariants as combineVariants_vardict_tonly; combineVariants as combineVariants_varscan_tonly
-    annotvep_tn as annotvep_tn_mut2; annotvep_tn as annotvep_tn_strelka; annotvep_tn as annotvep_tn_varscan; annotvep_tn as annotvep_tn_vardict;
-    combinemafs_tn} from '../../modules/local/variant_calling.nf'
+    varscan_tn; vardict_tn; lofreq_tn; muse_tn;
+    octopus_tn; bcftools_index_octopus; bcftools_index_octopus as bcftools_index_octopus_tonly;
+    combineVariants as combineVariants_vardict; combineVariants as combineVariants_vardict_tonly;
+    combineVariants as combineVariants_varscan; combineVariants as combineVariants_varscan_tonly;
+    combineVariants_alternative as combineVariants_lofreq; combineVariants as combineVariants_muse;
+    combineVariants_alternative as combineVariants_octopus; combineVariants_alternative as combineVariants_octopus_tonly;
+    annotvep_tn as annotvep_tn_mut2; annotvep_tn as annotvep_tn_strelka;
+    annotvep_tn as annotvep_tn_varscan; annotvep_tn as annotvep_tn_vardict; annotvep_tn as annotvep_tn_octopus;
+    annotvep_tn as annotvep_tn_lofreq; annotvep_tn as annotvep_tn_muse;
+    annotvep_tn as annotvep_tn_combined;
+    combinemafs_tn; somaticcombine} from '../../modules/local/variant_calling.nf'
+
 include {mutect2_t_tonly; mutect2filter_tonly;
-    varscan_tonly; vardict_tonly;
+    varscan_tonly; vardict_tonly; octopus_tonly;
    contamination_tumoronly;
    learnreadorientationmodel_tonly;
    mergemut2stats_tonly;
-    annotvep_tonly as annotvep_tonly_varscan; annotvep_tonly as annotvep_tonly_vardict; annotvep_tonly as annotvep_tonly_mut2;
-    combinemafs_tonly} from '../../modules/local/variant_calling_tonly.nf'
-include {svaba_somatic} from 
'../../modules/local/structural_variant.nf' -include {splitinterval} from "../../modules/local/splitbed.nf" + annotvep_tonly as annotvep_tonly_varscan; annotvep_tonly as annotvep_tonly_vardict; + annotvep_tonly as annotvep_tonly_mut2; annotvep_tonly as annotvep_tonly_octopus; + annotvep_tonly as annotvep_tonly_combined; + combinemafs_tonly;somaticcombine_tonly} from '../../modules/local/variant_calling_tonly.nf' +include {svaba_somatic; manta_somatic; + survivor_sv; gunzip; + annotsv_tn as annotsv_survivor_tn + annotsv_tn as annotsv_svaba;annotsv_tn as annotsv_manta} from '../../modules/local/structural_variant.nf' +include {amber_tn; cobalt_tn; purple; + sequenza; seqz_sequenza_bychr; freec; freec_paired } from './copynumber.nf' -workflow INPUT_PIPE { +include {splitinterval} from '../../modules/local/splitbed.nf' + + + +workflow INPUT { if(params.fastq_input){ fastqinput=Channel.fromFilePairs(params.fastq_input) @@ -67,7 +85,7 @@ workflow INPUT_PIPE { } -workflow TRIM_ALIGN_PIPE { +workflow ALIGN { take: fastqinput sample_sheet @@ -93,9 +111,7 @@ workflow TRIM_ALIGN_PIPE { tobqsr=bwamem2.out.combine(gatherbqsr.out,by:0) applybqsr(tobqsr) - //samtoolsindex(applybqsr.out) - - //samtoolsindex.out.view() + //sample_sheet.view() bamwithsample=applybqsr.out.combine(sample_sheet,by:0).map{it.swap(3,0)}.combine(applybqsr.out,by:0).map{it.swap(3,0)} @@ -111,7 +127,7 @@ workflow TRIM_ALIGN_PIPE { bqsrout=applybqsr.out } -workflow GERMLINE_PIPE { +workflow GL { //GERMLINE REQUIRES only BAMBYINTERVAL take: bambyinterval @@ -132,7 +148,7 @@ workflow GERMLINE_PIPE { } -workflow VARIANTCALL_PIPE { +workflow VC { take: //Input is the BAMby interval bamwithsample @@ -140,6 +156,9 @@ workflow VARIANTCALL_PIPE { sample_sheet main: + //Create Pairing for TN (in case of dups) + sample_sheet_paired=sample_sheet|map{tu,no -> tuple ("${tu}_vs_${no}",tu, no)} + bambyinterval=bamwithsample.combine(splitout.flatten()) //Paired Mutect2 @@ -158,148 +177,297 @@ workflow VARIANTCALL_PIPE { pileup_paired_all=pileup_paired_tout.join(pileup_paired_nout) - contamination_paired(pileup_paired_all) - - mut2out_lor=mutect2.out.groupTuple() - .map { samplename,vcfs,f1r2,stats -> tuple( samplename, - f1r2.toSorted{ it -> (it.name =~ /${samplename}_(.*?).f1r2.tar.gz/)[0][1].toInteger() } - )} - - learnreadorientationmodel(mut2out_lor) - - mut2out_mstats=mutect2.out.groupTuple() - .map { samplename,vcfs,f1r2,stats -> tuple( samplename, - stats.toSorted{ it -> (it.name =~ /${samplename}_(.*?).mut2.vcf.gz.stats/)[0][1].toInteger() } - )} - - mergemut2stats(mut2out_mstats) - - allmut2tn=mutect2.out.groupTuple() - .map { samplename,vcfs,f1r2,stats -> tuple( samplename, - vcfs.toSorted{ it -> (it.name =~ /${samplename}_(.*?).mut2.vcf.gz/)[0][1].toInteger() } - )} + contamination_paired(pileup_paired_all) + + //Mutect2 TN + mutect2.out.groupTuple(by:[0,1]) + | multiMap { tumor,normal,vcfs,f1r2,stats -> + mut2out_lor: tuple("${tumor}_vs_${normal}", + f1r2.toSorted{ it -> (it.name =~ /${tumor}_vs_${normal}_(.*?).f1r2.tar.gz/)[0][1].toInteger() } ) + mut2out_mstats: tuple( "${tumor}_vs_${normal}", + stats.toSorted{ it -> (it.name =~ /${tumor}_vs_${normal}_(.*?).mut2.vcf.gz.stats/)[0][1].toInteger() }) + allmut2tn: tuple( "${tumor}_vs_${normal}", + vcfs.toSorted{ it -> (it.name =~ /${tumor}_vs_${normal}_(.*?).mut2.vcf.gz/)[0][1].toInteger() } ) + } + | set{mut2out} - mut2tn_filter=allmut2tn - .join(mergemut2stats.out) - .join(learnreadorientationmodel.out) - .join(contamination_paired.out) - mutect2filter(mut2tn_filter) - - - //Tumor Only 
Calling + learnreadorientationmodel(mut2out.mut2out_lor) + mergemut2stats(mut2out.mut2out_mstats) + + mutect2_in=mut2out.allmut2tn + | join(mergemut2stats.out) + | join(learnreadorientationmodel.out) + | map{t,vcf,stats,ro -> tuple(t.split('_vs_')[0],t.split('_vs_')[1],vcf,stats,ro)} + | join(contamination_paired.out) + | mutect2filter + | join(sample_sheet_paired) + | map{sample,markedvcf,markedindex,normvcf,normindex,stats,tumor,normal -> tuple(tumor,normal,"mutect2",normvcf,normindex)} + + annotvep_tn_mut2(mutect2_in) + + //Mutect2 Tumor Only bambyinterval_t=bambyinterval.map{tumorname,tumor,tumorbai,normalname,normalbam,normalbai,bed ->tuple(tumorname,tumor,tumorbai,bed)} - mutect2_t_tonly(bambyinterval_t) - - //LOR - mut2tout_lor=mutect2_t_tonly.out.groupTuple() - .map { samplename,vcfs,f1r2,stats -> tuple( samplename, - f1r2.toSorted{ it -> (it.name =~ /${samplename}_(.*?).f1r2.tar.gz/)[0][1].toInteger() } - )} - learnreadorientationmodel_tonly(mut2tout_lor) - - - //Stats - mut2tonly_mstats=mutect2_t_tonly.out.groupTuple() - .map { samplename,vcfs,f1r2,stats -> tuple( samplename, - stats.toSorted{ it -> (it.name =~ /${samplename}_(.*?).tonly.mut2.vcf.gz.stats/)[0][1].toInteger() } - )} - mergemut2stats_tonly(mut2tonly_mstats) - + mutect2_t_tonly(bambyinterval_t) + + mutect2_t_tonly.out.groupTuple() + | multiMap { tumor,vcfs,f1r2,stats -> + mut2tout_lor: tuple(tumor, + f1r2.toSorted{ it -> (it.name =~ /${tumor}_(.*?).f1r2.tar.gz/)[0][1].toInteger() } ) + mut2tonly_mstats: tuple( tumor, + stats.toSorted{ it -> (it.name =~ /${tumor}_(.*?).tonly.mut2.vcf.gz.stats/)[0][1].toInteger() }) + allmut2tonly: tuple(tumor, + vcfs.toSorted{ it -> (it.name =~ /${tumor}_(.*?).tonly.mut2.vcf.gz/)[0][1].toInteger() } ) + } + | set{mut2tonlyout} - //Contamination + + learnreadorientationmodel_tonly(mut2tonlyout.mut2tout_lor) + mergemut2stats_tonly(mut2tonlyout.mut2tonly_mstats) contamination_tumoronly(pileup_paired_tout) - //Final TUMOR ONLY FILTER - allmut2tonly=mutect2_t_tonly.out.groupTuple() - .map { samplename,vcfs,f1r2,stats -> tuple( samplename, - vcfs.toSorted{ it -> (it.name =~ /${samplename}_(.*?).tonly.mut2.vcf.gz/)[0][1].toInteger() } - )} + mutect2_in_tonly=mut2tonlyout.allmut2tonly + | join(mergemut2stats_tonly.out) + | join(learnreadorientationmodel_tonly.out) + | join(contamination_tumoronly.out) + | mutect2filter_tonly + | join(sample_sheet) + | map{tumor,markedvcf,markedindex,normvcf,normindex,stats,normal -> tuple(tumor,"mutect2_tonly",normvcf,normindex)} + annotvep_tonly_mut2(mutect2_in_tonly) - mut2tonly_filter=allmut2tonly - .join(mergemut2stats_tonly.out) - .join(learnreadorientationmodel_tonly.out) - .join(contamination_tumoronly.out) - - mutect2filter_tonly(mut2tonly_filter) - - //VCF2maf - mutect2filter.out - .join(sample_sheet) - .map{tumor,markedvcf,finalvcf,stats,normal -> tuple(tumor,normal,"mutect2",finalvcf)} | annotvep_tn_mut2 - - mutect2filter_tonly.out - .join(sample_sheet) - .map{tumor,markedvcf,finalvcf,stats,normal -> tuple(tumor,"mutect2",finalvcf)} | annotvep_tonly_mut2 - - //Strelka - strelka_tn(bambyinterval) - strelkaout=strelka_tn.out.groupTuple() - .map { samplename,vcfs,indels -> tuple( samplename, - vcfs.toSorted{ it -> (it.name =~ /${samplename}_(.*?).somatic.snvs.vcf.gz/)[0][1].toInteger() }, - indels.toSorted{ it -> (it.name =~ /${samplename}_(.*?).somatic.indels.vcf.gz/)[0][1].toInteger() } - )} - combineVariants_strelka(strelkaout) - combineVariants_strelka.out.join(sample_sheet) - .map{tumor,markedvcf,finalvcf,normal -> 
tuple(tumor,normal,"strelka",finalvcf)} | annotvep_tn_strelka - - //Vardict - vardict_comb=vardict_tn(bambyinterval).groupTuple().map{tumor,vcf-> tuple(tumor,vcf,"vardict")} | combineVariants_vardict - vardict_comb.join(sample_sheet) - .map{tumor,marked,normvcf,normal ->tuple(tumor,normal,"vardict",normvcf)} | annotvep_tn_vardict - - //VarDict_tonly - vardict_tonly_comb=bambyinterval.map{tumorname,tumorbam,tumorbai,normname,normbam,normbai,bed -> - tuple(tumorname,tumorbam,tumorbai,bed)} - vardict_tonly(vardict_tonly_comb).groupTuple().map{tumor,vcf-> tuple(tumor,vcf,"vardict_tonly")} |combineVariants_vardict_tonly - combineVariants_vardict_tonly.out.join(sample_sheet) - .map{tumor,marked,normvcf,normal ->tuple(tumor,"vardict_tonly",normvcf)} | annotvep_tonly_vardict - - - //VarScan - varscan_in=bambyinterval.join(contamination_paired.out) - varscan_comb=varscan_tn(varscan_in).groupTuple().map{tumor,vcf-> tuple(tumor,vcf,"varscan")} | combineVariants_varscan - varscan_comb.join(sample_sheet) - .map{tumor,marked,normvcf,normal ->tuple(tumor,normal,"varscan",normvcf)} | annotvep_tn_varscan - - //VarScan_tonly - varscan_tonly_comb=varscan_in.map{tumor,bam,bai,normal,nbam,nbai,bed,tpile,npile,tumorc,normalc -> - tuple(tumor,bam,bai,bed,tpile,tumorc)} | varscan_tonly - varscan_tonly_comb1=varscan_tonly_comb.groupTuple().map{tumor,vcf-> tuple(tumor,vcf,"varscan_tonly")} | combineVariants_varscan_tonly + //Strelka TN + strelka_in=strelka_tn(bambyinterval) | groupTuple(by:[0,1]) + | map { tumor,normal,vcfs,vcfindex,indels,indelindex -> tuple("${tumor}_vs_${normal}", + vcfs.toSorted{ it -> (it.name =~ /${tumor}_vs_${normal}_(.*?).somatic.snvs.vcf.gz/)[0][1].toInteger() },vcfindex, + indels.toSorted{ it -> (it.name =~ /${tumor}_vs_${normal}_(.*?).somatic.indels.vcf.gz/)[0][1].toInteger() } ,indelindex)} + | combineVariants_strelka | join(sample_sheet_paired) + | map{sample,markedvcf,markedindex,finalvcf,finalindex,tumor,normal -> tuple(tumor,normal,"strelka",finalvcf,finalindex)} + annotvep_tn_strelka(strelka_in) + + //Vardict TN + vardict_in=vardict_tn(bambyinterval) | groupTuple(by:[0,1]) + | map{tumor,normal,vcf-> tuple("${tumor}_vs_${normal}",vcf.toSorted{it -> (it.name =~ /${tumor}_vs_${normal}_(.*?).vardict.vcf/)[0][1].toInteger()},"vardict")} + | combineVariants_vardict | join(sample_sheet_paired) + | map{sample,marked,markedindex,normvcf,normindex,tumor,normal ->tuple(tumor,normal,"vardict",normvcf,normindex)} + annotvep_tn_vardict(vardict_in) + + //VarDict TOnly + vardict_in_tonly=bambyinterval + | map{tumorname,tumorbam,tumorbai,normname,normbam,normbai,bed -> + tuple(tumorname,tumorbam,tumorbai,bed)} + | vardict_tonly | groupTuple() + | map{tumor,vcf-> tuple(tumor,vcf.toSorted{it -> (it.name =~ /${tumor}_(.*?).tonly.vardict.vcf/)[0][1].toInteger()},"vardict_tonly")} + | combineVariants_vardict_tonly | join(sample_sheet) + | map{tumor,marked,markedindex,normvcf,normindex,normal ->tuple(tumor,"vardict_tonly",normvcf,normindex)} + annotvep_tonly_vardict(vardict_in_tonly) - varscan_tonly_comb1.join(sample_sheet) - .map{tumor,marked,normvcf,normal ->tuple(tumor,"varscan_tonly",normvcf)} | annotvep_tonly_varscan - + //VarScan TN + varscan_in=bambyinterval.combine(contamination_paired.out) + | varscan_tn | groupTuple(by:[0,1]) + | map{tumor,normal,vcf-> tuple("${tumor}_vs_${normal}",vcf.toSorted{it -> (it.name =~ /${tumor}_vs_${normal}_(.*?).varscan.vcf.gz/)[0][1].toInteger()},"varscan")} + | combineVariants_varscan | join(sample_sheet_paired) + | 
map{sample,marked,markedindex,normvcf,normindex,tumor,normal ->tuple(tumor,normal,"varscan",normvcf,normindex)} + annotvep_tn_varscan(varscan_in) - //Combine All MAFs - annotvep_tn_mut2.out.concat(annotvep_tn_strelka.out).concat(annotvep_tn_vardict.out).concat(annotvep_tn_varscan.out) | combinemafs_tn + //VarScan TOnly + varscan_in_tonly=bambyinterval.combine(contamination_paired.out) + | map{tumor,bam,bai,normal,nbam,nbai,bed,tumorname2,tpile,npile,tumorc,normalc -> + tuple(tumor,bam,bai,bed,tpile,tumorc)} | varscan_tonly | groupTuple() + | map{tumor,vcf-> tuple(tumor,vcf.toSorted{it -> (it.name =~ /${tumor}_(.*?).tonly.varscan.vcf/)[0][1].toInteger()},"varscan_tonly")} + | combineVariants_varscan_tonly + | join(sample_sheet) + | map{tumor,marked,markedindex,normvcf,normindex,normal ->tuple(tumor,"varscan_tonly",normvcf,normindex)} + annotvep_tonly_varscan(varscan_in_tonly) + + //Lofreq TN + lofreq_in=lofreq_tn(bambyinterval) | groupTuple(by:[0,1]) + | map{tu,no,snv,dbsnv,indel,dbindel,vcf,vcfindex-> tuple("${tu}_vs_${no}",vcf.toSorted{it -> (it.name =~ /${tu}_vs_${no}_(.*?)_lofreq.vcf.gz/)[0][1].toInteger()},vcfindex,"lofreq")} + | combineVariants_lofreq | join(sample_sheet_paired) + | map{sample,marked,markedindex,normvcf,normindex,tumor,normal->tuple(tumor,normal,"lofreq",normvcf,normindex)} + annotvep_tn_lofreq(lofreq_in) + + //MuSE TN + muse_in=muse_tn(bamwithsample) + | map{tumor,normal,vcf-> tuple("${tumor}_vs_${normal}",vcf,"muse")} + | combineVariants_muse | join(sample_sheet_paired) + | map{sample,marked,markedindex,normvcf,normindex,tumor,normal ->tuple(tumor,normal,"muse",normvcf,normindex)} + annotvep_tn_muse(muse_in) + + //Octopus TN + octopus_in=octopus_tn(bambyinterval) | bcftools_index_octopus + | groupTuple() + | map{samplename,vcf,vcfindex-> tuple(samplename,vcf.toSorted{it->(it.name =~ /${samplename}_(.*).octopus.vcf.gz/)[0][1].toInteger()},vcfindex,"octopus")} + | combineVariants_octopus + | map{samplename,marked,markedindex,normvcf,normindex -> + tuple(samplename.split('_vs_')[0],samplename.split('_vs_')[1],"octopus",normvcf,normindex)} + annotvep_tn_octopus(octopus_in) + + //Octopus TOnly + octopus_in_tonly=bambyinterval.map{tumor,bam,bai,normal,nbam,nbai,bed-> + tuple(tumor,bam,bai,bed)} | octopus_tonly | bcftools_index_octopus_tonly + | groupTuple() + | map{samplename,vcf,vcfindex->tuple(samplename,vcf.toSorted{it->(it.name =~ /${samplename}_(.*).tonly.octopus.vcf.gz/)[0][1].toInteger()},vcfindex,"octopus_tonly")} + | combineVariants_octopus_tonly + | join(sample_sheet) | + map{tumor,marked,markedindex,normvcf,normindex,normal ->tuple(tumor,"octopus_tonly",normvcf,normindex)} + annotvep_tonly_octopus(octopus_in_tonly) + + //Combine All Variants Using VCF and Then Reannotate + mutect2_in|concat(strelka_in)|concat(octopus_in)|concat(muse_in)|concat(lofreq_in) + | concat(vardict_in) |concat(varscan_in) | groupTuple(by:[0,1]) + | somaticcombine + | map{tumor,normal,vcf,index ->tuple(tumor,normal,"combined",vcf,index)} + | annotvep_tn_combined + + mutect2_in_tonly|concat(octopus_in_tonly) + | concat(vardict_in_tonly)|concat(varscan_in_tonly) | groupTuple() + | somaticcombine_tonly + | map{tumor,vcf,index ->tuple(tumor,"combined_tonly",vcf,index)} + | annotvep_tonly_combined + + //Implement PCGR Annotator/CivIC Next + + emit: + somaticcall_input=octopus_in - annotvep_tonly_mut2.out.concat(annotvep_tonly_vardict.out).concat(annotvep_tonly_varscan.out) | combinemafs_tonly - //Implement Copy Number - //Implement PCGR Annotator/CivIC Next } -workflow SV_PIPE { +workflow SV 
{
    take:
        bamwithsample

    main:
        //Svaba
-        svaba_somatic(bamwithsample)
+        svaba_out=svaba_somatic(bamwithsample)
+            .map{ tumor,bps,contigs,discord,alignments,gindel,gsv,so_indel,so_sv,unfil_gindel,unfil_gsv,unfil_so_indel,unfil_sv,log ->
+                tuple(tumor,so_sv,"svaba")}
+        annotsv_svaba(svaba_out).ifEmpty("Empty SV input--No SV annotated")

        //Manta
+        manta_out=manta_somatic(bamwithsample)
+            .map{tumor,gsv,so_sv,unfil_sv,unfil_indel ->
+                tuple(tumor,so_sv,"manta")}
+        annotsv_manta(manta_out).ifEmpty("Empty SV input--No SV annotated")
+
+        //Delly-WIP
+
+        //Survivor
+        gunzip(manta_out).concat(svaba_out).groupTuple()
+        | survivor_sv | annotsv_survivor_tn | ifEmpty("Empty SV input--No SV annotated")
+
+}
+
+workflow CNVmouse {
+    take:
+        bamwithsample
+
+    main:
+        //Sequenza (Preferred for Paired)
+        chrs=Channel.fromList(params.genomes[params.genome].chromosomes)
+        seqzin=bamwithsample.map{tname,tumor,tbai,nname,norm,nbai->
+            tuple("${tname}_${nname}",tname,tumor,tbai,nname,norm,nbai)}
+        seqzin.combine(chrs) | seqz_sequenza_bychr
+        seqz_sequenza_bychr.out.groupTuple()
+            .map{pair, seqz -> tuple(pair, seqz.sort{it.name})}
+            | sequenza
+
+        //FREEC Paired Mode
+        bamwithsample | freec_paired
+
+}
+
+workflow CNVhuman {
+    take:
+        bamwithsample
+        somaticcall_input
+
+    main:
+        //Sequenza
+        chrs=Channel.fromList(params.genomes[params.genome].chromosomes)
+        seqzin=bamwithsample.map{tname,tumor,tbai,nname,norm,nbai->
+            tuple("${tname}_${nname}",tname,tumor,tbai,nname,norm,nbai)}
+        seqzin.combine(chrs) | seqz_sequenza_bychr
+        seqz_sequenza_bychr.out.groupTuple()
+            .map{pair, seqz -> tuple(pair, seqz.sort{it.name})}
+            | sequenza
+
+        //Purple
+        bamwithsample | amber_tn
+        bamwithsample | cobalt_tn
+        purplein=amber_tn.out.join(cobalt_tn.out)
+        purplein.join(somaticcall_input)|
+            map{t1,amber,cobalt,n1,vc,vcf,vcfindex -> tuple(t1,amber,cobalt,vcf,vcfindex)}
+        | purple
+
+}
+
+    /*
+    //baminput=sample_sheet
+    //    .map{samplename,bam,vcf-> tuple(samplename,file(bam),file("${bam}.bai"))}
+
+    //somaticinput=sample_sheet
+    //    .map{samplename,bam,vcf-> tuple(samplename,file(vcf))}
+
+
+    */
+
+
+
+
+workflow QC_NOGL {
+    take:
+        fastqin
+        fastpout
+        applybqsr
+
+    main:
+    //QC Steps
+    fc_lane(fastqin)
+    fastq_screen(fastpout)
+    kraken(fastqin)
+    qualimap_bamqc(applybqsr)
+    samtools_flagstats(applybqsr)
+    fastqc(applybqsr)
+
+    //Somalier
+    somalier_extract(applybqsr)
+    som_in=somalier_extract.out.collect()
+
+    //Prep for MultiQC input
+    fclane_out=fc_lane.out.map{samplename,info->info}.collect()
+    fqs_out=fastq_screen.out.collect()
+
+    kraken_out=kraken.out.map{samplename,taxa,krona -> tuple(taxa,krona)}.collect()
+    qualimap_out=qualimap_bamqc.out.map{genome,rep->tuple(genome,rep)}.collect()
+    fastqc_out=fastqc.out.map{samplename,html,zip->tuple(html,zip)}.collect()
+
+    samtools_flagstats_out=samtools_flagstats.out.collect()
+
+    if(params.genome=="hg38"){
+        somalier_analysis_human(som_in)
+        somalier_analysis_out=somalier_analysis_human.out.collect()
+    }
+    else if(params.genome=="mm10"){
+        somalier_analysis_mouse(som_in)
+        somalier_analysis_out=somalier_analysis_mouse.out.collect()
+    }
+
+    conall=fclane_out.concat(fqs_out,kraken_out,qualimap_out,samtools_flagstats_out,
+        somalier_analysis_out).flatten().toList()
+    multiqc(conall)
}


-workflow QC_PIPE {
+workflow QC_GL {
    take:
        fastqin
        fastpout
        applybqsr
-        glnexusout //GLnexus germline output
-        bcfout //DV germline output
+        glnexusout
+        bcfout

    main:
    //QC Steps
@@ -321,9 +489,19 @@ workflow QC_PIPE {
    //Somalier
    somalier_extract(applybqsr)
    som_in=somalier_extract.out.collect()
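
    // NOTE (editorial): a minimal sketch of how CNVhuman above assembles the
    // PURPLE input, assuming amber_tn.out and cobalt_tn.out each emit
    // tuple(tumorname, tool_output_dir) and somaticcall_input carries
    // tuple(tumor, normal, caller, vcf, tbi) as emitted by the VC workflow:
    //
    //   amber_tn.out                  // [ tumor, amber_dir ]
    //     .join(cobalt_tn.out)        // [ tumor, amber_dir, cobalt_dir ]
    //     .join(somaticcall_input)    // [ tumor, amber_dir, cobalt_dir, normal, caller, vcf, tbi ]
    //     .map{ t1,amber,cobalt,n1,vc,vcf,tbi -> tuple(t1,amber,cobalt,vcf,tbi) }
    //     | purple
    //
    // Every channel is keyed by the tumor sample name, so join() keeps the
    // per-tool outputs in register even when processes finish out of order.
-    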
somalier_analysis(som_in) + + //Prep for MultiQC input + if(params.genome=="hg38"){ + somalier_analysis_human(som_in) + somalier_analysis_out=somalier_analysis_human.out.collect() + } + else if(params.genome=="mm10"){ + somalier_analysis_mouse(som_in) + somalier_analysis_out=somalier_analysis_mouse.out.collect() + } + fclane_out=fc_lane.out.map{samplename,info->info}.collect() fqs_out=fastq_screen.out.collect() @@ -333,10 +511,9 @@ workflow QC_PIPE { samtools_flagstats_out=samtools_flagstats.out.collect() bcftools_stats_out= bcftools_stats.out.collect() gatk_varianteval_out= gatk_varianteval.out.collect() - snpeff_out=snpeff.out.collect()//map{vcf,csv,html->vcf,csv,html}.collect() + snpeff_out=snpeff.out.collect() vcftools_out=vcftools.out - collectvariantcallmetrics_out=collectvariantcallmetrics.out//.map{details,summary->details,summary} - somalier_analysis_out=somalier_analysis.out.collect() + collectvariantcallmetrics_out=collectvariantcallmetrics.out conall=fclane_out.concat(fqs_out,kraken_out,qualimap_out,samtools_flagstats_out,bcftools_stats_out, gatk_varianteval_out,snpeff_out,vcftools_out,collectvariantcallmetrics_out,somalier_analysis_out).flatten().toList() @@ -345,7 +522,7 @@ workflow QC_PIPE { //Variant Calling from BAM only -workflow INPUT_BAMVC_PIPE { +workflow INPUT_BAM { if(params.sample_sheet){ sample_sheet=Channel.fromPath(params.sample_sheet, checkIfExists: true) @@ -360,13 +537,28 @@ workflow INPUT_BAMVC_PIPE { //Either BAM Input or File sheet input if(params.bam_input){ - baminputonly=Channel.fromPath(params.bam_input) + //Check if Index is .bai or .bam.bai + bambai=params.bam_input +".bai" + baionly = bambai.replace(".bam", "") + bamcheck1=file(bambai) + bamcheck2=file(baionly) + + if (bamcheck1.size()>0){ + baminputonly=Channel.fromPath(params.bam_input) .map{it-> tuple(it.simpleName,it,file("${it}.bai"))} + } + if (bamcheck2.size()>0){ + bai=Channel.from(bamcheck2).map{it -> tuple(it.simpleName,it)}.view() + baminputonly=Channel.fromPath(params.bam_input) + .map{it-> tuple(it.simpleName,it)} + .join(bai) + } + }else if(params.file_input) { baminputonly=Channel.fromPath(params.file_input) .splitCsv(header: false, sep: "\t", strip:true) - .map{ sample,bam -> - tuple(sample, file(bam),file("${bam}.bai")) + .map{ sample,bam,bai -> + tuple(sample, file(bam),file(bai)) } } @@ -382,3 +574,4 @@ workflow INPUT_BAMVC_PIPE { } + diff --git a/subworkflows/local/workflows_tonly.nf b/subworkflows/local/workflows_tonly.nf index 854444c..07c274c 100644 --- a/subworkflows/local/workflows_tonly.nf +++ b/subworkflows/local/workflows_tonly.nf @@ -2,36 +2,53 @@ // TODO split subworkflows out into one per file // TODO: this line should be moved to within a subworkflow or the main workflow -intervalbedin = Channel.fromPath(params.intervals,checkIfExists: true,type: 'file') +intervalbedin = Channel.fromPath(params.genomes[params.genome].intervals,checkIfExists: true,type: 'file') + include {fc_lane; fastq_screen;kraken;qualimap_bamqc; samtools_flagstats;vcftools;collectvariantcallmetrics; bcftools_stats;gatk_varianteval; snpeff;fastqc; - somalier_extract;somalier_analysis;multiqc} from '../../modules/local/qc.nf' + somalier_extract;somalier_analysis_human;somalier_analysis_mouse; + multiqc} from '../../modules/local/qc.nf' + include {deepvariant_step1;deepvariant_step2;deepvariant_step3; deepvariant_combined;glnexus} from '../../modules/local/germline.nf' + include {fastp; bwamem2; bqsr; gatherbqsr; applybqsr; samtoolsindex} from '../../modules/local/trim_align.nf' + include {mutect2; 
mutect2filter; pileup_paired_t; pileup_paired_n; - contamination_paired; learnreadorientationmodel;mergemut2stats; + bcftools_index_octopus; + contamination_paired; learnreadorientationmodel; mergemut2stats; combineVariants as combineVariants_vardict; combineVariants as combineVariants_varscan; - combineVariants as combineVariants_vardict_tonly; combineVariants as combineVariants_varscan_tonly + combineVariants as combineVariants_vardict_tonly; combineVariants as combineVariants_varscan_tonly; + combineVariants_alternative ; annotvep_tn as annotvep_tn_mut2; annotvep_tn as annotvep_tn_strelka; annotvep_tn as annotvep_tn_varscan; annotvep_tn as annotvep_tn_vardict; combinemafs_tn} from '../../modules/local/variant_calling.nf' -include {mutect2_t_tonly; mutect2filter_tonly; + +include {mutect2_t_tonly; mutect2filter_tonly; pileup_paired_tonly; varscan_tonly; vardict_tonly; + octopus_tonly; contamination_tumoronly; learnreadorientationmodel_tonly; mergemut2stats_tonly; - annotvep_tonly as annotvep_tonly_varscan; annotvep_tonly as annotvep_tonly_vardict; annotvep_tonly as annotvep_tonly_mut2; - combinemafs_tonly} from '../../modules/local/variant_calling_tonly.nf' -include {splitinterval} from "../../modules/local/splitbed.nf" + annotvep_tonly as annotvep_tonly_varscan; annotvep_tonly as annotvep_tonly_vardict; + annotvep_tonly as annotvep_tonly_mut2; annotvep_tonly as annotvep_tonly_octopus; + annotvep_tonly as annotvep_tonly_combined; + combinemafs_tonly; somaticcombine_tonly} from '../../modules/local/variant_calling_tonly.nf' +include {manta_tonly; svaba_tonly; survivor_sv; gunzip; +annotsv_tonly as annotsv_manta_tonly; annotsv_tonly as annotsv_svaba_tonly; +annotsv_tonly as annotsv_survivor_tonly} from './structural_variant.nf' +include {freec; amber_tonly; cobalt_tonly; purple } from './copynumber.nf' -workflow INPUT_TONLY_PIPE { +include {splitinterval} from '../../modules/local/splitbed.nf' + + +workflow INPUT_TONLY { if(params.fastq_input){ fastqinput=Channel.fromFilePairs(params.fastq_input) @@ -66,7 +83,7 @@ workflow INPUT_TONLY_PIPE { } -workflow TRIM_ALIGN_TONLY_PIPE { +workflow ALIGN_TONLY { take: fastqinput sample_sheet @@ -90,9 +107,7 @@ workflow TRIM_ALIGN_TONLY_PIPE { tobqsr=bwamem2.out.combine(gatherbqsr.out,by:0) applybqsr(tobqsr) - //samtoolsindex(applybqsr.out) - //samtoolsindex.out.view() - //bamwithsample=samtoolsindex.out.join(sample_sheet).map{it.swap(3,0)}.join(samtoolsindex.out).map{it.swap(3,0)} + bamwithsample=applybqsr.out.join(sample_sheet) .map{samplename,tumor,tumorbai -> tuple( samplename,tumor,tumorbai) } @@ -104,14 +119,13 @@ workflow TRIM_ALIGN_TONLY_PIPE { fastpout=fastp.out fastqin=fastqinput splitout=splitinterval.out - //indelbambyinterval bqsrbambyinterval sample_sheet bqsrout=applybqsr.out } -workflow VARIANT_TONLY_PIPE { +workflow VC_TONLY { take: //Input is the BAMby interval bamwithsample @@ -129,67 +143,131 @@ workflow VARIANT_TONLY_PIPE { mutect2_t_tonly(bambyinterval) - //LOR - mut2tout_lor=mutect2_t_tonly.out.groupTuple() - .map { samplename,vcfs,f1r2,stats -> tuple( samplename, - f1r2.toSorted{ it -> (it.name =~ /${samplename}_(.*?).f1r2.tar.gz/)[0][1].toInteger() } - )} - learnreadorientationmodel_tonly(mut2tout_lor) - - - //Stats - mut2tonly_mstats=mutect2_t_tonly.out.groupTuple() - .map { samplename,vcfs,f1r2,stats -> tuple( samplename, - stats.toSorted{ it -> (it.name =~ /${samplename}_(.*?).tonly.mut2.vcf.gz.stats/)[0][1].toInteger() } - )} - mergemut2stats_tonly(mut2tonly_mstats) + mutect2_t_tonly.out.groupTuple() + | multiMap { 
tumor,vcfs,f1r2,stats -> + mut2tout_lor: tuple(tumor, + f1r2.toSorted{ it -> (it.name =~ /${tumor}_(.*?).f1r2.tar.gz/)[0][1].toInteger() } ) + mut2tonly_mstats: tuple( tumor, + stats.toSorted{ it -> (it.name =~ /${tumor}_(.*?).tonly.mut2.vcf.gz.stats/)[0][1].toInteger() }) + allmut2tonly: tuple(tumor, + vcfs.toSorted{ it -> (it.name =~ /${tumor}_(.*?).tonly.mut2.vcf.gz/)[0][1].toInteger() } ) + } + | set{mut2tonlyout} + - //Contamination + learnreadorientationmodel_tonly(mut2tonlyout.mut2tout_lor) + mergemut2stats_tonly(mut2tonlyout.mut2tonly_mstats) contamination_tumoronly(pileup_paired_tout) - //Final TUMOR ONLY FILTER - allmut2tonly=mutect2_t_tonly.out.groupTuple() - .map { samplename,vcfs,f1r2,stats -> tuple( samplename, - vcfs.toSorted{ it -> (it.name =~ /${samplename}_(.*?).tonly.mut2.vcf.gz/)[0][1].toInteger() } - )} - - mut2tonly_filter=allmut2tonly + mut2tonly_filter=mut2tonlyout.allmut2tonly .join(mergemut2stats_tonly.out) .join(learnreadorientationmodel_tonly.out) .join(contamination_tumoronly.out) - mutect2filter_tonly(mut2tonly_filter) - - - //##VCF2MAF TO + mutect2_tonly_in=mutect2filter_tonly(mut2tonly_filter) + | join(sample_sheet) + | map{tumor,markedvcf,markedindex,finalvcf,finalindex,stats -> tuple(tumor,"mutect2_tonly",finalvcf,finalindex)} + annotvep_tonly_mut2(mutect2_tonly_in) - mutect2filter_tonly.out - .join(sample_sheet) - .map{tumor,markedvcf,finalvcf,stats -> tuple(tumor,"mutect2",finalvcf)} | annotvep_tonly_mut2 - //VarDict_tonly - vardict_tonly_comb=bambyinterval.map{tumorname,tumorbam,tumorbai,normname,normbam,normbai,bed -> - tuple(tumorname,tumorbam,tumorbai,bed)} - vardict_tonly(vardict_tonly_comb).groupTuple().map{tumor,vcf-> tuple(tumor,vcf,"vardict_tonly")} |combineVariants_vardict_tonly - combineVariants_vardict_tonly.out.join(sample_sheet) - .map{tumor,marked,normvcf ->tuple(tumor,"vardict_tonly",normvcf)} | annotvep_tonly_vardict + //VarDict + vardict_in_tonly=vardict_tonly(bambyinterval) | groupTuple()| map{tumor,vcf -> tuple(tumor,vcf,"vardict_tonly")} + | combineVariants_vardict_tonly + | join(sample_sheet) + | map{tumor,marked,markedindex,normvcf,normindex ->tuple(tumor,"vardict_tonly",normvcf,normindex)} + annotvep_tonly_vardict(vardict_in_tonly) //VarScan_tonly - varscan_tonly_comb=varscan_in.map{tumor,bam,bai,normal,nbam,nbai,bed,tpile,npile,tumorc,normalc -> - tuple(tumor,bam,bai,bed,tpile,tumorc)} | varscan_tonly - varscan_tonly_comb1=varscan_tonly_comb.groupTuple().map{tumor,vcf-> tuple(tumor,vcf,"varscan_tonly")} | combineVariants_varscan_tonly - - varscan_tonly_comb1.join(sample_sheet) - .map{tumor,marked,normvcf,normal ->tuple(tumor,"varscan_tonly",normvcf)} | annotvep_tonly_varscan + varscan_in_tonly=bambyinterval.join(contamination_tumoronly.out) + | varscan_tonly | groupTuple() | map{tumor,vcf-> tuple(tumor,vcf,"varscan")} + | combineVariants_varscan_tonly + | join(sample_sheet) + | map{tumor,marked,markedindex,normvcf,normindex ->tuple(tumor,"varscan_tonly",normvcf,normindex)} + annotvep_tonly_varscan(varscan_in_tonly) + + //Octopus_tonly + octopus_in_tonly=bambyinterval | octopus_tonly | bcftools_index_octopus + | groupTuple() + | map{tumor,vcf,vcfindex -> tuple(tumor,vcf.toSorted{it -> it.name} + ,vcfindex, "octopus_tonly")} + | combineVariants_alternative | join(sample_sheet) + | map{tumor,marked,markedindex,normvcf,normindex ->tuple(tumor,"octopus_tonly",normvcf,normindex)} + annotvep_tonly_octopus(octopus_in_tonly) + + + mutect2_tonly_in|concat(octopus_in_tonly) + | concat(vardict_in_tonly)|concat(varscan_in_tonly) + | 
somaticcombine_tonly + | map{tumor,vcf,index ->tuple(tumor,"combined_tonly",vcf,index)} + | annotvep_tonly_combined + + + + emit: + somaticcall_input=octopus_in_tonly - annotvep_tonly_mut2.out.concat(annotvep_tonly_vardict.out).concat(annotvep_tonly_varscan.out) | combinemafs_tonly } -workflow QC_TONLY_PIPE { +workflow SV_TONLY { + take: + bamwithsample + + main: + //Svaba + svaba_out=svaba_tonly(bamwithsample) + .map{ tumor,bps,contigs,discord,alignments,so_indel,so_sv,unfil_so_indel,unfil_sv,log -> + tuple(tumor,so_sv,"svaba_tonly")} + annotsv_svaba_tonly(svaba_out).ifEmpty("Empty SV input--No SV annotated") + + //Manta + manta_out=manta_tonly(bamwithsample) + .map{tumor, sv, indel, tumorsv -> + tuple(tumor,tumorsv,"manta_tonly")} + annotsv_manta_tonly(manta_out).ifEmpty("Empty SV input--No SV annotated") + + //Delly-WIP + + //Survivor + gunzip(manta_out).concat(svaba_out).groupTuple() + | survivor_sv | annotsv_survivor_tonly | ifEmpty("Empty SV input--No SV annotated") +} + + + +workflow CNVmouse_tonly { + take: + bamwithsample + + main: + freec(bamwithsample) +} + + +workflow CNVhuman_tonly { + take: + bamwithsample + somaticcall_input + + main: + //FREEC-Unpaired onlypu + bamwithsample | freec + + //Purple + bamwithsample | amber_tonly + bamwithsample | cobalt_tonly + purplein=amber_tonly.out.join(cobalt_tonly.out) + purplein.join(somaticcall_input)| + map{t1,amber,cobalt,vc,vcf,index -> tuple(t1,amber,cobalt,vcf,index)} + | purple + + +} + +workflow QC_TONLY { take: fastqin fastpout @@ -206,11 +284,16 @@ workflow QC_TONLY_PIPE { samtools_flagstats(bqsrout) qualimap_bamqc(bqsrout) - somalier_extract(bqsrout) som_in=somalier_extract.out.collect() - somalier_analysis(som_in) - + if(params.genome=="hg38"){ + somalier_analysis_human(som_in) + somalier_analysis_out=somalier_analysis_human.out.collect() + } + else if(params.genome=="mm10"){ + somalier_analysis_mouse(som_in) + somalier_analysis_out=somalier_analysis_mouse.out.collect() + } //Prep for MultiQC input fclane_out=fc_lane.out.map{samplename,info->info}.collect() @@ -221,7 +304,8 @@ workflow QC_TONLY_PIPE { fastqc_out=fastqc.out.map{samplename,html,zip->tuple(html,zip)}.collect() samtools_flagstats_out=samtools_flagstats.out.collect() - somalier_analysis_out=somalier_analysis.out.collect() + + conall=fclane_out.concat(fqs_out,kraken_out,qualimap_out,fastqc_out, samtools_flagstats_out, @@ -233,26 +317,41 @@ workflow QC_TONLY_PIPE { + + //Variant Calling from BAM only -workflow INPUT_TONLY_BAMVC_PIPE { +workflow INPUT_TONLY_BAM { main: - //Either BAM Input or File sheet input if(params.bam_input){ - baminputonly=Channel.fromPath(params.bam_input) + bambai = params.bam_input +".bai" + baionly = bambai.replace(".bam", "") + bamcheck1 = file(bambai) + bamcheck2 = file(baionly) + + if (bamcheck1.size()>0){ + baminputonly=Channel.fromPath(params.bam_input) .map{it-> tuple(it.simpleName,it,file("${it}.bai"))} + } + if (bamcheck2.size()>0){ + bai=Channel.from(bamcheck2).map{it -> tuple(it.simpleName,it)}.view() + baminputonly=Channel.fromPath(params.bam_input) + .map{it-> tuple(it.simpleName,it)} + .join(bai) + } sample_sheet=baminputonly.map{samplename,bam,bai -> tuple ( - samplename)}.view() + samplename)} + }else if(params.file_input) { baminputonly=Channel.fromPath(params.file_input) .splitCsv(header: false, sep: "\t", strip:true) - .map{ sample,bam -> - tuple(sample, file(bam),file("${bam}.bai")) + .map{ sample,bam,bai -> + tuple(sample, file(bam),file(bai)) } + sample_sheet=baminputonly.map{samplename,bam,bai -> tuple ( - 
samplename)}.view() - + samplename)} } splitinterval(intervalbedin) @@ -262,8 +361,7 @@ workflow INPUT_TONLY_BAMVC_PIPE { emit: bamwithsample splitout=splitinterval.out - sample_sheet + sample_sheet - } diff --git a/wgs-seek_bak b/wgs-seek_bak deleted file mode 100755 index eb36bb1..0000000 --- a/wgs-seek_bak +++ /dev/null @@ -1,501 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: UTF-8 -*- - -""" -ABOUT: This is the main entry for the GATK4 WES pipeline. -REQUIRES: - - python>=3.5 - - nextflow - - singularity (recommended==latest) -DISCLAIMER: - PUBLIC DOMAIN NOTICE - CCR Collaborative Bioinformatics Resource (CCBR) - National Cancer Institute (NCI) -This software/database is a "United States Government Work" under -the terms of the United States Copyright Act. It was written as -part of the author's official duties as a United States Government -employee and thus cannot be copyrighted. This software is freely -available to the public for use. -Although all reasonable efforts have been taken to ensure the -accuracy and reliability of the software and data, CCBR do not and -cannot warrant the performance or results that may be obtained by -using this software or data. CCBR and NCI disclaim all warranties, -express or implied, including warranties of performance, -merchantability or fitness for any particular purpose. -Please cite the author and the "NIH Biowulf Cluster" in any work or -product based on this material. -USAGE: - $ wgs-seek [OPTIONS] -EXAMPLE: - $ wgs-seek run --input *.R{1,2}.fastq.gz --output WGS_hg38/ --mode biowulf --sample_sheet pairs.tsv - -""" - -# Python standard library -from __future__ import print_function -import sys, os, subprocess, re, json, textwrap - -# 3rd party imports from pypi -import argparse # potential python3 3rd party package, added in python/3.5 - - -def run(sub_args): - """Initialize, setup, and run the GATK4 WGS pipeline. - Calls initialize() to create output directory and copy over pipeline resources, - setup() to create the pipeline config file, dryrun() to ensure their are no issues - before running the pipeline, and finally run() to execute the Nextflow workflow. - @param sub_args : - Parsed arguments for run sub-command - """ - # Step 0. Check for required dependencies - # The pipelines has only two requirements: - # nextflow and singularity - require(['nextflow', 'singularity'], ['nextflow', 'singularity']) - - # Step 1. Initialize working directory, - # copy over required resources to run - # the pipeline - git_repo = __home__ - input_files = init( - repo_path = git_repo, - output_path = sub_args.output, - links = sub_args.input - ) - - # Step 2. Setup pipeline for execution, - # dynamically create config.json config - # file from user inputs and base config - # templates - config = setup(sub_args, - ifiles = input_files, - repo_path = git_repo, - output_path = sub_args.output - ) - - # Step 3. Resolve docker/singularity bind - # paths from the config file. - bindpaths = bind( - sub_args, - config = config - ) - - # Optional Step: Dry-run pipeline - if sub_args.dry_run: - # Dryrun pipeline - dryrun_output = dryrun(outdir = sub_args.output) # python3 returns byte-string representation - print("\nDry-running exome-seek pipeline:\n{}".format(dryrun_output.decode("utf-8"))) - sys.exit(0) - - # Step 4. 
Orchestrate pipeline execution, - # run pipeline in locally on a compute node - # for debugging purposes or submit the master - # job to the job scheduler, SLURM, and create - # logging file - if not exists(os.path.join(sub_args.output, 'logfiles')): - # Create directory for logfiles - os.makedirs(os.path.join(sub_args.output, 'logfiles')) - if sub_args.mode == 'local': - log = os.path.join(sub_args.output, 'logfiles', 'snakemake.log') - else: - log = os.path.join(sub_args.output, 'logfiles', 'master.log') - logfh = open(log, 'w') - wait = '' - if sub_args.wait: wait = '--wait' - mjob = runner(mode = sub_args.mode, - outdir = sub_args.output, - # additional_bind_paths = all_bind_paths, - alt_cache = sub_args.singularity_cache, - threads = int(sub_args.threads), - jobname = sub_args.job_name, - submission_script='runner', - logger = logfh, - additional_bind_paths = ",".join(bindpaths), - tmp_dir = sub_args.tmp_dir, - wait = wait - ) - - # Step 5. Wait for subprocess to complete, - # this is blocking and not asynchronous - if not sub_args.silent: - print("\nRunning GATK4 WES pipeline in '{}' mode...".format(sub_args.mode)) - mjob.wait() - logfh.close() - - # Step 6. Relay information about submission - # of the master job or the exit code of the - # pipeline that ran in local mode - if sub_args.mode == 'local': - if int(mjob.returncode) == 0: - print('GATK4 WES pipeline has successfully completed') - else: - fatal('GATK4 WES pipeline failed. Please see {} for more information.'.format( - os.path.join(sub_args.output, 'logfiles', 'snakemake.log'))) - elif sub_args.mode == 'slurm': - jobid = open(os.path.join(sub_args.output, 'logfiles', 'mjobid.log')).read().strip() - if not sub_args.silent: - if int(mjob.returncode) == 0: - print('Successfully submitted master job: ', end="") - else: - fatal('Error occurred when submitting the master job.') - print(jobid) - - -def unlock(sub_args): - """Unlocks a previous runs output directory. If snakemake fails ungracefully, - it maybe required to unlock the working directory before proceeding again. - This is rare but it does occasionally happen. Maybe worth add a --force - option to delete the '.snakemake/' directory in the future. - @param sub_args : - Parsed arguments for unlock sub-command - """ - - print("Unlocking the pipeline's output directory...") - outdir = sub_args.output - - try: - unlock_output = subprocess.check_output([ - 'snakemake', '--unlock', - '--cores', '1', - '--configfile=config.json' - ], cwd = outdir, - stderr=subprocess.STDOUT) - except subprocess.CalledProcessError as e: - # Unlocking process returned a non-zero exit code - sys.exit("{}\n{}".format(e, e.output)) - - print("Successfully unlocked the pipeline's working directory!") - - -def cache(sub_args): - """Caches remote resources or reference files stored on DockerHub and S3. - Local SIFs will be created from images defined in 'config/containers/images.json'. - @TODO: add option to cache other shared S3 resources (i.e. kraken db and fqscreen indices) - @param sub_args : - Parsed arguments for unlock sub-command - """ - print(sub_args) - fatal('NotImplementedError... 
Comming Soon!') - - sif_cache = sub_args.sif_cache - # Get absolute PATH to templates in exome-seek git repo - repo_path = os.path.dirname(os.path.abspath(__file__)) - images = os.path.join(repo_path, 'config','containers', 'images.json') - - # Create image cache - if not exists(sif_cache): - # Pipeline output directory does not exist on filesystem - os.makedirs(sif_cache) - elif exists(sif_cache) and os.path.isfile(sif_cache): - # Provided Path for pipeline output directory exists as file - raise OSError("""\n\tFatal: Failed to create provided sif cache directory! - User provided --sif-cache PATH already exists on the filesystem as a file. - Please {} cache again with a different --sif-cache PATH. - """.format(sys.argv[0]) - ) - - # Check if local SIFs already exist on the filesystem - with open(images, 'r') as fh: - data = json.load(fh) - - pull = [] - for image, uri in data['images'].items(): - sif = os.path.join(sif_cache, '{}.sif'.format(os.path.basename(uri).replace(':', '_'))) - if not exists(sif): - # If local sif does not exist on in cache, print warning - # and default to pulling from URI in config/containers/images.json - print('Image will be pulled from "{}".'.format(uri), file=sys.stderr) - pull.append(uri) - - if not pull: - # Nothing to do! - print('Singularity image cache is already up to update!') - else: - # There are image(s) that need to be pulled - if not sub_args.dry_run: - # submission_script for exome-seek cache is /path/to/output/resources/cacher - # Quote user provided values to avoid shell injections - masterjob = subprocess.Popen( - 'sbatch --parsable -J pl:cache --gres=lscratch:200 --time=10:00:00 --mail-type=BEGIN,END,FAIL ' + - str(os.path.join(repo_path, 'resources', 'cacher')) + ' slurm ' + - " -s '{}' ".format(sif_cache) + - " -i '{}' ".format(','.join(pull)) + - " -t '/lscratch/${SLURM_JOB_ID}/.singularity/' ", - cwd = sif_cache, shell=True, stderr = subprocess.STDOUT, stdout = subprocess.PIPE) - - masterjob.communicate() - print('exome-seek reference cacher submitted master job with exit-code: {}'.format(masterjob.returncode)) - - -def parsed_arguments(): - """Parses user-provided command-line arguments. Requires argparse and textwrap - package. argparse was added to standard lib in python 3.5 and textwrap was added - in python 3.5. To create custom help formatting for subparsers a docstring is - used create the help message for required options. argparse does not support named - subparser groups, which is normally what would be used to accomphish this reformatting. - As so, the help message for require options must be suppressed. If a new required arg - is added to a subparser, it must be added to the docstring and the usage statement - also must be updated. - """ - - # Create a top-level parser - parser = argparse.ArgumentParser(description = 'exome-seek: a highly-reproducible GATK4 WES pipeline') - - # Adding Verison information - #parser.add_argument('--version', action = 'version', version='%(prog)s {}'.format(__version__)) - - # Create sub-command parser - subparsers = parser.add_subparsers(help='List of available sub-commands') - - - # Sub-parser for the "run" sub-command - # Grouped sub-parser arguments are currently not supported by argparse. - # https://bugs.python.org/issue9341 - # Here is a work around to create more useful help message for named - # options that are required! Please note: if a required arg is added the - # description below should be updated (i.e. 
update usage and add new option) - required_run_options = textwrap.dedent("""\ - usage: exome-seek run [--help] \\ - [--mode {local, slurm}] \\ - [--job-name JOB_NAME] \\ - [--callers {mutect2,mutect,strelka, ...}] \\ - [--pairs PAIRS] \\ - [--ffpe] \\ - [--cnv] \\ - [--dry-run] \\ - [--silent] \\ - [--singularity-cache SINGULARITY_CACHE] \\ - [--sif-cache SIF_CACHE] \\ - [--tmpdir TMP_DIR] \\ - [--threads THREADS] \\ - [--wait] \\ - --input INPUT [INPUT ...] \\ - --output OUTPUT \\ - --genome {hg38, mm10, ...} \\ - --targets TARGETS - - required arguments: - --input INPUT [INPUT ...] - Input FastQ or BAM file(s) to process. One or more input - files can be provided. The pipeline does NOT support - single-end WES data. Please provide either a set of - FastQ files or a set of BAM files. The pipeline does - NOT support processing a mixture of FastQ files and - BAM files. - Example: --input .tests/*.R?.fastq.gz - --output OUTPUT - Path to an output directory. This location is where - the pipeline will create all of its output files, also - known as the pipeline's working directory. If the user - provided working directory has not been initialized, - it will be created automatically. - Example: --output /data/$USER/WES_hg38 - --genome {hg38, mm10, ...} - Reference genome. This option defines the reference - genome of the samples. Currently, hg38 and mm10 are supported. - Support for additional and custom genomes will be added soon. - Example: --genome hg38 - --targets TARGETS - Path to exome targets BED file. This file can be - obtained from the manufacturer of the target capture - kit that was used. - - """) - - # Display example usage in epilog - run_epilog = textwrap.dedent("""\ - example: - # Step 1.) Grab an interactive node (do not run on head node) - sinteractive --mem=8g --cpus-per-task=4 - module purge - module load singularity snakemake - - # Step 2A.) Dry-run the pipeline - ./exome-seek run --input .tests/*.R?.fastq.gz \\ - --output /data/$USER/WES_hg38 \\ - --genome hg38 \\ - --targets Agilent_SSv7_allExons_hg38.bed \\ - --mode slurm \\ - --dry-run - - # Step 2B.) Run the GATK4 WES pipeline - # The slurm mode will submit jobs to the cluster. - # It is recommended running exome-seek in this mode. 
-          ./exome-seek run --input .tests/*.R?.fastq.gz \\
-                           --output /data/$USER/WES_hg38 \\
-                           --genome hg38 \\
-                           --targets Agilent_SSv7_allExons_hg38.bed \\
-                           --mode biowulf
-
-        version:
-          {}
-        """)#.format(__version__))
-
-    # Suppress the help message of required args to work around the lack of named sub-parser groups
-    subparser_run = subparsers.add_parser('run',
-        help = 'Run the GATK4 WES pipeline with input files.',
-        usage = argparse.SUPPRESS,
-        formatter_class = argparse.RawDescriptionHelpFormatter,
-        description = required_run_options,
-        epilog = run_epilog
-    )
-
-    # Required Arguments
-    # Input FastQ files
-    subparser_run.add_argument('--input',
-        # Check if the file exists and if it is readable
-        type = lambda file: permissions(parser, file, os.R_OK),
-        required = True,
-        nargs = '+',
-        help = argparse.SUPPRESS
-    )
-
-    # Output Directory (analysis working directory)
-    subparser_run.add_argument('--output',
-        type = lambda option: os.path.abspath(os.path.expanduser(option)),
-        required = True,
-        help = argparse.SUPPRESS
-    )
-
-    # Reference Genome (to dynamically select reference files)
-    subparser_run.add_argument('--genome',
-        required = True,
-        #choices = ['hg38', 'mm10'],
-        type = lambda option: str(genome_options(subparser_run, option, ['hg38', 'mm10'])),
-        help = argparse.SUPPRESS
-    )
-
-    # Exome targets BED file
-    subparser_run.add_argument('--targets',
-        # Check if the file exists and if it is readable
-        type = lambda file: permissions(parser, file, os.R_OK),
-        required = True,
-        help = argparse.SUPPRESS
-    )
-
-    # Optional Arguments
-    # Execution Method (run locally on a compute node or submit to the SLURM job scheduler)
-    subparser_run.add_argument('--mode',
-        type = str,
-        required = False,
-        default = "biowulf",
-        choices = ['biowulf', 'local'],
-        help = 'Execution Method [Default: biowulf]. Defines the mode or method of execution. \
-        Valid mode options include: biowulf or local. \
-        local: uses the local method of execution. Local executions run serially on a \
-        single compute instance. This is useful for testing or debugging, or when a user \
-        does not have access to a high-performance computing environment. \
-        biowulf: uses the SLURM and Singularity backend. This execution method submits \
-        jobs to a cluster. It is recommended to run exome-seek in this mode, as execution \
-        will be significantly faster in a distributed environment. \
-        Example: --mode biowulf'
-    )
-
-    # Name of master job
-    subparser_run.add_argument('--job-name',
-        type = str,
-        required = False,
-        default = 'pl:wgs-seek',
-        help = 'Set the name of the pipeline\'s master job. \
-        When submitting the pipeline to a job scheduler, like SLURM, \
-        this option allows you to set the name of the pipeline\'s master \
-        job. By default, the name of the pipeline\'s master job \
-        is set to "pl:wgs-seek". \
-        Example: --job-name WES_hg38_main'
-    )
-
-    # Tumor-normal pairs file
-    subparser_run.add_argument('--pairs',
-        # Check if the file exists and if it is readable
-        type = lambda file: permissions(parser, file, os.R_OK),
-        required = False,
-        help = 'Tumor-normal pairs file. This tab-delimited file contains two columns with the \
-        names of tumor and normal samples, one pair per line. The header of the file needs to be \
-        "Tumor" for the tumor column and "Normal" for the normal column.'
-    )
-
-    # Correction for FFPE samples
-    subparser_run.add_argument('--ffpe',
-        action = 'store_true',
-        required = False,
-        default = False,
-        help = 'FFPE correction. Runs an additional filtering step for Formalin-Fixed \
-        Paraffin-Embedded (FFPE) samples. Do NOT use this option with non-FFPE samples.'
-    )
-
-    # Call CNVs
-    subparser_run.add_argument('--cnv',
-        action = 'store_true',
-        required = False,
-        default = False,
-        help = 'Call copy number variations (CNVs). CNVs will only be called from \
-        tumor-normal pairs. If this option is provided without a --pairs file, \
-        CNVs will NOT be called.'
-    )
-
-    # Silent output mode
-    subparser_run.add_argument('--silent',
-        action = 'store_true',
-        required = False,
-        default = False,
-        help = 'Silence standard output. Reduces the amount of information directed \
-        to standard output when submitting the master job to the job scheduler. Only the \
-        job id of the master job is returned.'
-    )
-
-    # Base directory to write temporary files
-    subparser_run.add_argument('--tmp-dir',
-        type = str,
-        required = False,
-        default = '/lscratch/$SLURM_JOBID/',
-        help = 'Path on the filesystem for writing intermediate, temporary output \
-        files. By default, this variable is set to \'/lscratch/$SLURM_JOBID\' \
-        for backwards compatibility with the NIH\'s Biowulf cluster; however, \
-        if you are running the pipeline on another cluster, this option will \
-        need to be specified. Ideally, this path should point to a dedicated \
-        location on the filesystem for writing tmp files. On many systems, this \
-        location is set to somewhere in /scratch. If you need to inject a variable \
-        into this string that should NOT be expanded, please quote this option\'s \
-        value in single quotes. As an example, on the NCI/NIH FRCE cluster the \
-        value of this option would be set to \
-        --tmp-dir \'/scratch/cluster_scratch/$USER/\', \
-        default: \'/lscratch/$SLURM_JOBID/\''
-    )
-
-    # Number of threads for the exome-seek pipeline's main process
-    subparser_run.add_argument('--threads',
-        type = int,
-        required = False,
-        default = 2,
-        help = 'Max number of threads for local processes. It is recommended \
-        to set this value to the maximum number of CPUs available on the host \
-        machine, default: 2.'
-    )
-
-    # Define handlers for each sub-parser
-    subparser_run.set_defaults(func = run)
-    #subparser_unlock.set_defaults(func = unlock)
-    #subparser_cache.set_defaults(func = cache)
-
-    # Parse command-line args
-    args = parser.parse_args()
-    return args
-
-
-def main():
-    # Collect args for the sub-command
-    args = parsed_arguments()
-
-    # Display version information
-    #err('wgs-seek ({})'.format(__version__))
-
-    # Mediator method to call the sub-command's handler function
-    args.func(args)
-
-
-if __name__ == '__main__':
-    main()
\ No newline at end of file
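
Note on the argparse workaround removed above: because argparse has no named sub-parser argument groups (https://bugs.python.org/issue9341), the old CLI suppressed per-argument help and documented the required options in a hand-written description string. The following is a minimal, self-contained sketch of that same pattern; the "demo" program name and its --input/--output options are illustrative placeholders, not part of this pipeline.

#!/usr/bin/env python3
import argparse
import textwrap

# Hand-written "required arguments" section: argparse cannot render a named
# argument group for a sub-parser on its own (https://bugs.python.org/issue9341).
required_run_options = textwrap.dedent("""\
    usage: demo run --input INPUT [INPUT ...] --output OUTPUT

    required arguments:
      --input INPUT [INPUT ...]
                            One or more input files to process.
      --output OUTPUT
                            Path to an output directory.
    """)

parser = argparse.ArgumentParser(description='demo: suppressed-help sub-parser pattern')
subparsers = parser.add_subparsers(help='List of available sub-commands')

subparser_run = subparsers.add_parser('run',
    help = 'Run the demo sub-command.',
    usage = argparse.SUPPRESS,                               # hide the auto-generated usage line
    formatter_class = argparse.RawDescriptionHelpFormatter,  # keep the dedented layout verbatim
    description = required_run_options,
)
# help=argparse.SUPPRESS hides each option from the generated help; the
# description string above is then the only place these options are documented,
# which is why it must be kept in sync whenever a required arg changes.
subparser_run.add_argument('--input', nargs='+', required=True, help=argparse.SUPPRESS)
subparser_run.add_argument('--output', required=True, help=argparse.SUPPRESS)

if __name__ == '__main__':
    print(parser.parse_args())

With this in place, "demo run --help" prints the hand-written section instead of argparse's generated one, which is the behavior the deleted parsed_arguments() relied on.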