diff --git a/Makefile b/Makefile index ad57ecbe..b8554f01 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ MAKELOG = log/$(@).$(NOW).log USE_CLUSTER ?= true QMAKE = modules/scripts/qmake.pl -n $@.$(NOW) $(if $(SLACK_CHANNEL),-c $(SLACK_CHANNEL)) -r $(NUM_ATTEMPTS) -m -s -- make -NUM_JOBS ?= 50 +NUM_JOBS ?= 100 define RUN_QMAKE $(QMAKE) -e -f $1 -j $2 $(TARGET) && \ @@ -29,11 +29,11 @@ RUN_MAKE = $(if $(findstring false,$(USE_CLUSTER))$(findstring n,$(MAKEFLAGS)),+ #================================================== TARGETS += somatic_indels -somatic_indels: +somatic_indels : $(call RUN_MAKE,modules/variant_callers/somatic/somaticIndels.mk) TARGETS += somatic_variants -somatic_variants: +somatic_variants : $(call RUN_MAKE,modules/variant_callers/somatic/somaticVariants.mk) @@ -69,11 +69,11 @@ tophat : $(call RUN_MAKE,modules/aligners/tophatAligner.mk) TARGETS += star -star: +star : $(call RUN_MAKE,modules/aligners/starAligner.mk) TARGETS += star_fusion_aligner -star_fusion_aligner: +star_fusion_aligner : $(call RUN_MAKE,modules/aligners/starFusionAligner.mk) TARGETS += blast_reads @@ -102,11 +102,11 @@ snvmix : $(call RUN_MAKE,modules/variant_callers/snvmix.mk) TARGETS += tvcTN -tvcTN: +tvcTN : $(call RUN_MAKE,modules/variant_callers/somatic/tvcTN.mk) TARGETS += tvc -tvc: +tvc : $(call RUN_MAKE,modules/variant_callers/tvc.mk) TARGETS += varscanTN @@ -150,13 +150,17 @@ samtools_het : $(call RUN_MAKE,modules/variant_callers/samtoolsHet.mk) TARGETS += platypus -platypus: +platypus : $(call RUN_MAKE,modules/variant_callers/somatic/platypus.mk) TARGETS += msisensor msisensor : $(call RUN_MAKE,modules/variant_callers/somatic/msisensor.mk) +TARGETS += mimsi +mimsi : + $(call RUN_MAKE,modules/variant_callers/somatic/mimsi.mk) + TARGETS += hla_polysolver hla_polysolver : $(call RUN_MAKE,modules/variant_callers/somatic/polysolver.mk) @@ -170,17 +174,9 @@ museqTN : $(call RUN_MAKE,modules/variant_callers/somatic/museqTN.mk) TARGETS += hotspot -hotspot: +hotspot : $(call 
RUN_MAKE,modules/variant_callers/hotspot.mk) -TARGETS += genotype_hotspot -genotype_hotspot: - $(call RUN_MAKE,modules/variant_callers/genotypehotspots.mk) - -TARGETS += genotype_pdx -genotype_pdx: - $(call RUN_MAKE,modules/variant_callers/genotypepdx.mk) - TARGETS += jsm jsm : $(call RUN_MAKE,modules/variant_callers/somatic/jsm.mk) @@ -188,10 +184,18 @@ jsm : TARGETS += sufam sufam: $(call RUN_MAKE,modules/variant_callers/sufamsampleset.mk) + +TARGETS += sufam_gt +sufam_gt : + $(call RUN_MAKE,modules/variant_callers/sufam_gt.mk) + +TARGETS += get_basecount +get_basecount : + $(call RUN_MAKE,modules/variant_callers/get_basecounts.mk) -TARGETS += sufam_summary -sufam_summary: - $(call RUN_MAKE,modules/variant_callers/sufammultisample.mk) +TARGETS += strelka_varscan_indels +strelka_varscan_indels : + $(call RUN_MAKE,modules/variant_callers/somatic/strelkaVarscanIndels.mk) #================================================== @@ -201,7 +205,11 @@ sufam_summary: TARGETS += facets facets : $(call RUN_MAKE,modules/copy_number/facets.mk) - + +TARGETS += facets_suite +facets_suite : + $(call RUN_MAKE,modules/copy_number/facets_suite.mk) + TARGETS += ascat ascat : $(call RUN_MAKE,modules/copy_number/ascat.mk) @@ -214,10 +222,6 @@ TARGETS += titan titan : $(call RUN_MAKE,modules/copy_number/titan.mk) -TARGETS += strelka_varscan_indels -strelka_varscan_indels: - $(call RUN_MAKE,modules/variant_callers/somatic/strelkaVarscanIndels.mk) - TARGETS += varscan_cnv varscan_cnv : $(call RUN_MAKE,modules/copy_number/varscanCNV.mk) @@ -256,75 +260,15 @@ snp6 : TARGETS += cnv_kit cnv_kit : - $(call RUN_MAKE,modules/test/workflows/cnvkit.mk) - -TARGETS += cnvkit_coverage -cnvkit_coverage : - $(call RUN_MAKE,modules/copy_number/cnvkitcoverage.mk) - -TARGETS += cnvkit_reference -cnvkit_reference : - $(call RUN_MAKE,modules/copy_number/cnvkitreference.mk) - -TARGETS += cnvkit_fix -cnvkit_fix : - $(call RUN_MAKE,modules/copy_number/cnvkitfix.mk) - -TARGETS += cnvkit_plot -cnvkit_plot : - 
$(call RUN_MAKE,modules/copy_number/cnvkitplot.mk) - -TARGETS += cnvkit_heatmap -cnvkit_heatmap : - $(call RUN_MAKE,modules/copy_number/cnvkitheatmap.mk) - -TARGETS += cnvkit_pca -cnvkit_pca : - $(call RUN_MAKE,modules/copy_number/cnvkitprcomp.mk) - -TARGETS += cnvkit_qc -cnvkit_qc : - $(call RUN_MAKE,modules/copy_number/cnvkitqc.mk) - -TARGETS += qdna_seq -qdna_seq : - $(call RUN_MAKE,modules/test/workflows/qdnaseq.mk) - -TARGETS += qdnaseq_extract_test -qdnaseq_extract_test: - $(call RUN_MAKE,modules/test/copy_number/qdnaseqextract.mk) - -TARGETS += qdnaseq_copynumber_test -qdnaseq_copynumber_test: - $(call RUN_MAKE,modules/test/copy_number/qdnaseqcopynumber.mk) - -TARGETS += copynumber_summary -copynumber_summary: - $(call RUN_MAKE,modules/test/workflows/copynumber_summary.mk) - -TARGETS += genome_altered -genome_altered : - $(call RUN_MAKE,modules/copy_number/genomealtered.mk) - -TARGETS += lst_score -lst_score : - $(call RUN_MAKE,modules/copy_number/lstscore.mk) - -TARGETS += ntai_score -ntai_score : - $(call RUN_MAKE,modules/copy_number/ntaiscore.mk) - -TARGETS += myriad_score -myriad_score : - $(call RUN_MAKE,modules/copy_number/myriadhrdscore.mk) + $(call RUN_MAKE,modules/copy_number/cnvkit.mk) #================================================== -# structural variant callers +# RNAseq structural variant callers #================================================== TARGETS += star_fusion -star_fusion: +star_fusion : $(call RUN_MAKE,modules/sv_callers/starFusion.mk) TARGETS += tophat_fusion @@ -334,23 +278,52 @@ tophat_fusion : TARGETS += manta_rnaseq manta_rnaseq : $(call RUN_MAKE,modules/sv_callers/mantaRnaseq.mk) + +TARGETS += integrate_rnaseq +integrate_rnaseq : + $(call RUN_MAKE,modules/sv_callers/integrateRnaseq.mk) +TARGETS += soapfuse +soapfuse : + $(call RUN_MAKE,modules/sv_callers/soapFuse.mk) + +TARGETS += mapsplice +mapsplice : + $(call RUN_MAKE,modules/sv_callers/mapsplice.mk) + +TARGETS += fusioncatcher +fusioncatcher : + $(call 
RUN_MAKE,modules/sv_callers/fusioncatcher.mk) + +TARGETS += oncofuse +oncofuse : + $(call RUN_MAKE,modules/sv_callers/oncofuse.mk) + + +#================================================== +# DNA structural variant callers +#================================================== + +TARGETS += manta_tumor_normal +manta_tumor_normal : + $(call RUN_MAKE,modules/sv_callers/manta_tumor_normal.mk) + +TARGETS += svaba_tumor_normal +svaba_tumor_normal : + $(call RUN_MAKE,modules/sv_callers/svaba_tumor_normal.mk) + +TARGETS += gridss_tumor_normal +gridss_tumor_normal : + $(call RUN_MAKE,modules/sv_callers/gridss_tumor_normal.mk) + TARGETS += manta manta : $(call RUN_MAKE,modules/sv_callers/manta.mk) -TARGETS += mantaTN -mantaTN : - $(call RUN_MAKE,modules/sv_callers/mantaTN.mk) - TARGETS += brass brass : $(call RUN_MAKE,modules/sv_callers/brass.mk) -TARGETS += integrate_rnaseq -integrate_rnaseq : - $(call RUN_MAKE,modules/sv_callers/integrateRnaseq.mk) - TARGETS += integrate integrate : $(call RUN_MAKE,modules/sv_callers/integrate.mk) @@ -364,10 +337,6 @@ TARGETS += chimscan chimscan : $(call RUN_MAKE_J,modules/sv_callers/chimerascan.mk,$(NUM_CHIMSCAN_JOBS)) -TARGETS += oncofuse -oncofuse : - $(call RUN_MAKE,modules/sv_callers/oncofuse.mk) - TARGETS += lumpy lumpy : $(call RUN_MAKE,modules/sv_callers/lumpy.mk) @@ -380,18 +349,6 @@ TARGETS += nfuse_wgss_wtss nfuse_wgss_wtss : $(call RUN_MAKE,modules/sv_callers/nfuseWGSSWTSS.mk) -TARGETS += soapfuse -soapfuse : - $(call RUN_MAKE,modules/sv_callers/soapFuse.mk) - -TARGETS += mapsplice -mapsplice : - $(call RUN_MAKE,modules/sv_callers/mapsplice.mk) - -TARGETS += fusioncatcher -fusioncatcher : - $(call RUN_MAKE,modules/sv_callers/fusioncatcher.mk) - TARGETS += crest crest : $(call RUN_MAKE,modules/sv_callers/crest.mk) @@ -399,23 +356,65 @@ crest : TARGETS += delly delly : $(call RUN_MAKE,modules/sv_callers/delly.mk) - + #================================================== -# pre-processing +# BAM tools 
#================================================== -TARGETS += merge_fastq -merge_fastq : - $(call RUN_MAKE,modules/fastq_tools/mergeFastq.mk) - TARGETS += fix_bam fix_bam : - $(call RUN_MAKE,modules/bam_tools/fixBam.mk) + $(call RUN_MAKE,modules/bam_tools/fix_bam.mk) TARGETS += fix_rg fix_rg : - $(call RUN_MAKE,modules/bam_tools/fixRG.mk) + $(call RUN_MAKE,modules/bam_tools/fix_rg.mk) + +TARGETS += fix_mate +fix_mate : + $(call RUN_MAKE,modules/bam_tools/fix_mate.mk) + +TARGETS += merge_bam +merge_bam : + $(call RUN_MAKE,modules/bam_tools/merge_bam.mk) + +TARGETS += process_bam +process_bam : + $(call RUN_MAKE,modules/bam_tools/processBam.mk) + +TARGETS += getbam_irb_mirror +getbam_irb_mirror : + $(call RUN_MAKE,modules/bam_tools/get_bam_irb_mirror.mk) + +TARGETS += getbam_data_mirror +getbam_data_mirror : + $(call RUN_MAKE,modules/bam_tools/get_bam_data_mirror.mk) + +TARGETS += putbam_data_mirror +putbam_data_mirror : + $(call RUN_MAKE,modules/bam_tools/put_bam_data_mirror.mk) + + +#================================================== +# VCF tools +#================================================== + +TARGETS += merge_sv +merge_sv : + $(call RUN_MAKE,modules/vcf_tools/merge_sv.mk) + +TARGETS += annotate_sv +annotate_sv : + $(call RUN_MAKE,modules/vcf_tools/annotate_sv.mk) + + +#================================================== +# FASTQ tools +#================================================== + +TARGETS += merge_fastq +merge_fastq : + $(call RUN_MAKE,modules/fastq_tools/mergeFastq.mk) TARGETS += merge_split_fastq merge_split_fastq : @@ -436,27 +435,23 @@ extract_unmapped_pairs : TARGETS += bam_to_fasta bam_to_fasta : $(call RUN_MAKE,modules/fastq_tools/bamtoFasta.mk) - -TARGETS += process_bam -process_bam : - $(call RUN_MAKE,modules/bam_tools/processBam.mk) - -TARGETS += merge_bam -merge_bam : - $(call RUN_MAKE,modules/bam_tools/mergeBam.mk) #================================================== -# quality control +# QC 
#================================================== TARGETS += bam_metrics bam_metrics : - $(call RUN_MAKE,modules/qc/bamMetrics.mk) + $(call RUN_MAKE,modules/qc/bam_metrics.mk) TARGETS += bam_interval_metrics bam_interval_metrics : - $(call RUN_MAKE,modules/qc/bamIntervalMetrics.mk) + $(call RUN_MAKE,modules/qc/bam_interval_metrics.mk) + +TARGETS += wgs_metrics +wgs_metrics : + $(call RUN_MAKE,modules/qc/wgs_metrics.mk) TARGETS += rnaseq_metrics rnaseq_metrics : @@ -484,44 +479,30 @@ bam_stats : #================================================== -# rna sequencing +# RNA sequencing #================================================== -TARGETS += cufflinks -cufflinks : - $(call RUN_MAKE,modules/rnaseq/cufflinks.mk) - TARGETS += sum_reads sum_reads : - $(call RUN_MAKE,modules/rnaseq/sumRNASeqReads.mk) + $(call RUN_MAKE,modules/rnaseq/sumreads.mk) -TARGETS += exon_counts -exon_counts : - $(call RUN_MAKE,modules/rnaseq/dexseq.mk) +TARGETS += kallisto +kallisto : + $(call RUN_MAKE,modules/rnaseq/kallisto.mk) - -#================================================== -# chip sequencing -#================================================== +TARGETS += immune_deconv +immune_deconv : + $(call RUN_MAKE,modules/rnaseq/immunedeconv.mk) -TARGETS += macs2TN -macs2TN: - $(call RUN_MAKE,modules/variant_callers/somatic/macs2TN.mk) - #================================================== -# ploidy +# Ploidy / Clonality #================================================== TARGETS += pyloh pyloh : $(call RUN_MAKE,modules/ploidy/pyloh.mk) - -#================================================== -# clonality -#================================================== - TARGETS += clonehd clonehd : $(call RUN_MAKE,modules/clonality/clonehd.mk) @@ -530,31 +511,34 @@ TARGETS += absolute_seq absolute_seq : $(call RUN_MAKE,modules/clonality/absoluteSeq.mk) -TARGETS += ms_pyclone -ms_pyclone : - $(call RUN_MAKE,modules/test/workflows/mspyclone.mk) - -TARGETS += ss_pyclone -ss_pyclone : - $(call 
RUN_MAKE,modules/test/workflows/pyclone.mk) +TARGETS += pyclone_13 +pyclone_13 : + $(call RUN_MAKE,modules/clonality/pyclone_13.mk) +TARGETS += pyclone_vi +pyclone_vi : + $(call RUN_MAKE,modules/clonality/pyclone_vi.mk) #================================================== # mutational signatures #================================================== -TARGETS += emu -emu : - $(call RUN_MAKE,modules/signatures/emu.mk) - -TARGETS += mut_sig -mut_sig : - $(call RUN_MAKE,modules/signatures/mut_sig.mk) - TARGETS += deconstruct_sigs deconstruct_sigs : $(call RUN_MAKE,modules/signatures/deconstruct_sigs.mk) +TARGETS += sv_signature +sv_signature : + $(call RUN_MAKE,modules/signatures/sv_signature.mk) + +TARGETS += star_fish +star_fish : + $(call RUN_MAKE,modules/signatures/star_fish.mk) + +TARGETS += hr_detect +hr_detect : + $(call RUN_MAKE,modules/signatures/hr_detect.mk) + #================================================== # miscellaneous @@ -573,29 +557,16 @@ virus_detection_bowtie2 : $(call RUN_MAKE,modules/virus/virus_detection_bowtie2.mk) TARGETS += viral_detection -viral_detection: +viral_detection : $(call RUN_MAKE,modules/test/workflows/viral_detection.mk) TARGETS += krona_classify krona_classify : $(call RUN_MAKE,modules/virus/krona_classify.mk) -TARGETS += fetch_impact -fetch_impact : - $(call RUN_MAKE,modules/test/workflows/fetchimpact.mk) - - -#================================================== -# phylogeny -#================================================== - -TARGETS += medicc -medicc : - $(call RUN_MAKE,modules/test/workflows/medicc.mk) - -TARGETS += pratchet -pratchet : - $(call RUN_MAKE,modules/test/workflows/pratchet.mk) +TARGETS += medicc2 +medicc2 : + $(call RUN_MAKE,modules/copy_number/medicc2.mk) #================================================== @@ -614,9 +585,9 @@ TARGETS += mutation_summary mutation_summary : $(call RUN_MAKE,modules/summary/mutationsummary.mk) -TARGETS += cravat_summary -cravat_summary : - $(call 
RUN_MAKE,modules/summary/cravat_summary.mk) +TARGETS += delmh_summary +delmh_summary : + $(call RUN_MAKE,modules/summary/delmh_summary.mk) #================================================== @@ -624,24 +595,28 @@ cravat_summary : #================================================== TARGETS += ann_ext_vcf -ann_ext_vcf: +ann_ext_vcf : $(call RUN_MAKE,modules/vcf_tools/annotateExtVcf.mk) TARGETS += ann_somatic_vcf -ann_somatic_vcf: +ann_somatic_vcf : $(call RUN_MAKE,modules/vcf_tools/annotateSomaticVcf.mk) TARGETS += ann_vcf -ann_vcf: +ann_vcf : $(call RUN_MAKE,modules/vcf_tools/annotateVcf.mk) -TARGETS += cravat_annotation -cravat_annotation : - $(call RUN_MAKE,modules/test/workflows/cravat_annotation.mk) - TARGETS += cravat_annotate cravat_annotate : $(call RUN_MAKE,modules/vcf_tools/cravat_annotation.mk) + +TARGETS += cravat_summary +cravat_summary : + $(call RUN_MAKE,modules/summary/cravat_summary.mk) + +TARGETS += ann_summary_vcf +ann_summary_vcf : + $(call RUN_MAKE,modules/vcf_tools/annotateSummaryVcf.mk) #================================================== @@ -649,13 +624,9 @@ cravat_annotate : #================================================== TARGETS += hotspot_summary -hotspot_summary: +hotspot_summary : $(MAKE) -f modules/variant_callers/genotypehotspots.mk -j $(NUM_JOBS) $(call RUN_MAKE,modules/summary/hotspotsummary.mk) -#================================================== -# alpha testing -#================================================== - - + .PHONY : $(TARGETS) diff --git a/Makefile.inc b/Makefile.inc index 41f26d88..2a63beec 100644 --- a/Makefile.inc +++ b/Makefile.inc @@ -99,7 +99,13 @@ CREATE_SEQ_DICT = $(JAVA) -Xmx$(PICARD_MEM) -jar $(PICARD_DIR)/CreateSequenceDic CALC_HS_METRICS = $(JAVA) -Xmx$(PICARD_MEM) -jar $(PICARD_DIR)/CalculateHsMetrics.jar $(PICARD_OPTS) COLLECT_MULT_METRICS = $(JAVA) -Xmx$(PICARD_MEM) -jar $(PICARD_DIR)/CollectMultipleMetrics.jar $(PICARD_OPTS) COLLECT_TARGETED_METRICS = $(JAVA) -Xmx$(PICARD_MEM) -jar 
$(PICARD_DIR)/CollectTargetedPcrMetrics.jar $(PICARD_OPTS) - +COLLECT_ALIGNMENT_METRICS = $(PICARD) -Xmx$(PICARD_MEM) CollectAlignmentSummaryMetrics $(PICARD_OPTS) +COLLECT_INSERT_METRICS = $(PICARD) -Xmx$(PICARD_MEM) CollectInsertSizeMetrics $(PICARD_OPTS) +COLLECT_OXOG_METRICS = $(PICARD) -Xmx$(PICARD_MEM) CollectOxoGMetrics $(PICARD_OPTS) +COLLECT_GC_BIAS = $(PICARD) -Xmx$(PICARD_MEM) CollectGcBiasMetrics $(PICARD_OPTS) +COLLECT_WGS_METRICS = $(PICARD) -Xmx$(PICARD_MEM) CollectWgsMetrics $(PICARD_OPTS) +COLLECT_DUP_METRICS = $(PICARD) -Xmx$(PICARD_MEM) CollectDuplicateMetrics $(PICARD_OPTS) +BAM_INDEX = $(PICARD) -Xmx$(PICARD_MEM) BamIndexStats $(PICARD_OPTS) FIX_MATE = $(call FIX_MATE_MEM,$(PICARD_MEM)) FIX_MATE_MEM = $(JAVA) -Xmx$(1) -jar $(PICARD_DIR)/FixMateInformation.jar $(PICARD_OPTS) TMP_DIR=$(TMPDIR) SAM_TO_FASTQ = $(call SAM_TO_FASTQ_MEM,$(PICARD_MEM)) @@ -120,7 +126,7 @@ SNP_EFF_MEM = $(JAVA8) -Xmx$1 -jar $(SNP_EFF_JAR) SNP_SIFT_MEM = $(JAVA8) -Xmx$1 -jar $(SNP_SIFT_JAR) SNP_SIFT = $(call SNP_SIFT_MEM,$(DEFAULT_JAVA_MEM)) VCF_EFF_ONE_PER_LINE = $(HOME)/share/usr/snpEff-4.1/scripts/vcfEffOnePerLine.pl -VCF_JOIN_EFF = modules/scripts/joinEff.pl +VCF_JOIN_EFF = modules/scripts/join_eff.pl COUNT_SAMPLES = $(shell expr `sed 's/\..*//; s:.*/::' <<< $1 | grep -o "_" | wc -l` + 1) diff --git a/README.md b/README.md index df602c2f..aa552e97 100644 --- a/README.md +++ b/README.md @@ -1,61 +1,2 @@ -# jrflab modules -[![Build Status](https://travis-ci.org/cBioPortal/cbioportal.svg?branch=master)](https://travis-ci.org/jrflab/modules) +# modules -## Introduction -This is the implementation of the jrflab pipeline. - -## Installation -The easiest way to download this pipeline is to clone the repository. 
- -``` -git clone https://github.com/jrflab/modules.git -``` - -## Dependencies -- An instance of [anaconda](https://www.anaconda.com) or [miniconda](https://conda.io/en/latest/miniconda.html) -- IMB's Platform Load Sharing Facility (LSF) or Oracle's Sun Grid Engine (SGE) for resource management - -### Following R Packages -- [xxx](https://) - -## Best practices - -### Conventions -- Sample names cannot have "/" or "." in them -- Fastq files end in ".fastq.gz" -- Fastq files are stored in DATA_DIR (Set as Environment Variable) - -### Whole genome, whole exome and targeted sequencing -- QC -- BWA -- Broad Standard Practices on bwa bam -- Haplotype Caller, Platypus, MuTect, Strelka -- snpEff, Annovar, SIFT, pph2, vcf2maf, VEP, OncoKB, ClinVar -- Copy number, tumor purity using Facets -- Contamination using -- HLA Typing - * [xxx](http://) - -### RNA transcriptome sequencing -- QC -- Tophat, STAR -- Cufflinks (ENS and UCSC) -- In-house Exon Expression (ENS and UCSC) -- fusion-catcher, tophat-fusion, deFuse -- OncoFuse actionable fusion classification - -### Patient: -- Genotyping On Patient. 
- 1000g sites are evaluated for every library and then compared (all vs all) - If two libraries come from a patient the match should be pretty good >80% -- Still to develop: - If the match is below a certain threshold, break the pipeline for patient - -## Detailed usage -[wiki](https://github.com/jrflab/modules/wiki) - -## Known issues - -### Known bugs - -### Currently under development diff --git a/aligners/bwamemAligner.mk b/aligners/bwamemAligner.mk index 6cf97dc7..c1e32194 100644 --- a/aligners/bwamemAligner.mk +++ b/aligners/bwamemAligner.mk @@ -21,12 +21,6 @@ BWAMEM_REF_FASTA ?= $(REF_FASTA) BWAMEM_THREADS = 8 BWAMEM_MEM_PER_THREAD = $(if $(findstring true,$(PDX)),4G,2G) -..DUMMY := $(shell mkdir -p version; $(BWA) &> version/bwamem.txt; echo "options: $(BWA_ALN_OPTS)" >> version/bwamem.txt ) -.SECONDARY: -.DELETE_ON_ERROR: -.PHONY: bwamem - - BWA_BAMS = $(foreach sample,$(SAMPLES),bam/$(sample).bam) bwamem : $(BWA_BAMS) $(addsuffix .bai,$(BWA_BAMS)) @@ -50,6 +44,12 @@ bwamem/bam/%.bwamem.bam : fastq/%.fastq.gz fastq/%.fastq.gz : fastq/%.fastq $(call RUN,,"gzip -c $< > $(@) && $(RM) $<") + + +..DUMMY := $(shell mkdir -p version; $(BWA) &> version/bwamem.txt; echo "options: $(BWA_ALN_OPTS)" >> version/bwamem.txt ) +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY: bwamem include modules/bam_tools/processBam.mk include modules/fastq_tools/fastq.mk diff --git a/aligners/tmapAligner.mk b/aligners/tmapAligner.mk index 6e4d415e..42d1493a 100644 --- a/aligners/tmapAligner.mk +++ b/aligners/tmapAligner.mk @@ -5,6 +5,7 @@ include modules/aligners/align.inc ALIGNER := tmap LOGDIR := log/tmap.$(NOW) + SAMTOOLS_SORT_MEM = 2000000000 FASTQ_CHUNKS := 10 diff --git a/bam_tools/fixBam.mk b/bam_tools/fixBam.mk deleted file mode 100644 index 28e39a7b..00000000 --- a/bam_tools/fixBam.mk +++ /dev/null @@ -1,72 +0,0 @@ -include modules/Makefile.inc -include modules/genome_inc/b37.inc - -LOGDIR ?= log/fix_bam.$(NOW) -PHONY += fixed_bam - -VPATH = fixed_bam unprocessed_bam -PICARD_JAR = 
~/share/usr/picard/bin/picard.jar - -fix_bam : $(foreach sample,$(SAMPLES),fixed_bam/$(sample).bam) - -define fix-bam -unprocessed_bam/%.ubam : unprocessed_bam/%.bam - $$(call RUN,-c -n 1 -s 12G -m 18G -w 7200,"java -Djava.io.tmpdir=$(TMPDIR) -Xmx16G -jar $$(PICARD_JAR) RevertSam \ - I=$$(<) \ - O=unprocessed_bam/$$(*).ubam \ - SANITIZE=true \ - MAX_DISCARD_FRACTION=0.005 \ - ATTRIBUTE_TO_CLEAR=XT \ - ATTRIBUTE_TO_CLEAR=XN \ - ATTRIBUTE_TO_CLEAR=AS \ - ATTRIBUTE_TO_CLEAR=OC \ - ATTRIBUTE_TO_CLEAR=OP \ - SORT_ORDER=queryname \ - RESTORE_ORIGINAL_QUALITIES=true \ - REMOVE_DUPLICATE_INFORMATION=true \ - REMOVE_ALIGNMENT_INFORMATION=true \ - TMP_DIR=$(TMPDIR)") -unprocessed_bam/%.fixed.bam : unprocessed_bam/%.ubam - $$(call RUN, -c -n 1 -s 12G -m 18G -w 7200,"java -Djava.io.tmpdir=$(TMPDIR) -Xmx16G -jar $$(PICARD_JAR) MergeBamAlignment \ - R=$$(DMP_FASTA) \ - UNMAPPED_BAM=$$(<) \ - ALIGNED_BAM=unprocessed_bam/$$(*).bam \ - O=unprocessed_bam/$$(*).fixed.bam \ - CREATE_INDEX=true \ - ADD_MATE_CIGAR=true \ - CLIP_ADAPTERS=true \ - CLIP_OVERLAPPING_READS=true \ - INCLUDE_SECONDARY_ALIGNMENTS=false \ - MAX_INSERTIONS_OR_DELETIONS=-1 \ - TMP_DIR=$(TMPDIR)") -unprocessed_bam/%.dedup.bam : unprocessed_bam/%.fixed.bam - $$(call RUN, -c -n 1 -s 12G -m 18G -w 7200,"java -Djava.io.tmpdir=$$(TMPDIR) -Xmx16G -jar $$(PICARD_JAR) MarkDuplicates \ - I=$$(<) \ - O=unprocessed_bam/$$(*).dedup.bam \ - M=unprocessed_bam/$$(*).txt \ - TMP_DIR=$$(TMPDIR)") -fixed_bam/%.bam : unprocessed_bam/%.dedup.bam - $$(call RUN, -c -n 1 -s 12G -m 18G -w 7200,"java -Djava.io.tmpdir=$(TMPDIR) -Xmx16G -jar $$(PICARD_JAR) AddOrReplaceReadGroups \ - I=$$(<) \ - O=fixed_bam/$$(*).bam \ - RGID=$$(*) \ - RGLB=$$(*) \ - RGPL=illumina \ - RGPU=NA \ - RGSM=$$(*) \ - TMP_DIR=$(TMPDIR) && \ - samtools index fixed_bam/$$(*).bam && \ - cp fixed_bam/$$(*).bam.bai fixed_bam/$$(*).bai && \ - rm -rf unprocessed_bam/$$(*).ubam && \ - rm -rf unprocessed_bam/$$(*).fixed.bam && \ - rm -rf unprocessed_bam/$$(*).dedup.bam && \ 
- rm -rf unprocessed_bam/$$(*).fixed.bai && \ - rm -rf unprocessed_bam/$$(*).dedup.bai && \ - rm -rf unprocessed_bam/$$(*).txt") -endef - $(foreach sample,$(SAMPLES),\ - $(eval $(call fix-bam,$(sample)))) - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: $(PHONY) diff --git a/bam_tools/fix_bam.mk b/bam_tools/fix_bam.mk new file mode 100644 index 00000000..3e6ce446 --- /dev/null +++ b/bam_tools/fix_bam.mk @@ -0,0 +1,75 @@ +include modules/Makefile.inc +include modules/genome_inc/b37.inc + +LOGDIR = log/fix_bam.$(NOW) + +PICARD_JAR = ~/share/usr/picard/bin/picard.jar + +fix_bam : $(foreach sample,$(SAMPLES),fixed_bam/$(sample).bam) + +define fix-bam +unprocessed_bam/$1.ubam : unprocessed_bam/$1.bam + $$(call RUN,-c -n 1 -s 12G -m 18G -w 72:00:00,"java -Djava.io.tmpdir=$(TMPDIR) -Xmx16G -jar $$(PICARD_JAR) RevertSam \ + I=$$(<) \ + O=$$(@) \ + SANITIZE=true \ + MAX_DISCARD_FRACTION=0.005 \ + ATTRIBUTE_TO_CLEAR=XT \ + ATTRIBUTE_TO_CLEAR=XN \ + ATTRIBUTE_TO_CLEAR=AS \ + ATTRIBUTE_TO_CLEAR=OC \ + ATTRIBUTE_TO_CLEAR=OP \ + SORT_ORDER=queryname \ + RESTORE_ORIGINAL_QUALITIES=true \ + REMOVE_DUPLICATE_INFORMATION=true \ + REMOVE_ALIGNMENT_INFORMATION=true \ + TMP_DIR=$(TMPDIR)") + +unprocessed_bam/$1.fixed.bam : unprocessed_bam/$1.bam unprocessed_bam/$1.ubam + $$(call RUN, -c -n 1 -s 12G -m 18G -w 72:00:00,"java -Djava.io.tmpdir=$(TMPDIR) -Xmx16G -jar $$(PICARD_JAR) MergeBamAlignment \ + R=$$(DMP_FASTA) \ + ALIGNED_BAM=$$(<) \ + UNMAPPED_BAM=$$(<<) \ + O=$$(@) \ + CREATE_INDEX=true \ + ADD_MATE_CIGAR=true \ + CLIP_ADAPTERS=true \ + CLIP_OVERLAPPING_READS=true \ + INCLUDE_SECONDARY_ALIGNMENTS=false \ + MAX_INSERTIONS_OR_DELETIONS=-1 \ + TMP_DIR=$(TMPDIR)") + +unprocessed_bam/$1.dedup.bam : unprocessed_bam/$1.fixed.bam + $$(call RUN, -c -n 1 -s 12G -m 18G -w 72:00:00,"java -Djava.io.tmpdir=$$(TMPDIR) -Xmx16G -jar $$(PICARD_JAR) MarkDuplicates \ + I=$$(<) \ + O=$$(@) \ + M=unprocessed_bam/$1.txt \ + TMP_DIR=$$(TMPDIR)") + +fixed_bam/$1.bam : unprocessed_bam/$1.dedup.bam + $$(call 
RUN, -c -n 1 -s 12G -m 18G -w 72:00:00,"java -Djava.io.tmpdir=$(TMPDIR) -Xmx16G -jar $$(PICARD_JAR) AddOrReplaceReadGroups \ + I=$$(<) \ + O=$$(@) \ + RGID=$1 \ + RGLB=$1 \ + RGPL=illumina \ + RGPU=NA \ + RGSM=$1 \ + TMP_DIR=$(TMPDIR) && \ + samtools index $$(@) && \ + cp fixed_bam/$1.bam.bai fixed_bam/$1.bai && \ + rm -rf unprocessed_bam/$1.ubam && \ + rm -rf unprocessed_bam/$1.fixed.bam && \ + rm -rf unprocessed_bam/$1.dedup.bam && \ + rm -rf unprocessed_bam/$1.fixed.bai && \ + rm -rf unprocessed_bam/$1.dedup.bai && \ + rm -rf unprocessed_bam/$1.txt") +endef + $(foreach sample,$(SAMPLES),\ + $(eval $(call fix-bam,$(sample)))) + +..DUMMY := $(shell mkdir -p version; \ + echo "picard" > version/fix_bam.txt) +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY: fix_bam diff --git a/bam_tools/fixMate.mk b/bam_tools/fix_mate.mk similarity index 100% rename from bam_tools/fixMate.mk rename to bam_tools/fix_mate.mk diff --git a/bam_tools/fixRG.mk b/bam_tools/fix_rg.mk similarity index 91% rename from bam_tools/fixRG.mk rename to bam_tools/fix_rg.mk index d957c9ad..0e7735f8 100644 --- a/bam_tools/fixRG.mk +++ b/bam_tools/fix_rg.mk @@ -2,13 +2,13 @@ include modules/Makefile.inc include modules/variant_callers/gatk.inc include modules/aligners/align.inc -LOGDIR ?= log/fixRG.$(NOW) +LOGDIR ?= log/fix_rg.$(NOW) BAMS = $(foreach sample,$(SAMPLES),bam/$(sample).bam) + fixed_bams : $(BAMS) $(addsuffix .bai,$(BAMS)) bam/%.bam : unprocessed_bam/%.rg.bam $(INIT) ln -f $(<) $(@) - include modules/bam_tools/processBam.mk diff --git a/bam_tools/get_bam_data_mirror.mk b/bam_tools/get_bam_data_mirror.mk new file mode 100644 index 00000000..1f616d50 --- /dev/null +++ b/bam_tools/get_bam_data_mirror.mk @@ -0,0 +1,36 @@ +include modules/Makefile.inc + +LOGDIR = log/getbam_data_mirror.$(NOW) + +get_bam : $(foreach sample,$(SAMPLES),bam/$(sample).bam) \ + $(foreach sample,$(SAMPLES),bam/$(sample).bam.bai) \ + $(foreach sample,$(SAMPLES),bam/$(sample).bai) + +PROJECT_NAME = $(shell basename $(PWD)) + 
+define get-bam +bam/$1.bam : + $$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \ + rsync -aP -e ssh $(USER)@lilac-xfer01.mskcc.org:/oscar/warm/reis-filho/by_user/$(USER)/$(PROJECT_NAME)/$1.bam \ + bam/") + +bam/$1.bam.bai : + $$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \ + rsync -aP -e ssh $(USER)@lilac-xfer01.mskcc.org:/oscar/warm/reis-filho/by_user/$(USER)/$(PROJECT_NAME)/$1.bam.bai \ + bam/") + +bam/$1.bai : + $$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \ + rsync -aP -e ssh $(USER)@lilac-xfer01.mskcc.org:/oscar/warm/reis-filho/by_user/$(USER)/$(PROJECT_NAME)/$1.bai \ + bam/") + + +endef + $(foreach sample,$(SAMPLES),\ + $(eval $(call get-bam,$(sample)))) + +..DUMMY := $(shell mkdir -p version; \ + which scp > version/getbam_data_mirror.txt) +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY: get_bam \ No newline at end of file diff --git a/bam_tools/get_bam_irb_mirror.mk b/bam_tools/get_bam_irb_mirror.mk new file mode 100644 index 00000000..02a78b4b --- /dev/null +++ b/bam_tools/get_bam_irb_mirror.mk @@ -0,0 +1,32 @@ +include modules/Makefile.inc + +LOGDIR = log/getbam_irb_mirror.$(NOW) + +get_bam : $(foreach sample,$(SAMPLES),bam/$(sample).bam) \ + $(foreach sample,$(SAMPLES),bam/$(sample).bam.bai) \ + $(foreach sample,$(SAMPLES),bam/$(sample).bai) + +define get-bam +bam/$1.bam : + $$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \ + scp $(USER)@juno-xfer01.mskcc.org:/juno/dmp/share/irb12_245/`echo $1 | cut -c 1-1`/`echo $1 | cut -c 2-2`/$1.bam \ + bam/") + +bam/$1.bam.bai : bam/$1.bam + $$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \ + $(SAMTOOLS) index $$(<)") + +bam/$1.bai : bam/$1.bam bam/$1.bam.bai + $$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \ + cp $$(<<) $$(@)") + + +endef + $(foreach sample,$(SAMPLES),\ + $(eval $(call get-bam,$(sample)))) + +..DUMMY := $(shell mkdir -p version; \ + which scp > version/getbam_irb_mirror.txt) +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY: get_bam \ No newline at end of file diff --git 
a/bam_tools/mergeBam.mk b/bam_tools/merge_bam.mk similarity index 100% rename from bam_tools/mergeBam.mk rename to bam_tools/merge_bam.mk index bfaeedb8..68eda7b0 100644 --- a/bam_tools/mergeBam.mk +++ b/bam_tools/merge_bam.mk @@ -2,10 +2,6 @@ include modules/Makefile.inc LOGDIR = log/merge.$(NOW) -.SECONDARY: -.DELETE_ON_ERROR: -.PHONY : merged_bam - merged_bam : $(foreach sample,$(MERGE_SAMPLES),bam/$(sample).bam bam/$(sample).bam.bai) define merged-bam @@ -32,4 +28,8 @@ $(foreach sample,$(MERGE_SAMPLES),\ bam/%.bam : merged_bam/%.rg.bam $(INIT) ln -f $< $@ +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY : merged_bam + include modules/bam_tools/processBam.mk diff --git a/bam_tools/put_bam_data_mirror.mk b/bam_tools/put_bam_data_mirror.mk new file mode 100644 index 00000000..8bc19bdc --- /dev/null +++ b/bam_tools/put_bam_data_mirror.mk @@ -0,0 +1,25 @@ +include modules/Makefile.inc + +LOGDIR = log/putbam_data_mirror.$(NOW) + +put_bam : $(foreach sample,$(SAMPLES),bam/$(sample).taskcomplete) + +PROJECT_NAME = $(shell basename $(PWD)) + +define put-bam +bam/$1.taskcomplete : bam/$1.bam + $$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \ + rsync -aP -e ssh bam/$1.bam $(USER)@lilac-xfer01.mskcc.org:/oscar/warm/reis-filho/by_user/$(USER)/$(PROJECT_NAME)/$1.bam && \ + rsync -aP -e ssh bam/$1.bam.bai $(USER)@lilac-xfer01.mskcc.org:/oscar/warm/reis-filho/by_user/$(USER)/$(PROJECT_NAME)/$1.bam.bai && \ + rsync -aP -e ssh bam/$1.bam.bai $(USER)@lilac-xfer01.mskcc.org:/oscar/warm/reis-filho/by_user/$(USER)/$(PROJECT_NAME)/$1.bai && \ + echo 'finished!' 
> $$(@)") + +endef + $(foreach sample,$(SAMPLES),\ + $(eval $(call put-bam,$(sample)))) + +..DUMMY := $(shell mkdir -p version; \ + which scp > version/putbam_data_mirror.txt) +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY: put_bam \ No newline at end of file diff --git a/clonality/plotpyclone.R b/clonality/plotpyclone.R deleted file mode 100644 index 1648e82c..00000000 --- a/clonality/plotpyclone.R +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("readr")) -suppressPackageStartupMessages(library("dplyr")) -suppressPackageStartupMessages(library("magrittr")) -suppressPackageStartupMessages(library("ggplot2")) - -optList = list( - make_option("--sample_set", default = NULL, help = "sample set name"), - make_option("--normal_samples", default = NULL, help = "normal sample names"), - make_option("--min_depth", default = NA, help = "minimum depth to consider") - ) - -parser = OptionParser(usage = "%prog [options] mutation_file", option_list = optList) -arguments = parse_args(parser, positional_arguments = T) -opt = arguments$options - -tumor_samples = unlist(strsplit(opt$sample_set, split="_", fixed=TRUE)) -normal_sample = unlist(strsplit(opt$normal_samples, split=" ", fixed=TRUE)) -normal_sample = tumor_samples[tumor_samples %in% normal_sample] -tumor_samples = tumor_samples[!(tumor_samples %in% normal_sample)] -min_depth = ifelse(is.na(opt$min_depth) | is.null(opt$min_depth) | opt$min_depth=="" | opt$min_depth==" ", 50, opt$min_depth) - -mutation_summary = read_tsv(file=paste0("sufam/", opt$sample_set, ".tsv")) %>% - mutate(mutation_id = paste0(Gene_Symbol, "_", HGVSp)) -index = apply(mutation_summary[,paste0("DP_", tumor_samples)], 1, function(x) {sum(x>=min_depth)})==length(tumor_samples) -mutation_summary = mutation_summary[index,,drop=FALSE] -pyclone_summary = read_tsv(file=paste0("pyclone/", opt$sample_set, "/report/pyclone.tsv"), col_types = cols(.default = 
col_character())) %>% - type_convert() %>% - full_join(mutation_summary, by="mutation_id") %>% - arrange(cluster_id) %>% - mutate(mutation_type = ifelse(Variant_Caller=="mutect", "SNV", "Indel")) %>% - mutate(nref = nchar(Ref)) %>% - mutate(nalt = nchar(Alt)) %>% - filter(nref<=2 & nalt<=2) - -df = pyclone_summary[,c("mutation_id", "cluster_id", "mutation_type"),drop=FALSE] -for (i in 1:length(tumor_samples)) { - x = pyclone_summary[,tumor_samples[i]] %>% - .[[1]] - c_x = pyclone_summary %>% - .[[paste0("CALL_", tumor_samples[i])]] - m_x = pyclone_summary %>% - .[[paste0("MAF_", tumor_samples[i])]] - x[x<.025 | c_x==0 | m_x<.05] = 0 - df = cbind(df, x) - colnames(df)[i+3] = tumor_samples[i] -} -index = apply(df[,tumor_samples], 1, function(x) {sum(x==0)})==length(tumor_samples) -df = df[!index,,drop=FALSE] -pyclone_summary = pyclone_summary[!index,,drop=FALSE] -index = apply(pyclone_summary[,paste0("DP_", tumor_samples)], 1, function(x) {sum(x>=500)})>=1 -df = df[!index,,drop=FALSE] -pyclone_summary = pyclone_summary[!index,,drop=FALSE] - - -pyclone_summary[,tumor_samples] = df[,tumor_samples] - - -clusters = table(pyclone_summary$cluster_id) -if (any(clusters==1)) { - pyclone_summary = pyclone_summary %>% - filter(!(cluster_id %in% names(clusters)[clusters==1])) -} - -df = pyclone_summary[,c("mutation_id", "cluster_id", "mutation_type"),drop=FALSE] -for (i in 1:length(tumor_samples)) { - x = pyclone_summary[,tumor_samples[i]] %>% - .[[1]] - df = cbind(df, x) - colnames(df)[i+3] = tumor_samples[i] -} - - -pdf(file=paste0("pyclone/", opt$sample_set, "/report/pyclone.pdf"), width=6.5, height=6) -for (i in 1:(length(tumor_samples)-1)) { - for (j in (i+1):length(tumor_samples)) { - x = df[,tumor_samples[i]] - y = df[,tumor_samples[j]] - z1 = df[,"cluster_id"] - z2 = df[,"mutation_type"] - tmp.0 = data_frame(x=x, y=y, z1=factor(z1, ordered=TRUE), z2=z2) - plot.0 = ggplot(tmp.0, aes(x=x, y=y, fill=z1, color=z1, shape=z2)) + - geom_point(alpha = .55, size=2.5) + - 
theme_classic() + - coord_cartesian(xlim=c(0,1), ylim=c(0,1)) + - theme(axis.text.y = element_text(size=15), axis.text.x = element_text(size=15), legend.text=element_text(size=9), legend.title=element_text(size=10), legend.background = element_blank(), legend.key.size = unit(1, 'lines')) + - labs(x=paste0("\n",tumor_samples[i],"\n"), y=paste0("\n",tumor_samples[j],"\n")) + - guides(color=guide_legend(title=c("Cluster")), shape=guide_legend(title=c("Type"))) + - guides(fill=FALSE) - print(plot.0) - } -} -dev.off() - -write_tsv(pyclone_summary, path=paste0("pyclone/", opt$sample_set, "/report/summary.tsv")) diff --git a/clonality/plotpyclone.mk b/clonality/plotpyclone.mk deleted file mode 100644 index 20abed4c..00000000 --- a/clonality/plotpyclone.mk +++ /dev/null @@ -1,15 +0,0 @@ -include modules/Makefile.inc -include modules/clonality/setuppyclone.mk - -LOGDIR ?= log/plot_pyclone.$(NOW) -PHONY += pyclone - -plot_pyclone : $(foreach set,$(SAMPLE_SETS),pyclone/$(set)/report/pyclone.pdf) - -define plot-pyclone -pyclone/%/report/pyclone.pdf : pyclone/%/report/pyclone.tsv - $$(call RUN,-s 4G -m 6G -w 7200,"$(RSCRIPT) modules/clonality/plotpyclone.R --sample_set $$(*) --normal_samples $(NORMAL_SAMPLES) --min_depth $(MIN_DEPTH)") - -endef -$(foreach set,$(SAMPLE_SETS),\ - $(eval $(call plot-pyclone,$(set)))) diff --git a/clonality/pyclone_13.mk b/clonality/pyclone_13.mk new file mode 100644 index 00000000..7df04299 --- /dev/null +++ b/clonality/pyclone_13.mk @@ -0,0 +1,134 @@ +include modules/Makefile.inc + +LOGDIR ?= log/pyclone_13.$(NOW) + +SUFAM_ENV = $(HOME)/share/usr/anaconda-envs/sufam-dev +SUFAM_OPTS = --mpileup-parameters='-A -q 15 -Q 15 -d 50000' + +MCMC_ITER = 10000 +MCMC_BURNIN = 2000 +MCMC_THIN = 1 + +pyclone : $(foreach sample,$(TUMOR_SAMPLES),pyclone_13/$(sample)/$(sample).vcf) \ + $(foreach sample,$(TUMOR_SAMPLES),pyclone_13/$(sample)/$(sample).txt) \ + $(foreach set,$(SAMPLE_SETS),pyclone_13/$(set)/taskcomplete) \ + $(foreach 
set,$(SAMPLE_SETS),pyclone_13/$(set)/config.yaml) \ + $(foreach set,$(SAMPLE_SETS), \ + $(foreach sample,$(tumors.$(set)),pyclone_13/$(set)/$(sample).yaml)) \ + $(foreach set,$(SAMPLE_SETS),pyclone_13/$(set)/trace/alpha.tsv.bz2) \ + $(foreach set,$(SAMPLE_SETS),pyclone_13/$(set)/summary/by_clusters.txt) \ + $(foreach set,$(SAMPLE_SETS),pyclone_13/$(set)/summary/by_loci.txt) \ + $(foreach set,$(SAMPLE_SETS),pyclone_13/$(set)/summary/scatter_by_sample.pdf) \ + $(foreach set,$(SAMPLE_SETS),pyclone_13/$(set)/summary/heatmap_by_sample.pdf) + + +define r-sufam +pyclone_13/$1/$1.vcf : summary/tsv/all.tsv + $$(call RUN,-c -n 1 -s 4G -m 8G,"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/sufam_gt.R \ + --option 1 \ + --sample_set '$(set.$1)' \ + --normal_sample '$(normal.$1)' \ + --input_file $$(<) \ + --output_file $$(@)") + +pyclone_13/$1/$1.txt : pyclone_13/$1/$1.vcf bam/$1.bam + $$(call RUN,-c -n 1 -s 2G -m 3G -v $(SUFAM_ENV),"set -o pipefail && \ + sufam \ + --sample_name $1 \ + $$(SUFAM_OPTS) \ + $$(REF_FASTA) \ + $$(<) \ + $$(<<) \ + > $$(@)") + +endef +$(foreach sample,$(TUMOR_SAMPLES),\ + $(eval $(call r-sufam,$(sample)))) + +define r-pyclone-input +pyclone_13/$1/taskcomplete : $(foreach sample,$(TUMOR_SAMPLES),pyclone_13/$(sample)/$(sample).txt) + $$(call RUN,-c -n 1 -s 4G -m 8G -v $(PYCLONE_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/pyclone_13.R \ + --option 1 \ + --sample_set $1 \ + --normal_sample '$(normal.$1)' && \ + echo 'success' > $$(@)") + +pyclone_13/$1/config.yaml : $(foreach sample,$(TUMOR_SAMPLES),pyclone_13/$(sample)/$(sample).txt) + $$(call RUN,-c -n 1 -s 4G -m 8G -v $(PYCLONE_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/pyclone_13.R \ + --option 2 \ + --sample_set $1 \ + --normal_sample '$(normal.$1)' \ + --output_file $$(@) \ + --num_iter $$(MCMC_ITER)") + +endef +$(foreach set,$(SAMPLE_SETS),\ + $(eval $(call r-pyclone-input,$(set)))) + +define r-pyclone-build-mutations +pyclone_13/$1/$2.yaml : pyclone_13/$1/taskcomplete 
pyclone_13/$1/config.yaml + $$(call RUN,-c -n 1 -s 4G -m 8G -v $(PYCLONE_13_ENV),"set -o pipefail && \ + PyClone build_mutations_file \ + --in_file pyclone_13/$1/$2.tsv \ + --out_file $$(@) \ + --prior total_copy_number") + +endef +$(foreach set,$(SAMPLE_SETS),\ + $(foreach sample,$(tumors.$(set)),\ + $(eval $(call r-pyclone-build-mutations,$(set),$(sample))))) + +define r-pyclone-run-analysis +pyclone_13/$1/trace/alpha.tsv.bz2 : $(foreach sample,$(tumors.$1),pyclone_13/$1/$(sample).yaml) pyclone_13/$1/config.yaml + $$(call RUN,-c -n 1 -s 8G -m 16G -v $(PYCLONE_13_ENV) -w 72:00:00,"set -o pipefail && \ + PyClone run_analysis \ + --config_file pyclone_13/$1/config.yaml") + +pyclone_13/$1/summary/by_clusters.txt : pyclone_13/$1/trace/alpha.tsv.bz2 pyclone_13/$1/config.yaml + $$(call RUN,-c -n 1 -s 8G -m 16G -v $(PYCLONE_13_ENV),"set -o pipefail && \ + PyClone build_table \ + --config_file $$(<<) \ + --out_file $$(@) \ + --table_type cluster \ + --burnin $$(MCMC_BURNIN) \ + --thin $$(MCMC_THIN)") + +pyclone_13/$1/summary/by_loci.txt : pyclone_13/$1/trace/alpha.tsv.bz2 pyclone_13/$1/config.yaml + $$(call RUN,-c -n 1 -s 8G -m 16G -v $(PYCLONE_13_ENV),"set -o pipefail && \ + PyClone build_table \ + --config_file $$(<<) \ + --out_file $$(@) \ + --table_type loci \ + --burnin $$(MCMC_BURNIN) \ + --thin $$(MCMC_THIN)") + +pyclone_13/$1/summary/scatter_by_sample.pdf : pyclone_13/$1/summary/by_loci.txt pyclone_13/$1/summary/by_clusters.txt + $$(call RUN,-c -n 1 -s 8G -m 12G -v $(PYCLONE_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/pyclone_13.R \ + --option 3 \ + --sample_set '$(tumors.$1)' \ + --input_file $$(<) \ + --output_file $$(@)") + +pyclone_13/$1/summary/heatmap_by_sample.pdf : pyclone_13/$1/summary/by_loci.txt pyclone_13/$1/summary/by_clusters.txt + $$(call RUN,-c -n 1 -s 8G -m 12G -v $(PYCLONE_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/pyclone_13.R \ + --option 4 \ + --sample_set '$(tumors.$1)' \ + --input_file $$(<) \ + --output_file $$(@)") + 
+ +endef +$(foreach set,$(SAMPLE_SETS),\ + $(eval $(call r-pyclone-run-analysis,$(set)))) + + +..DUMMY := $(shell mkdir -p version; \ + R --version > version/pyclone_13.txt) +.DELETE_ON_ERROR: +.SECONDARY: +.PHONY: pyclone diff --git a/clonality/pyclone_vi.mk b/clonality/pyclone_vi.mk new file mode 100644 index 00000000..746908b7 --- /dev/null +++ b/clonality/pyclone_vi.mk @@ -0,0 +1,114 @@ +include modules/Makefile.inc + +LOGDIR ?= log/pyclone_vi.$(NOW) + +SUFAM_ENV = $(HOME)/share/usr/anaconda-envs/sufam-dev +SUFAM_OPTS = --mpileup-parameters='-A -q 15 -Q 15 -d 50000' + +pyclone : $(foreach sample,$(TUMOR_SAMPLES),pyclone_vi/$(sample)/$(sample).vcf) \ + $(foreach sample,$(TUMOR_SAMPLES),pyclone_vi/$(sample)/$(sample).txt) \ + $(foreach set,$(SAMPLE_SETS),pyclone_vi/$(set)/$(set).tsv) \ + $(foreach set,$(SAMPLE_SETS),pyclone_vi/$(set)/$(set).hd5) \ + $(foreach set,$(SAMPLE_SETS),pyclone_vi/$(set)/summary/by_loci.txt) \ + $(foreach set,$(SAMPLE_SETS),pyclone_vi/$(set)/summary/scatter_by_sample.pdf) \ + $(foreach set,$(SAMPLE_SETS),pyclone_vi/$(set)/summary/heatmap_by_sample.pdf) \ + pyclone_vi/summary.txt + + +define r-sufam +pyclone_vi/$1/$1.vcf : summary/tsv/all.tsv + $$(call RUN,-c -n 1 -s 4G -m 8G -v $(INNOVATION_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/sufam_gt.R \ + --option 1 \ + --sample_set '$(set.$1)' \ + --normal_sample '$(normal.$1)' \ + --input_file $$(<) \ + --output_file $$(@)") + +pyclone_vi/$1/$1.txt : pyclone_vi/$1/$1.vcf bam/$1.bam + $$(call RUN,-c -n 1 -s 2G -m 3G -v $(SUFAM_ENV),"set -o pipefail && \ + sufam \ + --sample_name $1 \ + $$(SUFAM_OPTS) \ + $$(REF_FASTA) \ + $$(<) \ + $$(<<) \ + > $$(@)") + +pyclone_vi/$1/$1.maf : pyclone_vi/$1/$1.vcf + $$(call RUN,-c -n 12 -s 1G -m 2G -v $(VEP_ENV),"set -o pipefail && \ + $$(VCF2MAF) \ + --input-vcf $$< \ + --tumor-id $1 \ + --filter-vcf $$(EXAC_NONTCGA) \ + --ref-fasta $$(REF_FASTA) \ + --vep-path $$(VEP_PATH) \ + --vep-data $$(VEP_DATA) \ + --tmp-dir `mktemp -d` \ + --output-maf 
$$(@)") + +endef +$(foreach sample,$(TUMOR_SAMPLES),\ + $(eval $(call r-sufam,$(sample)))) + +define r-pyclone +pyclone_vi/$1/$1.tsv : $(foreach sample,$(TUMOR_SAMPLES),pyclone_vi/$(sample)/$(sample).txt) + $$(call RUN,-c -n 1 -s 4G -m 8G -v $(PYCLONE_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/pyclone_vi.R \ + --option 1 \ + --sample_set $1 \ + --normal_sample '$(normal.$1)' \ + --output_file $$(@)") + +pyclone_vi/$1/$1.hd5 : pyclone_vi/$1/$1.tsv + $$(call RUN,-c -n 1 -s 12G -m 24G -v $(PYCLONE_ENV) -w 72:00:00,"set -o pipefail && \ + pyclone-vi fit \ + --in-file $$(<) \ + --out-file $$(@) \ + --num-clusters 10 \ + --density beta-binomial \ + --num-grid-points 100 \ + --max-iters 1000000 \ + --mix-weight-prior 1 \ + --precision 500 \ + --num-restarts 100") + +pyclone_vi/$1/summary/by_loci.txt : pyclone_vi/$1/$1.hd5 + $$(call RUN,-c -n 1 -s 8G -m 12G -v $(PYCLONE_ENV),"set -o pipefail && \ + pyclone-vi write-results-file \ + --in-file $$(<) \ + --out-file $$(@)") + +pyclone_vi/$1/summary/scatter_by_sample.pdf : pyclone_vi/$1/summary/by_loci.txt + $$(call RUN,-c -n 1 -s 8G -m 12G -v $(PYCLONE_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/pyclone_vi.R \ + --option 2 \ + --sample_set '$(tumors.$1)' \ + --input_file $$(<) \ + --output_file $$(@)") + +pyclone_vi/$1/summary/heatmap_by_sample.pdf : pyclone_vi/$1/summary/by_loci.txt + $$(call RUN,-c -n 1 -s 8G -m 12G -v $(PYCLONE_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/pyclone_vi.R \ + --option 3 \ + --sample_set '$(tumors.$1)' \ + --input_file $$(<) \ + --output_file $$(@)") + +endef +$(foreach set,$(SAMPLE_SETS),\ + $(eval $(call r-pyclone,$(set)))) + + +pyclone_vi/summary.txt : $(foreach set,$(SAMPLE_SETS),pyclone_vi/$(set)/summary/by_loci.txt) + $(call RUN, -c -n 1 -s 8G -m 12G -v $(PYCLONE_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/pyclone_vi.R \ + --option 4 \ + --sample_set '$(SAMPLE_SETS)'") + + +..DUMMY := $(shell mkdir -p version; \ + R --version > 
version/pyclone_vi.txt) +.DELETE_ON_ERROR: +.SECONDARY: +.PHONY: pyclone diff --git a/clonality/pycloneconfig.R b/clonality/pycloneconfig.R deleted file mode 100644 index 110c8866..00000000 --- a/clonality/pycloneconfig.R +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) - -optList = list(make_option("--sample_set", default = NULL, help = "sample set name"), - make_option("--normal_samples", default = NULL, help = "normal sample names")) - -parser = OptionParser(usage = "%prog [options] mutation_file", option_list = optList) -arguments = parse_args(parser, positional_arguments = T) -opt = arguments$options - -tumor_samples = unlist(strsplit(opt$sample_set, split="_", fixed=TRUE)) -normal_sample = unlist(strsplit(opt$normal_samples, split=" ", fixed=TRUE)) -normal_sample = tumor_samples[tumor_samples %in% normal_sample] -tumor_samples = tumor_samples[!(tumor_samples %in% normal_sample)] - -cat("num_iters: 10000\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = FALSE) -cat("\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat("base_measure_params:\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat(" alpha: 1\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat(" beta: 1\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat("concentration:\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat(" value: 1.0\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat(" prior:\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat(" shape: 1.0\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat(" rate: 0.001\n", 
file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat("density: pyclone_beta_binomial\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat("beta_binomial_precision_params:\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat(" value: 1000\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat(" prior:\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat(" shape: 1.0\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat(" rate: 0.0001\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat(" proposal:\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat(" precision: 0.5\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat(paste0("working_dir: pyclone/",opt$sample_set, "\n"), file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat("trace_dir: trace", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat("init_method: connected\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) -cat("samples:\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) - -for (i in 1:length(tumor_samples)) { - if 
(i!=1) { - cat("\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) - } - cat(paste0(" ", tumor_samples[i], ":\n"), file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) - cat(paste0(" mutations_file: ", tumor_samples[i], ".yaml\n"), file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) - cat("\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) - cat(" tumour_content:\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) - load(paste0("ascat/ascat/", tumor_samples[i], "_", normal_sample, ".RData")) - cat(paste0(" value: ", ifelse(is.na(purity), 1.0, signif(purity, 2)),"\n"), file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) - cat("\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) - cat(" error_rate: 0.01", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) - if (i!=length(tumor_samples)) { - cat("\n", file=paste0("pyclone/", opt$sample_set, "/config.yaml"), append = TRUE) - } -} - -for (i in 1:length(tumor_samples)) { - system(paste0("source ~/share/usr/anaconda-envs/jrflab-modules-0.1.5/bin/activate ~/share/usr/anaconda-envs/PyClone-0.13.1 && PyClone build_mutations_file --in_file pyclone/", opt$sample_set, "/", tumor_samples[i], ".tsv --out_file pyclone/", opt$sample_set, "/", tumor_samples[i], ".yaml --prior parental_copy_number")) -} diff --git a/clonality/runpyclone.mk b/clonality/runpyclone.mk deleted file mode 100644 index 0c86ddfa..00000000 --- a/clonality/runpyclone.mk +++ /dev/null @@ -1,20 +0,0 @@ -include modules/Makefile.inc - -LOGDIR ?= log/run_pyclone.$(NOW) -PHONY += pyclone - -run_pyclone : $(foreach set,$(SAMPLE_SETS),pyclone/$(set)/report/pyclone.tsv) - -define run-pyclone -pyclone/%/trace/alpha.tsv.bz2 : pyclone/%/config.yaml - $$(call RUN,-s 4G -m 6G -w 7200,"source /home/${USER}/share/usr/anaconda-envs/jrflab-modules-0.1.5/bin/activate 
/home/${USER}/share/usr/anaconda-envs/PyClone-0.13.1 && \ - PyClone run_analysis --config_file pyclone/$$*/config.yaml --seed 0") - -pyclone/%/report/pyclone.tsv : pyclone/%/trace/alpha.tsv.bz2 - $$(call RUN,-s 4G -m 6G -w 7200,"make -p pyclone/$$*/report && \ - source /home/${USER}/share/usr/anaconda-envs/jrflab-modules-0.1.5/bin/activate /home/${USER}/share/usr/anaconda-envs/PyClone-0.13.1 && \ - PyClone build_table --config_file pyclone/$$*/config.yaml --out_file pyclone/$$*/report/pyclone.tsv --max_cluster 10 --table_type old_style --burnin 5000") - -endef -$(foreach set,$(SAMPLE_SETS),\ - $(eval $(call run-pyclone,$(set)))) diff --git a/clonality/setuppyclone.mk b/clonality/setuppyclone.mk deleted file mode 100644 index ebd2f8f9..00000000 --- a/clonality/setuppyclone.mk +++ /dev/null @@ -1,22 +0,0 @@ -include modules/Makefile.inc - -LOGDIR ?= log/setup_pyclone.$(NOW) -PHONY += pyclone - -MIN_DEPTH ?= 50 - -setup_pyclone : $(foreach set,$(SAMPLE_SETS),pyclone/$(set)/config.yaml) - -define make-input-pyclone -pyclone/%/config.yaml : sufam/%.tsv - $$(call RUN, -s 4G -m 6G,"mkdir -p pyclone/$$(*) && \ - $(RSCRIPT) modules/clonality/tsvforpyclone.R --sample_set $$(*) --normal_samples $(NORMAL_SAMPLES) --min_depth $(MIN_DEPTH) && \ - $(RSCRIPT) modules/clonality/pycloneconfig.R --sample_set $$(*) --normal_samples $(NORMAL_SAMPLES)") - -endef -$(foreach set,$(SAMPLE_SETS),\ - $(eval $(call make-input-pyclone,$(set)))) - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: $(PHONY) diff --git a/clonality/tsvforpyclone.R b/clonality/tsvforpyclone.R deleted file mode 100644 index e6bc7be6..00000000 --- a/clonality/tsvforpyclone.R +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("readr")) -suppressPackageStartupMessages(library("dplyr")) -suppressPackageStartupMessages(library("magrittr")) - -optList = list( - make_option("--sample_set", default = NULL, help = "sample set name"), - 
make_option("--normal_samples", default = NULL, help = "normal sample names"), - make_option("--min_depth", default = NA, help = "minimum depth to consider") - ) - -parser = OptionParser(usage = "%prog [options] mutation_file", option_list = optList) -arguments = parse_args(parser, positional_arguments = T) -opt = arguments$options - -tumor_samples = unlist(strsplit(opt$sample_set, split="_", fixed=TRUE)) -normal_sample = unlist(strsplit(opt$normal_samples, split=" ", fixed=TRUE)) -normal_sample = tumor_samples[tumor_samples %in% normal_sample] -tumor_samples = tumor_samples[!(tumor_samples %in% normal_sample)] -min_depth = ifelse(is.na(opt$min_depth) | is.null(opt$min_depth) | opt$min_depth=="" | opt$min_depth==" ", 50, opt$min_depth) - -mutation_summary = read_tsv(file=paste0("sufam/", opt$sample_set, ".tsv")) -index = apply(mutation_summary[,paste0("DP_", tumor_samples)], 1, function(x) {sum(x>=min_depth)})==length(tumor_samples) -mutation_summary = mutation_summary[index,,drop=FALSE] -index = apply(mutation_summary[,paste0("CALL_", tumor_samples)], 1, function(x) {sum(x==0)})==length(tumor_samples) -mutation_summary = mutation_summary[!index,,drop=FALSE] - -for (i in 1:length(tumor_samples)) { - mutation_id = paste0(mutation_summary$Gene_Symbol, "_", mutation_summary$HGVSp) - fsq = mutation_summary %>% - .[[paste0("MAF_", tumor_samples[i])]] - qt = mutation_summary %>% - .[[paste0("qt_", tumor_samples[i])]] - q2 = mutation_summary %>% - .[[paste0("q2_", tumor_samples[i])]] - q1 = qt - q2 - n = mutation_summary %>% - .[[paste0("DP_", tumor_samples[i])]] - flag = mutation_summary %>% - .[[paste0("CALL_", tumor_samples[i])]] - - fsq[flag==0] = 0 - var_counts = round(fsq*n) - ref_counts = round((1-fsq)*n) - normal_cn = rep(2, length(mutation_id)) - major_cn = q2 - minor_cn = q1 - sample_summary = data.frame(mutation_id, ref_counts, var_counts, normal_cn, minor_cn, major_cn) - write.table(sample_summary, paste0("pyclone/", opt$sample_set, "/", tumor_samples[i], 
".tsv"), sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE, append=FALSE) -} diff --git a/config.inc b/config.inc index d1c44fcf..38cb6d34 100644 --- a/config.inc +++ b/config.inc @@ -14,20 +14,39 @@ R ?= R MY_RSCRIPT ?= Rscript RSCRIPT ?= Rscript -# General python 2.7 environment -ANACONDA_27_ENV ?= $(HOME)/share/usr/anaconda-envs/anaconda-2.7 - -# SUFAM python environment -SUFAM_ENV ?= $(HOME)/share/usr/anaconda-envs/sufam-dev - +ANACONDA_27_ENV = $(HOME)/share/usr/anaconda-envs/anaconda-2.7 +SUFAM_ENV = $(HOME)/share/usr/anaconda-envs/sufam-dev MUTSIG_REPORT_ENV = $(HOME)/share/usr/anaconda-envs/mutsig-report-0.0.1 +JRFLAB_MODULES_ENV = $(HOME)/share/usr/anaconda-envs/jrflab-modules-0.1.4 +ONCOTATOR_ENV = $(HOME)/share/usr/venv/oncotator-1.9.2.0 +VEP_ENV = $(HOME)/share/usr/anaconda-envs/variant-effect-predictor-86 +ASCAT_ENV = $(HOME)/share/usr/anaconda-envs/ascat +INNOVATION_ENV = $(HOME)/share/usr/env/innovation-lab-0.0.1 +PIGZ_ENV ?= $(HOME)/share/usr/env/pigz-2.6 +KALLISTO_ENV ?= $(HOME)/share/usr/env/kallisto-0.46.2 +IMMUNE_ENV ?= $(HOME)/share/usr/env/r-immunedeconv-2.1.0 +SUMREADS_ENV = $(HOME)/share/usr/anaconda-envs/jrflab-modules-0.1.6 +PYCLONE_ENV = $(HOME)/share/usr/env/pyclone-vi-0.1.2 +PYCLONE_13_ENV = $(HOME)/share/usr/env/pyclone-0.13.1 +GRIDSS_ENV = $(HOME)/share/usr/env/gridss-2.13.2 +SVABA_ENV ?= $(HOME)/share/usr/env/svaba-1.1.0 +SURVIVOR_ENV ?= $(HOME)/share/usr/env/survivor-1.0.7 +ANNOTATE_SV_ENV ?= $(HOME)/share/usr/env/annot_sv-3.1.3 +VIOLA_ENV = $(HOME)/share/usr/env/viola-sv-1.0.2 +SIGNATURE_TOOLS_ENV = $(HOME)/share/usr/env/r-signature.tools.lib-2.2.0 +CNVKIT_ENV ?= $(HOME)/share/usr/env/cnvkit-0.9.9 +STARFISH_ENV ?= $(HOME)/share/usr/env/r-starfish-0.11 +MEDICC_ENV = $(HOME)/share/usr/env/medicc2-0.8.1 +VARIANT_ANNOTATION_ENV = $(HOME)/share/usr/env/r-variantannotation-1.44.0 +FACETS_SUITE_ENV = $(HOME)/share/usr/env/r-facets-suite-2.0.8 +CRAVAT_ENV = $(HOME)/share/usr/anaconda-envs/open-cravat +POLYSOLVER_ENV = 
$(HOME)/share/usr/anaconda-envs/hla-polysolver +MSISENSOR_ENV = $(HOME)/share/usr/anaconda-envs/msisensor +MIMSI_ENV = $(HOME)/share/usr/env/mimsi-0.4.4 JARDIR ?= $(HOME)/share/usr/lib/java -# jrflab modules environment -JRFLAB_MODULES_ENV ?= $(HOME)/share/usr/anaconda-envs/jrflab-modules-0.1.4 - -### Applications +## Applications UNZIP ?= /usr/bin/unzip FASTQC ?= $(PERL) $(HOME)/share/usr/FastQC/fastqc MUTECT_JAR ?= $(JARDIR)/muTect-1.1.7.jar @@ -38,31 +57,30 @@ SAMTOOLS2 ?= samtools VCFUTILS ?= $(HOME)/share/usr/bin/vcfutils.pl BCFTOOLS2 ?= bcftools BCFTOOLS ?= bcftools +PIGZ ?= pigz BEDTOOLS ?= $(HOME)/share/usr/bin/bedtools BGZIP ?= $(HOME)/share/usr/bin/bgzip IGVTOOLS ?= $(HOME)/share/usr/IGVTools/igvtools VCFTOOLS ?= $(HOME)/share/usr/bin/vcftools-0.1.10 VCF_SORT ?= $(PERL) $(HOME)/share/usr/bin/vcfsorter.pl - SNP_EFF_JAR ?= $(JARDIR)/snpEff-4.3.jar SNP_SIFT_JAR ?= $(JARDIR)/SnpSift-4.3.jar SNP_EFF_CONFIG ?= modules/config/snpEff.conf DB_NSFP ?= $(HOME)/share/reference/snpEff-4.1/dbNSFP3.0b1a.hg19.txt.gz NSFP_FIELDS ?= Uniprot_acc_Polyphen2 Uniprot_id_Polyphen2 Polyphen2_HVAR_score Polyphen2_HVAR_pred 1000Gp3_AF ESP6500_AA_AF ESP6500_EA_AF MutationAssessor_pred MutationAssessor_score MutationTaster_pred MutationTaster_score PROVEAN_pred ExAC_Adj_AF clinvar_rs clinvar_clnsig Interpro_domain - CUFFLINKS ?= cufflinks CUFFCMP ?= cuffcompare TOPHAT ?= tophat DEFUSE ?= $(PERL) $(HOME)/share/usr/defuse-0.6.1/scripts/defuse.pl - ONCOFUSE_JAR ?= $(HOME)/share/usr/oncofuse-1.0.9b2/Oncofuse.jar VARSCAN_JAR ?= $(JARDIR)/VarScan.v2.3.9.jar +MEDICC ?= medicc2 -# PICARD tools +## PICARD tools PICARD_DIR ?= $(JARDIR)/picard-1.92 PICARD_JAR ?= $(JARDIR)/picard-tools-1.141/picard.jar -# scripts +## scripts SCRIPTS_DIR ?= modules/scripts MERGE ?= $(SCRIPTS_DIR)/merge.R VCF_TO_TABLE ?= $(SCRIPTS_DIR)/vcfToTable.R @@ -70,6 +88,9 @@ INTRON_POSN_LOOKUP ?= $(SCRIPTS_DIR)/posnGeneLookup.pl RBIND ?= $(SCRIPTS_DIR)/rbind.R NORMAL_FILTER ?= $(PERL) $(SCRIPTS_DIR)/normalFilterVCF.pl 
SOMATIC_FILTER_VCF ?= $(PERL) $(SCRIPTS_DIR)/somaticFilterVCF.pl +SUM_READS_RSCRIPT = $(RSCRIPT) $(SCRIPTS_DIR)/summarize_rnaseqreads.R +SUM_EXONS_RSCRIPT = $(RSCRIPT) $(SCRIPTS_DIR)/summarize_rnaseqreads_byexon.R +SUM_INTRONS_RSCRIPT = $(RSCRIPT) $(SCRIPTS_DIR)/summarize_rnaseqreads_byintron.R JAVA_BIN ?= $(JAVA8_BIN) JAVA6_BIN ?= $(HOME)/share/usr/jdk1.6.0_45/bin/java @@ -78,7 +99,7 @@ JAVA8_BIN ?= $(HOME)/share/usr/jdk1.8.0_121/bin/java GET_INSERT_SIZE ?= $(HOME)/share/usr/bin/getInsertSize.py -#GATK +## GATK GATK_JAR ?= $(JARDIR)/GenomeAnalysisTK.jar GATK_JAR2 ?= $(JARDIR)/GenomeAnalysisTK-3.7.jar @@ -125,9 +146,6 @@ TVC ?= $(HOME)/share/usr/bin/tvc ANNOVAR = $(PERL) $(HOME)/share/usr/annovar-2017-07-16/table_annovar.pl -ONCOTATOR_ENV = $(HOME)/share/usr/venv/oncotator-1.9.2.0 - -VEP_ENV = $(HOME)/share/usr/anaconda-envs/variant-effect-predictor-86 VEP_PATH = $(VEP_ENV)/bin SPLIT_BED = python modules/scripts/split_bed.py @@ -137,35 +155,8 @@ SNP_FILTER_VCF = python modules/vcf_tools/snp_filter_vcf.py MERGE_VCF = python modules/vcf_tools/merge_vcf.py -ASCAT_ENV = $(HOME)/share/usr/anaconda-envs/ascat - -MEDICC_ENV = $(HOME)/share/usr/anaconda-envs/medicc -MEDICC_VAR = $(MEDICC_ENV)/PROFILE -MEDICC_BIN = $(MEDICC_ENV)/opt/medicc - -PHYLO_ENV = $(HOME)/share/usr/anaconda-envs/phylotools - -CNTILP_ENV = $(HOME)/share/usr/anaconda-envs/cnt-ilp -CNTILP_CONF = $(CNTILP_ENV)/PROFILE -CNTILP_BIN = $(CNTILP_ENV)/opt/CNT-ILP/build - -CNTMD_ENV = $(HOME)/share/usr/anaconda-envs/cnt-md -CNTMD_CONF = $(CNTMD_ENV)/PROFILE -CNTMD_BIN = $(CNTMD_ENV)/opt/CNT-MD/build - -MACHINA_ENV = $(HOME)/share/usr/anaconda-envs/machina -MACHINA_VAR = $(MACHINA_ENV)/PROFILE -MACHINA_BIN = $(MACHINA_ENV)/opt/machina/build - -HATCHET_ENV = $(HOME)/share/usr/anaconda-envs/hatchet -HATCHET_VAR = $(HATCHET_ENV)/PROFILE -HATCHET_BIN = $(HATCHET_ENV)/opt/machina/build - -DECONSTRUCTSIGS_ENV = $(HOME)/share/usr/anaconda-envs/deconstructsigs - -PHANGORN_ENV = $(HOME)/share/usr/anaconda-envs/phangorn - 
-FGBIO_ENV = $(HOME)/share/usr/anaconda-envs/fgbio-0.8.1 +# gbc command line +GBC ?= $(HOME)/share/usr/GetBaseCounts/GetBaseCounts endif CONFIG_INC = true diff --git a/contamination/clusterSampleVcf.R b/contamination/clusterSampleVcf.R index ca114d7e..35ee71cc 100644 --- a/contamination/clusterSampleVcf.R +++ b/contamination/clusterSampleVcf.R @@ -2,51 +2,88 @@ suppressPackageStartupMessages(library("optparse")) suppressPackageStartupMessages(library("VariantAnnotation")) -suppressPackageStartupMessages(library("gplots")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("ggplot2")) +suppressPackageStartupMessages(library("ComplexHeatmap")) +suppressPackageStartupMessages(library("RColorBrewer")) -options(error = quote(dump.frames("testdump", TRUE))) - -optList <- list( - make_option("--genome", default = 'b37', help = "genome build [default %default]"), - make_option("--outPrefix", default = NULL, help = "output prefix [default %default]")) - -parser <- OptionParser(usage = "%prog vcf.files", option_list = optList); -arguments <- parse_args(parser, positional_arguments = T); -opt <- arguments$options; - -if (is.null(opt$outPrefix)) { - cat("Need output prefix\n"); - print_help(parser); - stop(); -} else if (length(arguments$args) < 1) { - cat("Need vcf files\n"); - print_help(parser); - stop(); +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) } -vcfFile <- arguments$args[1] +optList <- list(make_option("--input_file", default = 'snp_vcf/snps_ft.vcf', help = "input file"), + make_option("--output_file", default = 'snp_vcf/snps_ft.pdf', help = "output file"), + make_option("--sample_pairs", default = NA, type = 'character', help = "sample pairs"), + make_option("--genome", default = 'b37', help = "genome build")) +parser <- OptionParser(usage = "%prog vcf.files", option_list = optList) +arguments <- parse_args(parser, 
positional_arguments = T) +opt <- arguments$options -vcf <- readVcf(vcfFile, opt$genome) -gt <- geno(vcf)$GT -ad <- geno(vcf)$AD -af <- structure(sapply(ad, function(x) x[2] / sum(x)), dim = dim(ad)) -X <- matrix(0, nrow = nrow(gt), ncol = ncol(gt), dimnames = list(rownames(gt), colnames(gt))) -X[is.na(af)] <- NA -X[af > 0.15 & af < 0.95] <- 1 -X[af >= 0.95] <- 2 -X[!gt %in% c("0/0", "0/1", "1/1")] <- NA -#plot(hclust(dist(t(X), method = 'manhattan'))) +vcf = readVcf(as.character(opt$input_file), as.character(opt$genome)) +gt = geno(vcf)$GT +ad = geno(vcf)$AD +af = structure(sapply(ad, function(x) x[2] / sum(x)), dim = dim(ad)) +X = matrix(0, nrow = nrow(gt), ncol = ncol(gt), dimnames = list(rownames(gt), colnames(gt))) +X[is.na(af)] = NA +X[af > 0.15 & af < 0.95] = 1 +X[af >= 0.95] = 2 +X[!gt %in% c("0/0", "0/1", "1/1")] = NA -gt <- matrix(as.integer(factor(X)), nrow = nrow(gt), ncol = ncol(gt), dimnames = list(rownames(gt), colnames(gt))) +gt = matrix(as.integer(factor(X)), nrow = nrow(gt), ncol = ncol(gt), dimnames = list(rownames(gt), colnames(gt))) +dt = as.matrix(dist(t(gt))) -fn <- paste(opt$outPrefix, ".clust.pdf", sep = '') -pdf(fn, height = 9, width = 15) -null <- plot(hclust(dist(t(gt)), method = 'ward')) +tumor_samples = unlist(lapply(strsplit(x = unlist(strsplit(x = as.character(opt$sample_pairs), split = " ")), split = "_"), function(x) { x[1] })) +normal_samples = unlist(lapply(strsplit(x = unlist(strsplit(x = as.character(opt$sample_pairs), split = " ")), split = "_"), function(x) { x[2] })) +sample_pairs = dplyr::tibble(tumor_samples = factor(c(tumor_samples, unique(normal_samples)), levels = rownames(dt), ordered = TRUE), + normal_samples = c(normal_samples, unique(normal_samples))) %>% + dplyr::arrange(tumor_samples) %>% + dplyr::mutate(normal_samples = factor(normal_samples, levels = unique(normal_samples), ordered = TRUE)) +cluster_color = colorRampPalette(brewer.pal(9, "Set1"))(length(unique(sample_pairs %>% .[["normal_samples"]]))) 
+names(cluster_color) = sort(unique(sample_pairs %>% .[["normal_samples"]])) + +row_annot = rowAnnotation( + cluster_id = sample_pairs %>% .[["normal_samples"]], + col = list(cluster_id = cluster_color), + show_annotation_name = FALSE, + simple_anno_size = unit(.5, "cm"), + show_legend = FALSE +) +col_annot = columnAnnotation( + cluster_id = sample_pairs %>% .[["normal_samples"]], + col = list(cluster_id = cluster_color), + show_annotation_name = FALSE, + simple_anno_size = unit(.5, "cm"), + show_legend = FALSE +) +col_pal = c(rep("#662506", 3), + rev(brewer.pal(n = 7, name = "YlOrBr")), + rep("#fff7bc", 3)) + +pdf(as.character(opt$output_file), height = 21, width = 22) +draw(Heatmap(matrix = dt, + name = " ", + rect_gp = gpar(col = "white"), + border = NA, + col = col_pal, + cluster_rows = TRUE, + show_row_dend = TRUE, + row_dend_width = unit(3, "cm"), + row_names_side = "right", + row_names_gp = gpar(fontsize = 12), + show_row_names = TRUE, + left_annotation = row_annot, + + show_column_names = TRUE, + column_names_side = "bottom", + column_names_gp = gpar(fontsize = 12), + cluster_columns = TRUE, + show_column_dend = TRUE, + column_dend_height = unit(3, "cm"), + top_annotation = col_annot, + + use_raster = FALSE, + show_heatmap_legend = TRUE, + heatmap_legend_param = list(legend_height = unit(5, "cm"), legend_width = unit(5, "cm")))) dev.off() - -fn <- paste(opt$outPrefix, ".heatmap.pdf", sep = '') -pdf(fn, height = 30, width = 30) -null <- heatmap.2(as.matrix(dist(t(gt))), scale = 'none', trace = 'none', keysize = 0.3, cexRow = 2, cexCol = 2, margins = c(20,20)) -dev.off() - diff --git a/contamination/clusterSamples.mk b/contamination/clusterSamples.mk index d3f953d5..975898c6 100644 --- a/contamination/clusterSamples.mk +++ b/contamination/clusterSamples.mk @@ -1,39 +1,60 @@ -# Run unified genotyper on snp positions and cluster samples using results -##### DEFAULTS ###### -LOGDIR = log/cluster_samples.$(NOW) - -##### MAKE INCLUDES ##### include 
modules/Makefile.inc include modules/variant_callers/gatk.inc -VPATH ?= bam -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY : all +LOGDIR = log/cluster_samples.$(NOW) +VPATH ?= bam ifeq ($(EXOME),true) DBSNP_SUBSET ?= $(HOME)/share/reference/dbsnp_137_exome.bed else DBSNP_SUBSET = $(HOME)/share/reference/dbsnp_tseq_intersect.bed endif -CLUSTER_VCF = $(RSCRIPT) modules/contamination/clusterSampleVcf.R +CLUSTER_VCF = modules/contamination/clusterSampleVcf.R -all : snp_vcf/snps_filtered.clust.png +snp_cluster : $(foreach sample,$(SAMPLES),snp_vcf/$(sample).snps.vcf) \ + snp_vcf/snps.vcf \ + snp_vcf/snps_ft.vcf \ + snp_vcf/snps_ft.pdf -#snp_vcf/snps.vcf : $(foreach sample,$(SAMPLES),bam/$(sample).bam) -#$(call RUN,-s 4G -m 8G,"$(SAMTOOLS) mpileup -f $(REF_FASTA) -g -l <(sed '/^#/d' $(DBSNP) | cut -f 1,2) $^ | $(BCFTOOLS) view -g - > $@") +snp_vcf/%.snps.vcf : bam/%.bam + $(call RUN,-n 4 -s 2.5G -m 3G,"set -o pipefail && \ + $(call GATK_MEM,8G) \ + -T UnifiedGenotyper \ + -rf BadCigar \ + -nt 4 \ + -R $(REF_FASTA) \ + --dbsnp $(DBSNP) \ + $(foreach bam,$(filter %.bam,$^),-I $(bam) ) \ + -L $(DBSNP_SUBSET) \ + -o $@ \ + --output_mode EMIT_ALL_SITES") -snp_vcf/snps.vcf : $(foreach sample,$(SAMPLES),snp_vcf/$(sample).snps.vcf) - $(call RUN,-s 16G -m 20G,"$(call GATK_MEM,14G) -T CombineVariants $(foreach vcf,$^,--variant $(vcf) ) -o $@ --genotypemergeoption UNSORTED -R $(REF_FASTA)") -snp_vcf/snps_filtered.vcf : snp_vcf/snps.vcf +snp_vcf/snps.vcf : $(foreach sample,$(SAMPLES),snp_vcf/$(sample).snps.vcf) + $(call RUN,-s 16G -m 20G,"set -o pipefail && \ + $(call GATK_MEM,14G) -T CombineVariants \ + $(foreach vcf,$^,--variant $(vcf) ) \ + -o $@ \ + --genotypemergeoption UNSORTED \ + -R $(REF_FASTA)") + +snp_vcf/snps_ft.vcf : snp_vcf/snps.vcf $(INIT) grep '^#' $< > $@ && grep -e '0/1' -e '1/1' $< >> $@ -snp_vcf/%.snps.vcf : bam/%.bam - $(call RUN,-n 4 -s 2.5G -m 3G,"$(call GATK_MEM,8G) -T UnifiedGenotyper -nt 4 -R $(REF_FASTA) --dbsnp $(DBSNP) $(foreach bam,$(filter %.bam,$^),-I $(bam) 
) -L $(DBSNP_SUBSET) -o $@ --output_mode EMIT_ALL_SITES") - -snp_vcf/%.clust.png : snp_vcf/%.vcf - $(INIT) $(CLUSTER_VCF) --outPrefix snp_vcf/$* $< +snp_vcf/snps_ft.pdf : snp_vcf/snps_ft.vcf + $(call RUN,-n 1 -s 16G -m 20G -v $(VARIANT_ANNOTATION_ENV),"set -o pipefail && \ + $(RSCRIPT) modules/contamination/clusterSampleVcf.R \ + --input_file $(<) \ + --output_file $(@) \ + --sample_pairs '$(SAMPLE_PAIRS)' \ + --genome b37") + + +..DUMMY := $(shell mkdir -p version; \ + echo "GATK" > version/cluster_samples.txt;) +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY : snp_cluster include modules/vcf_tools/vcftools.mk diff --git a/copy_number/annotateFacetsCCF2Vcf.R b/copy_number/annotateFacetsCCF2Vcf.R index 8285c9a7..dd80a89d 100644 --- a/copy_number/annotateFacetsCCF2Vcf.R +++ b/copy_number/annotateFacetsCCF2Vcf.R @@ -118,8 +118,8 @@ if (sum(pass) == 0) { alt <- sapply(geno(vcf[pass])$AD[!is.na(ol), tumorSample], function(x) x[2]) vaf <- alt / (alt + ref) - ccfFit <- computeCCF(vaf = vaf, tcn, lcn, purity = purity) - conf <- confCCF(alt = alt, ref = ref, tcn, lcn, purity = purity, + ccfFit <- compute_ccf(vaf = vaf, tcn, lcn, purity = purity) + conf <- conf_ccf(alt = alt, ref = ref, tcn, lcn, purity = purity, multiplicity = ccfFit$multiplicity) ccfLower <- conf$lower ccfUpper <- conf$upper diff --git a/copy_number/ascat.R b/copy_number/ascat.R index 24432af1..cacee796 100644 --- a/copy_number/ascat.R +++ b/copy_number/ascat.R @@ -10,13 +10,13 @@ if (!interactive()) { } args_list <- list(make_option("--type", default = NA, type = 'character', help = "type of analysis"), - make_option("--file_in", default = NA, type = 'character', help = "input file name"), - make_option("--file_out", default = NA, type = 'character', help = "output file name"), - make_option("--gamma", default = NA, type = 'numeric', help = "gamma parameter in pcf"), - make_option("--nlog2", default = NA, type = 'numeric', help = "number of clusters in Log2 ratio"), - make_option("--nbaf", default = NA, type = 
'numeric', help = "number of clusters in BAF"), - make_option("--rho", default = NA, type = 'numeric', help = "purity for ASCAT"), - make_option("--psi", default = NA, type = 'numeric', help = "ploidy for ASCAT")) + make_option("--file_in", default = NA, type = 'character', help = "input file name"), + make_option("--file_out", default = NA, type = 'character', help = "output file name"), + make_option("--gamma", default = NA, type = 'numeric', help = "gamma parameter in pcf"), + make_option("--nlog2", default = NA, type = 'numeric', help = "number of clusters in Log2 ratio"), + make_option("--nbaf", default = NA, type = 'numeric', help = "number of clusters in BAF"), + make_option("--rho", default = NA, type = 'numeric', help = "purity for ASCAT"), + make_option("--psi", default = NA, type = 'numeric', help = "ploidy for ASCAT")) parser <- OptionParser(usage = "%prog", option_list = args_list) arguments <- parse_args(parser, positional_arguments = T) @@ -51,9 +51,9 @@ if (opt$type=="log2") { abline(v=max(CN[,"Position"]), col="goldenrod3", lty=3, lwd=1) abline(h=0, col="red") axis(1, at = .5*(start+end), labels=c(1:22, "X"), cex.axis = 0.85, las = 1) - rect(xleft=1-1e10, xright=max(CN[,"Position"])+1e10, ybottom=4, ytop=6, col="lightgrey", border="black", lwd=1.5) + rect(xleft=1-1e10, xright=max(CN[,"Position"])+1e10, ybottom=4, ytop=6, col="lightgrey", border="black", lwd=1.5) title(main = gsub(".pdf", "", gsub("ascat/log2/", "", opt$file_out, fixed=TRUE), fixed=TRUE), line=-1, cex.main=.75, font.main=1) - box(lwd=1.5) + box(lwd=1.5) dev.off() } else if (opt$type=="bafall") { @@ -83,9 +83,9 @@ if (opt$type=="log2") { abline(v=max(BAF[,"Position"]), col="goldenrod3", lty=3, lwd=1) abline(h=0.5, col="red") axis(1, at = .5*(start+end), labels=c(1:22, "X"), cex.axis = 0.85, las = 1) - rect(xleft=1-1e10, xright=max(BAF[,"Position"])+1e10, ybottom=1, ytop=1.25, col="lightgrey", border="black", lwd=1.5) - title(main = gsub(".pdf", "", gsub("ascat/bafall/", "", 
opt$file_out, fixed=TRUE), fixed=TRUE), line=-1, cex.main=.75, font.main=1) - box(lwd=1.5) + rect(xleft=1-1e10, xright=max(BAF[,"Position"])+1e10, ybottom=1, ytop=1.25, col="lightgrey", border="black", lwd=1.5) + title(main = gsub(".pdf", "", gsub("ascat/baf_all/", "", opt$file_out, fixed=TRUE), fixed=TRUE), line=-1, cex.main=.75, font.main=1) + box(lwd=1.5) dev.off() } else if (opt$type=="bafhet") { @@ -117,14 +117,14 @@ if (opt$type=="log2") { abline(v=max(BAF[,"Position"]), col="goldenrod3", lty=3, lwd=1) abline(h=0.5, col="red") axis(1, at = .5*(start+end), labels=c(1:22, "X"), cex.axis = 0.85, las = 1) - rect(xleft=1-1e10, xright=max(BAF[,"Position"])+1e10, ybottom=1, ytop=1.25, col="lightgrey", border="black", lwd=1.5) - title(main = gsub(".pdf", "", gsub("ascat/bafhet/", "", opt$file_out, fixed=TRUE), fixed=TRUE), line=-1, cex.main=.75, font.main=1) - box(lwd=1.5) + rect(xleft=1-1e10, xright=max(BAF[,"Position"])+1e10, ybottom=1, ytop=1.25, col="lightgrey", border="black", lwd=1.5) + title(main = gsub(".pdf", "", gsub("ascat/baf_het/", "", opt$file_out, fixed=TRUE), fixed=TRUE), line=-1, cex.main=.75, font.main=1) + box(lwd=1.5) dev.off() } else if (opt$type=="aspcf") { - gamma = ifelse(is.na(as.numeric(opt$gamma)), 70, as.numeric(opt$gamma)) + gamma = ifelse(is.na(as.numeric(opt$gamma)), 20, as.numeric(opt$gamma)) CN_and_BAF = out2$jointseg[,c("chrom", "maploc", "cnlr", "vafT"),drop=FALSE] index = out2$jointseg[,"het"]==1 @@ -132,7 +132,18 @@ if (opt$type=="log2") { colnames(CN_and_BAF) = c("Chromosome", "Position", "Log2Ratio", "BAF") index = CN_and_BAF[,"BAF"]>0.5 CN_and_BAF[index,"BAF"] = 1 - CN_and_BAF[index,"BAF"] + TMP = CN_and_BAF + for (i in 1:23) { + CN_and_BAF$Position[CN_and_BAF$Chromosome == i] = 1:sum(CN_and_BAF$Chromosome == i) + } tmp = multipcf(data=winsorize(data=CN_and_BAF, method="mad", tau=2.5, k=25, verbose=FALSE), gamma=gamma, fast=FALSE, verbose=FALSE) + for (i in 1:23) { + tmp[tmp$chrom == i,"start.pos"] = 
(TMP$Position[TMP$Chromosome == i])[tmp$start.pos[tmp$chrom == i]] + } + for (i in 1:23) { + tmp[tmp$chrom == i,"end.pos"] = (TMP$Position[TMP$Chromosome == i])[tmp$end.pos[tmp$chrom == i]] + } + CN_and_BAF = TMP colnames(tmp) = c("Chromosome", "Arm", "Start", "End", "N", "Log2Ratio", "BAF") save(CN_and_BAF, tmp, file=opt$file_out) @@ -218,9 +229,9 @@ if (opt$type=="log2") { abline(v=max(CN_and_BAF[,"Position"]), col="goldenrod3", lty=3, lwd=1) abline(h=0, col="red") axis(1, at = .5*(start+end), labels=rep(" ", 23), cex.axis = 0.85, las = 1) - rect(xleft=1-1e10, xright=max(CN_and_BAF[,"Position"])+1e10, ybottom=4, ytop=6, col="lightgrey", border="black", lwd=1.5) - title(main = gsub(".pdf", "", gsub("ascat/log2nbaf/", "", opt$file_out, fixed=TRUE), fixed=TRUE), line=-1.35, cex.main=.75, font.main=1) - box(lwd=1.5) + rect(xleft=1-1e10, xright=max(CN_and_BAF[,"Position"])+1e10, ybottom=4, ytop=6, col="lightgrey", border="black", lwd=1.5) + title(main = gsub(".pdf", "", gsub("ascat/log2_baf/", "", opt$file_out, fixed=TRUE), fixed=TRUE), line=-1.35, cex.main=.75, font.main=1) + box(lwd=1.5) screen(zz[2]) plot(CN_and_BAF[,"Position"], CN_and_BAF[,"BAF"], type="p", pch=".", cex=1, col=col, axes=FALSE, frame=TRUE, xlab="", ylab="", main="", ylim=c(0,1.125)) @@ -238,10 +249,10 @@ if (opt$type=="log2") { abline(v=max(CN_and_BAF[,"Position"]), col="goldenrod3", lty=3, lwd=1) abline(h=0.5, col="red") axis(1, at = .5*(start+end), labels=c(1:22, "X"), cex.axis = 0.85, las = 1) - rect(xleft=1-1e10, xright=max(CN_and_BAF[,"Position"])+1e10, ybottom=1, ytop=1.25, col="lightgrey", border="black", lwd=1.5) - title(main = gsub(".pdf", "", gsub("ascat/log2nbaf/", "", opt$file_out, fixed=TRUE), fixed=TRUE), line=-1.35, cex.main=.75, font.main=1) - box(lwd=1.5) - close.screen(all.screens=TRUE) + rect(xleft=1-1e10, xright=max(CN_and_BAF[,"Position"])+1e10, ybottom=1, ytop=1.25, col="lightgrey", border="black", lwd=1.5) + title(main = gsub(".pdf", "", gsub("ascat/log2_baf/", "", 
opt$file_out, fixed=TRUE), fixed=TRUE), line=-1.35, cex.main=.75, font.main=1) + box(lwd=1.5) + close.screen(all.screens=TRUE) dev.off() } else if (opt$type=="run-ascat") { @@ -317,31 +328,35 @@ if (opt$type=="log2") { chrs = 1:23 gender = "2323" sexchromosomes = c(23, 24) - tmp2 = list(Tumor_LogR=Tumor_LogR, - Tumor_BAF=Tumor_BAF, - Tumor_LogR_segmented=Tumor_LogR_segmented, - Tumor_BAF_segmented=Tumor_BAF_segmented, - SNPpos=SNPpos, - chromosomes=ch, - chrnames=chrs, - gender=gender, - sexchromosomes=sexchromosomes) + tmp2 = list(Tumor_LogR = Tumor_LogR, + Tumor_BAF = Tumor_BAF, + Tumor_LogR_segmented = Tumor_LogR_segmented, + Tumor_BAF_segmented = Tumor_BAF_segmented, + SNPpos = SNPpos, + chromosomes = ch, + chrnames = chrs, + gender = gender, + sexchromosomes = sexchromosomes) - tmp3 = try(runASCAT(lrr=tmp2$Tumor_LogR, - baf=tmp2$Tumor_BAF, - lrrsegmented=tmp2$Tumor_LogR_segmented, - bafsegmented=tmp2$Tumor_BAF_segmented, - gender=tmp2$gender, - SNPpos=tmp2$SNPpos, - chromosomes=tmp2$chromosomes, - chrnames=tmp2$chrnames, - sexchromosomes=tmp2$sexchromosomes, - failedqualitycheck=FALSE, - distance = opt$file_out, - copynumberprofile = NULL, - nonroundedprofile = NULL, - aberrationreliability = NULL, - gamma = 1, rho_manual = rho, psi_manual = psi, y_limit = 3, circos = NA)) + tmp3 = try(runASCAT(lrr = tmp2$Tumor_LogR, + baf = tmp2$Tumor_BAF, + lrrsegmented = tmp2$Tumor_LogR_segmented, + bafsegmented = tmp2$Tumor_BAF_segmented, + gender = tmp2$gender, + SNPpos = tmp2$SNPpos, + chromosomes = tmp2$chromosomes, + chrnames = tmp2$chrnames, + sexchromosomes = tmp2$sexchromosomes, + failedqualitycheck = FALSE, + distance = opt$file_out, + copynumberprofile = NULL, + nonroundedprofile = NULL, + aberrationreliability = NULL, + gamma = 1, + rho_manual = rho, + psi_manual = psi, + y_limit = 3, + circos = NA)) if (!("try-error" %in% is(tmp3))) { purity = tmp3$rho @@ -408,10 +423,10 @@ if (opt$type=="log2") { abline(v=max(CN[,"pos"]), col="goldenrod3", lty=3, lwd=1) 
abline(h=0, col="red") axis(1, at = .5*(start+end), labels=c(1:22, "X"), cex.axis = 0.85, las = 1) - load(gsub(".pdf", ".RData", gsub("total", "ascat", opt$file_out))) - rect(xleft=1-1e10, xright=max(CN[,"pos"])+1e10, ybottom=4, ytop=6, col="lightgrey", border="black", lwd=1.5) + load(gsub(".pdf", ".RData", gsub("total", "ascat", opt$file_out))) + rect(xleft=1-1e10, xright=max(CN[,"pos"])+1e10, ybottom=4, ytop=6, col="lightgrey", border="black", lwd=1.5) title(main = gsub(".pdf", "", gsub("ascat/total/", "", opt$file_out, fixed=TRUE), fixed=TRUE), line=-1, cex.main=.75, font.main=1) - box(lwd=1.5) + box(lwd=1.5) dev.off() } else if (opt$type=="plot-chr") { diff --git a/copy_number/ascat.mk b/copy_number/ascat.mk index 3af26268..77cefcbd 100644 --- a/copy_number/ascat.mk +++ b/copy_number/ascat.mk @@ -1,29 +1,47 @@ include modules/Makefile.inc LOGDIR ?= log/ascat.$(NOW) -PHONY += ascat ascat/log2 ascat/bafall ascat/bafhet ascat/mad ascat/log2nbaf ascat/ascat ascat/total ascat/bychr -ascat : $(foreach pair,$(SAMPLE_PAIRS),ascat/log2/$(pair).pdf ascat/bafall/$(pair).pdf ascat/bafhet/$(pair).pdf ascat/mad/$(pair).RData ascat/log2nbaf/$(pair).pdf ascat/ascat/$(pair).pdf ascat/total/$(pair).pdf ascat/bychr/$(pair)/timestamp) +ascat : $(foreach pair,$(SAMPLE_PAIRS),ascat/log2/$(pair).pdf) \ + $(foreach pair,$(SAMPLE_PAIRS),ascat/baf_all/$(pair).pdf) \ + $(foreach pair,$(SAMPLE_PAIRS),ascat/baf_het/$(pair).pdf) \ + $(foreach pair,$(SAMPLE_PAIRS),ascat/mad/$(pair).RData) \ + $(foreach pair,$(SAMPLE_PAIRS),ascat/log2_baf/$(pair).pdf) \ + $(foreach pair,$(SAMPLE_PAIRS),ascat/ascat/$(pair).pdf) \ + $(foreach pair,$(SAMPLE_PAIRS),ascat/total/$(pair).pdf) \ + $(foreach pair,$(SAMPLE_PAIRS),ascat/bychr/$(pair)/timestamp) define ascat-plot-log2 ascat/log2/$1_$2.pdf : facets/cncf/$1_$2.Rdata - $$(call RUN,-c -v $(ASCAT_ENV) -s 1G -m 2G,"$(RSCRIPT) modules/copy_number/ascat.R --type log2 --file_in $$< --file_out ascat/log2/$1_$2.pdf") + $$(call RUN,-c -v $(ASCAT_ENV) -s 1G -m 
2G,"set -o pipefail && \ + $(RSCRIPT) modules/copy_number/ascat.R \ + --type log2 \ + --file_in $$(<) \ + --file_out ascat/log2/$1_$2.pdf") endef $(foreach pair,$(SAMPLE_PAIRS),\ $(eval $(call ascat-plot-log2,$(tumor.$(pair)),$(normal.$(pair))))) define ascat-plot-bafall -ascat/bafall/$1_$2.pdf : facets/cncf/$1_$2.Rdata - $$(call RUN,-c -v $(ASCAT_ENV) -s 1G -m 2G,"$(RSCRIPT) modules/copy_number/ascat.R --type bafall --file_in $$< --file_out ascat/bafall/$1_$2.pdf") +ascat/baf_all/$1_$2.pdf : facets/cncf/$1_$2.Rdata + $$(call RUN,-c -v $(ASCAT_ENV) -s 1G -m 2G,"set -o pipefail && \ + $(RSCRIPT) modules/copy_number/ascat.R \ + --type bafall \ + --file_in $$(<) \ + --file_out ascat/baf_all/$1_$2.pdf") endef $(foreach pair,$(SAMPLE_PAIRS),\ $(eval $(call ascat-plot-bafall,$(tumor.$(pair)),$(normal.$(pair))))) define ascat-plot-bafhet -ascat/bafhet/$1_$2.pdf : facets/cncf/$1_$2.Rdata - $$(call RUN,-c -v $(ASCAT_ENV) -s 1G -m 2G,"$(RSCRIPT) modules/copy_number/ascat.R --type bafhet --file_in $$< --file_out ascat/bafhet/$1_$2.pdf") +ascat/baf_het/$1_$2.pdf : facets/cncf/$1_$2.Rdata + $$(call RUN,-c -v $(ASCAT_ENV) -s 1G -m 2G,"set -o pipefail && \ + $(RSCRIPT) modules/copy_number/ascat.R \ + --type bafhet \ + --file_in $$(<) \ + --file_out ascat/baf_het/$1_$2.pdf") endef $(foreach pair,$(SAMPLE_PAIRS),\ @@ -31,15 +49,26 @@ $(foreach pair,$(SAMPLE_PAIRS),\ define ascat-aspcf ascat/mad/$1_$2.RData : facets/cncf/$1_$2.Rdata - $$(call RUN,-c -v $(ASCAT_ENV) -s 3G -m 6G,"$(RSCRIPT) modules/copy_number/ascat.R --type aspcf --file_in $$< --file_out ascat/mad/$1_$2.RData --gamma '$${aspcf_gamma.$1}'") + $$(call RUN,-c -v $(ASCAT_ENV) -s 3G -m 6G,"set -o pipefail && \ + $(RSCRIPT) modules/copy_number/ascat.R \ + --type aspcf \ + --file_in $$(<) \ + --file_out ascat/mad/$1_$2.RData \ + --gamma '$${aspcf_gamma.$1}'") endef $(foreach pair,$(SAMPLE_PAIRS),\ $(eval $(call ascat-aspcf,$(tumor.$(pair)),$(normal.$(pair))))) define ascat-plot-aspcf -ascat/log2nbaf/$1_$2.pdf : 
ascat/mad/$1_$2.RData - $$(call RUN,-c -v $(ASCAT_ENV) -s 3G -m 6G,"$(RSCRIPT) modules/copy_number/ascat.R --type plot-aspcf --file_in $$< --file_out ascat/log2nbaf/$1_$2.pdf --nlog2 '$${aspcf_nlog2.$1}' --nbaf '$${aspcf_nbaf.$1}'") +ascat/log2_baf/$1_$2.pdf : ascat/mad/$1_$2.RData + $$(call RUN,-c -v $(ASCAT_ENV) -s 3G -m 6G,"set -o pipefail && \ + $(RSCRIPT) modules/copy_number/ascat.R \ + --type plot-aspcf \ + --file_in $$(<) \ + --file_out ascat/log2_baf/$1_$2.pdf \ + --nlog2 '$${aspcf_nlog2.$1}' \ + --nbaf '$${aspcf_nbaf.$1}'") endef $(foreach pair,$(SAMPLE_PAIRS),\ @@ -47,27 +76,41 @@ $(foreach pair,$(SAMPLE_PAIRS),\ define ascat-run-ascat ascat/ascat/$1_$2.pdf : ascat/mad/$1_$2.RData - $$(call RUN,-c -v $(ASCAT_ENV) -s 3G -m 6G,"$(RSCRIPT) modules/copy_number/ascat.R --type run-ascat --file_in $$< --file_out ascat/ascat/$1_$2.pdf --rho '$${ascat_rho.$1}' --psi '$${ascat_psi.$1}' --nlog2 '$${aspcf_nlog2.$1}' --nbaf '$${aspcf_nbaf.$1}'") + $$(call RUN,-c -v $(ASCAT_ENV) -s 3G -m 6G,"set -o pipefail && \ + $(RSCRIPT) modules/copy_number/ascat.R \ + --type run-ascat \ + --file_in $$(<) \ + --file_out ascat/ascat/$1_$2.pdf \ + --rho '$${ascat_rho.$1}' \ + --psi '$${ascat_psi.$1}' \ + --nlog2 '$${aspcf_nlog2.$1}' \ + --nbaf '$${aspcf_nbaf.$1}'") ascat/total/$1_$2.pdf : facets/cncf/$1_$2.Rdata ascat/ascat/$1_$2.pdf - $$(call RUN,-c -v $(ASCAT_ENV) -s 6G -m 12G,"$(RSCRIPT) modules/copy_number/ascat.R --type total-copy --file_in $$< --file_out ascat/total/$1_$2.pdf") + $$(call RUN,-c -v $(ASCAT_ENV) -s 6G -m 12G,"set -o pipefail && \ + $(RSCRIPT) modules/copy_number/ascat.R \ + --type total-copy \ + --file_in $$(<) \ + --file_out ascat/total/$1_$2.pdf") endef - $(foreach pair,$(SAMPLE_PAIRS),\ $(eval $(call ascat-run-ascat,$(tumor.$(pair)),$(normal.$(pair))))) define ascat-plot-chr ascat/bychr/$1_$2/timestamp : facets/cncf/$1_$2.Rdata ascat/ascat/$1_$2.pdf - $$(call RUN, -v $(ASCAT_ENV) -s 6G -m 12G,"mkdir -p ascat/bychr/ && \ - mkdir -p ascat/bychr/$1_$2 && \ - 
$(RSCRIPT) modules/copy_number/ascat.R --type plot-chr --file_in $$< --file_out ascat/bychr/$1_$2") + $$(call RUN, -v $(ASCAT_ENV) -s 6G -m 12G,"set -o pipefail && \ + $(RSCRIPT) modules/copy_number/ascat.R \ + --type plot-chr \ + --file_in $$(<) \ + --file_out ascat/bychr/$1_$2") endef - $(foreach pair,$(SAMPLE_PAIRS),\ $(eval $(call ascat-plot-chr,$(tumor.$(pair)),$(normal.$(pair))))) - -.DELETE_ON_ERROR: + +..DUMMY := $(shell mkdir -p version; \ + R --version > version/ascat.txt;) .SECONDARY: -.PHONY: $(PHONY) +.DELETE_ON_ERROR: +.PHONY: ascat diff --git a/copy_number/cnvkit.R b/copy_number/cnvkit.R deleted file mode 100644 index 95cbe66c..00000000 --- a/copy_number/cnvkit.R +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("copynumber")) -suppressPackageStartupMessages(library("colorspace")) -suppressPackageStartupMessages(library("ASCAT")) -suppressPackageStartupMessages(library("GAP")) - -'plot_log2_' <- function(x, y, title = "", alpha=NA, psi=NA) -{ - par(mar=c(5, 5, 4, 2)+.1) - data("CytoBand") - end = NULL - for (j in 1:23) { - end = c(end, max(CytoBand$End[CytoBand$Chromosome==j])) - } - end = cumsum(end) - start = rep(0, 23) - start[2:23] = end[1:22]+1 - for (j in 1:23) { - y[y[,"Chromosome"]==j,"Start"] = y[y[,"Chromosome"]==j,"Start"] + start[j] - y[y[,"Chromosome"]==j,"End"] = y[y[,"Chromosome"]==j,"End"] + start[j] - x[x[,"chrom"]==j,"pos"] = x[x[,"chrom"]==j,"pos"] + start[j] - } - plot(x[,"pos"], x[,"Log2Ratio"], type="p", pch=".", cex=1, col="grey75", axes=FALSE, frame=TRUE, xlab="", ylab="", main="", ylim=c(-4,5)) - for (j in 1:nrow(y)) { - lines(x=c(y[j,"Start"], y[j,"End"]), y=rep(y[j,"Log2Ratio"],2), lty=1, lwd=1.75, col="red") - } - axis(2, at = c(-4, -2, 0, 2, 4), labels = c(-4, -2, 0, 2, 4), cex.axis = 1, las = 1) - mtext(side = 2, text = expression(Log[2]~"Ratio"), line = 3.15, cex = 1.25) - abline(v=1, col="goldenrod3", lty=3, lwd=.5) - 
abline(h=0, col="red", lty=1, lwd=1) - for (j in 2:23) { - v = start[j] - abline(v=v, col="goldenrod3", lty=3, lwd=.5) - } - abline(v=max(x[,"pos"]), col="goldenrod3", lty=3, lwd=.5) - axis(1, at = .5*(start+end), labels=c(1:22, "X"), cex.axis = 0.85, las = 1) - rect(xleft=1-1e10, xright=x[nrow(x),"pos"]+1e10, ybottom=4, ytop=6, col="lightgrey", border="black", lwd=1.5) - title(main = paste0(title, " | alpha = ", signif(alpha, 3), " | psi = ", signif(psi, 3)), line=-1, cex.main=.75, font.main=1) - box(lwd=1.5) -} - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list(make_option("--type", default = NA, type = 'character', help = "type of analysis"), - make_option("--sample_name", default = NA, type = 'character', help = "sample name")) - -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -if (opt$type=="total-copy") { - - 'prunesegments.cn' <- function(x, n=10) - { - cnm = matrix(NA, nrow=nrow(x), ncol=nrow(x)) - for (j in 1:nrow(x)) { - cnm[,j] = abs(2^x[j,"Log2Ratio"] - 2^x[,"Log2Ratio"]) - } - cnt = hclust(as.dist(cnm), "average") - cnc = cutree(tree=cnt, k=n) - for (j in unique(cnc)) { - indx = which(cnc==j) - if (length(indx)>2) { - mcl = mean(x[indx,"Log2Ratio"]) - scl = sd(x[indx,"Log2Ratio"]) - ind = which(x[indx,"Log2Ratio"]<(mcl+1.96*scl) & x[indx,"Log2Ratio"]>(mcl-1.96*scl)) - x[indx[ind],"Log2Ratio"] = mean(x[indx[ind],"Log2Ratio"]) - } else { - x[indx,"Log2Ratio"] = mean(x[indx,"Log2Ratio"]) - } - } - return(x) - } - - data = read.csv(file=paste0("cnvkit/cnr/", opt$sample_name, ".cnr"), header=TRUE, sep="\t", stringsAsFactors=FALSE) - CN = data[,c("chromosome", "start", "log2"),drop=FALSE] - colnames(CN) = c("Chromosome", "Position", "Log2Ratio") - CN[,"Chromosome"] = gsub(pattern="chr", replacement="", x=CN[,"Chromosome"], fixed=TRUE) - CN[CN[,"Chromosome"]=="X","Chromosome"] = 23 - 
CN[CN[,"Chromosome"]=="Y","Chromosome"] = 24 - CN[,"Chromosome"] = as.numeric(CN[,"Chromosome"]) - CN[CN[,"Log2Ratio"]<(-4) | CN[,"Log2Ratio"]>(4),"Log2Ratio"] = 0 - CN = subset(CN, CN[,"Chromosome"]<=23) - tmp = pcf(data=winsorize(data=CN, method="mad", tau=2.5, k=10, verbose=FALSE), kmin = 10, gamma=40, fast=FALSE, verbose=FALSE)[,2:7,drop=FALSE] - colnames(tmp) = c("Chromosome", "Arm", "Start", "End", "N", "Log2Ratio") - save(CN, tmp, file=paste0("cnvkit/totalcopy/", opt$sample_name, ".RData")) - tmp = prunesegments.cn(x=tmp, n=10) - CN = winsorize(data=CN[,c("Chromosome","Position","Log2Ratio")], tau=2.5, k=15, verbose=FALSE) - pdf(file=paste0("cnvkit/segmented/", opt$sample_name, ".pdf"), width=10, height=4.25) - file_names = dir(path="facets/cncf", pattern=opt$sample_name, full.names=TRUE) - file_names = file_names[grep(".Rdata", file_names, fixed=TRUE)] - if (length(file_names)==1) { - load(file_names) - alpha = fit$purity - psi = fit$ploidy - } else { - alpha = NA - psi = NA - } - plot_log2_(x=CN, y=tmp, title = opt$sample_name, alpha=alpha, psi=psi) - dev.off() - -} else if (opt$type=="call-cna") { - - 'prunesegments.cn' <- function(x, n=10) - { - cnm = matrix(NA, nrow=nrow(x), ncol=nrow(x)) - for (j in 1:nrow(x)) { - cnm[,j] = abs(2^x[j,"Log2Ratio"] - 2^x[,"Log2Ratio"]) - } - cnt = hclust(as.dist(cnm), "average") - cnc = cutree(tree=cnt, k=n) - for (j in unique(cnc)) { - indx = which(cnc==j) - if (length(indx)>2) { - mcl = mean(x[indx,"Log2Ratio"]) - scl = sd(x[indx,"Log2Ratio"]) - ind = which(x[indx,"Log2Ratio"]<(mcl+1.96*scl) & x[indx,"Log2Ratio"]>(mcl-1.96*scl)) - x[indx[ind],"Log2Ratio"] = mean(x[indx[ind],"Log2Ratio"]) - } else { - x[indx,"Log2Ratio"] = mean(x[indx,"Log2Ratio"]) - } - } - return(x) - } - load(paste0("cnvkit/totalcopy/", opt$sample_name, ".RData")) - file_names = dir(path="facets/cncf", pattern=opt$sample_name, full.names=TRUE) - file_names = file_names[grep(".Rdata", file_names, fixed=TRUE)] - if (length(file_names)==1) { - 
load(file_names) - alpha = ifelse(is.na(fit$purity), 1, fit$purity) - psi = ifelse(is.na(fit$ploidy), 2, fit$ploid) - } else { - alpha = 1 - psi = 2 - } - tmp = prunesegments.cn(x=tmp, n=10) - qt = round((((2^(tmp[,"Log2Ratio"])) * (alpha*psi + 2*(1-alpha))) - 2*(1-alpha))/alpha) - qt[is.na(qt)] = 2 - qt[is.infinite(qt)] = 2 - cat5 = rep(0, length(qt)) - if (round(psi)==1 | round(psi)==2) { - cat5t = c(0, 1, 3, 7) - } else if (round(psi)==3) { - cat5t = c(0, 1, 4, 9) - } else if (round(psi)==4) { - cat5t = c(0, 1, 5, 10) - } else if (round(psi)==5) { - cat5t = c(0, 2, 6, 12) - } else if (round(psi)>=6) { - cat5t = c(0, 2, 7, 15) - } else { - cat5t = c(0, 1, 3, 7) - } - cat5[qt <= cat5t[2]] = -1 - cat5[qt <= cat5t[1]] = -2 - cat5[qt >= cat5t[3]] = 1 - cat5[qt >= cat5t[4]] = 2 - tmp = cbind(tmp, "Cat5"=cat5) - save(CN, tmp, file=paste0("cnvkit/called/", opt$sample_name, ".RData")) - -} - -warnings() diff --git a/copy_number/cnvkit.mk b/copy_number/cnvkit.mk new file mode 100644 index 00000000..5e065c78 --- /dev/null +++ b/copy_number/cnvkit.mk @@ -0,0 +1,125 @@ +include modules/Makefile.inc +include modules/genome_inc/b37.inc + +LOGDIR ?= log/cnv_kit.$(NOW) + +cnv_kit : $(foreach sample,$(TUMOR_SAMPLES),cnvkit/cnn/tumor/$(sample).targetcoverage.cnn) \ + $(foreach sample,$(TUMOR_SAMPLES),cnvkit/cnn/tumor/$(sample).antitargetcoverage.cnn) \ + $(foreach sample,$(NORMAL_SAMPLES),cnvkit/cnn/normal/$(sample).targetcoverage.cnn) \ + $(foreach sample,$(NORMAL_SAMPLES),cnvkit/cnn/normal/$(sample).antitargetcoverage.cnn) \ + cnvkit/reference/combined_reference.cnr \ + $(foreach sample,$(TUMOR_SAMPLES),cnvkit/cnr/$(sample).cnr) \ + $(foreach sample,$(NORMAL_SAMPLES),cnvkit/cnr/$(sample).cnr) \ + $(foreach sample,$(TUMOR_SAMPLES),cnvkit/segmented/$(sample).txt) \ + $(foreach sample,$(TUMOR_SAMPLES),cnvkit/plots/log2/$(sample).pdf) \ + $(foreach sample,$(TUMOR_SAMPLES),cnvkit/plots/segmented/$(sample).pdf) \ + $(foreach sample,$(TUMOR_SAMPLES),cnvkit/totalcopy/$(sample).txt) \ + 
$(foreach sample,$(TUMOR_SAMPLES),cnvkit/plots/totalcopy/$(sample).pdf) \ + cnvkit/summary/total_copy.txt \ + cnvkit/summary/log2_ratio.txt + +ONTARGET_FILE = $(HOME)/share/lib/bed_files/MSK-IMPACT-v3_cnvkit_ontarget.bed +OFFTARGET_FILE = $(HOME)/share/lib/bed_files/MSK-IMPACT-v4_cnvkit_offtarget.bed + +define cnvkit-tumor-cnn +cnvkit/cnn/tumor/$1.targetcoverage.cnn : bam/$1.bam + $$(call RUN,-c -n 4 -s 6G -m 8G -v $(CNVKIT_ENV),"set -o pipefail && \ + cnvkit.py coverage -p 4 -q 0 $$(<) $$(ONTARGET_FILE) -o cnvkit/cnn/tumor/$1.targetcoverage.cnn") + +cnvkit/cnn/tumor/$1.antitargetcoverage.cnn : bam/$1.bam + $$(call RUN,-c -n 4 -s 6G -m 8G -v $(CNVKIT_ENV),"set -o pipefail && \ + cnvkit.py coverage -p 4 -q 0 $$(<) $$(OFFTARGET_FILE) -o cnvkit/cnn/tumor/$1.antitargetcoverage.cnn") +endef + $(foreach sample,$(TUMOR_SAMPLES),\ + $(eval $(call cnvkit-tumor-cnn,$(sample)))) + +define cnvkit-normal-cnn +cnvkit/cnn/normal/$1.targetcoverage.cnn : bam/$1.bam + $$(call RUN,-c -n 4 -s 6G -m 8G -v $(CNVKIT_ENV),"set -o pipefail && \ + cnvkit.py coverage -p 4 -q 0 $$(<) $$(ONTARGET_FILE) -o cnvkit/cnn/normal/$1.targetcoverage.cnn") + +cnvkit/cnn/normal/$1.antitargetcoverage.cnn : bam/$1.bam + $$(call RUN,-c -n 4 -s 6G -m 8G -v $(CNVKIT_ENV),"set -o pipefail && \ + cnvkit.py coverage -p 4 -q 0 $$(<) $$(OFFTARGET_FILE) -o cnvkit/cnn/normal/$1.antitargetcoverage.cnn") +endef + $(foreach sample,$(NORMAL_SAMPLES),\ + $(eval $(call cnvkit-normal-cnn,$(sample)))) + +cnvkit/reference/combined_reference.cnr : $(foreach sample,$(NORMAL_SAMPLES),cnvkit/cnn/normal/$(sample).targetcoverage.cnn) $(foreach sample,$(NORMAL_SAMPLES),cnvkit/cnn/normal/$(sample).antitargetcoverage.cnn) + $(call RUN,-n 1 -s 24G -m 32G -v $(CNVKIT_ENV),"set -o pipefail && \ + sleep 30 && \ + cnvkit.py reference cnvkit/cnn/normal/*.cnn -f $(REF_FASTA) --no-edge -o cnvkit/reference/combined_reference.cnr") + +define cnvkit-tumor-cnr +cnvkit/cnr/$1.cnr : cnvkit/cnn/tumor/$1.targetcoverage.cnn 
cnvkit/cnn/tumor/$1.antitargetcoverage.cnn cnvkit/reference/combined_reference.cnr + $$(call RUN,-c -s 6G -m 8G -v $(CNVKIT_ENV),"set -o pipefail && \ + cnvkit.py fix $$(<) $$(<<) $$(<<<) -o cnvkit/cnr/$1.cnr") + +cnvkit/plots/log2/$1.pdf : cnvkit/cnr/$1.cnr + $$(call RUN,-c -s 6G -m 8G -v $(CNVKIT_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/cnvkit.R \ + --option 1 \ + --sample_name $1") + +cnvkit/segmented/$1.txt : cnvkit/cnr/$1.cnr + $$(call RUN,-c -s 6G -m 8G -v $(CNVKIT_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/cnvkit.R \ + --option 2 \ + --sample_name $1") + +cnvkit/plots/segmented/$1.pdf : cnvkit/cnr/$1.cnr + $$(call RUN,-c -s 6G -m 8G -v $(CNVKIT_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/cnvkit.R \ + --option 3 \ + --sample_name $1") + +endef + $(foreach sample,$(TUMOR_SAMPLES),\ + $(eval $(call cnvkit-tumor-cnr,$(sample)))) + +define cnvkit-normal-cnr +cnvkit/cnr/$1.cnr : cnvkit/cnn/normal/$1.targetcoverage.cnn cnvkit/cnn/normal/$1.antitargetcoverage.cnn cnvkit/reference/combined_reference.cnr + $$(call RUN,-c -s 6G -m 8G -v $(CNVKIT_ENV),"set -o pipefail && \ + cnvkit.py fix $$(<) $$(<<) $$(<<<) -o cnvkit/cnr/$1.cnr") + +endef + $(foreach sample,$(NORMAL_SAMPLES),\ + $(eval $(call cnvkit-normal-cnr,$(sample)))) + + +define cnvkit-total-copy +cnvkit/totalcopy/$1.txt : cnvkit/segmented/$1.txt facets/cncf/$1_$2.out + $$(call RUN,-c -s 6G -m 8G -v $(CNVKIT_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/cnvkit.R \ + --option 4 \ + --sample_name $1_$2") + +cnvkit/plots/totalcopy/$1.pdf : cnvkit/cnr/$1.cnr cnvkit/totalcopy/$1.txt facets/cncf/$1_$2.out + $$(call RUN,-c -s 6G -m 8G -v $(CNVKIT_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/cnvkit.R \ + --option 5 \ + --sample_name $1_$2") + +endef +$(foreach pair,$(SAMPLE_PAIRS),\ + $(eval $(call cnvkit-total-copy,$(tumor.$(pair)),$(normal.$(pair))))) + +cnvkit/summary/total_copy.txt : $(foreach sample,$(TUMOR_SAMPLES),cnvkit/totalcopy/$(sample).txt) + $(call 
RUN,-n 1 -s 24G -m 32G -v $(CNVKIT_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/cnvkit.R \ + --option 6 \ + --sample_name '$(TUMOR_SAMPLES)'") + +cnvkit/summary/log2_ratio.txt : $(foreach sample,$(SAMPLES),cnvkit/cnr/$(sample).cnr) + $(call RUN,-n 1 -s 24G -m 32G -v $(CNVKIT_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/cnvkit.R \ + --option 7 \ + --sample_name '$(SAMPLES)'") + + + +..DUMMY := $(shell mkdir -p version; \ + python $(CNVKIT_ENV)/bin/cnvkit.py version &> version/cnvkit.txt) +.DELETE_ON_ERROR: +.SECONDARY: +.PHONY: cnv_kit diff --git a/copy_number/cnvkitbinqc.R b/copy_number/cnvkitbinqc.R deleted file mode 100644 index 7d3e430e..00000000 --- a/copy_number/cnvkitbinqc.R +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list(make_option("--normal_files", default = NA, type = 'character', help = "normal samples input file names"), - make_option("--tumor_files", default = NA, type = 'character', help = "tumor samples input file names"), - make_option("--out_file", default = NA, type = 'character', help = "output file name")) -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -in_file_normal = unlist(strsplit(x=opt$normal_files, split=" ", fixed=TRUE)) -in_file_tumor = unlist(strsplit(x=opt$tumor_files, split=" ", fixed=TRUE)) -out_file = opt$out_file - -depth_n = list() -for (i in 1:length(in_file_normal)) { - print(i) - data = read.csv(file=in_file_normal[i], header=TRUE, sep="\t", stringsAsFactors=FALSE) - index = data[,"chromosome"] %in% as.character(1:22) - depth_n[[i]] = as.numeric(data[index,"depth"]) -} -depth_n = do.call(cbind, depth_n) - -depth_t = list() -for (i in 1:length(in_file_tumor)) { - print(i) - data = read.csv(file=in_file_tumor[i], 
header=TRUE, sep="\t", stringsAsFactors=FALSE) - index = data[,"chromosome"] %in% as.character(1:22) - depth_t[[i]] = as.numeric(data[index,"depth"]) -} -depth_t = do.call(cbind, depth_t) - -bin_size = as.numeric(data[index,"end"]) - as.numeric(data[index,"start"]) -var_bin_n = apply(depth_n, 1, sd, na.rm=TRUE) -var_bin_t = apply(depth_t, 1, sd, na.rm=TRUE) -data = data.frame(bin_size, var_bin_n, var_bin_t) -write.table(data, file=out_file, sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE) - -ymin = min(var_bin_n, var_bin_t) -ymax = max(var_bin_n, var_bin_t) - -pdf(file=gsub(".tsv", ".pdf", x=out_file, fixed=TRUE), width=7, height=7) -par(mar = c(6.1, 6.5, 4.1, 1.1)) -plot(bin_size, var_bin_n, type="n", axes = FALSE, frame.plot = FALSE, main = "", xlab = "", ylab = "", log="y", ylim=c(ymin, ymax)) -points(x=bin_size, y=var_bin_n, col = "grey50", bg = "grey90", pch = 21, cex = 1, lwd = .1) -axis(1, at = NULL, cex.axis = 1.5, padj = 0.25, lwd=1.25, lwd.ticks=1.15) -axis(2, at = NULL, cex.axis = 1.5, las = 1, lwd=1.25, lwd.ticks=1.15) -mtext(side = 1, text = "Bin size (bp)", line = 4, cex = 1.5) -mtext(side = 2, text = "SD", line = 5, cex = 1.5) -plot(bin_size, var_bin_t, type="n", axes = FALSE, frame.plot = FALSE, main = "", xlab = "", ylab = "", log="y", ylim=c(ymin, ymax)) -points(x=bin_size, y=var_bin_t, col = "black", bg = "steelblue", pch = 21, cex = 1, lwd = .1) -axis(1, at = NULL, cex.axis = 1.5, padj = 0.25, lwd=1.25, lwd.ticks=1.15) -axis(2, at = NULL, cex.axis = 1.5, las = 1, lwd=1.25, lwd.ticks=1.15) -mtext(side = 1, text = "Bin size (bp)", line = 4, cex = 1.5) -mtext(side = 2, text = "SD", line = 5, cex = 1.5) -plot(var_bin_n, var_bin_t, type="n", axes = FALSE, frame.plot = FALSE, main = "", xlab = "", ylab = "", log="xy", xlim=c(ymin, ymax), ylim=c(ymin, ymax)) -points(x=var_bin_n, y=var_bin_t, col = "black", bg = "steelblue", pch = 21, cex = 1, lwd = .1) -abline(a=0, b=1, col="goldenrod3", lwd=2) -axis(1, at = NULL, cex.axis = 1.5, padj = 0.25, 
lwd=1.25, lwd.ticks=1.15) -axis(2, at = NULL, cex.axis = 1.5, las = 1, lwd=1.25, lwd.ticks=1.15) -mtext(side = 1, text = "Normal SD", line = 4, cex = 1.5) -mtext(side = 2, text = "Tumor SD", line = 5, cex = 1.5) -dev.off() diff --git a/copy_number/cnvkitcoverage.mk b/copy_number/cnvkitcoverage.mk deleted file mode 100644 index 696f395a..00000000 --- a/copy_number/cnvkitcoverage.mk +++ /dev/null @@ -1,30 +0,0 @@ -include modules/Makefile.inc -include modules/genome_inc/b37.inc - -LOGDIR ?= log/cnvkit_coverage.$(NOW) -PHONY += cnvkit cnvkit/cnn cnvkit/cnn/tumor cnvkit/cnn/normal - -cnvkit_coverage : $(foreach sample,$(TUMOR_SAMPLES),cnvkit/cnn/tumor/$(sample).targetcoverage.cnn cnvkit/cnn/tumor/$(sample).antitargetcoverage.cnn) $(foreach sample,$(NORMAL_SAMPLES),cnvkit/cnn/normal/$(sample).targetcoverage.cnn cnvkit/cnn/normal/$(sample).antitargetcoverage.cnn) - -define cnvkit-tumor-cnn -cnvkit/cnn/tumor/%.targetcoverage.cnn : bam/%.bam - $$(call RUN,-c -n 4 -s 6G -m 8G,"cnvkit.py coverage -p 4 -q 0 $$(<) $$(ONTARGET_FILE) -o cnvkit/cnn/tumor/$$(*).targetcoverage.cnn") - -cnvkit/cnn/tumor/%.antitargetcoverage.cnn : bam/%.bam - $$(call RUN,-c -n 4 -s 6G -m 8G,"cnvkit.py coverage -p 4 -q 0 $$(<) $$(OFFTARGET_FILE) -o cnvkit/cnn/tumor/$$(*).antitargetcoverage.cnn") -endef - $(foreach sample,$(TUMOR_SAMPLES),\ - $(eval $(call cnvkit-tumor-cnn,$(sample)))) - -define cnvkit-normal-cnn -cnvkit/cnn/normal/%.targetcoverage.cnn : bam/%.bam - $$(call RUN,-c -n 4 -s 6G -m 8G,"cnvkit.py coverage -p 4 -q 0 $$(<) $$(ONTARGET_FILE) -o cnvkit/cnn/normal/$$(*).targetcoverage.cnn") - -cnvkit/cnn/normal/%.antitargetcoverage.cnn : bam/%.bam - $$(call RUN,-c -n 4 -s 6G -m 8G,"cnvkit.py coverage -p 4 -q 0 $$(<) $$(OFFTARGET_FILE) -o cnvkit/cnn/normal/$$(*).antitargetcoverage.cnn") -endef - $(foreach sample,$(NORMAL_SAMPLES),\ - $(eval $(call cnvkit-normal-cnn,$(sample)))) - -.PHONY: $(PHONY) - diff --git a/copy_number/cnvkitfix.mk b/copy_number/cnvkitfix.mk deleted file mode 100644 index 
c83aa1af..00000000 --- a/copy_number/cnvkitfix.mk +++ /dev/null @@ -1,18 +0,0 @@ -include modules/Makefile.inc -include modules/genome_inc/b37.inc - -LOGDIR ?= log/cnvkit_fix.$(NOW) -PHONY += cnvkit cnvkit/cnr - -cnvkit_fix : $(foreach sample,$(TUMOR_SAMPLES),cnvkit/cnr/$(sample).cnr) - -define cnvkit-cnr -cnvkit/cnr/%.cnr : cnvkit/cnn/tumor/%.targetcoverage.cnn cnvkit/cnn/tumor/%.antitargetcoverage.cnn cnvkit/reference/combined_reference.cnr - $$(call RUN,-c -s 6G -m 8G,"cnvkit.py fix $$(<) $$(<<) cnvkit/reference/combined_reference.cnr -o cnvkit/cnr/$$(*).cnr") - -endef - $(foreach sample,$(TUMOR_SAMPLES),\ - $(eval $(call cnvkit-cnr,$(sample)))) - -.PHONY: $(PHONY) - diff --git a/copy_number/cnvkitheatmap.R b/copy_number/cnvkitheatmap.R deleted file mode 100644 index 1513a473..00000000 --- a/copy_number/cnvkitheatmap.R +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("RColorBrewer")) - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list(make_option("--in_file", default = NA, type = 'character', help = "input file names"), - make_option("--out_file", default = NA, type = 'character', help = "output file name")) -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -in_file = unlist(strsplit(x=opt$in_file, split=" ", fixed=TRUE)) -out_file = opt$out_file - -depth = list() -for (i in 1:length(in_file)) { - print(i) - data = read.csv(file=in_file[i], header=TRUE, sep="\t", stringsAsFactors=FALSE) - index = data[,"chromosome"] %in% c(as.character(1:22), "X") - depth[[i]] = as.numeric(data[index,"depth"]) -} -depth = do.call(cbind, depth) -pdf(file=out_file, width=14, height=14) -heatmap(x=depth, labRow=rep(" ", nrow(depth)), labCol=rep(" ", ncol(depth)), col=colorRampPalette(RColorBrewer::brewer.pal(10, 
"RdBu"))(256)) -dev.off() - -png(file=gsub(".pdf", ".png", out_file, fixed=TRUE), width=1440, height=1440) -heatmap(x=depth, labRow=rep(" ", nrow(depth)), labCol=rep(" ", ncol(depth)), col=colorRampPalette(RColorBrewer::brewer.pal(10, "RdBu"))(256)) -dev.off() diff --git a/copy_number/cnvkitheatmap.mk b/copy_number/cnvkitheatmap.mk deleted file mode 100644 index cbbad3b1..00000000 --- a/copy_number/cnvkitheatmap.mk +++ /dev/null @@ -1,25 +0,0 @@ -include modules/Makefile.inc - -LOGDIR ?= log/cnvkit_heatmap.$(NOW) -PHONY += cnvkit cnvkit/heatmap - -CNVKIT_NORMAL_ON_TARGET ?= $(wildcard $(foreach sample,$(NORMAL_SAMPLES),cnvkit/cnn/normal/$(sample).targetcoverage.cnn)) -CNVKIT_NORMAL_OFF_TARGET ?= $(wildcard $(foreach sample,$(NORMAL_SAMPLES),cnvkit/cnn/normal/$(sample).antitargetcoverage.cnn)) -CNVKIT_TUMOR_ON_TARGET ?= $(wildcard $(foreach sample,$(TUMOR_SAMPLES),cnvkit/cnn/tumor/$(sample).targetcoverage.cnn)) -CNVKIT_TUMOR_OFF_TARGET ?= $(wildcard $(foreach sample,$(TUMOR_SAMPLES),cnvkit/cnn/tumor/$(sample).antitargetcoverage.cnn)) - -cnvkit : cnvkit/heatmap/normal_samples_ontarget.pdf cnvkit/heatmap/normal_samples_offtarget.pdf cnvkit/heatmap/tumor_samples_ontarget.pdf cnvkit/heatmap/tumor_samples_offtarget.pdf - -cnvkit/heatmap/normal_samples_ontarget.pdf : $(wildcard cnvkit/cnn/normal/$(NORMAL_SAMPLES).targetcoverage.cnn) - $(call RUN,-c -v ~/share/usr/anaconda-envs/ascat -s 32G -m 48G,"$(RSCRIPT) modules/copy_number/cnvkitheatmap.R --in_file '$(CNVKIT_NORMAL_ON_TARGET)' --out_file cnvkit/heatmap/normal_samples_ontarget.pdf") - -cnvkit/heatmap/normal_samples_offtarget.pdf : $(wildcard cnvkit/cnn/normal/$(NORMAL_SAMPLES).antitargetcoverage.cnn) - $(call RUN,-c -v ~/share/usr/anaconda-envs/ascat -s 32G -m 48G,"$(RSCRIPT) modules/copy_number/cnvkitheatmap.R --in_file '$(CNVKIT_NORMAL_OFF_TARGET)' --out_file cnvkit/heatmap/normal_samples_offtarget.pdf") - -cnvkit/heatmap/tumor_samples_ontarget.pdf : $(wildcard cnvkit/cnn/tumor/$(TUMOR_SAMPLES).targetcoverage.cnn) - 
$(call RUN,-c -v ~/share/usr/anaconda-envs/ascat -s 32G -m 48G,"$(RSCRIPT) modules/copy_number/cnvkitheatmap.R --in_file '$(CNVKIT_TUMOR_ON_TARGET)' --out_file cnvkit/heatmap/tumor_samples_ontarget.pdf") - -cnvkit/heatmap/tumor_samples_offtarget.pdf : $(wildcard cnvkit/cnn/tumor/$(TUMOR_SAMPLES).antitargetcoverage.cnn) - $(call RUN,-c -v ~/share/usr/anaconda-envs/ascat -s 32G -m 48G,"$(RSCRIPT) modules/copy_number/cnvkitheatmap.R --in_file '$(CNVKIT_TUMOR_OFF_TARGET)' --out_file cnvkit/heatmap/tumor_samples_offtarget.pdf") - -.PHONY: $(PHONY) diff --git a/copy_number/cnvkitplot.R b/copy_number/cnvkitplot.R deleted file mode 100644 index 971b6654..00000000 --- a/copy_number/cnvkitplot.R +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("copynumber")) -suppressPackageStartupMessages(library("colorspace")) -suppressPackageStartupMessages(library("ASCAT")) -suppressPackageStartupMessages(library("GAP")) - -'plot_log2_' <- function(x, title = "") -{ - par(mar=c(5, 5, 4, 2)+.1) - data("CytoBand") - end = NULL - for (i in 1:23) { - end = c(end, max(CytoBand[CytoBand[,1]==i,"End"])) - } - end = cumsum(end) - start = c(1, end[1:22]+1) - CytoBand = cbind(start, end) - index = NULL - for (i in 1:23) { - index = c(index, seq(from = CytoBand[i, "start"], to=CytoBand[i, "end"], length=sum(x$chromosome==i))) - } - plot(index, x$log2, type="p", pch=".", cex=1.95, col="grey80", axes=FALSE, frame=TRUE, xlab="", ylab="", main="", ylim=c(-4,5)) - axis(2, at = c(-4, -2, 0, 2, 4), labels = c(-4, -2, 0, 2, 4), cex.axis = 1, las = 1) - mtext(side = 2, text = expression(Log[2]~"Ratio"), line = 3.15, cex = 1.25) - abline(v=1, col="goldenrod3", lty=3, lwd=.5) - abline(h=0, col="red", lty=1, lwd=1) - for (j in 1:23) { - abline(v=CytoBand[j,"end"], col="goldenrod3", lty=3, lwd=.5) - } - axis(1, at = .5*(CytoBand[,"start"]+CytoBand[,"end"]), labels=c(1:22, "X"), cex.axis = 0.85, las = 1) - 
rect(xleft=1-1e10, xright=CytoBand[23,"end"]+1e10, ybottom=4, ytop=6, col="lightgrey", border="black", lwd=1.5) - title(main = title, line=-1, cex.main=.75, font.main=1) - box(lwd=1.5) -} - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list(make_option("--in_file", default = NA, type = 'character', help = "input file name")) -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -outfile_on_target = gsub("cnr", "log2", gsub(".cnr", ".ontarget.pdf", opt$in_file, fixed=TRUE), fixed=TRUE) -outfile_off_target = gsub("cnr", "log2", gsub(".cnr", ".offtarget.pdf", opt$in_file, fixed=TRUE), fixed=TRUE) - -data = read.table(file=opt$in_file, header=TRUE, sep="\t", comment.char="#", stringsAsFactors=FALSE) -data = subset(data, data[,"depth"]!=0) - -if (nrow(data)==0) { - system(paste0("touch ", outfile_on_target)) - system(paste0("touch ", outfile_off_target)) -} else { - data[,"chromosome"] = gsub(pattern="chr", replacement="", x=data[,"chromosome"], fixed=TRUE) - data[data[,"chromosome"]=="X", "chromosome"] = 23 - data[data[,"chromosome"]=="Y", "chromosome"] = 24 - data[,"chromosome"] = as.numeric(data[,"chromosome"]) - data = subset(data, data[,"chromosome"]<=23) - - if (sum(data$gene=="-")>0) { - flag = 1 - } else if (sum(data$gene=="Antitarget")>0) { - flag = 2 - } - - if (flag==1) { - ontarget = subset(data, data$gene=="-") - } else if (flag==2) { - ontarget = subset(data, data$gene!="Antitarget") - } - - pdf(file=outfile_on_target, width=10, height=4.25) - plot_log2_(x=ontarget, title=gsub("cnvkit/cnr/", "", gsub(".cnr", "", opt$in_file, fixed=TRUE), fixed=TRUE)) - dev.off() - - if (flag==1) { - offtarget = subset(data, data$gene!="-") - } else if (flag==2) { - offtarget = subset(data, data$gene=="Antitarget") - } - - tmp = offtarget[,c("chromosome", "start", "log2"),drop=FALSE] - tmp = 
winsorize(data=tmp, tau=3.5, k=25, verbose=FALSE, return.outliers=TRUE) - offtarget[tmp$wins.outliers[,3]!=0,"log2"] = NA - pdf(file=outfile_off_target, width=10, height=4.25) - plot_log2_(x=offtarget, title=gsub("cnvkit/cnr/", "", gsub(".cnr", "", opt$in_file, fixed=TRUE), fixed=TRUE)) - dev.off() -} diff --git a/copy_number/cnvkitplot.mk b/copy_number/cnvkitplot.mk deleted file mode 100644 index ba16ff8d..00000000 --- a/copy_number/cnvkitplot.mk +++ /dev/null @@ -1,16 +0,0 @@ -include modules/Makefile.inc -include modules/genome_inc/b37.inc - -LOGDIR ?= log/cnvkit_plot.$(NOW) -PHONY += cnvkit cnvkit/log2 - -cnvkit_plot : $(foreach sample,$(TUMOR_SAMPLES),cnvkit/log2/$(sample).ontarget.pdf cnvkit/log2/$(sample).offtarget.pdf) - -define cnvkit-plot -cnvkit/log2/%.ontarget.pdf cnvkit/log2/%.offtarget.pdf : cnvkit/cnr/%.cnr - $$(call RUN,-c -v ~/share/usr/anaconda-envs/ascat -s 4G -m 6G,"$(RSCRIPT) modules/copy_number/cnvkitplot.R --in_file $$(<)") -endef - $(foreach sample,$(TUMOR_SAMPLES),\ - $(eval $(call cnvkit-plot,$(sample)))) - -.PHONY: $(PHONY) diff --git a/copy_number/cnvkitprcomp.R b/copy_number/cnvkitprcomp.R deleted file mode 100644 index 0353609e..00000000 --- a/copy_number/cnvkitprcomp.R +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("RColorBrewer")) - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list(make_option("--normal_files", default = NA, type = 'character', help = "normal samples input file names"), - make_option("--tumor_files", default = NA, type = 'character', help = "tumor samples input file names"), - make_option("--out_file_normal", default = NA, type = 'character', help = "normal samples output file name"), - make_option("--out_file_tumor", default = NA, type = 'character', help = "tumor samples output file name")) -parser <- OptionParser(usage = "%prog", option_list = 
args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -in_file_normal = unlist(strsplit(x=opt$normal_files, split=" ", fixed=TRUE)) -normal_samples = gsub(".antitargetcoverage", "", x=gsub(".targetcoverage", "", x=gsub(pattern=".cnn", replacement="", x=gsub(pattern="cnvkit/cnn/normal/", replacement="", x=in_file_normal, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE) -in_file_tumor = unlist(strsplit(x=opt$tumor_files, split=" ", fixed=TRUE)) -tumor_samples = gsub(".antitargetcoverage", "", x=gsub(".targetcoverage", "", x=gsub(pattern=".cnn", replacement="", x=gsub(pattern="cnvkit/cnn/tumor/", replacement="", x=in_file_tumor, fixed=TRUE), fixed=TRUE), fixed=TRUE), fixed=TRUE) -out_file_normal = opt$out_file_normal -out_file_tumor = opt$out_file_tumor - -depth_n = list() -for (i in 1:length(in_file_normal)) { - print(i) - data = read.csv(file=in_file_normal[i], header=TRUE, sep="\t", stringsAsFactors=FALSE) - index = data[,"chromosome"] %in% as.character(1:22) - depth_n[[i]] = as.numeric(data[index,"depth"]) -} -depth_n = do.call(cbind, depth_n) - -depth_t = list() -for (i in 1:length(in_file_tumor)) { - print(i) - data = read.csv(file=in_file_tumor[i], header=TRUE, sep="\t", stringsAsFactors=FALSE) - index = data[,"chromosome"] %in% as.character(1:22) - depth_t[[i]] = as.numeric(data[index,"depth"]) -} -depth_t = do.call(cbind, depth_t) - -pca_n = prcomp(t(depth_n), center=TRUE, scale.=TRUE) -pca_t = predict(object=pca_n, newdata=t(depth_t)) -x = c(pca_n$x[,1], pca_t[,1]) -y = c(pca_n$x[,2], pca_t[,2]) -bg = c(rep("grey90", nrow(pca_n$x)), rep("steelblue", nrow(pca_t))) -col = c(rep("grey50", nrow(pca_n$x)), rep("black", nrow(pca_t))) -pch = 21 -index = c(rep(TRUE, nrow(pca_n$x)), rep(FALSE, nrow(pca_t))) - -pdf(file=out_file_normal, width=9, height=9) -par(mar = c(6.1, 6.5, 4.1, 1.1)) -plot(x=x, y=y, type="n", axes = FALSE, frame.plot = FALSE, main = "", xlab = "", ylab = "") -points(x=x[index], y=y[index], col = 
col[index], bg = bg[index], pch = pch, cex = 1, lwd = .1) -axis(1, at = NULL, cex.axis = 1.5, padj = 0.25, lwd=1.25, lwd.ticks=1.15) -axis(2, at = NULL, cex.axis = 1.5, las = 1, lwd=1.25, lwd.ticks=1.15) -mtext(side = 1, text = "PC 1", line = 4, cex = 1.5) -mtext(side = 2, text = "PC 2", line = 4, cex = 1.5) -dev.off() - -pdf(file=out_file_tumor, width=9, height=9) -par(mar = c(6.1, 6.5, 4.1, 1.1)) -plot(x=x, y=y, type="n", axes = FALSE, frame.plot = FALSE, main = "", xlab = "", ylab = "") -points(x=x[!index], y=y[!index], col = col[!index], bg = bg[!index], pch = pch, cex = 1, lwd = .1) -axis(1, at = NULL, cex.axis = 1.5, padj = 0.25, lwd=1.25, lwd.ticks=1.15) -axis(2, at = NULL, cex.axis = 1.5, las = 1, lwd=1.25, lwd.ticks=1.15) -mtext(side = 1, text = "PC 1", line = 4, cex = 1.5) -mtext(side = 2, text = "PC 2", line = 4, cex = 1.5) -dev.off() - -pdf(file=gsub("tumor", "all", out_file_tumor, fixed=TRUE), width=9, height=9) -par(mar = c(6.1, 6.5, 4.1, 1.1)) -plot(x=x, y=y, col = col, bg = bg, pch = pch, cex = 1, lwd = .1, axes = FALSE, frame.plot = FALSE, main = "", xlab = "", ylab = "") -axis(1, at = NULL, cex.axis = 1.5, padj = 0.25, lwd=1.25, lwd.ticks=1.15) -axis(2, at = NULL, cex.axis = 1.5, las = 1, lwd=1.25, lwd.ticks=1.15) -mtext(side = 1, text = "PC 1", line = 4, cex = 1.5) -mtext(side = 2, text = "PC 2", line = 4, cex = 1.5) -dev.off() - -data = rbind(pca_n$x, pca_t) -rownames(data) = c(normal_samples, tumor_samples) -colnames(data) = paste("PC", 1:ncol(data)) -file_name = paste0("cnvkit/pca/pc_", ifelse(grepl("offtarget", out_file_tumor, fixed=TRUE), "offtarget", "ontarget"), ".txt") -write.table(data, file=file_name, sep="\t", col.names=TRUE, row.names=TRUE, quote=FALSE) diff --git a/copy_number/cnvkitprcomp.mk b/copy_number/cnvkitprcomp.mk deleted file mode 100644 index e787762f..00000000 --- a/copy_number/cnvkitprcomp.mk +++ /dev/null @@ -1,19 +0,0 @@ -include modules/Makefile.inc - -LOGDIR ?= log/cnvkit_pca.$(NOW) -PHONY += cnvkit cnvkit/pca - 
-CNVKIT_NORMAL_ON_TARGET ?= $(wildcard $(foreach sample,$(NORMAL_SAMPLES),cnvkit/cnn/normal/$(sample).targetcoverage.cnn)) -CNVKIT_NORMAL_OFF_TARGET ?= $(wildcard $(foreach sample,$(NORMAL_SAMPLES),cnvkit/cnn/normal/$(sample).antitargetcoverage.cnn)) -CNVKIT_TUMOR_ON_TARGET ?= $(wildcard $(foreach sample,$(TUMOR_SAMPLES),cnvkit/cnn/tumor/$(sample).targetcoverage.cnn)) -CNVKIT_TUMOR_OFF_TARGET ?= $(wildcard $(foreach sample,$(TUMOR_SAMPLES),cnvkit/cnn/tumor/$(sample).antitargetcoverage.cnn)) - -cnvkit : cnvkit/pca/normal_samples_ontarget.pdf cnvkit/pca/normal_samples_offtarget.pdf cnvkit/pca/tumor_samples_ontarget.pdf cnvkit/pca/tumor_samples_offtarget.pdf - -cnvkit/pca/normal_samples_ontarget.pdf cnvkit/pca/tumor_samples_ontarget.pdf : $(wildcard cnvkit/cnn/tumor/$(NORMAL_SAMPLES).targetcoverage.cnn) $(wildcard cnvkit/cnn/tumor/$(TUMOR_SAMPLES).targetcoverage.cnn) - $(call RUN,-c -v ~/share/usr/anaconda-envs/ascat -s 32G -m 48G,"$(RSCRIPT) modules/copy_number/cnvkitprcomp.R --normal_files '$(CNVKIT_NORMAL_ON_TARGET)' --tumor_files '$(CNVKIT_TUMOR_ON_TARGET)' --out_file_normal cnvkit/pca/normal_samples_ontarget.pdf --out_file_tumor cnvkit/pca/tumor_samples_ontarget.pdf") - -cnvkit/pca/normal_samples_offtarget.pdf cnvkit/pca/tumor_samples_offtarget.pdf : $(wildcard cnvkit/cnn/tumor/$(NORMAL_SAMPLES).antitargetcoverage.cnn) $(wildcard cnvkit/cnn/tumor/$(TUMOR_SAMPLES).antitargetcoverage.cnn) - $(call RUN,-c -v ~/share/usr/anaconda-envs/ascat -s 32G -m 48G,"$(RSCRIPT) modules/copy_number/cnvkitprcomp.R --normal_files '$(CNVKIT_NORMAL_OFF_TARGET)' --tumor_files '$(CNVKIT_TUMOR_OFF_TARGET)' --out_file_normal cnvkit/pca/normal_samples_offtarget.pdf --out_file_tumor cnvkit/pca/tumor_samples_offtarget.pdf") - -.PHONY: $(PHONY) diff --git a/copy_number/cnvkitqc.R b/copy_number/cnvkitqc.R deleted file mode 100644 index a8002d3a..00000000 --- a/copy_number/cnvkitqc.R +++ /dev/null @@ -1,140 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) 
- -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list(make_option("--normal_files", default = NA, type = 'character', help = "normal input files"), - make_option("--tumor_files", default = NA, type = 'character', help = "tumor input files"), - make_option("--out_file", default = NA, type = 'character', help = "output file"), - make_option("--option", default = NA, type = 'character', help = "1-0 for ontarget or offtarget")) -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -normal_files = unlist(strsplit(x=opt$normal_files, split=" ", fixed=TRUE)) -normal_samples = gsub(pattern=".cnr", replacement="", x=gsub(pattern="cnvkit/cnr/", replacement="", x=normal_files, fixed=TRUE), fixed=TRUE) -tumor_files = unlist(strsplit(x=opt$tumor_files, split=" ", fixed=TRUE)) -tumor_samples = gsub(pattern=".cnr", replacement="", x=gsub(pattern="cnvkit/cnr/", replacement="", x=tumor_files, fixed=TRUE), fixed=TRUE) -out_file = opt$out_file - -'MAD' <- function(x) -{ - x = na.omit(x) - q2 = mad(x) - return(invisible(q2)) -} - -'MAPD' <- function(x) -{ - x = na.omit(x) - q2 = median(abs(x[1:(length(x)-1)] - x[2:length(x)])) - return(invisible(q2)) -} - -'MIQR' <- function(x) -{ - x = na.omit(x) - iq = stats::IQR(abs(x[1:(length(x)-1)] - x[2:length(x)])) - return(invisible(iq)) -} - -'scale.' 
<- function(x) -{ - y = (x-min(x))/(max(x)-min(x)) - return(invisible(y)) -} - -'transparentRgb' <- function (col = "black", alpha = 85) -{ - tmp = c(col2rgb(col), alpha, 255) - names(tmp) = c("red", "green", "blue", "alpha", "maxColorValue") - out = do.call("rgb", as.list(tmp)) - return(invisible(out)) -} - - -qc = matrix(NA, nrow=length(c(normal_samples, tumor_samples)), ncol=3, dimnames=list(c(normal_samples, tumor_samples), c("MAD", "MAPD", "IQR"))) -for (i in 1:length(normal_files)) { - print(i) - data = read.csv(file=normal_files[i], header=TRUE, sep="\t", stringsAsFactors=FALSE) - index = data[,"chromosome"] %in% 1:22 & data[,"gene"] == ifelse(opt$option==1, "-", "Antitarget") - qc[normal_samples[i],1] = MAD(data[index,"log2"]) - qc[normal_samples[i],2] = MAPD(data[index,"log2"]) - qc[normal_samples[i],3] = MIQR(data[index,"log2"]) -} -for (i in 1:length(tumor_files)) { - print(i) - data = read.csv(file=tumor_files[i], header=TRUE, sep="\t", stringsAsFactors=FALSE) - index = data[,"chromosome"] %in% 1:22 & data[,"gene"] == ifelse(opt$option==1, "-", "Antitarget") - qc[tumor_samples[i],1] = MAD(data[index,"log2"]) - qc[tumor_samples[i],2] = MAPD(data[index,"log2"]) - qc[tumor_samples[i],3] = MIQR(data[index,"log2"]) -} -data = qc -colnames(data) = c("MAD", "MAPD", "IQR") -data = cbind("SAMPLE_NAME"=c(normal_samples, tumor_samples), "SAMPLE_TYPE"=c(rep("N", length(normal_samples)), rep("T", length(tumor_samples))), data) -write.table(data, file=out_file, sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE) - -# MAPD -file_name = paste0("cnvkit/qc/", ifelse(opt$option==1, "on", "off"), "target_mapd.pdf") -x = as.numeric(data[data[,"SAMPLE_TYPE"]=="T", "MAPD"]) -y = as.numeric(data[data[,"SAMPLE_TYPE"]=="N", "MAPD"]) -dx = density(x, from=0, to=max(x,y)) -dx$y = scale.(dx$y) -dy = density(y, from=0, to=max(x,y)) -dy$y = scale.(dy$y) -pdf(file=file_name, width=7, height=7) -par(mar = c(6.1, 6.5, 4.1, 1.1)) -plot(0, 0, type="n", axes = FALSE, frame.plot = 
FALSE, main = "", xlab = "", ylab = "", xlim=c(0, max(max(x, y), 1.5)), ylim=c(0,1.2)) -polygon(x=c(dx$x, rev(dx$x)), y=c(dx$y, rep(0, length(dx$y))), border="steelblue", col=transparentRgb("steelblue", 155), lwd=2) -polygon(x=c(dy$x, rev(dy$x)), y=c(dy$y, rep(0, length(dy$y))), border="grey50", col=transparentRgb("grey50", 155), lwd=2) -legend("topright", col=c("steelblue", "grey50"), pch=15, legend=c("Tumor", "Normal"), box.lwd=-1) -axis(1, at = NULL, cex.axis = 1.5, padj = 0.25, lwd=1.25, lwd.ticks=1.15) -axis(2, at = seq(0, 1, by=.2), labels = seq(0, 1, by=.2), cex.axis = 1.5, las = 1, lwd=1.25, lwd.ticks=1.15) -mtext(side = 1, text = "MAPD", line = 4, cex = 1.5) -mtext(side = 2, text = "Density", line = 4, cex = 1.5) -dev.off() - -# MAD -file_name = paste0("cnvkit/qc/", ifelse(opt$option==1, "on", "off"), "target_mad.pdf") -x = as.numeric(data[data[,"SAMPLE_TYPE"]=="T", "MAD"]) -y = as.numeric(data[data[,"SAMPLE_TYPE"]=="N", "MAD"]) -dx = density(x, from=0, to=max(x,y)) -dx$y = scale.(dx$y) -dy = density(y, from=0, to=max(x,y)) -dy$y = scale.(dy$y) -pdf(file=file_name, width=7, height=7) -par(mar = c(6.1, 6.5, 4.1, 1.1)) -plot(0, 0, type="n", axes = FALSE, frame.plot = FALSE, main = "", xlab = "", ylab = "", xlim=c(0, max(max(x, y), 1.5)), ylim=c(0,1.2)) -polygon(x=c(dx$x, rev(dx$x)), y=c(dx$y, rep(0, length(dx$y))), border="steelblue", col=transparentRgb("steelblue", 155), lwd=2) -polygon(x=c(dy$x, rev(dy$x)), y=c(dy$y, rep(0, length(dy$y))), border="grey50", col=transparentRgb("grey50", 155), lwd=2) -legend("topright", col=c("steelblue", "grey50"), pch=15, legend=c("Tumor", "Normal"), box.lwd=-1) -axis(1, at = NULL, cex.axis = 1.5, padj = 0.25, lwd=1.25, lwd.ticks=1.15) -axis(2, at = seq(0, 1, by=.2), labels = seq(0, 1, by=.2), cex.axis = 1.5, las = 1, lwd=1.25, lwd.ticks=1.15) -mtext(side = 1, text = "MAD", line = 4, cex = 1.5) -mtext(side = 2, text = "Density", line = 4, cex = 1.5) -dev.off() - -# IQR -file_name = paste0("cnvkit/qc/", ifelse(opt$option==1, 
"on", "off"), "target_iqr.pdf") -x = as.numeric(data[data[,"SAMPLE_TYPE"]=="T", "IQR"]) -y = as.numeric(data[data[,"SAMPLE_TYPE"]=="N", "IQR"]) -dx = density(x, from=0, to=max(x,y)) -dx$y = scale.(dx$y) -dy = density(y, from=0, to=max(x,y)) -dy$y = scale.(dy$y) -pdf(file=file_name, width=7, height=7) -par(mar = c(6.1, 6.5, 4.1, 1.1)) -plot(0, 0, type="n", axes = FALSE, frame.plot = FALSE, main = "", xlab = "", ylab = "", xlim=c(0, max(max(x, y), 1.5)), ylim=c(0,1.2)) -polygon(x=c(dx$x, rev(dx$x)), y=c(dx$y, rep(0, length(dx$y))), border="steelblue", col=transparentRgb("steelblue", 155), lwd=2) -polygon(x=c(dy$x, rev(dy$x)), y=c(dy$y, rep(0, length(dy$y))), border="grey50", col=transparentRgb("grey50", 155), lwd=2) -legend("topright", col=c("steelblue", "grey50"), pch=15, legend=c("Tumor", "Normal"), box.lwd=-1) -axis(1, at = NULL, cex.axis = 1.5, padj = 0.25, lwd=1.25, lwd.ticks=1.15) -axis(2, at = seq(0, 1, by=.2), labels = seq(0, 1, by=.2), cex.axis = 1.5, las = 1, lwd=1.25, lwd.ticks=1.15) -mtext(side = 1, text = "IQR", line = 4, cex = 1.5) -mtext(side = 2, text = "Density", line = 4, cex = 1.5) -dev.off() - diff --git a/copy_number/cnvkitqc.mk b/copy_number/cnvkitqc.mk deleted file mode 100644 index ab73e82f..00000000 --- a/copy_number/cnvkitqc.mk +++ /dev/null @@ -1,28 +0,0 @@ -include modules/Makefile.inc - -LOGDIR ?= log/cnvkit_qc.$(NOW) -PHONY += cnvkit cnvkit/qc - -CNVKIT_NORMAL ?= $(wildcard $(foreach sample,$(NORMAL_SAMPLES),cnvkit/cnr/$(sample).cnr)) -CNVKIT_TUMOR ?= $(wildcard $(foreach sample,$(TUMOR_SAMPLES),cnvkit/cnr/$(sample).cnr)) -CNVKIT_NORMAL_ON_TARGET ?= $(wildcard $(foreach sample,$(NORMAL_SAMPLES),cnvkit/cnn/normal/$(sample).targetcoverage.cnn)) -CNVKIT_NORMAL_OFF_TARGET ?= $(wildcard $(foreach sample,$(NORMAL_SAMPLES),cnvkit/cnn/normal/$(sample).antitargetcoverage.cnn)) -CNVKIT_TUMOR_ON_TARGET ?= $(wildcard $(foreach sample,$(TUMOR_SAMPLES),cnvkit/cnn/tumor/$(sample).targetcoverage.cnn)) -CNVKIT_TUMOR_OFF_TARGET ?= $(wildcard $(foreach 
sample,$(TUMOR_SAMPLES),cnvkit/cnn/tumor/$(sample).antitargetcoverage.cnn)) - -cnvkit : cnvkit/qc/qc_ontarget.tsv cnvkit/qc/qc_offtarget.tsv cnvkit/qc/bin_qc_ontarget.tsv cnvkit/qc/bin_qc_offtarget.tsv - -cnvkit/qc/qc_ontarget.tsv : $(wildcard cnvkit/cnr/$(SAMPLES).cnr) - $(call RUN,-c -v ~/share/usr/anaconda-envs/ascat -s 8G -m 16G,"$(RSCRIPT) modules/copy_number/cnvkitqc.R --normal_files '$(CNVKIT_NORMAL)' --tumor_files '$(CNVKIT_TUMOR)' --out_file cnvkit/qc/qc_ontarget.tsv --option 1") - -cnvkit/qc/qc_offtarget.tsv : $(wildcard cnvkit/cnr/$(SAMPLES).cnr) - $(call RUN,-c -v ~/share/usr/anaconda-envs/ascat -s 8G -m 16G,"$(RSCRIPT) modules/copy_number/cnvkitqc.R --normal_files '$(CNVKIT_NORMAL)' --tumor_files '$(CNVKIT_TUMOR)' --out_file cnvkit/qc/qc_offtarget.tsv --option 0") - -cnvkit/qc/bin_qc_ontarget.tsv : $(wildcard cnvkit/cnn/normal/$(NORMAL_SAMPLES).targetcoverage.cnn) $(wildcard cnvkit/cnn/tumor/$(TUMOR_SAMPLES).targetcoverage.cnn) - $(call RUN,-c -v ~/share/usr/anaconda-envs/ascat -s 32G -m 48G,"$(RSCRIPT) modules/copy_number/cnvkitbinqc.R --normal_files '$(CNVKIT_NORMAL_ON_TARGET)' --tumor_files '$(CNVKIT_TUMOR_ON_TARGET)' --out_file cnvkit/qc/bin_qc_ontarget.tsv") - -cnvkit/qc/bin_qc_offtarget.tsv : $(wildcard cnvkit/cnn/normal/$(NORMAL_SAMPLES).antitargetcoverage.cnn) $(wildcard cnvkit/cnn/tumor/$(TUMOR_SAMPLES).antitargetcoverage.cnn) - $(call RUN,-c -v ~/share/usr/anaconda-envs/ascat -s 32G -m 48G,"$(RSCRIPT) modules/copy_number/cnvkitbinqc.R --normal_files '$(CNVKIT_NORMAL_OFF_TARGET)' --tumor_files '$(CNVKIT_TUMOR_OFF_TARGET)' --out_file cnvkit/qc/bin_qc_offtarget.tsv") - - -.PHONY: $(PHONY) diff --git a/copy_number/cnvkitreference.mk b/copy_number/cnvkitreference.mk deleted file mode 100644 index f4932a9f..00000000 --- a/copy_number/cnvkitreference.mk +++ /dev/null @@ -1,13 +0,0 @@ -include modules/Makefile.inc -include modules/genome_inc/b37.inc - -LOGDIR ?= log/cnvkit_reference.$(NOW) -PHONY += cnvkit cnvkit/reference - -cnvkit_reference : 
cnvkit/reference/combined_reference.cnr - -cnvkit/reference/combined_reference.cnr : $(wildcard cnvkit/cnn/normal/$(NORMAL_SAMPLES).targetcoverage.cnn) $(wildcard cnvkit/cnn/normal/$(NORMAL_SAMPLES).antitargetcoverage.cnn) - $(call RUN,-n 1 -s 24G -m 32G,"cnvkit.py reference cnvkit/cnn/normal/*.cnn -f $(REF_FASTA) --no-edge -o cnvkit/reference/combined_reference.cnr") - -.PHONY: $(PHONY) - diff --git a/copy_number/cnvkitsegment.mk b/copy_number/cnvkitsegment.mk deleted file mode 100644 index 7c051d3d..00000000 --- a/copy_number/cnvkitsegment.mk +++ /dev/null @@ -1,23 +0,0 @@ -include modules/Makefile.inc -include modules/genome_inc/b37.inc - -LOGDIR ?= log/cnvkit_segment.$(NOW) -PHONY += cnvkit cnvkit/segmented cnvkit/totalcopy cnvkit/called - -cnvkit_segment : $(foreach sample,$(TUMOR_SAMPLES),cnvkit/totalcopy/$(sample).RData) $(foreach sample,$(TUMOR_SAMPLES),cnvkit/segmented/$(sample).pdf) $(foreach sample,$(TUMOR_SAMPLES),cnvkit/called/$(sample).RData) - -define cnvkit-totalcopy -cnvkit/segmented/%.pdf cnvkit/totalcopy/%.RData : cnvkit/cnr/%.cnr - $$(call RUN,-c -v $(ASCAT_ENV) -s 6G -m 12G,"mkdir -p cnvkit/segmented && \ - mkdir -p cnvkit/totalcopy && \ - $(RSCRIPT) modules/copy_number/cnvkit.R --type total-copy --sample_name $$(*)") - -cnvkit/called/%.RData : cnvkit/totalcopy/%.RData - $$(call RUN,-c -v $(ASCAT_ENV) -s 6G -m 12G,"mkdir -p cnvkit/called && \ - $(RSCRIPT) modules/copy_number/cnvkit.R --type call-cna --sample_name $$(*)") - -endef - $(foreach sample,$(TUMOR_SAMPLES),\ - $(eval $(call cnvkit-totalcopy,$(sample)))) - -.PHONY: $(PHONY) diff --git a/copy_number/cnvkitsummary.R b/copy_number/cnvkitsummary.R deleted file mode 100755 index cfa7bf5b..00000000 --- a/copy_number/cnvkitsummary.R +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("RColorBrewer")) -suppressPackageStartupMessages(library("GenomicRanges")) 
-suppressPackageStartupMessages(library("plyr")) -suppressPackageStartupMessages(library("dplyr")) -suppressPackageStartupMessages(library("stringr")) -suppressPackageStartupMessages(library("tidyr")) -suppressPackageStartupMessages(library("magrittr")) -suppressPackageStartupMessages(library("foreach")) -suppressPackageStartupMessages(library("rtracklayer")) -suppressPackageStartupMessages(library("grid")) -suppressPackageStartupMessages(library("rlist")) - -optList <- list( - make_option("--sample_names", default = NULL, help = "list of sample names") - ) - -parser <- OptionParser(usage = "%prog [options] [facets files]", option_list = optList) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -sample_names = unlist(strsplit(opt$sample_names, split=" ", fixed=TRUE)) -genes = read.csv(file="~/share/reference/annotation_gene_lists/annotation_impact_468.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE) %>% - filter(Chromosome %in% as.character(c(1:22, "X", "Y"))) %>% - filter(!duplicated(Gene_Symbol)) %>% - arrange(as.integer(Chromosome), Start, End) - -genes_granges = genes %$% - GRanges(seqnames = Chromosome, ranges = IRanges(Start, End), Gene_Symbol = Gene_Symbol) -mm = lapply(1:length(sample_names), function(i, sample_names, genes, genes_granges) { - cat(i, "of", length(sample_names), "\n") - load(paste0("cnvkit/called/", sample_names[i], ".RData")) - tmp[tmp[,"Chromosome"]==23,"Chromosome"] = "X" - tmp[tmp[,"Chromosome"]==24,"Chromosome"] = "Y" - tmp_granges = tmp %$% GRanges(seqnames = Chromosome, ranges = IRanges(Start, End)) - mcols(tmp_granges) = tmp %>% select(Cat5) - fo = findOverlaps(tmp_granges, genes_granges) - x = mcols(genes_granges)[subjectHits(fo),] - y = mcols(tmp_granges)[queryHits(fo),] - df = data.frame("Gene_Symbol"=x, "Cat5"=y) - df = df %>% - group_by(Gene_Symbol) %>% - top_n(1, abs(Cat5)) - z = as.numeric(df$Cat5) - names(z) = as.character(df$Gene_Symbol) - z = z[names(z) %in% genes[,1]] - res = 
rep(NA, nrow(genes)) - names(res) = genes[,1] - res[names(z)] = z - return(res) -}, sample_names, genes, genes_granges) -bygene = do.call(cbind, mm) -colnames(bygene) = sample_names -bygene = cbind(genes, bygene) %>% - arrange(as.integer(Chromosome), Start, End) - -save(bygene, file="cnvkit/summary/bygene.RData") -write.table(bygene, file="cnvkit/summary/bygene.txt", sep="\t", col.names=TRUE, row.names=FALSE, na="", quote=FALSE) diff --git a/copy_number/cnvkitsummary.mk b/copy_number/cnvkitsummary.mk deleted file mode 100644 index 41aeeffb..00000000 --- a/copy_number/cnvkitsummary.mk +++ /dev/null @@ -1,13 +0,0 @@ -include modules/Makefile.inc -include modules/genome_inc/b37.inc - -LOGDIR ?= log/cnvkit_summary.$(NOW) -PHONY += cnvkit cnvkit/summary - -cnvkit_summary : cnvkit/summary/bygene.txt - -cnvkit/summary/bygene.txt : $(foreach sample,$(TUMOR_SAMPLES),cnvkit/called/$(sample).RData) - $(call RUN,-c -s 24G -m 48G,"mkdir -p cnvkit/summary && \ - $(RSCRIPT) modules/copy_number/cnvkitsummary.R --sample_names '$(TUMOR_SAMPLES)'") - -.PHONY: $(PHONY) diff --git a/copy_number/facets_suite.mk b/copy_number/facets_suite.mk new file mode 100644 index 00000000..dfd9dfa3 --- /dev/null +++ b/copy_number/facets_suite.mk @@ -0,0 +1,72 @@ +include modules/Makefile.inc + +LOGDIR ?= log/facets_suite.$(NOW) + +FACETS_MAX_DEPTH ?= 15000 +FACETS_CVAL ?= 50 +FACETS_PURITY_CVAL ?= 30 +FACETS_MIN_NHET ?= 15 +FACETS_PURITY_MIN_NHET ?= 10 +SNP_WINDOW_SIZE ?= 250 +NORMAL_DEPTH ?= 25 + +facets_suite : facets_suite/targets_dbsnp.vcf \ + $(foreach pair,$(SAMPLE_PAIRS),facets_suite/$(pair)/$(pair).snp_pileup.gz) \ + $(foreach pair,$(SAMPLE_PAIRS),facets_suite/$(pair)/taskcomplete) \ + facets_suite/summary.txt + +facets_suite/targets_dbsnp.vcf : $(TARGETS_FILE) + $(INIT) $(BEDTOOLS) intersect -header -u -a $(DBSNP) -b $< > $@ + + +define snp-pileup +facets_suite/$1_$2/$1_$2.snp_pileup.gz : facets_suite/targets_dbsnp.vcf bam/$1.bam bam/$2.bam + $$(call RUN,-c -s 2G -m 4G -v 
$(FACETS_SUITE_ENV),"set -o pipefail && \ + snp-pileup-wrapper.R --verbose \ + -sp /home/$(USER)/share/usr/env/r-facets-suite-2.0.8/bin/snp-pileup \ + --vcf-file $$(<) \ + --tumor-bam $$(word 2,$$^) \ + --normal-bam $$(word 3,$$^) \ + --output-prefix facets_suite/$1_$2/$1_$2 \ + --pseudo-snps NULL \ + --max-depth $$(FACETS_MAX_DEPTH)") + +endef +$(foreach pair,$(SAMPLE_PAIRS),\ + $(eval $(call snp-pileup,$(tumor.$(pair)),$(normal.$(pair))))) + +define run-facets +facets_suite/$1_$2/taskcomplete : facets_suite/$1_$2/$1_$2.snp_pileup.gz + $$(call RUN,-c -s 4G -m 6G -v $(FACETS_SUITE_ENV),"set -o pipefail && \ + run-facets-wrapper.R --verbose \ + --counts-file $$(<) \ + --sample-id $1_$2 \ + --directory facets_suite/$1_$2/ \ + --everything \ + --genome hg19 \ + --cval $$(FACETS_CVAL) \ + --purity-cval $$(FACETS_PURITY_CVAL) \ + --min-nhet $$(FACETS_MIN_NHET) \ + --purity-min-nhet $$(FACETS_PURITY_MIN_NHET) \ + --snp-window-size $$(SNP_WINDOW_SIZE) \ + --normal-depth $$(NORMAL_DEPTH) \ + --seed 0 \ + --legacy-output True \ + --facets-lib-path /home/$(USER)/share/usr/env/r-facets-suite-2.0.8/lib/R/library/ && \ + echo 'finished!'
> $$(@)") + +endef +$(foreach pair,$(SAMPLE_PAIRS),\ + $(eval $(call run-facets,$(tumor.$(pair)),$(normal.$(pair))))) + + +facets_suite/summary.txt : $(foreach pair,$(SAMPLE_PAIRS),facets_suite/$(pair)/taskcomplete) + $(call RUN, -c -n 1 -s 24G -m 48G -v $(INNOVATION_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/facets_suite.R --option 1 --sample_pairs '$(SAMPLE_PAIRS)'") + + +..DUMMY := $(shell mkdir -p version; \ + $(FACETS_SUITE_ENV)/bin/R --version > version/facets_suite.txt) +.DELETE_ON_ERROR: +.SECONDARY: +.PHONY: facets_suite diff --git a/copy_number/genomealtered.R b/copy_number/genomealtered.R deleted file mode 100644 index c2e10564..00000000 --- a/copy_number/genomealtered.R +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list(make_option("--file_in", default = NA, type = 'character', help = "input file name"), - make_option("--file_out", default = NA, type = 'character', help = "output file name")) - -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -load(opt$file_in) -alpha = ifelse(is.na(fit$purity), 1, fit$purity) -psi = ifelse(is.na(fit$ploidy), 2, fit$ploidy) -gamma = 1 -x = fit$cncf[,"cnlr.median"] -absolute_copies = round(((((2^(x/gamma))*(alpha*psi+(1-alpha)*2)) - ((1-alpha)*2))/alpha)) -index = absolute_copies!=round(psi) -if (sum(index, na.rm=TRUE)!=0) { - genome_footprint = sum(as.numeric(fit$cncf[,"end"]-fit$cncf[,"start"]), na.rm=TRUE) - genome_altered = sum(as.numeric(fit$cncf[index,"end"]-fit$cncf[index,"start"]), na.rm=TRUE)/genome_footprint -} else { - genome_altered = 0 -} -cat(paste0(gsub("facets/cncf/","", gsub(".Rdata", "", opt$file_in)), "\t", genome_altered), file = opt$file_out, append=FALSE) -cat("\n", file = opt$file_out, append=TRUE) - 
-warnings() diff --git a/copy_number/genomealtered.mk b/copy_number/genomealtered.mk deleted file mode 100644 index 66402738..00000000 --- a/copy_number/genomealtered.mk +++ /dev/null @@ -1,18 +0,0 @@ -include modules/Makefile.inc - -LOGDIR ?= log/genome_altered.$(NOW) -PHONY += genome_stats - -genome_altered : $(foreach pair,$(SAMPLE_PAIRS),genome_stats/$(pair).fga) - -define fraction-genome-altered -genome_stats/$1_$2.fga : facets/cncf/$1_$2.Rdata - $$(call RUN,-n 1 -s 3G -m 6G,"$(RSCRIPT) modules/copy_number/genomealtered.R --file_in $$< --file_out genome_stats/$1_$2.fga") -endef -$(foreach pair,$(SAMPLE_PAIRS),\ - $(eval $(call fraction-genome-altered,$(tumor.$(pair)),$(normal.$(pair))))) - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: $(PHONY) - diff --git a/copy_number/lstscore.R b/copy_number/lstscore.R deleted file mode 100644 index 517eaf5d..00000000 --- a/copy_number/lstscore.R +++ /dev/null @@ -1,178 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list(make_option("--file_in", default = NA, type = 'character', help = "input file name"), - make_option("--file_out", default = NA, type = 'character', help = "output file name")) - -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -chromStrToNum <- function(str) { - suppressWarnings(cNum <- as.numeric(str)) - if (is.na(cNum) && str == "X" ) { - cNum <- 23 - } else if (is.na(cNum) && str == "Y") { - cNum <- 24 - } - return(invisible(cNum)) -} - -GetChrominfo <- function() { - f <- "modules/copy_number/hg19_chrominfo.txt" - chrom <- read.table(file=f) - chrom <- subset(chrom, grepl("^chr[0-9XY]{1,2}$", chrom[,1])) - f <- "modules/copy_number/hg19_gaps.txt" - gaps <- read.table(file=f) - centro <- subset(gaps, gaps[,8] == "centromere") - chrominfo <- 
merge(chrom[,1:2], centro[,2:4], by.x = 1, by.y = 1) - chrominfo$centromere <- rowMeans(chrominfo[,3:4]) - chrominfo <- chrominfo[,c(1,2,5,3,4)] - colnames(chrominfo) <- c("chr", "size", "centromere", "centstart", "centend") - chrominfo[,1] <- as.character(chrominfo[,1]) - chrominfo$chr <- sub("chr", "", chrominfo$chr) - chrominfo$chr <- sub("X", "23", chrominfo$chr) - chrominfo$chr <- sub("Y", "24", chrominfo$chr) - chrominfo[,1] <- as.numeric(chrominfo[,1]) - chrominfo <- chrominfo[order(chrominfo$chr), ] - rownames(chrominfo) <- as.character(chrominfo[,1]) - chrominfo <- as.matrix(chrominfo) - return(invisible(chrominfo)) -} - -fix_facets_column_names <- function(dat) { - colnames(dat)[which(colnames(dat)=="chrom")] <- "chromosome" - colnames(dat)[which(colnames(dat)=="loc.start")] <- "startBP" - colnames(dat)[which(colnames(dat)=="loc.end")] <- "endBP" - colnames(dat)[which(colnames(dat)=="lcn.em")] <- "nB" - sz <- dat[,"endBP"] - dat[,"startBP"] - dat <- cbind(dat, size=sz) - nA <- dat[,"tcn.em"] - dat[,"nB"] - dat <- cbind(dat, nA=nA) - return(invisible(dat)) -} - -join_adjacent_segments <- function(dat) { - cur_segs <- dat - something_changed <- 1 - while ( something_changed ) { - new_segs <- c() - something_changed <- 0 - x <- 2 - last_changed <- 0 - while (x <= nrow(cur_segs)) { - last_changed <- 0 - if ( (cur_segs[x-1,"nB"] == cur_segs[x,"nB"]) && - (cur_segs[x-1,"nA"] == cur_segs[x,"nA"]) && - (cur_segs[x-1,"chromosome"] == cur_segs[x,"chromosome"]) - ) { - t <- cur_segs[x-1,] - t["endBP"] <- cur_segs[x,"endBP"] - t["end"] <- cur_segs[x,"end"] - t["size"] <- t["endBP"] - t["startBP"] - something_changed <- 1 - new_segs <- rbind(t, new_segs) - x <- x+2 - last_changed <- 1 - } else { - new_segs <- rbind(cur_segs[x-1,], new_segs) - x<-x+1 - } - } - if (! 
last_changed ) { - new_segs <- rbind(cur_segs[x-1,],new_segs) - } - n <- nrow(new_segs) - new_segs <- new_segs[n:1,] - cur_segs <- new_segs - } - return(invisible(cur_segs)) -} - -fix_facet_segs <- function(dat) { - i <- which(is.na(dat$nB)) - if ( length(i) > 0 ) { - dat <- dat[-i, ] - } - dat <- join_adjacent_segments(dat) - return(invisible(dat)) -} - -chrom_arm_LST_score <- function(dat) { - score <- 0 - segs <- c() - SIZE_THRESH <- 10e6 - SPACE_THRESH <- 3e6 - if ( nrow(dat) >= 2 ) { - for (x in 2:nrow(dat)) { - if ( (dat[x-1,"size"] >= SIZE_THRESH) && - (dat[x,"size"] >= SIZE_THRESH) && - ( (dat[x,"startBP"] - dat[x-1,"endBP"]) <= SPACE_THRESH) - ) { - score <- score +1 - segs <- rbind(dat[x-1,], segs) - } - } - } - tmp <- list() - tmp$score <- score - tmp$segs <- segs - return(invisible(tmp)) -} - -lst_filter <- function(dat, size_thresh) { - i <- which(dat[,"size"] < size_thresh) - sz <- dat[i,"size"] - i <- i[order(sz)] - segs_removed <- 0 - while (length(i) > 0) { - dat <- dat[-i[1], ] - dat <- join_adjacent_segments(dat) - i<- which(dat[,"size"] < size_thresh) - sz <- dat[i,"size"] - i <- i[order(sz)] - segs_removed <- segs_removed + 1 - } - return(invisible(dat)) -} - -score_LST <- function(dat, chromInfo) { - score <- 0 - segs <- c() - dat <- lst_filter(dat, 3e6) - for (c in unique(dat[,"chromosome"]) ) { - i <- which(dat[,"chromosome"] == c) - csegs <- dat[i,] - cNum <- chromStrToNum(c) - i <- which(csegs[,"startBP"] <= chromInfo[cNum,"centstart"]) - parm <- csegs[i,] - tmp <- chrom_arm_LST_score(parm) - score <- score + tmp$score - segs <- rbind(tmp$segs, segs) - i <- which(csegs[,"endBP"] >= chromInfo[cNum,"centend"]) - qarm <- csegs[i,] - tmp <- chrom_arm_LST_score(qarm) - score <- score + tmp$score - segs <- rbind(tmp$segs, segs) - } - tmp <- list() - tmp$score <- score - tmp$segs <- segs - return(invisible(tmp)) -} - -dat = read.table(opt$file_in, sep="\t", header=TRUE, stringsAsFactor=FALSE) -dat = fix_facets_column_names(dat) -segs = 
fix_facet_segs(dat) -chromInfo = GetChrominfo() -lst = score_LST(segs, chromInfo) -cat(paste0(gsub("facets/cncf/","", gsub(".cncf.txt", "", opt$file_in)), "\t", lst$score), file = opt$file_out, append=FALSE) -cat("\n", file = opt$file_out, append=TRUE) - -warnings() - diff --git a/copy_number/lstscore.mk b/copy_number/lstscore.mk deleted file mode 100644 index b8664c7d..00000000 --- a/copy_number/lstscore.mk +++ /dev/null @@ -1,17 +0,0 @@ -include modules/Makefile.inc - -LOGDIR ?= log/lst_score.$(NOW) -PHONY += genome_stats - -lst_score : $(foreach pair,$(SAMPLE_PAIRS),genome_stats/$(pair).lst) - -define lst-score -genome_stats/$1_$2.lst : facets/cncf/$1_$2.txt - $$(call RUN,-n 1 -s 3G -m 6G,"$(RSCRIPT) modules/copy_number/lstscore.R --file_in $$< --file_out genome_stats/$1_$2.lst") -endef -$(foreach pair,$(SAMPLE_PAIRS),\ - $(eval $(call lst-score,$(tumor.$(pair)),$(normal.$(pair))))) - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: $(PHONY) diff --git a/copy_number/medicc2.mk b/copy_number/medicc2.mk new file mode 100644 index 00000000..faa0c457 --- /dev/null +++ b/copy_number/medicc2.mk @@ -0,0 +1,74 @@ +include modules/Makefile.inc + +LOGDIR ?= log/medicc2.$(NOW) + +medicc : $(foreach sample,$(TUMOR_SAMPLES),medicc2/$(sample)/$(sample).txt) \ + $(foreach set,$(SAMPLE_SETS),medicc2/$(set)/$(set).txt) \ + $(foreach set,$(SAMPLE_SETS),medicc2/$(set)/$(set).tsv) \ + $(foreach set,$(SAMPLE_SETS),medicc2/$(set)/$(set)_summary.tsv) + +define collect-copy-number +medicc2/$1/$1.txt : facets/cncf/$1_$2.Rdata + $$(call RUN,-c -n 1 -s 1G -m 2G -v $(MEDICC_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/medicc2.R \ + --option 1 \ + --tumor_sample_name $1 \ + --normal_sample_name $2 \ + --file_in $$(<) \ + --file_out $$(@)") + +endef +$(foreach pair,$(SAMPLE_PAIRS),\ + $(eval $(call collect-copy-number,$(tumor.$(pair)),$(normal.$(pair))))) + + +define aggregate-copy-number +medicc2/$1/$1.txt : $(foreach sample,$(TUMOR_SAMPLES),medicc2/$(sample)/$(sample).txt) + $$(call 
RUN,-c -n 1 -s 2G -m 4G -v $(MEDICC_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/medicc2.R \ + --option 2 \ + --tumor_sample_name '$(tumors.$1)' \ + --normal_sample_name '$(normal.$1)' \ + --file_out $$(@)") + +medicc2/$1/$1.tsv : medicc2/$1/$1.txt + $$(call RUN,-c -n 1 -s 2G -m 4G -v $(MEDICC_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/medicc2.R \ + --option 3 \ + --tumor_sample_name '$(tumors.$1)' \ + --normal_sample_name '$(normal.$1)' \ + --file_in $$(<) \ + --file_out $$(@)") + +endef +$(foreach set,$(SAMPLE_SETS),\ + $(eval $(call aggregate-copy-number,$(set)))) + + +define r-medicc2 +medicc2/$1/$1_summary.tsv : medicc2/$1/$1.tsv + $$(call RUN,-c -n 4 -s 2G -m 4G -v $(MEDICC_ENV),"set -o pipefail && \ + $$(MEDICC) \ + $$(<) \ + medicc2/$1/ \ + --input-type tsv \ + --normal-name diploid \ + --total-copy-numbers \ + --input-allele-columns 'nAB' \ + --plot both \ + --maxcn 8 \ + --bootstrap-method 'segment-wise' \ + --bootstrap-nr 100 \ + --n-cores 4") + +endef +$(foreach set,$(SAMPLE_SETS),\ + $(eval $(call r-medicc2,$(set)))) + + +..DUMMY := $(shell mkdir -p version; \ + $(MEDICC_ENV)/bin/R --version > version/medicc2.txt; \ + $(MEDICC_ENV)/bin/medicc2 --help >> version/medicc2.txt) +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY: medicc diff --git a/copy_number/myriadhrdscore.R b/copy_number/myriadhrdscore.R deleted file mode 100644 index 392fa195..00000000 --- a/copy_number/myriadhrdscore.R +++ /dev/null @@ -1,185 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) - - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list(make_option("--file_in", default = NA, type = 'character', help = "input file name"), - make_option("--file_out", default = NA, type = 'character', help = "output file name")) - -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options 
- -chromStrToNum <- function(str) { - suppressWarnings(cNum <- as.numeric(str)) - if (is.na(cNum) && str == "X" ) { - cNum <- 23 - } else if (is.na(cNum) && str == "Y") { - cNum <- 24 - } - return(invisible(cNum)) -} - -GetChrominfo <- function() { - f <- "modules/copy_number/hg19_chrominfo.txt" - chrom <- read.table(file=f) - chrom <- subset(chrom, grepl("^chr[0-9XY]{1,2}$", chrom[,1])) - f <- "modules/copy_number/hg19_gaps.txt" - gaps <- read.table(file=f) - centro <- subset(gaps, gaps[,8] == "centromere") - chrominfo <- merge(chrom[,1:2], centro[,2:4], by.x = 1, by.y = 1) - chrominfo$centromere <- rowMeans(chrominfo[,3:4]) - chrominfo <- chrominfo[,c(1,2,5,3,4)] - colnames(chrominfo) <- c("chr", "size", "centromere", "centstart", "centend") - chrominfo[,1] <- as.character(chrominfo[,1]) - chrominfo$chr <- sub("chr", "", chrominfo$chr) - chrominfo$chr <- sub("X", "23", chrominfo$chr) - chrominfo$chr <- sub("Y", "24", chrominfo$chr) - chrominfo[,1] <- as.numeric(chrominfo[,1]) - chrominfo <- chrominfo[order(chrominfo$chr), ] - rownames(chrominfo) <- as.character(chrominfo[,1]) - chrominfo <- as.matrix(chrominfo) - return(invisible(chrominfo)) -} - -fix_facets_column_names <- function(dat) { - colnames(dat)[which(colnames(dat)=="chrom")] <- "chromosome" - colnames(dat)[which(colnames(dat)=="loc.start")] <- "startBP" - colnames(dat)[which(colnames(dat)=="loc.end")] <- "endBP" - colnames(dat)[which(colnames(dat)=="lcn.em")] <- "nB" - sz <- dat[,"endBP"] - dat[,"startBP"] - dat <- cbind(dat, size=sz) - nA <- dat[,"tcn.em"] - dat[,"nB"] - dat <- cbind(dat, nA=nA) - return(invisible(dat)) -} - -join_adjacent_segments <- function(dat) { - cur_segs <- dat - something_changed <- 1 - while ( something_changed ) { - new_segs <- c() - something_changed <- 0 - x <- 2 - last_changed <- 0 - while (x <= nrow(cur_segs)) { - last_changed <- 0 - if ( (cur_segs[x-1,"nB"] == cur_segs[x,"nB"]) && - (cur_segs[x-1,"nA"] == cur_segs[x,"nA"]) && - (cur_segs[x-1,"chromosome"] == 
cur_segs[x,"chromosome"]) - ) { - t <- cur_segs[x-1,] - t["endBP"] <- cur_segs[x,"endBP"] - t["end"] <- cur_segs[x,"end"] - t["size"] <- t["endBP"] - t["startBP"] - something_changed <- 1 - new_segs <- rbind(t, new_segs) - x <- x+2 - last_changed <- 1 - } else { - new_segs <- rbind(cur_segs[x-1,], new_segs) - x<-x+1 - } - } - if (! last_changed ) { - new_segs <- rbind(cur_segs[x-1,],new_segs) - } - n <- nrow(new_segs) - new_segs <- new_segs[n:1,] - cur_segs <- new_segs - } - return(invisible(cur_segs)) -} - -fix_facet_segs <- function(dat) { - i <- which(is.na(dat$nB)) - if ( length(i) > 0 ) { - dat <- dat[-i, ] - } - dat <- join_adjacent_segments(dat) - return(invisible(dat)) -} - -chrom_arm_LST_score <- function(dat) { - score <- 0 - segs <- c() - SIZE_THRESH <- 10e6 - SPACE_THRESH <- 3e6 - if ( nrow(dat) >= 2 ) { - for (x in 2:nrow(dat)) { - if ( (dat[x-1,"size"] >= SIZE_THRESH) && - (dat[x,"size"] >= SIZE_THRESH) && - ( (dat[x,"startBP"] - dat[x-1,"endBP"]) <= SPACE_THRESH) - ) { - score <- score +1 - segs <- rbind(dat[x-1,], segs) - } - } - } - tmp <- list() - tmp$score <- score - tmp$segs <- segs - return(invisible(tmp)) -} - -lst_filter <- function(dat, size_thresh) { - i <- which(dat[,"size"] < size_thresh) - sz <- dat[i,"size"] - i <- i[order(sz)] - segs_removed <- 0 - while (length(i) > 0) { - dat <- dat[-i[1], ] - dat <- join_adjacent_segments(dat) - i<- which(dat[,"size"] < size_thresh) - sz <- dat[i,"size"] - i <- i[order(sz)] - segs_removed <- segs_removed + 1 - } - return(invisible(dat)) -} - -score_myriad_HRD <- function(dat, thresh=15e6) { - chrDel <- NULL - hrdSegs <- NULL - hrd_score <- 0 - chrList <- unique(dat[,"chromosome"]) - for (x in chrList) { - index <- which(dat[,"chromosome"] == x) - totalnB <- sum(dat[index,"nB"], na.rm=TRUE) - if (totalnB == 0) { - chrDel <- c(x, chrDel) - } - } - for (x in 1:nrow(dat)) { - if ( dat[x,"chromosome"] %in% chrDel ) { - next - } - if ( dat[x,"nB"] != 0 ) { - next - } - if (dat[x,"size"] < thresh) { - next 
- } - hrd_score <- hrd_score + 1 - hrdSegs <- rbind(dat[x,], hrdSegs) - } - tmp <- list() - tmp$score = hrd_score - tmp$segs = hrdSegs - return(invisible(tmp)) -} - - -dat = read.table(opt$file_in, sep="\t", header=TRUE, stringsAsFactor=FALSE) -dat = fix_facets_column_names(dat) -segs = fix_facet_segs(dat) -chromInfo = GetChrominfo() -mrs = score_myriad_HRD(segs) -cat(paste0(gsub("facets/cncf/","", gsub(".cncf.txt", "", opt$file_in)), "\t", mrs$score), file = opt$file_out, append=FALSE) -cat("\n", file = opt$file_out, append=TRUE) - -warnings() diff --git a/copy_number/myriadhrdscore.mk b/copy_number/myriadhrdscore.mk deleted file mode 100644 index 8d619938..00000000 --- a/copy_number/myriadhrdscore.mk +++ /dev/null @@ -1,17 +0,0 @@ -include modules/Makefile.inc - -LOGDIR ?= log/myriad_score.$(NOW) -PHONY += genome_stats - -myriad_score : $(foreach pair,$(SAMPLE_PAIRS),genome_stats/$(pair).mrs) - -define myriad-score -genome_stats/$1_$2.mrs : facets/cncf/$1_$2.txt - $$(call RUN,-n 1 -s 3G -m 6G,"$(RSCRIPT) modules/copy_number/myriadhrdscore.R --file_in $$< --file_out genome_stats/$1_$2.mrs") -endef -$(foreach pair,$(SAMPLE_PAIRS),\ - $(eval $(call myriad-score,$(tumor.$(pair)),$(normal.$(pair))))) - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: $(PHONY) diff --git a/copy_number/ntaiscore.R b/copy_number/ntaiscore.R deleted file mode 100644 index bb35c010..00000000 --- a/copy_number/ntaiscore.R +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list(make_option("--file_in", default = NA, type = 'character', help = "input file name"), - make_option("--file_out", default = NA, type = 'character', help = "output file name")) - -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -chromStrToNum <- 
function(str) { - suppressWarnings(cNum <- as.numeric(str)) - if (is.na(cNum) && str == "X" ) { - cNum <- 23 - } else if (is.na(cNum) && str == "Y") { - cNum <- 24 - } - return(invisible(cNum)) -} - -GetChrominfo <- function() { - f <- "modules/copy_number/hg19_chrominfo.txt" - chrom <- read.table(file=f) - chrom <- subset(chrom, grepl("^chr[0-9XY]{1,2}$", chrom[,1])) - f <- "modules/copy_number/hg19_gaps.txt" - gaps <- read.table(file=f) - centro <- subset(gaps, gaps[,8] == "centromere") - chrominfo <- merge(chrom[,1:2], centro[,2:4], by.x = 1, by.y = 1) - chrominfo$centromere <- rowMeans(chrominfo[,3:4]) - chrominfo <- chrominfo[,c(1,2,5,3,4)] - colnames(chrominfo) <- c("chr", "size", "centromere", "centstart", "centend") - chrominfo[,1] <- as.character(chrominfo[,1]) - chrominfo$chr <- sub("chr", "", chrominfo$chr) - chrominfo$chr <- sub("X", "23", chrominfo$chr) - chrominfo$chr <- sub("Y", "24", chrominfo$chr) - chrominfo[,1] <- as.numeric(chrominfo[,1]) - chrominfo <- chrominfo[order(chrominfo$chr), ] - rownames(chrominfo) <- as.character(chrominfo[,1]) - chrominfo <- as.matrix(chrominfo) - return(invisible(chrominfo)) -} - -fix_facets_column_names <- function(dat) { - colnames(dat)[which(colnames(dat)=="chrom")] <- "chromosome" - colnames(dat)[which(colnames(dat)=="loc.start")] <- "startBP" - colnames(dat)[which(colnames(dat)=="loc.end")] <- "endBP" - colnames(dat)[which(colnames(dat)=="lcn.em")] <- "nB" - sz <- dat[,"endBP"] - dat[,"startBP"] - dat <- cbind(dat, size=sz) - nA <- dat[,"tcn.em"] - dat[,"nB"] - dat <- cbind(dat, nA=nA) - return(invisible(dat)) -} - -join_adjacent_segments <- function(dat) { - cur_segs <- dat - something_changed <- 1 - while ( something_changed ) { - new_segs <- c() - something_changed <- 0 - x <- 2 - last_changed <- 0 - while (x <= nrow(cur_segs)) { - last_changed <- 0 - if ( (cur_segs[x-1,"nB"] == cur_segs[x,"nB"]) && - (cur_segs[x-1,"nA"] == cur_segs[x,"nA"]) && - (cur_segs[x-1,"chromosome"] == cur_segs[x,"chromosome"]) - ) { 
- t <- cur_segs[x-1,] - t["endBP"] <- cur_segs[x,"endBP"] - t["end"] <- cur_segs[x,"end"] - t["size"] <- t["endBP"] - t["startBP"] - something_changed <- 1 - new_segs <- rbind(t, new_segs) - x <- x+2 - last_changed <- 1 - } else { - new_segs <- rbind(cur_segs[x-1,], new_segs) - x<-x+1 - } - } - if (! last_changed ) { - new_segs <- rbind(cur_segs[x-1,],new_segs) - } - n <- nrow(new_segs) - new_segs <- new_segs[n:1,] - cur_segs <- new_segs - } - return(invisible(cur_segs)) -} - -fix_facet_segs <- function(dat) { - i <- which(is.na(dat$nB)) - if ( length(i) > 0 ) { - dat <- dat[-i, ] - } - dat <- join_adjacent_segments(dat) - return(invisible(dat)) -} - -score_ntAI <- function(dat, chromInfo, min_size=1000, shrink=FALSE) { - index <- dat[,"chromosome"] %in% c("MT", "Y", "24") - dat <- dat[!index,] - index <- dat[,"size"] < min_size - dat <- dat[!index,] - if (shrink) { - dat <- join_adjacent_segments(dat) - } - chrList <- unique(dat[,"chromosome"]) - ntAI_score <- 0 - ntAI_segs <- NULL - for (x in chrList) { - index <- dat[,"chromosome"] == x - chr_segs <- dat[index,] - cNum <- chromStrToNum(x) - if (nrow(chr_segs) < 2 ) { - next - } - if ( (chr_segs[1,"nA"] != chr_segs[1,"nB"]) && (chromInfo[cNum,"centstart"] > chr_segs[1,"endBP"]) ) { - ntAI_score <- ntAI_score+1 - ntAI_segs <- rbind(chr_segs[1,],ntAI_segs) - } - eSeg <- nrow(chr_segs) - if ( (chr_segs[eSeg, "nA"] != chr_segs[eSeg, "nB"]) && (chr_segs[eSeg,"startBP"] > chromInfo[cNum,"centend"]) ) { - ntAI_score <- ntAI_score+1 - ntAI_segs <- rbind(chr_segs[eSeg,],ntAI_segs) - } - } - tmp <- list() - tmp$segs <- ntAI_segs - tmp$score <- ntAI_score - return(invisible(tmp)) -} - -dat = read.table(opt$file_in, sep="\t", header=TRUE, stringsAsFactor=FALSE) -dat = fix_facets_column_names(dat) -segs = fix_facet_segs(dat) -chromInfo = GetChrominfo() -ntai = score_ntAI(segs, chromInfo) -cat(paste0(gsub("facets/cncf/","", gsub(".cncf.txt", "", opt$file_in)), "\t", ntai$score), file = opt$file_out, append=FALSE) -cat("\n", 
file = opt$file_out, append=TRUE) - -warnings() diff --git a/copy_number/ntaiscore.mk b/copy_number/ntaiscore.mk deleted file mode 100644 index 2f8d751a..00000000 --- a/copy_number/ntaiscore.mk +++ /dev/null @@ -1,17 +0,0 @@ -include modules/Makefile.inc - -LOGDIR ?= log/ntai_score.$(NOW) -PHONY += genome_stats - -ntai_score : $(foreach pair,$(SAMPLE_PAIRS),genome_stats/$(pair).ntai) - -define ntai-score -genome_stats/$1_$2.ntai : facets/cncf/$1_$2.txt - $$(call RUN,-n 1 -s 3G -m 6G,"$(RSCRIPT) modules/copy_number/ntaiscore.R --file_in $$< --file_out genome_stats/$1_$2.ntai") -endef -$(foreach pair,$(SAMPLE_PAIRS),\ - $(eval $(call ntai-score,$(tumor.$(pair)),$(normal.$(pair))))) - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: $(PHONY) diff --git a/copy_number/plotFacets.R b/copy_number/plotFacets.R index 5b3c848a..a21d116e 100644 --- a/copy_number/plotFacets.R +++ b/copy_number/plotFacets.R @@ -26,6 +26,8 @@ parser <- OptionParser(usage = "%prog [options] [facets Rdata file]", option_lis arguments <- parse_args(parser, positional_arguments = T) opt <- arguments$options +OLD_STYLE = TRUE + if (length(arguments$args) < 1) { cat("Need facets Rdata file\n") print_help(parser) @@ -66,7 +68,11 @@ normalName <- facetsFile %>% sub('\\..*', '', .) 
pdf(file = str_c(opt$outPrefix, ".pdf"), width=10, height=4.25) -plot_log2_(x=out2, y=fit, purity=fit$purity, ploidy=fit$ploidy, title = gsub("facets/plots/log2/", "", opt$outPrefix, fixed=TRUE)) +if (OLD_STYLE) { + plot_sample_lrr_(x=out2, fit=fit) +} else { + plot_log2_(x=out2, y=fit, purity=fit$purity, ploidy=fit$ploidy, title = gsub("facets/plots/log2/", "", opt$outPrefix, fixed=TRUE)) +} dev.off() pdf(file = str_c(gsub("log2", "cncf", opt$outPrefix, fixed=TRUE), ".pdf"), width=10, height=7) diff --git a/default_yaml/project_config.yaml b/default_yaml/project_config.yaml index be2b012e..3be1e9e6 100644 --- a/default_yaml/project_config.yaml +++ b/default_yaml/project_config.yaml @@ -32,13 +32,6 @@ ann_pathogen: true # target panels targets_file: ~/share/reference/target_panels/ -# cnvkit default target panels -# ontarget_file: ~/share/reference/target_panels/ -# offtarget_file: ~/share/reference/target_panels/ - -# whole exome sequencing -# exome: false - # gatk options gatk_hard_filter_snps: true gatk_pool_snp_recal: false diff --git a/qc/bamIntervalMetrics.mk b/qc/bamIntervalMetrics.mk deleted file mode 100644 index 88930e8a..00000000 --- a/qc/bamIntervalMetrics.mk +++ /dev/null @@ -1,102 +0,0 @@ -# generate bam interval metrics per sample - -#NO_RM := true - -include modules/Makefile.inc -include modules/variant_callers/gatk.inc -# picard format intervals file, needs requires sam format header - -VPATH ?= bam - -LOGDIR ?= log/metrics.$(NOW) - -PLOT_HS_METRICS = $(RSCRIPT) modules/qc/plotHsMetrics.R -NON_REF_FREQ = $(PERL) modules/qc/nonRefFreqFromPileup.pl -NON_REF_FREQ_BIN_SIZE = 0.01 - -SUMMARIZE_HS_METRICS = python modules/qc/summarize_hs_metrics.py -SUMMARIZE_IDXSTATS = python modules/qc/summarize_idxstats.py - -.DELETE_ON_ERROR: - -.SECONDARY: - -.PHONY: bam_interval_metrics hs_metrics amplicon_metrics interval_report #non_ref_metrics insert_size_metrics idxstats - -bam_interval_metrics : hs_metrics interval_report #non_ref_metrics idxstats - 
-#non_ref_metrics : $(foreach sample,$(SAMPLES),metrics/$(sample).interval_nonref_freq.tsv) - -hs_metrics : metrics/hs_metrics.tsv metrics/interval_hs_metrics.tsv metrics/hs_metrics.summary.tsv - -amplicon_metrics : $(foreach sample,$(SAMPLES),metrics/$(sample).amplicon_metrics.tsv) - -interval_report : metrics/interval_report/interval_report.timestamp - -#insert_size_metrics : $(foreach sample,$(SAMPLES),metrics/$(sample).insert_size_metrics.tsv) - -#idxstats : metrics/idxstats_summary.tsv $(foreach sample,$(SAMPLES),metrics/$(sample).idxstats) - -# interval metrics per sample -metrics/%.hs_metrics.tsv metrics/%.interval_hs_metrics.tsv : bam/%.bam bam/%.bam.bai - $(call RUN,-s 10G -m 20G,"TMP=`mktemp`.intervals; \ - $(SAMTOOLS) view -H $< | grep '^@SQ' > \$$TMP && grep -P \"\t\" $(TARGETS_FILE) | awk 'BEGIN {OFS = \"\t\"} { print \$$1$(,)\$$2+1$(,)\$$3$(,)\"+\"$(,)NR }' >> \$$TMP; \ - $(CALC_HS_METRICS) INPUT=$< OUTPUT=metrics/$*.hs_metrics.tsv METRIC_ACCUMULATION_LEVEL=ALL_READS REFERENCE_SEQUENCE=$(REF_FASTA) PER_TARGET_COVERAGE=metrics/$*.interval_hs_metrics.tsv TARGET_INTERVALS=\$$TMP BAIT_SET_NAME=hs BAIT_INTERVALS=\$$TMP") - -# not sure how this differs from above, see picard doc -metrics/%.amplicon_metrics.tsv metrics/%.interval_amplicon_metrics.tsv : bam/%.bam bam/%.bam.bai - $(call RUN,-s 10G -m 20G,"TMP=`mktemp`.intervals; \ - $(SAMTOOLS) view -H $< | grep '^@SQ' > \$$TMP && grep -P \"\t\" $(TARGETS_FILE) | awk 'BEGIN {OFS = \"\t\"} { print \$$1$(,)\$$2+1$(,)\$$3$(,)\"+\"$(,)NR }' >> \$$TMP; \ - $(COLLECT_TARGETED_METRICS) INPUT=$< REFERENCE_SEQUENCE=$(REF_FASTA) OUTPUT=$@ AMPLICON_INTERVALS=\$$TMP TARGET_INTERVALS=\$$TMP METRIC_ACCUMULATION_LEVEL=ALL_READS PER_TARGET_COVERAGE=metrics/$*.interval_amplicon_metrics.tsv") - -# summarize interval metrics into one file -metrics/interval_hs_metrics.tsv : $(foreach sample,$(SAMPLES),metrics/$(sample).interval_hs_metrics.tsv) - $(INIT) \ - sed '/^#/d; /^$$/d' $< | cut -f 1-6 > $@.tmp; \ - for metrics in $^; do \ 
- samplename=$$(basename $${metrics%%.interval_hs_metrics.tsv}); \ - sed '/^#/d; /^$$/d' $$metrics | cut -f 7,8 | sed "s/mean_coverage/$${samplename}_mean_coverage/; s/normalized_coverage/$${samplename}_normalized_coverage/" | paste $@.tmp - > $@; \ - cp $@ $@.tmp; \ - done; \ - rm -f $@.tmp - -metrics/hs_metrics.summary.tsv : $(foreach sample,$(SAMPLES),metrics/$(sample).hs_metrics.tsv) - $(INIT) $(SUMMARIZE_HS_METRICS) --excel_file $(@:.tsv=.xlsx) --project_name $(PROJECT_NAME) $^ > $@ 2> $(LOG) - -metrics/hs_metrics.tsv : $(foreach sample,$(SAMPLES),metrics/$(sample).hs_metrics.tsv) - $(INIT) \ - { \ - sed '/^$$/d; /^#/d; s/SAMPLE.*//; s/BAIT_SET/SAMPLE/; s/\s$$//' $< | head -1; \ - for metrics in $^; do \ - samplename=$$(basename $${metrics%%.hs_metrics.tsv}); \ - sed "/^#/d; /^BAIT/d; /^\$$/d; s/^hs/$$samplename/; s/\t\+$$//" $$metrics; \ - done; \ - } > $@ - -metrics/interval_report/interval_report.timestamp : metrics/hs_metrics.tsv - $(call RUN,-s 7G -m 10G,"$(PLOT_HS_METRICS) --outDir $(@D) $< && touch $@") - -#metrics/%.interval_nonref_freq.tsv : bam/%.bam -# $(call RUN,-s 8G -m 10G,"$(SAMTOOLS) mpileup -l $(TARGETS_FILE) -f $(REF_FASTA) $< | $(NON_REF_FREQ) -b $(NON_REF_FREQ_BIN_SIZE) > $@") - -#metrics/%.insert_size_metrics.tsv : bam/%.bam -# $(call RUN,-s 8G -m 10G,"$(call PICARD,CollectInsertSizeMetrics,8G) INPUT=$< OUTPUT=$@ \ -# REFERENCE_SEQUENCE=$(REF_FASTA) HISTOGRAM_FILE=$(@:.tsv=.pdf)") - -#metrics/insert_size_metrics.tsv : $(foreach sample,$(SAMPLES),metrics/$(sample).insert_size_metrics.tsv) -# $(INIT) \ -# { \ -# sed '/^$$/d; /^#/d; s/SAMPLE.*//; s/\s$$//; s/^/SAMPLE\t/' $< | head -1; \ -# for metrics in $^; do \ -# samplename=$$(basename $${metrics%%.insert_size_metrics.tsv}); \ -# grep -A1 '^MEDIAN_INSERT_SIZE' $$metrics | sed "1d; s/^/$$samplename\t/; s/\t\+$$//"; \ -# done; \ -# } > $@ - -#metrics/%.idxstats : bam/%.bam bam/%.bam.bai -# $(call RUN,,"samtools idxstats $< > $@") - -#metrics/idxstats_summary.tsv : $(foreach 
sample,$(SAMPLES),metrics/$(sample).idxstats) -# $(INIT) $(SUMMARIZE_IDXSTATS) --excel_file $(@:.tsv=.xlsx) --project_name $(PROJECT_NAME) --targets_file $(TARGETS_FILE) $^ > $@ 2> $(LOG) - -include modules/bam_tools/processBam.mk diff --git a/qc/bamMetrics.mk b/qc/bamMetrics.mk deleted file mode 100644 index be2f4fa3..00000000 --- a/qc/bamMetrics.mk +++ /dev/null @@ -1,50 +0,0 @@ -include modules/Makefile.inc -include modules/variant_callers/gatk.inc - -LOGDIR ?= log/bam_metrics.$(NOW) -PHONY += metrics - -COLLECT_METRICS = $(JAVA) -Xmx12G -jar $(PICARD_DIR)/CollectMultipleMetrics.jar VALIDATION_STRINGENCY=LENIENT -COLLECT_WGS_METRICS = $(JAVA) -Xmx12G -jar $(PICARD_JAR) CollectWgsMetrics VALIDATION_STRINGENCY=LENIENT -COLLECT_GC_METRICS = $(JAVA) -Xmx12G -jar $(PICARD_DIR)/CollectGcBiasMetrics.jar VALIDATION_STRINGENCY=LENIENT - -SUMMARIZE_IDXSTATS = python modules/qc/summarize_idxstats.py - -bam_metrics : summary_metrics gc flagstats wgs_metrics - -PHONY += flagstats -flagstats : $(foreach sample,$(SAMPLES),metrics/$(sample).flagstats) -PHONY += summary_metrics -summary_metrics : $(foreach sample,$(SAMPLES),metrics/$(sample).alignment_summary_metrics) -PHONY += wgs_metrics -wgs_metrics : $(foreach sample,$(SAMPLES),metrics/$(sample).wgs_metrics) metrics/wgs_metrics_summary.tsv -PHONY += dup -dup : $(foreach sample,$(SAMPLES),metrics/$(sample).dup_metrics) -PHONY += gc -gc : $(foreach sample,$(SAMPLES),metrics/$(sample).gc_bias_metrics) - -metrics/%.alignment_summary_metrics : bam/%.bam - $(call RUN,-s 18G -m 24G -w 7200,"$(COLLECT_METRICS) I=$< O=metrics/$(*).alignment_summary_metrics REFERENCE_SEQUENCE=$(REF_FASTA)") - -metrics/wgs_metrics_summary.tsv : $(foreach sample,$(SAMPLES),metrics/$(sample).wgs_metrics) - $(INIT) (grep GENOME_TERRITORY $< | sed 's/^/SAMPLE\t/'; for x in $(SAMPLES); do grep -A1 GENOME_TERRITORY metrics/$$x.wgs_metrics | sed 1d | sed "s/^/$$x\t/" ; done) > $@ - -metrics/%.wgs_metrics : bam/%.bam - $(call RUN,-s 18G -m 24G -w 
7200,"$(COLLECT_WGS_METRICS) I=$< O=$@ REFERENCE_SEQUENCE=$(REF_FASTA)") - -metrics/%.gc_bias_metrics : bam/%.bam - $(call RUN,-s 18G -m 24G -w 7200,"$(COLLECT_GC_METRICS) I=$< O=$@ CHART_OUTPUT=$(addsuffix .pdf,$@) REFERENCE_SEQUENCE=$(REF_FASTA)") - -metrics/%.flagstats : bam/%.bam - $(call RUN,-s 18G -m 24G -w 7200,"$(SAMTOOLS) flagstat $< > $@") - -bam/%.markdup.bam metrics/%.dup_metrics : bam/%.bam - $(call RUN,-s 18G -m 24G -w 7200,"$(MARK_DUP) I=$< O=bam/$*.markdup.bam METRICS_FILE=metrics/$*.dup_metrics") - -metrics/dup_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).dup_metrics.txt) - $(INIT) grep '^LIBRARY' $< > $@ && \ - for metrics in $^; do \ - grep -A1 '^LIBRARY' $$metrics | sed '1d' >> $@; \ - done - -.PHONY: $(PHONY) diff --git a/qc/bam_interval_metrics.mk b/qc/bam_interval_metrics.mk new file mode 100644 index 00000000..b7d049ec --- /dev/null +++ b/qc/bam_interval_metrics.mk @@ -0,0 +1,147 @@ +include modules/Makefile.inc + +LOGDIR ?= log/bam_interval_metrics.$(NOW) + +bam_metrics : $(foreach sample,$(SAMPLES),metrics/$(sample).idx_stats.txt) \ + $(foreach sample,$(SAMPLES),metrics/$(sample).aln_metrics.txt) \ + $(foreach sample,$(SAMPLES),metrics/$(sample).insert_metrics.txt) \ + $(foreach sample,$(SAMPLES),metrics/$(sample).oxog_metrics.txt) \ + $(foreach sample,$(SAMPLES),metrics/$(sample).hs_metrics.txt) \ + $(foreach sample,$(SAMPLES),metrics/$(sample).gc_metrics.txt) \ + summary/idx_metrics.txt \ + summary/aln_metrics.txt \ + summary/insert_metrics.txt \ + summary/oxog_metrics.txt \ + summary/hs_metrics.txt \ + summary/gc_metrics.txt \ + summary/gc_summary.txt + +PICARD = picard +PICARD_MEM = 16G +PICARD_OPTS = VALIDATION_STRINGENCY=LENIENT MAX_RECORDS_IN_RAM=4000000 TMP_DIR=$(TMPDIR) +CALC_HS_METRICS = $(PICARD) -Xmx$(PICARD_MEM) CollectHsMetrics $(PICARD_OPTS) +COLLECT_ALIGNMENT_METRICS = $(PICARD) -Xmx$(PICARD_MEM) CollectAlignmentSummaryMetrics $(PICARD_OPTS) +COLLECT_INSERT_METRICS = $(PICARD) -Xmx$(PICARD_MEM) 
CollectInsertSizeMetrics $(PICARD_OPTS) +COLLECT_OXOG_METRICS = $(PICARD) -Xmx$(PICARD_MEM) CollectOxoGMetrics $(PICARD_OPTS) +COLLECT_GC_BIAS = $(PICARD) -Xmx$(PICARD_MEM) CollectGcBiasMetrics $(PICARD_OPTS) +BAM_INDEX = $(PICARD) -Xmx$(PICARD_MEM) BamIndexStats $(PICARD_OPTS) + +BAITS_LIST = $(HOME)/share/lib/bed_files/targets/IMPACT505/b37/IMPACT505_b37_baits.list +TARGETS_LIST ?= $(HOME)/share/lib/bed_files/targets/IMPACT505/b37/IMPACT505_b37_targets.list + +define idx-metrics +metrics/$1.idx_stats.txt : bam/$1.bam + $$(call RUN, -c -n 1 -s 12G -m 24G -v $(INNOVATION_ENV),"set -o pipefail && \ + $$(BAM_INDEX) \ + INPUT=$$(<) \ + > $$(@)") + +endef +$(foreach sample,$(SAMPLES),\ + $(eval $(call idx-metrics,$(sample)))) + +define aln-metrics +metrics/$1.aln_metrics.txt : bam/$1.bam + $$(call RUN, -c -n 1 -s 12G -m 24G -v $(INNOVATION_ENV),"set -o pipefail && \ + $$(COLLECT_ALIGNMENT_METRICS) \ + REFERENCE_SEQUENCE=$$(REF_FASTA) \ + INPUT=$$(<) \ + OUTPUT=$$(@)") + +endef +$(foreach sample,$(SAMPLES),\ + $(eval $(call aln-metrics,$(sample)))) + +define insert-metrics +metrics/$1.insert_metrics.txt : bam/$1.bam + $$(call RUN, -c -n 1 -s 12G -m 24G -v $(INNOVATION_ENV),"set -o pipefail && \ + $$(COLLECT_INSERT_METRICS) \ + INPUT=$$(<) \ + OUTPUT=$$(@) \ + HISTOGRAM_FILE=metrics/$1.insert_metrics.pdf \ + MINIMUM_PCT=0.5") + +endef +$(foreach sample,$(SAMPLES),\ + $(eval $(call insert-metrics,$(sample)))) + +define oxog-metrics +metrics/$1.oxog_metrics.txt : bam/$1.bam + $$(call RUN, -c -n 1 -s 12G -m 24G -v $(INNOVATION_ENV),"set -o pipefail && \ + $$(COLLECT_OXOG_METRICS) \ + REFERENCE_SEQUENCE=$$(REF_FASTA) \ + INPUT=$$(<) \ + OUTPUT=$$(@)") + +endef +$(foreach sample,$(SAMPLES),\ + $(eval $(call oxog-metrics,$(sample)))) + +define hs-metrics +metrics/$1.hs_metrics.txt : bam/$1.bam + $$(call RUN, -c -n 1 -s 12G -m 24G -v $(INNOVATION_ENV),"set -o pipefail && \ + $$(CALC_HS_METRICS) \ + REFERENCE_SEQUENCE=$$(REF_FASTA) \ + INPUT=$$(<) \ + OUTPUT=$$(@) \ + 
BAIT_INTERVALS=$$(BAITS_LIST) \ + TARGET_INTERVALS=$$(TARGETS_LIST)") + +endef +$(foreach sample,$(SAMPLES),\ + $(eval $(call hs-metrics,$(sample)))) + +define gc-metrics +metrics/$1.gc_metrics.txt : bam/$1.bam + $$(call RUN, -c -n 1 -s 12G -m 24G -v $(INNOVATION_ENV),"set -o pipefail && \ + $$(COLLECT_GC_BIAS) \ + INPUT=$$(<) \ + OUTPUT=metrics/$1.gc_bias.txt \ + CHART_OUTPUT=metrics/$1.gc_metrics.pdf \ + REFERENCE_SEQUENCE=$$(REF_FASTA) \ + SUMMARY_OUTPUT=$$(@)") + +endef +$(foreach sample,$(SAMPLES),\ + $(eval $(call gc-metrics,$(sample)))) + +summary/idx_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).idx_stats.txt) + $(call RUN, -c -n 1 -s 8G -m 12G -v $(INNOVATION_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/bam_metrics.R --option 1 --sample_names '$(SAMPLES)'") + +summary/aln_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).aln_metrics.txt) + $(call RUN, -c -n 1 -s 8G -m 12G -v $(INNOVATION_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/bam_metrics.R --option 2 --sample_names '$(SAMPLES)'") + +summary/insert_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).insert_metrics.txt) + $(call RUN, -c -n 1 -s 8G -m 12G -v $(INNOVATION_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/bam_metrics.R --option 3 --sample_names '$(SAMPLES)'") + +summary/oxog_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).oxog_metrics.txt) + $(call RUN, -c -n 1 -s 8G -m 12G -v $(INNOVATION_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/bam_metrics.R --option 4 --sample_names '$(SAMPLES)'") + +summary/hs_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).hs_metrics.txt) + $(call RUN, -c -n 1 -s 8G -m 12G -v $(INNOVATION_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/bam_metrics.R --option 5 --sample_names '$(SAMPLES)'") + +summary/gc_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).gc_metrics.txt) + $(call RUN, -c -n 1 -s 8G -m 12G -v $(INNOVATION_ENV),"set -o pipefail && \ + $(RSCRIPT) 
$(SCRIPTS_DIR)/bam_metrics.R --option 6 --sample_names '$(SAMPLES)'") + +summary/gc_summary.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).gc_metrics.txt) + $(call RUN, -c -n 1 -s 12G -m 24G -v $(INNOVATION_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/bam_metrics.R --option 7 --sample_names '$(SAMPLES)'") + + +..DUMMY := $(shell mkdir -p version; \ + echo "picard" >> version/bam_interval_metrics.txt; \ + $(PICARD) CollectAlignmentSummaryMetrics --version &>> version/bam_interval_metrics.txt; \ + $(PICARD) CollectInsertSizeMetrics --version &>> version/bam_interval_metrics.txt; \ + $(PICARD) CollectOxoGMetrics --version &>> version/bam_interval_metrics.txt; \ + $(PICARD) CollectHsMetrics --version &>> version/bam_interval_metrics.txt; \ + $(PICARD) CollectGcBiasMetrics --version &>> version/bam_interval_metrics.txt; \ + R --version >> version/bam_interval_metrics.txt) +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY: bam_metrics diff --git a/qc/bam_metrics.mk b/qc/bam_metrics.mk new file mode 100644 index 00000000..00377c43 --- /dev/null +++ b/qc/bam_metrics.mk @@ -0,0 +1,136 @@ +include modules/Makefile.inc + +LOGDIR ?= log/bam_metrics.$(NOW) + +bam_metrics : $(foreach sample,$(SAMPLES),metrics/$(sample).idx_stats.txt) \ + $(foreach sample,$(SAMPLES),metrics/$(sample).aln_metrics.txt) \ + $(foreach sample,$(SAMPLES),metrics/$(sample).insert_metrics.txt) \ + $(foreach sample,$(SAMPLES),metrics/$(sample).oxog_metrics.txt) \ + $(foreach sample,$(SAMPLES),metrics/$(sample).hs_metrics.txt) \ + $(foreach sample,$(SAMPLES),metrics/$(sample).gc_metrics.txt) \ + summary/idx_metrics.txt \ + summary/aln_metrics.txt \ + summary/insert_metrics.txt \ + summary/oxog_metrics.txt \ + summary/hs_metrics.txt \ + summary/gc_metrics.txt \ + summary/gc_summary.txt + +TARGETS_LIST ?= $(HOME)/share/lib/resource_files/MSK-IMPACT-v4.sorted.list + +define idx-metrics +metrics/$1.idx_stats.txt : bam/$1.bam + $$(call RUN, -c -n 1 -s 12G -m 24G,"set -o pipefail && \ + $$(BAM_INDEX) \ + 
INPUT=$$(<) \ + > $$(@)") + +endef +$(foreach sample,$(SAMPLES),\ + $(eval $(call idx-metrics,$(sample)))) + +define aln-metrics +metrics/$1.aln_metrics.txt : bam/$1.bam + $$(call RUN, -c -n 1 -s 12G -m 24G,"set -o pipefail && \ + $$(COLLECT_ALIGNMENT_METRICS) \ + REFERENCE_SEQUENCE=$$(REF_FASTA) \ + INPUT=$$(<) \ + OUTPUT=$$(@)") + +endef +$(foreach sample,$(SAMPLES),\ + $(eval $(call aln-metrics,$(sample)))) + +define insert-metrics +metrics/$1.insert_metrics.txt : bam/$1.bam + $$(call RUN, -c -n 1 -s 12G -m 24G,"set -o pipefail && \ + $$(COLLECT_INSERT_METRICS) \ + INPUT=$$(<) \ + OUTPUT=$$(@) \ + HISTOGRAM_FILE=metrics/$1.insert_metrics.pdf \ + MINIMUM_PCT=0.5") + +endef +$(foreach sample,$(SAMPLES),\ + $(eval $(call insert-metrics,$(sample)))) + +define oxog-metrics +metrics/$1.oxog_metrics.txt : bam/$1.bam + $$(call RUN, -c -n 1 -s 12G -m 24G,"set -o pipefail && \ + $$(COLLECT_OXOG_METRICS) \ + REFERENCE_SEQUENCE=$$(REF_FASTA) \ + INPUT=$$(<) \ + OUTPUT=$$(@)") + +endef +$(foreach sample,$(SAMPLES),\ + $(eval $(call oxog-metrics,$(sample)))) + +define hs-metrics +metrics/$1.hs_metrics.txt : bam/$1.bam + $$(call RUN, -c -n 1 -s 12G -m 24G,"set -o pipefail && \ + $$(CALC_HS_METRICS) \ + REFERENCE_SEQUENCE=$$(REF_FASTA) \ + INPUT=$$(<) \ + OUTPUT=$$(@) \ + BAIT_INTERVALS=$$(TARGETS_LIST) \ + TARGET_INTERVALS=$$(TARGETS_LIST)") + +endef +$(foreach sample,$(SAMPLES),\ + $(eval $(call hs-metrics,$(sample)))) + +define gc-metrics +metrics/$1.gc_metrics.txt : bam/$1.bam + $$(call RUN, -c -n 1 -s 12G -m 24G,"set -o pipefail && \ + $$(COLLECT_GC_BIAS) \ + INPUT=$$(<) \ + OUTPUT=metrics/$1.gc_bias.txt \ + CHART_OUTPUT=metrics/$1.gc_metrics.pdf \ + REFERENCE_SEQUENCE=$$(REF_FASTA) \ + SUMMARY_OUTPUT=$$(@)") + +endef +$(foreach sample,$(SAMPLES),\ + $(eval $(call gc-metrics,$(sample)))) + +summary/idx_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).idx_stats.txt) + $(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \ + $(RSCRIPT) 
$(SCRIPTS_DIR)/qc/bam_metrics.R --option 1 --sample_names '$(SAMPLES)'") + +summary/aln_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).aln_metrics.txt) + $(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/qc/bam_metrics.R --option 2 --sample_names '$(SAMPLES)'") + +summary/insert_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).insert_metrics.txt) + $(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/qc/bam_metrics.R --option 3 --sample_names '$(SAMPLES)'") + +summary/oxog_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).oxog_metrics.txt) + $(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/qc/bam_metrics.R --option 4 --sample_names '$(SAMPLES)'") + +summary/hs_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).hs_metrics.txt) + $(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/qc/bam_metrics.R --option 5 --sample_names '$(SAMPLES)'") + +summary/gc_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).gc_metrics.txt) + $(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/qc/bam_metrics.R --option 6 --sample_names '$(SAMPLES)'") + +summary/gc_summary.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).gc_metrics.txt) + $(call RUN, -c -n 1 -s 12G -m 24G,"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/qc/bam_metrics.R --option 7 --sample_names '$(SAMPLES)'") + + +..DUMMY := $(shell mkdir -p version; \ + echo "picard" >> version/bam_metrics.txt; \ + $(PICARD) CollectAlignmentSummaryMetrics --version &>> version/bam_metrics.txt; \ + $(PICARD) CollectInsertSizeMetrics --version &>> version/bam_metrics.txt; \ + $(PICARD) CollectOxoGMetrics --version &>> version/bam_metrics.txt; \ + $(PICARD) CollectHsMetrics --version &>> version/bam_metrics.txt; \ + $(PICARD) CollectGcBiasMetrics --version &>> version/bam_metrics.txt; \ + R --version >> version/bam_metrics.txt) 
+.SECONDARY: +.DELETE_ON_ERROR: +.PHONY: bam_metrics diff --git a/qc/wgs_metrics.mk b/qc/wgs_metrics.mk new file mode 100644 index 00000000..ded23e69 --- /dev/null +++ b/qc/wgs_metrics.mk @@ -0,0 +1,116 @@ +include modules/Makefile.inc + +LOGDIR ?= log/wgs_metrics.$(NOW) + +wgs_metrics : $(foreach sample,$(SAMPLES),metrics/$(sample).idx_stats.txt) \ + $(foreach sample,$(SAMPLES),metrics/$(sample).aln_metrics.txt) \ + $(foreach sample,$(SAMPLES),metrics/$(sample).insert_metrics.txt) \ + $(foreach sample,$(SAMPLES),metrics/$(sample).oxog_metrics.txt) \ + $(foreach sample,$(SAMPLES),metrics/$(sample).gc_metrics_summary.txt) \ + $(foreach sample,$(SAMPLES),metrics/$(sample).wgs_metrics.txt) \ + $(foreach sample,$(SAMPLES),metrics/$(sample).duplicate_metrics.txt) \ + summary/idx_metrics.txt \ + summary/aln_metrics.txt \ + summary/insert_metrics.txt \ + summary/oxog_metrics.txt \ + summary/gc_metrics.txt \ + summary/wgs_metrics.txt \ + summary/duplicate_metrics.txt + +SAMTOOLS_THREADS = 4 +SAMTOOLS_MEM_THREAD = 1G + +GATK_THREADS = 4 +GATK_MEM_THREAD = 2G + +define picard-metrics +metrics/$1.idx_stats.txt : bam/$1.bam + $$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \ + $$(BAM_INDEX) \ + INPUT=$$(<) \ + > $$(@)") + +metrics/$1.aln_metrics.txt : bam/$1.bam + $$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \ + $$(COLLECT_ALIGNMENT_METRICS) \ + REFERENCE_SEQUENCE=$$(REF_FASTA) \ + INPUT=$$(<) \ + OUTPUT=$$(@)") + +metrics/$1.insert_metrics.txt : bam/$1.bam + $$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \ + $$(COLLECT_INSERT_METRICS) \ + INPUT=$$(<) \ + OUTPUT=$$(@) \ + HISTOGRAM_FILE=metrics/$1.insert_metrics.pdf \ + MINIMUM_PCT=0.05") + +metrics/$1.oxog_metrics.txt : bam/$1.bam + $$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \ + $$(COLLECT_OXOG_METRICS) \ + REFERENCE_SEQUENCE=$$(REF_FASTA) \ + INPUT=$$(<) \ + OUTPUT=$$(@)") + +metrics/$1.gc_metrics_summary.txt : bam/$1.bam + $$(call RUN, -c 
-n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \ + $$(COLLECT_GC_BIAS) \ + INPUT=$$(<) \ + OUTPUT=metrics/$1.gc_metrics.txt \ + CHART_OUTPUT=metrics/$1.gc_metrics.pdf \ + REFERENCE_SEQUENCE=$$(REF_FASTA) \ + SUMMARY_OUTPUT=$$(@)") + +metrics/$1.wgs_metrics.txt : bam/$1.bam + $$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \ + $$(COLLECT_WGS_METRICS) \ + INPUT=$$(<) \ + OUTPUT=$$(@) \ + REFERENCE_SEQUENCE=$$(REF_FASTA)") + +metrics/$1.duplicate_metrics.txt : bam/$1.bam + $$(call RUN, -c -n 1 -s 12G -m 24G -w 24:00:00,"set -o pipefail && \ + $$(COLLECT_DUP_METRICS) \ + INPUT=$$(<) \ + METRICS_FILE=$$(@)") + +endef +$(foreach sample,$(SAMPLES),\ + $(eval $(call picard-metrics,$(sample)))) + +summary/idx_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).idx_stats.txt) + $(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 1 --sample_names '$(SAMPLES)'") + +summary/aln_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).aln_metrics.txt) + $(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 2 --sample_names '$(SAMPLES)'") + +summary/insert_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).insert_metrics.txt) + $(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 3 --sample_names '$(SAMPLES)'") + +summary/oxog_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).oxog_metrics.txt) + $(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 4 --sample_names '$(SAMPLES)'") + +summary/gc_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).gc_metrics_summary.txt) + $(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 5 --sample_names '$(SAMPLES)'") + +summary/wgs_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).wgs_metrics.txt) + $(call RUN, -c 
-n 1 -s 8G -m 12G,"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 6 --sample_names '$(SAMPLES)'") + +summary/duplicate_metrics.txt : $(foreach sample,$(SAMPLES),metrics/$(sample).duplicate_metrics.txt) + $(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/wgs_metrics.R --option 7 --sample_names '$(SAMPLES)'") + +#..DUMMY := $(shell mkdir -p version; \ +# $(SAMTOOLS) --version >> version/wgs_metrics.txt; \ +# echo "gatk3" >> version/wgs_metrics.txt; \ +# $(GATK) --version >> version/wgs_metrics.txt; \ +# echo "picard" >> version/wgs_metrics.txt) +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY: wgs_metrics diff --git a/rnaseq/cufflinks.mk b/rnaseq/cufflinks.mk deleted file mode 100644 index 54e294fe..00000000 --- a/rnaseq/cufflinks.mk +++ /dev/null @@ -1,75 +0,0 @@ -# This module is used for running cufflinks -# input: $(SAMPLES) -# Options: BAM_PHRED64 = true/false -# Authors: Fong Chun Chan -# -include modules/Makefile.inc - -LOGDIR = log/cufflinks.$(NOW) - - -NUM_CORES ?= 8 -CUFFLINKS = $(HOME)/share/usr/bin/cufflinks -CUFFLINKS_OPTS = -b $(REF_FASTA) -u -g $(GENES_GTF) -p $(NUM_CORES) -u --no-update-check -v -CUFFCOMPARE = $(HOME)/share/usr/bin/cuffcompare -CUFFCOMPARE_OPTS = --no-update-check -CUFFMERGE = $(HOME)/share/usr/bin/cuffmerge -CUFFMERGE_OPTS = --no-update-check -CUFFDIFF = $(HOME)/share/usr/bin/cuffdiff -CUFFDIFF_OPTS = --no-update-check -v -CUFFQUANT = $(HOME)/share/usr/bin/cuffquant -CUFFQUANT_OPTS = --no-update-check -v -CUFFNORM = $(HOME)/share/usr/bin/cuffnorm -CUFFNORM_OPTS = --no-update-check -v -CUFFCOMPARE_OPTS = --no-update-check -s $(REF_FASTA) -r $(GENES_GTF) -V -v - -PHENO_FILE ?= pheno.txt -ifneq ($(wildcard $(PHENO_FILE)),) - A = $(shell sed '1d' $(PHENO_FILE) | cut -f1) - B = $(shell sed '1d' $(PHENO_FILE) | cut -f2) - $(foreach i,$(shell seq 1 $(words $(A))),$(eval pheno.$(word $i,$(B)) += $(word $i,$(A)))) - PHENOTYPES = $(shell sed '1d' $(PHENO_FILE) | cut -f2 | sort | uniq) -endif - 
-..DUMMY := $(shell mkdir -p version; $(CUFFLINKS) &> version/tophat.txt; echo "options: $(CUFFLINKS_OPTS)" >> version/cufflinks.txt) -.SECONDARY: -.DELETE_ON_ERROR: -.PHONY : all_cufflinks cufflinks cuffcmp cuffmerge cuffdiff cuffnorm - -all_cufflinks : cufflinks cuffcmp cuffmerge cuffdiff cuffnorm -cufflinks : $(foreach sample,$(SAMPLES),cufflinks/gtf/$(sample).transcripts.gtf) -cuffcmp : cufflinks/cuffcmp/cc.stats -cuffmerge : cufflinks/gtf/merged.gtf -cuffdiff : cufflinks/cuffdiff/gene_exp.diff -cuffnorm : cufflinks/cuffnorm/gene_exp.txt - -cufflinks/gtf/%.transcripts.gtf cufflinks/fpkm_tracking/%.isoforms.fpkm_tracking cufflinks/fpkm_tracking/%.genes.fpkm_tracking : bam/%.bam - $(call RUN,-n $(NUM_CORES) -s 2G -m 4G,"${CUFFLINKS} ${CUFFLINKS_OPTS} -o cufflinks/$* $< && \ - mkdir -p cufflinks/gtf cufflinks/fpkm_tracking && \ - ln cufflinks/$*/transcripts.gtf cufflinks/gtf/$*.transcripts.gtf && \ - ln cufflinks/$*/isoforms.fpkm_tracking cufflinks/fpkm_tracking/$*.isoforms.fpkm_tracking && \ - ln cufflinks/$*/genes.fpkm_tracking cufflinks/fpkm_tracking/$*.genes.fpkm_tracking") - -cufflinks/cuffcmp/cc.stats : $(foreach sample,$(SAMPLES),cufflinks/gtf/$(sample).transcripts.gtf) - $(call RUN,-s 10G -m 20G,"$(CUFFCOMPARE) $(CUFFCOMPARE_OPTS) -o $(@:.stats=) $^") - -cufflinks/assembly_list.txt : $(foreach sample,$(SAMPLES),cufflinks/gtf/$(sample).transcripts.gtf) - $(INIT) echo "$^" | tr ' ' '\n' > $@ - -cufflinks/gtf/merged.gtf : cufflinks/assembly_list.txt - $(call RUN,-n 8 -s 1G -m 2.5G,"$(CUFFMERGE) $(CUFFMERGE_OPTS) -o $(@D) -g $(GENES_GTF) -p 8 $<") - -cufflinks/cxb/%.cxb : cufflinks/gtf/merged.gtf bam/%.bam - $(call RUN,-n 4 -s 1G -m 2.5G,"mkdir -p cufflinks/$* && \ - $(CUFFQUANT) $(CUFFQUANT_OPTS) -o cufflinks/$* -b $(REF_FASTA) -p 4 $^ && \ - ln cufflinks/$*/abundances.cxb $@") - -cufflinks/cuffdiff/gene_exp.diff : cufflinks/gtf/merged.gtf $(foreach sample,$(SAMPLES),cufflinks/cxb/$(sample).cxb) - $(call RUN,-n 8 -s 1G -m 4G,"$(CUFFDIFF) $(CUFFDIFF_OPTS) -o 
$(@D) -p 8 $< \ - $(foreach pheno,$(PHENOTYPES),$(subst $( ),$(,),$(foreach s,$(pheno.$(pheno)),cufflinks/cxb/$s.cxb))) \ - -L $(subst $( ),$(,),$(PHENOTYPES))") - -cufflinks/cuffnorm/gene_exp.txt : cufflinks/gtf/merged.gtf $(foreach sample,$(SAMPLES),cufflinks/cxb/$(sample).cxb) - $(call RUN,-n 8 -s 1G -m 2G,"$(CUFFNORM) $(CUFFNORM_OPTS) -o $(@D) -p 8 $< \ - $(foreach pheno,$(PHENOTYPES),$(subst $( ),$(,),$(foreach s,$(pheno.$(pheno)),cufflinks/cxb/$s.cxb))) \ - -L $(subst $( ),$(,),$(PHENOTYPES))") diff --git a/rnaseq/deseq.Rnw b/rnaseq/deseq.Rnw deleted file mode 100644 index 8cc9f8de..00000000 --- a/rnaseq/deseq.Rnw +++ /dev/null @@ -1,138 +0,0 @@ -%%% Applies DESeq on a matrix of count data -%%% Inputs: counts matrix and pheno design matrix - -\documentclass{article} -\usepackage[margin=1in]{geometry} -\usepackage{here} - -\title{DESeq Analysis} -\author{Raymond Lim} - -\begin{document} - -\maketitle - -\SweaveOpts{cache=T, prefix.string=graphics/deSeq} - - -<>= -dir.create('graphics', showWarnings = F) -options(width = 100) - -includeGraphic <- function(filename, caption = NULL, width = 1) { - if (is.null(caption)) { - cat("\\includegraphics[width=", width, "\\linewidth]{", filename, "}\n", sep = "") - } else { - cat("\\begin{figure}[h!]\n") - cat("\\includegraphics[width=", width, "\\linewidth]{", filename, "}\n", sep = "") - cat("\\caption{", caption, "}\n", sep = "") - cat("\\end{figure}\n") - } - -} - -includeGraphics <- function(filenames, width = 1, caption) { - cat("\\begin{figure}[h!]\n") - for (filename in filenames) { - includeGraphic(filename, width) - } - cat("\\caption{", caption, "}\n", sep = "") - cat("\\end{figure}\n") -} -@ - - -<>= -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("DESeq")) -suppressPackageStartupMessages(library("GenomicFeatures")) -suppressPackageStartupMessages(library("Rsamtools")) -suppressPackageStartupMessages(library("xtable")) -#library(multicore) -@ - -<>= -optList <- 
list( - make_option("--condition", default = 'Condition', help = "Factor of interest in pheno file [default %default]"), - make_option("--refCondition", default = 'Normal', help = "Reference condition [default %default]"), - make_option("--altCondition", default = 'Variant', help = "Reference condition [default %default]"), - make_option("--outFile", default = NULL, help = "Output results to this file [optional]")); - -parser <- OptionParser(usage = "%prog [options] [counts file] [pheno file]", option_list = optList); - -arguments <- parse_args(parser, positional_arguments = T, arg = arguments); -opt <- arguments$options; - -if (length(arguments$args) != 2) { - cat("Need pheno design file and counts data"); - print_help(parser); - stop(); -} -@ - -<>= -phenoFile <- arguments$args[2]; -countsFile <- arguments$args[1]; - -pheno <- read.table(phenoFile, header = T, sep = '\t', row.names = 1, check.names = F); -pheno[, opt$condition] <- relevel(pheno[, opt$condition], opt$refCondition); - -counts <- read.table(countsFile, header = T, sep = '\t', na.strings = "", comment.char = "", stringsAsFactors = F, check.names = F); -counts <- counts[!duplicated(counts[,1]), ] -rownames(counts) <- counts[,1] -counts <- counts[-1] - -if (!all(colnames(counts) %in% rownames(pheno))) { - cat("Design does not match data"); -} -counts <- counts[, rownames(pheno)] - -cds <- newCountDataSet(counts, pheno[, opt$condition]) -@ - -Estimate the effective library size: - -<>= -cds <- estimateSizeFactors(cds) -sizeFactors(cds) -@ - -Estimate dispersion/variance: - -<>= -cds <- estimateDispersions(cds) - -str(fitInfo(cds)) -@ - -<>= -res <- nbinomTest(cds, levels(pData(cds)$condition)[1], levels(pData(cds)$condition)[2]) -@ - -\begin{figure} -<>= -plot(res$baseMean, res$log2FoldChange, log = "x", pch = 20, cex = .3, col = ifelse(res$padj < .1, "red", "black"), ylab = 'M', xlab = 'A') -@ - \caption{MA plot, normalised mean vs. 
log2 fold change} -\end{figure} - -\begin{figure} -<>= -hist(res$pval, breaks = 100, col = 'skyblue', border = 'slateblue', main = "", xlab = 'p-value') -@ - \caption{Histogram of p-values} -\end{figure} - -<>= -capt <- 'Top differentially expressed genes' -print(xtable(head(res[order(res$padj), ], 20), caption = capt)) -@ - -<>= -if (!is.null(opt$outFile)) { - write.table(res, file = opt$outFile, sep = '\t', quote = F, col.names=NA) -} -@ - -\end{document} - diff --git a/rnaseq/deseq.mk b/rnaseq/deseq.mk deleted file mode 100644 index bfec8c1e..00000000 --- a/rnaseq/deseq.mk +++ /dev/null @@ -1,23 +0,0 @@ -include modules/Makefile.inc -include modules/variant_callers/gatk.inc - -LOGDIR = log/deseq.$(NOW) - -DESEQ_RNW = modules/rnaseq/deseq.Rnw -SWEAVE = $(RSCRIPT) modules/scripts/Sweave.R - -DESEQ_CONDITION ?= condition -DESEQ_REF_CONDITION ?= ref - -# pheno file: sample\tpheno with header -PHENO_FILE ?= pheno.txt - -.DELETE_ON_ERROR: -.SECONDARY: - -.PHONY : all - -deseq_results.txt : sumreads/geneCounts.txt - mkdir -p graphics; $(SWEAVE) $(DESEQ_RNW) --condition $(DESEQ_CONDITION) --refCondition $(DESEQ_REF_CONDITION) --outFile $@ $< $(PHENO_FILE) - - diff --git a/rnaseq/dexseq.mk b/rnaseq/dexseq.mk deleted file mode 100644 index 23b5668f..00000000 --- a/rnaseq/dexseq.mk +++ /dev/null @@ -1,18 +0,0 @@ -include modules/Makefile.inc - -LOGDIR ?= log/exon_counts.$(NOW) -PHONY += dexseq - -dexseq : $(foreach sample,$(TUMOR_SAMPLES),dexseq/$(sample).txt) - -define exon-count -dexseq/%.txt : star/bam/%.star.sorted.filtered.bam - $$(call RUN,-c -s 8G -m 12G -w 1440,"source /home/${USER}/share/usr/anaconda-envs/jrflab-modules-0.1.5/bin/activate /home/${USER}/share/usr/anaconda-envs/dexseq && \ - /home/${USER}/share/usr/anaconda-envs/dexseq/lib/R/library/DEXSeq/python_scripts/dexseq_count.py -f bam -p yes -r pos /home/${USER}/share/reference/Ensembl/Homo_sapiens.GRCh37.75.gff $$< dexseq/$$*.txt") -endef -$(foreach sample,$(TUMOR_SAMPLES),\ - $(eval $(call 
exon-count,$sample))) - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: $(PHONY) diff --git a/rnaseq/immunedeconv.mk b/rnaseq/immunedeconv.mk new file mode 100644 index 00000000..e112137b --- /dev/null +++ b/rnaseq/immunedeconv.mk @@ -0,0 +1,25 @@ +include modules/Makefile.inc + +LOGDIR = log/immunedeconv.$(NOW) + +immunedeconv : immunedeconv/quantiseq.txt \ + immunedeconv/mcpcounter.txt \ + immunedeconv/cibersort.txt + +immunedeconv/quantiseq.txt : kallisto/tpm_by_gene.txt + $(call RUN, -c -n 1 -s 8G -m 16G -v $(IMMUNE_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/immunedeconv.R --option 1 --input_file $(<) --output_file $(@)") + +immunedeconv/mcpcounter.txt : kallisto/tpm_by_gene.txt + $(call RUN, -c -n 1 -s 8G -m 16G -v $(IMMUNE_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/immunedeconv.R --option 2 --input_file $(<) --output_file $(@)") + +immunedeconv/cibersort.txt : kallisto/tpm_by_gene.txt + $(call RUN, -c -n 1 -s 8G -m 16G -v $(IMMUNE_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/immunedeconv.R --option 3 --input_file $(<) --output_file $(@)") + +..DUMMY := $(shell mkdir -p version; \ + ~/share/usr/env/r-immunedeconv-2.1.0/bin/R --version >> version/immunedeconv.txt;) +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY: immunedeconv diff --git a/rnaseq/kallisto.mk b/rnaseq/kallisto.mk new file mode 100644 index 00000000..6db1acf1 --- /dev/null +++ b/rnaseq/kallisto.mk @@ -0,0 +1,48 @@ +include modules/Makefile.inc + +LOGDIR = log/kallisto.$(NOW) + +kallisto : $(foreach sample,$(SAMPLES),kallisto/$(sample)/$(sample)_R1.fastq.gz) \ + $(foreach sample,$(SAMPLES),kallisto/$(sample)/$(sample)_R2.fastq.gz) \ + $(foreach sample,$(SAMPLES),kallisto/$(sample)/abundance.tsv) \ + kallisto/tpm_by_gene.txt + +SLEUTH_ANNOT ?= $(HOME)/share/lib/resource_files/Hugo_ENST_ensembl75_fixed.txt +KALLISTO_INDEX ?= $(HOME)/share/lib/ref_files/b37/ensembl_v75-0.43.0_kallisto_index + +define merge-fastq +kallisto/$1/$1_R1.fastq.gz : $$(foreach split,$2,$$(word 1, 
$$(fq.$$(split)))) + $$(call RUN,-c -n 12 -s 0.5G -m 1G -w 24:00:00 -v $(PIGZ_ENV),"set -o pipefail && \ + $$(PIGZ) -cd -p 12 $$(^) | $$(PIGZ) -c -p 12 > $$(@)") + +kallisto/$1/$1_R2.fastq.gz : $$(foreach split,$2,$$(word 2, $$(fq.$$(split)))) + $$(call RUN,-c -n 12 -s 0.5G -m 1G -w 24:00:00 -v $(PIGZ_ENV),"set -o pipefail && \ + $$(PIGZ) -cd -p 12 $$(^) | $$(PIGZ) -c -p 12 > $$(@)") +endef +$(foreach sample,$(SAMPLES),\ + $(eval $(call merge-fastq,$(sample),$(split.$(sample))))) + +define fastq-to-kallisto +kallisto/$1/abundance.tsv : kallisto/$1/$1_R1.fastq.gz kallisto/$1/$1_R2.fastq.gz + $$(call RUN,-c -n 12 -s 2G -m 3G -v $(KALLISTO_ENV),"set -o pipefail && \ + kallisto quant \ + -i $$(KALLISTO_INDEX) \ + -o kallisto/$1 \ + --bias -b 100 -t 12\ + --fusion $$(<) $$(word 2,$$(^))") + +endef +$(foreach sample,$(SAMPLES),\ + $(eval $(call fastq-to-kallisto,$(sample)))) + +kallisto/tpm_by_gene.txt : $(foreach sample,$(SAMPLES),kallisto/$(sample)/abundance.tsv) + $(call RUN, -c -n 24 -s 1G -m 2G -v $(KALLISTO_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/summarize_sleuth.R --annotation $(SLEUTH_ANNOT) --samples '$(SAMPLES)'") + +..DUMMY := $(shell mkdir -p version; \ + $(SAMTOOLS) --version > version/kallisto.txt; \ + ~/share/usr/env/kallisto-0.46.2/bin/kallisto version >> version/kallisto.txt; \ + ~/share/usr/env/kallisto-0.46.2/bin/R --version >> version/kallisto.txt;) +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY: kallisto diff --git a/rnaseq/rpkm.mk b/rnaseq/rpkm.mk deleted file mode 100644 index 34a6d9b5..00000000 --- a/rnaseq/rpkm.mk +++ /dev/null @@ -1,7 +0,0 @@ -#Module calculates RPKM values. 
Depends on the sumRNASeqReads.mk -RPKM_RSCRIPT = ${RSCRIPT} ~/gascoyne/scripts/calculateRPKM.R - -rpkm/%.rpkm.txt : summarized_reads/%.summarized_reads.txt - SGE_RREQ="-N $(@F) -l mem_free=1G -q all.q -now n" \ - $(MKDIR) $(@D)/logs;\ - $(RPKM_RSCRIPT) ${TXDB_FILE} $< $@ > $(@D)/logs/$*.log 2>&1 diff --git a/rnaseq/sumRNASeqReads.mk b/rnaseq/sumRNASeqReads.mk deleted file mode 100644 index 931cf949..00000000 --- a/rnaseq/sumRNASeqReads.mk +++ /dev/null @@ -1,44 +0,0 @@ -include modules/Makefile.inc -include modules/variant_callers/gatk.inc - -LOGDIR = log/sum_reads.$(NOW) - -DEFAULT_ENV = $(HOME)/share/usr/anaconda-envs/jrflab-modules-0.1.6 - -SUM_READS_RSCRIPT = ${RSCRIPT} modules/rnaseq/summarizeRNASeqReads.R -SUM_EXONS_RSCRIPT = ${RSCRIPT} modules/rnaseq/summarizeRNASeqReadsByExon.R -SUM_INTRONS_RSCRIPT = ${RSCRIPT} modules/rnaseq/summarizeRNASeqReadsByIntron.R -SUM_READS_OPTS = - -.DELETE_ON_ERROR: -.SECONDARY: - -.PHONY : all sumreads - -SUM_TYPE = byGene byExon - -all : $(foreach type,$(SUM_TYPE),$(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.$(type).txt)) sumreads/rpkm_by_gene.txt sumreads/rpkm_by_exon.txt sumreads/counts_by_gene.txt sumreads/counts_by_exon.txt - -sumreads/%.sumreads.byGene.txt : bam/%.bam bam/%.bam.bai - $(call RUN,-v $(DEFAULT_ENV) -s 24G -m 48G,"$(SUM_READS_RSCRIPT) --genome $(REF) --outFile $@ $(SUM_READS_OPTS) $<") - -sumreads/%.sumreads.byExon.txt : bam/%.bam bam/%.bam.bai - $(call RUN,-v $(DEFAULT_ENV) -s 24G -m 48G,"$(SUM_EXONS_RSCRIPT) --genome $(REF) --outFile $@ $(SUM_READS_OPTS) $<") - -sumreads/rpkm_by_gene.txt : $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.byGene.txt) - cut -f 2 $< > $@; \ - for x in $^; do sample=`echo $$x | sed 's/.*\///; s/\..*//'`; cut -f 7 $$x | sed "s/exonRPKM/$$sample/" | paste $@ - > $@.tmp; mv $@.tmp $@; done - -sumreads/rpkm_by_exon.txt : $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.byExon.txt) - cut -f 1-2 $< > $@; \ - for x in $^; do sample=`echo $$x | sed 's/.*\///; 
s/\..*//'`; cut -f 6 $$x | sed "s/exonRPKM/$$sample/" | paste $@ - > $@.tmp; mv $@.tmp $@; done - -sumreads/counts_by_gene.txt : $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.byGene.txt) - cut -f 2 $< > $@; \ - for x in $^; do sample=`echo $$x | sed 's/.*\///; s/\..*//'`; cut -f 3 $$x | sed "s/countsByGene/$$sample/" | paste $@ - > $@.tmp; mv $@.tmp $@; done - -sumreads/counts_by_exon.txt : $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.byExon.txt) - cut -f 1-2 $< > $@; \ - for x in $^; do sample=`echo $$x | sed 's/.*\///; s/\..*//'`; cut -f 4 $$x | sed "s/exonCount/$$sample/" | paste $@ - > $@.tmp; mv $@.tmp $@; done - -include modules/bam_tools/processBam.mk diff --git a/rnaseq/sumreads.mk b/rnaseq/sumreads.mk new file mode 100644 index 00000000..f8a6e0af --- /dev/null +++ b/rnaseq/sumreads.mk @@ -0,0 +1,41 @@ +include modules/Makefile.inc + +LOGDIR = log/sum_reads.$(NOW) + +sumreads : $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.by_gene.txt) \ + $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.by_exon.txt) \ + sumreads/rpkm_by_gene.txt \ + sumreads/rpkm_by_exon.txt \ + sumreads/counts_by_gene.txt \ + sumreads/counts_by_exon.txt + +SUM_READS_OPTS = +REF ?= b37 + +sumreads/%.sumreads.by_gene.txt : bam/%.bam bam/%.bam.bai + $(call RUN,-v $(SUMREADS_ENV) -s 24G -m 48G,"$(SUM_READS_RSCRIPT) --genome $(REF) --outFile $@ $(SUM_READS_OPTS) $<") + +sumreads/%.sumreads.by_exon.txt : bam/%.bam bam/%.bam.bai + $(call RUN,-v $(SUMREADS_ENV) -s 24G -m 48G,"$(SUM_EXONS_RSCRIPT) --genome $(REF) --outFile $@ $(SUM_READS_OPTS) $<") + +sumreads/rpkm_by_gene.txt : $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.by_gene.txt) + cut -f 2 $< > $@; \ + for x in $^; do sample=`echo $$x | sed 's/.*\///; s/\..*//'`; cut -f 7 $$x | sed "s/exonRPKM/$$sample/" | paste $@ - > $@.tmp; mv $@.tmp $@; done + +sumreads/rpkm_by_exon.txt : $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.by_exon.txt) + cut -f 1-2 $< > $@; \ + for x in $^; do 
sample=`echo $$x | sed 's/.*\///; s/\..*//'`; cut -f 6 $$x | sed "s/exonRPKM/$$sample/" | paste $@ - > $@.tmp; mv $@.tmp $@; done + +sumreads/counts_by_gene.txt : $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.by_gene.txt) + cut -f 2 $< > $@; \ + for x in $^; do sample=`echo $$x | sed 's/.*\///; s/\..*//'`; cut -f 3 $$x | sed "s/countsByGene/$$sample/" | paste $@ - > $@.tmp; mv $@.tmp $@; done + +sumreads/counts_by_exon.txt : $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.by_exon.txt) + cut -f 1-2 $< > $@; \ + for x in $^; do sample=`echo $$x | sed 's/.*\///; s/\..*//'`; cut -f 4 $$x | sed "s/exonCount/$$sample/" | paste $@ - > $@.tmp; mv $@.tmp $@; done + +..DUMMY := $(shell mkdir -p version; \ + $(SUMREADS_ENV)/bin/R --version >> version/sumreads.txt;) +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY: sumreads diff --git a/scripts/annotateSummaryVcf.R b/scripts/annotateSummaryVcf.R new file mode 100755 index 00000000..ce3fc2ca --- /dev/null +++ b/scripts/annotateSummaryVcf.R @@ -0,0 +1,53 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("magrittr")) + +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) +} + +optList = list(make_option("--option", default = NA, type = 'character', help = "analysis type"), + make_option("--input", default = NA, type = 'character', help = "input file path"), + make_option("--maf", default = NA, type = 'character', help = "input maf file path"), + make_option("--output", default = NA, type = 'character', help = "output file path")) +parser = OptionParser(usage = "%prog", option_list = optList) +arguments = parse_args(parser, positional_arguments = T) +opt = arguments$options + +if (as.numeric(opt$option)==1) { + smry = readr::read_tsv(file = opt$input, col_names = TRUE, col_types = cols(.default = 
col_character())) %>% + readr::type_convert() %>% + dplyr::rename(`#CHROM` = CHROM, + POS = POS) %>% + dplyr::mutate(ID = ".", + QUAL = 100, + FILTER = "PASS", + INFO = ".") %>% + dplyr::select(`#CHROM`, POS, ID, REF, ALT, QUAL, FILTER, INFO) + cat("##fileformat=VCFv4.2\n", file = opt$output, append = FALSE) + readr::write_tsv(smry, path = opt$output, na = "NA", append = TRUE, col_names = TRUE) + +} else if (as.numeric(opt$option)==2) { + smry = readr::read_tsv(file = opt$input, col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + maf = readr::read_tsv(file = opt$maf, comment = "#", col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::mutate(Tumor_Sample_Barcode = smry$TUMOR_SAMPLE, + Matched_Norm_Sample_Barcode = smry$NORMAL_SAMPLE, + Tumor_Sample_UUID = smry$TUMOR_SAMPLE, + Matched_Norm_Sample_UUID = smry$NORMAL_SAMPLE, + t_depth = smry$TUMOR_DP, + t_ref_count = round((1-smry$TUMOR_MAF) * smry$TUMOR_DP), + t_alt_count = round(smry$TUMOR_MAF*smry$TUMOR_DP), + n_depth = smry$NORMAL_DP, + n_ref_count = round((1-smry$NORMAL_MAF) * smry$NORMAL_DP), + n_alt_count = round(smry$NORMAL_MAF*smry$NORMAL_DP), + CCF = smry$ccf, + LOH = smry$facetsLOHCall, + HOTSPOT = smry$HOTSPOT) + readr::write_tsv(x = maf, path = opt$output) + +} diff --git a/scripts/backup.sh b/scripts/backup.sh deleted file mode 100755 index 3c1894f9..00000000 --- a/scripts/backup.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -LOCK=~/.backup_lock -LOGFILE=~/.backup.log -if ! 
mkdir $LOCK 2> /dev/null; then - echo "backup script is already running" - exit 1 -fi - -TMP=`mktemp`; -TOPDIR=/ifs/e63data/reis-filho -if mountpoint -q "/mount/limr/zedshared/"; then - while [ 1 ]; do - echo "searching for files in $TOPDIR" - cd $TOPDIR - 'ls' data/*/*/bam/*.bam* projects/*/bam/*.bam* data/*/wgs*/fastq/*.fastq.gz | \ - rsync --verbose --stats --recursive -a --files-from=- --log-file=$LOGFILE --prune-empty-dirs ./ /mount/limr/zedshared - if [ "$?" = "0" ]; then - echo "rsync complete" - exit - else - echo "rsync failure, retrying in 1 minute..." - sleep 60 - fi - done -fi - -rmdir $LOCK diff --git a/scripts/bam_metrics.R b/scripts/bam_metrics.R new file mode 100755 index 00000000..a08eefaa --- /dev/null +++ b/scripts/bam_metrics.R @@ -0,0 +1,109 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("magrittr")) + +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) +} + +optList = list(make_option("--option", default = NA, type = 'character', help = "analysis type"), + make_option("--sample_names", default = NA, type = 'character', help = "sample names")) +parser = OptionParser(usage = "%prog", option_list = optList) +arguments = parse_args(parser, positional_arguments = T) +opt = arguments$options + +if (as.numeric(opt$option)==1) { + sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE)) + x = list() + for (i in 1:length(sample_names)) { + x[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".idx_stats.txt"), + col_names = FALSE, col_types = cols(.default = col_character()))[-85,,drop=FALSE] %>% + readr::type_convert() %>% + dplyr::select(CHROMOSOME = X1, + LENGTH = X2, + ALIGNED_READS = X3) %>% + dplyr::mutate(CHROMOSOME = gsub(pattern=" length=", replacement="", x=CHROMOSOME), + 
ALIGNED_READS = gsub(pattern="Aligned= ", replacement="", x=ALIGNED_READS), + SAMPLE_NAME = sample_names[i]) + } + x = do.call(rbind, x) + write_tsv(x, path="summary/idx_metrics.txt", na = "NA", append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option)==2) { + sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE)) + x = list() + for (i in 1:length(sample_names)) { + x[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".aln_metrics.txt"), + skip = 6, col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::select(-SAMPLE, -READ_GROUP) %>% + dplyr::mutate(SAMPLE_NAME = sample_names[i]) + } + x = do.call(rbind, x) + write_tsv(x, path="summary/aln_metrics.txt", na = "NA", append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option)==3) { + sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE)) + x = list() + for (i in 1:length(sample_names)) { + x[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".insert_metrics.txt"), + skip = 6, n_max = 1, col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::select(-SAMPLE, -READ_GROUP) %>% + dplyr::mutate(SAMPLE_NAME = sample_names[i]) + } + x = do.call(rbind, x) + write_tsv(x, path="summary/insert_metrics.txt", na = "NA", append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option)==4) { + sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE)) + x = list() + for (i in 1:length(sample_names)) { + x[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".oxog_metrics.txt"), + skip = 6, col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::rename(SAMPLE_NAME = SAMPLE_ALIAS) + } + x = do.call(rbind, x) + write_tsv(x, path="summary/oxog_metrics.txt", na = "NA", append = FALSE, col_names = TRUE) + +} else if 
(as.numeric(opt$option)==5) { + sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE)) + x = list() + for (i in 1:length(sample_names)) { + x[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".hs_metrics.txt"), + skip = 6, n_max = 1, col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::mutate(SAMPLE_NAME = sample_names[i]) + } + x = do.call(rbind, x) + write_tsv(x, path="summary/hs_metrics.txt", na = "NA", append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option)==6) { + sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE)) + x = list() + for (i in 1:length(sample_names)) { + x[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".gc_metrics.txt"), + skip = 6, col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::mutate(SAMPLE_NAME = sample_names[i]) + } + x = do.call(rbind, x) + write_tsv(x, path="summary/gc_metrics.txt", na = "NA", append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option)==7) { + sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE)) + x = list() + for (i in 1:length(sample_names)) { + x[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".gc_bias.txt"), + skip = 6, col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::mutate(SAMPLE_NAME = sample_names[i]) + } + x = do.call(rbind, x) + write_tsv(x, path="summary/gc_summary.txt", na = "NA", append = FALSE, col_names = TRUE) + +} diff --git a/scripts/chmod.sh b/scripts/chmod.sh deleted file mode 100755 index ba5b0892..00000000 --- a/scripts/chmod.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -chmod ug+rwX -R /ifs/e63data/reis-filho/data &> /dev/null -chmod ug+rwX -R /ifs/e63data/reis-filho/projects &> /dev/null -chmod ug+rwX -R /ifs/e63data/reis-filho/reference &> /dev/null 
-chmod o+rX -R /ifs/e63data/reis-filho/data &> /dev/null -chmod o+rX -R /ifs/e63data/reis-filho/projects &> /dev/null -chmod o+rX -R /ifs/e63data/reis-filho/reference &> /dev/null diff --git a/scripts/classify_indel_pathogenicity_vcf.py b/scripts/classify_indel_pathogenicity_vcf.py index fdb66ee9..274e9081 100644 --- a/scripts/classify_indel_pathogenicity_vcf.py +++ b/scripts/classify_indel_pathogenicity_vcf.py @@ -31,7 +31,7 @@ def query_mutation_taster(record): parser.add_argument('--qsub_queue', nargs='?', default='jrf.q,all.q', help='qsub queue') parser.add_argument('--num_provean_threads', nargs='?', default=4, type=int, help='number of provean threads') parser.add_argument('--run_local', action='store_true', default=False, help='run provean locally') - parser.add_argument('--no_remote', action='store_true', default=False, help='no remote queries: can only call potentially pathogenic') + parser.add_argument('--no_remote', action='store_true', default=True, help='no remote queries: can only call potentially pathogenic') parser.add_argument('--no_mt_provean', action='store_true', default=False, help='do not run mutation taster / provean') args = parser.parse_args() diff --git a/scripts/cnvkit.R b/scripts/cnvkit.R new file mode 100644 index 00000000..1e7b0a6e --- /dev/null +++ b/scripts/cnvkit.R @@ -0,0 +1,237 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("magrittr")) +suppressPackageStartupMessages(library("copynumber")) + +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) +} + +args_list <- list(make_option("--option", default = NA, type = 'character', help = "type of analysis"), + make_option("--sample_name", default = NA, type = 'character', help = "sample name")) +parser <- OptionParser(usage = "%prog", option_list = args_list) +arguments 
<- parse_args(parser, positional_arguments = T) +opt <- arguments$options + +'plot_log2_ratio' <- function(x) +{ + par(mar=c(5, 5, 4, 2)+.1) + plot(x = x$position, y = x$log2, type = "p", pch = ".", cex = 1, col = "grey75", axes = FALSE, frame = FALSE, xlab = "", ylab = "", main = "", ylim = c(-4,5)) + y = x %>% + dplyr::group_by(chromosome) %>% + dplyr::summarize(start = min(start_chr), + end = max(end_chr)) %>% + dplyr::mutate(chromosome = factor(chromosome, levels = c(1:22, "X"), ordered = TRUE)) %>% + dplyr::arrange(chromosome) + points(x = c(y$start[1]-1E9, y$end[nrow(y)]), y = c(0, 0), type = "l", col = "grey20", lwd = 1.15) + axis(1, at = c(y$start, y$end[nrow(y)]), labels = rep(" ", nrow(y)+1), cex.axis = 0.85, las = 1, tck = .035) + axis(1, at = .5*(y$start + y$end), labels = y$chromosome, cex.axis = 0.85, las = 1) + axis(2, at = c(-2, -1, 0, 1, 2), labels = c(-2, -1, 0, 1, 2), cex.axis = 1, las = 1) + mtext(side = 2, text = expression(Log[2]~"Ratio "), line = 3.15, cex = 1) +} + +'add_segmented' <- function(x) +{ + for (i in 1:nrow(x)) { + points(x = c(x$Start_Position[i], x$End_Position[i]), y = rep(x$Log2_Ratio[i], 2), type = "l", col = "#e41a1c", lwd = 2.75) + } +} + +'add_totalcopies' <- function(purity, ploidy, xmin, xmax) +{ + for (i in c(1, 2, 4, 6, 10)) { + y = log2(((purity*i) + (1-purity)*2)/((purity*ploidy) + (1-purity)*2)) + if (!is.na(y) & y<2) { + points(x = c(xmin, xmax), y = rep(y, 2), type = "l", col = "goldenrod3", lty = 3, lwd = 1) + } + } +} + +if (as.numeric(opt$option) == 1) { + data = readr::read_tsv(file = paste0("cnvkit/cnr/", opt$sample_name, ".cnr"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::filter(weight>.1) %>% + dplyr::filter(chromosome != "Y") %>% + dplyr::mutate(chromosome = factor(chromosome, levels = c(1:22, "X"), ordered = TRUE)) + cytoband = data %>% + dplyr::group_by(chromosome) %>% + dplyr::summarize(start = min(start), + end = max(end)) %>% + 
dplyr::mutate(chromosome = factor(chromosome, levels = c(1:22, "X"), ordered = TRUE)) %>% + dplyr::arrange(chromosome) %>% + dplyr::mutate(end = cumsum(end)) + start = rep(0, nrow(cytoband)) + start[2:nrow(cytoband)] = cytoband$end[1:(nrow(cytoband)-1)] + cytoband$start[2:nrow(cytoband)] + cytoband$start = start + data = data %>% + dplyr::left_join(cytoband %>% + dplyr::rename(start_chr = start, + end_chr = end), + by = "chromosome") %>% + dplyr::mutate(start = start + start_chr, + end = end + start_chr) %>% + dplyr::mutate(position = .5*(start + end)) %>% + dplyr::mutate(log2 = case_when( + log2 > 6 ~ 0, + log2 < (-4) ~ 0, + TRUE ~ log2 + )) + + pdf(file = paste0("cnvkit/plots/log2/", opt$sample_name, ".pdf"), width = 8, height = 3.75) + plot_log2_ratio(x = data) + dev.off() + +} else if (as.numeric(opt$option) == 2) { + data = readr::read_tsv(file = paste0("cnvkit/cnr/", opt$sample_name, ".cnr"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::filter(weight>.1) %>% + dplyr::filter(chromosome != "Y") + smoothed = winsorize(data = data %>% dplyr::select(chromosome, start, log2) %>% data.frame(), method = "mad") + segmented = pcf(data = smoothed, kmin = 25, gamma = 75, normalize = FALSE, fast = FALSE) %>% + dplyr::as_tibble() %>% + dplyr::select(Sample_Name = sampleID, Chromosome = chrom, Arm = arm, + Start_Position = start.pos, End_Position = end.pos, + N = n.probes, Log2_Ratio = mean) %>% + dplyr::mutate(Sample_Name = opt$sample_name) + readr::write_tsv(x = segmented, file = paste0("cnvkit/segmented/", opt$sample_name, ".txt"), col_names = TRUE, append = FALSE) + +} else if (as.numeric(opt$option) == 3) { + data = readr::read_tsv(file = paste0("cnvkit/cnr/", opt$sample_name, ".cnr"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::filter(weight>.1) %>% + dplyr::filter(chromosome != "Y") + smoothed = winsorize(data = data %>% dplyr::select(chromosome, 
start, log2) %>% data.frame(), method = "mad") + segmented = pcf(data = smoothed, kmin = 25, gamma = 75, normalize = FALSE, fast = FALSE) %>% + dplyr::as_tibble() %>% + dplyr::select(Sample_Name = sampleID, Chromosome = chrom, Arm = arm, + Start_Position = start.pos, End_Position = end.pos, + N = n.probes, Log2_Ratio = mean) %>% + dplyr::mutate(Sample_Name = opt$sample_name) + cytoband = data %>% + dplyr::group_by(chromosome) %>% + dplyr::summarize(start = min(start), + end = max(end)) %>% + dplyr::mutate(chromosome = factor(chromosome, levels = c(1:22, "X"), ordered = TRUE)) %>% + dplyr::arrange(chromosome) %>% + dplyr::mutate(end = cumsum(end)) + start = rep(0, nrow(cytoband)) + start[2:nrow(cytoband)] = cytoband$end[1:(nrow(cytoband)-1)] + cytoband$start[2:nrow(cytoband)] + cytoband$start = start + data = data %>% + dplyr::left_join(cytoband %>% + dplyr::rename(start_chr = start, + end_chr = end), + by = "chromosome") %>% + dplyr::mutate(start = start + start_chr, + end = end + start_chr) %>% + dplyr::mutate(position = start) %>% + dplyr::mutate(log2 = case_when( + log2 > 6 ~ 0, + log2 < (-4) ~ 0, + TRUE ~ log2 + )) + segmented = segmented %>% + dplyr::left_join(cytoband %>% + dplyr::rename(Chromosome = chromosome, + start_chr = start, + end_chr = end), + by = "Chromosome") %>% + dplyr::mutate(Start_Position = Start_Position + start_chr, + End_Position = End_Position + start_chr) + + pdf(file = paste0("cnvkit/plots/segmented/", opt$sample_name, ".pdf"), width = 8, height = 3.75) + plot_log2_ratio(x = data) + add_segmented(x = segmented) + dev.off() + +} else if (as.numeric(opt$option) == 4) { + tumor_name = unlist(strsplit(x = opt$sample_name, split = "_", fixed = TRUE))[1] + normal_name = unlist(strsplit(x = opt$sample_name, split = "_", fixed = TRUE))[2] + data = readr::read_tsv(file = paste0("cnvkit/segmented/", tumor_name, ".txt"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + facets = readr::read_tsv(file = 
paste0("facets/cncf/", tumor_name, "_", normal_name, ".out"), col_names = FALSE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + purity = as.numeric(gsub(pattern = "# Purity = ", replacement = "", x = facets %>% dplyr::slice(10) %>% .[["X1"]], fixed = TRUE)) + ploidy = as.numeric(gsub(pattern = "# Ploidy = ", replacement = "", x = facets %>% dplyr::slice(11) %>% .[["X1"]], fixed = TRUE)) + data = data %>% + dplyr::mutate(Total_Copy = ((2^(Log2_Ratio))*(purity*ploidy + (1-purity)*2) - (1-purity)*2)/purity) + readr::write_tsv(x = data, file = paste0("cnvkit/totalcopy/", tumor_name, ".txt"), col_names = TRUE, append = FALSE) + +} else if (as.numeric(opt$option) == 5) { + tumor_name = unlist(strsplit(x = opt$sample_name, split = "_", fixed = TRUE))[1] + normal_name = unlist(strsplit(x = opt$sample_name, split = "_", fixed = TRUE))[2] + data = readr::read_tsv(file = paste0("cnvkit/cnr/", tumor_name, ".cnr"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::filter(weight>.1) %>% + dplyr::filter(chromosome != "Y") + segmented = readr::read_tsv(file = paste0("cnvkit/totalcopy/", tumor_name, ".txt"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + cytoband = data %>% + dplyr::group_by(chromosome) %>% + dplyr::summarize(start = min(start), + end = max(end)) %>% + dplyr::mutate(chromosome = factor(chromosome, levels = c(1:22, "X"), ordered = TRUE)) %>% + dplyr::arrange(chromosome) %>% + dplyr::mutate(end = cumsum(end)) + start = rep(0, nrow(cytoband)) + start[2:nrow(cytoband)] = cytoband$end[1:(nrow(cytoband)-1)] + cytoband$start[2:nrow(cytoband)] + cytoband$start = start + data = data %>% + dplyr::left_join(cytoband %>% + dplyr::rename(start_chr = start, + end_chr = end), + by = "chromosome") %>% + dplyr::mutate(start = start + start_chr, + end = end + start_chr) %>% + dplyr::mutate(position = start) %>% + dplyr::mutate(log2 = case_when( + log2 > 
6 ~ 0, + log2 < (-4) ~ 0, + TRUE ~ log2 + )) + segmented = segmented %>% + dplyr::left_join(cytoband %>% + dplyr::rename(Chromosome = chromosome, + start_chr = start, + end_chr = end), + by = "Chromosome") %>% + dplyr::mutate(Start_Position = Start_Position + start_chr, + End_Position = End_Position + start_chr) + + facets = readr::read_tsv(file = paste0("facets/cncf/", tumor_name, "_", normal_name, ".out"), col_names = FALSE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + purity = as.numeric(gsub(pattern = "# Purity = ", replacement = "", x = facets %>% dplyr::slice(10) %>% .[["X1"]], fixed = TRUE)) + ploidy = as.numeric(gsub(pattern = "# Ploidy = ", replacement = "", x = facets %>% dplyr::slice(11) %>% .[["X1"]], fixed = TRUE)) + + pdf(file = paste0("cnvkit/plots/totalcopy/", tumor_name, ".pdf"), width = 8, height = 3.75) + plot_log2_ratio(x = data) + add_segmented(x = segmented) + add_totalcopies(purity, ploidy, cytoband[1,"start"]-1E9, cytoband[nrow(cytoband),"end"]) + dev.off() + +} else if (as.numeric(opt$option) == 6) { + sample_names = unlist(strsplit(x = opt$sample_name, split = " ", fixed = TRUE)) + data = list() + for (i in 1:length(sample_names)) { + data[[i]] = readr::read_tsv(file = paste0("cnvkit/totalcopy/", sample_names[i], ".txt"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + } + data = do.call(bind_rows, data) + readr::write_tsv(x = data, file = "cnvkit/summary/total_copy.txt", col_names = TRUE, append = FALSE) + +} else if (as.numeric(opt$option) == 7) { + sample_names = unlist(strsplit(x = opt$sample_name, split = " ", fixed = TRUE)) + data = list() + for (i in 1:length(sample_names)) { + data[[i]] = readr::read_tsv(file = paste0("cnvkit/cnr/", sample_names[i], ".cnr"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::select(chromosome, start, end, log2, weight) %>% + dplyr::mutate(sample_name = sample_names[i]) 
+ } + data = do.call(bind_rows, data) + readr::write_tsv(x = data, file = "cnvkit/summary/log2_ratio.txt", col_names = TRUE, append = FALSE) + +} \ No newline at end of file diff --git a/scripts/configure.py b/scripts/configure.py index 8f216041..2a226a61 100755 --- a/scripts/configure.py +++ b/scripts/configure.py @@ -1,23 +1,18 @@ #!/usr/bin/env python -from __future__ import print_function +from __future__ import print_function import yaml import argparse import collections -""" convert yaml files to make include files -""" - - def lowerBool(x): if isinstance(x, bool): return str(x).lower() else: return x - def sample_yaml2mk(samples_file, out): - samples = yaml.load(open(args.samples_file, 'r')) + samples = yaml.full_load(open(args.samples_file, 'r')) tumors = set() normals = set() @@ -95,7 +90,7 @@ def sample_yaml2mk(samples_file, out): def sample_attr_yaml2mk(sample_attr_file, out): print("\n# sample_attr_file", file=out) - sample_attr = yaml.load(open(sample_attr_file, 'r')) + sample_attr = yaml.full_load(open(sample_attr_file, 'r')) for attr, m in sample_attr.items(): for k, v in m.items(): print("{}.{} = {}".format(attr, k, v), file=out) @@ -103,7 +98,7 @@ def sample_attr_yaml2mk(sample_attr_file, out): def sample_fastq_yaml2mk(sample_fastq_file, out): print("\n# sample_fastq_file", file=out) - sample_fastq = yaml.load(open(sample_fastq_file, 'r')) + sample_fastq = yaml.full_load(open(sample_fastq_file, 'r')) split_samples = set() for k, v in sample_fastq.items(): for idx, fastq in enumerate(v): @@ -122,17 +117,15 @@ def sample_fastq_yaml2mk(sample_fastq_file, out): def sample_merge_yaml2mk(sample_merge_file, out): print("\n# sample_merge_file", file=out) - sample_merge = yaml.load(open(args.sample_merge_file, 'r')) + sample_merge = yaml.full_load(open(args.sample_merge_file, 'r')) print("MERGE_SAMPLES = {}".format(" ".join(list(sample_merge.keys()))), file=out) for k, v in sample_merge.items(): print("merge.{} = {}".format(k, " ".join(v)), file=out) if 
__name__ == '__main__': - parser = argparse.ArgumentParser(prog='configure', - description='Convert project YAML file to Make') - parser.add_argument('--project_config_file', help='project yaml config file', - default='project_config.yaml') + parser = argparse.ArgumentParser(prog='configure', description='Convert project YAML file to Make') + parser.add_argument('--project_config_file', help='project yaml config file', default='project_config.yaml') parser.add_argument('--samples_file', help='yaml samples file', default='samples.yaml') parser.add_argument('--sample_attr_file', help='yaml sample attr file', default='sample_attr.yaml') parser.add_argument('--sample_fastq_file', help='yaml sample fastq file mappings', default='sample.fastq.yaml') @@ -142,7 +135,7 @@ def sample_merge_yaml2mk(sample_merge_file, out): of = open(args.out_file, 'w') - config = yaml.load(open(args.project_config_file, 'r')) + config = yaml.full_load(open(args.project_config_file, 'r')) for k, v in config.items(): print("{} = {}".format(k.upper(), lowerBool(v)), file=of) diff --git a/scripts/createSampleSets.pl b/scripts/create_sample_sets.pl similarity index 92% rename from scripts/createSampleSets.pl rename to scripts/create_sample_sets.pl index f32b680d..2bcc0a55 100644 --- a/scripts/createSampleSets.pl +++ b/scripts/create_sample_sets.pl @@ -1,5 +1,4 @@ #!/usr/bin/env perl -# parse samples file to get sample sets (space delimited, normal last) use strict; use warnings; diff --git a/signatures/extract_signatures.R b/scripts/extract_signatures.R similarity index 100% rename from signatures/extract_signatures.R rename to scripts/extract_signatures.R diff --git a/scripts/facets_suite.R b/scripts/facets_suite.R new file mode 100644 index 00000000..11d45470 --- /dev/null +++ b/scripts/facets_suite.R @@ -0,0 +1,29 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("readr")) 
+suppressPackageStartupMessages(library("magrittr")) + +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) +} + +args_list <- list(make_option("--option", default = NA, type = 'character', help = "type of analysis"), + make_option("--sample_pairs", default = NA, type = 'character', help = "sample pairs")) +parser <- OptionParser(usage = "%prog", option_list = args_list) +arguments <- parse_args(parser, positional_arguments = T) +opt <- arguments$options + +if (as.numeric(opt$option) == 1) { + sample_names = unlist(strsplit(as.character(opt$sample_pairs), split = " ", fixed = TRUE)) + CN = list() + for (i in 1:length(sample_names)) { + CN[[i]] = readr::read_tsv(file = paste0("facets_suite/", sample_names[i], "/", sample_names[i], ".gene_level.txt"), + col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + } + CN = do.call(rbind, CN) + readr::write_tsv(x = CN, path = "facets_suite/summary.txt", col_names = TRUE, append = FALSE) + +} diff --git a/scripts/filter_sv.R b/scripts/filter_sv.R new file mode 100644 index 00000000..bb9b87f6 --- /dev/null +++ b/scripts/filter_sv.R @@ -0,0 +1,50 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("magrittr")) + +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) +} + +optList = list(make_option("--input_file", default = NA, type = 'character', help = "Input VCF file"), + make_option("--output_file", default = NA, type = 'character', help = "Output VCF file")) +parser = OptionParser(usage = "%prog", option_list = optList) +arguments = parse_args(parser, positional_arguments = T) +opt = arguments$options + +vcf = readr::read_tsv(file = as.character(opt$input_file), comment = "#", col_names = FALSE, col_types = cols(.default = 
col_character())) %>% + readr::type_convert() %>% + dplyr::filter(!grepl("SUPP_VEC=110", X8, fixed = TRUE)) %>% + dplyr::mutate(X3 = X12) %>% + dplyr::mutate(X3 = unlist(lapply(X3, function(x) { unlist(strsplit(x, split = ":", fixed = TRUE))[8] }))) %>% + dplyr::mutate(X3 = gsub(pattern = "_", replacement = ":", x = X3, fixed = TRUE)) %>% + dplyr::mutate(X5 = case_when( + grepl("DUP", X3, fixed = TRUE) ~ "", + grepl("DEL", X3, fixed = TRUE) ~ "", + grepl("INV", X3, fixed = TRUE) ~ "", + TRUE ~ X5 + )) %>% + dplyr::mutate(X8 = case_when( + grepl("DUP", X3, fixed = TRUE) ~ gsub("SVTYPE=INV", "SVTYPE=DUP", X8), + grepl("DEL", X3, fixed = TRUE) ~ gsub("SVTYPE=INV", "SVTYPE=DEL", X8), + TRUE ~ X8 + )) %>% + dplyr::rename(`#CHROM` = X1, + POS = X2, + ID = X3, + REF = X4, + ALT = X5, + QUAL = X6, + FILTER = X7, + INFO = X8, + FORMAT = X9, + SVABA = X10, + GRIDSS = X11, + MANTA = X12) + +readr::write_tsv(x = vcf, path = as.character(opt$output_file), append = TRUE, col_names = TRUE) + + diff --git a/scripts/get_basecounts.R b/scripts/get_basecounts.R new file mode 100644 index 00000000..f65fae14 --- /dev/null +++ b/scripts/get_basecounts.R @@ -0,0 +1,29 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("magrittr")) + +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) +} + +args_list <- list(make_option("--option", default = NA, type = 'character', help = "Which option?"), + make_option("--sample_name", default = NA, type = 'character', help = "sample name")) +parser <- OptionParser(usage = "%prog", option_list = args_list) +arguments <- parse_args(parser, positional_arguments = T) +opt <- arguments$options + +if (as.numeric(opt$option) == 1) { + sample_names = unlist(strsplit(x = as.character(opt$sample_name), split = " ", fixed = TRUE)) + data = list() + for 
(i in 1:length(sample_names)) { + data[[i]] = readr::read_tsv(file = paste0("gbc/", sample_names[i], ".txt.gz"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::mutate(sample_name = sample_names[i]) + } + data = do.call(bind_rows, data) + readr::write_tsv(x = data, path = "gbc/summary.txt", append = FALSE, col_names = TRUE) +} + diff --git a/scripts/gzipLogs.sh b/scripts/gzipLogs.sh deleted file mode 100644 index 05cdf2cc..00000000 --- a/scripts/gzipLogs.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/sh -# gzip old log files -LOCK=~/.gzip_lock -if ! mkdir $LOCK 2> /dev/null; then - echo "log gzip script is already running" - exit 1 -fi -find /ifs/e63data/reis-filho/data /ifs/e63data/reis-filho/projects/ -name '*.log' -mtime +5 -exec gzip {} \; -rmdir $LOCK diff --git a/scripts/hr_detect.R b/scripts/hr_detect.R new file mode 100644 index 00000000..d0ff2e03 --- /dev/null +++ b/scripts/hr_detect.R @@ -0,0 +1,237 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("magrittr")) +suppressPackageStartupMessages(library("signature.tools.lib")) + +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) +} + +args_list <- list(make_option("--option", default = NA, type = 'character', help = "type of analysis"), + make_option("--sample_name", default = NA, type = 'character', help = "sample name")) +parser <- OptionParser(usage = "%prog", option_list = args_list) +arguments <- parse_args(parser, positional_arguments = T) +opt <- arguments$options + +if (as.numeric(opt$option) == 1) { + vcf = readr::read_tsv(file = "summary/tsv/all.tsv", col_names = TRUE, col_types = cols(.default = col_character())) %>% + dplyr::filter(CHROM %in% c(1:22, "X")) %>% + dplyr::mutate(CHROM = case_when( + CHROM == "X" ~ "23", + TRUE ~ CHROM 
+ )) %>% + readr::type_convert() %>% + dplyr::arrange(CHROM, POS) %>% + dplyr::mutate(TUMOR_NORMAL = paste0(TUMOR_SAMPLE, "_", NORMAL_SAMPLE)) %>% + dplyr::filter(TUMOR_NORMAL == as.character(opt$sample_name)) %>% + dplyr::filter(variantCaller == "mutect") %>% + dplyr::filter(TUMOR_DP>=10 & NORMAL_DP>=10) %>% + dplyr::mutate(CHROM = as.character(CHROM)) %>% + dplyr::mutate(CHROM = ifelse(CHROM == "23", "X", CHROM)) %>% + dplyr::mutate(QUAL = 100, + FILTER = "PASS", + INFO = ".") %>% + dplyr::select(`#CHROM` = CHROM, + POS = POS, + ID = ID, + REF = REF, + ALT = ALT, + QUAL = QUAL, + FILTER = FILTER, + INFO = INFO) + cat("##fileformat=VCFv4.1\n", file = paste0("hr_detect/", as.character(opt$sample_name), "/", as.character(opt$sample_name), ".snv.vcf"), append = FALSE) + readr::write_tsv(x = vcf, path = paste0("hr_detect/", as.character(opt$sample_name), "/", as.character(opt$sample_name), ".snv.vcf"), col_names = TRUE, append = TRUE) + +} else if (as.numeric(opt$option) == 2) { + vcf = readr::read_tsv(file = "summary/tsv/all.tsv", col_names = TRUE, col_types = cols(.default = col_character())) %>% + dplyr::filter(CHROM %in% c(1:22, "X")) %>% + dplyr::mutate(CHROM = case_when( + CHROM == "X" ~ "23", + TRUE ~ CHROM + )) %>% + readr::type_convert() %>% + dplyr::arrange(CHROM, POS) %>% + readr::type_convert() %>% + dplyr::mutate(TUMOR_NORMAL = paste0(TUMOR_SAMPLE, "_", NORMAL_SAMPLE)) %>% + dplyr::filter(TUMOR_NORMAL == as.character(opt$sample_name)) %>% + dplyr::filter(grepl("varscan", variantCaller, fixed = TRUE)) %>% + dplyr::filter(grepl("strelka", variantCaller, fixed = TRUE)) %>% + dplyr::filter(TUMOR_DP>=10 & NORMAL_DP>=10) %>% + dplyr::mutate(CHROM = as.character(CHROM)) %>% + dplyr::mutate(CHROM = ifelse(CHROM == "23", "X", CHROM)) %>% + dplyr::mutate(QUAL = 100, + FILTER = "PASS", + INFO = ".") %>% + dplyr::select(`#CHROM` = CHROM, + POS = POS, + ID = ID, + REF = REF, + ALT = ALT, + QUAL = QUAL, + FILTER = FILTER, + INFO = INFO) + cat("##fileformat=VCFv4.1\n", 
file = paste0("hr_detect/", as.character(opt$sample_name), "/", as.character(opt$sample_name), ".indel.vcf"), append = FALSE) + readr::write_tsv(x = vcf, path = paste0("hr_detect/", as.character(opt$sample_name), "/", as.character(opt$sample_name), ".indel.vcf"), col_names = TRUE, append = TRUE) + +} else if (as.numeric(opt$option) == 3) { + cn = readr::read_tsv(file = paste0("facets/cncf/", as.character(opt$sample_name), ".txt"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::mutate(chrom = as.character(chrom)) %>% + dplyr::mutate(chrom = ifelse(chrom == "23", "X", chrom)) %>% + dplyr::mutate(seg_no = seg, + Chromosome = chrom, + chromStart = loc.start, + chromEnd = loc.end, + total.copy.number.inNormal = 2, + minor.copy.number.inNormal = 1, + total.copy.number.inTumour = tcn.em, + minor.copy.number.inTumour = lcn.em) %>% + dplyr::mutate(total.copy.number.inTumour = case_when( + is.na(total.copy.number.inTumour) ~ 2, + TRUE ~ total.copy.number.inTumour + )) %>% + dplyr::mutate(minor.copy.number.inTumour = case_when( + is.na(minor.copy.number.inTumour) ~ 2, + TRUE ~ minor.copy.number.inTumour + )) %>% + dplyr::select(seg_no, + Chromosome, + chromStart, + chromEnd, + total.copy.number.inNormal, + minor.copy.number.inNormal, + total.copy.number.inTumour, + minor.copy.number.inTumour) + + readr::write_tsv(x = cn, path = paste0("hr_detect/", as.character(opt$sample_name), "/", as.character(opt$sample_name), ".cn.txt"), col_names = TRUE, append = FALSE) + +} else if (as.numeric(opt$option) == 4) { + sv = readr::read_tsv(file = paste0("hr_detect/", as.character(opt$sample_name), "/", as.character(opt$sample_name), ".merged.bedpe"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::filter(chrom1 %in% c(1:22, "X")) %>% + dplyr::filter(chrom2 %in% c(1:22, "X")) %>% + dplyr::mutate(svclass = case_when( + svclass == "BND" ~ "translocation", + svclass == "TRA" ~ 
"translocation", + svclass == "DEL" ~ "deletion", + svclass == "DUP" ~ "tandem-duplication", + svclass == "INS" ~ "insertion", + svclass == "INV" ~ "inversion", + TRUE ~ svclass + )) %>% + dplyr::mutate(sample = as.character(opt$sample_name)) + + readr::write_tsv(x = sv, path = paste0("hr_detect/", as.character(opt$sample_name), "/", as.character(opt$sample_name), ".sv.bedpe"), col_names = TRUE, append = FALSE) + + +} else if (as.numeric(opt$option) == 5) { + url_subs_file = paste0("hr_detect/", as.character(opt$sample_name), "/", as.character(opt$sample_name), ".snv_repaired.vcf.bgz") + url_indels_file = paste0("hr_detect/", as.character(opt$sample_name), "/", as.character(opt$sample_name), ".indel_repaired.vcf.bgz") + url_cn_file = paste0("hr_detect/", as.character(opt$sample_name), "/", as.character(opt$sample_name), ".cn.txt") + url_sv_file = paste0("hr_detect/", as.character(opt$sample_name), "/", as.character(opt$sample_name), ".sv.bedpe") + + genomePlot(subsVcf.file = url_subs_file, + indelsVcf.file = url_indels_file, + cnvsTab.file = url_cn_file, + rearrBedpe.file = url_sv_file, + sampleID = as.character(opt$sample_name), + genome.v = "hg19", file.ideogram = NULL, plot_title = NULL, + no_copynumber = FALSE, no_rearrangements = FALSE, no_indels = FALSE, + no_subs_legend = FALSE, out_format = "png", + out_path = paste0("hr_detect/", as.character(opt$sample_name), "/"), + rearr_only_assembled = FALSE, base.per.unit = NULL) + +} else if (as.numeric(opt$option) == 6) { + url_subs_file = paste0("hr_detect/", as.character(opt$sample_name), "/", as.character(opt$sample_name), ".snv_repaired.vcf.bgz") + url_indels_file = paste0("hr_detect/", as.character(opt$sample_name), "/", as.character(opt$sample_name), ".indel_repaired.vcf.bgz") + url_cn_file = paste0("hr_detect/", as.character(opt$sample_name), "/", as.character(opt$sample_name), ".cn.txt") + url_sv_file = paste0("hr_detect/", as.character(opt$sample_name), "/", as.character(opt$sample_name), ".sv.bedpe") + + 
genomePlot(subsVcf.file = url_subs_file, + indelsVcf.file = url_indels_file, + cnvsTab.file = url_cn_file, + rearrBedpe.file = url_sv_file, + sampleID = as.character(opt$sample_name), + genome.v = "hg19", file.ideogram = NULL, plot_title = NULL, + no_copynumber = FALSE, no_rearrangements = FALSE, no_indels = FALSE, + no_subs_legend = FALSE, out_format = "svg", + out_path = paste0("hr_detect/", as.character(opt$sample_name), "/"), + rearr_only_assembled = FALSE, base.per.unit = NULL) + +} else if (as.numeric(opt$option) == 7) { + sample_names = unlist(strsplit(x = as.character(opt$sample_name), split = " ", fixed = TRUE)) + snv_files = unlist(lapply(sample_names, function(x) { paste0("hr_detect/", x, "/", x, ".snv_repaired.vcf.bgz") })) + indel_files = unlist(lapply(sample_names, function(x) { paste0("hr_detect/", x, "/", x, ".indel_repaired.vcf.bgz") })) + cn_files = unlist(lapply(sample_names, function(x) { paste0("hr_detect/", x, "/", x, ".cn.txt") })) + sv_files = unlist(lapply(sample_names, function(x) { paste0("hr_detect/", x, "/", x, ".sv.bedpe") })) + + names(snv_files) = names(indel_files) = names(cn_files) = names(sv_files) <- sample_names + + res = HRDetect_pipeline(genome.v = "hg19", + SNV_vcf_files = snv_files, + SV_bedpe_files = sv_files, + Indels_vcf_files = indel_files, + CNV_tab_files = cn_files, + SNV_signature_version = "COSMICv2", + nparallel = 4) + + readr::write_tsv(x = res$hrdetect_output %>% + as.data.frame() %>% + tibble::rownames_to_column(var = "sample_name") %>% + dplyr::as_tibble(), + file = "hr_detect/hrdetect_smry.txt", append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option) == 8) { + sample_names = unlist(strsplit(x = as.character(opt$sample_name), split = " ", fixed = TRUE)) + snv_files = unlist(lapply(sample_names, function(x) { paste0("hr_detect/", x, "/", x, ".snv_repaired.vcf.bgz") })) + indel_files = unlist(lapply(sample_names, function(x) { paste0("hr_detect/", x, "/", x, ".indel_repaired.vcf.bgz") })) + cn_files 
= unlist(lapply(sample_names, function(x) { paste0("hr_detect/", x, "/", x, ".cn.txt") })) + sv_files = unlist(lapply(sample_names, function(x) { paste0("hr_detect/", x, "/", x, ".sv.bedpe") })) + + names(snv_files) = names(indel_files) = names(cn_files) = names(sv_files) <- sample_names + + res = signatureFit_pipeline(genome.v = "hg19", + SNV_vcf_files = snv_files, + nparallel = 4) + + signatures_to_use = c("SBS1", "SBS2", "SBS3", "SBS4", "SBS6", "SBS7a", "SBS7c", "SBS8", + "SBS9", "SBS10a", "SBS10d", "SBS11", "SBS13", "SBS14", "SBS15", + "SBS18", "SBS20", "SBS22", "SBS24", "SBS26", "SBS30", "SBS31", + "SBS32", "SBS35", "SBS38", "SBS44", "SBS84", "SBS87", "SBS88", + "SBS90", "SBS94", "SBS95", "SBS96", "SBS97", "SBS104", "SBS105", + "SBS107", "SBS108", "SBS109", "SBS110", "SBS111", "SBS112", + "SBS113", "SBS119", "SBS129", "SBS137") + + tags_to_use = c("Deamination (Age)", "Deamination (APOBEC)", "HR deficiency", "Tobacco", "MMR deficiency", + "UV exposure", "UV exposure", "HR deficiency", "Lymphoma", "POLE deficiency", "POLD deficiency", + "Temozolomide-1,2-DMH", "Deamination (APOBEC)", "MMR deficiency (POLE deficiency)", "MMR deficiency", + "BER deficiency", "MMR deficiency (POLD deficiency)", "AAI", "Aflatoxin", "MMR deficiency", + "BER deficiency", "Platinum", "Azathioprine", "Platinum", "Similar to UV", "MMR deficiency", + "AID", "Deamination (Thiopurine)", "Colibactin", "Duocarmycin", "Similar to tobacco", "Deamination", + "Deamination", "MMR deficiency", "Platinum-related", "Deamination", "Similar to tobacco", "BER deficiency", + "Similar to tobacco", "Similar to AAI", "Platinum-related", "Platinum-related", "AAI", "Temozolomide-1,2-DMH", + "Similar to UV", "Similar to UV") + + res = res$fitResults$exposures %>% + as.data.frame() %>% + tibble::rownames_to_column(var = "sample_name") %>% + reshape2::melt(id.vars = "sample_name", variable.name = "signature", value.name = "exposure") %>% + dplyr::filter(signature %in% signatures_to_use) %>% + 
dplyr::mutate(exposure = case_when( + is.na(exposure) ~ 0, + TRUE ~ exposure + )) %>% + dplyr::group_by(sample_name) %>% + dplyr::summarize(signature = signature, + exposure = exposure/sum(exposure)) %>% + dplyr::ungroup() %>% + dplyr::left_join(dplyr::tibble(signature = signatures_to_use, + description = tags_to_use), by = "signature") + + + readr::write_tsv(x = res, file = "hr_detect/signatures_smry.txt", append = FALSE, col_names = TRUE) + +} diff --git a/scripts/immunedeconv.R b/scripts/immunedeconv.R new file mode 100644 index 00000000..b5497b5f --- /dev/null +++ b/scripts/immunedeconv.R @@ -0,0 +1,57 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("magrittr")) +suppressPackageStartupMessages(library("immunedeconv")) + + +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) +} + +optList = list(make_option('--option', type = 'character', default = NA, help = 'Immune deconv algorithm'), + make_option('--input_file', type = 'character', default = NA, help = 'Expression input file'), + make_option('--output_file', type = 'character', default = NA, help = 'Immune cell output file')) +parser = OptionParser(usage = "%prog", option_list=optList) +arguments = parse_args(parser, positional_arguments = T) +opt = arguments$options + +set_cibersort_binary("~/share/lib/resource_files/CIBERSORT/CIBERSORT.R") +set_cibersort_mat("~/share/lib/resource_files/CIBERSORT/LM22.txt") + +if (as.numeric(opt$option)==1) { + tpm_by_gene = readr::read_tsv(file = opt$input_file, col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::arrange(gene_symbol) + gene_expr = tpm_by_gene %>% + dplyr::select(-gene_symbol) %>% + as.matrix() + rownames(gene_expr) = tpm_by_gene %>% .[["gene_symbol"]] + quantiseq = 
immunedeconv::deconvolute(gene_expression = gene_expr, method = "quantiseq", scale_mrna = FALSE) + readr::write_tsv(x = quantiseq, file = opt$output_file, col_names = TRUE, append = FALSE) + +} else if (as.numeric(opt$option)==2) { + tpm_by_gene = readr::read_tsv(file = opt$input_file, col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::arrange(gene_symbol) + gene_expr = tpm_by_gene %>% + dplyr::select(-gene_symbol) %>% + as.matrix() + rownames(gene_expr) = tpm_by_gene %>% .[["gene_symbol"]] + mcpcounter = immunedeconv::deconvolute(gene_expression = gene_expr, method = "mcp_counter", scale_mrna = FALSE) + readr::write_tsv(x = mcpcounter, file = opt$output_file, col_names = TRUE, append = FALSE) + +} else if (as.numeric(opt$option)==3) { + tpm_by_gene = readr::read_tsv(file = opt$input_file, col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::arrange(gene_symbol) + gene_expr = tpm_by_gene %>% + dplyr::select(-gene_symbol) %>% + as.matrix() + rownames(gene_expr) = tpm_by_gene %>% .[["gene_symbol"]] + cibersort = immunedeconv::deconvolute(gene_expression = gene_expr, method = "cibersort_abs", scale_mrna = FALSE) + readr::write_tsv(x = cibersort, file = opt$output_file, col_names = TRUE, append = FALSE) + +} diff --git a/scripts/initProject.pl b/scripts/init_project.pl similarity index 77% rename from scripts/initProject.pl rename to scripts/init_project.pl index 04ed5d37..06d15290 100644 --- a/scripts/initProject.pl +++ b/scripts/init_project.pl @@ -22,7 +22,3 @@ unless (-e "summary_config.yaml") { copy("modules/default_yaml/summary_config.yaml", "summary_config.yaml") or die "Unable to create summary_config.yaml: $!"; } - -# unless (-e "sample_attr.yaml") { -# copy("modules/default_yaml/sample_attr.yaml", "sample_attr.yaml") or die "Unable to create sample_attr.yaml: $!"; -# } diff --git a/scripts/joinEff.pl b/scripts/join_eff.pl similarity index 97% rename from 
scripts/joinEff.pl rename to scripts/join_eff.pl index bc0b1a6b..2779407b 100644 --- a/scripts/joinEff.pl +++ b/scripts/join_eff.pl @@ -1,5 +1,4 @@ #!/usr/bin/env perl -# join EFF lines use strict; use List::MoreUtils qw(first_index indexes); diff --git a/scripts/knit.R b/scripts/knit.R index c63e70e4..f6c77bf3 100644 --- a/scripts/knit.R +++ b/scripts/knit.R @@ -12,7 +12,6 @@ input <- args[1] outPrefix <- args[2] args <- args[c(-1,-2)] -#create output dirs figPath <- file.path(outPrefix, 'figure/') cachePath <- file.path(outPrefix, 'cache/') dir.create(figPath, showWarnings = F, recursive = T) diff --git a/scripts/medicc2.R b/scripts/medicc2.R new file mode 100644 index 00000000..4289f47c --- /dev/null +++ b/scripts/medicc2.R @@ -0,0 +1,74 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("magrittr")) +suppressPackageStartupMessages(library("reshape2")) +suppressPackageStartupMessages(library("copynumber")) + +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) +} + +args_list <- list(make_option("--option", default = NA, type = 'character', help = "Which option?"), + make_option("--tumor_sample_name", default = NA, type = 'character', help = "Tumor sample name"), + make_option("--normal_sample_name", default = NA, type = 'character', help = "Normal sample name"), + make_option("--file_in", default = NA, type = 'character', help = "Input file name including path"), + make_option("--file_out", default = NA, type = 'character', help = "Output file name including path")) + +parser <- OptionParser(usage = "%prog", option_list = args_list) +arguments <- parse_args(parser, positional_arguments = T) +opt <- arguments$options + +if (as.numeric(opt$option) == 1) { + load(as.character(opt$file_in)) + cn_df = out2$jointseg %>% + dplyr::as_tibble() %>% + 
dplyr::select(Chromosome = chrom, + Position = maploc, + Log2_Ratio = cnlr) + readr::write_tsv(x = cn_df, file = as.character(opt$file_out), col_names = TRUE, append = FALSE) + +} else if (as.numeric(opt$option) == 2) { + tumor_sample_names = unlist(strsplit(x = as.character(opt$tumor_sample_name), split = " ", fixed = TRUE)) + normal_sample_name = unlist(strsplit(x = as.character(opt$normal_sample_name), split = " ", fixed = TRUE)) + cn_df = list() + for (i in 1:length(tumor_sample_names)) { + data_ = readr::read_tsv(file = paste0("medicc2/", tumor_sample_names[i], "/", tumor_sample_names[i], ".txt"), + col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + colnames(data_) = c("Chromosome", "Position", paste0(tumor_sample_names[i], "_Log2_Ratio")) + cn_df[[i]] = data_ %>% + reshape2::melt(id.vars = c("Chromosome", "Position")) + } + cn_df = do.call(bind_rows, cn_df) %>% + reshape2::dcast(Chromosome + Position ~ variable, value.var = "value", fill = 0) + readr::write_tsv(x = cn_df, file = as.character(opt$file_out), col_names = TRUE, append = FALSE) + +} else if (as.numeric(opt$option) == 3) { + tumor_sample_names = unlist(strsplit(x = as.character(opt$tumor_sample_name), split = " ", fixed = TRUE)) + normal_sample_name = unlist(strsplit(x = as.character(opt$normal_sample_name), split = " ", fixed = TRUE)) + cn_df = readr::read_tsv(file = as.character(opt$file_in), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + as.data.frame() + cn_smooth = copynumber::winsorize(data = cn_df, method = "mad", tau = 2.5, k = 40, verbose = FALSE) + cn_segmented = copynumber::multipcf(data = cn_smooth, gamma = 40, normalize = FALSE, fast = FALSE, verbose = FALSE) + + total_copies = cn_segmented %>% + dplyr::select(c("chrom", "start.pos", "end.pos", contains("Log2_Ratio"))) %>% + dplyr::rename(start = start.pos, end = end.pos) %>% + reshape2::melt(id.vars = c("chrom", "start", "end")) %>% + 
dplyr::select(sample_id = variable, + chrom, start, end, nAB = value) %>% + dplyr::mutate(sample_id = gsub(pattern = "_Log2_Ratio", replacement = "", x = sample_id, fixed = TRUE)) %>% + dplyr::left_join(readr::read_tsv(file = "facets/summary/summary.tsv", col_names = TRUE, col_types = cols(.default = col_character())) %>% + dplyr::select(sample_id = tumorName, purity, ploidy), + by = "sample_id") %>% + readr::type_convert() %>% + dplyr::mutate(nAB = ((2^nAB)*((purity*ploidy) + (2*(1-purity))) - 2*(1-purity))/purity) %>% + dplyr::mutate(nAB = round(nAB)) %>% + dplyr::select(-purity, -ploidy) + + readr::write_tsv(x = total_copies, file = as.character(opt$file_out), col_names = TRUE, append = FALSE) +} diff --git a/scripts/mimsi.R b/scripts/mimsi.R new file mode 100644 index 00000000..aebd765a --- /dev/null +++ b/scripts/mimsi.R @@ -0,0 +1,29 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("magrittr")) + +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) +} + +args_list <- list(make_option("--option", default = NA, type = 'character', help = "type of analysis"), + make_option("--sample_names", default = NA, type = 'character', help = "sample name")) +parser <- OptionParser(usage = "%prog", option_list = args_list) +arguments <- parse_args(parser, positional_arguments = T) +opt <- arguments$options + +if (as.numeric(opt$option)==1) { + sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE)) + smry = list() + for (i in 1:length(sample_names)) { + smry[[i]] = readr::read_tsv(file = paste0("mimsi/", sample_names[i], "/", sample_names[i], ".txt"), + col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + } + smry = do.call(rbind, smry) + write_tsv(smry, path="mimsi/summary.txt", append = 
FALSE, col_names = TRUE) + +} diff --git a/scripts/monitorMySQL.sh b/scripts/monitorMySQL.sh index 898c070b..6fb2b6cf 100755 --- a/scripts/monitorMySQL.sh +++ b/scripts/monitorMySQL.sh @@ -1,4 +1,5 @@ #!/bin/bash + UP=$(pgrep -u limr mysqld | wc -l); if [ "$UP" -ne 1 ]; then diff --git a/scripts/monitorGfServer.sh b/scripts/monitor_gfserver.sh similarity index 99% rename from scripts/monitorGfServer.sh rename to scripts/monitor_gfserver.sh index 6c552afd..376bb8df 100644 --- a/scripts/monitorGfServer.sh +++ b/scripts/monitor_gfserver.sh @@ -1,4 +1,5 @@ #!/bin/bash + UP=$(pgrep -u limr gfServer | wc -l); if [ "$UP" -ne 1 ]; then diff --git a/scripts/pyclone_13.R b/scripts/pyclone_13.R new file mode 100644 index 00000000..0c79350e --- /dev/null +++ b/scripts/pyclone_13.R @@ -0,0 +1,268 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("magrittr")) +suppressPackageStartupMessages(library("ggplot2")) +suppressPackageStartupMessages(library("fuzzyjoin")) +suppressPackageStartupMessages(library("reshape2")) +suppressPackageStartupMessages(library("ComplexHeatmap")) +suppressPackageStartupMessages(library("RColorBrewer")) + +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) +} + +optList = list(make_option("--option", default = NA, type = 'character', help = "analysis type"), + make_option("--sample_set", default = NA, type = 'character', help = "sample set"), + make_option("--normal_sample", default = NA, type = 'character', help = "normal sample"), + make_option("--input_file", default = NA, type = 'character', help = "input file"), + make_option("--output_file", default = NA, type = 'character', help = "output file"), + make_option("--num_iter", default = NA, type = 'character', help = "mcmc iterations")) +parser = OptionParser(usage = "%prog", 
option_list = optList) +arguments = parse_args(parser, positional_arguments = T) +opt = arguments$options + +if (as.numeric(opt$option) == 1) { + sample_set = unlist(strsplit(x = as.character(opt$sample_set), split = "_")) + normal_sample = as.character(opt$normal_sample) + sample_set = setdiff(sample_set, normal_sample) + pyclone = list() + for (i in 1:length(sample_set)) { + sufam = readr::read_tsv(file = paste0("pyclone_13/", sample_set[i], "/", sample_set[i], ".txt"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::select(Chromosome = chrom, + Position = pos, + Reference_Allele = val_ref, + Alternate_Allele = val_alt, + t_depth = cov, + t_alt_count = val_al_count) %>% + dplyr::mutate(t_ref_count = t_depth - t_alt_count) %>% + dplyr::mutate(mutation_id = paste0(Chromosome, ":", Position, ":", Reference_Allele, ":", Alternate_Allele), + ref_counts = t_ref_count, + var_counts = t_alt_count, + normal_cn = 2) + + facets = readr::read_tsv(file = paste0("facets/cncf/", sample_set[i], "_", normal_sample, ".txt"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + dplyr::mutate(chrom = ifelse(chrom == "23", "X", chrom)) %>% + dplyr::mutate(Chromosome = chrom, + Start_Position = loc.start, + End_Position = loc.end, + minor_cn = ifelse(is.na(lcn.em), "0", lcn.em), + major_cn = tcn.em) %>% + readr::type_convert() %>% + dplyr::mutate(major_cn = major_cn - minor_cn) %>% + dplyr::select(Chromosome, Start_Position, End_Position, minor_cn, major_cn) + + pyclone[[i]] = sufam %>% + dplyr::mutate(Chromosome = ifelse(Chromosome == "X", "23", Chromosome)) %>% + dplyr::mutate(Start_Position = Position, + End_Position = Position +1) %>% + readr::type_convert() %>% + fuzzyjoin::genome_left_join(facets %>% + dplyr::mutate(Chromosome = ifelse(Chromosome == "X", "23", Chromosome)) %>% + readr::type_convert(), + by = c("Chromosome", "Start_Position", "End_Position")) %>% + dplyr::mutate(sample_id = sample_set[i]) 
%>% + dplyr::select(mutation_id, sample_id, ref_counts, var_counts, normal_cn, major_cn, minor_cn) + + } + pyclone = do.call(rbind, pyclone) %>% + dplyr::filter(!is.na(ref_counts)) %>% + dplyr::filter(!is.na(var_counts)) %>% + dplyr::mutate(var_counts = ifelse(var_counts<=1, 0, var_counts)) %>% + dplyr::filter(!is.na(major_cn)) %>% + dplyr::filter(major_cn != 0) %>% + dplyr::mutate(minor_cn = ifelse(is.na(minor_cn), 0, minor_cn)) + + smry = pyclone %>% + dplyr::group_by(mutation_id) %>% + dplyr::summarize(n_x = n(), + n_y = sum(var_counts)) %>% + dplyr::ungroup() + + pyclone = pyclone %>% + dplyr::left_join(smry, by = "mutation_id") %>% + dplyr::filter(n_x == length(sample_set)) %>% + dplyr::filter(n_y > 0) + + for (i in 1:length(sample_set)) { + pyclone_ft = pyclone %>% + dplyr::filter(sample_id == sample_set[i]) %>% + dplyr::select(mutation_id, ref_counts, var_counts, normal_cn, minor_cn, major_cn) + readr::write_tsv(x = pyclone_ft, file = paste0("pyclone_13/", opt$sample_set, "/", sample_set[i], ".tsv"), append = FALSE, col_names = TRUE) + } + +} else if (as.numeric(opt$option) == 2) { + sample_set = unlist(strsplit(x = as.character(opt$sample_set), split = "_")) + normal_sample = as.character(opt$normal_sample) + sample_set = setdiff(sample_set, normal_sample) + params = list() + for (i in 1:length(sample_set)) { + params[[i]] = readr::read_tsv(file = paste0("facets/cncf/", sample_set[i], "_", normal_sample, ".out"), col_names = FALSE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::filter(grepl("# Purity", X1)) %>% + dplyr::mutate(X1 = gsub("# Purity = ", "", X1)) %>% + readr::type_convert() %>% + .[["X1"]] + } + cat(paste0("num_iters: ", as.numeric(opt$num_iter), "\n\n"), file = as.character(opt$output_file), append = FALSE) + cat("base_measure_params:\n", file = as.character(opt$output_file), append = TRUE) + cat(" alpha: 1\n", file = as.character(opt$output_file), append = TRUE) + cat(" beta: 1\n", file = 
as.character(opt$output_file), append = TRUE) + cat("\n", file = as.character(opt$output_file), append = TRUE) + cat("concentration:\n", file = as.character(opt$output_file), append = TRUE) + cat(" value: 1.0\n", file = as.character(opt$output_file), append = TRUE) + cat("\n", file = as.character(opt$output_file), append = TRUE) + cat(" prior:\n", file = as.character(opt$output_file), append = TRUE) + cat(" shape: 1.0\n", file = as.character(opt$output_file), append = TRUE) + cat(" rate: 1.0\n", file = as.character(opt$output_file), append = TRUE) + cat("\n", file = as.character(opt$output_file), append = TRUE) + cat("density: pyclone_beta_binomial\n", file = as.character(opt$output_file), append = TRUE) + cat("\n", file = as.character(opt$output_file), append = TRUE) + cat("beta_binomial_precision_params:\n", file = as.character(opt$output_file), append = TRUE) + cat(" value: 1000\n", file = as.character(opt$output_file), append = TRUE) + cat("\n", file = as.character(opt$output_file), append = TRUE) + cat(" prior:\n", file = as.character(opt$output_file), append = TRUE) + cat(" shape: 10\n", file = as.character(opt$output_file), append = TRUE) + cat(" rate: 10\n", file = as.character(opt$output_file), append = TRUE) + cat("\n", file = as.character(opt$output_file), append = TRUE) + cat(" proposal:\n", file = as.character(opt$output_file), append = TRUE) + cat(" precision: 0.1\n", file = as.character(opt$output_file), append = TRUE) + cat("\n", file = as.character(opt$output_file), append = TRUE) + cat("working_dir: pyclone_13/", file = as.character(opt$output_file), append = TRUE) + cat(as.character(opt$sample_set), file = as.character(opt$output_file), append = TRUE) + cat("\n\n", file = as.character(opt$output_file), append = TRUE) + cat("trace_dir: trace\n", file = as.character(opt$output_file), append = TRUE) + cat("init_method: connected\n", file = as.character(opt$output_file), append = TRUE) + cat("\n", file = as.character(opt$output_file), append = TRUE) 
+ cat("samples:\n", file = as.character(opt$output_file), append = TRUE) + for (i in 1:length(sample_set)) { + cat(paste0(" ", sample_set[i], ":\n"), file = as.character(opt$output_file), append = TRUE) + cat(paste0(" mutations_file: pyclone_13/", as.character(opt$sample_set), "/", sample_set[i], ".yaml\n\n"), file = as.character(opt$output_file), append = TRUE) + cat(" tumour_content:\n", file = as.character(opt$output_file), append = TRUE) + cat(paste0(" value: ", params[[i]], "\n"), file = as.character(opt$output_file), append = TRUE) + cat("\n", file = as.character(opt$output_file), append = TRUE) + cat(" error_rate: 0.01\n", file = as.character(opt$output_file), append = TRUE) + cat("\n", file = as.character(opt$output_file), append = TRUE) + } +} else if (as.numeric(opt$option) == 3) { + sample_set = unlist(strsplit(x = as.character(opt$sample_set), split = " ")) + pyclone = readr::read_tsv(file = as.character(opt$input_file), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::arrange(mutation_id) + + pyclone_ft = list() + index = 1 + for (i in 1:(length(sample_set)-1)) { + for (j in (i+1):length(sample_set)) { + pyclone_ft[[index]] = pyclone %>% + dplyr::filter(sample_id == sample_set[i]) %>% + dplyr::select(mutation_id, + cluster_id, + sample_id_x = sample_id, + cellular_prevalence_x = cellular_prevalence) %>% + dplyr::full_join(pyclone %>% + dplyr::filter(sample_id == sample_set[j]) %>% + dplyr::select(mutation_id, + sample_id_y = sample_id, + cellular_prevalence_y = cellular_prevalence), + by = "mutation_id") %>% + readr::type_convert() + index = index + 1 + } + } + pyclone_ft = do.call(bind_rows, pyclone_ft) %>% + readr::type_convert() %>% + dplyr::filter(!is.na(cellular_prevalence_x)) %>% + dplyr::filter(!is.na(cellular_prevalence_y)) %>% + dplyr::mutate(sample_id_x = factor(sample_id_x, levels = sample_set, ordered = TRUE)) %>% + dplyr::mutate(sample_id_y = factor(sample_id_y, levels = sample_set, 
ordered = TRUE)) + + smry_ = pyclone_ft %>% + dplyr::group_by(mutation_id) %>% + dplyr::summarize(cluster_id = unique(cluster_id)) %>% + dplyr::ungroup() %>% + dplyr::group_by(cluster_id) %>% + dplyr::summarize(n = n()) + + pyclone_ft = pyclone_ft %>% + dplyr::left_join(smry_, by = "cluster_id") + + colourCount = nrow(smry_) + getPalette = colorRampPalette(brewer.pal(9, "Set1")) + + plot_ = pyclone_ft %>% + ggplot(aes(x = 100*cellular_prevalence_x, y = 100*cellular_prevalence_y, color = factor(cluster_id), size = n)) + + geom_point(stat = "identity", alpha = .75, shape = 21) + + scale_color_manual(values = getPalette(colourCount)) + + xlab("\n\nCCF (%)\n") + + ylab("\nCCF (%)\n\n") + + guides(color = guide_legend(title = "Cluster", override.aes = list(shape = 19)), + size = guide_legend(title = "N")) + + facet_wrap(~sample_id_x+sample_id_y) + + pdf(file = as.character(opt$output_file), width = 18, height = 18) + print(plot_) + dev.off() + +} else if (as.numeric(opt$option) == 4) { + sample_set = unlist(strsplit(x = as.character(opt$sample_set), split = " ")) + pyclone = readr::read_tsv(file = as.character(opt$input_file), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::mutate(sample_id = paste0(sample_id, " ")) + + pyclone_mt = pyclone %>% + reshape2::dcast(formula = mutation_id~sample_id, value.var = "cellular_prevalence") %>% + dplyr::left_join(pyclone %>% + dplyr::select(mutation_id, cluster_id) %>% + dplyr::filter(!duplicated(mutation_id)), by = "mutation_id") + + smry_ = pyclone %>% + dplyr::group_by(cluster_id) %>% + dplyr::summarize(cluster_mean = mean(cellular_prevalence)) %>% + dplyr::ungroup() + + pyclone_mt = pyclone_mt %>% + dplyr::left_join(smry_, by = "cluster_id") + + index = pyclone_mt %>% + dplyr::select(-mutation_id, -cluster_id, -cluster_mean) %>% + apply(., 1, mean) + + pyclone_mt = pyclone_mt %>% + dplyr::mutate(index = index) %>% + dplyr::arrange(desc(cluster_mean), desc(cluster_id), 
desc(index)) + + cp = c("#f0f0f0","#c6dbef","#9ecae1","#6baed6","#4292c6","#2171b5","#08519c","#08519c","#08306b","#08306b","#08306b") + ca = colorRampPalette(brewer.pal(9, "Set1"))(nrow(smry_)) + names(ca) = smry_ %>% .[["cluster_id"]] + + ha = rowAnnotation( + `Cluster ID` = pyclone_mt %>% .[["cluster_id"]], + col = list(`Cluster ID` = ca), + simple_anno_size = unit(7, "mm") + ) + + pdf(file = as.character(opt$output_file), width = 12, height = 18) + draw(Heatmap(matrix = pyclone_mt %>% + dplyr::select(-mutation_id, -cluster_id, -cluster_mean, -index), + col = cp, + name = "CCF", + na_col = "#f0f0f0", + border = "white", + border_gp = gpar(lwd = 0), + cluster_rows = TRUE, + show_row_dend = FALSE, + cluster_row_slices = TRUE, + cluster_columns = TRUE, + show_column_dend = FALSE, + use_raster = FALSE, + left_annotation = ha, + row_split = pyclone_mt %>% .[["cluster_id"]], + width = unit(20, "cm"), + height = unit(40, "cm"))) + dev.off() + +} diff --git a/scripts/pyclone_vi.R b/scripts/pyclone_vi.R new file mode 100644 index 00000000..831ce0da --- /dev/null +++ b/scripts/pyclone_vi.R @@ -0,0 +1,230 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("magrittr")) +suppressPackageStartupMessages(library("fuzzyjoin")) +suppressPackageStartupMessages(library("ggplot2")) +suppressPackageStartupMessages(library("reshape2")) +suppressPackageStartupMessages(library("ComplexHeatmap")) +suppressPackageStartupMessages(library("RColorBrewer")) + +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) +} + +optList = list(make_option("--option", default = NA, type = 'character', help = "analysis type"), + make_option("--sample_set", default = NA, type = 'character', help = "sample set"), + make_option("--normal_sample", default = NA, type = 'character', help = "normal 
sample"), + make_option("--input_file", default = NA, type = 'character', help = "input file"), + make_option("--output_file", default = NA, type = 'character', help = "output file")) +parser = OptionParser(usage = "%prog", option_list = optList) +arguments = parse_args(parser, positional_arguments = T) +opt = arguments$options + +if (as.numeric(opt$option) == 1) { + sample_set = unlist(strsplit(x = as.character(opt$sample_set), split = "_")) + normal_sample = as.character(opt$normal_sample) + sample_set = setdiff(sample_set, normal_sample) + pyclone = list() + for (i in 1:length(sample_set)) { + sufam = readr::read_tsv(file = paste0("pyclone_vi/", sample_set[i], "/", sample_set[i], ".txt"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::select(Chromosome = chrom, + Position = pos, + Reference_Allele = val_ref, + Alternate_Allele = val_alt, + t_depth = cov, + t_alt_count = val_al_count) %>% + dplyr::mutate(t_ref_count = t_depth - t_alt_count) %>% + dplyr::mutate(mutation_id = paste0(Chromosome, ":", Position, ":", Reference_Allele, ":", Alternate_Allele), + ref_counts = t_ref_count, + alt_counts = t_alt_count, + normal_cn = 2) + + facets = readr::read_tsv(file = paste0("facets/cncf/", sample_set[i], "_", normal_sample, ".txt"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + dplyr::mutate(chrom = ifelse(chrom == "23", "X", chrom)) %>% + dplyr::mutate(Chromosome = chrom, + Start_Position = loc.start, + End_Position = loc.end, + minor_cn = lcn.em, + major_cn = tcn.em) %>% + readr::type_convert() %>% + dplyr::mutate(major_cn = ifelse(is.na(major_cn), 2, major_cn)) %>% + dplyr::mutate(minor_cn = ifelse(is.na(minor_cn), major_cn, minor_cn)) %>% + dplyr::mutate(major_cn = major_cn - minor_cn) %>% + dplyr::select(Chromosome, Start_Position, End_Position, minor_cn, major_cn) + + pyclone[[i]] = sufam %>% + dplyr::mutate(Chromosome = ifelse(Chromosome == "X", "23", Chromosome)) %>% + 
dplyr::mutate(Start_Position = Position, + End_Position = Position +1) %>% + readr::type_convert() %>% + fuzzyjoin::genome_left_join(facets %>% + dplyr::mutate(Chromosome = ifelse(Chromosome == "X", "23", Chromosome)) %>% + readr::type_convert(), + by = c("Chromosome", "Start_Position", "End_Position")) %>% + dplyr::mutate(sample_id = sample_set[i]) %>% + dplyr::select(mutation_id, sample_id, ref_counts, alt_counts, normal_cn, major_cn, minor_cn) + + params = readr::read_tsv(file = paste0("facets/cncf/", sample_set[i], "_", normal_sample, ".out"), col_names = FALSE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::filter(grepl("# Purity", X1)) %>% + dplyr::mutate(X1 = gsub("# Purity = ", "", X1)) %>% + readr::type_convert() %>% + .[["X1"]] + + params = ifelse(is.na(params), .1, params) + + pyclone[[i]] = pyclone[[i]] %>% + dplyr::mutate(tumour_content = params) + } + pyclone = do.call(rbind, pyclone) %>% + dplyr::filter(!is.na(ref_counts)) %>% + dplyr::filter(!is.na(alt_counts)) %>% + dplyr::mutate(alt_counts = ifelse(alt_counts<=1, 0, alt_counts)) %>% + dplyr::mutate(major_cn = ifelse(is.na(major_cn), 1, major_cn)) %>% + dplyr::mutate(major_cn = ifelse(major_cn==0, 1, major_cn)) %>% + dplyr::mutate(minor_cn = ifelse(is.na(minor_cn), 0, minor_cn)) + + smry = pyclone %>% + dplyr::group_by(mutation_id) %>% + dplyr::summarize(n_x = n(), + n_y = sum(alt_counts)) %>% + dplyr::ungroup() + + pyclone = pyclone %>% + dplyr::left_join(smry, by = "mutation_id") %>% + dplyr::filter(n_x == length(sample_set)) %>% + dplyr::filter(n_y > 0) + + readr::write_tsv(x = pyclone, file = opt$output_file, append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option) == 2) { + sample_set = unlist(strsplit(x = as.character(opt$sample_set), split = " ")) + pyclone = readr::read_tsv(file = as.character(opt$input_file), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + + pyclone_ft = list() + index = 1 +
for (i in 1:(length(sample_set)-1)) { + for (j in (i+1):length(sample_set)) { + pyclone_ft[[index]] = pyclone %>% + dplyr::filter(sample_id == sample_set[i]) %>% + dplyr::select(mutation_id, + cluster_id, + sample_id_x = sample_id, + cellular_prevalence_x = cellular_prevalence) %>% + dplyr::full_join(pyclone %>% + dplyr::filter(sample_id == sample_set[j]) %>% + dplyr::select(mutation_id, + sample_id_y = sample_id, + cellular_prevalence_y = cellular_prevalence), + by = "mutation_id") %>% + readr::type_convert() + index = index + 1 + } + } + pyclone_ft = do.call(bind_rows, pyclone_ft) %>% + readr::type_convert() %>% + dplyr::filter(!is.na(cellular_prevalence_x)) %>% + dplyr::filter(!is.na(cellular_prevalence_y)) %>% + dplyr::mutate(sample_id_x = factor(sample_id_x, levels = sample_set, ordered = TRUE)) %>% + dplyr::mutate(sample_id_y = factor(sample_id_y, levels = sample_set, ordered = TRUE)) + + smry_ = pyclone_ft %>% + dplyr::group_by(mutation_id) %>% + dplyr::summarize(cluster_id = unique(cluster_id)) %>% + dplyr::ungroup() %>% + dplyr::group_by(cluster_id) %>% + dplyr::summarize(n = n()) + + pyclone_ft = pyclone_ft %>% + dplyr::left_join(smry_, by = "cluster_id") + + colourCount = nrow(smry_) + getPalette = colorRampPalette(brewer.pal(9, "Set1")) + + plot_ = pyclone_ft %>% + ggplot(aes(x = 100*cellular_prevalence_x, y = 100*cellular_prevalence_y, color = factor(cluster_id), size = n)) + + geom_point(stat = "identity", alpha = .75, shape = 21) + + scale_color_manual(values = getPalette(colourCount)) + + xlab("\n\nCCF (%)\n") + + ylab("\nCCF (%)\n\n") + + guides(color = guide_legend(title = "Cluster"), + size = guide_legend(title = "N")) + + facet_wrap(~sample_id_x+sample_id_y) + + pdf(file = as.character(opt$output_file), width = 18, height = 18) + print(plot_) + dev.off() + +} else if (as.numeric(opt$option) == 3) { + sample_set = unlist(strsplit(x = as.character(opt$sample_set), split = " ")) + pyclone = readr::read_tsv(file = as.character(opt$input_file), 
col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::mutate(sample_id = paste0(sample_id, " ")) + + pyclone_mt = pyclone %>% + reshape2::dcast(formula = mutation_id~sample_id, value.var = "cellular_prevalence") %>% + dplyr::left_join(pyclone %>% + dplyr::select(mutation_id, cluster_id) %>% + dplyr::filter(!duplicated(mutation_id)), by = "mutation_id") + + smry_ = pyclone %>% + dplyr::group_by(cluster_id) %>% + dplyr::summarize(cluster_mean = mean(cellular_prevalence)) %>% + dplyr::ungroup() + + pyclone_mt = pyclone_mt %>% + dplyr::left_join(smry_, by = "cluster_id") + + index = pyclone_mt %>% + dplyr::select(-mutation_id, -cluster_id, -cluster_mean) %>% + apply(., 1, mean) + + pyclone_mt = pyclone_mt %>% + dplyr::mutate(index = index) %>% + dplyr::arrange(desc(cluster_mean), desc(cluster_id), desc(index)) + + cp = c("#f0f0f0","#c6dbef","#9ecae1","#6baed6","#4292c6","#2171b5","#08519c","#08519c","#08306b","#08306b","#08306b") + ca = colorRampPalette(brewer.pal(9, "Set1"))(nrow(smry_)) + names(ca) = smry_ %>% .[["cluster_id"]] + + ha = rowAnnotation( + `Cluster ID` = pyclone_mt %>% .[["cluster_id"]], + col = list(`Cluster ID` = ca), + simple_anno_size = unit(7, "mm") + ) + + pdf(file = as.character(opt$output_file), width = 12, height = 18) + draw(Heatmap(matrix = pyclone_mt %>% + dplyr::select(-mutation_id, -cluster_id, -cluster_mean, -index), + col = cp, + name = "CCF", + na_col = "#f0f0f0", + border = "white", + border_gp = gpar(lwd = 0), + cluster_rows = TRUE, + show_row_dend = FALSE, + cluster_row_slices = TRUE, + cluster_columns = TRUE, + show_column_dend = FALSE, + use_raster = FALSE, + left_annotation = ha, + row_split = pyclone_mt %>% .[["cluster_id"]], + width = unit(20, "cm"), + height = unit(40, "cm"))) + dev.off() + +} else if (as.numeric(opt$option) == 4) { + sample_set = unlist(strsplit(x = as.character(opt$sample_set), split = " ")) + pyclone = list() + for (i in 1:length(sample_set)) { + 
pyclone[[i]] = readr::read_tsv(file = paste0("pyclone_vi/", sample_set[i], "/summary/by_loci.txt"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + } + pyclone = do.call(bind_rows, pyclone) + readr::write_tsv(x = pyclone, file = "pyclone_vi/summary.txt", append = FALSE, col_names = TRUE) +} diff --git a/scripts/qmake.pl b/scripts/qmake.pl index 89738127..20461017 100755 --- a/scripts/qmake.pl +++ b/scripts/qmake.pl @@ -6,24 +6,21 @@ use Cwd; my $cwd = getcwd; -#my $fin_email_addrs = "qmake.finished\@raylim.mm.st charlottekyng+qmake.finished\@gmail.com"; -#my $err_email_addrs = "qmake.error\@raylim.mm.st charlottekyng+qmake.error\@gmail.com"; -#my $start_email_addrs = "qmake.start\@raylim.mm.st charlottekyng+qmake.start\@gmail.com"; - my $err_slack = "pipeline_error"; my $fin_slack = "pipeline_finished"; my %slack_map = ( - limr => "raylim", - debruiji => "debruiji", - brownd7 => "brownd7", - selenicp => "selenicp", - lees19 => "lees19", - ferrandl => "ferrandl", - dacruzpa => "dacruzpa" + brownd7 => "W013UH0HWUF", + selenicp => "W0142HA5LNA", + dacruzpa => "W01BT68MSSD", + parejaf => "W01BLNUF7J8", + zhuy1 => "W013UH382P9", + peix => "W0147TPN3E1", + issabhas => "U01V8R1RKQU", + xiaoy => "U01C8MPBSH5", + giacomf1 => "U06SW7W6D44" ); - sub HELP_MESSAGE { print "Usage: qmake.pl -n [name] -m -r [numAttempts]\n"; print "-m: e-mail notifications\n"; @@ -37,8 +34,13 @@ sub HELP_MESSAGE { sub slack { my ($slack_channel, $slack_message) = @_; - my $slack_url = "\$'https://jrflab.slack.com/services/hooks/slackbot?token=2TWPiY9Hu4EUteoECqCEfYAZ&channel=%23$slack_channel'"; - system "curl --data ' $slack_message' $slack_url &> /dev/null"; + my $slack_url = ""; + if ($slack_channel eq "pipeline_error") { + $slack_url = $ENV{SLACK_URL_ERR}; + } elsif ($slack_channel eq "pipeline_finished") { + $slack_url = $ENV{SLACK_URL_FIN}; + } + system "curl -X POST -H 'Content-type: application/json' --data '{\"text\":\"$slack_message\"}' 
$slack_url &> /dev/null"; } @@ -73,10 +75,8 @@ sub slack { # makefile processing =pod my $orig_args = $args; - $args =~ s;-f (\S+);"-f " . dirname($1) . "/." . basename($1) . ".tmp";e; my $optf = $1; - my @makefiles; if (defined $optf) { push @makefiles, $optf; @@ -88,9 +88,6 @@ sub slack { } push @makefiles, "Makefile"; } - - - do { my $makefile = glob(shift(@makefiles)); @@ -148,13 +145,17 @@ sub slack { $mail_subject = "**FINAL** $mail_subject"; } $mail_subject .= " Attempt " . ($n + 1) if $n > 0; + #open(MAIL, "| mail -s '$mail_subject' $addrs"); + #print MAIL "Return code: $retcode\n"; + #print MAIL "$mail_msg"; + #close MAIL; } - my $pipeline_channel_msg = "\@${slackname} $project_name :"; + my $pipeline_channel_msg = "<\@${slackname}|cal> $project_name :"; if ($opt{s} && ($retcode == 0 || $n == 0 || $n + 1 == $attempts)) { if ($retcode == 0) { # op success - my $slack_msg = "*COMPLETE* $name :ok_hand:"; + my $slack_msg = "*COMPLETE* $name :the_horns:"; &slack($fin_slack, "$pipeline_channel_msg $slack_msg"); &slack($opt{c}, $slack_msg) if $opt{c}; } else { @@ -162,7 +163,7 @@ sub slack { my $slack_msg = "*FAILURE* $cwd/$logfile"; if ($n + 1 == $attempts) { # final attempt - $slack_msg = ":troll: $slack_msg"; + $slack_msg = ":-1: $slack_msg"; &slack($opt{c}, $slack_msg) if $opt{c}; } &slack($err_slack, "$pipeline_channel_msg $slack_msg"); diff --git a/scripts/star_fish.R b/scripts/star_fish.R new file mode 100644 index 00000000..b7f53b36 --- /dev/null +++ b/scripts/star_fish.R @@ -0,0 +1,107 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("magrittr")) +suppressPackageStartupMessages(library("Starfish")) + + +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) +} + +optList = list(make_option("--option", default = NA, type = 'character', help = 
"analysis type"), + make_option("--sample_name", default = NA, type = 'character', help = "sample name"), + make_option("--input_file", default = NA, type = 'character', help = "input file"), + make_option("--output_file", default = NA, type = 'character', help = "output file")) +parser = OptionParser(usage = "%prog", option_list = optList) +arguments = parse_args(parser, positional_arguments = T) +opt = arguments$options + +if (as.numeric(opt$option)==1) { + sample_name = as.character(opt$sample_name) + bed = readr::read_tsv(file = as.character(opt$input_file), col_names = FALSE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::rename(chrom1 = X1, + start1 = X2, + end1 = X3, + chrom2 = X4, + start2 = X5, + end2 = X6, + sv_id = X7, + pe_support = X8, + strand1 = X9, + strand2 = X10, + svclass = X11) %>% + dplyr::select(chrom1, pos1 = start1, chrom2, pos2 = start2, strand1, strand2, svtype = svclass) %>% + dplyr::mutate(svtype = case_when( + svtype == "INV" & strand1 == "+" & strand2 == "+" ~ "h2hINV", + svtype == "INV" & strand1 == "-" & strand2 == "-" ~ "t2tINV", + TRUE ~ svtype + )) %>% + dplyr::mutate(sample = sample_name) + readr::write_tsv(x = bed, file = as.character(opt$output_file), col_names = TRUE, append = FALSE) + +} else if (as.numeric(opt$option)==2) { + sample_name = as.character(opt$sample_name) + data = readr::read_tsv(file = as.character(opt$input_file), col_names = TRUE, col_types = cols(.default = col_character())) %>% + dplyr::select(chromosome = chrom, + start = loc.start, + end = loc.end, + total_cn = tcn.em) %>% + dplyr::mutate(sample = sample_name) + readr::write_tsv(x = data, file = as.character(opt$output_file), col_names = TRUE, append = FALSE) + +} else if (as.numeric(opt$option)==3) { + sample_names = unlist(strsplit(x = as.character(opt$sample_name), split = " ", fixed = TRUE)) + sv_df = cn_df = gd_df = list() + for (i in 1:length(sample_names)) { + sv_df[[i]] = readr::read_tsv(file = 
paste0("star_fish/", sample_names[i], "/", sample_names[i], ".merged_sv.bedpe"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + cn_df[[i]] = readr::read_tsv(file = paste0("star_fish/", sample_names[i], "/", sample_names[i], ".merged_cn.txt"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + gd_df[[i]] = dplyr::tibble(sample = sample_names[i], gender = "unknown") %>% + readr::type_convert() + } + sv_df = do.call(rbind, sv_df) + cn_df = do.call(rbind, cn_df) + gd_df = do.call(rbind, gd_df) + starfish_link_out = starfish_link(sv_file = sv_df, prefix = "star_fish/summary/") + if (length(starfish_link_out) == 1) { + cat(starfish_link_out, file = as.character(opt$output_file), append = FALSE) + } else { + starfish_feature_out = starfish_feature(cgr = starfish_link_out$starfish_call, + complex_sv = starfish_link_out$interleave_tra_complex_sv, + cnv_file = cn_df %>% + dplyr::mutate(chromosome = as.character(chromosome)) %>% + dplyr::mutate(chromosome = case_when( + chromosome == "23" ~ "X", + TRUE ~ chromosome)), + gender_file = gd_df, + prefix = "star_fish/summary/", + genome_v = "hg19", + cnv_factor = "auto", + arm_del_rm = TRUE) + starfish_sig_out = starfish_sig(cluster_feature = starfish_feature_out$cluster_feature, + prefix = "star_fish/summary/", + cmethod = "class") + wd = getwd() + setwd("star_fish/summary/") + starfish_plot(sv_file = sv_df, cnv_file = cn_df, cgr = starfish_link_out$starfish_call, genome_v = "hg19") + setwd(wd) + cat("taskcomplete!!", file = as.character(opt$output_file), append = FALSE) + } + +} else if (as.numeric(opt$option)==4) { + df = readr::read_csv(file = "star_fish/summary/_pcawg_6signatures_class.csv", col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::mutate(sample_name = unlist(lapply(cluster_id, function(x) { paste0(unlist(strsplit(x, "_", fixed = TRUE))[1:2], collapse="_")}))) + 
readr::write_tsv(x = df, file = as.character(opt$output_file), append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option)==5) { + df = readr::read_csv(file = "star_fish/summary/_CGR_feature_matrix.csv", col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::rename(sample_name = sample) + readr::write_tsv(x = df, file = as.character(opt$output_file), append = FALSE, col_names = TRUE) +} diff --git a/scripts/sufam_gt.R b/scripts/sufam_gt.R new file mode 100644 index 00000000..98fa3726 --- /dev/null +++ b/scripts/sufam_gt.R @@ -0,0 +1,165 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("magrittr")) +suppressPackageStartupMessages(library("fuzzyjoin")) + +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) +} + +optList = list(make_option("--option", default = NA, type = 'character', help = "analysis type"), + make_option("--sample_set", default = NA, type = 'character', help = "sample set"), + make_option("--tumor_sample", default = NA, type = 'character', help = "tumor sample"), + make_option("--normal_sample", default = NA, type = 'character', help = "normal sample"), + make_option("--input_file", default = NA, type = 'character', help = "input file"), + make_option("--output_file", default = NA, type = 'character', help = "output file")) +parser = OptionParser(usage = "%prog", option_list = optList) +arguments = parse_args(parser, positional_arguments = T) +opt = arguments$options + +if (as.numeric(opt$option)==1) { + sample_set = unlist(strsplit(x = as.character(opt$sample_set), split = " ", fixed=TRUE)) + normal_sample = unlist(strsplit(x = as.character(opt$normal_sample), split = " ", fixed=TRUE)) + sample_set = setdiff(sample_set, normal_sample) + smry = readr::read_tsv(file = 
as.character(opt$input_file), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::filter(TUMOR_SAMPLE %in% sample_set) %>% + dplyr::filter(NORMAL_SAMPLE == normal_sample) %>% + dplyr::mutate(UUID = paste0(CHROM, ":", POS, "_", REF, ">", ALT)) %>% + dplyr::filter(!duplicated(UUID)) %>% + dplyr::mutate(`#CHROM` = CHROM, + POS = POS, + ID = ".", + REF = REF, + ALT = ALT, + QUAL = 100, + FILTER = "PASS", + INFO = ".") %>% + dplyr::select(`#CHROM`, POS, ID, REF, ALT, QUAL, FILTER, INFO) %>% + dplyr::mutate(`#CHROM` = as.character(`#CHROM`)) %>% + dplyr::mutate(chr_n = case_when( + `#CHROM` == "X" ~ "23", + `#CHROM` == "Y" ~ "24", + TRUE ~ `#CHROM` + )) %>% + readr::type_convert() %>% + dplyr::arrange(chr_n) %>% + dplyr::select(-chr_n) + cat("##fileformat=VCFv4.2\n", file = as.character(opt$output_file), append=FALSE) + readr::write_tsv(x = smry, path = as.character(opt$output_file), append = TRUE, col_names = TRUE) + +} else if (as.numeric(opt$option)==2) { + tumor_sample = unlist(strsplit(x = as.character(opt$tumor_sample), split = " ", fixed=TRUE)) + normal_sample = unlist(strsplit(x = as.character(opt$normal_sample), split = " ", fixed=TRUE)) + maf = readr::read_tsv(file = opt$input_file, comment = "#", col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::mutate(chrom = Chromosome, + loc.start = Start_Position, + loc.end = End_Position) %>% + dplyr::mutate(chrom = as.character(chrom)) + facets = readr::read_tsv(file = paste0("facets/cncf/", tumor_sample, "_", normal_sample, ".txt"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + dplyr::mutate(chrom = case_when( + chrom == "23" ~ "X", + TRUE ~ chrom + )) %>% + readr::type_convert() %>% + dplyr::mutate(qt = tcn.em, + q2 = tcn.em - lcn.em) %>% + dplyr::select(chrom, loc.start, loc.end, qt, q2) + maf = maf %>% + fuzzyjoin::genome_left_join(facets, by = c("chrom", "loc.start", "loc.end")) %>% +
dplyr::select(-chrom.x, -loc.start.x, -loc.end.x, -chrom.y, -loc.start.y, -loc.end.y) + + write_tsv(x = maf, path = as.character(opt$output_file), append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option)==3) { + sample_set = unlist(strsplit(x = as.character(opt$sample_set), split = " ", fixed=TRUE)) + normal_sample = unlist(strsplit(x = as.character(opt$normal_sample), split = " ", fixed=TRUE)) + sample_set = setdiff(sample_set, normal_sample) + maf = list() + for (i in 1:length(sample_set)) { + sufam = readr::read_tsv(file = paste0("sufam/", sample_set[i], ".txt"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::select(CHROM = chrom, + POS = pos, + REF = val_ref, + ALT = val_alt, + t_depth = cov, + t_alt_count = val_al_count) %>% + dplyr::mutate(t_ref_count = t_depth - t_alt_count) + + maf[[i]] = readr::read_tsv(file = paste0("sufam/", sample_set[i], "_ann.maf"), comment = "#", col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::select(-t_depth, -t_alt_count, -t_ref_count) %>% + dplyr::bind_cols(sufam) + } + maf = do.call(bind_rows, maf) + write_tsv(x = maf, path = as.character(opt$output_file), append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option)==4) { + sample_set = unlist(strsplit(x = as.character(opt$sample_set), split = " ", fixed=TRUE)) + maf = list() + for (i in 1:length(sample_set)) { + maf[[i]] = readr::read_tsv(file = paste0("sufam/", sample_set[i], ".maf"), comment = "#", col_names = TRUE, col_types = cols(.default = col_character())) + } + maf = do.call(bind_rows, maf) %>% + readr::type_convert() + smry = readr::read_tsv(file = as.character(opt$input_file), col_names = TRUE, col_types = cols(.default = col_character())) %>% + dplyr::mutate(HOTSPOT = case_when( + is.na(HOTSPOT) ~ FALSE, + HOTSPOT == "True" ~ TRUE, + HOTSPOT == "False" ~ FALSE, + HOTSPOT == "TRUE" ~ TRUE, + HOTSPOT == "FALSE" ~ FALSE + )) %>% + 
dplyr::mutate(HOTSPOT_INTERNAL = case_when( + is.na(HOTSPOT_INTERNAL) ~ FALSE, + HOTSPOT_INTERNAL == "True" ~ TRUE, + HOTSPOT_INTERNAL == "False" ~ FALSE, + HOTSPOT_INTERNAL == "TRUE" ~ TRUE, + HOTSPOT_INTERNAL == "FALSE" ~ FALSE + )) %>% + dplyr::mutate(cmo_hotspot = case_when( + is.na(cmo_hotspot) ~ FALSE, + cmo_hotspot == "True" ~ TRUE, + cmo_hotspot == "False" ~ FALSE, + cmo_hotspot == "TRUE" ~ TRUE, + cmo_hotspot == "FALSE" ~ FALSE + )) %>% + dplyr::mutate(is_Hotspot = HOTSPOT | HOTSPOT_INTERNAL | cmo_hotspot) %>% + dplyr::mutate(facetsLOHCall = case_when( + is.na(facetsLOHCall) ~ FALSE, + facetsLOHCall == "True" ~ TRUE, + facetsLOHCall == "False" ~ FALSE, + facetsLOHCall == "TRUE" ~ TRUE, + facetsLOHCall == "FALSE" ~ FALSE + )) %>% + dplyr::mutate(is_LOH = facetsLOHCall) %>% + readr::type_convert() + maf = maf %>% + dplyr::left_join(smry %>% + dplyr::group_by(CHROM, POS, REF, ALT) %>% + dplyr::summarize(is_Hotspot = unique(is_Hotspot)) %>% + dplyr::ungroup(), + by = c("CHROM", "POS", "REF", "ALT")) + maf = maf %>% + dplyr::left_join(smry %>% + dplyr::select(CHROM, POS, REF, ALT, Tumor_Sample_Barcode = TUMOR_SAMPLE, is_LOH) %>% + dplyr::mutate(is_present = TRUE), + by = c("CHROM", "POS", "REF", "ALT", "Tumor_Sample_Barcode")) %>% + dplyr::mutate(is_present = case_when( + is.na(is_present) ~ FALSE, + TRUE ~ is_present + )) + write_tsv(x = maf, path = as.character(opt$output_file), append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option)==5) { + maf = readr::read_tsv(file = as.character(opt$input_file), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::filter(is_present) + write_tsv(x = maf, path = as.character(opt$output_file), append = FALSE, col_names = TRUE) +} + diff --git a/rnaseq/summarizeRNASeqReads.R b/scripts/summarize_rnaseqreads.R similarity index 100% rename from rnaseq/summarizeRNASeqReads.R rename to scripts/summarize_rnaseqreads.R diff --git a/rnaseq/summarizeRNASeqReadsByExon.R 
b/scripts/summarize_rnaseqreads_byexon.R similarity index 100% rename from rnaseq/summarizeRNASeqReadsByExon.R rename to scripts/summarize_rnaseqreads_byexon.R diff --git a/rnaseq/summarizeRNASeqReadsByIntron.R b/scripts/summarize_rnaseqreads_byintron.R similarity index 100% rename from rnaseq/summarizeRNASeqReadsByIntron.R rename to scripts/summarize_rnaseqreads_byintron.R diff --git a/scripts/summarize_sleuth.R b/scripts/summarize_sleuth.R new file mode 100644 index 00000000..51f018b4 --- /dev/null +++ b/scripts/summarize_sleuth.R @@ -0,0 +1,34 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("magrittr")) +suppressPackageStartupMessages(library("sleuth")) + + +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) +} + +optList = list(make_option('--annotation', type = 'character', default = NA, help = 'path to annotation file'), + make_option('--samples', type = 'character', default = NA, help = 'list of samples names')) +parser = OptionParser(usage = "%prog", option_list=optList) +arguments = parse_args(parser, positional_arguments = T) +opt = arguments$options + +sample_names = unlist(strsplit(x=opt$samples, split=" ", fixed=TRUE)) +annotation = readr::read_tsv(file=opt$annotation, col_names=TRUE, col_types=cols(.default=col_character())) +manifest = dplyr::tibble(sample = sample_names, + condition = rep(1, length(sample_names)), + path = paste0("kallisto/", sample_names)) +data = sleuth::sleuth_prep(sample_to_covariates = manifest, + extra_bootstrap_summary = TRUE, + read_bootstrap_tpm = TRUE, + target_mapping = annotation, + aggregation_column = "hugo", + gene_mode = TRUE) +res = as.data.frame(sleuth_to_matrix(data, "obs_norm", "tpm")) +tpm_bygene = dplyr::tibble(gene_symbol = rownames(res)) %>% + dplyr::bind_cols(dplyr::as_tibble(res)) 
+write_tsv(x=tpm_bygene, path="kallisto/tpm_by_gene.txt", append=FALSE, col_names=TRUE, quote_escape=FALSE) diff --git a/scripts/sv_signature.R b/scripts/sv_signature.R new file mode 100644 index 00000000..3ec46f58 --- /dev/null +++ b/scripts/sv_signature.R @@ -0,0 +1,82 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("magrittr")) +suppressPackageStartupMessages(library("signature.tools.lib")) + +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) +} + +optList = list(make_option("--option", default = NA, type = 'character', help = "analysis type"), + make_option("--sample_name", default = NA, type = 'character', help = "sample name"), + make_option("--input_file", default = NA, type = 'character', help = "input file"), + make_option("--output_file", default = NA, type = 'character', help = "output file")) +parser = OptionParser(usage = "%prog", option_list = optList) +arguments = parse_args(parser, positional_arguments = T) +opt = arguments$options + +if (as.numeric(opt$option)==1) { + sample_name = as.character(opt$sample_name) + sv_bedpe = readr::read_tsv(file = as.character(opt$input_file), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::mutate(sample = sample_name) %>% + dplyr::select(-svclass) + res_list = bedpeToRearrCatalogue(sv_bedpe %>% data.frame()) + catalogues_mutations = data.frame(row.names = rownames(res_list$rearr_catalogue), stringsAsFactors = FALSE) + bedpecolumns = c("chrom1", "start1", "end1", "chrom2", "start2", "end2" , "sample","svclass","id", "is.clustered", "length") + catalogues_mutations = cbind(catalogues_mutations,res_list$rearr_catalogue) + mtype_mutations = signature.tools.lib:::getTypeOfMutationsFromChannels(catalogues_mutations) + exposureFilterType = 
"fixedThreshold" + threshold_percent = 5 + optimisation_method = "KLD" + useBootstrap = FALSE + nboot = 1000 + threshold_p.value = 0.05 + nparallel = 4 + randomSeed = 1 + fit = Fit(catalogues = catalogues_mutations, + signatures = signature.tools.lib:::RefSigv1_rearr, + exposureFilterType = exposureFilterType, + threshold_percent = threshold_percent, + method = optimisation_method, + useBootstrap = useBootstrap, + nboot = nboot, + threshold_p.value = threshold_p.value, + nparallel = nparallel, + randomSeed = randomSeed, + verbose = TRUE) + x = dplyr::tibble(feature_name = rownames(fit$catalogues), + feature_count = as.vector(fit$catalogues[,1])) %>% + dplyr::mutate(sample_name = sample_name) + readr::write_tsv(x = x, file = paste0(opt$output_file, "_features.txt"), col_names = TRUE, append = FALSE) + + x = dplyr::tibble(signature_name = colnames(fit$exposures), + signature_exposure = as.vector(fit$exposures[1,])/sum(as.vector(fit$exposures[1,])) * 100) %>% + dplyr::mutate(sample_name = sample_name) + readr::write_tsv(x = x, file = paste0(opt$output_file, "_exposures.txt"), col_names = TRUE, append = FALSE) + +} else if (as.numeric(opt$option)==2) { + sample_name = unlist(strsplit(x = as.character(opt$sample_name), split = " ", fixed = TRUE)) + signature_df = list() + for (i in 1:length(sample_name)) { + signature_df[[i]] = readr::read_tsv(file = paste0("sv_signature/", sample_name[i], "/", sample_name[i], ".merged_exposures.txt"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + } + signature_df = do.call(bind_rows, signature_df) + readr::write_tsv(x = signature_df, file = as.character(opt$output_file), col_names = TRUE, append = FALSE) + +} else if (as.numeric(opt$option)==3) { + sample_name = unlist(strsplit(x = as.character(opt$sample_name), split = " ", fixed = TRUE)) + signature_df = list() + for (i in 1:length(sample_name)) { + signature_df[[i]] = readr::read_tsv(file = paste0("sv_signature/", sample_name[i], "/", 
sample_name[i], ".merged_features.txt"), col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::mutate(feature_proportion = 100*feature_count / sum(feature_count)) %>% + dplyr::select(feature_name, feature_count, feature_proportion, sample_name) + } + signature_df = do.call(bind_rows, signature_df) + readr::write_tsv(x = signature_df, file = as.character(opt$output_file), col_names = TRUE, append = FALSE) +} diff --git a/scripts/wgs_metrics.R b/scripts/wgs_metrics.R new file mode 100755 index 00000000..48c35cb6 --- /dev/null +++ b/scripts/wgs_metrics.R @@ -0,0 +1,109 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("magrittr")) + +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) +} + +optList = list(make_option("--option", default = NA, type = 'character', help = "analysis type"), + make_option("--sample_names", default = NA, type = 'character', help = "sample names")) +parser = OptionParser(usage = "%prog", option_list = optList) +arguments = parse_args(parser, positional_arguments = T) +opt = arguments$options + +if (as.numeric(opt$option)==1) { + sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE)) + metrics = list() + for (i in 1:length(sample_names)) { + metrics[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".idx_stats.txt"), + col_names = FALSE, col_types = cols(.default = col_character()))[-85,,drop=FALSE] %>% + readr::type_convert() %>% + dplyr::select(CHROMOSOME = X1, + LENGTH = X2, + ALIGNED_READS = X3) %>% + dplyr::mutate(CHROMOSOME = gsub(pattern=" length=", replacement="", x=CHROMOSOME), + ALIGNED_READS = gsub(pattern="Aligned= ", replacement="", x=ALIGNED_READS), + SAMPLE_NAME = sample_names[i]) + } + metrics = do.call(rbind, 
metrics) + write_tsv(metrics, path="summary/idx_metrics.txt", na = "NA", append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option)==2) { + sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE)) + metrics = list() + for (i in 1:length(sample_names)) { + metrics[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".aln_metrics.txt"), + skip = 6, col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::select(-SAMPLE, -READ_GROUP) %>% + dplyr::mutate(SAMPLE_NAME = sample_names[i]) + } + metrics = do.call(rbind, metrics) + write_tsv(metrics, path="summary/aln_metrics.txt", na = "NA", append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option)==3) { + sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE)) + metrics = list() + for (i in 1:length(sample_names)) { + metrics[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".insert_metrics.txt"), + skip = 6, n_max = 1, col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::select(-SAMPLE, -READ_GROUP) %>% + dplyr::mutate(SAMPLE_NAME = sample_names[i]) + } + metrics = do.call(rbind, metrics) + write_tsv(metrics, path="summary/insert_metrics.txt", na = "NA", append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option)==4) { + sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE)) + metrics = list() + for (i in 1:length(sample_names)) { + metrics[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".oxog_metrics.txt"), + skip = 6, col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::rename(SAMPLE_NAME = SAMPLE_ALIAS) + } + metrics = do.call(rbind, metrics) + write_tsv(metrics, path="summary/oxog_metrics.txt", na = "NA", append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option)==5) { + 
sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE)) + metrics = list() + for (i in 1:length(sample_names)) { + metrics[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".gc_metrics.txt"), + skip = 6, col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::mutate(SAMPLE_NAME = sample_names[i]) + } + metrics = do.call(rbind, metrics) + write_tsv(metrics, path="summary/gc_metrics.txt", na = "NA", append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option)==6) { + sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE)) + metrics = list() + for (i in 1:length(sample_names)) { + metrics[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".wgs_metrics.txt"), + skip = 6, n_max = 1, col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::mutate(SAMPLE_NAME = sample_names[i]) + } + metrics = do.call(rbind, metrics) + write_tsv(metrics, path="summary/wgs_metrics.txt", na = "NA", append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option)==7) { + sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE)) + metrics = list() + for (i in 1:length(sample_names)) { + metrics[[i]] = readr::read_tsv(file = paste0("metrics/", sample_names[i], ".duplicate_metrics.txt"), + skip = 6, n_max = 1, col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::mutate(SAMPLE_NAME = sample_names[i]) + } + metrics = do.call(rbind, metrics) + write_tsv(metrics, path="summary/duplicate_metrics.txt", na = "NA", append = FALSE, col_names = TRUE) + +} diff --git a/signatures/createNMFinput.m b/signatures/createNMFinput.m deleted file mode 100644 index e61040b2..00000000 --- a/signatures/createNMFinput.m +++ /dev/null @@ -1,17 +0,0 @@ -function createNMFinput( mutationFile, sampleNameFile, typesFile, cancerType, 
inputFile) -%create WTSI input -% convert mutsig mutation matrix file and sample name file into input for -% WTSI mutation signature package - -originalGenomes = importdata(mutationFile)'; - -fid = fopen(sampleNameFile); -sampleNames = textscan(fid, '%s'); -fclose(fid); -sampleNames = sampleNames{1}; - -load(typesFile); - -save(inputFile, 'originalGenomes', 'subtypes', 'types', 'sampleNames', 'cancerType'); -quit -end diff --git a/signatures/deconstruct_sigs.mk b/signatures/deconstruct_sigs.mk index c6721dc2..ba309ad8 100644 --- a/signatures/deconstruct_sigs.mk +++ b/signatures/deconstruct_sigs.mk @@ -1,24 +1,27 @@ include modules/Makefile.inc LOGDIR = log/deconstruct_sigs.$(NOW) -PHONY += deconstructsigs deconstructsigs/signatures deconstructsigs/plots/context -deconstructsigs : $(foreach sample,$(TUMOR_SAMPLES),deconstructsigs/signatures/$(sample).RData) $(foreach sample,$(TUMOR_SAMPLES),deconstructsigs/plots/context/$(sample).pdf) +deconstructsigs : $(foreach sample,$(TUMOR_SAMPLES),deconstructsigs/signatures/$(sample).RData) \ + $(foreach sample,$(TUMOR_SAMPLES),deconstructsigs/plots/context/$(sample).pdf) define extract-signatures deconstructsigs/signatures/%.RData : summary/tsv/mutation_summary.tsv - $$(call RUN,-s 4G -m 6G -v $(DECONSTRUCTSIGS_ENV),"$(RSCRIPT) modules/signatures/extract_signatures.R --sample_name $$(*)") + $$(call RUN,-s 4G -m 6G -v $(DECONSTRUCTSIGS_ENV),"set -o pipefail && \ + $(RSCRIPT) modules/signatures/extract_signatures.R \ + --sample_name $$(*)") deconstructsigs/plots/context/%.pdf : deconstructsigs/signatures/%.RData - $$(call RUN,-s 4G -m 6G -v $(DECONSTRUCTSIGS_ENV),"mkdir -p deconstructsigs/plots/context && \ - mkdir -p deconstructsigs/plots/exposures && \ - $(RSCRIPT) modules/signatures/plot_signatures.R --sample_name $$(*)") + $$(call RUN,-s 4G -m 6G -v $(DECONSTRUCTSIGS_ENV),"set -o pipefail && \ + $(RSCRIPT) modules/signatures/plot_signatures.R \ + --sample_name $$(*)") endef $(foreach sample,$(TUMOR_SAMPLES),\ $(eval $(call 
extract-signatures,$(sample)))) - -.DELETE_ON_ERROR: +..DUMMY := $(shell mkdir -p version; \ + $(DECONSTRUCTSIGS_ENV)/bin/R --version > version/deconstruct_sigs.txt) .SECONDARY: -.PHONY: $(PHONY) +.DELETE_ON_ERROR: +.PHONY: deconstructsigs \ No newline at end of file diff --git a/signatures/emu.mk b/signatures/emu.mk deleted file mode 100644 index 7d48e555..00000000 --- a/signatures/emu.mk +++ /dev/null @@ -1,63 +0,0 @@ -include modules/Makefile.inc - -LOGDIR = log/emu.$(NOW) - -EMU_PREPARE = $(HOME)/usr/bin/EMu-prepare -EMU_PREPARE_OPTS := --chr $(EMU_REF_DIR) -ifdef EMU_TARGETS_FILE -EMU_PREPARE_OPTS += --regions $(EMU_TARGETS_FILE) -endif -EMU = $(HOME)/usr/bin/EMu - -PLOT_EMU = $(RSCRIPT) modules/signatures/plot_emu_signatures.R - -NO_CNV ?= false - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: all - -ALL := emu/emu_results_bic.txt emu/report/index.html -ifdef NUM_SPECTRA -ALL += emu/emu_$(NUM_SPECTRA).timestamp -endif - -all : $(ALL) - -ALL_TABLE ?= alltables/allTN.mutect_snps.tab.txt - -emu/mutations.txt : $(ALL_TABLE) - $(INIT) awk 'NR > 1 { sub("X", "23", $$3); sub("Y", "24", $$3); sub("MT", "25", $$3); print $$1 "_" $$2, $$3, $$4, $$6 ">" $$7 }' $< | cat - $(EMU_REF_MUTATIONS) > $@ - -emu/cnv.txt : $(foreach pair,$(SAMPLE_PAIRS),freec/$(pair)/$(tumor.$(pair)).bam_CNVs) - $(INIT) rm -f $@; for x in $^; do \ - sample=`echo $$x | sed 's:freec/::; s:/.*::'`; \ - awk -v sample=$$sample 'NR > 1 { sub("chr", "", $$1); sub("X", "23" , $$1); sub("Y", "24", $$1); sub("MT", "25", $$1); print sample, $$1, $$2, $$3, $$4; }' $$x >> $@; \ - done && cat $(EMU_REF_CNV) >> $@ - -ifeq ($(NO_CNV),false) -emu/mutations.txt.mut.matrix : emu/mutations.txt emu/cnv.txt - $(call RUN,-s 4G -m 8G,"$(EMU_PREPARE) $(EMU_PREPARE_OPTS) --cnv $(<<) --mut $< --pre $(@D) --regions $(EMU_TARGETS_FILE)") -else -emu/mutations.txt.mut.matrix : emu/mutations.txt - $(call RUN,-s 4G -m 8G,"$(EMU_PREPARE) $(EMU_PREPARE_OPTS) --chr $(EMU_REF_DIR) --mut $< --pre $(@D)") -endif - -emu/emu_results_bic.txt : 
emu/mutations.txt.mut.matrix - $(call RUN,-s 4G -m 8G,"$(EMU) --mut $< --opp human-exome --pre emu/emu_results") - -RESULT_TIMESTAMPS = -ifdef NUM_SPECTRA -emu/emu_$(NUM_SPECTRA).timestamp : emu/mutations.txt.mut.matrix - $(call RUN,-s 4G -m 8G,"$(EMU) --force $(NUM_SPECTRA) --mut $< --opp human-exome --pre emu/emu_results && touch $@") - -RESULT_TIMESTAMPS += emu/emu_$(NUM_SPECTRA).timestamp -endif - -emu/samples.txt : - $(INIT) echo "$(SAMPLE_PAIRS)" | sed 's/ /\n/g' > $@ - -emu/report/index.html : emu/emu_results_bic.txt emu/samples.txt emu/mutations.txt $(RESULT_TIMESTAMPS) - $(call RUN,-s 4G -m 16G,"$(PLOT_EMU) --inPrefix $( \ + $$(@) && \ + cat $$(<) >> $$(@)") + +hr_detect/$1_$2/$1_$2.snv.vcf : summary/tsv/all.tsv + $$(call RUN,-c -n 1 -s 12G -m 16G -v $(SIGNATURE_TOOLS_ENV),"set -o pipefail && \ + $(RSCRIPT) modules/scripts/hr_detect.R \ + --option 1 \ + --sample_name $1_$2") + +hr_detect/$1_$2/$1_$2.snv.vcf.bgz : hr_detect/$1_$2/$1_$2.snv.vcf + $$(call RUN,-c -n 1 -s 12G -m 16G -v $(INNOVATION_ENV),"set -o pipefail && \ + bgzip -c $$(<) > $$(@)") + +hr_detect/$1_$2/$1_$2.snv.vcf.bgz.tbi : hr_detect/$1_$2/$1_$2.snv.vcf.bgz + $$(call RUN,-c -n 1 -s 12G -m 16G -v $(INNOVATION_ENV),"set -o pipefail && \ + tabix -p vcf $$(<)") + +hr_detect/$1_$2/$1_$2.indel.vcf : summary/tsv/all.tsv + $$(call RUN,-c -n 1 -s 12G -m 16G -v $(SIGNATURE_TOOLS_ENV),"set -o pipefail && \ + $(RSCRIPT) modules/scripts/hr_detect.R \ + --option 2 \ + --sample_name $1_$2") + +hr_detect/$1_$2/$1_$2.indel.vcf.bgz : hr_detect/$1_$2/$1_$2.indel.vcf + $$(call RUN,-c -n 1 -s 12G -m 16G -v $(INNOVATION_ENV),"set -o pipefail && \ + bgzip -c $$(<) > $$(@)") + +hr_detect/$1_$2/$1_$2.indel.vcf.bgz.tbi : hr_detect/$1_$2/$1_$2.indel.vcf.bgz + $$(call RUN,-c -n 1 -s 12G -m 16G -v $(INNOVATION_ENV),"set -o pipefail && \ + tabix -p vcf $$(<)") + +hr_detect/$1_$2/$1_$2.snv_repaired.vcf : hr_detect/$1_$2/$1_$2.snv.vcf.bgz hr_detect/$1_$2/$1_$2.snv.vcf.bgz.tbi + $$(call RUN,-c -n 1 -s 12G -m 16G -v 
$(INNOVATION_ENV),"set -o pipefail && \ + bcftools view $$(<) > $$(@)") + +hr_detect/$1_$2/$1_$2.snv_repaired.vcf.bgz : hr_detect/$1_$2/$1_$2.snv_repaired.vcf + $$(call RUN,-c -n 1 -s 12G -m 16G -v $(INNOVATION_ENV),"set -o pipefail && \ + bgzip -c $$(<) > $$(@)") + +hr_detect/$1_$2/$1_$2.snv_repaired.vcf.bgz.tbi : hr_detect/$1_$2/$1_$2.snv_repaired.vcf.bgz + $$(call RUN,-c -n 1 -s 12G -m 16G -v $(INNOVATION_ENV),"set -o pipefail && \ + tabix -p vcf $$(<)") + +hr_detect/$1_$2/$1_$2.indel_repaired.vcf : hr_detect/$1_$2/$1_$2.indel.vcf.bgz hr_detect/$1_$2/$1_$2.indel.vcf.bgz.tbi + $$(call RUN,-c -n 1 -s 12G -m 16G -v $(INNOVATION_ENV),"set -o pipefail && \ + bcftools view $$(<) > $$(@)") + + +hr_detect/$1_$2/$1_$2.indel_repaired.vcf.bgz : hr_detect/$1_$2/$1_$2.indel_repaired.vcf + $$(call RUN,-c -n 1 -s 12G -m 16G -v $(INNOVATION_ENV),"set -o pipefail && \ + bgzip -c $$(<) > $$(@)") + +hr_detect/$1_$2/$1_$2.indel_repaired.vcf.bgz.tbi : hr_detect/$1_$2/$1_$2.indel_repaired.vcf.bgz + $$(call RUN,-c -n 1 -s 12G -m 16G -v $(INNOVATION_ENV),"set -o pipefail && \ + tabix -p vcf $$(<)") + +hr_detect/$1_$2/$1_$2.cn.txt : facets/cncf/$1_$2.txt + $$(call RUN,-c -n 1 -s 12G -m 16G -v $(SIGNATURE_TOOLS_ENV),"set -o pipefail && \ + $(RSCRIPT) modules/scripts/hr_detect.R \ + --option 3 \ + --sample_name $1_$2") + +hr_detect/$1_$2/$1_$2.sv.bedpe : hr_detect/$1_$2/$1_$2.merged.bedpe + $$(call RUN,-c -n 1 -s 12G -m 16G -v $(SIGNATURE_TOOLS_ENV),"set -o pipefail && \ + $(RSCRIPT) modules/scripts/hr_detect.R \ + --option 4 \ + --sample_name $1_$2") + +hr_detect/$1_$2/$1_$2.png : hr_detect/$1_$2/$1_$2.snv_repaired.vcf.bgz.tbi hr_detect/$1_$2/$1_$2.indel_repaired.vcf.bgz.tbi hr_detect/$1_$2/$1_$2.cn.txt hr_detect/$1_$2/$1_$2.sv.bedpe + $$(call RUN,-c -n 1 -s 12G -m 16G -v $(SIGNATURE_TOOLS_ENV),"set -o pipefail && \ + $(RSCRIPT) modules/scripts/hr_detect.R \ + --option 5 \ + --sample_name $1_$2 && \ + mv hr_detect/$1_$2/$1_$2.genomePlot.png $$(@)") + +hr_detect/$1_$2/$1_$2.svg : 
hr_detect/$1_$2/$1_$2.snv_repaired.vcf.bgz.tbi hr_detect/$1_$2/$1_$2.indel_repaired.vcf.bgz.tbi hr_detect/$1_$2/$1_$2.cn.txt hr_detect/$1_$2/$1_$2.sv.bedpe + $$(call RUN,-c -n 1 -s 12G -m 16G -v $(SIGNATURE_TOOLS_ENV),"set -o pipefail && \ + $(RSCRIPT) modules/scripts/hr_detect.R \ + --option 6 \ + --sample_name $1_$2 && \ + mv hr_detect/$1_$2/$1_$2.genomePlot.svg $$(@)") + +endef +$(foreach pair,$(SAMPLE_PAIRS),\ + $(eval $(call hr-detect,$(tumor.$(pair)),$(normal.$(pair))))) + +hr_detect/hrdetect_smry.txt : $(foreach pair,$(SAMPLE_PAIRS),hr_detect/$(pair)/$(pair).sv.bedpe) $(foreach pair,$(SAMPLE_PAIRS),hr_detect/$(pair)/$(pair).snv_repaired.vcf.bgz) $(foreach pair,$(SAMPLE_PAIRS),hr_detect/$(pair)/$(pair).snv_repaired.vcf.bgz.tbi) $(foreach pair,$(SAMPLE_PAIRS),hr_detect/$(pair)/$(pair).indel_repaired.vcf.bgz) $(foreach pair,$(SAMPLE_PAIRS),hr_detect/$(pair)/$(pair).indel_repaired.vcf.bgz.tbi) $(foreach pair,$(SAMPLE_PAIRS),hr_detect/$(pair)/$(pair).cn.txt) + $(call RUN, -c -n 4 -s 6G -m 9G -v $(SIGNATURE_TOOLS_ENV),"set -o pipefail && \ + $(RSCRIPT) modules/scripts/hr_detect.R --option 7 --sample_name '$(SAMPLE_PAIRS)'") + +hr_detect/signatures_smry.txt : $(foreach pair,$(SAMPLE_PAIRS),hr_detect/$(pair)/$(pair).sv.bedpe) $(foreach pair,$(SAMPLE_PAIRS),hr_detect/$(pair)/$(pair).snv_repaired.vcf.bgz) $(foreach pair,$(SAMPLE_PAIRS),hr_detect/$(pair)/$(pair).snv_repaired.vcf.bgz.tbi) $(foreach pair,$(SAMPLE_PAIRS),hr_detect/$(pair)/$(pair).indel_repaired.vcf.bgz) $(foreach pair,$(SAMPLE_PAIRS),hr_detect/$(pair)/$(pair).indel_repaired.vcf.bgz.tbi) $(foreach pair,$(SAMPLE_PAIRS),hr_detect/$(pair)/$(pair).cn.txt) + $(call RUN, -c -n 4 -s 6G -m 9G -v $(SIGNATURE_TOOLS_ENV),"set -o pipefail && \ + $(RSCRIPT) modules/scripts/hr_detect.R --option 8 --sample_name '$(SAMPLE_PAIRS)'") + +..DUMMY := $(shell mkdir -p version; \ + R --version &> version/hr_detect.txt;) +.DELETE_ON_ERROR: +.SECONDARY: +.PHONY: hr_detect diff --git a/signatures/mut_sig.mk b/signatures/mut_sig.mk 
deleted file mode 100644 index b95fbaae..00000000 --- a/signatures/mut_sig.mk +++ /dev/null @@ -1,28 +0,0 @@ -include modules/Makefile.inc -include modules/variant_callers/gatk.inc - -LOGDIR = log/mut_sig.$(NOW) - -VCF2VRANGES = $(RSCRIPT) modules/signatures/vcf_2_vranges.R -KNIT = $(RSCRIPT) modules/scripts/knit.R -ALEXANDROV_DATA = $(HOME)/share/reference/sanger_30_mutsig_prob.txt -MUTSIG_REPORT = modules/signatures/mut_sig_report.Rmd -MUTSIG_REPORT_OPTS = --name $(PROJECT_NAME) \ - --alexandrovData $(ALEXANDROV_DATA) \ - $(if $(TARGETS_FILE),--targetBed $(TARGETS_FILE)) - -SNV_TYPE ?= mutect - -.SECONDARY: -.DELETE_ON_ERROR: -.PHONY: mutect_mutsig_reports - -mutect_mutsig_reports : mutsig_report/mutect/mutsig_report.timestamp - -mutsig_report/mutect/mutsig_report.timestamp : $(foreach pair,$(SAMPLE_PAIRS),mutsig_report/vrange/$(pair).$(SNV_TYPE).ft.VRanges.Rdata) - $(call RUN,-N mutect_mutsig_report -v $(MUTSIG_REPORT_ENV) -n 4 -s 3G -m 5G,"$(KNIT) $(MUTSIG_REPORT) $(@D) --ncores 4 --outDir $(@D) $(MUTSIG_REPORT_OPTS) $^ && touch $@") - -mutsig_report/vrange/%.VRanges.Rdata : vcf/%.vcf - $(call RUN,-v $(MUTSIG_REPORT_ENV) -s 7G -m 10G,"$(VCF2VRANGES) --genome $(REF) --outFile $@ $<") - -include modules/vcf_tools/vcftools.mk diff --git a/signatures/mut_sig_report.Rmd b/signatures/mut_sig_report.Rmd deleted file mode 100644 index 11db1d82..00000000 --- a/signatures/mut_sig_report.Rmd +++ /dev/null @@ -1,330 +0,0 @@ -```{r setup, include = F} -options(useHTTPS = F) -library("optparse"); -library("VariantAnnotation"); -library("reshape") -library("boot") -library("plyr") -library("dplyr") -library("ggplot2") -library("RColorBrewer") -library("reshape2") -library("SomaticSignatures") -library("doMC") -library("foreach") -library("NMF") -library("gridExtra") -library("gplots") -library("magrittr") - - -optList <- list( - make_option("--name", default = '', type = "character", action = "store", help = "report name"), - make_option("--ncores", default = 4, type = 
"integer", action = "store", help = "number of cores"), - make_option("--alexandrovData", default = '~/share/reference/Alexandrov_NMF_signatures.txt', type = "character", action = "store", help = "alexandrov nmf signatures"), - make_option("--genome", default = 'b37', help = "reference genome"), - make_option("--vrangeDir", default = NULL, type = "character", action = "store", help = "input directory"), - make_option("--outDir", default = NULL, type = "character", action = "store", help = "output directory"), - make_option("--targetBed", default = NULL, type = "character", action = "store", help = "target intervals in bed format")) - -parser <- OptionParser(usage = "%prog [options] [VRange file(s)]", option_list = optList); -arguments <- parse_args(parser, positional_arguments = T, args = args); -opt <- arguments$options; - -if (!is.null(opt$vrangeDir)) { - vrFiles <- list.files(path = opt$vrangeDir, - pattern = '.*\\.VRanges\\.Rdata', - full.names = T) -} -if (length(arguments$args) > 1) { - vrFiles <- arguments$args -} -if (length(vrFiles) < 1) { - cat("Need VRange file(s)\n"); - print_help(parser); - stop(); -} - -outFile <- opt$outFile -if (opt$genome == "b37" || opt$genome == "hg19") { - library("BSgenome.Hsapiens.UCSC.hg19"); - library("TxDb.Hsapiens.UCSC.hg19.knownGene") - genome <- BSgenome.Hsapiens.UCSC.hg19 - txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene - genomeName <- 'hg19' -} else if (opt$genome == "mm10" || opt$genome == "GRCm38") { - library("TxDb.Mmusculus.UCSC.mm10.knownGene") - library("BSgenome.Mmusculus.UCSC.mm10"); - genome <- BSgenome.Mmusculus.UCSC.mm10 - txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene - genomeName <- 'mm10' -} - -txdb <- renameSeqlevels(txdb, sub('chr', '', seqlevels(txdb))) -txByGenes <- transcriptsBy(txdb, 'gene') -bases <- c("A", "C", "G", "T") - -if (!is.null(opt$targetBed)) { - bed <- ucsc(import(opt$targetBed)) - k3t <- kmerFrequency(genome, 1e5, 3, bed) - k3wg <- kmerFrequency(genome, 1e5, 3) - norms <- k3wg[names(k3t)] / k3t 
-} - -registerDoMC(opt$ncores) - -theme_set(theme_bw(base_size = 18)) -``` - -# `r opt$name` Mutational Signature Report ---- -### Raymond Lim - -```{r createMotifMatrices, include = F, cache = F} -vrs <- list() -mm <- list() -mmNorm <- list() -mmTranscribed <- list() -mmUntranscribed <- list() -for (vrFile in vrFiles) { - s <- sub('\\..*', '', vrFile) - s <- sub('.*/', '', s) - load(vrFile) - vrs[[s]] <- vr -} -vrs <- VRangesList(vrs) -allVr <- unlist(vrs) - -mm <- motifMatrix(allVr, normalize = F) -mmNorm <- motifMatrix(allVr, normalize = T) -x <- subset(allVr, allVr$transcribed) -sampleNames(x) <- factor(as.factor(sampleNames(x))) -mmTranscribed <- motifMatrix(x, normalize = T) -x <- subset(allVr, !allVr$transcribed) -sampleNames(x) <- factor(as.factor(sampleNames(x))) -mmUntranscribed <- motifMatrix(x, normalize = T) -if (!is.null(opt$targetBed)) { - mmNorm <- normalizeMotifs(mmNorm, norms) - mmTranscribed <- normalizeMotifs(mmTranscribed, norms) - mmUntranscribed <- normalizeMotifs(mmUntranscribed, norms) -} -``` - -```{r writeMotifMatrices} -if (!is.null(opt$outDir)) { - fn <- paste(opt$outDir, "/mm.tsv", sep = "") - write.table(mm, file = fn, quote = F, sep = '\t') - fn <- paste(opt$outDir, "/mm_transcribed.tsv", sep = "") - write.table(mmTranscribed, file = fn, quote = F, sep = '\t') - fn <- paste(opt$outDir, "/mm_untranscribed.tsv", sep = "") - write.table(mmUntranscribed, file = fn, quote = F, sep = '\t') - fn <- paste(opt$outDir, "/mm_norm.tsv", sep = "") - write.table(mmNorm, file = fn, quote = F, sep = '\t') -} -``` - -```{r loadAlexandrovData} -alexandrov <- read.table(opt$alexandrovData, sep = '\t', header = T, as.is = T) -rownames(alexandrov) <- paste(sub('>', '', as.character(alexandrov$Substitution.Type)), - ' ', subseq(as.character(alexandrov$Trinucleotide), 1, 1), '.', - subseq(as.character(alexandrov$Trinucleotide), 3, 3), sep = '') - -alexandrovM <- alexandrov[rownames(mm), grepl('Signature', colnames(alexandrov))] - -``` - -```{r nmf, 
fig.width = 10, fig.height = 10} -solveNMF <- function(x, signatures){ - coef <- fcnnls(x, signatures[rownames(x),, drop = F]) # reorder the rownames of the in matrix - colsum <- apply(coef$x, 2, sum) - coef_x_scaled <- scale(coef$x, center = F, scale = colsum) - return(coef_x_scaled) -} -nmfCoefs <- solveNMF(as.matrix(alexandrovM), as.matrix(mm)) - -if (!is.null(opt$outDir)) { - fn <- paste(opt$outDir, "/nnls_coefs.tsv", sep = "") - write.table(nmfCoefs, file = fn, quote = F, sep = '\t') -} - -cols <- brewer.pal(9, 'Blues') -if (ncol(nmfCoefs) > 2) { - heatmap.2(nmfCoefs, trace = 'none', margins = c(13, 8), cexCol = 0.8, col = cols, Rowv = F, Colv = F, dendrogram = 'none') -} - -par(mar = c(10,5,5,5)) -for (s in colnames(nmfCoefs)) { - barplot(nmfCoefs[,s], main = s, las = 2) -} -``` - - -```{r pie, eval = F, echo = F} -for (s in names(vrs)) { - vr <- vrs[[s]] - cols <- c("C>A" = "lightblue", "C>G" = "black", "C>T" = "red", "T>A" = "grey", "T>C" = "lightgreen", "T>G" = "pink") - main <- paste(s, " (n = ", length(vr), ")", sep = '') - pie(table(vr$alteration), col = cols, main = main) - if (sum(vr$transcribed, na.rm = T) > 0 && sum(!vr$transcribed, na.rm = T) > 0) { - main <- paste('transcribed', s, " (n=", sum(vr$transcribed, na.rm = T), ")", sep = '') - pie(table(subset(vr, vr$transcribed)$alteration), col = cols, main = main) - main <- paste('untranscribed', s, " (n=", sum(!info(vcf)$transcribed, na.rm = T), ")", sep = '') - pie(table(subset(vr, !vr$transcribed)$alteration), col = cols, main = paste('untranscribed', main)) - } -} -``` - - -```{r mutCountPlots, fig.height = 7, fig.width = 28} -plotMutBarplot <- function(samp, mm) { - cols <- c("C>A" = "lightblue", "C>G" = "black", "C>T" = "red", "T>A" = "grey", "T>C" = "lightgreen", "T>G" = "pink") - mdf <- melt(mm, varnames = c('motif', 'sample')) - mdf$alteration = sub("([ACGTN])([ACGTN]) .+", "\\1>\\2", - mdf$motif) - mdf$context = sub("[ACGTN][ACGTN] (.+)", "\\1", mdf$motif) - tit <- paste(samp, ' (n = ', 
sum(mdf %>% filter(sample == samp) %$% value, na.rm = T), ")", sep = '') - mdf %>% filter(sample == samp) %>% - ggplot(aes(x = context, y = value, fill = alteration)) + - geom_bar(stat = 'identity') + - facet_grid(~ alteration, switch = 'x') + - xlab("") + ylab("") + ggtitle(tit) + - theme(axis.text.x = element_text(angle = 90, hjust = 1), - legend.position = 'none', - panel.border = element_blank(), - axis.line.x = element_line(color = 'black', size = 1), - axis.line.y = element_line(color = 'black', size = 1), - panel.grid = element_blank(), - strip.background = element_blank(), - strip.text.x = element_text(size = 20)) + - scale_fill_manual(values = cols) -} - -plotMutBarplotStranded <- function(samp, mmTranscribed, mmUntranscribed) { - mdfTranscribed <- melt(mmTranscribed, varnames = c('motif', 'sample')) - mdfTranscribed$transcribed <- T - mdfUntranscribed <- melt(mmUntranscribed, varnames = c('motif', 'sample')) - mdfUntranscribed$transcribed <- F - mdf <- rbind(mdfTranscribed, mdfUntranscribed) - mdf$alteration = sub("([ACGTN])([ACGTN]) .+", "\\1>\\2", - mdf$motif) - mdf$context = sub("[ACGTN][ACGTN] (.+)", "\\1", mdf$motif) - tit <- paste(samp, ' (n = ', sum(mdf %>% filter(sample == samp) %$% value, na.rm = T), ")", sep = '') - mdf %>% filter(sample == samp) %>% - ggplot(aes(x = context, y = value, fill = transcribed)) + - geom_bar(stat = 'identity', position = 'dodge') + - facet_grid(~ alteration, switch = 'x') + - theme(axis.text.x = element_text(angle = 90, hjust = 1), - panel.border = element_blank(), - axis.line.x = element_line(color = 'black', size = 1), - axis.line.y = element_line(color = 'black', size = 1), - panel.grid = element_blank(), - strip.background = element_blank(), - strip.text.x = element_text(size = 20)) + - scale_fill_manual(name = "", values = c('blue', 'red'), - labels = c("Transcribed strand", - 'Untranscribed strand')) + - xlab("") + ylab("") + ggtitle(tit) -} - -plotMutPiechart <- function(samp, mm) { - mdf <- melt(mm, varnames 
= c('motif', 'sample')) - mdf$alteration = sub("([ACGTN])([ACGTN]) .+", "\\1>\\2", - mdf$motif) - mdf$context = sub("[ACGTN][ACGTN] (.+)", "\\1", mdf$motif) - cols <- c("C>A" = "lightblue", "C>G" = "black", "C>T" = "red", "T>A" = "grey", "T>C" = "lightgreen", "T>G" = "pink") - mdf %>% filter(sample == samp) %>% group_by(alteration) %>% summarise(value = sum(value)) %>% - ggplot(aes(x = "", y = value, fill = alteration)) + - geom_bar(width = 1, stat = 'identity') + scale_fill_manual(values = cols) + - coord_polar("y") + xlab("") + ylab("") -} - -for (s in colnames(mm)) { - p1 <- plotMutBarplot(s, mm) - p2 <- plotMutPiechart(s, mm) - grid.arrange(p1, p2, ncol = 2, widths = c(4, 2)) - - p1 <- plotMutBarplot(s, mmNorm) - p2 <- plotMutPiechart(s, mmNorm) - grid.arrange(p1, p2, ncol = 2, widths = c(4, 2)) -} - -for (s in colnames(mm)) { - vr <- vrs[[s]] - if (sum(vr$transcribed, na.rm = T) > 0 && sum(!vr$transcribed, na.rm = T) > 0) { - p1 <- plotMutBarplotStranded(s, mmTranscribed, mmUntranscribed) - p2 <- plotMutPiechart(s, mmTranscribed) - p3 <- plotMutPiechart(s, mmUntranscribed) - lom <- matrix(c(1,1, 2,3), nrow = 2, ncol = 2) - grid.arrange(p1, p2, p3, layout_matrix = lom, widths = c(4, 2)) - } -} - -``` - - -```{r bootPlot, fig.width = 12} -bootFun <- function(x) { - baseMotif = subseq(as.character(x$motif), 4, 6) - subseq(baseMotif, 2, 2) = subseq(as.character(x$motif), 1, 1) - if (!is.null(opt$targetBed)) { - nval <- x$value * norms[baseMotif] - } else { - nval <- x$value - } - nval <- nval / sum(nval) - apply(alexandrovM, 2, function(y) fcnnls(nval, y)$x) -} - -ranFun <- function(p, d) { - # create a vector of same # of mutations using original context probabilities - s <- sample.int(nrow(p), size = sum(p$value), replace = T, prob = p$value / sum(p$value)) - pp <- p - tab <- table(p[s, "motif"]) - pp[match(names(tab), pp$motif), "value"] <- tab - pp -} - -mm <- motifMatrix(allVr, normalize = F) -bootDf <- foreach(s = colnames(mm), .combine = 'rbind', 
.errorhandling = 'remove') %dopar% { - mdf <- melt(mm[, s, drop = F], varnames = c('motif', 'sample')) - mdf$alteration = sub("([ACGTN])([ACGTN]) .+", "\\1>\\2", - mdf$motif) - mdf$context = sub("[ACGTN][ACGTN] (.+)", "\\1", mdf$motif) - boots <- boot(mdf, bootFun, R = 1000, ran.gen = ranFun, sim = 'parametric', parallel = 'multicore') - boots.sd <- apply(boots$t, 2, sd) - ci <- norm.ci(boots, index = 1:ncol(alexandrovM)) - Df <- as.data.frame(boots$t0, row.names = 1:ncol(alexandrovM)) - colnames(Df) <- 'bootCor' - Df$signature <- sub('Signature.', '', colnames(alexandrovM)) - Df$bootSD <- boots.sd - Df$lowerCI95 <- ci[,2] - Df$upperCI95 <- ci[,3] - n <- colnames(alexandrovM) - Df$votes <- table(factor(n[apply(boots$t, 1, which.max)], levels = n)) - Df$sample <- s - Df$nCalls <- sum(mm[, s]) - Df$significant <- ! with(Df, bootCor > lowerCI95 & bootCor < upperCI95) - Df$rank = rank(-Df$votes, ties.method = 'max') - Df -} - -fn <- paste(opt$outDir, "/vote_results.tsv", sep = "") -write.table(bootDf, row.names = F, quote = F, sep = '\t', file = fn) - -maxBootDf <- bootDf %>% group_by(sample) %>% filter(votes == max(votes)) -fn <- paste(opt$outDir, "/max_vote_results.tsv", sep = "") -write.table(maxBootDf, row.names = F, quote = F, sep = '\t', file = fn) - -for (s in colnames(mm)) { - bdf <- filter(bootDf, sample == s) - cols <- ifelse(bdf$signficant, 'red', 'grey') - n <- sub('Signature.', '', colnames(alexandrovM)) - par(mfrow = c(2,1), mar = c(3,5,3,3)) - barCenters <- barplot(bdf$bootCor, ylim = c(min(bdf$bootCor - bdf$bootSD), max(bdf$bootCor + bdf$bootSD)), names.arg = n, col = cols, - main = s, ylab = 'Correlation') - segments(barCenters, bdf$bootCor - bdf$bootSD, barCenters, bdf$bootCor + bdf$bootSD, lwd = 1) - # vote barplot - barplot(bdf$votes, ylab = '# Votes', names.arg = bdf$signature) -} -``` - diff --git a/signatures/nmfMutSig.mk b/signatures/nmfMutSig.mk deleted file mode 100644 index 3b69c1ee..00000000 --- a/signatures/nmfMutSig.mk +++ /dev/null @@ 
-1,45 +0,0 @@ -# Run wtsi NMF mutation sig on tumour/normal data -# Detect mutation signatures using mutect calls -##### DEFAULTS ###### - -include modules/Makefile.inc - -LOGDIR = log/nmf_mutsig.$(NOW) - -EMU_PREPARE = $(HOME)/usr/bin/EMu-prepare -MATLABPATH := modules/mut_sigs -ifeq ($(HOSTNAME),ika.cbio.mskcc.org) -export MATLAB_BIN := /usr/local/MATLAB/R2013a/bin/matlab -else -export MATLAB_BIN := /usr/local/bin/matlab -endif -MATLAB = export MATLABPATH=$(MATLABPATH); $(MATLAB_BIN) -nodisplay -nosplash - -NMF_DIR = $(HOME)/usr/nmf_mut_sig -NMF_TYPES_FILE = $(NMF_DIR)/types.mat - -NMF_MIN_SIG = 1 -NMF_MAX_SIG = 4 - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: all - -ALL := nmf_mutsig/mutations.txt.mut.matrix nmf_mutsig/results.mat nmf_mutsig/plot.timestamp - -all : $(ALL) - -nmf_mutsig/mutations.txt : alltables/allTN.mutect_snps.tab.txt - $(INIT) awk 'NR > 1 { sub("X", "23", $$3); sub("Y", "24", $$3); sub("MT", "25", $$3); print $$1 "_" $$2, $$3, $$4, $$6 ">" $$7 }' $< > $@ - -nmf_mutsig/mutations.txt.mut.matrix : nmf_mutsig/mutations.txt - $(INIT) $(EMU_PREPARE) --chr $(EMU_REF_DIR) --mut $< --pre $(@D) --regions $(EMU_TARGETS_FILE) - -nmf_mutsig/input.mat : nmf_mutsig/mutations.txt.mut.matrix - $(INIT) $(MATLAB) -r "createNMFinput $< $(<:.mut.matrix=.samples) $(NMF_TYPES_FILE) $(PROJECT_NAME) $@" - -nmf_mutsig/results.mat : nmf_mutsig/input.mat - $(INIT) $(MATLAB) -r "runNMF $< $(@:.mat=) $(NMF_DIR) $(NMF_MIN_SIG) $(NMF_MAX_SIG)" - -nmf_mutsig/plot.timestamp : nmf_mutsig/results.mat - $(INIT) $(MATLAB) -r "plotNMF $(<:.mat=) $(NMF_DIR) $(NMF_MIN_SIG) $(NMF_MAX_SIG)" && touch $@ diff --git a/signatures/plotNMF.m b/signatures/plotNMF.m deleted file mode 100644 index a1420ccb..00000000 --- a/signatures/plotNMF.m +++ /dev/null @@ -1,20 +0,0 @@ -function plotNMF( prefix, nmfDir, minNumSig, maxNumSig ) -% run NMF -addpath(strcat(nmfDir, '/source/')); -addpath(strcat(nmfDir, '/plotting/')); -mkdir('temp'); - -minNumSig = str2num(minNumSig); -maxNumSig = 
str2num(maxNumSig); - -for totalSignatures = minNumSig : maxNumSig - tsPrefix = strcat(prefix, '_ts', num2str(totalSignatures)); - inputFile = strcat(tsPrefix, '.mat'); - S = load(inputFile); - plotSignaturesToFile(tsPrefix, S.processes, S.input, S.allProcesses, S.idx, S.processStabAvg); - plotSignaturesExposureInSamplesToFile(tsPrefix, S.exposures, S.input); -end - -quit -end - diff --git a/signatures/plot_emu_signatures.R b/signatures/plot_emu_signatures.R deleted file mode 100644 index 6b08a32f..00000000 --- a/signatures/plot_emu_signatures.R +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("VariantAnnotation")) -suppressPackageStartupMessages(library("RColorBrewer")) -suppressPackageStartupMessages(library("hwriter")) -suppressPackageStartupMessages(library("org.Hs.eg.db")) - -options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) - -optList <- list( - make_option("--outDir", default = NULL, help = "output dir (required)"), - make_option("--mutations", default = NULL, help = "mutations file (required)"), - make_option("--samples", default = NULL, help = "samples file"), - make_option("--sampleSubset", default = NULL, help = "sample subset file: list of samples to plot contribution"), - make_option("--inPrefix", default = NULL, help = "EMu input prefix (required)")) - -parser <- OptionParser(usage = "%prog [options]", option_list = optList); - -arguments <- parse_args(parser, positional_arguments = T); -opt <- arguments$options; - -if (is.null(opt$outDir)) { - cat("Need output dir\n"); - print_help(parser); - stop(); -} else if (is.null(opt$inPrefix)) { - cat("Need EMu input prefix\n"); - print_help(parser); - stop(); -} else if (is.null(opt$mutations)) { - cat("Need EMu mutations file\n"); - print_help(parser); - stop(); -} else { - files <- arguments$args; -} - -glob <- paste(opt$inPrefix, '*_ml_spectra.txt', sep = '') -spectraFiles <- 
Sys.glob(glob) - -glob <- paste(opt$inPrefix, '*_map_activities.txt', sep = '') -activityFiles <- Sys.glob(glob) - -glob <- paste(opt$inPrefix, '*_assigned.txt', sep = '') -assignedFiles <- Sys.glob(glob) - -pg <- openPage('index.html', dirname = opt$outDir, title = 'EMu results') - -set.seed(002) -palette(sample(rainbow(30))) - -for (fn in spectraFiles) { - spectra <- read.table(fn, sep = ' ') - spectra <- spectra[,-97] # remove empty col - for (i in 1:nrow(spectra)) { - ofn <- paste(opt$outDir, "/", basename(fn), sep = '') - ofn <- sub('\\.txt$', paste("_", i, '.pdf', sep = ''), ofn) - pdf(ofn, height = 8, width = 10) - par(cex = 1.5) - cols <- rep(c('LightBlue', 'Black', 'Red', 'Grey', 'Green', 'Magenta'), each = 16) - barplot(t(spectra[i,]) * 100, beside = T, col = cols, border = cols, xaxt = 'n', main = paste("Signature", i), col.main = i, ylab = "% of mutations") - labs <- c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G") - mtext(labs, side = 1, at = 1:6 * 16 - 7.5) - null <- dev.off() - } - - for (i in 1:nrow(spectra)) { - ofn <- paste(opt$outDir, "/", basename(fn), sep = '') - ofn <- sub('\\.txt$', paste("_", i, '.png', sep = ''), ofn) - png(ofn, height = 500, width = 800, type = 'cairo-png') - par(cex = 2) - cols <- rep(c('LightBlue', 'Black', 'Red', 'Grey', 'Green', 'Magenta'), each = 16) - barplot(t(spectra[i,]) * 100, beside = T, col = cols, border = cols, xaxt = 'n', main = paste("Signature", i), ylab = "% of mutations", col.main = i) - labs <- c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G") - mtext(labs, side = 1, at = 1:6 * 16 - 7.5) - null <- dev.off() - hwriteImage(basename(ofn), pg, br = T) - } -} - -samples <- scan(opt$samples, what = 'character') -sampleSubset <- scan(opt$sampleSubset, what = 'character') - -mutations <- read.table(opt$mutations, sep = ' ') -colnames(mutations) <- c('sample', 'chr', 'pos', 'snv') -mutations <- subset(mutations, sample %in% sampleSubset) - -for (fn in assignedFiles) { - assigned <- read.table(fn, sep = ' ') - assigned <- 
as.matrix(assigned[,-ncol(assigned)]) - rownames(assigned) <- samples - assigned <- assigned[sampleSubset, ] - - ofn <- paste(opt$outDir, "/", basename(fn), sep = '') - ofn <- sub('\\.txt$', '.png', ofn) - - tab <- table(factor(mutations$sample)) - tab <- tab[sampleSubset] - oo <- order(tab) - assigned <- assigned[oo, ] - - png(ofn, height = 1000, width = 1000, type = 'cairo-png') - par(mar = c(5, 10, 5, 1), cex = 1, mfrow = c(1, 2), cex = 1.5) - barplot(t(assigned / rowSums(assigned)), col = 1:5, space = 0, border = F, horiz = T, las = 2, xlab = "Contribution of signature") - par(mar = c(5,1,5,5)) - barplot(tab[oo], las = 2, horiz = T, space = 0, border = F, xlab = "Number of Mutations", axisnames = F) - null <- dev.off() - hwriteImage(basename(ofn), pg, br = T) - - ofn <- sub('\\.png$', '.pdf', ofn) - pdf(ofn, height = 12, width = 12) - par(mar = c(5, 10, 5, 1), cex = 1, mfrow = c(1, 2), cex = 1.5) - barplot(t(assigned / rowSums(assigned)), col = 1:5, space = 0, border = F, horiz = T, las = 2, xlab = "Contribution of signature") - par(mar = c(5,1,5,5)) - barplot(tab[oo], las = 2, horiz = T, space = 0, border = F, xlab = "Number of Mutations", axisnames = F) - null <- dev.off() -} - - -closePage(pg) diff --git a/signatures/plot_signatures.R b/signatures/plot_signatures.R deleted file mode 100644 index 86972ef4..00000000 --- a/signatures/plot_signatures.R +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("readr")) -suppressPackageStartupMessages(library("deconstructSigs")) -suppressPackageStartupMessages(library("dplyr")) -suppressPackageStartupMessages(library("magrittr")) -suppressPackageStartupMessages(library("ggplot2")) -suppressPackageStartupMessages(library("RColorBrewer")) -suppressPackageStartupMessages(library("Palimpsest")) -suppressPackageStartupMessages(library("BSgenome.Hsapiens.UCSC.hg19")) - -if (!interactive()) { - options(warn = -1, error = quote({ 
traceback(); q('no', status = 1) })) -} - -args_list <- list( - make_option("--sample_name", default = NA, type = 'character', help = "tumor sample name") - ) - -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -'plot96_mutation_spectrum' <- function (vcf, sample.col = "sample", mutcat3.col = "mutcat3", - ymax = NULL, averageProp = FALSE, plot.file = NULL) -{ - bases <- c("A", "C", "G", "T") - ctxt16 <- paste(rep(bases, each = 4), rep(bases, 4), sep = ".") - mt <- c("CA", "CG", "CT", "TA", "TC", "TG") - types96 <- paste(rep(mt, each = 16), rep(ctxt16, 6), sep = "_") - types96 <- sapply(types96, function(z) { - sub("\\.", substr(z, 1, 1), z) - }) - context <- substr(types96, 4, 6) - nsamp <- length(unique(vcf[, sample.col])) - if (averageProp & nsamp > 1) { - tmp <- makeMutypeMatFromVcf(vcf, sample.col = "CHCID", - mutcat.col = "mutcat3", mutypes = types96) - freq <- apply(tmp, 1, mean) - } - else { - freq <- sapply(types96, function(z) { - mean(vcf[, mutcat3.col] == z, na.rm = T) - }) - } - if (!is.null(plot.file)) { - pdf(plot.file, width = 24, height = 5) - } - col96 <- c(rep("skyblue3", 16), rep("black", 16), rep("red", - 16), rep("grey", 16), rep("green", 16), rep("pink", 16)) - labs <- c(rep("C>A", 16), rep("C>G", 16), rep("C>T", 16), - rep("T>A", 16), rep("T>C", 16), rep("T>G", 16)) - if (is.null(ymax)) { - ymax <- 100*ceiling(max(freq) * 100)/100 - ymax <- ifelse(ymax>10, 30, 10) - } - bp <- barplot(freq*100, col = col96, border = col96, las = 2, - width = 1, space = .35, yaxt = "n", xaxt = "n", ylim = c(0, - ymax * 1.2)) - title(ylab = "Fraction of mutations (%)", mgp = c(1, 1, 0), - cex.lab = 1.6) - axis(1, at = bp, labels = context, pos = 0, las = 2, cex.axis = 1.5, - tick = F, cex.axis = 1, lwd=-1) - if (ymax==40) { - axis(2, at = c(0,10,20,30,40), labels=c(0,10,20,30,40), pos = 0, las = 1, cex.axis = 1.5) - } else if (ymax==30) { - axis(2, at = 
c(0,5,10,15,20,25,30), labels=c(0,5,10,15,20,25,30), pos = 0, las = 1, cex.axis = 1.5) - } else if (ymax==20) { - axis(2, at = c(0,5,10,15,20), labels=c(0,5,10,15,20), pos = 0, las = 1, cex.axis = 1.5) - } else if (ymax==10) { - axis(2, at = c(0,2,4,6,8,10), labels=c(0,2,4,6,8,10), pos = 0, las = 1, cex.axis = 1.5) - } - for (i in seq(1, 81, by = 16)) { - rect(bp[i], par()$usr[4], bp[i + 15], par()$usr[4] - - 0.05 * diff(par()$usr[3:4]), col = col96[i], border = col96[i]) - text((bp[i] + bp[i + 15])/2, par()$usr[4] + 0.09 * diff(par()$usr[3:4]), - labels = labs[i], xpd = TRUE, cex = 2) - } - if (!is.null(plot.file)) { - dev.off() - } -} - -load(file=paste0("deconstructsigs/signatures/", opt$sample_name, ".RData")) - -## barplot of base changes with 3' and 5' context -colnames(mutation_summary) = c("Sample", "CHROM", "POS", "REF", "ALT") -mutation_summary = cbind(mutation_summary, "Type"=rep("SNV", nrow(mutation_summary))) -vcf = preprocessInput_snv(input_data = mutation_summary, - ensgene = ensgene, - reference_genome = BSgenome.Hsapiens.UCSC.hg19) -patient_ids = unique(vcf$Sample) -pdf(file=paste0("deconstructsigs/plots/context/", opt$sample_name, ".pdf"), width=18, height=5) -plot96_mutation_spectrum(vcf, ymax=20, sample.col = "Sample", plot.file = NULL) -dev.off() - -## pie-charts of signatures -palette = colorRampPalette(brewer.pal(9, "Set1")) -cols = palette(30) -names(cols) = 1:30 - -df = data.frame(percentage = 100*as.numeric(extracted_signatures$weights[1,]), - signature_name = colnames(extracted_signatures$weights)) %>% - mutate(signature_name = as.numeric(gsub(pattern="Signature.", replacement="", signature_name))) %>% - arrange(signature_name) %>% - filter(percentage!=0) %>% - mutate(signature_name = factor(signature_name, ordered=TRUE, levels=sort(signature_name))) %>% - mutate(lab.ypos = cumsum(percentage) - 0.5*percentage) - -plot.0 = ggplot(df, aes(x = "", y = percentage, fill = signature_name)) + - geom_bar(width = 1, stat = "identity", color = 
"white") + - scale_fill_manual(values=cols) + - coord_polar("y", start = 0) + - geom_text(aes(y = lab.ypos, label = paste0(signif(percentage,3), "%")), color = "white") + - guides(fill=guide_legend(title="Signature")) + - theme_void() - -pdf(file=paste0("deconstructsigs/plots/exposures/", opt$sample_name, ".pdf"), width=6, height=6) -print(plot.0) -dev.off() diff --git a/signatures/runNMF.m b/signatures/runNMF.m deleted file mode 100644 index 40d372d7..00000000 --- a/signatures/runNMF.m +++ /dev/null @@ -1,46 +0,0 @@ -function runNMF( inputFile, outputPrefix, nmfDir, minNumSig, maxNumSig ) -% run NMF -addpath(strcat(nmfDir, '/source/')); -addpath(strcat(nmfDir, '/plotting/')); -clc; - -mkdir('temp'); - -minNumSig = str2num(minNumSig); -maxNumSig = str2num(maxNumSig); - -%% Open matlabpool -if ( matlabpool('size') == 0 ) - matlabpool open; % opens the default matlabpool, if it is not already opened -end - -%% Define parameters -iterationsPerCore = 100; -stability = zeros(maxNumSig, 1); -reconstructionError = zeros(maxNumSig, 1); -allOutputFile = strcat(outputPrefix, '.mat'); - -for totalSignatures = minNumSig : maxNumSig - outputFile = strcat(outputPrefix, '_ts', num2str(totalSignatures), '.mat'); - - % Decipher the signatures of mutational processes from catalogues of mutations - [input allProcesses allExposures idx processes exposures processStab processStabAvg] = ... - decipherMutationalProcesses(iterationsPerCore, totalSignatures, inputFile, ... 
- [ outputFile ] ); - % Record the stability and average Frobenius reconstruction error - stability(totalSignatures-minNumSig+1) = mean(processStabAvg); - reconstructionError(totalSignatures-minNumSig+1) = norm(input.originalGenomes - processes*exposures, 'fro'); -end - -%% Plotting the stability and average Frobenius reconstruction error -try %% Some versions of MATLAB plotyy has a bug under linux with -nodisplay -nosplash -nodesktop options - plotSignatureStabilityAndReconstructionToFile(strcat(outputPrefix, '_stab_reconstruction.png'), minNumSig:maxNumSig, stability, reconstructionError, input); -catch ME - %% Do not do anything - just ignore the plot in order to save the final output daya -end - -%% Saving the data -save(allOutputFile); - -quit -end diff --git a/signatures/star_fish.mk b/signatures/star_fish.mk new file mode 100644 index 00000000..3d24b30b --- /dev/null +++ b/signatures/star_fish.mk @@ -0,0 +1,67 @@ +include modules/Makefile.inc + +LOGDIR ?= log/star_fish.$(NOW) + +MIN_SIZE = 1 +MAX_SIZE = 10000000000000000000 + +star_fish : $(foreach pair,$(SAMPLE_PAIRS),star_fish/$(pair)/$(pair).merged_sv.bed) \ + $(foreach pair,$(SAMPLE_PAIRS),star_fish/$(pair)/$(pair).merged_sv.bedpe) \ + $(foreach pair,$(SAMPLE_PAIRS),star_fish/$(pair)/$(pair).merged_cn.txt) \ + star_fish/summary/taskcomplete \ + star_fish/summary/exposures.txt \ + star_fish/summary/features.txt + +define starfish-sv +star_fish/$1_$2/$1_$2.merged_sv.bed : vcf/$1_$2.merged_sv.vcf + $$(call RUN,-c -n 1 -s 4G -m 8G -v $(SURVIVOR_ENV),"set -o pipefail && \ + SURVIVOR vcftobed \ + $$(<) \ + $(MIN_SIZE) \ + $(MAX_SIZE) \ + $$(@)") + +star_fish/$1_$2/$1_$2.merged_sv.bedpe : star_fish/$1_$2/$1_$2.merged_sv.bed + $$(call RUN,-c -n 1 -s 4G -m 8G -v $(STARFISH_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/star_fish.R \ + --option 1 \ + --sample_name $1_$2 \ + --input_file $$(<) \ + --output_file $$(@)") + +star_fish/$1_$2/$1_$2.merged_cn.txt : facets/cncf/$1_$2.txt + $$(call RUN,-c -n 1 -s 
4G -m 8G -v $(STARFISH_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/star_fish.R \ + --option 2 \ + --sample_name $1_$2 \ + --input_file $$(<) \ + --output_file $$(@)") + +endef +$(foreach pair,$(SAMPLE_PAIRS),\ + $(eval $(call starfish-sv,$(tumor.$(pair)),$(normal.$(pair))))) + +star_fish/summary/taskcomplete : $(foreach pair,$(SAMPLE_PAIRS),star_fish/$(pair)/$(pair).merged_sv.bedpe) $(foreach pair,$(SAMPLE_PAIRS),star_fish/$(pair)/$(pair).merged_cn.txt) + $(call RUN, -c -n 1 -s 8G -m 12G -v $(STARFISH_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/star_fish.R \ + --option 3 \ + --sample_name '$(SAMPLE_PAIRS)' \ + --output_file $(@)") + +star_fish/summary/exposures.txt : star_fish/summary/taskcomplete + $(call RUN, -c -n 1 -s 8G -m 12G -v $(STARFISH_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/star_fish.R \ + --option 4 \ + --output_file $(@)") + +star_fish/summary/features.txt : star_fish/summary/taskcomplete + $(call RUN, -c -n 1 -s 8G -m 12G -v $(STARFISH_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/star_fish.R \ + --option 5 \ + --output_file $(@)") + +..DUMMY := $(shell mkdir -p version; \ + $(STARFISH_ENV)/bin/R --version &> version/star_fish.txt;) +.DELETE_ON_ERROR: +.SECONDARY: +.PHONY: star_fish diff --git a/signatures/sv_signature.mk b/signatures/sv_signature.mk new file mode 100644 index 00000000..74656371 --- /dev/null +++ b/signatures/sv_signature.mk @@ -0,0 +1,53 @@ +include modules/Makefile.inc + +LOGDIR ?= log/sv_signature.$(NOW) + +MIN_SIZE = 1 +MAX_SIZE = 100000000000000000000 + +signature_sv : $(foreach pair,$(SAMPLE_PAIRS),sv_signature/$(pair)/$(pair).merged.bed) \ + $(foreach pair,$(SAMPLE_PAIRS),sv_signature/$(pair)/$(pair).merged.bedpe) \ + $(foreach pair,$(SAMPLE_PAIRS),sv_signature/$(pair)/$(pair).merged_exposures.txt) \ + sv_signature/summary/exposures.txt \ + sv_signature/summary/features.txt + +define signature-sv +sv_signature/$1_$2/$1_$2.merged.bed : vcf/$1_$2.merged_sv.vcf + $$(call RUN,-c -n 1 -s 
4G -m 8G -v $(SURVIVOR_ENV),"set -o pipefail && \ + SURVIVOR vcftobed \ + $$(<) \ + $(MIN_SIZE) \ + $(MAX_SIZE) \ + $$(@)") + +sv_signature/$1_$2/$1_$2.merged.bedpe : sv_signature/$1_$2/$1_$2.merged.bed + $$(call RUN,-c -n 1 -s 4G -m 8G,"set -o pipefail && \ + echo \"chrom1 start1 end1 chrom2 start2 end2 sv_id pe_support strand1 strand2 svclass\" > \ + $$(@) && \ + cat $$(<) >> $$(@)") + +sv_signature/$1_$2/$1_$2.merged_exposures.txt : sv_signature/$1_$2/$1_$2.merged.bedpe + $$(call RUN,-c -n 4 -s 2G -m 4G -v $(SIGNATURE_TOOLS_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/sv_signature.R \ + --option 1 \ + --sample_name $1_$2 \ + --input_file $$(<) \ + --output_file sv_signature/$1_$2/$1_$2.merged") + +endef +$(foreach pair,$(SAMPLE_PAIRS),\ + $(eval $(call signature-sv,$(tumor.$(pair)),$(normal.$(pair))))) + +sv_signature/summary/exposures.txt : $(foreach pair,$(SAMPLE_PAIRS),sv_signature/$(pair)/$(pair).merged_exposures.txt) + $(call RUN, -c -n 1 -s 8G -m 12G -v $(SIGNATURE_TOOLS_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/sv_signature.R --option 2 --sample_name '$(SAMPLE_PAIRS)' --output_file $(@)") + +sv_signature/summary/features.txt : $(foreach pair,$(SAMPLE_PAIRS),sv_signature/$(pair)/$(pair).merged_exposures.txt) + $(call RUN, -c -n 1 -s 8G -m 12G -v $(SIGNATURE_TOOLS_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/sv_signature.R --option 3 --sample_name '$(SAMPLE_PAIRS)' --output_file $(@)") + +..DUMMY := $(shell mkdir -p version; \ + $(SURVIVOR_ENV)/bin/SURVIVOR --version &> version/sv_signature.txt;) +.DELETE_ON_ERROR: +.SECONDARY: +.PHONY: signature_sv diff --git a/signatures/vcf_2_vranges.R b/signatures/vcf_2_vranges.R deleted file mode 100644 index 925ef565..00000000 --- a/signatures/vcf_2_vranges.R +++ /dev/null @@ -1,95 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("VariantAnnotation")) -suppressPackageStartupMessages(library("reshape")) 
-suppressPackageStartupMessages(library("boot")) -suppressPackageStartupMessages(library("plyr")) -suppressPackageStartupMessages(library("dplyr")) -suppressPackageStartupMessages(library("ggplot2")) -suppressPackageStartupMessages(library("RColorBrewer")) -suppressPackageStartupMessages(library("reshape2")) -suppressPackageStartupMessages(library("SomaticSignatures")) -suppressPackageStartupMessages(library("foreach")) - -optList <- list( - make_option("--genome", default = 'b37', help = "reference genome"), - make_option("--ignoreFilter", default = F, action = 'store_true', help = "ignore the filter column for vcf files"), - make_option("--outFile", default = NULL, type = "character", action = "store", help = "output directory") - ) - -parser <- OptionParser(usage = "%prog [options] [vcf file(s)]", option_list = optList); -arguments <- parse_args(parser, positional_arguments = T); -opt <- arguments$options; - -if (length(arguments$args) != 1) { - cat("Need vcf file\n"); - print_help(parser); - stop(); -} - -vcfFile <- arguments$args[1] -outFile <- opt$outFile -if (opt$genome == "b37" || opt$genome == "hg19") { - library("BSgenome.Hsapiens.UCSC.hg19"); - library("TxDb.Hsapiens.UCSC.hg19.knownGene") - genome <- BSgenome.Hsapiens.UCSC.hg19 - txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene - genomeName <- 'hg19' - chromosomes <- c(1:22, "X", "Y") - chromosomes <- c(chromosomes, paste('chr', chromosomes, sep = '')) -} else if (opt$genome == "mm10" || opt$genome == "GRCm38") { - library("BSgenome.Mmusculus.UCSC.mm10"); - library("TxDb.Mmusculus.UCSC.mm10.knownGene") - genome <- BSgenome.Mmusculus.UCSC.mm10 - txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene - genomeName <- 'mm10' - chromosomes <- c(1:19, "X", "Y") - chromosomes <- c(chromosomes, paste('chr', chromosomes, sep = '')) -} - -txByGenes <- transcriptsBy(txdb, 'gene') - -temp <- tempfile() -zipped <- bgzip(vcfFile, temp) -idx <- indexTabix(temp, "vcf") -cat('done\n') - -tab <- TabixFile(zipped, idx) -open(tab) - -vcf <- 
readVcf(tab, genomeName) -passIds <- which(rowRanges(vcf)$FILTER == "PASS") -if (nrow(vcf) > 0 && sum(seqnames(vcf) %in% chromosomes) > 0 && - sum(isSNV(vcf)) > 0 && (opt$ignoreFilter | length(passIds) > 0)) { - if (!opt$ignoreFilter) { - vcf <- vcf[passIds, ] - } - vcf <- vcf[isSNV(vcf) & seqnames(vcf) %in% chromosomes] - s <- sub('\\..*', '', vcfFile) - s <- sub('.*/', '', s) - vr <- VRanges(seqnames = seqnames(vcf), - ranges = ranges(vcf), - ref = as.character(ref(vcf)), - alt = sapply(alt(vcf), function(x) as.character(x[1])), - sampleNames = s) - seqlevels(vr) <- sub('^M$', 'MT', seqlevels(vr)) - vr <- ucsc(vr) - vr <- mutationContext(vr, genome, unify = T) - vr$refalt <- paste(ref(vr), alt(vr), sep = '') - - # query transcript ids - ol <- findOverlaps(vr, txByGenes) - subjectStrands <- sapply(txByGenes[subjectHits(ol)], function(x) paste(unique(as.character(strand(x))), collapse = ',')) - queryStrands <- tapply(subjectStrands, queryHits(ol), function(x) paste(unique(x), collapse = ",")) - vr$txStrand <- NA - vr$txStrand[as.integer(names(queryStrands))] <- queryStrands - vr$transcribed <- F - vr$transcribed[is.na(vr$txStrand)] <- NA - vr$transcribed[vr$refalt %in% c("GA", "GC", "GT", "AC", "AG", "AT") & grepl('\\+', vr$txStrand)] <- T - vr$transcribed[vr$refalt %in% c("CA", "CG", "CT", "TA", "TC", "TG") & grepl('-', vr$txStrand)] <- T - save(vr, file = opt$outFile) -} else { - vr <- NULL - save(vr, file = opt$outFile) -} diff --git a/summary/delmh_summary.R b/summary/delmh_summary.R new file mode 100644 index 00000000..3698f4d4 --- /dev/null +++ b/summary/delmh_summary.R @@ -0,0 +1,118 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("magrittr")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("BSgenome.Hsapiens.UCSC.hg19")) + +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); 
q('no', status = 1) })) +} + +args_list <- list(make_option("--input_file", default = NA, type = 'character', help = "file name and path")) + +parser <- OptionParser(usage = "%prog", option_list = args_list) +arguments <- parse_args(parser, positional_arguments = T) +opt <- arguments$options + +all_vars = read_tsv(file=opt$input_file, col_types = cols(.default = col_character())) %>% + type_convert() + +all_tumors = all_vars %>% + .[["TUMOR_SAMPLE"]] + +all_normals = all_vars %>% + .[["NORMAL_SAMPLE"]] + +all_patients = unique(paste0(all_tumors, "_", all_normals)) + +all_vars = all_vars %>% + filter(Variant_Classification=="Frame_Shift_Del" | Variant_Classification=="In_Frame_Del") %>% + filter((grepl("varscan", variantCaller) & grepl("strelka", variantCaller)) | + ((grepl("platypus", variantCaller) & grepl("scalpel", variantCaller)) & ((nchar(REF)-nchar(ALT))>4) & Variant_Classification!="In_Frame_Del") | + ((grepl("platypus", variantCaller) & grepl("lancet", variantCaller)) & ((nchar(REF)-nchar(ALT))>4) & Variant_Classification!="In_Frame_Del")) + +patient_summary = data_frame(SAMPLE_UUID = all_patients) +del_count = all_vars %>% + mutate(SAMPLE_UUID = paste0(TUMOR_SAMPLE, "_", NORMAL_SAMPLE)) %>% + dplyr::group_by(SAMPLE_UUID) %>% + dplyr::summarize(del_count = n()) +mean_delen = all_vars %>% + mutate(del_len = nchar(REF)-1) %>% + mutate(SAMPLE_UUID = paste0(TUMOR_SAMPLE, "_", NORMAL_SAMPLE)) %>% + dplyr::group_by(SAMPLE_UUID) %>% + dplyr::summarize(mean_delen = mean(del_len)) +median_delen = all_vars %>% + mutate(del_len = nchar(REF)-1) %>% + mutate(SAMPLE_UUID = paste0(TUMOR_SAMPLE, "_", NORMAL_SAMPLE)) %>% + dplyr::group_by(SAMPLE_UUID) %>% + dplyr::summarize(median_delen = median(del_len)) +deln4_count = all_vars %>% + mutate(del_len = nchar(REF)-1) %>% + mutate(SAMPLE_UUID = paste0(TUMOR_SAMPLE, "_", NORMAL_SAMPLE)) %>% + dplyr::group_by(SAMPLE_UUID) %>% + dplyr::summarize(deln4_count = sum(del_len>=4)) + +'getSeqFrom' <- function(chr, start, end) +{ + ret 
= as.character(getSeq(x=BSgenome.Hsapiens.UCSC.hg19, names=chr, start=start, end=end, strand="+", as.character=TRUE)) + return(invisible(ret)) +} + + +'checkHomLen' <- function(deleted, next50) +{ + ret = 0 + for (i in 1:nchar(deleted)) { + if (substr(deleted, 1, i) == substr(next50, 1, i)) { + ret = i + } + } + return(invisible(ret)) +} + +hml_down = hml_up = NULL +for (i in 1:nrow(all_vars)) { + chr = paste0("chr", all_vars[i,"CHROM"]) + start = as.numeric(all_vars[i,"POS"])+1 + n = as.numeric(nchar(all_vars[i,"REF"]))-1 + + deleted = getSeqFrom(chr = chr, start = start, end = start + n - 1) + prevn = getSeqFrom(chr = chr, start = start - n, end = start - 1) + nextn = getSeqFrom(chr = chr, start = start + n, end = start + 2*n - 1) + + hml_down = c(hml_down, checkHomLen(deleted = deleted, next50 = prevn)) + hml_up = c(hml_up, checkHomLen(deleted = deleted, next50 = nextn)) +} + +mh_3 = data_frame(SAMPLE_UUID = paste0(all_vars$TUMOR_SAMPLE, "_", all_vars$NORMAL_SAMPLE), + del_len = nchar(all_vars$REF)-1, + max_mhlen_5p = hml_down, + max_mhlen_3p = hml_up, + max_mhlen = apply(cbind(hml_down, hml_up), 1, max)) %>% + filter(del_len >= 4) %>% + mutate(is_3 = ifelse(max_mhlen>=3, 1, 0)) %>% + dplyr::group_by(SAMPLE_UUID) %>% + dplyr::summarize(deln4_mhlen_3_counts = sum(is_3)) + +mhl_3 = data_frame(SAMPLE_UUID = paste0(all_vars$TUMOR_SAMPLE, "_", all_vars$NORMAL_SAMPLE), + del_len = nchar(all_vars$REF)-1, + max_mhlen_5p = hml_down, + max_mhlen_3p = hml_up, + max_mhlen = apply(cbind(hml_down, hml_up), 1, max)) %>% + filter(del_len >= 4) %>% + filter(max_mhlen >= 3) %>% + dplyr::group_by(SAMPLE_UUID) %>% + dplyr::summarize(deln4_mhlen_3_avg_deln = mean(del_len)) + +patient_summary = left_join(patient_summary, del_count, by="SAMPLE_UUID") %>% + left_join(mean_delen, by="SAMPLE_UUID") %>% + left_join(median_delen, by="SAMPLE_UUID") %>% + left_join(deln4_count, by="SAMPLE_UUID") %>% + left_join(mh_3, by="SAMPLE_UUID") %>% + left_join(mhl_3, by="SAMPLE_UUID") %>% + 
mutate(delmh_prop = deln4_mhlen_3_counts/del_count) %>% + mutate(delmh_del4n_prop = deln4_mhlen_3_counts/deln4_count) + +write_tsv(patient_summary, path="summary/tsv/delmh_summary.tsv") diff --git a/summary/delmh_summary.mk b/summary/delmh_summary.mk new file mode 100644 index 00000000..7b82afc9 --- /dev/null +++ b/summary/delmh_summary.mk @@ -0,0 +1,14 @@ +include modules/Makefile.inc + +LOGDIR ?= log/delmh_summary.$(NOW) +PHONY += delmh_summary + +delmh_summary : summary/tsv/delmh_summary.tsv + +summary/tsv/delmh_summary.tsv : summary/tsv/mutation_summary.tsv + $(call RUN,-n 1 -s 8G -m 8G,"set -o pipefail && \ + $(RSCRIPT) modules/summary/delmh_summary.R --input_file $(<)") + +.DELETE_ON_ERROR: +.SECONDARY: +.PHONY: $(PHONY) diff --git a/summary/genomesummary.R b/summary/genomesummary.R index 57c1940e..be20985e 100644 --- a/summary/genomesummary.R +++ b/summary/genomesummary.R @@ -1,13 +1,551 @@ #!/usr/bin/env Rscript -file_names = c("genome_altered.tsv", "lst_score.tsv", "myriad_score.tsv", "ntai_score.tsv") -summary_scores = NULL -for (i in 1:length(file_names)) { - data = read.csv(file=paste0("genome_stats/", file_names[i]), header=FALSE, sep="\t", stringsAsFactors=FALSE) - summary_scores = cbind(summary_scores, data[,2]) +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("magrittr")) +suppressPackageStartupMessages(library("readr")) + +if (!interactive()) { + options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) } -summary_scores = cbind(data[,1], summary_scores) -colnames(summary_scores) = c("sample_names", gsub(".tsv", "", file_names)) -write.table(summary_scores, file="summary/tsv/genome_summary.tsv", col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE) -warnings() +args_list <- list(make_option("--option", default = NA, type = 'character', help = "which analysis to do"), + make_option("--sample_name", default = NA, type = 'character', help = 
"sample name"), + make_option("--file_in", default = NA, type = 'character', help = "input file name"), + make_option("--file_out", default = NA, type = 'character', help = "output file name")) +parser <- OptionParser(usage = "%prog", option_list = args_list) +arguments <- parse_args(parser, positional_arguments = T) +opt <- arguments$options + + +if (as.numeric(opt$option) == 1) { + + load(opt$file_in) + alpha = ifelse(is.na(fit$purity), 1, fit$purity) + psi = ifelse(is.na(fit$ploidy), 2, fit$ploidy) + gamma = 1 + x = fit$cncf[,"cnlr.median"] + absolute_copies = round(((((2^(x/gamma))*(alpha*psi+(1-alpha)*2)) - ((1-alpha)*2))/alpha)) + index = absolute_copies!=round(psi) + if (sum(index, na.rm=TRUE)!=0) { + genome_footprint = sum(as.numeric(fit$cncf[,"end"]-fit$cncf[,"start"]), na.rm=TRUE) + genome_altered = sum(as.numeric(fit$cncf[index,"end"]-fit$cncf[index,"start"]), na.rm=TRUE)/genome_footprint + } else { + genome_altered = 0 + } + x = dplyr::tibble(sample_name = as.character(opt$sample_name), + genome_altered = genome_altered) + readr::write_tsv(x = x, path = as.character(opt$file_out), append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option) == 2) { + + chromStrToNum <- function(str) { + suppressWarnings(cNum <- as.numeric(str)) + if (is.na(cNum) && str == "X" ) { + cNum <- 23 + } else if (is.na(cNum) && str == "Y") { + cNum <- 24 + } + return(invisible(cNum)) + } + + GetChrominfo <- function() { + f <- "modules/copy_number/hg19_chrominfo.txt" + chrom <- read.table(file=f) + chrom <- subset(chrom, grepl("^chr[0-9XY]{1,2}$", chrom[,1])) + f <- "modules/copy_number/hg19_gaps.txt" + gaps <- read.table(file=f) + centro <- subset(gaps, gaps[,8] == "centromere") + chrominfo <- merge(chrom[,1:2], centro[,2:4], by.x = 1, by.y = 1) + chrominfo$centromere <- rowMeans(chrominfo[,3:4]) + chrominfo <- chrominfo[,c(1,2,5,3,4)] + colnames(chrominfo) <- c("chr", "size", "centromere", "centstart", "centend") + chrominfo[,1] <- as.character(chrominfo[,1]) + 
chrominfo$chr <- sub("chr", "", chrominfo$chr) + chrominfo$chr <- sub("X", "23", chrominfo$chr) + chrominfo$chr <- sub("Y", "24", chrominfo$chr) + chrominfo[,1] <- as.numeric(chrominfo[,1]) + chrominfo <- chrominfo[order(chrominfo$chr), ] + rownames(chrominfo) <- as.character(chrominfo[,1]) + chrominfo <- as.matrix(chrominfo) + return(invisible(chrominfo)) + } + + fix_facets_column_names <- function(dat) { + colnames(dat)[which(colnames(dat)=="chrom")] <- "chromosome" + colnames(dat)[which(colnames(dat)=="loc.start")] <- "startBP" + colnames(dat)[which(colnames(dat)=="loc.end")] <- "endBP" + colnames(dat)[which(colnames(dat)=="lcn.em")] <- "nB" + sz <- dat[,"endBP"] - dat[,"startBP"] + dat <- cbind(dat, size=sz) + nA <- dat[,"tcn.em"] - dat[,"nB"] + dat <- cbind(dat, nA=nA) + return(invisible(dat)) + } + + join_adjacent_segments <- function(dat) { + cur_segs <- dat + something_changed <- 1 + while ( something_changed ) { + new_segs <- c() + something_changed <- 0 + x <- 2 + last_changed <- 0 + while (x <= nrow(cur_segs)) { + last_changed <- 0 + if ( (cur_segs[x-1,"nB"] == cur_segs[x,"nB"]) && + (cur_segs[x-1,"nA"] == cur_segs[x,"nA"]) && + (cur_segs[x-1,"chromosome"] == cur_segs[x,"chromosome"]) + ) { + t <- cur_segs[x-1,] + t["endBP"] <- cur_segs[x,"endBP"] + t["end"] <- cur_segs[x,"end"] + t["size"] <- t["endBP"] - t["startBP"] + something_changed <- 1 + new_segs <- rbind(t, new_segs) + x <- x+2 + last_changed <- 1 + } else { + new_segs <- rbind(cur_segs[x-1,], new_segs) + x<-x+1 + } + } + if (! 
last_changed ) { + new_segs <- rbind(cur_segs[x-1,],new_segs) + } + n <- nrow(new_segs) + new_segs <- new_segs[n:1,] + cur_segs <- new_segs + } + return(invisible(cur_segs)) + } + + fix_facet_segs <- function(dat) { + i <- which(is.na(dat$nB)) + if ( length(i) > 0 ) { + dat <- dat[-i, ] + } + dat <- join_adjacent_segments(dat) + return(invisible(dat)) + } + + chrom_arm_LST_score <- function(dat) { + score <- 0 + segs <- c() + SIZE_THRESH <- 10e6 + SPACE_THRESH <- 3e6 + if ( nrow(dat) >= 2 ) { + for (x in 2:nrow(dat)) { + if ( (dat[x-1,"size"] >= SIZE_THRESH) && + (dat[x,"size"] >= SIZE_THRESH) && + ( (dat[x,"startBP"] - dat[x-1,"endBP"]) <= SPACE_THRESH) + ) { + score <- score +1 + segs <- rbind(dat[x-1,], segs) + } + } + } + tmp <- list() + tmp$score <- score + tmp$segs <- segs + return(invisible(tmp)) + } + + lst_filter <- function(dat, size_thresh) { + i <- which(dat[,"size"] < size_thresh) + sz <- dat[i,"size"] + i <- i[order(sz)] + segs_removed <- 0 + while (length(i) > 0) { + dat <- dat[-i[1], ] + dat <- join_adjacent_segments(dat) + i<- which(dat[,"size"] < size_thresh) + sz <- dat[i,"size"] + i <- i[order(sz)] + segs_removed <- segs_removed + 1 + } + return(invisible(dat)) + } + + score_LST <- function(dat, chromInfo) { + score <- 0 + segs <- c() + dat <- lst_filter(dat, 3e6) + for (c in unique(dat[,"chromosome"]) ) { + i <- which(dat[,"chromosome"] == c) + csegs <- dat[i,] + cNum <- chromStrToNum(c) + i <- which(csegs[,"startBP"] <= chromInfo[cNum,"centstart"]) + parm <- csegs[i,] + tmp <- chrom_arm_LST_score(parm) + score <- score + tmp$score + segs <- rbind(tmp$segs, segs) + i <- which(csegs[,"endBP"] >= chromInfo[cNum,"centend"]) + qarm <- csegs[i,] + tmp <- chrom_arm_LST_score(qarm) + score <- score + tmp$score + segs <- rbind(tmp$segs, segs) + } + tmp <- list() + tmp$score <- score + tmp$segs <- segs + return(invisible(tmp)) + } + + dat = read.table(opt$file_in, sep="\t", header=TRUE, stringsAsFactor=FALSE) + dat = fix_facets_column_names(dat) + segs 
= fix_facet_segs(dat) + chromInfo = GetChrominfo() + lst = score_LST(segs, chromInfo) + x = dplyr::tibble(sample_name = as.character(opt$sample_name), + lst = lst$score) + readr::write_tsv(x = x, path = as.character(opt$file_out), append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option) == 3) { + + chromStrToNum <- function(str) { + suppressWarnings(cNum <- as.numeric(str)) + if (is.na(cNum) && str == "X" ) { + cNum <- 23 + } else if (is.na(cNum) && str == "Y") { + cNum <- 24 + } + return(invisible(cNum)) + } + + GetChrominfo <- function() { + f <- "modules/copy_number/hg19_chrominfo.txt" + chrom <- read.table(file=f) + chrom <- subset(chrom, grepl("^chr[0-9XY]{1,2}$", chrom[,1])) + f <- "modules/copy_number/hg19_gaps.txt" + gaps <- read.table(file=f) + centro <- subset(gaps, gaps[,8] == "centromere") + chrominfo <- merge(chrom[,1:2], centro[,2:4], by.x = 1, by.y = 1) + chrominfo$centromere <- rowMeans(chrominfo[,3:4]) + chrominfo <- chrominfo[,c(1,2,5,3,4)] + colnames(chrominfo) <- c("chr", "size", "centromere", "centstart", "centend") + chrominfo[,1] <- as.character(chrominfo[,1]) + chrominfo$chr <- sub("chr", "", chrominfo$chr) + chrominfo$chr <- sub("X", "23", chrominfo$chr) + chrominfo$chr <- sub("Y", "24", chrominfo$chr) + chrominfo[,1] <- as.numeric(chrominfo[,1]) + chrominfo <- chrominfo[order(chrominfo$chr), ] + rownames(chrominfo) <- as.character(chrominfo[,1]) + chrominfo <- as.matrix(chrominfo) + return(invisible(chrominfo)) + } + + fix_facets_column_names <- function(dat) { + colnames(dat)[which(colnames(dat)=="chrom")] <- "chromosome" + colnames(dat)[which(colnames(dat)=="loc.start")] <- "startBP" + colnames(dat)[which(colnames(dat)=="loc.end")] <- "endBP" + colnames(dat)[which(colnames(dat)=="lcn.em")] <- "nB" + sz <- dat[,"endBP"] - dat[,"startBP"] + dat <- cbind(dat, size=sz) + nA <- dat[,"tcn.em"] - dat[,"nB"] + dat <- cbind(dat, nA=nA) + return(invisible(dat)) + } + + join_adjacent_segments <- function(dat) { + cur_segs <- dat + 
something_changed <- 1 + while ( something_changed ) { + new_segs <- c() + something_changed <- 0 + x <- 2 + last_changed <- 0 + while (x <= nrow(cur_segs)) { + last_changed <- 0 + if ( (cur_segs[x-1,"nB"] == cur_segs[x,"nB"]) && + (cur_segs[x-1,"nA"] == cur_segs[x,"nA"]) && + (cur_segs[x-1,"chromosome"] == cur_segs[x,"chromosome"]) + ) { + t <- cur_segs[x-1,] + t["endBP"] <- cur_segs[x,"endBP"] + t["end"] <- cur_segs[x,"end"] + t["size"] <- t["endBP"] - t["startBP"] + something_changed <- 1 + new_segs <- rbind(t, new_segs) + x <- x+2 + last_changed <- 1 + } else { + new_segs <- rbind(cur_segs[x-1,], new_segs) + x<-x+1 + } + } + if (! last_changed ) { + new_segs <- rbind(cur_segs[x-1,],new_segs) + } + n <- nrow(new_segs) + new_segs <- new_segs[n:1,] + cur_segs <- new_segs + } + return(invisible(cur_segs)) + } + + fix_facet_segs <- function(dat) { + i <- which(is.na(dat$nB)) + if ( length(i) > 0 ) { + dat <- dat[-i, ] + } + dat <- join_adjacent_segments(dat) + return(invisible(dat)) + } + + score_ntAI <- function(dat, chromInfo, min_size=1000, shrink=FALSE) { + index <- dat[,"chromosome"] %in% c("MT", "Y", "24") + dat <- dat[!index,] + index <- dat[,"size"] < min_size + dat <- dat[!index,] + if (shrink) { + dat <- join_adjacent_segments(dat) + } + chrList <- unique(dat[,"chromosome"]) + ntAI_score <- 0 + ntAI_segs <- NULL + for (x in chrList) { + index <- dat[,"chromosome"] == x + chr_segs <- dat[index,] + cNum <- chromStrToNum(x) + if (nrow(chr_segs) < 2 ) { + next + } + if ( (chr_segs[1,"nA"] != chr_segs[1,"nB"]) && (chromInfo[cNum,"centstart"] > chr_segs[1,"endBP"]) ) { + ntAI_score <- ntAI_score+1 + ntAI_segs <- rbind(chr_segs[1,],ntAI_segs) + } + eSeg <- nrow(chr_segs) + if ( (chr_segs[eSeg, "nA"] != chr_segs[eSeg, "nB"]) && (chr_segs[eSeg,"startBP"] > chromInfo[cNum,"centend"]) ) { + ntAI_score <- ntAI_score+1 + ntAI_segs <- rbind(chr_segs[eSeg,],ntAI_segs) + } + } + tmp <- list() + tmp$segs <- ntAI_segs + tmp$score <- ntAI_score + return(invisible(tmp)) + } + 
+ dat = read.table(opt$file_in, sep="\t", header=TRUE, stringsAsFactor=FALSE) + dat = fix_facets_column_names(dat) + segs = fix_facet_segs(dat) + chromInfo = GetChrominfo() + ntai = score_ntAI(segs, chromInfo) + x = dplyr::tibble(sample_name = as.character(opt$sample_name), + ntai = ntai$score) + readr::write_tsv(x = x, path = as.character(opt$file_out), append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option) == 4) { + + chromStrToNum <- function(str) { + suppressWarnings(cNum <- as.numeric(str)) + if (is.na(cNum) && str == "X" ) { + cNum <- 23 + } else if (is.na(cNum) && str == "Y") { + cNum <- 24 + } + return(invisible(cNum)) + } + + GetChrominfo <- function() { + f <- "modules/copy_number/hg19_chrominfo.txt" + chrom <- read.table(file=f) + chrom <- subset(chrom, grepl("^chr[0-9XY]{1,2}$", chrom[,1])) + f <- "modules/copy_number/hg19_gaps.txt" + gaps <- read.table(file=f) + centro <- subset(gaps, gaps[,8] == "centromere") + chrominfo <- merge(chrom[,1:2], centro[,2:4], by.x = 1, by.y = 1) + chrominfo$centromere <- rowMeans(chrominfo[,3:4]) + chrominfo <- chrominfo[,c(1,2,5,3,4)] + colnames(chrominfo) <- c("chr", "size", "centromere", "centstart", "centend") + chrominfo[,1] <- as.character(chrominfo[,1]) + chrominfo$chr <- sub("chr", "", chrominfo$chr) + chrominfo$chr <- sub("X", "23", chrominfo$chr) + chrominfo$chr <- sub("Y", "24", chrominfo$chr) + chrominfo[,1] <- as.numeric(chrominfo[,1]) + chrominfo <- chrominfo[order(chrominfo$chr), ] + rownames(chrominfo) <- as.character(chrominfo[,1]) + chrominfo <- as.matrix(chrominfo) + return(invisible(chrominfo)) + } + + fix_facets_column_names <- function(dat) { + colnames(dat)[which(colnames(dat)=="chrom")] <- "chromosome" + colnames(dat)[which(colnames(dat)=="loc.start")] <- "startBP" + colnames(dat)[which(colnames(dat)=="loc.end")] <- "endBP" + colnames(dat)[which(colnames(dat)=="lcn.em")] <- "nB" + sz <- dat[,"endBP"] - dat[,"startBP"] + dat <- cbind(dat, size=sz) + nA <- dat[,"tcn.em"] - dat[,"nB"] 
+ dat <- cbind(dat, nA=nA) + return(invisible(dat)) + } + + join_adjacent_segments <- function(dat) { + cur_segs <- dat + something_changed <- 1 + while ( something_changed ) { + new_segs <- c() + something_changed <- 0 + x <- 2 + last_changed <- 0 + while (x <= nrow(cur_segs)) { + last_changed <- 0 + if ( (cur_segs[x-1,"nB"] == cur_segs[x,"nB"]) && + (cur_segs[x-1,"nA"] == cur_segs[x,"nA"]) && + (cur_segs[x-1,"chromosome"] == cur_segs[x,"chromosome"]) + ) { + t <- cur_segs[x-1,] + t["endBP"] <- cur_segs[x,"endBP"] + t["end"] <- cur_segs[x,"end"] + t["size"] <- t["endBP"] - t["startBP"] + something_changed <- 1 + new_segs <- rbind(t, new_segs) + x <- x+2 + last_changed <- 1 + } else { + new_segs <- rbind(cur_segs[x-1,], new_segs) + x<-x+1 + } + } + if (! last_changed ) { + new_segs <- rbind(cur_segs[x-1,],new_segs) + } + n <- nrow(new_segs) + new_segs <- new_segs[n:1,] + cur_segs <- new_segs + } + return(invisible(cur_segs)) + } + + fix_facet_segs <- function(dat) { + i <- which(is.na(dat$nB)) + if ( length(i) > 0 ) { + dat <- dat[-i, ] + } + dat <- join_adjacent_segments(dat) + return(invisible(dat)) + } + + chrom_arm_LST_score <- function(dat) { + score <- 0 + segs <- c() + SIZE_THRESH <- 10e6 + SPACE_THRESH <- 3e6 + if ( nrow(dat) >= 2 ) { + for (x in 2:nrow(dat)) { + if ( (dat[x-1,"size"] >= SIZE_THRESH) && + (dat[x,"size"] >= SIZE_THRESH) && + ( (dat[x,"startBP"] - dat[x-1,"endBP"]) <= SPACE_THRESH) + ) { + score <- score +1 + segs <- rbind(dat[x-1,], segs) + } + } + } + tmp <- list() + tmp$score <- score + tmp$segs <- segs + return(invisible(tmp)) + } + + lst_filter <- function(dat, size_thresh) { + i <- which(dat[,"size"] < size_thresh) + sz <- dat[i,"size"] + i <- i[order(sz)] + segs_removed <- 0 + while (length(i) > 0) { + dat <- dat[-i[1], ] + dat <- join_adjacent_segments(dat) + i<- which(dat[,"size"] < size_thresh) + sz <- dat[i,"size"] + i <- i[order(sz)] + segs_removed <- segs_removed + 1 + } + return(invisible(dat)) + } + + score_myriad_HRD <- 
function(dat, thresh=15e6) { + chrDel <- NULL + hrdSegs <- NULL + hrd_score <- 0 + chrList <- unique(dat[,"chromosome"]) + for (x in chrList) { + index <- which(dat[,"chromosome"] == x) + totalnB <- sum(dat[index,"nB"], na.rm=TRUE) + if (totalnB == 0) { + chrDel <- c(x, chrDel) + } + } + for (x in 1:nrow(dat)) { + if ( dat[x,"chromosome"] %in% chrDel ) { + next + } + if ( dat[x,"nB"] != 0 ) { + next + } + if (dat[x,"size"] < thresh) { + next + } + hrd_score <- hrd_score + 1 + hrdSegs <- rbind(dat[x,], hrdSegs) + } + tmp <- list() + tmp$score = hrd_score + tmp$segs = hrdSegs + return(invisible(tmp)) + } + + + dat = read.table(opt$file_in, sep="\t", header=TRUE, stringsAsFactor=FALSE) + dat = fix_facets_column_names(dat) + segs = fix_facet_segs(dat) + chromInfo = GetChrominfo() + mrs = score_myriad_HRD(segs) + x = dplyr::tibble(sample_name = as.character(opt$sample_name), + mrs = mrs$score) + readr::write_tsv(x = x, path = as.character(opt$file_out), append = FALSE, col_names = TRUE) + +} else if (as.numeric(opt$option)==5) { + + sample_names = unlist(strsplit(opt$sample_name, split = " ", fixed = TRUE)) + x1 = list() + for (i in 1:length(sample_names)) { + x1[[i]] = readr::read_tsv(file = paste0("genome_summary/genome_altered/", sample_names[i], ".txt"), + col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + } + x1 = do.call(bind_rows, x1) + + x2 = list() + for (i in 1:length(sample_names)) { + x2[[i]] = readr::read_tsv(file = paste0("genome_summary/lst/", sample_names[i], ".txt"), + col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + } + x2 = do.call(bind_rows, x2) + + x3 = list() + for (i in 1:length(sample_names)) { + x3[[i]] = readr::read_tsv(file = paste0("genome_summary/ntai/", sample_names[i], ".txt"), + col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + } + x3 = do.call(bind_rows, x3) + + x4 = list() + for (i in 
1:length(sample_names)) { + x4[[i]] = readr::read_tsv(file = paste0("genome_summary/myriad_score/", sample_names[i], ".txt"), + col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + } + x4 = do.call(bind_rows, x4) + + data = x1 %>% + dplyr::full_join(x2, by = "sample_name") %>% + dplyr::full_join(x3, by = "sample_name") %>% + dplyr::full_join(x4, by = "sample_name") + + readr::write_tsv(x = data, path = as.character(opt$file_out), append = FALSE, col_names = TRUE) +} diff --git a/summary/genomesummary.mk b/summary/genomesummary.mk index 45fbda98..35424cc8 100644 --- a/summary/genomesummary.mk +++ b/summary/genomesummary.mk @@ -1,25 +1,69 @@ include modules/Makefile.inc LOGDIR ?= log/genome_summary.$(NOW) -PHONY += genome_stats summary summary/tsv -LST_SCORE ?= $(wildcard $(foreach set,$(SAMPLE_PAIRS),genome_stats/$(set).lst)) -GENOME_ALTERED ?= $(wildcard $(foreach set,$(SAMPLE_PAIRS),genome_stats/$(set).fga)) -NTAI_SCORE ?= $(wildcard $(foreach set,$(SAMPLE_PAIRS),genome_stats/$(set).ntai)) -MYRIAD_SCORE ?= $(wildcard $(foreach set,$(SAMPLE_PAIRS),genome_stats/$(set).mrs)) -genome_summary : genome_stats/lst_score.tsv genome_stats/genome_altered.tsv genome_stats/ntai_score.tsv genome_stats/myriad_score.tsv summary/tsv/genome_summary.tsv summary/genome_summary.xlsx - -genome_stats/lst_score.tsv genome_stats/genome_altered.tsv genome_stats/ntai_score.tsv genome_stats/myriad_score.tsv summary/tsv/genome_summary.tsv : - $(call RUN,-n 1 -s 4G -m 4G,"cat $(LST_SCORE) > genome_stats/lst_score.tsv && \ - cat $(GENOME_ALTERED) > genome_stats/genome_altered.tsv && \ - cat $(NTAI_SCORE) > genome_stats/ntai_score.tsv && \ - cat $(MYRIAD_SCORE) > genome_stats/myriad_score.tsv && \ - $(RSCRIPT) modules/summary/genomesummary.R") - -summary/genome_summary.xlsx : summary/tsv/genome_summary.tsv - $(call RUN,-n 1 -s 4G -m 4G,"python modules/summary/genome_summary_excel.py") +genome_summary : $(foreach 
pair,$(SAMPLE_PAIRS),genome_summary/genome_altered/$(pair).txt) \ + $(foreach pair,$(SAMPLE_PAIRS),genome_summary/lst/$(pair).txt) \ + $(foreach pair,$(SAMPLE_PAIRS),genome_summary/ntai/$(pair).txt) \ + $(foreach pair,$(SAMPLE_PAIRS),genome_summary/myriad_score/$(pair).txt) \ + genome_summary/summary.txt + +define fraction-genome-altered +genome_summary/genome_altered/$1_$2.txt : facets/cncf/$1_$2.Rdata + $$(call RUN,-n 1 -s 3G -m 6G,"set -o pipefail && \ + $(RSCRIPT) modules/summary/genomesummary.R \ + --option 1 \ + --sample_name $1_$2 \ + --file_in $$(<) \ + --file_out $$(@)") +endef +$(foreach pair,$(SAMPLE_PAIRS),\ + $(eval $(call fraction-genome-altered,$(tumor.$(pair)),$(normal.$(pair))))) + +define lst-score +genome_summary/lst/$1_$2.txt : facets/cncf/$1_$2.txt + $$(call RUN,-n 1 -s 3G -m 6G,"set -o pipefail && \ + $(RSCRIPT) modules/summary/genomesummary.R \ + --option 2 \ + --sample_name $1_$2 \ + --file_in $$(<) \ + --file_out $$(@)") +endef +$(foreach pair,$(SAMPLE_PAIRS),\ + $(eval $(call lst-score,$(tumor.$(pair)),$(normal.$(pair))))) + +define ntai-score +genome_summary/ntai/$1_$2.txt : facets/cncf/$1_$2.txt + $$(call RUN,-n 1 -s 3G -m 6G,"set -o pipefail && \ + $(RSCRIPT) modules/summary/genomesummary.R \ + --option 3 \ + --sample_name $1_$2 \ + --file_in $$(<) \ + --file_out $$(@)") +endef +$(foreach pair,$(SAMPLE_PAIRS),\ + $(eval $(call ntai-score,$(tumor.$(pair)),$(normal.$(pair))))) + +define myriad-score +genome_summary/myriad_score/$1_$2.txt : facets/cncf/$1_$2.txt + $$(call RUN,-n 1 -s 3G -m 6G,"set -o pipefail && \ + $(RSCRIPT) modules/summary/genomesummary.R \ + --option 4 \ + --sample_name $1_$2 \ + --file_in $$(<) \ + --file_out $$(@)") +endef +$(foreach pair,$(SAMPLE_PAIRS),\ + $(eval $(call myriad-score,$(tumor.$(pair)),$(normal.$(pair))))) +genome_summary/summary.txt : $(foreach pair,$(SAMPLE_PAIRS),genome_summary/genome_altered/$(pair).txt) $(foreach pair,$(SAMPLE_PAIRS),genome_summary/lst/$(pair).txt) $(foreach 
pair,$(SAMPLE_PAIRS),genome_summary/ntai/$(pair).txt) $(foreach pair,$(SAMPLE_PAIRS),genome_summary/myriad_score/$(pair).txt) + $(call RUN,-n 1 -s 4G -m 8G,"set -o pipefail && \ + $(RSCRIPT) modules/summary/genomesummary.R \ + --option 5 \ + --sample_name '$(SAMPLE_PAIRS)' \ + --file_out $(@)") + .DELETE_ON_ERROR: .SECONDARY: -.PHONY: $(PHONY) +.PHONY: genome_summary diff --git a/summary/mutationSummary.mk b/summary/mutationsummary.mk similarity index 100% rename from summary/mutationSummary.mk rename to summary/mutationsummary.mk diff --git a/summary/sufamsummary.R b/summary/sufamsummary.R deleted file mode 100644 index ef96e778..00000000 --- a/summary/sufamsummary.R +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("openxlsx")) -suppressPackageStartupMessages(library("readr")) - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list(make_option("--sample_sets", default = NA, type = 'character', help = "sample sets file names")) -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -sample_names = na.omit(unlist(strsplit(x=opt$sample_sets, split=" ", fixed=TRUE))) -list_of_dfs = list() -for (i in 1:length(sample_names)) { - sample_vars = read_tsv(file=paste0("sufam/", sample_names[i], ".tsv")) - col_names = colnames(sample_vars) - sample_vars = as.data.frame(sample_vars) - sample_vars[sample_vars=="" | sample_vars==" " | is.na(sample_vars)] = "NA" - colnames(sample_vars) = col_names - list_of_dfs[[i]] = sample_vars -} -names(list_of_dfs) = sample_names -write.xlsx(list_of_dfs, file="summary/sufam_summary.xlsx") diff --git a/sv_callers/fusioncatcher.mk b/sv_callers/fusioncatcher.mk index 946627bb..e7e3fde7 100644 --- a/sv_callers/fusioncatcher.mk +++ b/sv_callers/fusioncatcher.mk @@ -6,7 +6,7 @@ LOGDIR = 
log/fusioncatcher.$(NOW) ##### MAKE INCLUDES ##### include modules/Makefile.inc -FUSIONCATCHER = $(HOME)/share/usr/fusioncatcher/bin/fusioncatcher +FUSIONCATCHER = $(HOME)/share/usr/fusioncatcher/fusioncatcher_v0.99.2/fusioncatcher FUSIONCATCHER_OPTS = -d $(HOME)/share/usr/fusioncatcher/data/current --extract-buffer-size=35000000000 .DELETE_ON_ERROR: diff --git a/sv_callers/gridss_tumor_normal.mk b/sv_callers/gridss_tumor_normal.mk new file mode 100644 index 00000000..a1f5a470 --- /dev/null +++ b/sv_callers/gridss_tumor_normal.mk @@ -0,0 +1,63 @@ +include modules/Makefile.inc + +LOGDIR = log/gridss_tumor_normal.$(NOW) + +GRIDSS_CORES ?= 8 +GRIDSS_MEM_CORE ?= 6G +GRIDSS_REF ?= $(HOME)/share/lib/ref_files/b37/human_g1k_v37.fasta +GRIDSS_BLACKLIST ?= $(HOME)/share/lib/resource_files/gridss/example/ENCFF001TDO.bed +GRIDSS ?= gridss +GRIDSS_FILTER ?= gridss_somatic_filter +GRIDSS_PON_DIR ?= $(HOME)/share/lib/resource_files/gridss/pon/ + +gridss : $(foreach pair,$(SAMPLE_PAIRS),gridss/$(pair)/$(pair).gridss_sv.vcf) \ + $(foreach pair,$(SAMPLE_PAIRS),gridss/$(pair)/$(pair).gridss_sv_ft.vcf.bgz) \ + $(foreach pair,$(SAMPLE_PAIRS),vcf/$(pair).gridss_sv.vcf) \ + $(foreach pair,$(SAMPLE_PAIRS),gridss/$(pair)/taskcomplete) + +define gridss-tumor-normal +gridss/$1_$2/$1_$2.gridss_sv.vcf : bam/$1.bam bam/$2.bam + $$(call RUN,-c -n $(GRIDSS_CORES) -s 4G -m $(GRIDSS_MEM_CORE) -v $(GRIDSS_ENV) -w 72:00:00,"set -o pipefail && \ + mkdir -p gridss/$1_$2 && \ + cd gridss/$1_$2 && \ + $$(GRIDSS) \ + -t $$(GRIDSS_CORES) \ + -r $$(GRIDSS_REF) \ + -o $1_$2.gridss_sv.vcf \ + -b $$(GRIDSS_BLACKLIST) \ + ../../bam/$2.bam \ + ../../bam/$1.bam") + +gridss/$1_$2/$1_$2.gridss_sv_ft.vcf.bgz : gridss/$1_$2/$1_$2.gridss_sv.vcf + $$(call RUN,-c -n 1 -s 12G -m 18G -v $(GRIDSS_ENV),"set -o pipefail && \ + cd gridss/$1_$2 && \ + $$(GRIDSS_FILTER) \ + --pondir $$(GRIDSS_PON_DIR) \ + --input $1_$2.gridss_sv.vcf \ + --output $1_$2.gridss_sv_ft.vcf \ + --fulloutput 
$1_$2.gridss_sv_high_and_low_confidence_somatic.vcf \ + -n 1 \ + -t 2") + +vcf/$1_$2.gridss_sv.vcf : gridss/$1_$2/$1_$2.gridss_sv_ft.vcf.bgz + $$(INIT) zcat $$(<) > $$(@) +# Cleanup for one tumor/normal pair: delete the per-BAM and assembly GRIDSS working BAMs and their indexes (the assembly files are named after this pair's output VCF), then write the completion marker. +gridss/$1_$2/taskcomplete : vcf/$1_$2.gridss_sv.vcf + $$(INIT) rm -f gridss/$1_$2/$1.bam.gridss.working/$1.bam.sv.bam && \ + rm -f gridss/$1_$2/$1.bam.gridss.working/$1.bam.sv.bam.bai && \ + rm -f gridss/$1_$2/$2.bam.gridss.working/$2.bam.sv.bam && \ + rm -f gridss/$1_$2/$2.bam.gridss.working/$2.bam.sv.bam.bai && \ + rm -f gridss/$1_$2/$1_$2.gridss_sv.vcf.assembly.bam.gridss.working/$1_$2.gridss_sv.vcf.assembly.bam.sv.bam && \ + rm -f gridss/$1_$2/$1_$2.gridss_sv.vcf.assembly.bam.gridss.working/$1_$2.gridss_sv.vcf.assembly.bam.sv.bam.bai && \ + echo 'complete!' > $$(@) + +endef +$(foreach pair,$(SAMPLE_PAIRS),\ + $(eval $(call gridss-tumor-normal,$(tumor.$(pair)),$(normal.$(pair))))) + + +..DUMMY := $(shell mkdir -p version; \ + echo 'gridss' > version/gridss_tumor_normal.txt) +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY: gridss diff --git a/sv_callers/manta.inc b/sv_callers/manta.inc index 259fab2a..9def6aad 100644 --- a/sv_callers/manta.inc +++ b/sv_callers/manta.inc @@ -5,10 +5,10 @@ MANTA_HS_CONFIG = modules/sv_callers/manta_hs_config.py.ini MANTA_CONFIG = modules/sv_callers/manta_config.py.ini MANTA_HIGH_SENS ?= false CONFIG_MANTA_OPTS = --referenceFasta $(REF_FASTA) \ - --config $(if $(findstring true,$(MANTA_HIGH_SENS)),\ - $(MANTA_HS_CONFIG),$(MANTA_CONFIG)) \ - $(if $(TARGETS_FILE),--exome) \ - $(if $(MANTA_REGION),--region $(MANTA_REGION)) + --config $(if $(findstring true,$(MANTA_HIGH_SENS)),\ + $(MANTA_HS_CONFIG),$(MANTA_CONFIG)) \ + $(if $(TARGETS_FILE),--exome) \ + $(if $(MANTA_REGION),--region $(MANTA_REGION)) endif MANTA_INC = true diff --git a/sv_callers/mantaTN.mk b/sv_callers/mantaTN.mk deleted file mode 100644 index 7e5fd54f..00000000 --- a/sv_callers/mantaTN.mk +++ /dev/null @@ -1,39 +0,0 @@ -# run manta on tumour-normal matched pairs - 
-include modules/Makefile.inc -include modules/sv_callers/manta.inc - -LOGDIR ?= log/manta.$(NOW) -PHONY += manta manta_vcfs - -manta : manta_vcfs - -manta_vcfs : $(foreach pair,$(SAMPLE_PAIRS),vcf/$(pair).manta_sv.eff.vcf vcf/$(pair).manta_indels.eff.vcf vcf/$(pair).manta_candidate_sv.eff.vcf) - -define manta-tumor-normal -manta/$1_$2/runWorkflow.py : bam/$1.bam bam/$2.bam bam/$1.bam.bai bam/$2.bam.bai - $$(INIT) $$(CONFIG_MANTA) $$(CONFIG_MANTA_OPTS) --tumorBam $$< --normalBam $$(<<) --runDir $$(@D) - -manta/$1_$2.manta_timestamp : manta/$1_$2/runWorkflow.py - $$(call RUN,-n 8 -s 2G -m 2G,"python $$< -m local -j 8 && touch $$@") - -manta/$1_$2/results/variants/somaticSV.vcf.gz : manta/$1_$2.manta_timestamp - -manta/$1_$2/results/variants/candidateSmallIndels.vcf.gz : manta/$1_$2.manta_timestamp - -manta/$1_$2/results/variants/candidateSV.vcf.gz : manta/$1_$2.manta_timestamp - -vcf/$1_$2.manta_indels.vcf : manta/$1_$2/results/variants/candidateSmallIndels.vcf.gz - $$(INIT) zcat $$< > $$@ - -vcf/$1_$2.manta_sv.vcf : manta/$1_$2/results/variants/somaticSV.vcf.gz - $$(INIT) zcat $$< > $$@ - -vcf/$1_$2.manta_candidate_sv.vcf : manta/$1_$2/results/variants/candidateSV.vcf.gz - $$(INIT) zcat $$< > $$@ -endef -$(foreach pair,$(SAMPLE_PAIRS),$(eval $(call manta-tumor-normal,$(tumor.$(pair)),$(normal.$(pair))))) - -.PHONY: $(PHONY) - -include modules/vcf_tools/vcftools.mk diff --git a/sv_callers/manta_tumor_normal.mk b/sv_callers/manta_tumor_normal.mk new file mode 100644 index 00000000..1c24fa3e --- /dev/null +++ b/sv_callers/manta_tumor_normal.mk @@ -0,0 +1,29 @@ +include modules/Makefile.inc +include modules/sv_callers/manta.inc + +LOGDIR ?= log/manta_tumor_normal.$(NOW) + +manta : $(foreach pair,$(SAMPLE_PAIRS),vcf/$(pair).manta_sv.vcf) + +define manta-tumor-normal +manta/$1_$2/runWorkflow.py : bam/$1.bam bam/$2.bam bam/$1.bam.bai bam/$2.bam.bai + $$(INIT) $$(CONFIG_MANTA) $$(CONFIG_MANTA_OPTS) --tumorBam $$(<) --normalBam $$(<<) --runDir $$(@D) + 
+manta/$1_$2.manta_timestamp : manta/$1_$2/runWorkflow.py + $$(call RUN,-n 8 -s 2G -m 4G -w 72:00:00,"set -o pipefail && \ + python $$(<) -m local -j 8 && touch $$(@)") + +manta/$1_$2/results/variants/somaticSV.vcf.gz : manta/$1_$2.manta_timestamp + +vcf/$1_$2.manta_sv.vcf : manta/$1_$2/results/variants/somaticSV.vcf.gz + $$(INIT) zcat $$(<) > $$(@) + +endef +$(foreach pair,$(SAMPLE_PAIRS), \ + $(eval $(call manta-tumor-normal,$(tumor.$(pair)),$(normal.$(pair))))) + +..DUMMY := $(shell mkdir -p version; \ + python --version &> version/manta_tumor_normal.txt) +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY: manta diff --git a/sv_callers/svaba_tumor_normal.mk b/sv_callers/svaba_tumor_normal.mk new file mode 100644 index 00000000..cbb03b34 --- /dev/null +++ b/sv_callers/svaba_tumor_normal.mk @@ -0,0 +1,42 @@ +include modules/Makefile.inc + +LOGDIR = log/svaba_tumor_normal.$(NOW) + +SVABA_CORES ?= 8 +SVABA_MEM_CORE ?= 6G +SVABA_REF ?= $(REF_FASTA) +SVABA_DBSNP ?= $(HOME)/share/lib/resource_files/svaba/dbsnp_indel.vcf +SVABA_BLACKLIST ?= $(HOME)/share/lib/resource_files/svaba/wgs_blacklist_meres.bed +SVABA ?= svaba + +svaba : $(foreach pair,$(SAMPLE_PAIRS),vcf/$(pair).svaba_sv.vcf) + +define svaba-tumor-normal +svaba/$1_$2.svaba.somatic.sv.vcf : bam/$1.bam bam/$2.bam + $$(call RUN,-c -n $(SVABA_CORES) -s 4G -m $(SVABA_MEM_CORE) -v $(SVABA_ENV) -w 72:00:00,"set -o pipefail && \ + mkdir -p svaba && \ + cd svaba && \ + $$(SVABA) run \ + -t ../bam/$1.bam \ + -n ../bam/$2.bam \ + -p $$(SVABA_CORES) \ + -D $$(SVABA_DBSNP) \ + -L 100000 \ + -x 25000 \ + -k $$(SVABA_BLACKLIST) \ + -a $1_$2 \ + -G $$(SVABA_REF)") + +vcf/$1_$2.svaba_sv.vcf : svaba/$1_$2.svaba.somatic.sv.vcf + $$(INIT) cat $$< > $$@ + +endef +$(foreach pair,$(SAMPLE_PAIRS),\ + $(eval $(call svaba-tumor-normal,$(tumor.$(pair)),$(normal.$(pair))))) + + +..DUMMY := $(shell mkdir -p version; \ + $(SVABA) --help &> version/svaba_tumor_normal.txt) +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY: svaba diff --git 
a/test/clonality/reportpyclone.R b/test/clonality/reportpyclone.R deleted file mode 100644 index d9770bb5..00000000 --- a/test/clonality/reportpyclone.R +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("readr")) -suppressPackageStartupMessages(library("dplyr")) -suppressPackageStartupMessages(library("magrittr")) -suppressPackageStartupMessages(library("ggplot2")) - -optList = list(make_option("--sample_name", default = NULL, help = "tumor normal sample name")) - -parser = OptionParser(usage = "%prog [options] mutation_file", option_list = optList) -arguments = parse_args(parser, positional_arguments = T) -opt = arguments$options - -tumor_sample = unlist(strsplit(opt$sample_name, split="_", fixed=TRUE))[1] -normal_sample = unlist(strsplit(opt$sample_name, split="_", fixed=TRUE))[2] - -in_file = list( - paste0("pyclone/", tumor_sample, "_", normal_sample, "/", tumor_sample,".tsv"), - paste0("pyclone/", tumor_sample, "_", normal_sample, "/report/pyclone.tsv") -) -out_file = list( - paste0("pyclone/", tumor_sample, "_", normal_sample, "/report/histogram_std_by_cid.pdf"), - paste0("pyclone/", tumor_sample, "_", normal_sample, "/report/histogram_ccf_by_cid.pdf"), - paste0("pyclone/", tumor_sample, "_", normal_sample, "/report/histogram_std_by_cn.pdf"), - paste0("pyclone/", tumor_sample, "_", normal_sample, "/report/histogram_ccf_by_cn.pdf"), - paste0("pyclone/", tumor_sample, "_", normal_sample, "/report/histogram_vaf_by_cn.pdf"), - paste0("pyclone/", tumor_sample, "_", normal_sample, "/report/histogram_depth_by_cn.pdf"), - paste0("pyclone/", tumor_sample, "_", normal_sample, "/report/scatter_vaf_depth_by_cn.pdf"), - paste0("pyclone/", tumor_sample, "_", normal_sample, "/report/summary.tsv"), - paste0("pyclone/", tumor_sample, "_", normal_sample, "/report/clusters.tsv") -) - -mutation_summary = read_tsv(file=in_file[[1]], col_types = cols(.default = col_character())) 
%>% - type_convert() %>% - mutate(total_cn = factor(minor_cn+major_cn)) %>% - mutate(DP = var_counts+ref_counts) %>% - mutate(VAF = 100*var_counts/(var_counts+ref_counts)) - -pyclone_summary = read_tsv(file=in_file[[2]], col_types = cols(.default = col_character()), col_names = c("mutation_id", "ccf", "std", "cluster_id")) %>% - type_convert() %>% - mutate(cluster_id = factor(cluster_id)) %>% - mutate(ccf = as.numeric(ccf)) %>% - mutate(std = as.numeric(std)) %>% - slice(-1) - -mutation_summary = full_join(mutation_summary, pyclone_summary, by="mutation_id") - -plot.0 = ggplot(mutation_summary, aes(x=std, fill=cluster_id)) + - geom_histogram(alpha = .8) + - theme_classic() + - theme(axis.text.y = element_text(size=15), axis.text.x = element_text(size=15), legend.text=element_text(size=9), legend.title=element_text(size=10), legend.background = element_blank(), legend.key.size = unit(1, 'lines')) + - labs(x=expression(sigma), y="Frequency\n") + - guides(fill=guide_legend(title=c("Cluster"))) - -pdf(file=out_file[[1]], width=6, height=6) -print(plot.0) -dev.off() - -plot.0 = ggplot(mutation_summary, aes(x=ccf, fill=cluster_id)) + - geom_histogram(alpha = .8) + - theme_classic() + - theme(axis.text.y = element_text(size=15), axis.text.x = element_text(size=15), legend.text=element_text(size=9), legend.title=element_text(size=10), legend.background = element_blank(), legend.key.size = unit(1, 'lines')) + - labs(x="\nCCF\n", y="Frequency\n") + - coord_cartesian(xlim=c(0,1)) + - guides(fill=guide_legend(title=c("Cluster"))) -pdf(file=out_file[[2]], width=6, height=6) -print(plot.0) -dev.off() - -plot.0 = ggplot(mutation_summary, aes(x=std, fill=total_cn)) + - geom_histogram(alpha = .8) + - theme_classic() + - theme(axis.text.y = element_text(size=15), axis.text.x = element_text(size=15), legend.text=element_text(size=8), legend.title=element_text(size=10), legend.background = element_blank(), legend.key.size = unit(1, 'lines')) + - labs(x=expression(sigma), 
y="Frequency\n") + - guides(fill=guide_legend(title=c("Copy number"))) - -pdf(file=out_file[[3]], width=6, height=6) -print(plot.0) -dev.off() - -plot.0 = ggplot(mutation_summary, aes(x=ccf, fill=total_cn)) + - geom_histogram(alpha = .8) + - theme_classic() + - theme(axis.text.y = element_text(size=15), axis.text.x = element_text(size=15), legend.text=element_text(size=8), legend.title=element_text(size=10), legend.background = element_blank(), legend.key.size = unit(1, 'lines')) + - labs(x="\nCCF\n", y="Frequency\n") + - coord_cartesian(xlim=c(0,1)) + - guides(fill=guide_legend(title=c("Copy number"))) - -pdf(file=out_file[[4]], width=6, height=6) -print(plot.0) -dev.off() - -plot.0 = ggplot(mutation_summary, aes(x = VAF, fill=total_cn)) + - geom_histogram(alpha = .8) + - theme_classic() + - theme(axis.text.y = element_text(size=15), axis.text.x = element_text(size=15), legend.text=element_text(size=8), legend.title=element_text(size=10), legend.background = element_blank(), legend.key.size = unit(1, 'lines')) + - labs(x="\nVAF(%)\n", y="Frequency\n") + - coord_cartesian(xlim=c(0,100)) + - guides(fill=guide_legend(title=c("Copy number"))) - -pdf(file=out_file[[5]], width=6, height=6) -print(plot.0) -dev.off() - -plot.0 = ggplot(mutation_summary, aes(x = DP, fill=total_cn)) + - geom_histogram(alpha = .8) + - theme_classic() + - theme(axis.text.y = element_text(size=15), axis.text.x = element_text(size=15), legend.text=element_text(size=8), legend.title=element_text(size=10), legend.background = element_blank(), legend.key.size = unit(1, 'lines')) + - labs(x="\nDP\n", y="Frequency\n") + - guides(fill=guide_legend(title=c("Copy number"))) - -pdf(file=out_file[[6]], width=6, height=6) -print(plot.0) -dev.off() - -plot.0 = ggplot(mutation_summary, aes(x = VAF, y = DP, fill=total_cn)) + - geom_point(alpha=.85, size=2.5, shape=21) + - theme_classic() + - theme(axis.text.y = element_text(size=15), axis.text.x = element_text(size=15), legend.text=element_text(size=8), 
legend.title=element_text(size=10), legend.background = element_blank(), legend.key.size = unit(1, 'lines')) + - labs(x="\nVAF (%)\n", y="DP\n") + - scale_x_log10() + - annotation_logticks(side="b") + - coord_cartesian(xlim=c(5,100)) + - guides(fill=guide_legend(title=c("Copy number"))) - -pdf(file=out_file[[7]], width=6, height=6) -print(plot.0) -dev.off() - - -tmp = mutation_summary %>% - group_by(cluster_id) %>% - summarize( - n = n(), - mean_ccf = mean(ccf), - median_ccf = median(ccf), - std_ccf = sd(ccf), - min_ccf = min(ccf), - max_ccf = max(ccf), - mean_sd = mean(std), - median_sd = median(std), - std_sd = sd(std), - min_sd = min(std), - max_sd = max(std)) - -write_tsv(x=mutation_summary, path=out_file[[8]]) -write_tsv(x=tmp, path=out_file[[9]]) diff --git a/test/clonality/tsvtopyclone.R b/test/clonality/tsvtopyclone.R deleted file mode 100644 index e46b636b..00000000 --- a/test/clonality/tsvtopyclone.R +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("readr")) -suppressPackageStartupMessages(library("dplyr")) -suppressPackageStartupMessages(library("magrittr")) -suppressPackageStartupMessages(library("ggplot2")) - -optList = list(make_option("--sample_name", default = NULL, help = "tumor normal sample name")) - -parser = OptionParser(usage = "%prog [options] mutation_file", option_list = optList) -arguments = parse_args(parser, positional_arguments = T) -opt = arguments$options - -tumor_sample = unlist(strsplit(opt$sample_name, split="_", fixed=TRUE))[1] -normal_sample = unlist(strsplit(opt$sample_name, split="_", fixed=TRUE))[2] - -mutation_summary = read_tsv(file="summary/tsv/mutation_summary.tsv", col_types = cols(.default = col_character())) %>% - type_convert() %>% - filter(TUMOR_SAMPLE==tumor_sample) %>% - filter(NORMAL_SAMPLE==normal_sample) %>% - filter(grepl("mutect", variantCaller, fixed=TRUE)) %>% - filter(NORMAL_MAF==0) %>% - 
filter(TUMOR_MAF>=.05) %>% - filter(TUMOR_DP<=500) %>% - filter(TUMOR_DP>=20) %>% - filter(NORMAL_DP<=500) %>% - filter(NORMAL_DP>=10) %>% - mutate(CHROM = as.numeric(ifelse(CHROM=="X", 23, CHROM))) %>% - mutate(CHROM = as.numeric(ifelse(CHROM=="Y", 24, CHROM))) %>% - filter(CHROM<=22) %>% - mutate(UUID = paste0(CHROM, ":", POS, "_", REF, "_", ALT)) - -load(paste0("facets/cncf/", opt$sample_name, ".Rdata")) -qt = q1 = rep(NA, nrow(mutation_summary)) -for (i in 1:nrow(mutation_summary)) { - x = mutation_summary$CHROM[i] - y = mutation_summary$POS[i] - indx = which(fit$cncf[,"chrom"]==x & (fit$cncf[,"start"]<=y & fit$cncf[,"end"]>=y)) - if (length(indx)!=0) { - qt[i] = fit$cncf[indx,"tcn.em"] - q1[i] = fit$cncf[indx,"lcn.em"] - } -} -fsq = as.numeric(mutation_summary$TUMOR_MAF) -n = as.numeric(mutation_summary$TUMOR_DP) -mutation_id = as.character(mutation_summary$UUID) -var_counts = round(fsq*n) -ref_counts = round((1-fsq)*n) -normal_cn = rep(2, nrow(mutation_summary)) -minor_cn = q1 -major_cn = qt-q1 -sample_summary = data.frame(mutation_id, ref_counts, var_counts, normal_cn, minor_cn, major_cn) -index = apply(sample_summary, 1, function(x) {any(is.na(x))}) -sample_summary = sample_summary[!index,,drop=FALSE] -index = sample_summary[,"major_cn"]==0 -sample_summary = sample_summary[!index,,drop=FALSE] -write.table(sample_summary, paste0("pyclone/", opt$sample_name, "/", tumor_sample,".tsv"), sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE, append=FALSE) - -cat("num_iters: 100000\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = FALSE) -cat("\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat("base_measure_params:\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat(" alpha: 1\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat(" beta: 1\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", 
opt$sample_name, "/config.yaml"), append = TRUE) -cat("concentration:\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat(" value: 1.0\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat(" prior:\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat(" shape: 1.0\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat(" rate: 0.001\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat("density: pyclone_beta_binomial\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat("beta_binomial_precision_params:\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat(" value: 1000\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat(" prior:\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat(" shape: 1.0\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat(" rate: 0.0001\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat(" proposal:\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat(" precision: 0.5\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat(paste0("working_dir: pyclone/",opt$sample_name, "\n"), file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", 
opt$sample_name, "/config.yaml"), append = TRUE) -cat("trace_dir: trace", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat("init_method: connected\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat("samples:\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) - -cat(paste0(" ", tumor_sample, ":\n"), file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat(paste0(" mutations_file: ", tumor_sample, ".yaml\n"), file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat(" tumour_content:\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat(paste0(" value: ", ifelse(is.na(fit$purity), 1.0, signif(fit$purity, 2)),"\n"), file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat("\n", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -cat(" error_rate: 0.01", file=paste0("pyclone/", opt$sample_name, "/config.yaml"), append = TRUE) -system(paste0("source ~/share/usr/anaconda-envs/jrflab-modules-0.1.5/bin/activate ~/share/usr/anaconda-envs/PyClone-0.13.1 && PyClone build_mutations_file --in_file pyclone/", opt$sample_name, "/", tumor_sample, ".tsv --out_file pyclone/", opt$sample_name, "/", tumor_sample, ".yaml --prior parental_copy_number")) diff --git a/test/copy_number/qdnaseqcopynumber.mk b/test/copy_number/qdnaseqcopynumber.mk deleted file mode 100755 index 6a8a9e9c..00000000 --- a/test/copy_number/qdnaseqcopynumber.mk +++ /dev/null @@ -1,29 +0,0 @@ -include modules/Makefile.inc -include modules/genome_inc/b37.inc - -LOGDIR ?= log/qdnaseq_copynumber.$(NOW) -PHONY += qdnaseq qdnaseq/copynumber qdnaseq/copynumber/log2ratio 
qdnaseq/copynumber/segmented qdnaseq/copynumber/pcf - -qdnaseq_copynumber : $(foreach sample,$(SAMPLES),qdnaseq/copynumber/log2ratio/$(sample).pdf qdnaseq/copynumber/segmented/$(sample).RData qdnaseq/copynumber/pcf/$(sample).pdf) - -define qdnaseq-plot-log2ratio -qdnaseq/copynumber/log2ratio/%.pdf : qdnaseq/bed/%.bed - $$(call RUN,-c -v ~/share/usr/anaconda-envs/ascat -s 10G -m 12G,"$(RSCRIPT) modules/test/copy_number/qdnaseqplot.R --sample $$(*) --type 'raw'") -endef - $(foreach sample,$(SAMPLES),\ - $(eval $(call qdnaseq-plot-log2ratio,$(sample)))) - -define qdnaseq-segment-log2ratio -qdnaseq/copynumber/segmented/%.RData : qdnaseq/bed/%.bed - $$(call RUN,-c -v ~/share/usr/anaconda-envs/ascat -s 12G -m 16G,"$(RSCRIPT) modules/test/copy_number/qdnaseqsegment.R --sample $$(*)") - -qdnaseq/copynumber/pcf/%.pdf : qdnaseq/copynumber/segmented/%.RData - $$(call RUN,-c -v ~/share/usr/anaconda-envs/ascat -s 12G -m 16G,"$(RSCRIPT) modules/test/copy_number/qdnaseqplot.R --sample $$(*) --type 'bychromosome' --rho '$${qdnaseq_rho.$1}' --psi '$${qdnaseq_psi.$1}' --gamma '$${qdnaseq_gamma.$1}' && \ - $(RSCRIPT) modules/test/copy_number/qdnaseqplot.R --sample $$(*) --type 'segmented' --rho '$${qdnaseq_rho.$1}' --psi '$${qdnaseq_psi.$1}' --gamma '$${qdnaseq_gamma.$1}'") - -endef - $(foreach sample,$(SAMPLES),\ - $(eval $(call qdnaseq-segment-log2ratio,$(sample)))) - - -.PHONY: $(PHONY) diff --git a/test/copy_number/qdnaseqextract.R b/test/copy_number/qdnaseqextract.R deleted file mode 100755 index 696b84b8..00000000 --- a/test/copy_number/qdnaseqextract.R +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("QDNAseq")) -suppressPackageStartupMessages(library("future")) - -future::plan("multiprocess") -options(mc.cores=16L) - - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -if (!dir.exists("qdnaseq/readcounts")) { - 
dir.create("qdnaseq/readcounts") -} - -if (!dir.exists("qdnaseq/isobars")) { - dir.create("qdnaseq/isobars") -} - -if (!dir.exists("qdnaseq/variance")) { - dir.create("qdnaseq/variance") -} - -if (!dir.exists("qdnaseq/log2ratio")) { - dir.create("qdnaseq/log2ratio") -} - -if (!dir.exists("qdnaseq/bed")) { - dir.create("qdnaseq/bed") -} - -args_list = list(make_option("--sample", default = NA, type = 'character', help = "sample name"), - make_option("--binsize", default = NA, type = 'character', help = "bin size")) - -parser = OptionParser(usage = "%prog", option_list = args_list) -arguments = parse_args(parser, positional_arguments = T) -opt = arguments$options - -if (is.na(as.numeric(opt$binsize))) { - opt$binsize = 30 -} else { - opt$binsize = as.numeric(opt$binsize) -} - -bins = getBinAnnotations(binSize=opt$binsize, genome="hg19") -readCounts = binReadCounts(bins=bins, bamfiles=paste0("bam/", opt$sample, ".bam"), - isPaired=TRUE, - isProperPair=TRUE, - minMapq=30, - pairedEnds=TRUE, - chunkSize=TRUE) - -# read counts versus genomic coordinates -pdf(file=paste0("qdnaseq/readcounts/", opt$sample, ".pdf"), width=14, height=9) -plot(readCounts, logTransform=TRUE, ylim=c(0, 20)) -highlightFilters(readCounts, logTransform=TRUE, residual=TRUE, blacklist=TRUE) -dev.off() - -readCountsFiltered = applyFilters(readCounts, residual=TRUE, blacklist=TRUE) - -# %GC content versus mappability -pdf(file=paste0("qdnaseq/isobars/", opt$sample, ".pdf"), width=7, height=7) -isobarPlot(readCountsFiltered) -dev.off() - -readCountsFiltered = estimateCorrection(readCountsFiltered) - -# noise (variance) versus bin coverage -pdf(file=paste0("qdnaseq/variance/", opt$sample, ".pdf"), width=7, height=7) -noisePlot(readCountsFiltered) -dev.off() - -copyNumbers = correctBins(readCountsFiltered) -copyNumbersNormalized = normalizeBins(copyNumbers) -copyNumbersSmooth = smoothOutlierBins(copyNumbersNormalized) - -# log2 ratio versus genomic coordinates -pdf(file=paste0("qdnaseq/log2ratio/", 
opt$sample, ".pdf"), width=14, height=9) -plot(copyNumbersSmooth, ylim=c(-4,4)) -dev.off() - -# write log2 ratio to file -exportBins(copyNumbersSmooth, file=paste0("qdnaseq/bed/", opt$sample, ".bed"), format="bed") diff --git a/test/copy_number/qdnaseqextract.mk b/test/copy_number/qdnaseqextract.mk deleted file mode 100755 index 7eab2d2c..00000000 --- a/test/copy_number/qdnaseqextract.mk +++ /dev/null @@ -1,22 +0,0 @@ -include modules/Makefile.inc -include modules/genome_inc/b37.inc - -LOGDIR ?= log/qdnaseq_extract.$(NOW) -PHONY += qdnaseq qdnaseq/readcounts qdnaseq/isobars qdnaseq/variance qdnaseq/log2ratio qdnaseq/bed - -qdnaseq_extract : $(foreach sample,$(SAMPLES),qdnaseq/readcounts/$(sample).pdf qdnaseq/isobars/$(sample).pdf qdnaseq/variance/$(sample).pdf qdnaseq/log2ratio/$(sample).pdf qdnaseq/bed/$(sample).bed) - -DEFAULT_ENV = $(HOME)/share/usr/anaconda-envs/jrflab-modules-0.1.6 -QDNASEQ_ENV = $(HOME)/share/usr/anaconda-envs/qdnaseq -QDNASEQ_BINSIZE = 5 - -define qdnaseq-log2ratio -qdnaseq/readcounts/%.pdf qdnaseq/isobars/%.pdf qdnaseq/variance/%.pdf qdnaseq/log2ratio/%.pdf qdnaseq/bed/%.bed : bam/%.bam - $$(call RUN,-c -n 16 -s 2G -m 3G -w 7200 -v $$(DEFAULT_ENV),"source activate $$(QDNASEQ_ENV) && \ - $$(RSCRIPT) modules/test/copy_number/qdnaseqextract.R --sample $$(*) --binsize $(QDNASEQ_BINSIZE)") - -endef - $(foreach sample,$(SAMPLES),\ - $(eval $(call qdnaseq-log2ratio,$(sample)))) - -.PHONY: $(PHONY) diff --git a/test/copy_number/qdnaseqplot.R b/test/copy_number/qdnaseqplot.R deleted file mode 100755 index 86f8ef97..00000000 --- a/test/copy_number/qdnaseqplot.R +++ /dev/null @@ -1,172 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("copynumber")) -suppressPackageStartupMessages(library("colorspace")) -suppressPackageStartupMessages(library("ASCAT")) -load("modules/copy_number/CytoBand.RData") - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); 
q('no', status = 1) })) -} - -args_list = list(make_option("--sample", default = NA, type = 'character', help = "tumor sample"), - make_option("--type", default = NA, type = 'character', help = "type of plot"), - make_option("--rho", default = NA, type = 'numeric', help = "tumor purity"), - make_option("--psi", default = NA, type = 'numeric', help = "tumor ploidy"), - make_option("--gamma", default = NA, type = 'numeric', help = "log2 ratio compression")) - -parser = OptionParser(usage = "%prog", option_list = args_list) -arguments = parse_args(parser, positional_arguments = T) -opt = arguments$options - -opt$rho = ifelse(is.na(as.numeric(opt$rho)), 1, as.numeric(opt$rho)) -opt$psi = ifelse(is.na(as.numeric(opt$psi)), 2, as.numeric(opt$psi)) -opt$gamma = ifelse(is.na(as.numeric(opt$gamma)), 1, as.numeric(opt$gamma)) - -load("modules/copy_number/CytoBand.RData") - -'prunesegments.cn' <- function(x, n=10) -{ - cnm = matrix(NA, nrow=nrow(x), ncol=nrow(x)) - for (j in 1:nrow(x)) { - cnm[,j] = abs(2^x[j,"Log2Ratio"] - 2^x[,"Log2Ratio"]) - } - cnt = hclust(as.dist(cnm), "average") - cnc = cutree(tree=cnt, k=n) - for (j in unique(cnc)) { - indx = which(cnc==j) - if (length(indx)>2) { - mcl = mean(x[indx,"Log2Ratio"]) - scl = sd(x[indx,"Log2Ratio"]) - ind = which(x[indx,"Log2Ratio"]<(mcl+1.96*scl) & x[indx,"Log2Ratio"]>(mcl-1.96*scl)) - x[indx[ind],"Log2Ratio"] = mean(x[indx[ind],"Log2Ratio"]) - } else { - x[indx,"Log2Ratio"] = mean(x[indx,"Log2Ratio"]) - } - } - return(invisible(x)) -} - -if (opt$type=="raw") { - - infile = paste0("qdnaseq/bed/", opt$sample, ".bed") - outfile = paste0("qdnaseq/copynumber/log2ratio/", opt$sample, ".pdf") - data = read.table(file=infile, header=FALSE, sep="\t", skip=1, stringsAsFactors=FALSE)[,c(1,2,3,5),drop=FALSE] - colnames(data) = c("Chromosome", "Start", "End", "Log2Ratio") - pdf(file=outfile, width=10, height=4.25) - par(mar=c(5, 5, 4, 2)+.1) - end = NULL - for (j in 1:22) { - end = c(end, max(CytoBand$End[CytoBand$Chromosome==j])) - 
} - end = cumsum(end) - start = rep(0, 22) - start[2:22] = end[1:21]+1 - for (j in 1:22) { - data[data[,"Chromosome"]==j,"Start"] = data[data[,"Chromosome"]==j,"Start"] + start[j] - } - col = rep("grey75", nrow(data)) - plot(data[,"Start"], data[,"Log2Ratio"], type="p", pch=".", cex=1.95, col=col, axes=FALSE, frame=TRUE, xlab="", ylab="", main="", ylim=c(-4,5)) - axis(2, at = c(-4, -2, 0, 2, 4), labels = c(-4, -2, 0, 2, 4), cex.axis = 1, las = 1) - mtext(side = 2, text = expression(Log[2]~"Ratio"), line = 3.15, cex = 1.25) - for (j in 1:22) { - v = start[j] - abline(v=v, col="goldenrod3", lty=3, lwd=1) - } - abline(v=max(data[,"Start"]), col="goldenrod3", lty=3, lwd=1) - abline(h=0, col="red") - axis(1, at = .5*(start+end), labels=c(1:22), cex.axis = 0.85, las = 1) - rect(xleft=1-1e10, xright=max(data[,"Start"])+1e10, ybottom=4, ytop=6, col="lightgrey", border="black", lwd=1.5) - title(main = opt$sample, line=-1, cex.main=.75, font.main=1) - box(lwd=1.5) - dev.off() - -} else if (opt$type=="segmented") { - - infile = paste0("qdnaseq/copynumber/segmented/", opt$sample, ".RData") - outfile = paste0("qdnaseq/copynumber/pcf/", opt$sample, ".pdf") - load(infile) - - segmented = prunesegments.cn(x=segmented, n=7) - end = NULL - for (j in 1:22) { - end = c(end, max(CytoBand$End[CytoBand$Chromosome==j])) - } - end = cumsum(end) - start = rep(0, 22) - start[2:22] = end[1:21]+1 - for (j in 1:22) { - segmented[segmented[,"Chromosome"]==j,"Start"] = segmented[segmented[,"Chromosome"]==j,"Start"] + start[j] - segmented[segmented[,"Chromosome"]==j,"End"] = segmented[segmented[,"Chromosome"]==j,"End"] + start[j] - data[data[,"Chromosome"]==j,"Start"] = data[data[,"Chromosome"]==j,"Start"] + start[j] - } - col = "grey75" - pdf(file=outfile, width=10, height=4.25) - par(mar=c(5, 5, 4, 2)+.1) - plot(data[,"Start"], data[,"Log2Ratio"], type="p", pch=".", cex=1.95, col=col, axes=FALSE, frame=TRUE, xlab="", ylab="", main="", ylim=c(-4,5)) - axis(2, at = c(-4, -2, 0, 2, 4), labels = 
c(-4, -2, 0, 2, 4), cex.axis = 1, las = 1) - for (j in 1:nrow(segmented)) { - lines(x=c(segmented[j,"Start"], segmented[j,"End"]), y=rep(segmented[j,"Log2Ratio"],2), lty=1, lwd=2.75, col="red") - } - mtext(side = 2, text = expression(Log[2]~"Ratio"), line = 3.15, cex = 1.25) - for (j in 1:22) { - v = start[j] - abline(v=v, col="goldenrod3", lty=3, lwd=1) - } - abline(v=max(data[,"Start"]), col="goldenrod3", lty=3, lwd=1) - abline(h=0, col="red") - axis(1, at = .5*(start+end), labels=c(1:22), cex.axis = 0.85, las = 1) - rect(xleft=1-1e10, xright=max(data[,"Start"])+1e10, ybottom=4, ytop=6, col="lightgrey", border="black", lwd=1.5) - title(main = opt$sample, line=-1, cex.main=.75, font.main=1) - for (k in c(1,2,3,4,6,9)) { - abline(h=(opt$gamma*log2(((opt$rho)*k + (1-opt$rho)*2)/((opt$rho)*opt$psi + (1-opt$rho)*2))), col="brown", lty=3, cex=.5) - mtext(text=k, side=4, line=.5, at=(opt$gamma*log2(((opt$rho)*k + (1-opt$rho)*2)/((opt$rho)*opt$psi + (1-opt$rho)*2))), las=2, cex=.5, col="brown") - } - box(lwd=1.5) - dev.off() - -} else if (opt$type=="bychromosome") { - - infile = paste0("qdnaseq/copynumber/segmented/", opt$sample, ".RData") - if (!dir.exists("qdnaseq/copynumber/bychr/")) { - dir.create("qdnaseq/copynumber/bychr/") - } - if (!dir.exists(paste0("qdnaseq/copynumber/bychr/", opt$sample, "/"))) { - dir.create(paste0("qdnaseq/copynumber/bychr/", opt$sample, "/")) - } - load(infile) - segmented = prunesegments.cn(x=segmented, n=7) - for (ii in 1:22) { - pdf(file=paste0("qdnaseq/copynumber/bychr/", opt$sample, "/", ii, ".pdf")) - zz = split.screen(figs=matrix(c(0,1,.15,1, 0.065,.975,0.1,.4), nrow=2, ncol=4, byrow=TRUE)) - screen(zz[1]) - par(mar = c(6.1, 6, 4.1, 3)) - start = 1 - end = max(CytoBand[CytoBand[,"Chromosome"]==ii,"End"]) - plot(1, 1, type="n", xlim=c(start,end), ylim=c(-4,4), xlab="", ylab="", main="", frame.plot=FALSE, axes=FALSE) - index = data[,"Chromosome"]==ii - points(data[index,"Start"], data[index,"Log2Ratio"], type="p", pch=".", cex=1.15, 
col="grey75") - tmp = subset(segmented, segmented[,"Chromosome"]==ii) - for (i in 1:nrow(tmp)) { - points(c(tmp[i,"Start"], tmp[i,"End"]), rep(tmp[i,"Log2Ratio"],2), type="l", col="red", lwd=4) - } - for (i in 1:(nrow(tmp)-1)) { - points(c(tmp[i,"End"], tmp[i+1,"Start"]), c(tmp[i,"Log2Ratio"],tmp[i+1,"Log2Ratio"]), type="l", col="red", lwd=1) - } - abline(h=0, lwd=1) - axis(2, at = c(-4,-2,0,2,4), labels=c("-4","-2","0","2", "4"), cex.axis = 1.25, las = 1, lwd=1.5, lwd.ticks=1.35) - mtext(side = 2, text = expression("Log"[2]~"Ratio"), line = 4, cex = 1.5) - for (k in c(1,2,3,4,6,9)) { - abline(h=(opt$gamma*log2(((opt$rho)*k + (1-opt$rho)*2)/((opt$rho)*opt$psi + (1-opt$rho)*2))), col="darkorange", lty=3) - mtext(text=k, side=4, line=.5, at=(opt$gamma*log2(((opt$rho)*k + (1-opt$rho)*2)/((opt$rho)*opt$psi + (1-opt$rho)*2))), las=2, cex=.75, col="darkorange") - } - box(lwd=2) - screen(zz[2]) - arg = copynumber:::getPlotParameters(type = "sample", nSeg = 10, cr = 3 * 3, sampleID = "dummy", plot.ideo = TRUE, xaxis = TRUE, assembly = "hg19") - copynumber:::plotIdeogram(chrom=ii, TRUE, cyto.data = arg$assembly, cex = .75, unit = "bp") - close.screen(all.screens=TRUE) - dev.off() - } - -} diff --git a/test/copy_number/qdnaseqsegment.R b/test/copy_number/qdnaseqsegment.R deleted file mode 100644 index 459cdaf0..00000000 --- a/test/copy_number/qdnaseqsegment.R +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("copynumber")) -suppressPackageStartupMessages(library("colorspace")) -suppressPackageStartupMessages(library("ASCAT")) - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list(make_option("--sample", default = NA, type = 'character', help = "sample name")) - -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options 
- -infile = paste0("qdnaseq/bed/", opt$sample, ".bed") -outfile = paste0("qdnaseq/copynumber/segmented/", opt$sample, ".RData") -data = read.table(file=infile, header=FALSE, sep="\t", skip=1, stringsAsFactors=FALSE)[,c(1,2,3,5),drop=FALSE] -colnames(data) = c("Chromosome", "Start", "End", "Log2Ratio") -segmented = pcf(data=winsorize(data=data[,c("Chromosome", "Start", "Log2Ratio"),drop=FALSE], method="mad", tau=2.5, k=25, verbose=FALSE), kmin = 100, gamma = 150, fast=FALSE, verbose=FALSE)[,2:7,drop=FALSE] -colnames(segmented) = c("Chromosome", "Arm", "Start", "End", "N", "Log2Ratio") -save(data, segmented, file=outfile) diff --git a/test/phylogeny/bootstrapmedicc.R b/test/phylogeny/bootstrapmedicc.R deleted file mode 100755 index 41d94e00..00000000 --- a/test/phylogeny/bootstrapmedicc.R +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list(make_option("--sample_set", default = NA, type = 'character', help = "sample names set")) - -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -if (!dir.exists(paste0("medicc/boot/allele_specific/", opt$sample_set))) { - dir.create(paste0("medicc/boot/allele_specific/", opt$sample_set)) -} - -load(paste0("medicc/aspcf/", opt$sample_set, ".RData")) -q1 = qt-q2 -index = !apply(q2, 1, function(x) { any(is.na(x)) }) & !apply(q1, 1, function(x) { any(is.na(x)) }) -q2 = q2[index,,drop=FALSE] -q1 = q1[index,,drop=FALSE] -tmp = tmp[index,,drop=FALSE] -q2[q2>4] = 4 -q1[q1>4] = 4 - -if (ncol(q2)<3) { - q1x = q1 - colnames(q1x) = paste0(colnames(q1), "_pad00") - q1 = cbind(q1, q1x) - q2x = q2 - colnames(q2x) = paste0(colnames(q2), "_pad00") - q2 = cbind(q2, q2x) -} - -set.seed(0) -for (ii in 1:100) { - n = nchar(ii) - if (n==1) { - n = paste0("00", ii) - } else if 
(n==2) { - n = paste0("0", ii) - } else { - n = ii - } - index = order(sample(x=1:nrow(tmp), size=nrow(tmp), replace=TRUE)) - q2_b = q2[index,,drop=FALSE] - q1_b = q1[index,,drop=FALSE] - tmp_b = tmp[index,,drop=FALSE] - desc = cbind(paste0("chrom", unique(tmp_b[,"Chromosome"])), - paste0("major_chr", unique(tmp_b[,"Chromosome"]), ".fasta"), - paste0("minor_chr", unique(tmp_b[,"Chromosome"]), ".fasta")) - if (!dir.exists(paste0("medicc/boot/allele_specific/", opt$sample_set, "/", n))) { - dir.create(paste0("medicc/boot/allele_specific/", opt$sample_set, "/", n)) - } - write.table(desc, file=paste0("medicc/boot/allele_specific/", opt$sample_set, "/", n, "/desc.txt"), sep=" ", col.names=FALSE, row.names=FALSE, quote=FALSE, append=FALSE) - for (i in unique(tmp[,"Chromosome"])) { - cat(">diploid\n", file=paste0("medicc/boot/allele_specific/", opt$sample_set, "/", n, "/major_chr", i, ".fasta"), append=FALSE) - cat(paste0(rep(1, sum(tmp[,"Chromosome"]==i)), collapse=""), "\n", file=paste0("medicc/boot/allele_specific/", opt$sample_set, "/", n, "/major_chr", i, ".fasta"), append=TRUE) - for (j in 1:ncol(q2_b)) { - cat(paste0(">", gsub("-", "_", colnames(q2_b)[j]), "\n"), file=paste0("medicc/boot/allele_specific/", opt$sample_set, "/", n, "/major_chr", i, ".fasta"), append=TRUE) - cat(paste0(q2_b[tmp[,"Chromosome"]==i,j], collapse=""), "\n", file=paste0("medicc/boot/allele_specific/", opt$sample_set, "/", n, "/major_chr", i, ".fasta"), append=TRUE) - } - cat(">diploid\n", file=paste0("medicc/boot/allele_specific/", opt$sample_set, "/", n, "/minor_chr", i, ".fasta"), append=FALSE) - cat(paste0(rep(1, sum(tmp[,"Chromosome"]==i)), collapse=""), "\n", file=paste0("medicc/boot/allele_specific/", opt$sample_set, "/", n, "/minor_chr", i, ".fasta"), append=TRUE) - for (j in 1:ncol(q1_b)) { - cat(paste0(">", gsub("-", "_", colnames(q1_b)[j]), "\n"), file=paste0("medicc/boot/allele_specific/", opt$sample_set, "/", n, "/minor_chr", i, ".fasta"), append=TRUE) - 
cat(paste0(q1_b[tmp[,"Chromosome"]==i,j], collapse=""), "\n", file=paste0("medicc/boot/allele_specific/", opt$sample_set, "/", n, "/minor_chr", i, ".fasta"), append=TRUE) - } - } - if (ii==100) { - cat("done!", file=paste0("medicc/boot/allele_specific/", opt$sample_set, "/init.timestamp")) - } -} diff --git a/test/phylogeny/combinesamples.R b/test/phylogeny/combinesamples.R deleted file mode 100644 index 776a708b..00000000 --- a/test/phylogeny/combinesamples.R +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list( - make_option("--sample_set", default = NA, type = 'character', help = "sample names set"), - make_option("--normal_samples", default = NA, type = 'character', help = "normal samples"), - make_option("--type", default = NA, type = 'character', help = "allele specific or total copy") - ) - -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -all_samples = na.omit(unlist(strsplit(opt$sample_set, split="_", fixed=TRUE))) -normal_samples = na.omit(unlist(strsplit(opt$normal_samples, split=" ", fixed=TRUE))) -normal_samples = normal_samples[normal_samples %in% all_samples] -tumor_samples = all_samples[!(all_samples %in% normal_samples)] - -if (opt$type=="allele_specific") { - - CN = list() - for (i in 1:length(tumor_samples)) { - load(paste0("facets/cncf/", tumor_samples[i], "_", normal_samples, ".Rdata")) - CN[[i]] = out2$jointseg[,c("chrom", "maploc", "cnlr", "vafT", "het"),drop=FALSE] - colnames(CN[[i]]) = c("Chromosome", "Position", "Log2Ratio", "BAF", "Genotype") - } - index = lapply(CN, function(x) {paste0(x[,1], ":", x[,2])}) - featureNames = unique(unlist(index)) - for (i in 1:length(index)) { - featureNames = intersect(featureNames, index[[i]]) - } - chr = 
as.numeric(unlist(lapply(strsplit(featureNames, ":", fixed=TRUE), function(x) { x[1] }))) - pos = as.numeric(unlist(lapply(strsplit(featureNames, ":", fixed=TRUE), function(x) { x[2] }))) - index = order(pos, decreasing=FALSE) - chr = chr[index] - pos = pos[index] - index = order(chr, decreasing=FALSE) - chr = chr[index] - pos = pos[index] - featureNames = paste0(chr, ":", pos) - for (i in 1:length(CN)) { - rownames(CN[[i]]) = paste0(CN[[i]][,1], ":", CN[[i]][,2]) - CN[[i]] = CN[[i]][featureNames,,drop=FALSE] - } - Log2Ratio = do.call(cbind, lapply(CN, function(x) { return(x[,"Log2Ratio"]) } )) - BAF = do.call(cbind, lapply(CN, function(x) { return(x[,"BAF"]) } )) - Genotype = do.call(cbind, lapply(CN, function(x) { return(x[,"Genotype"]) } )) - annotation = data.frame(Chromosome=chr, - Position=pos) - colnames(Log2Ratio) = colnames(BAF) = tumor_samples - save(Log2Ratio, BAF, Genotype, annotation, file=paste0("medicc/allele_specific/mad/", opt$sample_set, ".RData")) - -} else if (opt$type=="total_copy") { - - CN = list() - for (i in 1:length(tumor_samples)) { - load(paste0("facets/cncf/", tumor_samples[i], "_", normal_samples, ".Rdata")) - CN[[i]] = out2$jointseg[,c("chrom", "maploc", "cnlr"),drop=FALSE] - colnames(CN[[i]]) = c("Chromosome", "Position", "Log2Ratio") - } - index = lapply(CN, function(x) {paste0(x[,1], ":", x[,2])}) - featureNames = unique(unlist(index)) - for (i in 1:length(index)) { - featureNames = intersect(featureNames, index[[i]]) - } - chr = as.numeric(unlist(lapply(strsplit(featureNames, ":", fixed=TRUE), function(x) { x[1] }))) - pos = as.numeric(unlist(lapply(strsplit(featureNames, ":", fixed=TRUE), function(x) { x[2] }))) - index = order(pos, decreasing=FALSE) - chr = chr[index] - pos = pos[index] - index = order(chr, decreasing=FALSE) - chr = chr[index] - pos = pos[index] - featureNames = paste0(chr, ":", pos) - for (i in 1:length(CN)) { - rownames(CN[[i]]) = paste0(CN[[i]][,1], ":", CN[[i]][,2]) - CN[[i]] = 
CN[[i]][featureNames,,drop=FALSE] - } - Log2Ratio = do.call(cbind, lapply(CN, function(x) { return(x[,"Log2Ratio"]) } )) - annotation = data.frame(Chromosome=chr, - Position=pos) - colnames(Log2Ratio) = tumor_samples - save(Log2Ratio, annotation, file=paste0("medicc/total_copy/mad/", opt$sample_set, ".RData")) - -} \ No newline at end of file diff --git a/test/phylogeny/initmedicc.R b/test/phylogeny/initmedicc.R deleted file mode 100755 index 22984e30..00000000 --- a/test/phylogeny/initmedicc.R +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list( - make_option("--sample_set", default = NA, type = 'character', help = "sample names set"), - make_option("--type", default = NA, type = 'character', help = "allele specific or total copy") - ) - -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -if (opt$type=="allele_specific") { - - load(paste0("medicc/allele_specific/aspcf/", opt$sample_set, ".RData")) - q1 = qt-q2 - index = !apply(q2, 1, function(x) { any(is.na(x)) }) & !apply(q1, 1, function(x) { any(is.na(x)) }) - q2 = q2[index,,drop=FALSE] - q1 = q1[index,,drop=FALSE] - tmp = tmp[index,,drop=FALSE] - q2[q2>4] = 4 - q1[q1>4] = 4 - - if (ncol(q2)<3) { - q1x = q1 - colnames(q1x) = paste0(colnames(q1), "_pad00") - q1 = cbind(q1, q1x) - q2x = q2 - colnames(q2x) = paste0(colnames(q2), "_pad00") - q2 = cbind(q2, q2x) - } - - desc = cbind(paste0("chrom", unique(tmp[,"Chromosome"])), - paste0("major_chr", unique(tmp[,"Chromosome"]), ".fasta"), - paste0("minor_chr", unique(tmp[,"Chromosome"]), ".fasta")) - write.table(desc, file=paste0("medicc/allele_specific/medicc/", opt$sample_set, "/desc.txt"), sep=" ", col.names=FALSE, row.names=FALSE, quote=FALSE, append=FALSE) - for (i in 
unique(tmp[,"Chromosome"])) { - cat(">diploid\n", file=paste0("medicc/allele_specific/medicc/", opt$sample_set, "/major_chr", i, ".fasta"), append=FALSE) - cat(paste0(rep(1, sum(tmp[,"Chromosome"]==i)), collapse=""), "\n", file=paste0("medicc/allele_specific/medicc/", opt$sample_set, "/major_chr", i, ".fasta"), append=TRUE) - for (j in 1:ncol(q2)) { - cat(paste0(">", gsub("-", "_", colnames(q2)[j]), "\n"), file=paste0("medicc/allele_specific/medicc/", opt$sample_set, "/major_chr", i, ".fasta"), append=TRUE) - cat(paste0(q2[tmp[,"Chromosome"]==i,j], collapse=""), "\n", file=paste0("medicc/allele_specific/medicc/", opt$sample_set, "/major_chr", i, ".fasta"), append=TRUE) - } - - - cat(">diploid\n", file=paste0("medicc/allele_specific/medicc/", opt$sample_set, "/minor_chr", i, ".fasta"), append=FALSE) - cat(paste0(rep(1, sum(tmp[,"Chromosome"]==i)), collapse=""), "\n", file=paste0("medicc/allele_specific/medicc/", opt$sample_set, "/minor_chr", i, ".fasta"), append=TRUE) - for (j in 1:ncol(q1)) { - cat(paste0(">", gsub("-", "_", colnames(q1)[j]), "\n"), file=paste0("medicc/allele_specific/medicc/", opt$sample_set, "/minor_chr", i, ".fasta"), append=TRUE) - cat(paste0(q1[tmp[,"Chromosome"]==i,j], collapse=""), "\n", file=paste0("medicc/allele_specific/medicc/", opt$sample_set, "/minor_chr", i, ".fasta"), append=TRUE) - } - } -} else if (opt$type=="total_copy") { - - load(paste0("medicc/total_copy/mpcf/", opt$sample_set, ".RData")) - ploidy = round(apply(((tmp[,"End"]-tmp[,"Start"])*qt)/sum(tmp[,"End"]-tmp[,"Start"]), 2, sum)) - ploidy[ploidy>=4] = 4 - ploidy[ploidy<=2] = 2 - if (length(unique(ploidy))>1) { - index = which(ploidy==4) - - qt_4n = ceiling(apply(qt[,index,drop=FALSE], 1, mean)/2)*2 - qt_4n[qt_4n==0 & apply(qt[,index,drop=FALSE], 1, mean)!=0] = 1 - qt_2n = round(qt_4n/2) - qt_2n[qt_2n==0 & apply(qt[,index,drop=FALSE], 1, mean)!=0] = 1 - qt = cbind(qt, diploid_ancestor=qt_2n, tetraploid_ancestor=qt_4n) - - q2_4n = ceiling(apply(q2[,index,drop=FALSE], 1, 
mean)/2)*2 - q2_4n[q2_4n==0 & apply(q2[,index,drop=FALSE], 1, mean)!=0] = 1 - q2_2n = round(q2_4n/2) - q2_2n[q2_2n==0 & apply(q2[,index,drop=FALSE], 1, mean)!=0] = 1 - q2 = cbind(q2, diploid_ancestor=q2_2n, tetraploid_ancestor=q2_4n) - - } - - q1 = qt-q2 - index = !apply(q2, 1, function(x) { any(is.na(x)) }) & !apply(q1, 1, function(x) { any(is.na(x)) }) - q2 = q2[index,,drop=FALSE] - q1 = q1[index,,drop=FALSE] - tmp = tmp[index,,drop=FALSE] - - q2[q2>4] = 4 - q1[q1>4] = 4 - - if (ncol(q2)<3) { - q1x = q1 - colnames(q1x) = paste0(colnames(q1), "_pad00") - q1 = cbind(q1, q1x) - q2x = q2 - colnames(q2x) = paste0(colnames(q2), "_pad00") - q2 = cbind(q2, q2x) - } - - desc = cbind(paste0("chrom", unique(tmp[,"Chromosome"])), - paste0("major_chr", unique(tmp[,"Chromosome"]), ".fasta"), - paste0("minor_chr", unique(tmp[,"Chromosome"]), ".fasta")) - write.table(desc, file=paste0("medicc/total_copy/medicc/", opt$sample_set, "/desc.txt"), sep=" ", col.names=FALSE, row.names=FALSE, quote=FALSE, append=FALSE) - for (i in unique(tmp[,"Chromosome"])) { - cat(">diploid\n", file=paste0("medicc/total_copy/medicc/", opt$sample_set, "/major_chr", i, ".fasta"), append=FALSE) - cat(paste0(rep(1, sum(tmp[,"Chromosome"]==i)), collapse=""), "\n", file=paste0("medicc/total_copy/medicc/", opt$sample_set, "/major_chr", i, ".fasta"), append=TRUE) - for (j in 1:ncol(q2)) { - cat(paste0(">", gsub("-", "_", colnames(q2)[j]), "\n"), file=paste0("medicc/total_copy/medicc/", opt$sample_set, "/major_chr", i, ".fasta"), append=TRUE) - cat(paste0(q2[tmp[,"Chromosome"]==i,j], collapse=""), "\n", file=paste0("medicc/total_copy/medicc/", opt$sample_set, "/major_chr", i, ".fasta"), append=TRUE) - } - - - cat(">diploid\n", file=paste0("medicc/total_copy/medicc/", opt$sample_set, "/minor_chr", i, ".fasta"), append=FALSE) - cat(paste0(rep(1, sum(tmp[,"Chromosome"]==i)), collapse=""), "\n", file=paste0("medicc/total_copy/medicc/", opt$sample_set, "/minor_chr", i, ".fasta"), append=TRUE) - for (j in 
1:ncol(q1)) { - cat(paste0(">", gsub("-", "_", colnames(q1)[j]), "\n"), file=paste0("medicc/total_copy/medicc/", opt$sample_set, "/minor_chr", i, ".fasta"), append=TRUE) - cat(paste0(q1[tmp[,"Chromosome"]==i,j], collapse=""), "\n", file=paste0("medicc/total_copy/medicc/", opt$sample_set, "/minor_chr", i, ".fasta"), append=TRUE) - } - } -} diff --git a/test/phylogeny/plotmedicc.R b/test/phylogeny/plotmedicc.R deleted file mode 100755 index 417f4b98..00000000 --- a/test/phylogeny/plotmedicc.R +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("ape")) -suppressPackageStartupMessages(library("foreach")) -suppressPackageStartupMessages(library("parallel")) -suppressPackageStartupMessages(library("doMC")) -suppressPackageStartupMessages(library("stringr")) -suppressPackageStartupMessages(library("phytools")) - -registerDoMC(12) - - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list( - make_option("--sample_set", default = NA, type = 'character', help = "sample names set"), - make_option("--type", default = NA, type = 'character', help = "allele specific or total copy") - ) - -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -if (opt$type=="allele_specific") { - - phylo_tree = read.tree(file=paste0("medicc/allele_specific/medicc/", opt$sample_set, "/tree_final.new")) - tip_labels = phylo_tree$tip.label - index = grep("pad00", tip_labels) - if (length(index)!=0) { - phylo_tree = drop.tip(phy=phylo_tree, tip=tip_labels[index], trim.internal=TRUE, rooted=FALSE) - } - phylo_tree = root(phylo_tree, outgroup="diploid") - - pdf(file=paste0("medicc/allele_specific/medicc/", opt$sample_set, "/tree_final.pdf"), height=7, width=7) - plotTree(tree=phylo_tree, color="#8CC63F", lwd=3, offset=1) - 
edgelabels(text=paste0(phylo_tree$edge.length, " "), cex=.75) - dev.off() - -} else if (opt$type=="total_copy") { - - phylo_tree = read.tree(file=paste0("medicc/total_copy/medicc/", opt$sample_set, "/tree_final.new")) - tip_labels = phylo_tree$tip.label - index = grep("pad00", tip_labels) - if (length(index)!=0) { - phylo_tree = drop.tip(phy=phylo_tree, tip=tip_labels[index], trim.internal=TRUE, rooted=FALSE) - } - phylo_tree = root(phylo_tree, outgroup="diploid") - - pdf(file=paste0("medicc/total_copy/medicc/", opt$sample_set, "/tree_final.pdf"), height=7, width=7) - plotTree(tree=phylo_tree, color="#8CC63F", lwd=3, offset=1) - edgelabels(text=paste0(phylo_tree$edge.length, " "), cex=.75) - dev.off() - -} diff --git a/test/phylogeny/plotratchet.R b/test/phylogeny/plotratchet.R deleted file mode 100755 index ac34dc2c..00000000 --- a/test/phylogeny/plotratchet.R +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("ape")) -suppressPackageStartupMessages(library("foreach")) -suppressPackageStartupMessages(library("parallel")) -suppressPackageStartupMessages(library("doMC")) -suppressPackageStartupMessages(library("stringr")) -suppressPackageStartupMessages(library("phytools")) - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list( - make_option("--sample_set", default = NA, type = 'character', help = "sample names set") - ) - -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -load(paste0("pratchet/", opt$sample_set, "/tree_final.RData")) - -pdf(file=paste0("pratchet/", opt$sample_set, "/tree_final.pdf"), height=7, width=7) -plot.phylo(x=phy_tree_w_bl, edge.color="#8CC63F", edge.width=3, label.offset=1) -nodelabels(node=1:phy_tree_w_bl$Nnode+Ntip(phy_tree_w_bl), - pie = 
cbind(as.numeric(phy_tree_w_bl$node.label),100-as.numeric(phy_tree_w_bl$node.label)), - piecol = c("goldenrod3","grey85"), - cex = 1) -edgelabels(text=paste0(phy_tree_w_bl$edge.length, " "), cex=.75) -dev.off() diff --git a/test/phylogeny/pratchet.R b/test/phylogeny/pratchet.R deleted file mode 100755 index 46a6af76..00000000 --- a/test/phylogeny/pratchet.R +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("ape")) -suppressPackageStartupMessages(library("phangorn")) -suppressPackageStartupMessages(library("readr")) -suppressPackageStartupMessages(library("dplyr")) -suppressPackageStartupMessages(library("magrittr")) - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list( - make_option("--sample_set", default = NA, type = 'character', help = "sample names set"), - make_option("--normal_samples", default = NA, type = 'character', help = "normal samples") - ) - -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -all_samples = na.omit(unlist(strsplit(opt$sample_set, split="_", fixed=TRUE))) -normal_samples = na.omit(unlist(strsplit(opt$normal_samples, split=" ", fixed=TRUE))) -normal_samples = normal_samples[normal_samples %in% all_samples] -tumor_samples = all_samples[!(all_samples %in% normal_samples)] - -mutation_summary = read_tsv(file=paste0("sufam/", opt$sample_set, ".tsv"), col_types = cols(.default = col_character())) %>% - type_convert() - -mutation_binary = as.data.frame(mutation_summary[,paste0("CALL_", c(tumor_samples, normal_samples)),drop=FALSE]) -colnames(mutation_binary) = gsub("CALL_", "", colnames(mutation_binary)) - -phy_data = as.phyDat(mutation_binary, type="USER", levels=c(0,1)) -phy_tree = pratchet(data=phy_data) -phy_tree_w_bl = acctran(tree=phy_tree, data=phy_data) -phy_tree_w_bl 
= root(phy_tree_w_bl, outgroup=normal_samples) - -'bootstrap_data' <- function(x, N=100) -{ - y = list() - for (i in 1:N) { - index = sample(1:nrow(x), size=nrow(x), replace=TRUE) - y[[i]] = x[index,,drop=FALSE] - } - return(y) -} - - -phy_tree_w_bl_boot = list() -mutation_binary_boot = bootstrap_data(x=mutation_binary) -for (i in 1:length(mutation_binary_boot)) { - phy_data = as.phyDat(mutation_binary_boot[[i]], type="USER", levels=c(0,1)) - phy_tree = pratchet(data=phy_data) - phy_tree_w_bl_boot[[i]] = acctran(tree=phy_tree, data=phy_data) - phy_tree_w_bl_boot[[i]] = root(phy_tree_w_bl_boot[[i]], outgroup=normal_samples) -} - -class(phy_tree_w_bl) = "phylo" -class(phy_tree_w_bl_boot) = "multiPhylo" -node_labels = prop.clades(phy_tree_w_bl, phy_tree_w_bl_boot, rooted=TRUE) -phy_tree_w_bl$node.label = node_labels -save(list=ls(all=TRUE), file=paste0("pratchet/", opt$sample_set, "/tree_final.RData")) diff --git a/test/phylogeny/segmentsamples.R b/test/phylogeny/segmentsamples.R deleted file mode 100755 index 253f6bd0..00000000 --- a/test/phylogeny/segmentsamples.R +++ /dev/null @@ -1,220 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("copynumber")) -suppressPackageStartupMessages(library("colorspace")) -suppressPackageStartupMessages(library("ASCAT")) - - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list( - make_option("--sample_set", default = NA, type = 'character', help = "sample names set"), - make_option("--normal_samples", default = NA, type = 'character', help = "normal samples"), - make_option("--gamma", default = NA, type = 'character', help = "segmentation parameter gamma"), - make_option("--nlog2", default = NA, type = 'character', help = "number of clusters in Log2 ratio"), - make_option("--nbaf", default = NA, type = 'character', help = "number of clusters in BAF"), - make_option("--type", default = NA, 
type = 'character', help = "allele specific or total copy") - ) - -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -all_samples = na.omit(unlist(strsplit(opt$sample_set, split="_", fixed=TRUE))) -normal_samples = na.omit(unlist(strsplit(opt$normal_samples, split=" ", fixed=TRUE))) -normal_samples = normal_samples[normal_samples %in% all_samples] -tumor_samples = all_samples[!(all_samples %in% normal_samples)] - -if (opt$type=="allele_specific") { - - load(paste0("medicc/allele_specific/mad/", opt$sample_set, ".RData")) - gamma = ifelse(is.na(as.numeric(opt$gamma)), 50, as.numeric(opt$gamma)) - nlog2 = ifelse(is.na(as.numeric(opt$nlog2)), 10, as.numeric(opt$nlog2)) - nbaf = ifelse(is.na(as.numeric(opt$nbaf)), 15, as.numeric(opt$nbaf)) - index = apply(Genotype, 1, function(x) {sum(x==1)==length(x)}) - Log2Ratio = Log2Ratio[index,,drop=FALSE] - BAF = BAF[index,,drop=FALSE] - annotation = annotation[index,,drop=FALSE] - colnames(Log2Ratio) = paste0("Log2Ratio_", colnames(Log2Ratio)) - colnames(BAF) = paste0("BAF_", colnames(BAF)) - index = BAF>.5 - BAF[index] = 1 - BAF[index] - CN_and_BAF = cbind(annotation, Log2Ratio, BAF) - tmp = NULL - for (i in 1:23) { - cn_and_baf = subset(CN_and_BAF, CN_and_BAF[,"Chromosome"]==i) - x = try(multipcf(data=winsorize(data=cn_and_baf, method="mad", tau=2.5, k=15, verbose=FALSE), gamma=gamma, normalize=FALSE, fast=FALSE, verbose=FALSE), silent=TRUE) - if (!("try-error" %in% is(x))) { - colnames(x)[1:5] = c("Chromosome", "Arm", "Start", "End", "N") - tmp = rbind(tmp, x) - } - } - CN_and_BAF = subset(CN_and_BAF, CN_and_BAF[,"Chromosome"] %in% tmp[,"Chromosome"]) - qt = q2 = matrix(NA, nrow=nrow(tmp), ncol=length(tumor_samples)) - colnames(qt) = colnames(q2) = tumor_samples - for (i in 1:length(tumor_samples)) { - ascat = new.env() - load(paste0("ascat/ascat/", tumor_samples[i], "_", normal_samples, ".RData"), envir=ascat) - - 
'prunesegments.cn' <- function(x, n=10) - { - cnm = matrix(NA, nrow=length(x), ncol=length(x)) - for (j in 1:length(x)) { - cnm[,j] = abs(2^x[j] - 2^x) - } - cnt = hclust(as.dist(cnm), "average") - cnc = cutree(tree=cnt, k=n) - for (j in unique(cnc)) { - indx = which(cnc==j) - if (length(indx)>2) { - mcl = mean(x[indx]) - scl = sd(x[indx]) - ind = which(x[indx]<(mcl+1.96*scl) & x[indx]>(mcl-1.96*scl)) - x[indx[ind]] = mean(x[indx[ind]]) - } else { - x[indx] = mean(x[indx]) - } - } - return(x) - } - - 'prunesegments.baf' <- function(x, n=10) - { - cnm = matrix(NA, nrow=length(x), ncol=length(x)) - for (j in 1:length(x)) { - cnm[,j] = abs(2^x[j] - 2^x) - } - cnt = hclust(as.dist(cnm), "average") - cnc = cutree(tree=cnt, k=n) - for (j in unique(cnc)) { - indx = which(cnc==j) - if (length(indx)>2) { - mcl = mean(x[indx]) - scl = sd(x[indx]) - ind = which(x[indx]<(mcl+1.96*scl) & x[indx]>(mcl-1.96*scl)) - x[indx[ind]] = mean(x[indx[ind]]) - } else { - x[indx] = mean(x[indx]) - } - } - return(x) - } - tmp[,paste0("Log2Ratio_", tumor_samples[i])] = prunesegments.cn(x=tmp[,paste0("Log2Ratio_", tumor_samples[i])], n=nlog2) - tmp[,paste0("BAF_", tumor_samples[i])] = prunesegments.baf(x=tmp[,paste0("BAF_", tumor_samples[i])], n=nbaf) - - Tumor_LogR = as.numeric(CN_and_BAF[,paste0("Log2Ratio_", tumor_samples[i])]) - Tumor_BAF = as.numeric(CN_and_BAF[,paste0("BAF_", tumor_samples[i])]) - Tumor_LogR_segmented = rep(tmp[,paste0("Log2Ratio_", tumor_samples[i])], times=tmp[,"N"]) - Tumor_BAF_segmented = rep(tmp[,paste0("BAF_", tumor_samples[i])], times=tmp[,"N"]) - SNPpos = CN_and_BAF[,c("Chromosome", "Position"), drop=FALSE] - names(Tumor_LogR) = names(Tumor_BAF) = names(Tumor_LogR_segmented) = names(Tumor_BAF_segmented) = rownames(SNPpos) = paste0("chr", CN_and_BAF[,"Chromosome"], ":", CN_and_BAF[,"Position"]) - colnames(SNPpos) = c("chrs", "pos") - ch = list() - j = 1 - for (j in 1:length(unique(CN_and_BAF[,"Chromosome"]))) { - index = 
which(CN_and_BAF[,"Chromosome"]==(unique(CN_and_BAF[,"Chromosome"]))[j]) - ch[[j]] = index - j = j + 1 - } - chr = ch - chrs = unique(CN_and_BAF[,"Chromosome"]) - gender = "2323" - sexchromosomes = c(23, 24) - tmp2 = list(Tumor_LogR=Tumor_LogR, - Tumor_BAF=Tumor_BAF, - Tumor_LogR_segmented=Tumor_LogR_segmented, - Tumor_BAF_segmented=Tumor_BAF_segmented, - SNPpos=SNPpos, - chromosomes=ch, - chrnames=chrs, - gender=gender, - sexchromosomes=sexchromosomes) - - tmp3 = try(runASCAT(lrr=tmp2$Tumor_LogR, - baf=tmp2$Tumor_BAF, - lrrsegmented=tmp2$Tumor_LogR_segmented, - bafsegmented=tmp2$Tumor_BAF_segmented, - gender=tmp2$gender, - SNPpos=tmp2$SNPpos, - chromosomes=tmp2$chromosomes, - chrnames=tmp2$chrnames, - sexchromosomes=tmp2$sexchromosomes, - failedqualitycheck=FALSE, - distance = paste0("medicc/allele_specific/ascat/", tumor_samples[i], "_", normal_samples, ".pdf"), - copynumberprofile = NULL, - nonroundedprofile = NULL, - aberrationreliability = NULL, - gamma = 1, rho_manual = ascat$tmp3$rho, psi_manual = ascat$tmp3$psi, y_limit = 3, circos = NA)) - - if (!("try-error" %in% is(tmp3))) { - chr = SNPpos[tmp3$seg_raw[,1],1] - pos = SNPpos[tmp3$seg_raw[,1],2] - qt[tmp[,1] %in% chr & tmp[,3] %in% pos,tumor_samples[i]] = tmp3$seg_raw[,"nA"] + tmp3$seg_raw[,"nB"] - q2[tmp[,1] %in% chr & tmp[,3] %in% pos,tumor_samples[i]] = apply(tmp3$seg_raw[,c("nA", "nB"),drop=FALSE], 1, max, na.rm=TRUE) - } - } - save(list=ls(all=TRUE), file=paste0("medicc/allele_specific/aspcf/", opt$sample_set, ".RData")) - -} else if (opt$type=="total_copy") { - - load(paste0("medicc/total_copy/mad/", opt$sample_set, ".RData")) - gamma = ifelse(is.na(as.numeric(opt$gamma)), 150, as.numeric(opt$gamma)) - nlog2 = ifelse(is.na(as.numeric(opt$nlog2)), 10, as.numeric(opt$nlog2)) - colnames(Log2Ratio) = paste0("Log2Ratio_", colnames(Log2Ratio)) - CN_and_BAF = cbind(annotation, Log2Ratio) - tmp = NULL - for (i in 1:23) { - cn_and_baf = subset(CN_and_BAF, CN_and_BAF[,"Chromosome"]==i) - x = 
try(multipcf(data=winsorize(data=cn_and_baf, method="mad", tau=2.5, k=15, verbose=FALSE), gamma=gamma, normalize=FALSE, fast=FALSE, verbose=FALSE), silent=TRUE) - if (!("try-error" %in% is(x))) { - colnames(x)[1:5] = c("Chromosome", "Arm", "Start", "End", "N") - tmp = rbind(tmp, x) - } - } - CN_and_BAF = subset(CN_and_BAF, CN_and_BAF[,"Chromosome"] %in% tmp[,"Chromosome"]) - qt = q2 = matrix(NA, nrow=nrow(tmp), ncol=length(tumor_samples)) - colnames(qt) = colnames(q2) = tumor_samples - for (i in 1:length(tumor_samples)) { - ascat = new.env() - load(paste0("ascat/ascat/", tumor_samples[i], "_", normal_samples, ".RData"), envir=ascat) - - 'prunesegments.cn' <- function(x, n=10) - { - cnm = matrix(NA, nrow=length(x), ncol=length(x)) - for (j in 1:length(x)) { - cnm[,j] = abs(2^x[j] - 2^x) - } - cnt = hclust(as.dist(cnm), "average") - cnc = cutree(tree=cnt, k=n) - for (j in unique(cnc)) { - indx = which(cnc==j) - if (length(indx)>2) { - mcl = mean(x[indx]) - scl = sd(x[indx]) - ind = which(x[indx]<(mcl+1.96*scl) & x[indx]>(mcl-1.96*scl)) - x[indx[ind]] = mean(x[indx[ind]]) - } else { - x[indx] = mean(x[indx]) - } - } - return(x) - } - - 'absolute.cn' <- function(rho, psi, gamma=1, x) - { - rho = ifelse(is.na(rho), 1, rho) - psi = ifelse(is.na(psi), 2, psi) - return(invisible(((((2^(x/gamma))*(rho*psi+(1-rho)*2)) - ((1-rho)*2))/rho))) - } - - tmp[,paste0("Log2Ratio_", tumor_samples[i])] = prunesegments.cn(x=tmp[,paste0("Log2Ratio_", tumor_samples[i])], n=nlog2) - purity = ifelse(is.na(ascat$tmp3$rho), 1, ascat$tmp3$rho) - ploidy = ifelse(is.na(ascat$tmp3$psi), 1, ascat$tmp3$psi) - qt[,tumor_samples[i]] = ifelse(round(absolute.cn(rho=purity, psi=ploidy, x=tmp[,paste0("Log2Ratio_", tumor_samples[i])]))<0, 0, round(absolute.cn(rho=purity, psi=ploidy, x=tmp[,paste0("Log2Ratio_", tumor_samples[i])]))) - q2[,tumor_samples[i]] = ceiling(qt[,tumor_samples[i]]/2) - } - save(list=ls(all=TRUE), file=paste0("medicc/total_copy/mpcf/", opt$sample_set, ".RData")) -} diff --git 
a/test/workflows/cnvkit.mk b/test/workflows/cnvkit.mk deleted file mode 100644 index 042067fb..00000000 --- a/test/workflows/cnvkit.mk +++ /dev/null @@ -1,25 +0,0 @@ -include modules/Makefile.inc -include modules/genome_inc/b37.inc - -LOGDIR ?= log/cnvkit.$(NOW) -PHONY += cnvkit cnvkit/cnn cnvkit/cnn/tumor cnvkit/cnn/normal cnvkit/reference cnvkit/cnr cnvkit/log2 cnvkit/segmented cnvkit/called cnvkit/summary - -CNV_KIT_WORKFLOW += cnvkit_coverage -CNV_KIT_WORKFLOW += cnvkit_reference -CNV_KIT_WORKFLOW += cnvkit_fix -CNV_KIT_WORKFLOW += cnvkit_plot -CNV_KIT_WORKFLOW += cnvkit_segment -CNV_KIT_WORKFLOW += cnvkit_summary - -cnv_kit_workflow : $(CNV_KIT_WORKFLOW) - -include modules/copy_number/cnvkitcoverage.mk -include modules/copy_number/cnvkitreference.mk -include modules/copy_number/cnvkitfix.mk -include modules/copy_number/cnvkitplot.mk -include modules/copy_number/cnvkitsegment.mk -include modules/copy_number/cnvkitsummary.mk - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: $(PHONY) diff --git a/test/workflows/copynumber_summary.mk b/test/workflows/copynumber_summary.mk deleted file mode 100644 index ff3ed56b..00000000 --- a/test/workflows/copynumber_summary.mk +++ /dev/null @@ -1,23 +0,0 @@ -include modules/Makefile.inc -include modules/config.inc - -LOGDIR = log/copynumber_summary.$(NOW) -PHONY += genome_stats summary summary/tsv - -CN_SUMMARY_WORKFLOW += genome_altered -CN_SUMMARY_WORKFLOW += lst_score -CN_SUMMARY_WORKFLOW += ntai_score -CN_SUMMARY_WORKFLOW += myriad_score -CN_SUMMARY_WORKFLOW += genome_summary - -cn_summary_workflow : $(CN_SUMMARY_WORKFLOW) - -include modules/copy_number/genomealtered.mk -include modules/copy_number/lstscore.mk -include modules/copy_number/ntaiscore.mk -include modules/copy_number/myriadhrdscore.mk -include modules/summary/genomesummary.mk - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: $(PHONY) diff --git a/test/workflows/cravat_annotation.mk b/test/workflows/cravat_annotation.mk deleted file mode 100644 index d83dfcf4..00000000 --- 
a/test/workflows/cravat_annotation.mk +++ /dev/null @@ -1,21 +0,0 @@ -include modules/Makefile.inc -include modules/config.inc - -LOGDIR = log/cravat_annotation.$(NOW) -PHONY += gatk cravat summary summary/tsv - -ANNOTATION_WORKFLOW += gatk_vcfs -ANNOTATION_WORKFLOW += cravat_annotate -ANNOTATION_WORKFLOW += cravat_summary - -cravat_annotation_workflow : $(ANNOTATION_WORKFLOW) - -include modules/variant_callers/gatk.mk -include modules/vcf_tools/cravat_annotation.mk -include modules/summary/cravat_summary.mk - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: $(PHONY) - - diff --git a/test/workflows/fetchimpact.mk b/test/workflows/fetchimpact.mk deleted file mode 100644 index 4e03d3e1..00000000 --- a/test/workflows/fetchimpact.mk +++ /dev/null @@ -1,16 +0,0 @@ -include modules/Makefile.inc - -LOGDIR ?= log/fetchimpact.$(NOW) -PHONY += unprocessed_bam - -fetch_impact : $(foreach sample,$(SAMPLES),unprocessed_bam/$(sample).bam) - -define fetch-impact -unprocessed_bam/%.bam : - $$(call RUN,-c -s 4G -m 12G,"scp luna.mskcc.org:/ifs/dmpshare/share/irb12_245/$$(*).bam unprocessed_bam/$$(*).bam") - -endef - $(foreach sample,$(SAMPLES),\ - $(eval $(call fetch-impact,$(sample)))) - -.PHONY : $(PHONY) diff --git a/test/workflows/medicc.mk b/test/workflows/medicc.mk deleted file mode 100644 index f2c4ad37..00000000 --- a/test/workflows/medicc.mk +++ /dev/null @@ -1,79 +0,0 @@ -include modules/Makefile.inc - -LOGDIR ?= log/medicc.$(NOW) - -ALLELE_SPECIFIC_COPY ?= false - -ifeq ($(ALLELE_SPECIFIC_COPY),true) - -PHONY += medicc medicc/allele_specific medicc/allele_specific/mad medicc/allele_specific/ascat medicc/allele_specific/aspcf medicc/allele_specific/medicc - -medicc : $(foreach set,$(SAMPLE_SETS),medicc/allele_specific/medicc/$(set)/tree_final.new) $(foreach set,$(SAMPLE_SETS),medicc/allele_specific/medicc/$(set)/tree_final.pdf) - -define allele-specific-medicc -medicc/allele_specific/mad/%.RData : $(wildcard $(foreach pair,$(SAMPLE_PAIRS),facets/cncf/$(pair).Rdata)) - $$(call 
RUN,-c -s 8G -m 12G -v $(ASCAT_ENV),"mkdir -p medicc/allele_specific && \ - mkdir -p medicc/allele_specific/mad && \ - $(RSCRIPT) modules/test/phylogeny/combinesamples.R --sample_set $$* --normal_samples '$(NORMAL_SAMPLES)' --type allele_specific") - -medicc/allele_specific/aspcf/%.RData : medicc/allele_specific/mad/%.RData - $$(call RUN,-c -s 8G -m 12G -v $(ASCAT_ENV),"mkdir -p medicc/allele_specific/ascat && \ - mkdir -p medicc/allele_specific/aspcf && \ - $(RSCRIPT) modules/test/phylogeny/segmentsamples.R --sample_set $$* --normal_samples '$(NORMAL_SAMPLES)' --gamma '$${mpcf_gamma}' --nlog2 '$${mpcf_nlog2}' --nbaf '$${mpcf_nbaf}' --type allele_specific") - -medicc/allele_specific/medicc/%/desc.txt : medicc/allele_specific/aspcf/%.RData - $$(call RUN,-c -s 8G -m 12G -v $(ASCAT_ENV),"mkdir -p medicc/allele_specific/medicc && \ - mkdir -p medicc/allele_specific/medicc/$$* && \ - $(RSCRIPT) modules/test/phylogeny/initmedicc.R --sample_set $$* --type allele_specific") - -medicc/allele_specific/medicc/%/tree_final.new : medicc/allele_specific/medicc/%/desc.txt - $$(call RUN,-c -s 8G -m 12G -v $(MEDICC_ENV),"source $(MEDICC_VAR) && \ - $(MEDICC_BIN)/medicc.py medicc/allele_specific/medicc/$$*/desc.txt medicc/allele_specific/medicc/$$* -v") - -medicc/allele_specific/medicc/%/tree_final.pdf : medicc/allele_specific/medicc/%/tree_final.new - $$(call RUN,-c -n 12 -s 1G -m 2G -v $(PHYLO_ENV),"$(RSCRIPT) modules/test/phylogeny/plotmedicc.R --sample_set $$(*) --type allele_specific") - -endef -$(foreach set,$(SAMPLE_SETS),\ - $(eval $(call allele-specific-medicc,$(set)))) - -else - -PHONY += medicc medicc/total_copy medicc/total_copy/mad medicc/total_copy/mpcf medicc/total_copy/medicc - -medicc : $(foreach set,$(SAMPLE_SETS),medicc/total_copy/medicc/$(set)/tree_final.new) $(foreach set,$(SAMPLE_SETS),medicc/total_copy/medicc/$(set)/tree_final.pdf) - -define total-copy-medicc -medicc/total_copy/mad/%.RData : $(wildcard $(foreach pair,$(SAMPLE_PAIRS),facets/cncf/$(pair).Rdata)) 
- $$(call RUN,-c -s 8G -m 12G -v $(ASCAT_ENV),"mkdir -p medicc/total_copy && \ - mkdir -p medicc/total_copy/mad && \ - $(RSCRIPT) modules/test/phylogeny/combinesamples.R --sample_set $$* --normal_samples '$(NORMAL_SAMPLES)' --type total_copy") - -medicc/total_copy/mpcf/%.RData : medicc/total_copy/mad/%.RData - $$(call RUN,-c -s 8G -m 12G -v $(ASCAT_ENV),"mkdir -p medicc/total_copy/mpcf && \ - $(RSCRIPT) modules/test/phylogeny/segmentsamples.R --sample_set $$* --normal_samples '$(NORMAL_SAMPLES)' --gamma '$${mpcf_gamma}' --nlog2 '$${mpcf_nlog2}' --type total_copy") - -medicc/total_copy/medicc/%/desc.txt : medicc/total_copy/mpcf/%.RData - $$(call RUN,-c -s 8G -m 12G -v $(ASCAT_ENV),"mkdir -p medicc/total_copy/medicc && \ - mkdir -p medicc/total_copy/medicc/$$* && \ - $(RSCRIPT) modules/test/phylogeny/initmedicc.R --sample_set $$* --type total_copy") - -medicc/total_copy/medicc/%/tree_final.new : medicc/total_copy/medicc/%/desc.txt - $$(call RUN,-c -s 8G -m 12G -v $(MEDICC_ENV),"source $(MEDICC_VAR) && \ - $(MEDICC_BIN)/medicc.py medicc/total_copy/medicc/$$*/desc.txt medicc/total_copy/medicc/$$* -t -v && \ - cp medicc/total_copy/medicc/$$*/tree_fitch_nc.xml medicc/total_copy/medicc/$$*/tree_final.xml && \ - cp medicc/total_copy/medicc/$$*/tree_fitch_nc.graph medicc/total_copy/medicc/$$*/tree_final.graph && \ - cp medicc/total_copy/medicc/$$*/tree_fitch_nc.new medicc/total_copy/medicc/$$*/tree_final.new") - -medicc/total_copy/medicc/%/tree_final.pdf : medicc/total_copy/medicc/%/tree_final.new - $$(call RUN,-c -n 12 -s 1G -m 2G -v $(PHYLO_ENV),"$(RSCRIPT) modules/test/phylogeny/plotmedicc.R --sample_set $$(*) --type total_copy") - -endef -$(foreach set,$(SAMPLE_SETS),\ - $(eval $(call total-copy-medicc,$(set)))) - -endif - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: $(PHONY) diff --git a/test/workflows/mspyclone.mk b/test/workflows/mspyclone.mk deleted file mode 100644 index df313959..00000000 --- a/test/workflows/mspyclone.mk +++ /dev/null @@ -1,20 +0,0 @@ -include 
modules/Makefile.inc - -LOGDIR ?= log/ms_pyclone.$(NOW) -PHONY += pyclone sufam summary pyclone - -PYCLONE_WORKFLOW += sufam_multisample -PYCLONE_WORKFLOW += setup_pyclone -PYCLONE_WORKFLOW += run_pyclone -PYCLONE_WORKFLOW += plot_pyclone - -pyclone_workflow : $(PYCLONE_WORKFLOW) - -include modules/variant_callers/sufammultisample.mk -include modules/clonality/setuppyclone.mk -include modules/clonality/runpyclone.mk -include modules/clonality/plotpyclone.mk - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: $(PHONY) diff --git a/test/workflows/pratchet.mk b/test/workflows/pratchet.mk deleted file mode 100644 index 5b56f83c..00000000 --- a/test/workflows/pratchet.mk +++ /dev/null @@ -1,24 +0,0 @@ -include modules/Makefile.inc - -LOGDIR ?= log/pratchet.$(NOW) -PHONY += pratchet - -pratchet : $(foreach set,$(SAMPLE_SETS),pratchet/$(set)/tree_final.RData) $(foreach set,$(SAMPLE_SETS),pratchet/$(set)/tree_final.pdf) - -define parsimony-ratchet -pratchet/%/tree_final.RData : sufam/%.tsv - $$(call RUN,-c -s 8G -m 12G -v $(PHANGORN_ENV),"mkdir -p pratchet && \ - mkdir -p pratchet/$$* && \ - $(RSCRIPT) modules/test/phylogeny/pratchet.R --sample_set $$* --normal_samples '$(NORMAL_SAMPLES)'") - -pratchet/%/tree_final.pdf : pratchet/%/tree_final.RData - $$(call RUN,-c -s 4G -m 6G -v $(PHYLO_ENV),"$(RSCRIPT) modules/test/phylogeny/plotratchet.R --sample_set $$(*)") - -endef -$(foreach set,$(SAMPLE_SETS),\ - $(eval $(call parsimony-ratchet,$(set)))) - - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: $(PHONY) diff --git a/test/workflows/qdnaseq.mk b/test/workflows/qdnaseq.mk deleted file mode 100644 index 81a86893..00000000 --- a/test/workflows/qdnaseq.mk +++ /dev/null @@ -1,17 +0,0 @@ -include modules/Makefile.inc -include modules/genome_inc/b37.inc - -LOGDIR ?= log/qdnaseq.$(NOW) -PHONY += qdnaseq qdnaseq/copynumber qdnaseq/copynumber/log2ratio qdnaseq/copynumber/segmented qdnaseq/copynumber/pcf - -QDNA_SEQ_WORKFLOW += qdnaseq_extract -QDNA_SEQ_WORKFLOW += qdnaseq_copynumber - 
-qdna_seq_workflow : $(QDNA_SEQ_WORKFLOW) - -include modules/test/copy_number/qdnaseqextract.mk -include modules/test/copy_number/qdnaseqcopynumber.mk - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: $(PHONY) diff --git a/test/workflows/sspyclone.mk b/test/workflows/sspyclone.mk deleted file mode 100644 index c0ac7432..00000000 --- a/test/workflows/sspyclone.mk +++ /dev/null @@ -1,37 +0,0 @@ -include modules/Makefile.inc - -LOGDIR ?= log/ss_pyclone.$(NOW) -PHONY += pyclone - -pyclone : $(foreach pair,$(SAMPLE_PAIRS),pyclone/$(pair)/report/summary.tsv) - -MAX_CLUSTER ?= 5 - -define make-pyclone -pyclone/$1_$2/config.yaml : summary/tsv/mutation_summary.tsv - $$(call RUN, -s 16G -m 24G,"mkdir -p pyclone/$1_$2 && \ - mkdir -p pyclone/$1_$2/report && \ - $(RSCRIPT) modules/test/clonality/tsvtopyclone.R --sample_name $1_$2") - -pyclone/$1_$2/trace/alpha.tsv.bz2 : pyclone/$1_$2/config.yaml - $$(call RUN,-s 16G -m 24G -w 7200,"mkdir -p pyclone/$1_$2 && \ - mkdir -p pyclone/$1_$2/trace && \ - source /home/${USER}/share/usr/anaconda-envs/jrflab-modules-0.1.5/bin/activate /home/${USER}/share/usr/anaconda-envs/PyClone-0.13.1 && \ - PyClone run_analysis --config_file pyclone/$1_$2/config.yaml --seed 0") - -pyclone/$1_$2/report/pyclone.tsv : pyclone/$1_$2/trace/alpha.tsv.bz2 - $$(call RUN,-s 16G -m 24G -w 7200,"source /home/${USER}/share/usr/anaconda-envs/jrflab-modules-0.1.5/bin/activate /home/${USER}/share/usr/anaconda-envs/PyClone-0.13.1 && \ - PyClone build_table --config_file pyclone/$1_$2/config.yaml --out_file pyclone/$1_$2/report/pyclone.tsv --max_cluster $(MAX_CLUSTER) --table_type old_style --burnin 50000") - -pyclone/$1_$2/report/summary.tsv : pyclone/$1_$2/report/pyclone.tsv - $$(call RUN, -s 24G -m 48G,"mkdir -p pyclone/$1_$2 && \ - mkdir -p pyclone/$1_$2/report && \ - $(RSCRIPT) modules/test/clonality/reportpyclone.R --sample_name $1_$2") - -endef -$(foreach pair,$(SAMPLE_PAIRS),\ - $(eval $(call make-pyclone,$(tumor.$(pair)),$(normal.$(pair))))) - -.DELETE_ON_ERROR: 
-.SECONDARY: -.PHONY: $(PHONY) diff --git a/test/workflows/viral_detection.mk b/test/workflows/viral_detection.mk deleted file mode 100644 index 89f34fdf..00000000 --- a/test/workflows/viral_detection.mk +++ /dev/null @@ -1,23 +0,0 @@ -include modules/Makefile.inc -include modules/config.inc - -LOGDIR = log/viral_detection.$(NOW) -PHONY += unmapped_reads - -VIRUS_WORKFLOW += extract_unmapped -VIRUS_WORKFLOW += bam_to_fasta -VIRUS_WORKFLOW += blast_reads -VIRUS_WORKFLOW += krona_classify - -viral_detection_workflow : $(VIRUS_WORKFLOW) - -include modules/fastq_tools/extractReads.mk -include modules/fastq_tools/bamtoFasta.mk -include modules/fastq_tools/blastReads.mk -include modules/virus/krona_classify.mk - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: $(PHONY) - - diff --git a/variant_callers/combinesamples.R b/variant_callers/combinesamples.R deleted file mode 100644 index 5c54d37a..00000000 --- a/variant_callers/combinesamples.R +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list(make_option("--sample_set", default = NA, type = 'character', help = "sample names set")) - -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -sample_names = unlist(strsplit(opt$sample_set, split="_", fixed=TRUE)) - -all_vars = read.csv(file="summary/tsv/mutation_summary.tsv", header=TRUE, sep="\t", stringsAsFactors=FALSE) -tmp_vars = all_vars[all_vars$TUMOR_SAMPLE %in% sample_names,,drop=FALSE] -keys = paste0(tmp_vars$CHROM, ":", tmp_vars$POS, ":", tmp_vars$REF, ":", tmp_vars$ALT) -ukeys = unique(keys) -vars = NULL -for (i in 1:length(ukeys)) { - index = which(keys==ukeys[i]) - Chromosome = tmp_vars[index[1],"CHROM"] - Position = tmp_vars[index[1],"POS"] - Ref = tmp_vars[index[1],"REF"] - Alt = 
tmp_vars[index[1],"ALT"] - Variant_Caller = tmp_vars[index[1],"variantCaller"] - Gene_Symbol = tmp_vars[index[1],"SYMBOL"] - Variant_Classification = tmp_vars[index[1],"Variant_Classification"] - HGVSp_Short = tmp_vars[index[1],"HGVSp_Short"] - Fuentes = tmp_vars[index[1],"fuentes"] - dgd = tmp_vars[index[1],"dgd"] - OncoKB_Level = tmp_vars[index[1],"oncoKB_level"] - OncoKB_Cancer_Type = tmp_vars[index[1],"oncoKB_cancer_type"] - Cancer_Gene_Census = tmp_vars[index[1],"cancer_gene_census"] - Kandoth = tmp_vars[index[1],"kandoth"] - Lawrence = tmp_vars[index[1],"lawrence"] - Hap_Insuf = tmp_vars[index[1],"hap_insuf"] - ExAC_AF = tmp_vars[index[1],"ExAC_AF"] - MutationTaster = tmp_vars[index[1],"MutationTaster_pred"] - PROVEAN = tmp_vars[index[1],"PROVEAN_pred"] - FATHMM = tmp_vars[index[1],"FATHMM_pred"] - BRCA_Chasm = tmp_vars[index[1],"BRCA_chasm_pred"] - Parssnp = tmp_vars[index[1],"parssnp_pred"] - Pathogenicity = tmp_vars[index[1],"pathogenicity"] - HOTSPOT = tmp_vars[index[1],"HOTSPOT"] - HOTSPOT_INTERNAL = tmp_vars[index[1],"HOTSPOT_INTERNAL"] - CMO_HOTSPOT = tmp_vars[index[1],"cmo_hotspot"] - vars = rbind(vars, c("Chromosome"=Chromosome, - "Position"=Position, - "Ref"=Ref, - "Alt"=Alt, - "Variant_Caller"=Variant_Caller, - "Gene_Symbol"=Gene_Symbol, - "Variant_Classification"=Variant_Classification, - "HGVSp"=HGVSp_Short, - "Fuentes"=Fuentes, - "dgd"=dgd, - "OncoKB_Level"=OncoKB_Level, - "OncoKB_Cancer_Type"=OncoKB_Cancer_Type, - "Cancer_Gene_Census"=Cancer_Gene_Census, - "Kandoth"=Kandoth, - "Lawrence"=Lawrence, - "Hap_Insuf"=Hap_Insuf, - "ExAC"=ExAC_AF, - "MutationTaster"=MutationTaster, - "PROVEAN"=PROVEAN, - "FATHMM"=FATHMM, - "BRCA_Chasm"=BRCA_Chasm, - "Parssnp"=Parssnp, - "Pathogenicity"=Pathogenicity, - "HOTSPOT"=HOTSPOT, - "HOTSPOT_INTERNAL"=HOTSPOT_INTERNAL, - "HOTSPOT_CMO"=CMO_HOTSPOT)) -} - -normal_name = tmp_vars[1,"NORMAL_SAMPLE"] - -VAF = DEPTH = LOH = CALLS = matrix(NA, nrow=length(ukeys), ncol=length(sample_names), dimnames=list(ukeys, 
sample_names)) -for (j in 1:nrow(tmp_vars)) { - sample_name = tmp_vars[j,"TUMOR_SAMPLE"] - ukey = paste0(tmp_vars$CHROM[j], ":", tmp_vars$POS[j], ":", tmp_vars$REF[j], ":", tmp_vars$ALT[j]) - VAF[ukey,sample_name] = tmp_vars[j,"TUMOR_MAF"] - VAF[ukey,normal_name] = tmp_vars[j,"NORMAL_MAF"] - DEPTH[ukey,sample_name] = tmp_vars[j,"TUMOR_DP"] - DEPTH[ukey,normal_name] = tmp_vars[j,"NORMAL_DP"] - LOH[ukey,sample_name] = tmp_vars[j,"facetsLOHCall"] - CALLS[ukey,sample_name] = 1 -} -colnames(VAF) = paste0("MAF_", colnames(VAF)) -colnames(DEPTH) = paste0("DP_", colnames(DEPTH)) -colnames(LOH) = paste0("LOH_", colnames(LOH)) -colnames(CALLS) = paste0("CALL_", colnames(CALLS)) -CALLS[is.na(CALLS)] = 0 -vars = cbind(vars, VAF, DEPTH, LOH, CALLS) -mutect = grepl("mutect", vars[,"Variant_Caller"]) -main_indels = grepl("varscan", vars[,"Variant_Caller"]) & grepl("strelka", vars[,"Variant_Caller"]) -other_indels = ((grepl("platypus", vars[,"Variant_Caller"]) & grepl("scalpel", vars[,"Variant_Caller"])) | - (grepl("platypus", vars[,"Variant_Caller"]) & grepl("lancet", vars[,"Variant_Caller"]))) & - (nchar(vars[,"Ref"])>3 | nchar(vars[,"Alt"])>3) & - !grepl("In_Frame", vars[,"Variant_Classification"]) -index = mutect | main_indels | other_indels -vars = vars[index,,drop=FALSE] -index = vars[,"Variant_Classification"] %in% c("Frame_Shift_Del", "Frame_Shift_Ins", "In_Frame_Del", "In_Frame_Ins", "Missense_Mutation", "Nonsense_Mutation", "Nonstop_Mutation", "Splice_Site") -vars = vars[index,,drop=FALSE] - -write.table(vars, file=paste0("sufam/", opt$sample_set, ".txt"), col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE) diff --git a/variant_callers/combinesamplesf.R b/variant_callers/combinesamplesf.R deleted file mode 100644 index 0e90be09..00000000 --- a/variant_callers/combinesamplesf.R +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', 
status = 1) })) -} - -args_list <- list(make_option("--sample_set", default = NA, type = 'character', help = "sample names set")) - -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -sample_names = unlist(strsplit(opt$sample_set, split="_", fixed=TRUE)) - -all_vars = read.csv(file="summary/tsv/mutation_summary.tsv", header=TRUE, sep="\t", stringsAsFactors=FALSE) -tmp_vars = all_vars[all_vars$TUMOR_SAMPLE %in% sample_names,,drop=FALSE] -keys = paste0(tmp_vars$CHROM, ":", tmp_vars$POS, ":", tmp_vars$REF, ":", tmp_vars$ALT) -ukeys = unique(keys) -vars = NULL -for (i in 1:length(ukeys)) { - index = which(keys==ukeys[i]) - Chromosome = tmp_vars[index[1],"CHROM"] - Position = tmp_vars[index[1],"POS"] - Ref = tmp_vars[index[1],"REF"] - Alt = tmp_vars[index[1],"ALT"] - Variant_Caller = tmp_vars[index[1],"variantCaller"] - Gene_Symbol = tmp_vars[index[1],"SYMBOL"] - Variant_Classification = tmp_vars[index[1],"Variant_Classification"] - HGVSp_Short = tmp_vars[index[1],"HGVSp_Short"] - Fuentes = tmp_vars[index[1],"fuentes"] - dgd = tmp_vars[index[1],"dgd"] - OncoKB_Level = tmp_vars[index[1],"oncoKB_level"] - OncoKB_Cancer_Type = tmp_vars[index[1],"oncoKB_cancer_type"] - Cancer_Gene_Census = tmp_vars[index[1],"cancer_gene_census"] - Kandoth = tmp_vars[index[1],"kandoth"] - Lawrence = tmp_vars[index[1],"lawrence"] - Hap_Insuf = tmp_vars[index[1],"hap_insuf"] - ExAC_AF = tmp_vars[index[1],"ExAC_AF"] - MutationTaster = tmp_vars[index[1],"MutationTaster_pred"] - PROVEAN = tmp_vars[index[1],"PROVEAN_pred"] - FATHMM = tmp_vars[index[1],"FATHMM_pred"] - BRCA_Chasm = tmp_vars[index[1],"BRCA_chasm_pred"] - Parssnp = tmp_vars[index[1],"parssnp_pred"] - Pathogenicity = tmp_vars[index[1],"pathogenicity"] - HOTSPOT = tmp_vars[index[1],"HOTSPOT"] - HOTSPOT_INTERNAL = tmp_vars[index[1],"HOTSPOT_INTERNAL"] - CMO_HOTSPOT = tmp_vars[index[1],"cmo_hotspot"] - vars = rbind(vars, 
c("Chromosome"=Chromosome, - "Position"=Position, - "Ref"=Ref, - "Alt"=Alt, - "Variant_Caller"=Variant_Caller, - "Gene_Symbol"=Gene_Symbol, - "Variant_Classification"=Variant_Classification, - "HGVSp"=HGVSp_Short, - "Fuentes"=Fuentes, - "dgd"=dgd, - "OncoKB_Level"=OncoKB_Level, - "OncoKB_Cancer_Type"=OncoKB_Cancer_Type, - "Cancer_Gene_Census"=Cancer_Gene_Census, - "Kandoth"=Kandoth, - "Lawrence"=Lawrence, - "Hap_Insuf"=Hap_Insuf, - "ExAC"=ExAC_AF, - "MutationTaster"=MutationTaster, - "PROVEAN"=PROVEAN, - "FATHMM"=FATHMM, - "BRCA_Chasm"=BRCA_Chasm, - "Parssnp"=Parssnp, - "Pathogenicity"=Pathogenicity, - "HOTSPOT"=HOTSPOT, - "HOTSPOT_INTERNAL"=HOTSPOT_INTERNAL, - "HOTSPOT_CMO"=CMO_HOTSPOT)) -} - -normal_name = tmp_vars[1,"NORMAL_SAMPLE"] - -VAF = DEPTH = LOH = CALLS = matrix(NA, nrow=length(ukeys), ncol=length(sample_names), dimnames=list(ukeys, sample_names)) -for (j in 1:nrow(tmp_vars)) { - sample_name = tmp_vars[j,"TUMOR_SAMPLE"] - ukey = paste0(tmp_vars$CHROM[j], ":", tmp_vars$POS[j], ":", tmp_vars$REF[j], ":", tmp_vars$ALT[j]) - VAF[ukey,sample_name] = tmp_vars[j,"TUMOR_MAF"] - VAF[ukey,normal_name] = tmp_vars[j,"NORMAL_MAF"] - DEPTH[ukey,sample_name] = tmp_vars[j,"TUMOR_DP"] - DEPTH[ukey,normal_name] = tmp_vars[j,"NORMAL_DP"] - LOH[ukey,sample_name] = tmp_vars[j,"facetsLOHCall"] - CALLS[ukey,sample_name] = 1 -} -colnames(VAF) = paste0("MAF_", colnames(VAF)) -colnames(DEPTH) = paste0("DP_", colnames(DEPTH)) -colnames(LOH) = paste0("LOH_", colnames(LOH)) -colnames(CALLS) = paste0("CALL_", colnames(CALLS)) -CALLS[is.na(CALLS)] = 0 -vars = cbind(vars, VAF, DEPTH, LOH, CALLS) -mutect = grepl("mutect", vars[,"Variant_Caller"]) -main_indels = grepl("varscan", vars[,"Variant_Caller"]) & grepl("strelka", vars[,"Variant_Caller"]) -other_indels = ((grepl("platypus", vars[,"Variant_Caller"]) & grepl("scalpel", vars[,"Variant_Caller"])) | - (grepl("platypus", vars[,"Variant_Caller"]) & grepl("lancet", vars[,"Variant_Caller"]))) & - (nchar(vars[,"Ref"])>3 | 
nchar(vars[,"Alt"])>3) & - !grepl("In_Frame", vars[,"Variant_Classification"]) -index = mutect | main_indels | other_indels -vars = vars[index,,drop=FALSE] -index = vars[,"Variant_Classification"] %in% c("Frame_Shift_Del", "Frame_Shift_Ins", "In_Frame_Del", "In_Frame_Ins", "Missense_Mutation", "Nonsense_Mutation", "Nonstop_Mutation", "Splice_Site") -vars = vars[index,,drop=FALSE] - -blacklist = read.csv(file="summary/tsv/mouse_summary.tsv", header=TRUE, sep="\t", stringsAsFactors=FALSE) -indx = grep("AD", colnames(blacklist)) -index = matrix(0, nrow=nrow(blacklist), ncol=length(indx)) -for (i in 1:length(indx)) { - index[blacklist[,indx[i]]!=0,i] = 1 -} -index = apply(index, 1, sum)>0 -all_id = paste0(vars[,"Chromosome"], ":", vars[,"Position"], "_", vars[,"Ref"], ">", vars[,"Alt"]) -blacklist_id = paste0(blacklist[index,"Chromosome"], ":", blacklist[index,"Position"], "_", blacklist[,"Reference_Allele"], ">", blacklist[,"Alternate_Allele"]) -keep_id = which(!(all_id %in% blacklist_id)) -vars = vars[keep_id,,drop=FALSE] - -write.table(vars, file=paste0("sufam/", opt$sample_set, ".txt"), col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE) diff --git a/variant_callers/genotypehotspots.mk b/variant_callers/genotypehotspots.mk deleted file mode 100644 index 6d425f0e..00000000 --- a/variant_callers/genotypehotspots.mk +++ /dev/null @@ -1,22 +0,0 @@ -include modules/Makefile.inc - -SUFAM_ENV = $(HOME)/share/usr/anaconda-envs/sufam-dev -SUFAM_OPTS = --mpileup-parameters='-A -q 15 -Q 15 -d 15000' - -LOGDIR ?= log/genotype_hotspots.$(NOW) -PHONY += hotspot - -genotype_hotspots : $(foreach sample,$(SAMPLES),hotspot/$(sample).txt) - -define genotype-hotspots -hotspot/%.txt : bam/%.bam - $$(call RUN,-v $$(SUFAM_ENV) -c -s 2G -m 4G -w 2880,"sufam --sample_name $$(*) $$(SUFAM_OPTS) $$(REF_FASTA) modules/reference/hotspots/hotspot-dedup.vcf bam/$$(*).bam > hotspot/$$(*).txt") - -endef - $(foreach sample,$(SAMPLES),\ - $(eval $(call genotype-hotspots,$(sample)))) - 
-.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: $(PHONY) - diff --git a/variant_callers/genotypepdx.R b/variant_callers/genotypepdx.R deleted file mode 100644 index 2c84bf14..00000000 --- a/variant_callers/genotypepdx.R +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env Rscript - -all_vars = read.csv(file="summary/tsv/mutation_summary.tsv", header=TRUE, sep="\t", stringsAsFactors=FALSE) -CHROM = all_vars[,"CHROM"] -POS = all_vars[,"POS"] -ID = all_vars[,"ID"] -REF = all_vars[,"REF"] -ALT = all_vars[,"ALT"] -QUAL = FILTER = rep(".", nrow(all_vars)) -INFO = paste0(all_vars[,"SYMBOL"], all_vars[,"HGVSp_Short"]) -vcf = data.frame(CHROM, POS, ID, REF, ALT, QUAL, INFO) - -cat("#", file="sufam/pdx.vcf", append=FALSE) -write.table(vcf, file="sufam/pdx.vcf", col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE, append=TRUE) diff --git a/variant_callers/genotypepdx.mk b/variant_callers/genotypepdx.mk deleted file mode 100644 index c599b82d..00000000 --- a/variant_callers/genotypepdx.mk +++ /dev/null @@ -1,29 +0,0 @@ -include modules/Makefile.inc - -SUFAM_ENV = $(HOME)/share/usr/anaconda-envs/sufam-dev -SUFAM_OPTS = --mpileup-parameters='-A -q 15 -Q 15 -d 15000' -MOUSE_SAMPLES = $(sample_category.mouse) - -LOGDIR ?= log/genotype_pdx.$(NOW) -PHONY += sufam summary - -genotype_pdx : $(foreach sample,$(sample_category.mouse),sufam/$(sample).txt) sufam/pdx.vcf summary/mouse_summary.xlsx - -sufam/pdx.vcf : summary/tsv/mutation_summary.tsv - $(call RUN, -c -s 8G -m 16G,"$(RSCRIPT) modules/variant_callers/genotypepdx.R") - -define genotype-pdx -sufam/%.txt : bam/%.bam sufam/pdx.vcf - $$(call RUN,-v $$(SUFAM_ENV) -c -s 2G -m 4G -w 2880,"sufam --sample_name $$(*) $$(SUFAM_OPTS) $$(REF_FASTA) sufam/pdx.vcf bam/$$(*).bam > sufam/$$(*).txt") - -endef - $(foreach sample,$(sample_category.mouse),\ - $(eval $(call genotype-pdx,$(sample)))) - -summary/mouse_summary.xlsx : $(wildcard $(foreach sample,$(sample_category.mouse),sufam/$(sample).txt)) - $(call RUN,-n 1 -s 4G -m 4G,"$(RSCRIPT) 
modules/summary/mousesummary.R --sample_names '$(MOUSE_SAMPLES)' --out_file summary/tsv/mouse_summary.tsv && \ - python modules/summary/mouse_summary_excel.py") - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: $(PHONY) diff --git a/variant_callers/get_basecounts.mk b/variant_callers/get_basecounts.mk new file mode 100644 index 00000000..331d9aa2 --- /dev/null +++ b/variant_callers/get_basecounts.mk @@ -0,0 +1,47 @@ +include modules/Makefile.inc + +LOGDIR ?= log/get_basecount.$(NOW) + +MAPQ := 0 +BAQ := 0 +COV := 0 + +getbasecount : $(foreach sample,$(SAMPLES),gbc/$(sample).txt.gz) \ + gbc/summary.txt + +define get-basecount +gbc/$1.txt.gz : bam/$1.bam vcf/dataSilentNoPoleNotTertPromot.vcf + $$(call RUN,-n 6 -s 3G -m 6G,"set -o pipefail && \ + $(GBC) --fasta $(REF_FASTA) \ + --bam $$(<) \ + --vcf $$(<<) \ + --output $$(@) \ + --thread 6 \ + --sort_output \ + --compress_output \ + --maq $(MAPQ) \ + --baq $(BAQ) \ + --cov $(COV) \ + --filter_duplicate 0 \ + --filter_improper_pair 0 \ + --filter_qc_failed 1 \ + --filter_indel 0 \ + --filter_non_primary 1") + +endef +$(foreach sample,$(SAMPLES),\ + $(eval $(call get-basecount,$(sample)))) + + +gbc/summary.txt : $(foreach sample,$(SAMPLES),gbc/$(sample).txt.gz) + $(call RUN,-n 1 -s 24G -m 32G,"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/get_basecounts.R \ + --option 1 \ + --sample_name '$(SAMPLES)'") + + +..DUMMY := $(shell mkdir -p version; \ + ${GBC} &> version/get_basecount.txt;) +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY: getbasecount diff --git a/variant_callers/somatic/hla_summary.R b/variant_callers/somatic/hla_summary.R index 5d3ef8f0..e08590db 100644 --- a/variant_callers/somatic/hla_summary.R +++ b/variant_callers/somatic/hla_summary.R @@ -1,25 +1,59 @@ suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("magrittr")) options(warn = -1, error = quote({ traceback(); q('no', 
status = 1) })) -optList <- list( - make_option("--sample_names", default = "NA", help = "tumor normal sample pair names") - ) +optList <- list(make_option("--option", default = "NA", help = "which option?"), + make_option("--sample_names", default = "NA", help = "sample names")) parser <- OptionParser(usage = "%prog [options]", option_list = optList) arguments <- parse_args(parser, positional_arguments = T) opt <- arguments$options sample_names = unlist(strsplit(opt$sample_names, split=" ", fixed=TRUE)) -hla_genotypes = list() -for (i in 1:length(sample_names)) { - data = read.csv(file=paste0("hla_polysolver/", sample_names[i], "/winners.hla.txt"), header=FALSE, sep="\t", stringsAsFactors=FALSE) - gen_1 = t(data[,2,drop=FALSE]) - gen_2 = t(data[,3,drop=FALSE]) - colnames(gen_1) = paste0(c("HLA-A", "HLA-B", "HLA-C"), "_1") - colnames(gen_2) = paste0(c("HLA-A", "HLA-B", "HLA-C"), "_2") - hla_genotypes[[i]] = cbind(gen_1, gen_2) + +if (as.numeric(opt$option)==1) { + hla_genotypes = list() + for (i in 1:length(sample_names)) { + hla_genotypes[[i]] = readr::read_tsv(file = paste0("hla_polysolver/", sample_names[i], "/winners.hla.txt"), + col_names = FALSE, col_types = cols(.default = col_character())) %>% + readr::type_convert() %>% + dplyr::rename(hla = X1, major_allele = X2, minor_allele = X3) %>% + dplyr::mutate(sample_name = sample_names[i]) + } + hla_genotypes = do.call(rbind, hla_genotypes) + readr::write_tsv(x = hla_genotypes, path = "hla_polysolver/summary/hla_summary.txt", col_names = TRUE, append = FALSE) + +} else if (as.numeric(opt$option)==2) { + somatic_vars = list() + for (i in 1:length(sample_names)) { + somatic_vars[[i]] = readr::read_tsv(file = paste0("hla_polysolver/", sample_names[i], "/", sample_names[i], ".mutect.unfiltered.annotated"), + col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + } + somatic_vars = do.call(rbind, somatic_vars) + if (nrow(somatic_vars)>0) { + somatic_vars = somatic_vars %>% + 
dplyr::mutate(tumor_name = unlist(lapply(individual, function(x) { unlist(strsplit(x, split = "_", fixed = TRUE))[1] }))) %>% + dplyr::mutate(normal_name = unlist(lapply(individual, function(x) { unlist(strsplit(x, split = "_", fixed = TRUE))[2] }))) + } + readr::write_tsv(x = somatic_vars, path = "hla_polysolver/summary/mutect_summary.txt", col_names = TRUE, append = FALSE) + +} else if (as.numeric(opt$option)==3) { + somatic_vars = list() + for (i in 1:length(sample_names)) { + somatic_vars[[i]] = readr::read_tsv(file = paste0("hla_polysolver/", sample_names[i], "/", sample_names[i], ".strelka_indels.unfiltered.annotated"), + col_names = TRUE, col_types = cols(.default = col_character())) %>% + readr::type_convert() + } + somatic_vars = do.call(rbind, somatic_vars) + if (nrow(somatic_vars)>0) { + somatic_vars = somatic_vars %>% + dplyr::mutate(tumor_name = unlist(lapply(individual, function(x) { unlist(strsplit(x, split = "_", fixed = TRUE))[1] }))) %>% + dplyr::mutate(normal_name = unlist(lapply(individual, function(x) { unlist(strsplit(x, split = "_", fixed = TRUE))[2] }))) + } + readr::write_tsv(x = somatic_vars, path = "hla_polysolver/summary/strelka_summary.txt", col_names = TRUE, append = FALSE) + } -hla_genotypes = do.call(rbind, hla_genotypes) -hla_genotypes = cbind("SAMPLE_NAMES"=sample_names, hla_genotypes) -write.table(hla_genotypes, file="hla_polysolver/summary/genotype_summary.txt", col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE) diff --git a/variant_callers/somatic/macs2TN.mk b/variant_callers/somatic/macs2TN.mk deleted file mode 100644 index 259e4e65..00000000 --- a/variant_callers/somatic/macs2TN.mk +++ /dev/null @@ -1,23 +0,0 @@ -include modules/Makefile.inc - -LOGDIR ?= log/macs2TN.$(NOW) -PHONY += macs2 macs2/broadpeaks macs2/narrowpeaks - -macs2TN : $(foreach pair,$(SAMPLE_PAIRS),macs2/$(pair).timestamp) - -define macs2-case-control -macs2/broadpeaks/$1_$2.task.complete : bam/$1.bam bam/$2.bam - $$(call RUN,-c -s 8G -m 12G,"macs2 
callpeak -t $$< -c $$(<<) -f BAM -g hs --keep-dup all --broad --outdir macs2/broadpeaks -n $1_$2 -B --verbose 2 --nomodel -p 0.1 && echo $$< $$(<<) > macs2/broadpeaks/$1_$2.task.complete") - -macs2/narrowpeaks/$1_$2.task.complete : bam/$1.bam bam/$2.bam - $$(call RUN,-c -s 8G -m 12G,"macs2 callpeak -t $$< -c $$(<<) -f BAM -g hs --keep-dup all --outdir macs2/narrowpeaks -n $1_$2 -B --verbose 2 --nomodel -p 0.1 && echo $$< $$(<<) > macs2/narrowpeaks/$1_$2.task.complete") - -macs2/$1_$2.timestamp : macs2/broadpeaks/$1_$2.task.complete macs2/narrowpeaks/$1_$2.task.complete - $$(call RUN,-c -s 1G -m 1G,"echo $$< $$(<<) > macs2/$1_$2.timestamp") -endef -$(foreach pair,$(SAMPLE_PAIRS),\ - $(eval $(call macs2-case-control,$(tumor.$(pair)),$(normal.$(pair))))) - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: $(PHONY) diff --git a/variant_callers/somatic/mimsi.mk b/variant_callers/somatic/mimsi.mk new file mode 100644 index 00000000..59585ecb --- /dev/null +++ b/variant_callers/somatic/mimsi.mk @@ -0,0 +1,37 @@ +include modules/Makefile.inc + +LOGDIR ?= log/mimsi.$(NOW) + +mimsi: $(foreach pair,$(SAMPLE_PAIRS),mimsi/$(pair)/$(pair).txt) \ + mimsi/summary.txt + +MICROSATELLITES_LIST = $(HOME)/share/lib/resource_files/mimsi/microsatellites_impact_only.list +MODEL = $(HOME)/share/lib/resource_files/mimsi/mi_msi_v0_4_0_200x.model + +define mimsi-tumor-normal +mimsi/$1_$2/$1_$2.txt : bam/$1.bam bam/$2.bam + $$(call RUN,-c -n 8 -s 1G -m 2G -v $(MIMSI_ENV),"set -o pipefail && \ + mkdir -p mimsi/$1_$2/ && \ + analyze \ + --tumor-bam $$(<) \ + --normal-bam $$(<<) \ + --case-id $1 \ + --norm-case-id $2 \ + --microsatellites-list $$(MICROSATELLITES_LIST) \ + --save-location mimsi/$1_$2/ \ + --model $$(MODEL) \ + --save && \ + mv mimsi/$1_$2/BATCH_results.txt $$(@)") + +endef +$(foreach pair,$(SAMPLE_PAIRS),\ + $(eval $(call mimsi-tumor-normal,$(tumor.$(pair)),$(normal.$(pair))))) + +mimsi/summary.txt : $(foreach pair,$(SAMPLE_PAIRS),mimsi/$(pair)/$(pair).txt) + $(call RUN, -c -n 1 -s 8G -m 
12G -v $(INNOVATION_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/mimsi.R --option 1 --sample_names '$(SAMPLE_PAIRS)'") + + +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY: mimsi diff --git a/variant_callers/somatic/msisensor.mk b/variant_callers/somatic/msisensor.mk index b61e0298..42f5f108 100644 --- a/variant_callers/somatic/msisensor.mk +++ b/variant_callers/somatic/msisensor.mk @@ -2,30 +2,29 @@ include modules/Makefile.inc LOGDIR ?= log/msisensor.$(NOW) -MSISENSOR_OPTS ?= -d $(REF_MSI) $(if $(TARGETS_FILE),-e $(TARGETS_FILE)) +msisensor: $(foreach pair,$(SAMPLE_PAIRS),msisensor/$(pair).msi) \ + msisensor/msi.tsv -PHONY += msisensor - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY : $(PHONY) - -msisensor: msisensor/msi.tsv +MICROSATELLITES_LIST = $(HOME)/share/lib/resource_files/MSIsensor/microsatellites.list +MSI_REGIONS = $(HOME)/share/lib/resource_files/MSIsensor/msiregions.bed define msisensor-tumor-normal -msisensor/$1_$2.msi : bam/$1.bam bam/$2.bam bam/$1.bam.bai bam/$2.bam.bai - $$(call RUN,-c -n 8 -s 1G -m 1.2G,"source ~/share/usr/anaconda-envs/jrflab-modules-0.1.5/bin/activate ~/share/usr/anaconda-envs/msisensor && \ - msisensor msi $$(MSISENSOR_OPTS) -n $$(<<) -t $$< -b 8 -o $$@") +msisensor/$1_$2.msi : bam/$1.bam bam/$2.bam + $$(call RUN,-c -n 8 -s 1G -m 2G -v $(MSISENSOR_ENV),"set -o pipefail && \ + msisensor msi $$(MSISENSOR_OPTS) \ + -d $$(MICROSATELLITES_LIST) \ + -e $$(MSI_REGIONS) \ + -n $$(<<) \ + -t $$(<) \ + -b 8 \ + -o $$(@)") endef -$(foreach pair,$(SAMPLE_PAIRS),$(eval $(call msisensor-tumor-normal,$(tumor.$(pair)),$(normal.$(pair))))) +$(foreach pair,$(SAMPLE_PAIRS),\ + $(eval $(call msisensor-tumor-normal,$(tumor.$(pair)),$(normal.$(pair))))) msisensor/msi.tsv : $(foreach pair,$(SAMPLE_PAIRS),msisensor/$(pair).msi) $(INIT) (head -1 $< | sed 's/^/sample\t/'; for x in $^; do sed "1d; s/^/$$(basename $$x)\t/" $$x; done | sed 's/_.*msi//' ) > $@ -bam/%.ds.bam : metrics/hs_metrics.tsv bam/%.bam - $(call RUN,-s 4G -m 6G,\ - "ds=\`py 'round(500 / 
pandas.read_table(\"$<\", index_col=0).ix[\"$*\", \"MEAN_TARGET_COVERAGE\"], 2)'\`; \ - if [ \$$(echo \"\$$ds >= 1\" | bc) -eq 1 ]; then ln -s \$$(readlink -f $(<<)) $@; else \ - samtools view -hb -s \$$ds $(<<) > $@; fi") - -include modules/bam_tools/processBam.mk +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY: msisensor diff --git a/variant_callers/somatic/polysolver.mk b/variant_callers/somatic/polysolver.mk index 3c4d3dd8..ce302f5e 100644 --- a/variant_callers/somatic/polysolver.mk +++ b/variant_callers/somatic/polysolver.mk @@ -1,44 +1,75 @@ include modules/Makefile.inc LOGDIR ?= log/hla_polysolver.$(NOW) -PHONY += hla_polysolver hla_polysolver/summary -hla_polysolver : $(foreach pair,$(SAMPLE_PAIRS),hla_polysolver/$(pair)/$(pair).taskcomplete) hla_polysolver/summary/genotype_summary.txt + +hla_polysolver : $(foreach pair,$(SAMPLE_PAIRS),hla_polysolver/$(pair)/winners.hla.txt) \ + $(foreach pair,$(SAMPLE_PAIRS),hla_polysolver/$(pair)/hla.intervals) \ + $(foreach pair,$(SAMPLE_PAIRS),hla_polysolver/$(pair)/$(pair).mutect.unfiltered.annotated) \ + $(foreach pair,$(SAMPLE_PAIRS),hla_polysolver/$(pair)/$(pair).strelka_indels.unfiltered.annotated) \ + hla_polysolver/summary/hla_summary.txt \ + hla_polysolver/summary/mutect_summary.txt \ + hla_polysolver/summary/strelka_summary.txt + define hla-polysolver hla_polysolver/$1_$2/winners.hla.txt : bam/$1.bam bam/$2.bam - $$(call RUN,-n 8 -s 2G -m 4G, "source /home/${USER}/share/usr/anaconda-envs/jrflab-modules-0.1.5/bin/activate \ - /home/${USER}/share/usr/anaconda-envs/hla-polysolver && \ - export CONDA_PREFIX=/home/${USER}/share/usr/anaconda-envs/hla-polysolver && \ - export PERL5LIB=/home/${USER}/share/usr/anaconda-envs/hla-polysolver/lib/perl5/5.22.0 && \ - if [ ! 
-d hla_polysolver/$1_$2 ]; then mkdir hla_polysolver/$1_$2; fi && \ - shell_call_hla_type bam/$2.bam Unknown 1 hg19 STDFQ 0 hla_polysolver/$1_$2") + $$(call RUN,-c -n 8 -s 2G -m 4G -v $(POLYSOLVER_ENV) -w 72:00:00, "set -o pipefail && \ + export CONDA_PREFIX=$$(POLYSOLVER_ENV) && \ + export PERL5LIB=$$(POLYSOLVER_ENV)/lib/perl5/5.22.0 && \ + shell_call_hla_type \ + $$(<<) \ + Unknown \ + 1 \ + hg19 \ + STDFQ \ + 0 \ + hla_polysolver/$1_$2") -hla_polysolver/$1_$2/hla.intervals : hla_polysolver/$1_$2/winners.hla.txt - $$(call RUN,-n 8 -s 2G -m 4G, "source /home/${USER}/share/usr/anaconda-envs/jrflab-modules-0.1.5/bin/activate \ - /home/${USER}/share/usr/anaconda-envs/hla-polysolver && \ - export CONDA_PREFIX=/home/${USER}/share/usr/anaconda-envs/hla-polysolver && \ - export PERL5LIB=/home/${USER}/share/usr/anaconda-envs/hla-polysolver/lib/perl5/5.22.0 && \ - shell_call_hla_mutations_from_type bam/$2.bam bam/$1.bam hla_polysolver/$1_$2/winners.hla.txt hg19 STDFQ hla_polysolver/$1_$2") +hla_polysolver/$1_$2/hla.intervals : bam/$1.bam bam/$2.bam hla_polysolver/$1_$2/winners.hla.txt + $$(call RUN,-c -n 8 -s 2G -m 4G -v $(POLYSOLVER_ENV) -w 72:00:00, "set -o pipefail && \ + export CONDA_PREFIX=$$(POLYSOLVER_ENV) && \ + export PERL5LIB=$$(POLYSOLVER_ENV)/lib/perl5/5.22.0 && \ + shell_call_hla_mutations_from_type \ + $$(<<) \ + $$(<) \ + $$(<<<) \ + hg19 \ + STDFQ \ + hla_polysolver/$1_$2") -hla_polysolver/$1_$2/$1_$2.mutect.unfiltered.annotated hla_polysolver/$1_$2/$1_$2.strelka_indels.unfiltered.annotated : hla_polysolver/$1_$2/hla.intervals - $$(call RUN,-n 8 -s 2G -m 4G, "source /home/${USER}/share/usr/anaconda-envs/jrflab-modules-0.1.5/bin/activate \ - /home/${USER}/share/usr/anaconda-envs/hla-polysolver && \ - export CONDA_PREFIX=/home/${USER}/share/usr/anaconda-envs/hla-polysolver && \ - export PERL5LIB=/home/${USER}/share/usr/anaconda-envs/hla-polysolver/lib/perl5/5.22.0 && \ - shell_annotate_hla_mutations $1_$2 hla_polysolver/$1_$2") 
+hla_polysolver/$1_$2/$1_$2.mutect.unfiltered.annotated : hla_polysolver/$1_$2/hla.intervals + $$(call RUN,-c -n 8 -s 2G -m 4G -v $(POLYSOLVER_ENV) -w 72:00:00, "set -o pipefail && \ + export CONDA_PREFIX=$$(POLYSOLVER_ENV) && \ + export PERL5LIB=$$(POLYSOLVER_ENV)/lib/perl5/5.22.0 && \ + shell_annotate_hla_mutations \ + $1_$2 \ + hla_polysolver/$1_$2") -hla_polysolver/$1_$2/$1_$2.taskcomplete : hla_polysolver/$1_$2/$1_$2.mutect.unfiltered.annotated hla_polysolver/$1_$2/$1_$2.strelka_indels.unfiltered.annotated - $$(call RUN,-n 1 -s 1G -m 1G,"touch hla_polysolver/$1_$2/$1_$2.taskcomplete") +hla_polysolver/$1_$2/$1_$2.strelka_indels.unfiltered.annotated : hla_polysolver/$1_$2/$1_$2.mutect.unfiltered.annotated + endef $(foreach pair,$(SAMPLE_PAIRS),\ $(eval $(call hla-polysolver,$(tumor.$(pair)),$(normal.$(pair))))) -hla_polysolver/summary/genotype_summary.txt : $(foreach pair,$(SAMPLE_PAIRS),hla_polysolver/$(pair)/$(pair).taskcomplete) - $(call RUN,-c -s 12G -m 24G,"mkdir -p hla_polysolver/summary && \ - $(RSCRIPT) modules/variant_callers/somatic/hla_summary.R --sample_names '$(SAMPLE_PAIRS)'") +hla_polysolver/summary/hla_summary.txt : $(foreach pair,$(SAMPLE_PAIRS),hla_polysolver/$(pair)/$(pair).mutect.unfiltered.annotated) $(foreach pair,$(SAMPLE_PAIRS),hla_polysolver/$(pair)/$(pair).strelka_indels.unfiltered.annotated) + $(call RUN,-c -s 12G -m 24G,"set -o pipefail && \ + $(RSCRIPT) modules/variant_callers/somatic/hla_summary.R --option 1 --sample_names '$(SAMPLE_PAIRS)'") -.DELETE_ON_ERROR: +hla_polysolver/summary/mutect_summary.txt : $(foreach pair,$(SAMPLE_PAIRS),hla_polysolver/$(pair)/$(pair).mutect.unfiltered.annotated) $(foreach pair,$(SAMPLE_PAIRS),hla_polysolver/$(pair)/$(pair).strelka_indels.unfiltered.annotated) + $(call RUN,-c -s 12G -m 24G,"set -o pipefail && \ + $(RSCRIPT) modules/variant_callers/somatic/hla_summary.R --option 2 --sample_names '$(SAMPLE_PAIRS)'") + +hla_polysolver/summary/strelka_summary.txt : $(foreach 
pair,$(SAMPLE_PAIRS),hla_polysolver/$(pair)/$(pair).mutect.unfiltered.annotated) $(foreach pair,$(SAMPLE_PAIRS),hla_polysolver/$(pair)/$(pair).strelka_indels.unfiltered.annotated) + $(call RUN,-c -s 12G -m 24G,"set -o pipefail && \ + $(RSCRIPT) modules/variant_callers/somatic/hla_summary.R --option 3 --sample_names '$(SAMPLE_PAIRS)'") + + +..DUMMY := $(shell mkdir -p version; \ + $(POLYSOLVER_ENV)/bin/shell_call_hla_type --help &> version/hla_polysolver.txt; \ + $(POLYSOLVER_ENV)/bin/shell_call_hla_mutations_from_type --help &>> version/hla_polysolver.txt; \ + $(POLYSOLVER_ENV)/bin/shell_annotate_hla_mutations --help &>> version/hla_polysolver.txt) .SECONDARY: -.PHONY: $(PHONY) +.DELETE_ON_ERROR: +.PHONY: hla_polysolver diff --git a/variant_callers/somatic/varscanTN.mk b/variant_callers/somatic/varscanTN.mk index b22b0e1f..560f0719 100644 --- a/variant_callers/somatic/varscanTN.mk +++ b/variant_callers/somatic/varscanTN.mk @@ -1,47 +1,40 @@ -# Run VarScan on tumour-normal matched pairs -# Detect point mutations -##### DEFAULTS ###### +include modules/Makefile.inc LOGDIR ?= log/varscanTN.$(NOW) -##### MAKE INCLUDES ##### -include modules/Makefile.inc - IGNORE_FP_FILTER ?= true - +VALIDATION ?= false FP_FILTER = $(PERL) $(HOME)/share/usr/bin/fpfilter.pl BAM_READCOUNT = $(HOME)/share/usr/bin/bam-readcount - VARSCAN_TO_VCF = $(PERL) modules/variant_callers/somatic/varscanTNtoVcf.pl - MIN_MAP_QUAL ?= 1 -VALIDATION ?= false MIN_VAR_FREQ ?= $(if $(findstring false,$(VALIDATION)),0.05,0.000001) -#VARSCAN VARSCAN_MEM = $(JAVA7) -Xmx$1 -jar $(VARSCAN_JAR) VARSCAN = $(call VARSCAN_MEM,8G) VARSCAN_OPTS = $(if $(findstring true,$(VALIDATION)),--validation 1 --strand-filter 0) --min-var-freq $(MIN_VAR_FREQ) - VARSCAN_SOURCE_ANN_VCF = python modules/vcf_tools/annotate_source_vcf.py --source varscan - VPATH ?= bam - VARSCAN_VARIANT_TYPES = varscan_indels varscan_snps -PHONY += varscan varscan_vcfs varscan_mafs -varscan : varscan_vcfs #varscan_mafs -varscan_vcfs : $(foreach 
type,$(VARSCAN_VARIANT_TYPES),$(foreach pair,$(SAMPLE_PAIRS),vcf/$(pair).$(type).vcf)) -varscan_mafs : $(foreach type,$(VARSCAN_VARIANT_TYPES),$(foreach pair,$(SAMPLE_PAIRS),maf/$(pair).$(type).maf)) - - -%.Somatic.txt : %.txt - $(call RUN,-s 5G -m 8G,"$(call VARSCAN_MEM,4G) somaticFilter $< && $(call VARSCAN_MEM,4G) processSomatic $< && rename .txt.Somatic .Somatic.txt $** && rename .txt.Germline .Germline.txt $** && rename .txt.LOH .LOH.txt $** && rename .txt.hc .hc.txt $**") +varscan : $(foreach chr,$(CHROMOSOMES),$(foreach pair,$(SAMPLE_PAIRS),varscan/chr_tables/$(pair).$(chr).varscan_timestamp)) \ + $(foreach chr,$(CHROMOSOMES),$(foreach pair,$(SAMPLE_PAIRS),varscan/chr_tables/$(pair).$(chr).snp.txt)) \ + $(foreach chr,$(CHROMOSOMES),$(foreach pair,$(SAMPLE_PAIRS),varscan/chr_tables/$(pair).$(chr).indel.txt)) \ + $(foreach pair,$(SAMPLE_PAIRS),varscan/tables/$(pair).snp.txt) \ + $(foreach pair,$(SAMPLE_PAIRS),varscan/tables/$(pair).indel.txt) \ + $(foreach pair,$(SAMPLE_PAIRS),varscan/tables/$(pair).snp.Somatic.txt) \ + $(foreach pair,$(SAMPLE_PAIRS),varscan/tables/$(pair).indel.Somatic.txt) \ + $(foreach pair,$(SAMPLE_PAIRS),varscan/vcf/$(pair).snp.Somatic.vcf) \ + $(foreach pair,$(SAMPLE_PAIRS),varscan/vcf/$(pair).indel.Somatic.vcf) \ + $(foreach type,$(VARSCAN_VARIANT_TYPES),$(foreach pair,$(SAMPLE_PAIRS),vcf/$(pair).$(type).vcf)) define varscan-somatic-tumor-normal-chr varscan/chr_tables/$1_$2.$3.varscan_timestamp : bam/$1.bam bam/$2.bam bam/$1.bam.bai bam/$2.bam.bai if [[ $$$$($$(SAMTOOLS) view $$< $3 | head -1 | wc -l) -gt 0 ]]; then \ - $$(call RUN,-s 9G -m 12G,"$$(VARSCAN) somatic \ + $$(call RUN,-s 9G -m 12G -w 72:00:00,"set -o pipefail && \ + rm -rf varscan/chr_tables/$1_$2.$3.snp.txt && \ + rm -rf varscan/chr_tables/$1_$2.$3.indel.txt && \ + $$(VARSCAN) somatic \ <($$(SAMTOOLS) mpileup -A -r $3 -q $$(MIN_MAP_QUAL) -f $$(REF_FASTA) $$(word 2,$$^)) \ <($$(SAMTOOLS) mpileup -A -r $3 -q $$(MIN_MAP_QUAL) -f $$(REF_FASTA) $$<) \ $$(VARSCAN_OPTS) \ @@ 
-54,14 +47,16 @@ varscan/chr_tables/$1_$2.$3.varscan_timestamp : bam/$1.bam bam/$2.bam bam/$1.bam fi varscan/chr_tables/$1_$2.$3.indel.txt : varscan/chr_tables/$1_$2.$3.varscan_timestamp + varscan/chr_tables/$1_$2.$3.snp.txt : varscan/chr_tables/$1_$2.$3.varscan_timestamp varscan/chr_tables/$1_$2.$3.%.fp_pass.txt : varscan/chr_tables/$1_$2.$3.%.txt bamrc/$1.$3.bamrc.gz $$(call RUN,-s 8G -m 55G,"$$(VARSCAN) fpfilter $$< <(zcat $$(<<)) --output-file $$@") + endef $(foreach chr,$(CHROMOSOMES), \ $(foreach pair,$(SAMPLE_PAIRS), \ - $(eval $(call varscan-somatic-tumor-normal-chr,$(tumor.$(pair)),$(normal.$(pair)),$(chr))))) + $(eval $(call varscan-somatic-tumor-normal-chr,$(tumor.$(pair)),$(normal.$(pair)),$(chr))))) define merge-varscan-pair-type varscan/tables/$1.$2.txt : $$(foreach chr,$$(CHROMOSOMES),\ @@ -69,16 +64,34 @@ varscan/tables/$1.$2.txt : $$(foreach chr,$$(CHROMOSOMES),\ varscan/chr_tables/$1.$$(chr).$2.txt,\ varscan/chr_tables/$1.$$(chr).$2.fp_pass.txt)) $$(INIT) head -1 $$< > $$@ && for x in $$^; do sed 1d $$$$x >> $$@; done + +endef +$(foreach pair,$(SAMPLE_PAIRS), \ + $(foreach type,snp indel, \ + $(eval $(call merge-varscan-pair-type,$(pair),$(type))))) + +define filter-varscan-pair-type +varscan/tables/$1.$2.Somatic.txt : varscan/tables/$1.$2.txt + $$(call RUN,-s 5G -m 8G,"set -o pipefail && \ + $$(VARSCAN) somaticFilter $$(<) && \ + $$(VARSCAN) processSomatic $$(<) && \ + cp varscan/tables/$1.$2.txt.Somatic varscan/tables/$1.$2.Somatic.txt") + endef $(foreach pair,$(SAMPLE_PAIRS), \ - $(foreach type,snp indel,$(eval $(call merge-varscan-pair-type,$(pair),$(type))))) + $(foreach type,snp indel, \ + $(eval $(call filter-varscan-pair-type,$(pair),$(type))))) define convert-varscan-tumor-normal -varscan/vcf/$1_$2.%.vcf : varscan/tables/$1_$2.%.txt - $$(call RUN,-s 4G -m 8G,"$$(VARSCAN_TO_VCF) -f $$(REF_FASTA) -t $1 -n $2 $$< | $$(VCF_SORT) $$(REF_DICT) - > $$@") +varscan/vcf/$1_$2.$3.Somatic.vcf : varscan/tables/$1_$2.$3.Somatic.txt + $$(call RUN,-s 4G 
-m 8G,"set -o pipefail && \ + $$(VARSCAN_TO_VCF) -f $$(REF_FASTA) -t $1 -n $2 $$(<) | $$(VCF_SORT) $$(REF_DICT) - > $$(@)") + + endef $(foreach pair,$(SAMPLE_PAIRS), \ - $(eval $(call convert-varscan-tumor-normal,$(tumor.$(pair)),$(normal.$(pair))))) + $(foreach type,snp indel, \ + $(eval $(call convert-varscan-tumor-normal,$(tumor.$(pair)),$(normal.$(pair)),$(type))))) vcf/%.varscan_indels.vcf : varscan/vcf/%.indel.Somatic.vcf $(INIT) $(VARSCAN_SOURCE_ANN_VCF) < $< > $@ @@ -86,15 +99,8 @@ vcf/%.varscan_indels.vcf : varscan/vcf/%.indel.Somatic.vcf vcf/%.varscan_snps.vcf : varscan/vcf/%.snp.Somatic.vcf $(INIT) $(VARSCAN_SOURCE_ANN_VCF) < $< > $@ -define bamrc-chr -bamrc/%.$1.bamrc.gz : bam/%.bam - $$(call RUN,-s 8G -m 12G,"$$(BAM_READCOUNT) -f $$(REF_FASTA) $$< $1 | gzip > $$@ 2> /dev/null") -endef -$(foreach chr,$(CHROMOSOMES),$(eval $(call bamrc-chr,$(chr)))) - include modules/variant_callers/gatk.mk .DELETE_ON_ERROR: .SECONDARY: -.PHONY: $(PHONY) - +.PHONY: varscan diff --git a/variant_callers/sufam_gt.mk b/variant_callers/sufam_gt.mk new file mode 100644 index 00000000..1b58f247 --- /dev/null +++ b/variant_callers/sufam_gt.mk @@ -0,0 +1,95 @@ +include modules/Makefile.inc + +LOGDIR ?= log/sufam_gt.$(NOW) + +SUFAM_ENV = $(HOME)/share/usr/anaconda-envs/sufam-dev +SUFAM_OPTS = --mpileup-parameters='-A -q 15 -Q 15 -d 15000' + +sufam_gt : $(foreach sample,$(TUMOR_SAMPLES),sufam/$(sample).vcf) \ + $(foreach sample,$(TUMOR_SAMPLES),sufam/$(sample).txt) \ + $(foreach sample,$(TUMOR_SAMPLES),sufam/$(sample).maf) \ + $(foreach sample,$(TUMOR_SAMPLES),sufam/$(sample)_ann.maf) \ + $(foreach set,$(SAMPLE_SETS),sufam/$(set).maf) \ + sufam/mutation_summary.maf \ + sufam/mutation_summary_ft.maf + +define sufam-gt +sufam/$1.vcf : summary/tsv/all.tsv + $$(call RUN,-c -n 1 -s 4G -m 8G -v $(INNOVATION_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/sufam_gt.R \ + --option 1 \ + --sample_set '$(set.$1)' \ + --normal_sample '$(normal.$1)' \ + --input_file $$(<) \ + 
--output_file $$(@)") + +sufam/$1.txt : sufam/$1.vcf bam/$1.bam + $$(call RUN,-c -n 1 -s 2G -m 3G -v $(SUFAM_ENV),"set -o pipefail && \ + sufam \ + --sample_name $1 \ + $$(SUFAM_OPTS) \ + $$(REF_FASTA) \ + $$(<) \ + $$(<<) \ + > $$(@)") + +sufam/$1.maf : sufam/$1.vcf + $$(call RUN,-c -n 12 -s 1G -m 2G -v $(VEP_ENV),"set -o pipefail && \ + $$(VCF2MAF) \ + --input-vcf $$< \ + --tumor-id $1 \ + --filter-vcf $$(EXAC_NONTCGA) \ + --ref-fasta $$(REF_FASTA) \ + --vep-path $$(VEP_PATH) \ + --vep-data $$(VEP_DATA) \ + --tmp-dir `mktemp -d` \ + --output-maf $$(@)") + +sufam/$1_ann.maf : sufam/$1.maf + $$(call RUN,-c -n 1 -s 2G -m 3G -v $(INNOVATION_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/sufam_gt.R \ + --option 2 \ + --tumor_sample $1 \ + --normal_sample '$(normal.$1)' \ + --input_file $$(<) \ + --output_file $$(@)") + + +endef +$(foreach sample,$(TUMOR_SAMPLES),\ + $(eval $(call sufam-gt,$(sample)))) + +define combine-maf +sufam/$1.maf : $(foreach sample,$(TUMOR_SAMPLES),sufam/$(sample).txt) $(foreach sample,$(TUMOR_SAMPLES),sufam/$(sample)_ann.maf) + $$(call RUN,-c -n 1 -s 4G -m 8G -v $(INNOVATION_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/sufam_gt.R \ + --option 3 \ + --sample_set '$(set.$1)' \ + --normal_sample '$(normal.$1)' \ + --output_file $$(@)") + +endef +$(foreach set,$(SAMPLE_SETS),\ + $(eval $(call combine-maf,$(set)))) + + +sufam/mutation_summary.maf : summary/tsv/all.tsv $(foreach set,$(SAMPLE_SETS),sufam/$(set).maf) + $(call RUN, -c -n 1 -s 8G -m 12G -v $(INNOVATION_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/sufam_gt.R \ + --option 4 \ + --sample_set '$(SAMPLE_SETS)' \ + --input_file $(<) \ + --output_file $(@)") + +sufam/mutation_summary_ft.maf : sufam/mutation_summary.maf + $(call RUN, -c -n 1 -s 8G -m 12G -v $(INNOVATION_ENV),"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/sufam_gt.R \ + --option 5 \ + --input_file $(<) \ + --output_file $(@)") + +..DUMMY := $(shell mkdir -p version; \ + R --version > 
version/sufam_gt.txt) +.DELETE_ON_ERROR: +.SECONDARY: +.PHONY: sufam_gt diff --git a/variant_callers/sufammultisample.mk b/variant_callers/sufammultisample.mk deleted file mode 100644 index cb0157c8..00000000 --- a/variant_callers/sufammultisample.mk +++ /dev/null @@ -1,42 +0,0 @@ -include modules/Makefile.inc - -LOGDIR ?= log/sufam_multisample.$(NOW) -PHONY += sufam summary - -sufam_multisample : $(foreach set,$(SAMPLE_SETS),sufam/$(set).tsv) summary/sufam_summary.xlsx - -ifeq ($(PDX),true) - -define combine-samples-pdx -sufam/%.txt : summary/tsv/mutation_summary.tsv - $$(call RUN,-c -s 4G -m 6G,"$(RSCRIPT) modules/variant_callers/combinesamplesf.R --sample_set $$*") - -sufam/%.tsv : sufam/%.txt - $$(call RUN,-c -s 4G -m 6G,"$(RSCRIPT) modules/variant_callers/updatesamples.R --sample_set $$*") - -endef -$(foreach set,$(SAMPLE_SETS),\ - $(eval $(call combine-samples-pdx,$(set)))) - -else - -define combine-samples -sufam/%.txt : summary/tsv/mutation_summary.tsv - $$(call RUN,-s 4G -m 6G,"$(RSCRIPT) modules/variant_callers/combinesamples.R --sample_set $$*") - -sufam/%.tsv : sufam/%.txt - $$(call RUN,-s 4G -m 6G,"$(RSCRIPT) modules/variant_callers/updatesamples.R --sample_set $$*") - -endef -$(foreach set,$(SAMPLE_SETS),\ - $(eval $(call combine-samples,$(set)))) - -endif - -summary/sufam_summary.xlsx : $(wildcard $(foreach set,$(SAMPLE_SETS),sufam/$(set).tsv)) - $(call RUN,-s 12G -m 16G,"export R_LIBS='~/share/usr/anaconda-envs/jrflab-modules-0.1.4/lib/R/library:~/share/usr/lib64/R/library' && \ - $(RSCRIPT) modules/summary/sufamsummary.R --sample_sets '$(SAMPLE_SETS)'") - -.DELETE_ON_ERROR: -.SECONDARY: -.PHONY: $(PHONY) \ No newline at end of file diff --git a/variant_callers/updatesamples.R b/variant_callers/updatesamples.R deleted file mode 100644 index d9be2876..00000000 --- a/variant_callers/updatesamples.R +++ /dev/null @@ -1,190 +0,0 @@ -#!/usr/bin/env Rscript - -suppressPackageStartupMessages(library("optparse")) 
-suppressPackageStartupMessages(library("CNtu")) -suppressPackageStartupMessages(library("readr")) - -if (!interactive()) { - options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) -} - -args_list <- list(make_option("--sample_set", default = NA, type = 'character', help = "sample names set")) - -parser <- OptionParser(usage = "%prog", option_list = args_list) -arguments <- parse_args(parser, positional_arguments = T) -opt <- arguments$options - -sample_names = unlist(strsplit(opt$sample_set, split="_", fixed=TRUE)) - -vars = read_tsv(file=paste0("sufam/", opt$sample_set, ".txt")) -col_names = colnames(vars) -vars = as.data.frame(vars) -colnames(vars) = col_names - -#==================================== -# sufam -#==================================== -chr = vars$Chromosome -pos = vars$Position -id = rep(".", nrow(vars)) -ref = vars$Ref -alt = vars$Alt -qual = rep(100, nrow(vars)) -filter = rep("PASS", nrow(vars)) -info = rep(".", nrow(vars)) -vcf = cbind(chr, pos, id, ref, alt, qual, filter, info) -colnames(vcf) = c("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO") -write.table(vcf, file=paste0("sufam/", opt$sample_set, ".vcf"), sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE) - -#==================================== -# dp and maf -#==================================== -for (i in 1:length(sample_names)) { - if (!file.exists(paste0("sufam/", sample_names[i], ".mat"))) { - system(paste0("source ~/share/usr/anaconda/bin/activate ~/share/usr/anaconda-envs/sufam-dev && sufam ~/share/reference/GATK_bundle/2.3/human_g1k_v37.fa sufam/", opt$sample_set, ".vcf bam/", sample_names[i], ".bam > sufam/", sample_names[i], ".mat")) - } - tmp = read.csv(file=paste0("sufam/", sample_names[i], ".mat"), header=TRUE, sep="\t", stringsAsFactors=FALSE) - index = paste0("DP_", sample_names[i]) - vars[,index] = tmp[,"cov"] - index = paste0("MAF_", sample_names[i]) - vars[,index] = tmp[,"val_maf"] -} - -#==================================== -# qt and q2 
-#==================================== -q_t = q_2 = NULL -for (i in 1:length(sample_names)) { - file_names = dir(path="ascat/ascat", pattern=".RData", full.names=TRUE) - index = grep(sample_names[i], file_names, fixed=TRUE) - if (length(index)==1) { - load(file_names[index]) - Chromosomes = tmp2$SNPpos[tmp3$seg[,"start"],1] - Chromosomes[Chromosomes==23] = "X" - Start = tmp2$SNPpos[tmp3$seg[,"start"],2] - End = tmp2$SNPpos[tmp3$seg[,"end"],2] - qt = tmp3$seg[,"nA"] + tmp3$seg[,"nB"] - q2 = apply(tmp3$seg[,c("nA","nB")], 1, max) - index = rep(NA, nrow(vars)) - for (j in 1:nrow(vars)) { - indx = which(Chromosomes==vars[j,"Chromosome"] & Start<=vars[j,"Position"] & End>=vars[j,"Position"]) - if (length(indx)!=0) { - index[j] = indx - } else { - index[j] = NA - } - } - q_t = cbind(q_t, qt[index]) - q_2 = cbind(q_2, q2[index]) - } else { - q_t = cbind(q_t, rep(2, nrow(vars))) - q_2 = cbind(q_2, rep(1, nrow(vars))) - } -} -q_t[is.na(q_t)] = 2 -q_2[is.na(q_2)] = 1 -colnames(q_t) = colnames(q_2) = sample_names -colnames(q_t) = paste0("qt_", colnames(q_t)) -colnames(q_2) = paste0("q2_", colnames(q_2)) -vars = cbind(vars, q_t, q_2) - -#==================================== -# loh -#==================================== -for (i in 1:length(sample_names)) { - loh = rep(0, nrow(vars)) - for (j in 1:nrow(vars)) { - if (q_t[j,i]==q_2[j,i]) { - loh[j] = 1 - } - } - vars[,paste0("LOH_", sample_names[i])] = loh -} - -#==================================== -# ccf -#==================================== -cancer_cell_fraction = NULL -ccf_95CI_low = NULL -ccf_95CI_high = NULL -pr_somatic_clonal = NULL -ll = NULL -sq = NULL -clonal_status = NULL -for (i in 1:length(sample_names)) { - file_names = dir(path="ascat/ascat", pattern=".RData", full.names=TRUE) - index = grep(sample_names[i], file_names, fixed=TRUE) - if (length(index)==1) { - load(file_names[index]) - f_hat = vars[,paste0("MAF_", sample_names[i])] - n = vars[,paste0("DP_", sample_names[i])] - qt = vars[,paste0("qt_", 
sample_names[i])] - qt[qt>10] = 10 - q2 = vars[,paste0("q2_", sample_names[i])] - q2[q2>10] = 10 - alpha = seq(.1, to=.9, length=50) - alpha_hat = list() - indx = f_hat>.1 - if (sum(indx)>5) { - for (j in 1:length(alpha)) { - alpha_hat[[j]] = cancercellFraction(f_hat[indx], n[indx], qt[indx], q2[indx], alpha[j], e=0.01) - } - LL = unlist(lapply(alpha_hat, function(x) {sum(x[,"LL"])})) - pdf(file=paste0("sufam/", sample_names[i], ".pdf")) - plot(alpha, LL, type="o", col="steelblue", axes=FALSE, frame.plot=FALSE, xlab="", ylab="") - axis(1, at = NULL, cex.axis = 1.5, padj = 0.25) - axis(2, at = NULL, cex.axis = 1.5, las = 1) - mtext(side = 1, text = expression(alpha), line = 4, cex = 1.5) - mtext(side = 2, text = expression(Sigma~"LL"), line = 4, cex = 1.5) - index = which.max(LL) - title(main = paste0("alpha* = ", signif(alpha[index], 3)), cex.main = 1.5) - box(lwd = 2) - dev.off() - index = which.max(LL) - alpha_hat = cancercellFraction(f_hat, n, qt, q2, ifelse((alpha[index]-.25)<=0, alpha[index], alpha[index]-.25), e=0.01) - cancer_cell_fraction = cbind(cancer_cell_fraction, alpha_hat[,"cancer_cell_frac"]) - ccf_95CI_low = cbind(ccf_95CI_low, alpha_hat[,"ccf_95CI_low"]) - ccf_95CI_high = cbind(ccf_95CI_high, alpha_hat[,"ccf_95CI_high"]) - pr_somatic_clonal = cbind(pr_somatic_clonal, alpha_hat[,"Pr_somatic_clonal"]) - ll = cbind(ll, alpha_hat[,"LL"]) - sq = cbind(sq, alpha_hat[,"sq"]) - clonal_estimate = rep("Subclonal", nrow(vars)) - clonal_estimate[cancer_cell_fraction[,i]>.75 | pr_somatic_clonal[,i]>.5 | ccf_95CI_low[,i]>.9] = "Clonal" - clonal_status = cbind(clonal_status, clonal_estimate) - } else { - cancer_cell_fraction = cbind(cancer_cell_fraction, rep(NA, nrow(vars))) - ccf_95CI_low = cbind(ccf_95CI_low, rep(NA, nrow(vars))) - ccf_95CI_high = cbind(ccf_95CI_high, rep(NA, nrow(vars))) - pr_somatic_clonal = cbind(pr_somatic_clonal, rep(NA, nrow(vars))) - ll = cbind(ll, rep(NA, nrow(vars))) - sq = cbind(sq, rep(NA, nrow(vars))) - clonal_status = 
cbind(clonal_status, rep(NA, nrow(vars))) - } - } else { - cancer_cell_fraction = cbind(cancer_cell_fraction, rep(NA, nrow(vars))) - ccf_95CI_low = cbind(ccf_95CI_low, rep(NA, nrow(vars))) - ccf_95CI_high = cbind(ccf_95CI_high, rep(NA, nrow(vars))) - pr_somatic_clonal = cbind(pr_somatic_clonal, rep(NA, nrow(vars))) - ll = cbind(ll, rep(NA, nrow(vars))) - sq = cbind(sq, rep(NA, nrow(vars))) - clonal_status = cbind(clonal_status, rep(NA, nrow(vars))) - } -} -colnames(cancer_cell_fraction) = colnames(ccf_95CI_low) = colnames(ccf_95CI_high) = colnames(pr_somatic_clonal) = colnames(ll) = colnames(sq) = colnames(clonal_status) = sample_names -colnames(cancer_cell_fraction) = paste0("CCF_", colnames(cancer_cell_fraction)) -colnames(ccf_95CI_low) = paste0("CCF_95CI_Low_", colnames(ccf_95CI_low)) -colnames(ccf_95CI_high) = paste0("CCF_95CI_High_", colnames(ccf_95CI_high)) -colnames(pr_somatic_clonal) = paste0("Pr_Somatic_Clonal_", colnames(pr_somatic_clonal)) -colnames(ll) = paste0("LL_", colnames(ll)) -colnames(sq) = paste0("sq_", colnames(sq)) -colnames(clonal_status) = paste0("Clonal_Status_", colnames(clonal_status)) - -vars = cbind(vars, cancer_cell_fraction, - ccf_95CI_low, - ccf_95CI_high, - pr_somatic_clonal, - ll, - sq, - clonal_status) - -write.table(vars, file=paste0("sufam/", opt$sample_set, ".tsv"), col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE) diff --git a/vcf_tools/annotateSummaryVcf.mk b/vcf_tools/annotateSummaryVcf.mk new file mode 100644 index 00000000..58233a61 --- /dev/null +++ b/vcf_tools/annotateSummaryVcf.mk @@ -0,0 +1,40 @@ +include modules/Makefile.inc +include modules/genome_inc/b37.inc + +LOGDIR ?= log/annotate_smry_maf.$(NOW) + +annotate_smry_maf : vcf2maf/mutation_summary.vcf \ + vcf2maf/mutation_summary.maf \ + vcf2maf/mutation_summary.txt + +vcf2maf/mutation_summary.vcf : summary/tsv/mutation_summary.tsv + $(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/annotateSummaryVcf.R --option 1 --input $(<) 
--output $(@)") + +vcf2maf/mutation_summary.maf : vcf2maf/mutation_summary.vcf + $(call RUN, -c -n 12 -s 2G -m 3G -v $(VEP_ENV) -w 72:00:00,"set -o pipefail && \ + $(VCF2MAF) \ + --input-vcf $(<) \ + --output-maf $(@) \ + --tmp-dir $(TMPDIR) \ + --tumor-id NA \ + --normal-id NA \ + --vep-path $(VEP_ENV)/bin \ + --vep-data $(HOME)/share/reference/vep/v86/ \ + --vep-forks 12 \ + --ref-fasta $(HOME)/share/reference/vep/v86/homo_sapiens/86_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa \ + --filter-vcf $(HOME)/share/reference/vep/v86/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz \ + --species homo_sapiens \ + --ncbi-build GRCh37 \ + --maf-center MSKCC && \ + $(RM) $(TMPDIR)/mutation_summary.vep.vcf") + +vcf2maf/mutation_summary.txt : summary/tsv/mutation_summary.tsv vcf2maf/mutation_summary.maf + $(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \ + $(RSCRIPT) $(SCRIPTS_DIR)/annotateSummaryVcf.R --option 2 --input $(<) --maf $(<<) --output $(@)") + +..DUMMY := $(shell mkdir -p version; \ + source $(VCF2MAF_ENV)/bin/activate $(VCF2MAF_ENV) && $(VCF2MAF) --man >> version/annotate_smry_maf.txt) +.DELETE_ON_ERROR: +.SECONDARY: +.PHONY: annotate_smry_maf diff --git a/vcf_tools/annotate_sv.mk b/vcf_tools/annotate_sv.mk new file mode 100644 index 00000000..78d340d7 --- /dev/null +++ b/vcf_tools/annotate_sv.mk @@ -0,0 +1,44 @@ +include modules/Makefile.inc + +LOGDIR ?= log/anotate_sv.$(NOW) + +SV_CALLERS = svaba manta gridss merged +ANNOTATE_SV ?= $(HOME)/share/usr/env/annot_sv-3.1.3/opt/AnnotSV/bin/AnnotSV + +annotate_sv : $(foreach pair,$(SAMPLE_PAIRS), \ + $(foreach caller,$(SV_CALLERS),annotate_sv/$(pair)/$(pair).$(caller)_sv.tsv)) \ + $(foreach pair,$(SAMPLE_PAIRS), \ + $(foreach caller,$(SV_CALLERS),annotate_sv/$(pair)/$(pair).$(caller)_sv.maf)) + +define annotate-sv +annotate_sv/$1/$2/$1.$2_sv.tsv : vcf/$1.$2_sv.vcf + $$(call RUN,-c -n 1 -s 4G -m 8G -v $(ANNOTATE_SV_ENV),"set -o pipefail && \ + mkdir -p annotate_sv/$1/$2 && \ + $$(ANNOTATE_SV) \ + -SVinputFile $$(<) \ 
+ -outputFile ./annotate_sv/$1/$2/$1.$2_sv.tsv \ + -genomeBuild GRCh37") + +annotate_sv/$1/$1.$2_sv.tsv : annotate_sv/$1/$2/$1.$2_sv.tsv + $$(INIT) cat $$(<) > $$(@) + +annotate_sv/$1/$1.$2_sv.maf : vcf/$1.$2_sv.vcf + $$(call RUN,-c -n 12 -s 1G -m 2G -v $(VEP_ENV),"set -o pipefail && \ + $$(VCF2MAF) \ + --input-vcf $$(<) \ + --tumor-id $1 \ + --filter-vcf $$(EXAC_NONTCGA) \ + --ref-fasta $$(REF_FASTA) \ + --vep-path $$(VEP_PATH) \ + --vep-data $$(VEP_DATA) \ + --tmp-dir `mktemp -d` \ + --output-maf $$(@)") + +endef +$(foreach pair,$(SAMPLE_PAIRS),\ + $(foreach caller,$(SV_CALLERS), \ + $(eval $(call annotate-sv,$(pair),$(caller))))) + +.DELETE_ON_ERROR: +.SECONDARY: +.PHONY: annotate_sv diff --git a/vcf_tools/cravat_annotation.mk b/vcf_tools/cravat_annotation.mk index 9181800f..d8ae4961 100644 --- a/vcf_tools/cravat_annotation.mk +++ b/vcf_tools/cravat_annotation.mk @@ -1,33 +1,57 @@ include modules/Makefile.inc LOGDIR ?= log/cravat_annotate.$(NOW) -PHONY += cravat -cravat_annotate : $(foreach sample,$(SAMPLES),cravat/$(sample).vcf cravat/$(sample).maf cravat/$(sample).cravat.vcf cravat/$(sample).tsv cravat/$(sample).txt) - -DEFAULT_ENV = $(HOME)/share/usr/anaconda-envs/jrflab-modules-0.1.6 -CRAVAT_ENV = $(HOME)/share/usr/anaconda-envs/open-cravat +cravat_annotate : $(foreach sample,$(SAMPLES),cravat/$(sample).vcf) \ + $(foreach sample,$(SAMPLES),cravat/$(sample).maf) \ + $(foreach sample,$(SAMPLES),cravat/$(sample).cravat.vcf) \ + $(foreach sample,$(SAMPLES),cravat/$(sample).tsv) \ + $(foreach sample,$(SAMPLES),cravat/$(sample).txt) define cravat-annotation -cravat/%.vcf : vcf_ann/%.gatk_snps.vcf vcf_ann/%.gatk_indels.vcf - $$(call RUN,-c -s 9G -m 12G -w 7200,"$(RSCRIPT) modules/vcf_tools/combine_vcf.R --sample_name $$(*)") +cravat/$1.vcf : vcf_ann/$1.gatk_snps.vcf vcf_ann/$1.gatk_indels.vcf + $$(call RUN,-c -s 9G -m 12G -w 24:00:00,"set -o pipefail && \ + $(RSCRIPT) modules/vcf_tools/combine_vcf.R \ + --sample_name $$(*)") -cravat/%.maf : cravat/%.vcf - $$(call 
RUN,-s 9G -m 12G -v $$(VEP_ENV) -w 7200,"$$(VCF2MAF) --input-vcf $$< --tumor-id $$(*) $$(if $$(EXAC_NONTCGA),--filter-vcf $$(EXAC_NONTCGA)) --ref-fasta $$(REF_FASTA) --vep-path $$(VEP_PATH) --vep-data $$(VEP_DATA) --tmp-dir `mktemp -d` --output-maf $$@") +cravat/$1.maf : cravat/$1.vcf + $$(call RUN,-c -s 9G -m 12G -v $(VEP_ENV) -w 24:00:00,"set -o pipefail && \ + $$(VCF2MAF) \ + --input-vcf $$(<) \ + --tumor-id $1 \ + $$(if $$(EXAC_NONTCGA),--filter-vcf $$(EXAC_NONTCGA)) \ + --ref-fasta $$(REF_FASTA) \ + --vep-path $$(VEP_PATH) \ + --vep-data $$(VEP_DATA) \ + --tmp-dir `mktemp -d` \ + --output-maf $$(@)") -cravat/%.cravat.vcf : cravat/%.vcf cravat/%.maf - $$(call RUN,-c -s 9G -m 12G -w 7200,"$(RSCRIPT) modules/vcf_tools/filter_vcf.R --sample_name $$(*)") +cravat/$1.cravat.vcf : cravat/$1.vcf cravat/$1.maf + $$(call RUN,-c -s 9G -m 12G -w 24:00:00,"set -o pipefail && \ + $(RSCRIPT) modules/vcf_tools/filter_vcf.R \ + --sample_name $1") -cravat/%.tsv: cravat/%.cravat.vcf - $$(call RUN,-c -s 9G -m 12G -v $$(DEFAULT_ENV) -w 7200,"source activate $$(CRAVAT_ENV) && \ - cravat cravat/$$(*).cravat.vcf -n $$(*) -d cravat -a clinvar cosmic dbsnp gnomad hgvs -v -l hg19 -t text") +cravat/$1.tsv: cravat/$1.cravat.vcf + $$(call RUN,-c -s 9G -m 12G -v $(CRAVAT_ENV) -w 24:00:00,"set -o pipefail && \ + cravat $$(<) \ + -n $1 \ + -d cravat \ + -a clinvar cosmic dbsnp gnomad hgvs \ + -v \ + -l hg19 \ + -t text") -cravat/%.txt : cravat/%.tsv - $$(call RUN,-c -s 9G -m 12G -w 7200,"$(RSCRIPT) modules/vcf_tools/summary_vcf.R --sample_name $$(*)") +cravat/$1.txt : cravat/$1.tsv + $$(call RUN,-c -s 9G -m 12G -w 24:00:00,"set -o pipefail && \ + $(RSCRIPT) modules/vcf_tools/summary_vcf.R \ + --sample_name $1") endef $(foreach sample,$(SAMPLES),\ $(eval $(call cravat-annotation,$(sample)))) -.PHONY: $(PHONY) - +..DUMMY := $(shell mkdir -p version; \ + echo "cravat" > version/cravat_annotate.txt;) +.SECONDARY: +.DELETE_ON_ERROR: +.PHONY : cravat_annotate diff --git a/vcf_tools/merge_sv.mk 
b/vcf_tools/merge_sv.mk new file mode 100644 index 00000000..1c301488 --- /dev/null +++ b/vcf_tools/merge_sv.mk @@ -0,0 +1,42 @@ +include modules/Makefile.inc + +LOGDIR ?= log/merge_sv.$(NOW) + +SV_CALLERS = svaba gridss manta +MAX_DIST = 500 +NUM_CALLERS = 2 +TYPE = 0 +STRAND = 0 +MIN_SIZE = 30 + +merge_sv : $(foreach pair,$(SAMPLE_PAIRS),merge_sv/$(pair)/samples.txt) \ + $(foreach pair,$(SAMPLE_PAIRS),merge_sv/$(pair)/$(pair).merged_sv.vcf) \ + $(foreach pair,$(SAMPLE_PAIRS),merge_sv/$(pair)/$(pair).merged_sv_ft.vcf) \ + $(foreach pair,$(SAMPLE_PAIRS),vcf/$(pair).merged_sv.vcf) + +define merge-sv +merge_sv/$1_$2/samples.txt : $(foreach caller,$(SV_CALLERS),vcf/$1_$2.$(caller)_sv.vcf) + mkdir -p merge_sv/$1_$2 && \ + $(foreach caller,$(SV_CALLERS),echo vcf/$1_$2.$(caller)_sv.vcf >> $$(@);) + +merge_sv/$1_$2/$1_$2.merged_sv.vcf : merge_sv/$1_$2/samples.txt + $$(call RUN,-c -n 1 -s 4G -m 8G -v $(SURVIVOR_ENV),"set -o pipefail && \ + SURVIVOR merge $$(<) \ + $(MAX_DIST) $(NUM_CALLERS) $(TYPE) $(STRAND) 0 $(MIN_SIZE) $$(@)") + +merge_sv/$1_$2/$1_$2.merged_sv_ft.vcf : merge_sv/$1_$2/$1_$2.merged_sv.vcf + $$(call RUN,-c -n 1 -s 4G -m 8G -v $(INNOVATION_ENV),"set -o pipefail && \ + grep '##' $$(<) > $$(@) && \ + $$(RSCRIPT) modules/scripts/filter_sv.R --input_file $$(<) --output_file $$(@)") + + +vcf/$1_$2.merged_sv.vcf : merge_sv/$1_$2/$1_$2.merged_sv_ft.vcf + $$(INIT) cat $$(<) > $$(@) + +endef +$(foreach pair,$(SAMPLE_PAIRS),\ + $(eval $(call merge-sv,$(tumor.$(pair)),$(normal.$(pair))))) + +.DELETE_ON_ERROR: +.SECONDARY: +.PHONY: merge_sv diff --git a/vcf_tools/vcftools.mk b/vcf_tools/vcftools.mk index 18656fc6..0bda8c32 100644 --- a/vcf_tools/vcftools.mk +++ b/vcf_tools/vcftools.mk @@ -1,6 +1,3 @@ -# vim: set ft=make : -# sub module containing vcf related tools - ifndef VCFTOOLS_MK include modules/Makefile.inc