diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..cc82a48 --- /dev/null +++ b/.gitignore @@ -0,0 +1,122 @@ + +# Created by https://www.gitignore.io/ + +### OSX ### +*.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + + +### Windows ### +# Windows image file caches +Thumbs.db +ehthumbs.db + +# Folder config file +Desktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msm +*.msp + +# Windows shortcuts +*.lnk + + +### R ### +# History files +.Rhistory +.Rapp.history + +# Session Data files +.RData + +# Example code in package build process +*-Ex.R + +# Output files from R CMD build +/*.tar.gz + +# Output files from R CMD check +/*.Rcheck/ + +# RStudio files +.Rproj.user/ + +# produced vignettes +vignettes/*.html +vignettes/*.pdf + +# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 +.httr-oauth + +# knitr and R markdown default cache directories +/*_cache/ +/cache/ + +# Temporary files created by R markdown +*.utf8.md +*.knit.md + + +### Perl ### +/blib/ +/.build/ +_build/ +cover_db/ +inc/ +Build +!Build/ +Build.bat +.last_cover_stats +/Makefile +/Makefile.old +/MANIFEST.bak +/META.yml +/META.json +/MYMETA.* +nytprof.out +/pm_to_blib +*.o +*.bs +/_eumm/ diff --git a/gather-fastqs b/gather-fastqs new file mode 100755 index 0000000..d3e8861 --- /dev/null +++ b/gather-fastqs @@ -0,0 +1,133 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use List::MoreUtils qw(uniq); + +my $HELP = <>", $filename); + + # process each fastq + while (my $fastq_r1 = shift(@fastqs)) { + chomp($fastq_r1); + + # check that R1 exists + unless ( -e $fastq_r1 ) { + die "\n\n ERROR! 
$fastq_r1 DOES NOT EXIST \n\n"; + } + + # generate R2 filename + my $fastq_r2 = $fastq_r1; + $fastq_r2 =~ s/(.*)_R1_00(.*.fastq.gz)/${1}_R2_00${2}/; + $fastq_r2 =~ s/(.*)_R1.fastq.gz/${1}_R2.fastq.gz/; + $fastq_r2 =~ s/(.*)_1.fastq.gz/${1}_2.fastq.gz/; + + # blank if R2 does not exist + unless ( -e $fastq_r2 ) { + $fastq_r2 = ""; + } + + # blank if R2 is same as R1 (in case of not standard file name, for example) + if ( $fastq_r1 eq $fastq_r2 ) { + $fastq_r2 = ""; + } + + # count based on read type + if ( length($fastq_r2) ) { + $reads_pe++; + } + else { + $reads_se++; + } + + # extract sample name + my $sample = $fastq_r1; + # remove directory structure + $sample =~ s/.*\///; + # bcl2fastq2 format (with S sample number) + $sample =~ s/_S[0-9]{1,3}_L00[0-9]_R1.*//; + # bcl2fastq format with 2 barcodes + $sample =~ s/_[ACTG]{6,}-[ACTG]{6,}_L00[0-9]_R1.*//; + # bcl2fastq format with 1 barcode + $sample =~ s/_[ACTG]{4,}_L00[0-9]_R1.*//; + # no barcodes + $sample =~ s/_L00[0-9]_R[12].*//; + # no barcodes or lane + $sample =~ s/_R[12].fastq.gz//; + # no barcodes or lane + $sample =~ s/_[12].fastq.gz//; + + push @samples, $sample; + + # show progress + print STDERR " SAMPLE : $sample \n"; + print STDERR " FASTQ R1 : $fastq_r1 \n"; + print STDERR " FASTQ R2 : $fastq_r2 \n"; + + # print sample table line + my $output = "${sample},${fastq_r1},${fastq_r2}\n"; + print $fh "$output"; + + } + close($fh); + + # remove duplicate entries + system("cat $filename | LC_ALL=C sort | uniq > ${filename}.tmp && mv -f ${filename}.tmp $filename"); + + # get number of unique sample names + my $num_files = @samples; + @samples = uniq(@samples); + my $num_samples = @samples; + + # print stats + print STDERR "\n"; + print STDERR " NUMBER OF SAMPLES : $num_samples \n"; + print STDERR " NUMBER OF SINGLE FILES : $reads_se \n"; + print STDERR " NUMBER OF PAIRED FILES : $reads_pe \n"; + +} + + + +# end diff --git a/generate-settings b/generate-settings new file mode 100755 index 0000000..91ff2b7 --- /dev/null +++ b/generate-settings @@ -0,0 +1,65 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use File::Basename; + +my $HELP = <", $settings_file); + # print $fh $genome_setting; + print $fh $genome_dir_setting; + close $fh; + + # set fasta to test genome dir setting + # system("bash ${pipeline_dir}/scripts/get-set-setting.sh $settings_file REF-FASTA"); + + # get values to make sure they were set properly + my $settings_genome = `bash ${pipeline_dir}/scripts/get-set-setting.sh $settings_file GENOME-DIR`; + my $settings_fasta = `bash ${pipeline_dir}/scripts/get-set-setting.sh $settings_file REF-FASTA`; + + # print values + print STDERR "\n"; + print STDERR " REF DIR : $settings_genome \n"; + print STDERR " REF FASTA : $settings_fasta \n"; + +} + + + +# end diff --git a/routes/comp-rna-star-dge.sh b/routes/comp-rna-star-dge.sh new file mode 100755 index 0000000..f0e83a9 --- /dev/null +++ b/routes/comp-rna-star-dge.sh @@ -0,0 +1,183 @@ +#!/bin/bash + + +## +## RNA-seq differential gene expression using DESeq2 +## + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +route_name=${script_name/%.sh/} +echo -e "\n ========== ROUTE: $route_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! 
$# == 1 ] ; then
+	echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2
+	echo -e "\n USAGE: $script_name project_dir \n" >&2
+	exit 1
+fi
+
+# standard comparison route arguments
+proj_dir=$(readlink -f "$1")
+
+# additional settings
+code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")")
+qsub_dir="${proj_dir}/logs-qsub"
+
+# display settings
+echo " * proj_dir: $proj_dir "
+echo " * code_dir: $code_dir "
+echo " * qsub_dir: $qsub_dir "
+
+
+#########################
+
+
+# delete empty qsub .po files
+rm -f ${qsub_dir}/sns.*.po*
+
+
+#########################
+
+
+# check that inputs exist
+
+if [ ! -d "$proj_dir" ] ; then
+	echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2
+	exit 1
+fi
+
+gene_info_table="${proj_dir}/genes.featurecounts.txt"
+
+if [ ! -s "$gene_info_table" ] ; then
+	echo -e "\n $script_name ERROR: GENE INFO $gene_info_table DOES NOT EXIST \n" >&2
+	exit 1
+fi
+
+groups_table="${proj_dir}/samples.groups.csv"
+
+if [ ! -s "$groups_table" ] ; then
+	echo -e "\n $script_name ERROR: GROUP TABLE $groups_table DOES NOT EXIST \n" >&2
+	exit 1
+fi
+
+num_samples=$(cat "$groups_table" | grep -v "#SAMPLE" | wc -l)
+
+if [ "$num_samples" -lt 3 ] ; then
+	echo -e "\n $script_name ERROR: $num_samples SAMPLES IS TOO FEW (AT LEAST 3 REQUIRED) \n" >&2
+	exit 1
+fi
+
+strand=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" EXP-STRAND);
+counts_table="${proj_dir}/quant.featurecounts.counts.${strand}.txt"
+
+if [ ! -s "$counts_table" ] ; then
+	echo -e "\n $script_name ERROR: COUNTS TABLE $counts_table DOES NOT EXIST \n" >&2
+	exit 1
+fi
+
+
+#########################
+
+
+# settings and files
+
+dge_dir="${proj_dir}/DGE-DESeq2-${strand}-${num_samples}samples"
+
+mkdir -p "$dge_dir"
+
+input_groups_table="${dge_dir}/input.groups.csv"
+input_counts_table="${dge_dir}/input.counts.txt"
+
+
+#########################
+
+
+# exit if output exists already
+
+if [ -s "$input_groups_table" ] ; then
+	echo -e "\n $script_name ERROR: TABLE $input_groups_table ALREADY EXISTS \n" >&2
+	exit 1
+fi
+
+if [ -s "$input_counts_table" ] ; then
+	echo -e "\n $script_name ERROR: TABLE $input_counts_table ALREADY EXISTS \n" >&2
+	exit 1
+fi
+
+
+#########################
+
+
+echo -e "\n ========== set up inputs ========== \n"
+
+echo
+echo " * counts table: $counts_table "
+echo " * groups table: $groups_table "
+echo
+
+bash_cmd="rsync -t $counts_table $input_counts_table"
+echo "CMD: $bash_cmd"
+($bash_cmd)
+
+bash_cmd="rsync -t $groups_table $input_groups_table"
+echo "CMD: $bash_cmd"
+($bash_cmd)
+
+sleep 3
+
+
+#########################
+
+
+echo -e "\n ========== test R ========== \n"
+
+# load relevant modules
+module unload r
+module load r/3.3.0
+module unload zlib
+
+echo
+echo " * R: $(readlink -f $(which R)) "
+echo " * R version: $(R --version | head -1) "
+echo " * Rscript: $(readlink -f $(which Rscript)) "
+echo " * Rscript version: $(Rscript --version 2>&1) "
+echo
+
+# test R installation and settings
+bash_cmd="Rscript --vanilla ${code_dir}/scripts/test-packages.R"
+echo "CMD: $bash_cmd"
+($bash_cmd)
+
+sleep 3
+
+
+#########################
+
+
+echo -e "\n ========== start analysis ========== \n"
+
+cd "$dge_dir" || exit 1
+
+# run the differential expression analysis
+bash_cmd="Rscript --vanilla ${code_dir}/scripts/dge-deseq2.R $input_counts_table $input_groups_table"
+echo "CMD: $bash_cmd"
+($bash_cmd)
+
+
+#########################
+
+
+# delete empty qsub .po files
+rm -f ${qsub_dir}/sns.*.po*
+
+
+#########################
+
+
+date
+
+
+
+# end
diff
--git a/routes/rna-rsem.sh b/routes/rna-rsem.sh new file mode 100755 index 0000000..60a2dc6 --- /dev/null +++ b/routes/rna-rsem.sh @@ -0,0 +1,104 @@ +#!/bin/bash + + +## +## RNA-seq using RSEM aligner +## + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +route_name=${script_name/%.sh/} +echo -e "\n ========== ROUTE: $route_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 2 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name project_dir sample_name \n" >&2 + exit 1 +fi + +# standard route arguments +proj_dir=$(readlink -f "$1") +sample=$2 + +# additional settings +threads=$NSLOTS +code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +qsub_dir="${proj_dir}/logs-qsub" + +# display settings +echo " * proj_dir: $proj_dir " +echo " * sample: $sample " +echo " * code_dir: $code_dir " +echo " * qsub_dir: $qsub_dir " +echo " * threads: $threads " + + +######################### + + +# delete empty qsub .po files +rm -f ${qsub_dir}/sns.*.po* + + +######################### + + +# segments + +# rename and/or merge raw input FASTQs +segment_fastq_clean="fastq-fastq-clean" +fastq_R1=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 2) +fastq_R2=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 3) +if [ -z "$fastq_R1" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_fastq_clean}.sh $proj_dir $sample" + ($bash_cmd) + fastq_R1=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 2) + fastq_R2=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 3) +fi + +# fastq_screen +bash_cmd="bash ${code_dir}/segments/fastq-qc-fastqscreen.sh $proj_dir $sample $fastq_R1" +($bash_cmd) + +# RSEM +segment_quant="fastq-quant-rsem" +bash_cmd="bash ${code_dir}/segments/${segment_quant}.sh $proj_dir $sample $threads unstr $fastq_R1 $fastq_R2" +($bash_cmd) + + +######################### + + +# combine summary from each step + +sleep 30 + +summary_csv="${proj_dir}/summary-combined.rna-rsem.csv" + +bash_cmd=" +bash ${code_dir}/scripts/join-many.sh , X \ +${proj_dir}/summary.${segment_fastq_clean}.csv \ +${proj_dir}/summary.x.csv \ +${proj_dir}/summary.x.csv \ +> $summary_csv +" +(eval $bash_cmd) + + +######################### + + +# delete empty qsub .po files +rm -f ${qsub_dir}/sns.*.po* + + +######################### + + +date + + + +# end diff --git a/routes/rna-star.sh b/routes/rna-star.sh new file mode 100755 index 0000000..a1db895 --- /dev/null +++ b/routes/rna-star.sh @@ -0,0 +1,161 @@ +#!/bin/bash + + +## +## RNA-seq using STAR aligner +## + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +route_name=${script_name/%.sh/} +echo -e "\n ========== ROUTE: $route_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! 
$# == 2 ] ; then
+	echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2
+	echo -e "\n USAGE: $script_name [project dir] [sample name] \n" >&2
+	exit 1
+fi
+
+# standard route arguments
+proj_dir=$(readlink -f "$1")
+sample=$2
+
+# additional settings
+threads=$NSLOTS
+code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")")
+qsub_dir="${proj_dir}/logs-qsub"
+
+# display settings
+echo " * proj_dir: $proj_dir "
+echo " * sample: $sample "
+echo " * code_dir: $code_dir "
+echo " * qsub_dir: $qsub_dir "
+echo " * threads: $threads "
+
+
+#########################
+
+
+# delete empty qsub .po files
+rm -f ${qsub_dir}/sns.*.po*
+
+
+#########################
+
+
+# segments
+
+# rename and/or merge raw input FASTQs
+segment_fastq_clean="fastq-fastq-clean"
+fastq_R1=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 2)
+fastq_R2=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 3)
+if [ -z "$fastq_R1" ] ; then
+	bash_cmd="bash ${code_dir}/segments/${segment_fastq_clean}.sh $proj_dir $sample"
+	($bash_cmd)
+	fastq_R1=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 2)
+	fastq_R2=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 3)
+fi
+
+# fastq_screen
+bash_cmd="bash ${code_dir}/segments/fastq-qc-fastqscreen.sh $proj_dir $sample $fastq_R1"
+($bash_cmd)
+
+# run STAR
+segment_align="fastq-bam-star"
+bam_star=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_align}.csv" | cut -d ',' -f 2)
+if [ -z "$bam_star" ] ; then
+	bash_cmd="bash ${code_dir}/segments/${segment_align}.sh $proj_dir $sample $threads $fastq_R1 $fastq_R2"
+	($bash_cmd)
+	bam_star=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_align}.csv" | cut -d ',' -f 2)
+fi
+
+# generate BigWig (deeptools)
+segment_bigwig_deeptools="bam-bigwig-deeptools"
+bash_cmd="bash ${code_dir}/segments/${segment_bigwig_deeptools}.sh $proj_dir $sample 4 $bam_star"
+qsub_cmd="qsub -N sns.${segment_bigwig_deeptools}.${sample} -M ${USER}@nyumc.org -m a -j y -cwd -pe threaded 4 -b y ${bash_cmd}"
+$qsub_cmd
+
+# generate BigWig (bedtools)
+segment_bigwig_bedtools="bam-bigwig-bedtools"
+bash_cmd="bash ${code_dir}/segments/${segment_bigwig_bedtools}.sh $proj_dir $sample $bam_star"
+qsub_cmd="qsub -N sns.${segment_bigwig_bedtools}.${sample} -M ${USER}@nyumc.org -m a -j y -cwd -b y ${bash_cmd}"
+$qsub_cmd
+
+# Picard CollectRnaSeqMetrics
+segment_qc_picard="bam-qc-picard-rnaseqmetrics"
+bash_cmd="bash ${code_dir}/segments/${segment_qc_picard}.sh $proj_dir $sample $bam_star"
+($bash_cmd)
+
+# determine run type for featurecounts
+if [ -n "$fastq_R2" ] ; then
+	run_type="PE"
+else
+	run_type="SE"
+fi
+
+# determine strand
+exp_strand=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" EXP-STRAND);
+
+# generate counts
+segment_quant="bam-quant-featurecounts"
+bash_cmd="bash ${code_dir}/segments/${segment_quant}.sh $proj_dir $sample $threads $bam_star $run_type $exp_strand"
+($bash_cmd)
+
+# generate unstranded counts just in case
+if [ "$exp_strand" != "unstr" ] ; then
+	bash_cmd="bash ${code_dir}/segments/${segment_quant}.sh $proj_dir $sample $threads $bam_star $run_type unstr"
+	($bash_cmd)
+fi
+
+
+#########################
+
+
+# combine summary from each step
+
+sleep 30
+
+summary_csv="${proj_dir}/summary-combined.rna-star.csv"
+
+bash_cmd="
+bash ${code_dir}/scripts/join-many.sh , X \
+${proj_dir}/summary.${segment_fastq_clean}.csv \
+${proj_dir}/summary.${segment_align}.csv \
+${proj_dir}/summary.${segment_quant}-unstr.csv \
+${proj_dir}/summary.${segment_quant}-${exp_strand}.csv \
+${proj_dir}/summary.${segment_qc_picard}.csv \
+> $summary_csv
+"
+(eval $bash_cmd)
+
+
+#########################
+
+
+# generate groups sample sheet template
+
+samples_groups_csv="${proj_dir}/samples.groups.csv"
+
+if [ ! -s "$samples_groups_csv" ] ; then
+	echo "#SAMPLE,group" > $samples_groups_csv
+	sed 's/\,.*/,NA/g' "${proj_dir}/samples.fastq-raw.csv" | LC_ALL=C sort -u >> $samples_groups_csv
+fi
+
+
+#########################
+
+
+# delete empty qsub .po files
+rm -f ${qsub_dir}/sns.*.po*
+
+
+#########################
+
+
+date
+
+
+
+# end
diff --git a/routes/rrbs.sh b/routes/rrbs.sh
new file mode 100755
index 0000000..2932292
--- /dev/null
+++ b/routes/rrbs.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+
+
+##
+## RRBS using Bismark
+##
+
+
+# script filename
+script_name=$(basename "${BASH_SOURCE[0]}")
+route_name=${script_name/%.sh/}
+echo -e "\n ========== ROUTE: $route_name ========== \n" >&2
+
+# check for correct number of arguments
+if [ ! $# == 2 ] ; then
+	echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2
+	echo -e "\n USAGE: $script_name project_dir sample_name \n" >&2
+	exit 1
+fi
+
+# standard route arguments
+proj_dir=$(readlink -f "$1")
+sample=$2
+
+# additional settings
+threads=$NSLOTS
+code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")")
+qsub_dir="${proj_dir}/logs-qsub"
+
+# display settings
+echo " * proj_dir: $proj_dir "
+echo " * sample: $sample "
+echo " * code_dir: $code_dir "
+echo " * qsub_dir: $qsub_dir "
+echo " * threads: $threads "
+
+
+#########################
+
+
+# delete empty qsub .po files
+rm -f ${qsub_dir}/sns.*.po*
+
+
+#########################
+
+
+# segments
+
+# rename and/or merge raw input FASTQs
+# (sample sheet names follow the samples.<segment script name>.csv convention used by the other routes)
+fastq_R1=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.fastq-fastq-clean.csv" | cut -d ',' -f 2)
+fastq_R2=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.fastq-fastq-clean.csv" | cut -d ',' -f 3)
+if [ -z "$fastq_R1" ] ; then
+	bash_cmd="bash ${code_dir}/segments/fastq-fastq-clean.sh $proj_dir $sample"
+	($bash_cmd)
+	fastq_R1=$(grep -m 1 "^${sample}," "${proj_dir}/samples.fastq-fastq-clean.csv" | cut -d ',' -f 2)
+	fastq_R2=$(grep -m 1 "^${sample}," "${proj_dir}/samples.fastq-fastq-clean.csv" | cut -d ',' -f 3)
+fi
+
+# trim FASTQs with Trim Galore
+fastq_R1_trimmed=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.fastq-fastq-trim-trimgalore.csv" | cut -d ',' -f 2)
+fastq_R2_trimmed=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.fastq-fastq-trim-trimgalore.csv" | cut -d ',' -f 3)
+if [ -z "$fastq_R1_trimmed" ] ; then
+	bash_cmd="bash ${code_dir}/segments/fastq-fastq-trim-trimgalore.sh $proj_dir $sample rrbs $fastq_R1 $fastq_R2"
+	($bash_cmd)
+	fastq_R1_trimmed=$(grep -m 1 "^${sample}," "${proj_dir}/samples.fastq-fastq-trim-trimgalore.csv" | cut -d ',' -f 2)
+	fastq_R2_trimmed=$(grep -m 1 "^${sample}," "${proj_dir}/samples.fastq-fastq-trim-trimgalore.csv" | cut -d ',' -f 3)
+fi
+
+# run Bismark alignment
+bam_bismark=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.fastq-bam-bismark.csv" | cut -d ',' -f 2)
+if [ -z "$bam_bismark" ] ; then
+	bash_cmd="bash ${code_dir}/segments/fastq-bam-bismark.sh $proj_dir $sample $threads $fastq_R1_trimmed $fastq_R2_trimmed"
+	($bash_cmd)
+	bam_bismark=$(grep -m 1 "^${sample}," "${proj_dir}/samples.fastq-bam-bismark.csv" | cut -d ',' -f 2)
+fi
+
+# run Bismark methylation extractor
+if [ -n "$fastq_R2" ] ; then
+	# PE methylation extraction is not implemented in this route yet
+	echo "pe"
+else
+	bash_cmd="bash ${code_dir}/segments/bam-meth-bismark.sh $proj_dir $sample $threads $bam_bismark se"
+	($bash_cmd)
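+	# run the extractor a second time with the "se-ignore-r1-3" analysis type,
+	# which (judging by its name) skips the first 3 bp of each read, where
+	# RRBS end-repair artifacts can distort methylation calls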
+ bash_cmd="bash ${code_dir}/segments/bam-meth-bismark.sh $proj_dir $sample $threads $bam_bismark se-ignore-r1-3" + ($bash_cmd) +fi + + +######################### + + +# delete empty qsub .po files +rm -f ${qsub_dir}/sns.*.po* + + +######################### + + +date + + + +# end diff --git a/routes/wes.sh b/routes/wes.sh new file mode 100755 index 0000000..37d8a05 --- /dev/null +++ b/routes/wes.sh @@ -0,0 +1,128 @@ +#!/bin/bash + + +## +## whole genome/exome/targeted variant discovery using BWA-MEM and GATK +## + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +route_name=${script_name/%.sh/} +echo -e "\n ========== ROUTE: $route_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 2 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name project_dir sample_name \n" >&2 + exit 1 +fi + +# standard route arguments +proj_dir=$(readlink -f "$1") +sample=$2 + +# additional settings +threads=$NSLOTS +code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +qsub_dir="${proj_dir}/logs-qsub" + +# display settings +echo " * proj_dir: $proj_dir " +echo " * sample: $sample " +echo " * code_dir: $code_dir " +echo " * qsub_dir: $qsub_dir " +echo " * threads: $threads " + + +######################### + + +# delete empty qsub .po files +rm -f ${qsub_dir}/sns.*.po* + + +######################### + + +# segments + +# rename and/or merge raw input FASTQs +segment_fastq_clean="fastq-fastq-clean" +fastq_R1=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 2) +fastq_R2=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 3) +if [ -z "$fastq_R1" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_fastq_clean}.sh $proj_dir $sample" + ($bash_cmd) + fastq_R1=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 2) + fastq_R2=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 3) +fi + +# trim FASTQs with Trimmomatic +segment_fastq_trim="fastq-fastq-trim-trimmomatic" +fastq_R1_trimmed=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_trim}.csv" | cut -d ',' -f 2) +fastq_R2_trimmed=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_trim}.csv" | cut -d ',' -f 3) +if [ -z "$fastq_R1_trimmed" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_fastq_trim}.sh $proj_dir $sample $threads $fastq_R1 $fastq_R2" + ($bash_cmd) + fastq_R1_trimmed=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_trim}.csv" | cut -d ',' -f 2) + fastq_R2_trimmed=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_trim}.csv" | cut -d ',' -f 3) +fi + +# run BWA-MEM alignment +segment_align="fastq-bam-bwa-mem" +bam=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_align}.csv" | cut -d ',' -f 2) +if [ -z "$bam_bwa" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_align}.sh $proj_dir $sample $threads $fastq_R1_trimmed $fastq_R2_trimmed" + ($bash_cmd) + bam=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_align}.csv" | cut -d ',' -f 2) +fi + +# remove duplicates +segment_dedup="bam-bam-dd-sambamba" +bam_dd=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_dedup}.csv" | cut -d ',' -f 2) +if [ -z "$bam_dd" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_dedup}.sh $proj_dir $sample $threads $bam" + ($bash_cmd) + bam_dd=$(grep -m 1 "^${sample}," 
"${proj_dir}/samples.${segment_dedup}.csv" | cut -d ',' -f 2) +fi + + + + + +######################### + + +# combine summary from each step + +sleep 30 + +summary_csv="${proj_dir}/summary-combined.wes.csv" + +bash_cmd=" +bash ${code_dir}/scripts/join-many.sh , X \ +${proj_dir}/summary.${segment_fastq_clean}.csv \ +${proj_dir}/summary.${segment_fastq_trim}.csv \ +${proj_dir}/summary.${segment_align}.csv \ +${proj_dir}/summary.${segment_dedup}.csv \ +> $summary_csv +" +(eval $bash_cmd) + + +######################### + + +# delete empty qsub .po files +rm -f ${qsub_dir}/sns.*.po* + + +######################### + + +date + + + +# end diff --git a/routes/wgbs.sh b/routes/wgbs.sh new file mode 100755 index 0000000..52d69a6 --- /dev/null +++ b/routes/wgbs.sh @@ -0,0 +1,121 @@ +#!/bin/bash + + +## +## WGBS using Bismark +## + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +route_name=${script_name/%.sh/} +echo -e "\n ========== ROUTE: $route_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 2 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name project_dir sample_name \n" >&2 + exit 1 +fi + +# standard route arguments +proj_dir=$(readlink -f "$1") +sample=$2 + +# additional settings +threads=$NSLOTS +code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +qsub_dir="${proj_dir}/logs-qsub" + +# display settings +echo " * proj_dir: $proj_dir " +echo " * sample: $sample " +echo " * code_dir: $code_dir " +echo " * qsub_dir: $qsub_dir " +echo " * threads: $threads " + + +######################### + + +# delete empty qsub .po files +rm -f ${qsub_dir}/sns.*.po* + + +######################### + + +# segments + +# rename and/or merge raw input FASTQs +segment_fastq_clean="fastq-fastq-clean" +fastq_R1=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 2) +fastq_R2=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 3) +if [ -z "$fastq_R1" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_fastq_clean}.sh $proj_dir $sample" + ($bash_cmd) + fastq_R1=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 2) + fastq_R2=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 3) +fi + +# trim FASTQs with Trimmomatic +segment_fastq_trim="fastq-fastq-trim-trimmomatic" +fastq_R1_trimmed=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_trim}.csv" | cut -d ',' -f 2) +fastq_R2_trimmed=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_trim}.csv" | cut -d ',' -f 3) +if [ -z "$fastq_R1_trimmed" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_fastq_trim}.sh $proj_dir $sample $threads $fastq_R1 $fastq_R2" + ($bash_cmd) + fastq_R1_trimmed=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_trim}.csv" | cut -d ',' -f 2) + fastq_R2_trimmed=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_trim}.csv" | cut -d ',' -f 3) +fi + +# run Bismark alignment +segment_align="fastq-bam-bismark" +bam_bismark=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_align}.csv" | cut -d ',' -f 2) +if [ -z "$bam_bismark" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_align}.sh $proj_dir $sample $threads $fastq_R1_trimmed $fastq_R2_trimmed" + ($bash_cmd) + bam_bismark=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_align}.csv" | cut -d ',' -f 2) +fi + +# run Bismark dedup 
+segment_dedup="bam-bam-dd-bismark" +bam_dd_bismark=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_dedup}.csv" | cut -d ',' -f 2) +if [ -z "$bam_dd_bismark" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_dedup}.sh $proj_dir $sample $bam_bismark pe" + ($bash_cmd) + bam_dd_bismark=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_dedup}.csv" | cut -d ',' -f 2) +fi + +bam_bismark="$bam_dd_bismark" + +# run Bismark methylation extractor +segment_meth="bam-meth-bismark" +if [ -n "$fastq_R2" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_meth}.sh $proj_dir $sample $threads $bam_bismark pe" + ($bash_cmd) + bash_cmd="bash ${code_dir}/segments/${segment_meth}.sh $proj_dir $sample $threads $bam_bismark pe-ignore-r2-3" + ($bash_cmd) +else + bash_cmd="bash ${code_dir}/segments/${segment_meth}.sh $proj_dir $sample $threads $bam_bismark se" + ($bash_cmd) + bash_cmd="bash ${code_dir}/segments/${segment_meth}.sh $proj_dir $sample $threads $bam_bismark se-ignore-r1-3" + ($bash_cmd) +fi + + +######################### + + +# delete empty qsub .po files +rm -f ${qsub_dir}/sns.*.po* + + +######################### + + +date + + + +# end diff --git a/run b/run new file mode 100755 index 0000000..80da2be --- /dev/null +++ b/run @@ -0,0 +1,162 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use File::Basename; +use File::Spec; +use Cwd; +use Cwd 'abs_path'; + +my $HELP = <rel2abs(__FILE__); + $code_dir = dirname($code_dir); + chomp($code_dir); + print "code dir: $code_dir \n"; + + # check that settings file exists + my $settings_txt = "${project_dir}/settings.txt"; + unless ( -e $settings_txt ) { + die "\n\n ERROR: $settings_txt DOES NOT EXIST \n\n"; + } + + # check that the specified route exists + my $route_script = "${code_dir}/routes/${route}.sh"; + unless ( -e $route_script ) { + die "\n\n ERROR: $route DOES NOT EXIST \n\n"; + } + + # move to qsub logs dir so the qsub output ends up there + my $qsub_logs_dir = "${project_dir}/logs-qsub"; + system("mkdir -p $qsub_logs_dir"); + print "logs dir: $qsub_logs_dir \n"; + chdir($qsub_logs_dir); + + # check if standard route (run for each sample) or comparison route (compare samples) + if ($route =~ "comp-") { + + # comparison route + + # sample sheet specifying groups + my $samples_groups_csv = abs_path("${project_dir}/samples.groups.csv"); + print "sample sheet: $samples_groups_csv \n"; + + # check that sample sheet exists + unless ( -e $samples_groups_csv ) { + die "\n\n ERROR: $samples_groups_csv DOES NOT EXIST \n\n"; + } + + system("bash ${code_dir}/scripts/fix-csv.sh $samples_groups_csv"); + + print "\n process comparison $route \n"; + + # route command + my $cmd = "bash $route_script $project_dir"; + + system($cmd); + + } + else { + + # standard route + + # remove problematic characters from any existing samples sheets + my @sample_csvs = `ls -1 ${project_dir}/samples.*.csv`; + chomp(@sample_csvs); + while (my $csv = shift(@sample_csvs)) { + print "sample sheet: $csv \n"; + system("bash ${code_dir}/scripts/fix-csv.sh $csv"); + } + + # check that raw fastq sample sheet exists + my $samples_fastq_csv = abs_path("${project_dir}/samples.fastq-raw.csv"); + unless ( -e $samples_fastq_csv ) { + die "\n\n ERROR: $samples_fastq_csv DOES NOT EXIST \n\n"; + } + + # find unique samples based on the sample sheet + my @unique_samples = `cat $samples_fastq_csv | cut -d ',' -f 1 | uniq`; + chomp(@unique_samples); + + # adjust number of requested threads per sample depending on total number of samples + my $num_samples = 
@unique_samples; + my $threads; + if ($num_samples < 15) { + $threads = "8-16"; + } + elsif ($num_samples < 50) { + $threads = "4-8"; + } + else { + $threads = "4"; + } + + # process each sample + my $current_sample = 1; + while (my $sample = shift(@unique_samples)) { + + print "\n process sample $sample \n"; + + # route command + my $cmd = "bash $route_script $project_dir $sample"; + + # check for "test" argument to test one sample without qsub + if (defined($ARGV[1]) && $ARGV[1] eq "test") { + print "\n\n test run started \n\n"; + system("export NSLOTS=4 && $cmd"); + print "\n\n test run finished \n\n"; + exit; + } + + # qsub command (run as binary bash script to keep all environment variables) + my $email = '${USER}@nyumc.org'; + $email = `echo $email`; + chomp($email); + my $qsub_name = "sns.${route}.${sample}"; + my $qsub_cmd = "qsub -N $qsub_name -M $email -m a -j y -cwd -pe threaded $threads -b y $cmd"; + print "\n CMD: $qsub_cmd \n"; + system($qsub_cmd); + + # pause between samples (sometimes job scheduler gets overwhelmed) + sleep(5); + + # extra pause for the first few samples so it's easier to fix obvious errors + if ($current_sample < 4) { + sleep(15); + } + + $current_sample++; + + } + + } + +} + + + +# end diff --git a/scripts/deseq2-pca.R b/scripts/deseq2-pca.R new file mode 100755 index 0000000..eeb4d30 --- /dev/null +++ b/scripts/deseq2-pca.R @@ -0,0 +1,67 @@ +## +## Modified DESeq2 plotPCA function with sample names and proportion of variance added. +## Sample names will be shown underneath each dot. +## The axis will display proportion of variance for each principal component. +## Tested using several DESeq2 versions from 1.2.8 to 1.12.3. +## DESeq2 plotPCA function switched from lattice to ggplot2 in version 1.5.11. +## + + +deseq2_pca = function(x, intgroup, ntop=500) +{ + library(RColorBrewer) + library(genefilter) + library(lattice) + + # pca + rv = rowVars(assay(x)) + select = order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))] + pca = prcomp(t(assay(x)[select,])) + + # proportion of variance + variance = pca$sdev^2 / sum(pca$sdev^2) + variance = round(variance, 3) * 100 + + # sample names + names = colnames(x) + + # factor of groups + fac = factor(apply(as.data.frame(colData(x)[, intgroup, drop=FALSE]), 1, paste, collapse=" : ")) + + # colors + if( nlevels(fac) > 24 ) { + colors = rainbow(nlevels(fac)) + } + else if( nlevels(fac) > 2 ) { + colors = c(brewer.pal(9, "Set1"), brewer.pal(8, "Accent"), brewer.pal(8, "Dark2")) + colors = unique(colors) + colors = head(colors, nlevels(fac)) + } + else { + colors = c("dodgerblue3", "firebrick3") + } + + # plot + xyplot( + PC2 ~ PC1, groups=fac, data=as.data.frame(pca$x), pch=16, cex=1.5, + aspect = "fill", + col = colors, + xlab = list(paste("PC1 (", variance[1], "%)", sep=""), cex=0.8), + ylab = list(paste("PC2 (", variance[2], "%)", sep=""), cex=0.8), + panel = function(x, y, ...) { + panel.xyplot(x, y, ...); + ltext(x=x, y=y, labels=names, pos=1, offset=0.8, cex=0.7) + }, + main = draw.key( + key = list( + rect = list(col = colors), + text = list(levels(fac)), + rep = FALSE + ) + ) + ) +} + + + +# end diff --git a/scripts/deseq2-results.R b/scripts/deseq2-results.R new file mode 100755 index 0000000..fad5b30 --- /dev/null +++ b/scripts/deseq2-results.R @@ -0,0 +1,67 @@ +## +## Calculate results and export them in various formats with proper names. 
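+## (a minimal usage sketch, assuming a fitted DESeqDataSet "dds" whose design
+## uses a "group" factor with hypothetical levels "treated" and "control":
+##   deseq2_results(DESeqDataSet = dds, contrast = c("group", "treated", "control"))
+## this writes res.*.RData, dge.*.csv, dge.*.xlsx, and dge.*.padj005.xlsx
+## into the working directory)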
+## + + +deseq2_results = function(DESeqDataSet, contrast=NULL, name=NULL) +{ + library(xlsx) + + # calculate results (using contrast or name, depending on what is given) + if(!is.null(contrast)) { + res = results(DESeqDataSet, contrast=contrast, cooksCutoff=FALSE, addMLE=TRUE) + # extract results name + pattern = paste(".*", contrast[1], " ", sep="") + res.name = gsub(pattern=pattern, replacement="", x=mcols(res)[2,2]) + } + else { + res = results(DESeqDataSet, name=name, cooksCutoff=FALSE, addMLE=TRUE) + res.name = name + } + + # sort results + res = res[order(res$padj, res$pvalue, -res$baseMean),] + + # save results object + res.rdata = paste("res.", gsub(pattern=" ", replacement="-", x=res.name), ".RData", sep="") + save(res, file=res.rdata) + message("save results object: ", res.rdata) + + # save results as csv + res.csv = paste0("dge.", gsub(pattern=" ", replacement="-", x=res.name), ".csv") + write.csv(as.data.frame(res), file=res.csv) + message("save results csv: ", res.csv) + + # format results for excel export + res.df = as.data.frame(res) + res.df$gene = rownames(res.df) + res.df$baseMean = round(res.df$baseMean, 1) + res.df$log2FoldChange = round(res.df$log2FoldChange, 5) + res.df$lfcMLE = round(res.df$lfcMLE, 5) + res.df$pvalue = round(res.df$pvalue, 15) + res.df$padj = round(res.df$padj, 15) + res.df = subset(res.df, select = c("gene", "baseMean", "log2FoldChange", "lfcMLE", "pvalue", "padj")) + + # rename columns + names(res.df)[names(res.df)=="log2FoldChange"] = "log2FC" + names(res.df)[names(res.df)=="lfcMLE"] = "log2FCunshrunk" + + # save results as xlsx + res.xlsx = paste0("dge.", gsub(pattern=" ", replacement="-", x=res.name), ".xlsx") + write.xlsx2(x=res.df, file=res.xlsx, sheetName=res.name, col.names=TRUE, row.names=FALSE) + message("results genes: ", nrow(res.df)) + message("save results xlsx: ", res.xlsx) + + # save results as xlsx (padj<0.05) + res.padj005.xlsx = gsub(pattern=".xlsx", replacement=".padj005.xlsx", x=res.xlsx) + write.xlsx2(x=subset(res.df, padj<0.05), file=res.padj005.xlsx, sheetName=res.name, col.names=TRUE, row.names=FALSE) + message("num genes padj<0.9: ", nrow(subset(res.df, padj<0.9))) + message("num genes padj<0.2: ", nrow(subset(res.df, padj<0.2))) + message("num genes padj<0.05: ", nrow(subset(res.df, padj<0.05))) + message("num genes padj<0.01: ", nrow(subset(res.df, padj<0.01))) + message("save filtered results xlsx: ", res.padj005.xlsx) +} + + + +# end diff --git a/scripts/dge-deseq2.R b/scripts/dge-deseq2.R new file mode 100755 index 0000000..44277d1 --- /dev/null +++ b/scripts/dge-deseq2.R @@ -0,0 +1,106 @@ +#!/usr/bin/env Rscript + + +## +## Differential gene expression with DESeq2. 
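+## (inputs, as prepared by routes/comp-rna-star-dge.sh: the counts table is the
+## tab-delimited featureCounts gene x sample matrix, and the groups table is the
+## samples.groups.csv sheet with a "#SAMPLE,group" header line)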
+##
+## usage: Rscript --vanilla dge-deseq2.R counts_table.txt groups_table.csv
+##
+
+
+# increase output width
+options(width = 150)
+
+# java heap size
+options(java.parameters="-Xmx8G")
+
+# get scripts directory (directory of this file) and load relevant functions
+args_all = commandArgs(trailingOnly = FALSE)
+scripts_dir = normalizePath(dirname(sub("^--file=", "", args_all[grep("^--file=", args_all)])))
+source(paste0(scripts_dir, "/load-install-packages.R"))
+source(paste0(scripts_dir, "/deseq2-pca.R"))
+source(paste0(scripts_dir, "/deseq2-results.R"))
+
+# relevant arguments
+args = commandArgs(trailingOnly = TRUE)
+counts_table_file = args[1]
+groups_table_file = args[2]
+
+# check that input files exist
+if (!file.exists(counts_table_file)) stop("file does not exist: ", counts_table_file)
+if (!file.exists(groups_table_file)) stop("file does not exist: ", groups_table_file)
+
+# load relevant packages
+load_install_packages("DESeq2")
+load_install_packages("RColorBrewer")
+load_install_packages("gplots")
+load_install_packages("genefilter")
+load_install_packages("lattice")
+load_install_packages("xlsx")
+
+message(" ========== prepare inputs ========== ")
+
+# import groups table
+groups_table = read.csv(file = groups_table_file, header = TRUE, row.names = 1, colClasses = "factor")
+message("groups table sample num: ", nrow(groups_table))
+message("groups table groups: ", colnames(groups_table))
+
+# import counts table
+counts_table = read.delim(file = counts_table_file, header = TRUE, row.names = 1, check.names = FALSE, stringsAsFactors = FALSE)
+message("full counts table gene num: ", nrow(counts_table))
+message("full counts table sample num: ", ncol(counts_table))
+
+# subset to samples in groups table (also sets samples to be in the same order)
+counts_table = counts_table[,rownames(groups_table)]
+message("group-subset counts table gene num: ", nrow(counts_table))
+message("group-subset counts table sample num: ", ncol(counts_table))
+
+# group info
+group_name = colnames(groups_table)[1]
+message("group name: ", group_name)
+group_levels = levels(groups_table[,group_name])
+message("group levels: ", toString(group_levels))
+
+# design formula
+design_formula = formula(paste("~", group_name))
+message("design formula: ", deparse(design_formula))
+
+message(" ========== normalization ========== ")
+
+# import and normalize
+dds = DESeqDataSetFromMatrix(countData = counts_table, colData = groups_table, design = design_formula)
+dds = DESeq(dds, parallel = TRUE, BPPARAM = BiocParallel::MulticoreParam(workers = 4))
+save(dds, file = "deseq2.dds.RData")
+
+vsd = varianceStabilizingTransformation(dds)
+save(vsd, file = "deseq2.vsd.RData")
+
+# export counts
+write.csv(counts(dds, normalized = FALSE), file = "counts.raw.csv")
+write.csv(round(counts(dds, normalized = TRUE), digits = 3), file = "counts.norm.csv")
+write.xlsx2(x=round(counts(dds, normalized = TRUE), digits = 3), file = "counts.norm.xlsx", sheetName = "normalized counts")
+
+message(" ========== QC ========== ")
+
+pdf("plot.sparsity.pdf", width=7, height=7, family="Palatino", pointsize=10)
+print(plotSparsity(dds, normalized = TRUE))
+dev.off()
+
+pdf("plot.pca.pdf", width=7, height=7, family="Palatino", pointsize=10)
+deseq2_pca(vsd, intgroup = group_name, ntop = 1000)
+dev.off()
+
+message(" ========== differential expression ========== ")
+
+# perform comparisons for all combinations of group levels
+group_levels_combinations = combn(group_levels, m = 2, simplify = TRUE)
+for (combination_num in
1:ncol(group_levels_combinations)) { + level_numerator = group_levels_combinations[1, combination_num] + level_denominator = group_levels_combinations[2, combination_num] + message("comparison: ", paste(group_name, ":", level_numerator, "vs", level_denominator)) + deseq2_results(DESeqDataSet = dds, contrast = c(group_name, level_numerator, level_denominator)) +} + + + +# end diff --git a/scripts/fix-csv.sh b/scripts/fix-csv.sh new file mode 100755 index 0000000..0d930e2 --- /dev/null +++ b/scripts/fix-csv.sh @@ -0,0 +1,43 @@ +#!/bin/bash + + +## +## make csv file more compatible with various tools by removing problematic characters +## + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") + +# check for correct number of arguments +if [ ! $# == 1 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name file.csv \n" >&2 + exit 1 +fi + +# arguments +csv=$1 + +# check that input exists +if [ ! -s "$csv" ] ; then + echo -e "\n $script_name ERROR: file $csv does not exist \n" >&2 + exit 1 +fi + +# fix newlines +dos2unix --quiet $csv +mac2unix --quiet $csv + +# replace commas inside quoted fields with dashes +awk -F '"' -v OFS='' '{ for (i=2; i<=NF; i+=2) gsub(",", "-", $i) } 1' $csv > ${csv}.tmp && mv ${csv}.tmp $csv +# replace quotes +sed -i 's/\"//g' $csv +# remove lines missing any values (only commas present) +sed -i '/^,,*$/d' $csv +# add newline to end of file if one does not exist (some scripts may complain) +sed -i -e '$a\' $csv + + + +# end diff --git a/scripts/get-ref.sh b/scripts/get-ref.sh new file mode 100755 index 0000000..1293306 --- /dev/null +++ b/scripts/get-ref.sh @@ -0,0 +1,159 @@ +#!/bin/bash + + +## +## get reference of specified type for specified genome +## + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") + +# check for correct number of arguments +if [ ! $# == 2 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name genome_dir ref_type \n" >&2 + exit 1 +fi + +# arguments +genome_root=$1 +ref_type=$2 + + +######################### + + +function find_file { + + local file_name=$1 + + # find the shortest result + local result=$(find -L "$genome_root" -maxdepth 2 -type f -name "$file_name" | awk '{ print length, $0 }' | sort -n | cut -d " " -f 2 | head -1) + + if [ -s "$result" ] && [ "$result" ] ; then + echo $(readlink -f "$result") + else + echo -e "\n $script_name ERROR: RESULT $result DOES NOT EXIST \n" >&2 + exit 1 + fi + +} + +function find_dir { + + local dir_name=$1 + + local result=$(find -L "$genome_root" -maxdepth 1 -type d -iname "$dir_name") + + if [ -s "$result" ] && [ "$result" ] ; then + echo $(readlink -f "$result") + else + echo -e "\n $script_name ERROR: RESULT $result DOES NOT EXIST \n" >&2 + exit 1 + fi + +} + +function find_basename { + + local suffix=$1 + + # find the shortest result + local result=$(find -L "$genome_root" -maxdepth 2 -type f -name "genome${suffix}" | awk '{ print length, $0 }' | sort -n | cut -d " " -f 2 | head -1) + + if [ -s "$result" ] && [ "$result" ] ; then + result=$(readlink -f "$result") + echo ${result/${suffix}/} + else + echo -e "\n $script_name ERROR: RESULT $result DOES NOT EXIST \n" >&2 + exit 1 + fi + +} + + +######################### + + +# genome root + +if [ ! 
-d "$genome_root" ] ; then + echo -e "\n $script_name ERROR: DIR $genome_root DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# file references + +if [ "$ref_type" == "FASTA" ] ; then + find_file genome.fa +fi + +if [ "$ref_type" == "DICT" ] ; then + find_file genome.dict +fi + +if [ "$ref_type" == "REFFLAT" ] ; then + find_file refFlat.txt.gz +fi + +if [ "$ref_type" == "GTF" ] ; then + find_file genes.gtf +fi + +if [ "$ref_type" == "CHROMSIZES" ] ; then + find_file chrom.sizes +fi + +if [ "$ref_type" == "2BIT" ] ; then + find_file genome.2bit +fi + +if [ "$ref_type" == "FASTQSCREEN" ] ; then + find_file fastq_screen.conf +fi + +if [ "$ref_type" == "RRNAINTERVALLIST" ] ; then + find_file rRNA.interval_list +fi + + +# directory references + +if [ "$ref_type" == "STAR" ] ; then + find_dir star +fi + +if [ "$ref_type" == "BISMARK" ] ; then + find_dir bismark +fi + +if [ "$ref_type" == "RSEM" ] ; then + echo "$(find_dir rsem)/ref" +fi + + +# basename (file without suffix) references + +if [ "$ref_type" == "BOWTIE1" ] ; then + find_basename .1.ebwt +fi + +if [ "$ref_type" == "BOWTIE2" ] ; then + find_basename .1.bt2* +fi + +if [ "$ref_type" == "BWA" ] ; then + echo "$(find_basename .fa.bwt).fa" +fi + + +######################### + + + +# end diff --git a/scripts/get-set-setting.sh b/scripts/get-set-setting.sh new file mode 100755 index 0000000..db20d8d --- /dev/null +++ b/scripts/get-set-setting.sh @@ -0,0 +1,93 @@ +#!/bin/bash + + +## +## get or set (if not set) specified setting for a specified setting +## + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") + +# check for correct number of arguments +if [ $# -lt 2 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name settings_file setting_name [setting_value] \n" >&2 + exit 1 +fi + +# arguments +settings_txt=$(readlink -f "$1") +setting_name=$2 +setting_value_arg=$3 + + +######################### + + +# check that input exists + +if [ ! 
-s "$settings_txt" ] ; then + echo -e "\n $script_name ERROR: file $settings_txt does not exist \n" >&2 + exit 1 +fi + + +######################### + + +# check if setting is already set and output that + +setting_value=$(cat "$settings_txt" | grep "^${setting_name}|" | head -1 | cut -d "|" -f 2) +if [ -n "$setting_value" ] ; then + echo "$setting_value" + exit +fi + + +######################### + + +# determine proper setting value + +genome_root=$(cat "$settings_txt" | grep "^GENOME-DIR|" | head -1 | cut -d "|" -f 2) + +if [[ "$setting_name" == REF-* ]] ; then + # reference file settings + ref_type=${setting_name/REF-/} + setting_value=$(bash ${BASH_SOURCE%/*}/get-ref.sh "$genome_root" "$ref_type") +elif [ -n "$setting_value_arg" ] ; then + setting_value=$setting_value_arg +else + echo -e "\n $script_name ERROR: the value for setting $setting_name must be specified \n" >&2 + exit 1 +fi + + +######################### + + +# save setting value + +if [ -n "$setting_value" ] ; then + echo "${setting_name}|${setting_value}" >> $settings_txt +else + echo -e "\n $script_name ERROR: value for $setting_name could not be determined \n" >&2 + exit 1 +fi + + +######################### + + +# try to get the setting again from the updated settings file + +setting_value=$(cat "$settings_txt" | grep "^${setting_name}|" | head -1 | cut -d "|" -f 2) +echo "$setting_value" + + +######################### + + + +# end diff --git a/scripts/join-many.sh b/scripts/join-many.sh new file mode 100755 index 0000000..66b1afd --- /dev/null +++ b/scripts/join-many.sh @@ -0,0 +1,47 @@ +#!/bin/bash + + +## +## merge any number of tab or comma-separated files ("join" can only do 2 at a time) +## use $'\t' as tab field separator +## + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") + +# check for correct number of arguments +if [ $# -lt 3 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name field_separator missing_field_char in1.txt in2.txt in3.txt ... > merged.txt \n" >&2 + exit 1 +fi + +# arguments +separator="$1" +shift +empty_char="$1" +shift + +# check if at least the first file exists +if [ ! -s "$1" ] ; then + echo -e "\n $script_name ERROR: file $1 does not exist \n" >&2 + exit 1 +fi + +# load recent coreutils ("-o auto" support added in release 8.12) +module load coreutils/8.24 + +function rjoin { + if [[ $# -gt 1 ]]; then + LC_ALL=C join -t "$separator" -a1 -a2 -o auto -e "$empty_char" - <(LC_ALL=C sort "$1") | rjoin "${@:2}" + else + LC_ALL=C join -t "$separator" -a1 -a2 -o auto -e "$empty_char" - <(LC_ALL=C sort "$1") + fi +} + +rjoin "${@:2}" < "$1" + + + +# end diff --git a/scripts/load-install-packages.R b/scripts/load-install-packages.R new file mode 100755 index 0000000..8fa60e2 --- /dev/null +++ b/scripts/load-install-packages.R @@ -0,0 +1,34 @@ +## +## Load packages. If missing, try installing from CRAN and Bioconductor. 
+## + + +load_install_packages = function(package_list) { + + for (p in package_list) { + + message("test loading package: ", p) + + # try to install from CRAN if package is not already installed + if (!require(p, character.only = TRUE, quietly = TRUE)) { + message("installing package from CRAN: ", p) + install.packages(p, quiet = TRUE, repos = "http://cran.rstudio.com") + } + + # try to install from Bioconductor if package is not already installed + if (!require(p, character.only = TRUE, quietly = TRUE)) { + message("installing package from Bioconductor: ", p) + source("https://bioconductor.org/biocLite.R") + biocLite(p, suppressUpdates = TRUE) + } + + # force load package + library(p, character.only = TRUE, quietly = TRUE) + + } + +} + + + +# end diff --git a/scripts/test-packages.R b/scripts/test-packages.R new file mode 100755 index 0000000..cfaf969 --- /dev/null +++ b/scripts/test-packages.R @@ -0,0 +1,21 @@ +#!/usr/bin/env Rscript + + +## +## Try loading a few packages to test R settings. +## +## usage: Rscript --vanilla test-packages.R +## + +# get scripts directory (directory of this file) and load relevant functions +args_all = commandArgs(trailingOnly = FALSE) +scripts_dir = normalizePath(dirname(sub("^--file=", "", args_all[grep("^--file=", args_all)]))) +source(paste0(scripts_dir, "/load-install-packages.R")) + +# load some packages +test_packages = c("gplots", "grid", "ggplot2", "plyr", "dplyr", "gsalib", "limma") +load_install_packages(test_packages) + + + +# end diff --git a/segments/bam-bam-dd-bismark.sh b/segments/bam-bam-dd-bismark.sh new file mode 100755 index 0000000..2fafa53 --- /dev/null +++ b/segments/bam-bam-dd-bismark.sh @@ -0,0 +1,203 @@ +#!/bin/bash + + +# Bismark deduplicate_bismark + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 4 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name [project dir] [sample] [Bismark BAM] [analysis type] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +bismark_bam=$3 +analysis_type=$4 + + +######################### + + +# check that inputs exist + +if [ ! -d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! 
-s "$bismark_bam" ] ; then + echo -e "\n $script_name ERROR: BAM $bismark_bam DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +samples_csv="${proj_dir}/samples.${segment_name}.csv" + +bismark_bam_dd_dir="${proj_dir}/BAM-DD-Bismark" +mkdir -p "$bismark_bam_dd_dir" +bismark_bam_dd_final="${bismark_bam_dd_dir}/${sample}.bam" + +bismark_dd_logs_dir="${proj_dir}/logs-${segment_name}" +mkdir -p "$bismark_dd_logs_dir" +bismark_dd_report_final="${bismark_dd_logs_dir}/${sample}.report.txt" + +bismark_report_dir="${proj_dir}/Bismark-report" +mkdir -p "$bismark_report_dir" +bismark_report_deduplication="${bismark_report_dir}/${sample}_bismark_bt2_pe.deduplication_report.txt" + +bismark_bam_dd_original="${bismark_bam/%.bam/.deduplicated.bam}" +bismark_dd_report_original="${bismark_bam/%.bam/.deduplication_report.txt}" + + +######################### + + +# exit if output exits already + +if [ -s "$bismark_bam_dd_final" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# run deduplicate_bismark + +module load bismark/0.16.3 + +echo " * BISMARK: $(readlink -f $(which deduplicate_bismark)) " +echo " * BAM IN: $bismark_bam " +echo " * BAM OUT ORIGINAL : $bismark_bam_dd_original " +echo " * REPORT ORIGINAL : $bismark_dd_report_original " +echo " * BAM OUT FINAL : $bismark_bam_dd_final " +echo " * REPORT FINAL : $bismark_dd_report_final " + +if [ "$analysis_type" == "se" ] ; then + bismark_flags="--single" +fi + +if [ "$analysis_type" == "pe" ] ; then + bismark_flags="--paired" +fi + +bash_cmd=" +deduplicate_bismark \ +$bismark_flags \ +--bam $bismark_bam +" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +sleep 30 + + +######################### + + +# check that output generated + +if [ ! -s "$bismark_bam_dd_original" ] ; then + echo -e "\n $script_name ERROR: REPORT $bismark_dd_report_original NOT GENERATED \n" >&2 + exit 1 +fi + +if [ ! 
-s "$bismark_dd_report_original" ] ; then + echo -e "\n $script_name ERROR: REPORT $bismark_dd_report_original NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# put reports in report directory for bismark2report + +bash_cmd="cp -v $bismark_dd_report_original $bismark_report_deduplication" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + + +######################### + + +# clean up output name + +bash_cmd="mv -v $bismark_bam_dd_original $bismark_bam_dd_final" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +bash_cmd="mv -v $bismark_dd_report_original $bismark_dd_report_final" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + + +######################### + + +# generate summary + +reads_total=$(cat "$bismark_dd_report_final" | grep -m 1 "Total number of alignments analysed" | cut -f 2) +reads_duplicated=$(cat "$bismark_dd_report_final" | grep -m 1 "Total number duplicated alignments" | cut -f 2 | cut -d ' ' -f 1) +reads_deduplicated=$(cat "$bismark_dd_report_final" | grep -m 1 "deduplicated leftover sequences" | cut -d ' ' -f 7) + +reads_duplicated_pct=$(echo "(${reads_duplicated}/${reads_total})*100" | bc -l | cut -c 1-4) +reads_duplicated_pct="${reads_duplicated_pct}%" + +# header +echo "#SAMPLE,ALIGNED PAIRS,DEDUPLICATED PAIRS,DUPLICATION RATE" > $summary_csv + +# print the relevant numbers +echo "${sample},${reads_total},${reads_deduplicated},${reads_duplicated_pct}" >> "$summary_csv" + +# paste -d ',' \ +# <(echo "$sample") \ +# <(cat "$bismark_dd_report_final" | grep -m 1 "Total number of alignments analysed" | cut -f 2) \ +# <(cat "$bismark_dd_report_final" | grep -m 1 "deduplicated leftover sequences" | cut -d ' ' -f 7) \ +# <(cat "$bismark_dd_report_final" | grep -m 1 "Total number duplicated alignments" | cut -f 2 | cut -d ' ' -f 1) \ +# >> $summary_csv + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq \ +> "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# add sample and BAM to sample sheet +echo "${sample},${bismark_bam_dd_final}" >> "$samples_csv" + +sleep 30 + +# sort and remove duplicates in place in sample sheet +LC_ALL=C sort -t ',' -k1,1 -u -o "$samples_csv" "$samples_csv" + +sleep 30 + + +######################### + + + +# end diff --git a/segments/bam-bam-dd-sambamba.sh b/segments/bam-bam-dd-sambamba.sh new file mode 100755 index 0000000..d3181c3 --- /dev/null +++ b/segments/bam-bam-dd-sambamba.sh @@ -0,0 +1,164 @@ +#!/bin/bash + + +# remove duplicates using sambamba + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 4 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name project_dir sample_name num_threads BAM \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +threads=$3 +bam=$4 + + +######################### + + +# check that inputs exist + +if [ ! -d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! 
-s "$bam" ] ; then + echo -e "\n $script_name ERROR: BAM $bam DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +samples_csv="${proj_dir}/samples.${segment_name}.csv" + +bam_dd_dir="${proj_dir}/BAM-DD" +mkdir -p "$bam_dd_dir" +bam_dd="${bam_dd_dir}/${sample}.dd.bam" + +dedup_logs_dir="${proj_dir}/logs-${segment_name}" +mkdir -p "$dedup_logs_dir" +bam_dd_log="${dedup_logs_dir}/${sample}.log.txt" +bam_dd_flagstat="${dedup_logs_dir}/${sample}.flagstat.txt" + + +######################### + + +# exit if output exits already + +if [ -s "$bam_dd" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# sambamba markdup + +sambamba_bin="/ifs/home/id460/bin/sambamba" + +echo " * sambamba: $(readlink -f $(which $sambamba_bin)) " +echo " * sambamba version: $($sambamba_bin 2>&1 | head -1) " +echo " * BAM IN: $bam " +echo " * BAM OUT: $bam_dd " + +bash_cmd=" +$sambamba_bin markdup \ +--remove-duplicates \ +--nthreads=${threads} \ +--overflow-list-size=500000 \ +$bam \ +$bam_dd \ +2> $bam_dd_log +" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +sleep 30 + + +######################### + + +# check that output generated + +if [ ! -s "$bam_dd" ] ; then + echo -e "\n $script_name ERROR: BAM $bam_dd NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# run flagstat + +bash_cmd="$sambamba_bin flagstat $bam_dd > $bam_dd_flagstat" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + + +######################### + + +# generate alignment summary + +reads_duplicates=$(cat "$bam_dd_log" | grep -m 1 "found.*duplicates" | tr -d -c 0-9) +reads_dedup=$(cat "$bam_dd_flagstat" | grep -m 1 "mapped (" | cut -d ' ' -f 1) +reads_mapped=$(echo "${reads_dedup}+${reads_duplicates}" | bc) + +reads_duplicates_pct=$(echo "(${reads_duplicates}/${reads_mapped})*100" | bc -l | cut -c 1-4) +reads_duplicates_pct="${reads_duplicates_pct}%" + +# header for summary file +echo "#SAMPLE,MAPPED READS (MQ10),DEDUPLICATED READS,DUPLICATES %" > "$summary_csv" + +# summarize log file +echo "${sample},${reads_mapped},${reads_dedup},${reads_duplicates_pct}" >> "$summary_csv" + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# add sample and BAM to sample sheet +echo "${sample},${bam_dd}" >> "$samples_csv" + +sleep 30 + +# sort and remove duplicates in place in sample sheet +LC_ALL=C sort -t ',' -k1,1 -u -o "$samples_csv" "$samples_csv" + +sleep 30 + + +######################### + + + +# end diff --git a/segments/bam-bigwig-bedtools.sh b/segments/bam-bigwig-bedtools.sh new file mode 100755 index 0000000..40b6b52 --- /dev/null +++ b/segments/bam-bigwig-bedtools.sh @@ -0,0 +1,140 @@ +#!/bin/bash + + +# bedtools generate BigWig from BAM + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 3 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name [project dir] [sample] [BAM] \n" >&2 + exit 1 +fi + +# arguments +PROJ_DIR=$1 +SAMPLE=$2 +BAM=$3 + + +######################### + + +# check that inputs exist + +if [ ! -d "$PROJ_DIR" ] || [ ! 
"$PROJ_DIR" ] ; then + echo -e "\n $script_name ERROR: DIR $PROJ_DIR DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! -s "$BAM" ] || [ ! "$BAM" ] ; then + echo -e "\n $script_name ERROR: BAM $BAM DOES NOT EXIST \n" >&2 + exit 1 +fi + +CODE_DIR=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +REF_CHROMSIZES=$(bash ${CODE_DIR}/scripts/get-set-setting.sh "${PROJ_DIR}/settings.txt" REF-CHROMSIZES); + +if [ ! -s "$REF_CHROMSIZES" ] ; then + echo -e "\n $script_name ERROR: CHROMSIZES $REF_CHROMSIZES DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# output + +BIGWIG_DIR="${PROJ_DIR}/BIGWIG" +mkdir -p "$BIGWIG_DIR" +BEDGRAPH="${BIGWIG_DIR}/${SAMPLE}.norm.bedgraph" +BIGWIG="${BIGWIG_DIR}/${SAMPLE}.norm.bw" + + +######################### + + +# exit if output exits already + +if [ -s "$BIGWIG" ] ; then + echo -e "\n $script_name SKIP SAMPLE $SAMPLE \n" >&2 + exit 1 +fi + + +######################### + + +# BAM to BedGraph with bedtools + +module unload gcc +module load samtools/1.3 +module load bedtools/2.22.0 +module load kentutils/329 + +# determine normalization scaling factor +MAPPED_READS=$(samtools view -c $BAM) +SCALE=`echo "10000000/${MAPPED_READS}" | bc -l` +SCALE=${SCALE:0:8} + +echo " * samtools: $(readlink -f $(which samtools)) " +echo " * bedtools: $(readlink -f $(which bedtools)) " +echo " * bedGraphToBigWig: $(readlink -f $(which bedGraphToBigWig)) " +echo " * CHROM SIZES: $REF_CHROMSIZES " +echo " * BAM: $BAM " +echo " * MAPPED READS: $MAPPED_READS " +echo " * SCALING FACTOR: $SCALE " +echo " * BEDGRAPH: $BEDGRAPH " +echo " * BIGWIG: $BIGWIG " + +CMD="bedtools genomecov -split -bg -g $REF_CHROMSIZES -ibam $BAM -scale $SCALE > $BEDGRAPH" +echo "CMD: $CMD" +eval $CMD + + +######################### + + +# check that output generated + +if [ ! -s "$BEDGRAPH" ] ; then + echo -e "\n $script_name ERROR: BEDGRAPH $BEDGRAPH NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# BadGraph to BigWig with UCSC tools + +CMD="bedGraphToBigWig $BEDGRAPH $REF_CHROMSIZES $BIGWIG" +echo "CMD: $CMD" +eval $CMD + +# delete bedgraph +rm -f $BEDGRAPH + + +######################### + + +# check that output generated + +if [ ! -s "$BIGWIG" ] ; then + echo -e "\n $script_name ERROR: BIGWIG $BIGWIG NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + + +# end diff --git a/segments/bam-bigwig-deeptools.sh b/segments/bam-bigwig-deeptools.sh new file mode 100755 index 0000000..947dea9 --- /dev/null +++ b/segments/bam-bigwig-deeptools.sh @@ -0,0 +1,126 @@ +#!/bin/bash + + +# deepTools generate BigWig from BAM + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 4 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name [project dir] [sample] [threads] [BAM] \n" >&2 + exit 1 +fi + +# arguments +PROJ_DIR=$1 +SAMPLE=$2 +THREADS=$3 +BAM=$4 + + +######################### + + +# check that inputs exist + +if [ ! -d "$PROJ_DIR" ] || [ ! "$PROJ_DIR" ] ; then + echo -e "\n $script_name ERROR: DIR $PROJ_DIR DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! -s "$BAM" ] || [ ! 
"$BAM" ] ; then + echo -e "\n $script_name ERROR: BAM $BAM DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# output + +BIGWIG_DIR="${PROJ_DIR}/BIGWIG" +mkdir -p "$BIGWIG_DIR" +BIGWIG="${BIGWIG_DIR}/${SAMPLE}.bin1.rpkm.bw" + +# BIGWIG_RPKM_BIN1=${BIGWIG_RPKM_DIR}/${ID}.bin1.rpkm.bw +# BIGWIG_RPKM_BIN10=${BIGWIG_RPKM_DIR}/${ID}.bin10.smooth50.rpkm.bw + + +######################### + + +# exit if output exits already + +if [ -s "$BIGWIG" ] ; then + echo -e "\n $script_name SKIP SAMPLE $SAMPLE \n" >&2 + exit 1 +fi + + +######################### + + +# generate bigwig using deeptools + +module unload python +module load python/2.7.3 +module load deeptools/2.2.4 +module load kentutils/329 + +echo " * bamCoverage: $(readlink -f $(which bamCoverage)) " +echo " * BAM: $BAM " +echo " * BIGWIG: $BIGWIG " + +CMD=" +bamCoverage \ +--verbose \ +--numberOfProcessors $THREADS \ +--binSize 1 \ +--normalizeUsingRPKM \ +--outFileFormat bigwig \ +--bam $BAM \ +--outFileName $BIGWIG +" +echo "CMD: $CMD" +eval "$CMD" + +# echo " * BIGWIG: $BIGWIG_RPKM_BIN10 " + +# CMD=" +# bamCoverage \ +# --verbose \ +# --numberOfProcessors $THREADS \ +# --binSize 10 \ +# --fragmentLength 1 \ +# --smoothLength 50 \ +# --normalizeUsingRPKM \ +# --outFileFormat bigwig \ +# --bam $BAM \ +# --outFileName $BIGWIG_RPKM_BIN10 +# " +# echo "CMD: $CMD" +# eval "$CMD" + + +######################### + + +# check that output generated + +if [ ! -s "$BIGWIG" ] ; then + echo -e "\n $script_name ERROR: BIGWIG $BIGWIG NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + + +# end diff --git a/segments/bam-meth-bismark.sh b/segments/bam-meth-bismark.sh new file mode 100755 index 0000000..9b246b1 --- /dev/null +++ b/segments/bam-meth-bismark.sh @@ -0,0 +1,330 @@ +#!/bin/bash + + +# Bismark methylation extractor + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== $script_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 5 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name [project dir] [sample] [threads] [Bismark BAM] [analysis type] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +threads=$3 +bismark_bam=$4 +analysis_type=$5 + + +######################### + + +# settings and files + +segment_name="${segment_name}-${analysis_type}" + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +bismark_meth_dir="${proj_dir}/meth-Bismark-${analysis_type}" +bismark_report_dir="${proj_dir}/Bismark-report" + +bismark_report="${bismark_meth_dir}/${sample}_splitting_report.txt" +bismark_report_short="${bismark_meth_dir}/${sample}.report.txt" + +bismark_mbias_report="${bismark_meth_dir}/${sample}.M-bias.txt" + +bismark_bedgraph="${bismark_meth_dir}/${sample}.bedGraph" +bismark_bedgraph_gz="${bismark_bedgraph}.gz" + +bismark_cpg_report_gz="${bismark_meth_dir}/${sample}.CpG_report.txt.gz" + +# BISMARK_METHYLKIT_REPORT="${bismark_meth_dir}/${sample}.methylkit.txt" + + +######################### + + +# exit if output exits already + +if [ -s "$bismark_report_short" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# check inputs and references + +if [ ! -d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! 
-s "$bismark_bam" ] ; then + echo -e "\n $script_name ERROR: bismark_bam $bismark_bam DOES NOT EXIST \n" >&2 + exit 1 +fi + +code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +ref_bismark=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-BISMARK); + +if [ ! -d "$ref_bismark" ] || [ ! "$ref_bismark" ] ; then + echo -e "\n $script_name ERROR: REF $ref_bismark DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# adjust paramaters based on run type + +# --ignore_r2 +# The first couple of bases in Read 2 of BS-Seq experiments show a severe bias towards +# non-methylation as a result of end-repairing sonicated fragments with unmethylated cytosines, +# it is recommended that the first couple of bp of Read 2 are removed before starting downstream analysis. + + +if [ "$analysis_type" == "pe" ] ; then + bismark_flags="--paired-end" +fi + +# the first bases in R2 of BS-Seq experiments show a bias towards non-methylation as a result of end-repairing sonicated fragments with unmethylated cytosines +if [ "$analysis_type" == "pe-ignore-r2-3" ] ; then + bismark_flags="--paired-end --ignore_r2 3" +fi + +if [ "$analysis_type" == "se" ] ; then + bismark_flags="--single-end" +fi + +if [ "$analysis_type" == "se-ignore-r1-3" ] ; then + bismark_flags="--single-end --ignore 3" +fi + + +######################### + + +# bismark_methylation_extractor + +# load module (loads bowtie2/2.2.6 and samtools/1.3) +module load bismark/0.16.3 + +# navigate to output dir +mkdir -p $bismark_meth_dir +cd $bismark_meth_dir + +# number of parallel instances of Bismark to run +multicore=$(( threads / 3 )) + +echo " * BISMARK: $(readlink -f $(which bismark_methylation_extractor)) " +echo " * OUT DIR: $bismark_meth_dir " +echo " * BAM: $bismark_bam " +echo " * REPORT ORIGINAL: $bismark_report " +echo " * BEDGRAPH: $bismark_bedgraph_gz " +echo " * REPORT: $bismark_report_short " +echo " * CYTOSINE REPORT: $bismark_cpg_report_gz " + +# --comprehensive \ + +# Genome-wide cytosine methylation report specific options: +# --cytosine_report - produces a genome-wide methylation report for all cytosines in the genome (1-based) +# --genome_folder - genome folder you wish to use to extract sequences from + +bash_cmd=" +bismark_methylation_extractor \ +--gzip \ +--multicore $multicore \ +--buffer_size 4G \ +--genome_folder $ref_bismark \ +--cytosine_report \ +--bedGraph \ +$bismark_flags \ +$bismark_bam +" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + + +######################### + + +# check that output generated + +if [ ! -s $bismark_report ] || [ ! $bismark_report ] +then + printf "\n\n ERROR! 
REPORT $bismark_report NOT GENERATED \n\n" + exit 1 +fi + + +######################### + + +# put reports in report directory for bismark2report +# bismark_bams for bismark2summary (may not be necessary) + +# adjust for SE/PE +if [ "$analysis_type" == "pe" ] ; then + bismark_report_splittng="${bismark_report_dir}/${sample}_bismark_bt2_pe.deduplicated_splitting_report.txt" + bismark_report_mbias="${bismark_report_dir}/${sample}_bismark_bt2_pe.deduplicated.M-bias.txt" +fi + +# not tested +if [ "$analysis_type" == "se" ] ; then + bismark_report_splittng="${bismark_report_dir}/${sample}_bismark_bt2_splitting_report.txt" + bismark_report_mbias="${bismark_report_dir}/${sample}_bismark_bt2.M-bias.txt" +fi + +if [ "$analysis_type" == "pe" ] || [ "$analysis_type" == "se" ] +then + + CMD="cp -v $bismark_report $bismark_report_splittng" + printf "\n\n $CMD \n\n" + $CMD + + CMD="cp -v $bismark_mbias_report $bismark_report_mbias" + printf "\n\n $CMD \n\n" + $CMD + +fi + + +######################### + + +CMD="mv -v $bismark_report $bismark_report_short" +printf "\n\n $CMD \n\n" +$CMD + + +######################### + + +# convert cytosine report to methylkit-compatible format and remove uncovered bases + +# cytosine report: +# +# new format: chr, start, strand, methylationRatio, coverage +# zcat $bismark_cpg_report_gz | awk '{ OFS="\t"; if( $4+0 > 0 || $5+0 > 0 ) print $1,$2,$3,$4/($4+$5),$4+$5; }' | sort -k1,1 -k2,2n > $BISMARK_METHYLKIT_REPORT + + +######################### + + +# convert bedgraph to bigwig + +bigwig_dir="${proj_dir}/BIGWIG-Bismark" +bigwig="${bigwig_dir}/${sample}.bw" + +mkdir -p "$bigwig_dir" + +ref_chromsizes=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-CHROMSIZES); + +# bedgraph is gzipped by default +gunzip -cv $bismark_bedgraph_gz > $bismark_bedgraph + +bash_cmd="bedGraphToBigWig $bismark_bedgraph $ref_chromsizes $bigwig" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +# delete bedgraph +rm -fv $bismark_bedgraph + + +######################### + + +# generate summary + +# header +echo "#SAMPLE,READS,TOTAL Cs,METH_CpG_CONTEXT,METH_CHG_CONTEXT,METH_CHH_CONTEXT" > $summary_csv + +# print the relevant numbers +paste -d ',' \ +<(echo "$sample") \ +<(cat $bismark_report_short | grep "lines in total" | head -1 | cut -d " " -f 2) \ +<(cat $bismark_report_short | grep "Total number of C's analysed" | head -1 | cut -f 2) \ +<(cat $bismark_report_short | grep "C methylated in CpG context" | head -1 | cut -f 2) \ +<(cat $bismark_report_short | grep "C methylated in CHG context" | head -1 | cut -f 2) \ +<(cat $bismark_report_short | grep "C methylated in CHH context" | head -1 | cut -f 2) \ +>> $summary_csv + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq \ +> "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# combine summary from each step + +combined_summary_csv="${proj_dir}/summary-combined.bs-bismark-${analysis_type}.csv" + +# combine all summaries +bash_cmd=" +bash ${code_dir}/scripts/join-many.sh , X \ +${proj_dir}/summary.fastq-fastq-clean.csv \ +${proj_dir}/summary.fastq-fastq-trim-trimgalore.csv \ +${proj_dir}/summary.fastq-fastq-trim-trimmomatic.csv \ +${proj_dir}/summary.fastq-bam-bismark.csv \ +${proj_dir}/summary.bam-bam-dd-bismark.csv \ +${proj_dir}/summary.${segment_name}.csv \ +> $combined_summary_csv +" +(eval $bash_cmd) + + +######################### + + +# combine charts into a single png + 
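+# montage comes from ImageMagick (assumed to be on the PATH on this system);
+# -label %t titles each tile with its source filename, so the per-sample
+# M-bias plots remain identifiable in the combined grid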
+combined_png_2X="${proj_dir}/summary.${segment_name}.mbias.2x.png"
+combined_png_4X="${proj_dir}/summary.${segment_name}.mbias.4x.png"
+
+rm -f $combined_png_2X
+rm -f $combined_png_4X
+
+# -geometry +20+20 = 20px x and y padding
+# -tile 4x = 4 images wide
+# montage -geometry +20+20 -tile 4x ${bismark_meth_dir}/*M-bias*.png $COMBINED_PNG
+montage -geometry +20+20 -tile 2x -label %t -pointsize 18 -font Nimbus-Sans-Regular ${bismark_meth_dir}/*M-bias*.png $combined_png_2X
+montage -geometry +20+20 -tile 4x -label %t -pointsize 18 -font Nimbus-Sans-Regular ${bismark_meth_dir}/*M-bias*.png $combined_png_4X
+
+
+#########################
+
+
+# bismark html report
+
+cd "$bismark_report_dir"
+
+# sample report
+bismark2report
+
+# summary report
+bismark2summary --basename '_summary'
+
+
+#########################
+
+
+
+# end
diff --git a/segments/bam-qc-picard-rnaseqmetrics.sh b/segments/bam-qc-picard-rnaseqmetrics.sh
new file mode 100755
index 0000000..6ffa855
--- /dev/null
+++ b/segments/bam-qc-picard-rnaseqmetrics.sh
@@ -0,0 +1,227 @@
+#!/bin/bash
+
+
+# run Picard CollectRnaSeqMetrics
+
+
+# script filename
+script_name=$(basename "${BASH_SOURCE[0]}")
+segment_name=${script_name/%.sh/}
+echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2
+
+# check for correct number of arguments
+if [ ! $# == 3 ] ; then
+ echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2
+ echo -e "\n USAGE: $script_name [project dir] [sample] [BAM] \n" >&2
+ exit 1
+fi
+
+# arguments
+proj_dir=$1
+sample=$2
+bam=$3
+
+# strand:
+# fwd | transcript | cufflinks "fr-secondstrand" | htseq "yes" | picard "FIRST_READ"
+# rev | rev comp of transcript | cufflinks "fr-firststrand" | htseq "reverse" | picard "SECOND_READ"
+
+
+#########################
+
+
+# check that inputs exist
+
+if [ ! -d "$proj_dir" ] || [ ! "$proj_dir" ] ; then
+ echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2
+ exit 1
+fi
+
+if [ ! -s "$bam" ] || [ ! "$bam" ] ; then
+ echo -e "\n $script_name ERROR: BAM $bam DOES NOT EXIST \n" >&2
+ exit 1
+fi
+
+CODE_DIR=$(dirname "$(dirname "${BASH_SOURCE[0]}")")
+REFFLAT=$(bash ${CODE_DIR}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-REFFLAT)
+RRNA_INTERVAL_LIST=$(bash ${CODE_DIR}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-RRNAINTERVALLIST)
+
+if [ ! -s "$REFFLAT" ] || [ ! "$REFFLAT" ] ; then
+ echo -e "\n $script_name ERROR: REFFLAT $REFFLAT DOES NOT EXIST \n" >&2
+ exit 1
+fi
+
+if [ ! -s "$RRNA_INTERVAL_LIST" ] || [ ! 
"$RRNA_INTERVAL_LIST" ] ; then + echo -e "\n $script_name ERROR: RRNA INTERVAL LIST $RRNA_INTERVAL_LIST DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# output + +metrics_dir="${proj_dir}/QC-RnaSeqMetrics" +mkdir -p "$metrics_dir" +metrics_txt="${metrics_dir}/${sample}.txt" +metrics_pdf="${metrics_dir}/${sample}.pdf" + +summary_dir="${proj_dir}/summary" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +# # strand flag +# if [ "$STRAND" == "unstr" ] ; then +# STRAND_FLAG="0" +# elif [ "$STRAND" == "fwd" ] ; then +# STRAND_FLAG="1" +# elif [ "$STRAND" == "rev" ] ; then +# STRAND_FLAG="2" +# else +# echo -e "\n $script_name ERROR: incorrect strand selected \n" >&2 +# exit 1 +# fi + + +######################### + + +# exit if output exits already + +if [ -s "$metrics_txt" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +module unload java +module load java/1.7 +module load picard-tools/1.88 + + + + +# STRAND_SPECIFICITY {NONE, FIRST_READ_TRANSCRIPTION_STRAND, SECOND_READ_TRANSCRIPTION_STRAND} +# STRAND_SPECIFICITY="NONE" + + +echo " * CollectRnaSeqMetrics: ${PICARD_ROOT}/CollectRnaSeqMetrics.jar " +echo " * BAM: $bam " +echo " * REF_FLAT: $REFFLAT " +echo " * RIBOSOMAL_INTERVALS: $RRNA_INTERVAL_LIST " +echo " * OUT TXT: $metrics_txt " +echo " * OUT PDF: $metrics_pdf " + +# run picard + +PICARD_CMD="java -Xms8G -Xmx8G -jar ${PICARD_ROOT}/CollectRnaSeqMetrics.jar \ +VERBOSITY=WARNING QUIET=true VALIDATION_STRINGENCY=LENIENT MAX_RECORDS_IN_RAM=2500000 \ +REF_FLAT=${REFFLAT} \ +INPUT=${bam}" + +CMD="$PICARD_CMD \ +STRAND_SPECIFICITY=NONE \ +RIBOSOMAL_INTERVALS=${RRNA_INTERVAL_LIST} \ +CHART_OUTPUT=${metrics_pdf} \ +OUTPUT=${metrics_txt}" +echo "CMD: $CMD" +$CMD + +CMD="$PICARD_CMD \ +STRAND_SPECIFICITY=FIRST_READ_TRANSCRIPTION_STRAND \ +OUTPUT=${metrics_txt}.1READ" +echo "CMD: $CMD" +$CMD + +CMD="$PICARD_CMD \ +STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND \ +OUTPUT=${metrics_txt}.2READ" +echo "CMD: $CMD" +$CMD + + +######################### + + +# check that output generated + +if [ ! 
-s "$metrics_txt" ] ; then + echo -e "\n $script_name ERROR: METRICS $metrics_txt NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# generate a summary file + +# 01 PF_BASES +# 02 PF_ALIGNED_BASES +# 03 RIBOSOMAL_BASES +# 04 CODING_BASES +# 05 UTR_BASES +# 06 INTRONIC_BASES +# 07 INTERGENIC_BASES +# 08 IGNORED_READS +# 09 CORRECT_STRAND_READS +# 10 INCORRECT_STRAND_READS +# 11 PCT_RIBOSOMAL_BASES +# 12 PCT_CODING_BASES +# 13 PCT_UTR_BASES +# 14 PCT_INTRONIC_BASES +# 15 PCT_INTERGENIC_BASES +# 16 PCT_MRNA_BASES +# 17 PCT_USABLE_BASES +# 18 PCT_CORRECT_STRAND_READS +# 19 MEDIAN_CV_COVERAGE +# 20 MEDIAN_5PRIME_BIAS +# 21 MEDIAN_3PRIME_BIAS +# 22 MEDIAN_5PRIME_TO_3PRIME_BIAS + +paste \ +<(echo -e "#SAMPLE\n${sample}") \ +<(cat "${metrics_txt}" | grep -A 1 "PF_ALIGNED_BASES" | cut -f 2,11,12,13,14,15,22) \ +<(cat "${metrics_txt}.1READ" | grep -A 1 "PF_ALIGNED_BASES" | cut -f 18 | sed 's/PCT_CORRECT_STRAND_READS/FWD_STRAND/') \ +<(cat "${metrics_txt}.2READ" | grep -A 1 "PF_ALIGNED_BASES" | cut -f 18 | sed 's/PCT_CORRECT_STRAND_READS/REV_STRAND/') \ +| tr '\t' ',' \ +> $summary_csv + +rm -f "${metrics_txt}.1READ" +rm -f "${metrics_txt}.2READ" + + +# combine charts into a single pdf + +combined_pdf=${proj_dir}/summary.${segment_name}.pdf + +rm -f "$combined_pdf" + +# do not use quotes in filenames for ghostscript +gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${combined_pdf} ${metrics_dir}/*.pdf + +# combine charts into a single png + +combined_png_4w=${proj_dir}/summary.${segment_name}.4w.png +combined_png_5w=${proj_dir}/summary.${segment_name}.5w.png + +rm -f "$combined_png_4w" +rm -f "$combined_png_5w" + +# -geometry +20+20 = 20px x and y padding +# -tile 4x = 4 images wide +montage -geometry +20+20 -tile 4x "${metrics_dir}/*.pdf" "$combined_png_4w" +montage -geometry +20+20 -tile 5x "${metrics_dir}/*.pdf" "$combined_png_5w" + + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv| LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + + +# end diff --git a/segments/bam-quant-featurecounts.sh b/segments/bam-quant-featurecounts.sh new file mode 100755 index 0000000..b2baf85 --- /dev/null +++ b/segments/bam-quant-featurecounts.sh @@ -0,0 +1,238 @@ +#!/bin/bash + + +# run featureCounts + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 6 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name [project dir] [sample] [threads] [bam] [PE/SE] [strand] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +threads=$3 +bam=$4 +run_type=$5 +strand=$6 + +# strand: +# fwd | transcript | cufflinks "fr-secondstrand" | htseq "yes" | picard "FIRST_READ" +# rev | rev comp of transcript | cufflinks "fr-firststrand" | htseq "reverse" | picard "SECOND_READ" + + +######################### + + +# check that inputs exist + +if [ ! -d "$proj_dir" ] || [ ! "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! -s "$bam" ] || [ ! "$bam" ] ; then + echo -e "\n $script_name ERROR: bam $bam DOES NOT EXIST \n" >&2 + exit 1 +fi + +code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +gtf=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-GTF); + +if [ ! -s "$gtf" ] || [ ! 
"$gtf" ] ; then + echo -e "\n $script_name ERROR: GTF $gtf DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +segment_name="${segment_name}-${strand}" + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +fc_quant_dir=${proj_dir}/quant-featureCounts +mkdir -p "$fc_quant_dir" +counts_clean=${fc_quant_dir}/${sample}.counts.${strand}.txt + +fc_logs_dir="${proj_dir}/logs-featureCounts" +mkdir -p "$fc_logs_dir" +counts_raw="${fc_logs_dir}/${sample}.${strand}.txt" +counts_summary_raw="${fc_logs_dir}/${sample}.${strand}.txt.summary" + +# run type flag +if [ "$run_type" == "SE" ] ; then + run_type_flag="" +elif [ "$run_type" == "PE" ] ; then + run_type_flag="-p" +else + echo -e "\n $script_name ERROR: incorrect run type $run_type selected \n" >&2 + exit 1 +fi + +# strand flag +if [ "$strand" == "unstr" ] ; then + strand_flag="0" +elif [ "$strand" == "fwd" ] ; then + strand_flag="1" +elif [ "$strand" == "rev" ] ; then + strand_flag="2" +else + echo -e "\n $script_name ERROR: incorrect strand $strand selected \n" >&2 + exit 1 +fi + + +######################### + + +# exit if output exits already + +if [ -s "$counts_clean" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# featureCounts + +module load subread/1.4.6-p3 + +# featureCounts generates temp files in the current directory +cd "$fc_logs_dir" + +echo " * featureCounts: $(readlink -f $(which featureCounts)) " +echo " * RUN TYPE: $run_type " +echo " * BAM: $bam " +echo " * GTF: $gtf " +echo " * COUNTS: $counts_clean " + +# Summarize a single-end read dataset using 5 threads: +# featureCounts -T 5 -t exon -g gene_id -a annotation.gtf -o counts.txt mapping_results_SE.sam +# Summarize a bam format dataset: +# featureCounts -t exon -g gene_id -a annotation.gtf -o counts.txt mapping_results_SE.bam +# Summarize multiple datasets at the same time: +# featureCounts -t exon -g gene_id -a annotation.gtf -o counts.txt library1.bam library2.bam library3.bam +# Perform strand-specific read counting (use '-s 2' if reversely stranded): +# featureCounts -s 1 -t exon -g gene_id -a annotation.gtf -o counts.txt mapping_results_SE.bam +# Summarize paired-end reads and count fragments (instead of reads): +# featureCounts -p -t exon -g gene_id -a annotation.gtf -o counts.txt mapping_results_PE.bam +# Summarize multiple paired-end datasets: +# featureCounts -p -t exon -g gene_id -a annotation.gtf -o counts.txt library1.bam library2.bam library3.bam +# Count the fragments that have fragment length between 50bp and 600bp only: +# featureCounts -p -P -d 50 -D 600 -t exon -g gene_id -a annotation.gtf -o counts.txt mapping_results_PE.bam +# Count those fragments that have both ends mapped only: +# featureCounts -p -B -t exon -g gene_id -a annotation.gtf -o counts.txt mapping_results_PE.bam +# Exclude chimeric fragments from fragment counting: +# featureCounts -p -C -t exon -g gene_id -a annotation.gtf -o counts.txt mapping_results_PE.bam + +CMD=" +featureCounts \ +-T $threads \ +$run_type_flag \ +-g gene_name \ +-s $strand_flag \ +-a $gtf \ +-o $counts_raw \ +$bam +" +echo "CMD: $CMD" +eval "$CMD" + + +######################### + + +# check that output generated + +if [ ! 
-s "$counts_raw" ] ; then + echo -e "\n $script_name ERROR: COUNTS $counts_raw NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# clean up the full output + +# 1: Geneid +# 2: Chr +# 3: Start +# 4: End +# 5: strand +# 6: Length +# 7: count + +# gene counts table +echo -e "#GENE\t${sample}" > "$counts_clean" +cat $counts_raw | grep -v '#' | grep -v 'Geneid' | cut -f 1,7 | LC_ALL=C sort -k1,1 >> "$counts_clean" + +# gene info table +gene_info_file="${proj_dir}/genes.featurecounts.txt" +echo -e "#GENE\tCHR\tSTART\tEND\tstrand\tLENGTH" > "$gene_info_file" +cat $counts_raw | grep -v '#' | grep -v 'Geneid' | cut -f 1-6 | LC_ALL=C sort -k1,1 >> "$gene_info_file" + +# delete the full original counts table +rm -fv $counts_raw + + +######################### + + +# generate summary + +counts_assigned=$(cat $counts_summary_raw | grep -m 1 "^Assigned" | cut -f 2) + +# header for summary file +echo "#SAMPLE,ASSIGNED COUNTS ${strand}" > "$summary_csv" + +# summarize log file +echo "${sample},${counts_assigned}" >> "$summary_csv" + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# generate counts matrix for all samples + +# merged counts filename +merged_counts_file="${proj_dir}/quant.featurecounts.counts.${strand}.txt" + +# list of all counts files +count_file_list=$(find "$fc_quant_dir" -type f -name "*.counts.${strand}.txt" | LC_ALL=C sort | tr '\n' ' ') + +# merged counts filename with sample included to avoid conflicts during join process +CMD="bash ${code_dir}/scripts/join-many.sh $'\t' 0 $count_file_list > ${merged_counts_file}.${sample}.tmp" +(eval "$CMD") + +# rewrite any existing merged counts table with current table +mv -fv "${merged_counts_file}.${sample}.tmp" "$merged_counts_file" + + +######################### + + + +# end diff --git a/segments/fastq-bam-bismark.sh b/segments/fastq-bam-bismark.sh new file mode 100755 index 0000000..7c7887f --- /dev/null +++ b/segments/fastq-bam-bismark.sh @@ -0,0 +1,270 @@ +#!/bin/bash + + +# Bismark alignment + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ $# -lt 4 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name project_dir sample_name num_threads FASTQ_R1 [FASTQ_R2] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +threads=$3 +fastq_R1=$4 +fastq_R2=$5 + + +######################### + + +# check that inputs exist + +if [ ! -d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! -s "$fastq_R1" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq_R1 DOES NOT EXIST \n" >&2 + exit 1 +fi + +code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +ref_bismark=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-BISMARK); + +if [ ! -d "$ref_bismark" ] || [ ! 
"$ref_bismark" ] ; then + echo -e "\n $script_name ERROR: REF $ref_bismark DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +samples_csv="${proj_dir}/samples.${segment_name}.csv" + +bismark_bam_dir="${proj_dir}/BAM-Bismark" +mkdir -p "$bismark_bam_dir" +bismark_bam_final="${bismark_bam_dir}/${sample}.bam" + +bismark_logs_dir="${proj_dir}/logs-${segment_name}" +mkdir -p "$bismark_logs_dir" +bismark_report_final="${bismark_logs_dir}/${sample}.report.txt" +bismark_nucstats_final="${bismark_logs_dir}/${sample}.nucleotide_stats.txt" + +bismark_report_dir="${proj_dir}/Bismark-report" +mkdir -p "$bismark_report_dir" + +# v0.16.0: "File endings .fastq | .fq | .fastq.gz | .fq.gz are now removed from the output file" +# BISMARK_ID=$(basename $FASTQ1) +# BISMARK_ID="$FASTQ1_SHORT" +bismark_id=$(basename "$fastq_R1") +bismark_id=${bismark_id/.fastq.gz/} + +# adjust for single/paired end +if [ -n "$fastq_R2" ] ; then + suffix_bam="_bismark_bt2_pe.bam" + suffix_report="_bismark_bt2_PE_report.txt" + suffix_nucstats="_bismark_bt2_pe.nucleotide_stats.txt" +else + suffix_bam="_bismark_bt2.bam" + suffix_report="_bismark_bt2_SE_report.txt" + suffix_nucstats="_bismark_bt2.nucleotide_stats.txt" +fi + +bismark_bam_original="${bismark_logs_dir}/${bismark_id}${suffix_bam}" +bismark_report_original="${bismark_logs_dir}/${bismark_id}${suffix_report}" +bismark_nucstats_original="${bismark_logs_dir}/${bismark_id}${suffix_nucstats}" + + +######################### + + +# exit if output exits already + +if [ -s "$bismark_bam_final" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# bismark +# bismark can't use sorted bam at the next step, but other tools may need sorted bam + +# bismark/0.15.0 and bismark/0.16.0 load bowtie2/2.2.6 and samtools/1.3 +module load bismark/0.16.3 + +# "In order to work properly the current working directory must contain the sequence files to be analysed" (as of v0.14) +fastq_dir=$(dirname "$fastq_R1") +cd "$fastq_dir" || exit 1 + +# adjust for single/paired end +if [ -n "$fastq_R2" ] ; then + bismark_flags="--maxins 1000" + fastq_files="-1 $fastq_R1 -2 $fastq_R2" +else + bismark_flags="" + fastq_files="$fastq_R1" +fi + +# number of parallel instances of Bismark to run (3-5 threads per instance depending on parameters) +# "a typical Bismark run will use several cores already (Bismark itself, Bowtie/Bowtie2, Samtools, gzip etc...)" +multicore_flag=$(( threads / 3 )) + +# v0.15.0: "specifying --basename in conjuction with --multicore is currently not supported" + +echo " * BISMARK: $(readlink -f $(which bismark)) " +echo " * BISMARK REF: $ref_bismark " +echo " * FASTQ DIR: $fastq_dir " +echo " * FASTQ R1: $fastq_R1 " +echo " * FASTQ R2: $fastq_R2 " +echo " * BAM ORIGINAL: $bismark_bam_original " +echo " * REPORT ORIGINAL: $bismark_report_original " +echo " * BAM FINAL: $bismark_bam_final " +echo " * REPORT FINAL: $bismark_report_final " + +bash_cmd=" +bismark \ +--bowtie2 \ +--gzip \ +--nucleotide_coverage \ +--multicore $multicore_flag \ +--path_to_bowtie $BOWTIE2_ROOT \ +$bismark_flags \ +--temp_dir $bismark_logs_dir \ +--output_dir $bismark_logs_dir \ +$ref_bismark \ +$fastq_files +" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +sleep 30 + + +######################### + + +# check that output generated + +if [ ! 
-s "$bismark_bam_original" ] ; then + echo -e "\n $script_name ERROR: BAM $bismark_bam_original NOT GENERATED \n" >&2 + exit 1 +fi + +if [ ! -s "$bismark_report_original" ] ; then + echo -e "\n $script_name ERROR: REPORT $bismark_report_original NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# clean up output name + +bash_cmd="mv -v $bismark_bam_original $bismark_bam_final" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +bash_cmd="mv -v $bismark_report_original $bismark_report_final" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +bash_cmd="mv -v $bismark_nucstats_original $bismark_nucstats_final" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +sleep 30 + + +######################### + + +# put reports in report directory for bismark2report +# BAMs for bismark2summary (may not be necessary) + +# adjust for single/paired end +if [ -n "$fastq_R2" ] ; then + bismark_report_bam="${bismark_report_dir}/${sample}${suffix_bam}" + bismark_report_report="${bismark_report_dir}/${sample}${suffix_report}" + bismark_report_nucstats="${bismark_report_dir}/${sample}${suffix_nucstats}" +else + bismark_report_bam="${bismark_report_dir}/${sample}${suffix_bam}" + bismark_report_report="${bismark_report_dir}/${sample}${suffix_report}" + bismark_report_nucstats="${bismark_report_dir}/${sample}${suffix_nucstats}" +fi + +bash_cmd="ln -sv ../$(basename "$bismark_bam_dir")/$(basename "$bismark_bam_final") $bismark_report_bam" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +bash_cmd="cp -v $bismark_report_final $bismark_report_report" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +bash_cmd="cp -v $bismark_nucstats_final $bismark_report_nucstats" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + + +######################### + + +# generate alignment summary + +# header for summary file +echo "#SAMPLE,ALIGN INPUT READS,UNIQUELY ALIGNED,ALIGNMENT RATE" > $summary_csv + +# print the relevant numbers +paste -d ',' \ +<(echo "$sample") \ +<(cat "$bismark_report_final" | grep -m 1 "Sequence.*analysed" | cut -f 2) \ +<(cat "$bismark_report_final" | grep -m 1 "alignments with a unique best hit" | cut -f 2) \ +<(cat "$bismark_report_final" | grep -m 1 "Mapping efficiency" | cut -f 2 | sed 's/ //g') \ +>> $summary_csv + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# add sample and BAM to sample sheet +echo "${sample},${bismark_bam_final}" >> "$samples_csv" + +sleep 30 + +# sort and remove duplicates in place in sample sheet +LC_ALL=C sort -t ',' -k1,1 -u -o "$samples_csv" "$samples_csv" + +sleep 30 + + +######################### + + + +# end diff --git a/segments/fastq-bam-bwa-mem.sh b/segments/fastq-bam-bwa-mem.sh new file mode 100755 index 0000000..95dc615 --- /dev/null +++ b/segments/fastq-bam-bwa-mem.sh @@ -0,0 +1,371 @@ +#!/bin/bash + + +# run BWA-MEM + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ $# -lt 4 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name project_dir sample_name num_threads FASTQ_R1 [FASTQ_R2] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +threads=$3 +fastq_R1=$4 +fastq_R2=$5 + + +######################### + + +# check that inputs exist + +if [ ! 
-d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! -s "$fastq_R1" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq_R1 DOES NOT EXIST \n" >&2 + exit 1 +fi + +code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +ref_bwa=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-BWA); + +if [ ! -s "$ref_bwa" ] || [ ! -n "$ref_bwa" ] ; then + echo -e "\n $script_name ERROR: REF $ref_bwa DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +samples_csv="${proj_dir}/samples.${segment_name}.csv" + +bwa_bam_dir="${proj_dir}/BAM-BWA" +mkdir -p "$bwa_bam_dir" +bam="${bwa_bam_dir}/${sample}.bam" + +bwa_logs_dir="${proj_dir}/logs-${segment_name}" +mkdir -p "$bwa_logs_dir" +bwa_flagstat="${bwa_logs_dir}/${sample}.flagstat.txt" + + +######################### + + +# exit if output exits already + +if [ -s "$bam" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# BWA + +module load bwa/0.7.13 + +sambamba_bin="/ifs/home/id460/bin/sambamba" + +echo " * bwa: $(readlink -f $(which bwa)) " +echo " * bwa version: $(bwa 2>&1 | grep -m 1 'Version') " +echo " * sambamba: $(readlink -f $(which $sambamba_bin)) " +echo " * sambamba version: $($sambamba_bin 2>&1 | head -1) " +echo " * BWA REF: $ref_bwa " +echo " * FASTQ R1: $fastq_R1 " +echo " * FASTQ R2: $fastq_R2 " +echo " * BAM: $bam " + +# step 1: align with BWA-MEM +# step 2: convert SAM to BAM and remove low quality reads +# step 3: sort BAM + +bash_cmd=" +bwa mem \ +-M -v 1 \ +-t $threads \ +-R '@RG\tID:${sample}\tSM:${sample}\tLB:${sample}\tPL:ILLUMINA' \ +$ref_bwa \ +$fastq_R1 $fastq_R2 \ +| \ +$sambamba_bin view \ +--sam-input \ +--nthreads=${threads} \ +--filter='mapping_quality>=10' \ +--format=bam \ +--compression-level=0 \ +/dev/stdin \ +| \ +$sambamba_bin sort \ +--nthreads=${threads} \ +--memory-limit=16GB \ +--out=${bam} \ +/dev/stdin +" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +sleep 30 + + +######################### + + +# check that output generated + +# check that bam file size above 10 kb +du -s $bam +bam_size=$(du -s --apparent-size $bam | cut -f 1) +if [ $bam_size -lt 10 ] ; then + echo -e "\n $script_name ERROR: BAM $bam TOO SMALL \n" >&2 + exit 1 +fi + + +######################### + + +# run flagstat + +bash_cmd="$sambamba_bin flagstat $bam > $bwa_flagstat" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + + +######################### + + +# generate alignment summary + +# get number of input reads (not pairs) +fastq_lines=$(zcat "$fastq_R1" | wc -l) +if [ -n "$fastq_R2" ] ; then + reads_input=$(echo "${fastq_lines}/2" | bc) +else + reads_input=$(echo "${fastq_lines}/4" | bc) +fi + +reads_mapped=$(cat "$bwa_flagstat" | grep -m 1 "mapped (" | cut -d ' ' -f 1) +reads_chimeric=$(cat "$bwa_flagstat" | grep -m 1 "mate mapped to a different chr" | cut -d ' ' -f 1) + +reads_mapped_pct=$(echo "(${reads_mapped}/${reads_input})*100" | bc -l | cut -c 1-4) +reads_mapped_pct="${reads_mapped_pct}%" + +reads_chimeric_pct=$(echo "(${reads_chimeric}/${reads_mapped})*100" | bc -l | cut -c 1-4) +reads_chimeric_pct="${reads_chimeric_pct}%" + +# header for summary file +echo "#SAMPLE,INPUT READS,MAPPED READS (MQ10),MAPPED %,CHIMERIC %" > "$summary_csv" + +# summarize log file +echo 
"${sample},${reads_input},${reads_mapped},${reads_mapped_pct},${reads_chimeric_pct}" >> "$summary_csv" + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# add sample and BAM to sample sheet +echo "${sample},${bam}" >> "$samples_csv" + +sleep 30 + +# sort and remove duplicates in place in sample sheet +LC_ALL=C sort -t ',' -k1,1 -u -o "$samples_csv" "$samples_csv" + +sleep 30 + + +######################### + + + +# end + + + +exit + + + + + + +######################### + + +# output +DD_DIR=${BASE_DIR}/BAM-DD +BAM_DD=${DD_DIR}/${ID}.dd.bam + +mkdir -p $DD_DIR + +print_msg "REMOVE DUPS - $ID" + +CMD="bash ${HOME}/pipeline/wrappers/picard-mark-dups.sh $BASE_DIR $ID $BAM_BWA $BAM_DD false" +printf "\n\n $CMD \n\n" +$CMD + +BAM=$BAM_DD + + +######################### + + +# check that BAM generated + +if [ ! -s $BAM ] +then + printf "\n\n ERROR! $BAM DOES NOT EXIST \n\n" + exit 1 +fi + + +######################### + + +print_msg "SAMTOOLS FLAGSTAT - $ID" + +bash ${HOME}/pipeline/wrappers/samtools-flagstat.sh $BASE_DIR $ID $BAM + + +######################### + + +# Picard CollectInsertSizeMetrics + +print_msg "INSERT SIZE METRICS - $ID" + +# qsub-if-needed.sh +bash ${HOME}/pipeline/helpers/qsub-if-needed.sh $BASE_DIR ${ID}.txt picard-insert-size $GENOME $BASE_DIR $ID $BAM $REF_FASTA + + +######################### + + +# try to find the bed file again in case it was placed in the directory after the run was started + +if [ ! -s $BED ] || [ ! $BED ] +then + BED=$(find $BASE_DIR -type f -name "*.bed") +fi + +if [ ! -s $BED ] || [ ! $BED ] +then + BED=0 +fi + + +######################### + + +COV_DD_DIR=${BASE_DIR}/COV-DD +mkdir -p $COV_DD_DIR + +if [ -f ${COV_DD_DIR}/${ID}.cov.ref.sample_summary ] +then + + print_msg "SKIP COVERAGE - $ID" + +else + + print_msg "GATK COVERAGE - $ID" + + bash /ifs/home/id460/pipeline/wrappers/gatk-cov-all.sh $THREADS $REF_FASTA $BED $ID $BAM $COV_DD_DIR + +fi + + +######################### + + +# GATK requires several R libraries (using the default R 3.0.2 probably) +# install.packages("ggplot2") +# install.packages("gsalib") +# install.packages("reshape") + +if [ $GENOME == "hg19" ] +then + + print_msg "GATK REALIGNMENT/RECALIBRATION (hg19)" + + bash /ifs/home/id460/pipeline/gatk-ra-rc-hg19.sh $ID $BAM $BASE_DIR $THREADS + +elif [ $GENOME == "mm10" ] +then + + print_msg "GATK REALIGNMENT/RECALIBRATION (mm10)" + + bash /ifs/home/id460/pipeline/gatk-ra-rc-mm10.sh $ID $BAM $BASE_DIR $THREADS + +elif [ $GENOME == "dm3" ] +then + + print_msg "GATK REALIGNMENT/RECALIBRATION (dm3)" + + bash /ifs/home/id460/pipeline/gatk-ra-rc-dm3.sh $ID $BAM $BASE_DIR $THREADS + + # realignment only (if chrM only) + # bash /ifs/home/id460/pipeline/gatk-ra.sh $THREADS $GENOME $ID $BAM_DD $BASE_DIR + +else + + print_msg "GENERIC GATK REALIGNMENT (NO RECALIBRATION, SINCE NO SNP INFO)" + + bash /ifs/home/id460/pipeline/gatk-ra.sh $THREADS $GENOME $ID $BAM $BASE_DIR + +fi + + +######################### + + +# combine all summaries + +# in case files do not exist (if re-running analysis, for example) +touch -a summary.trim.txt +touch -a summary.flagstat.txt + +# (should change from paste to join) +# module load coreutils/8.24 +paste \ +${BASE_DIR}/summary.trim.txt \ +${BASE_DIR}/summary.flagstat.txt \ +${BASE_DIR}/COV-DD/__cov.txt \ +${BASE_DIR}/COV-DD-RA-RC/_cov_samples.txt \ +> ${BASE_DIR}/summary-combined.txt + + +######################### + 
+date + + +# end + + diff --git a/segments/fastq-bam-star.sh b/segments/fastq-bam-star.sh new file mode 100755 index 0000000..803f8c2 --- /dev/null +++ b/segments/fastq-bam-star.sh @@ -0,0 +1,287 @@ +#!/bin/bash + + +# run STAR + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ $# -lt 4 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name project_dir sample_name num_threads FASTQ_R1 [FASTQ_R2] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +threads=$3 +fastq_R1=$4 +fastq_R2=$5 + + +######################### + + +# check that inputs exist + +if [ ! -d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! -s "$fastq_R1" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq_R1 DOES NOT EXIST \n" >&2 + exit 1 +fi + +code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +ref_star=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-STAR); + +if [ ! -s "${ref_star}/SA" ] ; then + echo -e "\n $script_name ERROR: REF $ref_star DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +samples_csv="${proj_dir}/samples.${segment_name}.csv" + +star_bam_dir="${proj_dir}/BAM-STAR" +mkdir -p "$star_bam_dir" +bam="${star_bam_dir}/${sample}.bam" + +star_quant_dir="${proj_dir}/quant-STAR" +mkdir -p "$star_quant_dir" +counts_txt="${star_quant_dir}/${sample}.txt" + +star_logs_dir="${proj_dir}/logs-${segment_name}" +mkdir -p "$star_logs_dir" +star_prefix="${star_logs_dir}/${sample}." 
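+# note: STAR appends its own output names to --outFileNamePrefix (e.g. the
+# Log.final.out and ReadsPerGene.out.tab files used below), which is why the
+# prefix ends with a dot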
+ + +######################### + + +# exit if output exits already + +if [ -s "$bam" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + +if [ -s "${star_prefix}.bam" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# STAR + +module unload samtools +module load samtools/1.3 +# module load star/2.5.0c has issues +module load star/2.4.5a + +echo " * STAR: $(readlink -f $(which STAR)) " +echo " * samtools: $(readlink -f $(which samtools)) " +echo " * STAR REF: $ref_star " +echo " * FASTQ R1: $fastq_R1 " +echo " * FASTQ R2: $fastq_R2 " +echo " * BAM: $bam " + +# change dir because star writes a log file to pwd +cd "$proj_dir" + +# --outFilterType BySJout - ENCODE standard option +# --outSAMmapqUnique - int: 0 to 255: the MAPQ value for unique mappers + +# relevant errors: +# SeqAnswers: "HTseq cannot deal with jM:B:c,-1 and jI:B:i,-1 SAM attributes which are output if you use (non-default) --outSAMmode Full" +# Picard says STAR-sorted BAM "is not coordinate sorted", so using samtools for sorting + +bash_cmd=" +STAR \ +--runThreadN $threads \ +--genomeDir $ref_star \ +--genomeLoad NoSharedMemory \ +--outFilterMismatchNoverLmax 0.05 \ +--outFilterMultimapNmax 1 \ +--outFilterType BySJout \ +--outSAMstrandField intronMotif \ +--outSAMattributes NH HI AS nM NM MD XS \ +--outSAMmapqUnique 60 \ +--twopassMode Basic \ +--readFilesCommand zcat \ +--readFilesIn $fastq_R1 $fastq_R2 \ +--outFileNamePrefix $star_prefix \ +--quantMode GeneCounts \ +--outSAMtype BAM Unsorted \ +--outStd BAM_Unsorted | \ +samtools sort -@ $threads -m 4G -T $sample -o $bam - +" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +sleep 30 + +# index bam +bash_cmd="samtools index $bam" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + + +######################### + + +# check that output generated + +# check if BAM file is present +if [ ! -s "$bam" ] ; then + echo -e "\n $script_name ERROR: BAM $bam NOT GENERATED \n" >&2 + exit 1 +fi + +# check that bam file size above 10 kb +du -s $bam +bam_size=$(du -s --apparent-size $bam | cut -f 1) +if [ $bam_size -lt 10 ] ; then + echo -e "\n $script_name ERROR: BAM $bam TOO SMALL \n" >&2 + exit 1 +fi + +# check if gene counts file is present +if [ ! 
-s "${star_prefix}ReadsPerGene.out.tab" ] ; then + echo -e "\n $script_name ERROR: COUNTS FILE ${star_prefix}ReadsPerGene.out.tab NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# STAR counts +# not using STAR counts later, since it's hard to filter and they have gene ids instead of names + +# STAR outputs read counts per gene into ReadsPerGene.out.tab file with 4 columns which correspond to strandedness: +# column 1: gene ID +# column 2: counts for unstranded RNA-seq +# column 3: counts for the 1st read strand aligned with RNA (htseq-count option -s yes) +# column 4: counts for the 2nd read strand aligned with RNA (htseq-count option -s reverse) + +# move counts file to a separate bam-only directory +CMD="mv -v ${star_prefix}ReadsPerGene.out.tab $counts_txt" +echo "CMD: $CMD" +eval "$CMD" + +# strand: +# fwd | transcript | cufflinks "fr-secondstrand" | htseq "yes" | picard "FIRST_READ" +# rev | rev comp of transcript | cufflinks "fr-firststrand" | htseq "reverse" | picard "SECOND_READ" + +# top 1000 genes (useful to determine strandness) +echo -e "#gene_id,unstr,fwd,rev" > "${star_quant_dir}/${sample}.top1000.csv" +cat "$counts_txt" | LC_ALL=C sort -k2,2nr | head -n 1000 | tr '\t' ',' >> "${star_quant_dir}/${sample}.top1000.csv" + + +######################### + + +# determine strand + +# get total counts for each strand +counts_unstr=$(cat $counts_txt | grep -v 'N_' | awk -F $'\t' '{sum+=$2} END {print sum}') +echo "counts unstr: $counts_unstr" +counts_fwd=$(cat $counts_txt | grep -v 'N_' | awk -F $'\t' '{sum+=$3} END {print sum}') +echo "counts fwd: $counts_fwd" +counts_rev=$(cat $counts_txt | grep -v 'N_' | awk -F $'\t' '{sum+=$4} END {print sum}') +echo "counts rev: $counts_rev" + +# perform some sanity checks +if [ "$counts_unstr" -lt 1000 ] || [ "$counts_fwd" -lt 10 ] || [ "$counts_rev" -lt 10 ] ; then + echo -e "\n $script_name ERROR: LOW COUNTS \n" >&2 + exit 1 +fi + +lib_strand="unstr" + +if [ "$(echo "${counts_fwd}/${counts_rev}" | bc)" -gt 8 ] ; then + lib_strand="fwd" +fi + +if [ "$(echo "${counts_rev}/${counts_fwd}" | bc)" -gt 8 ] ; then + lib_strand="rev" +fi + +# set experiment strand (ignored if already specified) +exp_strand=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" EXP-STRAND "$lib_strand"); + +echo "sample strand: $lib_strand" +echo "experiment strand: $exp_strand" + + +######################### + + +# generate alignment summary + +# header for summary file +echo "#SAMPLE,INPUT READS,UNIQUELY MAPPED,MULTI-MAPPED,UNIQUELY MAPPED %,MULTI-MAPPED %" > $summary_csv + +# print the relevant numbers from log file +star_log_final="${star_prefix}Log.final.out" +paste -d ',' \ +<(echo "$sample") \ +<(cat "$star_log_final" | grep "Number of input reads" | head -1 | tr -d "[:blank:]" | cut -d "|" -f 2) \ +<(cat "$star_log_final" | grep "Uniquely mapped reads number" | head -1 | tr -d "[:blank:]" | cut -d "|" -f 2) \ +<(cat "$star_log_final" | grep "Number of reads mapped to too many loci" | head -1 | tr -d "[:blank:]" | cut -d "|" -f 2) \ +<(cat "$star_log_final" | grep "Uniquely mapped reads %" | head -1 | tr -d "[:blank:]" | cut -d "|" -f 2) \ +<(cat "$star_log_final" | grep "% of reads mapped to too many loci" | head -1 | tr -d "[:blank:]" | cut -d "|" -f 2) \ +>> $summary_csv + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# delete tmp directories (_STARtmp should be empty 
at this point) +rm -rfv ${star_prefix}_STAR* + + +######################### + + +# add sample and BAM to sample sheet +echo "${sample},${bam}" >> "$samples_csv" + +sleep 30 + +# sort and remove duplicates in place in sample sheet +LC_ALL=C sort -t ',' -k1,1 -u -o "$samples_csv" "$samples_csv" + +sleep 30 + + +######################### + + + +# end diff --git a/segments/fastq-fastq-clean.sh b/segments/fastq-fastq-clean.sh new file mode 100755 index 0000000..7ab7dc3 --- /dev/null +++ b/segments/fastq-fastq-clean.sh @@ -0,0 +1,182 @@ +#!/bin/bash + + +# merge fastqs or create symlinks to originals if no need to merge + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 2 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name [project dir] [sample name] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$(readlink -f $1) +sample=$2 + +# check if input exists +if [ ! -d "$proj_dir" ] || [ ! -n "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: PROJ DIR $proj_dir DOES NOT EXIST \n" + exit 1 +fi + +# sample sheet +samples_csv="${proj_dir}/samples.fastq-raw.csv" +if [ ! -s "$samples_csv" ] || [ ! -n "$samples_csv" ] ; then + echo -e "\n $script_name ERROR: SAMPLES CSV $samples_csv DOES NOT EXIST \n" + exit 1 +fi +samples_csv_clean="${proj_dir}/samples.${segment_name}.csv" + +# output summary +summary_dir="${proj_dir}/summary" +mkdir -p "${summary_dir}" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +# output files +fastq_clean_dir="${proj_dir}/FASTQ-CLEAN" +mkdir -p "$fastq_clean_dir" +fastq_R1_clean="${fastq_clean_dir}/${sample}_R1.fastq.gz" +fastq_R2_clean="${fastq_clean_dir}/${sample}_R2.fastq.gz" + +# considered initially +# fastq_R1_clean=$(bash ${CODE_DIR}/scripts/get-segment-filename.sh $proj_dir fastq-to-fastq-clean-r1 $sample) +# fastq_R2_clean=$(bash ${CODE_DIR}/scripts/get-segment-filename.sh $proj_dir fastq-to-fastq-clean-r2 $sample) + +# check if output already exists +if [ -s "$fastq_R1_clean" ] || [ -s "$fastq_R2_clean" ]; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + +# get number of files (or file pairs) for the sample (to decide if files need to be merged) +num_files=$(grep -c "^${sample}," "${samples_csv}") + +# scan for files that belong to the sample +grep "^${sample}," "${samples_csv}" | LC_ALL=C sort | while read -r LINE ; do + fastq_R1=$(echo "$LINE" | cut -d "," -f 2) + fastq_R2=$(echo "$LINE" | cut -d "," -f 3) + fastq_R1=$(readlink -f "$fastq_R1") + fastq_R2=$(readlink -f "$fastq_R2") + + # check that R1 file exists and not set to null + if [ ! -e "$fastq_R1" ] || [ ! -n "$fastq_R1" ] ; then + echo -e "\n $script_name ERROR: FASTQ R1 $fastq_R1 DOES NOT EXIST \n" + exit 1 + fi + + # check that R2 file exists if specified + if [ ! 
-e "$fastq_R2" ] && [ -n "$fastq_R2" ] ; then + echo -e "\n $script_name ERROR: FASTQ R2 $fastq_R2 DOES NOT EXIST \n" + exit 1 + fi + + if [ "$num_files" -gt 1 ] ; then + + # merge multple FASTQs + + bash_cmd="cat $fastq_R1 >> $fastq_R1_clean" + echo "CMD: $bash_cmd" + eval "$bash_cmd" + + if [ -n "$fastq_R2" ] ; then + bash_cmd="cat $fastq_R2 >> $fastq_R2_clean" + echo "CMD: $bash_cmd" + eval "$bash_cmd" + fi + + elif [ "$num_files" -eq 1 ] ; then + + # symlink to the original FASTQ if only one + + bash_cmd="ln -s $fastq_R1 $fastq_R1_clean" + echo "CMD: $bash_cmd" + eval "$bash_cmd" + + if [ -n "$fastq_R2" ] ; then + bash_cmd="ln -s $fastq_R2 $fastq_R2_clean" + echo "CMD: $bash_cmd" + eval "$bash_cmd" + fi + + fi + + sleep 30 + + date + +done + + +######################### + + +# check if output exists + +if [ ! -s "$fastq_R1_clean" ] || [ ! -n "$fastq_R1_clean" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq_R1_clean DOES NOT EXIST \n" + exit 1 +fi + + +######################### + + +# count number of reads in merged FASTQs + +lines_R1=$(zcat "$fastq_R1_clean" | wc -l) +reads_R1=$(echo "${lines_R1}/4" | bc) + +if [ -s "$fastq_R2_clean" ] ; then + lines_R2=$(zcat "$fastq_R2_clean" | wc -l) + reads_R2=$(echo "${lines_R2}/4" | bc) +else + reads_R2="0" + fastq_R2_clean="" +fi + +echo "FASTQ R1: $fastq_R1_clean" +echo "FASTQ R2: $fastq_R2_clean" +echo "READS R1: $reads_R1" +echo "READS R2: $reads_R2" + + +######################### + + +# generate summary + +echo "#SAMPLE,R1 RAW READS,R2 RAW READS" > "$summary_csv" +echo "${sample},${reads_R1},${reads_R2}" >> "$summary_csv" + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# add sample and FASTQ to sample sheet +echo "${sample},${fastq_R1_clean},${fastq_R2_clean}" >> "$samples_csv_clean" + +sleep 30 + +# sort and remove duplicates in place in sample sheet +LC_ALL=C sort -t ',' -k1,1 -u -o "$samples_csv_clean" "$samples_csv_clean" + +sleep 30 + + +######################### + + + +# end diff --git a/segments/fastq-fastq-trim-trimgalore.sh b/segments/fastq-fastq-trim-trimgalore.sh new file mode 100755 index 0000000..aac6de9 --- /dev/null +++ b/segments/fastq-fastq-trim-trimgalore.sh @@ -0,0 +1,199 @@ +#!/bin/bash + + +# run Trim Galore (for MspI digested RRBS samples) + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ $# -lt 4 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name project_dir sample_name run_type FASTQ_R1 [FASTQ_R2] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +run_type=$3 +fastq_R1=$4 +fastq_R2=$5 + + +######################### + + +# check that inputs exist + +if [ ! -d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! 
-s "$fastq_R1" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq_R1 DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +samples_csv="${proj_dir}/samples.${segment_name}.csv" + +fastq_trim_dir="${proj_dir}/FASTQ-TRIMMED" +mkdir -p "$fastq_trim_dir" +fastq_R1_trim="${fastq_trim_dir}/${sample}_R1.trim.fastq.gz" +fastq_R2_trim="${fastq_trim_dir}/${sample}_R2.trim.fastq.gz" + +trim_galore_logs_dir="${proj_dir}/logs-trimgalore" +mkdir -p "$trim_galore_logs_dir" + +# adjust for single/paired end +if [ -n "$fastq_R2" ] ; then + fastq_R1_trim_original=${trim_galore_logs_dir}/$(basename $fastq_R1 | sed 's/.fastq.gz/_val_1.fq.gz/') + fastq_R2_trim_original=${trim_galore_logs_dir}/$(basename $fastq_R2 | sed 's/.fastq.gz/_val_2.fq.gz/') + trim_galore_log=${trim_galore_logs_dir}/$(basename $fastq_R2)_trimming_report.txt +else + fastq_R1_trim_original=${trim_galore_logs_dir}/$(basename $fastq_R1 | sed 's/.fastq.gz/_trimmed.fq.gz/') + trim_galore_log=${trim_galore_logs_dir}/$(basename $fastq_R1)_trimming_report.txt + fastq_R2_trim="" +fi + + +######################### + + +# exit if output exits already + +if [ -s "$fastq_R1_trim" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# trim galore + +module load trim-galore/0.4.1 + +echo " * trim_galore: $(readlink -f $(which trim_galore)) " +echo " * trim_galore version: $(trim_galore --version | grep 'version' | tr -s '[:blank:]') " +echo " * cutadapt: $(readlink -f $(which cutadapt)) " +echo " * cutadapt version: $(cutadapt --version) " +echo " * FASTQ R1: $fastq_R1 " +echo " * FASTQ R2: $fastq_R2 " +echo " * FASTQ R1 TRIMMED: $fastq_R1_trim " +echo " * FASTQ R2 TRIMMED: $fastq_R2_trim " + +# --rrbs Specifies that the input file was an MspI digested RRBS sample (recognition site: CCGG) + +# optional fastqc +# --fastqc --fastqc_args ' --threads $THREADS --outdir $FASTQC_DIR ' \ + +# check for run type +if [ "$run_type" == "rrbs" ] ; then + trim_galore_flags="--length 30 --rrbs" +else + echo -e "\n $script_name unknown run type $run_type \n" >&2 + exit 1 +fi + +# adjust for single/paired end +if [ -n "$fastq_R2" ] ; then + trim_galore_flags="$trim_galore_flag --paired $fastq_R1 $fastq_R2" +else + trim_galore_flags="$trim_galore_flag $fastq_R1" +fi + +# trim_galore +bash_cmd=" +trim_galore \ +--output_dir $trim_galore_logs_dir \ +$trim_galore_flags +" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +sleep 30 + +# clean up file names + +mv -fv "$fastq_R1_trim_original" "$fastq_R1_trim" + +if [ -n "$fastq_R2" ] ; then + mv -fv "$fastq_R2_trim_original" "$fastq_R2_trim" +fi + +sleep 30 + + +######################### + + +# check that output generated + +if [ ! 
-s "$fastq_R1_trim" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq_R1_trim NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# generate summary + +# extract the relevant part of the log +reads_input=$(cat "$trim_galore_log" | grep "sequences processed in total" | tail -1 | cut -d " " -f 1) + +# single-end and paired-end trim logs have different formatting, so just manually counting number of lines +reads_surviving=$(zcat $fastq_R1_trim | wc -l) +reads_surviving=$(echo "${reads_surviving}/4" | bc) + +reads_surviving_pct=$(echo "(${reads_surviving}/${reads_input})*100" | bc -l | cut -c 1-4) +reads_surviving_pct="${reads_surviving_pct}%" + +# header for summary file +echo "#SAMPLE,TRIM INPUT READS,TRIM SURVIVING READS,TRIM SURVIVING READS %" > "$summary_csv" + +# summarize log file +echo "${sample},${reads_input},${reads_surviving},${reads_surviving_pct}" >> "$summary_csv" + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# add sample and FASTQ to sample sheet +echo "${sample},${fastq_R1_trim},${fastq_R2_trim}" >> "$samples_csv" + +sleep 30 + +# sort and remove duplicates in place in sample sheet +LC_ALL=C sort -t ',' -k1,1 -u -o "$samples_csv" "$samples_csv" + +sleep 30 + + +######################### + + + +# end diff --git a/segments/fastq-fastq-trim-trimmomatic.sh b/segments/fastq-fastq-trim-trimmomatic.sh new file mode 100755 index 0000000..c3bef39 --- /dev/null +++ b/segments/fastq-fastq-trim-trimmomatic.sh @@ -0,0 +1,196 @@ +#!/bin/bash + + +# run Trimmomatic + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ $# -lt 4 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name project_dir sample_name num_threads FASTQ_R1 [FASTQ_R2] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +threads=$3 +fastq_R1=$4 +fastq_R2=$5 + + +######################### + + +# check that inputs exist + +if [ ! -d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! 
-s "$fastq_R1" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq_R1 DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +samples_csv="${proj_dir}/samples.${segment_name}.csv" + +fastq_trim_dir="${proj_dir}/FASTQ-TRIMMED" +mkdir -p "$fastq_trim_dir" +fastq_R1_trim="${fastq_trim_dir}/${sample}_R1.trim.fastq.gz" +fastq_R2_trim="${fastq_trim_dir}/${sample}_R2.trim.fastq.gz" +fastq_R1_trim_unpaired="${fastq_trim_dir}/${sample}_R1.unpaired.fastq.gz" +fastq_R2_trim_unpaired="${fastq_trim_dir}/${sample}_R2.unpaired.fastq.gz" + +trimmomatic_logs_dir="${proj_dir}/logs-trimmomatic" +mkdir -p "$trimmomatic_logs_dir" +trimmomatic_log="${trimmomatic_logs_dir}/${sample}.txt" + + +######################### + + +# exit if output exits already + +if [ -s "$fastq_R1_trim" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# trimmomatic + +module unload java +module load java/1.7 +module load trimmomatic/0.33 + +trimmomatic_jar="${TRIMMOMATIC_ROOT}/trimmomatic-0.33.jar" + +if [ -n "$fastq_R2" ] ; then + run_type_arg="PE" + files_arg="$fastq_R1 $fastq_R2 $fastq_R1_trim $fastq_R1_trim_unpaired $fastq_R2_trim $fastq_R2_trim_unpaired" +else + run_type_arg="SE" + files_arg="$fastq_R1 $fastq_R1_trim" + fastq_R2_trim="" + fastq_R2_trim_unpaired="" +fi + +echo " * trimmomatic: $trimmomatic_jar " +echo " * RUN TYPE: $run_type_arg " +echo " * FASTQ R1: $fastq_R1 " +echo " * FASTQ R2: $fastq_R2 " +echo " * FASTQ R1 TRIMMED: $fastq_R1_trim " +echo " * FASTQ R2 TRIMMED: $fastq_R2_trim " +echo " * FASTQ R1 TRIMMED UNPAIRED: $fastq_R1_trim_unpaired " +echo " * FASTQ R2 TRIMMED UNPAIRED: $fastq_R2_trim_unpaired " + +bash_cmd=" +java -Xms16G -Xmx16G -jar $trimmomatic_jar \ +$run_type_arg \ +-threads $threads \ +$files_arg \ +ILLUMINACLIP:/ifs/home/id460/ref/contaminants/trimmomatic.fa:2:30:10:1:true \ +TRAILING:5 SLIDINGWINDOW:4:15 MINLEN:35 \ +2> $trimmomatic_log +" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +sleep 30 + +# delete unpaired files +rm -fv $fastq_R1_trim_unpaired +rm -fv $fastq_R2_trim_unpaired + + +######################### + + +# check that output generated + +if [ ! 
-s "$fastq_R1_trim" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq_R1_trim NOT GENERATED \n" >&2 + exit 1 +fi + +if grep -q -i "error" "$trimmomatic_log" ; then + echo -e "\n $script_name ERROR: LOG $trimmomatic_log CONTAINS ERROR \n" >&2 + grep -i "error" "$trimmomatic_log" + rm -fv $fastq_R1_trim + rm -fv $fastq_R2_trim + exit 1 +fi + + +######################### + + +# generate summary + +# extract the relevant part of the log + +if [ -n "$fastq_R2" ] ; then + total_reads_col="4" + surviving_reads_col="7" +else + total_reads_col="3" + surviving_reads_col="5" +fi + +reads_input=$(cat $trimmomatic_log | grep "Input Read" | cut -d ' ' -f $total_reads_col) +reads_surviving=$(cat $trimmomatic_log | grep "Input Read" | cut -d ' ' -f $surviving_reads_col) + +reads_surviving_pct=$(echo "(${reads_surviving}/${reads_input})*100" | bc -l | cut -c 1-4) +reads_surviving_pct="${reads_surviving_pct}%" + +# header for summary file +echo "#SAMPLE,TRIM INPUT READS,TRIM SURVIVING READS,TRIM SURVIVING READS %" > "$summary_csv" + +# summarize log file +echo "${sample},${reads_input},${reads_surviving},${reads_surviving_pct}" >> "$summary_csv" + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# add sample and FASTQ to sample sheet +echo "${sample},${fastq_R1_trim},${fastq_R2_trim}" >> "$samples_csv" + +sleep 30 + +# sort and remove duplicates in place in sample sheet +LC_ALL=C sort -t ',' -k1,1 -u -o "$samples_csv" "$samples_csv" + +sleep 30 + + +######################### + + + +# end diff --git a/segments/fastq-qc-fastqscreen.sh b/segments/fastq-qc-fastqscreen.sh new file mode 100755 index 0000000..6eae89d --- /dev/null +++ b/segments/fastq-qc-fastqscreen.sh @@ -0,0 +1,140 @@ +#!/bin/bash + + +# run fastq_screen + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 3 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name [project dir] [sample] [FASTQ] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +fastq=$3 + + +######################### + + +# check that inputs exist + +if [ ! -d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! -s "$fastq" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq DOES NOT EXIST \n" >&2 + exit 1 +fi + +CODE_DIR=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +FASTQSCREEN_CONF=$(bash ${CODE_DIR}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-FASTQSCREEN); + +if [ ! 
-s "$FASTQSCREEN_CONF" ] ; then + echo -e "\n $script_name ERROR: CONF $FASTQSCREEN_CONF DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +FASTQ_SCREEN_DIR="${proj_dir}/QC-fastqscreen" +mkdir -p "$FASTQ_SCREEN_DIR" + +FASTQ_SCREEN_TXT="${FASTQ_SCREEN_DIR}/${fastq/*\//}_screen.txt" +FASTQ_SCREEN_TXT="${FASTQ_SCREEN_TXT/.fastq.gz/}" + + +######################### + + +# exit if output exits already + +if [ -s "$FASTQ_SCREEN_TXT" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# fastq_screen +# ignore paired reads (in case of rna-seq, paired reads may be too far apart and will not align) + +bowtie2_bin=$(cat "$FASTQSCREEN_CONF" | grep "^BOWTIE2" | head -1 | tr '[:space:]' '\t' | tr -s '\t' | cut -f 2) +fastqscreen_bin="/ifs/home/id460/software/fastq_screen_v0.5.2/fastq_screen" + +echo " * fastq_screen: $fastqscreen_bin " +echo " * fastq_screen version: $($fastqscreen_bin --version) " +echo " * bowtie2: $bowtie2_bin " +echo " * bowtie2 version: $($bowtie2_bin --version | head -1) " +echo " * FASTQ: $fastq " +echo " * OUT TXT: $FASTQ_SCREEN_TXT " + +CMD=" +$fastqscreen_bin \ +--aligner bowtie2 \ +--subset 1000000 \ +--conf $FASTQSCREEN_CONF \ +--outdir $FASTQ_SCREEN_DIR \ +$fastq +" +echo "CMD: $CMD" +$CMD + + +######################### + + +# check that output generated + +if [ ! -s "$FASTQ_SCREEN_TXT" ] ; then + echo -e "\n $script_name ERROR: TXT $FASTQ_SCREEN_TXT NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# summary + +# combine charts into a single pdf +COMBINED_PDF="${proj_dir}/summary.fastqscreen.pdf" +rm -f "$COMBINED_PDF" +convert "${FASTQ_SCREEN_DIR}/*screen.png" "$COMBINED_PDF" + + +# combine charts into a single png + +COMBINED_PNG_3W="${proj_dir}/summary.fastqscreen.3w.png" +COMBINED_PNG_4W="${proj_dir}/summary.fastqscreen.4w.png" + +rm -f "$COMBINED_PNG_3W" +rm -f "$COMBINED_PNG_4W" + +# -geometry +20+20 = 20px x and y padding +# -tile 3x = 3 images wide +montage -geometry +20+20 -tile 3x "${FASTQ_SCREEN_DIR}/*screen.png" "$COMBINED_PNG_3W" +montage -geometry +20+20 -tile 4x "${FASTQ_SCREEN_DIR}/*screen.png" "$COMBINED_PNG_4W" + + +######################### + + + +# end diff --git a/segments/fastq-quant-rsem.sh b/segments/fastq-quant-rsem.sh new file mode 100755 index 0000000..f3f2b9b --- /dev/null +++ b/segments/fastq-quant-rsem.sh @@ -0,0 +1,249 @@ +#!/bin/bash + + +# run RSEM + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ $# -lt 5 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name [project dir] [sample] [threads] [strand] [FASTQ R1] [FASTQ R2] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +threads=$3 +strand=$4 +fastq_R1=$5 +fastq_R2=$6 + + +######################### + + +# check that inputs exist + +if [ ! -d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! -s "$fastq_R1" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq_R1 DOES NOT EXIST \n" >&2 + exit 1 +fi + +code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +rsem_genome=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-RSEM); + +if [ ! 
-s "${rsem_genome}.transcripts.fa" ] ; then + echo -e "\n $script_name ERROR: GENOME ${rsem_genome}.transcripts.fa DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.rsem.${strand}.csv" + +rsem_quant_dir="${proj_dir}/quant-RSEM" +mkdir -p "$rsem_quant_dir" +rsem_expcounts_txt="${rsem_quant_dir}/${sample}.expcounts.${strand}.txt" +rsem_tpm_txt="${rsem_quant_dir}/${sample}.TPM.${strand}.txt" +rsem_fpkm_txt="${rsem_quant_dir}/${sample}.FPKM.${strand}.txt" + +rsem_logs_dir="${proj_dir}/logs-RSEM" +mkdir -p "$rsem_logs_dir" +# RSEM_RAW_SUMMARY="${rsem_logs_dir}/${sample}.${strand}.stats.txt" +rsem_raw_genes_results="${rsem_logs_dir}/${sample}.${strand}.genes.results" +rsem_stat_cnt="${rsem_logs_dir}/${sample}.${strand}.stat/${sample}.${strand}.cnt" + +# adjust for single/paired end +if [ -n "$fastq_R2" ] ;then + fastq_flag="--paired-end $fastq_R1 $fastq_R2" +else + fastq_flag="$fastq_R1" +fi + +# adjust for strand +# fwd | transcript | cufflinks "fr-secondstrand" | htseq "yes" | picard "FIRST_READ" +# rev | rev comp of transcript | cufflinks "fr-firststrand" | htseq "reverse" | picard "SECOND_READ" +if [ "$strand" == "unstr" ] ; then + strand_flag="" +elif [ "$strand" == "fwd" ] ; then + strand_flag="--forward-prob 1" +elif [ "$strand" == "rev" ] ; then + strand_flag="--forward-prob 0" +else + echo -e "\n $script_name ERROR: INCORRECT strand $strand SPECIFIED \n" >&2 + exit 1 +fi + + +######################### + + +# exit if output exits already + +if [ -s "$rsem_expcounts_txt" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# RSEM + +module load bowtie2/2.2.6 + +rsem_bin_dir="/ifs/home/id460/software/RSEM-1.2.31/bin" + +echo " * RSEM: ${rsem_bin_dir}/rsem-calculate-expression " +echo " * RSEM version: $(${rsem_bin_dir}/rsem-calculate-expression --version) " +echo " * bowtie2: $(readlink -f $(which bowtie2)) " +echo " * bowtie2 version: $(bowtie2 --version | head -1) " +echo " * RSEM REF: $rsem_genome " +echo " * FASTQ R1: $fastq_R1 " +echo " * FASTQ R2: $fastq_R2 " +echo " * COUNTS : $rsem_expcounts_txt " +echo " * TPM : $rsem_tpm_txt " +echo " * FPKM : $rsem_fpkm_txt " + +CMD=" +${rsem_bin_dir}/rsem-calculate-expression \ +--quiet \ +--no-bam-output \ +--bowtie2 \ +--num-threads $threads \ +--estimate-rspd \ +$strand_flag \ +$fastq_flag \ +$rsem_genome \ +${rsem_logs_dir}/${sample}.${strand} +" +echo "CMD: $CMD" +eval "$CMD" + +sleep 30 + + +######################### + + +# check that output generated + +if [ ! 
-s "$rsem_raw_genes_results" ] ; then + echo -e "\n $script_name ERROR: RESULTS FILE $rsem_raw_genes_results NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# clean up the full output + +# all results +# cat $RSEM_GENES_RESULTS | sed 's/gene_id/000_gene_id/' | cut -f 1,5,6,7 | LC_ALL=C sort -k1,1 > $RSEM_GENES_RESULTS_TXT + +echo -e "#GENE\t${sample}" > "$rsem_expcounts_txt" +cat "$rsem_raw_genes_results" | grep -v "gene_id" | cut -f 1,5 | LC_ALL=C sort -k1,1 >> "$rsem_expcounts_txt" + +echo -e "#GENE\t${sample}" > "$rsem_tpm_txt" +cat "$rsem_raw_genes_results" | grep -v "gene_id" | cut -f 1,6 | LC_ALL=C sort -k1,1 >> "$rsem_tpm_txt" + +echo -e "#GENE\t${sample}" > "$rsem_fpkm_txt" +cat "$rsem_raw_genes_results" | grep -v "gene_id" | cut -f 1,7 | LC_ALL=C sort -k1,1 >> "$rsem_fpkm_txt" + + +######################### + + +# generate summary + +# *.cnt file contains alignment statistics based purely on the alignment results obtained from aligners +# N0 N1 N2 N_tot # N0, number of unalignable reads; N1, number of alignable reads; N2, number of filtered reads due to too many alignments; N_tot = N0 + N1 + N2 +# nUnique nMulti nUncertain # nUnique, number of reads aligned uniquely to a gene; +# # nMulti, number of reads aligned to multiple genes; nUnique + nMulti = N1; +# # nUncertain, number of reads aligned to multiple locations in the given reference sequences, which include isoform-level multi-mapping reads +# nHits read_type # nHits, number of total alignments. +# # read_type: 0, single-end read, no quality score; 1, single-end read, with quality score; 2, paired-end read, no quality score; 3, paired-end read, with quality score + +# extract read counts +reads_input=$( cat "$rsem_stat_cnt" | head -1 | cut -d " " -f 4) +reads_map=$( cat "$rsem_stat_cnt" | head -1 | cut -d " " -f 2) +reads_map_unique=$(cat "$rsem_stat_cnt" | head -2 | tail -1 | cut -d " " -f 1) +reads_map_multi=$( cat "$rsem_stat_cnt" | head -2 | tail -1 | cut -d " " -f 2) + +# calculate ratios +map_pct=$( echo "(${reads_map}/${reads_input})*100" | bc -l | cut -c 1-4) +map_unique_pct=$(echo "(${reads_map_unique}/${reads_input})*100" | bc -l | cut -c 1-4) +map_multi_pct=$( echo "(${reads_map_multi}/${reads_input})*100" | bc -l | cut -c 1-4) + +map_pct="${map_pct}%" +map_unique_pct="${map_unique_pct}%" +map_multi_pct="${map_multi_pct}%" + +# header for summary file +echo "#SAMPLE,INPUT READS,MAPPED READS,UNIQUELY MAPPED,MULTI-MAPPED,\ +MAPPED READS,UNIQUELY MAPPED,MULTI-MAPPED" > "$summary_csv" + +# summarize log file +echo "${sample},${reads_input},${reads_map},${reads_map_unique},${reads_map_multi},\ +${map_pct},${map_unique_pct},${map_multi_pct}" >> "$summary_csv" + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.rsem.${strand}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.rsem.${strand}.csv" + + +######################### + + +# generate counts matrix for all samples +# merged counts filename with sample included to avoid conflicts during join process + +merged_counts_file="${proj_dir}/quant.rsem.expcounts.${strand}.txt" +count_file_list=$(find "$rsem_quant_dir" -type f -name "*.expcounts.${strand}.txt" | LC_ALL=C sort | tr '\n' ' ') + +CMD="bash ${code_dir}/scripts/join-many.sh $'\t' 0 $count_file_list > ${merged_counts_file}.${sample}.tmp" +(eval "$CMD") +mv -fv "${merged_counts_file}.${sample}.tmp" "$merged_counts_file" + + +merged_counts_file="${proj_dir}/quant.rsem.TPM.${strand}.txt" +count_file_list=$(find "$rsem_quant_dir" -type f -name "*.TPM.${strand}.txt" | 
+
+
+merged_counts_file="${proj_dir}/quant.rsem.TPM.${strand}.txt"
+count_file_list=$(find "$rsem_quant_dir" -type f -name "*.TPM.${strand}.txt" | LC_ALL=C sort | tr '\n' ' ')
+
+CMD="bash ${code_dir}/scripts/join-many.sh $'\t' 0 $count_file_list > ${merged_counts_file}.${sample}.tmp"
+(eval "$CMD")
+mv -fv "${merged_counts_file}.${sample}.tmp" "$merged_counts_file"
+
+
+merged_counts_file="${proj_dir}/quant.rsem.FPKM.${strand}.txt"
+count_file_list=$(find "$rsem_quant_dir" -type f -name "*.FPKM.${strand}.txt" | LC_ALL=C sort | tr '\n' ' ')
+
+CMD="bash ${code_dir}/scripts/join-many.sh $'\t' 0 $count_file_list > ${merged_counts_file}.${sample}.tmp"
+(eval "$CMD")
+mv -fv "${merged_counts_file}.${sample}.tmp" "$merged_counts_file"
+
+
+#########
+
+
+
+# end