diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..cc82a48 --- /dev/null +++ b/.gitignore @@ -0,0 +1,122 @@ + +# Created by https://www.gitignore.io/ + +### OSX ### +*.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + + +### Windows ### +# Windows image file caches +Thumbs.db +ehthumbs.db + +# Folder config file +Desktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msm +*.msp + +# Windows shortcuts +*.lnk + + +### R ### +# History files +.Rhistory +.Rapp.history + +# Session Data files +.RData + +# Example code in package build process +*-Ex.R + +# Output files from R CMD build +/*.tar.gz + +# Output files from R CMD check +/*.Rcheck/ + +# RStudio files +.Rproj.user/ + +# produced vignettes +vignettes/*.html +vignettes/*.pdf + +# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 +.httr-oauth + +# knitr and R markdown default cache directories +/*_cache/ +/cache/ + +# Temporary files created by R markdown +*.utf8.md +*.knit.md + + +### Perl ### +/blib/ +/.build/ +_build/ +cover_db/ +inc/ +Build +!Build/ +Build.bat +.last_cover_stats +/Makefile +/Makefile.old +/MANIFEST.bak +/META.yml +/META.json +/MYMETA.* +nytprof.out +/pm_to_blib +*.o +*.bs +/_eumm/ diff --git a/gather-fastqs b/gather-fastqs new file mode 100755 index 0000000..d3e8861 --- /dev/null +++ b/gather-fastqs @@ -0,0 +1,133 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use List::MoreUtils qw(uniq); + +my $HELP = <>", $filename); + + # process each fastq + while (my $fastq_r1 = shift(@fastqs)) { + chomp($fastq_r1); + + # check that R1 exists + unless ( -e $fastq_r1 ) { + die "\n\n ERROR! 
$fastq_r1 DOES NOT EXIST \n\n"; + } + + # generate R2 filename + my $fastq_r2 = $fastq_r1; + $fastq_r2 =~ s/(.*)_R1_00(.*.fastq.gz)/${1}_R2_00${2}/; + $fastq_r2 =~ s/(.*)_R1.fastq.gz/${1}_R2.fastq.gz/; + $fastq_r2 =~ s/(.*)_1.fastq.gz/${1}_2.fastq.gz/; + + # blank if R2 does not exist + unless ( -e $fastq_r2 ) { + $fastq_r2 = ""; + } + + # blank if R2 is same as R1 (in case of not standard file name, for example) + if ( $fastq_r1 eq $fastq_r2 ) { + $fastq_r2 = ""; + } + + # count based on read type + if ( length($fastq_r2) ) { + $reads_pe++; + } + else { + $reads_se++; + } + + # extract sample name + my $sample = $fastq_r1; + # remove directory structure + $sample =~ s/.*\///; + # bcl2fastq2 format (with S sample number) + $sample =~ s/_S[0-9]{1,3}_L00[0-9]_R1.*//; + # bcl2fastq format with 2 barcodes + $sample =~ s/_[ACTG]{6,}-[ACTG]{6,}_L00[0-9]_R1.*//; + # bcl2fastq format with 1 barcode + $sample =~ s/_[ACTG]{4,}_L00[0-9]_R1.*//; + # no barcodes + $sample =~ s/_L00[0-9]_R[12].*//; + # no barcodes or lane + $sample =~ s/_R[12].fastq.gz//; + # no barcodes or lane + $sample =~ s/_[12].fastq.gz//; + + push @samples, $sample; + + # show progress + print STDERR " SAMPLE : $sample \n"; + print STDERR " FASTQ R1 : $fastq_r1 \n"; + print STDERR " FASTQ R2 : $fastq_r2 \n"; + + # print sample table line + my $output = "${sample},${fastq_r1},${fastq_r2}\n"; + print $fh "$output"; + + } + close($fh); + + # remove duplicate entries + system("cat $filename | LC_ALL=C sort | uniq > ${filename}.tmp && mv -f ${filename}.tmp $filename"); + + # get number of unique sample names + my $num_files = @samples; + @samples = uniq(@samples); + my $num_samples = @samples; + + # print stats + print STDERR "\n"; + print STDERR " NUMBER OF SAMPLES : $num_samples \n"; + print STDERR " NUMBER OF SINGLE FILES : $reads_se \n"; + print STDERR " NUMBER OF PAIRED FILES : $reads_pe \n"; + +} + + + +# end diff --git a/generate-settings b/generate-settings new file mode 100755 index 0000000..91ff2b7 --- /dev/null +++ b/generate-settings @@ -0,0 +1,65 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use File::Basename; + +my $HELP = <", $settings_file); + # print $fh $genome_setting; + print $fh $genome_dir_setting; + close $fh; + + # set fasta to test genome dir setting + # system("bash ${pipeline_dir}/scripts/get-set-setting.sh $settings_file REF-FASTA"); + + # get values to make sure they were set properly + my $settings_genome = `bash ${pipeline_dir}/scripts/get-set-setting.sh $settings_file GENOME-DIR`; + my $settings_fasta = `bash ${pipeline_dir}/scripts/get-set-setting.sh $settings_file REF-FASTA`; + + # print values + print STDERR "\n"; + print STDERR " REF DIR : $settings_genome \n"; + print STDERR " REF FASTA : $settings_fasta \n"; + +} + + + +# end diff --git a/routes/comp-rna-star-dge.sh b/routes/comp-rna-star-dge.sh new file mode 100755 index 0000000..f0e83a9 --- /dev/null +++ b/routes/comp-rna-star-dge.sh @@ -0,0 +1,183 @@ +#!/bin/bash + + +## +## RNA-seq differential gene expression using DESeq2 +## + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +route_name=${script_name/%.sh/} +echo -e "\n ========== ROUTE: $route_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! 
$# == 1 ] ; then
+	echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2
+	echo -e "\n USAGE: $script_name project_dir \n" >&2
+	exit 1
+fi
+
+# standard comparison route arguments
+proj_dir=$(readlink -f "$1")
+
+# additional settings
+code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")")
+qsub_dir="${proj_dir}/logs-qsub"
+
+# display settings
+echo " * proj_dir: $proj_dir "
+echo " * code_dir: $code_dir "
+echo " * qsub_dir: $qsub_dir "
+
+
+#########################
+
+
+# delete empty qsub .po files
+rm -f ${qsub_dir}/sns.*.po*
+
+
+#########################
+
+
+# check that inputs exist
+
+if [ ! -d "$proj_dir" ] ; then
+	echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2
+	exit 1
+fi
+
+gene_info_table="${proj_dir}/genes.featurecounts.txt"
+
+if [ ! -s "$gene_info_table" ] ; then
+	echo -e "\n $script_name ERROR: GENE INFO $gene_info_table DOES NOT EXIST \n" >&2
+	exit 1
+fi
+
+groups_table="${proj_dir}/samples.groups.csv"
+
+if [ ! -s "$groups_table" ] ; then
+	echo -e "\n $script_name ERROR: GROUP TABLE $groups_table DOES NOT EXIST \n" >&2
+	exit 1
+fi
+
+num_samples=$(cat "$groups_table" | grep -v "#SAMPLE" | wc -l)
+
+if [ "$num_samples" -lt 3 ] ; then
+	echo -e "\n $script_name ERROR: $num_samples SAMPLES IS TOO FEW (AT LEAST 3 REQUIRED) \n" >&2
+	exit 1
+fi
+
+strand=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" EXP-STRAND);
+counts_table="${proj_dir}/quant.featurecounts.counts.${strand}.txt"
+
+if [ ! -s "$counts_table" ] ; then
+	echo -e "\n $script_name ERROR: COUNTS TABLE $counts_table DOES NOT EXIST \n" >&2
+	exit 1
+fi
+
+
+#########################
+
+
+# settings and files
+
+dge_dir="${proj_dir}/DGE-DESeq2-${strand}-${num_samples}samples"
+
+mkdir -p "$dge_dir"
+
+input_groups_table="${dge_dir}/input.groups.csv"
+input_counts_table="${dge_dir}/input.counts.txt"
+
+
+#########################
+
+
+# exit if output exists already
+
+if [ -s "$input_groups_table" ] ; then
+	echo -e "\n $script_name ERROR: TABLE $input_groups_table ALREADY EXISTS \n" >&2
+	exit 1
+fi
+
+if [ -s "$input_counts_table" ] ; then
+	echo -e "\n $script_name ERROR: TABLE $input_counts_table ALREADY EXISTS \n" >&2
+	exit 1
+fi
+
+
+#########################
+
+
+echo -e "\n ========== set up inputs ========== \n"
+
+echo
+echo " * counts table: $counts_table "
+echo " * groups table: $groups_table "
+echo
+
+bash_cmd="rsync -t $counts_table $input_counts_table"
+echo "CMD: $bash_cmd"
+($bash_cmd)
+
+bash_cmd="rsync -t $groups_table $input_groups_table"
+echo "CMD: $bash_cmd"
+($bash_cmd)
+
+sleep 3
+
+
+#########################
+
+
+echo -e "\n ========== test R ========== \n"
+
+# load relevant modules
+module unload r
+module load r/3.3.0
+module unload zlib
+
+echo
+echo " * R: $(readlink -f $(which R)) "
+echo " * R version: $(R --version | head -1) "
+echo " * Rscript: $(readlink -f $(which Rscript)) "
+echo " * Rscript version: $(Rscript --version 2>&1) "
+echo
+
+# test R installation and settings
+bash_cmd="Rscript --vanilla ${code_dir}/scripts/test-packages.R"
+echo "CMD: $bash_cmd"
+($bash_cmd)
+
+sleep 3
+
+
+#########################
+
+
+echo -e "\n ========== start analysis ========== \n"
+
+cd "$dge_dir" || exit 1
+
+# run the differential expression analysis
+bash_cmd="Rscript --vanilla ${code_dir}/scripts/dge-deseq2.R $input_counts_table $input_groups_table"
+echo "CMD: $bash_cmd"
+($bash_cmd)
+
+
+#########################
+
+
+# delete empty qsub .po files
+rm -f ${qsub_dir}/sns.*.po*
+
+
+#########################
+
+
+date
+
+
+
+# end
diff
--git a/routes/rna-rsem.sh b/routes/rna-rsem.sh new file mode 100755 index 0000000..60a2dc6 --- /dev/null +++ b/routes/rna-rsem.sh @@ -0,0 +1,104 @@ +#!/bin/bash + + +## +## RNA-seq using RSEM aligner +## + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +route_name=${script_name/%.sh/} +echo -e "\n ========== ROUTE: $route_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 2 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name project_dir sample_name \n" >&2 + exit 1 +fi + +# standard route arguments +proj_dir=$(readlink -f "$1") +sample=$2 + +# additional settings +threads=$NSLOTS +code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +qsub_dir="${proj_dir}/logs-qsub" + +# display settings +echo " * proj_dir: $proj_dir " +echo " * sample: $sample " +echo " * code_dir: $code_dir " +echo " * qsub_dir: $qsub_dir " +echo " * threads: $threads " + + +######################### + + +# delete empty qsub .po files +rm -f ${qsub_dir}/sns.*.po* + + +######################### + + +# segments + +# rename and/or merge raw input FASTQs +segment_fastq_clean="fastq-fastq-clean" +fastq_R1=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 2) +fastq_R2=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 3) +if [ -z "$fastq_R1" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_fastq_clean}.sh $proj_dir $sample" + ($bash_cmd) + fastq_R1=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 2) + fastq_R2=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 3) +fi + +# fastq_screen +bash_cmd="bash ${code_dir}/segments/fastq-qc-fastqscreen.sh $proj_dir $sample $fastq_R1" +($bash_cmd) + +# RSEM +segment_quant="fastq-quant-rsem" +bash_cmd="bash ${code_dir}/segments/${segment_quant}.sh $proj_dir $sample $threads unstr $fastq_R1 $fastq_R2" +($bash_cmd) + + +######################### + + +# combine summary from each step + +sleep 30 + +summary_csv="${proj_dir}/summary-combined.rna-rsem.csv" + +bash_cmd=" +bash ${code_dir}/scripts/join-many.sh , X \ +${proj_dir}/summary.${segment_fastq_clean}.csv \ +${proj_dir}/summary.x.csv \ +${proj_dir}/summary.x.csv \ +> $summary_csv +" +(eval $bash_cmd) + + +######################### + + +# delete empty qsub .po files +rm -f ${qsub_dir}/sns.*.po* + + +######################### + + +date + + + +# end diff --git a/routes/rna-star.sh b/routes/rna-star.sh new file mode 100755 index 0000000..a1db895 --- /dev/null +++ b/routes/rna-star.sh @@ -0,0 +1,161 @@ +#!/bin/bash + + +## +## RNA-seq using STAR aligner +## + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +route_name=${script_name/%.sh/} +echo -e "\n ========== ROUTE: $route_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! 
$# == 2 ] ; then
+	echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2
+	echo -e "\n USAGE: $script_name [project dir] [sample name] \n" >&2
+	exit 1
+fi
+
+# standard route arguments
+proj_dir=$(readlink -f "$1")
+sample=$2
+
+# additional settings
+threads=$NSLOTS
+code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")")
+qsub_dir="${proj_dir}/logs-qsub"
+
+# display settings
+echo " * proj_dir: $proj_dir "
+echo " * sample: $sample "
+echo " * code_dir: $code_dir "
+echo " * qsub_dir: $qsub_dir "
+echo " * threads: $threads "
+
+
+#########################
+
+
+# delete empty qsub .po files
+rm -f ${qsub_dir}/sns.*.po*
+
+
+#########################
+
+
+# segments
+
+# rename and/or merge raw input FASTQs
+segment_fastq_clean="fastq-fastq-clean"
+fastq_R1=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 2)
+fastq_R2=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 3)
+if [ -z "$fastq_R1" ] ; then
+	bash_cmd="bash ${code_dir}/segments/${segment_fastq_clean}.sh $proj_dir $sample"
+	($bash_cmd)
+	fastq_R1=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 2)
+	fastq_R2=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 3)
+fi
+
+# fastq_screen
+bash_cmd="bash ${code_dir}/segments/fastq-qc-fastqscreen.sh $proj_dir $sample $fastq_R1"
+($bash_cmd)
+
+# run STAR
+segment_align="fastq-bam-star"
+bam_star=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_align}.csv" | cut -d ',' -f 2)
+if [ -z "$bam_star" ] ; then
+	bash_cmd="bash ${code_dir}/segments/${segment_align}.sh $proj_dir $sample $threads $fastq_R1 $fastq_R2"
+	($bash_cmd)
+	bam_star=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_align}.csv" | cut -d ',' -f 2)
+fi
+
+# generate BigWig (deeptools)
+segment_bigwig_deeptools="bam-bigwig-deeptools"
+bash_cmd="bash ${code_dir}/segments/${segment_bigwig_deeptools}.sh $proj_dir $sample 4 $bam_star"
+qsub_cmd="qsub -N sns.${segment_bigwig_deeptools}.${sample} -M ${USER}@nyumc.org -m a -j y -cwd -pe threaded 4 -b y ${bash_cmd}"
+$qsub_cmd
+
+# generate BigWig (bedtools)
+segment_bigwig_bedtools="bam-bigwig-bedtools"
+bash_cmd="bash ${code_dir}/segments/${segment_bigwig_bedtools}.sh $proj_dir $sample $bam_star"
+qsub_cmd="qsub -N sns.${segment_bigwig_bedtools}.${sample} -M ${USER}@nyumc.org -m a -j y -cwd -b y ${bash_cmd}"
+$qsub_cmd
+
+# Picard CollectRnaSeqMetrics
+segment_qc_picard="bam-qc-picard-rnaseqmetrics"
+bash_cmd="bash ${code_dir}/segments/${segment_qc_picard}.sh $proj_dir $sample $bam_star"
+($bash_cmd)
+
+# determine run type for featurecounts
+if [ -n "$fastq_R2" ] ; then
+	run_type="PE"
+else
+	run_type="SE"
+fi
+
+# determine strand
+exp_strand=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" EXP-STRAND);
+
+# generate counts
+segment_quant="bam-quant-featurecounts"
+bash_cmd="bash ${code_dir}/segments/${segment_quant}.sh $proj_dir $sample $threads $bam_star $run_type $exp_strand"
+($bash_cmd)
+
+# generate unstranded counts just in case
+if [ "$exp_strand" != "unstr" ] ; then
+	bash_cmd="bash ${code_dir}/segments/${segment_quant}.sh $proj_dir $sample $threads $bam_star $run_type unstr"
+	($bash_cmd)
+fi
+
+
+#########################
+
+
+# combine summary from each step
+
+sleep 30
+
+summary_csv="${proj_dir}/summary-combined.rna-star.csv"
+
+bash_cmd="
+bash ${code_dir}/scripts/join-many.sh , X \
+${proj_dir}/summary.${segment_fastq_clean}.csv \
+${proj_dir}/summary.${segment_align}.csv \
+${proj_dir}/summary.${segment_quant}-unstr.csv \
+${proj_dir}/summary.${segment_quant}-${exp_strand}.csv \
+${proj_dir}/summary.${segment_qc_picard}.csv \
+> $summary_csv
+"
+(eval $bash_cmd)
+
+
+#########################
+
+
+# generate groups sample sheet template
+
+samples_groups_csv="${proj_dir}/samples.groups.csv"
+
+if [ ! -s "$samples_groups_csv" ] ; then
+	echo "#SAMPLE,group" > $samples_groups_csv
+	sed 's/\,.*/,NA/g' "${proj_dir}/samples.fastq-raw.csv" | LC_ALL=C sort -u >> $samples_groups_csv
+fi
+
+
+#########################
+
+
+# delete empty qsub .po files
+rm -f ${qsub_dir}/sns.*.po*
+
+
+#########################
+
+
+date
+
+
+
+# end
diff --git a/routes/rrbs.sh b/routes/rrbs.sh
new file mode 100755
index 0000000..2932292
--- /dev/null
+++ b/routes/rrbs.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+
+
+##
+## RRBS using Bismark
+##
+
+
+# script filename
+script_name=$(basename "${BASH_SOURCE[0]}")
+route_name=${script_name/%.sh/}
+echo -e "\n ========== ROUTE: $route_name ========== \n" >&2
+
+# check for correct number of arguments
+if [ ! $# == 2 ] ; then
+	echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2
+	echo -e "\n USAGE: $script_name project_dir sample_name \n" >&2
+	exit 1
+fi
+
+# standard route arguments
+proj_dir=$(readlink -f "$1")
+sample=$2
+
+# additional settings
+threads=$NSLOTS
+code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")")
+qsub_dir="${proj_dir}/logs-qsub"
+
+# display settings
+echo " * proj_dir: $proj_dir "
+echo " * sample: $sample "
+echo " * code_dir: $code_dir "
+echo " * qsub_dir: $qsub_dir "
+echo " * threads: $threads "
+
+
+#########################
+
+
+# delete empty qsub .po files
+rm -f ${qsub_dir}/sns.*.po*
+
+
+#########################
+
+
+# segments
+
+# rename and/or merge raw input FASTQs
+# (sample sheet names follow the samples.<segment script name>.csv convention used by the other routes)
+fastq_R1=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.fastq-fastq-clean.csv" | cut -d ',' -f 2)
+fastq_R2=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.fastq-fastq-clean.csv" | cut -d ',' -f 3)
+if [ -z "$fastq_R1" ] ; then
+	bash_cmd="bash ${code_dir}/segments/fastq-fastq-clean.sh $proj_dir $sample"
+	($bash_cmd)
+	fastq_R1=$(grep -m 1 "^${sample}," "${proj_dir}/samples.fastq-fastq-clean.csv" | cut -d ',' -f 2)
+	fastq_R2=$(grep -m 1 "^${sample}," "${proj_dir}/samples.fastq-fastq-clean.csv" | cut -d ',' -f 3)
+fi
+
+# trim FASTQs with Trim Galore
+fastq_R1_trimmed=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.fastq-fastq-trim-trimgalore.csv" | cut -d ',' -f 2)
+fastq_R2_trimmed=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.fastq-fastq-trim-trimgalore.csv" | cut -d ',' -f 3)
+if [ -z "$fastq_R1_trimmed" ] ; then
+	bash_cmd="bash ${code_dir}/segments/fastq-fastq-trim-trimgalore.sh $proj_dir $sample rrbs $fastq_R1 $fastq_R2"
+	($bash_cmd)
+	fastq_R1_trimmed=$(grep -m 1 "^${sample}," "${proj_dir}/samples.fastq-fastq-trim-trimgalore.csv" | cut -d ',' -f 2)
+	fastq_R2_trimmed=$(grep -m 1 "^${sample}," "${proj_dir}/samples.fastq-fastq-trim-trimgalore.csv" | cut -d ',' -f 3)
+fi
+
+# run Bismark alignment
+bam_bismark=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.fastq-bam-bismark.csv" | cut -d ',' -f 2)
+if [ -z "$bam_bismark" ] ; then
+	bash_cmd="bash ${code_dir}/segments/fastq-bam-bismark.sh $proj_dir $sample $threads $fastq_R1_trimmed $fastq_R2_trimmed"
+	($bash_cmd)
+	bam_bismark=$(grep -m 1 "^${sample}," "${proj_dir}/samples.fastq-bam-bismark.csv" | cut -d ',' -f 2)
+fi
+
+# run Bismark methylation extractor
+if [ -n "$fastq_R2" ] ; then
+	# PE methylation extraction is not implemented in this route yet
+	echo "pe"
+else
+	bash_cmd="bash ${code_dir}/segments/bam-meth-bismark.sh $proj_dir $sample $threads $bam_bismark se"
+	($bash_cmd)
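+	# run the extractor a second time with the "se-ignore-r1-3" analysis type,
+	# which (judging by its name) skips the first 3 bp of each read, where
+	# RRBS end-repair artifacts can distort methylation calls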
+ bash_cmd="bash ${code_dir}/segments/bam-meth-bismark.sh $proj_dir $sample $threads $bam_bismark se-ignore-r1-3" + ($bash_cmd) +fi + + +######################### + + +# delete empty qsub .po files +rm -f ${qsub_dir}/sns.*.po* + + +######################### + + +date + + + +# end diff --git a/routes/wes.sh b/routes/wes.sh new file mode 100755 index 0000000..37d8a05 --- /dev/null +++ b/routes/wes.sh @@ -0,0 +1,128 @@ +#!/bin/bash + + +## +## whole genome/exome/targeted variant discovery using BWA-MEM and GATK +## + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +route_name=${script_name/%.sh/} +echo -e "\n ========== ROUTE: $route_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 2 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name project_dir sample_name \n" >&2 + exit 1 +fi + +# standard route arguments +proj_dir=$(readlink -f "$1") +sample=$2 + +# additional settings +threads=$NSLOTS +code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +qsub_dir="${proj_dir}/logs-qsub" + +# display settings +echo " * proj_dir: $proj_dir " +echo " * sample: $sample " +echo " * code_dir: $code_dir " +echo " * qsub_dir: $qsub_dir " +echo " * threads: $threads " + + +######################### + + +# delete empty qsub .po files +rm -f ${qsub_dir}/sns.*.po* + + +######################### + + +# segments + +# rename and/or merge raw input FASTQs +segment_fastq_clean="fastq-fastq-clean" +fastq_R1=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 2) +fastq_R2=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 3) +if [ -z "$fastq_R1" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_fastq_clean}.sh $proj_dir $sample" + ($bash_cmd) + fastq_R1=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 2) + fastq_R2=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 3) +fi + +# trim FASTQs with Trimmomatic +segment_fastq_trim="fastq-fastq-trim-trimmomatic" +fastq_R1_trimmed=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_trim}.csv" | cut -d ',' -f 2) +fastq_R2_trimmed=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_trim}.csv" | cut -d ',' -f 3) +if [ -z "$fastq_R1_trimmed" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_fastq_trim}.sh $proj_dir $sample $threads $fastq_R1 $fastq_R2" + ($bash_cmd) + fastq_R1_trimmed=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_trim}.csv" | cut -d ',' -f 2) + fastq_R2_trimmed=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_trim}.csv" | cut -d ',' -f 3) +fi + +# run BWA-MEM alignment +segment_align="fastq-bam-bwa-mem" +bam=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_align}.csv" | cut -d ',' -f 2) +if [ -z "$bam_bwa" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_align}.sh $proj_dir $sample $threads $fastq_R1_trimmed $fastq_R2_trimmed" + ($bash_cmd) + bam=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_align}.csv" | cut -d ',' -f 2) +fi + +# remove duplicates +segment_dedup="bam-bam-dd-sambamba" +bam_dd=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_dedup}.csv" | cut -d ',' -f 2) +if [ -z "$bam_dd" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_dedup}.sh $proj_dir $sample $threads $bam" + ($bash_cmd) + bam_dd=$(grep -m 1 "^${sample}," 
"${proj_dir}/samples.${segment_dedup}.csv" | cut -d ',' -f 2) +fi + + + + + +######################### + + +# combine summary from each step + +sleep 30 + +summary_csv="${proj_dir}/summary-combined.wes.csv" + +bash_cmd=" +bash ${code_dir}/scripts/join-many.sh , X \ +${proj_dir}/summary.${segment_fastq_clean}.csv \ +${proj_dir}/summary.${segment_fastq_trim}.csv \ +${proj_dir}/summary.${segment_align}.csv \ +${proj_dir}/summary.${segment_dedup}.csv \ +> $summary_csv +" +(eval $bash_cmd) + + +######################### + + +# delete empty qsub .po files +rm -f ${qsub_dir}/sns.*.po* + + +######################### + + +date + + + +# end diff --git a/routes/wgbs.sh b/routes/wgbs.sh new file mode 100755 index 0000000..52d69a6 --- /dev/null +++ b/routes/wgbs.sh @@ -0,0 +1,121 @@ +#!/bin/bash + + +## +## WGBS using Bismark +## + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +route_name=${script_name/%.sh/} +echo -e "\n ========== ROUTE: $route_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 2 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name project_dir sample_name \n" >&2 + exit 1 +fi + +# standard route arguments +proj_dir=$(readlink -f "$1") +sample=$2 + +# additional settings +threads=$NSLOTS +code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +qsub_dir="${proj_dir}/logs-qsub" + +# display settings +echo " * proj_dir: $proj_dir " +echo " * sample: $sample " +echo " * code_dir: $code_dir " +echo " * qsub_dir: $qsub_dir " +echo " * threads: $threads " + + +######################### + + +# delete empty qsub .po files +rm -f ${qsub_dir}/sns.*.po* + + +######################### + + +# segments + +# rename and/or merge raw input FASTQs +segment_fastq_clean="fastq-fastq-clean" +fastq_R1=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 2) +fastq_R2=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 3) +if [ -z "$fastq_R1" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_fastq_clean}.sh $proj_dir $sample" + ($bash_cmd) + fastq_R1=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 2) + fastq_R2=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_clean}.csv" | cut -d ',' -f 3) +fi + +# trim FASTQs with Trimmomatic +segment_fastq_trim="fastq-fastq-trim-trimmomatic" +fastq_R1_trimmed=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_trim}.csv" | cut -d ',' -f 2) +fastq_R2_trimmed=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_trim}.csv" | cut -d ',' -f 3) +if [ -z "$fastq_R1_trimmed" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_fastq_trim}.sh $proj_dir $sample $threads $fastq_R1 $fastq_R2" + ($bash_cmd) + fastq_R1_trimmed=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_trim}.csv" | cut -d ',' -f 2) + fastq_R2_trimmed=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_fastq_trim}.csv" | cut -d ',' -f 3) +fi + +# run Bismark alignment +segment_align="fastq-bam-bismark" +bam_bismark=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_align}.csv" | cut -d ',' -f 2) +if [ -z "$bam_bismark" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_align}.sh $proj_dir $sample $threads $fastq_R1_trimmed $fastq_R2_trimmed" + ($bash_cmd) + bam_bismark=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_align}.csv" | cut -d ',' -f 2) +fi + +# run Bismark dedup 
+segment_dedup="bam-bam-dd-bismark" +bam_dd_bismark=$(grep -s -m 1 "^${sample}," "${proj_dir}/samples.${segment_dedup}.csv" | cut -d ',' -f 2) +if [ -z "$bam_dd_bismark" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_dedup}.sh $proj_dir $sample $bam_bismark pe" + ($bash_cmd) + bam_dd_bismark=$(grep -m 1 "^${sample}," "${proj_dir}/samples.${segment_dedup}.csv" | cut -d ',' -f 2) +fi + +bam_bismark="$bam_dd_bismark" + +# run Bismark methylation extractor +segment_meth="bam-meth-bismark" +if [ -n "$fastq_R2" ] ; then + bash_cmd="bash ${code_dir}/segments/${segment_meth}.sh $proj_dir $sample $threads $bam_bismark pe" + ($bash_cmd) + bash_cmd="bash ${code_dir}/segments/${segment_meth}.sh $proj_dir $sample $threads $bam_bismark pe-ignore-r2-3" + ($bash_cmd) +else + bash_cmd="bash ${code_dir}/segments/${segment_meth}.sh $proj_dir $sample $threads $bam_bismark se" + ($bash_cmd) + bash_cmd="bash ${code_dir}/segments/${segment_meth}.sh $proj_dir $sample $threads $bam_bismark se-ignore-r1-3" + ($bash_cmd) +fi + + +######################### + + +# delete empty qsub .po files +rm -f ${qsub_dir}/sns.*.po* + + +######################### + + +date + + + +# end diff --git a/run b/run new file mode 100755 index 0000000..80da2be --- /dev/null +++ b/run @@ -0,0 +1,162 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use File::Basename; +use File::Spec; +use Cwd; +use Cwd 'abs_path'; + +my $HELP = <rel2abs(__FILE__); + $code_dir = dirname($code_dir); + chomp($code_dir); + print "code dir: $code_dir \n"; + + # check that settings file exists + my $settings_txt = "${project_dir}/settings.txt"; + unless ( -e $settings_txt ) { + die "\n\n ERROR: $settings_txt DOES NOT EXIST \n\n"; + } + + # check that the specified route exists + my $route_script = "${code_dir}/routes/${route}.sh"; + unless ( -e $route_script ) { + die "\n\n ERROR: $route DOES NOT EXIST \n\n"; + } + + # move to qsub logs dir so the qsub output ends up there + my $qsub_logs_dir = "${project_dir}/logs-qsub"; + system("mkdir -p $qsub_logs_dir"); + print "logs dir: $qsub_logs_dir \n"; + chdir($qsub_logs_dir); + + # check if standard route (run for each sample) or comparison route (compare samples) + if ($route =~ "comp-") { + + # comparison route + + # sample sheet specifying groups + my $samples_groups_csv = abs_path("${project_dir}/samples.groups.csv"); + print "sample sheet: $samples_groups_csv \n"; + + # check that sample sheet exists + unless ( -e $samples_groups_csv ) { + die "\n\n ERROR: $samples_groups_csv DOES NOT EXIST \n\n"; + } + + system("bash ${code_dir}/scripts/fix-csv.sh $samples_groups_csv"); + + print "\n process comparison $route \n"; + + # route command + my $cmd = "bash $route_script $project_dir"; + + system($cmd); + + } + else { + + # standard route + + # remove problematic characters from any existing samples sheets + my @sample_csvs = `ls -1 ${project_dir}/samples.*.csv`; + chomp(@sample_csvs); + while (my $csv = shift(@sample_csvs)) { + print "sample sheet: $csv \n"; + system("bash ${code_dir}/scripts/fix-csv.sh $csv"); + } + + # check that raw fastq sample sheet exists + my $samples_fastq_csv = abs_path("${project_dir}/samples.fastq-raw.csv"); + unless ( -e $samples_fastq_csv ) { + die "\n\n ERROR: $samples_fastq_csv DOES NOT EXIST \n\n"; + } + + # find unique samples based on the sample sheet + my @unique_samples = `cat $samples_fastq_csv | cut -d ',' -f 1 | uniq`; + chomp(@unique_samples); + + # adjust number of requested threads per sample depending on total number of samples + my $num_samples = 
@unique_samples; + my $threads; + if ($num_samples < 15) { + $threads = "8-16"; + } + elsif ($num_samples < 50) { + $threads = "4-8"; + } + else { + $threads = "4"; + } + + # process each sample + my $current_sample = 1; + while (my $sample = shift(@unique_samples)) { + + print "\n process sample $sample \n"; + + # route command + my $cmd = "bash $route_script $project_dir $sample"; + + # check for "test" argument to test one sample without qsub + if (defined($ARGV[1]) && $ARGV[1] eq "test") { + print "\n\n test run started \n\n"; + system("export NSLOTS=4 && $cmd"); + print "\n\n test run finished \n\n"; + exit; + } + + # qsub command (run as binary bash script to keep all environment variables) + my $email = '${USER}@nyumc.org'; + $email = `echo $email`; + chomp($email); + my $qsub_name = "sns.${route}.${sample}"; + my $qsub_cmd = "qsub -N $qsub_name -M $email -m a -j y -cwd -pe threaded $threads -b y $cmd"; + print "\n CMD: $qsub_cmd \n"; + system($qsub_cmd); + + # pause between samples (sometimes job scheduler gets overwhelmed) + sleep(5); + + # extra pause for the first few samples so it's easier to fix obvious errors + if ($current_sample < 4) { + sleep(15); + } + + $current_sample++; + + } + + } + +} + + + +# end diff --git a/scripts/deseq2-pca.R b/scripts/deseq2-pca.R new file mode 100755 index 0000000..eeb4d30 --- /dev/null +++ b/scripts/deseq2-pca.R @@ -0,0 +1,67 @@ +## +## Modified DESeq2 plotPCA function with sample names and proportion of variance added. +## Sample names will be shown underneath each dot. +## The axis will display proportion of variance for each principal component. +## Tested using several DESeq2 versions from 1.2.8 to 1.12.3. +## DESeq2 plotPCA function switched from lattice to ggplot2 in version 1.5.11. +## + + +deseq2_pca = function(x, intgroup, ntop=500) +{ + library(RColorBrewer) + library(genefilter) + library(lattice) + + # pca + rv = rowVars(assay(x)) + select = order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))] + pca = prcomp(t(assay(x)[select,])) + + # proportion of variance + variance = pca$sdev^2 / sum(pca$sdev^2) + variance = round(variance, 3) * 100 + + # sample names + names = colnames(x) + + # factor of groups + fac = factor(apply(as.data.frame(colData(x)[, intgroup, drop=FALSE]), 1, paste, collapse=" : ")) + + # colors + if( nlevels(fac) > 24 ) { + colors = rainbow(nlevels(fac)) + } + else if( nlevels(fac) > 2 ) { + colors = c(brewer.pal(9, "Set1"), brewer.pal(8, "Accent"), brewer.pal(8, "Dark2")) + colors = unique(colors) + colors = head(colors, nlevels(fac)) + } + else { + colors = c("dodgerblue3", "firebrick3") + } + + # plot + xyplot( + PC2 ~ PC1, groups=fac, data=as.data.frame(pca$x), pch=16, cex=1.5, + aspect = "fill", + col = colors, + xlab = list(paste("PC1 (", variance[1], "%)", sep=""), cex=0.8), + ylab = list(paste("PC2 (", variance[2], "%)", sep=""), cex=0.8), + panel = function(x, y, ...) { + panel.xyplot(x, y, ...); + ltext(x=x, y=y, labels=names, pos=1, offset=0.8, cex=0.7) + }, + main = draw.key( + key = list( + rect = list(col = colors), + text = list(levels(fac)), + rep = FALSE + ) + ) + ) +} + + + +# end diff --git a/scripts/deseq2-results.R b/scripts/deseq2-results.R new file mode 100755 index 0000000..fad5b30 --- /dev/null +++ b/scripts/deseq2-results.R @@ -0,0 +1,67 @@ +## +## Calculate results and export them in various formats with proper names. 
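+## (a minimal usage sketch, assuming a fitted DESeqDataSet "dds" whose design
+## uses a "group" factor with hypothetical levels "treated" and "control":
+##   deseq2_results(DESeqDataSet = dds, contrast = c("group", "treated", "control"))
+## this writes res.*.RData, dge.*.csv, dge.*.xlsx, and dge.*.padj005.xlsx
+## into the working directory)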
+## + + +deseq2_results = function(DESeqDataSet, contrast=NULL, name=NULL) +{ + library(xlsx) + + # calculate results (using contrast or name, depending on what is given) + if(!is.null(contrast)) { + res = results(DESeqDataSet, contrast=contrast, cooksCutoff=FALSE, addMLE=TRUE) + # extract results name + pattern = paste(".*", contrast[1], " ", sep="") + res.name = gsub(pattern=pattern, replacement="", x=mcols(res)[2,2]) + } + else { + res = results(DESeqDataSet, name=name, cooksCutoff=FALSE, addMLE=TRUE) + res.name = name + } + + # sort results + res = res[order(res$padj, res$pvalue, -res$baseMean),] + + # save results object + res.rdata = paste("res.", gsub(pattern=" ", replacement="-", x=res.name), ".RData", sep="") + save(res, file=res.rdata) + message("save results object: ", res.rdata) + + # save results as csv + res.csv = paste0("dge.", gsub(pattern=" ", replacement="-", x=res.name), ".csv") + write.csv(as.data.frame(res), file=res.csv) + message("save results csv: ", res.csv) + + # format results for excel export + res.df = as.data.frame(res) + res.df$gene = rownames(res.df) + res.df$baseMean = round(res.df$baseMean, 1) + res.df$log2FoldChange = round(res.df$log2FoldChange, 5) + res.df$lfcMLE = round(res.df$lfcMLE, 5) + res.df$pvalue = round(res.df$pvalue, 15) + res.df$padj = round(res.df$padj, 15) + res.df = subset(res.df, select = c("gene", "baseMean", "log2FoldChange", "lfcMLE", "pvalue", "padj")) + + # rename columns + names(res.df)[names(res.df)=="log2FoldChange"] = "log2FC" + names(res.df)[names(res.df)=="lfcMLE"] = "log2FCunshrunk" + + # save results as xlsx + res.xlsx = paste0("dge.", gsub(pattern=" ", replacement="-", x=res.name), ".xlsx") + write.xlsx2(x=res.df, file=res.xlsx, sheetName=res.name, col.names=TRUE, row.names=FALSE) + message("results genes: ", nrow(res.df)) + message("save results xlsx: ", res.xlsx) + + # save results as xlsx (padj<0.05) + res.padj005.xlsx = gsub(pattern=".xlsx", replacement=".padj005.xlsx", x=res.xlsx) + write.xlsx2(x=subset(res.df, padj<0.05), file=res.padj005.xlsx, sheetName=res.name, col.names=TRUE, row.names=FALSE) + message("num genes padj<0.9: ", nrow(subset(res.df, padj<0.9))) + message("num genes padj<0.2: ", nrow(subset(res.df, padj<0.2))) + message("num genes padj<0.05: ", nrow(subset(res.df, padj<0.05))) + message("num genes padj<0.01: ", nrow(subset(res.df, padj<0.01))) + message("save filtered results xlsx: ", res.padj005.xlsx) +} + + + +# end diff --git a/scripts/dge-deseq2.R b/scripts/dge-deseq2.R new file mode 100755 index 0000000..44277d1 --- /dev/null +++ b/scripts/dge-deseq2.R @@ -0,0 +1,106 @@ +#!/usr/bin/env Rscript + + +## +## Differential gene expression with DESeq2. 
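+## (inputs, as prepared by routes/comp-rna-star-dge.sh: the counts table is the
+## tab-delimited featureCounts gene x sample matrix, and the groups table is the
+## samples.groups.csv sheet with a "#SAMPLE,group" header line)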
+##
+## usage: Rscript --vanilla dge-deseq2.R counts_table.txt groups_table.csv
+##
+
+
+# increase output width
+options(width = 150)
+
+# java heap size
+options(java.parameters="-Xmx8G")
+
+# get scripts directory (directory of this file) and load relevant functions
+args_all = commandArgs(trailingOnly = FALSE)
+scripts_dir = normalizePath(dirname(sub("^--file=", "", args_all[grep("^--file=", args_all)])))
+source(paste0(scripts_dir, "/load-install-packages.R"))
+source(paste0(scripts_dir, "/deseq2-pca.R"))
+source(paste0(scripts_dir, "/deseq2-results.R"))
+
+# relevant arguments
+args = commandArgs(trailingOnly = TRUE)
+counts_table_file = args[1]
+groups_table_file = args[2]
+
+# check that input files exist
+if (!file.exists(counts_table_file)) stop("file does not exist: ", counts_table_file)
+if (!file.exists(groups_table_file)) stop("file does not exist: ", groups_table_file)
+
+# load relevant packages
+load_install_packages("DESeq2")
+load_install_packages("RColorBrewer")
+load_install_packages("gplots")
+load_install_packages("genefilter")
+load_install_packages("lattice")
+load_install_packages("xlsx")
+
+message(" ========== prepare inputs ========== ")
+
+# import groups table
+groups_table = read.csv(file = groups_table_file, header = TRUE, row.names = 1, colClasses = "factor")
+message("groups table sample num: ", nrow(groups_table))
+message("groups table groups: ", colnames(groups_table))
+
+# import counts table
+counts_table = read.delim(file = counts_table_file, header = TRUE, row.names = 1, check.names = FALSE, stringsAsFactors = FALSE)
+message("full counts table gene num: ", nrow(counts_table))
+message("full counts table sample num: ", ncol(counts_table))
+
+# subset to samples in groups table (also sets samples to be in the same order)
+counts_table = counts_table[,rownames(groups_table)]
+message("group-subset counts table gene num: ", nrow(counts_table))
+message("group-subset counts table sample num: ", ncol(counts_table))
+
+# group info
+group_name = colnames(groups_table)[1]
+message("group name: ", group_name)
+group_levels = levels(groups_table[,group_name])
+message("group levels: ", toString(group_levels))
+
+# design formula
+design_formula = formula(paste("~", group_name))
+message("design formula: ", deparse(design_formula))
+
+message(" ========== normalization ========== ")
+
+# import and normalize
+dds = DESeqDataSetFromMatrix(countData = counts_table, colData = groups_table, design = design_formula)
+dds = DESeq(dds, parallel = TRUE, BPPARAM = BiocParallel::MulticoreParam(workers = 4))
+save(dds, file = "deseq2.dds.RData")
+
+vsd = varianceStabilizingTransformation(dds)
+save(vsd, file = "deseq2.vsd.RData")
+
+# export counts
+write.csv(counts(dds, normalized = FALSE), file = "counts.raw.csv")
+write.csv(round(counts(dds, normalized = TRUE), digits = 3), file = "counts.norm.csv")
+write.xlsx2(x=round(counts(dds, normalized = TRUE), digits = 3), file = "counts.norm.xlsx", sheetName = "normalized counts")
+
+message(" ========== QC ========== ")
+
+pdf("plot.sparsity.pdf", width=7, height=7, family="Palatino", pointsize=10)
+print(plotSparsity(dds, normalized = TRUE))
+dev.off()
+
+pdf("plot.pca.pdf", width=7, height=7, family="Palatino", pointsize=10)
+deseq2_pca(vsd, intgroup = group_name, ntop = 1000)
+dev.off()
+
+message(" ========== differential expression ========== ")
+
+# perform comparisons for all combinations of group levels
+group_levels_combinations = combn(group_levels, m = 2, simplify = TRUE)
+for (combination_num in
1:ncol(group_levels_combinations)) { + level_numerator = group_levels_combinations[1, combination_num] + level_denominator = group_levels_combinations[2, combination_num] + message("comparison: ", paste(group_name, ":", level_numerator, "vs", level_denominator)) + deseq2_results(DESeqDataSet = dds, contrast = c(group_name, level_numerator, level_denominator)) +} + + + +# end diff --git a/scripts/fix-csv.sh b/scripts/fix-csv.sh new file mode 100755 index 0000000..0d930e2 --- /dev/null +++ b/scripts/fix-csv.sh @@ -0,0 +1,43 @@ +#!/bin/bash + + +## +## make csv file more compatible with various tools by removing problematic characters +## + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") + +# check for correct number of arguments +if [ ! $# == 1 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name file.csv \n" >&2 + exit 1 +fi + +# arguments +csv=$1 + +# check that input exists +if [ ! -s "$csv" ] ; then + echo -e "\n $script_name ERROR: file $csv does not exist \n" >&2 + exit 1 +fi + +# fix newlines +dos2unix --quiet $csv +mac2unix --quiet $csv + +# replace commas inside quoted fields with dashes +awk -F '"' -v OFS='' '{ for (i=2; i<=NF; i+=2) gsub(",", "-", $i) } 1' $csv > ${csv}.tmp && mv ${csv}.tmp $csv +# replace quotes +sed -i 's/\"//g' $csv +# remove lines missing any values (only commas present) +sed -i '/^,,*$/d' $csv +# add newline to end of file if one does not exist (some scripts may complain) +sed -i -e '$a\' $csv + + + +# end diff --git a/scripts/get-ref.sh b/scripts/get-ref.sh new file mode 100755 index 0000000..1293306 --- /dev/null +++ b/scripts/get-ref.sh @@ -0,0 +1,159 @@ +#!/bin/bash + + +## +## get reference of specified type for specified genome +## + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") + +# check for correct number of arguments +if [ ! $# == 2 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name genome_dir ref_type \n" >&2 + exit 1 +fi + +# arguments +genome_root=$1 +ref_type=$2 + + +######################### + + +function find_file { + + local file_name=$1 + + # find the shortest result + local result=$(find -L "$genome_root" -maxdepth 2 -type f -name "$file_name" | awk '{ print length, $0 }' | sort -n | cut -d " " -f 2 | head -1) + + if [ -s "$result" ] && [ "$result" ] ; then + echo $(readlink -f "$result") + else + echo -e "\n $script_name ERROR: RESULT $result DOES NOT EXIST \n" >&2 + exit 1 + fi + +} + +function find_dir { + + local dir_name=$1 + + local result=$(find -L "$genome_root" -maxdepth 1 -type d -iname "$dir_name") + + if [ -s "$result" ] && [ "$result" ] ; then + echo $(readlink -f "$result") + else + echo -e "\n $script_name ERROR: RESULT $result DOES NOT EXIST \n" >&2 + exit 1 + fi + +} + +function find_basename { + + local suffix=$1 + + # find the shortest result + local result=$(find -L "$genome_root" -maxdepth 2 -type f -name "genome${suffix}" | awk '{ print length, $0 }' | sort -n | cut -d " " -f 2 | head -1) + + if [ -s "$result" ] && [ "$result" ] ; then + result=$(readlink -f "$result") + echo ${result/${suffix}/} + else + echo -e "\n $script_name ERROR: RESULT $result DOES NOT EXIST \n" >&2 + exit 1 + fi + +} + + +######################### + + +# genome root + +if [ ! 
-d "$genome_root" ] ; then + echo -e "\n $script_name ERROR: DIR $genome_root DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# file references + +if [ "$ref_type" == "FASTA" ] ; then + find_file genome.fa +fi + +if [ "$ref_type" == "DICT" ] ; then + find_file genome.dict +fi + +if [ "$ref_type" == "REFFLAT" ] ; then + find_file refFlat.txt.gz +fi + +if [ "$ref_type" == "GTF" ] ; then + find_file genes.gtf +fi + +if [ "$ref_type" == "CHROMSIZES" ] ; then + find_file chrom.sizes +fi + +if [ "$ref_type" == "2BIT" ] ; then + find_file genome.2bit +fi + +if [ "$ref_type" == "FASTQSCREEN" ] ; then + find_file fastq_screen.conf +fi + +if [ "$ref_type" == "RRNAINTERVALLIST" ] ; then + find_file rRNA.interval_list +fi + + +# directory references + +if [ "$ref_type" == "STAR" ] ; then + find_dir star +fi + +if [ "$ref_type" == "BISMARK" ] ; then + find_dir bismark +fi + +if [ "$ref_type" == "RSEM" ] ; then + echo "$(find_dir rsem)/ref" +fi + + +# basename (file without suffix) references + +if [ "$ref_type" == "BOWTIE1" ] ; then + find_basename .1.ebwt +fi + +if [ "$ref_type" == "BOWTIE2" ] ; then + find_basename .1.bt2* +fi + +if [ "$ref_type" == "BWA" ] ; then + echo "$(find_basename .fa.bwt).fa" +fi + + +######################### + + + +# end diff --git a/scripts/get-set-setting.sh b/scripts/get-set-setting.sh new file mode 100755 index 0000000..db20d8d --- /dev/null +++ b/scripts/get-set-setting.sh @@ -0,0 +1,93 @@ +#!/bin/bash + + +## +## get or set (if not set) specified setting for a specified setting +## + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") + +# check for correct number of arguments +if [ $# -lt 2 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name settings_file setting_name [setting_value] \n" >&2 + exit 1 +fi + +# arguments +settings_txt=$(readlink -f "$1") +setting_name=$2 +setting_value_arg=$3 + + +######################### + + +# check that input exists + +if [ ! 
-s "$settings_txt" ] ; then + echo -e "\n $script_name ERROR: file $settings_txt does not exist \n" >&2 + exit 1 +fi + + +######################### + + +# check if setting is already set and output that + +setting_value=$(cat "$settings_txt" | grep "^${setting_name}|" | head -1 | cut -d "|" -f 2) +if [ -n "$setting_value" ] ; then + echo "$setting_value" + exit +fi + + +######################### + + +# determine proper setting value + +genome_root=$(cat "$settings_txt" | grep "^GENOME-DIR|" | head -1 | cut -d "|" -f 2) + +if [[ "$setting_name" == REF-* ]] ; then + # reference file settings + ref_type=${setting_name/REF-/} + setting_value=$(bash ${BASH_SOURCE%/*}/get-ref.sh "$genome_root" "$ref_type") +elif [ -n "$setting_value_arg" ] ; then + setting_value=$setting_value_arg +else + echo -e "\n $script_name ERROR: the value for setting $setting_name must be specified \n" >&2 + exit 1 +fi + + +######################### + + +# save setting value + +if [ -n "$setting_value" ] ; then + echo "${setting_name}|${setting_value}" >> $settings_txt +else + echo -e "\n $script_name ERROR: value for $setting_name could not be determined \n" >&2 + exit 1 +fi + + +######################### + + +# try to get the setting again from the updated settings file + +setting_value=$(cat "$settings_txt" | grep "^${setting_name}|" | head -1 | cut -d "|" -f 2) +echo "$setting_value" + + +######################### + + + +# end diff --git a/scripts/join-many.sh b/scripts/join-many.sh new file mode 100755 index 0000000..66b1afd --- /dev/null +++ b/scripts/join-many.sh @@ -0,0 +1,47 @@ +#!/bin/bash + + +## +## merge any number of tab or comma-separated files ("join" can only do 2 at a time) +## use $'\t' as tab field separator +## + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") + +# check for correct number of arguments +if [ $# -lt 3 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name field_separator missing_field_char in1.txt in2.txt in3.txt ... > merged.txt \n" >&2 + exit 1 +fi + +# arguments +separator="$1" +shift +empty_char="$1" +shift + +# check if at least the first file exists +if [ ! -s "$1" ] ; then + echo -e "\n $script_name ERROR: file $1 does not exist \n" >&2 + exit 1 +fi + +# load recent coreutils ("-o auto" support added in release 8.12) +module load coreutils/8.24 + +function rjoin { + if [[ $# -gt 1 ]]; then + LC_ALL=C join -t "$separator" -a1 -a2 -o auto -e "$empty_char" - <(LC_ALL=C sort "$1") | rjoin "${@:2}" + else + LC_ALL=C join -t "$separator" -a1 -a2 -o auto -e "$empty_char" - <(LC_ALL=C sort "$1") + fi +} + +rjoin "${@:2}" < "$1" + + + +# end diff --git a/scripts/load-install-packages.R b/scripts/load-install-packages.R new file mode 100755 index 0000000..8fa60e2 --- /dev/null +++ b/scripts/load-install-packages.R @@ -0,0 +1,34 @@ +## +## Load packages. If missing, try installing from CRAN and Bioconductor. 
+## + + +load_install_packages = function(package_list) { + + for (p in package_list) { + + message("test loading package: ", p) + + # try to install from CRAN if package is not already installed + if (!require(p, character.only = TRUE, quietly = TRUE)) { + message("installing package from CRAN: ", p) + install.packages(p, quiet = TRUE, repos = "http://cran.rstudio.com") + } + + # try to install from Bioconductor if package is not already installed + if (!require(p, character.only = TRUE, quietly = TRUE)) { + message("installing package from Bioconductor: ", p) + source("https://bioconductor.org/biocLite.R") + biocLite(p, suppressUpdates = TRUE) + } + + # force load package + library(p, character.only = TRUE, quietly = TRUE) + + } + +} + + + +# end diff --git a/scripts/test-packages.R b/scripts/test-packages.R new file mode 100755 index 0000000..cfaf969 --- /dev/null +++ b/scripts/test-packages.R @@ -0,0 +1,21 @@ +#!/usr/bin/env Rscript + + +## +## Try loading a few packages to test R settings. +## +## usage: Rscript --vanilla test-packages.R +## + +# get scripts directory (directory of this file) and load relevant functions +args_all = commandArgs(trailingOnly = FALSE) +scripts_dir = normalizePath(dirname(sub("^--file=", "", args_all[grep("^--file=", args_all)]))) +source(paste0(scripts_dir, "/load-install-packages.R")) + +# load some packages +test_packages = c("gplots", "grid", "ggplot2", "plyr", "dplyr", "gsalib", "limma") +load_install_packages(test_packages) + + + +# end diff --git a/segments/bam-bam-dd-bismark.sh b/segments/bam-bam-dd-bismark.sh new file mode 100755 index 0000000..2fafa53 --- /dev/null +++ b/segments/bam-bam-dd-bismark.sh @@ -0,0 +1,203 @@ +#!/bin/bash + + +# Bismark deduplicate_bismark + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 4 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name [project dir] [sample] [Bismark BAM] [analysis type] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +bismark_bam=$3 +analysis_type=$4 + + +######################### + + +# check that inputs exist + +if [ ! -d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! 
-s "$bismark_bam" ] ; then + echo -e "\n $script_name ERROR: BAM $bismark_bam DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +samples_csv="${proj_dir}/samples.${segment_name}.csv" + +bismark_bam_dd_dir="${proj_dir}/BAM-DD-Bismark" +mkdir -p "$bismark_bam_dd_dir" +bismark_bam_dd_final="${bismark_bam_dd_dir}/${sample}.bam" + +bismark_dd_logs_dir="${proj_dir}/logs-${segment_name}" +mkdir -p "$bismark_dd_logs_dir" +bismark_dd_report_final="${bismark_dd_logs_dir}/${sample}.report.txt" + +bismark_report_dir="${proj_dir}/Bismark-report" +mkdir -p "$bismark_report_dir" +bismark_report_deduplication="${bismark_report_dir}/${sample}_bismark_bt2_pe.deduplication_report.txt" + +bismark_bam_dd_original="${bismark_bam/%.bam/.deduplicated.bam}" +bismark_dd_report_original="${bismark_bam/%.bam/.deduplication_report.txt}" + + +######################### + + +# exit if output exits already + +if [ -s "$bismark_bam_dd_final" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# run deduplicate_bismark + +module load bismark/0.16.3 + +echo " * BISMARK: $(readlink -f $(which deduplicate_bismark)) " +echo " * BAM IN: $bismark_bam " +echo " * BAM OUT ORIGINAL : $bismark_bam_dd_original " +echo " * REPORT ORIGINAL : $bismark_dd_report_original " +echo " * BAM OUT FINAL : $bismark_bam_dd_final " +echo " * REPORT FINAL : $bismark_dd_report_final " + +if [ "$analysis_type" == "se" ] ; then + bismark_flags="--single" +fi + +if [ "$analysis_type" == "pe" ] ; then + bismark_flags="--paired" +fi + +bash_cmd=" +deduplicate_bismark \ +$bismark_flags \ +--bam $bismark_bam +" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +sleep 30 + + +######################### + + +# check that output generated + +if [ ! -s "$bismark_bam_dd_original" ] ; then + echo -e "\n $script_name ERROR: REPORT $bismark_dd_report_original NOT GENERATED \n" >&2 + exit 1 +fi + +if [ ! 
-s "$bismark_dd_report_original" ] ; then + echo -e "\n $script_name ERROR: REPORT $bismark_dd_report_original NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# put reports in report directory for bismark2report + +bash_cmd="cp -v $bismark_dd_report_original $bismark_report_deduplication" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + + +######################### + + +# clean up output name + +bash_cmd="mv -v $bismark_bam_dd_original $bismark_bam_dd_final" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +bash_cmd="mv -v $bismark_dd_report_original $bismark_dd_report_final" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + + +######################### + + +# generate summary + +reads_total=$(cat "$bismark_dd_report_final" | grep -m 1 "Total number of alignments analysed" | cut -f 2) +reads_duplicated=$(cat "$bismark_dd_report_final" | grep -m 1 "Total number duplicated alignments" | cut -f 2 | cut -d ' ' -f 1) +reads_deduplicated=$(cat "$bismark_dd_report_final" | grep -m 1 "deduplicated leftover sequences" | cut -d ' ' -f 7) + +reads_duplicated_pct=$(echo "(${reads_duplicated}/${reads_total})*100" | bc -l | cut -c 1-4) +reads_duplicated_pct="${reads_duplicated_pct}%" + +# header +echo "#SAMPLE,ALIGNED PAIRS,DEDUPLICATED PAIRS,DUPLICATION RATE" > $summary_csv + +# print the relevant numbers +echo "${sample},${reads_total},${reads_deduplicated},${reads_duplicated_pct}" >> "$summary_csv" + +# paste -d ',' \ +# <(echo "$sample") \ +# <(cat "$bismark_dd_report_final" | grep -m 1 "Total number of alignments analysed" | cut -f 2) \ +# <(cat "$bismark_dd_report_final" | grep -m 1 "deduplicated leftover sequences" | cut -d ' ' -f 7) \ +# <(cat "$bismark_dd_report_final" | grep -m 1 "Total number duplicated alignments" | cut -f 2 | cut -d ' ' -f 1) \ +# >> $summary_csv + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq \ +> "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# add sample and BAM to sample sheet +echo "${sample},${bismark_bam_dd_final}" >> "$samples_csv" + +sleep 30 + +# sort and remove duplicates in place in sample sheet +LC_ALL=C sort -t ',' -k1,1 -u -o "$samples_csv" "$samples_csv" + +sleep 30 + + +######################### + + + +# end diff --git a/segments/bam-bam-dd-sambamba.sh b/segments/bam-bam-dd-sambamba.sh new file mode 100755 index 0000000..d3181c3 --- /dev/null +++ b/segments/bam-bam-dd-sambamba.sh @@ -0,0 +1,164 @@ +#!/bin/bash + + +# remove duplicates using sambamba + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 4 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name project_dir sample_name num_threads BAM \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +threads=$3 +bam=$4 + + +######################### + + +# check that inputs exist + +if [ ! -d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! 
-s "$bam" ] ; then + echo -e "\n $script_name ERROR: BAM $bam DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +samples_csv="${proj_dir}/samples.${segment_name}.csv" + +bam_dd_dir="${proj_dir}/BAM-DD" +mkdir -p "$bam_dd_dir" +bam_dd="${bam_dd_dir}/${sample}.dd.bam" + +dedup_logs_dir="${proj_dir}/logs-${segment_name}" +mkdir -p "$dedup_logs_dir" +bam_dd_log="${dedup_logs_dir}/${sample}.log.txt" +bam_dd_flagstat="${dedup_logs_dir}/${sample}.flagstat.txt" + + +######################### + + +# exit if output exits already + +if [ -s "$bam_dd" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# sambamba markdup + +sambamba_bin="/ifs/home/id460/bin/sambamba" + +echo " * sambamba: $(readlink -f $(which $sambamba_bin)) " +echo " * sambamba version: $($sambamba_bin 2>&1 | head -1) " +echo " * BAM IN: $bam " +echo " * BAM OUT: $bam_dd " + +bash_cmd=" +$sambamba_bin markdup \ +--remove-duplicates \ +--nthreads=${threads} \ +--overflow-list-size=500000 \ +$bam \ +$bam_dd \ +2> $bam_dd_log +" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +sleep 30 + + +######################### + + +# check that output generated + +if [ ! -s "$bam_dd" ] ; then + echo -e "\n $script_name ERROR: BAM $bam_dd NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# run flagstat + +bash_cmd="$sambamba_bin flagstat $bam_dd > $bam_dd_flagstat" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + + +######################### + + +# generate alignment summary + +reads_duplicates=$(cat "$bam_dd_log" | grep -m 1 "found.*duplicates" | tr -d -c 0-9) +reads_dedup=$(cat "$bam_dd_flagstat" | grep -m 1 "mapped (" | cut -d ' ' -f 1) +reads_mapped=$(echo "${reads_dedup}+${reads_duplicates}" | bc) + +reads_duplicates_pct=$(echo "(${reads_duplicates}/${reads_mapped})*100" | bc -l | cut -c 1-4) +reads_duplicates_pct="${reads_duplicates_pct}%" + +# header for summary file +echo "#SAMPLE,MAPPED READS (MQ10),DEDUPLICATED READS,DUPLICATES %" > "$summary_csv" + +# summarize log file +echo "${sample},${reads_mapped},${reads_dedup},${reads_duplicates_pct}" >> "$summary_csv" + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# add sample and BAM to sample sheet +echo "${sample},${bam_dd}" >> "$samples_csv" + +sleep 30 + +# sort and remove duplicates in place in sample sheet +LC_ALL=C sort -t ',' -k1,1 -u -o "$samples_csv" "$samples_csv" + +sleep 30 + + +######################### + + + +# end diff --git a/segments/bam-bigwig-bedtools.sh b/segments/bam-bigwig-bedtools.sh new file mode 100755 index 0000000..40b6b52 --- /dev/null +++ b/segments/bam-bigwig-bedtools.sh @@ -0,0 +1,140 @@ +#!/bin/bash + + +# bedtools generate BigWig from BAM + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 3 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name [project dir] [sample] [BAM] \n" >&2 + exit 1 +fi + +# arguments +PROJ_DIR=$1 +SAMPLE=$2 +BAM=$3 + + +######################### + + +# check that inputs exist + +if [ ! -d "$PROJ_DIR" ] || [ ! 
"$PROJ_DIR" ] ; then + echo -e "\n $script_name ERROR: DIR $PROJ_DIR DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! -s "$BAM" ] || [ ! "$BAM" ] ; then + echo -e "\n $script_name ERROR: BAM $BAM DOES NOT EXIST \n" >&2 + exit 1 +fi + +CODE_DIR=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +REF_CHROMSIZES=$(bash ${CODE_DIR}/scripts/get-set-setting.sh "${PROJ_DIR}/settings.txt" REF-CHROMSIZES); + +if [ ! -s "$REF_CHROMSIZES" ] ; then + echo -e "\n $script_name ERROR: CHROMSIZES $REF_CHROMSIZES DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# output + +BIGWIG_DIR="${PROJ_DIR}/BIGWIG" +mkdir -p "$BIGWIG_DIR" +BEDGRAPH="${BIGWIG_DIR}/${SAMPLE}.norm.bedgraph" +BIGWIG="${BIGWIG_DIR}/${SAMPLE}.norm.bw" + + +######################### + + +# exit if output exits already + +if [ -s "$BIGWIG" ] ; then + echo -e "\n $script_name SKIP SAMPLE $SAMPLE \n" >&2 + exit 1 +fi + + +######################### + + +# BAM to BedGraph with bedtools + +module unload gcc +module load samtools/1.3 +module load bedtools/2.22.0 +module load kentutils/329 + +# determine normalization scaling factor +MAPPED_READS=$(samtools view -c $BAM) +SCALE=`echo "10000000/${MAPPED_READS}" | bc -l` +SCALE=${SCALE:0:8} + +echo " * samtools: $(readlink -f $(which samtools)) " +echo " * bedtools: $(readlink -f $(which bedtools)) " +echo " * bedGraphToBigWig: $(readlink -f $(which bedGraphToBigWig)) " +echo " * CHROM SIZES: $REF_CHROMSIZES " +echo " * BAM: $BAM " +echo " * MAPPED READS: $MAPPED_READS " +echo " * SCALING FACTOR: $SCALE " +echo " * BEDGRAPH: $BEDGRAPH " +echo " * BIGWIG: $BIGWIG " + +CMD="bedtools genomecov -split -bg -g $REF_CHROMSIZES -ibam $BAM -scale $SCALE > $BEDGRAPH" +echo "CMD: $CMD" +eval $CMD + + +######################### + + +# check that output generated + +if [ ! -s "$BEDGRAPH" ] ; then + echo -e "\n $script_name ERROR: BEDGRAPH $BEDGRAPH NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# BadGraph to BigWig with UCSC tools + +CMD="bedGraphToBigWig $BEDGRAPH $REF_CHROMSIZES $BIGWIG" +echo "CMD: $CMD" +eval $CMD + +# delete bedgraph +rm -f $BEDGRAPH + + +######################### + + +# check that output generated + +if [ ! -s "$BIGWIG" ] ; then + echo -e "\n $script_name ERROR: BIGWIG $BIGWIG NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + + +# end diff --git a/segments/bam-bigwig-deeptools.sh b/segments/bam-bigwig-deeptools.sh new file mode 100755 index 0000000..947dea9 --- /dev/null +++ b/segments/bam-bigwig-deeptools.sh @@ -0,0 +1,126 @@ +#!/bin/bash + + +# deepTools generate BigWig from BAM + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 4 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name [project dir] [sample] [threads] [BAM] \n" >&2 + exit 1 +fi + +# arguments +PROJ_DIR=$1 +SAMPLE=$2 +THREADS=$3 +BAM=$4 + + +######################### + + +# check that inputs exist + +if [ ! -d "$PROJ_DIR" ] || [ ! "$PROJ_DIR" ] ; then + echo -e "\n $script_name ERROR: DIR $PROJ_DIR DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! -s "$BAM" ] || [ ! 
"$BAM" ] ; then + echo -e "\n $script_name ERROR: BAM $BAM DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# output + +BIGWIG_DIR="${PROJ_DIR}/BIGWIG" +mkdir -p "$BIGWIG_DIR" +BIGWIG="${BIGWIG_DIR}/${SAMPLE}.bin1.rpkm.bw" + +# BIGWIG_RPKM_BIN1=${BIGWIG_RPKM_DIR}/${ID}.bin1.rpkm.bw +# BIGWIG_RPKM_BIN10=${BIGWIG_RPKM_DIR}/${ID}.bin10.smooth50.rpkm.bw + + +######################### + + +# exit if output exits already + +if [ -s "$BIGWIG" ] ; then + echo -e "\n $script_name SKIP SAMPLE $SAMPLE \n" >&2 + exit 1 +fi + + +######################### + + +# generate bigwig using deeptools + +module unload python +module load python/2.7.3 +module load deeptools/2.2.4 +module load kentutils/329 + +echo " * bamCoverage: $(readlink -f $(which bamCoverage)) " +echo " * BAM: $BAM " +echo " * BIGWIG: $BIGWIG " + +CMD=" +bamCoverage \ +--verbose \ +--numberOfProcessors $THREADS \ +--binSize 1 \ +--normalizeUsingRPKM \ +--outFileFormat bigwig \ +--bam $BAM \ +--outFileName $BIGWIG +" +echo "CMD: $CMD" +eval "$CMD" + +# echo " * BIGWIG: $BIGWIG_RPKM_BIN10 " + +# CMD=" +# bamCoverage \ +# --verbose \ +# --numberOfProcessors $THREADS \ +# --binSize 10 \ +# --fragmentLength 1 \ +# --smoothLength 50 \ +# --normalizeUsingRPKM \ +# --outFileFormat bigwig \ +# --bam $BAM \ +# --outFileName $BIGWIG_RPKM_BIN10 +# " +# echo "CMD: $CMD" +# eval "$CMD" + + +######################### + + +# check that output generated + +if [ ! -s "$BIGWIG" ] ; then + echo -e "\n $script_name ERROR: BIGWIG $BIGWIG NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + + +# end diff --git a/segments/bam-meth-bismark.sh b/segments/bam-meth-bismark.sh new file mode 100755 index 0000000..9b246b1 --- /dev/null +++ b/segments/bam-meth-bismark.sh @@ -0,0 +1,330 @@ +#!/bin/bash + + +# Bismark methylation extractor + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== $script_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 5 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name [project dir] [sample] [threads] [Bismark BAM] [analysis type] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +threads=$3 +bismark_bam=$4 +analysis_type=$5 + + +######################### + + +# settings and files + +segment_name="${segment_name}-${analysis_type}" + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +bismark_meth_dir="${proj_dir}/meth-Bismark-${analysis_type}" +bismark_report_dir="${proj_dir}/Bismark-report" + +bismark_report="${bismark_meth_dir}/${sample}_splitting_report.txt" +bismark_report_short="${bismark_meth_dir}/${sample}.report.txt" + +bismark_mbias_report="${bismark_meth_dir}/${sample}.M-bias.txt" + +bismark_bedgraph="${bismark_meth_dir}/${sample}.bedGraph" +bismark_bedgraph_gz="${bismark_bedgraph}.gz" + +bismark_cpg_report_gz="${bismark_meth_dir}/${sample}.CpG_report.txt.gz" + +# BISMARK_METHYLKIT_REPORT="${bismark_meth_dir}/${sample}.methylkit.txt" + + +######################### + + +# exit if output exits already + +if [ -s "$bismark_report_short" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# check inputs and references + +if [ ! -d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! 
-s "$bismark_bam" ] ; then + echo -e "\n $script_name ERROR: bismark_bam $bismark_bam DOES NOT EXIST \n" >&2 + exit 1 +fi + +code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +ref_bismark=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-BISMARK); + +if [ ! -d "$ref_bismark" ] || [ ! "$ref_bismark" ] ; then + echo -e "\n $script_name ERROR: REF $ref_bismark DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# adjust paramaters based on run type + +# --ignore_r2 +# The first couple of bases in Read 2 of BS-Seq experiments show a severe bias towards +# non-methylation as a result of end-repairing sonicated fragments with unmethylated cytosines, +# it is recommended that the first couple of bp of Read 2 are removed before starting downstream analysis. + + +if [ "$analysis_type" == "pe" ] ; then + bismark_flags="--paired-end" +fi + +# the first bases in R2 of BS-Seq experiments show a bias towards non-methylation as a result of end-repairing sonicated fragments with unmethylated cytosines +if [ "$analysis_type" == "pe-ignore-r2-3" ] ; then + bismark_flags="--paired-end --ignore_r2 3" +fi + +if [ "$analysis_type" == "se" ] ; then + bismark_flags="--single-end" +fi + +if [ "$analysis_type" == "se-ignore-r1-3" ] ; then + bismark_flags="--single-end --ignore 3" +fi + + +######################### + + +# bismark_methylation_extractor + +# load module (loads bowtie2/2.2.6 and samtools/1.3) +module load bismark/0.16.3 + +# navigate to output dir +mkdir -p $bismark_meth_dir +cd $bismark_meth_dir + +# number of parallel instances of Bismark to run +multicore=$(( threads / 3 )) + +echo " * BISMARK: $(readlink -f $(which bismark_methylation_extractor)) " +echo " * OUT DIR: $bismark_meth_dir " +echo " * BAM: $bismark_bam " +echo " * REPORT ORIGINAL: $bismark_report " +echo " * BEDGRAPH: $bismark_bedgraph_gz " +echo " * REPORT: $bismark_report_short " +echo " * CYTOSINE REPORT: $bismark_cpg_report_gz " + +# --comprehensive \ + +# Genome-wide cytosine methylation report specific options: +# --cytosine_report - produces a genome-wide methylation report for all cytosines in the genome (1-based) +# --genome_folder - genome folder you wish to use to extract sequences from + +bash_cmd=" +bismark_methylation_extractor \ +--gzip \ +--multicore $multicore \ +--buffer_size 4G \ +--genome_folder $ref_bismark \ +--cytosine_report \ +--bedGraph \ +$bismark_flags \ +$bismark_bam +" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + + +######################### + + +# check that output generated + +if [ ! -s $bismark_report ] || [ ! $bismark_report ] +then + printf "\n\n ERROR! 
REPORT $bismark_report NOT GENERATED \n\n" + exit 1 +fi + + +######################### + + +# put reports in report directory for bismark2report +# bismark_bams for bismark2summary (may not be necessary) + +# adjust for SE/PE +if [ "$analysis_type" == "pe" ] ; then + bismark_report_splittng="${bismark_report_dir}/${sample}_bismark_bt2_pe.deduplicated_splitting_report.txt" + bismark_report_mbias="${bismark_report_dir}/${sample}_bismark_bt2_pe.deduplicated.M-bias.txt" +fi + +# not tested +if [ "$analysis_type" == "se" ] ; then + bismark_report_splittng="${bismark_report_dir}/${sample}_bismark_bt2_splitting_report.txt" + bismark_report_mbias="${bismark_report_dir}/${sample}_bismark_bt2.M-bias.txt" +fi + +if [ "$analysis_type" == "pe" ] || [ "$analysis_type" == "se" ] +then + + CMD="cp -v $bismark_report $bismark_report_splittng" + printf "\n\n $CMD \n\n" + $CMD + + CMD="cp -v $bismark_mbias_report $bismark_report_mbias" + printf "\n\n $CMD \n\n" + $CMD + +fi + + +######################### + + +CMD="mv -v $bismark_report $bismark_report_short" +printf "\n\n $CMD \n\n" +$CMD + + +######################### + + +# convert cytosine report to methylkit-compatible format and remove uncovered bases + +# cytosine report: +# +# new format: chr, start, strand, methylationRatio, coverage +# zcat $bismark_cpg_report_gz | awk '{ OFS="\t"; if( $4+0 > 0 || $5+0 > 0 ) print $1,$2,$3,$4/($4+$5),$4+$5; }' | sort -k1,1 -k2,2n > $BISMARK_METHYLKIT_REPORT + + +######################### + + +# convert bedgraph to bigwig + +bigwig_dir="${proj_dir}/BIGWIG-Bismark" +bigwig="${bigwig_dir}/${sample}.bw" + +mkdir -p "$bigwig_dir" + +ref_chromsizes=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-CHROMSIZES); + +# bedgraph is gzipped by default +gunzip -cv $bismark_bedgraph_gz > $bismark_bedgraph + +bash_cmd="bedGraphToBigWig $bismark_bedgraph $ref_chromsizes $bigwig" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +# delete bedgraph +rm -fv $bismark_bedgraph + + +######################### + + +# generate summary + +# header +echo "#SAMPLE,READS,TOTAL Cs,METH_CpG_CONTEXT,METH_CHG_CONTEXT,METH_CHH_CONTEXT" > $summary_csv + +# print the relevant numbers +paste -d ',' \ +<(echo "$sample") \ +<(cat $bismark_report_short | grep "lines in total" | head -1 | cut -d " " -f 2) \ +<(cat $bismark_report_short | grep "Total number of C's analysed" | head -1 | cut -f 2) \ +<(cat $bismark_report_short | grep "C methylated in CpG context" | head -1 | cut -f 2) \ +<(cat $bismark_report_short | grep "C methylated in CHG context" | head -1 | cut -f 2) \ +<(cat $bismark_report_short | grep "C methylated in CHH context" | head -1 | cut -f 2) \ +>> $summary_csv + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq \ +> "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# combine summary from each step + +combined_summary_csv="${proj_dir}/summary-combined.bs-bismark-${analysis_type}.csv" + +# combine all summaries +bash_cmd=" +bash ${code_dir}/scripts/join-many.sh , X \ +${proj_dir}/summary.fastq-fastq-clean.csv \ +${proj_dir}/summary.fastq-fastq-trim-trimgalore.csv \ +${proj_dir}/summary.fastq-fastq-trim-trimmomatic.csv \ +${proj_dir}/summary.fastq-bam-bismark.csv \ +${proj_dir}/summary.bam-bam-dd-bismark.csv \ +${proj_dir}/summary.${segment_name}.csv \ +> $combined_summary_csv +" +(eval $bash_cmd) + + +######################### + + +# combine charts into a single png + 
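+# montage comes from ImageMagick (assumed to be on the PATH on this system);
+# -label %t titles each tile with its source filename, so the per-sample
+# M-bias plots remain identifiable in the combined grid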
+combined_png_2X="${proj_dir}/summary.${segment_name}.mbias.2x.png"
+combined_png_4X="${proj_dir}/summary.${segment_name}.mbias.4x.png"
+
+rm -f $combined_png_2X
+rm -f $combined_png_4X
+
+# -geometry +20+20 = 20px x and y padding
+# -tile 4x = 4 images wide
+# montage -geometry +20+20 -tile 4x ${bismark_meth_dir}/*M-bias*.png $COMBINED_PNG
+montage -geometry +20+20 -tile 2x -label %t -pointsize 18 -font Nimbus-Sans-Regular ${bismark_meth_dir}/*M-bias*.png $combined_png_2X
+montage -geometry +20+20 -tile 4x -label %t -pointsize 18 -font Nimbus-Sans-Regular ${bismark_meth_dir}/*M-bias*.png $combined_png_4X
+
+
+#########################
+
+
+# bismark html report
+
+cd "$bismark_report_dir"
+
+# sample report
+bismark2report
+
+# summary report
+bismark2summary --basename '_summary'
+
+
+#########################
+
+
+
+# end
diff --git a/segments/bam-qc-picard-rnaseqmetrics.sh b/segments/bam-qc-picard-rnaseqmetrics.sh
new file mode 100755
index 0000000..6ffa855
--- /dev/null
+++ b/segments/bam-qc-picard-rnaseqmetrics.sh
@@ -0,0 +1,227 @@
+#!/bin/bash
+
+
+# run Picard CollectRnaSeqMetrics
+
+
+# script filename
+script_name=$(basename "${BASH_SOURCE[0]}")
+segment_name=${script_name/%.sh/}
+echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2
+
+# check for correct number of arguments
+if [ ! $# == 3 ] ; then
+ echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2
+ echo -e "\n USAGE: $script_name [project dir] [sample] [BAM] \n" >&2
+ exit 1
+fi
+
+# arguments
+proj_dir=$1
+sample=$2
+bam=$3
+
+# strand:
+# fwd | transcript | cufflinks "fr-secondstrand" | htseq "yes" | picard "FIRST_READ"
+# rev | rev comp of transcript | cufflinks "fr-firststrand" | htseq "reverse" | picard "SECOND_READ"
+
+
+#########################
+
+
+# check that inputs exist
+
+if [ ! -d "$proj_dir" ] || [ ! "$proj_dir" ] ; then
+ echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2
+ exit 1
+fi
+
+if [ ! -s "$bam" ] || [ ! "$bam" ] ; then
+ echo -e "\n $script_name ERROR: BAM $bam DOES NOT EXIST \n" >&2
+ exit 1
+fi
+
+CODE_DIR=$(dirname "$(dirname "${BASH_SOURCE[0]}")")
+REFFLAT=$(bash ${CODE_DIR}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-REFFLAT)
+RRNA_INTERVAL_LIST=$(bash ${CODE_DIR}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-RRNAINTERVALLIST)
+
+if [ ! -s "$REFFLAT" ] || [ ! "$REFFLAT" ] ; then
+ echo -e "\n $script_name ERROR: REFFLAT $REFFLAT DOES NOT EXIST \n" >&2
+ exit 1
+fi
+
+if [ ! -s "$RRNA_INTERVAL_LIST" ] || [ ! 
"$RRNA_INTERVAL_LIST" ] ; then + echo -e "\n $script_name ERROR: RRNA INTERVAL LIST $RRNA_INTERVAL_LIST DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# output + +metrics_dir="${proj_dir}/QC-RnaSeqMetrics" +mkdir -p "$metrics_dir" +metrics_txt="${metrics_dir}/${sample}.txt" +metrics_pdf="${metrics_dir}/${sample}.pdf" + +summary_dir="${proj_dir}/summary" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +# # strand flag +# if [ "$STRAND" == "unstr" ] ; then +# STRAND_FLAG="0" +# elif [ "$STRAND" == "fwd" ] ; then +# STRAND_FLAG="1" +# elif [ "$STRAND" == "rev" ] ; then +# STRAND_FLAG="2" +# else +# echo -e "\n $script_name ERROR: incorrect strand selected \n" >&2 +# exit 1 +# fi + + +######################### + + +# exit if output exits already + +if [ -s "$metrics_txt" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +module unload java +module load java/1.7 +module load picard-tools/1.88 + + + + +# STRAND_SPECIFICITY {NONE, FIRST_READ_TRANSCRIPTION_STRAND, SECOND_READ_TRANSCRIPTION_STRAND} +# STRAND_SPECIFICITY="NONE" + + +echo " * CollectRnaSeqMetrics: ${PICARD_ROOT}/CollectRnaSeqMetrics.jar " +echo " * BAM: $bam " +echo " * REF_FLAT: $REFFLAT " +echo " * RIBOSOMAL_INTERVALS: $RRNA_INTERVAL_LIST " +echo " * OUT TXT: $metrics_txt " +echo " * OUT PDF: $metrics_pdf " + +# run picard + +PICARD_CMD="java -Xms8G -Xmx8G -jar ${PICARD_ROOT}/CollectRnaSeqMetrics.jar \ +VERBOSITY=WARNING QUIET=true VALIDATION_STRINGENCY=LENIENT MAX_RECORDS_IN_RAM=2500000 \ +REF_FLAT=${REFFLAT} \ +INPUT=${bam}" + +CMD="$PICARD_CMD \ +STRAND_SPECIFICITY=NONE \ +RIBOSOMAL_INTERVALS=${RRNA_INTERVAL_LIST} \ +CHART_OUTPUT=${metrics_pdf} \ +OUTPUT=${metrics_txt}" +echo "CMD: $CMD" +$CMD + +CMD="$PICARD_CMD \ +STRAND_SPECIFICITY=FIRST_READ_TRANSCRIPTION_STRAND \ +OUTPUT=${metrics_txt}.1READ" +echo "CMD: $CMD" +$CMD + +CMD="$PICARD_CMD \ +STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND \ +OUTPUT=${metrics_txt}.2READ" +echo "CMD: $CMD" +$CMD + + +######################### + + +# check that output generated + +if [ ! 
-s "$metrics_txt" ] ; then + echo -e "\n $script_name ERROR: METRICS $metrics_txt NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# generate a summary file + +# 01 PF_BASES +# 02 PF_ALIGNED_BASES +# 03 RIBOSOMAL_BASES +# 04 CODING_BASES +# 05 UTR_BASES +# 06 INTRONIC_BASES +# 07 INTERGENIC_BASES +# 08 IGNORED_READS +# 09 CORRECT_STRAND_READS +# 10 INCORRECT_STRAND_READS +# 11 PCT_RIBOSOMAL_BASES +# 12 PCT_CODING_BASES +# 13 PCT_UTR_BASES +# 14 PCT_INTRONIC_BASES +# 15 PCT_INTERGENIC_BASES +# 16 PCT_MRNA_BASES +# 17 PCT_USABLE_BASES +# 18 PCT_CORRECT_STRAND_READS +# 19 MEDIAN_CV_COVERAGE +# 20 MEDIAN_5PRIME_BIAS +# 21 MEDIAN_3PRIME_BIAS +# 22 MEDIAN_5PRIME_TO_3PRIME_BIAS + +paste \ +<(echo -e "#SAMPLE\n${sample}") \ +<(cat "${metrics_txt}" | grep -A 1 "PF_ALIGNED_BASES" | cut -f 2,11,12,13,14,15,22) \ +<(cat "${metrics_txt}.1READ" | grep -A 1 "PF_ALIGNED_BASES" | cut -f 18 | sed 's/PCT_CORRECT_STRAND_READS/FWD_STRAND/') \ +<(cat "${metrics_txt}.2READ" | grep -A 1 "PF_ALIGNED_BASES" | cut -f 18 | sed 's/PCT_CORRECT_STRAND_READS/REV_STRAND/') \ +| tr '\t' ',' \ +> $summary_csv + +rm -f "${metrics_txt}.1READ" +rm -f "${metrics_txt}.2READ" + + +# combine charts into a single pdf + +combined_pdf=${proj_dir}/summary.${segment_name}.pdf + +rm -f "$combined_pdf" + +# do not use quotes in filenames for ghostscript +gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${combined_pdf} ${metrics_dir}/*.pdf + +# combine charts into a single png + +combined_png_4w=${proj_dir}/summary.${segment_name}.4w.png +combined_png_5w=${proj_dir}/summary.${segment_name}.5w.png + +rm -f "$combined_png_4w" +rm -f "$combined_png_5w" + +# -geometry +20+20 = 20px x and y padding +# -tile 4x = 4 images wide +montage -geometry +20+20 -tile 4x "${metrics_dir}/*.pdf" "$combined_png_4w" +montage -geometry +20+20 -tile 5x "${metrics_dir}/*.pdf" "$combined_png_5w" + + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv| LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + + +# end diff --git a/segments/bam-quant-featurecounts.sh b/segments/bam-quant-featurecounts.sh new file mode 100755 index 0000000..b2baf85 --- /dev/null +++ b/segments/bam-quant-featurecounts.sh @@ -0,0 +1,238 @@ +#!/bin/bash + + +# run featureCounts + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 6 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name [project dir] [sample] [threads] [bam] [PE/SE] [strand] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +threads=$3 +bam=$4 +run_type=$5 +strand=$6 + +# strand: +# fwd | transcript | cufflinks "fr-secondstrand" | htseq "yes" | picard "FIRST_READ" +# rev | rev comp of transcript | cufflinks "fr-firststrand" | htseq "reverse" | picard "SECOND_READ" + + +######################### + + +# check that inputs exist + +if [ ! -d "$proj_dir" ] || [ ! "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! -s "$bam" ] || [ ! "$bam" ] ; then + echo -e "\n $script_name ERROR: bam $bam DOES NOT EXIST \n" >&2 + exit 1 +fi + +code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +gtf=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-GTF); + +if [ ! -s "$gtf" ] || [ ! 
"$gtf" ] ; then + echo -e "\n $script_name ERROR: GTF $gtf DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +segment_name="${segment_name}-${strand}" + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +fc_quant_dir=${proj_dir}/quant-featureCounts +mkdir -p "$fc_quant_dir" +counts_clean=${fc_quant_dir}/${sample}.counts.${strand}.txt + +fc_logs_dir="${proj_dir}/logs-featureCounts" +mkdir -p "$fc_logs_dir" +counts_raw="${fc_logs_dir}/${sample}.${strand}.txt" +counts_summary_raw="${fc_logs_dir}/${sample}.${strand}.txt.summary" + +# run type flag +if [ "$run_type" == "SE" ] ; then + run_type_flag="" +elif [ "$run_type" == "PE" ] ; then + run_type_flag="-p" +else + echo -e "\n $script_name ERROR: incorrect run type $run_type selected \n" >&2 + exit 1 +fi + +# strand flag +if [ "$strand" == "unstr" ] ; then + strand_flag="0" +elif [ "$strand" == "fwd" ] ; then + strand_flag="1" +elif [ "$strand" == "rev" ] ; then + strand_flag="2" +else + echo -e "\n $script_name ERROR: incorrect strand $strand selected \n" >&2 + exit 1 +fi + + +######################### + + +# exit if output exits already + +if [ -s "$counts_clean" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# featureCounts + +module load subread/1.4.6-p3 + +# featureCounts generates temp files in the current directory +cd "$fc_logs_dir" + +echo " * featureCounts: $(readlink -f $(which featureCounts)) " +echo " * RUN TYPE: $run_type " +echo " * BAM: $bam " +echo " * GTF: $gtf " +echo " * COUNTS: $counts_clean " + +# Summarize a single-end read dataset using 5 threads: +# featureCounts -T 5 -t exon -g gene_id -a annotation.gtf -o counts.txt mapping_results_SE.sam +# Summarize a bam format dataset: +# featureCounts -t exon -g gene_id -a annotation.gtf -o counts.txt mapping_results_SE.bam +# Summarize multiple datasets at the same time: +# featureCounts -t exon -g gene_id -a annotation.gtf -o counts.txt library1.bam library2.bam library3.bam +# Perform strand-specific read counting (use '-s 2' if reversely stranded): +# featureCounts -s 1 -t exon -g gene_id -a annotation.gtf -o counts.txt mapping_results_SE.bam +# Summarize paired-end reads and count fragments (instead of reads): +# featureCounts -p -t exon -g gene_id -a annotation.gtf -o counts.txt mapping_results_PE.bam +# Summarize multiple paired-end datasets: +# featureCounts -p -t exon -g gene_id -a annotation.gtf -o counts.txt library1.bam library2.bam library3.bam +# Count the fragments that have fragment length between 50bp and 600bp only: +# featureCounts -p -P -d 50 -D 600 -t exon -g gene_id -a annotation.gtf -o counts.txt mapping_results_PE.bam +# Count those fragments that have both ends mapped only: +# featureCounts -p -B -t exon -g gene_id -a annotation.gtf -o counts.txt mapping_results_PE.bam +# Exclude chimeric fragments from fragment counting: +# featureCounts -p -C -t exon -g gene_id -a annotation.gtf -o counts.txt mapping_results_PE.bam + +CMD=" +featureCounts \ +-T $threads \ +$run_type_flag \ +-g gene_name \ +-s $strand_flag \ +-a $gtf \ +-o $counts_raw \ +$bam +" +echo "CMD: $CMD" +eval "$CMD" + + +######################### + + +# check that output generated + +if [ ! 
-s "$counts_raw" ] ; then + echo -e "\n $script_name ERROR: COUNTS $counts_raw NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# clean up the full output + +# 1: Geneid +# 2: Chr +# 3: Start +# 4: End +# 5: strand +# 6: Length +# 7: count + +# gene counts table +echo -e "#GENE\t${sample}" > "$counts_clean" +cat $counts_raw | grep -v '#' | grep -v 'Geneid' | cut -f 1,7 | LC_ALL=C sort -k1,1 >> "$counts_clean" + +# gene info table +gene_info_file="${proj_dir}/genes.featurecounts.txt" +echo -e "#GENE\tCHR\tSTART\tEND\tstrand\tLENGTH" > "$gene_info_file" +cat $counts_raw | grep -v '#' | grep -v 'Geneid' | cut -f 1-6 | LC_ALL=C sort -k1,1 >> "$gene_info_file" + +# delete the full original counts table +rm -fv $counts_raw + + +######################### + + +# generate summary + +counts_assigned=$(cat $counts_summary_raw | grep -m 1 "^Assigned" | cut -f 2) + +# header for summary file +echo "#SAMPLE,ASSIGNED COUNTS ${strand}" > "$summary_csv" + +# summarize log file +echo "${sample},${counts_assigned}" >> "$summary_csv" + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# generate counts matrix for all samples + +# merged counts filename +merged_counts_file="${proj_dir}/quant.featurecounts.counts.${strand}.txt" + +# list of all counts files +count_file_list=$(find "$fc_quant_dir" -type f -name "*.counts.${strand}.txt" | LC_ALL=C sort | tr '\n' ' ') + +# merged counts filename with sample included to avoid conflicts during join process +CMD="bash ${code_dir}/scripts/join-many.sh $'\t' 0 $count_file_list > ${merged_counts_file}.${sample}.tmp" +(eval "$CMD") + +# rewrite any existing merged counts table with current table +mv -fv "${merged_counts_file}.${sample}.tmp" "$merged_counts_file" + + +######################### + + + +# end diff --git a/segments/fastq-bam-bismark.sh b/segments/fastq-bam-bismark.sh new file mode 100755 index 0000000..7c7887f --- /dev/null +++ b/segments/fastq-bam-bismark.sh @@ -0,0 +1,270 @@ +#!/bin/bash + + +# Bismark alignment + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ $# -lt 4 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name project_dir sample_name num_threads FASTQ_R1 [FASTQ_R2] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +threads=$3 +fastq_R1=$4 +fastq_R2=$5 + + +######################### + + +# check that inputs exist + +if [ ! -d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! -s "$fastq_R1" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq_R1 DOES NOT EXIST \n" >&2 + exit 1 +fi + +code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +ref_bismark=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-BISMARK); + +if [ ! -d "$ref_bismark" ] || [ ! 
"$ref_bismark" ] ; then + echo -e "\n $script_name ERROR: REF $ref_bismark DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +samples_csv="${proj_dir}/samples.${segment_name}.csv" + +bismark_bam_dir="${proj_dir}/BAM-Bismark" +mkdir -p "$bismark_bam_dir" +bismark_bam_final="${bismark_bam_dir}/${sample}.bam" + +bismark_logs_dir="${proj_dir}/logs-${segment_name}" +mkdir -p "$bismark_logs_dir" +bismark_report_final="${bismark_logs_dir}/${sample}.report.txt" +bismark_nucstats_final="${bismark_logs_dir}/${sample}.nucleotide_stats.txt" + +bismark_report_dir="${proj_dir}/Bismark-report" +mkdir -p "$bismark_report_dir" + +# v0.16.0: "File endings .fastq | .fq | .fastq.gz | .fq.gz are now removed from the output file" +# BISMARK_ID=$(basename $FASTQ1) +# BISMARK_ID="$FASTQ1_SHORT" +bismark_id=$(basename "$fastq_R1") +bismark_id=${bismark_id/.fastq.gz/} + +# adjust for single/paired end +if [ -n "$fastq_R2" ] ; then + suffix_bam="_bismark_bt2_pe.bam" + suffix_report="_bismark_bt2_PE_report.txt" + suffix_nucstats="_bismark_bt2_pe.nucleotide_stats.txt" +else + suffix_bam="_bismark_bt2.bam" + suffix_report="_bismark_bt2_SE_report.txt" + suffix_nucstats="_bismark_bt2.nucleotide_stats.txt" +fi + +bismark_bam_original="${bismark_logs_dir}/${bismark_id}${suffix_bam}" +bismark_report_original="${bismark_logs_dir}/${bismark_id}${suffix_report}" +bismark_nucstats_original="${bismark_logs_dir}/${bismark_id}${suffix_nucstats}" + + +######################### + + +# exit if output exits already + +if [ -s "$bismark_bam_final" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# bismark +# bismark can't use sorted bam at the next step, but other tools may need sorted bam + +# bismark/0.15.0 and bismark/0.16.0 load bowtie2/2.2.6 and samtools/1.3 +module load bismark/0.16.3 + +# "In order to work properly the current working directory must contain the sequence files to be analysed" (as of v0.14) +fastq_dir=$(dirname "$fastq_R1") +cd "$fastq_dir" || exit 1 + +# adjust for single/paired end +if [ -n "$fastq_R2" ] ; then + bismark_flags="--maxins 1000" + fastq_files="-1 $fastq_R1 -2 $fastq_R2" +else + bismark_flags="" + fastq_files="$fastq_R1" +fi + +# number of parallel instances of Bismark to run (3-5 threads per instance depending on parameters) +# "a typical Bismark run will use several cores already (Bismark itself, Bowtie/Bowtie2, Samtools, gzip etc...)" +multicore_flag=$(( threads / 3 )) + +# v0.15.0: "specifying --basename in conjuction with --multicore is currently not supported" + +echo " * BISMARK: $(readlink -f $(which bismark)) " +echo " * BISMARK REF: $ref_bismark " +echo " * FASTQ DIR: $fastq_dir " +echo " * FASTQ R1: $fastq_R1 " +echo " * FASTQ R2: $fastq_R2 " +echo " * BAM ORIGINAL: $bismark_bam_original " +echo " * REPORT ORIGINAL: $bismark_report_original " +echo " * BAM FINAL: $bismark_bam_final " +echo " * REPORT FINAL: $bismark_report_final " + +bash_cmd=" +bismark \ +--bowtie2 \ +--gzip \ +--nucleotide_coverage \ +--multicore $multicore_flag \ +--path_to_bowtie $BOWTIE2_ROOT \ +$bismark_flags \ +--temp_dir $bismark_logs_dir \ +--output_dir $bismark_logs_dir \ +$ref_bismark \ +$fastq_files +" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +sleep 30 + + +######################### + + +# check that output generated + +if [ ! 
-s "$bismark_bam_original" ] ; then + echo -e "\n $script_name ERROR: BAM $bismark_bam_original NOT GENERATED \n" >&2 + exit 1 +fi + +if [ ! -s "$bismark_report_original" ] ; then + echo -e "\n $script_name ERROR: REPORT $bismark_report_original NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# clean up output name + +bash_cmd="mv -v $bismark_bam_original $bismark_bam_final" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +bash_cmd="mv -v $bismark_report_original $bismark_report_final" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +bash_cmd="mv -v $bismark_nucstats_original $bismark_nucstats_final" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +sleep 30 + + +######################### + + +# put reports in report directory for bismark2report +# BAMs for bismark2summary (may not be necessary) + +# adjust for single/paired end +if [ -n "$fastq_R2" ] ; then + bismark_report_bam="${bismark_report_dir}/${sample}${suffix_bam}" + bismark_report_report="${bismark_report_dir}/${sample}${suffix_report}" + bismark_report_nucstats="${bismark_report_dir}/${sample}${suffix_nucstats}" +else + bismark_report_bam="${bismark_report_dir}/${sample}${suffix_bam}" + bismark_report_report="${bismark_report_dir}/${sample}${suffix_report}" + bismark_report_nucstats="${bismark_report_dir}/${sample}${suffix_nucstats}" +fi + +bash_cmd="ln -sv ../$(basename "$bismark_bam_dir")/$(basename "$bismark_bam_final") $bismark_report_bam" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +bash_cmd="cp -v $bismark_report_final $bismark_report_report" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +bash_cmd="cp -v $bismark_nucstats_final $bismark_report_nucstats" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + + +######################### + + +# generate alignment summary + +# header for summary file +echo "#SAMPLE,ALIGN INPUT READS,UNIQUELY ALIGNED,ALIGNMENT RATE" > $summary_csv + +# print the relevant numbers +paste -d ',' \ +<(echo "$sample") \ +<(cat "$bismark_report_final" | grep -m 1 "Sequence.*analysed" | cut -f 2) \ +<(cat "$bismark_report_final" | grep -m 1 "alignments with a unique best hit" | cut -f 2) \ +<(cat "$bismark_report_final" | grep -m 1 "Mapping efficiency" | cut -f 2 | sed 's/ //g') \ +>> $summary_csv + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# add sample and BAM to sample sheet +echo "${sample},${bismark_bam_final}" >> "$samples_csv" + +sleep 30 + +# sort and remove duplicates in place in sample sheet +LC_ALL=C sort -t ',' -k1,1 -u -o "$samples_csv" "$samples_csv" + +sleep 30 + + +######################### + + + +# end diff --git a/segments/fastq-bam-bwa-mem.sh b/segments/fastq-bam-bwa-mem.sh new file mode 100755 index 0000000..95dc615 --- /dev/null +++ b/segments/fastq-bam-bwa-mem.sh @@ -0,0 +1,371 @@ +#!/bin/bash + + +# run BWA-MEM + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ $# -lt 4 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name project_dir sample_name num_threads FASTQ_R1 [FASTQ_R2] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +threads=$3 +fastq_R1=$4 +fastq_R2=$5 + + +######################### + + +# check that inputs exist + +if [ ! 
-d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! -s "$fastq_R1" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq_R1 DOES NOT EXIST \n" >&2 + exit 1 +fi + +code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +ref_bwa=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-BWA); + +if [ ! -s "$ref_bwa" ] || [ ! -n "$ref_bwa" ] ; then + echo -e "\n $script_name ERROR: REF $ref_bwa DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +samples_csv="${proj_dir}/samples.${segment_name}.csv" + +bwa_bam_dir="${proj_dir}/BAM-BWA" +mkdir -p "$bwa_bam_dir" +bam="${bwa_bam_dir}/${sample}.bam" + +bwa_logs_dir="${proj_dir}/logs-${segment_name}" +mkdir -p "$bwa_logs_dir" +bwa_flagstat="${bwa_logs_dir}/${sample}.flagstat.txt" + + +######################### + + +# exit if output exits already + +if [ -s "$bam" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# BWA + +module load bwa/0.7.13 + +sambamba_bin="/ifs/home/id460/bin/sambamba" + +echo " * bwa: $(readlink -f $(which bwa)) " +echo " * bwa version: $(bwa 2>&1 | grep -m 1 'Version') " +echo " * sambamba: $(readlink -f $(which $sambamba_bin)) " +echo " * sambamba version: $($sambamba_bin 2>&1 | head -1) " +echo " * BWA REF: $ref_bwa " +echo " * FASTQ R1: $fastq_R1 " +echo " * FASTQ R2: $fastq_R2 " +echo " * BAM: $bam " + +# step 1: align with BWA-MEM +# step 2: convert SAM to BAM and remove low quality reads +# step 3: sort BAM + +bash_cmd=" +bwa mem \ +-M -v 1 \ +-t $threads \ +-R '@RG\tID:${sample}\tSM:${sample}\tLB:${sample}\tPL:ILLUMINA' \ +$ref_bwa \ +$fastq_R1 $fastq_R2 \ +| \ +$sambamba_bin view \ +--sam-input \ +--nthreads=${threads} \ +--filter='mapping_quality>=10' \ +--format=bam \ +--compression-level=0 \ +/dev/stdin \ +| \ +$sambamba_bin sort \ +--nthreads=${threads} \ +--memory-limit=16GB \ +--out=${bam} \ +/dev/stdin +" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +sleep 30 + + +######################### + + +# check that output generated + +# check that bam file size above 10 kb +du -s $bam +bam_size=$(du -s --apparent-size $bam | cut -f 1) +if [ $bam_size -lt 10 ] ; then + echo -e "\n $script_name ERROR: BAM $bam TOO SMALL \n" >&2 + exit 1 +fi + + +######################### + + +# run flagstat + +bash_cmd="$sambamba_bin flagstat $bam > $bwa_flagstat" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + + +######################### + + +# generate alignment summary + +# get number of input reads (not pairs) +fastq_lines=$(zcat "$fastq_R1" | wc -l) +if [ -n "$fastq_R2" ] ; then + reads_input=$(echo "${fastq_lines}/2" | bc) +else + reads_input=$(echo "${fastq_lines}/4" | bc) +fi + +reads_mapped=$(cat "$bwa_flagstat" | grep -m 1 "mapped (" | cut -d ' ' -f 1) +reads_chimeric=$(cat "$bwa_flagstat" | grep -m 1 "mate mapped to a different chr" | cut -d ' ' -f 1) + +reads_mapped_pct=$(echo "(${reads_mapped}/${reads_input})*100" | bc -l | cut -c 1-4) +reads_mapped_pct="${reads_mapped_pct}%" + +reads_chimeric_pct=$(echo "(${reads_chimeric}/${reads_mapped})*100" | bc -l | cut -c 1-4) +reads_chimeric_pct="${reads_chimeric_pct}%" + +# header for summary file +echo "#SAMPLE,INPUT READS,MAPPED READS (MQ10),MAPPED %,CHIMERIC %" > "$summary_csv" + +# summarize log file +echo 
"${sample},${reads_input},${reads_mapped},${reads_mapped_pct},${reads_chimeric_pct}" >> "$summary_csv" + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# add sample and BAM to sample sheet +echo "${sample},${bam}" >> "$samples_csv" + +sleep 30 + +# sort and remove duplicates in place in sample sheet +LC_ALL=C sort -t ',' -k1,1 -u -o "$samples_csv" "$samples_csv" + +sleep 30 + + +######################### + + + +# end + + + +exit + + + + + + +######################### + + +# output +DD_DIR=${BASE_DIR}/BAM-DD +BAM_DD=${DD_DIR}/${ID}.dd.bam + +mkdir -p $DD_DIR + +print_msg "REMOVE DUPS - $ID" + +CMD="bash ${HOME}/pipeline/wrappers/picard-mark-dups.sh $BASE_DIR $ID $BAM_BWA $BAM_DD false" +printf "\n\n $CMD \n\n" +$CMD + +BAM=$BAM_DD + + +######################### + + +# check that BAM generated + +if [ ! -s $BAM ] +then + printf "\n\n ERROR! $BAM DOES NOT EXIST \n\n" + exit 1 +fi + + +######################### + + +print_msg "SAMTOOLS FLAGSTAT - $ID" + +bash ${HOME}/pipeline/wrappers/samtools-flagstat.sh $BASE_DIR $ID $BAM + + +######################### + + +# Picard CollectInsertSizeMetrics + +print_msg "INSERT SIZE METRICS - $ID" + +# qsub-if-needed.sh +bash ${HOME}/pipeline/helpers/qsub-if-needed.sh $BASE_DIR ${ID}.txt picard-insert-size $GENOME $BASE_DIR $ID $BAM $REF_FASTA + + +######################### + + +# try to find the bed file again in case it was placed in the directory after the run was started + +if [ ! -s $BED ] || [ ! $BED ] +then + BED=$(find $BASE_DIR -type f -name "*.bed") +fi + +if [ ! -s $BED ] || [ ! $BED ] +then + BED=0 +fi + + +######################### + + +COV_DD_DIR=${BASE_DIR}/COV-DD +mkdir -p $COV_DD_DIR + +if [ -f ${COV_DD_DIR}/${ID}.cov.ref.sample_summary ] +then + + print_msg "SKIP COVERAGE - $ID" + +else + + print_msg "GATK COVERAGE - $ID" + + bash /ifs/home/id460/pipeline/wrappers/gatk-cov-all.sh $THREADS $REF_FASTA $BED $ID $BAM $COV_DD_DIR + +fi + + +######################### + + +# GATK requires several R libraries (using the default R 3.0.2 probably) +# install.packages("ggplot2") +# install.packages("gsalib") +# install.packages("reshape") + +if [ $GENOME == "hg19" ] +then + + print_msg "GATK REALIGNMENT/RECALIBRATION (hg19)" + + bash /ifs/home/id460/pipeline/gatk-ra-rc-hg19.sh $ID $BAM $BASE_DIR $THREADS + +elif [ $GENOME == "mm10" ] +then + + print_msg "GATK REALIGNMENT/RECALIBRATION (mm10)" + + bash /ifs/home/id460/pipeline/gatk-ra-rc-mm10.sh $ID $BAM $BASE_DIR $THREADS + +elif [ $GENOME == "dm3" ] +then + + print_msg "GATK REALIGNMENT/RECALIBRATION (dm3)" + + bash /ifs/home/id460/pipeline/gatk-ra-rc-dm3.sh $ID $BAM $BASE_DIR $THREADS + + # realignment only (if chrM only) + # bash /ifs/home/id460/pipeline/gatk-ra.sh $THREADS $GENOME $ID $BAM_DD $BASE_DIR + +else + + print_msg "GENERIC GATK REALIGNMENT (NO RECALIBRATION, SINCE NO SNP INFO)" + + bash /ifs/home/id460/pipeline/gatk-ra.sh $THREADS $GENOME $ID $BAM $BASE_DIR + +fi + + +######################### + + +# combine all summaries + +# in case files do not exist (if re-running analysis, for example) +touch -a summary.trim.txt +touch -a summary.flagstat.txt + +# (should change from paste to join) +# module load coreutils/8.24 +paste \ +${BASE_DIR}/summary.trim.txt \ +${BASE_DIR}/summary.flagstat.txt \ +${BASE_DIR}/COV-DD/__cov.txt \ +${BASE_DIR}/COV-DD-RA-RC/_cov_samples.txt \ +> ${BASE_DIR}/summary-combined.txt + + +######################### + 
+date + + +# end + + diff --git a/segments/fastq-bam-star.sh b/segments/fastq-bam-star.sh new file mode 100755 index 0000000..803f8c2 --- /dev/null +++ b/segments/fastq-bam-star.sh @@ -0,0 +1,287 @@ +#!/bin/bash + + +# run STAR + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ $# -lt 4 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name project_dir sample_name num_threads FASTQ_R1 [FASTQ_R2] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +threads=$3 +fastq_R1=$4 +fastq_R2=$5 + + +######################### + + +# check that inputs exist + +if [ ! -d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! -s "$fastq_R1" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq_R1 DOES NOT EXIST \n" >&2 + exit 1 +fi + +code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +ref_star=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-STAR); + +if [ ! -s "${ref_star}/SA" ] ; then + echo -e "\n $script_name ERROR: REF $ref_star DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +samples_csv="${proj_dir}/samples.${segment_name}.csv" + +star_bam_dir="${proj_dir}/BAM-STAR" +mkdir -p "$star_bam_dir" +bam="${star_bam_dir}/${sample}.bam" + +star_quant_dir="${proj_dir}/quant-STAR" +mkdir -p "$star_quant_dir" +counts_txt="${star_quant_dir}/${sample}.txt" + +star_logs_dir="${proj_dir}/logs-${segment_name}" +mkdir -p "$star_logs_dir" +star_prefix="${star_logs_dir}/${sample}." 
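+# note: STAR appends its own output names to --outFileNamePrefix (e.g. the
+# Log.final.out and ReadsPerGene.out.tab files used below), which is why the
+# prefix ends with a dot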
+ + +######################### + + +# exit if output exits already + +if [ -s "$bam" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + +if [ -s "${star_prefix}.bam" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# STAR + +module unload samtools +module load samtools/1.3 +# module load star/2.5.0c has issues +module load star/2.4.5a + +echo " * STAR: $(readlink -f $(which STAR)) " +echo " * samtools: $(readlink -f $(which samtools)) " +echo " * STAR REF: $ref_star " +echo " * FASTQ R1: $fastq_R1 " +echo " * FASTQ R2: $fastq_R2 " +echo " * BAM: $bam " + +# change dir because star writes a log file to pwd +cd "$proj_dir" + +# --outFilterType BySJout - ENCODE standard option +# --outSAMmapqUnique - int: 0 to 255: the MAPQ value for unique mappers + +# relevant errors: +# SeqAnswers: "HTseq cannot deal with jM:B:c,-1 and jI:B:i,-1 SAM attributes which are output if you use (non-default) --outSAMmode Full" +# Picard says STAR-sorted BAM "is not coordinate sorted", so using samtools for sorting + +bash_cmd=" +STAR \ +--runThreadN $threads \ +--genomeDir $ref_star \ +--genomeLoad NoSharedMemory \ +--outFilterMismatchNoverLmax 0.05 \ +--outFilterMultimapNmax 1 \ +--outFilterType BySJout \ +--outSAMstrandField intronMotif \ +--outSAMattributes NH HI AS nM NM MD XS \ +--outSAMmapqUnique 60 \ +--twopassMode Basic \ +--readFilesCommand zcat \ +--readFilesIn $fastq_R1 $fastq_R2 \ +--outFileNamePrefix $star_prefix \ +--quantMode GeneCounts \ +--outSAMtype BAM Unsorted \ +--outStd BAM_Unsorted | \ +samtools sort -@ $threads -m 4G -T $sample -o $bam - +" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +sleep 30 + +# index bam +bash_cmd="samtools index $bam" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + + +######################### + + +# check that output generated + +# check if BAM file is present +if [ ! -s "$bam" ] ; then + echo -e "\n $script_name ERROR: BAM $bam NOT GENERATED \n" >&2 + exit 1 +fi + +# check that bam file size above 10 kb +du -s $bam +bam_size=$(du -s --apparent-size $bam | cut -f 1) +if [ $bam_size -lt 10 ] ; then + echo -e "\n $script_name ERROR: BAM $bam TOO SMALL \n" >&2 + exit 1 +fi + +# check if gene counts file is present +if [ ! 
-s "${star_prefix}ReadsPerGene.out.tab" ] ; then + echo -e "\n $script_name ERROR: COUNTS FILE ${star_prefix}ReadsPerGene.out.tab NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# STAR counts +# not using STAR counts later, since it's hard to filter and they have gene ids instead of names + +# STAR outputs read counts per gene into ReadsPerGene.out.tab file with 4 columns which correspond to strandedness: +# column 1: gene ID +# column 2: counts for unstranded RNA-seq +# column 3: counts for the 1st read strand aligned with RNA (htseq-count option -s yes) +# column 4: counts for the 2nd read strand aligned with RNA (htseq-count option -s reverse) + +# move counts file to a separate bam-only directory +CMD="mv -v ${star_prefix}ReadsPerGene.out.tab $counts_txt" +echo "CMD: $CMD" +eval "$CMD" + +# strand: +# fwd | transcript | cufflinks "fr-secondstrand" | htseq "yes" | picard "FIRST_READ" +# rev | rev comp of transcript | cufflinks "fr-firststrand" | htseq "reverse" | picard "SECOND_READ" + +# top 1000 genes (useful to determine strandness) +echo -e "#gene_id,unstr,fwd,rev" > "${star_quant_dir}/${sample}.top1000.csv" +cat "$counts_txt" | LC_ALL=C sort -k2,2nr | head -n 1000 | tr '\t' ',' >> "${star_quant_dir}/${sample}.top1000.csv" + + +######################### + + +# determine strand + +# get total counts for each strand +counts_unstr=$(cat $counts_txt | grep -v 'N_' | awk -F $'\t' '{sum+=$2} END {print sum}') +echo "counts unstr: $counts_unstr" +counts_fwd=$(cat $counts_txt | grep -v 'N_' | awk -F $'\t' '{sum+=$3} END {print sum}') +echo "counts fwd: $counts_fwd" +counts_rev=$(cat $counts_txt | grep -v 'N_' | awk -F $'\t' '{sum+=$4} END {print sum}') +echo "counts rev: $counts_rev" + +# perform some sanity checks +if [ "$counts_unstr" -lt 1000 ] || [ "$counts_fwd" -lt 10 ] || [ "$counts_rev" -lt 10 ] ; then + echo -e "\n $script_name ERROR: LOW COUNTS \n" >&2 + exit 1 +fi + +lib_strand="unstr" + +if [ "$(echo "${counts_fwd}/${counts_rev}" | bc)" -gt 8 ] ; then + lib_strand="fwd" +fi + +if [ "$(echo "${counts_rev}/${counts_fwd}" | bc)" -gt 8 ] ; then + lib_strand="rev" +fi + +# set experiment strand (ignored if already specified) +exp_strand=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" EXP-STRAND "$lib_strand"); + +echo "sample strand: $lib_strand" +echo "experiment strand: $exp_strand" + + +######################### + + +# generate alignment summary + +# header for summary file +echo "#SAMPLE,INPUT READS,UNIQUELY MAPPED,MULTI-MAPPED,UNIQUELY MAPPED %,MULTI-MAPPED %" > $summary_csv + +# print the relevant numbers from log file +star_log_final="${star_prefix}Log.final.out" +paste -d ',' \ +<(echo "$sample") \ +<(cat "$star_log_final" | grep "Number of input reads" | head -1 | tr -d "[:blank:]" | cut -d "|" -f 2) \ +<(cat "$star_log_final" | grep "Uniquely mapped reads number" | head -1 | tr -d "[:blank:]" | cut -d "|" -f 2) \ +<(cat "$star_log_final" | grep "Number of reads mapped to too many loci" | head -1 | tr -d "[:blank:]" | cut -d "|" -f 2) \ +<(cat "$star_log_final" | grep "Uniquely mapped reads %" | head -1 | tr -d "[:blank:]" | cut -d "|" -f 2) \ +<(cat "$star_log_final" | grep "% of reads mapped to too many loci" | head -1 | tr -d "[:blank:]" | cut -d "|" -f 2) \ +>> $summary_csv + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# delete tmp directories (_STARtmp should be empty 
at this point) +rm -rfv ${star_prefix}_STAR* + + +######################### + + +# add sample and BAM to sample sheet +echo "${sample},${bam}" >> "$samples_csv" + +sleep 30 + +# sort and remove duplicates in place in sample sheet +LC_ALL=C sort -t ',' -k1,1 -u -o "$samples_csv" "$samples_csv" + +sleep 30 + + +######################### + + + +# end diff --git a/segments/fastq-fastq-clean.sh b/segments/fastq-fastq-clean.sh new file mode 100755 index 0000000..7ab7dc3 --- /dev/null +++ b/segments/fastq-fastq-clean.sh @@ -0,0 +1,182 @@ +#!/bin/bash + + +# merge fastqs or create symlinks to originals if no need to merge + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 2 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name [project dir] [sample name] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$(readlink -f $1) +sample=$2 + +# check if input exists +if [ ! -d "$proj_dir" ] || [ ! -n "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: PROJ DIR $proj_dir DOES NOT EXIST \n" + exit 1 +fi + +# sample sheet +samples_csv="${proj_dir}/samples.fastq-raw.csv" +if [ ! -s "$samples_csv" ] || [ ! -n "$samples_csv" ] ; then + echo -e "\n $script_name ERROR: SAMPLES CSV $samples_csv DOES NOT EXIST \n" + exit 1 +fi +samples_csv_clean="${proj_dir}/samples.${segment_name}.csv" + +# output summary +summary_dir="${proj_dir}/summary" +mkdir -p "${summary_dir}" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +# output files +fastq_clean_dir="${proj_dir}/FASTQ-CLEAN" +mkdir -p "$fastq_clean_dir" +fastq_R1_clean="${fastq_clean_dir}/${sample}_R1.fastq.gz" +fastq_R2_clean="${fastq_clean_dir}/${sample}_R2.fastq.gz" + +# considered initially +# fastq_R1_clean=$(bash ${CODE_DIR}/scripts/get-segment-filename.sh $proj_dir fastq-to-fastq-clean-r1 $sample) +# fastq_R2_clean=$(bash ${CODE_DIR}/scripts/get-segment-filename.sh $proj_dir fastq-to-fastq-clean-r2 $sample) + +# check if output already exists +if [ -s "$fastq_R1_clean" ] || [ -s "$fastq_R2_clean" ]; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + +# get number of files (or file pairs) for the sample (to decide if files need to be merged) +num_files=$(grep -c "^${sample}," "${samples_csv}") + +# scan for files that belong to the sample +grep "^${sample}," "${samples_csv}" | LC_ALL=C sort | while read -r LINE ; do + fastq_R1=$(echo "$LINE" | cut -d "," -f 2) + fastq_R2=$(echo "$LINE" | cut -d "," -f 3) + fastq_R1=$(readlink -f "$fastq_R1") + fastq_R2=$(readlink -f "$fastq_R2") + + # check that R1 file exists and not set to null + if [ ! -e "$fastq_R1" ] || [ ! -n "$fastq_R1" ] ; then + echo -e "\n $script_name ERROR: FASTQ R1 $fastq_R1 DOES NOT EXIST \n" + exit 1 + fi + + # check that R2 file exists if specified + if [ ! 
-e "$fastq_R2" ] && [ -n "$fastq_R2" ] ; then + echo -e "\n $script_name ERROR: FASTQ R2 $fastq_R2 DOES NOT EXIST \n" + exit 1 + fi + + if [ "$num_files" -gt 1 ] ; then + + # merge multple FASTQs + + bash_cmd="cat $fastq_R1 >> $fastq_R1_clean" + echo "CMD: $bash_cmd" + eval "$bash_cmd" + + if [ -n "$fastq_R2" ] ; then + bash_cmd="cat $fastq_R2 >> $fastq_R2_clean" + echo "CMD: $bash_cmd" + eval "$bash_cmd" + fi + + elif [ "$num_files" -eq 1 ] ; then + + # symlink to the original FASTQ if only one + + bash_cmd="ln -s $fastq_R1 $fastq_R1_clean" + echo "CMD: $bash_cmd" + eval "$bash_cmd" + + if [ -n "$fastq_R2" ] ; then + bash_cmd="ln -s $fastq_R2 $fastq_R2_clean" + echo "CMD: $bash_cmd" + eval "$bash_cmd" + fi + + fi + + sleep 30 + + date + +done + + +######################### + + +# check if output exists + +if [ ! -s "$fastq_R1_clean" ] || [ ! -n "$fastq_R1_clean" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq_R1_clean DOES NOT EXIST \n" + exit 1 +fi + + +######################### + + +# count number of reads in merged FASTQs + +lines_R1=$(zcat "$fastq_R1_clean" | wc -l) +reads_R1=$(echo "${lines_R1}/4" | bc) + +if [ -s "$fastq_R2_clean" ] ; then + lines_R2=$(zcat "$fastq_R2_clean" | wc -l) + reads_R2=$(echo "${lines_R2}/4" | bc) +else + reads_R2="0" + fastq_R2_clean="" +fi + +echo "FASTQ R1: $fastq_R1_clean" +echo "FASTQ R2: $fastq_R2_clean" +echo "READS R1: $reads_R1" +echo "READS R2: $reads_R2" + + +######################### + + +# generate summary + +echo "#SAMPLE,R1 RAW READS,R2 RAW READS" > "$summary_csv" +echo "${sample},${reads_R1},${reads_R2}" >> "$summary_csv" + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# add sample and FASTQ to sample sheet +echo "${sample},${fastq_R1_clean},${fastq_R2_clean}" >> "$samples_csv_clean" + +sleep 30 + +# sort and remove duplicates in place in sample sheet +LC_ALL=C sort -t ',' -k1,1 -u -o "$samples_csv_clean" "$samples_csv_clean" + +sleep 30 + + +######################### + + + +# end diff --git a/segments/fastq-fastq-trim-trimgalore.sh b/segments/fastq-fastq-trim-trimgalore.sh new file mode 100755 index 0000000..aac6de9 --- /dev/null +++ b/segments/fastq-fastq-trim-trimgalore.sh @@ -0,0 +1,199 @@ +#!/bin/bash + + +# run Trim Galore (for MspI digested RRBS samples) + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ $# -lt 4 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name project_dir sample_name run_type FASTQ_R1 [FASTQ_R2] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +run_type=$3 +fastq_R1=$4 +fastq_R2=$5 + + +######################### + + +# check that inputs exist + +if [ ! -d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! 
-s "$fastq_R1" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq_R1 DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +samples_csv="${proj_dir}/samples.${segment_name}.csv" + +fastq_trim_dir="${proj_dir}/FASTQ-TRIMMED" +mkdir -p "$fastq_trim_dir" +fastq_R1_trim="${fastq_trim_dir}/${sample}_R1.trim.fastq.gz" +fastq_R2_trim="${fastq_trim_dir}/${sample}_R2.trim.fastq.gz" + +trim_galore_logs_dir="${proj_dir}/logs-trimgalore" +mkdir -p "$trim_galore_logs_dir" + +# adjust for single/paired end +if [ -n "$fastq_R2" ] ; then + fastq_R1_trim_original=${trim_galore_logs_dir}/$(basename $fastq_R1 | sed 's/.fastq.gz/_val_1.fq.gz/') + fastq_R2_trim_original=${trim_galore_logs_dir}/$(basename $fastq_R2 | sed 's/.fastq.gz/_val_2.fq.gz/') + trim_galore_log=${trim_galore_logs_dir}/$(basename $fastq_R2)_trimming_report.txt +else + fastq_R1_trim_original=${trim_galore_logs_dir}/$(basename $fastq_R1 | sed 's/.fastq.gz/_trimmed.fq.gz/') + trim_galore_log=${trim_galore_logs_dir}/$(basename $fastq_R1)_trimming_report.txt + fastq_R2_trim="" +fi + + +######################### + + +# exit if output exits already + +if [ -s "$fastq_R1_trim" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# trim galore + +module load trim-galore/0.4.1 + +echo " * trim_galore: $(readlink -f $(which trim_galore)) " +echo " * trim_galore version: $(trim_galore --version | grep 'version' | tr -s '[:blank:]') " +echo " * cutadapt: $(readlink -f $(which cutadapt)) " +echo " * cutadapt version: $(cutadapt --version) " +echo " * FASTQ R1: $fastq_R1 " +echo " * FASTQ R2: $fastq_R2 " +echo " * FASTQ R1 TRIMMED: $fastq_R1_trim " +echo " * FASTQ R2 TRIMMED: $fastq_R2_trim " + +# --rrbs Specifies that the input file was an MspI digested RRBS sample (recognition site: CCGG) + +# optional fastqc +# --fastqc --fastqc_args ' --threads $THREADS --outdir $FASTQC_DIR ' \ + +# check for run type +if [ "$run_type" == "rrbs" ] ; then + trim_galore_flags="--length 30 --rrbs" +else + echo -e "\n $script_name unknown run type $run_type \n" >&2 + exit 1 +fi + +# adjust for single/paired end +if [ -n "$fastq_R2" ] ; then + trim_galore_flags="$trim_galore_flag --paired $fastq_R1 $fastq_R2" +else + trim_galore_flags="$trim_galore_flag $fastq_R1" +fi + +# trim_galore +bash_cmd=" +trim_galore \ +--output_dir $trim_galore_logs_dir \ +$trim_galore_flags +" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +sleep 30 + +# clean up file names + +mv -fv "$fastq_R1_trim_original" "$fastq_R1_trim" + +if [ -n "$fastq_R2" ] ; then + mv -fv "$fastq_R2_trim_original" "$fastq_R2_trim" +fi + +sleep 30 + + +######################### + + +# check that output generated + +if [ ! 
-s "$fastq_R1_trim" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq_R1_trim NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# generate summary + +# extract the relevant part of the log +reads_input=$(cat "$trim_galore_log" | grep "sequences processed in total" | tail -1 | cut -d " " -f 1) + +# single-end and paired-end trim logs have different formatting, so just manually counting number of lines +reads_surviving=$(zcat $fastq_R1_trim | wc -l) +reads_surviving=$(echo "${reads_surviving}/4" | bc) + +reads_surviving_pct=$(echo "(${reads_surviving}/${reads_input})*100" | bc -l | cut -c 1-4) +reads_surviving_pct="${reads_surviving_pct}%" + +# header for summary file +echo "#SAMPLE,TRIM INPUT READS,TRIM SURVIVING READS,TRIM SURVIVING READS %" > "$summary_csv" + +# summarize log file +echo "${sample},${reads_input},${reads_surviving},${reads_surviving_pct}" >> "$summary_csv" + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# add sample and FASTQ to sample sheet +echo "${sample},${fastq_R1_trim},${fastq_R2_trim}" >> "$samples_csv" + +sleep 30 + +# sort and remove duplicates in place in sample sheet +LC_ALL=C sort -t ',' -k1,1 -u -o "$samples_csv" "$samples_csv" + +sleep 30 + + +######################### + + + +# end diff --git a/segments/fastq-fastq-trim-trimmomatic.sh b/segments/fastq-fastq-trim-trimmomatic.sh new file mode 100755 index 0000000..c3bef39 --- /dev/null +++ b/segments/fastq-fastq-trim-trimmomatic.sh @@ -0,0 +1,196 @@ +#!/bin/bash + + +# run Trimmomatic + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ $# -lt 4 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name project_dir sample_name num_threads FASTQ_R1 [FASTQ_R2] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +threads=$3 +fastq_R1=$4 +fastq_R2=$5 + + +######################### + + +# check that inputs exist + +if [ ! -d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! 
-s "$fastq_R1" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq_R1 DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.${segment_name}.csv" + +samples_csv="${proj_dir}/samples.${segment_name}.csv" + +fastq_trim_dir="${proj_dir}/FASTQ-TRIMMED" +mkdir -p "$fastq_trim_dir" +fastq_R1_trim="${fastq_trim_dir}/${sample}_R1.trim.fastq.gz" +fastq_R2_trim="${fastq_trim_dir}/${sample}_R2.trim.fastq.gz" +fastq_R1_trim_unpaired="${fastq_trim_dir}/${sample}_R1.unpaired.fastq.gz" +fastq_R2_trim_unpaired="${fastq_trim_dir}/${sample}_R2.unpaired.fastq.gz" + +trimmomatic_logs_dir="${proj_dir}/logs-trimmomatic" +mkdir -p "$trimmomatic_logs_dir" +trimmomatic_log="${trimmomatic_logs_dir}/${sample}.txt" + + +######################### + + +# exit if output exits already + +if [ -s "$fastq_R1_trim" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# trimmomatic + +module unload java +module load java/1.7 +module load trimmomatic/0.33 + +trimmomatic_jar="${TRIMMOMATIC_ROOT}/trimmomatic-0.33.jar" + +if [ -n "$fastq_R2" ] ; then + run_type_arg="PE" + files_arg="$fastq_R1 $fastq_R2 $fastq_R1_trim $fastq_R1_trim_unpaired $fastq_R2_trim $fastq_R2_trim_unpaired" +else + run_type_arg="SE" + files_arg="$fastq_R1 $fastq_R1_trim" + fastq_R2_trim="" + fastq_R2_trim_unpaired="" +fi + +echo " * trimmomatic: $trimmomatic_jar " +echo " * RUN TYPE: $run_type_arg " +echo " * FASTQ R1: $fastq_R1 " +echo " * FASTQ R2: $fastq_R2 " +echo " * FASTQ R1 TRIMMED: $fastq_R1_trim " +echo " * FASTQ R2 TRIMMED: $fastq_R2_trim " +echo " * FASTQ R1 TRIMMED UNPAIRED: $fastq_R1_trim_unpaired " +echo " * FASTQ R2 TRIMMED UNPAIRED: $fastq_R2_trim_unpaired " + +bash_cmd=" +java -Xms16G -Xmx16G -jar $trimmomatic_jar \ +$run_type_arg \ +-threads $threads \ +$files_arg \ +ILLUMINACLIP:/ifs/home/id460/ref/contaminants/trimmomatic.fa:2:30:10:1:true \ +TRAILING:5 SLIDINGWINDOW:4:15 MINLEN:35 \ +2> $trimmomatic_log +" +echo "CMD: $bash_cmd" +eval "$bash_cmd" + +sleep 30 + +# delete unpaired files +rm -fv $fastq_R1_trim_unpaired +rm -fv $fastq_R2_trim_unpaired + + +######################### + + +# check that output generated + +if [ ! 
-s "$fastq_R1_trim" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq_R1_trim NOT GENERATED \n" >&2 + exit 1 +fi + +if grep -q -i "error" "$trimmomatic_log" ; then + echo -e "\n $script_name ERROR: LOG $trimmomatic_log CONTAINS ERROR \n" >&2 + grep -i "error" "$trimmomatic_log" + rm -fv $fastq_R1_trim + rm -fv $fastq_R2_trim + exit 1 +fi + + +######################### + + +# generate summary + +# extract the relevant part of the log + +if [ -n "$fastq_R2" ] ; then + total_reads_col="4" + surviving_reads_col="7" +else + total_reads_col="3" + surviving_reads_col="5" +fi + +reads_input=$(cat $trimmomatic_log | grep "Input Read" | cut -d ' ' -f $total_reads_col) +reads_surviving=$(cat $trimmomatic_log | grep "Input Read" | cut -d ' ' -f $surviving_reads_col) + +reads_surviving_pct=$(echo "(${reads_surviving}/${reads_input})*100" | bc -l | cut -c 1-4) +reads_surviving_pct="${reads_surviving_pct}%" + +# header for summary file +echo "#SAMPLE,TRIM INPUT READS,TRIM SURVIVING READS,TRIM SURVIVING READS %" > "$summary_csv" + +# summarize log file +echo "${sample},${reads_input},${reads_surviving},${reads_surviving_pct}" >> "$summary_csv" + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.${segment_name}.csv" + + +######################### + + +# add sample and FASTQ to sample sheet +echo "${sample},${fastq_R1_trim},${fastq_R2_trim}" >> "$samples_csv" + +sleep 30 + +# sort and remove duplicates in place in sample sheet +LC_ALL=C sort -t ',' -k1,1 -u -o "$samples_csv" "$samples_csv" + +sleep 30 + + +######################### + + + +# end diff --git a/segments/fastq-qc-fastqscreen.sh b/segments/fastq-qc-fastqscreen.sh new file mode 100755 index 0000000..6eae89d --- /dev/null +++ b/segments/fastq-qc-fastqscreen.sh @@ -0,0 +1,140 @@ +#!/bin/bash + + +# run fastq_screen + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ ! $# == 3 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name [project dir] [sample] [FASTQ] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +fastq=$3 + + +######################### + + +# check that inputs exist + +if [ ! -d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! -s "$fastq" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq DOES NOT EXIST \n" >&2 + exit 1 +fi + +CODE_DIR=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +FASTQSCREEN_CONF=$(bash ${CODE_DIR}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-FASTQSCREEN); + +if [ ! 
-s "$FASTQSCREEN_CONF" ] ; then + echo -e "\n $script_name ERROR: CONF $FASTQSCREEN_CONF DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +FASTQ_SCREEN_DIR="${proj_dir}/QC-fastqscreen" +mkdir -p "$FASTQ_SCREEN_DIR" + +FASTQ_SCREEN_TXT="${FASTQ_SCREEN_DIR}/${fastq/*\//}_screen.txt" +FASTQ_SCREEN_TXT="${FASTQ_SCREEN_TXT/.fastq.gz/}" + + +######################### + + +# exit if output exits already + +if [ -s "$FASTQ_SCREEN_TXT" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# fastq_screen +# ignore paired reads (in case of rna-seq, paired reads may be too far apart and will not align) + +bowtie2_bin=$(cat "$FASTQSCREEN_CONF" | grep "^BOWTIE2" | head -1 | tr '[:space:]' '\t' | tr -s '\t' | cut -f 2) +fastqscreen_bin="/ifs/home/id460/software/fastq_screen_v0.5.2/fastq_screen" + +echo " * fastq_screen: $fastqscreen_bin " +echo " * fastq_screen version: $($fastqscreen_bin --version) " +echo " * bowtie2: $bowtie2_bin " +echo " * bowtie2 version: $($bowtie2_bin --version | head -1) " +echo " * FASTQ: $fastq " +echo " * OUT TXT: $FASTQ_SCREEN_TXT " + +CMD=" +$fastqscreen_bin \ +--aligner bowtie2 \ +--subset 1000000 \ +--conf $FASTQSCREEN_CONF \ +--outdir $FASTQ_SCREEN_DIR \ +$fastq +" +echo "CMD: $CMD" +$CMD + + +######################### + + +# check that output generated + +if [ ! -s "$FASTQ_SCREEN_TXT" ] ; then + echo -e "\n $script_name ERROR: TXT $FASTQ_SCREEN_TXT NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# summary + +# combine charts into a single pdf +COMBINED_PDF="${proj_dir}/summary.fastqscreen.pdf" +rm -f "$COMBINED_PDF" +convert "${FASTQ_SCREEN_DIR}/*screen.png" "$COMBINED_PDF" + + +# combine charts into a single png + +COMBINED_PNG_3W="${proj_dir}/summary.fastqscreen.3w.png" +COMBINED_PNG_4W="${proj_dir}/summary.fastqscreen.4w.png" + +rm -f "$COMBINED_PNG_3W" +rm -f "$COMBINED_PNG_4W" + +# -geometry +20+20 = 20px x and y padding +# -tile 3x = 3 images wide +montage -geometry +20+20 -tile 3x "${FASTQ_SCREEN_DIR}/*screen.png" "$COMBINED_PNG_3W" +montage -geometry +20+20 -tile 4x "${FASTQ_SCREEN_DIR}/*screen.png" "$COMBINED_PNG_4W" + + +######################### + + + +# end diff --git a/segments/fastq-quant-rsem.sh b/segments/fastq-quant-rsem.sh new file mode 100755 index 0000000..f3f2b9b --- /dev/null +++ b/segments/fastq-quant-rsem.sh @@ -0,0 +1,249 @@ +#!/bin/bash + + +# run RSEM + + +# script filename +script_name=$(basename "${BASH_SOURCE[0]}") +segment_name=${script_name/%.sh/} +echo -e "\n ========== SEGMENT: $segment_name ========== \n" >&2 + +# check for correct number of arguments +if [ $# -lt 5 ] ; then + echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 + echo -e "\n USAGE: $script_name [project dir] [sample] [threads] [strand] [FASTQ R1] [FASTQ R2] \n" >&2 + exit 1 +fi + +# arguments +proj_dir=$1 +sample=$2 +threads=$3 +strand=$4 +fastq_R1=$5 +fastq_R2=$6 + + +######################### + + +# check that inputs exist + +if [ ! -d "$proj_dir" ] ; then + echo -e "\n $script_name ERROR: DIR $proj_dir DOES NOT EXIST \n" >&2 + exit 1 +fi + +if [ ! -s "$fastq_R1" ] ; then + echo -e "\n $script_name ERROR: FASTQ $fastq_R1 DOES NOT EXIST \n" >&2 + exit 1 +fi + +code_dir=$(dirname "$(dirname "${BASH_SOURCE[0]}")") +rsem_genome=$(bash ${code_dir}/scripts/get-set-setting.sh "${proj_dir}/settings.txt" REF-RSEM); + +if [ ! 
-s "${rsem_genome}.transcripts.fa" ] ; then + echo -e "\n $script_name ERROR: GENOME ${rsem_genome}.transcripts.fa DOES NOT EXIST \n" >&2 + exit 1 +fi + + +######################### + + +# settings and files + +summary_dir="${proj_dir}/summary" +mkdir -p "$summary_dir" +summary_csv="${summary_dir}/${sample}.rsem.${strand}.csv" + +rsem_quant_dir="${proj_dir}/quant-RSEM" +mkdir -p "$rsem_quant_dir" +rsem_expcounts_txt="${rsem_quant_dir}/${sample}.expcounts.${strand}.txt" +rsem_tpm_txt="${rsem_quant_dir}/${sample}.TPM.${strand}.txt" +rsem_fpkm_txt="${rsem_quant_dir}/${sample}.FPKM.${strand}.txt" + +rsem_logs_dir="${proj_dir}/logs-RSEM" +mkdir -p "$rsem_logs_dir" +# RSEM_RAW_SUMMARY="${rsem_logs_dir}/${sample}.${strand}.stats.txt" +rsem_raw_genes_results="${rsem_logs_dir}/${sample}.${strand}.genes.results" +rsem_stat_cnt="${rsem_logs_dir}/${sample}.${strand}.stat/${sample}.${strand}.cnt" + +# adjust for single/paired end +if [ -n "$fastq_R2" ] ;then + fastq_flag="--paired-end $fastq_R1 $fastq_R2" +else + fastq_flag="$fastq_R1" +fi + +# adjust for strand +# fwd | transcript | cufflinks "fr-secondstrand" | htseq "yes" | picard "FIRST_READ" +# rev | rev comp of transcript | cufflinks "fr-firststrand" | htseq "reverse" | picard "SECOND_READ" +if [ "$strand" == "unstr" ] ; then + strand_flag="" +elif [ "$strand" == "fwd" ] ; then + strand_flag="--forward-prob 1" +elif [ "$strand" == "rev" ] ; then + strand_flag="--forward-prob 0" +else + echo -e "\n $script_name ERROR: INCORRECT strand $strand SPECIFIED \n" >&2 + exit 1 +fi + + +######################### + + +# exit if output exits already + +if [ -s "$rsem_expcounts_txt" ] ; then + echo -e "\n $script_name SKIP SAMPLE $sample \n" >&2 + exit 1 +fi + + +######################### + + +# RSEM + +module load bowtie2/2.2.6 + +rsem_bin_dir="/ifs/home/id460/software/RSEM-1.2.31/bin" + +echo " * RSEM: ${rsem_bin_dir}/rsem-calculate-expression " +echo " * RSEM version: $(${rsem_bin_dir}/rsem-calculate-expression --version) " +echo " * bowtie2: $(readlink -f $(which bowtie2)) " +echo " * bowtie2 version: $(bowtie2 --version | head -1) " +echo " * RSEM REF: $rsem_genome " +echo " * FASTQ R1: $fastq_R1 " +echo " * FASTQ R2: $fastq_R2 " +echo " * COUNTS : $rsem_expcounts_txt " +echo " * TPM : $rsem_tpm_txt " +echo " * FPKM : $rsem_fpkm_txt " + +CMD=" +${rsem_bin_dir}/rsem-calculate-expression \ +--quiet \ +--no-bam-output \ +--bowtie2 \ +--num-threads $threads \ +--estimate-rspd \ +$strand_flag \ +$fastq_flag \ +$rsem_genome \ +${rsem_logs_dir}/${sample}.${strand} +" +echo "CMD: $CMD" +eval "$CMD" + +sleep 30 + + +######################### + + +# check that output generated + +if [ ! 
-s "$rsem_raw_genes_results" ] ; then + echo -e "\n $script_name ERROR: RESULTS FILE $rsem_raw_genes_results NOT GENERATED \n" >&2 + exit 1 +fi + + +######################### + + +# clean up the full output + +# all results +# cat $RSEM_GENES_RESULTS | sed 's/gene_id/000_gene_id/' | cut -f 1,5,6,7 | LC_ALL=C sort -k1,1 > $RSEM_GENES_RESULTS_TXT + +echo -e "#GENE\t${sample}" > "$rsem_expcounts_txt" +cat "$rsem_raw_genes_results" | grep -v "gene_id" | cut -f 1,5 | LC_ALL=C sort -k1,1 >> "$rsem_expcounts_txt" + +echo -e "#GENE\t${sample}" > "$rsem_tpm_txt" +cat "$rsem_raw_genes_results" | grep -v "gene_id" | cut -f 1,6 | LC_ALL=C sort -k1,1 >> "$rsem_tpm_txt" + +echo -e "#GENE\t${sample}" > "$rsem_fpkm_txt" +cat "$rsem_raw_genes_results" | grep -v "gene_id" | cut -f 1,7 | LC_ALL=C sort -k1,1 >> "$rsem_fpkm_txt" + + +######################### + + +# generate summary + +# *.cnt file contains alignment statistics based purely on the alignment results obtained from aligners +# N0 N1 N2 N_tot # N0, number of unalignable reads; N1, number of alignable reads; N2, number of filtered reads due to too many alignments; N_tot = N0 + N1 + N2 +# nUnique nMulti nUncertain # nUnique, number of reads aligned uniquely to a gene; +# # nMulti, number of reads aligned to multiple genes; nUnique + nMulti = N1; +# # nUncertain, number of reads aligned to multiple locations in the given reference sequences, which include isoform-level multi-mapping reads +# nHits read_type # nHits, number of total alignments. +# # read_type: 0, single-end read, no quality score; 1, single-end read, with quality score; 2, paired-end read, no quality score; 3, paired-end read, with quality score + +# extract read counts +reads_input=$( cat "$rsem_stat_cnt" | head -1 | cut -d " " -f 4) +reads_map=$( cat "$rsem_stat_cnt" | head -1 | cut -d " " -f 2) +reads_map_unique=$(cat "$rsem_stat_cnt" | head -2 | tail -1 | cut -d " " -f 1) +reads_map_multi=$( cat "$rsem_stat_cnt" | head -2 | tail -1 | cut -d " " -f 2) + +# calculate ratios +map_pct=$( echo "(${reads_map}/${reads_input})*100" | bc -l | cut -c 1-4) +map_unique_pct=$(echo "(${reads_map_unique}/${reads_input})*100" | bc -l | cut -c 1-4) +map_multi_pct=$( echo "(${reads_map_multi}/${reads_input})*100" | bc -l | cut -c 1-4) + +map_pct="${map_pct}%" +map_unique_pct="${map_unique_pct}%" +map_multi_pct="${map_multi_pct}%" + +# header for summary file +echo "#SAMPLE,INPUT READS,MAPPED READS,UNIQUELY MAPPED,MULTI-MAPPED,\ +MAPPED READS,UNIQUELY MAPPED,MULTI-MAPPED" > "$summary_csv" + +# summarize log file +echo "${sample},${reads_input},${reads_map},${reads_map_unique},${reads_map_multi},\ +${map_pct},${map_unique_pct},${map_multi_pct}" >> "$summary_csv" + +sleep 30 + +# combine all sample summaries +cat ${summary_dir}/*.rsem.${strand}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "${proj_dir}/summary.rsem.${strand}.csv" + + +######################### + + +# generate counts matrix for all samples +# merged counts filename with sample included to avoid conflicts during join process + +merged_counts_file="${proj_dir}/quant.rsem.expcounts.${strand}.txt" +count_file_list=$(find "$rsem_quant_dir" -type f -name "*.expcounts.${strand}.txt" | LC_ALL=C sort | tr '\n' ' ') + +CMD="bash ${code_dir}/scripts/join-many.sh $'\t' 0 $count_file_list > ${merged_counts_file}.${sample}.tmp" +(eval "$CMD") +mv -fv "${merged_counts_file}.${sample}.tmp" "$merged_counts_file" + + +merged_counts_file="${proj_dir}/quant.rsem.TPM.${strand}.txt" +count_file_list=$(find "$rsem_quant_dir" -type f -name "*.TPM.${strand}.txt" | 
+
+
+merged_counts_file="${proj_dir}/quant.rsem.TPM.${strand}.txt"
+count_file_list=$(find "$rsem_quant_dir" -type f -name "*.TPM.${strand}.txt" | LC_ALL=C sort | tr '\n' ' ')
+
+CMD="bash ${code_dir}/scripts/join-many.sh $'\t' 0 $count_file_list > ${merged_counts_file}.${sample}.tmp"
+(eval "$CMD")
+mv -fv "${merged_counts_file}.${sample}.tmp" "$merged_counts_file"
+
+
+merged_counts_file="${proj_dir}/quant.rsem.FPKM.${strand}.txt"
+count_file_list=$(find "$rsem_quant_dir" -type f -name "*.FPKM.${strand}.txt" | LC_ALL=C sort | tr '\n' ' ')
+
+CMD="bash ${code_dir}/scripts/join-many.sh $'\t' 0 $count_file_list > ${merged_counts_file}.${sample}.tmp"
+(eval "$CMD")
+mv -fv "${merged_counts_file}.${sample}.tmp" "$merged_counts_file"
+
+
+#########
+
+
+
+# end