Skip to content

Commit

Permalink
routine sync
Browse files Browse the repository at this point in the history
  • Loading branch information
igordot committed Jul 10, 2019
1 parent 55133a9 commit 4191229
Show file tree
Hide file tree
Showing 4 changed files with 431 additions and 32 deletions.
175 changes: 175 additions & 0 deletions scripts-bigpurple/assembly-10x-supernova.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
#!/bin/bash


##
## De novo assembly from 10x Genomics Chromium Linked-Reads using Supernova.
##
## Usage:
## sbatch --job-name=supernova --nodes=1 --ntasks=1 --cpus-per-task=17 --mem-per-cpu=32G --time=15-00 \
## --partition=fn_long --mail-user=${USER}@nyulangone.org --mail-type=END,FAIL,REQUEUE --export=NONE \
## --wrap="bash /gpfs/data/igorlab/public/genomics/scripts-bigpurple/assembly-10x-supernova.sh fastq_dir [max_reads]"
##


#########################


# system-specific settings

# supernova installation directory
supernova_version="2.1.1"
supernova_dir="/gpfs/data/igorlab/software/supernova/supernova-${supernova_version}"

# prepare environment
module purge
module add default-environment


#########################


# check for correct number of arguments
if [ $# -lt 1 ] ; then
  echo -e "\n ERROR: wrong number of arguments supplied \n" >&2
  echo -e "\n USAGE: bash assembly-10x-supernova.sh fastq_dir [max_reads] \n" >&2
  exit 1
fi

# arguments
fastq_dir=$(readlink -f "$1")
max_reads="$2"

# check that input exists
if [ ! -d "$fastq_dir" ] ; then
  echo -e "\n ERROR: fastq dir $fastq_dir does not exist \n" >&2
  exit 1
fi


#########################


# step 1: assembly (supernova run)

# million reads cutoff (default is 1200M)
# set the number of reads so as to achieve 56x raw coverage: (genome size) x 56 / 150, assuming 150bp reads
# coverage significantly greater than 56x can sometimes help but can also be deleterious, depending on the dataset
# default value is 1.2B, which only makes sense for ~3.2 Gb genomes
if [ -n "$max_reads" ] ; then
  # validate before appending "000000" below, otherwise a malformed value
  # would silently produce a nonsensical --maxreads argument
  if ! [[ "$max_reads" =~ ^[0-9]+$ ]] ; then
    echo -e "\n ERROR: max_reads $max_reads is not an integer (millions of reads) \n" >&2
    exit 1
  fi
  max_reads_m="$max_reads"
else
  max_reads_m="1200"
fi

# system load settings (reserve an extra thread for overhead)
# fail early with a clear message if not running inside a SLURM allocation
threads=${SLURM_CPUS_PER_TASK:?SLURM_CPUS_PER_TASK is not set (submit via sbatch)}
threads=$(( threads - 1 ))
# 32G per CPU, matching the --mem-per-cpu in the suggested sbatch command
# (shell arithmetic instead of spawning bc)
mem=$(( threads * 32 ))

# run name (used to name output folder)
# strip dots from the version string via parameter expansion (no sed subprocess)
supernova_version_nodot=${supernova_version//./}
run_id="assembly-supernova-v${supernova_version_nodot}-reads${max_reads_m}M"

# display settings
echo
echo " * fastq dir: $fastq_dir "
echo " * supernova bin dir: $supernova_dir "
echo " * reads cutoff (million): $max_reads_m "
echo " * threads: $threads "
echo " * mem: $mem "
echo " * run name (output dir): $run_id "
echo

echo -e "\n assembly started: $(date) \n" >&2

# supernova assembly command

supernova_cmd="
${supernova_dir}/supernova run \
--maxreads ${max_reads_m}000000 \
--localcores ${threads} \
--localmem ${mem} \
--id ${run_id} \
--fastqs ${fastq_dir}
"
echo -e "\n CMD: $supernova_cmd \n"
# intentionally unquoted so the multi-line command string word-splits into arguments
$supernova_cmd

echo -e "\n assembly ended: $(date) \n" >&2

# check that output generated
supernova_out_dir=$(readlink -f "$(pwd)/${run_id}")
if [ ! -e "${supernova_out_dir}/outs/report.txt" ] ; then
  echo -e "\n ERROR: output ${supernova_out_dir}/outs/report.txt does not exist \n" >&2
  exit 1
fi


#########################


# step 2: generate fasta file (supernova mkoutput)

# display settings
echo
echo " * assembly dir: ${supernova_out_dir}/outs/assembly "
echo " * fasta prefix: ${supernova_out_dir}/assembly "
echo

# generate different style fasta files
styles="raw megabubbles pseudohap pseudohap2"
for s in $styles; do

  echo -e "\n generate fasta: style $s \n" >&2

  # supernova mkoutput command
  ${supernova_dir}/supernova mkoutput \
    --asmdir "${supernova_out_dir}/outs/assembly" \
    --outprefix "${supernova_out_dir}/assembly.${s}" \
    --style "${s}"

done

# check that output generated
# (the pseudohap2 style emits two files, suffixed .1 and .2)
styles_out="raw megabubbles pseudohap pseudohap2.1 pseudohap2.2"
for s in $styles_out; do

  if [ ! -e "${supernova_out_dir}/assembly.${s}.fasta.gz" ] ; then
    echo -e "\n ERROR: output ${supernova_out_dir}/assembly.${s}.fasta.gz does not exist \n" >&2
    exit 1
  fi

done


#########################


# cleanup

# check file size before cleanup
echo
echo "file size before cleanup"
du -sh "$run_id"
echo

# delete large assembly files (keep small ones just in case)
# ':?' aborts if run_id is somehow empty, preventing 'rm -rf /outs/...';
# the variable is quoted while the globs stay outside the quotes so they still expand
rm -rf -- "${run_id:?}"/outs/assembly/a*
rm -rf -- "${run_id:?}"/outs/assembly/closures*
rm -rf -- "${run_id:?}"/outs/assembly/data
# delete temp files
rm -rf -- "${run_id:?}/ASSEMBLER_CS"

# check file size after cleanup
echo
echo "file size after cleanup"
du -sh "$run_id"
echo


#########################



# end
99 changes: 99 additions & 0 deletions scripts-bigpurple/scrna-10x-cellranger-aggr.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#!/bin/bash


##
## 10X Cell Ranger
## cellranger aggr - aggregates count data from multiple runs of the 'cellranger count'
##


# script filename
script_name=$(basename "${BASH_SOURCE[0]}")

# check for correct number of arguments
if [ $# -lt 1 ] || [ $# -gt 2 ] ; then
  echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2
  echo -e "\n USAGE: $script_name sample_sheet [name] \n" >&2
  exit 1
fi

# arguments
sample_sheet=$1
analysis_name=$2

# settings (many sub-steps seem to be single-threaded, so threads are mostly irrelevant)
threads="4"
mem="32"

# make the output group-writeable
umask 007

# output (add analysis name if provided)
sample_name="aggregated"
if [ -n "$analysis_name" ] ; then
  sample_name="${sample_name}-${analysis_name}"
fi
web_summary_html="${sample_name}/outs/web_summary.html"

# check that input exists
if [ ! -s "$sample_sheet" ] ; then
  echo -e "\n ERROR: sample sheet $sample_sheet does not exist \n" >&2
  exit 1
fi

echo -e "\n $(date) \n" >&2

# check if output already exists
if [ -s "$web_summary_html" ]; then
  echo -e "\n ERROR: summary $web_summary_html already exists \n" >&2
  exit 1
fi

module unload gcc
module load cellranger/3.0.0
module load dos2unix/7.4.0

# clean up sample sheet: normalize line endings and strip stray quotes
dos2unix -q "$sample_sheet"
sed -i 's/"//g' "$sample_sheet"
# append a trailing newline if the last line is missing one
# (cellranger may otherwise drop the final sample)
sed -i -e '$a\' "$sample_sheet"

# display settings
echo " * cellranger: $(which cellranger) "
echo " * sample sheet: $sample_sheet "

# cellranger aggr command

# id A unique run id, used to name output folder [a-zA-Z0-9_-]+.
# csv Path of CSV file enumerating 'cellranger count' outputs.

cellranger_cmd="
cellranger aggr \
--jobmode local \
--localcores $threads \
--localmem $mem \
--id $sample_name \
--csv $sample_sheet
"
echo -e "\n CMD: $cellranger_cmd \n"
# intentionally unquoted so the multi-line command string word-splits into arguments
$cellranger_cmd

# give the filesystem a moment to settle before checking outputs
sleep 15

# check that output html summary (and probably everything else) exists
if [ ! -s "$web_summary_html" ] ; then
  echo -e "\n ERROR: summary $web_summary_html does not exist \n" >&2
  exit 1
fi

# copy html summary to top level for easy navigation
rsync -t "$web_summary_html" "./${sample_name}.html"

# clean up (temp files)
rm -rf "${sample_name}/SC_RNA_COUNTER_CS"

# completion timestamp to stderr, matching the start timestamp above
echo -e "\n $(date) \n" >&2



# end
116 changes: 116 additions & 0 deletions scripts-bigpurple/scrna-10x-cellranger-count.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#!/bin/bash


##
## 10X Cell Ranger - processes Chromium single cell RNA-seq output
##


# script filename
script_name=$(basename "${BASH_SOURCE[0]}")

# check for correct number of arguments
if [ ! $# == 3 ] ; then
  echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2
  echo -e "\n USAGE: $script_name genome_name sample_name fastq_dir \n" >&2
  exit 1
fi

# arguments
genome_name=$1
sample_name_fastq=$2
sample_name_out="count-$sample_name_fastq"
fastq_dir=$(readlink -f "$3")

# settings
#threads=$NSLOTS
#threads=$SLURM_NTASKS
threads=16
#mem=$(echo "$threads * 8" | bc)
mem=128

# make the output group-writeable
umask 007

# resolve the reference transcriptome for known genome builds
# (hg38 and GRCh38 are aliases for the same GRCh38 reference)
refdata_root="/gpfs/data/sequence/cellranger-refdata"
case "$genome_name" in
  hg19)
    transcriptome_dir="${refdata_root}/refdata-cellranger-hg19-3.0.0"
    ;;
  hg38|GRCh38)
    transcriptome_dir="${refdata_root}/refdata-cellranger-GRCh38-3.0.0"
    ;;
  mm10)
    transcriptome_dir="${refdata_root}/refdata-cellranger-mm10-3.0.0"
    ;;
  hg19_and_mm10)
    transcriptome_dir="${refdata_root}/refdata-cellranger-hg19-and-mm10-3.0.0"
    ;;
  *)
    # fall back to a custom reference for any other genome name
    transcriptome_dir="${refdata_root}/ref/${genome_name}/cellranger"
    #transcriptome_dir="/gpfs/home/id460/ref/${genome_name}/cellranger"
    ;;
esac

# unload all loaded modulefiles
module purge

# check that input exists
if [ ! -d "$fastq_dir" ] ; then
  echo -e "\n ERROR: fastq dir $fastq_dir does not exist \n" >&2
  exit 1
fi

if [ ! -d "$transcriptome_dir" ] ; then
  echo -e "\n ERROR: genome dir $transcriptome_dir does not exist \n" >&2
  exit 1
fi

module load cellranger/3.0.0

# display settings
echo " * cellranger: $(which cellranger) "
echo " * threads: $threads "
echo " * mem: $mem "
echo " * transcriptome dir: $transcriptome_dir "
echo " * fastq dir: $fastq_dir "
echo " * sample: $sample_name_fastq "
echo " * out dir: $sample_name_out "

echo -e "\n $(date) \n" >&2

# cellranger run command

# id A unique run id, used to name output folder
# fastqs Path of folder created by 10x demultiplexing or bcl2fastq
# sample Prefix of the filenames of FASTQs to select
# transcriptome Path of folder containing 10X-compatible transcriptome

cellranger_cmd="
cellranger count \
--localmem $mem \
--localcores $threads \
--transcriptome $transcriptome_dir \
--fastqs $fastq_dir \
--sample $sample_name_fastq \
--id $sample_name_out \
"
echo -e "\n CMD: $cellranger_cmd \n"
# intentionally unquoted so the multi-line command string word-splits into arguments
$cellranger_cmd

# give the filesystem a moment to settle before checking outputs
sleep 15

web_summary_html="./${sample_name_out}/outs/web_summary.html"

# check that output html summary (and probably everything else) exists
if [ ! -s "$web_summary_html" ] ; then
  echo -e "\n ERROR: summary $web_summary_html does not exist \n" >&2
  exit 1
fi

# copy html summary to top level for easy navigation
rsync -tv "$web_summary_html" "./${sample_name_out}.html"

# delete temp files
rm -rf "./${sample_name_out}/SC_RNA_COUNTER_CS"

echo -e "\n $(date) \n"



# end
Loading

0 comments on commit 4191229

Please sign in to comment.