Skip to content

Commit

Permalink
routine sync
Browse files Browse the repository at this point in the history
  • Loading branch information
igordot committed Jul 10, 2019
1 parent 55133a9 commit 4191229
Show file tree
Hide file tree
Showing 4 changed files with 431 additions and 32 deletions.
175 changes: 175 additions & 0 deletions scripts-bigpurple/assembly-10x-supernova.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
#!/bin/bash


##
## De novo assembly from 10x Genomics Chromium Linked-Reads using Supernova.
##
## Usage:
## sbatch --job-name=supernova --nodes=1 --ntasks=1 --cpus-per-task=17 --mem-per-cpu=32G --time=15-00 \
## --partition=fn_long --mail-user=${USER}@nyulangone.org --mail-type=END,FAIL,REQUEUE --export=NONE \
## --wrap="bash /gpfs/data/igorlab/public/genomics/scripts-bigpurple/assembly-10x-supernova.sh fastq_dir [max_reads]"
##


#########################


# system-specific settings

# supernova installation directory
supernova_version="2.1.1"
supernova_dir="/gpfs/data/igorlab/software/supernova/supernova-${supernova_version}"

# prepare environment
module purge
module add default-environment


#########################


# check for correct number of arguments
if [ $# -lt 1 ] ; then
  echo -e "\n ERROR: wrong number of arguments supplied \n" >&2
  echo -e "\n USAGE: bash assembly-10x-supernova.sh fastq_dir [max_reads] \n" >&2
  exit 1
fi

# arguments
fastq_dir=$(readlink -f "$1")
max_reads="$2"

# check that input exists
if [ ! -d "$fastq_dir" ] ; then
  echo -e "\n ERROR: fastq dir $fastq_dir does not exist \n" >&2
  exit 1
fi


#########################


# step 1: assembly (supernova run)

# million reads cutoff (default is 1200M)
# set the number of reads so as to achieve 56x raw coverage: (genome size) x 56 / 150, assuming 150bp reads
# coverage significantly greater than 56x can sometimes help but can also be deleterious, depending on the dataset
# default value is 1.2B, which only makes sense for ~3.2 Gb genomes
if [ -n "$max_reads" ] ; then
  # validate before appending "000000" below, otherwise a malformed value
  # would silently produce a nonsensical --maxreads argument
  if ! [[ "$max_reads" =~ ^[0-9]+$ ]] ; then
    echo -e "\n ERROR: max_reads $max_reads is not an integer (millions of reads) \n" >&2
    exit 1
  fi
  max_reads_m="$max_reads"
else
  max_reads_m="1200"
fi

# system load settings (reserve an extra thread for overhead)
# fail early with a clear message if not running inside a SLURM allocation
threads=${SLURM_CPUS_PER_TASK:?SLURM_CPUS_PER_TASK is not set (submit via sbatch)}
threads=$(( threads - 1 ))
# 32G per CPU, matching the --mem-per-cpu in the suggested sbatch command
# (shell arithmetic instead of spawning bc)
mem=$(( threads * 32 ))

# run name (used to name output folder)
# strip dots from the version string via parameter expansion (no sed subprocess)
supernova_version_nodot=${supernova_version//./}
run_id="assembly-supernova-v${supernova_version_nodot}-reads${max_reads_m}M"

# display settings
echo
echo " * fastq dir: $fastq_dir "
echo " * supernova bin dir: $supernova_dir "
echo " * reads cutoff (million): $max_reads_m "
echo " * threads: $threads "
echo " * mem: $mem "
echo " * run name (output dir): $run_id "
echo

echo -e "\n assembly started: $(date) \n" >&2

# supernova assembly command

supernova_cmd="
${supernova_dir}/supernova run \
--maxreads ${max_reads_m}000000 \
--localcores ${threads} \
--localmem ${mem} \
--id ${run_id} \
--fastqs ${fastq_dir}
"
echo -e "\n CMD: $supernova_cmd \n"
# intentionally unquoted so the multi-line command string word-splits into arguments
$supernova_cmd

echo -e "\n assembly ended: $(date) \n" >&2

# check that output generated
supernova_out_dir=$(readlink -f "$(pwd)/${run_id}")
if [ ! -e "${supernova_out_dir}/outs/report.txt" ] ; then
  echo -e "\n ERROR: output ${supernova_out_dir}/outs/report.txt does not exist \n" >&2
  exit 1
fi


#########################


# step 2: generate fasta file (supernova mkoutput)

# display settings
echo
echo " * assembly dir: ${supernova_out_dir}/outs/assembly "
echo " * fasta prefix: ${supernova_out_dir}/assembly "
echo

# generate different style fasta files
styles="raw megabubbles pseudohap pseudohap2"
for s in $styles; do

  echo -e "\n generate fasta: style $s \n" >&2

  # supernova mkoutput command
  ${supernova_dir}/supernova mkoutput \
    --asmdir "${supernova_out_dir}/outs/assembly" \
    --outprefix "${supernova_out_dir}/assembly.${s}" \
    --style "${s}"

done

# check that output generated
# (the pseudohap2 style emits two files, suffixed .1 and .2)
styles_out="raw megabubbles pseudohap pseudohap2.1 pseudohap2.2"
for s in $styles_out; do

  if [ ! -e "${supernova_out_dir}/assembly.${s}.fasta.gz" ] ; then
    echo -e "\n ERROR: output ${supernova_out_dir}/assembly.${s}.fasta.gz does not exist \n" >&2
    exit 1
  fi

done


#########################


# cleanup

# check file size before cleanup
echo
echo "file size before cleanup"
du -sh "$run_id"
echo

# delete large assembly files (keep small ones just in case)
# ':?' aborts if run_id is somehow empty, preventing 'rm -rf /outs/...';
# the variable is quoted while the globs stay outside the quotes so they still expand
rm -rf -- "${run_id:?}"/outs/assembly/a*
rm -rf -- "${run_id:?}"/outs/assembly/closures*
rm -rf -- "${run_id:?}"/outs/assembly/data
# delete temp files
rm -rf -- "${run_id:?}/ASSEMBLER_CS"

# check file size after cleanup
echo
echo "file size after cleanup"
du -sh "$run_id"
echo


#########################



# end
99 changes: 99 additions & 0 deletions scripts-bigpurple/scrna-10x-cellranger-aggr.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#!/bin/bash


##
## 10X Cell Ranger
## cellranger aggr - aggregates count data from multiple runs of the 'cellranger count'
##


# script filename
script_name=$(basename "${BASH_SOURCE[0]}")

# check for correct number of arguments
if [ $# -lt 1 ] || [ $# -gt 2 ] ; then
  echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2
  echo -e "\n USAGE: $script_name sample_sheet [name] \n" >&2
  exit 1
fi

# arguments
sample_sheet=$1
analysis_name=$2

# settings (many sub-steps seem to be single-threaded, so threads are mostly irrelevant)
threads="4"
mem="32"

# make the output group-writeable
umask 007

# output (add analysis name if provided)
sample_name="aggregated"
if [ -n "$analysis_name" ] ; then
  sample_name="${sample_name}-${analysis_name}"
fi
web_summary_html="${sample_name}/outs/web_summary.html"

# check that input exists
if [ ! -s "$sample_sheet" ] ; then
  echo -e "\n ERROR: sample sheet $sample_sheet does not exist \n" >&2
  exit 1
fi

echo -e "\n $(date) \n" >&2

# check if output already exists
if [ -s "$web_summary_html" ]; then
  echo -e "\n ERROR: summary $web_summary_html already exists \n" >&2
  exit 1
fi

module unload gcc
module load cellranger/3.0.0
module load dos2unix/7.4.0

# clean up sample sheet: normalize line endings and strip stray quotes
dos2unix -q "$sample_sheet"
sed -i 's/"//g' "$sample_sheet"
# append a trailing newline if the last line is missing one
# (cellranger may otherwise drop the final sample)
sed -i -e '$a\' "$sample_sheet"

# display settings
echo " * cellranger: $(which cellranger) "
echo " * sample sheet: $sample_sheet "

# cellranger aggr command

# id A unique run id, used to name output folder [a-zA-Z0-9_-]+.
# csv Path of CSV file enumerating 'cellranger count' outputs.

cellranger_cmd="
cellranger aggr \
--jobmode local \
--localcores $threads \
--localmem $mem \
--id $sample_name \
--csv $sample_sheet
"
echo -e "\n CMD: $cellranger_cmd \n"
# intentionally unquoted so the multi-line command string word-splits into arguments
$cellranger_cmd

# give the filesystem a moment to settle before checking outputs
sleep 15

# check that output html summary (and probably everything else) exists
if [ ! -s "$web_summary_html" ] ; then
  echo -e "\n ERROR: summary $web_summary_html does not exist \n" >&2
  exit 1
fi

# copy html summary to top level for easy navigation
rsync -t "$web_summary_html" "./${sample_name}.html"

# clean up (temp files)
rm -rf "${sample_name}/SC_RNA_COUNTER_CS"

# completion timestamp to stderr, matching the start timestamp above
echo -e "\n $(date) \n" >&2



# end
116 changes: 116 additions & 0 deletions scripts-bigpurple/scrna-10x-cellranger-count.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#!/bin/bash


##
## 10X Cell Ranger - processes Chromium single cell RNA-seq output
##


# script filename
script_name=$(basename "${BASH_SOURCE[0]}")

# check for correct number of arguments
if [ ! $# == 3 ] ; then
  echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2
  echo -e "\n USAGE: $script_name genome_name sample_name fastq_dir \n" >&2
  exit 1
fi

# arguments
genome_name=$1
sample_name_fastq=$2
sample_name_out="count-$sample_name_fastq"
fastq_dir=$(readlink -f "$3")

# settings
#threads=$NSLOTS
#threads=$SLURM_NTASKS
threads=16
#mem=$(echo "$threads * 8" | bc)
mem=128

# make the output group-writeable
umask 007

# resolve the reference transcriptome for known genome builds
# (hg38 and GRCh38 are aliases for the same GRCh38 reference)
refdata_root="/gpfs/data/sequence/cellranger-refdata"
case "$genome_name" in
  hg19)
    transcriptome_dir="${refdata_root}/refdata-cellranger-hg19-3.0.0"
    ;;
  hg38|GRCh38)
    transcriptome_dir="${refdata_root}/refdata-cellranger-GRCh38-3.0.0"
    ;;
  mm10)
    transcriptome_dir="${refdata_root}/refdata-cellranger-mm10-3.0.0"
    ;;
  hg19_and_mm10)
    transcriptome_dir="${refdata_root}/refdata-cellranger-hg19-and-mm10-3.0.0"
    ;;
  *)
    # fall back to a custom reference for any other genome name
    transcriptome_dir="${refdata_root}/ref/${genome_name}/cellranger"
    #transcriptome_dir="/gpfs/home/id460/ref/${genome_name}/cellranger"
    ;;
esac

# unload all loaded modulefiles
module purge

# check that input exists
if [ ! -d "$fastq_dir" ] ; then
  echo -e "\n ERROR: fastq dir $fastq_dir does not exist \n" >&2
  exit 1
fi

if [ ! -d "$transcriptome_dir" ] ; then
  echo -e "\n ERROR: genome dir $transcriptome_dir does not exist \n" >&2
  exit 1
fi

module load cellranger/3.0.0

# display settings
echo " * cellranger: $(which cellranger) "
echo " * threads: $threads "
echo " * mem: $mem "
echo " * transcriptome dir: $transcriptome_dir "
echo " * fastq dir: $fastq_dir "
echo " * sample: $sample_name_fastq "
echo " * out dir: $sample_name_out "

echo -e "\n $(date) \n" >&2

# cellranger run command

# id A unique run id, used to name output folder
# fastqs Path of folder created by 10x demultiplexing or bcl2fastq
# sample Prefix of the filenames of FASTQs to select
# transcriptome Path of folder containing 10X-compatible transcriptome

cellranger_cmd="
cellranger count \
--localmem $mem \
--localcores $threads \
--transcriptome $transcriptome_dir \
--fastqs $fastq_dir \
--sample $sample_name_fastq \
--id $sample_name_out \
"
echo -e "\n CMD: $cellranger_cmd \n"
# intentionally unquoted so the multi-line command string word-splits into arguments
$cellranger_cmd

# give the filesystem a moment to settle before checking outputs
sleep 15

web_summary_html="./${sample_name_out}/outs/web_summary.html"

# check that output html summary (and probably everything else) exists
if [ ! -s "$web_summary_html" ] ; then
  echo -e "\n ERROR: summary $web_summary_html does not exist \n" >&2
  exit 1
fi

# copy html summary to top level for easy navigation
rsync -tv "$web_summary_html" "./${sample_name_out}.html"

# delete temp files
rm -rf "./${sample_name_out}/SC_RNA_COUNTER_CS"

echo -e "\n $(date) \n"



# end
Loading

0 comments on commit 4191229

Please sign in to comment.