-
Notifications
You must be signed in to change notification settings - Fork 54
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
431 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
#!/bin/bash | ||
|
||
|
||
## | ||
## De novo assembly from 10x Genomics Chromium Linked-Reads using Supernova. | ||
## | ||
## Usage: | ||
## sbatch --job-name=supernova --nodes=1 --ntasks=1 --cpus-per-task=17 --mem-per-cpu=32G --time=15-00 \ | ||
## --partition=fn_long --mail-user=${USER}@nyulangone.org --mail-type=END,FAIL,REQUEUE --export=NONE \ | ||
## --wrap="bash /gpfs/data/igorlab/public/genomics/scripts-bigpurple/assembly-10x-supernova.sh fastq_dir [max_reads]" | ||
## | ||
|
||
|
||
######################### | ||
|
||
|
||
# system-specific settings | ||
|
||
# supernova directory | ||
supernova_version="2.1.1" | ||
supernova_dir="/gpfs/data/igorlab/software/supernova/supernova-${supernova_version}" | ||
|
||
# prepare environment | ||
module purge | ||
module add default-environment | ||
|
||
|
||
######################### | ||
|
||
|
||
# check for correct number of arguments | ||
if [ $# -lt 1 ] ; then | ||
echo -e "\n ERROR: wrong number of arguments supplied \n" >&2 | ||
echo -e "\n USAGE: bash assembly-10x-supernova.sh fastq_dir [max_reads] \n" >&2 | ||
exit 1 | ||
fi | ||
|
||
# arguments | ||
fastq_dir=$(readlink -f "$1") | ||
max_reads="$2" | ||
|
||
# check that input exists | ||
if [ ! -d "$fastq_dir" ] ; then | ||
echo -e "\n ERROR: fastq dir $fastq_dir does not exist \n" >&2 | ||
exit 1 | ||
fi | ||
|
||
|
||
######################### | ||
|
||
|
||
# step 1: assembly (supernova run) | ||
|
||
# million reads cutoff (default is 1200M) | ||
# set the number of reads so as to achieve 56x raw coverage: (genome size) x 56 / 150, assuming 150bp reads | ||
# coverage significantly greater than 56x can sometimes help but can also be deleterious, depending on the dataset | ||
# default value is 1.2B, which only makes sense for ~3.2 Gb genomes | ||
if [ -n "$max_reads" ] ; then | ||
max_reads_m="$max_reads" | ||
else | ||
max_reads_m="1200" | ||
fi | ||
|
||
# system load settings (reserve an extra thread for overhead) | ||
threads=$SLURM_CPUS_PER_TASK | ||
threads=$(( threads - 1 )) | ||
mem=$(echo "$threads * 32" | bc) | ||
|
||
# run name (used to name output folder) | ||
supernova_version_nodot=$(echo "$supernova_version" | sed 's/\.//g') | ||
run_id="assembly-supernova-v${supernova_version_nodot}-reads${max_reads_m}M" | ||
|
||
# display settingse | ||
echo | ||
echo " * fastq dir: $fastq_dir " | ||
echo " * supernova bin dir: $supernova_dir " | ||
echo " * reads cutoff (million): $max_reads_m " | ||
echo " * threads: $threads " | ||
echo " * mem: $mem " | ||
echo " * run name (output dir): $run_id " | ||
echo | ||
|
||
echo -e "\n assembly started: $(date) \n" >&2 | ||
|
||
# supernova assembly command | ||
|
||
supernova_cmd=" | ||
${supernova_dir}/supernova run \ | ||
--maxreads ${max_reads_m}000000 \ | ||
--localcores ${threads} \ | ||
--localmem ${mem} \ | ||
--id ${run_id} \ | ||
--fastqs ${fastq_dir} | ||
" | ||
echo -e "\n CMD: $supernova_cmd \n" | ||
$supernova_cmd | ||
|
||
echo -e "\n assembly ended: $(date) \n" >&2 | ||
|
||
# check that output generated | ||
supernova_out_dir=$(readlink -f "$(pwd)/${run_id}") | ||
if [ ! -e "${supernova_out_dir}/outs/report.txt" ] ; then | ||
echo -e "\n ERROR: output ${supernova_out_dir}/outs/report.txt does not exist \n" >&2 | ||
exit 1 | ||
fi | ||
|
||
|
||
######################### | ||
|
||
|
||
# step 2: generate fasta file (supernova mkoutput) | ||
|
||
# display settings | ||
echo | ||
echo " * assembly dir: ${supernova_out_dir}/outs/assembly " | ||
echo " * fasta prefix: ${supernova_out_dir}/assembly " | ||
echo | ||
|
||
# generate different style fasta files | ||
styles="raw megabubbles pseudohap pseudohap2" | ||
for s in $styles; do | ||
|
||
echo -e "\n generate fasta: style $s \n" >&2 | ||
|
||
# supernova mkoutput command | ||
${supernova_dir}/supernova mkoutput \ | ||
--asmdir "${supernova_out_dir}/outs/assembly" \ | ||
--outprefix "${supernova_out_dir}/assembly.${s}" \ | ||
--style "${s}" | ||
|
||
done | ||
|
||
# check that output generated | ||
styles_out="raw megabubbles pseudohap pseudohap2.1 pseudohap2.2" | ||
for s in $styles_out; do | ||
|
||
# check that output generated | ||
if [ ! -e "${supernova_out_dir}/assembly.${s}.fasta.gz" ] ; then | ||
echo -e "\n ERROR: output ${supernova_out_dir}/assembly.${s}.fasta.gz does not exist \n" >&2 | ||
exit 1 | ||
fi | ||
|
||
done | ||
|
||
|
||
######################### | ||
|
||
|
||
# cleanup | ||
|
||
# check file size before cleanup | ||
echo | ||
echo "file size before cleanup" | ||
du -sh "$run_id" | ||
echo | ||
|
||
# delete large assembly files (keep small ones just in case) | ||
rm -rf ${run_id}/outs/assembly/a* | ||
rm -rf ${run_id}/outs/assembly/closures* | ||
rm -rf ${run_id}/outs/assembly/data | ||
# delete temp files | ||
rm -rf ${run_id}/ASSEMBLER_CS | ||
|
||
# check file size after cleanup | ||
echo | ||
echo "file size after cleanup" | ||
du -sh "$run_id" | ||
echo | ||
|
||
|
||
######################### | ||
|
||
|
||
|
||
# end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
#!/bin/bash | ||
|
||
|
||
## | ||
## 10X Cell Ranger | ||
## cellranger aggr - aggregates count data from multiple runs of the 'cellranger count' | ||
## | ||
|
||
|
||
# script filename | ||
script_name=$(basename "${BASH_SOURCE[0]}") | ||
|
||
# check for correct number of arguments | ||
if [ $# -lt 1 ] || [ $# -gt 2 ] ; then | ||
echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 | ||
echo -e "\n USAGE: $script_name sample_sheet [name] \n" >&2 | ||
exit 1 | ||
fi | ||
|
||
# arguments | ||
sample_sheet=$1 | ||
analysis_name=$2 | ||
|
||
# settings (many sub-steps seem to be single-threaded, so threads are mostly irrelevant) | ||
threads="4" | ||
mem="32" | ||
|
||
# make the output group-writeable | ||
umask 007 | ||
|
||
# output (add analysis name if provided) | ||
sample_name="aggregated" | ||
if [ -n "$analysis_name" ] ; then | ||
sample_name="${sample_name}-${analysis_name}" | ||
fi | ||
web_summary_html="${sample_name}/outs/web_summary.html" | ||
|
||
# check that input exists | ||
if [ ! -s "$sample_sheet" ] ; then | ||
echo -e "\n ERROR: sample sheet $sample_sheet does not exist \n" >&2 | ||
exit 1 | ||
fi | ||
|
||
echo -e "\n $(date) \n" >&2 | ||
|
||
# check if output already exits | ||
if [ -s "$web_summary_html" ]; then | ||
echo -e "\n ERROR: summary $web_summary_html already exists \n" >&2 | ||
exit 1 | ||
fi | ||
|
||
module unload gcc | ||
module load cellranger/3.0.0 | ||
module load dos2unix/7.4.0 | ||
|
||
# clean up sample sheet | ||
dos2unix -q "$sample_sheet" | ||
sed -i 's/"//g' "$sample_sheet" | ||
sed -i -e '$a\' "$sample_sheet" | ||
|
||
# display settings | ||
echo " * cellranger: $(which cellranger) " | ||
echo " * sample sheet: $sample_sheet " | ||
|
||
# cellranger aggr command | ||
|
||
# id A unique run id, used to name output folder [a-zA-Z0-9_-]+. | ||
# csv Path of CSV file enumerating 'cellranger count' outputs. | ||
|
||
cellranger_cmd=" | ||
cellranger aggr \ | ||
--jobmode local \ | ||
--localcores $threads \ | ||
--localmem $mem \ | ||
--id $sample_name \ | ||
--csv $sample_sheet | ||
" | ||
echo -e "\n CMD: $cellranger_cmd \n" | ||
$cellranger_cmd | ||
|
||
sleep 15 | ||
|
||
# check that output html summary (and probably everything else) exists | ||
if [ ! -s "$web_summary_html" ] ; then | ||
echo -e "\n ERROR: summary $web_summary_html does not exist \n" >&2 | ||
exit 1 | ||
fi | ||
|
||
# copy html summary to top level for easy navigation | ||
rsync -t "$web_summary_html" "./${sample_name}.html" | ||
|
||
# clean up (temp files) | ||
rm -rf "${sample_name}/SC_RNA_COUNTER_CS" | ||
|
||
echo -e "\n $(date) \n" | ||
|
||
|
||
|
||
# end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
#!/bin/bash | ||
|
||
|
||
## | ||
## 10X Cell Ranger - processes Chromium single cell RNA-seq output | ||
## | ||
|
||
|
||
# script filename | ||
script_name=$(basename "${BASH_SOURCE[0]}") | ||
|
||
# check for correct number of arguments | ||
if [ ! $# == 3 ] ; then | ||
echo -e "\n $script_name ERROR: WRONG NUMBER OF ARGUMENTS SUPPLIED \n" >&2 | ||
echo -e "\n USAGE: $script_name genome_name sample_name fastq_dir \n" >&2 | ||
exit 1 | ||
fi | ||
|
||
# arguments | ||
genome_name=$1 | ||
sample_name_fastq=$2 | ||
sample_name_out="count-$sample_name_fastq" | ||
fastq_dir=$(readlink -f "$3") | ||
|
||
# settings | ||
#threads=$NSLOTS | ||
#threads=$SLURM_NTASKS | ||
threads=16 | ||
#mem=$(echo "$threads * 8" | bc) | ||
mem=128 | ||
|
||
# make the output group-writeable | ||
umask 007 | ||
|
||
if [[ "$genome_name" == "hg19" ]] ; then | ||
transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-cellranger-hg19-3.0.0" | ||
elif [[ "$genome_name" == "hg38" ]] ; then | ||
transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-cellranger-GRCh38-3.0.0" | ||
elif [[ "$genome_name" == "GRCh38" ]] ; then | ||
transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-cellranger-GRCh38-3.0.0" | ||
elif [[ "$genome_name" == "mm10" ]] ; then | ||
transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-cellranger-mm10-3.0.0" | ||
elif [[ "$genome_name" == "hg19_and_mm10" ]] ; then | ||
transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/refdata-cellranger-hg19-and-mm10-3.0.0" | ||
else | ||
transcriptome_dir="/gpfs/data/sequence/cellranger-refdata/ref/${genome_name}/cellranger" | ||
#transcriptome_dir="/gpfs/home/id460/ref/${genome_name}/cellranger" | ||
fi | ||
|
||
# unload all loaded modulefiles | ||
module purge | ||
|
||
# check that input exists | ||
if [ ! -d "$fastq_dir" ] ; then | ||
echo -e "\n ERROR: fastq dir $fastq_dir does not exist \n" >&2 | ||
exit 1 | ||
fi | ||
|
||
if [ ! -d "$transcriptome_dir" ] ; then | ||
echo -e "\n ERROR: genome dir $transcriptome_dir does not exist \n" >&2 | ||
exit 1 | ||
fi | ||
|
||
module load cellranger/3.0.0 | ||
|
||
# display settings | ||
echo " * cellranger: $(which cellranger) " | ||
echo " * threads: $threads " | ||
echo " * mem: $mem " | ||
echo " * transcriptome dir: $transcriptome_dir " | ||
echo " * fastq dir: $fastq_dir " | ||
echo " * sample: $sample_name_fastq " | ||
echo " * out dir: $sample_name_out " | ||
|
||
echo -e "\n $(date) \n" >&2 | ||
|
||
# cellranger run command | ||
|
||
# id A unique run id, used to name output folder | ||
# fastqs Path of folder created by 10x demultiplexing or bcl2fastq | ||
# sample Prefix of the filenames of FASTQs to select | ||
# transcriptome Path of folder containing 10X-compatible transcriptome | ||
|
||
cellranger_cmd=" | ||
cellranger count \ | ||
--localmem $mem \ | ||
--localcores $threads \ | ||
--transcriptome $transcriptome_dir \ | ||
--fastqs $fastq_dir \ | ||
--sample $sample_name_fastq \ | ||
--id $sample_name_out \ | ||
" | ||
echo -e "\n CMD: $cellranger_cmd \n" | ||
$cellranger_cmd | ||
|
||
sleep 15 | ||
|
||
web_summary_html="./${sample_name_out}/outs/web_summary.html" | ||
|
||
# check that output html summary (and probably everything else) exists | ||
if [ ! -s "$web_summary_html" ] ; then | ||
echo -e "\n ERROR: summary $web_summary_html does not exist \n" >&2 | ||
exit 1 | ||
fi | ||
|
||
# copy html summary to top level for easy navigation | ||
rsync -tv "$web_summary_html" "./${sample_name_out}.html" | ||
|
||
# delete temp files | ||
rm -rf "./${sample_name_out}/SC_RNA_COUNTER_CS" | ||
|
||
echo -e "\n $(date) \n" | ||
|
||
|
||
|
||
# end |
Oops, something went wrong.