major dependencies update support
gongyixiao committed Apr 17, 2017
1 parent 592ae97 commit 80e445b
Showing 19 changed files with 230 additions and 91 deletions.
9 changes: 2 additions & 7 deletions code/By_Sample
@@ -46,16 +46,11 @@ date
echo

echo "###### Downloading: $SAMPLE"
run_fastq-download $params pipeline/download meta_data $SAMPLE
if [ $FLAG -eq 1 ]; then rsync -rl pipeline/download/ $CWD/pipeline/download/; fi
fastq-download `cat params | grep "^fastq-download" | cut -f2` pipeline/fastq $SAMPLE
if [ $FLAG -eq 1 ]; then rsync -rl pipeline/fastq/ $CWD/pipeline/fastq/; fi
status=$?
if [ $status -ne 0 ]; then exit 1; fi

echo "###### Linking: $SAMPLE"
run_fastq-link $params pipeline/fastq pipeline/download $SAMPLE
status=$?
if [ $FLAG -eq 1 ]; then rsync -rl pipeline/fastq/ $CWD/pipeline/fastq/; fi
if [ $status -ne 0 ]; then exit 1; fi
echo "###### Alignning: $SAMPLE"
run_alignment $params pipeline/alignment pipeline/fastq $SAMPLE
status=$?
47 changes: 35 additions & 12 deletions code/DESeq_Comparison_Report.r
@@ -105,6 +105,7 @@ summarize_comparison=function(outdir,signature,target,DE,comparison,preprocessing

# MA-plot. The log2 fold change for a particular comparison is plotted on the y-axis and the average of the counts normalized by size factor is shown on the x-axis (“M” for minus, because a log ratio is equal to log minus log, and “A” for average). Each gene is represented with a dot. Genes with an adjusted p value below a threshold (here 0.1, the default) are shown in red
markMAplot <- function(outdir,signature,comparison,prefix,res){
pdf(file=paste(outdir,"/",comparison[2],"_vs_",comparison[3],"/",prefix,"_MA_plot.pdf",sep=""),onefile=FALSE)
if(outdir==""){outdir="."}
plotMA(res, ylim=c(-5,5), main=paste(prefix," ", comparison[2]," vs ",comparison[3],sep=""))
mylist=""
@@ -117,18 +118,26 @@ markMAplot <- function(outdir,signature,comparison,prefix,res){
text(baseMean, log2FoldChange, topGeneName, pos=2, col="dodgerblue",cex=0.6)
})
}
p <- recordPlot()
plot.new()
png(filename=paste(outdir,"/",comparison[2],"_vs_",comparison[3],"/",prefix,"_MA_plot.png",sep=""))
print(p)
dev.off()
pdf(file=paste(outdir,"/",comparison[2],"_vs_",comparison[3],"/",prefix,"_MA_plot.pdf",sep=""),onefile=FALSE)
print(p)
png(filename=paste(outdir,"/",comparison[2],"_vs_",comparison[3],"/",prefix,"_MA_plot.png",sep=""))
if(outdir==""){outdir="."}
plotMA(res, ylim=c(-5,5), main=paste(prefix," ", comparison[2]," vs ",comparison[3],sep=""))
mylist=""
for (i in signature){mylist=c(mylist,grep(paste("^",i,"$|:",i,"$",sep=""), res$symbol))}
mylist=rownames(res)[as.numeric(mylist[-1])]
for (topGene in mylist){
topGeneName=res[topGene,]$symbol
with(res[topGene, ], {
points(baseMean, log2FoldChange, col="dodgerblue", cex=0.5, lwd=2)
text(baseMean, log2FoldChange, topGeneName, pos=2, col="dodgerblue",cex=0.6)
})
}
dev.off()
}

# Plot Volcano Plot
markVolcanoPlot <- function(outdir,signature,comparison,prefix,res){
pdf(file=paste(outdir,"/",comparison[2],"_vs_",comparison[3],"/",prefix,"_Volcano_plot.pdf",sep=""),onefile=FALSE)
if(outdir==""){outdir="."}
tab = data.frame(logFC = res$log2FoldChange, negLogPval = -log10(res$pvalue),symbol=res$symbol)
par(mar = c(5, 5, 5, 6))
@@ -148,14 +157,28 @@ abline(h = -log10(pval), col = "green3", lty = 2)
abline(v = c(-lfc, lfc), col = "blue", lty = 2)
mtext(paste("pval =", pval), side = 4, at = -log10(pval), cex = 0.8, line = 0.5, las = 1)
mtext(c(paste("-", lfc, "fold"), paste("+", lfc, "fold")), side = 3, at = c(-lfc, lfc), cex = 0.8, line = 0.5)
p <- recordPlot()
plot.new()
dev.off()

png(filename=paste(outdir,"/",comparison[2],"_vs_",comparison[3],"/",prefix,"_Volcano_plot.png",sep=""))
print(p)
dev.off()
pdf(file=paste(outdir,"/",comparison[2],"_vs_",comparison[3],"/",prefix,"_Volcano_plot.pdf",sep=""),onefile=FALSE)
print(p)
if(outdir==""){outdir="."}
tab = data.frame(logFC = res$log2FoldChange, negLogPval = -log10(res$pvalue),symbol=res$symbol)
par(mar = c(5, 5, 5, 6))
plot(tab[,1:2], main=paste(prefix," ", comparison[2]," vs ",comparison[3],sep=""), xlim=range(-max(abs(na.omit(tab$logFC))),max(abs(na.omit(tab$logFC)))), pch = 16, cex = 0.6, xlab = expression(log[2]~fold~change), ylab = expression(-log[10]~pvalue))
lfc = 1
pval = 0.05
signGene=tab %>% filter(abs(logFC)>lfc) %>% filter(negLogPval>-log10(pval))
points(signGene, pch = 16, cex = 0.6, col = "red")
mylist=""
for (i in signature){mylist=c(mylist,grep(paste("^",i,"$|:",i,"$",sep=""), res$symbol))}
mylist=rownames(res)[as.numeric(mylist[-1])]
for (topGene in mylist){
points(tab[topGene,1:2], cex=0.7, lwd=2, col = "dodgerblue")
text(tab[topGene,1:2],labels=as.character(tab[topGene,]$symbol), pos=4, col="dodgerblue",cex=0.6)
}
abline(h = -log10(pval), col = "green3", lty = 2)
abline(v = c(-lfc, lfc), col = "blue", lty = 2)
mtext(paste("pval =", pval), side = 4, at = -log10(pval), cex = 0.8, line = 0.5, las = 1)
mtext(c(paste("-", lfc, "fold"), paste("+", lfc, "fold")), side = 3, at = c(-lfc, lfc), cex = 0.8, line = 0.5)
dev.off()
}

4 changes: 2 additions & 2 deletions code/DESeq_post_v3.r
@@ -51,7 +51,7 @@ align_reads <- function(x){
x=x[order(x$V1,x$V2),]
myplot <-
ggplot(x,aes(x=V1,y=V3,fill=V2)) +
geom_bar(stat="identity",position="stack") +
geom_bar(stat="identity",position=position_stack(vjust = 1, reverse = T)) +
scale_fill_manual(values=c("limegreen","forestgreen","orchid","indianred1")) +
ggtitle("B. Alignment Category Reads") +
xlab("") + ylab("Number of Reads in Millions") +
Expand All @@ -66,7 +66,7 @@ align_PCT <- function(x){
x$V4 <- factor(x$V4, levels = c("Uniquely_Mapped","Multi_Mapped","Unmapped"))
x=x[order(x$V1,x$V2),]
myplot <- ggplot(x,aes(x=V1,y=V3,fill=V2)) +
geom_bar(stat="identity",position="stack") +
geom_bar(stat="identity",position=position_stack(vjust = 1, reverse = T)) +
ggtitle("A. Alignment Category Percentage") +
scale_fill_manual(values=c("limegreen","forestgreen","orchid","indianred1")) +
xlab("") + ylab("Percentage of Reads") +
5 changes: 3 additions & 2 deletions code/custom-bashrc
@@ -15,6 +15,7 @@ export PYTHONPATH
export PERL5LIB=$HOME/lib
export EDITOR="/usr/local/bin/mate -w"

module load samtools/0.1.19
module load samtools/1.3
module load r/3.3.0
module load python/2.7
module load python/2.7.3
module load java/1.8
155 changes: 155 additions & 0 deletions code/fastq-download
@@ -0,0 +1,155 @@
#!/bin/bash

source code/custom-bashrc


if [ "$#" -ne 2 ] && [ "$#" -ne 3 ] && [ "$#" -ne 4 ]
then
printf "\n###### Usage\n"
printf "$0 [Optional: Threads (Default 6)] <PARAMS> <OUTPUT_DIR> [Optional: Sample Name] \n"
exit
fi

date




#########################
# input

if [[ $1 =~ ^[[:digit:]]{1,2}$ ]];then NSLOTS=$1;shift
elif [[ $NSLOTS == "" ]]; then NSLOTS=6; fi

PARAMS=$1
OUT_DIR=$2
SAMPLE=$3

CWD=`pwd`

#########################
if [ ! -d $OUT_DIR ]; then mkdir -p $OUT_DIR; fi

IFS=$'\n'

#header=`cat $PARAMS | grep -E "^#" | head -1 | cut -d' ' -f2`


########################
# SRA Download
for record in `cat $PARAMS | grep -v "^#"`
do
name_sample=`echo $record | cut -f1`
id_sample=`echo $record | cut -f2`
header=`echo $record | cut -f3`
if [ "$name_sample" == "$SAMPLE" ] || [ -z "$SAMPLE" ]
then
# check whether the output directory already exists
if [ ! -d $OUT_DIR/$name_sample ]
then
mkdir $OUT_DIR/$name_sample
fastq_dump_path=`which fastq-dump.2.4.2`
picard_path=`which picard.jar`
gdc_client_path=`which gdc-client`
### change of directory
cd $OUT_DIR/$name_sample/
echo "#!/bin/bash" > run.bash
echo "cd $OUT_DIR/$name_sample/ # change of directory" >> run.bash
else
echo "Output $OUT_DIR/$name_sample Existing!"
break
fi

if [ "$header" == "SRA" ]; then
echo
echo "###### Downloading $name_sample from $header!"

link_sample="ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/${id_sample:0:6}/$id_sample/"
ID_RUNS=(`curl -sS $link_sample | sed -r 's/\s+/\t/g' | cut -f9 `)
num_run=0
echo "$link_sample"
mkdir data # creating data directory...
echo "mkdir data # creating data directory" >> run.bash
for id_run in "${ID_RUNS[@]}"
do
echo " |____ $id_run"
((num_run = 10#$num_run + 1)) # force base-10 so zero-padded values (e.g. 008) are not parsed as octal
num_run=$(printf "%03d" $num_run)
link_run=${link_sample}${id_run}"/"
ID_FILES=(`curl -sS $link_run | sed -r 's/\s+/\t/g' | cut -f9 `)
for name_file in "${ID_FILES[@]}"
do
echo " | |____ $id_file"
echo " | | Retrieving SRA... "
link_file=${link_run}${name_file}
wget -nv $link_file # downloading .sra file
echo "wget -nv $link_file # download .sra file" >> run.bash
echo " | | Converting SRA..."
$fastq_dump_path --split-files -gzip $name_file # convert SRA file to fastq.gz file and separate read1 and read2
echo "$fastq_dump_path --split-files -gzip $name_file # convert SRA file to fastq.gz file and separate read1 and read2" >> run.bash
id_file=${name_file%.*}
rm -rf $name_file # cleaning up .sra file...
echo "mkdir data # cleaning up .sra file..." >> run.bash
mv ${id_file}_1.fastq.gz data/; ln -s data/${id_file}_1.fastq.gz ${name_sample}_L${num_run}_R1.fastq.gz # linking read1...
echo "mv ${id_file}_1.fastq.gz data/; ln -s data/${id_file}_1.fastq.gz ${name_sample}_L${num_run}_R1.fastq.gz # linking read1..." >> run.bash
echo " | | Linking data/${id_file}_1.fastq.gz to ${name_sample}_L${num_run}_R1.fastq.gz"
if [[ -f ${id_file}_2.fastq.gz ]]
then
mv ${id_file}_2.fastq.gz data/; ln -s data/${id_file}_2.fastq.gz ${name_sample}_L${num_run}_R2.fastq.gz # linking read2...
echo "mv ${id_file}_2.fastq.gz data/; ln -s data/${id_file}_2.fastq.gz ${name_sample}_L${num_run}_R2.fastq.gz # linking read2..." >> run.bash
echo " | | Linking data/${id_file}_2.fastq.gz to ${name_sample}_L${num_run}_R2.fastq.gz"
fi
done
done
elif [[ $header == *gdc-user-token* ]]
then
echo
echo "###### Downloading $name_sample from TCGA!"
echo $id_sample
echo " |____ Downloading .bam file..."
cd ../;$gdc_client_path download --no-annotations -n $NSLOTS -t $header $id_sample;cd -;mkdir data # Downloading bam file and creating data directory...
echo "cd ../;$gdc_client_path download --no-annotations -n $NSLOTS-t $header $id_sample;cd -;mkdir data # Downloading bam file and creatingi data directory... " >> run.bash
echo " |____ Converting to .fastq file..."
java -jar $picard_path SamToFastq I=`ls *.bam` FASTQ=data/${id_sample}_L001_R1.fastq SECOND_END_FASTQ=data/${id_sample}_L001_R2.fastq QUIET=true VERBOSITY=ERROR INCLUDE_NON_PF_READS=true # Converting to fastq file...
echo "java -jar $picard_path SamToFastq I=\`ls *.bam\` FASTQ=data/${id_sample}_L001_R1.fastq SECOND_END_FASTQ=data/${id_sample}_L001_R2.fastq QUIET=true VERBOSITY=ERROR INCLUDE_NON_PF_READS=true # Converting to fastq file..." >> run.bash
rm -rf *.ba{m,i} # Cleaning up .bam and .bai file
echo "rm -rf *.ba{m,i} # cleaning up .bam and .bai file" >> run.bash
echo " |____ Gzipping to .gz file..."
gzip data/${id_sample}_L001_R1.fastq data/${id_sample}_L001_R2.fastq # Gzipping to .gz file...
echo "gzip data/${id_sample}_L001_R1.fastq data/${id_sample}_L001_R2.fastq # Gzipping to .gz file..." >> run.bash
ln -s data/${id_sample}_L001_R1.fastq.gz ${name_sample}_L001_R1.fastq.gz # Linking read1...
echo "ln -s data/${id_sample}_L001_R1.fastq.gz ${name_sample}_L001_R1.fastq.gz # Linking read1..." >> run.bash
if [ ! -s data/${id_sample}_L001_R2.fastq.gz ]
then
rm -rf data/${id_sample}_L001_R2.fastq.gz # Deleting empty read2 file...
echo "rm -rf data/${id_sample}_L001_R2.fastq.gz # Deleting empty read2 file..." >> run.bash
else
ln -s data/${id_sample}_L001_R2.fastq.gz ${name_sample}_L001_R2.fastq.gz # Linking read2...
echo "ln -s data/${id_sample}_L001_R2.fastq.gz ${name_sample}_L001_R2.fastq.gz # Linking read2..." >> run.bash
fi
elif [ -d $header ]
then
echo
echo "###### Linking $name_sample from Local Directory!"
echo $header
ln -s $header data # linking source directory...
echo "ln -s $header data # linking source directory..." >> run.bash
for myfastq in `ls -1 $header | grep "$id_sample"`
do
echo " |____ data/`basename $myfastq` to ${name_sample}`echo $myfastq | sed 's/'$id_sample'/\t/g' | cut -f2`"
ln -sf data/`basename $myfastq` ${name_sample}`echo $myfastq | sed 's/'$id_sample'/\t/g' | cut -f2` # linking file...
echo "ln -sf data/\`basename $myfastq\` ${name_sample}\`echo $myfastq | sed 's/'$id_sample'/\t/g' | cut -f2\` # linking file..." >> run.bash
done
else
printf "\n###### Metadata file lacking of comment header! \n\t(The first line of input metadata file should start with \"\# \" and follow by \" SRA\", \"TCGA\", or local directory path\n"
fi
fi
echo "cd $CWD # changing back to working directory..." >> run.bash
cd $CWD
done


#########################
echo
date
# end
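For reference, a minimal sketch of invoking the new download script, following the usage block at the top of the file; the thread count, metadata file, output directory, and sample name are illustrative (taken from the download_list.txt added below), and this exact invocation is an assumption, not shown in the diff:

# sketch: fetch, convert, and link FASTQs for one sample using 6 threads
code/fastq-download 6 meta_data/download_list.txt pipeline/fastq EryB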
6 changes: 3 additions & 3 deletions code/install_R_packages.r
@@ -1,11 +1,11 @@
#!/usr/bin/env Rscript

list.of.packages=c("tools","DESeq2","GenomicFeatures","dplyr","BiocParallel","pheatmap","RColorBrewer","ggplot2","ReportingTools","hwriter","reshape2","preprocessCore","cowplot","diagram","GGally","tidyr","animation")
list.of.packages=c("EDASeq","ggthemes","TxDb.Hsapiens.UCSC.hg19.knownGene","TCGAbiolinks","tools","DESeq2","GenomicFeatures","dplyr","BiocParallel","pheatmap","RColorBrewer","ggplot2","ReportingTools","hwriter","reshape2","preprocessCore","cowplot","diagram","GGally","tidyr","animation")
missing.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
for (p in missing.packages){
if(!file.exists(Sys.getenv("R_LIBS_USER"))){
system(paste0("mkdir -p ",Sys.getenv("R_LIBS_USER")))
}
source("https://bioconductor.org/biocLite.R")
biocLite(p,lib=Sys.getenv("R_LIBS_USER"))
source("http://bioconductor.org/biocLite.R")
biocLite(p,lib=Sys.getenv("R_LIBS_USER"),suppressUpdates=TRUE)
}
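Run standalone, the installer amounts to the following sketch (assuming R_LIBS_USER is already set in the environment, as the script expects):

# sketch: install any missing Bioconductor/CRAN packages into R_LIBS_USER
Rscript code/install_R_packages.r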
2 changes: 1 addition & 1 deletion code/job_submitter.bash
@@ -5,4 +5,4 @@ source code/custom-bashrc
NSLOTS=$1
shift

qsub -b Y -cwd -pe threaded $NSLOTS -l tmp_free=120G -l tmp_token=30G -hard $@
qsub -b Y -cwd -pe threaded $NSLOTS -e /dev/null -o /dev/null -l tmp_free=120G -l tmp_token=30G -hard $@
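The wrapper takes a slot count followed by the command to submit (NSLOTS=$1; shift; qsub ... $@), so a hedged example submission, reusing the download step from this commit (metadata file and sample name are examples), might look like:

# sketch: submit the download step to the grid with 6 slots
bash code/job_submitter.bash 6 code/fastq-download meta_data/download_list.txt pipeline/fastq EryB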
13 changes: 8 additions & 5 deletions code/run_alignment
@@ -34,6 +34,11 @@ STAR_PREFIX=${BASE_DIR}/STAR.
NAME=`basename $BASE_DIR`


#########################
# bin path
picard_path=`which picard.jar`
star_path=`which STAR`


#########################
# check inputs
@@ -94,14 +99,12 @@ echo " * BAM: ${STAR_PREFIX}Aligned.sortedByCoord.out.bam "
# Picard says STAR-sorted BAM "is not coordinate sorted", so using samtools for sorting

CMD="
STAR \
$star_path \
--runThreadN $NSLOTS \
--genomeLoad NoSharedMemory \
--outFilterMismatchNoverLmax 0.05 \
--outFilterType BySJout \
--outSAMstrandField intronMotif \
--outSAMattributes NH HI AS nM NM MD XS \
--outSAMmapqUnique 60 \
--twopassMode Basic \
--readFilesIn $FASTQ1 $FASTQ2 \
--outFileNamePrefix $STAR_PREFIX \
@@ -146,7 +149,7 @@ else
#---------------#
# need to figure out how to set this path to the jar files
CMD="
java -XX:-UsePerfData -Xms8G -Xmx16G -jar /ifs/home/gongy05/utilities/picard-tools/1.88/MarkDuplicates.jar \
java -XX:-UsePerfData -Xms8G -Xmx16G -jar $picard_path MarkDuplicates \
VERBOSITY=WARNING QUIET=true VALIDATION_STRINGENCY=LENIENT MAX_RECORDS_IN_RAM=2500000 \
$params \
INPUT=${STAR_PREFIX}Aligned.sortedByCoord.out.bam \
@@ -184,7 +187,7 @@ CMD="
cat ${STAR_PREFIX}Log.final.out | grep \"Number of input reads\|Uniquely mapped reads number\|Number of reads mapped to too many loci\" | cut -f2 | perl -lane 'printf \"\$_\t\"' | awk '{print \$1-\$2-\$3}' | awk '{print \"\t\tNumber of reads unmapped\t|\t\"\$1}' >> ${STAR_PREFIX}Log.final.out;
echo >> ${STAR_PREFIX}Log.final.out;
echo -e \"\t\t\t\tDUPLICATED READS:\" >> ${STAR_PREFIX}Log.final.out;
head -8 $BASE_DIR/remove_duplicates.txt | tail -1 | cut -f8 | awk '{print \"\t\t\t% of reads Duplicated\t|\t\"\$1*100\"%\"}' >> ${STAR_PREFIX}Log.final.out;
head -8 $BASE_DIR/remove_duplicates.txt | tail -1 | cut -f9 | awk '{print \"\t\t\t% of reads Duplicated\t|\t\"\$1*100\"%\"}' >> ${STAR_PREFIX}Log.final.out;
grep \"Uniquely mapped reads number\|Duplicated\" ${STAR_PREFIX}Log.final.out | sed -E 's/\s+//g' | cut -d '|' -f2 | perl -lane 'printf \"\$_\t\"' | perl -lane '{\$dup=sprintf(\"%.0f\",\$F[0]*\$F[1]/100);\$unique=\$F[0]-\$dup;print \"\$unique\t\$dup\"}' | awk '{print \"\t\tNumber of reads unduplicated\t|\t\"\$1\"\n\t\tNumber of reads duplicated\t|\t\"\$2}' >> ${STAR_PREFIX}Log.final.out
"
#printf "\n\n $CMD \n\n"
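The comment in run_alignment above notes that Picard rejects the STAR-sorted BAM as "not coordinate sorted", so samtools is used for the sort instead. The actual sort command falls outside the hunks shown here; a minimal sketch of what it would amount to (output name and thread flag are assumptions):

# sketch: coordinate-sort the STAR output for Picard, then index it
samtools sort -@ $NSLOTS -o ${STAR_PREFIX}sorted.bam ${STAR_PREFIX}Aligned.sortedByCoord.out.bam
samtools index ${STAR_PREFIX}sorted.bam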
4 changes: 4 additions & 0 deletions code/utilities/gdc-client
@@ -0,0 +1,4 @@
#!/local/apps/python/2.7.3/bin/python
# EASY-INSTALL-SCRIPT: 'gdc-client==1.2.0','gdc-client'
__requires__ = 'gdc-client==1.2.0'
__import__('pkg_resources').run_script('gdc-client==1.2.0', 'gdc-client')
Binary file added code/utilities/picard-2.9.0.jar
1 change: 1 addition & 0 deletions code/utilities/picard.jar
25 changes: 0 additions & 25 deletions meta_data/20160224.txt

This file was deleted.

6 changes: 6 additions & 0 deletions meta_data/download_list.txt
@@ -0,0 +1,6 @@
EryB SRX669497 SRA
f327cd15-9af4-4c6d-8318-986934ea32c9 f327cd15-9af4-4c6d-8318-986934ea32c9 meta_data/gdc-user-token.xxxxxx.txt
A549_REN_24H-rep1 A549_REN_REP1_24H_CGATGT /ifs/data/2016-02-24/fastq/
EryA SRX669496 SRA
e3878649-03e0-4d19-a14c-344afe2f1b4a e3878649-03e0-4d19-a14c-344afe2f1b4a meta_data/gdc-user-token.xxxxxx.txt
A549_REN_48H-rep1 A549_REN_REP1_48H_CGATGT /ifs/data/2016-02-24/fastq/