major dependencies update support
gongyixiao committed Apr 17, 2017
1 parent 592ae97 commit 80e445b
Showing 19 changed files with 230 additions and 91 deletions.
9 changes: 2 additions & 7 deletions code/By_Sample
@@ -46,16 +46,11 @@ date
echo

echo "###### Downloading: $SAMPLE"
run_fastq-download $params pipeline/download meta_data $SAMPLE
if [ $FLAG -eq 1 ]; then rsync -rl pipeline/download/ $CWD/pipeline/download/; fi
fastq-download `cat params | grep "^fastq-download" | cut -f2` pipeline/fastq $SAMPLE
if [ $FLAG -eq 1 ]; then rsync -rl pipeline/fastq/ $CWD/pipeline/fastq/; fi
status=$?
if [ $status -ne 0 ]; then exit 1; fi

echo "###### Linking: $SAMPLE"
run_fastq-link $params pipeline/fastq pipeline/download $SAMPLE
status=$?
if [ $FLAG -eq 1 ]; then rsync -rl pipeline/fastq/ $CWD/pipeline/fastq/; fi
if [ $status -ne 0 ]; then exit 1; fi
echo "###### Alignning: $SAMPLE"
run_alignment $params pipeline/alignment pipeline/fastq $SAMPLE
status=$?
47 changes: 35 additions & 12 deletions code/DESeq_Comparison_Report.r
@@ -105,6 +105,7 @@ summarize_comparison=function(outdir,signature,target,DE,comparison,preprocessing

# MA-plot. The log2 fold change for a particular comparison is plotted on the y-axis and the average of the counts normalized by size factor is shown on the x-axis (“M” for minus, because a log ratio is equal to log minus log, and “A” for average). Each gene is represented with a dot. Genes with an adjusted p value below a threshold (here 0.1, the default) are shown in red
markMAplot <- function(outdir,signature,comparison,prefix,res){
pdf(file=paste(outdir,"/",comparison[2],"_vs_",comparison[3],"/",prefix,"_MA_plot.pdf",sep=""),onefile=FALSE)
if(outdir==""){outdir="."}
plotMA(res, ylim=c(-5,5), main=paste(prefix," ", comparison[2]," vs ",comparison[3],sep=""))
mylist=""
@@ -117,18 +118,26 @@ markMAplot <- function(outdir,signature,comparison,prefix,res){
text(baseMean, log2FoldChange, topGeneName, pos=2, col="dodgerblue",cex=0.6)
})
}
p <- recordPlot()
plot.new()
png(filename=paste(outdir,"/",comparison[2],"_vs_",comparison[3],"/",prefix,"_MA_plot.png",sep=""))
print(p)
dev.off()
pdf(file=paste(outdir,"/",comparison[2],"_vs_",comparison[3],"/",prefix,"_MA_plot.pdf",sep=""),onefile=FALSE)
print(p)
png(filename=paste(outdir,"/",comparison[2],"_vs_",comparison[3],"/",prefix,"_MA_plot.png",sep=""))
if(outdir==""){outdir="."}
plotMA(res, ylim=c(-5,5), main=paste(prefix," ", comparison[2]," vs ",comparison[3],sep=""))
mylist=""
for (i in signature){mylist=c(mylist,grep(paste("^",i,"$|:",i,"$",sep=""), res$symbol))}
mylist=rownames(res)[as.numeric(mylist[-1])]
for (topGene in mylist){
topGeneName=res[topGene,]$symbol
with(res[topGene, ], {
points(baseMean, log2FoldChange, col="dodgerblue", cex=0.5, lwd=2)
text(baseMean, log2FoldChange, topGeneName, pos=2, col="dodgerblue",cex=0.6)
})
}
dev.off()
}

# Plot Volcano Plot
markVolcanoPlot <- function(outdir,signature,comparison,prefix,res){
pdf(file=paste(outdir,"/",comparison[2],"_vs_",comparison[3],"/",prefix,"_Volcano_plot.pdf",sep=""),onefile=FALSE)
if(outdir==""){outdir="."}
tab = data.frame(logFC = res$log2FoldChange, negLogPval = -log10(res$pvalue),symbol=res$symbol)
par(mar = c(5, 5, 5, 6))
@@ -148,14 +157,28 @@ abline(h = -log10(pval), col = "green3", lty = 2)
abline(v = c(-lfc, lfc), col = "blue", lty = 2)
mtext(paste("pval =", pval), side = 4, at = -log10(pval), cex = 0.8, line = 0.5, las = 1)
mtext(c(paste("-", lfc, "fold"), paste("+", lfc, "fold")), side = 3, at = c(-lfc, lfc), cex = 0.8, line = 0.5)
p <- recordPlot()
plot.new()
dev.off()

png(filename=paste(outdir,"/",comparison[2],"_vs_",comparison[3],"/",prefix,"_Volcano_plot.png",sep=""))
print(p)
dev.off()
pdf(file=paste(outdir,"/",comparison[2],"_vs_",comparison[3],"/",prefix,"_Volcano_plot.pdf",sep=""),onefile=FALSE)
print(p)
if(outdir==""){outdir="."}
tab = data.frame(logFC = res$log2FoldChange, negLogPval = -log10(res$pvalue),symbol=res$symbol)
par(mar = c(5, 5, 5, 6))
plot(tab[,1:2], main=paste(prefix," ", comparison[2]," vs ",comparison[3],sep=""), xlim=range(-max(abs(na.omit(tab$logFC))),max(abs(na.omit(tab$logFC)))), pch = 16, cex = 0.6, xlab = expression(log[2]~fold~change), ylab = expression(-log[10]~pvalue))
lfc = 1
pval = 0.05
signGene=tab %>% filter(abs(logFC)>lfc) %>% filter(negLogPval>-log10(pval))
points(signGene, pch = 16, cex = 0.6, col = "red")
mylist=""
for (i in signature){mylist=c(mylist,grep(paste("^",i,"$|:",i,"$",sep=""), res$symbol))}
mylist=rownames(res)[as.numeric(mylist[-1])]
for (topGene in mylist){
points(tab[topGene,1:2], cex=0.7, lwd=2, col = "dodgerblue")
text(tab[topGene,1:2],labels=as.character(tab[topGene,]$symbol), pos=4, col="dodgerblue",cex=0.6)
}
abline(h = -log10(pval), col = "green3", lty = 2)
abline(v = c(-lfc, lfc), col = "blue", lty = 2)
mtext(paste("pval =", pval), side = 4, at = -log10(pval), cex = 0.8, line = 0.5, las = 1)
mtext(c(paste("-", lfc, "fold"), paste("+", lfc, "fold")), side = 3, at = c(-lfc, lfc), cex = 0.8, line = 0.5)
dev.off()
}

4 changes: 2 additions & 2 deletions code/DESeq_post_v3.r
@@ -51,7 +51,7 @@ align_reads <- function(x){
x=x[order(x$V1,x$V2),]
myplot <-
ggplot(x,aes(x=V1,y=V3,fill=V2)) +
geom_bar(stat="identity",position="stack") +
geom_bar(stat="identity",position=position_stack(vjust = 1, reverse = T)) +
scale_fill_manual(values=c("limegreen","forestgreen","orchid","indianred1")) +
ggtitle("B. Alignment Category Reads") +
xlab("") + ylab("Number of Reads in Millions") +
Expand All @@ -66,7 +66,7 @@ align_PCT <- function(x){
x$V4 <- factor(x$V4, levels = c("Uniquely_Mapped","Multi_Mapped","Unmapped"))
x=x[order(x$V1,x$V2),]
myplot <- ggplot(x,aes(x=V1,y=V3,fill=V2)) +
geom_bar(stat="identity",position="stack") +
geom_bar(stat="identity",position=position_stack(vjust = 1, reverse = T)) +
ggtitle("A. Alignment Category Percentage") +
scale_fill_manual(values=c("limegreen","forestgreen","orchid","indianred1")) +
xlab("") + ylab("Percentage of Reads") +
5 changes: 3 additions & 2 deletions code/custom-bashrc
@@ -15,6 +15,7 @@ export PYTHONPATH
export PERL5LIB=$HOME/lib
export EDITOR="/usr/local/bin/mate -w"

module load samtools/0.1.19
module load samtools/1.3
module load r/3.3.0
module load python/2.7
module load python/2.7.3
module load java/1.8
155 changes: 155 additions & 0 deletions code/fastq-download
@@ -0,0 +1,155 @@
#!/bin/bash

source code/custom-bashrc


if [ "$#" -ne 2 ] && [ "$#" -ne 3 ] && [ "$#" -ne 4 ]
then
printf "\n###### Usage\n"
printf "$0 [Optional: Threads (Default 6)] <PARAMS> <OUTPUT_DIR> [Optional: Sample Name] \n"
exit
fi

date




#########################
# input

if [[ $1 =~ ^[[:digit:]]{1,2}$ ]];then NSLOTS=$1;shift
elif [[ $NSLOTS == "" ]]; then NSLOTS=6; fi

PARAMS=$1
OUT_DIR=$2
SAMPLE=$3

CWD=`pwd`

#########################
if [ ! -d $OUT_DIR ]; then mkdir -p $OUT_DIR; fi

IFS=$'\n'

#header=`cat $PARAMS | grep -E "^#" | head -1 | cut -d' ' -f2`


########################
# SRA Download
for record in `cat $PARAMS | grep -v "^#"`
do
name_sample=`echo $record | cut -f1`
id_sample=`echo $record | cut -f2`
header=`echo $record | cut -f3`
if [ "$name_sample" == "$SAMPLE" ] || [ -z "$SAMPLE" ]
then
# check whether the output directory already exists
if [ ! -d $OUT_DIR/$name_sample ]
then
mkdir $OUT_DIR/$name_sample
fastq_dump_path=`which fastq-dump.2.4.2`
picard_path=`which picard.jar`
gdc_client_path=`which gdc-client`
### change of directory
cd $OUT_DIR/$name_sample/
echo "#!/bin/bash" > run.bash
echo "cd $OUT_DIR/$name_sample/ # change of directory" >> run.bash
else
echo "Output $OUT_DIR/$name_sample Existing!"
break
fi

if [ "$header" == "SRA" ]; then
echo
echo "###### Downloading $name_sample from $header!"

link_sample="ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/${id_sample:0:6}/$id_sample/"
ID_RUNS=(`curl -sS $link_sample | sed -r 's/\s+/\t/g' | cut -f9 `)
num_run=0
echo "$link_sample"
mkdir data # creating data directory...
echo "mkdir data # creating data directory" >> run.bash
for id_run in "${ID_RUNS[@]}"
do
echo " |____ $id_run"
((num_run = 10#$num_run + 1)) # force base-10 so zero-padded values (e.g. 008) are not parsed as octal
num_run=$(printf "%03d" $num_run)
link_run=${link_sample}${id_run}"/"
ID_FILES=(`curl -sS $link_run | sed -r 's/\s+/\t/g' | cut -f9 `)
for name_file in "${ID_FILES[@]}"
do
echo " | |____ $id_file"
echo " | | Retrieving SRA... "
link_file=${link_run}${name_file}
wget -nv $link_file # downloading .sra file
echo "wget -nv $link_file # download .sra file" >> run.bash
echo " | | Converting SRA..."
$fastq_dump_path --split-files -gzip $name_file # convert SRA file to fastq.gz file and separate read1 and read2
echo "$fastq_dump_path --split-files -gzip $name_file # convert SRA file to fastq.gz file and separate read1 and read2" >> run.bash
id_file=${name_file%.*}
rm -rf $name_file # cleaning up .sra file...
echo "mkdir data # cleaning up .sra file..." >> run.bash
mv ${id_file}_1.fastq.gz data/; ln -s data/${id_file}_1.fastq.gz ${name_sample}_L${num_run}_R1.fastq.gz # linking read1...
echo "mv ${id_file}_1.fastq.gz data/; ln -s data/${id_file}_1.fastq.gz ${name_sample}_L${num_run}_R1.fastq.gz # linking read1..." >> run.bash
echo " | | Linking data/${id_file}_1.fastq.gz to ${name_sample}_L${num_run}_R1.fastq.gz"
if [[ -f ${id_file}_2.fastq.gz ]]
then
mv ${id_file}_2.fastq.gz data/; ln -s data/${id_file}_2.fastq.gz ${name_sample}_L${num_run}_R2.fastq.gz # linking read2...
echo "mv ${id_file}_2.fastq.gz data/; ln -s data/${id_file}_2.fastq.gz ${name_sample}_L${num_run}_R2.fastq.gz # linking read2..." >> run.bash
echo " | | Linking data/${id_file}_2.fastq.gz to ${name_sample}_L${num_run}_R2.fastq.gz"
fi
done
done
elif [[ $header == *gdc-user-token* ]]
then
echo
echo "###### Downloading $name_sample from TCGA!"
echo $id_sample
echo " |____ Downloading .bam file..."
cd ../;$gdc_client_path download --no-annotations -n $NSLOTS -t $header $id_sample;cd -;mkdir data # Downloading bam file and creating data directory...
echo "cd ../;$gdc_client_path download --no-annotations -n $NSLOTS-t $header $id_sample;cd -;mkdir data # Downloading bam file and creatingi data directory... " >> run.bash
echo " |____ Converting to .fastq file..."
java -jar $picard_path SamToFastq I=`ls *.bam` FASTQ=data/${id_sample}_L001_R1.fastq SECOND_END_FASTQ=data/${id_sample}_L001_R2.fastq QUIET=true VERBOSITY=ERROR INCLUDE_NON_PF_READS=true # Converting to fastq file...
echo "java -jar $picard_path SamToFastq I=\`ls *.bam\` FASTQ=data/${id_sample}_L001_R1.fastq SECOND_END_FASTQ=data/${id_sample}_L001_R2.fastq QUIET=true VERBOSITY=ERROR INCLUDE_NON_PF_READS=true # Converting to fastq file..." >> run.bash
rm -rf *.ba{m,i} # Cleaning up .bam and .bai file
echo "rm -rf *.ba{m,i} # cleaning up .bam and .bai file" >> run.bash
echo " |____ Gzipping to .gz file..."
gzip data/${id_sample}_L001_R1.fastq data/${id_sample}_L001_R2.fastq # Gzipping to .gz file...
echo "gzip data/${id_sample}_L001_R1.fastq data/${id_sample}_L001_R2.fastq # Gzipping to .gz file..." >> run.bash
ln -s data/${id_sample}_L001_R1.fastq.gz ${name_sample}_L001_R1.fastq.gz # Linking read1...
echo "ln -s data/${id_sample}_L001_R1.fastq.gz ${name_sample}_L001_R1.fastq.gz # Linking read1..." >> run.bash
if [ ! -s data/${id_sample}_L001_R2.fastq.gz ]
then
rm -rf data/${id_sample}_L001_R2.fastq.gz # Deleting empty read2 file...
echo "rm -rf data/${id_sample}_L001_R2.fastq.gz # Deleting empty read2 file..." >> run.bash
else
ln -s data/${id_sample}_L001_R2.fastq.gz ${name_sample}_L001_R2.fastq.gz # Linking read2...
echo "ln -s data/${id_sample}_L001_R2.fastq.gz ${name_sample}_L001_R2.fastq.gz # Linking read2..." >> run.bash
fi
elif [ -d $header ]
then
echo
echo "###### Linking $name_sample from Local Directory!"
echo $header
ln -s $header data # linking source directory...
echo "ln -s $header data # linking source directory..." >> run.bash
for myfastq in `ls -1 $header | grep "$id_sample"`
do
echo " |____ data/`basename $myfastq` to ${name_sample}`echo $myfastq | sed 's/'$id_sample'/\t/g' | cut -f2`"
ln -sf data/`basename $myfastq` ${name_sample}`echo $myfastq | sed 's/'$id_sample'/\t/g' | cut -f2` # linking file...
echo "ln -sf data/\`basename $myfastq\` ${name_sample}\`echo $myfastq | sed 's/'$id_sample'/\t/g' | cut -f2\` # linking file..." >> run.bash
done
else
printf "\n###### Metadata file lacking of comment header! \n\t(The first line of input metadata file should start with \"\# \" and follow by \" SRA\", \"TCGA\", or local directory path\n"
fi
fi
echo "cd $CWD # changing back to working directory..." >> run.bash
cd $CWD
done


#########################
echo
date
# end
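For reference, a minimal sketch of invoking the new download script, following the usage block at the top of the file; the thread count, metadata file, output directory, and sample name are illustrative (taken from the download_list.txt added below), and this exact invocation is an assumption, not shown in the diff:

# sketch: fetch, convert, and link FASTQs for one sample using 6 threads
code/fastq-download 6 meta_data/download_list.txt pipeline/fastq EryB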
6 changes: 3 additions & 3 deletions code/install_R_packages.r
@@ -1,11 +1,11 @@
#!/usr/bin/env Rscript

list.of.packages=c("tools","DESeq2","GenomicFeatures","dplyr","BiocParallel","pheatmap","RColorBrewer","ggplot2","ReportingTools","hwriter","reshape2","preprocessCore","cowplot","diagram","GGally","tidyr","animation")
list.of.packages=c("EDASeq","ggthemes","TxDb.Hsapiens.UCSC.hg19.knownGene","TCGAbiolinks","tools","DESeq2","GenomicFeatures","dplyr","BiocParallel","pheatmap","RColorBrewer","ggplot2","ReportingTools","hwriter","reshape2","preprocessCore","cowplot","diagram","GGally","tidyr","animation")
missing.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
for (p in missing.packages){
if(!file.exists(Sys.getenv("R_LIBS_USER"))){
system(paste0("mkdir -p ",Sys.getenv("R_LIBS_USER")))
}
source("https://bioconductor.org/biocLite.R")
biocLite(p,lib=Sys.getenv("R_LIBS_USER"))
source("http://bioconductor.org/biocLite.R")
biocLite(p,lib=Sys.getenv("R_LIBS_USER"),suppressUpdates=TRUE)
}
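Run standalone, the installer amounts to the following sketch (assuming R_LIBS_USER is already set in the environment, as the script expects):

# sketch: install any missing Bioconductor/CRAN packages into R_LIBS_USER
Rscript code/install_R_packages.r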
2 changes: 1 addition & 1 deletion code/job_submitter.bash
@@ -5,4 +5,4 @@ source code/custom-bashrc
NSLOTS=$1
shift

qsub -b Y -cwd -pe threaded $NSLOTS -l tmp_free=120G -l tmp_token=30G -hard $@
qsub -b Y -cwd -pe threaded $NSLOTS -e /dev/null -o /dev/null -l tmp_free=120G -l tmp_token=30G -hard $@
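The wrapper takes a slot count followed by the command to submit (NSLOTS=$1; shift; qsub ... $@), so a hedged example submission, reusing the download step from this commit (metadata file and sample name are examples), might look like:

# sketch: submit the download step to the grid with 6 slots
bash code/job_submitter.bash 6 code/fastq-download meta_data/download_list.txt pipeline/fastq EryB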
13 changes: 8 additions & 5 deletions code/run_alignment
@@ -34,6 +34,11 @@ STAR_PREFIX=${BASE_DIR}/STAR.
NAME=`basename $BASE_DIR`


#########################
# bin path
picard_path=`which picard.jar`
star_path=`which STAR`


#########################
# check inputs
@@ -94,14 +99,12 @@ echo " * BAM: ${STAR_PREFIX}Aligned.sortedByCoord.out.bam "
# Picard says STAR-sorted BAM "is not coordinate sorted", so using samtools for sorting

CMD="
STAR \
$star_path \
--runThreadN $NSLOTS \
--genomeLoad NoSharedMemory \
--outFilterMismatchNoverLmax 0.05 \
--outFilterType BySJout \
--outSAMstrandField intronMotif \
--outSAMattributes NH HI AS nM NM MD XS \
--outSAMmapqUnique 60 \
--twopassMode Basic \
--readFilesIn $FASTQ1 $FASTQ2 \
--outFileNamePrefix $STAR_PREFIX \
@@ -146,7 +149,7 @@ else
#---------------#
# need to figure out how to set this path to the jar files
CMD="
java -XX:-UsePerfData -Xms8G -Xmx16G -jar /ifs/home/gongy05/utilities/picard-tools/1.88/MarkDuplicates.jar \
java -XX:-UsePerfData -Xms8G -Xmx16G -jar $picard_path MarkDuplicates \
VERBOSITY=WARNING QUIET=true VALIDATION_STRINGENCY=LENIENT MAX_RECORDS_IN_RAM=2500000 \
$params \
INPUT=${STAR_PREFIX}Aligned.sortedByCoord.out.bam \
@@ -184,7 +187,7 @@ CMD="
cat ${STAR_PREFIX}Log.final.out | grep \"Number of input reads\|Uniquely mapped reads number\|Number of reads mapped to too many loci\" | cut -f2 | perl -lane 'printf \"\$_\t\"' | awk '{print \$1-\$2-\$3}' | awk '{print \"\t\tNumber of reads unmapped\t|\t\"\$1}' >> ${STAR_PREFIX}Log.final.out;
echo >> ${STAR_PREFIX}Log.final.out;
echo -e \"\t\t\t\tDUPLICATED READS:\" >> ${STAR_PREFIX}Log.final.out;
head -8 $BASE_DIR/remove_duplicates.txt | tail -1 | cut -f8 | awk '{print \"\t\t\t% of reads Duplicated\t|\t\"\$1*100\"%\"}' >> ${STAR_PREFIX}Log.final.out;
head -8 $BASE_DIR/remove_duplicates.txt | tail -1 | cut -f9 | awk '{print \"\t\t\t% of reads Duplicated\t|\t\"\$1*100\"%\"}' >> ${STAR_PREFIX}Log.final.out;
grep \"Uniquely mapped reads number\|Duplicated\" ${STAR_PREFIX}Log.final.out | sed -E 's/\s+//g' | cut -d '|' -f2 | perl -lane 'printf \"\$_\t\"' | perl -lane '{\$dup=sprintf(\"%.0f\",\$F[0]*\$F[1]/100);\$unique=\$F[0]-\$dup;print \"\$unique\t\$dup\"}' | awk '{print \"\t\tNumber of reads unduplicated\t|\t\"\$1\"\n\t\tNumber of reads duplicated\t|\t\"\$2}' >> ${STAR_PREFIX}Log.final.out
"
#printf "\n\n $CMD \n\n"
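The comment in run_alignment above notes that Picard rejects the STAR-sorted BAM as "not coordinate sorted", so samtools is used for the sort instead. The actual sort command falls outside the hunks shown here; a minimal sketch of what it would amount to (output name and thread flag are assumptions):

# sketch: coordinate-sort the STAR output for Picard, then index it
samtools sort -@ $NSLOTS -o ${STAR_PREFIX}sorted.bam ${STAR_PREFIX}Aligned.sortedByCoord.out.bam
samtools index ${STAR_PREFIX}sorted.bam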
4 changes: 4 additions & 0 deletions code/utilities/gdc-client
@@ -0,0 +1,4 @@
#!/local/apps/python/2.7.3/bin/python
# EASY-INSTALL-SCRIPT: 'gdc-client==1.2.0','gdc-client'
__requires__ = 'gdc-client==1.2.0'
__import__('pkg_resources').run_script('gdc-client==1.2.0', 'gdc-client')
Binary file added code/utilities/picard-2.9.0.jar
1 change: 1 addition & 0 deletions code/utilities/picard.jar
25 changes: 0 additions & 25 deletions meta_data/20160224.txt

This file was deleted.

6 changes: 6 additions & 0 deletions meta_data/download_list.txt
@@ -0,0 +1,6 @@
EryB SRX669497 SRA
f327cd15-9af4-4c6d-8318-986934ea32c9 f327cd15-9af4-4c6d-8318-986934ea32c9 meta_data/gdc-user-token.xxxxxx.txt
A549_REN_24H-rep1 A549_REN_REP1_24H_CGATGT /ifs/data/2016-02-24/fastq/
EryA SRX669496 SRA
e3878649-03e0-4d19-a14c-344afe2f1b4a e3878649-03e0-4d19-a14c-344afe2f1b4a meta_data/gdc-user-token.xxxxxx.txt
A549_REN_48H-rep1 A549_REN_REP1_48H_CGATGT /ifs/data/2016-02-24/fastq/