first file dump

NYU-Molecular-Pathology · Aug 24, 2016 · 5019b41 · 5019b41
1 parent eaa4fa3
commit 5019b41
Show file tree

Hide file tree

Showing 34 changed files with 5,242 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,122 @@
+
+# Created by https://www.gitignore.io/
+
+### OSX ###
+*.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+
+### Linux ###
+*~
+
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+
+### Windows ###
+# Windows image file caches
+Thumbs.db
+ehthumbs.db
+
+# Folder config file
+Desktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+*.cab
+*.msi
+*.msm
+*.msp
+
+# Windows shortcuts
+*.lnk
+
+
+### R ###
+# History files
+.Rhistory
+.Rapp.history
+
+# Session Data files
+.RData
+
+# Example code in package build process
+*-Ex.R
+
+# Output files from R CMD build
+/*.tar.gz
+
+# Output files from R CMD check
+/*.Rcheck/
+
+# RStudio files
+.Rproj.user/
+
+# produced vignettes
+vignettes/*.html
+vignettes/*.pdf
+
+# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
+.httr-oauth
+
+# knitr and R markdown default cache directories
+/*_cache/
+/cache/
+
+# Temporary files created by R markdown
+*.utf8.md
+*.knit.md
+
+
+### Perl ###
+/blib/
+/.build/
+_build/
+cover_db/
+inc/
+Build
+!Build/
+Build.bat
+.last_cover_stats
+/Makefile
+/Makefile.old
+/MANIFEST.bak
+/META.yml
+/META.json
+/MYMETA.*
+nytprof.out
+/pm_to_blib
+*.o
+*.bs
+/_eumm/
diff --git a/gather-fastqs b/gather-fastqs
@@ -0,0 +1,133 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use List::MoreUtils qw(uniq);
+
+# usage text shown when no directory argument is given
+# (the output file name here must match the one written by main)
+my $HELP = <<HELP;
+
+  Find FASTQ files in a given directory (must have "_R1" or "_1" in file name).
+  Extract sample names and paired reads based on file names.
+  Generate sample table file samples.fastq-raw.csv in current directory.
+
+  If run multiple times, it will add new samples to the sample table.
+
+  usage: gather-fastqs dir
+
+HELP
+
+# a directory argument is required; test definedness and length rather than
+# truthiness so that a directory literally named "0" is still accepted
+unless (defined($ARGV[0]) && length($ARGV[0])) {
+	die $HELP;
+}
+
+main();
+
+# main subroutine
+#
+# Finds R1 FASTQ files (at most two levels deep) under the directory given as
+# the first command-line argument, pairs each with its R2 mate when that file
+# exists, appends one "sample,R1,R2" line per R1 file to samples.fastq-raw.csv
+# in the current directory, de-duplicates that file, and prints summary
+# statistics to STDERR. Dies if the directory or a listed FASTQ is missing,
+# or if the sample table cannot be written.
+sub main {
+	my $search_arg = $ARGV[0];
+
+	# convert dir from relative to absolute (resolving symlinks) without
+	# passing the user-supplied path through a shell
+	require Cwd;
+	my $search_dir = Cwd::abs_path($search_arg);
+
+	# check that dir exists
+	unless ( defined($search_dir) && -d $search_dir ) {
+		die "\n\n ERROR! $search_arg DOES NOT EXIST \n\n";
+	}
+
+	# find fastqs in given directory
+	# list-form pipe open bypasses the shell, so paths with spaces are safe;
+	# the parentheses make "-type f" apply to both name patterns
+	my @find_cmd = (
+		'find', '-L', $search_dir, '-maxdepth', '2', '-type', 'f',
+		'(', '-name', '*_R1*.fastq.gz', '-o', '-name', '*_1.fastq.gz', ')',
+	);
+	open(my $find_fh, '-|', @find_cmd)
+		or die "\n\n ERROR! CANNOT RUN find : $! \n\n";
+	my @fastqs = <$find_fh>;
+	# find's exit status is deliberately not checked: whatever it printed is still used
+	close($find_fh);
+	chomp(@fastqs);
+	# default sort compares strings bytewise, same order as "LC_ALL=C sort"
+	@fastqs = sort @fastqs;
+
+	# counter single and paired reads
+	my $reads_se = 0;
+	my $reads_pe = 0;
+	my @samples = ();
+
+	# sample table file (opened for append so repeated runs add to it)
+	my $filename = "samples.fastq-raw.csv";
+	open(my $fh, ">>", $filename)
+		or die "\n\n ERROR! CANNOT WRITE $filename : $! \n\n";
+
+	# process each fastq
+	for my $fastq_r1 (@fastqs) {
+
+		# check that R1 exists
+		unless ( -e $fastq_r1 ) {
+			die "\n\n ERROR! $fastq_r1 DOES NOT EXIST \n\n";
+		}
+
+		# generate R2 filename
+		# patterns are anchored to the end of the file name (and the first one
+		# is kept out of directory components) so only the read tag is rewritten
+		my $fastq_r2 = $fastq_r1;
+		$fastq_r2 =~ s/(.*)_R1_00([^\/]*\.fastq\.gz)\z/${1}_R2_00${2}/;
+		$fastq_r2 =~ s/_R1\.fastq\.gz\z/_R2.fastq.gz/;
+		$fastq_r2 =~ s/_1\.fastq\.gz\z/_2.fastq.gz/;
+
+		# blank if R2 does not exist, or if R2 is same as R1
+		# (in case of not standard file name, for example)
+		if ( !-e $fastq_r2 || $fastq_r1 eq $fastq_r2 ) {
+			$fastq_r2 = "";
+		}
+
+		# count based on read type
+		if ( length($fastq_r2) ) {
+			$reads_pe++;
+		}
+		else {
+			$reads_se++;
+		}
+
+		# extract sample name
+		my $sample = $fastq_r1;
+		# remove directory structure
+		$sample =~ s/.*\///;
+		# bcl2fastq2 format (with S sample number)
+		$sample =~ s/_S[0-9]{1,3}_L00[0-9]_R1.*//;
+		# bcl2fastq format with 2 barcodes
+		$sample =~ s/_[ACTG]{6,}-[ACTG]{6,}_L00[0-9]_R1.*//;
+		# bcl2fastq format with 1 barcode
+		$sample =~ s/_[ACTG]{4,}_L00[0-9]_R1.*//;
+		# no barcodes
+		$sample =~ s/_L00[0-9]_R[12].*//;
+		# no barcodes or lane (R1/R2 style)
+		$sample =~ s/_R[12]\.fastq\.gz\z//;
+		# no barcodes or lane (1/2 style)
+		$sample =~ s/_[12]\.fastq\.gz\z//;
+
+		push @samples, $sample;
+
+		# show progress
+		print STDERR " SAMPLE : $sample \n";
+		print STDERR "  FASTQ R1 : $fastq_r1 \n";
+		print STDERR "  FASTQ R2 : $fastq_r2 \n";
+
+		# print sample table line
+		print {$fh} "${sample},${fastq_r1},${fastq_r2}\n";
+
+	}
+	# buffered write errors surface at close, so check it
+	close($fh)
+		or die "\n\n ERROR! CANNOT WRITE $filename : $! \n\n";
+
+	# remove duplicate entries (the file name is a fixed literal, so the shell string is safe)
+	system("cat $filename | LC_ALL=C sort | uniq > ${filename}.tmp && mv -f ${filename}.tmp $filename") == 0
+		or die "\n\n ERROR! CANNOT DE-DUPLICATE $filename \n\n";
+
+	# get number of unique sample names
+	@samples = uniq(@samples);
+	my $num_samples = @samples;
+
+	# print stats
+	print STDERR "\n";
+	print STDERR " NUMBER OF SAMPLES : $num_samples \n";
+	print STDERR " NUMBER OF SINGLE FILES : $reads_se \n";
+	print STDERR " NUMBER OF PAIRED FILES : $reads_pe \n";
+
+}
+
+
+
+# end
diff --git a/generate-settings b/generate-settings
@@ -0,0 +1,65 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use File::Basename;
+
+# usage text shown when no genome argument is given
+my $HELP = <<"HELP";
+
+  Generate the initial settings.txt file for a specified genome.
+
+  usage: generate-settings genome
+  * genome = name (hg19, mm10, etc.) or exact dir (contains genome.fa and genes.gtf)
+
+HELP
+
+# a genome argument is required
+die $HELP unless $ARGV[0];
+
+# run the script
+main();
+
+# main subroutine
+#
+# Writes settings.txt in the current directory with a single GENOME-DIR entry
+# derived from the first command-line argument, then queries
+# scripts/get-set-setting.sh (next to this file) for GENOME-DIR and REF-FASTA
+# and prints both values to STDERR. Dies if the settings file cannot be written.
+sub main {
+	my $genome_arg = $ARGV[0];
+
+	# pipeline directory (the directory that this file is in)
+	my $pipeline_dir = dirname(__FILE__);
+
+	# settings file
+	my $settings_file = "settings.txt";
+
+	# use the argument as the directory if it contains a slash,
+	# otherwise treat it as a genome name under the id460 reference dir
+	my $genome_dir = ($genome_arg =~ m{/})
+		? $genome_arg
+		: "/ifs/home/id460/ref/${genome_arg}";
+
+	# save genome setting (overwrites any existing settings file)
+	open(my $fh, ">", $settings_file)
+		or die "\n\n ERROR! CANNOT WRITE $settings_file : $! \n\n";
+	print {$fh} "GENOME-DIR|${genome_dir}\n";
+	# buffered write errors surface at close, so check it
+	close($fh)
+		or die "\n\n ERROR! CANNOT WRITE $settings_file : $! \n\n";
+
+	# get values to make sure they were set properly
+	my $settings_genome = _read_setting($pipeline_dir, $settings_file, "GENOME-DIR");
+	my $settings_fasta = _read_setting($pipeline_dir, $settings_file, "REF-FASTA");
+
+	# print values
+	print STDERR "\n";
+	print STDERR " REF DIR : $settings_genome \n";
+	print STDERR " REF FASTA : $settings_fasta \n";
+
+}
+
+# run scripts/get-set-setting.sh for one setting name and return what it
+# printed on STDOUT (trailing newline removed, empty string on failure)
+sub _read_setting {
+	my ($pipeline_dir, $settings_file, $setting_name) = @_;
+
+	# list-form pipe open bypasses the shell, so a pipeline dir with spaces is safe
+	my @cmd = ('bash', "${pipeline_dir}/scripts/get-set-setting.sh", $settings_file, $setting_name);
+	my $pipe;
+	unless ( open($pipe, '-|', @cmd) ) {
+		warn " WARNING: CANNOT RUN $cmd[1] : $! \n";
+		return "";
+	}
+
+	# slurp the whole output, as backticks in scalar context would
+	my $value = do { local $/; <$pipe> };
+	close($pipe);
+
+	$value = "" unless defined($value);
+	chomp($value);
+	return $value;
+}
+
+
+
+# end