Skip to content

Commit

Permalink
first file dump
Browse files Browse the repository at this point in the history
  • Loading branch information
igordot committed Aug 24, 2016
1 parent eaa4fa3 commit 5019b41
Show file tree
Hide file tree
Showing 34 changed files with 5,242 additions and 0 deletions.
122 changes: 122 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@

# Created by https://www.gitignore.io/

### OSX ###
*.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk


### Linux ###
*~

# temporary files that can be created when a process still holds an open handle to a deleted file
.fuse_hidden*

# KDE directory preferences
.directory

# Linux trash folder which might appear on any partition or disk
.Trash-*


### Windows ###
# Windows image file caches
Thumbs.db
ehthumbs.db

# Folder config file
Desktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msm
*.msp

# Windows shortcuts
*.lnk


### R ###
# History files
.Rhistory
.Rapp.history

# Session Data files
.RData

# Example code in package build process
*-Ex.R

# Output files from R CMD build
/*.tar.gz

# Output files from R CMD check
/*.Rcheck/

# RStudio files
.Rproj.user/

# produced vignettes
vignettes/*.html
vignettes/*.pdf

# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth

# knitr and R markdown default cache directories
/*_cache/
/cache/

# Temporary files created by R markdown
*.utf8.md
*.knit.md


### Perl ###
/blib/
/.build/
_build/
cover_db/
inc/
Build
!Build/
Build.bat
.last_cover_stats
/Makefile
/Makefile.old
/MANIFEST.bak
/META.yml
/META.json
/MYMETA.*
nytprof.out
/pm_to_blib
*.o
*.bs
/_eumm/
133 changes: 133 additions & 0 deletions gather-fastqs
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#!/usr/bin/env perl

use strict;
use warnings;
use List::MoreUtils qw(uniq);

# Usage text, printed (via die) when the script is run without a directory.
# The output file name here matches the one main() actually writes.
my $HELP = <<HELP;
Find FASTQ files in a given directory (must have "_R1" or "_1" in file name).
Extract sample names and paired reads based on file names.
Generate sample table file samples.fastq-raw.csv in current directory.
If run multiple times, it will add new samples to the sample table.
usage: gather-fastqs dir
HELP

# a directory argument is required; test for presence rather than truthiness
# so that a directory literally named "0" is still accepted
if (!defined($ARGV[0]) || !length($ARGV[0])) {
    die $HELP;
}

main();

# main subroutine
#
# Scans the directory given as the first command-line argument (at most two
# levels deep, following symlinks) for R1 FASTQ files named "*_R1*.fastq.gz"
# or "*_1.fastq.gz". For each one it works out the matching R2 file name,
# keeps it only if that file exists, derives a sample name from the R1 file
# name, and appends a "sample,R1,R2" line to samples.fastq-raw.csv in the
# current directory. The table is then rewritten sorted with duplicate lines
# removed, so repeated runs only add new entries.
# Progress and summary statistics are printed to STDERR.
# Dies if the directory does not exist or the sample table cannot be written.
sub main {
    my $search_dir = $ARGV[0];

    # convert dir from relative to absolute (resolving symlinks) without going
    # through a shell, so paths with spaces or metacharacters are safe
    require Cwd;
    my $resolved_dir = eval { Cwd::abs_path($search_dir) };
    if ( defined($resolved_dir) && length($resolved_dir) ) {
        $search_dir = $resolved_dir;
    }

    # check that dir exists
    unless ( -d $search_dir ) {
        die "\n\n ERROR! $search_dir DOES NOT EXIST \n\n";
    }

    # find fastqs in given directory
    # the two -name tests are grouped so that "-type f" applies to both
    # (ungrouped, a directory named "*_1.fastq.gz" would also be returned);
    # list-form pipe open passes the arguments to find without a shell
    my @find_cmd = (
        'find', '-L', $search_dir, '-maxdepth', '2', '-type', 'f',
        '(', '-name', '*_R1*.fastq.gz', '-o', '-name', '*_1.fastq.gz', ')',
    );
    open(my $find_fh, '-|', @find_cmd)
        or die "\n\n ERROR! CANNOT RUN find: $! \n\n";
    my @fastqs = <$find_fh>;
    # a non-zero find exit (e.g. an unreadable subdirectory) is not fatal
    close($find_fh)
        or warn " WARNING! find reported problems while scanning $search_dir \n";
    chomp(@fastqs);
    # plain string sort gives the same byte order as "LC_ALL=C sort"
    @fastqs = sort @fastqs;

    # counter single and paired reads
    my $reads_se = 0;
    my $reads_pe = 0;
    my @samples = ();

    # sample table file (appended to, so earlier runs are preserved)
    my $filename = "samples.fastq-raw.csv";
    open(my $fh, ">>", $filename)
        or die "\n\n ERROR! CANNOT WRITE $filename: $! \n\n";

    # process each fastq
    for my $fastq_r1 (@fastqs) {

        # check that R1 exists
        unless ( -e $fastq_r1 ) {
            die "\n\n ERROR! $fastq_r1 DOES NOT EXIST \n\n";
        }

        # generate R2 filename
        my $fastq_r2 = _fastq_r2_for($fastq_r1);

        # blank if R2 is same as R1 (in case of not standard file name, for
        # example) or if R2 does not exist
        if ( $fastq_r2 eq $fastq_r1 || !-e $fastq_r2 ) {
            $fastq_r2 = "";
        }

        # count based on read type
        if ( length($fastq_r2) ) {
            $reads_pe++;
        }
        else {
            $reads_se++;
        }

        # extract sample name
        my $sample = _sample_name_for($fastq_r1);
        push @samples, $sample;

        # show progress
        print STDERR " SAMPLE : $sample \n";
        print STDERR " FASTQ R1 : $fastq_r1 \n";
        print STDERR " FASTQ R2 : $fastq_r2 \n";

        # print sample table line
        print {$fh} "${sample},${fastq_r1},${fastq_r2}\n";
    }
    # buffered write errors only surface at close
    close($fh)
        or die "\n\n ERROR! CANNOT WRITE $filename: $! \n\n";

    # remove duplicate entries: rewrite the table sorted and de-duplicated via
    # a temporary file that is renamed into place
    open(my $table_in, "<", $filename)
        or die "\n\n ERROR! CANNOT READ $filename: $! \n\n";
    my %line_seen;
    my @table_lines = sort grep { !$line_seen{$_}++ } <$table_in>;
    close($table_in);

    my $tmp_filename = "${filename}.tmp";
    open(my $table_out, ">", $tmp_filename)
        or die "\n\n ERROR! CANNOT WRITE $tmp_filename: $! \n\n";
    print {$table_out} @table_lines;
    close($table_out)
        or die "\n\n ERROR! CANNOT WRITE $tmp_filename: $! \n\n";
    rename($tmp_filename, $filename)
        or die "\n\n ERROR! CANNOT REPLACE $filename: $! \n\n";

    # get number of unique sample names (grep in scalar context counts matches)
    my %sample_seen;
    my $num_samples = grep { !$sample_seen{$_}++ } @samples;

    # print stats
    print STDERR "\n";
    print STDERR " NUMBER OF SAMPLES : $num_samples \n";
    print STDERR " NUMBER OF SINGLE FILES : $reads_se \n";
    print STDERR " NUMBER OF PAIRED FILES : $reads_pe \n";

}

# Return the R2 file path implied by an R1 file path.
# Handles "_R1_00N...fastq.gz", "_R1.fastq.gz" and "_1.fastq.gz" endings.
# Only the file name is rewritten: the patterns are anchored to the end of the
# path and cannot span a "/", so a directory containing "_R1_00" is untouched.
# Returns the input unchanged when no pattern applies.
sub _fastq_r2_for {
    my ($fastq_r1) = @_;

    my $fastq_r2 = $fastq_r1;
    $fastq_r2 =~ s{(.*)_R1_00([^/]*\.fastq\.gz)\z}{${1}_R2_00${2}};
    $fastq_r2 =~ s{(.*)_R1\.fastq\.gz\z}{${1}_R2.fastq.gz};
    $fastq_r2 =~ s{(.*)_1\.fastq\.gz\z}{${1}_2.fastq.gz};

    return $fastq_r2;
}

# Derive the sample name from an R1 file path by dropping the directory part
# and then the barcode / lane / read suffix, trying the most specific file
# name layouts first.
sub _sample_name_for {
    my ($fastq_r1) = @_;

    my $sample = $fastq_r1;
    # remove directory structure
    $sample =~ s/.*\///;
    # bcl2fastq2 format (with S sample number)
    $sample =~ s/_S[0-9]{1,3}_L00[0-9]_R1.*//;
    # bcl2fastq format with 2 barcodes
    $sample =~ s/_[ACTG]{6,}-[ACTG]{6,}_L00[0-9]_R1.*//;
    # bcl2fastq format with 1 barcode
    $sample =~ s/_[ACTG]{4,}_L00[0-9]_R1.*//;
    # no barcodes
    $sample =~ s/_L00[0-9]_R[12].*//;
    # no barcodes or lane ("_R1.fastq.gz" style)
    $sample =~ s/_R[12]\.fastq\.gz\z//;
    # no barcodes or lane ("_1.fastq.gz" style)
    $sample =~ s/_[12]\.fastq\.gz\z//;

    return $sample;
}



# end
65 changes: 65 additions & 0 deletions generate-settings
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/env perl

use strict;
use warnings;
use File::Basename;

# usage message, shown when no genome argument is supplied
my $HELP = <<"HELP";
Generate the initial settings.txt file for a specified genome.
usage: generate-settings genome
* genome = name (hg19, mm10, etc.) or exact dir (contains genome.fa and genes.gtf)
HELP

# stop with the usage message unless a genome was given
die $HELP unless $ARGV[0];

main();

# main subroutine
#
# Writes settings.txt in the current directory with a single GENOME-DIR entry
# built from the first command-line argument, then queries
# scripts/get-set-setting.sh (next to this script) for GENOME-DIR and
# REF-FASTA and reports both values on STDERR.
# Dies if settings.txt cannot be written.
sub main {
    my $genome_arg = $ARGV[0];

    # pipeline directory (the directory that this file is in)
    my $pipeline_dir = dirname(__FILE__);

    # settings file
    my $settings_file = "settings.txt";

    # set genome setting
    my $genome_dir_setting;
    if ($genome_arg =~ m/\//) {
        # use directory if given (anything containing a slash is taken as a path)
        $genome_dir_setting = "GENOME-DIR|${genome_arg}\n";
    }
    else {
        # use id460 dir as reference if genome name is provided
        $genome_dir_setting = "GENOME-DIR|/ifs/home/id460/ref/${genome_arg}\n";
    }

    # save genome setting (overwrites any existing settings file)
    open(my $fh, ">", $settings_file)
        or die "\n\n ERROR! CANNOT WRITE $settings_file: $! \n\n";
    print {$fh} $genome_dir_setting;
    # buffered write errors only surface at close
    close($fh)
        or die "\n\n ERROR! CANNOT WRITE $settings_file: $! \n\n";

    # get values to make sure they were set properly
    # NOTE(review): the helper presumably also fills in REF-FASTA when it is
    # missing -- confirm against scripts/get-set-setting.sh
    my $settings_genome = _get_setting($pipeline_dir, $settings_file, "GENOME-DIR");
    my $settings_fasta = _get_setting($pipeline_dir, $settings_file, "REF-FASTA");

    # print values
    print STDERR "\n";
    print STDERR " REF DIR : $settings_genome \n";
    print STDERR " REF FASTA : $settings_fasta \n";

}

# Run scripts/get-set-setting.sh for one setting name and return what it
# printed, with trailing newlines removed. The command is started in list form
# (no shell), so a pipeline directory containing spaces is passed intact.
# Returns an empty string if the helper cannot be started or prints nothing.
sub _get_setting {
    my ($pipeline_dir, $settings_file, $setting_name) = @_;

    my $helper_script = "${pipeline_dir}/scripts/get-set-setting.sh";
    open(my $cmd_fh, "-|", "bash", $helper_script, $settings_file, $setting_name)
        or return "";
    my $value = do { local $/; <$cmd_fh> };
    # the helper's exit status is not treated as fatal; values are only reported
    close($cmd_fh);

    $value = "" unless defined $value;
    $value =~ s/\n+\z//;

    return $value;
}



# end
Loading

0 comments on commit 5019b41

Please sign in to comment.