fastqStats.smk

rule basicFASTQqc:
	input: "fastqs/" + "{techname}_{capDesign}_{sizeFrac}_{sampleRep}.fastq.gz"
	output: "output/fastqs/" + "qc/{techname}_{capDesign}_{sizeFrac}.{sampleRep}.dupl.txt"
	shell:
		'''

# check that read IDs don't contain spaces (frequent with ONT)
#zcat {input} | perl -lane 'if ($F[3]) {{die}}' 

# check that there are no read ID duplicates
zcat {input} | fastq2tsv.pl | awk '{{print $1}}' | sort -T {TMPDIR} | uniq -dc > {output}
count=$(cat {output} | wc -l)
if [ $count -gt 0 ]; then echo "$count duplicate read IDs found"; mv {output} {output}.tmp; exit 1; fi
		'''

rule fastqTimestamps:
	input: expand("fastqs/" + "{techname}_{capDesign}_{sizeFrac}_{sampleRep}.fastq.gz", filtered_product, techname=TECHNAMES, capDesign=CAPDESIGNS, sizeFrac=SIZEFRACS, sampleRep=SAMPLEREPS)

	output: "output/statsFiles/" + "all.fastq.timestamps.tsv"
	shell:
		'''
uuid=$(uuidgen)
echo -e "sample_name\\tFASTQ_modified" > {TMPDIR}/$uuid
for file in $(echo {input}); do
echo -e "$(basename $file .fastq.gz)\t$(stat -L --printf='%y' $file  | awk '{{print $1}}')"
done | sort >> {TMPDIR}/$uuid
mv {TMPDIR}/$uuid {output}

		'''

#get read lengths for all FASTQ files:
rule getReadLengthSummary:
	input: "fastqs/" + "{techname}_{capDesign}_{sizeFrac}_{sampleRep}.fastq.gz"
	output: 
		reads="output/statsFiles/" + "tmp/{techname}_{capDesign}_{sizeFrac}.{sampleRep}.readlength.tsv.gz",
		summ="output/statsFiles/" + "tmp/{techname}_{capDesign}_{sizeFrac}.{sampleRep}.readlengthSummary.tsv"
	conda: "envs/R_env.yml"
	params:
		bc=lambda wildcards: wildcards.sampleRep
	shell:
		'''
uuidTmpOut=$(uuidgen)
echo -e "seqTech\\tcapDesign\\tsizeFrac\\tsampleRep\\tlength" |gzip > {TMPDIR}/$uuidTmpOut.gz

zcat {input} | fastq2tsv.pl | perl -F"\\t" -slane '$F[0]=~s/^(\S+).*/$1/; print join("\\t", @F)' | awk -v t={wildcards.techname} -v c={wildcards.capDesign} -v si={wildcards.sizeFrac} -v b={params.bc} '{{print t\"\\t\"c\"\\t\"si\"\\t\"b\"\\t\"length($2)}}'| gzip >> {TMPDIR}/$uuidTmpOut.gz

echo "
library(data.table)
library(tidyverse)
dat<-fread('{TMPDIR}/$uuidTmpOut.gz', header=T, sep='\\t')
dat %>%
  group_by(seqTech, sizeFrac, capDesign, sampleRep) %>%
  summarise(n=n(), median=median(length), mean=mean(length), max=max(length)) -> datSumm

write_tsv(datSumm, '{TMPDIR}/$uuidTmpOut.1')

" | R --slave

mv {TMPDIR}/$uuidTmpOut.gz {output.reads}
mv {TMPDIR}/$uuidTmpOut.1 {output.summ}
		'''

rule aggReadLengthSummary:
	input: lambda wildcards: expand("output/statsFiles/" + "tmp/{techname}_{capDesign}_{sizeFrac}.{sampleRep}.readlengthSummary.tsv", filtered_product, techname=TECHNAMES, capDesign=CAPDESIGNS, sizeFrac=SIZEFRACS, sampleRep=SAMPLEREPS)
	output: 
		summary="output/statsFiles/" + "all.readlength.summary.tsv"
	shell:
		'''
uuid=$(uuidgen)
head -n1 {input[0]} > {TMPDIR}/$uuid
tail -q -n+2 {input} |sort >> {TMPDIR}/$uuid
mv {TMPDIR}/$uuid {output}

		'''

# plot histograms with R:
rule plotReadLength:
	input: "output/statsFiles/" + "tmp/{techname}_{capDesign}_{sizeFrac}.{sampleRep}.readlength.tsv.gz"
	output: returnPlotFilenames("output/plots/" + "readLength.stats/{techname}/{capDesign}/{techname}_{capDesign}_{sizeFrac}_{sampleRep}.readLength.stats")
	conda: "envs/R_env.yml"
	params:
		filterDat=lambda wildcards: multi_figures(wildcards.capDesign, wildcards.sizeFrac, wildcards.sampleRep, wildcards.techname),
	shell:
		'''
echo "
library(ggplot2)
library(cowplot)
library(scales)
library(gridExtra)
library(grid)
library(ggplotify)
library(dplyr)
library(data.table)
dat<-fread('{input}', header=T, sep='\\t')
{params.filterDat[technameFilterString]}
{params.filterDat[capDesignFilterString]}

{params.filterDat[sizeFracFilterString]}
{params.filterDat[sampleRepFilterString]}
{params.filterDat[substSeqTechString]}
{params.filterDat[substSampleRepString]}
{params.filterDat[graphDimensions]}

wXyPlot = wXyPlot * 1.2

dat\$sizeFrac_f=factor(dat\$sizeFrac, levels=names({sizeFrac_Rpalette}), ordered=TRUE)
dat %>%
  group_by(seqTech, sizeFrac_f, capDesign, sampleRep) %>%
  summarise(n=n(), med=median(length)) -> datSumm


summaryStats = transform(datSumm, LabelN = paste0('N= ', comma(n)), LabelM = paste0( 'Median= ', comma(med))) 

plotBase <- \\"p <- ggplot(dat, aes(x=length)) +
geom_histogram(aes(y=..density..,fill=sizeFrac_f), binwidth=100) +
geom_vline(data = summaryStats, aes(xintercept=med), color='#ff0055', linetype='solid', size=lineSize) +
scale_fill_manual(values={sizeFrac_Rpalette}) +
geom_text(data = summaryStats, aes(label = LabelN, x = Inf, y = Inf), hjust=1, vjust=1,  size=geom_textSize, fontface = 'bold') +
geom_text(data = summaryStats, aes(label = LabelM, x = med, y = Inf), hjust=-0.1, vjust=2.5,  size=geom_textSize, fontface = 'bold', color='#ff0055') +
coord_cartesian(xlim=c(0, 3500)) +
#scale_y_continuous(labels=scientific)+
scale_x_continuous(labels=comma, name='Read length (nts)')+
{GGPLOT_PUB_QUALITY} + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + \\"

{params.filterDat[facetPlotSetup]}

wYxPlot = wYxPlot * 1.2
wYxNoLegendPlot<- wYxPlot - wLegendOnly

save_plot('{output[0]}', legendOnly, base_width=wLegendOnly, base_height=hLegendOnly)
save_plot('{output[1]}', pXy, base_width=wXyPlot, base_height=hXyPlot)

save_plot('{output[2]}', pXyNoLegend, base_width=wXyNoLegendPlot, base_height=hXyNoLegendPlot)
save_plot('{output[3]}', pYx, base_width=wYxPlot, base_height=hYxPlot)

save_plot('{output[4]}', pYxNoLegend, base_width=wYxNoLegendPlot, base_height=hYxNoLegendPlot)


" > $(dirname {output[0]})/$(basename {output[0]} .legendOnly.png).r

cat $(dirname {output[0]})/$(basename {output[0]} .legendOnly.png).r | R --slave


	 	'''