Skip to content

Commit f765393

Browse files
authored
Merge pull request #10 from cokelaer/main
update report by adding summary table
2 parents eaddc3e + a9744bb commit f765393

File tree

2 files changed

+54
-25
lines changed

2 files changed

+54
-25
lines changed

README.rst

+1
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ Version Description
151151
1.4.0 * sub sampling was biased in v1.3.0. Using stratified sampling to
152152
correcly sample large file. Also set a --promethion option that
153153
auomatically sub sample 10% of the data
154+
* add summary table
154155
1.3.0 * handle large promethium run by using a sub sample of the
155156
sequencing summary file (--sample of pycoQC still loads the entire
156157
file in memory)

sequana_pipelines/nanomerge/nanomerge.rules

+53-25
Original file line numberDiff line numberDiff line change
@@ -226,73 +226,101 @@ rule html_report:
226226

227227

228228
dirs = ",".join([f'<a href="{x}/">{x}</a>' for x in samples.get_projects()])
229-
if config['summary']:
230229

230+
def get_stats():
231231
from sequana import FastA
232232
from sequana.stats import N50
233233
from pylab import mean
234+
from collections import defaultdict
234235

235-
mus = []
236-
N50s = []
237-
nreads = []
238-
sample_names = []
239-
barcodes = []
236+
lengths = defaultdict(list)
240237

241238
for sample, filename in manager.samples.items():
242239
barcode = filename.split("/")[-2]
243240
barcodes.append(barcode)
244-
print(sample, filename)
245241
f = FastA(filename)
246-
lengths = list(f.get_lengths_as_dict().values())
247-
mus.append(round(mean(lengths), 0))
248-
N50s.append(N50(lengths))
249-
nreads.append(len(lengths))
250-
sample_names.append(sample)
251242

252-
total_reads = sum(nreads)
243+
# keep track of all lengths
244+
these_lengths = list(f.get_lengths_as_dict().values())
245+
lengths[barcode].extend(these_lengths)
246+
247+
mus = {}
248+
N50s = {}
249+
nreads = {}
250+
sample_names = {}
251+
for barcode in lengths.keys():
252+
mus[barcode] = round(mean(lengths[barcode]), 0)
253+
N50s[barcode] = N50(lengths[barcode])
254+
nreads[barcode] = len(lengths[barcode])
255+
try:
256+
sample_names[barcode] = samples.df.query("barcode==@barcode").samplename.values[0]
257+
except:
258+
sample_names[barcode] = "undefined"
253259

254260
# a summary table
255261
df = pd.DataFrame({
256-
"sample": sample_names,
257-
"barcodes": barcodes,
258-
"N50": N50s,
259-
"mean read length": mus,
260-
"Number of reads":nreads},
262+
"sample": [sample_names[k] for k in sorted(sample_names.keys())],
263+
"barcodes": [k for k in sorted(sample_names.keys())],
264+
"N50": [N50s[k] for k in sorted(sample_names.keys())],
265+
"mean read length": [mus[k] for k in sorted(sample_names.keys())],
266+
"Number of reads": [nreads[k] for k in sorted(sample_names.keys())]
267+
},
261268
index=sample_names)
269+
270+
total_reads = sum([nreads[k] for k in nreads.keys()])
271+
262272
from sequana.utils.datatables_js import DataTable
263273
datatable = DataTable(df, 'nanomerge', index=False)
264274
datatable.datatable.datatable_options = {'paging': 'false',
265275
'buttons': ['copy', 'csv'],
266-
'bSort': 'true',
267-
'dom':"RSPrt"
276+
'bSort': 'true',
277+
'dom':"RSPrtp"
268278
}
269279
js = datatable.create_javascript_function()
270280
htmltable = datatable.create_datatable()
271281

282+
return js + htmltable, total_reads
283+
284+
285+
htmltable, total_reads = get_stats()
286+
287+
288+
def get_model():
289+
from sequana import FastA
290+
s = next(FastA(input[0]))
291+
try:
292+
model = [x.split("=")[1] for x in s.comment.split() if "model_version_id" in x][0]
293+
except IndexError:
294+
model = "unknown"
295+
return model
296+
297+
model = get_model()
298+
model = f"The model used for base calling was {model}. "
299+
300+
if config['summary']:
301+
272302
# a warning message
273303
percentage=config['sub_sample_summary']['percentage'] / 100
274304

275-
276305
if percentage == 1:
277306
subsample = ""
278307
else:
279308
ratio = round(1 / percentage,2)
280309
subsample = f'<b style="color:red">Sub sampling was performed. Numbers here below are approximation of must be multiplies by {ratio} since only {percentage} of the data were used to generate the tables and plots</b>'
281310

282-
# the pyco qc report
311+
# the pyco qc repor
283312
with open("pyco/pyco.html", "r") as fout:
284313
pycodata = fout.read()
285314
pycodata = '<div class="columns">' + pycodata.split('<div class="columns">')[-1].replace("</div>\n</body>\n</html>","")
286315

287316
# final report
288317
s = SummaryModule2(data, f"""
289318
<h2>General Information</h2>
290-
<p>Your data (fastq files) are available in {dirs} directories. Please see the summary plots here below (if sequence summary was provided), generated with <a href="https://github.com/a-slide/pycoQC">pycoQC</a> software.</p>""" + js + htmltable+f"Total number of reads passing filtering: {total_reads}" + "<hr>" + "<h2>Quality Control information</h2>" + subsample + pycodata)
319+
<p>Your data (fastq files) are available in {dirs} directories. Please see the summary plots here below (if sequence summary was provided), generated with <a href="https://github.com/a-slide/pycoQC">pycoQC</a> software. {model}</p>""" + htmltable + f"Total number of reads passing filtering: {total_reads}" + "<hr>" + "<h2>Quality Control information</h2>" + subsample + pycodata)
291320
else:
292-
s = SummaryModule2(data, f"No summary was found. Your data (fastq files) are available in {dirs} directories." + js + htmltable +f"Total number of reads passing filtering: {total_reads}" )
321+
s = SummaryModule2(data, f"No summary was found. Your data (fastq files) are available in {dirs} directories." + htmltable +f"Total number of reads passing filtering: {total_reads}. {model}" )
293322

294323

295-
localrules: html_report
296324

297325
# ======================================================================================== rulegraph
298326

0 commit comments

Comments
 (0)