Skip to content

Commit eaddc3e

Browse files
authored
Merge pull request #9 from cokelaer/main
Perform stratified sampling
2 parents e0f626d + 1ea05e5 commit eaddc3e

File tree

5 files changed

+127
-12
lines changed

5 files changed

+127
-12
lines changed

README.rst

+3
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,9 @@ Changelog
148148
========= ====================================================================
149149
Version Description
150150
========= ====================================================================
151+
1.4.0 * sub sampling was biased in v1.3.0. Using stratified sampling to
152+
correctly sample large file. Also set a --promethion option that
153+
automatically sub sample 10% of the data
151154
1.3.0 * handle large promethium run by using a sub sample of the
152155
sequencing summary file (--sample of pycoQC still loads the entire
153156
file in memory)

requirements.txt

+2
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
sequana_pipetools>=0.12.2
22
sequana
3+
pandas
4+
numpy

sequana_pipelines/nanomerge/main.py

+15-4
Original file line numberDiff line numberDiff line change
@@ -65,28 +65,34 @@ def __init__(self, prog=NAME, epilog=None):
6565

6666
pipeline_group.add_argument(
6767
"--summary",
68-
help="a summary file generated by albacore or guppy. if provided,pyqoQC is used to generate a HTML report. ",
68+
help="a summary file generated by albacore or guppy. if provided, pycoQC is used to generate a HTML report. ",
6969
default=None,
7070
type=str,
7171
dest="summary",
7272
)
7373

7474
pipeline_group.add_argument(
7575
"--summary-percentage",
76-
help="percentage of the sequencing summary file to process. Use this option if you have memory issue (typically with promethium runs). If unset, nanomerge will set this value automatically so that the final file to process do not exceed 16Go. This value can be cahnged with --summary-max--gb",
76+
help="percentage of the sequencing summary file to process. Use this option if you have memory issue (typically with promethium runs). If unset, nanomerge will set this value automatically so that the final file to process do not exceed 16Go. This value can be changed with --summary-max-gb",
7777
default=None,
7878
type=int,
7979
dest="summary_percentage",
8080
)
8181

8282
pipeline_group.add_argument(
8383
"--summary-max-gb",
84-
help="percentage of the sequencing summary file to process. Use :this option if you have memory issue (typically with promethium runs. ",
84+
help="max size of the summary file before performing sub sampling automatically. Use this option if you have memory issue.",
8585
default=16,
8686
type=float,
8787
dest="summary_max_gb",
8888
)
8989

90+
pipeline_group.add_argument(
91+
"--promethion",
92+
action="store_true",
93+
help="set summary_percentage to 10%%"
94+
)
95+
9096
self.add_argument("--run", default=False, action="store_true", help="execute the pipeline directly")
9197

9298
def parse_args(self, *args):
@@ -152,6 +158,8 @@ def main(args=None):
152158

153159
# if the sequencing summary file is large (larger than 16gb by default) we sub sample the data
154160
# The percentage is set automatically to have a final file of 16Gb (by default)
161+
162+
155163
if options.summary_percentage is None:
156164
cfg.sub_sample_summary.percentage = options.summary_max_gb / (
157165
os.stat(options.summary).st_size / 1024 / 1024 / 1024
@@ -164,9 +172,12 @@ def main(args=None):
164172
logger.warning(
165173
f"Input file size is {size}Gb , which is larger than {options.summary_max_gb}Gb. Will use {cfg.sub_sample_summary.percentage}% of the data"
166174
)
167-
168175
else: # user sets the value himself, so nothing to do
169176
cfg.sub_sample_summary.percentage = options.summary_percentage
177+
178+
# if --promethion was used, set percentage to 10 whatsoever
179+
if options.promethion:
180+
cfg.sub_sample_summary.percentage = 10
170181
else:
171182
cfg.summary = None
172183

sequana_pipelines/nanomerge/nanomerge.rules

+106-7
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ or following the instructions from step 1.
3131

3232
from pathlib import Path
3333
import pandas as pd
34+
from pylab import linspace
35+
from numpy import zeros
3436
from sequana_pipetools import snaketools as sm
3537

3638
sequana_wrapper_branch = "main"
@@ -113,11 +115,43 @@ if config["summary"]:
113115
output:
114116
"sub_sample_summary/summary.txt"
115117
params:
116-
percentage=config['sub_sample_summary']['percentage'] / 100
117-
shell:
118-
"""
119-
head -n 1 {input} > {output} && tail -n +2 {input} | awk -v k={params.percentage} 'BEGIN {{ srand(); n = 0; }} {{ if (n < k * NR) {{ reservoir[n++] = $0; }} else {{ r = int(rand() * n); if (r < k * NR) {{ reservoir[r] = $0; }} }} }} END {{ for (i = 0; i < n; i++) {{ print reservoir[i]; }} }}' >> {output}
120-
"""
118+
percentage=config['sub_sample_summary']['percentage'] / 100,
119+
binning = 100
120+
run:
121+
122+
123+
124+
# We need the min and max first of the entire start time vector
125+
max_time = 0.
126+
min_time = 24 * 10 * 3600 #(10 days is enough. runs are expected to be 72 hours max)
127+
Ntotal = 0
128+
with pd.read_csv(input[0], chunksize=10000, sep='\t') as reader:
129+
for i, chunk in enumerate(reader):
130+
max_time = max(max_time, chunk.start_time.max())
131+
min_time = min(min_time, chunk.start_time.min())
132+
Ntotal += len(chunk)
133+
134+
bins = linspace(min_time, max_time, params.binning + 1)
135+
136+
# now we perform the stratified histogram
137+
with pd.read_csv(input[0], chunksize=10000, sep="\t") as reader:
138+
139+
# we'll save the header once
140+
header = True
141+
142+
# go through all chunks
143+
for i,chunk in enumerate(reader):
144+
145+
# save rows based on stratified sampling
146+
for j in range(params.binning-1):
147+
X1, X2 = bins[j], bins[j+1]
148+
subdf = chunk.query("start_time>=@X1 and start_time<@X2").sample(frac=params.percentage)
149+
if header is True:
150+
subdf.to_csv(output[0], header=True, mode="w", index=False, sep='\t')
151+
header = False
152+
else:
153+
subdf.to_csv(output[0], header=False, mode="a+", index=False, sep='\t')
154+
121155

122156
rule pyco:
123157
input:
@@ -170,6 +204,8 @@ rule dot2svg:
170204
shell:
171205
"""dot -Tsvg {input} -o {output}"""
172206

207+
rule get_stats:
208+
173209

174210
rule html_report:
175211
input:
@@ -180,20 +216,83 @@ rule html_report:
180216
from sequana.modules_report.summary import SummaryModule2
181217
from sequana_pipelines import nanomerge
182218
os.makedirs("images", exist_ok=True)
219+
183220
data = {"name": "nanomerge",
184221
"rulegraph": ".sequana/rulegraph.svg",
185222
"pipeline_version": nanomerge.version}
223+
186224
manager.teardown(extra_files_to_remove=["pyco/pyco.log", "pyco/pyco.html"])
187225

226+
227+
188228
dirs = ",".join([f'<a href="{x}/">{x}</a>' for x in samples.get_projects()])
189229
if config['summary']:
230+
231+
from sequana import FastA
232+
from sequana.stats import N50
233+
from pylab import mean
234+
235+
mus = []
236+
N50s = []
237+
nreads = []
238+
sample_names = []
239+
barcodes = []
240+
241+
for sample, filename in manager.samples.items():
242+
barcode = filename.split("/")[-2]
243+
barcodes.append(barcode)
244+
print(sample, filename)
245+
f = FastA(filename)
246+
lengths = list(f.get_lengths_as_dict().values())
247+
mus.append(round(mean(lengths), 0))
248+
N50s.append(N50(lengths))
249+
nreads.append(len(lengths))
250+
sample_names.append(sample)
251+
252+
total_reads = sum(nreads)
253+
254+
# a summary table
255+
df = pd.DataFrame({
256+
"sample": sample_names,
257+
"barcodes": barcodes,
258+
"N50": N50s,
259+
"mean read length": mus,
260+
"Number of reads":nreads},
261+
index=sample_names)
262+
from sequana.utils.datatables_js import DataTable
263+
datatable = DataTable(df, 'nanomerge', index=False)
264+
datatable.datatable.datatable_options = {'paging': 'false',
265+
'buttons': ['copy', 'csv'],
266+
'bSort': 'true',
267+
'dom':"RSPrt"
268+
}
269+
js = datatable.create_javascript_function()
270+
htmltable = datatable.create_datatable()
271+
272+
# a warning message
273+
percentage=config['sub_sample_summary']['percentage'] / 100
274+
275+
276+
if percentage == 1:
277+
subsample = ""
278+
else:
279+
ratio = round(1 / percentage,2)
280+
subsample = f'<b style="color:red">Sub sampling was performed. Numbers here below are approximations and must be multiplied by {ratio} since only {percentage} of the data were used to generate the tables and plots</b>'
281+
282+
# the pyco qc report
190283
with open("pyco/pyco.html", "r") as fout:
191284
pycodata = fout.read()
192285
pycodata = '<div class="columns">' + pycodata.split('<div class="columns">')[-1].replace("</div>\n</body>\n</html>","")
193286

194-
s = SummaryModule2(data, f"""Your data are available in {dirs} directories. Please see the summary plots here below (if sequence summary was provided), generated with <a href="https://github.com/a-slide/pycoQC">pycoQC</a> software.""" + pycodata)
287+
# final report
288+
s = SummaryModule2(data, f"""
289+
<h2>General Information</h2>
290+
<p>Your data (fastq files) are available in {dirs} directories. Please see the summary plots here below (if sequence summary was provided), generated with <a href="https://github.com/a-slide/pycoQC">pycoQC</a> software.</p>""" + js + htmltable+f"Total number of reads passing filtering: {total_reads}" + "<hr>" + "<h2>Quality Control information</h2>" + subsample + pycodata)
195291
else:
196-
s = SummaryModule2(data, f"no summary found. Please checkout the sub directories {dirs}. They should contain your final fastq files for each project.")
292+
s = SummaryModule2(data, f"No summary was found. Your data (fastq files) are available in {dirs} directories." + js + htmltable +f"Total number of reads passing filtering: {total_reads}" )
293+
294+
295+
localrules: html_report
197296

198297
# ======================================================================================== rulegraph
199298

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111

1212
_MAJOR = 1
13-
_MINOR = 3
13+
_MINOR = 4
1414
_MICRO = 0
1515
version = f"{_MAJOR}.{_MINOR}.{_MICRO}"
1616
release = f"{_MAJOR}.{_MINOR}"

0 commit comments

Comments
 (0)