Skip to content

Commit 3e0d0c3

Browse files
committed
artifact_and_preparation_files
1 parent 8b6bf61 commit 3e0d0c3

File tree

2 files changed

+159
-0
lines changed

2 files changed

+159
-0
lines changed

qiita_client/qiita_client.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,11 @@
99
import time
1010
import requests
1111
import threading
12+
import pandas as pd
1213
from json import dumps
1314
from random import randint
15+
from itertools import zip_longest
16+
from os.path import basename
1417

1518
from .exceptions import (QiitaClientError, NotFoundError, BadRequestError,
1619
ForbiddenError)
@@ -23,6 +26,7 @@
2326
MAX_RETRIES = 3
2427
MIN_TIME_SLEEP = 180
2528
MAX_TIME_SLEEP = 360
29+
MIN_FILEPATH_SIZE = 100
2630

2731

2832
class ArtifactInfo(object):
@@ -556,3 +560,106 @@ def complete_job(self, job_id, success, error_msg=None,
556560
artifacts_info=artifacts_info))
557561
# Create the URL where we have to post the results
558562
self.post("/qiita_db/jobs/%s/complete/" % job_id, data=json_payload)
563+
564+
def artifact_and_preparation_files(self, artifact_id,
565+
ignore_small_files=True):
566+
"""Gets the artifact and preparation files from a given artifact_id
567+
568+
Parameters
569+
----------
570+
artifact_id : int
571+
The artifact id
572+
ignore_small_files : bool
573+
Whether to ignore small files or retrieve all of them (only applies
574+
to per_sample_FASTQ artifacts)
575+
576+
Returns
577+
-------
578+
579+
Raises
580+
------
581+
RuntimeError
582+
- If the artifact belongs to an analysis
583+
584+
"""
585+
artifact_info = self.get("/qiita_db/artifacts/%s/" % artifact_id)
586+
587+
if artifact_info['analysis'] is not None:
588+
raise RuntimeError(
589+
f'Artifact {artifact_id} is an analysis artifact, this method '
590+
'is meant to work with artifacts linked to a preparation.')
591+
592+
prep_info = self.get('/qiita_db/prep_template/%s/'
593+
% artifact_info['prep_information'][0])
594+
prep_info = pd.read_csv(prep_info['prep-file'], sep='\t', dtype=str)
595+
if artifact_info['type'] == 'per_sample_FASTQ':
596+
files, prep_info = self._process_files_per_sample_fastq(
597+
artifact_info['files'], prep_info, ignore_small_files)
598+
else:
599+
files = {k: [vv['filepath'] for vv in v]
600+
for k, v in artifact_info['files'].items()}
601+
602+
return files, prep_info
603+
604+
def _process_files_per_sample_fastq(self, files, prep_info,
605+
ignore_small_files):
606+
"helper function to process per_sample_FASTQ artifacts and their preps"
607+
608+
fwds = sorted(files['raw_forward_seqs'], key=lambda x: x['filepath'])
609+
revs = []
610+
if 'raw_reverse_seqs' in files:
611+
revs = sorted(
612+
files['raw_reverse_seqs'], key=lambda x: x['filepath'])
613+
if len(fwds) != len(revs):
614+
raise ValueError(f'The fwd ({len(fwds)}) and rev ({len(revs)})'
615+
' files should be of the same length')
616+
617+
run_prefixes = prep_info['run_prefix'].to_dict()
618+
619+
# make parirings
620+
sample_names = dict()
621+
used_prefixes = []
622+
for i, (fwd, rev) in enumerate(zip_longest(fwds, revs)):
623+
fwd_fn = basename(fwd['filepath'])
624+
file_smaller_than_min = fwd['size'] < MIN_FILEPATH_SIZE
625+
626+
# iterate over run prefixes and make sure only one matches
627+
run_prefix = None
628+
sample_name = None
629+
for sn, rp in run_prefixes.items():
630+
if fwd_fn.startswith(rp) and run_prefix is None:
631+
run_prefix = rp
632+
sample_name = sn
633+
elif fwd_fn.startswith(rp) and run_prefix is not None:
634+
raise ValueError(
635+
f'Multiple run prefixes match this fwd read: {fwd_fn}')
636+
637+
if run_prefix is None:
638+
raise ValueError(
639+
f'No run prefix matching this fwd read: {fwd_fn}')
640+
if run_prefix in used_prefixes:
641+
raise ValueError(
642+
f'Run prefix matches multiple fwd reads: {run_prefix}')
643+
used_prefixes.append(run_prefix)
644+
645+
if rev is not None:
646+
# if we have reverse reads, make sure the matching pair also
647+
# matches the run prefix:
648+
rev_fn = basename(rev['filepath'])
649+
if not file_smaller_than_min:
650+
file_smaller_than_min = rev['size'] < MIN_FILEPATH_SIZE
651+
if not rev_fn.startswith(run_prefix):
652+
raise ValueError(
653+
'Reverse read does not match run prefix. run_prefix: '
654+
f'{run_prefix}; files: {fwd_fn} / {rev_fn}')
655+
656+
used_prefixes.append(run_prefix)
657+
658+
if ignore_small_files and file_smaller_than_min:
659+
continue
660+
661+
sample_names[sample_name] = (fwd, rev)
662+
663+
prep_info = prep_info.filter(items=sample_names.keys(), axis=0)
664+
665+
return sample_names, prep_info

qiita_client/tests/test_qiita_client.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from os.path import basename, exists
1212
from tempfile import mkstemp
1313
from json import dumps
14+
import pandas as pd
1415

1516
from qiita_client.qiita_client import (QiitaClient, _format_payload,
1617
ArtifactInfo)
@@ -257,6 +258,57 @@ def test_complete_job(self):
257258
obs = self.tester.complete_job(job_id, True, artifacts_info=ainfo)
258259
self.assertIsNone(obs)
259260

261+
def test_artifact_and_preparation_files(self):
262+
263+
# check success
264+
fobs, prep_info = self.tester.artifact_and_preparation_files(1)
265+
# just leaving filenames as the folders are dynamic and a pain to test
266+
fobs = {k: [basename(vv) for vv in v] for k, v in fobs.items()}
267+
fexp = {'raw_forward_seqs': ['1_s_G1_L001_sequences.fastq.gz'],
268+
'raw_barcodes': ['1_s_G1_L001_sequences_barcodes.fastq.gz']}
269+
self.assertEqual(fobs, fexp)
270+
self.assertEqual(prep_info.shape, (27, 22))
271+
272+
# check failure
273+
with self.assertRaisesRegex(RuntimeError, 'Artifact 8 is an analysis '
274+
'artifact, this method is meant to work '
275+
'with artifacts linked to a preparation.'):
276+
self.tester.artifact_and_preparation_files(8)
277+
278+
# test _process_files_per_sample_fastq
279+
# both fwd/rev
280+
files = {
281+
'raw_forward_seqs': [
282+
{'filepath': '/X/file_3_R1.fastq.gz', 'size': 101},
283+
{'filepath': '/X/file_1_R1.fastq.gz', 'size': 99},
284+
{'filepath': '/X/file_2_R1.fastq.gz', 'size': 101}],
285+
'raw_reverse_seqs': [
286+
{'filepath': '/X/file_2_R2.fastq.gz', 'size': 101},
287+
{'filepath': '/X/file_1_R2.fastq.gz', 'size': 101},
288+
{'filepath': '/X/file_3_R2.fastq.gz', 'size': 101}]}
289+
prep_info = pd.DataFrame.from_dict({
290+
'run_prefix': {"sample.1": 'file_1',
291+
"sample.2": 'file_2',
292+
"sample.3": 'file_3'}}, dtype=str)
293+
prep_info.index.name = 'sample_name'
294+
fobs, piobs = self.tester._process_files_per_sample_fastq(
295+
files, prep_info, False)
296+
fexp = {
297+
'sample.1': ({'filepath': '/X/file_1_R1.fastq.gz', 'size': 99},
298+
{'filepath': '/X/file_1_R2.fastq.gz', 'size': 101}),
299+
'sample.2': ({'filepath': '/X/file_2_R1.fastq.gz', 'size': 101},
300+
{'filepath': '/X/file_2_R2.fastq.gz', 'size': 101}),
301+
'sample.3': ({'filepath': '/X/file_3_R1.fastq.gz', 'size': 101},
302+
{'filepath': '/X/file_3_R2.fastq.gz', 'size': 101})}
303+
self.assertEqual(fobs, fexp)
304+
self.assertEqual(piobs.shape, (3, 1))
305+
306+
fobs, piobs = self.tester._process_files_per_sample_fastq(
307+
files, prep_info, True)
308+
del fexp['sample.1']
309+
self.assertEqual(fobs, fexp)
310+
self.assertEqual(piobs.shape, (2, 1))
311+
260312

261313
if __name__ == '__main__':
262314
main()

0 commit comments

Comments
 (0)