|
9 | 9 | import time
|
10 | 10 | import requests
|
11 | 11 | import threading
|
| 12 | +import pandas as pd |
12 | 13 | from json import dumps
|
13 | 14 | from random import randint
|
| 15 | +from itertools import zip_longest |
| 16 | +from os.path import basename |
14 | 17 |
|
15 | 18 | from .exceptions import (QiitaClientError, NotFoundError, BadRequestError,
|
16 | 19 | ForbiddenError)
|
|
23 | 26 | MAX_RETRIES = 3
|
24 | 27 | MIN_TIME_SLEEP = 180
|
25 | 28 | MAX_TIME_SLEEP = 360
|
| 29 | +MIN_FILEPATH_SIZE = 100 |
26 | 30 |
|
27 | 31 |
|
28 | 32 | class ArtifactInfo(object):
|
@@ -556,3 +560,106 @@ def complete_job(self, job_id, success, error_msg=None,
|
556 | 560 | artifacts_info=artifacts_info))
|
557 | 561 | # Create the URL where we have to post the results
|
558 | 562 | self.post("/qiita_db/jobs/%s/complete/" % job_id, data=json_payload)
|
| 563 | + |
| 564 | + def artifact_and_preparation_files(self, artifact_id, |
| 565 | + ignore_small_files=True): |
| 566 | + """Gets the artifact and preparation files from a given artifact_id |
| 567 | +
|
| 568 | + Parameters |
| 569 | + ---------- |
| 570 | + artifact_id : int |
| 571 | + The artifact id |
| 572 | + ignore_small_files : bool |
| 573 | + Whether to ignore small files or retrieve all of them (only applies |
| 574 | + to per_sample_FASTQ artifacts) |
| 575 | +
|
| 576 | + Returns |
| 577 | + ------- |
| 578 | +
|
| 579 | + Raises |
| 580 | + ------ |
| 581 | + RuntimeError |
| 582 | + - If the artifact belongs to an analysis |
| 583 | +
|
| 584 | + """ |
| 585 | + artifact_info = self.get("/qiita_db/artifacts/%s/" % artifact_id) |
| 586 | + |
| 587 | + if artifact_info['analysis'] is not None: |
| 588 | + raise RuntimeError( |
| 589 | + f'Artifact {artifact_id} is an analysis artifact, this method ' |
| 590 | + 'is meant to work with artifacts linked to a preparation.') |
| 591 | + |
| 592 | + prep_info = self.get('/qiita_db/prep_template/%s/' |
| 593 | + % artifact_info['prep_information'][0]) |
| 594 | + prep_info = pd.read_csv(prep_info['prep-file'], sep='\t', dtype=str) |
| 595 | + if artifact_info['type'] == 'per_sample_FASTQ': |
| 596 | + files, prep_info = self._process_files_per_sample_fastq( |
| 597 | + artifact_info['files'], prep_info, ignore_small_files) |
| 598 | + else: |
| 599 | + files = {k: [vv['filepath'] for vv in v] |
| 600 | + for k, v in artifact_info['files'].items()} |
| 601 | + |
| 602 | + return files, prep_info |
| 603 | + |
| 604 | + def _process_files_per_sample_fastq(self, files, prep_info, |
| 605 | + ignore_small_files): |
| 606 | + "helper function to process per_sample_FASTQ artifacts and their preps" |
| 607 | + |
| 608 | + fwds = sorted(files['raw_forward_seqs'], key=lambda x: x['filepath']) |
| 609 | + revs = [] |
| 610 | + if 'raw_reverse_seqs' in files: |
| 611 | + revs = sorted( |
| 612 | + files['raw_reverse_seqs'], key=lambda x: x['filepath']) |
| 613 | + if len(fwds) != len(revs): |
| 614 | + raise ValueError(f'The fwd ({len(fwds)}) and rev ({len(revs)})' |
| 615 | + ' files should be of the same length') |
| 616 | + |
| 617 | + run_prefixes = prep_info['run_prefix'].to_dict() |
| 618 | + |
| 619 | + # make parirings |
| 620 | + sample_names = dict() |
| 621 | + used_prefixes = [] |
| 622 | + for i, (fwd, rev) in enumerate(zip_longest(fwds, revs)): |
| 623 | + fwd_fn = basename(fwd['filepath']) |
| 624 | + file_smaller_than_min = fwd['size'] < MIN_FILEPATH_SIZE |
| 625 | + |
| 626 | + # iterate over run prefixes and make sure only one matches |
| 627 | + run_prefix = None |
| 628 | + sample_name = None |
| 629 | + for sn, rp in run_prefixes.items(): |
| 630 | + if fwd_fn.startswith(rp) and run_prefix is None: |
| 631 | + run_prefix = rp |
| 632 | + sample_name = sn |
| 633 | + elif fwd_fn.startswith(rp) and run_prefix is not None: |
| 634 | + raise ValueError( |
| 635 | + f'Multiple run prefixes match this fwd read: {fwd_fn}') |
| 636 | + |
| 637 | + if run_prefix is None: |
| 638 | + raise ValueError( |
| 639 | + f'No run prefix matching this fwd read: {fwd_fn}') |
| 640 | + if run_prefix in used_prefixes: |
| 641 | + raise ValueError( |
| 642 | + f'Run prefix matches multiple fwd reads: {run_prefix}') |
| 643 | + used_prefixes.append(run_prefix) |
| 644 | + |
| 645 | + if rev is not None: |
| 646 | + # if we have reverse reads, make sure the matching pair also |
| 647 | + # matches the run prefix: |
| 648 | + rev_fn = basename(rev['filepath']) |
| 649 | + if not file_smaller_than_min: |
| 650 | + file_smaller_than_min = rev['size'] < MIN_FILEPATH_SIZE |
| 651 | + if not rev_fn.startswith(run_prefix): |
| 652 | + raise ValueError( |
| 653 | + 'Reverse read does not match run prefix. run_prefix: ' |
| 654 | + f'{run_prefix}; files: {fwd_fn} / {rev_fn}') |
| 655 | + |
| 656 | + used_prefixes.append(run_prefix) |
| 657 | + |
| 658 | + if ignore_small_files and file_smaller_than_min: |
| 659 | + continue |
| 660 | + |
| 661 | + sample_names[sample_name] = (fwd, rev) |
| 662 | + |
| 663 | + prep_info = prep_info.filter(items=sample_names.keys(), axis=0) |
| 664 | + |
| 665 | + return sample_names, prep_info |
0 commit comments