diff --git a/environment_linux.yml b/environment_linux.yml index 90022ee..f8d61c8 100644 --- a/environment_linux.yml +++ b/environment_linux.yml @@ -56,9 +56,9 @@ dependencies: - bwidget=1.9.14 - bzip2=1.0.8 - c-ares=1.24.0 - - ca-certificates=2023.11.17 + - ca-certificates=2024.6.2 - cairo=1.16.0 - - certifi=2023.11.17 + - certifi=2024.6.2 - charset-normalizer=3.3.2 - click=8.1.7 - contourpy=1.1.1 @@ -82,7 +82,7 @@ dependencies: - gfortran_impl_linux-64=13.2.0 - giflib=5.2.1 - glpk=5.0 - - gmap=2023.10.10 + - gmap=2024.05.20 - gmp=6.3.0 - graphite2=1.3.13 - gsl=2.7 @@ -155,7 +155,7 @@ dependencies: - numpy=1.24.4 - openjdk=17.0.3 - openjpeg=2.5.0 - - openssl=3.2.0 + - openssl=3.3.1 - packaging=23.2 - pandas=2.0.3 - pandoc=3.1.3 diff --git a/environment_macos.yml b/environment_macos.yml index c5f0df6..1da2d67 100644 --- a/environment_macos.yml +++ b/environment_macos.yml @@ -52,10 +52,10 @@ dependencies: - bwidget=1.9.14 - bzip2=1.0.8 - c-ares=1.20.1 - - ca-certificates=2023.11.17 + - ca-certificates=2024.6.2 - cairo=1.16.0 - cctools_osx-64=973.0.1 - - certifi=2023.11.17 + - certifi=2024.6.2 - charset-normalizer=3.3.1 - clang=16.0.1 - clang-16=16.0.1 @@ -87,7 +87,7 @@ dependencies: - gfortran_impl_osx-64=11.4.0 - gfortran_osx-64=11.4.0 - glpk=5.0 - - gmap=2023.10.10 + - gmap=2024.05.20 - gmp=6.2.1 - graphite2=1.3.13 - gsl=2.7 @@ -157,7 +157,7 @@ dependencies: - numpy=1.24.4 - openjdk=21.0.1 - openjpeg=2.5.0 - - openssl=3.2.0 + - openssl=3.3.1 - packaging=23.2 - pandas=2.0.3 - pandoc=3.1.3 diff --git a/pepti_map/aligning/gmap_wrapper.py b/pepti_map/aligning/gmap_wrapper.py index 9166bce..07b19f3 100644 --- a/pepti_map/aligning/gmap_wrapper.py +++ b/pepti_map/aligning/gmap_wrapper.py @@ -8,7 +8,7 @@ class GmapWrapper: - def __init__(self): + def __init__(self, min_trimmed_coverage: float = 0.0, min_identity: float = 0.0): env_vars = dotenv_values() try: n_threads = env_vars.get("GMAP_N_THREADS") @@ -27,6 +27,30 @@ def __init__(self): self._n_threads = n_threads self._batch_mode = batch_mode + try: + assert min_trimmed_coverage >= 0.0 and min_trimmed_coverage <= 1.0 + self._min_trimmed_coverage = min_trimmed_coverage + except AssertionError: + logging.info( + ( + "--min_trimmed_coverage option for GMAP must be between 0.0 and " + f"1.0, but was {str(min_trimmed_coverage)}. Using default of 0.0" + ) + ) + self._min_trimmed_coverage = 0.0 + + try: + assert min_identity >= 0.0 and min_identity <= 1.0 + self._min_identity = min_identity + except AssertionError: + logging.info( + ( + "--min_identity option for GMAP must be between 0.0 and " + f"1.0, but was {str(min_identity)}. Using default of 0.0" + ) + ) + self._min_identity = 0.0 + def build_index( self, files_to_index: List[Path], @@ -111,6 +135,10 @@ def produce_alignment( str(self._n_threads), "-f", "gff3_gene", + "--min-trimmed-coverage", + str(self._min_trimmed_coverage), + "--min-identity", + str(self._min_identity), ] alignment_command.extend( [ @@ -137,6 +165,10 @@ def _produce_alignment_for_single_file(self, path_to_sequences: Path) -> None: str(self._n_threads), "-f", "gff3_gene", + "--min-trimmed-coverage", + str(self._min_trimmed_coverage), + "--min-identity", + str(self._min_identity), path_to_sequences.absolute().as_posix(), ] with open( diff --git a/pepti_map/main.py b/pepti_map/main.py index 38d3d7d..240f3f1 100644 --- a/pepti_map/main.py +++ b/pepti_map/main.py @@ -196,7 +196,7 @@ def generate_trinity_results( return trinity_results_paths -def load_trinity_results_paths() -> List[Path]: +def load_current_results_paths() -> List[Path]: return TrinityWrapper.load_results_filepaths() @@ -205,8 +205,10 @@ def align_reads_to_genome( genome: Union[str, None], gmap_index: Union[str, None], output_dir: str, -) -> None: - gmap_wrapper = GmapWrapper() + min_trimmed_coverage: float, + min_identity: float, +) -> List[Path]: + gmap_wrapper = GmapWrapper(min_trimmed_coverage, min_identity) # TODO: How to automatically use previously generated index? if gmap_index is not None and gmap_index != "": gmap_index_path = Path(gmap_index) @@ -220,16 +222,37 @@ def align_reads_to_genome( logging.error(missing_option_message) raise ValueError(missing_option_message) + new_results_paths: List[Path] = [] for trinity_results_path in trinity_results_paths: gmap_wrapper.produce_alignment( [trinity_results_path], trinity_results_path.parent / "alignment_result.gff3", ) + # Check if actual output was produced + with open( + trinity_results_path.parent / "alignment_result.gff3", + "rt", + encoding="utf-8", + ) as gmap_output: + line_count = 0 + for _ in gmap_output: + line_count += 1 + if line_count == 4: + new_results_paths.append(trinity_results_path) + break + + # Save new output paths + TrinityWrapper.save_results_filepaths(new_results_paths) _write_last_step(Step.ALIGNMENT.value) logging.info("Generated alignment of assembled contigs with GMAP.") + return new_results_paths -def generate_pogo_input(paths_to_subdirectories: List[Path], peptide_file: str) -> None: +def generate_pogo_input( + paths_to_subdirectories: List[Path], + peptide_file: str, + no_indels: bool +) -> None: pogo_input_helper = PoGoInputHelper( Path(peptide_file), PATH_PEPTIDE_TO_CLUSTER_MAPPING_FILE ) @@ -237,7 +260,7 @@ def generate_pogo_input(paths_to_subdirectories: List[Path], peptide_file: str) paths_to_subdirectories, PATH_TO_MERGED_INDEXES ) PoGoInputHelper.generate_gtf_and_protein_files_for_multiple_directories( - paths_to_subdirectories + paths_to_subdirectories, no_indels ) _write_last_step(Step.POGO_INPUT.value) logging.info("Generated PoGo input files.") @@ -338,6 +361,7 @@ def concat_output(paths_to_subdirectories: List[Path], output_dir: str) -> None: "-pi", "--precompute-intersections", is_flag=True, + default=False, help=( "If used, the intersection sizes for the Jaccard Index " "calculation are precomputed during the matching phase." @@ -394,6 +418,33 @@ def concat_output(paths_to_subdirectories: List[Path], output_dir: str) -> None: "the '-g/--genome' option is ignored." ), ) +@click.option( + "-mtc", + "--min-trimmed-coverage", + required=False, + type=float, + default=0.0, + show_default=True, + help="Sets the '--min-trimmed-coverage' option for GMAP during alignment.", +) +@click.option( + "-mid", + "--min-identity", + required=False, + type=float, + default=0.0, + show_default=True, + help="Sets the '--min-identity' option for GMAP during alignment.", +) +@click.option( + "-ni", + "--no-indels", + required=False, + is_flag=True, + default=False, + help=("If set, contig alignments containing indels " + "are excluded from further processing.") +) def main( peptide_file: str, rna_file: str, @@ -407,6 +458,9 @@ def main( min_contig_length: int, genome: Union[str, None], gmap_index: Union[str, None], + min_trimmed_coverage: float, + min_identity: float, + no_indels: bool ): _setup(output_dir) @@ -464,13 +518,21 @@ def main( ) else: logging.info("Using already generated Trinity output files.") - trinity_results_paths = load_trinity_results_paths() + trinity_results_paths = load_current_results_paths() if last_step < Step.ALIGNMENT.value: logging.info("Aligning assembled RNA-seq reads to the genome.") - align_reads_to_genome(trinity_results_paths, genome, gmap_index, output_dir) + trinity_results_paths = align_reads_to_genome( + trinity_results_paths, + genome, + gmap_index, + output_dir, + min_trimmed_coverage, + min_identity, + ) else: logging.info("Using already generated alignments.") + trinity_results_paths = load_current_results_paths() paths_to_subdirectories = [ trinity_results_path.parent for trinity_results_path in trinity_results_paths @@ -478,7 +540,7 @@ def main( if last_step < Step.POGO_INPUT.value: logging.info("Generating input files for PoGo.") - generate_pogo_input(paths_to_subdirectories, peptide_file) + generate_pogo_input(paths_to_subdirectories, peptide_file, no_indels) else: logging.info("Using already generated PoGo input files.") diff --git a/pepti_map/output_generation/pogo_input_helper.py b/pepti_map/output_generation/pogo_input_helper.py index 93aca3e..58bf853 100644 --- a/pepti_map/output_generation/pogo_input_helper.py +++ b/pepti_map/output_generation/pogo_input_helper.py @@ -1,4 +1,5 @@ from collections import defaultdict +from functools import partial import logging import multiprocessing from dotenv import dotenv_values @@ -75,6 +76,10 @@ def generate_all_peptide_input_files( output_directory, merged_indexes[set_index] ) + @staticmethod + def _get_exon_feature_id(exon: gffutils.Feature) -> int: + return int(exon.attributes["ID"][0].split(".")[-1].replace("exon", "")) + @staticmethod def _write_feature_in_gtf_format( output_gtf: TextIO, @@ -115,191 +120,170 @@ def _write_new_feature_coordinates( gene: gffutils.Feature, mrna: gffutils.Feature, exons: List[gffutils.Feature], - cds: List[gffutils.Feature], - start_exon_index: int, - end_exon_index: int, strand: str, direction: str, - contig_length: int, - ) -> None: - start_exon = exons[start_exon_index] - end_exon = exons[end_exon_index] - if ( - not start_exon.start - or not start_exon.end - or not end_exon.start - or not end_exon.end - ): - raise ValueError("No start and end coordinates given.") - - if direction == ".": - contig_id, start_exon_contig_end, contig_start, _ = start_exon.attributes[ - "Target" - ][0].split(" ") - _, contig_end, end_exon_contig_start, _ = end_exon.attributes["Target"][ - 0 - ].split(" ") - contig_start = int(contig_start) - contig_end = int(contig_end) - - if start_exon == end_exon: - start_exon.attributes["Target"] = " ".join( - [contig_id, str(contig_length), "1", direction] - ) + contig: str, + ) -> str: + # The exons need to be sorted to follow the same order as in the original GFF. + # This is necessary because gffutils does not necessarily return the children + # of a feature in order when calling children(). The parameter order_by + # cannot be used here because it does not allow us to select for an + # exon identifier or specify the same order as in the original. + # TODO: Is there an easier option for ordering + # that does not involve string splitting? + exons.sort(key=cls._get_exon_feature_id) + + # If direction is antisense, change alignment to be on the complementary strand + if direction == "-": + if strand == "+": + strand = "-" else: - start_exon.attributes["Target"] = " ".join( - [contig_id, start_exon_contig_end, "1", direction] - ) - end_exon.attributes["Target"] = " ".join( - [contig_id, str(contig_length), end_exon_contig_start, direction] - ) - else: - contig_id, contig_start, start_exon_contig_end, _ = start_exon.attributes[ - "Target" - ][0].split(" ") - _, end_exon_contig_start, contig_end, _ = end_exon.attributes["Target"][ - 0 - ].split(" ") - contig_start = int(contig_start) - contig_end = int(contig_end) - - if start_exon == end_exon: - start_exon.attributes["Target"] = " ".join( - [contig_id, "1", str(contig_length), direction] - ) + strand = "+" + + gene.strand = strand + mrna.strand = strand + for exon in exons: + exon.strand = strand + + exons.reverse() + + # TODO: Is assumption that exons are already in correct order + # if dir=indeterminate true? + # Potentially only labeled "indeterminate" if only one exon? + + exon_starts: List[int] = [] + exon_ends: List[int] = [] + for exon in exons: + if direction == ".": + _, exon_end, exon_start, _ = exon.attributes["Target"][0].split(" ") else: - start_exon.attributes["Target"] = " ".join( - [contig_id, "1", start_exon_contig_end, direction] - ) - end_exon.attributes["Target"] = " ".join( - [contig_id, end_exon_contig_start, str(contig_length), direction] - ) + _, exon_start, exon_end, _ = exon.attributes["Target"][0].split(" ") + exon_starts.append(int(exon_start)) + exon_ends.append(int(exon_end)) - exon_ids = [exon.attributes["ID"][0] for exon in exons] - cds_ids = [cds_entry.attributes["ID"][0] for cds_entry in cds] - mrna_id = mrna.attributes["ID"][0] + cut_contig_parts: List[str] = [] + for i in range(len(exon_starts)): + current_start = exon_starts[i] + current_end = exon_ends[i] + current_part = contig[current_start - 1 : current_end] # noqa: E203 + cut_contig_parts.append(current_part) + start_end_cut_contig = "".join(cut_contig_parts) + + if len(start_end_cut_contig) < (0.7 * len(contig)): + # TODO: Filter out alignment and report + pass - if ( - (direction == "+" and strand == "+") - or (direction == "-" and strand == "-") - or (direction == "." and strand == "+") - ): - new_start = start_exon.start - (contig_start - 1) - start_exon.start = new_start - mrna.start = new_start - gene.start = new_start - new_end = end_exon.end + (contig_length - contig_end) - end_exon.end = new_end - mrna.end = new_end - gene.end = new_end + # TODO: Not needed? + # exon_ids = [exon.attributes["ID"][0] for exon in exons] + # cds_ids = [exon_id.replace("exon", "cds") for exon_id in exon_ids] + mrna_id = mrna.attributes["ID"][0] + exon_start_coords = [exon.start for exon in exons] + exon_end_coords = [exon.end for exon in exons] + cls._write_feature_in_gtf_format( + output_gtf, + gene, + gene.attributes["ID"][0], + ) + for frame in range(3): + mrna.attributes["ID"] = mrna_id + "." + str(frame) cls._write_feature_in_gtf_format( - output_gtf, - gene, - gene.attributes["ID"][0], + output_gtf, mrna, gene.attributes["ID"][0], mrna.attributes["ID"][0] ) - for frame in range(3): - mrna.attributes["ID"] = mrna_id + "." + str(frame) + for exon_idx, exon in enumerate(exons): + # TODO: Not needed? + # exon.attributes["ID"] = exon_ids[exon_index] + "." + str(frame) + # exon.attributes["Parent"] = mrna.attributes["ID"] + exon.featuretype = "exon" + exon.start = exon_start_coords[exon_idx] + exon.end = exon_end_coords[exon_idx] cls._write_feature_in_gtf_format( - output_gtf, mrna, gene.attributes["ID"][0], mrna.attributes["ID"][0] + output_gtf, + exon, + gene.attributes["ID"][0], + mrna.attributes["ID"][0], ) - for exon_index, exon in enumerate(exons): - exon.attributes["ID"] = exon_ids[exon_index] + "." + str(frame) - exon.attributes["Parent"] = mrna.attributes["ID"] - cls._write_feature_in_gtf_format( - output_gtf, - exon, - gene.attributes["ID"][0], - mrna.attributes["ID"][0], - ) - for cds_index, cds_entry in enumerate(cds): - cds_entry.start = exons[cds_index].start - cds_entry.end = exons[cds_index].end - if cds_index == start_exon_index: - cds_entry.start = cds_entry.start + frame # pyright: ignore - if cds_index == end_exon_index: - cds_entry.end = cds_entry.end - ( # pyright: ignore - (contig_length - frame) % 3 + for exon_idx, exon in enumerate(exons): + # TODO: Is there a better solution, + # e.g. copying and modifying the feature? + exon.featuretype = "CDS" + # strand = +, dir = sense + # -> add frame to start of first CDS + # strand = +, dir = antisense + # -> subtract frame from end of first CDS (is first after reversing) + # strand = -, dir = sense + # -> subtract frame from end of first CDS + # strand = -, dir = antisense + # -> add frame to start of first CDS (is first after reversing) + # --> differentiation between +/- strand should suffice after reversing + if strand == "+": + if exon_idx == 0: + exon.start = exon.start + frame # pyright: ignore + if exon_idx == (len(exons) - 1): + exon.end = exon.end - ( # pyright: ignore + (len(start_end_cut_contig) - frame) % 3 + ) + else: + if exon_idx == 0: + exon.end = exon.end - frame # pyright: ignore + if exon_idx == (len(exons) - 1): + exon.start = exon.start + ( # pyright: ignore + (len(start_end_cut_contig) - frame) % 3 ) - cds_entry.attributes["ID"] = cds_ids[cds_index] + "." + str(frame) - cds_entry.attributes["Parent"] = mrna.attributes["ID"] - cls._write_feature_in_gtf_format( - output_gtf, - cds_entry, - gene.attributes["ID"][0], - mrna.attributes["ID"][0], - ) - elif ( - (direction == "+" and strand == "-") - or (direction == "-" and strand == "+") - or (direction == "." and strand == "-") - ): - new_end = start_exon.end + (contig_start - 1) - start_exon.end = new_end - mrna.end = new_end - gene.end = new_end - new_start = end_exon.start - (contig_length - contig_end) - end_exon.start = new_start - mrna.start = new_start - gene.start = new_start - - cls._write_feature_in_gtf_format(output_gtf, gene, gene.attributes["ID"][0]) - for frame in range(3): - mrna.attributes["ID"] = mrna_id + "." + str(frame) cls._write_feature_in_gtf_format( - output_gtf, mrna, gene.attributes["ID"][0], mrna.attributes["ID"][0] + output_gtf, + exon, + gene.attributes["ID"][0], + mrna.attributes["ID"][0], ) - for exon_index, exon in enumerate(exons): - exon.attributes["ID"] = exon_ids[exon_index] + "." + str(frame) - exon.attributes["Parent"] = mrna.attributes["ID"] - cls._write_feature_in_gtf_format( - output_gtf, - exon, - gene.attributes["ID"][0], - mrna.attributes["ID"][0], - ) - for cds_index, cds_entry in enumerate(cds): - cds_entry.start = exons[cds_index].start - cds_entry.end = exons[cds_index].end - if cds_index == start_exon_index: - cds_entry.end = cds_entry.end - frame # pyright: ignore - if cds_index == end_exon_index: - cds_entry.start = cds_entry.start + ( # pyright: ignore - (contig_length - frame) % 3 - ) - cds_entry.attributes["ID"] = cds_ids[cds_index] + "." + str(frame) - cds_entry.attributes["Parent"] = mrna.attributes["ID"] - cls._write_feature_in_gtf_format( - output_gtf, - cds_entry, - gene.attributes["ID"][0], - mrna.attributes["ID"][0], - ) - else: - raise ValueError("Strand must be one of '+', '-'.") + + return start_end_cut_contig @classmethod def generate_gtf_input_file( cls, path_to_gff: Path, output_directory: Path, - sequence_lengths_per_contig: List[int], - ) -> List[int]: - # Track number of transcripts to write protein FASTA with matching ids - number_of_transcripts_per_contig: List[int] = [ - 0 for _ in range(len(sequence_lengths_per_contig)) + contig_sequences: List[Tuple[str, str]], + no_indels=False, + ) -> Tuple[List[List[int]], List[List[str]]]: + # Track contig alignment ids to write protein FASTA with matching ids + alignment_ids_per_contig: List[List[int]] = [ + [] for _ in range(len(contig_sequences)) + ] + # Per original contig, there can be several new contigs + # based on different cutoffs + new_contig_sequences: List[List[str]] = [ + [] for _ in range(len(contig_sequences)) ] gffutils_db = gffutils.create_db( path_to_gff.absolute().as_posix(), (path_to_gff.parent / "gffutils_db.sqlite").absolute().as_posix(), ) + with open( output_directory / "pogo_gtf_in.gtf", "wt", encoding="utf-8" ) as output_gtf: for gene_feature in gffutils_db.features_of_type("gene"): gene_children = list(gffutils_db.children(gene_feature)) + + mrna = [ + gene_child + for gene_child in gene_children + if gene_child.featuretype == "mRNA" + ][ + 0 + ] # There can be only one mRNA per gene + + # If no indels allowed: Check if feature contains indels + # -> If so, gene feature is skipped + if no_indels: + mrna_indels = int(mrna.attributes["indels"][0]) + if mrna_indels != 0: + continue + exons = [ gene_child for gene_child in gene_children @@ -310,91 +294,51 @@ def generate_gtf_input_file( strand = first_exon.strand target: str = first_exon.attributes["Target"][0] contig_id, _, _, direction = target.split(" ") - contig_length = sequence_lengths_per_contig[int(contig_id[-1])] - if direction == ".": # indeterminate - start_exon_index = exons.index( - min( - exons, - key=lambda exon: int( - exon.attributes["Target"][0].split(" ")[2] - ), - ) - ) - end_exon_index = exons.index( - max( - exons, - key=lambda exon: int( - exon.attributes["Target"][0].split(" ")[1] - ), - ) - ) - else: # sense or antisense - start_exon_index = exons.index( - min( - exons, - key=lambda exon: int( - exon.attributes["Target"][0].split(" ")[1] - ), - ) - ) - end_exon_index = exons.index( - max( - exons, - key=lambda exon: int( - exon.attributes["Target"][0].split(" ")[2] - ), - ) - ) - mrna = [ - gene_child - for gene_child in gene_children - if gene_child.featuretype == "mRNA" - ][ - 0 - ] # There can be only one mRNA per gene - mrna_id = mrna.attributes["ID"][0] - number_of_transcripts_per_contig[int(mrna_id.split(".")[0][-1])] += 1 + contig_id = int(contig_id.split("-")[-1]) + contig = contig_sequences[contig_id] - cls._write_new_feature_coordinates( + new_contig = cls._write_new_feature_coordinates( output_gtf, gene_feature, mrna, exons, - [ - gene_child - for gene_child in gene_children - if gene_child.featuretype == "CDS" - ], - start_exon_index, - end_exon_index, strand, direction, - contig_length, + contig[1], ) - return number_of_transcripts_per_contig + mrna_id = mrna.attributes["ID"][0] + contig_idx = int(mrna_id.split(".")[0].split("-")[-1]) + path_number = int(mrna_id.split(".")[1].replace("mrna", "")) + new_contig_sequences[contig_idx].append(new_contig) + alignment_ids_per_contig[contig_idx].append(path_number) + + return (alignment_ids_per_contig, new_contig_sequences) + # TODO: Remove unneeded arguments @staticmethod def generate_protein_fasta_input_file( - contig_sequences: List[Tuple[str, str]], + contig_ids: List[str], + contig_sequences: List[List[str]], output_directory: Path, - number_of_transcripts_per_contig: List[int], + alignment_ids_per_contig: List[List[int]], ) -> None: + # TODO: Adapt to new separation of ids and seqs with open( output_directory / "pogo_fasta_in.fa", "wt", encoding="utf-8" ) as output_file: - for contig_index, (contig_id, contig_sequence) in enumerate( - contig_sequences + for contig_id, contig_cut_sequences, alignment_ids in zip( + contig_ids, contig_sequences, alignment_ids_per_contig ): - for translation, frame in get_three_frame_translations( - contig_sequence, False + for alignment_id, contig_sequence in zip( + alignment_ids, contig_cut_sequences ): - for transcript_index in range( - number_of_transcripts_per_contig[contig_index] + for translation, frame in get_three_frame_translations( + contig_sequence, False ): - gene_id = f"{contig_id}_path{str(transcript_index + 1)}" + gene_id = f"{contig_id}_path{str(alignment_id)}" transcript_id = ( - f"{contig_id}_mrna{str(transcript_index + 1)}_{str(frame)}" + f"{contig_id}_mrna{str(alignment_id)}_{str(frame)}" ) output_file.write( ( @@ -420,26 +364,29 @@ def _get_contig_sequences(path_to_contig_sequences: Path) -> List[Tuple[str, str @classmethod def generate_gtf_and_protein_files_for_directory( - cls, path_to_directory: Path + cls, path_to_directory: Path, no_indels=False ) -> None: contig_sequences = cls._get_contig_sequences( path_to_directory / "resulting_contigs.fa" ) - number_of_transcripts_per_contig = cls.generate_gtf_input_file( - path_to_directory / "alignment_result.gff3", - path_to_directory, - [len(contig_sequence[1]) for contig_sequence in contig_sequences], + alignment_ids_per_contig, updated_contig_sequences = ( + cls.generate_gtf_input_file( + path_to_directory / "alignment_result.gff3", + path_to_directory, + contig_sequences, + no_indels, + ) ) cls.generate_protein_fasta_input_file( - contig_sequences, + [contig_sequence[0] for contig_sequence in contig_sequences], + updated_contig_sequences, path_to_directory, - number_of_transcripts_per_contig, + alignment_ids_per_contig, ) @classmethod def generate_gtf_and_protein_files_for_multiple_directories( - cls, - paths_to_directories: List[Path], + cls, paths_to_directories: List[Path], no_indels=False ) -> None: # TODO: Refactor code duplication try: @@ -453,5 +400,9 @@ def generate_gtf_and_protein_files_for_multiple_directories( ) with multiprocessing.Pool(n_processes) as pool: pool.map( - cls.generate_gtf_and_protein_files_for_directory, paths_to_directories + partial( + cls.generate_gtf_and_protein_files_for_directory, + no_indels=no_indels, + ), + paths_to_directories, )