diff --git a/environment_linux.yml b/environment_linux.yml index 90022ee..f8d61c8 100644 --- a/environment_linux.yml +++ b/environment_linux.yml @@ -56,9 +56,9 @@ dependencies: - bwidget=1.9.14 - bzip2=1.0.8 - c-ares=1.24.0 - - ca-certificates=2023.11.17 + - ca-certificates=2024.6.2 - cairo=1.16.0 - - certifi=2023.11.17 + - certifi=2024.6.2 - charset-normalizer=3.3.2 - click=8.1.7 - contourpy=1.1.1 @@ -82,7 +82,7 @@ dependencies: - gfortran_impl_linux-64=13.2.0 - giflib=5.2.1 - glpk=5.0 - - gmap=2023.10.10 + - gmap=2024.05.20 - gmp=6.3.0 - graphite2=1.3.13 - gsl=2.7 @@ -155,7 +155,7 @@ dependencies: - numpy=1.24.4 - openjdk=17.0.3 - openjpeg=2.5.0 - - openssl=3.2.0 + - openssl=3.3.1 - packaging=23.2 - pandas=2.0.3 - pandoc=3.1.3 diff --git a/environment_macos.yml b/environment_macos.yml index c5f0df6..a834900 100644 --- a/environment_macos.yml +++ b/environment_macos.yml @@ -4,74 +4,78 @@ channels: - conda-forge dependencies: - _r-mutex=1.0.1 - - argcomplete=3.1.2 - - argh=0.30.4 - - bioconductor-annotate=1.76.0 - - bioconductor-annotationdbi=1.60.0 - - bioconductor-biobase=2.58.0 - - bioconductor-biocfilecache=2.6.0 - - bioconductor-biocgenerics=0.44.0 - - bioconductor-biocio=1.8.0 - - bioconductor-biocparallel=1.32.5 - - bioconductor-biomart=2.54.0 - - bioconductor-biostrings=2.66.0 - - bioconductor-ctc=1.72.0 - - bioconductor-data-packages=20230718 - - bioconductor-delayedarray=0.24.0 - - bioconductor-deseq2=1.38.0 - - bioconductor-dexseq=1.44.0 - - bioconductor-edger=3.40.0 - - bioconductor-genefilter=1.80.0 - - bioconductor-genelendatabase=1.34.0 - - bioconductor-geneplotter=1.76.0 - - bioconductor-genomeinfodb=1.34.9 - - bioconductor-genomeinfodbdata=1.2.9 - - bioconductor-genomicalignments=1.34.0 - - bioconductor-genomicfeatures=1.50.2 - - bioconductor-genomicranges=1.50.0 - - bioconductor-go.db=3.16.0 - - bioconductor-goseq=1.50.0 - - bioconductor-iranges=2.32.0 - - bioconductor-keggrest=1.38.0 - - bioconductor-limma=3.54.0 - - bioconductor-matrixgenerics=1.10.0 - - bioconductor-qvalue=2.30.0 - - bioconductor-rhtslib=2.0.0 - - bioconductor-rsamtools=2.14.0 - - bioconductor-rtracklayer=1.58.0 - - bioconductor-s4vectors=0.36.0 - - bioconductor-summarizedexperiment=1.28.0 - - bioconductor-xvector=0.38.0 - - bioconductor-zlibbioc=1.44.0 + - argcomplete=3.5.1 + - argh=0.31.3 + - bioconductor-annotate=1.80.0 + - bioconductor-annotationdbi=1.64.1 + - bioconductor-apeglm=1.24.0 + - bioconductor-biobase=2.62.0 + - bioconductor-biocfilecache=2.10.1 + - bioconductor-biocgenerics=0.48.1 + - bioconductor-biocio=1.12.0 + - bioconductor-biocparallel=1.36.0 + - bioconductor-biomart=2.58.0 + - bioconductor-biostrings=2.70.1 + - bioconductor-ctc=1.76.0 + - bioconductor-data-packages=20250401 + - bioconductor-delayedarray=0.28.0 + - bioconductor-deseq2=1.42.0 + - bioconductor-dexseq=1.48.0 + - bioconductor-edger=4.0.16 + - bioconductor-genefilter=1.84.0 + - bioconductor-genelendatabase=1.38.0 + - bioconductor-geneplotter=1.80.0 + - bioconductor-genomeinfodb=1.38.1 + - bioconductor-genomeinfodbdata=1.2.11 + - bioconductor-genomicalignments=1.38.0 + - bioconductor-genomicfeatures=1.54.1 + - bioconductor-genomicranges=1.54.1 + - bioconductor-go.db=3.18.0 + - bioconductor-goseq=1.54.0 + - bioconductor-iranges=2.36.0 + - bioconductor-keggrest=1.42.0 + - bioconductor-limma=3.58.1 + - bioconductor-matrixgenerics=1.14.0 + - bioconductor-qvalue=2.34.0 + - bioconductor-rhtslib=2.4.0 + - bioconductor-rsamtools=2.18.0 + - bioconductor-rtracklayer=1.62.0 + - bioconductor-s4arrays=1.2.0 + - bioconductor-s4vectors=0.40.2 + - bioconductor-sparsearray=1.2.2 + - bioconductor-summarizedexperiment=1.32.0 + - bioconductor-xvector=0.42.0 + - bioconductor-zlibbioc=1.48.0 - biopython=1.81 - - boost-cpp=1.78.0 - - bowtie2=2.5.2 + - boost-cpp=1.85.0 + - bowtie2=2.5.4 - brotli=1.1.0 - brotli-bin=1.1.0 - brotli-python=1.1.0 - - bwidget=1.9.14 + - bwidget=1.10.1 - bzip2=1.0.8 - - c-ares=1.20.1 - - ca-certificates=2023.11.17 - - cairo=1.16.0 - - cctools_osx-64=973.0.1 - - certifi=2023.11.17 - - charset-normalizer=3.3.1 - - clang=16.0.1 - - clang-16=16.0.1 - - clang_osx-64=16.0.1 - - clangxx=16.0.1 - - clangxx_osx-64=16.0.1 + - c-ares=1.34.4 + - ca-certificates=2025.1.31 + - cairo=1.18.0 + - cctools_osx-64=986 + - certifi=2024.8.30 + - cffi=1.17.0 + - charset-normalizer=3.4.0 + - clang=18.1.7 + - clang-18=18.1.7 + - clang_impl_osx-64=18.1.7 + - clang_osx-64=18.1.7 + - clangxx=18.1.7 + - clangxx_impl_osx-64=18.1.7 + - clangxx_osx-64=18.1.7 - click=8.1.7 - - colorama=0.4.6 - - compiler-rt=16.0.1 - - compiler-rt_osx-64=16.0.1 + - compiler-rt=18.1.7 + - compiler-rt_osx-64=18.1.7 - contourpy=1.1.1 - - curl=8.1.2 + - curl=8.8.0 - cycler=0.12.1 - datasketch=1.6.4 - - exceptiongroup=1.1.3 - - expat=2.5.0 + - expat=2.7.0 - font-ttf-dejavu-sans-mono=2.37 - font-ttf-inconsolata=3.000 - font-ttf-source-code-pro=2.038 @@ -79,301 +83,338 @@ dependencies: - fontconfig=2.14.2 - fonts-conda-ecosystem=1 - fonts-conda-forge=1 - - fonttools=4.43.1 + - fonttools=4.53.1 - freetype=2.12.1 - fribidi=1.0.10 - - gettext=0.21.1 + - gdbm=1.18 + - gettext=0.23.1 + - gettext-tools=0.23.1 - gffutils=0.12 - - gfortran_impl_osx-64=11.4.0 - - gfortran_osx-64=11.4.0 + - gfortran_impl_osx-64=12.3.0 + - gfortran_osx-64=12.3.0 - glpk=5.0 - - gmap=2023.10.10 - - gmp=6.2.1 + - gmap=2024.05.20 + - gmp=6.3.0 - graphite2=1.3.13 - - gsl=2.7 - - harfbuzz=6.0.0 - - hdf5=1.12.2 - - htseq=2.0.3 - - htslib=1.17 - - icu=70.1 - - idna=3.4 - - importlib-metadata=6.8.0 - - importlib-resources=6.1.0 - - importlib_resources=6.1.0 - - iniconfig=2.0.0 - - isl=0.25 - - joblib=1.3.2 + - h2=4.1.0 + - harfbuzz=8.5.0 + - hdf5=1.14.3 + - hpack=4.0.0 + - htseq=2.0.5 + - htslib=1.21 + - hyperframe=6.0.1 + - icu=73.2 + - idna=3.10 + - importlib-metadata=8.5.0 + - importlib-resources=6.4.5 + - importlib_resources=6.4.5 + - isl=0.26 + - joblib=1.4.2 - jq=1.5 - - kallisto=0.50.0 + - k8=1.2 + - kallisto=0.51.1 - kiwisolver=1.4.5 - - kmer-jellyfish=2.3.0 - - krb5=1.20.1 - - lcms2=2.15 - - ld64_osx-64=609 + - kmer-jellyfish=2.3.1 + - krb5=1.21.3 + - lcms2=2.16 + - ld64_osx-64=711 - lerc=4.0.0 - - libaec=1.1.2 + - libaec=1.1.3 + - libasprintf=0.23.1 + - libasprintf-devel=0.23.1 - libblas=3.9.0 + - libboost=1.85.0 + - libboost-devel=1.85.0 + - libboost-headers=1.85.0 - libbrotlicommon=1.1.0 - libbrotlidec=1.1.0 - libbrotlienc=1.1.0 - libcblas=3.9.0 - - libclang-cpp16=16.0.1 - - libcurl=8.1.2 - - libcxx=16.0.6 + - libclang-cpp18.1=18.1.7 + - libcurl=8.8.0 + - libcxx=20.1.2 - libdb=6.2.32 - - libdeflate=1.18 - - libedit=3.1.20191231 + - libdeflate=1.20 + - libedit=3.1.20250104 - libev=4.33 - - libexpat=2.5.0 - - libffi=3.4.2 + - libexpat=2.7.0 + - libffi=3.4.6 - libgcc=4.8.5 + - libgettextpo=0.23.1 + - libgettextpo-devel=0.23.1 - libgfortran=5.0.0 - - libgfortran-devel_osx-64=11.4.0 - - libgfortran5=13.2.0 - - libglib=2.78.0 - - libiconv=1.17 + - libgfortran-devel_osx-64=12.3.0 + - libgfortran5=14.2.0 + - libglib=2.80.2 + - libhwloc=2.11.2 + - libiconv=1.18 + - libintl=0.23.1 + - libintl-devel=0.23.1 - libjemalloc=5.3.0 - - libjpeg-turbo=2.1.5.1 + - libjpeg-turbo=3.0.0 - liblapack=3.9.0 - - libllvm16=16.0.1 - - libnghttp2=1.55.1 - - libopenblas=0.3.24 - - libpng=1.6.39 - - libsqlite=3.43.2 + - libllvm18=18.1.7 + - liblzma=5.8.1 + - liblzma-devel=5.8.1 + - libnghttp2=1.58.0 + - libopenblas=0.3.25 + - libpng=1.6.43 + - libsqlite=3.46.0 - libssh2=1.11.0 - - libtiff=4.5.1 - - libwebp-base=1.3.2 + - libtiff=4.6.0 + - libwebp-base=1.5.0 - libxcb=1.15 - - libxml2=2.10.3 + - libxml2=2.12.7 - libzlib=1.2.13 - - llvm-openmp=17.0.3 - - llvm-tools=16.0.1 - - make=4.3 + - llvm-openmp=20.1.2 + - llvm-tools=18.1.7 + - make=4.4.1 - matplotlib-base=3.7.3 + - minimap2=2.28 - mpc=1.3.1 - mpfr=4.2.1 - munkres=1.0.7 - - ncurses=6.4 + - ncurses=6.5 + - ntcard=1.2.2 - numpy=1.24.4 - - openjdk=21.0.1 - - openjpeg=2.5.0 - - openssl=3.2.0 - - packaging=23.2 + - openjdk=22.0.1 + - openjpeg=2.5.2 + - openssl=3.4.1 + - packaging=24.2 - pandas=2.0.3 - - pandoc=3.1.3 - - pango=1.50.14 - - pcre2=10.40 + - pandoc=3.6.4 + - pango=1.54.0 + - pcre2=10.43 - perl=5.32.1 - perl-constant=1.33 - perl-db_file=1.858 - perl-exporter=5.72 - perl-file-util=4.201720 - perl-lib=0.63 - - pillow=10.0.0 - - pip=23.3.1 - - pixman=0.42.2 - - platformdirs=3.11.0 - - pluggy=1.3.0 - - pooch=1.8.0 + - pillow=10.3.0 + - pip=24.3.1 + - pixman=0.44.2 + - platformdirs=4.3.6 + - pooch=1.8.2 - pthread-stubs=0.4 - - pyfaidx=0.7.2.2 + - pycparser=2.22 + - pyfaidx=0.8.1.3 - pyfastx=2.0.1 - - pyparsing=3.1.1 - - pysam=0.21.0 + - pyparsing=3.1.4 + - pysam=0.22.1 - pysocks=1.7.1 - - pytest=7.4.3 - python=3.8.18 - - python-dateutil=2.8.2 + - python-dateutil=2.9.0 - python-dotenv=1.0.0 - - python-tzdata=2023.3 + - python-tzdata=2024.2 - python_abi=3.8 - - pytz=2023.3.post1 + - pytz=2024.2 - pyvcf3=1.0.3 - - pyyaml=6.0.1 - - r-amap=0.8_19 - - r-ape=5.7_1 - - r-argparse=2.2.2 - - r-askpass=1.2.0 + - pyyaml=6.0.2 + - r-abind=1.4_5 + - r-amap=0.8_20 + - r-ape=5.8_1 + - r-argparse=2.2.5 + - r-ashr=2.2_63 + - r-askpass=1.2.1 - r-assertthat=0.2.1 - - r-backports=1.4.1 - - r-base=4.2.3 + - r-backports=1.5.0 + - r-base=4.3.3 - r-base64enc=0.1_3 - - r-bh=1.81.0_1 - - r-biasedurn=2.0.11 - - r-bit=4.0.5 - - r-bit64=4.0.5 - - r-bitops=1.0_7 + - r-bbmle=1.0.25.1 + - r-bdsmatrix=1.3_7 + - r-bh=1.87.0_1 + - r-biasedurn=2.0.12 + - r-bit=4.6.0 + - r-bit64=4.6.0_1 + - r-bitops=1.0_9 - r-blob=1.2.4 - - r-broom=1.0.5 - - r-bslib=0.5.1 - - r-cachem=1.0.8 - - r-callr=3.7.3 - - r-catools=1.18.2 + - r-broom=1.0.8 + - r-bslib=0.9.0 + - r-cachem=1.1.0 + - r-callr=3.7.6 + - r-catools=1.18.3 - r-cellranger=1.1.0 - - r-cli=3.6.1 + - r-cli=3.6.4 - r-clipr=0.8.0 - - r-cluster=2.1.4 - - r-codetools=0.2_19 - - r-colorspace=2.1_0 + - r-cluster=2.1.8.1 + - r-coda=0.19_4.1 + - r-codetools=0.2_20 + - r-colorspace=2.1_1 - r-conflicted=1.2.0 - - r-cpp11=0.4.6 - - r-crayon=1.5.2 - - r-curl=5.0.1 - - r-data.table=1.14.8 - - r-dbi=1.1.3 - - r-dbplyr=2.4.0 - - r-digest=0.6.33 - - r-dplyr=1.1.3 + - r-cpp11=0.5.2 + - r-crayon=1.5.3 + - r-curl=6.2.2 + - r-data.table=1.15.2 + - r-dbi=1.2.3 + - r-dbplyr=2.5.0 + - r-digest=0.6.37 + - r-dplyr=1.1.4 - r-dtplyr=1.3.1 - r-ellipsis=0.3.2 - - r-evaluate=0.22 - - r-fansi=1.0.5 - - r-farver=2.1.1 - - r-fastcluster=1.2.3 - - r-fastmap=1.1.1 - - r-fastmatch=1.1_4 - - r-filelock=1.0.2 - - r-findpython=1.0.8 - - r-fontawesome=0.5.2 + - r-emdbook=1.3.13 + - r-etrunct=0.1 + - r-evaluate=1.0.3 + - r-fansi=1.0.6 + - r-farver=2.1.2 + - r-fastcluster=1.2.6 + - r-fastmap=1.2.0 + - r-fastmatch=1.1_6 + - r-filelock=1.0.3 + - r-findpython=1.0.9 + - r-fontawesome=0.5.3 - r-forcats=1.0.0 - r-formatr=1.14 - - r-fs=1.6.3 + - r-fs=1.6.5 - r-futile.logger=1.4.3 - r-futile.options=1.0.1 - r-gargle=1.5.2 - r-generics=0.1.3 - - r-ggplot2=3.4.4 - - r-glue=1.6.2 + - r-ggplot2=3.5.1 + - r-glue=1.8.0 - r-googledrive=2.1.1 - r-googlesheets4=1.1.1 - - r-gplots=3.1.3 - - r-gtable=0.3.4 - - r-gtools=3.9.4 - - r-haven=2.5.3 - - r-highr=0.10 + - r-gplots=3.2.0 + - r-gtable=0.3.6 + - r-gtools=3.9.5 + - r-haven=2.5.4 + - r-highr=0.11 - r-hms=1.1.3 - - r-htmltools=0.5.6.1 + - r-htmltools=0.5.8.1 - r-httr=1.4.7 - r-hwriter=1.3.2.1 - r-ids=1.0.1 - - r-igraph=1.4.2 + - r-igraph=2.0.3 + - r-invgamma=1.1 + - r-irlba=2.3.5.1 - r-isoband=0.2.7 - r-jquerylib=0.1.4 - - r-jsonlite=1.8.7 - - r-kernsmooth=2.23_22 - - r-knitr=1.45 + - r-jsonlite=1.9.1 + - r-kernsmooth=2.23_26 + - r-knitr=1.50 - r-labeling=0.4.3 - r-lambda.r=1.2.4 - - r-lattice=0.22_5 - - r-lifecycle=1.0.3 - - r-locfit=1.5_9.8 - - r-lubridate=1.9.3 + - r-lattice=0.22_7 + - r-lifecycle=1.0.4 + - r-locfit=1.5_9.12 + - r-lubridate=1.9.4 - r-magrittr=2.0.3 - - r-mass=7.3_60 - - r-matrix=1.6_1.1 - - r-matrixstats=1.0.0 + - r-mass=7.3_60.0.1 + - r-matrix=1.6_5 + - r-matrixstats=1.5.0 - r-memoise=2.0.1 - - r-mgcv=1.9_0 - - r-mime=0.12 + - r-mgcv=1.9_3 + - r-mime=0.13 + - r-mixsqp=0.3_54 - r-modelr=0.1.11 - - r-munsell=0.5.0 - - r-nlme=3.1_163 - - r-openssl=2.1.1 - - r-phangorn=2.11.1 - - r-pillar=1.9.0 + - r-munsell=0.5.1 + - r-mvtnorm=1.3_3 + - r-nlme=3.1_168 + - r-numderiv=2016.8_1.1 + - r-openssl=2.3.2 + - r-phangorn=2.12.1 + - r-pillar=1.10.2 - r-pkgconfig=2.0.3 - r-plogr=0.2.0 - r-plyr=1.8.9 - r-png=0.1_8 - r-prettyunits=1.2.0 - - r-processx=3.8.2 - - r-progress=1.2.2 - - r-ps=1.7.5 - - r-purrr=1.0.2 + - r-processx=3.8.6 + - r-progress=1.2.3 + - r-ps=1.8.1 + - r-purrr=1.0.4 - r-quadprog=1.5_8 - - r-r6=2.5.1 - - r-ragg=1.2.5 + - r-r6=2.6.1 + - r-ragg=1.3.3 - r-rappdirs=0.3.3 - r-rcolorbrewer=1.1_3 - - r-rcpp=1.0.11 - - r-rcpparmadillo=0.12.6.4.0 - - r-rcurl=1.98_1.12 - - r-readr=2.1.4 - - r-readxl=1.4.3 + - r-rcpp=1.0.14 + - r-rcpparmadillo=14.4.1_1 + - r-rcppeigen=0.3.4.0.2 + - r-rcppnumerical=0.6_0 + - r-rcurl=1.98_1.16 + - r-readr=2.1.5 + - r-readxl=1.4.5 - r-rematch=2.0.0 - r-rematch2=2.1.2 - - r-reprex=2.0.2 + - r-reprex=2.1.1 - r-reshape2=1.4.4 - r-restfulr=0.0.15 - - r-rjson=0.2.21 - - r-rlang=1.1.1 - - r-rmarkdown=2.25 - - r-rsqlite=2.3.2 - - r-rstudioapi=0.15.0 - - r-rvest=1.0.3 - - r-sass=0.4.7 - - r-scales=1.2.1 + - r-rjson=0.2.23 + - r-rlang=1.1.5 + - r-rmarkdown=2.29 + - r-rsqlite=2.3.9 + - r-rstudioapi=0.17.1 + - r-rvest=1.0.4 + - r-sass=0.4.9 + - r-scales=1.3.0 - r-selectr=0.4_2 - - r-sm=2.2_5.7.1 + - r-sm=2.2_6.0 - r-snow=0.4_4 + - r-squarem=2021.1 - r-statmod=1.5.0 - - r-stringi=1.7.12 - - r-stringr=1.5.0 - - r-survival=3.5_7 - - r-sys=3.4.2 - - r-systemfonts=1.0.5 - - r-textshaping=0.3.6 + - r-stringi=1.8.4 + - r-stringr=1.5.1 + - r-survival=3.8_3 + - r-sys=3.4.3 + - r-systemfonts=1.2.1 + - r-textshaping=0.4.0 - r-tibble=3.2.1 - - r-tidyr=1.3.0 - - r-tidyselect=1.2.0 + - r-tidyr=1.3.1 + - r-tidyselect=1.2.1 - r-tidyverse=2.0.0 - - r-timechange=0.2.0 - - r-tinytex=0.48 - - r-tzdb=0.4.0 + - r-timechange=0.3.0 + - r-tinytex=0.56 + - r-truncnorm=1.0_9 + - r-tzdb=0.5.0 - r-utf8=1.2.4 - - r-uuid=1.1_1 - - r-vctrs=0.6.4 - - r-vioplot=0.4.0 + - r-uuid=1.2_1 + - r-vctrs=0.6.5 + - r-vioplot=0.5.1 - r-viridislite=0.4.2 - - r-vroom=1.6.4 - - r-withr=2.5.2 - - r-xfun=0.40 - - r-xml=3.99_0.14 - - r-xml2=1.3.3 + - r-vroom=1.6.5 + - r-withr=3.0.2 + - r-xfun=0.52 + - r-xml=3.99_0.17 + - r-xml2=1.3.6 - r-xtable=1.8_4 - - r-yaml=2.3.7 - - r-zoo=1.8_12 + - r-yaml=2.3.10 + - r-zoo=1.8_13 + - racon=1.5.0 - readline=8.2 - - requests=2.31.0 - - salmon=1.10.2 - - samtools=1.18 + - requests=2.32.3 + - rnabloom=2.0.1 + - ruby=3.2.2 + - salmon=1.10.3 + - samtools=1.21 - scikit-learn=1.3.2 - scipy=1.10.1 - - setuptools=68.2.2 + - setuptools=75.3.0 - sigtool=0.1.3 - - simplejson=3.19.2 + - simplejson=3.19.3 - six=1.16.0 - tapi=1100.0.11 - - tbb=2021.10.0 - - threadpoolctl=3.2.0 + - tbb=2022.1.0 + - threadpoolctl=3.5.0 - tk=8.6.13 - tktable=2.10 - toml=0.10.2 - - tomli=2.0.1 - - tomlkit=0.12.1 + - tomlkit=0.13.2 - trimmomatic=0.39 - trinity=2.15.1 - - typing-extensions=4.8.0 - - typing_extensions=4.8.0 - unicodedata2=15.1.0 - - urllib3=2.0.7 - - wheel=0.41.3 - - xmltodict=0.13.0 - - xorg-libxau=1.0.11 - - xorg-libxdmcp=1.1.3 - - xz=5.2.6 + - urllib3=2.2.3 + - wheel=0.45.1 + - xmltodict=0.14.2 + - xorg-libxau=1.0.12 + - xorg-libxdmcp=1.1.5 + - xz=5.8.1 + - xz-gpl-tools=5.8.1 + - xz-tools=5.8.1 + - yaggo=1.5.10 - yaml=0.2.5 - - yq=3.2.3 - - zipp=3.17.0 + - yq=3.4.3 + - zipp=3.21.0 - zlib=1.2.13 - - zstd=1.5.5 + - zstandard=0.23.0 + - zstd=1.5.6 diff --git a/pepti_map/aligning/gmap_wrapper.py b/pepti_map/aligning/gmap_wrapper.py index 9166bce..07b19f3 100644 --- a/pepti_map/aligning/gmap_wrapper.py +++ b/pepti_map/aligning/gmap_wrapper.py @@ -8,7 +8,7 @@ class GmapWrapper: - def __init__(self): + def __init__(self, min_trimmed_coverage: float = 0.0, min_identity: float = 0.0): env_vars = dotenv_values() try: n_threads = env_vars.get("GMAP_N_THREADS") @@ -27,6 +27,30 @@ def __init__(self): self._n_threads = n_threads self._batch_mode = batch_mode + try: + assert min_trimmed_coverage >= 0.0 and min_trimmed_coverage <= 1.0 + self._min_trimmed_coverage = min_trimmed_coverage + except AssertionError: + logging.info( + ( + "--min_trimmed_coverage option for GMAP must be between 0.0 and " + f"1.0, but was {str(min_trimmed_coverage)}. Using default of 0.0" + ) + ) + self._min_trimmed_coverage = 0.0 + + try: + assert min_identity >= 0.0 and min_identity <= 1.0 + self._min_identity = min_identity + except AssertionError: + logging.info( + ( + "--min_identity option for GMAP must be between 0.0 and " + f"1.0, but was {str(min_identity)}. Using default of 0.0" + ) + ) + self._min_identity = 0.0 + def build_index( self, files_to_index: List[Path], @@ -111,6 +135,10 @@ def produce_alignment( str(self._n_threads), "-f", "gff3_gene", + "--min-trimmed-coverage", + str(self._min_trimmed_coverage), + "--min-identity", + str(self._min_identity), ] alignment_command.extend( [ @@ -137,6 +165,10 @@ def _produce_alignment_for_single_file(self, path_to_sequences: Path) -> None: str(self._n_threads), "-f", "gff3_gene", + "--min-trimmed-coverage", + str(self._min_trimmed_coverage), + "--min-identity", + str(self._min_identity), path_to_sequences.absolute().as_posix(), ] with open( diff --git a/pepti_map/main.py b/pepti_map/main.py index 38d3d7d..240f3f1 100644 --- a/pepti_map/main.py +++ b/pepti_map/main.py @@ -196,7 +196,7 @@ def generate_trinity_results( return trinity_results_paths -def load_trinity_results_paths() -> List[Path]: +def load_current_results_paths() -> List[Path]: return TrinityWrapper.load_results_filepaths() @@ -205,8 +205,10 @@ def align_reads_to_genome( genome: Union[str, None], gmap_index: Union[str, None], output_dir: str, -) -> None: - gmap_wrapper = GmapWrapper() + min_trimmed_coverage: float, + min_identity: float, +) -> List[Path]: + gmap_wrapper = GmapWrapper(min_trimmed_coverage, min_identity) # TODO: How to automatically use previously generated index? if gmap_index is not None and gmap_index != "": gmap_index_path = Path(gmap_index) @@ -220,16 +222,37 @@ def align_reads_to_genome( logging.error(missing_option_message) raise ValueError(missing_option_message) + new_results_paths: List[Path] = [] for trinity_results_path in trinity_results_paths: gmap_wrapper.produce_alignment( [trinity_results_path], trinity_results_path.parent / "alignment_result.gff3", ) + # Check if actual output was produced + with open( + trinity_results_path.parent / "alignment_result.gff3", + "rt", + encoding="utf-8", + ) as gmap_output: + line_count = 0 + for _ in gmap_output: + line_count += 1 + if line_count == 4: + new_results_paths.append(trinity_results_path) + break + + # Save new output paths + TrinityWrapper.save_results_filepaths(new_results_paths) _write_last_step(Step.ALIGNMENT.value) logging.info("Generated alignment of assembled contigs with GMAP.") + return new_results_paths -def generate_pogo_input(paths_to_subdirectories: List[Path], peptide_file: str) -> None: +def generate_pogo_input( + paths_to_subdirectories: List[Path], + peptide_file: str, + no_indels: bool +) -> None: pogo_input_helper = PoGoInputHelper( Path(peptide_file), PATH_PEPTIDE_TO_CLUSTER_MAPPING_FILE ) @@ -237,7 +260,7 @@ def generate_pogo_input(paths_to_subdirectories: List[Path], peptide_file: str) paths_to_subdirectories, PATH_TO_MERGED_INDEXES ) PoGoInputHelper.generate_gtf_and_protein_files_for_multiple_directories( - paths_to_subdirectories + paths_to_subdirectories, no_indels ) _write_last_step(Step.POGO_INPUT.value) logging.info("Generated PoGo input files.") @@ -338,6 +361,7 @@ def concat_output(paths_to_subdirectories: List[Path], output_dir: str) -> None: "-pi", "--precompute-intersections", is_flag=True, + default=False, help=( "If used, the intersection sizes for the Jaccard Index " "calculation are precomputed during the matching phase." @@ -394,6 +418,33 @@ def concat_output(paths_to_subdirectories: List[Path], output_dir: str) -> None: "the '-g/--genome' option is ignored." ), ) +@click.option( + "-mtc", + "--min-trimmed-coverage", + required=False, + type=float, + default=0.0, + show_default=True, + help="Sets the '--min-trimmed-coverage' option for GMAP during alignment.", +) +@click.option( + "-mid", + "--min-identity", + required=False, + type=float, + default=0.0, + show_default=True, + help="Sets the '--min-identity' option for GMAP during alignment.", +) +@click.option( + "-ni", + "--no-indels", + required=False, + is_flag=True, + default=False, + help=("If set, contig alignments containing indels " + "are excluded from further processing.") +) def main( peptide_file: str, rna_file: str, @@ -407,6 +458,9 @@ def main( min_contig_length: int, genome: Union[str, None], gmap_index: Union[str, None], + min_trimmed_coverage: float, + min_identity: float, + no_indels: bool ): _setup(output_dir) @@ -464,13 +518,21 @@ def main( ) else: logging.info("Using already generated Trinity output files.") - trinity_results_paths = load_trinity_results_paths() + trinity_results_paths = load_current_results_paths() if last_step < Step.ALIGNMENT.value: logging.info("Aligning assembled RNA-seq reads to the genome.") - align_reads_to_genome(trinity_results_paths, genome, gmap_index, output_dir) + trinity_results_paths = align_reads_to_genome( + trinity_results_paths, + genome, + gmap_index, + output_dir, + min_trimmed_coverage, + min_identity, + ) else: logging.info("Using already generated alignments.") + trinity_results_paths = load_current_results_paths() paths_to_subdirectories = [ trinity_results_path.parent for trinity_results_path in trinity_results_paths @@ -478,7 +540,7 @@ def main( if last_step < Step.POGO_INPUT.value: logging.info("Generating input files for PoGo.") - generate_pogo_input(paths_to_subdirectories, peptide_file) + generate_pogo_input(paths_to_subdirectories, peptide_file, no_indels) else: logging.info("Using already generated PoGo input files.") diff --git a/pepti_map/output_generation/pogo_input_helper.py b/pepti_map/output_generation/pogo_input_helper.py index 93aca3e..58bf853 100644 --- a/pepti_map/output_generation/pogo_input_helper.py +++ b/pepti_map/output_generation/pogo_input_helper.py @@ -1,4 +1,5 @@ from collections import defaultdict +from functools import partial import logging import multiprocessing from dotenv import dotenv_values @@ -75,6 +76,10 @@ def generate_all_peptide_input_files( output_directory, merged_indexes[set_index] ) + @staticmethod + def _get_exon_feature_id(exon: gffutils.Feature) -> int: + return int(exon.attributes["ID"][0].split(".")[-1].replace("exon", "")) + @staticmethod def _write_feature_in_gtf_format( output_gtf: TextIO, @@ -115,191 +120,170 @@ def _write_new_feature_coordinates( gene: gffutils.Feature, mrna: gffutils.Feature, exons: List[gffutils.Feature], - cds: List[gffutils.Feature], - start_exon_index: int, - end_exon_index: int, strand: str, direction: str, - contig_length: int, - ) -> None: - start_exon = exons[start_exon_index] - end_exon = exons[end_exon_index] - if ( - not start_exon.start - or not start_exon.end - or not end_exon.start - or not end_exon.end - ): - raise ValueError("No start and end coordinates given.") - - if direction == ".": - contig_id, start_exon_contig_end, contig_start, _ = start_exon.attributes[ - "Target" - ][0].split(" ") - _, contig_end, end_exon_contig_start, _ = end_exon.attributes["Target"][ - 0 - ].split(" ") - contig_start = int(contig_start) - contig_end = int(contig_end) - - if start_exon == end_exon: - start_exon.attributes["Target"] = " ".join( - [contig_id, str(contig_length), "1", direction] - ) + contig: str, + ) -> str: + # The exons need to be sorted to follow the same order as in the original GFF. + # This is necessary because gffutils does not necessarily return the children + # of a feature in order when calling children(). The parameter order_by + # cannot be used here because it does not allow us to select for an + # exon identifier or specify the same order as in the original. + # TODO: Is there an easier option for ordering + # that does not involve string splitting? + exons.sort(key=cls._get_exon_feature_id) + + # If direction is antisense, change alignment to be on the complementary strand + if direction == "-": + if strand == "+": + strand = "-" else: - start_exon.attributes["Target"] = " ".join( - [contig_id, start_exon_contig_end, "1", direction] - ) - end_exon.attributes["Target"] = " ".join( - [contig_id, str(contig_length), end_exon_contig_start, direction] - ) - else: - contig_id, contig_start, start_exon_contig_end, _ = start_exon.attributes[ - "Target" - ][0].split(" ") - _, end_exon_contig_start, contig_end, _ = end_exon.attributes["Target"][ - 0 - ].split(" ") - contig_start = int(contig_start) - contig_end = int(contig_end) - - if start_exon == end_exon: - start_exon.attributes["Target"] = " ".join( - [contig_id, "1", str(contig_length), direction] - ) + strand = "+" + + gene.strand = strand + mrna.strand = strand + for exon in exons: + exon.strand = strand + + exons.reverse() + + # TODO: Is assumption that exons are already in correct order + # if dir=indeterminate true? + # Potentially only labeled "indeterminate" if only one exon? + + exon_starts: List[int] = [] + exon_ends: List[int] = [] + for exon in exons: + if direction == ".": + _, exon_end, exon_start, _ = exon.attributes["Target"][0].split(" ") else: - start_exon.attributes["Target"] = " ".join( - [contig_id, "1", start_exon_contig_end, direction] - ) - end_exon.attributes["Target"] = " ".join( - [contig_id, end_exon_contig_start, str(contig_length), direction] - ) + _, exon_start, exon_end, _ = exon.attributes["Target"][0].split(" ") + exon_starts.append(int(exon_start)) + exon_ends.append(int(exon_end)) - exon_ids = [exon.attributes["ID"][0] for exon in exons] - cds_ids = [cds_entry.attributes["ID"][0] for cds_entry in cds] - mrna_id = mrna.attributes["ID"][0] + cut_contig_parts: List[str] = [] + for i in range(len(exon_starts)): + current_start = exon_starts[i] + current_end = exon_ends[i] + current_part = contig[current_start - 1 : current_end] # noqa: E203 + cut_contig_parts.append(current_part) + start_end_cut_contig = "".join(cut_contig_parts) + + if len(start_end_cut_contig) < (0.7 * len(contig)): + # TODO: Filter out alignment and report + pass - if ( - (direction == "+" and strand == "+") - or (direction == "-" and strand == "-") - or (direction == "." and strand == "+") - ): - new_start = start_exon.start - (contig_start - 1) - start_exon.start = new_start - mrna.start = new_start - gene.start = new_start - new_end = end_exon.end + (contig_length - contig_end) - end_exon.end = new_end - mrna.end = new_end - gene.end = new_end + # TODO: Not needed? + # exon_ids = [exon.attributes["ID"][0] for exon in exons] + # cds_ids = [exon_id.replace("exon", "cds") for exon_id in exon_ids] + mrna_id = mrna.attributes["ID"][0] + exon_start_coords = [exon.start for exon in exons] + exon_end_coords = [exon.end for exon in exons] + cls._write_feature_in_gtf_format( + output_gtf, + gene, + gene.attributes["ID"][0], + ) + for frame in range(3): + mrna.attributes["ID"] = mrna_id + "." + str(frame) cls._write_feature_in_gtf_format( - output_gtf, - gene, - gene.attributes["ID"][0], + output_gtf, mrna, gene.attributes["ID"][0], mrna.attributes["ID"][0] ) - for frame in range(3): - mrna.attributes["ID"] = mrna_id + "." + str(frame) + for exon_idx, exon in enumerate(exons): + # TODO: Not needed? + # exon.attributes["ID"] = exon_ids[exon_index] + "." + str(frame) + # exon.attributes["Parent"] = mrna.attributes["ID"] + exon.featuretype = "exon" + exon.start = exon_start_coords[exon_idx] + exon.end = exon_end_coords[exon_idx] cls._write_feature_in_gtf_format( - output_gtf, mrna, gene.attributes["ID"][0], mrna.attributes["ID"][0] + output_gtf, + exon, + gene.attributes["ID"][0], + mrna.attributes["ID"][0], ) - for exon_index, exon in enumerate(exons): - exon.attributes["ID"] = exon_ids[exon_index] + "." + str(frame) - exon.attributes["Parent"] = mrna.attributes["ID"] - cls._write_feature_in_gtf_format( - output_gtf, - exon, - gene.attributes["ID"][0], - mrna.attributes["ID"][0], - ) - for cds_index, cds_entry in enumerate(cds): - cds_entry.start = exons[cds_index].start - cds_entry.end = exons[cds_index].end - if cds_index == start_exon_index: - cds_entry.start = cds_entry.start + frame # pyright: ignore - if cds_index == end_exon_index: - cds_entry.end = cds_entry.end - ( # pyright: ignore - (contig_length - frame) % 3 + for exon_idx, exon in enumerate(exons): + # TODO: Is there a better solution, + # e.g. copying and modifying the feature? + exon.featuretype = "CDS" + # strand = +, dir = sense + # -> add frame to start of first CDS + # strand = +, dir = antisense + # -> subtract frame from end of first CDS (is first after reversing) + # strand = -, dir = sense + # -> subtract frame from end of first CDS + # strand = -, dir = antisense + # -> add frame to start of first CDS (is first after reversing) + # --> differentiation between +/- strand should suffice after reversing + if strand == "+": + if exon_idx == 0: + exon.start = exon.start + frame # pyright: ignore + if exon_idx == (len(exons) - 1): + exon.end = exon.end - ( # pyright: ignore + (len(start_end_cut_contig) - frame) % 3 + ) + else: + if exon_idx == 0: + exon.end = exon.end - frame # pyright: ignore + if exon_idx == (len(exons) - 1): + exon.start = exon.start + ( # pyright: ignore + (len(start_end_cut_contig) - frame) % 3 ) - cds_entry.attributes["ID"] = cds_ids[cds_index] + "." + str(frame) - cds_entry.attributes["Parent"] = mrna.attributes["ID"] - cls._write_feature_in_gtf_format( - output_gtf, - cds_entry, - gene.attributes["ID"][0], - mrna.attributes["ID"][0], - ) - elif ( - (direction == "+" and strand == "-") - or (direction == "-" and strand == "+") - or (direction == "." and strand == "-") - ): - new_end = start_exon.end + (contig_start - 1) - start_exon.end = new_end - mrna.end = new_end - gene.end = new_end - new_start = end_exon.start - (contig_length - contig_end) - end_exon.start = new_start - mrna.start = new_start - gene.start = new_start - - cls._write_feature_in_gtf_format(output_gtf, gene, gene.attributes["ID"][0]) - for frame in range(3): - mrna.attributes["ID"] = mrna_id + "." + str(frame) cls._write_feature_in_gtf_format( - output_gtf, mrna, gene.attributes["ID"][0], mrna.attributes["ID"][0] + output_gtf, + exon, + gene.attributes["ID"][0], + mrna.attributes["ID"][0], ) - for exon_index, exon in enumerate(exons): - exon.attributes["ID"] = exon_ids[exon_index] + "." + str(frame) - exon.attributes["Parent"] = mrna.attributes["ID"] - cls._write_feature_in_gtf_format( - output_gtf, - exon, - gene.attributes["ID"][0], - mrna.attributes["ID"][0], - ) - for cds_index, cds_entry in enumerate(cds): - cds_entry.start = exons[cds_index].start - cds_entry.end = exons[cds_index].end - if cds_index == start_exon_index: - cds_entry.end = cds_entry.end - frame # pyright: ignore - if cds_index == end_exon_index: - cds_entry.start = cds_entry.start + ( # pyright: ignore - (contig_length - frame) % 3 - ) - cds_entry.attributes["ID"] = cds_ids[cds_index] + "." + str(frame) - cds_entry.attributes["Parent"] = mrna.attributes["ID"] - cls._write_feature_in_gtf_format( - output_gtf, - cds_entry, - gene.attributes["ID"][0], - mrna.attributes["ID"][0], - ) - else: - raise ValueError("Strand must be one of '+', '-'.") + + return start_end_cut_contig @classmethod def generate_gtf_input_file( cls, path_to_gff: Path, output_directory: Path, - sequence_lengths_per_contig: List[int], - ) -> List[int]: - # Track number of transcripts to write protein FASTA with matching ids - number_of_transcripts_per_contig: List[int] = [ - 0 for _ in range(len(sequence_lengths_per_contig)) + contig_sequences: List[Tuple[str, str]], + no_indels=False, + ) -> Tuple[List[List[int]], List[List[str]]]: + # Track contig alignment ids to write protein FASTA with matching ids + alignment_ids_per_contig: List[List[int]] = [ + [] for _ in range(len(contig_sequences)) + ] + # Per original contig, there can be several new contigs + # based on different cutoffs + new_contig_sequences: List[List[str]] = [ + [] for _ in range(len(contig_sequences)) ] gffutils_db = gffutils.create_db( path_to_gff.absolute().as_posix(), (path_to_gff.parent / "gffutils_db.sqlite").absolute().as_posix(), ) + with open( output_directory / "pogo_gtf_in.gtf", "wt", encoding="utf-8" ) as output_gtf: for gene_feature in gffutils_db.features_of_type("gene"): gene_children = list(gffutils_db.children(gene_feature)) + + mrna = [ + gene_child + for gene_child in gene_children + if gene_child.featuretype == "mRNA" + ][ + 0 + ] # There can be only one mRNA per gene + + # If no indels allowed: Check if feature contains indels + # -> If so, gene feature is skipped + if no_indels: + mrna_indels = int(mrna.attributes["indels"][0]) + if mrna_indels != 0: + continue + exons = [ gene_child for gene_child in gene_children @@ -310,91 +294,51 @@ def generate_gtf_input_file( strand = first_exon.strand target: str = first_exon.attributes["Target"][0] contig_id, _, _, direction = target.split(" ") - contig_length = sequence_lengths_per_contig[int(contig_id[-1])] - if direction == ".": # indeterminate - start_exon_index = exons.index( - min( - exons, - key=lambda exon: int( - exon.attributes["Target"][0].split(" ")[2] - ), - ) - ) - end_exon_index = exons.index( - max( - exons, - key=lambda exon: int( - exon.attributes["Target"][0].split(" ")[1] - ), - ) - ) - else: # sense or antisense - start_exon_index = exons.index( - min( - exons, - key=lambda exon: int( - exon.attributes["Target"][0].split(" ")[1] - ), - ) - ) - end_exon_index = exons.index( - max( - exons, - key=lambda exon: int( - exon.attributes["Target"][0].split(" ")[2] - ), - ) - ) - mrna = [ - gene_child - for gene_child in gene_children - if gene_child.featuretype == "mRNA" - ][ - 0 - ] # There can be only one mRNA per gene - mrna_id = mrna.attributes["ID"][0] - number_of_transcripts_per_contig[int(mrna_id.split(".")[0][-1])] += 1 + contig_id = int(contig_id.split("-")[-1]) + contig = contig_sequences[contig_id] - cls._write_new_feature_coordinates( + new_contig = cls._write_new_feature_coordinates( output_gtf, gene_feature, mrna, exons, - [ - gene_child - for gene_child in gene_children - if gene_child.featuretype == "CDS" - ], - start_exon_index, - end_exon_index, strand, direction, - contig_length, + contig[1], ) - return number_of_transcripts_per_contig + mrna_id = mrna.attributes["ID"][0] + contig_idx = int(mrna_id.split(".")[0].split("-")[-1]) + path_number = int(mrna_id.split(".")[1].replace("mrna", "")) + new_contig_sequences[contig_idx].append(new_contig) + alignment_ids_per_contig[contig_idx].append(path_number) + + return (alignment_ids_per_contig, new_contig_sequences) + # TODO: Remove unneeded arguments @staticmethod def generate_protein_fasta_input_file( - contig_sequences: List[Tuple[str, str]], + contig_ids: List[str], + contig_sequences: List[List[str]], output_directory: Path, - number_of_transcripts_per_contig: List[int], + alignment_ids_per_contig: List[List[int]], ) -> None: + # TODO: Adapt to new separation of ids and seqs with open( output_directory / "pogo_fasta_in.fa", "wt", encoding="utf-8" ) as output_file: - for contig_index, (contig_id, contig_sequence) in enumerate( - contig_sequences + for contig_id, contig_cut_sequences, alignment_ids in zip( + contig_ids, contig_sequences, alignment_ids_per_contig ): - for translation, frame in get_three_frame_translations( - contig_sequence, False + for alignment_id, contig_sequence in zip( + alignment_ids, contig_cut_sequences ): - for transcript_index in range( - number_of_transcripts_per_contig[contig_index] + for translation, frame in get_three_frame_translations( + contig_sequence, False ): - gene_id = f"{contig_id}_path{str(transcript_index + 1)}" + gene_id = f"{contig_id}_path{str(alignment_id)}" transcript_id = ( - f"{contig_id}_mrna{str(transcript_index + 1)}_{str(frame)}" + f"{contig_id}_mrna{str(alignment_id)}_{str(frame)}" ) output_file.write( ( @@ -420,26 +364,29 @@ def _get_contig_sequences(path_to_contig_sequences: Path) -> List[Tuple[str, str @classmethod def generate_gtf_and_protein_files_for_directory( - cls, path_to_directory: Path + cls, path_to_directory: Path, no_indels=False ) -> None: contig_sequences = cls._get_contig_sequences( path_to_directory / "resulting_contigs.fa" ) - number_of_transcripts_per_contig = cls.generate_gtf_input_file( - path_to_directory / "alignment_result.gff3", - path_to_directory, - [len(contig_sequence[1]) for contig_sequence in contig_sequences], + alignment_ids_per_contig, updated_contig_sequences = ( + cls.generate_gtf_input_file( + path_to_directory / "alignment_result.gff3", + path_to_directory, + contig_sequences, + no_indels, + ) ) cls.generate_protein_fasta_input_file( - contig_sequences, + [contig_sequence[0] for contig_sequence in contig_sequences], + updated_contig_sequences, path_to_directory, - number_of_transcripts_per_contig, + alignment_ids_per_contig, ) @classmethod def generate_gtf_and_protein_files_for_multiple_directories( - cls, - paths_to_directories: List[Path], + cls, paths_to_directories: List[Path], no_indels=False ) -> None: # TODO: Refactor code duplication try: @@ -453,5 +400,9 @@ def generate_gtf_and_protein_files_for_multiple_directories( ) with multiprocessing.Pool(n_processes) as pool: pool.map( - cls.generate_gtf_and_protein_files_for_directory, paths_to_directories + partial( + cls.generate_gtf_and_protein_files_for_directory, + no_indels=no_indels, + ), + paths_to_directories, )