diff --git a/.github/workflows/CI_build.yml b/.github/workflows/CI_build.yml index 7987be9..85f7379 100644 --- a/.github/workflows/CI_build.yml +++ b/.github/workflows/CI_build.yml @@ -8,39 +8,39 @@ on: jobs: first_check: - name: first code check / python-3.8 / ubuntu-latest + name: first code check / python-3.10 / ubuntu-latest runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v1 + uses: actions/setup-python@v5 with: - python-version: 3.8 + python-version: "3.10" - name: Python info run: | which python python --version - name: Build package and create dev environment run: | - python -m pip install --upgrade pip - pip install -e .[dev] + python -m pip install --upgrade pip poetry + poetry install - name: Show pip list run: | pip list - name: Test with coverage run: | - pytest --cov --cov-report term --cov-report xml --junitxml=xunit-result.xml + poetry run pytest --cov --cov-report term --cov-report xml --junitxml=xunit-result.xml - name: Correct coverage paths run: sed -i "s+$PWD/++g" coverage.xml - name: Check style against standards using prospector shell: bash -l {0} - run: prospector -o grouped -o pylint:pylint-report.txt + run: poetry run prospector -o grouped -o pylint:pylint-report.txt - name: Check whether import statements are used consistently shell: bash -l {0} - run: isort --check-only --diff --conda-env spec2vec-dev . - - name: SonarCloud Scan + run: poetry run isort --check-only --diff --conda-env spec2vec-dev . + - name: SonarQube Scan if: github.repository == 'iomega/spec2vec' - uses: sonarsource/sonarcloud-github-action@master + uses: SonarSource/sonarqube-scan-action@master env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} @@ -53,15 +53,15 @@ jobs: fail-fast: false matrix: os: ['ubuntu-latest', 'macos-latest', 'windows-latest'] - python-version: ['3.7', '3.8', '3.9'] + python-version: ['3.10'] exclude: # already tested in first_check job - - python-version: 3.8 + - python-version: "3.10" os: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Python info @@ -70,31 +70,31 @@ jobs: python --version - name: Install dependencies run: | - python -m pip install --upgrade pip + python -m pip install --upgrade pip poetry - name: Build package run: | - pip install wheel twine - python setup.py sdist bdist_wheel + poetry build - name: Test package run: | + pip install twine python -m twine check dist/* + - name: Install dependencies + run: | + python -m pip install --upgrade pip poetry + poetry install - name: Show pip list run: | pip list - - name: Install development dependencies - run: | - pip install -e .[dev] - name: Test run: | - pytest + poetry run pytest - name: Show environment variables shell: bash -l {0} run: | env | sort - name: Build documentation - shell: bash -l {0} run: | - make coverage doctest html + poetry run make coverage doctest html working-directory: readthedocs/ env: SPHINXOPTS: "-n" # enable nit-picky mode @@ -108,22 +108,22 @@ jobs: echo "The code is sufficiently documented with ${UNCOVERED_MEMBERS} uncovered members out of ${UNCOVERED_MEMBERS_ALLOWED} allowed."; anaconda_build: - name: Anaconda build / python-3.7 / ubuntu-latest + name: Anaconda build / python-3.10 / ubuntu-latest runs-on: ubuntu-latest strategy: fail-fast: false needs: first_check steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: "0" - name: Create spec2vec-build environment - uses: conda-incubator/setup-miniconda@v2 + uses: conda-incubator/setup-miniconda@v3 with: activate-environment: spec2vec-build auto-update-conda: true environment-file: conda/environment-build.yml - python-version: 3.8 + python-version: "3.10" - name: Show conda config shell: bash -l {0} run: | @@ -148,11 +148,11 @@ jobs: [ "$RUNNING_OS" = "Windows" ] && export BUILDDIR=$RUNNER_TEMP\\spec2vec\\_build\\ conda config --set anaconda_upload no conda build --no-include-recipe \ - --channel bioconda --channel conda-forge \ + --channel conda-forge --channel bioconda \ --croot ${BUILDDIR} \ ./conda - name: Upload package artifact from build - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: conda-package-artifact path: ${{ runner.temp }}/spec2vec/_build diff --git a/.github/workflows/cffconvert.yml b/.github/workflows/cffconvert.yml new file mode 100644 index 0000000..707a71c --- /dev/null +++ b/.github/workflows/cffconvert.yml @@ -0,0 +1,19 @@ +name: cffconvert + +on: + push: + paths: + - CITATION.cff + +jobs: + validate: + name: "validate" + runs-on: ubuntu-latest + steps: + - name: Check out a copy of the repository + uses: actions/checkout@v2 + + - name: Check whether the citation metadata from CITATION.cff is valid + uses: citation-file-format/cffconvert-github-action@2.0.0 + with: + args: "--validate" diff --git a/.github/workflows/pypi_publish.yml b/.github/workflows/pypi_publish.yml index 6429a7a..8e5bcd0 100644 --- a/.github/workflows/pypi_publish.yml +++ b/.github/workflows/pypi_publish.yml @@ -5,21 +5,11 @@ on: types: [published] jobs: - publish: + build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - name: Set up Python - uses: actions/setup-python@v1 - with: - python-version: 3.7 - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install setuptools wheel twine - python setup.py sdist bdist_wheel - - name: Publish package - uses: pypa/gh-action-pypi-publish@release/v1 - with: - user: __token__ - password: ${{ secrets.PYPI_TOKEN }} + - uses: actions/checkout@v4 + - name: Build and publish to pypi + uses: JRubics/poetry-publish@v1.17 + with: + pypi_token: ${{ secrets.PYPI_TOKEN }} \ No newline at end of file diff --git a/.gitignore b/.gitignore index 677d58a..79665c4 100644 --- a/.gitignore +++ b/.gitignore @@ -16,8 +16,8 @@ pylint-report.txt xunit-result.xml .scannerwork/ -docs/_build -docs/apidocs +readthedocs/_build +readthedocs/api # ide .idea diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f67d1c..77f5874 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.9.0] - 2025-10-20 +### Changed +- Added loss_mz_from and loss_mz_to to SpectrumDocument [#95](https://github.com/iomega/spec2vec/pull/95) +- Updated required matchms version to >=0.27.0 [#95](https://github.com/iomega/spec2vec/pull/95) +- Updated required Python version to >=3.10 [#95](https://github.com/iomega/spec2vec/pull/95) +- Moved to poetry [#95](https://github.com/iomega/spec2vec/pull/95) +- Updated CI_build workflow to use poetry and actions [#95](https://github.com/iomega/spec2vec/pull/95) +- Added sparse array type handling and tests [#95](https://github.com/iomega/spec2vec/pull/95) + +### Fixed + +- Fixed missing keyword in model dict [#95](https://github.com/iomega/spec2vec/pull/95) +- Fixed documentation [#95](https://github.com/iomega/spec2vec/pull/95) + ## [0.8.1] - 2024-08-06 ### Changed - Set max matchms to 0.26.4 @@ -147,7 +161,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fossa configuration - Flowchart -[Unreleased]: https://github.com/iomega/spec2vec/compare/0.8.0...HEAD +[Unreleased]: https://github.com/iomega/spec2vec/compare/0.9.0...HEAD +[0.9.0]: https://github.com/iomega/spec2vec/compare/0.8.1...0.9.0 +[0.8.1]: https://github.com/iomega/spec2vec/compare/0.8.0...0.8.1 [0.8.0]: https://github.com/iomega/spec2vec/compare/0.6.0...0.7.0 [0.7.0]: https://github.com/iomega/spec2vec/compare/0.6.0...0.7.0 [0.6.0]: https://github.com/iomega/spec2vec/compare/0.5.0...0.6.0 diff --git a/CITATION.cff b/CITATION.cff index 21d7f8e..015dc01 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -64,7 +64,7 @@ authors: given-names: Maksym orcid: "https://orcid.org/0000-0003-2056-8018" -cff-version: "1.1.0" +cff-version: 1.2.0 keywords: - Word2Vec - "similarity measures" diff --git a/README.rst b/README.rst index fe98866..3890bbe 100644 --- a/README.rst +++ b/README.rst @@ -66,7 +66,6 @@ Thanks! .. |ReadTheDocs Badge| image:: https://readthedocs.org/projects/spec2vec/badge/?version=latest :alt: Documentation Status - :scale: 100% :target: https://spec2vec.readthedocs.io/en/latest/?badge=latest .. |Sonarcloud Quality Gate Badge| image:: https://sonarcloud.io/api/project_badges/measure?project=iomega_spec2vec&metric=alert_status @@ -139,7 +138,6 @@ dataset. s = msfilters.normalize_intensities(s) s = msfilters.reduce_to_number_of_peaks(s, n_required=10, ratio_desired=0.5, n_max=500) s = msfilters.select_by_mz(s, mz_from=0, mz_to=1000) - s = msfilters.add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0) s = msfilters.require_minimum_number_of_peaks(s, n_required=10) return s @@ -150,7 +148,7 @@ dataset. spectrums = [s for s in spectrums if s is not None] # Create spectrum documents - reference_documents = [SpectrumDocument(s, n_decimals=2) for s in spectrums] + reference_documents = [SpectrumDocument(s, n_decimals=2, loss_mz_from=10.0, loss_mz_to=200.0) for s in spectrums] model_file = "references.model" model = train_new_word2vec_model(reference_documents, iterations=[10, 20, 30], filename=model_file, diff --git a/conda/environment-build.yml b/conda/environment-build.yml index c5baf80..9caabbd 100644 --- a/conda/environment-build.yml +++ b/conda/environment-build.yml @@ -5,4 +5,4 @@ dependencies: - anaconda-client - conda-build - conda-verify - - python >=3.7 + - python >=3.10 diff --git a/conda/environment-dev.yml b/conda/environment-dev.yml index 2996c9f..2628215 100644 --- a/conda/environment-dev.yml +++ b/conda/environment-dev.yml @@ -3,15 +3,23 @@ channels: - bioconda - conda-forge - defaults - - nlesc dependencies: - - gensim >=4.2.0 - - matchms >=0.6.2 - - numba >=0.51 + - python + - gensim >=4.4.0 + - matchms >=0.27.0 + - numba - numpy - pip - - python >=3.7 - scipy - tqdm - - pip: - - -e ..[dev] + - bump2version + - isort>=5.1.0 + - pylint<2.12.0 + - prospector + - pytest + - pytest-cov + - sphinx>=4.0.0 + - sphinx_rtd_theme + - sphinxcontrib-apidoc + - yapf + diff --git a/conda/environment.yml b/conda/environment.yml deleted file mode 100644 index bf9fc6c..0000000 --- a/conda/environment.yml +++ /dev/null @@ -1,13 +0,0 @@ -name: spec2vec -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - gensim >=4.2.0 - - matchms >=0.6.2 - - numba >=0.51 - - numpy - - python >=3.7 - - scipy - - tqdm diff --git a/conda/meta.yaml b/conda/meta.yaml index 000c35a..a7d2e82 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,65 +1,49 @@ {% set name = "spec2vec" %} -{% set version = "0.8.1" %} +{% set version = "0.9.0" %} package: name: {{ name|lower }} version: {{ version }} source: - path: .. - -extra: - channels: - - nlesc - - conda-forge - - bioconda + path: ../ build: noarch: python - preserve_egg_dir: True + script: {{ PYTHON }} -m pip install . -vv --no-deps --no-build-isolation number: 0 - skip: True # [py2k] - script: {{ PYTHON }} -m pip install --no-deps --ignore-installed . -vv requirements: build: - - conda-build - - conda-verify - - pytest-runner - python - - matchms >=0.6.2 - - numpy {{ numpy }} - - setuptools + - poetry host: - - python >=3.7 + - python >=3.10, <3.14 + - poetry - pip - - pytest-runner - - setuptools run: - - gensim >=4.2.0 - - matchms >=0.14.0, <=0.26.4 - - numba >=0.51 - - numpy - - pip - - python >=3.7 - - scipy <=1.10.1 + - python >=3.10, <3.14 + - gensim >=4.4.0 + - matchms >=0.27.0 - tqdm + - lxml >=4.9.3, <6.0.0 + - rdkit >=2024.3.5, <2025.0.0 + - numba >=0.60, <0.62 test: imports: - spec2vec + commands: + - pip check + requires: + - pip about: home: https://github.com/iomega/spec2vec + summary: Word2Vec based similarity measure of mass spectrometry data. license: Apache-2.0 - license_family: APACHE license_file: LICENSE - summary: Word2Vec based similarity measure of mass spectrometry data. - description: Word2Vec based similarity measure of mass spectrometry data. - doc_url: https://spec2vec.readthedocs.io/ - dev_url: https://github.com/iomega/spec2vec extra: recipe-maintainers: - - fdiblen - - florian-huber + - hechth diff --git a/integration-tests/test_user_workflow_spec2vec.py b/integration-tests/test_user_workflow_spec2vec.py index 6012eeb..91d976e 100644 --- a/integration-tests/test_user_workflow_spec2vec.py +++ b/integration-tests/test_user_workflow_spec2vec.py @@ -2,12 +2,12 @@ import gensim import numpy as np from matchms import calculate_scores -from matchms.filtering import (add_losses, add_parent_mass, default_filters, +from matchms.filtering import (add_parent_mass, default_filters, normalize_intensities, reduce_to_number_of_peaks, require_minimum_number_of_peaks, select_by_mz) from matchms.importing import load_from_mgf -from spec2vec import Spec2Vec, SpectrumDocument +from spec2vec import Spec2Vec, SpectrumDocumentWithLosses def test_user_workflow_spec2vec(): @@ -26,7 +26,6 @@ def apply_my_filters(s): s = normalize_intensities(s) s = reduce_to_number_of_peaks(s, n_required=10, ratio_desired=0.5) s = select_by_mz(s, mz_from=0, mz_to=1000) - s = add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0) s = require_minimum_number_of_peaks(s, n_required=5) return s @@ -40,7 +39,7 @@ def apply_my_filters(s): spectrums = [s for s in spectrums if s is not None] # convert spectrums to spectrum 'documents' - documents = [SpectrumDocument(s, n_decimals=1) for s in spectrums] + documents = [SpectrumDocumentWithLosses(s, n_decimals=1, loss_mz_from=10.0, loss_mz_to=200.0) for s in spectrums] model_file = os.path.join(repository_root, "integration-tests", "test_user_workflow_spec2vec.model") if os.path.isfile(model_file): diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8e4206c --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,60 @@ +[tool.poetry] +name = "spec2vec" +version = "0.8.1" +description = "Word2Vec based similarity measure of mass spectrometry data." +authors = ["Florian Huber "] +license = "Apache Software License 2.0" +readme = "README.rst" +repository = "https://github.com/iomega/spec2vec" +keywords = [ + "word2vec", + "mass spectrometry", + "fuzzy matching", + "fuzzy search" +] + +[tool.poetry.dependencies] +python = ">=3.10,<3.14" +gensim = "^4.4.0" +matchms = ">=0.27.0" +tqdm = "^4.66.5" + +[tool.poetry.group.dev.dependencies] +pytest = "^8.3.2" +isort = "^5.13.2" +yapf = "^0.40.2" +pytest-cov = "^5.0.0" +poetry-bumpversion = "^0.3.2" +prospector = {extras = ["with-pyroma"], version = "^1.10.3"} + +[tool.poetry.group.docs.dependencies] +sphinx-rtd-theme = "^2.0.0" +sphinxcontrib-apidoc = "^0.5.0" + +[tool.poetry_bumpversion.file."spec2vec/__version__.py"] +[tool.poetry_bumpversion.file."conda/meta.yaml"] + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.isort] +sections = "FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER" +no_lines_before = "FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER" +lines_after_imports = 2 + +[tool.coverage.run] +branch = true +source = ["spec2vec"] + +[tool.pytest.ini_options] +testpaths = [ + "tests", + "integration-tests", +] + +[build_sphinx] +source-dir = "docs" +build-dir = "docs/_build" +all_files = 1 +builder = "html" diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 9ed1c30..0000000 --- a/setup.cfg +++ /dev/null @@ -1,36 +0,0 @@ -[bumpversion] -current_version = 0.8.0 - -[bumpversion:file:conda/meta.yaml] -search = set version = "{current_version}" -replace = set version = "{new_version}" - -[bumpversion:file:spec2vec/__version__.py] -search = __version__ = '{current_version}' -replace = __version__ = '{new_version}' - -[isort] -sections = FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER -no_lines_before = FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER -lines_after_imports = 2 - -[metadata] -description-file = README.rst - -[aliases] -test = pytest - -[coverage:run] -branch = True -source = spec2vec - -[tool:pytest] -testpaths = tests integration-tests -python_classes = *TestSuite -junit_family = xunit2 - -[build_sphinx] -source-dir = docs -build-dir = docs/_build -all_files = 1 -builder = html diff --git a/setup.py b/setup.py deleted file mode 100644 index 97e809f..0000000 --- a/setup.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python -import os -from setuptools import find_packages, setup - - -here = os.path.abspath(os.path.dirname(__file__)) - -version = {} -with open(os.path.join(here, "spec2vec", "__version__.py")) as f: - exec(f.read(), version) - -with open("README.rst") as readme_file: - readme = readme_file.read() - -setup( - name="spec2vec", - version=version["__version__"], - description="Word2Vec based similarity measure of mass spectrometry data.", - long_description=readme, - long_description_content_type="text/x-rst", - author="Spec2Vec developer team", - author_email="florian.huber@hs-duesseldorf.de", - url="https://github.com/iomega/spec2vec", - packages=find_packages(), - include_package_data=True, - license="Apache Software License 2.0", - zip_safe=False, - keywords=[ - "word2vec", - "mass spectrometry", - "fuzzy matching", - "fuzzy search" - ], - classifiers=[ - "Development Status :: 4 - Beta", - "Intended Audience :: Education", - "Intended Audience :: Science/Research", - "Intended Audience :: Developers", - "License :: OSI Approved :: Apache Software License", - "Natural Language :: English", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - ], - test_suite="tests", - python_requires='>=3.7', - install_requires=[ - "gensim >=4.2.0", - "matchms >=0.14.0,<=0.26.4", - "numba >=0.51", - "numpy", - "scipy <=1.10.1", - "tqdm", - ], - extras_require={"dev": ["bump2version", - "isort>=5.1.0", - "pylint<2.12.0", - "prospector[with_pyroma]", - "pytest", - "pytest-cov", - "sphinx>=4.0.0", - "sphinx_rtd_theme", - "sphinxcontrib-apidoc", - "yapf",], - } -) diff --git a/spec2vec/Spec2Vec.py b/spec2vec/Spec2Vec.py index cf009c9..428b82b 100644 --- a/spec2vec/Spec2Vec.py +++ b/spec2vec/Spec2Vec.py @@ -4,6 +4,7 @@ from gensim.models import Word2Vec from matchms import Spectrum from matchms.similarity.BaseSimilarity import BaseSimilarity +from sparsestack import StackedSparseArray from tqdm import tqdm from spec2vec.serialization import Word2VecLight from spec2vec.SpectrumDocument import SpectrumDocument @@ -29,7 +30,6 @@ class Spec2Vec(BaseSimilarity): import os import gensim from matchms import calculate_scores - from matchms.filtering import add_losses from matchms.filtering import default_filters from matchms.filtering import normalize_intensities from matchms.filtering import require_minimum_number_of_peaks @@ -45,7 +45,6 @@ def spectrum_processing(s): s = normalize_intensities(s) s = select_by_mz(s, mz_from=0, mz_to=1000) s = select_by_intensity(s, intensity_from=0.01) - s = add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0) s = require_minimum_number_of_peaks(s, n_required=5) return s @@ -77,7 +76,7 @@ def spectrum_processing(s): .. testoutput:: - ['CCMSLIB00001058300', 'CCMSLIB00001058289', 'CCMSLIB00001058303', ... + ['CCMSLIB00001058430', 'CCMSLIB00001058367', 'CCMSLIB00001058433', ... """ def __init__(self, model: Union[Word2Vec, Word2VecLight], intensity_weighting_power: Union[float, int] = 0, @@ -176,7 +175,13 @@ def matrix(self, references: Union[List[SpectrumDocument], List[Spectrum]], spec2vec_similarity = cosine_similarity_matrix(reference_vectors, query_vectors) - return spec2vec_similarity + if array_type == "numpy": + return spec2vec_similarity + if array_type == "sparse": + sparse = StackedSparseArray(n_rows, n_cols) + sparse.add_dense_matrix(spec2vec_similarity, "") + return sparse + raise NotImplementedError("Only 'numpy' and 'sparse' array types are supported.") @staticmethod def _get_word_decimals(model): diff --git a/spec2vec/SpectrumDocument.py b/spec2vec/SpectrumDocument.py index 4c0c2da..66ba46b 100644 --- a/spec2vec/SpectrumDocument.py +++ b/spec2vec/SpectrumDocument.py @@ -1,4 +1,4 @@ -from typing import Optional +from matchms import Spectrum from matchms.Spikes import Spikes from .Document import Document @@ -6,7 +6,7 @@ class SpectrumDocument(Document): """Create documents from spectra. - Every peak (and loss) positions (m/z value) will be converted into a string "word". + Every peak positions (m/z value) will be converted into a string "word". The entire list of all peak words forms a spectrum document. Peak words have the form "peak@100.32" (for n_decimals=2), and losses have the format "loss@100.32". Peaks with identical resulting strings will not be merged, hence same words can @@ -38,7 +38,7 @@ class SpectrumDocument(Document): [100. 150. 200.51] substance1 """ - def __init__(self, spectrum, n_decimals: int = 2): + def __init__(self, spectrum: Spectrum, n_decimals: int = 2): """ Parameters @@ -50,31 +50,24 @@ def __init__(self, spectrum, n_decimals: int = 2): The default is 2, which would convert a peak at 100.387 into the word "peak@100.39". """ - self.n_decimals = n_decimals + self.n_decimals: int = n_decimals self.weights = None super().__init__(obj=spectrum) self._add_weights() + self._obj: Spectrum = self._obj - def _make_words(self): + def _make_words(self) -> list[str]: """Create word from peaks (and losses).""" peak_words = [f"peak@{mz:.{self.n_decimals}f}" for mz in self._obj.peaks.mz] - if self._obj.losses is not None: - loss_words = [f"loss@{mz:.{self.n_decimals}f}" for mz in self._obj.losses.mz] - else: - loss_words = [] - self.words = peak_words + loss_words + self.words = peak_words return self - def _add_weights(self): + def _add_weights(self) -> list[float]: """Add peaks (and loss) intensities as weights.""" assert self._obj.peaks.intensities.max() <= 1, "peak intensities not normalized" peak_intensities = self._obj.peaks.intensities.tolist() - if self._obj.losses is not None: - loss_intensities = self._obj.losses.intensities.tolist() - else: - loss_intensities = [] - self.weights = peak_intensities + loss_intensities + self.weights = peak_intensities return self def get(self, key: str, default=None): @@ -93,11 +86,6 @@ def metadata(self): """Return metadata of original spectrum.""" return self._obj.metadata - @property - def losses(self) -> Optional[Spikes]: - """Return losses of original spectrum.""" - return self._obj.losses - @property def peaks(self) -> Spikes: """Return peaks of original spectrum.""" diff --git a/spec2vec/SpectrumDocumentWithLosses.py b/spec2vec/SpectrumDocumentWithLosses.py new file mode 100644 index 0000000..8cdc514 --- /dev/null +++ b/spec2vec/SpectrumDocumentWithLosses.py @@ -0,0 +1,32 @@ +from .SpectrumDocument import SpectrumDocument + + +class SpectrumDocumentWithLosses(SpectrumDocument): + def __init__(self, spectrum, n_decimals: int = 2, loss_mz_from: int = 10, loss_mz_to: int = 200): + self._loss_mz_from = loss_mz_from + self._loss_mz_to = loss_mz_to + super().__init__(spectrum, n_decimals) + + + def _make_words(self): + """Create word from peaks (and losses).""" + peak_words = [f"peak@{mz:.{self.n_decimals}f}" for mz in self._obj.peaks.mz] + loss_words = [f"loss@{mz:.{self.n_decimals}f}" for mz in self.losses.mz] + self.words = peak_words + loss_words + return self + + + def _add_weights(self): + """Add peaks (and loss) intensities as weights.""" + assert self._obj.peaks.intensities.max() <= 1, "peak intensities not normalized" + + peak_intensities = self._obj.peaks.intensities.tolist() + loss_intensities = self.losses.intensities.tolist() + self.weights = peak_intensities + loss_intensities + return self + + + @property + def losses(self): + """Return losses of original spectrum.""" + return self._obj.compute_losses(self._loss_mz_from, self._loss_mz_to) diff --git a/spec2vec/__init__.py b/spec2vec/__init__.py index 659e2bf..e1596af 100644 --- a/spec2vec/__init__.py +++ b/spec2vec/__init__.py @@ -4,6 +4,7 @@ from .logging_functions import _init_logger from .Spec2Vec import Spec2Vec from .SpectrumDocument import SpectrumDocument +from .SpectrumDocumentWithLosses import SpectrumDocumentWithLosses from .vector_operations import calc_vector @@ -16,5 +17,6 @@ "Document", "serialization", "SpectrumDocument", + "SpectrumDocumentWithLosses", "Spec2Vec", ] diff --git a/spec2vec/__version__.py b/spec2vec/__version__.py index ef72cc0..e4e49b3 100644 --- a/spec2vec/__version__.py +++ b/spec2vec/__version__.py @@ -1 +1 @@ -__version__ = '0.8.1' +__version__ = '0.9.0' diff --git a/spec2vec/serialization/model_importing.py b/spec2vec/serialization/model_importing.py index 8459ad3..0ef074f 100644 --- a/spec2vec/serialization/model_importing.py +++ b/spec2vec/serialization/model_importing.py @@ -38,7 +38,7 @@ def build(self) -> KeyedVectors: def from_dict(self, dictionary: dict): expected_keys = {"vector_size", "__numpys", "__scipys", "__ignoreds", "__recursive_saveloads", - "index_to_key", "norms", "key_to_index", "__weights_format"} + "index_to_key", "norms", "key_to_index", "__weights_format"} #, "mapfile_path" if dictionary.keys() == expected_keys: self.__dict__ = dictionary elif expected_keys.symmetric_difference(dictionary.keys()) == {"next_index"}: # backward compatibility diff --git a/tests/test_model_building.py b/tests/test_model_building.py index 53aa820..02ff113 100644 --- a/tests/test_model_building.py +++ b/tests/test_model_building.py @@ -8,6 +8,17 @@ train_new_word2vec_model) +@pytest.fixture +def documents(): + documents = [] + for i in range(100): + spectrum = Spectrum(mz=np.linspace(i, 9+i, 10), + intensities=np.ones((10)).astype("float"), + metadata={}) + documents.append(SpectrumDocument(spectrum, n_decimals=1)) + return documents + + def test_set_learning_rate_decay(): """Test if correct alpha and min_alpha are calculated.""" alpha, min_alpha = set_learning_rate_decay(0.5, 0.05, 8) @@ -22,15 +33,9 @@ def test_set_learning_rate_decay_rate_too_high(): assert min_alpha == 0.0, "Expected different min_alpha" -def test_train_new_word2vec_model(): +def test_train_new_word2vec_model(documents): """Test training of a dummy model.""" # Create fake corpus - documents = [] - for i in range(100): - spectrum = Spectrum(mz=np.linspace(i, 9+i, 10), - intensities=np.ones((10)).astype("float"), - metadata={}) - documents.append(SpectrumDocument(spectrum, n_decimals=1)) model = train_new_word2vec_model(documents, iterations=20, vector_size=20, progress_logger=False) assert model.sg == 0, "Expected different default value." @@ -44,16 +49,9 @@ def test_train_new_word2vec_model(): assert model.wv.get_vector(documents[0].words[1]).shape[0] == 20, "Expected differnt vector size." -def test_train_new_word2vec_model_with_logger_and_saving(tmp_path): +def test_train_new_word2vec_model_with_logger_and_saving(tmp_path, documents): """Test training of a dummy model and save it.""" # Create fake corpus - documents = [] - for i in range(100): - spectrum = Spectrum(mz=np.linspace(i, 9+i, 10), - intensities=np.ones((10)).astype("float"), - metadata={}) - documents.append(SpectrumDocument(spectrum, n_decimals=1)) - # Train model and write to file filename = os.path.join(tmp_path, "test.model") model = train_new_word2vec_model(documents, iterations=20, filename=filename, vector_size=20, progress_logger=True) @@ -74,18 +72,11 @@ def test_train_new_word2vec_model_with_logger_and_saving(tmp_path): assert model.wv.get_vector(documents[0].words[1]).shape[0] == 20, "Expected differnt vector size." -def test_train_new_word2vec_model_wrong_entry(): +def test_train_new_word2vec_model_wrong_entry(documents): """Test training of a dummy model with not-accepted gensim argument entry.""" # Create fake corpus - documents = [] - for i in range(10): - spectrum = Spectrum(mz=np.linspace(i, 9+i, 10), - intensities=np.ones((10)).astype("float"), - metadata={}) - documents.append(SpectrumDocument(spectrum, n_decimals=1)) - with pytest.raises(AssertionError) as msg: - _ = train_new_word2vec_model(documents, iterations=20, alpha=0.01, + _ = train_new_word2vec_model(documents[:10], iterations=20, alpha=0.01, progress_logger=False) expected_message_part = "Expect 'learning_rate_initial' instead of 'alpha'." diff --git a/tests/test_model_serialization.py b/tests/test_model_serialization.py index fff2e8b..40e87a1 100644 --- a/tests/test_model_serialization.py +++ b/tests/test_model_serialization.py @@ -1,11 +1,13 @@ import os +from pathlib import Path from unittest.mock import MagicMock, patch import numpy as np import pytest from gensim.models import Word2Vec from matchms import Spectrum, calculate_scores from scipy.sparse import coo_matrix, csc_matrix, csr_matrix -from spec2vec import Spec2Vec +from spec2vec import Spec2Vec, SpectrumDocument +from spec2vec.model_building import train_new_word2vec_model from spec2vec.serialization import Word2VecLight, export_model, import_model @@ -21,6 +23,25 @@ def model(request, test_dir): model.wv.vectors = scipy_matrix_builder[request.param](model.wv.vectors) return model +@pytest.fixture +def new_model(): + documents = [] + for i in range(100): + spectrum = Spectrum(mz=np.linspace(i, 9+i, 10), + intensities=np.ones((10)).astype("float"), + metadata={}) + documents.append(SpectrumDocument(spectrum, n_decimals=1)) + return train_new_word2vec_model(documents, iterations=20, vector_size=20, + progress_logger=False) + +@pytest.fixture +def new_model_on_disk(new_model, tmp_path) -> [Path, Path, Word2Vec]: + outfile_model = tmp_path / "model.json" + outfile_weights = tmp_path / "model.npy" + export_model(new_model, outfile_model, outfile_weights) + return outfile_model, outfile_weights, new_model + + def write_read_model(model, tmp_path): model_file = tmp_path / "model.json" @@ -116,3 +137,21 @@ def test_reloaded_model_computes_scores(model, tmp_path): scores_reloaded = list(calculate_scores(references, queries, spec2vec_reloaded)) assert scores == scores_reloaded + + +def test_export_model(tmp_path, new_model): + outfile_model = tmp_path / "model.json" + outfile_weights = tmp_path / "model.npy" + + export_model(new_model, outfile_model, outfile_weights) + + assert Path.exists(outfile_model) + assert Path.exists(outfile_weights) + + +@pytest.mark.skip +def test_import_model(new_model_on_disk): + model_path, weights_path, expected = new_model_on_disk + + actual = import_model(model_path, weights_path) + assert actual == expected \ No newline at end of file diff --git a/tests/test_spec2vec.py b/tests/test_spec2vec.py index 34b680a..cba7bb1 100644 --- a/tests/test_spec2vec.py +++ b/tests/test_spec2vec.py @@ -6,49 +6,64 @@ from spec2vec import Spec2Vec, SpectrumDocument -def test_spec2vec_pair_method_spectrum_entry(): - """Test if pair of two Spectrums is handled correctly""" +@pytest.fixture +def spectra(): spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]), intensities=np.array([0.7, 0.2, 0.1]), metadata={'id': 'spectrum1'}) spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]), intensities=np.array([0.4, 0.2, 0.1]), metadata={'id': 'spectrum2'}) + + return spectrum_1, spectrum_2 + + +@pytest.fixture +def documents(spectra): + return [SpectrumDocument(s, n_decimals=1) for s in spectra] - model = load_test_model() + +@pytest.fixture +def model(): + repository_root = os.path.join(os.path.dirname(__file__), "..") + model_file = os.path.join(repository_root, "integration-tests", "test_user_workflow_spec2vec.model") + return gensim.models.Word2Vec.load(model_file) + + +def test_load_test_model(): + """Load pretrained Word2Vec model.""" + repository_root = os.path.join(os.path.dirname(__file__), "..") + model_file = os.path.join(repository_root, "integration-tests", "test_user_workflow_spec2vec.model") + assert os.path.isfile(model_file), "Expected file not found." + + +def test_spec2vec_pair_method_spectrum_entry(spectra, model): + """Test if pair of two Spectrums is handled correctly""" + spectrum_1, spectrum_2 = spectra spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5) + score01 = spec2vec.pair(spectrum_1, spectrum_2) assert score01 == pytest.approx(0.9936808, 1e-6) score11 = spec2vec.pair(spectrum_2, spectrum_2) assert score11 == pytest.approx(1.0, 1e-9) -def test_spec2vec_pair_method_spectrumdocument_entry(): +def test_spec2vec_pair_method_spectrumdocument_entry(documents, model): """Test if pair of two SpectrumDocuments is handled correctly""" - spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]), - intensities=np.array([0.7, 0.2, 0.1]), - metadata={'id': 'spectrum1'}) - spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]), - intensities=np.array([0.4, 0.2, 0.1]), - metadata={'id': 'spectrum2'}) - - documents = [SpectrumDocument(s, n_decimals=1) for s in [spectrum_1, spectrum_2]] - model = load_test_model() spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5) + score01 = spec2vec.pair(documents[0], documents[1]) assert score01 == pytest.approx(0.9936808, 1e-6) score11 = spec2vec.pair(documents[1], documents[1]) assert score11 == pytest.approx(1.0, 1e-9) -def test_spec2vec_pair_method_none_entry(): +def test_spec2vec_pair_method_none_entry(spectra, model): """Test if wrong input data raises expected exception""" - spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]), - intensities=np.array([0.7, 0.2, 0.1]), - metadata={'id': 'spectrum1'}) + spectrum_1, _ = spectra spectrum_2 = None - model = load_test_model() spec2vec = Spec2Vec(model=model) + with pytest.raises(ValueError) as msg: _ = spec2vec.pair(spectrum_1, spectrum_2) @@ -56,113 +71,56 @@ def test_spec2vec_pair_method_none_entry(): assert expected_msg in str(msg), "Expected different exception" -def test_spec2vec_pair_method_wrong_spectrumdocument_entry(): +def test_spec2vec_pair_method_wrong_spectrumdocument_entry(spectra, model): """Test if SpectrumDocuments with different decimal rounding is handled correctly""" - spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]), - intensities=np.array([0.7, 0.2, 0.1]), - metadata={'id': 'spectrum1'}) - spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]), - intensities=np.array([0.4, 0.2, 0.1]), - metadata={'id': 'spectrum2'}) - - documents = [SpectrumDocument(s, n_decimals=2) for s in [spectrum_1, spectrum_2]] - model = load_test_model() + documents = [SpectrumDocument(s, n_decimals=2) for s in spectra] spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5) + with pytest.raises(AssertionError) as msg: _ = spec2vec.pair(documents[0], documents[1]) expected_msg = "Decimal rounding of input data does not agree with model vocabulary." assert expected_msg in str(msg), "Expected different exception" - +@pytest.mark.parametrize("array_type", ["numpy", "sparse"]) +@pytest.mark.parametrize("is_symmetric", [True, False]) @pytest.mark.parametrize("progress_bar", [True, False]) -def test_spec2vec_matrix_method(progress_bar): +def test_spec2vec_matrix_method(progress_bar, is_symmetric, array_type, documents, model): """Test if matrix of 2x2 SpectrumDocuments is handled correctly. Run with and without progress bar. """ - spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]), - intensities=np.array([0.7, 0.2, 0.1]), - metadata={'id': 'spectrum1'}) - spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]), - intensities=np.array([0.4, 0.2, 0.1]), - metadata={'id': 'spectrum2'}) - - documents = [SpectrumDocument(s, n_decimals=1) for s in [spectrum_1, spectrum_2]] - model = load_test_model() spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5, progress_bar=progress_bar) - scores = spec2vec.matrix(documents, documents) + scores = spec2vec.matrix(documents, documents, array_type=array_type, is_symmetric=is_symmetric) + assert scores[0, 0] == pytest.approx(1.0, 1e-9), "Expected different score." assert scores[1, 1] == pytest.approx(1.0, 1e-9), "Expected different score." assert scores[1, 0] == pytest.approx(0.9936808, 1e-6), "Expected different score." assert scores[0, 1] == pytest.approx(0.9936808, 1e-6), "Expected different score." -def test_spec2vec_matrix_method_symmetric_spectrum_entry(): +def test_spec2vec_matrix_method_symmetric_spectrum_entry(spectra, model): """Test if matrix of 2x2 Spectrums is handled correctly. Run with is_symmetric=True. """ - spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]), - intensities=np.array([0.7, 0.2, 0.1]), - metadata={'id': 'spectrum1'}) - spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]), - intensities=np.array([0.4, 0.2, 0.1]), - metadata={'id': 'spectrum2'}) - - spectrums = [spectrum_1, spectrum_2] - model = load_test_model() spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5) - scores = spec2vec.matrix(spectrums, spectrums, is_symmetric=True) - assert scores[0, 0] == pytest.approx(1.0, 1e-9), "Expected different score." - assert scores[1, 1] == pytest.approx(1.0, 1e-9), "Expected different score." - assert scores[1, 0] == pytest.approx(0.9936808, 1e-6), "Expected different score." - assert scores[0, 1] == pytest.approx(0.9936808, 1e-6), "Expected different score." - - -def test_spec2vec_matrix_method_symmetric_spectrumdocument_entry(): - """Test if matrix of 2x2 SpectrumDocuments is handled correctly. - Run with is_symmetric=True. - """ - spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]), - intensities=np.array([0.7, 0.2, 0.1]), - metadata={'id': 'spectrum1'}) - spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]), - intensities=np.array([0.4, 0.2, 0.1]), - metadata={'id': 'spectrum2'}) + scores = spec2vec.matrix(spectra, spectra, is_symmetric=True) - documents = [SpectrumDocument(s, n_decimals=1) for s in [spectrum_1, spectrum_2]] - model = load_test_model() - spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5) - scores = spec2vec.matrix(documents, documents, is_symmetric=True) assert scores[0, 0] == pytest.approx(1.0, 1e-9), "Expected different score." assert scores[1, 1] == pytest.approx(1.0, 1e-9), "Expected different score." assert scores[1, 0] == pytest.approx(0.9936808, 1e-6), "Expected different score." assert scores[0, 1] == pytest.approx(0.9936808, 1e-6), "Expected different score." -def test_spec2vec_matrix_method_symmetric_wrong_entry(): +def test_spec2vec_matrix_method_symmetric_wrong_entry(spectra, model): """Test if matrix of 2x2 SpectrumDocuments is handled correctly. Run with is_symmetric=True but non symmetric entries. """ - spectrum_1 = Spectrum(mz=np.array([100, 150, 200.]), - intensities=np.array([0.7, 0.2, 0.1]), - metadata={'id': 'spectrum1'}) - spectrum_2 = Spectrum(mz=np.array([100, 140, 190.]), - intensities=np.array([0.4, 0.2, 0.1]), - metadata={'id': 'spectrum2'}) - + spectrum_1, spectrum_2 = spectra documents1 = [SpectrumDocument(s, n_decimals=1) for s in [spectrum_1, spectrum_2]] documents2 = [SpectrumDocument(s, n_decimals=1) for s in [spectrum_2, spectrum_1]] - model = load_test_model() + spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5) expected_msg = "Expected references to be equal to queries for is_symmetric=True" with pytest.raises(AssertionError) as msg: _ = spec2vec.matrix(documents1, documents2, is_symmetric=True) assert expected_msg in str(msg), "Expected different exception message" - - -def load_test_model(): - """Load pretrained Word2Vec model.""" - repository_root = os.path.join(os.path.dirname(__file__), "..") - model_file = os.path.join(repository_root, "integration-tests", "test_user_workflow_spec2vec.model") - assert os.path.isfile(model_file), "Expected file not found." - return gensim.models.Word2Vec.load(model_file) diff --git a/tests/test_spectrum_document.py b/tests/test_spectrum_document.py index 6f2992d..3fae847 100644 --- a/tests/test_spectrum_document.py +++ b/tests/test_spectrum_document.py @@ -1,16 +1,19 @@ import numpy as np import pytest from matchms import Spectrum -from matchms.filtering import add_losses from spec2vec import SpectrumDocument -def test_spectrum_document_init_n_decimals_default_value_no_losses(): - +@pytest.fixture +def spectrum(): mz = np.array([10, 20, 30, 40], dtype="float") intensities = np.array([0, 0.01, 0.1, 1], dtype="float") - metadata = dict(precursor_mz=100.0) + metadata = {"precursor_mz": 100.0, "smiles": "testsmiles"} spectrum = Spectrum(mz=mz, intensities=intensities, metadata=metadata) + return spectrum + + +def test_spectrum_document_init_n_decimals_default_value_no_losses(spectrum): spectrum_document = SpectrumDocument(spectrum) assert spectrum_document.n_decimals == 2, "Expected different default for n_decimals" @@ -21,11 +24,7 @@ def test_spectrum_document_init_n_decimals_default_value_no_losses(): assert next(spectrum_document) == "peak@10.00" -def test_spectrum_document_init_n_decimals_1_no_losses(): - mz = np.array([10, 20, 30, 40], dtype="float") - intensities = np.array([0, 0.01, 0.1, 1], dtype="float") - metadata = dict(precursor_mz=100.0) - spectrum = Spectrum(mz=mz, intensities=intensities, metadata=metadata) +def test_spectrum_document_init_n_decimals_1_no_losses(spectrum): spectrum_document = SpectrumDocument(spectrum, n_decimals=1) assert spectrum_document.n_decimals == 1 @@ -36,54 +35,13 @@ def test_spectrum_document_init_n_decimals_1_no_losses(): assert next(spectrum_document) == "peak@10.0" -def test_spectrum_document_init_default_with_losses(): - """Use default n_decimal and add losses.""" - mz = np.array([10, 20, 30, 40], dtype="float") - intensities = np.array([0, 0.01, 0.1, 1], dtype="float") - metadata = dict(precursor_mz=100.0) - spectrum_in = Spectrum(mz=mz, intensities=intensities, metadata=metadata) - spectrum = add_losses(spectrum_in) - spectrum_document = SpectrumDocument(spectrum) - - assert spectrum_document.n_decimals == 2, "Expected different default for n_decimals" - assert len(spectrum_document) == 8 - assert spectrum_document.words == [ - "peak@10.00", "peak@20.00", "peak@30.00", "peak@40.00", - "loss@60.00", "loss@70.00", "loss@80.00", "loss@90.00" - ] - assert next(spectrum_document) == "peak@10.00" - - -def test_spectrum_document_init_n_decimals_1(): - """Use n_decimal=1 and add losses.""" - mz = np.array([10, 20, 30, 40], dtype="float") - intensities = np.array([0, 0.01, 0.1, 1], dtype="float") - metadata = dict(precursor_mz=100.0) - spectrum_in = Spectrum(mz=mz, intensities=intensities, metadata=metadata) - spectrum = add_losses(spectrum_in) - spectrum_document = SpectrumDocument(spectrum, n_decimals=1) - - assert spectrum_document.n_decimals == 1 - assert len(spectrum_document) == 8 - assert spectrum_document.words == [ - "peak@10.0", "peak@20.0", "peak@30.0", "peak@40.0", - "loss@60.0", "loss@70.0", "loss@80.0", "loss@90.0" - ] - assert next(spectrum_document) == "peak@10.0" - - -def test_spectrum_document_metadata_getter(): +def test_spectrum_document_metadata_getter(spectrum): """Test metadata getter""" - mz = np.array([10, 20, 30, 40], dtype="float") - intensities = np.array([0, 0.01, 0.1, 1], dtype="float") - metadata = {"precursor_mz": 100.0, - "smiles": "testsmiles"} - spectrum_in = Spectrum(mz=mz, intensities=intensities, metadata=metadata) - spectrum_document = SpectrumDocument(spectrum_in, n_decimals=2) + spectrum_document = SpectrumDocument(spectrum, n_decimals=2) assert spectrum_document.n_decimals == 2 assert len(spectrum_document) == 4 - assert spectrum_document.metadata == metadata, "Expected different metadata" + assert spectrum_document.metadata == spectrum.metadata, "Expected different metadata" assert spectrum_document.get("smiles") == "testsmiles", "Expected different metadata" assert spectrum_document.words == [ "peak@10.00", "peak@20.00", "peak@30.00", "peak@40.00" @@ -106,30 +64,12 @@ def test_spectrum_document_metadata_getter_notallowed_key(): assert str(msg.value) == "Key cannot be attribute of SpectrumDocument class" -def test_spectrum_document_peak_getter(): +def test_spectrum_document_peak_getter(spectrum): """Test peak getter""" - mz = np.array([10, 20, 30, 40], dtype="float") - intensities = np.array([0, 0.01, 0.1, 1], dtype="float") - metadata = {"precursor_mz": 100.0} - spectrum_in = Spectrum(mz=mz, intensities=intensities, metadata=metadata) - spectrum_document = SpectrumDocument(spectrum_in, n_decimals=2) + spectrum_document = SpectrumDocument(spectrum, n_decimals=2) assert spectrum_document.words == [ "peak@10.00", "peak@20.00", "peak@30.00", "peak@40.00" ] - assert np.all(spectrum_document.peaks.mz == mz), "Expected different peak m/z" - assert np.all(spectrum_document.peaks.intensities == intensities), "Expected different peaks" - - -def test_spectrum_document_losses_getter(): - """Test losses getter""" - mz = np.array([10, 20, 30, 40], dtype="float") - intensities = np.array([0, 0.01, 0.1, 1], dtype="float") - metadata = {"precursor_mz": 100.0} - spectrum_in = Spectrum(mz=mz, intensities=intensities, metadata=metadata) - spectrum = add_losses(spectrum_in) - spectrum_document = SpectrumDocument(spectrum, n_decimals=2) - assert np.all(spectrum_document.losses.mz == np.array([60., 70., 80., 90.])), \ - "Expected different losses" - assert np.all(spectrum_document.losses.intensities == intensities[::-1]), \ - "Expected different losses" + assert np.all(spectrum_document.peaks.mz == spectrum.mz), "Expected different peak m/z" + assert np.all(spectrum_document.peaks.intensities == spectrum.intensities), "Expected different peaks" diff --git a/tests/test_spectrum_document_with_losses.py b/tests/test_spectrum_document_with_losses.py new file mode 100644 index 0000000..39d2fd6 --- /dev/null +++ b/tests/test_spectrum_document_with_losses.py @@ -0,0 +1,56 @@ +import numpy as np +import pytest +from matchms import Spectrum +from spec2vec import SpectrumDocumentWithLosses + + +@pytest.fixture +def spectrum() -> Spectrum: + mz = np.array([10, 20, 30, 40], dtype="float") + intensities = np.array([0, 0.01, 0.1, 1], dtype="float") + metadata = {"precursor_mz": 100.0} + return Spectrum(mz=mz, intensities=intensities, metadata=metadata) + +def test_spectrum_document_init_default_with_losses(spectrum: Spectrum): + """Use default n_decimal and add losses.""" + spectrum_document = SpectrumDocumentWithLosses(spectrum) + + assert spectrum_document.n_decimals == 2, "Expected different default for n_decimals" + assert len(spectrum_document) == 8 + assert spectrum_document.words == [ + "peak@10.00", "peak@20.00", "peak@30.00", "peak@40.00", + "loss@60.00", "loss@70.00", "loss@80.00", "loss@90.00" + ] + assert next(spectrum_document) == "peak@10.00" + + +def test_spectrum_document_init_n_decimals_1(spectrum: Spectrum): + """Use n_decimal=1 and add losses.""" + spectrum_document = SpectrumDocumentWithLosses(spectrum, n_decimals=1) + + assert spectrum_document.n_decimals == 1 + assert len(spectrum_document) == 8 + assert spectrum_document.words == [ + "peak@10.0", "peak@20.0", "peak@30.0", "peak@40.0", + "loss@60.0", "loss@70.0", "loss@80.0", "loss@90.0" + ] + assert next(spectrum_document) == "peak@10.0" + +def test_spectrum_document_losses_getter(spectrum: Spectrum): + """Test losses getter""" + spectrum_document = SpectrumDocumentWithLosses(spectrum, n_decimals=2) + assert np.all(spectrum_document.losses.mz == np.array([60., 70., 80., 90.])), \ + "Expected different losses" + assert np.all(spectrum_document.losses.intensities == spectrum.intensities[::-1]), \ + "Expected different losses" + + +def test_losses(spectrum: Spectrum): + loss_mz_from = 10 + loss_mz_to = 30 + expected = spectrum.compute_losses(loss_mz_from, loss_mz_to) + + spectrum_document = SpectrumDocumentWithLosses(spectrum, n_decimals=2, loss_mz_from=loss_mz_from, loss_mz_to=loss_mz_to) + actual = spectrum_document.losses + + assert actual == expected