Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions minedatabase/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""Submodule providing exceptions used in the mine database package."""
from rdkit.Chem import Mol
from rdkit.Chem import MolToSmiles


class DisconnectedMoleculeException(ValueError):
    """Exception raised when a molecule is disconnected.

    Attributes
    ----------
    molecule : Mol
        The RDKit molecule that triggered the exception, kept so that
        handlers can inspect it.
    smiles : str
        SMILES string generated from ``molecule``; it is embedded in the
        error message.
    """

    def __init__(self, molecule: Mol):
        self.molecule = molecule
        self.smiles = MolToSmiles(molecule)
        super().__init__(
            f"The provided molecule with SMILES '{self.smiles}' is "
            "disconnected. This is most common when compounds are salts. "
            "If you want to include disconnected molecules, you can set "
            "the `fragmented_mols` parameter to True."
        )

    def __reduce__(self):
        # BaseException pickles/copies itself as ``cls(*self.args)``, and
        # ``args`` only holds the formatted message. Rebuilding from the
        # message would hand a string to ``__init__`` where a molecule is
        # expected, so rebuild from the molecule instead.
        return (type(self), (self.molecule,))
5 changes: 4 additions & 1 deletion minedatabase/filters/similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,9 @@ def gen_df_from_similarity(
for mol_chunk in Chunks(mol_info, 10000):
# Construct targets to sample df
temp_df = pd.DataFrame(mol_chunk, columns=["_id", "SMILES"])
df = df.append(_parallelize_dataframe(temp_df, partial_T_calc, processes))
df = pd.concat(
[df, _parallelize_dataframe(temp_df, partial_T_calc, processes)]
)

# Reset index for CDF calculation
df.reset_index(inplace=True, drop=True)
Expand Down Expand Up @@ -733,6 +735,7 @@ def _compare_target_fps(

Returns cpd_id if the compound is similar enough to a target.
"""

# Generate the fingerprint of a compound and compare to the fingerprints
# of the targets
def fingerprint(fingerprint_method, keyword_dict, smi):
Expand Down
156 changes: 118 additions & 38 deletions minedatabase/pickaxe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
This module generates new compounds from user-specified starting
compounds using a set of SMARTS-based reaction rules.
"""

import csv
import datetime
import os
Expand All @@ -12,7 +13,7 @@
from io import StringIO
from pathlib import Path, PosixPath, WindowsPath
from sys import exit
from typing import List, Set, Tuple, Union
from typing import List, Set, Tuple, Union, Optional

import libsbml
import lxml.etree as etree
Expand Down Expand Up @@ -42,6 +43,7 @@
write_targets_to_mine,
)
from minedatabase.reactions import transform_all_compounds_with_full
from minedatabase.exceptions import DisconnectedMoleculeException


# Default to no errors
Expand Down Expand Up @@ -189,33 +191,36 @@ def __init__(
# cid options
self.cid_num_inchi_blocks = inchikey_blocks_for_cid

print("----------------------------------------")
print("Intializing pickaxe object")
if not self.quiet:
print("----------------------------------------")
print("Intializing pickaxe object")
if database:
# Determine if a specified database is legal
db = MINE(database, self.mongo_uri)
if database in db.client.list_database_names():
if database_overwrite:
# If db exists, remove db from all of core compounds
# and drop db
print(
(
f"Database {database} already exists. "
"Deleting database and removing from core compound"
" mines."
if not self.quiet:
print(
(
f"Database {database} already exists. "
"Deleting database and removing from core compound"
" mines."
)
)
)
db.core_compounds.update_many({}, {"$pull": {"MINES": database}})
db.client.drop_database(database)
self.mine = database
else:
print(
(
f"Warning! Database {database} already exists."
"Specify database_overwrite as true to delete "
"old database and write new."
if not self.quiet:
print(
(
f"Warning! Database {database} already exists."
"Specify database_overwrite as true to delete "
"old database and write new."
)
)
)
exit("Exiting due to database name collision.")
else:
self.mine = database
Expand All @@ -238,8 +243,9 @@ def __init__(
if rule_list:
self._load_operators(rule_list)

print("\nDone intializing pickaxe object")
print("----------------------------------------\n")
if not self.quiet:
print("\nDone intializing pickaxe object")
print("----------------------------------------\n")

def load_targets(
self,
Expand Down Expand Up @@ -275,6 +281,76 @@ def load_targets(

print(f"{len(self.target_smiles)} target compounds loaded\n")

def add_molecule(self, molecule: Mol, identifier: Optional[str] = None) -> str:
    """Add the provided molecule to the pickaxe object after sanitizing it.

    The molecule passed in is never modified: all clean-up steps
    (optional neutralisation, stereochemistry removal, sanitization) are
    applied to a private copy.

    Parameters
    ----------
    molecule : Mol
        RDKit molecule to add.
    identifier : Optional[str] = None
        Identifier for the molecule.
        If None, the molecule will be assigned an identifier equal to
        the string version of the number of compounds in the pickaxe object.

    Returns
    -------
    str
        Identifier of the molecule. The identifier is returned even when
        the molecule is skipped because its SMILES contains neither "C"
        nor "c"; in that case nothing is added to the pickaxe object.

    Raises
    ------
    DisconnectedMoleculeException
        If the molecule is disconnected and fragmented_mols is False.
    AssertionError
        If molecule is not an RDKit Mol or identifier is not a
        non-empty string.
    """
    assert molecule is not None
    assert isinstance(molecule, Mol)

    if identifier is None:
        identifier = str(len(self.compounds))

    assert isinstance(identifier, str)
    assert len(identifier) > 0

    # If compound is disconnected (determined by GetMolFrags
    # from rdkit) and loading of these molecules is not
    # allowed, then don't add to internal dictionary.
    # This is most common when compounds are salts.
    if not self.fragmented_mols and len(GetMolFrags(molecule)) > 1:
        raise DisconnectedMoleculeException(molecule)

    # The steps below edit the molecule in place; operate on a copy so
    # the caller's object is left exactly as it was handed in.
    molecule = Mol(molecule)

    # If specified remove charges (before applying reaction
    # rules later on)
    if self.neutralise:
        molecule = utils.neutralise_charges(molecule)

    assert molecule is not None
    assert isinstance(molecule, Mol)

    RemoveStereochemistry(molecule)

    # SMILES string stored alongside the starting compound
    smile = MolToSmiles(molecule, True)

    # Do not operate on inorganic compounds. This is a plain substring
    # test on the SMILES text for "C" or "c".
    if "C" not in smile and "c" not in smile:
        return identifier

    # resolve potential tautomers and choose first one
    if "n" in smile:
        smile = utils.postsanitize_smiles([smile])[0][0]
        molecule = MolFromSmiles(smile)

    SanitizeMol(molecule)
    self._add_compound(
        cpd_name=identifier,
        smi=smile,
        cpd_type="Starting Compound",
        mol=molecule,
    )

    return identifier

def load_compound_set(self, compound_file: str = None, id_field: str = "id") -> str:
"""Load compounds for expansion into pickaxe.

Expand Down Expand Up @@ -428,11 +504,12 @@ def _load_operators(self, rule_path: str) -> None:
rule["Reactants"]
) or rxn.GetNumProductTemplates() != len(rule["Products"]):
skipped += 1
print(
"The number of coreactants does not match the "
"number of compounds in the SMARTS for reaction "
"rule: " + rule["Name"]
)
if self.errors:
print(
"The number of coreactants does not match the "
"number of compounds in the SMARTS for reaction "
"rule: " + rule["Name"]
)
if rule["Name"] in self.operators:
raise ValueError("Duplicate reaction rule name")
# Update reaction rules dictionary
Expand Down Expand Up @@ -643,9 +720,10 @@ def transform_all(self, processes: int = 1, generations: int = 1) -> None:

# Prune network to only things being expanded
self.prune_network(white_list, True)

print("----------------------------------------")
print(f"Expanding Generation {self.generation + 1}\n")

if not self.quiet:
print("----------------------------------------")
print(f"Expanding Generation {self.generation + 1}\n")

# Starting time for expansion
time_init = time.time()
Expand All @@ -664,23 +742,25 @@ def transform_all(self, processes: int = 1, generations: int = 1) -> None:
]
# No compounds found
if not compound_smiles:
print(
"No compounds to expand in generation "
f"{self.generation + 1}. Finished expanding."
)
if not self.quiet:
print(
"No compounds to expand in generation "
f"{self.generation + 1}. Finished expanding."
)
return None

self._transform_helper(compound_smiles, processes)
self._remove_cofactor_redundancy()

print(
f"Generation {self.generation + 1} finished in"
f" {time.time()-time_init} s and contains:"
)
print(f"\t\t{len(self.compounds) - n_comps} new compounds")
print(f"\t\t{len(self.reactions) - n_rxns} new reactions")
print(f"\nDone expanding Generation: {self.generation + 1}.")
print("----------------------------------------\n")
if not self.quiet:
print(
f"Generation {self.generation + 1} finished in"
f" {time.time()-time_init} s and contains:"
)
print(f"\t\t{len(self.compounds) - n_comps} new compounds")
print(f"\t\t{len(self.reactions) - n_rxns} new reactions")
print(f"\nDone expanding Generation: {self.generation + 1}.")
print("----------------------------------------\n")

self.generation += 1

Expand Down Expand Up @@ -1363,7 +1443,7 @@ def get_reaction_annotation(
+ reaction_annotation.split("\n")
+ [
" </rdf:RDF>",
"</sbml:annotation>"
"</sbml:annotation>",
# "</annotation>"
]
)
Expand Down
5 changes: 2 additions & 3 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
[options]
python_requires = >= 3.7, <3.10
install_requires =
keras
python-libsbml
lxml==4.9
mordred==1.2
lxml
mordred
pandas
pymongo
rdkit
Expand Down
57 changes: 31 additions & 26 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,30 +5,35 @@
with open("README.md", "r") as fh:
long_description = fh.read()

setup(name='minedatabase',
version='2.2.0',
description='Metabolic In silico Network Expansions',
long_description=long_description,
long_description_content_type="text/markdown",
url='https://github.com/tyo-nu/MINE-Database',
author='Kevin Shebek, Jonathan Strutz',
author_email='jonstrutz11@gmail.com',
license='MIT',
packages=setuptools.find_packages(exclude=["docs", "tests"]),
setup(
name="minedatabase",
version="2.2.0",
description="Metabolic In silico Network Expansions",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/tyo-nu/MINE-Database",
author="Kevin Shebek, Jonathan Strutz",
author_email="jonstrutz11@gmail.com",
license="MIT",
packages=setuptools.find_packages(exclude=["docs", "tests"]),
# install_requires=['pymongo'],#, 'rdkit', 'scikit-learn'],
package_data={'minedatabase': ['data/*'],
'minedatabase.NP_Score': ['*.gz'],
'minedatabase.tests': ['data/*'],
},
include_package_data=True,
classifiers=[
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: MIT License',
'Topic :: Scientific/Engineering :: Bio-Informatics',
'Topic :: Scientific/Engineering :: Chemistry',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.8',
],
)
package_data={
"minedatabase": [
"data/*/*.tsv",
"data/feasibility/final_model.h5",
"data/feasibility/vae_model.pth",
"data/feasibility/final_model.json",
"data/*/*.txt",
],
"minedatabase.NP_Score": ["*.gz"],
"minedatabase.tests": ["data/*"],
},
include_package_data=True,
classifiers=[
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Topic :: Scientific/Engineering :: Bio-Informatics",
"Topic :: Scientific/Engineering :: Chemistry",
],
)