Skip to content

Improve SMILES translation for surface adsorbates #2701

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion rmgpy/molecule/converter.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ cimport rmgpy.molecule.molecule as mm
cimport rmgpy.molecule.element as elements


cpdef to_rdkit_mol(mm.Molecule mol, bint remove_h=*, bint return_mapping=*, bint sanitize=*, bint save_order=?)
cpdef to_rdkit_mol(mm.Molecule mol, bint remove_h=*, bint return_mapping=*, bint sanitize=*, bint save_order=?, int X=?)

cpdef mm.Molecule from_rdkit_mol(mm.Molecule mol, object rdkitmol, bint raise_atomtype_exception=?)

Expand Down
6 changes: 4 additions & 2 deletions rmgpy/molecule/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,13 @@
from rmgpy.exceptions import DependencyError


def to_rdkit_mol(mol, remove_h=True, return_mapping=False, sanitize=True, save_order=False):
def to_rdkit_mol(mol, remove_h=True, return_mapping=False, sanitize=True, save_order=False, X=0):
"""
Convert a molecular structure to a RDKit rdmol object. Uses
`RDKit <http://rdkit.org/>`_ to perform the conversion.
Perceives aromaticity and, unless remove_h==False, removes Hydrogen atoms.
X is the atomic number or symbol to use for surface sites
(default is 0, which works well for SMILES, but you might want 78 (Pt) for InChIs).

If return_mapping==True then it also returns a dictionary mapping the
atoms to RDKit's atom indices.
Expand All @@ -69,7 +71,7 @@ def to_rdkit_mol(mol, remove_h=True, return_mapping=False, sanitize=True, save_o
rdkitmol = Chem.rdchem.EditableMol(Chem.rdchem.Mol())
for index, atom in enumerate(mol.vertices):
if atom.element.symbol == 'X':
rd_atom = Chem.rdchem.Atom('Pt') # not sure how to do this with linear scaling when this might not be Pt
rd_atom = Chem.rdchem.Atom(X) # set in the function call. Default 0
else:
rd_atom = Chem.rdchem.Atom(atom.element.symbol)
if atom.element.isotope != -1:
Expand Down
2 changes: 1 addition & 1 deletion rmgpy/molecule/inchi.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,7 @@ def create_augmented_layers(mol):
else:
molcopy = mol.copy(deep=True)

rdkitmol = to_rdkit_mol(molcopy, remove_h=True)
rdkitmol = to_rdkit_mol(molcopy, remove_h=True, X=78)
_, auxinfo = Chem.MolToInchiAndAuxInfo(rdkitmol, options='-SNon') # suppress stereo warnings

hydrogens = [at for at in molcopy.atoms if at.number == 1]
Expand Down
5 changes: 4 additions & 1 deletion rmgpy/molecule/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ def _rdkit_translator(input_object, identifier_type, mol=None):
if identifier_type == 'smi':
rdkitmol = to_rdkit_mol(input_object, sanitize=False)
else:
rdkitmol = to_rdkit_mol(input_object, sanitize=True)
rdkitmol = to_rdkit_mol(input_object, sanitize=True, X=78) # Use Pt (78) for surface sites.
if identifier_type == 'inchi':
output = Chem.inchi.MolToInchi(rdkitmol, options='-SNon')
elif identifier_type == 'inchikey':
Expand Down Expand Up @@ -441,6 +441,9 @@ def _openbabel_translator(input_object, identifier_type, mol=None):
raise ValueError('Unexpected identifier type {0}.'.format(identifier_type))
obmol = to_ob_mol(input_object)
output = ob_conversion.WriteString(obmol).strip()
if identifier_type == 'smi':
# If we use * for surface sites, RDKit can read it again.
output = output.replace('[Pt]','*')
else:
raise ValueError('Unexpected input format. Should be a Molecule or a string.')

Expand Down
14 changes: 14 additions & 0 deletions test/rmgpy/molecule/converterTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,3 +169,17 @@ def test_ob_round_trip(self):
new_mol = from_ob_mol(Molecule(), ob_mol)
assert mol.is_isomorphic(new_mol) or self.test_Hbond_free_mol.is_isomorphic(new_mol)
assert mol.get_element_count() == new_mol.get_element_count()

def test_rdkit_adsorbate_round_trip(self):
"""Test conversion to and from RDKitMol for adsorbates"""
adsorbates = [
Molecule().from_smiles("C*"),
Molecule().from_smiles("[*H]"),
Molecule().from_smiles("CC(=*)CO"),
Molecule().from_smiles("*COC*"),
]
for mol in adsorbates:
rdkit_mol = to_rdkit_mol(mol)
new_mol = from_rdkit_mol(Molecule(), rdkit_mol)
assert mol.is_isomorphic(new_mol)
assert mol.get_element_count() == new_mol.get_element_count()
110 changes: 104 additions & 6 deletions test/rmgpy/molecule/translatorTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -863,8 +863,86 @@ def test_aromatics(self):
assert smiles2 != smiles3
assert smiles1 != smiles3

def test_smiles_rdkit_adsorbates(self):
"""
Test that we can get from an adsorbate
to SMILES and back again using RDKit
"""
def compare(adjlist, smiles, alternatives=[]):
"the adjlist should convert to smiles, but the alternative smiles should give the same molecule"
molecule = Molecule().from_adjacency_list(adjlist)
assert to_smiles(molecule, backend="rdkit") == smiles, f"Expected {smiles}, got {to_smiles(molecule, backend='rdkit')}"
molecule2 = Molecule()
from_smiles(molecule2, smiles, backend="rdkit")
assert molecule2.is_isomorphic(molecule), f"Expected {molecule.to_smiles()} got {molecule2.to_smiles()}"
for alt in alternatives:
from_smiles(molecule2, alt, backend="rdkit")
assert molecule2.is_isomorphic(molecule), f"Expected {molecule.to_smiles()} got {molecule2.to_smiles()}"


adjlist = """
1 C u0 p0 c0 {2,S} {3,S} {4,S} {5,S}
2 X u0 p0 c0 {1,S}
3 H u0 p0 c0 {1,S}
4 H u0 p0 c0 {1,S}
5 H u0 p0 c0 {1,S}
"""
smiles = '*C'
compare(adjlist, smiles, alternatives=['C*'])

adjlist = """
1 C u0 p0 c0 {2,S} {4,S} {5,S} {6,S}
2 C u0 p0 c0 {1,S} {3,S} {7,S} {8,S}
3 X u0 p0 c0 {2,S}
4 H u0 p0 c0 {1,S}
5 H u0 p0 c0 {1,S}
6 H u0 p0 c0 {1,S}
7 H u0 p0 c0 {2,S}
8 H u0 p0 c0 {2,S}
"""
smiles = '*CC'
compare(adjlist, smiles, alternatives=['CC*', 'C(*)C'])

adjlist = """
1 C u0 p0 c0 {2,D} {3,S} {4,S}
2 X u0 p0 c0 {1,D}
3 H u0 p0 c0 {1,S}
4 H u0 p0 c0 {1,S}
"""
smiles = '*=C'
compare(adjlist, smiles, alternatives=['C=*'])

adjlist = """
1 X u0 p0 c0 {2,S}
2 H u0 p0 c0 {1,S}
"""
smiles = '*[H]'
compare(adjlist, smiles, alternatives=['[*H]', '[H]*'])

adjlist = """
1 X u0 p0 c0 {2,D}
2 O u0 p2 c0 {1,D}
"""
smiles = '*=O'
compare(adjlist, smiles, alternatives=['O=*'])

adjlist = """
1 X u0 p0 c0 {2,S}
2 C u0 p0 c0 {1,S} {3,S} {6,S} {7,S}
3 O u0 p2 c0 {2,S} {4,S}
4 C u0 p0 c0 {3,S} {5,S} {8,S} {9,S}
5 X u0 p0 c0 {4,S}
6 H u0 p0 c0 {2,S}
7 H u0 p0 c0 {2,S}
8 H u0 p0 c0 {4,S}
9 H u0 p0 c0 {4,S}
"""
smiles = '*COC*'
compare(adjlist, smiles)


def test_surface_molecule_rdkit(self):
"""Test InChI generation for a surface molecule using RDKit"""
"""Test SMILES generation for a surface molecule using RDKit"""
mol = Molecule().from_adjacency_list(
"""
1 C u0 p0 c0 {2,S} {3,S} {4,S} {5,S}
Expand All @@ -874,12 +952,11 @@ def test_surface_molecule_rdkit(self):
5 X u0 p0 c0 {1,S}
"""
)
smiles = "C[Pt]"

smiles = "*C"
assert to_smiles(mol, backend="rdkit") == smiles

def test_surface_molecule_ob(self):
"""Test InChI generation for a surface molecule using OpenBabel"""
"""Test SMILES generation for a surface molecule using OpenBabel"""
mol = Molecule().from_adjacency_list(
"""
1 C u0 p0 c0 {2,S} {3,S} {4,S} {5,S}
Expand All @@ -889,10 +966,31 @@ def test_surface_molecule_ob(self):
5 X u0 p0 c0 {1,S}
"""
)
smiles = "C[Pt]"

smiles = "C*"
assert to_smiles(mol, backend="openbabel") == smiles

def test_smiles_adsorbates_round_trip(self):
"""
Test that we can get from an adsorbate SMILES and back again,
by whatever back-end is chosen automatically.
"""
def check(s0):
m = Molecule(smiles=s0)
s1 = m.to_smiles()
m2 = Molecule(smiles=s1)
assert m.is_isomorphic(m2)
s2 = m2.to_smiles()
assert s1 == s2
check("*C")
check("*CC")
check("*=C")
check("*[H]")
check("*=O")
check("*COC*")
check("*CNC")
check("*C1CCCC1")
check("*N")
check("*N(Cl)")

class ParsingTest:
def setup_class(self):
Expand Down
Loading