Skip to content

Commit

Permalink
Enhanced stereo read/write support in SDF files. (rdkit#2022)
Browse files Browse the repository at this point in the history
* add a couple test files

* backup

* first pass at some theory documentation

* it's a draft

* Update enhanced stereochemistry documentation

Adds initial target use case and caveats about the tentative
nature of the current implementation.

* Support read/write of molfile enhanced stereochemistry

This includes reading and writing of enhanced stereochemistry
from v3000 molfiles (sdf). Enhanced stereochemistry encodes
the relative configuration of stereocenters, allowing
representation of racemic mixtures and compounds with
unknown absolute stereochemistry.

It does not include:
* Python wrapping
* invalidation of the enhanced stereochemistry
* use of enhanced stereochemistry in search
* depiction of enhanced stereochemistry.

* Update to reflect changes from rdkit#1971

* change names of enum elements to allow compilation in VS2017

I think it's also clearer to do things this way

* Addressed most review comments.

* Run missed test "testEnhancedStereoChemistry"
* In tests, added size checks to group equality checks
* Updated copyright statements
* Deleted mol created for a test
* Use perfect forwarding in RWMol::setStereoGroups()
* use references for stereo groups that are checked in write and pickle
* Updated stereogroup.h in hopes of fixing compilation on Windows.
* clang-format

* try allowing a switch to boost regex and requiring it for g++-4.8

* do a better job of that

* typo

* Code review comments. Updated Copyright notice.

* When an atom is deleted, delete stereo groups containing it.

Also updates StereoGroup to use accessors instead of
constant member attributes. This allows StereoGroups to be moved.

* RDKit style guide

* Add header required on Windows.

* get the SWIG wrappers to build
  • Loading branch information
d-b-w authored and greglandrum committed Sep 26, 2018
1 parent ad6ce82 commit eaa44b4
Show file tree
Hide file tree
Showing 37 changed files with 691 additions and 42 deletions.
5 changes: 3 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ before_install:
- if [[ "$CONDA_PYTHON_VERSION" == "3.6" ]] ; then export PYMAJOR="3"; fi
- if [[ "$CONDA_PYTHON_VERSION" == "2.7" ]] ; then export PYMAJOR="2"; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then wget http://repo.continuum.io/miniconda/Miniconda$PYMAJOR-4.4.10-Linux-x86_64.sh -O miniconda.sh; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then wget http://repo.continuum.io/miniconda/Miniconda$PYMAJOR-4.4.10-MacOSX-x86_64.sh -O miniconda.sh; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then wget http://repo.continuum.io/miniconda/Miniconda$PYMAJOR-4.4.10-MacOSX-x86_64.sh -O miniconda.sh; fi
- bash miniconda.sh -b -p $HOME/conda
- export PATH="$HOME/conda/bin:$PATH"
- hash -r
Expand Down Expand Up @@ -82,7 +82,8 @@ script:
- cd build
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXX="clang++-3.9" ; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CC="clang-3.9" ; fi
- cmake -D Python_ADDITIONAL_VERSIONS=$CONDA_PYTHON_VERSION -D PYTHON_EXECUTABLE=$PYTHON -D PYTHON_LIBRARY=`find $PY_PREFIX -name "libpython$CONDA_PYTHON_VERSION*.so"` -D PYTHON_NUMPY_INCLUDE_PATH=$PY_SP_DIR/numpy/core/include -D BOOST_ROOT=$PY_PREFIX -D Boost_NO_SYSTEM_PATHS=ON -D RDK_BUILD_AVALON_SUPPORT=ON -D RDK_BUILD_INCHI_SUPPORT=ON -DRDK_BUILD_THREADSAFE_SSS=on -DRDK_TEST_MULTITHREADED=on -DRDK_INSTALL_STATIC_LIBS=OFF -DRDK_BUILD_SWIG_WRAPPERS=OFF -DRDK_SWIG_STATIC=OFF -DRDK_BUILD_PYTHON_WRAPPERS=OFF -DRDK_BUILD_FREESASA_SUPPORT=ON ..
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export REGEX_EXTRA="-DRDK_USE_BOOST_REGEX=ON" ; fi
- cmake $REGEX_EXTRA -D Python_ADDITIONAL_VERSIONS=$CONDA_PYTHON_VERSION -D PYTHON_EXECUTABLE=$PYTHON -D PYTHON_LIBRARY=`find $PY_PREFIX -name "libpython$CONDA_PYTHON_VERSION*.so"` -D PYTHON_NUMPY_INCLUDE_PATH=$PY_SP_DIR/numpy/core/include -D BOOST_ROOT=$PY_PREFIX -D Boost_NO_SYSTEM_PATHS=ON -D RDK_BUILD_AVALON_SUPPORT=ON -D RDK_BUILD_INCHI_SUPPORT=ON -DRDK_BUILD_THREADSAFE_SSS=on -DRDK_TEST_MULTITHREADED=on -DRDK_INSTALL_STATIC_LIBS=OFF -DRDK_BUILD_SWIG_WRAPPERS=OFF -DRDK_SWIG_STATIC=OFF -DRDK_BUILD_PYTHON_WRAPPERS=OFF -DRDK_BUILD_FREESASA_SUPPORT=ON ..
- make -j2 VERBOSE=3
- make install
- ls "$PY_PREFIX/lib"
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ option(RDK_BUILD_FREESASA_SUPPORT "build the rdkit freesasa wrapper" OFF )
option(RDK_BUILD_COORDGEN_SUPPORT "build the rdkit coordgen wrapper" ON )
option(RDK_BUILD_MOLINTERCHANGE_SUPPORT "build in support for CommonChem molecule interchange" ON )
option(RDK_INSTALL_DEV_COMPONENT "install libraries and headers" ON)
option(RDK_USE_BOOST_REGEX "use boost::regex instead of std::regex (needed for systems with g++-4.8)" OFF)
if(NOT MSVC)
if(RDK_OPTIMIZE_NATIVE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mpopcnt")
Expand Down
3 changes: 2 additions & 1 deletion Code/GraphMol/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ rdkit_library(GraphMol
AtomIterators.cpp BondIterators.cpp Aromaticity.cpp Kekulize.cpp
MolDiscriminators.cpp ConjugHybrid.cpp AddHs.cpp
Matrices.cpp Chirality.cpp RingInfo.cpp Conformer.cpp
Renumber.cpp AdjustQuery.cpp Resonance.cpp
Renumber.cpp AdjustQuery.cpp Resonance.cpp StereoGroup.cpp
new_canon.cpp
SHARED
LINK_LIBRARIES RDGeometryLib RDGeneral
Expand Down Expand Up @@ -39,6 +39,7 @@ rdkit_headers(Atom.h
ROMol.h
RWMol.h
SanitException.h
StereoGroup.h
MonomerInfo.h
new_canon.h
MolBundle.h
Expand Down
11 changes: 10 additions & 1 deletion Code/GraphMol/FileParsers/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ else()
set (maeparser_var "")
endif()

# Optional switch from std::regex to boost::regex for the file parsers.
# When RDK_USE_BOOST_REGEX is on, the RDKIT_USE_BOOST_REGEX preprocessor
# symbol is defined and the Boost regex component is located so that it can
# be appended to the FileParsers link line through ${regex_lib}.
if(RDK_USE_BOOST_REGEX)
# add_definitions() is directory-scoped: it affects the targets declared in
# this CMakeLists.txt, not only the FileParsers library.
add_definitions(-DRDKIT_USE_BOOST_REGEX)
# REQUIRED makes configuration fail if Boost >= 1.58.0 with the regex
# component cannot be found.
find_package(Boost 1.58.0 COMPONENTS regex REQUIRED)
# regex_lib stays unset when the option is off, so ${regex_lib} then expands
# to nothing.
set(regex_lib Boost::regex)
endif()

rdkit_library(FileParsers
Mol2FileParser.cpp
MolFileParser.cpp MolFileStereochem.cpp MolFileWriter.cpp
Expand All @@ -22,7 +28,7 @@ rdkit_library(FileParsers
ProximityBonds.cpp
SequenceParsers.cpp SequenceWriters.cpp
SVGParser.cpp
LINK_LIBRARIES Depictor SmilesParse GraphMol ${maeparser_var})
LINK_LIBRARIES Depictor SmilesParse GraphMol ${maeparser_var} ${regex_lib})

rdkit_headers(FileParsers.h
FileParserUtils.h
Expand Down Expand Up @@ -59,4 +65,7 @@ rdkit_test(testMol2ToMol testMol2ToMol.cpp LINK_LIBRARIES FileParsers SmilesPars

rdkit_test(testSequence testSequence.cpp LINK_LIBRARIES FileParsers SmilesParse GraphMol RDGeneral RDGeometryLib )

rdkit_test(testExtendedStereoParsing testExtendedStereoParsing.cpp
LINK_LIBRARIES FileParsers Depictor GraphMol RDGeneral RDGeometryLib )

rdkit_catch_test(fileParsersCatchTest catch_tests.cpp LINK_LIBRARIES FileParsers SmilesParse GraphMol RDGeneral RDGeometryLib )
98 changes: 89 additions & 9 deletions Code/GraphMol/FileParsers/MolFileParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/RDKitQueries.h>
#include <GraphMol/StereoGroup.h>
#include <RDGeneral/StreamOps.h>
#include <RDGeneral/RDLog.h>

Expand All @@ -29,9 +30,21 @@
#include <RDGeneral/LocaleSwitcher.h>
#include <typeinfo>
#include <exception>
#ifdef RDKIT_USE_BOOST_REGEX
#include <boost/regex.hpp>
using boost::regex;
using boost::regex_match;
using boost::smatch;
#else
#include <regex>
using std::regex;
using std::regex_match;
using std::smatch;
#endif
#include <sstream>
#include <locale>
#include <stdlib.h>
#include <cstdio>

namespace RDKit {
class MolFileUnhandledFeatureException : public std::exception {
Expand Down Expand Up @@ -139,7 +152,7 @@ void completeQueryAndChildren(ATOM_EQUALS_QUERY *query, Atom *tgt,
magicVal);
}
}
void CompleteMolQueries(RWMol *mol, int magicVal = 0xDEADBEEF) {
void completeMolQueries(RWMol *mol, int magicVal = 0xDEADBEEF) {
for (ROMol::AtomIterator ai = mol->beginAtoms(); ai != mol->endAtoms();
++ai) {
if ((*ai)->hasQuery()) {
Expand All @@ -150,6 +163,71 @@ void CompleteMolQueries(RWMol *mol, int magicVal = 0xDEADBEEF) {
}
}

//! Tests whether \c haystack begins with the first \c size characters of
//! \c needle.
/*!
  \param haystack  string whose leading characters are inspected
  \param needle    prefix to look for; at least \c size characters are read
  \param size      number of characters of \c needle to compare

  \return true if \c haystack holds at least \c size characters and its first
          \c size characters equal those of \c needle, false otherwise
*/
bool startsWith(const std::string &haystack, const char *needle, size_t size) {
  // A haystack shorter than the requested prefix can never match.
  if (haystack.size() < size) {
    return false;
  }
  const int cmp = std::char_traits<char>::compare(haystack.data(), needle, size);
  return cmp == 0;
}

//! parse a collection block to find enhanced stereo groups
std::string parseEnhancedStereo(std::istream *inStream, unsigned int &line,
RWMol *mol) {
// Lines like (absolute, relative, racemic):
// M V30 MDLV30/STEABS ATOMS=(2 2 3)
// M V30 MDLV30/STEREL1 ATOMS=(1 12)
// M V30 MDLV30/STERAC1 ATOMS=(1 12)
const regex stereo_label(
R"regex(MDLV30/STE(...)[0-9]* +ATOMS=\(([0-9]+) +(.*)\))regex");

smatch match;
std::vector<StereoGroup> groups;

// Read the collection until the end
auto tempStr = getV3000Line(inStream, line);
boost::to_upper(tempStr);
while (!startsWith(tempStr, "END", 3)) {
// If this line in the collection is part of a stereo group
if (regex_match(tempStr, match, stereo_label)) {
StereoGroupType grouptype = RDKit::StereoGroupType::STEREO_ABSOLUTE;

if (match[1] == "ABS") {
grouptype = RDKit::StereoGroupType::STEREO_ABSOLUTE;
} else if (match[1] == "REL") {
grouptype = RDKit::StereoGroupType::STEREO_OR;
} else if (match[1] == "RAC") {
grouptype = RDKit::StereoGroupType::STEREO_AND;
} else {
std::ostringstream errout;
errout << "Unrecognized stereogroup type : '" << tempStr << "' on line"
<< line;
throw FileParseException(errout.str());
}

const unsigned int count = FileParserUtils::toInt(match[2], true);
std::vector<Atom *> atoms;
std::stringstream ss(match[3]);
unsigned int index;
for (size_t i = 0; i < count; ++i) {
ss >> index;
// atoms are 1 indexed in molfiles
atoms.push_back(mol->getAtomWithIdx(index - 1));
}
groups.emplace_back(grouptype, std::move(atoms));
} else {
// skip collection types we don't know how to read. Only one documented
// is MDLV30/HILITE
BOOST_LOG(rdWarningLog) << "Skipping unrecognized collection type at "
"line "
<< line << ": " << tempStr << std::endl;
}
tempStr = getV3000Line(inStream, line);
}

if (!groups.empty()) {
mol->setStereoGroups(std::move(groups));
}
tempStr = getV3000Line(inStream, line);
return tempStr;
}

//*************************************
//
// Every effort has been made to adhere to MDL's standard
Expand Down Expand Up @@ -2321,15 +2399,17 @@ bool ParseV3000CTAB(std::istream *inStream, unsigned int &line, RWMol *mol,
}

while (tempStr.length() > 5 && tempStr.substr(0, 5) == "BEGIN") {
// skip blocks we don't know how to read
BOOST_LOG(rdWarningLog)
<< "skipping block at line " << line << ": " << tempStr << std::endl;
tempStr = getV3000Line(inStream, line);

while (tempStr.length() < 3 || tempStr.substr(0, 3) != "END") {
if (tempStr.length() > 15 && tempStr.substr(6, 10) == "COLLECTION") {
tempStr = parseEnhancedStereo(inStream, line, mol);
} else {
// skip blocks we don't know how to read
BOOST_LOG(rdWarningLog)
<< "skipping block at line " << line << ": " << tempStr << std::endl;
while (tempStr.length() < 3 || tempStr.substr(0, 3) != "END") {
tempStr = getV3000Line(inStream, line);
}
tempStr = getV3000Line(inStream, line);
}
tempStr = getV3000Line(inStream, line);
}

boost::to_upper(tempStr);
Expand Down Expand Up @@ -2658,7 +2738,7 @@ RWMol *MolDataStreamToMol(std::istream *inStream, unsigned int &line,

if (res->hasProp(common_properties::_NeedsQueryScan)) {
res->clearProp(common_properties::_NeedsQueryScan);
CompleteMolQueries(res);
completeMolQueries(res);
}
}
return res;
Expand Down
61 changes: 50 additions & 11 deletions Code/GraphMol/FileParsers/MolFileWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,13 +111,14 @@ int getQueryBondSymbol(const Bond *bond) {
}
}
}
} else if( qry->getDescription() == "SingleOrAromaticBond" && !qry->getNegation()) {
} else if (qry->getDescription() == "SingleOrAromaticBond" &&
!qry->getNegation()) {
res = 6;
}
}
return res;
}
}
} // namespace

const std::string GetMolFileChargeInfo(const RWMol &mol) {
std::stringstream res;
Expand Down Expand Up @@ -276,9 +277,8 @@ const std::string GetMolFileQueryInfo(const RWMol &mol) {
wrote_query = true;
}
std::string molFileValue;
if (!wrote_query &&
(*atomIt)->getPropIfPresent(common_properties::molFileValue,
molFileValue))
if (!wrote_query && (*atomIt)->getPropIfPresent(
common_properties::molFileValue, molFileValue))
ss << "V " << std::setw(3) << (*atomIt)->getIdx() + 1 << " "
<< molFileValue << std::endl;
}
Expand Down Expand Up @@ -510,7 +510,7 @@ unsigned int getAtomParityFlag(const Atom *atom, const Conformer *conf) {
}
return 0;
}
}
} // namespace

bool hasNonDefaultValence(const Atom *atom) {
if (atom->getNumRadicalElectrons() != 0) return true;
Expand Down Expand Up @@ -656,7 +656,7 @@ class RequiresV3000Exception : public std::runtime_error {
explicit RequiresV3000Exception()
: std::runtime_error("RequiresV3000Exception"){};
};
}
} // namespace

int BondGetMolFileSymbol(const Bond *bond) {
PRECONDITION(bond, "");
Expand Down Expand Up @@ -778,7 +778,7 @@ void GetMolFileBondStereoInfo(const Bond *bond, const INT_MAP_INT &wedgeBonds,
boost::tie(beg, end) =
bond->getOwningMol().getAtomBonds(bond->getBeginAtom());
while (beg != end && !nbrHasDir) {
const Bond* nbrBond = bond->getOwningMol()[*beg];
const Bond *nbrBond = bond->getOwningMol()[*beg];
if (nbrBond->getBondType() == Bond::SINGLE &&
(nbrBond->getBondDir() == Bond::ENDUPRIGHT ||
nbrBond->getBondDir() == Bond::ENDDOWNRIGHT)) {
Expand All @@ -789,7 +789,7 @@ void GetMolFileBondStereoInfo(const Bond *bond, const INT_MAP_INT &wedgeBonds,
boost::tie(beg, end) =
bond->getOwningMol().getAtomBonds(bond->getEndAtom());
while (beg != end && !nbrHasDir) {
const Bond* nbrBond = bond->getOwningMol()[*beg];
const Bond *nbrBond = bond->getOwningMol()[*beg];
if (nbrBond->getBondType() == Bond::SINGLE &&
(nbrBond->getBondDir() == Bond::ENDUPRIGHT ||
nbrBond->getBondDir() == Bond::ENDDOWNRIGHT)) {
Expand Down Expand Up @@ -1009,6 +1009,42 @@ const std::string GetV3000MolFileBondLine(const Bond *bond,
return ss.str();
}

//! Appends a V3000 COLLECTION block describing the molecule's stereo groups.
/*!
  Nothing is appended when the molecule has no stereo groups. Otherwise one
  MDLV30/STE* line is written per group: STEABS for absolute groups, STERELn
  for "or" groups and STERACn for "and" groups, where n counts up from 1
  separately for each of the two numbered kinds.

  \param res   mol block text built so far; the collection is appended to it
  \param tmol  molecule whose stereo groups are written

  NOTE(review): the V3000 line prefix is conventionally "M  V30 " with two
  spaces; confirm the literals below were not whitespace-collapsed.
*/
void appendEnhancedStereoGroups(std::string &res, const RWMol &tmol) {
  const auto &groups = tmol.getStereoGroups();
  if (groups.empty()) {
    return;
  }

  // Independent running suffixes for the STEREL and STERAC labels.
  unsigned nextOrId = 1u;
  unsigned nextAndId = 1u;

  res += "M V30 BEGIN COLLECTION\n";
  for (const auto &sg : groups) {
    res += "M V30 MDLV30/";
    const auto kind = sg.getGroupType();
    if (kind == RDKit::StereoGroupType::STEREO_ABSOLUTE) {
      res += "STEABS";
    } else if (kind == RDKit::StereoGroupType::STEREO_OR) {
      res += "STEREL";
      res += boost::lexical_cast<std::string>(nextOrId);
      ++nextOrId;
    } else if (kind == RDKit::StereoGroupType::STEREO_AND) {
      res += "STERAC";
      res += boost::lexical_cast<std::string>(nextAndId);
      ++nextAndId;
    }

    // Atom list: the member count followed by the members themselves.
    const auto &members = sg.getAtoms();
    res += " ATOMS=(";
    res += boost::lexical_cast<std::string>(members.size());
    for (const auto &member : members) {
      res += ' ';
      // atoms are 1 indexed in molfiles
      res += boost::lexical_cast<std::string>(member->getIdx() + 1);
    }
    res += ")\n";
  }
  res += "M V30 END COLLECTION\n";
}

//------------------------------------------------
//
// gets a mol block as a string
Expand Down Expand Up @@ -1073,7 +1109,8 @@ std::string outputMolToMolBlock(const RWMol &tmol, int confId,
if (forceV3000)
isV3000 = true;
else
isV3000 = (nAtoms > 999) || (nBonds > 999);
isV3000 =
(nAtoms > 999) || (nBonds > 999) || !tmol.getStereoGroups().empty();

// the counts line:
std::stringstream ss;
Expand Down Expand Up @@ -1157,6 +1194,8 @@ std::string outputMolToMolBlock(const RWMol &tmol, int confId,
}
res += "M V30 END BOND\n";
}
appendEnhancedStereoGroups(res, tmol);

res += "M V30 END CTAB\n";
}
res += "M END\n";
Expand Down Expand Up @@ -1219,4 +1258,4 @@ void MolToMolFile(const ROMol &mol, const std::string &fName,
*outStream << outString;
delete outStream;
}
}
} // namespace RDKit
Loading

0 comments on commit eaa44b4

Please sign in to comment.