Spaces:
Running
on
A10G
Running
on
A10G
# | |
# calculation of synthetic accessibility score as described in: | |
# | |
# Estimation of Synthetic Accessibility Score of Drug-like Molecules based on Molecular Complexity and Fragment Contributions | |
# Peter Ertl and Ansgar Schuffenhauer | |
# Journal of Cheminformatics 1:8 (2009) | |
# http://www.jcheminf.com/content/1/1/8 | |
# | |
# several small modifications to the original paper are included | |
# particularly slightly different formula for macrocyclic penalty
# and taking into account also molecule symmetry (fingerprint density) | |
# | |
# for a set of 10k diverse molecules the agreement between the original method | |
# as implemented in PipelinePilot and this implementation is r2 = 0.97 | |
# | |
# peter ertl & greg landrum, september 2013 | |
# | |
from __future__ import print_function | |
from rdkit import Chem | |
from rdkit.Chem import rdMolDescriptors | |
from rdkit.six.moves import cPickle | |
from rdkit.six import iteritems | |
import math | |
from collections import defaultdict | |
import os.path as op | |
# Cache of fragment contributions keyed by fingerprint bit id; stays None
# until readFragmentScores() has populated it.
_fscores = None


def readFragmentScores(name='models/fpscores'):
    """Load the fragment-contribution table into the module-level cache.

    The table is read from the gzipped pickle ``<name>.pkl.gz``. Each pickled
    record is a sequence whose first item is a score and whose remaining
    items are the fingerprint bit ids that share that score. The records are
    flattened into a ``{bit_id: float(score)}`` dict stored in ``_fscores``.

    Args:
        name: Path of the data file without the ``.pkl.gz`` suffix. The
            special value ``"fpscores"`` is resolved relative to the
            directory containing this module; any other value is used as
            given.

    Returns:
        None. The result is stored in the module-level ``_fscores``.
    """
    import gzip
    import pickle

    global _fscores
    # generate the full path filename:
    if name == "fpscores":
        name = op.join(op.dirname(__file__), name)
    # NOTE: unpickling can execute arbitrary code -- only point this at a
    # trusted data file.
    # The context manager guarantees the gzip handle is closed, even if
    # unpickling fails.
    with gzip.open('%s.pkl.gz' % name, 'rb') as handle:
        records = pickle.load(handle)
    # Build the lookup first and assign once, so a failure part-way through
    # never leaves the cache holding the raw, unflattened records.
    _fscores = {
        bitId: float(record[0])
        for record in records
        for bitId in record[1:]
    }
def numBridgeheadsAndSpiro(mol, ri=None):
    """Count the bridgehead and spiro atoms of ``mol``.

    Args:
        mol: Molecule handed to the RDKit descriptor functions.
        ri: Currently ignored; the counts come from ``rdMolDescriptors``
            rather than from ring information supplied by the caller.

    Returns:
        A ``(bridgehead_count, spiro_count)`` tuple.
    """
    spiroCount = rdMolDescriptors.CalcNumSpiroAtoms(mol)
    bridgeheadCount = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
    return (bridgeheadCount, spiroCount)
def calculateScore(m):
    """Return the synthetic accessibility score of molecule ``m``.

    The raw score is the sum of three parts:

    * a fragment score: the count-weighted mean of the per-fragment
      contributions looked up in ``_fscores`` (fragments missing from the
      table contribute -4);
    * a complexity penalty built from molecule size, stereo centres, spiro
      atoms, bridgehead atoms and the presence of macrocycles;
    * a symmetry correction that rewards molecules with fewer distinct
      fingerprint features than atoms.

    The raw value is then mapped onto the 1-10 scale, with lower values
    corresponding to molecules that are easier to synthesise.

    The fragment table is loaded on first use via ``readFragmentScores()``
    with its default path if it has not been loaded already.

    Args:
        m: RDKit molecule to score.

    Returns:
        The score as a float, clamped to the range 1.0 to 10.0.

    Raises:
        ValueError: If the Morgan fingerprint of ``m`` has no features
            (e.g. a molecule without atoms), since the mean fragment
            contribution is undefined in that case.
    """
    if _fscores is None:
        readFragmentScores()

    # fragment score
    # 2 is the *radius* of the circular fingerprint
    fp = rdMolDescriptors.GetMorganFingerprint(m, 2)
    fps = fp.GetNonzeroElements()
    score1 = 0.
    nf = 0
    for bitId, count in fps.items():
        nf += count
        score1 += _fscores.get(bitId, -4) * count
    if not nf:
        # Without this guard the division below fails with an opaque
        # ZeroDivisionError.
        raise ValueError(
            'cannot compute a synthetic accessibility score: '
            'the molecule has an empty Morgan fingerprint')
    score1 /= nf

    # features score
    nAtoms = m.GetNumAtoms()
    nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
    ri = m.GetRingInfo()
    nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
    # rings with more than 8 atoms are treated as macrocycles
    nMacrocycles = sum(1 for ring in ri.AtomRings() if len(ring) > 8)

    sizePenalty = nAtoms**1.005 - nAtoms
    stereoPenalty = math.log10(nChiralCenters + 1)
    spiroPenalty = math.log10(nSpiro + 1)
    bridgePenalty = math.log10(nBridgeheads + 1)
    macrocyclePenalty = 0.
    # ---------------------------------------
    # This differs from the paper, which defines:
    #  macrocyclePenalty = math.log10(nMacrocycles+1)
    # This form generates better results when 2 or more macrocycles are present
    if nMacrocycles > 0:
        macrocyclePenalty = math.log10(2)

    score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty

    # correction for the fingerprint density
    # not in the original publication, added in version 1.1
    # to make highly symmetrical molecules easier to synthesise
    score3 = 0.
    if nAtoms > len(fps):
        score3 = math.log(float(nAtoms) / len(fps)) * .5

    sascore = score1 + score2 + score3

    # need to transform "raw" value into scale between 1 and 10
    # (named rawMin/rawMax so the min()/max() builtins are not shadowed)
    rawMin = -4.0
    rawMax = 2.5
    sascore = 11. - (sascore - rawMin + 1) / (rawMax - rawMin) * 9.
    # smooth the 10-end
    if sascore > 8.:
        sascore = 8. + math.log(sascore + 1. - 9.)
    # clamp to the documented range
    if sascore > 10.:
        sascore = 10.0
    elif sascore < 1.:
        sascore = 1.0

    return sascore
def processMols(mols):
    """Print a tab-separated table of SA scores for ``mols`` to stdout.

    A header row is written first, followed by one row per molecule holding
    its SMILES, its ``_Name`` property and its score. Entries that are
    ``None`` are skipped.

    Args:
        mols: Iterable of molecules (``None`` entries allowed).
    """
    print('smiles\tName\tsa_score')
    for mol in (entry for entry in mols if entry is not None):
        score = calculateScore(mol)
        smiles = Chem.MolToSmiles(mol)
        row = smiles + "\t" + mol.GetProp('_Name') + "\t%3f" % score
        print(row)
if __name__ == '__main__':
    # Command-line use: score every molecule of the SMILES file given as the
    # first argument and report timings on stderr.
    import sys
    import time

    # time how long loading the fragment table takes
    loadStart = time.time()
    readFragmentScores("fpscores")
    loadSeconds = time.time() - loadStart

    suppl = Chem.SmilesMolSupplier(sys.argv[1])

    # time how long scoring and printing takes
    calcStart = time.time()
    processMols(suppl)
    calcSeconds = time.time() - calcStart

    print('Reading took %.2f seconds. Calculating took %.2f seconds' % (loadSeconds, calcSeconds),
          file=sys.stderr)
# | |
# Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. | |
# All rights reserved. | |
# | |
# Redistribution and use in source and binary forms, with or without | |
# modification, are permitted provided that the following conditions are | |
# met: | |
# | |
# * Redistributions of source code must retain the above copyright | |
# notice, this list of conditions and the following disclaimer. | |
# * Redistributions in binary form must reproduce the above | |
# copyright notice, this list of conditions and the following | |
# disclaimer in the documentation and/or other materials provided | |
# with the distribution. | |
# * Neither the name of Novartis Institutes for BioMedical Research Inc. | |
# nor the names of its contributors may be used to endorse or promote | |
# products derived from this software without specific prior written permission. | |
# | |
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
# |