RDKit | 可视化重要片段

 

1. 导入依赖库

from matplotlib.pyplot import figure, imshow, axis
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from math import log10
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from IPython.display import display, Markdown, HTML
%matplotlib inline

2. 载入数据

#load sdf file
data = "CHEMBL952131_EGFR.sdf"

3. 定义描述符计算函数

def sdf_to_desc(data):
    fps = []
    targets = []
    nfps = []
    mols=[]
    bis = []

    for mol in Chem.SDMolSupplier(data):
        mols.append(mol)
        bi = {}
        fps.append(AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, bitInfo=bi))  # fingerprint
        targets.append(9.0 - log10(float(mol.GetProp("ACTIVITY"))))  # pIC50
        bis.append(bi)

    for fp in fps:
        nfp = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, nfp)
        nfps.append(nfp)

    return (np.array(nfps), np.array(targets), mols, bis)

4. 随机森林回归分析

def rf(x_train, x_test, y_train, y_test):
    r = RandomForestRegressor().fit(x_train, y_train)
    y_pred = r.predict(x_test)
    r2 = r2_score(y_test, y_pred)
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    print(' RF: R2: {0:f}, RMSE:{1:f}'.format(r2, rmse))
    return r
x, y, mols,bis = sdf_to_desc(data)
x_train, x_test, y_train, y_test, mols_train, mols_test, bis_train, bis_test = train_test_split(x, y, mols, bis, test_size=0.1)
r = rf(x_train, x_test, y_train, y_test)
RF: R2: 0.421548, RMSE:0.626370

5. 按特征重要性进行排序

sorted_pos = sorted(zip(range(r.feature_importances_.shape[0]), r.feature_importances_), key=lambda x: x[1], reverse=True)
important_posisions = [t[0] for t in sorted_pos[:20]]

6. 可视化重要特征

i = 0
for bi, mol in zip(bis_test, mols_test):
    i += 1
    display(Markdown("## Test molecule #{}<h2>".format(i)))
    display(mol)
    display(Markdown("#### Important fragments"))
    frgs = []
    for pos in important_posisions:
        if bi.get(pos, False):
            print("Bit position: {}".format(pos))
            display(Draw.DrawMorganBit(mol,pos,bi))

RDKit | 可视化重要片段


 

RDKit | 可视化重要片段
DrugAI

 

 

上一篇:「联赛模拟测试35」题解


下一篇:POJ 3525 半平面交+二分