1. 导入依赖库
from matplotlib.pyplot import figure, imshow, axis
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from math import log10
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from IPython.display import display, Markdown, HTML
%matplotlib inline
2. 载入数据
#load sdf file
data = "CHEMBL952131_EGFR.sdf"
3. 定义描述符计算函数
def sdf_to_desc(data):
fps = []
targets = []
nfps = []
mols=[]
bis = []
for mol in Chem.SDMolSupplier(data):
mols.append(mol)
bi = {}
fps.append(AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, bitInfo=bi)) # fingerprint
targets.append(9.0 - log10(float(mol.GetProp("ACTIVITY")))) # pIC50
bis.append(bi)
for fp in fps:
nfp = np.zeros((1,))
DataStructs.ConvertToNumpyArray(fp, nfp)
nfps.append(nfp)
return (np.array(nfps), np.array(targets), mols, bis)
4. 随机森林回归分析
def rf(x_train, x_test, y_train, y_test):
r = RandomForestRegressor().fit(x_train, y_train)
y_pred = r.predict(x_test)
r2 = r2_score(y_test, y_pred)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print(' RF: R2: {0:f}, RMSE:{1:f}'.format(r2, rmse))
return r
x, y, mols,bis = sdf_to_desc(data)
x_train, x_test, y_train, y_test, mols_train, mols_test, bis_train, bis_test = train_test_split(x, y, mols, bis, test_size=0.1)
r = rf(x_train, x_test, y_train, y_test)
RF: R2: 0.421548, RMSE:0.626370
5. 按特征重要性进行排序
sorted_pos = sorted(zip(range(r.feature_importances_.shape[0]), r.feature_importances_), key=lambda x: x[1], reverse=True)
important_posisions = [t[0] for t in sorted_pos[:20]]
6. 可视化重要特征
i = 0
for bi, mol in zip(bis_test, mols_test):
i += 1
display(Markdown("## Test molecule #{}<h2>".format(i)))
display(mol)
display(Markdown("#### Important fragments"))
frgs = []
for pos in important_posisions:
if bi.get(pos, False):
print("Bit position: {}".format(pos))
display(Draw.DrawMorganBit(mol,pos,bi))