# 处理pdb文件 — Process PDB files (original blog-post title, commented out so the file parses)

import os
from math import sqrt
import numpy
import torch
from Bio.PDB import PDBParser
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
device = torch.device("cuda")


class P450Dataset(Dataset):
    """Dataset of padded inter-atomic distance matrices built from PDB files.

    Every PDB file in the given directory is parsed once at construction
    time; for each structure the full pairwise Euclidean distance matrix of
    its atoms is computed, zero-padded to a fixed PAD_SIZE x PAD_SIZE shape,
    and stored as a float32 tensor of shape (1, PAD_SIZE, PAD_SIZE) so every
    sample can be batched together.
    """

    # All distance matrices are padded to this edge length.  It must be at
    # least the atom count of the largest structure in the data directory
    # (the original code hardcoded 4996 for the same reason).
    PAD_SIZE = 4996

    def __init__(self, testp450, transform=None, target_transform=None):
        """Parse every PDB file under *testp450* and precompute its matrix.

        Args:
            testp450: path of the directory containing the .pdb files
                (previously ignored in favour of a hardcoded './testp450';
                now honoured — backward compatible, the caller passes the
                same directory name).
            transform, target_transform: accepted for torchvision-style
                signature compatibility; stored but not applied, matching
                the original behavior.

        Raises:
            ValueError: if a structure has more atoms than PAD_SIZE, which
                previously crashed inside numpy.pad with a cryptic message.
        """
        self.transform = transform
        self.target_transform = target_transform
        self.data = []
        parser = PDBParser()
        for name in os.listdir(testp450):
            # The structure id is arbitrary metadata for Bio.PDB; the
            # original used "1fat" for every file.
            structure = parser.get_structure("1fat", os.path.join(testp450, name))
            # (N, 3) array of atomic coordinates.
            coords = numpy.array(
                [atom.get_coord() for atom in structure.get_atoms()],
                dtype=numpy.float32,
            )
            atom_num = coords.shape[0]
            if atom_num > self.PAD_SIZE:
                raise ValueError(
                    "structure %s has %d atoms, exceeding PAD_SIZE=%d"
                    % (name, atom_num, self.PAD_SIZE)
                )
            # Vectorized pairwise Euclidean distances: broadcasting builds
            # the (N, N, 3) difference tensor in C instead of the original
            # O(N^2) Python double loop plus an intermediate flat array.
            diff = coords[:, numpy.newaxis, :] - coords[numpy.newaxis, :, :]
            dist = numpy.sqrt((diff ** 2).sum(axis=-1))
            # Zero-pad to the fixed size, then add a leading channel dim so
            # each sample is (1, PAD_SIZE, PAD_SIZE) float32, exactly as the
            # original stored it.
            dist = numpy.pad(dist, (0, self.PAD_SIZE - atom_num), 'constant')
            self.data.append(torch.tensor(dist, dtype=torch.float32).unsqueeze(0))
        self.testp450 = testp450

    def __len__(self):
        """Return the number of structures (one sample per PDB file)."""
        return len(self.data)

    def __getitem__(self, item):
        """Return the precomputed padded distance-matrix tensor at *item*."""
        return self.data[item]


# ---- training hyper-parameters ----
num_epochs = 1000
batch_size = 2
learning_rate = 1e-3
total_train_step = 0  # global step counter, used for TensorBoard logging

# Fix: `nn` is used below (nn.MSELoss) but torch.nn was never imported
# anywhere in this file, which made the script die with a NameError.
import torch.nn as nn

# NOTE(review): `autoencoder` is not defined or imported anywhere in this
# file — presumably it lives in a sibling module; confirm before running.
model = autoencoder()
model.to(device)
# Reconstruction loss for the autoencoder.
criterion = nn.MSELoss()
criterion.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Build the dataset from the ./testp450 directory and batch it.
dataset = P450Dataset(testp450="testp450")
dataloader = DataLoader(dataset, batch_size, shuffle=True)
writer = SummaryWriter("./logs_testp450")

# Standard autoencoder training loop: reconstruct each distance matrix and
# minimize the MSE between reconstruction and input.
for epoch in range(num_epochs):
    for batch in dataloader:
        batch = batch.to(device)

        # ---- forward pass ----
        reconstruction = model(batch)
        loss = criterion(reconstruction, batch)

        # ---- backward pass / parameter update ----
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_step += 1
        # Report every 100 steps; log every step to TensorBoard.
        if total_train_step % 100 == 0:
            print("训练次数:{},Loss:{}".format(total_train_step, loss.item()))
        writer.add_scalar("train_loss1000", loss.item(), total_train_step)

writer.close()
# Blog-navigation artifacts from the scraped page, commented out so the file parses:
# 上一篇: SaaS的营销陷阱 (Previous post: The SaaS marketing trap)
# 下一篇: 2011 ACM 0和1思想 (Next post: 2011 ACM zeros-and-ones thinking)