# Small script to collect protein information for my graduation project.
# Put the PDB entry IDs into `pdbselected`. I originally wanted to look up the IDs
# directly from the protein name, but the site has anti-scraping measures, so for
# now the IDs have to be listed by hand.
# To see exactly which page elements are scraped, inspect the page yourself in the
# browser's developer tools.
from selenium import webdriver
import xlwt
import random
import time
import re
# Launch Chrome through the local chromedriver executable.
# NOTE(review): the positional executable-path argument is the Selenium 3 call
# form and was removed in Selenium 4 — confirm the installed Selenium version.
driver=webdriver.Chrome(r'E:\Chrome\Application\chromedriver.exe')
# Change the chromedriver path above (and the ID list below) for your own setup.
# PDB entry IDs to scrape — edit this list for your own selection.
pdbselected = [
    "6VCA", "6VC9", "6TVG", "6TVX", "6TW0", "6TWA",
    "6TWF", "6XUQ", "6XUE", "6XUG", "7JV8", "7JV9",
    "6YE2", "6YE1", "4H1S", "6Z9B", "6Z9D", "3ZU0",
    "6TVE", "6HXW", "6S7F", "6S7H", "4H2I", "4H2B",
    "3ZTV", "4H2F", "4H2G", "4H1Y", "4CD1", "4CD3",
]
# Scrape each PDB entry page and write one spreadsheet row per entry:
# column 0 is the PDB ID, the remaining columns are the split text fields.
work_book = xlwt.Workbook()
work_sheet = work_book.add_sheet("data")

for row, pdb_id in enumerate(pdbselected):
    # Random delay to reduce the chance of being rate-limited or blocked.
    time.sleep(random.uniform(1, 3))
    url = 'http://www1.rcsb.org/structure/' + str(pdb_id)
    driver.get(url)

    # Adjust these XPaths as needed for the data you want to collect.
    contain = driver.find_element_by_xpath("//*[@id='exp_header_0_snapshot']")
    contain_2 = driver.find_element_by_xpath("//*[@class='list-unstyled']")
    # Macromolecule content section.
    contain_3 = driver.find_element_by_xpath("//*[@id='macromoleculeContent']")
    # Other information about the molecule.
    contain_4 = driver.find_element_by_xpath("//*[@id='macromolecule-entityId-1-rowDescription']")

    # Split each scraped section's text into individual fields.
    fields = re.split("[\n:]", contain.text)
    fields += re.split("[\n:]", contain_2.text)
    fields += re.split("[\n:]", contain_3.text)
    fields += re.split("[\n: ]", contain_4.text)

    # First column: the PDB ID itself, then one field per following column.
    work_sheet.write(row, 0, pdb_id)
    for col, value in enumerate(fields, start=1):
        work_sheet.write(row, col, value)

    # BUG FIX: xlwt writes a binary .xls workbook, not CSV — the old name
    # 'test.csv' produced a mislabeled file that spreadsheet tools reject.
    # Saving every iteration acts as a checkpoint, so partial results
    # survive a mid-run failure.
    work_book.save('test.xls')

# Release the browser process instead of leaking it.
driver.quit()