# -*- coding: utf-8 -*-
import pandas as pd
import time
import requests
from lxml import etree
from pyquery import PyQuery as pq
# --- Scraper 1: MCE "Inhibitory Antibodies" category listing ------------
# Collects [category-path..., product name, brief description] per listing
# item into `list_all`. (Network I/O; results are only accumulated here.)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
}
url = 'https://www.medchemexpress.cn/mce_category.shtml?independentCateName=Inhibitory%20Antibodies'
html = requests.get(url, headers=headers).text
tree = etree.HTML(html)
div_list = tree.xpath('//*[@id="page_table_1"]/li')

list_all = []
for div in div_list:
    # Per-item fields; each xpath returns a (possibly empty) list of text nodes.
    mulu = div.xpath('./dl/dt//text()')                               # category path
    name = div.xpath('./dl/dd/table/tr[1]/th[1]/a/strong/text()')     # product name
    jianjie = div.xpath('./dl/dd/table/tr[2]/td/text()')              # brief description
    # Guard: skip malformed listings instead of raising IndexError on [0].
    if not (mulu and name and jianjie):
        continue
    list1 = []
    # BUG FIX: the original appended mulu[0] len(mulu) times; append each
    # text node of the category path instead.
    for text in mulu:
        list1.append(pq(text).text())
    list1.append(pq(name[0]).text())
    list1.append(pq(jianjie[0]).text())
    list_all.append(list1)
# --- Scraper 2 (originally a separate Spyder temporary script file,
# concatenated below): scrapes spec / price / stock rows from glpbio.com
# product detail pages. The duplicate coding cookie and imports below
# belong to that second script.
import codecs
from lxml import etree
import csv
import re
import time
import requests
import json
from pyquery import PyQuery as pq
import pandas as pd
# --- Scraper 2: glpbio caspase category -> per-product spec/price/stock --
# Builds `list_all` as tuples of (name, spec, price, stock).
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
# Free proxy pool. BUG FIX: `requests` matches proxies by lowercase scheme
# key ('http'/'https') with a scheme-prefixed URL value; the original
# uppercase 'HTTP' keys were silently ignored (requests went direct).
# NOTE(review): the target URL is https, so an 'https' entry would be
# required for a proxy to actually apply — confirm whether proxying is
# intended before adding one (these hosts may be dead).
proxies = [
    {'http': 'http://58.252.195.180:9999'},
    {'http': 'http://49.70.89.159:9999'},
    {'http': 'http://112.91.79.12:9999'},
    {'http': 'http://110.18.155.171:9999'},
    {'http': 'http://113.128.29.71:9999'},
    {'http': 'http://27.156.195.104:9999'},
    {'http': 'http://121.205.177.96:9999'},
    {'http': 'http://163.204.246.82:9999'},
    {'http': 'http://114.233.136.28:9999'},
    {'http': 'http://42.176.132.253:9999'},
]
url = 'https://www.glpbio.com/research-area/proteases/caspase.html'
page_text = requests.get(url=url, headers=headers, proxies=proxies[1]).text
tree = etree.HTML(page_text)
div_list = tree.xpath('//*[@id="products-list"]/li')

list_all = []
for div in div_list:
    name = pq(div.xpath('./span[2]/a')).text()
    print(name)
    # BUG FIX: do not reuse/shadow the outer `url` for the detail link.
    detail_href = div.xpath('./span[2]/a/@href')  # detail-page URL, at most one
    if not detail_href:
        continue
    detail_html = requests.get(detail_href[0], headers=headers, proxies=proxies[2]).text
    time.sleep(3)  # throttle between detail-page requests
    detail_tree = etree.HTML(detail_html)
    # One query for the option rows; the original evaluated this same xpath
    # three times. Extracting all three columns from the same row keeps the
    # spec/price/stock lists aligned by construction.
    rows = detail_tree.xpath('//*[@id="super-product-table"]/tbody/tr/td[1]/table/tbody/tr')
    guige_list = []   # spec / package size (td[1])
    price_list = []   # price (td[2]/div/span)
    stock_list = []   # stock status (td[3])
    for row in rows:
        guige_list.append(pq(row.xpath('./td[1]')).text())
        price_list.append(pq(row.xpath('./td[2]/div/span')).text())
        stock_list.append(pq(row.xpath('./td[3]')).text())
    print(guige_list)
    print(price_list)
    print(stock_list)
    # One tuple per option row, all sharing the product name.
    list_all.extend(
        (name, guige, price, stock)
        for guige, price, stock in zip(guige_list, price_list, stock_list)
    )