Web Scraper Examples from an Internship

Two small crawlers from an internship: the first pulls the Inhibitory Antibodies category listing from MedChemExpress, and the second walks GlpBio caspase product pages through a pool of free HTTP proxies, recording package size, price, and stock for each product.

Example 1: MedChemExpress Inhibitory Antibodies category listing

# -*- coding: utf-8 -*-
import pandas as pd
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
}
url = 'https://www.medchemexpress.cn/mce_category.shtml?independentCateName=Inhibitory%20Antibodies'

html = requests.get(url, headers=headers).text
tree = etree.HTML(html)
div_list = tree.xpath('//*[@id="page_table_1"]/li')

list_all = []
for div in div_list:
    # each <li> holds one product: category path (mulu), name, and a short description (jianjie)
    mulu = div.xpath('./dl/dt//text()')
    name = div.xpath('./dl/dd/table/tr[1]/th[1]/a/strong/text()')
    jianjie = div.xpath('./dl/dd/table/tr[2]/td/text()')
    # xpath text() already returns plain strings, so there is no need to wrap them
    # in PyQuery; guard against missing fields instead of indexing blindly
    list1 = [' '.join(t.strip() for t in mulu if t.strip()),
             name[0].strip() if name else '',
             jianjie[0].strip() if jianjie else '']
    list_all.append(list1)
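
The listing is collected into list_all but never written out; a minimal sketch to save it with the already-imported pandas (the column names and output filename are my own choice):

df = pd.DataFrame(list_all, columns=['mulu', 'name', 'jianjie'])
df.to_csv('mce_antibodies.csv', index=False, encoding='utf-8-sig')  # utf-8-sig so Excel opens Chinese text cleanly

Example 2: GlpBio caspase product pages, fetched through a pool of free HTTP proxies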
# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""
import time
import requests
from lxml import etree
import pandas as pd


headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
# requests matches proxies by lowercase scheme key ('http'/'https'), and an
# https:// target needs an 'https' entry; free proxies like these go stale quickly
proxy_addrs = ['58.252.195.180:9999', '49.70.89.159:9999', '112.91.79.12:9999',
               '110.18.155.171:9999', '113.128.29.71:9999', '27.156.195.104:9999',
               '121.205.177.96:9999', '163.204.246.82:9999', '114.233.136.28:9999',
               '42.176.132.253:9999']
proxies = [{'http': 'http://' + p, 'https': 'http://' + p} for p in proxy_addrs]
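# The requests below pin proxies[1] and proxies[2]; a variant (my sketch, not
# used further down) picks a random proxy per request so failures spread across
# the pool:
#     import random
#     resp = requests.get(url, headers=headers, proxies=random.choice(proxies), timeout=10)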
url = 'https://www.glpbio.com/research-area/proteases/caspase.html'

page_text = requests.get(url=url, headers=headers, proxies=proxies[1]).text
tree = etree.HTML(page_text)
div_list = tree.xpath('//*[@id="products-list"]/li')
list_all = []
for div in div_list:
    name = div.xpath('string(./span[2]/a)').strip()  # product name
    print(name)
    detail_url = div.xpath('./span[2]/a/@href')  # link to the product detail page, one per item
    if not detail_url:
        continue

    html = requests.get(detail_url[0], headers=headers, proxies=proxies[2]).text
    time.sleep(3)  # pause between detail-page requests to avoid hammering the site
    tree1 = etree.HTML(html)  # parse the detail page
    
    # each row of the packaging table holds td[1] = package size (guige),
    # td[2] = price, td[3] = stock status; note the /tbody/ steps only match
    # if the raw HTML really contains <tbody> tags (browser devtools often
    # insert them even when the page source does not)
    rows = tree1.xpath('//*[@id="super-product-table"]/tbody/tr/td[1]/table/tbody/tr')
    guige_list = [row.xpath('string(./td[1])').strip() for row in rows]
    price_list = [row.xpath('string(./td[2]/div/span)').strip() for row in rows]
    stock_list = [row.xpath('string(./td[3])').strip() for row in rows]
    print(guige_list, price_list, stock_list)

    # one (name, size, price, stock) record per package size
    list_all.extend(zip([name] * len(rows), guige_list, price_list, stock_list))
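
As posted, the second script only prints what it scrapes; a minimal sketch to persist the (name, size, price, stock) rows with the imported pandas (column names and output filename are my own choice):

df = pd.DataFrame(list_all, columns=['name', 'guige', 'price', 'stock'])
df.to_csv('glpbio_caspase.csv', index=False, encoding='utf-8-sig')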
