Proxy IPs

API proxy

import requests
from lxml import html  # importing etree directly can fail on some lxml versions, so go through lxml.html instead
etree = html.etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4356.6 Safari/537.36'}

# ip138.com shows your apparent IP inside an iframe; grab that iframe's URL first
source = requests.get('https://www.ip138.com/', headers=headers).text
demo = ''.join(etree.HTML(source).xpath('//iframe/@src'))

# Request one proxy IP from the Mogu proxy API (paste the API URL copied from your Mogu dashboard; this one contains the author's appKey)
source = requests.get('http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=8169392b637f4f9ebeb750f51c4c612a&count=1&expiryDate=0&format=1&newLine=2').json()
print(source)

# Assemble the returned ip and port into the dict format that requests expects
proxies = {
    'http': 'http://' + source['msg'][0]['ip'] + ':' + str(source['msg'][0]['port'])
}

# Fetch the IP-display page through the proxy; it should now report the proxy's IP, not yours
source = requests.get('http:' + demo, headers=headers, proxies=proxies).text
demo = etree.HTML(source).xpath('/html/body/p[1]/a/text()')
demo1 = etree.HTML(source).xpath('/html/body/p[1]/text()[2]')
print(demo)
print(demo1)
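
Extracted proxies like the one above tend to expire within minutes, so in practice it helps to wrap the API call in a small helper and fall back to a fresh proxy when a request fails. A minimal sketch under that assumption, continuing from the imports above; get_proxy and get_with_retry are illustrative names, and YOUR_APPKEY stands in for a real key:

def get_proxy():
    # Ask the API for one fresh proxy and convert it to the dict format requests expects
    data = requests.get('http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=YOUR_APPKEY&count=1&expiryDate=0&format=1&newLine=2').json()
    item = data['msg'][0]
    return {'http': 'http://' + item['ip'] + ':' + str(item['port'])}

def get_with_retry(url, headers, retries=3):
    # Swap in a fresh proxy and try again when the current one has expired
    for _ in range(retries):
        try:
            return requests.get(url, headers=headers, proxies=get_proxy(), timeout=10)
        except requests.RequestException:
            continue
    raise RuntimeError('all proxy attempts failed for ' + url)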

Tunnel proxy

Fixed part (shared setup)

import requests
from multiprocessing import Pool
from lxml import html  # needed for the parsing snippets below
etree = html.etree

# appKey of the Mogu proxy tunnel order (paste your own key here)
appKey = "OWxHV1kyUnVvdlJodVpqNTpHZDhSZkdJb3VWYVdueHJ6"
# Mogu tunnel proxy server address
ip_port = 'secondtransfer.moguproxy.com:9001'

proxy = {"http": "http://" + ip_port, "https": "https://" + ip_port}
headers = {
  "Proxy-Authorization": 'Basic ' + appKey,
  "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
  "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4"}

The following fetches the page source

page = 1  # a single page for demonstration; the multi-process version below loops over pages
source = requests.get('https://tj.fang.anjuke.com/loupan/all/p' + str(page) + '/', headers=headers, proxies=proxy, verify=False, allow_redirects=False).text  # fetch the page source through the tunnel
demo = etree.HTML(source).xpath('//*[@id="container"]/div[2]/div[1]/div[3]/div/div/a[1]/span/text()')
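
With allow_redirects=False, a ban or anti-bot redirect comes back as a 3xx response instead of being silently followed to a verification page, so checking the status code before parsing is a reasonable guard. A sketch added here, not from the original post:

r = requests.get('https://tj.fang.anjuke.com/loupan/all/p1/', headers=headers, proxies=proxy, verify=False, allow_redirects=False)
if r.status_code == 200:
    print(etree.HTML(r.text).xpath('//*[@id="container"]/div[2]/div[1]/div[3]/div/div/a[1]/span/text()'))
else:
    print('blocked or redirected, status:', r.status_code)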

The following runs the crawl in parallel

# Crawl pages in parallel (the original comment said "threads", but multiprocessing.Pool spawns processes)
def index(page):
    source = requests.get('https://tj.fang.anjuke.com/loupan/all/p' + str(page) + '/', headers=headers, proxies=proxy, verify=False, allow_redirects=False).text  # fetch the page source
    demo = etree.HTML(source).xpath('//*[@id="container"]/div[2]/div[1]/div[3]/div/div/a[1]/span/text()')
    print(demo)
    print('===================== page ' + str(page) + ' ==================')

if __name__ == '__main__':
    p = Pool(2)  # number of worker processes
    for page in range(1, 30):  # page through the listings
        p.apply_async(index, args=(page,))
    print('Waiting for all subprocesses done...')
    p.close()
    p.join()
    print('All subprocesses done.')
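
Since the work here is I/O-bound (waiting on HTTP responses), threads are usually the lighter choice; the standard library's multiprocessing.dummy exposes the same Pool API backed by threads, so the switch is a one-line change. A sketch reusing the index function from above:

from multiprocessing.dummy import Pool as ThreadPool  # same Pool interface, backed by threads

if __name__ == '__main__':
    tp = ThreadPool(2)  # two worker threads are plenty for I/O-bound requests
    tp.map(index, range(1, 30))  # map blocks until every page has been fetched
    tp.close()
    tp.join()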

 
