API 代理（API proxy — 通过接口获取代理 IP）
import requests
import re
import os
from lxml import html  # importing etree directly raises on some lxml versions, so alias it instead
etree = html.etree  # alias so later code can call etree.HTML(...)
from string import punctuation

# --- commented-out demo: fetch a proxy IP from the Mogu API and browse through it ---
# headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4356.6 Safari/537.36'}
# source=requests.get('https://www.ip138.com/',headers=headers).text
# # print(source)
# demo=etree.HTML(source).xpath('//iframe/@src')
# demo=''.join(demo)
# # print(demo)
# # print(source)
# # Fetch the proxy-IP page
# source=requests.get('http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=8169392b637f4f9ebeb750f51c4c612a&count=1&expiryDate=0&format=1&newLine=2').json()  # paste the API URL copied from the dashboard here
# print(source)
# # Assemble the returned ip:port into a proxies dict
# proxies={
#     'http':source['msg'][0]['ip']+':'+source['msg'][0]['port']
# }
# # print(proxies)
# print('http:'+demo)
# headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4356.6 Safari/537.36'}
# source=requests.get('http:'+demo,headers=headers).text
# demo=etree.HTML(source).xpath('/html/body/p[1]/a/text()')
# demo1=etree.HTML(source).xpath('/html/body/p[1]/text()[2]')
# print(demo)
# print(demo1)
隧道代理
固定部分
import requests
import time
from multiprocessing import Pool

# Mogu tunnel-proxy order key (pasted from the provider dashboard).
appKey = "OWxHV1kyUnVvdlJodVpqNTpHZDhSZkdJb3VWYVdueHJ6"
# Mogu tunnel-proxy gateway address.
ip_port = 'secondtransfer.moguproxy.com:9001'
# Route both http and https traffic through the tunnel gateway.
proxy = {
    "http": "http://" + ip_port,
    "https": "https://" + ip_port,
}
# Proxy-Authorization carries the order key; the rest mimics a desktop browser.
headers = {
    "Proxy-Authorization": 'Basic ' + appKey,
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4",
}
以下为获取页面源码
# source = requests.get('https://tj.fang.anjuke.com/loupan/all/p' + str(page) + '/', headers=headers, proxies=proxy,verify=False, allow_redirects=False).text  # fetch the page source through the tunnel proxy
# demo = etree.HTML(source).xpath('//*[@id="container"]/div[2]/div[1]/div[3]/div/div/a[1]/span/text()')
以下为多进程示例（使用 multiprocessing.Pool 进程池，并非多线程）
# # Parallel crawl using a process pool (multiprocessing.Pool — processes, not threads)
# def index(page):
#     source = requests.get('https://tj.fang.anjuke.com/loupan/all/p' + str(page) + '/', headers=headers, proxies=proxy,verify=False, allow_redirects=False).text  # fetch the page source
#     demo = etree.HTML(source).xpath('//*[@id="container"]/div[2]/div[1]/div[3]/div/div/a[1]/span/text()')
#     print(demo)
#     print('=====================第'+str(page)+'页==================')
# if __name__ == '__main__':
#     # print('Parent process %s.' % os.getpid())
#     p = Pool(2)  # number of worker processes
#     for page in range(1, 30):  # iterate over result pages
#         p.apply_async(index, args=(page,))
#     print('Waiting for all subprocesses done...')
#     p.close()
#     p.join()
#     print('All subprocesses done.')