1. Logging in to 12306 with Selenium
# The code below is the sample client provided by Chaojiying (超级鹰).
import requests
from hashlib import md5


class Chaojiying_Client(object):

    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php',
                          data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id: image ID of the misrecognized captcha being reported
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php',
                          data=params, headers=self.headers)
        return r.json()


# chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')  # User Center >> Software ID: generate one and substitute it here
# im = open('12306.jpg', 'rb').read()  # local image file path; on Windows the path sometimes needs double slashes
# print(chaojiying.PostPic(im, 9004)['pic_str'])
# The code above is the sample client provided by Chaojiying.
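# For the coordinate-click captcha type used below (codetype 9004), PostPic
# returns a JSON dict whose 'pic_str' field holds coordinates such as
# "115,43|62,128". A minimal usage sketch, kept commented out so it does not
# run as part of the login flow; 'pic_id' is an assumed field name from the
# vendor's response format that ReportError expects when a wrong result is
# reported for a refund:
# demo_client = Chaojiying_Client('username', 'password', 'soft_id')
# resp = demo_client.PostPic(open('code.png', 'rb').read(), 9004)
# print(resp['pic_str'])                   # e.g. "115,43|62,128"
# demo_client.ReportError(resp['pic_id'])  # report a wrong result afterwards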
# Open the login page with selenium.
from selenium import webdriver
import time
from PIL import Image
from selenium.webdriver import Chrome
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options

'''
While dragging the slider, verification kept failing, even when dragging by hand.
That is because the server detected that selenium was being used.

How to keep selenium from being detected:
the server checks window.navigator.webdriver to decide whether selenium is in use.
Normally the value is false, but under selenium it becomes true.

Workarounds:
1. If the Chrome version is below 88, inject JS into every new document when the
   browser starts (before any page content is loaded) to remove the webdriver flag:
    web = Chrome()
    web.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
            window.navigator.webdriver = undefined
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            })
        """
    })
    web.get(xxxxxx)
2. If the Chrome version is 88 or above, use the following instead:
    option = Options()
    # option.add_experimental_option('excludeSwitches', ['enable-automation'])  # optional
    option.add_argument('--disable-blink-features=AutomationControlled')
    bro = webdriver.Chrome(executable_path='./chromedriver_win32/chromedriver', options=option)
    bro.get('https://kyfw.12306.cn/otn/resources/login.html')
'''

option = Options()
option.add_argument('--disable-blink-features=AutomationControlled')
bro = webdriver.Chrome(executable_path='./chromedriver_win32/chromedriver', options=option)

bro.get('https://kyfw.12306.cn/otn/resources/login.html')
time.sleep(1)
a_tag = bro.find_element_by_xpath("/html/body/div[2]/div[2]/ul/li[2]/a")
a_tag.click()

# save_screenshot takes a screenshot of the current page and saves it.
bro.save_screenshot('aa.png')

# Determine the top-left and bottom-right coordinates of the captcha image,
# which defines the region to crop.
code_img_ele = bro.find_element_by_xpath('//*[@id="J-loginImg"]')
location = code_img_ele.location  # x, y of the captcha image's top-left corner
print('location:', location)
size = code_img_ele.size          # width and height of the captcha element
print('size:', size)
# Top-left and bottom-right coordinates.
rangle = (
    int(location['x']), int(location['y']),
    int(location['x'] + size['width']), int(location['y'] + size['height']))
# The captcha image region is now fixed.
i = Image.open('./aa.png')
code_img_name = './code.png'
# crop cuts out the specified region of the image.
frame = i.crop(rangle)
frame.save(code_img_name)

# Submit the captcha image to Chaojiying for recognition.
chaojiying = Chaojiying_Client('ziyouzheyan3', 'liuyanyan03', '914163')  # User Center >> Software ID: generate one and substitute it here
im = open('code.png', 'rb').read()  # local image file path; on Windows the path sometimes needs double slashes
result = chaojiying.PostPic(im, 9004)['pic_str']  # call PostPic once and reuse the result (each call is billed)
print(result)

all_list = []  # holds the coordinates of the points to click: [[x1, y1], [x2, y2]]
if '|' in result:
    list_1 = result.split('|')
    count_1 = len(list_1)
    for i in range(count_1):
        xy_list = []
        x = int(list_1[i].split(',')[0])
        y = int(list_1[i].split(',')[1])
        xy_list.append(x)
        xy_list.append(y)
        all_list.append(xy_list)
else:
    x = int(result.split(',')[0])
    y = int(result.split(',')[1])
    xy_list = []
    xy_list.append(x)
    xy_list.append(y)
    all_list.append(xy_list)
print(all_list)

# Iterate over the list and use action chains to click each x, y position
# (offsets are relative to the captcha image element).
for l in all_list:
    x = l[0]
    y = l[1]
    ActionChains(bro).move_to_element_with_offset(code_img_ele, x, y).click().perform()
    time.sleep(0.5)

bro.find_element_by_id('J-userName').send_keys('18769756237')
time.sleep(2)
bro.find_element_by_id('J-password').send_keys('liuyanyan03')
time.sleep(2)
bro.find_element_by_id('J-login').click()
time.sleep(5)

# Slider verification.
span = bro.find_element_by_xpath('//*[@id="nc_1_n1z"]')
ActionChains(bro).drag_and_drop_by_offset(span, 300, 0).perform()
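One caveat with the screenshot-and-crop step above: location and size are reported in CSS pixels, while save_screenshot writes physical pixels. On a display with OS-level scaling (for example 125% or 150% on Windows), the crop box lands in the wrong place and the saved image misses the captcha. A hedged sketch of a helper that compensates, assuming window.devicePixelRatio reflects the effective scaling; crop_element is a hypothetical name, not part of the script above:

from PIL import Image

def crop_element(driver, element, screenshot_path, out_path):
    # Scale the element's CSS-pixel box by the device pixel ratio so the crop
    # matches the physical pixels of the saved screenshot.
    ratio = driver.execute_script('return window.devicePixelRatio') or 1
    loc, size = element.location, element.size
    box = (int(loc['x'] * ratio),
           int(loc['y'] * ratio),
           int((loc['x'] + size['width']) * ratio),
           int((loc['y'] + size['height']) * ratio))
    Image.open(screenshot_path).crop(box).save(out_path)

# Example (would replace the manual crop above when display scaling is active):
# crop_element(bro, code_img_ele, './aa.png', './code.png')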
2. Logging in to Douban with Selenium
# Open the login page with selenium.
from selenium import webdriver
import time
from PIL import Image
from selenium.webdriver import ActionChains
from selenium.webdriver import ChromeOptions


def get_tracks(distance):
    # Build a drag track that accelerates over the first 4/5 of the distance
    # and decelerates over the rest, so the slider movement looks less robotic.
    v = 0
    t = 0.3
    tracks = []
    current = 0
    mid = distance * 4 / 5
    while current < distance:
        if current < mid:
            a = 2
        else:
            a = -3
        v0 = v
        s = v0 * t + 0.5 * a * (t ** 2)
        current += s
        tracks.append(round(s))
        v = v0 + a * t
    return tracks


option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])  # keep selenium from being detected by the site
bro = webdriver.Chrome(executable_path='./chromedriver_win32/chromedriver', options=option)
# bro.maximize_window()

bro.get('https://accounts.douban.com/passport/login')
time.sleep(1)
a_tag = bro.find_element_by_xpath("//*[@id='account']/div[2]/div[2]/div/div[1]/ul[1]/li[2]")
a_tag.click()

# save_screenshot takes a screenshot of the current page and saves it.
bro.save_screenshot('aa.png')

bro.find_element_by_id('username').send_keys('18769756237')
time.sleep(2)
bro.find_element_by_id('password').send_keys('liuyanyan33')
time.sleep(2)
bro.find_element_by_xpath('//*[@id="account"]/div[2]/div[2]/div/div[2]/div[1]/div[4]/a').click()
time.sleep(3)

# The slider captcha lives inside an iframe, so switch into it first.
bro.switch_to.frame('tcaptcha_iframe')
span = bro.find_element_by_xpath('//*[@id="slideBlock"]')
ActionChains(bro).click_and_hold(on_element=span).perform()
ActionChains(bro).move_to_element_with_offset(to_element=span, xoffset=200, yoffset=0).perform()

tracks = get_tracks(40)
print(tracks)
for track in tracks:
    ActionChains(bro).move_by_offset(xoffset=track, yoffset=0).perform()
time.sleep(1)
ActionChains(bro).release().perform()
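A note on get_tracks: because each step is rounded, the generated offsets rarely sum to exactly the requested distance, so the slider can stop a few pixels short of (or past) the target. A small hedged check, with an optional final correction step appended before the track is replayed, assuming the captcha tolerates one extra small move at the end:

# Verify how far the generated track actually moves, then append the leftover.
target = 40
tracks = get_tracks(target)
print(tracks, sum(tracks))   # the rounded steps usually land near, not exactly on, 40

leftover = target - sum(tracks)
if leftover:
    tracks.append(leftover)  # one final small move so the slider ends on target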
3. Scraping proxy IPs from Kuaidaili (快代理)
import requests
from lxml import etree
import time

'''
1. Scrape proxy IPs page by page from https://www.kuaidaili.com/free/inha/1
2. Check the quality of the scraped proxy IPs
'''

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
}
base_url = 'https://www.kuaidaili.com/free/inha/%d'


def getHttp():
    proxy_lst = []
    for i in range(1, 10):
        url = base_url % i
        response = requests.get(url, headers=header)
        data = response.text
        tree = etree.HTML(data)
        tr_list = tree.xpath('//*[@id="list"]/table//tr')
        for tr in tr_list:
            dic = {}
            type_list = tr.xpath('./td[@data-title="类型"]/text()')
            IP_list = tr.xpath('./td[@data-title="IP"]/text()')
            PORT_list = tr.xpath('./td[@data-title="PORT"]/text()')
            if type_list and IP_list and PORT_list:
                proxy_type = type_list[0].lower()  # requests expects lowercase proxy keys: 'http' / 'https'
                IP = IP_list[0]
                PORT = PORT_list[0]
                dic[proxy_type] = IP + ":" + PORT
                proxy_lst.append(dic)
        time.sleep(0.5)  # without a pause between pages, only part of the data comes back
    return proxy_lst


def check_ip(lst):
    act_lst = []
    for proxy in lst:
        # timeout=0.1 is the allowed response time; a proxy that takes longer raises an exception.
        try:
            res = requests.get('http://www.baidu.com', headers=header, proxies=proxy, timeout=0.1)
            if res.status_code == 200:
                act_lst.append(proxy)
        except Exception as e:
            print(e)
    return act_lst


if __name__ == "__main__":
    # Scrape the proxies, then check which of them are usable.
    proxy_lst = getHttp()
    print(proxy_lst)
    can_use = check_ip(proxy_lst)
    print('usable proxy IPs:', can_use)
    print('number of usable proxy IPs:', len(can_use))
    print('fraction of usable proxy IPs:', len(can_use) / len(proxy_lst))
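Once check_ip has produced the list of working proxies, they can be rotated across requests to a target site. A minimal sketch, assuming each entry is a dict like {'http': 'ip:port'} as built above; fetch_with_proxy and the httpbin.org echo URL are hypothetical names used only to illustrate the rotation:

import random

def fetch_with_proxy(url, proxy_pool):
    # Pick a random validated proxy for each request; fall back to a direct
    # request if the pool is empty.
    if not proxy_pool:
        return requests.get(url, headers=header, timeout=5)
    return requests.get(url, headers=header, proxies=random.choice(proxy_pool), timeout=5)

# Hypothetical usage after running the checks above:
# print(fetch_with_proxy('http://httpbin.org/ip', can_use).text)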