Lecture 53:
0: Uniform Resource Locator (URL).
1: crawler.
2: Legal restrictions.
3: Block heavy traffic from the same IP, or block requests with the same User-Agent; or, instead of blocking, automatically feed fake data to these high-volume visitors. There is also the robots.txt file; a sketch of checking it follows.
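A small sketch to remind myself how to check robots.txt before crawling; the standard library has urllib.robotparser for this (the fishc.com URL is just the example used above):

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://www.fishc.com/robots.txt')
rp.read()

# can_fetch() says whether the given User-Agent is allowed to crawl the path
print(rp.can_fetch('*', 'http://www.fishc.com/'))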
4: An object that manages the response content: an http.client.HTTPResponse object. A short sketch of what it can do follows.
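A small sketch of the HTTPResponse object returned by urlopen(): it reads like a file and also exposes the status code, the final URL and the headers (the URL is just an example):

import urllib.request

response = urllib.request.urlopen('http://www.fishc.com')
print(type(response))      # <class 'http.client.HTTPResponse'>
print(response.getcode())  # status code, e.g. 200
print(response.geturl())   # the URL that was actually fetched
print(response.info())     # the response headers
html = response.read()     # the body, as bytes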
5: URLError and HTTPError. A handling sketch follows.
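A small sketch of catching both exceptions; HTTPError is a subclass of URLError, so it has to be caught first (the URL is a made-up broken link):

import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('http://www.fishc.com/no-such-page')
except urllib.error.HTTPError as e:
    print('HTTPError:', e.code, e.reason)
except urllib.error.URLError as e:
    print('URLError:', e.reason)
else:
    print(response.getcode())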
6: Not sure; the UTF-8 encoding is declared in the page's <head>.
7: Unicode.
8:
import urllib.request
import chardet

url = 'http://www.fishc.com'
req = urllib.request.Request(url)
response = urllib.request.urlopen(req)
print(type(response))          # http.client.HTTPResponse

html = response.read(300)      # read the first 300 bytes
print(chardet.detect(html))    # detect the encoding from the raw bytes
print(type(html))              # bytes
print(html)
9:
import urllib.request
import chardet

url = 'http://www.fishc.com'
req = urllib.request.Request(url)
response = urllib.request.urlopen(req)
print(type(response))
html = response.read()
print(chardet.detect(html))    # detect the page encoding from the response bytes
10:
import urllib.request

# record.txt holds one URL per line; save each page as url_1.txt, url_2.txt, ...
with open('record.txt', 'r') as f:
    for i in range(4):
        url = f.readline().strip()    # strip the trailing newline, otherwise urlopen fails
        req = urllib.request.Request(url)
        response = urllib.request.urlopen(req)
        html = response.read()
        with open('url_' + str(i + 1) + '.txt', 'wb') as f2:
            f2.write(html)
Douban has anti-scraping protection.
Lecture 54:
0: It sets the timeout, i.e. how long to wait with no response before giving up; if not given, the global default timeout is used. A short sketch follows.
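A small sketch of the timeout parameter of urlopen(): if the server does not respond within the given number of seconds, a URLError is raised (the URL is just an example):

import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('http://www.fishc.com', timeout=3)
    print(response.getcode())
except urllib.error.URLError as e:
    print('Timed out or failed:', e.reason)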
1: response.getcode()
2: GET and POST.
3: The client sends the request.
4: It records basic information about the client.
5: Pass the data parameter. A POST sketch follows.
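A small sketch of sending a POST by passing the data argument; the URL and form fields here are placeholders, not a real API:

import urllib.request
import urllib.parse

form = {'form_email': 'your email', 'form_password': 'your password'}
data = urllib.parse.urlencode(form).encode('utf-8')   # data must be bytes
req = urllib.request.Request('http://www.example.com/login', data=data)
# With data attached the request becomes a POST; urlopen(req) would then submit it.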
6: decode()
7: A lightweight data-interchange format; basically just dictionaries and arrays. A tiny example follows.
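A tiny example of the "dictionaries and arrays" idea with a made-up JSON string: json.loads() parses it into Python objects and json.dumps() goes back the other way:

import json

text = '{"errorCode": 0, "results": [{"word": "cat", "translation": "猫"}]}'
obj = json.loads(text)
print(obj['results'][0]['translation'])        # -> 猫
print(json.dumps(obj, ensure_ascii=False))     # back to a JSON string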
8:
import easygui as g
import urllib.request

def getsize():
    msg = 'Download a kitten'
    title = 'Enter the kitten size'
    fieldNames = ["Width", "Height"]
    fieldValues = g.multenterbox(msg, title, fieldNames)
    return fieldValues

def getPic(sizew, sizeh):
    # fall back to a default size if the width was left blank
    if sizew == '':
        sizew = 400
        sizeh = 600
    url = "http://placekitten.com/g/" + str(sizew) + '/' + str(sizeh)
    req = urllib.request.Request(url)
    response = urllib.request.urlopen(req)
    cat_img = response.read()
    file_name = 'cat_' + str(sizew) + '_' + str(sizeh) + '.jpg'
    with open(file_name, 'wb') as f:
        f.write(cat_img)

def main():
    size_w, size_h = getsize()
    getPic(size_w, size_h)

if __name__ == "__main__":
    main()
9:
Douban blocks scraping again; a login example for it:
import re
import urllib.request
import urllib.parse
from http import cookiejar

loginurl = 'https://www.douban.com/accounts/login'
cookie = cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))

params = {
    "form_email": "your email",
    "form_password": "your password",
    "source": "index_nav"   # login fails without this field
}

# submit the login form from the home page
response = opener.open(loginurl, urllib.parse.urlencode(params).encode())

# if we are still on the login page, a captcha is required
if response.geturl() == "https://www.douban.com/accounts/login":
    html = response.read().decode('utf-8')
    # the captcha image URL
    imgurl = re.search('<img id="captcha_image" src="(.+?)" alt="captcha" class="captcha_image"/>', html)
    if imgurl:
        url = imgurl.group(1)
        # save the captcha image to the current directory
        res = urllib.request.urlretrieve(url, 'v.jpg')
        # the captcha-id parameter
        captcha = re.search('<input type="hidden" name="captcha-id" value="(.+?)"/>', html)
        if captcha:
            vcode = input('Enter the captcha shown in v.jpg: ')
            params["captcha-solution"] = vcode
            params["captcha-id"] = captcha.group(1)
            params["user_login"] = "登录"
            # resubmit with the captcha filled in
            response = opener.open(loginurl, urllib.parse.urlencode(params).encode())

# a successful login redirects to the home page
if response.geturl() == "http://www.douban.com/":
    print('login success!')
Lecture 55:
0: It looks at your User-Agent.
1: Because the request to the server timed out.
2: The client.
3: Call add_header() on the Request object. A sketch of both ways to set headers follows.
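A small sketch of the two ways to set the User-Agent (or any header): either through the headers argument of Request(), or with add_header() afterwards; the UA string is just an example:

import urllib.request

ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'

# way 1: pass a headers dict when building the Request
req1 = urllib.request.Request('http://www.fishc.com', headers={'User-Agent': ua})

# way 2: build the Request first, then add the header
req2 = urllib.request.Request('http://www.fishc.com')
req2.add_header('User-Agent', ua)

print(req1.headers, req2.headers)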
4: It should be: you send the request to the proxy, and the proxy forwards it to the target server. A crash is just too many people visiting at the same time. A proxy sketch follows.
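A small sketch of going through a proxy with ProxyHandler; the proxy address below is a placeholder, not a working proxy:

import urllib.request

proxy_support = urllib.request.ProxyHandler({'http': 'http://127.0.0.1:8080'})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)    # from now on urlopen() goes through the proxy

# response = urllib.request.urlopen('http://www.fishc.com')   # would be routed via 127.0.0.1:8080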
5: Not sure; use the Request object's get_method() method. A tiny check follows.
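A tiny check of get_method(): it reports 'GET' for a plain Request and 'POST' once data is attached (the URL is just an example):

import urllib.request

req_get = urllib.request.Request('http://www.fishc.com')
req_post = urllib.request.Request('http://www.fishc.com', data=b'key=value')
print(req_get.get_method())    # GET
print(req_post.get_method())   # POST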
6: Cookies.
7:
import re
import urllib.request
from bs4 import BeautifulSoup

url = 'http://baike.baidu.com/view/284853.htm'
req = urllib.request.Request(url)
req.add_header('user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36')
response = urllib.request.urlopen(req)
html = response.read().decode('utf-8')
soup = BeautifulSoup(html, 'html.parser')

# print the text and absolute URL of every link whose href contains "view"
for each in soup.find_all(href=re.compile("view")):
    print(each.text, "->", ''.join(["http://baike.baidu.com", each["href"]]))
8: Sorry, haven't learned how to click buttons yet.
import re
import urllib.request
from bs4 import BeautifulSoup

url = "https://baike.baidu.com/item/%E7%8C%AA%E5%85%AB%E6%88%92/769"
req = urllib.request.Request(url)
req.add_header('user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36')
response = urllib.request.urlopen(req)
html = response.read().decode('utf-8')
soup = BeautifulSoup(html, 'html.parser')

# the subtitle on Baidu Baike sits inside full-width parentheses in the <title> tag
re1 = r'（(.*?)）'
title_tag = str(soup.find_all('title')[0])
print(title_tag)
print('Subtitle:', re.findall(re1, title_tag))
9:
Covered later in the course.
import urllib.request
import urllib.parse
import re
from bs4 import BeautifulSoup

def test_url(soup):
    result = soup.find(text=re.compile("百度百科尚未收录词条"))
    if result:
        print(result[0:-1])   # Baidu appends a stray quote mark at the end; drop it
        return False
    else:
        return True

def summary(soup):
    word = soup.h1.text
    # if there is a subtitle, print it together with the title
    if soup.h2:
        word += soup.h2.text
    # print the title
    print(word)
    # print the summary
    if soup.find(class_="lemma-summary"):
        print(soup.find(class_="lemma-summary").text)

def get_urls(soup):
    for each in soup.find_all(href=re.compile("view")):
        content = ''.join([each.text])
        url2 = ''.join(["http://baike.baidu.com", each["href"]])
        response2 = urllib.request.urlopen(url2)
        html2 = response2.read()
        soup2 = BeautifulSoup(html2, "html.parser")
        if soup2.h2:
            content = ''.join([content, soup2.h2.text])
        content = ''.join([content, " -> ", url2])
        yield content

def main():
    word = input("Enter a keyword: ")
    keyword = urllib.parse.urlencode({"word": word})
    response = urllib.request.urlopen("http://baike.baidu.com/search/word?%s" % keyword)
    html = response.read()
    soup = BeautifulSoup(html, "html.parser")
    if test_url(soup):
        summary(soup)
        print("Related links:")
        each = get_urls(soup)
        while True:
            try:
                for i in range(10):
                    print(next(each))
            except StopIteration:
                break
            command = input("Press Enter to keep printing, or q to quit: ")
            if command == 'q':
                break
            else:
                continue

if __name__ == "__main__":
    main()
That's the end. I made a mess of it; I feel like I need to go back and review the whole thing.