爬虫之requests
各种请求方式
get
# GET example: fetch a page while presenting a browser-like user agent.
import requests

host_url = 'https://www.pearvideo.com/'
# Browser version information sent in the request headers.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
}
res = requests.get(host_url, headers=headers)
post
# POST example: send form-encoded data in the request body.
r = requests.post('http://httpbin.org/post', data={'key': 'value'})
delete
# DELETE example.
r = requests.delete('http://httpbin.org/delete')
put
# PUT example: send form-encoded data in the request body.
r = requests.put('http://httpbin.org/put', data={'key': 'value'})
响应response的属性
import requests
respone = requests.get('http://www.jianshu.com')
# Attributes of the response object
# Response body as text (str)
print(respone.text)
# Response body as raw bytes
print(respone.content)
# HTTP status code of the response
print(respone.status_code)
# Response headers
print(respone.headers)
# Cookies attached to the response
print(respone.cookies)
# The response cookies as a dictionary
print(respone.cookies.get_dict())
# The response cookies as (name, value) pairs
print(respone.cookies.items())
# URL of the response
print(respone.url)
# Request history of the response
print(respone.history)
# Encoding associated with the response text
print(respone.encoding)
爬取梨视频首页视频
import os
import re
from concurrent.futures import ThreadPoolExecutor
import requests
# Base URL of the site being crawled; relative video links are appended to it.
host_url = 'https://www.pearvideo.com/'
# Request headers presenting a desktop Chrome user agent.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
}
def get_index():
    """Fetch the home page at ``host_url`` and return its body as text.

    The request is sent with the module-level ``headers``.
    """
    res = requests.get(host_url, headers=headers)
    return res.text
def parser_index(text):
    """Extract the video detail-page URLs from the home page HTML.

    Args:
        text: HTML of the home page.

    Returns:
        A list of URLs built by prefixing ``host_url`` to the ``href`` of
        every anchor whose class is ``vervideo-lilink actplay``.
    """
    res = re.findall('<a href="(.*?)" class="vervideo-lilink actplay">', text)
    res = [host_url + i for i in res]
    return res
def get_detail(html_text):
    """Extract the download URL and the title from a video detail page.

    Args:
        html_text: HTML of a video detail page.

    Returns:
        A dict with keys ``'download_index'`` (the ``.mp4`` URL found in the
        ``srcUrl="..."`` attribute) and ``'title'`` (the text of the
        ``<h1 class="video-tt">`` element).

    Raises:
        AttributeError: If either pattern is not found, because
            ``re.search`` returns ``None`` in that case.
    """
    # Download address of the video.
    download_index = re.search(r'srcUrl="(.*?\.mp4)"', html_text).group(1)
    # Title of the video.
    title = re.search('<h1 class="video-tt">(.*?)</h1>', html_text).group(1)
    dic = {
        'download_index': download_index,
        'title': title
    }
    print('成功链接到[%s]视频文件' % title)
    return dic
def get_video(video_url, title):
    """Download a video and save it as ``down_pearvideos/<title>.mp4``.

    Args:
        video_url: URL of the video file to download.
        title: Video title, used as the file name without the extension.
    """
    video_bytes = requests.get(video_url).content
    # This function is submitted to a thread pool, so several threads may
    # reach this point together. A separate exists() check followed by
    # mkdir() can then raise FileExistsError; exist_ok makes it safe.
    os.makedirs('down_pearvideos', exist_ok=True)
    file_path = os.path.join('down_pearvideos', title) + '.mp4'
    with open(file_path, 'wb') as f:
        f.write(video_bytes)
    print(file_path + '下载成功!')
if __name__ == '__main__':
    # Using the pool as a context manager waits for every submitted download
    # and then shuts the worker threads down.
    with ThreadPoolExecutor(10) as pool:
        text = get_index()
        url_list = parser_index(text)
        for url in url_list:
            response = requests.get(url, headers=headers).text
            content_dic = get_detail(response)
            # Sequential alternative:
            #   get_video(content_dic['download_index'], content_dic['title'])
            # Submit to the thread pool so the videos download concurrently.
            pool.submit(get_video, content_dic['download_index'], content_dic['title'])
模拟登陆github
import re
import requests
# Simulated GitHub login: fetch the login form, then post the credentials.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
}

# Step 1: request the login page to read the authenticity token and cookies.
login_url = 'https://github.com/login'
login_response = requests.get(login_url, headers=headers)
# Raises AttributeError if the token field is not present in the page.
login_token = re.search('name="authenticity_token" value="(.*?)"', login_response.text).group(1)
print(login_token)
login_cookie = login_response.cookies.get_dict()
print(login_cookie)

# Step 2: post the form, sending back the cookies and token from step 1.
session_url = 'https://github.com/session'
session_response = requests.post(
    session_url,
    headers=headers,
    cookies=login_cookie,
    data={
        "commit": "Sign in",
        # The check mark had been garbled into "?" by an encoding loss.
        # NOTE(review): presumably the form's hidden utf8 field - confirm.
        "utf8": "✓",
        "authenticity_token": login_token,
        # NOTE(review): credentials are hard-coded in source; they should be
        # loaded from the environment or a secrets store instead.
        "login": "yangyuanhu",
        "password": "123654asd"
    }
)
print(session_response.text)