1、Scraping Pearvideo
'''
1、Request URL: https://www.pearvideo.com/
2、Request method: GET
3、Request headers:
    user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36
'''
import requests
import re  # regular-expression module

# Send a request to the Pearvideo home page and get the response
response = requests.get(url='https://www.pearvideo.com/')
print(response.status_code)
print(response.text)

# Usage: re.findall(pattern, text_to_parse, flags)
# re.S (DOTALL) lets "." also match newlines, so one pattern can span the whole response text
# .  matches any single character
# *? repeats it as few times as possible (non-greedy)
'''
<a href="video_1543373"
<a href="video_(.*?)"   -> captures 1543373
'''
res = re.findall('<a href="video_(.*?)"', response.text, re.S)
for m_id in res:
    # Build the detail-page URL
    detail_url = 'https://www.pearvideo.com/video_' + m_id
    print(detail_url)


import requests
import re
import uuid  # uuid.uuid4() returns a random UUID that is unique for practical purposes; used here as the file name

# The three steps of a crawler

# 1、Send the request
def get_page(url):
    response = requests.get(url)
    return response

# 2、Parse the data
# Parse the home page to collect the detail-page URLs
def parse_index(text):
    res = re.findall('<a href="video_(.*?)"', text, re.S)
    detail_url_list = []
    for m_id in res:
        # Build the detail-page URL
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        detail_url_list.append(detail_url)
    return detail_url_list

# Parse a detail page to get the video URL
def parse_detail(text):
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    return movie_url

# 3、Save the data
def save_movie(movie_url):
    response = requests.get(movie_url)
    # Write the video to a local file
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(response.content)
        f.flush()

if __name__ == '__main__':
    # 1、Request the home page
    index_res = get_page(url='https://www.pearvideo.com/')
    # 2、Parse the home page and collect the detail-page URLs
    detail_url_list = parse_index(index_res.text)
    # 3、Request each detail page
    for detail_url in detail_url_list:
        detail_res = get_page(url=detail_url)
        # 4、Parse the detail page to get the video URL
        movie_url = parse_detail(detail_res.text)
        print(movie_url)
        # 5、Save the video
        save_movie(movie_url)
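re.findall is already "global" (it returns every match), so the role of re.S above is only to let "." match newline characters. A minimal sketch with a made-up two-line HTML fragment to show the difference:

import re

# Made-up fragment: the attribute we want sits on the line after the opening tag
html = '<a class="video-link"\n   href="video_1543373">'

# Without re.S, "." cannot cross the newline, so nothing matches
print(re.findall('<a class="video-link".*?href="video_(.*?)"', html))        # []

# With re.S, "." also matches "\n", so the non-greedy part can span both lines
print(re.findall('<a class="video-link".*?href="video_(.*?)"', html, re.S))  # ['1543373']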
2、Accessing Zhihu with request headers
import requests

# Request-headers dictionary
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}

# Pass the user-agent to the GET request through the headers parameter
response = requests.get('https://www.zhihu.com/explore', headers=headers)
print(response.status_code)
print(response.text)

with open('zhihu.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
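The header matters because Zhihu treats the default requests user-agent differently from a browser one. A quick sketch comparing the two (the exact status code returned to a bare client may change over time; the point is only the contrast):

import requests

url = 'https://www.zhihu.com/explore'
browser_ua = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}

bare = requests.get(url)                         # default UA looks like "python-requests/x.y.z"
browser = requests.get(url, headers=browser_ua)

print('without user-agent:', bare.status_code)
print('with user-agent:   ', browser.status_code)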
3、Carrying login cookies in the request
import requests

# Request URL
url = 'https://github.com/settings/profile'

# Request headers: user-agent plus the logged-in session cookie copied from the browser
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36',
    'cookie': 'has_recent_activity=1; _ga=GA1.2.713277069.1560498508; _gat=1; tz=Asia%2FShanghai; _octo=GH1.1.904014561.1560498512; _device_id=2ec72f7f5eee8d1472d4c213342fe648; user_session=OsXwZy4_1xidSOCzQjoWUn07tgf89bD-fRqCQIcqy8gRwZf4; __Host-user_session_same_site=OsXwZy4_1xidSOCzQjoWUn07tgf89bD-fRqCQIcqy8gRwZf4; logged_in=yes; dotcom_user=852653835; _gh_sess=QTNWSkVuSmRVekh4UGNGRE1sOGJ6NzN2ZWhybm5FRWUwUCtSVFlrZUNMNTZpRGR4S3BaazdFOUNzclhTKy9HT1lvMkFDdzdONnN0akVMUEN4TlVQLzFZWjVCamNpbG9ZZ0MyTFRHVmJldDRGT0ZyMWJua0NFbWFLZkR0R1c0UlZEOUNHRmVITUVOY2RlWjBwenNkZnFwcHM5b3pkNjh6Q0FER0pvTE5EZWFnRUZqMm9ZTjVlcGFHMEtxOW9rZ0tCeXNPWjZSMFhZRnZmT2dDUklWOWh5eFdFaTZVSy9kWEo4S2RJMG1WS05xajdkLy9lOGJLdUdpOXVyNUFGbndranBQc1F4VWZVbklWOEFEQjg0YjNOQ1h5dkJDQTgxMUVDV1VvTk1DT3FjN2xIZ0U4VTRnVmxpQjlLdFNTNnZjSG5rcjFpdm5qSk42YWhQcnhycnVaak93PT0tLU9GblFkWExBenhaZS81NHZrK2RBWGc9PQ%3D%3D--1dc9fa1c1ab2f284caffe8ef59c31786dea5bdf1'
}

# Cookies can also be passed separately through the cookies parameter, e.g.:
# cookies = {
#     'cookie': '_ga=GA1.2.100056359.1560418560; _gid=GA1.2.881683916.1560418560; __gads=ID=f4cea76701723c0e:T=1560418559:S=ALNI_MbzL_sVa3MGdgdoVXcdAvSVYRrf5g; .Cnblogs.AspNetCore.Cookies=CfDJ8D8Q4oM3DPZMgpKI1MnYlrndAt1eQioA8EwvbUcCPuycOG0uK7DsHTuI0GSL-4qyZv2bXXTNP6z_9lKpYNEJv93916PR89d4tED7Jj8ahDtht0Zp4JYdtBbJaKDIm6mqsS9p0LjvfCbvU-Q3_bHKpnU9KUAr8n3PYKBSuo4hRY1KTB8G5hw8w_5AUe3a2L4BWSf8aNFQYtNowA1sWBPG5Thsple1imIrgOGGMcLt_J4W9y99o9ZG5HSmer4tZ4EkzT3v95MehcQrPnP1i6ujtqHB6N7vWSvoaj4_OmncsmQ47BoqmIQHeHopTbNqOZPDDfgSQJBlVq4FUSop7paQ0romWOnP_HBQfDBUmCtOZkiyogixPv2_rkGnhd5yXNOB8kwfdzyX7g9ywd7Z9yHreouaoy0adUlZhhJCDWUweQAs4-YLKJEW0XrQhot9NQyp8w; .CNBlogsCookie=D6950E9543CC088C4E7215CCE094064A9EF705174A450660EB3680D8C2E5DA9A75E182C856468E53F793CB9A3AC9D86A493D96CC89F6CCB7462772755B3043A04A206C4FF52C4CFBC8CF8F062B3727E116D4317D; _gat=1'
# }

res = requests.get(url, headers=headers)
# If the session cookie is valid, the settings page contains the logged-in username
print('852653835' in res.text)
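requests also accepts cookies through its cookies parameter, where each cookie name is its own dict key (the commented-out dict above nests the entire string under one 'cookie' key, which the parameter would not interpret as intended). A sketch of that approach, assuming the raw cookie string has been copied from the browser (shortened placeholder values here):

import requests

url = 'https://github.com/settings/profile'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}

# Raw "cookie" header value copied from the browser (placeholder, not a real session)
raw_cookie = 'has_recent_activity=1; logged_in=yes; dotcom_user=852653835; user_session=xxxx'

# Turn "name=value; name=value; ..." into the dict that requests expects
cookies = dict(pair.split('=', 1) for pair in raw_cookie.split('; '))

res = requests.get(url, headers=headers, cookies=cookies)
print('852653835' in res.text)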
4、Extracting movie info from the Douban page (first 25)
import requests
import re

url = 'https://movie.douban.com/top250'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}

response = requests.get(url, headers=headers)
# print(response.text)

# Extract the data with a regular expression:
# detail-page url, poster url, title, rating, number of ratings ("人评价" is the literal page text)
movie_content_list = re.findall(
    '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价',
    response.text, re.S
)

for movie_content in movie_content_list:
    # Unpack one movie
    detail_url, movie_jpg, name, point, num = movie_content
    data = f'Title: {name}, detail page: {detail_url}, poster: {movie_jpg}, rating: {point}, ratings count: {num}\n'
    print(data)

    # 3、Save the data: append each movie's line to a file
    with open('douban.txt', 'a', encoding='utf-8') as f:
        f.write(data)
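Before pointing a long pattern like this at the live site, it can help to dry-run it against a small hand-written fragment of the page structure. A sketch with made-up values:

import re

# Hand-written fragment mimicking one Douban list item (made-up values)
sample = '''
<div class="item">
    <a href="https://movie.douban.com/subject/1/">
        <img src="https://img.example.com/poster1.jpg">
    </a>
    <span class="title">示例电影</span>
    <span class="rating_num" property="v:average">9.0</span>
    <span>1000000人评价</span>
</div>
'''

pattern = '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价'

print(re.findall(pattern, sample, re.S))
# [('https://movie.douban.com/subject/1/', 'https://img.example.com/poster1.jpg', '示例电影', '9.0', '1000000')]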
5、Today's homework (all 250 movies)
import requests
import re

url2 = 'https://movie.douban.com/top250?start&filter='
insert = url2.index("&filter=")

# i walks through 0, 25, ..., 225: the "start" offset of each of the 10 pages
for i in range(0, 230):
    if i % 25 == 0:
        # Splice "=<offset>" in before "&filter=", e.g. https://movie.douban.com/top250?start=25&filter=
        url = '{0}={1}{2}'.format(url2[:insert], i, url2[insert:])
    else:
        continue
    print(url)

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
    }
    response = requests.get(url, headers=headers)

    # Regex groups: detail url, poster, title, director, year / country / genre, rating,
    # ratings count, one-line quote ("导演" and "人评价" are literal page text;
    # note that items without a quote, i.e. without a class="inq" span, will not match)
    movie_content_list = re.findall(
        '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?导演:(.*?)<br>.*?\n(.*?)&nbsp;/&nbsp;(.*?)&nbsp;/&nbsp;(.*?)\n.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价.*?class="inq">(.*?)</span>',
        response.text, re.S
    )

    for movie_content in movie_content_list:
        # Unpack one movie
        detail_url, movie_jpg, name, director, year, country, kind, point, num, profile = movie_content
        director = director.replace('&nbsp;', ' ')
        data = (f'Title: {name}, director: {director}, {year.strip()}, {country}, {kind}, '
                f'rating: {point}, ratings count: {num}, quote: {profile}, '
                f'detail page: {detail_url}, poster: {movie_jpg}\n')

        # 3、Save the data: append each movie's line to the file
        with open('douban.txt', 'a', encoding='utf-8') as f:
            f.write(data)
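requests can build the query string itself through its params argument, which avoids the manual index()/format() splicing above. A minimal sketch of the same pagination under that approach:

import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}

for start in range(0, 250, 25):
    # requests encodes this as https://movie.douban.com/top250?start=<offset>&filter=
    response = requests.get('https://movie.douban.com/top250',
                            headers=headers,
                            params={'start': start, 'filter': ''})
    print(response.url, response.status_code)
    # response.text can then be fed to the same re.findall pattern as above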