Day 4 Homework

1. Scrape Pear Video

'''
1. Request URL:
    https://www.pearvideo.com/
2. Request method:
    GET
3. Request headers:
    user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36
'''
import requests
import re  # regex module

# send a request to the Pear Video homepage and get the response data
response = requests.get(url='https://www.pearvideo.com/')
print(response.status_code)
print(response.text)
# re.findall('regex pattern', 'text to parse', flags)
# re.S: dot-all flag -- makes . also match newlines, so the pattern is applied across the whole text
# . matches any single character (except newline, unless re.S is set)
# * repeats the previous token zero or more times; .*? is the non-greedy version
# '''
# <a href="video_1543373"
# <a href="video_(.*?)"   # captures 1543373
# '''
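# A tiny demo of non-greedy vs greedy matching (hypothetical sample string, not taken from the real page):
# re.findall('video_(.*?)"', '<a href="video_111"><a href="video_222">')  -> ['111', '222']
# re.findall('video_(.*)"',  '<a href="video_111"><a href="video_222">')  -> ['111"><a href="video_222']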
res = re.findall('a href="video_(.*?)"', response.text, re.S)
for m_id in res:
    # build the detail-page url from the captured id
    detail_url = 'https://www.pearvideo.com/video_' + m_id
    print(detail_url)




import requests
import re
import uuid  # uuid.uuid4() generates a practically unique random string, used below for file names

# The three steps of a crawler
# 1. Send the request
def get_page(url):
    response = requests.get(url)
    return response
# 2. Parse the data
# Parse the homepage to collect the detail-page urls
def parse_index(text):
    res = re.findall('<a href="video_(.*?)"', text, re.S)
    detail_url_list = []
    for m_id in res:
        # build the detail-page url from the captured id
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        detail_url_list.append(detail_url)
    return detail_url_list
# Parse a detail page to extract the direct video url
def parse_detail(text):
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    return movie_url
# 3. Save the data
def save_movie(movie_url):
    response = requests.get(movie_url)
    # write the video to a local file with a random, collision-free name
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(response.content)
        f.flush()
if __name__ == '__main__':
    # 1. Request the homepage
    index_res = get_page(url='https://www.pearvideo.com/')

    # 2. Parse the homepage and collect the detail-page urls
    detail_url_list = parse_index(index_res.text)
    # 3. Request each detail page
    for detail_url in detail_url_list:
        detail_res = get_page(url=detail_url)
        # 4. Parse the detail page to get the direct video url
        movie_url = parse_detail(detail_res.text)
        print(movie_url)
        # 5. Save the video
        save_movie(movie_url)
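
The save_movie above loads the whole video into memory via response.content before writing it out. A minimal sketch of a streaming variant (my own variation, not part of the original homework): it relies on requests' stream=True and Response.iter_content, and the 1 MB chunk size is an arbitrary choice.

import uuid
import requests

def save_movie_streamed(movie_url):
    # stream=True defers downloading the body until we iterate over it
    response = requests.get(movie_url, stream=True)
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        # write the video to disk in 1 MB chunks so a large file never sits fully in memory
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            if chunk:
                f.write(chunk)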

2. Access Zhihu with a user-agent request header

import requests
# request-headers dict
headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}
# pass the user-agent to the get request via headers=
response = requests.get('https://www.zhihu.com/explore', headers=headers)
print(response.status_code)
print(response.text)
with open('zhihu.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
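
When several requests need the same headers, a requests.Session can carry them once instead of repeating headers= on every call. A minimal sketch (my own variation, reusing the same user-agent as above):

import requests

session = requests.Session()
# headers set on the session are sent with every request made through it
session.headers.update({
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
})
response = session.get('https://www.zhihu.com/explore')
print(response.status_code)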

3. Carry cookies: access GitHub with the logged-in session cookies

import requests
# request url
url = 'https://github.com/settings/profile'
# request headers, with the Cookie string copied from a logged-in browser session
headers={
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36',
    'cookie':'has_recent_activity=1; _ga=GA1.2.713277069.1560498508; _gat=1; tz=Asia%2FShanghai; _octo=GH1.1.904014561.1560498512; _device_id=2ec72f7f5eee8d1472d4c213342fe648; user_session=OsXwZy4_1xidSOCzQjoWUn07tgf89bD-fRqCQIcqy8gRwZf4; __Host-user_session_same_site=OsXwZy4_1xidSOCzQjoWUn07tgf89bD-fRqCQIcqy8gRwZf4; logged_in=yes; dotcom_user=852653835; _gh_sess=QTNWSkVuSmRVekh4UGNGRE1sOGJ6NzN2ZWhybm5FRWUwUCtSVFlrZUNMNTZpRGR4S3BaazdFOUNzclhTKy9HT1lvMkFDdzdONnN0akVMUEN4TlVQLzFZWjVCamNpbG9ZZ0MyTFRHVmJldDRGT0ZyMWJua0NFbWFLZkR0R1c0UlZEOUNHRmVITUVOY2RlWjBwenNkZnFwcHM5b3pkNjh6Q0FER0pvTE5EZWFnRUZqMm9ZTjVlcGFHMEtxOW9rZ0tCeXNPWjZSMFhZRnZmT2dDUklWOWh5eFdFaTZVSy9kWEo4S2RJMG1WS05xajdkLy9lOGJLdUdpOXVyNUFGbndranBQc1F4VWZVbklWOEFEQjg0YjNOQ1h5dkJDQTgxMUVDV1VvTk1DT3FjN2xIZ0U4VTRnVmxpQjlLdFNTNnZjSG5rcjFpdm5qSk42YWhQcnhycnVaak93PT0tLU9GblFkWExBenhaZS81NHZrK2RBWGc9PQ%3D%3D--1dc9fa1c1ab2f284caffe8ef59c31786dea5bdf1'
}
# cookies={
#     'cookie':'_ga=GA1.2.100056359.1560418560; _gid=GA1.2.881683916.1560418560; __gads=ID=f4cea76701723c0e:T=1560418559:S=ALNI_MbzL_sVa3MGdgdoVXcdAvSVYRrf5g; .Cnblogs.AspNetCore.Cookies=CfDJ8D8Q4oM3DPZMgpKI1MnYlrndAt1eQioA8EwvbUcCPuycOG0uK7DsHTuI0GSL-4qyZv2bXXTNP6z_9lKpYNEJv93916PR89d4tED7Jj8ahDtht0Zp4JYdtBbJaKDIm6mqsS9p0LjvfCbvU-Q3_bHKpnU9KUAr8n3PYKBSuo4hRY1KTB8G5hw8w_5AUe3a2L4BWSf8aNFQYtNowA1sWBPG5Thsple1imIrgOGGMcLt_J4W9y99o9ZG5HSmer4tZ4EkzT3v95MehcQrPnP1i6ujtqHB6N7vWSvoaj4_OmncsmQ47BoqmIQHeHopTbNqOZPDDfgSQJBlVq4FUSop7paQ0romWOnP_HBQfDBUmCtOZkiyogixPv2_rkGnhd5yXNOB8kwfdzyX7g9ywd7Z9yHreouaoy0adUlZhhJCDWUweQAs4-YLKJEW0XrQhot9NQyp8w; .CNBlogsCookie=D6950E9543CC088C4E7215CCE094064A9EF705174A450660EB3680D8C2E5DA9A75E182C856468E53F793CB9A3AC9D86A493D96CC89F6CCB7462772755B3043A04A206C4FF52C4CFBC8CF8F062B3727E116D4317D; _gat=1'
# }
res = requests.get(url, headers=headers)
# if the cookies are still valid, the logged-in username appears in the profile page
print('852653835' in res.text)
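
requests can also take cookies separately through its cookies= parameter (a dict of cookie-name: value pairs) instead of packing them into the headers. A minimal sketch (the cookie names/values below are placeholders, not a real session):

import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}
# each key/value pair becomes one cookie; replace the values with your own logged-in session
cookies = {
    'user_session': 'REPLACE_WITH_YOUR_SESSION_VALUE',
    'logged_in': 'yes',
}
res = requests.get('https://github.com/settings/profile', headers=headers, cookies=cookies)
print(res.status_code)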

4. Extract movie info from the Douban Top 250 page (first 25 movies)

import requests
import re
url='https://movie.douban.com/top250'
headers={
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}
response=requests.get(url,headers=headers)
# print(response.text)
# extract the data with a regular expression
movie_content_list = re.findall(
    # regex pattern: detail url, poster url, title, rating, number of raters
'<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价',response.text,re.S
)
for movie_content in movie_content_list:
    # unpack the tuple for one movie
    detail_url, movie_jpg, name, point, num = movie_content
    data = f'Title: {name}, detail url: {detail_url}, poster url: {movie_jpg}, rating: {point}, raters: {num}\n'
    print(data)
    # 3. Save the data: append each movie's info to a file
    with open('douban.txt', 'a', encoding='utf-8') as f:
        f.write(data)
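
The same five fields could also go into a CSV file with the standard csv module, which opens more cleanly in a spreadsheet than the plain-text douban.txt. A minimal sketch (my own variation, reusing the movie_content_list from the code above):

import csv

with open('douban.csv', 'a', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    for movie_content in movie_content_list:
        detail_url, movie_jpg, name, point, num = movie_content
        # one row per movie: title, rating, number of raters, detail url, poster url
        writer.writerow([name, point, num, detail_url, movie_jpg])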

5. Today's homework (all 250 movies)

import requests
import re

# the Top 250 list is paginated: start=0, 25, 50, ..., 225 (ten pages of 25 movies each)
for i in range(0, 250, 25):
    url = f'https://movie.douban.com/top250?start={i}&filter='
    print(url)
    headers={
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
    }
    response=requests.get(url,headers=headers)
    movie_content_list = re.findall(
        # regex pattern: detail url, poster, title, director, year, country, genre, rating, raters, quote
    '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?导演:(.*?)<br>.*?\n(.*?)&nbsp;/&nbsp;(.*?)&nbsp;/&nbsp;(.*?)\n.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价.*?class="inq">(.*?)</span>',response.text,re.S
    )
    for movie_content in movie_content_list:
        # unpack the tuple for one movie
        detail_url, movie_jpg, name, director, year, country, kind, point, num, profile = movie_content
        director = director.replace('&nbsp;', ' ')
        data = f'Title: {name}, director: {director}, {year.strip()}, {country}, {kind}, rating: {point}, raters: {num}, quote: {profile}, detail url: {detail_url}, poster url: {movie_jpg}\n'

        # save the data: append each movie's info to the file
        with open('douban.txt', 'a', encoding='utf-8') as f:
            f.write(data)
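
Instead of splicing the start value into the URL by hand, requests can build the query string itself through its params= argument. A minimal sketch of the same ten-page loop (my own variation; parsing and saving are omitted and would be identical to the loop body above):

import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}
# start=0, 25, 50, ..., 225 -> ten pages of 25 movies each
for start in range(0, 250, 25):
    response = requests.get('https://movie.douban.com/top250',
                            headers=headers,
                            params={'start': start, 'filter': ''})
    print(response.url)  # e.g. https://movie.douban.com/top250?start=25&filter=
    # ...parse response.text with the regex above and append each movie to douban.txt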

