豆瓣Top250电影信息的抓取

利用urllib、bs4以及re模块抓取豆瓣网页Top250电影信息，并使用openpyxl将结果保存为Excel文件

import urllib.request
import bs4
import re
from openpyxl import Workbook

# Regular expressions used to extract individual fields from the HTML of a
# single movie entry. Each pattern has exactly one capture group, so
# re.findall() returns a list of plain strings.
pattern_find_Title = re.compile(r'<span class="title">(.*?)</span>')  # movie title
pattern_find_Score = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')  # movie rating
pattern_find_Evaluator = re.compile(r'<span>(\d*)人评价</span>')  # number of raters (digits only)
pattern_find_actor = re.compile(r'主演: (.*?) ') # lead actor; the lazy group stops at the first space after the label
pattern_find_BriefComment = re.compile(r'<span class="inq">(.*?)</span>')  # one-line comment
pattern_find_Editor = re.compile(r'导演: (.*?) ')  # director (despite the "Editor" name); stops at the first space after the label

def main(savepath='C:/Users/slli/Desktop/豆瓣Top250.xlsx',
         base_url='https://movie.douban.com/top250?start='):
    """Scrape the Douban Top 250 list and save it as an Excel workbook.

    Args:
        savepath: Destination path of the .xlsx file. The default is the
            path the script originally hard-coded; pass another path to
            run on a different machine.
        base_url: List URL prefix to which the page start offset is appended.
    """
    data_xlsx = data_cross(base_url)
    save_data(data_xlsx, savepath)

def askurl(url):
    """Fetch *url* and return its body decoded as UTF-8.

    The request is sent with a mobile-browser User-Agent header. This is a
    best-effort helper: any failure while opening, reading or decoding is
    printed and an empty string is returned instead of raising.

    Args:
        url: The URL to request.

    Returns:
        The decoded page text, or "" if the request failed.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Mobile Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8')
    except Exception as e:  # deliberate catch-all: callers expect "" on failure
        reported = False
        for attr in ('code', 'reason'):
            if hasattr(e, attr):
                print(getattr(e, attr))
                reported = True
        if not reported:
            # Errors without code/reason used to vanish silently.
            print(repr(e))
    return html

def _first_match(pattern, text, default=''):
    """Return the first capture of *pattern* in *text*, or *default* if none."""
    found = pattern.findall(text)
    return found[0] if found else default


def data_cross(base_url, pages=10):
    """Download the list pages and build the rows for the spreadsheet.

    Each page is requested as ``base_url + str(page_index * 25)``, parsed with
    BeautifulSoup, and every ``<div class="item">`` is converted into one row
    using the module-level regex patterns.

    Args:
        base_url: URL prefix to which the start offset is appended.
        pages: Number of 25-item pages to fetch (default 10, i.e. 250 items).

    Returns:
        A list of rows. The first row is the header; each following row is
        ``[rank, title, director, lead actor, score, rater count, comment]``.
        A field that cannot be found is replaced by a placeholder rather
        than raising ``IndexError``: '名字不详' for director and lead actor,
        '暂未简评' for the comment, and '' for title, score and rater count.
    """
    per_page = 25  # the rank and the start offset both assume 25 items per page
    data_xlsx = [['排名','电影名称','导演','主演','评分','评价人数','简评']]

    for page in range(pages):
        html = askurl(base_url + str(page * per_page))
        if not html:
            # askurl already reported the failure; nothing to parse here.
            continue
        soup = bs4.BeautifulSoup(html, 'html.parser')
        for offset, tag in enumerate(soup.find_all('div', class_="item"), start=1):
            item = str(tag)
            data_xlsx.append([
                '第{}名'.format(page * per_page + offset),
                _first_match(pattern_find_Title, item),
                _first_match(pattern_find_Editor, item, '名字不详'),
                _first_match(pattern_find_actor, item, '名字不详'),
                _first_match(pattern_find_Score, item),
                _first_match(pattern_find_Evaluator, item),
                _first_match(pattern_find_BriefComment, item, '暂未简评'),
            ])
    return data_xlsx

def save_data(data_xlsx,savepath):
    """Write the rows to an Excel workbook and save it at *savepath*.

    Args:
        data_xlsx: Iterable of rows (each a list of cell values), written in
            order starting from the first row of the sheet.
        savepath: Path of the .xlsx file to create.
    """
    wb = Workbook()
    # A new Workbook already contains one sheet. Reuse and rename it instead
    # of calling create_sheet(), which would leave an empty default sheet as
    # the first (active) sheet of the saved file.
    ws = wb.active
    ws.title = '豆瓣Top250'
    for row in data_xlsx:
        ws.append(row)  # append each row to the sheet
    wb.save(savepath)

# Run the scraper only when this file is executed as a script (not when it is
# imported), then print a completion message.
if __name__ == '__main__':
    main()
    print('Done!')

运行后的结果:
豆瓣Top250电影信息的抓取

上一篇:5.爬虫准备工作


下一篇:python爬虫-豆瓣电影top250