用requests及BeautifulSoup实现豆瓣电影信息的获取

最近花了三天左右的时间做了一个爬虫项目:用 requests 抓取豆瓣电影 Top250 的页面,再用 BeautifulSoup 解析出每部电影的排名、片名、导演等信息和评分,并写入 txt 文件。代码记录如下:

import requests
from bs4 import BeautifulSoup
# Entry page of the Douban Top 250 list; the per-page query string is
# appended to this address by the paging loop at the bottom of the script.
url = 'https://movie.douban.com/top250'

# Module-level lists, one per scraped field of a list page.
movie_names = []      # movie titles
movie_messages = []   # detail text (director etc.)
movie_scores = []     # ratings
movie_rank = []       # ranking positions
 def movie_nameget(url):
    headers = {  # 这是请求头
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.76'
    }
    responce=requests.get(url,headers=headers,timeout=10)
    page=responce.content.decode()
    soup=BeautifulSoup(page,'lxml')
     wd=soup.find_all('div',class_='hd')
    qt=soup.find_all('p',class_='')
    pf=soup.find_all('span',class_="rating_num")
     pm=soup.find_all('em',class_="")
    for each in qt:                          #获取导演等信息
        message1=each.text.replace("                            ","")
        message=message1.replace("\n                        ","")
        movie_messages.append(message)    

    for each in wd:                          #获取电影名
        span=each.find('span',class_='title')
        title=span.text
        movie_names.append(title)

     for each in pf:                         #获取电影评分
        scores=each.text
        movie_scores.append("评分"+scores)
    
    for each in pm:                       #获取电影排名
        rank=each.text
        movie_rank.append("排名:"+rank+"\t")

     for i in range(0,25):            #每页有25个电影信息,逐一存储  
        file=open("movietop250.txt","a", encoding="utf-8")
        file.write(movie_rank[i]+movie_names[i]+movie_messages[i]+"\t\t\t"+movie_scores[i]+"\n")               #将信息写入到txt文件中
        file.close()
    movie_names.clear()
    movie_messages.clear()
    movie_scores.clear()
    movie_rank.clear()               #更新列表
 for j in range(0,10):         #一共有10页,用for循环获取每页信息
    houzui='?start='+str(j*25)+'&filter='
    link=url+houzui
    movie_nameget(link)


上一篇:【数据结构C++邓俊辉】笔记


下一篇:PAT甲级1012