初入Python(猫眼top100)

思路:

1.查看网页源代码
2.抓取单页内容
3.使用正则表达式
4.获取猫眼电影TOP100所有信息

查看网页源代码

使用Chrome浏览器打开榜单页面 https://maoyan.com/board/4 ,右键选择“查看网页源代码”(或按F12打开开发者工具):
初入Python(猫眼top100)

抓取单页内容

通过以下代码获取网页源代码

import requests
from requests.exceptions import RequestException
import re
import os

# Request headers sent with every page download.
headers = {
    # Optional headers left disabled by the author:
    # 'Content-Type': 'text/html;charset=UTF-8',
    # 'Origin': ' https://maoyan.com?offset=0',
    'Referer': 'https://maoyan.com/board/4',
    # Adjacent literals are concatenated into one browser-like User-Agent.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.132 Safari/537.36'
    ),
}
def get_page(url, headers):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address of the page to download.
        headers: Mapping of HTTP request headers sent with the request.

    Returns:
        The response text when the server answers with status 200,
        otherwise ``None`` (non-200 status, or any ``RequestException``
        such as a connection error or timeout).
    """
    try:
        # Without an explicit timeout requests may wait indefinitely on a
        # stalled server, which would hang the whole crawl.
        res = requests.get(url=url, headers=headers, timeout=10)
    except RequestException:
        return None
    if res.status_code == 200:
        return res.text
    return None

使用正则表达式提取信息

def parse_page(html):
    """Extract the movie entries from the source of one board page.

    Args:
        html: Page source as a string, or ``None`` when the download failed.

    Yields:
        One dict per matched ``<dd>`` entry with the keys ``index``,
        ``image``, ``title``, ``actor``, ``time`` and ``score``; all values
        are strings. Nothing is yielded for ``None`` or empty input.
    """
    if not html:
        # get_page() reports failures as None; re.findall would raise
        # TypeError on it, so treat it as a page without entries.
        return
    # Raw strings keep \d a regex escape instead of an invalid string escape.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,  # let "." span the newlines inside each <dd> entry
    )
    for item in pattern.findall(html):
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            # Drop the leading 3-character label (presumably the
            # "starring" caption - confirm against the live page).
            'actor': item[3].strip()[3:],
            # Drop the leading 5-character label (presumably the
            # "release time" caption - confirm against the live page).
            'time': item[4].strip()[5:],
            # The score is split into integer and fraction elements.
            'score': item[5] + item[6],
        }

将爬取的信息写入文件,我这里采用的是写入txt文本的方式

def write_file(txt):
    """Append ``txt`` as one line to ``result.txt`` (UTF-8 encoded).

    Args:
        txt: Object to record. It is written with ``print``, so its
            ``str()`` form followed by a newline is appended.
    """
    # The with-block closes the file even if printing raises.
    with open('result.txt', 'a', encoding='utf-8') as data:
        print(txt, file=data)

将电影所对应的图片进行下载

def save_picture(url, path):
    """Download the image at ``url`` and store its bytes at ``path``.

    The ``picture`` directory is created first if it does not exist.
    A success or failure message is printed; a response with a status
    other than 200 is ignored silently.

    Args:
        url: Address of the image to download.
        path: File path the image bytes are written to.
    """
    # exist_ok avoids the race of a separate exists() check before creating.
    os.makedirs('picture', exist_ok=True)
    try:
        # Bounded wait so one stalled image cannot hang the crawl.
        res = requests.get(url=url, timeout=10)
    except RequestException:
        # Best effort, like get_page(): report the failure and carry on.
        print(path, '下载失败')
        return
    if res.status_code == 200:
        # The with-block closes the file; no explicit close() is needed.
        with open(path, 'wb') as f:
            f.write(res.content)
        print(path, '下载成功')

代码实现如下

import requests
from requests.exceptions import RequestException
import re
import os

# Request headers sent with every page download.
headers = {
    # Optional headers left disabled by the author:
    # 'Content-Type': 'text/html;charset=UTF-8',
    # 'Origin': ' https://maoyan.com?offset=0',
    'Referer': 'https://maoyan.com/board/4',
    # Adjacent literals are concatenated into one browser-like User-Agent.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.132 Safari/537.36'
    ),
}
def get_page(url, headers):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address of the page to download.
        headers: Mapping of HTTP request headers sent with the request.

    Returns:
        The response text when the server answers with status 200,
        otherwise ``None`` (non-200 status, or any ``RequestException``
        such as a connection error or timeout).
    """
    try:
        # Without an explicit timeout requests may wait indefinitely on a
        # stalled server, which would hang the whole crawl.
        res = requests.get(url=url, headers=headers, timeout=10)
    except RequestException:
        return None
    if res.status_code == 200:
        return res.text
    return None


def parse_page(html):
    """Extract the movie entries from the source of one board page.

    Args:
        html: Page source as a string, or ``None`` when the download failed.

    Yields:
        One dict per matched ``<dd>`` entry with the keys ``index``,
        ``image``, ``title``, ``actor``, ``time`` and ``score``; all values
        are strings. Nothing is yielded for ``None`` or empty input.
    """
    if not html:
        # get_page() reports failures as None; re.findall would raise
        # TypeError on it, so treat it as a page without entries.
        return
    # Raw strings keep \d a regex escape instead of an invalid string escape.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,  # let "." span the newlines inside each <dd> entry
    )
    for item in pattern.findall(html):
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            # Drop the leading 3-character label (presumably the
            # "starring" caption - confirm against the live page).
            'actor': item[3].strip()[3:],
            # Drop the leading 5-character label (presumably the
            # "release time" caption - confirm against the live page).
            'time': item[4].strip()[5:],
            # The score is split into integer and fraction elements.
            'score': item[5] + item[6],
        }

def write_file(txt):
    """Append ``txt`` as one line to ``result.txt`` (UTF-8 encoded).

    Args:
        txt: Object to record. It is written with ``print``, so its
            ``str()`` form followed by a newline is appended.
    """
    # The with-block closes the file even if printing raises.
    with open('result.txt', 'a', encoding='utf-8') as data:
        print(txt, file=data)

def save_picture(url, path):
    """Download the image at ``url`` and store its bytes at ``path``.

    The ``picture`` directory is created first if it does not exist.
    A success or failure message is printed; a response with a status
    other than 200 is ignored silently.

    Args:
        url: Address of the image to download.
        path: File path the image bytes are written to.
    """
    # exist_ok avoids the race of a separate exists() check before creating.
    os.makedirs('picture', exist_ok=True)
    try:
        # Bounded wait so one stalled image cannot hang the crawl.
        res = requests.get(url=url, timeout=10)
    except RequestException:
        # Best effort, like get_page(): report the failure and carry on.
        print(path, '下载失败')
        return
    if res.status_code == 200:
        # The with-block closes the file; no explicit close() is needed.
        with open(path, 'wb') as f:
            f.write(res.content)
        print(path, '下载成功')

def main(offset):
    """Crawl one board page: record every entry and download its image.

    Args:
        offset: Value placed in the ``offset`` query parameter of the
            board URL.
    """
    url = "https://maoyan.com/board/4?offset=" + str(offset)
    html = get_page(url, headers)
    if html is None:
        # get_page() returns None when the download fails; skip this page
        # instead of handing None to the regex parser.
        return
    for item in parse_page(html):
        write_file(item)
        save_picture(item['image'], 'picture/' + item['title'] + '.jpg')


if __name__ == '__main__':
    # Visit the ten board pages: offsets 0, 10, ..., 90.
    for page_offset in range(0, 100, 10):
        main(page_offset)

运行结果
初入Python(猫眼top100)

上一篇:python爬虫入门新手向实战 - 爬取猫眼电影Top100排行榜


下一篇:爬取猫眼电影榜单TOP100榜-以mysql数据库保存