分析Ajax来爬取今日头条街拍美图并保存到MongoDB

2022-03-01 13:16:57
前提:需要安装MongoDB
注:因今日头条网页发生变更,如下代码不保证能正常使用
#!/usr/bin/env python

#-*- coding: utf-8 -*-

import json

import os

from urllib.parse import urlencode

import pymongo

import requests

from bs4 import BeautifulSoup

from requests.exceptions import ConnectionError

import re

from multiprocessing import Pool

from hashlib import md5

from json.decoder import JSONDecodeError

# Host passed to pymongo.MongoClient below.
MONGO_URL = 'localhost'

# Name of the database selected from the client.
MONGO_DB = 'toutiao'

# Name of the collection that save_to_mongo() inserts into.
MONGO_TABLE = 'toutiao'

# Inclusive range of page indexes to crawl; each index is multiplied by 20 in
# the __main__ block to obtain the ``offset`` request parameter.
GROUP_START = 1

GROUP_END = 20

# Search keyword sent to the search endpoint by main().
KEYWORD='街拍'

# NOTE(review): per the PyMongo docs, connect=False defers connecting until
# the first operation -- presumably so each multiprocessing worker opens its
# own connection after the fork; confirm against the PyMongo version in use.
client = pymongo.MongoClient(MONGO_URL, connect=False)

db = client[MONGO_DB]

def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of search results from the Toutiao search endpoint.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the worker forever.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any request failure).
    """
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # RequestException is the base class of ConnectionError, Timeout and
        # the other errors requests raises, so every failure maps to None.
        print('Error occurred')
        return None
    if response.status_code == 200:
        return response.text
    return None

def download_image(url, timeout=10):
    """Download one image and hand its bytes to ``save_image``.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always ``None``; the image is stored as a side effect when the
        server answers with HTTP 200.
    """
    print('Downloading', url)
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Covers connection failures, timeouts and malformed/missing URLs
        # (e.g. url is None), so one bad image does not abort the crawl.
        return None
    if response.status_code == 200:
        save_image(response.content)
    return None

def save_image(content):
    """Write image bytes into the current working directory.

    The file is named ``<md5 hex digest of content>.jpg``, so identical
    payloads map to the same path; an existing file is left untouched.

    Args:
        content: Raw image bytes.
    """
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if os.path.exists(file_path):
        return
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(file_path, 'wb') as f:
        f.write(content)

def parse_page_index(text):
    """Yield the article URLs contained in a search-result JSON document.

    Args:
        text: JSON text of a search response, or ``None`` when the fetch
            failed.

    Yields:
        The ``article_url`` of every entry in the top-level ``data`` list.
        Entries that are not objects or have no ``article_url`` are skipped.
        Nothing is yielded for missing, malformed or unexpectedly shaped
        input.
    """
    if not text:
        # get_page_index() returns None on failure; nothing to parse.
        return
    try:
        data = json.loads(text)
    except (JSONDecodeError, TypeError):
        return
    if not isinstance(data, dict):
        return
    # 'data' may be present but null, hence the ``or []``.
    for item in data.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url

def get_page_detail(url, timeout=10):
    """Fetch the HTML of a single article page.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the worker forever.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any request failure).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Base class of ConnectionError, Timeout and invalid-URL errors.
        print('Error occurred')
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_page_detail(html, url):
    """Extract the title and gallery images from an article page.

    Every image URL found is also passed to ``download_image`` as a side
    effect.

    Args:
        html: Page source, or ``None`` when the fetch failed.
        url: Address the page was fetched from; stored in the result.

    Returns:
        A dict with keys ``title``, ``url`` and ``images`` when the page
        embeds a ``gallery: JSON.parse("...")`` payload containing
        ``sub_images``; otherwise ``None``.
    """
    if not html:
        # get_page_detail() returns None on failure; nothing to parse.
        return None

    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''

    # Raw string: '\(' in a normal literal is an invalid escape sequence.
    images_pattern = re.compile(r'gallery: JSON.parse\("(.*)"\)', re.S)
    result = re.search(images_pattern, html)
    if not result:
        return None

    try:
        # The payload is a JS string literal; stripping the backslashes is a
        # crude way to unescape it, so the result may not be valid JSON.
        data = json.loads(result.group(1).replace('\\', ''))
    except JSONDecodeError:
        return None

    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    sub_images = data.get('sub_images')
    images = [item.get('url') for item in sub_images]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images
    }

def save_to_mongo(result):
    """Insert one scraped record into the configured MongoDB collection.

    Args:
        result: Document (dict) to store.

    Returns:
        ``True`` when the insert produced a document id, ``False`` otherwise.
    """
    # Collection.insert() was deprecated in PyMongo 3.0 and removed in 4.0;
    # insert_one() is the supported single-document API.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.inserted_id is not None:
        print('Successfully Saved to Mongo', result)
        return True
    return False

def main(offset):
    """Crawl one search-result page and store every gallery found on it.

    Fetches the result page for ``KEYWORD`` at ``offset``, then downloads
    and parses each listed article, saving parsed results to MongoDB.

    Args:
        offset: Value passed as the ``offset`` parameter of the search.
    """
    text = get_page_index(offset, KEYWORD)
    if text is None:
        # Index request failed; there is nothing to iterate over.
        return

    for url in parse_page_index(text):
        if not url:
            # Entries without an article URL cannot be fetched.
            continue
        html = get_page_detail(url)
        print(html)
        if html is None:
            # Detail request failed; skip instead of crashing the parser.
            continue
        result = parse_page_detail(html, url)
        print(result)
        if result:
            save_to_mongo(result)

if __name__ == '__main__':
    # One task per result page: offsets GROUP_START * 20 .. GROUP_END * 20.
    offsets = [page * 20 for page in range(GROUP_START, GROUP_END + 1)]

    # Fan the offsets out over a process pool, then wait for every worker.
    workers = Pool()
    workers.map(main, offsets)
    workers.close()
    workers.join()
码农公寓

相关文章