Scrapy爬取色花堂磁力和图片

Scrapy爬取色花堂磁力和图片

本文重点说明如何在Scrapy中通过ImagesPipeline爬取并保存图片。

一.创建项目

scrapy startproject SeHuaTang
# Enter the project first so genspider creates the spider under SeHuaTang/spiders/.
cd SeHuaTang
# Inside a project, Scrapy rejects a spider whose name equals the project name;
# "sehuatang" matches the `name` attribute used by the spider class below.
scrapy genspider sehuatang rtretyrytre.xyz

二.修改settings.py文件

# Log level: only messages at WARNING or above are emitted
LOG_LEVEL = "WARNING"
# Base address of the site to crawl (note the trailing slash)
BASE_URL="https://rtretyrytre.xyz/"
# Do not obey robots.txt
ROBOTSTXT_OBEY = False
# Image storage location

import os
# # Alternative: store images inside the project directory
# project_dir = os.path.abspath(os.path.dirname(__file__))  # absolute path of this settings module's directory
# IMAGES_STORE = os.path.join(project_dir, 'images')  # build the image directory path
# Keep the trailing slash so the value can be used directly as a path prefix
IMAGES_STORE = 'D:/ImageSpider/'
# Enable the item pipeline (the number is its priority/order value)
ITEM_PIPELINES = {
    'SeHuaTang.pipelines.SeHuaTangPipeline': 300,
}

三.修改items.py文件

import scrapy

class SeHuaTangItem(scrapy.Item):
    """Fields collected for one forum thread."""

    common = scrapy.Field()   # thread (film) title
    num = scrapy.Field()      # view count
    url = scrapy.Field()      # detail page URL
    cili = scrapy.Field()     # magnet links
    picture = scrapy.Field()  # image URLs

四.修改SeHuaTang.py文件

import re

import scrapy

from SeHuaTang.settings import BASE_URL
from SeHuaTang.items import SeHuaTangItem


class SeHuaTangSpider(scrapy.Spider):
    """Crawl forum list pages and then every thread's detail page.

    ``parse`` reads the thread table of a list page, yields one detail-page
    request per thread, and follows the "next page" link until ``max_page``
    is reached.  ``parse1`` extracts magnet links and image URLs from a
    detail page and yields the finished item.
    """

    # Site root taken from the project settings.
    base_url = BASE_URL
    name = 'sehuatang'  # spider name used with ``scrapy crawl``
    # allowed_domains is intentionally left unset.
    # BASE_URL ends with "/", so strip it before appending the path to avoid
    # requesting "https://host//forum-2-2.html".
    start_urls = [base_url.rstrip('/') + '/forum-2-2.html']

    # Last list page to crawl.  Can be overridden per run with
    # ``scrapy crawl sehuatang -a max_page=N`` (the value arrives as a string
    # and is converted with int() where it is used).
    max_page = 2

    # Raw string: "\?" in a normal string literal is an invalid escape.
    _MAGNET_RE = re.compile(r"(magnet:\?xt=urn:btih:[0-9a-fA-F]{40}.*?)")

    def parse(self, response):
        """Yield a detail-page request per thread row, then the next list page.

        :param response: response of a forum list page.
        """
        link_xpath = './th/a[@onclick="atarget(this)"]'
        # The slice skips leading and trailing rows of the table; presumably
        # those are header/pinned rows and footer rows -- confirm against the
        # live page layout.
        for tr in response.xpath('//table//tr')[5:-2]:
            href = tr.xpath(link_xpath + '/@href').extract_first()
            if not href:
                # A row without a thread link would otherwise produce the
                # bogus URL "<base_url>None".
                continue
            item = SeHuaTangItem()
            # Thread title
            item["common"] = tr.xpath(link_xpath + '/text()').extract_first()
            # View count
            item["num"] = tr.xpath('./td[@class="num"]/em/text()').extract_first()
            # Detail page URL, resolved against the current page
            item["url"] = response.urljoin(href)
            yield scrapy.Request(
                url=item["url"],
                callback=self.parse1,
                meta={'item': item},
            )

        pager = '//*[@id="fd_page_bottom"]/div'
        current_text = response.xpath(pager + '/strong/text()').extract_first()
        next_href = response.xpath(pager + '/a[@class="nxt"]/@href').extract_first()
        if current_text is None or next_href is None:
            # No pager or no "next" link: nothing more to follow.
            return
        try:
            current_page = int(current_text)
            page_count = int(self.max_page)
        except ValueError:
            self.logger.warning(
                "Cannot read page numbers (current=%r, max_page=%r); stopping pagination",
                current_text, self.max_page,
            )
            return
        # "<" instead of "!=" so a start page beyond max_page cannot run on
        # to the end of the forum.
        if current_page < page_count:
            next_url = response.urljoin(next_href)
            print(next_url, page_count, current_page)
            yield scrapy.Request(
                url=next_url,
                callback=self.parse,
            )

    # Detail page handler
    def parse1(self, response, **kwargs):
        """Fill in magnet links and image URLs, then yield the item.

        :param response: response of a thread detail page; the partially
            filled item travels in ``response.meta['item']``.
        """
        item = response.meta['item']
        # Positional XPath to the table cell that holds the magnet link.
        text_xpath = '/html/body/div[6]/div[6]/div[2]/div[1]/table//tr[1]/td[2]/div[2]/div/div[1]/table//tr/td'
        # extract_first() returns None when the XPath does not match;
        # fall back to "" so the regex search yields an empty list instead
        # of raising TypeError.
        cell_html = response.xpath(text_xpath).extract_first() or ""
        item["cili"] = self._MAGNET_RE.findall(cell_html)
        # Image URLs; urljoin leaves absolute URLs untouched and resolves
        # relative ones so they can be requested later.
        picture_xpath = '//img[@class="zoom"]/@file'
        item["picture"] = [
            response.urljoin(src)
            for src in response.xpath(picture_xpath).extract()
        ]
        yield item

五.修改pipelines.py文件

重点在于让管道类继承Scrapy内置的ImagesPipeline,并重写get_media_requests(发起图片下载请求)和item_completed(下载完成后重命名图片)这两个方法。

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
import math
import os
import random
import time
import pymongo

import scrapy
from SeHuaTang.settings import IMAGES_STORE
from scrapy.pipelines.images import ImagesPipeline

class SeHuaTangPipeline(ImagesPipeline):
    """Store each item in MongoDB, download its images and rename them.

    ``get_media_requests`` saves the item and yields one download request per
    image URL; ``item_completed`` renames every successfully downloaded file
    to ``<title><timestamp><random>.jpg`` inside ``IMAGES_STORE``.
    """

    # Characters that are illegal in Windows file names (plus control
    # whitespace) are replaced by "_" before the title is used in a path.
    _UNSAFE_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|\r\n\t'})

    # Created lazily on first use and shared by all items, instead of opening
    # a new client (and its connection pool) for every item.
    _mongo_client = None

    def _collection(self):
        """Return the ``SeHuaTang.demo`` collection, connecting on first use."""
        if self._mongo_client is None:
            self._mongo_client = pymongo.MongoClient("mongodb://localhost:27017/")
        return self._mongo_client['SeHuaTang']["demo"]

    def close_spider(self, spider=None):
        """Close the MongoDB client when the spider finishes."""
        if self._mongo_client is not None:
            self._mongo_client.close()
            self._mongo_client = None

    # Download images
    def get_media_requests(self, item, info):
        """Save the item to MongoDB and yield a request for each image URL.

        :param item: scraped item; ``item['picture']`` is the list of image URLs.
        :param info: pipeline state object passed in by Scrapy (unused here).
        """
        self._collection().insert_one(dict(item))

        # A missing or empty picture list simply yields no requests.
        for image_url in item.get('picture') or []:
            yield scrapy.Request(image_url)

    # Rename downloaded images
    def item_completed(self, results, item, info):
        """Rename the downloaded files after the item's title.

        :param results: list of ``(ok, data)`` pairs; for successful downloads
            ``data['path']`` is the file path relative to ``IMAGES_STORE``.
        :param item: the item the images belong to.
        :param info: pipeline state object passed in by Scrapy (unused here).
        :return: the unchanged item, so later pipelines still receive it.
        """
        title = str(item.get("common")).translate(self._UNSAFE_CHARS)
        for ok, data in results:
            if not ok:
                continue
            old_path = os.path.join(IMAGES_STORE, data["path"])
            if not os.path.exists(old_path):
                continue
            stamp = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
            # The random suffix keeps several images of one item, renamed
            # within the same second, from overwriting each other.
            suffix = random.randint(1, 10000000)
            new_path = os.path.join(IMAGES_STORE, f"{title}{stamp}{suffix}.jpg")
            os.rename(old_path, new_path)
        return item

上一篇:商品采购系统(js+jq)


下一篇:append/prepend,after/before之间的关系和区别