Scrapy爬取色花堂磁力和图片
重点说明爬取图片
一.创建项目
scrapy startproject SeHuaTang
scrapy genspider sehuatang rtretyrytre.xyz
二.修改settings.py文件
# Scrapy log verbosity: only WARNING and above is printed.
LOG_LEVEL = "WARNING"
# Base address of the target site; the spider builds all of its URLs from this.
BASE_URL="https://rtretyrytre.xyz/"
# Do not honour robots.txt for this crawl.
ROBOTSTXT_OBEY = False
# Directory where scrapy's ImagesPipeline stores downloaded images.
import os
# # Alternative: keep the images inside the project directory itself.
# project_dir = os.path.abspath(os.path.dirname(__file__)) # absolute path of this scrapy project
# IMAGES_STORE = os.path.join(project_dir, 'images') # assemble the image directory path
IMAGES_STORE = 'D:/ImageSpider/'
# Enable the custom pipeline (ImagesPipeline subclass defined in pipelines.py).
ITEM_PIPELINES = {
'SeHuaTang.pipelines.SeHuaTangPipeline': 300,
}
三.修改items.py文件
import scrapy
class SeHuaTangItem(scrapy.Item):
    """One forum thread scraped from the site, carried between the
    spider and the image/MongoDB pipeline."""

    # Thread (movie) title.
    common = scrapy.Field()
    # View count shown on the list page.
    num = scrapy.Field()
    # Absolute URL of the thread detail page.
    url = scrapy.Field()
    # Magnet links extracted from the detail page.
    cili = scrapy.Field()
    # Image URLs extracted from the detail page.
    picture = scrapy.Field()
四.修改SeHuaTang.py文件
import re
import scrapy
from SeHuaTang.settings import BASE_URL
from SeHuaTang.items import SeHuaTangItem
class SeHuaTangSpider(scrapy.Spider):
    """Crawl the forum list pages and follow every thread to its detail
    page, extracting magnet links and attachment-image URLs."""

    # Site root, taken from settings so it can be changed in one place.
    base_url = BASE_URL
    name = 'sehuatang'  # spider name used by `scrapy crawl sehuatang`
    # allowed_domains intentionally unset: BASE_URL changes frequently.
    # rstrip avoids the double slash that plain concatenation produces
    # when BASE_URL ends with '/'.
    start_urls = [base_url.rstrip('/') + '/forum-2-2.html']

    def parse(self, response):
        """Parse one list page: yield a detail-page request per thread
        row, then follow the pager's "next" link."""
        # The first five rows are header/sticky threads and the last two
        # are footer rows, so slice them away.
        tr_list = response.xpath('//table//tr')[5:-2]
        for tr in tr_list:
            item = SeHuaTangItem()
            # Thread title.
            item["common"] = tr.xpath('./th/a[@onclick="atarget(this)"]/text()').extract_first()
            # View count.
            item["num"] = tr.xpath('./td[@class="num"]/em/text()').extract_first()
            href = tr.xpath('./th/a[@onclick="atarget(this)"]/@href').extract_first()
            if not href:
                # Row without a title link (e.g. a deleted thread):
                # the original code would have built "<base>None" here.
                continue
            # urljoin handles relative hrefs and the trailing '/' in
            # BASE_URL correctly, unlike raw string concatenation.
            item["url"] = response.urljoin(href)
            yield scrapy.Request(
                url=item["url"],
                callback=self.parse1,
                meta={'item': item}
            )
        # Number of list pages to crawl. The real total could be read
        # from the pager label, but is deliberately capped for the demo:
        # page_count = str(response.xpath('//*[@id="fd_page_bottom"]/div/label/span/text()')
        #                  .extract_first()).replace('/', "").replace("页", "")
        page_count = 2
        # Current page number shown in the pager; may be None if the
        # pager is missing, which previously crashed int().
        current_page = response.xpath('//*[@id="fd_page_bottom"]/div/strong/text()').extract_first()
        if current_page is not None and int(page_count) != int(current_page):
            # Not on the last page yet: follow the "next" link.
            next_url = response.xpath('//*[@id="fd_page_bottom"]/div/a[@class="nxt"]/@href').extract_first()
            if next_url:
                yield scrapy.Request(
                    url=response.urljoin(next_url),
                    callback=self.parse
                )

    # Detail-page handler.
    def parse1(self, response, **kwargs):
        """Parse a thread detail page: extract magnet links and the
        attachment-image URLs, then yield the completed item."""
        item = response.meta['item']
        # Post-body cell that contains the magnet link text.
        text_xpath = '/html/body/div[6]/div[6]/div[2]/div[1]/table//tr[1]/td[2]/div[2]/div/div[1]/table//tr/td'
        post_text = response.xpath(text_xpath).extract_first()
        if post_text:
            # Raw string: "\?" in a plain literal is an invalid escape
            # on modern Python. 40 hex chars is a BTIH info-hash.
            item["cili"] = re.findall(r"(magnet:\?xt=urn:btih:[0-9a-fA-F]{40}.*?)", post_text)
        else:
            # Cell missing (layout change / deleted post): re.findall on
            # None would raise TypeError, so record "no links" instead.
            item["cili"] = []
        # Attachment images embedded in the post body.
        picture_xpath = '//img[@class="zoom"]/@file'
        item["picture"] = response.xpath(picture_xpath).extract()
        yield item
五.修改pipelines.py文件
重点在于继承ImagesPipeline这个类
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import math
import os
import random
import time
import pymongo
import scrapy
from SeHuaTang.settings import IMAGES_STORE
from scrapy.pipelines.images import ImagesPipeline
class SeHuaTangPipeline(ImagesPipeline):
    """ImagesPipeline subclass that also persists every item to MongoDB
    and renames each downloaded image after the thread title."""

    # Shared, lazily created MongoDB client. The original code opened a
    # brand-new MongoClient for every item, leaking connections.
    _mongo_client = None

    def _get_collection(self):
        """Return the SeHuaTang.demo collection, creating the client on
        first use and reusing it for the rest of the crawl."""
        if self._mongo_client is None:
            self._mongo_client = pymongo.MongoClient("mongodb://localhost:27017/")
        return self._mongo_client['SeHuaTang']["demo"]

    # Download the images.
    def get_media_requests(self, item, info):
        """Store the item in MongoDB, then request every image URL so
        the base ImagesPipeline downloads it."""
        self._get_collection().insert_one(dict(item))
        for image_url in item['picture']:
            yield scrapy.Request(image_url)

    # Rename the downloaded images.
    def item_completed(self, results, item, info):
        """Rename each successfully downloaded image inside IMAGES_STORE
        to '<title><timestamp><random>.jpg'.

        results: list of (success, info_dict) pairs from ImagesPipeline;
        only the successful entries carry a 'path' relative to
        IMAGES_STORE.
        """
        image_paths = [x["path"] for ok, x in results if ok]
        for path in image_paths:
            old_name = os.path.join(IMAGES_STORE, path)
            if os.path.exists(old_name):
                # Timestamp + random suffix keeps names unique even when
                # several images belong to the same thread.
                # NOTE(review): item["common"] may contain characters
                # that are invalid in Windows filenames — consider
                # sanitising the title before shipping.
                new_name = os.path.join(
                    IMAGES_STORE,
                    str(item["common"])
                    + time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
                    + str(random.randint(1, 10000000))
                    + '.jpg')
                os.rename(old_name, new_name)
        return item