# ---------- spider module ----------
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import Hao6VItem
import re
class A6vSpider(CrawlSpider):
    """Crawl hao6v mirror sites and yield one Hao6VItem per movie-detail
    page (detail URLs contain a /YYYY-MM-DD/ date segment).
    """
    name = '6v'
    allowed_domains = ['www.hao6v.tv', 'www.6vw.cc', 'www.hao6v.cc',
                       'www.6vhao.tv', 'www.6vhao.com', 'www.6vhao.net',
                       'www.dy131.net', 'www.6vgood.com', 'www.hao6v.net']
    start_urls = ['https://www.hao6v.tv']
    # BUG FIX: the original declared a second, identical Rule with
    # callback='parse_itemB', but no parse_itemB method exists, so
    # CrawlSpider._compile_rules would raise AttributeError at startup.
    rules = (
        Rule(LinkExtractor(allow=r'/\d{4}-\d{2}-\d{2}/'),
             callback='parse_itemA', follow=True),
    )

    def parse_itemA(self, response):
        """Parse one movie-detail page and yield a Hao6VItem.

        The movie name is taken from the 《...》 span in <title>; the
        per-field metadata (translated title, year, region, ...) is pulled
        out of the description meta tag. Pages missing either piece are
        silently skipped.
        """
        titles = response.xpath("//title/text()").re('《(.*)》')
        if not titles:
            return
        movie_name = titles[0]

        descriptions = response.xpath("//meta[@name='description']/@content").extract()
        if not descriptions:
            return

        # Normalize the description: split on <br />, stringify the list,
        # then strip layout characters so each field reads as
        # "<label><value>'" and the patterns below can match it.
        text = str(descriptions[0].split("<br />"))
        for junk in (r"\u3000", " ", r"\r\n", "·", "】", "]:", ":"):
            text = text.replace(junk, "")
        text = text[2:-2]  # drop the "['" / "']" wrapper left by str(list)

        # One pattern per field; group(1) captures the value between the
        # 2-character label and the next single quote.
        field_patterns = (
            "译名(.*?)'",  # translated title
            "片名(.*?)'",  # original title
            "年代(.*?)'",  # year
            "产地(.*?)'",  # country/region
            "类别(.*?)'",  # genre
            "片长(.*?)'",  # runtime
            "简介(.*?)'",  # synopsis
        )
        # BUG FIX: the original wrapped this loop in a bare try/except that
        # swallowed everything, so `res` could be unbound at the yield and
        # raise NameError. Missing fields now simply become None.
        res = []
        for pattern in field_patterns:
            m = re.search(pattern, text)
            res.append(m.group(1) if m else None)

        yield Hao6VItem(电影名=movie_name, 译名=res[0], 片名=res[1],
                        年代=res[2], 产地=res[3], 类别=res[4],
                        片长=res[5], 简介=res[6])
# ---------- items module ----------
import scrapy
class Hao6VItem(scrapy.Item):
    """Movie summary record scraped from a hao6v detail page."""
    电影名 = scrapy.Field()  # movie name, from the page <title>
    译名 = scrapy.Field()    # translated title
    片名 = scrapy.Field()    # original title
    年代 = scrapy.Field()    # year
    产地 = scrapy.Field()    # country/region of production
    类别 = scrapy.Field()    # genre
    片长 = scrapy.Field()    # runtime
    简介 = scrapy.Field()    # synopsis
class Hao6VItem1(scrapy.Item):
    """Magnet-link record associated with a movie."""
    电影名 = scrapy.Field()  # movie name
    磁力下载 = scrapy.Field()  # magnet download link
# ---------- pipelines module ----------
import pandas as pd
import sqlite3
from .items import Hao6VItem
from .items import Hao6VItem1
# Module-level SQLite connection shared by both pipelines below.
# NOTE(review): opened at import time and never closed; sqlite3 connections
# are also not shareable across threads by default (check_same_thread) —
# confirm the crawler only touches this from a single thread.
dbcon = sqlite3.connect("6v.db")
class Hao6VPipeline:
    """Buffer Hao6VItem records and append them to the '电影概要' SQLite
    table in batches of ~50 via pandas, flushing the remainder at shutdown.
    """

    def open_spider(self, spider):
        # Pending-item buffer; flushed once it exceeds 50 entries and
        # again when the spider closes.
        self.Link = []

    def close_spider(self, spider):
        # BUG FIX: only write when something is buffered — the original
        # unconditionally built a DataFrame, so an empty crawl produced a
        # column-less frame written to the table.
        if self.Link:
            df = pd.DataFrame(self.Link)
            df.to_sql('电影概要', dbcon, if_exists='append')
            self.Link = []

    def process_item(self, item, spider):
        # Items of other types are passed through untouched for any
        # other pipeline stage that wants them.
        if not isinstance(item, Hao6VItem):
            return item
        self.Link.append(item)
        if len(self.Link) > 50:
            df = pd.DataFrame(self.Link)
            df.to_sql('电影概要', dbcon, if_exists='append')
            self.Link = []
        # BUG FIX: the original returned None for handled items; Scrapy
        # pipelines must return the item so downstream stages receive it.
        return item
class Hao6VPipeline1:
    """Buffer Hao6VItem1 (magnet-link) records and append them to the
    '磁力链接' SQLite table in batches of ~100 via pandas, flushing the
    remainder at shutdown.
    """

    def open_spider(self, spider):
        # Pending-item buffer; flushed once it exceeds 100 entries and
        # again when the spider closes.
        self.Link = []

    def close_spider(self, spider):
        # BUG FIX: only write when something is buffered — the original
        # unconditionally built a DataFrame, so an empty crawl produced a
        # column-less frame written to the table.
        if self.Link:
            df = pd.DataFrame(self.Link)
            df.to_sql('磁力链接', dbcon, if_exists='append')
            self.Link = []

    def process_item(self, item, spider):
        # Items of other types are passed through untouched for any
        # other pipeline stage that wants them.
        if not isinstance(item, Hao6VItem1):
            return item
        self.Link.append(item)
        if len(self.Link) > 100:
            df = pd.DataFrame(self.Link)
            df.to_sql('磁力链接', dbcon, if_exists='append')
            self.Link = []
        # BUG FIX: the original returned None for handled items; Scrapy
        # pipelines must return the item so downstream stages receive it.
        return item