QiushuSpider
# -*- coding: utf-8 -*-
import time

import scrapy

from qiushu.items import QiushuItem


class QiushuspiderSpider(scrapy.Spider):
    """Spider for www.qiushu.cc.

    Crawl order: start page -> category links (``parse``) -> book listing
    pages, including pagination (``parse_books``) -> individual book pages,
    each of which yields one ``QiushuItem`` (``parse_section``).

    NOTE(review): every request is issued with ``dont_filter=True``, which
    bypasses Scrapy's duplicate filter, so a page reachable through several
    links is fetched once per link. Kept as in the original -- confirm that
    this is intended.
    """

    name = 'QiushuSpider'
    allowed_domains = ['www.qiushu.cc']
    start_urls = ['http://www.qiushu.cc/']

    def parse(self, response):
        """Parse the start page and follow every category link.

        Yields one request per category href, handled by ``parse_books``.
        """
        category_hrefs = response.xpath('//p[@class="hot_tips"]/a/@href').extract()
        for href in category_hrefs:
            # urljoin resolves root-relative, relative and absolute hrefs alike,
            # whereas plain string concatenation only worked for "/path" hrefs.
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_books,
                dont_filter=True,
            )

    def parse_books(self, response):
        """Parse one book listing page.

        Yields a request per listed book (handled by ``parse_section``) and,
        when a next-page link exists, a request for the following listing
        page (handled by this method again).
        """
        for row in response.xpath('//*[@id="main"]/div[1]/div/div/ul/li'):
            href = row.xpath('.//span[@class="t1"]/a/@href').extract_first()
            if not href:
                # extract_first() returns None when the row has no title link;
                # there is nothing to follow, so skip instead of crashing.
                continue
            book_url = response.urljoin(href)
            self.logger.debug('book url: %s', book_url)
            yield scrapy.Request(
                book_url,
                callback=self.parse_section,
                dont_filter=True,
            )

        # Take only the first match: joining several "next" hrefs together
        # would produce an invalid URL.
        next_href = response.xpath('//*[@class="next"]/@href').extract_first()
        if next_href:
            yield scrapy.Request(
                response.urljoin(next_href),
                callback=self.parse_books,
                dont_filter=True,
            )

    @staticmethod
    def _joined_text(response, query):
        """Return all strings matched by XPath ``query`` concatenated ('' if none)."""
        return ''.join(response.xpath(query).extract())

    def parse_section(self, response):
        """Parse a single book page and yield one populated ``QiushuItem``."""
        item = QiushuItem()
        # Book title
        item['name'] = self._joined_text(response, '//div[@class="title"]/h1/text()')
        # Author
        item['author'] = self._joined_text(response, '//div[@class="title"]/span/text()')
        # Category label: the text after the first '>' in the matched text node.
        # Fall back to '' when no '>' is present instead of raising IndexError.
        crumb_parts = self._joined_text(
            response, '//*[@id="main"]/div[2]/text()[2]'
        ).split('>')
        item['booktype'] = crumb_parts[1] if len(crumb_parts) > 1 else ''
        # Book status
        item['state'] = self._joined_text(response, '//*[@id="main"]/div[2]/span/text()')
        # URL of the book page itself
        item['showUrl'] = response.url
        # Book description
        item['describe'] = self._joined_text(response, '//div[@class="intro"]/p/text()')
        yield item
items
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class QiushuItem(scrapy.Item):
    """Scrapy item holding the fields scraped for one book."""

    name = scrapy.Field()  # book title
    author = scrapy.Field()  # author
    booktype = scrapy.Field()  # category label of the book
    state = scrapy.Field()  # status of the book
    showUrl = scrapy.Field()  # URL at which the book can be reached
    describe = scrapy.Field()  # description of the book