start
# Launcher script: starts the crawl from Python, which is handy for running
# or debugging the spider from an IDE instead of a shell.
from scrapy.cmdline import execute

# Same argument vector the shell command ``scrapy crawl jokespider`` produces.
crawl_argv = ['scrapy', 'crawl', 'jokespider']
execute(crawl_argv)
items.py
import scrapy


class JokejiItem(scrapy.Item):
    """Item type returned by the spider's ``parse_item`` callback.

    Having a dedicated class per page type lets the pipeline tell items
    apart by inspecting their class.
    """

    title = scrapy.Field()
    url = scrapy.Field()


class ListItem(scrapy.Item):
    """Item type returned by the spider's ``parse_list`` callback.

    Declares the same fields as ``JokejiItem``; only the class differs.
    """

    title = scrapy.Field()
    url = scrapy.Field()
spider.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jokeji.items import JokejiItem, ListItem


class JokespiderSpider(CrawlSpider):
    """Crawl zizi.cn and return a different item class per page type.

    Links matching the list-page pattern are handled by ``parse_list`` and
    yield a ``ListItem``; links matching the joke-page pattern are handled
    by ``parse_item`` and yield a ``JokejiItem``. Both rules keep following
    links found on the pages they match.
    """

    name = 'jokespider'
    allowed_domains = ['zizi.cn']
    start_urls = ['http://www.zizi.cn']

    rules = [
        # List pages. The dot is escaped so that only a literal ".htm"
        # matches (an unescaped "." would accept any character there).
        Rule(
            LinkExtractor(allow=r'/list\w+\.htm'),
            callback='parse_list',
            follow=True,
        ),
        # Joke pages. ``deny`` is a real one-element tuple; URLs containing
        # "/list" are excluded so they are only handled by the rule above.
        Rule(
            LinkExtractor(allow=r'/jokehtml/\w+/\d+\.htm', deny=(r'/list',)),
            callback='parse_item',
            follow=True,
        ),
    ]

    def parse_item(self, response):
        """Build the item for a joke page.

        Args:
            response: The downloaded page; not inspected here.

        Returns:
            A ``JokejiItem`` whose ``title`` is a fixed placeholder string.
        """
        item = JokejiItem()
        item['title'] = 'from content'
        return item

    def parse_list(self, response):
        """Build the item for a list page.

        Args:
            response: The downloaded page; only its ``url`` is used.

        Returns:
            A ``ListItem`` whose ``url`` is a marker prefix plus the page URL.
        """
        item = ListItem()
        item['url'] = "from list........" + response.url
        return item
pipelines.py
class JokejiPipeline(object):
    """Item pipeline that prints every item together with its class.

    The printed class shows which item type arrived, which is the hook for
    handling each type differently.
    """

    def process_item(self, item, spider):
        """Print the item, its class and the spider, then pass the item on.

        Args:
            item: The item handed to the pipeline.
            spider: The spider that produced the item.

        Returns:
            The same ``item`` object, unchanged, so that any later pipeline
            stage still receives it.
        """
        # ``item.__class__`` (attribute access) identifies the item type;
        # the bare name ``item__class__`` would raise NameError.
        print(item, item.__class__, spider)
        return item
可以通过 item.__class__ 判断 item 属于哪个 Item 类，从而决定如何处理数据。
当然，也可以在 ItemClass 类里加上：
    def __str__(self):
        return 'ItemClass'
这样打印出来更直观。