# pipeline
# Item pipelines: persist every scraped item as a JSON line and download
# the image it points to.
import json
import os
import requests


class JsonPipeline(object):
    """Write each item to ``xiaohua.txt`` as one JSON document per line."""

    def __init__(self):
        # json.dumps(..., ensure_ascii=False) emits raw non-ASCII characters,
        # so pin the encoding instead of relying on the platform default.
        self.file = open('xiaohua.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* to JSON, append it to the file and pass it on.

        :param item: the scraped item (anything ``dict()`` accepts)
        :param spider: the spider that produced the item (unused)
        :return: the unchanged item, so later pipelines still receive it
        """
        line = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(line + '\n')
        # Flush per item so a crashed crawl still leaves complete lines.
        self.file.flush()
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.file.close()


class FilePipeline(object):
    """Download the image referenced by each item into the ``imgs`` folder."""

    # Seconds to wait for the image server before giving up; without a
    # timeout a single stalled connection blocks the pipeline forever.
    download_timeout = 30

    def __init__(self):
        os.makedirs('imgs', exist_ok=True)

    def process_item(self, item, spider):
        """Fetch ``item['url']`` and store it as ``imgs/<name>_<school>.jpg``.

        :param item: mapping with the keys ``url``, ``name`` and ``school``
        :param spider: the spider that produced the item (unused)
        :return: the unchanged item, so later pipelines still receive it
        :raises requests.RequestException: on network failure, timeout or a
            non-success HTTP status (instead of saving an error page as .jpg)
        """
        file_name = '%s_%s.jpg' % (item['name'], item['school'])
        # The name parts come from scraped pages: neutralise path separators
        # so the file cannot land outside the ``imgs`` directory.
        for separator in ('/', os.sep, os.altsep):
            if separator:
                file_name = file_name.replace(separator, '_')

        response = requests.get(
            item['url'], stream=True, timeout=self.download_timeout)
        try:
            response.raise_for_status()
            with open(os.path.join('imgs', file_name), mode='wb') as f:
                # stream=True was requested, so actually stream the body
                # instead of buffering the whole image in memory.
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        finally:
            response.close()
        return item
# Setting: the integer after each entry determines the order in which the
# pipelines run. An item passes through them from the lowest number to the
# highest; these numbers are usually chosen within the 0-1000 range.
ITEM_PIPELINES = {
    'spider1.pipelines.JsonPipeline': 100,
    'spider1.pipelines.FilePipeline': 300,
}
# Custom pipeline
from scrapy.exceptions import DropItem


class CustomPipeline(object):
    """Template pipeline showing every hook a pipeline may define."""

    def __init__(self, v):
        self.value = v

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called at initialization time to create the pipeline object.

        :param crawler: the crawler whose settings supply the 'MMMM' value
        :return: a pipeline instance built from that integer setting
        """
        setting_value = crawler.settings.getint('MMMM')
        return cls(setting_value)

    def open_spider(self, spider):
        """
        Called when the spider starts running.

        :param spider:
        :return:
        """
        print('000000')

    def process_item(self, item, spider):
        # Process the item and persist it here.
        #
        # Returning the item lets the following pipelines keep handling it.
        # To discard the item instead, so that no later pipeline sees it,
        # raise DropItem() rather than returning.
        return item

    def close_spider(self, spider):
        """
        Called when the spider is closed.

        :param spider:
        :return:
        """
        print('111111')