与下载图片类似:
1.item中需要有固定的字段
# Fields to declare inside the project's scrapy.Item subclass.
file_urls = scrapy.Field()  # URL of the file to download; set by the spider
files = scrapy.Field()  # companion field kept from the original notes
# DownloadM3u8Pipeline.item_completed assigns item['m3u8_paths']; a scrapy.Item
# rejects assignment to undeclared fields, so it has to be declared here too.
m3u8_paths = scrapy.Field()
2.获取到文件的url,通过item["file_urls"]传送到 pipelines
def parse_item(self, response):
    """Extract the first ``index.m3u8`` playlist URL from the page and yield it.

    The body is decoded with ``response.encoding`` and every backslash is
    removed before matching (presumably because the URL is embedded with
    escaped slashes -- confirm against the target pages).

    Assumes ``self`` is a Scrapy spider, which provides ``self.logger``.

    Yields:
        ScrapyanthingItem: item whose ``file_urls`` holds the matched URL as
        a single string (not a list). Nothing is yielded when the page
        contains no matching URL.
    """
    data = response.body.decode(response.encoding).replace("\\", "")
    match = re.search(r'https://[a-zA-Z0-9./]+/index.m3u8', data)
    if match is None:
        # Indexing the findall() result raised IndexError on pages without a
        # playlist URL; log and skip such pages instead of failing the callback.
        self.logger.warning("No m3u8 playlist URL found in %s", response.url)
        return

    item = ScrapyanthingItem()
    item["file_urls"] = match.group(0)
    yield item
3.pipelines 中处理file_urls
from scrapy.pipelines.images import ImagesPipeline, FilesPipeline
class DownloadM3u8Pipeline(FilesPipeline):
    """Files pipeline that downloads the m3u8 playlist referenced by an item.

    Reads the URL(s) from ``item['file_urls']``, stores each file under the
    directory configured by the ``FILES_STORE`` setting, and records the
    stored relative paths in ``item['m3u8_paths']``.
    """

    def get_media_requests(self, item, info):
        """Yield one download request per URL found in ``item['file_urls']``.

        The field may hold a single URL string (what the spider in these
        notes produces) or a list of URLs. A missing or empty field yields
        no requests.
        """
        urls = item.get('file_urls') or []
        if isinstance(urls, str):
            urls = [urls]
        for m3u8_url in urls:
            # The item travels along in meta, as in the original snippet.
            yield Request(m3u8_url, meta={"item": item})

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path (relative to FILES_STORE) for ``request``.

        The name is a UUID derived from the request URL plus the text after
        the URL's last dot, e.g. ``<32 hex chars>.m3u8``.

        Scrapy's FilesPipeline calls this method more than once for the same
        request, so the result must be deterministic: a random name made the
        path reported in the results differ from the file actually written.
        ``item`` is accepted because current Scrapy versions pass it as a
        keyword argument; it is not used.
        """
        stem = uuid.uuid5(uuid.NAMESPACE_URL, request.url).hex
        extension = request.url.split(".")[-1]
        return '{}.{}'.format(stem, extension)

    def item_completed(self, results, item, info):
        """Store the paths of the successful downloads on the item.

        Raises:
            DropItem: if none of the downloads succeeded.
        """
        m3u8_paths = [x['path'] for ok, x in results if ok]
        if not m3u8_paths:
            # Nothing was stored, so there is nothing useful to pass on.
            raise DropItem("Item contains no files")
        item['m3u8_paths'] = m3u8_paths
        return item
4.在 settings.py 中设置
# Enable the file download pipeline.
ITEM_PIPELINES = {
    'ScrapyAnthing.pipelines.DownloadM3u8Pipeline': 1,
}

# Directory that contains this settings module.
project_dir = os.path.dirname(__file__)

# File storage directory; the FILES_STORE setting must be specified.
FILES_STORE = os.path.join(project_dir, "warehouse/files")