1. Create the Scrapy project and spider
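The commands are not shown in the post; scaffolding the project with the standard Scrapy CLI would look like this (the project and spider names match the code in the following steps):

scrapy startproject top250
cd top250
scrapy genspider top250 movie.douban.com

If your Scrapy version refuses to generate a spider with the same name as the project, simply create top250/spiders/top250_spider.py by hand, as shown in step 3.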
2. Configure settings.py
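The post does not show the contents of settings.py. Below is a minimal sketch of the entries this project needs; the ITEM_PIPELINES entry matches the pipeline defined in step 6, while ROBOTSTXT_OBEY and USER_AGENT are assumptions (Douban typically rejects the default Scrapy user agent and disallows crawlers in robots.txt):

# settings.py -- only the project-specific entries
BOT_NAME = "top250"
SPIDER_MODULES = ["top250.spiders"]
NEWSPIDER_MODULE = "top250.spiders"

USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"  # browser-like UA (assumption)
ROBOTSTXT_OBEY = False  # assumption: douban.com disallows crawlers in robots.txt

# register the MySQL pipeline from step 6
ITEM_PIPELINES = {
    "top250.pipelines.Top250Pipeline": 300,
}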
3. Write the spider
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
import scrapy
import sys
sys.path.append("..")
from top250.items import Top250Item

class Top250Spider(scrapy.Spider):
    name = "top250"
    allowed_domains = ["movie.douban.com"]  # match the domain of start_urls
    start_urls = ["https://movie.douban.com/top250"]

    def parse(self, response):
        sel = scrapy.selector.Selector(response)  # wrap the response in a Selector
        sites = sel.xpath("//li/div/div[2]")      # one node per movie entry
        for site in sites:
            item = Top250Item()
            # extract_first() returns a single string rather than a one-element list,
            # which inserts cleanly into the MySQL columns in the pipeline
            item['title'] = site.xpath("div[1]/a/span[1]/text()").extract_first()
            item['link'] = site.xpath("div[1]/a/@href").extract_first()
            item['dc'] = site.xpath("div[2]/p[2]/span/text()").extract_first()
            yield item  # important: yield each item, so there is no need to collect them in a list
4. items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

class Top250Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()  # movie title
    link = scrapy.Field()   # detail-page URL
    dc = scrapy.Field()     # one-line description/quote
5. Create the MySQL database and table
Omitted; see the sketch below.
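As a reference, the database and table that the pipeline in step 6 expects can be created with pymysql as sketched below; the database name db2 and the title/link/dc columns come from that code, while the column types and lengths are assumptions:

# a minimal sketch: create the database and table used by the pipeline in step 6
import pymysql

connect = pymysql.connect(host="localhost", user="root", password="", charset="utf8")
cursor = connect.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS db2 DEFAULT CHARACTER SET utf8")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS db2.top250 (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),   -- movie title
        link VARCHAR(255),    -- detail-page URL
        dc VARCHAR(255)       -- one-line description/quote
    ) DEFAULT CHARSET=utf8
""")
connect.commit()
cursor.close()
connect.close()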
6. Define the pipeline (pipelines.py)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json
import pymysql

class Top250Pipeline(object):
    def __init__(self):
        self.connect = pymysql.connect(host="localhost", user="root", password="",
                                       db="db2", charset="utf8", use_unicode=True)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        self.sql = "insert into top250(title,link,dc) values (%s,%s,%s)"
        self.cursor.execute(self.sql, (item['title'], item['link'], item['dc']))
        self.connect.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
        '''
        with open('items.json', 'a') as f:
            json.dump(dict(item), f, ensure_ascii=False)
            f.write(',\n')
        '''  # this commented-out block goes in process_item if you want to export the items to a JSON file instead of MySQL
7. In a command prompt in the top250 project directory, run scrapy crawl top250
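If you only want a JSON dump instead of the MySQL table, Scrapy's built-in feed export can write the items directly, without the commented-out pipeline code:

scrapy crawl top250 -o items.json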