Importing Data Scraped with Python into MySQL

1. Create the Scrapy project and spider
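The post does not show the exact commands (the original step was likely a screenshot), so the following is inferred from the project name top250 used throughout:

scrapy startproject top250
cd top250

The spider from step 3 then goes under top250/spiders/.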

2. Configure settings.py
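The original showed the settings as a screenshot; below is a minimal sketch of the relevant settings.py entries. The USER_AGENT value and the ROBOTSTXT_OBEY override are assumptions (Douban tends to reject Scrapy's default user agent); the ITEM_PIPELINES entry matches the pipeline class defined in step 6.

# settings.py (sketch)
BOT_NAME = "top250"
SPIDER_MODULES = ["top250.spiders"]
NEWSPIDER_MODULE = "top250.spiders"

# Assumption: Douban usually rejects Scrapy's default user agent.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"

# Assumption: robots.txt may otherwise block the crawl.
ROBOTSTXT_OBEY = False

# Register the MySQL pipeline from step 6.
ITEM_PIPELINES = {
    "top250.pipelines.Top250Pipeline": 300,
}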


3. Write the spider

#!/usr/bin/python3
# -*- coding: UTF-8 -*-
import scrapy

from top250.items import Top250Item


class Top250Spider(scrapy.Spider):
    name = "top250"
    # allowed_domains must cover the movie subdomain, or any follow-up
    # requests would be filtered out as offsite.
    allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/top250"]

    def parse(self, response):
        # Each //li/div/div[2] block holds one movie entry.
        sites = response.xpath("//li/div/div[2]")
        for site in sites:
            item = Top250Item()
            # extract_first() returns a plain string instead of a
            # one-element list, which keeps the MySQL INSERT simple.
            item['title'] = site.xpath("div[1]/a/span[1]/text()").extract_first()
            item['link'] = site.xpath("div[1]/a/@href").extract_first()
            item['dc'] = site.xpath("div[2]/p[2]/span/text()").extract_first()
            # yield is the important line: it hands each item straight to
            # the pipeline, so there is no need to collect them in a list.
            yield item
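The parse() above only handles the first page of the Top 250. If you want all ten pages, here is a sketch of following the "后页" (next) link; the span.next XPath is an assumption about Douban's pager markup:

# Sketch: add at the end of parse() to walk the remaining pages.
next_page = response.xpath('//span[@class="next"]/a/@href').extract_first()
if next_page:
    yield response.follow(next_page, callback=self.parse)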

4. Define the item fields in items.py

# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class Top250Item(scrapy.Item):
    title = scrapy.Field()  # movie title
    link = scrapy.Field()   # detail-page URL
    dc = scrapy.Field()     # description/quote line

5. Create the MySQL database and table
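The original showed this step as a screenshot; below is a minimal schema consistent with the pipeline in step 6, which connects to database db2 and inserts into top250(title, link, dc). The id column and the column lengths are assumptions:

CREATE DATABASE IF NOT EXISTS db2 DEFAULT CHARACTER SET utf8;
USE db2;

CREATE TABLE IF NOT EXISTS top250 (
    id INT AUTO_INCREMENT PRIMARY KEY,  -- assumed surrogate key
    title VARCHAR(128),                 -- movie title
    link VARCHAR(256),                  -- detail-page URL
    dc VARCHAR(256)                     -- description/quote line
) DEFAULT CHARACTER SET utf8;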

6. Define the pipeline in pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class Top250Pipeline(object):
    def __init__(self):
        # Connect once when the pipeline is created.
        self.connect = pymysql.connect(host="localhost", user="root",
                                       password="", db="db2",
                                       charset="utf8", use_unicode=True)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # Parameterized query: the driver escapes the values for us.
        sql = "insert into top250(title,link,dc) values (%s,%s,%s)"
        self.cursor.execute(sql, (item['title'], item['link'], item['dc']))
        self.connect.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()

# The commented-out block below is what process_item used when
# exporting to a JSON file instead of MySQL:
#
#        import json
#        with open('items.json', 'a') as f:
#            json.dump(dict(item), f, ensure_ascii=False)
#            f.write(',\n')

7. In a command prompt, change into the top250 project directory and run scrapy crawl top250
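To confirm the import worked, query the table from the MySQL client, for example:

USE db2;
SELECT title, link, dc FROM top250 LIMIT 5;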
