本文操作环境:ubuntu14.04
一.安装Scrapy/Mysql/MySQLdb
- 参照官网教程安装Scrapy
#sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 627220E7
#echo 'deb http://archive.scrapy.org/ubuntu scrapy main' | sudo tee /etc/apt/sources.list.d/scrapy.list
#sudo apt-get update && sudo apt-get install scrapy-0.25
- 安装Mysql
#apt-get install mysql-server
- 安装第三方库MySQLdb
#apt-get install python2.7-mysqldb
二.爬虫的目标是抓取
start一个爬虫项目,目标是抓取http://www.mininova.org/today 界面上每日更新的内容并以json格式保存。
root@alexknight:/home# scrapy startproject mininova mininova/
├── mininova
│ ├── __init__.py
│ ├── items.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ └── __init__.py
└── scrapy.cfg
items一般与pipelines.py一起使用,items可以定义自己需要抓取的格式,pipelines.py可以定义抓取内容。
items.py:
# -*- coding: utf-8 -*- # Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html import scrapy class MininovaItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
url=scrapy.Field()
name=scrapy.Field()
description=scrapy.Field()
size=scrapy.Field()
在spider新建文件mininova_spider.py:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from mininova.items import MininovaItem
class MininovaSpider(CrawlSpider):
name='mininova'
allowed_domains=['mininova.org']
start_urls = ['http://www.mininova.org/today']
rules = [Rule(LinkExtractor(allow=['/tor/\d+']), 'parse_torrent')] def parse_torrent(self, response):
#global MininovaItem
torrent = MininovaItem()
torrent['url'] = response.url
torrent['name'] = response.xpath("//h1/text()").extract()
torrent['description'] = response.xpath("//div[@id='description']").extract()
torrent['size'] = response.xpath("//div[@id='info-left']/p[2]/text()[2]").extract()
return torrent
将数据json化
root@alexknight:/home/mininova#scrapy crawl mininova -o scraped_data.json
root@alexknight:/home/mininova#vim scraped_data.json
[{"url": "http://www.mininova.org/tor/13278887", "size": [], "description": ["<div id=\"description\">\n\n<iframe id=\"share-facebook\" src=\"http://www.facebook.com/plugins/like.php?href=http://www.mininova.org/tor/13278887&layout=box_count&show_faces=false&width=50&action=like&colorscheme=light&height=65\" scrolling=\"no\" frameborder=\"0\"></iframe>\n\nYoung Lucid's debut release features Joell Oritz, one quarter of the Shady Records\u2019 super group, Slaughterhouse. The emotional production from 2 Deep & MeccaGodZilla set the tone for the lyrical duo as they trade bars about a world they long to escape. \u201cRunaway\u201d is a mere preview of Young Lucid\u2019s upcoming project, \u201cLucid Dreams\u201d which is slated for a 2015 release.<br>\n<br>\n<a target=\"_blank\" rel=\"nofollow\" href=\"http://www.younglucid.com/\">www.younglucid.com</a><br>\n<a target=\"_blank\" rel=\"nofollow\" href=\"http://www.meccagodzilla.com\">www.meccagodzilla.com</a><br>\n<br>\n<div class=\"clear-left\"></div>\n</div>"], "name": ["NEW: Runaway Ft. Joell Ortiz by Young Lucid "]}]