目前网上有很多关于scrapy的文章,这里我主要介绍一下我在开发中遇到的问题及一些技巧:
1,以登录状态去爬取(带cookie)
-安装内容:
brew install phantomjs (MAC上)
pip install selenium
-代码:
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Work on a copy so the DesiredCapabilities.PHANTOMJS template itself is
# left untouched.
dcap = dict(DesiredCapabilities.PHANTOMJS)
# PhantomJS allows request headers to be overridden; here the User-Agent is
# replaced with an Android mobile browser string.
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"
)
#通过账号密码获得cookie的函数
def get_cookie_from_aicoin_login(account, password):
    """Log in to AIcoin through PhantomJS and return the session cookies.

    The captcha is solved by a human: a screenshot of the login page is
    written to ``aa.png`` and the code is read from stdin. The form is filled
    in and submitted again for as long as the page title still identifies
    the sign-in page.

    Args:
        account: Text typed into the ``user_account`` field.
        password: Text typed into the ``user_password`` field.

    Returns:
        A JSON string mapping cookie name to cookie value when the browser
        ends up on the home page, otherwise an empty dict.
        NOTE(review): the two outcomes have different types (``str`` vs
        ``dict``); this is kept unchanged for existing callers.
    """
    # Imported locally: the surrounding snippet never imports these modules.
    import json
    import time

    browser = webdriver.PhantomJS(
        executable_path='/usr/local/bin/phantomjs',
        desired_capabilities=dcap,
    )
    try:
        browser.get("https://www.aicoin.net.cn/sign_in")
        while 'Sign in to AIcoin' in browser.title:
            # User name field.
            username = browser.find_element_by_name("user_account")
            username.clear()
            username.send_keys(account)

            # Password field.
            psd = browser.find_element_by_name("user_password")
            psd.clear()
            psd.send_keys(password)

            # Captcha field.
            code = browser.find_element_by_name("user_verify")
            code.clear()
            # Some pages show a broken captcha; clicking it requests a new one.
            code_verify = browser.find_element_by_xpath("//button[@class='verify_code']")
            code_verify.click()
            time.sleep(1)
            # Save a screenshot of the login page so the operator can read
            # the captcha and type it in.
            browser.save_screenshot("aa.png")
            code_txt = input("请查看路径下新生成的aa.png,然后输入验证码:")
            code.send_keys(code_txt)

            # Submit the form and give the page time to navigate.
            commit = browser.find_element_by_xpath("//div[@class='sure_btn']/button[@type='submit']")
            commit.click()
            time.sleep(3)

        cookie = {elem["name"]: elem["value"] for elem in browser.get_cookies()}
        # A successful login redirects to the home page, recognised by its title.
        if 'AICoin - Leader Of Global Cryptocurrency Tickers Application' in browser.title:
            return json.dumps(cookie)
        return {}
    finally:
        # Always shut the PhantomJS process down; the original leaked it.
        browser.quit()
※特别提示:当需要爬取动态内容(js加载的内容)时,也会用到PHANTOMJS
※运行爬虫(scrapy crawl yourspider)需要先cd到该爬虫的主目录(即包含scrapy.cfg的目录);另外调试的时候可以直接使用scrapy shell yoururl进行代码测试。
2,递归爬取内容
-在scrapy对应的spider文件中添加如下代码(下面的代码用于爬取股吧的帖子和评论)
import json

import scrapy
from scrapy.http import Request
from gubaspider.items import PostItem, CommentItem


class GubaSpider(scrapy.spiders.Spider):
    """Crawl Guba (eastmoney.com) posts together with their comments.

    ``parse`` reads the post listing and schedules one request per post;
    ``parse_article`` turns a post page into a ``PostItem`` and schedules the
    remaining comment pages.

    NOTE(review): ``self.get_node_value`` and ``self.parse_comment`` are used
    below but are not part of this excerpt; they must exist on the class.
    """

    name = "guba"
    allowed_domains = ["eastmoney.com"]
    start_urls = [
        "http://guba.eastmoney.com/default_551215.html"
    ]

    def parse(self, response):
        """Extract every post row of the listing and request its article page.

        The fields scraped from the listing row are forwarded to
        ``parse_article`` through ``Request.meta``.
        """
        posts = []
        for row in response.xpath('//ul[@class="newlist"]/li'):
            posts.append({
                'title': row.xpath('span/a[2]/text()').extract()[0],
                'ar_url': row.xpath('span/a[2]/@href').extract()[0],
                'group': row.xpath('span/a[1]/text()').extract()[0],
                'comment_sum': row.xpath('cite[2]/text()').extract()[0],
                'read_sum': row.xpath('cite[1]/text()').extract()[0],
                'author': row.xpath('cite[3]/a/text()').extract()[0],
            })
        if not posts:
            return

        # Log in once per listing page rather than once per request: every
        # call starts a browser and prompts for a captcha.
        # NOTE(review): ``user`` and ``pwd`` are not defined in this excerpt;
        # they are expected to be supplied at module level.
        cookies = get_cookie_from_aicoin_login(user, pwd)
        if isinstance(cookies, str):
            # The login helper returns a JSON string on success, while
            # Request expects a dict (or a list of dicts).
            cookies = json.loads(cookies)

        for post in posts:
            # URLs scraped from the first page are crawled in turn, carrying
            # extra data (meta) and cookies; ``callback`` handles the new page.
            yield Request(
                'http://guba.eastmoney.com' + post.pop('ar_url'),
                callback=self.parse_article,
                meta=post,
                cookies=cookies,
            )

    def parse_article(self, response):
        """Build a ``PostItem`` from a post page and follow its comment pages.

        Yields the item first, then one ``Request`` per additional comment
        page, each handled by ``self.parse_comment``.
        """
        title = response.meta['title']
        group = response.meta['group']
        comment_sum = response.meta['comment_sum']
        read_sum = response.meta['read_sum']
        author = response.meta['author']

        content = response.xpath('//div[@id="zwcontent"]/div[@class="zwcontentmain"]/div[@id="zwconbody"]/div[@class="stockcodec"]').extract()
        post_time = self.get_node_value(response.xpath('//div[@id="zwcontent"]/div[@id="zwcontt"]/div[@id="zwconttb"]/div[@class="zwfbtime"]/text()').extract())
        # NOTE(review): get_node_value presumably returns 0 when nothing
        # matched -- confirm against its definition.
        if post_time != 0:
            post_type = post_time.split(' ')[-1]
            post_time = post_time[4:24]
        else:
            post_type = post_time
        good_sum = self.get_node_value(response.xpath('//div[@id="zwcontent"]/div[@class="zwconbtns clearfix"]/div[@id="zwconbtnsi_z"]/span[@id="zwpraise"]/a/span/text()').extract())
        transmit_sum = self.get_node_value(response.xpath(
            '//div[@id="zwcontent"]/div[@class="zwconbtns clearfix"]/div[@id="zwconbtnsi_zf"]/a/span/text()').extract())

        comments = response.xpath('//div[@id="zwlist"]/div[@class="zwli clearfix"]/div[@class="zwlitx"]/div[@class="zwlitxt"]/div[@class="zwlitext stockcodec"]/text()').extract()
        cm_name = response.xpath('//div[@id="zwlist"]/div[@class="zwli clearfix"]/div[@class="zwlitx"]/div[@class="zwlitxt"]/div[@class="zwlianame"]/span[@class="zwnick"]/a/text()').extract()
        cm_times = response.xpath('//div[@id="zwlist"]/div[@class="zwli clearfix"]/div[@class="zwlitx"]/div[@class="zwlitxt"]/div[@class="zwlitime"]/text()').extract()
        page_info = response.xpath(
            '//div[@id="zwlist"]/div[@class="pager talc zwpager"]/span[@id="newspage"]/@data-page').extract()

        item = PostItem()
        item['Author'] = author            # post author name
        item['Title'] = title              # post title
        item['Content'] = content          # post body
        item['PubTime'] = post_time        # publication time
        item['PostWay'] = post_type        # how it was posted (web page, etc.)
        item['Url'] = response.url         # post URL
        item['Group'] = group              # forum the post belongs to
        item['Like'] = good_sum            # number of likes
        item['Transmit'] = transmit_sum    # number of reposts
        item['Comment_Num'] = comment_sum  # number of comments
        item['Tour'] = read_sum            # number of views

        cm_list = []
        for x in range(len(cm_name)):
            if comments[x] == ' ':
                # The text node is a single space: fall back to the titles of
                # the images inside the comment, joined with '|'.
                img_titles = response.xpath(
                    '//div[@id="zwlist"]/div[' + str(x + 1) + ']/div[@class="zwlitx"]/div[@class="zwlitxt"]/div[@class="zwlitext stockcodec"]/img/@title'
                ).extract()
                comment = '|'.join(img_titles)
            else:
                comment = comments[x]
            cm_list.append({'name': cm_name[x], 'time': cm_times[x][4:], 'comment': comment})
        item['Comments'] = cm_list         # replies
        yield item  # handed to the item pipelines (stored in the DB)

        if len(page_info) > 0:
            # NOTE(review): data-page looks like
            # "<url prefix>|<comment total>|<page size>|..." -- confirm.
            page_info = page_info[0].split('|')
            sumpage = int(page_info[1]) // int(page_info[2]) + 1
            for p in range(1, sumpage):
                cm_url = 'http://guba.eastmoney.com/' + page_info[0] + str(p + 1) + '.html'
                yield Request(cm_url, callback=self.parse_comment)  # crawl the next comment page
3,将数据存入mongodb
-pipelines文件中添加自定义的pipeline类:
import pymongo


class MongoPipeline(object):
    """Item pipeline that stores every scraped item in MongoDB.

    Each item goes into the collection named after the item's class.
    """

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection URI and database name; no I/O happens here."""
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the MONGO_URI / MONGO_DATABASE settings."""
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE'),
        )

    def open_spider(self, spider):
        """Open the MongoDB client and select the database."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        """Close the MongoDB client."""
        self.client.close()

    def process_item(self, item, spider):
        """Insert *item* as a document and return it unchanged."""
        collection_name = item.__class__.__name__
        # insert_one replaces Collection.insert, which is deprecated and was
        # removed in PyMongo 4.
        self.db[collection_name].insert_one(dict(item))
        return item
-items中定义自己item:
from scrapy import Item, Field


class PostItem(Item):
    """A post together with the replies collected from its page."""

    Author = Field()       # post author name
    Title = Field()        # post title
    Content = Field()      # post body
    PubTime = Field()      # publication time
    # Top = Field()        # whether the post is pinned
    PostWay = Field()      # how it was posted (web page, etc.)
    Url = Field()          # post URL
    Group = Field()        # forum the post belongs to
    Like = Field()         # number of likes
    Transmit = Field()     # number of reposts
    Comment_Num = Field()  # number of comments
    Tour = Field()         # number of views
    Comments = Field()     # replies


class CommentItem(Item):
    """Comments tied to a URL."""

    Url = Field()       # url
    Comments = Field()  # comments
-settings中添加ITEM_PIPELINES
# Enable the MongoDB pipeline; Scrapy runs enabled pipelines in ascending
# order of the number given here.
ITEM_PIPELINES = {'gubaspider.pipelines.MongoPipeline': 300}
4,添加代理和Agent
-在middlewares中添加你定义的中间件类:
from user_agents import agents#从一个文件导入全部agent
import random


class UserAgentMiddleware(object):
    """Downloader middleware that randomises the User-Agent and sets a proxy."""

    # Proxy address attached to every request; override in a subclass to
    # route through a different proxy.
    proxy_url = "http://proxy.yourproxy:8001"

    def process_request(self, request, spider):
        """Give *request* a random User-Agent header and the proxy address.

        Nothing is returned, so Scrapy carries on processing the request.
        """
        request.headers["User-Agent"] = random.choice(agents)  # random agent
        request.meta['proxy'] = self.proxy_url                 # proxy address
-在settings中进行中间件配置
# Enable the custom downloader middleware with order value 543.
DOWNLOADER_MIDDLEWARES = {'gubaspider.middlewares.UserAgentMiddleware': 543}
-user_agents文件包含一个agent列表:
""" User-Agents """
# Pool of User-Agent strings to pick from at random.
# Every entry appears exactly once: the five repeated strings in the earlier
# list made random.choice favour those agents.
agents = [
    "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre",
    "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )",
    "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)",
    "Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a",
    "Mozilla/2.02E (Win95; U)",
    "Mozilla/3.01Gold (Win95; I)",
    "Mozilla/4.8 [en] (Windows NT 5.1; U)",
    "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)",
    "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522 (KHTML, like Gecko) Safari/419.3",
    "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
]
※ 以上部分代码参考https://github.com/LiuXingMing/SinaSpider
ITEM_PIPELINES