Crawling a specific Baidu Tieba forum with Scrapy
Crawl only the threads posted or replied to today and yesterday in the target forum, and run the spider on a daily schedule so that each scheduled run produces an output file.
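The spider below imports OneItem from BaiduPOA.items, which is not shown in this post. A minimal sketch of that item class, assuming it only declares the fields the spider actually fills (link, comment1, time1, comment2, time2), might look like this:

# BaiduPOA/items.py -- minimal sketch; only the fields used by the spider are assumed here
import scrapy

class OneItem(scrapy.Item):
    link = scrapy.Field()      # thread URL
    comment1 = scrapy.Field()  # main post text
    time1 = scrapy.Field()     # main post timestamp
    comment2 = scrapy.Field()  # reply texts (list, one entry per reply)
    time2 = scrapy.Field()     # reply timestamps (list, one entry per reply)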
# -*- coding: utf-8 -*-
import datetime

import scrapy
from scrapy_splash.request import SplashRequest, SplashFormRequest  # the last-reply time is rendered dynamically; scrapy.Request can be swapped for SplashRequest if needed
from BaiduPOA.items import OneItem


class PublicSentimentSpider(scrapy.Spider):
    name = "publicSentiment"
    allowed_domains = ["tieba.baidu.com"]
    # Start from the first page of the target forum's thread list
    start_urls = [
        "http://tieba.baidu.com/f?kw=%E5%90%89%E6%9E%97%E5%8A%A8%E7%94%BB%E5%AD%A6%E9%99%A2&ie=utf-8&pn=0"]
    # Collect today's and yesterday's thread links from the list page and hand each one to
    # parse_detail; use the last (50th) thread's reply time on the page to decide whether to
    # fetch the next list page.
    def parse(self, response):
        # Yesterday's date formatted like "1-1" (month-day without leading zeros), matching Tieba's display
        oneDayAgo = datetime.datetime.now() - datetime.timedelta(days=1)
        month = str(int(oneDayAgo.strftime('%m')))
        day = str(int(oneDayAgo.strftime('%d')))
        oneDayAgoStyle = month + '-' + day
        # Extract each thread's URL and last-reply time with XPath
        node_list = response.xpath("//div[@class='t_con cleafix']")
        count = 0
        print(node_list)
        for node in node_list:
            count = count + 1
            item = OneItem()
            item['link'] = 'https://tieba.baidu.com' + node.xpath("./div/div/div/a[@class='j_th_tit ']/@href").extract()[0]
            last_reply_time = node.xpath('normalize-space(./div/div/div/span[@title="最后回复时间"]/text())').extract()
            # If the last reply was yesterday (shown as month-day) or today (shown as HH:MM),
            # follow the thread link and crawl the main post, the replies and their timestamps
            if last_reply_time[0] == oneDayAgoStyle:
                yield scrapy.Request(url=item['link'], meta={'item': item}, callback=self.parse_detail, dont_filter=True)
            elif ":" in last_reply_time[0] and last_reply_time[0] >= '00:00':
                yield scrapy.Request(url=item['link'], meta={'item': item}, callback=self.parse_detail, dont_filter=True)
            # else: threads older than yesterday are skipped

        # Locate the "next page" link; fetch the next list page only when the last (50th) thread
        # on the current page was also replied to today or yesterday
        a = response.xpath("//*[@id='frs_list_pager']/a[@class='next pagination-item ']/@href").extract()
        if len(a) != 0:
            next_page = 'http:' + a[0]
            # Each list page holds 50 threads, so check the state left by the 50th loop iteration
            if count % 50 == 0:
                if last_reply_time[0] == oneDayAgoStyle:
                    yield scrapy.Request(url=next_page, callback=self.parse, dont_filter=True)
                elif ":" in last_reply_time[0] and last_reply_time[0] >= '00:00':
                    yield scrapy.Request(url=next_page, callback=self.parse, dont_filter=True)
    # Crawl the main post and the replies; if the thread has more than one page,
    # hand the partially filled item to parse_detail2 for the remaining pages.
    def parse_detail(self, response):
        # Extract the main post, the replies, their timestamps and the thread's page count with XPath
        item = OneItem()
        list1 = []
        list2 = []
        item['link'] = response.url
        # Narrow the XPath to the first-floor (main) post and extract its text and timestamp
        node_list = response.xpath("//*[@class='d_post_content_main d_post_content_firstfloor']")
        for node in node_list:
            item['comment1'] = node.xpath("./div/cc/div[@class='d_post_content j_d_post_content clearfix']/text()").extract()
            item['time1'] = node.xpath("./div[@class='core_reply j_lzl_wrapper']/div/ul[@class='p_tail']/li[2]/span//text()").extract()
        # Narrow the XPath to the replies; a plain assignment would keep only the last reply,
        # so accumulate every reply and timestamp in lists
        node2_list = response.xpath("//*[@class='d_post_content_main']")
        for node2 in node2_list:
            item['comment2'] = node2.xpath("./div/cc/div[@class='d_post_content j_d_post_content clearfix']/text()").extract()
            item['time2'] = node2.xpath("./div[@class='core_reply j_lzl_wrapper']/div/ul[@class='p_tail']/li[2]/span//text()").extract()
            list1.append(item['comment2'])
            list2.append(item['time2'])
        item['comment2'] = list1
        item['time2'] = list2
        # Read the thread's page count and the "next page" link; if there is a next page,
        # follow it and pass the item along, otherwise the item is complete and can be yielded
        page_num = response.xpath("//*[@id='thread_theme_5']/div[1]/ul/li[2]/span[2]/text()").extract()
        a = response.xpath("//ul[@class='l_posts_num']/li/a[contains(text(),'下一页')]/@href").extract()
        if int(page_num[0]) >= 1:
            if len(a) == 0:
                yield item
            else:
                yield scrapy.Request(url='http://tieba.baidu.com' + a[0], callback=self.parse_detail2, meta={'item': item}, dont_filter=True)
        print(item)
    # Called when a thread spans several pages: receive the item filled so far via meta,
    # append the replies from the current page, and keep following "next page" until done.
    def parse_detail2(self, response):
        item = response.meta['item']
        list1 = []
        list1.append(item['comment2'])
        list2 = []
        list2.append(item['time2'])
        # Same as in parse_detail: accumulate every reply and timestamp on this page
        node2_list = response.xpath("//*[@class='d_post_content_main']")
        for node2 in node2_list:
            comment2 = node2.xpath("./div/cc/div[@class='d_post_content j_d_post_content clearfix']/text()").extract()
            time2 = node2.xpath("./div[@class='core_reply j_lzl_wrapper']/div/ul[@class='p_tail']/li[2]/span//text()").extract()
            list1.append(comment2)
            list2.append(time2)
        item['comment2'] = list1
        item['time2'] = list2
        # Read the page count and the "next page" link again; follow the next page if there is one
        # (the page count is >= 2 here, since this callback is only reached from a multi-page thread),
        # otherwise the item is complete and can be yielded
        page_num = response.xpath("//*[@id='thread_theme_5']/div[1]/ul/li[2]/span[2]/text()").extract()
        a = response.xpath("//ul[@class='l_posts_num']/li/a[contains(text(),'下一页')]/@href").extract()
        if int(page_num[0]) >= 2:
            if len(a) == 0:
                yield item
            else:
                yield scrapy.Request(url='http://tieba.baidu.com' + a[0], callback=self.parse_detail2,
                                     meta={'item': item}, dont_filter=True)
        print(item)
        # Handling image-only replies (not enabled): a reply that is just an image extracts to no
        # Chinese text, so it could be detected and replaced with a placeholder like this:
        # zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
        # match = zhPattern.search(str(comment2))
        # if match:
        #     comment2 = comment2
        # else:
        #     comment2 = '内容非文字'
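The commented-out check above could also be pulled into a small standalone helper; this is only a sketch of that idea (normalize_comment is a name introduced here, not part of the original spider):

# Sketch: replace image-only replies with a placeholder string
import re

ZH_PATTERN = re.compile(u'[\u4e00-\u9fa5]+')

def normalize_comment(comment2):
    # A reply that is only an image yields no Chinese characters when its text nodes are extracted
    if ZH_PATTERN.search(str(comment2)):
        return comment2
    return '内容非文字'  # "content is not text"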
import os
import time
import datetime


if __name__ == '__main__':  # when this file is run directly, start the crawl at a fixed time every day
    def main(h=11, m=27, s=0):
        while True:
            now = datetime.datetime.now()
            if now.hour == h and now.minute == m and now.second == s:
                # the target time has been reached: run the spider
                os.system("scrapy crawl publicSentiment")
            # poll once per second
            time.sleep(1)

    main()
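The description at the top says each scheduled run should produce a file, but the command above only starts the crawl. One way to get a file per run is to let Scrapy's built-in feed export write the items out via the -o flag; the date-stamped filename below is just an assumption about how the output could be named:

# Sketch: write each scheduled run's items to a date-stamped JSON Lines file
import datetime
import os

def run_crawl_with_output():
    today = datetime.datetime.now().strftime('%Y-%m-%d')
    os.system("scrapy crawl publicSentiment -o tieba_%s.jl" % today)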