Scrapy: crawling a specified Baidu Tieba forum

Crawl a specified Tieba forum, keep only threads from today and yesterday, and run the crawl on a timer so that each scheduled run generates an output file.
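
The spider below imports SplashRequest from scrapy-splash because the last-reply time is rendered dynamically. If requests are actually routed through Splash, scrapy-splash also has to be wired into the project settings. A minimal sketch, assuming a local Splash instance at localhost:8050 (this wiring follows the scrapy-splash README; it is not part of the original post's files):

settings.py (excerpt)

SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'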

spider.py

# -*- coding: utf-8 -*-
import datetime

import scrapy
# The last-reply time is rendered dynamically; scrapy.Request can be swapped
# for SplashRequest from scrapy-splash (see the settings sketch above)
from scrapy_splash.request import SplashRequest, SplashFormRequest

from BaiduPOA.items import OneItem


class PublicSentimentSpider(scrapy.Spider):

    name = "publicSentiment"
    allowed_domains = ["tieba.baidu.com"]
    # Start URL: first page of the target forum's thread list
    start_urls = [
            "http://tieba.baidu.com/f?kw=%E5%90%89%E6%9E%97%E5%8A%A8%E7%94%BB%E5%AD%A6%E9%99%A2&ie=utf-8&pn=0"]

    # Crawl today's and yesterday's thread links from the list page, hand each one
    # to parse_detail, and use the last thread on the page (the 50th) to decide
    # whether to turn the page
    def parse(self, response):

        # Yesterday's date in the list page's format, e.g. 1-1 (no zero padding)
        oneDayAgo = (datetime.datetime.now() - datetime.timedelta(days=1))
        month = str(int(oneDayAgo.strftime('%m')))
        day = str(int(oneDayAgo.strftime('%d')))
        oneDayAgoStyle = month + '-' + day

        # Extract each thread's URL and last-reply time with XPath
        node_list = response.xpath("//div[@class='t_con cleafix']")
        count = 0
        for node in node_list:
            count += 1
            item = OneItem()
            item['link'] = 'https://tieba.baidu.com' + node.xpath("./div/div/div/a[@class='j_th_tit ']/@href").extract()[0]
            last_reply_time = node.xpath('normalize-space(./div/div/div/span[@title="最后回复时间"]/text())').extract()
            # If the last reply was yesterday (a date like 1-1) or today (a time
            # like 14:30), follow the thread URL and crawl the main post, replies, and times
            if last_reply_time[0] == oneDayAgoStyle:
                yield scrapy.Request(url=item['link'], meta={'item': item}, callback=self.parse_detail, dont_filter=True)
            elif ':' in last_reply_time[0]:
                yield scrapy.Request(url=item['link'], meta={'item': item}, callback=self.parse_detail, dont_filter=True)
            # Threads older than yesterday are skipped



        # Get the 'next page' link; if the last (50th) thread on this page was
        # replied to today or yesterday, turn the page
        next_link = response.xpath("//*[@id='frs_list_pager']/a[@class='next pagination-item ']/@href").extract()
        if next_link:
            next_page = 'http:' + next_link[0]
            # Each list page holds 50 threads, so count % 50 == 0 means a full page was seen
            if count % 50 == 0:
                if last_reply_time[0] == oneDayAgoStyle or ':' in last_reply_time[0]:
                    yield scrapy.Request(url=next_page, callback=self.parse, dont_filter=True)

    # Crawl the main post and the replies; if the thread spans more pages, hand off to parse_detail2
    def parse_detail(self, response):
        # Extract the main post, replies, times, and page count with XPath
        item = OneItem()
        list1 = []
        list2 = []

        item['link'] = response.url
        # Scope the XPath to the first-floor (main) post and pull its text and time
        node_list = response.xpath("//*[@class='d_post_content_main d_post_content_firstfloor']")
        for node in node_list:
            item['comment1'] = node.xpath("./div/cc/div[@class='d_post_content j_d_post_content  clearfix']/text()").extract()
            item['time1'] = node.xpath("./div[@class='core_reply j_lzl_wrapper']/div/ul[@class='p_tail']/li[2]/span//text()").extract()
        # Scope the XPath to the reply floors; each iteration overwrites
        # item['comment2'], so accumulate every floor in list1/list2
        node2_list = response.xpath("//*[@class='d_post_content_main']")
        for node2 in node2_list:
            item['comment2'] = node2.xpath("./div/cc/div[@class='d_post_content j_d_post_content  clearfix']/text()").extract()
            item['time2'] = node2.xpath("./div[@class='core_reply j_lzl_wrapper']/div/ul[@class='p_tail']/li[2]/span//text()").extract()
            list1.append(item['comment2'])
            list2.append(item['time2'])

        item['comment2'] = list1
        item['time2'] = list2

        # Get the thread's page count and the 'next page' link; if a next page
        # exists, follow it, otherwise yield the finished item
        page_num = response.xpath("//*[@id='thread_theme_5']/div[1]/ul/li[2]/span[2]/text()").extract()
        a=response.xpath("//ul[@class='l_posts_num']/li/a[contains(text(),'下一页')]/@href").extract()
        if int(page_num[0]) >= 1:
            if len(a) == 0:
                yield item
            else:
                yield scrapy.Request(url='http://tieba.baidu.com' + a[0], callback=self.parse_detail2, meta={'item': item}, dont_filter=True)
    # Called when a thread spans multiple pages: receive the partially filled item
    # and crawl the next page of replies
    def parse_detail2(self, response):
        item = response.meta['item']
        list1 = []
        list1.append(item['comment2'])
        list2 = []
        list2.append(item['time2'])
        # Scope the XPath to the reply floors again and append each floor's text and time
        node2_list = response.xpath("//*[@class='d_post_content_main']")
        for node2 in node2_list:
            comment2 = node2.xpath("./div/cc/div[@class='d_post_content j_d_post_content  clearfix']/text()").extract()
            time2 = node2.xpath("./div[@class='core_reply j_lzl_wrapper']/div/ul[@class='p_tail']/li[2]/span//text()").extract()
            list1.append(comment2)
            list2.append(time2)
        item['comment2'] = list1
        item['time2'] = list2
        # Get the page count and 'next page' link again; keep following pages
        # until the last one, then yield the accumulated item
        page_num = response.xpath("//*[@id='thread_theme_5']/div[1]/ul/li[2]/span[2]/text()").extract()
        a = response.xpath("//ul[@class='l_posts_num']/li/a[contains(text(),'下一页')]/@href").extract()
        if int(page_num[0]) >= 2:
            if len(a) == 0:
                yield item
            else:
                yield scrapy.Request(url='http://tieba.baidu.com' + a[0], callback=self.parse_detail2,
                                     meta={'item': item}, dont_filter=True)



# Note: when a reply is an image, the extracted text is empty. One way to detect
# this is to check for Chinese characters (requires `import re`):
#     zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
#     if zhPattern.search(str(comment2)):
#         comment2 = comment2
#     else:
#         comment2 = '内容非文字'  # "content is not text"
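
The spider imports OneItem from BaiduPOA.items, which is not shown in the original post. A minimal sketch, assuming only the five fields the spider actually assigns:

items.py

import scrapy


class OneItem(scrapy.Item):
    link = scrapy.Field()      # thread URL
    comment1 = scrapy.Field()  # main (first-floor) post text
    time1 = scrapy.Field()     # main post time
    comment2 = scrapy.Field()  # accumulated reply texts, one entry per floor
    time2 = scrapy.Field()     # accumulated reply times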


timing.py

import os
import time
import datetime

def main(h=11, m=27, s=0):
    """Run the spider once a day at h:m:s."""
    while True:
        now = datetime.datetime.now()
        if now.hour == h and now.minute == m and now.second == s:
            # Launch the crawl as a subprocess
            os.system("scrapy crawl publicSentiment")
        # Poll the clock once per second so the exact target second is not missed
        time.sleep(1)


if __name__ == '__main__':
    main()
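
The intro says each scheduled run should generate a file, but os.system above only launches the crawl, and the original does not show where items are written. One option is Scrapy's built-in feed export flag -o with a date-stamped filename; the helper name below is hypothetical:

import datetime
import os


def crawl_to_dated_file():
    # -o enables Scrapy's feed export: every item the spider yields is
    # serialized into the named file, so each daily run produces its own file.
    filename = 'tieba-%s.json' % datetime.date.today().isoformat()
    os.system('scrapy crawl publicSentiment -o %s' % filename)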
