Python 爬虫 beta 版之抓取知乎单页面回答（简陋版）

2023-01-02 17:08:26

　　闲着无聊，逛知乎。发现想找点有意思的回答也不容易，就想着不如写个爬虫，把点赞数最多的回答抓下来方便阅读，也许还能做做数据分析（意淫中～～）。

　　之前用 Python 写过爬虫，帮运营人员抓取过京东的商品品牌和分类；这次同样用 Python，先实现一个只抓取单个页面的简单版本，后期再补充完善。

#-*- coding: UTF-8 -*-

import requests

import sys

from bs4 import BeautifulSoup

#－－－－－－知乎答案收集－－－－－－－－－－

#获取网页body里的内容

def get_content(url, data=None, timeout=10):
    """Download *url* and return the ``<body>`` of the parsed page.

    Args:
        url: Address of the page to fetch.
        data: Not used by this function; kept so existing callers that pass
            it keep working.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The ``body`` attribute of the parsed ``BeautifulSoup`` document.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # Browser-like headers sent with the request.
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235'
    }

    response = requests.get(url, headers=header, timeout=timeout)
    # Force UTF-8 decoding of the response text instead of relying on the
    # encoding requests guesses from the response headers.
    response.encoding = 'utf-8'

    soup = BeautifulSoup(response.text, "html.parser")
    return soup.body

#获取问题标题

def get_title(html_text):
    """Return the question title found in *html_text*.

    Args:
        html_text: Parsed page (anything offering BeautifulSoup's ``find``).

    Returns:
        The title as the interpreter's native ``str``: UTF-8 encoded bytes
        on Python 2 (as before), text on Python 3, so it can always be
        concatenated with plain string literals.

    Raises:
        ValueError: If the page has no title element.
    """
    node = html_text.find('span', {'class': 'zm-editable-content'})
    if node is None:
        raise ValueError('question title element not found')

    text = node.string
    if text is None:
        # ``.string`` is None when the element does not wrap exactly one
        # string; fall back to joining every string inside it.
        text = u''.join(node.strings)

    # Only Python 2 unicode objects need encoding; a native str passes through.
    return text if isinstance(text, str) else text.encode('utf-8')

#获取问题内容

def get_question_content(html_text):
    """Print the question's description, prefixed with a heading line.

    Args:
        html_text: Parsed page (anything offering BeautifulSoup's ``find``).

    Returns:
        None. The description is written to standard output; a page without
        a description element prints the heading with empty content.
    """
    node = html_text.find('div', {'class': 'zm-editable-content'})

    text = u''
    if node is not None:
        text = node.string
        if text is None:
            # ``.string`` is None when the element does not wrap exactly
            # one string; join every string inside it instead.
            text = u''.join(node.strings)

    # Python 2 needs UTF-8 bytes to concatenate with the byte-string
    # literal below; a native str (Python 3) is used unchanged.
    if not isinstance(text, str):
        text = text.encode('utf-8')

    print('内容：\n' + text)

#获取点赞数

def get_answer_agree(body):
    """Print the up-vote count found in *body*, followed by a blank line.

    Args:
        body: Parsed answer page (anything offering BeautifulSoup's ``find``).

    Returns:
        None. The count is written to standard output.

    Raises:
        ValueError: If the page has no vote-count element.
    """
    node = body.find('span', {'class': 'count'})
    if node is None:
        raise ValueError('vote count element not found')

    count = node.string
    if count is None:
        # ``.string`` is None when the element does not wrap exactly one
        # string; join every string inside it instead.
        count = u''.join(node.strings)

    # Python 2 needs UTF-8 bytes to concatenate with the byte-string
    # literal below; a native str (Python 3) is used unchanged.
    if not isinstance(count, str):
        count = count.encode('utf-8')

    print('点赞数：' + count + '\n')

#获取答案

def get_response(html_text):
    """Fetch and print every answer linked from the summaries in *html_text*.

    For each summary block the "expand" link is followed with
    ``get_content``; the answer's vote count is printed by
    ``get_answer_agree`` and then the answer text itself is printed.

    Args:
        html_text: Parsed question page (anything offering BeautifulSoup's
            ``find_all``).

    Returns:
        None. Everything is written to standard output.
    """
    summaries = html_text.find_all('div', {'class': 'zh-summary summary clearfix'})

    for summary in summaries:
        # The "expand" link carries the address of the full answer page.
        anchor = summary.find('a', {'class': 'toggle-expand'})
        href = anchor.get('href') if anchor is not None else None
        if not href or href.startswith('javascript'):
            # No link at all, or a script pseudo-link: nothing to fetch.
            continue

        # Strip leading slashes so that a root-relative href does not
        # produce a double slash after the host name.
        url = 'http://www.zhihu.com/' + href.lstrip('/')
        print(url)

        body = get_content(url)
        get_answer_agree(body)

        answer = body.find('div', {'class': 'zm-editable-content clearfix'})
        if answer is None:
            # The fetched page has no answer body; move on to the next one.
            continue

        text = answer.string
        if text is None:
            # Several text nodes: emit each one on its own line.
            text = u''.join(u'\n' + piece for piece in answer.strings)

        # Python 2 prints UTF-8 bytes as before; a native str (Python 3)
        # is printed unchanged.
        if not isinstance(text, str):
            text = text.encode('utf-8')
        print(text)

# Script body: download one question page, then print its title, its
# description and the answers linked from it.
html_text = get_content('https://www.zhihu.com/question/43879769')

title = get_title(html_text)
print("".join(("标题：\n", title, '\n')))

# The two helpers below print their own output; their return values are
# kept only under the names the script has always used.
questiondata = get_question_content(html_text)
print('\n')
data = get_response(html_text)

　　　输出结果：

码农公寓

相关文章