python 分析知乎粉丝数据

2022-11-01 13:36:56

昨天花了一下午写了一个小爬虫，用来分析自己的粉丝数据。这个真好玩！今天还帮群里的好多大V爬了他们的数据。运行速度：每分钟可以处理 5000 个以上的粉丝。暂时先写成这样，这两天要准备补考，没有时间继续玩这个。

下次要改进的地方：1、多线程 2、scrapy 3、深度数据 4、分布式爬虫

希望实现的功能：

1、地区、教育程度、注册时间、送粉识别、颜值检测
2、导出超秀的 H5 界面和完美的 xlsx 数据
3、对内容提出建议
4、对接微信后台实现自动化

下面是源码，经2019年8月21日测试可用：

from selenium.webdriver import Chrome,ChromeOptions

from requests.cookies import RequestsCookieJar

from lxml import etree

from pandas import DataFrame

import json,time,requests,re,os,clipboard

def sele_input_zhihu():
    """Log in to Zhihu through a Chrome window and save the session cookies.

    Prompts on stdin for the account name and password, submits the
    password-login form, waits five seconds, and writes the browser cookies
    to ``cookies.json`` in the current working directory.

    Elements are located with ``driver.find_element(by, value)``, which exists
    in both Selenium 3 and Selenium 4; the ``find_element_by_*`` helpers used
    previously were removed in Selenium 4. The browser session is always shut
    down, even when a lookup or the login itself raises.
    """
    # Drop the "enable-automation" switch so the site is less likely to
    # detect the automated browser.
    option = ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver = Chrome(options=option)

    try:
        # Open the landing page and collect the credentials.
        driver.get("http://www.zhihu.com/")
        name = input("请输入手机号或邮箱:")
        pwd = input("请输入密码：")

        # Click the sign-in tab before filling in the username/password form.
        driver.find_element("xpath", "//div[@class='SignFlow-tab']").click()
        driver.find_element("name", "username").send_keys(name)
        driver.find_element("name", "password").send_keys(pwd)
        driver.find_element("xpath", "//button[@type='submit']").click()

        # Fixed pause before reading the cookies, giving the login time to finish.
        time.sleep(5)

        # Persist the cookies so later runs can skip the browser login.
        cookies = driver.get_cookies()
        with open("cookies.json", "w") as fp:
            json.dump(cookies, fp)
        print("保存cookies成功！")
    finally:
        # quit() ends the whole WebDriver session, not just the current window.
        driver.quit()

def login_zhihu(s):
    """Prepare session ``s`` with a browser User-Agent and the saved cookies.

    ``s.headers`` is replaced by a dict holding only a Chrome User-Agent.
    Every entry in ``cookies.json`` is then copied (name and value only) into
    the session's cookie jar. The success message is printed unconditionally;
    no request is made to confirm that the cookies are still valid.
    """
    s.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    }

    with open("cookies.json", "r") as cookie_file:
        saved_cookies = json.load(cookie_file)

    # Collect the cookies in a temporary jar, then merge it into the session.
    jar = RequestsCookieJar()
    for entry in saved_cookies:
        jar.set(entry['name'], entry['value'])
    s.cookies.update(jar)

    print("登录成功！")

# 浅数据处理

# "users":{(.*), "questions":{}

def parse_infos(session, url, id):
    """Fetch one followers page and return a row for each follower on it.

    The page embeds its data as JSON; the object stored under ``"users"`` is
    cut out with a regular expression and decoded.

    Args:
        session: Object whose ``get(url)`` returns a response with a ``text``
            attribute (a ``requests`` session in this script).
        url: Address of the followers page to download.
        id: Key of the profile owner inside the ``"users"`` object; that entry
            is skipped. (The name shadows the builtin but is kept so existing
            callers keep working.)

    Returns:
        A list of 12-item lists: url token, name, custom avatar ("是"/"否"),
        avatar URL, profile URL, account type, headline, gender, VIP flag
        ("是"/"否"), follower count, answer count, article count.

    Raises:
        ValueError: If the page does not contain the embedded ``"users"``
            object, instead of failing with an opaque ``AttributeError``.
    """
    content = session.get(url).text
    match = re.search(r'"users":(.*}}),"questions":{}', content)
    if match is None:
        raise ValueError("no embedded user data found in page: " + str(url))
    users = json.loads(match.group(1))

    items = []
    for key, item in users.items():
        # Entries without a "name" are not full profiles; the owner's own
        # entry is not a follower.
        if "name" not in item or key == id:
            continue

        custom = "否" if item["useDefaultAvatar"] else "是"
        thetype = "机构号" if item["isOrg"] else "普通用户"
        if item["gender"] == 0:
            gender = "女"
        elif item["gender"] == 1:
            gender = "男"
        else:
            gender = "未知"
        vip = "是" if item["vipInfo"]["isVip"] else "否"

        items.append([
            item["urlToken"],
            item["name"],
            custom,
            item["avatarUrl"],
            item["url"],
            thetype,
            item["headline"],
            gender,
            vip,
            item["followerCount"],
            item["answerCount"],
            item["articlesCount"],
        ])
    return items

def _build_report(all_info):
    """Return the text summary for a non-empty list of ``parse_infos`` rows."""
    many = len(all_info)
    wood = org = female = money = male = f2k = f5k = f10k = gfemale = gmale = 0

    for info in all_info:
        followers = info[9]
        # Counted as a "zombie" follower: default avatar, no followers and at
        # most two answers and two articles.
        if info[2] == "否" and followers == 0 and info[10] <= 2 and info[11] <= 2:
            wood += 1
        if info[5] == "机构号":
            org += 1
        if info[7] == "女":
            female += 1
            if followers >= 20:
                gfemale += 1
        elif info[7] == "男":
            # Only male accounts count here; accounts of unknown gender used
            # to be included by mistake.
            male += 1
            if followers >= 50:
                gmale += 1
        if info[8] == "是":
            money += 1
        if followers >= 10000:
            f10k += 1
        elif followers >= 5000:
            f5k += 1
        elif followers >= 2000:
            f2k += 1

    # Without any male follower the ratio is undefined; avoid dividing by zero.
    ratio = "{:.3}".format(female / male) if male else "N/A"

    report = (
        "*" * 40 + "\n浅粉丝数据快览：在你所有 " + str(many) + " 个粉丝中：\n"
        + "共有僵尸粉 " + str(wood) + " 个，占比 " + "{:.4%}".format(wood / many)
        + " ，这可是相当" + (" 低 " if (wood / many) < 0.2 else " 高 ") + "的比例。\n"
        + "另外，粉丝的男女比例为 1 : " + ratio
        + " ,看来你深受广大" + (" 女 " if female >= male else " 男 ") + "性同胞的喜爱！\n"
        + "靓女" + str(gfemale) + "人,靓仔" + str(gmale) + "人 【只统计有颜值的】\n"
        + "在你的所有粉丝里，氪金学习的用户有 " + str(money) + " 个,占比 "
        + "{:.3%}".format(money / many) + "，看来您的粉丝多为"
        + ("高" if money / many > 0.045 else " 低 ") + "收入用户！\n"
        + " ◉ 粉丝10K+有 " + str(f10k) + " 人；\n"
        + " ◉ 粉丝5K-10K有 " + str(f5k) + " 人；\n"
        + " ◉ 粉丝2K-5K有 " + str(f2k) + " 人；\n"
    )
    if org >= 1:
        report += "除此之外，你的粉丝中还有 " + str(org) + " 位机构号！详细的报告快去 浅数据.csv 里看看吧！\n"
    return report


def main():
    """Crawl the followers of one Zhihu profile, export them and print a report.

    Steps: log in (through the browser on the first run, from ``cookies.json``
    afterwards), read the follower count from the profile page, download every
    followers page, write the rows to ``<user>_浅数据.csv``, then build a text
    summary that is copied to the clipboard and printed. When no rows were
    collected the summary is skipped, because its percentages would divide by
    zero.
    """
    # Log in.
    if not os.path.exists("cookies.json"):
        print("未登录账户，请登录！")
        sele_input_zhihu()
    session = requests.session()
    login_zhihu(session)

    # Profile to analyse: put the profile's "activities" link here.
    zhuye_url = "https://www.zhihu.com/people/you-yi-shi-de-hu-xi/activities"
    zhuye = re.match(r"(.*)/activities$", zhuye_url).group(1)
    user_id = re.match(r".*/(.*)$", zhuye).group(1)
    followers_url = zhuye + r"/followers?page={}"

    # Read the follower count from the profile page and derive the page count.
    html = etree.HTML(session.get(zhuye_url).text)
    # The follower count is taken from the second <strong> in the counts card.
    text = html.xpath("//div[@class='NumberBoard FollowshipCard-counts NumberBoard--divider']//strong/text()")[1]
    follows = int("".join(text.split(",")))
    # Ceiling division at 20 followers per page; "follows//20+1" asked for one
    # page too many whenever the count was an exact multiple of 20.
    pages = (follows + 19) // 20
    print("关注者 "+str(follows)+"人，共 "+str(pages)+"页数据！")

    # Download every followers page.
    all_info = []
    for i in range(1, pages + 1):
        infos_url = followers_url.format(i)
        print("正在获取第 "+str(i)+" 页数据...")
        all_info += parse_infos(session, infos_url, user_id)
    many = len(all_info)
    print("数据获取完成，共"+ str(many)+" 条数据！")

    # Export the rows to CSV.
    data = DataFrame(data=all_info, columns=["id", "用户名", "自定义头像", "头像url", "主页链接", "类型", "一句话描述", "性别", "盐选会员",  "粉丝总数", "回答数", "文章数"])
    data.to_csv(user_id+"_浅数据.csv", encoding="utf-8-sig")
    print("数据已导出到"+ user_id+"_浅数据.csv！")

    # Build the follower report.
    if not all_info:
        print("没有获取到粉丝数据，无法生成报告！")
        return
    report = _build_report(all_info)
    clipboard.copy(report)
    print(report)

# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

运行截图：

这是我今天的粉丝报告：

浅粉丝数据快览：在你所有 9934 个粉丝中：

共有僵尸粉 2091 个，占比 21.0489% ，这可是相当高的比例。

另外，粉丝的男女比例为 1 : 0.352 ,看来你深受广大男性同胞的喜爱！

靓女123人,靓仔354人【只统计有颜值的】

在你的所有粉丝里，氪金学习的用户有 477 个,占比 4.802%，看来您的粉丝多为高收入用户！

◉ 粉丝10K+有 11 人；

◉ 粉丝5K-10K有 7 人；

◉ 粉丝2K-5K有 17 人；

除此之外，你的粉丝中还有 1 位机构号！详细的报告快去浅数据.csv 里看看吧！

码农公寓

相关文章