Scrapy: Scraping NetEase Cloud Music Fans via the API, the Fastest Way [Part 6]

First create the project, then add a spider file fensi.py.
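
If you haven't scaffolded a Scrapy project before, the usual commands are as follows (the project name wyy is assumed by the pipeline paths used later; the spider file itself is created by hand):

scrapy startproject wyy
cd wyy
# then create wyy/spiders/fensi.py and paste in the spider below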

# -*- coding: utf-8 -*-
import sys
if sys.version_info[0] > 2:
    from urllib.parse import urlencode
else:
    from urllib import urlencode

import scrapy
import random
import math
import codecs
import base64
import json

from Crypto.Cipher import AES

class WyyFansSpider(scrapy.Spider):
    name = 'wyy'
    allowed_domains = ['163.com']
    # start_urls = ['http://163.com/']

    def __init__(self):
        # AES key, RSA modulus (f) and exponent (e) used by the weapi encryption
        self.key = '0CoJUm6Qyw8W8jud'
        self.f = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
        self.e = '010001'
        self.singer_id = 1411492497
        self.post_url1 = 'https://music.163.com/weapi/user/getfolloweds?csrf_token='
        # self.post_url2 = 'https://music.163.com/weapi/v1/play/record?csrf_token='

    # Generate a random string of the given length (16 characters in practice)
    def _generate_random_strs(self, length):
        string = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
        # loop counter
        i = 0
        # initialize the random string
        random_strs = ""
        while i < length:
            # pick a random index and round down
            e = math.floor(random.random() * len(string))
            random_strs = random_strs + string[e]
            i = i + 1
        return random_strs

    # AES encryption (CBC mode), returning a Base64-encoded string
    def _AESencrypt(self, msg, key):
        # pad the message to a multiple of 16 bytes (PKCS#7-style padding)
        padding = 16 - len(msg) % 16
        msg = msg + padding * chr(padding)
        # initialization vector for encryption/decryption (must be 16 bytes)
        iv = '0102030405060708'
        cipher = AES.new(key.encode('utf8'), AES.MODE_CBC, iv.encode('utf8'))
        # encryption yields bytes
        encryptedbytes = cipher.encrypt(msg.encode('utf-8'))
        # Base64-encode, returning a byte string
        encodestrs = base64.b64encode(encryptedbytes)
        # decode the byte string as utf-8
        enctext = encodestrs.decode('utf-8')
        return enctext

    # RSA encryption: c = m^e mod f, done with plain modular exponentiation
    def _RSAencrypt(self, randomstrs, key, f):
        # reverse the random string
        string = randomstrs[::-1]
        # convert it to bytes, then treat its hex representation as an integer
        text = bytes(string, 'utf-8')
        seckey = int(codecs.encode(text, encoding='hex'), 16) ** int(key, 16) % int(f, 16)
        return format(seckey, 'x').zfill(256)


    # Build the two POST parameters for a given page
    def _get_params1(self, page):
        offset = (page - 1) * 20
        msg = '{"userId": "1411492497", "offset": "' + str(offset) + '", "total": "false", "limit": "20", "csrf_token": ""}'
        enctext = self._AESencrypt(msg, self.key)
        # generate a random 16-character string
        i = self._generate_random_strs(16)
        # the second AES pass yields the params value
        encText = self._AESencrypt(enctext, i)
        # RSA-encrypting the random string yields the encSecKey value
        encSecKey = self._RSAencrypt(i, self.e, self.f)
        return encText, encSecKey

    def start_requests(self):
        for i in range(1, int(4923265/20) + 1):  # adjust to the number of fans you want to scrape
            params, encSecKey = self._get_params1(i)
            # Content-Length is computed by Scrapy automatically, so it is not set here
            headers = {'Host': 'music.163.com',
                       'Connection': 'keep-alive',
                       'Pragma': 'no-cache',
                       'Cache-Control': 'no-cache',
                       'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
                       'Content-Type': 'application/x-www-form-urlencoded',
                       'Accept': '*/*',
                       'Origin': 'https://music.163.com',
                       'Sec-Fetch-Site': 'same-origin',
                       'Sec-Fetch-Mode': 'cors',
                       'Sec-Fetch-Dest': 'empty',
                       'Referer': 'https://music.163.com/user/fans?id=1411492497',
                       'Accept-Encoding': 'gzip, deflate, br',
                       'Accept-Language': 'zh-CN,zh;q=0.9',
                       # 'Cookie': '_iuqxldmzr_=32; _ntes_nnid=008eb89f93bb80b8c5abbbfeb29cf783,1601351876041; _ntes_nuid=008eb89f93bb80b8c5abbbfeb29cf783; NMTID=00OjXQcC8_wl8Qc5Eyzj_hKZKF_GlUAAAF02AJ4-A; WM_NI=Nz8nT1vsX8DoejbrC5yMqBrqv70bOcl%2Fe9pgZSO9wSff8VZdQamhdi38Tu5LOB4kn7SaIJfCij4ENk3o9AkK0xpJ9ALg8jqb0bfyIAprddlPL1%2FzcgWpVXoiyEbZoNNKdHQ%3D; WM_NIKE=9ca17ae2e6ffcda170e2e6eea2e47b9bbaaf93b6508d9a8ab7d44b828b9aafb546a2909db9b3489c949eaff22af0fea7c3b92a8ea6f88fd73bb6ad98d9c765a7b6ae93c23fbb93bfa3f17bb6e90083ce69b0f1abdaf36d9a8d81daf659a6baf8b7d97093bfa389e763f1e89c96b8488d9efc8ced3d91bf87abe549abb6fca6cd61f1b1f7a2ea41f4a8c0abbc4b90b9f88bf46af4beaa9acc4bb4aeac85ef5c90efe19be26d95b09ab9ee63bb9baebbee468e95aca9d437e2a3; WM_TID=5UHlf7z1yZ1FQBVRREY%2FJaeDTwOCfBMs; JSESSIONID-WYYY=khAdF6WsaT8Vl%2FBmeuUxNUJzXuSo9AuMAkkyWuiGbGlShWwbk%5CW3flpBsDz0ZTNpKPz8PcvsO%2FYH8jX9F07a5ACh0KqO5O0nAoEJO5W%2FR8yfJSJdCm95FQaQxo7QQzQ%2FfJpypzjeXQI8RO3opWeXr1x7z1GUBQQ2sn4P5sEWeDNkPoSO%3A1601382242186'
                       }
            formdata = {
                'params': params,
                'encSecKey': encSecKey,
            }
            yield scrapy.FormRequest(url=self.post_url1, formdata=formdata, headers=headers, callback=self.parse)

    def parse(self, response):
        data = json.loads(response.text)
        for followed in data['followeds']:
            fan = {
                'userId': followed['userId'],
                'avatar': followed['avatarUrl'],
                'vipType': followed['vipType'],
                'gender': followed['gender'],
                'eventCount': followed['eventCount'],
                'followeds': followed['followeds'],
                'follows': followed['follows'],
                'signature': followed['signature'],
                'time': followed['time'],
                'nickname': followed['nickname'],
                'playlistCount': followed['playlistCount'],
            }
            yield fan
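
Before moving on to the pipelines, it can be worth sanity-checking the parameter generation outside Scrapy. A minimal sketch using requests (the import path wyy.spiders.fensi is an assumption based on the project layout above; a code of 200 in the returned JSON means the encrypted request was accepted):

import json
import requests
from wyy.spiders.fensi import WyyFansSpider  # assumed module path

spider = WyyFansSpider()
# build params/encSecKey for page 1 and post them directly
params, encSecKey = spider._get_params1(1)
resp = requests.post(spider.post_url1,
                     data={'params': params, 'encSecKey': encSecKey},
                     headers={'Referer': 'https://music.163.com/',
                              'User-Agent': 'Mozilla/5.0'})
print(json.loads(resp.text).get('code'))  # 200 means the encryption round-tripped correctly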

Next, write items.py:

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class WyyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    avatar = scrapy.Field()
    userId = scrapy.Field()
    # vipRights = scrapy.Field()
    vipType = scrapy.Field()
    gender = scrapy.Field()
    eventCount = scrapy.Field()
    fan_followeds = scrapy.Field()
    fan_follows = scrapy.Field()
    signature = scrapy.Field()
    time = scrapy.Field()
    nickname = scrapy.Field()
    playlistCount = scrapy.Field()
    total_record_count = scrapy.Field()
    week_record_count = scrapy.Field()
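
Note that the spider above yields plain dicts rather than WyyItem instances, so this Item class is effectively optional reference material; also be aware that its fan_followeds / fan_follows field names differ from the followeds / follows keys the spider and the txt pipeline actually use.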

Next there are two options: write to a MongoDB database, or write to txt files. Choose whichever suits you.

To write to MongoDB, edit pipelines.py:

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

from pymongo import MongoClient


class WyyPipeline(object):

    def __init__(self) -> None:
        # connect to MongoDB
        self.client = MongoClient(host='localhost', port=27017)
        # if authentication is enabled, log in first
        # db_auth = self.client.admin
        # db_auth.authenticate('root', 'root')
        # database and collection to save into
        self.col = self.client['wyy']
        self.fans = self.col.fans2

    def process_item(self, item, spider):
        res = dict(item)
        # upsert on userId so re-crawled fans are updated rather than duplicated
        self.fans.update_one({"userId": res['userId']}, {"$set": res}, upsert=True)
        return item

    def open_spider(self, spider):
        pass

    def close_spider(self, spider):
        self.client.close()
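
To confirm documents are actually landing in the collection, a quick check with pymongo (assuming the same host/port defaults as above; count_documents requires pymongo 3.7+):

from pymongo import MongoClient

client = MongoClient(host='localhost', port=27017)
print(client['wyy'].fans2.count_documents({}))  # number of fans saved so far
client.close()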

For reasons I never figured out, writing to MongoDB only ever captured 1000 records for me, so I gave up on it.

To write to txt instead, create an iputpipelines.py:

import random
import os

class WyyPipeline(object):

    def process_item(self, item, spider):
        print('---------write------------------')
        mingzi = item['nickname']
        guanzhu = item['follows']
        fensi = item['followeds']
        dongtai = item['eventCount']
        shuju = '名字:' + str(mingzi) + ';动态:' + str(dongtai) + ';关注:' + str(guanzhu) + ';粉丝:' + str(fensi) + '\n'
        # pick one of the 50 pre-created files at random
        suiji = random.randint(1, 50)
        fileName = 'D:/Study/pythonProject/网易云音乐——粉丝/scrapy/wyy/wyy/fensi/鱿小鱼粉丝' + str(suiji) + '.txt'  # replace with your own absolute path
        fs = round(os.path.getsize(fileName) / float(1024 * 1024), 2)  # convert the file size to MB
        # stop appending to a file once it exceeds 1 MB
        if fs <= 1:
            with open(fileName, "a+", encoding='utf-8') as f:
                f.write(shuju)
        return item

Next, in the same directory, create a fensi folder, and inside it a shengcheng.py that pre-creates the 50 txt files for writing:

for i in range(1, 51):
    # the prefix and numbering must match fileName in iputpipelines.py (which uses randint(1, 50))
    fp = '鱿小鱼粉丝' + str(i) + '.txt'
    with open(fp, 'w', encoding='utf-8') as fn:  # if the file already exists, this truncates it so it can be reused
        pass
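
Run shengcheng.py with the fensi folder as the working directory so the 50 files end up exactly where the absolute path in iputpipelines.py expects them; os.path.getsize raises an error if a file is missing.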
    

Finally, edit settings.py and add the following:

CONCURRENT_REQUESTS = 100
CONCURRENT_REQUESTS_PER_DOMAIN = 100
CONCURRENT_REQUESTS_PER_IP = 100
COOKIES_ENABLED = False
ITEM_PIPELINES = {
    'wyy.iputpipelines.WyyPipeline': 300,  # swap in 'wyy.pipelines.WyyPipeline' to use the MongoDB pipeline instead
}
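
With everything configured, start the crawl from the project root using the spider's name attribute:

scrapy crawl wyy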

 

This setup can pull in all ~5 million fans, spread across the 50 txt files. Speed depends on your machine: the better the hardware, the faster it crawls. I haven't tried scraping the full set myself.

 
