爬取网易云评论--涉及加密问题、程序断点问题

  1 #!/usr/bin/env python
  2 # -*- coding:utf-8 -*-
  3 # Author:woshinidaye
  4 
  5 #抓取网易云歌曲的热评,为了简单,不要登录了
  6 #1、找到未加密的参数
  7 #2、想办法把参数进行加密,加密逻辑与网易一致,params,encSecKey
  8 #3、请求,拿到数据
  9 #加密
 10 # var
 11 # bUM2x = window.asrsea(JSON.stringify(i6c), bsG7z(["流泪", "强"]), bsG7z(WW3x.md), bsG7z(["爱心", "女孩", "惊恐", "大笑"]));
 12 # e6c.data = j6d.cs6m({
 13 #     params: bUM2x.encText,
 14 #     encSecKey: bUM2x.encSecKey
 15 # })
 16 
 17 import requests,re,json,base64
 18 from lxml import html
 19 from Crypto.Cipher import AES           #pip install pycryptodome
 20 etree = html.etree
 21 
# Comment-list endpoint that the script POSTs to; the csrf_token query value is left empty.
url = "https://music.163.com/weapi/comment/resource/comments/get?csrf_token="
# Original note: request method.
# The four values below mirror the inputs of the site's JS encryption routine
# (see the JS excerpt quoted later in this file).
# e: second argument of that routine. Not referenced by the Python code in view.
# NOTE(review): presumably the RSA public exponent in hex -- confirm.
e = '010001'
# f: third argument of that routine. Not referenced by the Python code in view.
# NOTE(review): presumably the RSA modulus in hex -- confirm.
f = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
# g: fourth argument of that routine; used here as the AES key of the first encryption pass.
g = "0CoJUm6Qyw8W8jud"
# i: the JS generates a random 16-character string; this script pins one fixed value
# and uses it as the AES key of the second encryption pass.
i = 'hjbiwTejTo17235R'
 28 def get_encSecKey():
 29     return '6c11f64c829ec94df8ce7d711932c15c4c6e46daf00674f0f22dc1170ba68e809047ee5a7e12c3e07d8c1c3f66b76e4518201b1d4679bd1659a747856f16ac17c32286fba6a82034fa2597004dcca90ca9bfce49bd1a85d09fac162d7b40b390fe8d4c4be15bcc65788d0002fdbd91fb529a71d4d42aa702170fd8e92f1ed87e'
 30 def to_16 (data):
 31     pad = 16 -len(data)%16
 32     data = data + chr(pad)*pad
 33     return data
 34 def enc_params(data,key):
 35     iv = '0102030405060708'
 36     data = to_16(data)
 37     aes = AES.new(key=key.encode('utf-8'),IV=iv.encode('utf-8'),mode=AES.MODE_CBC)
 38     bs = aes.encrypt(data.encode('utf-8'))
 39     return str(base64.b64encode(bs),'utf-8')       #返回params
 40 def get_params(data):       #默认data是字符串
 41     first = enc_params(data,g)
 42     second = enc_params(first,i)
 43     return second
 44 
 45 
 46 # "c6aaef7d7fe54edc416de03808f94c8de2590f943d4f334d8bc485e53f00b95acdfbe704330a01d81bfe666c00b5d681321ab4b04147d0ba1683877e4350b1310e3ad67465ffa1dc9ea57b9d682f1efffbe14ad734a9454faf8e28464491542226109de2fdce6751b63426bd3b18543108c5076ef2b8eab03358ea7a88ce90e9"
 47 data = {
 48     'csrf_token': "",
 49     'cursor': '-1',
 50     'offset': '0',
 51     'orderType': '1',
 52     'pageNo': '1',
 53     'pageSize': '20',
 54     'rid': "R_SO_4_1881521546",
 55     'threadId': "R_SO_4_1881521546"
 56 }
 57 #加密方式
 58 '''
 59     function a(a) {             #随机产生16位字符串
 60         var d, e, b = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", c = "";
 61         for (d = 0; a > d; d += 1)      #循环16次
 62             e = Math.random() * b.length,       #随机数
 63             e = Math.floor(e),              #取整
 64             c += b.charAt(e);                   #取字符串中的xxxx位置
 65         return c
 66     }
 67     function b(a, b) {          #a是要加密的数据
 68         var c = CryptoJS.enc.Utf8.parse(b)
 69           , d = CryptoJS.enc.Utf8.parse("0102030405060708")
 70           , e = CryptoJS.enc.Utf8.parse(a)
 71           , f = CryptoJS.AES.encrypt(e, c, {
 72             iv: d,              #偏移量
 73             mode: CryptoJS.mode.CBC         #模式CBC
 74         });
 75         return f.toString()
 76     }
 77     function c(a, b, c) {
 78         var d, e;
 79         return setMaxDigits(131),
 80         d = new RSAKeyPair(b,"",c),
 81         e = encryptedString(d, a)
 82     } 
 83     function d(d, e, f, g) {        #d:data   e:010001    f:bsG7z(WW3x.md)  g:bsG7z(["爱心", "女孩", "惊恐", "大笑"])
 84         var h = {}
 85           , i = a(16);          #i就是16位随机字符
 86         return h.encText = b(d, g), 
 87         h.encText = b(h.encText, i),           #得到params,       做了两次加密,第一次 data+g
 88         h.encSecKey = c(i, e, f),              #得到encSecKey  
 89         h
 90     }
 91 '''
 92 # var bUM2x = window.asrsea(JSON.stringify(i6c), bsG7z(["流泪", "强"]), bsG7z(WW3x.md), bsG7z(["爱心", "女孩", "惊恐", "大笑"]));
 93 '''
 94   u6o.be6Y = function(Y6S, e6c) {
 95         var i6c = {}
 96           , e6c = NEJ.X({}, e6c)
 97           , mo0x = Y6S.indexOf("?");
 98         if (window.GEnc && /(^|\.com)\/api/.test(Y6S) && !(e6c.headers && e6c.headers[eu7n.AI4M] == eu7n.FD6x) && !e6c.noEnc) {
 99             if (mo0x != -1) {
100                 i6c = j6d.gW8O(Y6S.substring(mo0x + 1));
101                 Y6S = Y6S.substring(0, mo0x)
102             }
103             if (e6c.query) {
104                 i6c = NEJ.X(i6c, j6d.fT8L(e6c.query) ? j6d.gW8O(e6c.query) : e6c.query)
105             }
106             if (e6c.data) {
107                 i6c = NEJ.X(i6c, j6d.fT8L(e6c.data) ? j6d.gW8O(e6c.data) : e6c.data)
108             }
109             i6c["csrf_token"] = u6o.gQ8I("__csrf");
110             Y6S = Y6S.replace("api", "weapi");
111             e6c.method = "post";
112             delete e6c.query;
113             var bUM2x = window.asrsea(JSON.stringify(i6c), bsG7z(["流泪", "强"]), bsG7z(WW3x.md), bsG7z(["爱心", "女孩", "惊恐", "大笑"]));
114             e6c.data = j6d.cs6m({
115                 params: bUM2x.encText,
116                 encSecKey: bUM2x.encSecKey
117             })
118         }
119         var cdnHost = "y.music.163.com";
120         var apiHost = "interface.music.163.com";
121         if (location.host === cdnHost) {
122             Y6S = Y6S.replace(cdnHost, apiHost);
123             if (Y6S.match(/^\/(we)?api/)) {
124                 Y6S = "//" + apiHost + Y6S
125             }
126             e6c.cookie = true
127         }
128         cxg5l(Y6S, e6c)
129 '''
130 
# POST the two encrypted form fields to the comment endpoint and print the raw
# response body.
# An explicit timeout is passed because requests applies none by default, so a
# stalled connection would otherwise block the script indefinitely.
resp = requests.post(
    url,
    data={
        'params': get_params(json.dumps(data)),
        "encSecKey": get_encSecKey(),
    },
    timeout=10,
)
print(resp.text)
136 
137 
138 #上面是获取某一首歌的评论,变量主要在data里面,更换歌曲ID,可以通过页面查找获取
139 '''
140 url = 'https://music.163.com/playlist?id=6920064959'
141 resp = requests.get(url=url,headers=headers)
142 resp.encoding = 'utf-8'
143 # print(resp.text)
144 # 用RE
145 # obj = re.compile(r'<li><a href="/(?P<song_id>.*?)">(?P<song_title>.*?)</a></li>',re.S)
146 # songs = obj.finditer(resp.text)
147 # for my_list in songs:
148 #     aa = my_list.group('song_id').split('=')[-1]
149 #     print(aa,'\t',my_list.group('song_title'))
150 
151 #用Xpath
152 # etree = html.etree
153 # # print(resp.text)
154 # html = etree.HTML(resp.text)
155 # test = html.xpath('//html/body/div[3]/div[1]/div/div/div[2]/div[2]//a/@href')
156 # #这个地方试了好久,写全的话取不出来,感觉是跟页面有嵌套有关系
157 # print(test)
158 
159 #用bs4
160 # from bs4 import BeautifulSoup
161 # html = BeautifulSoup(resp.text,'html.parser')
162 # test = html.find('ul',class_='f-hide').find_all('a')
163 # print(test)
164 '''

 

上一篇:RestHighLevelClient使用记录


下一篇:element-ui upload头像上传