Python爬虫(一)之抓取36氪网站汽车文章

import base64
import csv
import json
import random
import re
import time

import requests
from Crypto.Cipher import AES
from lxml import etree

# Reverse-engineering notes for this 36kr crawler:
# https://blog.****.net/2301_79445611/article/details/133840084


class Kr36:
    """Crawl the 36kr "travel" article feed and dump the articles to a CSV file.

    The article list comes from a JSON gateway endpoint that is paged with an
    opaque ``pageCallback`` cursor.  Each article page embeds its data as a
    base64-encoded, AES-ECB-encrypted JSON blob, which is decrypted to obtain
    the article body.
    """

    # Seconds to wait for any single HTTP request before giving up, so a
    # stalled connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 30

    # AES-128 key used to decrypt the ``state`` blob embedded in article pages.
    _STATE_KEY = "efabccee-b754-4c".encode('utf-8')

    # Captures the encrypted ``state`` value from the inline script text.
    _STATE_RE = re.compile('.*?state":"(.*?)","')

    def __init__(self):
        """Set up the endpoint, paging bounds and the captured browser headers."""
        self.article_list_pre_url = "https://gateway.36kr.com/api/mis/nav/ifm/subNav/flow"
        # Inclusive page-number bounds for do_work().  Paging itself is
        # cursor-based, so these only control numbering and the page limit.
        self.start_page = 1
        self.end_page = 1000
        # Cursor sent with the first list request.
        self.init_page_callback = "eyJmaXJzdElkIjo0NTIzNzYyLCJsYXN0SWQiOjQ1MjIzOTAsImZpcnN0Q3JlYXRlVGltZSI6MTcxMDQxODU0MjA2NywibGFzdENyZWF0ZVRpbWUiOjE3MTAzMjk5MTM0MTl9"
        # Sample list payload; do_work() builds a fresh one for every page.
        self.payload = json.dumps({
            "partner_id": "web",
            "timestamp": 1710253726028,
            "param": {
                "subnavType": 1,
                "subnavNick": "travel",
                "pageSize": 30,
                "pageEvent": 1,
                "pageCallback": "eyJmaXJzdElkIjo0NTE5NDg3LCJsYXN0SWQiOjQ1MTc1NzksImZpcnN0Q3JlYXRlVGltZSI6MTcxMDEzMDE5OTM4MywibGFzdENyZWF0ZVRpbWUiOjE3MDk4NTUyMzkxMzl9",
                "siteId": 1,
                "platformId": 2
            }
        })
        # NOTE(review): the Cookie values below are hard-coded from a captured
        # browser session and will presumably go stale; refresh them if the
        # site starts rejecting requests.
        self.article_list_headers = {
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Content-Type': 'application/json',
            'Cookie': (
                'sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2218e330f24e210f-09e7e5136418ef-26001b51-1821369-18e330f24e3257b%22%2C%22%24device_id%22%3A%2218e330f24e210f-09e7e5136418ef-26001b51-1821369-18e330f24e3257b%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; '
                'tfstk=enSMvVNZXN8175m5p1K_xdJHfjzdCftXwsnvMnd48BRIbq59kwAcT6ANBtefiE5dimB9XtCng_CY1EesXI5HfMQtDCF6mt5Jm7F82uB1CIt4w7CpcU8wcEV4JbERCOMEIjyR8unD7DJAc9HIGUqPYrLyZYLloFBmJpdPQCWOBMl2-V69tFYhAdXHaOur0mSHIFjrr2urNOnXL-IElqTwdpARo9nzs4osO423K4Ze7p9_xJ2nlqTwdpA8KJ0r8FJBCkf..; '
                'Hm_lvt_1684191ccae0314c6254306a8333d090=1710253616,1710345937,1710421835; '
                'Hm_lvt_713123c60a0e86982326bae1a51083e1=1710253616,1710345937,1710421835; '
                'aliyungf_tc=4c273ea1e0ec1ba7c726c1d40e9f785731cff0f77ce5ac27f88ffeb1a6079cab; '
                'acw_tc=1a0c398517104218376678635e141118f68f5ec0ce2ac3421247f3e3c09817; '
                'Hm_lpvt_1684191ccae0314c6254306a8333d090=1710421897; '
                'Hm_lpvt_713123c60a0e86982326bae1a51083e1=1710421897'
            ),
            'Origin': 'https://36kr.com',
            'Referer': 'https://36kr.com/',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-site',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
            'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"'
        }
        self.article_detail_headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': (
                'sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2218e330f24e210f-09e7e5136418ef-26001b51-1821369-18e330f24e3257b%22%2C%22%24device_id%22%3A%2218e330f24e210f-09e7e5136418ef-26001b51-1821369-18e330f24e3257b%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; '
                'tfstk=enSMvVNZXN8175m5p1K_xdJHfjzdCftXwsnvMnd48BRIbq59kwAcT6ANBtefiE5dimB9XtCng_CY1EesXI5HfMQtDCF6mt5Jm7F82uB1CIt4w7CpcU8wcEV4JbERCOMEIjyR8unD7DJAc9HIGUqPYrLyZYLloFBmJpdPQCWOBMl2-V69tFYhAdXHaOur0mSHIFjrr2urNOnXL-IElqTwdpARo9nzs4osO423K4Ze7p9_xJ2nlqTwdpA8KJ0r8FJBCkf..; '
                'aliyungf_tc=c8a95eb5a40ff7daafa6a84507110db651dad31ea165d934af3ec32b3f6514cf; '
                'acw_tc=ac11000117104218364426625e82f18159f99e101f9e32e840da085962de21; '
                'Hm_lvt_1684191ccae0314c6254306a8333d090=1710253616,1710345937,1710421835; '
                'Hm_lvt_713123c60a0e86982326bae1a51083e1=1710253616,1710345937,1710421835; '
                'Hm_lpvt_1684191ccae0314c6254306a8333d090=1710421976; '
                'Hm_lpvt_713123c60a0e86982326bae1a51083e1=1710421976; '
                'SERVERID=6754aaff36cb16c614a357bbc08228ea|1710421983|1710421837'
            ),
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
            'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"'
        }
        # NOTE(review): these values contain "^" escapes that look like
        # residue from a Windows-cmd cURL command — confirm whether they
        # should be cleaned.  They are passed via ``cookies=`` in addition to
        # the 'Cookie' header above; verify which one the server receives.
        self.cookies = {
            "sensorsdata2015jssdkcross": "^%^7B^%^22distinct_id^%^22^%^3A^%^2218b2931b25d63d-08aa76c3e8a47a-78505770-1821369-18b2931b25e11b9^%^22^%^2C^%^22^%^24device_id^%^22^%^3A^%^2218b2931b25d63d-08aa76c3e8a47a-78505770-1821369-18b2931b25e11b9^%^22^%^2C^%^22props^%^22^%^3A^%^7B^%^22^%^24latest_traffic_source_type^%^22^%^3A^%^22^%^E7^%^9B^%^B4^%^E6^%^8E^%^A5^%^E6^%^B5^%^81^%^E9^%^87^%^8F^%^22^%^2C^%^22^%^24latest_referrer^%^22^%^3A^%^22^%^22^%^2C^%^22^%^24latest_referrer_host^%^22^%^3A^%^22^%^22^%^2C^%^22^%^24latest_search_keyword^%^22^%^3A^%^22^%^E6^%^9C^%^AA^%^E5^%^8F^%^96^%^E5^%^88^%^B0^%^E5^%^80^%^BC_^%^E7^%^9B^%^B4^%^E6^%^8E^%^A5^%^E6^%^89^%^93^%^E5^%^BC^%^80^%^22^%^7D^%^7D",
            "aliyungf_tc": "87a15f620fd2d71d70704946aa353992ba8148f24a896fdd26a3f2dda630d053",
            "acw_tc": "0a6fd1ef16973374951548793e6f60efb7406e5628b6676540af7b96b8de7d",
            "Hm_lvt_1684191ccae0314c6254306a8333d090": "1697203204,1697337496",
            "Hm_lvt_713123c60a0e86982326bae1a51083e1": "1697203204,1697337496",
            "Hm_lpvt_713123c60a0e86982326bae1a51083e1": "1697337578",
            "Hm_lpvt_1684191ccae0314c6254306a8333d090": "1697337578",
            "SERVERID": "d36083915ff24d6bb8cb3b8490c52181^|1697337581^|1697337496"
        }

    def post_request(self, url, headers, payload):
        """POST ``payload`` to ``url`` and return the response body as text.

        Raises ``requests.RequestException`` on network failure or timeout.
        """
        response = requests.request("POST", url, headers=headers, data=payload,
                                    timeout=self.REQUEST_TIMEOUT)
        return response.text

    def get_request(self, url, headers, cookies):
        """GET ``url`` and return the response body as text.

        Raises ``requests.RequestException`` on network failure or timeout.
        """
        response = requests.get(url, headers=headers, cookies=cookies,
                                timeout=self.REQUEST_TIMEOUT)
        return response.text

    def do_work(self):
        """Crawl the article feed page by page and write it to ``36kr.csv``.

        Stops after page ``self.end_page``, or earlier when the list endpoint
        returns no data, no items, or no cursor for the next page.
        """
        with open('36kr.csv', 'w', newline='', encoding='utf-8-sig') as file:
            writer = csv.writer(file)
            csv_title = ["标题", "作者", "发布时间", "原文地址", "正文"]
            writer.writerow(csv_title)
            page_callback = self.init_page_callback
            for page_no in range(self.start_page, self.end_page + 1):
                print("================> 当前第" + str(page_no) + "页 ============")
                payload = json.dumps({
                    "partner_id": "web",
                    "timestamp": int(round(time.time() * 1000)),
                    "param": {
                        "subnavType": 1,
                        "subnavNick": "travel",
                        "pageSize": 30,
                        "pageEvent": 1,
                        "pageCallback": page_callback,
                        "siteId": 1,
                        "platformId": 2
                    }
                })
                text = self.post_request(self.article_list_pre_url,
                                         headers=self.article_list_headers,
                                         payload=payload)
                data = json.loads(text).get("data")
                if not data:
                    # Without a data object there is no cursor to continue from.
                    print("================> 列表接口未返回数据,停止抓取: " + text[:200])
                    break
                item_list = data.get("itemList") or []
                if not item_list:
                    print("================> 没有更多文章,抓取结束")
                    break
                self.write_page(writer, item_list)
                page_callback = data.get("pageCallback")
                if not page_callback:
                    print("================> 接口未返回下一页游标,抓取结束")
                    break

    def write_page(self, writer, itemList):
        """Fetch every article in ``itemList`` and append one CSV row for each.

        Args:
            writer: a ``csv.writer``-like object exposing ``writerow``.
            itemList: items from the list endpoint; each needs ``itemId`` and
                a ``templateMaterial`` dict.

        An article that cannot be fetched or parsed is reported and skipped
        so that one bad page does not abort the whole crawl.  A random 20-60
        second pause follows every article to throttle requests.
        """
        for item in itemList:
            # Example article URL: https://36kr.com/p/2686487273459590
            article_url = "https://36kr.com/p/" + str(item["itemId"])
            seconds = random.randint(20, 60)
            try:
                html = self.get_request(article_url,
                                        headers=self.article_detail_headers,
                                        cookies=self.cookies)
                row = self._build_row(item, article_url, html)
            except (requests.RequestException, etree.LxmlError,
                    IndexError, KeyError, TypeError, ValueError) as exc:
                print("===========> 当前文章 " + article_url + " 抓取失败,已跳过(" + repr(exc) + "),等待" + str(seconds) + "秒继续")
            else:
                writer.writerow(row)
                print("===========> 当前文章 " + article_url + " 写入完毕,等待" + str(seconds) + "秒继续")
            time.sleep(seconds)

    def _build_row(self, item, article_url, html):
        """Return the CSV row for one article, in the header's column order."""
        material = item["templateMaterial"]
        state = self._decrypt_state(html)
        content = state["articleDetail"]["articleDetailData"]["data"]["widgetContent"]
        # publishTime is divided by 1000 before being formatted as local time.
        time_struct = time.localtime(int(material["publishTime"] / 1000))
        date = time.strftime("%Y-%m-%d %H:%M:%S", time_struct)
        # Order matches the header: title, author, publish time, URL, body.
        return [material["widgetTitle"], material["authorName"], date,
                article_url, content]

    def _decrypt_state(self, html):
        """Extract and decrypt the JSON ``state`` blob embedded in ``html``.

        Raises ``IndexError``/``ValueError`` when the page lacks the expected
        script or the blob cannot be decoded.
        """
        tree = etree.HTML(html)
        if tree is None:
            raise ValueError("empty or unparsable article page")
        script_text = tree.xpath('/html/body/script/text()')[0]
        match = self._STATE_RE.search(script_text)
        if match is None:
            raise ValueError("encrypted state not found in article page")
        aes = AES.new(self._STATE_KEY, AES.MODE_ECB)
        plain = aes.decrypt(base64.b64decode(match.group(1))).decode()
        # Drop the cipher padding that trails the closing brace of the JSON.
        return json.loads(plain[0: plain.rfind('}') + 1])


if __name__ == '__main__':
    kr36 = Kr36()
    kr36.do_work()
上一篇:深度学习+计算机视觉


下一篇:VUE 使用路由跳转登录页面