今日头条app数据爬虫demo

import json
import os
import time
from urllib import request
from urllib.parse import quote

import requests
"""
1.综合
2.视频
3.资讯
4.小视频
5.图片
6.用户
7.音乐
8.问答
9.微头条
10.话题
"""
# Query-string fragment that selects each search tab. queryList() indexes
# this list directly with its ``tab`` argument, so positions are zero-based.
tab_list = [
    "pd=synthesis&from=search_tab",   # 0: general
    "pd=video&from=video",            # 1: video
    "pd=information&from=news",       # 2: news
    "pd=xiaoshipin&from=xiaoshipin",  # 3: short video
    "pd=atlas&from=gallery",          # 4: images
    "pd=user&from=media",             # 5: users
    "pd=music&from=music",            # 6: music
    "pd=question&from=question",      # 7: Q&A
    "pd=weitoutiao&from=weitoutiao",  # 8: micro-headlines
    "pd=huati&from=huati",            # 9: topics
]

# HTTP headers passed to every requests.post() call in this module: a single
# mobile-browser User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Linux; U; Android 6.0.1; zh-cn; MI 5s Build/MXB48T) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Version/4.0 Chrome/53.0.2785.146 Mobile Safari/537.36 "
        "XiaoMi/MiuiBrowser/8.7.1"
    ),
}

# keyWords: search keyword, page: page number, tab: search tab (module)
def queryList(keyWords,page,tab):
    """Run one search request and return the parsed JSON response.

    Args:
        keyWords: Raw (unencoded) search keyword.
        page: 1-based page number; converted to ``offset = (page - 1) * 10``.
        tab: Zero-based index into ``tab_list`` selecting the search tab.
            Note that the legend string at the top of the file numbers the
            tabs from 1, while this index starts at 0.

    Returns:
        The response body decoded as UTF-8 and parsed with ``json.loads``.

    Raises:
        IndexError: If ``tab`` is outside ``tab_list``.
        ValueError: If the response body is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # The keyword is a single query-string *value*, so every reserved
    # character must be percent-encoded. Leaving "&", "=" or "+" unescaped
    # lets the keyword inject extra parameters or be decoded as a space.
    keyWords = quote(keyWords, safe="", encoding="utf-8")
    # seconds, milliseconds
    time_second, time_second_min = get_time()
    count = 10
    offset = (page - 1) * count
    tab_str = tab_list[tab]
    # Built inside parentheses so the long literal can never be broken by a
    # stray newline in the middle of a string.
    url = (
        "http://ic.snssdk.com/api/search/content/?qc_query=&offset=" + str(offset)
        + "&action_type=input_keyword_search&has_count=&is_from_native=1&count=" + str(count)
        + "&format=json&source=input&keyword_type=&search_id=&search_position=search_bar&" + tab_str
        + "&keyword=" + keyWords
        + "&from_search_subtab=1&iid=57820401425&device_id=54550815314&ac=wifi&channel=xiaomi&aid=13&app_name=news_article&version_code=707&version_name=7.0.7&device_platform=android&ab_version=475404%2C680425%2C687252%2C684578%2C571130%2C665173%2C674056%2C639003%2C612193%2C691933%2C170988%2C643891%2C374117%2C687462%2C688267%2C655402%2C702095%2C613176%2C550042%2C686297%2C690816%2C687745%2C690975%2C649426%2C614097%2C677129%2C685523%2C522766%2C701302%2C416055%2C684977%2C703944%2C689886%2C693247%2C558140%2C586260%2C555254%2C471406%2C603441%2C700492%2C596392%2C660510%2C598626%2C701730%2C700540%2C686885%2C701724%2C677898%2C603383%2C603401%2C603403%2C603405%2C638928%2C699227%2C696109%2C703265%2C686031%2C661904%2C662644%2C703737%2C668775%2C673945%2C692060%2C693468%2C629151%2C645714%2C607361%2C609338%2C666965%2C698916%2C635529%2C669649%2C662099%2C696796%2C701078%2C693364%2C703077%2C697038%2C703339%2C689538%2C697022%2C668774%2C683805%2C698097%2C698380%2C688105%2C554836%2C694759%2C549647%2C699616%2C31240%2C572465%2C656568%2C644058%2C615291%2C606547%2C681183%2C703370%2C673168%2C702884%2C671426%2C546701%2C702195%2C641190%2C281297%2C678046%2C325620%2C678477%2C665474%2C696624%2C669034%2C700459%2C625065%2C652953%2C696373%2C696990%2C698915%2C693900%2C703230%2C680284%2C638336%2C467514%2C679100%2C697663%2C702714%2C702994%2C699109%2C702878%2C699036%2C595556%2C697759%2C702757%2C670151%2C661453%2C654127%2C698630%2C660830%2C688723%2C690189%2C691671%2C686376%2C699478%2C677774%2C697104%2C700437%2C486951%2C701439%2C662176%2C662350%2C633486%2C662684%2C661781%2C457480%2C649403%2C655988%2C648317%2C654049&ab_client=a1%2Cc4%2Ce1%2Cf1%2Cg2%2Cf7&ab_feature=94563%2C102749&abflag=3&ssmix=a&device_type=MI+8&device_brand=Xiaomi&language=zh&os_api=27&os_version=8.1.0&openudid=1a16ce94f2005274&manifest_version_code=707&resolution=1080*2118&dpi=440&update_version_code=70714&_rticket=" + str(time_second_min)
        + "&plugin=26958&fp=9lT_FSDqFYPZFlwIFrU1FYwIPM4q&tma_jssdk_version=1.10.3.4&rom_version=miui_v10_8.8.31&ts=" + str(time_second)
        + "&as=a2c555b4d565fcd9004533&mas=005bc89b119dd3e1d3f552f76df48fc2a6f6cdc4e4660e08ab"
    )
    response = requests.post(url=url, timeout=100, headers=headers)
    response_str = response.content.decode("utf-8")
    # Echo the raw body so a manual run shows exactly what the server sent.
    print(response_str)
    return json.loads(response_str)

def test():
    """Manual smoke test: POST one hard-coded search URL and print the body.

    The URL carries fixed ``_rticket``/``ts`` timestamps and a fixed
    URL-encoded keyword, so nothing is computed here; the function only
    performs the request and prints the UTF-8 decoded response.
    """
    # NOTE(review): the query string contains "ab_versionab_client=" -- it
    # looks like the ab_version value was stripped by hand; left unchanged.
    url = "http://ic.snssdk.com/api/search/content/?qc_query=&offset=10&action_type=input_keyword_search&has_count=&is_from_native=1&count=10&format=json&source=input&keyword_type=&search_id=&search_position=search_bar&pd=information&from=news&keyword=%E5%8D%8E%E4%B8%BA&from_search_subtab=3&iid=57820401425&device_id=54550815314&ac=wifi&channel=xiaomi&aid=13&app_name=news_article&version_code=707&version_name=7.0.7&device_platform=android&ab_versionab_client=a1%2Cc4%2Ce1%2Cf1%2Cg2%2Cf7&ab_feature=94563%2C102749&abflag=3&ssmix=a&device_type=MI+8&device_brand=Xiaomi&language=zh&os_api=27&os_version=8.1.0&openudid=1a16ce94f2005274&manifest_version_code=707&resolution=1080*2118&dpi=440&update_version_code=70714&_rticket=1547795488503&plugin=26958&fp=9lT_FSDqFYPZFlwIFrU1FYwIPM4q&tma_jssdk_version=1.10.3.4&rom_version=miui_v10_8.8.31&ts=1547795488&as=a2c5879430624c8cd12044&mas=00f71df35ab69fe5b9d8e4e1ec4ea19fc10f42e68cc0e4e63a"
    # Local copy of the request headers so this function runs on its own.
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; U; Android 6.0.1; zh-cn; MI 5s Build/MXB48T) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.146 Mobile Safari/537.36 XiaoMi/MiuiBrowser/8.7.1"
    }
    response = requests.post(url=url, timeout=100, headers=headers)
    # The encoding name must be a string; a bare utf-8 is parsed as the
    # expression ``utf - 8`` and raises NameError.
    text = str(response.content, encoding="utf-8")
    print(text)

def get_detail_url(result_list):
    """Build the article-detail and comment-list URLs for search results.

    Args:
        result_list: Iterable of dict-like search result items. Each item is
            read through ``.get`` for ``"id"`` (falling back to
            ``"group_id"``) and ``"title"``. ``None`` is treated as empty.

    Returns:
        A list with one dict per item that has a usable id, with keys
        ``"detailUrl"``, ``"commentUrl"``, ``"id"`` and ``"title"``
        (``title`` is ``None`` when the item has none). Items with neither
        ``id`` nor ``group_id`` are skipped, because the URLs built for them
        would contain the literal text "None".
    """
    # seconds, milliseconds
    time_second, time_second_min = get_time()
    # Shared device/client query string appended to every detail URL. The
    # expression is parenthesised so it can span lines without backslashes.
    detail_url_param = (
        "iid=57820401425&device_id=54550815314&ac=wifi&channel=xiaomi&aid=13&app_name=news_article&version_code=707&version_name=7.0.7&device_platform=android&ab_version=475404%2C680425%2C687252%2C684578%2C571130%2C665173%2C674056%2C639003%2C612193%2C691933%2C170988%2C643891%2C374117%2C687462%2C688267%2C655402%2C702095%2C613176%2C550042%2C686297%2C690816%2C687745%2C690975%2C649426%2C614097%2C677129%2C685523%2C522766%2C701302%2C416055%2C684977%2C703944%2C689886%2C693247%2C558140%2C586260%2C555254%2C471406%2C603441%2C700492%2C596392%2C660510%2C598626%2C701730%2C700540%2C686885%2C701724%2C677898%2C603383%2C603401%2C603403%2C603405%2C638928%2C699227%2C696109%2C703265%2C686031%2C661904%2C662644%2C703737%2C668775%2C673945%2C692060%2C693468%2C629151%2C645714%2C607361%2C609338%2C666965%2C698916%2C635529%2C669649%2C662099%2C696796%2C701078%2C693364%2C703077%2C697038%2C703339%2C689538%2C697022%2C668774%2C683805%2C698097%2C698380%2C688105%2C554836%2C694759%2C549647%2C699616%2C31240%2C572465%2C656568%2C644058%2C615291%2C606547%2C681183%2C703370%2C673168%2C702884%2C671426%2C546701%2C702195%2C641190%2C281297%2C678046%2C325620%2C678477%2C665474%2C696624%2C669034%2C700459%2C625065%2C652953%2C696373%2C696990%2C698915%2C693900%2C703230%2C680284%2C638336%2C467514%2C679100%2C697663%2C702714%2C702994%2C699109%2C702878%2C699036%2C595556%2C697759%2C702757%2C670151%2C661453%2C654127%2C698630%2C660830%2C688723%2C690189%2C691671%2C686376%2C699478%2C677774%2C697104%2C700437%2C486951%2C701439%2C662176%2C662350%2C633486%2C662684%2C661781%2C457480%2C649403%2C655988%2C648317%2C654049&ab_client=a1%2Cc4%2Ce1%2Cf1%2Cg2%2Cf7&ab_feature=94563%2C102749&abflag=3&ssmix=a&device_type=MI+8&device_brand=Xiaomi&language=zh&os_api=27&os_version=8.1.0&openudid=1a16ce94f2005274&manifest_version_code=707&resolution=1080*2118&dpi=440&update_version_code=70714&_rticket=" + str(time_second_min)
        + "&plugin=26958&fp=9lT_FSDqFYPZFlwIFrU1FYwIPM4q&tma_jssdk_version=1.10.3.4&rom_version=miui_v10_8.8.31&ts=" + str(time_second)
        + "&as=a2c555b4d565fcd9004533&mas=005bc89b119dd3e1d3f552f76df48fc2a6f6cdc4e4660e08ab"
    )
    detail_url_head = "http://a.pstatp.com/article/full/22/1/"
    detail_url_center = "/0/0/0/0?"
    comment_url_head = "https://www.toutiao.com/api/comment/list/?group_id="
    comment_url_tail = "&offset=0&count=5"
    detail_list = []
    for item in result_list or []:
        # Prefer "id"; fall back to "group_id" when it is missing or falsy.
        id_str = item.get("id") or item.get("group_id")
        if not id_str:
            continue
        id_text = str(id_str)
        detail_list.append({
            # The id appears twice in the detail path and as both group_id
            # and item_id in the comment query.
            "detailUrl": detail_url_head + id_text + "/" + id_text + detail_url_center + detail_url_param,
            "commentUrl": comment_url_head + id_text + "&item_id=" + id_text + comment_url_tail,
            "id": id_str,
            "title": item.get("title"),
        })
    return detail_list

def load_detail(detail_list):
    """Download and save the detail and comment JSON for each entry.

    For every entry that has a title, the detail URL and the comment URL are
    fetched and written to ``detail/<id><title>.txt`` and
    ``comment/<id><title>.txt`` respectively. Entries without a title are
    skipped.

    Args:
        detail_list: List of dicts with keys ``"detailUrl"``,
            ``"commentUrl"``, ``"id"`` and optionally ``"title"``. An empty
            list or ``None`` is a no-op.

    Raises:
        KeyError: If an entry lacks ``"detailUrl"``, ``"commentUrl"`` or
            ``"id"``.
        ValueError: If a response body is not valid JSON.
    """
    if not detail_list:
        return
    # open(..., "w") does not create missing parent directories.
    for directory in ("detail", "comment"):
        os.makedirs(directory, exist_ok=True)
    for item in detail_list:
        detailUrl = item["detailUrl"]
        commentUrl = item["commentUrl"]
        id_str = item["id"]
        title = item.get("title")
        if not title:
            continue
        # Strip the path separator and whitespace so the title can be used
        # as part of a file name.
        title = title.replace("/", "").replace("\n", "").replace("\r", "").replace(" ", "")
        file_name = str(id_str) + title + ".txt"
        _fetch_and_save(detailUrl, os.path.join("detail", file_name))
        _fetch_and_save(commentUrl, os.path.join("comment", file_name))


def _fetch_and_save(url, path):
    """POST to ``url``, parse the UTF-8 body as JSON and write it to ``path``."""
    response = requests.post(url=url, timeout=100, headers=headers)
    response_str = response.content.decode("utf-8")
    print(response_str)
    # json.loads() no longer accepts an ``encoding`` keyword (removed in
    # Python 3.9); the body is already decoded to str above.
    response_json = json.loads(response_str)
    with open(path, mode="w", encoding="utf-8") as file:
        file.write(json.dumps(response_json, ensure_ascii=False))

# Get the current time
def get_time():
    """Return the current Unix time as ``(seconds, milliseconds)``.

    Both values come from a single ``time.time()`` reading: the seconds are
    truncated to an int, the milliseconds are rounded to the nearest int.
    """
    now = time.time()
    return int(now), int(round(now * 1000))


# Script entry point: prompt for a keyword, page and tab, run the search,
# then download the detail and comment JSON of every result.
# "__main__" must be a quoted string; the bare name raises NameError.
if __name__ == "__main__":
    # test()
    keyWords = input("请输入关键词:")
    page = input("请输入页数:")
    # The tab number is used directly as an index into tab_list (0-based).
    tab_index = input("请输入模块:")
    result_json = queryList(keyWords, int(page), int(tab_index))
    # Treat a missing or null "data" field as "no results" instead of
    # failing with KeyError/TypeError.
    result_list = result_json.get("data") or []
    detail_list = get_detail_url(result_list)
    print(detail_list)
    load_detail(detail_list)

 

import json import time from urllib.parse import quote from urllib import request import requests """ 1.综合 2.视屏 3.资讯 4.小视屏 5.图片 6.用户 7.音乐 8.问答 9.微头条 10.话题 """ tab_list = [ "pd=synthesis&from=search_tab", "pd=video&from=video", "pd=information&from=news", "pd=xiaoshipin&from=xiaoshipin", "pd=atlas&from=gallery", "pd=user&from=media", "pd=music&from=music", "pd=question&from=question", "pd=weitoutiao&from=weitoutiao", "pd=huati&from=huati" ] headers = { "User-Agent": "Mozilla/5.0 (Linux; U; Android 6.0.1; zh-cn; MI 5s Build/MXB48T) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.146 Mobile Safari/537.36 XiaoMi/MiuiBrowser/8.7.1" } #keyWords关键词,page分页,tab模块 def queryList(keyWords,page,tab): keyWords = quote(keyWords, safe=";/?:@&=+$,", encoding="utf-8") #秒 毫秒 time_second,time_second_min = get_time() count = 10 offset = (page-1) * count tab_str = tab_list[tab] url = "http://ic.snssdk.com/api/search/content/?qc_query=&offset="+str(offset)+"&action_type=input_keyword_search&has_count=&is_from_native=1&count="+str(count)+"&format=json&source=input&keyword_type=&search_id=&search_position=search_bar&"+tab_str+"&keyword="+str(keyWords)+"&from_search_subtab=1&iid=57820401425&device_id=54550815314&ac=wifi&channel=xiaomi&aid=13&app_name=news_article&version_code=707&version_name=7.0.7&device_platform=android&ab_version=475404%2C680425%2C687252%2C684578%2C571130%2C665173%2C674056%2C639003%2C612193%2C691933%2C170988%2C643891%2C374117%2C687462%2C688267%2C655402%2C702095%2C613176%2C550042%2C686297%2C690816%2C687745%2C690975%2C649426%2C614097%2C677129%2C685523%2C522766%2C701302%2C416055%2C684977%2C703944%2C689886%2C693247%2C558140%2C586260%2C555254%2C471406%2C603441%2C700492%2C596392%2C660510%2C598626%2C701730%2C700540%2C686885%2C701724%2C677898%2C603383%2C603401%2C603403%2C603405%2C638928%2C699227%2C696109%2C703265%2C686031%2C661904%2C662644%2C703737%2C668775%2C673945%2C692060%2C693468%2C629151%2C645714%2C607361%2C609338%2C666965%2C698916%2C635529%2C669649%2C662
099%2C696796%2C701078%2C693364%2C703077%2C697038%2C703339%2C689538%2C697022%2C668774%2C683805%2C698097%2C698380%2C688105%2C554836%2C694759%2C549647%2C699616%2C31240%2C572465%2C656568%2C644058%2C615291%2C606547%2C681183%2C703370%2C673168%2C702884%2C671426%2C546701%2C702195%2C641190%2C281297%2C678046%2C325620%2C678477%2C665474%2C696624%2C669034%2C700459%2C625065%2C652953%2C696373%2C696990%2C698915%2C693900%2C703230%2C680284%2C638336%2C467514%2C679100%2C697663%2C702714%2C702994%2C699109%2C702878%2C699036%2C595556%2C697759%2C702757%2C670151%2C661453%2C654127%2C698630%2C660830%2C688723%2C690189%2C691671%2C686376%2C699478%2C677774%2C697104%2C700437%2C486951%2C701439%2C662176%2C662350%2C633486%2C662684%2C661781%2C457480%2C649403%2C655988%2C648317%2C654049&ab_client=a1%2Cc4%2Ce1%2Cf1%2Cg2%2Cf7&ab_feature=94563%2C102749&abflag=3&ssmix=a&device_type=MI+8&device_brand=Xiaomi&language=zh&os_api=27&os_version=8.1.0&openudid=1a16ce94f2005274&manifest_version_code=707&resolution=1080*2118&dpi=440&update_version_code=70714&_rticket="+str(time_second_min)+"&plugin=26958&fp=9lT_FSDqFYPZFlwIFrU1FYwIPM4q&tma_jssdk_version=1.10.3.4&rom_version=miui_v10_8.8.31&ts="+str(time_second)+"&as=a2c555b4d565fcd9004533&mas=005bc89b119dd3e1d3f552f76df48fc2a6f6cdc4e4660e08ab" response = requests.post(url=url,timeout=100,headers=headers) response_str = str(response.content,encoding="utf-8") print(response_str) result_json = json.loads(response_str) return result_json def test(): url = 
"http://ic.snssdk.com/api/search/content/?qc_query=&offset=10&action_type=input_keyword_search&has_count=&is_from_native=1&count=10&format=json&source=input&keyword_type=&search_id=&search_position=search_bar&pd=information&from=news&keyword=%E5%8D%8E%E4%B8%BA&from_search_subtab=3&iid=57820401425&device_id=54550815314&ac=wifi&channel=xiaomi&aid=13&app_name=news_article&version_code=707&version_name=7.0.7&device_platform=android&ab_versionab_client=a1%2Cc4%2Ce1%2Cf1%2Cg2%2Cf7&ab_feature=94563%2C102749&abflag=3&ssmix=a&device_type=MI+8&device_brand=Xiaomi&language=zh&os_api=27&os_version=8.1.0&openudid=1a16ce94f2005274&manifest_version_code=707&resolution=1080*2118&dpi=440&update_version_code=70714&_rticket=1547795488503&plugin=26958&fp=9lT_FSDqFYPZFlwIFrU1FYwIPM4q&tma_jssdk_version=1.10.3.4&rom_version=miui_v10_8.8.31&ts=1547795488&as=a2c5879430624c8cd12044&mas=00f71df35ab69fe5b9d8e4e1ec4ea19fc10f42e68cc0e4e63a" headers = { "User-Agent": "Mozilla/5.0 (Linux; U; Android 6.0.1; zh-cn; MI 5s Build/MXB48T) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.146 Mobile Safari/537.36 XiaoMi/MiuiBrowser/8.7.1" } response = requests.post(url=url, timeout=100, headers=headers) text = str(response.content,encoding=‘utf-8‘) print(text) def get_detail_url(result_list): # 秒 毫秒 time_second, time_second_min = get_time() detail_url_param = 
"iid=57820401425&device_id=54550815314&ac=wifi&channel=xiaomi&aid=13&app_name=news_article&version_code=707&version_name=7.0.7&device_platform=android&ab_version=475404%2C680425%2C687252%2C684578%2C571130%2C665173%2C674056%2C639003%2C612193%2C691933%2C170988%2C643891%2C374117%2C687462%2C688267%2C655402%2C702095%2C613176%2C550042%2C686297%2C690816%2C687745%2C690975%2C649426%2C614097%2C677129%2C685523%2C522766%2C701302%2C416055%2C684977%2C703944%2C689886%2C693247%2C558140%2C586260%2C555254%2C471406%2C603441%2C700492%2C596392%2C660510%2C598626%2C701730%2C700540%2C686885%2C701724%2C677898%2C603383%2C603401%2C603403%2C603405%2C638928%2C699227%2C696109%2C703265%2C686031%2C661904%2C662644%2C703737%2C668775%2C673945%2C692060%2C693468%2C629151%2C645714%2C607361%2C609338%2C666965%2C698916%2C635529%2C669649%2C662099%2C696796%2C701078%2C693364%2C703077%2C697038%2C703339%2C689538%2C697022%2C668774%2C683805%2C698097%2C698380%2C688105%2C554836%2C694759%2C549647%2C699616%2C31240%2C572465%2C656568%2C644058%2C615291%2C606547%2C681183%2C703370%2C673168%2C702884%2C671426%2C546701%2C702195%2C641190%2C281297%2C678046%2C325620%2C678477%2C665474%2C696624%2C669034%2C700459%2C625065%2C652953%2C696373%2C696990%2C698915%2C693900%2C703230%2C680284%2C638336%2C467514%2C679100%2C697663%2C702714%2C702994%2C699109%2C702878%2C699036%2C595556%2C697759%2C702757%2C670151%2C661453%2C654127%2C698630%2C660830%2C688723%2C690189%2C691671%2C686376%2C699478%2C677774%2C697104%2C700437%2C486951%2C701439%2C662176%2C662350%2C633486%2C662684%2C661781%2C457480%2C649403%2C655988%2C648317%2C654049&ab_client=a1%2Cc4%2Ce1%2Cf1%2Cg2%2Cf7&ab_feature=94563%2C102749&abflag=3&ssmix=a&device_type=MI+8&device_brand=Xiaomi&language=zh&os_api=27&os_version=8.1.0&openudid=1a16ce94f2005274&manifest_version_code=707&resolution=1080*2118&dpi=440&update_version_code=70714&_rticket=" + \ str(time_second_min) + "&plugin=26958&fp=9lT_FSDqFYPZFlwIFrU1FYwIPM4q&tma_jssdk_version=1.10.3.4&rom_version=miui_v10_8.8.31&ts=" + \ 
str(time_second) + "&as=a2c555b4d565fcd9004533&mas=005bc89b119dd3e1d3f552f76df48fc2a6f6cdc4e4660e08ab" detail_url_head = "http://a.pstatp.com/article/full/22/1/" detail_url_center = "/0/0/0/0?" comment_url_head = "https://www.toutiao.com/api/comment/list/?group_id=" comment_url_tail = "&offset=0&count=5" detail_list = [] for item in result_list: id_str = item.get("id", None) if not id_str: id_str = item.get("group_id", None) title = item.get("title",None) detail_url = detail_url_head + str(id_str) + "/" + str(id_str) + detail_url_center + detail_url_param comment_url = comment_url_head + str(id_str) + "&item_id=" + str(id_str) + comment_url_tail detail_data = { "detailUrl" : detail_url, "commentUrl" : comment_url, "id" : id_str, "title" : title } detail_list.append(detail_data) return detail_list def load_detail(detail_list): if len(detail_list) < 1: return for item in detail_list: detailUrl = item["detailUrl"] commentUrl = item["commentUrl"] id_str = item["id"] title = item.get("title",None) if title: title = title.replace("/","").replace("\n","").replace("\r","").replace(" ","") else: continue response = requests.post(url=detailUrl, timeout=100, headers=headers) response_str = str(response.content, encoding="utf-8") print(response_str) response_json = json.loads(response_str,encoding="utf-8") with open("detail/"+str(id_str) + title + ".txt",mode="w",encoding="utf-8") as file: file.write(json.dumps(response_json,ensure_ascii = False)) response = requests.post(url=commentUrl, timeout=100, headers=headers) response_str = str(response.content, encoding="utf-8") print(response_str) response_json = json.loads(response_str,encoding="utf-8") with open("comment/"+str(id_str) + title + ".txt",mode="w",encoding="utf-8") as file: file.write(json.dumps(response_json,ensure_ascii = False)) #获取时间 def get_time(): # 毫秒 t = time.time() time_second_min = int(round(t * 1000)) # 秒 time_second = int(t) return time_second,time_second_min if __name__ == ‘__main__‘: # test() keyWords = 
input("请输入关键词:") page = input("请输入页数:") tab_index = input("请输入模块:") result_json = queryList(keyWords,int(page),int(tab_index)) result_list = result_json["data"] detail_list = get_detail_url(result_list) print(detail_list) load_detail(detail_list)

今日头条app数据爬虫demo

上一篇:js中有趣的数学


下一篇:appium+python+unittest+HTMLRunner编写UI自动化测试集