import csv
import re
import time
from urllib import parse

import requests
from lxml import etree
from requests.exceptions import RequestException


def get_page(url):
    """Download a page and return its HTML.

    :param url: page URL to fetch
    :return: response body on HTTP 200, otherwise None (including on any
        requests-level network error)
    """
    headers = {
        # Desktop UA so Sogou serves the normal desktop result markup.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def timeswitch(chuo):
    """Convert a Unix timestamp in seconds to 'YYYY-MM-DD HH:MM:SS' (local time)."""
    tup_time = time.localtime(chuo)
    return time.strftime("%Y-%m-%d %H:%M:%S", tup_time)


def parse_page(text):
    """Parse one Sogou-WeChat search-result page.

    :param text: page HTML source
    :return: iterator of (title, link, summary, account_name, publish_time)
        tuples, truncated to the shortest extracted column
    """
    html = etree.HTML(text)

    # Titles: string(.) flattens the <em> highlight tags inside each <a>.
    titles = [a.xpath("string(.)")
              for a in html.xpath("//div[@class='txt-box']/h3/a")]

    # Links: hrefs are site-relative, so prefix the site root.
    # (Bug fix: the original joined base_url with the WHOLE href list for
    # every item instead of using the current href.)
    base_url = 'https://weixin.sogou.com'
    links = [base_url + href
             for href in html.xpath("//div[@class='txt-box']/h3//@href")]

    # Article summaries.
    summaries = [p.xpath("string(.)")
                 for p in html.xpath("//p[@class='txt-info']")]

    # Official-account (公众号) names.
    accounts = html.xpath("//a[@class='account']/text()")

    # Publish times: the span text embeds an epoch timestamp; pull out the
    # digits and convert to a human-readable date.
    publish_times = []
    for span in html.xpath("//div[2]/div/span"):
        raw = span.xpath("string(.)")
        digits = ''.join(re.findall(r"\d+\.?\d*", raw))
        publish_times.append(timeswitch(int(digits)))

    return zip(titles, links, summaries, accounts, publish_times)


def change_page1(number):
    """Build the search-result URL for result page *number*.

    :param number: 1-based page index
    :return: full query URL for that page
    """
    base_url = 'https://weixin.sogou.com/weixin?oq=&query=python&_sug_type_=1&sut=0&lkt=0%2C0%2C0&s_from=input&ri=1&_sug_=n&type=2&sst0=1604564741184&page='
    return base_url + str(number) + '&ie=utf8&p=40040108&dp=1&w=01015002&dr=1'


def save_to_csv(result, filename):
    """Append one row to *filename*.

    utf-8-sig (BOM) is used so Excel opens the Chinese text correctly;
    newline="" prevents blank rows on Windows.

    :param result: iterable of cell values for one row
    :param filename: CSV file path
    """
    with open(filename, 'a', encoding='utf-8-sig', newline="") as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        writer.writerow(result)


def main():
    """Scrape result pages 1-5 and append the rows to message.csv."""
    # Write the header row first (with-block replaces the manual close).
    with open('message.csv', 'a+', encoding='utf-8-sig', newline="") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(["文章名称", "文章链接地址", "摘要", "公众号名称", "发布时间"])

    for number in range(1, 6):
        url = change_page1(number)
        text = get_page(url)
        if text is None:
            # Bug fix: the original passed None straight into parse_page,
            # crashing on any failed download — skip the page instead.
            continue
        for row in parse_page(text):
            save_to_csv(row, filename='message.csv')


if __name__ == '__main__':
    main()