【Web_接口爬虫_Python3_高德地图_request&os&etree】高德地图,商铺信息,爬取内容+下载图片,保存文本_20210326

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
'''
Author:leo
Date&Time:2021/03/26 18:10
Project:Python3  FileName:gaode_request.py
'''
# -*- coding: utf-8 -*-

import json, time, os, re, requests, random
from lxml import etree
from fake_useragent import UserAgent

class Gaode_requests(object):
    """Scrape AMap (www.amap.com) shop search results by parsing the HTML page.

    Extracts per-shop name / star rating / address / picture URL with XPath
    and downloads each shop picture into a local ``images_gaode`` directory.
    """

    def __init__(self):
        # Timestamps captured once at construction (used for log/file naming).
        self.logTime = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time()))
        self.curTime = time.strftime('%Y%m%d %H%M%S', time.localtime(time.time()))
        # "YYYYMMDD HHMMSS" -> date part and time part.
        self.nowdate_8, self.nowtime_6 = self.curTime.split(" ")
        # Explicitly disable system proxies for every request.
        self.proxies = {"http": None, "https": None}
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "guid=fd1b-ee7a-3f28-e167; UM_distinctid=177dbe27dd2261-090e01e25d14aa-5e1a3f18-100200-177dbe27dd35d9; cna=15DbF7eJLyYCAbfeFBk1OWNb; xlly_s=1; _uab_collina=161430179363728282983957; CNZZDATA1255626299=1850886754-1614296565-https%253A%252F%252Fwww.baidu.com%252F%7C1614301969; tfstk=ccKcB0qtcE7bjY_kRmsXLz8GvlcdZt2VGwQy43Eh62WhWMKPic4z8_q7G_RDXP1..; l=eBN5SFucjMSRj8hzBOfaourza779sIRYSuPzaNbMiOCP9TC65wVAW6gTj28BCnGVh6z6R3rMK82YBeYBqBAnnxv9sThLxkDmn; isg=BBcXPum9O1bm07_aExT_SeLopothXOu-04QrDWlEMeZNmDfacS_3DvKy-jiGdcM2",
            "Host": "www.amap.com",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
        }

    def get_proxies(self):
        """Demonstrate two ways of bypassing a system proxy; return the proxy mapping."""
        # Method 1: pass an explicit "no proxy" mapping on each request.
        proxies = {"http": None, "https": None}
        requests.get("http://ff2.pw", proxies=proxies)
        response = requests.get("http://httpbin.org/ip", proxies=proxies)
        print(response.text)

        # Method 2: a Session that ignores proxy environment variables.
        session = requests.Session()
        session.trust_env = False
        response = session.get('http://ff2.pw')
        print(response.text)
        return proxies

    def mkdir_file(self, path):
        """Create *path* (including parents).

        Returns True when the directory was created, False when it already existed.
        """
        path = path.strip().rstrip("\\")
        if os.path.exists(path):
            return False
        os.makedirs(path)
        return True

    def get_pic(self, pic_name, pic_url):
        """Download *pic_url* and save it as images_gaode/<pic_name>.jpg."""
        try:
            # Binary payload of the picture.
            res_pic = requests.get(url=pic_url, headers=self.headers, timeout=10, proxies=self.proxies)
            # BUG FIX: create the directory that is actually written to below
            # (the original created an unrelated absolute path "/爬虫\\images_gaode").
            self.mkdir_file("images_gaode")
            pic_dir = os.path.join("images_gaode", pic_name + '.jpg')
            # Context manager guarantees the file handle is closed.
            with open(pic_dir, 'wb') as fp:
                fp.write(res_pic.content)
        except requests.exceptions.RequestException:
            # Broadened from ConnectionError so timeouts/bad URLs are reported too.
            print('【失败】当前图片无法下载,图片地址:' + str(pic_url))

    def get_info(self, html, mode, x_xpath):
        """Evaluate *x_xpath* against the lxml tree *html*, post-processed per *mode*.

        mode:
            "string"         -> text content of the first match
            "url"/"picture"  -> first match as-is
            "list"           -> the raw list of matches
        On any failure, returns the xpath plus the error text instead of raising.
        """
        str_result = x_xpath
        try:
            if mode == "string":
                res_value = html.xpath(str_result)[0].xpath('string(.)')
            elif mode in ("url", "picture"):
                res_value = html.xpath(str_result)[0]
            elif mode == "list":
                res_value = html.xpath(str_result)
            else:
                # BUG FIX: an unknown mode previously fell through to an
                # UnboundLocalError; fail with an explicit message instead.
                raise ValueError("unknown mode: " + str(mode))
            if isinstance(res_value, str):
                # Drop characters not representable in GBK, then remove whitespace.
                res_value = res_value.encode('gbk', 'ignore').decode('gbk').strip().replace(" ", '').replace("\n", '')
            elif isinstance(res_value, list):
                res_value = " | ".join([l for l in res_value if "\n" not in l])
        except Exception as e:
            res_value = str(str_result) + "\t错误码:" + str(e)
        return res_value

    def get_html(self, url, log=False):
        """GET *url* and return it parsed as an lxml HTML tree (GBK-filtered text)."""
        response = requests.get(url, headers=self.headers, proxies=self.proxies)
        res_text = response.text.encode('gbk', 'ignore').decode('gbk')
        html = etree.HTML(res_text, etree.HTMLParser())
        if log:
            print("Start crawling:" + url)
        return html

    # Extract and print shop info from a search-result page.
    def get_catalog(self, html):
        """Walk the result <li> items: print shop info and download each picture."""
        html_list = html.xpath('//*[@class="serp-list"]/li')
        print("Statistic data:" + str(len(html_list)) + "条数据\n")
        # XPath positions are 1-based; iterate 1..len inclusive.
        # BUG FIX: range(1, len(html_list)) skipped the last list item.
        for i in range(1, len(html_list) + 1):
            res_pic_address = self.get_info(html=html, mode="url", x_xpath=f'//*[@class="serp-list"]/li[{i}]/div[@class="poi-imgbox"]/span/@style')
            res_store_name = self.get_info(html=html, mode="picture", x_xpath=f'//*[@class="serp-list"]/li[{i}]/div[@class="poi-info-left"]/h3[@class="poi-title"]/span/@title')
            res_store_star = self.get_info(html=html, mode="picture", x_xpath=f'//*[@class="serp-list"]/li[{i}]/div[@class="poi-info-left"]/div[@class="poi-info"]/p/span/b/@style')
            res_store_address = self.get_info(html=html, mode="string", x_xpath=f'//*[@class="serp-list"]/li[{i}]/div[@class="poi-info-left"]/div[@class="poi-info"]/p[2]')

            # Post-process the raw xpath results.
            res_pic_address = str(re.findall(r'''background-image: url\("([^"]+)"\);''', res_pic_address)[0])
            res_store_name = str(res_store_name)
            # Star rating is rendered as a CSS width in px; 13 px per star.
            res_store_star = str(float(re.findall(r'''width:([^"]+)px''', res_store_star)[0]) / 13.0)
            res_store_address = str(res_store_address)

            self.get_pic(pic_name=str(i) + "_" + res_store_name, pic_url=res_pic_address)
            print('店铺名称:' + res_store_name, end='\n------------列表信息------------\n')
            print('店铺星级:' + res_store_star, end='\n')
            print('店铺地址:' + res_store_address, end='\n')
            print('图片地址:' + res_pic_address, end='\n')

    def write_info(self, res_text):
        """Append *res_text* as one JSON line (plus a blank line) to response_2021.txt."""
        with open('response_2021.txt', 'a+', encoding='utf-8') as write:
            write.write(json.dumps(res_text, ensure_ascii=False) + '\n')
            write.write('\n')

    def run(self):
        """Entry point: crawl one search page and process its result list."""
        # BUG FIX: use self instead of constructing a second instance.
        html = self.get_html(url='https://www.amap.com/search?query=钱大妈&city=44060', log=True)
        self.get_catalog(html=html)
class Gaode_requests_json(object):
    """Scrape AMap shop search results through the JSON ``poiInfo`` endpoint.

    Pages through the endpoint, prints each shop's name / rating / address /
    picture URL and downloads the shop pictures to a per-page directory.
    """

    def __init__(self):
        # Minimal headers with a random User-Agent per instance.
        self.headers = {
            "Host": "map.amap.com",
            'User-Agent': UserAgent().random
        }
        # Explicitly disable system proxies.
        self.proxies = {"http": None, "https": None}
        # Reference query parameters for the poiInfo endpoint.
        # NOTE(review): currently unused — get_res() builds its URL inline with
        # different zoom/geoobj values; keep the two in sync manually.
        self.data = {
            "query_type": "TQUERY",
            "pagesize": "20",
            "pagenum": "1",
            "qii": "true",
            "cluster_state": "5",
            "need_utd": "true",
            "utd_sceneid": "1000",
            "div": "PC1000",
            "addr_poi_merge": "true",
            "is_classify": "true",
            "zoom": "9.16",
            "city": "440600",
            "geoobj": "103.868799|30.198854|105.050418|31.181088",
            "keywords": "钱大妈"
        }

    def mkdir_file(self, path):
        """Create *path* (including parents).

        Returns True when the directory was created, False when it already existed.
        """
        path = path.strip().rstrip("\\")
        if os.path.exists(path):
            return False
        os.makedirs(path)
        return True

    def get_pic(self, pic_pash, pic_name, pic_url):
        """Download *pic_url* into *pic_pash*/<pic_name>.jpg; return the file name."""
        try:
            # Binary payload of the picture.
            res_pic = requests.get(url=pic_url, headers=self.headers, timeout=10, proxies=self.proxies)
            self.mkdir_file(pic_pash)
            pic_dir = os.path.join(pic_pash, pic_name + '.jpg')
            # Context manager guarantees the file handle is closed.
            with open(pic_dir, 'wb') as fp:
                fp.write(res_pic.content)
        except requests.exceptions.RequestException:
            # Broadened from ConnectionError so timeouts/bad URLs are reported too.
            print('【失败】当前图片无法下载,图片地址:' + str(pic_url))
        return pic_name + ".jpg"

    def get_res(self):
        """Fetch up to 12 result pages and print/download each shop's info."""
        for page in range(1, 12 + 1):
            try:
                url = f"https://www.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum={page}&qii=true&cluster_state=5&need_utd=true&utd_sceneid=1000&div=PC1000&addr_poi_merge=true&is_classify=true&zoom=10&city=440600&geoobj=113.012853|22.753595|113.672034|23.341201&keywords=钱大妈"
                response = requests.get(url=url, timeout=5, headers=self.headers, proxies=self.proxies)
                res_json = response.json()
                time.sleep(1)  # be polite to the endpoint
                res_list = res_json["data"]["poi_list"]
                print(f"----第{page}页:数据{len(res_list)}条----")
                for i, poi in enumerate(res_list):
                    # Missing fields are replaced by the placeholder "空".
                    res_name = poi.get("disp_name", "空")
                    res_rating = poi.get("rating", "空")
                    res_address = poi.get("address", "空")
                    try:
                        res_pic_address = poi["domain_list"][5]["value"]
                    except (KeyError, IndexError, TypeError):
                        res_pic_address = "空"
                    # BUG FIX: reset pic_name every iteration; previously a failed
                    # download printed the previous shop's stale file name.
                    pic_name = "空"
                    try:
                        pic_name = self.get_pic(pic_pash=f"D:\\Mytest\\Svnbucket\\Python3\\爬虫\\gaode_file\\images_gaode_page{page}", pic_name=str(i + 1) + "_" + res_name + "_" + res_address, pic_url=res_pic_address)
                    except Exception:
                        res_pic_address = "空"
                    print(f'{i + 1}.店铺名称:' + res_name, end='\n')
                    # str() guards against a non-string rating value in the JSON.
                    print('店铺星级:' + str(res_rating), end='\n')
                    print('店铺地址:' + res_address, end='\n')
                    print('图片名称:' + pic_name, end='\n')
                    print('图片地址:' + res_pic_address, end='\n\n')
            except Exception as e:
                # Page-level failures are logged and the next page is tried.
                print(e)

    def run(self):
        """Entry point: crawl all pages via the JSON endpoint."""
        # BUG FIX: use self instead of constructing a second instance.
        self.get_res()



if __name__ == "__main__":
    # Crawl via the JSON-endpoint implementation.
    crawler = Gaode_requests_json()
    crawler.run()
----第1页:数据20条----
1.店铺名称:钱大妈
店铺星级:4.6
店铺地址:海六路17号
图片名称:1_钱大妈_海六路17号.jpg
图片地址:http://store.is.autonavi.com/showpic/20105d428a05884bd27f3b08d8720b21

2.店铺名称:钱大妈
店铺星级:3.5
店铺地址:
图片名称:2_钱大妈_.jpg
图片地址:http://store.is.autonavi.com/showpic/19fbf6b3b480d6af29a63969dcf28de8

3.店铺名称:钱大妈
店铺星级:4.6
店铺地址:盐步穗盐东路花地湾雍景豪园帝景台25-30座61号商铺
图片名称:3_钱大妈_盐步穗盐东路花地湾雍景豪园帝景台25-30座61号商铺.jpg
图片地址:http://store.is.autonavi.com/showpic/5ba6be6a133c2ad85911559016651ef9

4.店铺名称:钱大妈
店铺星级:4.6
店铺地址:大沥镇建设大道中海金沙湾中区商业街B53号
图片名称:4_钱大妈_大沥镇建设大道中海金沙湾中区商业街B53号.jpg
图片地址:http://store.is.autonavi.com/showpic/a15e133eadb5fb4a5c5ba4a4ae484296

5.店铺名称:钱大妈
店铺星级:
店铺地址:荷富路与明国路交叉路口往东南约50米(美的西海岸)
图片名称:4_钱大妈_大沥镇建设大道中海金沙湾中区商业街B53号.jpg
图片地址:空

6.店铺名称:钱大妈
店铺星级:
店铺地址:文星路活力盈居地铺10号
图片名称:6_钱大妈_文星路活力盈居地铺10号.jpg
图片地址:http://store.is.autonavi.com/showpic/11f18279ee0be161718ebf6d38f7a2c8

7.店铺名称:钱大妈
店铺星级:3.5
店铺地址:佛山大道北东海国际3区2期达伦五金对面
图片名称:7_钱大妈_佛山大道北东海国际3区2期达伦五金对面.jpg
图片地址:http://store.is.autonavi.com/showpic/df8146ecbc6601c141df94492d6209cf

8.店铺名称:钱大妈
店铺星级:
店铺地址:
图片名称:8_钱大妈_.jpg
图片地址:http://store.is.autonavi.com/showpic/ce2ed6c27936805194b189fbbbad4a90&type=pic

9.店铺名称:钱大妈
店铺星级:
店铺地址:碧桂园翡翠湾东南(庆云大道北)
图片名称:9_钱大妈_碧桂园翡翠湾东南(庆云大道北).jpg
图片地址:http://store.is.autonavi.com/showpic/2806d087aa92d9f443c58acd7061d9c3

10.店铺名称:钱大妈
店铺星级:3.5
店铺地址:大沥镇盐步广佛路平地段89号
图片名称:10_钱大妈_大沥镇盐步广佛路平地段89号.jpg
图片地址:http://store.is.autonavi.com/showpic/38bc9fba23aeed54fd96c6c12f46568b


上一篇:搬家第14天-138.Wincc V7.3 c脚本计算ListView勾选的行数、勾选行flow1求和、最大值、最小值


下一篇:scrapy简单命令行