python爬取国家统计局2019年行政区划分数据mssql

请安装python3.7版本,更高版本无法使用pymssql包

#!/usr/bin/python
# -*- coding: UTF-8 -*-
#   功能:  获取省市县数据
#   版本:v1.1
import importlib
import sys
import pymssql
importlib.reload(sys)
import requests
import lxml.etree as etree
import os
class chinese_city():
    """Crawler for the NBS (stats.gov.cn) 2019 administrative-division pages.

    Walks the hierarchy city -> county -> town -> village starting from one
    province page and bulk-inserts each level into the MSSQL table
    ``fetch_area`` via pymssql.
    """

    def __init__(self):
        # XPath selector per administrative level:
        # 1=province, 2=city, 3=county, 4=town, 5=village.
        self.trdic = {
            1: '//tr[@class="provincetr"]',
            2: '//tr[@class="citytr"]',
            3: '//tr[@class="countytr"]',
            4: '//tr[@class="towntr"]',
            5: '//tr[@class="villagetr"]'
        }

    def crawl_page(self, url):
        """Fetch *url* and return its HTML text, or None on a request error.

        The stats.gov.cn pages are GBK-encoded, so the response encoding is
        forced before reading ``.text``.
        """
        try:
            headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
            }
            # NOTE(review): timeout is in seconds, so 1000 is ~17 minutes —
            # presumably meant to be much smaller; kept for compatibility.
            html = requests.get(url, headers=headers, timeout=1000)
            html.encoding = 'gbk'
            print(str(html.status_code) + url)
            return html.text
        except requests.exceptions.RequestException:
            # 超时 = "timed out"; keep best-effort behavior and signal failure.
            print('超时' + url)
            return None

    # 根据trid 解析子页 — parse one child page for the given level id.
    def parse(self, trid, pid, url):
        """Parse a level-*trid* listing page.

        :param trid: administrative level (2=city, 3=county, 4=town).
        :param pid:  parent area code, stored as ``pcode`` on every row.
        :param url:  absolute URL of the listing page; blank returns None.
        :return: list of row dicts (url/code/name/pcode/grade/cat) or None.
        """
        if url.strip() == '':
            return None
        html = self.crawl_page(url)
        if html is None:
            # Request failed; propagate "nothing parsed" instead of crashing
            # etree.HTML(None).
            return None
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
        if trid == 3:
            nodes = tree.xpath(self.trdic.get(trid))
            if len(nodes) == 0:
                # Some cities skip the county level and list towns directly.
                nodes = tree.xpath(self.trdic.get(4))
                print('有镇的市:' + url)  # "city that has towns directly"
        else:
            nodes = tree.xpath(self.trdic.get(trid))
        # Child links are relative, so derive the directory of the current page.
        path = os.path.basename(url)
        base_url = url.replace(path, '')
        values = []
        for node in nodes:
            nexturl = node.xpath('./td[1]/a/@href')
            # Rows without a link are leaf areas; skip them here (the
            # village level is handled by parseVillager instead).
            if len(nexturl) > 0:
                code = node.xpath('./td[1]/a/text()')
                if len(code) == 0:
                    code = node.xpath('./td[1]/text()')
                name = node.xpath('./td[2]/a/text()')
                if len(name) == 0:
                    name = node.xpath('./td[2]/text()')
                value = {}
                value['url'] = base_url + "".join(nexturl)
                value['code'] = "".join(code)
                value['name'] = "".join(name)
                value['pcode'] = pid
                value['grade'] = trid
                value['cat'] = 0
                values.append(value)
        return values

    # 解析社区页 — parse a village/community page (leaf level, no links).
    def parseVillager(self, trid, pid, url):
        """Parse a village listing page (level 5: code, category, name).

        :return: list of row dicts, or None on blank URL / failed request.
        """
        if url.strip() == '':
            return None
        html = self.crawl_page(url)
        if html is None:
            return None
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
        nodes = tree.xpath(self.trdic.get(trid))
        values = []
        for node in nodes:
            code = node.xpath('./td[1]/text()')
            cate = node.xpath('./td[2]/text()')  # urban/rural category code
            name = node.xpath('./td[3]/text()')
            value = {}
            value['code'] = "".join(code)
            value['name'] = "".join(name)
            value['pcode'] = pid
            value['grade'] = trid
            value['cat'] = cate
            values.append(value)
        return values

    # 插入数据库 — bulk-insert parsed rows into MSSQL.
    def insert_to_db(self, taobao):
        """Insert the given row dicts into ``fetch_area`` in one executemany.

        Commits on success, rolls back and prints the error on failure; the
        cursor and connection are always closed.
        """
        conn = pymssql.connect(
            host=r'127.0.0.1',
            user=r'sa',
            password='123',
            database='areadb'
        )
        cursor = conn.cursor()
        try:
            param = [
                (p.get("code"), p.get("grade"), p.get("pcode"),
                 p.get("name").strip(), p.get("cat"))
                for p in taobao
            ]
            # Parameterized insert; pymssql uses %s/%d placeholders.
            sql = 'INSERT INTO fetch_area VALUES(%s,%d,%s,%s,%d)'
            cursor.executemany(sql, param)
            conn.commit()
        except Exception as e:
            conn.rollback()
            print(e)
        finally:
            if cursor:
                cursor.close()
            if conn:
                conn.close()

    # 从头执行解析 — drive the full crawl from one province page.
    def parseChineseCity(self):
        """Crawl province 52 (Guizhou, code 520000000000) down to villages,
        inserting each level as soon as it is parsed."""
        citys = self.parse(
            2, 520000000000,
            'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/52.html')
        if citys is not None:
            self.insert_to_db(citys)
            for city in citys:
                countys = self.parse(3, city['code'], city['url'])
                if countys is not None:
                    self.insert_to_db(countys)
                    for county in countys:
                        towns = self.parse(4, county['code'], county['url'])
                        if towns is not None:
                            self.insert_to_db(towns)
                            for town in towns:
                                villagers = self.parseVillager(
                                    5, town['code'], town['url'])
                                if villagers is not None:
                                    self.insert_to_db(villagers)
 
if __name__ == '__main__':
    # Use a distinct local name so the class itself is not shadowed.
    crawler = chinese_city()
    crawler.parseChineseCity()

创建数据库表

-- Target table for the crawler: one row per administrative area,
-- filled by insert_to_db(code, grade, pcode, name, cat).
CREATE TABLE [dbo].[fetch_area](
    [code] [varchar](16) NULL,   -- area code from stats.gov.cn
    [grade] [int] NULL,          -- level: 2=city, 3=county, 4=town, 5=village
    [pcode] [varchar](16) NULL,  -- parent area code
    [name] [nvarchar](64) NULL,  -- area name (Chinese text)
    [cat] [int] NULL             -- urban/rural category (0 above village level)
)

 

python爬取国家统计局2019年行政区划分数据mssql

上一篇:MyBatis-Plus+mysql5.7 动态拼接sql语句 分页查询 自定义sql 查询条件 分组 排序


下一篇:transform feedback