Install Python 3.7; the pymssql package cannot be used on newer Python versions.
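Before running the crawler, it is worth confirming that pymssql can actually reach the SQL Server instance. The sketch below is a minimal connectivity check; the host, user, password, and database values simply mirror the ones hard-coded in the script further down and should be adjusted to your own environment.

# Minimal pymssql connectivity check (connection settings are assumptions
# copied from the crawler below; change them to match your server).
import pymssql

conn = pymssql.connect(host='127.0.0.1', user='sa', password='123', database='areadb')
cursor = conn.cursor()
cursor.execute('SELECT @@VERSION')   # ask SQL Server for its version string
print(cursor.fetchone()[0])
conn.close()

If this prints the server version without raising an exception, the environment is ready for the crawler.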
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Purpose: fetch province/city/county data
# Version: v1.1
import os

import lxml.etree as etree
import pymssql
import requests


class chinese_city():
    # Map each administrative level to the XPath of its table rows
    def __init__(self):
        self.trdic = {
            1: '//tr[@class="provincetr"]',
            2: '//tr[@class="citytr"]',
            3: '//tr[@class="countytr"]',
            4: '//tr[@class="towntr"]',
            5: '//tr[@class="villagetr"]'
        }

    # Download a page and return its HTML text (decoded as GBK)
    def crawl_page(self, url):
        try:
            headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
            }
            html = requests.get(url, headers=headers, timeout=1000)
            html.encoding = 'gbk'
            print(str(html.status_code) + ' ' + url)
            return html.text
        except requests.exceptions.RequestException:
            print('Request timed out: ' + url)
            return None

    # Parse a child page according to the level given by trid
    def parse(self, trid, pid, url):
        if url.strip() == '':
            return None
        html = self.crawl_page(url)
        if html is None:
            return None
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
        if trid == 3:
            nodes = tree.xpath(self.trdic.get(trid))
            if len(nodes) == 0:
                # Some cities list towns directly, without a county level
                nodes = tree.xpath(self.trdic.get(4))
                print('City that lists towns directly: ' + url)
        else:
            nodes = tree.xpath(self.trdic.get(trid))
        path = os.path.basename(url)
        base_url = url.replace(path, '')
        values = []
        for node in nodes:
            nexturl = node.xpath('./td[1]/a/@href')
            # Rows without a link (leaf areas) are skipped
            if len(nexturl) > 0:
                code = node.xpath('./td[1]/a/text()')
                if len(code) == 0:
                    code = node.xpath('./td[1]/text()')
                name = node.xpath('./td[2]/a/text()')
                if len(name) == 0:
                    name = node.xpath('./td[2]/text()')
                value = {}
                value['url'] = base_url + "".join(nexturl)
                value['code'] = "".join(code)
                value['name'] = "".join(name)
                value['pcode'] = pid
                value['grade'] = trid
                value['cat'] = 0
                values.append(value)
        return values

    # Parse a village/community page (the leaf level; rows have no links)
    def parseVillager(self, trid, pid, url):
        if url.strip() == '':
            return None
        html = self.crawl_page(url)
        if html is None:
            return None
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
        nodes = tree.xpath(self.trdic.get(trid))
        values = []
        for node in nodes:
            code = node.xpath('./td[1]/text()')
            cate = node.xpath('./td[2]/text()')  # classification code column
            name = node.xpath('./td[3]/text()')
            value = {}
            value['code'] = "".join(code)
            value['name'] = "".join(name)
            value['pcode'] = pid
            value['grade'] = trid
            value['cat'] = "".join(cate)
            values.append(value)
        return values

    # Insert a batch of area records into SQL Server
    def insert_to_db(self, areas):
        conn = pymssql.connect(
            host=r'127.0.0.1',
            user=r'sa',
            password='123',
            database='areadb'
        )
        cursor = conn.cursor()
        try:
            param = []
            for p in areas:
                param.append((p.get("code"), p.get("grade"), p.get("pcode"),
                              p.get("name").strip(), p.get("cat")))
            sql = 'INSERT INTO fetch_area VALUES(%s,%d,%s,%s,%d)'
            cursor.executemany(sql, param)
            conn.commit()
        except Exception as e:
            conn.rollback()
            print(e)
        finally:
            if cursor:
                cursor.close()
            if conn:
                conn.close()

    # Walk the whole hierarchy, starting from one province page
    def parseChineseCity(self):
        citys = self.parse(2, '520000000000',
                           'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/52.html')
        if citys is not None:
            self.insert_to_db(citys)
            for city in citys:
                countys = self.parse(3, city['code'], city['url'])
                if countys is not None:
                    self.insert_to_db(countys)
                    for county in countys:
                        towns = self.parse(4, county['code'], county['url'])
                        if towns is not None:
                            self.insert_to_db(towns)
                            for town in towns:
                                villagers = self.parseVillager(5, town['code'], town['url'])
                                if villagers is not None:
                                    self.insert_to_db(villagers)


if __name__ == '__main__':
    chinese_city = chinese_city()
    chinese_city.parseChineseCity()
Create the database table:
CREATE TABLE [dbo].[fetch_area](
    [code]  [varchar](16)  NULL,
    [grade] [int]          NULL,
    [pcode] [varchar](16)  NULL,
    [name]  [nvarchar](64) NULL,
    [cat]   [int]          NULL
)
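Once the crawler has populated fetch_area, the hierarchy can be read back by joining each row to its parent through pcode. The sketch below is only an illustration and relies on assumptions taken from the script above: the same hard-coded connection settings, and grade 2 for cities and grade 3 for counties as assigned during parsing.

# Example: list each county together with its parent city, assuming the
# connection settings and grade values used by the crawler above.
import pymssql

conn = pymssql.connect(host='127.0.0.1', user='sa', password='123', database='areadb')
cursor = conn.cursor()
cursor.execute("""
    SELECT p.name AS city, c.name AS county, c.code
    FROM fetch_area c
    JOIN fetch_area p ON c.pcode = p.code
    WHERE c.grade = 3 AND p.grade = 2
    ORDER BY c.code
""")
for city, county, code in cursor.fetchall():
    print(city, county, code)
conn.close()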