jsoup爬取全国省市区

项目中经常用到全国省市区的数据表,但是这些数据又会经常变动,每次都需要重新查找最新的数据,很麻烦。特此记录一下:用jsoup爬取国家统计局的数据。

jsoup爬取全国省市区

1.引入jar包

<!-- jsoup: HTML fetching/parsing library used by the crawler code in step 3. -->
<!-- NOTE(review): 1.7.3 is an old release; jsoup versions before 1.14.2 are reported as affected
     by a parser denial-of-service advisory (CVE-2021-37714). Consider upgrading to a current
     release after verifying the advisory and re-testing the crawler. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.7.3</version>
</dependency>

2.创建数据表,生成Haha实体类

-- Administrative-division table filled by the crawler: one row per province, city or county.
-- `parent_id` holds the `id` of the enclosing division; 0 marks a top-level row (province).
-- WARNING: the DROP makes this script destructive; any existing rows are lost on re-run.
DROP TABLE IF EXISTS `haha`;
CREATE TABLE `haha` (
    `id` int NOT NULL AUTO_INCREMENT,
    -- Administrative division code; nullable because the crawler saves a province
    -- without a code when its name is not found in the lookup table.
    `code` bigint DEFAULT NULL,
    `name` varchar(255) NOT NULL,
    -- Numeric default (was the string literal '0', which relied on implicit coercion).
    `parent_id` int NOT NULL DEFAULT 0,
    PRIMARY KEY (`id`),
    -- Children are always looked up by parent, so index the linking column.
    KEY `haha_parent_id_idx` (`parent_id`)
-- No AUTO_INCREMENT=3647 seed: that value was left over from a dump of a populated
-- table and made a freshly created table start numbering at 3647.
-- utf8mb4 instead of utf8: MySQL's utf8 is limited to 3-byte characters.
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

 

3.实现接口

jsoup爬取全国省市区
// Province lookup table (JSON): maps a province name to its 6-digit administrative code.
        // The names for codes 540000, 650000 and 710000 had been replaced by "*" placeholders,
        // so they could never equal a name scraped from the page and those provinces were
        // stored without a code. The full names are restored here.
        String provinceJson = "[{\"code\":110000,\"id\":1,\"name\":\"北京市\",\"parentId\":0}," +
                "{\"code\":120000,\"id\":2,\"name\":\"天津市\",\"parentId\":0}," +
                "{\"code\":130000,\"id\":3,\"name\":\"河北省\",\"parentId\":0}," +
                "{\"code\":140000,\"id\":4,\"name\":\"山西省\",\"parentId\":0}," +
                "{\"code\":150000,\"id\":5,\"name\":\"内蒙古自治区\",\"parentId\":0}," +
                "{\"code\":210000,\"id\":6,\"name\":\"辽宁省\",\"parentId\":0}," +
                "{\"code\":220000,\"id\":7,\"name\":\"吉林省\",\"parentId\":0}," +
                "{\"code\":230000,\"id\":8,\"name\":\"黑龙江省\",\"parentId\":0}," +
                "{\"code\":310000,\"id\":9,\"name\":\"上海市\",\"parentId\":0}," +
                "{\"code\":320000,\"id\":10,\"name\":\"江苏省\",\"parentId\":0}," +
                "{\"code\":330000,\"id\":11,\"name\":\"浙江省\",\"parentId\":0}," +
                "{\"code\":340000,\"id\":12,\"name\":\"安徽省\",\"parentId\":0}," +
                "{\"code\":350000,\"id\":13,\"name\":\"福建省\",\"parentId\":0}," +
                "{\"code\":360000,\"id\":14,\"name\":\"江西省\",\"parentId\":0}," +
                "{\"code\":370000,\"id\":15,\"name\":\"山东省\",\"parentId\":0}," +
                "{\"code\":410000,\"id\":16,\"name\":\"河南省\",\"parentId\":0}," +
                "{\"code\":420000,\"id\":17,\"name\":\"湖北省\",\"parentId\":0}," +
                "{\"code\":430000,\"id\":18,\"name\":\"湖南省\",\"parentId\":0}," +
                "{\"code\":440000,\"id\":19,\"name\":\"广东省\",\"parentId\":0}," +
                "{\"code\":450000,\"id\":20,\"name\":\"广西壮族自治区\",\"parentId\":0}," +
                "{\"code\":460000,\"id\":21,\"name\":\"海南省\",\"parentId\":0}," +
                "{\"code\":500000,\"id\":22,\"name\":\"重庆市\",\"parentId\":0}," +
                "{\"code\":510000,\"id\":23,\"name\":\"四川省\",\"parentId\":0}," +
                "{\"code\":520000,\"id\":24,\"name\":\"贵州省\",\"parentId\":0}," +
                "{\"code\":530000,\"id\":25,\"name\":\"云南省\",\"parentId\":0}," +
                "{\"code\":540000,\"id\":26,\"name\":\"西藏自治区\",\"parentId\":0}," +
                "{\"code\":610000,\"id\":27,\"name\":\"陕西省\",\"parentId\":0}," +
                "{\"code\":620000,\"id\":28,\"name\":\"甘肃省\",\"parentId\":0}," +
                "{\"code\":630000,\"id\":29,\"name\":\"青海省\",\"parentId\":0}," +
                "{\"code\":640000,\"id\":30,\"name\":\"宁夏回族自治区\",\"parentId\":0}," +
                "{\"code\":650000,\"id\":31,\"name\":\"新疆维吾尔自治区\",\"parentId\":0}," +
                "{\"code\":710000,\"id\":32,\"name\":\"台湾省\",\"parentId\":0}," +
                "{\"code\":810000,\"id\":33,\"name\":\"香港特别行政区\",\"parentId\":0}," +
                "{\"code\":820000,\"id\":34,\"name\":\"澳门特别行政区\",\"parentId\":0}]\n";

        JSONArray array = JSONArray.parseArray(provinceJson);

        // Root of the 2019 division-code pages on the National Bureau of Statistics site.
        // Every href scraped below is appended to this root, exactly as the page links are written.
        String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/";
        // Browser-like user agent sent with every request.
        String userAgent = "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)";
        // Page codes are divided by this value before being stored.
        // NOTE(review): presumably the pages list 12-digit codes and this keeps the leading
        // 6 digits so they line up with the 6-digit province codes above; confirm against the site.
        long codeDivisor = 1000000;

        // Index page listing all provinces.
        String url = baseUrl + "index.html";

        // Connect with the user agent and a 6 s timeout, then issue a GET request.
        Connection connect = Jsoup.connect(url);
        Document document = connect.userAgent(userAgent).timeout(6000).ignoreContentType(true).get();

        // Level 1: each "provincetr" row holds several cells, one province link per cell.
        for (Element provinceRow : document.getElementsByClass("provincetr")) {
            for (Element cell : provinceRow.children()) {
                // A cell without a child element has no province link; child(0) would throw on it.
                if (cell.children().isEmpty()) {
                    continue;
                }
                Element element = cell.child(0);
                String provinceName = element.text();

                Haha province = new Haha();
                province.setName(provinceName);
                // Resolve the province code by name; the code stays unset if the name is unknown.
                for (int i = 0; i < array.size(); i++) {
                    JSONObject json = array.getJSONObject(i);
                    if (provinceName.equals(json.getString("name"))) {
                        province.setCode(json.getLong("code"));
                        break;
                    }
                }

                province.save();

                // Without an href there is no city page to descend into.
                String provinceHref = element.attr("href");
                if (provinceHref.isEmpty()) {
                    continue;
                }
                Document provincePage = Jsoup.connect(baseUrl + provinceHref)
                        .userAgent(userAgent).timeout(6000).ignoreContentType(true).get();

                // Level 2: in a "citytr" row the first link carries the code, the later one the name.
                for (Element cityRow : provincePage.getElementsByClass("citytr")) {
                    Haha city = new Haha();
                    // Tracked per city so a row without a name link cannot reuse the previous
                    // city's county page (the shared url variable used to cause exactly that).
                    String countyHref = null;
                    boolean codeLink = true;

                    for (Element link : cityRow.getElementsByTag("a")) {
                        if (codeLink) {
                            codeLink = false;
                            city.setCode(Long.parseLong(link.text().trim()) / codeDivisor);
                        } else {
                            countyHref = link.attr("href");
                            city.setName(link.text().trim());
                        }
                    }
                    city.setParentId(province.getId());
                    city.save();

                    if (countyHref == null || countyHref.isEmpty()) {
                        continue;
                    }
                    Document cityPage = Jsoup.connect(baseUrl + countyHref)
                            .userAgent(userAgent).timeout(6000).ignoreContentType(true).get();

                    // Level 3: "countytr" rows are read cell by cell (td) rather than link by link;
                    // the first cell is the code, any later cell overwrites the name.
                    for (Element countyRow : cityPage.getElementsByClass("countytr")) {
                        Haha county = new Haha();
                        boolean codeCell = true;

                        for (Element td : countyRow.getElementsByTag("td")) {
                            if (codeCell) {
                                codeCell = false;
                                county.setCode(Long.parseLong(td.text().trim()) / codeDivisor);
                            } else {
                                county.setName(td.text().trim());
                            }
                        }
                        county.setParentId(city.getId());
                        county.save();
                    }
                }
            }
        }
View Code

 

4.请求接口

http://localhost/demo

 

上一篇:js算法-两种将数组数据转为属性结构记录


下一篇:iosSelect级联选择器的使用