Java学习-057-Jsoup爬虫获取中国所有的三级行政区划数据(二),并生成数据库 SQL 脚本插入语句

废话不多说,直接上码,小主您稳着……

 package com.fanfengping.zeus.uitl;

 import com.alibaba.fastjson.JSONObject;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.testng.annotations.Test; import java.io.File;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map; @Slf4j
/**
 * Downloads the administrative-division table published at {@link #SOURCE_URL} and
 * turns every row that carries a six-digit region code into an SQL
 * {@code insert into region_code ...} statement, written to {@code initRegion.sql}
 * in the current working directory.
 */
public class JsoupGetRegionSql {

    /** Page whose {@code <tr>} rows are scanned for region codes and names. */
    private static final String SOURCE_URL =
            "http://www.mca.gov.cn/article/sj/xzqh/2019/201901-06/201904301706.html";

    /** Number of digits a region code must have for the level/parent derivation to be valid. */
    private static final int CODE_LENGTH = 6;

    /**
     * Fetches the page, extracts (code, name) pairs and writes one insert statement per region.
     *
     * <p>The page is downloaded <em>before</em> the output file is opened, so a failed download
     * does not truncate an existing script. The file is written as UTF-8 and the writer is
     * closed even when parsing or writing fails.
     *
     * @throws Exception if the page cannot be fetched or the file cannot be written
     */
    @Test
    public void getRegionSql () throws Exception {
        String fp = System.getProperty("user.dir") + File.separator + "initRegion.sql";
        File file = new File(fp);

        Document doc = Jsoup.connect(SOURCE_URL)
                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0 Win64 x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36")
                .header("Accept", "text/html,application/xhtml+xml,application/xmlq=0.9,image/webp,image/apng,*/*q=0.8,application/signed-exchangev=b3")
                .maxBodySize(0)
                .timeout(100000)
                .get();
        Elements trs = doc.select("tr");

        int count = 0;
        // newBufferedWriter creates the file or truncates an existing one, which replaces the
        // former delete/createNewFile/append sequence, and it targets the full path in fp
        // rather than a bare file name resolved against the working directory.
        try (java.io.Writer writer = java.nio.file.Files.newBufferedWriter(
                file.toPath(), java.nio.charset.StandardCharsets.UTF_8)) {
            for (Element tr : trs) {
                Elements tds = tr.select("td");
                if (tds.size() <= 3) {
                    continue;
                }
                String regionCode = tds.get(1).text();
                String regionArea = tds.get(2).text();
                // Header and spacer rows have no six-digit code in the second cell.
                if (!isRegionCode(regionCode)) {
                    continue;
                }
                writer.write(buildInsert(regionCode, regionArea));
                count++;
            }
        }

        System.out.println("总数量:" + count);
        System.out.println(fp);
    }

    /**
     * Reports whether {@code code} can be parsed as a decimal {@code int}.
     *
     * @param code the text to check; {@code null} yields {@code false}
     * @return {@code true} if {@link Integer#parseInt(String)} accepts the text
     */
    public boolean validCode(String code) {
        try {
            Integer.parseInt(code);
            return true;
        } catch (NumberFormatException e) {
            return false;
        }
    }

    /**
     * Reports whether {@code code} consists of exactly {@link #CODE_LENGTH} ASCII digits.
     * Stricter than {@link #validCode(String)}: it rejects short numbers and signed values,
     * which would otherwise break the {@code substring} calls used to derive the parent code.
     */
    private boolean isRegionCode(String code) {
        if (code == null || code.length() != CODE_LENGTH) {
            return false;
        }
        for (int i = 0; i < CODE_LENGTH; i++) {
            char c = code.charAt(i);
            if (c < '0' || c > '9') {
                return false;
            }
        }
        return true;
    }

    /**
     * Builds the insert statement (terminated by the platform line separator) for one region.
     *
     * <p>Level and parent are derived from the code's trailing zeros:
     * {@code ..0000} is level 1 with parent {@code 000000}; {@code ....00} is level 2 with
     * parent {@code XX0000}; anything else is level 3 with parent {@code XXXX00}.
     *
     * @param regionCode six-digit region code
     * @param regionArea region name; single quotes are doubled so the SQL literal stays valid
     */
    private String buildInsert(String regionCode, String regionArea) {
        int level;
        String parentCode;
        if (regionCode.endsWith("0000")) {
            level = 1;
            parentCode = "000000";
        } else if (regionCode.endsWith("00")) {
            level = 2;
            parentCode = regionCode.substring(0, 2) + "0000";
        } else {
            level = 3;
            parentCode = regionCode.substring(0, 4) + "00";
        }
        return String.format("insert into region_code (code, region, level, parent_code, dtime, note, ctime)" +
                " values (%s, '%s', %s, %s, '201903', '系统生成', NOW());" + System.lineSeparator(),
                regionCode, regionArea.replace("'", "''"), level, parentCode);
    }
}

  

  控制台输出如下所示:

  (此处原为控制台输出截图,图片未随正文保留。)

  

  数据库文件截图如下所示:

  (此处原为数据库文件截图,图片未随正文保留。)

  

  

上一篇:最近学习总结 Nodejs express 获取url参数,post参数的三种方式


下一篇:java学习笔记06--正则表达式