一、国家统计局网址
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/
二、引入jsoup依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.9.2</version>
</dependency>
三、爬虫程序
/**
 * Crawls the administrative-division pages under {@link #BASE_URL} with jsoup and turns the
 * province, city and county rows into {@link SysCitys} records.
 *
 * <p>Only three levels are collected (province "01", city "02", county "03"). A page that
 * cannot be downloaded or a table row that cannot be parsed is reported and skipped, so a
 * single bad page does not abort the whole crawl.
 *
 * @author Lock-玄清
 * @since 2022/9/15
 */
public class JavaJsoupUtil {

    /** Common URL prefix; page links found while crawling are appended to it. */
    private static final String BASE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/";

    /** Timeout handed to jsoup, in milliseconds (same value as the original 200 * 2000). */
    private static final int TIMEOUT_MILLIS = 200 * 2000;

    /** Parent code stored on province records, which have no parent of their own. */
    private static final String ROOT_PARENT_CODE = "0000";

    /** Values written to the {@code type} property of the generated records. */
    private static final String TYPE_PROVINCE = "01";
    private static final String TYPE_CITY = "02";
    private static final String TYPE_COUNTY = "03";

    /** County rows with this name are placeholders and are not stored. */
    private static final String MUNICIPAL_DISTRICT = "市辖区";

    /**
     * Downloads and parses one page.
     *
     * @param url absolute address of the page
     * @return the parsed document, or {@code null} when the download fails
     * @throws IllegalArgumentException if {@code url} is null or empty
     */
    private static Document connect(String url) {
        if (url == null || url.isEmpty()) {
            throw new IllegalArgumentException("无效的url");
        }
        try {
            return Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();
        } catch (IOException e) {
            // Any I/O failure ends up here (not only a missing page), so the cause is
            // appended to the message instead of being dropped.
            System.out.println(url + "地址不存在: " + e.getMessage());
            return null;
        }
    }

    /**
     * Collects every province together with the cities and counties below it.
     *
     * @return province records, each followed by the records of its cities and counties;
     *         empty when the index page cannot be loaded
     */
    public List<SysCitys> getProvinces() {
        List<SysCitys> sysAreas = new ArrayList<>();
        Document indexPage = connect(BASE_URL + "index.html");
        if (indexPage == null) {
            return sysAreas;
        }
        for (Element provinceRow : indexPage.select("tr.provincetr")) {
            for (Element link : provinceRow.select("a")) {
                String href = link.attr("href");
                if (href.isEmpty()) {
                    // Without a link there is neither a code nor a page to follow.
                    continue;
                }
                // The code is derived from the link itself: ".html" is replaced by "0000".
                String provinceCode = href.replace(".html", "0000");
                String name = link.text();
                sysAreas.add(returnCitys(provinceCode, name, ROOT_PARENT_CODE, TYPE_PROVINCE));
                System.err.println("++++++++++++++++++++++++++开始获取" + name + "下属市区行政区划信息++++++++++++++++++++++++");
                sysAreas.addAll(getCitys(BASE_URL + href, provinceCode));
            }
        }
        return sysAreas;
    }

    /**
     * Collects the cities listed on a province page and the counties below each of them.
     *
     * @param provinceUrl address of the province page
     * @param parentCode  code of the province, stored as the parent code of every city
     * @return city records, each followed by its county records; empty when the page
     *         cannot be loaded
     */
    public List<SysCitys> getCitys(String provinceUrl, String parentCode) {
        List<SysCitys> sysAreas = new ArrayList<>();
        Document provincePage = connect(provinceUrl);
        if (provincePage == null) {
            return sysAreas;
        }
        for (Element cityRow : provincePage.select("tr.citytr")) {
            String[] codeAndName = splitCodeAndName(cityRow, 4);
            if (codeAndName == null) {
                continue;
            }
            // City codes are the first four characters of the listed code padded with "00".
            String cityCode = codeAndName[0].substring(0, 4) + "00";
            String cityName = codeAndName[1];
            sysAreas.add(returnCitys(cityCode, cityName, parentCode, TYPE_CITY));

            String href = cityRow.select("a").attr("href");
            if (href.isEmpty()) {
                // No link on the row means there is no county page to follow.
                continue;
            }
            System.err.println("-------------------开始获取"+cityName+"下属区县行政区划信息-----------------------");
            sysAreas.addAll(getCountys(BASE_URL + href, cityCode));
        }
        return sysAreas;
    }

    /**
     * Collects the counties listed on a city page.
     *
     * @param cityUrl    address of the city page
     * @param parentCode code of the city, stored as the parent code of every county
     * @return county records, without the "市辖区" placeholder rows; empty when the page
     *         cannot be loaded
     */
    public List<SysCitys> getCountys(String cityUrl, String parentCode) {
        List<SysCitys> sysAreas = new ArrayList<>();
        Document cityPage = connect(cityUrl);
        if (cityPage == null) {
            return sysAreas;
        }
        for (Element countyRow : cityPage.select("tr.countytr")) {
            String[] codeAndName = splitCodeAndName(countyRow, 6);
            if (codeAndName == null || MUNICIPAL_DISTRICT.equals(codeAndName[1])) {
                continue;
            }
            String countyCode = codeAndName[0].substring(0, 6);
            sysAreas.add(returnCitys(countyCode, codeAndName[1], parentCode, TYPE_COUNTY));
        }
        return sysAreas;
    }

    /**
     * Splits the space-joined cell text of a table row into its parts.
     *
     * @param row           table row whose {@code td} cells are read
     * @param minCodeLength number of leading code characters the caller is going to keep
     * @return the parts (index 0 is used as the code, index 1 as the name), or {@code null}
     *         when the row has fewer than two parts or the code is too short
     */
    private static String[] splitCodeAndName(Element row, int minCodeLength) {
        String rowText = row.select("td").text();
        String[] parts = rowText.split(" ");
        if (parts.length < 2 || parts[0].length() < minCodeLength) {
            System.err.println("无法解析的行: " + rowText);
            return null;
        }
        return parts;
    }

    /**
     * Builds one record with creation and update time set to the current time.
     *
     * @param addrCode   code of the division
     * @param addrName   name of the division
     * @param fatherCode code of the parent division
     * @param type       level marker ("01", "02" or "03")
     * @return the populated record
     */
    private static SysCitys returnCitys(String addrCode, String addrName, String fatherCode, String type) {
        // One clock reading for both timestamps so they are equal, but two Date objects
        // because Date is mutable.
        long now = System.currentTimeMillis();
        SysCitys sysCitys = new SysCitys();
        sysCitys.setAddrCode(addrCode);
        sysCitys.setAddrName(addrName);
        sysCitys.setFatherCode(fatherCode);
        sysCitys.setType(type);
        sysCitys.setCreateTime(new Date(now));
        sysCitys.setUpdateTime(new Date(now));
        return sysCitys;
    }
}
三、单元测试
/**
 * Manual test that runs the crawler in {@link JavaJsoupUtil} and prints what it collected.
 *
 * @author Lock-玄清
 * @since 2022/9/15
 */
public class JavaJsoupTest {

    private final JavaJsoupUtil util = new JavaJsoupUtil();

    /**
     * Runs the full province-level crawl, prints the number of records and their JSON form,
     * and fails when nothing was collected.
     *
     * <p>To crawl the cities of a single province instead, call
     * {@code util.getCitys("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/50.html", "500000")}.
     */
    @Test
    public void cityTest() {
        List<SysCitys> sysAreas = util.getProvinces();
        System.out.println(sysAreas.size());
        System.err.println("爬虫相应数据为:" + JSONObject.toJSONString(sysAreas));
        // Checked after printing so the output is still available when the test fails.
        if (sysAreas.isEmpty()) {
            throw new AssertionError("crawler returned no administrative divisions");
        }
    }
}
四、附录
城市表结构及数据请点击下方链接下载
中国大陆地区省市区三级联动数据
省市区联动json数据下载:中国大陆地区省市区三级联动json数据
五、说明
此文章仅有省市区三级数据的爬取程序,后续会更新省市区街道村五级程序。
此文章是玄清本人在工作之余自己写的,不喜勿喷。
版权声明:本文为lock_xuanqing原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。