Java爬取国家统计局之统计用区划和城乡划分代码(省市区三级)

一、国家统计局网址
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/

二、引入jsoup依赖

   <!-- jsoup: the HTML fetching/parsing library used by the crawler class in this article -->
   <dependency>
      <groupId>org.jsoup</groupId>
       <artifactId>jsoup</artifactId>
       <version>1.9.2</version>
   </dependency>

三、爬虫程序


/**
 * Crawls the administrative-division code pages under the base URL below with
 * jsoup and collects province, city and county entries as {@code SysCitys} objects.
 *
 * <p>A page that cannot be fetched is reported on standard output and treated as
 * having no rows, so a single failing page does not abort the whole crawl.
 *
 * <p>Created 2022/9/15.
 *
 * @author Lock-玄清
 */
public class JavaJsoupUtil {

    /**
     * Common URL prefix; the {@code href} values found on the pages are appended to it.
     */
    private static final String BASE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/";

    /**
     * Timeout passed to jsoup for every request (value kept from the original code).
     */
    private static final int TIMEOUT_MILLIS = 200 * 2000;

    /**
     * Fetches and parses the page at the given address.
     *
     * @param url address of the page to fetch
     * @return the parsed document, or {@code null} when the request fails with an
     *         {@link IOException}; every caller must check for {@code null}
     * @throws IllegalArgumentException if {@code url} is {@code null} or empty
     */
    private static Document connect(String url) {
        if (url == null || url.isEmpty()) {
            throw new IllegalArgumentException("无效的url");
        }
        try {
            return Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();
        } catch (IOException e) {
            System.out.println(url+"地址不存在");
            return null;
        }
    }

    /**
     * Splits the joined cell text of a table row into its code and name parts.
     *
     * @param rowText       text of the row's cells, separated by single spaces
     * @param minCodeLength minimum number of characters the code part must have
     * @return the split parts (index 0 = code, index 1 = name), or {@code null}
     *         when the row has fewer than two parts or the code is too short
     */
    private static String[] splitCodeAndName(String rowText, int minCodeLength) {
        String[] parts = rowText.split(" ");
        if (parts.length < 2 || parts[0].length() < minCodeLength) {
            return null;
        }
        return parts;
    }

    /**
     * Collects every province together with its cities and counties.
     *
     * <p>Each province is added with father code {@code "0000"} and type
     * {@code "01"}, immediately followed by the entries returned by
     * {@link #getCitys(String, String)} for that province.
     *
     * @return all collected entries; empty when the index page cannot be fetched
     */
    public List<SysCitys> getProvinces() {
        List<SysCitys> sysAreas = new ArrayList<>();
        Document indexPage = connect(BASE_URL + "index.html");
        if (indexPage == null) {
            return sysAreas;
        }
        for (Element provinceRow : indexPage.select("tr.provincetr")) {
            for (Element provinceLink : provinceRow.select("a")) {
                String href = provinceLink.attr("href");
                if (href.isEmpty()) {
                    // Without an href there is neither a code nor a child page to follow.
                    continue;
                }
                // The page file name becomes the province code, e.g. "xx.html" -> "xx0000".
                String provinceCode = href.replace(".html", "0000");
                String name = provinceLink.text();
                sysAreas.add(returnCitys(provinceCode, name, "0000", "01"));
                System.err.println("++++++++++++++++++++++++++开始获取" + name + "下属市区行政区划信息++++++++++++++++++++++++");
                sysAreas.addAll(getCitys(BASE_URL + href, provinceCode));
            }
        }
        return sysAreas;
    }


    /**
     * Collects the cities listed on a province page, each followed by its counties.
     *
     * <p>The city code is the first four characters of the code shown in the row
     * plus {@code "00"}; cities are added with type {@code "02"}. Rows that do not
     * contain both a code and a name are skipped.
     *
     * @param provinceUrl address of the province page
     * @param parentCode  code stored as the father code of every city on the page
     * @return the collected entries; empty when the page cannot be fetched
     * @throws IllegalArgumentException if {@code provinceUrl} is {@code null} or empty
     */
    public List<SysCitys> getCitys(String provinceUrl,String parentCode){
        List<SysCitys> sysAreas = new ArrayList<>();
        Document provincePage = connect(provinceUrl);
        if (provincePage == null) {
            return sysAreas;
        }
        for (Element cityRow : provincePage.select("tr.citytr")) {
            String[] parts = splitCodeAndName(cityRow.select("td").text(), 4);
            if (parts == null) {
                continue;
            }
            String cityCode = parts[0].substring(0, 4) + "00";
            String cityName = parts[1];
            sysAreas.add(returnCitys(cityCode, cityName, parentCode, "02"));
            System.err.println("-------------------开始获取"+cityName+"下属区县行政区划信息-----------------------");
            String href = cityRow.select("a").attr("href");
            if (!href.isEmpty()) {
                // Only descend when the row actually links to a child page.
                sysAreas.addAll(getCountys(BASE_URL + href, cityCode));
            }
        }
        return sysAreas;
    }

    /**
     * Collects the counties listed on a city page.
     *
     * <p>The county code is the first six characters of the code shown in the row;
     * counties are added with type {@code "03"}. Rows whose name is exactly
     * {@code 市辖区} are left out, as are rows without both a code and a name.
     *
     * @param cityUrl    address of the city page
     * @param parentCode code stored as the father code of every county on the page
     * @return the collected entries; empty when the page cannot be fetched
     * @throws IllegalArgumentException if {@code cityUrl} is {@code null} or empty
     */
    public List<SysCitys> getCountys(String cityUrl,String parentCode){
        List<SysCitys> sysAreas = new ArrayList<>();
        Document cityPage = connect(cityUrl);
        if (cityPage == null) {
            return sysAreas;
        }
        for (Element countyRow : cityPage.select("tr.countytr")) {
            String[] parts = splitCodeAndName(countyRow.select("td").text(), 6);
            if (parts == null || "市辖区".equals(parts[1])) {
                continue;
            }
            sysAreas.add(returnCitys(parts[0].substring(0, 6), parts[1], parentCode, "03"));
        }
        return sysAreas;
    }

    /**
     * Builds a {@code SysCitys} entry from the given values.
     *
     * @param addrCode   code of the entry
     * @param addrName   name of the entry
     * @param fatherCode code of the parent entry
     * @param type       level marker ("01", "02" or "03" as passed by the callers in this class)
     * @return the populated entry, with creation and update time set to the same instant
     */
    private SysCitys returnCitys(String addrCode,String addrName,String fatherCode,String type){
        // Take the timestamp once so both fields agree; use a separate Date object
        // for each field because Date is mutable.
        Date now = new Date();
        SysCitys sysCitys = new SysCitys();
        sysCitys.setAddrCode(addrCode);
        sysCitys.setAddrName(addrName);
        sysCitys.setFatherCode(fatherCode);
        sysCitys.setType(type);
        sysCitys.setCreateTime(now);
        sysCitys.setUpdateTime(new Date(now.getTime()));
        return sysCitys;
    }

}

三、单元测试

/**
 * Manual check for {@code JavaJsoupUtil}: runs the full crawl and prints the result.
 *
 * <p>Created 2022/9/15.
 *
 * @author Lock-玄清
 */
public class JavaJsoupTest {

    JavaJsoupUtil util = new JavaJsoupUtil();

    /**
     * Crawls every province (with its cities and counties), then prints the number
     * of entries to standard output and the entries as JSON to standard error.
     */
    @Test
    public void cityTest(){
        // Province level: the crawl descends into cities and counties by itself.
        List<SysCitys> crawled = util.getProvinces();
        int total = crawled.size();
        System.out.println(total);
        String json = JSONObject.toJSONString(crawled);
        System.err.println("爬虫相应数据为:"+ json);
        // City level only: instead of getProvinces(), call util.getCitys(...) with a
        // single province page (for example ".../2021/50.html" and parent code
        // "500000") and print the result the same way.
    }
}

四、附录
城市表结构及数据请点击下方链接下载:
中国大陆地区省市区三级联动数据
省市区联动 JSON 数据下载:中国大陆地区省市区三级联动 JSON 数据

五、说明
此文章仅有省市区三级数据的爬取程序,后续会更新省市区街道村五级程序。
此文章是玄清本人在工作之余自己写的,不喜勿喷。


版权声明:本文为lock_xuanqing原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。