1: URL handling and HTML parsing
package com.xiaoshuo.util;
import java.util.ArrayList;
import java.util.List;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.xiaoshuo.to.Chapter;
import com.xiaoshuo.to.UrlTO;
/**
* Handler class that fetches pages and parses their HTML with Jsoup
* @author lijunqing
*
*/
public class PaserUrlUtil {
private HttpClient httpClient=new DefaultHttpClient();
/**
* Fetch the HTML of a url as a string
* @param url
* @return
* @throws Exception
*/
public String getHtmlStr(String url) throws Exception {
HttpGet httpGet=new HttpGet(url);
HttpResponse response;
String htmlStr=null;
try {
response=httpClient.execute(httpGet);
HttpEntity entity=response.getEntity();
if(entity != null) {
// decode the response as GBK directly instead of re-encoding through ISO-8859-1; fixes the garbled-text problem
htmlStr=EntityUtils.toString(entity, "gbk");
}
} catch(Exception e) {
e.printStackTrace();
}
return htmlStr;
}
/**
* Fetch a url and parse it into a Jsoup Document
* @param url
* @return
* @throws Exception
*/
public Document getDocument(String url) throws Exception{
Thread.sleep(10000); // throttle: pause 10 seconds between requests to go easy on the site
return Jsoup.parse(getHtmlStr(url));
}
/**
* Collect the category url links
* @return
* @throws Exception
*/
public List<UrlTO> getCategoryUrls(String url) throws Exception{
Document doc = getDocument(url);
List<UrlTO> urlList = new ArrayList<UrlTO>();
Elements elements = doc.select(".navlist").select("li").select("a");
String categoryUrl= null;
UrlTO urlTO=null;
for(Element element:elements){
categoryUrl = element.attr("href");
urlTO = new UrlTO();
urlTO.setDeptValue(1);
urlTO.setUrl(categoryUrl);
urlList.add(urlTO);
}
return urlList;
}
/**
* Collect all book-listing page urls under a category url
* @param categoryUrl
* @return
* @throws Exception
*/
public List<UrlTO> getBookUrls(String categoryUrl) throws Exception{
System.out.println("getBookUrls - processing a deptValue==1 url");
List<UrlTO> urlTOList = new ArrayList<UrlTO>();
UrlTO urlTO = null;
String nextUrl = getNextBookUrl(categoryUrl);
while(nextUrl != null && !nextUrl.trim().equals("")){
System.out.println("bookUrls--"+nextUrl);
// create a fresh UrlTO per page; reusing one object would leave every list entry pointing at the last url
urlTO = new UrlTO();
urlTO.setDeptValue(2);
urlTO.setUrl(nextUrl);
urlTOList.add(urlTO);
nextUrl = getNextBookUrl(nextUrl);
}
return urlTOList;
}
}
/**
* Get the next pagination link
* @param categoryUrl
* @return
* @throws Exception
*/
public String getNextBookUrl(String categoryUrl) throws Exception{
Document doc = getDocument(categoryUrl);
Elements elements = doc.select("#pagelink").select("strong + a");
if(elements.isEmpty()){ // Jsoup select never returns null; an empty result means there is no next page
return null;
}
return elements.first().attr("href");
}
/**
* Get the book detail urls on one listing page
* @param categoryUrl
* @return
* @throws Exception
*/
public List<UrlTO> getDetailUrlList(String categoryUrl) throws Exception{
Document doc = getDocument(categoryUrl);
Elements elements = doc.select(".grid").select("tr");
List<UrlTO> urlTOList = new ArrayList<UrlTO>();
UrlTO urlTO = null;
for(Element element:elements){
// the href sits on the anchor inside the cell, not on the td itself; skip rows without a link (e.g. the header row)
Element link = element.select("td a").first();
if(link == null)
continue;
urlTO = new UrlTO(); // fresh object per row, otherwise all entries share the last url
urlTO.setDeptValue(3);
urlTO.setUrl(link.attr("href"));
urlTOList.add(urlTO);
}
return urlTOList;
}
/**
* Get the "start reading" url from a book detail page
* @param detailUrl
* @return
* @throws Exception
*/
public UrlTO getToReadUrl(String detailUrl) throws Exception{
Document doc = getDocument(detailUrl);
UrlTO urlTO = new UrlTO();
String toReadUrl=doc.select("#bt_1").select("a").first().attr("href");
urlTO.setDeptValue(4);
urlTO.setUrl(toReadUrl);
return urlTO;
}
/**
* Get the chapter urls of a book
* @param detailUrl
* @return
* @throws Exception
*/
public List<UrlTO> getChapterList(String detailUrl) throws Exception {
Document doc= getDocument(detailUrl);
Elements elements=doc.select(".list").select("dd").select("a");
List<UrlTO> urlList=new ArrayList<UrlTO>();
UrlTO urlTO = null;
String chapterUrl= null;
for(Element element: elements) {
chapterUrl = detailUrl + element.attr("href");
urlTO = new UrlTO(); // fresh object per chapter
urlTO.setDeptValue(5);
urlTO.setUrl(chapterUrl);
urlList.add(urlTO); // without this add the method always returned an empty list
}
return urlList;
}
/**
* Parse a chapter page into a Chapter object
* @param chapterUrl
* @return
* @throws Exception
*/
public Chapter getChapter(String chapterUrl) throws Exception {
Document doc=getDocument(chapterUrl);
Chapter chapter=new Chapter();
String name=doc.select("h1").text();
String content=doc.select(".width").text();
chapter.setName(name);
chapter.setContent(content);
return chapter;
}
}
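The Chapter transfer object referenced above is not listed in the post; a minimal sketch, assuming it holds only the two fields that getChapter actually sets, could look like this:
package com.xiaoshuo.to;
/**
* Chapter transfer object (sketch; only the fields used by getChapter)
*/
public class Chapter {
private String name; // chapter title
private String content; // chapter body text
public String getName() {
return name;
}
public void setName(String name) {
this.name=name;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content=content;
}
}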
2: URL entity class
package com.xiaoshuo.to;
/**
* Transfer object that stores a url together with its crawl depth
* @author lijunqing
*
*/
public class UrlTO {
private Integer deptValue; // crawl depth of this url in the site hierarchy
private String url;
public Integer getDeptValue() {
return deptValue;
}
public void setDeptValue(Integer deptValue) {
this.deptValue=deptValue;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url=url;
}
public String toString(){
return "dept="+deptValue+"--url--"+url;
}
}
3: Queue class
package com.xiaoshuo.url;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;
import com.xiaoshuo.to.UrlTO;
/**
* Tracks visited and not-yet-visited urls
* @author lijunqing
*/
public class LinkQueue {
// set of urls that have already been visited
private static Set<String> visitedUrl=new HashSet<String>();
// queue of urls that have not been visited yet
private static Queue<UrlTO> unVisitedUrl=new LinkedList<UrlTO>();
public static Queue<UrlTO> getUnVisitedUrl() {
return unVisitedUrl;
}
public static void removeVisitedUrl(String url) {
visitedUrl.remove(url);
}
public static UrlTO unVisitedPoll() {
return unVisitedUrl.poll();
}
public static void addVisitedUrl(String url){
System.out.println("visited url--"+url);
visitedUrl.add(url);
}
public static void addUnVisitedUrl(UrlTO url) {
// note: duplicates are only checked against visited urls, so a url can still be queued twice before it is visited
if(url != null && !url.getUrl().trim().equals("") && !visitedUrl.contains(url.getUrl())){
System.out.println("adding new url to the queue--"+url.getUrl());
unVisitedUrl.offer(url);
}
}
public static Integer getVisitedUrlNum() {
return visitedUrl.size();
}
public static boolean unVisitedUrlEmpty() {
return unVisitedUrl.isEmpty();
}
}
4: Crawler class
package com.xiaoshuo.service;
import java.util.ArrayList;
import java.util.List;
import com.xiaoshuo.to.UrlTO;
import com.xiaoshuo.url.LinkQueue;
import com.xiaoshuo.util.PaserUrlUtil;
/**
* Breadth-first crawler
* @author lijunqing
*
*/
public class Crawler {
PaserUrlUtil paseUrlUtil = new PaserUrlUtil();
/**
* Seed the queue with the start url
* @param url
*/
public void initCrawlerBySeed(String url){
UrlTO urlTO = new UrlTO();
urlTO.setDeptValue(0);
urlTO.setUrl(url);
LinkQueue.addUnVisitedUrl(urlTO);
System.out.println("UrlTO-----"+urlTO);
}
/**
* Breadth-first traversal of the site
* @throws Exception
*/
public void crawlerByBFS() throws Exception{
// seed url
String url = "http://www.shuoshuo520.com/";
// enqueue the seed
initCrawlerBySeed(url);
System.out.println("seed-----"+url);
while(!LinkQueue.unVisitedUrlEmpty()){
UrlTO visitUrl = LinkQueue.unVisitedPoll();
if(visitUrl == null)
continue;
List<UrlTO> unVisitUrlList = null;
Integer deptValue = visitUrl.getDeptValue();
String nextUrl = visitUrl.getUrl();
// mark the url as visited
LinkQueue.addVisitedUrl(nextUrl);
System.out.println("processing url--deptValue--"+deptValue+"--url--"+nextUrl);
if(deptValue == 0){
unVisitUrlList = paseUrlUtil.getCategoryUrls(nextUrl);
}else if(deptValue == 1){
unVisitUrlList = paseUrlUtil.getBookUrls(nextUrl);
}else if(deptValue == 2){
unVisitUrlList = paseUrlUtil.getDetailUrlList(nextUrl);
}else if(deptValue == 3){
unVisitUrlList = new ArrayList<UrlTO>();
unVisitUrlList.add(paseUrlUtil.getToReadUrl(nextUrl));
}else if(deptValue == 4){
unVisitUrlList = paseUrlUtil.getChapterList(nextUrl);
}else if(deptValue == 5){
// last level: fetch the chapter content here, e.g. paseUrlUtil.getChapter(nextUrl), and persist it
}
// the chapter level produces no new urls; iterating a null list would throw a NullPointerException
if(unVisitUrlList == null)
continue;
for(UrlTO urlTO: unVisitUrlList){
LinkQueue.addUnVisitedUrl(urlTO);
}
}
}
}
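A minimal entry point for running the crawler (not part of the original post) could be as simple as:
package com.xiaoshuo.service;
public class Main {
public static void main(String[] args) throws Exception {
// runs until LinkQueue has no unvisited urls left
new Crawler().crawlerByBFS();
}
}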
5: The underlying principle is much the same for any site, but a crawler has to be tailored to its target. My intent here is to fetch the site's data and insert it straight into a database, then build an index on it, which is why the result of each page is wrapped in an object that can be inserted into the database; a sketch of that follows.
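Assuming a MySQL table chapter(name, content) and local connection settings that are not part of the original project, a Chapter could be persisted with plain JDBC like this:
package com.xiaoshuo.dao;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import com.xiaoshuo.to.Chapter;
public class ChapterDao {
// hypothetical connection settings; replace with your own database
private static final String JDBC_URL="jdbc:mysql://localhost:3306/xiaoshuo?useUnicode=true&characterEncoding=utf8";
public void save(Chapter chapter) throws Exception {
Connection conn=DriverManager.getConnection(JDBC_URL, "root", "password");
try {
// assumes a table: CREATE TABLE chapter(name VARCHAR(255), content TEXT)
PreparedStatement ps=conn.prepareStatement("INSERT INTO chapter(name, content) VALUES(?, ?)");
ps.setString(1, chapter.getName());
ps.setString(2, chapter.getContent());
ps.executeUpdate();
ps.close();
} finally {
conn.close();
}
}
}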
6: The HTML parsing could also be done with regular expressions, and all of the parsing methods above could be collapsed into a single method that takes its expressions or parameters from a configuration file, so the same crawler can be pointed at other sites' data; a sketch of that follows.
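In this sketch of the configuration-driven approach, the file name and rule keys are made up for illustration: a single extraction method compiles whatever regular expression the properties file supplies for the current site.
package com.xiaoshuo.util;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class RegexExtractor {
private Properties rules=new Properties();
public RegexExtractor(String configFile) throws Exception {
// e.g. a site-rules.properties file with lines like: chapter.url=<a href="(/read/\\d+\\.html)">
rules.load(new FileInputStream(configFile));
}
/**
* Return the first capture group of the configured pattern for every match in the html
*/
public List<String> extract(String html, String ruleKey) {
List<String> results=new ArrayList<String>();
Pattern pattern=Pattern.compile(rules.getProperty(ruleKey));
Matcher matcher=pattern.matcher(html);
while(matcher.find()) {
results.add(matcher.group(1));
}
return results;
}
}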
Copyright notice: this is an original article by mingluoxuan, released under the CC 4.0 BY-SA license. Please include a link to the original source and this notice when reposting.