1: URL handling and HTML parsing
package com.xiaoshuo.util;
import java.util.ArrayList;
import java.util.List;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.xiaoshuo.to.Chapter;
import com.xiaoshuo.to.UrlTO;
/**
* Handler class that fetches pages and parses their HTML with Jsoup
* @author lijunqing
*
*/
public class PaserUrlUtil {
private HttpClient httpClient=new DefaultHttpClient();
/**
* Fetch the HTML of a url as a string
* @param url
* @return
* @throws Exception
*/
public String getHtmlStr(String url) throws Exception {
HttpGet httpGet=new HttpGet(url);
HttpResponse response;
String htmlStr=null;
try {
response=httpClient.execute(httpGet);
HttpEntity entity=response.getEntity();
if(entity != null) {
// decode the response as GBK directly instead of re-encoding through ISO-8859-1; fixes the garbled-text problem
htmlStr=EntityUtils.toString(entity, "gbk");
}
} catch(Exception e) {
e.printStackTrace();
}
return htmlStr;
}
/**
* Fetch a url and parse it into a Jsoup Document
* @param url
* @return
* @throws Exception
*/
public Document getDocument(String url) throws Exception{
Thread.sleep(10000); // throttle: pause 10 seconds between requests to go easy on the site
return Jsoup.parse(getHtmlStr(url));
}
/**
* Collect the category url links
* @return
* @throws Exception
*/
public List<UrlTO> getCategoryUrls(String url) throws Exception{
Document doc = getDocument(url);
List<UrlTO> urlList = new ArrayList<UrlTO>();
Elements elements = doc.select(".navlist").select("li").select("a");
String categoryUrl= null;
UrlTO urlTO=null;
for(Element element:elements){
categoryUrl = element.attr("href");
urlTO = new UrlTO();
urlTO.setDeptValue(1);
urlTO.setUrl(categoryUrl);
urlList.add(urlTO);
}
return urlList;
}
/**
* Collect all book-listing page urls under a category url
* @param categoryUrl
* @return
* @throws Exception
*/
public List<UrlTO> getBookUrls(String categoryUrl) throws Exception{
System.out.println("getBookUrls - processing a deptValue==1 url");
List<UrlTO> urlTOList = new ArrayList<UrlTO>();
UrlTO urlTO = null;
String nextUrl = getNextBookUrl(categoryUrl);
while(nextUrl != null && !nextUrl.trim().equals("")){
System.out.println("bookUrls--"+nextUrl);
// create a fresh UrlTO per page; reusing one object would leave every list entry pointing at the last url
urlTO = new UrlTO();
urlTO.setDeptValue(2);
urlTO.setUrl(nextUrl);
urlTOList.add(urlTO);
nextUrl = getNextBookUrl(nextUrl);
}
return urlTOList;
}
}
/**
* Get the next pagination link
* @param categoryUrl
* @return
* @throws Exception
*/
public String getNextBookUrl(String categoryUrl) throws Exception{
Document doc = getDocument(categoryUrl);
Elements elements = doc.select("#pagelink").select("strong + a");
if(elements.isEmpty()){ // Jsoup select never returns null; an empty result means there is no next page
return null;
}
return elements.first().attr("href");
}
/**
* Get the book detail urls on one listing page
* @param categoryUrl
* @return
* @throws Exception
*/
public List<UrlTO> getDetailUrlList(String categoryUrl) throws Exception{
Document doc = getDocument(categoryUrl);
Elements elements = doc.select(".grid").select("tr");
List<UrlTO> urlTOList = new ArrayList<UrlTO>();
UrlTO urlTO = null;
for(Element element:elements){
// the href sits on the anchor inside the cell, not on the td itself; skip rows without a link (e.g. the header row)
Element link = element.select("td a").first();
if(link == null)
continue;
urlTO = new UrlTO(); // fresh object per row, otherwise all entries share the last url
urlTO.setDeptValue(3);
urlTO.setUrl(link.attr("href"));
urlTOList.add(urlTO);
}
return urlTOList;
}
/**
* Get the "start reading" url from a book detail page
* @param detailUrl
* @return
* @throws Exception
*/
public UrlTO getToReadUrl(String detailUrl) throws Exception{
Document doc = getDocument(detailUrl);
UrlTO urlTO = new UrlTO();
String toReadUrl=doc.select("#bt_1").select("a").first().attr("href");
urlTO.setDeptValue(4);
urlTO.setUrl(toReadUrl);
return urlTO;
}
/**
* Get the chapter urls of a book
* @param detailUrl
* @return
* @throws Exception
*/
public List<UrlTO> getChapterList(String detailUrl) throws Exception {
Document doc= getDocument(detailUrl);
Elements elements=doc.select(".list").select("dd").select("a");
List<UrlTO> urlList=new ArrayList<UrlTO>();
UrlTO urlTO = null;
String chapterUrl= null;
for(Element element: elements) {
chapterUrl = detailUrl + element.attr("href");
urlTO = new UrlTO(); // fresh object per chapter
urlTO.setDeptValue(5);
urlTO.setUrl(chapterUrl);
urlList.add(urlTO); // without this add the method always returned an empty list
}
return urlList;
}
/**
* Parse a chapter page into a Chapter object
* @param chapterUrl
* @return
* @throws Exception
*/
public Chapter getChapter(String chapterUrl) throws Exception {
Document doc=getDocument(chapterUrl);
Chapter chapter=new Chapter();
String name=doc.select("h1").text();
String content=doc.select(".width").text();
chapter.setName(name);
chapter.setContent(content);
return chapter;
}
}
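The Chapter transfer object referenced above is not listed in the post; a minimal sketch, assuming it holds only the two fields that getChapter actually sets, could look like this:
package com.xiaoshuo.to;
/**
* Chapter transfer object (sketch; only the fields used by getChapter)
*/
public class Chapter {
private String name; // chapter title
private String content; // chapter body text
public String getName() {
return name;
}
public void setName(String name) {
this.name=name;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content=content;
}
}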
2: URL entity class
package com.xiaoshuo.to;
/**
* Transfer object that stores a url together with its crawl depth
* @author lijunqing
*
*/
public class UrlTO {
private Integer deptValue; // crawl depth of this url in the site hierarchy
private String url;
public Integer getDeptValue() {
return deptValue;
}
public void setDeptValue(Integer deptValue) {
this.deptValue=deptValue;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url=url;
}
public String toString(){
return "dept="+deptValue+"--url--"+url;
}
}
3: Queue class
package com.xiaoshuo.url;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;
import com.xiaoshuo.to.UrlTO;
/**
* Tracks visited and not-yet-visited urls
* @author lijunqing
*/
public class LinkQueue {
// set of urls that have already been visited
private static Set<String> visitedUrl=new HashSet<String>();
// queue of urls that have not been visited yet
private static Queue<UrlTO> unVisitedUrl=new LinkedList<UrlTO>();
public static Queue<UrlTO> getUnVisitedUrl() {
return unVisitedUrl;
}
public static void removeVisitedUrl(String url) {
visitedUrl.remove(url);
}
public static UrlTO unVisitedPoll() {
return unVisitedUrl.poll();
}
public static void addVisitedUrl(String url){
System.out.println("visited url--"+url);
visitedUrl.add(url);
}
public static void addUnVisitedUrl(UrlTO url) {
// note: duplicates are only checked against visited urls, so a url can still be queued twice before it is visited
if(url != null && !url.getUrl().trim().equals("") && !visitedUrl.contains(url.getUrl())){
System.out.println("adding new url to the queue--"+url.getUrl());
unVisitedUrl.offer(url);
}
}
public static Integer getVisitedUrlNum() {
return visitedUrl.size();
}
public static boolean unVisitedUrlEmpty() {
return unVisitedUrl.isEmpty();
}
}
4: Crawler class
package com.xiaoshuo.service;
import java.util.ArrayList;
import java.util.List;
import com.xiaoshuo.to.UrlTO;
import com.xiaoshuo.url.LinkQueue;
import com.xiaoshuo.util.PaserUrlUtil;
/**
* Breadth-first crawler
* @author lijunqing
*
*/
public class Crawler {
PaserUrlUtil paseUrlUtil = new PaserUrlUtil();
/**
* Seed the queue with the start url
* @param url
*/
public void initCrawlerBySeed(String url){
UrlTO urlTO = new UrlTO();
urlTO.setDeptValue(0);
urlTO.setUrl(url);
LinkQueue.addUnVisitedUrl(urlTO);
System.out.println("UrlTO-----"+urlTO);
}
/**
* Breadth-first traversal of the site
* @throws Exception
*/
public void crawlerByBFS() throws Exception{
// seed url
String url = "http://www.shuoshuo520.com/";
// enqueue the seed
initCrawlerBySeed(url);
System.out.println("seed-----"+url);
while(!LinkQueue.unVisitedUrlEmpty()){
UrlTO visitUrl = LinkQueue.unVisitedPoll();
if(visitUrl == null)
continue;
List<UrlTO> unVisitUrlList = null;
Integer deptValue = visitUrl.getDeptValue();
String nextUrl = visitUrl.getUrl();
// mark the url as visited
LinkQueue.addVisitedUrl(nextUrl);
System.out.println("processing url--deptValue--"+deptValue+"--url--"+nextUrl);
if(deptValue == 0){
unVisitUrlList = paseUrlUtil.getCategoryUrls(nextUrl);
}else if(deptValue == 1){
unVisitUrlList = paseUrlUtil.getBookUrls(nextUrl);
}else if(deptValue == 2){
unVisitUrlList = paseUrlUtil.getDetailUrlList(nextUrl);
}else if(deptValue == 3){
unVisitUrlList = new ArrayList<UrlTO>();
unVisitUrlList.add(paseUrlUtil.getToReadUrl(nextUrl));
}else if(deptValue == 4){
unVisitUrlList = paseUrlUtil.getChapterList(nextUrl);
}else if(deptValue == 5){
// last level: fetch the chapter content here, e.g. paseUrlUtil.getChapter(nextUrl), and persist it
}
// the chapter level produces no new urls; iterating a null list would throw a NullPointerException
if(unVisitUrlList == null)
continue;
for(UrlTO urlTO: unVisitUrlList){
LinkQueue.addUnVisitedUrl(urlTO);
}
}
}
}
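A minimal entry point for running the crawler (not part of the original post) could be as simple as:
package com.xiaoshuo.service;
public class Main {
public static void main(String[] args) throws Exception {
// runs until LinkQueue has no unvisited urls left
new Crawler().crawlerByBFS();
}
}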
5: The underlying principle is much the same for any site, but a crawler has to be tailored to its target. My intent here is to fetch the site's data and insert it straight into a database, then build an index on it, which is why the result of each page is wrapped in an object that can be inserted into the database; a sketch of that follows.
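Assuming a MySQL table chapter(name, content) and local connection settings that are not part of the original project, a Chapter could be persisted with plain JDBC like this:
package com.xiaoshuo.dao;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import com.xiaoshuo.to.Chapter;
public class ChapterDao {
// hypothetical connection settings; replace with your own database
private static final String JDBC_URL="jdbc:mysql://localhost:3306/xiaoshuo?useUnicode=true&characterEncoding=utf8";
public void save(Chapter chapter) throws Exception {
Connection conn=DriverManager.getConnection(JDBC_URL, "root", "password");
try {
// assumes a table: CREATE TABLE chapter(name VARCHAR(255), content TEXT)
PreparedStatement ps=conn.prepareStatement("INSERT INTO chapter(name, content) VALUES(?, ?)");
ps.setString(1, chapter.getName());
ps.setString(2, chapter.getContent());
ps.executeUpdate();
ps.close();
} finally {
conn.close();
}
}
}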
6: The HTML parsing could also be done with regular expressions, and all of the parsing methods above could be collapsed into a single method that takes its expressions or parameters from a configuration file, so the same crawler can be pointed at other sites' data; a sketch of that follows.
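In this sketch of the configuration-driven approach, the file name and rule keys are made up for illustration: a single extraction method compiles whatever regular expression the properties file supplies for the current site.
package com.xiaoshuo.util;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class RegexExtractor {
private Properties rules=new Properties();
public RegexExtractor(String configFile) throws Exception {
// e.g. a site-rules.properties file with lines like: chapter.url=<a href="(/read/\\d+\\.html)">
rules.load(new FileInputStream(configFile));
}
/**
* Return the first capture group of the configured pattern for every match in the html
*/
public List<String> extract(String html, String ruleKey) {
List<String> results=new ArrayList<String>();
Pattern pattern=Pattern.compile(rules.getProperty(ruleKey));
Matcher matcher=pattern.matcher(html);
while(matcher.find()) {
results.add(matcher.group(1));
}
return results;
}
}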
Copyright notice: this is an original article by mingluoxuan, released under the CC 4.0 BY-SA license. Please include a link to the original source and this notice when reposting.