// 日期:2014-05-17 浏览次数:20723 次  (stray text scraped with the file — commented out; it was not valid Java and broke compilation)
package com.safetys.crawler.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import com.safetys.framework.exception.ApplicationAccessException;
/**
* 整合百度、谷歌搜索数据
* @author zhaozhi3758
* date:2011-04-19
*/
public class Crawler {
private final static String splitStr="zzc@cheng"; // separator token placed between link text and URL in result strings
private String encoding="gbk"; // character encoding used when parsing result pages
public String searchMode;// search mode: "keyword" = search by keyword, "specifyUrl" = crawl the given URL directly
public String baiduUrl; // Baidu search URL template; must contain ${keyword} (search term) and ${searchNum} (result count), e.g. "http://www.baidu.com/s?rn=${searchNum}&wd=${keyword}"
public String googleUrl; // Google search URL template; must contain ${keyword} and ${searchNum}, e.g. "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=${keyword}&num=${searchNum}&aq=f&aqi=&aql=&oq=&gs_rfai="
public String keyword; // search keyword substituted into the URL templates
public int searchNum = 0;// number of search results to request
public String specifyUrl; // explicit URL to crawl when searchMode is "specifyUrl"
/**
 * Crawls the Baidu search result page and extracts result links.
 *
 * Parses the page at {@code getBaiduUrl()}, collects every {@code <table>}
 * element that carries an {@code id} attribute (Baidu result blocks), and
 * delegates link extraction to {@link #getBaiduLink(String)}.
 *
 * @return list of "linkText + splitStr + url" entries; empty on parse failure
 */
public List<String> crawlerBaidu() {
    List<String> result = new ArrayList<String>();
    Parser myParser = new Parser();
    try {
        myParser.setURL(getBaiduUrl());
        // BUGFIX: was setEncoding(myParser.getEncoding()) — a no-op.
        // Use the configured page encoding ("gbk") instead.
        myParser.setEncoding(encoding);
    } catch (ParserException e1) {
        e1.printStackTrace();
        return result; // cannot parse without a valid URL/encoding
    }
    NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
    try {
        NodeList nodeList = myParser.parse(tableFilter);
        // BUGFIX: loop bound was "i <= nodeList.size()", which read one
        // element past the end on every call.
        for (int i = 0; i < nodeList.size(); i++) {
            if (nodeList.elementAt(i) instanceof TableTag) {
                TableTag tag = (TableTag) nodeList.elementAt(i);
                // Baidu marks each result block with an id attribute
                if (tag.getAttribute("id") != null) {
                    result.addAll(getBaiduLink(tag.getChildrenHTML()));
                }
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return result;
}
/**
 * Extracts qualifying hyperlinks from an HTML fragment of a Baidu result block.
 *
 * Keeps only absolute http(s) links that are not Baidu-internal and whose
 * anchor text is not the "百度快照" (Baidu cache) link.
 *
 * @param s HTML fragment taken from a result table's children
 * @return list of "linkText + splitStr + url" entries; empty on parse failure
 */
private List<String> getBaiduLink(String s) {
    List<String> result = new ArrayList<String>();
    Parser fragmentParser = Parser.createParser(s, encoding);
    try {
        // Collect every <a> tag in the fragment.
        NodeList links = fragmentParser.parse(new NodeClassFilter(LinkTag.class));
        if (links == null || links.size() == 0) {
            return result;
        }
        for (int idx = 0; idx < links.size(); idx++) {
            LinkTag anchor = (LinkTag) links.elementAt(idx);
            String href = anchor.extractLink();
            String text = anchor.getLinkText();
            // Skip cache links, Baidu-internal URLs, and non-http links.
            if (text.equals("百度快照") || href.contains("baidu") || !href.startsWith("http")) {
                continue;
            }
            System.out.println("baidu--->" + text + splitStr + href);
            result.add(text + splitStr + href);
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return result;
}
/**
* 抓取谷歌搜索结果页面的指定范围的链接
*/
private List<String> crawlerGoogle() {
String htmlstr = getUrlHtmlByHttpClient(getGoogleU