日期:2014-05-20 浏览次数:21090 次
/**
* <p>Title: </p>
*
* <p>Description: </p>
*
* <p>Copyright: Copyright (c) 2012</p>
*
* <p>Company: </p>
*
* @author not attributable
* @version 1.0
*/
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
import java.util.zip.*;
public class test {
//返回所有组的正则
public static ArrayDeque<String[]> regexAllGroups(String original, String regex) {
int total = 0;
String[] ary = null;
ArrayDeque Q = new ArrayDeque();
if (original == null || regex == null) {
return Q;
}
Pattern p = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
Matcher m = p.matcher(original);
while (m != null && m.find()) { //该正则在这里m.find卡死。。
total = m.groupCount();
if (total < 1) {
continue;
}
ary = new String[total];
for (int i = 1; i <= total; i++) {
ary[i - 1] = new String(m.group(i));
}
Q.add(ary);
}
m = null;
p = null;
return Q;
}
//获取网页源码
public static String getUrlHtml(String strURL) {
String body = null;
String contentEncoding = null;
URL _URL = null;
InputStream IN = null;
HttpURLConnection CONNECTION = null;
try {
_URL = new URL(strURL);
CONNECTION = (HttpURLConnection) _URL.openConnection();
CONNECTION.setConnectTimeout(3000);
CONNECTION.setReadTimeout(3000);
CONNECTION.setRequestProperty("Accept-Encoding", "gzip,deflate");
CONNECTION.setRequestProperty("Accept", "*/*");
CONNECTION.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)");
CONNECTION.setRequestProperty("Connection", "close");
CONNECTION.setRequestMethod("GET");
CONNECTION.setFollowRedirects(true);
CONNECTION.setUseCaches(false);
CONNECTION.setInstanceFollowRedirects(true);
/*判断是不是GZIP/DEFLATE压缩格式的网页*/
int type = 0;
contentEncoding = CONNECTION.getContentEncoding();
if (contentEncoding != null) {
contentEncoding = contentEncoding.toLowerCase();
if (contentEncoding.indexOf("gzip") != -1) {
type = 1;
}
if (contentEncoding.indexOf("deflate") != -1) {
type = 2;
}
}
switch (type) {
case 1:
IN = new GZIPInputStream(CONNECTION.getInputStream());
break;
case 2:
IN = new InflaterInputStream(CONNECTION.getInputStream());
break;
default:
IN = CONNECTION.getInputStream();
break;
}
byte[] b = null;
if (IN != null && (b = inputStreamToByte(IN)) != null) {
body = new String(b, "utf-8");
IN.close();
}
CONNECTION.disconnect();
b = null;
} catch (Exception e) {
try {
if (IN != null) {
IN.close();
}
if (CONNECTION != null) {
CONNECTION.disconnect();
}
} catch (Exception ex) {
}
body = null;
}
IN = null;
_URL = null;
CONNECTION = null;
return body;
}
public static byte[] inputStreamToByte(InputStream in) {
if (in == null) {
return null;
}
int ch;
byte[] b = null;
ByteArrayOutputStream stream = new ByteArrayOutputStream();
try {
while ((ch = in.read()) != -1) {
stream.write(ch);
}
b = stream.toByteArray();
stream.reset();
stream.close();
in.close();
} catch (Exception e) {
e.printStackTrace();
}
in = null;
stream = null;
return b;
}
public static void main(String[] args) {
//正则表达式
String regex = "<table cellpadding=\"0\" cellspacing=\"0\" class=\"result\" id=\"\\d+\".*><tr><td class=f><h3 class=\"t\"><a.*href=\"([\\s\\S]*?)\".*target=\"_blank\">([\\s\\S]*?)</a>([\\s\\S]*?)<br>";
//获取该网页地址的html源代码
String html = getUrlHtml("http://www.baidu.com/s?wd=%D2%F8%C1%AA%B4%F3%B0%AE%BF%A8&pn=0&rn=10&usm=1");
//分析结果,在这里出现CPU资源100%
ArrayDeque<String[]> Q = regexAllGroups(html, regex);
}
}