// Date: 2014-05-17  Views: 20817
package com.web.test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
/**
 * Demonstrates using HTMLParser to parse an HTML document and list every
 * hyperlink ({@code <a>} tag) found in it.
 *
 * @author YYmmiinngg
 */
public class ReadHTML2
{
    /** Page that is downloaded and scanned for links. */
    private static final String PAGE_URL = "http://www.boc.cn/finadata/lilv/";

    /**
     * Charset name used both to decode the downloaded bytes and to configure
     * the parser, so the two can never disagree.
     */
    private static final String CHARSET = "utf-8";

    /**
     * Downloads {@link #PAGE_URL}, extracts all link tags and prints each one
     * to standard output as {@code <link text> ==>> <link target>}.
     * Any failure (download or parse) is reported via a stack trace.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args)
    {
        try
        {
            // 1. Fetch the HTML of the web page as one string.
            //    (To parse a local file instead, read it through a
            //    FileInputStream in place of the URL stream.)
            String html = readPage(PAGE_URL);

            // 2. Hand the markup to HTMLParser.
            Parser parser = Parser.createParser(html, CHARSET);

            // 3. Keep only the nodes that are <a> tags.
            NodeList links = parser.extractAllNodesThatMatch(new NodeFilter()
            {
                // Filter callback: accept a node only if it is a link tag.
                public boolean accept(Node node)
                {
                    return node instanceof LinkTag;
                }
            });

            // 4. Print the text and target of every link found.
            for (int i = 0; i < links.size(); i++)
            {
                LinkTag link = (LinkTag) links.elementAt(i);
                System.out.print(link.getStringText() + " ==>> ");
                System.out.println(link.extractLink());
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Reads the whole document at the given URL into a string.
     *
     * @param pageUrl absolute URL of the document to download
     * @return the document text, decoded as {@link #CHARSET}, with every line
     *         terminated by {@code '\n'}
     * @throws IOException if the URL is malformed or the content cannot be read
     */
    private static String readPage(String pageUrl) throws IOException
    {
        // Decode explicitly instead of relying on the platform default charset.
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new URL(pageUrl).openStream(), CHARSET));
        try
        {
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null)
            {
                // readLine() strips the line terminator; put one back so that
                // markup wrapped across lines (e.g. "<a" / "href=...") does not
                // get fused into a single bogus token.
                content.append(line).append('\n');
            }
            return content.toString();
        }
        finally
        {
            // Closing the reader also closes the underlying URL stream.
            reader.close();
        }
    }
}