// Date: 2014-05-17  Views: 20900
package com.cs.parser.util;
import org.htmlparser.Node;
/**
 * Mutable holder that groups an htmlparser {@link Node}, a
 * {@link StringBuffer} of text and an integer number.
 *
 * <p>All three properties start at their Java defaults ({@code null},
 * {@code null} and {@code 0}) until a setter is called.
 */
public class PageContent {

    private Node node;
    private int number;
    private StringBuffer textBuffer;

    /**
     * @return the stored node, or {@code null} if none has been set
     */
    public Node getNode() {
        return this.node;
    }

    /**
     * @param node the node to store
     */
    public void setNode(Node node) {
        this.node = node;
    }

    /**
     * @return the stored number ({@code 0} until set)
     */
    public int getNumber() {
        return this.number;
    }

    /**
     * @param number the number to store
     */
    public void setNumber(int number) {
        this.number = number;
    }

    /**
     * @return the stored text buffer (the same instance that was passed to
     *         the setter, not a copy), or {@code null} if none has been set
     */
    public StringBuffer getTextBuffer() {
        return this.textBuffer;
    }

    /**
     * @param textBuffer the text buffer to store; kept by reference
     */
    public void setTextBuffer(StringBuffer textBuffer) {
        this.textBuffer = textBuffer;
    }
}
package com.cs.parser.util;
/**
 * Mutable bean carrying five independent integer counters:
 * {@code trnum}, {@code tdnum}, {@code linknum}, {@code textnum} and
 * {@code scriptnum}.
 *
 * <p>NOTE(review): the names suggest per-table counts of rows, cells, links,
 * text nodes and scripts; confirm against the code that fills this bean.
 * Every counter is {@code 0} until its setter is called.
 */
public class TableValid {

    private int trnum;
    private int tdnum;
    private int linknum;
    private int textnum;
    private int scriptnum;

    /** @return the current {@code trnum} value */
    public int getTrnum() {
        return this.trnum;
    }

    /** @param trnum new {@code trnum} value */
    public void setTrnum(int trnum) {
        this.trnum = trnum;
    }

    /** @return the current {@code tdnum} value */
    public int getTdnum() {
        return this.tdnum;
    }

    /** @param tdnum new {@code tdnum} value */
    public void setTdnum(int tdnum) {
        this.tdnum = tdnum;
    }

    /** @return the current {@code linknum} value */
    public int getLinknum() {
        return this.linknum;
    }

    /** @param linknum new {@code linknum} value */
    public void setLinknum(int linknum) {
        this.linknum = linknum;
    }

    /** @return the current {@code textnum} value */
    public int getTextnum() {
        return this.textnum;
    }

    /** @param textnum new {@code textnum} value */
    public void setTextnum(int textnum) {
        this.textnum = textnum;
    }

    /** @return the current {@code scriptnum} value */
    public int getScriptnum() {
        return this.scriptnum;
    }

    /** @param scriptnum new {@code scriptnum} value */
    public void setScriptnum(int scriptnum) {
        this.scriptnum = scriptnum;
    }
}
package com.cs.parser.util;
/**
 * Mutable bean pairing an integer {@code tdNum} with a boolean
 * {@code valid} flag.
 *
 * <p>Both fields are package-private, so code in the same package can read
 * and write them directly as well as through the accessors below.
 */
public class TableColumnValid {

    /** Integer value exposed through {@link #getTdNum()}; {@code 0} until set. */
    int tdNum;

    /** Flag exposed through {@link #isValid()}; {@code false} until set. */
    boolean valid;

    /** @return the current {@code valid} flag */
    public boolean isValid() {
        return this.valid;
    }

    /** @param valid new value for the flag */
    public void setValid(boolean valid) {
        this.valid = valid;
    }

    /** @return the current {@code tdNum} value */
    public int getTdNum() {
        return this.tdNum;
    }

    /** @param tdNum new {@code tdNum} value */
    public void setTdNum(int tdNum) {
        this.tdNum = tdNum;
    }
}
package com.cs;
/**
 * Contract for objects that expose three strings: a title, a content body
 * and a summary.
 */
public interface Parsable {

    /** @return the title string supplied by the implementation */
    String getTitle();

    /** @return the content string supplied by the implementation */
    String getContent();

    /** @return the summary string supplied by the implementation */
    String getSummary();
}
package com.cs;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.ParagraphTag;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.tags.SelectTag;
import org.htmlparser.tags.Span;
import org.htmlparser.tags.StyleTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableHeader;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import com.cs.parser.util.PageContent;
import com.cs.parser.util.TableColumnValid;
import com.cs.parser.util.TableValid;
public class EasyHtmlParser implements Parsable {
protected static final String lineSign = System.getProperty(
"line.separator");
protected static final int lineSign_size = lineSign.length();
private File file ;
private String content ;
private String summary ;
private String title ;
public static void main(String[] args) {
EasyHtmlParser eParser = new EasyHtmlParser(new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\www.htm")) ;
System.out.println("html content : "+ePar