日期:2014-05-18 浏览次数:21071 次
public class SimpleHtmlParser
{
/// <summary>
/// Scans an HTML string for table and div tags and builds an element tree from them.
/// </summary>
/// <param name="s">The HTML text to parse.</param>
/// <param name="elements">
/// Receives every element whose closing tag was found, in the order the closing tags
/// appear in <paramref name="s"/>. Elements that are never closed are not included.
/// </param>
/// <returns>
/// A synthetic root element; the parsed table/div elements hang below it through
/// their Parent/Children links.
/// </returns>
/// <remarks>
/// Closing tags are paired with the most recently opened element regardless of tag
/// name. Closing tags that have no open element to pair with are ignored, and
/// self-closing tags (ending in "/&gt;") are skipped because they have no content.
/// </remarks>
/// <exception cref="System.ArgumentNullException">
/// Thrown by the regular expression engine when <paramref name="s"/> is null.
/// </exception>
public static Element ParseHtml(string s, out List<Element> elements)
{
    elements = new List<Element>();
    Stack<Element> es = new Stack<Element>();

    // The whole pattern is a zero-width lookahead: the match itself is empty and the
    // tag text is captured in group 1 (table) or group 2 (div).
    const string pattern = @"(?=(</?table.*?>)|(</?div/?.*?>))";
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    var element = new Element();
    var lastElement = element;

    // Because matches are zero-width, tag-like text inside an already accepted tag
    // (for example inside an attribute value) is matched again. Remember where the
    // last accepted tag ended so such overlapping matches can be dropped.
    int consumedEnd = 0;

    // The static overload uses the framework's regex cache instead of building a
    // new Regex on every call.
    foreach (Match match in Regex.Matches(s, pattern, options))
    {
        Group tableGroup = match.Groups[1];
        Group divGroup = match.Groups[2];
        Group tag = tableGroup.Length > 0 ? tableGroup : divGroup;
        if (tag.Length <= 0)
        {
            continue;
        }
        if (tag.Index < consumedEnd)
        {
            continue;
        }

        int wordindex = tag.Index;
        int wordlength = tag.Length;
        string word = tag.Value;
        consumedEnd = wordindex + wordlength;

        // The pattern guarantees the text starts with '<' and ends with '>', so the
        // second character marks a closing tag and the one before '>' a self-closing tag.
        bool isEnd = word[1] == '/';
        bool isSelfClosing = !isEnd && word[wordlength - 2] == '/';
        if (isSelfClosing)
        {
            continue;
        }

        if (!isEnd)
        {
            // Opening tag: the capture group that matched tells us the element kind,
            // independent of letter case or attribute contents.
            Element ee;
            if (tableGroup.Length > 0)
            {
                ee = new TableElement();
            }
            else
            {
                ee = new DivElement();
            }
            ee.StartTagIndex = wordindex;
            ee.StartTagLength = wordlength;
            ee.BegTag = word;

            // Attach to the innermost element that is still open.
            ee.Parent = lastElement;
            lastElement.Children.Add(ee);
            lastElement = ee;

            es.Push(ee);
        }
        else
        {
            // Closing tag with nothing open: ignore it instead of failing on Pop().
            if (es.Count == 0)
            {
                continue;
            }

            var closed = es.Pop();
            closed.EndTag = word;
            closed.EndIndex = wordindex;
            closed.EndTagLength = wordlength;
            lastElement = closed.Parent;

            int contentStart = closed.StartTagIndex + closed.StartTagLength;
            closed.OuterHtml = s.Substring(closed.StartTagIndex, (closed.EndIndex - closed.StartTagIndex) + closed.EndTagLength);
            closed.InnerHtml = s.Substring(contentStart, closed.EndIndex - contentStart);
            elements.Add(closed);
        }
    }
    return element;
}
//去除代码中无用的标签
public static string ReplaceFontSpan(string s)
{
Regex r = new Regex("<head>.*?</head>");
s = r.Replace(s, "");
r = new Regex("</?font.*?>");
s = r.Replace(s, "");
r = new Regex("</?span.*?>");
s = r.Replace(s, "");
r = new Regex("</?a