日期:2014-05-17 浏览次数:20941 次
<dl id="search_773979">
<dt>
<a href="read-htm-tid-773979.html" target="_blank" class="tlink">12年的二建考试(机电)有新教材了吗?变化大吗?</a>
</dt>
<dd>
<div class="num">0条回复,1次浏览</div>
<div class="text"> 今年的新教材出来了吗?</div>
<div class="author"><cite>2012-09-09 12:09 -</cite>作者: <a href="u.php?uid=5454001">ficclaoshen</a>
- <a href="thread-htm-fid-667.html">二级建造师—交流专版</a></div>
</dd>
</dl>
<dl id="search_773978">
<dt>
<a href="read-htm-tid-773978.html" target="_blank" class="tlink">天天来报道</a>
</dt>
<dd>
<div class="num">1条回复,7次浏览</div>
<div class="text">天天来报道,学习交流</div>
<div class="author"><cite>2012-09-09 12:06 -</cite>作者: <a href="u.php?uid=9307651">z330843564</a>
- <a href="thread-htm-fid-377.html">233新手交流</a></div>
</dd>
</dl>
// Load the saved page from disk. When scraping a live page, use the returned page content directly instead.
string html = File.ReadAllText("html.txt", Encoding.Default);

// Literal markers found in the page markup; the text is cut at every occurrence of any of them.
string[] 分隔符 =
{
    "id=\"search_",
    "\">",
    "target=\"_blank\" class=\"tlink\">",
    "</a>",
    "<div class=\"num\">",
    "条回复,",
    "次浏览</div>",
    "thread-htm-fid-",
    ".html"
};

// Empty pieces are dropped, so the indices used below count only non-empty fragments.
string[] jieguo = html.Split(分隔符, System.StringSplitOptions.RemoveEmptyEntries);
int i = jieguo.Length;

// Append five fixed fragments to the output file, separated by newlines (no trailing newline).
// NOTE(review): the hard-coded indices depend on the exact layout of the input page; only one record is written.
File.AppendAllText(
    "jieguo.txt",
    string.Join(Environment.NewLine, jieguo[1], jieguo[19], jieguo[21], jieguo[22], jieguo[28]));
------解决方案--------------------
循环取值就可以了
// Read the saved page text (GB2312-encoded txt file).
string tempStr = File.ReadAllText(@"C:\Documents and Settings\Administrator\桌面\Test.txt", Encoding.GetEncoding("GB2312"));//read the txt file

// The pattern is built in three parts and matches one <dl id="search_NNN"> record per match.
// (?is) = case-insensitive + single-line mode. The four unnamed (['"]?) groups capture the optional
// quote character around an attribute value; .NET numbers unnamed groups before named ones, so
// \1..\4 refer to those quote captures in order of appearance.
//
// Part 1: <dl id="search_{id_num}"> ... <dt><a ...>{a_text}</a></dt>
// Fix: id_num used to be a lazy [^'"]+? followed by [^>]*?>, which let it stop after a single
// character (e.g. "7" instead of "773979"). It is now greedy, may not contain quotes, '>' or
// whitespace, and must be followed by the same (possibly empty) quote that opened the attribute.
string pattern = @"(?is)<dl[^>]*?id=(['""]?)search_(?<id_num>[^'"">\s]+)\1[^>]*?>\s*?<dt>\s*?<a[^>]*?>(?<a_text>[^<>]+?)</a>\s*?</dt>";
// Part 2: <dd><div class="num">{reply_count}...{read_count}...</div> — the first two digit runs inside the div.
pattern += @"\s*?<dd>\s*?<div[^>]*?class=(['""]?)num\2[^>]*?>[^<]*?(?<reply_count>\d+)[^<]*?(?<read_count>\d+)[^<]*?</div>";
// Part 3: inside <div class="author">, the first link whose href ends in {html_num}.htm or {html_num}.html.
pattern += @"[\s\S]*?<div[^>]*?class=(['""]?)author\3[^>]*?>[\s\S]*?-?\s*?<a[^>]*?href=(['""]?)[^'""]*?(?<html_num>\d+)\.html?\4";
foreach (Match m in Regex.Matches(tempStr, pattern))
{
//循环输出
string v1 = m.Groups["id_num"].Value;//7
string v2 = m.Groups["a_text"].Value;//12年的二建考试(机电)有新教材了吗?变化大吗?
string v3 = m.Groups["reply_count"].Value;//0
string v4 = m.Groups["read_count&qu