提取网页中的超链接(C#)

王朝c#·作者佚名  2006-01-09
宽屏版  字体: |||超大  

using System;

using System.Xml;

using System.Text;

using System.Net;

using System.IO;

using System.Collections;

using System.Text.RegularExpressions;

/// <summary>
/// Console utility that downloads a web page, extracts the absolute
/// http/https hyperlinks found in its HTML, and writes them to
/// "HyperLinks.xml" in the current directory.
/// </summary>
public class App
{
    // Matches absolute http/https URLs: scheme, dotted host name, optional path/query.
    // Compiled once and reused instead of being rebuilt on every call.
    private static readonly Regex LinkRegex = new Regex(
        @"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?",
        RegexOptions.IgnoreCase);

    // Captures a recognised top-level suffix (com, net, cn, org, gov), but only when it
    // ends the authority part of the URL: an optional scheme, then everything up to the
    // first '/', '?' or '#', with an optional ":port" after the suffix.
    private static readonly Regex DomainSuffixRegex = new Regex(
        @"^(?:[a-z][a-z0-9+.\-]*://)?[^/?#]*?\.(com|net|cn|org|gov)(?::\d*)?(?=[/?#]|$)",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Program entry point: prompts for a page address, downloads the page,
    /// extracts its hyperlinks and saves them as XML.
    /// </summary>
    public static void Main()
    {
        Console.Write("请输入一个网页地址:");

        // ReadLine returns null at end of input; NormalizeUrl maps null/blank to null.
        string strURL = NormalizeUrl(Console.ReadLine());
        if (strURL == null)
        {
            Console.WriteLine("未输入网页地址。");
            return;
        }

        Console.WriteLine("正在获取页面代码,请稍侯...");
        string strCode = GetPageSource(strURL);

        Console.WriteLine("正在提取超链接,请稍侯...");
        ArrayList alLinks = GetHyperLinks(strCode);

        Console.WriteLine("正在写入文件,请稍侯...");
        WriteToXml(strURL, alLinks);
    }

    /// <summary>
    /// Trims the user input and prepends "http://" when no http/https scheme is present.
    /// </summary>
    /// <param name="input">Raw text typed by the user; may be null.</param>
    /// <returns>The normalised URL, or null when the input is null or blank.</returns>
    private static string NormalizeUrl(string input)
    {
        if (input == null)
        {
            return null;
        }

        string url = input.Trim();
        if (url.Length == 0)
        {
            return null;
        }

        // StartsWith never throws on short strings (the old Substring(0, 7) did),
        // and the ordinal, case-insensitive comparison accepts "HTTP://..." as well.
        if (url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            return url;
        }

        return @"http://" + url;
    }

    /// <summary>
    /// Downloads the HTML source of the given page.
    /// </summary>
    /// <param name="URL">Absolute address of the page to fetch.</param>
    /// <returns>The response body decoded as GB2312 text.</returns>
    private static string GetPageSource(string URL)
    {
        Uri uri = new Uri(URL);
        HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

        // Request options must be set BEFORE GetResponse(); once the request has
        // been sent, changing them has no effect on it.
        hwReq.Method = "GET";
        hwReq.KeepAlive = false;

        // Dispose the response, stream and reader so the connection is released
        // even when reading fails.
        using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
        using (Stream body = hwRes.GetResponseStream())
        // NOTE(review): the page encoding is hard-coded to GB2312, as in the original;
        // pages served in another charset will be decoded incorrectly.
        using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the distinct absolute http/https URLs that occur in the HTML text.
    /// </summary>
    /// <param name="htmlCode">HTML source to scan; null is treated as empty.</param>
    /// <returns>A sorted <see cref="ArrayList"/> of unique URL strings.</returns>
    private static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList al = new ArrayList();
        if (htmlCode == null)
        {
            return al;
        }

        // A hash-based "seen" set filters duplicate URLs without rescanning the list.
        Hashtable seen = new Hashtable();
        foreach (Match match in LinkRegex.Matches(htmlCode))
        {
            string url = match.Value;
            if (!seen.ContainsKey(url))
            {
                seen.Add(url, null);
                al.Add(url);
            }
        }

        al.Sort();
        return al;
    }

    /// <summary>
    /// Writes the extracted URLs to "HyperLinks.xml", one element per URL,
    /// named after the URL's domain suffix.
    /// </summary>
    /// <param name="strURL">Address of the page the links were extracted from.</param>
    /// <param name="alHyperLinks">URL strings to write.</param>
    private static void WriteToXml(string strURL, ArrayList alHyperLinks)
    {
        // XML comments may not contain "--" (which occurs in e.g. punycode "xn--" hosts);
        // XmlTextWriter.WriteComment throws on it, so break such runs up first.
        string comment = "提取自" + strURL + "的超链接";
        while (comment.IndexOf("--", StringComparison.Ordinal) >= 0)
        {
            comment = comment.Replace("--", "- -");
        }

        XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8);
        try
        {
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
            writer.WriteComment(comment);

            // Outer and inner element share the name "HyperLinks"; the inner one
            // carries the timestamp attribute and the URL entries.
            writer.WriteStartElement("HyperLinks");
            writer.WriteStartElement("HyperLinks", null);
            writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

            foreach (string link in alHyperLinks)
            {
                writer.WriteElementString(GetDomain(link), null, link);
            }

            writer.WriteEndElement();
            writer.WriteEndElement();
            writer.Flush();
        }
        finally
        {
            // Always release the file handle, even if writing failed part-way.
            writer.Close();
        }
    }

    /// <summary>
    /// Returns the domain suffix of a URL's host.
    /// </summary>
    /// <param name="strURL">URL to classify; null is treated as unrecognised.</param>
    /// <returns>
    /// "com", "net", "cn", "org" or "gov" (in the casing used by the URL) when the
    /// host ends with that suffix; otherwise "other".
    /// </returns>
    private static string GetDomain(string strURL)
    {
        if (strURL == null)
        {
            return "other";
        }

        // The suffix no longer has to be followed by '/', so "http://example.com"
        // is recognised, and a ".com/" inside the path is not mistaken for the host.
        Match m = DomainSuffixRegex.Match(strURL);
        if (!m.Success)
        {
            return "other";
        }

        return m.Groups[1].Value;
    }
}

 
 
 
免责声明:本文为网络用户发布,其观点仅代表作者个人观点,与本站无关,本站仅提供信息存储服务。文中陈述内容未经本站证实,其真实性、完整性、及时性本站不作任何保证或承诺,请读者仅作参考,并请自行核实相关内容。
© 2005- 王朝网络 版权所有