提取网页中的超链接（C#）

王朝c#·作者佚名 2006-01-09

宽屏版字体: 小 | 中 | 大 | 超大

using System;

using System.Xml;

using System.Text;

using System.Net;

using System.IO;

using System.Collections;

using System.Text.RegularExpressions;

public class App

{

/// <summary>
/// Program entry point: prompts for a page address, downloads the page,
/// extracts the hyperlinks it contains and writes them to an XML file.
/// </summary>
public static void Main()
{
    string strCode;
    ArrayList alLinks;

    Console.Write("请输入一个网页地址：");
    string strURL = Console.ReadLine();

    // ReadLine returns null at end of input, and an empty address cannot be
    // fetched; stop here instead of failing later with an exception.
    if (strURL == null || strURL.Trim().Length == 0)
    {
        Console.WriteLine("未输入网页地址。");
        return;
    }
    strURL = strURL.Trim();

    // StartsWith is safe for inputs shorter than the prefix (Substring(0, 7)
    // is not), and an address that already carries an http/https scheme in
    // any letter case must not get a second scheme prepended.
    if (!strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
        !strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        strURL = @"http://" + strURL;
    }

    Console.WriteLine("正在获取页面代码，请稍侯...");
    strCode = GetPageSource(strURL);

    Console.WriteLine("正在提取超链接，请稍侯...");
    alLinks = GetHyperLinks(strCode);

    Console.WriteLine("正在写入文件，请稍侯...");
    WriteToXml(strURL, alLinks);
}

// 获取指定网页的HTML代码

/// <summary>
/// Downloads the page at <paramref name="URL"/> and returns its body as text.
/// </summary>
/// <param name="URL">Absolute http/https address of the page to fetch.</param>
/// <returns>The response body decoded with the GB2312 encoding.</returns>
static string GetPageSource(string URL)
{
    Uri uri = new Uri(URL);
    HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

    // Request options only take effect if they are set before GetResponse()
    // sends the request, so configure them first.
    hwReq.Method = "GET";
    hwReq.KeepAlive = false;

    // Dispose the response, its stream and the reader even when reading
    // fails, so the underlying connection is always released.
    using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
    using (Stream body = hwRes.GetResponseStream())
    using (StreamReader reader = new StreamReader(body, System.Text.Encoding.GetEncoding("GB2312")))
    {
        return reader.ReadToEnd();
    }
}

// 提取HTML代码中的网址

/// <summary>
/// Extracts the distinct absolute http/https URLs found in a piece of HTML.
/// </summary>
/// <param name="htmlCode">HTML text to scan; null or empty yields an empty list.</param>
/// <returns>An <see cref="ArrayList"/> of unique URL strings, sorted ascending.</returns>
static ArrayList GetHyperLinks(string htmlCode)
{
    ArrayList al = new ArrayList();

    if (htmlCode == null || htmlCode.Length == 0)
    {
        return al;
    }

    // scheme :// dotted host name, optionally followed by a path/query made of
    // word characters, spaces and - . / ? % & =
    string strRegex = @"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
    Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);

    foreach (Match m in r.Matches(htmlCode))
    {
        string strNew = m.Value;

        // Skip URLs that were already collected. The check happens outside of
        // any enumeration of the list, so the first URL is added as well and
        // the list is never modified while it is being iterated.
        if (!al.Contains(strNew))
        {
            al.Add(strNew);
        }
    }

    al.Sort();
    return al;
}

// 把网址写入xml文件

/// <summary>
/// Writes the given links to the file "HyperLinks.xml" in the current directory.
/// Each link becomes one element named after the value returned by GetDomain.
/// </summary>
/// <param name="strURL">Address of the page the links came from; used in a leading XML comment.</param>
/// <param name="alHyperLinks">URL strings to write; null is treated as an empty list.</param>
static void WriteToXml(string strURL, ArrayList alHyperLinks)
{
    // XML comments must not contain "--"; break such runs up so that a URL
    // containing consecutive hyphens cannot make WriteComment throw.
    string commentUrl = (strURL == null) ? "" : strURL;
    while (commentUrl.IndexOf("--", StringComparison.Ordinal) >= 0)
    {
        commentUrl = commentUrl.Replace("--", "- -");
    }

    XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8);
    try
    {
        writer.Formatting = Formatting.Indented;
        writer.WriteStartDocument(false);
        writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
        writer.WriteComment("提取自" + commentUrl + "的超链接");

        // NOTE(review): two nested "HyperLinks" elements are opened, as in the
        // original output; confirm against urls.dtd whether both are intended.
        writer.WriteStartElement("HyperLinks");
        writer.WriteStartElement("HyperLinks", null);
        writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

        if (alHyperLinks != null)
        {
            foreach (string str in alHyperLinks)
            {
                string title = GetDomain(str);
                writer.WriteElementString(title, null, str);
            }
        }

        // Close both opened elements explicitly instead of relying on Close().
        writer.WriteEndElement();
        writer.WriteEndElement();
        writer.WriteEndDocument();
        writer.Flush();
    }
    finally
    {
        // Always release the file handle, even if writing failed part-way.
        writer.Close();
    }
}

// 获取网址的域名后缀

/// <summary>
/// Returns the first recognised domain suffix (com, net, cn, org or gov)
/// that occurs in a URL.
/// </summary>
/// <param name="strURL">URL to inspect; may be null.</param>
/// <returns>
/// The suffix without its dot, in the letter case used in the URL, or
/// "other" when none of the recognised suffixes is found.
/// </returns>
static string GetDomain(string strURL)
{
    if (strURL == null)
    {
        return "other";
    }

    // The suffix counts only when it ends a host-like token, i.e. when it is
    // followed by '/', ':', '?', '#' or the end of the string. This also
    // covers URLs without a trailing slash, with a port, or with a query.
    string strRegex = @"\.(com|net|cn|org|gov)(?=[/:?#]|$)";
    Match m = Regex.Match(strURL, strRegex, RegexOptions.IgnoreCase);

    if (!m.Success)
    {
        return "other";
    }

    return m.Groups[1].Value;
}