这几天都在搞网页抓取工具,花了一个小时自己写了一个

亲亲 发布于 2010/09/05 09:22
阅读 666
收藏 0

RT

没事自己写的工具 

核心代码如下:主要是利用 HttpWebRequest 抓取网页并分析数据;下一步打算加入正则过滤,以及自动检测网页编码。

C# code

  
/// <summary>
/// Sends an HTTP GET or POST request to <paramref name="postUrl"/> and returns
/// the response body as text.
/// </summary>
/// <param name="data">
/// Form-encoded request body. Only sent when <paramref name="method"/> is "post";
/// a null value is sent as an empty body.
/// </param>
/// <param name="method">
/// "post" (compared case-insensitively) issues a POST; any other value issues a GET.
/// </param>
/// <param name="postUrl">Absolute http:// or https:// URL to request.</param>
/// <returns>
/// The response body decoded as UTF-8 on success; the body of the server's error
/// response when the server answered with an error status; otherwise a message
/// starting with "发生异常" followed by the exception message.
/// </returns>
/// <exception cref="ArgumentException">
/// <paramref name="postUrl"/> does not resolve to an HTTP(S) request.
/// </exception>
public static string PostData(string data, string method, string postUrl)
{
    // WebRequest.Create returns a non-HTTP request type for schemes such as ftp://,
    // in which case the "as" cast yields null. Fail with a clear message instead of
    // a NullReferenceException on the first property access.
    HttpWebRequest request = WebRequest.Create(postUrl) as HttpWebRequest;
    if (request == null)
    {
        throw new ArgumentException("postUrl must be an http:// or https:// URL.", "postUrl");
    }

    request.ProtocolVersion = HttpVersion.Version10;
    request.AllowAutoRedirect = true;
    request.KeepAlive = true;
    request.Headers.Add("Accept-Language", "zh-cn");
    request.Accept = "image/gif, image/jpeg, image/pjpeg, image/pjpeg, application/xaml+xml, application/vnd.ms-xpsdocument, application/x-ms-xbap, application/x-ms-application, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*";
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; InfoPath.2; CIBA; .NET4.0C; .NET4.0E)";
    request.Method = "GET";

    string html = string.Empty;

    try
    {
        // Case-insensitive so that "POST"/"Post" are not silently downgraded to GET.
        if (string.Equals(method, "post", StringComparison.OrdinalIgnoreCase))
        {
            // UTF-8 keeps non-ASCII characters intact (ASCII encoding would replace
            // them with '?') and produces identical bytes for pure-ASCII input.
            byte[] body = Encoding.UTF8.GetBytes(data ?? string.Empty);
            request.ContentType = "application/x-www-form-urlencoded";
            request.ContentLength = body.Length;
            request.Method = "POST";

            // Writing the body opens the connection, so it lives inside the try:
            // connection failures are then reported the same way for GET and POST.
            using (Stream stream = request.GetRequestStream())
            {
                stream.Write(body, 0, body.Length);
            }
        }

        // Fetch the resource returned by the server.
        using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("utf-8")))
        {
            html = reader.ReadToEnd();
        }
    }
    catch (WebException wex)
    {
        if (wex.Response == null)
        {
            // No response at all (DNS failure, refused connection, timeout, ...):
            // there is no body to read, so report the failure message instead.
            html = "发生异常\n\r" + wex.Message;
        }
        else
        {
            // The server answered with an error status; return its body.
            // NOTE(review): the error body is decoded with the system default code
            // page while the success path uses UTF-8 - confirm this is intended.
            using (WebResponse errorResponse = wex.Response)
            using (Stream st = errorResponse.GetResponseStream())
            using (StreamReader sr = new StreamReader(st, System.Text.Encoding.Default))
            {
                html = sr.ReadToEnd();
            }
        }
    }
    catch (Exception ex)
    {
        html = "发生异常\n\r" + ex.Message;
    }

    return html;
}






下载地址:http://download.csdn.net/source/2670822

大家别喷啊。。 我只是菜鸟

加载中
0
Yu7
Yu7

为什么不用webclient,有什么特别考虑吗?

0
答复哈
答复哈

网页抓取有用到Post的时候吗?

返回顶部
顶部