1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
|
using
System;
using
System.Collections.Generic;
using
System.Web;
using
System.Text;
using
System.Net;
using
System.IO;
using
System.Text.RegularExpressions;
using
System.Collections;
using
System.IO.Compression;
/// <summary>
/// Web page crawling helpers: download a page, detect its charset, and extract the
/// title, description, links, body text and images from it.
/// Author: loafinweb
/// Date: 2011-09-12
/// </summary>
// The "public" modifier used to sit at the end of the single-line doc comment, where it
// was part of the comment text; it is restored here as a real modifier.
public class webCrawl
{
/// <summary>
/// Creates an instance. Among the members of this class only <c>DownloadImg</c> is an
/// instance method; the other helpers are static.
/// </summary>
public webCrawl() { }
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its text, decoded with the
/// charset name reported by <c>getEncoding</c>.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>
/// The response body; an empty string when the status is not 200 OK, when the declared
/// Content-Length is 1 MiB or more, or when any error occurs.
/// </returns>
public static string getHtml(string url)
{
    try
    {
        // getEncoding performs its own request, so each call fetches the page twice.
        Encoding en = Encoding.GetEncoding(getEncoding(url));

        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Headers.Set("Pragma", "no-cache");
        request.Timeout = 30000;

        // Dispose the response on every path; otherwise the underlying connection is
        // held until finalization and repeated calls can exhaust the connection pool.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= 1024 * 1024)
            {
                return string.Empty;
            }

            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, en))
            {
                return reader.ReadToEnd();
            }
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: any failure (network, unknown charset name,
        // malformed URL) is reported to the caller as an empty page.
        return string.Empty;
    }
}
/// <summary>
/// Determines the charset name of the page at <paramref name="url"/>.
/// </summary>
/// <remarks>
/// Lookup order: a <c>charset=...</c> declaration found in the page text, then the
/// charset of the HTTP Content-Type header, then <c>Encoding.Default.BodyName</c>.
/// Redirects are not followed, so a redirecting URL yields the default name.
/// </remarks>
/// <param name="url">Absolute URL of the page to inspect.</param>
/// <returns>A charset name; never null or empty.</returns>
/// <exception cref="WebException">The request failed or timed out.</exception>
public static string getEncoding(string url)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Timeout = 30000;
    request.AllowAutoRedirect = false;

    // Exceptions propagate unchanged (type and stack trace preserved) instead of being
    // re-wrapped in a bare Exception; callers that catch Exception are unaffected.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= 1024 * 1024)
        {
            return Encoding.Default.BodyName;
        }

        // Read header-derived values before consuming the body stream.
        string headerCharset = response.CharacterSet;
        bool gzipped = "gzip".Equals(response.ContentEncoding, StringComparison.OrdinalIgnoreCase);

        string html;
        using (Stream raw = response.GetResponseStream())
        using (Stream content = gzipped ? (Stream)new GZipStream(raw, CompressionMode.Decompress) : raw)
        // ASCII is sufficient here: only the charset name is extracted from the text.
        using (StreamReader reader = new StreamReader(content, Encoding.ASCII))
        {
            html = reader.ReadToEnd();
        }

        // Accepts both charset=utf-8 inside a content attribute and the HTML5 form
        // <meta charset="utf-8">; the optional quote is skipped so that the captured
        // name is never empty.
        Match declared = Regex.Match(
            html,
            @"charset\s*=\s*[""']?\s*(?<charset>[\w-]+)",
            RegexOptions.IgnoreCase);
        if (declared.Success)
        {
            return declared.Groups["charset"].Value;
        }

        if (!string.IsNullOrEmpty(headerCharset))
        {
            return headerCharset;
        }

        return Encoding.Default.BodyName;
    }
}
/// <summary>
/// Fetches the page at <paramref name="url"/> and returns the text of its
/// &lt;title&gt; element with every non-word character removed.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>
/// The condensed title, or an empty string when the page has no matching title element
/// or could not be downloaded.
/// </returns>
public static string getTitle(string url)
{
    string page = getHtml(url);
    Match found = Regex.Match(page, "<title>([^<]*)</title>", RegexOptions.IgnoreCase | RegexOptions.Multiline);

    // \W strips whitespace and punctuation alike, not only spaces.
    return Regex.Replace(found.Groups[1].Value, @"\W", "");
}
/// <summary>
/// Fetches the page at <paramref name="url"/> and returns the content of its
/// <c>meta name="Description"</c> tag with every non-word character removed.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>
/// The condensed description, or an empty string when the tag is missing or the page
/// could not be downloaded.
/// </returns>
public static string getDescription(string url)
{
    string htmlStr = getHtml(url);

    // Capture up to the closing quote only. The previous greedy [^<]* ran on to the end
    // of the tag, so trailing attributes leaked into the result.
    Match desc = Regex.Match(
        htmlStr,
        "<meta name=\"Description\" content=\"([^\"]*)\"[^>]*>",
        RegexOptions.IgnoreCase | RegexOptions.Multiline);

    // \W strips whitespace and punctuation alike.
    return Regex.Replace(desc.Groups[1].Value, @"\W", "");
}
/// <summary>
/// Extracts the distinct http/https URLs that appear in <paramref name="htmlStr"/>.
/// </summary>
/// <param name="htmlStr">HTML (or any text) to scan; may be null or empty.</param>
/// <returns>
/// The URLs in order of first appearance, without duplicates; an empty list when the
/// input is null or empty.
/// </returns>
public static List<string> getLink(string htmlStr)
{
    List<string> links = new List<string>();
    if (string.IsNullOrEmpty(htmlStr))
    {
        return links;
    }

    // NOTE(review): the original code referenced an undeclared identifier "reg" here.
    // This pattern is a reconstruction (scheme, dotted host, optional path/query);
    // confirm it against the intended one.
    const string linkPattern = @"https?://([\w-]+\.)+[\w-]+(/[\w\-./?%&=]*)?";

    // The set gives O(1) duplicate checks; the list preserves first-seen order.
    HashSet<string> seen = new HashSet<string>();
    foreach (Match match in Regex.Matches(htmlStr, linkPattern, RegexOptions.IgnoreCase))
    {
        if (seen.Add(match.Value))
        {
            links.Add(match.Value);
        }
    }

    return links;
}
/// <summary>
/// Fetches the page at <paramref name="url"/> and returns the text of its
/// &lt;body&gt; element after it has been passed through <c>parseHtml</c>.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>
/// The filtered body text, or an empty string when no body element is matched.
/// </returns>
public static string getBody(string url)
{
    // (?is): case-insensitive, and '.' also matches newlines so the body may span lines.
    Match bodyMatch = Regex.Match(getHtml(url), @"(?is)<body[^>]*>(?:(?!</?body\b).)*</body>");

    return bodyMatch.Success ? parseHtml(bodyMatch.Value) : string.Empty;
}
/// <summary>
/// Fetches the page at <paramref name="url"/> and returns every &lt;img ...&gt; tag
/// found in it, as raw markup.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>
/// The matched tags in document order; an empty list when there are none or the page
/// could not be downloaded.
/// </returns>
public static List<string> getImg(string url)
{
    List<string> tags = new List<string>();
    string htmlStr = getHtml(url);

    // Case-insensitive with a word boundary: also matches <Img ...> and no longer
    // matches unrelated tags whose name merely starts with "img".
    foreach (Match tag in Regex.Matches(htmlStr, @"<img\b[^>]+>", RegexOptions.IgnoreCase))
    {
        tags.Add(tag.Value);
    }

    return tags;
}
/// <summary>
/// Fetches the page at <paramref name="url"/> and returns the <c>src</c> of every
/// image tag that points to a gif/jpg/bmp/png file, converted to an absolute URL.
/// </summary>
/// <param name="url">Absolute URL of the page; also the base for relative paths.</param>
/// <returns>
/// Absolute image URLs in document order (duplicates are kept); an empty list when
/// there are none or the page could not be downloaded.
/// </returns>
public static List<string> getImgPath(string url)
{
    List<string> paths = new List<string>();
    string htmlStr = getHtml(url);

    // The quote classes use a plain apostrophe so that src='a.png' is captured without
    // its quotes (the previous typographic quote never matched real markup).
    const string pat = @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>";

    Uri pageUri;
    bool hasPageUri = Uri.TryCreate(url, UriKind.Absolute, out pageUri);

    foreach (Match m in Regex.Matches(htmlStr, pat, RegexOptions.IgnoreCase | RegexOptions.Multiline))
    {
        string imgPath = m.Groups["imgUrl"].Value.Trim();

        // Second filter: keep only paths ending in an image extension (any letter case).
        if (!Regex.IsMatch(imgPath, @"\w+\.(gif|jpg|bmp|png)$", RegexOptions.IgnoreCase))
        {
            continue;
        }

        // Test the scheme prefix rather than "contains http", so a relative name such
        // as "img/http-logo.png" is still resolved.
        bool isAbsolute =
            imgPath.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            imgPath.StartsWith("https://", StringComparison.OrdinalIgnoreCase);

        if (!isAbsolute)
        {
            Uri resolved;
            if (hasPageUri && Uri.TryCreate(pageUri, imgPath, out resolved))
            {
                // Handles "a.png", "../a.png", "/a.png" and "//host/a.png" uniformly.
                imgPath = resolved.AbsoluteUri;
            }
            else
            {
                // Fallback: plain concatenation onto the page's directory.
                imgPath = getUrl(url) + imgPath;
            }
        }

        paths.Add(imgPath);
    }

    return paths;
}
/// <summary>
/// Downloads the file at <paramref name="fileurl"/> into the directory returned by
/// <c>Server.MapPath("")</c>, under a timestamp-based name that keeps the original
/// file extension.
/// </summary>
/// <remarks>
/// Reads <c>System.Web.HttpContext.Current</c>.
/// NOTE(review): presumably must run inside an ASP.NET request — confirm.
/// The name has 1/10000-second resolution, so two downloads within the same tick
/// produce the same file name.
/// </remarks>
/// <param name="fileurl">Absolute URL of the image to download.</param>
public void DownloadImg(string fileurl)
{
    // Take the extension from the URL path only, so a query string or a dot in the
    // host name cannot end up in the file name.
    string extension = Path.GetExtension(new Uri(fileurl).AbsolutePath);
    string imgName = DateTime.Now.ToString("yyyyMMddHHmmssffff") + extension;
    string filepath = System.Web.HttpContext.Current.Server.MapPath("") + "/" + imgName;

    using (WebClient mywebclient = new WebClient())
    {
        mywebclient.DownloadFile(fileurl, filepath);
    }
}
/// <summary>
/// Reduces an HTML fragment to condensed text: removes every tag, then any leftover
/// angle brackets, then all whitespace.
/// </summary>
/// <param name="html">HTML fragment to filter.</param>
/// <returns>The remaining characters, with no whitespace between them.</returns>
public static string parseHtml(string html)
{
    string withoutTags = Regex.Replace(html, "<[^>]*>", string.Empty);

    // Unbalanced brackets survive the tag pass; drop them individually.
    string withoutBrackets = withoutTags
        .Replace("<", string.Empty)
        .Replace(">", string.Empty);

    return Regex.Replace(withoutBrackets, @"\s+", "");
}
/// <summary>
/// Returns the "directory" part of <paramref name="url"/>: everything up to and
/// including the last slash of its path, always ending in "/".
/// </summary>
/// <remarks>
/// Examples: <c>http://a.com/dir/page.html</c> gives <c>http://a.com/dir/</c>;
/// <c>http://a.com</c> gives <c>http://a.com/</c>. A query string or fragment is
/// ignored, so slashes inside it are not taken for path separators.
/// </remarks>
/// <param name="url">URL to shorten; must not be null.</param>
/// <returns>The base URL with a trailing slash.</returns>
/// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
public static string getUrl(string url)
{
    if (url == null)
    {
        throw new ArgumentNullException("url");
    }

    // Drop ?query / #fragment before looking for the last path separator.
    int cut = url.IndexOfAny(new[] { '?', '#' });
    string path = cut < 0 ? url : url.Substring(0, cut);

    // Slashes belonging to "scheme://" are not path separators.
    int schemeEnd = path.IndexOf("://", StringComparison.Ordinal);
    int pathStart = schemeEnd < 0 ? 0 : schemeEnd + 3;

    int lastSlash = path.LastIndexOf('/');
    if (lastSlash < pathStart)
    {
        // No path component at all (e.g. "http://host"): the URL itself is the base.
        return path + "/";
    }

    return path.Substring(0, lastSlash + 1);
}
} |