分享一个 C# 的网页抓取类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
using System;
using System.Collections.Generic;
using System.Web;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Collections;
using System.IO.Compression;
 
/// <summary>
/// Name: web page crawling helper.
/// Author: loafinweb
/// Date: 2011-09-12
/// Downloads a page over HTTP and extracts pieces of it (title, description,
/// links, body text, images) with regular expressions.
/// </summary>
public class webCrawl
{
    // Timeout applied to every HTTP request, in milliseconds.
    private const int RequestTimeoutMilliseconds = 30000;

    // Responses that declare a Content-Length of 1 MiB or more are skipped.
    private const long MaxContentLength = 1024 * 1024;

    // Finds "charset=NAME" in headers/meta tags, with or without quotes around NAME.
    private static readonly Regex CharsetRegex =
        new Regex(@"charset\s*=\s*[""']?\s*(?<charset>[\w-]+)", RegexOptions.IgnoreCase);

    // Absolute http/https links.
    private static readonly Regex LinkRegex =
        new Regex(@"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?", RegexOptions.IgnoreCase);

    // Captures the src attribute of an <img> tag (double-quoted, single-quoted or bare).
    private static readonly Regex ImgSrcRegex = new Regex(
        @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
        RegexOptions.IgnoreCase | RegexOptions.Multiline);

    // Keeps only sources that end in a known image extension.
    private static readonly Regex ImageExtensionRegex =
        new Regex(@"\w+\.(gif|jpg|bmp|png)$", RegexOptions.IgnoreCase);

    public webCrawl() { }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns it as text,
    /// decoded with the charset reported by <see cref="getEncoding"/>.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The page text, or an empty string when the request fails, the status
    /// is not 200 OK, or the declared length is 1 MiB or more.
    /// </returns>
    public static string getHtml(string url)
    {
        try
        {
            Encoding encoding = ResolveEncoding(getEncoding(url));

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Headers.Set("Pragma", "no-cache");
            request.Timeout = RequestTimeoutMilliseconds;

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= MaxContentLength)
                {
                    return string.Empty;
                }

                using (Stream body = OpenBodyStream(response))
                using (StreamReader reader = new StreamReader(body, encoding))
                {
                    return reader.ReadToEnd();
                }
            }
        }
        catch (Exception)
        {
            // Deliberate best-effort contract: callers get an empty string on any failure.
            return string.Empty;
        }
    }

    /// <summary>
    /// Determines the charset name of the page at <paramref name="url"/>.
    /// A "charset=" declaration found in the body wins, then the charset of
    /// the Content-Type header, then the system default encoding.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>A charset name such as "utf-8".</returns>
    /// <remarks>
    /// Redirects are not followed here; a redirect response yields the default
    /// encoding name. Request failures propagate to the caller.
    /// </remarks>
    public static string getEncoding(string url)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = RequestTimeoutMilliseconds;
        request.AllowAutoRedirect = false;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= MaxContentLength)
            {
                return Encoding.Default.BodyName;
            }

            string html;
            // ASCII is enough for sniffing: charset names are plain ASCII tokens.
            using (Stream body = OpenBodyStream(response))
            using (StreamReader reader = new StreamReader(body, Encoding.ASCII))
            {
                html = reader.ReadToEnd();
            }

            Match declared = CharsetRegex.Match(html);
            if (declared.Success)
            {
                return declared.Groups["charset"].Value;
            }

            if (!string.IsNullOrEmpty(response.CharacterSet))
            {
                return response.CharacterSet;
            }

            return Encoding.Default.BodyName;
        }
    }

    /// <summary>
    /// Downloads the page and returns the text of its &lt;title&gt; element.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The title with every non-word character (whitespace and punctuation)
    /// removed, or an empty string when no title is found.
    /// </returns>
    public static string getTitle(string url)
    {
        string htmlStr = getHtml(url);
        Match titleMatch = Regex.Match(htmlStr, "<title>([^<]*)</title>", RegexOptions.IgnoreCase | RegexOptions.Multiline);
        return Regex.Replace(titleMatch.Groups[1].Value, @"\W", "");
    }

    /// <summary>
    /// Downloads the page and returns the content of its description meta tag.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The description with every non-word character removed, or an empty
    /// string when the tag is not found.
    /// </returns>
    public static string getDescription(string url)
    {
        string htmlStr = getHtml(url);
        Match description = Regex.Match(htmlStr, "<meta name=\"Description\" content=\"([^<]*)\"*>", RegexOptions.IgnoreCase | RegexOptions.Multiline);
        return Regex.Replace(description.Groups[1].Value, @"\W", "");
    }

    /// <summary>
    /// Extracts every distinct absolute http/https link from the given HTML.
    /// </summary>
    /// <param name="htmlStr">HTML text to scan.</param>
    /// <returns>The links in order of first appearance, without duplicates.</returns>
    public static List<string> getLink(string htmlStr)
    {
        List<string> list = new List<string>();
        HashSet<string> seen = new HashSet<string>();
        foreach (Match match in LinkRegex.Matches(htmlStr))
        {
            // HashSet.Add returns false for a link that was already recorded.
            if (seen.Add(match.Value))
            {
                list.Add(match.Value);
            }
        }
        return list;
    }

    /// <summary>
    /// Downloads the page and returns the text inside its &lt;body&gt; element
    /// with all tags and whitespace removed (see <see cref="parseHtml"/>).
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>The stripped body text, or an empty string when no body is found.</returns>
    public static string getBody(string url)
    {
        string htmlStr = getHtml(url);
        Regex regBody = new Regex(@"(?is)<body[^>]*>(?:(?!</?body\b).)*</body>");
        Match m = regBody.Match(htmlStr);
        return m.Success ? parseHtml(m.Value) : string.Empty;
    }

    /// <summary>
    /// Downloads the page and returns every &lt;img&gt; tag found in it.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>The raw tag text of each image, in document order.</returns>
    public static List<string> getImg(string url)
    {
        List<string> list = new List<string>();
        string htmlStr = getHtml(url);
        foreach (Match match in Regex.Matches(htmlStr, @"<(IMG|img)[^>]+>"))
        {
            list.Add(match.Value);
        }
        return list;
    }

    /// <summary>
    /// Downloads the page and returns the URL of every image whose source ends
    /// in .gif, .jpg, .bmp or .png. Relative sources are resolved against
    /// <paramref name="url"/> so every returned entry is absolute.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>Absolute image URLs, in document order.</returns>
    public static List<string> getImgPath(string url)
    {
        List<string> list = new List<string>();
        string htmlStr = getHtml(url);

        Uri pageUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out pageUri))
        {
            pageUri = null;
        }

        foreach (Match m in ImgSrcRegex.Matches(htmlStr))
        {
            string imgPath = m.Groups["imgUrl"].Value.Trim();
            // Second filter: drop sources that are pages rather than image files.
            if (ImageExtensionRegex.IsMatch(imgPath))
            {
                list.Add(ToAbsoluteImageUrl(url, pageUri, imgPath));
            }
        }
        return list;
    }

    /// <summary>
    /// Downloads the file at <paramref name="fileurl"/> into the directory of
    /// the current web request, under a timestamp-based file name that keeps
    /// the extension of the URL path.
    /// </summary>
    /// <param name="fileurl">
    /// Absolute URL of the file, e.g. http://xxx.com/img/logo.jpg. URLs that
    /// contain no '.' are ignored.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="fileurl"/> is null.</exception>
    /// <exception cref="InvalidOperationException">There is no current HTTP context.</exception>
    public void DownloadImg(string fileurl)
    {
        if (fileurl == null)
        {
            throw new ArgumentNullException("fileurl");
        }
        if (fileurl.IndexOf('.') < 0)
        {
            return;
        }

        System.Web.HttpContext context = System.Web.HttpContext.Current;
        if (context == null)
        {
            throw new InvalidOperationException("DownloadImg requires an active HttpContext to resolve the target directory.");
        }

        string imgName = DateTime.Now.ToString("yyyyMMddHHmmssffff") + GetUrlExtension(fileurl);
        string filepath = Path.Combine(context.Server.MapPath(""), imgName);
        using (WebClient client = new WebClient())
        {
            client.DownloadFile(fileurl, filepath);
        }
    }

    /// <summary>
    /// Strips HTML from the given text: removes every tag, any stray angle
    /// brackets, and all whitespace.
    /// </summary>
    /// <param name="html">HTML fragment to strip.</param>
    /// <returns>The remaining text with no whitespace at all.</returns>
    public static string parseHtml(string html)
    {
        string value = Regex.Replace(html, "<[^>]*>", string.Empty);
        value = value.Replace("<", string.Empty);
        value = value.Replace(">", string.Empty);
        return Regex.Replace(value, @"\s+", "");
    }

    /// <summary>
    /// Returns the directory part of a URL, always ending in '/'.
    /// For example http://www.xxx.com/art.aspx gives http://www.xxx.com/.
    /// </summary>
    /// <param name="url">URL to trim.</param>
    /// <returns>The URL up to and including the last '/' of its path.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    public static string getUrl(string url)
    {
        if (url == null)
        {
            throw new ArgumentNullException("url");
        }

        string target = url;
        Uri parsed;
        if (Uri.TryCreate(url, UriKind.Absolute, out parsed)
            && (parsed.Scheme == Uri.UriSchemeHttp || parsed.Scheme == Uri.UriSchemeHttps))
        {
            // Drop query and fragment so a '/' inside them cannot be mistaken for a
            // path separator; this also turns "http://host" into "http://host/".
            target = parsed.GetLeftPart(UriPartial.Path);
        }

        int lastSlash = target.LastIndexOf('/');
        return lastSlash < 0 ? target + "/" : target.Substring(0, lastSlash + 1);
    }

    // Returns the response body stream, transparently un-gzipping it when the
    // server declared Content-Encoding: gzip.
    private static Stream OpenBodyStream(HttpWebResponse response)
    {
        Stream raw = response.GetResponseStream();
        string contentEncoding = response.ContentEncoding;
        if (contentEncoding != null && contentEncoding.Equals("gzip", StringComparison.OrdinalIgnoreCase))
        {
            return new GZipStream(raw, CompressionMode.Decompress);
        }
        return raw;
    }

    // Maps a charset name to an Encoding, falling back to the system default
    // when the name is unknown or unsupported on this platform.
    private static Encoding ResolveEncoding(string charsetName)
    {
        try
        {
            return Encoding.GetEncoding(charsetName);
        }
        catch (ArgumentException)
        {
            return Encoding.Default;
        }
        catch (NotSupportedException)
        {
            return Encoding.Default;
        }
    }

    // Turns an image source into an absolute URL relative to the page it came from.
    private static string ToAbsoluteImageUrl(string pageUrl, Uri pageUri, string imgPath)
    {
        if (imgPath.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            || imgPath.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            return imgPath;
        }

        Uri resolved;
        if (pageUri != null && Uri.TryCreate(pageUri, imgPath, out resolved))
        {
            return resolved.AbsoluteUri;
        }

        // The page URL could not be parsed: fall back to plain concatenation.
        return getUrl(pageUrl) + imgPath;
    }

    // Returns the file extension (including the dot) of the path part of a URL,
    // or an empty string when the last path segment has none.
    private static string GetUrlExtension(string fileurl)
    {
        string path = fileurl;
        int cut = path.IndexOfAny(new char[] { '?', '#' });
        if (cut >= 0)
        {
            path = path.Substring(0, cut);
        }

        int lastDot = path.LastIndexOf('.');
        int lastSlash = path.LastIndexOf('/');
        return lastDot > lastSlash ? path.Substring(lastDot) : string.Empty;
    }
}

  

分享一个 C# 的网页抓取类,布布扣,bubuko.com

分享一个 C# 的网页抓取类

上一篇:基于jQuery基础表单验证


下一篇:Windows环境下使用Cmake ndk编译fdk-aac