一个简单的C#爬虫程序

2022-09-09 22:26:58

　　　　这篇文章主要展示了如何用C#语言抓取网站中的图片。实现原理就是基于http请求。C#给我们提供了HttpWebRequest和WebClient两个对象，方便发送请求获取数据，下面看如何实现。

1，HttpGetAction方法。用于发送请求获取数据后处理字符串得到图片地址

 /// <summary>
 /// Fetches the HTML at <paramref name="url"/> with a GET request, extracts the
 /// <c>src</c> value of every <c>&lt;img&gt;</c> tag, and hands each address that
 /// contains "http" and does not contain ".svg" to <c>HttpGetImg</c> for download.
 /// Progress, the number of images handled and the elapsed time are written to the console.
 /// </summary>
 /// <param name="url">Address of the page to crawl.</param>
 /// <param name="path">Directory passed through to <c>HttpGetImg</c> as the download target.</param>
 /// <param name="name">Number used as the file name of the first image; incremented after each image.</param>
 public static void HttpGetAction(string url,string path,int name)
 {
     Stopwatch sw = new Stopwatch();
     sw.Start();

     Console.WriteLine("抓取地址:" + url);

     string result = string.Empty;
     HttpWebRequest webRequest = WebRequest.CreateHttp(url);
     webRequest.Method = "GET";

     // Dispose the response together with the reader so the underlying
     // connection is released even if reading throws.
     using (WebResponse response = webRequest.GetResponse())
     using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
     {
         result = reader.ReadToEnd();
     }

     if (string.IsNullOrEmpty(result))
     {
         Console.WriteLine("请求地址错误");
         Console.ReadKey();
         return;
     }

     // Extract the src address of each img tag.
     Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);

     // Find every match in the downloaded page.
     MatchCollection matches = regImg.Matches(result);

     // Number of images handed to HttpGetImg; starts at zero and is bumped per image.
     int i = 0;

     // One WebClient is shared by all downloads and disposed when the loop ends.
     using (WebClient web = new WebClient())
     {
         foreach (Match match in matches)
         {
             string imgsrc = match.Groups["imgUrl"].Value;

             // Skip relative addresses (no "http") and SVG images.
             if (imgsrc.Contains("http") && !imgsrc.Contains(".svg"))
             {
                 i++;
                 HttpGetImg(web,imgsrc, path,name);
                 name++; // next image file name
             }
         }
     }

     sw.Stop();

     Console.WriteLine("爬取完成!总共爬取了" + i + "张图片!");
     // Milliseconds to whole seconds (integer division), matching the "秒" label.
     Console.WriteLine("爬取图片耗时:" + sw.ElapsedMilliseconds / 1000 + "秒");
 }

2，HttpGetImg方法。下载图片到指定目录

 /// <summary>
 /// Downloads the image at <paramref name="src"/> into the directory
 /// <paramref name="path"/>, saving it as "<paramref name="name"/>.jpg".
 /// If the directory does not exist, an error is printed, the method waits
 /// for a key press and returns without downloading.
 /// </summary>
 /// <param name="web">Client used to perform the download.</param>
 /// <param name="src">Address of the image to download.</param>
 /// <param name="path">Existing directory that receives the file.</param>
 /// <param name="name">Number used as the file name (without extension).</param>
 public static void HttpGetImg(WebClient web, string src,string path,int name)
 {
     Console.WriteLine("爬取图片:" + src);

     if (!Directory.Exists(path))
     {
         Console.WriteLine("路径错误!");
         Console.ReadKey();
         return;
     }

     string fileName = name + ".jpg";

     // Path.Combine inserts the directory separator when path lacks a trailing one;
     // plain concatenation would write "<dir><name>.jpg" next to the directory instead.
     web.DownloadFile(src, Path.Combine(path, fileName));

     Console.WriteLine("爬取图片成功:" + fileName);
 }

3，控制台调用

 /// <summary>
 /// Console entry point: crawls the images of a fixed page into a fixed local
 /// directory and then waits for a key press before exiting.
 /// </summary>
 /// <param name="args">Command-line arguments (unused).</param>
 static void Main(string[] args)
 {
     string url= "https://www.xxxxxx.com/";

     // Target directory; the trailing separator is kept so the path also works
     // with code that appends the file name by plain concatenation.
     string path = @"D:\word 资料\img\冬天\";

     // NOTE(review): the starting file number was missing from the published
     // listing; 1 is an assumed value (files become 1.jpg, 2.jpg, ...) — confirm.
     HttpHelper.HttpGetAction(url,path,1);

     Console.ReadKey();
 }

效果图：

一个简单的C#爬虫程序就完成了。如有错误的地方，还望大神指点。

原文来自：一个简单的C#程序-曾亚平个人博客

码农公寓

相关文章