C# 网页图片爬虫的几种技术基础

一、文件流方式获取网络图片资源

方法1

// Fetches an image over HTTP (here a QR code rendered by a web service) and shows it in a PictureBox.
// NOTE(review): the two format arguments were missing from the published snippet; the values
// below are placeholders — confirm the intended size/content before use.
int size = 5;
string content = "123456";
string url = string.Format(@"http://webservice.36wu.com/DimensionalCodeService.asmx/GetCodeImgByString?size={0}&content={1}", size, System.Uri.EscapeDataString(content));
System.Net.WebRequest webreq = System.Net.WebRequest.Create(url);
using (System.Net.WebResponse webres = webreq.GetResponse())
using (System.IO.Stream stream = webres.GetResponseStream())
using (Image downloaded = Image.FromStream(stream))
{
    // An Image created by FromStream needs its stream kept open for the image's lifetime.
    // Copying into a new Bitmap detaches the picture from the stream, so the response
    // and the stream can be released here without invalidating what the control shows.
    pictureBox1.Image = new Bitmap(downloaded);
}

方法2

生成图片的URL假设是这样:http://localhost/administrator/qrcode.aspx?pid=78

qrcode.aspx.cs 中生成图片的部分代码:

// Renders a QR code for a URL into an in-memory PNG and sends it to the browser as a download.
// NOTE(review): the numeric literals (bitmap size and WriteBar coordinates) were missing from
// the published snippet; the 200x200 canvas drawn from the origin is an assumption — confirm.
const int codeWidth = 200;
const int codeHeight = 200;

// The using statements release the GDI+ objects and the buffer even when drawing or writing fails.
using (Image image = new Bitmap(codeWidth, codeHeight))
using (Graphics g = Graphics.FromImage(image))
using (System.IO.MemoryStream ms = new System.IO.MemoryStream())
{
    string url = "http://localhost";
    DotNetBarcode bc = new DotNetBarcode();
    bc.Type = DotNetBarcode.Types.QRCode;
    bc.PrintCheckDigitChar = true;
    // presumably (data, x, y, width, height, graphics) — verify against the DotNetBarcode library.
    bc.WriteBar(url, 0, 0, codeWidth, codeHeight, g);

    image.Save(ms, System.Drawing.Imaging.ImageFormat.Png);

    Response.ClearContent();
    // To show the picture inline instead of downloading it, set ContentType to "image/Png"
    // and omit the Content-Disposition header.
    Response.ContentType = "application/octet-stream";
    Response.AddHeader("Content-Disposition", "attachment; filename=" + HttpUtility.UrlEncode("qrcode.png", System.Text.Encoding.UTF8));
    Response.BinaryWrite(ms.ToArray());
}

或者这样

// Sends a file stored on the server to the browser as a download.
string fileName = "aaa.txt"; // file name offered to the client
string filePath = Server.MapPath("DownLoad/aaa.txt"); // physical path of the file on the server

// File.ReadAllBytes opens the file read-only, reads it completely and closes the handle even
// when reading fails; a single Stream.Read call is not guaranteed to fill the whole buffer.
byte[] bytes = System.IO.File.ReadAllBytes(filePath);

Response.ContentType = "application/octet-stream";
// Tell the browser to download the file instead of opening it.
Response.AddHeader("Content-Disposition", "attachment; filename=" + HttpUtility.UrlEncode(fileName, System.Text.Encoding.UTF8));
Response.BinaryWrite(bytes);
Response.Flush();
Response.End();

二、WebClient方式从服务器上下载文件

参考方法1:

/// <summary>
/// Downloads the file at <paramref name="url"/> into <paramref name="dir"/>, keeping the file
/// name taken from the URL. A file that already exists under that name is left untouched.
/// Failures are logged and swallowed, so the call is best effort.
/// </summary>
/// <param name="url">Absolute address of the file to download.</param>
/// <param name="dir">Directory to save into; it is created when it does not exist.</param>
public void DownloadUrlFile(string url, string dir)
{
    try
    {
        string fileName = Path.GetFileName(url); // name of the file being downloaded
        // Path.Combine supplies the separator when dir has no trailing one.
        string path = Path.Combine(dir, fileName);

        // CreateDirectory is a no-op when the directory already exists.
        System.IO.Directory.CreateDirectory(dir);

        if (System.IO.File.Exists(path))
        {
            return;
        }

        using (WebClient client = new WebClient())
        {
            client.DownloadFile(url, path);
        }
    }
    catch (Exception ex)
    {
        // Still best effort, but leave a trace instead of hiding the failure completely.
        System.Diagnostics.Trace.TraceError("DownloadUrlFile failed for '{0}': {1}", url, ex);
    }
}

参考方法2 [2]

<%@ Page Language="C#" AutoEventWireup="true" CodeBehind="GetPictureByUrl.aspx.cs" Inherits="HoverTreeMobile.GetPictureByUrl" %>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>根据网址把图片下载到服务器 - 何问起</title>
</head>
<body>
<%-- Single server-side form: a URL text box, a download button, an image control and a message area. --%>
<form id="form1" runat="server">
<div>
<%-- textBoxImgUrl holds the address of the picture to fetch; it is pre-filled with a sample URL. --%>
图片网址:<br /><asp:TextBox runat="server" ID="textBoxImgUrl" Width="500" Text="http://hovertree.com/hvtimg/201508/cnvkv745.jpg" />
<%-- Clicking btnImg posts back and runs the btnImg_Click handler declared in the code-behind class. --%>
<br /> <asp:Button runat="server" ID="btnImg" Text="下载" OnClick="btnImg_Click" />
<%-- hvtImg is the image control on this page; no ImageUrl is set in the markup. --%>
<br /><asp:Image runat="server" ID="hvtImg" />
<br />
<%-- ltlTips is a Literal placeholder for text written by the code-behind. --%>
<asp:Literal runat="server" ID="ltlTips" />
</div>
</form>
</body>
</html>

页面所对应的代码

using System;

namespace HoverTreeMobile
{
    /// <summary>
    /// Code-behind of GetPictureByUrl.aspx: downloads the picture whose address was typed into
    /// <c>textBoxImgUrl</c> to the site's /hovertreeimages/ folder and shows it in <c>hvtImg</c>.
    /// </summary>
    public partial class GetPictureByUrl : System.Web.UI.Page
    {
        protected void Page_Load(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Click handler of the download button. Rejects addresses that do not end in
        /// .jpg, .gif or .png; otherwise saves the file under a generated name and displays it.
        /// </summary>
        /// <param name="sender">The control that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        protected void btnImg_Click(object sender, EventArgs e)
        {
            try
            {
                string imageUrl = textBoxImgUrl.Text;

                // Anything that is not one of the accepted picture formats is refused,
                // e.g. http://hovertree.com/hvtart/bjae/t2lo8pf7.htm is an htm page, not a picture.
                if (!HasAllowedImageExtension(imageUrl))
                {
                    ltlTips.Text = "输入的不是指定格式的图片的网址";
                    return;
                }

                // Generate a random file name for the picture.
                // presumably GetLastStr(text, 4) returns the last four characters, i.e. the
                // extension — verify against the HoverTreeFrame source.
                string m_picFileName = HoverTree.HoverTreeFrame.Utils.GetHoverTreeString()
                    + HoverTree.HoverTreeFrame.HoverString.GetLastStr(imageUrl, 4);
                string m_keleyiPicture = Server.MapPath("/hovertreeimages/" + m_picFileName);

                // NOTE(review): the address is user input and is fetched by the server as-is;
                // consider restricting the allowed hosts before exposing this page publicly.
                using (System.Net.WebClient m_hvtWebClient = new System.Net.WebClient())
                {
                    // Download the file from the given address.
                    m_hvtWebClient.DownloadFile(imageUrl, m_keleyiPicture);
                }

                hvtImg.ImageUrl = "/hovertreeimages/" + m_picFileName;
                ltlTips.Text = string.Empty;
            }
            catch (Exception ex)
            {
                // NOTE(review): ex.ToString() shows the full stack trace to the visitor;
                // consider ex.Message plus server-side logging.
                ltlTips.Text = ex.ToString();
            }
        }

        /// <summary>
        /// Tells whether <paramref name="address"/> ends in one of the accepted picture
        /// extensions, ignoring letter case.
        /// </summary>
        private static bool HasAllowedImageExtension(string address)
        {
            return address.EndsWith(".jpg", StringComparison.OrdinalIgnoreCase)
                || address.EndsWith(".gif", StringComparison.OrdinalIgnoreCase)
                || address.EndsWith(".png", StringComparison.OrdinalIgnoreCase);
        }
    }
}

// Generate a random file name for the picture: the string returned by GetHoverTreeString()
// followed by the result of GetLastStr(text, 4) (presumably the last four characters, i.e. the extension — verify).
string m_picFileName = HoverTree.HoverTreeFrame.Utils.GetHoverTreeString()+ HoverTree.HoverTreeFrame.HoverString.GetLastStr(textBoxImgUrl.Text,4);
以上代码中 GetHoverTreeString 和 GetLastStr 的具体实现,请下载源代码查看;部分内容可到 LINK 查看。

HoverTree 开源项目:新增根据网址把图片下载到服务器功能

请看 HoverTreeMobile 项目,http://hovertree.com,何问起,源代码下载 LINK。

三、网页相关的方式

方法1:

   /// <summary>
   /// Page that relays a picture to the browser as a download: the picture name comes from the
   /// "InternalSysURL" query-string value, the bytes are fetched from a configured URL prefix
   /// plus that name, and the result is written to the response as an attachment.
   /// </summary>
   public partial class DownLoadFile : System.Web.UI.Page
   {
       protected void Page_Load(object sender, EventArgs e)
       {
           string picName = Request.QueryString["InternalSysURL"];
           if (!String.IsNullOrEmpty(picName))
           {
               byte[] content = this.GetImageContent(picName);
               this.WriteResponse(picName, content);
           }
       }

       #region
       /// <summary>
       /// Downloads the picture named <paramref name="picName"/> from the configured URL prefix.
       /// </summary>
       /// <param name="picName">Picture name appended to the URL prefix.</param>
       /// <returns>The complete response body.</returns>
       private byte[] GetImageContent(string picName)
       {
           // NOTE(review): picName is taken from the query string and appended unvalidated;
           // consider checking it before building the request address.
           string fileURL = GetImgUrlPrefix() + picName;

           HttpWebRequest request = (HttpWebRequest)WebRequest.Create(fileURL);
           request.AllowAutoRedirect = true;

           WebProxy proxy = new WebProxy();
           proxy.BypassProxyOnLocal = true;
           proxy.UseDefaultCredentials = true;
           request.Proxy = proxy;

           // The response is disposed together with its stream so the connection is released.
           using (WebResponse response = request.GetResponse())
           using (Stream stream = response.GetResponseStream())
           using (MemoryStream ms = new MemoryStream())
           {
               // Stream.CopyTo replaces the hand-written read/write loop, whose buffer size
               // and offsets were missing from the published snippet.
               stream.CopyTo(ms);
               return ms.ToArray();
           }
       }

       /// <summary>
       /// Writes <paramref name="content"/> to the response as a downloadable attachment.
       /// </summary>
       /// <param name="picName">File name offered to the browser.</param>
       /// <param name="content">Bytes to send.</param>
       private void WriteResponse(string picName, byte[] content)
       {
           Response.Clear();
           Response.ClearHeaders();
           Response.Buffer = false;
           Response.ContentType = "application/octet-stream";
           Response.AppendHeader("Content-Disposition", "attachment;filename=" + HttpUtility.UrlEncode(picName, Encoding.Default));
           Response.AppendHeader("Content-Length", content.Length.ToString());
           Response.BinaryWrite(content);
           Response.Flush();
           Response.End();
       }

       /// <summary>
       /// Reads the URL prefix from the first "ProductImageOriginal" element of ImageDownLoad.xml.
       /// </summary>
       /// <returns>The value of that element's first child node, or an empty string when the
       /// element is absent or has no child node.</returns>
       private static string GetImgUrlPrefix()
       {
           XmlDocument xmlDoc = new XmlDocument();
           xmlDoc.Load(AppDomain.CurrentDomain.BaseDirectory + "//Pages//ItemMaintain//ImageDownLoad.xml");
           XmlNodeList nodes = xmlDoc.GetElementsByTagName("ProductImageOriginal");

           // NOTE(review): the indices were missing from the published snippet; the first
           // element and its first child node are assumed.
           if (nodes.Count > 0 && nodes[0].HasChildNodes)
           {
               return nodes[0].ChildNodes[0].Value;
           }

           return "";
       }
       #endregion
   }

方法2[3]

根据URL请求获取页面HTML代码

    /// <summary>
    /// Requests <paramref name="url"/> and returns the response body as text.
    /// </summary>
    /// <param name="url">Address to request; null or empty yields an empty string without any request.</param>
    /// <param name="encoding">"UTF8" decodes the body as UTF-8; every other value uses <see cref="Encoding.Default"/>.</param>
    /// <returns>The response body, or an empty string when no address was given.</returns>
    public static string GetHtmlStr(string url, string encoding)
    {
        if (String.IsNullOrEmpty(url))
        {
            return "";
        }

        Encoding ec = encoding == "UTF8" ? Encoding.UTF8 : Encoding.Default;

        WebRequest request = WebRequest.Create(url);

        // The using statements close the reader, the stream and the response even when
        // reading throws, so a failed request does not leak its resources.
        using (WebResponse response = request.GetResponse())
        using (Stream datastream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(datastream, ec))
        {
            return reader.ReadToEnd();
        }
    }

下载网站图片

    /// <summary>
    /// Downloads the picture at <paramref name="picUrl"/> into the "File" directory under the
    /// application base, using a generated file name.
    /// </summary>
    /// <param name="picUrl">Address of the picture; null or empty skips the download.</param>
    /// <returns>
    /// The generated file name (always ending in ".jpeg"), or an empty string when no address
    /// was given or the download failed.
    /// </returns>
    public string SaveAsWebImg(string picUrl)
    {
        if (String.IsNullOrEmpty(picUrl))
        {
            return "";
        }

        try
        {
            string directory = System.IO.Path.Combine(AppDomain.CurrentDomain.SetupInformation.ApplicationBase, "File");
            // Make sure the target directory exists; a no-op when it already does.
            System.IO.Directory.CreateDirectory(directory);

            // A time-seeded Random can yield the same number for calls made in quick
            // succession; seeding from a Guid keeps the generated names distinct.
            Random rd = new Random(Guid.NewGuid().GetHashCode());
            DateTime nowTime = DateTime.Now;
            string fileName = nowTime.Month.ToString() + nowTime.Day.ToString() + nowTime.Hour.ToString() + nowTime.Minute.ToString() + nowTime.Second.ToString() + rd.Next(1000, 1000000) + ".jpeg";

            using (WebClient webClient = new WebClient())
            {
                webClient.DownloadFile(picUrl, System.IO.Path.Combine(directory, fileName));
            }

            return fileName;
        }
        catch (Exception ex)
        {
            // Best effort as before (callers get an empty string), but the failure is traced.
            System.Diagnostics.Trace.TraceError("SaveAsWebImg failed for '{0}': {1}", picUrl, ex);
            return "";
        }
    }

参考文章

1. C# 通过URL获取图片并显示在PictureBox上的方法

2. 根据网址把图片下载到服务器C#代码

3. C#获取网页的HTML码、下载网站图片

4. C#如何通过URL下载图片?

上一篇:java入门 -- 异常处理


下一篇:java中获取系统属性以及环境变量