最近研究了下如何抓取为知笔记的内容,在抓取笔记里的图片内容时,老是提示403错误,用Chrome的开发者工具看了下:
这里的Cookie来自两个域,估计为知那边是验证了token(登录后才能获取到token)
下载图片的代码:
// Downloads one image referenced by a WizNote document and saves it under "Images/".
// The request carries the login token as a cookie; without it the author observed HTTP 403.
var path = "https://note.wiz.cn/" + str.TrimStart('/');
var extension = Path.GetExtension(path);
// Tick count of the current time is used as the file name.
var filepath = AppPath.Combine("Images/" + DateTime.Now.Ticks + extension);

const string userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36";
const string accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
const string acceptLanguage = "zh-CN,zh;q=0.8";

// Cookie jar built for the CookieContainer experiment discussed in the surrounding text.
// It is not attached to the request below (see the note next to the Cookie header).
var cookieContainer = new CookieContainer();
var cookie = new Cookie
{
    Name = "token",
    Value = Token,
    Domain = ".wiz.cn" // cookie domain
};
cookieContainer.Add(cookie);

string[] cookiesArr = txtCookie.Text.Split(';');
foreach (string s in cookiesArr)
{
    // Split on the first '=' only, so values that themselves contain '=' are kept intact.
    string[] keyValuePair = s.Split(new[] { '=' }, 2);
    if (keyValuePair.Length > 1)
    {
        cookie = new Cookie
        {
            Name = keyValuePair[0].Trim(),
            Value = keyValuePair[1].Trim(),
            Domain = "note.wiz.cn" // cookie domain
        };
        cookieContainer.Add(cookie);
    }
}

var newUri = new Uri(path);
var webRequest = (HttpWebRequest)WebRequest.Create(newUri);
webRequest.Timeout = 20000; // milliseconds
// Deliberately left disabled: webRequest.CookieContainer = cookieContainer;
// (with the container attached, the author found the token cookie was not sent)
webRequest.UserAgent = userAgent;
webRequest.Accept = accept;
webRequest.Headers["Accept-Language"] = acceptLanguage;
// The original sent the Accept-Encoding value as Accept-Charset as well; that header is dropped.
// Let the framework negotiate compression and decode the body, so that Image.FromStream
// always receives raw image bytes.
webRequest.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
webRequest.KeepAlive = true;
webRequest.Headers["Cache-Control"] = "no-cache";
webRequest.Headers["Upgrade-Insecure-Requests"] = "1";
webRequest.Headers["Pragma"] = "no-cache";
// TODO: the cookie has to be set through the raw header; is CookieContainer unusable here?
webRequest.Headers["Cookie"] = "token=" + Token + ";" + txtCookie.Text.Trim();
webRequest.Referer = newUri.AbsoluteUri;

// 'using' releases the response, the stream and the image even when decoding or saving throws.
// The stream has to stay open for as long as the Image created from it is in use.
using (var rsp = (HttpWebResponse)webRequest.GetResponse())
using (Stream stream = rsp.GetResponseStream())
using (Image image = Image.FromStream(stream))
{
    image.Save(filepath);
}
奇怪的是:用 webRequest.CookieContainer = cookieContainer; 来跟cookie赋值,token参数总是赋不上,
后面改为:webRequest.Headers["Cookie"] = "token=" + Token + ";" + txtCookie.Text.Trim(); 就可以了,
CookieContainer 不是支持多个域的cookie吗,难道跨域Cookie只能用webRequest.Headers["Cookie"]这样赋值吗? 没弄明白,有知道的童鞋请不吝赐教。