1.获取网页源码函数
/// <summary>
/// Crawls the paged route list starting at <paramref name="url"/>, passing each
/// downloaded page to <c>procLine</c>, and afterwards deletes the rows of table
/// 线路 whose 更新时间 differs from the timestamp of this run.
/// </summary>
/// <param name="url">
/// Base address of the list. "&amp;page=N" is appended to it, so it must already
/// contain a query string.
/// </param>
/// <returns>Always an empty string; the results are written to the database and the log.</returns>
string getPageText(string url)
{
    string retVal = "";
    dtUpdate = DateTime.Now; // run timestamp; procLine stamps every row it writes with it
    labMessage.Text = "";
    int pageNo = 1;
    bool crawlCompleted = true;

    do
    {
        try
        {
            using (var wc = new WebClient())
            {
                wc.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
                wc.Encoding = Encoding.UTF8;
                string strUrl = url + "&page=" + pageNo;
                log("打开页:" + strUrl + "<br/>");
                string str = wc.DownloadString(strUrl);
                str = str.Replace("<script", "");
                // procLine returns the number of the next page, or 0 when there is none.
                pageNo = procLine(str);
            }
        }
        catch (Exception ee)
        {
            log("获取网页时错误:" + ee.Message);
            // Stop here: pageNo would otherwise keep its value and the same
            // failing page would be requested again in an endless loop.
            crawlCompleted = false;
            pageNo = 0;
        }
    } while (pageNo > 0);

    if (!crawlCompleted)
    {
        // Rows that were not refreshed may only be missing because the crawl
        // was cut short, so they must not be treated as stale.
        return retVal;
    }

    using (SqlConnection conn = new SqlConnection(strConn))
    {
        try
        {
            conn.Open();
            string strSql = "delete from 线路 where 更新时间<>@更新时间";
            using (SqlCommand cmd = new SqlCommand(strSql, conn))
            {
                cmd.Parameters.Add("@更新时间", System.Data.SqlDbType.DateTime).Value = dtUpdate;
                int sl = cmd.ExecuteNonQuery();
                if (sl > 0)
                {
                    log("删除了未更新的线路:" + sl + "条。");
                }
            }
        }
        catch (Exception ee)
        {
            log("错误:" + ee.Message);
        }
    }

    return retVal;
}
2.处理所获得的源码
/// <summary>
/// Parses one page of the route list: every route link found is written to table
/// 线路 (update, or insert when no row was updated), then the pager text is read to
/// decide whether another page follows.
/// </summary>
/// <param name="strHtml">HTML source of the page.</param>
/// <returns>The number of the next page to fetch, or 0 when no further page was detected.</returns>
int procLine(string strHtml)
{
    int retVal = 0;
    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(strHtml);
    HtmlNode node = htmlDoc.DocumentNode;

    // Route links look like:
    // <a href="/LineDetailQuery?lineId=1&direction=2&">1路(老福山花园站→博览城市场中路站)</a>
    HtmlNodeCollection has = node.SelectNodes("//a");
    if (has != null)
    {
        foreach (HtmlNode hn in has)
        {
            if (!hn.Attributes.Contains("href"))
            {
                continue;
            }

            string href = hn.Attributes["href"].Value;
            if (!href.StartsWith("/LineDetailQuery?lineId=", StringComparison.Ordinal))
            {
                continue;
            }

            // Tokens: [0] path, [1] "lineId", [2] id, ..., [5] direction value.
            // NOTE(review): index 5 and the 7-token minimum match an href whose
            // separators are written as "&amp;" — confirm against the live page.
            string[] ss = href.Split(new char[] { '?', '=', '&', ';' }, StringSplitOptions.RemoveEmptyEntries);
            if (ss.Length <= 6)
            {
                continue;
            }

            int lineId;
            int dir;
            if (!int.TryParse(ss[2], out lineId) || !int.TryParse(ss[5], out dir))
            {
                // Do not store a row built from a zero placeholder id/direction.
                log("无法解析线路链接:" + href + "<br/>");
                continue;
            }

            // Link text is "name(from→to)"; skip anything that does not split into three parts.
            string[] lineInfo = hn.InnerText.Split(new char[] { '(', '→', ')' }, StringSplitOptions.RemoveEmptyEntries);
            if (lineInfo.Length < 3)
            {
                log("无法解析线路名称:" + hn.InnerText + "<br/>");
                continue;
            }

            string lineName = lineInfo[0];
            string fromStop = lineInfo[1];
            string toStop = lineInfo[2];
            int bh = lineId + dir * 100000; // row key combining route id and direction
            string str = "lineId={0}, dir={1}, lineName={2}, from={3}, to={4} <br/>";
            log(string.Format(str, lineId, dir, lineName, fromStop, toStop));

            // Write the route to the database.
            using (SqlConnection conn = new SqlConnection(strConn))
            {
                try
                {
                    conn.Open();
                    string strSql = "update 线路 set 线路号=@线路号, 行向=@行向, 线路名称=@线路名称, 始发站点=@始发站点, 开往方向=@开往方向, 更新时间=@更新时间 where 编号=@编号";
                    using (SqlCommand cmd = new SqlCommand(strSql, conn))
                    {
                        cmd.Parameters.Add("@线路号", System.Data.SqlDbType.Int).Value = lineId;
                        cmd.Parameters.Add("@行向", System.Data.SqlDbType.Int).Value = dir;
                        cmd.Parameters.Add("@线路名称", System.Data.SqlDbType.NVarChar, 50).Value = lineName;
                        cmd.Parameters.Add("@始发站点", System.Data.SqlDbType.NVarChar, 50).Value = fromStop;
                        cmd.Parameters.Add("@开往方向", System.Data.SqlDbType.NVarChar, 50).Value = toStop;
                        cmd.Parameters.Add("@更新时间", System.Data.SqlDbType.DateTime).Value = dtUpdate;
                        cmd.Parameters.Add("@编号", System.Data.SqlDbType.Int).Value = bh;
                        if (cmd.ExecuteNonQuery() == 0)
                        {
                            // No existing row was updated: insert with the same parameters.
                            cmd.CommandText = "insert into 线路(编号, 线路号, 行向, 线路名称, 始发站点, 开往方向, 更新时间)values(@编号, @线路号,@行向,@线路名称,@始发站点,@开往方向,@更新时间)";
                            cmd.ExecuteNonQuery();
                        }
                    }
                }
                catch (Exception ee)
                {
                    log("更新线路到数据库时错误:" + ee.Message);
                }
            }
        }
    }

    // Pager looks like: <font style="font-size:13px;">[2/26]</font>
    HtmlNodeCollection hasPage = node.SelectNodes("//font");
    if (hasPage != null)
    {
        foreach (HtmlNode hn in hasPage)
        {
            if (!hn.Attributes.Contains("style") || hn.Attributes["style"].Value != "font-size:13px;")
            {
                continue;
            }

            string[] ss = hn.InnerText.Split(new char[] { '[', '/', ']' }, StringSplitOptions.RemoveEmptyEntries);
            if (ss.Length <= 1)
            {
                continue;
            }

            int pageNo = 0;
            int pageTotal = 0;
            int.TryParse(ss[0], out pageNo);
            int.TryParse(ss[1], out pageTotal);
            if (pageTotal > 0)
            {
                if (pageNo < pageTotal)
                {
                    log("=================第" + pageNo + "页/共" + pageTotal + "页===========<br/><br/>");
                    retVal = pageNo + 1;
                }
                else
                {
                    log("==================最后一页===============");
                }
            }
        }
    }

    return retVal;
}
注:HtmlDocument、HtmlNode、HtmlNodeCollection 三个类来自第三方类库 HtmlAgilityPack,使用前需要在项目中引用该类库文件(HtmlAgilityPack.dll)。该文件在博客园的后台文件管理中。