我们可以在Java程序中模拟浏览器发送请求,把网页数据抓取下来。具体方法是在请求中设置header和cookie,下面是一个例子:
public class NetConnection { public static final int MAX_HOTWORDS_FILE_SIZE = 256 * 1024; public static void main(String[] args) { send("http://tuan.aibang.com/shenzhen/new_2033549.html"); } static void send(String _url){ HttpClient http = new HttpClient(); http.getHttpConnectionManager().getParams().setConnectionTimeout( 100000); GetMethod get = new GetMethod(_url); get.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 100000); get.setFollowRedirects(false); InputStream is; String host = "www.aibang.com"; try { get.setRequestHeader("Host",host); get.setRequestHeader( "user-agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.8) Gecko/20100202 Firefox/3.5.8"); get.setRequestHeader( "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); get.setRequestHeader("Accept-Language","zh-cn,zh;q=0.5"); get.setRequestHeader("Accept-Charset","GB2312,utf-8;q=0.7,*;q=0.7"); get.setRequestHeader("Connection","Keep-Alive"); get.setRequestHeader("Cookie","582081171805; cy=1; __utma=205923334.3209590505032285000.1256126987.1269858466.1270605495.11; __utmz=205923334.1256126987.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ano=0snUeoHWygEkAAAAOTAyZjM3ZjQtNjA2NC00NWYzLWIxNWYtMjRlMDliZjAzYTM3jnk83_pKoSEk-9gwcIFx8jFOXVM1; sid=no4tpkmvedoj3gycvfz1s055; lb.dp=100729098.20480.0000; __utmb=205923334.1.10.1270605495; __utmc=205923334"); int er = 0; er = http.executeMethod(get); System.out.println("er="+er); if (er == 200) { try { is = get.getResponseBodyAsStream(); ReadFile2(is); } catch (Exception e) { System.out.println("download error="+e); } } }catch(Exception ex){ ex.printStackTrace(); } } //解析inputStream里的数据 static void ReadFile2(InputStream inputStream){ byte[] buffer = new byte[1024]; StringBuffer bab = new StringBuffer(); int nTotal = 0; try { do{ int nBytesRead = inputStream.read(buffer); System.out.println("nBytesRead="+nBytesRead); if(nBytesRead > 0){ bab.append(new String(buffer,0,nBytesRead)); nTotal += nBytesRead; }else{ 
System.out.println(String.format("download end. file size=%d", nTotal)); break; } }while(nTotal < MAX_HOTWORDS_FILE_SIZE); System.out.println("bab = "+bab); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }
这里需要导入以下几个类(来自 commons-httpclient 3.x):
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;