HttpClient: A Hands-On Multi-Page Web Crawler (SouFang as an Example)

Reference: http://blog.csdn.net/qy20115549/article/details/52912532

1. Creating the database tables

# Table for storing the crawled url addresses
create table soufang_address
(
id varchar(255),
title varchar(255),
url varchar(255),
craw_time varchar(255)
);

# Table for storing the listing details
create table soufang_content
(
id varchar(255),
title varchar(255),
publishtime varchar(255),
price varchar(255),
housetype varchar(255),
acreage varchar(255),
useacreage varchar(255),
years varchar(255),
orientation varchar(255),
floor varchar(255),
structure varchar(255),
decoration varchar(255),
type varchar(255),
buildingtype varchar(255),
propertyright varchar(255),
estate varchar(255),
school varchar(255),
facilities varchar(255)
);

2. Entity classes

/httpClient2/src/main/java/model/Address.java

package model;

public class Address {
    // address id, used as a map key to de-duplicate urls
    private String addr_id;
    // the listing url
    private String addr_url;
    // the name associated with the url
    private String title;
    // crawl time, generated with the JDK's Date
    private String craw_time;

    public String getAddr_id() { return addr_id; }
    public void setAddr_id(String addr_id) { this.addr_id = addr_id; }
    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }
    public String getAddr_url() { return addr_url; }
    public void setAddr_url(String addr_url) { this.addr_url = addr_url; }
    public String getCraw_time() { return craw_time; }
    public void setCraw_time(String craw_time) { this.craw_time = craw_time; }
}

/httpClient2/src/main/java/model/Contents.java

package model;

public class Contents {
    private String id;
    private String title;
    private String publishtime;
    private String price;
    private String housetype;
    private String acreage;
    private String useacreage;
    private String years;
    private String orientation;
    private String floor;
    private String structure;
    private String decoration;
    private String type;
    private String buildingtype;
    private String propertyright;
    private String estate;
    private String school;
    private String facilities;

    public String getId() { return id; }
    public void setId(String id) { this.id = id; }
    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }
    public String getPublishtime() { return publishtime; }
    public void setPublishtime(String publishtime) { this.publishtime = publishtime; }
    public String getPrice() { return price; }
    public void setPrice(String price) { this.price = price; }
    public String getHousetype() { return housetype; }
    public void setHousetype(String housetype) { this.housetype = housetype; }
    public String getAcreage() { return acreage; }
    public void setAcreage(String acreage) { this.acreage = acreage; }
    public String getUseacreage() { return useacreage; }
    public void setUseacreage(String useacreage) { this.useacreage = useacreage; }
    public String getYears() { return years; }
    public void setYears(String years) { this.years = years; }
    public String getOrientation() { return orientation; }
    public void setOrientation(String orientation) { this.orientation = orientation; }
    public String getFloor() { return floor; }
    public void setFloor(String floor) { this.floor = floor; }
    public String getStructure() { return structure; }
    public void setStructure(String structure) { this.structure = structure; }
    public String getDecoration() { return decoration; }
    public void setDecoration(String decoration) { this.decoration = decoration; }
    public String getType() { return type; }
    public void setType(String type) { this.type = type; }
    public String getBuildingtype() { return buildingtype; }
    public void setBuildingtype(String buildingtype) { this.buildingtype = buildingtype; }
    public String getPropertyright() { return propertyright; }
    public void setPropertyright(String propertyright) { this.propertyright = propertyright; }
    public String getEstate() { return estate; }
    public void setEstate(String estate) { this.estate = estate; }
    public String getSchool() { return school; }
    public void setSchool(String school) { this.school = school; }
    public String getFacilities() { return facilities; }
    public void setFacilities(String facilities) { this.facilities = facilities; }
}

3. Utility and parsing classes

/httpClient2/src/main/java/util/HTTPUtils.java

package util;

import java.io.IOException;

import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.message.BasicHttpResponse;
/*
 * Executes the HTTP request.
 */
public class HTTPUtils {
    public static HttpResponse getRawHtml(HttpClient client, String personalUrl) {
        // fetch the response (the html) with a GET request
        HttpGet getMethod = new HttpGet(personalUrl);
        HttpResponse response = new BasicHttpResponse(HttpVersion.HTTP_1_1,
                HttpStatus.SC_OK, "OK");
        try {
            // execute the GET method
            response = client.execute(getMethod);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // getMethod.abort();
        }
        return response;
    }
}
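As written, getRawHtml relies on the client's default timeouts, so a stalled connection can block the whole crawl. A minimal usage sketch that sets explicit timeouts on the legacy 4.x client before calling the helper (TimeoutDemo and the 5000/10000 ms values are illustrative choices, not part of the original code):

import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.HttpConnectionParams;
import util.HTTPUtils;

public class TimeoutDemo {
    public static void main(String[] args) {
        HttpClient client = new DefaultHttpClient();
        // connect timeout: upper bound on establishing the TCP connection
        HttpConnectionParams.setConnectionTimeout(client.getParams(), 5000);
        // socket timeout: upper bound on waiting for response data
        HttpConnectionParams.setSoTimeout(client.getParams(), 10000);
        HttpResponse response = HTTPUtils.getRawHtml(client, "http://esf.hf.fang.com/");
        System.out.println(response.getStatusLine());
    }
}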

/httpClient2/src/main/java/util/SouFangAddressFecter.java

package util;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpResponse;
import org.apache.http.ParseException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.util.EntityUtils;

import parse.SouFangAddressParser;
import model.Address;

public class SouFangAddressFecter {
    public static List<Address> htmlGet(HttpClient client, String url)
            throws Exception, IOException {
        List<Address> addressInfo = new ArrayList<Address>();
        HttpResponse response = HTTPUtils.getRawHtml(client, url);
        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode == 200) {
            // EntityUtils.toString(response.getEntity(), "UTF-8") always
            // produced mojibake here, because the response body is
            // gzip-compressed; decompress first, then decode as gb2312
            GzipDecompressingEntity zipRes = new GzipDecompressingEntity(
                    response.getEntity());
            String s = EntityUtils.toString(zipRes, "gb2312");
            // parse the entity content
            addressInfo = SouFangAddressParser.getdata(s);
            EntityUtils.consume(response.getEntity());
        } else {
            // close the HttpEntity stream
            EntityUtils.consume(response.getEntity());
        }
        return addressInfo;
    }
}
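The hard-coded GzipDecompressingEntity will break if the server ever answers without gzip compression. A defensive variant checks the Content-Encoding header first; a sketch of how the body of htmlGet could branch (same imports as above, plus org.apache.http.Header and org.apache.http.HttpEntity):

HttpEntity entity = response.getEntity();
Header encoding = entity.getContentEncoding();
String s;
if (encoding != null && "gzip".equalsIgnoreCase(encoding.getValue())) {
    // body is compressed: unwrap it before decoding
    s = EntityUtils.toString(new GzipDecompressingEntity(entity), "gb2312");
} else {
    // body arrived uncompressed
    s = EntityUtils.toString(entity, "gb2312");
}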

/httpClient2/src/main/java/parse/SouFangAddressParser.java

package parse;

import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import model.Address;

public class SouFangAddressParser {
    public static List<Address> getdata(String entity) throws IOException {
        List<Address> addresses = new ArrayList<Address>();
        // parse the html
        Document doc = Jsoup.parse(entity);
        // read the total page count from the pager
        int sumPages = Integer.parseInt(doc.select("div[class=fanye gray6]")
                .select("span[class=txt]").text().replaceAll("\\D", ""));
        // cap at 2 pages for this demo; drop this line to crawl every page
        sumPages = 2;
        // this site repeats some listings across pages (the first and second
        // page share entries), so a map guards against duplicate ids
        Map<String, Integer> keyMap = new HashMap<String, Integer>();
        for (int i = 1; i <= sumPages; i++) {
            // each page's url; paging is plain string concatenation
            String everyPageUrl = "http://esf.hf.fang.com/house/i3" + i;
            // fetch each page directly with Jsoup
            Document document = Jsoup.connect(everyPageUrl).timeout(50000)
                    .userAgent("bbbb").get();
            // jsoup selector using a regular expression on the id attribute
            Elements elements = document.select("dl[id~=list_D03_?]");
            // iterate over each listing block
            for (Element ele : elements) {
                String id = ele.select("dd[class=info rel floatr]").select("p")
                        .select("a").attr("href").replaceAll("/chushou/", "")
                        .replaceAll(".htm", "");
                if (!keyMap.containsKey(id)) {
                    keyMap.put(id, 1);
                    String url = "http://esf.hf.fang.com/"
                            + ele.select("dd[class=info rel floatr]")
                                    .select("p").select("a").attr("href");
                    String title = ele.select("dd[class=info rel floatr]")
                            .select("p[class=title]").select("a").text();
                    Date date = new Date();
                    DateFormat format = new SimpleDateFormat(
                            "yyyy-MM-dd HH:mm:ss");
                    String craw_time = format.format(date);
                    Address address = new Address();
                    address.setAddr_id(id);
                    address.setAddr_url(url);
                    address.setCraw_time(craw_time);
                    address.setTitle(title);
                    addresses.add(address);
                }
            }
        }
        return addresses;
    }
}
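A Map<String, Integer> works, but since only key membership matters here, a java.util.Set expresses the same de-duplication more directly. A sketch of the replacement (behaviour is identical; add imports for java.util.HashSet and java.util.Set):

// inside getdata, replacing the HashMap bookkeeping
Set<String> seenIds = new HashSet<String>();
for (Element ele : elements) {
    String id = ele.select("dd[class=info rel floatr]").select("p")
            .select("a").attr("href").replaceAll("/chushou/", "")
            .replaceAll(".htm", "");
    // Set.add returns false when the id was already present,
    // so the membership test and the insert collapse into one call
    if (seenIds.add(id)) {
        // build and collect the Address exactly as before
    }
}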

4. Database access

/httpClient2/src/main/java/db/MyDataSource.java

package db;

import javax.sql.DataSource;

import org.apache.commons.dbcp2.BasicDataSource;
/*
 * Database configuration, using a dbcp2 connection pool.
 */
public class MyDataSource {
    public static DataSource getDataSource(String connectURI) {
        BasicDataSource ds = new BasicDataSource();
        // MySQL jdbc driver
        ds.setDriverClassName("com.mysql.jdbc.Driver");
        // MySQL user name
        ds.setUsername("root");
        // MySQL password
        ds.setPassword("123456");
        ds.setUrl(connectURI);
        return ds;
    }
}
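BasicDataSource also exposes pool-sizing knobs that the article leaves at their defaults. A sketch of tuning the pool and verifying a connection can be borrowed (PoolDemo and the sizes are illustrative, not from the original):

package db;

import java.sql.Connection;
import java.sql.SQLException;
import org.apache.commons.dbcp2.BasicDataSource;

public class PoolDemo {
    public static void main(String[] args) throws SQLException {
        BasicDataSource ds = (BasicDataSource) MyDataSource
                .getDataSource("jdbc:mysql://127.0.0.1:3306/moviedata");
        ds.setInitialSize(2); // connections opened eagerly at startup
        ds.setMaxTotal(8);    // cap on concurrent connections
        ds.setValidationQuery("select 1");
        try (Connection conn = ds.getConnection()) {
            System.out.println("pool ok: " + !conn.isClosed());
        }
    }
}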

/httpClient2/src/main/java/db/MYSQLControl.java

package db;

import java.sql.SQLException;
import java.util.List;

import javax.sql.DataSource;

import model.Address;
import model.Contents;

import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.BeanListHandler;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class MYSQLControl {
    static final Log logger = LogFactory.getLog(MYSQLControl.class);
    // change the connection url to match your own database
    static DataSource ds = MyDataSource
            .getDataSource("jdbc:mysql://127.0.0.1:3306/moviedata");
    static QueryRunner qr = new QueryRunner(ds);

    /*
     * Query method.
     */
    public static <T> List<T> getListInfoBySQL(String sql, Class<T> type) {
        List<T> list = null;
        try {
            list = qr.query(sql, new BeanListHandler<T>(type));
        } catch (Exception e) {
            logger.error(e);
        }
        return list;
    }

    /*
     * Insert rows into soufang_address.
     */
    public static void executeAddressUpdate(List<Address> addresses) {
        // one Object[] per row, one slot per column
        Object[][] params = new Object[addresses.size()][4];
        for (int i = 0; i < params.length; i++) {
            params[i][0] = addresses.get(i).getAddr_id();
            params[i][1] = addresses.get(i).getTitle();
            params[i][2] = addresses.get(i).getAddr_url();
            params[i][3] = addresses.get(i).getCraw_time();
        }
        try {
            qr.batch("insert into soufang_address (id, title, url, craw_time) "
                    + "values (?,?,?,?)", params);
        } catch (Exception e) {
            logger.error(e);
        }
    }

    /*
     * Insert the detailed listing information into soufang_content.
     */
    public static void executeContentInsert(List<Contents> contentinfo)
            throws SQLException {
        Object[][] params = new Object[contentinfo.size()][18];
        for (int i = 0; i < params.length; i++) {
            params[i][0] = contentinfo.get(i).getId();
            params[i][1] = contentinfo.get(i).getTitle();
            params[i][2] = contentinfo.get(i).getPublishtime();
            params[i][3] = contentinfo.get(i).getPrice();
            params[i][4] = contentinfo.get(i).getHousetype();
            params[i][5] = contentinfo.get(i).getAcreage();
            params[i][6] = contentinfo.get(i).getUseacreage();
            params[i][7] = contentinfo.get(i).getYears();
            params[i][8] = contentinfo.get(i).getOrientation();
            params[i][9] = contentinfo.get(i).getFloor();
            params[i][10] = contentinfo.get(i).getStructure();
            params[i][11] = contentinfo.get(i).getDecoration();
            params[i][12] = contentinfo.get(i).getType();
            params[i][13] = contentinfo.get(i).getBuildingtype();
            params[i][14] = contentinfo.get(i).getPropertyright();
            params[i][15] = contentinfo.get(i).getEstate();
            params[i][16] = contentinfo.get(i).getSchool();
            params[i][17] = contentinfo.get(i).getFacilities();
        }
        try {
            qr.batch(
                    "insert into soufang_content (id, title, publishtime, price, housetype, acreage, useacreage, years, orientation, floor, structure, decoration, type, buildingtype, propertyright, estate, school, facilities) "
                            + "values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                    params);
            System.out.println("Inserted " + contentinfo.size() + " rows.");
        } catch (Exception e) {
            logger.error(e);
        }
    }
}
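getListInfoBySQL pairs with BeanListHandler, which maps result-set columns to bean properties by name. The Address bean names its properties addr_id and addr_url, so the id and url columns have to be aliased in the query. A usage sketch:

List<Address> stored = MYSQLControl.getListInfoBySQL(
        "select id as addr_id, title, url as addr_url, craw_time from soufang_address",
        Address.class);
for (Address a : stored) {
    System.out.println(a.getAddr_id() + "\t" + a.getAddr_url());
}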

5. The main class

/httpClient2/src/main/java/main/SouFangAddress.java

package main;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import model.Address;

import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.DefaultHttpClient;

import db.MYSQLControl;
import util.SouFangAddressFecter;

public class SouFangAddress {
    public static void main(String[] args) throws IOException, Exception {
        HttpClient client = new DefaultHttpClient();
        // seed url
        String _url = "http://esf.hf.fang.com/";
        List<Address> addresses = new ArrayList<Address>();
        // crawl the listing urls
        addresses = SouFangAddressFecter.htmlGet(client, _url);
        // insert the crawled data into the database
        MYSQLControl.executeAddressUpdate(addresses);
    }
}
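One detail the main method leaves out: the legacy DefaultHttpClient holds a connection pool until it is released. A small addition at the end of main (this uses the pre-4.3 API that the rest of the article is written against):

// release the underlying connection pool once the crawl is done
client.getConnectionManager().shutdown();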

6. Summary

1. How the crawler pages through results: SouFang's page addresses follow a regular pattern, so paging is simply a matter of concatenating the page number onto the url.
2. How a crawler handles duplicate primary keys: crawlers frequently run into pinned posts that appear on every page. This article keeps a map of ids and skips any id it has already seen.
3. Content-Encoding: gzip. The response body is compressed, so the stream has to be decompressed before decoding:
   GzipDecompressingEntity zipRes = new GzipDecompressingEntity(response.getEntity());
   String s = EntityUtils.toString(zipRes, "gb2312");
4. This article only crawls the listing links and stores them in the database. To get each listing's details, read the links back from the database one by one and parse the corresponding pages; see the sketch after this list.
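A sketch of point 4, and only a sketch: the "div.price" selector below is a placeholder, since the real class names depend on the detail page's layout. The loop reads the stored links back with the aliased query from earlier and parses each page with Jsoup:

List<Address> links = MYSQLControl.getListInfoBySQL(
        "select id as addr_id, title, url as addr_url, craw_time from soufang_address",
        Address.class);
for (Address link : links) {
    Document detail = Jsoup.connect(link.getAddr_url()).timeout(50000).get();
    // placeholder selector, not taken from the real page
    String price = detail.select("div.price").text();
    // ... fill a Contents bean and batch-insert via executeContentInsert
}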
