细节问题参考前两章。
在settings.py中设置图片和数据库
# Scrapy project identity
BOT_NAME = 'houses'
SPIDER_MODULES = ['houses.spiders']
NEWSPIDER_MODULE = 'houses.spiders'

# Crawl politeness / noise control
ROBOTSTXT_OBEY = False
LOG_LEVEL = "WARNING"     # only show warnings and above
DOWNLOAD_DELAY = 3        # seconds between requests

# Item pipelines: lower number runs first (DB insert, then image download)
ITEM_PIPELINES = {
    'houses.pipelines.MysqlPipeline': 100,
    'houses.pipelines.HouseImagePipeline': 200,   # image download pipeline
}

# Image pipeline configuration
IMAGES_STORE = 'images'   # local directory for downloaded images [note]
IMAGES_EXPIRES = 90       # skip re-downloading images newer than 90 days
IMAGES_MIN_HEIGHT = 100   # discard thumbnails smaller than 100x100
IMAGES_MIN_WIDTH = 100

# MySQL connection settings read by MysqlPipeline
MYSQL_DB_HOST = "127.0.0.1"
MYSQL_DB_PORT = 3306      # port
MYSQL_DB_NAME = "spier"
MYSQL_DB_USER = "root"
MYSQL_DB_PASSWORD = "123456"
打开cmd,添加表【数据库】
cmd
C:\Users\admin>mysql -u root -p
mysql> show databases;
mysql> use spier;
mysql> create table HouseInfo(house varchar(255),address varchar(255),price varchar(255),total varchar(255))ENGINE=InnoDB DEFAULT CHARSET=utf8; -- 建表
pipelines.py
from scrapy.http import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
import pymysql
class MysqlPipeline:
    """Persist scraped house items into the MySQL `HouseInfo` table.

    Connection parameters are read from the project settings
    (MYSQL_DB_HOST / PORT / NAME / USER / PASSWORD).
    """

    def open_spider(self, spider):
        """Open one DB connection and cursor for the whole crawl."""
        settings = spider.settings
        host = settings.get("MYSQL_DB_HOST")
        port = settings.get("MYSQL_DB_PORT")
        dbname = settings.get("MYSQL_DB_NAME")
        user = settings.get("MYSQL_DB_USER")
        pwd = settings.get("MYSQL_DB_PASSWORD")
        # charset="utf8" matches the table's DEFAULT CHARSET so Chinese
        # text (楼盘 names, addresses) round-trips correctly.
        self.db_conn = pymysql.connect(
            host=host, port=port, db=dbname,
            user=user, password=pwd, charset="utf8",
        )
        self.db_cur = self.db_conn.cursor()

    def process_item(self, item, spider):
        """Insert one item; returns the item unchanged for later pipelines."""
        values = (
            item["house"],
            item["address"],
            item["price"],
            item["total"],
        )
        # Parameterized query: %s placeholders keep the values escaped.
        sql = "insert into HouseInfo(house,address,price,total) values(%s,%s,%s,%s)"
        self.db_cur.execute(sql, values)
        # Commit per item so a crash mid-crawl does not lose all rows.
        self.db_conn.commit()
        return item

    def close_spider(self, spider):
        """Flush any pending work and release DB resources."""
        self.db_conn.commit()
        self.db_cur.close()
        self.db_conn.close()
class HouseImagePipeline(ImagesPipeline):
    """Download the listing image referenced by item["image_urls"]."""

    def get_media_requests(self, item, info):
        """Schedule image downloads for this item.

        The spider stores a single URL string (or None when the xpath
        matched nothing); also accept a list for the standard
        image_urls convention. Skipping empty values avoids building an
        invalid Request(None) — item_completed will then drop the item.
        """
        urls = item["image_urls"]
        if not urls:
            return
        if isinstance(urls, str):
            urls = [urls]
        for url in urls:
            yield Request(url)

    def item_completed(self, results, item, info):
        """Record the local path of the first downloaded image.

        results - list of (ok, info_dict) pairs, e.g.
          (True, {'url': ..., 'path': 'full/<sha1>.jpg',
                  'checksum': ..., 'status': 'downloaded'})
        Raises DropItem when no image was downloaded successfully.
        """
        image_path = [res['path'] for ok, res in results if ok]
        if not image_path:
            raise DropItem("items contains no images")
        item["image_path"] = image_path[0]
        return item
items.py
import scrapy
class HousesItem(scrapy.Item):
    """Container for one new-home listing scraped from Beike."""
    house = scrapy.Field()       # development (楼盘) name
    address = scrapy.Field()     # address
    price = scrapy.Field()       # average price
    total = scrapy.Field()       # total price
    image_urls = scrapy.Field()  # image URL
    # BUG FIX: was `scrapy.Field` (missing parentheses), so the field was
    # never declared and item["image_path"] = ... raised KeyError.
    image_path = scrapy.Field()  # local path of the downloaded image
beike.py
import scrapy
from houses.items import HousesItem
class ShellnewsSpider(scrapy.Spider):
    """Crawl Beike (Chongqing) new-home listing pages and yield HousesItem."""
    name = 'beike'
    allowed_domains = ['cq.fang.ke.com']
    start_urls = ['http://cq.fang.ke.com/loupan/pg{}'.format(i) for i in range(7, 8)]

    def parse(self, response):
        """Extract one HousesItem per <li> in the listing grid."""
        for row in response.xpath("//ul[@class='resblock-list-wrapper']/li"):
            # Fresh container per listing — must be created inside the loop.
            item = HousesItem()
            # .get() returns None on a miss; `or ""` prevents the original
            # AttributeError from calling .strip() on None.
            item["house"] = (row.xpath("div/div[1]/a/text()").get() or "").strip()
            item["address"] = (row.xpath("div/a[1]/@title").get() or "").strip()
            item["price"] = row.xpath(".//span[@class='number']/text()").get()
            total = row.xpath(".//div[@class='second']/text()").get()
            # Strip the literal "总价" (total price) prefix from the raw text.
            item["total"] = total.replace("总价", "") if total is not None else ""
            item["image_urls"] = row.xpath("a/img/@data-original").get()
            yield item
main.py
from scrapy import cmdline

if __name__ == "__main__":
    # Guarded entry point: launch the spider programmatically (handy for
    # IDE debugging) without triggering a crawl if this module is imported.
    cmdline.execute("scrapy crawl beike".split())
运行
1、可以直接运行main.py【main.py 可以用于调试】
2、在终端控制台,输入命令:
scrapy crawl beike -o 【要保存在某文件中】
scrapy crawl beike -o beike.csv
运行结果: