yelp纽约地区数据爬取。使用feapder爬虫框架
本文使用了国内作者的一款feapder轻量级开源爬虫框架。进行yelp站点数据抓取。
第一部分为根据开放api获取店铺信息。
第二部分为根据商店id抓取评论信息。
1.抓包过程
1.抓取商店信息。
通过翻页定位到url
https://www.yelp.com/search/snippet?find_desc=&find_loc=New%20York%2C%20NY&ns=1&start=10&parent_request_id=5fd32b6ecfc91273&request_origin=user
查看参数。修改ns 翻页。在此不推荐这种方法。能找到的数据比较少。大概只有二十多页。二百多个!
推荐使用yelp开放api。网址https://www.yelp.com/developers/v3/manage_app。
2.抓取评论信息。
这一点在开放api中做的并不友好。所以我选择了通过代理ip的方式直接上网站抓。
2.调试代码
1.抓取商店信息。商店id的代码
3.基于框架编写代码
1.抓取商店信息的代码。代码基于feapder框架。
params 参数请参考yelp API文档。yelp开放API文档
feapder 框架文档https://boris-code.gitee.io/feapder/#/
https://www.yelp.com/developers/documentation/v3
self.headers['Authorization'] = {APIkey}
这里的latitude 和longitude 是经纬度。通过经纬度画一个四边形
def start_requests(self):
    """Seed one Yelp business-search request per point of a lat/lon grid.

    Walks a 100 x 50 rectangle starting at (40.641, -74.017), stepping
    0.002 degrees in latitude and 0.004 degrees in longitude, so the
    2500 m search radius tiles the target area.
    """
    for row in range(100):
        for col in range(50):
            query = {
                'radius': '2500',
                'term': self.term,
                'limit': '50',
                'latitude': 40.641 + row / 500,
                'longitude': -74.017 + col / 250,
            }
            # Refresh the Authorization header on every request so a
            # rotated API key is always picked up.
            self.headers['Authorization'] = self.getAuth()
            yield feapder.Request(url=self.url, params=query, headers=self.headers)
解析函数
def parse(self, request, response):
    """Parse one Yelp business-search API response into shop items.

    Yields one SpiderDataItem per business in the response; yields
    nothing when the payload carries no businesses.
    """
    data = response.json.get('businesses')
    # `if not data` covers both an empty list and a missing key (None).
    # The original `data == []` check let None fall through and crash
    # on `for i in None`.
    if not data:
        return None
    for business in data:
        item = SpiderDataItem()
        item.table_name = 'apiapp_shop_info'  # destination table
        item.term = self.term  # search keyword this shop was found under
        item.store_id = business.get('id')
        item.alias = business.get('alias')
        item.name = business.get('name')
        item.image_url = business.get('image_url')
        item.is_closed = business.get('is_closed')
        item.url = business.get('url')
        item.rating = business.get('rating')
        item.coordinates = business.get('coordinates')
        item.transactions = business.get('transactions')
        item.location = business.get('location')
        item.display_phone = business.get('display_phone')
        item.phone = business.get('phone')
        item.distance = business.get('distance')
        item.price = business.get('price')
        item.review_count = business.get('review_count')
        item.categories = business.get('categories')
        yield item
下载通道
class SpiderDataItem(feapder.Item):
    """Shop-info item generated by feapder.

    command: feapder create -i spider_data.
    One instance per business returned by the Yelp search API;
    persisted to the `apiapp_shop_info` table.
    """
    def __init__(self, *args, **kwargs):
        self.term = None        # search keyword
        self.store_id = None    # Yelp business id
        self.alias = None
        self.name = None        # added: parse() sets item.name but it was not declared here
        self.image_url = None
        self.is_closed = None
        self.url = None
        self.rating = None
        self.coordinates = None
        self.transactions = None
        self.location = None
        self.phone = None
        self.distance = None
        self.display_phone = None
        self.price = None
        self.review_count = None
        self.categories = None
2.评论部分。这里没有使用官方api。使用了代理ip
熊猫代理代理购买地址动态代理
这里写了一个增量爬虫。我是两个脚本一起开抓的数据。大家根据实际需要修改。
def start_requests(self):
    """Incrementally seed review-feed requests for every stored shop.

    Resumes from the number of shops that already have reviews, then
    pages through apiapp_shop_info ten rows at a time, yielding the
    first review-feed request for each shop. Stops once 10000 shops
    have reviews.
    """
    # Count of distinct shops that already have reviews — used as the
    # resume offset so a restart does not refetch finished shops.
    SQL = 'SELECT COUNT(store_id) FROM apiapp_reviews_info GROUP BY store_id'
    reviewsCount = len(self.db.find(SQL, ))
    limit_i = reviewsCount
    while 1:
        SQL = 'SELECT COUNT(1) FROM apiapp_shop_info'
        dataCount = self.db.find(SQL, )[0][0]
        while limit_i < dataCount:
            # MySQL LIMIT is `offset, row_count`: fetch exactly 10 rows
            # per batch. (The original formatted the row count as
            # limit_i + 10, asking the DB for an ever-growing batch.)
            SQL = 'SELECT * from apiapp_shop_info LIMIT {},{}'.format(limit_i, 10)
            data = self.db.find(SQL, limit=10, to_json=True)
            limit_i += 10
            for row in data:
                store_id = row.get('store_id')
                print(store_id)
                # First page (start=0); parser_one schedules the rest.
                url = 'https://www.yelp.co.uk/biz/{}/review_feed?rl=en&q=&sort_by=date_asc&start='.format(store_id) + '0'
                yield feapder.Request(url=url, headers=self.headers, callback=self.parser_one, store_id=store_id)
        # Re-check progress before the next sweep over the shop table.
        SQL = 'SELECT COUNT(store_id) FROM apiapp_reviews_info GROUP BY store_id'
        reviewsCount = len(self.db.find(SQL, ))
        limit_i = dataCount
        if reviewsCount >= 10000:
            break
这里需要使用代理ip。所以多一个download_midware函数。
def download_midware(self, request):
    """Route every outgoing request through the rotating proxy.

    Attaches the same proxy endpoint for both schemes and returns the
    request so feapder continues the download with it.
    """
    endpoint = 'http://' + 'dynamic.xiongmaodaili.com:8088'
    request.proxies = {scheme: endpoint for scheme in ("https", "http")}
    return request
解析器
这里分了两块。第一次抓取需要判断抓多少页。后续的抓取不需要启动新任务。
def parse_no_noe(self, request, response):
    """Turn one follow-up review-feed page into review items.

    Yields one reviewSpiderDataItem per review in the response; yields
    nothing for an empty page.
    """
    reviews = response.json.get('reviews')
    store_id = request.store_id
    print(len(reviews))
    if not reviews:
        return None
    # Review fields copied verbatim from the JSON payload, in the
    # same order the table columns are declared.
    passthrough = (
        'userId', 'business', 'user', 'comment', 'localizedDate',
        'localizedDateVisited', 'rating', 'photos', 'lightboxMediaItems',
        'photosUrl', 'totalPhotos', 'feedback', 'isUpdated',
        'businessOwnerReplies', 'appreciatedBy', 'previousReviews', 'tags',
    )
    for review in reviews:
        print(review)
        item = reviewSpiderDataItem()
        item.table_name = 'apiapp_reviews_info'
        item.reviews_id = review.get('id')
        item.store_id = store_id
        item.term = self.term
        for field in passthrough:
            setattr(item, field, review.get(field))
        yield item
def parser_one(self, request, response):
    """Parse the first review page for a shop and fan out follow-ups.

    Reads the shop's total review count from the first page, yields
    items for the reviews on this page, then schedules one extra
    request per remaining 10-review page (handled by parse_no_noe).
    """
    print(response.json)
    store_id = request.store_id
    print(store_id)
    # Total review count for the first listed language.
    # NOTE(review): assumes reviewLanguages is non-empty — an empty
    # list would raise IndexError here; confirm against live responses.
    num = response.json.get('reviewLanguages')[0].get('count')
    print(num)
    if num >= 50:
        num = 40  # cap extra reviews at 40: this page's 10 + 4 more pages = 50 total
    num = num // 10  # number of additional pages to schedule below
    data = response.json.get('reviews')
    for i in data:
        print(i)
        item = reviewSpiderDataItem()
        item.table_name = 'apiapp_reviews_info'
        item.reviews_id = i.get('id')
        item.store_id = store_id
        item.term = self.term
        item.userId = i.get('userId')
        item.business = i.get('business')
        item.user = i.get('user')
        item.comment = i.get('comment')
        item.localizedDate = i.get('localizedDate')
        item.localizedDateVisited = i.get('localizedDateVisited')
        item.rating = i.get('rating')
        item.photos = i.get('photos')
        item.lightboxMediaItems = i.get('lightboxMediaItems')
        item.photosUrl = i.get('photosUrl')
        item.totalPhotos = i.get('totalPhotos')
        item.feedback = i.get('feedback')
        item.isUpdated = i.get('isUpdated')
        item.businessOwnerReplies = i.get('businessOwnerReplies')
        item.appreciatedBy = i.get('appreciatedBy')
        item.previousReviews = i.get('previousReviews')
        item.tags = i.get('tags')
        yield item
    # Schedule the remaining pages: start=10, 20, ... up to num*10.
    for i in range(1, num + 1):
        i = i * 10
        print(i)
        url = 'https://www.yelp.co.uk/biz/{}/review_feed?rl=en&q=&sort_by=date_asc&start='.format(store_id) + str(i)
        yield feapder.Request(url=url, headers=self.headers, callback=self.parse_no_noe, store_id=store_id,
                              random_user_agent=True)
下载通道
class reviewSpiderDataItem(feapder.Item):
    """
    This class was generated by feapder.
    command: feapder create -i spider_data.

    Declares every column persisted to the apiapp_reviews_info table.
    """
    def __init__(self, *args, **kwargs):
        # Default every declared column to None, in table-column order.
        for column in ('store_id', 'term', 'reviews_id', 'userId',
                       'business', 'user', 'comment', 'localizedDate',
                       'localizedDateVisited', 'rating', 'photos',
                       'lightboxMediaItems', 'photosUrl', 'totalPhotos',
                       'feedback', 'isUpdated', 'businessOwnerReplies',
                       'appreciatedBy', 'previousReviews', 'tags'):
            setattr(self, column, None)
4.本文代码。数据集下载
资源上传正在审核。审核通过后更新