yelp纽约地区数据爬取。使用feapder爬虫框架

yelp纽约地区数据爬取。使用feapder爬虫框架

本文使用了国内作者的一款feapder轻量级开源爬虫框架。进行yelp站点数据抓取。

第一部分为根据开放api获取店铺信息。

第二部分为根据商店id抓取评论信息。

1.抓包过程

1.抓取商店信息。
通过翻页定位到url
https://www.yelp.com/search/snippet?find_desc=&find_loc=New%20York%2C%20NY&ns=1&start=10&parent_request_id=5fd32b6ecfc91273&request_origin=user
查看参数。修改ns 翻页。在此不推荐这种方法。能找到的数据比较少。大概只有二十多页。二百多个!
推荐使用yelp开放api。网址https://www.yelp.com/developers/v3/manage_app。

2.抓取评论信息。
这一点在开放API中做得并不友好,所以我选择了通过代理IP的方式直接上网站抓。

2.调试代码

1.抓取商店信息。商店id的代码

3.基于框架编写代码

1.抓取商店信息的代码。代码基于feapder框架。
params 参数请参考yelp API文档。yelp开放API文档
feapder 框架文档https://boris-code.gitee.io/feapder/#/
https://www.yelp.com/developers/documentation/v3
self.headers['Authorization'] = {APIkey}
这里的latitude 和longitude 是经纬度。通过经纬度画一个四边形

    def start_requests(self):
        """Sweep a latitude/longitude grid over the New York area and
        yield one Yelp business-search API request per grid cell.

        Grid: 100 latitude steps of 0.002 degrees starting at 40.641,
        50 longitude steps of 0.004 degrees starting at -74.017; each
        cell is queried with a 2500 m radius, up to 50 results.
        """
        base_lat, base_lng = 40.641, -74.017
        for lat_step in range(100):
            for lng_step in range(50):
                query = {
                    'radius': '2500',
                    'term': self.term,
                    'limit': '50',
                    'latitude': base_lat + lat_step / 500,
                    'longitude': base_lng + lng_step / 250,
                }
                # Refresh the API key header for every request.
                self.headers['Authorization'] = self.getAuth()
                yield feapder.Request(url=self.url, params=query, headers=self.headers)

解析函数

    def parse(self, request, response):
        data = response.json.get('businesses')
        if data == []:
            return None
        for i in data:
            item = SpiderDataItem()
            item.table_name = 'apiapp_shop_info'
            item.term = self.term
            item.store_id = i.get('id')
            item.alias = i.get('alias')
            item.name = i.get('name')
            item.image_url = i.get('image_url')
            item.is_closed = i.get('is_closed')
            item.url = i.get('url')
            item.rating = i.get('rating')
            item.coordinates = i.get('coordinates')
            item.transactions = i.get('transactions')
            item.location = i.get('location')
            item.display_phone = i.get('display_phone')
            item.phone = i.get('phone')
            item.distance = i.get('distance')
            item.price = i.get('price')
            item.review_count = i.get('review_count')
            item.categories = i.get('categories')
            yield item

下载通道

class SpiderDataItem(feapder.Item):
    """
    Item holding one Yelp business record destined for the
    apiapp_shop_info table.

    This class was generated by feapder.
    command: feapder create -i spider_data.
    """

    def __init__(self, *args, **kwargs):
        # NOTE(review): the original never called the base Item
        # initialiser; added so feapder can set up its internal state —
        # confirm against the feapder Item implementation.
        super().__init__(*args, **kwargs)
        self.term = None            # search term used to find the shop
        self.store_id = None        # Yelp business id
        self.alias = None
        self.name = None            # was missing: parse() sets item.name
        self.image_url = None
        self.is_closed = None
        self.url = None
        self.rating = None
        self.coordinates = None
        self.transactions = None
        self.location = None
        self.phone = None
        self.distance = None
        self.display_phone = None
        self.price = None
        self.review_count = None
        self.categories = None

2.评论部分。这里没有使用官方api。使用了代理ip
代理使用熊猫代理,购买地址见其"动态代理"页面。
这里写了一个增量爬虫。我是两个脚本一起开抓的数据。大家根据实际需要修改。

    def start_requests(self):
        """Incrementally seed review-feed requests for every shop in
        apiapp_shop_info that has not been scraped yet.

        Resumes from the number of stores that already have reviews in
        apiapp_reviews_info, pages through the shop table 10 rows at a
        time, and loops until at least 10000 stores have reviews.
        """
        # Resume offset: one GROUP BY row per store already scraped.
        SQL = 'SELECT COUNT(store_id) FROM apiapp_reviews_info GROUP BY store_id'
        reviewsCount = len(self.db.find(SQL, ))
        limit_i = reviewsCount
        while 1:
            SQL = 'SELECT COUNT(1) FROM apiapp_shop_info'
            dataCount = self.db.find(SQL, )[0][0]
            while limit_i < dataCount:
                # MySQL LIMIT offset, row_count — the original passed
                # limit_i + 10 as row_count, fetching an ever-growing
                # window instead of a fixed page of 10 rows.
                SQL = 'SELECT * from apiapp_shop_info  LIMIT {},{}'.format(limit_i, 10)
                data = self.db.find(SQL, limit=10, to_json=True)
                limit_i += 10
                for row in data:
                    store_id = row.get('store_id')
                    print(store_id)
                    # First review page (start=0); parser_one schedules
                    # the remaining pages.
                    url = 'https://www.yelp.co.uk/biz/{}/review_feed?rl=en&q=&sort_by=date_asc&start='.format(store_id) + '0'
                    yield feapder.Request(url=url, headers=self.headers, callback=self.parser_one, store_id=store_id)
            # Re-check progress; stop once enough stores are covered.
            SQL = 'SELECT COUNT(store_id) FROM apiapp_reviews_info GROUP BY store_id'
            reviewsCount = len(self.db.find(SQL, ))
            limit_i = dataCount
            if reviewsCount >= 10000:
                break

这里需要使用代理ip。所以多一个download_midware函数。

    def download_midware(self, request):
        """Route every outgoing request through the rotating proxy
        gateway (Panda proxy) for both HTTP and HTTPS traffic."""
        proxy_url = 'http://' + 'dynamic.xiongmaodaili.com:8088'
        request.proxies = {"https": proxy_url, "http": proxy_url}
        return request

解析器
这里分了两块。第一次抓取需要判断抓多少页。后续的抓取不需要启动新任务。

    def parse_no_noe(self, request, response):
        """Parse a follow-up review-feed page (start > 0) for a store.

        Yields one reviewSpiderDataItem (apiapp_reviews_info table) per
        review. Yields nothing when the page has no reviews or the
        'reviews' key is absent.
        """
        reviews = response.json.get('reviews')
        store_id = request.store_id
        # Guard before len()/iteration: a missing 'reviews' key returns
        # None and the original crashed on print(len(None)).
        if not reviews:
            return
        print(len(reviews))
        for rev in reviews:
            print(rev)
            item = reviewSpiderDataItem()
            item.table_name = 'apiapp_reviews_info'
            item.reviews_id = rev.get('id')
            item.store_id = store_id
            item.term = self.term

            item.userId = rev.get('userId')
            item.business = rev.get('business')
            item.user = rev.get('user')
            item.comment = rev.get('comment')
            item.localizedDate = rev.get('localizedDate')
            item.localizedDateVisited = rev.get('localizedDateVisited')
            item.rating = rev.get('rating')
            item.photos = rev.get('photos')
            item.lightboxMediaItems = rev.get('lightboxMediaItems')
            item.photosUrl = rev.get('photosUrl')
            item.totalPhotos = rev.get('totalPhotos')
            item.feedback = rev.get('feedback')
            item.isUpdated = rev.get('isUpdated')
            item.businessOwnerReplies = rev.get('businessOwnerReplies')
            item.appreciatedBy = rev.get('appreciatedBy')
            item.previousReviews = rev.get('previousReviews')
            item.tags = rev.get('tags')
            yield item

    def parser_one(self, request, response):
        """Parse the FIRST review-feed page for a store, yield its
        reviews, and schedule requests for the remaining pages
        (10 reviews per page) via parse_no_noe.
        """
        print(response.json)
        store_id = request.store_id
        print(store_id)
        # Total review count for the first listed language.
        # NOTE(review): assumes 'reviewLanguages' is present and
        # non-empty — a missing/empty value raises here; confirm the
        # endpoint always returns it.
        num = response.json.get('reviewLanguages')[0].get('count')
        print(num)
        if num >= 50:
            num = 40  # cap at 50 reviews total: 40 more via paging + the 10 on this first page (original comment: "take 50 if over 50")
        num = num // 10  # number of ADDITIONAL pages to fetch beyond this one

        data = response.json.get('reviews')
        for i in data:
            print(i)
            item = reviewSpiderDataItem()
            item.table_name = 'apiapp_reviews_info'
            item.reviews_id = i.get('id')
            item.store_id = store_id
            item.term = self.term

            item.userId = i.get('userId')
            item.business = i.get('business')
            item.user = i.get('user')
            item.comment = i.get('comment')
            item.localizedDate = i.get('localizedDate')
            item.localizedDateVisited = i.get('localizedDateVisited')
            item.rating = i.get('rating')
            item.photos = i.get('photos')
            item.lightboxMediaItems = i.get('lightboxMediaItems')
            item.photosUrl = i.get('photosUrl')
            item.totalPhotos = i.get('totalPhotos')
            item.feedback = i.get('feedback')
            item.isUpdated = i.get('isUpdated')
            item.businessOwnerReplies = i.get('businessOwnerReplies')
            item.appreciatedBy = i.get('appreciatedBy')
            item.previousReviews = i.get('previousReviews')
            item.tags = i.get('tags')
            yield item

        # Schedule the remaining pages: start=10, 20, ... num*10.
        for i in range(1, num + 1):
            i = i * 10
            print(i)
            url = 'https://www.yelp.co.uk/biz/{}/review_feed?rl=en&q=&sort_by=date_asc&start='.format(store_id) + str(i)
            yield feapder.Request(url=url, headers=self.headers, callback=self.parse_no_noe, store_id=store_id,
                                  random_user_agent=True)


下载通道

class reviewSpiderDataItem(feapder.Item):
    """
    Item holding one Yelp review record destined for the
    apiapp_reviews_info table.

    This class was generated by feapder.
    command: feapder create -i spider_data.
    """

    def __init__(self, *args, **kwargs):
        # NOTE(review): the original never called the base Item
        # initialiser; added so feapder can set up its internal state —
        # confirm against the feapder Item implementation.
        super().__init__(*args, **kwargs)
        self.store_id = None    # Yelp business id the review belongs to
        self.term = None        # search term used to find the shop
        self.reviews_id = None  # Yelp review id

        self.userId = None
        self.business = None
        self.user = None
        self.comment = None
        self.localizedDate = None
        self.localizedDateVisited = None
        self.rating = None
        self.photos = None
        self.lightboxMediaItems = None
        self.photosUrl = None
        self.totalPhotos = None
        self.feedback = None
        self.isUpdated = None
        self.businessOwnerReplies = None
        self.appreciatedBy = None
        self.previousReviews = None
        self.tags = None

4.本文代码。数据集下载

资源上传正在审核。审核通过后更新

上一篇:剑指offer:数据流中的中位数


下一篇:Flink之Transform操作