import re
import time
from datetime import datetime
from urllib import parse

import requests
from scrapy import Selector

from models import *
# Accumulators for discovered listing URLs (not referenced elsewhere in this chunk).
store_list_urls = []
product_list_urls = []

# Site root; relative hrefs extracted from pages are joined against this.
domain = "http://www.91jf.com/"
# Paginated merchant directory; a page number is appended after "page=".
store_domain = "http://www.91jf.com/default.php?act=corp&sort=list&page="
#函数用来保存写入测试文本 def write_txt(html_data): f = open("a.txt", 'w') f.write(html_data) f.close()
def get_nodes_json(): left_menu_text = requests.get("http://www.91jf.com/").text #write_txt(left_menu_text) #etree.HTML(res0.text) sel = Selector(text=left_menu_text) all_divs = sel.xpath("//div[@class='class_child_li']//a[@href]").extract() if all_divs: nodes_lists = [] for i in range(len(all_divs)): nodes_str = all_divs[i] nodes_str = nodes_str.replace("&","&") # 此处&由于被转义成&导致需要重新进行处理 nodes_lists.append(nodes_str) return nodes_lists return []
url_list_names = [] def process_nodes_list(nodes_list): #将js的格式提取出url到list中 for item in nodes_list: #此处为对应的url数据 url = re.search('\".*\d\"', item) url = url.group(0).replace("\"", "") url = parse.urljoin(domain,url) #此处为url对应的商品标签 name = re.search('<span>.*</span>',item) name = name.group(0).replace("<span>","") name = name.replace("</span>","") url_list_name = [url,name] # 系列商品链接 ,商品系列名字 url_list_names.append(url_list_name) return url_list_names
def get_level1_list(nodes_list): level1_url = [] #将js的格式提取出url到list中 for item in nodes_list: #此处为对应的url数据 url = re.search('\".*\d\"', item) url = url.group(0).replace("\"", "") url1 = parse.urljoin(domain,url + "&okey=salenum&order=desc&page=1") level1_url.append(url1) return level1_url
def get_last_urls(): #获取最终需要抓取的url url_list = [] nodes_list = get_nodes_json() #url_names = process_nodes_list(nodes_list) level1_url = get_level1_list(nodes_list) # 所有系列商品对应的第一页url for url in level1_url: #print(url) parse_product(url) url_list,store_id_list = parse_data_last(url) #url_list.extend(parse_data_last(url)) return url_list
def parse_product(url):
    """Scrape one sales-sorted listing page and persist its products.

    For each item: extract name/price/sales/merchant fields, skip items
    whose price is hidden or whose sales are below 1, and upsert through
    the peewee ``Product`` model. Recurses into the next page while items
    still show sales.
    """
    res_text = requests.get(url).text
    print(url)
    sel = Selector(text=res_text)
    res_li = sel.xpath("//div[@class='pro_list_div g-clearfix c']/ul//li[@class='goods_offset']")
    flag_num = 0  # sales of the last item parsed; gates pagination below
    for item in res_li:
        name = ''.join(item.xpath("./div[@class='row row-2 title']/a/text()").extract())
        price = ''.join(item.xpath('./div[@id="goods_detail_b"]/div[@class="row row-1"]/div[@class="g_price fm2"]/strong/text()').extract())
        try:
            price = float(price)
        except ValueError:  # was a bare except; price is always str here
            # Price hidden: members-only or "ask the merchant" — skip item.
            print("价格会员可见|价格请咨询商家")
            continue
        sales_num = ''.join(item.xpath("./div[@id='goods_detail_b']/div[2]/p[1]/text()").extract())
        sales_num = int(sales_num.split('销量:')[1])
        flag_num = sales_num
        if sales_num < 1:
            continue
        merchant = ''.join(item.xpath("./div[@id='goods_detail_b']/div[2]/p[2]/text()").extract())
        main_Products = ''.join(item.xpath("./div[@id='goods_detail_b']/div[2]/p[3]/text()").extract())
        merchant_Place = ''.join(item.xpath("./div[@id='goods_detail_b']/div[2]/p[4]/text()").extract())
        product = Product()
        product.name = name
        product.price = price
        product.sales_num = sales_num
        product.merchant = merchant
        product.main_Products = main_Products
        product.merchant_Place = merchant_Place
        # Upsert keyed on name: update when present, insert otherwise.
        # NOTE(review): save() on a fresh instance inserts unless a pk is
        # set — confirm this matches the intended "update" path.
        existed_name = Product.select().where(Product.name == product.name)
        if existed_name:
            product.save()
        else:
            product.save(force_insert=True)
    next_page = sel.xpath("//*[@class='pagination2']/a[@href]").extract()
    if len(next_page) > 2 and flag_num > 0:
        url_next = re.search(r'\".*\d\"', next_page[-1]).group()
        # Bug fix: was replace("&", "&") (a no-op) — extracted hrefs
        # escape "&" as "&amp;".
        url_next = url_next.replace("&amp;", "&").replace("\"", "")
        url_next = parse.urljoin(domain, url_next)
        parse_product(url_next)
# From one listing page, visit every product and follow the pagination.
def parse_data_last(url):
    """Collect product detail URLs and store ids from a listing page.

    For each item: build the product detail URL, scrape its store id
    (``parse_store_data``) and attributes (``parse_product_data``), then
    recurse into the next listing page while items still show sales.

    Returns:
        (url_list, store_id_list) — product detail URLs and the matching
        store ids, including those gathered from subsequent pages.
    """
    url_list = []
    store_id_list = []
    flag_num = 0  # sales of the last item parsed; gates pagination below
    res_text = requests.get(url).text
    sel = Selector(text=res_text)
    res_li = sel.xpath("//div[@class='pro_list_div g-clearfix c']/ul//li[@class='goods_offset']")
    for item in res_li:
        sales_num = ''.join(item.xpath("./div[@id='goods_detail_b']/div[2]/p[1]/text()").extract())
        sales_num = int(sales_num.split('销量:')[1])
        flag_num = int(sales_num)
        data = item.xpath("./div[@class='pro_pic_box']/a").extract()
        data = re.search(r'\".*\d\"', data[0]).group()
        # Bug fix: was replace("&", "&") (a no-op) — hrefs escape "&" as "&amp;".
        data = data.replace("&amp;", "&").replace("\"", "")
        data_url = parse.urljoin(domain, data)  # sales-ranked product detail url
        print("开始获取商品:{}".format(data_url))
        store_id = parse_store_data(data_url)
        store_id_list.append(store_id)
        parse_product_data(data_url)
        url_list.append(data_url)
    # Follow the "next page" link of the listing.
    next_page = sel.xpath("//*[@class='pagination2']/a[@href]").extract()
    if len(next_page) > 2 and flag_num > 0:
        url_next = re.search(r'\".*\d\"', next_page[-1]).group()
        url_next = url_next.replace("&amp;", "&").replace("\"", "")
        url_next = parse.urljoin(domain, url_next)
        # Bug fix: the recursive results were previously discarded, so
        # pagination contributed nothing to the returned lists.
        sub_urls, sub_ids = parse_data_last(url_next)
        url_list.extend(sub_urls)
        store_id_list.extend(sub_ids)
    return url_list, store_id_list
# Scrape one product detail page and persist its attribute record.
def parse_product_data(url):
    """Compute a base price from the spec table, read the attribute list,
    fetch purchase stats from the eval-list endpoint, and upsert a
    ``Product_attributes`` row keyed by the product id.

    Skips pages whose price is "ask the merchant" (is_price != 0) or that
    have no is_price input at all.
    """
    product_id = url.split('id=')[1]  # id reused for the ajax request below
    res_text = requests.get(url).text
    sel = Selector(text=res_text)
    # is_price value == '0' means the price is public; otherwise the buyer
    # must ask the merchant and there is nothing numeric to record.
    Is_price = sel.xpath("//input[contains(@id,'is_price')]").extract()
    print(Is_price)
    if len(Is_price) < 1:
        print("页面数据为空")
    else:
        is_value = re.search(r'\d', Is_price[0])
        if is_value.group() == '0':
            datas = sel.xpath("//div[contains(@class,'show_all')]/table[contains(@class,'goods_spec_list')]//tr")
            price_base = 0.0
            for row in datas:
                cell = row.xpath("./input[3]").extract()
                value_attr = re.search(r'value=\".*\"', cell[0]).group()
                price = re.search(r'\d.*\d', value_attr).group()
                price_base = price_base + float(price)
            # Bug fix: guard against an empty spec table (ZeroDivisionError).
            if datas:
                price_base = price_base / len(datas)  # mean spec-row price
            # Product description attributes.
            attributes_list = sel.xpath("//span[contains(@class,'attributes-list')]//li/text()").extract()
            str_attributes = ' '.join(attributes_list)
            # Bug fix: was a no-op replace(" ", " "). &nbsp; decodes to
            # U+00A0, which is presumably what was meant — TODO confirm.
            str_attributes = str_attributes.replace("\xa0", " ")
            # Purchase statistics come from a separate ajax endpoint.
            url_sales = parse.urljoin(domain, 'default.php?act=evallist')
            data = {
                'id': product_id,
                'page': '0',
                'info_type': 'sale'
            }
            payload = requests.post(url_sales, data=data).json()  # parse once
            buyer_num = payload.get("member")          # number of buyers
            sale_num = payload.get('num')              # units sold
            buyer_rate = payload.get('re_buyer_rate')  # repeat-buyer rate
            product_id = int(product_id)
            product_attributes = Product_attributes()
            product_attributes.product_id = product_id
            product_attributes.price_base = price_base
            product_attributes.attributes = str_attributes
            product_attributes.buyer_num = buyer_num
            product_attributes.sale_num = sale_num
            product_attributes.buyer_rate = buyer_rate
            # Upsert keyed on product_id: update when present, else insert.
            existed_id = Product_attributes.select().where(Product_attributes.product_id == product_id)
            if existed_id:
                product_attributes.save()
            else:
                product_attributes.save(force_insert=True)
        else:
            price = "价格请咨询商家"  # price on request; nothing persisted
# Resolve the merchant (store) id from a single product detail page.
def parse_store_data(url):
    """Return the store id advertised on a product detail page.

    Returns 0 when the page is empty or the price is ask-the-merchant
    (is_price != 0), in which case the id is not extracted.
    """
    res_text = requests.get(url).text
    sel = Selector(text=res_text)
    store_id = 0
    # is_price value == '0' means the price is public (no need to ask seller).
    Is_price = sel.xpath("//input[contains(@id,'is_price')]").extract()
    if len(Is_price) < 1:
        print("页面数据为空")
    else:
        is_value = re.search(r'\d', Is_price[0])
        if is_value.group() == '0':
            anchor = ''.join(sel.xpath('//span[@class="container_title_span"]/a[@href]').extract())
            # The href carries storeid=<digits>" — pull out the digits.
            marker = re.search(r'storeid=\d*\"', anchor).group()
            store_id = int(marker.split('storeid=')[1].replace("\"", ""))
    return store_id
#获取所有商户id def parse_store_id(url): print(url) # 打印当前商户详情页的url用来定位 store_id_list = [] res_text = requests.get(url).text sel = Selector(text=res_text) res_li = sel.xpath("//div[contains(@class ,'corp_list')]//div[@class='supply-list']") for item in res_li: store_id = item.xpath(".//a[contains(@class,'supply-left-tltle')]").extract() store_id = ''.join(store_id) store_id = re.search('storeid=\d*\"',store_id) store_id = store_id.group() store_id = store_id.split('storeid=')[1] store_id = store_id.replace("\"","") store_id = int(store_id) # 获取店铺id store_id_list.append(store_id) #此处代码用来切到下一页链接数据,商户的详情排布页 next_page = sel.xpath("//*[@class='pagination2']/a[@href][last()]/text()").extract() next_page = ''.join(next_page) try: next_page = int(next_page) except: url_next = sel.xpath("//*[@class='pagination2']/a[@href][last()]").extract() url_next = ''.join(url_next) url_next = re.search('\".*\d\"',url_next) url_next = url_next.group().replace("&","&") # 此处&由于被转义成&导致需要重新进行处理 url_next = url_next.replace("\"","") url_next = parse.urljoin(domain,url_next) parse_store_id(url_next) return store_id_list
def get_last_store_id(): #获取最终需要抓取的店铺id,传回拼接之后的url store_id_list = parse_store_id(store_domain) pass
if __name__ == "__main__": start_time = datetime.now() last_urls = get_last_urls() end_time = datetime.now() ''' for url in last_urls: #parse_product_data(url) #print("开始获取商品:{}".format(url)) '''