python推荐淘宝物美价廉商品 2.0

2021-11-20 22:13:58

改动：

新增功能：可选择只看天猫或淘宝

代码已模块化封装：参数配置与用户输入统一放在单独的 setting.py 文件中管理，main.py 的主函数只负责读取 setting.py 中的参数并传给各功能函数。

main.py代码：

 # -*- coding: utf-8 -*-

 import urllib

 import urllib2

 import requests

 import re

 import time

 import random

 import os

 from math import log

 from math import log10

 from math import sqrt

 import sys

 import setting

 #"pageSize":44,"totalPage":100,"currentPage":3,"totalCount":29561

 # NOTE: per the string below, the following two lines should be commented out
 # when running inside Python's own IDE.
 '''在Python自己IDE上要注释掉以下两行'''

 # Python 2 only: reload sys, then switch the default string encoding to UTF-8.
 reload(sys)

 sys.setdefaultencoding('utf8')  # the default encoding of Python 2.x is ascii

 class counter(object):
     """Shared crawl state: progress tallies, seen URLs and collected results."""

     def __init__(self):
         # Progress tallies, all starting at zero.
         self.count = self.try_time = self.try_find = self.fail_time = 0
         # Running sums of price and description score, used for averages.
         self.priSu = self.descSu = 0
         # Detail URLs already collected, and the result rows themselves.
         self.url_list = []
         self.results = []
         # Cleared by the page scraper when no further new products are found.
         self.new_flag = True
         # Shop filter: '' (no filter), 'tmall' or 'taobao'.
         self.tm_tb = ''

     def print_counter(self):
         """Print the four tallies on a single line."""
         fields = (
             'try_time:', self.try_time,
             "  get_count:", self.count,
             "  fail_time:", self.fail_time,
             "try_find_time:", self.try_find,
         )
         print(' '.join(str(field) for field in fields))

 def recommend_rate(price, description, delivery, service, comments, av_p=None, av_d=None):
     """Compute the recommendation score used to rank one product.

     The score is::

         (description / av_d) ** 20
         * (description + delivery + service)
         * (av_p / price) ** 0.1
         + log(comments + 5, 1000)

     Args:
         price: price of the product; must be non-zero.
         description: description score of the product (absolute value).
         delivery: delivery score of the product.
         service: service score of the product.
         comments: number of comments on the product.
         av_p: average price to compare against. When omitted it is taken
             from the module-level ``counter1`` (``priSu / count``).
         av_d: average description score to compare against. When omitted
             it is taken from ``counter1`` (``descSu / count``).

     Returns:
         The score as a float; higher means more recommended.

     Raises:
         ZeroDivisionError: if ``price`` or ``av_d`` is zero, or if the
             averages are taken from ``counter1`` while its count is zero.
     """
     # float() keeps the division exact even if the sums are still ints.
     if av_p is None:
         av_p = counter1.priSu / float(counter1.count)
     if av_d is None:
         av_d = counter1.descSu / float(counter1.count)

     # Parenthesised instead of backslash continuations: a backslash followed
     # by an empty line ends the statement and breaks the expression.
     rate = (
         (description / float(av_d)) ** 20
         * (description + delivery + service)
         * (av_p / float(price)) ** 0.1
         + log(comments + 5, 1000)
     )
     return rate

 def product_rank(list):
     """Append a recommendation score to every product row, in place.

     Row layout (index): 0 name, 1 picture URL, 2 product URL, 3 price,
     4 comment count, 5 shop name, 6 delivery, 7 description, 8 service.
     The score is appended at the end of each row.
     """
     for row in list:
         price, comment_count = row[3], row[4]
         delivery, description, service = row[6], row[7], row[8]
         row.append(recommend_rate(price, description, delivery, service, comment_count))

 def get_user_rate(item_url):

     #暂时未使用该功能

     '''获取卖家信用情况；未登录情况不能访问，或者需要在头部文件中加入cookie。。。；'''

     html = urllib2.urlopen(item_url)

     #"//rate.taobao.com/user-rate-282f910f3b70f2128abd0ee9170e6428.htm"

     regrex_rate = '"(//.*?user\-rate.*?)"'

     codes =  re.findall(regrex_rate,html.read())

     html.close()

     user_rate_url= 'http:'+codes[0]

     print 'uu', user_rate_url

     user_rate_html = urllib2.urlopen(user_rate_url)

     print user_rate_html.read()

     #title = "4.78589分"

     desc_regex = u'title="(4.[0-9]{5}).*?'

     de_pat = re.compile(desc_regex)

     descs = re.findall(de_pat,user_rate_html.read())

     print len(descs)

     item_url = 'https://item.taobao.com/item.htm?id=530635294653&ns=1&abbucket=0#detail'

 #get_user_rate(item_url)

 '''获取卖家信用情况；未登录情况不能访问。。。暂时 无用'''

 def makeNewdir(savePath):
     """Create a fresh directory for the results and return its path.

     If ``savePath`` already exists, random digits (1-9) are appended to the
     name until an unused path is found, so earlier results are never reused.

     Args:
         savePath: desired directory path.

     Returns:
         The path that was actually chosen. It is returned even when the
         directory could not be created (a message is printed in that case),
         matching the original best-effort behaviour.
     """
     while os.path.exists(savePath):
         savePath = savePath + str(random.randrange(1, 10))

     try:
         os.makedirs(savePath)
     except OSError:
         # Only filesystem errors are expected here; anything else (e.g.
         # KeyboardInterrupt) must not be swallowed.
         print("failed to make file path\nplease restart program")
         print(u'创建文件夹失败，请重新启动程序')
     else:
         print('ok,file_path we reserve results:  %s' % savePath)
         # The label used to be printed on its own with no path after it.
         print(u'保存的路径为：%s' % savePath)
     return savePath

 def get_praised_good(url, file_open, keyword, counts, descripHrequ, servHrequ, descripNrequ):

     #从给定的淘宝链接中 获取符合条件的商品list

     html = req_s.get(url)

     code = html.content

     html.close()

     regrex2 = ur'raw_title":"(.*?)","pic_url":"(.*?)","detail_url":"(.*?)","view_price":"(.*?)".*?"comment_count":"(.*?)".*?"nick":"(.*?)".*?"delivery":\[(.*?),(.*?),(.*?)\],"description":\[(.*?),(.*?),(.*?)\],"service":\[(.*?),(.*?),(.*?)\]'

     #每一个匹配项 返回  15个 字符串

     #x[0]开始为  x0商品名 、x1图片链接、x2商品链接、x3价格、x4评论数、 x5店铺名、 x6快递情况3个、x9描述相符情况3个、x12服务情况3个

     pat = re.compile(regrex2)

     meet_code = re.findall(regrex2, code)#

     if not len(meet_code):

         counter1.new_flag = False

         print 'no more new met products'

     for x in meet_code:

         # if counter1.count>=counts :

         #   print "have get enough pruducts"

         #   break

         counter1.try_find += 1

         description_higher = int(x[10])*float(x[11])/100

         service_higher = int(x[13])*float(x[14])/100

         try:

             x4 = int(x[4]) #description_count

         except:

             x4 = 0

         #如果 只要淘宝 非天猫

         if counter1.tm_tb == 'taobao':

             if counter1.tm_tb not in x[2].split('.'):

                 break

         if  (description_higher>=descripHrequ) and (service_higher>=servHrequ) and x4>=descripNrequ:

             if re.findall(keyword,x[0]) : # 中文keyword在结果中匹配问题暂时没有解决，，直接加在搜索词里吧

                 x0 = x[0].replace(' ','').replace('/','')

                 detail_url = 'http:' + x[2].decode('unicode-escape').encode('utf-8')

                 x1 = 'http:'+ x[1].decode('unicode-escape').encode('utf-8')

                 #print type(x)

                 if detail_url in counter1.url_list  or x4 == 0:

                     counter1.new_flag = False

                     print 'no more new met products'

                     print counter1.url_list

                     print detail_url

                     break

                 counter1.url_list.append(detail_url)

                 counter1.try_time += 1

                 counter1.count += 1

                 x11 = float(x[11])/100

                 x9 = float(x[9])/100

                 x12 = float(x[12])/100

                 x6 = float(x[6])/100

                 x3 = float(x[3])

                 counter1.priSu += x3

                 counter1.descSu += x9

                 x5 = unicode(x[5],'utf-8')

                 result_list = []

                 result_list.append(x0)

                 result_list.append(x1)

                 result_list.append(detail_url)

                 result_list.append(x3)

                 result_list.append(x4)

                 result_list.append(x5)

                 result_list.append(x6)

                 result_list.append(x9)

                 result_list.append(x12)

                 #0开始为  x0商品名 、x1图片链接、x2商品链接、x3价格、x4评论数、 x5店铺名、 x6快递情况、x7描述相符情况、x8服务情况

                 counter1.results.append(result_list)

 def save_downpic(lis,file_open,savePath):

     '''从商品list下载图片到reserve_file_path，并写入信息至fileopen'''

     #0开始为  x0商品名 、x1图片链接、x2商品链接、x3价格、x4评论数、 x5店铺名、 x6快递情况、x7描述相符情况、x8服务情况、x9:rate

     len_list = len(lis)

     print 'we find:',len_list,'products'

     cc = 0

     for x in lis:

         try :

             urllib.urlretrieve(x[1], savePath+'\\%s___'%cc +unicode(x[0],'utf-8')+'.jpg')

             txt_name = savePath+'\\'+ '%s__'%cc+ 'custome_description_%s __'%x[7] +'__comments_%s_'%x[4]+ '___price_%srmb___'%x[3] +x[5] +'.txt'

             file_o = open(txt_name, 'a')

             file_o.write(x[2])

             file_o.close()

             print '\nget_one_possible_fine_goods:\n','good_name:',x[0].decode('utf-8')

             print 'rate=',x[9]

             print 'price:',x[3],x[5].decode('utf-8')

             print 'custome_description:',x[7],'--','described_number:',x[4],'  service:',x[8]

             print x[2].decode('utf-8'),'\ngood_pic_url:',x[1].decode('utf-8')

             print txt_name

             print cc+1,"th"

             file_open.write(u'%s__'%cc \

                         + str(x[0]) \

                         + '\nprice:' \

                         + str(x[3])  \

                         + '￥,\n'    \

                         + str(x[2]) + '  \n' + str(x[5]) + '\ncustomer_description:' + str(x[7]) + 'described_number:' + str(x[4])+'\n\n\n')

             print 'get one -^-'

         except :

             print "failed to down picture or creat txt"

             counter1.fail_time += 1

         cc += 1

         time.sleep(0.5)

 def get_market_totalCount(url):
     """Return the total number of matching products reported by a search page.

     The figure is read from the ``"totalCount":N`` field that follows the
     pageSize/totalPage/currentPage fields in the page source.

     Args:
         url: URL of the search-result page.

     Returns:
         The total count as an int.

     Raises:
         ValueError: if the page does not contain the expected field.
     """
     html = urllib2.urlopen(url)
     try:
         code = html.read()
     finally:
         html.close()

     reg = '"pageSize":[0-9]*?,"totalPage":[0-9]*?,"currentPage":[0-9]*?,"totalCount":([0-9]*?)}'
     found = re.findall(reg, code)
     if not found or not found[0]:
         # Previously surfaced as an opaque IndexError on found[0].
         raise ValueError('totalCount not found in search page: %s' % url)
     return int(found[0])

 #"pageSize":44,"totalPage":100,"currentPage":3,"totalCount":29561

 def get_all_praised_goods(serchProd,counts,savePath ,keyword, price_min=0,price_max=0,descripHrequ =0,servHrequ=0 ,descripNrequ=0):

     #边里搜索结果每一页

     #initial url and page number

     initial_url = 'https://s.taobao.com/search?q='+serchProd + '&_input_charset=utf-8'

     if counter1.tm_tb == 'tmall':

         initial_url = initial_url + '&filter_tianmao=tmall'

     if  price_min:

         if price_min < price_max :

             initial_url = initial_url+'&filter=reserve_price%5B'+'%s'%price_min+'%2C' +'%s'%price_max

     initial_url = initial_url +'&cd=false&%5D&s='

     #tian_mall = 'https://list.tmall.com/search_product.htm?q='

     print "initial_url",initial_url+''

     page_n = 0

     reserve_file = savePath+r'\found_goods.txt'

     file_open = open(reserve_file,'a')

     file_open.write('****************************\n')

     file_open.write(time.ctime())

     file_open.write('\n****************************\n')

     total = get_market_totalCount(initial_url+'')

     print "totalcount",total

     if total>counts*10:

         total = sqrt(total)

     while counter1.new_flag and counter1.try_find<total :

         url_1 = initial_url + str(44*page_n)

         #print initial_url

         print 'url_1:', url_1

         #print 'ss',initial_url+'%s'%(44*page_n)

         page_n += 1

         get_praised_good(url_1,file_open,keyword,counts,descripHrequ,servHrequ ,descripNrequ)

         print "let web network rest for 1s lest  make traffic jams "

         time.sleep(1)

         # except:

         print page_n, "pages have been searched"

         if total < counts :

             print "check keyword,maybe too restrict"

             break

     print url_1

     product_rank(counter1.results)

     counter1.results.sort(key = lambda x : x[9], reverse=True)

     counter1.results = counter1.results[:counts]

     counter1.print_counter()

     save_downpic(counter1.results,file_open,savePath)

     #

     for a in  counter1.results:

         for b in a :

             file_open.write(unicode(str(b),'utf-8'))

             file_open.write('\t')

         file_open.write('\n\n')

     file_open.close()

     counter1.print_counter()

 # Module-wide crawl state shared by every function in this file.
 counter1 = counter()

 market_totalcounts = 0

 # One HTTP session reused for all search-page requests.
 req_s = requests.Session()

 # Retries are configured on the transport adapters. The previous
 # `req_s.adapters.DEFAULT_RETRIES = 3` only set an attribute on the session's
 # adapter mapping and did not configure any retry.
 req_s.mount('http://', requests.adapters.HTTPAdapter(max_retries=3))
 req_s.mount('https://', requests.adapters.HTTPAdapter(max_retries=3))

 # NOTE(review): kept from the original; a Session presumably reuses
 # connections on its own and ignores this attribute - confirm and drop.
 req_s.keep_alive = True

 def main():

     print "说明:".decode('utf-8')

     print '本程序用于在淘宝上搜索商品时主动通过 价格范围、商品描述、服务态度、评论数来筛选商品;\n筛选出来的商品图片下载保存到磁盘（默认桌面新建find_worty_goods文件夹）并建立同序号开头的txt文件，图片显示商品，其旁的txt文件名显示价格等关键信息，txt里保存商品的淘宝链接'.decode('utf-8')  

     if setting.userDefine:      #自己输入 配置参数-筛选要求

         setting.inputPara()

                     #否则  使用setting中的配置参数

     serchProd   = setting.serchProd         #淘宝搜索词

     keyword     = setting.keyword               #raw_input().decode("gbk").encode("utf-8")       #个人限定词，商品名字必须包含，防止淘宝推荐了其他相关词 (正则表达式). 为任意表示不作限制

     price_min   = setting.price_min         #价格区间

     price_max   = setting.price_max

     descripHrequ = setting.descripHrequ    # %   默认高于average, 输出结果大于此值

     servHrequ    = setting.servHrequ        # %  默认高于average, 输出结果大于此值

     descripNrequ = setting.descripNrequ

     counts       = setting.counts               #要求选出多少个商品

     counter1.tm_tb = setting.tm_tb          #不区分天猫淘宝则，字符串为空，，只要天猫 则 ='tmall' ,只要淘宝 = 'taobao'

     #savePath = r"C:\Users\Administrator\Desktop\Python scrapy\find_worthy_goods\results"#结果保存路径

     savePath = u"results%s"%serchProd #结果保存路径

     savePath = makeNewdir(savePath)

     get_all_praised_goods(serchProd, counts, savePath, keyword, price_min, price_max ,descripHrequ ,servHrequ ,descripNrequ)

 # Run the program only when this file is executed as a script.
 if __name__ == "__main__" :

     main()

     # Save each picture, named after the product and prefixed with a sequence number

     # Meanwhile, write the price, shop name, product description, service etc. to a txt file

     # Once a product picture catches the eye, it can be looked up by its sequence number

     # With description and service ratings above average, the shopping experience should be fine

setting.py

# -*- coding: utf-8 -*-

# When true, main.py calls inputPara() so the values below are entered interactively.
userDefine = False

# Filter requirement settings

serchProd='背包'     # Taobao search term

keyword=''                 # raw_input().decode("gbk").encode("utf-8")        # personal restricting word that the product name must contain, to keep out other related items recommended by Taobao (a regular expression); "." (match anything) means no restriction

price_min=22            # price range

price_max=100

descripHrequ=0       # %   defaults to above average; output results exceed this value

servHrequ=0          # %  defaults to above average; output results exceed this value

# presumably the minimum number of product reviews required - verify against main.py
descripNrequ=6

counts=25            # number of products to select

tm_tb ='tmall'       # leave the string empty to not distinguish Tmall from Taobao; 'tmall' = Tmall only, 'taobao' = Taobao only

def inputPara():

    ''' 用户选择是否自定义要求，根据要求进行获取商品，并按推荐排序输出'''

    print "please input reserch _goods_name"

    global serchProd , keyword , price_min, price_max, descripHrequ , servHrequ,  descripNrequ ,counts ,tm_tb

    serchProd=raw_input().replace(' ','')    #淘宝搜索词 ,并去除中间意外输入的空格

    if serchProd:

        print "if customise price_range ,decriptiom require .etc.\ninput Y/N \n default by : no price limit avarage than descriptiom,get 50 products \n 默认要求为：无价格限制，商品描述、快递、服务高于均值，获取50个商品。自定义要求请输入 ‘Y’ (区分大小写)".decode('utf-8')

        if raw_input() == 'Y':

            print "\nplease input  _minimal price and _maximal price;   \ndefault by 0,10000\nnext by 'enter'key input nothing means by default,the same below "

            print '请输入价格范围 ；默认0-10000 ；两项用半角逗号","分隔 按回车键确认；什么也不输入代表使用默认值 '.decode('utf-8')

            try:

                price_min, price_max=input()

            except:

                print 'not input or wrong number,use default range'

                price_min, price_max = 0 ,10000

            #

            print '是否要求 只看天猫/正品保障  还是只看淘宝 \n 只看天猫输入 tmall ,只看淘宝输入taobao，都看则回车略过'

            try:

                tm_tb=raw_input().decode("gbk").encode("utf-8")      #个人限定词，商品名字必须包含，防止淘宝推荐了其他相关词 (正则表达式). 为任意表示不作限制

            except:

                tm_tb=''

            #

                # #

            print "please input _keyword that goods name must include:\n(more than one keyword must use Regular Expression); default by no kewords"

            try:

                keyword=raw_input().decode("gbk").encode("utf-8")      #个人限定词，商品名字必须包含，防止淘宝推荐了其他相关词 (正则表达式). 为任意表示不作限制

            except:

                keyword=''

            #    

            print "\nplease input  _description_higher_percent_require and _service_higher__percent_require\n range:(-100,100) ;   \ndefault by 0,0  I.e better than average"

            print '请输入商品描述、服务高于平均值的百分比-100 ~100'.decode('utf-8')

                 # %   默认高于average, 输出结果大于此值

            try:

                descripHrequ,servHrequ=input()

            except:

                print 'not input or wrong number,use default range'

                descripHrequ = 0  # %  默认高于average, 输出结果大于此值

                servHrequ = 0

            #

            print "\nplease input description count limit,  default more than 5\n" ,'输入最低商品评价数，默认大于5'.decode('utf-8')

            try:

                descripNrequ=input()

            except :

                print 'not input or wrong number,use default range'

                descripNrequ=5

            #

                # print "\nIF customise file reserve path, Y or N  \ndefault/sample as:  C:\\Users\\Administrator\\Desktop\\find_worthy_goods\\results "

                # print '是否自定义保存文件目录 Y or N'.decode('utf-8')

                # if raw_input()=='Y':

                #     print "please input path that you want to reserve;  \n "

                #     savePath = raw_input()

                # else:

                #     #savePath=r"C:\Users\Administrator\Desktop\find_worthy_goods\results"#结果保存路径

            #

            print "\nplease input how many results you want,  default by 50\n" ,'您要获取的商品数目，默认50'.decode('utf-8')

            try:

                counts=input()

            except :

                counts=50

        else :

            counts =50

            keyword = ''

            tm_tb = ''

            price_min ,price_max ,descripHrequ ,servHrequ ,descripNrequ = 0,0,0,0,0

    else:

        print "no search goods，please restart"

        print '没有输入商品名称，请重新启动程序'.decode('utf-8')

码农公寓

相关文章