1. Method description:
GitHub repo: https://github.com/AtwoodZhang/Crawler_of_Product_Comment
After obtaining a product url, a request is sent to it; the response is then parsed, the image urls are matched out of the html, and a request is sent to each image url to download and save the file.
1) Images are saved into a folder created per web id. Each page carries several image urls, so the code has to keep track of which image is being downloaded (an index suffix is appended to each file name).
2) For the image urls, xpath expressions that test fine in XPath Helper match nothing against the response fetched with requests, likely because the browser-rendered DOM differs from the raw html, so the re module (regular expressions) is used for matching and downloading instead; see the sketch after this list.
3) The web id is a serial number that Macy's assigns to each product.
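A minimal sketch of the whole flow (the regex and the per-web-id folder layout mirror the full code in section 2; prod_url and web_id here are placeholder inputs):

import os
import re
import requests

def download_prod_imgs(prod_url, web_id):
    # fetch the raw html (this is what requests sees, not the rendered DOM)
    html = requests.get(prod_url, timeout=3).text
    # non-greedy regex match for the image urls; set() deduplicates
    img_urls = set(re.findall('<img src="(.*?)".*?name=', html, re.S))
    folder = os.path.join("prod_img", web_id)  # one folder per web id
    os.makedirs(folder, exist_ok=True)
    for count, img_url in enumerate(img_urls):
        img = requests.get(img_url, timeout=3)
        # the index suffix tells apart the several images of one page
        with open(os.path.join(folder, "%s_%d.png" % (web_id, count)), "wb") as f:
            f.write(img.content)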
2. Full code:
_04_spider_of_rank4_prod_info_for_pic.py
# step1.4 Download the product images and save them straight to local disk.
import time
import os
import sys
import random
from concurrent.futures import ThreadPoolExecutor  # thread pool for the crawl
from _00_record_of_small_functions import *  # presumably provides the Logger class used below
from _04_mysql_of_rank4 import MacyRank4Mysql
import request_test_04_get_comment as rc  # the crawler itself
def run_crawl_img():
    # step1. fetch the urls that still need to be requested from the database;
    r4_img_sql = MacyRank4Mysql()
    r4_img_sql.select_unrequest_prod_img(table_name='rank4_prod_specific_info')
    r4_img_list = [i for i in r4_img_sql.cursor.fetchall()]
    r4_img_sql.database_commit_close()
    print(len(r4_img_list))

    # step1.2. test with a single record first;
    # r4_img_list = [r4_img_list[21]]
    # r4_img_list = r4_img_list[0:2]
    # print(r4_img_list)
    # print(len(r4_img_list))

    # step2. send a crawl request for every record in the list;
    # start the thread pool;
    with ThreadPoolExecutor(1) as t:
        for i in r4_img_list:
            case = [i[0], i[2]]  # [web id, product url]
            t.submit(send_request, case)
            time.sleep(random.uniform(1, 3))  # throttle between submissions


def send_request(url_address):
    # url_address is [web id, product url]
    print(url_address)
    print(url_address[1], url_address[0])
    rc.get_comment(url=url_address[1], x=url_address[0])
if __name__ == "__main__":
    # step1. set up the crawl log
    log_path = './prod_crawl_log/'
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    log_file_name = log_path + 'crawl_img_' + 'log-' + time.strftime("%Y%m%d-%H%M%S", time.localtime()) + '.log'
    sys.stdout = Logger(log_file_name)
    sys.stderr = Logger(log_file_name)

    # step2. run the crawl;
    start = time.time()
    run_crawl_img()
    end = time.time()
    spend_time = end - start
    print("finish crawl prod_img:", spend_time)
request_test_04_get_comment.py
import requests
import pymysql
from _00_record_of_agent_pool import ua_list
import random
import re
import os
def get_comment(url, x):
    # url is the html page to request; x is the product's web id
    resp_status = False
    try:
        resp_status = support_request(url=url, x=x)
    except Exception as e:
        print(e)
        # the request timed out; retry up to 4 more times
        for i in range(1, 5):
            print('request timed out, retry attempt %s' % i)
            try:
                resp_status = support_request(url=url, x=x)
            except Exception:
                continue
            if resp_status:
                break
    # if the request came back but failed, retry a few more times
    count = 5
    while resp_status is False and count > 0:
        resp_status = support_request(url=url, x=x)
        count = count - 1
def support_request(url, x):
    headers = {'User-Agent': random.choice(ua_list)}
    response = requests.get(url=url, headers=headers, timeout=3)
    # print(response.text)
    if response.status_code == 200 and response.text:
        response.encoding = "utf-8"
        parse_html(response.text, url, x)
        response.close()
        resp_status = True
    else:
        print("this request failed!")
        resp_status = False
    return resp_status
def parse_html(html_, url, x):
    # match the image urls in the raw html
    re_string_rank4_img = '<img src="(.*?)".*?name='
    # re_string_rank4_img = '<img src="(.*?)"'
    pattern4 = re.compile(re_string_rank4_img, re.S)
    img_url = pattern4.findall(html_)
    img_url = list(set(img_url))  # deduplicate
    count = 0
    folder_name = os.path.join("prod_img", x)  # one folder per web id
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    for i in img_url:
        result = requests.get(i)
        result.raise_for_status()
        # index suffix distinguishes the images; saved as .png regardless of the original format
        filename = os.path.join(folder_name, x + "_" + str(count) + ".png")
        print("img_name:", filename)
        with open(filename, "wb") as f:
            f.write(result.content)
        count = count + 1
    print("img_url:", img_url)
    print("All images for prod", x, "downloaded.")
    # mark this product as requested; the query is parameterized to avoid sql injection
    sql = "update rank4_prod_specific_info set img_request_situation='True' where prod_id=%s"
    db = pymysql.connect(
        host="localhost",  # database server address
        user='root',       # database user name
        passwd='root',     # database password
        db='macy',         # database name
        charset='utf8')
    # create a cursor object
    cursor = db.cursor()
    cursor.execute(sql, (x,))
    db.commit()
    cursor.close()
    db.close()
    print(sql)
    print("I have updated the", x, "sql record.\n")
if __name__ == "__main__":
    url_ = "https://www.macys.com/shop/product/charter-club-crew-neck-heart-print-t-shirt-created-for-macys?ID=13029272&CategoryID=255"
    # extract the web id (the ID query parameter) from the product url
    webid_string = 'https://www.macys.com/.*?ID=(.*?)&.*?'
    webid_pattern = re.compile(webid_string, re.S)
    result_webid = webid_pattern.findall(url_)[0].strip()
    print("web_id: ", result_webid)
    x = result_webid
    print(url_, x)
    # print(x)
    get_comment(url_, x)
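As an aside, since the web id is just the ID query parameter, it could also be pulled out with the standard library instead of a regex; a small sketch of that alternative:

from urllib.parse import urlparse, parse_qs

url_ = "https://www.macys.com/shop/product/charter-club-crew-neck-heart-print-t-shirt-created-for-macys?ID=13029272&CategoryID=255"
# parse_qs maps each query parameter to a list of its values
web_id = parse_qs(urlparse(url_).query)["ID"][0]
print("web_id:", web_id)  # 13029272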