"""
抓取
解析
存储
"""
import re
#import ast
from urllib import parse
from datetime import datetime
import requests
from scrapy import Selector
from models import *
# Base URL of the target site; relative hrefs are joined against it.
domain = "http://www.91jf.com/"

# Development helper that dumped fetched HTML to a file for inspection.
# Kept, but as a real comment block — the original wrapped it in smart-quote
# "triple quotes" (‘‘‘), which is a syntax error in Python.
# def write_txt(html_data):
#     with open("a.txt", "w") as f:
#         f.write(html_data)
def get_nodes_json():
    """Fetch the site front page and return the left category menu's
    ``<a>`` tags as a list of raw HTML strings.

    Returns:
        list[str]: one HTML string per anchor, with ``&amp;`` entities
        restored to ``&`` so the query-string URLs are usable; an empty
        list when the menu markup is not found.
    """
    left_menu_text = requests.get(domain).text
    sel = Selector(text=left_menu_text)
    all_divs = sel.xpath("//div[@class='class_child_li']//a[@href]").extract()
    # extract() yields HTML with '&' escaped as '&amp;'; undo that so the
    # hrefs can be joined/requested directly.  (The original called
    # replace("&", "&"), a no-op — its own comment shows the intent was
    # to reverse the &amp; escaping.)
    return [node.replace("&amp;", "&") for node in all_divs]
# Module-level accumulator: process_nodes_list extends and returns it,
# so repeated calls keep appending (preserved from the original design).
url_list_names = []


def process_nodes_list(nodes_list):
    """Extract ``[absolute_url, category_name]`` pairs from raw anchor HTML.

    Args:
        nodes_list: HTML strings shaped like
            ``<a href="...">...<span>name</span>...</a>``.

    Returns:
        list[list[str]]: the module-level ``url_list_names`` after
        appending one ``[url, name]`` pair per parseable item.
    """
    for item in nodes_list:
        # Quoted href value; the pattern assumes it ends in a digit
        # (e.g. an id= query parameter), as the site's menu links do.
        url_match = re.search(r'\".*\d\"', item)
        name_match = re.search(r"<span>.*</span>", item)
        if url_match is None or name_match is None:
            # Original crashed with AttributeError on non-matching tags;
            # skip them instead.
            continue
        url = parse.urljoin(domain, url_match.group(0).replace("\"", ""))
        # The visible label sits inside the <span> element.
        name = name_match.group(0).replace("<span>", "").replace("</span>", "")
        url_list_names.append([url, name])
    return url_list_names
def get_level1_list(nodes_list):
    """Build a sales-ranked, paginated listing URL for every category.

    Args:
        nodes_list: raw ``<a>`` tag HTML strings (see get_nodes_json).

    Returns:
        list[str]: absolute URLs ending in ``...&okey=salenum&order=desc&page=``;
        the caller appends the page number.
    """
    level1_url = []
    for item in nodes_list:
        url_match = re.search(r'\".*\d\"', item)
        if url_match is None:
            # Skip anchors without a recognizable quoted href
            # (original raised AttributeError here).
            continue
        url = url_match.group(0).replace("\"", "")
        # Sort by sales volume, descending; page number appended by caller.
        level1_url.append(parse.urljoin(domain, url + "&okey=salenum&order=desc&page="))
    return level1_url
def get_last_urls():
    """Return the final list of category listing URLs to crawl.

    Side effect: also runs process_nodes_list, which populates the
    module-level ``url_list_names`` with ``[url, name]`` pairs.
    """
    nodes_list = get_nodes_json()
    # Called for its side effect on url_list_names; the return value
    # (previously bound to an unused local) is deliberately discarded.
    process_nodes_list(nodes_list)
    return get_level1_list(nodes_list)
def parse_product(url):
    """Scrape a topic page: refresh the stored Topic, save every Answer,
    then follow the "next page" link until pagination ends.

    NOTE(review): the selectors here (post-*, post_body, red_praise,
    nick_name) look like forum-thread markup rather than 91jf product
    pages — verify they match the pages produced by get_last_urls().

    Args:
        url: absolute page URL; its last path segment is used as topic_id.
    """
    # Iterative pagination instead of the original tail recursion, which
    # could exhaust the call stack on long threads.
    while url:
        topic_id = url.split("/")[-1]
        res_text = requests.get(url).text
        sel = Selector(text=res_text)
        all_divs = sel.xpath("//div[starts-with(@id, 'post-')]")
        # First post-* div is the topic itself; the rest are answers.
        topic_item = all_divs[0]
        content = topic_item.xpath(".//div[@class='post_body post_body_min_h']").extract()[0]
        praised_nums = topic_item.xpath(".//label[@class='red_praise digg']//em/text()").extract()[0]
        jtl_str = topic_item.xpath(".//div[@class='close_topic']/text()").extract()[0]
        jtl = 0
        jtl_match = re.search(r"(\d+)%", jtl_str)
        if jtl_match:
            jtl = int(jtl_match.group(1))

        # Only update topics already present in the database (new topics
        # are not inserted here — preserved from the original).
        existed_topics = Topic.select().where(Topic.id == topic_id)
        if existed_topics:
            topic = existed_topics[0]
            topic.content = content
            topic.jtl = jtl
            topic.praised_nums = praised_nums
            topic.save()

        for answer_item in all_divs[1:]:
            answer = Answer()
            answer.topic_id = topic_id
            author_info = answer_item.xpath(".//div[@class='nick_name']//a[1]/@href").extract()[0]
            answer.author = author_info.split("/")[-1]
            create_time = answer_item.xpath(".//label[@class='date_time']/text()").extract()[0]
            answer.create_time = datetime.strptime(create_time, "%Y-%m-%d %H:%M:%S")
            # BUG FIX: read the praise count and content from the answer's
            # own node — the original read both from topic_item, so every
            # answer stored the topic's values.
            answer_praised = answer_item.xpath(".//label[@class='red_praise digg']//em/text()").extract()[0]
            answer.parised_nums = int(answer_praised)  # field name kept as declared on the Answer model
            answer.content = answer_item.xpath(".//div[@class='post_body post_body_min_h']").extract()[0]
            answer.save()

        next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
        url = parse.urljoin(domain, next_page[0]) if next_page else None
if __name__ == "__main__":
last_urls = get_last_urls()
for url in last_urls:
parse_product(url)
print(last_urls)