import json
import os
from py2neo import Graph
class GoodsKg:
    def __init__(self):
        cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
        self.data_path = os.path.join(cur, 'data/goods_info.json')
        self.g = Graph(
            host="127.0.0.1",  # IP of the machine hosting neo4j; obtainable via ifconfig
            http_port=7474,    # port the neo4j server listens on
            user="lhy",        # database user name; defaults to neo4j if never changed
            password="lhy123")
        return
    '''Read the raw data'''
    def read_data(self):
        rels_goods = []
        rels_brand = []
        goods_attrdict = {}
        concept_goods = set()
        concept_brand = set()
        count = 0
        for line in open(self.data_path):
            count += 1
            print(count)
            line = line.strip()
            data = json.loads(line)
            first_class = data['first_class'].replace("'", '')
            second_class = data['second_class'].replace("'", '')
            third_class = data['third_class'].replace("'", '')
            attr = data['attrs']
            concept_goods.add(first_class)
            concept_goods.add(second_class)
            concept_goods.add(third_class)
            rels_goods.append('@'.join([second_class, 'is_a', '属于', first_class]))
            rels_goods.append('@'.join([third_class, 'is_a', '属于', second_class]))
            if attr and '品牌' in attr:
                brands = attr['品牌'].split(';')
                for brand in brands:
                    brand = brand.replace("'", '')
                    concept_brand.add(brand)
                    rels_brand.append('@'.join([brand, 'sales', '销售', third_class]))
            goods_attrdict[third_class] = {name: value for name, value in attr.items() if name != '品牌'}
        return concept_brand, concept_goods, rels_goods, rels_brand
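    # For reference: read_data() expects data/goods_info.json to hold one JSON
    # object per line, produced by collect_info.py below. A minimal sketch of the
    # assumed format (field values are hypothetical examples):
    # {"first_class": "家用电器", "second_class": "大家电", "third_class": "平板电视",
    #  "attrs": {"品牌": "三星(SAMSUNG);海信(Hisense)", "屏幕尺寸": "55英寸"}}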
    '''Build the graph'''
    def create_graph(self):
        concept_brand, concept_goods, rels_goods, rels_brand = self.read_data()
        print('creating nodes....')
        self.create_node('Product', concept_goods)
        self.create_node('Brand', concept_brand)
        print('creating edges....')
        self.create_edges(rels_goods, 'Product', 'Product')
        self.create_edges(rels_brand, 'Brand', 'Product')
        return
    '''Create nodes in bulk'''
    def create_node(self, label, nodes):
        pairs = []
        bulk_size = 1000
        batch = 0
        bulk = 0
        batch_all = len(nodes) // bulk_size
        print(batch_all)
        for node_name in nodes:
            sql = """CREATE(:%s {name:'%s'})""" % (label, node_name)
            pairs.append(sql)
            bulk += 1
            # flush every full batch, plus the final partial batch
            if bulk % bulk_size == 0 or bulk == len(nodes):
                sqls = '\n'.join(pairs)
                self.g.run(sqls)
                batch += 1
                print(batch * bulk_size, '/', len(nodes), 'finished')
                pairs = []
        return
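    # For illustration, each flushed batch above runs as one multi-CREATE Cypher
    # statement of the form (node names are hypothetical):
    # CREATE(:Brand {name:'三星(SAMSUNG)'})
    # CREATE(:Brand {name:'海信(Hisense)'})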
    '''Create the relationship edges of the graph'''
    def create_edges(self, rels, start_type, end_type):
        count = 0
        for rel in set(rels):
            count += 1
            rel = rel.split('@')
            start_name = rel[0]
            end_name = rel[3]
            rel_type = rel[1]
            rel_name = rel[2]
            sql = 'match (m:%s), (n:%s) where m.name = "%s" and n.name = "%s" create (m)-[:%s{name:"%s"}]->(n)' % (
                start_type, end_type, start_name, end_name, rel_type, rel_name)
            try:
                self.g.run(sql)
            except Exception as e:
                print(e)
            if count % 10 == 0:
                print(count)
        return
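    # A sample statement generated above for one brand->product relation
    # (names hypothetical):
    # match (m:Brand), (n:Product) where m.name = "三星(SAMSUNG)" and n.name = "平板电视"
    # create (m)-[:sales{name:"销售"}]->(n)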
if __name__ == '__main__':
    handler = GoodsKg()
    handler.create_graph()
#!/usr/bin/env python3
# coding: utf-8
# File: collect_info.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 19-3-30
import urllib.request
from lxml import etree
import gzip
import chardet
import json
import pymongo
class GoodSchema:
    def __init__(self):
        self.conn = pymongo.MongoClient()
        return
    '''Fetch a search page'''
    def get_html(self, url):
        headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17"}
        try:
            req = urllib.request.Request(url, headers=headers)
            data = urllib.request.urlopen(req).read()
            coding = chardet.detect(data)
            html = data.decode(coding['encoding'])
        except Exception:
            # retry and fall back to gbk when charset detection fails
            req = urllib.request.Request(url, headers=headers)
            data = urllib.request.urlopen(req).read()
            html = data.decode('gbk')
        return html
    '''Fetch a detail page'''
    def get_detail_html(self, url):
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "en-US,en;q=0.9",
            "cache-control": "max-age=0",
            "referer": "https://www.jd.com/allSort.aspx",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/66.0.3359.181 Chrome/66.0.3359.181 Safari/537.36"
        }
        try:
            req = urllib.request.Request(url, headers=headers)
            data = urllib.request.urlopen(req).read()
            html = gzip.decompress(data)  # the response is gzip-compressed per accept-encoding
            coding = chardet.detect(html)
            html = html.decode(coding['encoding'])
        except Exception:
            # retry and fall back to gbk when charset detection fails
            req = urllib.request.Request(url, headers=headers)
            data = urllib.request.urlopen(req).read()
            html = gzip.decompress(data)
            html = html.decode('gbk')
        return html
    '''Walk the category tree on the home page and collect data'''
    def home_list(self):
        url = 'https://www.jd.com/allSort.aspx'
        html = self.get_html(url)
        selector = etree.HTML(html)
        divs = selector.xpath('//div[@class= "category-item m"]')
        for indx, div in enumerate(divs):
            first_name = div.xpath('./div[@class="mt"]/h2/span/text()')[0]
            second_classes = div.xpath('./div[@class="mc"]/div[@class="items"]/dl')
            for dl in second_classes:
                second_name = dl.xpath('./dt/a/text()')[0]
                third_classes = ['https:' + i for i in dl.xpath('./dd/a/@href')]
                third_names = dl.xpath('./dd/a/text()')
                for third_name, url in zip(third_names, third_classes):
                    try:
                        attr_dict = self.parser_goods(url)
                        attr_brand = self.collect_brands(url)
                        attr_dict.update(attr_brand)
                        data = {}
                        data['first_class'] = first_name
                        data['second_class'] = second_name
                        data['third_class'] = third_name
                        data['attrs'] = attr_dict
                        self.conn['goodskg']['data'].insert_one(data)
                        print(indx, len(divs), first_name, second_name, third_name)
                    except Exception as e:
                        print(e)
        return
    '''Parse goods attribute data'''
    def parser_goods(self, url):
        html = self.get_detail_html(url)
        selector = etree.HTML(html)
        attr_dict = {}
        # extended attributes are embedded in an inline 'other_exts' JS array
        other_attrs = ''.join([i for i in html.split('\n') if 'other_exts' in i])
        other_attr = other_attrs.split('other_exts =[')[-1].split('];')[0]
        if other_attr and 'var other_exts ={};' not in other_attr:
            for attr in other_attr.split('},'):
                if '}' not in attr:
                    attr = attr + '}'
                data = json.loads(attr)
                key = data['name']
                value = data['value_name']
                attr_dict[key] = value
        # filterable attributes are rendered in the sl-wrap sidebar blocks
        attr_divs = selector.xpath('//div[@class="sl-wrap"]')
        for div in attr_divs:
            attr_name = div.xpath('./div[@class="sl-key"]/span/text()')[0].replace(':', '')
            attr_value = ';'.join([i.replace(' ', '') for i in div.xpath('./div[@class="sl-value"]/div/ul/li/a/text()')])
            attr_dict[attr_name] = attr_value
        return attr_dict
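    # parser_goods() assumes the detail page embeds an inline fragment shaped like
    # the following (a sketch; attribute names and values are hypothetical):
    # var other_exts =[{"name":"能效等级","value_name":"一级"},{"name":"分辨率","value_name":"4K"}];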
    '''Collect brand data'''
    def collect_brands(self, url):
        attr_dict = {}
        brand_url = url + '&sort=sort_rank_asc&trans=1&md=1&my=list_brand'
        html = self.get_html(brand_url)
        # the list_brand endpoint returns JSON; an HTML response means no brand list
        if 'html' in html:
            return attr_dict
        data = json.loads(html)
        brands = []
        if 'brands' in data and data['brands'] is not None:
            brands = [i['name'] for i in data['brands']]
        attr_dict['品牌'] = ';'.join(brands)
        return attr_dict
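    # collect_brands() assumes the endpoint answers with JSON shaped like
    # the following (a sketch inferred from the parsing above):
    # {"brands": [{"name": "三星(SAMSUNG)"}, {"name": "海信(Hisense)"}]}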
if __name__ == '__main__':
    handler = GoodSchema()
    handler.home_list()