2021-07-08

import json
import os
from py2neo import Graph, Node, Relationship

class GoodsKg:
    def __init__(self):
        cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
        self.data_path = os.path.join(cur, 'data/goods_info.json')
        self.g = Graph(
            host="127.0.0.1",   # IP of the machine hosting neo4j; ifconfig will show it
            http_port=7474,     # port the neo4j server listens on
            user="lhy",         # database user name; defaults to neo4j if never changed
            password="lhy123")
        return

    '''Read the raw goods data'''
    def read_data(self):
        rels_goods = []
        rels_brand = []
        goods_attrdict = {}
        concept_goods = set()
        concept_brand = set()
        count = 0
        for line in open(self.data_path, encoding='utf-8'):
            count += 1
            print(count)
            line = line.strip()
            data = json.loads(line)
            # note: 'fisrt_class' (sic) is the key actually written by collect_info.py below
            first_class = data['fisrt_class'].replace("'", '')
            second_class = data['second_class'].replace("'", '')
            third_class = data['third_class'].replace("'", '')
            attr = data['attrs'] or {}  # guard against documents with empty attrs
            concept_goods.add(first_class)
            concept_goods.add(second_class)
            concept_goods.add(third_class)
            rels_goods.append('@'.join([second_class, 'is_a', '属于', first_class]))
            rels_goods.append('@'.join([third_class, 'is_a', '属于', second_class]))

            if attr and '品牌' in attr:
                brands = attr['品牌'].split(';')
                for brand in brands:
                    brand = brand.replace("'", '')
                    concept_brand.add(brand)
                    rels_brand.append('@'.join([brand, 'sales', '销售', third_class]))

            goods_attrdict[third_class] = {name: value for name, value in attr.items() if name != '品牌'}

        return concept_brand, concept_goods, rels_goods, rels_brand
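    # For reference, a hypothetical line of data/goods_info.json in the shape this
    # parser expects (keys match the documents written by collect_info.py below,
    # including the misspelled 'fisrt_class'; the values here are made up):
    #   {"fisrt_class": "家用电器", "second_class": "生活电器", "third_class": "电风扇",
    #    "attrs": {"品牌": "美的;格力", "类型": "落地扇"}}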

    '''Build the graph'''
    def create_graph(self):
        concept_brand, concept_goods, rels_goods, rels_brand = self.read_data()
        # print('creating nodes....')
        # self.create_node('Product', concept_goods)
        # self.create_node('Brand', concept_brand)
        # print('creating edges....')
        # self.create_edges(rels_goods, 'Product', 'Product')
        self.create_edges(rels_brand, 'Brand', 'Product')
        return

    '''Create nodes in batches'''
    def create_node(self, label, nodes):
        pairs = []
        bulk_size = 1000
        batch = 0
        bulk = 0
        batch_all = len(nodes) // bulk_size
        print(batch_all)
        for node_name in nodes:
            sql = """CREATE(:%s {name:'%s'})""" % (label, node_name)
            pairs.append(sql)
            bulk += 1
            # flush every bulk_size statements, plus once more for the final partial batch
            if bulk % bulk_size == 0 or bulk == len(nodes):
                sqls = '\n'.join(pairs)
                self.g.run(sqls)
                batch += 1
                print(batch * bulk_size, '/', len(nodes), 'finished')
                pairs = []
        return
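    # A sketch of a parameterized alternative (assumption: this py2neo version's
    # Graph.run accepts keyword parameters): a single UNWIND statement per batch
    # sidesteps the string interpolation and quoting above.
    #   self.g.run("UNWIND $names AS name CREATE (:%s {name: name})" % label,
    #              names=list(nodes))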


    '''Create the relationship edges of the graph'''
    def create_edges(self, rels, start_type, end_type):
        count = 0
        for rel in set(rels):
            count += 1
            rel = rel.split('@')
            start_name = rel[0]
            end_name = rel[3]
            rel_type = rel[1]
            rel_name = rel[2]
            sql = 'match (m:%s), (n:%s) where m.name = "%s" and n.name = "%s" create (m)-[:%s{name:"%s"}]->(n)' % (
                start_type, end_type, start_name, end_name, rel_type, rel_name)
            try:
                self.g.run(sql)
            except Exception as e:
                print(e)
            if count % 10 == 0:
                print(count)
        return
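    # Since every relationship above does a MATCH by name, creating indexes first
    # speeds this up considerably. A sketch, using Neo4j 3.x index syntax
    # (adjust for 4.x):
    #   self.g.run('CREATE INDEX ON :Product(name)')
    #   self.g.run('CREATE INDEX ON :Brand(name)')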

if __name__ == '__main__':
    handler = GoodsKg()
    handler.create_graph()
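
Once the graph is built, a quick sanity check on the 'sales' edges might look like the following sketch (connection settings copied from the class above):

from py2neo import Graph

g = Graph(host="127.0.0.1", http_port=7474, user="lhy", password="lhy123")
for record in g.run('MATCH (b:Brand)-[:sales]->(p:Product) RETURN b.name, p.name LIMIT 10'):
    print(record)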

#!/usr/bin/env python3
# coding: utf-8
# File: collect_info.py
# Author: lhy<lhy_in_blcu@126.com, https://huangyong.github.io>
# Date: 19-3-30

import urllib.request
from urllib.parse import quote_plus
from lxml import etree
import gzip
import chardet
import json
import pymongo

class GoodSchema:
    def __init__(self):
        self.conn = pymongo.MongoClient()
        return

    '''Fetch a search page'''
    def get_html(self, url):
        headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17"}
        try:
            req = urllib.request.Request(url, headers=headers)
            data = urllib.request.urlopen(req).read()
            coding = chardet.detect(data)
            html = data.decode(coding['encoding'])
        except Exception:
            # charset detection failed; retry and fall back to gbk
            req = urllib.request.Request(url, headers=headers)
            data = urllib.request.urlopen(req).read()
            html = data.decode('gbk')
        return html

    '''Fetch a detail page'''
    def get_detail_html(self, url):
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "en-US,en;q=0.9",
            "cache-control": "max-age=0",
            "referer": "https://www.jd.com/allSort.aspx",
            "upgrade-insecure-requests": "1",  # header values must be strings for urllib
            "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/66.0.3359.181 Chrome/66.0.3359.181 Safari/537.36"
        }
        try:
            req = urllib.request.Request(url, headers=headers)
            data = urllib.request.urlopen(req).read()
            html = gzip.decompress(data)
            coding = chardet.detect(html)
            html = html.decode(coding['encoding'])
        except Exception:
            # retry, tolerating a non-gzipped body, and fall back to gbk
            req = urllib.request.Request(url, headers=headers)
            data = urllib.request.urlopen(req).read()
            try:
                data = gzip.decompress(data)
            except OSError:
                pass
            html = data.decode('gbk')
        return html
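    # Aside: the manual gzip handling here is needed only because urllib does not
    # transparently decompress responses. If the requests library were used
    # instead, the equivalent fetch collapses to roughly (a sketch):
    #   import requests
    #   html = requests.get(url, headers=headers).text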


    '''Walk the category tree on the home page'''
    def home_list(self):
        url = 'https://www.jd.com/allSort.aspx'
        html = self.get_html(url)
        selector = etree.HTML(html)
        divs = selector.xpath('//div[@class= "category-item m"]')
        for indx, div in enumerate(divs):
            first_name = div.xpath('./div[@class="mt"]/h2/span/text()')[0]
            second_classes = div.xpath('./div[@class="mc"]/div[@class="items"]/dl')
            for dl in second_classes:
                second_name = dl.xpath('./dt/a/text()')[0]
                third_classes = ['https:' + i for i in dl.xpath('./dd/a/@href')]
                third_names = dl.xpath('./dd/a/text()')
                for third_name, url in zip(third_names, third_classes):
                    try:
                        attr_dict = self.parser_goods(url)
                        attr_brand = self.collect_brands(url)
                        attr_dict.update(attr_brand)
                        data = {}
                        data['fisrt_class'] = first_name  # note: misspelled key; the graph builder above reads it as-is
                        data['second_class'] = second_name
                        data['third_class'] = third_name
                        data['attrs'] = attr_dict
                        self.conn['goodskg']['data'].insert_one(data)  # insert() was removed in pymongo 4
                        print(indx, len(divs), first_name, second_name, third_name)
                    except Exception as e:
                        print(e)
        return

    '''Parse the attributes of a goods listing page'''
    def parser_goods(self, url):
        html = self.get_detail_html(url)
        selector = etree.HTML(html)
        attr_dict = {}
        # attributes embedded in the page's inline "other_exts" javascript variable
        other_attrs = ''.join([i for i in html.split('\n') if 'other_exts' in i])
        other_attr = other_attrs.split('other_exts =[')[-1].split('];')[0]
        if other_attr and 'var other_exts ={};' not in other_attr:
            for attr in other_attr.split('},'):
                if '}' not in attr:
                    attr = attr + '}'
                data = json.loads(attr)
                key = data['name']
                value = data['value_name']
                attr_dict[key] = value
        # attributes rendered in the page's filter bar
        attr_divs = selector.xpath('//div[@class="sl-wrap"]')
        for div in attr_divs:
            attr_name = div.xpath('./div[@class="sl-key"]/span/text()')[0].replace(':', '')
            attr_value = ';'.join([i.replace('  ', '') for i in div.xpath('./div[@class="sl-value"]/div/ul/li/a/text()')])
            attr_dict[attr_name] = attr_value
        return attr_dict
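    # The 'other_exts' parsing above assumes the page embeds an inline fragment
    # shaped roughly like this (hypothetical illustration, not captured output):
    #   var other_exts =[{"name":"货号","value_name":"ABC123"},{"name":"控制方式","value_name":"遥控"}];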

    '''Collect the brand list for a category'''
    def collect_brands(self, url):
        attr_dict = {}
        brand_url = url + '&sort=sort_rank_asc&trans=1&md=1&my=list_brand'
        html = self.get_html(brand_url)
        # a JSON body is expected; an HTML body means the request was rejected
        if 'html' in html:
            return attr_dict
        data = json.loads(html)
        brands = []
        if 'brands' in data and data['brands'] is not None:
            brands = [i['name'] for i in data['brands']]
        attr_dict['品牌'] = ';'.join(brands)
        return attr_dict
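    # The brand endpoint is assumed to return JSON shaped roughly like this
    # (hypothetical illustration):
    #   {"brands": [{"name": "美的"}, {"name": "格力"}]}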

if __name__ == '__main__':
    handler = GoodSchema()
    handler.home_list()
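
The two scripts meet at data/goods_info.json: collect_info.py writes documents into MongoDB, and the graph builder reads them back as one JSON object per line. A minimal export sketch bridging the two (database and collection names copied from home_list above):

import json
import pymongo

conn = pymongo.MongoClient()
with open('data/goods_info.json', 'w', encoding='utf-8') as f:
    for doc in conn['goodskg']['data'].find({}, {'_id': 0}):
        f.write(json.dumps(doc, ensure_ascii=False) + '\n')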
