购物篮分析是一个很经典的数据挖掘案例,其中运用到了Apriori算法。下面使用从网上下载的某超市某月份的销售数据,利用Apriori算法进行关联分析。例子使用Python+MongoDB。
处理过程:1. 数据建模(将Excel中的数据写入到MongoDB数据库);2. 从数据库中读取数据进行分析。
Excel文件下载地址:http://download.csdn.net/detail/artscrafts/6805689
案例配置文件 setting.py
# Path of the Excel workbook that the loader opens.
data_source = "supermarket.xls"

# Address of the MongoDB server that both scripts connect to.
host, port = "localhost", 27017

# Database name, and the two collection names used inside it:
# one for the product list, one for the per-transaction records.
db_name = "shopping_basket"
items_name = "goods_items"
record_name = "transaction_record"
读取Excel数据到MongoDB中 load_basket.py
from xlrd import open_workbook
from pymongo import MongoClient
import setting

# The workbook and the MongoDB connection are opened once, at import time,
# and shared by the loader functions below.
wb = open_workbook(setting.data_source, encoding_override='utf-8')
client = MongoClient(setting.host, setting.port)
db = client[setting.db_name]
# Product names in column order: appended to by load_items() and indexed by
# load_record() to translate a row of flags into product names.
items = []


# read xls
def read_one_line(workbook, sheet_index=0, row_index=0, start_col_index=0):
    """Yield the cell values of a single worksheet row, left to right.

    Args:
        workbook: Object exposing ``sheets()``; each sheet must provide
            ``nrows``, ``ncols`` and ``cell(row, col).value``.
        sheet_index: Position of the worksheet inside ``workbook.sheets()``.
        row_index: Zero-based index of the row to read.
        start_col_index: Zero-based index of the first column to yield. A
            value outside ``0..ncols`` is replaced by ``ncols``, so nothing
            is yielded.

    Yields:
        The ``value`` of each cell from ``start_col_index`` to the last column.

    Raises:
        IndexError: If ``row_index`` is outside the sheet. This is a
            generator, so the error surfaces on the first iteration rather
            than at call time.
    """
    # Honour sheet_index; the sheet used to be hard-coded to the first one.
    sheet = workbook.sheets()[sheet_index]
    row_count = sheet.nrows
    col_count = sheet.ncols
    # Column 0 is a valid start. The previous "> 0" test rejected it, which
    # made the default argument yield an empty row.
    if not 0 <= start_col_index <= col_count:
        start_col_index = col_count
    if not 0 <= row_index < row_count:
        raise IndexError(
            'row_index %r is outside the sheet (%d rows)' % (row_index, row_count))
    # range() instead of xrange() so the code runs on Python 2 and Python 3.
    for col_index in range(start_col_index, col_count):
        yield sheet.cell(row_index, col_index).value


# read xls
def readlines(workbook, sheet_index=0, start_row_index=0, end_row_index=None, start_col_index=0, end_col_index=None):
    """Yield a rectangular region of a worksheet, one list of values per row.

    Args:
        workbook: Object exposing ``sheets()``; each sheet must provide
            ``nrows``, ``ncols`` and ``cell(row, col).value``.
        sheet_index: Position of the worksheet inside ``workbook.sheets()``.
        start_row_index: Zero-based first row to read (inclusive).
        end_row_index: Row to stop before (exclusive); ``None`` means the
            sheet's last row.
        start_col_index: Zero-based first column to read (inclusive).
        end_col_index: Column to stop before (exclusive); ``None`` means the
            sheet's last column.

    Yields:
        A list holding the cell values of one row of the region.
    """
    sheet = workbook.sheets()[sheet_index]
    # Compare against None explicitly: a truthiness test would silently turn
    # an explicit end index of 0 into "read to the end".
    if end_row_index is None:
        end_row_index = sheet.nrows
    if end_col_index is None:
        end_col_index = sheet.ncols
    # range() instead of xrange() so the code runs on Python 2 and Python 3.
    for row_index in range(start_row_index, end_row_index):
        yield [sheet.cell(row_index, col_index).value
               for col_index in range(start_col_index, end_col_index)]


# from xls to mongodb
def load_items():
    """Copy the product names from the workbook into MongoDB and into ``items``.

    The names are read from row index 1 of the workbook, starting at column
    index 1. Each name is stored in the ``setting.items_name`` collection as
    ``{'id': <1-based position>, 'name': <name>}`` and appended to the
    module-level ``items`` list in the same order.
    """
    collection = db[setting.items_name]
    names = read_one_line(wb, row_index=1, start_col_index=1)
    batch = []
    for item_id, name in enumerate(names, 1):
        batch.append({'id': item_id, 'name': name})
        items.append(name)
        # Write in chunks of 100 documents to limit round trips.
        if len(batch) >= 100:
            # insert_many() replaces Collection.insert(), which newer
            # PyMongo releases no longer provide.
            collection.insert_many(batch)
            batch = []
    # Flush the tail: previously the last partial batch was never written,
    # so every product after the last multiple of 100 was lost.
    if batch:
        collection.insert_many(batch)


# from xls to mongodb
def load_record():
    """Copy every transaction row from the workbook into MongoDB.

    Rows are read from row index 2 onwards, starting at column index 1. A
    cell equal to ``'T'`` marks the product of that column as bought. Each
    row is stored in the ``setting.record_name`` collection as
    ``{'id': <1-based row number>, 'items': [<product name>, ...]}``.

    Relies on the module-level ``items`` list having been filled by
    ``load_items()`` first.
    """
    collection = db[setting.record_name]
    rows = readlines(wb, start_row_index=2, start_col_index=1)
    batch = []
    for record_id, line in enumerate(rows, 1):
        # Pair each flag with the product name of the same column.
        bought = [name for name, flag in zip(items, line) if flag == 'T']
        batch.append({'id': record_id, 'items': bought})
        # Write in chunks of 100 documents to limit round trips.
        if len(batch) >= 100:
            # insert_many() replaces Collection.insert(), which newer
            # PyMongo releases no longer provide.
            collection.insert_many(batch)
            batch = []
    # Flush the tail: previously the last partial batch was never written,
    # so the final transactions were lost.
    if batch:
        collection.insert_many(batch)


def main():
    """Load products, then transactions, and close the MongoDB connection."""
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('........start loading........')
    try:
        load_items()
        load_record()
    finally:
        # Release the connection even when loading fails part-way.
        client.close()
    print('.........end loading.........')


if __name__ == '__main__':
    main()
进行数据分析 analysis_basket.py
#Apriori
from pymongo import MongoClient
import setting

# The MongoDB connection is opened once, at import time.
client = MongoClient(setting.host, setting.port)
db = client[setting.db_name]
# One entry per transaction, each a list of item names; filled by filldata().
data = []


# from mongodb to items
def filldata():
    """Append the ``items`` list of every stored transaction to ``data``.

    Reads all documents of the ``setting.record_name`` collection.
    """
    collection = db[setting.record_name]
    data.extend(row['items'] for row in collection.find())


def connect(items, rows=None):
    """Join frequent k-itemsets into (k+1)-item candidates with their support.

    Two itemsets are joined when they agree on everything except their last
    element.

    Args:
        items: Dict whose keys are equal-length tuples (the frequent
            itemsets); the values are not used.
        rows: Transactions used to count support; ``None`` falls back to the
            module-level ``data``.

    Returns:
        Dict mapping each candidate tuple to its support count.
    """
    result = {}
    # The early ``break`` below is only valid when itemsets sharing a prefix
    # are adjacent, so sort the keys instead of relying on dict order. This
    # also avoids indexing a dict view, which fails on Python 3.
    keys = sorted(items)
    for i, left in enumerate(keys):
        prefix = left[:-1]
        for right in keys[i + 1:]:
            if right[:-1] != prefix:
                break
            key = left + (right[-1],)
            result[key] = getsupp(key, rows)
    return result


def pruning(items, minsupp):
    """Return the entries of ``items`` whose support is at least ``minsupp``."""
    return {key: supp for key, supp in items.items() if supp >= minsupp}


def contain(par, sub):
    """Return True when every element of ``sub`` is also in ``par``."""
    return all(v in par for v in sub)


def getsupp(item, rows=None):
    """Count the transactions that contain every element of ``item``.

    Args:
        item: Iterable of item names (an itemset).
        rows: Transactions to scan; ``None`` falls back to the module-level
            ``data``.

    Returns:
        Number of transactions containing the whole itemset.
    """
    if rows is None:
        rows = data
    return sum(1 for row in rows if contain(row, item))


def apriori(data, minsupp, k):
    """Find frequent itemsets level by level (Apriori).

    Args:
        data: Iterable of transactions, each an iterable of hashable items.
        minsupp: Minimum absolute support (number of transactions).
        k: Exclusive upper bound on the itemset size; sizes ``2 .. k - 1``
            are explored.
            NOTE(review): confirm whether size ``k`` itself was meant to be
            included.

    Returns:
        Dict mapping ``'k=1'`` and then ``'K=2'``, ``'K=3'``, ... to a dict of
        ``itemset tuple -> support``. Only non-empty levels are present.
        NOTE(review): the lower-case ``'k=1'`` versus upper-case ``'K=n'``
        keys are inconsistent; they are kept as-is because callers may
        depend on them.
    """
    candidate_set = {}
    transactions = []
    for row in data:
        for i in row:
            key = (i,)
            candidate_set[key] = candidate_set.get(key, 0) + 1
        # Sets make the repeated membership tests in contain() cheap.
        transactions.append(frozenset(row))
    frequently_set = pruning(candidate_set, minsupp)
    result = {'k=1': frequently_set}
    for n in range(2, k):
        # Count support against the transactions passed in, not against the
        # module-level list, so the ``data`` argument is actually honoured.
        candidate_set = connect(frequently_set, transactions)
        frequently_set = pruning(candidate_set, minsupp)
        # Stop only when the level is empty. The former ``<= 1`` test threw
        # away a level that held exactly one frequent itemset.
        if not frequently_set:
            return result
        result['K=' + str(n)] = frequently_set
    return result


def main():
    """Load the transactions, run Apriori and return the frequent itemsets."""
    try:
        filldata()
    finally:
        # Release the connection even when reading fails.
        client.close()
    # Return the result instead of discarding it in an unused local.
    return apriori(data, 30, 8)


if __name__ == '__main__':
    main()