协同过滤包括基于物品的协同过滤和基于用户的协同过滤,本文基于电影评分数据做基于用户的推荐
主要做三个部分:1、读取数据;2、构建用户与用户的相似度矩阵;3、进行推荐;
查看数据u.data
主要用到前3列,依次是:用户编号 user_id、电影编号 item_id、用户对电影的打分 score(第4列是时间戳,本文不使用)
由这个文件构建两张表:一张是 item→用户 的倒排表,用于计算用户与用户之间的相似度矩阵;另一张是 用户→item 的评分表,用于最后生成推荐
ubuntu@ubuntu-2:~/workspace/jupyter_project/recommendation$ head ./data/u.data
196	242	3	881250949
186	302	3	891717742
22	377	1	878887116
244	51	2	880606923
166	346	1	886397596
298	474	4	884182806
115	265	2	881171488
253	465	5	891628467
305	451	3	886324817
6	86	3	883603013
查看数据u.item
主要用到前两列:第一列是电影id item_id 第二列是电影名称
这个文件主要用于推荐结果展示
ubuntu@ubuntu-2:~/workspace/jupyter_project/recommendation$ head ./data/u.item 1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0 2|GoldenEye (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?GoldenEye%20(1995)|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0 3|Four Rooms (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0 4|Get Shorty (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)|0|1|0|0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0 5|Copycat (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Copycat%20(1995)|0|0|0|0|0|0|1|0|1|0|0|0|0|0|0|0|1|0|0 6|Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)|01-Jan-1995||http://us.imdb.com/Title?Yao+a+yao+yao+dao+waipo+qiao+(1995)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0 7|Twelve Monkeys (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Twelve%20Monkeys%20(1995)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|1|0|0|0 8|Babe (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Babe%20(1995)|0|0|0|0|1|1|0|0|1|0|0|0|0|0|0|0|0|0|0 9|Dead Man Walking (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Dead%20Man%20Walking%20(1995)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0 10|Richard III (1995)|22-Jan-1996||http://us.imdb.com/M/title-exact?Richard%20III%20(1995)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|1|0
代码如下
# coding: utf-8
"""User-based collaborative filtering on a MovieLens-style rating file.

The script has three steps:

1. ``read_data`` loads the ratings and the movie titles.
2. ``user_similarity`` builds a user-to-user similarity matrix.
3. ``Recommend`` scores the movies the target user has not rated yet.
"""
import math


def read_data(udata, uitem):
    """Load the rating file and the movie-title file.

    Args:
        udata: Path to a tab-separated file whose first three columns are
            user id, item id and an integer score.
        uitem: Path to a ``|``-separated, ISO-8859-1 encoded file whose first
            two columns are item id and movie title.

    Returns:
        A ``(user_movies, movies, user_item)`` tuple where

        * ``user_movies`` maps item id -> {user id: score}; it is the
          inverted index used to build the similarity matrix,
        * ``movies`` maps item id -> movie title,
        * ``user_item`` maps user id -> {item id: score}; it is used when
          producing recommendations.

        All ids are kept as strings exactly as they appear in the files.
    """
    user_movies = {}  # item -> {user: score}
    user_item = {}    # user -> {item: score}
    movies = {}       # item -> title

    # ``with`` guarantees the handles are closed even if parsing fails.
    with open(udata) as ratings_file:
        for line in ratings_file:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            user, item, score = line.split("\t")[:3]
            rating = int(score)
            user_movies.setdefault(item, {})[user] = rating
            user_item.setdefault(user, {})[item] = rating

    with open(uitem, encoding="ISO-8859-1") as titles_file:
        for line in titles_file:
            if not line.strip():
                continue
            item, name = line.split("|")[:2]
            movies[item] = name

    return user_movies, movies, user_item


def user_similarity(user_movies):
    """Build the user-to-user cosine similarity matrix.

    For two users ``u`` and ``v`` the similarity is
    ``|items(u) & items(v)| / sqrt(|items(u)| * |items(v)|)``.

    Args:
        user_movies: Mapping item id -> {user id: score}.

    Returns:
        Nested dict ``W`` with ``W[u][v]`` the similarity of ``u`` and ``v``.
        Only pairs sharing at least one item are present, and a user never
        appears in their own row.
    """
    C = {}  # C[u][v]: number of items rated by both u and v
    N = {}  # N[u]: number of items rated by u

    for raters in user_movies.values():
        for user in raters:
            N[user] = N.get(user, 0) + 1
            co_counts = C.setdefault(user, {})
            for user2 in raters:
                if user2 != user:
                    co_counts[user2] = co_counts.get(user2, 0) + 1

    W = {}
    for user, co_counts in C.items():
        # Normalise by BOTH users' rating counts; using N[user] twice makes
        # the matrix asymmetric and is not cosine similarity.
        W[user] = {
            user2: common / math.sqrt(N[user] * N[user2])
            for user2, common in co_counts.items()
        }
    return W


def Recommend(user, user_item, W, N, M):
    """Recommend up to ``M`` unseen items for ``user``.

    Args:
        user: Id of the target user.
        user_item: Mapping user id -> {item id: score}.
        W: Similarity matrix as returned by ``user_similarity``.
        N: Number of most similar users to take into account.
        M: Maximum number of recommendations to return.

    Returns:
        A list of ``(item id, rank score)`` tuples sorted by descending score.
        The list is empty when ``user`` has no entry in ``W``.

    Raises:
        ValueError: If a neighbour's score is not positive (``math.log``).
    """
    rank = {}  # item -> accumulated recommendation score
    already_rated = user_item.get(user, {})
    neighbours = sorted(
        W.get(user, {}).items(), key=lambda x: x[1], reverse=True
    )[:N]

    for user2, w_score in neighbours:
        for item, score in sorted(user_item.get(user2, {}).items()):
            if item in already_rated:
                continue
            # Accumulate over all neighbours; plain assignment would let the
            # least similar neighbour overwrite the others' contribution.
            rank[item] = rank.get(item, 0.0) + w_score * math.log(score)

    return sorted(rank.items(), key=lambda x: x[1], reverse=True)[:M]


if __name__ == "__main__":
    # NOTE(review): the user id in the original call was lost (the string
    # literal was truncated); "1" is a placeholder - confirm the intended id.
    target_user = "1"

    print ("#导入数据")
    user_movies, movies, user_item = read_data("./data/u.data", "./data/u.item")
    print("#计算相似度矩阵")
    W = user_similarity(user_movies)
    print ("#计算推荐结果")
    result = Recommend(target_user, user_item, W, 2, 10)
    print ("#结果展示")
    print ("你可能会喜欢")
    for line in result:
        print (movies[line[0]])