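# A small demo of processing a CSV file with pandas; it is mainly useful for
# understanding the data structure of DataFrameGroupBy.
# The demo is not well written (it has two output files, one of which was
# bolted on afterwards -- typical patchwork code), but it covers a good number
# of common operations. For reference only.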
import sys
import pandas as pd
# Groups with fewer rows than this are dropped; larger ones are truncated to it.
MIN_ROWS_PER_GROUP = 300
# The per-group averages are always written to this file in the working directory.
AVERAGES_FILE = "avg.csv"


def min_max_normalize(series):
    """Rescale *series* linearly so its minimum maps to 0 and its maximum to 1.

    A constant series has a zero range; dividing by it would turn every value
    into NaN, so such a series is mapped to 0.0 instead (NaN entries stay NaN).

    Args:
        series: numeric pandas Series.

    Returns:
        A new Series with the same index as *series*.
    """
    lowest = series.min()
    value_range = series.max() - lowest
    shifted = series - lowest
    if value_range == 0:
        # Every non-missing value equals the minimum, so ``shifted`` is all zeros.
        return shifted.astype(float)
    return shifted / value_range


def collect_large_groups(frame, key="target", value="rating",
                         min_rows=MIN_ROWS_PER_GROUP):
    """Keep the first *min_rows* rows of every sufficiently large group.

    Part 1 of the demo: iterating a DataFrameGroupBy yields
    ``(group_name, group_frame)`` pairs, one per distinct value of *key*, and
    each ``group_frame`` is an ordinary DataFrame.

    Args:
        frame: DataFrame to group; row order inside each group follows the
            row order of *frame*.
        key: column to group by.
        value: column whose mean is computed for each kept group.
        min_rows: groups with fewer rows are skipped, the others are cut
            down to exactly this many rows.

    Returns:
        ``(selected, averages)`` where ``selected`` is the concatenation of
        the truncated groups (with a fresh 0..n-1 index) and ``averages``
        holds the mean of *value* for each kept group, in the same order.
        When no group qualifies, ``selected`` is an empty frame that still
        carries the columns of *frame*.
    """
    # NOTE(review): the original listing had lost its indentation; this assumes
    # truncation, averaging and counting all applied only to qualifying groups.
    kept = []
    averages = []
    for _, group in frame.groupby(key):
        if len(group) < min_rows:
            continue
        head = group.iloc[:min_rows]
        kept.append(head)
        averages.append(head[value].mean())

    if kept:
        # One concat at the end avoids re-copying the accumulated rows on
        # every iteration.
        selected = pd.concat(kept, ignore_index=True)
    else:
        # Keep the column names so the output file still gets a header row.
        selected = frame.iloc[0:0].reset_index(drop=True)
    return selected, averages


def main(argv=None):
    """Run the demo: ``INFILE OUTFILE``.

    Reads INFILE (UTF-8 CSV with ``time``, ``rating`` and ``target``
    columns), sorts it by ``time``, min-max normalizes ``rating``, writes the
    truncated large groups to OUTFILE and their average ratings to
    ``avg.csv``, and prints how many groups were kept.

    Args:
        argv: argument list without the program name; defaults to
            ``sys.argv[1:]``. Arguments after the second are ignored.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit("usage: <script> INFILE OUTFILE")
    infile, outfile = args[0], args[1]

    df = pd.read_csv(infile, encoding="utf-8")
    # A stable sort keeps rows with equal timestamps in file order, so the
    # truncation below always picks the same rows.
    df = df.sort_values(by="time", ascending=True, kind="mergesort")
    df["rating"] = min_max_normalize(df["rating"])

    selected, averages = collect_large_groups(df)
    print(len(averages))

    selected.to_csv(outfile, index=False)
    pd.DataFrame(data=averages, columns=["average"]).to_csv(
        AVERAGES_FILE, index=False
    )


# Part 2: looking at the grouping as a whole instead of group by group.
# ``df.groupby("target").size()`` returns a Series whose index holds the group
# keys and whose values hold the number of rows in each group:
#     sizes = df.groupby("target").size()
#     print(type(sizes))
#     print(sizes.values)
#     print(sizes.index)


if __name__ == "__main__":
    main()