Hive--使用Python脚本(TRANSFORM)实现数据处理

-- Raw data table: user id, movie id, the user's rating, and the time the user watched the movie
CREATE TABLE u_data (
    userid   INT,
    movieid  INT,
    rating   INT,
    unixtime STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;

-- Load the data file from the local filesystem into u_data
LOAD DATA LOCAL INPATH '/export/datas/u.data'
INTO TABLE u_data;

-- Query the data: count the rows in u_data
SELECT COUNT(*)
FROM u_data;

-- New table: user id, movie id, the user's rating, and the day of the week of the user's timestamp
CREATE TABLE u_data_new (
    userid  INT,
    movieid INT,
    rating  INT,
    weekday INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';


--创建Python脚本实现将原始表的时间转为对应的星期几
vim /export/datas/weekday_mapper.py

import sys
import datetime


def map_line(line):
    """Convert one tab-separated input row into its output row.

    The input row carries four fields: userid, movieid, rating, unixtime.
    The first three are passed through unchanged; unixtime (seconds since
    the epoch) is replaced by its ISO weekday number (1 = Monday ... 7 = Sunday).

    Args:
        line: one raw input line; surrounding whitespace is stripped first.

    Returns:
        The four output fields joined by tabs, without a trailing newline.

    Raises:
        ValueError: if the line does not split into exactly four fields or
            unixtime is not a number.
    """
    userid, movieid, rating, unixtime = line.strip().split('\t')
    # fromtimestamp() without a tz argument uses the local time zone of the
    # machine the script runs on, so the weekday is a local-time weekday.
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    return '\t'.join([userid, movieid, rating, str(weekday)])


if __name__ == '__main__':
    # Rows arrive on stdin, one per line; one converted row per line goes to stdout.
    for line in sys.stdin:
        # print(...) with a single argument is valid under both Python 2 and
        # Python 3; the bare `print x` statement is a SyntaxError on Python 3.
        print(map_line(line))
  
  
-- Load the Python script, then write the transformed data into the new table
ADD FILE /export/datas/weekday_mapper.py;

-- The columns of u_data are passed through weekday_mapper.py; the script's
-- four output columns overwrite the contents of u_data_new
INSERT OVERWRITE TABLE u_data_new
SELECT TRANSFORM (userid, movieid, rating, unixtime)
    USING 'python weekday_mapper.py'
    AS (userid, movieid, rating, weekday)
FROM u_data;

-- Count how many times users watched movies on each day of the week
SELECT weekday, COUNT(*)
FROM u_data_new
GROUP BY weekday;
上一篇:某科学的分支语句 --- switch


下一篇:Day5-JS-其他对象