1.
原始表
CREATE TABLE ml_100k (userid INT, movieid INT, rating INT, unixtime STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n' STORED AS TEXTFILE;
数据清洗后的表
CREATE TABLE ml_100k2 (userid INT, movieid INT, rating INT, weekday int) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n' STORED AS TEXTFILE;
3.数据导入
LOAD DATA LOCAL INPATH '/home/centos/ml-100k/u.data' into table ml_100k;
4.脚本编写和脚本加载
clean_ml_100k.py
import sys import datetime for line in sys.stdin: #接收输入 line = line.strip() userid, movieid, rating, unixtime = line.split('\t') weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday() #转化unixtime时间戳为日期时间,获取对应的星期值 print('\t'.join([userid, movieid, rating, str(weekday)])) # 输出清洗后的数据
然后 hdfs dfs -put
add file /home/centos/clean_ml_100k.py;
5.数据清洗+转储
INSERT OVERWRITE TABLE ml_100k2 SELECT TRANSFORM (userid, movieid, rating, unixtime) --输入值(基表) USING 'python clean_ml_100k.py' --使用脚本清洗 AS (userid, movieid, rating, weekday) --输出值(子表) FROM ml_100k;
然后我就失败了