Be careful!
All nodes in the cluster need to have Ruby installed!
#!/usr/bin/ruby
# frozen_string_literal: true
#
# map.rb — Hadoop Streaming mapper.
# Reads lines from ARGF (stdin or file arguments) and emits one
# "<key>\t1" record per line, where the key is the lower-cased
# two-character prefix of the line.

# Build the mapper output record for a single raw input line.
# Returns "<key>\t1", or nil when the line is shorter than two
# characters (such lines are skipped).
def map_line(line)
  # remove any trailing newline
  line = line.chomp
  # skip lines shorter than 2 characters
  # (the original also tested `! line`, but String#chomp never
  # returns nil, so that check was dead code)
  return nil if line.length < 2
  # key is the two-character prefix, lower-cased
  key = line[0, 2].downcase
  # value is a count of 1 occurrence: <key><tab><value>
  "#{key}\t1"
end

ARGF.each do |line|
  record = map_line(line)
  puts record if record
end
#!/usr/bin/ruby
# frozen_string_literal: true
#
# reduce.rb — Hadoop Streaming reducer.
# Input lines are "<key>\t<count>" pairs, already sorted/grouped by
# key (guaranteed by the Hadoop shuffle phase). Emits one
# "<key>\t<total>" line per distinct key.

# Aggregate an enumerable of tab-separated key/count lines into an
# array of "<key>\t<total>" output lines, one per distinct key.
# Assumes identical keys arrive contiguously.
def reduce_pairs(lines)
  output = []
  prev_key = nil
  key_total = 0
  lines.each do |line|
    # split key and value on the tab character
    key, value = line.chomp.split(/\t/)
    if prev_key && key != prev_key
      # key changed: flush the finished key's total
      output << "#{prev_key}\t#{key_total}"
      key_total = 0
    end
    prev_key = key
    key_total += value.to_i
  end
  # BUG FIX: the original loop never flushed the final key, so the
  # last group's total was silently dropped. Emit it here.
  output << "#{prev_key}\t#{key_total}" if prev_key
  output
end

reduce_pairs(ARGF).each { |record| puts record }
#!/bin/bash
# Submit the map.rb / reduce.rb word-prefix job via Hadoop Streaming.
#
# FIX: the original placed "# ..." comments directly after the
# line-continuation backslashes, which breaks the command (the
# backslash escapes the following space and the comment text is
# passed as arguments). Comments now live on their own lines.

HADOOP_HOME=/home/grid/hadoop
# NOTE(review): the original jar name was "hadoop--streaming.jar",
# which looks like a mangled version placeholder; adjust this glob
# to match the installed jar, e.g. hadoop-0.20.2-streaming.jar.
JAR=contrib/streaming/hadoop-*-streaming.jar
HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"

# -mapper / -reducer may also be given as 'ruby map.rb' / 'ruby reduce.rb'.
# -file ships the local scripts with the job; a full path is not required.
$HSTREAMING \
  -mapper 'map.rb' \
  -reducer 'reduce.rb' \
  -file map.rb \
  -file reduce.rb \
  -input '/user/grid/input/*' \
  -output '/user/grid/output'
cmd line: % bin/hadoop jar ~/hadoop/contrib/streaming/hadoop-*-streaming.jar -input NCDC/files -output output -mapper map.rb -reducer reduce.rb