hadoop2.2编程:用ruby跑hadoop的完整实例

Be careful!
Ruby must be installed on every node in the cluster!
 #!/usr/bin/ruby
 # map.rb -- mapper script.
 #
 # Reads records from ARGF (files named on the command line, or STDIN) and,
 # for every record that is at least two characters long, writes one line
 # to STDOUT of the form:
 #
 #   <key><tab><value><newline>
 #
 # where <key> is the lower-cased two-character prefix of the record and
 # <value> is always 1 (a single occurrence).

 ARGF.each_line do |record|
   # Strip the trailing newline before measuring the record.
   text = record.chomp

   # Records shorter than two characters have no usable prefix; skip them.
   if text && text.length >= 2
     prefix = text[0, 2].downcase
     count = 1
     puts "#{prefix}\t#{count}"
   end
 end
 #!/usr/bin/ruby
 # reduce.rb -- reducer script.
 #
 # Consumes "<key><tab><value>" lines in which equal keys are adjacent and
 # writes one "<key><tab><total>" line per key, where <total> is the sum of
 # the integer values seen for that key.

 # Sums the values of each run of identical keys.
 #
 # @param input  [#each] yields one line per iteration (ARGF, an IO, an Array
 #   of Strings, ...). Lines may or may not end with a newline.
 # @param output [#puts] destination for the "<key>\t<total>" lines.
 # @return [void]
 def reduce_counts(input, output = $stdout)
   prev_key = nil
   key_total = 0

   # Writes the finished key; keys whose total is not positive are not
   # printed (same filter the script has always applied).
   emit = lambda do
     output.puts "#{prev_key}\t#{key_total}" if prev_key && key_total > 0
   end

   input.each do |line|
     # split key and value on tab character
     key, value = line.chomp.split("\t")

     # A blank line has no key; ignoring it keeps the current run intact.
     next if key.nil?

     if key != prev_key
       emit.call
       # Always move on to the new key, even when nothing was printed,
       # so its values are never credited to the previous key.
       prev_key = key
       key_total = 0
     end

     # A missing or non-numeric value counts as 0 (String#to_i / nil.to_i).
     key_total += value.to_i
   end

   # The last key has no following key to trigger its output, so flush it here.
   emit.call
 end

 reduce_counts(ARGF)
 #!/bin/bash
 # Launches the Ruby mapper and reducer as a Hadoop Streaming job.

 HADOOP_HOME=/home/grid/hadoop
 # NOTE(review): the jar name looks like it lost its version segment
 # ("hadoop--streaming.jar"); set this to the streaming jar that actually
 # exists under $HADOOP_HOME in your installation.
 JAR=contrib/streaming/hadoop--streaming.jar

 HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"

 # Option notes. These are kept here rather than at the end of each line:
 # a backslash only continues the line when it is the very last character,
 # so "\  # comment" would terminate the command early.
 #   -mapper   'map.rb'     or -mapper  'ruby map.rb'
 #   -reducer  'reduce.rb'  or -reducer 'ruby reduce.rb'
 #   -file     the file path does not need to be a full path
 $HSTREAMING \
   -mapper  'map.rb' \
   -reducer 'reduce.rb' \
   -file map.rb \
   -file reduce.rb \
   -input '/user/grid/input/*' \
   -output '/user/grid/output'
 Command-line example:
%bin/hadoop jar ~/hadoop/contrib/streaming/hadoop--streaming.jar -input NCDC/files -output output -mapper Map.rb -reducer Reduce.rb
上一篇:洛谷P1168 中位数


下一篇:C++11 并发编程基础(一):并发、并行与C++多线程