Hadoop 二次排序

需求

求每年的最高气温,结果按年份升序输出

数据源内容如下

temperature.txt

2004 49
1981 -22
1981 -31
1965 -47
2027 -2
1964 6
2030 38
2016 -33
1963 13
2000 21
2019 0
2049 43
2039 8
1989 -18
2017 49
1952 -47
2016 -28
1991 20
1967 -39
2022 -47
2041 41
2039 -38
2021 33
1969 38
1981 0
1960 -26
2023 -12
1969 12
1996 -31
1954 -36
2026 34
2013 -4
1969 37
1990 -22
2007 -31
1987 -8
1972 -30
2019 -17
2042 -22
2011 21
2033 -25
2013 10
2047 30
2008 -2
2047 -5
1994 14
1960 7
2037 44
1990 -41
2047 32
2048 -22
1977 -27
2049 35
2023 2
1952 -44
1979 -5
1996 47
2033 8
2006 3
2030 32
1967 43
1980 -6
2001 39
2049 -31
2028 -16
2029 31
1962 -21
2043 -7
2040 34
2001 9
1977 -21
2047 1
2022 30
2002 12
1956 38
2009 7
2049 11
1981 18
2014 -29
1967 -15
2019 2
1975 25
1965 21
2013 -36
2024 -44
1959 10
1992 4
1997 15
2042 17
2013 -14
1993 -21
2027 19
2016 -44
1989 -47
1999 -6
1993 -35
1953 -21
1952 12
1969 -45
2036 10
1950 29
2022 8
1985 -45
2044 -48
1981 -12
2033 -42
1973 -49
2011 27
1958 -26
2028 35
2037 41
1955 -36
2001 -11
1965 23
1970 -14
2015 -2
1969 -19
1997 3
2016 -38
2045 9
1974 6
1956 -39
2012 1
2022 -28
1991 -31
1974 -40
1998 43
2007 12
2049 9
2034 -18
1956 48
1974 40
2009 -24
2030 -44
1957 27
1979 -23
2034 29
2024 -34
2034 -10
2007 42
2000 33
1990 -44
2048 -48
1967 -30
1969 12
2030 26
2023 -36
2029 22
2044 -2
2043 -47
2040 -18
1990 -3
1996 -16
1974 -20
2023 -11
1990 -16
1980 13
2013 -8
2001 41
2015 -30
1974 28
2031 13
1991 -33
1985 -6
1979 -34
2041 12
1957 -46
2014 25
1969 18
1958 -39
1955 -46
2031 39
2032 11
1991 38
2035 -43
2005 -1
2000 2
2027 -28
1984 -8
1985 -47
2045 -6
1987 -21
2004 35
1968 -47
1968 -19
1995 -47
1990 46
1987 18
2012 29
1987 -12
2048 -8
1987 26
2010 18
1959 -20
1978 8
1997 38
1963 24
1991 8
2005 -34
2019 -4
2042 43
1951 6
1956 -32
1952 18
2003 -15
1979 29
2026 35
2032 -26
2044 -25
2039 -36
2021 49
2037 6
2000 -22
2027 34
2024 38
2019 15
1954 -27
2016 49
2018 -43
2048 23
1978 9
1977 5
2047 -30
2028 -12
1991 -25
2022 -36
1974 -2
2038 25
2014 10
2000 -7
2033 16
2020 5
1985 7
1951 -1
1958 -8
1963 -3
1972 10
1986 9
1961 3
1972 -20
1979 -39
1958 44
2027 -48
2007 -50
2025 33
1970 22
2044 27
2043 -48
1950 1
2023 31
2041 -39
2040 43
2025 21
2038 39
1998 16
1987 -50
1967 -40
2021 -27
1961 6
1981 22
1990 7
1993 -49
2001 -5
2003 21
1990 47
1986 -19
2031 37
1987 -14
2019 16
2008 45
2044 1
1977 5
1952 10
2047 5
2044 21
2002 29
1992 28
1980 -2
1952 -47
2008 15
2017 17
1970 1
2045 -37
2016 5
1951 -28
1978 5
1954 9
1966 18
1957 45
1998 -26
1989 0
1964 10
2036 -44
2037 -22
1965 12
2035 40
1994 7
2024 7
1961 4
2007 34
1980 -36
1950 -39
1987 24
1983 -4
2007 46
2009 -5
1974 43
2026 26
1966 -21
2006 -21
1977 -3
1979 -31
2021 33
2040 39
2020 47
1953 -42
1955 2
2017 0
1973 31
1955 4
1973 -7
2027 28
1968 -17
2029 -3
2021 13
1991 9
2030 19
1952 -35
1987 14
1954 -18
2027 -23
1989 12
1983 13
1966 -45
2039 33
2014 34
2012 -30
1953 -7
2020 -21
1987 22
2041 45
2046 0
2017 26
1951 9
2000 -4
1973 27
1972 -3
2036 -14
1974 32
1987 -8
1993 3
1969 17
2011 -11
2038 -50
2040 -8
1950 -22
2036 13
2025 29
1986 27
2038 41
1971 37
1970 45
2045 -21
2036 41
1956 1
2042 -48
1955 -28
1967 -34
1999 -42
1952 -9
1962 -15
1974 -19
1959 19
1965 -42
1962 41
2003 -12
2029 14
1969 26
1992 -4
1959 8
1962 -18
2000 8
2025 -20
2048 -15
1996 25
2017 -23
1992 -10
2001 30
1960 45
2034 33
1983 -47
2046 19
2041 -4
1978 -6
1967 -49
1993 8
1987 -11
2009 3
1990 40
1972 -6
2029 -47
1990 3
2036 4
1981 22
2019 37
1980 -47
2003 -42
1965 -6
2007 45
2040 -45
1984 24
2048 -15
1984 -16
1992 -39
2040 -33
1984 -24
2046 28
2023 -3
1956 46
1969 0
1983 -4
2030 -50
2004 -36
1958 16
2025 -22
1957 -6
2001 -24
2014 -49
1965 16
2043 42
1966 -10
1971 -13
1996 48
1976 11
2026 -43
1982 2
1965 -50
2038 40
2024 -32
1988 3
2004 -45
2039 8
2029 -30
1974 -11
2033 29
1968 -2
2040 -8
1989 -11
1999 7
2001 37
2001 -44
1979 -30
2048 7
1998 -21
2005 49
1975 44
2031 31
1982 12
1987 35
2004 -33
2000 27
2008 34
1970 -26
2047 0
1974 35
1977 -45
1976 19
1956 48
2025 -37
1991 0
2041 -40
1976 38
2016 36
2024 6
2021 14
2005 27
1951 -38
2046 16
1976 26
2044 -44
1989 -47
2025 26
2045 43
2045 -23
2004 30
2044 46
1962 -20
1954 7
1975 -39
1967 18
2038 4
1956 15
2010 -14
2032 -6
1999 19
2024 7
1993 -23
1961 -43
2007 23
1998 9
2027 -29
1950 29
2010 -47
1953 43
2033 -19
1977 28
2013 -36
2001 43
2008 46
2004 19
1985 6
2043 3
2014 -21
1992 7
1990 8
2020 44
1957 -40
2030 5
1996 16
2018 -5
1989 -14
2016 -11
1988 -18
2012 -3
1998 -12
1979 -41
2043 1
1978 -12
1959 -29
2048 -26
1989 -31
2026 33
1960 32
1978 14
2003 36
2012 15
2036 34
2040 -49
1986 7
1982 19
1959 42
2041 23
2037 20
2020 -24
1977 -27
2039 18
2046 2
2017 -23
2012 30
1962 28
1985 42
2023 15
2030 -30
1983 28
1967 26
1990 -11
1968 -50
2038 -11
1995 34
2005 -43
2011 5
1978 9
1952 -48
1955 27
1958 -21
2020 -36
1985 -23
1991 10
1982 -17
1999 3
1999 -25
2005 -11
2048 -14
1985 -18
2006 -5
1970 -21
2026 -26
1956 -20
2043 -50
1982 -24
1998 8
2034 28
1966 -10
2045 5
1968 -49
2001 48
2026 -9
2005 49
2036 39
2027 -45
1972 -24
2009 -49
1961 38
1991 36
1975 37
1978 12
2003 -45
2021 -46
1962 -8
1972 -8
1961 39
2009 23
1995 30
1996 -19
1983 45
1952 19
1974 -24
1992 33
1981 -1
1981 -32
1984 0
2049 -41
2030 13
1993 -27
1980 -45
1964 -10
2013 39
1975 24
1972 43
1977 -33
1962 -44
2016 -22
2029 47
1999 41
2030 -17
2023 36
2018 32
2025 20
1966 14
1986 29
2036 -20
2022 -36
2027 -46
1994 -8
1992 34
2017 1
2021 32
1966 28
1987 -22
1996 26
1991 48
1993 4
1973 -28
1981 -16
2011 45
1963 -14
1986 -50
1984 -26
1980 30
2024 42
1979 31
2030 3
2035 17
2036 30
2017 -43
1997 9
2004 -25
1999 40
1993 16
1965 -42
2043 24
2017 29
2034 -39
1952 -49
2023 26
1999 -31
1986 23
1962 -10
1960 22
2036 -30
2044 38
2014 -50
1986 0
2024 -40
1962 -15
1950 11
2019 30
1980 -16
1992 -18
1994 -40
1989 33
1999 23
1999 -38
2021 -38
2033 17
1995 -2
2034 -9
2017 -36
1956 -41
1961 1
2020 46
1991 -17
2026 2
2004 9
1976 -7
1956 -4
1981 41
2014 0
1975 -41
2005 47
1966 -47
1968 -27
1953 48
2028 32
1963 40
1982 34
2031 27
2008 1
2037 10
2000 -1
2038 -4
2044 -12
1960 -4
2014 10
2038 -42
1964 -48
1994 -47
1953 -30
1987 -24
2038 5
2027 43
1991 7
2015 21
2038 -2
1999 28
2026 -50
1986 25
2041 -24
2029 -1
2008 18
1952 -41
1969 -50
1973 6
1956 -20
1966 -21
1967 44
1967 39
2035 16
1973 -45
2035 38
1958 22
2000 -6
2004 16
2004 16
2037 -38
2028 -47
1957 -41
1985 41
2028 -3
2014 -32
1980 -14
1960 13
2012 10
1960 -27
1983 -6
1953 8
1954 -42
1979 43
1992 -48
1976 19
1964 -11
1970 -14
2042 -10
1990 -36
1987 -8
2023 31
1959 -12
2008 -40
2033 7
2012 46
2002 -3
1992 -35
2044 17
2010 14
2018 -35
1961 26
2004 -24
2045 33
1965 -9
1970 -16
1977 40
2030 -42
2046 -30
1963 36
2019 -47
2020 -12
2026 -27
1994 21
1951 27
1999 -10
1990 36
2003 -8
1984 31
2015 -26
2015 14
1981 -20
1971 -47
2033 -4
1976 -29
2037 25
2013 33
2011 1
2000 -27
2037 31
1960 8
2048 -26
2037 -8
2039 42
1986 -38
2038 13
1984 -44
2049 -43
2012 3
1962 -39
1959 3
1979 -3
1996 -1
1983 27
1950 -43
1957 36
1951 -28
2010 44
2045 -22
2023 0
2038 37
2011 -30
2009 4
1952 47
1965 -35
2005 -35
1954 -9
2040 14
1987 -24
1978 -15
2009 22
1964 48
2003 -38
1969 -20
1983 -47
2030 13
1990 -45
2013 42
1988 -26
2017 9
2041 -43
1964 -20
2005 30
2024 25
2043 26
1993 27
2018 -41
2008 -14
2013 16
2028 44
1967 29
1973 -5
2027 -38
1954 -12
1963 -21
2008 -3
2049 -14
2022 -34
1976 -39
1976 13
2007 30
2032 -15
2007 -7
2028 -37
2012 29
2029 -7
2002 19
2046 -1
1979 0
2008 -17
1980 42
1986 28
1957 -5
1966 48
1994 43
2047 23
2024 -37
1974 -36
2022 -29
2040 -21
2004 12
1978 40
1982 -22
1984 -8
2030 6
1968 -3
1965 32
1998 -15
2039 10
2033 36
1977 36
2045 43
2045 -17
2021 38
1969 -43
2021 -7
2018 10
2008 40
2012 31
2011 28
1999 -36
1985 -18
2008 4
2040 -46
1954 33
2035 -28
1980 -3
2038 20
1959 29
1979 13
2006 8
2029 22
1962 -44
1978 37
1993 -3
1988 23
1991 39
2013 8
1955 43
1973 0
1976 -3
1963 3
2031 -15
2003 31
2002 16
1981 -44
1959 19
2023 -34
2039 4
1994 -21
1951 36
1997 11
2013 13
1950 32
2020 -12
2016 -22
2009 -38
2031 13
1986 -43
1959 28
2049 10
1954 -45
2018 -1
2008 48
2034 -41
1982 -2
1972 -11
2045 -34
1958 10
1997 31
2013 -13
2025 -19
2038 -32
2041 -21
2013 0
2034 3
2036 -23
2008 -22
2034 3
2042 41
2002 1
2043 -2
1950 19
2041 21
2005 -16
2030 -36
2001 45
1964 33
2027 -25
2046 -5
2044 -42
1965 -37
2004 22
2029 46
1966 7
2008 -48
2016 -22
2033 -28
1999 -33
1987 11
1995 18
1969 -13
2023 9
2018 1
2015 39
2017 31
1975 44
1991 32
2045 10
2046 -35
1952 40
1950 -38
1996 -39
2031 14
2037 -48
2002 41

思路

需要排序2次:先按年份排序,再在相同年份内按温度排序以取得最高值,也就是说需要对2个维度的值做排序。但是hadoop只能够在key上进行排序,所以气温和年份的值都得放在key里面,也就是需要创建自定义组合key

pom依赖

<!-- Hadoop client dependency (version 2.7.3); the Java code in this article imports org.apache.hadoop.* classes -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.7.3</version>
</dependency>

代码

1.自定义key

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 所有自定义的组合key应该实现接口WritableComparable,WritableComparable接口继承自writable和comparable这两个接
 因为writable接口是可序列化的并且可比较的。WritableComparable。组合key按照年份升序按照气温降序
 */

public class ComboKey implements WritableComparable<ComboKey> {
    private int year ;
    private int temp ;

    public int getYear() {
        return year;
    }

    public void setYear(int year) {
        this.year = year;
    }

    public int getTemp() {
        return temp;
    }

    public void setTemp(int temp) {
        this.temp = temp;
    }

    /**
     * 对key进行比较实现
     */
    public int compareTo(ComboKey o) {
        int y0 = o.getYear();
        int t0 = o.getTemp();

        if(year == y0){
            //气温降序
            return -(temp - t0) ;
        }
        //年份升序
        else{
            return year - y0 ;
        }
    }



    /**
     * 串行化过程
     */
    public void write(DataOutput out) throws IOException {
        //年份
        out.writeInt(year);
        //气温
        out.writeInt(temp);
    }

    ////反串行化的过程
    public void readFields(DataInput in) throws IOException {
        year = in.readInt();
        temp = in.readInt();
    }
}

2.自定义分区

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

//该分区类按照年份进行分区,相同的年份会进入到同一个分区中去
public class YearPartitioner extends Partitioner<ComboKey,NullWritable> {
    public int getPartition(ComboKey key, NullWritable nullWritable, int numPartitions) {
        int year = key.getYear();
        return year % numPartitions;
    }
}

3.排序对比器 ComboKeyComparator

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 同时完成Combokey中的first和second排序。
 */
public class ComboKeyComparator extends WritableComparator {

    protected ComboKeyComparator() {
        super(ComboKey.class, true);
    }

    public int compare(WritableComparable a, WritableComparable b) {
        ComboKey k1 = (ComboKey) a;
        ComboKey k2 = (ComboKey) b;
        return k1.compareTo(k2);
    }
}

4.分组函数

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 分组在Reduce阶段,构造一个与 Key 相对应的 Value 迭代器的时候,只要year相同就属于同一个组,放在一个Value迭代器,不同的year按照年份升序进行排序。
 最后,ComboKey的year相同,但是temp不同的数据会进入一组,并且按temp降序排列。如:
 2018 40
 2018 38
 2018 37
 分组后的第一条数据就是我们需要的(也就是reduce阶段的key)
 */
public class YearGroupComparator extends WritableComparator {
    protected YearGroupComparator() {
        super(ComboKey.class, true);
    }

    public int compare(WritableComparable a, WritableComparable b) {
        ComboKey k1 = (ComboKey)a ;
        ComboKey k2 = (ComboKey)b ;
        return k1.getYear() - k2.getYear() ;
    }
}

5.Map

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Map side: turns each input line of the form {@code "<year> <temperature>"} into a
 * {@code ComboKey}; the output value is always {@code NullWritable}.
 *
 * <p>The input key is the {@code LongWritable} supplied by the input format and is
 * not used.
 */
public class MaxTempMapper extends Mapper<LongWritable,Text,ComboKey,NullWritable>{
    /** Counter group under which skipped input lines are reported. */
    private static final String COUNTER_GROUP = "MaxTempMapper";
    /** Counter name for lines that could not be parsed as two integers. */
    private static final String MALFORMED_RECORDS = "MALFORMED_RECORDS";

    /**
     * Parses one line and emits its composite key.
     *
     * <p>Blank lines are ignored. Lines with fewer than two whitespace-separated
     * fields, or whose first two fields are not integers, are skipped and counted
     * instead of failing the whole task.
     *
     * @param key     input key from the input format (unused)
     * @param value   one line of input text
     * @param context task context used to emit output and update counters
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        System.out.println("MaxTempMapper.map");

        String line = value.toString().trim();
        if (line.isEmpty()) {
            // Nothing to parse on a blank line.
            return;
        }

        // Split on any run of whitespace so repeated spaces or tabs do not produce empty fields.
        String[] arr = line.split("\\s+");
        if (arr.length < 2) {
            context.getCounter(COUNTER_GROUP, MALFORMED_RECORDS).increment(1);
            return;
        }

        ComboKey keyout = new ComboKey();
        try {
            keyout.setYear(Integer.parseInt(arr[0]));
            keyout.setTemp(Integer.parseInt(arr[1]));
        } catch (NumberFormatException e) {
            // Non-numeric field: record it and move on rather than killing the task.
            context.getCounter(COUNTER_GROUP, MALFORMED_RECORDS).increment(1);
            return;
        }
        context.write(keyout, NullWritable.get());
    }
}

6.reduce

/**
 * Reduce端,将组合key切割成key为year,value为气温的一个列表
 */
public class MaxTempReducer extends Reducer<ComboKey, NullWritable, IntWritable, IntWritable>{

    protected void reduce(ComboKey key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        int year = key.getYear();
        int temp = key.getTemp();
        for(NullWritable v : values){
            System.out.println(key.getYear() + " : " + key.getTemp());
        }
        context.write(new IntWritable(year),new IntWritable(temp));
    }
}

7.APP

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.File;


/**
 * Driver for the secondary-sort job: finds the highest temperature of every year,
 * with years in ascending order.
 *
 * <p>Input format, one record per line:
 * <pre>
 * 2004 49
 * 1981 -22
 * 1981 -31
 * 1965 -47
 * 2017 -2
 * </pre>
 */
public class APP {
    /** Execution mode: {@code "local"} (local file system) or {@code "cluster"} (HDFS). */
    public static String run_mode = "local";

    /**
     * Configures and runs the job, then exits with a non-zero status if it failed.
     *
     * @param args command-line arguments (currently unused; paths are fixed per run mode)
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String inputPath;
        String outputPath;

        if ("local".equals(run_mode)) {
            // Local run: clear the previous output directory first.
            File dir = new File("c:\\out");
            if (dir.exists()) {
                APP.delFile(dir);
            }
            conf.set("fs.defaultFS", "file:///");
            inputPath = "C://temperature.txt";
            outputPath = "C://out";
        } else if ("cluster".equals(run_mode)) {
            // Linux cluster run against HDFS.
            conf.set("fs.defaultFS", "hdfs://master:9000");
            inputPath = "/temperature.txt";
            outputPath = "/out";
        } else {
            throw new IllegalArgumentException("Unsupported run_mode: " + run_mode);
        }

        // Create the job only after conf is fully populated: Job.getInstance takes a
        // copy of the configuration, so settings applied to conf afterwards would not
        // reach the job.
        Job job = Job.getInstance(conf);

        // Input and output locations (could be taken from args[0] / args[1] instead).
        FileInputFormat.addInputPath(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // General job properties.
        job.setJobName("WCApp");                        // job name
        job.setJarByClass(APP.class);                   // class used to locate the job jar

        job.setMapperClass(MaxTempMapper.class);        // mapper class
        job.setReducerClass(MaxTempReducer.class);      // reducer class

        job.setNumReduceTasks(1);                       // number of reduce tasks

        job.setMapOutputKeyClass(ComboKey.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Partitioner: by year.
        job.setPartitionerClass(YearPartitioner.class);
        // Grouping comparator: keys with the same year form one reduce group.
        job.setGroupingComparatorClass(YearGroupComparator.class);
        // Sort comparator. It appears to be optional here, since without it the
        // default comparator would still end up calling ComboKey.compareTo.
        job.setSortComparatorClass(ComboKeyComparator.class);

        // Surface a failed job through the process exit status instead of ignoring it.
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            System.exit(1);
        }
    }

    /**
     * Deletes a file, or a directory together with everything beneath it.
     *
     * @param file the file or directory to remove
     * @return {@code false} if {@code file} does not exist, otherwise the result of
     *         deleting {@code file} itself
     */
    static boolean delFile(File file) {
        if (!file.exists()) {
            return false;
        }

        if (file.isDirectory()) {
            File[] files = file.listFiles();
            // listFiles() returns null when the directory cannot be read.
            if (files != null) {
                for (File f : files) {
                    delFile(f);
                }
            }
        }
        return file.delete();
    }
}

 

上一篇:43-Kruskal 算法


下一篇:OCP-052考试题库汇总(43)-CUUG内部解答版