一个mapreduce得到需要计算单词概率的基础数据

2023-01-09 19:08:59

第一步，先统计计算概率所需的基础数据：词频、单词种类数、各类别的单词总数（类别均按文件夹名区分）。基础数据已经分词，每个单词一行，并且已经预处理好。

package org.lukey.hadoop.classifyBayes;

import java.io.IOException;

import java.net.URI;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.FSDataOutputStream;

import org.apache.hadoop.fs.FileStatus;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IOUtils;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Counter;

import org.apache.hadoop.mapreduce.Counters;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

/**

 *

 * 一次将需要的结果都统计到对应的文件夹中 AFRICA 484017newsML.txt afford 1

 *

 * 按照这个格式输出给后面处理得到需要的： 1. AFRICA 484017newsML.txt AFRICA 487141newsML.txt

 * 类别中的文本数， ---> 计算先验概率(单独解决这个) 所有类别中的文本总数， ---> 可以由上面得到，计算先验概率

 *

 * 2. AFRICA afford 1 AFRICA boy 3 每个类中的每个单词的个数，---> 计算各个类中单词的概率

 *

 * 3. AFRICA 768 类中单词总数， ---> 将2中的第一个key相同的第三个数相加即可

 *

 * 4. AllWORDS 12345 所有类别中单词种类数 ---> 将1中的第三个key归并，计算个数

 *

 */

public class MyWordCount {

    // Shared MultipleOutputs handle: assigned in First_Reducer.setup, used in
    // First_Reducer.reduce and closed in First_Reducer.cleanup.
    private static MultipleOutputs<Text, IntWritable> mos;

    // NOTE(review): not referenced anywhere in the visible code.
    static String baseOutputPath = "/user/hadoop/test_out";

    // Two maps used to count the documents of each class:
    // fileCountMap: class (directory) name -> distinct file names seen by First_Mapper.map.
    // fileCount:    class (directory) name -> number of files, filled in First_Mapper.cleanup.

    private static Map<String, List<String>> fileCountMap = new HashMap<String, List<String>>();

    private static Map<String, Integer> fileCount = new HashMap<String, Integer>();

    // static Map<String, List<String>> wordsCountInClassMap = new

    // HashMap<String, List<String>>();

    // Job counters. Only TOTALWORDS is used in the visible code: First_Reducer
    // increments it once per distinct word and main() reads it after the job.
    // NOTE(review): CLSASS_NUMBER (sic) and CLASS_WORDS are never referenced here.
    static enum WordsNature {

        CLSASS_NUMBER, CLASS_WORDS, TOTALWORDS

    }

    /**
     * Configures and runs the word-frequency job, then stores the value of the
     * {@code TOTALWORDS} counter (number of distinct words) in
     * {@code /user/hadoop/output/totalwords.txt} as {@code <displayName>:<value>}.
     *
     * <p>Input root, output directory and the prior-probability file location are
     * hard-coded; {@code args} is not consulted.
     *
     * @param args ignored
     * @throws Exception if job setup, job execution or the HDFS write fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // File that First_Mapper.cleanup writes the per-class document counts and priors to.
        String priorProbality = "hdfs://192.168.190.128:9000/user/hadoop/output/priorP/priorProbality.txt";
        conf.set("priorProbality", priorProbality);

        // [0] = corpus root whose sub-directories are the classes, [1] = job output directory.
        String[] otherArgs = { "/user/hadoop/input/NBCorpus/Country", "/user/hadoop/mid/wordsFre" };

        Job job = new Job(conf, "file count");
        job.setJarByClass(MyWordCount.class);
        job.setMapperClass(First_Mapper.class);
        job.setReducerClass(First_Reducer.class);

        // Only class directories that pass the size filter in getSecondDir become inputs.
        List<Path> inputPaths = getSecondDir(conf, otherArgs[0]);
        for (Path path : inputPaths) {
            FileInputFormat.addInputPath(job, path);
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        int exitCode = job.waitForCompletion(true) ? 0 : 1;

        // Read the distinct-word counter maintained by First_Reducer.
        Counters counters = job.getCounters();
        Counter c1 = counters.findCounter(WordsNature.TOTALWORDS);
        System.out.println("-------------->>>>: " + c1.getDisplayName() + ":" + c1.getName() + ": " + c1.getValue());

        // Persist the number of distinct words for the next step.
        Path totalWordsPath = new Path("/user/hadoop/output/totalwords.txt");
        FileSystem fs = FileSystem.get(conf);
        FSDataOutputStream outputStream = fs.create(totalWordsPath);
        try {
            outputStream.writeBytes(c1.getDisplayName() + ":" + c1.getValue());
            // Close explicitly so that a failed flush surfaces as an exception
            // instead of being swallowed by IOUtils.closeStream.
            outputStream.close();
        } finally {
            // Safety net for the failure path; closing twice is harmless.
            IOUtils.closeStream(outputStream);
        }

        System.exit(exitCode);
    }

    // Mapper

    /**
     * Mapper of the word-frequency job. The input is expected to hold one word per
     * line; the class of a word is the name of the directory containing its file.
     *
     * <p>For every input line it emits three records:
     * <ul>
     * <li>{@code <class>\t<word> -> 1} (frequency of the word inside the class),</li>
     * <li>{@code <class> -> 1} (total number of words of the class),</li>
     * <li>{@code <word> -> 0} (marker used by the reducer to count distinct words).</li>
     * </ul>
     *
     * <p>It also records which files were seen per class in the static maps of the
     * enclosing class, and {@link #cleanup} writes the per-class file counts and
     * prior probabilities to the file named by the {@code priorProbality} setting.
     *
     * <p>NOTE(review): that bookkeeping lives in static fields, so it only covers the
     * files processed inside one JVM, and every task's cleanup re-creates the same
     * output file. Presumably this is only complete when all map tasks share a JVM
     * (e.g. local mode) - confirm before relying on the priors on a cluster.
     */
    static class First_Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);

        private final static IntWritable zero = new IntWritable(0);

        // Reused key "<class>\t<word>".
        private Text className = new Text();

        // Reused key "<class>"; avoids allocating a new Text for every record.
        private Text classKey = new Text();

        /**
         * Writes one {@code <class>\t<fileCount>} line per class followed by one
         * {@code <class>:<prior>} line per class, where prior = fileCount / total files.
         */
        @Override
        protected void cleanup(Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            String file = conf.get("priorProbality");
            FileSystem fs = FileSystem.get(URI.create(file), conf);
            FSDataOutputStream priorStream = fs.create(new Path(file));
            try {
                for (Map.Entry<String, List<String>> entry : fileCountMap.entrySet()) {
                    int filesInClass = entry.getValue().size();
                    fileCount.put(entry.getKey(), filesInClass);
                    // Terminate every record; without the newline all records run together.
                    priorStream.writeBytes(entry.getKey() + "\t" + filesInClass + "\n");
                }

                // Total number of documents over all classes.
                int fileSum = 0;
                for (Integer num : fileCount.values()) {
                    fileSum += num;
                }
                System.out.println("fileSum = " + fileSum);

                // Prior probability of each class.
                for (Map.Entry<String, Integer> entry : fileCount.entrySet()) {
                    double p = (double) entry.getValue() / fileSum;
                    priorStream.writeBytes(entry.getKey() + ":" + p + "\n");
                }

                // Close explicitly so that a failed flush is reported.
                priorStream.close();
            } finally {
                // Safety net for the failure path; closing twice is harmless.
                IOUtils.closeStream(priorStream);
            }
        }

        /**
         * Emits the three counting records for one word and remembers the file it came from.
         *
         * @param key byte offset of the line (unused)
         * @param value one input line, treated as a single word
         */
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            String fileName = fileSplit.getPath().getName();
            // The parent directory name is the class name.
            String dirName = fileSplit.getPath().getParent().getName();

            // Remember each distinct file of the class; used to count documents per class.
            List<String> filesOfClass = fileCountMap.get(dirName);
            if (filesOfClass == null) {
                filesOfClass = new ArrayList<String>();
                fileCountMap.put(dirName, filesOfClass);
            }
            if (!filesOfClass.contains(fileName)) {
                filesOfClass.add(fileName);
            }

            className.set(dirName + "\t" + value.toString());
            classKey.set(dirName);

            context.write(className, one); // per-class word frequency, e.g. "ABDBI\thello" -> 1
            context.write(classKey, one);  // per-class word total, e.g. "ABDBI" -> 1
            context.write(value, zero);    // distinct-word marker, e.g. "hello" -> 0
        }

    }

    // Reducer

    /**
     * Reducer of the word-frequency job. It sums the values of each key and then
     * routes the key by its shape:
     * <ul>
     * <li>sum is 0: the key is counted once in the {@code TOTALWORDS} counter and
     * nothing is written;</li>
     * <li>key splits into exactly two tab-separated parts: written to the regular
     * job output with the summed value;</li>
     * <li>anything else: written through {@code mos} to the {@code wordsInClass}
     * named output with the summed value.</li>
     * </ul>
     */
    static class First_Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        // Reused output value holding the summed count of the current key.
        IntWritable result = new IntWritable();

        Map<String, List<String>> classMap = new HashMap<String, List<String>>();

        Map<String, List<String>> fileMap = new HashMap<String, List<String>>();

        /** Creates the shared MultipleOutputs instance for this task. */
        @Override
        protected void setup(Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            mos = new MultipleOutputs<Text, IntWritable>(context);
        }

        /** Sums the counts of {@code key} and dispatches the result as described on the class. */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                        throws IOException, InterruptedException {
            int total = 0;
            for (IntWritable count : values) {
                total += count.get();
            }

            // A zero sum marks a distinct-word key: count it and emit nothing.
            if (total == 0) {
                context.getCounter(WordsNature.TOTALWORDS).increment(1);
                return;
            }

            result.set(total);
            int fieldCount = key.toString().split("\t").length;
            if (fieldCount == 2) {
                // Two tab-separated parts: written to the main output.
                context.write(key, result);
            } else {
                // Otherwise: written to the "wordsInClass" side output.
                mos.write(key, result, "wordsInClass");
            }
        }

        /** Closes the shared MultipleOutputs instance. */
        @Override
        protected void cleanup(Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            mos.close();
        }

    }

    // 获取文件夹下面二级文件夹路径的方法

        /**
         * Returns the direct sub-directories of {@code folder} whose listing holds
         * more than ten entries; these become the input paths of the job.
         *
         * @param conf configuration used to obtain the file system
         * @param folder directory whose children are inspected
         * @return paths of the qualifying sub-directories, in listing order
         * @throws Exception if the file system cannot be accessed
         */
        static List<Path> getSecondDir(Configuration conf, String folder) throws Exception {
            FileSystem fileSystem = FileSystem.get(conf);
            FileStatus[] children = fileSystem.listStatus(new Path(folder));

            List<Path> selected = new ArrayList<Path>();
            for (FileStatus child : children) {
                if (!child.isDir()) {
                    continue;
                }
                Path candidate = child.getPath();
                int entryCount = fileSystem.listStatus(candidate).length;
                // Keep only classes with more than ten entries.
                if (entryCount > 10) {
                    selected.add(candidate);
                }
            }
            return selected;
        }

}

第二步，计算每个类别中每个单词的概率。需要提前读取每个类别的单词总数，以及总的单词种类数：这两项数据既可以通过 configuration.set 传入，也可以在 setup 中（先于 map 处理之前）从文件读取。

package org.lukey.hadoop.classifyBayes;

import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStreamReader;

import java.net.URI;

import java.util.HashMap;

import java.util.Map;

import org.apache.commons.logging.Log;

import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.FSDataInputStream;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.DoubleWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class Probability {

    // Logger of this job; bound to Probability so messages are attributed to this class.
    private static final Log LOG = LogFactory.getLog(Probability.class);

    // Number of distinct words, parsed from the totals file by main().
    public static int total = 0;

    // MultipleOutputs handle created and closed by WordsOfClassCountMapper.
    private static MultipleOutputs<Text, DoubleWritable> mos;

    // Client

    /**
     * Reads the distinct-word total from HDFS, publishes it in the job
     * configuration and runs the probability job.
     *
     * <p>The totals file must contain a first line of the form
     * {@code TOTALWORDS:<count>}. All paths and the job tracker address are
     * hard-coded; {@code args} is not consulted.
     *
     * @param args ignored
     * @throws IOException if the totals file is empty, malformed or lacks TOTALWORDS
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("mapred.job.tracker", "192.168.190.128:9001");
        conf.set("mapred.jar", "probability.jar");

        String totalWordsPath = "hdfs://192.168.190.128:9000/user/hadoop/output/totalwords.txt";
        // Per-class word totals; read by the mapper's setup through this setting.
        String wordsInClassPath = "hdfs://192.168.190.128:9000/user/hadoop/mid/wordsFrequence/wordsInClass-r-00000";
        conf.set("wordsInClassPath", wordsInClassPath);

        // Read the first line of the totals file, always releasing the stream.
        FileSystem fs = FileSystem.get(URI.create(totalWordsPath), conf);
        FSDataInputStream inputStream = fs.open(new Path(totalWordsPath));
        String strLine;
        try {
            BufferedReader buffer = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
            strLine = buffer.readLine();
        } finally {
            inputStream.close();
        }

        if (strLine == null) {
            throw new IOException("Totals file is empty: " + totalWordsPath);
        }
        String[] temp = strLine.split(":");
        if (temp.length != 2) {
            throw new IOException("Expected \"<name>:<count>\" in " + totalWordsPath + " but found: " + strLine);
        }
        // temp[0] is expected to be TOTALWORDS; the pair is handed to the tasks via the configuration.
        conf.set(temp[0], temp[1].trim());

        String totalWords = conf.get("TOTALWORDS");
        if (totalWords == null) {
            throw new IOException("No TOTALWORDS entry in " + totalWordsPath + ": " + strLine);
        }
        total = Integer.parseInt(totalWords);
        LOG.info("------>total = " + total);
        System.out.println("total ==== " + total);

        Job job = new Job(conf, "file count");
        job.setJarByClass(Probability.class);
        job.setMapperClass(WordsOfClassCountMapper.class);
        job.setReducerClass(WordsOfClassCountReducer.class);

        // NOTE(review): the first-step job in this article writes to
        // /user/hadoop/mid/wordsFre, not .../wordsFrequence - confirm the directory name.
        String input = "hdfs://192.168.190.128:9000/user/hadoop/mid/wordsFrequence";
        String output = "hdfs://192.168.190.128:9000/user/hadoop/output/probability/";
        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    // Mapper

    static class WordsOfClassCountMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {

        private static DoubleWritable number = new DoubleWritable();

        private static Text className = new Text();

        // 保存类别中单词总数

        private static Map<String, Integer> filemap = new HashMap<String, Integer>();

        protected void map(LongWritable key, Text value,

                Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)

                        throws IOException, InterruptedException {

            Configuration conf = context.getConfiguration();

            int tot = Integer.parseInt(conf.get("TOTALWORDS"));

            System.out.println("total = " + total);

            System.out.println("tot = " + tot);

            // 输入的格式如下：

            // ALB weekend 1

            // ALB weeks 3

            Map<String, Map<String, Integer>> baseMap = new HashMap<String, Map<String, Integer>>(); // 保存基础数据

            // Map<String, Map<String, Double>> priorMap = new HashMap<String,

            // Map<String, Double>>(); // 保存每个单词出现的概率

            String[] temp = value.toString().split("\t");

            // 先将数据存到baseMap中

            if (temp.length == 3) {

                // 文件夹名类别名

                if (baseMap.containsKey(temp[0])) {

                    baseMap.get(temp[0]).put(temp[1], Integer.parseInt(temp[2]));

                } else {

                    Map<String, Integer> oneMap = new HashMap<String, Integer>();

                    oneMap.put(temp[1], Integer.parseInt(temp[2]));

                    baseMap.put(temp[0], oneMap);

                }

            } // 读取数据完毕，全部保存在baseMap中

            int allWordsInClass = 0;

            for (Map.Entry<String, Map<String, Integer>> entries : baseMap.entrySet()) { // 遍历类别

                allWordsInClass = filemap.get(entries.getKey());

                for (Map.Entry<String, Integer> entry : entries.getValue().entrySet()) { // 遍历类别中的单词词频求概率

                    double p = (entry.getValue() + 1.0) / (allWordsInClass + tot);

                    className.set(entries.getKey() + "\t" + entry.getKey());

                    number.set(p);

                    LOG.info("------>p = " + p);

                    context.write(className, number);

                }

            }

        }

        protected void cleanup(Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)

                throws IOException, InterruptedException {

            // TODO Auto-generated method stub

            mos.close();

        }

        protected void setup(Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)

                throws IOException, InterruptedException {

            // TODO Auto-generated method stub

            Configuration conf = context.getConfiguration();

            mos = new MultipleOutputs<Text, DoubleWritable>(context);

            String filePath = conf.get("wordsInClassPath");

            FileSystem fs = FileSystem.get(URI.create(filePath), conf);

            FSDataInputStream inputStream = fs.open(new Path(filePath));

            BufferedReader buffer = new BufferedReader(new InputStreamReader(inputStream));

            String strLine = null;

            while ((strLine = buffer.readLine()) != null) {

                String[] temp = strLine.split("\t");

                filemap.put(temp[0], Integer.parseInt(temp[1]));

            }

        }

    }

    // Reducer

    /**
     * Adds up all values received for a key and writes the key with that sum.
     */
    static class WordsOfClassCountReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {

        // Reused output value holding the sum for the current key.
        DoubleWritable result = new DoubleWritable();

        /**
         * Sums {@code values} and emits {@code key -> sum}.
         */
        protected void reduce(Text key, Iterable<DoubleWritable> values,
                Reducer<Text, DoubleWritable, Text, DoubleWritable>.Context context)
                        throws IOException, InterruptedException {
            double accumulated = 0;
            for (DoubleWritable part : values) {
                accumulated = accumulated + part.get();
            }

            result.set(accumulated);
            context.write(key, result);
        }

    }

}

码农公寓

相关文章