[Java] 数据分析 -- 大数据

单词计数

  • 需求:输入小说文本,输出每个单词出现的次数
  • 实现:分为 map、combine、reduce 三个阶段
[Java] 数据分析 -- 大数据
  1 /*  Data Analysis with Java
  2  *  John R. Hubbard
  3  *  Aug 4, 2017
  4  */
  5 
  6 package com.hongfeng.Chapter11;
  7 
  8 import java.io.File;
  9 import java.io.IOException;
 10 import java.io.PrintWriter;
 11 import java.util.ArrayList;
 12 import java.util.Collections;
 13 import java.util.HashMap;
 14 import java.util.List;
 15 import java.util.Map;
 16 import java.util.Scanner;
 17 
 18 public class Example1 {
 19     public static void main(String[] args) {
 20         try {
 21             File tempFile = new File("data/Temp.dat");
 22             map("data/sonnets/", 80, tempFile);
 23 
 24             Map<String,StringBuilder> hashTable = new HashMap(2500);
 25             combine(tempFile, hashTable);
 26 
 27             File outFile = new File("data/Output.dat");
 28             reduce(hashTable, outFile);
 29         } catch (IOException e) {
 30             System.err.println(e);
 31         }
 32     }
 33 
 34     public static void map(String src, int n, File temp) throws IOException {
 35         PrintWriter writer = new PrintWriter(temp);
 36         for (int i = 0; i < n; i++) {
 37             String filename = String.format("%sSonnet%03d.txt", src, i+1);
 38             map(filename, writer);
 39         }
 40         writer.close();
 41     }
 42 
 43     public static void combine(File temp, Map<String,StringBuilder> table) 
 44             throws IOException {
 45         Scanner scanner = new Scanner(temp);
 46         while (scanner.hasNext()) {
 47             String word = scanner.next();
 48             StringBuilder value = table.get(word);
 49             if (value == null) {
 50                 value = new StringBuilder("");
 51             }
 52             table.put(word, value.append(" 1"));
 53             scanner.nextLine();  // scan past the rest of the line (a "1")
 54         }
 55         scanner.close();
 56     }
 57         
 58     public static void reduce(Map<String,StringBuilder> table, File out) 
 59             throws IOException {
 60         PrintWriter writer = new PrintWriter(out);
 61         for (Map.Entry<String, StringBuilder> entry : table.entrySet()) {
 62             String key = entry.getKey();  // e.g., "speak"
 63             String value = entry.getValue().toString();  // e.g., "1 1 1 1 1"
 64             reduce(key, value, writer);
 65         }
 66         writer.close();
 67     }
 68 
 69     
 70     /*  Writes the pair (word, 1) for each word in the specified file.
 71     */
 72     public static void map(String filename, PrintWriter writer) 
 73             throws IOException {
 74         Scanner input = new Scanner(new File(filename));
 75         input.useDelimiter("[.,:;()?!\"\\s]+");
 76         while (input.hasNext()) {
 77             String word = input.next();
 78             writer.printf("%s 1%n", word.toLowerCase());
 79         }
 80         input.close();
 81     }
 82 
 83     /*  Counts the 1s in the value argument and writes (key, count) to file.
 84     */
 85     public static void reduce(String key, String value, PrintWriter writer)
 86             throws IOException {
 87         int count = (value.length() + 1)/2;  // e.g. "1 1 1 1 1" => 5
 88         writer.printf("%s %d%n", key, count);
 89     }
 90     
 91     private static void sort(File file) throws IOException {
 92         Scanner input = new Scanner(file);
 93         List<String> list = new ArrayList();
 94         while (input.hasNext()) {
 95             list.add(input.nextLine());
 96         }
 97         input.close();
 98         Collections.sort(list);
 99         PrintWriter output = new PrintWriter(file);
100         for (String string : list) {
101             output.println(string);
102         }
103         output.close();
104     }
105 }
View Code

 

上一篇:爬取腾讯网的世界疫情实时数据


下一篇:抄书抄博客毒害社区的Writer几时休?