如何在有限内存下对外部大文件进行排序

package org.hbin.sort; import java.io.*; import java.util.*; /** * @author Haley * @version 1.0 * 2024/10/25 */ public class SplitSort { /** 最小数字 */ private static int min = 1000_0000; /** 最大数字 */ private static int max = 8888_0000; /** 单个文件的数字个数 */ private static int fileCapacity = 100_0000; /** 小文件数量 */ private static int fileCount = max / fileCapacity - min / fileCapacity + 1; private static int bound = max - min + 1; /** 位数组 */ private static boolean[] bits = new boolean[bound]; /** 统计集合 */ private static Map<Integer, Integer> map = new HashMap<>(bound); private static String fileName = "1MB.txt"; private static String prefixBatchPath = "prefix_batch//"; private static String prefixOutFileName = "prefix_merge_sort_"; private static String outFileName; public static List<File> split() throws IOException { System.out.println("clean ..."); File batchPath = new File(prefixBatchPath); if(batchPath.exists()) { batchPath.delete(); } batchPath.mkdir(); System.out.println("create batch files ..."); List<File> files = new ArrayList<>(); int offset = min / fileCapacity; BufferedWriter[] writers = new BufferedWriter[fileCount]; for (int i = 0; i < fileCount; i++) { files.add(new File(prefixBatchPath + (i + offset))); writers[i] = new BufferedWriter(new FileWriter(prefixBatchPath + (i + offset))); } System.out.println("split file ..."); long s = System.currentTimeMillis(); long printCapacity = fileCapacity; try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) { String line; int count = 0; while((line = reader.readLine()) != null) { int slot = Integer.parseInt(line) / fileCapacity; writers[slot - offset].write(line + System.lineSeparator()); count ++; if(count % printCapacity == 0) { System.out.println("deal " + count); if(count / printCapacity >= 10) { printCapacity *= 10; } } } } catch (Exception e) { e.printStackTrace(); } for (int i = 0; i < fileCount; i++) { writers[i].flush(); writers[i].close(); } System.out.println("split file. " + (System.currentTimeMillis() - s) / 1000 + " s."); return files; } public static void bitSort() throws IOException { // 清空 outFileName = prefixOutFileName + fileName; File outFile = new File(outFileName); if(outFile.exists()) { outFile.delete(); } outFile.createNewFile(); long start = System.currentTimeMillis(); int end = min / fileCapacity; for (int i = max / fileCapacity; i >= end; i--) { // 重置位数组 Arrays.fill(bits, false); map.clear(); int total = 0; int batch = 500_0000; File f = new File(prefixBatchPath + i); try (BufferedReader reader = new BufferedReader(new FileReader(f))) { String line; while((line = reader.readLine())!=null) { int num = Integer.parseInt(line); int index = num - min; if(!bits[index]) { bits[index] = true; map.put(num, map.getOrDefault(num, 0) + 1); } total ++; if(total % batch == 0) { System.out.println("deal ... " + total/10000 + "w, map's size: " + map.size()); if(total > 2500_0000) { if(total % 100 == 0) { System.out.println("\tdeal ... " + total + ", map's size: " + map.size()); } } } } } System.out.println(i + " sort " + (System.currentTimeMillis() - start) / 1000 + " s."); print(); } } public static void print() { long start = System.currentTimeMillis(); try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFileName, true))) { for (int i = bits.length - 1; i >= 0; i--) { if(bits[i]) { int num = i + min; for (int j = 0; j < map.get(num); j++) { writer.write(num + System.lineSeparator()); } } } writer.flush(); } catch (IOException e) { e.printStackTrace(); } System.out.println("print " + (System.currentTimeMillis() - start) / 1000 + " s."); } public static void main(String[] args) throws IOException { // fileName = "200MB.txt"; // split-46s - bit 46s // fileName = "1GB.txt"; // 30+320=360s - 25+85=110s/25+65=93s fileName = "10GB.txt"; // - 300+200=550s/310+270=590s long start = System.currentTimeMillis(); split(); bitSort(); System.out.println("total: " + (System.currentTimeMillis() - start) / 1000 + " s."); } }
上一篇:第七部分:2. STM32之ADC实验--AD多通道(AD采集三路传感器模块实验:光敏传感器、热敏传感器、反射式传感器附赠温湿度传感器教程)


下一篇:无人驾驶汽车——AI技术在交通领域的进展与未来展望