如何在有限内存下对外部大文件进行排序
package org.hbin.sort;
import java.io.*;
import java.util.*;
/**
* @author Haley
* @version 1.0
* 2024/10/25
*/
public class SplitSort {
/** 最小数字 */
private static int min = 1000_0000;
/** 最大数字 */
private static int max = 8888_0000;
/** 单个文件的数字个数 */
private static int fileCapacity = 100_0000;
/** 小文件数量 */
private static int fileCount = max / fileCapacity - min / fileCapacity + 1;
private static int bound = max - min + 1;
/** 位数组 */
private static boolean[] bits = new boolean[bound];
/** 统计集合 */
private static Map<Integer, Integer> map = new HashMap<>(bound);
private static String fileName = "1MB.txt";
private static String prefixBatchPath = "prefix_batch//";
private static String prefixOutFileName = "prefix_merge_sort_";
private static String outFileName;
public static List<File> split() throws IOException {
System.out.println("clean ...");
File batchPath = new File(prefixBatchPath);
if(batchPath.exists()) {
batchPath.delete();
}
batchPath.mkdir();
System.out.println("create batch files ...");
List<File> files = new ArrayList<>();
int offset = min / fileCapacity;
BufferedWriter[] writers = new BufferedWriter[fileCount];
for (int i = 0; i < fileCount; i++) {
files.add(new File(prefixBatchPath + (i + offset)));
writers[i] = new BufferedWriter(new FileWriter(prefixBatchPath + (i + offset)));
}
System.out.println("split file ...");
long s = System.currentTimeMillis();
long printCapacity = fileCapacity;
try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
String line;
int count = 0;
while((line = reader.readLine()) != null) {
int slot = Integer.parseInt(line) / fileCapacity;
writers[slot - offset].write(line + System.lineSeparator());
count ++;
if(count % printCapacity == 0) {
System.out.println("deal " + count);
if(count / printCapacity >= 10) {
printCapacity *= 10;
}
}
}
} catch (Exception e) {
e.printStackTrace();
}
for (int i = 0; i < fileCount; i++) {
writers[i].flush();
writers[i].close();
}
System.out.println("split file. " + (System.currentTimeMillis() - s) / 1000 + " s.");
return files;
}
public static void bitSort() throws IOException {
// 清空
outFileName = prefixOutFileName + fileName;
File outFile = new File(outFileName);
if(outFile.exists()) {
outFile.delete();
}
outFile.createNewFile();
long start = System.currentTimeMillis();
int end = min / fileCapacity;
for (int i = max / fileCapacity; i >= end; i--) {
// 重置位数组
Arrays.fill(bits, false);
map.clear();
int total = 0;
int batch = 500_0000;
File f = new File(prefixBatchPath + i);
try (BufferedReader reader = new BufferedReader(new FileReader(f))) {
String line;
while((line = reader.readLine())!=null) {
int num = Integer.parseInt(line);
int index = num - min;
if(!bits[index]) {
bits[index] = true;
map.put(num, map.getOrDefault(num, 0) + 1);
}
total ++;
if(total % batch == 0) {
System.out.println("deal ... " + total/10000 + "w, map's size: " + map.size());
if(total > 2500_0000) {
if(total % 100 == 0) {
System.out.println("\tdeal ... " + total + ", map's size: " + map.size());
}
}
}
}
}
System.out.println(i + " sort " + (System.currentTimeMillis() - start) / 1000 + " s.");
print();
}
}
public static void print() {
long start = System.currentTimeMillis();
try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFileName, true))) {
for (int i = bits.length - 1; i >= 0; i--) {
if(bits[i]) {
int num = i + min;
for (int j = 0; j < map.get(num); j++) {
writer.write(num + System.lineSeparator());
}
}
}
writer.flush();
} catch (IOException e) {
e.printStackTrace();
}
System.out.println("print " + (System.currentTimeMillis() - start) / 1000 + " s.");
}
public static void main(String[] args) throws IOException {
// fileName = "200MB.txt"; // split-46s - bit 46s
// fileName = "1GB.txt"; // 30+320=360s - 25+85=110s/25+65=93s
fileName = "10GB.txt"; // - 300+200=550s/310+270=590s
long start = System.currentTimeMillis();
split();
bitSort();
System.out.println("total: " + (System.currentTimeMillis() - start) / 1000 + " s.");
}
}