Java 通过 OCR 实现识别 PDF 中的文字

import org.springframework.util.StringUtils; import com.benjaminwan.ocrlibrary.OcrResult; import com.benjaminwan.ocrlibrary.TextBlock; import io.github.mymonstercat.Model; import io.github.mymonstercat.ocr.InferenceEngine; import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.rendering.PDFRenderer; import org.springframework.stereotype.Service; import javax.imageio.ImageIO; import java.awt.image.BufferedImage; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.nio.file.*; import java.nio.file.attribute.BasicFileAttributes; import java.util.ArrayList; import java.util.Base64; import java.util.List; import java.util.UUID; @Service public class PdfOCRConverter { //临时输出png文件路径 private static final String outputDirs = "D:/pdfToImg/temp/"; public static void main(String[] args) throws IOException { List<String> fileNameList = getWords("D:/Download/123.pdf"); for (String fileName : fileNameList) { System.out.println(fileName); } } public static List<String> getWords(String pdfFilePath) throws IOException { String outputDir = outputDirs + UUID.randomUUID().toString().replace("-", ""); List<String> fileNameList = convertPdfToImage(pdfFilePath, outputDir); List<String> wordsList = new ArrayList<>(); for (String fileName : fileNameList) { System.out.println("识别图片:"+fileName); if (StringUtils.isEmpty(fileName)){break;} List<String> words = runOcr(fileName); for (String word : words) { System.out.println(word); wordsList.add(word); } } deleteDirectory(outputDir); return wordsList; } public static List<String> runOcr(String path) { List<String> results = new ArrayList<>(); InferenceEngine engine = InferenceEngine.getInstance(Model.ONNX_PPOCR_V3); OcrResult ocrResult = engine.runOcr(path); for (TextBlock textBlock : ocrResult.getTextBlocks()) { results.add(textBlock.getText()); } return results; } public static List<String> convertPdfToImage(String pdfFilePath, String outputDir) { // 
设置DPI(越高图片越清晰,但文件也会更大) int dpi = 300; List<String> fileNameList = new ArrayList<>(); File file = new File(pdfFilePath); try (PDDocument document = Loader.loadPDF(file)) { PDFRenderer pdfRenderer = new PDFRenderer(document); String pdfFileName = file.getName().replace(".pdf", ""); String name = pdfFileName; for (int page = 0; page < document.getNumberOfPages(); page++) { BufferedImage bim = pdfRenderer.renderImageWithDPI(page, dpi); String folder = createFolder(outputDir + "/" + name); String fileName = folder + "/" + pdfFileName + "_page_" + (page + 1) + ".png"; ImageIO.write(bim, "png", new File(fileName)); fileNameList.add(fileName); System.out.println("生成图片:"+fileName); } } catch (IOException e) { e.printStackTrace(); } return fileNameList; } public static void deleteDirectory(String path) throws IOException { // 如果路径不指向一个目录,则抛出异常 Path directory = Paths.get(path); if (!Files.isDirectory(directory)) { throw new IOException("The provided path is not a directory."); } // 遍历目录中的所有文件和子目录 Files.walkFileTree(directory, new SimpleFileVisitor<Path>() { @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { // 删除文件 Files.delete(file); return FileVisitResult.CONTINUE; } @Override public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { // 所有内容被删除后删除目录本身 Files.delete(dir); return FileVisitResult.CONTINUE; } @Override public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException { // 如果访问文件失败,则抛出异常 throw exc; } }); } public static String createFolder(String folderPath) { String txt = folderPath; try { File myFilePath = new File(txt); txt = folderPath; if (!myFilePath.exists()) { myFilePath.mkdirs(); } } catch (Exception e) { e.printStackTrace(); } return txt; } public static List<String> getWordsByBase64(String base64) throws IOException { List<String> words = new ArrayList<>(); if (StringUtils.isEmpty(base64)) { return null; } String outputDir = outputDirs + 
UUID.randomUUID().toString().replace("-", ""); // 解码Base64字符串 byte[] decodedBytes = Base64.getDecoder().decode(base64); createFolder(outputDir); // 输出的PDF文件名 String outputFilePath = outputDir+"/output.pdf"; try (FileOutputStream fos = new FileOutputStream(outputFilePath)) { // 将解码后的字节数组写入文件 fos.write(decodedBytes); System.out.println("PDF文件已成功生成: " + outputFilePath); words = getWords(outputFilePath); } catch (Exception e) { e.printStackTrace(); } deleteDirectory(outputDir); return words; } }
上一篇:32单片机从入门到精通之软件编程——通信协议(十一)


下一篇:.NET 依赖注入中的 Captive Dependency