首先引入第三方组件 Tabula。它是一个开源组件,基于 PdfPig 组件实现。
/// <summary>
/// Extracts the tables detected on a range of pages of a PDF file and converts
/// each detected table into a <see cref="DataTable"/> of string columns.
/// </summary>
/// <param name="pdfPath">Path of the PDF file to open.</param>
/// <param name="startNumber">First page to scan (1-based, inclusive).</param>
/// <param name="endNumber">
/// Last page to scan (inclusive). A value beyond the last page is clamped to the
/// document's page count.
/// </param>
/// <returns>
/// One <see cref="DataTable"/> per detected table, in page order. Columns are named
/// by their zero-based index ("0", "1", ...). The list is empty when no table is
/// found or when <paramref name="startNumber"/> is greater than <paramref name="endNumber"/>.
/// </returns>
/// <exception cref="IndexOutOfRangeException">
/// <paramref name="startNumber"/> is less than 1 or greater than the number of pages.
/// </exception>
private List<DataTable> ExtractTables(string pdfPath, int startNumber, int endNumber)
{
    // No try/catch wrapper: the previous `catch (Exception ex) { throw ex; }` only
    // reset the stack trace. Exceptions now propagate with their original trace.
    using (UglyToad.PdfPig.PdfDocument document = UglyToad.PdfPig.PdfDocument.Open(pdfPath, new ParsingOptions() { ClipPaths = true }))
    {
        int pageCount = document.NumberOfPages;

        // Reject a start page outside the document before touching the extractor.
        if (startNumber < 1 || startNumber > pageCount)
        {
            throw new IndexOutOfRangeException("页码超出范围!");
        }

        // Clamp the end page whenever the start page is valid. This includes the case
        // startNumber == pageCount, which the previous `<` comparison wrongly rejected.
        if (endNumber > pageCount)
        {
            endNumber = pageCount;
        }

        ObjectExtractor extractor = new ObjectExtractor(document);
        IExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
        List<DataTable> result = new List<DataTable>();

        for (int pageNumber = startNumber; pageNumber <= endNumber; pageNumber++)
        {
            PageArea page = extractor.Extract(pageNumber);
            List<Table> tables = algorithm.Extract(page);

            foreach (Table table in tables)
            {
                DataTable dataTable = new DataTable();
                int columnCount = table.ColumnCount;

                for (int column = 0; column < columnCount; column++)
                {
                    dataTable.Columns.Add(column.ToString(), typeof(string));
                }

                foreach (IReadOnlyList<Cell> row in table.Rows)
                {
                    DataRow dataRow = dataTable.NewRow();
                    for (int column = 0; column < columnCount; column++)
                    {
                        // Store the cell's text. Assigning the Cell object itself would
                        // make DataTable store Cell.ToString() instead of the content.
                        dataRow[column] = row[column].GetText();
                    }

                    dataTable.Rows.Add(dataRow);
                }

                result.Add(dataTable);
            }
        }

        return result;
    }
}