use of technology.tabula.Table in project drill by apache.
the class TestPdfUtils method testGetSpecificTable.
/**
 * Verifies that the table at index 0 of us-020.pdf can be fetched with
 * {@code PdfUtils.getSpecificTable} (no explicit extraction algorithm is passed)
 * and that it reports seven columns.
 */
@Test
public void testGetSpecificTable() throws Exception {
  // try-with-resources releases the document even when an assertion fails.
  try (PDDocument document = getDocument("us-020.pdf")) {
    Table table = PdfUtils.getSpecificTable(document, 0, null);
    assertNotNull(table);
    assertEquals(7, table.getColCount());
  }
}
use of technology.tabula.Table in project drill by apache.
the class TestPdfUtils method testFirstRowExtractor.
/**
 * Verifies that the first row of the first table extracted from schools.pdf
 * yields eleven values.
 */
@Test
public void testFirstRowExtractor() throws Exception {
  List<Table> tableList;
  // try-with-resources closes the document even if table extraction throws.
  try (PDDocument document = getDocument("schools.pdf")) {
    tableList = PdfUtils.extractTablesFromPDF(document);
  }

  List<String> values = PdfUtils.extractFirstRowValues(tableList.get(0));
  // Expected value goes first so a failure message reads "expected 11 but was N".
  assertEquals(11, values.size());
}
use of technology.tabula.Table in project drill by apache.
the class TestPdfUtils method testGetFullPageSpecificTable.
/**
 * Verifies that the table at index 3 of schools.pdf can be fetched with
 * {@code PdfUtils.getSpecificTable} when no explicit extraction algorithm is passed.
 */
@Test
public void testGetFullPageSpecificTable() throws Exception {
  // try-with-resources releases the document even when the assertion fails.
  try (PDDocument document = getDocument("schools.pdf")) {
    Table table = PdfUtils.getSpecificTable(document, 3, null);
    assertNotNull(table);
  }
}
use of technology.tabula.Table in project drill by apache.
the class TestPdfUtils method testTableExtractor.
/**
 * Verifies the number of tables that {@code PdfUtils.extractTablesFromPDF} finds
 * in two sample documents: one table in the Argentina voting record and two
 * tables in twotables.pdf.
 */
@Test
public void testTableExtractor() throws Exception {
  List<Table> tableList;
  // try-with-resources closes each document even if extraction throws.
  try (PDDocument document = getDocument("argentina_diputados_voting_record.pdf")) {
    tableList = PdfUtils.extractTablesFromPDF(document);
  }
  // Expected value goes first so failure messages are reported the right way round.
  assertEquals(1, tableList.size());

  List<Table> tableList2;
  try (PDDocument document2 = getDocument("twotables.pdf")) {
    tableList2 = PdfUtils.extractTablesFromPDF(document2);
  }
  assertEquals(2, tableList2.size());
}
use of technology.tabula.Table in project drill by apache.
the class PdfUtils method extractTablesFromPDF.
/**
 * Returns a list of tables found in a given PDF document. Candidate table regions are
 * detected on every page with the Nurminen detection algorithm, and each region is then
 * handed to the caller-supplied extraction algorithm.
 *
 * <p>The {@link ObjectExtractor} created for the document is closed before this method
 * returns, whether extraction succeeds or fails.
 *
 * @param document The input PDF document to search for tables
 * @param algorithm The extraction algorithm applied to every detected table region; it is
 *                  dereferenced without a null check, so it must not be null
 * @return A list of tables found in the document, in page order. Empty when no table
 *         region is detected.
 * @throws UserException if extraction succeeded but closing the extractor failed
 */
public static List<Table> extractTablesFromPDF(PDDocument document, ExtractionAlgorithm algorithm) {
  NurminenDetectionAlgorithm detectionAlgorithm = new NurminenDetectionAlgorithm();
  ObjectExtractor objectExtractor = new ObjectExtractor(document);
  List<Table> tables = new ArrayList<>();

  boolean extractionSucceeded = false;
  try {
    PageIterator pages = objectExtractor.extract();
    while (pages.hasNext()) {
      Page page = pages.next();
      for (Rectangle guessRect : detectionAlgorithm.detect(page)) {
        // Restrict the page to the detected region before running the extractor on it.
        Page guess = page.getArea(guessRect);
        tables.addAll(algorithm.extract(guess));
      }
    }
    extractionSucceeded = true;
  } finally {
    try {
      objectExtractor.close();
    } catch (Exception e) {
      if (extractionSucceeded) {
        throw UserException.parseError(e).message("Error extracting table: " + e.getMessage()).build(logger);
      }
      // An extraction failure is already propagating; log the close failure instead of
      // throwing so it does not mask the original exception.
      logger.warn("Error closing PDF object extractor after failed table extraction", e);
    }
  }
  return tables;
}
Aggregations