Use of technology.tabula.extractors.SpreadsheetExtractionAlgorithm in the Apache Drill project.
Class PdfUtils, method extractTablesFromPDF.
/**
 * Returns a list of tables found in a given PDF document.
 *
 * <p>Every page of the document is scanned with a {@link NurminenDetectionAlgorithm} to find
 * candidate table regions; each detected region is then handed to the caller-supplied
 * extraction algorithm, and all resulting tables are collected in page order.
 *
 * <p>The {@link ObjectExtractor} created around the document is closed before this method
 * completes, whether extraction succeeds or fails.
 * NOTE(review): closing the ObjectExtractor presumably also closes {@code document} --
 * confirm against the tabula version in use before reusing the document after this call.
 *
 * @param document The input PDF document to search for tables
 * @param algorithm The extraction algorithm applied to each detected table region
 * @return A list of tables found in the document; empty if no table region was detected.
 * @throws UserException if the extractor cannot be closed after a successful extraction
 */
public static List<Table> extractTablesFromPDF(PDDocument document, ExtractionAlgorithm algorithm) {
  NurminenDetectionAlgorithm detectionAlgorithm = new NurminenDetectionAlgorithm();
  ObjectExtractor objectExtractor = new ObjectExtractor(document);
  List<Table> tables = new ArrayList<>();

  try {
    PageIterator pages = objectExtractor.extract();
    while (pages.hasNext()) {
      Page page = pages.next();
      // Detection only yields bounding boxes; the actual table content is pulled out of
      // each cropped page area by the algorithm the caller selected.
      for (Rectangle guessRect : detectionAlgorithm.detect(page)) {
        tables.addAll(algorithm.extract(page.getArea(guessRect)));
      }
    }
  } catch (RuntimeException | Error failure) {
    // Release the extractor on the failure path too. A secondary close error is attached
    // as suppressed so that it never masks the failure that actually aborted extraction.
    try {
      objectExtractor.close();
    } catch (Exception closeFailure) {
      failure.addSuppressed(closeFailure);
    }
    throw failure;
  }

  try {
    objectExtractor.close();
  } catch (Exception e) {
    throw UserException.parseError(e).message("Error extracting table: " + e.getMessage()).build(logger);
  }
  return tables;
}
Aggregations