Use of technology.tabula.extractors.ExtractionAlgorithm in the Apache Drill project.
Class PdfUtils, method extractTablesFromPDF.
/**
 * Returns a list of tables found in a given PDF document. There are several extraction algorithms
 * available and this function allows the user to select which to use.
 * <p>
 * Candidate table regions are detected on every page with a {@link NurminenDetectionAlgorithm};
 * each detected region is then passed to the supplied extraction algorithm. The object extractor
 * wrapped around the document is closed once all pages have been processed.
 *
 * @param document The input PDF document to search for tables
 * @param algorithm The extraction algorithm. Must not be null: a null algorithm results in a
 *                  NullPointerException as soon as a table region is detected.
 * @return A list of tables found in the document, in the order they were encountered. The list
 *         is empty when no table region is detected.
 * @throws UserException if closing the object extractor fails
 */
public static List<Table> extractTablesFromPDF(PDDocument document, ExtractionAlgorithm algorithm) {
  NurminenDetectionAlgorithm detectionAlgorithm = new NurminenDetectionAlgorithm();
  ObjectExtractor objectExtractor = new ObjectExtractor(document);
  PageIterator pages = objectExtractor.extract();
  List<Table> tables = new ArrayList<>();

  while (pages.hasNext()) {
    Page page = pages.next();
    // The detector only proposes rectangles; the actual cell extraction is done per rectangle.
    List<Rectangle> tablesOnPage = detectionAlgorithm.detect(page);
    for (Rectangle guessRect : tablesOnPage) {
      Page guess = page.getArea(guessRect);
      tables.addAll(algorithm.extract(guess));
    }
  }

  try {
    objectExtractor.close();
  } catch (Exception e) {
    throw UserException.parseError(e).message("Error extracting table: " + e.getMessage()).build(logger);
  }
  return tables;
}
Use of technology.tabula.extractors.ExtractionAlgorithm in the Apache Drill project.
Class PdfUtils, method getSpecificTable.
/**
 * Returns a specific table from a PDF document. Returns null in the event that
 * the user requests a table that does not exist. If there is an error with the document
 * the function will throw a UserException.
 * <p>
 * Tables are numbered from zero in the order they are encountered: page by page and, within a
 * page, in the order in which the detection algorithm reports table regions. A region from which
 * no table can be extracted is skipped and does not consume an index.
 *
 * @param document The source PDF document
 * @param tableIndex The zero-based index of the desired table
 * @param algorithm The extraction algorithm, or null to use {@code DEFAULT_ALGORITHM}
 * @return The desired Table, null if the table is not valid, or if the document has no tables.
 * @throws UserException if closing the object extractor fails
 */
public static Table getSpecificTable(PDDocument document, int tableIndex, ExtractionAlgorithm algorithm) {
  NurminenDetectionAlgorithm detectionAlgorithm = new NurminenDetectionAlgorithm();
  ExtractionAlgorithm algExtractor = algorithm == null ? DEFAULT_ALGORITHM : algorithm;

  ObjectExtractor objectExtractor = new ObjectExtractor(document);
  PageIterator pages = objectExtractor.extract();
  int tableCounter = 0;

  while (pages.hasNext()) {
    Page page = pages.next();
    List<Rectangle> rectanglesOnPage = detectionAlgorithm.detect(page);
    for (Rectangle guessRect : rectanglesOnPage) {
      Page guess = page.getArea(guessRect);
      // Walk only the tables extracted from this region. Accumulating them in a per-page list
      // and re-walking that list for every region would count earlier tables more than once
      // and shift the index of every later table on the page.
      for (Table table : algExtractor.extract(guess)) {
        if (tableCounter == tableIndex) {
          // NOTE(review): the object extractor is intentionally left open on this path, as
          // before; closing it here may also affect the caller's document - confirm before
          // changing.
          return table;
        }
        tableCounter++;
      }
    }
  }

  // Only reached when the requested index was not found anywhere in the document.
  try {
    objectExtractor.close();
  } catch (Exception e) {
    throw UserException.parseError(e).message("Error extracting table: " + e.getMessage()).build(logger);
  }
  return null;
}
Aggregations