Example use of org.apache.tika.parser.ocr.TesseractOCRConfig in the Apache Tika project.
Class TikaResourceTest, method testPDFOCRConfig.
//TIKA-2290
/**
 * Checks that the PDF "OcrStrategy" request header is honoured by the endpoint:
 * <ul>
 *   <li>{@code no_ocr} yields a body that is empty after trimming,</li>
 *   <li>{@code ocr_only} yields a body containing the text of the test PDF,</li>
 *   <li>an unrecognised value yields HTTP status 500.</li>
 * </ul>
 * The test returns without asserting anything when
 * {@code TesseractOCRParser.hasTesseract} reports that Tesseract is unavailable.
 */
@Test
public void testPDFOCRConfig() throws Exception {
    // Nothing can be verified without a working Tesseract, so bail out quietly.
    if (!new TesseractOCRParser().hasTesseract(new TesseractOCRConfig())) {
        return;
    }

    Response response = putTestOcrPdf("no_ocr");
    String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
    // assertEquals reports the unexpected text on failure; assertTrue(...equals("")) did not.
    assertEquals("", responseMsg.trim());

    response = putTestOcrPdf("ocr_only");
    responseMsg = getStringFromInputStream((InputStream) response.getEntity());
    assertContains("Happy New Year 2003!", responseMsg);

    //now try a bad value
    response = putTestOcrPdf("non-sense-value");
    assertEquals(500, response.getStatus());
}

/**
 * PUTs testOCR.pdf to the endpoint as application/pdf, asking for text/plain,
 * with the PDF "OcrStrategy" header set to the given value.
 *
 * @param ocrStrategy value sent in the OcrStrategy header
 * @return the response to the PUT request
 * @throws Exception if the request fails or the PDF stream cannot be closed
 */
private Response putTestOcrPdf(String ocrStrategy) throws Exception {
    // The request stream is closed once the PUT has returned; previously it was never closed.
    try (InputStream pdf = ClassLoader.getSystemResourceAsStream("testOCR.pdf")) {
        return WebClient.create(endPoint + TIKA_PATH)
                .type("application/pdf")
                .accept("text/plain")
                .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX + "OcrStrategy", ocrStrategy)
                .put(pdf);
    }
}
Example use of org.apache.tika.parser.ocr.TesseractOCRConfig in the Apache Tika project.
Class AbstractPDF2XHTML, method doOCROnCurrentPage.
/**
 * Renders the current page to an image, writes it to a temporary file and
 * passes that file to Tesseract, whose output goes to {@code xhtml}.
 * Does nothing when the configured OCR strategy is {@code NO_OCR}.
 *
 * @throws IOException   if a SAX error occurs while emitting OCR content (wrapped),
 *                       or possibly from {@code handleCatchableIOE}
 * @throws TikaException if Tesseract is not available
 * @throws SAXException  declared by the signature
 */
void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
    if (config.getOcrStrategy().equals(NO_OCR)) {
        return;
    }

    // Use the Tesseract settings from the parse context, falling back to the default.
    TesseractOCRConfig ocrSettings =
            context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG);
    TesseractOCRParser ocrParser = new TesseractOCRParser();
    if (!ocrParser.hasTesseract(ocrSettings)) {
        throw new TikaException(
                "Tesseract is not available. Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly");
    }

    PDFRenderer pageRenderer = new PDFRenderer(pdDocument);
    TemporaryResources tempResources = new TemporaryResources();
    try {
        BufferedImage pageImage =
                pageRenderer.renderImage(pageIndex, 2.0f, config.getOcrImageType());
        Path pageImageFile = tempResources.createTempFile();

        // Step 1: write the rendered page to the temp file.
        try (OutputStream imageOut = Files.newOutputStream(pageImageFile)) {
            //TODO: get output format from TesseractConfig
            ImageIOUtil.writeImage(
                    pageImage,
                    config.getOcrImageFormatName(),
                    imageOut,
                    config.getOcrDPI(),
                    config.getOcrImageQuality());
        }

        // Step 2: run OCR on the file and stream the result into the handler.
        try (InputStream imageIn = TikaInputStream.get(pageImageFile)) {
            ocrParser.parseInline(imageIn, xhtml, ocrSettings);
        }
    } catch (IOException ioe) {
        handleCatchableIOE(ioe);
    } catch (SAXException saxe) {
        // SAX failures are rethrown as an IOException that keeps the cause.
        throw new IOExceptionWithCause("error writing OCR content from PDF", saxe);
    } finally {
        tempResources.dispose();
    }
}
Example use of org.apache.tika.parser.ocr.TesseractOCRConfig in the Apache Tika project.
Class TikaResource, method fillParseContext.
/**
 * Builds fresh OCR and PDF parser configurations from the request headers and
 * registers them on the parse context.
 * <p>
 * Each header name starting with {@code X_TIKA_OCR_HEADER_PREFIX} is passed to
 * {@code processHeaderConfig} with the OCR configuration. Otherwise, a name
 * starting with {@code X_TIKA_PDF_HEADER_PREFIX} is passed with the PDF
 * configuration. Other headers are ignored here.
 *
 * @param parseContext   context that receives the configurations
 * @param httpHeaders    request headers to inspect
 * @param embeddedParser parser to register under {@code Parser.class}; skipped when {@code null}
 */
public static void fillParseContext(ParseContext parseContext, MultivaluedMap<String, String> httpHeaders, Parser embeddedParser) {
    final TesseractOCRConfig tesseractSettings = new TesseractOCRConfig();
    final PDFParserConfig pdfSettings = new PDFParserConfig();

    for (String headerName : httpHeaders.keySet()) {
        // The OCR prefix is checked first; a header is handled by at most one branch.
        if (StringUtils.startsWith(headerName, X_TIKA_OCR_HEADER_PREFIX)) {
            processHeaderConfig(httpHeaders, tesseractSettings, headerName, X_TIKA_OCR_HEADER_PREFIX);
            continue;
        }
        if (StringUtils.startsWith(headerName, X_TIKA_PDF_HEADER_PREFIX)) {
            processHeaderConfig(httpHeaders, pdfSettings, headerName, X_TIKA_PDF_HEADER_PREFIX);
        }
    }

    parseContext.set(TesseractOCRConfig.class, tesseractSettings);
    parseContext.set(PDFParserConfig.class, pdfSettings);

    if (embeddedParser == null) {
        return;
    }
    parseContext.set(Parser.class, embeddedParser);
}
Aggregations