Use of org.apache.tika.parser.ocr.TesseractOCRParser in the Apache Tika project.
Class TikaResourceTest, method testPDFOCRConfig.
//TIKA-2290
/**
 * Checks that the PDF {@code OcrStrategy} request header changes how {@code testOCR.pdf}
 * is handled by the endpoint at {@code endPoint + TIKA_PATH}.
 *
 * <ul>
 *   <li>{@code no_ocr}: the returned text is expected to be blank after trimming.</li>
 *   <li>{@code ocr_only}: the returned text is expected to contain "Happy New Year 2003!".</li>
 *   <li>an unrecognized value: the response status is expected to be 500.</li>
 * </ul>
 *
 * The test returns early, without asserting anything, when
 * {@code TesseractOCRParser.hasTesseract} reports that Tesseract is unavailable.
 *
 * @throws Exception if a request or reading a response body fails
 */
@Test
public void testPDFOCRConfig() throws Exception {
    // Without Tesseract none of the OCR expectations below can hold, so bail out quietly.
    if (!new TesseractOCRParser().hasTesseract(new TesseractOCRConfig())) {
        return;
    }
    // Same header name is used for all three requests; only the value differs.
    String ocrStrategyHeader = TikaResource.X_TIKA_PDF_HEADER_PREFIX + "OcrStrategy";

    Response response = WebClient.create(endPoint + TIKA_PATH)
            .type("application/pdf")
            .accept("text/plain")
            .header(ocrStrategyHeader, "no_ocr")
            .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
    String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
    // assertEquals (rather than assertTrue(...equals(""))) so a failure shows the unexpected text.
    assertEquals("", responseMsg.trim());

    response = WebClient.create(endPoint + TIKA_PATH)
            .type("application/pdf")
            .accept("text/plain")
            .header(ocrStrategyHeader, "ocr_only")
            .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
    responseMsg = getStringFromInputStream((InputStream) response.getEntity());
    assertContains("Happy New Year 2003!", responseMsg);

    //now try a bad value
    response = WebClient.create(endPoint + TIKA_PATH)
            .type("application/pdf")
            .accept("text/plain")
            .header(ocrStrategyHeader, "non-sense-value")
            .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
    assertEquals(500, response.getStatus());
}
Use of org.apache.tika.parser.ocr.TesseractOCRParser in the Apache Tika project.
Class AbstractPDF2XHTML, method doOCROnCurrentPage.
/**
 * Renders the page at {@code pageIndex} to an image, writes it to a temporary file and
 * hands that file to a {@link TesseractOCRParser}, which emits its output into {@code xhtml}.
 *
 * Does nothing when the configured OCR strategy equals {@code NO_OCR}.
 *
 * @throws IOException   if a SAXException is raised while rendering/parsing (it is wrapped
 *                       in an IOExceptionWithCause); other IOExceptions from the
 *                       render/write/parse steps are passed to {@code handleCatchableIOE}
 * @throws TikaException if Tesseract is reported as unavailable for the resolved config
 * @throws SAXException  declared by the signature; SAXExceptions raised inside the
 *                       render/parse section are rethrown as IOException instead
 */
void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
    if (config.getOcrStrategy().equals(NO_OCR)) {
        return;
    }

    // Prefer a Tesseract config supplied through the parse context, else the default one.
    TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG);
    TesseractOCRParser ocrParser = new TesseractOCRParser();
    boolean tesseractAvailable = ocrParser.hasTesseract(ocrConfig);
    if (!tesseractAvailable) {
        throw new TikaException("Tesseract is not available. "
                + "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly");
    }

    PDFRenderer pageRenderer = new PDFRenderer(pdDocument);
    TemporaryResources tempResources = new TemporaryResources();
    try {
        // NOTE(review): 2.0f is presumably the render scale factor; confirm against PDFRenderer.
        BufferedImage pageImage = pageRenderer.renderImage(pageIndex, 2.0f, config.getOcrImageType());
        Path imagePath = tempResources.createTempFile();

        // Step 1: persist the rendered page so Tesseract can read it from disk.
        try (OutputStream imageOut = Files.newOutputStream(imagePath)) {
            //TODO: get output format from TesseractConfig
            ImageIOUtil.writeImage(
                    pageImage,
                    config.getOcrImageFormatName(),
                    imageOut,
                    config.getOcrDPI(),
                    config.getOcrImageQuality());
        }

        // Step 2: run OCR over the temp file, writing inline into the current xhtml handler.
        try (InputStream imageIn = TikaInputStream.get(imagePath)) {
            ocrParser.parseInline(imageIn, xhtml, ocrConfig);
        }
    } catch (IOException ioe) {
        handleCatchableIOE(ioe);
    } catch (SAXException saxe) {
        throw new IOExceptionWithCause("error writing OCR content from PDF", saxe);
    } finally {
        // Always release the temp file, whether or not OCR succeeded.
        tempResources.dispose();
    }
}
Use of org.apache.tika.parser.ocr.TesseractOCRParser in the Apache Tika project.
Class BundleIT, method testTesseractParser.
/**
 * Smoke test: feeds {@code src/test/resources/testOCR.jpg} through a
 * {@link TesseractOCRParser} using a fresh handler, metadata and parse context.
 *
 * No assertions are made on the extracted content; the test passes as long as
 * {@code parse} completes without throwing.
 *
 * @throws Exception if the image cannot be opened or parsing fails
 */
@Test
public void testTesseractParser() throws Exception {
    Parser ocrParser = new TesseractOCRParser();
    ParseContext parseContext = new ParseContext();
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    try (InputStream imageStream = new FileInputStream("src/test/resources/testOCR.jpg")) {
        ocrParser.parse(imageStream, bodyHandler, metadata, parseContext);
    }
}
Aggregations