use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class TesseractOCRParser method parse.
/**
 * Runs OCR on the file backing {@code tikaInputStream} and forwards the
 * recognised content to {@code xhtml}.
 * <p>
 * Files whose length falls outside the configured min/max OCR size are skipped.
 * When image processing is enabled and ImageMagick is available, OCR is run on a
 * preprocessed temporary copy of the input; otherwise it is run on the input itself.
 *
 * @param tikaInputStream  stream whose backing file is handed to OCR
 * @param tmpOCROutputFile output file name passed to OCR; the real output file
 *                         carries an additional output-type suffix
 * @param parseContext     context forwarded to the hOCR extraction
 * @param xhtml            handler receiving the extracted content
 * @param config           OCR configuration to use
 * @throws IOException   if reading or copying a file fails
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if one of the invoked helpers reports one
 */
private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile, ParseContext parseContext, XHTMLContentHandler xhtml, TesseractOCRConfig config) throws IOException, SAXException, TikaException {
    File ocrResult = null;
    try {
        File source = tikaInputStream.getFile();
        long length = tikaInputStream.getLength();
        if (length < config.getMinFileSizeToOcr() || length > config.getMaxFileSizeToOcr()) {
            // outside the configured size window: nothing to OCR
            return;
        }

        boolean preprocess = config.isEnableImageProcessing() == 1 && hasImageMagick(config);
        if (preprocess) {
            // work on a throw-away copy so the original input is left untouched
            TemporaryResources scratch = new TemporaryResources();
            try {
                File workingCopy = scratch.createTemporaryFile();
                FileUtils.copyFile(source, workingCopy);
                processImage(workingCopy, config);
                doOCR(workingCopy, tmpOCROutputFile, config);
            } finally {
                scratch.dispose();
            }
        } else {
            doOCR(source, tmpOCROutputFile, config);
        }

        // Tesseract appends the output type (.txt or .hocr) to the output file name
        String basePath = tmpOCROutputFile.getAbsolutePath();
        String suffix = config.getOutputType().toString().toLowerCase(Locale.US);
        ocrResult = new File(basePath + "." + suffix);
        if (!ocrResult.exists()) {
            return;
        }

        try (InputStream ocrStream = new FileInputStream(ocrResult)) {
            boolean hocr = config.getOutputType().equals(TesseractOCRConfig.OUTPUT_TYPE.HOCR);
            if (hocr) {
                extractHOCROutput(ocrStream, parseContext, xhtml);
            } else {
                extractOutput(ocrStream, xhtml);
            }
        }
    } finally {
        // the suffixed output file is not tracked by any TemporaryResources, so remove it here
        if (ocrResult != null) {
            ocrResult.delete();
        }
    }
}
use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class TesseractOCRParser method parse.
/**
 * Parses {@code stream} by first delegating to the image metadata parser and
 * then running OCR, wrapping the OCR output in a complete XHTML document.
 * Returns without emitting anything when {@code hasTesseract(config)} is false.
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
    TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
    // This can happen when the parser is called directly rather than
    // via DefaultParser or similar.
    if (!hasTesseract(config)) {
        return;
    }

    TemporaryResources resources = new TemporaryResources();
    try {
        TikaInputStream tis = TikaInputStream.get(stream, resources);
        // Force spooling to a temporary file in case the stream was not
        // already a TikaInputStream backed by a file.
        tis.getPath();

        // Name given to tesseract on the command line as its text output file;
        // the file actually written will have a suffix appended.
        File ocrOutputBase = resources.createTemporaryFile();

        // Temporary workaround for TIKA-1445 - until composite parsers with
        // strategies (eg Composite, Try In Turn) can be specified, always pass
        // the image on to the regular parser so its metadata is extracted too.
        _TMP_IMAGE_METADATA_PARSER.parse(tis, new DefaultHandler(), metadata, parseContext);

        XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
        document.startDocument();
        parse(tis, ocrOutputBase, parseContext, document, config);
        document.endDocument();
    } finally {
        resources.dispose();
    }
}
use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class TesseractOCRParser method processImage.
/**
 * Preprocesses an image in place to make it more OCR-friendly.
 * <p>
 * If Python is available and the bundled {@code rotation.py} script can be
 * loaded, the script is run to determine the rotation angle; in every other
 * case (no Python, missing script, failed or empty script output) an angle of
 * {@code 0} is used. The image is then rewritten in place with ImageMagick's
 * {@code convert}, using the density, depth, colorspace, filter and resize
 * values from {@code config}. Both external commands are best effort: a failure
 * to execute them is ignored and the image is left as it is.
 *
 * @param streamingObject input image to be processed; it is also the output target
 * @param config TesseractOCRConfig supplying the ImageMagick properties
 * @throws IOException if the temporary rotation script cannot be created or
 *         written, or the temporary resources cannot be released
 * @throws TikaException if one of the invoked helpers reports one
 */
private void processImage(File streamingObject, TesseractOCRConfig config) throws IOException, TikaException {
    DefaultExecutor executor = new DefaultExecutor();
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream);
    executor.setStreamHandler(streamHandler);

    String angle = "0";

    // try-with-resources ensures the temporary script file is released even
    // when copying the script or running a command throws
    try (TemporaryResources tmp = new TemporaryResources()) {
        // fetch rotation script from resources
        File rotationScript = tmp.createTemporaryFile();
        boolean scriptAvailable;
        try (InputStream in = getClass().getResourceAsStream("rotation.py")) {
            // getResourceAsStream returns null when the resource is not on the classpath
            scriptAvailable = in != null;
            if (scriptAvailable) {
                Files.copy(in, rotationScript.toPath(), StandardCopyOption.REPLACE_EXISTING);
            }
        }

        // determine the angle of rotation required to make the text horizontal
        if (scriptAvailable && hasPython()) {
            String cmd = "python " + rotationScript.getAbsolutePath() + " -f " + streamingObject.getAbsolutePath();
            try {
                executor.execute(CommandLine.parse(cmd));
                String detected = outputStream.toString("UTF-8").trim();
                // an empty angle would produce a malformed "-rotate" argument below
                if (!detected.isEmpty()) {
                    angle = detected;
                }
            } catch (Exception ignored) {
                // best effort: fall back to no rotation when the script cannot be run
            }
        }

        // process the image - parameter values can be set in TesseractOCRConfig.properties
        String line = "convert -density " + config.getDensity() + " -depth " + config.getDepth() + " -colorspace " + config.getColorspace() + " -filter " + config.getFilter() + " -resize " + config.getResize() + "% -rotate " + angle + " " + streamingObject.getAbsolutePath() + " " + streamingObject.getAbsolutePath();
        try {
            executor.execute(CommandLine.parse(line));
        } catch (Exception ignored) {
            // best effort: OCR proceeds on the unprocessed image when convert fails
        }
    }
}
use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class TesseractOCRParser method parseInline.
/**
 * Parses content into an already-started document.
 * SAX events are appended to {@code xhtml}; no document start/end is emitted here.
 * Returns without doing anything when {@code hasTesseract(config)} is false.
 *
 * @param stream input stream holding the content to OCR
 * @param xhtml handler that receives the events
 * @param parseContext context passed through to the underlying parse
 * @param config TesseractOCRConfig to use for this parse
 * @throws IOException
 * @throws SAXException
 * @throws TikaException
 */
public void parseInline(InputStream stream, XHTMLContentHandler xhtml, ParseContext parseContext, TesseractOCRConfig config) throws IOException, SAXException, TikaException {
    // This can happen when the parser is called directly rather than
    // via DefaultParser or similar.
    if (!hasTesseract(config)) {
        return;
    }

    TemporaryResources resources = new TemporaryResources();
    try {
        TikaInputStream tis = TikaInputStream.get(stream, resources);
        // temporary file handed to parse(...) as the OCR output file
        File ocrOutputBase = resources.createTemporaryFile();
        parse(tis, ocrOutputBase, parseContext, xhtml, config);
    } finally {
        resources.dispose();
    }
}
use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class AbstractPDF2XHTML method doOCROnCurrentPage.
/**
 * Renders the current page to an image, writes it to a temporary file and
 * feeds that file to the Tesseract parser, appending the result to {@code xhtml}.
 * Does nothing when the OCR strategy is {@code NO_OCR}.
 *
 * @throws IOException   wrapping a SAXException raised while writing OCR content
 * @throws TikaException if Tesseract is not available
 * @throws SAXException  declared by the signature
 */
void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
    if (config.getOcrStrategy().equals(NO_OCR)) {
        return;
    }

    TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG);
    TesseractOCRParser ocrParser = new TesseractOCRParser();
    if (!ocrParser.hasTesseract(ocrConfig)) {
        throw new TikaException("Tesseract is not available. "
                + "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly");
    }

    PDFRenderer pageRenderer = new PDFRenderer(pdDocument);
    TemporaryResources resources = new TemporaryResources();
    try {
        BufferedImage pageImage = pageRenderer.renderImage(pageIndex, 2.0f, config.getOcrImageType());
        Path imagePath = resources.createTempFile();

        // write the rendered page to disk ...
        try (OutputStream imageOut = Files.newOutputStream(imagePath)) {
            //TODO: get output format from TesseractConfig
            ImageIOUtil.writeImage(pageImage, config.getOcrImageFormatName(), imageOut, config.getOcrDPI(), config.getOcrImageQuality());
        }

        // ... then read it back and run OCR on it
        try (InputStream imageIn = TikaInputStream.get(imagePath)) {
            ocrParser.parseInline(imageIn, xhtml, ocrConfig);
        }
    } catch (IOException ioe) {
        handleCatchableIOE(ioe);
    } catch (SAXException saxe) {
        throw new IOExceptionWithCause("error writing OCR content from PDF", saxe);
    } finally {
        resources.dispose();
    }
}
Aggregations