Search in sources:

Example 6 with TemporaryResources

use of org.apache.tika.io.TemporaryResources in project tika by apache.

the class TesseractOCRParser method parse.

/**
 * Runs OCR on the file backing {@code tikaInputStream} and forwards the
 * recognized content to {@code xhtml}.
 *
 * Nothing is done when the stream length falls outside the
 * {@code [minFileSizeToOcr, maxFileSizeToOcr]} window taken from {@code config}.
 * When image processing is enabled and ImageMagick is reported as present, the
 * input is first copied to a temporary file, preprocessed there, and that
 * copy is handed to Tesseract; otherwise the original file is used directly.
 *
 * @param tikaInputStream  stream whose backing file is OCR'd
 * @param tmpOCROutputFile base output file name passed to Tesseract; the real
 *                         output file carries an extra suffix (see below)
 * @param parseContext     context forwarded to the hOCR extractor
 * @param xhtml            handler receiving the extracted content
 * @param config           OCR configuration (size limits, output type, preprocessing)
 * @throws IOException   propagated from file access, preprocessing or OCR
 * @throws SAXException  propagated from the output extraction
 * @throws TikaException propagated from preprocessing, OCR or temp-resource disposal
 */
private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile, ParseContext parseContext, XHTMLContentHandler xhtml, TesseractOCRConfig config) throws IOException, SAXException, TikaException {
    File input = tikaInputStream.getFile();
    long size = tikaInputStream.getLength();
    if (size < config.getMinFileSizeToOcr() || size > config.getMaxFileSizeToOcr()) {
        return;
    }
    // Tesseract appends the output type (.txt or .hocr) to output file name.
    // The path is computed BEFORE running OCR so that the finally block can
    // remove a partially written output file even when doOCR throws.
    File tmpTxtOutput = new File(tmpOCROutputFile.getAbsolutePath() + "." + config.getOutputType().toString().toLowerCase(Locale.US));
    try {
        // Process image if ImageMagick Tool is present
        if (config.isEnableImageProcessing() == 1 && hasImageMagick(config)) {
            // copy the contents of the original input file into a temporary file
            // which will be preprocessed for OCR
            TemporaryResources tmp = new TemporaryResources();
            try {
                File tmpFile = tmp.createTemporaryFile();
                FileUtils.copyFile(input, tmpFile);
                processImage(tmpFile, config);
                doOCR(tmpFile, tmpOCROutputFile, config);
            } finally {
                tmp.dispose();
            }
        } else {
            doOCR(input, tmpOCROutputFile, config);
        }
        if (tmpTxtOutput.exists()) {
            try (InputStream is = new FileInputStream(tmpTxtOutput)) {
                if (config.getOutputType().equals(TesseractOCRConfig.OUTPUT_TYPE.HOCR)) {
                    extractHOCROutput(is, parseContext, xhtml);
                } else {
                    extractOutput(is, xhtml);
                }
            }
        }
    } finally {
        // best-effort cleanup; delete() simply returns false if nothing was written
        tmpTxtOutput.delete();
    }
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) TemporaryResources(org.apache.tika.io.TemporaryResources) File(java.io.File) FileInputStream(java.io.FileInputStream)

Example 7 with TemporaryResources

use of org.apache.tika.io.TemporaryResources in project tika by apache.

the class TesseractOCRParser method parse.

/**
 * Parser entry point: extracts image metadata through the regular image
 * parser, then OCRs the stream and writes the result as an XHTML document
 * to {@code handler}.
 *
 * @param stream       image data to OCR
 * @param handler      receives the XHTML SAX events
 * @param metadata     populated by the image metadata parser and used for the XHTML document
 * @param parseContext source of the {@link TesseractOCRConfig}; {@code DEFAULT_CONFIG} is the fallback
 * @throws IOException   propagated from stream/temp-file handling or OCR
 * @throws SAXException  propagated from the content handlers
 * @throws TikaException propagated from parsing or temp-resource disposal
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
    final TesseractOCRConfig ocrConfig = parseContext.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
    // Only proceed when hasTesseract accepts this config. Per the original note,
    // a negative answer should only occur if someone directly calls this parser,
    // not via DefaultParser or similar.
    if (hasTesseract(ocrConfig)) {
        final TemporaryResources resources = new TemporaryResources();
        try {
            final TikaInputStream tis = TikaInputStream.get(stream, resources);
            // Trigger the spooling to a tmp file if the stream wasn't
            // already a TikaInputStream that contained a file.
            tis.getPath();
            // This is the text output file name specified on the tesseract
            // command line. The actual output file name will have a suffix added.
            final File ocrOutputBase = resources.createTemporaryFile();
            // Temporary workaround for TIKA-1445 - until we can specify
            // composite parsers with strategies (eg Composite, Try In Turn),
            // always send the image onwards to the regular parser to have
            // the metadata for them extracted as well.
            _TMP_IMAGE_METADATA_PARSER.parse(tis, new DefaultHandler(), metadata, parseContext);
            final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
            document.startDocument();
            parse(tis, ocrOutputBase, parseContext, document, ocrConfig);
            document.endDocument();
        } finally {
            resources.dispose();
        }
    }
}
Also used : TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) File(java.io.File) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Example 8 with TemporaryResources

use of org.apache.tika.io.TemporaryResources in project tika by apache.

the class TesseractOCRParser method processImage.

/**
 * Processes the image in place into an OCR-friendly format.
 *
 * First, if {@code hasPython()} is true, the bundled {@code rotation.py}
 * script is run against the image and its trimmed output is used as the
 * rotation angle (default {@code "0"}). Then ImageMagick's {@code convert}
 * is invoked with the density, depth, colorspace, filter and resize values
 * from {@code config}, overwriting {@code streamingObject}.
 *
 * Both external commands are best-effort: a failure to execute either one
 * is ignored and the image is left as it is at that point.
 *
 * @param streamingObject input image to be processed (overwritten in place)
 * @param config TesseractOCRConfig supplying the ImageMagick properties
 * @throws IOException if the rotation script resource is missing or cannot be
 *         copied to a temporary file, or if closing the temporary resources fails
 * @throws TikaException declared for callers; NOTE(review): nothing visible in
 *         this method raises it directly - confirm against hasPython()
 */
private void processImage(File streamingObject, TesseractOCRConfig config) throws IOException, TikaException {
    DefaultExecutor executor = new DefaultExecutor();
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream);
    executor.setStreamHandler(streamHandler);
    String angle = "0";
    TemporaryResources tmp = new TemporaryResources();
    try {
        // fetch rotation script from resources
        File rotationScript = tmp.createTemporaryFile();
        try (InputStream in = getClass().getResourceAsStream("rotation.py")) {
            if (in == null) {
                // getResourceAsStream signals "not found" with null; fail with a
                // meaningful message instead of an NPE inside Files.copy
                throw new IOException("Resource rotation.py not found for " + getClass().getName());
            }
            Files.copy(in, rotationScript.toPath(), StandardCopyOption.REPLACE_EXISTING);
        }
        String cmd = "python " + rotationScript.getAbsolutePath() + " -f " + streamingObject.getAbsolutePath();
        // determine the angle of rotation required to make the text horizontal
        CommandLine cmdLine = CommandLine.parse(cmd);
        if (hasPython()) {
            try {
                executor.execute(cmdLine);
                angle = outputStream.toString("UTF-8").trim();
            } catch (Exception ignored) {
                // best-effort: keep the default angle when the script cannot be run
            }
        }
        // process the image - parameter values can be set in TesseractOCRConfig.properties
        String line = "convert -density " + config.getDensity() + " -depth " + config.getDepth() + " -colorspace " + config.getColorspace() + " -filter " + config.getFilter() + " -resize " + config.getResize() + "% -rotate " + angle + " " + streamingObject.getAbsolutePath() + " " + streamingObject.getAbsolutePath();
        cmdLine = CommandLine.parse(line);
        try {
            executor.execute(cmdLine);
        } catch (Exception ignored) {
            // best-effort: OCR proceeds on the unprocessed image if convert fails
        }
    } finally {
        // always release the temporary rotation script, even on failure
        tmp.close();
    }
}
Also used : CommandLine(org.apache.commons.exec.CommandLine) PumpStreamHandler(org.apache.commons.exec.PumpStreamHandler) DefaultExecutor(org.apache.commons.exec.DefaultExecutor) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) TemporaryResources(org.apache.tika.io.TemporaryResources) ByteArrayOutputStream(java.io.ByteArrayOutputStream) File(java.io.File) TimeoutException(java.util.concurrent.TimeoutException) SAXException(org.xml.sax.SAXException) TikaException(org.apache.tika.exception.TikaException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException)

Example 9 with TemporaryResources

use of org.apache.tika.io.TemporaryResources in project tika by apache.

the class TesseractOCRParser method parseInline.

/**
 * Parses content without starting a new document: SAX events are appended
 * to {@code xhtml} without re-adding the metadata, body start, etc.
 *
 * @param stream       input stream holding the image to OCR
 * @param xhtml        handler the OCR output is appended to
 * @param parseContext context forwarded to the underlying parse
 * @param config       TesseractOCRConfig to use for this parse
 * @throws IOException   propagated from stream/temp-file handling or OCR
 * @throws SAXException  propagated from writing to {@code xhtml}
 * @throws TikaException propagated from OCR or temp-resource disposal
 */
public void parseInline(InputStream stream, XHTMLContentHandler xhtml, ParseContext parseContext, TesseractOCRConfig config) throws IOException, SAXException, TikaException {
    // Only proceed when hasTesseract accepts this config. Per the original note,
    // a negative answer should only occur if someone directly calls this parser,
    // not via DefaultParser or similar.
    if (hasTesseract(config)) {
        final TemporaryResources resources = new TemporaryResources();
        try {
            final TikaInputStream tis = TikaInputStream.get(stream, resources);
            // base name handed to the file-based parse as its OCR output file
            final File ocrOutputBase = resources.createTemporaryFile();
            parse(tis, ocrOutputBase, parseContext, xhtml, config);
        } finally {
            resources.dispose();
        }
    }
}
Also used : TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) File(java.io.File)

Example 10 with TemporaryResources

use of org.apache.tika.io.TemporaryResources in project tika by apache.

the class AbstractPDF2XHTML method doOCROnCurrentPage.

/**
 * Renders the current page to an image, writes it to a temporary file and
 * passes that file to {@link TesseractOCRParser#parseInline} so the OCR
 * output is appended to {@code xhtml}.
 *
 * Returns immediately when the configured OCR strategy equals {@code NO_OCR}.
 *
 * @throws IOException   wrapping a SAXException raised while writing OCR content;
 *                       other IOExceptions are routed to {@code handleCatchableIOE}
 * @throws TikaException if Tesseract is reported as unavailable, or propagated
 *                       from OCR / temp-resource disposal
 * @throws SAXException  declared for callers
 */
void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
    if (config.getOcrStrategy().equals(NO_OCR)) {
        return;
    }
    final TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG);
    final TesseractOCRParser ocrParser = new TesseractOCRParser();
    final boolean tesseractAvailable = ocrParser.hasTesseract(ocrConfig);
    if (!tesseractAvailable) {
        throw new TikaException("Tesseract is not available. " + "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly");
    }
    final PDFRenderer pageRenderer = new PDFRenderer(pdDocument);
    final TemporaryResources resources = new TemporaryResources();
    try {
        // render the page (scale factor 2.0) using the configured image type
        final BufferedImage pageImage = pageRenderer.renderImage(pageIndex, 2.0f, config.getOcrImageType());
        final Path imagePath = resources.createTempFile();
        try (OutputStream imageOut = Files.newOutputStream(imagePath)) {
            //TODO: get output format from TesseractConfig
            ImageIOUtil.writeImage(pageImage, config.getOcrImageFormatName(), imageOut, config.getOcrDPI(), config.getOcrImageQuality());
        }
        try (InputStream imageIn = TikaInputStream.get(imagePath)) {
            ocrParser.parseInline(imageIn, xhtml, ocrConfig);
        }
    } catch (IOException ioe) {
        handleCatchableIOE(ioe);
    } catch (SAXException saxe) {
        throw new IOExceptionWithCause("error writing OCR content from PDF", saxe);
    } finally {
        resources.dispose();
    }
}
Also used : TesseractOCRConfig(org.apache.tika.parser.ocr.TesseractOCRConfig) Path(java.nio.file.Path) IOExceptionWithCause(org.apache.commons.io.IOExceptionWithCause) TikaException(org.apache.tika.exception.TikaException) BufferedInputStream(java.io.BufferedInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) OutputStream(java.io.OutputStream) TemporaryResources(org.apache.tika.io.TemporaryResources) IOException(java.io.IOException) TesseractOCRParser(org.apache.tika.parser.ocr.TesseractOCRParser) PDFRenderer(org.apache.pdfbox.rendering.PDFRenderer) BufferedImage(java.awt.image.BufferedImage) SAXException(org.xml.sax.SAXException)

Aggregations

TemporaryResources (org.apache.tika.io.TemporaryResources) 31, TikaInputStream (org.apache.tika.io.TikaInputStream) 30, TikaException (org.apache.tika.exception.TikaException) 15, XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler) 14, File (java.io.File) 11, IOException (java.io.IOException) 8, InputStream (java.io.InputStream) 6, SAXException (org.xml.sax.SAXException) 6, FileInputStream (java.io.FileInputStream) 4, EncryptedDocumentException (org.apache.tika.exception.EncryptedDocumentException) 4, Metadata (org.apache.tika.metadata.Metadata) 4, MediaType (org.apache.tika.mime.MediaType) 4, ZipArchiveEntry (org.apache.commons.compress.archivers.zip.ZipArchiveEntry) 2, EmbeddedDocumentExtractor (org.apache.tika.extractor.EmbeddedDocumentExtractor) 2, JempboxExtractor (org.apache.tika.parser.image.xmp.JempboxExtractor) 2, IsoFile (com.coremedia.iso.IsoFile) 1, Box (com.coremedia.iso.boxes.Box) 1, FileTypeBox (com.coremedia.iso.boxes.FileTypeBox) 1, MetaBox (com.coremedia.iso.boxes.MetaBox) 1, MovieBox (com.coremedia.iso.boxes.MovieBox) 1