Search in sources :

Example 16 with TemporaryResources

use of org.apache.tika.io.TemporaryResources in project tika by apache.

the class TesseractOCRParser method parse.

/**
 * Runs OCR over an in-memory image by rendering it to a temporary PNG file
 * and handing that file to the stream-based {@code parse} overload.
 *
 * @param image    the image to recognise text in
 * @param handler  passed through to the stream-based parse
 * @param metadata passed through to the stream-based parse
 * @param context  passed through to the stream-based parse
 * @throws IOException   if the temporary PNG cannot be written or read back
 * @throws SAXException  if raised by the stream-based parse
 * @throws TikaException if raised by the stream-based parse or by temp-file cleanup
 */
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    TemporaryResources tmp = new TemporaryResources();
    try {
        int w = image.getWidth(null);
        int h = image.getHeight(null);
        BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);
        // Copy the source pixels into the buffer; without this step the PNG
        // written below is an empty (all-black) canvas and OCR sees nothing.
        // NOTE(review): with a null observer an image that is still loading
        // asynchronously may be drawn only partially - confirm callers pass
        // fully loaded images.
        java.awt.Graphics2D graphics = bImage.createGraphics();
        try {
            graphics.drawImage(image, 0, 0, null);
        } finally {
            graphics.dispose();
        }
        File file = tmp.createTemporaryFile();
        // Close the writer before reading the file back so that all bytes
        // are flushed to disk and the handle is released.
        try (FileOutputStream fos = new FileOutputStream(file)) {
            ImageIO.write(bImage, "png", fos);
        }
        try (TikaInputStream tis = TikaInputStream.get(file)) {
            parse(tis, handler, metadata, context);
        }
    } finally {
        // Remove the temporary file last, once no stream refers to it.
        tmp.dispose();
    }
}
Also used : FileOutputStream(java.io.FileOutputStream) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) File(java.io.File) BufferedImage(java.awt.image.BufferedImage)

Example 17 with TemporaryResources

use of org.apache.tika.io.TemporaryResources in project tika by apache.

the class PackageParser method parse.

/**
 * Unpacks an archive stream and passes every non-directory entry to the
 * embedded document extractor, emitting the result as XHTML.
 * <p>
 * The given stream is not closed: the archive stream is opened over a
 * close-shielded wrapper. Formats that cannot be read as a stream are
 * supported only for 7z, which is reworked as a file first.
 *
 * @param stream   the archive stream; wrapped in a buffer if it lacks mark support
 * @param handler  receives the XHTML SAX events
 * @param metadata document metadata; also passed to the password provider
 * @param context  parse context; may supply a {@code TikaConfig}, an
 *                 {@code ArchiveStreamFactory} and a {@code PasswordProvider}
 * @throws IOException   if the stream or a temporary resource cannot be read or closed
 * @throws SAXException  if the SAX events could not be processed
 * @throws TikaException if the archive cannot be unpacked, uses an unknown
 *                       non-streaming format, or is encrypted
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    //lazily load the MediaTypeRegistry at parse time
    //only want to call getDefaultConfig() once, and can't
    //load statically because of the ForkParser
    TikaConfig config = context.get(TikaConfig.class);
    MediaTypeRegistry mediaTypeRegistry = null;
    if (config != null) {
        mediaTypeRegistry = config.getMediaTypeRegistry();
    } else {
        // NOTE(review): the unsynchronised first read below is only safe if
        // bufferedMediaTypeRegistry is declared volatile - confirm.
        if (bufferedMediaTypeRegistry == null) {
            //buffer this for next time.
            synchronized (lock) {
                //now that we're locked, check again
                if (bufferedMediaTypeRegistry == null) {
                    bufferedMediaTypeRegistry = TikaConfig.getDefaultConfig().getMediaTypeRegistry();
                }
            }
        }
        mediaTypeRegistry = bufferedMediaTypeRegistry;
    }
    // Ensure that the stream supports the mark feature
    if (!stream.markSupported()) {
        stream = new BufferedInputStream(stream);
    }
    TemporaryResources tmp = new TemporaryResources();
    ArchiveInputStream ais = null;
    XHTMLContentHandler xhtml;
    // A single try/finally owns both the temporary resources and the archive
    // stream, so every exit path (including failures while opening the
    // archive or starting the document) releases them.
    try {
        try {
            ArchiveStreamFactory factory = context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
            // At the end we want to close the archive stream to release
            // any associated resources, but the underlying document stream
            // should not be closed
            ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream));
        } catch (StreamingNotSupportedException sne) {
            // Most archive formats work on streams, but a few need files
            if (!sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
                throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
            }
            // Rework as a file, and wrap
            stream.reset();
            TikaInputStream tstream = TikaInputStream.get(stream, tmp);
            // Seven Zip supports passwords, was one given?
            String password = null;
            PasswordProvider provider = context.get(PasswordProvider.class);
            if (provider != null) {
                password = provider.getPassword(metadata);
            }
            SevenZFile sevenz;
            if (password == null) {
                sevenz = new SevenZFile(tstream.getFile());
            } else {
                sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
            }
            // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
            ais = new SevenZWrapper(sevenz);
        } catch (ArchiveException e) {
            throw new TikaException("Unable to unpack document stream", e);
        }
        updateMediaType(ais, mediaTypeRegistry, metadata);
        // Use the delegate parser to parse the contained document
        EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        try {
            ArchiveEntry entry = ais.getNextEntry();
            while (entry != null) {
                if (!entry.isDirectory()) {
                    parseEntry(ais, entry, extractor, metadata, xhtml);
                }
                entry = ais.getNextEntry();
            }
        } catch (UnsupportedZipFeatureException zfe) {
            // If it's an encrypted document of unknown password, report as such
            if (zfe.getFeature() == Feature.ENCRYPTION) {
                throw new EncryptedDocumentException(zfe);
            }
            // Otherwise throw the exception
            throw new TikaException("UnsupportedZipFeature", zfe);
        } catch (PasswordRequiredException pre) {
            throw new EncryptedDocumentException(pre);
        }
    } finally {
        // Close the archive first, but release the temporary resources even
        // if closing the archive fails.
        try {
            if (ais != null) {
                ais.close();
            }
        } finally {
            tmp.close();
        }
    }
    xhtml.endDocument();
}
Also used : StreamingNotSupportedException(org.apache.commons.compress.archivers.StreamingNotSupportedException) TikaException(org.apache.tika.exception.TikaException) EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) TikaConfig(org.apache.tika.config.TikaConfig) EmbeddedDocumentExtractor(org.apache.tika.extractor.EmbeddedDocumentExtractor) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) MediaTypeRegistry(org.apache.tika.mime.MediaTypeRegistry) ZipArchiveEntry(org.apache.commons.compress.archivers.zip.ZipArchiveEntry) ArchiveEntry(org.apache.commons.compress.archivers.ArchiveEntry) PasswordRequiredException(org.apache.commons.compress.PasswordRequiredException) ArchiveException(org.apache.commons.compress.archivers.ArchiveException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) PasswordProvider(org.apache.tika.parser.PasswordProvider) UnsupportedZipFeatureException(org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException) ArchiveStreamFactory(org.apache.commons.compress.archivers.ArchiveStreamFactory) ArArchiveInputStream(org.apache.commons.compress.archivers.ar.ArArchiveInputStream) TarArchiveInputStream(org.apache.commons.compress.archivers.tar.TarArchiveInputStream) JarArchiveInputStream(org.apache.commons.compress.archivers.jar.JarArchiveInputStream) ArchiveInputStream(org.apache.commons.compress.archivers.ArchiveInputStream) CpioArchiveInputStream(org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream) ZipArchiveInputStream(org.apache.commons.compress.archivers.zip.ZipArchiveInputStream) DumpArchiveInputStream(org.apache.commons.compress.archivers.dump.DumpArchiveInputStream) SevenZFile(org.apache.commons.compress.archivers.sevenz.SevenZFile) BufferedInputStream(java.io.BufferedInputStream) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 18 with TemporaryResources

use of org.apache.tika.io.TemporaryResources in project tika by apache.

the class PackageParser method parseEntry.

/**
 * Handles a single archive entry: readable entries are offered to the
 * embedded document extractor, unreadable ones are recorded on the parent
 * metadata and listed by name in the XHTML output.
 *
 * @param archive        the archive stream, positioned at {@code entry}
 * @param entry          the entry to process
 * @param extractor      decides whether to parse the entry and performs the parse
 * @param parentMetadata metadata of the containing document; receives recorded exceptions
 * @param xhtml          XHTML output of the containing document
 * @throws SAXException  if the SAX events could not be processed
 * @throws IOException   if the entry data cannot be read
 * @throws TikaException if temp-file cleanup fails
 */
private void parseEntry(ArchiveInputStream archive, ArchiveEntry entry, EmbeddedDocumentExtractor extractor, Metadata parentMetadata, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    final String entryName = entry.getName();

    if (!archive.canReadEntryData(entry)) {
        // Unreadable entry: note why (when known) and list its name.
        final String label = (entryName != null) ? entryName : "";
        if (!(entry instanceof ZipArchiveEntry)) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(new TikaException("Can't read archive stream (" + label + ")"), parentMetadata);
        } else if (((ZipArchiveEntry) entry).getGeneralPurposeBit().usesEncryption()) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(new EncryptedDocumentException("stream (" + label + ") is encrypted"), parentMetadata);
        }
        if (!label.isEmpty()) {
            xhtml.element("p", label);
        }
        return;
    }

    // Fetch the metadata on the entry contained in the archive
    Metadata entryMetadata = handleEntryMetadata(entryName, null, entry.getLastModifiedDate(), entry.getSize(), xhtml);

    // Recurse into the entry only if the extractor wants it
    if (!extractor.shouldParseEmbedded(entryMetadata)) {
        return;
    }

    // Detectors need a mark/reset capable stream, which the raw
    // ArchiveInputStream is not, so wrap it in a TikaInputStream.
    TemporaryResources scratch = new TemporaryResources();
    try {
        TikaInputStream wrapped = TikaInputStream.get(archive, scratch);
        extractor.parseEmbedded(wrapped, xhtml, entryMetadata, true);
    } finally {
        scratch.dispose();
    }
}
Also used : EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) TikaException(org.apache.tika.exception.TikaException) Metadata(org.apache.tika.metadata.Metadata) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) ZipArchiveEntry(org.apache.commons.compress.archivers.zip.ZipArchiveEntry)

Example 19 with TemporaryResources

use of org.apache.tika.io.TemporaryResources in project tika by apache.

the class CommonsDigester method digest.

/**
 * Computes the configured digests for the given stream and stores them via
 * the digest helpers.
 * <p>
 * Strategy: a file-backed stream longer than {@code markLimit} is digested
 * straight from its file. Otherwise each algorithm is run over a bounded
 * mark/reset view of the stream; if the bound is hit before the stream
 * ends, the stream is spooled to a file and that file is digested instead.
 *
 * @param is           the stream to digest
 * @param m            metadata handed to the digest helpers
 * @param parseContext parse context (not used in this method)
 * @throws IOException if reading fails, or if temp-file cleanup fails
 *                     (the cleanup failure is wrapped as the cause)
 */
@Override
public void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException {
    TikaInputStream tis = TikaInputStream.cast(is);
    //just digest the underlying file when it is already on disk
    //and too long for the mark/reset approach.
    if (tis != null && tis.hasFile() && tis.getLength() > markLimit) {
        digestFile(tis.getFile(), m);
        return;
    }
    //try the usual mark/reset stuff.
    //however, if you actually hit the bound,
    //then stop and spool to file via TikaInputStream
    SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, is);
    boolean finishedStream = false;
    for (DigestAlgorithm algorithm : algorithms) {
        bis.mark(markLimit + 1);
        finishedStream = digestEach(algorithm, bis, m);
        bis.reset();
        if (!finishedStream) {
            break;
        }
    }
    //spool to File and digest that.
    if (!finishedStream) {
        if (tis != null) {
            digestFile(tis.getFile(), m);
        } else {
            TemporaryResources tmp = new TemporaryResources();
            try {
                TikaInputStream tmpTikaInputStream = TikaInputStream.get(is, tmp);
                digestFile(tmpTikaInputStream.getFile(), m);
            } finally {
                try {
                    tmp.dispose();
                } catch (TikaException e) {
                    // digest() may only throw IOException, so wrap.
                    throw new IOExceptionWithCause(e);
                }
            }
        }
    }
}
Also used : IOExceptionWithCause(org.apache.tika.io.IOExceptionWithCause) TikaException(org.apache.tika.exception.TikaException) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream)

Example 20 with TemporaryResources

use of org.apache.tika.io.TemporaryResources in project tika by apache.

the class PooledTimeSeriesParser method parse.

/**
 * Runs the pooled time series computation on the given stream and writes the
 * two resulting feature tables as XHTML, then forwards the same stream to
 * the video metadata parser.
 * <p>
 * If the tool is not available a warning is logged and nothing is emitted.
 * The computation's results are read from two files named after the spooled
 * input file with the suffixes {@code .of.txt} and {@code .hog.txt}.
 *
 * @param stream   the document stream (input)
 * @param handler  handler for the XHTML SAX events (output)
 * @param metadata document metadata (input and output)
 * @param context  parse context, passed on to the video metadata parser
 * @throws IOException   if the stream or the result files could not be read
 * @throws SAXException  if the SAX events could not be processed
 * @throws TikaException if the document could not be parsed or temp cleanup fails
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    if (!isAvailable) {
        LOG.warn("PooledTimeSeries not installed!");
        return;
    }

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
        File input = tikaStream.getFile();

        // Only the side effect is needed here; the returned text is not used.
        computePoT(input);

        File opticalFlowFile = new File(input.getAbsoluteFile() + ".of.txt");
        File orientedGradientFile = new File(input.getAbsoluteFile() + ".hog.txt");

        try (InputStream ofStream = new FileInputStream(opticalFlowFile);
             InputStream ogStream = new FileInputStream(orientedGradientFile)) {
            // Headers first: the "*_frames" / "*_vecSize" metadata values read
            // below are presumably populated by these calls - verify.
            extractHeaderOutput(ofStream, metadata, "of");
            extractHeaderOutput(ogStream, metadata, "og");

            xhtml.startDocument();
            doExtract(ofStream, xhtml, "Histogram of Optical Flows (HOF)", metadata.get("of_frames"), metadata.get("of_vecSize"));
            doExtract(ogStream, xhtml, "Histogram of Oriented Gradients (HOG)", metadata.get("og_frames"), metadata.get("og_vecSize"));
            xhtml.endDocument();
        }

        // Temporary workaround for TIKA-1445 - until we can specify
        //  composite parsers with strategies (eg Composite, Try In Turn),
        //  always send the image onwards to the regular parser to have
        //  the metadata for them extracted as well
        _TMP_VIDEO_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
    } finally {
        tmp.dispose();
    }
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) File(java.io.File) FileInputStream(java.io.FileInputStream)

Aggregations

TemporaryResources (org.apache.tika.io.TemporaryResources)31 TikaInputStream (org.apache.tika.io.TikaInputStream)30 TikaException (org.apache.tika.exception.TikaException)15 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)14 File (java.io.File)11 IOException (java.io.IOException)8 InputStream (java.io.InputStream)6 SAXException (org.xml.sax.SAXException)6 FileInputStream (java.io.FileInputStream)4 EncryptedDocumentException (org.apache.tika.exception.EncryptedDocumentException)4 Metadata (org.apache.tika.metadata.Metadata)4 MediaType (org.apache.tika.mime.MediaType)4 ZipArchiveEntry (org.apache.commons.compress.archivers.zip.ZipArchiveEntry)2 EmbeddedDocumentExtractor (org.apache.tika.extractor.EmbeddedDocumentExtractor)2 JempboxExtractor (org.apache.tika.parser.image.xmp.JempboxExtractor)2 IsoFile (com.coremedia.iso.IsoFile)1 Box (com.coremedia.iso.boxes.Box)1 FileTypeBox (com.coremedia.iso.boxes.FileTypeBox)1 MetaBox (com.coremedia.iso.boxes.MetaBox)1 MovieBox (com.coremedia.iso.boxes.MovieBox)1