Search in sources :

Example 86 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class PackageParser method parse.

/**
 * Opens the given stream as an archive and passes every non-directory entry
 * to {@code parseEntry}, writing the result as XHTML to {@code handler}.
 *
 * <p>The incoming {@code stream} itself is never closed here: it is wrapped
 * in a {@link CloseShieldInputStream} before being handed to the archive
 * factory. The archive stream and the {@link TemporaryResources} created by
 * this method are always released, including when closing the archive
 * stream itself fails or when the 7z fallback cannot be set up.
 *
 * @param stream   the archive bytes; wrapped in a {@link BufferedInputStream}
 *                 when it does not support mark/reset
 * @param handler  receives the XHTML SAX events
 * @param metadata metadata of the document being parsed; passed to
 *                 {@code updateMediaType} and to the password provider
 * @param context  consulted for {@link TikaConfig}, {@link ArchiveStreamFactory}
 *                 and {@link PasswordProvider}
 * @throws IOException   on read failures, or when releasing resources fails
 * @throws SAXException  when the content handler rejects an event
 * @throws TikaException when the archive cannot be unpacked, uses an
 *                       unsupported zip feature, or (as an
 *                       {@link EncryptedDocumentException}) is encrypted and
 *                       cannot be read
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    //lazily load the MediaTypeRegistry at parse time
    //only want to call getDefaultConfig() once, and can't
    //load statically because of the ForkParser
    TikaConfig config = context.get(TikaConfig.class);
    MediaTypeRegistry mediaTypeRegistry = null;
    if (config != null) {
        mediaTypeRegistry = config.getMediaTypeRegistry();
    } else {
        if (bufferedMediaTypeRegistry == null) {
            //buffer this for next time.
            synchronized (lock) {
                //now that we're locked, check again
                if (bufferedMediaTypeRegistry == null) {
                    bufferedMediaTypeRegistry = TikaConfig.getDefaultConfig().getMediaTypeRegistry();
                }
            }
        }
        mediaTypeRegistry = bufferedMediaTypeRegistry;
    }
    // Ensure that the stream supports the mark feature
    if (!stream.markSupported()) {
        stream = new BufferedInputStream(stream);
    }
    TemporaryResources tmp = new TemporaryResources();
    ArchiveInputStream ais = null;
    try {
        ArchiveStreamFactory factory = context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
        // At the end we want to close the archive stream to release
        // any associated resources, but the underlying document stream
        // should not be closed
        ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream));
    } catch (StreamingNotSupportedException sne) {
        // Most archive formats work on streams, but a few need files
        if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
            // Anything below may throw (reset, spooling to a file, opening
            // the 7z file); make sure tmp is released in that case, since
            // the cleanup further down is never reached.
            boolean wrapped = false;
            try {
                // Rework as a file, and wrap
                stream.reset();
                TikaInputStream tstream = TikaInputStream.get(stream, tmp);
                // Seven Zip supports passwords, was one given?
                String password = null;
                PasswordProvider provider = context.get(PasswordProvider.class);
                if (provider != null) {
                    password = provider.getPassword(metadata);
                }
                SevenZFile sevenz;
                if (password == null) {
                    sevenz = new SevenZFile(tstream.getFile());
                } else {
                    sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
                }
                // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
                ais = new SevenZWrapper(sevenz);
                wrapped = true;
            } finally {
                if (!wrapped) {
                    tmp.close();
                }
            }
        } else {
            tmp.close();
            throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
        }
    } catch (ArchiveException e) {
        tmp.close();
        throw new TikaException("Unable to unpack document stream", e);
    }
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    try {
        updateMediaType(ais, mediaTypeRegistry, metadata);
        // Use the delegate parser to parse the contained document
        EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        xhtml.startDocument();
        try {
            ArchiveEntry entry = ais.getNextEntry();
            while (entry != null) {
                if (!entry.isDirectory()) {
                    parseEntry(ais, entry, extractor, metadata, xhtml);
                }
                entry = ais.getNextEntry();
            }
        } catch (UnsupportedZipFeatureException zfe) {
            // If it's an encrypted document of unknown password, report as such
            if (zfe.getFeature() == Feature.ENCRYPTION) {
                throw new EncryptedDocumentException(zfe);
            }
            // Otherwise throw the exception
            throw new TikaException("UnsupportedZipFeature", zfe);
        } catch (PasswordRequiredException pre) {
            throw new EncryptedDocumentException(pre);
        }
    } finally {
        // Release tmp even when closing the archive stream fails
        try {
            ais.close();
        } finally {
            tmp.close();
        }
    }
    // Only reached when every entry was processed and cleanup succeeded
    xhtml.endDocument();
}
Also used : StreamingNotSupportedException(org.apache.commons.compress.archivers.StreamingNotSupportedException) TikaException(org.apache.tika.exception.TikaException) EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) TikaConfig(org.apache.tika.config.TikaConfig) EmbeddedDocumentExtractor(org.apache.tika.extractor.EmbeddedDocumentExtractor) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) MediaTypeRegistry(org.apache.tika.mime.MediaTypeRegistry) ZipArchiveEntry(org.apache.commons.compress.archivers.zip.ZipArchiveEntry) ArchiveEntry(org.apache.commons.compress.archivers.ArchiveEntry) PasswordRequiredException(org.apache.commons.compress.PasswordRequiredException) ArchiveException(org.apache.commons.compress.archivers.ArchiveException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) PasswordProvider(org.apache.tika.parser.PasswordProvider) UnsupportedZipFeatureException(org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException) ArchiveStreamFactory(org.apache.commons.compress.archivers.ArchiveStreamFactory) ArArchiveInputStream(org.apache.commons.compress.archivers.ar.ArArchiveInputStream) TarArchiveInputStream(org.apache.commons.compress.archivers.tar.TarArchiveInputStream) JarArchiveInputStream(org.apache.commons.compress.archivers.jar.JarArchiveInputStream) ArchiveInputStream(org.apache.commons.compress.archivers.ArchiveInputStream) CpioArchiveInputStream(org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream) ZipArchiveInputStream(org.apache.commons.compress.archivers.zip.ZipArchiveInputStream) DumpArchiveInputStream(org.apache.commons.compress.archivers.dump.DumpArchiveInputStream) SevenZFile(org.apache.commons.compress.archivers.sevenz.SevenZFile) BufferedInputStream(java.io.BufferedInputStream) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 87 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class PackageParser method parseEntry.

/**
 * Processes a single archive entry.
 *
 * <p>When the archive reports that the entry data cannot be read, an
 * exception is recorded on {@code parentMetadata} (an
 * {@link EncryptedDocumentException} for encrypted zip entries, a
 * {@link TikaException} for non-zip entries) and the entry name, if any,
 * is written as a {@code <p>} element. Otherwise the entry metadata is
 * built via {@code handleEntryMetadata} and, if the extractor agrees, the
 * entry is parsed as an embedded document.
 *
 * @param archive        the archive stream, positioned at {@code entry}
 * @param entry          the entry to process
 * @param extractor      decides whether, and performs, embedded parsing
 * @param parentMetadata metadata of the containing document; receives
 *                       recorded embedded-stream exceptions
 * @param xhtml          handler receiving the output
 */
private void parseEntry(ArchiveInputStream archive, ArchiveEntry entry, EmbeddedDocumentExtractor extractor, Metadata parentMetadata, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    String name = entry.getName();

    // Unreadable entry: record why (where known) and emit just the name
    if (!archive.canReadEntryData(entry)) {
        String label = (name == null) ? "" : name;
        if (!(entry instanceof ZipArchiveEntry)) {
            TikaException unreadable = new TikaException("Can't read archive stream (" + label + ")");
            EmbeddedDocumentUtil.recordEmbeddedStreamException(unreadable, parentMetadata);
        } else if (((ZipArchiveEntry) entry).getGeneralPurposeBit().usesEncryption()) {
            EncryptedDocumentException encrypted = new EncryptedDocumentException("stream (" + label + ") is encrypted");
            EmbeddedDocumentUtil.recordEmbeddedStreamException(encrypted, parentMetadata);
        }
        if (!label.isEmpty()) {
            xhtml.element("p", label);
        }
        return;
    }

    // Fetch the metadata on the entry contained in the archive
    Metadata entrydata = handleEntryMetadata(name, null, entry.getLastModifiedDate(), entry.getSize(), xhtml);

    // Recurse into the entry only if desired
    if (!extractor.shouldParseEmbedded(entrydata)) {
        return;
    }

    // For detectors to work, we need a mark/reset supporting
    // InputStream, which ArchiveInputStream isn't, so wrap
    TemporaryResources tmp = new TemporaryResources();
    try {
        extractor.parseEmbedded(TikaInputStream.get(archive, tmp), xhtml, entrydata, true);
    } finally {
        tmp.dispose();
    }
}
Also used : EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) TikaException(org.apache.tika.exception.TikaException) Metadata(org.apache.tika.metadata.Metadata) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) ZipArchiveEntry(org.apache.commons.compress.archivers.zip.ZipArchiveEntry)

Example 88 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class RTFEmbObjHandler method extractObj.

/**
 * Hands an embedded object's raw bytes to the embedded document parser.
 *
 * <p>Sets {@code Metadata.CONTENT_LENGTH} from the byte count and, when no
 * resource name is present, synthesises one ({@code thumbnail_N} for
 * pictures inside an object, {@code file_N} otherwise) using the extension
 * reported by {@code embeddedDocumentUtil.getExtension}. An
 * {@link IOException} thrown while parsing the embedded document is
 * recorded on {@code metadata} rather than propagated.
 *
 * <p>The stream wrapping {@code bytes} is closed on every exit path,
 * including failures while working out the resource name.
 *
 * @param bytes    the embedded object's content; {@code null} is a no-op
 * @param handler  receives the embedded document's SAX events
 * @param metadata metadata for the embedded object; updated in place
 */
private void extractObj(byte[] bytes, ContentHandler handler, Metadata metadata) throws SAXException, IOException, TikaException {
    if (bytes == null) {
        return;
    }
    metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(bytes.length));
    if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
        TikaInputStream stream = TikaInputStream.get(bytes);
        // The outer try/finally covers every use of the stream, so it is
        // closed even when naming the resource fails before parsing starts.
        try {
            if (metadata.get(Metadata.RESOURCE_NAME_KEY) == null) {
                String extension = embeddedDocumentUtil.getExtension(stream, metadata);
                if (inObject && state == EMB_STATE.PICT) {
                    metadata.set(Metadata.RESOURCE_NAME_KEY, "thumbnail_" + thumbCount++ + extension);
                    metadata.set(RTFMetadata.THUMBNAIL, "true");
                } else {
                    metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + extension);
                }
            }
            // Only IOExceptions from the embedded parse itself are recorded
            // and swallowed; anything thrown above still propagates.
            try {
                embeddedDocumentUtil.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            } catch (IOException e) {
                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
            }
        } finally {
            stream.close();
        }
    }
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) IOException(java.io.IOException)

Example 89 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class PDFParser method parse.

/**
 * Loads the PDF with PDFBox, records its metadata, runs the configured
 * access check and, when a handler is supplied, emits the content using
 * the XFA-only, OCR-only or regular text extraction path.
 *
 * <p>The caller's {@code stream} is not closed by this method: when it is
 * read as a stream it is wrapped in a {@link CloseShieldInputStream}. The
 * loaded {@link PDDocument} is always closed.
 *
 * @param stream   the PDF content
 * @param handler  receives the extracted content; may be {@code null}, in
 *                 which case only metadata is extracted
 * @param metadata populated with PDF metadata, including
 *                 {@code PDF.IS_ENCRYPTED} and the content type
 * @param context  supplies an optional {@link PDFParserConfig} override and
 *                 is passed on to the helpers
 * @throws EncryptedDocumentException when PDFBox reports an invalid password
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
    PDDocument pdfDocument = null;
    try {
        // PDFBox can process entirely in memory, or can use a temp file
        //  for unpacked / processed resources
        // Decide which to do based on if we're reading from a file or not already
        //TODO: make this configurable via MemoryUsageSetting
        TikaInputStream tstream = TikaInputStream.cast(stream);
        String password = getPassword(metadata, context);
        boolean fileBacked = tstream != null && tstream.hasFile();
        if (fileBacked) {
            // File based -- send file directly to PDFBox
            pdfDocument = PDDocument.load(tstream.getPath().toFile(), password);
        } else {
            pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), password);
        }

        metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(pdfDocument.isEncrypted()));
        metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
        extractMetadata(pdfDocument, metadata, context);
        localConfig.getAccessChecker().check(metadata);

        if (handler != null) {
            if (shouldHandleXFAOnly(pdfDocument, localConfig)) {
                handleXFAOnly(pdfDocument, handler, metadata, context);
            } else {
                boolean ocrOnly = localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
                if (ocrOnly) {
                    metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
                    OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
                } else {
                    boolean ocrAndText = localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION);
                    if (ocrAndText) {
                        metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
                    }
                    PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
                }
            }
        }
    } catch (InvalidPasswordException e) {
        // A rejected password implies the document is encrypted
        metadata.set(PDF.IS_ENCRYPTED, "true");
        throw new EncryptedDocumentException(e);
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}
Also used : EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) TikaInputStream(org.apache.tika.io.TikaInputStream) InvalidPasswordException(org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException) COSString(org.apache.pdfbox.cos.COSString) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 90 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class CommonsDigester method digest.

/**
 * Computes the configured digests for the stream and stores them via the
 * {@code digestEach} / {@code digestFile} helpers.
 *
 * <p>Strategy, in order:
 * <ol>
 *   <li>If the input is a file-backed {@link TikaInputStream} longer than
 *       {@code markLimit}, digest the file directly.</li>
 *   <li>Otherwise digest through a {@code SimpleBoundedInputStream} limited
 *       to {@code markLimit}, using mark/reset between algorithms.</li>
 *   <li>If an algorithm did not finish the stream, fall back to digesting a
 *       file: the {@code TikaInputStream}'s own file, or a temporary one
 *       obtained through a fresh {@link TemporaryResources}.</li>
 * </ol>
 *
 * @param is           the content to digest
 * @param m            metadata passed to the digest helpers
 * @param parseContext not used by this method
 * @throws IOException on read failure, or wrapping a {@link TikaException}
 *                     raised while disposing temporary resources
 */
@Override
public void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException {
    TikaInputStream tis = TikaInputStream.cast(is);
    if (tis != null && tis.hasFile()) {
        // NOTE(review): the inner hasFile() check repeats the enclosing
        // guard; kept so the call sequence matches the existing behavior.
        long fileLength = tis.hasFile() ? tis.getLength() : -1;
        //just digest the underlying file.
        if (fileLength > markLimit) {
            digestFile(tis.getFile(), m);
            return;
        }
    }

    //try the usual mark/reset stuff.
    //however, if you actually hit the bound,
    //then stop and spool to file via TikaInputStream
    SimpleBoundedInputStream bounded = new SimpleBoundedInputStream(markLimit, is);
    boolean finishedStream = false;
    for (DigestAlgorithm algorithm : algorithms) {
        bounded.mark(markLimit + 1);
        finishedStream = digestEach(algorithm, bounded, m);
        bounded.reset();
        if (!finishedStream) {
            break;
        }
    }
    if (finishedStream) {
        return;
    }

    //spool to File and digest that.
    if (tis != null) {
        digestFile(tis.getFile(), m);
        return;
    }
    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream spooled = TikaInputStream.get(is, tmp);
        digestFile(spooled.getFile(), m);
    } finally {
        try {
            tmp.dispose();
        } catch (TikaException e) {
            throw new IOExceptionWithCause(e);
        }
    }
}
Also used : IOExceptionWithCause(org.apache.tika.io.IOExceptionWithCause) TikaException(org.apache.tika.exception.TikaException) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream)

Aggregations

TikaInputStream (org.apache.tika.io.TikaInputStream)100 Metadata (org.apache.tika.metadata.Metadata)40 TemporaryResources (org.apache.tika.io.TemporaryResources)28 IOException (java.io.IOException)27 TikaException (org.apache.tika.exception.TikaException)24 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)23 Test (org.junit.Test)20 InputStream (java.io.InputStream)19 File (java.io.File)15 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)15 ContentHandler (org.xml.sax.ContentHandler)14 TikaTest (org.apache.tika.TikaTest)13 MediaType (org.apache.tika.mime.MediaType)13 SAXException (org.xml.sax.SAXException)13 ParseContext (org.apache.tika.parser.ParseContext)12 ParserContainerExtractor (org.apache.tika.extractor.ParserContainerExtractor)8 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)6 NPOIFSFileSystem (org.apache.poi.poifs.filesystem.NPOIFSFileSystem)6 EncryptedDocumentException (org.apache.tika.exception.EncryptedDocumentException)6 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)6