Search in sources :

Example 26 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class JackcessParser method parse.

/**
 * Opens the given stream as a database through {@link DatabaseBuilder} and
 * hands it to a {@link JackcessExtractor}, which writes to the XHTML handler.
 * <p>
 * If the {@link ParseContext} supplies a {@link PasswordProvider}, the password
 * it returns for {@code metadata} is passed to the {@link CryptCodecProvider}.
 *
 * @param stream   the document to parse; it is not closed by this method
 * @param handler  receives the XHTML SAX events
 * @param metadata document metadata, passed to the password provider and the extractor
 * @param context  parse context, consulted for a {@link PasswordProvider}
 * @throws EncryptedDocumentException if opening the database fails with an
 *                                    "Incorrect password" {@link IllegalStateException}
 * @throws IOException   on I/O failure while reading the database
 * @throws SAXException  if the content handler fails
 * @throws TikaException on other parse failures, or if temporary resources
 *                       cannot be disposed
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // When the caller did not pass a TikaInputStream, tis.getFile() below may have to
    // back the stream with temporary resources. Track them here so they can be released
    // once the database is closed; a caller-supplied TikaInputStream manages its own.
    org.apache.tika.io.TemporaryResources tmp = TikaInputStream.isTikaInputStream(stream) ? null : new org.apache.tika.io.TemporaryResources();
    TikaInputStream tis = TikaInputStream.get(stream, tmp);
    Database db = null;
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    String password = null;
    PasswordProvider passwordProvider = context.get(PasswordProvider.class);
    if (passwordProvider != null) {
        password = passwordProvider.getPassword(metadata);
    }
    try {
        if (password == null) {
            //do this to ensure encryption/wrong password exception vs. more generic
            //"need right codec" error message.
            db = new DatabaseBuilder(tis.getFile()).setCodecProvider(new CryptCodecProvider()).setReadOnly(true).open();
        } else {
            db = new DatabaseBuilder(tis.getFile()).setCodecProvider(new CryptCodecProvider(password)).setReadOnly(true).open();
        }
        //just in case
        db.setLinkResolver(IGNORE_LINK_RESOLVER);
        JackcessExtractor ex = new JackcessExtractor(metadata, context, locale);
        ex.parse(db, xhtml);
    } catch (IllegalStateException e) {
        // Translate the wrong-password failure into Tika's dedicated exception type;
        // any other IllegalStateException is rethrown untouched.
        if (e.getMessage() != null && e.getMessage().contains("Incorrect password")) {
            throw new EncryptedDocumentException(e);
        }
        throw e;
    } finally {
        try {
            if (db != null) {
                try {
                    db.close();
                } catch (IOException ignored) {
                    //swallow = silent close
                }
            }
        } finally {
            // Release temporary resources only after the database is closed.
            if (tmp != null) {
                tmp.dispose();
            }
        }
    }
    xhtml.endDocument();
}
Also used : DatabaseBuilder(com.healthmarketscience.jackcess.DatabaseBuilder) CryptCodecProvider(com.healthmarketscience.jackcess.CryptCodecProvider) EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) Database(com.healthmarketscience.jackcess.Database) TikaInputStream(org.apache.tika.io.TikaInputStream) IOException(java.io.IOException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) PasswordProvider(org.apache.tika.parser.PasswordProvider)

Example 27 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class AbstractOOXMLExtractor method handleEmbeddedOLE.

/**
 * Handles an embedded OLE object in the document.
 * <p>
 * The part is opened as a {@link POIFSFileSystem}. Depending on what the root
 * directory contains, the embedded payload is taken from the "CONTENTS" entry
 * (OLE 2.0), from an {@link Ole10Native} record (OLE 1.0), or the part is
 * passed on unchanged to {@link #handleEmbeddedFile}. Parts whose reported
 * size is below three 512-byte blocks are skipped.
 *
 * @param part           the package part holding the OLE object
 * @param handler        receives the SAX events of the embedded document
 * @param rel            relationship id, recorded as {@code Metadata.EMBEDDED_RELATIONSHIP_ID}
 * @param parentMetadata metadata of the containing document; embedded-stream
 *                       exceptions are recorded against it
 * @throws IOException  if closing the file system or the embedded stream fails
 * @throws SAXException if the content handler fails
 */
private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel, Metadata parentMetadata) throws IOException, SAXException {
    // A POIFSFileSystem needs to be at least 3 blocks big to be valid
    if (part.getSize() >= 0 && part.getSize() < 512 * 3) {
        // Too small, skip
        return;
    }
    // Open the POIFS (OLE2) structure and process.
    // Only one stream is obtained from the part, and it goes straight to the
    // POIFSFileSystem constructor; no second, unused stream is left open.
    POIFSFileSystem fs = null;
    try {
        fs = new POIFSFileSystem(part.getInputStream());
    } catch (Exception e) {
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
        return;
    }
    TikaInputStream stream = null;
    try {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);
        DirectoryNode root = fs.getRoot();
        POIFSDocumentType type = POIFSDocumentType.detectType(root);
        if (root.hasEntry("CONTENTS") && root.hasEntry("Ole") && root.hasEntry("CompObj")) {
            // TIKA-704: OLE 2.0 embedded non-Office document?
            //TODO: figure out if the equivalent of OLE 1.0's
            //getCommand() and getFileName() exist for OLE 2.0 to populate
            //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
            stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS"));
            if (embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else if (POIFSDocumentType.OLE10_NATIVE == type) {
            // TIKA-704: OLE 1.0 embedded document
            Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs);
            if (ole.getLabel() != null) {
                metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
            }
            if (ole.getCommand() != null) {
                metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
            }
            if (ole.getFileName() != null) {
                metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
            }
            byte[] data = ole.getDataBuffer();
            if (data != null) {
                stream = TikaInputStream.get(data);
            }
            if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else {
            handleEmbeddedFile(part, handler, rel);
        }
    } catch (FileNotFoundException e) {
    // There was no CONTENTS entry, so skip this part
    } catch (Ole10NativeException e) {
    // Could not process an OLE 1.0 entry, so skip this part
    } catch (IOException e) {
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
    } finally {
        // Nested so the embedded stream is still closed when fs.close() throws.
        try {
            if (fs != null) {
                fs.close();
            }
        } finally {
            if (stream != null) {
                stream.close();
            }
        }
    }
}
Also used : Ole10NativeException(org.apache.poi.poifs.filesystem.Ole10NativeException) Ole10Native(org.apache.poi.poifs.filesystem.Ole10Native) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) POIFSFileSystem(org.apache.poi.poifs.filesystem.POIFSFileSystem) NPOIFSFileSystem(org.apache.poi.poifs.filesystem.NPOIFSFileSystem) Metadata(org.apache.tika.metadata.Metadata) FileNotFoundException(java.io.FileNotFoundException) TikaInputStream(org.apache.tika.io.TikaInputStream) DirectoryNode(org.apache.poi.poifs.filesystem.DirectoryNode) POIFSDocumentType(org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) IOException(java.io.IOException) Ole10NativeException(org.apache.poi.poifs.filesystem.Ole10NativeException) TikaException(org.apache.tika.exception.TikaException) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) XmlException(org.apache.xmlbeans.XmlException) SAXException(org.xml.sax.SAXException)

Example 28 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class AbstractOOXMLExtractor method handleEmbeddedFile.

/**
 * Handles an embedded file in the document.
 * <p>
 * Builds metadata for the part (relationship id, resource name, content type)
 * and, if the embedded extractor accepts it, streams the part to the extractor.
 *
 * @param part    the package part holding the embedded file
 * @param handler receives the SAX events of the embedded document
 * @param rel     relationship id, recorded as {@code Metadata.EMBEDDED_RELATIONSHIP_ID}
 * @throws SAXException if the content handler fails
 * @throws IOException  if the part's stream cannot be read or closed
 */
protected void handleEmbeddedFile(PackagePart part, ContentHandler handler, String rel) throws SAXException, IOException {
    Metadata embeddedMetadata = new Metadata();
    embeddedMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);

    // The resource name is whatever follows the last '/' of the part name
    String partName = part.getPartName().getName();
    int lastSlash = partName.lastIndexOf('/');
    String resourceName = partName.substring(lastSlash + 1);
    embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, resourceName);

    embeddedMetadata.set(Metadata.CONTENT_TYPE, part.getContentType());

    // Nothing to do (and no stream to open) when the extractor declines the part
    if (!embeddedExtractor.shouldParseEmbedded(embeddedMetadata)) {
        return;
    }

    // Call the recursing handler
    try (TikaInputStream embedded = TikaInputStream.get(part.getInputStream())) {
        embeddedExtractor.parseEmbedded(embedded, new EmbeddedContentHandler(handler), embeddedMetadata, false);
    }
}
Also used : Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler)

Example 29 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class NetCDFParser method parse.

/**
 * Opens the stream as a {@link NetcdfFile}, copies its global attributes into
 * {@code metadata} and writes its dimensions and variables as XHTML lists.
 * <p>
 * String attributes are added verbatim; numeric attributes are added as the
 * string form of their {@code int} value (so any fractional part is dropped).
 *
 * @param stream   the document to parse
 * @param handler  receives the XHTML SAX events
 * @param metadata receives "File-Type-Description" and the global attributes
 * @param context  parse context (not consulted by this method)
 * @throws IOException   if closing the NetCDF file fails
 * @throws SAXException  if the content handler fails
 * @throws TikaException wrapping any {@link IOException} raised while reading
 *                       the file, or if temporary resources cannot be disposed
 * @see org.apache.tika.parser.Parser#parse(java.io.InputStream,
 * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
 * org.apache.tika.parser.ParseContext)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Track temporary resources only when the caller did not already hand us a TikaInputStream.
    TemporaryResources tmp = TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources();
    TikaInputStream tis = TikaInputStream.get(stream, tmp);
    NetcdfFile ncFile = null;
    try {
        ncFile = NetcdfFile.open(tis.getFile().getAbsolutePath());
        metadata.set("File-Type-Description", ncFile.getFileTypeDescription());
        // first parse out the set of global attributes
        for (Attribute attr : ncFile.getGlobalAttributes()) {
            Property property = resolveMetadataKey(attr.getFullName());
            if (attr.getDataType().isString()) {
                metadata.add(property, attr.getStringValue());
            } else if (attr.getDataType().isNumeric()) {
                int value = attr.getNumericValue().intValue();
                metadata.add(property, String.valueOf(value));
            }
        }
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.newline();
        xhtml.element("h1", "dimensions");
        xhtml.startElement("ul");
        xhtml.newline();
        for (Dimension dim : ncFile.getDimensions()) {
            xhtml.element("li", dim.getFullName() + " = " + dim.getLength());
        }
        xhtml.endElement("ul");
        xhtml.element("h1", "variables");
        xhtml.startElement("ul");
        xhtml.newline();
        for (Variable var : ncFile.getVariables()) {
            xhtml.startElement("li");
            xhtml.characters(var.getDataType() + " " + var.getNameAndDimensions());
            xhtml.newline();
            List<Attribute> attributes = var.getAttributes();
            if (!attributes.isEmpty()) {
                // nested list holding this variable's attributes
                xhtml.startElement("ul");
                for (Attribute element : attributes) {
                    xhtml.element("li", element.toString());
                }
                xhtml.endElement("ul");
            }
            xhtml.endElement("li");
        }
        xhtml.endElement("ul");
        xhtml.endDocument();
    } catch (IOException e) {
        throw new TikaException("NetCDF parse error", e);
    } finally {
        // Nested so temporary resources are still disposed when ncFile.close() throws.
        try {
            if (ncFile != null) {
                ncFile.close();
            }
        } finally {
            if (tmp != null) {
                tmp.dispose();
            }
        }
    }
}
Also used : NetcdfFile(ucar.nc2.NetcdfFile) Variable(ucar.nc2.Variable) TikaException(org.apache.tika.exception.TikaException) Attribute(ucar.nc2.Attribute) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) Dimension(ucar.nc2.Dimension) IOException(java.io.IOException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) Property(org.apache.tika.metadata.Property)

Example 30 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class TesseractOCRParser method parse.

/**
 * Runs OCR on the file behind {@code tikaInputStream} and feeds the recognised
 * text to the XHTML handler.
 * <p>
 * Nothing is done unless the stream length lies within the configured minimum
 * and maximum size. When image processing is enabled and ImageMagick is
 * available, OCR runs on a preprocessed temporary copy of the input; otherwise
 * it runs on the input file directly. The OCR output file is deleted before
 * the method returns.
 *
 * @param tikaInputStream  source image
 * @param tmpOCROutputFile base path for the OCR output; the output type
 *                         (lower-cased) is appended as a file extension
 * @param parseContext     passed on to the hOCR extraction
 * @param xhtml            receives the extracted content
 * @param config           OCR configuration
 * @throws IOException   on I/O failure
 * @throws SAXException  if the content handler fails
 * @throws TikaException on OCR or temporary-resource failure
 */
private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile, ParseContext parseContext, XHTMLContentHandler xhtml, TesseractOCRConfig config) throws IOException, SAXException, TikaException {
    File ocrResult = null;
    try {
        File source = tikaInputStream.getFile();
        long length = tikaInputStream.getLength();
        // Outside the configured size window: skip OCR altogether
        if (length < config.getMinFileSizeToOcr() || length > config.getMaxFileSizeToOcr()) {
            return;
        }

        // Process image if ImageMagick Tool is present
        boolean preprocess = config.isEnableImageProcessing() == 1 && hasImageMagick(config);
        if (!preprocess) {
            doOCR(source, tmpOCROutputFile, config);
        } else {
            // copy the contents of the original input file into a temporary file
            // which will be preprocessed for OCR
            TemporaryResources scratch = new TemporaryResources();
            try {
                File workingCopy = scratch.createTemporaryFile();
                FileUtils.copyFile(source, workingCopy);
                processImage(workingCopy, config);
                doOCR(workingCopy, tmpOCROutputFile, config);
            } finally {
                scratch.dispose();
            }
        }

        // Tesseract appends the output type (.txt or .hocr) to output file name
        String basePath = tmpOCROutputFile.getAbsolutePath();
        String extension = config.getOutputType().toString().toLowerCase(Locale.US);
        ocrResult = new File(basePath + "." + extension);
        if (!ocrResult.exists()) {
            return;
        }

        try (InputStream ocrText = new FileInputStream(ocrResult)) {
            boolean hocr = config.getOutputType().equals(TesseractOCRConfig.OUTPUT_TYPE.HOCR);
            if (hocr) {
                extractHOCROutput(ocrText, parseContext, xhtml);
            } else {
                extractOutput(ocrText, xhtml);
            }
        }
    } finally {
        // Remove the OCR output once it has been consumed (or if anything failed)
        if (ocrResult != null) {
            ocrResult.delete();
        }
    }
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) TemporaryResources(org.apache.tika.io.TemporaryResources) File(java.io.File) FileInputStream(java.io.FileInputStream)

Aggregations

TikaInputStream (org.apache.tika.io.TikaInputStream)100 Metadata (org.apache.tika.metadata.Metadata)40 TemporaryResources (org.apache.tika.io.TemporaryResources)28 IOException (java.io.IOException)27 TikaException (org.apache.tika.exception.TikaException)24 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)23 Test (org.junit.Test)20 InputStream (java.io.InputStream)19 File (java.io.File)15 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)15 ContentHandler (org.xml.sax.ContentHandler)14 TikaTest (org.apache.tika.TikaTest)13 MediaType (org.apache.tika.mime.MediaType)13 SAXException (org.xml.sax.SAXException)13 ParseContext (org.apache.tika.parser.ParseContext)12 ParserContainerExtractor (org.apache.tika.extractor.ParserContainerExtractor)8 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)6 NPOIFSFileSystem (org.apache.poi.poifs.filesystem.NPOIFSFileSystem)6 EncryptedDocumentException (org.apache.tika.exception.EncryptedDocumentException)6 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)6