Search in sources :

Example 16 with EmbeddedContentHandler

use of org.apache.tika.sax.EmbeddedContentHandler in project tika by apache.

the class RTFEmbObjHandler method extractObj.

/**
 * Passes the raw bytes of an embedded object to the embedded-document parser.
 *
 * <p>The byte count is recorded in {@code metadata} as the content length. If no
 * resource name is set yet, one is synthesised: {@code thumbnail_<n><ext>} (and the
 * thumbnail flag is set) when currently inside an object in the PICT state,
 * otherwise {@code file_<n><ext>}.
 *
 * <p>An {@link IOException} raised while parsing the embedded stream is handed to
 * {@code EmbeddedDocumentUtil.recordEmbeddedStreamException} rather than propagated.
 *
 * @param bytes    content of the embedded object; {@code null} makes this a no-op
 * @param handler  handler that receives the embedded document's SAX events
 * @param metadata metadata describing the embedded object; updated in place
 * @throws SAXException  if the embedded parse reports a SAX failure
 * @throws IOException   if closing the wrapped stream fails
 * @throws TikaException if the embedded parse fails
 */
private void extractObj(byte[] bytes, ContentHandler handler, Metadata metadata) throws SAXException, IOException, TikaException {
    if (bytes == null) {
        return;
    }
    metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(bytes.length));
    if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
        return;
    }
    TikaInputStream stream = TikaInputStream.get(bytes);
    // The try/finally starts immediately after the stream is created so that a
    // failure while deriving the resource name cannot leave the stream unclosed.
    try {
        if (metadata.get(Metadata.RESOURCE_NAME_KEY) == null) {
            String extension = embeddedDocumentUtil.getExtension(stream, metadata);
            if (inObject && state == EMB_STATE.PICT) {
                metadata.set(Metadata.RESOURCE_NAME_KEY, "thumbnail_" + thumbCount++ + extension);
                metadata.set(RTFMetadata.THUMBNAIL, "true");
            } else {
                metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + extension);
            }
        }
        try {
            embeddedDocumentUtil.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
        } catch (IOException e) {
            // A broken embedded stream must not abort the container document.
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        }
    } finally {
        stream.close();
    }
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) IOException(java.io.IOException)

Example 17 with EmbeddedContentHandler

use of org.apache.tika.sax.EmbeddedContentHandler in project tika by apache.

the class DIFParser method parse.

/**
 * Parses the incoming stream as XML and emits the result wrapped in a single
 * XHTML {@code <p>} element.
 *
 * <p>The XHTML document and the paragraph are opened before the SAX parse and are
 * always closed again in the {@code finally} clause, whether or not parsing succeeds.
 *
 * @param stream   document to parse; shielded so the SAX parser cannot close it
 * @param handler  downstream handler receiving the output events
 * @param metadata metadata for the document being parsed
 * @param context  parse context that supplies the SAX parser
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if emitting events to the handler fails
 * @throws TikaException if the XML itself cannot be parsed
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final XHTMLContentHandler xhtmlOut = new XHTMLContentHandler(handler, metadata);
    xhtmlOut.startDocument();
    xhtmlOut.startElement("p");
    // Wrap the downstream handler so that, on a SAXException, throwIfCauseOf can be
    // consulted before the failure is reported as an XML parse error.
    final TaggedContentHandler taggedHandler = new TaggedContentHandler(handler);
    try {
        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                getContentHandler(taggedHandler, metadata, context))));
    } catch (SAXException saxFailure) {
        taggedHandler.throwIfCauseOf(saxFailure);
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        xhtmlOut.endElement("p");
        xhtmlOut.endDocument();
    }
}
Also used : OfflineContentHandler(org.apache.tika.sax.OfflineContentHandler) TikaException(org.apache.tika.exception.TikaException) TaggedContentHandler(org.apache.tika.sax.TaggedContentHandler) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) SAXException(org.xml.sax.SAXException)

Example 18 with EmbeddedContentHandler

use of org.apache.tika.sax.EmbeddedContentHandler in project tika by apache.

the class OfficeParser method parse.

/**
 * Extracts metadata and text from an OLE2 document rooted at {@code root}.
 *
 * <p>Summary information is parsed first so that metadata is available early. The
 * document type is then detected and, unless unknown, recorded in {@code metadata};
 * the matching extractor writes the content to {@code xhtml}. Encrypted documents
 * are decrypted with the password from a {@link PasswordProvider} in the context
 * (falling back to the default Office password) and delegated to the OOXML parser.
 *
 * @param root     root directory node of the OLE2 file system
 * @param context  parse context; may carry a {@code Locale} and a {@code PasswordProvider}
 * @param metadata metadata to populate
 * @param xhtml    handler receiving the extracted content
 * @throws IOException   if reading the document fails
 * @throws SAXException  if emitting content fails
 * @throws TikaException if extraction fails; an {@code EncryptedDocumentException}
 *                       is thrown when the password is wrong or decryption fails
 */
protected void parse(DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    // Parse summary entries first, to make metadata available early
    new SummaryExtractor(metadata).parseSummaries(root);
    // Parse remaining document entries
    POIFSDocumentType type = POIFSDocumentType.detectType(root);
    if (type != POIFSDocumentType.UNKNOWN) {
        setType(metadata, type.getType());
    }
    switch (type) {
        case SOLIDWORKS_PART:
        case SOLIDWORKS_ASSEMBLY:
        case SOLIDWORKS_DRAWING:
            // No content extraction for SolidWorks files
            break;
        case PUBLISHER:
            PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(root);
            xhtml.element("p", publisherTextExtractor.getText());
            break;
        case WORDDOCUMENT:
            new WordExtractor(context, metadata).parse(root, xhtml);
            break;
        case POWERPOINT:
            new HSLFExtractor(context, metadata).parse(root, xhtml);
            break;
        case WORKBOOK:
        case XLR:
            Locale locale = context.get(Locale.class, Locale.getDefault());
            new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
            break;
        case PROJECT:
            // We currently can't do anything beyond the metadata
            break;
        case VISIO:
            VisioTextExtractor visioTextExtractor = new VisioTextExtractor(root);
            for (String text : visioTextExtractor.getAllText()) {
                xhtml.element("p", text);
            }
            break;
        case OUTLOOK:
            OutlookExtractor extractor = new OutlookExtractor(root, context);
            extractor.parse(xhtml, metadata);
            break;
        case ENCRYPTED:
            EncryptionInfo info = new EncryptionInfo(root);
            Decryptor d = Decryptor.getInstance(info);
            try {
                // By default, use the default Office Password
                String password = Decryptor.DEFAULT_PASSWORD;
                // If they supplied a Password Provider, ask that for the password,
                //  and use the provider given one if available (stick with default if not)
                PasswordProvider passwordProvider = context.get(PasswordProvider.class);
                if (passwordProvider != null) {
                    String suppliedPassword = passwordProvider.getPassword(metadata);
                    if (suppliedPassword != null) {
                        password = suppliedPassword;
                    }
                }
                // Check if we've the right password or not
                if (!d.verifyPassword(password)) {
                    throw new EncryptedDocumentException();
                }
                // Decrypt the OLE2 stream, and delegate the resulting OOXML
                //  file to the regular OOXML parser for normal handling
                OOXMLParser parser = new OOXMLParser();
                // The decrypted stream is opened here, so it is closed here as well
                // once the delegate parser has finished (or failed).
                try (java.io.InputStream decrypted = d.getDataStream(root)) {
                    parser.parse(decrypted, new EmbeddedContentHandler(new BodyContentHandler(xhtml)), metadata, context);
                }
            } catch (GeneralSecurityException ex) {
                throw new EncryptedDocumentException(ex);
            }
            // Explicit break: do not rely on falling through into the default case
            break;
        default:
            // For unsupported / unhandled types, just the metadata
            //  is extracted, which happened above
            break;
    }
}
Also used : Locale(java.util.Locale) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Decryptor(org.apache.poi.poifs.crypt.Decryptor) EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) EncryptionInfo(org.apache.poi.poifs.crypt.EncryptionInfo) GeneralSecurityException(java.security.GeneralSecurityException) PublisherTextExtractor(org.apache.poi.hpbf.extractor.PublisherTextExtractor) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) PasswordProvider(org.apache.tika.parser.PasswordProvider) OOXMLParser(org.apache.tika.parser.microsoft.ooxml.OOXMLParser) VisioTextExtractor(org.apache.poi.hdgf.extractor.VisioTextExtractor)

Example 19 with EmbeddedContentHandler

use of org.apache.tika.sax.EmbeddedContentHandler in project tika by apache.

the class AbstractOOXMLExtractor method handleThumbnail.

/**
 * Emits every thumbnail part of the package as an embedded resource.
 *
 * <p>For each thumbnail relationship an empty {@code <div class="embedded" id="...">}
 * marker is written to {@code handler}, and the part is then offered to the embedded
 * document extractor. Thumbnail handling is best effort: any failure stops the
 * processing of further thumbnails but is never propagated to the caller.
 *
 * @param handler handler receiving the marker elements and embedded content
 */
private void handleThumbnail(ContentHandler handler) {
    try {
        OPCPackage opcPackage = extractor.getPackage();
        for (PackageRelationship rel : opcPackage.getRelationshipsByType(PackageRelationshipTypes.THUMBNAIL)) {
            PackagePart tPart = opcPackage.getPart(rel);
            // try-with-resources: the part stream is closed even when emitting SAX
            // events or parsing the embedded thumbnail throws.
            try (InputStream tStream = tPart.getInputStream()) {
                Metadata thumbnailMetadata = new Metadata();
                String thumbName = tPart.getPartName().getName();
                thumbnailMetadata.set(Metadata.RESOURCE_NAME_KEY, thumbName);
                AttributesImpl attributes = new AttributesImpl();
                attributes.addAttribute(XHTML, "class", "class", "CDATA", "embedded");
                attributes.addAttribute(XHTML, "id", "id", "CDATA", thumbName);
                handler.startElement(XHTML, "div", "div", attributes);
                handler.endElement(XHTML, "div", "div");
                thumbnailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, thumbName);
                thumbnailMetadata.set(Metadata.CONTENT_TYPE, tPart.getContentType());
                thumbnailMetadata.set(TikaCoreProperties.TITLE, thumbName);
                if (embeddedExtractor.shouldParseEmbedded(thumbnailMetadata)) {
                    embeddedExtractor.parseEmbedded(TikaInputStream.get(tStream), new EmbeddedContentHandler(handler), thumbnailMetadata, false);
                }
            }
        }
    } catch (Exception ignored) {
        // Deliberately swallowed: a missing or unreadable thumbnail must not fail
        // extraction of the main document.
    }
}
Also used : PackageRelationship(org.apache.poi.openxml4j.opc.PackageRelationship) AttributesImpl(org.xml.sax.helpers.AttributesImpl) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) PackagePart(org.apache.poi.openxml4j.opc.PackagePart) OPCPackage(org.apache.poi.openxml4j.opc.OPCPackage) Ole10NativeException(org.apache.poi.poifs.filesystem.Ole10NativeException) TikaException(org.apache.tika.exception.TikaException) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) XmlException(org.apache.xmlbeans.XmlException) SAXException(org.xml.sax.SAXException)

Example 20 with EmbeddedContentHandler

use of org.apache.tika.sax.EmbeddedContentHandler in project tika by apache.

the class EMFParser method handleWMF.

/**
 * Offers the WMF payload carried by an EMF comment record to the embedded
 * document extractor.
 *
 * <p>Fresh metadata tagged with the WMF media type is created for the payload.
 * Nothing further happens if the extractor declines to parse it.
 *
 * @param comment                   EMF comment record holding the WMF data
 * @param contentHandler            handler that receives the embedded content
 * @param embeddedDocumentExtractor extractor deciding on and performing the parse
 * @throws IOException   if reading or closing the WMF stream fails
 * @throws SAXException  if emitting the embedded content fails
 * @throws TikaException if parsing the embedded WMF fails
 */
private void handleWMF(HemfCommentPublic.WindowsMetafile comment, ContentHandler contentHandler, EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, SAXException, TikaException {
    final Metadata wmfMetadata = new Metadata();
    wmfMetadata.set(Metadata.CONTENT_TYPE, WMF_MEDIA_TYPE.toString());
    if (!embeddedDocumentExtractor.shouldParseEmbedded(wmfMetadata)) {
        return;
    }
    try (InputStream wmfStream = TikaInputStream.get(comment.getWmfInputStream())) {
        final ContentHandler embeddedHandler = new EmbeddedContentHandler(contentHandler);
        embeddedDocumentExtractor.parseEmbedded(wmfStream, embeddedHandler, wmfMetadata, false);
    }
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler)

Aggregations

EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler)20 TikaException (org.apache.tika.exception.TikaException)10 Metadata (org.apache.tika.metadata.Metadata)10 InputStream (java.io.InputStream)8 TikaInputStream (org.apache.tika.io.TikaInputStream)8 IOException (java.io.IOException)7 SAXException (org.xml.sax.SAXException)7 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)6 OfflineContentHandler (org.apache.tika.sax.OfflineContentHandler)6 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)6 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)5 AttributesImpl (org.xml.sax.helpers.AttributesImpl)4 ByteArrayInputStream (java.io.ByteArrayInputStream)3 InvalidFormatException (org.apache.poi.openxml4j.exceptions.InvalidFormatException)3 FileNotFoundException (java.io.FileNotFoundException)2 PackagePart (org.apache.poi.openxml4j.opc.PackagePart)2 PackageRelationship (org.apache.poi.openxml4j.opc.PackageRelationship)2 TaggedContentHandler (org.apache.tika.sax.TaggedContentHandler)2 ContentHandler (org.xml.sax.ContentHandler)2 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1