Search in sources :

Example 31 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class MatParser method parse.

/**
 * Parses a Matlab {@code .mat} file: records header fields in {@code metadata} and writes one
 * paragraph per variable (plus nested lists for structures) to {@code handler} as XHTML.
 *
 * <p>The metadata keys written are {@code createdOn}, {@code platform}, {@code fileType} and
 * {@code endian}. The first three are only set when the corresponding comma-separated segment
 * of the header description is present and contains the expected marker text.
 *
 * @param stream   the {@code .mat} content; spooled to a file via {@link TikaInputStream}
 * @param handler  receives the XHTML SAX events
 * @param metadata receives the content type and the header fields
 * @param context  not used by this method
 * @throws IOException   declared by the signature; an {@code IOException} raised inside the
 *                       try block is rethrown as a {@code TikaException}
 * @throws SAXException  if the content handler fails
 * @throws TikaException if reading the file fails with an {@code IOException}
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    //Set MIME type as Matlab
    metadata.set(Metadata.CONTENT_TYPE, MATLAB_MIME_TYPE);
    // Only own a TemporaryResources when the caller did not already pass a TikaInputStream.
    TemporaryResources tmp = TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources();
    try {
        // Use TIS so we can spool a temp file for parsing.
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        //input .mat file
        MatFileReader mfr = new MatFileReader(tis.getFile());
        //.mat header information
        MatFileHeader hdr = mfr.getMatFileHeader();
        // Example header: "MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Sun Mar  2 23:41:57 2014"
        // Break header information into its parts. A header may contain fewer than three
        // comma-separated segments, so every index is guarded by a length check.
        String[] parts = hdr.getDescription().split(",");
        if (parts.length > 2 && parts[2].contains("Created")) {
            String marker = "Created on:";
            int markerStart = parts[2].lastIndexOf(marker);
            String dateCreated = parts[2].substring(markerStart + marker.length()).trim();
            metadata.set("createdOn", dateCreated);
        }
        if (parts.length > 1 && parts[1].contains("Platform")) {
            String marker = "Platform:";
            int markerStart = parts[1].lastIndexOf(marker);
            String platform = parts[1].substring(markerStart + marker.length()).trim();
            metadata.set("platform", platform);
        }
        // split() always yields at least one element, so parts[0] is safe.
        if (parts[0].contains("MATLAB")) {
            metadata.set("fileType", parts[0]);
        }
        // Get endian indicator from header file: decode the indicator bytes as text
        String endianCode = new String(hdr.getEndianIndicator(), UTF_8);
        metadata.set("endian", endianCode);
        //Text output
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.newline();
        //Loop through each variable
        for (Map.Entry<String, MLArray> entry : mfr.getContent().entrySet()) {
            String varName = entry.getKey();
            MLArray varData = entry.getValue();
            xhtml.element("p", varName + ":" + String.valueOf(varData));
            // If the variable is a structure, extract variable info from structure
            if (varData.isStruct()) {
                MLStructure mlStructure = (MLStructure) mfr.getMLArray(varName);
                xhtml.startElement("ul");
                xhtml.newline();
                for (MLArray element : mlStructure.getAllFields()) {
                    xhtml.startElement("li");
                    xhtml.characters(String.valueOf(element));
                    // If there is an embedded structure, extract variable info.
                    // Only one level of nesting is expanded here.
                    if (element.isStruct()) {
                        xhtml.startElement("ul");
                        // Should this actually be a recursive call?
                        xhtml.element("li", element.contentToString());
                        xhtml.endElement("ul");
                    }
                    xhtml.endElement("li");
                }
                xhtml.endElement("ul");
            }
        }
        xhtml.endDocument();
    } catch (IOException e) {
        throw new TikaException("Error parsing Matlab file with MatParser", e);
    } finally {
        // Dispose the temporary resources created above, if any.
        if (tmp != null) {
            tmp.dispose();
        }
    }
}
Also used : MatFileReader(com.jmatio.io.MatFileReader) MLArray(com.jmatio.types.MLArray) TikaException(org.apache.tika.exception.TikaException) MatFileHeader(com.jmatio.io.MatFileHeader) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) IOException(java.io.IOException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) MLStructure(com.jmatio.types.MLStructure) Map(java.util.Map)

Example 32 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class OutlookExtractor method parse.

/**
 * Extracts the Outlook message held in the {@code msg} field: copies message properties and
 * headers into {@code metadata}, then writes the subject, the from/to details, the message
 * body and one {@code div} per attachment to {@code xhtml}.
 *
 * @param xhtml    receives the XHTML SAX events for the message
 * @param metadata receives the message properties (class, title, recipients, raw headers, dates)
 * @throws TikaException if a {@code ChunkNotFoundException} escapes the inner handlers
 * @throws SAXException  if writing to {@code xhtml} fails
 * @throws IOException   declared by the signature; presumably raised by the nested parsers or
 *                       the embedded-resource helpers -- confirm
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException {
    try {
        // Presumably makes POI return null instead of throwing for absent chunks (the
        // TikaException message at the end of this method relies on that) -- confirm against POI.
        msg.setReturnNullOnMissingChunk(true);
        try {
            metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass()));
        } catch (ChunkNotFoundException e) {
            // Deliberately ignored: the message class is simply not recorded.
        }
        // If the message contains 7-bit encoded strings (i.e. strings not stored
        //  as Unicode), try to sort out an encoding for them
        if (msg.has7BitEncodingStrings()) {
            guess7BitEncoding(msg);
        }
        // Start with the metadata
        String subject = msg.getSubject();
        Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
        String from = msg.getDisplayFrom();
        handleFromTo(headers, metadata);
        metadata.set(TikaCoreProperties.TITLE, subject);
        // TODO: Move to description in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());
        try {
            // Record every non-null recipient address
            for (String recipientAddress : msg.getRecipientEmailAddressList()) {
                if (recipientAddress != null)
                    metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
            }
        } catch (ChunkNotFoundException he) {
            // Deliberately ignored: recipient addresses are best-effort.
        }
        // Copy every header value into the metadata under the raw-header prefix.
        // NOTE(review): headers is dereferenced here without a null check, yet it is
        // null-checked further down -- confirm whether normalizeHeaders can return null.
        for (Map.Entry<String, String[]> e : headers.entrySet()) {
            String headerKey = e.getKey();
            for (String headerValue : e.getValue()) {
                metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue);
            }
        }
        // First try via the proper chunk
        if (msg.getMessageDate() != null) {
            metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
            metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
        } else {
            // Fall back to a date found in the headers
            if (headers != null && headers.size() > 0) {
                for (Map.Entry<String, String[]> header : headers.entrySet()) {
                    String headerKey = header.getKey();
                    // NOTE(review): both the "date:" test and the extracted date text use the
                    // map key; the header's values are never consulted. Confirm that keys
                    // produced by normalizeHeaders really carry the "date: <value>" text.
                    if (headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
                        String date = headerKey.substring(headerKey.indexOf(':') + 1).trim();
                        // See if we can parse it as a normal mail date
                        try {
                            Date d = MboxParser.parseDate(date);
                            metadata.set(TikaCoreProperties.CREATED, d);
                            metadata.set(TikaCoreProperties.MODIFIED, d);
                        } catch (ParseException e) {
                            // Store it as-is, and hope for the best...
                            metadata.set(TikaCoreProperties.CREATED, date);
                            metadata.set(TikaCoreProperties.MODIFIED, date);
                        }
                        // Only the first matching header is used
                        break;
                    }
                }
            }
        }
        xhtml.element("h1", subject);
        // Output the from and to details in text, as you
        //  often want them in text form for searching
        xhtml.startElement("dl");
        if (from != null) {
            header(xhtml, "From", from);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException e) {
            // Deliberately ignored: the "Recipients" entry is omitted.
        }
        xhtml.endElement("dl");
        // Get the message body. Preference order is: html, rtf, text
        Chunk htmlChunk = null;
        Chunk rtfChunk = null;
        Chunk textChunk = null;
        // Locate the candidate body chunks by their MAPI property ids
        for (Chunk chunk : msg.getMainChunks().getChunks()) {
            if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
                htmlChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
                rtfChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.BODY.id) {
                textChunk = chunk;
            }
        }
        // Set once one body representation has been written, so later ones are skipped
        boolean doneBody = false;
        xhtml.startElement("div", "class", "message-body");
        if (htmlChunk != null) {
            // The HTML body may be stored as either a byte chunk or a string chunk
            byte[] data = null;
            if (htmlChunk instanceof ByteChunk) {
                data = ((ByteChunk) htmlChunk).getValue();
            } else if (htmlChunk instanceof StringChunk) {
                data = ((StringChunk) htmlChunk).getRawValue();
            }
            if (data != null) {
                // Use the HtmlParser found via the parse context if there is one, else a fresh one
                Parser htmlParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
                if (htmlParser == null) {
                    htmlParser = new HtmlParser();
                }
                htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
                doneBody = true;
            }
        }
        if (rtfChunk != null && !doneBody) {
            // NOTE(review): unchecked cast -- assumes the RTF chunk is always a ByteChunk.
            ByteChunk chunk = (ByteChunk) rtfChunk;
            MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue());
            // Use the RTFParser found via the parse context if there is one, else a fresh one
            Parser rtfParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
            if (rtfParser == null) {
                rtfParser = new RTFParser();
            }
            rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
            doneBody = true;
        }
        if (textChunk != null && !doneBody) {
            // NOTE(review): unchecked cast -- assumes the plain-text body is always a StringChunk.
            xhtml.element("p", ((StringChunk) textChunk).getValue());
        }
        xhtml.endElement("div");
        // Process the attachments
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            xhtml.startElement("div", "class", "attachment-entry");
            // Prefer the long file name, falling back to the short one
            String filename = null;
            if (attachment.getAttachLongFileName() != null) {
                filename = attachment.getAttachLongFileName().getValue();
            } else if (attachment.getAttachFileName() != null) {
                filename = attachment.getAttachFileName().getValue();
            }
            if (filename != null && filename.length() > 0) {
                xhtml.element("h1", filename);
            }
            if (attachment.getAttachData() != null) {
                // NOTE(review): the TikaInputStream created here is not closed in this method;
                // confirm whether handleEmbeddedResource takes ownership of it.
                handleEmbeddedResource(TikaInputStream.get(attachment.getAttachData().getValue()), filename, null, null, xhtml, true);
            }
            if (attachment.getAttachmentDirectory() != null) {
                handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), xhtml);
            }
            xhtml.endElement("div");
        }
    } catch (ChunkNotFoundException e) {
        throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
    } finally {
    //You'd think you'd want to call msg.close().
    //Don't do that.  That closes down the file system.
    //If an msg has multiple msg attachments, some of them
    //can reside in the same file system.  After the first
    //child is read, the fs is closed, and the other children
    //get a java.nio.channels.ClosedChannelException
    }
}
Also used : ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) MAPIRtfAttribute(org.apache.poi.hmef.attribute.MAPIRtfAttribute) TikaException(org.apache.tika.exception.TikaException) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) Metadata(org.apache.tika.metadata.Metadata) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Chunk(org.apache.poi.hsmf.datatypes.Chunk) Date(java.util.Date) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Parser(org.apache.tika.parser.Parser) HtmlParser(org.apache.tika.parser.html.HtmlParser) RTFParser(org.apache.tika.parser.rtf.RTFParser) MboxParser(org.apache.tika.parser.mbox.MboxParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) RTFParser(org.apache.tika.parser.rtf.RTFParser) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseException(java.text.ParseException) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks)

Example 33 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class HSLFExtractor method handleSlideEmbeddedPictures.

/**
 * Hands every picture stored in the slideshow to {@code handleEmbeddedResource} as an
 * embedded resource. A picture whose data cannot be read is recorded against
 * {@code parentMetadata} and skipped; the remaining pictures are still processed.
 *
 * @param slideshow the slideshow whose picture data is iterated
 * @param xhtml     passed through to {@code handleEmbeddedResource}
 */
private void handleSlideEmbeddedPictures(HSLFSlideShow slideshow, XHTMLContentHandler xhtml) throws TikaException, SAXException, IOException {
    for (HSLFPictureData picture : slideshow.getPictureData()) {
        // Resolve the media type first, before the picture bytes are requested.
        String pictureMediaType = resolvePictureMediaType(picture);

        byte[] pictureBytes;
        try {
            pictureBytes = picture.getData();
        } catch (Exception readFailure) {
            // Unreadable picture: note the failure on the parent and move on.
            EmbeddedDocumentUtil.recordEmbeddedStreamException(readFailure, parentMetadata);
            continue;
        }

        try (TikaInputStream pictureStream = TikaInputStream.get(pictureBytes)) {
            handleEmbeddedResource(pictureStream, null, null, pictureMediaType, xhtml, false);
        }
    }
}

/**
 * Maps a picture to its media type: fixed values for EMF, WMF and DIB pictures, otherwise
 * whatever the picture itself reports as its content type.
 */
private static String resolvePictureMediaType(HSLFPictureData picture) {
    switch(picture.getType()) {
        case EMF:
            return "image/emf";
        case WMF:
            return "image/wmf";
        case DIB:
            return "image/bmp";
        default:
            return picture.getContentType();
    }
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) HSLFPictureData(org.apache.poi.hslf.usermodel.HSLFPictureData) TikaException(org.apache.tika.exception.TikaException) IOException(java.io.IOException) SAXException(org.xml.sax.SAXException)

Example 34 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class SXSLFPowerPointExtractorDecorator method handleSlidePart.

/**
 * Writes one slide: the slide's own XML is parsed into a {@code slide-content} div, after
 * which the related layout, notes, notes-master and comments parts are handled in that order.
 * A {@code TikaException} raised while parsing the slide is recorded in the metadata as a
 * warning and does not stop the related parts from being handled.
 *
 * @param slidePart the package part holding the slide
 * @param xhtml     receives the XHTML SAX events
 */
private void handleSlidePart(PackagePart slidePart, XHTMLContentHandler xhtml) throws IOException, SAXException {
    Map<String, String> linkedRelationships = loadLinkedRelationships(slidePart, false, metadata);
    // Hyperlink relationships are not loaded here; the original call was left disabled:
    //        Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);

    xhtml.startElement("div", "class", "slide-content");
    try (InputStream slideStream = slidePart.getInputStream()) {
        context.getSAXParser().parse(
                new CloseShieldInputStream(slideStream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(newSlideTextHandler(xhtml, linkedRelationships))));
    } catch (TikaException parseProblem) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(parseProblem));
    }
    xhtml.endElement("div");

    // Layout placeholders go through a PlaceHolderSkipper wrapped around the text handler.
    handleBasicRelatedParts(
            XSLFRelation.SLIDE_LAYOUT.getRelation(),
            "slide-master-content",
            slidePart,
            new PlaceHolderSkipper(newSlideTextHandler(xhtml, linkedRelationships)));
    handleBasicRelatedParts(
            XSLFRelation.NOTES.getRelation(),
            "slide-notes",
            slidePart,
            newSlideTextHandler(xhtml, linkedRelationships));
    handleBasicRelatedParts(
            XSLFRelation.NOTES_MASTER.getRelation(),
            "slide-notes-master",
            slidePart,
            newSlideTextHandler(xhtml, linkedRelationships));
    handleBasicRelatedParts(
            XSLFRelation.COMMENTS.getRelation(),
            null,
            slidePart,
            new XSLFCommentsHandler(xhtml));
}

/**
 * Builds a fresh text handler that writes into {@code xhtml} through a new body-part handler,
 * using the given linked relationships.
 */
private OOXMLWordAndPowerPointTextHandler newSlideTextHandler(XHTMLContentHandler xhtml, Map<String, String> linkedRelationships) {
    return new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships);
}
Also used : OfflineContentHandler(org.apache.tika.sax.OfflineContentHandler) TikaException(org.apache.tika.exception.TikaException) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) InputStream(java.io.InputStream) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 35 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class SXSLFPowerPointExtractorDecorator method handleBasicRelatedParts.

/**
 * This should handle the comments, master, notes, etc.
 *
 * <p>Looks up the parts related to {@code parentPart} by relationship type and, if there are
 * any, parses each one into {@code contentHandler} inside a single enclosing {@code div}.
 * Failures to resolve or parse a related part are recorded in the metadata as warnings and do
 * not abort processing of the remaining parts.
 *
 * @param contentType     relationship type used to look up the related parts
 * @param xhtmlClassLabel value for the enclosing div's {@code class} attribute; when
 *                        {@code null} the div is emitted without a {@code class} attribute
 * @param parentPart      the part whose related parts are processed
 * @param contentHandler  receives the enclosing div and the parsed content of each part
 * @throws SAXException if the content handler or the SAX parse fails
 */
private void handleBasicRelatedParts(String contentType, String xhtmlClassLabel, PackagePart parentPart, ContentHandler contentHandler) throws SAXException {
    PackageRelationshipCollection relatedPartPRC = null;
    try {
        relatedPartPRC = parentPart.getRelationshipsByType(contentType);
    } catch (InvalidFormatException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
    }
    if (relatedPartPRC == null || relatedPartPRC.size() == 0) {
        // Nothing related (or the lookup failed): emit no div at all.
        return;
    }
    AttributesImpl attributes = new AttributesImpl();
    // Callers may pass a null label; a null attribute value must not reach downstream
    // handlers, so the class attribute is only added when a label was supplied.
    if (xhtmlClassLabel != null) {
        attributes.addAttribute("", "class", "class", "CDATA", xhtmlClassLabel);
    }
    contentHandler.startElement("", "div", "div", attributes);
    for (int i = 0; i < relatedPartPRC.size(); i++) {
        PackageRelationship relatedPartPackageRelationship = relatedPartPRC.getRelationship(i);
        try {
            PackagePart relatedPartPart = parentPart.getRelatedPart(relatedPartPackageRelationship);
            try (InputStream stream = relatedPartPart.getInputStream()) {
                // Shield the stream from the parser, as handleSlidePart does; the
                // try-with-resources remains the single owner that closes it.
                context.getSAXParser().parse(new CloseShieldInputStream(stream), new OfflineContentHandler(new EmbeddedContentHandler(contentHandler)));
            } catch (IOException | TikaException e) {
                metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
            }
        } catch (InvalidFormatException e) {
            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
        }
    }
    contentHandler.endElement("", "div", "div");
}
Also used : PackageRelationship(org.apache.poi.openxml4j.opc.PackageRelationship) AttributesImpl(org.xml.sax.helpers.AttributesImpl) OfflineContentHandler(org.apache.tika.sax.OfflineContentHandler) TikaException(org.apache.tika.exception.TikaException) PackageRelationshipCollection(org.apache.poi.openxml4j.opc.PackageRelationshipCollection) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) InputStream(java.io.InputStream) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) IOException(java.io.IOException) PackagePart(org.apache.poi.openxml4j.opc.PackagePart) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException)

Aggregations

TikaException (org.apache.tika.exception.TikaException)144 IOException (java.io.IOException)56 SAXException (org.xml.sax.SAXException)44 InputStream (java.io.InputStream)37 Metadata (org.apache.tika.metadata.Metadata)35 TikaInputStream (org.apache.tika.io.TikaInputStream)33 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)29 ParseContext (org.apache.tika.parser.ParseContext)19 Test (org.junit.Test)19 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)17 ContentHandler (org.xml.sax.ContentHandler)17 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)15 TemporaryResources (org.apache.tika.io.TemporaryResources)15 MediaType (org.apache.tika.mime.MediaType)14 Parser (org.apache.tika.parser.Parser)14 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)13 ByteArrayInputStream (java.io.ByteArrayInputStream)12 ArrayList (java.util.ArrayList)11 File (java.io.File)8 EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler)8