Search in sources :

Example 61 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class BundleIT method testTikaBundle.

/**
 * Parses {@code src/test/resources/test-documents.zip} with the parser obtained
 * from a default {@link Tika} instance and checks that the extracted body text
 * contains every expected fragment (what appear to be the archive's entry names
 * plus a snippet of text from each entry).
 *
 * @throws Exception if the archive cannot be opened or parsed
 */
@Test
public void testTikaBundle() throws Exception {
    Tika tika = new Tika();
    // Package extraction
    ContentHandler handler = new BodyContentHandler();
    Parser parser = tika.getParser();
    ParseContext context = new ParseContext();
    // Make the same parser available through the parse context
    context.set(Parser.class, parser);
    try (InputStream stream = new FileInputStream("src/test/resources/test-documents.zip")) {
        parser.parse(stream, handler, new Metadata(), context);
    }
    String content = handler.toString();
    // Checked in this order; the first missing fragment fails the test
    // and is named in the assertion message.
    String[] expectedFragments = {
        "testEXCEL.xls",
        "Sample Excel Worksheet",
        "testHTML.html",
        "Test Indexation Html",
        "testOpenOffice2.odt",
        "This is a sample Open Office document",
        "testPDF.pdf",
        "Apache Tika",
        "testPPT.ppt",
        "Sample Powerpoint Slide",
        "testRTF.rtf",
        "indexation Word",
        "testTXT.txt",
        "Test d'indexation de Txt",
        "testWORD.doc",
        "This is a sample Microsoft Word Document",
        "testXML.xml",
        "Rida Benjelloun"
    };
    for (String expected : expectedFragments) {
        assertTrue("Extracted content is missing: " + expected, content.contains(expected));
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) JarInputStream(java.util.jar.JarInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) Tika(org.apache.tika.Tika) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) FileInputStream(java.io.FileInputStream) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) DefaultParser(org.apache.tika.parser.DefaultParser) ForkParser(org.apache.tika.fork.ForkParser) TesseractOCRParser(org.apache.tika.parser.ocr.TesseractOCRParser) Test(org.junit.Test)

Example 62 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class OutlookExtractor method parse.

/**
 * Writes the message held in {@code msg} to the given handler and metadata.
 * <p>
 * In order, this: records the message class, subject, conversation topic,
 * recipient addresses and raw headers in {@code metadata}; sets the
 * created/modified dates from the message date or, failing that, from a
 * {@code Date} header; emits the subject and the from/to/cc/bcc/recipient
 * details; emits the message body (preferring HTML, then RTF, then plain
 * text); and finally hands every attachment to the embedded-resource handlers.
 * <p>
 * The message is deliberately NOT closed here (see the note at the end of the
 * method body).
 *
 * @param xhtml    handler that receives the generated XHTML events
 * @param metadata metadata object to populate
 * @throws TikaException if a chunk is reported missing even though the message
 *                       was asked to return null on missing chunks
 * @throws SAXException  if the handler rejects an event
 * @throws IOException   if reading the message body or an attachment fails
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException {
    try {
        msg.setReturnNullOnMissingChunk(true);
        try {
            metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass()));
        } catch (ChunkNotFoundException ignored) {
            // Deliberately ignored: without the chunk the message class is simply left unset
        }
        // If the message contains strings that aren't stored
        //  as Unicode, try to sort out an encoding for them
        if (msg.has7BitEncodingStrings()) {
            guess7BitEncoding(msg);
        }
        // Start with the metadata
        String subject = msg.getSubject();
        Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
        String from = msg.getDisplayFrom();
        handleFromTo(headers, metadata);
        metadata.set(TikaCoreProperties.TITLE, subject);
        // TODO: Move to description in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());
        try {
            for (String recipientAddress : msg.getRecipientEmailAddressList()) {
                if (recipientAddress != null) {
                    metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
                }
            }
        } catch (ChunkNotFoundException ignored) {
            // Deliberately ignored: no recipient chunk means no recipient addresses to record
        }
        for (Map.Entry<String, String[]> e : headers.entrySet()) {
            String headerKey = e.getKey();
            for (String headerValue : e.getValue()) {
                metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue);
            }
        }
        // First try via the proper chunk
        if (msg.getMessageDate() != null) {
            metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
            metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
        } else {
            // Fall back to a Date header, if there is one
            String date = findDateHeader(headers);
            if (date != null) {
                // See if we can parse it as a normal mail date
                try {
                    Date d = MboxParser.parseDate(date);
                    metadata.set(TikaCoreProperties.CREATED, d);
                    metadata.set(TikaCoreProperties.MODIFIED, d);
                } catch (ParseException e) {
                    // Store it as-is, and hope for the best...
                    metadata.set(TikaCoreProperties.CREATED, date);
                    metadata.set(TikaCoreProperties.MODIFIED, date);
                }
            }
        }
        xhtml.element("h1", subject);
        // Output the from and to details in text, as you
        //  often want them in text form for searching
        xhtml.startElement("dl");
        if (from != null) {
            header(xhtml, "From", from);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException ignored) {
            // Deliberately ignored: the "Recipients" line is omitted when the chunk is missing
        }
        xhtml.endElement("dl");
        // Get the message body. Preference order is: html, rtf, text
        Chunk htmlChunk = null;
        Chunk rtfChunk = null;
        Chunk textChunk = null;
        for (Chunk chunk : msg.getMainChunks().getChunks()) {
            if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
                htmlChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
                rtfChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.BODY.id) {
                textChunk = chunk;
            }
        }
        boolean doneBody = false;
        xhtml.startElement("div", "class", "message-body");
        if (htmlChunk != null) {
            byte[] data = null;
            if (htmlChunk instanceof ByteChunk) {
                data = ((ByteChunk) htmlChunk).getValue();
            } else if (htmlChunk instanceof StringChunk) {
                data = ((StringChunk) htmlChunk).getRawValue();
            }
            if (data != null) {
                Parser htmlParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
                if (htmlParser == null) {
                    htmlParser = new HtmlParser();
                }
                htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
                doneBody = true;
            }
        }
        // instanceof (rather than a blind cast) mirrors the HTML branch: a chunk of an
        // unexpected type falls through to the next body format instead of throwing
        // a ClassCastException
        if (rtfChunk instanceof ByteChunk && !doneBody) {
            ByteChunk chunk = (ByteChunk) rtfChunk;
            MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue());
            Parser rtfParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
            if (rtfParser == null) {
                rtfParser = new RTFParser();
            }
            rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
            doneBody = true;
        }
        if (textChunk instanceof StringChunk && !doneBody) {
            xhtml.element("p", ((StringChunk) textChunk).getValue());
        }
        xhtml.endElement("div");
        // Process the attachments
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            xhtml.startElement("div", "class", "attachment-entry");
            // Prefer the long file name, fall back to the short one
            String filename = null;
            if (attachment.getAttachLongFileName() != null) {
                filename = attachment.getAttachLongFileName().getValue();
            } else if (attachment.getAttachFileName() != null) {
                filename = attachment.getAttachFileName().getValue();
            }
            if (filename != null && filename.length() > 0) {
                xhtml.element("h1", filename);
            }
            if (attachment.getAttachData() != null) {
                handleEmbeddedResource(TikaInputStream.get(attachment.getAttachData().getValue()), filename, null, null, xhtml, true);
            }
            if (attachment.getAttachmentDirectory() != null) {
                handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), xhtml);
            }
            xhtml.endElement("div");
        }
    } catch (ChunkNotFoundException e) {
        throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
    }
    //You'd think you'd want to call msg.close() once we're done.
    //Don't do that.  That closes down the file system.
    //If an msg has multiple msg attachments, some of them
    //can reside in the same file system.  After the first
    //child is read, the fs is closed, and the other children
    //get a java.nio.channels.ClosedChannelException
}

/**
 * Looks up the raw text of the message's {@code Date} header.
 * <p>
 * Two header shapes are accepted: a key that is the whole header line
 * ({@code "Date: <value>"}, the form the date lookup originally expected), and
 * a key that is just the header name ({@code "Date"}) with the date held in the
 * value array, which is how the raw headers are copied into the metadata.
 *
 * @param headers normalized headers, in iteration order
 * @return the trimmed date text of the first matching header, or {@code null}
 *         if no date header is present
 */
private static String findDateHeader(Map<String, String[]> headers) {
    for (Map.Entry<String, String[]> header : headers.entrySet()) {
        String headerKey = header.getKey();
        if (headerKey == null) {
            continue;
        }
        String lowerKey = headerKey.toLowerCase(Locale.ROOT);
        if (lowerKey.startsWith("date:")) {
            return headerKey.substring(headerKey.indexOf(':') + 1).trim();
        }
        if (lowerKey.equals("date") && header.getValue() != null) {
            for (String value : header.getValue()) {
                if (value != null && value.trim().length() > 0) {
                    return value.trim();
                }
            }
        }
    }
    return null;
}
Also used : ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) MAPIRtfAttribute(org.apache.poi.hmef.attribute.MAPIRtfAttribute) TikaException(org.apache.tika.exception.TikaException) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) Metadata(org.apache.tika.metadata.Metadata) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Chunk(org.apache.poi.hsmf.datatypes.Chunk) Date(java.util.Date) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Parser(org.apache.tika.parser.Parser) HtmlParser(org.apache.tika.parser.html.HtmlParser) RTFParser(org.apache.tika.parser.rtf.RTFParser) MboxParser(org.apache.tika.parser.mbox.MboxParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) RTFParser(org.apache.tika.parser.rtf.RTFParser) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseException(java.text.ParseException) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks)

Example 63 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class ParsingEmbeddedDocumentExtractor method parseEmbedded.

/**
 * Parses one embedded document with the delegating parser and forwards its
 * body content to {@code handler}.
 * <p>
 * When {@code outputHtml} is set, the content is wrapped in a
 * {@code <div class="package-entry">} and, if the metadata carries a non-empty
 * resource name, preceded by an {@code <h1>} holding that name. Encrypted or
 * otherwise unparseable entries are skipped silently; the wrapping
 * {@code div} is still closed for them.
 *
 * @param stream     the embedded document; it is shielded from being closed here
 * @param handler    receives the SAX events for the embedded content
 * @param metadata   metadata of the embedded document; read for the resource
 *                   name and passed on to the parser
 * @param outputHtml whether to emit the wrapping {@code div} and name heading
 * @throws SAXException if the handler rejects an event
 * @throws IOException  if the stream cannot be read
 */
public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException {
    if (outputHtml) {
        AttributesImpl divAttributes = new AttributesImpl();
        divAttributes.addAttribute("", "class", "class", "CDATA", "package-entry");
        handler.startElement(XHTML, "div", "div", divAttributes);
    }

    String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
    boolean hasResourceName = resourceName != null && resourceName.length() > 0;
    if (outputHtml && hasResourceName) {
        handler.startElement(XHTML, "h1", "h1", new AttributesImpl());
        char[] nameChars = resourceName.toCharArray();
        handler.characters(nameChars, 0, nameChars.length);
        handler.endElement(XHTML, "h1", "h1");
    }

    // Use the delegate parser to parse this entry
    try (TemporaryResources tmp = new TemporaryResources()) {
        final TikaInputStream shielded = TikaInputStream.get(new CloseShieldInputStream(stream), tmp);
        // Carry over an already-open container from the incoming stream, if it has one
        if (stream instanceof TikaInputStream) {
            final Object openContainer = ((TikaInputStream) stream).getOpenContainer();
            if (openContainer != null) {
                shielded.setOpenContainer(openContainer);
            }
        }
        ContentHandler embeddedHandler = new EmbeddedContentHandler(new BodyContentHandler(handler));
        DELEGATING_PARSER.parse(shielded, embeddedHandler, metadata, context);
    } catch (EncryptedDocumentException ede) {
        // TODO: can we log a warning that we lack the password?
        // For now, just skip the content
    } catch (TikaException e) {
        // TODO: can we log a warning somehow?
        // Could not parse the entry, just skip the content
    }

    if (outputHtml) {
        handler.endElement(XHTML, "div", "div");
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) AttributesImpl(org.xml.sax.helpers.AttributesImpl) EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) TikaException(org.apache.tika.exception.TikaException) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) CloseShieldInputStream(org.apache.tika.io.CloseShieldInputStream)

Example 64 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class EpubParser method parse.

/**
 * Walks the zip entries of an EPub stream and dispatches each one by name:
 * the {@code mimetype} entry sets the content type, {@code metadata.xml} and
 * {@code *.opf} entries go to the {@code meta} parser, and {@code *.html} /
 * {@code *.xhtml} entries go to the {@code content} parser. All other entries
 * are ignored.
 *
 * @param stream   the EPub (zip) data; not closed by this method
 * @param handler  receives the combined XHTML output
 * @param metadata populated with the content type and whatever the
 *                 delegate parsers add
 * @param context  parse context handed to the delegate parsers
 * @throws IOException   if the zip stream cannot be read
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if a delegate parser fails
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Because an EPub file is often made up of multiple XHTML files,
    //  we need explicit control over the start and end of the document
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    ContentHandler childHandler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));
    ZipInputStream zip = new ZipInputStream(stream);
    for (ZipEntry entry = zip.getNextEntry(); entry != null; entry = zip.getNextEntry()) {
        String entryName = entry.getName();
        if (entryName.equals("mimetype")) {
            String type = IOUtils.toString(zip, UTF_8);
            //often has trailing new lines
            if (type != null) {
                type = type.trim();
            }
            metadata.set(Metadata.CONTENT_TYPE, type);
        } else if (entryName.equals("metadata.xml") || entryName.endsWith(".opf")) {
            // Both kinds of metadata entry are handled the same way
            meta.parse(zip, new DefaultHandler(), metadata, context);
        } else if (entryName.endsWith(".html") || entryName.endsWith(".xhtml")) {
            content.parse(zip, childHandler, metadata, context);
        }
    }
    // Finish everything
    xhtml.endDocument();
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ZipInputStream(java.util.zip.ZipInputStream) ZipEntry(java.util.zip.ZipEntry) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) ContentHandler(org.xml.sax.ContentHandler) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Example 65 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class NetCDFParserTest method testParseGlobalMetadata.

/**
 * Parses the bundled {@code sresa1b_ncar_ccsm3_0_run1_200001.nc} test document
 * with {@link NetCDFParser} and checks both the extracted metadata values and
 * selected fragments of the extracted text.
 *
 * @throws Exception if the test document cannot be read or parsed
 */
@Test
public void testParseGlobalMetadata() throws Exception {
    Parser parser = new NetCDFParser();
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream stream = NetCDFParser.class.getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc")) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    // assertEquals takes (expected, actual); keeping that order makes the
    // failure message report the right value as "expected"
    assertEquals("model output prepared for IPCC AR4", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("ccsm@ucar.edu", metadata.get(Metadata.CONTACT));
    assertEquals("IPCC Fourth Assessment", metadata.get(Metadata.PROJECT_ID));
    assertEquals("CF-1.0", metadata.get(Metadata.CONVENTIONS));
    assertEquals("1", metadata.get(Metadata.REALIZATION));
    assertEquals("720 ppm stabilization experiment (SRESA1B)", metadata.get(Metadata.EXPERIMENT_ID));
    assertEquals("NetCDF-3/CDM", metadata.get("File-Type-Description"));
    String content = handler.toString();
    assertContains("long_name = \"Surface area\"", content);
    assertContains("float area(lat=128, lon=256)", content);
    assertContains("float lat(lat=128)", content);
    assertContains("double lat_bnds(lat=128, bnds=2)", content);
    assertContains("double lon_bnds(lon=256, bnds=2)", content);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) Test(org.junit.Test)

Aggregations

BodyContentHandler (org.apache.tika.sax.BodyContentHandler)261 Metadata (org.apache.tika.metadata.Metadata)252 Test (org.junit.Test)213 ContentHandler (org.xml.sax.ContentHandler)206 InputStream (java.io.InputStream)194 ParseContext (org.apache.tika.parser.ParseContext)176 TikaTest (org.apache.tika.TikaTest)117 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)92 Parser (org.apache.tika.parser.Parser)84 ByteArrayInputStream (java.io.ByteArrayInputStream)66 TikaInputStream (org.apache.tika.io.TikaInputStream)66 TikaException (org.apache.tika.exception.TikaException)25 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)24 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)24 IOException (java.io.IOException)23 EmptyParser (org.apache.tika.parser.EmptyParser)15 OfficeParser (org.apache.tika.parser.microsoft.OfficeParser)15 SAXException (org.xml.sax.SAXException)15 MediaType (org.apache.tika.mime.MediaType)11 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)10