Search in sources :

Example 76 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class OOXMLParserTest method testPowerPointMetadataEarly.

/**
 * Test that the metadata is already extracted when the body is processed.
 * See TIKA-1109
 *
 * <p>The metadata assertions run inside the handler's {@code startDocument()}
 * callback, i.e. at the point where the parser starts emitting the document.
 * The test also verifies that the callback was actually invoked, so it cannot
 * pass without the assertions having run.
 *
 * @throws Exception if a test document cannot be opened or parsed
 */
@Test
public void testPowerPointMetadataEarly() throws Exception {
    String[] extensions = new String[] { "pptx", "pptm", "ppsm", "ppsx", "potm" };
    // Expected content types, index-aligned with the extensions above
    final String[] mimeTypes = new String[] { "application/vnd.openxmlformats-officedocument.presentationml.presentation", "application/vnd.ms-powerpoint.presentation.macroenabled.12", "application/vnd.ms-powerpoint.slideshow.macroenabled.12", "application/vnd.openxmlformats-officedocument.presentationml.slideshow", "application/vnd.ms-powerpoint.template.macroenabled.12" };
    for (int i = 0; i < extensions.length; i++) {
        String extension = extensions[i];
        final String filename = "testPPT." + extension;
        Parser parser = new AutoDetectParser();
        final Metadata metadata = new Metadata();
        // Allow the value to be accessed from the inner class
        final int currentI = i;
        // One-element array so the anonymous class can record that its callback ran
        final boolean[] startDocumentCalled = { false };
        ContentHandler handler = new BodyContentHandler() {

            @Override
            public void startDocument() {
                // Record the call first, so a failing assertion below is reported as itself
                startDocumentCalled[0] = true;
                assertEquals("Mime-type checking for " + filename, mimeTypes[currentI], metadata.get(Metadata.CONTENT_TYPE));
                assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
                assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
                assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
            }
        };
        ParseContext context = new ParseContext();
        try (InputStream input = getTestDocument(filename)) {
            parser.parse(input, handler, metadata, context);
        }
        // Guard against a vacuous pass: all metadata checks live in the callback above
        assertEquals("startDocument() was never called for " + filename, true, startDocumentCalled[0]);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) EmptyParser(org.apache.tika.parser.EmptyParser) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 77 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class OOXMLParserTest method testUnsupportedPowerPoint.

/**
 * For the PowerPoint formats we don't currently support, ensure that
 * parsing them does not break, and that the content type is still reported.
 *
 * @throws Exception if a test document cannot be opened or parsed
 */
@Test
public void testUnsupportedPowerPoint() throws Exception {
    // Each row pairs a test-file extension with the content type expected for it.
    // NOTE(review): the original author left an open "Is this right?" question
    // next to these expected types - confirm before relying on them.
    final String[][] cases = {
        { "xps", "application/vnd.ms-xpsdocument" },
        { "thmx", "application/vnd.openxmlformats-officedocument" }
    };
    for (String[] testCase : cases) {
        String documentName = "testPPT." + testCase[0];
        String expectedType = testCase[1];
        Parser autoParser = new AutoDetectParser();
        Metadata meta = new Metadata();
        meta.set(Metadata.RESOURCE_NAME_KEY, documentName);
        ContentHandler bodyHandler = new BodyContentHandler();
        ParseContext parseContext = new ParseContext();
        try (InputStream in = getTestDocument(documentName)) {
            autoParser.parse(in, bodyHandler, meta, parseContext);
            // The content type should be available, but that's about it
            assertEquals("Mime-type checking for " + documentName, expectedType, meta.get(Metadata.CONTENT_TYPE));
        }
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) EmptyParser(org.apache.tika.parser.EmptyParser) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 78 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class OldExcelParserTest method testMetadata.

/**
 * Parses the old-format Excel test file with {@code OldExcelParser} and checks
 * that the content type is reported while title and subject are left unset.
 *
 * <p>Disabled, until we can get the POI code to tell us the version.
 *
 * @throws Exception if the test file cannot be opened or parsed
 */
@Test
@Ignore
public void testMetadata() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    OldExcelParser parser = new OldExcelParser();
    // try-with-resources closes the stream even when parsing throws
    try (TikaInputStream stream = getTestFile(file)) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    // We can get the content type
    assertEquals("application/vnd.ms-excel.sheet.4", metadata.get(Metadata.CONTENT_TYPE));
    // But no other metadata
    assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
    assertEquals(null, metadata.get(Metadata.SUBJECT));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) TikaInputStream(org.apache.tika.io.TikaInputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Ignore(org.junit.Ignore) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 79 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class OutlookParserTest method testOutlookParsing.

/**
 * Parses the Outlook Express sample message through {@code AutoDetectParser}
 * and checks the reported content type, the message metadata (title, author,
 * recipient fields, raw "From" header, creation date) and the extracted text.
 *
 * @throws Exception if the test message cannot be opened or parsed
 */
@Test
public void testOutlookParsing() throws Exception {
    // Expected values that are checked in more than one place below
    final String title = "Microsoft Outlook Express 6";
    final String recipient = "Nouvel utilisateur de Outlook Express";
    final String sender = "L'Équipe Microsoft Outlook Express";

    // No parser is chosen explicitly: the format should be auto-detected
    Parser autoParser = new AutoDetectParser();
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata meta = new Metadata();
    try (InputStream msg = OutlookParserTest.class.getResourceAsStream("/test-documents/test-outlook.msg")) {
        autoParser.parse(msg, bodyHandler, meta, new ParseContext());
    }

    assertEquals("application/vnd.ms-outlook", meta.get(Metadata.CONTENT_TYPE));
    assertEquals(title, meta.get(TikaCoreProperties.TITLE));
    assertEquals(recipient, meta.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
    assertEquals(sender, meta.get(TikaCoreProperties.CREATOR));
    assertEquals(sender, meta.get(Metadata.AUTHOR));

    // The "raw" header must come back correctly decoded
    assertEquals(sender + " <msoe@microsoft.com>", meta.get(Metadata.MESSAGE_RAW_HEADER_PREFIX + "From"));

    assertEquals(recipient, meta.get(Message.MESSAGE_TO_EMAIL));
    assertEquals("", meta.get(Message.MESSAGE_TO_NAME));
    assertEquals(recipient, meta.get(Message.MESSAGE_TO_DISPLAY_NAME));

    // Stored in the message as Thu, 5 Apr 2007 09:26:06 -0700
    assertEquals("2007-04-05T16:26:06Z", meta.get(TikaCoreProperties.CREATED));

    String bodyText = bodyHandler.toString();
    assertContains(title, bodyText);
    assertContains(sender, bodyText);
    assertContains(recipient, bodyText);
    assertContains("Messagerie et groupes de discussion", bodyText);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 80 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class OutlookParserTest method testOutlookNew.

/**
 * Test case for TIKA-395, to ensure the parser works for new Outlook formats.
 * Checks the content type, the title, a few phrases of the extracted text and
 * the recipient ("to") metadata fields of the Outlook 2003 sample message.
 *
 * @throws Exception if the test message cannot be opened or parsed
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
 */
@Test
public void testOutlookNew() throws Exception {
    // Expected recipient, reported under both the name and display-name keys
    final String recipientName = "New Outlook User";

    Parser autoParser = new AutoDetectParser();
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata meta = new Metadata();
    try (InputStream msg = OutlookParserTest.class.getResourceAsStream("/test-documents/test-outlook2003.msg")) {
        autoParser.parse(msg, bodyHandler, meta, new ParseContext());
    }

    assertEquals("application/vnd.ms-outlook", meta.get(Metadata.CONTENT_TYPE));
    assertEquals("Welcome to Microsoft Office Outlook 2003", meta.get(TikaCoreProperties.TITLE));

    String bodyText = bodyHandler.toString();
    assertContains("Outlook 2003", bodyText);
    assertContains("Streamlined Mail Experience", bodyText);
    assertContains("Navigation Pane", bodyText);

    // The three "to" fields must stay parallel: an empty email next to the names
    assertEquals("", meta.get(Message.MESSAGE_TO_EMAIL));
    assertEquals(recipientName, meta.get(Message.MESSAGE_TO_NAME));
    assertEquals(recipientName, meta.get(Message.MESSAGE_TO_DISPLAY_NAME));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

- ContentHandler (org.xml.sax.ContentHandler): 354
- Metadata (org.apache.tika.metadata.Metadata): 229
- BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 229
- InputStream (java.io.InputStream): 210
- Test (org.junit.Test): 208
- ParseContext (org.apache.tika.parser.ParseContext): 164
- Parser (org.apache.tika.parser.Parser): 106
- TikaTest (org.apache.tika.TikaTest): 103
- AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 102
- TikaInputStream (org.apache.tika.io.TikaInputStream): 75
- ByteArrayInputStream (java.io.ByteArrayInputStream): 64
- SAXException (org.xml.sax.SAXException): 40
- IOException (java.io.IOException): 34
- TeeContentHandler (org.apache.tika.sax.TeeContentHandler): 28
- TikaException (org.apache.tika.exception.TikaException): 24
- ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 24
- WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 24
- XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 21
- AttributesImpl (org.xml.sax.helpers.AttributesImpl): 21
- InputSource (org.xml.sax.InputSource): 20