Search in sources:

Example 36 with BodyContentHandler

Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.

From the class OldExcelParserTest, method testMetadata.

/**
 * Parses the old-format Excel test file and checks the resulting metadata:
 * the content type must be reported, while title and subject must be absent.
 * <p>
 * Disabled, until we can get the POI code to tell us the version.
 */
@Test
@Ignore
public void testMetadata() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    OldExcelParser parser = new OldExcelParser();
    // try-with-resources closes the test file even when parsing throws.
    try (TikaInputStream stream = getTestFile(file)) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    // We can get the content type
    assertEquals("application/vnd.ms-excel.sheet.4", metadata.get(Metadata.CONTENT_TYPE));
    // But no other metadata
    assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
    assertEquals(null, metadata.get(Metadata.SUBJECT));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) TikaInputStream(org.apache.tika.io.TikaInputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Ignore(org.junit.Ignore) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 37 with BodyContentHandler

Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.

From the class OutlookParserTest, method testOutlookParsing.

/**
 * Runs the sample Outlook Express message through the auto-detecting parser
 * and checks the reported content type, the title, sender and recipient
 * metadata, the creation date, and several phrases of the extracted body.
 */
@Test
public void testOutlookParsing() throws Exception {
    final String resourcePath = "/test-documents/test-outlook.msg";
    final String expectedTitle = "Microsoft Outlook Express 6";
    final String expectedSender = "L'Équipe Microsoft Outlook Express";
    final String expectedRecipient = "Nouvel utilisateur de Outlook Express";

    Metadata msgMetadata = new Metadata();
    ContentHandler bodyHandler = new BodyContentHandler();
    // No format-specific parser is chosen here: detection should be automatic.
    Parser autoDetect = new AutoDetectParser();
    try (InputStream in = OutlookParserTest.class.getResourceAsStream(resourcePath)) {
        autoDetect.parse(in, bodyHandler, msgMetadata, new ParseContext());
    }

    assertEquals("application/vnd.ms-outlook", msgMetadata.get(Metadata.CONTENT_TYPE));
    assertEquals(expectedTitle, msgMetadata.get(TikaCoreProperties.TITLE));
    assertEquals(expectedRecipient, msgMetadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
    assertEquals(expectedSender, msgMetadata.get(TikaCoreProperties.CREATOR));
    assertEquals(expectedSender, msgMetadata.get(Metadata.AUTHOR));

    // The "raw" From header must come back correctly decoded.
    assertEquals("L'Équipe Microsoft Outlook Express <msoe@microsoft.com>",
            msgMetadata.get(Metadata.MESSAGE_RAW_HEADER_PREFIX + "From"));

    assertEquals(expectedRecipient, msgMetadata.get(Message.MESSAGE_TO_EMAIL));
    assertEquals("", msgMetadata.get(Message.MESSAGE_TO_NAME));
    assertEquals(expectedRecipient, msgMetadata.get(Message.MESSAGE_TO_DISPLAY_NAME));

    // Stored in the message as Thu, 5 Apr 2007 09:26:06 -0700
    assertEquals("2007-04-05T16:26:06Z", msgMetadata.get(TikaCoreProperties.CREATED));

    String bodyText = bodyHandler.toString();
    assertContains(expectedTitle, bodyText);
    assertContains(expectedSender, bodyText);
    assertContains(expectedRecipient, bodyText);
    assertContains("Messagerie et groupes de discussion", bodyText);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 38 with BodyContentHandler

Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.

From the class OutlookParserTest, method testOutlookNew.

/**
 * Test case for TIKA-395: makes sure the parser copes with the newer Outlook
 * message format by checking the content type, the title, parts of the body
 * text and the recipient fields of the Outlook 2003 sample message.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
 */
@Test
public void testOutlookNew() throws Exception {
    final String resourcePath = "/test-documents/test-outlook2003.msg";
    final String expectedRecipient = "New Outlook User";

    Metadata msgMetadata = new Metadata();
    ContentHandler bodyHandler = new BodyContentHandler();
    Parser autoDetect = new AutoDetectParser();
    try (InputStream in = OutlookParserTest.class.getResourceAsStream(resourcePath)) {
        autoDetect.parse(in, bodyHandler, msgMetadata, new ParseContext());
    }

    assertEquals("application/vnd.ms-outlook", msgMetadata.get(Metadata.CONTENT_TYPE));
    assertEquals("Welcome to Microsoft Office Outlook 2003",
            msgMetadata.get(TikaCoreProperties.TITLE));

    String bodyText = bodyHandler.toString();
    assertContains("Outlook 2003", bodyText);
    assertContains("Streamlined Mail Experience", bodyText);
    assertContains("Navigation Pane", bodyText);

    // The recipient fields are parallel: e-mail is empty, both names are set.
    assertEquals("", msgMetadata.get(Message.MESSAGE_TO_EMAIL));
    assertEquals(expectedRecipient, msgMetadata.get(Message.MESSAGE_TO_NAME));
    assertEquals(expectedRecipient, msgMetadata.get(Message.MESSAGE_TO_DISPLAY_NAME));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 39 with BodyContentHandler

Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.

From the class OOXMLParserTest, method testEncrypted.

/**
 * Checks parsing of password-protected Word, PowerPoint and Excel OOXML
 * documents in two passes: first with a {@code PasswordProvider} that returns
 * the password, expecting the known body text to be extracted; then without a
 * provider, expecting an {@code EncryptedDocumentException} for every document.
 */
@Test
public void testEncrypted() throws Exception {
    // Test document name -> text expected in its decrypted body.
    Map<String, String> tests = new HashMap<>();
    tests.put("testWORD_protected_passtika.docx", "This is an encrypted Word 2007 File");
    tests.put("testPPT_protected_passtika.pptx", "This is an encrypted PowerPoint 2007 slide.");
    tests.put("testEXCEL_protected_passtika.xlsx", "This is an Encrypted Excel spreadsheet.");
    Parser parser = new AutoDetectParser();
    PasswordProvider passwordProvider = new PasswordProvider() {

        @Override
        public String getPassword(Metadata metadata) {
            return "tika";
        }
    };
    ParseContext passwordContext = new ParseContext();
    passwordContext.set(org.apache.tika.parser.PasswordProvider.class, passwordProvider);
    for (Map.Entry<String, String> e : tests.entrySet()) {
        try (InputStream is = getTestDocument(e.getKey())) {
            ContentHandler handler = new BodyContentHandler();
            // Fresh Metadata per document: the parser receives it as an in/out
            // argument, so a shared instance would carry values recorded for one
            // document into the parse of the next.
            parser.parse(is, handler, new Metadata(), passwordContext);
            assertContains(e.getValue(), handler.toString());
        }
    }
    ParseContext context = new ParseContext();
    //now try with no password
    for (Map.Entry<String, String> e : tests.entrySet()) {
        boolean exc = false;
        try (InputStream is = getTestDocument(e.getKey())) {
            ContentHandler handler = new BodyContentHandler();
            parser.parse(is, handler, new Metadata(), context);
        } catch (EncryptedDocumentException ex) {
            exc = true;
        }
        // Name the document so a failure shows which file was parsed without a password.
        assertTrue("Expected EncryptedDocumentException for " + e.getKey(), exc);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) HashMap(java.util.HashMap) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) PasswordProvider(org.apache.tika.parser.PasswordProvider) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) EmptyParser(org.apache.tika.parser.EmptyParser) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Map(java.util.Map) HashMap(java.util.HashMap) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 40 with BodyContentHandler

Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.

From the class OOXMLParserTest, method testWordArt.

/**
 * Parses the WordArt sample presentation with the auto-detecting parser and
 * verifies that the WordArt text shows up in the extracted body.
 */
@Test
public void testWordArt() throws Exception {
    final String resourcePath = "/test-documents/testWordArt.pptx";
    Metadata docMetadata = new Metadata();
    ContentHandler bodyHandler = new BodyContentHandler();
    try (InputStream in = OOXMLParserTest.class.getResourceAsStream(resourcePath)) {
        AutoDetectParser autoDetect = new AutoDetectParser();
        autoDetect.parse(in, bodyHandler, docMetadata, new ParseContext());
    }
    assertContains("Here is some red word Art", bodyHandler.toString());
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Aggregations

BodyContentHandler (org.apache.tika.sax.BodyContentHandler)252 Metadata (org.apache.tika.metadata.Metadata)243 Test (org.junit.Test)213 ContentHandler (org.xml.sax.ContentHandler)202 InputStream (java.io.InputStream)189 ParseContext (org.apache.tika.parser.ParseContext)170 TikaTest (org.apache.tika.TikaTest)117 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)88 Parser (org.apache.tika.parser.Parser)81 ByteArrayInputStream (java.io.ByteArrayInputStream)65 TikaInputStream (org.apache.tika.io.TikaInputStream)65 TikaException (org.apache.tika.exception.TikaException)24 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)24 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)24 IOException (java.io.IOException)18 OfficeParser (org.apache.tika.parser.microsoft.OfficeParser)15 EmptyParser (org.apache.tika.parser.EmptyParser)14 SAXException (org.xml.sax.SAXException)14 MediaType (org.apache.tika.mime.MediaType)10 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)10