Search in sources :

Example 41 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class OOXMLParserTest method testProtectedExcelSheets.

/**
 * Checks a workbook in which only some of the sheets are protected:
 * the document must be detected as an OOXML spreadsheet and flagged
 * as protected in its metadata.
 * See TIKA-364.
 */
@Test
public void testProtectedExcelSheets() throws Exception {
    final String expectedType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
    Metadata meta = new Metadata();
    ContentHandler bodyHandler = new BodyContentHandler();
    Parser autoDetect = new AutoDetectParser();
    try (InputStream xlsx = OOXMLParserTest.class.getResourceAsStream("/test-documents/protectedSheets.xlsx")) {
        autoDetect.parse(xlsx, bodyHandler, meta, new ParseContext());
        // Detected type first, then the protection flag
        assertEquals(expectedType, meta.get(Metadata.CONTENT_TYPE));
        assertEquals("true", meta.get(TikaMetadataKeys.PROTECTED));
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) EmptyParser(org.apache.tika.parser.EmptyParser) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 42 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class OOXMLParserTest method testPowerPointMetadataEarly.

/**
 * Test that the metadata is already extracted when the body is processed.
 * <p>
 * For every supported PowerPoint OOXML flavour, the metadata assertions run
 * inside the handler's {@code startDocument()} callback, i.e. at the moment
 * the parser starts emitting body content. The test additionally verifies
 * that the callback was actually invoked, so it cannot pass vacuously.
 * See TIKA-1109
 */
@Test
public void testPowerPointMetadataEarly() throws Exception {
    String[] extensions = new String[] { "pptx", "pptm", "ppsm", "ppsx", "potm" };
    final String[] mimeTypes = new String[] { "application/vnd.openxmlformats-officedocument.presentationml.presentation", "application/vnd.ms-powerpoint.presentation.macroenabled.12", "application/vnd.ms-powerpoint.slideshow.macroenabled.12", "application/vnd.openxmlformats-officedocument.presentationml.slideshow", "application/vnd.ms-powerpoint.template.macroenabled.12" };
    for (int i = 0; i < extensions.length; i++) {
        final String filename = "testPPT." + extensions[i];
        Parser parser = new AutoDetectParser();
        final Metadata metadata = new Metadata();
        // Allow the value to be accessed from the inner class
        final int currentI = i;
        // One-element array: lets the anonymous class record that its callback ran
        final boolean[] startDocumentSeen = new boolean[1];
        ContentHandler handler = new BodyContentHandler() {

            @Override
            public void startDocument() {
                startDocumentSeen[0] = true;
                assertEquals("Mime-type checking for " + filename, mimeTypes[currentI], metadata.get(Metadata.CONTENT_TYPE));
                assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
                assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
                assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
            }
        };
        ParseContext context = new ParseContext();
        try (InputStream input = getTestDocument(filename)) {
            parser.parse(input, handler, metadata, context);
        }
        // All metadata checks live in startDocument(); if the parser never called it,
        // none of them ran. Written with assertEquals because that is the only
        // assertion helper this method is known to have in scope.
        assertEquals("startDocument() was never invoked for " + filename, Boolean.TRUE, Boolean.valueOf(startDocumentSeen[0]));
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) EmptyParser(org.apache.tika.parser.EmptyParser) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 43 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class OOXMLParserTest method testUnsupportedPowerPoint.

/**
 * PowerPoint-related formats that are not currently supported must still
 * go through the parser without failing, and must report the expected
 * content type.
 */
@Test
public void testUnsupportedPowerPoint() throws Exception {
    // Each row pairs a file extension with the content type expected for it.
    // NOTE: the original author left an "Is this right?" remark on these expected types.
    final String[][] cases = {
        { "xps", "application/vnd.ms-xpsdocument" },
        { "thmx", "application/vnd.openxmlformats-officedocument" }
    };
    for (String[] testCase : cases) {
        String docName = "testPPT." + testCase[0];
        Metadata meta = new Metadata();
        meta.set(Metadata.RESOURCE_NAME_KEY, docName);
        Parser autoDetect = new AutoDetectParser();
        ContentHandler body = new BodyContentHandler();
        try (InputStream doc = getTestDocument(docName)) {
            autoDetect.parse(doc, body, meta, new ParseContext());
            // The content type is the only thing checked for these formats
            assertEquals("Mime-type checking for " + docName, testCase[1], meta.get(Metadata.CONTENT_TYPE));
        }
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) EmptyParser(org.apache.tika.parser.EmptyParser) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 44 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class OutlookParserTest method testOutlookHTMLVersion.

@Test
public void testOutlookHTMLVersion() throws Exception {
    // Capture the parser's SAX output as indented XML in an in-memory buffer
    StringWriter xmlOut = new StringWriter();
    TransformerHandler xmlHandler = ((SAXTransformerFactory) SAXTransformerFactory.newInstance()).newTransformerHandler();
    xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
    xmlHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
    xmlHandler.setResult(new StreamResult(xmlOut));

    Metadata meta = new Metadata();
    Parser autoDetect = new AutoDetectParser();
    try (InputStream msg = OutlookParserTest.class.getResourceAsStream("/test-documents/testMSG_chinese.msg")) {
        autoDetect.parse(msg, xmlHandler, meta, new ParseContext());
    }
    String xml = xmlOut.toString();

    // The HTML body should have been processed, so its markup must show up
    assertContains("<dd>tests.chang@fengttt.com</dd>", xml);
    assertContains("<p>Alfresco MSG format testing", xml);
    assertContains("<li>1", xml);
    assertContains("<li>2", xml);

    // Exactly one <body> ... </body> pair: no nested html documents
    String[] aroundBodyOpen = xml.split("<body>");
    assertEquals(2, aroundBodyOpen.length);
    String[] aroundBodyClose = xml.split("<\\/body>");
    assertEquals(2, aroundBodyClose.length);

    // The Chinese text must survive, both in the metadata and in the content
    assertContains("張毓倫", meta.get(TikaCoreProperties.CREATOR));
    assertContains("陳惠珍", xml);

    assertEquals("tests.chang@fengttt.com", meta.get(Message.MESSAGE_TO_EMAIL));
    assertEquals("Tests Chang@FT (張毓倫)", meta.get(Office.MAPI_FROM_REPRESENTING_NAME));
    assertEquals("/O=FT GROUP/OU=FT/CN=RECIPIENTS/CN=LYDIACHANG", meta.get(Office.MAPI_FROM_REPRESENTING_EMAIL));
}
Also used : TransformerHandler(javax.xml.transform.sax.TransformerHandler) StringWriter(java.io.StringWriter) StreamResult(javax.xml.transform.stream.StreamResult) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) SAXTransformerFactory(javax.xml.transform.sax.SAXTransformerFactory) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 45 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class OutlookParserTest method testOutlookParsing.

@Test
public void testOutlookParsing() throws Exception {
    // Expected values that are checked in more than one place
    final String title = "Microsoft Outlook Express 6";
    final String sender = "L'Équipe Microsoft Outlook Express";
    final String recipient = "Nouvel utilisateur de Outlook Express";

    Metadata meta = new Metadata();
    ContentHandler body = new BodyContentHandler();
    // No explicit parser type is given: detection is left to AutoDetectParser
    Parser autoDetect = new AutoDetectParser();
    try (InputStream msg = OutlookParserTest.class.getResourceAsStream("/test-documents/test-outlook.msg")) {
        autoDetect.parse(msg, body, meta, new ParseContext());
    }

    assertEquals("application/vnd.ms-outlook", meta.get(Metadata.CONTENT_TYPE));
    assertEquals(title, meta.get(TikaCoreProperties.TITLE));
    assertEquals(recipient, meta.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
    assertEquals(sender, meta.get(TikaCoreProperties.CREATOR));
    assertEquals(sender, meta.get(Metadata.AUTHOR));

    // The "raw" From header must come back correctly decoded
    assertEquals("L'Équipe Microsoft Outlook Express <msoe@microsoft.com>", meta.get(Metadata.MESSAGE_RAW_HEADER_PREFIX + "From"));

    assertEquals(recipient, meta.get(Message.MESSAGE_TO_EMAIL));
    assertEquals("", meta.get(Message.MESSAGE_TO_NAME));
    assertEquals(recipient, meta.get(Message.MESSAGE_TO_DISPLAY_NAME));

    // Stored as Thu, 5 Apr 2007 09:26:06 -0700
    assertEquals("2007-04-05T16:26:06Z", meta.get(TikaCoreProperties.CREATED));

    String text = body.toString();
    assertContains(title, text);
    assertContains(sender, text);
    assertContains(recipient, text);
    assertContains("Messagerie et groupes de discussion", text);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

AutoDetectParser (org.apache.tika.parser.AutoDetectParser) 164, Metadata (org.apache.tika.metadata.Metadata) 136, Test (org.junit.Test) 122, InputStream (java.io.InputStream) 117, Parser (org.apache.tika.parser.Parser) 111, ParseContext (org.apache.tika.parser.ParseContext) 103, BodyContentHandler (org.apache.tika.sax.BodyContentHandler) 96, ContentHandler (org.xml.sax.ContentHandler) 91, TikaTest (org.apache.tika.TikaTest) 82, TikaInputStream (org.apache.tika.io.TikaInputStream) 63, ByteArrayInputStream (java.io.ByteArrayInputStream) 34, CompositeParser (org.apache.tika.parser.CompositeParser) 28, TikaConfig (org.apache.tika.config.TikaConfig) 18, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest) 17, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest) 17, TesseractOCRParser (org.apache.tika.parser.ocr.TesseractOCRParser) 15, EmptyParser (org.apache.tika.parser.EmptyParser) 13, XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler) 13, DefaultHandler (org.xml.sax.helpers.DefaultHandler) 12, TikaException (org.apache.tika.exception.TikaException) 11