Search in sources:

Example 51 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class ExcelParserTest method testExcelParserPassword.

/**
 * Checks password handling for a protected Excel workbook: parsing without a
 * password provider must raise {@link EncryptedDocumentException}, while parsing
 * with the password "tika" must succeed and expose the expected metadata and text.
 */
@Test
public void testExcelParserPassword() throws Exception {
    final String resource = "/test-documents/testEXCEL_protected_passtika.xls";

    // First pass: no PasswordProvider is registered, so the parse must be rejected.
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(resource)) {
        Metadata unusedMetadata = new Metadata();
        ContentHandler unusedHandler = new BodyContentHandler();
        ParseContext noPasswordContext = new ParseContext();
        noPasswordContext.set(Locale.class, Locale.US);
        new OfficeParser().parse(stream, unusedHandler, unusedMetadata, noPasswordContext);
        fail("Document is encrypted, shouldn't parse");
    } catch (EncryptedDocumentException expected) {
        // This is the outcome the first pass is looking for; nothing to do.
    }

    // Second pass: same document, but the context now supplies the password.
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(resource)) {
        PasswordProvider fixedPassword = new PasswordProvider() {

            @Override
            public String getPassword(Metadata ignored) {
                return "tika";
            }
        };

        Metadata parsedMetadata = new Metadata();
        ContentHandler bodyHandler = new BodyContentHandler();
        ParseContext passwordContext = new ParseContext();
        passwordContext.set(Locale.class, Locale.US);
        passwordContext.set(PasswordProvider.class, fixedPassword);
        new OfficeParser().parse(stream, bodyHandler, parsedMetadata, passwordContext);

        // Metadata extracted from the decrypted workbook.
        assertEquals("application/vnd.ms-excel", parsedMetadata.get(Metadata.CONTENT_TYPE));
        assertEquals(null, parsedMetadata.get(TikaCoreProperties.TITLE));
        assertEquals("Antoni", parsedMetadata.get(TikaCoreProperties.CREATOR));
        assertEquals("2011-11-25T09:52:48Z", parsedMetadata.get(TikaCoreProperties.CREATED));

        // Body text extracted from the decrypted workbook.
        String extractedText = bodyHandler.toString();
        assertContains("This is an Encrypted Excel spreadsheet", extractedText);
        assertNotContained("9.0", extractedText);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) PasswordProvider(org.apache.tika.parser.PasswordProvider) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 52 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class PowerPointParserTest method testPowerPointParser.

/**
 * Parses the sample PPT test document with {@code OfficeParser} and verifies
 * the reported content type, title and author metadata, and that the slide
 * text shows up in the extracted body.
 */
@Test
public void testPowerPointParser() throws Exception {
    final String resource = "/test-documents/testPPT.ppt";
    try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(resource)) {
        Metadata parsedMetadata = new Metadata();
        ContentHandler bodyHandler = new BodyContentHandler();
        ParseContext parseContext = new ParseContext();
        new OfficeParser().parse(stream, bodyHandler, parsedMetadata, parseContext);

        // Metadata checks.
        assertEquals("application/vnd.ms-powerpoint", parsedMetadata.get(Metadata.CONTENT_TYPE));
        assertEquals("Sample Powerpoint Slide", parsedMetadata.get(TikaCoreProperties.TITLE));
        assertEquals("Keith Bennett", parsedMetadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Keith Bennett", parsedMetadata.get(Metadata.AUTHOR));

        // Body text checks.
        String extractedText = bodyHandler.toString();
        assertContains("Sample Powerpoint Slide", extractedText);
        assertContains("Powerpoint X for Mac", extractedText);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 53 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class PowerPointParserTest method testMasterText.

/**
 * TIKA-712: text placed on the master slide of PPT and PPTX files should be
 * extracted too.
 *
 * <p>Parses a PPT whose master slide carries custom text and checks that the
 * custom text is present while the "Click to edit Master" placeholder text and
 * any '*' character are absent.
 */
@Test
public void testMasterText() throws Exception {
    final String resource = "/test-documents/testPPT_masterText.ppt";
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata parsedMetadata = new Metadata();
    try (InputStream input = PowerPointParserTest.class.getResourceAsStream(resource)) {
        ParseContext parseContext = new ParseContext();
        new OfficeParser().parse(input, bodyHandler, parsedMetadata, parseContext);
    }

    String extractedText = bodyHandler.toString();
    assertContains("Text that I added to the master slide", extractedText);

    // The placeholder boilerplate must not leak into the output.
    int boilerplateIndex = extractedText.indexOf("Click to edit Master");
    assertEquals(-1, boilerplateIndex);

    // TIKA-1171: no '*' may appear in the extracted text.
    int asteriskIndex = extractedText.indexOf("*");
    assertEquals(-1, asteriskIndex);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 54 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class PowerPointParserTest method testMasterFooter.

/**
 * Parses a PPT whose master slide defines a footer and checks that the footer
 * text is extracted, while the "Click to edit Master" placeholder text and any
 * '*' character are absent from the output.
 */
@Test
public void testMasterFooter() throws Exception {
    final String resource = "/test-documents/testPPT_masterFooter.ppt";
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata parsedMetadata = new Metadata();
    try (InputStream input = PowerPointParserTest.class.getResourceAsStream(resource)) {
        ParseContext parseContext = new ParseContext();
        new OfficeParser().parse(input, bodyHandler, parsedMetadata, parseContext);
    }

    String extractedText = bodyHandler.toString();
    assertContains("Master footer is here", extractedText);

    // The placeholder boilerplate must not leak into the output.
    int boilerplateIndex = extractedText.indexOf("Click to edit Master");
    assertEquals(-1, boilerplateIndex);

    // TIKA-1171: no '*' may appear in the extracted text.
    int asteriskIndex = extractedText.indexOf("*");
    assertEquals(-1, asteriskIndex);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 55 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class PublisherParserTest method testPublisherParser.

/**
 * Parses the sample Publisher test document with {@code OfficeParser} and
 * verifies the reported content type, the absent title, the author metadata,
 * and two text fragments expected in the extracted body.
 */
@Test
public void testPublisherParser() throws Exception {
    final String resource = "/test-documents/testPUBLISHER.pub";
    try (InputStream stream = PublisherParserTest.class.getResourceAsStream(resource)) {
        Metadata parsedMetadata = new Metadata();
        ContentHandler bodyHandler = new BodyContentHandler();
        ParseContext parseContext = new ParseContext();
        new OfficeParser().parse(stream, bodyHandler, parsedMetadata, parseContext);

        // Metadata checks; this document is expected to carry no title.
        assertEquals("application/x-mspublisher", parsedMetadata.get(Metadata.CONTENT_TYPE));
        assertEquals(null, parsedMetadata.get(TikaCoreProperties.TITLE));
        assertEquals("Nick Burch", parsedMetadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Nick Burch", parsedMetadata.get(Metadata.AUTHOR));

        // Body text checks.
        String extractedText = bodyHandler.toString();
        assertContains("0123456789", extractedText);
        assertContains("abcdef", extractedText);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test)

Aggregations

BodyContentHandler (org.apache.tika.sax.BodyContentHandler)261 Metadata (org.apache.tika.metadata.Metadata)252 Test (org.junit.Test)213 ContentHandler (org.xml.sax.ContentHandler)206 InputStream (java.io.InputStream)194 ParseContext (org.apache.tika.parser.ParseContext)176 TikaTest (org.apache.tika.TikaTest)117 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)92 Parser (org.apache.tika.parser.Parser)84 ByteArrayInputStream (java.io.ByteArrayInputStream)66 TikaInputStream (org.apache.tika.io.TikaInputStream)66 TikaException (org.apache.tika.exception.TikaException)25 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)24 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)24 IOException (java.io.IOException)23 EmptyParser (org.apache.tika.parser.EmptyParser)15 OfficeParser (org.apache.tika.parser.microsoft.OfficeParser)15 SAXException (org.xml.sax.SAXException)15 MediaType (org.apache.tika.mime.MediaType)11 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)10