Search in sources :

Example 91 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class ExcelParserTest method testHeaderAndFooterExtraction.

/**
 * Runs {@code OfficeParser} over the {@code testEXCEL_headers_footers.xls} test document
 * with {@link Locale#UK} set in the parse context, then checks the reported content type,
 * title, creator and author, and that the cell, header and footer strings listed below
 * all appear in the extracted text.
 */
@Test
public void testHeaderAndFooterExtraction() throws Exception {
    final String resource = "/test-documents/testEXCEL_headers_footers.xls";
    try (InputStream xls = ExcelParserTest.class.getResourceAsStream(resource)) {
        Metadata meta = new Metadata();
        ContentHandler body = new BodyContentHandler();
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.UK);

        OfficeParser parser = new OfficeParser();
        parser.parse(xls, body, meta, parseContext);

        // Document-level metadata.
        assertEquals("application/vnd.ms-excel", meta.get(Metadata.CONTENT_TYPE));
        assertEquals("Internal spreadsheet", meta.get(TikaCoreProperties.TITLE));
        assertEquals("Aeham Abushwashi", meta.get(TikaCoreProperties.CREATOR));
        assertEquals("Aeham Abushwashi", meta.get(Metadata.AUTHOR));

        // Strings expected in the extracted text, checked in this order.
        final String[] expectedSnippets = {
            "John Smith1",
            "John Smith50",
            "1 Corporate HQ",
            "Header - Corporate Spreadsheet",
            "Header - For Internal Use Only",
            "Header - Author: John Smith",
            "Footer - Corporate Spreadsheet",
            "Footer - For Internal Use Only",
            "Footer - Author: John Smith"
        };
        String text = body.toString();
        for (String snippet : expectedSnippets) {
            assertContains(snippet, text);
        }
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 92 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class ExcelParserTest method testWorksSpreadsheet70.

/**
 * Runs {@code OfficeParser} over the {@code testWORKSSpreadsheet7.0.xlr} test document
 * with {@link Locale#US} set in the parse context and checks that the extracted text
 * contains "Microsoft Works".
 */
@Test
public void testWorksSpreadsheet70() throws Exception {
    final String resource = "/test-documents/testWORKSSpreadsheet7.0.xlr";
    try (InputStream xlr = ExcelParserTest.class.getResourceAsStream(resource)) {
        Metadata meta = new Metadata();
        // The handler is built with -1 as its constructor argument, exactly as before.
        ContentHandler body = new BodyContentHandler(-1);
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);

        OfficeParser parser = new OfficeParser();
        parser.parse(xlr, body, meta, parseContext);

        String text = body.toString();
        assertContains("Microsoft Works", text);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 93 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class ExcelParserTest method testExcelParserPassword.

/**
 * Parses the {@code testEXCEL_protected_passtika.xls} test document twice.
 * <p>
 * The first pass registers no {@code PasswordProvider} and must end in an
 * {@code EncryptedDocumentException}. The second pass registers a provider that returns
 * "tika" and then checks the metadata and extracted text.
 */
@Test
public void testExcelParserPassword() throws Exception {
    final String resource = "/test-documents/testEXCEL_protected_passtika.xls";

    // First pass: no password available, so the parse is expected to be rejected.
    try (InputStream xls = ExcelParserTest.class.getResourceAsStream(resource)) {
        Metadata meta = new Metadata();
        ContentHandler body = new BodyContentHandler();
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);
        OfficeParser parser = new OfficeParser();
        parser.parse(xls, body, meta, parseContext);
        fail("Document is encrypted, shouldn't parse");
    } catch (EncryptedDocumentException expected) {
        // This is the outcome the test wants; nothing further to do.
    }

    // Second pass: same document, this time with the password supplied.
    try (InputStream xls = ExcelParserTest.class.getResourceAsStream(resource)) {
        Metadata meta = new Metadata();
        ContentHandler body = new BodyContentHandler();
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);

        PasswordProvider fixedPassword = new PasswordProvider() {

            @Override
            public String getPassword(Metadata metadata) {
                return "tika";
            }
        };
        parseContext.set(PasswordProvider.class, fixedPassword);

        OfficeParser parser = new OfficeParser();
        parser.parse(xls, body, meta, parseContext);

        assertEquals("application/vnd.ms-excel", meta.get(Metadata.CONTENT_TYPE));
        assertEquals(null, meta.get(TikaCoreProperties.TITLE));
        assertEquals("Antoni", meta.get(TikaCoreProperties.CREATOR));
        assertEquals("2011-11-25T09:52:48Z", meta.get(TikaCoreProperties.CREATED));

        String text = body.toString();
        assertContains("This is an Encrypted Excel spreadsheet", text);
        assertNotContained("9.0", text);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) PasswordProvider(org.apache.tika.parser.PasswordProvider) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 94 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class PowerPointParserTest method testPowerPointParser.

/**
 * Runs {@code OfficeParser} over the {@code testPPT.ppt} test document with an empty
 * parse context, then checks the reported content type, title, creator and author and
 * that two expected strings appear in the extracted text.
 */
@Test
public void testPowerPointParser() throws Exception {
    final String resource = "/test-documents/testPPT.ppt";
    try (InputStream ppt = PowerPointParserTest.class.getResourceAsStream(resource)) {
        Metadata meta = new Metadata();
        ContentHandler body = new BodyContentHandler();

        OfficeParser parser = new OfficeParser();
        ParseContext emptyContext = new ParseContext();
        parser.parse(ppt, body, meta, emptyContext);

        // Document-level metadata.
        assertEquals("application/vnd.ms-powerpoint", meta.get(Metadata.CONTENT_TYPE));
        assertEquals("Sample Powerpoint Slide", meta.get(TikaCoreProperties.TITLE));
        assertEquals("Keith Bennett", meta.get(TikaCoreProperties.CREATOR));
        assertEquals("Keith Bennett", meta.get(Metadata.AUTHOR));

        // Extracted slide text.
        String text = body.toString();
        assertContains("Sample Powerpoint Slide", text);
        assertContains("Powerpoint X for Mac", text);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 95 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class PowerPointParserTest method testMasterText.

/**
 * TIKA-712: text placed on the master slide of PPT and PPTX files should be
 * extracted as well.
 * <p>
 * Parses {@code testPPT_masterText.ppt} and checks that the text added to the master
 * slide is present, while the "Click to edit Master" boilerplate and any "*"
 * character (TIKA-1171) are absent.
 */
@Test
public void testMasterText() throws Exception {
    final String resource = "/test-documents/testPPT_masterText.ppt";
    ContentHandler body = new BodyContentHandler();
    Metadata meta = new Metadata();
    try (InputStream ppt = PowerPointParserTest.class.getResourceAsStream(resource)) {
        OfficeParser parser = new OfficeParser();
        ParseContext emptyContext = new ParseContext();
        parser.parse(ppt, body, meta, emptyContext);
    }

    String text = body.toString();
    assertContains("Text that I added to the master slide", text);

    // The placeholder boilerplate must not leak into the output.
    int boilerplateAt = text.indexOf("Click to edit Master");
    assertEquals(-1, boilerplateAt);

    // TIKA-1171: no asterisk may appear in the output.
    int asteriskAt = text.indexOf("*");
    assertEquals(-1, asteriskAt);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

ContentHandler (org.xml.sax.ContentHandler): 354, Metadata (org.apache.tika.metadata.Metadata): 229, BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 229, InputStream (java.io.InputStream): 210, Test (org.junit.Test): 208, ParseContext (org.apache.tika.parser.ParseContext): 164, Parser (org.apache.tika.parser.Parser): 106, TikaTest (org.apache.tika.TikaTest): 103, AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 102, TikaInputStream (org.apache.tika.io.TikaInputStream): 75, ByteArrayInputStream (java.io.ByteArrayInputStream): 64, SAXException (org.xml.sax.SAXException): 40, IOException (java.io.IOException): 34, TeeContentHandler (org.apache.tika.sax.TeeContentHandler): 28, TikaException (org.apache.tika.exception.TikaException): 24, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 24, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 24, XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 21, AttributesImpl (org.xml.sax.helpers.AttributesImpl): 21, InputSource (org.xml.sax.InputSource): 20