Search in sources:

Example 81 with ContentHandler

Use of org.xml.sax.ContentHandler in project tika by apache.

The class OOXMLParserTest, method testEncrypted.

/**
 * Checks handling of password-protected OOXML documents (Word, PowerPoint
 * and Excel).
 *
 * <p>Each document is parsed twice: first with a {@link PasswordProvider}
 * that returns "tika", in which case the expected text must appear in the
 * extracted content; then with an empty {@link ParseContext}, in which case
 * an {@link EncryptedDocumentException} must be thrown.
 *
 * @throws Exception if a test document cannot be opened, or parsing fails
 *                   in a way the test does not expect
 */
@Test
public void testEncrypted() throws Exception {
    Map<String, String> tests = new HashMap<String, String>();
    tests.put("testWORD_protected_passtika.docx", "This is an encrypted Word 2007 File");
    tests.put("testPPT_protected_passtika.pptx", "This is an encrypted PowerPoint 2007 slide.");
    tests.put("testEXCEL_protected_passtika.xlsx", "This is an Encrypted Excel spreadsheet.");
    Parser parser = new AutoDetectParser();
    PasswordProvider passwordProvider = new PasswordProvider() {

        @Override
        public String getPassword(Metadata metadata) {
            return "tika";
        }
    };
    ParseContext passwordContext = new ParseContext();
    passwordContext.set(org.apache.tika.parser.PasswordProvider.class, passwordProvider);
    for (Map.Entry<String, String> e : tests.entrySet()) {
        try (InputStream is = getTestDocument(e.getKey())) {
            ContentHandler handler = new BodyContentHandler();
            // Use a fresh Metadata per document so that nothing recorded while
            // parsing one file is carried over into the parse of the next.
            parser.parse(is, handler, new Metadata(), passwordContext);
            assertContains(e.getValue(), handler.toString());
        }
    }
    //now try with no password
    ParseContext context = new ParseContext();
    for (Map.Entry<String, String> e : tests.entrySet()) {
        boolean exc = false;
        try (InputStream is = getTestDocument(e.getKey())) {
            ContentHandler handler = new BodyContentHandler();
            parser.parse(is, handler, new Metadata(), context);
        } catch (EncryptedDocumentException ex) {
            exc = true;
        }
        // Name the offending file so a failure is attributable without a debugger.
        assertTrue("Expected EncryptedDocumentException for " + e.getKey(), exc);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) HashMap(java.util.HashMap) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) PasswordProvider(org.apache.tika.parser.PasswordProvider) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) EmptyParser(org.apache.tika.parser.EmptyParser) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Map(java.util.Map) HashMap(java.util.HashMap) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 82 with ContentHandler

Use of org.xml.sax.ContentHandler in project tika by apache.

The class OOXMLParserTest, method testWordArt.

/**
 * Parses the {@code testWordArt.pptx} classpath resource with a new
 * {@link AutoDetectParser} and checks that the WordArt text is present in
 * the extracted body content.
 *
 * @throws Exception if the resource cannot be read or parsed
 */
@Test
public void testWordArt() throws Exception {
    ContentHandler body = new BodyContentHandler();
    Metadata meta = new Metadata();
    try (InputStream pptx = OOXMLParserTest.class.getResourceAsStream("/test-documents/testWordArt.pptx")) {
        AutoDetectParser autoDetect = new AutoDetectParser();
        ParseContext ctx = new ParseContext();
        autoDetect.parse(pptx, body, meta, ctx);
    }
    assertContains("Here is some red word Art", body.toString());
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 83 with ContentHandler

Use of org.xml.sax.ContentHandler in project tika by apache.

The class OOXMLParserTest, method testWordFootnote.

/**
 * Parses {@code footnotes.docx} to plain text and checks both the reported
 * content type and that the extracted text contains the string "snoska".
 *
 * @throws Exception if the test document cannot be opened or parsed
 */
@Test
public void testWordFootnote() throws Exception {
    ContentHandler body = new BodyContentHandler();
    Metadata meta = new Metadata();
    ParseContext ctx = new ParseContext();
    try (InputStream docx = getTestDocument("footnotes.docx")) {
        parser.parse(docx, body, meta, ctx);

        String detectedType = meta.get(Metadata.CONTENT_TYPE);
        assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document", detectedType);

        String text = body.toString();
        assertTrue(text.contains("snoska"));
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 84 with ContentHandler

Use of org.xml.sax.ContentHandler in project tika by apache.

The class OOXMLParserTest, method testWordMissingOOXMLBeans.

//TIKA-792; with room for future missing bean tests
/**
 * Parses each listed Word document while capturing {@code System.err} and
 * asserts that nothing was written to it during the parse.
 *
 * <p>The original {@code System.err} is restored in a {@code finally}
 * block, and the document stream is closed by try-with-resources, so a
 * parse failure neither leaves stderr redirected for later tests nor leaks
 * the stream.
 *
 * @throws Exception if a test document cannot be opened or parsed
 */
@Test
public void testWordMissingOOXMLBeans() throws Exception {
    //If a bean is missing, POI prints stack trace to stderr
    String[] fileNames = new String[] { //TIKA-792
    "testWORD_missing_ooxml_bean1.docx" };
    PrintStream origErr = System.err;
    for (String fileName : fileNames) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        ByteArrayOutputStream errContent = new ByteArrayOutputStream();
        // Open the document before redirecting stderr, as the original did, so
        // only output produced by the parse itself is captured.
        try (InputStream input = getTestDocument(fileName)) {
            //grab stderr
            System.setErr(new PrintStream(errContent, true, UTF_8.name()));
            try {
                parser.parse(input, handler, metadata, context);
            } finally {
                //return stderr, even when parsing throws
                System.setErr(origErr);
            }
        }
        String err = errContent.toString(UTF_8.name());
        // Include the captured output so a failure shows what was printed.
        assertTrue("Unexpected stderr output while parsing " + fileName + ": " + err, err.isEmpty());
    }
}
Also used : PrintStream(java.io.PrintStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) ByteArrayOutputStream(java.io.ByteArrayOutputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 85 with ContentHandler

Use of org.xml.sax.ContentHandler in project tika by apache.

The class OOXMLParserTest, method testProtectedExcelFile.

/**
 * Parses a password-protected Excel document (see TIKA-437) with an empty
 * {@link ParseContext} and checks the reported content type, the
 * {@code PROTECTED} metadata flag, and that the extracted text contains
 * "Office".
 *
 * @throws Exception if the test document cannot be opened or parsed
 */
@Test
public void testProtectedExcelFile() throws Exception {
    Parser autoDetect = new AutoDetectParser();
    ContentHandler body = new BodyContentHandler();
    Metadata meta = new Metadata();
    ParseContext ctx = new ParseContext();
    try (InputStream xlsx = getTestDocument("protectedFile.xlsx")) {
        autoDetect.parse(xlsx, body, meta, ctx);

        String detectedType = meta.get(Metadata.CONTENT_TYPE);
        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", detectedType);

        String protectedFlag = meta.get(TikaMetadataKeys.PROTECTED);
        assertEquals("true", protectedFlag);

        assertContains("Office", body.toString());
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) EmptyParser(org.apache.tika.parser.EmptyParser) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Aggregations

ContentHandler (org.xml.sax.ContentHandler) 354; Metadata (org.apache.tika.metadata.Metadata) 229; BodyContentHandler (org.apache.tika.sax.BodyContentHandler) 229; InputStream (java.io.InputStream) 210; Test (org.junit.Test) 208; ParseContext (org.apache.tika.parser.ParseContext) 164; Parser (org.apache.tika.parser.Parser) 106; TikaTest (org.apache.tika.TikaTest) 103; AutoDetectParser (org.apache.tika.parser.AutoDetectParser) 102; TikaInputStream (org.apache.tika.io.TikaInputStream) 75; ByteArrayInputStream (java.io.ByteArrayInputStream) 64; SAXException (org.xml.sax.SAXException) 40; IOException (java.io.IOException) 34; TeeContentHandler (org.apache.tika.sax.TeeContentHandler) 28; TikaException (org.apache.tika.exception.TikaException) 24; ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest) 24; WordParserTest (org.apache.tika.parser.microsoft.WordParserTest) 24; XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler) 21; AttributesImpl (org.xml.sax.helpers.AttributesImpl) 21; InputSource (org.xml.sax.InputSource) 20