Search in sources :

Example 11 with PasswordProvider

use of org.apache.tika.parser.PasswordProvider in project tika by apache.

the class Seven7ParserTest method testPasswordProtected.

/**
 * Parses the same password-protected 7z archive three times: with the
 * context as it stands on entry (expects EncryptedDocumentException), with
 * a wrong password (expects TikaException and no extracted text), and with
 * the right password (expects the archive contents when strong crypto is
 * available, otherwise a TikaException).
 */
@Test
public void testPasswordProtected() throws Exception {
    final String protectedArchive = "/test-documents/test7Z_protected_passTika.7z";
    Parser autoDetect = new AutoDetectParser();
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    // --- Scenario 1: no password supplied, parsing must be refused ---
    boolean rejectedWithoutPassword = false;
    try (InputStream in = Seven7ParserTest.class.getResourceAsStream(protectedArchive)) {
        autoDetect.parse(in, handler, metadata, recursingContext);
        fail("Shouldn't be able to read a password protected 7z without the password");
    } catch (EncryptedDocumentException expected) {
        rejectedWithoutPassword = true;
    }
    assertTrue("test no password", rejectedWithoutPassword);

    // --- Scenario 2: wrong password, parsing must fail with a TikaException ---
    PasswordProvider wrongPassword = new PasswordProvider() {

        @Override
        public String getPassword(Metadata md) {
            return "wrong";
        }
    };
    recursingContext.set(PasswordProvider.class, wrongPassword);
    handler = new BodyContentHandler();
    boolean rejectedWrongPassword = false;
    try (InputStream in = Seven7ParserTest.class.getResourceAsStream(protectedArchive)) {
        autoDetect.parse(in, handler, metadata, recursingContext);
        fail("Shouldn't be able to read a password protected 7z with wrong password");
    } catch (TikaException expected) {
        // With JCE installed the cause is:
        //   org.tukaani.xz.CorruptedInputException: Compressed data is corrupt
        // Without JCE the message includes:
        //   "(do you have the JCE  Unlimited Strength Jurisdiction Policy Files installed?")
        rejectedWrongPassword = true;
    }
    assertTrue("TikaException for bad password", rejectedWrongPassword);
    // Nothing may have been extracted on the failed attempt
    assertEquals("", handler.toString());

    // --- Scenario 3: right password; outcome depends on JCE Unlimited Strength ---
    boolean strongCrypto = isStrongCryptoAvailable();
    PasswordProvider rightPassword = new PasswordProvider() {

        @Override
        public String getPassword(Metadata md) {
            return "Tika";
        }
    };
    recursingContext.set(PasswordProvider.class, rightPassword);
    handler = new BodyContentHandler();

    if (!strongCrypto) {
        // Without JCE, expect an IOException wrapped in a TikaException
        boolean failedWithoutJce = false;
        try (InputStream in = Seven7ParserTest.class.getResourceAsStream(protectedArchive)) {
            autoDetect.parse(in, handler, metadata, recursingContext);
        } catch (TikaException expected) {
            failedWithoutJce = true;
        }
        assertTrue("IOException because JCE was not installed", failedWithoutJce);
        return;
    }

    try (InputStream in = Seven7ParserTest.class.getResourceAsStream(protectedArchive)) {
        autoDetect.parse(in, handler, metadata, recursingContext);
    }
    assertEquals(TYPE_7ZIP.toString(), metadata.get(Metadata.CONTENT_TYPE));
    String extracted = handler.toString();
    // First the name of the embedded file, then the text held inside it
    String[] expectedFragments = {
        "text.txt",
        "TEST DATA FOR TIKA.",
        "This is text inside an encrypted 7zip (7z) file.",
        "It should be processed by Tika just fine!",
        "TIKA-1521"
    };
    for (String fragment : expectedFragments) {
        assertContains(fragment, extracted);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) TikaException(org.apache.tika.exception.TikaException) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) PasswordProvider(org.apache.tika.parser.PasswordProvider) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 12 with PasswordProvider

use of org.apache.tika.parser.PasswordProvider in project tika by apache.

the class PDFParserTest method testAccessCheckingUserPassword.

/**
 * Exercises PDF access checking when the supplied password is "user".
 * Documents whose names end in "owner_empty" are expected to reject that
 * password outright; documents ending in "owner_user" accept it, and the
 * outcome then depends on the configured AccessChecker.
 */
@Test
public void testAccessCheckingUserPassword() throws Exception {
    final String resourceDir = "/test-documents/";
    final String[] ownerEmptyPdfs = {
        "testPDF_no_extract_no_accessibility_owner_empty.pdf",
        "testPDF_no_extract_yes_accessibility_owner_empty.pdf"
    };
    final String[] ownerUserPdfs = {
        "testPDF_no_extract_no_accessibility_owner_user.pdf",
        "testPDF_no_extract_yes_accessibility_owner_user.pdf"
    };

    ParseContext context = new ParseContext();
    PDFParserConfig config = new PDFParserConfig();
    // Start strict: no extraction allowed, not even for accessibility
    config.setAccessChecker(new AccessChecker(false));
    PasswordProvider userPassword = new PasswordProvider() {

        @Override
        public String getPassword(Metadata md) {
            return "user";
        }
    };
    context.set(PasswordProvider.class, userPassword);
    context.set(PDFParserConfig.class, config);
    Parser parser = new AutoDetectParser();

    // "user" is the wrong password for these files
    for (String pdf : ownerEmptyPdfs) {
        assertException(resourceDir + pdf, parser, context, EncryptedDocumentException.class);
    }

    // Switching the access checker does not rescue a bad password
    config.setAccessChecker(new AccessChecker(true));
    for (String pdf : ownerEmptyPdfs) {
        assertException(resourceDir + pdf, parser, context, EncryptedDocumentException.class);
    }

    // Files that do open with "user": with AccessChecker(true) only the
    // "yes_accessibility" document is expected to yield content
    assertException(resourceDir + ownerUserPdfs[0], parser, context, AccessPermissionException.class);
    assertContains("Hello World", getXML(ownerUserPdfs[1], context).xml);

    // Back to the strict checker: both documents must now be refused
    config.setAccessChecker(new AccessChecker(false));
    for (String pdf : ownerUserPdfs) {
        assertException(resourceDir + pdf, parser, context, AccessPermissionException.class);
    }
}
Also used : ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) PasswordProvider(org.apache.tika.parser.PasswordProvider) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TesseractOCRParser(org.apache.tika.parser.ocr.TesseractOCRParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 13 with PasswordProvider

use of org.apache.tika.parser.PasswordProvider in project tika by apache.

the class SXSLFExtractorTest method testEncrypted.

/**
 * Checks that an encrypted PPTX yields its text when the password "tika"
 * is supplied through the parse context, and that parsing the same file
 * with an empty context raises EncryptedDocumentException.
 */
@Test
public void testEncrypted() throws Exception {
    // file name -> text expected once the file has been decrypted
    Map<String, String> expectedTextByFile = new HashMap<>();
    expectedTextByFile.put("testPPT_protected_passtika.pptx", "This is an encrypted PowerPoint 2007 slide.");

    Parser autoDetect = new AutoDetectParser();
    Metadata sharedMetadata = new Metadata();
    PasswordProvider tikaPassword = new PasswordProvider() {

        @Override
        public String getPassword(Metadata md) {
            return "tika";
        }
    };

    // Pass 1: password available, expected text must be extracted
    ParseContext withPassword = new ParseContext();
    withPassword.set(org.apache.tika.parser.PasswordProvider.class, tikaPassword);
    withPassword.set(OfficeParserConfig.class, officeParserConfig);
    for (Map.Entry<String, String> testCase : expectedTextByFile.entrySet()) {
        String resource = "/test-documents/" + testCase.getKey();
        try (InputStream in = getResourceAsStream(resource)) {
            ContentHandler body = new BodyContentHandler();
            autoDetect.parse(in, body, sharedMetadata, withPassword);
            assertContains(testCase.getValue(), body.toString());
        }
    }

    // Pass 2: empty context, every file must be reported as encrypted
    ParseContext withoutPassword = new ParseContext();
    for (Map.Entry<String, String> testCase : expectedTextByFile.entrySet()) {
        String resource = "/test-documents/" + testCase.getKey();
        boolean reportedEncrypted;
        try (InputStream in = getResourceAsStream(resource)) {
            ContentHandler body = new BodyContentHandler();
            autoDetect.parse(in, body, sharedMetadata, withoutPassword);
            reportedEncrypted = false;
        } catch (EncryptedDocumentException expected) {
            reportedEncrypted = true;
        }
        assertTrue(reportedEncrypted);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) HashMap(java.util.HashMap) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) PasswordProvider(org.apache.tika.parser.PasswordProvider) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) HashMap(java.util.HashMap) Map(java.util.Map) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 14 with PasswordProvider

use of org.apache.tika.parser.PasswordProvider in project tika by apache.

the class RFC822ParserTest method testEncryptedZipAttachment.

/**
 * Test TIKA-1028 - If the mail contains an encrypted attachment (or
 * an attachment that otherwise triggers an error), parsing should carry
 * on for the remainder regardless.
 *
 * The message is parsed twice: first without a password, then with the
 * password "test" supplied through the parse context. Each input stream
 * is closed as soon as its parse has finished.
 */
@Test
public void testEncryptedZipAttachment() throws Exception {
    Parser parser = new RFC822Parser();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    context.set(Parser.class, new AutoDetectParser());
    ContentHandler handler = new BodyContentHandler();
    // try-with-resources so the stream is released even if parsing throws
    try (InputStream stream = getStream("test-documents/testRFC822_encrypted_zip")) {
        parser.parse(stream, handler, metadata, context);
    }
    // Check we got the metadata
    assertEquals("Juha Haaga <juha.haaga@gmail.com>", metadata.get(Metadata.MESSAGE_FROM));
    assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE));
    // Check we got the message text, for both Plain Text and HTML
    String content = handler.toString();
    assertContains("Includes encrypted zip file", content);
    assertContains("password is \"test\".", content);
    assertContains("This is the Plain Text part", content);
    assertContains("This is the HTML part", content);
    // We won't get the contents of the zip file, but we will get the name
    assertContains("text.txt", content);
    assertNotContained("ENCRYPTED ZIP FILES", content);
    // Try again, this time with the password supplied
    // Check that we also get the zip's contents as well
    context.set(PasswordProvider.class, new PasswordProvider() {

        @Override
        public String getPassword(Metadata metadata) {
            return "test";
        }
    });
    handler = new BodyContentHandler();
    try (InputStream stream = getStream("test-documents/testRFC822_encrypted_zip")) {
        parser.parse(stream, handler, metadata, context);
    }
    content = handler.toString();
    assertContains("Includes encrypted zip file", content);
    assertContains("password is \"test\".", content);
    assertContains("This is the Plain Text part", content);
    assertContains("This is the HTML part", content);
    // We do get the name of the file in the encrypted zip file
    assertContains("text.txt", content);
    // TODO Upgrade to a version of Commons Compress with Encryption
    //  support, then verify we get the contents of the text file
    //  held within the encrypted zip
    // No Zip Encryption support yet: the failed assumption stops the test
    // here, so the assertions below do not run until it is removed
    assumeTrue(false);
    assertContains("TEST DATA FOR TIKA.", content);
    assertContains("ENCRYPTED ZIP FILES", content);
    assertContains("TIKA-1028", content);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) PasswordProvider(org.apache.tika.parser.PasswordProvider) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TesseractOCRParserTest(org.apache.tika.parser.ocr.TesseractOCRParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 15 with PasswordProvider

use of org.apache.tika.parser.PasswordProvider in project tika by apache.

the class JackcessParserTest method testPassword.

/**
 * Walks an encrypted Access database through five situations: correct
 * password, wrong password, null password, no PasswordProvider at all,
 * and finally a password supplied for a database that is not encrypted.
 * The three failure cases must raise EncryptedDocumentException; the
 * first and last cases must yield the expected text.
 */
@Test
public void testPassword() throws Exception {
    final String encryptedDb = "/test-documents/testAccess2_encrypted.accdb";
    final String plainDb = "/test-documents/testAccess2.accdb";

    // 1. Correct password: content is readable
    PasswordProvider correctPassword = new PasswordProvider() {

        @Override
        public String getPassword(Metadata md) {
            return "tika";
        }
    };
    ParseContext context = new ParseContext();
    context.set(PasswordProvider.class, correctPassword);
    Parser autoDetect = new AutoDetectParser();
    String text = null;
    try (InputStream in = getResourceAsStream(encryptedDb)) {
        text = getText(in, autoDetect, context);
    }
    assertContains("red and brown", text);

    // 2. Wrong password
    PasswordProvider wrongPassword = new PasswordProvider() {

        @Override
        public String getPassword(Metadata md) {
            return "WRONG";
        }
    };
    context.set(PasswordProvider.class, wrongPassword);
    boolean wrongPasswordRejected = false;
    try (InputStream in = getResourceAsStream(encryptedDb)) {
        getText(in, autoDetect, context);
    } catch (EncryptedDocumentException expected) {
        wrongPasswordRejected = true;
    }
    assertTrue("failed to throw encrypted document exception for wrong password", wrongPasswordRejected);

    // 3. Provider that hands back null instead of a password
    PasswordProvider nullPassword = new PasswordProvider() {

        @Override
        public String getPassword(Metadata md) {
            return null;
        }
    };
    context.set(PasswordProvider.class, nullPassword);
    boolean nullPasswordRejected = false;
    try (InputStream in = getResourceAsStream(encryptedDb)) {
        getText(in, autoDetect, context);
    } catch (EncryptedDocumentException expected) {
        nullPasswordRejected = true;
    }
    assertTrue("failed to throw encrypted document exception for null password", nullPasswordRejected);

    // 4. Fresh context with no PasswordProvider registered
    context = new ParseContext();
    boolean missingProviderRejected = false;
    try (InputStream in = getResourceAsStream(encryptedDb)) {
        getText(in, autoDetect, context);
    } catch (EncryptedDocumentException expected) {
        missingProviderRejected = true;
    }
    assertTrue("failed to throw encrypted document exception for missing password provider", missingProviderRejected);

    // 5. Password supplied for a database that does not need one
    context = new ParseContext();
    context.set(PasswordProvider.class, new PasswordProvider() {

        @Override
        public String getPassword(Metadata md) {
            return "tika";
        }
    });
    boolean wronglyReportedEncrypted = false;
    try (InputStream in = getResourceAsStream(plainDb)) {
        text = getText(in, autoDetect, context);
    } catch (EncryptedDocumentException unexpected) {
        wronglyReportedEncrypted = true;
    }
    assertFalse("shouldn't have thrown encrypted document exception for " + "opening unencrypted file that doesn't need passowrd", wronglyReportedEncrypted);
    assertContains("red and brown", text);
}
Also used : EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) InputStream(java.io.InputStream) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) PasswordProvider(org.apache.tika.parser.PasswordProvider) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

PasswordProvider (org.apache.tika.parser.PasswordProvider)16 Metadata (org.apache.tika.metadata.Metadata)12 Test (org.junit.Test)11 TikaTest (org.apache.tika.TikaTest)10 EncryptedDocumentException (org.apache.tika.exception.EncryptedDocumentException)10 ParseContext (org.apache.tika.parser.ParseContext)10 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)9 Parser (org.apache.tika.parser.Parser)9 InputStream (java.io.InputStream)8 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)7 ContentHandler (org.xml.sax.ContentHandler)6 TikaInputStream (org.apache.tika.io.TikaInputStream)5 HashMap (java.util.HashMap)3 Map (java.util.Map)3 CompositeParser (org.apache.tika.parser.CompositeParser)3 TesseractOCRParser (org.apache.tika.parser.ocr.TesseractOCRParser)3 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)3 IOException (java.io.IOException)2 TikaException (org.apache.tika.exception.TikaException)2 CryptCodecProvider (com.healthmarketscience.jackcess.CryptCodecProvider)1