Search in sources :

Example 6 with EncryptedDocumentException

use of org.apache.tika.exception.EncryptedDocumentException in project tika by apache.

the class ParsingEmbeddedDocumentExtractor method parseEmbedded.

/**
 * Parses one embedded document with {@code DELEGATING_PARSER} and forwards
 * its body content to {@code handler}.
 *
 * <p>When {@code outputHtml} is set, the entry is wrapped in a
 * {@code <div class="package-entry">} and, if the metadata carries a
 * non-empty resource name, preceded by an {@code <h1>} holding that name.
 * Parse failures reported as {@code TikaException} (including
 * {@code EncryptedDocumentException}) are skipped silently so that one bad
 * entry does not abort the surrounding document.
 *
 * @param stream     the embedded document; it is shielded from being closed here
 * @param handler    receiver of the generated SAX events
 * @param metadata   metadata of the embedded document, passed on to the parser
 * @param outputHtml whether to emit the wrapping div and the name heading
 * @throws SAXException if the handler rejects an event
 * @throws IOException  if the stream cannot be read
 */
public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException {
    if (outputHtml) {
        AttributesImpl divAttributes = new AttributesImpl();
        divAttributes.addAttribute("", "class", "class", "CDATA", "package-entry");
        handler.startElement(XHTML, "div", "div", divAttributes);
    }

    String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
    if (outputHtml && resourceName != null && !resourceName.isEmpty()) {
        handler.startElement(XHTML, "h1", "h1", new AttributesImpl());
        handler.characters(resourceName.toCharArray(), 0, resourceName.length());
        handler.endElement(XHTML, "h1", "h1");
    }

    // Hand the entry over to the delegate parser
    try (TemporaryResources tmp = new TemporaryResources()) {
        // The shield keeps the caller's stream open when the wrapper is closed
        final TikaInputStream shielded = TikaInputStream.get(new CloseShieldInputStream(stream), tmp);
        if (stream instanceof TikaInputStream) {
            // Carry an already-opened container over to the new wrapper
            final Object openContainer = ((TikaInputStream) stream).getOpenContainer();
            if (openContainer != null) {
                shielded.setOpenContainer(openContainer);
            }
        }
        ContentHandler embeddedHandler = new EmbeddedContentHandler(new BodyContentHandler(handler));
        DELEGATING_PARSER.parse(shielded, embeddedHandler, metadata, context);
    } catch (EncryptedDocumentException ignoredEncrypted) {
        // TODO: can we log a warning that we lack the password?
        // For now, just skip the content
    } catch (TikaException ignoredParseFailure) {
        // TODO: can we log a warning somehow?
        // Could not parse the entry, just skip the content
    }

    if (outputHtml) {
        handler.endElement(XHTML, "div", "div");
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) AttributesImpl(org.xml.sax.helpers.AttributesImpl) EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) TikaException(org.apache.tika.exception.TikaException) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) CloseShieldInputStream(org.apache.tika.io.CloseShieldInputStream)

Example 7 with EncryptedDocumentException

use of org.apache.tika.exception.EncryptedDocumentException in project tika by apache.

the class JackcessParserTest method testPassword.

/**
 * Verifies password handling for Access databases: the correct password
 * yields the expected text, while a wrong password, a {@code null}
 * password and a missing {@link PasswordProvider} each raise
 * {@link EncryptedDocumentException}. Supplying a password for an
 * unencrypted file must not raise that exception.
 */
@Test
public void testPassword() throws Exception {
    final String encryptedDb = "/test-documents/testAccess2_encrypted.accdb";
    final String plainDb = "/test-documents/testAccess2.accdb";

    ParseContext c = new ParseContext();
    c.set(PasswordProvider.class, fixedPasswordProvider("tika"));
    Parser p = new AutoDetectParser();
    String content = null;
    try (InputStream is = this.getResourceAsStream(encryptedDb)) {
        content = getText(is, p, c);
    }
    assertContains("red and brown", content);

    //now try wrong password
    c.set(PasswordProvider.class, fixedPasswordProvider("WRONG"));
    assertTrue("failed to throw encrypted document exception for wrong password",
            parseFailsAsEncrypted(encryptedDb, p, c));

    //now try null
    c.set(PasswordProvider.class, fixedPasswordProvider(null));
    assertTrue("failed to throw encrypted document exception for null password",
            parseFailsAsEncrypted(encryptedDb, p, c));

    //now try missing password provider
    c = new ParseContext();
    assertTrue("failed to throw encrypted document exception for missing password provider",
            parseFailsAsEncrypted(encryptedDb, p, c));

    //now try password on file that doesn't need a password
    c = new ParseContext();
    c.set(PasswordProvider.class, fixedPasswordProvider("tika"));
    boolean ex = false;
    try (InputStream is = this.getResourceAsStream(plainDb)) {
        content = getText(is, p, c);
    } catch (EncryptedDocumentException e) {
        ex = true;
    }
    assertFalse("shouldn't have thrown encrypted document exception for "
            + "opening unencrypted file that doesn't need password", ex);
    assertContains("red and brown", content);
}

/** Builds a provider that answers every request with the given password (may be {@code null}). */
private static PasswordProvider fixedPasswordProvider(final String password) {
    return new PasswordProvider() {

        @Override
        public String getPassword(Metadata metadata) {
            return password;
        }
    };
}

/** Parses the named test resource and reports whether an {@link EncryptedDocumentException} was raised. */
private boolean parseFailsAsEncrypted(String resource, Parser parser, ParseContext context) throws Exception {
    try (InputStream is = this.getResourceAsStream(resource)) {
        getText(is, parser, context);
        return false;
    } catch (EncryptedDocumentException e) {
        return true;
    }
}
Also used : EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) InputStream(java.io.InputStream) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) PasswordProvider(org.apache.tika.parser.PasswordProvider) Parser(org.apache.tika.parser.Parser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 8 with EncryptedDocumentException

use of org.apache.tika.exception.EncryptedDocumentException in project tika by apache.

the class OfficeParser method parse.

/**
 * Extracts metadata and text from an OLE2 document rooted at {@code root}.
 *
 * <p>The summary streams are parsed first so that metadata is available
 * early; the detected {@code POIFSDocumentType} then selects the extractor
 * used for the content. For encrypted documents the password comes from the
 * {@link PasswordProvider} in {@code context} when it supplies a non-null
 * value, otherwise {@code Decryptor.DEFAULT_PASSWORD} is tried; the
 * decrypted stream is then handed to {@link OOXMLParser}.
 *
 * @param root     root directory of the OLE2 file system
 * @param context  parse context; consulted for {@code Locale} and {@code PasswordProvider}
 * @param metadata metadata object to populate
 * @param xhtml    handler that receives the extracted content
 * @throws IOException   if the document cannot be read
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if parsing fails; an {@link EncryptedDocumentException}
 *                       is thrown when the password does not verify or
 *                       decryption raises a {@code GeneralSecurityException}
 */
protected void parse(DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    // Parse summary entries first, to make metadata available early
    new SummaryExtractor(metadata).parseSummaries(root);
    // Parse remaining document entries
    POIFSDocumentType type = POIFSDocumentType.detectType(root);
    if (type != POIFSDocumentType.UNKNOWN) {
        setType(metadata, type.getType());
    }
    switch(type) {
        case SOLIDWORKS_PART:
        case SOLIDWORKS_ASSEMBLY:
        case SOLIDWORKS_DRAWING:
            break;
        case PUBLISHER:
            PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(root);
            xhtml.element("p", publisherTextExtractor.getText());
            break;
        case WORDDOCUMENT:
            new WordExtractor(context, metadata).parse(root, xhtml);
            break;
        case POWERPOINT:
            new HSLFExtractor(context, metadata).parse(root, xhtml);
            break;
        case WORKBOOK:
        case XLR:
            Locale locale = context.get(Locale.class, Locale.getDefault());
            new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
            break;
        case PROJECT:
            // We currently can't do anything beyond the metadata
            break;
        case VISIO:
            VisioTextExtractor visioTextExtractor = new VisioTextExtractor(root);
            for (String text : visioTextExtractor.getAllText()) {
                xhtml.element("p", text);
            }
            break;
        case OUTLOOK:
            OutlookExtractor extractor = new OutlookExtractor(root, context);
            extractor.parse(xhtml, metadata);
            break;
        case ENCRYPTED:
            EncryptionInfo info = new EncryptionInfo(root);
            Decryptor d = Decryptor.getInstance(info);
            try {
                // By default, use the default Office Password
                String password = Decryptor.DEFAULT_PASSWORD;
                // If they supplied a Password Provider, ask that for the password,
                //  and use the provider given one if available (stick with default if not)
                PasswordProvider passwordProvider = context.get(PasswordProvider.class);
                if (passwordProvider != null) {
                    String suppliedPassword = passwordProvider.getPassword(metadata);
                    if (suppliedPassword != null) {
                        password = suppliedPassword;
                    }
                }
                // Check if we've the right password or not
                if (!d.verifyPassword(password)) {
                    throw new EncryptedDocumentException();
                }
                // Decrypt the OLE2 stream, and delegate the resulting OOXML
                //  file to the regular OOXML parser for normal handling.
                // This method opened the decrypted stream, so it is also
                //  responsible for closing it once parsing is done.
                OOXMLParser parser = new OOXMLParser();
                try (java.io.InputStream decrypted = d.getDataStream(root)) {
                    parser.parse(decrypted, new EmbeddedContentHandler(new BodyContentHandler(xhtml)), metadata, context);
                }
            } catch (GeneralSecurityException ex) {
                throw new EncryptedDocumentException(ex);
            }
            // Explicit break: do not rely on falling through into default
            break;
        default:
            // For unsupported / unhandled types, just the metadata
            //  is extracted, which happened above
            break;
    }
}
Also used : Locale(java.util.Locale) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Decryptor(org.apache.poi.poifs.crypt.Decryptor) EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) EncryptionInfo(org.apache.poi.poifs.crypt.EncryptionInfo) GeneralSecurityException(java.security.GeneralSecurityException) PublisherTextExtractor(org.apache.poi.hpbf.extractor.PublisherTextExtractor) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) PasswordProvider(org.apache.tika.parser.PasswordProvider) OOXMLParser(org.apache.tika.parser.microsoft.ooxml.OOXMLParser) VisioTextExtractor(org.apache.poi.hdgf.extractor.VisioTextExtractor)

Example 9 with EncryptedDocumentException

use of org.apache.tika.exception.EncryptedDocumentException in project tika by apache.

the class WordExtractor method parse.

/**
 * Extracts the content of a Word document stored under {@code root}.
 *
 * <p>Output order: headers, main paragraphs, text boxes (only when shape
 * based content is enabled in the configuration), footnotes, comments,
 * endnotes, footers, pictures not yet emitted, and finally directories
 * under the "ObjectPool" entry whose names start with "_".
 * Documents that POI reports as an old Word format are handed to
 * {@code parseWord6} instead.
 *
 * @param root  directory node holding the Word document
 * @param xhtml handler that receives the extracted content
 * @throws IOException   if the document cannot be read
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if parsing fails; an {@link EncryptedDocumentException}
 *                       wraps POI's encrypted-document exception
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    HWPFDocument document;
    try {
        document = new HWPFDocument(root);
    } catch (org.apache.poi.EncryptedDocumentException poiEncrypted) {
        // Translate POI's exception into Tika's own type
        throw new EncryptedDocumentException(poiEncrypted);
    } catch (OldWordFileFormatException oldFormat) {
        parseWord6(root, xhtml);
        return;
    }

    extractSavedByMetadata(document);
    org.apache.poi.hwpf.extractor.WordExtractor textExtractor = new org.apache.poi.hwpf.extractor.WordExtractor(document);
    HeaderStories stories = new HeaderStories(document);

    // Grab the list of pictures. As far as we can tell,
    //  the pictures should be in order, and may be directly
    //  placed or referenced from an anchor
    PicturesTable pictureTable = document.getPicturesTable();
    PicturesSource pictures = new PicturesSource(document);

    // Headers first, if present
    Range[] headerRanges = { stories.getFirstHeaderSubrange(), stories.getEvenHeaderSubrange(), stories.getOddHeaderSubrange() };
    handleHeaderFooter(headerRanges, "header", document, pictures, pictureTable, xhtml);

    // Main paragraph text; handleParagraph returns an extra amount to
    //  advance by, on top of the usual step of one
    Range body = document.getRange();
    ListManager listManager = new ListManager(document);
    int index = 0;
    while (index < body.numParagraphs()) {
        Paragraph paragraph = body.getParagraph(index);
        index += handleParagraph(paragraph, 0, body, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager, xhtml);
        index++;
    }

    if (officeParserConfig.getIncludeShapeBasedContent()) {
        for (String textbox : textExtractor.getMainTextboxText()) {
            xhtml.element("p", textbox);
        }
    }
    for (String footnote : textExtractor.getFootnoteText()) {
        xhtml.element("p", footnote);
    }
    for (String comment : textExtractor.getCommentsText()) {
        xhtml.element("p", comment);
    }
    for (String endnote : textExtractor.getEndnoteText()) {
        xhtml.element("p", endnote);
    }

    // Footers, if present
    Range[] footerRanges = { stories.getFirstFooterSubrange(), stories.getEvenFooterSubrange(), stories.getOddFooterSubrange() };
    handleHeaderFooter(footerRanges, "footer", document, pictures, pictureTable, xhtml);

    // Emit any pictures that have not been output yet
    Picture unclaimed = pictures.nextUnclaimed();
    while (unclaimed != null) {
        handlePictureCharacterRun(null, unclaimed, pictures, xhtml);
        unclaimed = pictures.nextUnclaimed();
    }

    // Handle any embedded office documents
    try {
        DirectoryEntry objectPool = (DirectoryEntry) root.getEntry("ObjectPool");
        for (Entry child : objectPool) {
            boolean embeddedDirectory = child.getName().startsWith("_") && child instanceof DirectoryEntry;
            if (!embeddedDirectory) {
                continue;
            }
            handleEmbeddedOfficeDoc((DirectoryEntry) child, xhtml);
        }
    } catch (FileNotFoundException ignored) {
        // Deliberately ignored: presumably raised when the document has no
        //  "ObjectPool" entry, in which case there is nothing more to extract
    }
}
Also used : EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) FileNotFoundException(java.io.FileNotFoundException) PicturesTable(org.apache.poi.hwpf.model.PicturesTable) Range(org.apache.poi.hwpf.usermodel.Range) DirectoryEntry(org.apache.poi.poifs.filesystem.DirectoryEntry) Paragraph(org.apache.poi.hwpf.usermodel.Paragraph) HWPFDocument(org.apache.poi.hwpf.HWPFDocument) HeaderStories(org.apache.poi.hwpf.usermodel.HeaderStories) Entry(org.apache.poi.poifs.filesystem.Entry) SavedByEntry(org.apache.poi.hwpf.model.SavedByEntry) Picture(org.apache.poi.hwpf.usermodel.Picture) OldWordFileFormatException(org.apache.poi.hwpf.OldWordFileFormatException)

Example 10 with EncryptedDocumentException

use of org.apache.tika.exception.EncryptedDocumentException in project tika by apache.

the class CryptoParser method parse.

/**
 * Decrypts {@code stream} and passes the plaintext to the superclass parser.
 *
 * <p>The cipher is created from the configured transformation (and the
 * provider, when one is set). The decryption {@link Key} is required and is
 * read from {@code context}; {@link AlgorithmParameters} and
 * {@link SecureRandom} are optional and are used for cipher initialisation
 * when present in the context.
 *
 * @param stream   the encrypted document stream
 * @param handler  receiver of the SAX events produced by the superclass parser
 * @param metadata document metadata, passed through unchanged
 * @param context  parse context carrying the key and optional cipher settings
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if the cipher cannot be created or initialised; an
 *                       {@link EncryptedDocumentException} if no key is supplied
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    try {
        // Cipher creation comes first so that a bad transformation is
        //  reported before a missing key
        Cipher cipher = (provider != null)
                ? Cipher.getInstance(transformation, provider)
                : Cipher.getInstance(transformation);

        Key key = context.get(Key.class);
        if (key == null) {
            throw new EncryptedDocumentException("No decryption key provided");
        }

        // Pick the init overload matching whichever optional settings exist
        AlgorithmParameters params = context.get(AlgorithmParameters.class);
        SecureRandom random = context.get(SecureRandom.class);
        if (params == null) {
            if (random == null) {
                cipher.init(Cipher.DECRYPT_MODE, key);
            } else {
                cipher.init(Cipher.DECRYPT_MODE, key, random);
            }
        } else if (random == null) {
            cipher.init(Cipher.DECRYPT_MODE, key, params);
        } else {
            cipher.init(Cipher.DECRYPT_MODE, key, params, random);
        }

        InputStream decrypted = new CipherInputStream(stream, cipher);
        super.parse(decrypted, handler, metadata, context);
    } catch (GeneralSecurityException e) {
        throw new TikaException("Unable to decrypt document stream", e);
    }
}
Also used : EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) TikaException(org.apache.tika.exception.TikaException) CipherInputStream(javax.crypto.CipherInputStream) GeneralSecurityException(java.security.GeneralSecurityException) SecureRandom(java.security.SecureRandom) Cipher(javax.crypto.Cipher) Key(java.security.Key) AlgorithmParameters(java.security.AlgorithmParameters)

Aggregations

EncryptedDocumentException (org.apache.tika.exception.EncryptedDocumentException)16 PasswordProvider (org.apache.tika.parser.PasswordProvider)10 Metadata (org.apache.tika.metadata.Metadata)9 TikaInputStream (org.apache.tika.io.TikaInputStream)8 InputStream (java.io.InputStream)7 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)7 Test (org.junit.Test)7 TikaTest (org.apache.tika.TikaTest)6 TikaException (org.apache.tika.exception.TikaException)6 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)6 ParseContext (org.apache.tika.parser.ParseContext)6 Parser (org.apache.tika.parser.Parser)6 ContentHandler (org.xml.sax.ContentHandler)5 TemporaryResources (org.apache.tika.io.TemporaryResources)4 HashMap (java.util.HashMap)3 Map (java.util.Map)3 GeneralSecurityException (java.security.GeneralSecurityException)2 ZipArchiveEntry (org.apache.commons.compress.archivers.zip.ZipArchiveEntry)2 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)2 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)2