use of org.apache.tika.exception.EncryptedDocumentException in project tika by apache.
the class ParsingEmbeddedDocumentExtractor method parseEmbedded.
/**
 * Parses one embedded entry with the delegating parser, forwarding its body
 * content to the supplied handler.
 * <p>
 * When {@code outputHtml} is set, the entry is wrapped in a
 * {@code <div class="package-entry">} element and, if the metadata carries a
 * non-empty resource name, that name is emitted as an {@code <h1>} heading.
 * <p>
 * An {@link EncryptedDocumentException} or any other {@link TikaException}
 * raised by the delegate is caught and the entry's content is skipped.
 *
 * @param stream     stream positioned at the embedded entry
 * @param handler    receiver of the generated SAX events
 * @param metadata   metadata of the embedded entry; read for the resource name
 *                   and passed on to the delegate parser
 * @param outputHtml whether to emit the wrapping div and heading
 * @throws SAXException if the handler or the delegate parser reports a SAX failure
 * @throws IOException  if reading the stream or releasing temporary resources fails
 */
public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException {
    if (outputHtml) {
        AttributesImpl divAttributes = new AttributesImpl();
        divAttributes.addAttribute("", "class", "class", "CDATA", "package-entry");
        handler.startElement(XHTML, "div", "div", divAttributes);
    }

    String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
    if (outputHtml && resourceName != null && !resourceName.isEmpty()) {
        char[] title = resourceName.toCharArray();
        handler.startElement(XHTML, "h1", "h1", new AttributesImpl());
        handler.characters(title, 0, title.length);
        handler.endElement(XHTML, "h1", "h1");
    }

    // Hand the entry over to the delegate parser. The incoming stream is wrapped in a
    // CloseShieldInputStream (presumably so the delegate cannot close the caller's
    // stream -- confirm against that class).
    try (TemporaryResources tmp = new TemporaryResources()) {
        final TikaInputStream entryStream = TikaInputStream.get(new CloseShieldInputStream(stream), tmp);
        if (stream instanceof TikaInputStream) {
            // Carry over any already-open container from the original stream
            final Object openContainer = ((TikaInputStream) stream).getOpenContainer();
            if (openContainer != null) {
                entryStream.setOpenContainer(openContainer);
            }
        }
        ContentHandler bodyOnly = new EmbeddedContentHandler(new BodyContentHandler(handler));
        DELEGATING_PARSER.parse(entryStream, bodyOnly, metadata, context);
    } catch (EncryptedDocumentException ignored) {
        // TODO: can we log a warning that we lack the password?
        // Deliberately skipped: the entry's content is simply omitted
    } catch (TikaException ignored) {
        // TODO: can we log a warning somehow?
        // Deliberately skipped: the entry could not be parsed, so its content is omitted
    }

    if (outputHtml) {
        handler.endElement(XHTML, "div", "div");
    }
}
use of org.apache.tika.exception.EncryptedDocumentException in project tika by apache.
the class JackcessParserTest method testPassword.
/**
 * Checks password handling when parsing Access databases.
 * <ul>
 *   <li>the password "tika" on the encrypted file yields the expected text;</li>
 *   <li>a wrong password, a {@code null} password and a missing
 *       {@link PasswordProvider} must each raise
 *       {@link EncryptedDocumentException};</li>
 *   <li>supplying a password for the unencrypted file must not raise
 *       {@link EncryptedDocumentException} and must still yield the text.</li>
 * </ul>
 */
@Test
public void testPassword() throws Exception {
    final String encryptedDb = "/test-documents/testAccess2_encrypted.accdb";
    final String plainDb = "/test-documents/testAccess2.accdb";

    ParseContext c = new ParseContext();
    c.set(PasswordProvider.class, new PasswordProvider() {
        @Override
        public String getPassword(Metadata metadata) {
            return "tika";
        }
    });
    Parser p = new AutoDetectParser();
    String content = null;
    try (InputStream is = this.getResourceAsStream(encryptedDb)) {
        content = getText(is, p, c);
    }
    assertContains("red and brown", content);

    // now try wrong password
    c.set(PasswordProvider.class, new PasswordProvider() {
        @Override
        public String getPassword(Metadata metadata) {
            return "WRONG";
        }
    });
    boolean ex = false;
    try (InputStream is = this.getResourceAsStream(encryptedDb)) {
        getText(is, p, c);
    } catch (EncryptedDocumentException e) {
        ex = true;
    }
    assertTrue("failed to throw encrypted document exception for wrong password", ex);

    // now try null
    c.set(PasswordProvider.class, new PasswordProvider() {
        @Override
        public String getPassword(Metadata metadata) {
            return null;
        }
    });
    ex = false;
    try (InputStream is = this.getResourceAsStream(encryptedDb)) {
        getText(is, p, c);
    } catch (EncryptedDocumentException e) {
        ex = true;
    }
    assertTrue("failed to throw encrypted document exception for null password", ex);

    // now try missing password provider
    c = new ParseContext();
    ex = false;
    try (InputStream is = this.getResourceAsStream(encryptedDb)) {
        getText(is, p, c);
    } catch (EncryptedDocumentException e) {
        ex = true;
    }
    assertTrue("failed to throw encrypted document exception for missing password provider", ex);

    // now try password on file that doesn't need a password
    c = new ParseContext();
    c.set(PasswordProvider.class, new PasswordProvider() {
        @Override
        public String getPassword(Metadata metadata) {
            return "tika";
        }
    });
    ex = false;
    try (InputStream is = this.getResourceAsStream(plainDb)) {
        content = getText(is, p, c);
    } catch (EncryptedDocumentException e) {
        ex = true;
    }
    // Message typo ("passowrd") corrected so a failure report reads cleanly
    assertFalse("shouldn't have thrown encrypted document exception for " + "opening unencrypted file that doesn't need password", ex);
    assertContains("red and brown", content);
}
use of org.apache.tika.exception.EncryptedDocumentException in project tika by apache.
the class OfficeParser method parse.
/**
 * Parses an OLE2 document rooted at the given directory node.
 * <p>
 * Summary information is extracted into the metadata first. The document type
 * is then detected from the root node and, unless it is
 * {@code POIFSDocumentType.UNKNOWN}, recorded via {@code setType}. Content
 * extraction is dispatched on the detected type; types without a dedicated
 * branch contribute metadata only.
 * <p>
 * For {@code ENCRYPTED} documents the password is
 * {@code Decryptor.DEFAULT_PASSWORD} unless a {@link PasswordProvider} in the
 * context returns a non-null password. The decrypted stream is delegated to
 * {@link OOXMLParser}.
 *
 * @param root     root directory node of the OLE2 container
 * @param context  parse context; consulted for {@link Locale} and {@link PasswordProvider}
 * @param metadata metadata to populate and to pass on to the extractors
 * @param xhtml    handler receiving the extracted content
 * @throws IOException   if reading the container fails
 * @throws SAXException  if the content handler reports a failure
 * @throws TikaException if parsing fails; an {@link EncryptedDocumentException} is
 *                       thrown when password verification fails or decryption
 *                       raises a {@link GeneralSecurityException}
 */
protected void parse(DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    // Parse summary entries first, to make metadata available early
    new SummaryExtractor(metadata).parseSummaries(root);
    // Parse remaining document entries
    POIFSDocumentType type = POIFSDocumentType.detectType(root);
    if (type != POIFSDocumentType.UNKNOWN) {
        setType(metadata, type.getType());
    }
    switch(type) {
        case SOLIDWORKS_PART:
        case SOLIDWORKS_ASSEMBLY:
        case SOLIDWORKS_DRAWING:
            // No content extraction for these types; metadata only
            break;
        case PUBLISHER:
            PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(root);
            xhtml.element("p", publisherTextExtractor.getText());
            break;
        case WORDDOCUMENT:
            new WordExtractor(context, metadata).parse(root, xhtml);
            break;
        case POWERPOINT:
            new HSLFExtractor(context, metadata).parse(root, xhtml);
            break;
        case WORKBOOK:
        case XLR:
            // Use the caller-supplied locale if present, else the JVM default
            Locale locale = context.get(Locale.class, Locale.getDefault());
            new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
            break;
        case PROJECT:
            // We currently can't do anything beyond the metadata
            break;
        case VISIO:
            VisioTextExtractor visioTextExtractor = new VisioTextExtractor(root);
            for (String text : visioTextExtractor.getAllText()) {
                xhtml.element("p", text);
            }
            break;
        case OUTLOOK:
            OutlookExtractor extractor = new OutlookExtractor(root, context);
            extractor.parse(xhtml, metadata);
            break;
        case ENCRYPTED:
            EncryptionInfo info = new EncryptionInfo(root);
            Decryptor d = Decryptor.getInstance(info);
            try {
                // By default, use the default Office Password
                String password = Decryptor.DEFAULT_PASSWORD;
                // If they supplied a Password Provider, ask that for the password,
                // and use the provider given one if available (stick with default if not)
                PasswordProvider passwordProvider = context.get(PasswordProvider.class);
                if (passwordProvider != null) {
                    String suppliedPassword = passwordProvider.getPassword(metadata);
                    if (suppliedPassword != null) {
                        password = suppliedPassword;
                    }
                }
                // Check if we've the right password or not
                if (!d.verifyPassword(password)) {
                    throw new EncryptedDocumentException();
                }
                // Decrypt the OLE2 stream, and delegate the resulting OOXML
                // file to the regular OOXML parser for normal handling
                OOXMLParser parser = new OOXMLParser();
                parser.parse(d.getDataStream(root), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), metadata, context);
            } catch (GeneralSecurityException ex) {
                throw new EncryptedDocumentException(ex);
            }
            // Explicit break: do not rely on falling through into the default branch
            break;
        default:
            // For unsupported / unhandled types, just the metadata
            // is extracted, which happened above
            break;
    }
}
use of org.apache.tika.exception.EncryptedDocumentException in project tika by apache.
the class WordExtractor method parse.
/**
 * Extracts the content of a Word document stored under the given directory node.
 * <p>
 * Output order: headers, main-body paragraphs, text-box text (only when
 * {@code officeParserConfig.getIncludeShapeBasedContent()} is true), footnotes,
 * comments, endnotes, footers, any pictures not yet emitted, and finally the
 * embedded office documents found under the "ObjectPool" entry.
 * <p>
 * If the document is in the old Word format, parsing is handed to
 * {@code parseWord6} and this method returns immediately.
 *
 * @param root  directory node holding the Word document
 * @param xhtml handler receiving the extracted content
 * @throws IOException   if reading the document fails
 * @throws SAXException  if the content handler reports a failure
 * @throws TikaException if parsing fails; an {@link EncryptedDocumentException}
 *                       wraps POI's encrypted-document exception
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    HWPFDocument document;
    try {
        document = new HWPFDocument(root);
    } catch (org.apache.poi.EncryptedDocumentException encrypted) {
        // Translate POI's exception into Tika's own type
        throw new EncryptedDocumentException(encrypted);
    } catch (OldWordFileFormatException oldFormat) {
        // Old-format files take a separate code path
        parseWord6(root, xhtml);
        return;
    }

    extractSavedByMetadata(document);
    org.apache.poi.hwpf.extractor.WordExtractor wordExtractor = new org.apache.poi.hwpf.extractor.WordExtractor(document);
    HeaderStories headerFooter = new HeaderStories(document);

    // Collect the pictures up front; they are expected to be in order and may be
    // placed directly or referenced from an anchor
    PicturesTable pictureTable = document.getPicturesTable();
    PicturesSource pictures = new PicturesSource(document);

    // Headers, if any
    Range firstHeader = headerFooter.getFirstHeaderSubrange();
    Range evenHeader = headerFooter.getEvenHeaderSubrange();
    Range oddHeader = headerFooter.getOddHeaderSubrange();
    handleHeaderFooter(new Range[] { firstHeader, evenHeader, oddHeader }, "header", document, pictures, pictureTable, xhtml);

    // Main paragraph text. handleParagraph returns an extra amount to advance the
    // index by, on top of the usual step of one.
    Range body = document.getRange();
    ListManager listManager = new ListManager(document);
    int index = 0;
    while (index < body.numParagraphs()) {
        Paragraph paragraph = body.getParagraph(index);
        int extra = handleParagraph(paragraph, 0, body, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager, xhtml);
        index += extra + 1;
    }

    // Text-box content is optional
    if (officeParserConfig.getIncludeShapeBasedContent()) {
        for (String textbox : wordExtractor.getMainTextboxText()) {
            xhtml.element("p", textbox);
        }
    }
    for (String footnote : wordExtractor.getFootnoteText()) {
        xhtml.element("p", footnote);
    }
    for (String comment : wordExtractor.getCommentsText()) {
        xhtml.element("p", comment);
    }
    for (String endnote : wordExtractor.getEndnoteText()) {
        xhtml.element("p", endnote);
    }

    // Footers, if any
    Range firstFooter = headerFooter.getFirstFooterSubrange();
    Range evenFooter = headerFooter.getEvenFooterSubrange();
    Range oddFooter = headerFooter.getOddFooterSubrange();
    handleHeaderFooter(new Range[] { firstFooter, evenFooter, oddFooter }, "footer", document, pictures, pictureTable, xhtml);

    // Emit every picture that has not been output yet
    Picture unclaimed = pictures.nextUnclaimed();
    while (unclaimed != null) {
        handlePictureCharacterRun(null, unclaimed, pictures, xhtml);
        unclaimed = pictures.nextUnclaimed();
    }

    // Embedded office documents live in directory entries under "ObjectPool"
    // whose names start with an underscore
    try {
        DirectoryEntry objectPool = (DirectoryEntry) root.getEntry("ObjectPool");
        for (Entry child : objectPool) {
            if (!child.getName().startsWith("_") || !(child instanceof DirectoryEntry)) {
                continue;
            }
            handleEmbeddedOfficeDoc((DirectoryEntry) child, xhtml);
        }
    } catch (FileNotFoundException ignored) {
        // Deliberately ignored: nothing further is emitted in this case
    }
}
use of org.apache.tika.exception.EncryptedDocumentException in project tika by apache.
the class CryptoParser method parse.
/**
 * Decrypts the incoming stream and passes the decrypted stream to the
 * superclass parser.
 * <p>
 * The cipher is created from {@code transformation}, using {@code provider}
 * when one is set. The decryption {@link Key} is required and is read from the
 * parse context; {@link AlgorithmParameters} and {@link SecureRandom} are
 * optional and are passed to {@code Cipher.init} when present.
 *
 * @param stream   encrypted input stream
 * @param handler  handler receiving the parsed content
 * @param metadata document metadata, passed through to the superclass parser
 * @param context  parse context supplying the key and optional cipher settings
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the content handler reports a failure
 * @throws TikaException if cipher setup raises a {@link GeneralSecurityException};
 *                       an {@link EncryptedDocumentException} is thrown when the
 *                       context holds no key
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    try {
        Cipher cipher = (provider == null)
                ? Cipher.getInstance(transformation)
                : Cipher.getInstance(transformation, provider);

        Key key = context.get(Key.class);
        if (key == null) {
            throw new EncryptedDocumentException("No decryption key provided");
        }

        AlgorithmParameters params = context.get(AlgorithmParameters.class);
        SecureRandom random = context.get(SecureRandom.class);

        // Pick the Cipher.init overload matching whichever optional settings exist
        if (params == null) {
            if (random == null) {
                cipher.init(Cipher.DECRYPT_MODE, key);
            } else {
                cipher.init(Cipher.DECRYPT_MODE, key, random);
            }
        } else if (random == null) {
            cipher.init(Cipher.DECRYPT_MODE, key, params);
        } else {
            cipher.init(Cipher.DECRYPT_MODE, key, params, random);
        }

        InputStream decrypted = new CipherInputStream(stream, cipher);
        super.parse(decrypted, handler, metadata, context);
    } catch (GeneralSecurityException e) {
        throw new TikaException("Unable to decrypt document stream", e);
    }
}
Aggregations