use of org.apache.tika.exception.EncryptedDocumentException in project tika by apache.
the class SXSLFExtractorTest method testEncrypted.
/**
 * Checks that a password-protected PPTX yields its text when the password
 * "tika" is supplied through the parse context, and that parsing the same
 * file without any password provider raises an EncryptedDocumentException.
 */
@Test
public void testEncrypted() throws Exception {
    // Encrypted sample file -> text that must appear once it is decrypted
    Map<String, String> expectedTextByFile = new HashMap<>();
    expectedTextByFile.put("testPPT_protected_passtika.pptx", "This is an encrypted PowerPoint 2007 slide.");

    Parser autoDetect = new AutoDetectParser();
    // The same Metadata instance is handed to every parse call below
    Metadata sharedMetadata = new Metadata();

    PasswordProvider tikaPassword = new PasswordProvider() {
        @Override
        public String getPassword(Metadata metadata) {
            return "tika";
        }
    };
    ParseContext withPassword = new ParseContext();
    withPassword.set(org.apache.tika.parser.PasswordProvider.class, tikaPassword);
    withPassword.set(OfficeParserConfig.class, officeParserConfig);

    // First pass: password available, the expected text must be extracted
    for (Map.Entry<String, String> sample : expectedTextByFile.entrySet()) {
        String resource = "/test-documents/" + sample.getKey();
        try (InputStream in = getResourceAsStream(resource)) {
            ContentHandler body = new BodyContentHandler();
            autoDetect.parse(in, body, sharedMetadata, withPassword);
            assertContains(sample.getValue(), body.toString());
        }
    }

    // Second pass: empty context, so no password is available
    ParseContext withoutPassword = new ParseContext();
    for (String fileName : expectedTextByFile.keySet()) {
        boolean sawEncryptedException = false;
        try (InputStream in = getResourceAsStream("/test-documents/" + fileName)) {
            ContentHandler body = new BodyContentHandler();
            autoDetect.parse(in, body, sharedMetadata, withoutPassword);
        } catch (EncryptedDocumentException expected) {
            sawEncryptedException = true;
        }
        assertTrue(sawEncryptedException);
    }
}
use of org.apache.tika.exception.EncryptedDocumentException in project tika by apache.
the class PackageParser method parse.
/**
 * Opens the given stream as an archive and hands every non-directory entry
 * to {@code parseEntry}, writing the result through an XHTML handler.
 *
 * <p>Streaming archive formats are read through an
 * {@link ArchiveStreamFactory}. When the factory reports that the format
 * cannot be streamed and the format is 7z, the stream is reset, turned into
 * a file-backed {@link TikaInputStream} and opened with {@link SevenZFile},
 * using the password from a {@link PasswordProvider} in the context if one
 * is present.
 *
 * @param stream   the archive data; wrapped in a {@link BufferedInputStream}
 *                 if it does not support mark. The stream itself is not closed here.
 * @param handler  receives the XHTML events
 * @param metadata passed to the password provider, to {@code updateMediaType}
 *                 and to {@code parseEntry}
 * @param context  consulted for {@link TikaConfig}, {@link ArchiveStreamFactory}
 *                 and {@link PasswordProvider}
 * @throws EncryptedDocumentException if the archive library reports that a
 *         password is required, or that a zip entry uses unsupported encryption
 * @throws TikaException if the stream cannot be unpacked, the non-streaming
 *         format is not 7z, or another unsupported zip feature is hit
 * @throws IOException  on read failures
 * @throws SAXException on handler failures
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    //lazily load the MediaTypeRegistry at parse time
    //only want to call getDefaultConfig() once, and can't
    //load statically because of the ForkParser
    TikaConfig config = context.get(TikaConfig.class);
    MediaTypeRegistry mediaTypeRegistry = null;
    if (config != null) {
        mediaTypeRegistry = config.getMediaTypeRegistry();
    } else {
        if (bufferedMediaTypeRegistry == null) {
            //buffer this for next time.
            synchronized (lock) {
                //now that we're locked, check again
                if (bufferedMediaTypeRegistry == null) {
                    bufferedMediaTypeRegistry = TikaConfig.getDefaultConfig().getMediaTypeRegistry();
                }
            }
        }
        mediaTypeRegistry = bufferedMediaTypeRegistry;
    }
    // Ensure that the stream supports the mark feature
    if (!stream.markSupported()) {
        stream = new BufferedInputStream(stream);
    }
    TemporaryResources tmp = new TemporaryResources();
    ArchiveInputStream ais = null;
    try {
        ArchiveStreamFactory factory = context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
        // At the end we want to close the archive stream to release
        // any associated resources, but the underlying document stream
        // should not be closed
        ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream));
    } catch (StreamingNotSupportedException sne) {
        // Most archive formats work on streams, but a few need files
        if (!ArchiveStreamFactory.SEVEN_Z.equals(sne.getFormat())) {
            tmp.close();
            throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
        }
        // Track success so that the temporary resources are released on
        // every failure path (reset, spooling to file, opening the 7z file)
        boolean opened = false;
        try {
            // Rework as a file, and wrap
            stream.reset();
            TikaInputStream tstream = TikaInputStream.get(stream, tmp);
            // Seven Zip supports passwords, was one given?
            String password = null;
            PasswordProvider provider = context.get(PasswordProvider.class);
            if (provider != null) {
                password = provider.getPassword(metadata);
            }
            SevenZFile sevenz;
            if (password == null) {
                sevenz = new SevenZFile(tstream.getFile());
            } else {
                sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
            }
            // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
            ais = new SevenZWrapper(sevenz);
            opened = true;
        } catch (PasswordRequiredException pre) {
            // Report a missing password the same way the entry loop below does
            throw new EncryptedDocumentException(pre);
        } finally {
            if (!opened) {
                tmp.close();
            }
        }
    } catch (ArchiveException e) {
        tmp.close();
        throw new TikaException("Unable to unpack document stream", e);
    }
    XHTMLContentHandler xhtml;
    // From here on the archive stream and the temporary resources are open,
    // so everything that can throw runs under the cleanup in the finally block
    try {
        updateMediaType(ais, mediaTypeRegistry, metadata);
        // Use the delegate parser to parse the contained document
        EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        try {
            ArchiveEntry entry = ais.getNextEntry();
            while (entry != null) {
                if (!entry.isDirectory()) {
                    parseEntry(ais, entry, extractor, metadata, xhtml);
                }
                entry = ais.getNextEntry();
            }
        } catch (UnsupportedZipFeatureException zfe) {
            // If it's an encrypted document of unknown password, report as such
            if (zfe.getFeature() == Feature.ENCRYPTION) {
                throw new EncryptedDocumentException(zfe);
            }
            // Otherwise throw the exception
            throw new TikaException("UnsupportedZipFeature", zfe);
        } catch (PasswordRequiredException pre) {
            throw new EncryptedDocumentException(pre);
        }
    } finally {
        // Close the temporary resources even if closing the archive stream fails
        try {
            ais.close();
        } finally {
            tmp.close();
        }
    }
    xhtml.endDocument();
}
use of org.apache.tika.exception.EncryptedDocumentException in project tika by apache.
the class PackageParser method parseEntry.
/**
 * Handles a single archive entry.
 *
 * <p>If the archive reports that the entry data can be read, the entry
 * metadata is built via {@code handleEntryMetadata} and, when the extractor
 * agrees, the entry is parsed as an embedded document. Otherwise an
 * exception is recorded on the parent metadata (an
 * {@link EncryptedDocumentException} for encrypted zip entries, a
 * {@link TikaException} for entries that are not zip entries) and the entry
 * name, if non-empty, is written as a paragraph.
 *
 * @param archive        the archive stream positioned at {@code entry}
 * @param entry          the entry to process
 * @param extractor      decides whether to parse the entry and parses it
 * @param parentMetadata metadata of the containing document; receives recorded exceptions
 * @param xhtml          handler receiving the output
 */
private void parseEntry(ArchiveInputStream archive, ArchiveEntry entry, EmbeddedDocumentExtractor extractor, Metadata parentMetadata, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    String entryName = entry.getName();

    if (!archive.canReadEntryData(entry)) {
        String label = (entryName == null) ? "" : entryName;
        if (!(entry instanceof ZipArchiveEntry)) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(new TikaException("Can't read archive stream (" + label + ")"), parentMetadata);
        } else if (((ZipArchiveEntry) entry).getGeneralPurposeBit().usesEncryption()) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(new EncryptedDocumentException("stream (" + label + ") is encrypted"), parentMetadata);
        }
        // Still list the unreadable entry by name in the output
        if (label.length() > 0) {
            xhtml.element("p", label);
        }
        return;
    }

    // Fetch the metadata on the entry contained in the archive
    Metadata entrydata = handleEntryMetadata(entryName, null, entry.getLastModifiedDate(), entry.getSize(), xhtml);

    // Recurse into the entry only if desired
    if (!extractor.shouldParseEmbedded(entrydata)) {
        return;
    }

    // For detectors to work, we need a mark/reset supporting
    // InputStream, which ArchiveInputStream isn't, so wrap
    TemporaryResources entryResources = new TemporaryResources();
    try {
        TikaInputStream wrapped = TikaInputStream.get(archive, entryResources);
        extractor.parseEmbedded(wrapped, xhtml, entrydata, true);
    } finally {
        entryResources.dispose();
    }
}
use of org.apache.tika.exception.EncryptedDocumentException in project tika by apache.
the class PDFParser method parse.
/**
 * Loads the PDF with the password returned by {@code getPassword}, records
 * the encryption flag and content type in the metadata, extracts document
 * metadata, runs the configured access check and, if a handler was given,
 * produces the content.
 *
 * <p>Content is produced by {@code handleXFAOnly} when
 * {@code shouldHandleXFAOnly} says so, by {@code OCR2XHTML} when the OCR
 * strategy is {@code OCR_ONLY}, and by {@code PDF2XHTML} otherwise.
 *
 * @param stream   the PDF data; a file-backed {@link TikaInputStream} is loaded from its file
 * @param handler  receives the content; may be {@code null}, in which case only metadata is handled
 * @param metadata receives the extracted metadata
 * @param context  consulted for {@link PDFParserConfig} and passed on to the helpers
 * @throws EncryptedDocumentException if the PDF library rejects the password;
 *         {@code PDF.IS_ENCRYPTED} is set to "true" before it is thrown
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
    PDDocument pdfDocument = null;
    try {
        // PDFBox can process entirely in memory, or can use a temp file
        // for unpacked / processed resources
        // Decide which to do based on if we're reading from a file or not already
        //TODO: make this configurable via MemoryUsageSetting
        TikaInputStream tstream = TikaInputStream.cast(stream);
        String password = getPassword(metadata, context);
        boolean fileBacked = tstream != null && tstream.hasFile();
        // File based -- send file directly to PDFBox; otherwise shield the
        // caller's stream from being closed by the load
        pdfDocument = fileBacked
                ? PDDocument.load(tstream.getPath().toFile(), password)
                : PDDocument.load(new CloseShieldInputStream(stream), password);

        metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(pdfDocument.isEncrypted()));
        metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
        extractMetadata(pdfDocument, metadata, context);
        localConfig.getAccessChecker().check(metadata);

        if (handler == null) {
            // Metadata only; the finally block still closes the document
            return;
        }
        if (shouldHandleXFAOnly(pdfDocument, localConfig)) {
            handleXFAOnly(pdfDocument, handler, metadata, context);
        } else if (localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
            metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
            OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
        } else {
            if (localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
                metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
            }
            PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
        }
    } catch (InvalidPasswordException badPassword) {
        metadata.set(PDF.IS_ENCRYPTED, "true");
        throw new EncryptedDocumentException(badPassword);
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}
use of org.apache.tika.exception.EncryptedDocumentException in project tika by apache.
the class PDFParserTest method testProtectedPDF.
/**
 * A PDF can be "protected" using the default (empty) password: its text
 * and possibly its metadata are encrypted, yet it can be decrypted without
 * the user supplying anything. This test parses such a file with no
 * password provider, with an explicit empty password, and finally with a
 * wrong password, which must fail with an EncryptedDocumentException.
 */
@Test
public void testProtectedPDF() throws Exception {
    // 1) No password provider at all
    XMLResult result = getXML("testPDF_protected.pdf");
    Metadata parsed = result.metadata;
    assertEquals("true", parsed.get("pdf:encrypted"));
    assertEquals("application/pdf", parsed.get(Metadata.CONTENT_TYPE));
    assertEquals("The Bank of England", parsed.get(TikaCoreProperties.CREATOR));
    assertEquals("The Bank of England", parsed.get(Metadata.AUTHOR));
    assertEquals("Speeches by Andrew G Haldane", parsed.get(OfficeOpenXMLCore.SUBJECT));
    assertEquals("Speeches by Andrew G Haldane", parsed.get(Metadata.SUBJECT));
    assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", parsed.get(TikaCoreProperties.TITLE));
    assertContains("RETHINKING THE FINANCIAL NETWORK", result.xml);
    assertContains("On 16 November 2002", result.xml);
    assertContains("In many important respects", result.xml);

    // 2) Explicit empty password
    ParseContext passwordContext = new ParseContext();
    passwordContext.set(PasswordProvider.class, new PasswordProvider() {
        @Override
        public String getPassword(Metadata metadata) {
            return "";
        }
    });
    result = getXML("testPDF_protected.pdf", passwordContext);
    parsed = result.metadata;
    assertEquals("true", parsed.get("pdf:encrypted"));
    assertEquals("application/pdf", parsed.get(Metadata.CONTENT_TYPE));
    assertEquals("The Bank of England", parsed.get(TikaCoreProperties.CREATOR));
    assertEquals("Speeches by Andrew G Haldane", parsed.get(OfficeOpenXMLCore.SUBJECT));
    assertEquals("Speeches by Andrew G Haldane", parsed.get(Metadata.SUBJECT));
    assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", parsed.get(TikaCoreProperties.TITLE));
    assertContains("RETHINKING THE FINANCIAL NETWORK", result.xml);
    assertContains("On 16 November 2002", result.xml);
    assertContains("In many important respects", result.xml);

    // 3) Wrong password: parsing must be rejected
    passwordContext.set(PasswordProvider.class, new PasswordProvider() {
        @Override
        public String getPassword(Metadata metadata) {
            return "WRONG!!!!";
        }
    });
    boolean rejected = false;
    ContentHandler body = new BodyContentHandler();
    parsed = new Metadata();
    try (InputStream pdf = PDFParserTest.class.getResourceAsStream("/test-documents/testPDF_protected.pdf")) {
        Parser autoDetect = new AutoDetectParser();
        autoDetect.parse(pdf, body, parsed, passwordContext);
    } catch (EncryptedDocumentException expected) {
        rejected = true;
    }
    assertTrue("encryption exception", rejected);
    assertEquals("application/pdf", parsed.get(Metadata.CONTENT_TYPE));
    assertEquals("true", parsed.get("pdf:encrypted"));
    // Only pdf:encrypted, X-Parsed-By and Content-Type are expected
    assertEquals("very little metadata should be parsed", 3, parsed.names().length);
    assertEquals(0, body.toString().length());
}
Aggregations