use of org.apache.tika.exception.TikaException in project tika by apache.
the class Seven7ParserTest method testPasswordProtected.
/**
 * Parses the same password protected 7z archive three times: without any
 * password, with a wrong password and with the correct password, and checks
 * the outcome of each attempt.
 */
@Test
public void testPasswordProtected() throws Exception {
    final String archive = "/test-documents/test7Z_protected_passTika.7z";
    Parser autoDetect = new AutoDetectParser();
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    // Attempt 1: no password at all -> EncryptedDocumentException expected
    boolean rejectedWithoutPassword = false;
    try (InputStream in = Seven7ParserTest.class.getResourceAsStream(archive)) {
        autoDetect.parse(in, handler, metadata, recursingContext);
        fail("Shouldn't be able to read a password protected 7z without the password");
    } catch (EncryptedDocumentException expected) {
        rejectedWithoutPassword = true;
    }
    assertTrue("test no password", rejectedWithoutPassword);

    // Attempt 2: wrong password -> TikaException expected.
    // With JCE installed the cause is
    //   org.tukaani.xz.CorruptedInputException: Compressed data is corrupt
    // without JCE the message mentions the
    //   "JCE Unlimited Strength Jurisdiction Policy Files"
    recursingContext.set(PasswordProvider.class, fixedPassword("wrong"));
    handler = new BodyContentHandler();
    boolean rejectedWrongPassword = false;
    try (InputStream in = Seven7ParserTest.class.getResourceAsStream(archive)) {
        autoDetect.parse(in, handler, metadata, recursingContext);
        fail("Shouldn't be able to read a password protected 7z with wrong password");
    } catch (TikaException expected) {
        rejectedWrongPassword = true;
    }
    assertTrue("TikaException for bad password", rejectedWrongPassword);
    // Nothing must have been extracted with the wrong password
    assertEquals("", handler.toString());

    // Attempt 3: correct password. The result depends on whether the
    // JCE Unlimited Strength policy is available.
    boolean strongCrypto = isStrongCryptoAvailable();
    recursingContext.set(PasswordProvider.class, fixedPassword("Tika"));
    handler = new BodyContentHandler();
    if (!strongCrypto) {
        // Without JCE we expect an IOException wrapped in a TikaException
        boolean wrappedFailure = false;
        try (InputStream in = Seven7ParserTest.class.getResourceAsStream(archive)) {
            autoDetect.parse(in, handler, metadata, recursingContext);
        } catch (TikaException expected) {
            wrappedFailure = true;
        }
        assertTrue("IOException because JCE was not installed", wrappedFailure);
        return;
    }

    try (InputStream in = Seven7ParserTest.class.getResourceAsStream(archive)) {
        autoDetect.parse(in, handler, metadata, recursingContext);
    }
    assertEquals(TYPE_7ZIP.toString(), metadata.get(Metadata.CONTENT_TYPE));
    String extracted = handler.toString();
    // The name of the embedded file must show up ...
    assertContains("text.txt", extracted);
    // ... as well as the text stored inside it
    assertContains("TEST DATA FOR TIKA.", extracted);
    assertContains("This is text inside an encrypted 7zip (7z) file.", extracted);
    assertContains("It should be processed by Tika just fine!", extracted);
    assertContains("TIKA-1521", extracted);
}

/**
 * Creates a {@link PasswordProvider} that answers every request with the
 * given password.
 */
private static PasswordProvider fixedPassword(final String password) {
    return new PasswordProvider() {
        @Override
        public String getPassword(Metadata metadata) {
            return password;
        }
    };
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class PDFParserTest method testSkipBadPage.
/**
 * Parses a PDF with a broken page twice: first with the default
 * configuration, then with {@code catchIntermediateIOExceptions} switched
 * off, and compares the recorded warnings and the extracted text.
 */
@Test
public void testSkipBadPage() throws Exception {
    // The test file comes from govdocs1. The TikaTest shortcuts can not be
    // used because the parse is expected to end with an exception.
    final String badPagePdf = "/test-documents/testPDF_bad_page_303226.pdf";
    Parser autoDetect = new AutoDetectParser();
    ContentHandler body = new BodyContentHandler(-1);
    Metadata metadata = new Metadata();
    ParseContext parseContext = new ParseContext();

    // Run 1: default configuration
    boolean threwByDefault = false;
    try (InputStream pdf = getResourceAsStream(badPagePdf)) {
        autoDetect.parse(pdf, body, metadata, parseContext);
    } catch (TikaException expected) {
        threwByDefault = true;
    }
    String text = body.toString();
    assertTrue("Should have thrown exception", threwByDefault);
    // exactly one warning is recorded and text was still extracted
    assertEquals(1, metadata.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length);
    assertContains("Unknown dir", metadata.get(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING));
    assertContains("1309.61", text);

    // Run 2: make the parser throw immediately
    PDFParserConfig failFast = new PDFParserConfig();
    failFast.setCatchIntermediateIOExceptions(false);
    parseContext.set(PDFParserConfig.class, failFast);
    body = new BodyContentHandler(-1);
    metadata = new Metadata();
    boolean threwImmediately = false;
    try (InputStream pdf = getResourceAsStream(badPagePdf)) {
        autoDetect.parse(pdf, body, metadata, parseContext);
    } catch (TikaException expected) {
        threwImmediately = true;
    }
    text = body.toString();
    assertTrue("Should have thrown exception", threwImmediately);
    // no warning is recorded and the text is absent this time
    assertEquals(0, metadata.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length);
    assertNotContained("1309.61", text);
}
use of org.apache.tika.exception.TikaException in project stanbol by apache.
the class TikaEngine method computeEnhancements.
/**
 * Converts the content of the given {@link ContentItem} with the configured
 * parser and attaches the results to the item.
 * <p>
 * Nothing is done when no media type could be determined, when the content
 * is already plain text, or when the parser does not support the media type.
 * Otherwise the content is parsed into a plain text version and, unless the
 * content already is XHTML, an XHTML version. Both are added as parts of the
 * content item under {@code urn:tika:text:<uuid>} and
 * {@code urn:tika:xhtml:<uuid>}. Finally the extracted metadata are written
 * to the metadata graph of the content item while holding its write lock.
 *
 * @param ci the content item to process
 * @throws EngineException if a content sink can not be created, if UTF-8 is
 *         not supported, or if parsing fails with an {@link IOException},
 *         {@link SAXException} or {@link TikaException}
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    MediaTypeAndStream mtas = extractMediaType(ci);
    if (mtas.mediaType == null) {
        //unable to parse and detect content type
        return;
    }
    MediaType plainMediaType = mtas.mediaType.getBaseType();
    if (plainMediaType.equals(MediaType.TEXT_PLAIN)) {
        //plain text does not need to be processed
        return;
    }
    final ParseContext context = new ParseContext();
    context.set(Parser.class, parser);
    Set<MediaType> supported = parser.getSupportedTypes(context);
    if (supported.contains(plainMediaType)) {
        //reuse the stream opened during media type detection if there is one
        final InputStream in = mtas.in == null ? ci.getStream() : mtas.in;
        final Metadata metadata = new Metadata();
        //set the already parsed contentType
        metadata.set(Metadata.CONTENT_TYPE, mtas.mediaType.toString());
        //also explicitly set the charset as contentEncoding
        String charset = mtas.mediaType.getParameters().get("charset");
        if (charset != null) {
            metadata.set(Metadata.CONTENT_ENCODING, charset);
        }
        ContentSink plainTextSink;
        try {
            plainTextSink = ciFactory.createContentSink(TEXT_PLAIN + "; charset=" + UTF8.name());
        } catch (IOException e) {
            //close the input stream
            IOUtils.closeQuietly(in);
            throw new EngineException("Error while initialising Blob for writing the text/plain version of the parsed content", e);
        }
        final Writer plainTextWriter = new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8);
        //only the body is written, passed through the PlainTextHandler
        final ContentHandler textHandler = new BodyContentHandler(
                new PlainTextHandler(plainTextWriter, false, skipLinebreaks));
        final ToXMLContentHandler xhtmlHandler;
        final ContentHandler mainHandler;
        ContentSink xhtmlSink = null;
        try {
            if (!plainMediaType.equals(XHTML)) {
                //do not parse XHTML from XHTML
                try {
                    xhtmlSink = ciFactory.createContentSink(XHTML + "; charset=" + UTF8.name());
                } catch (IOException e) {
                    throw new EngineException("Error while initialising Blob for writing the application/xhtml+xml version of the parsed content", e);
                }
                try {
                    xhtmlHandler = new ToXMLContentHandler(xhtmlSink.getOutputStream(), UTF8.name());
                } catch (UnsupportedEncodingException e) {
                    throw new EngineException("This system does not support the encoding " + UTF8, e);
                }
                //feed the SAX events to both the text and the XHTML handler
                mainHandler = new MultiHandler(textHandler, xhtmlHandler);
            } else {
                mainHandler = textHandler;
                xhtmlHandler = null;
                xhtmlSink = null;
            }
            try {
                AccessController.doPrivileged(new PrivilegedExceptionAction<Object>() {
                    @Override
                    public Object run() throws IOException, SAXException, TikaException {
                        /*
                         * We need to replace the context Classloader with the Bundle ClassLoader
                         * to ensure that Singleton instances of XML frameworks (such as node4j)
                         * do not leak into the OSGI environment.
                         *
                         * Most Java XML libs prefer to load implementations by using the
                         * {@link Thread#getContextClassLoader()}. However OSGI has no control over
                         * this {@link ClassLoader}. Because of that there can be situations where
                         * Interfaces are loaded via the Bundle Classloader and the implementations
                         * are taken from the context Classloader. This can cause
                         * {@link ClassCastException}, {@link ExceptionInInitializerError}s, ...
                         *
                         * Setting the context Classloader to the Bundle classloader helps to avoid
                         * those situations.
                         */
                        ClassLoader contextClassLoader = updateContextClassLoader();
                        try {
                            parser.parse(in, mainHandler, metadata, context);
                        } finally {
                            //reset the previous context ClassLoader
                            Thread.currentThread().setContextClassLoader(contextClassLoader);
                        }
                        return null;
                    }
                });
            } catch (PrivilegedActionException pae) {
                Exception e = pae.getException();
                if (e instanceof IOException || e instanceof SAXException || e instanceof TikaException) {
                    throw new EngineException("Unable to convert ContentItem " + ci.getUri() + " with mimeType '" + ci.getMimeType() + "' to " + "plain text!", e);
                } else {
                    //runtime exception
                    throw RuntimeException.class.cast(e);
                }
            }
        } finally {
            //ensure that the streams and writers are closed in any case
            //NOTE(review): closeQuietly also hides errors raised while the
            // writer flushes its remaining content - confirm this is intended
            IOUtils.closeQuietly(in);
            IOUtils.closeQuietly(plainTextWriter);
            if (xhtmlSink != null) {
                IOUtils.closeQuietly(xhtmlSink.getOutputStream());
            }
        }
        //both parts share the same random id
        String random = randomUUID().toString();
        IRI textBlobUri = new IRI("urn:tika:text:" + random);
        ci.addPart(textBlobUri, plainTextSink.getBlob());
        if (xhtmlHandler != null) {
            IRI xhtmlBlobUri = new IRI("urn:tika:xhtml:" + random);
            ci.addPart(xhtmlBlobUri, xhtmlSink.getBlob());
        }
        //log the extracted metadata
        if (log.isInfoEnabled()) {
            for (String name : metadata.names()) {
                log.info("{}: {}", name, Arrays.toString(metadata.getValues(name)));
            }
        }
        //add the extracted metadata to the graph of the content item
        ci.getLock().writeLock().lock();
        try {
            Graph graph = ci.getMetadata();
            IRI id = ci.getUri();
            Set<String> mapped = ontologyMappings.apply(graph, id, metadata);
            if (includeUnmappedProperties) {
                Set<String> unmapped = new HashSet<String>(Arrays.asList(metadata.names()));
                unmapped.removeAll(mapped);
                for (String name : unmapped) {
                    //names without a ':' are only added if all unmapped
                    //properties are requested
                    if (name.indexOf(':') >= 0 || includeAllUnmappedProperties) {
                        IRI prop = new IRI(new StringBuilder(TIKA_URN_PREFIX).append(name).toString());
                        for (String value : metadata.getValues(name)) {
                            //TODO: without the Property for the name we have no datatype
                            //      information ... so we add PlainLiterals for now
                            graph.add(new TripleImpl(id, prop, new PlainLiteralImpl(value)));
                        }
                    }
                }
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
    //else not supported format
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class ExternalEmbedder method embed.
/**
 * Executes the configured external command on the given document stream and
 * writes the command's result to the given output stream.
 * <p>
 * The document is handed to the command either through a file (when the
 * command contains {@link ExternalParser#INPUT_FILE_TOKEN}) or through
 * standard input. The result is read either from a temporary file (when the
 * command contains {@link ExternalParser#OUTPUT_FILE_TOKEN}) or from standard
 * output. Metadata arguments are only added to the command if
 * {@link #setMetadataCommandArguments(Map)} has been called to set arguments.
 * <p>
 * The given output stream is always closed before this method returns.
 *
 * @param metadata     the metadata used to build the metadata command arguments
 * @param inputStream  the document to process
 * @param outputStream receives the output of the command; closed on return
 * @param context      the parse context (not used by this method)
 * @throws IOException   if starting the command or copying data fails
 * @throws TikaException if the command finishes with a non-zero exit value
 */
public void embed(final Metadata metadata, final InputStream inputStream, final OutputStream outputStream, final ParseContext context) throws IOException, TikaException {
    boolean inputToStdIn = true;
    boolean outputFromStdOut = true;
    boolean hasMetadataCommandArguments = (metadataCommandArguments != null && !metadataCommandArguments.isEmpty());
    boolean serializeMetadataCommandArgumentsToken = false;
    boolean replacedMetadataCommandArgumentsToken = false;
    TikaInputStream tikaInputStream = TikaInputStream.get(inputStream);
    File tempOutputFile = null;
    List<String> commandMetadataSegments = null;
    if (hasMetadataCommandArguments) {
        commandMetadataSegments = getCommandMetadataSegments(metadata);
    }
    // Build our command
    List<String> origCmd = Arrays.asList(command);
    List<String> cmd = new ArrayList<String>();
    for (String commandSegment : origCmd) {
        if (commandSegment.indexOf(ExternalParser.INPUT_FILE_TOKEN) != -1) {
            commandSegment = commandSegment.replace(ExternalParser.INPUT_FILE_TOKEN, tikaInputStream.getFile().toString());
            inputToStdIn = false;
        }
        if (commandSegment.indexOf(ExternalParser.OUTPUT_FILE_TOKEN) != -1) {
            tempOutputFile = tmp.createTemporaryFile();
            commandSegment = commandSegment.replace(ExternalParser.OUTPUT_FILE_TOKEN, tempOutputFile.toString());
            outputFromStdOut = false;
        }
        if (commandSegment.indexOf(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN) != -1) {
            serializeMetadataCommandArgumentsToken = true;
        }
        if (commandSegment.indexOf(METADATA_COMMAND_ARGUMENTS_TOKEN) != -1) {
            // The token segment itself is dropped and replaced by the
            // metadata arguments (if there are any)
            if (hasMetadataCommandArguments) {
                cmd.addAll(commandMetadataSegments);
            }
            replacedMetadataCommandArgumentsToken = true;
        } else {
            cmd.add(commandSegment);
        }
    }
    if (hasMetadataCommandArguments) {
        if (serializeMetadataCommandArgumentsToken) {
            // Find all metadata tokens and replace with encapsulated metadata
            for (int i = 0; i < cmd.size(); i++) {
                String commandSegment = cmd.get(i);
                if (commandSegment.indexOf(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN) != -1) {
                    cmd.set(i, commandSegment.replace(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN, serializeMetadata(commandMetadataSegments)));
                }
            }
        } else if (!replacedMetadataCommandArgumentsToken) {
            // Tack metadata onto the end of the cmd as arguments
            cmd.addAll(commandMetadataSegments);
        }
    }
    // Execute
    Process process;
    if (cmd.size() == 1) {
        // A single segment is passed as one command line, so that
        // Runtime.exec(String) splits it into the program and its arguments
        process = Runtime.getRuntime().exec(cmd.get(0));
    } else {
        process = Runtime.getRuntime().exec(cmd.toArray(new String[0]));
    }
    ByteArrayOutputStream stdErrOutputStream = new ByteArrayOutputStream();
    // Remembers an interrupt received while waiting for the process so that
    // the interrupt status of the thread can be restored afterwards
    boolean interrupted = false;
    try {
        sendStdErrToOutputStream(process, stdErrOutputStream);
        if (inputToStdIn) {
            sendInputStreamToStdIn(inputStream, process);
        } else {
            // We're not writing to std in this case so close
            process.getOutputStream().close();
        }
        if (outputFromStdOut) {
            sendStdOutToOutputStream(process, outputStream);
        } else {
            tmp.dispose();
            try {
                process.waitFor();
            } catch (InterruptedException e) {
                interrupted = true;
            }
            // The command is finished, read the output file into the given output stream.
            // The stream is closed right away so that the file handle is released
            // before the temporary file is deleted below.
            try (InputStream tempOutputFileInputStream = TikaInputStream.get(tempOutputFile)) {
                IOUtils.copy(tempOutputFileInputStream, outputStream);
            }
        }
    } finally {
        if (outputFromStdOut) {
            try {
                process.waitFor();
            } catch (InterruptedException e) {
                interrupted = true;
            }
        } else {
            try {
                // Clean up temp output files
                tempOutputFile.delete();
            } catch (Exception ignored) {
                // best effort clean up only
            }
        }
        if (!inputToStdIn) {
            // Close input file (and delete if created by up TemporaryResources.createTemporaryFile)
            IOUtils.closeQuietly(tikaInputStream);
        }
        IOUtils.closeQuietly(outputStream);
        IOUtils.closeQuietly(stdErrOutputStream);
        if (interrupted) {
            // do not hide the interrupt from the caller
            Thread.currentThread().interrupt();
        }
        // NOTE(review): throwing from finally replaces any exception raised
        // in the try block above - kept as is to preserve the existing contract
        if (process.exitValue() != 0) {
            throw new TikaException("There was an error executing the command line" + "\nExecutable Command:\n\n" + cmd + "\nExecutable Error:\n\n" + stdErrOutputStream.toString(UTF_8.name()));
        }
    }
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class ParserContainerExtractor method extract.
/**
 * Parses the given stream with the configured parser. A
 * {@code RecursiveParser} built from the given extractor and handler is
 * registered in the parse context as the {@link Parser}. The SAX events and
 * the metadata produced for the stream itself are discarded.
 *
 * @throws IOException   if the parser fails with an I/O error
 * @throws TikaException if parsing fails; a {@link SAXException} is wrapped
 *         into a {@link TikaException}
 */
public void extract(TikaInputStream stream, ContainerExtractor recurseExtractor, EmbeddedResourceHandler handler) throws IOException, TikaException {
    final ParseContext parseContext = new ParseContext();
    final Parser embeddedParser = new RecursiveParser(recurseExtractor, handler);
    parseContext.set(Parser.class, embeddedParser);

    // DefaultHandler ignores every SAX event it receives
    final DefaultHandler discardingHandler = new DefaultHandler();
    final Metadata scratchMetadata = new Metadata();
    try {
        parser.parse(stream, discardingHandler, scratchMetadata, parseContext);
    } catch (SAXException saxFailure) {
        throw new TikaException("Unexpected SAX exception", saxFailure);
    }
}
Aggregations