use of org.apache.tika.exception.TikaException in project stanbol by apache.
the class TikaEngine method computeEnhancements.
/**
 * Converts the content of the given {@link ContentItem} with the configured
 * Tika parser and adds the results to the content item.
 * <p>
 * Nothing is done if no media type can be determined, if the content already
 * is <code>text/plain</code> or if the parser does not support the base media
 * type. Otherwise a plain text version is always created and added as a part
 * with an <code>urn:tika:text:</code> URI. Unless the content already is
 * XHTML, an XHTML version is created as well and added with an
 * <code>urn:tika:xhtml:</code> URI using the same random suffix. Finally the
 * metadata collected while parsing are written to the metadata graph of the
 * content item while holding its write lock.
 *
 * @param ci the content item to process
 * @throws EngineException if a content sink can not be created or if the
 * parser fails with an {@link IOException}, {@link SAXException} or
 * {@link TikaException}
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    MediaTypeAndStream mtas = extractMediaType(ci);
    if (mtas.mediaType == null) {
        //unable to parse and detect content type
        return;
    }
    MediaType plainMediaType = mtas.mediaType.getBaseType();
    if (plainMediaType.equals(MediaType.TEXT_PLAIN)) {
        //we need not to process plain text!
        return;
    }
    final ParseContext context = new ParseContext();
    context.set(Parser.class, parser);
    Set<MediaType> supported = parser.getSupportedTypes(context);
    if (supported.contains(plainMediaType)) {
        //reuse the stream opened while detecting the media type (if any)
        final InputStream in;
        if (mtas.in == null) {
            in = ci.getStream();
        } else {
            in = mtas.in;
        }
        final Metadata metadata = new Metadata();
        //set the already parsed contentType
        metadata.set(Metadata.CONTENT_TYPE, mtas.mediaType.toString());
        //also explicitly set the charset as contentEncoding
        String charset = mtas.mediaType.getParameters().get("charset");
        if (charset != null) {
            metadata.set(Metadata.CONTENT_ENCODING, charset);
        }
        ContentSink plainTextSink;
        try {
            plainTextSink = ciFactory.createContentSink(TEXT_PLAIN + "; charset=" + UTF8.name());
        } catch (IOException e) {
            //close the input stream
            IOUtils.closeQuietly(in);
            throw new EngineException("Error while initialising Blob for "
                    + "writing the text/plain version of the parsed content", e);
        }
        final Writer plainTextWriter = new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8);
        //only the body is written as plain text
        final ContentHandler textHandler = new BodyContentHandler(
                new PlainTextHandler(plainTextWriter, false, skipLinebreaks));
        final ToXMLContentHandler xhtmlHandler;
        final ContentHandler mainHandler;
        ContentSink xhtmlSink = null;
        try {
            if (!plainMediaType.equals(XHTML)) {
                //do not parse XHTML from XHTML
                try {
                    xhtmlSink = ciFactory.createContentSink(XHTML + "; charset=" + UTF8.name());
                } catch (IOException e) {
                    throw new EngineException("Error while initialising Blob for "
                            + "writing the application/xhtml+xml version of the parsed content", e);
                }
                try {
                    xhtmlHandler = new ToXMLContentHandler(xhtmlSink.getOutputStream(), UTF8.name());
                } catch (UnsupportedEncodingException e) {
                    throw new EngineException("This system does not support the encoding " + UTF8, e);
                }
                //write the plain text and the XHTML version in a single parse
                mainHandler = new MultiHandler(textHandler, xhtmlHandler);
            } else {
                mainHandler = textHandler;
                xhtmlHandler = null;
                xhtmlSink = null;
            }
            try {
                AccessController.doPrivileged(new PrivilegedExceptionAction<Object>() {
                    public Object run() throws IOException, SAXException, TikaException {
                        /*
                         * We need to replace the context Classloader with the Bundle ClassLoader
                         * to ensure that Singleton instances of XML frameworks (such as node4j)
                         * do not leak into the OSGI environment.
                         *
                         * Most Java XML libs prefer to load implementations by using the
                         * {@link Thread#getContextClassLoader()}. However OSGI has no control over
                         * this {@link ClassLoader}. Because of that there can be situations where
                         * Interfaces are loaded via the Bundle Classloader and the implementations
                         * are taken from the context Classloader. What can cause
                         * {@link ClassCastException}, {@link ExceptionInInitializerError}s, ...
                         *
                         * Setting the context Classloader to the Bundle classloader helps to avoid
                         * those situations.
                         */
                        ClassLoader contextClassLoader = updateContextClassLoader();
                        try {
                            parser.parse(in, mainHandler, metadata, context);
                        } finally {
                            //reset the previous context ClassLoader
                            Thread.currentThread().setContextClassLoader(contextClassLoader);
                        }
                        return null;
                    }
                });
            } catch (PrivilegedActionException pae) {
                Exception e = pae.getException();
                if (e instanceof IOException || e instanceof SAXException || e instanceof TikaException) {
                    throw new EngineException("Unable to convert ContentItem " + ci.getUri() + " with mimeType '" + ci.getMimeType() + "' to " + "plain text!", e);
                } else {
                    //runtime exception
                    throw RuntimeException.class.cast(e);
                }
            }
        } finally {
            //ensure that the writers are closed correctly
            IOUtils.closeQuietly(in);
            IOUtils.closeQuietly(plainTextWriter);
            if (xhtmlSink != null) {
                IOUtils.closeQuietly(xhtmlSink.getOutputStream());
            }
        }
        //both parts share the same random suffix
        String random = randomUUID().toString();
        IRI textBlobUri = new IRI("urn:tika:text:" + random);
        ci.addPart(textBlobUri, plainTextSink.getBlob());
        if (xhtmlHandler != null) {
            IRI xhtmlBlobUri = new IRI("urn:tika:xhtml:" + random);
            ci.addPart(xhtmlBlobUri, xhtmlSink.getBlob());
        }
        //log the extracted metadata
        if (log.isInfoEnabled()) {
            for (String name : metadata.names()) {
                log.info("{}: {}", name, Arrays.toString(metadata.getValues(name)));
            }
        }
        //add the extracted metadata to the metadata graph of the content item
        ci.getLock().writeLock().lock();
        try {
            Graph graph = ci.getMetadata();
            IRI id = ci.getUri();
            Set<String> mapped = ontologyMappings.apply(graph, id, metadata);
            if (includeUnmappedProperties) {
                Set<String> unmapped = new HashSet<String>(Arrays.asList(metadata.names()));
                unmapped.removeAll(mapped);
                for (String name : unmapped) {
                    //unmapped names without a ':' are only added if all
                    //unmapped properties are requested
                    if (name.indexOf(':') >= 0 || includeAllUnmappedProperties) {
                        IRI prop = new IRI(new StringBuilder(TIKA_URN_PREFIX).append(name).toString());
                        for (String value : metadata.getValues(name)) {
                            //TODO: without the Property for the name we have no datatype
                            // information ... so we add PlainLiterals for now
                            graph.add(new TripleImpl(id, prop, new PlainLiteralImpl(value)));
                        }
                    }
                }
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
    //else not supported format
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class ExternalEmbedder method embed.
/**
 * Executes the configured external command for the given document stream
 * and writes the result produced by the command to the given output stream.
 * <p>
 * If a command segment contains {@link ExternalParser#INPUT_FILE_TOKEN} the
 * token is replaced by the path of a file holding the input; otherwise the
 * input is sent to the standard input of the process. If a command segment
 * contains {@link ExternalParser#OUTPUT_FILE_TOKEN} the token is replaced by
 * the path of a temporary file that is copied to the output stream once the
 * command has finished; otherwise the standard output of the process is used.
 * <p>
 * Metadata arguments are only added to the command if metadata command
 * arguments are configured (see {@link #setMetadataCommandArguments(Map)}).
 * They replace the metadata token, are serialized into segments containing
 * the serialized token, or are appended to the end of the command if neither
 * token is present.
 * <p>
 * The given output stream is always closed by this method.
 *
 * @param metadata the metadata to pass to the command
 * @param inputStream the document to process
 * @param outputStream the stream the result of the command is written to
 * @param context the parse context (not used by this method)
 * @throws IOException if the command can not be started or if reading or
 * writing one of the streams fails
 * @throws TikaException if the command finishes with a non-zero exit value
 */
public void embed(final Metadata metadata, final InputStream inputStream, final OutputStream outputStream, final ParseContext context) throws IOException, TikaException {
    boolean inputToStdIn = true;
    boolean outputFromStdOut = true;
    boolean hasMetadataCommandArguments = (metadataCommandArguments != null && !metadataCommandArguments.isEmpty());
    boolean serializeMetadataCommandArgumentsToken = false;
    boolean replacedMetadataCommandArgumentsToken = false;
    TikaInputStream tikaInputStream = TikaInputStream.get(inputStream);
    File tempOutputFile = null;
    List<String> commandMetadataSegments = null;
    if (hasMetadataCommandArguments) {
        commandMetadataSegments = getCommandMetadataSegments(metadata);
    }
    // Build our command
    List<String> origCmd = Arrays.asList(command);
    List<String> cmd = new ArrayList<String>();
    for (String commandSegment : origCmd) {
        if (commandSegment.indexOf(ExternalParser.INPUT_FILE_TOKEN) != -1) {
            commandSegment = commandSegment.replace(ExternalParser.INPUT_FILE_TOKEN, tikaInputStream.getFile().toString());
            inputToStdIn = false;
        }
        if (commandSegment.indexOf(ExternalParser.OUTPUT_FILE_TOKEN) != -1) {
            tempOutputFile = tmp.createTemporaryFile();
            commandSegment = commandSegment.replace(ExternalParser.OUTPUT_FILE_TOKEN, tempOutputFile.toString());
            outputFromStdOut = false;
        }
        if (commandSegment.indexOf(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN) != -1) {
            serializeMetadataCommandArgumentsToken = true;
        }
        if (commandSegment.indexOf(METADATA_COMMAND_ARGUMENTS_TOKEN) != -1) {
            // The segment holding the token is dropped and replaced by the metadata arguments
            if (hasMetadataCommandArguments) {
                cmd.addAll(commandMetadataSegments);
            }
            replacedMetadataCommandArgumentsToken = true;
        } else {
            cmd.add(commandSegment);
        }
    }
    if (hasMetadataCommandArguments) {
        if (serializeMetadataCommandArgumentsToken) {
            // Find all metadata tokens and replace with encapsulated metadata
            for (int i = 0; i < cmd.size(); i++) {
                String commandSegment = cmd.get(i);
                if (commandSegment.indexOf(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN) != -1) {
                    cmd.set(i, commandSegment.replace(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN, serializeMetadata(commandMetadataSegments)));
                }
            }
        } else if (!replacedMetadataCommandArgumentsToken) {
            // Tack metadata onto the end of the cmd as arguments
            cmd.addAll(commandMetadataSegments);
        }
    }
    // Execute
    Process process;
    if (cmd.size() == 1) {
        // A single segment is passed as one command line so that
        // Runtime.exec(String) splits it into its arguments
        process = Runtime.getRuntime().exec(cmd.get(0));
    } else {
        process = Runtime.getRuntime().exec(cmd.toArray(new String[0]));
    }
    ByteArrayOutputStream stdErrOutputStream = new ByteArrayOutputStream();
    // Remembers an interruption while waiting for the process so that the
    // interrupt status can be restored once the clean up is done
    boolean interrupted = false;
    try {
        sendStdErrToOutputStream(process, stdErrOutputStream);
        if (inputToStdIn) {
            sendInputStreamToStdIn(inputStream, process);
        } else {
            // We're not writing to std in this case so close
            process.getOutputStream().close();
        }
        if (outputFromStdOut) {
            sendStdOutToOutputStream(process, outputStream);
        } else {
            tmp.dispose();
            try {
                process.waitFor();
            } catch (InterruptedException e) {
                interrupted = true;
            }
            // The command is finished, read the output file into the given output stream
            InputStream tempOutputFileInputStream = TikaInputStream.get(tempOutputFile);
            try {
                IOUtils.copy(tempOutputFileInputStream, outputStream);
            } finally {
                // Close before the file gets deleted below
                IOUtils.closeQuietly(tempOutputFileInputStream);
            }
        }
    } finally {
        if (outputFromStdOut) {
            try {
                process.waitFor();
            } catch (InterruptedException e) {
                interrupted = true;
            }
        } else {
            try {
                // Clean up temp output files
                tempOutputFile.delete();
            } catch (Exception ignored) {
                // Best effort only: a failed delete must not hide the result of the command
            }
        }
        if (!inputToStdIn) {
            // Close input file (and delete if created by up TemporaryResources.createTemporaryFile)
            IOUtils.closeQuietly(tikaInputStream);
        }
        IOUtils.closeQuietly(outputStream);
        IOUtils.closeQuietly(stdErrOutputStream);
        if (interrupted) {
            // Do not swallow the interruption, callers may rely on the interrupt status
            Thread.currentThread().interrupt();
        }
        if (process.exitValue() != 0) {
            throw new TikaException("There was an error executing the command line" + "\nExecutable Command:\n\n" + cmd + "\nExecutable Error:\n\n" + stdErrOutputStream.toString(UTF_8.name()));
        }
    }
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class ParserContainerExtractor method extract.
/**
 * Parses the given stream with the configured parser while a
 * {@link RecursiveParser}, built from the given extractor and handler, is
 * registered as the {@link Parser} of the parse context.
 * <p>
 * The SAX events of the parse are sent to a plain {@link DefaultHandler}
 * and the metadata are collected in a new {@link Metadata} instance that is
 * not returned.
 *
 * @param stream the container document to parse
 * @param recurseExtractor passed on to the {@link RecursiveParser}
 * @param handler passed on to the {@link RecursiveParser}
 * @throws IOException if thrown by the parser
 * @throws TikaException if thrown by the parser or if the parser fails with
 * a {@link SAXException}
 */
public void extract(TikaInputStream stream, ContainerExtractor recurseExtractor, EmbeddedResourceHandler handler) throws IOException, TikaException {
    final ParseContext parseContext = new ParseContext();
    final RecursiveParser recursingParser = new RecursiveParser(recurseExtractor, handler);
    parseContext.set(Parser.class, recursingParser);

    final DefaultHandler saxHandler = new DefaultHandler();
    final Metadata parseMetadata = new Metadata();
    try {
        parser.parse(stream, saxHandler, parseMetadata, parseContext);
    } catch (SAXException saxFailure) {
        throw new TikaException("Unexpected SAX exception", saxFailure);
    }
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class ParserDecorator method withFallbacks.
/**
 * Builds a virtual parser around the given parsers. On parse they are
 * tried one after the other, in the iteration order of the collection,
 * until the first one completes without throwing an exception.
 * <p>
 * The first parser of the collection (or {@link EmptyParser#INSTANCE} for
 * an empty collection) is used as the decorated parser. The supported
 * types reported by the returned parser are the given types.
 * <p>
 * Exceptions thrown by the individual parsers are ignored. If none of the
 * parsers succeeds, the parse call returns normally.
 * <p>
 * TODO Is this the right name?
 * TODO Is this the right place to put this? Should it be in CompositeParser? Elsewhere?
 * TODO Should we reset the Metadata if we try another parser?
 * TODO Should we reset the ContentHandler if we try another parser?
 * TODO Should we log/report failures anywhere?
 *
 * @param parsers the parsers to try, in order of preference
 * @param types the media types the returned parser reports as supported
 * @return the virtual parser
 * @deprecated Do not use until the TODOs are resolved, see TIKA-1509
 */
public static final Parser withFallbacks(final Collection<? extends Parser> parsers, final Set<MediaType> types) {
    final Parser preferred;
    if (parsers.isEmpty()) {
        preferred = EmptyParser.INSTANCE;
    } else {
        preferred = parsers.iterator().next();
    }
    return new ParserDecorator(preferred) {
        private static final long serialVersionUID = 1625187131782069683L;

        @Override
        public String getDecorationName() {
            return "With Fallback";
        }

        @Override
        public Set<MediaType> getSupportedTypes(ParseContext context) {
            return types;
        }

        @Override
        public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
            // A TikaInputStream is required so the input can be re-used if a parser fails.
            // Temporary resources are only needed (and disposed) if the stream gets wrapped here.
            final TemporaryResources resources;
            if (TikaInputStream.isTikaInputStream(stream)) {
                resources = null;
            } else {
                resources = new TemporaryResources();
            }
            try {
                final TikaInputStream rewindable = TikaInputStream.get(stream, resources);
                rewindable.getFile();
                // Try the parsers one by one and stop at the first success
                for (Parser candidate : parsers) {
                    rewindable.mark(-1);
                    if (parsedWithoutError(candidate, rewindable, handler, metadata, context)) {
                        return;
                    }
                    // Rewind the input for the next parser, if present
                    rewindable.reset();
                }
            } finally {
                if (resources != null) {
                    resources.dispose();
                }
            }
        }

        /** Runs a single parser and reports whether it completed without an exception. */
        private boolean parsedWithoutError(Parser candidate, TikaInputStream input, ContentHandler handler, Metadata metadata, ParseContext context) {
            try {
                candidate.parse(input, handler, metadata, context);
                return true;
            } catch (Exception e) {
                // TODO How to log / record this failure?
                return false;
            }
        }
    };
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class CompositeParser method parse.
/**
 * Hands the call over to the component parser selected for the given
 * metadata and parse context.
 * <p>
 * The class name of the selected parser (or of the wrapped parser if the
 * selected one is a {@link ParserDecorator}) is added to the metadata as
 * <code>X-Parsed-By</code> before parsing starts.
 * <p>
 * {@link RuntimeException}s as well as {@link IOException}s and
 * {@link SAXException}s that were not caused by the given input stream or
 * content handler are wrapped into {@link TikaException}s to better honor
 * the {@link Parser} contract.
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final Parser delegate = getParser(metadata, context);
    final TemporaryResources resources = new TemporaryResources();
    try {
        final TikaInputStream input = TikaInputStream.get(stream, resources);
        final TaggedContentHandler output;
        if (handler == null) {
            output = null;
        } else {
            output = new TaggedContentHandler(handler);
        }

        // Report the wrapped parser instead of the decorator itself
        final String parsedBy;
        if (delegate instanceof ParserDecorator) {
            parsedBy = ((ParserDecorator) delegate).getWrappedParser().getClass().getName();
        } else {
            parsedBy = delegate.getClass().getName();
        }
        metadata.add("X-Parsed-By", parsedBy);

        try {
            delegate.parse(input, output, metadata, context);
        } catch (SAXException saxFailure) {
            // Failures caused by the given handler are passed on unchanged
            if (output != null) {
                output.throwIfCauseOf(saxFailure);
            }
            throw new TikaException("TIKA-237: Illegal SAXException from " + delegate, saxFailure);
        } catch (IOException ioFailure) {
            // Failures caused by the given stream are passed on unchanged
            input.throwIfCauseOf(ioFailure);
            throw new TikaException("TIKA-198: Illegal IOException from " + delegate, ioFailure);
        } catch (RuntimeException unexpected) {
            throw new TikaException("Unexpected RuntimeException from " + delegate, unexpected);
        }
    } finally {
        resources.dispose();
    }
}
Aggregations