Use of org.apache.tika.sax.ToXMLContentHandler in the Apache Tika project.
Class FileResourceConsumer, method getXMLifiedLogMsg.
/**
 * Use this for structured output that captures resourceId and other attributes.
 * <p>
 * The message is a single XML element named {@code type}. It always carries a
 * {@code resourceId} attribute, followed by one attribute per key/value pair
 * in {@code attrs}. If {@code t} is not null, its stack trace becomes the
 * element's text content.
 *
 * @param type entity name for exception; used as the element name
 * @param resourceId resourceId string
 * @param t throwable can be null
 * @param attrs (array of key0, value0, key1, value1, etc.); a trailing key
 *              without a matching value is ignored
 * @return the string form of the handler after writing; presumably the
 *         serialized XML (possibly partial if a SAXException occurred)
 */
protected String getXMLifiedLogMsg(String type, String resourceId, Throwable t, String... attrs) {
    ContentHandler toXML = new ToXMLContentHandler();
    SafeContentHandler handler = new SafeContentHandler(toXML);
    AttributesImpl attributes = new AttributesImpl();
    attributes.addAttribute("", "resourceId", "resourceId", "", resourceId);
    // attrs holds (key, value) pairs, so advance two entries at a time;
    // stepping by one would also emit every value as an attribute name.
    for (int i = 0; i + 1 < attrs.length; i += 2) {
        attributes.addAttribute("", attrs[i], attrs[i], "", attrs[i + 1]);
    }
    try {
        handler.startDocument();
        handler.startElement("", type, type, attributes);
        if (t != null) {
            StringWriter stackWriter = new StringWriter();
            PrintWriter printWriter = new PrintWriter(stackWriter);
            t.printStackTrace(printWriter);
            // push anything buffered by the PrintWriter into the StringWriter
            printWriter.flush();
            char[] chars = stackWriter.toString().toCharArray();
            handler.characters(chars, 0, chars.length);
        }
        handler.endElement("", type, type);
        handler.endDocument();
    } catch (SAXException e) {
        // NOTE(review): this logs the original throwable t (which may be null),
        // not the SAXException e -- confirm that dropping e is intended.
        LOG.warn("error writing xml stream for: {}", resourceId, t);
    }
    return handler.toString();
}
Use of org.apache.tika.sax.ToXMLContentHandler in the Apache Stanbol project.
Class TikaEngine, method computeEnhancements.
/**
 * Parses the content of the given ContentItem and adds the extracted text
 * and metadata to it.
 * <p>
 * Nothing is done if no media type could be determined, if the content is
 * already plain text, or if the parser does not support the media type.
 * Otherwise the content is parsed into a {@code text/plain} sink and, unless
 * the content already is XHTML, also into an XHTML sink. The resulting blobs
 * are added as parts named {@code urn:tika:text:<uuid>} and
 * {@code urn:tika:xhtml:<uuid>}. Finally the extracted metadata are mapped
 * into the ContentItem's metadata graph while holding its write lock.
 *
 * @param ci the ContentItem to process
 * @throws EngineException if a content sink cannot be created, if UTF-8 is
 *         not supported, or if parsing fails with an IOException,
 *         SAXException or TikaException
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    MediaTypeAndStream mtas = extractMediaType(ci);
    if (mtas.mediaType == null) {
        //unable to parse and detect content type
        return;
    }
    MediaType plainMediaType = mtas.mediaType.getBaseType();
    if (plainMediaType.equals(MediaType.TEXT_PLAIN)) {
        //we need not to process plain text!
        return;
    }
    final ParseContext context = new ParseContext();
    context.set(Parser.class, parser);
    Set<MediaType> supported = parser.getSupportedTypes(context);
    if (supported.contains(plainMediaType)) {
        //reuse the stream opened while extracting the media type, if any
        final InputStream in;
        if (mtas.in == null) {
            in = ci.getStream();
        } else {
            in = mtas.in;
        }
        final Metadata metadata = new Metadata();
        //set the already parsed contentType
        metadata.set(Metadata.CONTENT_TYPE, mtas.mediaType.toString());
        //also explicitly set the charset as contentEncoding
        String charset = mtas.mediaType.getParameters().get("charset");
        if (charset != null) {
            metadata.set(Metadata.CONTENT_ENCODING, charset);
        }
        ContentSink plainTextSink;
        try {
            plainTextSink = ciFactory.createContentSink(TEXT_PLAIN + "; charset=" + UTF8.name());
        } catch (IOException e) {
            //close the input stream
            IOUtils.closeQuietly(in);
            throw new EngineException("Error while initialising Blob for "
                    + "writing the text/plain version of the parsed content", e);
        }
        final Writer plainTextWriter = new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8);
        //only the Body; the 'false' argument is presumably "skip ignoreable"
        final ContentHandler textHandler = new BodyContentHandler(
                new PlainTextHandler(plainTextWriter, false, skipLinebreaks));
        final ToXMLContentHandler xhtmlHandler;
        final ContentHandler mainHandler;
        ContentSink xhtmlSink = null;
        try {
            if (!plainMediaType.equals(XHTML)) {
                //do not parse XHTML from XHTML
                try {
                    xhtmlSink = ciFactory.createContentSink(XHTML + "; charset=" + UTF8.name());
                } catch (IOException e) {
                    throw new EngineException("Error while initialising Blob for "
                            + "writing the application/xhtml+xml version of the parsed content", e);
                }
                try {
                    xhtmlHandler = new ToXMLContentHandler(xhtmlSink.getOutputStream(), UTF8.name());
                } catch (UnsupportedEncodingException e) {
                    throw new EngineException("This system does not support the encoding " + UTF8, e);
                }
                //feed the parser events to both the text and the XHTML handler
                mainHandler = new MultiHandler(textHandler, xhtmlHandler);
            } else {
                mainHandler = textHandler;
                xhtmlHandler = null;
                xhtmlSink = null;
            }
            try {
                AccessController.doPrivileged(new PrivilegedExceptionAction<Object>() {
                    public Object run() throws IOException, SAXException, TikaException {
                        /*
                         * We need to replace the context Classloader with the Bundle ClassLoader
                         * to ensure that Singleton instances of XML frameworks (such as node4j)
                         * do not leak into the OSGI environment.
                         *
                         * Most Java XML libs prefer to load implementations by using the
                         * {@link Thread#getContextClassLoader()}. However OSGI has no control over
                         * this {@link ClassLoader}. Because of that there can be situations where
                         * Interfaces are loaded via the Bundle Classloader and the implementations
                         * are taken from the context Classloader. What can cause
                         * {@link ClassCastException}, {@link ExceptionInInitializerError}s, ...
                         *
                         * Setting the context Classloader to the Bundle classloader helps to avoid
                         * those situations.
                         */
                        ClassLoader contextClassLoader = updateContextClassLoader();
                        try {
                            parser.parse(in, mainHandler, metadata, context);
                        } finally {
                            //reset the previous context ClassLoader
                            Thread.currentThread().setContextClassLoader(contextClassLoader);
                        }
                        return null;
                    }
                });
            } catch (PrivilegedActionException pae) {
                //unwrap: checked parse failures become EngineExceptions
                Exception e = pae.getException();
                if (e instanceof IOException || e instanceof SAXException || e instanceof TikaException) {
                    throw new EngineException("Unable to convert ContentItem " + ci.getUri() + " with mimeType '" + ci.getMimeType() + "' to " + "plain text!", e);
                } else {
                    //runtime exception
                    throw RuntimeException.class.cast(e);
                }
            }
        } finally {
            //ensure that the writers are closed correctly
            IOUtils.closeQuietly(in);
            IOUtils.closeQuietly(plainTextWriter);
            if (xhtmlSink != null) {
                IOUtils.closeQuietly(xhtmlSink.getOutputStream());
            }
        }
        //both parts share the same random suffix
        String random = randomUUID().toString();
        IRI textBlobUri = new IRI("urn:tika:text:" + random);
        ci.addPart(textBlobUri, plainTextSink.getBlob());
        if (xhtmlHandler != null) {
            IRI xhtmlBlobUri = new IRI("urn:tika:xhtml:" + random);
            ci.addPart(xhtmlBlobUri, xhtmlSink.getBlob());
        }
        //log the extracted metadata
        if (log.isInfoEnabled()) {
            for (String name : metadata.names()) {
                log.info("{}: {}", name, Arrays.toString(metadata.getValues(name)));
            }
        }
        //add the extracted metadata (requires the write lock on the ContentItem)
        ci.getLock().writeLock().lock();
        try {
            Graph graph = ci.getMetadata();
            IRI id = ci.getUri();
            Set<String> mapped = ontologyMappings.apply(graph, id, metadata);
            if (includeUnmappedProperties) {
                Set<String> unmapped = new HashSet<String>(Arrays.asList(metadata.names()));
                unmapped.removeAll(mapped);
                for (String name : unmapped) {
                    //only names containing ':' unless all unmapped properties are included
                    if (name.indexOf(':') >= 0 || includeAllUnmappedProperties) {
                        IRI prop = new IRI(new StringBuilder(TIKA_URN_PREFIX).append(name).toString());
                        for (String value : metadata.getValues(name)) {
                            //TODO: without the Property for the name we have no datatype
                            //      information ... so we add PlainLiterals for now
                            graph.add(new TripleImpl(id, prop, new PlainLiteralImpl(value)));
                        }
                    }
                }
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
    //else not supported format
}
Aggregations