use of org.apache.tika.mime.MediaType in project tika by apache.
the class TikaMimeTypes method getMimeTypesPlain.
/**
 * Renders all media types returned by {@code getMediaTypes()} as a plain-text listing.
 *
 * <p>Each media type is written on its own line, followed by one line per alias and,
 * when present, one line for its supertype and one for its parser.
 *
 * @return the complete listing; empty if {@code getMediaTypes()} yields no entries
 */
@GET
@Produces("text/plain")
public String getMimeTypesPlain() {
    // The buffer never leaves this method, so the unsynchronized StringBuilder
    // is the right tool (StringBuffer would lock on every append for nothing).
    StringBuilder text = new StringBuilder();
    for (MediaTypeDetails type : getMediaTypes()) {
        text.append(type.type.toString()).append("\n");
        for (MediaType alias : type.aliases) {
            text.append(" alias: ").append(alias).append("\n");
        }
        // supertype and parser are optional and skipped when absent
        if (type.supertype != null) {
            text.append(" supertype: ").append(type.supertype.toString()).append("\n");
        }
        if (type.parser != null) {
            text.append(" parser: ").append(type.parser).append("\n");
        }
    }
    return text.toString();
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class TikaParsers method parserAsMap.
/**
 * Fills {@code details} with a map representation of the given parser.
 *
 * <p>The keys {@code name}, {@code composite} and {@code decorated} are always written.
 * A composite parser additionally gets a {@code children} list holding one such map per
 * child parser (built recursively). A non-composite parser gets a {@code supportedTypes}
 * list of media type strings, but only when {@code withMimeTypes} is set.
 *
 * @param p             the parser description to convert
 * @param withMimeTypes whether non-composite parsers should list their supported types
 * @param details       the map that receives the entries
 */
private void parserAsMap(ParserDetails p, boolean withMimeTypes, Map<String, Object> details) {
    details.put("name", p.className);
    details.put("composite", p.isComposite);
    details.put("decorated", p.isDecorated);

    // Leaf parser: optionally list its media types, then we are done
    if (!p.isComposite) {
        if (withMimeTypes) {
            List<String> typeNames = new ArrayList<String>(p.supportedTypes.size());
            for (MediaType supportedType : p.supportedTypes) {
                typeNames.add(supportedType.toString());
            }
            details.put("supportedTypes", typeNames);
        }
        return;
    }

    // Composite parser: describe every child recursively
    List<Map<String, Object>> children = new ArrayList<Map<String, Object>>();
    for (Parser child : p.childParsers) {
        Map<String, Object> childDetails = new HashMap<String, Object>();
        parserAsMap(new ParserDetails(child), withMimeTypes, childDetails);
        children.add(childDetails);
    }
    details.put("children", children);
}
use of org.apache.tika.mime.MediaType in project stanbol by apache.
the class TikaEngine method computeEnhancements.
/**
 * Converts the content of the given ContentItem with the configured Tika parser and
 * attaches the results to the ContentItem.
 *
 * <p>Nothing is done when the media type cannot be determined, when the base media type
 * is {@code text/plain}, or when the parser does not list the base media type as
 * supported. Otherwise the content is parsed into a plain text Blob and, unless the
 * content is already XHTML, also into an XHTML Blob. Both are added as parts using the
 * URIs {@code urn:tika:text:<uuid>} and {@code urn:tika:xhtml:<uuid>} (same UUID).
 * Finally the metadata collected during parsing is mapped into the metadata graph of
 * the ContentItem while holding its write lock.
 *
 * @param ci the ContentItem to process
 * @throws EngineException if a content sink cannot be created, if the UTF-8 XHTML
 *         handler cannot be set up, or if parsing fails with an
 *         {@link IOException}, {@link SAXException} or {@link TikaException}
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    MediaTypeAndStream mtas = extractMediaType(ci);
    if (mtas.mediaType == null) {
        // unable to parse and detect content type
        return;
    }
    MediaType plainMediaType = mtas.mediaType.getBaseType();
    if (plainMediaType.equals(MediaType.TEXT_PLAIN)) {
        // plain text does not need to be converted
        return;
    }
    final ParseContext context = new ParseContext();
    context.set(Parser.class, parser);
    Set<MediaType> supported = parser.getSupportedTypes(context);
    if (supported.contains(plainMediaType)) {
        // reuse the stream opened during media type extraction when there is one
        final InputStream in;
        if (mtas.in == null) {
            in = ci.getStream();
        } else {
            in = mtas.in;
        }
        final Metadata metadata = new Metadata();
        // set the already parsed contentType
        metadata.set(Metadata.CONTENT_TYPE, mtas.mediaType.toString());
        // also explicitly set the charset as contentEncoding
        String charset = mtas.mediaType.getParameters().get("charset");
        if (charset != null) {
            metadata.set(Metadata.CONTENT_ENCODING, charset);
        }
        ContentSink plainTextSink;
        try {
            plainTextSink = ciFactory.createContentSink(TEXT_PLAIN + "; charset=" + UTF8.name());
        } catch (IOException e) {
            // close the input stream, as the finally block below is not reached
            IOUtils.closeQuietly(in);
            throw new EngineException("Error while initialising Blob for "
                    + "writing the text/plain version of the parsed content", e);
        }
        final Writer plainTextWriter = new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8);
        // only the body is written as plain text
        final ContentHandler textHandler = new BodyContentHandler(
                new PlainTextHandler(plainTextWriter, false, skipLinebreaks));
        final ToXMLContentHandler xhtmlHandler;
        final ContentHandler mainHandler;
        ContentSink xhtmlSink = null;
        try {
            if (!plainMediaType.equals(XHTML)) {
                // do not parse XHTML from XHTML
                try {
                    xhtmlSink = ciFactory.createContentSink(XHTML + "; charset=" + UTF8.name());
                } catch (IOException e) {
                    throw new EngineException("Error while initialising Blob for "
                            + "writing the application/xhtml+xml version of the parsed content", e);
                }
                try {
                    xhtmlHandler = new ToXMLContentHandler(xhtmlSink.getOutputStream(), UTF8.name());
                } catch (UnsupportedEncodingException e) {
                    throw new EngineException("This system does not support the encoding " + UTF8, e);
                }
                mainHandler = new MultiHandler(textHandler, xhtmlHandler);
            } else {
                mainHandler = textHandler;
                xhtmlHandler = null;
                xhtmlSink = null;
            }
            try {
                AccessController.doPrivileged(new PrivilegedExceptionAction<Object>() {
                    public Object run() throws IOException, SAXException, TikaException {
                        /*
                         * We need to replace the context Classloader with the Bundle ClassLoader
                         * to ensure that Singleton instances of XML frameworks (such as node4j)
                         * do not leak into the OSGI environment.
                         *
                         * Most Java XML libs prefer to load implementations by using the
                         * {@link Thread#getContextClassLoader()}. However OSGI has no control over
                         * this {@link ClassLoader}. Because of that there can be situations where
                         * Interfaces are loaded via the Bundle Classloader and the implementations
                         * are taken from the context Classloader. This can cause
                         * {@link ClassCastException}, {@link ExceptionInInitializerError}s, ...
                         *
                         * Setting the context Classloader to the Bundle classloader helps to avoid
                         * those situations.
                         */
                        ClassLoader contextClassLoader = updateContextClassLoader();
                        try {
                            parser.parse(in, mainHandler, metadata, context);
                        } finally {
                            // reset the previous context ClassLoader
                            Thread.currentThread().setContextClassLoader(contextClassLoader);
                        }
                        return null;
                    }
                });
            } catch (PrivilegedActionException pae) {
                Exception e = pae.getException();
                if (e instanceof IOException || e instanceof SAXException || e instanceof TikaException) {
                    throw new EngineException("Unable to convert ContentItem " + ci.getUri()
                            + " with mimeType '" + ci.getMimeType() + "' to " + "plain text!", e);
                } else {
                    // anything else is rethrown as a runtime exception
                    throw RuntimeException.class.cast(e);
                }
            }
        } finally {
            // ensure that the input and the writers are closed correctly
            IOUtils.closeQuietly(in);
            IOUtils.closeQuietly(plainTextWriter);
            if (xhtmlSink != null) {
                IOUtils.closeQuietly(xhtmlSink.getOutputStream());
            }
        }
        // both parts share the same random UUID in their URIs
        String random = randomUUID().toString();
        IRI textBlobUri = new IRI("urn:tika:text:" + random);
        ci.addPart(textBlobUri, plainTextSink.getBlob());
        if (xhtmlHandler != null) {
            IRI xhtmlBlobUri = new IRI("urn:tika:xhtml:" + random);
            ci.addPart(xhtmlBlobUri, xhtmlSink.getBlob());
        }
        // log the extracted metadata
        if (log.isInfoEnabled()) {
            for (String name : metadata.names()) {
                log.info("{}: {}", name, Arrays.toString(metadata.getValues(name)));
            }
        }
        // add the extracted metadata to the graph of the ContentItem
        ci.getLock().writeLock().lock();
        try {
            Graph graph = ci.getMetadata();
            IRI id = ci.getUri();
            Set<String> mapped = ontologyMappings.apply(graph, id, metadata);
            if (includeUnmappedProperties) {
                Set<String> unmapped = new HashSet<String>(Arrays.asList(metadata.names()));
                unmapped.removeAll(mapped);
                for (String name : unmapped) {
                    // unmapped names are only added when they contain a ':' or when
                    // all unmapped properties are requested
                    if (name.indexOf(':') >= 0 || includeAllUnmappedProperties) {
                        IRI prop = new IRI(new StringBuilder(TIKA_URN_PREFIX).append(name).toString());
                        for (String value : metadata.getValues(name)) {
                            //TODO: without the Property for the name we have no datatype
                            // information ... so we add PlainLiterals for now
                            graph.add(new TripleImpl(id, prop, new PlainLiteralImpl(value)));
                        }
                    }
                }
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
    // else not supported format
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class EmbeddedDocumentUtil method getExtension.
/**
 * Determines the file extension to use for the given stream.
 *
 * <p>The content type found in {@code metadata} is looked up in the mime repository
 * first. If it is missing or cannot be resolved, the configured detector is run on the
 * stream and the detected type is written back to {@code metadata} as the content type.
 * Lookup and detection failures are ignored on purpose.
 *
 * @param is       the stream handed to the detector when detection is needed
 * @param metadata source of the declared content type; updated when a type was detected
 * @return the extension reported by the resolved mime type, or {@code ".bin"} when no
 *         mime type could be resolved
 */
public String getExtension(TikaInputStream is, Metadata metadata) {
    String declaredType = metadata.get(Metadata.CONTENT_TYPE);
    TikaConfig config = getConfig();
    MimeTypes repository = config.getMimeRepository();

    // 1. trust the declared content type when the repository can resolve it
    MimeType resolved = null;
    if (declaredType != null) {
        try {
            resolved = repository.forName(declaredType);
        } catch (MimeTypeException e) {
            // best effort: fall through to detection
        }
    }
    if (resolved != null) {
        return resolved.getExtension();
    }

    // 2. otherwise let the detector decide
    boolean detected = false;
    Detector detector = config.getDetector();
    try {
        MediaType detectedType = detector.detect(is, metadata);
        resolved = repository.forName(detectedType.toString());
        detected = true;
        is.reset();
    } catch (IOException e) {
        // best effort: keep whatever was resolved so far
    } catch (MimeTypeException e) {
        // best effort: keep whatever was resolved so far
    }

    if (resolved == null) {
        return ".bin";
    }
    if (detected) {
        // set or correct the mime type
        metadata.set(Metadata.CONTENT_TYPE, resolved.toString());
    }
    return resolved.getExtension();
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class ParserDecorator method withFallbacks.
/**
 * Wraps the given parsers in a single virtual parser that tries them one after
 * another, in iteration order, until one of them completes without throwing.
 * Exceptions thrown by individual parsers are discarded; if every parser fails the
 * virtual parser returns normally without reporting the failures.
 * <p>
 * Open questions:
 * TODO Is this the right name?
 * TODO Is this the right place to put this? Should it be in CompositeParser? Elsewhere?
 * TODO Should we reset the Metadata if we try another parser?
 * TODO Should we reset the ContentHandler if we try another parser?
 * TODO Should we log/report failures anywhere?
 *
 * @param parsers the candidate parsers, in order of preference
 * @param types the media types the returned parser reports as supported
 * @return the decorating parser
 * @deprecated Do not use until the TODOs are resolved, see TIKA-1509
 */
public static final Parser withFallbacks(final Collection<? extends Parser> parsers, final Set<MediaType> types) {
    // The decorator needs a delegate: the first candidate, or the empty parser if there is none
    Parser delegate = EmptyParser.INSTANCE;
    if (!parsers.isEmpty()) {
        delegate = parsers.iterator().next();
    }
    return new ParserDecorator(delegate) {
        private static final long serialVersionUID = 1625187131782069683L;

        @Override
        public String getDecorationName() {
            return "With Fallback";
        }

        @Override
        public Set<MediaType> getSupportedTypes(ParseContext context) {
            return types;
        }

        @Override
        public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
            // Temporary resources are only created (and later disposed) here when the
            // caller did not already pass in a TikaInputStream
            TemporaryResources ownedResources = null;
            if (!TikaInputStream.isTikaInputStream(stream)) {
                ownedResources = new TemporaryResources();
            }
            try {
                // A TikaInputStream is required so the input can be re-used if parsing fails
                TikaInputStream rewindable = TikaInputStream.get(stream, ownedResources);
                rewindable.getFile();
                for (Parser candidate : parsers) {
                    rewindable.mark(-1);
                    try {
                        candidate.parse(rewindable, handler, metadata, context);
                        // first success wins
                        return;
                    } catch (Exception e) {
                        // TODO How to log / record this failure?
                    }
                    // rewind so the next candidate, if any, starts from the marked position
                    rewindable.reset();
                }
            } finally {
                if (ownedResources != null) {
                    ownedResources.dispose();
                }
            }
        }
    };
}
Aggregations