Search in sources :

Example 66 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class TikaMimeTypes method getMimeTypesPlain.

/**
 * Renders the entries returned by {@code getMediaTypes()} as plain text.
 * <p>
 * Each entry produces one line holding the media type, followed by indented
 * lines for every alias and, when they are non-null, the supertype and the
 * parser of that entry.
 *
 * @return the plain-text listing; lines are terminated with {@code "\n"}
 */
@GET
@Produces("text/plain")
public String getMimeTypesPlain() {
    // The buffer is local to this call, so the unsynchronized StringBuilder
    // is sufficient (StringBuffer would only add locking overhead).
    StringBuilder text = new StringBuilder();
    for (MediaTypeDetails type : getMediaTypes()) {
        text.append(type.type.toString()).append("\n");
        for (MediaType alias : type.aliases) {
            text.append("  alias:     ").append(alias).append("\n");
        }
        if (type.supertype != null) {
            text.append("  supertype: ").append(type.supertype.toString()).append("\n");
        }
        if (type.parser != null) {
            text.append("  parser:    ").append(type.parser).append("\n");
        }
    }
    return text.toString();
}
Also used : MediaType(org.apache.tika.mime.MediaType) Produces(javax.ws.rs.Produces) GET(javax.ws.rs.GET)

Example 67 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class TikaParsers method parserAsMap.

/**
 * Writes a description of the given parser into {@code details}.
 * <p>
 * The keys {@code "name"}, {@code "composite"} and {@code "decorated"} are
 * always set. A composite parser additionally gets a {@code "children"} list
 * holding one recursively built map per child parser; a non-composite parser
 * gets a {@code "supportedTypes"} list of media type strings, but only when
 * {@code withMimeTypes} is set.
 *
 * @param p             the parser description to render
 * @param withMimeTypes whether supported media types are listed for
 *                      non-composite parsers
 * @param details       the map that receives the entries
 */
private void parserAsMap(ParserDetails p, boolean withMimeTypes, Map<String, Object> details) {
    details.put("name", p.className);
    details.put("composite", p.isComposite);
    details.put("decorated", p.isDecorated);

    if (p.isComposite) {
        // Composite parsers are described through their children only.
        List<Map<String, Object>> children = new ArrayList<Map<String, Object>>();
        for (Parser child : p.childParsers) {
            Map<String, Object> childDetails = new HashMap<String, Object>();
            parserAsMap(new ParserDetails(child), withMimeTypes, childDetails);
            children.add(childDetails);
        }
        details.put("children", children);
        return;
    }

    if (!withMimeTypes) {
        return;
    }

    List<String> typeNames = new ArrayList<String>(p.supportedTypes.size());
    for (MediaType supportedType : p.supportedTypes) {
        typeNames.add(supportedType.toString());
    }
    details.put("supportedTypes", typeNames);
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) MediaType(org.apache.tika.mime.MediaType) ArrayList(java.util.ArrayList) List(java.util.List) HashMap(java.util.HashMap) Map(java.util.Map) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser)

Example 68 with MediaType

use of org.apache.tika.mime.MediaType in project stanbol by apache.

the class TikaEngine method computeEnhancements.

/**
 * Parses the content of the given item with the configured parser and adds
 * the results to the item.
 * <p>
 * Nothing is done when no media type could be determined, when the base media
 * type is plain text, or when the parser does not list the base media type as
 * supported. Otherwise the content is parsed into a plain-text sink and,
 * unless the source already is XHTML, also into an XHTML sink. The resulting
 * blobs are added as parts of the item, and the extracted metadata is written
 * to the item's metadata graph while holding the item's write lock.
 *
 * @param ci the content item to process
 * @throws EngineException if a content sink cannot be created, if the UTF-8
 *                         encoding is unsupported, or if parsing fails with
 *                         an {@code IOException}, {@code SAXException} or
 *                         {@code TikaException}
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    MediaTypeAndStream mtas = extractMediaType(ci);
    if (mtas.mediaType == null) {
        //unable to parse and detect content type
        return;
    }
    MediaType plainMediaType = mtas.mediaType.getBaseType();
    if (plainMediaType.equals(MediaType.TEXT_PLAIN)) {
        //we need not to process plain text!
        return;
    }
    // NOTE(review): mtas.in is not closed on the early-return paths above nor
    // when the type is unsupported - confirm whether extractMediaType can hand
    // out a stream that needs closing in those cases.
    final ParseContext context = new ParseContext();
    context.set(Parser.class, parser);
    Set<MediaType> supported = parser.getSupportedTypes(context);
    if (supported.contains(plainMediaType)) {
        final InputStream in;
        if (mtas.in == null) {
            in = ci.getStream();
        } else {
            in = mtas.in;
        }
        final Metadata metadata = new Metadata();
        //set the already parsed contentType
        metadata.set(Metadata.CONTENT_TYPE, mtas.mediaType.toString());
        //also explicitly set the charset as contentEncoding
        String charset = mtas.mediaType.getParameters().get("charset");
        if (charset != null) {
            metadata.set(Metadata.CONTENT_ENCODING, charset);
        }
        ContentSink plainTextSink;
        try {
            plainTextSink = ciFactory.createContentSink(TEXT_PLAIN + "; charset=" + UTF8.name());
        } catch (IOException e) {
            //close the input stream
            IOUtils.closeQuietly(in);
            throw new EngineException("Error while initialising Blob for writing the text/plain version of the parsed content", e);
        }
        final Writer plainTextWriter = new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8);
        // BodyContentHandler: only the body; PlainTextHandler is created with
        // its second argument (skip ignorable) set to false
        final ContentHandler textHandler = new BodyContentHandler(
                new PlainTextHandler(plainTextWriter, false, skipLinebreaks));
        final ToXMLContentHandler xhtmlHandler;
        final ContentHandler mainHandler;
        ContentSink xhtmlSink = null;
        try {
            if (!plainMediaType.equals(XHTML)) {
                //do not parse XHTML from XHTML
                try {
                    xhtmlSink = ciFactory.createContentSink(XHTML + "; charset=" + UTF8.name());
                } catch (IOException e) {
                    throw new EngineException("Error while initialising Blob for writing the application/xhtml+xml version of the parsed content", e);
                }
                try {
                    xhtmlHandler = new ToXMLContentHandler(xhtmlSink.getOutputStream(), UTF8.name());
                } catch (UnsupportedEncodingException e) {
                    throw new EngineException("This system does not support the encoding " + UTF8, e);
                }
                mainHandler = new MultiHandler(textHandler, xhtmlHandler);
            } else {
                mainHandler = textHandler;
                xhtmlHandler = null;
                xhtmlSink = null;
            }
            try {
                AccessController.doPrivileged(new PrivilegedExceptionAction<Object>() {

                    public Object run() throws IOException, SAXException, TikaException {
                        /*
                         * We need to replace the context Classloader with the Bundle ClassLoader
                         * to ensure that Singleton instances of XML frameworks (such as node4j)
                         * do not leak into the OSGI environment.
                         *
                         * Most Java XML libs prefer to load implementations by using the
                         * {@link Thread#getContextClassLoader()}. However OSGI has no control over
                         * this {@link ClassLoader}. Because of that there can be situations where
                         * Interfaces are loaded via the Bundle Classloader and the implementations
                         * are taken from the context Classloader. What can cause
                         * {@link ClassCastException}, {@link ExceptionInInitializerError}s, ...
                         *
                         * Setting the context Classloader to the Bundle classloader helps to avoid
                         * those situations.
                         */
                        ClassLoader contextClassLoader = updateContextClassLoader();
                        try {
                            parser.parse(in, mainHandler, metadata, context);
                        } finally {
                            //reset the previous context ClassLoader
                            Thread.currentThread().setContextClassLoader(contextClassLoader);
                        }
                        return null;
                    }
                });
            } catch (PrivilegedActionException pae) {
                Exception e = pae.getException();
                if (e instanceof IOException || e instanceof SAXException || e instanceof TikaException) {
                    throw new EngineException("Unable to convert ContentItem " + ci.getUri() + " with mimeType '" + ci.getMimeType() + "' to " + "plain text!", e);
                } else {
                    //runtime exception
                    throw RuntimeException.class.cast(e);
                }
            }
        } finally {
            //ensure that the input stream and the writers are closed correctly
            IOUtils.closeQuietly(in);
            IOUtils.closeQuietly(plainTextWriter);
            if (xhtmlSink != null) {
                IOUtils.closeQuietly(xhtmlSink.getOutputStream());
            }
        }
        //both parts share the same random suffix in their URIs
        String random = randomUUID().toString();
        IRI textBlobUri = new IRI("urn:tika:text:" + random);
        ci.addPart(textBlobUri, plainTextSink.getBlob());
        if (xhtmlHandler != null) {
            IRI xhtmlBlobUri = new IRI("urn:tika:xhtml:" + random);
            ci.addPart(xhtmlBlobUri, xhtmlSink.getBlob());
        }
        //log the extracted metadata
        if (log.isInfoEnabled()) {
            for (String name : metadata.names()) {
                log.info("{}: {}", name, Arrays.toString(metadata.getValues(name)));
            }
        }
        //add the extracted metadata to the graph under the write lock
        ci.getLock().writeLock().lock();
        try {
            Graph graph = ci.getMetadata();
            IRI id = ci.getUri();
            Set<String> mapped = ontologyMappings.apply(graph, id, metadata);
            if (includeUnmappedProperties) {
                Set<String> unmapped = new HashSet<String>(Arrays.asList(metadata.names()));
                unmapped.removeAll(mapped);
                for (String name : unmapped) {
                    //unmapped names are only added if they contain a ':' or
                    //if all unmapped properties are requested
                    if (name.indexOf(':') >= 0 || includeAllUnmappedProperties) {
                        IRI prop = new IRI(new StringBuilder(TIKA_URN_PREFIX).append(name).toString());
                        for (String value : metadata.getValues(name)) {
                            //TODO: without the Property for the name we have no datatype
                            //      information ... so we add PlainLiterals for now
                            graph.add(new TripleImpl(id, prop, new PlainLiteralImpl(value)));
                        }
                    }
                }
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
//else not supported format
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) IRI(org.apache.clerezza.commons.rdf.IRI) Metadata(org.apache.tika.metadata.Metadata) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) SAXException(org.xml.sax.SAXException) MediaType(org.apache.tika.mime.MediaType) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) HashSet(java.util.HashSet) MultiHandler(org.apache.stanbol.enhancer.engines.tika.handler.MultiHandler) ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) TikaException(org.apache.tika.exception.TikaException) PrivilegedActionException(java.security.PrivilegedActionException) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) BufferedInputStream(java.io.BufferedInputStream) InputStream(java.io.InputStream) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) ConfigurationException(org.osgi.service.cm.ConfigurationException) SAXException(org.xml.sax.SAXException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) TikaException(org.apache.tika.exception.TikaException) PrivilegedActionException(java.security.PrivilegedActionException) IOException(java.io.IOException) Graph(org.apache.clerezza.commons.rdf.Graph) PlainTextHandler(org.apache.stanbol.enhancer.engines.tika.handler.PlainTextHandler) ParseContext(org.apache.tika.parser.ParseContext) OutputStreamWriter(java.io.OutputStreamWriter) ContentSink(org.apache.stanbol.enhancer.servicesapi.ContentSink) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter)

Example 69 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class EmbeddedDocumentUtil method getExtension.

/**
 * Determines a file extension for the given stream.
 * <p>
 * The content type recorded in {@code metadata} is looked up first. If it is
 * absent or cannot be resolved, the configured detector is run on the stream
 * and, when that succeeds, the detected type is written back to
 * {@code metadata}. Lookup and detection failures are ignored on purpose.
 *
 * @param is       the stream handed to the detector when the metadata gives
 *                 no usable content type
 * @param metadata the metadata read for, and possibly updated with, the
 *                 content type
 * @return the extension reported by the resolved mime type, or
 *         {@code ".bin"} when no mime type could be resolved
 */
public String getExtension(TikaInputStream is, Metadata metadata) {
    final String declaredType = metadata.get(Metadata.CONTENT_TYPE);
    final TikaConfig tikaConfig = getConfig();
    final MimeTypes registry = tikaConfig.getMimeRepository();

    MimeType resolved = null;
    if (declaredType != null) {
        try {
            resolved = registry.forName(declaredType);
        } catch (MimeTypeException ignored) {
            // best effort: fall through to detection
        }
    }
    if (resolved != null) {
        // The declared type was usable; the metadata stays untouched.
        return resolved.getExtension();
    }

    boolean wasDetected = false;
    final Detector detector = tikaConfig.getDetector();
    try {
        MediaType sniffed = detector.detect(is, metadata);
        resolved = registry.forName(sniffed.toString());
        wasDetected = true;
        is.reset();
    } catch (IOException ignored) {
        // best effort: keep whatever was resolved so far
    } catch (MimeTypeException ignored) {
        // best effort: keep whatever was resolved so far
    }

    if (resolved == null) {
        return ".bin";
    }
    if (wasDetected) {
        //set or correct the mime type
        metadata.set(Metadata.CONTENT_TYPE, resolved.toString());
    }
    return resolved.getExtension();
}
Also used : Detector(org.apache.tika.detect.Detector) TikaConfig(org.apache.tika.config.TikaConfig) MimeTypeException(org.apache.tika.mime.MimeTypeException) MediaType(org.apache.tika.mime.MediaType) IOException(java.io.IOException) MimeTypes(org.apache.tika.mime.MimeTypes) MimeType(org.apache.tika.mime.MimeType)

Example 70 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class ParserDecorator method withFallbacks.

/**
 * Wraps the given parsers in one virtual parser that tries them in
 * iteration order and stops at the first one that parses without throwing.
 * <p>
 * The returned decorator is built around the first parser of the collection
 * (or {@code EmptyParser.INSTANCE} if the collection is empty) and reports
 * {@code types} as its supported types. Exceptions thrown by an individual
 * parser are discarded; if every parser fails, {@code parse} returns normally.
 * <p>
 * TODO Is this the right name?
 * TODO Is this the right place to put this? Should it be in CompositeParser? Elsewhere?
 * TODO Should we reset the Metadata if we try another parser?
 * TODO Should we reset the ContentHandler if we try another parser?
 * TODO Should we log/report failures anywhere?
 *
 * @param parsers the parsers to try, in preference order
 * @param types   the media types the returned parser reports as supported
 * @return the decorating parser
 * @deprecated Do not use until the TODOs are resolved, see TIKA-1509
 */
public static final Parser withFallbacks(final Collection<? extends Parser> parsers, final Set<MediaType> types) {
    final Parser primary;
    if (parsers.isEmpty()) {
        primary = EmptyParser.INSTANCE;
    } else {
        primary = parsers.iterator().next();
    }

    return new ParserDecorator(primary) {

        private static final long serialVersionUID = 1625187131782069683L;

        @Override
        public String getDecorationName() {
            return "With Fallback";
        }

        @Override
        public Set<MediaType> getSupportedTypes(ParseContext context) {
            return types;
        }

        @Override
        public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
            // A TikaInputStream is required so the input can be re-used after a
            // failed attempt. Temporary resources are only created (and later
            // disposed) when the caller did not already pass a TikaInputStream.
            final boolean callerOwnsTikaStream = TikaInputStream.isTikaInputStream(stream);
            final TemporaryResources scratch = callerOwnsTikaStream ? null : new TemporaryResources();
            try {
                TikaInputStream rewindable = TikaInputStream.get(stream, scratch);
                // presumably makes the stream file-backed so it can be rewound - TODO confirm
                rewindable.getFile();
                // Give every parser a chance, in order
                for (Parser candidate : parsers) {
                    rewindable.mark(-1);
                    try {
                        candidate.parse(rewindable, handler, metadata, context);
                        return;
                    } catch (Exception e) {
                        // TODO How to log / record this failure?
                    }
                    // Rewind so the next parser, if any, starts from the mark
                    rewindable.reset();
                }
            } finally {
                if (scratch != null) {
                    scratch.dispose();
                }
            }
        }
    };
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) TemporaryResources(org.apache.tika.io.TemporaryResources) MediaType(org.apache.tika.mime.MediaType) TikaInputStream(org.apache.tika.io.TikaInputStream) ContentHandler(org.xml.sax.ContentHandler) TikaException(org.apache.tika.exception.TikaException) SAXException(org.xml.sax.SAXException) IOException(java.io.IOException)

Aggregations

MediaType (org.apache.tika.mime.MediaType)95 Metadata (org.apache.tika.metadata.Metadata)29 Test (org.junit.Test)28 InputStream (java.io.InputStream)26 IOException (java.io.IOException)18 Parser (org.apache.tika.parser.Parser)18 TikaInputStream (org.apache.tika.io.TikaInputStream)17 ParseContext (org.apache.tika.parser.ParseContext)17 TikaException (org.apache.tika.exception.TikaException)14 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)14 CompositeParser (org.apache.tika.parser.CompositeParser)13 ContentHandler (org.xml.sax.ContentHandler)13 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)12 Detector (org.apache.tika.detect.Detector)11 TikaTest (org.apache.tika.TikaTest)10 HashSet (java.util.HashSet)8 ByteArrayInputStream (java.io.ByteArrayInputStream)7 ArrayList (java.util.ArrayList)7 TikaConfig (org.apache.tika.config.TikaConfig)7 MediaTypeRegistry (org.apache.tika.mime.MediaTypeRegistry)7