Search in sources :

Example 1 with MediaType

use of org.apache.tika.mime.MediaType in project che by eclipse.

the class MediaTypeFilter method accept.

/**
 * Reports whether the given file matches one of the configured exclusions.
 *
 * <p>The file content is run through a freshly created Tika configuration's detector. The
 * method answers {@code true} when the detected media type is contained in
 * {@code excludedMediaTypes} or when its {@code getType()} value is contained in
 * {@code excludedTypes}.
 *
 * @param file the virtual file whose content is inspected
 * @return {@code true} if the detected type is excluded, or if reading the content or
 *         detecting its type fails; {@code false} otherwise
 */
@Override
public boolean accept(VirtualFile file) {
    try (InputStream content = file.getContent()) {
        final MediaType detected = new TikaConfig().getDetector().detect(content, new Metadata());
        return excludedMediaTypes.contains(detected) || excludedTypes.contains(detected.getType());
    } catch (TikaException | ForbiddenException | ServerException | IOException e) {
        // Any failure to read or detect is answered the same way as an excluded type.
        return true;
    }
}
Also used : ForbiddenException(org.eclipse.che.api.core.ForbiddenException) TikaException(org.apache.tika.exception.TikaException) ServerException(org.eclipse.che.api.core.ServerException) TikaConfig(org.apache.tika.config.TikaConfig) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) MediaType(org.apache.tika.mime.MediaType) IOException(java.io.IOException)

Example 2 with MediaType

use of org.apache.tika.mime.MediaType in project camel by apache.

the class TikaProducer method doDetect.

/**
 * Detects the media type of the exchange's inbound body.
 *
 * <p>The body is read as an {@link InputStream} and passed to {@code this.detector} together
 * with a new, empty {@code Metadata} instance. That metadata is then handed to
 * {@code convertMetadataToHeaders} along with the exchange.
 *
 * @param exchange the exchange whose inbound message body is inspected
 * @return the string form of the detected media type
 * @throws IOException if the detector fails while reading the stream
 */
private Object doDetect(Exchange exchange) throws IOException {
    final InputStream body = exchange.getIn().getBody(InputStream.class);
    final Metadata detectionMetadata = new Metadata();
    final MediaType detected = this.detector.detect(body, detectionMetadata);
    // Headers are converted before the result is stringified, as callers may rely on that order.
    convertMetadataToHeaders(detectionMetadata, exchange);
    return detected.toString();
}
Also used : InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) MediaType(org.apache.tika.mime.MediaType)

Example 3 with MediaType

use of org.apache.tika.mime.MediaType in project jackrabbit-oak by apache.

the class MediaRange method parse.

/**
 * Parses a media range string into a {@code MediaRange}.
 *
 * <p>The string is parsed as a media type and normalized through the given registry. If the
 * normalized type carries a {@code q} parameter, that parameter is removed from the type and
 * its numeric value is passed as the second constructor argument; otherwise {@code 1.0} is
 * used and the normalized type is kept as is.
 *
 * @param range    the media range string to parse
 * @param registry registry used to normalize the parsed media type
 * @return the parsed range, or {@code null} if {@code range} cannot be parsed as a media type
 *         or its {@code q} parameter is not a valid number
 */
public static MediaRange parse(String range, MediaTypeRegistry registry) {
    final MediaType parsed = MediaType.parse(range);
    if (parsed == null) {
        return null;
    }

    final MediaType normalized = registry.normalize(parsed);
    // Work on a copy so that removing "q" does not touch the type's own parameter map.
    final Map<String, String> remaining = new HashMap<String, String>(normalized.getParameters());
    final String quality = remaining.remove("q");
    if (quality == null) {
        return new MediaRange(normalized, 1.0);
    }

    try {
        final MediaType withoutQuality = new MediaType(normalized.getBaseType(), remaining);
        return new MediaRange(withoutQuality, Double.parseDouble(quality));
    } catch (NumberFormatException e) {
        return null;
    }
}
Also used : HashMap(java.util.HashMap) MediaType(org.apache.tika.mime.MediaType)

Example 4 with MediaType

use of org.apache.tika.mime.MediaType in project lucene-solr by apache.

the class ExtractingDocumentLoader method load.

/**
 * Parses the given content stream with a Tika parser and either adds the extracted document
 * or writes the extracted markup and metadata to the response.
 *
 * <p>The parser is looked up by the {@code ExtractingParams.STREAM_TYPE} request parameter when
 * it is present; otherwise {@code autoDetectParser} is used. When the
 * {@code ExtractingParams.EXTRACT_ONLY} parameter is true, the serialized content is added to
 * {@code rsp} under the stream name and the metadata under {@code <stream name>_metadata};
 * otherwise {@code addDoc} is called with the Solr content handler.
 *
 * @param req       the request supplying parameters, schema and core resources
 * @param rsp       the response that receives extract-only output
 * @param stream    the content stream to parse
 * @param processor not referenced directly in this method body
 * @throws SolrException with {@code BAD_REQUEST} if no parser matches the stream type, or with
 *         {@code SERVER_ERROR} on a SAX failure or on a Tika failure when
 *         {@code ignoreTikaException} is false
 * @throws Exception any other failure raised while reading or parsing the stream
 */
@Override
public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream, UpdateRequestProcessor processor) throws Exception {
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
        //Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
        MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
        parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
        parser = autoDetectParser;
    }
    if (parser != null) {
        Metadata metadata = new Metadata();
        // If you specify the resource name (the filename, roughly) with this parameter,
        // then Tika can make use of it in guessing the appropriate MIME type:
        String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
        if (resourceName != null) {
            metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
        }
        // Provide stream's content type as hint for auto detection
        if (stream.getContentType() != null) {
            metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
        }
        InputStream inputStream = null;
        try {
            inputStream = stream.getStream();
            metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
            metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
            metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
            metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
            // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
            String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
            if (charset != null) {
                metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
            }
            String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
            boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
            SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, req.getSchema());
            // Handler that actually receives SAX events; it may wrap or replace the Solr handler.
            ContentHandler parsingHandler = handler;
            StringWriter writer = null;
            BaseMarkupSerializer serializer = null;
            if (extractOnly) {
                String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
                writer = new StringWriter();
                if (extractFormat.equals(TEXT_FORMAT)) {
                    serializer = new TextSerializer();
                    serializer.setOutputCharStream(writer);
                    serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
                } else {
                    serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
                }
                if (xpathExpr != null) {
                    Matcher matcher = PARSER.parse(xpathExpr);
                    //The MatchingContentHandler does not invoke startDocument.  See http://tika.markmail.org/message/kknu3hw7argwiqin
                    serializer.startDocument();
                    parsingHandler = new MatchingContentHandler(serializer, matcher);
                } else {
                    parsingHandler = serializer;
                }
            } else if (xpathExpr != null) {
                Matcher matcher = PARSER.parse(xpathExpr);
                parsingHandler = new MatchingContentHandler(handler, matcher);
            }
            try {
                //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
                ParseContext context = parseContextConfig.create();
                context.set(Parser.class, parser);
                context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
                // Password handling
                RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
                String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
                if (pwMapFile != null && pwMapFile.length() > 0) {
                    InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
                    if (is != null) {
                        try {
                            log.debug("Password file supplied: " + pwMapFile);
                            epp.parse(is);
                        } finally {
                            // Release the password-map stream even when parsing fails.
                            // Closing an already-closed stream has no effect, so this is
                            // safe whether or not parse() closes it itself.
                            IOUtils.closeQuietly(is);
                        }
                    }
                }
                context.set(PasswordProvider.class, epp);
                String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
                if (resourcePassword != null) {
                    epp.setExplicitPassword(resourcePassword);
                    log.debug("Literal password supplied for file " + resourceName);
                }
                parser.parse(inputStream, parsingHandler, metadata, context);
            } catch (TikaException e) {
                if (ignoreTikaException) {
                    // Best effort: log and continue with whatever was collected so far.
                    log.warn(new StringBuilder("skip extracting text due to ").append(e.getLocalizedMessage()).append(". metadata=").append(metadata.toString()).toString());
                } else {
                    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
                }
            }
            if (!extractOnly) {
                addDoc(handler);
            } else {
                //serializer is not null, so we need to call endDoc on it if using xpath
                if (xpathExpr != null) {
                    serializer.endDocument();
                }
                rsp.add(stream.getName(), writer.toString());
                writer.close();
                String[] names = metadata.names();
                NamedList metadataNL = new NamedList();
                for (int i = 0; i < names.length; i++) {
                    String[] vals = metadata.getValues(names[i]);
                    metadataNL.add(names[i], vals);
                }
                rsp.add(stream.getName() + "_metadata", metadataNL);
            }
        } catch (SAXException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
    } else {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers.  Please supply the " + ExtractingParams.STREAM_TYPE + " parameter.");
    }
}
Also used : Matcher(org.apache.tika.sax.xpath.Matcher) MatchingContentHandler(org.apache.tika.sax.xpath.MatchingContentHandler) Metadata(org.apache.tika.metadata.Metadata) MatchingContentHandler(org.apache.tika.sax.xpath.MatchingContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) SAXException(org.xml.sax.SAXException) StringWriter(java.io.StringWriter) MediaType(org.apache.tika.mime.MediaType) SolrException(org.apache.solr.common.SolrException) DefaultParser(org.apache.tika.parser.DefaultParser) XMLSerializer(org.apache.xml.serialize.XMLSerializer) TikaException(org.apache.tika.exception.TikaException) InputStream(java.io.InputStream) NamedList(org.apache.solr.common.util.NamedList) BaseMarkupSerializer(org.apache.xml.serialize.BaseMarkupSerializer) OutputFormat(org.apache.xml.serialize.OutputFormat) Parser(org.apache.tika.parser.Parser) XPathParser(org.apache.tika.sax.xpath.XPathParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultParser(org.apache.tika.parser.DefaultParser) TextSerializer(org.apache.xml.serialize.TextSerializer) ParseContext(org.apache.tika.parser.ParseContext)

Example 5 with MediaType

use of org.apache.tika.mime.MediaType in project uPortal by Jasig.

the class JaxbPortalDataHandlerService method importDataArchive.

/**
 * Imports data from an archive or compressed stream, choosing the decoder by detected media type.
 *
 * <p>The incoming stream is buffered if it is not already, marked, and passed to
 * {@code getMediaType} together with the archive's filename. Depending on the result the
 * buffered stream is wrapped in the matching archive or compressor stream and handed to the
 * corresponding {@code importDataArchive} overload. The buffered stream is always closed
 * quietly before returning.
 *
 * @param archive        the resource being imported; its filename is used for type detection
 * @param resourceStream the raw content of the resource
 * @param options        import options forwarded unchanged
 * @throws RuntimeException if the media type is not one of the handled archive or compression
 *         types, or if an {@link IOException} occurs while opening the stream
 */
protected void importDataArchive(Resource archive, InputStream resourceStream, BatchImportOptions options) {
    BufferedInputStream buffered = null;
    try {
        //Make sure the stream is buffered
        buffered = (resourceStream instanceof BufferedInputStream)
                ? (BufferedInputStream) resourceStream
                : new BufferedInputStream(resourceStream);

        //Buffer up to 100MB, bad things will happen if we bust this buffer.
        //TODO see if there is a buffered stream that will write to a file once the buffer fills up
        buffered.mark(100 * 1024 * 1024);
        final MediaType type = getMediaType(buffered, archive.getFilename());

        // Exactly one of these is assigned below; the static types select the overload to call.
        ArchiveInputStream archiveStream = null;
        CompressorInputStream compressedStream = null;

        if (MT_JAVA_ARCHIVE.equals(type)) {
            archiveStream = new JarArchiveInputStream(buffered);
        } else if (MediaType.APPLICATION_ZIP.equals(type)) {
            archiveStream = new ZipArchiveInputStream(buffered);
        } else if (MT_CPIO.equals(type)) {
            archiveStream = new CpioArchiveInputStream(buffered);
        } else if (MT_AR.equals(type)) {
            archiveStream = new ArArchiveInputStream(buffered);
        } else if (MT_TAR.equals(type)) {
            archiveStream = new TarArchiveInputStream(buffered);
        } else if (MT_BZIP2.equals(type)) {
            compressedStream = new BZip2CompressorInputStream(buffered);
        } else if (MT_GZIP.equals(type)) {
            compressedStream = new GzipCompressorInputStream(buffered);
        } else if (MT_PACK200.equals(type)) {
            compressedStream = new Pack200CompressorInputStream(buffered);
        } else if (MT_XZ.equals(type)) {
            compressedStream = new XZCompressorInputStream(buffered);
        } else {
            throw new RuntimeException("Unrecognized archive media type: " + type);
        }

        if (archiveStream != null) {
            importDataArchive(archive, archiveStream, options);
        } else {
            importDataArchive(archive, compressedStream, options);
        }
    } catch (IOException e) {
        throw new RuntimeException("Could not load InputStream for resource: " + archive, e);
    } finally {
        IOUtils.closeQuietly(buffered);
    }
}
Also used : JarArchiveInputStream(org.apache.commons.compress.archivers.jar.JarArchiveInputStream) GzipCompressorInputStream(org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream) ZipArchiveInputStream(org.apache.commons.compress.archivers.zip.ZipArchiveInputStream) ArArchiveInputStream(org.apache.commons.compress.archivers.ar.ArArchiveInputStream) CompressorInputStream(org.apache.commons.compress.compressors.CompressorInputStream) XZCompressorInputStream(org.apache.commons.compress.compressors.xz.XZCompressorInputStream) Pack200CompressorInputStream(org.apache.commons.compress.compressors.pack200.Pack200CompressorInputStream) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) GzipCompressorInputStream(org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream) IOException(java.io.IOException) Pack200CompressorInputStream(org.apache.commons.compress.compressors.pack200.Pack200CompressorInputStream) TarArchiveInputStream(org.apache.commons.compress.archivers.tar.TarArchiveInputStream) JarArchiveInputStream(org.apache.commons.compress.archivers.jar.JarArchiveInputStream) ArchiveInputStream(org.apache.commons.compress.archivers.ArchiveInputStream) CpioArchiveInputStream(org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream) ArArchiveInputStream(org.apache.commons.compress.archivers.ar.ArArchiveInputStream) TarArchiveInputStream(org.apache.commons.compress.archivers.tar.TarArchiveInputStream) ZipArchiveInputStream(org.apache.commons.compress.archivers.zip.ZipArchiveInputStream) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) BufferedInputStream(java.io.BufferedInputStream) MediaType(org.apache.tika.mime.MediaType) CpioArchiveInputStream(org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream) XZCompressorInputStream(org.apache.commons.compress.compressors.xz.XZCompressorInputStream)

Aggregations

MediaType (org.apache.tika.mime.MediaType)88 Test (org.junit.Test)28 Metadata (org.apache.tika.metadata.Metadata)27 InputStream (java.io.InputStream)23 TikaInputStream (org.apache.tika.io.TikaInputStream)17 Parser (org.apache.tika.parser.Parser)17 ParseContext (org.apache.tika.parser.ParseContext)16 IOException (java.io.IOException)15 TikaException (org.apache.tika.exception.TikaException)13 CompositeParser (org.apache.tika.parser.CompositeParser)13 ContentHandler (org.xml.sax.ContentHandler)13 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)12 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)12 TikaTest (org.apache.tika.TikaTest)10 Detector (org.apache.tika.detect.Detector)10 HashSet (java.util.HashSet)8 ByteArrayInputStream (java.io.ByteArrayInputStream)7 TikaConfig (org.apache.tika.config.TikaConfig)7 MediaTypeRegistry (org.apache.tika.mime.MediaTypeRegistry)7 ArrayList (java.util.ArrayList)6