Search in sources :

Example 16 with Detector

use of org.apache.tika.detect.Detector in project tika by apache.

the class EmbeddedDocumentUtil method getExtension.

/**
 * Determines a file extension for the given stream.
 *
 * <p>The content type already recorded in {@code metadata} is tried first; if it is
 * missing or cannot be resolved by the MIME repository, the configured detector is
 * run against the stream instead. When detection supplied the type, the metadata's
 * content type is overwritten with it.
 *
 * @param is       stream handed to the detector when the metadata gives no usable type
 * @param metadata metadata read for the content type, and updated after a detection
 * @return the extension reported by the resolved MIME type, or {@code ".bin"} when no
 *         type could be resolved
 */
public String getExtension(TikaInputStream is, Metadata metadata) {
    String declaredType = metadata.get(Metadata.CONTENT_TYPE);
    TikaConfig tikaConfig = getConfig();
    MimeTypes registry = tikaConfig.getMimeRepository();

    MimeType resolved = null;
    if (declaredType != null) {
        try {
            resolved = registry.forName(declaredType);
        } catch (MimeTypeException e) {
            //swallow: fall back to detection below
        }
    }
    if (resolved != null) {
        // The declared type was usable; the metadata is left untouched.
        return resolved.getExtension();
    }

    Detector detector = tikaConfig.getDetector();
    try {
        MediaType sniffed = detector.detect(is, metadata);
        resolved = registry.forName(sniffed.toString());
        // The type is assigned before reset(), so a failing reset() still keeps it.
        is.reset();
    } catch (IOException e) {
        //swallow
    } catch (MimeTypeException e) {
        //swallow
    }

    if (resolved == null) {
        return ".bin";
    }
    //set or correct the mime type
    metadata.set(Metadata.CONTENT_TYPE, resolved.toString());
    return resolved.getExtension();
}
Also used : Detector(org.apache.tika.detect.Detector) TikaConfig(org.apache.tika.config.TikaConfig) MimeTypeException(org.apache.tika.mime.MimeTypeException) MediaType(org.apache.tika.mime.MediaType) IOException(java.io.IOException) MimeTypes(org.apache.tika.mime.MimeTypes) MimeType(org.apache.tika.mime.MimeType)

Example 17 with Detector

use of org.apache.tika.detect.Detector in project tika by apache.

the class MyFirstTika method parseUsingComponents.

/**
 * Walks through Tika's individual components for one file: MIME detection by file
 * name, by the MIME registry on the stream, and by the configured {@link Detector};
 * language detection; and finally parsing with the configured parser.
 *
 * <p>Each step prints its result to standard output. Both streams opened on the file
 * are closed before the method returns.
 *
 * @param filename   path of the file to examine
 * @param tikaConfig configuration supplying the MIME registry, detector and parser
 * @param metadata   metadata populated with the resource name and the detected content type
 * @return the text collected by the body content handler
 * @throws Exception if the file cannot be opened or read, or detection or parsing fails
 */
public static String parseUsingComponents(String filename, TikaConfig tikaConfig, Metadata metadata) throws Exception {
    MimeTypes mimeRegistry = tikaConfig.getMimeRepository();
    System.out.println("Examining: [" + filename + "]");
    metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
    System.out.println("The MIME type (based on filename) is: [" + mimeRegistry.detect(null, metadata) + "]");
    // Separate, short-lived stream for the registry check so it is closed right away
    // instead of being leaked when a second stream is opened.
    try (InputStream stream = TikaInputStream.get(new File(filename))) {
        System.out.println("The MIME type (based on MAGIC) is: [" + mimeRegistry.detect(stream, metadata) + "]");
    }
    try (InputStream stream = TikaInputStream.get(new File(filename))) {
        Detector detector = tikaConfig.getDetector();
        System.out.println("The MIME type (based on the Detector interface) is: [" + detector.detect(stream, metadata) + "]");
        LanguageDetector langDetector = new OptimaizeLangDetector().loadModels();
        LanguageResult lang = langDetector.detect(FileUtils.readFileToString(new File(filename), UTF_8));
        System.out.println("The language of this content is: [" + lang.getLanguage() + "]");
        // Get a non-detecting parser that handles all the types it can
        Parser parser = tikaConfig.getParser();
        // Tell it what we think the content is
        MediaType type = detector.detect(stream, metadata);
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
        // Have the file parsed to get the content and metadata
        ContentHandler handler = new BodyContentHandler();
        parser.parse(stream, handler, metadata, new ParseContext());
        return handler.toString();
    }
}
Also used : LanguageDetector(org.apache.tika.language.detect.LanguageDetector) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) LanguageDetector(org.apache.tika.language.detect.LanguageDetector) Detector(org.apache.tika.detect.Detector) OptimaizeLangDetector(org.apache.tika.langdetect.OptimaizeLangDetector) LanguageResult(org.apache.tika.language.detect.LanguageResult) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) ParseContext(org.apache.tika.parser.ParseContext) OptimaizeLangDetector(org.apache.tika.langdetect.OptimaizeLangDetector) MediaType(org.apache.tika.mime.MediaType) MimeTypes(org.apache.tika.mime.MimeTypes) File(java.io.File) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser)

Example 18 with Detector

use of org.apache.tika.detect.Detector in project tutorials by eugenp.

the class TikaAnalysis method detectDocTypeUsingDetector.

/**
 * Detects the media type of a stream using a fresh {@link DefaultDetector} and
 * empty metadata (no file name or other hints are supplied).
 *
 * @param stream the content to inspect
 * @return the string form of the detected media type
 * @throws IOException if the detector fails while reading the stream
 */
public static String detectDocTypeUsingDetector(InputStream stream) throws IOException {
    return new DefaultDetector()
            .detect(stream, new Metadata())
            .toString();
}
Also used : DefaultDetector(org.apache.tika.detect.DefaultDetector) Detector(org.apache.tika.detect.Detector) DefaultDetector(org.apache.tika.detect.DefaultDetector) Metadata(org.apache.tika.metadata.Metadata) MediaType(org.apache.tika.mime.MediaType)

Example 19 with Detector

use of org.apache.tika.detect.Detector in project uPortal by Jasig.

the class JaxbPortalDataHandlerService method getMediaType.

/**
 * Detects the media type of the buffered stream, using the file name as a hint.
 *
 * <p>The buffered stream is always reset before returning, so the caller can re-read
 * whatever the detector consumed.
 * NOTE(review): reset() presumably relies on the caller having called mark() first - confirm.
 *
 * @param inputStream the content to inspect; reset in all cases before this method returns
 * @param fileName    resource name passed to the detector and used in log messages
 * @return the detected media type, or {@code null} if detection failed with an
 *         {@link IOException}
 * @throws IOException if resetting {@code inputStream} fails
 */
private MediaType getMediaType(BufferedInputStream inputStream, String fileName) throws IOException {
    // Wrapped in a CloseShieldInputStream - presumably so that closing the Tika
    // wrapper in the finally block leaves the caller's stream open (confirm).
    TikaInputStream shieldedStream = TikaInputStream.get(new CloseShieldInputStream(inputStream));
    try {
        Metadata hints = new Metadata();
        hints.set(Metadata.RESOURCE_NAME_KEY, fileName);
        MediaType detectedType = new DefaultDetector().detect(shieldedStream, hints);
        logger.debug("Determined '{}' for '{}'", detectedType, fileName);
        return detectedType;
    } catch (IOException e) {
        // NOTE(review): the message says "assuming XML" but null is returned here;
        // presumably the caller maps null to XML - verify against the call site.
        logger.warn("Failed to determine media type for '" + fileName + "' assuming XML", e);
        return null;
    } finally {
        IOUtils.closeQuietly(shieldedStream);
        // Rewind the buffered stream to undo whatever the detector read
        inputStream.reset();
    }
}
Also used : DefaultDetector(org.apache.tika.detect.DefaultDetector) Detector(org.apache.tika.detect.Detector) DefaultDetector(org.apache.tika.detect.DefaultDetector) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) MediaType(org.apache.tika.mime.MediaType) IOException(java.io.IOException) CloseShieldInputStream(org.apache.tika.io.CloseShieldInputStream)

Example 20 with Detector

use of org.apache.tika.detect.Detector in project ddf by codice.

the class MimeTypeMapperImpl method guessMimeType.

/**
 * Guesses a mime type by asking each registered {@code MimeTypeResolver}, in sorted
 * order, for the mime type mapped to a file extension.
 *
 * <p>When no extension is supplied, the stream is copied to a temporary buffer and a
 * Tika {@code DefaultDetector} is used to derive an extension from the content. For
 * the XML extension, the root namespace is read once and only resolvers whose schema
 * equals that namespace are consulted.
 *
 * @param is            content to inspect; read when the extension is empty or is the XML extension
 * @param fileExtension extension to look up; may be null or empty, in which case it is
 *                      derived from the content when possible
 * @return the first non-empty mime type returned by a resolver, or {@code null} if none matched
 * @throws MimeTypeResolutionException if a resolver throws while being queried
 */
@Override
public String guessMimeType(InputStream is, String fileExtension) throws MimeTypeResolutionException {
    LOGGER.trace(ENTERING_STR, "guessMimeType()");
    String mimeType = null;
    LOGGER.debug("Looping through {} MimeTypeResolvers", mimeTypeResolvers.size());
    // This is to force the TikaMimeTypeResolver to be called
    // after the CustomMimeTypeResolvers to prevent Tika default mapping
    // from being used when a CustomMimeTypeResolver may be more appropriate.
    List<MimeTypeResolver> sortedResolvers = sortResolvers(mimeTypeResolvers);
    if (StringUtils.isEmpty(fileExtension)) {
        try (TemporaryFileBackedOutputStream tfbos = new TemporaryFileBackedOutputStream()) {
            IOUtils.copy(is, tfbos);
            try (InputStream inputStream = tfbos.asByteSource().openStream()) {
                Detector detector = new DefaultDetector();
                MediaType mediaType = detector.detect(inputStream, new Metadata());
                fileExtension = getFileExtensionForMimeType(mediaType.toString()).replace(".", "");
            } finally {
                // The original stream has been consumed by the copy above, so continue
                // with a fresh stream over the buffered content.
                is = tfbos.asByteSource().openStream();
            }
        } catch (Exception e) {
            // Best effort: detection failure is not fatal, but keep the cause in the log.
            LOGGER.debug("Failed to guess mimeType for file without extension.", e);
        }
    }
    // fileExtension may still be null here (null passed in and detection failed),
    // so the comparison must not dereference it.
    final boolean isXml = fileExtension != null && fileExtension.equals(XML_FILE_EXTENSION);
    // If file has XML extension, then read root element namespace once so
    // each MimeTypeResolver does not have to open the stream and read the namespace
    String namespace = null;
    if (isXml) {
        try {
            namespace = XML_UTILS.getRootNamespace(IOUtils.toString(is));
        } catch (IOException ioe) {
            LOGGER.debug("Could not read namespace from input stream.", ioe);
        }
        LOGGER.debug("namespace = {}", namespace);
    }
    // Once a mime type is found for the given file extension, exit the loop.
    for (MimeTypeResolver resolver : sortedResolvers) {
        LOGGER.debug(CALLING_RESOLVER_MSG, resolver.getName());
        try {
            if (isXml) {
                // For XML, only a resolver whose schema equals the root namespace may
                // answer; resolvers without a schema are skipped.
                if (namespace != null && resolver.hasSchema()) {
                    if (namespace.equals(resolver.getSchema())) {
                        mimeType = resolver.getMimeTypeForFileExtension(fileExtension);
                    }
                }
            } else {
                mimeType = resolver.getMimeTypeForFileExtension(fileExtension);
            }
        } catch (Exception e) {
            LOGGER.debug("Error resolving mime type for file extension: {}", fileExtension);
            throw new MimeTypeResolutionException(e);
        }
        if (StringUtils.isNotEmpty(mimeType)) {
            LOGGER.debug("mimeType [{}] retrieved from MimeTypeResolver: {}", mimeType, resolver.getName());
            break;
        }
    }
    LOGGER.debug(MIME_TYPE_FILE_EXT_MSG, mimeType, fileExtension);
    LOGGER.trace(EXITING_STR, "guessMimeType()");
    return mimeType;
}
Also used : MimeTypeResolver(ddf.mime.MimeTypeResolver) DefaultDetector(org.apache.tika.detect.DefaultDetector) MimeTypeResolutionException(ddf.mime.MimeTypeResolutionException) Detector(org.apache.tika.detect.Detector) DefaultDetector(org.apache.tika.detect.DefaultDetector) TemporaryFileBackedOutputStream(org.codice.ddf.platform.util.TemporaryFileBackedOutputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) MediaType(org.apache.tika.mime.MediaType) IOException(java.io.IOException) IOException(java.io.IOException) MimeTypeResolutionException(ddf.mime.MimeTypeResolutionException)

Aggregations

Detector (org.apache.tika.detect.Detector)20 Metadata (org.apache.tika.metadata.Metadata)11 MediaType (org.apache.tika.mime.MediaType)11 DefaultDetector (org.apache.tika.detect.DefaultDetector)10 InputStream (java.io.InputStream)9 CompositeDetector (org.apache.tika.detect.CompositeDetector)7 IOException (java.io.IOException)5 TikaInputStream (org.apache.tika.io.TikaInputStream)5 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)5 ParseContext (org.apache.tika.parser.ParseContext)5 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)5 Test (org.junit.Test)5 ContentHandler (org.xml.sax.ContentHandler)5 TikaTest (org.apache.tika.TikaTest)3 MimeTypeResolutionException (ddf.mime.MimeTypeResolutionException)2 ArrayList (java.util.ArrayList)2 TikaException (org.apache.tika.exception.TikaException)2 MimeTypes (org.apache.tika.mime.MimeTypes)2 Parser (org.apache.tika.parser.Parser)2 MimeTypeResolver (ddf.mime.MimeTypeResolver)1