Search in sources :

Example 1 with MimeTypeException

use of org.apache.tika.mime.MimeTypeException in project lucene-solr by apache.

the class ExtractingRequestHandler method inform.

/**
 * Completes set-up from {@code initArgs} once the core is known.
 * <p>
 * Reads three optional settings (Tika config location, parse-context config
 * location, date formats), then falls back to defaults for whatever is still
 * unset and finally builds the factory.
 *
 * @param core the core whose resource loader is used to resolve configuration
 * @throws SolrException with {@code SERVER_ERROR} when any configuration fails to load
 */
@Override
public void inform(SolrCore core) {
    if (initArgs != null) {
        // A relative location is resolved against the config dir; an absolute one is used as given.
        final String tikaLocation = (String) initArgs.get(CONFIG_LOCATION);
        if (tikaLocation != null) {
            File tikaFile = new File(tikaLocation);
            if (!tikaFile.isAbsolute()) {
                tikaFile = new File(core.getResourceLoader().getConfigDir(), tikaFile.getPath());
            }
            try {
                config = new TikaConfig(tikaFile);
            } catch (Exception e) {
                throw new SolrException(ErrorCode.SERVER_ERROR, e);
            }
        }

        final String parseContextLocation = (String) initArgs.get(PARSE_CONTEXT_CONFIG);
        if (parseContextLocation != null) {
            try {
                parseContextConfig = new ParseContextConfig(core.getResourceLoader(), parseContextLocation);
            } catch (Exception e) {
                throw new SolrException(ErrorCode.SERVER_ERROR, e);
            }
        }

        final NamedList formatArgs = (NamedList) initArgs.get(DATE_FORMATS);
        if (formatArgs != null && formatArgs.size() > 0) {
            dateFormats = new HashSet<>();
            // Only the entry values are used; the entry names are ignored.
            for (Iterator<Map.Entry> entries = formatArgs.iterator(); entries.hasNext(); ) {
                final String pattern = (String) entries.next().getValue();
                log.info("Adding Date Format: " + pattern);
                dateFormats.add(pattern);
            }
        }
    }

    // Nothing configured explicitly (or no init args at all): use the default Tika config.
    if (config == null) {
        try {
            config = getDefaultConfig(core.getResourceLoader().getClassLoader());
        } catch (MimeTypeException | IOException e) {
            throw new SolrException(ErrorCode.SERVER_ERROR, e);
        }
    }

    if (parseContextConfig == null) {
        parseContextConfig = new ParseContextConfig();
    }

    factory = createFactory();
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) NamedList(org.apache.solr.common.util.NamedList) IOException(java.io.IOException) IOException(java.io.IOException) SolrException(org.apache.solr.common.SolrException) MimeTypeException(org.apache.tika.mime.MimeTypeException) MimeTypeException(org.apache.tika.mime.MimeTypeException) File(java.io.File) SolrException(org.apache.solr.common.SolrException)

Example 2 with MimeTypeException

use of org.apache.tika.mime.MimeTypeException in project sling by apache.

the class TikaMimeTypeProvider method getExtension.

/**
 * Looks up the file extension registered for a MIME type name.
 *
 * @param mimeType name of the MIME type to look up
 * @return the extension with its first character (the leading ".") removed,
 *         or {@code null} when the lookup fails or the extension is
 *         missing or no longer than one character
 */
public String getExtension(String mimeType) {
    final String dotted;
    try {
        dotted = types.forName(mimeType).getExtension();
    } catch (MimeTypeException e) {
        // A failed lookup is not an error here; report "no extension".
        return null;
    }

    if (dotted == null || dotted.length() <= 1) {
        // fall back
        return null;
    }
    // skip leading "."
    return dotted.substring(1);
}
Also used : MimeTypeException(org.apache.tika.mime.MimeTypeException) MimeType(org.apache.tika.mime.MimeType)

Example 3 with MimeTypeException

use of org.apache.tika.mime.MimeTypeException in project nifi by apache.

the class IdentifyMimeType method onTrigger.

/**
 * Detects the MIME type of the next flow file's content and records it,
 * together with the matching file extension, as flow file attributes before
 * routing the flow file to {@code REL_SUCCESS}.
 *
 * @param context gives access to the processor's properties
 * @param session supplies the flow file and receives the attribute updates
 */
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }

    final ComponentLog logger = getLogger();
    final String filename = flowFile.getAttribute(CoreAttributes.FILENAME.key());
    // Holder that lets the callback hand the detected type back to this method.
    final AtomicReference<String> detectedType = new AtomicReference<>(null);

    session.read(flowFile, new InputStreamCallback() {

        @Override
        public void process(final InputStream stream) throws IOException {
            try (final InputStream buffered = new BufferedInputStream(stream)) {
                final TikaInputStream tikaStream = TikaInputStream.get(buffered);
                final Metadata metadata = new Metadata();
                // The file name is only passed to the detector when the property asks for it.
                if (filename != null && context.getProperty(USE_FILENAME_IN_DETECTION).asBoolean()) {
                    metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
                }
                final MediaType detected = detector.detect(tikaStream, metadata);
                detectedType.set(detected.toString());
            }
        }
    });

    final String mimeType = detectedType.get();
    String extension = "";
    try {
        extension = config.getMimeRepository().forName(mimeType).getExtension();
    } catch (MimeTypeException ex) {
        logger.warn("MIME type extension lookup failed: {}", new Object[] { ex });
    }

    // Workaround for bug in Tika - https://issues.apache.org/jira/browse/TIKA-1563
    if ("application/gzip".equals(mimeType) && extension.equals(".tgz")) {
        extension = ".gz";
    }

    if (mimeType != null) {
        flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), mimeType);
        flowFile = session.putAttribute(flowFile, "mime.extension", extension);
        logger.info("Identified {} as having MIME Type {}", new Object[] { flowFile, mimeType });
    } else {
        // No type available: fall back to the generic binary type with an empty extension.
        flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/octet-stream");
        flowFile = session.putAttribute(flowFile, "mime.extension", "");
        logger.info("Unable to identify MIME Type for {}; setting to application/octet-stream", new Object[] { flowFile });
    }

    session.getProvenanceReporter().modifyAttributes(flowFile);
    session.transfer(flowFile, REL_SUCCESS);
}
Also used : FlowFile(org.apache.nifi.flowfile.FlowFile) BufferedInputStream(java.io.BufferedInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) ComponentLog(org.apache.nifi.logging.ComponentLog) MimeType(org.apache.tika.mime.MimeType) BufferedInputStream(java.io.BufferedInputStream) MimeTypeException(org.apache.tika.mime.MimeTypeException) InputStreamCallback(org.apache.nifi.processor.io.InputStreamCallback) MediaType(org.apache.tika.mime.MediaType)

Example 4 with MimeTypeException

use of org.apache.tika.mime.MimeTypeException in project ddf by codice.

the class DownloadManager method run.

/**
 * Downloads the content behind {@code url} into a file named
 * {@code outputFileName} plus the extension registered for the content type
 * the server reported.
 * <p>
 * Failures are logged rather than propagated, so this method never throws a
 * checked exception.
 */
@Override
public void run() {
    String mimeType = null;
    try {
        // Use ONE connection for both the content type and the body. Opening a
        // second connection just to read the header leaves its response
        // unconsumed and may describe a different response than the one downloaded.
        final java.net.URLConnection connection = url.openConnection();
        // The stream is opened first so that an unreachable URL is still reported
        // as a download error rather than as a MIME type lookup error.
        try (ReadableByteChannel byteChannel = Channels.newChannel(connection.getInputStream())) {
            mimeType = connection.getContentType();
            String fileExtension = allTypes.forName(mimeType).getExtension();
            LOGGER.debug("downloading product from: {}", url);
            LOGGER.debug("mimetype is: {}", mimeType);
            LOGGER.debug("File Extension is: {}", fileExtension);
            try (FileOutputStream fileOutputStream = new FileOutputStream(outputFileName + fileExtension)) {
                fileOutputStream.getChannel().transferFrom(byteChannel, 0, Long.MAX_VALUE);
            } catch (IOException e) {
                LOGGER.info("Error opening stream for {}", outputFileName, e);
            }
        }
    } catch (IOException e) {
        LOGGER.info("Error downloading file from url: {}", url, e);
    } catch (MimeTypeException e) {
        LOGGER.info("Error determining file extension from mimetype: {}", mimeType, e);
    }
}
Also used : ReadableByteChannel(java.nio.channels.ReadableByteChannel) MimeTypeException(org.apache.tika.mime.MimeTypeException) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException)

Example 5 with MimeTypeException

use of org.apache.tika.mime.MimeTypeException in project tika by apache.

the class TikaCLI method compareFileMagic.

/**
 * Compares our mime types registry with the File(1) tool's
 * directory of (uncompiled) Magic entries.
 * (Well, those with mimetypes anyway)
 * <p>
 * Prints a report to standard output listing the mime types File knows but
 * Tika does not, and those Tika knows but has no magic for (neither on the
 * type itself, its children, nor its ancestors).
 *
 * @param magicDir Path to the magic directory
 * @throws IllegalArgumentException if the directory does not look like a
 *         file magic directory or cannot be listed
 * @throws Exception if a magic file cannot be read
 */
private void compareFileMagic(String magicDir) throws Exception {
    Set<String> tikaLacking = new TreeSet<String>();
    Set<String> tikaNoMagic = new TreeSet<String>();

    // Sanity check: a real magic directory holds at least these three entries
    File dir = new File(magicDir);
    boolean looksPlausible = new File(dir, "elf").exists()
            && new File(dir, "mime").exists()
            && new File(dir, "vorbis").exists();
    if (!looksPlausible) {
        throw new IllegalArgumentException(magicDir + " doesn't seem to hold uncompressed file magic entries");
    }

    // listFiles() returns null when the path is not a directory or cannot be read
    File[] magicFiles = dir.listFiles();
    if (magicFiles == null) {
        throw new IllegalArgumentException(magicDir + " could not be listed");
    }

    // Find all the mimetypes in the directory
    Set<String> fileMimes = new HashSet<String>();
    for (File mf : magicFiles) {
        if (!mf.isFile()) {
            continue;
        }
        // try-with-resources so the reader is closed even if reading fails
        try (BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(mf), UTF_8))) {
            String line;
            while ((line = r.readLine()) != null) {
                boolean isMimeLine = line.startsWith("!:mime") || line.startsWith("#!:mime");
                // A bare "!:mime" is only 6 chars long; substring(7) would throw on it
                if (isMimeLine && line.length() >= 7) {
                    fileMimes.add(line.substring(7).trim());
                }
            }
        }
    }

    // See how those compare to the Tika ones
    TikaConfig config = TikaConfig.getDefaultConfig();
    MimeTypes mimeTypes = config.getMimeRepository();
    MediaTypeRegistry registry = config.getMediaTypeRegistry();
    for (String mime : fileMimes) {
        try {
            final MimeType type = mimeTypes.getRegisteredMimeType(mime);
            if (type == null) {
                // Tika doesn't know about this one
                tikaLacking.add(mime);
            } else if (!hasMagicInHierarchy(type, mimeTypes, registry)) {
                // Tika knows about this one, but has no magic anywhere near it
                tikaNoMagic.add(mime);
            }
        } catch (MimeTypeException e) {
            // Broken entry in the file magic directory
            // Silently skip
        }
    }

    // Check how many tika knows about
    int tikaTypes = 0;
    int tikaAliases = 0;
    for (MediaType type : registry.getTypes()) {
        tikaTypes++;
        tikaAliases += registry.getAliases(type).size();
    }

    // Report
    System.out.println("Tika knows about " + tikaTypes + " unique mime types");
    System.out.println("Tika knows about " + (tikaTypes + tikaAliases) + " mime types including aliases");
    System.out.println("The File Magic directory knows about " + fileMimes.size() + " unique mime types");
    System.out.println();
    System.out.println("The following mime types are known to File but not Tika:");
    for (String mime : tikaLacking) {
        System.out.println("  " + mime);
    }
    System.out.println();
    System.out.println("The following mime types from File have no Tika magic (but their children might):");
    for (String mime : tikaNoMagic) {
        System.out.println("  " + mime);
    }
}

/**
 * Tells whether the given type, one of its direct children, or one of its
 * ancestors (below the top level types) has magic.
 */
private static boolean hasMagicInHierarchy(MimeType type, MimeTypes mimeTypes, MediaTypeRegistry registry)
        throws MimeTypeException {
    if (type.hasMagic()) {
        return true;
    }

    // How about the children?
    for (MediaType child : registry.getChildTypes(type.getType())) {
        MimeType childType = mimeTypes.getRegisteredMimeType(child.toString());
        if (childType != null && childType.hasMagic()) {
            return true;
        }
    }

    // How about the parents? Climb from the CURRENT ancestor each time; asking
    // for the supertype of the starting type again would never move upwards
    // and would spin forever on a magic-less, non-top-level parent.
    MimeType ancestor = type;
    while (ancestor != null) {
        if (ancestor.hasMagic()) {
            return true;
        }
        MediaType parent = registry.getSupertype(ancestor.getType());
        if (parent == null || isTopLevelFileMagicType(parent)) {
            // Stop checking parents if we hit a top level type
            ancestor = null;
        } else {
            ancestor = mimeTypes.getRegisteredMimeType(parent.toString());
        }
    }
    return false;
}

/**
 * Tells whether the type is one of the generic roots at which the ancestor
 * search stops; compared by value, not by identity.
 */
private static boolean isTopLevelFileMagicType(MediaType candidate) {
    return MediaType.APPLICATION_XML.equals(candidate)
            || MediaType.TEXT_PLAIN.equals(candidate)
            || MediaType.OCTET_STREAM.equals(candidate);
}
Also used : InputStreamReader(java.io.InputStreamReader) TikaConfig(org.apache.tika.config.TikaConfig) MediaTypeRegistry(org.apache.tika.mime.MediaTypeRegistry) MimeTypes(org.apache.tika.mime.MimeTypes) FileInputStream(java.io.FileInputStream) MimeType(org.apache.tika.mime.MimeType) TreeSet(java.util.TreeSet) MimeTypeException(org.apache.tika.mime.MimeTypeException) BufferedReader(java.io.BufferedReader) MediaType(org.apache.tika.mime.MediaType) File(java.io.File) HashSet(java.util.HashSet)

Aggregations

MimeTypeException (org.apache.tika.mime.MimeTypeException)13 IOException (java.io.IOException)8 MimeType (org.apache.tika.mime.MimeType)8 TikaConfig (org.apache.tika.config.TikaConfig)4 MediaType (org.apache.tika.mime.MediaType)4 TikaInputStream (org.apache.tika.io.TikaInputStream)3 Metadata (org.apache.tika.metadata.Metadata)3 BufferedReader (java.io.BufferedReader)2 File (java.io.File)2 FileNotFoundException (java.io.FileNotFoundException)2 InputStream (java.io.InputStream)2 InputStreamReader (java.io.InputStreamReader)2 MalformedURLException (java.net.MalformedURLException)2 URL (java.net.URL)2 TikaException (org.apache.tika.exception.TikaException)2 MimeTypes (org.apache.tika.mime.MimeTypes)2 DataUri (com.github.ooxi.jdatauri.DataUri)1 BufferedInputStream (java.io.BufferedInputStream)1 FileInputStream (java.io.FileInputStream)1 FileOutputStream (java.io.FileOutputStream)1