Search in sources :

Example 1 with MimeType

use of org.apache.tika.mime.MimeType in project sling by apache.

the class TikaMimeTypeProvider method getExtension.

public String getExtension(String mimeType) {
    try {
        MimeType type = types.forName(mimeType);
        String extension = type.getExtension();
        if (extension != null && extension.length() > 1) {
            // skip leading "."
            return extension.substring(1);
    } catch (MimeTypeException e) {
    // ignore
    // fall back
    return null;
Also used : MimeTypeException(org.apache.tika.mime.MimeTypeException) MimeType(org.apache.tika.mime.MimeType)

Example 2 with MimeType

use of org.apache.tika.mime.MimeType in project nifi by apache.

the class IdentifyMimeType method onTrigger.

/**
 * Takes a FlowFile from the session, derives a MIME type and a matching file
 * extension for it, stores both as FlowFile attributes and transfers the
 * FlowFile to REL_SUCCESS.
 *
 * When no MIME type was determined, the MIME type attribute is set to
 * "application/octet-stream" and "mime.extension" to the empty string.
 *
 * NOTE(review): this excerpt looks truncated - closing braces and several
 * statements are missing (see the inline notes). Confirm against the upstream
 * NiFi source before reusing it.
 *
 * @param context used here to read the USE_FILENAME_IN_DETECTION property
 * @param session supplies the FlowFile and receives the attribute updates
 */
public void onTrigger(final ProcessContext context, final ProcessSession session) {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
    // NOTE(review): the body of this guard is not present in the excerpt -
    // presumably an early return; confirm.
    final ComponentLog logger = getLogger();
    // Holder that lets the anonymous callback hand a value back to this method.
    final AtomicReference<String> mimeTypeRef = new AtomicReference<>(null);
    // NOTE(review): the text between the end of the filename declaration and
    // ", new InputStreamCallback()" appears to have been dropped - presumably a
    // session call that reads the FlowFile content through the callback; confirm.
    final String filename = flowFile.getAttribute(CoreAttributes.FILENAME.key());, new InputStreamCallback() {

        public void process(final InputStream stream) throws IOException {
            try (final InputStream in = new BufferedInputStream(stream)) {
                TikaInputStream tikaStream = TikaInputStream.get(in);
                Metadata metadata = new Metadata();
                // Pass the filename to the detector only when the property enables it.
                if (filename != null && context.getProperty(USE_FILENAME_IN_DETECTION).asBoolean()) {
                    metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
                // Get mime type
                MediaType mediatype = detector.detect(tikaStream, metadata);
                // NOTE(review): nothing visible here stores the detected type in
                // mimeTypeRef - likely a dropped line; confirm.
    String mimeType = mimeTypeRef.get();
    // Stays empty when the extension lookup below throws.
    String extension = "";
    try {
        MimeType mimetype;
        mimetype = config.getMimeRepository().forName(mimeType);
        extension = mimetype.getExtension();
    } catch (MimeTypeException ex) {
        logger.warn("MIME type extension lookup failed: {}", new Object[] { ex });
    // Workaround for bug in Tika -
    // (the rest of the comment above is cut off in this excerpt)
    if (mimeType != null && mimeType.equals("application/gzip") && extension.equals(".tgz")) {
        extension = ".gz";
    if (mimeType == null) {
        flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/octet-stream");
        // NOTE(review): the string literal and arguments after the ';' below look
        // like the remains of a logging call whose start was dropped; confirm.
        flowFile = session.putAttribute(flowFile, "mime.extension", "");"Unable to identify MIME Type for {}; setting to application/octet-stream", new Object[] { flowFile });
    } else {
        flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), mimeType);
        // NOTE(review): same truncation pattern as in the branch above; confirm.
        flowFile = session.putAttribute(flowFile, "mime.extension", extension);"Identified {} as having MIME Type {}", new Object[] { flowFile, mimeType });
    session.transfer(flowFile, REL_SUCCESS);
Also used : FlowFile(org.apache.nifi.flowfile.FlowFile) BufferedInputStream(java.io.BufferedInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) ComponentLog(org.apache.nifi.logging.ComponentLog) MimeType(org.apache.tika.mime.MimeType) BufferedInputStream(java.io.BufferedInputStream) MimeTypeException(org.apache.tika.mime.MimeTypeException) InputStreamCallback(org.apache.nifi.processor.io.InputStreamCallback) MediaType(org.apache.tika.mime.MediaType)

Example 3 with MimeType

use of org.apache.tika.mime.MimeType in project tika by apache.

the class MimeUtilTest method assertResult.

private void assertResult(String contentType, String expected) throws MimeTypeException {
    TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
    MimeTypes r = tikaConfig.getMimeRepository();
    MimeType mt = r.forName(contentType);
    //        String ext = MimeUtil.getExtension(contentType, config);
    assertEquals(expected, mt.getExtension());
Also used : TikaConfig(org.apache.tika.config.TikaConfig) MimeTypes(org.apache.tika.mime.MimeTypes) MimeType(org.apache.tika.mime.MimeType)

Example 4 with MimeType

use of org.apache.tika.mime.MimeType in project tika by apache.

the class TikaCLI method compareFileMagic.

/**
 * Compares our mime types registry with the File(1) tool's
 *  directory of (uncompiled) Magic entries.
 * (Well, those with mimetypes anyway)
 * @param magicDir Path to the magic directory
 */
private void compareFileMagic(String magicDir) throws Exception {
    Set<String> tikaLacking = new TreeSet<String>();
    Set<String> tikaNoMagic = new TreeSet<String>();
    // Sanity check
    File dir = new File(magicDir);
    if ((new File(dir, "elf")).exists() && (new File(dir, "mime")).exists() && (new File(dir, "vorbis")).exists()) {
    // Looks plausible
    } else {
        throw new IllegalArgumentException(magicDir + " doesn't seem to hold uncompressed file magic entries");
    // Find all the mimetypes in the directory
    Set<String> fileMimes = new HashSet<String>();
    for (File mf : dir.listFiles()) {
        if (mf.isFile()) {
            BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(mf), UTF_8));
            String line;
            while ((line = r.readLine()) != null) {
                if (line.startsWith("!:mime") || line.startsWith("#!:mime")) {
                    String mime = line.substring(7).trim();
    // See how those compare to the Tika ones
    TikaConfig config = TikaConfig.getDefaultConfig();
    MimeTypes mimeTypes = config.getMimeRepository();
    MediaTypeRegistry registry = config.getMediaTypeRegistry();
    for (String mime : fileMimes) {
        try {
            final MimeType type = mimeTypes.getRegisteredMimeType(mime);
            if (type == null) {
                // Tika doesn't know about this one
            } else {
                // Tika knows about this one!
                // Does Tika have magic for it?
                boolean hasMagic = type.hasMagic();
                // How about the children?
                if (!hasMagic) {
                    for (MediaType child : registry.getChildTypes(type.getType())) {
                        MimeType childType = mimeTypes.getRegisteredMimeType(child.toString());
                        if (childType != null && childType.hasMagic()) {
                            hasMagic = true;
                // How about the parents?
                MimeType parentType = type;
                while (parentType != null && !hasMagic) {
                    if (parentType.hasMagic()) {
                        // Has magic, fine
                        hasMagic = true;
                    } else {
                        // Check the parent next
                        MediaType parent = registry.getSupertype(type.getType());
                        if (parent == MediaType.APPLICATION_XML || parent == MediaType.TEXT_PLAIN || parent == MediaType.OCTET_STREAM) {
                            // Stop checking parents if we hit a top level type
                            parent = null;
                        if (parent != null) {
                            parentType = mimeTypes.getRegisteredMimeType(parent.toString());
                        } else {
                            parentType = null;
                if (!hasMagic) {
        } catch (MimeTypeException e) {
        // Broken entry in the file magic directory
        // Silently skip
    // Check how many tika knows about
    int tikaTypes = 0;
    int tikaAliases = 0;
    for (MediaType type : registry.getTypes()) {
        tikaAliases += registry.getAliases(type).size();
    // Report
    System.out.println("Tika knows about " + tikaTypes + " unique mime types");
    System.out.println("Tika knows about " + (tikaTypes + tikaAliases) + " mime types including aliases");
    System.out.println("The File Magic directory knows about " + fileMimes.size() + " unique mime types");
    System.out.println("The following mime types are known to File but not Tika:");
    for (String mime : tikaLacking) {
        System.out.println("  " + mime);
    System.out.println("The following mime types from File have no Tika magic (but their children might):");
    for (String mime : tikaNoMagic) {
        System.out.println("  " + mime);
Also used : InputStreamReader(java.io.InputStreamReader) TikaConfig(org.apache.tika.config.TikaConfig) MediaTypeRegistry(org.apache.tika.mime.MediaTypeRegistry) MimeTypes(org.apache.tika.mime.MimeTypes) FileInputStream(java.io.FileInputStream) MimeType(org.apache.tika.mime.MimeType) TreeSet(java.util.TreeSet) MimeTypeException(org.apache.tika.mime.MimeTypeException) BufferedReader(java.io.BufferedReader) MediaType(org.apache.tika.mime.MediaType) File(java.io.File) HashSet(java.util.HashSet)

Example 5 with MimeType

use of org.apache.tika.mime.MimeType in project alliance by codice.

the class GetRelatedFilesRequestImpl method storeThumbnail.

private String storeThumbnail(Metacard metacard) throws IOException, MimeTypeException {
    String id = metacard.getId();
    byte[] thumbnailBytes = metacard.getThumbnail();
    TikaInputStream tis = TikaInputStream.get(thumbnailBytes);
    MediaType mediaType = DETECTOR.detect(tis, new Metadata());
    MimeType mimeType = TikaConfig.getDefaultConfig().getMimeRepository().forName(mediaType.toString());
    String fileName = id + "-THUMBNAIL" + mimeType.getExtension();
    String urlStr = DEFAULT_PROTOCOL + "://" + location.host_name + (port == null ? "" : ":" + port) + location.path_name + "/" + fileName;
    LOGGER.debug("Storing thumbnail for {} at location: {}", metacard.getTitle(), urlStr);
    HttpPut httpPut = new HttpPut(urlStr);
    HttpEntity entity = new ByteArrayEntity(thumbnailBytes);
    Header contentTypeHeader = new BasicHeader("Content-Type", mediaType.toString());
    HttpResponse response = httpClient.execute(httpPut);
    int statusCode = response.getStatusLine().getStatusCode();
    if (!(statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_CREATED || statusCode == HttpStatus.SC_ACCEPTED || statusCode == HttpStatus.SC_NO_CONTENT)) {
        fileName = null;
        LOGGER.debug("Unable to PUT file: code: {}, status: {}", statusCode, response.getStatusLine().getReasonPhrase());
    return fileName;
Also used : HttpEntity(org.apache.http.HttpEntity) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) HttpResponse(org.apache.http.HttpResponse) MimeType(org.apache.tika.mime.MimeType) HttpPut(org.apache.http.client.methods.HttpPut) ByteArrayEntity(org.apache.http.entity.ByteArrayEntity) Header(org.apache.http.Header) BasicHeader(org.apache.http.message.BasicHeader) MediaType(org.apache.tika.mime.MediaType) BasicHeader(org.apache.http.message.BasicHeader)


MimeType (org.apache.tika.mime.MimeType)12 MimeTypeException (org.apache.tika.mime.MimeTypeException)8 MediaType (org.apache.tika.mime.MediaType)5 MimeTypes (org.apache.tika.mime.MimeTypes)5 IOException ( TikaConfig (org.apache.tika.config.TikaConfig)4 TikaInputStream ( Metadata (org.apache.tika.metadata.Metadata)4 InputStream ( BufferedInputStream ( TikaException (org.apache.tika.exception.TikaException)2 BufferedReader ( File ( FileInputStream ( FileNotFoundException ( InputStreamReader ( HttpURLConnection ( URL ( URLConnection ( HashSet (java.util.HashSet)1