Search in sources :

Example 26 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class TikaCLI method compareFileMagic.

/**
 * Compares our mime types registry with the File(1) tool's
 * directory of (uncompiled) Magic entries.
 * (Well, those with mimetypes anyway)
 * <p>
 * Every <code>!:mime</code> / <code>#!:mime</code> directive found in the
 * regular files of the directory is collected, then looked up in the default
 * Tika configuration. Types Tika does not know, and types for which neither
 * the type itself, its direct children, nor any of its ancestors carry magic,
 * are printed to standard output together with some summary counts.
 *
 * @param magicDir Path to the magic directory
 * @throws IllegalArgumentException if the directory does not contain the
 *         "elf", "mime" and "vorbis" entries expected of a magic directory
 * @throws Exception if the directory cannot be listed or a magic file
 *         cannot be read
 */
private void compareFileMagic(String magicDir) throws Exception {
    Set<String> tikaLacking = new TreeSet<String>();
    Set<String> tikaNoMagic = new TreeSet<String>();
    // Sanity check: a few well known entries must be present
    File dir = new File(magicDir);
    boolean looksPlausible = (new File(dir, "elf")).exists()
            && (new File(dir, "mime")).exists()
            && (new File(dir, "vorbis")).exists();
    if (!looksPlausible) {
        throw new IllegalArgumentException(magicDir + " doesn't seem to hold uncompressed file magic entries");
    }
    // Find all the mimetypes in the directory
    Set<String> fileMimes = new HashSet<String>();
    File[] magicFiles = dir.listFiles();
    if (magicFiles == null) {
        // listFiles() reports I/O problems by returning null rather than throwing
        throw new java.io.IOException("Unable to list the contents of " + magicDir);
    }
    for (File mf : magicFiles) {
        if (!mf.isFile()) {
            continue;
        }
        // try-with-resources so the reader is closed even if reading fails
        try (BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(mf), UTF_8))) {
            String line;
            while ((line = r.readLine()) != null) {
                String mime;
                if (line.startsWith("#!:mime")) {
                    mime = line.substring("#!:mime".length()).trim();
                } else if (line.startsWith("!:mime")) {
                    mime = line.substring("!:mime".length()).trim();
                } else {
                    continue;
                }
                // A directive with no value carries no type to compare
                if (!mime.isEmpty()) {
                    fileMimes.add(mime);
                }
            }
        }
    }
    // See how those compare to the Tika ones
    TikaConfig config = TikaConfig.getDefaultConfig();
    MimeTypes mimeTypes = config.getMimeRepository();
    MediaTypeRegistry registry = config.getMediaTypeRegistry();
    for (String mime : fileMimes) {
        try {
            final MimeType type = mimeTypes.getRegisteredMimeType(mime);
            if (type == null) {
                // Tika doesn't know about this one
                tikaLacking.add(mime);
            } else {
                // Tika knows about this one!
                // Does Tika have magic for it?
                boolean hasMagic = type.hasMagic();
                // How about the children?
                if (!hasMagic) {
                    for (MediaType child : registry.getChildTypes(type.getType())) {
                        MimeType childType = mimeTypes.getRegisteredMimeType(child.toString());
                        if (childType != null && childType.hasMagic()) {
                            hasMagic = true;
                            break;
                        }
                    }
                }
                // How about the parents?
                MimeType parentType = type;
                while (parentType != null && !hasMagic) {
                    if (parentType.hasMagic()) {
                        // Has magic, fine
                        hasMagic = true;
                    } else {
                        // Check the parent next. The lookup must start from the type
                        // currently being inspected, otherwise the walk never advances
                        // past the first parent and can loop forever.
                        MediaType parent = registry.getSupertype(parentType.getType());
                        if (MediaType.APPLICATION_XML.equals(parent)
                                || MediaType.TEXT_PLAIN.equals(parent)
                                || MediaType.OCTET_STREAM.equals(parent)) {
                            // Stop checking parents if we hit a top level type
                            parent = null;
                        }
                        if (parent != null) {
                            parentType = mimeTypes.getRegisteredMimeType(parent.toString());
                        } else {
                            parentType = null;
                        }
                    }
                }
                if (!hasMagic) {
                    tikaNoMagic.add(mime);
                }
            }
        } catch (MimeTypeException e) {
            // Broken entry in the file magic directory
            // Silently skip
        }
    }
    // Check how many tika knows about
    int tikaTypes = 0;
    int tikaAliases = 0;
    for (MediaType type : registry.getTypes()) {
        tikaTypes++;
        tikaAliases += registry.getAliases(type).size();
    }
    // Report
    System.out.println("Tika knows about " + tikaTypes + " unique mime types");
    System.out.println("Tika knows about " + (tikaTypes + tikaAliases) + " mime types including aliases");
    System.out.println("The File Magic directory knows about " + fileMimes.size() + " unique mime types");
    System.out.println();
    System.out.println("The following mime types are known to File but not Tika:");
    for (String mime : tikaLacking) {
        System.out.println("  " + mime);
    }
    System.out.println();
    System.out.println("The following mime types from File have no Tika magic (but their children might):");
    for (String mime : tikaNoMagic) {
        System.out.println("  " + mime);
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) TikaConfig(org.apache.tika.config.TikaConfig) MediaTypeRegistry(org.apache.tika.mime.MediaTypeRegistry) MimeTypes(org.apache.tika.mime.MimeTypes) FileInputStream(java.io.FileInputStream) MimeType(org.apache.tika.mime.MimeType) TreeSet(java.util.TreeSet) MimeTypeException(org.apache.tika.mime.MimeTypeException) BufferedReader(java.io.BufferedReader) MediaType(org.apache.tika.mime.MediaType) File(java.io.File) HashSet(java.util.HashSet)

Example 27 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class TikaCLI method displayParser.

/**
 * Prints one parser, and recursively the parsers nested inside a composite,
 * to standard output.
 *
 * @param p the parser to print; a {@link ParserDecorator} is unwrapped first
 * @param includeMimeTypes whether to list the supported types of
 *        non-composite parsers
 * @param apt whether to emit apt markup (bullets, links, blank separator lines)
 * @param i the indent passed to <code>indent</code> for this parser's line
 */
private void displayParser(Parser p, boolean includeMimeTypes, boolean apt, int i) {
    // Remember the decoration, then carry on with the wrapped parser
    String wrapperNote = null;
    if (p instanceof ParserDecorator) {
        ParserDecorator decorator = (ParserDecorator) p;
        wrapperNote = " (Wrapped by " + decorator.getDecorationName() + ")";
        p = decorator.getWrappedParser();
    }

    final boolean composite = p instanceof CompositeParser;
    final String bullet = apt ? "* " : "";

    String label = p.getClass().getName();
    if (apt) {
        // Package prefix as plain text, simple class name as a link to the API docs
        int simpleNameStart = label.lastIndexOf(".") + 1;
        String packagePrefix = label.substring(0, simpleNameStart);
        String simpleName = label.substring(simpleNameStart);
        label = packagePrefix + "{{{./api/" + label.replace(".", "/") + "}" + simpleName + "}}";
    } else if (wrapperNote != null) {
        label += wrapperNote;
    }

    // Don't display Composite parsers in the apt output.
    if (!apt || !composite) {
        String suffix = composite ? " (Composite Parser):" : "";
        System.out.println(indent(i) + bullet + label + suffix);
        if (apt) {
            System.out.println();
        }
        if (includeMimeTypes && !composite) {
            for (MediaType supported : p.getSupportedTypes(context)) {
                System.out.println(indent(i + 3) + bullet + supported);
                if (apt) {
                    System.out.println();
                }
            }
        }
    }

    if (!composite) {
        return;
    }
    // Don't indent for Composites in apt.
    final int childIndent = apt ? i : i + 3;
    Parser[] children = sortParsers(invertMediaTypeMap(((CompositeParser) p).getParsers()));
    for (Parser child : children) {
        displayParser(child, includeMimeTypes, apt, childIndent);
    }
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) MediaType(org.apache.tika.mime.MediaType) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DigestingParser(org.apache.tika.parser.DigestingParser) NetworkParser(org.apache.tika.parser.NetworkParser) ForkParser(org.apache.tika.fork.ForkParser)

Example 28 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class OOXMLParserTest method testExcelXLSB.

/**
 * Checks detection of the XLSB test document, which of the Office parsers
 * claim the detected type, and that auto-detected parsing extracts the
 * expected text.
 */
@Test
public void testExcelXLSB() throws Exception {
    final String resourcePath = "/test-documents/testEXCEL.xlsb";

    Detector typeDetector = new DefaultDetector();
    AutoDetectParser autoParser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    metadata.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");

    // Should be detected correctly
    MediaType detected;
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(resourcePath)) {
        detected = typeDetector.detect(stream, metadata);
        assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", detected.toString());
    }

    // OfficeParser won't handle it
    OfficeParser officeParser = new OfficeParser();
    assertEquals(false, officeParser.getSupportedTypes(new ParseContext()).contains(detected));

    // OOXMLParser will (soon) handle it
    OOXMLParser ooxmlParser = new OOXMLParser();
    assertTrue(ooxmlParser.getSupportedTypes(new ParseContext()).contains(detected));

    // AutoDetectParser doesn't break on it
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(resourcePath)) {
        ContentHandler bodyHandler = new BodyContentHandler(-1);
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);
        autoParser.parse(stream, bodyHandler, metadata, parseContext);
        assertContains("This is an example spreadsheet", bodyHandler.toString());
    }
}
Also used : DefaultDetector(org.apache.tika.detect.DefaultDetector) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Detector(org.apache.tika.detect.Detector) DefaultDetector(org.apache.tika.detect.DefaultDetector) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MediaType(org.apache.tika.mime.MediaType) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 29 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class ParserDecoratorTest method withFallback.

/**
 * Testing one proposed implementation for TIKA-1509: a fallback chain
 * should advertise the types it was given, and move on to the next
 * parser only when the previous one fails.
 */
@Test
public void withFallback() throws Exception {
    final byte[] payload = new byte[] { 0, 1, 2, 3, 4 };
    ParseContext context = new ParseContext();

    Set<MediaType> onlyOct = Collections.singleton(MediaType.OCTET_STREAM);
    Set<MediaType> octAndText = new HashSet<MediaType>(Arrays.asList(MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));

    ErrorParser failing = new ErrorParser();
    DummyParser working = new DummyParser(onlyOct, new HashMap<String, String>(), "Fell back!");
    EmptyParser silent = new EmptyParser();

    // Create a combination which will fail first
    @SuppressWarnings("deprecation") Parser chain = ParserDecorator.withFallbacks(Arrays.asList(failing, working), octAndText);

    // Will claim to support the types given, not those on the child parsers
    Set<MediaType> advertised = chain.getSupportedTypes(context);
    assertEquals(2, advertised.size());
    assertEquals(advertised.toString(), true, advertised.contains(MediaType.TEXT_PLAIN));
    assertEquals(advertised.toString(), true, advertised.contains(MediaType.OCTET_STREAM));

    // Parsing will make it to the second one
    Metadata metadata = new Metadata();
    BodyContentHandler handler = new BodyContentHandler();
    chain.parse(new ByteArrayInputStream(payload), handler, metadata, context);
    assertEquals("Fell back!", handler.toString());

    // With a parser that will work with no output, will get nothing
    chain = ParserDecorator.withFallbacks(Arrays.asList(silent, working), octAndText);
    metadata = new Metadata();
    handler = new BodyContentHandler();
    chain.parse(new ByteArrayInputStream(payload), handler, metadata, context);
    assertEquals("", handler.toString());
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Metadata(org.apache.tika.metadata.Metadata) ByteArrayInputStream(java.io.ByteArrayInputStream) MediaType(org.apache.tika.mime.MediaType) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 30 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class MagicDetectorTest method testDetectNull.

/**
 * A magic detector handed a null stream is expected to report
 * {@link MediaType#OCTET_STREAM}.
 */
@Test
public void testDetectNull() throws Exception {
    // Detector configured to recognise the "<html" prefix as text/html
    Detector htmlDetector = new MagicDetector(new MediaType("text", "html"), "<html".getBytes(US_ASCII));
    MediaType detected = htmlDetector.detect(null, new Metadata());
    assertEquals(MediaType.OCTET_STREAM, detected);
}
Also used : Metadata(org.apache.tika.metadata.Metadata) MediaType(org.apache.tika.mime.MediaType) Test(org.junit.Test)

Aggregations

MediaType (org.apache.tika.mime.MediaType)88 Test (org.junit.Test)28 Metadata (org.apache.tika.metadata.Metadata)27 InputStream (java.io.InputStream)23 TikaInputStream (org.apache.tika.io.TikaInputStream)17 Parser (org.apache.tika.parser.Parser)17 ParseContext (org.apache.tika.parser.ParseContext)16 IOException (java.io.IOException)15 TikaException (org.apache.tika.exception.TikaException)13 CompositeParser (org.apache.tika.parser.CompositeParser)13 ContentHandler (org.xml.sax.ContentHandler)13 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)12 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)12 TikaTest (org.apache.tika.TikaTest)10 Detector (org.apache.tika.detect.Detector)10 HashSet (java.util.HashSet)8 ByteArrayInputStream (java.io.ByteArrayInputStream)7 TikaConfig (org.apache.tika.config.TikaConfig)7 MediaTypeRegistry (org.apache.tika.mime.MediaTypeRegistry)7 ArrayList (java.util.ArrayList)6