Search in sources :

Example 6 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class TikaConfigTest method testDynamicServiceLoaderFromConfig.

/**
 * Loads a {@link TikaConfig} from the {@code TIKA-1700-dynamic.xml} test
 * resource and asserts that the service loader exposed by the configured
 * parser reports dynamic loading as enabled.
 *
 * @throws Exception if the configuration cannot be read or built
 */
@Test
public void testDynamicServiceLoaderFromConfig() throws Exception {
    // The resource is resolved relative to TikaConfigTest on the classpath.
    TikaConfig config = new TikaConfig(TikaConfigTest.class.getResource("TIKA-1700-dynamic.xml"));
    // This config is expected to yield a DummyParser as its parser; the cast
    // fails the test with a ClassCastException otherwise.
    ServiceLoader loader = ((DummyParser) config.getParser()).getLoader();
    assertTrue("Dynamic Service Loading Should be true", loader.isDynamic());
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) URL(java.net.URL) Test(org.junit.Test) TikaConfigTest(org.apache.tika.config.TikaConfigTest)

Example 7 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class MyFirstTika method main.

/**
 * Parses the file named by the first command-line argument twice with the
 * default {@link TikaConfig} -- once through {@code parseUsingComponents}
 * and once through {@code parseUsingAutoDetect} -- and prints the metadata
 * and text produced by each pass to standard output.
 *
 * @param args command-line arguments; {@code args[0]} is the file to parse
 * @throws Exception if either parse pass fails
 */
public static void main(String[] args) throws Exception {
    final String path = args[0];
    final TikaConfig config = TikaConfig.getDefaultConfig();

    // Pass 1: the component-based variant, with its own Metadata instance.
    final Metadata componentMetadata = new Metadata();
    final String componentText = parseUsingComponents(path, config, componentMetadata);
    System.out.println("Parsed Metadata: ");
    System.out.println(componentMetadata);
    System.out.println("Parsed Text: ");
    System.out.println(componentText);

    System.out.println("-------------------------");

    // Pass 2: the auto-detect variant, starting from a fresh Metadata so the
    // two reports do not share state.
    final Metadata autoDetectMetadata = new Metadata();
    final String autoDetectText = parseUsingAutoDetect(path, config, autoDetectMetadata);
    System.out.println("Parsed Metadata: ");
    System.out.println(autoDetectMetadata);
    System.out.println("Parsed Text: ");
    System.out.println(autoDetectText);
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) Metadata(org.apache.tika.metadata.Metadata)

Example 8 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class MimeUtilTest method assertResult.

/**
 * Asserts that the default Tika mime repository maps the given content type
 * to the expected file extension.
 *
 * @param contentType the mime type name to look up
 * @param expected the extension the looked-up type should report
 * @throws MimeTypeException if {@code forName} rejects the content type
 */
private void assertResult(String contentType, String expected) throws MimeTypeException {
    // Resolve the type through the default configuration's repository and
    // compare the extension it reports against the expectation.
    MimeType resolved = TikaConfig.getDefaultConfig()
            .getMimeRepository()
            .forName(contentType);
    assertEquals(expected, resolved.getExtension());
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) MimeTypes(org.apache.tika.mime.MimeTypes) MimeType(org.apache.tika.mime.MimeType)

Example 9 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class TikaCLI method dumpConfig.

/**
 * Serializes the active Tika configuration to standard output as UTF-8.
 * Uses the {@code config} field when it is set, otherwise the default
 * configuration.
 *
 * @param mode the serialization mode handed to {@link TikaConfigSerializer}
 * @throws Exception if serialization fails
 */
private void dumpConfig(TikaConfigSerializer.Mode mode) throws Exception {
    TikaConfig localConfig = config;
    if (localConfig == null) {
        // Nothing configured explicitly: fall back to the defaults.
        localConfig = TikaConfig.getDefaultConfig();
    }
    OutputStreamWriter stdout = new OutputStreamWriter(System.out, UTF_8);
    TikaConfigSerializer.serialize(localConfig, mode, stdout, UTF_8);
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) OutputStreamWriter(java.io.OutputStreamWriter)

Example 10 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class TikaCLI method compareFileMagic.

/**
 * Compares our mime types registry with the File(1) tool's
 * directory of (uncompiled) Magic entries.
 * (Well, those with mimetypes anyway)
 *
 * <p>Every regular file directly inside {@code magicDir} is scanned for
 * lines starting with {@code !:mime} or {@code #!:mime}; the mime types
 * named on those lines are then looked up in the default Tika
 * configuration. A report is printed to standard output listing the types
 * Tika does not know at all, and the types Tika knows but for which neither
 * the type itself, its child types, nor its ancestors have magic.
 *
 * @param magicDir Path to the magic directory
 * @throws IllegalArgumentException if the directory does not look like an
 *         uncompressed file magic directory, or cannot be listed
 * @throws Exception if reading one of the magic files fails
 */
private void compareFileMagic(String magicDir) throws Exception {
    Set<String> tikaLacking = new TreeSet<String>();
    Set<String> tikaNoMagic = new TreeSet<String>();

    // Sanity check: a real magic directory holds these three well-known entries
    File dir = new File(magicDir);
    boolean looksPlausible = new File(dir, "elf").exists()
            && new File(dir, "mime").exists()
            && new File(dir, "vorbis").exists();
    if (!looksPlausible) {
        throw new IllegalArgumentException(magicDir + " doesn't seem to hold uncompressed file magic entries");
    }

    // listFiles() returns null on an I/O error or if the path is not a
    // directory; fail clearly instead of with a NullPointerException.
    File[] magicFiles = dir.listFiles();
    if (magicFiles == null) {
        throw new IllegalArgumentException(magicDir + " could not be listed");
    }

    // Find all the mimetypes in the directory
    Set<String> fileMimes = new HashSet<String>();
    for (File mf : magicFiles) {
        if (!mf.isFile()) {
            continue;
        }
        // try-with-resources so the reader is closed even if readLine() throws
        try (BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(mf), UTF_8))) {
            String line;
            while ((line = r.readLine()) != null) {
                String prefix;
                if (line.startsWith("!:mime")) {
                    prefix = "!:mime";
                } else if (line.startsWith("#!:mime")) {
                    prefix = "#!:mime";
                } else {
                    continue;
                }
                // Cut by the matched prefix length rather than a fixed offset,
                // so a bare "!:mime" line cannot run past the end of the string.
                String mime = line.substring(prefix.length()).trim();
                if (!mime.isEmpty()) {
                    fileMimes.add(mime);
                }
            }
        }
    }

    // See how those compare to the Tika ones
    TikaConfig config = TikaConfig.getDefaultConfig();
    MimeTypes mimeTypes = config.getMimeRepository();
    MediaTypeRegistry registry = config.getMediaTypeRegistry();
    for (String mime : fileMimes) {
        try {
            final MimeType type = mimeTypes.getRegisteredMimeType(mime);
            if (type == null) {
                // Tika doesn't know about this one
                tikaLacking.add(mime);
                continue;
            }

            // Tika knows about this one!
            // Does Tika have magic for it?
            boolean hasMagic = type.hasMagic();

            // How about the children?
            if (!hasMagic) {
                for (MediaType child : registry.getChildTypes(type.getType())) {
                    MimeType childType = mimeTypes.getRegisteredMimeType(child.toString());
                    if (childType != null && childType.hasMagic()) {
                        hasMagic = true;
                        // One child with magic is enough
                        break;
                    }
                }
            }

            // How about the parents?
            MimeType parentType = type;
            while (parentType != null && !hasMagic) {
                if (parentType.hasMagic()) {
                    // Has magic, fine
                    hasMagic = true;
                } else {
                    // Check the parent next. The supertype must be taken from
                    // the type currently being inspected, not from the
                    // original type: otherwise the walk never gets past the
                    // first parent and loops forever when that parent has
                    // no magic.
                    MediaType parent = registry.getSupertype(parentType.getType());
                    if (MediaType.APPLICATION_XML.equals(parent)
                            || MediaType.TEXT_PLAIN.equals(parent)
                            || MediaType.OCTET_STREAM.equals(parent)) {
                        // Stop checking parents if we hit a top level type
                        parent = null;
                    }
                    parentType = (parent != null)
                            ? mimeTypes.getRegisteredMimeType(parent.toString())
                            : null;
                }
            }

            if (!hasMagic) {
                tikaNoMagic.add(mime);
            }
        } catch (MimeTypeException ignored) {
            // Broken entry in the file magic directory
            // Silently skip
        }
    }

    // Check how many tika knows about
    int tikaTypes = 0;
    int tikaAliases = 0;
    for (MediaType type : registry.getTypes()) {
        tikaTypes++;
        tikaAliases += registry.getAliases(type).size();
    }

    // Report
    System.out.println("Tika knows about " + tikaTypes + " unique mime types");
    System.out.println("Tika knows about " + (tikaTypes + tikaAliases) + " mime types including aliases");
    System.out.println("The File Magic directory knows about " + fileMimes.size() + " unique mime types");
    System.out.println();
    System.out.println("The following mime types are known to File but not Tika:");
    for (String mime : tikaLacking) {
        System.out.println("  " + mime);
    }
    System.out.println();
    System.out.println("The following mime types from File have no Tika magic (but their children might):");
    for (String mime : tikaNoMagic) {
        System.out.println("  " + mime);
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) TikaConfig(org.apache.tika.config.TikaConfig) MediaTypeRegistry(org.apache.tika.mime.MediaTypeRegistry) MimeTypes(org.apache.tika.mime.MimeTypes) FileInputStream(java.io.FileInputStream) MimeType(org.apache.tika.mime.MimeType) TreeSet(java.util.TreeSet) MimeTypeException(org.apache.tika.mime.MimeTypeException) BufferedReader(java.io.BufferedReader) MediaType(org.apache.tika.mime.MediaType) File(java.io.File) HashSet(java.util.HashSet)

Aggregations

TikaConfig (org.apache.tika.config.TikaConfig)62 Test (org.junit.Test)32 Metadata (org.apache.tika.metadata.Metadata)26 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)20 TikaTest (org.apache.tika.TikaTest)16 InputStream (java.io.InputStream)12 Tika (org.apache.tika.Tika)12 IOException (java.io.IOException)10 URL (java.net.URL)10 TikaException (org.apache.tika.exception.TikaException)9 TikaInputStream (org.apache.tika.io.TikaInputStream)9 ParseContext (org.apache.tika.parser.ParseContext)9 Parser (org.apache.tika.parser.Parser)9 MediaType (org.apache.tika.mime.MediaType)8 CompositeParser (org.apache.tika.parser.CompositeParser)8 ByteArrayInputStream (java.io.ByteArrayInputStream)7 File (java.io.File)6 TikaConfigTest (org.apache.tika.config.TikaConfigTest)6 HashSet (java.util.HashSet)5 SAXException (org.xml.sax.SAXException)5