Search in sources:

Example 1 with CompositeDetector

use of org.apache.tika.detect.CompositeDetector in project tika by apache.

the class TikaDetectorConfigTest method testPSTDetectionWithoutZipDetector.

/**
 * TIKA-1708 - If the Zip detector is disabled, either explicitly,
 * or via giving a list of detectors that it isn't part of, ensure
 * that detection of PST files still works.
 *
 * <p>Two configurations are exercised: one loaded from
 * {@code TIKA-1708-detector-default.xml} and one loaded from
 * {@code TIKA-1708-detector-composite.xml}.
 *
 * @throws Exception if a configuration cannot be loaded or the test
 *         document cannot be opened or read
 */
@Test
public void testPSTDetectionWithoutZipDetector() throws Exception {
    // Check the one with an exclude
    TikaConfig configWX = getConfig("TIKA-1708-detector-default.xml");
    assertNotNull(configWX.getParser());
    assertNotNull(configWX.getDetector());
    CompositeDetector detectorWX = (CompositeDetector) configWX.getDetector();
    // Check it has the POIFS one, but not the zip one
    assertDetectors(detectorWX, true, false);

    // Check the one with an explicit list
    TikaConfig configCL = getConfig("TIKA-1708-detector-composite.xml");
    assertNotNull(configCL.getParser());
    assertNotNull(configCL.getDetector());
    CompositeDetector detectorCL = (CompositeDetector) configCL.getDetector();
    assertEquals(2, detectorCL.getDetectors().size());
    // Check it also has the POIFS one, but not the zip one
    assertDetectors(detectorCL, true, false);

    // Check that both configs have a mime type registry with entries
    int typeCountWX = configWX.getMediaTypeRegistry().getTypes().size();
    int typeCountCL = configCL.getMediaTypeRegistry().getTypes().size();
    assertTrue("Not enough mime types: " + typeCountWX, typeCountWX > 100);
    assertTrue("Not enough mime types: " + typeCountCL, typeCountCL > 100);

    // Now check they detect PST files correctly. The stream is opened in a
    // try-with-resources so the underlying file handle is released even when
    // one of the assertions below fails.
    try (TikaInputStream stream = TikaInputStream.get(getResourceAsFile("/test-documents/testPST.pst"))) {
        // The same stream instance is handed to both detectors in turn.
        assertEquals(OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE, detectorWX.detect(stream, new Metadata()));
        assertEquals(OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE, detectorCL.detect(stream, new Metadata()));
    }
}
Also used : CompositeDetector(org.apache.tika.detect.CompositeDetector) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) Test(org.junit.Test)

Example 2 with CompositeDetector

use of org.apache.tika.detect.CompositeDetector in project tika by apache.

the class TikaCLI method displayDetector.

/**
 * Prints the class name of the given detector to standard output, prefixed
 * by {@code indent(i)}, and recurses into the children of a
 * {@link CompositeDetector} with the indent level increased by two.
 *
 * @param d the detector to print
 * @param i the indent level passed to {@code indent}
 */
private void displayDetector(Detector d, int i) {
    final String className = d.getClass().getName();
    final String header = indent(i) + className;

    // Plain detectors are a single line; nothing to descend into.
    if (!(d instanceof CompositeDetector)) {
        System.out.println(header);
        return;
    }

    System.out.println(header + " (Composite Detector):");
    for (Detector child : ((CompositeDetector) d).getDetectors()) {
        displayDetector(child, i + 2);
    }
}
Also used : CompositeDetector(org.apache.tika.detect.CompositeDetector) CompositeDetector(org.apache.tika.detect.CompositeDetector) Detector(org.apache.tika.detect.Detector) DefaultDetector(org.apache.tika.detect.DefaultDetector)

Example 3 with CompositeDetector

use of org.apache.tika.detect.CompositeDetector in project tika by apache.

the class DumpTikaConfigExampleTest method testDump.

/**
 * Serializes the default Tika configuration to {@code configFile} for every
 * combination of charset (UTF-8, UTF-16LE) and serializer mode, reloads it,
 * and checks that the reloaded configuration exposes a composite parser and
 * a composite detector with a plausible number of children.
 *
 * @throws Exception if the configuration cannot be written or reloaded
 */
@Test
public void testDump() throws Exception {
    // NOTE(review): this instance is never used below; kept in case its
    // construction is intentional - confirm and remove if not.
    DumpTikaConfigExample ex = new DumpTikaConfigExample();
    for (Charset charset : new Charset[] { UTF_8, UTF_16LE }) {
        for (TikaConfigSerializer.Mode mode : TikaConfigSerializer.Mode.values()) {
            // try-with-resources guarantees the file handle is released even
            // if serialization throws; closing the writer also flushes it.
            try (FileOutputStream out = new FileOutputStream(configFile);
                    Writer writer = new OutputStreamWriter(out, charset)) {
                TikaConfigSerializer.serialize(TikaConfig.getDefaultConfig(), mode, writer, charset);
            }

            TikaConfig c = new TikaConfig(configFile);
            assertTrue(c.getParser().toString(), c.getParser() instanceof CompositeParser);
            assertTrue(c.getDetector().toString(), c.getDetector() instanceof CompositeDetector);

            CompositeParser p = (CompositeParser) c.getParser();
            assertTrue("enough parsers?", p.getParsers().size() > 130);

            CompositeDetector d = (CompositeDetector) c.getDetector();
            assertTrue("enough detectors?", d.getDetectors().size() > 3);

            //just try to load it into autodetect to make sure no errors are thrown
            Parser auto = new AutoDetectParser(c);
            assertNotNull(auto);
        }
    }
}
Also used : CompositeDetector(org.apache.tika.detect.CompositeDetector) TikaConfig(org.apache.tika.config.TikaConfig) CompositeParser(org.apache.tika.parser.CompositeParser) FileOutputStream(java.io.FileOutputStream) Charset(java.nio.charset.Charset) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) OutputStreamWriter(java.io.OutputStreamWriter) TikaConfigSerializer(org.apache.tika.config.TikaConfigSerializer) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 4 with CompositeDetector

use of org.apache.tika.detect.CompositeDetector in project tika by apache.

the class TikaDetectors method detectorAsHTML.

/**
 * Appends an HTML fragment describing the given detector to {@code html}:
 * a heading of the given level holding the class name without its package,
 * a paragraph with the fully qualified class name, and, for a
 * {@link CompositeDetector}, a marker paragraph followed by the same
 * fragment for each child at the next heading level.
 *
 * @param d     the detector to describe
 * @param html  the buffer the fragment is appended to
 * @param level the heading level used for this detector
 */
private void detectorAsHTML(Detector d, StringBuffer html, int level) {
    html.append("<h").append(level).append(">");

    final String className = d.getClass().getName();
    // Everything after the last '.' is the class name without its package.
    final String shortName = className.substring(className.lastIndexOf('.') + 1);

    html.append(shortName).append("</h").append(level).append(">");
    html.append("<p>Class: ").append(className).append("</p>");

    if (!(d instanceof CompositeDetector)) {
        return;
    }

    html.append("<p>Composite Detector</p>");
    for (Detector child : ((CompositeDetector) d).getDetectors()) {
        detectorAsHTML(child, html, level + 1);
    }
}
Also used : CompositeDetector(org.apache.tika.detect.CompositeDetector) CompositeDetector(org.apache.tika.detect.CompositeDetector) Detector(org.apache.tika.detect.Detector)

Example 5 with CompositeDetector

use of org.apache.tika.detect.CompositeDetector in project tika by apache.

the class TikaDetectors method renderDetector.

/**
 * Appends a plain-text line for the given detector to {@code text}: two
 * spaces per indent level, then the fully qualified class name. A
 * {@link CompositeDetector} is additionally labelled and followed by one
 * line per child, rendered one indent level deeper.
 *
 * @param d      the detector to render
 * @param text   the buffer the output is appended to
 * @param indent the number of two-space indent units to emit before the name
 */
private void renderDetector(Detector d, StringBuffer text, int indent) {
    final String className = d.getClass().getName();

    for (int remaining = indent; remaining > 0; remaining--) {
        text.append("  ");
    }
    text.append(className);

    // Non-composite detectors just terminate the line.
    if (!(d instanceof CompositeDetector)) {
        text.append("\n");
        return;
    }

    text.append(" (Composite Detector):\n");
    for (Detector child : ((CompositeDetector) d).getDetectors()) {
        renderDetector(child, text, indent + 1);
    }
}
Also used : CompositeDetector(org.apache.tika.detect.CompositeDetector) CompositeDetector(org.apache.tika.detect.CompositeDetector) Detector(org.apache.tika.detect.Detector)

Aggregations

CompositeDetector (org.apache.tika.detect.CompositeDetector): 11, Detector (org.apache.tika.detect.Detector): 7, DefaultDetector (org.apache.tika.detect.DefaultDetector): 4, Test (org.junit.Test): 3, Tika (org.apache.tika.Tika): 2, Metadata (org.apache.tika.metadata.Metadata): 2, FileOutputStream (java.io.FileOutputStream): 1, InputStream (java.io.InputStream): 1, OutputStreamWriter (java.io.OutputStreamWriter): 1, Writer (java.io.Writer): 1, URL (java.net.URL): 1, Charset (java.nio.charset.Charset): 1, ArrayList (java.util.ArrayList): 1, HashMap (java.util.HashMap): 1, Map (java.util.Map): 1, TikaConfig (org.apache.tika.config.TikaConfig): 1, TikaConfigSerializer (org.apache.tika.config.TikaConfigSerializer): 1, CompositeEncodingDetector (org.apache.tika.detect.CompositeEncodingDetector): 1, DefaultEncodingDetector (org.apache.tika.detect.DefaultEncodingDetector): 1, EmptyDetector (org.apache.tika.detect.EmptyDetector): 1