Search in sources :

Example 11 with Detector

use of org.apache.tika.detect.Detector in project tika by apache.

the class TikaConfigSerializer method addDetectors.

/**
 * Appends the detector section of {@code config} to {@code rootElement}.
 *
 * <p>When {@code mode} is {@code Mode.MINIMAL} and the configured detector is a
 * {@code DefaultDetector}, no {@code <detectors>} element is written; only an XML comment
 * holding a sample snippet is appended. Otherwise a {@code <detectors>} element is appended
 * that contains either a single {@code <detector>} entry for the configured detector, or one
 * entry per child when the detector is a {@code CompositeDetector}.
 *
 * @param mode        serialization mode; only {@code MINIMAL} and {@code CURRENT} are
 *                    treated specially here
 * @param rootElement element that receives the comment or the {@code <detectors>} element
 * @param doc         document used to create the new nodes
 * @param config      configuration whose detector is serialized
 * @throws Exception as declared by the signature
 */
private static void addDetectors(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception {
    Detector detector = config.getDetector();
    if (mode == Mode.MINIMAL && detector instanceof DefaultDetector) {
        // Everything is default: skip the <detectors> element and leave a well-formed sample
        // as a comment instead.
        Node detComment = doc.createComment("for example: <detectors><detector class=\"org.apache.tika.mime.MimeTypes\"/></detectors>");
        rootElement.appendChild(detComment);
        return;
    }
    Element detectorsElement = doc.createElement("detectors");
    // A DefaultDetector in CURRENT mode, or any non-composite detector, is written as one entry.
    boolean writeAsSingleEntry = (mode == Mode.CURRENT && detector instanceof DefaultDetector)
            || !(detector instanceof CompositeDetector);
    if (writeAsSingleEntry) {
        appendDetectorElement(doc, detectorsElement, detector);
    } else {
        List<Detector> children = ((CompositeDetector) detector).getDetectors();
        for (Detector child : children) {
            appendDetectorElement(doc, detectorsElement, child);
        }
    }
    rootElement.appendChild(detectorsElement);
}

/** Appends a {@code <detector class="...">} element for {@code detector} to {@code parent}. */
private static void appendDetectorElement(Document doc, Element parent, Detector detector) {
    Element detectorElement = doc.createElement("detector");
    // NOTE(review): getCanonicalName() is null for anonymous/local classes and uses '.' for
    // nested classes - confirm whether the config loader expects getName() instead.
    detectorElement.setAttribute("class", detector.getClass().getCanonicalName());
    parent.appendChild(detectorElement);
}
Also used : DefaultDetector(org.apache.tika.detect.DefaultDetector) CompositeDetector(org.apache.tika.detect.CompositeDetector) DefaultEncodingDetector(org.apache.tika.detect.DefaultEncodingDetector) CompositeDetector(org.apache.tika.detect.CompositeDetector) Detector(org.apache.tika.detect.Detector) CompositeEncodingDetector(org.apache.tika.detect.CompositeEncodingDetector) EncodingDetector(org.apache.tika.detect.EncodingDetector) DefaultDetector(org.apache.tika.detect.DefaultDetector) Node(org.w3c.dom.Node) Element(org.w3c.dom.Element)

Example 12 with Detector

use of org.apache.tika.detect.Detector in project tika by apache.

the class AdvancedTypeDetector method detectWithCustomDetector.

/**
 * Detects the type for {@code name} using a {@code Tika} facade built from a
 * {@code CompositeDetector} that combines a custom detector with one created from the
 * bundled mime-types definition.
 *
 * <p>The custom detector returns the media type parsed from the
 * {@code "my-custom-type-override"} metadata entry when that entry is present, and
 * {@code MediaType.OCTET_STREAM} otherwise.
 *
 * @param name value handed to {@code Tika.detect(String)}
 * @return the result of {@code Tika.detect(String)}
 * @throws Exception as declared by the signature
 */
public static String detectWithCustomDetector(String name) throws Exception {
    Detector mimeTypesDetector = MimeTypesFactory.create("/org/apache/tika/mime/tika-mimetypes.xml");
    Detector overrideDetector = new Detector() {

        private static final long serialVersionUID = -5420638839201540749L;

        @Override
        public MediaType detect(InputStream input, Metadata metadata) {
            // Only the metadata is consulted; the stream itself is never read here.
            String override = metadata.get("my-custom-type-override");
            return override == null ? MediaType.OCTET_STREAM : MediaType.parse(override);
        }
    };
    CompositeDetector combined = new CompositeDetector(overrideDetector, mimeTypesDetector);
    return new Tika(combined).detect(name);
}
Also used : CompositeDetector(org.apache.tika.detect.CompositeDetector) CompositeDetector(org.apache.tika.detect.CompositeDetector) Detector(org.apache.tika.detect.Detector) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) Tika(org.apache.tika.Tika)

Example 13 with Detector

use of org.apache.tika.detect.Detector in project tika by apache.

the class TikaDetectorConfigTest method assertDetectors.

/**
 * Verifies which container detectors are present among the children of {@code detector}.
 *
 * <p>Fails as soon as a {@code ZipContainerDetector} or {@code POIFSContainerDetector} is
 * found that was not expected, and afterwards asserts that each expected one was seen.
 *
 * @param detector        composite whose children are inspected
 * @param shouldHavePOIFS whether a {@code POIFSContainerDetector} must be present
 * @param shouldHaveZip   whether a {@code ZipContainerDetector} must be present
 */
private void assertDetectors(CompositeDetector detector, boolean shouldHavePOIFS, boolean shouldHaveZip) {
    boolean sawZip = false;
    boolean sawPoifs = false;
    for (Detector child : detector.getDetectors()) {
        if (child instanceof ZipContainerDetector) {
            if (!shouldHaveZip) {
                fail("Shouldn't have the ZipContainerDetector from config");
            }
            sawZip = true;
        }
        if (child instanceof POIFSContainerDetector) {
            if (!shouldHavePOIFS) {
                fail("Shouldn't have the POIFSContainerDetector from config");
            }
            sawPoifs = true;
        }
    }
    // Presence is only asserted for the detectors the caller asked for.
    if (shouldHavePOIFS) {
        assertTrue("Should have the POIFSContainerDetector", sawPoifs);
    }
    if (shouldHaveZip) {
        assertTrue("Should have the ZipContainerDetector", sawZip);
    }
}
Also used : POIFSContainerDetector(org.apache.tika.parser.microsoft.POIFSContainerDetector) CompositeDetector(org.apache.tika.detect.CompositeDetector) EmptyDetector(org.apache.tika.detect.EmptyDetector) Detector(org.apache.tika.detect.Detector) ZipContainerDetector(org.apache.tika.parser.pkg.ZipContainerDetector) POIFSContainerDetector(org.apache.tika.parser.microsoft.POIFSContainerDetector) DefaultDetector(org.apache.tika.detect.DefaultDetector) ZipContainerDetector(org.apache.tika.parser.pkg.ZipContainerDetector)

Example 14 with Detector

use of org.apache.tika.detect.Detector in project tika by apache.

the class ExcelParserTest method testExcel95.

/**
 * Excel 5 and 95 are older formats that only get basic support. This test covers
 * type detection, the supported-type sets of the two Office parsers, and the text and
 * metadata extracted from one sample file of each format.
 */
@Test
public void testExcel95() throws Exception {
    Detector detector = new DefaultDetector();
    AutoDetectParser parser = new AutoDetectParser();
    MediaType type;

    // Detection, Excel 5
    Metadata excel5Hints = new Metadata();
    excel5Hints.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
    try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls")) {
        type = detector.detect(input, excel5Hints);
        assertEquals("application/vnd.ms-excel", type.toString());
    }

    // Detection, Excel 95
    Metadata excel95Hints = new Metadata();
    excel95Hints.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
    try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls")) {
        type = detector.detect(input, excel95Hints);
        assertEquals("application/vnd.ms-excel", type.toString());
    }

    // The detected type is supported by OfficeParser ...
    boolean officeSupports = new OfficeParser().getSupportedTypes(new ParseContext()).contains(type);
    assertEquals(true, officeSupports);
    // ... but not by OOXMLParser
    boolean ooxmlSupports = new OOXMLParser().getSupportedTypes(new ParseContext()).contains(type);
    assertEquals(false, ooxmlSupports);

    // Extraction, Excel 5
    Metadata excel5Meta = new Metadata();
    try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls")) {
        ContentHandler body = new BodyContentHandler(-1);
        ParseContext ctx = new ParseContext();
        ctx.set(Locale.class, Locale.US);
        parser.parse(input, body, excel5Meta, ctx);
        String text = body.toString();
        // Sheet names
        assertContains("Feuil1", text);
        assertContains("Feuil3", text);
        // Cell text
        assertContains("Sample Excel", text);
        assertContains("Number", text);
        // Cell numbers
        assertContains("15", text);
        assertContains("225", text);
        // Document metadata
        assertEquals("Simple Excel document", excel5Meta.get(TikaCoreProperties.TITLE));
        assertEquals("Keith Bennett", excel5Meta.get(TikaCoreProperties.CREATOR));
    }

    // Extraction, Excel 95
    Metadata excel95Meta = new Metadata();
    try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls")) {
        ContentHandler body = new BodyContentHandler(-1);
        ParseContext ctx = new ParseContext();
        ctx.set(Locale.class, Locale.US);
        parser.parse(input, body, excel95Meta, ctx);
        String text = body.toString();
        // Sheet name; this sample has no cell text or numbers to check
        assertContains("Foglio1", text);
        // Document metadata
        assertEquals(null, excel95Meta.get(TikaCoreProperties.TITLE));
        assertEquals("Marco Quaranta", excel95Meta.get(Office.LAST_AUTHOR));
    }
}
Also used : DefaultDetector(org.apache.tika.detect.DefaultDetector) OOXMLParser(org.apache.tika.parser.microsoft.ooxml.OOXMLParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Detector(org.apache.tika.detect.Detector) DefaultDetector(org.apache.tika.detect.DefaultDetector) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MediaType(org.apache.tika.mime.MediaType) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 15 with Detector

use of org.apache.tika.detect.Detector in project tika by apache.

the class TikaDetectors method detectorAsMap.

/**
 * Fills {@code details} with a description of detector {@code d}.
 *
 * <p>Always stores the class name under {@code "name"} and a boolean under
 * {@code "composite"}. For a {@code CompositeDetector}, a list with one recursively built
 * map per child detector is additionally stored under {@code "children"}.
 *
 * @param d       detector to describe
 * @param details map that receives the entries
 */
private void detectorAsMap(Detector d, Map<String, Object> details) {
    details.put("name", d.getClass().getName());
    if (!(d instanceof CompositeDetector)) {
        details.put("composite", false);
        return;
    }
    details.put("composite", true);
    List<Map<String, Object>> childMaps = new ArrayList<Map<String, Object>>();
    for (Detector child : ((CompositeDetector) d).getDetectors()) {
        Map<String, Object> childDetails = new HashMap<String, Object>();
        detectorAsMap(child, childDetails);
        childMaps.add(childDetails);
    }
    details.put("children", childMaps);
}
Also used : CompositeDetector(org.apache.tika.detect.CompositeDetector) CompositeDetector(org.apache.tika.detect.CompositeDetector) Detector(org.apache.tika.detect.Detector) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Map(java.util.Map) HashMap(java.util.HashMap)

Aggregations

Detector (org.apache.tika.detect.Detector): 20, Metadata (org.apache.tika.metadata.Metadata): 11, MediaType (org.apache.tika.mime.MediaType): 11, DefaultDetector (org.apache.tika.detect.DefaultDetector): 10, InputStream (java.io.InputStream): 9, CompositeDetector (org.apache.tika.detect.CompositeDetector): 7, IOException (java.io.IOException): 5, TikaInputStream (org.apache.tika.io.TikaInputStream): 5, AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 5, ParseContext (org.apache.tika.parser.ParseContext): 5, BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 5, Test (org.junit.Test): 5, ContentHandler (org.xml.sax.ContentHandler): 5, TikaTest (org.apache.tika.TikaTest): 3, MimeTypeResolutionException (ddf.mime.MimeTypeResolutionException): 2, ArrayList (java.util.ArrayList): 2, TikaException (org.apache.tika.exception.TikaException): 2, MimeTypes (org.apache.tika.mime.MimeTypes): 2, Parser (org.apache.tika.parser.Parser): 2, MimeTypeResolver (ddf.mime.MimeTypeResolver): 1