Search in sources :

Example 6 with DefaultDetector

use of org.apache.tika.detect.DefaultDetector in project tika by apache.

the class TikaDetectorConfigTest method testDetectorExcludeFromDefault.

/**
 * Loads the TIKA-1702 detector blacklist config and checks that the configured
 * detector is a composite of a DefaultDetector followed by an EmptyDetector,
 * then compares the configured DefaultDetector with a freshly built one.
 */
@Test
public void testDetectorExcludeFromDefault() throws Exception {
    TikaConfig config = getConfig("TIKA-1702-detector-blacklist.xml");
    assertNotNull(config.getParser());
    assertNotNull(config.getDetector());

    // The configured detector is expected to be a composite wrapping exactly two detectors
    CompositeDetector composite = (CompositeDetector) config.getDetector();
    assertEquals(2, composite.getDetectors().size());

    // Order matters: DefaultDetector first, EmptyDetector second
    Object first = composite.getDetectors().get(0);
    Object second = composite.getDetectors().get(1);
    assertEquals(DefaultDetector.class, first.getClass());
    assertEquals(EmptyDetector.class, second.getClass());

    // The DefaultDetector as built from the config file
    DefaultDetector fromConfig = (DefaultDetector) first;
    // A fresh, unconfigured DefaultDetector sharing the same mime repository
    DefaultDetector fresh = new DefaultDetector(config.getMimeRepository());

    // The fresh one still offers the Zip and POIFS detectors
    assertDetectors(fresh, true, true);
    // The configured one does not, since the config excluded them
    assertDetectors(fromConfig, false, false);
}
Also used : DefaultDetector(org.apache.tika.detect.DefaultDetector) CompositeDetector(org.apache.tika.detect.CompositeDetector) Test(org.junit.Test)

Example 7 with DefaultDetector

use of org.apache.tika.detect.DefaultDetector in project tika by apache.

the class TikaConfigSerializer method addDetectors.

/**
 * Appends the detector section of the serialized config to {@code rootElement}.
 * <p>
 * In {@code MINIMAL} mode with a {@link DefaultDetector} only an XML comment is
 * written. Otherwise a {@code <detectors>} element is written that contains
 * either one {@code <detector>} entry for the configured detector itself, or
 * one entry per child of a {@link CompositeDetector}.
 *
 * @param mode        serialization mode controlling how much is written
 * @param rootElement element the output is appended to
 * @param doc         document used to create the new nodes
 * @param config      configuration whose detector is serialized
 */
private static void addDetectors(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception {
    final Detector configured = config.getDetector();

    if (mode == Mode.MINIMAL && configured instanceof DefaultDetector) {
        // Defaults are in use: emit only an illustrative XML comment
        rootElement.appendChild(doc.createComment("for example: <detectors><detector class=\"org.apache.tika.detector.MimeTypes\"></detectors>"));
        return;
    }

    final Element container = doc.createElement("detectors");

    // Children are listed individually only for a composite that is not a
    // DefaultDetector being written in CURRENT mode.
    final boolean listChildren = configured instanceof CompositeDetector
            && !(mode == Mode.CURRENT && configured instanceof DefaultDetector);

    if (listChildren) {
        for (Detector child : ((CompositeDetector) configured).getDetectors()) {
            Element entry = doc.createElement("detector");
            entry.setAttribute("class", child.getClass().getCanonicalName());
            container.appendChild(entry);
        }
    } else {
        // Single entry naming the configured detector's own class
        Element entry = doc.createElement("detector");
        entry.setAttribute("class", configured.getClass().getCanonicalName());
        container.appendChild(entry);
    }

    rootElement.appendChild(container);
}
Also used : DefaultDetector(org.apache.tika.detect.DefaultDetector) CompositeDetector(org.apache.tika.detect.CompositeDetector) DefaultEncodingDetector(org.apache.tika.detect.DefaultEncodingDetector) CompositeDetector(org.apache.tika.detect.CompositeDetector) Detector(org.apache.tika.detect.Detector) CompositeEncodingDetector(org.apache.tika.detect.CompositeEncodingDetector) EncodingDetector(org.apache.tika.detect.EncodingDetector) DefaultDetector(org.apache.tika.detect.DefaultDetector) Node(org.w3c.dom.Node) Element(org.w3c.dom.Element)

Example 8 with DefaultDetector

use of org.apache.tika.detect.DefaultDetector in project tika by apache.

the class ExcelParserTest method testExcel95.

/**
 * Excel 5 and Excel 95 are legacy formats that only get basic support.
 * Covers type detection, which parser claims the detected type, and the
 * text and metadata extracted from one sample file of each format.
 */
@Test
public void testExcel95() throws Exception {
    final Detector detector = new DefaultDetector();
    final AutoDetectParser parser = new AutoDetectParser();

    // --- Detection: Excel 5 ---
    Metadata excel5Hints = new Metadata();
    excel5Hints.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls")) {
        MediaType excel5Type = detector.detect(stream, excel5Hints);
        assertEquals("application/vnd.ms-excel", excel5Type.toString());
    }

    // --- Detection: Excel 95 (this result is reused for the parser checks below) ---
    MediaType detectedType;
    Metadata excel95Hints = new Metadata();
    excel95Hints.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls")) {
        detectedType = detector.detect(stream, excel95Hints);
        assertEquals("application/vnd.ms-excel", detectedType.toString());
    }

    // The OLE2-based OfficeParser claims the type ...
    boolean officeSupports = (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(detectedType);
    assertEquals(true, officeSupports);
    // ... whereas the OOXMLParser does not
    boolean ooxmlSupports = (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(detectedType);
    assertEquals(false, ooxmlSupports);

    // --- Parsing: Excel 5 ---
    Metadata excel5Meta = new Metadata();
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls")) {
        ContentHandler sink = new BodyContentHandler(-1);
        ParseContext ctx = new ParseContext();
        ctx.set(Locale.class, Locale.US);
        parser.parse(stream, sink, excel5Meta, ctx);
        String text = sink.toString();

        // Sheet names
        assertContains("Feuil1", text);
        assertContains("Feuil3", text);
        // Cell text
        assertContains("Sample Excel", text);
        assertContains("Number", text);
        // Cell numbers
        assertContains("15", text);
        assertContains("225", text);
        // Document metadata comes through as well
        assertEquals("Simple Excel document", excel5Meta.get(TikaCoreProperties.TITLE));
        assertEquals("Keith Bennett", excel5Meta.get(TikaCoreProperties.CREATOR));
    }

    // --- Parsing: Excel 95 ---
    Metadata excel95Meta = new Metadata();
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls")) {
        ContentHandler sink = new BodyContentHandler(-1);
        ParseContext ctx = new ParseContext();
        ctx.set(Locale.class, Locale.US);
        parser.parse(stream, sink, excel95Meta, ctx);
        String text = sink.toString();

        // Only a sheet name to look for: the file holds no real text or numbers
        assertContains("Foglio1", text);
        // Document metadata comes through as well (this file has no title)
        assertEquals(null, excel95Meta.get(TikaCoreProperties.TITLE));
        assertEquals("Marco Quaranta", excel95Meta.get(Office.LAST_AUTHOR));
    }
}
Also used : DefaultDetector(org.apache.tika.detect.DefaultDetector) OOXMLParser(org.apache.tika.parser.microsoft.ooxml.OOXMLParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Detector(org.apache.tika.detect.Detector) DefaultDetector(org.apache.tika.detect.DefaultDetector) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MediaType(org.apache.tika.mime.MediaType) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 9 with DefaultDetector

use of org.apache.tika.detect.DefaultDetector in project ddf by codice.

the class MimeTypeMapperImpl method guessMimeType.

/**
 * Guesses a mime type by asking each registered {@code MimeTypeResolver}, in
 * sorted order, for the mime type mapped to a file extension.
 * <p>
 * When {@code fileExtension} is empty or null, the stream content is first run
 * through a Tika {@code DefaultDetector} to derive an extension. For the XML
 * extension the root namespace is read once up front, and only resolvers whose
 * schema equals that namespace are consulted.
 *
 * @param is            stream holding the content to inspect
 * @param fileExtension extension to look up; may be empty or null, in which
 *                      case it is derived from the stream content if possible
 * @return the first non-empty mime type returned by a resolver, or
 *         {@code null} when no resolver produced one
 * @throws MimeTypeResolutionException if a resolver throws while being queried
 */
@Override
public String guessMimeType(InputStream is, String fileExtension) throws MimeTypeResolutionException {
    LOGGER.trace("ENTERING: guessMimeType()");
    String mimeType = null;
    LOGGER.debug("Looping through {} MimeTypeResolvers", mimeTypeResolvers.size());
    // This is to force the TikaMimeTypeResolver to be called
    // after the CustomMimeTypeResolvers to prevent Tika default mapping
    // from being used when a CustomMimeTypeResolver may be more appropriate.
    List<MimeTypeResolver> sortedResolvers = sortResolvers(mimeTypeResolvers);
    if (StringUtils.isEmpty(fileExtension)) {
        // No extension supplied: buffer the content and let Tika detect the type,
        // then map the detected type back to an extension (without the dot).
        try (TemporaryFileBackedOutputStream tfbos = new TemporaryFileBackedOutputStream()) {
            IOUtils.copy(is, tfbos);
            try (InputStream inputStream = tfbos.asByteSource().openStream()) {
                Detector detector = new DefaultDetector();
                MediaType mediaType = detector.detect(inputStream, new Metadata());
                fileExtension = getFileExtensionForMimeType(mediaType.toString()).replace(".", "");
            } finally {
                // The caller's stream was consumed by the copy above, so continue with
                // a fresh stream over the buffered content.
                // NOTE(review): this stream is opened from tfbos inside the
                // try-with-resources that closes tfbos, and it is never closed in this
                // method — confirm it remains readable and who owns it.
                is = tfbos.asByteSource().openStream();
            }
        } catch (Exception e) {
            // Best effort only: fall through with whatever extension we have.
            LOGGER.debug("Failed to guess mimeType for file without extension.", e);
        }
    }
    // Constant-first comparison: fileExtension can still be null here when the
    // caller passed null and content detection failed.
    final boolean isXml = XML_FILE_EXTENSION.equals(fileExtension);
    // If file has XML extension, then read root element namespace once so
    // each MimeTypeResolver does not have to open the stream and read the namespace
    String namespace = null;
    if (isXml) {
        try {
            namespace = XMLUtils.getRootNamespace(IOUtils.toString(is));
        } catch (IOException ioe) {
            LOGGER.debug("Could not read namespace from input stream.", ioe);
        }
        LOGGER.debug("namespace = {}", namespace);
    }
    // Once a mime type is found for the given file extension, exit the loop.
    for (MimeTypeResolver resolver : sortedResolvers) {
        LOGGER.debug("Calling MimeTypeResolver {}", resolver.getName());
        try {
            if (isXml) {
                // For XML, a resolver is only used when it declares a schema equal to
                // the document's root namespace; otherwise mimeType stays unset.
                // NOTE(review): the original comment here was truncated; it appears to
                // say that "generic" XML is left for an InputTransformer — confirm.
                if (namespace != null && resolver.hasSchema() && namespace.equals(resolver.getSchema())) {
                    mimeType = resolver.getMimeTypeForFileExtension(fileExtension);
                }
            } else {
                mimeType = resolver.getMimeTypeForFileExtension(fileExtension);
            }
        } catch (Exception e) {
            LOGGER.debug("Error resolving mime type for file extension: {}", fileExtension);
            throw new MimeTypeResolutionException(e);
        }
        if (StringUtils.isNotEmpty(mimeType)) {
            LOGGER.debug("mimeType [{}] retrieved from MimeTypeResolver: {}", mimeType, resolver.getName());
            break;
        }
    }
    LOGGER.debug("mimeType = {},   file extension = [{}]", mimeType, fileExtension);
    LOGGER.trace("EXITING: guessMimeType()");
    return mimeType;
}
Also used : MimeTypeResolver(ddf.mime.MimeTypeResolver) DefaultDetector(org.apache.tika.detect.DefaultDetector) MimeTypeResolutionException(ddf.mime.MimeTypeResolutionException) Detector(org.apache.tika.detect.Detector) DefaultDetector(org.apache.tika.detect.DefaultDetector) TemporaryFileBackedOutputStream(org.codice.ddf.platform.util.TemporaryFileBackedOutputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) MediaType(org.apache.tika.mime.MediaType) IOException(java.io.IOException) IOException(java.io.IOException) MimeTypeResolutionException(ddf.mime.MimeTypeResolutionException)

Aggregations

DefaultDetector (org.apache.tika.detect.DefaultDetector)9 Detector (org.apache.tika.detect.Detector)7 Metadata (org.apache.tika.metadata.Metadata)5 MediaType (org.apache.tika.mime.MediaType)5 Test (org.junit.Test)5 InputStream (java.io.InputStream)4 TikaTest (org.apache.tika.TikaTest)3 TikaInputStream (org.apache.tika.io.TikaInputStream)3 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)3 ParseContext (org.apache.tika.parser.ParseContext)3 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)3 ContentHandler (org.xml.sax.ContentHandler)3 IOException (java.io.IOException)2 CompositeDetector (org.apache.tika.detect.CompositeDetector)2 Parser (org.apache.tika.parser.Parser)2 MimeTypeResolutionException (ddf.mime.MimeTypeResolutionException)1 MimeTypeResolver (ddf.mime.MimeTypeResolver)1 ByteArrayInputStream (java.io.ByteArrayInputStream)1 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1