Search in sources:

Example 1 with Detector

use of org.apache.tika.detect.Detector in project tika by apache.

In class ForkParserIntegrationTest, method testParserHandlingOfNonSerializable:

/**
 * Supplies a non-serializable object on the {@link ParseContext} and checks
 * that {@link ForkParser} reports it with a helpful exception: a
 * {@link TikaException} whose cause is a {@link NotSerializableException}
 * and whose message names the ParseContext as the culprit.
 *
 * @throws Exception if creating, running or closing the parser fails unexpectedly
 */
@Test
public void testParserHandlingOfNonSerializable() throws Exception {
    ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser());
    ParseContext context = new ParseContext();
    // NOTE(review): keep this as an anonymous inner class. Presumably it is the
    // implicit reference to the enclosing test instance that makes this object
    // non-serializable; a lambda may not reproduce that -- confirm before changing.
    context.set(Detector.class, new Detector() {

        @Override
        public MediaType detect(InputStream input, Metadata metadata) {
            return MediaType.OCTET_STREAM;
        }
    });
    // try-with-resources so the test document stream is always released,
    // whether the parse fails as expected or the test itself fails.
    try (InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream("/test-documents/testTXT.txt")) {
        ContentHandler output = new BodyContentHandler();
        parser.parse(stream, output, new Metadata(), context);
        fail("Should have blown up with a non serializable ParseContext");
    } catch (TikaException e) {
        // Check the right details
        assertNotNull(e.getCause());
        assertEquals(NotSerializableException.class, e.getCause().getClass());
        assertEquals("Unable to serialize ParseContext to pass to the Forked Parser", e.getMessage());
    } finally {
        parser.close();
    }
}
Also used : ForkParser(org.apache.tika.fork.ForkParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) NotSerializableException(java.io.NotSerializableException) Detector(org.apache.tika.detect.Detector) TikaException(org.apache.tika.exception.TikaException) InputStream(java.io.InputStream) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) MediaType(org.apache.tika.mime.MediaType) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test)

Example 2 with Detector

use of org.apache.tika.detect.Detector in project tika by apache.

In class TikaResource, method fillMetadata:

/**
 * Copies request-level hints from the HTTP headers into the parse inputs.
 *
 * <ul>
 *   <li>If {@code detectFilename} yields a name, it is stored on the metadata
 *       as the resource name.</li>
 *   <li>If a Content-Type header is present and is neither an {@code xml}
 *       subtype nor {@code application/octet-stream}, it is added to the
 *       metadata and the parser's detector is wrapped so that the declared
 *       type is used whenever it parses to a valid media type.</li>
 *   <li>If a {@code Password} header is present, a {@link PasswordProvider}
 *       returning that value is registered on the context.</li>
 * </ul>
 *
 * @param parser      parser whose detector may be replaced by a wrapping detector
 * @param metadata    metadata to populate
 * @param context     parse context that may receive a password provider
 * @param httpHeaders request headers to read the hints from
 */
@SuppressWarnings("serial")
public static void fillMetadata(Parser parser, Metadata metadata, ParseContext context, MultivaluedMap<String, String> httpHeaders) {
    final String resourceName = detectFilename(httpHeaders);
    if (resourceName != null) {
        metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
    }

    final String rawContentType = httpHeaders.getFirst(HttpHeaders.CONTENT_TYPE);
    javax.ws.rs.core.MediaType declaredType = null;
    if (rawContentType != null) {
        declaredType = javax.ws.rs.core.MediaType.valueOf(rawContentType);
    }
    // Generic declarations (any "xml" subtype, or octet-stream) are ignored
    // so that the parser's own detector decides instead.
    final boolean useDeclaredType = declaredType != null
            && !"xml".equals(declaredType.getSubtype())
            && !declaredType.equals(javax.ws.rs.core.MediaType.APPLICATION_OCTET_STREAM_TYPE);
    if (useDeclaredType) {
        metadata.add(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE, declaredType.toString());
        final Detector fallback = getDetector(parser);
        setDetector(parser, new Detector() {

            @Override
            public MediaType detect(InputStream inputStream, Metadata md) throws IOException {
                final String declared = md.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
                // MediaType.parse can return null if the value is not a valid mime type
                final MediaType parsed = (declared == null) ? null : MediaType.parse(declared);
                // make sure never to return null -- TIKA-1845
                return (parsed != null) ? parsed : fallback.detect(inputStream, md);
            }
        });
    }

    final String password = httpHeaders.getFirst("Password");
    if (password == null) {
        return;
    }
    context.set(PasswordProvider.class, new PasswordProvider() {

        @Override
        public String getPassword(Metadata metadata) {
            return password;
        }
    });
}
Also used : Detector(org.apache.tika.detect.Detector) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) MediaType(org.apache.tika.mime.MediaType) IOException(java.io.IOException) PasswordProvider(org.apache.tika.parser.PasswordProvider)

Example 3 with Detector

use of org.apache.tika.detect.Detector in project tika by apache.

In class TikaCLI, method displayDetector:

/**
 * Prints the class name of a detector to standard output, prefixed by
 * {@code indent(i)}, and recurses into the children of a
 * {@link CompositeDetector} with the indent argument increased by two.
 *
 * @param d the detector to print
 * @param i the value passed to {@code indent} for this level
 */
private void displayDetector(Detector d, int i) {
    final String className = d.getClass().getName();
    final String label = indent(i) + className;
    if (!(d instanceof CompositeDetector)) {
        // Plain detector: a single line, nothing nested below it.
        System.out.println(label);
        return;
    }
    System.out.println(label + " (Composite Detector):");
    for (Detector child : ((CompositeDetector) d).getDetectors()) {
        displayDetector(child, i + 2);
    }
}
Also used : CompositeDetector(org.apache.tika.detect.CompositeDetector) CompositeDetector(org.apache.tika.detect.CompositeDetector) Detector(org.apache.tika.detect.Detector) DefaultDetector(org.apache.tika.detect.DefaultDetector)

Example 4 with Detector

use of org.apache.tika.detect.Detector in project tika by apache.

In class OOXMLParserTest, method testExcelXLSB:

/**
 * Checks handling of an Excel binary workbook (.xlsb): the default detector
 * must report the expected media type, OfficeParser must not list that type
 * as supported, OOXMLParser must list it, and AutoDetectParser must parse
 * the document and produce the expected text.
 */
@Test
public void testExcelXLSB() throws Exception {
    final String resourcePath = "/test-documents/testEXCEL.xlsb";
    Detector typeDetector = new DefaultDetector();
    AutoDetectParser autoParser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    metadata.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");

    // Detection must yield the xlsb media type
    MediaType detected;
    try (InputStream in = ExcelParserTest.class.getResourceAsStream(resourcePath)) {
        detected = typeDetector.detect(in, metadata);
        assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", detected.toString());
    }

    // OfficeParser does not declare support for the type...
    boolean officeSupports = new OfficeParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(false, officeSupports);
    // ...whereas OOXMLParser does
    boolean ooxmlSupports = new OOXMLParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertTrue(ooxmlSupports);

    // AutoDetectParser must get through the file and extract its text
    try (InputStream in = ExcelParserTest.class.getResourceAsStream(resourcePath)) {
        ContentHandler bodyHandler = new BodyContentHandler(-1);
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);
        autoParser.parse(in, bodyHandler, metadata, parseContext);
        assertContains("This is an example spreadsheet", bodyHandler.toString());
    }
}
Also used : DefaultDetector(org.apache.tika.detect.DefaultDetector) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Detector(org.apache.tika.detect.Detector) DefaultDetector(org.apache.tika.detect.DefaultDetector) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MediaType(org.apache.tika.mime.MediaType) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 5 with Detector

use of org.apache.tika.detect.Detector in project tika by apache.

In class BundleIT, method testBundleDetectors:

/**
 * Compares the detector class names exposed through the OSGi service with
 * those found by the traditional service loading mechanism.
 *
 * The OSGi-created detector holds a flat list of detectors, while the
 * service-loaded DefaultDetector has children (DefaultDetector, MimeTypes),
 * so one level of nested DefaultDetector is flattened before comparing.
 * Detection behavior should be the same either way.
 */
@Test
public void testBundleDetectors() throws Exception {
    // Collect the detector class names registered within OSGi
    ServiceReference<Detector> reference = bc.getServiceReference(Detector.class);
    DefaultDetector osgiDetector = (DefaultDetector) bc.getService(reference);
    Set<String> osgiNames = new HashSet<>();
    for (Detector member : osgiDetector.getDetectors()) {
        osgiNames.add(member.getClass().getName());
    }
    // Sanity check: make sure we actually found a handful
    assertTrue("Should have several Detector names, found " + osgiNames.size(), osgiNames.size() > 3);

    // Collect the names from a service-loaded DefaultDetector, expanding
    // any nested DefaultDetector into its children
    Set<String> serviceLoadedNames = new HashSet<>();
    DefaultDetector serviceLoaded = new DefaultDetector();
    for (Detector member : serviceLoaded.getDetectors()) {
        if (!(member instanceof DefaultDetector)) {
            serviceLoadedNames.add(member.getClass().getName());
            continue;
        }
        for (Detector nested : ((DefaultDetector) member).getDetectors()) {
            serviceLoadedNames.add(nested.getClass().getName());
        }
    }

    assertEquals(osgiNames, serviceLoadedNames);
}
Also used : DefaultDetector(org.apache.tika.detect.DefaultDetector) Detector(org.apache.tika.detect.Detector) DefaultDetector(org.apache.tika.detect.DefaultDetector) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

Detector (org.apache.tika.detect.Detector)20 Metadata (org.apache.tika.metadata.Metadata)11 MediaType (org.apache.tika.mime.MediaType)11 DefaultDetector (org.apache.tika.detect.DefaultDetector)10 InputStream (java.io.InputStream)9 CompositeDetector (org.apache.tika.detect.CompositeDetector)7 IOException (java.io.IOException)5 TikaInputStream (org.apache.tika.io.TikaInputStream)5 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)5 ParseContext (org.apache.tika.parser.ParseContext)5 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)5 Test (org.junit.Test)5 ContentHandler (org.xml.sax.ContentHandler)5 TikaTest (org.apache.tika.TikaTest)3 MimeTypeResolutionException (ddf.mime.MimeTypeResolutionException)2 ArrayList (java.util.ArrayList)2 TikaException (org.apache.tika.exception.TikaException)2 MimeTypes (org.apache.tika.mime.MimeTypes)2 Parser (org.apache.tika.parser.Parser)2 MimeTypeResolver (ddf.mime.MimeTypeResolver)1