Search in sources :

Example 1 with TikaConfig

use of org.apache.tika.config.TikaConfig in project che by eclipse.

the class MediaTypeFilter method accept.

/**
 * Tells whether the given file matches this filter, based on the media type detected
 * from its content.
 *
 * <p>The type is detected with a freshly created default {@link TikaConfig}. The file
 * matches when the detected type is contained in {@code excludedMediaTypes}, or when the
 * value of its {@code getType()} is contained in {@code excludedTypes}.
 *
 * @param file the file whose content is inspected
 * @return {@code true} if the detected media type is one of the excluded ones, or if the
 *     content could not be read or analysed; {@code false} otherwise
 */
@Override
public boolean accept(VirtualFile file) {
    try (InputStream stream = file.getContent()) {
        final MediaType detected = new TikaConfig().getDetector().detect(stream, new Metadata());
        return excludedMediaTypes.contains(detected) || excludedTypes.contains(detected.getType());
    } catch (TikaException | ForbiddenException | ServerException | IOException e) {
        // A file that cannot be read or analysed is reported as matching.
        return true;
    }
}
Also used : ForbiddenException(org.eclipse.che.api.core.ForbiddenException) TikaException(org.apache.tika.exception.TikaException) ServerException(org.eclipse.che.api.core.ServerException) TikaConfig(org.apache.tika.config.TikaConfig) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) MediaType(org.apache.tika.mime.MediaType) IOException(java.io.IOException)

Example 2 with TikaConfig

use of org.apache.tika.config.TikaConfig in project lucene-solr by apache.

the class TikaEntityProcessor method firstInit.

/**
 * Reads the entity attributes once and sets up the fields derived from them.
 *
 * <ul>
 *   <li>{@code tikaConfig}: when absent, a {@link TikaConfig} is built from the core's
 *       resource class loader; otherwise it is loaded from the named file, which is resolved
 *       against the core's config dir when it is not absolute.</li>
 *   <li>{@code extractEmbedded}: switched on only when the attribute is exactly
 *       {@code "true"}.</li>
 *   <li>{@code format}: defaults to {@code "text"}; must be one of text, html, xml, none.</li>
 *   <li>{@code htmlMapper}: defaults to {@code "default"}; must be default or identity.</li>
 *   <li>{@code parser}: defaults to {@code AUTO_PARSER}.</li>
 *   <li>{@code spatialMetadataField}: taken as is.</li>
 * </ul>
 *
 * @param context the import context the attributes are resolved from
 * @throws DataImportHandlerException if {@code format} or {@code htmlMapper} has an
 *     unsupported value
 */
@Override
protected void firstInit(Context context) {
    super.firstInit(context);
    try {
        final String configLocation = context.getResolvedEntityAttribute("tikaConfig");
        if (configLocation != null) {
            File location = new File(configLocation);
            if (!location.isAbsolute()) {
                // Relative locations are looked up under the core's config dir.
                location = new File(context.getSolrCore().getResourceLoader().getConfigDir(), configLocation);
            }
            tikaConfig = new TikaConfig(location);
        } else {
            final ClassLoader resourceClassLoader = context.getSolrCore().getResourceLoader().getClassLoader();
            tikaConfig = new TikaConfig(resourceClassLoader);
        }
    } catch (Exception e) {
        wrapAndThrow(SEVERE, e, "Unable to load Tika Config");
    }

    if ("true".equals(context.getResolvedEntityAttribute("extractEmbedded"))) {
        extractEmbedded = true;
    }

    format = context.getResolvedEntityAttribute("format");
    if (format == null) {
        format = "text";
    }
    final boolean supportedFormat = "html".equals(format) || "xml".equals(format)
            || "text".equals(format) || "none".equals(format);
    if (!supportedFormat) {
        throw new DataImportHandlerException(SEVERE, "'format' can be one of text|html|xml|none");
    }

    htmlMapper = context.getResolvedEntityAttribute("htmlMapper");
    if (htmlMapper == null) {
        htmlMapper = "default";
    }
    final boolean supportedMapper = "default".equals(htmlMapper) || "identity".equals(htmlMapper);
    if (!supportedMapper) {
        throw new DataImportHandlerException(SEVERE, "'htmlMapper', if present, must be 'default' or 'identity'");
    }

    parser = context.getResolvedEntityAttribute("parser");
    if (parser == null) {
        parser = AUTO_PARSER;
    }

    spatialMetadataField = context.getResolvedEntityAttribute("spatialMetadataField");
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) File(java.io.File) TransformerConfigurationException(javax.xml.transform.TransformerConfigurationException) SAXException(org.xml.sax.SAXException)

Example 3 with TikaConfig

use of org.apache.tika.config.TikaConfig in project lucene-solr by apache.

the class ExtractingRequestHandler method inform.

/**
 * Finishes the handler set-up once the core is available.
 *
 * <p>When {@code initArgs} is present, it is consulted for an optional Tika config file, an
 * optional parse-context config and an optional list of date formats. Whatever is still
 * missing afterwards falls back to a default: {@code getDefaultConfig(...)} for the Tika
 * config and an empty {@link ParseContextConfig}. Finally the factory is created.
 *
 * @param core the core whose resource loader is used to locate configuration
 * @throws SolrException with {@code SERVER_ERROR} if any of the configurations fails to load
 */
@Override
public void inform(SolrCore core) {
    if (initArgs != null) {
        // A relative location is resolved against the config dir; an absolute one is used as is.
        final String tikaConfigLoc = (String) initArgs.get(CONFIG_LOCATION);
        if (tikaConfigLoc != null) {
            File tikaConfigFile = new File(tikaConfigLoc);
            if (!tikaConfigFile.isAbsolute()) {
                tikaConfigFile = new File(core.getResourceLoader().getConfigDir(), tikaConfigFile.getPath());
            }
            try {
                config = new TikaConfig(tikaConfigFile);
            } catch (Exception e) {
                throw new SolrException(ErrorCode.SERVER_ERROR, e);
            }
        }

        final String parseContextConfigLoc = (String) initArgs.get(PARSE_CONTEXT_CONFIG);
        if (parseContextConfigLoc != null) {
            try {
                parseContextConfig = new ParseContextConfig(core.getResourceLoader(), parseContextConfigLoc);
            } catch (Exception e) {
                throw new SolrException(ErrorCode.SERVER_ERROR, e);
            }
        }

        NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
        if (configDateFormats != null && configDateFormats.size() > 0) {
            dateFormats = new HashSet<>();
            for (Iterator<Map.Entry> entries = configDateFormats.iterator(); entries.hasNext(); ) {
                String dateFormat = (String) entries.next().getValue();
                log.info("Adding Date Format: " + dateFormat);
                dateFormats.add(dateFormat);
            }
        }
    }

    // Nothing configured explicitly: fall back to the defaults.
    if (config == null) {
        try {
            config = getDefaultConfig(core.getResourceLoader().getClassLoader());
        } catch (MimeTypeException | IOException e) {
            throw new SolrException(ErrorCode.SERVER_ERROR, e);
        }
    }
    if (parseContextConfig == null) {
        parseContextConfig = new ParseContextConfig();
    }

    factory = createFactory();
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) NamedList(org.apache.solr.common.util.NamedList) IOException(java.io.IOException) SolrException(org.apache.solr.common.SolrException) MimeTypeException(org.apache.tika.mime.MimeTypeException) File(java.io.File)

Example 4 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class TikaConfigTest method testUnknownParser.

/**
 * Make sure that with a service loader given, we can
 * get different configurable behaviour on parser classes
 * which can't be found.
 */
@Test
public void testUnknownParser() throws Exception {
    final ClassLoader classLoader = getClass().getClassLoader();
    final ServiceLoader ignoringLoader = new ServiceLoader(classLoader, LoadErrorHandler.IGNORE);
    final ServiceLoader warningLoader = new ServiceLoader(classLoader, LoadErrorHandler.WARN);
    final ServiceLoader throwingLoader = new ServiceLoader(classLoader, LoadErrorHandler.THROW);
    final Path unknownParserConfig = Paths.get(new URI(getConfigPath("TIKA-1700-unknown-parser.xml")));

    // IGNORE: the config still loads, and one component parser is expected
    TikaConfig ignoringConfig = new TikaConfig(unknownParserConfig, ignoringLoader);
    assertNotNull(ignoringConfig);
    assertNotNull(ignoringConfig.getParser());
    CompositeParser ignoringParser = (CompositeParser) ignoringConfig.getParser();
    assertEquals(1, ignoringParser.getAllComponentParsers().size());

    // WARN: same expectations as for IGNORE
    TikaConfig warningConfig = new TikaConfig(unknownParserConfig, warningLoader);
    assertNotNull(warningConfig);
    assertNotNull(warningConfig.getParser());
    CompositeParser warningParser = (CompositeParser) warningConfig.getParser();
    assertEquals(1, warningParser.getAllComponentParsers().size());

    // THROW: building the config itself has to fail
    try {
        new TikaConfig(unknownParserConfig, throwingLoader);
        fail("Shouldn't get here, invalid parser class");
    } catch (TikaException expected) {
        // expected: the unknown parser class is reported as an error
    }
}
Also used : Path(java.nio.file.Path) TikaException(org.apache.tika.exception.TikaException) TikaConfig(org.apache.tika.config.TikaConfig) CompositeParser(org.apache.tika.parser.CompositeParser) URI(java.net.URI) Test(org.junit.Test) TikaConfigTest(org.apache.tika.config.TikaConfigTest)

Example 5 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class TikaConfigTest method defaultParserWithExcludes.

/**
 * TIKA-1445 It should be possible to exclude DefaultParser from
 * certain types, so another parser explicitly listed will take them
 */
@Test
public void defaultParserWithExcludes() throws Exception {
    try {
        CompositeParser composite = (CompositeParser) getConfig("TIKA-1445-default-except.xml").getParser();
        List<Parser> components = composite.getAllComponentParsers();

        // Will be the three parsers defined in the xml
        assertEquals(3, components.size());

        // First: a wrapped DefaultParser rather than the main DefaultParser,
        //  as it is excluded from handling certain classes
        Parser wrappedDefault = components.get(0);
        assertTrue(wrappedDefault.toString(), wrappedDefault instanceof ParserDecorator);
        assertEquals(DefaultParser.class, ((ParserDecorator) wrappedDefault).getWrappedParser().getClass());

        // Then two others which claim things they wouldn't otherwise handle
        Parser wrappedEmpty = components.get(1);
        assertTrue(wrappedEmpty.toString(), wrappedEmpty instanceof ParserDecorator);
        assertEquals(EmptyParser.class, ((ParserDecorator) wrappedEmpty).getWrappedParser().getClass());
        assertEquals("hello/world", wrappedEmpty.getSupportedTypes(null).iterator().next().toString());

        Parser wrappedError = components.get(2);
        assertTrue(wrappedError.toString(), wrappedError instanceof ParserDecorator);
        assertEquals(ErrorParser.class, ((ParserDecorator) wrappedError).getWrappedParser().getClass());
        assertEquals("fail/world", wrappedError.getSupportedTypes(null).iterator().next().toString());
    } catch (TikaException e) {
        fail("Unexpected TikaException: " + e);
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) TikaConfig(org.apache.tika.config.TikaConfig) CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultParser(org.apache.tika.parser.DefaultParser) EmptyParser(org.apache.tika.parser.EmptyParser) ErrorParser(org.apache.tika.parser.ErrorParser) Test(org.junit.Test) TikaConfigTest(org.apache.tika.config.TikaConfigTest)

Aggregations

TikaConfig (org.apache.tika.config.TikaConfig)62 Test (org.junit.Test)32 Metadata (org.apache.tika.metadata.Metadata)26 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)20 TikaTest (org.apache.tika.TikaTest)16 InputStream (java.io.InputStream)12 Tika (org.apache.tika.Tika)12 IOException (java.io.IOException)10 URL (java.net.URL)10 TikaException (org.apache.tika.exception.TikaException)9 TikaInputStream (org.apache.tika.io.TikaInputStream)9 ParseContext (org.apache.tika.parser.ParseContext)9 Parser (org.apache.tika.parser.Parser)9 MediaType (org.apache.tika.mime.MediaType)8 CompositeParser (org.apache.tika.parser.CompositeParser)8 ByteArrayInputStream (java.io.ByteArrayInputStream)7 File (java.io.File)6 TikaConfigTest (org.apache.tika.config.TikaConfigTest)6 HashSet (java.util.HashSet)5 SAXException (org.xml.sax.SAXException)5