Search in sources:

Example 21 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class DumpTikaConfigExampleTest method testDump.

/**
 * Serializes the default Tika configuration to {@code configFile} once per
 * charset/serializer-mode combination, reloads it, and checks that the reloaded
 * configuration exposes a composite parser and a composite detector with a
 * plausible number of components.
 *
 * @throws Exception if writing, re-reading or validating the configuration fails
 */
@Test
public void testDump() throws Exception {
    // NOTE(review): ex is never used below — confirm whether the construction is needed at all.
    DumpTikaConfigExample ex = new DumpTikaConfigExample();
    for (Charset charset : new Charset[] { UTF_8, UTF_16LE }) {
        for (TikaConfigSerializer.Mode mode : TikaConfigSerializer.Mode.values()) {
            // try-with-resources guarantees the file handle is released even if
            // serialization throws; close() also flushes the writer.
            try (Writer writer = new OutputStreamWriter(new FileOutputStream(configFile), charset)) {
                TikaConfigSerializer.serialize(TikaConfig.getDefaultConfig(), mode, writer, charset);
            }
            TikaConfig c = new TikaConfig(configFile);
            assertTrue(c.getParser().toString(), c.getParser() instanceof CompositeParser);
            assertTrue(c.getDetector().toString(), c.getDetector() instanceof CompositeDetector);
            CompositeParser p = (CompositeParser) c.getParser();
            assertTrue("enough parsers?", p.getParsers().size() > 130);
            CompositeDetector d = (CompositeDetector) c.getDetector();
            assertTrue("enough detectors?", d.getDetectors().size() > 3);
            //just try to load it into autodetect to make sure no errors are thrown
            Parser auto = new AutoDetectParser(c);
            assertNotNull(auto);
        }
    }
}
Also used : CompositeDetector(org.apache.tika.detect.CompositeDetector) TikaConfig(org.apache.tika.config.TikaConfig) CompositeParser(org.apache.tika.parser.CompositeParser) FileOutputStream(java.io.FileOutputStream) Charset(java.nio.charset.Charset) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) OutputStreamWriter(java.io.OutputStreamWriter) TikaConfigSerializer(org.apache.tika.config.TikaConfigSerializer) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 22 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class TikaServerCli method main.

/**
 * Command-line entry point: parses the options, builds the Tika configuration,
 * registers the JAX-RS resources and providers, and starts the server.
 *
 * <p>The process exits with status {@code -1} when {@code help} is requested,
 * when {@code enableFileUrl} is given without {@code enableUnsecureFeatures},
 * or when any exception is raised during start-up (the exception is logged first).
 *
 * @param args raw command-line arguments
 */
public static void main(String[] args) {
    LOG.info("Starting {} server", new Tika());
    try {
        Options options = getOptions();
        CommandLineParser cliParser = new GnuParser();
        CommandLine line = cliParser.parse(options, args);
        if (line.hasOption("help")) {
            HelpFormatter helpFormatter = new HelpFormatter();
            helpFormatter.printHelp("tikaserver", options);
            System.exit(-1);
        }

        // Network binding: "*" is shorthand for all interfaces.
        String host = DEFAULT_HOST;
        if (line.hasOption("host")) {
            host = line.getOptionValue("host");
            if ("*".equals(host)) {
                host = "0.0.0.0";
            }
        }
        int port = DEFAULT_PORT;
        if (line.hasOption("port")) {
            // parseInt avoids the needless Integer boxing/unboxing of valueOf.
            port = Integer.parseInt(line.getOptionValue("port"));
        }
        boolean returnStackTrace = line.hasOption("includeStack");

        // Optional request logging filter; unknown levels are reported and ignored.
        TikaLoggingFilter logFilter = null;
        if (line.hasOption("log")) {
            String logLevel = line.getOptionValue("log");
            if (LOG_LEVELS.contains(logLevel)) {
                boolean isInfoLevel = "info".equals(logLevel);
                logFilter = new TikaLoggingFilter(isInfoLevel);
            } else {
                LOG.info("Unsupported request URI log level: {}", logLevel);
            }
        }

        // Optional CORS filter.
        CrossOriginResourceSharingFilter corsFilter = null;
        if (line.hasOption("cors")) {
            corsFilter = new CrossOriginResourceSharingFilter();
            String url = line.getOptionValue("cors");
            List<String> origins = new ArrayList<>();
            // Empty list allows all origins.
            if (!url.equals("*")) {
                origins.add(url);
            }
            corsFilter.setAllowOrigins(origins);
        }

        // The Tika Configuration to use throughout
        TikaConfig tika;
        if (line.hasOption("config")) {
            String configFilePath = line.getOptionValue("config");
            LOG.info("Using custom config: {}", configFilePath);
            tika = new TikaConfig(configFilePath);
        } else {
            tika = TikaConfig.getDefaultConfig();
        }

        // Optional digester; "dml" overrides the default digest mark limit.
        DigestingParser.Digester digester = null;
        if (line.hasOption("digest")) {
            int digestMarkLimit = DEFAULT_DIGEST_MARK_LIMIT;
            if (line.hasOption("dml")) {
                String dmlS = line.getOptionValue("dml");
                try {
                    digestMarkLimit = Integer.parseInt(dmlS);
                } catch (NumberFormatException e) {
                    // Keep the original failure as the cause so the log shows the full chain.
                    throw new RuntimeException("Must have parseable int after digestMarkLimit(dml): " + dmlS, e);
                }
            }
            digester = new CommonsDigester(digestMarkLimit, CommonsDigester.parse(line.getOptionValue("digest")));
        }

        // File-URL fetching must be explicitly acknowledged as unsafe.
        if (line.hasOption("enableFileUrl") && !line.hasOption("enableUnsecureFeatures")) {
            System.err.println("If you want to enable fileUrl, you must also acknowledge the security risks\n" + "by including --enableUnsecureFeatures.  See CVE-2015-3271.");
            System.exit(-1);
        }
        InputStreamFactory inputStreamFactory;
        if (line.hasOption("enableFileUrl") && line.hasOption("enableUnsecureFeatures")) {
            inputStreamFactory = new URLEnabledInputStreamFactory();
            System.out.println(FILE_URL_WARNING);
        } else {
            inputStreamFactory = new DefaultInputStreamFactory();
        }
        TikaResource.init(tika, digester, inputStreamFactory);

        // Resource endpoints; the welcome resource is built from the core list,
        // so it is added only to the complete list.
        JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
        List<ResourceProvider> rCoreProviders = new ArrayList<>();
        rCoreProviders.add(new SingletonResourceProvider(new MetadataResource()));
        rCoreProviders.add(new SingletonResourceProvider(new RecursiveMetadataResource()));
        rCoreProviders.add(new SingletonResourceProvider(new DetectorResource()));
        rCoreProviders.add(new SingletonResourceProvider(new LanguageResource()));
        rCoreProviders.add(new SingletonResourceProvider(new TranslateResource()));
        rCoreProviders.add(new SingletonResourceProvider(new TikaResource()));
        rCoreProviders.add(new SingletonResourceProvider(new UnpackerResource()));
        rCoreProviders.add(new SingletonResourceProvider(new TikaMimeTypes()));
        rCoreProviders.add(new SingletonResourceProvider(new TikaDetectors()));
        rCoreProviders.add(new SingletonResourceProvider(new TikaParsers()));
        rCoreProviders.add(new SingletonResourceProvider(new TikaVersion()));
        List<ResourceProvider> rAllProviders = new ArrayList<>(rCoreProviders);
        rAllProviders.add(new SingletonResourceProvider(new TikaWelcome(rCoreProviders)));
        sf.setResourceProviders(rAllProviders);

        // Message body writers, the exception mapper and the optional filters.
        List<Object> providers = new ArrayList<>();
        providers.add(new TarWriter());
        providers.add(new ZipWriter());
        providers.add(new CSVMessageBodyWriter());
        providers.add(new MetadataListMessageBodyWriter());
        providers.add(new JSONMessageBodyWriter());
        providers.add(new XMPMessageBodyWriter());
        providers.add(new TextMessageBodyWriter());
        providers.add(new TikaServerParseExceptionMapper(returnStackTrace));
        if (logFilter != null) {
            providers.add(logFilter);
        }
        if (corsFilter != null) {
            providers.add(corsFilter);
        }
        sf.setProviders(providers);

        String url = "http://" + host + ":" + port + "/";
        sf.setAddress(url);
        BindingFactoryManager manager = sf.getBus().getExtension(BindingFactoryManager.class);
        JAXRSBindingFactory factory = new JAXRSBindingFactory();
        factory.setBus(sf.getBus());
        manager.registerBindingFactory(JAXRSBindingFactory.JAXRS_BINDING_ID, factory);
        sf.create();
        LOG.info("Started Apache Tika server at {}", url);
    } catch (Exception ex) {
        LOG.error("Can't start", ex);
        System.exit(-1);
    }
}
Also used : TikaParsers(org.apache.tika.server.resource.TikaParsers) JAXRSBindingFactory(org.apache.cxf.jaxrs.JAXRSBindingFactory) GnuParser(org.apache.commons.cli.GnuParser) ArrayList(java.util.ArrayList) UnpackerResource(org.apache.tika.server.resource.UnpackerResource) BindingFactoryManager(org.apache.cxf.binding.BindingFactoryManager) Tika(org.apache.tika.Tika) HelpFormatter(org.apache.commons.cli.HelpFormatter) MetadataListMessageBodyWriter(org.apache.tika.server.writer.MetadataListMessageBodyWriter) SingletonResourceProvider(org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider) ResourceProvider(org.apache.cxf.jaxrs.lifecycle.ResourceProvider) CommonsDigester(org.apache.tika.parser.utils.CommonsDigester) XMPMessageBodyWriter(org.apache.tika.server.writer.XMPMessageBodyWriter) TikaDetectors(org.apache.tika.server.resource.TikaDetectors) MetadataResource(org.apache.tika.server.resource.MetadataResource) RecursiveMetadataResource(org.apache.tika.server.resource.RecursiveMetadataResource) JSONMessageBodyWriter(org.apache.tika.server.writer.JSONMessageBodyWriter) DigestingParser(org.apache.tika.parser.DigestingParser) CrossOriginResourceSharingFilter(org.apache.cxf.rs.security.cors.CrossOriginResourceSharingFilter) DetectorResource(org.apache.tika.server.resource.DetectorResource) TranslateResource(org.apache.tika.server.resource.TranslateResource) RecursiveMetadataResource(org.apache.tika.server.resource.RecursiveMetadataResource) Options(org.apache.commons.cli.Options) ZipWriter(org.apache.tika.server.writer.ZipWriter) LanguageResource(org.apache.tika.server.resource.LanguageResource) SingletonResourceProvider(org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider) CSVMessageBodyWriter(org.apache.tika.server.writer.CSVMessageBodyWriter) TextMessageBodyWriter(org.apache.tika.server.writer.TextMessageBodyWriter) TikaMimeTypes(org.apache.tika.server.resource.TikaMimeTypes) CommandLineParser(org.apache.commons.cli.CommandLineParser) 
TarWriter(org.apache.tika.server.writer.TarWriter) TikaConfig(org.apache.tika.config.TikaConfig) TikaResource(org.apache.tika.server.resource.TikaResource) JAXRSServerFactoryBean(org.apache.cxf.jaxrs.JAXRSServerFactoryBean) TikaVersion(org.apache.tika.server.resource.TikaVersion) CommandLine(org.apache.commons.cli.CommandLine) TikaWelcome(org.apache.tika.server.resource.TikaWelcome)

Example 23 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class RTFParserTest method testConfig.

/**
 * Loads the RTF-specific {@code tika-config.xml} from the classpath, parses
 * {@code testBinControlWord.rtf} with it, and expects exactly one metadata entry
 * whose embedded-stream exception mentions {@code TikaMemoryLimitException}.
 *
 * @throws Exception if the configuration cannot be loaded or parsing fails unexpectedly
 */
@Test
public void testConfig() throws Exception {
    //test that memory allocation of the bin element is limited
    //via the config file.  Unfortunately, this test file's bin embedding contains 10 bytes
    //so we had to set the config to 0.
    TikaConfig tikaConfig;
    // try-with-resources releases the classpath stream even if loading the config throws;
    // a null resource is tolerated by the construct, so the assertion still reports a missing file.
    try (InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/rtf/tika-config.xml")) {
        assertNotNull(is);
        tikaConfig = new TikaConfig(is);
    }
    Parser p = new AutoDetectParser(tikaConfig);
    List<Metadata> metadataList = getRecursiveMetadata("testBinControlWord.rtf", p);
    assertEquals(1, metadataList.size());
    assertContains("TikaMemoryLimitException", metadataList.get(0).get(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM));
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) RTFMetadata(org.apache.tika.metadata.RTFMetadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 24 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class ConcurrentUtilsTest method testExecuteExecutor.

/**
 * Puts the default configuration's executor service into a {@link ParseContext},
 * submits a no-op task through {@code ConcurrentUtils.execute}, and checks that
 * the returned future completes with {@code null}.
 *
 * @throws Exception if waiting on the future fails
 */
@Test
public void testExecuteExecutor() throws Exception {
    TikaConfig config = TikaConfig.getDefaultConfig();
    ParseContext context = new ParseContext();
    context.set(ExecutorService.class, config.getExecutorService());
    // Wildcard instead of the raw type: the result type is irrelevant here,
    // only completion and the null value are checked.
    Future<?> result = ConcurrentUtils.execute(context, new Runnable() {

        @Override
        public void run() {
            //Do nothing
        }
    });
    assertNull(result.get());
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) ParseContext(org.apache.tika.parser.ParseContext) Future(java.util.concurrent.Future) Test(org.junit.Test)

Example 25 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class DL4JInceptionV3NetTest method recognise.

/**
 * Builds a Tika facade from {@code dl4j-inception3-config.xml}, parses
 * {@code cat.jpg}, and asserts that at least one {@code OBJECT} metadata value
 * contains {@code _cat}. Returns without asserting anything when the
 * configuration fails with a message containing "Connection refused".
 *
 * @throws Exception on any other configuration or parse failure
 */
@Test
@Ignore("until we can make this more robust across platforms")
public void recognise() throws Exception {
    TikaConfig inceptionConfig;
    try {
        inceptionConfig = new TikaConfig(getClass().getResourceAsStream("dl4j-inception3-config.xml"));
    } catch (TikaConfigException configFailure) {
        String reason = configFailure.getMessage();
        boolean connectionRefused = reason != null && reason.contains("Connection refused");
        if (!connectionRefused) {
            throw configFailure;
        }
        // NOTE(review): presumably a required remote resource was unreachable — confirm.
        return;
    }

    Tika facade = new Tika(inceptionConfig);
    Metadata metadata = new Metadata();
    facade.parse(getClass().getResourceAsStream("cat.jpg"), metadata);

    // Scan every recognised label; any one containing "_cat" satisfies the test.
    String[] labels = metadata.getValues("OBJECT");
    boolean sawCat = false;
    for (int i = 0; i < labels.length; i++) {
        sawCat |= labels[i].contains("_cat");
    }
    assertTrue(sawCat);
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) TikaConfigException(org.apache.tika.exception.TikaConfigException) Metadata(org.apache.tika.metadata.Metadata) Tika(org.apache.tika.Tika) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

TikaConfig (org.apache.tika.config.TikaConfig): 62; Test (org.junit.Test): 32; Metadata (org.apache.tika.metadata.Metadata): 26; AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 20; TikaTest (org.apache.tika.TikaTest): 16; InputStream (java.io.InputStream): 12; Tika (org.apache.tika.Tika): 12; IOException (java.io.IOException): 10; URL (java.net.URL): 10; TikaException (org.apache.tika.exception.TikaException): 9; TikaInputStream (org.apache.tika.io.TikaInputStream): 9; ParseContext (org.apache.tika.parser.ParseContext): 9; Parser (org.apache.tika.parser.Parser): 9; MediaType (org.apache.tika.mime.MediaType): 8; CompositeParser (org.apache.tika.parser.CompositeParser): 8; ByteArrayInputStream (java.io.ByteArrayInputStream): 7; File (java.io.File): 6; TikaConfigTest (org.apache.tika.config.TikaConfigTest): 6; HashSet (java.util.HashSet): 5; SAXException (org.xml.sax.SAXException): 5