Search in sources:

Example 11 with Tika

use of org.apache.tika.Tika in project tika by apache.

the class SimpleTypeDetector method main.

/**
 * Runs Tika type detection on every file path passed on the command line
 * and prints one {@code <path>: <type>} line per file to standard output.
 *
 * @param args paths of the files to inspect
 * @throws Exception if detection fails for any of the files
 */
public static void main(String[] args) throws Exception {
    final Tika detector = new Tika();
    for (int i = 0; i < args.length; i++) {
        final String path = args[i];
        final String detectedType = detector.detect(new File(path));
        System.out.println(path + ": " + detectedType);
    }
}
Also used : Tika(org.apache.tika.Tika) File(java.io.File)

Example 12 with Tika

use of org.apache.tika.Tika in project tika by apache.

the class TIAParsingExample method parseToStringExample.

/**
 * Extracts the text of {@code example.doc} with the Tika facade, echoes it
 * to standard output (without a trailing newline) and returns it.
 *
 * @return the text produced by {@code Tika.parseToString} for the document
 * @throws Exception if the document cannot be read or parsed
 */
public static String parseToStringExample() throws Exception {
    final File source = new File("example.doc");
    final Tika facade = new Tika();
    final String extracted = facade.parseToString(source);
    System.out.print(extracted);
    return extracted;
}
Also used : Tika(org.apache.tika.Tika) File(java.io.File)

Example 13 with Tika

use of org.apache.tika.Tika in project tika by apache.

the class TikaServerCli method main.

/**
 * Command-line entry point: parses the options, configures the Tika JAX-RS
 * resources and providers, and starts the server on the requested host/port.
 *
 * <p>The process exits with status {@code -1} when {@code help} is requested,
 * when {@code enableFileUrl} is given without {@code enableUnsecureFeatures},
 * or when any exception is raised during start-up (the exception is logged first).
 *
 * @param args raw command-line arguments
 */
public static void main(String[] args) {
    LOG.info("Starting {} server", new Tika());
    try {
        Options options = getOptions();
        CommandLineParser cliParser = new GnuParser();
        CommandLine line = cliParser.parse(options, args);
        if (line.hasOption("help")) {
            HelpFormatter helpFormatter = new HelpFormatter();
            helpFormatter.printHelp("tikaserver", options);
            System.exit(-1);
        }

        // Host: "*" is shorthand for binding to all interfaces.
        String host = DEFAULT_HOST;
        if (line.hasOption("host")) {
            host = line.getOptionValue("host");
            if ("*".equals(host)) {
                host = "0.0.0.0";
            }
        }

        // Port: parseInt avoids the needless Integer boxing of valueOf; a
        // malformed value still raises NumberFormatException, which the
        // outer catch reports before exiting.
        int port = DEFAULT_PORT;
        if (line.hasOption("port")) {
            port = Integer.parseInt(line.getOptionValue("port"));
        }

        // Passed to the exception mapper registered below.
        boolean returnStackTrace = line.hasOption("includeStack");

        // Optional request logging filter; unsupported levels are reported and ignored.
        TikaLoggingFilter logFilter = null;
        if (line.hasOption("log")) {
            String logLevel = line.getOptionValue("log");
            if (LOG_LEVELS.contains(logLevel)) {
                boolean isInfoLevel = "info".equals(logLevel);
                logFilter = new TikaLoggingFilter(isInfoLevel);
            } else {
                LOG.info("Unsupported request URI log level: {}", logLevel);
            }
        }

        // Optional CORS filter.
        CrossOriginResourceSharingFilter corsFilter = null;
        if (line.hasOption("cors")) {
            corsFilter = new CrossOriginResourceSharingFilter();
            String url = line.getOptionValue("cors");
            List<String> origins = new ArrayList<String>();
            // Empty list allows all origins.
            if (!url.equals("*")) {
                origins.add(url);
            }
            corsFilter.setAllowOrigins(origins);
        }

        // The Tika Configuration to use throughout
        TikaConfig tika;
        if (line.hasOption("config")) {
            String configFilePath = line.getOptionValue("config");
            LOG.info("Using custom config: {}", configFilePath);
            tika = new TikaConfig(configFilePath);
        } else {
            tika = TikaConfig.getDefaultConfig();
        }

        // Optional digester; "dml" overrides the default digest mark limit.
        DigestingParser.Digester digester = null;
        if (line.hasOption("digest")) {
            int digestMarkLimit = DEFAULT_DIGEST_MARK_LIMIT;
            if (line.hasOption("dml")) {
                String dmlS = line.getOptionValue("dml");
                try {
                    digestMarkLimit = Integer.parseInt(dmlS);
                } catch (NumberFormatException e) {
                    // Keep the original exception as the cause so the logged
                    // stack trace shows where parsing failed.
                    throw new RuntimeException("Must have parseable int after digestMarkLimit(dml): " + dmlS, e);
                }
            }
            digester = new CommonsDigester(digestMarkLimit, CommonsDigester.parse(line.getOptionValue("digest")));
        }

        // File URLs are only honoured when the operator explicitly accepts the risk.
        if (line.hasOption("enableFileUrl") && !line.hasOption("enableUnsecureFeatures")) {
            System.err.println("If you want to enable fileUrl, you must also acknowledge the security risks\n" + "by including --enableUnsecureFeatures.  See CVE-2015-3271.");
            System.exit(-1);
        }
        InputStreamFactory inputStreamFactory = null;
        if (line.hasOption("enableFileUrl") && line.hasOption("enableUnsecureFeatures")) {
            inputStreamFactory = new URLEnabledInputStreamFactory();
            System.out.println(FILE_URL_WARNING);
        } else {
            inputStreamFactory = new DefaultInputStreamFactory();
        }
        TikaResource.init(tika, digester, inputStreamFactory);

        // REST resources; the welcome resource receives the list of core providers.
        JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
        List<ResourceProvider> rCoreProviders = new ArrayList<>();
        rCoreProviders.add(new SingletonResourceProvider(new MetadataResource()));
        rCoreProviders.add(new SingletonResourceProvider(new RecursiveMetadataResource()));
        rCoreProviders.add(new SingletonResourceProvider(new DetectorResource()));
        rCoreProviders.add(new SingletonResourceProvider(new LanguageResource()));
        rCoreProviders.add(new SingletonResourceProvider(new TranslateResource()));
        rCoreProviders.add(new SingletonResourceProvider(new TikaResource()));
        rCoreProviders.add(new SingletonResourceProvider(new UnpackerResource()));
        rCoreProviders.add(new SingletonResourceProvider(new TikaMimeTypes()));
        rCoreProviders.add(new SingletonResourceProvider(new TikaDetectors()));
        rCoreProviders.add(new SingletonResourceProvider(new TikaParsers()));
        rCoreProviders.add(new SingletonResourceProvider(new TikaVersion()));
        List<ResourceProvider> rAllProviders = new ArrayList<>(rCoreProviders);
        rAllProviders.add(new SingletonResourceProvider(new TikaWelcome(rCoreProviders)));
        sf.setResourceProviders(rAllProviders);

        // Message body writers, exception mapper and the optional filters.
        List<Object> providers = new ArrayList<>();
        providers.add(new TarWriter());
        providers.add(new ZipWriter());
        providers.add(new CSVMessageBodyWriter());
        providers.add(new MetadataListMessageBodyWriter());
        providers.add(new JSONMessageBodyWriter());
        providers.add(new XMPMessageBodyWriter());
        providers.add(new TextMessageBodyWriter());
        providers.add(new TikaServerParseExceptionMapper(returnStackTrace));
        if (logFilter != null) {
            providers.add(logFilter);
        }
        if (corsFilter != null) {
            providers.add(corsFilter);
        }
        sf.setProviders(providers);

        // Bind the address, register the JAX-RS binding factory and start the server.
        String url = "http://" + host + ":" + port + "/";
        sf.setAddress(url);
        BindingFactoryManager manager = sf.getBus().getExtension(BindingFactoryManager.class);
        JAXRSBindingFactory factory = new JAXRSBindingFactory();
        factory.setBus(sf.getBus());
        manager.registerBindingFactory(JAXRSBindingFactory.JAXRS_BINDING_ID, factory);
        sf.create();
        LOG.info("Started Apache Tika server at {}", url);
    } catch (Exception ex) {
        // Any failure during start-up is fatal: log it and exit.
        LOG.error("Can't start", ex);
        System.exit(-1);
    }
}
Also used : TikaParsers(org.apache.tika.server.resource.TikaParsers) JAXRSBindingFactory(org.apache.cxf.jaxrs.JAXRSBindingFactory) GnuParser(org.apache.commons.cli.GnuParser) ArrayList(java.util.ArrayList) UnpackerResource(org.apache.tika.server.resource.UnpackerResource) BindingFactoryManager(org.apache.cxf.binding.BindingFactoryManager) Tika(org.apache.tika.Tika) HelpFormatter(org.apache.commons.cli.HelpFormatter) MetadataListMessageBodyWriter(org.apache.tika.server.writer.MetadataListMessageBodyWriter) SingletonResourceProvider(org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider) ResourceProvider(org.apache.cxf.jaxrs.lifecycle.ResourceProvider) CommonsDigester(org.apache.tika.parser.utils.CommonsDigester) XMPMessageBodyWriter(org.apache.tika.server.writer.XMPMessageBodyWriter) TikaDetectors(org.apache.tika.server.resource.TikaDetectors) MetadataResource(org.apache.tika.server.resource.MetadataResource) RecursiveMetadataResource(org.apache.tika.server.resource.RecursiveMetadataResource) JSONMessageBodyWriter(org.apache.tika.server.writer.JSONMessageBodyWriter) DigestingParser(org.apache.tika.parser.DigestingParser) CrossOriginResourceSharingFilter(org.apache.cxf.rs.security.cors.CrossOriginResourceSharingFilter) DetectorResource(org.apache.tika.server.resource.DetectorResource) TranslateResource(org.apache.tika.server.resource.TranslateResource) RecursiveMetadataResource(org.apache.tika.server.resource.RecursiveMetadataResource) Options(org.apache.commons.cli.Options) ZipWriter(org.apache.tika.server.writer.ZipWriter) LanguageResource(org.apache.tika.server.resource.LanguageResource) SingletonResourceProvider(org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider) CSVMessageBodyWriter(org.apache.tika.server.writer.CSVMessageBodyWriter) TextMessageBodyWriter(org.apache.tika.server.writer.TextMessageBodyWriter) TikaMimeTypes(org.apache.tika.server.resource.TikaMimeTypes) CommandLineParser(org.apache.commons.cli.CommandLineParser) 
TarWriter(org.apache.tika.server.writer.TarWriter) TikaConfig(org.apache.tika.config.TikaConfig) TikaResource(org.apache.tika.server.resource.TikaResource) JAXRSServerFactoryBean(org.apache.cxf.jaxrs.JAXRSServerFactoryBean) TikaVersion(org.apache.tika.server.resource.TikaVersion) CommandLine(org.apache.commons.cli.CommandLine) TikaWelcome(org.apache.tika.server.resource.TikaWelcome)

Example 14 with Tika

use of org.apache.tika.Tika in project tika by apache.

the class SentimentParserTest method testCategorical.

/**
 * Parses a short sentence with the Tika instance built from
 * {@code tika-config-sentiment-opennlp-cat.xml} and expects the
 * {@code Sentiment} metadata entry to be {@code angry}.
 *
 * <p>When {@code getTika} returns {@code null} the test finishes without
 * asserting anything (presumably the configuration or model is unavailable
 * in that case; confirm against {@code getTika}).
 */
@Test
public void testCategorical() throws Exception {
    final Tika tika = getTika("tika-config-sentiment-opennlp-cat.xml");
    if (tika != null) {
        final String sentence = "Whatever, I need some cooling off time!";
        final byte[] rawBytes = sentence.getBytes(Charset.defaultCharset());
        final ByteArrayInputStream input = new ByteArrayInputStream(rawBytes);
        final Metadata metadata = new Metadata();
        tika.parse(input, metadata);
        final String detected = metadata.get("Sentiment");
        assertNotNull(detected);
        assertEquals("angry", detected);
    }
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) Tika(org.apache.tika.Tika) Test(org.junit.Test)

Example 15 with Tika

use of org.apache.tika.Tika in project tika by apache.

the class RTFParserTest method testMaxLength.

/**
 * Exercises the three ways the extracted-string length is bounded when
 * parsing {@code testRTFJapanese.rtf}: the default limit, a limit set on the
 * Tika instance, and a limit passed per call.
 *
 * <p>No stream is closed explicitly here; per the original notes,
 * {@code parseToString} closes the stream for convenience.
 */
@Test
public void testMaxLength() throws Exception {
    final File rtfFile = getResourceAsFile("/test-documents/testRTFJapanese.rtf");
    final Metadata sharedMetadata = new Metadata();

    // Phase 1: default limit on a fresh Tika instance.
    final InputStream defaultLimitStream = TikaInputStream.get(rtfFile, sharedMetadata);
    final Tika facade = new Tika();
    final String defaultLimitText = facade.parseToString(defaultLimitStream, sharedMetadata);
    assertTrue(defaultLimitText.length() > 500);

    // Phase 2: limit configured on the instance.
    facade.setMaxStringLength(200);
    final InputStream instanceLimitStream = TikaInputStream.get(rtfFile, sharedMetadata);
    final String instanceLimitText = facade.parseToString(instanceLimitStream, sharedMetadata);
    assertTrue(instanceLimitText.length() <= 200);

    // Phase 3: limit supplied for this call only.
    final InputStream perCallLimitStream = TikaInputStream.get(rtfFile, sharedMetadata);
    final String perCallLimitText = facade.parseToString(perCallLimitStream, sharedMetadata, 100);
    assertTrue(perCallLimitText.length() <= 100);
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) RTFMetadata(org.apache.tika.metadata.RTFMetadata) Tika(org.apache.tika.Tika) File(java.io.File) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

Tika (org.apache.tika.Tika): 50, Test (org.junit.Test): 32, Metadata (org.apache.tika.metadata.Metadata): 28, TikaTest (org.apache.tika.TikaTest): 12, TikaConfig (org.apache.tika.config.TikaConfig): 12, ByteArrayInputStream (java.io.ByteArrayInputStream): 11, File (java.io.File): 6, InputStream (java.io.InputStream): 6, URL (java.net.URL): 5, HashSet (java.util.HashSet): 4, TikaInputStream (org.apache.tika.io.TikaInputStream): 4, Ignore (org.junit.Ignore): 4, FileInputStream (java.io.FileInputStream): 3, Before (org.junit.Before): 3, IOException (java.io.IOException): 2, ArrayList (java.util.ArrayList): 2, Response (javax.ws.rs.core.Response): 2, CompositeDetector (org.apache.tika.detect.CompositeDetector): 2, TikaException (org.apache.tika.exception.TikaException): 2, MimeTypes (org.apache.tika.mime.MimeTypes): 2