Search in sources :

Example 31 with TikaConfig

use of org.apache.tika.config.TikaConfig in project jackrabbit-oak by apache.

the class BinaryTextExtractor method createDefaultParser.

/**
 * Creates the parser from the {@code tika-config.xml} resource that is
 * resolved relative to {@link LuceneIndexEditorContext}.
 *
 * <p>While the configuration is being read, the thread context class loader
 * is switched to the class loader of {@code LuceneIndexEditorContext}; the
 * previous loader is restored in all cases before this method returns.
 *
 * @return a parser backed by the bundled configuration, or a parser created
 *         with the no-argument constructor when the resource is missing or
 *         loading it fails (both cases are logged as warnings)
 */
private static AutoDetectParser createDefaultParser() {
    Thread thread = Thread.currentThread();
    ClassLoader previousLoader = thread.getContextClassLoader();
    URL bundledConfigUrl = LuceneIndexEditorContext.class.getResource("tika-config.xml");
    if (bundledConfigUrl == null) {
        log.warn("Default Tika configuration not found");
        return new AutoDetectParser();
    }

    InputStream configStream = null;
    try {
        thread.setContextClassLoader(LuceneIndexEditorContext.class.getClassLoader());
        configStream = bundledConfigUrl.openStream();
        TikaConfig bundledConfig = new TikaConfig(configStream);
        log.info("Loaded default Tika Config from classpath {}", bundledConfigUrl);
        return new AutoDetectParser(bundledConfig);
    } catch (Exception e) {
        log.warn("Tika configuration not available : " + bundledConfigUrl, e);
    } finally {
        IOUtils.closeQuietly(configStream);
        thread.setContextClassLoader(previousLoader);
    }
    // Reached only after a load failure; the fallback is deliberately built
    // here, after the finally block has restored the original class loader.
    return new AutoDetectParser();
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) LazyInputStream(org.apache.jackrabbit.oak.commons.io.LazyInputStream) CountingInputStream(com.google.common.io.CountingInputStream) InputStream(java.io.InputStream) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) LuceneIndexEditorContext(org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexEditorContext) URL(java.net.URL)

Example 32 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class RecursiveParserWrapperFSConsumerTest method testEmbeddedThenNPE.

/**
 * Runs {@code embedded_then_npe.xml} through a
 * {@link RecursiveParserWrapperFSConsumer} and inspects the JSON metadata
 * list written to the first mock output stream: two entries are expected,
 * the first carrying the runtime exception text and the container author,
 * the second carrying the embedded author and content.
 */
@Test
public void testEmbeddedThenNPE() throws Exception {
    final String resourcePath = "/test-documents/embedded_then_npe.xml";
    final Metadata fileMetadata = new Metadata();
    fileMetadata.add(Metadata.RESOURCE_NAME_KEY, "embedded_then_npe.xml");

    // Two slots: the file under test followed by the poison resource.
    ArrayBlockingQueue<FileResource> queue = new ArrayBlockingQueue<>(2);
    FileResource testFile = new FileResource() {

        @Override
        public String getResourceId() {
            return "testFile";
        }

        @Override
        public Metadata getMetadata() {
            return fileMetadata;
        }

        @Override
        public InputStream openInputStream() throws IOException {
            return this.getClass().getResourceAsStream(resourcePath);
        }
    };
    queue.add(testFile);
    queue.add(new PoisonFileResource());

    MockOSFactory mockOSFactory = new MockOSFactory();
    RecursiveParserWrapperFSConsumer consumer = new RecursiveParserWrapperFSConsumer(
            queue,
            new AutoDetectParserFactory(),
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
            mockOSFactory,
            new TikaConfig());
    consumer.call();

    mockOSFactory.getStreams().get(0).flush();
    byte[] jsonBytes = mockOSFactory.getStreams().get(0).toByteArray();
    InputStreamReader jsonReader = new InputStreamReader(new ByteArrayInputStream(jsonBytes), UTF_8);
    List<Metadata> results = JsonMetadataList.fromJson(jsonReader);

    assertEquals(2, results.size());
    Metadata container = results.get(0);
    Metadata embedded = results.get(1);
    assertContains("another null pointer", container.get(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "runtime"));
    assertEquals("Nikolai Lobachevsky", container.get("author"));
    assertEquals("embeddedAuthor", embedded.get("author"));
    assertContains("some_embedded_content", embedded.get(RecursiveParserWrapper.TIKA_CONTENT));
}
Also used : RecursiveParserWrapperFSConsumer(org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaConfig(org.apache.tika.config.TikaConfig) InputStreamReader(java.io.InputStreamReader) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) IOException(java.io.IOException) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 33 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class TestMimeTypes method setUp.

/**
 * Prepares the fixture before each test: takes the MIME repository from the
 * default Tika configuration, builds a {@link Tika} facade on that same
 * configuration, and creates a sample URL that carries a query string.
 */
@Before
public void setUp() throws Exception {
    final TikaConfig defaultConfig = TikaConfig.getDefaultConfig();
    this.repo = defaultConfig.getMimeRepository();
    this.tika = new Tika(defaultConfig);
    this.u = new URL("http://mydomain.com/x.pdf?x=y");
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) Tika(org.apache.tika.Tika) URL(java.net.URL) Before(org.junit.Before)

Example 34 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class ExcelParserTest method testMacros.

/**
 * Verifies macro extraction for {@code testEXCEL_macro.xls} in three steps:
 * <ol>
 *   <li>with default settings no {@code text/x-vbasic} entry may appear;</li>
 *   <li>with {@code OfficeParserConfig.setExtractMacros(true)} placed in the
 *       {@link ParseContext}, the macro content and metadata must appear;</li>
 *   <li>the same must hold when the parser is built from
 *       {@code tika-config-macros.xml}.</li>
 * </ol>
 */
@Test
public void testMacros() throws Exception {
    //test default is "don't extract macros"
    for (Metadata metadata : getRecursiveMetadata("testEXCEL_macro.xls")) {
        // Constant-first equals: an entry without a Content-Type must not
        // abort the test with a NullPointerException.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }
    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);
    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Dirty()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty dirt dirt");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xls", context));
    //test configuring via config file
    // try-with-resources so the classpath stream is closed even when an
    // assertion fails; a null resource is tolerated by the construct itself.
    try (java.io.InputStream configStream = this.getClass().getResourceAsStream("tika-config-macros.xml")) {
        TikaConfig tikaConfig = new TikaConfig(configStream);
        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
        assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xls", parser));
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 35 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class BasicTikaFSConsumersBuilder method build.

/**
 * Builds the {@link ConsumersManager} for a file-system batch run.
 *
 * <p>For each of the settings {@code recursiveParserWrapper} and
 * {@code consumersManagerMaxMillis}, a value in {@code runtimeAttributes}
 * takes precedence; the attribute of the same name on {@code node} is only
 * consulted when the runtime value is absent. The Tika configuration path is
 * read from the runtime attribute {@code "c"}, falling back to the
 * {@code tikaConfig} attribute on {@code node}; with no path at all the
 * default configuration is used.
 *
 * @param node              XML element whose attributes supply fallbacks and whose
 *                          {@code parser}, {@code contenthandler} and
 *                          {@code outputstream} children configure the factories
 * @param runtimeAttributes overriding settings supplied at run time
 * @param queue             queue handed to every consumer that is created
 * @return a manager holding the created consumers, with its max-millis
 *         setting applied when one was configured
 * @throws RuntimeException if the configuration file cannot be loaded, or if
 *                          any of the three required child elements is missing
 */
@Override
public ConsumersManager build(Node node, Map<String, String> runtimeAttributes, ArrayBlockingQueue<FileResource> queue) {
    // Decide which consumer flavour to build.
    boolean useRecursiveWrapper = false;
    String wrapperFlag = runtimeAttributes.get("recursiveParserWrapper");
    if (wrapperFlag == null) {
        Node wrapperAttr = node.getAttributes().getNamedItem("recursiveParserWrapper");
        if (wrapperAttr != null) {
            useRecursiveWrapper = PropsUtil.getBoolean(wrapperAttr.getNodeValue(), useRecursiveWrapper);
        }
    } else {
        useRecursiveWrapper = PropsUtil.getBoolean(wrapperFlag, useRecursiveWrapper);
    }

    // How long the manager may run on init() and shutdown(); null means "not configured".
    Long maxMillis = null;
    String maxMillisText = runtimeAttributes.get("consumersManagerMaxMillis");
    if (maxMillisText == null) {
        Node maxMillisAttr = node.getAttributes().getNamedItem("consumersManagerMaxMillis");
        if (maxMillisAttr != null) {
            maxMillis = PropsUtil.getLong(maxMillisAttr.getNodeValue(), null);
        }
    } else {
        maxMillis = PropsUtil.getLong(maxMillisText, null);
    }

    // Locate the Tika configuration file, if any.
    String configPath = runtimeAttributes.get("c");
    if (configPath == null) {
        Node configAttr = node.getAttributes().getNamedItem("tikaConfig");
        if (configAttr != null) {
            configPath = PropsUtil.getString(configAttr.getNodeValue(), null);
        }
    }
    TikaConfig tikaConfig;
    if (configPath == null) {
        tikaConfig = TikaConfig.getDefaultConfig();
    } else {
        try (InputStream configStream = Files.newInputStream(Paths.get(configPath))) {
            tikaConfig = new TikaConfig(configStream);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    List<FileResourceConsumer> consumers = new LinkedList<>();
    int consumerCount = BatchProcessBuilder.getNumConsumers(runtimeAttributes);

    // Pick out the three factory-defining children; if a name repeats, the last one wins.
    Node handlerNode = null;
    Node parserNode = null;
    Node streamNode = null;
    NodeList children = node.getChildNodes();
    for (int idx = 0; idx < children.getLength(); idx++) {
        Node child = children.item(idx);
        switch (child.getNodeName()) {
            case "parser":
                parserNode = child;
                break;
            case "contenthandler":
                handlerNode = child;
                break;
            case "outputstream":
                streamNode = child;
                break;
            default:
                break;
        }
    }
    if (handlerNode == null || parserNode == null || streamNode == null) {
        throw new RuntimeException("You must specify a ContentHandlerFactory, "
                + "a ParserFactory and an OutputStreamFactory");
    }

    ContentHandlerFactory handlerFactory = getContentHandlerFactory(handlerNode, runtimeAttributes);
    ParserFactory parserFactory = getParserFactory(parserNode, runtimeAttributes);
    OutputStreamFactory streamFactory = getOutputStreamFactory(streamNode, runtimeAttributes, handlerFactory, useRecursiveWrapper);

    for (int idx = 0; idx < consumerCount; idx++) {
        FileResourceConsumer consumer;
        if (useRecursiveWrapper) {
            consumer = new RecursiveParserWrapperFSConsumer(queue, parserFactory, handlerFactory, streamFactory, tikaConfig);
        } else {
            consumer = new BasicTikaFSConsumer(queue, parserFactory, handlerFactory, streamFactory, tikaConfig);
        }
        consumers.add(consumer);
    }

    ConsumersManager manager = new FSConsumersManager(consumers);
    if (maxMillis != null) {
        manager.setConsumersManagerMaxMillis(maxMillis);
    }
    return manager;
}
Also used : ContentHandlerFactory(org.apache.tika.sax.ContentHandlerFactory) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) FSConsumersManager(org.apache.tika.batch.fs.FSConsumersManager) RecursiveParserWrapperFSConsumer(org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer) TikaConfig(org.apache.tika.config.TikaConfig) InputStream(java.io.InputStream) Node(org.w3c.dom.Node) NodeList(org.w3c.dom.NodeList) FSOutputStreamFactory(org.apache.tika.batch.fs.FSOutputStreamFactory) OutputStreamFactory(org.apache.tika.batch.OutputStreamFactory) ParserFactory(org.apache.tika.batch.ParserFactory) LinkedList(java.util.LinkedList) ConsumersManager(org.apache.tika.batch.ConsumersManager) BasicTikaFSConsumer(org.apache.tika.batch.fs.BasicTikaFSConsumer) FileResourceConsumer(org.apache.tika.batch.FileResourceConsumer)

Aggregations

TikaConfig (org.apache.tika.config.TikaConfig)62 Test (org.junit.Test)32 Metadata (org.apache.tika.metadata.Metadata)26 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)20 TikaTest (org.apache.tika.TikaTest)16 InputStream (java.io.InputStream)12 Tika (org.apache.tika.Tika)12 IOException (java.io.IOException)10 URL (java.net.URL)10 TikaException (org.apache.tika.exception.TikaException)9 TikaInputStream (org.apache.tika.io.TikaInputStream)9 ParseContext (org.apache.tika.parser.ParseContext)9 Parser (org.apache.tika.parser.Parser)9 MediaType (org.apache.tika.mime.MediaType)8 CompositeParser (org.apache.tika.parser.CompositeParser)8 ByteArrayInputStream (java.io.ByteArrayInputStream)7 File (java.io.File)6 TikaConfigTest (org.apache.tika.config.TikaConfigTest)6 HashSet (java.util.HashSet)5 SAXException (org.xml.sax.SAXException)5