use of org.apache.tika.config.TikaConfig in project jackrabbit-oak by apache.
the class BinaryTextExtractor method createDefaultParser.
/**
 * Creates the parser, preferring the {@code tika-config.xml} resource resolved
 * relative to {@link LuceneIndexEditorContext}.
 *
 * <p>While the configuration is being read, the thread's context class loader is
 * switched to the one that loaded {@link LuceneIndexEditorContext}; the previous
 * loader is always put back afterwards. If the resource is missing or cannot be
 * loaded, a warning is logged and a parser with no explicit configuration is
 * returned instead.
 *
 * @return a parser built from the bundled configuration, or a default-constructed
 *         one when that configuration is unavailable
 */
private static AutoDetectParser createDefaultParser() {
    ClassLoader previousLoader = Thread.currentThread().getContextClassLoader();
    URL bundledConfig = LuceneIndexEditorContext.class.getResource("tika-config.xml");
    if (bundledConfig == null) {
        log.warn("Default Tika configuration not found");
        return new AutoDetectParser();
    }

    InputStream configStream = null;
    try {
        Thread.currentThread().setContextClassLoader(LuceneIndexEditorContext.class.getClassLoader());
        configStream = bundledConfig.openStream();
        TikaConfig tikaConfig = new TikaConfig(configStream);
        log.info("Loaded default Tika Config from classpath {}", bundledConfig);
        return new AutoDetectParser(tikaConfig);
    } catch (Exception e) {
        log.warn("Tika configuration not available : " + bundledConfig, e);
    } finally {
        IOUtils.closeQuietly(configStream);
        Thread.currentThread().setContextClassLoader(previousLoader);
    }
    // Reached only after a load failure; by now the original class loader is restored.
    return new AutoDetectParser();
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class RecursiveParserWrapperFSConsumerTest method testEmbeddedThenNPE.
/**
 * Runs a {@link RecursiveParserWrapperFSConsumer} over the
 * {@code embedded_then_npe.xml} test document and checks the JSON it writes:
 * two metadata entries are expected, the first carrying the runtime exception
 * text and the container's author, the second the embedded author and content.
 */
@Test
public void testEmbeddedThenNPE() throws Exception {
    final String resourcePath = "/test-documents/embedded_then_npe.xml";
    final Metadata inputMetadata = new Metadata();
    inputMetadata.add(Metadata.RESOURCE_NAME_KEY, "embedded_then_npe.xml");

    // Queue holds the single test resource followed by the poison marker.
    ArrayBlockingQueue<FileResource> resources = new ArrayBlockingQueue<FileResource>(2);
    FileResource testResource = new FileResource() {
        @Override
        public String getResourceId() {
            return "testFile";
        }

        @Override
        public Metadata getMetadata() {
            return inputMetadata;
        }

        @Override
        public InputStream openInputStream() throws IOException {
            return this.getClass().getResourceAsStream(resourcePath);
        }
    };
    resources.add(testResource);
    resources.add(new PoisonFileResource());

    MockOSFactory mockOSFactory = new MockOSFactory();
    AutoDetectParserFactory parserFactory = new AutoDetectParserFactory();
    BasicContentHandlerFactory handlerFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
    RecursiveParserWrapperFSConsumer consumer = new RecursiveParserWrapperFSConsumer(
            resources, parserFactory, handlerFactory, mockOSFactory, new TikaConfig());
    IFileProcessorFutureResult result = consumer.call();

    // Read back what the consumer wrote to the first mock output stream.
    mockOSFactory.getStreams().get(0).flush();
    byte[] written = mockOSFactory.getStreams().get(0).toByteArray();
    InputStreamReader jsonReader = new InputStreamReader(new ByteArrayInputStream(written), UTF_8);
    List<Metadata> results = JsonMetadataList.fromJson(jsonReader);

    assertEquals(2, results.size());
    Metadata container = results.get(0);
    Metadata embedded = results.get(1);
    assertContains("another null pointer", container.get(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "runtime"));
    assertEquals("Nikolai Lobachevsky", container.get("author"));
    assertEquals("embeddedAuthor", embedded.get("author"));
    assertContains("some_embedded_content", embedded.get(RecursiveParserWrapper.TIKA_CONTENT));
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class TestMimeTypes method setUp.
/**
 * Prepares the fixtures before each test: the MIME repository and the
 * {@link Tika} facade are both taken from the default configuration, and
 * {@code u} is set to a sample URL with a query string.
 */
@Before
public void setUp() throws Exception {
    final TikaConfig defaultConfig = TikaConfig.getDefaultConfig();
    this.repo = defaultConfig.getMimeRepository();
    this.tika = new Tika(defaultConfig);
    this.u = new URL("http://mydomain.com/x.pdf?x=y");
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class ExcelParserTest method testMacros.
/**
 * Checks macro extraction for {@code testEXCEL_macro.xls} in three steps:
 * <ol>
 *   <li>with no extra configuration, no {@code text/x-vbasic} entry may appear;</li>
 *   <li>with {@link OfficeParserConfig#setExtractMacros(boolean)} enabled through
 *       the {@link ParseContext}, the expected macro metadata must be present;</li>
 *   <li>the same expectation must hold when the parser is built from
 *       {@code tika-config-macros.xml}.</li>
 * </ol>
 */
@Test
public void testMacros() throws Exception {
    //test default is "don't extract macros"
    for (Metadata metadata : getRecursiveMetadata("testEXCEL_macro.xls")) {
        // Constant-first comparison: an entry without a content type simply does not
        // match, instead of aborting the test with a NullPointerException.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }

    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Dirty()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty dirt dirt");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xls", context));

    //test configuring via config file
    TikaConfig tikaConfig;
    // The stream is only needed while the config is built, so close it right after
    // instead of leaving it open for the rest of the test run.
    try (java.io.InputStream configStream = this.getClass().getResourceAsStream("tika-config-macros.xml")) {
        tikaConfig = new TikaConfig(configStream);
    }
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xls", parser));
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class BasicTikaFSConsumersBuilder method build.
/**
 * Builds the {@link ConsumersManager} for a batch run.
 *
 * <p>Settings are resolved with the runtime attributes taking precedence over the
 * attributes of the XML {@code node}:
 * <ul>
 *   <li>{@code recursiveParserWrapper} selects which consumer class is created;</li>
 *   <li>{@code consumersManagerMaxMillis}, when resolved to a non-null value, is
 *       applied to the returned manager;</li>
 *   <li>the Tika config path comes from runtime attribute {@code c} or node
 *       attribute {@code tikaConfig}; without a path the default config is used.</li>
 * </ul>
 * The child nodes {@code parser}, {@code contenthandler} and {@code outputstream}
 * are all required and supply the three factories handed to every consumer.
 *
 * @param node              configuration node describing the consumers
 * @param runtimeAttributes runtime overrides, keyed by attribute name
 * @param queue             queue the created consumers are constructed with
 * @return a manager wrapping the created consumers
 * @throws RuntimeException if the configured Tika config cannot be loaded, or if
 *                          one of the three required child nodes is missing
 */
@Override
public ConsumersManager build(Node node, Map<String, String> runtimeAttributes, ArrayBlockingQueue<FileResource> queue) {
    //figure out if we're building a recursiveParserWrapper
    boolean recursiveParserWrapper = false;
    String wrapperRuntimeValue = runtimeAttributes.get("recursiveParserWrapper");
    if (wrapperRuntimeValue == null) {
        Node wrapperAttribute = node.getAttributes().getNamedItem("recursiveParserWrapper");
        if (wrapperAttribute != null) {
            recursiveParserWrapper = PropsUtil.getBoolean(wrapperAttribute.getNodeValue(), recursiveParserWrapper);
        }
    } else {
        recursiveParserWrapper = PropsUtil.getBoolean(wrapperRuntimeValue, recursiveParserWrapper);
    }

    //how long to let the consumersManager run on init() and shutdown()
    Long consumersManagerMaxMillis = null;
    String maxMillisRuntimeValue = runtimeAttributes.get("consumersManagerMaxMillis");
    if (maxMillisRuntimeValue == null) {
        Node maxMillisAttribute = node.getAttributes().getNamedItem("consumersManagerMaxMillis");
        if (maxMillisAttribute != null) {
            consumersManagerMaxMillis = PropsUtil.getLong(maxMillisAttribute.getNodeValue(), null);
        }
    } else {
        consumersManagerMaxMillis = PropsUtil.getLong(maxMillisRuntimeValue, null);
    }

    // Tika config: explicit path if one is given, default config otherwise.
    String tikaConfigPath = runtimeAttributes.get("c");
    if (tikaConfigPath == null) {
        Node tikaConfigAttribute = node.getAttributes().getNamedItem("tikaConfig");
        if (tikaConfigAttribute != null) {
            tikaConfigPath = PropsUtil.getString(tikaConfigAttribute.getNodeValue(), null);
        }
    }
    TikaConfig config;
    if (tikaConfigPath == null) {
        config = TikaConfig.getDefaultConfig();
    } else {
        try (InputStream configStream = Files.newInputStream(Paths.get(tikaConfigPath))) {
            config = new TikaConfig(configStream);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    int numConsumers = BatchProcessBuilder.getNumConsumers(runtimeAttributes);

    // Pick out the three factory definitions among the child nodes.
    Node parserFactoryNode = null;
    Node contentHandlerFactoryNode = null;
    Node outputStreamFactoryNode = null;
    NodeList children = node.getChildNodes();
    for (int idx = 0; idx < children.getLength(); idx++) {
        Node child = children.item(idx);
        switch (child.getNodeName()) {
            case "parser":
                parserFactoryNode = child;
                break;
            case "contenthandler":
                contentHandlerFactoryNode = child;
                break;
            case "outputstream":
                outputStreamFactoryNode = child;
                break;
            default:
                break;
        }
    }
    boolean allFactoriesDeclared = contentHandlerFactoryNode != null
            && parserFactoryNode != null
            && outputStreamFactoryNode != null;
    if (!allFactoriesDeclared) {
        throw new RuntimeException("You must specify a ContentHandlerFactory, " + "a ParserFactory and an OutputStreamFactory");
    }

    ContentHandlerFactory contentHandlerFactory = getContentHandlerFactory(contentHandlerFactoryNode, runtimeAttributes);
    ParserFactory parserFactory = getParserFactory(parserFactoryNode, runtimeAttributes);
    OutputStreamFactory outputStreamFactory = getOutputStreamFactory(outputStreamFactoryNode, runtimeAttributes, contentHandlerFactory, recursiveParserWrapper);

    // One consumer per requested slot; the flag decides the implementation.
    List<FileResourceConsumer> consumers = new LinkedList<FileResourceConsumer>();
    for (int idx = 0; idx < numConsumers; idx++) {
        if (recursiveParserWrapper) {
            consumers.add(new RecursiveParserWrapperFSConsumer(queue, parserFactory, contentHandlerFactory, outputStreamFactory, config));
        } else {
            consumers.add(new BasicTikaFSConsumer(queue, parserFactory, contentHandlerFactory, outputStreamFactory, config));
        }
    }

    ConsumersManager manager = new FSConsumersManager(consumers);
    if (consumersManagerMaxMillis != null) {
        manager.setConsumersManagerMaxMillis(consumersManagerMaxMillis);
    }
    return manager;
}
Aggregations