Use of org.apache.tika.batch.FileResourceConsumer in project tika by apache.
Class EvalConsumersBuilder, method build.
/**
 * Assembles a {@link DBConsumersManager} together with its consumers.
 *
 * <p>Configuration is read from the merged attribute map produced by
 * {@code XMLDOMUtil.mapifyAttrs(node, runtimeAttributes)}. The keys consulted are
 * {@code db}, {@code jdbc}, {@code jdbcDriver}, {@code langModelDir},
 * {@code commonTokens}, {@code defaultLangCode}, {@code consumerBuilderClass}
 * and {@code drop}.
 *
 * @param node              configuration node for this builder
 * @param runtimeAttributes attributes supplied at runtime
 * @param queue             queue handed to the consumer builder's {@code init}
 * @return the manager wrapping the freshly built consumers
 * @throws RuntimeException if neither {@code db} nor {@code jdbc} is given, if no
 *                          consumer builder could be created, or wrapping any
 *                          {@link IOException}/{@link SQLException} raised on the way
 */
@Override
public ConsumersManager build(Node node, Map<String, String> runtimeAttributes, ArrayBlockingQueue<FileResource> queue) {
    List<FileResourceConsumer> builtConsumers = new LinkedList<>();
    int consumerCount = BatchProcessBuilder.getNumConsumers(runtimeAttributes);
    Map<String, String> attrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);

    Path dbPath = getPath(attrs, "db");
    String connectionString = attrs.get("jdbc");

    // language models: a configured directory wins over the built-in set
    Path modelDir = getPath(attrs, "langModelDir");
    try {
        if (modelDir != null) {
            LanguageIDWrapper.loadModels(modelDir);
        } else {
            LanguageIDWrapper.loadBuiltInModels();
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // commonTokensPath can be null, in which case will load from memory
    Path commonTokensPath = getPath(attrs, "commonTokens");
    String langCode = attrs.get("defaultLangCode");
    if (langCode == null || langCode.isEmpty()) {
        langCode = "en";
    }
    try {
        AbstractProfiler.loadCommonTokens(commonTokensPath, langCode);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // a file-based db takes precedence over a jdbc connection string
    if (dbPath == null && connectionString == null) {
        throw new RuntimeException("Must specify: -db or -jdbc");
    }
    JDBCUtil dbUtil;
    if (dbPath != null) {
        dbUtil = new H2Util(dbPath);
    } else {
        dbUtil = new JDBCUtil(connectionString, attrs.get("jdbcDriver"));
    }

    String builderClassName = PropsUtil.getString(attrs.get("consumerBuilderClass"), null);
    EvalConsumerBuilder evalBuilder = ClassLoaderUtil.buildClass(EvalConsumerBuilder.class, builderClassName);
    if (evalBuilder == null) {
        throw new RuntimeException("Must specify consumerBuilderClass in config file");
    }

    boolean dropTables = PropsUtil.getBoolean(attrs.get("drop"), false);

    // init, per-consumer build and manager creation all fail the same way:
    // the checked exception is rethrown wrapped in a RuntimeException
    DBConsumersManager consumersManager;
    try {
        MimeBuffer mimes = evalBuilder.init(queue, attrs, dbUtil, dropTables);
        for (int remaining = consumerCount; remaining > 0; remaining--) {
            builtConsumers.add(evalBuilder.build());
        }
        consumersManager = new DBConsumersManager(dbUtil, mimes, builtConsumers);
    } catch (IOException | SQLException e) {
        throw new RuntimeException(e);
    }

    evalBuilder.addErrorLogTablePairs(consumersManager);
    return consumersManager;
}
Use of org.apache.tika.batch.FileResourceConsumer in project tika by apache.
Class BasicTikaFSConsumersBuilder, method build.
/**
 * Builds an {@link FSConsumersManager} holding one consumer per configured
 * consumer slot.
 *
 * <p>For {@code recursiveParserWrapper} and {@code consumersManagerMaxMillis} a
 * runtime attribute, when present, is used instead of the attribute of the same
 * name on {@code node}. The Tika config path is taken from runtime attribute
 * {@code c}, falling back to the node attribute {@code tikaConfig}; when neither
 * is set the default {@link TikaConfig} is used.
 *
 * <p>The node must have {@code parser}, {@code contenthandler} and
 * {@code outputstream} child nodes.
 *
 * @param node              configuration node for this builder
 * @param runtimeAttributes attributes supplied at runtime
 * @param queue             queue passed to every consumer
 * @return manager wrapping the created consumers
 * @throws RuntimeException if the Tika config cannot be loaded or one of the
 *                          three required child nodes is missing
 */
@Override
public ConsumersManager build(Node node, Map<String, String> runtimeAttributes, ArrayBlockingQueue<FileResource> queue) {
    //figure out if we're building a recursiveParserWrapper
    boolean recursiveParserWrapper = false;
    String recursiveParserWrapperString = runtimeAttributes.get("recursiveParserWrapper");
    if (recursiveParserWrapperString != null) {
        recursiveParserWrapper = PropsUtil.getBoolean(recursiveParserWrapperString, recursiveParserWrapper);
    } else {
        Node recursiveParserWrapperNode = attributeNode(node, "recursiveParserWrapper");
        if (recursiveParserWrapperNode != null) {
            recursiveParserWrapper = PropsUtil.getBoolean(recursiveParserWrapperNode.getNodeValue(), recursiveParserWrapper);
        }
    }

    //how long to let the consumersManager run on init() and shutdown()
    Long consumersManagerMaxMillis = null;
    String consumersManagerMaxMillisString = runtimeAttributes.get("consumersManagerMaxMillis");
    if (consumersManagerMaxMillisString != null) {
        consumersManagerMaxMillis = PropsUtil.getLong(consumersManagerMaxMillisString, null);
    } else {
        Node consumersManagerMaxMillisNode = attributeNode(node, "consumersManagerMaxMillis");
        if (consumersManagerMaxMillisNode != null) {
            consumersManagerMaxMillis = PropsUtil.getLong(consumersManagerMaxMillisNode.getNodeValue(), null);
        }
    }

    String tikaConfigPath = runtimeAttributes.get("c");
    if (tikaConfigPath == null) {
        Node tikaConfigNode = attributeNode(node, "tikaConfig");
        if (tikaConfigNode != null) {
            tikaConfigPath = PropsUtil.getString(tikaConfigNode.getNodeValue(), null);
        }
    }
    TikaConfig config;
    if (tikaConfigPath != null) {
        try (InputStream is = Files.newInputStream(Paths.get(tikaConfigPath))) {
            config = new TikaConfig(is);
        } catch (Exception e) {
            // name the offending path so a bad config location is easy to diagnose
            throw new RuntimeException("Unable to load TikaConfig from: " + tikaConfigPath, e);
        }
    } else {
        config = TikaConfig.getDefaultConfig();
    }

    int numConsumers = BatchProcessBuilder.getNumConsumers(runtimeAttributes);

    // locate the three factory definitions among the child nodes;
    // if a name occurs more than once the last occurrence is used
    NodeList nodeList = node.getChildNodes();
    Node contentHandlerFactoryNode = null;
    Node parserFactoryNode = null;
    Node outputStreamFactoryNode = null;
    for (int i = 0; i < nodeList.getLength(); i++) {
        Node child = nodeList.item(i);
        String cn = child.getNodeName();
        if ("parser".equals(cn)) {
            parserFactoryNode = child;
        } else if ("contenthandler".equals(cn)) {
            contentHandlerFactoryNode = child;
        } else if ("outputstream".equals(cn)) {
            outputStreamFactoryNode = child;
        }
    }
    if (contentHandlerFactoryNode == null || parserFactoryNode == null || outputStreamFactoryNode == null) {
        throw new RuntimeException("You must specify a ContentHandlerFactory, " + "a ParserFactory and an OutputStreamFactory");
    }

    ContentHandlerFactory contentHandlerFactory = getContentHandlerFactory(contentHandlerFactoryNode, runtimeAttributes);
    ParserFactory parserFactory = getParserFactory(parserFactoryNode, runtimeAttributes);
    OutputStreamFactory outputStreamFactory = getOutputStreamFactory(outputStreamFactoryNode, runtimeAttributes, contentHandlerFactory, recursiveParserWrapper);

    List<FileResourceConsumer> consumers = new LinkedList<>();
    for (int i = 0; i < numConsumers; i++) {
        FileResourceConsumer c;
        if (recursiveParserWrapper) {
            c = new RecursiveParserWrapperFSConsumer(queue, parserFactory, contentHandlerFactory, outputStreamFactory, config);
        } else {
            c = new BasicTikaFSConsumer(queue, parserFactory, contentHandlerFactory, outputStreamFactory, config);
        }
        consumers.add(c);
    }

    ConsumersManager manager = new FSConsumersManager(consumers);
    if (consumersManagerMaxMillis != null) {
        manager.setConsumersManagerMaxMillis(consumersManagerMaxMillis);
    }
    return manager;
}

/**
 * Looks up an attribute node by name without assuming the node has attributes.
 *
 * @param node          node whose attributes are searched
 * @param attributeName name of the attribute to find
 * @return the attribute node, or {@code null} if {@code node} has no attribute
 *         map or no attribute with that name
 */
private static Node attributeNode(Node node, String attributeName) {
    // getAttributes() returns null for anything that is not an element
    org.w3c.dom.NamedNodeMap attributes = node.getAttributes();
    return attributes == null ? null : attributes.getNamedItem(attributeName);
}
Aggregations