Use of org.apache.tika.batch.FileResourceCrawler in project tika by apache.
The class BatchProcessBuilder, method build.
/**
 * Builds a BatchProcess from runtime arguments and the
 * document element of an XML configuration file. With the exception of the QueueBuilder,
 * each builder chooses how to adjudicate between
 * runtime arguments and the elements in the configuration file.
 *
 * @param docElement document element of the XML config file
 * @param incomingRuntimeAttributes runtime arguments
 * @return the fully built BatchProcess
 */
public BatchProcess build(Node docElement, Map<String, String> incomingRuntimeAttributes) {
    //key components
    long timeoutThresholdMillis = XMLDOMUtil.getLong("timeoutThresholdMillis",
            incomingRuntimeAttributes, docElement);
    long timeoutCheckPulseMillis = XMLDOMUtil.getLong("timeoutCheckPulseMillis",
            incomingRuntimeAttributes, docElement);
    long pauseOnEarlyTerminationMillis = XMLDOMUtil.getLong("pauseOnEarlyTerminationMillis",
            incomingRuntimeAttributes, docElement);
    int maxAliveTimeSeconds = XMLDOMUtil.getInt("maxAliveTimeSeconds",
            incomingRuntimeAttributes, docElement);

    FileResourceCrawler crawler = null;
    ConsumersManager consumersManager = null;
    StatusReporter reporter = null;
    Interrupter interrupter = null;

    /*
     * TODO: This is a bit smelly. NumConsumers needs to be used by the crawler
     * and the consumers. This copies the incomingRuntimeAttributes and then
     * supplies the numConsumers from the commandline (if it exists) or from the config file.
     * At least this creates an unmodifiable defensive copy of incomingRuntimeAttributes...
     */
    Map<String, String> runtimeAttributes = setNumConsumersInRuntimeAttributes(docElement, incomingRuntimeAttributes);

    //build queue
    ArrayBlockingQueue<FileResource> queue = buildQueue(docElement, runtimeAttributes);

    NodeList children = docElement.getChildNodes();
    Map<String, Node> keyNodes = new HashMap<String, Node>();
    for (int i = 0; i < children.getLength(); i++) {
        Node child = children.item(i);
        if (child.getNodeType() != Node.ELEMENT_NODE) {
            continue;
        }
        String nodeName = child.getNodeName();
        keyNodes.put(nodeName, child);
    }

    //build consumers
    consumersManager = buildConsumersManager(keyNodes.get("consumers"), runtimeAttributes, queue);
    //build crawler
    crawler = buildCrawler(queue, keyNodes.get("crawler"), runtimeAttributes);
    reporter = buildReporter(crawler, consumersManager, keyNodes.get("reporter"), runtimeAttributes);
    interrupter = buildInterrupter(keyNodes.get("interrupter"), runtimeAttributes);

    BatchProcess proc = new BatchProcess(crawler, consumersManager, reporter, interrupter);
    if (timeoutThresholdMillis > -1) {
        proc.setTimeoutThresholdMillis(timeoutThresholdMillis);
    }
    if (pauseOnEarlyTerminationMillis > -1) {
        proc.setPauseOnEarlyTerminationMillis(pauseOnEarlyTerminationMillis);
    }
    if (timeoutCheckPulseMillis > -1) {
        proc.setTimeoutCheckPulseMillis(timeoutCheckPulseMillis);
    }
    proc.setMaxAliveTimeSeconds(maxAliveTimeSeconds);
    return proc;
}
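
For orientation, here is a minimal, hypothetical sketch of how this builder might be invoked. The config file name, the attribute values, and the use of a no-arg BatchProcessBuilder constructor are illustrative assumptions, not taken from the snippet above; the runtime-attribute keys match the ones read by build(), and a key supplied at runtime takes precedence over the corresponding config-file value.

import java.io.File;
import java.util.HashMap;
import java.util.Map;

import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.tika.batch.BatchProcess;
import org.apache.tika.batch.builders.BatchProcessBuilder;
import org.w3c.dom.Document;

public class BatchProcessBuilderExample {
    public static void main(String[] args) throws Exception {
        // Parse a tika-batch config file into a DOM Document
        // (the file name here is illustrative).
        Document doc = DocumentBuilderFactory.newInstance()
                .newDocumentBuilder()
                .parse(new File("tika-batch-config.xml"));

        // Runtime attributes override or supplement values in the config file;
        // the keys match those read by build() above.
        Map<String, String> runtimeAttributes = new HashMap<>();
        runtimeAttributes.put("numConsumers", "4");
        runtimeAttributes.put("timeoutThresholdMillis", "300000");

        BatchProcess process = new BatchProcessBuilder()
                .build(doc.getDocumentElement(), runtimeAttributes);
        // BatchProcess is a Callable; submit it to an ExecutorService to run.
    }
}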
Use of org.apache.tika.batch.FileResourceCrawler in project tika by apache.
The class FSCrawlerBuilder, method build.
@Override
public FileResourceCrawler build(Node node, Map<String, String> runtimeAttributes,
                                 ArrayBlockingQueue<FileResource> queue) {
    Map<String, String> attributes = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);
    int numConsumers = BatchProcessBuilder.getNumConsumers(runtimeAttributes);
    Path inputDir = PropsUtil.getPath(attributes.get(INPUT_DIR_ATTR), Paths.get("input"));
    FileResourceCrawler crawler = null;
    if (attributes.containsKey("fileList")) {
        String randomCrawlString = attributes.get(CRAWL_ORDER);
        if (randomCrawlString != null) {
            //TODO: change to logger warn or throw RuntimeException?
            System.err.println("randomCrawl attribute is ignored by FSListCrawler");
        }
        Path fileList = PropsUtil.getPath(attributes.get("fileList"), null);
        String encodingString = PropsUtil.getString(attributes.get("fileListEncoding"), "UTF-8");
        try {
            Charset encoding = Charset.forName(encodingString);
            crawler = new FSListCrawler(queue, numConsumers, inputDir, fileList, encoding);
        } catch (FileNotFoundException e) {
            throw new RuntimeException("fileList file not found for FSListCrawler: " + fileList.toAbsolutePath());
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException("fileList encoding not supported: " + encodingString);
        } catch (IOException e) {
            throw new RuntimeException("IOException while trying to open fileList: " + e.getMessage(), e);
        }
    } else {
        FSDirectoryCrawler.CRAWL_ORDER crawlOrder = getCrawlOrder(attributes.get(CRAWL_ORDER));
        Path startDir = PropsUtil.getPath(attributes.get(INPUT_START_DIR_ATTR), null);
        if (startDir == null) {
            crawler = new FSDirectoryCrawler(queue, numConsumers, inputDir, crawlOrder);
        } else {
            crawler = new FSDirectoryCrawler(queue, numConsumers, inputDir, startDir, crawlOrder);
        }
    }
    crawler.setMaxFilesToConsider(PropsUtil.getInt(attributes.get(MAX_FILES_TO_CONSIDER_ATTR), -1));
    crawler.setMaxFilesToAdd(PropsUtil.getInt(attributes.get(MAX_FILES_TO_ADD_ATTR), -1));
    DocumentSelector selector = buildSelector(attributes);
    if (selector != null) {
        crawler.setDocumentSelector(selector);
    }
    //5 minutes
    crawler.setMaxConsecWaitInMillis(PropsUtil.getLong(attributes.get(MAX_CONSEC_WAIT_MILLIS), 300000L));
    return crawler;
}
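
As a companion sketch, an FSDirectoryCrawler can also be constructed directly, mirroring the four-argument constructor call in build() above. The queue capacity, consumer count, and setter values below are illustrative assumptions, and SORTED is assumed to be one of the CRAWL_ORDER constants.

import java.nio.file.Paths;
import java.util.concurrent.ArrayBlockingQueue;

import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.fs.FSDirectoryCrawler;

public class FSDirectoryCrawlerExample {
    public static void main(String[] args) {
        // Bounded queue shared by the crawler and the consumers;
        // the capacity of 1000 is illustrative.
        ArrayBlockingQueue<FileResource> queue = new ArrayBlockingQueue<>(1000);

        // Arguments mirror the constructor call in build() above:
        // queue, numConsumers, inputDir, crawlOrder.
        // SORTED is assumed here to be a valid CRAWL_ORDER constant.
        FSDirectoryCrawler crawler = new FSDirectoryCrawler(
                queue, 4, Paths.get("input"), FSDirectoryCrawler.CRAWL_ORDER.SORTED);

        // Same setters the builder applies above, with illustrative values.
        crawler.setMaxFilesToAdd(10000);
        crawler.setMaxConsecWaitInMillis(300000L); // 5 minutes

        // In a real batch run, the crawler executes on its own thread
        // alongside the FileResourceConsumers that drain the queue.
    }
}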