use of org.apache.tika.batch.BatchProcess in project tika by apache.
the class BatchProcessBuilder method build.
/**
 * Assembles a {@link BatchProcess} from the document element of an xml
 * configuration file together with a map of runtime arguments.
 * <p>
 * Apart from the QueueBuilder, each builder decides for itself how to
 * reconcile a runtime argument with the matching element of the
 * configuration file.
 * <p>
 * The three millisecond settings (timeoutThresholdMillis,
 * timeoutCheckPulseMillis, pauseOnEarlyTerminationMillis) are applied to the
 * process only when the looked-up value is greater than -1;
 * maxAliveTimeSeconds is always applied.
 *
 * @param docElement document element of the xml config file
 * @param incomingRuntimeAttributes runtime arguments
 * @return the configured BatchProcess
 */
public BatchProcess build(Node docElement, Map<String, String> incomingRuntimeAttributes) {
    // process-level settings, resolved from the runtime arguments and the config document
    long timeoutThresholdMillis =
            XMLDOMUtil.getLong("timeoutThresholdMillis", incomingRuntimeAttributes, docElement);
    long timeoutCheckPulseMillis =
            XMLDOMUtil.getLong("timeoutCheckPulseMillis", incomingRuntimeAttributes, docElement);
    long pauseOnEarlyTerminationMillis =
            XMLDOMUtil.getLong("pauseOnEarlyTerminationMillis", incomingRuntimeAttributes, docElement);
    int maxAliveTimeSeconds =
            XMLDOMUtil.getInt("maxAliveTimeSeconds", incomingRuntimeAttributes, docElement);

    /*
     * TODO: still a bit smelly. Both the crawler and the consumers need
     * numConsumers, so the incoming attributes are copied and numConsumers is
     * filled in from the commandline (if present) or from the config file.
     * On the plus side, the result is an unmodifiable defensive copy of
     * incomingRuntimeAttributes.
     */
    Map<String, String> runtimeAttributes =
            setNumConsumersInRuntimeAttributes(docElement, incomingRuntimeAttributes);

    // the queue shared by the crawler and the consumers
    ArrayBlockingQueue<FileResource> queue = buildQueue(docElement, runtimeAttributes);

    // index the element children of the document element by node name;
    // if a name occurs more than once, the last occurrence wins
    NodeList kids = docElement.getChildNodes();
    Map<String, Node> elementsByName = new HashMap<String, Node>();
    for (int idx = 0; idx < kids.getLength(); idx++) {
        Node kid = kids.item(idx);
        if (kid.getNodeType() == Node.ELEMENT_NODE) {
            elementsByName.put(kid.getNodeName(), kid);
        }
    }

    // components are built in this order: consumers, crawler, reporter, interrupter
    ConsumersManager consumersManager =
            buildConsumersManager(elementsByName.get("consumers"), runtimeAttributes, queue);
    FileResourceCrawler crawler =
            buildCrawler(queue, elementsByName.get("crawler"), runtimeAttributes);
    StatusReporter reporter =
            buildReporter(crawler, consumersManager, elementsByName.get("reporter"), runtimeAttributes);
    Interrupter interrupter =
            buildInterrupter(elementsByName.get("interrupter"), runtimeAttributes);

    BatchProcess process = new BatchProcess(crawler, consumersManager, reporter, interrupter);

    // a value of -1 or lower means "not specified": leave the process's own setting alone
    if (timeoutThresholdMillis > -1) {
        process.setTimeoutThresholdMillis(timeoutThresholdMillis);
    }
    if (pauseOnEarlyTerminationMillis > -1) {
        process.setPauseOnEarlyTerminationMillis(pauseOnEarlyTerminationMillis);
    }
    if (timeoutCheckPulseMillis > -1) {
        process.setTimeoutCheckPulseMillis(timeoutCheckPulseMillis);
    }
    process.setMaxAliveTimeSeconds(maxAliveTimeSeconds);
    return process;
}
use of org.apache.tika.batch.BatchProcess in project tika by apache.
the class OutputStreamFactoryTest method testSkip.
/**
 * Runs the batch twice into the same output directory with
 * handleExisting=skip, using a fresh runner each time, and checks that the
 * directory holds exactly one child after each run.
 */
@Test
public void testSkip() throws Exception {
    Path outDir = getNewOutputDir("os-factory-skip-");
    Map<String, String> runtimeArgs = getDefaultArgs("basic", outDir);
    runtimeArgs.put("handleExisting", "skip");

    // two passes over the same output directory, each with a new runner
    for (int pass = 0; pass < 2; pass++) {
        run(getNewBatchRunner("/tika-batch-config-test.xml", runtimeArgs));
        assertEquals(1, countChildren(outDir));
    }
}
use of org.apache.tika.batch.BatchProcess in project tika by apache.
the class OutputStreamFactoryTest method testIllegalState.
/**
 * Runs the same BatchProcess instance twice. The first run must leave one
 * child in the output directory; the second run must fail with an
 * ExecutionException whose cause is an IllegalStateException.
 */
@Test
public void testIllegalState() throws Exception {
    Path outDir = getNewOutputDir("os-factory-illegal-state-");
    Map<String, String> runtimeArgs = getDefaultArgs("basic", outDir);
    BatchProcess process = getNewBatchRunner("/tika-batch-config-test.xml", runtimeArgs);

    // first run
    run(process);
    assertEquals(1, countChildren(outDir));

    // second run on the very same instance
    boolean sawIllegalState = false;
    try {
        run(process);
    } catch (ExecutionException e) {
        sawIllegalState = e.getCause() instanceof IllegalStateException;
    }
    assertTrue("Should have been an illegal state exception", sawIllegalState);
}
use of org.apache.tika.batch.BatchProcess in project tika by apache.
the class FSBatchTestBase method getNewBatchRunner.
/**
 * Builds a new BatchProcess from a configuration resource looked up relative
 * to this class, combined with the given runtime arguments.
 * <p>
 * The resource stream is closed quietly whether or not the build succeeds,
 * so a failing build does not leave the stream open.
 *
 * @param testConfig resource name of the batch config, passed to
 *                   {@link Class#getResourceAsStream(String)}
 * @param args runtime arguments handed to the builder
 * @return the BatchProcess built from the config and the arguments
 * @throws IOException declared for callers; not thrown directly by this method
 */
BatchProcess getNewBatchRunner(String testConfig, Map<String, String> args) throws IOException {
    InputStream is = this.getClass().getResourceAsStream(testConfig);
    try {
        BatchProcessBuilder b = new BatchProcessBuilder();
        return b.build(is, args);
    } finally {
        // runs on the exception path too; previously a throwing build() skipped the close
        IOUtils.closeQuietly(is);
    }
}
use of org.apache.tika.batch.BatchProcess in project tika by apache.
the class HandlerBuilderTest method testXML.
/**
 * Runs the batch with basicHandlerType=xml and checks that the output file
 * test0.xml.xml contains the XHTML root element, the XML declaration and the
 * text of the test document.
 */
@Test
public void testXML() throws Exception {
    Path outDir = getNewOutputDir("handler-xml-");
    Map<String, String> runtimeArgs = getDefaultArgs("basic", outDir);
    runtimeArgs.put("basicHandlerType", "xml");

    run(getNewBatchRunner("/tika-batch-config-test.xml", runtimeArgs));

    String xml = readFileToString(outDir.resolve("test0.xml.xml"), UTF_8);
    // checked in this order; the first missing fragment fails the test
    String[] expectedFragments = {
        "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>",
        "This is tika-batch's first test file"
    };
    for (String fragment : expectedFragments) {
        assertTrue(xml.contains(fragment));
    }
}
Aggregations