Use of org.apache.tika.batch.ParallelFileProcessingResult in the Apache Tika project.
Class HandlerBuilderTest, method testText.
/**
 * Runs the batch process from {@code /tika-batch-config-test.xml} with
 * {@code basicHandlerType=txt} and checks that {@code test0.xml.txt} holds the
 * expected text but neither the XHTML root element nor an XML declaration.
 */
@Test
public void testText() throws Exception {
    final Path outDir = getNewOutputDir("handler-txt-");

    final Map<String, String> params = getDefaultArgs("basic", outDir);
    params.put("basicHandlerType", "txt");

    final BatchProcess batch = getNewBatchRunner("/tika-batch-config-test.xml", params);
    ParallelFileProcessingResult processingResult = run(batch);

    final String content = readFileToString(outDir.resolve("test0.xml.txt"), UTF_8);

    // Plain-text output must carry no markup...
    final String htmlRoot = "<html xmlns=\"http://www.w3.org/1999/xhtml\">";
    final String xmlDeclaration = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
    assertFalse(content.contains(htmlRoot));
    assertFalse(content.contains(xmlDeclaration));
    // ...but must still contain the body text of the test file.
    assertTrue(content.contains("This is tika-batch's first test file"));
}
Use of org.apache.tika.batch.ParallelFileProcessingResult in the Apache Tika project.
Class HandlerBuilderTest, method testXMLWithWriteLimit.
/**
 * Runs the batch process from {@code /tika-batch-config-test.xml} with
 * {@code writeLimit=5} and checks that {@code test0.xml.xml} is empty.
 */
@Test
public void testXMLWithWriteLimit() throws Exception {
    final Path outDir = getNewOutputDir("handler-xml-write-limit-");

    final Map<String, String> params = getDefaultArgs("basic", outDir);
    params.put("writeLimit", "5");

    final BatchProcess batch = getNewBatchRunner("/tika-batch-config-test.xml", params);
    ParallelFileProcessingResult processingResult = run(batch);

    final String content = readFileToString(outDir.resolve("test0.xml.xml"), UTF_8);

    // Not ideal: it would be better if handlers wrote out whatever they had
    // received so far, up to the writeLimit, instead of producing nothing.
    assertTrue(content.isEmpty());
}
Use of org.apache.tika.batch.ParallelFileProcessingResult in the Apache Tika project.
Class HandlerBuilderTest, method testHTML.
/**
 * Runs the batch process from {@code /tika-batch-config-test.xml} with
 * {@code basicHandlerType=html} and checks that {@code test0.xml.html} holds the
 * XHTML root element and the expected text, but no XML declaration.
 */
@Test
public void testHTML() throws Exception {
    final Path outDir = getNewOutputDir("handler-html-");

    final Map<String, String> params = getDefaultArgs("basic", outDir);
    params.put("basicHandlerType", "html");

    final BatchProcess batch = getNewBatchRunner("/tika-batch-config-test.xml", params);
    ParallelFileProcessingResult processingResult = run(batch);

    final String content = readFileToString(outDir.resolve("test0.xml.html"), UTF_8);

    // HTML output starts from the XHTML root element, without an XML declaration.
    final String htmlRoot = "<html xmlns=\"http://www.w3.org/1999/xhtml\">";
    final String xmlDeclaration = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
    assertTrue(content.contains(htmlRoot));
    assertFalse(content.contains(xmlDeclaration));
    assertTrue(content.contains("This is tika-batch's first test file"));
}
Use of org.apache.tika.batch.ParallelFileProcessingResult in the Apache Tika project.
Class FSBatchProcessCLI, method execute.
/**
 * Parses the command line, builds a {@link BatchProcess} from the config stream and the
 * parsed options, runs it on a single-thread executor, prints the result, and exits the
 * JVM with the result's exit status.
 *
 * <p>If the {@code help} option is present, usage is printed and the JVM exits with
 * {@code BatchProcessDriverCLI.PROCESS_NO_RESTART_EXIT_CODE}.
 *
 * @param args raw command-line arguments
 * @throws Exception if parsing the arguments, building the process, or running it fails
 */
private void execute(String[] args) throws Exception {
    CommandLineParser cliParser = new DefaultParser();
    CommandLine line = cliParser.parse(options, args);
    if (line.hasOption("help")) {
        usage();
        System.exit(BatchProcessDriverCLI.PROCESS_NO_RESTART_EXIT_CODE);
    }

    Map<String, String> mapArgs = new HashMap<String, String>();
    for (Option option : line.getOptions()) {
        String v = option.getValue();
        // An option with a missing or empty value is stored as "true".
        if (v == null || v.equals("")) {
            v = "true";
        }
        mapArgs.put(option.getOpt(), v);
    }

    BatchProcessBuilder b = new BatchProcessBuilder();
    TikaInputStream is = null;
    BatchProcess process = null;
    try {
        is = getConfigInputStream(args, false);
        process = b.build(is, mapArgs);
    } finally {
        // The config stream is only needed while building the process.
        IOUtils.closeQuietly(is);
    }

    ExecutorService executor = Executors.newSingleThreadExecutor();
    ParallelFileProcessingResult result;
    try {
        Future<ParallelFileProcessingResult> futureResult = executor.submit(process);
        result = futureResult.get();
    } finally {
        // Always release the worker thread: if get() throws, a lingering
        // non-daemon executor thread could otherwise keep the JVM alive.
        executor.shutdownNow();
    }

    System.out.println(FINISHED_STRING);
    System.out.println("\n");
    System.out.println(result.toString());
    System.exit(result.getExitStatus());
}
Aggregations