Use of org.apache.tika.extractor.DocumentSelector in project tika by apache: the build method of the FSCrawlerBuilder class.
@Override
public FileResourceCrawler build(Node node, Map<String, String> runtimeAttributes,
                                 ArrayBlockingQueue<FileResource> queue) {
    Map<String, String> attributes = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);

    int numConsumers = BatchProcessBuilder.getNumConsumers(runtimeAttributes);
    Path inputDir = PropsUtil.getPath(attributes.get(INPUT_DIR_ATTR), Paths.get("input"));
    FileResourceCrawler crawler = null;
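    // A "fileList" attribute means only the paths named in that list file are crawled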
    if (attributes.containsKey("fileList")) {
        String randomCrawlString = attributes.get(CRAWL_ORDER);
        if (randomCrawlString != null) {
            //TODO: change to logger warn or throw RuntimeException?
            System.err.println("randomCrawl attribute is ignored by FSListCrawler");
        }
        Path fileList = PropsUtil.getPath(attributes.get("fileList"), null);
        String encodingString = PropsUtil.getString(attributes.get("fileListEncoding"), "UTF-8");
        try {
            Charset encoding = Charset.forName(encodingString);
            crawler = new FSListCrawler(queue, numConsumers, inputDir, fileList, encoding);
        } catch (FileNotFoundException e) {
            throw new RuntimeException("fileList file not found for FSListCrawler: " + fileList.toAbsolutePath());
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException("fileList encoding not supported: " + encodingString);
        } catch (IOException e) {
            throw new RuntimeException("IOException while trying to open fileList: " + e.getMessage(), e);
        }
    } else {
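        // No fileList: recursively crawl inputDir, optionally starting at startDir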
        FSDirectoryCrawler.CRAWL_ORDER crawlOrder = getCrawlOrder(attributes.get(CRAWL_ORDER));
        Path startDir = PropsUtil.getPath(attributes.get(INPUT_START_DIR_ATTR), null);
        if (startDir == null) {
            crawler = new FSDirectoryCrawler(queue, numConsumers, inputDir, crawlOrder);
        } else {
            crawler = new FSDirectoryCrawler(queue, numConsumers, inputDir, startDir, crawlOrder);
        }
    }
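    // The limits below and the optional DocumentSelector apply to whichever crawler was built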
    crawler.setMaxFilesToConsider(PropsUtil.getInt(attributes.get(MAX_FILES_TO_CONSIDER_ATTR), -1));
    crawler.setMaxFilesToAdd(PropsUtil.getInt(attributes.get(MAX_FILES_TO_ADD_ATTR), -1));

    DocumentSelector selector = buildSelector(attributes);
    if (selector != null) {
        crawler.setDocumentSelector(selector);
    }

    crawler.setMaxConsecWaitInMillis(PropsUtil.getLong(attributes.get(MAX_CONSEC_WAIT_MILLIS), 300000L)); //5 minutes
    return crawler;
}
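For context, DocumentSelector is a single-method callback (boolean select(Metadata metadata)) that lets the crawler decide whether a document should be queued for processing. Below is a minimal sketch of such a selector that keeps only files with certain extensions; the class name, the extension list, and the choice of reading the resource name from the metadata are illustrative assumptions, not part of the Tika code above.

import java.util.Locale;

import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.metadata.Metadata;

// Hypothetical example: select a document only if its resource name ends in .pdf or .docx
public class ExtensionDocumentSelector implements DocumentSelector {

    @Override
    public boolean select(Metadata metadata) {
        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (name == null) {
            // No resource name available; keep the document rather than silently dropping it
            return true;
        }
        String lower = name.toLowerCase(Locale.ROOT);
        return lower.endsWith(".pdf") || lower.endsWith(".docx");
    }
}

In the build method above, buildSelector(attributes) constructs the selector from the crawler configuration; a selector like the sketch here could equally be attached by calling crawler.setDocumentSelector(...) directly.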