
Example 1 with FileResourceCrawler

Use of org.apache.tika.batch.FileResourceCrawler in the Apache Tika project.

From class BatchProcessBuilder, the build method:

/**
 * Builds a BatchProcess from runtime arguments and a
 * document node of a configuration file.  With the exception of the QueueBuilder,
 * the builders choose how to adjudicate between
 * runtime arguments and the elements in the configuration file.
 *
 * @param docElement                document element of the xml config file
 * @param incomingRuntimeAttributes runtime arguments
 * @return BatchProcess
 */
public BatchProcess build(Node docElement, Map<String, String> incomingRuntimeAttributes) {
    //key components
    long timeoutThresholdMillis = XMLDOMUtil.getLong("timeoutThresholdMillis", incomingRuntimeAttributes, docElement);
    long timeoutCheckPulseMillis = XMLDOMUtil.getLong("timeoutCheckPulseMillis", incomingRuntimeAttributes, docElement);
    long pauseOnEarlyTerminationMillis = XMLDOMUtil.getLong("pauseOnEarlyTerminationMillis", incomingRuntimeAttributes, docElement);
    int maxAliveTimeSeconds = XMLDOMUtil.getInt("maxAliveTimeSeconds", incomingRuntimeAttributes, docElement);
    FileResourceCrawler crawler = null;
    ConsumersManager consumersManager = null;
    StatusReporter reporter = null;
    Interrupter interrupter = null;
    /*
     * TODO: This is a bit smelly.  NumConsumers needs to be used by the crawler
     * and the consumers.  This copies the incomingRuntimeAttributes and then
     * supplies the numConsumers from the commandline (if it exists) or from the
     * config file.  At least this creates an unmodifiable defensive copy of
     * incomingRuntimeAttributes...
     */
    Map<String, String> runtimeAttributes = setNumConsumersInRuntimeAttributes(docElement, incomingRuntimeAttributes);
    //build queue
    ArrayBlockingQueue<FileResource> queue = buildQueue(docElement, runtimeAttributes);
    NodeList children = docElement.getChildNodes();
    Map<String, Node> keyNodes = new HashMap<String, Node>();
    for (int i = 0; i < children.getLength(); i++) {
        Node child = children.item(i);
        if (child.getNodeType() != Node.ELEMENT_NODE) {
            continue;
        }
        String nodeName = child.getNodeName();
        keyNodes.put(nodeName, child);
    }
    //build consumers
    consumersManager = buildConsumersManager(keyNodes.get("consumers"), runtimeAttributes, queue);
    //build crawler
    crawler = buildCrawler(queue, keyNodes.get("crawler"), runtimeAttributes);
    reporter = buildReporter(crawler, consumersManager, keyNodes.get("reporter"), runtimeAttributes);
    interrupter = buildInterrupter(keyNodes.get("interrupter"), runtimeAttributes);
    BatchProcess proc = new BatchProcess(crawler, consumersManager, reporter, interrupter);
    if (timeoutThresholdMillis > -1) {
        proc.setTimeoutThresholdMillis(timeoutThresholdMillis);
    }
    if (pauseOnEarlyTerminationMillis > -1) {
        proc.setPauseOnEarlyTerminationMillis(pauseOnEarlyTerminationMillis);
    }
    if (timeoutCheckPulseMillis > -1) {
        proc.setTimeoutCheckPulseMillis(timeoutCheckPulseMillis);
    }
    proc.setMaxAliveTimeSeconds(maxAliveTimeSeconds);
    return proc;
}
Also used: Interrupter (org.apache.tika.batch.Interrupter), FileResourceCrawler (org.apache.tika.batch.FileResourceCrawler), HashMap (java.util.HashMap), NodeList (org.w3c.dom.NodeList), Node (org.w3c.dom.Node), BatchProcess (org.apache.tika.batch.BatchProcess), FileResource (org.apache.tika.batch.FileResource), ConsumersManager (org.apache.tika.batch.ConsumersManager), StatusReporter (org.apache.tika.batch.StatusReporter)
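
For context, a minimal sketch of how this builder might be driven, assuming BatchProcessBuilder's no-arg constructor and the build(Node, Map) overload shown above; the config file name and the runtime attribute values are hypothetical:

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
import javax.xml.parsers.DocumentBuilderFactory;
import org.apache.tika.batch.BatchProcess;
import org.apache.tika.batch.builders.BatchProcessBuilder;
import org.w3c.dom.Document;
import org.w3c.dom.Node;

public class BuildBatchProcessSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical path to a tika-batch XML config file.
        try (InputStream is = Files.newInputStream(Paths.get("batch-config.xml"))) {
            Document doc = DocumentBuilderFactory.newInstance()
                    .newDocumentBuilder().parse(is);
            Node docElement = doc.getDocumentElement();

            // Runtime arguments take precedence over most config-file values.
            Map<String, String> runtimeAttributes = new HashMap<>();
            runtimeAttributes.put("numConsumers", "4");
            runtimeAttributes.put("timeoutThresholdMillis", "300000");

            BatchProcess process = new BatchProcessBuilder()
                    .build(docElement, runtimeAttributes);
            // BatchProcess is a Callable; it is normally submitted to an
            // ExecutorService rather than invoked directly.
        }
    }
}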

Example 2 with FileResourceCrawler

Use of org.apache.tika.batch.FileResourceCrawler in the Apache Tika project.

From class FSCrawlerBuilder, the build method:

@Override
public FileResourceCrawler build(Node node, Map<String, String> runtimeAttributes, ArrayBlockingQueue<FileResource> queue) {
    Map<String, String> attributes = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);
    int numConsumers = BatchProcessBuilder.getNumConsumers(runtimeAttributes);
    Path inputDir = PropsUtil.getPath(attributes.get(INPUT_DIR_ATTR), Paths.get("input"));
    FileResourceCrawler crawler = null;
    if (attributes.containsKey("fileList")) {
        String randomCrawlString = attributes.get(CRAWL_ORDER);
        if (randomCrawlString != null) {
            //TODO: change to logger warn or throw RuntimeException?
            System.err.println("randomCrawl attribute is ignored by FSListCrawler");
        }
        Path fileList = PropsUtil.getPath(attributes.get("fileList"), null);
        String encodingString = PropsUtil.getString(attributes.get("fileListEncoding"), "UTF-8");
        try {
            Charset encoding = Charset.forName(encodingString);
            crawler = new FSListCrawler(queue, numConsumers, inputDir, fileList, encoding);
        } catch (FileNotFoundException e) {
            throw new RuntimeException("fileList file not found for FSListCrawler: " + fileList.toAbsolutePath());
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException("fileList encoding not supported: " + encodingString);
        } catch (IOException e) {
            throw new RuntimeException("IOException while trying to open fileList: " + e.getMessage(), e);
        }
    } else {
        FSDirectoryCrawler.CRAWL_ORDER crawlOrder = getCrawlOrder(attributes.get(CRAWL_ORDER));
        Path startDir = PropsUtil.getPath(attributes.get(INPUT_START_DIR_ATTR), null);
        if (startDir == null) {
            crawler = new FSDirectoryCrawler(queue, numConsumers, inputDir, crawlOrder);
        } else {
            crawler = new FSDirectoryCrawler(queue, numConsumers, inputDir, startDir, crawlOrder);
        }
    }
    crawler.setMaxFilesToConsider(PropsUtil.getInt(attributes.get(MAX_FILES_TO_CONSIDER_ATTR), -1));
    crawler.setMaxFilesToAdd(PropsUtil.getInt(attributes.get(MAX_FILES_TO_ADD_ATTR), -1));
    DocumentSelector selector = buildSelector(attributes);
    if (selector != null) {
        crawler.setDocumentSelector(selector);
    }
    //5 minutes
    crawler.setMaxConsecWaitInMillis(PropsUtil.getLong(attributes.get(MAX_CONSEC_WAIT_MILLIS), 300000L));
    return crawler;
}
Also used: Path (java.nio.file.Path), FSDirectoryCrawler (org.apache.tika.batch.fs.FSDirectoryCrawler), DocumentSelector (org.apache.tika.extractor.DocumentSelector), FSDocumentSelector (org.apache.tika.batch.fs.FSDocumentSelector), FileResourceCrawler (org.apache.tika.batch.FileResourceCrawler), FSListCrawler (org.apache.tika.batch.fs.FSListCrawler), FileNotFoundException (java.io.FileNotFoundException), Charset (java.nio.charset.Charset), UnsupportedEncodingException (java.io.UnsupportedEncodingException), IOException (java.io.IOException)
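
Similarly, a rough sketch of exercising FSCrawlerBuilder directly against a parsed crawler element. The attribute names inputDir and crawlOrder, the runtime-attribute key numConsumers, and the no-arg FSCrawlerBuilder constructor are assumptions based on the tika-batch FS config conventions; the queue capacity and directory name are hypothetical:

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import javax.xml.parsers.DocumentBuilderFactory;
import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.FileResourceCrawler;
import org.apache.tika.batch.fs.builders.FSCrawlerBuilder;
import org.w3c.dom.Node;

public class BuildCrawlerSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical crawler element; with no fileList attribute, the
        // builder above falls back to an FSDirectoryCrawler.
        String xml = "<crawler inputDir=\"input\" crawlOrder=\"random\"/>";
        Node crawlerNode = DocumentBuilderFactory.newInstance()
                .newDocumentBuilder()
                .parse(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)))
                .getDocumentElement();

        // numConsumers is read back out via BatchProcessBuilder.getNumConsumers
        // in the build method above (assumed key name).
        Map<String, String> runtimeAttributes = new HashMap<>();
        runtimeAttributes.put("numConsumers", "4");

        ArrayBlockingQueue<FileResource> queue = new ArrayBlockingQueue<>(1000);
        FileResourceCrawler crawler =
                new FSCrawlerBuilder().build(crawlerNode, runtimeAttributes, queue);
        // In tika-batch the crawler is driven by BatchProcess, which runs it
        // on its own thread to fill the queue for the consumers.
    }
}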

Aggregations

FileResourceCrawler (org.apache.tika.batch.FileResourceCrawler): 2
FileNotFoundException (java.io.FileNotFoundException): 1
IOException (java.io.IOException): 1
UnsupportedEncodingException (java.io.UnsupportedEncodingException): 1
Charset (java.nio.charset.Charset): 1
Path (java.nio.file.Path): 1
HashMap (java.util.HashMap): 1
BatchProcess (org.apache.tika.batch.BatchProcess): 1
ConsumersManager (org.apache.tika.batch.ConsumersManager): 1
FileResource (org.apache.tika.batch.FileResource): 1
Interrupter (org.apache.tika.batch.Interrupter): 1
StatusReporter (org.apache.tika.batch.StatusReporter): 1
FSDirectoryCrawler (org.apache.tika.batch.fs.FSDirectoryCrawler): 1
FSDocumentSelector (org.apache.tika.batch.fs.FSDocumentSelector): 1
FSListCrawler (org.apache.tika.batch.fs.FSListCrawler): 1
DocumentSelector (org.apache.tika.extractor.DocumentSelector): 1
Node (org.w3c.dom.Node): 1
NodeList (org.w3c.dom.NodeList): 1