Search in sources :

Example 76 with ZkNodeProps

use of org.apache.solr.common.cloud.ZkNodeProps in project lucene-solr by apache.

the class OverseerCollectionMessageHandler method balanceProperty.

/**
 * Validates a BALANCESHARDUNIQUE request and offers it to the overseer state-update queue.
 *
 * <p>The queued payload is the JSON form of a {@code ZkNodeProps} built from the operation name
 * followed by every property of {@code message}.
 *
 * @param clusterState not read by this method
 * @param message      request properties; the collection and property entries must be non-blank
 * @param results      not read by this method
 * @throws SolrException        with {@code BAD_REQUEST} when the collection or property entry is blank
 * @throws KeeperException      declared for the queue operations
 * @throws InterruptedException declared for the queue operations
 */
private void balanceProperty(ClusterState clusterState, ZkNodeProps message, NamedList results) throws KeeperException, InterruptedException {
    final boolean collectionMissing = StringUtils.isBlank(message.getStr(COLLECTION_PROP));
    if (collectionMissing || StringUtils.isBlank(message.getStr(PROPERTY_PROP))) {
        throw new SolrException(ErrorCode.BAD_REQUEST, "The '" + COLLECTION_PROP + "' and '" + PROPERTY_PROP + "' parameters are required for the BALANCESHARDUNIQUE operation, no action taken");
    }

    final DistributedQueue stateUpdateQueue = Overseer.getStateUpdateQueue(zkStateReader.getZkClient());

    // The operation name goes in first; the message's own properties are copied afterwards
    // and therefore win on any key collision.
    final Map<String, Object> queuedProps = new HashMap<>();
    queuedProps.put(Overseer.QUEUE_OPERATION, BALANCESHARDUNIQUE.toLower());
    queuedProps.putAll(message.getProperties());

    final byte[] payload = Utils.toJSON(new ZkNodeProps(queuedProps));
    stateUpdateQueue.offer(payload);
}
Also used : HashMap(java.util.HashMap) ZkNodeProps(org.apache.solr.common.cloud.ZkNodeProps) SolrZkClient(org.apache.solr.common.cloud.SolrZkClient) RemoteSolrException(org.apache.solr.client.solrj.impl.HttpSolrClient.RemoteSolrException) SolrException(org.apache.solr.common.SolrException)

Example 77 with ZkNodeProps

use of org.apache.solr.common.cloud.ZkNodeProps in project lucene-solr by apache.

the class OverseerTaskProcessor method run.

/**
 * Main loop of the overseer task processor.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Ask {@code amILeader()} until the answer is no longer {@code DONT_KNOW}.</li>
 *   <li>Remember the id at the tail of the work queue; tasks up to that id may already have been
 *       processed, so their async ids are checked against the completed/failure maps.</li>
 *   <li>Prioritize overseer nodes and create the task thread pool.</li>
 *   <li>While not closed and still leader: clean up the work queue, collect blocked plus newly
 *       peeked tasks, lock each one, mark it as running and hand it to the pool.</li>
 * </ol>
 * {@code close()} is always called when the loop exits, whatever the reason.
 */
@Override
public void run() {
    log.debug("Process current queue of overseer operations");
    LeaderStatus isLeader = amILeader();
    // NOTE(review): this retries immediately with no pause between attempts — confirm that is intended.
    while (isLeader == LeaderStatus.DONT_KNOW) {
        log.debug("am_i_leader unclear {}", isLeader);
        // not a no, not a yes, try ask again
        isLeader = amILeader();
    }
    String oldestItemInWorkQueue = null;
    // hasLeftOverItems - used for avoiding re-execution of async tasks that were processed by a previous Overseer.
    // This variable is set in case there's any task found on the workQueue when the OCP starts up and
    // the id for the queue tail is used as a marker to check for the task in completed/failed map in zk.
    // Beyond the marker, all tasks can safely be assumed to have never been executed.
    boolean hasLeftOverItems = true;
    try {
        oldestItemInWorkQueue = workQueue.getTailId();
    } catch (KeeperException e) {
        // We don't need to handle this. This is just a fail-safe which comes in handy in skipping already processed
        // async calls.
        SolrException.log(log, "", e);
    } catch (InterruptedException e) {
        // Restore the interrupt flag; processing continues and later blocking calls will observe it.
        Thread.currentThread().interrupt();
    }
    if (oldestItemInWorkQueue == null)
        hasLeftOverItems = false;
    else
        log.debug("Found already existing elements in the work-queue. Last element: {}", oldestItemInWorkQueue);
    try {
        prioritizer.prioritizeOverseerNodes(myId);
    } catch (Exception e) {
        // Only report the failure when the ZK client is still open.
        if (!zkStateReader.getZkClient().isClosed()) {
            log.error("Unable to prioritize overseer ", e);
        }
    }
    // TODO: Make maxThreads configurable.
    this.tpe = new ExecutorUtil.MDCAwareThreadPoolExecutor(5, MAX_PARALLEL_TASKS, 0L, TimeUnit.MILLISECONDS, new SynchronousQueue<Runnable>(), new DefaultSolrThreadFactory("OverseerThreadFactory"));
    try {
        while (!this.isClosed) {
            try {
                isLeader = amILeader();
                if (LeaderStatus.NO == isLeader) {
                    break;
                } else if (LeaderStatus.YES != isLeader) {
                    log.debug("am_i_leader unclear {}", isLeader);
                    // not a no, not a yes, try asking again
                    continue;
                }
                log.debug("Cleaning up work-queue. #Running tasks: {}", runningTasks.size());
                cleanUpWorkQueue();
                printTrackingMaps();
                boolean waited = false;
                while (runningTasks.size() > MAX_PARALLEL_TASKS) {
                    synchronized (waitLock) {
                        //wait for 100 ms or till a task is complete
                        waitLock.wait(100);
                    }
                    waited = true;
                }
                // Tasks may have finished while we were waiting, so clean up once more.
                if (waited)
                    cleanUpWorkQueue();
                // Previously blocked tasks are retried first, ahead of anything newly read from the queue.
                ArrayList<QueueEvent> heads = new ArrayList<>(blockedTasks.size() + MAX_PARALLEL_TASKS);
                heads.addAll(blockedTasks.values());
                // to clear out at least a few items in the queue before we read more items
                if (heads.size() < MAX_BLOCKED_TASKS) {
                    //instead of reading MAX_PARALLEL_TASKS items always, we should only fetch as much as we can execute
                    int toFetch = Math.min(MAX_BLOCKED_TASKS - heads.size(), MAX_PARALLEL_TASKS - runningTasks.size());
                    List<QueueEvent> newTasks = workQueue.peekTopN(toFetch, excludedTasks, 2000L);
                    log.debug("Got {} tasks from work-queue : [{}]", newTasks.size(), newTasks);
                    heads.addAll(newTasks);
                } else {
                    // Prevent free-spinning this loop.
                    Thread.sleep(1000);
                }
                if (isClosed)
                    break;
                if (heads.isEmpty()) {
                    continue;
                }
                // clear it now; may get refilled below.
                blockedTasks.clear();
                taskBatch.batchId++;
                boolean tooManyTasks = false;
                for (QueueEvent head : heads) {
                    // Once the limit has been hit it stays hit for the remainder of this batch.
                    if (!tooManyTasks) {
                        synchronized (runningTasks) {
                            tooManyTasks = runningTasks.size() >= MAX_PARALLEL_TASKS;
                        }
                    }
                    if (tooManyTasks) {
                        // Too many tasks are running, just shove the rest into the "blocked" queue.
                        if (blockedTasks.size() < MAX_BLOCKED_TASKS)
                            blockedTasks.put(head.getId(), head);
                        continue;
                    }
                    // Skip tasks that are already tracked as running.
                    if (runningZKTasks.contains(head.getId()))
                        continue;
                    final ZkNodeProps message = ZkNodeProps.load(head.getBytes());
                    OverseerMessageHandler messageHandler = selector.selectOverseerMessageHandler(message);
                    final String asyncId = message.getStr(ASYNC);
                    if (hasLeftOverItems) {
                        // Reaching the startup tail marker ends the left-over window after this task.
                        if (head.getId().equals(oldestItemInWorkQueue))
                            hasLeftOverItems = false;
                        if (asyncId != null && (completedMap.contains(asyncId) || failureMap.contains(asyncId))) {
                            log.debug("Found already processed task in workQueue, cleaning up. AsyncId [{}]", asyncId);
                            workQueue.remove(head);
                            continue;
                        }
                    }
                    String operation = message.getStr(Overseer.QUEUE_OPERATION);
                    OverseerMessageHandler.Lock lock = messageHandler.lockTask(message, taskBatch);
                    if (lock == null) {
                        log.debug("Exclusivity check failed for [{}]", message.toString());
                        //we may end crossing the size of the MAX_BLOCKED_TASKS. They are fine
                        if (blockedTasks.size() < MAX_BLOCKED_TASKS)
                            blockedTasks.put(head.getId(), head);
                        continue;
                    }
                    try {
                        markTaskAsRunning(head, asyncId);
                        log.debug("Marked task [{}] as running", head.getId());
                    } catch (KeeperException.NodeExistsException e) {
                        lock.unlock();
                        // This should never happen
                        log.error("Tried to pick up task [{}] when it was already running!", head.getId());
                        continue;
                    } catch (InterruptedException e) {
                        lock.unlock();
                        // The message now carries a placeholder so the task id actually appears in the log.
                        log.error("Thread interrupted while trying to pick task [{}] for execution.", head.getId());
                        Thread.currentThread().interrupt();
                        continue;
                    }
                    // Parameterized so the message is only formatted when debug logging is enabled.
                    log.debug("{}: Get the message id:{} message:{}", messageHandler.getName(), head.getId(), message);
                    Runner runner = new Runner(messageHandler, message, operation, head, lock);
                    tpe.execute(runner);
                }
            } catch (KeeperException e) {
                // An expired session ends the loop; any other ZK error is logged and the loop continues.
                if (e.code() == KeeperException.Code.SESSIONEXPIRED) {
                    log.warn("Overseer cannot talk to ZK");
                    return;
                }
                SolrException.log(log, "", e);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                SolrException.log(log, "", e);
            }
        }
    } finally {
        this.close();
    }
}
Also used : ZkNodeProps(org.apache.solr.common.cloud.ZkNodeProps) ArrayList(java.util.ArrayList) DefaultSolrThreadFactory(org.apache.solr.util.DefaultSolrThreadFactory) LeaderStatus(org.apache.solr.cloud.Overseer.LeaderStatus) SolrException(org.apache.solr.common.SolrException) KeeperException(org.apache.zookeeper.KeeperException) ExecutorUtil(org.apache.solr.common.util.ExecutorUtil) SynchronousQueue(java.util.concurrent.SynchronousQueue) QueueEvent(org.apache.solr.cloud.OverseerTaskQueue.QueueEvent) KeeperException(org.apache.zookeeper.KeeperException)

Example 78 with ZkNodeProps

use of org.apache.solr.common.cloud.ZkNodeProps in project lucene-solr by apache.

the class OverseerTaskProcessor method amILeader.

/**
 * Reads the overseer leader node from ZooKeeper and compares its id with {@code myId}.
 *
 * <p>The call is timed under the stats name {@code collection_am_i_leader} and recorded as a
 * success unless a {@code KeeperException} or {@code InterruptedException} was caught.
 *
 * @return {@code YES} when the leader node's id equals {@code myId}; {@code DONT_KNOW} on a
 *         ZooKeeper connection loss; {@code NO} in every other case
 */
protected LeaderStatus amILeader() {
    final String statsName = "collection_am_i_leader";
    final Timer.Context timing = stats.time(statsName);
    boolean failed = false;
    try {
        final String leaderPath = Overseer.OVERSEER_ELECT + "/leader";
        final byte[] leaderData = zkStateReader.getZkClient().getData(leaderPath, null, null, true);
        final ZkNodeProps leaderProps = ZkNodeProps.load(leaderData);
        if (myId.equals(leaderProps.getStr(ID))) {
            return LeaderStatus.YES;
        }
    } catch (KeeperException e) {
        failed = true;
        final KeeperException.Code code = e.code();
        if (code == KeeperException.Code.CONNECTIONLOSS) {
            log.error("", e);
            return LeaderStatus.DONT_KNOW;
        }
        // Session expiry is not logged here; every other code is logged as a warning.
        if (code != KeeperException.Code.SESSIONEXPIRED) {
            log.warn("", e);
        }
    } catch (InterruptedException e) {
        failed = true;
        Thread.currentThread().interrupt();
    } finally {
        // Runs on every exit path, including the early returns above.
        timing.stop();
        if (failed) {
            stats.error(statsName);
        } else {
            stats.success(statsName);
        }
    }
    log.info("According to ZK I (id=" + myId + ") am no longer a leader.");
    return LeaderStatus.NO;
}
Also used : Timer(com.codahale.metrics.Timer) ZkNodeProps(org.apache.solr.common.cloud.ZkNodeProps) KeeperException(org.apache.zookeeper.KeeperException)

Example 79 with ZkNodeProps

use of org.apache.solr.common.cloud.ZkNodeProps in project lucene-solr by apache.

the class OverseerTaskQueue method containsTaskWithRequestId.

/**
 * Checks whether any queued task carries the given request id.
 *
 * <p>Every child of the queue directory whose name starts with {@code PREFIX} is loaded, and
 * the value stored under {@code requestIdKey} is compared with {@code requestId}. The queue
 * length statistic is updated with the number of children seen.
 *
 * @param requestIdKey property name under which a task stores its request id
 * @param requestId    the id to look for
 * @return {@code true} as soon as a matching task is found, {@code false} otherwise
 */
public boolean containsTaskWithRequestId(String requestIdKey, String requestId) throws KeeperException, InterruptedException {
    final List<String> children = zookeeper.getChildren(dir, null, true);
    stats.setQueueLength(children.size());
    for (String child : children) {
        if (child == null || !child.startsWith(PREFIX)) {
            continue;
        }
        try {
            final byte[] payload = zookeeper.getData(dir + "/" + child, null, null, true);
            if (payload == null) {
                continue;
            }
            final ZkNodeProps task = ZkNodeProps.load(payload);
            if (!task.containsKey(requestIdKey)) {
                continue;
            }
            final Object storedId = task.get(requestIdKey);
            LOG.debug(">>>> {}", storedId);
            if (storedId.equals(requestId)) {
                return true;
            }
        } catch (KeeperException.NoNodeException e) {
            // The node was removed by another client before we could read it; move on to the next one.
        }
    }
    return false;
}
Also used : ZkNodeProps(org.apache.solr.common.cloud.ZkNodeProps) KeeperException(org.apache.zookeeper.KeeperException)

Example 80 with ZkNodeProps

use of org.apache.solr.common.cloud.ZkNodeProps in project lucene-solr by apache.

the class DeleteNodeCmd method cleanupReplicas.

/**
 * Issues a replica deletion for each entry of {@code sourceReplicas} and then waits up to five
 * minutes for the completion callbacks to fire.
 *
 * <p>Each deletion is submitted with {@code parallel=true} and, when {@code async} is non-null,
 * with the async id attached. A deletion whose result contains a {@code "failure"} entry adds a
 * {@code "failure"} message to {@code results}.
 *
 * @param results        receives one failure entry per replica whose deletion reported a failure
 * @param clusterState   passed through to the delete-replica command
 * @param sourceReplicas the replicas to delete
 * @param ocmh           handler whose command map supplies the delete-replica command
 * @param node           node name, used in log and failure messages only
 * @param async          optional async id; may be {@code null}
 * @throws InterruptedException if a deletion or the final wait is interrupted
 */
static void cleanupReplicas(NamedList results, ClusterState clusterState, List<ZkNodeProps> sourceReplicas, OverseerCollectionMessageHandler ocmh, String node, String async) throws InterruptedException {
    final CountDownLatch pendingDeletes = new CountDownLatch(sourceReplicas.size());
    for (ZkNodeProps replica : sourceReplicas) {
        final String coll = replica.getStr(COLLECTION_PROP);
        final String shard = replica.getStr(SHARD_ID_PROP);
        log.info("Deleting replica for collection={} shard={} on node={}", coll, shard, node);
        final NamedList deleteResult = new NamedList();
        try {
            final ZkNodeProps withAsync = (async == null) ? replica : replica.plus(ASYNC, async);
            final DeleteReplicaCmd deleteCmd = (DeleteReplicaCmd) ocmh.commandMap.get(DELETEREPLICA);
            final ZkNodeProps deleteMessage = withAsync.plus("parallel", "true");
            deleteCmd.deleteReplica(clusterState, deleteMessage, deleteResult, () -> {
                pendingDeletes.countDown();
                if (deleteResult.get("failure") == null) {
                    return;
                }
                synchronized (results) {
                    results.add("failure", String.format(Locale.ROOT, "Failed to delete replica for collection=%s shard=%s" + " on node=%s", coll, shard, node));
                }
            });
        } catch (KeeperException e) {
            // A ZooKeeper failure is logged and counted as finished; the remaining replicas are still processed.
            log.warn("Error deleting ", e);
            pendingDeletes.countDown();
        } catch (Exception e) {
            // Any other failure is logged, counted as finished, and then propagated to the caller.
            log.warn("Error deleting ", e);
            pendingDeletes.countDown();
            throw e;
        }
    }
    log.debug("Waiting for delete node action to complete");
    pendingDeletes.await(5, TimeUnit.MINUTES);
}
Also used : NamedList(org.apache.solr.common.util.NamedList) ZkNodeProps(org.apache.solr.common.cloud.ZkNodeProps) CountDownLatch(java.util.concurrent.CountDownLatch) KeeperException(org.apache.zookeeper.KeeperException) KeeperException(org.apache.zookeeper.KeeperException) SolrException(org.apache.solr.common.SolrException)

Aggregations

ZkNodeProps (org.apache.solr.common.cloud.ZkNodeProps)91 SolrException (org.apache.solr.common.SolrException)35 HashMap (java.util.HashMap)28 Replica (org.apache.solr.common.cloud.Replica)22 ZkStateReader (org.apache.solr.common.cloud.ZkStateReader)20 ArrayList (java.util.ArrayList)19 Slice (org.apache.solr.common.cloud.Slice)19 KeeperException (org.apache.zookeeper.KeeperException)19 ModifiableSolrParams (org.apache.solr.common.params.ModifiableSolrParams)16 Test (org.junit.Test)16 DocCollection (org.apache.solr.common.cloud.DocCollection)15 SolrZkClient (org.apache.solr.common.cloud.SolrZkClient)14 Map (java.util.Map)13 ClusterState (org.apache.solr.common.cloud.ClusterState)13 IOException (java.io.IOException)10 ZkCoreNodeProps (org.apache.solr.common.cloud.ZkCoreNodeProps)10 ZooKeeperException (org.apache.solr.common.cloud.ZooKeeperException)10 NamedList (org.apache.solr.common.util.NamedList)10 HttpSolrClient (org.apache.solr.client.solrj.impl.HttpSolrClient)9 SolrCore (org.apache.solr.core.SolrCore)8