
Example 51 with KeeperException

Use of org.apache.zookeeper.KeeperException in project lucene-solr by apache.

Class ZkController, method registerAllCoresAsDown.

private void registerAllCoresAsDown(final CurrentCoreDescriptorProvider registerOnReconnect, boolean updateLastPublished) {
    List<CoreDescriptor> descriptors = registerOnReconnect.getCurrentDescriptors();
    if (isClosed)
        return;
    if (descriptors != null) {
        // before registering as live, make sure everyone is in a
        // down state
        publishNodeAsDown(getNodeName());
        for (CoreDescriptor descriptor : descriptors) {
            // if it looks like we are going to be the leader, we don't
            // want to wait for the following stuff
            CloudDescriptor cloudDesc = descriptor.getCloudDescriptor();
            String collection = cloudDesc.getCollectionName();
            String slice = cloudDesc.getShardId();
            try {
                int children = zkStateReader.getZkClient().getChildren(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/leader_elect/" + slice + "/election", null, true).size();
                if (children == 0) {
                    log.debug("looks like we are going to be the leader for collection {} shard {}", collection, slice);
                    continue;
                }
            } catch (NoNodeException e) {
                log.debug("looks like we are going to be the leader for collection {} shard {}", collection, slice);
                continue;
            } catch (InterruptedException e2) {
                Thread.currentThread().interrupt();
            } catch (KeeperException e) {
                log.warn("", e);
                Thread.currentThread().interrupt();
            }
            final String coreZkNodeName = descriptor.getCloudDescriptor().getCoreNodeName();
            try {
                log.debug("calling waitForLeaderToSeeDownState for coreZkNodeName={} collection={} shard={}", new Object[] { coreZkNodeName, collection, slice });
                waitForLeaderToSeeDownState(descriptor, coreZkNodeName);
            } catch (Exception e) {
                SolrException.log(log, "", e);
                if (isClosed) {
                    return;
                }
            }
        }
    }
}
Also used: NoNodeException (org.apache.zookeeper.KeeperException.NoNodeException), CoreDescriptor (org.apache.solr.core.CoreDescriptor), ZooKeeperException (org.apache.solr.common.cloud.ZooKeeperException), KeeperException (org.apache.zookeeper.KeeperException), TimeoutException (java.util.concurrent.TimeoutException), SolrException (org.apache.solr.common.SolrException), UnsupportedEncodingException (java.io.UnsupportedEncodingException), SessionExpiredException (org.apache.zookeeper.KeeperException.SessionExpiredException), ConnectionLossException (org.apache.zookeeper.KeeperException.ConnectionLossException), IOException (java.io.IOException), UnknownHostException (java.net.UnknownHostException), SolrCoreInitializationException (org.apache.solr.core.SolrCoreInitializationException)
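
The core idiom in this example is to count the children of the shard's leader-election znode and to treat a missing node (NoNodeException) the same as an empty one: no other candidate is queued, so this core can expect to become leader. A minimal sketch of that check against the plain ZooKeeper client (the path argument and helper name are illustrative, not part of the Solr code) could look like this:

import java.util.List;

import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.NoNodeException;
import org.apache.zookeeper.ZooKeeper;

public class ElectionCheck {

    // Returns true when the election znode is missing or empty, i.e. no other
    // candidate has registered yet, so the caller can expect to become leader.
    static boolean likelyToBecomeLeader(ZooKeeper zk, String electionPath)
            throws KeeperException, InterruptedException {
        try {
            List<String> children = zk.getChildren(electionPath, false);
            return children.isEmpty();
        } catch (NoNodeException e) {
            // No election node at all: nobody is queued for leadership.
            return true;
        }
    }
}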

Example 52 with KeeperException

Use of org.apache.zookeeper.KeeperException in project lucene-solr by apache.

Class ZkController, method preRegister.

public void preRegister(CoreDescriptor cd) {
    String coreNodeName = getCoreNodeName(cd);
    // this also gets us our assigned shard id if it was not specified
    try {
        checkStateInZk(cd);
        CloudDescriptor cloudDesc = cd.getCloudDescriptor();
        // make sure the node name is set on the descriptor
        if (cloudDesc.getCoreNodeName() == null) {
            cloudDesc.setCoreNodeName(coreNodeName);
        }
        publish(cd, Replica.State.DOWN, false, true);
        String collectionName = cd.getCloudDescriptor().getCollectionName();
        DocCollection collection = zkStateReader.getClusterState().getCollectionOrNull(collectionName);
        log.debug(collection == null ? "Collection {} not visible yet, but flagging it so a watch is registered when it becomes visible" : "Registering watch for collection {}", collectionName);
        zkStateReader.registerCore(collectionName);
    } catch (KeeperException e) {
        log.error("", e);
        throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        log.error("", e);
        throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
    }
    if (cd.getCloudDescriptor().getShardId() == null && needsToBeAssignedShardId(cd, zkStateReader.getClusterState(), coreNodeName)) {
        doGetShardIdAndNodeNameProcess(cd);
    } else {
        // still wait till we see us in local state
        doGetShardIdAndNodeNameProcess(cd);
    }
}
Also used: ZooKeeperException (org.apache.solr.common.cloud.ZooKeeperException), DocCollection (org.apache.solr.common.cloud.DocCollection), KeeperException (org.apache.zookeeper.KeeperException)
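
preRegister shows the usual Solr translation of ZooKeeper's checked exceptions: KeeperException is logged and rethrown wrapped, while InterruptedException first restores the thread's interrupt flag and is then wrapped the same way. The same shape can be sketched with the raw ZooKeeper client; the helper below is hypothetical and throws a plain IllegalStateException where the Solr code throws its own ZooKeeperException:

import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooDefs;
import org.apache.zookeeper.ZooKeeper;

public class ZkWriteHelper {

    // Wrap ZooKeeper's checked exceptions the way preRegister does:
    // restore the interrupt flag for InterruptedException, then rethrow
    // both as an unchecked exception so callers need no throws clause.
    static void createNode(ZooKeeper zk, String path, byte[] data) {
        try {
            zk.create(path, data, ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
        } catch (KeeperException e) {
            throw new IllegalStateException("ZooKeeper error creating " + path, e);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new IllegalStateException("Interrupted while creating " + path, e);
        }
    }
}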

Example 53 with KeeperException

Use of org.apache.zookeeper.KeeperException in project lucene-solr by apache.

Class ZkController, method linkConfSet.

public static void linkConfSet(SolrZkClient zkClient, String collection, String confSetName) throws KeeperException, InterruptedException {
    String path = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection;
    log.debug("Load collection config from:" + path);
    byte[] data;
    try {
        data = zkClient.getData(path, null, null, true);
    } catch (NoNodeException e) {
        // if there is no node, we will try and create it
        // first try to make in case we are pre configuring
        ZkNodeProps props = new ZkNodeProps(CONFIGNAME_PROP, confSetName);
        try {
            zkClient.makePath(path, Utils.toJSON(props), CreateMode.PERSISTENT, null, true);
        } catch (KeeperException e2) {
            // it's okay if the node already exists
            if (e2.code() != KeeperException.Code.NODEEXISTS) {
                throw e;
            }
            // if we fail creating, setdata
            // TODO: we should consider using version
            zkClient.setData(path, Utils.toJSON(props), true);
        }
        return;
    }
    // we found existing data, let's update it
    ZkNodeProps props = null;
    if (data != null) {
        props = ZkNodeProps.load(data);
        Map<String, Object> newProps = new HashMap<>();
        newProps.putAll(props.getProperties());
        newProps.put(CONFIGNAME_PROP, confSetName);
        props = new ZkNodeProps(newProps);
    } else {
        props = new ZkNodeProps(CONFIGNAME_PROP, confSetName);
    }
    // TODO: we should consider using version
    zkClient.setData(path, Utils.toJSON(props), true);
}
Also used: NoNodeException (org.apache.zookeeper.KeeperException.NoNodeException), ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap), HashMap (java.util.HashMap), ZkNodeProps (org.apache.solr.common.cloud.ZkNodeProps), ZooKeeperException (org.apache.solr.common.cloud.ZooKeeperException), KeeperException (org.apache.zookeeper.KeeperException)
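
linkConfSet demonstrates the create-then-fall-back-to-setData idiom: try to create the znode, tolerate NODEEXISTS because another node may have won the race, and overwrite the data instead. Stripped of the Solr-specific ZkNodeProps handling, the same pattern with the plain ZooKeeper client might look like this (method name is illustrative):

import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.NodeExistsException;
import org.apache.zookeeper.ZooDefs;
import org.apache.zookeeper.ZooKeeper;

public class CreateOrUpdate {

    // Create the znode if it is missing, otherwise overwrite its data,
    // mirroring linkConfSet's "create, and on NODEEXISTS fall back to setData".
    static void createOrSet(ZooKeeper zk, String path, byte[] data)
            throws KeeperException, InterruptedException {
        try {
            zk.create(path, data, ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
        } catch (NodeExistsException e) {
            // Somebody created the node first; version -1 means "any version".
            zk.setData(path, data, -1);
        }
    }
}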

Example 54 with KeeperException

Use of org.apache.zookeeper.KeeperException in project lucene-solr by apache.

Class ZkController, method init.

private void init(CurrentCoreDescriptorProvider registerOnReconnect) {
    try {
        createClusterZkNodes(zkClient);
        zkStateReader.createClusterStateWatchersAndUpdate();
        this.baseURL = zkStateReader.getBaseUrlForNodeName(this.nodeName);
        checkForExistingEphemeralNode();
        // start the overseer first as following code may need its processing
        if (!zkRunOnly) {
            overseerElector = new LeaderElector(zkClient);
            this.overseer = new Overseer(cc.getShardHandlerFactory().getShardHandler(), cc.getUpdateShardHandler(), CommonParams.CORES_HANDLER_PATH, zkStateReader, this, cloudConfig);
            ElectionContext context = new OverseerElectionContext(zkClient, overseer, getNodeName());
            overseerElector.setup(context);
            overseerElector.joinElection(context, false);
        }
        Stat stat = zkClient.exists(ZkStateReader.LIVE_NODES_ZKNODE, null, true);
        if (stat != null && stat.getNumChildren() > 0) {
            publishAndWaitForDownStates();
        }
        // Do this last to signal we're up.
        createEphemeralLiveNode();
    } catch (IOException e) {
        log.error("", e);
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Can't create ZooKeeperController", e);
    } catch (InterruptedException e) {
        // Restore the interrupted status
        Thread.currentThread().interrupt();
        log.error("", e);
        throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
    } catch (KeeperException e) {
        log.error("", e);
        throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
    }
}
Also used: ZooKeeperException (org.apache.solr.common.cloud.ZooKeeperException), Stat (org.apache.zookeeper.data.Stat), IOException (java.io.IOException), SolrException (org.apache.solr.common.SolrException), KeeperException (org.apache.zookeeper.KeeperException)
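
One detail worth isolating from init is the Stat-based check on the live_nodes znode: publishAndWaitForDownStates() is only called when the node exists and already has children. A small sketch of that check with the raw ZooKeeper client follows; the helper name and parent path argument are assumptions for illustration:

import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.data.Stat;

public class LiveNodesCheck {

    // init only calls publishAndWaitForDownStates() when the live_nodes znode
    // already has children; this is the same Stat-based check in isolation.
    static boolean hasChildren(ZooKeeper zk, String parentPath)
            throws KeeperException, InterruptedException {
        Stat stat = zk.exists(parentPath, false);
        return stat != null && stat.getNumChildren() > 0;
    }
}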

Example 55 with KeeperException

Use of org.apache.zookeeper.KeeperException in project lucene-solr by apache.

Class ReplaceNodeCmd, method call.

@Override
public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
    ZkStateReader zkStateReader = ocmh.zkStateReader;
    ocmh.checkRequired(message, "source", "target");
    String source = message.getStr("source");
    String target = message.getStr("target");
    String async = message.getStr("async");
    boolean parallel = message.getBool("parallel", false);
    ClusterState clusterState = zkStateReader.getClusterState();
    if (!clusterState.liveNodesContain(source)) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Source Node: " + source + " is not live");
    }
    if (!clusterState.liveNodesContain(target)) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Target Node: " + target + " is not live");
    }
    List<ZkNodeProps> sourceReplicas = getReplicasOfNode(source, clusterState);
    List<ZkNodeProps> createdReplicas = new ArrayList<>();
    AtomicBoolean anyOneFailed = new AtomicBoolean(false);
    CountDownLatch countDownLatch = new CountDownLatch(sourceReplicas.size());
    for (ZkNodeProps sourceReplica : sourceReplicas) {
        NamedList nl = new NamedList();
        log.info("Going to create replica for collection={} shard={} on node={}", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target);
        ZkNodeProps msg = sourceReplica.plus("parallel", String.valueOf(parallel)).plus(CoreAdminParams.NODE, target);
        if (async != null)
            msg.getProperties().put(ASYNC, async);
        final ZkNodeProps addedReplica = ocmh.addReplica(clusterState, msg, nl, () -> {
            countDownLatch.countDown();
            if (nl.get("failure") != null) {
                String errorString = String.format(Locale.ROOT, "Failed to create replica for collection=%s shard=%s" + " on node=%s", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target);
                log.warn(errorString);
                // and exit
                synchronized (results) {
                    results.add("failure", errorString);
                    anyOneFailed.set(true);
                }
            } else {
                log.debug("Successfully created replica for collection={} shard={} on node={}", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target);
            }
        });
        if (addedReplica != null) {
            createdReplicas.add(addedReplica);
        }
    }
    log.debug("Waiting for replace node action to complete");
    countDownLatch.await(5, TimeUnit.MINUTES);
    log.debug("Finished waiting for replace node action to complete");
    if (anyOneFailed.get()) {
        log.info("Failed to create some replicas. Cleaning up all replicas on target node");
        CountDownLatch cleanupLatch = new CountDownLatch(createdReplicas.size());
        for (ZkNodeProps createdReplica : createdReplicas) {
            NamedList deleteResult = new NamedList();
            try {
                ocmh.deleteReplica(zkStateReader.getClusterState(), createdReplica.plus("parallel", "true"), deleteResult, () -> {
                    cleanupLatch.countDown();
                    if (deleteResult.get("failure") != null) {
                        synchronized (results) {
                            results.add("failure", "Could not cleanup, because of : " + deleteResult.get("failure"));
                        }
                    }
                });
            } catch (KeeperException e) {
                cleanupLatch.countDown();
                log.warn("Error deleting replica ", e);
            } catch (Exception e) {
                log.warn("Error deleting replica ", e);
                cleanupLatch.countDown();
                throw e;
            }
        }
        cleanupLatch.await(5, TimeUnit.MINUTES);
    }
    // we have reached this far means all replicas could be recreated
    //now cleanup the replicas in the source node
    DeleteNodeCmd.cleanupReplicas(results, state, sourceReplicas, ocmh, source, async);
    results.add("success", "REPLACENODE action completed successfully from  : " + source + " to : " + target);
}
Also used: ClusterState (org.apache.solr.common.cloud.ClusterState), NamedList (org.apache.solr.common.util.NamedList), ZkNodeProps (org.apache.solr.common.cloud.ZkNodeProps), ArrayList (java.util.ArrayList), CountDownLatch (java.util.concurrent.CountDownLatch), KeeperException (org.apache.zookeeper.KeeperException), SolrException (org.apache.solr.common.SolrException), ZkStateReader (org.apache.solr.common.cloud.ZkStateReader), AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)
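
ReplaceNodeCmd coordinates the per-replica work with a CountDownLatch sized to the number of replicas, an AtomicBoolean failure flag shared by the callbacks, and a bounded await before inspecting the result. The sketch below reproduces just that latch-and-flag skeleton with plain threads; the task array and method name are illustrative, not part of the Solr API:

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

public class ParallelOps {

    // Latch-and-flag skeleton: run every task asynchronously, let each one
    // count the latch down and mark the shared flag on failure, then wait
    // with a timeout before reporting the overall outcome.
    static boolean runAll(Runnable[] tasks, long timeout, TimeUnit unit)
            throws InterruptedException {
        CountDownLatch latch = new CountDownLatch(tasks.length);
        AtomicBoolean anyFailed = new AtomicBoolean(false);
        for (Runnable task : tasks) {
            new Thread(() -> {
                try {
                    task.run();
                } catch (RuntimeException e) {
                    anyFailed.set(true);
                } finally {
                    latch.countDown();
                }
            }).start();
        }
        latch.await(timeout, unit);
        return !anyFailed.get();
    }
}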

Aggregations

KeeperException (org.apache.zookeeper.KeeperException): 566
IOException (java.io.IOException): 188
Stat (org.apache.zookeeper.data.Stat): 127
ZooKeeper (org.apache.zookeeper.ZooKeeper): 87
ArrayList (java.util.ArrayList): 51
NoNodeException (org.apache.zookeeper.KeeperException.NoNodeException): 45
Watcher (org.apache.zookeeper.Watcher): 39
WatchedEvent (org.apache.zookeeper.WatchedEvent): 38
Test (org.junit.jupiter.api.Test): 38
CountDownLatch (java.util.concurrent.CountDownLatch): 30
SolrException (org.apache.solr.common.SolrException): 30
HashMap (java.util.HashMap): 29
List (java.util.List): 28
ACL (org.apache.zookeeper.data.ACL): 27
Test (org.junit.Test): 27
HeliosRuntimeException (com.spotify.helios.common.HeliosRuntimeException): 25
ServerName (org.apache.hadoop.hbase.ServerName): 24
Map (java.util.Map): 23
IZooReaderWriter (org.apache.accumulo.fate.zookeeper.IZooReaderWriter): 23
InterruptedIOException (java.io.InterruptedIOException): 20