Search in sources :

Example 21 with KeeperException

use of org.apache.flink.shaded.zookeeper3.org.apache.zookeeper.KeeperException in project hbase by apache.

the class ReplicationPeersZKImpl method createPeer.

/**
 * Builds the peer object for {@code peerId} and starts its two ZooKeeper trackers.
 * <p>
 * The state tracker is started first (on the node returned by {@code getPeerStateNode}), then
 * the peer config tracker (on the node returned by {@code getPeerNode}).
 *
 * @param peerId peer's identifier
 * @return object representing the peer, or {@code null} when {@code getPeerConf} yields nothing
 *         for this id
 * @throws ReplicationException if starting either tracker fails with a {@link KeeperException}
 */
private ReplicationPeerZKImpl createPeer(String peerId) throws ReplicationException {
    final Pair<ReplicationPeerConfig, Configuration> confPair = getPeerConf(peerId);
    if (confPair == null) {
        // No configuration available for this peer id: nothing to build.
        return null;
    }

    final ReplicationPeerZKImpl createdPeer =
        new ReplicationPeerZKImpl(zookeeper, confPair.getSecond(), peerId, confPair.getFirst(), abortable);

    // Track the peer state node.
    try {
        createdPeer.startStateTracker(this.getPeerStateNode(peerId));
    } catch (KeeperException stateTrackerFailure) {
        throw new ReplicationException("Error starting the peer state tracker for peerId=" + peerId, stateTrackerFailure);
    }

    // Track the peer node itself for config changes.
    try {
        createdPeer.startPeerConfigTracker(this.getPeerNode(peerId));
    } catch (KeeperException configTrackerFailure) {
        throw new ReplicationException("Error starting the peer tableCFs tracker for peerId=" + peerId, configTrackerFailure);
    }

    return createdPeer;
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) CompoundConfiguration(org.apache.hadoop.hbase.CompoundConfiguration) HBaseConfiguration(org.apache.hadoop.hbase.HBaseConfiguration) KeeperException(org.apache.zookeeper.KeeperException)

Example 22 with KeeperException

use of org.apache.flink.shaded.zookeeper3.org.apache.zookeeper.KeeperException in project hbase by apache.

the class ReplicationQueuesZKImpl method moveQueueUsingMulti.

/**
 * It "atomically" copies one peer's wals queue from another dead region server and returns them
 * all sorted. The new peer id is equal to the old peer id appended with the dead server's znode.
 * <p>
 * If the peer no longer exists, the old queue (and any WAL znodes under it) is deleted instead of
 * being moved, so that a later multi op does not fail on it.
 *
 * @param znode pertaining to the region server to copy the queues from
 * @param peerId peerId pertaining to the queue need to be copied
 * @return a pair of the new peer id and the sorted set of moved WAL names (empty when the old
 *         queue had no WALs); {@code null} if the peer does not exist, if a ZooKeeper operation
 *         failed, or if the thread was interrupted
 */
private Pair<String, SortedSet<String>> moveQueueUsingMulti(String znode, String peerId) {
    try {
        // hbase/replication/rs/deadrs
        String deadRSZnodePath = ZKUtil.joinZNode(this.queuesZNode, znode);
        List<ZKUtilOp> listOfOps = new ArrayList<>();
        ReplicationQueueInfo replicationQueueInfo = new ReplicationQueueInfo(peerId);
        String newPeerId = peerId + "-" + znode;
        String newPeerZnode = ZKUtil.joinZNode(this.myQueuesZnode, newPeerId);
        // check the logs queue for the old peer cluster
        String oldClusterZnode = ZKUtil.joinZNode(deadRSZnodePath, peerId);
        // May be null (the move path below explicitly handles that), so guard every use.
        List<String> wals = ZKUtil.listChildrenNoWatch(this.zookeeper, oldClusterZnode);
        if (!peerExists(replicationQueueInfo.getPeerId())) {
            LOG.warn("Peer " + replicationQueueInfo.getPeerId() + " didn't exist, will move its queue to avoid the failure of multi op");
            // Previously this loop dereferenced wals unconditionally and could throw a
            // NullPointerException when no child list was returned.
            if (wals != null) {
                for (String wal : wals) {
                    String oldWalZnode = ZKUtil.joinZNode(oldClusterZnode, wal);
                    listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldWalZnode));
                }
            }
            listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldClusterZnode));
            ZKUtil.multiOrSequential(this.zookeeper, listOfOps, false);
            return null;
        }
        SortedSet<String> logQueue = new TreeSet<>();
        if (wals == null || wals.isEmpty()) {
            // Nothing to move: just drop the old (empty) queue znode.
            listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldClusterZnode));
        } else {
            // create the new cluster znode
            ZKUtilOp op = ZKUtilOp.createAndFailSilent(newPeerZnode, HConstants.EMPTY_BYTE_ARRAY);
            listOfOps.add(op);
            // get the offset of the logs and set it to new znodes
            for (String wal : wals) {
                String oldWalZnode = ZKUtil.joinZNode(oldClusterZnode, wal);
                byte[] logOffset = ZKUtil.getData(this.zookeeper, oldWalZnode);
                LOG.debug("Creating " + wal + " with data " + Bytes.toString(logOffset));
                String newLogZnode = ZKUtil.joinZNode(newPeerZnode, wal);
                listOfOps.add(ZKUtilOp.createAndFailSilent(newLogZnode, logOffset));
                listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldWalZnode));
                logQueue.add(wal);
            }
            // add delete op for peer
            listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldClusterZnode));
            if (LOG.isTraceEnabled()) {
                LOG.trace(" The multi list size is: " + listOfOps.size());
            }
        }
        ZKUtil.multiOrSequential(this.zookeeper, listOfOps, false);
        LOG.info("Atomically moved " + znode + "/" + peerId + "'s WALs to my queue");
        return new Pair<>(newPeerId, logQueue);
    } catch (KeeperException e) {
        // Multi call failed; it looks like some other regionserver took away the logs.
        LOG.warn("Got exception in moveQueueUsingMulti: ", e);
    } catch (InterruptedException e) {
        LOG.warn("Got exception in moveQueueUsingMulti: ", e);
        // Restore the interrupt flag so callers can observe the interruption.
        Thread.currentThread().interrupt();
    }
    return null;
}
Also used : TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) ZKUtilOp(org.apache.hadoop.hbase.zookeeper.ZKUtil.ZKUtilOp) KeeperException(org.apache.zookeeper.KeeperException) Pair(org.apache.hadoop.hbase.util.Pair)

Example 23 with KeeperException

use of org.apache.flink.shaded.zookeeper3.org.apache.zookeeper.KeeperException in project hbase by apache.

the class ZKSplitLogManagerCoordination method removeRecoveringRegions.

/**
 * Cleans up recovering-region znodes so that the region server hosting a region can allow reads
 * to the recovered region.
 * <p>
 * When no non-rescan split-log task remains, the master is initialized and no dead servers are in
 * progress, all recovering-region znodes are removed. Otherwise, if any servers were reported as
 * recovered, each recovering region is inspected: regions with no failed-server children are
 * deleted, regions whose failed servers are all recovered are deleted recursively, and for the
 * rest only the children belonging to recovered servers are deleted.
 *
 * @param recoveredServerNameSet servers which are just recovered
 * @param isMetaRecovery whether current recovery is for the meta region; when {@code null} no
 *          meta/user filtering is applied
 * @throws IOException wrapping any {@link KeeperException} raised by ZooKeeper
 */
@Override
public void removeRecoveringRegions(final Set<String> recoveredServerNameSet, Boolean isMetaRecovery) throws IOException {
    final String metaEncodeRegionName = HRegionInfo.FIRST_META_REGIONINFO.getEncodedName();
    try {
        // Count the split-log tasks that are real work items (i.e. not rescan nodes).
        int pendingTasks = 0;
        final List<String> tasks = ZKUtil.listChildrenNoWatch(watcher, watcher.znodePaths.splitLogZNode);
        if (tasks != null) {
            for (String task : tasks) {
                if (!ZKSplitLog.isRescanNode(task)) {
                    pendingTasks++;
                }
            }
        }

        if (pendingTasks == 0 && this.details.getMaster().isInitialized() && !this.details.getMaster().getServerManager().areDeadServersInProgress()) {
            // No splitting work items left
            ZKSplitLog.deleteRecoveringRegionZNodes(watcher, null);
            // reset lastRecoveringNodeCreationTime because we cleared all recovering znodes at
            // this point.
            lastRecoveringNodeCreationTime = Long.MAX_VALUE;
            return;
        }
        if (recoveredServerNameSet.isEmpty()) {
            return;
        }

        // Remove recovering regions which don't have any RS associated with it
        final String recoveringRoot = watcher.znodePaths.recoveringRegionsZNode;
        final List<String> regions = ZKUtil.listChildrenNoWatch(watcher, recoveringRoot);
        if (regions == null) {
            return;
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug("Processing recovering " + regions + " and servers " + recoveredServerNameSet + ", isMetaRecovery=" + isMetaRecovery);
        }

        for (String region : regions) {
            if (isMetaRecovery != null) {
                // Only touch regions matching the recovery type: the meta region during meta
                // recovery, user regions otherwise.
                final boolean isMetaRegion = region.equalsIgnoreCase(metaEncodeRegionName);
                if (isMetaRecovery.booleanValue() != isMetaRegion) {
                    continue;
                }
            }

            final String nodePath = ZKUtil.joinZNode(recoveringRoot, region);
            final List<String> failedServers = ZKUtil.listChildrenNoWatch(watcher, nodePath);
            if (failedServers == null || failedServers.isEmpty()) {
                ZKUtil.deleteNode(watcher, nodePath);
                continue;
            }
            if (recoveredServerNameSet.containsAll(failedServers)) {
                // Every failed server of this region has recovered: drop the whole subtree.
                ZKUtil.deleteNodeRecursively(watcher, nodePath);
                continue;
            }
            // Partially recovered: remove only the children of servers that have recovered.
            for (String failedServer : failedServers) {
                if (recoveredServerNameSet.contains(failedServer)) {
                    ZKUtil.deleteNode(watcher, ZKUtil.joinZNode(nodePath, failedServer));
                }
            }
        }
    } catch (KeeperException ke) {
        LOG.warn("removeRecoveringRegionsFromZK got zookeeper exception. Will retry", ke);
        throw new IOException(ke);
    }
}
Also used : ArrayList(java.util.ArrayList) List(java.util.List) InterruptedIOException(java.io.InterruptedIOException) IOException(java.io.IOException) KeeperException(org.apache.zookeeper.KeeperException)

Example 24 with KeeperException

use of org.apache.flink.shaded.zookeeper3.org.apache.zookeeper.KeeperException in project hbase by apache.

the class ZkSplitLogWorkerCoordination method taskLoop.

/**
 * Wait for tasks to become available at /hbase/splitlog zknode. Grab a task one at a time. This
 * policy puts an upper-limit on the number of simultaneous log splitting that could be happening
 * in a cluster.
 * <p>
 * Synchronization using <code>taskReadyLock</code> ensures that it will try to grab every task
 * that has been put up.
 * <p>
 * While waiting for the task-ready sequence to change, each wake-up also checks the server's
 * in-memory recovering regions against ZooKeeper and submits a
 * {@code FinishRegionRecoveringHandler} for a region whose znode no longer exists.
 *
 * @throws InterruptedException if interrupted while waiting on <code>taskReadyLock</code>
 */
@Override
public void taskLoop() throws InterruptedException {
    while (!shouldStop) {
        final int seqAtStart = taskReadySeq.get();
        final List<String> paths = getTaskList();
        if (paths == null) {
            LOG.warn("Could not get tasks, did someone remove " + watcher.znodePaths.splitLogZNode + " ... worker thread exiting.");
            return;
        }

        // Start from a random position, unless a meta WAL is listed: then start from it.
        int offset = (int) (Math.random() * paths.size());
        for (int i = 0; i < paths.size(); i++) {
            if (AbstractFSWALProvider.isMetaFile(paths.get(i))) {
                offset = i;
                break;
            }
        }

        final int numTasks = paths.size();
        for (int i = 0; i < numTasks; i++) {
            if (this.calculateAvailableSplitters(numTasks) <= 0) {
                LOG.debug("Current region server " + server.getServerName() + " has " + this.tasksInProgress.get() + " tasks in progress and can't take more.");
                break;
            }
            // Walk the task list circularly, beginning at the chosen offset.
            final int idx = (i + offset) % paths.size();
            grabTask(ZKUtil.joinZNode(watcher.znodePaths.splitLogZNode, paths.get(idx)));
            if (shouldStop) {
                return;
            }
        }

        SplitLogCounters.tot_wkr_task_grabing.incrementAndGet();
        synchronized (taskReadyLock) {
            // Sleep in checkInterval slices until the task-ready sequence moves on.
            while (seqAtStart == taskReadySeq.get()) {
                taskReadyLock.wait(checkInterval);
                if (server == null) {
                    continue;
                }
                // check to see if we have stale recovering regions in our internal memory state
                final Map<String, Region> recoveringRegions = server.getRecoveringRegions();
                if (recoveringRegions.isEmpty()) {
                    continue;
                }
                // Make a local copy to prevent ConcurrentModificationException when other threads
                // modify recoveringRegions
                final List<String> snapshot = new ArrayList<>(recoveringRegions.keySet());
                for (String region : snapshot) {
                    final String nodePath = ZKUtil.joinZNode(watcher.znodePaths.recoveringRegionsZNode, region);
                    try {
                        if (ZKUtil.checkExists(watcher, nodePath) != -1) {
                            // check the first one is good enough.
                            break;
                        }
                        server.getExecutorService().submit(new FinishRegionRecoveringHandler(server, region, nodePath));
                    } catch (KeeperException e) {
                        // ignore zookeeper error
                        LOG.debug("Got a zookeeper when trying to open a recovering region", e);
                        break;
                    }
                }
            }
        }
    }
}
Also used : FinishRegionRecoveringHandler(org.apache.hadoop.hbase.regionserver.handler.FinishRegionRecoveringHandler) ArrayList(java.util.ArrayList) Region(org.apache.hadoop.hbase.regionserver.Region) KeeperException(org.apache.zookeeper.KeeperException)

Example 25 with KeeperException

use of org.apache.flink.shaded.zookeeper3.org.apache.zookeeper.KeeperException in project hbase by apache.

the class ZkSplitLogWorkerCoordination method attemptToOwnTask.

/**
 * Try to own the task by transitioning the zk node data from UNASSIGNED to OWNED.
 * <p>
 * This method is also used to periodically heartbeat the task progress by transitioning the node
 * from OWNED to OWNED.
 * <p>
 * Every failure path increments the heartbeat-failed counter; success increments the heartbeat
 * counter.
 *
 * @param isFirstTime shows whether it's the first attempt; ZooKeeper failures are only logged
 *          when this is {@code false}
 * @param zkw zk watcher
 * @param server name
 * @param task to own
 * @param mode recovery mode recorded in the OWNED task data
 * @param taskZKVersion version of the task in zk
 * @return the znode version reported by ZooKeeper after the update when the task can be owned by
 *         the current region server, otherwise {@code FAILED_TO_OWN_TASK}
 */
protected static int attemptToOwnTask(boolean isFirstTime, ZooKeeperWatcher zkw, ServerName server, String task, RecoveryMode mode, int taskZKVersion) {
    try {
        final SplitLogTask ownedTask = new SplitLogTask.Owned(server, mode);
        final Stat stat = zkw.getRecoverableZooKeeper().setData(task, ownedTask.toByteArray(), taskZKVersion);
        if (stat != null) {
            final int newVersion = stat.getVersion();
            SplitLogCounters.tot_wkr_task_heartbeat.incrementAndGet();
            return newVersion;
        }
        // Fall through to the shared failure handling below.
        LOG.warn("zk.setData() returned null for path " + task);
    } catch (KeeperException e) {
        if (!isFirstTime) {
            switch (e.code()) {
                case NONODE:
                    LOG.warn("NONODE failed to assert ownership for " + task, e);
                    break;
                case BADVERSION:
                    LOG.warn("BADVERSION failed to assert ownership for " + task, e);
                    break;
                default:
                    LOG.warn("failed to assert ownership for " + task, e);
                    break;
            }
        }
    } catch (InterruptedException e1) {
        LOG.warn("Interrupted while trying to assert ownership of " + task + " " + StringUtils.stringifyException(e1));
        // Preserve the interrupt status for the caller.
        Thread.currentThread().interrupt();
    }
    SplitLogCounters.tot_wkr_task_heartbeat_failed.incrementAndGet();
    return FAILED_TO_OWN_TASK;
}
Also used : Stat(org.apache.zookeeper.data.Stat) SplitLogTask(org.apache.hadoop.hbase.SplitLogTask) KeeperException(org.apache.zookeeper.KeeperException)

Aggregations

KeeperException (org.apache.zookeeper.KeeperException)608 IOException (java.io.IOException)206 Stat (org.apache.zookeeper.data.Stat)131 ZooKeeper (org.apache.zookeeper.ZooKeeper)89 ArrayList (java.util.ArrayList)55 NoNodeException (org.apache.zookeeper.KeeperException.NoNodeException)45 Watcher (org.apache.zookeeper.Watcher)43 WatchedEvent (org.apache.zookeeper.WatchedEvent)42 Test (org.junit.jupiter.api.Test)38 HashMap (java.util.HashMap)33 List (java.util.List)32 CountDownLatch (java.util.concurrent.CountDownLatch)32 SolrException (org.apache.solr.common.SolrException)30 Test (org.junit.Test)29 ACL (org.apache.zookeeper.data.ACL)27 Map (java.util.Map)26 HeliosRuntimeException (com.spotify.helios.common.HeliosRuntimeException)25 AccumuloSecurityException (org.apache.accumulo.core.client.AccumuloSecurityException)24 ServerName (org.apache.hadoop.hbase.ServerName)24 OpResult (org.apache.zookeeper.OpResult)21