Use of org.apache.zookeeper.KeeperException in project hbase by apache.
In the class ReplicationQueuesZKImpl, method moveQueueUsingMulti:
/**
 * It "atomically" copies one peer's WAL queue from a dead region server and returns the WALs
 * all sorted. The new peer id is the old peer id with the dead server's znode name appended.
 * @param znode pertaining to the region server to copy the queues from
 * @param peerId pertaining to the queue that needs to be copied
 */
private Pair<String, SortedSet<String>> moveQueueUsingMulti(String znode, String peerId) {
  try {
    // hbase/replication/rs/deadrs
    String deadRSZnodePath = ZKUtil.joinZNode(this.queuesZNode, znode);
    List<ZKUtilOp> listOfOps = new ArrayList<>();
    ReplicationQueueInfo replicationQueueInfo = new ReplicationQueueInfo(peerId);
    String newPeerId = peerId + "-" + znode;
    String newPeerZnode = ZKUtil.joinZNode(this.myQueuesZnode, newPeerId);
    // check the logs queue for the old peer cluster
    String oldClusterZnode = ZKUtil.joinZNode(deadRSZnodePath, peerId);
    List<String> wals = ZKUtil.listChildrenNoWatch(this.zookeeper, oldClusterZnode);
    if (!peerExists(replicationQueueInfo.getPeerId())) {
      LOG.warn("Peer " + replicationQueueInfo.getPeerId()
          + " didn't exist, will move its queue to avoid the failure of multi op");
      for (String wal : wals) {
        String oldWalZnode = ZKUtil.joinZNode(oldClusterZnode, wal);
        listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldWalZnode));
      }
      listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldClusterZnode));
      ZKUtil.multiOrSequential(this.zookeeper, listOfOps, false);
      return null;
    }
    SortedSet<String> logQueue = new TreeSet<>();
    if (wals == null || wals.isEmpty()) {
      listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldClusterZnode));
    } else {
      // create the new cluster znode
      ZKUtilOp op = ZKUtilOp.createAndFailSilent(newPeerZnode, HConstants.EMPTY_BYTE_ARRAY);
      listOfOps.add(op);
      // get the offset of the logs and set it to new znodes
      for (String wal : wals) {
        String oldWalZnode = ZKUtil.joinZNode(oldClusterZnode, wal);
        byte[] logOffset = ZKUtil.getData(this.zookeeper, oldWalZnode);
        LOG.debug("Creating " + wal + " with data " + Bytes.toString(logOffset));
        String newLogZnode = ZKUtil.joinZNode(newPeerZnode, wal);
        listOfOps.add(ZKUtilOp.createAndFailSilent(newLogZnode, logOffset));
        listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldWalZnode));
        logQueue.add(wal);
      }
      // add delete op for peer
      listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldClusterZnode));
      if (LOG.isTraceEnabled()) {
        LOG.trace(" The multi list size is: " + listOfOps.size());
      }
    }
    ZKUtil.multiOrSequential(this.zookeeper, listOfOps, false);
    LOG.info("Atomically moved " + znode + "/" + peerId + "'s WALs to my queue");
    return new Pair<>(newPeerId, logQueue);
  } catch (KeeperException e) {
    // Multi call failed; it looks like some other regionserver took away the logs.
    LOG.warn("Got exception in copyQueuesFromRSUsingMulti: ", e);
  } catch (InterruptedException e) {
    LOG.warn("Got exception in copyQueuesFromRSUsingMulti: ", e);
    Thread.currentThread().interrupt();
  }
  return null;
}
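The important property here is that every create and delete is accumulated and then submitted as a single ZKUtil.multiOrSequential call, so a competing region server either claims the whole queue or none of it, and a KeeperException means somebody else won the race. The minimal sketch below (not HBase code) shows the same "copy the children and delete the originals in one multi" idea with the plain ZooKeeper client; the handle zk, the paths, and the class name are hypothetical.

import java.util.ArrayList;
import java.util.List;

import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.Op;
import org.apache.zookeeper.ZooDefs;
import org.apache.zookeeper.ZooKeeper;

public class AtomicQueueMove {

  /** Copies every child of oldParent under newParent and deletes the originals in one multi. */
  static void moveQueue(ZooKeeper zk, String oldParent, String newParent)
      throws KeeperException, InterruptedException {
    List<Op> ops = new ArrayList<>();
    // Create the destination znode first (ZKUtil's createAndFailSilent tolerates an
    // existing node; plain Op.create does not).
    ops.add(Op.create(newParent, new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE,
        CreateMode.PERSISTENT));
    for (String child : zk.getChildren(oldParent, false)) {
      byte[] data = zk.getData(oldParent + "/" + child, false, null);
      ops.add(Op.create(newParent + "/" + child, data, ZooDefs.Ids.OPEN_ACL_UNSAFE,
          CreateMode.PERSISTENT));
      ops.add(Op.delete(oldParent + "/" + child, -1));
    }
    // Finally drop the old parent; if any op fails, the whole multi is rejected
    // and no znode changes.
    ops.add(Op.delete(oldParent, -1));
    zk.multi(ops);
  }
}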
Use of org.apache.zookeeper.KeeperException in project hbase by apache.
In the class ZKSplitLogManagerCoordination, method removeRecoveringRegions:
/**
 * It removes recovering regions under /hbase/recovering-regions/[encoded region name] so that the
 * region server hosting the region can allow reads to the recovered region
 * @param recoveredServerNameSet servers which have just been recovered
 * @param isMetaRecovery whether the current recovery is for the meta region on
 *          <code>serverNames</code>
 */
@Override
public void removeRecoveringRegions(final Set<String> recoveredServerNameSet,
    Boolean isMetaRecovery) throws IOException {
  final String metaEncodeRegionName = HRegionInfo.FIRST_META_REGIONINFO.getEncodedName();
  int count = 0;
  try {
    List<String> tasks = ZKUtil.listChildrenNoWatch(watcher, watcher.znodePaths.splitLogZNode);
    if (tasks != null) {
      int listSize = tasks.size();
      for (int i = 0; i < listSize; i++) {
        if (!ZKSplitLog.isRescanNode(tasks.get(i))) {
          count++;
        }
      }
    }
    if (count == 0 && this.details.getMaster().isInitialized()
        && !this.details.getMaster().getServerManager().areDeadServersInProgress()) {
      // No splitting work items left
      ZKSplitLog.deleteRecoveringRegionZNodes(watcher, null);
      // reset lastRecoveringNodeCreationTime because we cleared all recovering znodes at
      // this point.
      lastRecoveringNodeCreationTime = Long.MAX_VALUE;
    } else if (!recoveredServerNameSet.isEmpty()) {
      // Remove recovering regions which don't have any RS associated with it
      List<String> regions =
          ZKUtil.listChildrenNoWatch(watcher, watcher.znodePaths.recoveringRegionsZNode);
      if (regions != null) {
        int listSize = regions.size();
        if (LOG.isDebugEnabled()) {
          LOG.debug("Processing recovering " + regions + " and servers " + recoveredServerNameSet
              + ", isMetaRecovery=" + isMetaRecovery);
        }
        for (int i = 0; i < listSize; i++) {
          String region = regions.get(i);
          if (isMetaRecovery != null) {
            if ((isMetaRecovery && !region.equalsIgnoreCase(metaEncodeRegionName))
                || (!isMetaRecovery && region.equalsIgnoreCase(metaEncodeRegionName))) {
              // skip the meta region when recovering user regions
              continue;
            }
          }
          String nodePath = ZKUtil.joinZNode(watcher.znodePaths.recoveringRegionsZNode, region);
          List<String> failedServers = ZKUtil.listChildrenNoWatch(watcher, nodePath);
          if (failedServers == null || failedServers.isEmpty()) {
            ZKUtil.deleteNode(watcher, nodePath);
            continue;
          }
          if (recoveredServerNameSet.containsAll(failedServers)) {
            ZKUtil.deleteNodeRecursively(watcher, nodePath);
          } else {
            int tmpFailedServerSize = failedServers.size();
            for (int j = 0; j < tmpFailedServerSize; j++) {
              String failedServer = failedServers.get(j);
              if (recoveredServerNameSet.contains(failedServer)) {
                String tmpPath = ZKUtil.joinZNode(nodePath, failedServer);
                ZKUtil.deleteNode(watcher, tmpPath);
              }
            }
          }
        }
      }
    }
  } catch (KeeperException ke) {
    LOG.warn("removeRecoveringRegionsFromZK got zookeeper exception. Will retry", ke);
    throw new IOException(ke);
  }
}
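Most of the ZooKeeper work above follows the common "list the children, decide, then delete" shape, with the KeeperException rethrown as an IOException so the caller can retry. The minimal sketch below (not HBase code) shows that shape with the plain ZooKeeper client, pruning child znodes whose own children are all contained in a set of already-recovered servers; the handle zk, the parent path, the recovered set, and the class name are hypothetical.

import java.util.List;
import java.util.Set;

import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooKeeper;

public class RecoveringNodePruner {

  /** Deletes every child of parent whose own children are all contained in recovered. */
  static void prune(ZooKeeper zk, String parent, Set<String> recovered)
      throws KeeperException, InterruptedException {
    for (String region : zk.getChildren(parent, false)) {
      String regionPath = parent + "/" + region;
      List<String> failedServers = zk.getChildren(regionPath, false);
      if (failedServers.isEmpty() || recovered.containsAll(failedServers)) {
        // Remove the per-server children first, then the region node itself,
        // the same order a recursive delete would use.
        for (String server : failedServers) {
          zk.delete(regionPath + "/" + server, -1);
        }
        zk.delete(regionPath, -1);
      }
    }
  }
}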
Use of org.apache.zookeeper.KeeperException in project hbase by apache.
In the class ZkSplitLogWorkerCoordination, method taskLoop:
/**
 * Wait for tasks to become available at /hbase/splitlog zknode. Grab a task one at a time. This
 * policy puts an upper limit on the number of simultaneous log splits that could be happening
 * in a cluster.
 * <p>
 * Synchronization using <code>taskReadyLock</code> ensures that it will try to grab every task
 * that has been put up
 * @throws InterruptedException
 */
@Override
public void taskLoop() throws InterruptedException {
  while (!shouldStop) {
    int seq_start = taskReadySeq.get();
    List<String> paths = null;
    paths = getTaskList();
    if (paths == null) {
      LOG.warn("Could not get tasks, did someone remove " + watcher.znodePaths.splitLogZNode
          + " ... worker thread exiting.");
      return;
    }
    // pick the meta wal first
    int offset = (int) (Math.random() * paths.size());
    for (int i = 0; i < paths.size(); i++) {
      if (AbstractFSWALProvider.isMetaFile(paths.get(i))) {
        offset = i;
        break;
      }
    }
    int numTasks = paths.size();
    for (int i = 0; i < numTasks; i++) {
      int idx = (i + offset) % paths.size();
      // double encoding of the path name
      if (this.calculateAvailableSplitters(numTasks) > 0) {
        grabTask(ZKUtil.joinZNode(watcher.znodePaths.splitLogZNode, paths.get(idx)));
      } else {
        LOG.debug("Current region server " + server.getServerName() + " has "
            + this.tasksInProgress.get() + " tasks in progress and can't take more.");
        break;
      }
      if (shouldStop) {
        return;
      }
    }
    SplitLogCounters.tot_wkr_task_grabing.incrementAndGet();
    synchronized (taskReadyLock) {
      while (seq_start == taskReadySeq.get()) {
        taskReadyLock.wait(checkInterval);
        if (server != null) {
          // check to see if we have stale recovering regions in our internal memory state
          Map<String, Region> recoveringRegions = server.getRecoveringRegions();
          if (!recoveringRegions.isEmpty()) {
            // Make a local copy to prevent ConcurrentModificationException when other threads
            // modify recoveringRegions
            List<String> tmpCopy = new ArrayList<>(recoveringRegions.keySet());
            int listSize = tmpCopy.size();
            for (int i = 0; i < listSize; i++) {
              String region = tmpCopy.get(i);
              String nodePath =
                  ZKUtil.joinZNode(watcher.znodePaths.recoveringRegionsZNode, region);
              try {
                if (ZKUtil.checkExists(watcher, nodePath) == -1) {
                  server.getExecutorService().submit(
                      new FinishRegionRecoveringHandler(server, region, nodePath));
                } else {
                  // checking the first one is good enough.
                  break;
                }
              } catch (KeeperException e) {
                // ignore zookeeper error
                LOG.debug("Got a zookeeper exception when trying to open a recovering region", e);
                break;
              }
            }
          }
        }
      }
    }
  }
}
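The inner check relies on ZKUtil.checkExists returning -1 when the recovering-region znode has already been deleted, and it deliberately swallows the KeeperException so a flaky ZooKeeper read cannot kill the worker loop. The minimal sketch below (not HBase code) expresses that check with the plain ZooKeeper client, where exists() returns null for a missing znode; the handle zk, the path, and the class name are hypothetical.

import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooKeeper;

public class RecoveringRegionCheck {

  /** Returns true when the recovering-region znode is gone, i.e. recovery has finished. */
  static boolean isRecoveryFinished(ZooKeeper zk, String recoveringRegionPath) {
    try {
      // exists() returns null when the znode no longer exists.
      return zk.exists(recoveringRegionPath, false) == null;
    } catch (KeeperException | InterruptedException e) {
      // Treat ZooKeeper trouble as "not finished"; the caller can retry later.
      if (e instanceof InterruptedException) {
        Thread.currentThread().interrupt();
      }
      return false;
    }
  }
}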
Use of org.apache.zookeeper.KeeperException in project hbase by apache.
In the class ZkSplitLogWorkerCoordination, method attemptToOwnTask:
/**
 * Try to own the task by transitioning the zk node data from UNASSIGNED to OWNED.
 * <p>
 * This method is also used to periodically heartbeat the task progress by transitioning the node
 * from OWNED to OWNED.
 * <p>
 * @param isFirstTime shows whether it's the first attempt.
 * @param zkw zk watcher
 * @param server name of the region server attempting to own the task
 * @param task to own
 * @param mode the recovery mode of the task
 * @param taskZKVersion version of the task in zk
 * @return non-negative integer value when the task can be owned by the current region server,
 *         otherwise -1
 */
protected static int attemptToOwnTask(boolean isFirstTime, ZooKeeperWatcher zkw,
    ServerName server, String task, RecoveryMode mode, int taskZKVersion) {
  int latestZKVersion = FAILED_TO_OWN_TASK;
  try {
    SplitLogTask slt = new SplitLogTask.Owned(server, mode);
    Stat stat = zkw.getRecoverableZooKeeper().setData(task, slt.toByteArray(), taskZKVersion);
    if (stat == null) {
      LOG.warn("zk.setData() returned null for path " + task);
      SplitLogCounters.tot_wkr_task_heartbeat_failed.incrementAndGet();
      return FAILED_TO_OWN_TASK;
    }
    latestZKVersion = stat.getVersion();
    SplitLogCounters.tot_wkr_task_heartbeat.incrementAndGet();
    return latestZKVersion;
  } catch (KeeperException e) {
    if (!isFirstTime) {
      if (e.code().equals(KeeperException.Code.NONODE)) {
        LOG.warn("NONODE failed to assert ownership for " + task, e);
      } else if (e.code().equals(KeeperException.Code.BADVERSION)) {
        LOG.warn("BADVERSION failed to assert ownership for " + task, e);
      } else {
        LOG.warn("failed to assert ownership for " + task, e);
      }
    }
  } catch (InterruptedException e1) {
    LOG.warn("Interrupted while trying to assert ownership of " + task + " "
        + StringUtils.stringifyException(e1));
    Thread.currentThread().interrupt();
  }
  SplitLogCounters.tot_wkr_task_heartbeat_failed.incrementAndGet();
  return FAILED_TO_OWN_TASK;
}
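The heartbeat works because setData with an explicit expected version is effectively a compare-and-set: if another worker has re-versioned or deleted the task znode, ZooKeeper rejects the write with BADVERSION or NONODE and this worker backs off. The minimal sketch below (not HBase code) shows that compare-and-set with the plain ZooKeeper client; the handle zk, the task path, the payload, and the class name are hypothetical.

import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.data.Stat;

public class TaskOwnership {

  /** Returns the new znode version on success, or -1 if the task could not be owned. */
  static int tryOwn(ZooKeeper zk, String taskPath, byte[] ownedState, int expectedVersion)
      throws InterruptedException {
    try {
      // setData only succeeds if the znode is still at expectedVersion.
      Stat stat = zk.setData(taskPath, ownedState, expectedVersion);
      return stat.getVersion(); // version to use for the next heartbeat
    } catch (KeeperException.BadVersionException | KeeperException.NoNodeException e) {
      return -1; // lost the race: someone else changed or removed the task
    } catch (KeeperException e) {
      return -1; // other ZooKeeper errors are also treated as "failed to own"
    }
  }
}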
Use of org.apache.zookeeper.KeeperException in project hbase by apache.
In the class ServerManager, method checkForRSznode:
/**
 * Check for an odd state, where we think an RS is up but it is not. Do it on OPEN.
 * This is the only case where the check makes sense.
 *
 * <p>We are checking for an instance of HBASE-9593 where an RS registered but died before it put
 * up its znode in zk. In this case, the RS made it into the list of online servers but it
 * is not actually UP. We do the check here where there is an evident problem rather
 * than do some crazy footwork where we'd have the master check zk after an RS had reported
 * for duty with a provisional state followed by a confirmed state; that'd be a mess.
 * The real fix is HBASE-17733.
 */
private void checkForRSznode(final ServerName serverName, final ServiceException se) {
  if (se.getCause() == null) {
    return;
  }
  Throwable t = se.getCause();
  if (t instanceof ConnectException) {
    // If this, proceed to do cleanup.
  } else {
    // Look for FailedServerException
    if (!(t instanceof IOException)) {
      return;
    }
    if (t.getCause() == null) {
      return;
    }
    if (!(t.getCause() instanceof FailedServerException)) {
      return;
    }
    // Ok, found FailedServerException -- continue.
  }
  if (!isServerOnline(serverName)) {
    return;
  }
  // We think this server is online. Check it has a znode up. Currently, a RS
  // registers an ephemeral znode in zk. If not present, something is up. Maybe
  // HBASE-9593 where RS crashed AFTER reportForDuty but BEFORE it put up an ephemeral
  // znode.
  List<String> servers = null;
  try {
    servers = getRegionServersInZK(this.master.getZooKeeper());
  } catch (KeeperException ke) {
    LOG.warn("Failed to list regionservers", ke);
    // ZK is malfunctioning, don't hang here
  }
  boolean found = false;
  if (servers != null) {
    for (String serverNameAsStr : servers) {
      ServerName sn = ServerName.valueOf(serverNameAsStr);
      if (sn.equals(serverName)) {
        // Found a server up in zk.
        found = true;
        break;
      }
    }
  }
  if (!found) {
    LOG.warn("Online server " + serverName.toString() + " has no corresponding "
        + "ephemeral znode (Did it die before registering in zk?); "
        + "calling expire to clean it up!");
    expireServer(serverName);
  }
}
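The check boils down to listing the children of the region-server znode and looking for the server's name among them. The minimal sketch below (not HBase code) shows that membership check with the plain ZooKeeper client; the handle zk, the parent path, the server-name string, and the class name are hypothetical, and unlike the method above it conservatively reports the znode as present when ZooKeeper cannot be reached.

import java.util.List;

import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooKeeper;

public class RsZnodeCheck {

  /** Returns true if serverName appears as a child of rsParentZnode. */
  static boolean hasEphemeralZnode(ZooKeeper zk, String rsParentZnode, String serverName) {
    try {
      List<String> servers = zk.getChildren(rsParentZnode, false);
      return servers.contains(serverName);
    } catch (KeeperException | InterruptedException e) {
      if (e instanceof InterruptedException) {
        Thread.currentThread().interrupt();
      }
      // ZooKeeper is misbehaving; report "present" so the caller does not act
      // on bad information.
      return true;
    }
  }
}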