Use of org.apache.hadoop.hbase.SplitLogTask in project hbase by apache.
The class ZKSplitLogManagerCoordination, method removeStaleRecoveringRegions.
/**
 * ZooKeeper implementation of
 * {@link SplitLogManagerCoordination#removeStaleRecoveringRegions(Set)}
 */
@Override
public void removeStaleRecoveringRegions(final Set<String> knownFailedServers)
    throws IOException, InterruptedIOException {
  try {
    List<String> tasks = ZKUtil.listChildrenNoWatch(watcher, watcher.znodePaths.splitLogZNode);
    if (tasks != null) {
      int listSize = tasks.size();
      for (int i = 0; i < listSize; i++) {
        String t = tasks.get(i);
        byte[] data;
        try {
          data = ZKUtil.getData(this.watcher, ZKUtil.joinZNode(watcher.znodePaths.splitLogZNode, t));
        } catch (InterruptedException e) {
          throw new InterruptedIOException();
        }
        if (data != null) {
          SplitLogTask slt = null;
          try {
            slt = SplitLogTask.parseFrom(data);
          } catch (DeserializationException e) {
            LOG.warn("Failed parse data for znode " + t, e);
          }
          if (slt != null && slt.isDone()) {
            continue;
          }
        }
        // decode the file name
        t = ZKSplitLog.getFileName(t);
        ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(new Path(t));
        if (serverName != null) {
          knownFailedServers.add(serverName.getServerName());
        } else {
          LOG.warn("Found invalid WAL log file name:" + t);
        }
      }
    }
    // remove recovering regions that don't have any RS associated with them
    List<String> regions = ZKUtil.listChildrenNoWatch(watcher, watcher.znodePaths.recoveringRegionsZNode);
    if (regions != null) {
      int listSize = regions.size();
      for (int i = 0; i < listSize; i++) {
        String nodePath = ZKUtil.joinZNode(watcher.znodePaths.recoveringRegionsZNode, regions.get(i));
        List<String> regionFailedServers = ZKUtil.listChildrenNoWatch(watcher, nodePath);
        if (regionFailedServers == null || regionFailedServers.isEmpty()) {
          ZKUtil.deleteNode(watcher, nodePath);
          continue;
        }
        boolean needMoreRecovery = false;
        int tmpFailedServerSize = regionFailedServers.size();
        for (int j = 0; j < tmpFailedServerSize; j++) {
          if (knownFailedServers.contains(regionFailedServers.get(j))) {
            needMoreRecovery = true;
            break;
          }
        }
        if (!needMoreRecovery) {
          ZKUtil.deleteNodeRecursively(watcher, nodePath);
        }
      }
    }
  } catch (KeeperException e) {
    throw new IOException(e);
  }
}
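The first loop's core idiom is parse-then-filter: read a task znode's payload, tolerate deserialization failures, and skip tasks already marked done. A minimal sketch of that check as a standalone helper (hypothetical method name; uses only the SplitLogTask API already shown above):

import org.apache.hadoop.hbase.SplitLogTask;
import org.apache.hadoop.hbase.exceptions.DeserializationException;

// Hypothetical helper: true only when the payload parses and the task is finished.
// Missing or unparseable data is treated as not-done, mirroring the warn-and-continue above.
private static boolean isTaskDone(byte[] data) {
  if (data == null) {
    return false;
  }
  try {
    return SplitLogTask.parseFrom(data).isDone();
  } catch (DeserializationException e) {
    return false;
  }
}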
Use of org.apache.hadoop.hbase.SplitLogTask in project hbase by apache.
The class ZKSplitLogManagerCoordination, method setRecoveryMode.
/**
 * Sets the recovery mode based on outstanding split log tasks left over from a previous run,
 * or on the current configuration setting.
 * @param isForInitialization whether this is called during master initialization
 * @throws IOException
 */
@Override
public void setRecoveryMode(boolean isForInitialization) throws IOException {
  synchronized (this) {
    if (this.isDrainingDone) {
      // when there is no outstanding splitlogtask after master startup, we already have an
      // up-to-date recovery mode
      return;
    }
  }
  if (this.watcher == null) {
    // when watcher is null (testing code), recovery mode can only be LOG_SPLITTING
    synchronized (this) {
      this.isDrainingDone = true;
      this.recoveryMode = RecoveryMode.LOG_SPLITTING;
    }
    return;
  }
  boolean hasSplitLogTask = false;
  boolean hasRecoveringRegions = false;
  RecoveryMode previousRecoveryMode = RecoveryMode.UNKNOWN;
  RecoveryMode recoveryModeInConfig =
      isDistributedLogReplay(conf) ? RecoveryMode.LOG_REPLAY : RecoveryMode.LOG_SPLITTING;
  // first check if there are outstanding recovering regions
  try {
    List<String> regions = ZKUtil.listChildrenNoWatch(watcher, watcher.znodePaths.recoveringRegionsZNode);
    if (regions != null && !regions.isEmpty()) {
      hasRecoveringRegions = true;
      previousRecoveryMode = RecoveryMode.LOG_REPLAY;
    }
    if (previousRecoveryMode == RecoveryMode.UNKNOWN) {
      // second, check if there are outstanding split log tasks
      List<String> tasks = listSplitLogTasks();
      if (!tasks.isEmpty()) {
        hasSplitLogTask = true;
        if (isForInitialization) {
          // during initialization, try to get the recovery mode from a splitlogtask
          int listSize = tasks.size();
          for (int i = 0; i < listSize; i++) {
            String task = tasks.get(i);
            try {
              byte[] data =
                  ZKUtil.getData(this.watcher, ZKUtil.joinZNode(watcher.znodePaths.splitLogZNode, task));
              if (data == null) {
                continue;
              }
              SplitLogTask slt = SplitLogTask.parseFrom(data);
              previousRecoveryMode = slt.getMode();
              if (previousRecoveryMode == RecoveryMode.UNKNOWN) {
                // created by an old code base that didn't set the recovery mode in the
                // splitlogtask; we can safely set it to LOG_SPLITTING because we're in master
                // initialization code before SSH is enabled and there are no outstanding
                // recovering regions
                previousRecoveryMode = RecoveryMode.LOG_SPLITTING;
              }
              break;
            } catch (DeserializationException e) {
              LOG.warn("Failed parse data for znode " + task, e);
            } catch (InterruptedException e) {
              throw new InterruptedIOException();
            }
          }
        }
      }
    }
  } catch (KeeperException e) {
    throw new IOException(e);
  }
  synchronized (this) {
    if (this.isDrainingDone) {
      return;
    }
    if (!hasSplitLogTask && !hasRecoveringRegions) {
      this.isDrainingDone = true;
      this.recoveryMode = recoveryModeInConfig;
      return;
    } else if (!isForInitialization) {
      // splitlogtask hasn't drained yet, keep the existing recovery mode
      return;
    }
    if (previousRecoveryMode != RecoveryMode.UNKNOWN) {
      this.isDrainingDone = (previousRecoveryMode == recoveryModeInConfig);
      this.recoveryMode = previousRecoveryMode;
    } else {
      this.recoveryMode = recoveryModeInConfig;
    }
  }
}
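The initialization path above boils down to one probe per task: parse the payload, read its recorded mode, and treat UNKNOWN as LOG_SPLITTING because such tasks predate mode tracking. A hedged sketch of just that probe (hypothetical helper; RecoveryMode is assumed to be the same enum used throughout this class):

// Hypothetical helper: recover the mode recorded in a task znode's payload.
// Tasks written by older code carry no mode (UNKNOWN); during master initialization,
// before SSH is enabled, it is safe to interpret those as LOG_SPLITTING.
private static RecoveryMode modeFromTaskData(byte[] data) throws DeserializationException {
  RecoveryMode mode = SplitLogTask.parseFrom(data).getMode();
  return (mode == RecoveryMode.UNKNOWN) ? RecoveryMode.LOG_SPLITTING : mode;
}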
Use of org.apache.hadoop.hbase.SplitLogTask in project hbase by apache.
The class ZkSplitLogWorkerCoordination, method grabTask.
/**
 * Try to grab a 'lock' on the task znode to own and execute the task.
 * <p>
 * @param path zk node for the task
 * @return true if the task was successfully grabbed, otherwise false
 */
private boolean grabTask(String path) {
  Stat stat = new Stat();
  byte[] data;
  synchronized (grabTaskLock) {
    currentTask = path;
    workerInGrabTask = true;
    if (Thread.interrupted()) {
      return false;
    }
  }
  try {
    try {
      if ((data = ZKUtil.getDataNoWatch(watcher, path, stat)) == null) {
        SplitLogCounters.tot_wkr_failed_to_grab_task_no_data.increment();
        return false;
      }
    } catch (KeeperException e) {
      LOG.warn("Failed to get data for znode " + path, e);
      SplitLogCounters.tot_wkr_failed_to_grab_task_exception.increment();
      return false;
    }
    SplitLogTask slt;
    try {
      slt = SplitLogTask.parseFrom(data);
    } catch (DeserializationException e) {
      LOG.warn("Failed parse data for znode " + path, e);
      SplitLogCounters.tot_wkr_failed_to_grab_task_exception.increment();
      return false;
    }
    if (!slt.isUnassigned()) {
      SplitLogCounters.tot_wkr_failed_to_grab_task_owned.increment();
      return false;
    }
    currentVersion = attemptToOwnTask(true, watcher, server.getServerName(), path, stat.getVersion());
    if (currentVersion < 0) {
      SplitLogCounters.tot_wkr_failed_to_grab_task_lost_race.increment();
      return false;
    }
    if (ZKSplitLog.isRescanNode(watcher, currentTask)) {
      ZkSplitLogWorkerCoordination.ZkSplitTaskDetails splitTaskDetails =
          new ZkSplitLogWorkerCoordination.ZkSplitTaskDetails();
      splitTaskDetails.setTaskNode(currentTask);
      splitTaskDetails.setCurTaskZKVersion(new MutableInt(currentVersion));
      endTask(new SplitLogTask.Done(server.getServerName()),
          SplitLogCounters.tot_wkr_task_acquired_rescan, splitTaskDetails);
      return false;
    }
    LOG.info("worker " + server.getServerName() + " acquired task " + path);
    SplitLogCounters.tot_wkr_task_acquired.increment();
    getDataSetWatchAsync();
    submitTask(path, currentVersion, reportPeriod);
    // after a successful submit, sleep a little to allow other RSs to grab the remaining tasks
    try {
      int sleepTime = RandomUtils.nextInt(0, 500) + 500;
      Thread.sleep(sleepTime);
    } catch (InterruptedException e) {
      LOG.warn("Interrupted while yielding for other region servers", e);
      Thread.currentThread().interrupt();
    }
    return true;
  } finally {
    synchronized (grabTaskLock) {
      workerInGrabTask = false;
      // clear the interrupt from stopTask() otherwise the next task will suffer
      Thread.interrupted();
    }
  }
}
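The ownership step hidden inside attemptToOwnTask is a compare-and-swap: rewrite the task znode with an Owned marker only if its version is still the one observed when the data was read. A hedged sketch of that handshake (hypothetical method; the ZKWatcher type name and the RecoverableZooKeeper.setData call are assumptions based on how the TestSplitLogWorker snippet below talks to ZooKeeper):

// Hypothetical sketch of the CAS inside attemptToOwnTask: returns the new znode
// version on success, or -1 when another worker won the race or ZK failed.
static int tryOwnTask(ZKWatcher zkw, ServerName self, String path, int expectedVersion) {
  try {
    SplitLogTask owned = new SplitLogTask.Owned(self);
    Stat stat = zkw.getRecoverableZooKeeper().setData(path, owned.toByteArray(), expectedVersion);
    return stat.getVersion(); // later heartbeats CAS against this new version
  } catch (KeeperException.BadVersionException e) {
    return -1; // lost the race: another worker updated the task node first
  } catch (InterruptedException e) {
    Thread.currentThread().interrupt();
    return -1;
  } catch (KeeperException e) {
    return -1; // treat any other ZK failure as a failure to own
  }
}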
Use of org.apache.hadoop.hbase.SplitLogTask in project hbase by apache.
The class ZKSplitLogManagerCoordination, method getDataSetWatchSuccess.
private void getDataSetWatchSuccess(String path, byte[] data, int version)
    throws DeserializationException {
  if (data == null) {
    if (version == Integer.MIN_VALUE) {
      // assume all done. The task znode suddenly disappeared.
      setDone(path, SUCCESS);
      return;
    }
    SplitLogCounters.tot_mgr_null_data.increment();
    LOG.error(HBaseMarkers.FATAL, "logic error - got null data " + path);
    setDone(path, FAILURE);
    return;
  }
  data = ZKMetadata.removeMetaData(data);
  SplitLogTask slt = SplitLogTask.parseFrom(data);
  if (slt.isUnassigned()) {
    LOG.debug("Task not yet acquired " + path + ", ver=" + version);
    handleUnassignedTask(path);
  } else if (slt.isOwned()) {
    heartbeat(path, version, slt.getServerName());
  } else if (slt.isResigned()) {
    LOG.info("Task " + path + " entered state=" + slt.toString());
    resubmitOrFail(path, FORCE);
  } else if (slt.isDone()) {
    LOG.info("Task " + path + " entered state=" + slt.toString());
    if (taskFinisher != null && !ZKSplitLog.isRescanNode(watcher, path)) {
      if (taskFinisher.finish(slt.getServerName(), ZKSplitLog.getFileName(path)) == Status.DONE) {
        setDone(path, SUCCESS);
      } else {
        resubmitOrFail(path, CHECK);
      }
    } else {
      setDone(path, SUCCESS);
    }
  } else if (slt.isErr()) {
    LOG.info("Task " + path + " entered state=" + slt.toString());
    resubmitOrFail(path, CHECK);
  } else {
    LOG.error(HBaseMarkers.FATAL,
        "logic error - unexpected zk state for path = " + path + " data = " + slt.toString());
    setDone(path, FAILURE);
  }
}
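The dispatch above is driven entirely by SplitLogTask's state predicates. A small illustrative helper (hypothetical method; assumes the same ZKMetadata.removeMetaData preprocessing) that classifies a raw payload the way this method does:

// Hypothetical helper: map a raw task payload to its state name.
static String describeTask(byte[] rawData) throws DeserializationException {
  byte[] data = ZKMetadata.removeMetaData(rawData); // strip ZK client metadata first, as above
  SplitLogTask slt = SplitLogTask.parseFrom(data);
  if (slt.isUnassigned()) return "UNASSIGNED";
  if (slt.isOwned()) return "OWNED by " + slt.getServerName();
  if (slt.isResigned()) return "RESIGNED";
  if (slt.isDone()) return "DONE";
  if (slt.isErr()) return "ERR";
  return "UNEXPECTED: " + slt;
}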
Use of org.apache.hadoop.hbase.SplitLogTask in project hbase by apache.
The class TestSplitLogWorker, method testPreemptTask.
@Test
public void testPreemptTask() throws Exception {
  LOG.info("testPreemptTask");
  SplitLogCounters.resetCounters();
  final ServerName SRV = ServerName.valueOf("tpt_svr,1,1");
  final String PATH = ZKSplitLog.getEncodedNodeName(zkw, "tpt_task");
  RegionServerServices mockedRS = getRegionServer(SRV);
  SplitLogWorker slw = new SplitLogWorker(ds, TEST_UTIL.getConfiguration(), mockedRS, neverEndingTask);
  slw.start();
  try {
    // let the worker start
    Thread.yield();
    Thread.sleep(1000);
    waitForCounter(SplitLogCounters.tot_wkr_task_grabing, 0, 1, WAIT_TIME);
    // this time create a task node after starting the splitLogWorker
    zkw.getRecoverableZooKeeper().create(PATH, new SplitLogTask.Unassigned(MANAGER).toByteArray(),
        Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
    waitForCounter(SplitLogCounters.tot_wkr_task_acquired, 0, 1, WAIT_TIME);
    assertEquals(1, slw.getTaskReadySeq());
    byte[] bytes = ZKUtil.getData(zkw, PATH);
    SplitLogTask slt = SplitLogTask.parseFrom(bytes);
    assertTrue(slt.isOwned(SRV));
    slt = new SplitLogTask.Owned(MANAGER);
    ZKUtil.setData(zkw, PATH, slt.toByteArray());
    waitForCounter(SplitLogCounters.tot_wkr_preempt_task, 0, 1, WAIT_TIME);
  } finally {
    stopSplitLogWorker(slw);
  }
}
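The preemption in this test hinges on the toByteArray()/parseFrom() round trip: whoever last writes the znode owns the task. A standalone sketch of that round trip (hypothetical test name and server names; uses only constructors and predicates already shown above):

@Test
public void testSplitLogTaskRoundTrip() throws Exception {
  ServerName manager = ServerName.valueOf("mgr,1,1");
  ServerName worker = ServerName.valueOf("wkr,1,1");
  // The manager publishes an unassigned task; these bytes are what lands in the znode.
  byte[] unassigned = new SplitLogTask.Unassigned(manager).toByteArray();
  assertTrue(SplitLogTask.parseFrom(unassigned).isUnassigned());
  // A worker claims the task by rewriting the node with an Owned task under its own name.
  SplitLogTask owned = SplitLogTask.parseFrom(new SplitLogTask.Owned(worker).toByteArray());
  assertTrue(owned.isOwned(worker));
  assertFalse(owned.isUnassigned());
}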