Use of org.apache.hadoop.hbase.master.SplitLogManager.TaskBatch in project hbase by apache.
The class TestDistributedLogSplitting, method testWorkerAbort.
/**
 * The original intention of this test was to force an abort of a region
 * server and to make sure that the failure path in the region servers is
 * properly evaluated. However, it is difficult to ensure that the region
 * server does not finish the log splitting before it aborts. In addition,
 * there is now a code path where the master will preempt the region server's
 * task when it detects that the region server has aborted.
 * @throws Exception
 */
@Ignore("Disabled because flakey")
@Test(timeout = 300000)
public void testWorkerAbort() throws Exception {
LOG.info("testWorkerAbort");
startCluster(3);
final int NUM_LOG_LINES = 10000;
final SplitLogManager slm = master.getMasterWalManager().getSplitLogManager();
FileSystem fs = master.getMasterFileSystem().getFileSystem();
final List<RegionServerThread> rsts = cluster.getLiveRegionServerThreads();
HRegionServer hrs = findRSToKill(false, "table");
Path rootdir = FSUtils.getRootDir(conf);
final Path logDir = new Path(rootdir, AbstractFSWALProvider.getWALDirectoryName(hrs.getServerName().toString()));
Table t = installTable(new ZooKeeperWatcher(conf, "table-creation", null), "table", "family", 40);
try {
makeWAL(hrs, ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()), "table", "family", NUM_LOG_LINES, 100);
new Thread() {
@Override
public void run() {
waitForCounter(tot_wkr_task_acquired, 0, 1, 1000);
for (RegionServerThread rst : rsts) {
rst.getRegionServer().abort("testing");
break;
}
}
}.start();
// slm.splitLogDistributed(logDir);
FileStatus[] logfiles = fs.listStatus(logDir);
TaskBatch batch = new TaskBatch();
slm.enqueueSplitTask(logfiles[0].getPath().toString(), batch);
//waitForCounter but for one of the 2 counters
long curt = System.currentTimeMillis();
long waitTime = 80000;
long endt = curt + waitTime;
while (curt < endt) {
if ((tot_wkr_task_resigned.get() + tot_wkr_task_err.get() + tot_wkr_final_transition_failed.get() + tot_wkr_task_done.get() + tot_wkr_preempt_task.get()) == 0) {
Thread.yield();
curt = System.currentTimeMillis();
} else {
assertTrue(1 <= (tot_wkr_task_resigned.get() + tot_wkr_task_err.get() + tot_wkr_final_transition_failed.get() + tot_wkr_task_done.get() + tot_wkr_preempt_task.get()));
return;
}
}
fail("none of the following counters went up in " + waitTime + " milliseconds - " + "tot_wkr_task_resigned, tot_wkr_task_err, " + "tot_wkr_final_transition_failed, tot_wkr_task_done, " + "tot_wkr_preempt_task");
} finally {
if (t != null)
t.close();
}
}
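The polling loop above is essentially waitForCounter generalized to several counters: it succeeds as soon as any one of them moves. A small helper would make that intent explicit. The sketch below is hypothetical (waitForAnyCounter is not an HBase test utility) and assumes the counters are AtomicLong values, as the .get() calls above suggest.

import java.util.concurrent.atomic.AtomicLong;
import static org.junit.Assert.fail;

// Hypothetical helper: wait until at least one of the given counters becomes
// non-zero, or fail the test once waitTimeMs has elapsed.
private static void waitForAnyCounter(long waitTimeMs, AtomicLong... counters) {
  long deadline = System.currentTimeMillis() + waitTimeMs;
  while (System.currentTimeMillis() < deadline) {
    long sum = 0;
    for (AtomicLong counter : counters) {
      sum += counter.get();
    }
    if (sum > 0) {
      return; // at least one worker counter moved
    }
    Thread.yield();
  }
  fail("none of the given counters went up in " + waitTimeMs + " ms");
}

With a helper like this, the loop in the test collapses to a single call such as waitForAnyCounter(waitTime, tot_wkr_task_resigned, tot_wkr_task_err, tot_wkr_final_transition_failed, tot_wkr_task_done, tot_wkr_preempt_task).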
Use of org.apache.hadoop.hbase.master.SplitLogManager.TaskBatch in project hbase by apache.
The class TestSplitLogManager, method testMultipleResubmits.
@Test(timeout = 180000)
public void testMultipleResubmits() throws Exception {
  LOG.info("TestMultipleResubmits - no indefinite resubmissions");
  conf.setInt("hbase.splitlog.max.resubmit", 2);
  slm = new SplitLogManager(master, conf);
  TaskBatch batch = new TaskBatch();
  String tasknode = submitTaskAndWait(batch, "foo/1");
  int version = ZKUtil.checkExists(zkw, tasknode);
  final ServerName worker1 = ServerName.valueOf("worker1,1,1");
  final ServerName worker2 = ServerName.valueOf("worker2,1,1");
  final ServerName worker3 = ServerName.valueOf("worker3,1,1");
  SplitLogTask slt = new SplitLogTask.Owned(worker1, this.mode);
  ZKUtil.setData(zkw, tasknode, slt.toByteArray());
  waitForCounter(tot_mgr_heartbeat, 0, 1, to / 2);
  waitForCounter(tot_mgr_resubmit, 0, 1, to + to / 2);
  int version1 = ZKUtil.checkExists(zkw, tasknode);
  assertTrue(version1 > version);
  slt = new SplitLogTask.Owned(worker2, this.mode);
  ZKUtil.setData(zkw, tasknode, slt.toByteArray());
  waitForCounter(tot_mgr_heartbeat, 1, 2, to / 2);
  waitForCounter(tot_mgr_resubmit, 1, 2, to + to / 2);
  int version2 = ZKUtil.checkExists(zkw, tasknode);
  assertTrue(version2 > version1);
  slt = new SplitLogTask.Owned(worker3, this.mode);
  ZKUtil.setData(zkw, tasknode, slt.toByteArray());
  waitForCounter(tot_mgr_heartbeat, 2, 3, to / 2);
  waitForCounter(tot_mgr_resubmit_threshold_reached, 0, 1, to + to / 2);
  Thread.sleep(to + to / 2);
  assertEquals(2L, tot_mgr_resubmit.get() - tot_mgr_resubmit_force.get());
}
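The closing assertEquals hard-codes the same limit of 2 that was set at the top of the test. One way to keep the assertion and the setting in sync is to read the limit back from the configuration. This is a suggested variant, not code from the HBase repository, reusing the conf object and counters from the test above.

// Derive the expected number of resubmits from the configured limit instead of
// repeating the literal 2.
int maxResubmit = conf.getInt("hbase.splitlog.max.resubmit",
    ZKSplitLogManagerCoordination.DEFAULT_MAX_RESUBMIT);
assertEquals((long) maxResubmit, tot_mgr_resubmit.get() - tot_mgr_resubmit_force.get());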
Use of org.apache.hadoop.hbase.master.SplitLogManager.TaskBatch in project hbase by apache.
The class TestSplitLogManager, method testTaskDone.
@Test(timeout = 180000)
public void testTaskDone() throws Exception {
  LOG.info("TestTaskDone - cleanup task node once in DONE state");
  slm = new SplitLogManager(master, conf);
  TaskBatch batch = new TaskBatch();
  String tasknode = submitTaskAndWait(batch, "foo/1");
  final ServerName worker1 = ServerName.valueOf("worker1,1,1");
  SplitLogTask slt = new SplitLogTask.Done(worker1, this.mode);
  ZKUtil.setData(zkw, tasknode, slt.toByteArray());
  synchronized (batch) {
    while (batch.installed != batch.done) {
      batch.wait();
    }
  }
  waitForCounter(tot_mgr_task_deleted, 0, 1, to / 2);
  assertTrue(ZKUtil.checkExists(zkw, tasknode) == -1);
}
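The synchronized wait on the batch appears again in testTaskErr below, just against a different field. A small helper could factor out the pattern. The sketch is hypothetical (not an existing HBase utility) and assumes, as the tests suggest, that TaskBatch exposes installed, done, and error as public int fields and that the SplitLogManager notifies on the batch monitor when it updates them.

import java.util.function.ToIntFunction;

// Hypothetical helper: block until every installed task in the batch is reflected
// in the chosen completion field (done or error).
private static void waitForBatch(TaskBatch batch, ToIntFunction<TaskBatch> completedField)
    throws InterruptedException {
  synchronized (batch) {
    while (batch.installed != completedField.applyAsInt(batch)) {
      batch.wait();
    }
  }
}

testTaskDone above would then call waitForBatch(batch, b -> b.done), and testTaskErr below waitForBatch(batch, b -> b.error).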
Use of org.apache.hadoop.hbase.master.SplitLogManager.TaskBatch in project hbase by apache.
The class TestSplitLogManager, method testWorkerCrash.
@Test(timeout = 180000)
public void testWorkerCrash() throws Exception {
  slm = new SplitLogManager(master, conf);
  TaskBatch batch = new TaskBatch();
  String tasknode = submitTaskAndWait(batch, "foo/1");
  final ServerName worker1 = ServerName.valueOf("worker1,1,1");
  SplitLogTask slt = new SplitLogTask.Owned(worker1, this.mode);
  ZKUtil.setData(zkw, tasknode, slt.toByteArray());
  if (tot_mgr_heartbeat.get() == 0) {
    waitForCounter(tot_mgr_heartbeat, 0, 1, to / 2);
  }
  // Not yet resubmitted.
  Assert.assertEquals(0, tot_mgr_resubmit.get());
  // This server becomes dead.
  Mockito.when(sm.isServerOnline(worker1)).thenReturn(false);
  // The timeout monitor runs every 1000 ms (hardcoded).
  Thread.sleep(1300);
  // The task has been resubmitted.
  Assert.assertEquals(1, tot_mgr_resubmit.get());
}
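The Thread.sleep(1300) relies on the timeout monitor's hardcoded 1000 ms period. An alternative is to poll for the counter with the org.apache.hadoop.hbase.Waiter test utility; the sketch below is a suggested variant, not the code the HBase test uses, and assumes the same conf and to fields as above.

// Poll until the task has been resubmitted, instead of sleeping a fixed 1300 ms.
// In this three-argument form, Waiter.waitFor fails the test if the predicate
// does not become true within the timeout.
Waiter.waitFor(conf, to * 2, new Waiter.Predicate<Exception>() {
  @Override
  public boolean evaluate() throws Exception {
    return tot_mgr_resubmit.get() >= 1;
  }
});
Assert.assertEquals(1, tot_mgr_resubmit.get());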
Use of org.apache.hadoop.hbase.master.SplitLogManager.TaskBatch in project hbase by apache.
The class TestSplitLogManager, method testTaskErr.
@Test(timeout = 180000)
public void testTaskErr() throws Exception {
  LOG.info("TestTaskErr - cleanup task node once in ERR state");
  conf.setInt("hbase.splitlog.max.resubmit", 0);
  slm = new SplitLogManager(master, conf);
  TaskBatch batch = new TaskBatch();
  String tasknode = submitTaskAndWait(batch, "foo/1");
  final ServerName worker1 = ServerName.valueOf("worker1,1,1");
  SplitLogTask slt = new SplitLogTask.Err(worker1, this.mode);
  ZKUtil.setData(zkw, tasknode, slt.toByteArray());
  synchronized (batch) {
    while (batch.installed != batch.error) {
      batch.wait();
    }
  }
  waitForCounter(tot_mgr_task_deleted, 0, 1, to / 2);
  assertTrue(ZKUtil.checkExists(zkw, tasknode) == -1);
  conf.setInt("hbase.splitlog.max.resubmit", ZKSplitLogManagerCoordination.DEFAULT_MAX_RESUBMIT);
}
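Note that the restore of hbase.splitlog.max.resubmit on the last line is skipped if an earlier assertion or wait fails, which can leak the zero limit into later tests. A minimal sketch of a finally-based variant follows (a suggested restructuring, not how the HBase test is written).

conf.setInt("hbase.splitlog.max.resubmit", 0);
try {
  // ... body of testTaskErr as shown above ...
} finally {
  // Restore the default even if an assertion or wait fails.
  conf.setInt("hbase.splitlog.max.resubmit", ZKSplitLogManagerCoordination.DEFAULT_MAX_RESUBMIT);
}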