Search in sources :

Example 1 with TaskBatch

use of org.apache.hadoop.hbase.master.SplitLogManager.TaskBatch in project hbase by apache.

the class TestDistributedLogSplitting method testWorkerAbort.

/**
 * Exercises the split-log worker failure path by aborting a region server
 * while a split task is outstanding.
 * <p>
 * The original intention of this test was to force an abort of a region
 * server and to make sure that the failure path in the region servers is
 * properly evaluated. But it is difficult to ensure that the region server
 * doesn't finish the log splitting before it aborts. In addition, there is
 * now a code path where the master preempts the region server once it
 * detects that the region server has aborted. For that reason the test
 * passes as soon as any one of the worker outcome counters (resigned, err,
 * final transition failed, done, preempted) becomes non-zero.
 *
 * @throws Exception if any of the setup, WAL creation or task submission
 *     steps throws
 */
@Ignore("Disabled because flakey")
@Test(timeout = 300000)
public void testWorkerAbort() throws Exception {
    LOG.info("testWorkerAbort");
    startCluster(3);
    final int NUM_LOG_LINES = 10000;
    final SplitLogManager slm = master.getMasterWalManager().getSplitLogManager();
    FileSystem fs = master.getMasterFileSystem().getFileSystem();
    final List<RegionServerThread> rsts = cluster.getLiveRegionServerThreads();
    HRegionServer hrs = findRSToKill(false, "table");
    Path rootdir = FSUtils.getRootDir(conf);
    final Path logDir = new Path(rootdir, AbstractFSWALProvider.getWALDirectoryName(hrs.getServerName().toString()));
    // Keep a reference to the watcher so it can be closed when the test ends;
    // previously it was created inline and never released.
    final ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "table-creation", null);
    Table t = null;
    try {
        t = installTable(zkw, "table", "family", 40);
        makeWAL(hrs, ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()), "table", "family", NUM_LOG_LINES, 100);
        new Thread() {

            @Override
            public void run() {
                // NOTE(review): presumably blocks until a worker has acquired a
                // task (counter 0 -> 1) or the 1000 timeout elapses - confirm
                // against waitForCounter.
                waitForCounter(tot_wkr_task_acquired, 0, 1, 1000);
                // Only the first live region server is aborted.
                if (!rsts.isEmpty()) {
                    rsts.get(0).getRegionServer().abort("testing");
                }
            }
        }.start();
        // slm.splitLogDistributed(logDir);
        FileStatus[] logfiles = fs.listStatus(logDir);
        TaskBatch batch = new TaskBatch();
        slm.enqueueSplitTask(logfiles[0].getPath().toString(), batch);
        // Like waitForCounter, but satisfied by any one of several counters.
        long curt = System.currentTimeMillis();
        long waitTime = 80000;
        long endt = curt + waitTime;
        while (curt < endt) {
            long outcomes = tot_wkr_task_resigned.get() + tot_wkr_task_err.get() + tot_wkr_final_transition_failed.get() + tot_wkr_task_done.get() + tot_wkr_preempt_task.get();
            if (outcomes == 0) {
                Thread.yield();
                curt = System.currentTimeMillis();
            } else {
                assertTrue(1 <= outcomes);
                return;
            }
        }
        fail("none of the following counters went up in " + waitTime + " milliseconds - " + "tot_wkr_task_resigned, tot_wkr_task_err, " + "tot_wkr_final_transition_failed, tot_wkr_task_done, " + "tot_wkr_preempt_task");
    } finally {
        try {
            if (t != null) {
                t.close();
            }
        } finally {
            // Release the watcher even if closing the table throws.
            zkw.close();
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Table(org.apache.hadoop.hbase.client.Table) FileStatus(org.apache.hadoop.fs.FileStatus) TaskBatch(org.apache.hadoop.hbase.master.SplitLogManager.TaskBatch) HRegionServer(org.apache.hadoop.hbase.regionserver.HRegionServer) MasterThread(org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread) RegionServerThread(org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread) ZooKeeperWatcher(org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher) FileSystem(org.apache.hadoop.fs.FileSystem) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 2 with TaskBatch

use of org.apache.hadoop.hbase.master.SplitLogManager.TaskBatch in project hbase by apache.

the class TestSplitLogManager method testMultipleResubmits.

/**
 * Checks that a task is not resubmitted indefinitely: with
 * {@code hbase.splitlog.max.resubmit} set to 2, the task is marked as owned
 * by three different workers in turn, and the test expects exactly two
 * non-forced resubmissions plus one "threshold reached" event.
 * <p>
 * The configuration override is undone in a {@code finally} block so that it
 * does not outlive this test when an assertion or wait fails.
 *
 * @throws Exception if any ZooKeeper or manager operation throws
 */
@Test(timeout = 180000)
public void testMultipleResubmits() throws Exception {
    LOG.info("TestMultipleResbmits - no indefinite resubmissions");
    conf.setInt("hbase.splitlog.max.resubmit", 2);
    try {
        slm = new SplitLogManager(master, conf);
        TaskBatch batch = new TaskBatch();
        String tasknode = submitTaskAndWait(batch, "foo/1");
        int version = ZKUtil.checkExists(zkw, tasknode);
        final ServerName worker1 = ServerName.valueOf("worker1,1,1");
        final ServerName worker2 = ServerName.valueOf("worker2,1,1");
        final ServerName worker3 = ServerName.valueOf("worker3,1,1");

        // First owner: expect one heartbeat, then the first resubmit.
        SplitLogTask slt = new SplitLogTask.Owned(worker1, this.mode);
        ZKUtil.setData(zkw, tasknode, slt.toByteArray());
        waitForCounter(tot_mgr_heartbeat, 0, 1, to / 2);
        waitForCounter(tot_mgr_resubmit, 0, 1, to + to / 2);
        int version1 = ZKUtil.checkExists(zkw, tasknode);
        assertTrue(version1 > version);

        // Second owner: expect a second heartbeat and the second resubmit.
        slt = new SplitLogTask.Owned(worker2, this.mode);
        ZKUtil.setData(zkw, tasknode, slt.toByteArray());
        waitForCounter(tot_mgr_heartbeat, 1, 2, to / 2);
        waitForCounter(tot_mgr_resubmit, 1, 2, to + to / 2);
        int version2 = ZKUtil.checkExists(zkw, tasknode);
        assertTrue(version2 > version1);

        // Third owner: the threshold counter should move instead of another
        // resubmit.
        slt = new SplitLogTask.Owned(worker3, this.mode);
        ZKUtil.setData(zkw, tasknode, slt.toByteArray());
        waitForCounter(tot_mgr_heartbeat, 2, 3, to / 2);
        waitForCounter(tot_mgr_resubmit_threshold_reached, 0, 1, to + to / 2);
        // Wait a further to + to / 2 before checking that no extra,
        // non-forced resubmit was recorded.
        Thread.sleep(to + to / 2);
        assertEquals(2L, tot_mgr_resubmit.get() - tot_mgr_resubmit_force.get());
    } finally {
        // Same reset that testTaskErr applies to this key.
        conf.setInt("hbase.splitlog.max.resubmit", ZKSplitLogManagerCoordination.DEFAULT_MAX_RESUBMIT);
    }
}
Also used : ServerName(org.apache.hadoop.hbase.ServerName) TaskBatch(org.apache.hadoop.hbase.master.SplitLogManager.TaskBatch) SplitLogTask(org.apache.hadoop.hbase.SplitLogTask) Test(org.junit.Test)

Example 3 with TaskBatch

use of org.apache.hadoop.hbase.master.SplitLogManager.TaskBatch in project hbase by apache.

the class TestSplitLogManager method testTaskDone.

/**
 * Submits one task, writes a DONE state for it to the task node, waits for
 * the batch to report the task as done, and then checks that the task node
 * no longer exists.
 *
 * @throws Exception if any ZooKeeper or manager operation throws
 */
@Test(timeout = 180000)
public void testTaskDone() throws Exception {
    LOG.info("TestTaskDone - cleanup task node once in DONE state");
    slm = new SplitLogManager(master, conf);
    final TaskBatch taskBatch = new TaskBatch();
    final String taskPath = submitTaskAndWait(taskBatch, "foo/1");

    // Overwrite the task node with a DONE record attributed to "worker1".
    final ServerName finisher = ServerName.valueOf("worker1,1,1");
    ZKUtil.setData(zkw, taskPath, new SplitLogTask.Done(finisher, this.mode).toByteArray());

    // Block until the batch's done count matches its installed count.
    synchronized (taskBatch) {
        while (true) {
            if (taskBatch.installed == taskBatch.done) {
                break;
            }
            taskBatch.wait();
        }
    }

    waitForCounter(tot_mgr_task_deleted, 0, 1, to / 2);
    final int existsResult = ZKUtil.checkExists(zkw, taskPath);
    assertTrue(existsResult == -1);
}
Also used : ServerName(org.apache.hadoop.hbase.ServerName) TaskBatch(org.apache.hadoop.hbase.master.SplitLogManager.TaskBatch) SplitLogTask(org.apache.hadoop.hbase.SplitLogTask) Test(org.junit.Test)

Example 4 with TaskBatch

use of org.apache.hadoop.hbase.master.SplitLogManager.TaskBatch in project hbase by apache.

the class TestSplitLogManager method testWorkerCrash.

/**
 * Submits one task, marks it as owned by "worker1", then makes the mocked
 * server manager report that worker as offline and expects exactly one
 * resubmission shortly afterwards.
 *
 * @throws Exception if any ZooKeeper or manager operation throws
 */
@Test(timeout = 180000)
public void testWorkerCrash() throws Exception {
    slm = new SplitLogManager(master, conf);
    final TaskBatch taskBatch = new TaskBatch();
    final String taskPath = submitTaskAndWait(taskBatch, "foo/1");

    final ServerName owner = ServerName.valueOf("worker1,1,1");
    ZKUtil.setData(zkw, taskPath, new SplitLogTask.Owned(owner, this.mode).toByteArray());

    // Only wait for the heartbeat if none has been counted yet.
    if (tot_mgr_heartbeat.get() == 0) {
        waitForCounter(tot_mgr_heartbeat, 0, 1, to / 2);
    }

    // Nothing has been resubmitted so far.
    Assert.assertEquals(0, tot_mgr_resubmit.get());

    // From now on the owning server is reported as dead.
    Mockito.when(sm.isServerOnline(owner)).thenReturn(false);

    // The timeout checker runs every 1000 ms (hardcoded), so sleep a bit longer.
    Thread.sleep(1300);

    // The task must have been resubmitted exactly once.
    Assert.assertEquals(1, tot_mgr_resubmit.get());
}
Also used : ServerName(org.apache.hadoop.hbase.ServerName) TaskBatch(org.apache.hadoop.hbase.master.SplitLogManager.TaskBatch) SplitLogTask(org.apache.hadoop.hbase.SplitLogTask) Test(org.junit.Test)

Example 5 with TaskBatch

use of org.apache.hadoop.hbase.master.SplitLogManager.TaskBatch in project hbase by apache.

the class TestSplitLogManager method testTaskErr.

/**
 * Submits one task with {@code hbase.splitlog.max.resubmit} set to 0, writes
 * an ERR state for it to the task node, waits for the batch to report the
 * task as errored, and then checks that the task node no longer exists.
 * <p>
 * The configuration override is reset in a {@code finally} block so the
 * reset also happens when an assertion, wait or timeout aborts the test.
 *
 * @throws Exception if any ZooKeeper or manager operation throws
 */
@Test(timeout = 180000)
public void testTaskErr() throws Exception {
    LOG.info("TestTaskErr - cleanup task node once in ERR state");
    conf.setInt("hbase.splitlog.max.resubmit", 0);
    try {
        slm = new SplitLogManager(master, conf);
        TaskBatch batch = new TaskBatch();
        String tasknode = submitTaskAndWait(batch, "foo/1");
        final ServerName worker1 = ServerName.valueOf("worker1,1,1");
        // Overwrite the task node with an ERR record attributed to "worker1".
        SplitLogTask slt = new SplitLogTask.Err(worker1, this.mode);
        ZKUtil.setData(zkw, tasknode, slt.toByteArray());
        // Block until the batch's error count matches its installed count.
        synchronized (batch) {
            while (batch.installed != batch.error) {
                batch.wait();
            }
        }
        waitForCounter(tot_mgr_task_deleted, 0, 1, to / 2);
        assertTrue(ZKUtil.checkExists(zkw, tasknode) == -1);
    } finally {
        conf.setInt("hbase.splitlog.max.resubmit", ZKSplitLogManagerCoordination.DEFAULT_MAX_RESUBMIT);
    }
}
Also used : ServerName(org.apache.hadoop.hbase.ServerName) TaskBatch(org.apache.hadoop.hbase.master.SplitLogManager.TaskBatch) SplitLogTask(org.apache.hadoop.hbase.SplitLogTask) Test(org.junit.Test)

Aggregations

TaskBatch (org.apache.hadoop.hbase.master.SplitLogManager.TaskBatch)10 Test (org.junit.Test)10 SplitLogTask (org.apache.hadoop.hbase.SplitLogTask)9 ServerName (org.apache.hadoop.hbase.ServerName)8 FileStatus (org.apache.hadoop.fs.FileStatus)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 Path (org.apache.hadoop.fs.Path)1 Table (org.apache.hadoop.hbase.client.Table)1 HRegionServer (org.apache.hadoop.hbase.regionserver.HRegionServer)1 MasterThread (org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread)1 RegionServerThread (org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread)1 ZooKeeperWatcher (org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher)1 Ignore (org.junit.Ignore)1