Use of org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread in project hbase by apache.
The class TestDistributedLogSplitting, method testRecoveredEdits.
@Ignore("DLR is broken by HBASE-12751")
@Test(timeout = 300000)
public void testRecoveredEdits() throws Exception {
LOG.info("testRecoveredEdits");
// create more than one wal
conf.setLong("hbase.regionserver.hlog.blocksize", 30 * 1024);
conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, false);
startCluster(NUM_RS);
final int NUM_LOG_LINES = 1000;
final SplitLogManager slm = master.getMasterWalManager().getSplitLogManager();
// turn off load balancing to prevent regions from moving around otherwise
// they will consume recovered.edits
master.balanceSwitch(false);
FileSystem fs = master.getMasterFileSystem().getFileSystem();
List<RegionServerThread> rsts = cluster.getLiveRegionServerThreads();
Path rootdir = FSUtils.getRootDir(conf);
Table t = installTable(new ZooKeeperWatcher(conf, "table-creation", null), "table", "family", 40);
try {
TableName table = t.getName();
List<HRegionInfo> regions = null;
HRegionServer hrs = null;
for (int i = 0; i < NUM_RS; i++) {
boolean foundRs = false;
hrs = rsts.get(i).getRegionServer();
regions = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices());
for (HRegionInfo region : regions) {
if (region.getTable().getNameAsString().equalsIgnoreCase("table")) {
foundRs = true;
break;
}
}
if (foundRs)
break;
}
final Path logDir = new Path(rootdir, AbstractFSWALProvider.getWALDirectoryName(hrs.getServerName().toString()));
LOG.info("#regions = " + regions.size());
Iterator<HRegionInfo> it = regions.iterator();
while (it.hasNext()) {
HRegionInfo region = it.next();
if (region.getTable().getNamespaceAsString().equals(NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR)) {
it.remove();
}
}
makeWAL(hrs, regions, "table", "family", NUM_LOG_LINES, 100);
slm.splitLogDistributed(logDir);
int count = 0;
for (HRegionInfo hri : regions) {
Path tdir = FSUtils.getTableDir(rootdir, table);
Path editsdir = WALSplitter.getRegionDirRecoveredEditsDir(HRegion.getRegionDir(tdir, hri.getEncodedName()));
LOG.debug("checking edits dir " + editsdir);
FileStatus[] files = fs.listStatus(editsdir, new PathFilter() {
@Override
public boolean accept(Path p) {
if (WALSplitter.isSequenceIdFile(p)) {
return false;
}
return true;
}
});
assertTrue("edits dir should have more than a single file in it. instead has " + files.length, files.length > 1);
for (int i = 0; i < files.length; i++) {
int c = countWAL(files[i].getPath(), fs, conf);
count += c;
}
LOG.info(count + " edits in " + files.length + " recovered edits files.");
}
// check that the log file is moved
assertFalse(fs.exists(logDir));
assertEquals(NUM_LOG_LINES, count);
} finally {
if (t != null)
t.close();
}
}
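The loop over live region server threads at the top of this test is a recurring pattern in the class. As a rough sketch (not part of the HBase source; the class name RegionServerFinder and method findServerForTable are hypothetical, and the import path of ProtobufUtil varies across HBase versions), the lookup could be factored into a standalone helper:

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;

public final class RegionServerFinder {
  private RegionServerFinder() {
  }

  // Returns the first live region server carrying at least one region of the
  // named table, or null when no such server exists.
  public static HRegionServer findServerForTable(MiniHBaseCluster cluster, String tableName)
      throws IOException {
    for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
      HRegionServer hrs = rst.getRegionServer();
      List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices());
      for (HRegionInfo region : regions) {
        if (region.getTable().getNameAsString().equalsIgnoreCase(tableName)) {
          return hrs;
        }
      }
    }
    return null;
  }
}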
Use of org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread in project hbase by apache.
The class TestDistributedLogSplitting, method testWorkerAbort.
/**
 * The original intention of this test was to force an abort of a region
 * server and to make sure that the failure path in the region servers is
 * properly evaluated. But it is difficult to ensure that the region server
 * doesn't finish the log splitting before it aborts. Also, there is now a
 * code path where the master will preempt the region server when it detects
 * that the region server has aborted.
 * @throws Exception
 */
@Ignore("Disabled because flakey")
@Test(timeout = 300000)
public void testWorkerAbort() throws Exception {
LOG.info("testWorkerAbort");
startCluster(3);
final int NUM_LOG_LINES = 10000;
final SplitLogManager slm = master.getMasterWalManager().getSplitLogManager();
FileSystem fs = master.getMasterFileSystem().getFileSystem();
final List<RegionServerThread> rsts = cluster.getLiveRegionServerThreads();
HRegionServer hrs = findRSToKill(false, "table");
Path rootdir = FSUtils.getRootDir(conf);
final Path logDir = new Path(rootdir, AbstractFSWALProvider.getWALDirectoryName(hrs.getServerName().toString()));
Table t = installTable(new ZooKeeperWatcher(conf, "table-creation", null), "table", "family", 40);
try {
makeWAL(hrs, ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices()), "table", "family", NUM_LOG_LINES, 100);
new Thread() {
@Override
public void run() {
waitForCounter(tot_wkr_task_acquired, 0, 1, 1000);
for (RegionServerThread rst : rsts) {
rst.getRegionServer().abort("testing");
break;
}
}
}.start();
// slm.splitLogDistributed(logDir);
FileStatus[] logfiles = fs.listStatus(logDir);
TaskBatch batch = new TaskBatch();
slm.enqueueSplitTask(logfiles[0].getPath().toString(), batch);
    // like waitForCounter, but waiting for any one of several worker counters
    long curt = System.currentTimeMillis();
    long waitTime = 80000;
    long endt = curt + waitTime;
    while (curt < endt) {
      if ((tot_wkr_task_resigned.get() + tot_wkr_task_err.get()
          + tot_wkr_final_transition_failed.get() + tot_wkr_task_done.get()
          + tot_wkr_preempt_task.get()) == 0) {
        Thread.yield();
        curt = System.currentTimeMillis();
      } else {
        assertTrue(1 <= (tot_wkr_task_resigned.get() + tot_wkr_task_err.get()
            + tot_wkr_final_transition_failed.get() + tot_wkr_task_done.get()
            + tot_wkr_preempt_task.get()));
        return;
      }
    }
    fail("none of the following counters went up in " + waitTime + " milliseconds - "
        + "tot_wkr_task_resigned, tot_wkr_task_err, "
        + "tot_wkr_final_transition_failed, tot_wkr_task_done, "
        + "tot_wkr_preempt_task");
  } finally {
    if (t != null) {
      t.close();
    }
  }
}
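The hand-rolled polling loop above waits for any one of five worker counters to go up. The same idea can be expressed as a small version-neutral helper that polls arbitrary counter suppliers until a deadline; this is a sketch (waitForAnyCounter is a hypothetical name, and the test's counters would be passed as method references such as tot_wkr_task_done::get, or ::sum, depending on the counter type in the HBase version at hand):

import java.util.function.LongSupplier;

public final class CounterWaiter {
  private CounterWaiter() {
  }

  // Polls until the sum of all supplied counters becomes nonzero or the
  // timeout elapses. Returns true if some counter went up in time.
  public static boolean waitForAnyCounter(long timeoutMs, LongSupplier... counters)
      throws InterruptedException {
    long deadline = System.currentTimeMillis() + timeoutMs;
    while (System.currentTimeMillis() < deadline) {
      long sum = 0;
      for (LongSupplier counter : counters) {
        sum += counter.getAsLong();
      }
      if (sum > 0) {
        return true;
      }
      Thread.sleep(10); // brief back-off instead of a busy Thread.yield() spin
    }
    return false;
  }
}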
Use of org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread in project hbase by apache.
The class TestDistributedLogSplitting, method testNonceRecovery.
@Ignore("DLR is broken by HBASE-12751")
@Test(timeout = 300000)
public void testNonceRecovery() throws Exception {
LOG.info("testNonceRecovery");
final String TABLE_NAME = "table";
final String FAMILY_NAME = "family";
final int NUM_REGIONS_TO_CREATE = 40;
conf.setLong("hbase.regionserver.hlog.blocksize", 100 * 1024);
conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, true);
startCluster(NUM_RS);
master.balanceSwitch(false);
final ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "table-creation", null);
Table ht = installTable(zkw, TABLE_NAME, FAMILY_NAME, NUM_REGIONS_TO_CREATE);
NonceGeneratorWithDups ng = new NonceGeneratorWithDups();
NonceGenerator oldNg = ConnectionUtils.injectNonceGeneratorForTesting((ClusterConnection) TEST_UTIL.getConnection(), ng);
try {
List<Increment> reqs = new ArrayList<>();
for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
HRegionServer hrs = rst.getRegionServer();
List<HRegionInfo> hris = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices());
for (HRegionInfo hri : hris) {
if (TABLE_NAME.equalsIgnoreCase(hri.getTable().getNameAsString())) {
byte[] key = hri.getStartKey();
if (key == null || key.length == 0) {
key = Bytes.copy(hri.getEndKey());
--(key[key.length - 1]);
}
Increment incr = new Increment(key);
incr.addColumn(Bytes.toBytes(FAMILY_NAME), Bytes.toBytes("q"), 1);
ht.increment(incr);
reqs.add(incr);
}
}
}
HRegionServer hrs = findRSToKill(false, "table");
abortRSAndWaitForRecovery(hrs, zkw, NUM_REGIONS_TO_CREATE);
ng.startDups();
for (Increment incr : reqs) {
try {
ht.increment(incr);
fail("should have thrown");
} catch (OperationConflictException ope) {
LOG.debug("Caught as expected: " + ope.getMessage());
}
}
} finally {
ConnectionUtils.injectNonceGeneratorForTesting((ClusterConnection) TEST_UTIL.getConnection(), oldNg);
if (ht != null)
ht.close();
if (zkw != null)
zkw.close();
}
}
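The start-key/end-key arithmetic in this test picks a row that is guaranteed to sort inside a given region: the first region of a table has an empty start key, so the test falls back to the end key with its last byte decremented. Factored out, the trick looks like the sketch below (keyInsideRegion is a hypothetical helper; it assumes the region has a non-empty end key, i.e. the table has more than one region, which holds for the 40-region table created here):

import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.util.Bytes;

public final class RegionKeys {
  private RegionKeys() {
  }

  // Returns a row key that sorts inside the given region: the start key when
  // it is non-empty, otherwise the end key with its last byte decremented.
  // Assumes a non-empty end key (does not hold for a single-region table).
  public static byte[] keyInsideRegion(HRegionInfo hri) {
    byte[] key = hri.getStartKey();
    if (key == null || key.length == 0) {
      key = Bytes.copy(hri.getEndKey());
      --key[key.length - 1];
    }
    return key;
  }
}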
Use of org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread in project hbase by apache.
The class TestDistributedLogSplitting, method testReplayCmd.
@Ignore("DLR is broken by HBASE-12751")
@Test(timeout = 300000)
public void testReplayCmd() throws Exception {
LOG.info("testReplayCmd");
conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, true);
startCluster(NUM_RS);
final int NUM_REGIONS_TO_CREATE = 40;
// turn off load balancing to prevent regions from moving around otherwise
// they will consume recovered.edits
master.balanceSwitch(false);
List<RegionServerThread> rsts = cluster.getLiveRegionServerThreads();
final ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "table-creation", null);
Table ht = installTable(zkw, "table", "family", NUM_REGIONS_TO_CREATE);
try {
List<HRegionInfo> regions = null;
HRegionServer hrs = null;
for (int i = 0; i < NUM_RS; i++) {
boolean isCarryingMeta = false;
hrs = rsts.get(i).getRegionServer();
regions = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices());
for (HRegionInfo region : regions) {
if (region.isMetaRegion()) {
isCarryingMeta = true;
break;
}
}
if (isCarryingMeta) {
continue;
}
if (regions.size() > 0)
break;
}
this.prepareData(ht, Bytes.toBytes("family"), Bytes.toBytes("c1"));
String originalCheckSum = TEST_UTIL.checksumRows(ht);
    // abort the RS and trigger replay
    abortRSAndWaitForRecovery(hrs, zkw, NUM_REGIONS_TO_CREATE);
    assertEquals("Data should remain after reopening of regions",
        originalCheckSum, TEST_UTIL.checksumRows(ht));
  } finally {
    if (ht != null) {
      ht.close();
    }
    if (zkw != null) {
      zkw.close();
    }
  }
}
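The first loop in testReplayCmd skips any server carrying hbase:meta, since aborting the meta server would exercise a different recovery path. That selection can also be written as a helper along the lines of the following sketch (findServerWithoutMeta and MetaAwareServerFinder are hypothetical names, reusing the same calls as the test):

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;

public final class MetaAwareServerFinder {
  private MetaAwareServerFinder() {
  }

  // Returns the first live region server that hosts user regions but not
  // hbase:meta, or null if every candidate carries meta or hosts nothing.
  public static HRegionServer findServerWithoutMeta(MiniHBaseCluster cluster)
      throws IOException {
    for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
      HRegionServer hrs = rst.getRegionServer();
      List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices());
      boolean carriesMeta = false;
      for (HRegionInfo region : regions) {
        if (region.isMetaRegion()) {
          carriesMeta = true;
          break;
        }
      }
      if (!carriesMeta && !regions.isEmpty()) {
        return hrs;
      }
    }
    return null;
  }
}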
Use of org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread in project hbase by apache.
The class TestDistributedLogSplitting, method testDisallowWritesInRecovering.
@Ignore("DLR is broken by HBASE-12751")
@Test(timeout = 300000)
public void testDisallowWritesInRecovering() throws Exception {
LOG.info("testDisallowWritesInRecovering");
conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, true);
conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 3);
conf.setBoolean(HConstants.DISALLOW_WRITES_IN_RECOVERING, true);
startCluster(NUM_RS);
final int NUM_REGIONS_TO_CREATE = 40;
// turn off load balancing to prevent regions from moving around otherwise
// they will consume recovered.edits
master.balanceSwitch(false);
List<RegionServerThread> rsts = cluster.getLiveRegionServerThreads();
final ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "table-creation", null);
Table ht = installTable(zkw, "table", "family", NUM_REGIONS_TO_CREATE);
try {
final SplitLogManager slm = master.getMasterWalManager().getSplitLogManager();
Set<HRegionInfo> regionSet = new HashSet<>();
HRegionInfo region = null;
HRegionServer hrs = null;
HRegionServer dstRS = null;
for (int i = 0; i < NUM_RS; i++) {
hrs = rsts.get(i).getRegionServer();
List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices());
if (regions.isEmpty())
continue;
region = regions.get(0);
regionSet.add(region);
dstRS = rsts.get((i + 1) % NUM_RS).getRegionServer();
break;
}
slm.markRegionsRecovering(hrs.getServerName(), regionSet);
    // move the region so that it is opened in the recovering state
    final HRegionInfo hri = region;
    final HRegionServer tmpRS = dstRS;
    TEST_UTIL.getAdmin().move(region.getEncodedNameAsBytes(),
        Bytes.toBytes(dstRS.getServerName().getServerName()));
    // wait for the region move to complete
    final RegionStates regionStates =
        TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates();
    TEST_UTIL.waitFor(45000, 200, new Waiter.Predicate<Exception>() {
      @Override
      public boolean evaluate() throws Exception {
        ServerName sn = regionStates.getRegionServerOfRegion(hri);
        return (sn != null && sn.equals(tmpRS.getServerName()));
      }
    });
    try {
      byte[] key = region.getStartKey();
      if (key == null || key.length == 0) {
        key = new byte[] { 0, 0, 0, 0, 1 };
      }
      Put put = new Put(key);
      put.addColumn(Bytes.toBytes("family"), Bytes.toBytes("c1"), new byte[] { 'b' });
      ht.put(put);
    } catch (IOException ioe) {
      Assert.assertTrue(ioe instanceof RetriesExhaustedWithDetailsException);
      RetriesExhaustedWithDetailsException re = (RetriesExhaustedWithDetailsException) ioe;
      boolean foundRegionInRecoveryException = false;
      for (Throwable t : re.getCauses()) {
        if (t instanceof RegionInRecoveryException) {
          foundRegionInRecoveryException = true;
          break;
        }
      }
      Assert.assertTrue("No RegionInRecoveryException. Following exceptions returned="
          + re.getCauses(), foundRegionInRecoveryException);
    }
  } finally {
    if (ht != null) {
      ht.close();
    }
    if (zkw != null) {
      zkw.close();
    }
  }
}
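The catch block above unwraps a RetriesExhaustedWithDetailsException to look for a RegionInRecoveryException among its accumulated causes. A generic form of that check might look like the following sketch (hasCause and RetryExceptionInspector are hypothetical names; getCauses() is the client API the test itself uses):

import org.apache.hadoop.hbase.client.RetriesExhaustedWithDetailsException;

public final class RetryExceptionInspector {
  private RetryExceptionInspector() {
  }

  // Returns true if any cause accumulated by the retry machinery is an
  // instance of the given throwable type.
  public static boolean hasCause(RetriesExhaustedWithDetailsException re,
      Class<? extends Throwable> type) {
    for (Throwable t : re.getCauses()) {
      if (type.isInstance(t)) {
        return true;
      }
    }
    return false;
  }
}

With such a helper, the loop in the test would reduce to a single assertion, e.g. Assert.assertTrue(hasCause(re, RegionInRecoveryException.class)).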