use of org.apache.hadoop.hbase.DroppedSnapshotException in project hbase by apache.
the class TestHRegion method testWritesWhileRollWriter.
/**
* HBASE-16429 Make sure no stuck if roll writer when ring buffer is filled with appends
* @throws IOException if IO error occurred during test
*/
@Test
public void testWritesWhileRollWriter() throws IOException {
int testCount = 10;
int numRows = 1024;
int numFamilies = 2;
int numQualifiers = 2;
final byte[][] families = new byte[numFamilies][];
for (int i = 0; i < numFamilies; i++) {
families[i] = Bytes.toBytes("family" + i);
}
final byte[][] qualifiers = new byte[numQualifiers][];
for (int i = 0; i < numQualifiers; i++) {
qualifiers[i] = Bytes.toBytes("qual" + i);
}
CONF.setInt("hbase.regionserver.wal.disruptor.event.count", 2);
this.region = initHRegion(tableName, method, CONF, families);
try {
List<Thread> threads = new ArrayList<>();
for (int i = 0; i < numRows; i++) {
final int count = i;
Thread t = new Thread(new Runnable() {
@Override
public void run() {
byte[] row = Bytes.toBytes("row" + count);
Put put = new Put(row);
put.setDurability(Durability.SYNC_WAL);
byte[] value = Bytes.toBytes(String.valueOf(count));
for (byte[] family : families) {
for (byte[] qualifier : qualifiers) {
put.addColumn(family, qualifier, (long) count, value);
}
}
try {
region.put(put);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
});
threads.add(t);
}
for (Thread t : threads) {
t.start();
}
for (int i = 0; i < testCount; i++) {
region.getWAL().rollWriter();
Thread.yield();
}
} finally {
try {
HBaseTestingUtility.closeRegionAndWAL(this.region);
CONF.setInt("hbase.regionserver.wal.disruptor.event.count", 16 * 1024);
} catch (DroppedSnapshotException dse) {
// We could get this on way out because we interrupt the background flusher and it could
// fail anywhere causing a DSE over in the background flusher... only it is not properly
// dealt with so could still be memory hanging out when we get to here -- memory we can't
// flush because the accounting is 'off' since original DSE.
}
this.region = null;
}
}
use of org.apache.hadoop.hbase.DroppedSnapshotException in project hbase by apache.
the class TestHRegion method testFlushSizeAccounting.
/**
* Test we do not lose data if we fail a flush and then close.
* Part of HBase-10466. Tests the following from the issue description:
* "Bug 1: Wrong calculation of HRegion.memstoreSize: When a flush fails, data to be flushed is
* kept in each MemStore's snapshot and wait for next flush attempt to continue on it. But when
* the next flush succeeds, the counter of total memstore size in HRegion is always deduced by
* the sum of current memstore sizes instead of snapshots left from previous failed flush. This
* calculation is problematic that almost every time there is failed flush, HRegion.memstoreSize
* gets reduced by a wrong value. If region flush could not proceed for a couple cycles, the size
* in current memstore could be much larger than the snapshot. It's likely to drift memstoreSize
* much smaller than expected. In extreme case, if the error accumulates to even bigger than
* HRegion's memstore size limit, any further flush is skipped because flush does not do anything
* if memstoreSize is not larger than 0."
* @throws Exception
*/
@Test
public void testFlushSizeAccounting() throws Exception {
final Configuration conf = HBaseConfiguration.create(CONF);
final WAL wal = createWALCompatibleWithFaultyFileSystem(method, conf, tableName);
// Only retry once.
conf.setInt("hbase.hstore.flush.retries.number", 1);
final User user = User.createUserForTesting(conf, method, new String[] { "foo" });
// Inject our faulty LocalFileSystem
conf.setClass("fs.file.impl", FaultyFileSystem.class, FileSystem.class);
user.runAs(new PrivilegedExceptionAction<Object>() {
@Override
public Object run() throws Exception {
// Make sure it worked (above is sensitive to caching details in hadoop core)
FileSystem fs = FileSystem.get(conf);
Assert.assertEquals(FaultyFileSystem.class, fs.getClass());
FaultyFileSystem ffs = (FaultyFileSystem) fs;
HRegion region = null;
try {
// Initialize region
region = initHRegion(tableName, null, null, false, Durability.SYNC_WAL, wal, COLUMN_FAMILY_BYTES);
long size = region.getMemstoreSize();
Assert.assertEquals(0, size);
// Put one item into memstore. Measure the size of one item in memstore.
Put p1 = new Put(row);
p1.add(new KeyValue(row, COLUMN_FAMILY_BYTES, qual1, 1, (byte[]) null));
region.put(p1);
final long sizeOfOnePut = region.getMemstoreSize();
// Fail a flush which means the current memstore will hang out as memstore 'snapshot'.
try {
LOG.info("Flushing");
region.flush(true);
Assert.fail("Didn't bubble up IOE!");
} catch (DroppedSnapshotException dse) {
// What we are expecting
// this is needed for the rest of the test to work
region.closing.set(false);
}
// Make it so all writes succeed from here on out
ffs.fault.set(false);
// Check sizes. Should still be the one entry.
Assert.assertEquals(sizeOfOnePut, region.getMemstoreSize());
// Now add two entries so that on this next flush that fails, we can see if we
// subtract the right amount, the snapshot size only.
Put p2 = new Put(row);
p2.add(new KeyValue(row, COLUMN_FAMILY_BYTES, qual2, 2, (byte[]) null));
p2.add(new KeyValue(row, COLUMN_FAMILY_BYTES, qual3, 3, (byte[]) null));
region.put(p2);
long expectedSize = sizeOfOnePut * 3;
Assert.assertEquals(expectedSize, region.getMemstoreSize());
// Do a successful flush. It will clear the snapshot only. Thats how flushes work.
// If already a snapshot, we clear it else we move the memstore to be snapshot and flush
// it
region.flush(true);
// Make sure our memory accounting is right.
Assert.assertEquals(sizeOfOnePut * 2, region.getMemstoreSize());
} finally {
HBaseTestingUtility.closeRegionAndWAL(region);
}
return null;
}
});
FileSystem.closeAllForUGI(user.getUGI());
}
use of org.apache.hadoop.hbase.DroppedSnapshotException in project hbase by apache.
the class TestHRegion method testFlushMarkersWALFail.
@Test
public void testFlushMarkersWALFail() throws Exception {
// test the cases where the WAL append for flush markers fail.
byte[] family = Bytes.toBytes("family");
// spy an actual WAL implementation to throw exception (was not able to mock)
Path logDir = TEST_UTIL.getDataTestDirOnTestFS(method + "log");
final Configuration walConf = new Configuration(TEST_UTIL.getConfiguration());
FSUtils.setRootDir(walConf, logDir);
// Make up a WAL that we can manipulate at append time.
class FailAppendFlushMarkerWAL extends FSHLog {
volatile FlushAction[] flushActions = null;
public FailAppendFlushMarkerWAL(FileSystem fs, Path root, String logDir, Configuration conf) throws IOException {
super(fs, root, logDir, conf);
}
@Override
protected Writer createWriterInstance(Path path) throws IOException {
final Writer w = super.createWriterInstance(path);
return new Writer() {
@Override
public void close() throws IOException {
w.close();
}
@Override
public void sync() throws IOException {
w.sync();
}
@Override
public void append(Entry entry) throws IOException {
List<Cell> cells = entry.getEdit().getCells();
if (WALEdit.isMetaEditFamily(cells.get(0))) {
FlushDescriptor desc = WALEdit.getFlushDescriptor(cells.get(0));
if (desc != null) {
for (FlushAction flushAction : flushActions) {
if (desc.getAction().equals(flushAction)) {
throw new IOException("Failed to append flush marker! " + flushAction);
}
}
}
}
w.append(entry);
}
@Override
public long getLength() {
return w.getLength();
}
};
}
}
FailAppendFlushMarkerWAL wal = new FailAppendFlushMarkerWAL(FileSystem.get(walConf), FSUtils.getRootDir(walConf), method, walConf);
this.region = initHRegion(tableName, HConstants.EMPTY_START_ROW, HConstants.EMPTY_END_ROW, false, Durability.USE_DEFAULT, wal, family);
try {
int i = 0;
Put put = new Put(Bytes.toBytes(i));
// have to skip mocked wal
put.setDurability(Durability.SKIP_WAL);
put.addColumn(family, Bytes.toBytes(i), Bytes.toBytes(i));
region.put(put);
// 1. Test case where START_FLUSH throws exception
wal.flushActions = new FlushAction[] { FlushAction.START_FLUSH };
// start cache flush will throw exception
try {
region.flush(true);
fail("This should have thrown exception");
} catch (DroppedSnapshotException unexpected) {
// this should not be a dropped snapshot exception. Meaning that RS will not abort
throw unexpected;
} catch (IOException expected) {
// expected
}
// The WAL is hosed now. It has two edits appended. We cannot roll the log without it
// throwing a DroppedSnapshotException to force an abort. Just clean up the mess.
region.close(true);
wal.close();
// 2. Test case where START_FLUSH succeeds but COMMIT_FLUSH will throw exception
wal.flushActions = new FlushAction[] { FlushAction.COMMIT_FLUSH };
wal = new FailAppendFlushMarkerWAL(FileSystem.get(walConf), FSUtils.getRootDir(walConf), method, walConf);
this.region = initHRegion(tableName, HConstants.EMPTY_START_ROW, HConstants.EMPTY_END_ROW, false, Durability.USE_DEFAULT, wal, family);
region.put(put);
// 3. Test case where ABORT_FLUSH will throw exception.
// Even if ABORT_FLUSH throws exception, we should not fail with IOE, but continue with
// DroppedSnapshotException. Below COMMMIT_FLUSH will cause flush to abort
wal.flushActions = new FlushAction[] { FlushAction.COMMIT_FLUSH, FlushAction.ABORT_FLUSH };
try {
region.flush(true);
fail("This should have thrown exception");
} catch (DroppedSnapshotException expected) {
// we expect this exception, since we were able to write the snapshot, but failed to
// write the flush marker to WAL
} catch (IOException unexpected) {
throw unexpected;
}
} finally {
HBaseTestingUtility.closeRegionAndWAL(this.region);
this.region = null;
}
}
use of org.apache.hadoop.hbase.DroppedSnapshotException in project hbase by apache.
the class TestHRegion method testWritesWhileScanning.
/**
* Writes very wide records and scans for the latest every time.. Flushes and
* compacts the region every now and then to keep things realistic.
*
* @throws IOException
* by flush / scan / compaction
* @throws InterruptedException
* when joining threads
*/
@Test
public void testWritesWhileScanning() throws IOException, InterruptedException {
int testCount = 100;
int numRows = 1;
int numFamilies = 10;
int numQualifiers = 100;
int flushInterval = 7;
int compactInterval = 5 * flushInterval;
byte[][] families = new byte[numFamilies][];
for (int i = 0; i < numFamilies; i++) {
families[i] = Bytes.toBytes("family" + i);
}
byte[][] qualifiers = new byte[numQualifiers][];
for (int i = 0; i < numQualifiers; i++) {
qualifiers[i] = Bytes.toBytes("qual" + i);
}
this.region = initHRegion(tableName, method, CONF, families);
FlushThread flushThread = new FlushThread();
PutThread putThread = new PutThread(numRows, families, qualifiers);
try {
putThread.start();
putThread.waitForFirstPut();
flushThread.start();
Scan scan = new Scan(Bytes.toBytes("row0"), Bytes.toBytes("row1"));
int expectedCount = numFamilies * numQualifiers;
List<Cell> res = new ArrayList<>();
long prevTimestamp = 0L;
for (int i = 0; i < testCount; i++) {
if (i != 0 && i % compactInterval == 0) {
region.compact(true);
for (Store store : region.getStores()) {
store.closeAndArchiveCompactedFiles();
}
}
if (i != 0 && i % flushInterval == 0) {
flushThread.flush();
}
boolean previousEmpty = res.isEmpty();
res.clear();
InternalScanner scanner = region.getScanner(scan);
while (scanner.next(res)) ;
if (!res.isEmpty() || !previousEmpty || i > compactInterval) {
assertEquals("i=" + i, expectedCount, res.size());
long timestamp = res.get(0).getTimestamp();
assertTrue("Timestamps were broke: " + timestamp + " prev: " + prevTimestamp, timestamp >= prevTimestamp);
prevTimestamp = timestamp;
}
}
putThread.done();
region.flush(true);
} finally {
try {
flushThread.done();
flushThread.join();
flushThread.checkNoError();
putThread.join();
putThread.checkNoError();
} catch (InterruptedException ie) {
LOG.warn("Caught exception when joining with flushThread", ie);
}
try {
HBaseTestingUtility.closeRegionAndWAL(this.region);
} catch (DroppedSnapshotException dse) {
// We could get this on way out because we interrupt the background flusher and it could
// fail anywhere causing a DSE over in the background flusher... only it is not properly
// dealt with so could still be memory hanging out when we get to here -- memory we can't
// flush because the accounting is 'off' since original DSE.
}
this.region = null;
}
}
use of org.apache.hadoop.hbase.DroppedSnapshotException in project hbase by apache.
the class RSRpcServices method splitRegion.
/**
* Split a region on the region server.
*
* @param controller the RPC controller
* @param request the request
* @throws ServiceException
*/
@Override
@QosPriority(priority = HConstants.ADMIN_QOS)
public SplitRegionResponse splitRegion(final RpcController controller, final SplitRegionRequest request) throws ServiceException {
try {
checkOpen();
requestCount.increment();
Region region = getRegion(request.getRegion());
region.startRegionOperation(Operation.SPLIT_REGION);
if (region.getRegionInfo().getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) {
throw new IOException("Can't split replicas directly. " + "Replicas are auto-split when their primary is split.");
}
LOG.info("Splitting " + region.getRegionInfo().getRegionNameAsString());
region.flush(true);
byte[] splitPoint = null;
if (request.hasSplitPoint()) {
splitPoint = request.getSplitPoint().toByteArray();
}
((HRegion) region).forceSplit(splitPoint);
regionServer.compactSplitThread.requestSplit(region, ((HRegion) region).checkSplit(), RpcServer.getRequestUser());
return SplitRegionResponse.newBuilder().build();
} catch (DroppedSnapshotException ex) {
regionServer.abort("Replay of WAL required. Forcing server shutdown", ex);
throw new ServiceException(ex);
} catch (IOException ie) {
throw new ServiceException(ie);
}
}
Aggregations