
Example 6 with DroppedSnapshotException

Use of org.apache.hadoop.hbase.DroppedSnapshotException in project hbase by apache.

From class TestHRegion, method testWritesWhileRollWriter.

/**
   * HBASE-16429: make sure we do not get stuck when rolling the writer while the ring buffer
   * is filled with appends.
   * @throws IOException if an IO error occurs during the test
   */
@Test
public void testWritesWhileRollWriter() throws IOException {
    int testCount = 10;
    int numRows = 1024;
    int numFamilies = 2;
    int numQualifiers = 2;
    final byte[][] families = new byte[numFamilies][];
    for (int i = 0; i < numFamilies; i++) {
        families[i] = Bytes.toBytes("family" + i);
    }
    final byte[][] qualifiers = new byte[numQualifiers][];
    for (int i = 0; i < numQualifiers; i++) {
        qualifiers[i] = Bytes.toBytes("qual" + i);
    }
    CONF.setInt("hbase.regionserver.wal.disruptor.event.count", 2);
    this.region = initHRegion(tableName, method, CONF, families);
    try {
        List<Thread> threads = new ArrayList<>();
        for (int i = 0; i < numRows; i++) {
            final int count = i;
            Thread t = new Thread(new Runnable() {

                @Override
                public void run() {
                    byte[] row = Bytes.toBytes("row" + count);
                    Put put = new Put(row);
                    put.setDurability(Durability.SYNC_WAL);
                    byte[] value = Bytes.toBytes(String.valueOf(count));
                    for (byte[] family : families) {
                        for (byte[] qualifier : qualifiers) {
                            put.addColumn(family, qualifier, (long) count, value);
                        }
                    }
                    try {
                        region.put(put);
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                }
            });
            threads.add(t);
        }
        for (Thread t : threads) {
            t.start();
        }
        for (int i = 0; i < testCount; i++) {
            region.getWAL().rollWriter();
            Thread.yield();
        }
    } finally {
        try {
            HBaseTestingUtility.closeRegionAndWAL(this.region);
            CONF.setInt("hbase.regionserver.wal.disruptor.event.count", 16 * 1024);
        } catch (DroppedSnapshotException dse) {
            // We can get this on the way out because we interrupt the background flusher and
            // it can fail anywhere, raising a DSE. The DSE is not handled cleanly, so memstore
            // data may still be hanging around when we get here -- data we cannot flush
            // because the accounting has been 'off' since the original DSE.
        }
        this.region = null;
    }
}
Also used: DroppedSnapshotException (org.apache.hadoop.hbase.DroppedSnapshotException), ArrayList (java.util.ArrayList), InterruptedIOException (java.io.InterruptedIOException), IOException (java.io.IOException), Put (org.apache.hadoop.hbase.client.Put), RepeatingTestThread (org.apache.hadoop.hbase.MultithreadedTestUtil.RepeatingTestThread), TestThread (org.apache.hadoop.hbase.MultithreadedTestUtil.TestThread), Test (org.junit.Test)
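
Note that the test starts its writer threads but never joins them; teardown relies on closeRegionAndWAL in the finally block. A minimal sketch of an explicit join, reusing the threads list from the test above, could run just before the finally block:

// Wait for every writer thread so that no put races with the region close below.
for (Thread t : threads) {
    try {
        t.join();
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        break;
    }
}

This is only a sketch, not part of the upstream test: the original deliberately tolerates in-flight puts and swallows the resulting DroppedSnapshotException on close.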

Example 7 with DroppedSnapshotException

Use of org.apache.hadoop.hbase.DroppedSnapshotException in project hbase by apache.

From class TestHRegion, method testFlushSizeAccounting.

/**
   * Test that we do not lose data if we fail a flush and then close.
   * Part of HBASE-10466. Tests the following from the issue description:
   * "Bug 1: Wrong calculation of HRegion.memstoreSize: When a flush fails, the data to be
   * flushed is kept in each MemStore's snapshot and waits for the next flush attempt to
   * continue on it. But when the next flush succeeds, the counter of total memstore size in
   * HRegion is always deducted by the sum of the current memstore sizes instead of the
   * snapshots left from the previous failed flush. This calculation is problematic in that
   * almost every time there is a failed flush, HRegion.memstoreSize gets reduced by a wrong
   * value. If a region flush cannot proceed for a couple of cycles, the size in the current
   * memstore could be much larger than the snapshot, which is likely to drift memstoreSize
   * much smaller than expected. In the extreme case, if the error accumulates to even more
   * than HRegion's memstore size limit, any further flush is skipped because flush does not
   * do anything if memstoreSize is not larger than 0."
   * @throws Exception on failure
   */
@Test
public void testFlushSizeAccounting() throws Exception {
    final Configuration conf = HBaseConfiguration.create(CONF);
    final WAL wal = createWALCompatibleWithFaultyFileSystem(method, conf, tableName);
    // Only retry once.
    conf.setInt("hbase.hstore.flush.retries.number", 1);
    final User user = User.createUserForTesting(conf, method, new String[] { "foo" });
    // Inject our faulty LocalFileSystem
    conf.setClass("fs.file.impl", FaultyFileSystem.class, FileSystem.class);
    user.runAs(new PrivilegedExceptionAction<Object>() {

        @Override
        public Object run() throws Exception {
            // Make sure it worked (above is sensitive to caching details in hadoop core)
            FileSystem fs = FileSystem.get(conf);
            Assert.assertEquals(FaultyFileSystem.class, fs.getClass());
            FaultyFileSystem ffs = (FaultyFileSystem) fs;
            HRegion region = null;
            try {
                // Initialize region
                region = initHRegion(tableName, null, null, false, Durability.SYNC_WAL, wal, COLUMN_FAMILY_BYTES);
                long size = region.getMemstoreSize();
                Assert.assertEquals(0, size);
                // Put one item into memstore.  Measure the size of one item in memstore.
                Put p1 = new Put(row);
                p1.add(new KeyValue(row, COLUMN_FAMILY_BYTES, qual1, 1, (byte[]) null));
                region.put(p1);
                final long sizeOfOnePut = region.getMemstoreSize();
                // Fail a flush which means the current memstore will hang out as memstore 'snapshot'.
                try {
                    LOG.info("Flushing");
                    region.flush(true);
                    Assert.fail("Didn't bubble up IOE!");
                } catch (DroppedSnapshotException dse) {
                    // What we are expecting
                    // this is needed for the rest of the test to work
                    region.closing.set(false);
                }
                // Make it so all writes succeed from here on out
                ffs.fault.set(false);
                // Check sizes.  Should still be the one entry.
                Assert.assertEquals(sizeOfOnePut, region.getMemstoreSize());
                // Now add two entries so that on this next flush that fails, we can see if we
                // subtract the right amount, the snapshot size only.
                Put p2 = new Put(row);
                p2.add(new KeyValue(row, COLUMN_FAMILY_BYTES, qual2, 2, (byte[]) null));
                p2.add(new KeyValue(row, COLUMN_FAMILY_BYTES, qual3, 3, (byte[]) null));
                region.put(p2);
                long expectedSize = sizeOfOnePut * 3;
                Assert.assertEquals(expectedSize, region.getMemstoreSize());
                // Do a successful flush. It will clear the snapshot only; that's how flushes
                // work: if a snapshot already exists we flush and clear it, otherwise we move
                // the memstore into the snapshot and flush that.
                region.flush(true);
                // Make sure our memory accounting is right.
                Assert.assertEquals(sizeOfOnePut * 2, region.getMemstoreSize());
            } finally {
                HBaseTestingUtility.closeRegionAndWAL(region);
            }
            return null;
        }
    });
    FileSystem.closeAllForUGI(user.getUGI());
}
Also used: WAL (org.apache.hadoop.hbase.wal.WAL), MetricsWAL (org.apache.hadoop.hbase.regionserver.wal.MetricsWAL), User (org.apache.hadoop.hbase.security.User), KeyValue (org.apache.hadoop.hbase.KeyValue), Configuration (org.apache.hadoop.conf.Configuration), HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration), DroppedSnapshotException (org.apache.hadoop.hbase.DroppedSnapshotException), FailedSanityCheckException (org.apache.hadoop.hbase.exceptions.FailedSanityCheckException), RegionTooBusyException (org.apache.hadoop.hbase.RegionTooBusyException), InterruptedIOException (java.io.InterruptedIOException), IOException (java.io.IOException), NotServingRegionException (org.apache.hadoop.hbase.NotServingRegionException), Put (org.apache.hadoop.hbase.client.Put), FileSystem (org.apache.hadoop.fs.FileSystem), FaultyFileSystem (org.apache.hadoop.hbase.regionserver.TestStore.FaultyFileSystem), Test (org.junit.Test)
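
The "Bug 1" arithmetic quoted in the javadoc is easiest to follow with concrete numbers that match the test, taking one unit to be sizeOfOnePut. The toy model below is plain Java for illustration, not HBase code:

// Toy model of HBASE-10466 "Bug 1"; all sizes are in units of one put.
long counter = 0;   // HRegion's total-memstore-size counter
long snapshot = 0;  // data parked in the snapshot after a failed flush
long active = 0;    // data in the current (active) memstore

counter += 1; active += 1;       // p1 arrives
snapshot = active; active = 0;   // the flush fails: the active memstore becomes the snapshot
counter += 2; active += 2;       // p2 adds two cells; counter == 3

// The next flush succeeds but writes out and clears only the snapshot (1 unit):
counter -= snapshot;             // correct: counter == 2, matching sizeOfOnePut * 2 above
// The buggy code instead deducted the current memstore size:
// counter -= active;            // buggy: counter == 1, although 2 units remain in memory

Each failed flush compounds the drift, until the counter underruns and flushing stops entirely, which is exactly the failure mode the issue describes.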

Example 8 with DroppedSnapshotException

Use of org.apache.hadoop.hbase.DroppedSnapshotException in project hbase by apache.

From class TestHRegion, method testFlushMarkersWALFail.

@Test
public void testFlushMarkersWALFail() throws Exception {
    // Test the cases where the WAL append for flush markers fails.
    byte[] family = Bytes.toBytes("family");
    // Spy on an actual WAL implementation to throw exceptions (was not able to mock it).
    Path logDir = TEST_UTIL.getDataTestDirOnTestFS(method + "log");
    final Configuration walConf = new Configuration(TEST_UTIL.getConfiguration());
    FSUtils.setRootDir(walConf, logDir);
    // Make up a WAL that we can manipulate at append time.
    class FailAppendFlushMarkerWAL extends FSHLog {

        volatile FlushAction[] flushActions = null;

        public FailAppendFlushMarkerWAL(FileSystem fs, Path root, String logDir, Configuration conf) throws IOException {
            super(fs, root, logDir, conf);
        }

        @Override
        protected Writer createWriterInstance(Path path) throws IOException {
            final Writer w = super.createWriterInstance(path);
            return new Writer() {

                @Override
                public void close() throws IOException {
                    w.close();
                }

                @Override
                public void sync() throws IOException {
                    w.sync();
                }

                @Override
                public void append(Entry entry) throws IOException {
                    List<Cell> cells = entry.getEdit().getCells();
                    if (WALEdit.isMetaEditFamily(cells.get(0))) {
                        FlushDescriptor desc = WALEdit.getFlushDescriptor(cells.get(0));
                        if (desc != null) {
                            for (FlushAction flushAction : flushActions) {
                                if (desc.getAction().equals(flushAction)) {
                                    throw new IOException("Failed to append flush marker! " + flushAction);
                                }
                            }
                        }
                    }
                    w.append(entry);
                }

                @Override
                public long getLength() {
                    return w.getLength();
                }
            };
        }
    }
    FailAppendFlushMarkerWAL wal = new FailAppendFlushMarkerWAL(FileSystem.get(walConf), FSUtils.getRootDir(walConf), method, walConf);
    this.region = initHRegion(tableName, HConstants.EMPTY_START_ROW, HConstants.EMPTY_END_ROW, false, Durability.USE_DEFAULT, wal, family);
    try {
        int i = 0;
        Put put = new Put(Bytes.toBytes(i));
        // Have to skip the mocked WAL.
        put.setDurability(Durability.SKIP_WAL);
        put.addColumn(family, Bytes.toBytes(i), Bytes.toBytes(i));
        region.put(put);
        // 1. Test case where START_FLUSH throws exception
        wal.flushActions = new FlushAction[] { FlushAction.START_FLUSH };
        // Starting the cache flush will throw an exception.
        try {
            region.flush(true);
            fail("This should have thrown exception");
        } catch (DroppedSnapshotException unexpected) {
            // A DroppedSnapshotException here would mean the region server must abort;
            // a failed START_FLUSH append must not produce one, so rethrow to fail the test.
            throw unexpected;
        } catch (IOException expected) {
            // Expected: a plain IOException from the failed flush-marker append.
        }
        // The WAL is hosed now. It has two edits appended. We cannot roll the log without it
        // throwing a DroppedSnapshotException to force an abort. Just clean up the mess.
        region.close(true);
        wal.close();
        // 2. Test case where START_FLUSH succeeds but COMMIT_FLUSH will throw exception
        wal.flushActions = new FlushAction[] { FlushAction.COMMIT_FLUSH };
        wal = new FailAppendFlushMarkerWAL(FileSystem.get(walConf), FSUtils.getRootDir(walConf), method, walConf);
        this.region = initHRegion(tableName, HConstants.EMPTY_START_ROW, HConstants.EMPTY_END_ROW, false, Durability.USE_DEFAULT, wal, family);
        region.put(put);
        // 3. Test case where ABORT_FLUSH will throw an exception.
        // Even if ABORT_FLUSH throws, we should not fail with an IOE but continue with a
        // DroppedSnapshotException. The COMMIT_FLUSH below will cause the flush to abort.
        wal.flushActions = new FlushAction[] { FlushAction.COMMIT_FLUSH, FlushAction.ABORT_FLUSH };
        try {
            region.flush(true);
            fail("This should have thrown exception");
        } catch (DroppedSnapshotException expected) {
            // We expect this exception: we were able to write the snapshot, but failed to
            // write the flush marker to the WAL.
        } catch (IOException unexpected) {
            throw unexpected;
        }
    } finally {
        HBaseTestingUtility.closeRegionAndWAL(this.region);
        this.region = null;
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration), DroppedSnapshotException (org.apache.hadoop.hbase.DroppedSnapshotException), ByteString (org.apache.hadoop.hbase.shaded.com.google.protobuf.ByteString), InterruptedIOException (java.io.InterruptedIOException), IOException (java.io.IOException), FlushDescriptor (org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor), StoreFlushDescriptor (org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor.StoreFlushDescriptor), Put (org.apache.hadoop.hbase.client.Put), FSHLog (org.apache.hadoop.hbase.regionserver.wal.FSHLog), FlushAction (org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor.FlushAction), FileSystem (org.apache.hadoop.fs.FileSystem), FaultyFileSystem (org.apache.hadoop.hbase.regionserver.TestStore.FaultyFileSystem), Cell (org.apache.hadoop.hbase.Cell), Writer (org.apache.hadoop.hbase.wal.WALProvider.Writer), Test (org.junit.Test)
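
FailAppendFlushMarkerWAL above is a hand-rolled decorator: it wraps the real Writer, forwards every call, and throws only when a targeted flush marker comes through. Stripped of the HBase types, the same fault-injection shape looks like the sketch below; the names here are illustrative, not HBase API:

import java.io.IOException;
import java.util.function.Predicate;

interface Appender {
    void append(String entry) throws IOException;
}

final class FaultInjectingAppender implements Appender {
    private final Appender delegate;
    private final Predicate<String> shouldFail; // e.g. entry -> entry.contains("START_FLUSH")

    FaultInjectingAppender(Appender delegate, Predicate<String> shouldFail) {
        this.delegate = delegate;
        this.shouldFail = shouldFail;
    }

    @Override
    public void append(String entry) throws IOException {
        if (shouldFail.test(entry)) {
            throw new IOException("Injected failure for: " + entry);
        }
        delegate.append(entry); // otherwise behave exactly like the real implementation
    }
}

Decorating the real instance rather than mocking it keeps the rest of the Writer behavior (sync, close, getLength) intact, which is why the test's comment notes that mocking the WAL was not workable.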

Example 9 with DroppedSnapshotException

Use of org.apache.hadoop.hbase.DroppedSnapshotException in project hbase by apache.

From class TestHRegion, method testWritesWhileScanning.

/**
   * Writes very wide records and scans for the latest record every time. Flushes and
   * compacts the region every now and then to keep things realistic.
   *
   * @throws IOException
   *           by flush / scan / compaction
   * @throws InterruptedException
   *           when joining threads
   */
@Test
public void testWritesWhileScanning() throws IOException, InterruptedException {
    int testCount = 100;
    int numRows = 1;
    int numFamilies = 10;
    int numQualifiers = 100;
    int flushInterval = 7;
    int compactInterval = 5 * flushInterval;
    byte[][] families = new byte[numFamilies][];
    for (int i = 0; i < numFamilies; i++) {
        families[i] = Bytes.toBytes("family" + i);
    }
    byte[][] qualifiers = new byte[numQualifiers][];
    for (int i = 0; i < numQualifiers; i++) {
        qualifiers[i] = Bytes.toBytes("qual" + i);
    }
    this.region = initHRegion(tableName, method, CONF, families);
    FlushThread flushThread = new FlushThread();
    PutThread putThread = new PutThread(numRows, families, qualifiers);
    try {
        putThread.start();
        putThread.waitForFirstPut();
        flushThread.start();
        Scan scan = new Scan(Bytes.toBytes("row0"), Bytes.toBytes("row1"));
        int expectedCount = numFamilies * numQualifiers;
        List<Cell> res = new ArrayList<>();
        long prevTimestamp = 0L;
        for (int i = 0; i < testCount; i++) {
            if (i != 0 && i % compactInterval == 0) {
                region.compact(true);
                for (Store store : region.getStores()) {
                    store.closeAndArchiveCompactedFiles();
                }
            }
            if (i != 0 && i % flushInterval == 0) {
                flushThread.flush();
            }
            boolean previousEmpty = res.isEmpty();
            res.clear();
            InternalScanner scanner = region.getScanner(scan);
            while (scanner.next(res)) ;
            if (!res.isEmpty() || !previousEmpty || i > compactInterval) {
                assertEquals("i=" + i, expectedCount, res.size());
                long timestamp = res.get(0).getTimestamp();
                assertTrue("Timestamps were broke: " + timestamp + " prev: " + prevTimestamp, timestamp >= prevTimestamp);
                prevTimestamp = timestamp;
            }
        }
        putThread.done();
        region.flush(true);
    } finally {
        try {
            flushThread.done();
            flushThread.join();
            flushThread.checkNoError();
            putThread.join();
            putThread.checkNoError();
        } catch (InterruptedException ie) {
            LOG.warn("Caught exception when joining with flushThread", ie);
        }
        try {
            HBaseTestingUtility.closeRegionAndWAL(this.region);
        } catch (DroppedSnapshotException dse) {
            // We can get this on the way out because we interrupt the background flusher and
            // it can fail anywhere, raising a DSE. The DSE is not handled cleanly, so memstore
            // data may still be hanging around when we get here -- data we cannot flush
            // because the accounting has been 'off' since the original DSE.
        }
        this.region = null;
    }
}
Also used: DroppedSnapshotException (org.apache.hadoop.hbase.DroppedSnapshotException), ArrayList (java.util.ArrayList), Scan (org.apache.hadoop.hbase.client.Scan), Cell (org.apache.hadoop.hbase.Cell), Test (org.junit.Test)
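
One detail of the scan loop above: while (scanner.next(res)) ; drains the scanner with an empty body, and the scanner is never closed. A slightly more explicit version of the same drain, a sketch reusing the region and scan variables from the test, would be:

List<Cell> res = new ArrayList<>();
InternalScanner scanner = region.getScanner(scan);
try {
    boolean moreRows;
    do {
        // next() appends the cells of the next row to res and reports whether rows remain
        moreRows = scanner.next(res);
    } while (moreRows);
} finally {
    scanner.close(); // release scanner resources even if next() throws
}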

Example 10 with DroppedSnapshotException

Use of org.apache.hadoop.hbase.DroppedSnapshotException in project hbase by apache.

From class RSRpcServices, method splitRegion.

/**
   * Split a region on the region server.
   *
   * @param controller the RPC controller
   * @param request the request
   * @throws ServiceException wrapping any IOException raised while flushing or splitting
   */
@Override
@QosPriority(priority = HConstants.ADMIN_QOS)
public SplitRegionResponse splitRegion(final RpcController controller, final SplitRegionRequest request) throws ServiceException {
    try {
        checkOpen();
        requestCount.increment();
        Region region = getRegion(request.getRegion());
        region.startRegionOperation(Operation.SPLIT_REGION);
        if (region.getRegionInfo().getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) {
            throw new IOException("Can't split replicas directly. " + "Replicas are auto-split when their primary is split.");
        }
        LOG.info("Splitting " + region.getRegionInfo().getRegionNameAsString());
        region.flush(true);
        byte[] splitPoint = null;
        if (request.hasSplitPoint()) {
            splitPoint = request.getSplitPoint().toByteArray();
        }
        ((HRegion) region).forceSplit(splitPoint);
        regionServer.compactSplitThread.requestSplit(region, ((HRegion) region).checkSplit(), RpcServer.getRequestUser());
        return SplitRegionResponse.newBuilder().build();
    } catch (DroppedSnapshotException ex) {
        regionServer.abort("Replay of WAL required. Forcing server shutdown", ex);
        throw new ServiceException(ex);
    } catch (IOException ie) {
        throw new ServiceException(ie);
    }
}
Also used: DroppedSnapshotException (org.apache.hadoop.hbase.DroppedSnapshotException), ServiceException (org.apache.hadoop.hbase.shaded.com.google.protobuf.ServiceException), InterruptedIOException (java.io.InterruptedIOException), IOException (java.io.IOException), DoNotRetryIOException (org.apache.hadoop.hbase.DoNotRetryIOException), HBaseIOException (org.apache.hadoop.hbase.HBaseIOException), QosPriority (org.apache.hadoop.hbase.ipc.QosPriority)
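
splitRegion shows the server-side contract for DroppedSnapshotException: once a flush has dropped its snapshot, in-memory state can only be repaired by replaying the WAL, so the region server aborts instead of limping on with broken memstore accounting. The general shape of that handling, with server and doRegionOperation as placeholders rather than HBase API, is:

try {
    doRegionOperation(); // e.g. the flush + forced split above
} catch (DroppedSnapshotException dse) {
    // Memstore state is unrecoverable in-process; only a WAL replay on restart can
    // restore it, so the safe move is to abort the whole server.
    server.abort("Replay of WAL required. Forcing server shutdown", dse);
    throw new ServiceException(dse);
} catch (IOException ioe) {
    // Ordinary IO failures are returned to the client without aborting the server.
    throw new ServiceException(ioe);
}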

Aggregations

DroppedSnapshotException (org.apache.hadoop.hbase.DroppedSnapshotException): 12 uses
IOException (java.io.IOException): 9 uses
InterruptedIOException (java.io.InterruptedIOException): 7 uses
Test (org.junit.Test): 7 uses
Put (org.apache.hadoop.hbase.client.Put): 6 uses
ArrayList (java.util.ArrayList): 4 uses
Configuration (org.apache.hadoop.conf.Configuration): 4 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 4 uses
Path (org.apache.hadoop.fs.Path): 3 uses
DoNotRetryIOException (org.apache.hadoop.hbase.DoNotRetryIOException): 3 uses
HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration): 3 uses
FaultyFileSystem (org.apache.hadoop.hbase.regionserver.TestStore.FaultyFileSystem): 3 uses
AbstractList (java.util.AbstractList): 2 uses
List (java.util.List): 2 uses
Cell (org.apache.hadoop.hbase.Cell): 2 uses
HBaseIOException (org.apache.hadoop.hbase.HBaseIOException): 2 uses
KeyValue (org.apache.hadoop.hbase.KeyValue): 2 uses
NotServingRegionException (org.apache.hadoop.hbase.NotServingRegionException): 2 uses
RegionTooBusyException (org.apache.hadoop.hbase.RegionTooBusyException): 2 uses
FailedSanityCheckException (org.apache.hadoop.hbase.exceptions.FailedSanityCheckException): 2 uses