Search in sources :

Example 6 with ReplicaStatusDelegate

use of com.github.ambry.clustermap.ReplicaStatusDelegate in project ambry by linkedin.

the class BlobStore method checkCapacityAndUpdateReplicaStatusDelegate.

/**
 * Checks the used capacity of the store against the configured thresholds to decide whether the replica
 * should be sealed (read-only) or unsealed (read-write), and pushes that decision to every configured
 * {@link ReplicaStatusDelegate}.
 * <p>
 * Three cases are handled:
 * <ul>
 *   <li>used capacity above {@code thresholdBytesHigh}: seal the replica on every delegate;</li>
 *   <li>used capacity at or below {@code thresholdBytesLow}: unseal the replica on every delegate;</li>
 *   <li>store not yet started, more than one delegate and used capacity between the two thresholds:
 *       reconcile the sealed state across delegates (sealed if any delegate reports the partition sealed).</li>
 * </ul>
 * In every other situation no delegate is called. When {@code replicaStatusDelegates} is {@code null} the
 * method only emits a debug log.
 */
private void checkCapacityAndUpdateReplicaStatusDelegate() {
    if (replicaStatusDelegates != null) {
        logger.debug("The current used capacity is {} bytes on store {}", index.getLogUsedCapacity(), replicaId.getPartitionId());
        // Seal when usage exceeds the high threshold. With a single delegate this only happens while
        // "replicaId.isSealed()" is false. With more than one delegate, a replicaId that already reports
        // sealed is still sealed on the delegates if the local isSealed flag was false; getAndSet(true)
        // flips that flag as a side effect of evaluating the condition.
        if (index.getLogUsedCapacity() > thresholdBytesHigh && (!replicaId.isSealed() || (replicaStatusDelegates.size() > 1 && !isSealed.getAndSet(true)))) {
            for (ReplicaStatusDelegate replicaStatusDelegate : replicaStatusDelegates) {
                if (!replicaStatusDelegate.seal(replicaId)) {
                    metrics.sealSetError.inc();
                    logger.warn("Could not set the partition as read-only status on {}", replicaId);
                    // a failed seal resets the local flag to "not sealed" (presumably so a later check retries)
                    isSealed.set(false);
                } else {
                    metrics.sealDoneCount.inc();
                    logger.info("Store is successfully sealed for partition : {} because current used capacity : {} bytes exceeds ReadOnly threshold : {} bytes", replicaId.getPartitionId(), index.getLogUsedCapacity(), thresholdBytesHigh);
                }
            }
        // Mirror image of the seal case: unseal when usage is at or below the low threshold and either
        // replicaId reports sealed, or (with more than one delegate) the local isSealed flag was true;
        // getAndSet(false) clears the flag as a side effect of evaluating the condition.
        } else if (index.getLogUsedCapacity() <= thresholdBytesLow && (replicaId.isSealed() || (replicaStatusDelegates.size() > 1 && isSealed.getAndSet(false)))) {
            for (ReplicaStatusDelegate replicaStatusDelegate : replicaStatusDelegates) {
                if (!replicaStatusDelegate.unseal(replicaId)) {
                    metrics.unsealSetError.inc();
                    logger.warn("Could not set the partition as read-write status on {}", replicaId);
                    // a failed unseal puts the local flag back to "sealed" (presumably so a later check retries)
                    isSealed.set(true);
                } else {
                    metrics.unsealDoneCount.inc();
                    logger.info("Store is successfully unsealed for partition : {} because current used capacity : {} bytes is below ReadWrite threshold : {} bytes", replicaId.getPartitionId(), index.getLogUsedCapacity(), thresholdBytesLow);
                }
            }
        }
        // During startup, we also need to reconcile the replica state from both ZK clusters.
        // This only applies when usage lies strictly above the low threshold and at or below the high one,
        // i.e. when neither the seal nor the unseal branch above could have been taken.
        if (!started && replicaStatusDelegates.size() > 1 && thresholdBytesLow < index.getLogUsedCapacity() && index.getLogUsedCapacity() <= thresholdBytesHigh) {
            // reconcile the state by reading sealing state from both clusters:
            // the replica counts as sealed if any delegate lists this partition among its sealed replicas
            boolean sealed = false;
            String partitionName = replicaId.getPartitionId().toPathString();
            for (ReplicaStatusDelegate replicaStatusDelegate : replicaStatusDelegates) {
                Set<String> sealedReplicas = new HashSet<>(replicaStatusDelegate.getSealedReplicas());
                sealed |= sealedReplicas.contains(partitionName);
            }
            // push the reconciled state to every delegate; the local flag is updated on each success
            for (ReplicaStatusDelegate replicaStatusDelegate : replicaStatusDelegates) {
                boolean success = sealed ? replicaStatusDelegate.seal(replicaId) : replicaStatusDelegate.unseal(replicaId);
                if (success) {
                    logger.info("Succeeded in reconciling replica state to {} state", sealed ? "sealed" : "unsealed");
                    isSealed.set(sealed);
                } else {
                    logger.error("Failed on reconciling replica state to {} state", sealed ? "sealed" : "unsealed");
                }
            }
        }
        // Outside of the cases above (usage between the low and high thresholds once the store is started,
        // or with a single delegate) no delegate is called and the replica status is left as it is.
    } else {
        logger.debug("The ReplicaStatusDelegate is not instantiated");
    }
}
Also used : ReplicaStatusDelegate(com.github.ambry.clustermap.ReplicaStatusDelegate) HashSet(java.util.HashSet)

Example 7 with ReplicaStatusDelegate

use of com.github.ambry.clustermap.ReplicaStatusDelegate in project ambry by linkedin.

the class BlobStoreTest method storeIoErrorCountTest.

/**
 * Tests that {@link BlobStore#onError()} and {@link BlobStore#onSuccess()} can correctly capture disk related I/O errors
 * and properly shutdown the store.
 * <p>
 * Test1 uses a write set constructed with an IOError {@link StoreException} to make Put fail; Test2 stubs a spied
 * store key factory to throw, so that Get/Put/TtlUpdate/Delete fail on a store holding otherwise valid data.
 * @throws StoreException
 * @throws IOException
 */
@Test
public void storeIoErrorCountTest() throws StoreException, IOException {
    // setup testing environment
    store.shutdown();
    // Test1 threshold: two I/O errors trigger a store shutdown
    properties.put("store.io.error.count.to.trigger.shutdown", "2");
    MockId id1 = getUniqueId();
    MockId id2 = getUniqueId();
    MockId id3 = getUniqueId();
    MessageInfo corruptedInfo = new MessageInfo(getUniqueId(), PUT_RECORD_SIZE, Utils.getRandomShort(TestUtils.RANDOM), Utils.getRandomShort(TestUtils.RANDOM), Utils.Infinite_Time);
    MessageInfo info1 = new MessageInfo(id1, PUT_RECORD_SIZE, 3 * 24 * 60 * 60 * 1000, id1.getAccountId(), id1.getContainerId(), Utils.Infinite_Time);
    MessageInfo info2 = new MessageInfo(id2, PUT_RECORD_SIZE, id2.getAccountId(), id2.getContainerId(), Utils.Infinite_Time);
    MessageInfo info3 = new MessageInfo(id3, PUT_RECORD_SIZE, id3.getAccountId(), id3.getContainerId(), Utils.Infinite_Time);
    // the "corrupted" write set carries an IOError StoreException; the valid ones carry no exception (null)
    MessageWriteSet corruptedWriteSet = new MockMessageWriteSet(Collections.singletonList(corruptedInfo), Collections.singletonList(ByteBuffer.allocate(PUT_RECORD_SIZE)), new StoreException(StoreException.IO_ERROR_STR, StoreErrorCodes.IOError));
    MessageWriteSet validWriteSet1 = new MockMessageWriteSet(Collections.singletonList(info1), Collections.singletonList(ByteBuffer.allocate(PUT_RECORD_SIZE)), null);
    MessageWriteSet validWriteSet2 = new MockMessageWriteSet(Collections.singletonList(info2), Collections.singletonList(ByteBuffer.allocate(PUT_RECORD_SIZE)), null);
    MessageWriteSet validWriteSet3 = new MockMessageWriteSet(Collections.singletonList(info3), Collections.singletonList(ByteBuffer.allocate(PUT_RECORD_SIZE)), null);
    ReplicaStatusDelegate mockDelegate = mock(ReplicaStatusDelegate.class);
    // Test1: simulate StoreErrorCodes.IOError triggered by corrupted write set.
    // verify that store can capture disk I/O errors and take proper actions. Only Put is exercised in Test1;
    // Delete and TtlUpdate failures are covered in Test2 below.
    BlobStore testStore1 = createBlobStore(getMockReplicaId(tempDirStr), new StoreConfig(new VerifiableProperties(properties)), Collections.singletonList(mockDelegate));
    testStore1.start();
    assertTrue("Store should start successfully", testStore1.isStarted());
    // verify store can keep track of real I/O errors for Put operation and shutdown properly.
    try {
        testStore1.put(corruptedWriteSet);
        fail("should throw exception");
    } catch (StoreException e) {
        assertEquals("Mismatch in error code", StoreErrorCodes.IOError, e.getErrorCode());
    }
    // a single error is below the threshold of 2, so the store must still be running
    assertTrue("Store should be up", testStore1.isStarted());
    // verify error count would be reset after successful Put operation
    testStore1.put(validWriteSet1);
    assertEquals("Error count should be reset", 0, testStore1.getErrorCount().get());
    // trigger a normal shutdown to persist data (otherwise following delete/ttl update operation will encounter ID_Not_Found error)
    testStore1.shutdown();
    // restart for subsequent tests
    testStore1.start();
    // verify consecutive two failed Puts would make store shutdown (storeIoErrorCountToTriggerShutdown = 2)
    // NOTE(review): there is no fail() inside this loop; a Put that unexpectedly succeeds is only caught
    // indirectly by the isStarted() assertion that follows.
    for (int i = 0; i < 2; ++i) {
        try {
            testStore1.put(corruptedWriteSet);
        } catch (StoreException e) {
            assertEquals("Mismatch in error code", StoreErrorCodes.IOError, e.getErrorCode());
        }
    }
    assertFalse("Store should shutdown because error count exceeded threshold", testStore1.isStarted());
    testStore1.start();
    // after the store is started again, its error count is expected to be back at zero
    assertEquals("Error count should be reset", 0, testStore1.getErrorCount().get());
    testStore1.shutdown();
    // Test2: Simulate StoreErrorCodes.IOError occurred in getStoreKey step even though WriteSet is valid
    // verify that store can capture disk I/O errors in GET method and take proper actions. Put/Delete/TtlUpdates are also tested.
    properties.put("store.index.max.number.of.inmem.elements", "1");
    // Test2 threshold: three I/O errors trigger a store shutdown
    properties.put("store.io.error.count.to.trigger.shutdown", "3");
    MetricRegistry registry = new MetricRegistry();
    StoreMetrics metrics = new StoreMetrics(registry);
    // spy on the real key factory so that getStoreKey can be stubbed to throw on demand
    StoreKeyFactory mockStoreKeyFactory = Mockito.spy(STORE_KEY_FACTORY);
    BlobStore testStore2 = new BlobStore(getMockReplicaId(tempDirStr), new StoreConfig(new VerifiableProperties(properties)), scheduler, storeStatsScheduler, diskIOScheduler, diskSpaceAllocator, metrics, metrics, mockStoreKeyFactory, recovery, hardDelete, Collections.singletonList(mockDelegate), time, new InMemAccountService(false, false), null);
    testStore2.start();
    assertTrue("Store should start up", testStore2.isStarted());
    testStore2.put(validWriteSet2);
    testStore2.put(validWriteSet3);
    // shutdown and restart to make the segments be memory mapped (this is used to simulate IOException generated by mockStoreKeyFactory)
    testStore2.shutdown();
    testStore2.start();
    doThrow(new IOException(StoreException.IO_ERROR_STR)).when(mockStoreKeyFactory).getStoreKey(any(DataInputStream.class));
    // verify that store exceptions (caused by IOException and InternalError) could be captured by Get operation
    try {
        testStore2.get(Collections.singletonList(id2), EnumSet.noneOf(StoreGetOptions.class));
        fail("should throw exception");
    } catch (StoreException e) {
        assertEquals("Mismatch in error code", StoreErrorCodes.IOError, e.getErrorCode());
    }
    doThrow(new InternalError(StoreException.INTERNAL_ERROR_STR)).when(mockStoreKeyFactory).getStoreKey(any(DataInputStream.class));
    try {
        testStore2.get(Collections.singletonList(id2), EnumSet.noneOf(StoreGetOptions.class));
        fail("should throw exception");
    } catch (StoreException e) {
        assertEquals("Mismatch in error code", StoreErrorCodes.IOError, e.getErrorCode());
    }
    // both failures above are expected to have been counted as I/O errors
    assertEquals("Mismatch in error count", 2, testStore2.getErrorCount().get());
    // test that when InternalError's error message is null, the error code should be Unknown_Error and store error count
    // stays unchanged.
    doThrow(new InternalError()).when(mockStoreKeyFactory).getStoreKey(any(DataInputStream.class));
    try {
        testStore2.get(Collections.singletonList(id2), EnumSet.noneOf(StoreGetOptions.class));
        fail("should throw exception");
    } catch (StoreException e) {
        assertEquals("Mismatch in error code", StoreErrorCodes.Unknown_Error, e.getErrorCode());
    }
    assertEquals("Mismatch in error count", 2, testStore2.getErrorCount().get());
    // verify that StoreErrorCodes.Unknown_Error is reported by Get for exceptions whose message is not one of the
    // recognized error strings, and that the error count stays unchanged.
    doThrow(new IOException("Unknown exception")).when(mockStoreKeyFactory).getStoreKey(any(DataInputStream.class));
    try {
        testStore2.get(Collections.singletonList(id2), EnumSet.noneOf(StoreGetOptions.class));
        fail("should throw exception");
    } catch (StoreException e) {
        assertEquals("Mismatch in error code", StoreErrorCodes.Unknown_Error, e.getErrorCode());
    }
    doThrow(new InternalError("Unknown exception")).when(mockStoreKeyFactory).getStoreKey(any(DataInputStream.class));
    try {
        testStore2.get(Collections.singletonList(id2), EnumSet.noneOf(StoreGetOptions.class));
        fail("should throw exception");
    } catch (StoreException e) {
        assertEquals("Mismatch in error code", StoreErrorCodes.Unknown_Error, e.getErrorCode());
    }
    assertEquals("Mismatch in error count", 2, testStore2.getErrorCount().get());
    // verify error count would be reset after successful Get operation
    // (resetting the spy removes the throwing stubs configured above)
    Mockito.reset(mockStoreKeyFactory);
    StoreInfo storeInfo = testStore2.get(Collections.singletonList(id2), EnumSet.noneOf(StoreGetOptions.class));
    assertNotNull(storeInfo);
    assertEquals("Error count should be reset", 0, testStore2.getErrorCount().get());
    // from here on, three consecutive failing operations (Put, TtlUpdate, Delete) are issued against a threshold of 3
    doThrow(new IOException(StoreException.IO_ERROR_STR)).when(mockStoreKeyFactory).getStoreKey(any(DataInputStream.class));
    // call put method to trigger StoreException
    try {
        testStore2.put(validWriteSet1);
        fail("should throw exception");
    } catch (StoreException e) {
        assertEquals("Mismatch in error code", StoreErrorCodes.IOError, e.getErrorCode());
    }
    // call TtlUpdate method to trigger StoreException
    MessageInfo ttlUpdateInfo = new MessageInfo(id2, TTL_UPDATE_RECORD_SIZE, false, true, Utils.Infinite_Time, id2.getAccountId(), id2.getContainerId(), time.milliseconds());
    try {
        testStore2.updateTtl(Collections.singletonList(ttlUpdateInfo));
        fail("should throw exception");
    } catch (StoreException e) {
        assertEquals("Mismatch in error code", StoreErrorCodes.IOError, e.getErrorCode());
    }
    // call delete method to trigger StoreException
    MessageInfo deleteInfo = new MessageInfo(id2, DELETE_RECORD_SIZE, id2.getAccountId(), id2.getContainerId(), time.milliseconds());
    try {
        testStore2.delete(Collections.singletonList(deleteInfo));
        fail("should throw exception");
    } catch (StoreException e) {
        assertEquals("Mismatch in error code", StoreErrorCodes.IOError, e.getErrorCode());
    }
    // verify error count keeps track of StoreException and shut down store properly
    assertEquals("Mismatch in triggered shutdown counter", 1, metrics.storeIoErrorTriggeredShutdownCount.getCount());
    assertFalse("Store should shutdown because error count exceeded threshold", testStore2.isStarted());
    // bring the shared store of this test class back up
    reloadStore();
}
Also used : VerifiableProperties(com.github.ambry.config.VerifiableProperties) MetricRegistry(com.codahale.metrics.MetricRegistry) IOException(java.io.IOException) DataInputStream(java.io.DataInputStream) ReplicaStatusDelegate(com.github.ambry.clustermap.ReplicaStatusDelegate) InMemAccountService(com.github.ambry.account.InMemAccountService) StoreConfig(com.github.ambry.config.StoreConfig) Test(org.junit.Test)

Example 8 with ReplicaStatusDelegate

use of com.github.ambry.clustermap.ReplicaStatusDelegate in project ambry by linkedin.

the class BlobStoreTest method testClusterManagerReplicaStatusDelegateUse.

/**
 * Verifies how the blob store drives a {@link ReplicaStatusDelegate}: the replica is enabled on start,
 * sealed once enough data has been written, and unsealed again when the thresholds are raised or (for
 * segmented logs) after the added data is deleted and compacted away.
 * @throws StoreException
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void testClusterManagerReplicaStatusDelegateUse() throws StoreException, IOException, InterruptedException {
    final String localPartitionStateKey = "store.set.local.partition.state.enabled";
    // TODO: compaction for segmented logs never encounters TTL updates
    if (isLogSegmented) {
        // tear down and rebuild the test state with new schedulers
        cleanup();
        scheduler = Utils.newScheduler(1, false);
        storeStatsScheduler = Utils.newScheduler(1, false);
        setupTestState(false, false);
    }
    properties.setProperty(localPartitionStateKey, Boolean.toString(true));

    // Thresholds under test, the replica, and a mocked delegate whose seal/unseal always report success
    StoreConfig baselineConfig = changeThreshold(65, 5, true);
    StoreTestUtils.MockReplicaId replica = getMockReplicaId(tempDirStr);
    ReplicaStatusDelegate delegate = mock(ReplicaStatusDelegate.class);
    when(delegate.unseal(any())).thenReturn(true);
    when(delegate.seal(any())).thenReturn(true);
    List<ReplicaStatusDelegate> delegates = Collections.singletonList(delegate);

    // Restart the store with the delegate attached
    reloadStore(baselineConfig, replica, delegates);
    // On start the delegate should have been asked once to enable the replica (in case it was disabled before)
    verify(delegate, times(1)).enableReplica(replica);

    // A small put that stays below the threshold must not reach the delegate
    put(1, 50, Utils.Infinite_Time);
    verify(delegate, times(0)).seal(replica);

    // setupTestState has already created 3 log segments; 4 more segments should be enough to fill
    // the log up to 65% of its capacity, at which point the store is expected to go read-only
    List<MockId> bulkIds = put(4, (long) (SEGMENT_CAPACITY * 0.8), Utils.Infinite_Time);
    verify(delegate, times(1)).seal(replica);
    // Mimic the ClusterParticipant marking the replica as sealed
    replica.setSealedState(true);

    // Raised thresholds with the delegate disabled: nothing may happen (store stays sealed)
    reloadStore(changeThreshold(99, 1, false), replica, delegates);
    verify(delegate, times(0)).unseal(replica);

    // Raised thresholds with the delegate enabled: the replica gets unsealed on restart
    reloadStore(changeThreshold(99, 1, true), replica, delegates);
    verify(delegate, times(1)).unseal(replica);
    replica.setSealedState(false);

    // Back to the original thresholds: the replica gets sealed a second time
    reloadStore(baselineConfig, replica, delegates);
    verify(delegate, times(2)).seal(replica);
    replica.setSealedState(true);

    // Everything below only applies to segmented logs
    if (isLogSegmented) {
        // Remove the data added above
        for (MockId bulkId : bulkIds) {
            delete(bulkId);
        }
        // The store has to be restarted, otherwise compaction ignores segments that are in the journal
        // (currently all of them). After a restart only the last segment is in the journal.
        reloadStore(baselineConfig, replica, delegates);
        verify(delegate, times(4)).enableReplica(replica);

        // Move time forward by 8 days and compact the segments holding deleted data; the store is then
        // expected to be read-write again
        time.sleep(TimeUnit.DAYS.toMillis(8));
        store.compact(
            store.getCompactionDetails(new CompactAllPolicy(baselineConfig, time)),
            new byte[PUT_RECORD_SIZE * 2 + 1]);
        verify(delegate, times(2)).unseal(replica);

        // If the replica is wrongly flagged as sealed, startup should correct the status
        replica.setSealedState(true);
        reloadStore(baselineConfig, replica, delegates);
        verify(delegate, times(3)).unseal(replica);
    }

    store.shutdown();
    properties.setProperty(localPartitionStateKey, Boolean.toString(false));
}
Also used : ReplicaStatusDelegate(com.github.ambry.clustermap.ReplicaStatusDelegate) StoreConfig(com.github.ambry.config.StoreConfig) Test(org.junit.Test)

Example 9 with ReplicaStatusDelegate

use of com.github.ambry.clustermap.ReplicaStatusDelegate in project ambry by linkedin.

the class BlobStoreTest method multiReplicaStatusDelegatesTest.

/**
 * Verifies that the store seals and unseals a replica consistently on every participant when more than one
 * {@link ReplicaStatusDelegate} is configured, including reconciliation of diverging sealed lists on startup.
 * @throws Exception
 */
@Test
public void multiReplicaStatusDelegatesTest() throws Exception {
    // Each mocked delegate records the replicas it was asked to seal in its own set
    Set<ReplicaId> sealedByFirst = new HashSet<>();
    Set<ReplicaId> sealedBySecond = new HashSet<>();
    ReplicaStatusDelegate firstDelegate = Mockito.mock(ReplicaStatusDelegate.class);
    ReplicaStatusDelegate secondDelegate = Mockito.mock(ReplicaStatusDelegate.class);

    // first delegate: seal adds to its set, unseal removes, getSealedReplicas lists partition paths
    doAnswer(call -> {
        sealedByFirst.add(call.getArgument(0));
        return true;
    }).when(firstDelegate).seal(any());
    doAnswer(call -> {
        sealedByFirst.remove((ReplicaId) call.getArgument(0));
        return true;
    }).when(firstDelegate).unseal(any());
    doAnswer(call -> sealedByFirst.stream()
        .map(replica -> replica.getPartitionId().toPathString())
        .collect(Collectors.toList())).when(firstDelegate).getSealedReplicas();

    // second delegate: same behavior, backed by its own set
    doAnswer(call -> {
        sealedBySecond.add(call.getArgument(0));
        return true;
    }).when(secondDelegate).seal(any());
    doAnswer(call -> {
        sealedBySecond.remove((ReplicaId) call.getArgument(0));
        return true;
    }).when(secondDelegate).unseal(any());
    doAnswer(call -> sealedBySecond.stream()
        .map(replica -> replica.getPartitionId().toPathString())
        .collect(Collectors.toList())).when(secondDelegate).getSealedReplicas();

    List<ReplicaStatusDelegate> bothDelegates = Arrays.asList(firstDelegate, secondDelegate);
    StoreConfig baselineConfig = changeThreshold(65, 5, true);
    StoreTestUtils.MockReplicaId replica = getMockReplicaId(tempDirStr);
    reloadStore(baselineConfig, replica, bothDelegates);

    // write enough data to get the replica sealed
    put(4, (long) (SEGMENT_CAPACITY * 0.8), Utils.Infinite_Time);
    assertEquals("Sealed replica lists are different", sealedByFirst, sealedBySecond);
    assertEquals("Sealed replica is not correct", replica, sealedByFirst.iterator().next());

    // raise the read-only threshold so that the replica gets unsealed on restart
    replica.setSealedState(true);
    reloadStore(changeThreshold(99, 1, true), replica, bothDelegates);
    boolean nothingSealed = sealedByFirst.isEmpty() && sealedBySecond.isEmpty();
    assertTrue("Replica should be unsealed", nothingSealed);
    assertEquals("After startup, store should be in STANDBY state", STANDBY, store.getCurrentState());

    // the replica is still flagged as sealed ("replica.setSealedState(true)" above); lowering the
    // threshold must nevertheless update the sealed lists on both delegates again
    reloadStore(changeThreshold(50, 5, true), replica, bothDelegates);
    assertEquals("Sealed replica lists are different", sealedByFirst, sealedBySecond);
    assertEquals("Sealed replica is not correct", replica, sealedByFirst.iterator().next());

    // reconciliation case: use a wide read-write delta and wipe the second sealed list so that the
    // two delegates disagree and have to be reconciled on startup
    sealedBySecond.clear();
    reloadStore(changeThreshold(99, 90, true), replica, bothDelegates);
    assertEquals("Sealed replica lists are different", sealedByFirst, sealedBySecond);
    assertEquals("Sealed replica is not correct", replica, sealedBySecond.iterator().next());

    store.shutdown();
}
Also used : ReplicaStatusDelegate(com.github.ambry.clustermap.ReplicaStatusDelegate) StoreConfig(com.github.ambry.config.StoreConfig) ReplicaId(com.github.ambry.clustermap.ReplicaId) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

ReplicaStatusDelegate (com.github.ambry.clustermap.ReplicaStatusDelegate)9 StoreConfig (com.github.ambry.config.StoreConfig)6 Test (org.junit.Test)5 MetricRegistry (com.codahale.metrics.MetricRegistry)4 VerifiableProperties (com.github.ambry.config.VerifiableProperties)4 ReplicaId (com.github.ambry.clustermap.ReplicaId)3 HashSet (java.util.HashSet)3 InMemAccountService (com.github.ambry.account.InMemAccountService)2 File (java.io.File)2 ArrayList (java.util.ArrayList)2 ClusterParticipant (com.github.ambry.clustermap.ClusterParticipant)1 HelixFactory (com.github.ambry.clustermap.HelixFactory)1 MockHelixParticipant (com.github.ambry.clustermap.MockHelixParticipant)1 PartitionId (com.github.ambry.clustermap.PartitionId)1 TestUtils (com.github.ambry.clustermap.TestUtils)1 ClusterMapConfig (com.github.ambry.config.ClusterMapConfig)1 ReplicationManager (com.github.ambry.replication.ReplicationManager)1 BlobStore (com.github.ambry.store.BlobStore)1 StorageManager (com.github.ambry.store.StorageManager)1 Store (com.github.ambry.store.Store)1