Search in sources :

Example 1 with ReplicaStatusDelegate

use of com.github.ambry.clustermap.ReplicaStatusDelegate in project ambry by linkedin.

the class BlobStoreTest method storeErrorTriggerDisableReplicaTest.

/**
 * Test that replica is correctly disabled when store is shut down due to disk I/O error.
 * <p>
 * The test wires a {@code MockHelixParticipant} to a mocked {@code HelixAdmin} and walks through three phases:
 * <ol>
 *   <li>the admin returns a {@code null} InstanceConfig, and no partition is expected to be recorded as disabled;</li>
 *   <li>the admin returns the tracked InstanceConfig, and the store's id is expected in the disabled lists;</li>
 *   <li>the store is started again, and the disabled lists are expected to be cleared.</li>
 * </ol>
 * @throws Exception if any step of the test fails unexpectedly.
 */
@Test
public void storeErrorTriggerDisableReplicaTest() throws Exception {
    final String RESOURCE_NAME = "0";
    final String CLUSTER_NAME = "BlobStoreTest";
    // setup testing environment
    // stop the fixture's shared store; this test builds and drives its own store (testStore) below
    store.shutdown();
    List<TestUtils.ZkInfo> zkInfoList = new ArrayList<>();
    zkInfoList.add(new TestUtils.ZkInfo(null, "DC1", (byte) 0, 2199, false));
    JSONObject zkJson = constructZkLayoutJSON(zkInfoList);
    properties.setProperty("clustermap.cluster.name", CLUSTER_NAME);
    properties.setProperty("clustermap.datacenter.name", "DC1");
    properties.setProperty("clustermap.host.name", "localhost");
    properties.setProperty("clustermap.dcs.zk.connect.strings", zkJson.toString(2));
    // NOTE(review): presumably a threshold of 1 makes the very first I/O error shut the store down - confirm in StoreConfig
    properties.setProperty("store.io.error.count.to.trigger.shutdown", "1");
    properties.setProperty("store.replica.status.delegate.enable", "true");
    properties.setProperty("store.set.local.partition.state.enabled", "true");
    ClusterMapConfig clusterMapConfig = new ClusterMapConfig(new VerifiableProperties(properties));
    // held in an AtomicReference so the stubs below can both read the latest config and replace it
    AtomicReference<InstanceConfig> instanceConfig = new AtomicReference<>(new InstanceConfig("localhost"));
    instanceConfig.get().setPort("2222");
    // ideal state whose list fields contain a single entry keyed by this store's id (with a null value)
    Map<String, List<String>> listMap = new HashMap<>();
    listMap.put(storeId, null);
    ZNRecord znRecord = new ZNRecord("localhost");
    znRecord.setListFields(listMap);
    IdealState idealState = new IdealState(znRecord);
    idealState.setRebalanceMode(IdealState.RebalanceMode.SEMI_AUTO);
    // mock helix related components
    HelixAdmin mockHelixAdmin = mock(HelixAdmin.class);
    // answer lazily so each call sees whatever InstanceConfig is currently stored in the reference
    when(mockHelixAdmin.getInstanceConfig(eq(CLUSTER_NAME), anyString())).then(invocation -> instanceConfig.get());
    when(mockHelixAdmin.getResourcesInCluster(eq(CLUSTER_NAME))).thenReturn(Collections.singletonList(RESOURCE_NAME));
    when(mockHelixAdmin.getResourceIdealState(eq(CLUSTER_NAME), eq(RESOURCE_NAME))).thenReturn(idealState);
    // record the InstanceConfig passed as the third argument so later assertions can inspect it
    when(mockHelixAdmin.setInstanceConfig(any(), any(), any())).then(invocation -> {
        instanceConfig.set(invocation.getArgument(2));
        return true;
    });
    HelixManager mockHelixManager = mock(HelixManager.class);
    when(mockHelixManager.getClusterManagmentTool()).thenReturn(mockHelixAdmin);
    // factory that always hands back the mocked manager regardless of the requested cluster/instance
    HelixFactory mockHelixFactory = new HelixFactory() {

        @Override
        public HelixManager getZKHelixManager(String clusterName, String instanceName, InstanceType instanceType, String zkAddr) {
            return mockHelixManager;
        }
    };
    MockHelixParticipant.metricRegistry = new MetricRegistry();
    MockHelixParticipant mockParticipant = new MockHelixParticipant(clusterMapConfig, mockHelixFactory);
    // NOTE(review): presumably lets the participant run its real disable-replica logic - confirm in MockHelixParticipant
    mockParticipant.overrideDisableReplicaMethod = false;
    ReplicaStatusDelegate replicaStatusDelegate = new ReplicaStatusDelegate(mockParticipant);
    BlobStore testStore = createBlobStore(getMockAmbryReplica(clusterMapConfig, tempDirStr), new StoreConfig(new VerifiableProperties(properties)), Collections.singletonList(replicaStatusDelegate));
    testStore.start();
    assertTrue("Store should start successfully", testStore.isStarted());
    // create corrupted write set: it is built with an IOError StoreException, which put() below is expected to surface
    MessageInfo corruptedInfo = new MessageInfo(getUniqueId(), PUT_RECORD_SIZE, Utils.getRandomShort(TestUtils.RANDOM), Utils.getRandomShort(TestUtils.RANDOM), Utils.Infinite_Time);
    MessageWriteSet corruptedWriteSet = new MockMessageWriteSet(Collections.singletonList(corruptedInfo), Collections.singletonList(ByteBuffer.allocate(PUT_RECORD_SIZE)), new StoreException(StoreException.IO_ERROR_STR, StoreErrorCodes.IOError));
    // 1. mock failure case: re-stub the admin so that no InstanceConfig can be fetched
    when(mockHelixAdmin.getInstanceConfig(eq(CLUSTER_NAME), anyString())).thenReturn(null);
    // trigger store exception when calling store.put()
    try {
        testStore.put(corruptedWriteSet);
        fail("should throw exception");
    } catch (StoreException e) {
        assertEquals("Mismatch in error code", StoreErrorCodes.IOError, e.getErrorCode());
    }
    assertNull("Disabled partition list should be null as disabling replica didn't succeed", instanceConfig.get().getDisabledPartitions(RESOURCE_NAME));
    // 2. mock success case: restore the lazy stub, restart the store and hit the same I/O error again
    when(mockHelixAdmin.getInstanceConfig(eq(CLUSTER_NAME), anyString())).then(invocation -> instanceConfig.get());
    testStore.start();
    assertTrue("Store should start successfully", testStore.isStarted());
    try {
        testStore.put(corruptedWriteSet);
        fail("should throw exception");
    } catch (StoreException e) {
        assertEquals("Mismatch in error code", StoreErrorCodes.IOError, e.getErrorCode());
    }
    assertEquals("Disabled partition name is not expected", storeId, instanceConfig.get().getDisabledPartitions(RESOURCE_NAME).get(0));
    // verify "DISABLED" list in InstanceConfig has correct partition id.
    assertEquals("Disabled replica list is not expected", Collections.singletonList(storeId), getDisabledReplicas(instanceConfig.get()));
    // 3. mock disk is replaced case, restart should succeed
    testStore.start();
    assertNull("Disabled partition list should be null as restart will enable same replica", instanceConfig.get().getDisabledPartitions(RESOURCE_NAME));
    assertTrue("Disabled replica list should be empty", getDisabledReplicas(instanceConfig.get()).isEmpty());
    testStore.shutdown();
    // NOTE(review): presumably re-creates the shared fixture store that was shut down at the top - confirm in reloadStore()
    reloadStore();
}
Also used : ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HelixAdmin(org.apache.helix.HelixAdmin) IdealState(org.apache.helix.model.IdealState) TestUtils(com.github.ambry.utils.TestUtils) TestUtils(com.github.ambry.clustermap.TestUtils) ReplicaStatusDelegate(com.github.ambry.clustermap.ReplicaStatusDelegate) InstanceConfig(org.apache.helix.model.InstanceConfig) MockHelixParticipant(com.github.ambry.clustermap.MockHelixParticipant) List(java.util.List) ArrayList(java.util.ArrayList) InstanceType(org.apache.helix.InstanceType) ZNRecord(org.apache.helix.zookeeper.datamodel.ZNRecord) HelixManager(org.apache.helix.HelixManager) HelixFactory(com.github.ambry.clustermap.HelixFactory) VerifiableProperties(com.github.ambry.config.VerifiableProperties) MetricRegistry(com.codahale.metrics.MetricRegistry) AtomicReference(java.util.concurrent.atomic.AtomicReference) ClusterMapConfig(com.github.ambry.config.ClusterMapConfig) JSONObject(org.json.JSONObject) StoreConfig(com.github.ambry.config.StoreConfig) Test(org.junit.Test)

Example 2 with ReplicaStatusDelegate

use of com.github.ambry.clustermap.ReplicaStatusDelegate in project ambry by linkedin.

the class BlobStoreTest method catchStoreExceptionAndVerifyErrorCode.

// helpers
// general
/**
 * Drives a store operation against a {@link MockBlobStore} whose index is rigged to throw, and checks how the store
 * reacts to the resulting {@link StoreException}:
 * <ul>
 *   <li>a regular start/shutdown cycle leaves a clean shutdown file in the store directory;</li>
 *   <li>an {@code Unknown_Error} is rethrown, keeps the store started and leaves the I/O error count at 0;</li>
 *   <li>an {@code IOError} is rethrown, stops the store, and no clean shutdown file is left behind.</li>
 * </ul>
 * @param methodCaller the method caller to invoke store methods to trigger store exception
 * @throws StoreException if starting or shutting down the store fails outside of the expected failures.
 */
private void catchStoreExceptionAndVerifyErrorCode(StoreMethodCaller methodCaller) throws StoreException {
    properties.put("store.io.error.count.to.trigger.shutdown", "1");
    ReplicaStatusDelegate delegate = mock(ReplicaStatusDelegate.class);
    ReplicaId replica = getMockReplicaId(tempDirStr);
    StoreConfig storeConfig = new StoreConfig(new VerifiableProperties(properties));
    List<ReplicaStatusDelegate> delegates = Collections.singletonList(delegate);
    StoreMetrics storeMetrics = new StoreMetrics(new MetricRegistry());
    MockBlobStore blobStore = new MockBlobStore(replica, storeConfig, delegates, storeMetrics);
    File cleanShutdownMarker = new File(tempDir, PersistentIndex.CLEAN_SHUTDOWN_FILENAME);

    // Phase 1: an ordinary start/shutdown cycle must leave the clean shutdown file behind.
    blobStore.start();
    blobStore.shutdown();
    assertTrue("Clean shutdown file should exist", cleanShutdownMarker.exists());
    blobStore.start();

    // Phase 2: an Unknown_Error must surface to the caller without stopping the store.
    StoreException unknownError = new StoreException("Mock Unknown error", StoreErrorCodes.Unknown_Error);
    blobStore.setPersistentIndex(unknownError);
    try {
        methodCaller.invoke(blobStore);
        fail("should fail");
    } catch (StoreException caught) {
        assertEquals("Mismatch in StoreErrorCode", StoreErrorCodes.Unknown_Error, caught.getErrorCode());
    }
    assertTrue("Store should not be shut down", blobStore.isStarted());
    assertEquals("Mismatch in store io error count", 0, blobStore.getErrorCount().get());

    // Phase 3: an IOError (disk I/O error) must surface to the caller and stop the store.
    StoreException diskIoError = new StoreException("Mock disk I/O error", StoreErrorCodes.IOError);
    blobStore.setPersistentIndex(diskIoError);
    try {
        methodCaller.invoke(blobStore);
        fail("should fail");
    } catch (StoreException caught) {
        assertEquals("Mismatch in StoreErrorCode", StoreErrorCodes.IOError, caught.getErrorCode());
    }
    assertFalse("Store should be shutdown after error count exceeded threshold", blobStore.isStarted());

    // A shutdown triggered by a real disk I/O error is expected to skip flushing, so the marker must be absent.
    assertFalse("When encountering disk I/O error, clean shutdown file shouldn't exist", cleanShutdownMarker.exists());
}
Also used : ReplicaStatusDelegate(com.github.ambry.clustermap.ReplicaStatusDelegate) VerifiableProperties(com.github.ambry.config.VerifiableProperties) MetricRegistry(com.codahale.metrics.MetricRegistry) StoreConfig(com.github.ambry.config.StoreConfig) File(java.io.File)

Example 3 with ReplicaStatusDelegate

use of com.github.ambry.clustermap.ReplicaStatusDelegate in project ambry by linkedin.

the class BlobStoreTest method resolveStoreInitialStateTest.

/**
 * Test that when the store is given a {@link ReplicaStatusDelegate} backed by a {@link ClusterParticipant} that
 * reports {@code supportsStateChanges() == true}, the store state is OFFLINE right after startup (it is expected to
 * be updated by a Helix state transition later).
 * @throws Exception if creating the temp directories or starting/stopping a store fails.
 */
@Test
public void resolveStoreInitialStateTest() throws Exception {
    // stop the fixture's shared store; this test exercises a dedicated store in its own directory
    store.shutdown();
    properties.setProperty(StoreConfig.storeReplicaStatusDelegateEnableName, "true");
    File storeDir = StoreTestUtils.createTempDirectory("store-" + storeId);
    File reserveDir = StoreTestUtils.createTempDirectory("reserve-pool");
    reserveDir.deleteOnExit();
    DiskSpaceAllocator diskAllocator = new DiskSpaceAllocator(true, reserveDir, 0, new StorageManagerMetrics(new MetricRegistry()));
    // the config must be built after the delegate-enable property has been set above
    StoreConfig config = new StoreConfig(new VerifiableProperties(properties));
    MetricRegistry registry = new MetricRegistry();
    StoreMetrics metrics = new StoreMetrics(registry);
    // participant mock that advertises support for state changes
    ClusterParticipant dynamicParticipant = Mockito.mock(ClusterParticipant.class);
    when(dynamicParticipant.supportsStateChanges()).thenReturn(true);
    ReplicaStatusDelegate delegate = new ReplicaStatusDelegate(dynamicParticipant);
    BlobStore testStore = new BlobStore(getMockReplicaId(storeDir.getAbsolutePath()), config, scheduler, storeStatsScheduler, diskIOScheduler, diskAllocator, metrics, metrics, STORE_KEY_FACTORY, recovery, hardDelete, Collections.singletonList(delegate), time, new InMemAccountService(false, false), null);
    testStore.start();
    try {
        assertEquals("Store current state should be OFFLINE if dynamic participant is adopted", OFFLINE, testStore.getCurrentState());
    } finally {
        // always stop the store, even when the assertion fails, so a started store is not left behind
        testStore.shutdown();
    }
}
Also used : ReplicaStatusDelegate(com.github.ambry.clustermap.ReplicaStatusDelegate) InMemAccountService(com.github.ambry.account.InMemAccountService) VerifiableProperties(com.github.ambry.config.VerifiableProperties) MetricRegistry(com.codahale.metrics.MetricRegistry) StoreConfig(com.github.ambry.config.StoreConfig) File(java.io.File) ClusterParticipant(com.github.ambry.clustermap.ClusterParticipant) Test(org.junit.Test)

Example 4 with ReplicaStatusDelegate

use of com.github.ambry.clustermap.ReplicaStatusDelegate in project ambry by linkedin.

the class AmbryServerRequests method handleRemoveStoreRequest.

/**
 * Handles the admin request that removes a BlobStore from the current node.
 * <p>
 * The replica is first detached from the stats manager and the replication manager. The store is then removed from
 * the store manager; on success its files are deleted and every {@link ReplicaStatusDelegate} of the store is asked
 * to unseal the replica and unmark it as stopped.
 * @param partitionId the {@link PartitionId} associated with BlobStore
 * @return {@link ServerErrorCode#Partition_Unknown} if no replica of the partition is found on this node,
 *         {@link ServerErrorCode#Unknown_Error} if the store could not be removed or was not found, and
 *         {@link ServerErrorCode#No_Error} otherwise.
 * @throws StoreException declared by the store operations invoked here.
 * @throws IOException declared by the store operations invoked here.
 */
private ServerErrorCode handleRemoveStoreRequest(PartitionId partitionId) throws StoreException, IOException {
    ReplicaId replicaId = storeManager.getReplica(partitionId.toPathString());
    if (replicaId == null) {
        logger.error("{} doesn't exist on current node", partitionId);
        return ServerErrorCode.Partition_Unknown;
    }

    // Detach the replica from stats collection and from replication before touching the store itself.
    statsManager.removeReplica(replicaId);
    ReplicationManager replicationManager = (ReplicationManager) replicationEngine;
    replicationManager.removeReplica(replicaId);

    // Look the store up before removing it; removal is attempted regardless of whether the lookup found anything.
    Store store = ((StorageManager) storeManager).getStore(partitionId, true);
    boolean removedFromManager = storeManager.removeBlobStore(partitionId);
    if (!removedFromManager || store == null) {
        return ServerErrorCode.Unknown_Error;
    }

    BlobStore blobStore = (BlobStore) store;
    blobStore.deleteStoreFiles();
    for (ReplicaStatusDelegate delegate : blobStore.getReplicaStatusDelegates()) {
        // Clear the replica from the sealed and stopped lists, if it is present there.
        logger.info("Removing store from sealed and stopped list(if present)");
        delegate.unseal(replicaId);
        delegate.unmarkStopped(Collections.singletonList(replicaId));
    }
    return ServerErrorCode.No_Error;
}
Also used : ReplicationManager(com.github.ambry.replication.ReplicationManager) ReplicaStatusDelegate(com.github.ambry.clustermap.ReplicaStatusDelegate) StorageManager(com.github.ambry.store.StorageManager) Store(com.github.ambry.store.Store) BlobStore(com.github.ambry.store.BlobStore) ReplicaId(com.github.ambry.clustermap.ReplicaId) BlobStore(com.github.ambry.store.BlobStore)

Example 5 with ReplicaStatusDelegate

use of com.github.ambry.clustermap.ReplicaStatusDelegate in project ambry by linkedin.

the class DiskManager method setBlobStoreStoppedState.

/**
 * Marks or unmarks as stopped the replicas that back the given partitions on this disk.
 * <p>
 * Partitions without a {@link BlobStore} on this disk are reported as failed. If no {@link ReplicaStatusDelegate} is
 * available, or any delegate reports a failed update, every requested partition is reported as failed.
 * @param partitionIds a list of {@link PartitionId} of the {@link BlobStore} whose stopped state should be set.
 * @param markStop whether to mark BlobStore as stopped ({@code true}) or started.
 * @return a list of {@link PartitionId} whose stopped state fails to be updated.
 */
List<PartitionId> setBlobStoreStoppedState(List<PartitionId> partitionIds, boolean markStop) {
    Set<PartitionId> failedPartitions = new HashSet<>();
    List<ReplicaId> targetReplicas = new ArrayList<>();

    // Resolve partitions to replicas while holding the read lock.
    rwLock.readLock().lock();
    try {
        for (PartitionId partitionId : partitionIds) {
            if (stores.get(partitionId) != null) {
                targetReplicas.add(partitionToReplicaMap.get(partitionId));
                continue;
            }
            // no need to check if the store is started because this method could be called after store is successfully shutdown.
            logger.error("store is not found on this disk when trying to update stoppedReplicas list");
            failedPartitions.add(partitionId);
        }
    } finally {
        rwLock.readLock().unlock();
    }

    boolean allUpdated;
    if (replicaStatusDelegates == null || replicaStatusDelegates.isEmpty()) {
        logger.warn("The ReplicaStatusDelegate is not instantiated");
        allUpdated = false;
    } else {
        logger.trace("Setting replica stopped state via ReplicaStatusDelegate on replica {}", Arrays.toString(targetReplicas.toArray()));
        allUpdated = true;
        for (ReplicaStatusDelegate delegate : replicaStatusDelegates) {
            // Every delegate is invoked, even after an earlier one has already reported failure.
            boolean delegateUpdated;
            if (markStop) {
                delegateUpdated = delegate.markStopped(targetReplicas);
            } else {
                delegateUpdated = delegate.unmarkStopped(targetReplicas);
            }
            allUpdated = allUpdated & delegateUpdated;
        }
    }

    if (!allUpdated) {
        // Either a mark/unmark operation failed or no delegate is instantiated: report all requested partitions.
        failedPartitions.addAll(partitionIds);
    }
    return new ArrayList<>(failedPartitions);
}
Also used : ReplicaStatusDelegate(com.github.ambry.clustermap.ReplicaStatusDelegate) ArrayList(java.util.ArrayList) PartitionId(com.github.ambry.clustermap.PartitionId) ReplicaId(com.github.ambry.clustermap.ReplicaId) HashSet(java.util.HashSet)

Aggregations

ReplicaStatusDelegate (com.github.ambry.clustermap.ReplicaStatusDelegate)9 StoreConfig (com.github.ambry.config.StoreConfig)6 Test (org.junit.Test)5 MetricRegistry (com.codahale.metrics.MetricRegistry)4 VerifiableProperties (com.github.ambry.config.VerifiableProperties)4 ReplicaId (com.github.ambry.clustermap.ReplicaId)3 HashSet (java.util.HashSet)3 InMemAccountService (com.github.ambry.account.InMemAccountService)2 File (java.io.File)2 ArrayList (java.util.ArrayList)2 ClusterParticipant (com.github.ambry.clustermap.ClusterParticipant)1 HelixFactory (com.github.ambry.clustermap.HelixFactory)1 MockHelixParticipant (com.github.ambry.clustermap.MockHelixParticipant)1 PartitionId (com.github.ambry.clustermap.PartitionId)1 TestUtils (com.github.ambry.clustermap.TestUtils)1 ClusterMapConfig (com.github.ambry.config.ClusterMapConfig)1 ReplicationManager (com.github.ambry.replication.ReplicationManager)1 BlobStore (com.github.ambry.store.BlobStore)1 StorageManager (com.github.ambry.store.StorageManager)1 Store (com.github.ambry.store.Store)1