
Example 1 with AmbryReplicaSyncUpManager

Use of com.github.ambry.clustermap.AmbryReplicaSyncUpManager in project ambry by linkedin.

The class ReplicationTest, method replicationLagMetricAndSyncUpTest.

/**
 * Tests {@link ReplicationMetrics#getMaxLagForPartition(PartitionId)}
 * @throws Exception
 */
@Test
public void replicationLagMetricAndSyncUpTest() throws Exception {
    MockClusterMap clusterMap = new MockClusterMap();
    ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
    AmbryReplicaSyncUpManager replicaSyncUpService = new AmbryReplicaSyncUpManager(clusterMapConfig);
    Pair<MockHost, MockHost> localAndRemoteHosts = getLocalAndRemoteHosts(clusterMap);
    MockHost localHost = localAndRemoteHosts.getFirst();
    MockHost remoteHost1 = localAndRemoteHosts.getSecond();
    // create another remoteHost2 that shares the special partition with localHost and remoteHost1
    PartitionId specialPartitionId = clusterMap.getWritablePartitionIds(MockClusterMap.SPECIAL_PARTITION_CLASS).get(0);
    MockHost remoteHost2 = new MockHost(specialPartitionId.getReplicaIds().get(2).getDataNodeId(), clusterMap);
    MockStoreKeyConverterFactory storeKeyConverterFactory = new MockStoreKeyConverterFactory(null, null);
    storeKeyConverterFactory.setConversionMap(new HashMap<>());
    storeKeyConverterFactory.setReturnInputIfAbsent(true);
    MockStoreKeyConverterFactory.MockStoreKeyConverter storeKeyConverter = storeKeyConverterFactory.getStoreKeyConverter();
    int batchSize = 4;
    List<PartitionId> partitionIds = clusterMap.getWritablePartitionIds(null);
    for (int i = 0; i < partitionIds.size(); i++) {
        PartitionId partitionId = partitionIds.get(i);
        // add batchSize + 1 messages to remoteHost1 so that two rounds of replication are needed.
        addPutMessagesToReplicasOfPartition(partitionId, Collections.singletonList(remoteHost1), batchSize + 1);
    }
    // add batchSize - 1 messages to remoteHost2 so that localHost can catch up during one cycle of replication
    for (ReplicaId replicaId : clusterMap.getReplicaIds(remoteHost2.dataNodeId)) {
        addPutMessagesToReplicasOfPartition(replicaId.getPartitionId(), Collections.singletonList(remoteHost2), batchSize - 1);
    }
    StoreKeyFactory storeKeyFactory = new BlobIdFactory(clusterMap);
    Transformer transformer = new BlobIdTransformer(storeKeyFactory, storeKeyConverter);
    Pair<Map<DataNodeId, List<RemoteReplicaInfo>>, ReplicaThread> replicasAndThread1 = getRemoteReplicasAndReplicaThread(batchSize, clusterMap, localHost, remoteHost1, storeKeyConverter, transformer, null, replicaSyncUpService);
    Map<DataNodeId, List<RemoteReplicaInfo>> replicasToReplicate1 = replicasAndThread1.getFirst();
    ReplicaThread replicaThread1 = replicasAndThread1.getSecond();
    // mock Bootstrap-To-Standby transition in ReplicationManager: 1. update store current state; 2. initiate bootstrap
    replicasToReplicate1.get(remoteHost1.dataNodeId).forEach(info -> info.getLocalStore().setCurrentState(ReplicaState.BOOTSTRAP));
    clusterMap.getReplicaIds(localHost.dataNodeId).forEach(replicaSyncUpService::initiateBootstrap);
    List<ReplicaThread.ExchangeMetadataResponse> response = replicaThread1.exchangeMetadata(new MockConnectionPool.MockConnection(remoteHost1, batchSize), replicasToReplicate1.get(remoteHost1.dataNodeId));
    replicaThread1.fixMissingStoreKeys(new MockConnectionPool.MockConnection(remoteHost1, batchSize), replicasToReplicate1.get(remoteHost1.dataNodeId), response, false);
    for (PartitionId partitionId : partitionIds) {
        List<MessageInfo> allMessageInfos = localAndRemoteHosts.getSecond().infosByPartition.get(partitionId);
        long expectedLag = allMessageInfos.subList(batchSize, allMessageInfos.size()).stream().mapToLong(MessageInfo::getSize).sum();
        assertEquals("Replication lag doesn't match expected value", expectedLag, replicaThread1.getReplicationMetrics().getMaxLagForPartition(partitionId));
    }
    response = replicaThread1.exchangeMetadata(new MockConnectionPool.MockConnection(remoteHost1, batchSize), replicasToReplicate1.get(remoteHost1.dataNodeId));
    replicaThread1.fixMissingStoreKeys(new MockConnectionPool.MockConnection(remoteHost1, batchSize), replicasToReplicate1.get(remoteHost1.dataNodeId), response, false);
    for (PartitionId partitionId : partitionIds) {
        assertEquals("Replication lag should equal to 0", 0, replicaThread1.getReplicationMetrics().getMaxLagForPartition(partitionId));
    }
    // replicate with remoteHost2 to ensure special replica has caught up with enough peers
    Pair<Map<DataNodeId, List<RemoteReplicaInfo>>, ReplicaThread> replicasAndThread2 = getRemoteReplicasAndReplicaThread(batchSize, clusterMap, localHost, remoteHost2, storeKeyConverter, transformer, null, replicaSyncUpService);
    Map<DataNodeId, List<RemoteReplicaInfo>> replicasToReplicate2 = replicasAndThread2.getFirst();
    ReplicaThread replicaThread2 = replicasAndThread2.getSecond();
    // initiate bootstrap on replica of special partition
    RemoteReplicaInfo specialReplicaInfo = replicasToReplicate2.get(remoteHost2.dataNodeId).stream().filter(info -> info.getReplicaId().getPartitionId() == specialPartitionId).findFirst().get();
    specialReplicaInfo.getLocalStore().setCurrentState(ReplicaState.BOOTSTRAP);
    replicaSyncUpService.initiateBootstrap(specialReplicaInfo.getLocalReplicaId());
    response = replicaThread2.exchangeMetadata(new MockConnectionPool.MockConnection(remoteHost2, batchSize), replicasToReplicate2.get(remoteHost2.dataNodeId));
    replicaThread2.fixMissingStoreKeys(new MockConnectionPool.MockConnection(remoteHost2, batchSize), replicasToReplicate2.get(remoteHost2.dataNodeId), response, false);
    // verify replica of special partition has completed bootstrap and becomes standby
    assertEquals("Store state is not expected", ReplicaState.STANDBY, specialReplicaInfo.getLocalStore().getCurrentState());
}
Also used: ValidatingTransformer (com.github.ambry.messageformat.ValidatingTransformer), Transformer (com.github.ambry.store.Transformer), StoreKeyFactory (com.github.ambry.store.StoreKeyFactory), List (java.util.List), ArrayList (java.util.ArrayList), MockStoreKeyConverterFactory (com.github.ambry.store.MockStoreKeyConverterFactory), MockPartitionId (com.github.ambry.clustermap.MockPartitionId), PartitionId (com.github.ambry.clustermap.PartitionId), ClusterMapConfig (com.github.ambry.config.ClusterMapConfig), MockReplicaId (com.github.ambry.clustermap.MockReplicaId), ReplicaId (com.github.ambry.clustermap.ReplicaId), BlobIdFactory (com.github.ambry.commons.BlobIdFactory), MessageInfo (com.github.ambry.store.MessageInfo), Map (java.util.Map), HashMap (java.util.HashMap), ClusterMap (com.github.ambry.clustermap.ClusterMap), MockClusterMap (com.github.ambry.clustermap.MockClusterMap), DataNodeId (com.github.ambry.clustermap.DataNodeId), MockDataNodeId (com.github.ambry.clustermap.MockDataNodeId), AmbryReplicaSyncUpManager (com.github.ambry.clustermap.AmbryReplicaSyncUpManager), Test (org.junit.Test)
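
The flow above can be condensed into a short sketch of how AmbryReplicaSyncUpManager participates in bootstrap: the local replicas are registered with the sync-up manager before replication starts, the ReplicaThread then pulls data in batchSize-sized cycles, and once a replica has caught up with enough peers its store is expected to move from BOOTSTRAP to STANDBY. This is only an illustrative condensation of the test above, not additional API; variables such as verifiableProperties, clusterMap, localHost, remoteHost1, replicaThread1, replicasToReplicate1 and batchSize are assumed to be set up exactly as in that test.

// Minimal sketch, assuming the setup from replicationLagMetricAndSyncUpTest above.
AmbryReplicaSyncUpManager replicaSyncUpService = new AmbryReplicaSyncUpManager(new ClusterMapConfig(verifiableProperties));
// 1. Mark each local store as bootstrapping and register its replica for sync-up tracking.
replicasToReplicate1.get(remoteHost1.dataNodeId).forEach(info -> info.getLocalStore().setCurrentState(ReplicaState.BOOTSTRAP));
clusterMap.getReplicaIds(localHost.dataNodeId).forEach(replicaSyncUpService::initiateBootstrap);
// 2. Run one replication cycle; each cycle fetches at most batchSize messages per partition.
List<ReplicaThread.ExchangeMetadataResponse> response = replicaThread1.exchangeMetadata(new MockConnectionPool.MockConnection(remoteHost1, batchSize), replicasToReplicate1.get(remoteHost1.dataNodeId));
replicaThread1.fixMissingStoreKeys(new MockConnectionPool.MockConnection(remoteHost1, batchSize), replicasToReplicate1.get(remoteHost1.dataNodeId), response, false);
// 3. After enough cycles, ReplicationMetrics#getMaxLagForPartition reports 0 for each partition and the
//    replication code is expected to flip a caught-up store from BOOTSTRAP to STANDBY.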

Example 2 with AmbryReplicaSyncUpManager

Use of com.github.ambry.clustermap.AmbryReplicaSyncUpManager in project ambry by linkedin.

The class ReplicationTest, method replicaResumeDecommissionTest.

/**
 * Test that resuming decommission on a certain replica behaves correctly.
 * @throws Exception
 */
@Test
public void replicaResumeDecommissionTest() throws Exception {
    MockClusterMap clusterMap = new MockClusterMap();
    ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
    MockHelixParticipant.metricRegistry = new MetricRegistry();
    MockHelixParticipant mockHelixParticipant = Mockito.spy(new MockHelixParticipant(clusterMapConfig));
    doNothing().when(mockHelixParticipant).setPartitionDisabledState(anyString(), anyBoolean());
    // choose a replica on local node and put decommission file into its dir
    ReplicaId localReplica = clusterMap.getReplicaIds(clusterMap.getDataNodeIds().get(0)).get(0);
    String partitionName = localReplica.getPartitionId().toPathString();
    File decommissionFile = new File(localReplica.getReplicaPath(), "decommission_in_progress");
    assertTrue("Can't create decommission file", decommissionFile.createNewFile());
    Pair<StorageManager, ReplicationManager> managers = createStorageManagerAndReplicationManager(clusterMap, clusterMapConfig, mockHelixParticipant);
    StorageManager storageManager = managers.getFirst();
    // failure case 1: store is not started when resuming decommission
    storageManager.shutdownBlobStore(localReplica.getPartitionId());
    try {
        mockHelixParticipant.onPartitionBecomeDroppedFromOffline(partitionName);
        fail("should fail");
    } catch (StateTransitionException e) {
        assertEquals("Mismatch in error code", ReplicaOperationFailure, e.getErrorCode());
    }
    storageManager.startBlobStore(localReplica.getPartitionId());
    // failure case 2: fail to remove replica from InstanceConfig in Helix
    AmbryReplicaSyncUpManager replicaSyncUpManager = (AmbryReplicaSyncUpManager) mockHelixParticipant.getReplicaSyncUpManager();
    mockHelixParticipant.updateNodeInfoReturnVal = false;
    CountDownLatch executionLatch = new CountDownLatch(1);
    AtomicBoolean exceptionOccurred = new AtomicBoolean(false);
    Utils.newThread(() -> {
        try {
            mockHelixParticipant.onPartitionBecomeDroppedFromOffline(partitionName);
            fail("should fail because updating node info returns false");
        } catch (StateTransitionException e) {
            exceptionOccurred.getAndSet(true);
            assertEquals("Mismatch in error code", ReplicaOperationFailure, e.getErrorCode());
        } finally {
            executionLatch.countDown();
        }
    }, false).start();
    while (!replicaSyncUpManager.getPartitionToDeactivationLatch().containsKey(partitionName)) {
        Thread.sleep(100);
    }
    replicaSyncUpManager.onDeactivationComplete(localReplica);
    while (!replicaSyncUpManager.getPartitionToDisconnectionLatch().containsKey(partitionName)) {
        Thread.sleep(100);
    }
    replicaSyncUpManager.onDisconnectionComplete(localReplica);
    assertTrue("Offline-To-Dropped transition didn't complete within 1 sec", executionLatch.await(1, TimeUnit.SECONDS));
    assertTrue("State transition exception should be thrown", exceptionOccurred.get());
    mockHelixParticipant.updateNodeInfoReturnVal = null;
    storageManager.startBlobStore(localReplica.getPartitionId());
    // success case
    mockHelixParticipant.mockStatsManagerListener = Mockito.mock(PartitionStateChangeListener.class);
    doNothing().when(mockHelixParticipant.mockStatsManagerListener).onPartitionBecomeDroppedFromOffline(anyString());
    mockHelixParticipant.registerPartitionStateChangeListener(StateModelListenerType.StatsManagerListener, mockHelixParticipant.mockStatsManagerListener);
    CountDownLatch participantLatch = new CountDownLatch(1);
    Utils.newThread(() -> {
        mockHelixParticipant.onPartitionBecomeDroppedFromOffline(partitionName);
        participantLatch.countDown();
    }, false).start();
    while (!replicaSyncUpManager.getPartitionToDeactivationLatch().containsKey(partitionName)) {
        Thread.sleep(100);
    }
    replicaSyncUpManager.onDeactivationComplete(localReplica);
    while (!replicaSyncUpManager.getPartitionToDisconnectionLatch().containsKey(partitionName)) {
        Thread.sleep(100);
    }
    replicaSyncUpManager.onDisconnectionComplete(localReplica);
    assertTrue("Offline-To-Dropped transition didn't complete within 1 sec", participantLatch.await(1, TimeUnit.SECONDS));
    // verify stats manager listener is called
    verify(mockHelixParticipant.mockStatsManagerListener).onPartitionBecomeDroppedFromOffline(anyString());
    // verify setPartitionDisabledState method is called
    verify(mockHelixParticipant).setPartitionDisabledState(partitionName, false);
    File storeDir = new File(localReplica.getReplicaPath());
    assertFalse("Store dir should not exist", storeDir.exists());
    storageManager.shutdown();
}
Also used: MetricRegistry (com.codahale.metrics.MetricRegistry), StorageManager (com.github.ambry.store.StorageManager), PartitionStateChangeListener (com.github.ambry.clustermap.PartitionStateChangeListener), CountDownLatch (java.util.concurrent.CountDownLatch), ClusterMapConfig (com.github.ambry.config.ClusterMapConfig), MockReplicaId (com.github.ambry.clustermap.MockReplicaId), ReplicaId (com.github.ambry.clustermap.ReplicaId), AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean), MockHelixParticipant (com.github.ambry.clustermap.MockHelixParticipant), File (java.io.File), AmbryReplicaSyncUpManager (com.github.ambry.clustermap.AmbryReplicaSyncUpManager), MockClusterMap (com.github.ambry.clustermap.MockClusterMap), StateTransitionException (com.github.ambry.clustermap.StateTransitionException), Test (org.junit.Test)
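
Stripped of the two failure cases, the success path of Example 2 reduces to a simple handshake: the Offline-To-Dropped transition is started on a background thread, and it blocks on per-partition latches inside AmbryReplicaSyncUpManager until deactivation and disconnection are acknowledged. The sketch below is just that condensation, reusing mockHelixParticipant, replicaSyncUpManager, localReplica and partitionName exactly as defined in the test; it introduces no API beyond what the test already calls.

// Minimal sketch of the decommission handshake, assuming the setup from replicaResumeDecommissionTest above.
CountDownLatch participantLatch = new CountDownLatch(1);
// Drive the Offline-To-Dropped transition on its own thread; it will wait on the sync-up latches.
Utils.newThread(() -> {
    mockHelixParticipant.onPartitionBecomeDroppedFromOffline(partitionName);
    participantLatch.countDown();
}, false).start();
// Wait until the transition has registered a deactivation latch for this partition, then acknowledge it.
while (!replicaSyncUpManager.getPartitionToDeactivationLatch().containsKey(partitionName)) {
    Thread.sleep(100);
}
replicaSyncUpManager.onDeactivationComplete(localReplica);
// Repeat the same handshake for the disconnection phase.
while (!replicaSyncUpManager.getPartitionToDisconnectionLatch().containsKey(partitionName)) {
    Thread.sleep(100);
}
replicaSyncUpManager.onDisconnectionComplete(localReplica);
// With both phases acknowledged, the transition should finish promptly and the replica can be dropped.
assertTrue("Offline-To-Dropped transition didn't complete within 1 sec", participantLatch.await(1, TimeUnit.SECONDS));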

Aggregations

AmbryReplicaSyncUpManager (com.github.ambry.clustermap.AmbryReplicaSyncUpManager): 2
MockClusterMap (com.github.ambry.clustermap.MockClusterMap): 2
MockReplicaId (com.github.ambry.clustermap.MockReplicaId): 2
ReplicaId (com.github.ambry.clustermap.ReplicaId): 2
ClusterMapConfig (com.github.ambry.config.ClusterMapConfig): 2
Test (org.junit.Test): 2
MetricRegistry (com.codahale.metrics.MetricRegistry): 1
ClusterMap (com.github.ambry.clustermap.ClusterMap): 1
DataNodeId (com.github.ambry.clustermap.DataNodeId): 1
MockDataNodeId (com.github.ambry.clustermap.MockDataNodeId): 1
MockHelixParticipant (com.github.ambry.clustermap.MockHelixParticipant): 1
MockPartitionId (com.github.ambry.clustermap.MockPartitionId): 1
PartitionId (com.github.ambry.clustermap.PartitionId): 1
PartitionStateChangeListener (com.github.ambry.clustermap.PartitionStateChangeListener): 1
StateTransitionException (com.github.ambry.clustermap.StateTransitionException): 1
BlobIdFactory (com.github.ambry.commons.BlobIdFactory): 1
ValidatingTransformer (com.github.ambry.messageformat.ValidatingTransformer): 1
MessageInfo (com.github.ambry.store.MessageInfo): 1
MockStoreKeyConverterFactory (com.github.ambry.store.MockStoreKeyConverterFactory): 1
StorageManager (com.github.ambry.store.StorageManager): 1