Search in sources :

Example 41 with ReplicaId

use of com.github.ambry.clustermap.ReplicaId in project ambry by linkedin.

the class ReplicationMetrics method generateRemoteReplicaMetricPrefix.

private String generateRemoteReplicaMetricPrefix(RemoteReplicaInfo remoteReplicaInfo) {
    ReplicaId replicaId = remoteReplicaInfo.getReplicaId();
    DataNodeId dataNodeId = replicaId.getDataNodeId();
    return dataNodeId.getHostname() + "-" + dataNodeId.getPort() + "-" + replicaId.getPartitionId().toString();
}
Also used : DataNodeId(com.github.ambry.clustermap.DataNodeId) ReplicaId(com.github.ambry.clustermap.ReplicaId)

Example 42 with ReplicaId

use of com.github.ambry.clustermap.ReplicaId in project ambry by linkedin.

the class CloudToStoreReplicationManagerTest method cloudReplicaAdditionTest.

/**
 * Test both success and failure cases when adding cloud replica
 * @throws Exception
 */
@Test
public void cloudReplicaAdditionTest() throws Exception {
    StorageManager storageManager = new StorageManager(storeConfig, new DiskManagerConfig(verifiableProperties), Utils.newScheduler(1, true), clusterMap.getMetricRegistry(), null, clusterMap, currentNode, null, Collections.singletonList(mockHelixParticipant), new MockTime(), null, new InMemAccountService(false, false));
    CloudToStoreReplicationManager cloudToStoreReplicationManager = new CloudToStoreReplicationManager(replicationConfig, clusterMapConfig, storeConfig, storageManager, storeKeyFactory, clusterMap, mockScheduler, currentNode, null, clusterMap.getMetricRegistry(), null, storeKeyConverterFactory, serverConfig.serverMessageTransformer, mockClusterSpectator, mockHelixParticipant);
    storageManager.start();
    cloudToStoreReplicationManager.start();
    mockClusterSpectator.spectate();
    // 1. test adding cloud replica that is not present locally
    mockHelixParticipant.onPartitionBecomeLeaderFromStandby(NEW_PARTITION_NAME);
    assertNull("Cloud replica thread should not be created", TestUtils.getThreadByThisName(REPLICA_THREAD_PREFIX));
    // create a new partition and add corresponding store in storage manager
    PartitionId newPartition = new MockPartitionId(Long.parseLong(NEW_PARTITION_NAME), MockClusterMap.DEFAULT_PARTITION_CLASS, clusterMap.getDataNodes(), 0);
    ReplicaId replicaToAdd = newPartition.getReplicaIds().get(0);
    assertTrue("Adding new store should succeed", storageManager.addBlobStore(replicaToAdd));
    // 2. we deliberately shut down the store to induce failure when adding cloud replica
    storageManager.shutdownBlobStore(newPartition);
    mockHelixParticipant.onPartitionBecomeLeaderFromStandby(NEW_PARTITION_NAME);
    assertNull("Cloud replica thread should not be created", TestUtils.getThreadByThisName(REPLICA_THREAD_PREFIX));
    storageManager.startBlobStore(newPartition);
    // 3. mock success case
    mockHelixParticipant.onPartitionBecomeLeaderFromStandby(NEW_PARTITION_NAME);
    assertNotNull("Cloud replica thread should be created for DC1", TestUtils.getThreadByThisName(REPLICA_THREAD_PREFIX));
    cloudToStoreReplicationManager.shutdown();
    storageManager.shutdown();
}
Also used : DiskManagerConfig(com.github.ambry.config.DiskManagerConfig) InMemAccountService(com.github.ambry.account.InMemAccountService) MockPartitionId(com.github.ambry.clustermap.MockPartitionId) StorageManager(com.github.ambry.store.StorageManager) MockPartitionId(com.github.ambry.clustermap.MockPartitionId) PartitionId(com.github.ambry.clustermap.PartitionId) MockTime(com.github.ambry.utils.MockTime) ReplicaId(com.github.ambry.clustermap.ReplicaId) Test(org.junit.Test)

Example 43 with ReplicaId

use of com.github.ambry.clustermap.ReplicaId in project ambry by linkedin.

the class ReplicationTest method replicaFromStandbyToInactiveTest.

/**
 * Test STANDBY -> INACTIVE transition on existing replica (both success and failure cases)
 */
@Test
public void replicaFromStandbyToInactiveTest() throws Exception {
    MockClusterMap clusterMap = new MockClusterMap();
    ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
    MockHelixParticipant.metricRegistry = new MetricRegistry();
    MockHelixParticipant mockHelixParticipant = new MockHelixParticipant(clusterMapConfig);
    Pair<StorageManager, ReplicationManager> managers = createStorageManagerAndReplicationManager(clusterMap, clusterMapConfig, mockHelixParticipant);
    StorageManager storageManager = managers.getFirst();
    MockReplicationManager replicationManager = (MockReplicationManager) managers.getSecond();
    // get an existing partition to test both success and failure cases
    PartitionId existingPartition = replicationManager.partitionToPartitionInfo.keySet().iterator().next();
    storageManager.shutdownBlobStore(existingPartition);
    try {
        mockHelixParticipant.onPartitionBecomeInactiveFromStandby(existingPartition.toPathString());
        fail("should fail because store is not started");
    } catch (StateTransitionException e) {
        assertEquals("Error code doesn't match", StoreNotStarted, e.getErrorCode());
    }
    // restart the store and trigger Standby-To-Inactive transition again
    storageManager.startBlobStore(existingPartition);
    // write a blob with size = 100 into local store (end offset of last PUT = 100 + 18 = 118)
    Store localStore = storageManager.getStore(existingPartition);
    MockId id = new MockId(TestUtils.getRandomString(10), Utils.getRandomShort(TestUtils.RANDOM), Utils.getRandomShort(TestUtils.RANDOM));
    long crc = (new Random()).nextLong();
    long blobSize = 100;
    MessageInfo info = new MessageInfo(id, blobSize, false, false, Utils.Infinite_Time, crc, id.getAccountId(), id.getContainerId(), Utils.Infinite_Time);
    List<MessageInfo> infos = new ArrayList<>();
    List<ByteBuffer> buffers = new ArrayList<>();
    ByteBuffer buffer = ByteBuffer.wrap(TestUtils.getRandomBytes((int) blobSize));
    infos.add(info);
    buffers.add(buffer);
    localStore.put(new MockMessageWriteSet(infos, buffers));
    ReplicaId localReplica = storageManager.getReplica(existingPartition.toPathString());
    // override partition state change listener in ReplicationManager to help thread manipulation
    mockHelixParticipant.registerPartitionStateChangeListener(StateModelListenerType.ReplicationManagerListener, replicationManager.replicationListener);
    CountDownLatch participantLatch = new CountDownLatch(1);
    replicationManager.listenerExecutionLatch = new CountDownLatch(1);
    // create a new thread and trigger STANDBY -> INACTIVE transition
    Utils.newThread(() -> {
        mockHelixParticipant.onPartitionBecomeInactiveFromStandby(existingPartition.toPathString());
        participantLatch.countDown();
    }, false).start();
    assertTrue("Partition state change listener didn't get called within 1 sec", replicationManager.listenerExecutionLatch.await(1, TimeUnit.SECONDS));
    assertEquals("Local store state should be INACTIVE", ReplicaState.INACTIVE, storageManager.getStore(existingPartition).getCurrentState());
    List<RemoteReplicaInfo> remoteReplicaInfos = replicationManager.partitionToPartitionInfo.get(existingPartition).getRemoteReplicaInfos();
    ReplicaId peerReplica1 = remoteReplicaInfos.get(0).getReplicaId();
    assertFalse("Sync up should not complete because not enough replicas have caught up", mockHelixParticipant.getReplicaSyncUpManager().updateReplicaLagAndCheckSyncStatus(localReplica, peerReplica1, 10L, ReplicaState.INACTIVE));
    // pick another remote replica to update the replication lag
    ReplicaId peerReplica2 = remoteReplicaInfos.get(1).getReplicaId();
    replicationManager.updateTotalBytesReadByRemoteReplica(existingPartition, peerReplica1.getDataNodeId().getHostname(), peerReplica1.getReplicaPath(), 118);
    assertFalse("Sync up shouldn't complete because only one replica has caught up with local replica", mockHelixParticipant.getReplicaSyncUpManager().isSyncUpComplete(localReplica));
    // make second peer replica catch up with last PUT in local store
    replicationManager.updateTotalBytesReadByRemoteReplica(existingPartition, peerReplica2.getDataNodeId().getHostname(), peerReplica2.getReplicaPath(), 118);
    assertTrue("Standby-To-Inactive transition didn't complete within 1 sec", participantLatch.await(1, TimeUnit.SECONDS));
    // we purposely update lag against local replica to verify local replica is no longer in ReplicaSyncUpManager because
    // deactivation is complete and local replica should be removed from "replicaToLagInfos" map.
    assertFalse("Sync up should complete (2 replicas have caught up), hence updated should be false", mockHelixParticipant.getReplicaSyncUpManager().updateReplicaLagAndCheckSyncStatus(localReplica, peerReplica2, 0L, ReplicaState.INACTIVE));
    storageManager.shutdown();
}
Also used : MetricRegistry(com.codahale.metrics.MetricRegistry) StorageManager(com.github.ambry.store.StorageManager) ArrayList(java.util.ArrayList) Store(com.github.ambry.store.Store) MockId(com.github.ambry.store.MockId) MockPartitionId(com.github.ambry.clustermap.MockPartitionId) PartitionId(com.github.ambry.clustermap.PartitionId) CountDownLatch(java.util.concurrent.CountDownLatch) ByteBuffer(java.nio.ByteBuffer) ClusterMapConfig(com.github.ambry.config.ClusterMapConfig) MockReplicaId(com.github.ambry.clustermap.MockReplicaId) ReplicaId(com.github.ambry.clustermap.ReplicaId) MessageInfo(com.github.ambry.store.MessageInfo) MockMessageWriteSet(com.github.ambry.store.MockMessageWriteSet) MockHelixParticipant(com.github.ambry.clustermap.MockHelixParticipant) Random(java.util.Random) MockClusterMap(com.github.ambry.clustermap.MockClusterMap) StateTransitionException(com.github.ambry.clustermap.StateTransitionException) Test(org.junit.Test)

Example 44 with ReplicaId

use of com.github.ambry.clustermap.ReplicaId in project ambry by linkedin.

the class ReplicationTest method replicaFromBootstrapToStandbyTest.

/**
 * Test BOOTSTRAP -> STANDBY transition on both existing and new replicas. For new replica, we test both failure and
 * success cases.
 * @throws Exception
 */
@Test
public void replicaFromBootstrapToStandbyTest() throws Exception {
    MockClusterMap clusterMap = new MockClusterMap();
    ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
    MockHelixParticipant.metricRegistry = new MetricRegistry();
    MockHelixParticipant mockHelixParticipant = new MockHelixParticipant(clusterMapConfig);
    Pair<StorageManager, ReplicationManager> managers = createStorageManagerAndReplicationManager(clusterMap, clusterMapConfig, mockHelixParticipant);
    StorageManager storageManager = managers.getFirst();
    MockReplicationManager replicationManager = (MockReplicationManager) managers.getSecond();
    // 1. test existing partition trough Bootstrap-To-Standby transition, should be no op.
    PartitionId existingPartition = replicationManager.partitionToPartitionInfo.keySet().iterator().next();
    mockHelixParticipant.onPartitionBecomeStandbyFromBootstrap(existingPartition.toPathString());
    assertEquals("Store state doesn't match", ReplicaState.STANDBY, storageManager.getStore(existingPartition).getCurrentState());
    // 2. test transition failure due to store not started
    storageManager.shutdownBlobStore(existingPartition);
    try {
        mockHelixParticipant.onPartitionBecomeStandbyFromBootstrap(existingPartition.toPathString());
        fail("should fail because store is not started");
    } catch (StateTransitionException e) {
        assertEquals("Error code doesn't match", StoreNotStarted, e.getErrorCode());
    }
    // 3. create new replica and add it into storage manager, test replica that needs to initiate bootstrap
    ReplicaId newReplicaToAdd = getNewReplicaToAdd(clusterMap);
    assertTrue("Adding new replica to Storage Manager should succeed", storageManager.addBlobStore(newReplicaToAdd));
    // override partition state change listener in ReplicationManager to help thread manipulation
    mockHelixParticipant.registerPartitionStateChangeListener(StateModelListenerType.ReplicationManagerListener, replicationManager.replicationListener);
    CountDownLatch participantLatch = new CountDownLatch(1);
    replicationManager.listenerExecutionLatch = new CountDownLatch(1);
    // create a new thread and trigger BOOTSTRAP -> STANDBY transition
    Utils.newThread(() -> {
        mockHelixParticipant.onPartitionBecomeStandbyFromBootstrap(newReplicaToAdd.getPartitionId().toPathString());
        participantLatch.countDown();
    }, false).start();
    assertTrue("Partition state change listener in ReplicationManager didn't get called within 1 sec", replicationManager.listenerExecutionLatch.await(1, TimeUnit.SECONDS));
    assertEquals("Replica should be in BOOTSTRAP state before bootstrap is complete", ReplicaState.BOOTSTRAP, storageManager.getStore(newReplicaToAdd.getPartitionId()).getCurrentState());
    // make bootstrap succeed
    mockHelixParticipant.getReplicaSyncUpManager().onBootstrapComplete(newReplicaToAdd);
    assertTrue("Bootstrap-To-Standby transition didn't complete within 1 sec", participantLatch.await(1, TimeUnit.SECONDS));
    storageManager.shutdown();
}
Also used : MockHelixParticipant(com.github.ambry.clustermap.MockHelixParticipant) MetricRegistry(com.codahale.metrics.MetricRegistry) StorageManager(com.github.ambry.store.StorageManager) MockPartitionId(com.github.ambry.clustermap.MockPartitionId) PartitionId(com.github.ambry.clustermap.PartitionId) CountDownLatch(java.util.concurrent.CountDownLatch) ClusterMapConfig(com.github.ambry.config.ClusterMapConfig) MockReplicaId(com.github.ambry.clustermap.MockReplicaId) ReplicaId(com.github.ambry.clustermap.ReplicaId) MockClusterMap(com.github.ambry.clustermap.MockClusterMap) StateTransitionException(com.github.ambry.clustermap.StateTransitionException) Test(org.junit.Test)

Example 45 with ReplicaId

use of com.github.ambry.clustermap.ReplicaId in project ambry by linkedin.

the class ReplicationTest method replicationLagMetricAndSyncUpTest.

/**
 * Tests {@link ReplicationMetrics#getMaxLagForPartition(PartitionId)}
 * @throws Exception
 */
@Test
public void replicationLagMetricAndSyncUpTest() throws Exception {
    MockClusterMap clusterMap = new MockClusterMap();
    ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
    AmbryReplicaSyncUpManager replicaSyncUpService = new AmbryReplicaSyncUpManager(clusterMapConfig);
    Pair<MockHost, MockHost> localAndRemoteHosts = getLocalAndRemoteHosts(clusterMap);
    MockHost localHost = localAndRemoteHosts.getFirst();
    MockHost remoteHost1 = localAndRemoteHosts.getSecond();
    // create another remoteHost2 that shares spacial partition with localHost and remoteHost1
    PartitionId specialPartitionId = clusterMap.getWritablePartitionIds(MockClusterMap.SPECIAL_PARTITION_CLASS).get(0);
    MockHost remoteHost2 = new MockHost(specialPartitionId.getReplicaIds().get(2).getDataNodeId(), clusterMap);
    MockStoreKeyConverterFactory storeKeyConverterFactory = new MockStoreKeyConverterFactory(null, null);
    storeKeyConverterFactory.setConversionMap(new HashMap<>());
    storeKeyConverterFactory.setReturnInputIfAbsent(true);
    MockStoreKeyConverterFactory.MockStoreKeyConverter storeKeyConverter = storeKeyConverterFactory.getStoreKeyConverter();
    int batchSize = 4;
    List<PartitionId> partitionIds = clusterMap.getWritablePartitionIds(null);
    for (int i = 0; i < partitionIds.size(); i++) {
        PartitionId partitionId = partitionIds.get(i);
        // add batchSize + 1 messages to the remoteHost1 so that two rounds of replication is needed.
        addPutMessagesToReplicasOfPartition(partitionId, Collections.singletonList(remoteHost1), batchSize + 1);
    }
    // add batchSize - 1 messages to the remoteHost2 so that localHost can catch up during one cycle of replication
    for (ReplicaId replicaId : clusterMap.getReplicaIds(remoteHost2.dataNodeId)) {
        addPutMessagesToReplicasOfPartition(replicaId.getPartitionId(), Collections.singletonList(remoteHost2), batchSize - 1);
    }
    StoreKeyFactory storeKeyFactory = new BlobIdFactory(clusterMap);
    Transformer transformer = new BlobIdTransformer(storeKeyFactory, storeKeyConverter);
    Pair<Map<DataNodeId, List<RemoteReplicaInfo>>, ReplicaThread> replicasAndThread1 = getRemoteReplicasAndReplicaThread(batchSize, clusterMap, localHost, remoteHost1, storeKeyConverter, transformer, null, replicaSyncUpService);
    Map<DataNodeId, List<RemoteReplicaInfo>> replicasToReplicate1 = replicasAndThread1.getFirst();
    ReplicaThread replicaThread1 = replicasAndThread1.getSecond();
    // mock Bootstrap-To-Standby transition in ReplicationManager: 1. update store current state; 2. initiate bootstrap
    replicasToReplicate1.get(remoteHost1.dataNodeId).forEach(info -> info.getLocalStore().setCurrentState(ReplicaState.BOOTSTRAP));
    clusterMap.getReplicaIds(localHost.dataNodeId).forEach(replicaSyncUpService::initiateBootstrap);
    List<ReplicaThread.ExchangeMetadataResponse> response = replicaThread1.exchangeMetadata(new MockConnectionPool.MockConnection(remoteHost1, batchSize), replicasToReplicate1.get(remoteHost1.dataNodeId));
    replicaThread1.fixMissingStoreKeys(new MockConnectionPool.MockConnection(remoteHost1, batchSize), replicasToReplicate1.get(remoteHost1.dataNodeId), response, false);
    for (PartitionId partitionId : partitionIds) {
        List<MessageInfo> allMessageInfos = localAndRemoteHosts.getSecond().infosByPartition.get(partitionId);
        long expectedLag = allMessageInfos.subList(batchSize, allMessageInfos.size()).stream().mapToLong(MessageInfo::getSize).sum();
        assertEquals("Replication lag doesn't match expected value", expectedLag, replicaThread1.getReplicationMetrics().getMaxLagForPartition(partitionId));
    }
    response = replicaThread1.exchangeMetadata(new MockConnectionPool.MockConnection(remoteHost1, batchSize), replicasToReplicate1.get(remoteHost1.dataNodeId));
    replicaThread1.fixMissingStoreKeys(new MockConnectionPool.MockConnection(remoteHost1, batchSize), replicasToReplicate1.get(remoteHost1.dataNodeId), response, false);
    for (PartitionId partitionId : partitionIds) {
        assertEquals("Replication lag should equal to 0", 0, replicaThread1.getReplicationMetrics().getMaxLagForPartition(partitionId));
    }
    // replicate with remoteHost2 to ensure special replica has caught up with enough peers
    Pair<Map<DataNodeId, List<RemoteReplicaInfo>>, ReplicaThread> replicasAndThread2 = getRemoteReplicasAndReplicaThread(batchSize, clusterMap, localHost, remoteHost2, storeKeyConverter, transformer, null, replicaSyncUpService);
    Map<DataNodeId, List<RemoteReplicaInfo>> replicasToReplicate2 = replicasAndThread2.getFirst();
    ReplicaThread replicaThread2 = replicasAndThread2.getSecond();
    // initiate bootstrap on replica of special partition
    RemoteReplicaInfo specialReplicaInfo = replicasToReplicate2.get(remoteHost2.dataNodeId).stream().filter(info -> info.getReplicaId().getPartitionId() == specialPartitionId).findFirst().get();
    specialReplicaInfo.getLocalStore().setCurrentState(ReplicaState.BOOTSTRAP);
    replicaSyncUpService.initiateBootstrap(specialReplicaInfo.getLocalReplicaId());
    response = replicaThread2.exchangeMetadata(new MockConnectionPool.MockConnection(remoteHost2, batchSize), replicasToReplicate2.get(remoteHost2.dataNodeId));
    replicaThread2.fixMissingStoreKeys(new MockConnectionPool.MockConnection(remoteHost2, batchSize), replicasToReplicate2.get(remoteHost2.dataNodeId), response, false);
    // verify replica of special partition has completed bootstrap and becomes standby
    assertEquals("Store state is not expected", ReplicaState.STANDBY, specialReplicaInfo.getLocalStore().getCurrentState());
}
Also used : ValidatingTransformer(com.github.ambry.messageformat.ValidatingTransformer) Transformer(com.github.ambry.store.Transformer) StoreKeyFactory(com.github.ambry.store.StoreKeyFactory) List(java.util.List) ArrayList(java.util.ArrayList) MockStoreKeyConverterFactory(com.github.ambry.store.MockStoreKeyConverterFactory) MockPartitionId(com.github.ambry.clustermap.MockPartitionId) PartitionId(com.github.ambry.clustermap.PartitionId) ClusterMapConfig(com.github.ambry.config.ClusterMapConfig) MockReplicaId(com.github.ambry.clustermap.MockReplicaId) ReplicaId(com.github.ambry.clustermap.ReplicaId) BlobIdFactory(com.github.ambry.commons.BlobIdFactory) MessageInfo(com.github.ambry.store.MessageInfo) Map(java.util.Map) HashMap(java.util.HashMap) ClusterMap(com.github.ambry.clustermap.ClusterMap) MockClusterMap(com.github.ambry.clustermap.MockClusterMap) DataNodeId(com.github.ambry.clustermap.DataNodeId) MockDataNodeId(com.github.ambry.clustermap.MockDataNodeId) AmbryReplicaSyncUpManager(com.github.ambry.clustermap.AmbryReplicaSyncUpManager) MockClusterMap(com.github.ambry.clustermap.MockClusterMap) Test(org.junit.Test)

Aggregations

ReplicaId (com.github.ambry.clustermap.ReplicaId)147 Test (org.junit.Test)83 PartitionId (com.github.ambry.clustermap.PartitionId)68 MockPartitionId (com.github.ambry.clustermap.MockPartitionId)60 MockReplicaId (com.github.ambry.clustermap.MockReplicaId)57 ArrayList (java.util.ArrayList)55 MockDataNodeId (com.github.ambry.clustermap.MockDataNodeId)43 DataNodeId (com.github.ambry.clustermap.DataNodeId)32 MockClusterMap (com.github.ambry.clustermap.MockClusterMap)31 MetricRegistry (com.codahale.metrics.MetricRegistry)29 HashMap (java.util.HashMap)28 HashSet (java.util.HashSet)25 ClusterMapConfig (com.github.ambry.config.ClusterMapConfig)24 VerifiableProperties (com.github.ambry.config.VerifiableProperties)24 BlobStoreTest (com.github.ambry.store.BlobStoreTest)24 File (java.io.File)24 List (java.util.List)21 Map (java.util.Map)21 Port (com.github.ambry.network.Port)20 Properties (java.util.Properties)20