use of com.github.ambry.clustermap.ReplicaId in project ambry by linkedin.
the class ReplicationMetrics method generateRemoteReplicaMetricPrefix.
/**
 * Builds the metric name prefix for a remote replica in the form
 * {@code <hostname>-<port>-<partition>}.
 *
 * @param remoteReplicaInfo the remote replica whose data node and partition make up the prefix.
 * @return the hostname and port of the replica's data node followed by the string form of the replica's
 *         partition id, joined with {@code "-"}.
 */
private String generateRemoteReplicaMetricPrefix(RemoteReplicaInfo remoteReplicaInfo) {
  ReplicaId remoteReplica = remoteReplicaInfo.getReplicaId();
  DataNodeId remoteNode = remoteReplica.getDataNodeId();
  StringBuilder prefix = new StringBuilder();
  // host and port identify the data node; the partition id identifies the replica on that node
  prefix.append(remoteNode.getHostname());
  prefix.append("-");
  prefix.append(remoteNode.getPort());
  prefix.append("-");
  prefix.append(remoteReplica.getPartitionId().toString());
  return prefix.toString();
}
use of com.github.ambry.clustermap.ReplicaId in project ambry by linkedin.
the class CloudToStoreReplicationManagerTest method cloudReplicaAdditionTest.
/**
 * Test both success and failure cases when adding cloud replica.
 * <p>
 * Three cases are exercised through the Standby-To-Leader transition of {@code NEW_PARTITION_NAME}:
 * <ol>
 *   <li>the partition has no store in the storage manager: no cloud replica thread is expected;</li>
 *   <li>the store exists but has been shut down: no cloud replica thread is expected;</li>
 *   <li>the store exists and is started: a cloud replica thread is expected.</li>
 * </ol>
 * Both managers are shut down in {@code finally} blocks so that a failing assertion does not leave them running.
 * @throws Exception
 */
@Test
public void cloudReplicaAdditionTest() throws Exception {
  StorageManager storageManager =
      new StorageManager(storeConfig, new DiskManagerConfig(verifiableProperties), Utils.newScheduler(1, true),
          clusterMap.getMetricRegistry(), null, clusterMap, currentNode, null,
          Collections.singletonList(mockHelixParticipant), new MockTime(), null,
          new InMemAccountService(false, false));
  CloudToStoreReplicationManager cloudToStoreReplicationManager =
      new CloudToStoreReplicationManager(replicationConfig, clusterMapConfig, storeConfig, storageManager,
          storeKeyFactory, clusterMap, mockScheduler, currentNode, null, clusterMap.getMetricRegistry(), null,
          storeKeyConverterFactory, serverConfig.serverMessageTransformer, mockClusterSpectator,
          mockHelixParticipant);
  storageManager.start();
  try {
    cloudToStoreReplicationManager.start();
    try {
      mockClusterSpectator.spectate();
      // 1. test adding cloud replica that is not present locally
      mockHelixParticipant.onPartitionBecomeLeaderFromStandby(NEW_PARTITION_NAME);
      assertNull("Cloud replica thread should not be created",
          TestUtils.getThreadByThisName(REPLICA_THREAD_PREFIX));
      // create a new partition and add corresponding store in storage manager
      PartitionId newPartition =
          new MockPartitionId(Long.parseLong(NEW_PARTITION_NAME), MockClusterMap.DEFAULT_PARTITION_CLASS,
              clusterMap.getDataNodes(), 0);
      ReplicaId replicaToAdd = newPartition.getReplicaIds().get(0);
      assertTrue("Adding new store should succeed", storageManager.addBlobStore(replicaToAdd));
      // 2. we deliberately shut down the store to induce failure when adding cloud replica
      storageManager.shutdownBlobStore(newPartition);
      mockHelixParticipant.onPartitionBecomeLeaderFromStandby(NEW_PARTITION_NAME);
      assertNull("Cloud replica thread should not be created",
          TestUtils.getThreadByThisName(REPLICA_THREAD_PREFIX));
      storageManager.startBlobStore(newPartition);
      // 3. mock success case
      mockHelixParticipant.onPartitionBecomeLeaderFromStandby(NEW_PARTITION_NAME);
      assertNotNull("Cloud replica thread should be created for DC1",
          TestUtils.getThreadByThisName(REPLICA_THREAD_PREFIX));
    } finally {
      // shut down even if an assertion above failed; same order as the success path (replication first)
      cloudToStoreReplicationManager.shutdown();
    }
  } finally {
    storageManager.shutdown();
  }
}
use of com.github.ambry.clustermap.ReplicaId in project ambry by linkedin.
the class ReplicationTest method replicaFromStandbyToInactiveTest.
/**
 * Test STANDBY -> INACTIVE transition on existing replica (both success and failure cases).
 * <p>
 * Failure case: the transition is triggered while the local store is shut down and is expected to fail with
 * {@code StoreNotStarted}. Success case: after the store is restarted and one blob is written, the transition is
 * triggered on a separate thread and is expected to finish only after two peer replicas have been reported as
 * having read up to the end offset of that blob.
 * <p>
 * The storage manager is shut down in a {@code finally} block so that a failing assertion does not leave it running.
 * @throws Exception
 */
@Test
public void replicaFromStandbyToInactiveTest() throws Exception {
  MockClusterMap clusterMap = new MockClusterMap();
  ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
  MockHelixParticipant.metricRegistry = new MetricRegistry();
  MockHelixParticipant mockHelixParticipant = new MockHelixParticipant(clusterMapConfig);
  Pair<StorageManager, ReplicationManager> managers =
      createStorageManagerAndReplicationManager(clusterMap, clusterMapConfig, mockHelixParticipant);
  StorageManager storageManager = managers.getFirst();
  try {
    MockReplicationManager replicationManager = (MockReplicationManager) managers.getSecond();
    // get an existing partition to test both success and failure cases
    PartitionId existingPartition = replicationManager.partitionToPartitionInfo.keySet().iterator().next();
    storageManager.shutdownBlobStore(existingPartition);
    try {
      mockHelixParticipant.onPartitionBecomeInactiveFromStandby(existingPartition.toPathString());
      fail("should fail because store is not started");
    } catch (StateTransitionException e) {
      assertEquals("Error code doesn't match", StoreNotStarted, e.getErrorCode());
    }
    // restart the store and trigger Standby-To-Inactive transition again
    storageManager.startBlobStore(existingPartition);
    // write a blob with size = 100 into local store (end offset of last PUT = 100 + 18 = 118)
    Store localStore = storageManager.getStore(existingPartition);
    MockId id = new MockId(TestUtils.getRandomString(10), Utils.getRandomShort(TestUtils.RANDOM),
        Utils.getRandomShort(TestUtils.RANDOM));
    long crc = (new Random()).nextLong();
    long blobSize = 100;
    MessageInfo info =
        new MessageInfo(id, blobSize, false, false, Utils.Infinite_Time, crc, id.getAccountId(),
            id.getContainerId(), Utils.Infinite_Time);
    List<MessageInfo> infos = new ArrayList<>();
    List<ByteBuffer> buffers = new ArrayList<>();
    ByteBuffer buffer = ByteBuffer.wrap(TestUtils.getRandomBytes((int) blobSize));
    infos.add(info);
    buffers.add(buffer);
    localStore.put(new MockMessageWriteSet(infos, buffers));
    ReplicaId localReplica = storageManager.getReplica(existingPartition.toPathString());
    // override partition state change listener in ReplicationManager to help thread manipulation
    mockHelixParticipant.registerPartitionStateChangeListener(StateModelListenerType.ReplicationManagerListener,
        replicationManager.replicationListener);
    CountDownLatch participantLatch = new CountDownLatch(1);
    replicationManager.listenerExecutionLatch = new CountDownLatch(1);
    // create a new thread and trigger STANDBY -> INACTIVE transition
    Utils.newThread(() -> {
      mockHelixParticipant.onPartitionBecomeInactiveFromStandby(existingPartition.toPathString());
      participantLatch.countDown();
    }, false).start();
    assertTrue("Partition state change listener didn't get called within 1 sec",
        replicationManager.listenerExecutionLatch.await(1, TimeUnit.SECONDS));
    assertEquals("Local store state should be INACTIVE", ReplicaState.INACTIVE,
        storageManager.getStore(existingPartition).getCurrentState());
    List<RemoteReplicaInfo> remoteReplicaInfos =
        replicationManager.partitionToPartitionInfo.get(existingPartition).getRemoteReplicaInfos();
    // report a non-zero lag (10 bytes) for the first peer replica: sync-up must not be complete yet
    ReplicaId peerReplica1 = remoteReplicaInfos.get(0).getReplicaId();
    assertFalse("Sync up should not complete because not enough replicas have caught up",
        mockHelixParticipant.getReplicaSyncUpManager()
            .updateReplicaLagAndCheckSyncStatus(localReplica, peerReplica1, 10L, ReplicaState.INACTIVE));
    // pick another remote replica to update the replication lag later on
    ReplicaId peerReplica2 = remoteReplicaInfos.get(1).getReplicaId();
    // make first peer replica catch up with last PUT in local store
    replicationManager.updateTotalBytesReadByRemoteReplica(existingPartition,
        peerReplica1.getDataNodeId().getHostname(), peerReplica1.getReplicaPath(), 118);
    assertFalse("Sync up shouldn't complete because only one replica has caught up with local replica",
        mockHelixParticipant.getReplicaSyncUpManager().isSyncUpComplete(localReplica));
    // make second peer replica catch up with last PUT in local store
    replicationManager.updateTotalBytesReadByRemoteReplica(existingPartition,
        peerReplica2.getDataNodeId().getHostname(), peerReplica2.getReplicaPath(), 118);
    assertTrue("Standby-To-Inactive transition didn't complete within 1 sec",
        participantLatch.await(1, TimeUnit.SECONDS));
    // we purposely update lag against local replica to verify local replica is no longer in ReplicaSyncUpManager
    // because deactivation is complete and local replica should be removed from "replicaToLagInfos" map.
    assertFalse("Sync up should complete (2 replicas have caught up), hence updated should be false",
        mockHelixParticipant.getReplicaSyncUpManager()
            .updateReplicaLagAndCheckSyncStatus(localReplica, peerReplica2, 0L, ReplicaState.INACTIVE));
  } finally {
    // always shut the storage manager down, even when an assertion above fails
    storageManager.shutdown();
  }
}
use of com.github.ambry.clustermap.ReplicaId in project ambry by linkedin.
the class ReplicationTest method replicaFromBootstrapToStandbyTest.
/**
 * Test BOOTSTRAP -> STANDBY transition on both existing and new replicas. For new replica, we test both failure and
 * success cases.
 * <p>
 * The storage manager is shut down in a {@code finally} block so that a failing assertion does not leave it running.
 * @throws Exception
 */
@Test
public void replicaFromBootstrapToStandbyTest() throws Exception {
  MockClusterMap clusterMap = new MockClusterMap();
  ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
  MockHelixParticipant.metricRegistry = new MetricRegistry();
  MockHelixParticipant mockHelixParticipant = new MockHelixParticipant(clusterMapConfig);
  Pair<StorageManager, ReplicationManager> managers =
      createStorageManagerAndReplicationManager(clusterMap, clusterMapConfig, mockHelixParticipant);
  StorageManager storageManager = managers.getFirst();
  try {
    MockReplicationManager replicationManager = (MockReplicationManager) managers.getSecond();
    // 1. test existing partition through Bootstrap-To-Standby transition, should be no op.
    PartitionId existingPartition = replicationManager.partitionToPartitionInfo.keySet().iterator().next();
    mockHelixParticipant.onPartitionBecomeStandbyFromBootstrap(existingPartition.toPathString());
    assertEquals("Store state doesn't match", ReplicaState.STANDBY,
        storageManager.getStore(existingPartition).getCurrentState());
    // 2. test transition failure due to store not started
    storageManager.shutdownBlobStore(existingPartition);
    try {
      mockHelixParticipant.onPartitionBecomeStandbyFromBootstrap(existingPartition.toPathString());
      fail("should fail because store is not started");
    } catch (StateTransitionException e) {
      assertEquals("Error code doesn't match", StoreNotStarted, e.getErrorCode());
    }
    // 3. create new replica and add it into storage manager, test replica that needs to initiate bootstrap
    ReplicaId newReplicaToAdd = getNewReplicaToAdd(clusterMap);
    assertTrue("Adding new replica to Storage Manager should succeed",
        storageManager.addBlobStore(newReplicaToAdd));
    // override partition state change listener in ReplicationManager to help thread manipulation
    mockHelixParticipant.registerPartitionStateChangeListener(StateModelListenerType.ReplicationManagerListener,
        replicationManager.replicationListener);
    CountDownLatch participantLatch = new CountDownLatch(1);
    replicationManager.listenerExecutionLatch = new CountDownLatch(1);
    // create a new thread and trigger BOOTSTRAP -> STANDBY transition
    Utils.newThread(() -> {
      mockHelixParticipant.onPartitionBecomeStandbyFromBootstrap(newReplicaToAdd.getPartitionId().toPathString());
      participantLatch.countDown();
    }, false).start();
    assertTrue("Partition state change listener in ReplicationManager didn't get called within 1 sec",
        replicationManager.listenerExecutionLatch.await(1, TimeUnit.SECONDS));
    assertEquals("Replica should be in BOOTSTRAP state before bootstrap is complete", ReplicaState.BOOTSTRAP,
        storageManager.getStore(newReplicaToAdd.getPartitionId()).getCurrentState());
    // make bootstrap succeed
    mockHelixParticipant.getReplicaSyncUpManager().onBootstrapComplete(newReplicaToAdd);
    assertTrue("Bootstrap-To-Standby transition didn't complete within 1 sec",
        participantLatch.await(1, TimeUnit.SECONDS));
  } finally {
    // always shut the storage manager down, even when an assertion above fails
    storageManager.shutdown();
  }
}
use of com.github.ambry.clustermap.ReplicaId in project ambry by linkedin.
the class ReplicationTest method replicationLagMetricAndSyncUpTest.
/**
 * Tests {@link ReplicationMetrics#getMaxLagForPartition(PartitionId)} and the sync-up of a bootstrapping replica.
 * <p>
 * The local host replicates from {@code remoteHost1} in two rounds ({@code batchSize + 1} messages per partition)
 * and the lag metric is checked after each round. It then replicates from {@code remoteHost2} in one round
 * ({@code batchSize - 1} messages), after which the local store of the special partition is expected to be
 * in STANDBY state.
 * @throws Exception
 */
@Test
public void replicationLagMetricAndSyncUpTest() throws Exception {
  MockClusterMap clusterMap = new MockClusterMap();
  ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
  AmbryReplicaSyncUpManager replicaSyncUpService = new AmbryReplicaSyncUpManager(clusterMapConfig);
  Pair<MockHost, MockHost> localAndRemoteHosts = getLocalAndRemoteHosts(clusterMap);
  MockHost localHost = localAndRemoteHosts.getFirst();
  MockHost remoteHost1 = localAndRemoteHosts.getSecond();
  // create another remoteHost2 that shares special partition with localHost and remoteHost1
  PartitionId specialPartitionId = clusterMap.getWritablePartitionIds(MockClusterMap.SPECIAL_PARTITION_CLASS).get(0);
  MockHost remoteHost2 = new MockHost(specialPartitionId.getReplicaIds().get(2).getDataNodeId(), clusterMap);
  MockStoreKeyConverterFactory storeKeyConverterFactory = new MockStoreKeyConverterFactory(null, null);
  storeKeyConverterFactory.setConversionMap(new HashMap<>());
  storeKeyConverterFactory.setReturnInputIfAbsent(true);
  MockStoreKeyConverterFactory.MockStoreKeyConverter storeKeyConverter =
      storeKeyConverterFactory.getStoreKeyConverter();
  int batchSize = 4;
  List<PartitionId> partitionIds = clusterMap.getWritablePartitionIds(null);
  for (PartitionId partitionId : partitionIds) {
    // add batchSize + 1 messages to the remoteHost1 so that two rounds of replication is needed.
    addPutMessagesToReplicasOfPartition(partitionId, Collections.singletonList(remoteHost1), batchSize + 1);
  }
  // add batchSize - 1 messages to the remoteHost2 so that localHost can catch up during one cycle of replication
  for (ReplicaId replicaId : clusterMap.getReplicaIds(remoteHost2.dataNodeId)) {
    addPutMessagesToReplicasOfPartition(replicaId.getPartitionId(), Collections.singletonList(remoteHost2),
        batchSize - 1);
  }
  StoreKeyFactory storeKeyFactory = new BlobIdFactory(clusterMap);
  Transformer transformer = new BlobIdTransformer(storeKeyFactory, storeKeyConverter);
  Pair<Map<DataNodeId, List<RemoteReplicaInfo>>, ReplicaThread> replicasAndThread1 =
      getRemoteReplicasAndReplicaThread(batchSize, clusterMap, localHost, remoteHost1, storeKeyConverter,
          transformer, null, replicaSyncUpService);
  Map<DataNodeId, List<RemoteReplicaInfo>> replicasToReplicate1 = replicasAndThread1.getFirst();
  ReplicaThread replicaThread1 = replicasAndThread1.getSecond();
  // mock Bootstrap-To-Standby transition in ReplicationManager: 1. update store current state; 2. initiate bootstrap
  replicasToReplicate1.get(remoteHost1.dataNodeId)
      .forEach(info -> info.getLocalStore().setCurrentState(ReplicaState.BOOTSTRAP));
  clusterMap.getReplicaIds(localHost.dataNodeId).forEach(replicaSyncUpService::initiateBootstrap);
  // first replication round with remoteHost1
  List<ReplicaThread.ExchangeMetadataResponse> response =
      replicaThread1.exchangeMetadata(new MockConnectionPool.MockConnection(remoteHost1, batchSize),
          replicasToReplicate1.get(remoteHost1.dataNodeId));
  replicaThread1.fixMissingStoreKeys(new MockConnectionPool.MockConnection(remoteHost1, batchSize),
      replicasToReplicate1.get(remoteHost1.dataNodeId), response, false);
  for (PartitionId partitionId : partitionIds) {
    // expected lag is the total size of the messages on remoteHost1 beyond the first batchSize entries
    List<MessageInfo> allMessageInfos = remoteHost1.infosByPartition.get(partitionId);
    long expectedLag =
        allMessageInfos.subList(batchSize, allMessageInfos.size()).stream().mapToLong(MessageInfo::getSize).sum();
    assertEquals("Replication lag doesn't match expected value", expectedLag,
        replicaThread1.getReplicationMetrics().getMaxLagForPartition(partitionId));
  }
  // second replication round with remoteHost1
  response = replicaThread1.exchangeMetadata(new MockConnectionPool.MockConnection(remoteHost1, batchSize),
      replicasToReplicate1.get(remoteHost1.dataNodeId));
  replicaThread1.fixMissingStoreKeys(new MockConnectionPool.MockConnection(remoteHost1, batchSize),
      replicasToReplicate1.get(remoteHost1.dataNodeId), response, false);
  for (PartitionId partitionId : partitionIds) {
    assertEquals("Replication lag should equal to 0", 0,
        replicaThread1.getReplicationMetrics().getMaxLagForPartition(partitionId));
  }
  // replicate with remoteHost2 to ensure special replica has caught up with enough peers
  Pair<Map<DataNodeId, List<RemoteReplicaInfo>>, ReplicaThread> replicasAndThread2 =
      getRemoteReplicasAndReplicaThread(batchSize, clusterMap, localHost, remoteHost2, storeKeyConverter,
          transformer, null, replicaSyncUpService);
  Map<DataNodeId, List<RemoteReplicaInfo>> replicasToReplicate2 = replicasAndThread2.getFirst();
  ReplicaThread replicaThread2 = replicasAndThread2.getSecond();
  // initiate bootstrap on replica of special partition; compare partitions with equals() rather than by
  // reference, and fail with a descriptive error if remoteHost2 has no replica of the special partition
  RemoteReplicaInfo specialReplicaInfo = replicasToReplicate2.get(remoteHost2.dataNodeId)
      .stream()
      .filter(info -> specialPartitionId.equals(info.getReplicaId().getPartitionId()))
      .findFirst()
      .orElseThrow(() -> new AssertionError("No remote replica of the special partition found on remoteHost2"));
  specialReplicaInfo.getLocalStore().setCurrentState(ReplicaState.BOOTSTRAP);
  replicaSyncUpService.initiateBootstrap(specialReplicaInfo.getLocalReplicaId());
  response = replicaThread2.exchangeMetadata(new MockConnectionPool.MockConnection(remoteHost2, batchSize),
      replicasToReplicate2.get(remoteHost2.dataNodeId));
  replicaThread2.fixMissingStoreKeys(new MockConnectionPool.MockConnection(remoteHost2, batchSize),
      replicasToReplicate2.get(remoteHost2.dataNodeId), response, false);
  // verify replica of special partition has completed bootstrap and becomes standby
  assertEquals("Store state is not expected", ReplicaState.STANDBY,
      specialReplicaInfo.getLocalStore().getCurrentState());
}
Aggregations