Use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin: class ReplicationTest, method addAndRemoveReplicaTest.
/**
 * Test dynamically adding and removing a replica in {@link ReplicationManager}.
 * @throws Exception
 */
@Test
public void addAndRemoveReplicaTest() throws Exception {
  MockClusterMap clusterMap = new MockClusterMap();
  ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
  StoreConfig storeConfig = new StoreConfig(verifiableProperties);
  DataNodeId dataNodeId = clusterMap.getDataNodeIds().get(0);
  MockStoreKeyConverterFactory storeKeyConverterFactory = new MockStoreKeyConverterFactory(null, null);
  storeKeyConverterFactory.setConversionMap(new HashMap<>());
  StorageManager storageManager =
      new StorageManager(storeConfig, new DiskManagerConfig(verifiableProperties), Utils.newScheduler(1, true),
          new MetricRegistry(), null, clusterMap, dataNodeId, null, null, new MockTime(), null,
          new InMemAccountService(false, false));
  storageManager.start();
  MockReplicationManager replicationManager =
      new MockReplicationManager(replicationConfig, clusterMapConfig, storeConfig, storageManager, clusterMap,
          dataNodeId, storeKeyConverterFactory, null);
  ReplicaId replicaToTest = clusterMap.getReplicaIds(dataNodeId).get(0);
  // Attempting to add a replica that already exists should fail
  assertFalse("Adding an existing replica should fail", replicationManager.addReplica(replicaToTest));
  // Create a brand-new replica that sits on one of the data node's disks and add it to the replication manager
  PartitionId newPartition = clusterMap.createNewPartition(clusterMap.getDataNodes());
  for (ReplicaId replicaId : newPartition.getReplicaIds()) {
    if (replicaId.getDataNodeId() == dataNodeId) {
      replicaToTest = replicaId;
      break;
    }
  }
  // Before adding the replica, partitionToPartitionInfo and mountPathToPartitionInfos should not contain the new partition
  assertFalse("partitionToPartitionInfo should not contain new partition",
      replicationManager.getPartitionToPartitionInfoMap().containsKey(newPartition));
  for (PartitionInfo partitionInfo : replicationManager.getMountPathToPartitionInfosMap().get(replicaToTest.getMountPath())) {
    assertNotSame("mountPathToPartitionInfos should not contain new partition", partitionInfo.getPartitionId(), newPartition);
  }
  // Add the new replica to the replication manager
  assertTrue("Adding new replica to replication manager should succeed", replicationManager.addReplica(replicaToTest));
  // After adding the replica, partitionToPartitionInfo and mountPathToPartitionInfos should contain the new partition
  assertTrue("partitionToPartitionInfo should contain new partition",
      replicationManager.getPartitionToPartitionInfoMap().containsKey(newPartition));
  Optional<PartitionInfo> newPartitionInfo = replicationManager.getMountPathToPartitionInfosMap()
      .get(replicaToTest.getMountPath())
      .stream()
      .filter(partitionInfo -> partitionInfo.getPartitionId() == newPartition)
      .findAny();
  assertTrue("mountPathToPartitionInfos should contain new partition info", newPartitionInfo.isPresent());
  // Verify that every remoteReplicaInfo of the newly added replica has been assigned to a replica thread
  for (RemoteReplicaInfo remoteReplicaInfo : newPartitionInfo.get().getRemoteReplicaInfos()) {
    assertNotNull("The remote replica should be assigned to one replica thread", remoteReplicaInfo.getReplicaThread());
  }
  // Remove the replica
  assertTrue("Removing replica from replication manager should succeed", replicationManager.removeReplica(replicaToTest));
  // Verify the replica is removed: partitionToPartitionInfo and mountPathToPartitionInfos should no longer contain the new partition
  assertFalse("partitionToPartitionInfo should not contain new partition",
      replicationManager.getPartitionToPartitionInfoMap().containsKey(newPartition));
  for (PartitionInfo partitionInfo : replicationManager.getMountPathToPartitionInfosMap().get(replicaToTest.getMountPath())) {
    assertNotSame("mountPathToPartitionInfos should not contain new partition", partitionInfo.getPartitionId(), newPartition);
  }
  // Verify that none of the remoteReplicaInfos has an assigned replica thread anymore
  for (RemoteReplicaInfo remoteReplicaInfo : newPartitionInfo.get().getRemoteReplicaInfos()) {
    assertNull("The remote replica should no longer be assigned to a replica thread", remoteReplicaInfo.getReplicaThread());
  }
  // Removing the same replica again should be a no-op; call through the spy so the verification below is meaningful
  ReplicationManager mockManager = Mockito.spy(replicationManager);
  assertFalse("Removing a non-existent replica should return false", mockManager.removeReplica(replicaToTest));
  verify(mockManager, never()).removeRemoteReplicaInfoFromReplicaThread(anyList());
  storageManager.shutdown();
}
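Each of these tests reads a shared verifiableProperties fixture that is populated in the test class's setup (not shown on this page). As a minimal sketch of what such a fixture could look like, assuming the usual identity properties ClusterMapConfig reads (the values below are placeholders, not taken from the test class):

Properties props = new Properties();
// Identity properties ClusterMapConfig expects; placeholder values for illustration
props.setProperty("clustermap.host.name", "localhost");
props.setProperty("clustermap.cluster.name", "test-cluster");
props.setProperty("clustermap.datacenter.name", "DC1");
VerifiableProperties verifiableProperties = new VerifiableProperties(props);
ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);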
Use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin: class ReplicationTest, method replicaResumeDecommissionTest.
/**
 * Test that resuming decommission on a certain replica behaves correctly.
 * @throws Exception
 */
@Test
public void replicaResumeDecommissionTest() throws Exception {
  MockClusterMap clusterMap = new MockClusterMap();
  ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
  MockHelixParticipant.metricRegistry = new MetricRegistry();
  MockHelixParticipant mockHelixParticipant = Mockito.spy(new MockHelixParticipant(clusterMapConfig));
  doNothing().when(mockHelixParticipant).setPartitionDisabledState(anyString(), anyBoolean());
  // Choose a replica on the local node and put a decommission file into its directory
  ReplicaId localReplica = clusterMap.getReplicaIds(clusterMap.getDataNodeIds().get(0)).get(0);
  String partitionName = localReplica.getPartitionId().toPathString();
  File decommissionFile = new File(localReplica.getReplicaPath(), "decommission_in_progress");
  assertTrue("Can't create decommission file", decommissionFile.createNewFile());
  Pair<StorageManager, ReplicationManager> managers =
      createStorageManagerAndReplicationManager(clusterMap, clusterMapConfig, mockHelixParticipant);
  StorageManager storageManager = managers.getFirst();
  // Failure case 1: the store is not started when resuming decommission
  storageManager.shutdownBlobStore(localReplica.getPartitionId());
  try {
    mockHelixParticipant.onPartitionBecomeDroppedFromOffline(partitionName);
    fail("should fail because store is not started");
  } catch (StateTransitionException e) {
    assertEquals("Mismatch in error code", ReplicaOperationFailure, e.getErrorCode());
  }
  storageManager.startBlobStore(localReplica.getPartitionId());
  // Failure case 2: fail to remove the replica from the InstanceConfig in Helix
  AmbryReplicaSyncUpManager replicaSyncUpManager =
      (AmbryReplicaSyncUpManager) mockHelixParticipant.getReplicaSyncUpManager();
  mockHelixParticipant.updateNodeInfoReturnVal = false;
  CountDownLatch executionLatch = new CountDownLatch(1);
  AtomicBoolean exceptionOccurred = new AtomicBoolean(false);
  Utils.newThread(() -> {
    try {
      mockHelixParticipant.onPartitionBecomeDroppedFromOffline(partitionName);
      fail("should fail because updating node info returns false");
    } catch (StateTransitionException e) {
      exceptionOccurred.set(true);
      assertEquals("Mismatch in error code", ReplicaOperationFailure, e.getErrorCode());
    } finally {
      executionLatch.countDown();
    }
  }, false).start();
  // Busy-wait for the deactivation latch to appear (a bounded-wait alternative is sketched after this method)
  while (!replicaSyncUpManager.getPartitionToDeactivationLatch().containsKey(partitionName)) {
    Thread.sleep(100);
  }
  replicaSyncUpManager.onDeactivationComplete(localReplica);
  while (!replicaSyncUpManager.getPartitionToDisconnectionLatch().containsKey(partitionName)) {
    Thread.sleep(100);
  }
  replicaSyncUpManager.onDisconnectionComplete(localReplica);
  assertTrue("Offline-To-Dropped transition didn't complete within 1 sec", executionLatch.await(1, TimeUnit.SECONDS));
  assertTrue("State transition exception should be thrown", exceptionOccurred.get());
  mockHelixParticipant.updateNodeInfoReturnVal = null;
  storageManager.startBlobStore(localReplica.getPartitionId());
  // Success case
  mockHelixParticipant.mockStatsManagerListener = Mockito.mock(PartitionStateChangeListener.class);
  doNothing().when(mockHelixParticipant.mockStatsManagerListener).onPartitionBecomeDroppedFromOffline(anyString());
  mockHelixParticipant.registerPartitionStateChangeListener(StateModelListenerType.StatsManagerListener,
      mockHelixParticipant.mockStatsManagerListener);
  CountDownLatch participantLatch = new CountDownLatch(1);
  Utils.newThread(() -> {
    mockHelixParticipant.onPartitionBecomeDroppedFromOffline(partitionName);
    participantLatch.countDown();
  }, false).start();
  while (!replicaSyncUpManager.getPartitionToDeactivationLatch().containsKey(partitionName)) {
    Thread.sleep(100);
  }
  replicaSyncUpManager.onDeactivationComplete(localReplica);
  while (!replicaSyncUpManager.getPartitionToDisconnectionLatch().containsKey(partitionName)) {
    Thread.sleep(100);
  }
  replicaSyncUpManager.onDisconnectionComplete(localReplica);
  assertTrue("Offline-To-Dropped transition didn't complete within 1 sec", participantLatch.await(1, TimeUnit.SECONDS));
  // Verify the stats manager listener is called
  verify(mockHelixParticipant.mockStatsManagerListener).onPartitionBecomeDroppedFromOffline(anyString());
  // Verify the setPartitionDisabledState method is called
  verify(mockHelixParticipant).setPartitionDisabledState(partitionName, false);
  File storeDir = new File(localReplica.getReplicaPath());
  assertFalse("Store dir should not exist", storeDir.exists());
  storageManager.shutdown();
}
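The busy-wait loops in this test spin forever if the expected latch never shows up, which hangs the suite rather than failing it. A bounded wait is safer; the helper below is a hypothetical sketch (awaitCondition is not part of Ambry) and needs java.util.function.BooleanSupplier and java.util.concurrent.TimeoutException:

// Hypothetical helper: poll a condition every 100 ms until it holds or the deadline passes
static void awaitCondition(BooleanSupplier condition, long timeoutMs) throws InterruptedException, TimeoutException {
  long deadline = System.currentTimeMillis() + timeoutMs;
  while (!condition.getAsBoolean()) {
    if (System.currentTimeMillis() > deadline) {
      throw new TimeoutException("Condition not met within " + timeoutMs + " ms");
    }
    Thread.sleep(100);
  }
}

Each polling loop then collapses to a single bounded call, for example: awaitCondition(() -> replicaSyncUpManager.getPartitionToDeactivationLatch().containsKey(partitionName), 5000);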
Use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin: class ReplicationTest, method replicaFromInactiveToOfflineTest.
/**
 * Test the INACTIVE -> OFFLINE transition on an existing replica (both success and failure cases).
 */
@Test
public void replicaFromInactiveToOfflineTest() throws Exception {
  MockClusterMap clusterMap = new MockClusterMap();
  ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
  MockHelixParticipant.metricRegistry = new MetricRegistry();
  MockHelixParticipant mockHelixParticipant = new MockHelixParticipant(clusterMapConfig);
  Pair<StorageManager, ReplicationManager> managers =
      createStorageManagerAndReplicationManager(clusterMap, clusterMapConfig, mockHelixParticipant);
  StorageManager storageManager = managers.getFirst();
  MockReplicationManager replicationManager = (MockReplicationManager) managers.getSecond();
  // 1. test the replica-not-found case
  try {
    mockHelixParticipant.onPartitionBecomeOfflineFromInactive("-1");
    fail("should fail because of invalid partition");
  } catch (StateTransitionException e) {
    assertEquals("Error code doesn't match", ReplicaNotFound, e.getErrorCode());
  }
  // 2. test the store-not-started case
  PartitionId existingPartition = replicationManager.partitionToPartitionInfo.keySet().iterator().next();
  storageManager.shutdownBlobStore(existingPartition);
  try {
    mockHelixParticipant.onPartitionBecomeOfflineFromInactive(existingPartition.toPathString());
    fail("should fail because store is not started");
  } catch (StateTransitionException e) {
    assertEquals("Error code doesn't match", StoreNotStarted, e.getErrorCode());
  }
  storageManager.startBlobStore(existingPartition);
  // Before testing the success case, write a blob (size = 100) into the local store and add a delete record for the new blob
  Store localStore = storageManager.getStore(existingPartition);
  MockId id = new MockId(TestUtils.getRandomString(10), Utils.getRandomShort(TestUtils.RANDOM),
      Utils.getRandomShort(TestUtils.RANDOM));
  long crc = (new Random()).nextLong();
  long blobSize = 100;
  MessageInfo info = new MessageInfo(id, blobSize, false, false, Utils.Infinite_Time, crc, id.getAccountId(),
      id.getContainerId(), Utils.Infinite_Time);
  List<MessageInfo> infos = new ArrayList<>();
  List<ByteBuffer> buffers = new ArrayList<>();
  ByteBuffer buffer = ByteBuffer.wrap(TestUtils.getRandomBytes((int) blobSize));
  infos.add(info);
  buffers.add(buffer);
  localStore.put(new MockMessageWriteSet(infos, buffers));
  // Delete the blob
  int deleteRecordSize = (int) (new DeleteMessageFormatInputStream(id, (short) 0, (short) 0, 0).getSize());
  MessageInfo deleteInfo = new MessageInfo(id, deleteRecordSize, id.getAccountId(), id.getContainerId(), time.milliseconds());
  localStore.delete(Collections.singletonList(deleteInfo));
  int sizeOfPutAndHeader = 100 + 18;
  int sizeOfWhole = sizeOfPutAndHeader + deleteRecordSize;
  // Note that the end offset of the last PUT is 100 + 18 = 118; the end offset of the store is sizeOfWhole
  // 3. test the success case (create a new thread and trigger the INACTIVE -> OFFLINE transition)
  ReplicaId localReplica = storageManager.getReplica(existingPartition.toPathString());
  // Put a decommission-in-progress file into the local store dir
  File decommissionFile = new File(localReplica.getReplicaPath(), "decommission_in_progress");
  assertTrue("Couldn't create decommission file in local store", decommissionFile.createNewFile());
  decommissionFile.deleteOnExit();
  assertNotSame("Before disconnection, the local store state shouldn't be OFFLINE", ReplicaState.OFFLINE,
      localStore.getCurrentState());
  mockHelixParticipant.registerPartitionStateChangeListener(StateModelListenerType.ReplicationManagerListener,
      replicationManager.replicationListener);
  CountDownLatch participantLatch = new CountDownLatch(1);
  replicationManager.listenerExecutionLatch = new CountDownLatch(1);
  Utils.newThread(() -> {
    mockHelixParticipant.onPartitionBecomeOfflineFromInactive(existingPartition.toPathString());
    participantLatch.countDown();
  }, false).start();
  assertTrue("Partition state change listener in ReplicationManager didn't get called within 1 sec",
      replicationManager.listenerExecutionLatch.await(1, TimeUnit.SECONDS));
  // The state of the local store should be updated to OFFLINE
  assertEquals("Local store state is not expected", ReplicaState.OFFLINE, localStore.getCurrentState());
  // Update the replication lag between the local and peer replicas
  List<RemoteReplicaInfo> remoteReplicaInfos =
      replicationManager.partitionToPartitionInfo.get(existingPartition).getRemoteReplicaInfos();
  ReplicaId peerReplica1 = remoteReplicaInfos.get(0).getReplicaId();
  ReplicaId peerReplica2 = remoteReplicaInfos.get(1).getReplicaId();
  // peer1 catches up with the last PUT, peer2 catches up with the end offset of the local store; sync-up is not yet complete
  replicationManager.updateTotalBytesReadByRemoteReplica(existingPartition, peerReplica1.getDataNodeId().getHostname(),
      peerReplica1.getReplicaPath(), sizeOfPutAndHeader);
  replicationManager.updateTotalBytesReadByRemoteReplica(existingPartition, peerReplica2.getDataNodeId().getHostname(),
      peerReplica2.getReplicaPath(), sizeOfWhole);
  assertFalse("Only one peer replica has fully caught up with the end offset, so sync-up should not complete",
      mockHelixParticipant.getReplicaSyncUpManager().isSyncUpComplete(localReplica));
  // Make peer1 catch up with the end offset
  replicationManager.updateTotalBytesReadByRemoteReplica(existingPartition, peerReplica1.getDataNodeId().getHostname(),
      peerReplica1.getReplicaPath(), sizeOfWhole);
  // Now sync-up should complete and the transition should be able to proceed
  assertTrue("Inactive-To-Offline transition didn't complete within 1 sec", participantLatch.await(1, TimeUnit.SECONDS));
  assertFalse("Local store should be stopped after transition", localStore.isStarted());
  storageManager.shutdown();
}
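The sync-up rule the lag updates exercise can be stated simply: the local replica is caught up only once enough peers have read up to the local store's end offset. The sketch below illustrates that rule; it is not Ambry's actual ReplicaSyncUpManager logic, and the method name and parameters are invented for illustration:

// Illustrative only: a peer counts as caught up once it has read at least localEndOffset bytes
static boolean isSyncUpComplete(long localEndOffset, List<Long> bytesReadByPeers, int minCaughtUpPeers) {
  long caughtUp = bytesReadByPeers.stream().filter(bytesRead -> bytesRead >= localEndOffset).count();
  return caughtUp >= minCaughtUpPeers;
}

In the test, peer2 reaches sizeOfWhole (the store's end offset) while peer1 stops at sizeOfPutAndHeader = 118, so sync-up is incomplete; once peer1 also reaches sizeOfWhole, sync-up completes and the transition finishes.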
Use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin: class ReplicationTest, method replicaFromStandbyToLeaderTest.
/**
 * Test the state transition in replication manager from STANDBY to LEADER.
 * Test setup: when creating partitions, make sure there is exactly one replica in LEADER state in each data center.
 * Test condition: when a partition on the current node moves from STANDBY to LEADER, verify that the in-memory map
 * from partition to peer leader replicas is updated correctly.
 * @throws Exception
 */
@Test
public void replicaFromStandbyToLeaderTest() throws Exception {
  MockClusterMap clusterMap = new MockClusterMap();
  ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
  MockHelixParticipant.metricRegistry = new MetricRegistry();
  MockHelixParticipant mockHelixParticipant = new MockHelixParticipant(clusterMapConfig);
  ReplicationConfig initialReplicationConfig = replicationConfig;
  properties.setProperty("replication.model.across.datacenters", "LEADER_BASED");
  replicationConfig = new ReplicationConfig(new VerifiableProperties(properties));
  Pair<StorageManager, ReplicationManager> managers =
      createStorageManagerAndReplicationManager(clusterMap, clusterMapConfig, mockHelixParticipant);
  StorageManager storageManager = managers.getFirst();
  MockReplicationManager replicationManager = (MockReplicationManager) managers.getSecond();
  List<ReplicaId> replicaIds = clusterMap.getReplicaIds(replicationManager.dataNodeId);
  for (ReplicaId replicaId : replicaIds) {
    MockReplicaId mockReplicaId = (MockReplicaId) replicaId;
    if (mockReplicaId.getReplicaState() == ReplicaState.LEADER) {
      PartitionId existingPartition = mockReplicaId.getPartitionId();
      mockHelixParticipant.onPartitionBecomeLeaderFromStandby(existingPartition.toPathString());
      Set<ReplicaId> peerLeaderReplicasInReplicationManager =
          replicationManager.leaderBasedReplicationAdmin.getLeaderPartitionToPeerLeaderReplicas()
              .get(existingPartition.toPathString());
      Set<ReplicaId> peerLeaderReplicasInClusterMap =
          new HashSet<>(existingPartition.getReplicaIdsByState(ReplicaState.LEADER, null));
      peerLeaderReplicasInClusterMap.remove(mockReplicaId);
      assertThat("Mismatch in the set of peer leader replicas between replication manager and cluster map",
          peerLeaderReplicasInReplicationManager, is(peerLeaderReplicasInClusterMap));
    }
  }
  storageManager.shutdown();
  replicationConfig = initialReplicationConfig;
}
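One fragility worth noting: replicationConfig is restored only on the success path, so a failed assertion would leave the LEADER_BASED override in place for later tests. A sketch of a safer arrangement, using only statements already present in the test:

ReplicationConfig initialReplicationConfig = replicationConfig;
properties.setProperty("replication.model.across.datacenters", "LEADER_BASED");
replicationConfig = new ReplicationConfig(new VerifiableProperties(properties));
StorageManager storageManager = null;
try {
  // create the managers and run the per-replica assertions as above,
  // assigning storageManager inside the try block
} finally {
  if (storageManager != null) {
    storageManager.shutdown();
  }
  replicationConfig = initialReplicationConfig;
}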
Use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin: class RestServerMain, method main.
public static void main(String[] args) {
  final RestServer restServer;
  int exitCode = 0;
  ClusterMap clusterMap = null;
  try {
    InvocationOptions options = new InvocationOptions(args);
    Properties properties = Utils.loadProps(options.serverPropsFilePath);
    VerifiableProperties verifiableProperties = new VerifiableProperties(properties);
    ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
    ClusterAgentsFactory clusterAgentsFactory = Utils.getObj(clusterMapConfig.clusterMapClusterAgentsFactory,
        clusterMapConfig, options.hardwareLayoutFilePath, options.partitionLayoutFilePath);
    clusterMap = clusterAgentsFactory.getClusterMap();
    SSLFactory sslFactory = getSSLFactoryIfRequired(verifiableProperties);
    logger.info("Bootstrapping RestServer");
    restServer = new RestServer(verifiableProperties, clusterMap, new LoggingNotificationSystem(), sslFactory);
    // Attach a shutdown handler to catch control-C
    Runtime.getRuntime().addShutdownHook(new Thread(() -> {
      logger.info("Received shutdown signal. Shutting down RestServer");
      restServer.shutdown();
    }));
    restServer.start();
    restServer.awaitShutdown();
  } catch (Exception e) {
    logger.error("Exception during bootstrap of RestServer", e);
    exitCode = 1;
  } finally {
    if (clusterMap != null) {
      clusterMap.close();
    }
  }
  logger.info("Exiting RestServerMain");
  System.exit(exitCode);
}
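Utils.getObj is what lets the ClusterAgentsFactory implementation be chosen purely by the class name held in clusterMapConfig.clusterMapClusterAgentsFactory. A simplified sketch of that reflection pattern (Ambry's real Utils.getObj is more careful about matching constructor parameter types; the getInstance name here is invented):

// Simplified reflection-based instantiation in the spirit of Utils.getObj
// (requires java.lang.reflect.Constructor)
@SuppressWarnings("unchecked")
static <T> T getInstance(String className, Object... ctorArgs) throws ReflectiveOperationException {
  Class<?> clazz = Class.forName(className);
  for (Constructor<?> ctor : clazz.getConstructors()) {
    if (ctor.getParameterCount() == ctorArgs.length) {
      return (T) ctor.newInstance(ctorArgs);
    }
  }
  throw new NoSuchMethodException("No " + ctorArgs.length + "-arg public constructor on " + className);
}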