use of com.github.ambry.clustermap.StateTransitionException in project ambry by linkedin.
the class ReplicationTest method replicaFromStandbyToInactiveTest.
/**
 * Test STANDBY -> INACTIVE transition on existing replica (both success and failure cases).
 * <p>
 * Failure case: the transition is triggered while the local store is shut down and must be rejected with
 * {@code StoreNotStarted}.
 * <p>
 * Success case: the transition is triggered on a background thread. The test then reports peer replicas as caught up
 * one at a time and verifies that the transition only returns after the second peer has caught up with the last PUT
 * in the local store.
 * @throws Exception
 */
@Test
public void replicaFromStandbyToInactiveTest() throws Exception {
  MockClusterMap clusterMap = new MockClusterMap();
  ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
  MockHelixParticipant.metricRegistry = new MetricRegistry();
  MockHelixParticipant mockHelixParticipant = new MockHelixParticipant(clusterMapConfig);
  Pair<StorageManager, ReplicationManager> managers =
      createStorageManagerAndReplicationManager(clusterMap, clusterMapConfig, mockHelixParticipant);
  StorageManager storageManager = managers.getFirst();
  MockReplicationManager replicationManager = (MockReplicationManager) managers.getSecond();
  try {
    // get an existing partition to test both success and failure cases
    PartitionId existingPartition = replicationManager.partitionToPartitionInfo.keySet().iterator().next();
    storageManager.shutdownBlobStore(existingPartition);
    try {
      mockHelixParticipant.onPartitionBecomeInactiveFromStandby(existingPartition.toPathString());
      fail("should fail because store is not started");
    } catch (StateTransitionException e) {
      assertEquals("Error code doesn't match", StoreNotStarted, e.getErrorCode());
    }
    // restart the store and trigger Standby-To-Inactive transition again
    storageManager.startBlobStore(existingPartition);
    // write a blob with size = 100 into local store (end offset of last PUT = 100 + 18 = 118)
    Store localStore = storageManager.getStore(existingPartition);
    MockId id = new MockId(TestUtils.getRandomString(10), Utils.getRandomShort(TestUtils.RANDOM),
        Utils.getRandomShort(TestUtils.RANDOM));
    long crc = (new Random()).nextLong();
    long blobSize = 100;
    MessageInfo info =
        new MessageInfo(id, blobSize, false, false, Utils.Infinite_Time, crc, id.getAccountId(), id.getContainerId(),
            Utils.Infinite_Time);
    List<MessageInfo> infos = new ArrayList<>();
    List<ByteBuffer> buffers = new ArrayList<>();
    ByteBuffer buffer = ByteBuffer.wrap(TestUtils.getRandomBytes((int) blobSize));
    infos.add(info);
    buffers.add(buffer);
    localStore.put(new MockMessageWriteSet(infos, buffers));
    ReplicaId localReplica = storageManager.getReplica(existingPartition.toPathString());
    // override partition state change listener in ReplicationManager to help thread manipulation
    mockHelixParticipant.registerPartitionStateChangeListener(StateModelListenerType.ReplicationManagerListener,
        replicationManager.replicationListener);
    CountDownLatch participantLatch = new CountDownLatch(1);
    replicationManager.listenerExecutionLatch = new CountDownLatch(1);
    // create a new thread and trigger STANDBY -> INACTIVE transition; the latch is only released when the
    // transition call returns normally
    Utils.newThread(() -> {
      mockHelixParticipant.onPartitionBecomeInactiveFromStandby(existingPartition.toPathString());
      participantLatch.countDown();
    }, false).start();
    assertTrue("Partition state change listener didn't get called within 1 sec",
        replicationManager.listenerExecutionLatch.await(1, TimeUnit.SECONDS));
    assertEquals("Local store state should be INACTIVE", ReplicaState.INACTIVE,
        storageManager.getStore(existingPartition).getCurrentState());
    List<RemoteReplicaInfo> remoteReplicaInfos =
        replicationManager.partitionToPartitionInfo.get(existingPartition).getRemoteReplicaInfos();
    ReplicaId peerReplica1 = remoteReplicaInfos.get(0).getReplicaId();
    assertFalse("Sync up should not complete because not enough replicas have caught up",
        mockHelixParticipant.getReplicaSyncUpManager()
            .updateReplicaLagAndCheckSyncStatus(localReplica, peerReplica1, 10L, ReplicaState.INACTIVE));
    // pick another remote replica to update the replication lag
    ReplicaId peerReplica2 = remoteReplicaInfos.get(1).getReplicaId();
    replicationManager.updateTotalBytesReadByRemoteReplica(existingPartition,
        peerReplica1.getDataNodeId().getHostname(), peerReplica1.getReplicaPath(), 118);
    assertFalse("Sync up shouldn't complete because only one replica has caught up with local replica",
        mockHelixParticipant.getReplicaSyncUpManager().isSyncUpComplete(localReplica));
    // make second peer replica catch up with last PUT in local store
    replicationManager.updateTotalBytesReadByRemoteReplica(existingPartition,
        peerReplica2.getDataNodeId().getHostname(), peerReplica2.getReplicaPath(), 118);
    assertTrue("Standby-To-Inactive transition didn't complete within 1 sec",
        participantLatch.await(1, TimeUnit.SECONDS));
    // we purposely update lag against local replica to verify local replica is no longer in ReplicaSyncUpManager
    // because deactivation is complete and local replica should be removed from "replicaToLagInfos" map.
    assertFalse("Sync up should complete (2 replicas have caught up), hence updated should be false",
        mockHelixParticipant.getReplicaSyncUpManager()
            .updateReplicaLagAndCheckSyncStatus(localReplica, peerReplica2, 0L, ReplicaState.INACTIVE));
  } finally {
    // shut down even when an assertion above fails so that started stores are not leaked into subsequent tests
    storageManager.shutdown();
  }
}
use of com.github.ambry.clustermap.StateTransitionException in project ambry by linkedin.
the class ReplicationTest method replicaFromBootstrapToStandbyTest.
/**
 * Test BOOTSTRAP -> STANDBY transition on both existing and new replicas.
 * <ol>
 *   <li>Existing replica with a started store: the transition returns and the store state is STANDBY.</li>
 *   <li>Existing replica whose store has been shut down: the transition fails with {@code StoreNotStarted}.</li>
 *   <li>Newly added replica: the transition is triggered on a background thread, the store stays in BOOTSTRAP until
 *   bootstrap completion is signalled, after which the transition returns.</li>
 * </ol>
 * @throws Exception
 */
@Test
public void replicaFromBootstrapToStandbyTest() throws Exception {
  MockClusterMap clusterMap = new MockClusterMap();
  ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
  MockHelixParticipant.metricRegistry = new MetricRegistry();
  MockHelixParticipant mockHelixParticipant = new MockHelixParticipant(clusterMapConfig);
  Pair<StorageManager, ReplicationManager> managers =
      createStorageManagerAndReplicationManager(clusterMap, clusterMapConfig, mockHelixParticipant);
  StorageManager storageManager = managers.getFirst();
  MockReplicationManager replicationManager = (MockReplicationManager) managers.getSecond();
  try {
    // 1. test existing partition through Bootstrap-To-Standby transition, should be no op.
    PartitionId existingPartition = replicationManager.partitionToPartitionInfo.keySet().iterator().next();
    mockHelixParticipant.onPartitionBecomeStandbyFromBootstrap(existingPartition.toPathString());
    assertEquals("Store state doesn't match", ReplicaState.STANDBY,
        storageManager.getStore(existingPartition).getCurrentState());
    // 2. test transition failure due to store not started
    storageManager.shutdownBlobStore(existingPartition);
    try {
      mockHelixParticipant.onPartitionBecomeStandbyFromBootstrap(existingPartition.toPathString());
      fail("should fail because store is not started");
    } catch (StateTransitionException e) {
      assertEquals("Error code doesn't match", StoreNotStarted, e.getErrorCode());
    }
    // 3. create new replica and add it into storage manager, test replica that needs to initiate bootstrap
    ReplicaId newReplicaToAdd = getNewReplicaToAdd(clusterMap);
    assertTrue("Adding new replica to Storage Manager should succeed", storageManager.addBlobStore(newReplicaToAdd));
    // override partition state change listener in ReplicationManager to help thread manipulation
    mockHelixParticipant.registerPartitionStateChangeListener(StateModelListenerType.ReplicationManagerListener,
        replicationManager.replicationListener);
    CountDownLatch participantLatch = new CountDownLatch(1);
    replicationManager.listenerExecutionLatch = new CountDownLatch(1);
    // create a new thread and trigger BOOTSTRAP -> STANDBY transition; the latch is only released when the
    // transition call returns normally
    Utils.newThread(() -> {
      mockHelixParticipant.onPartitionBecomeStandbyFromBootstrap(newReplicaToAdd.getPartitionId().toPathString());
      participantLatch.countDown();
    }, false).start();
    assertTrue("Partition state change listener in ReplicationManager didn't get called within 1 sec",
        replicationManager.listenerExecutionLatch.await(1, TimeUnit.SECONDS));
    assertEquals("Replica should be in BOOTSTRAP state before bootstrap is complete", ReplicaState.BOOTSTRAP,
        storageManager.getStore(newReplicaToAdd.getPartitionId()).getCurrentState());
    // make bootstrap succeed
    mockHelixParticipant.getReplicaSyncUpManager().onBootstrapComplete(newReplicaToAdd);
    assertTrue("Bootstrap-To-Standby transition didn't complete within 1 sec",
        participantLatch.await(1, TimeUnit.SECONDS));
  } finally {
    // shut down even when an assertion above fails so that started stores are not leaked into subsequent tests
    storageManager.shutdown();
  }
}
use of com.github.ambry.clustermap.StateTransitionException in project ambry by linkedin.
the class ReplicationTest method replicaResumeDecommissionTest.
/**
 * Test that resuming decommission on certain replica behaves correctly.
 * <p>
 * A "decommission_in_progress" file is placed in the replica directory before the managers are created, then three
 * scenarios are exercised through the OFFLINE -> DROPPED transition:
 * <ol>
 *   <li>store is not started: the transition fails with {@code ReplicaOperationFailure};</li>
 *   <li>{@code updateNodeInfoReturnVal} is forced to {@code false}: the transition fails with
 *   {@code ReplicaOperationFailure};</li>
 *   <li>success: the stats manager listener is invoked, {@code setPartitionDisabledState(partitionName, false)} is
 *   called and the store directory no longer exists.</li>
 * </ol>
 * In scenarios 2 and 3 the transition runs on a background thread while the test thread completes the deactivation
 * and disconnection steps on the sync-up manager.
 * @throws Exception
 */
@Test
public void replicaResumeDecommissionTest() throws Exception {
  MockClusterMap clusterMap = new MockClusterMap();
  ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
  MockHelixParticipant.metricRegistry = new MetricRegistry();
  MockHelixParticipant mockHelixParticipant = Mockito.spy(new MockHelixParticipant(clusterMapConfig));
  doNothing().when(mockHelixParticipant).setPartitionDisabledState(anyString(), anyBoolean());
  // choose a replica on local node and put decommission file into its dir
  ReplicaId localReplica = clusterMap.getReplicaIds(clusterMap.getDataNodeIds().get(0)).get(0);
  String partitionName = localReplica.getPartitionId().toPathString();
  File decommissionFile = new File(localReplica.getReplicaPath(), "decommission_in_progress");
  assertTrue("Can't create decommission file", decommissionFile.createNewFile());
  Pair<StorageManager, ReplicationManager> managers =
      createStorageManagerAndReplicationManager(clusterMap, clusterMapConfig, mockHelixParticipant);
  StorageManager storageManager = managers.getFirst();
  try {
    // failure case 1: store is not started when resuming decommission
    storageManager.shutdownBlobStore(localReplica.getPartitionId());
    try {
      mockHelixParticipant.onPartitionBecomeDroppedFromOffline(partitionName);
      fail("should fail");
    } catch (StateTransitionException e) {
      assertEquals("Mismatch in error code", ReplicaOperationFailure, e.getErrorCode());
    }
    storageManager.startBlobStore(localReplica.getPartitionId());
    // failure case 2: fail to remove replica from InstanceConfig in Helix
    AmbryReplicaSyncUpManager replicaSyncUpManager =
        (AmbryReplicaSyncUpManager) mockHelixParticipant.getReplicaSyncUpManager();
    mockHelixParticipant.updateNodeInfoReturnVal = false;
    CountDownLatch executionLatch = new CountDownLatch(1);
    // The exception is handed over to the test thread instead of being asserted on inside the worker: an
    // AssertionError raised on the worker thread would never reach JUnit and the test would pass regardless.
    java.util.concurrent.atomic.AtomicReference<StateTransitionException> transitionFailure =
        new java.util.concurrent.atomic.AtomicReference<>();
    Utils.newThread(() -> {
      try {
        mockHelixParticipant.onPartitionBecomeDroppedFromOffline(partitionName);
      } catch (StateTransitionException e) {
        transitionFailure.set(e);
      } finally {
        executionLatch.countDown();
      }
    }, false).start();
    awaitSyncUpLatchRegistered(() -> replicaSyncUpManager.getPartitionToDeactivationLatch().containsKey(partitionName),
        "Deactivation latch for partition " + partitionName);
    replicaSyncUpManager.onDeactivationComplete(localReplica);
    awaitSyncUpLatchRegistered(
        () -> replicaSyncUpManager.getPartitionToDisconnectionLatch().containsKey(partitionName),
        "Disconnection latch for partition " + partitionName);
    replicaSyncUpManager.onDisconnectionComplete(localReplica);
    assertTrue("Offline-To-Dropped transition didn't complete within 1 sec",
        executionLatch.await(1, TimeUnit.SECONDS));
    StateTransitionException failure = transitionFailure.get();
    assertTrue("State transition exception should be thrown because updating node info returns false",
        failure != null);
    assertEquals("Mismatch in error code", ReplicaOperationFailure, failure.getErrorCode());
    mockHelixParticipant.updateNodeInfoReturnVal = null;
    storageManager.startBlobStore(localReplica.getPartitionId());
    // success case
    mockHelixParticipant.mockStatsManagerListener = Mockito.mock(PartitionStateChangeListener.class);
    doNothing().when(mockHelixParticipant.mockStatsManagerListener).onPartitionBecomeDroppedFromOffline(anyString());
    mockHelixParticipant.registerPartitionStateChangeListener(StateModelListenerType.StatsManagerListener,
        mockHelixParticipant.mockStatsManagerListener);
    CountDownLatch participantLatch = new CountDownLatch(1);
    // the latch is only released when the transition call returns normally
    Utils.newThread(() -> {
      mockHelixParticipant.onPartitionBecomeDroppedFromOffline(partitionName);
      participantLatch.countDown();
    }, false).start();
    awaitSyncUpLatchRegistered(() -> replicaSyncUpManager.getPartitionToDeactivationLatch().containsKey(partitionName),
        "Deactivation latch for partition " + partitionName);
    replicaSyncUpManager.onDeactivationComplete(localReplica);
    awaitSyncUpLatchRegistered(
        () -> replicaSyncUpManager.getPartitionToDisconnectionLatch().containsKey(partitionName),
        "Disconnection latch for partition " + partitionName);
    replicaSyncUpManager.onDisconnectionComplete(localReplica);
    assertTrue("Offline-To-Dropped transition didn't complete within 1 sec",
        participantLatch.await(1, TimeUnit.SECONDS));
    // verify stats manager listener is called
    verify(mockHelixParticipant.mockStatsManagerListener).onPartitionBecomeDroppedFromOffline(anyString());
    // verify setPartitionDisabledState method is called
    verify(mockHelixParticipant).setPartitionDisabledState(partitionName, false);
    File storeDir = new File(localReplica.getReplicaPath());
    assertFalse("Store dir should not exist", storeDir.exists());
  } finally {
    // shut down even when an assertion above fails so that started stores are not leaked into subsequent tests
    storageManager.shutdown();
  }
}

/**
 * Polls {@code registered} every 100 ms until it reports {@code true}. Fails the calling test if that does not happen
 * within 10 seconds, so that a transition that never reaches the expected step cannot hang the test run.
 * @param registered the condition to poll.
 * @param description what is being waited for; used as the prefix of the failure message.
 * @throws InterruptedException if the calling thread is interrupted while sleeping between polls.
 */
private static void awaitSyncUpLatchRegistered(java.util.function.BooleanSupplier registered, String description)
    throws InterruptedException {
  long deadlineNanos = System.nanoTime() + TimeUnit.SECONDS.toNanos(10);
  while (!registered.getAsBoolean()) {
    if (System.nanoTime() - deadlineNanos > 0) {
      fail(description + " was not registered within 10 secs");
    }
    Thread.sleep(100);
  }
}
use of com.github.ambry.clustermap.StateTransitionException in project ambry by linkedin.
the class ReplicationTest method replicaFromInactiveToOfflineTest.
/**
 * Test INACTIVE -> OFFLINE transition on existing replica (both success and failure cases).
 * <ol>
 *   <li>Unknown partition: the transition fails with {@code ReplicaNotFound}.</li>
 *   <li>Store shut down: the transition fails with {@code StoreNotStarted}.</li>
 *   <li>Success: after a PUT and a DELETE are written to the local store, the transition is triggered on a
 *   background thread. The test verifies it only returns once both peer replicas have been reported as caught up
 *   with the end offset of the store (PUT plus delete record), and that the local store is stopped afterwards.</li>
 * </ol>
 * @throws Exception
 */
@Test
public void replicaFromInactiveToOfflineTest() throws Exception {
  MockClusterMap clusterMap = new MockClusterMap();
  ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
  MockHelixParticipant.metricRegistry = new MetricRegistry();
  MockHelixParticipant mockHelixParticipant = new MockHelixParticipant(clusterMapConfig);
  Pair<StorageManager, ReplicationManager> managers =
      createStorageManagerAndReplicationManager(clusterMap, clusterMapConfig, mockHelixParticipant);
  StorageManager storageManager = managers.getFirst();
  MockReplicationManager replicationManager = (MockReplicationManager) managers.getSecond();
  try {
    // 1. test replica not found case
    try {
      mockHelixParticipant.onPartitionBecomeOfflineFromInactive("-1");
      fail("should fail because of invalid partition");
    } catch (StateTransitionException e) {
      assertEquals("Error code doesn't match", ReplicaNotFound, e.getErrorCode());
    }
    // 2. test store not started case
    PartitionId existingPartition = replicationManager.partitionToPartitionInfo.keySet().iterator().next();
    storageManager.shutdownBlobStore(existingPartition);
    try {
      mockHelixParticipant.onPartitionBecomeOfflineFromInactive(existingPartition.toPathString());
      fail("should fail because store is not started");
    } catch (StateTransitionException e) {
      assertEquals("Error code doesn't match", StoreNotStarted, e.getErrorCode());
    }
    storageManager.startBlobStore(existingPartition);
    // before testing success case, let's write a blob (size = 100) into local store and add a delete record for
    // new blob
    Store localStore = storageManager.getStore(existingPartition);
    MockId id = new MockId(TestUtils.getRandomString(10), Utils.getRandomShort(TestUtils.RANDOM),
        Utils.getRandomShort(TestUtils.RANDOM));
    long crc = (new Random()).nextLong();
    long blobSize = 100;
    MessageInfo info =
        new MessageInfo(id, blobSize, false, false, Utils.Infinite_Time, crc, id.getAccountId(), id.getContainerId(),
            Utils.Infinite_Time);
    List<MessageInfo> infos = new ArrayList<>();
    List<ByteBuffer> buffers = new ArrayList<>();
    ByteBuffer buffer = ByteBuffer.wrap(TestUtils.getRandomBytes((int) blobSize));
    infos.add(info);
    buffers.add(buffer);
    localStore.put(new MockMessageWriteSet(infos, buffers));
    // delete the blob
    int deleteRecordSize = (int) (new DeleteMessageFormatInputStream(id, (short) 0, (short) 0, 0).getSize());
    MessageInfo deleteInfo =
        new MessageInfo(id, deleteRecordSize, id.getAccountId(), id.getContainerId(), time.milliseconds());
    localStore.delete(Collections.singletonList(deleteInfo));
    // derived from blobSize so the offsets stay correct if the blob size above is ever changed
    int sizeOfPutAndHeader = (int) blobSize + 18;
    int sizeOfWhole = sizeOfPutAndHeader + deleteRecordSize;
    // note that end offset of last PUT = 100 + 18 = 118, end offset of the store is sizeOfWhole
    // 3. test success case (create a new thread and trigger INACTIVE -> OFFLINE transition)
    ReplicaId localReplica = storageManager.getReplica(existingPartition.toPathString());
    // put a decommission-in-progress file into local store dir
    File decommissionFile = new File(localReplica.getReplicaPath(), "decommission_in_progress");
    assertTrue("Couldn't create decommission file in local store", decommissionFile.createNewFile());
    decommissionFile.deleteOnExit();
    assertNotSame("Before disconnection, the local store state shouldn't be OFFLINE", ReplicaState.OFFLINE,
        localStore.getCurrentState());
    mockHelixParticipant.registerPartitionStateChangeListener(StateModelListenerType.ReplicationManagerListener,
        replicationManager.replicationListener);
    CountDownLatch participantLatch = new CountDownLatch(1);
    replicationManager.listenerExecutionLatch = new CountDownLatch(1);
    // the latch is only released when the transition call returns normally
    Utils.newThread(() -> {
      mockHelixParticipant.onPartitionBecomeOfflineFromInactive(existingPartition.toPathString());
      participantLatch.countDown();
    }, false).start();
    assertTrue("Partition state change listener in ReplicationManager didn't get called within 1 sec",
        replicationManager.listenerExecutionLatch.await(1, TimeUnit.SECONDS));
    // the state of local store should be updated to OFFLINE
    assertEquals("Local store state is not expected", ReplicaState.OFFLINE, localStore.getCurrentState());
    // update replication lag between local and peer replicas
    List<RemoteReplicaInfo> remoteReplicaInfos =
        replicationManager.partitionToPartitionInfo.get(existingPartition).getRemoteReplicaInfos();
    ReplicaId peerReplica1 = remoteReplicaInfos.get(0).getReplicaId();
    ReplicaId peerReplica2 = remoteReplicaInfos.get(1).getReplicaId();
    // peer1 catches up with last PUT, peer2 catches up with end offset of local store. In this case, SyncUp is not
    // complete
    replicationManager.updateTotalBytesReadByRemoteReplica(existingPartition,
        peerReplica1.getDataNodeId().getHostname(), peerReplica1.getReplicaPath(), sizeOfPutAndHeader);
    replicationManager.updateTotalBytesReadByRemoteReplica(existingPartition,
        peerReplica2.getDataNodeId().getHostname(), peerReplica2.getReplicaPath(), sizeOfWhole);
    assertFalse("Only one peer replica has fully caught up with end offset so sync-up should not complete",
        mockHelixParticipant.getReplicaSyncUpManager().isSyncUpComplete(localReplica));
    // make peer1 catch up with end offset
    replicationManager.updateTotalBytesReadByRemoteReplica(existingPartition,
        peerReplica1.getDataNodeId().getHostname(), peerReplica1.getReplicaPath(), sizeOfWhole);
    // Now, sync-up should complete and transition should be able to proceed.
    assertTrue("Inactive-To-Offline transition didn't complete within 1 sec",
        participantLatch.await(1, TimeUnit.SECONDS));
    assertFalse("Local store should be stopped after transition", localStore.isStarted());
  } finally {
    // shut down even when an assertion above fails so that started stores are not leaked into subsequent tests
    storageManager.shutdown();
  }
}
use of com.github.ambry.clustermap.StateTransitionException in project ambry by linkedin.
the class ReplicationTest method replicaFromOfflineToBootstrapTest.
/**
 * Test state transition in replication manager from OFFLINE to BOOTSTRAP.
 * <ol>
 *   <li>Unknown partition: the transition fails with {@code ReplicaNotFound}.</li>
 *   <li>New replica added to the storage manager: after the transition the partition is present in
 *   {@code partitionToPartitionInfo}.</li>
 *   <li>Replica addition forced to fail via {@code addReplicaReturnVal = false}: the transition fails with
 *   {@code ReplicaOperationFailure}.</li>
 *   <li>Existing replica: the transition is invoked and is expected to return without throwing.</li>
 * </ol>
 * @throws Exception
 */
@Test
public void replicaFromOfflineToBootstrapTest() throws Exception {
  MockClusterMap clusterMap = new MockClusterMap();
  ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
  MockHelixParticipant.metricRegistry = new MetricRegistry();
  MockHelixParticipant mockHelixParticipant = new MockHelixParticipant(clusterMapConfig);
  DataNodeId currentNode = clusterMap.getDataNodeIds().get(0);
  Pair<StorageManager, ReplicationManager> managers =
      createStorageManagerAndReplicationManager(clusterMap, clusterMapConfig, mockHelixParticipant);
  StorageManager storageManager = managers.getFirst();
  MockReplicationManager replicationManager = (MockReplicationManager) managers.getSecond();
  try {
    assertTrue("State change listener in cluster participant should contain replication manager listener",
        mockHelixParticipant.getPartitionStateChangeListeners()
            .containsKey(StateModelListenerType.ReplicationManagerListener));
    // 1. test partition not found case (should throw exception)
    try {
      mockHelixParticipant.onPartitionBecomeBootstrapFromOffline("-1");
      fail("should fail because replica is not found");
    } catch (StateTransitionException e) {
      assertEquals("Transition error doesn't match", ReplicaNotFound, e.getErrorCode());
    }
    // 2. create a new partition and test replica addition success case
    ReplicaId newReplicaToAdd = getNewReplicaToAdd(clusterMap);
    PartitionId newPartition = newReplicaToAdd.getPartitionId();
    assertTrue("Adding new replica to Storage Manager should succeed", storageManager.addBlobStore(newReplicaToAdd));
    assertFalse("partitionToPartitionInfo should not contain new partition",
        replicationManager.partitionToPartitionInfo.containsKey(newPartition));
    mockHelixParticipant.onPartitionBecomeBootstrapFromOffline(newPartition.toPathString());
    assertTrue("partitionToPartitionInfo should contain new partition",
        replicationManager.partitionToPartitionInfo.containsKey(newPartition));
    // 3. test replica addition failure case
    replicationManager.partitionToPartitionInfo.remove(newPartition);
    replicationManager.addReplicaReturnVal = false;
    try {
      mockHelixParticipant.onPartitionBecomeBootstrapFromOffline(newPartition.toPathString());
      fail("should fail due to replica addition failure");
    } catch (StateTransitionException e) {
      assertEquals("Transition error doesn't match", ReplicaOperationFailure, e.getErrorCode());
    }
    replicationManager.addReplicaReturnVal = null;
    // 4. test OFFLINE -> BOOTSTRAP on existing replica (should be no-op)
    ReplicaId existingReplica = clusterMap.getReplicaIds(currentNode).get(0);
    assertTrue("partitionToPartitionInfo should contain existing partition",
        replicationManager.partitionToPartitionInfo.containsKey(existingReplica.getPartitionId()));
    mockHelixParticipant.onPartitionBecomeBootstrapFromOffline(existingReplica.getPartitionId().toPathString());
  } finally {
    // shut down even when an assertion above fails so that started stores are not leaked into subsequent tests
    storageManager.shutdown();
  }
}
Aggregations