use of com.github.ambry.clustermap.MockHelixParticipant in project ambry by linkedin.
the class BlobStoreTest method storeErrorTriggerDisableReplicaTest.
/**
* Test that the replica is correctly disabled when the store is shut down due to a disk I/O error.
* @throws Exception
*/
@Test
public void storeErrorTriggerDisableReplicaTest() throws Exception {
final String RESOURCE_NAME = "0";
final String CLUSTER_NAME = "BlobStoreTest";
// setup testing environment
store.shutdown();
List<TestUtils.ZkInfo> zkInfoList = new ArrayList<>();
zkInfoList.add(new TestUtils.ZkInfo(null, "DC1", (byte) 0, 2199, false));
JSONObject zkJson = constructZkLayoutJSON(zkInfoList);
properties.setProperty("clustermap.cluster.name", CLUSTER_NAME);
properties.setProperty("clustermap.datacenter.name", "DC1");
properties.setProperty("clustermap.host.name", "localhost");
properties.setProperty("clustermap.dcs.zk.connect.strings", zkJson.toString(2));
properties.setProperty("store.io.error.count.to.trigger.shutdown", "1");
properties.setProperty("store.replica.status.delegate.enable", "true");
properties.setProperty("store.set.local.partition.state.enabled", "true");
ClusterMapConfig clusterMapConfig = new ClusterMapConfig(new VerifiableProperties(properties));
AtomicReference<InstanceConfig> instanceConfig = new AtomicReference<>(new InstanceConfig("localhost"));
instanceConfig.get().setPort("2222");
Map<String, List<String>> listMap = new HashMap<>();
listMap.put(storeId, null);
ZNRecord znRecord = new ZNRecord("localhost");
znRecord.setListFields(listMap);
IdealState idealState = new IdealState(znRecord);
idealState.setRebalanceMode(IdealState.RebalanceMode.SEMI_AUTO);
// mock helix related components
HelixAdmin mockHelixAdmin = mock(HelixAdmin.class);
when(mockHelixAdmin.getInstanceConfig(eq(CLUSTER_NAME), anyString())).then(invocation -> instanceConfig.get());
when(mockHelixAdmin.getResourcesInCluster(eq(CLUSTER_NAME))).thenReturn(Collections.singletonList(RESOURCE_NAME));
when(mockHelixAdmin.getResourceIdealState(eq(CLUSTER_NAME), eq(RESOURCE_NAME))).thenReturn(idealState);
when(mockHelixAdmin.setInstanceConfig(any(), any(), any())).then(invocation -> {
instanceConfig.set(invocation.getArgument(2));
return true;
});
HelixManager mockHelixManager = mock(HelixManager.class);
when(mockHelixManager.getClusterManagmentTool()).thenReturn(mockHelixAdmin);
HelixFactory mockHelixFactory = new HelixFactory() {
@Override
public HelixManager getZKHelixManager(String clusterName, String instanceName, InstanceType instanceType, String zkAddr) {
return mockHelixManager;
}
};
MockHelixParticipant.metricRegistry = new MetricRegistry();
MockHelixParticipant mockParticipant = new MockHelixParticipant(clusterMapConfig, mockHelixFactory);
mockParticipant.overrideDisableReplicaMethod = false;
ReplicaStatusDelegate replicaStatusDelegate = new ReplicaStatusDelegate(mockParticipant);
BlobStore testStore = createBlobStore(getMockAmbryReplica(clusterMapConfig, tempDirStr), new StoreConfig(new VerifiableProperties(properties)), Collections.singletonList(replicaStatusDelegate));
testStore.start();
assertTrue("Store should start successfully", testStore.isStarted());
// create corrupted write set
MessageInfo corruptedInfo = new MessageInfo(getUniqueId(), PUT_RECORD_SIZE, Utils.getRandomShort(TestUtils.RANDOM), Utils.getRandomShort(TestUtils.RANDOM), Utils.Infinite_Time);
MessageWriteSet corruptedWriteSet = new MockMessageWriteSet(Collections.singletonList(corruptedInfo), Collections.singletonList(ByteBuffer.allocate(PUT_RECORD_SIZE)), new StoreException(StoreException.IO_ERROR_STR, StoreErrorCodes.IOError));
// 1. mock failure case
when(mockHelixAdmin.getInstanceConfig(eq(CLUSTER_NAME), anyString())).thenReturn(null);
// trigger store exception when calling store.put()
try {
testStore.put(corruptedWriteSet);
fail("should throw exception");
} catch (StoreException e) {
assertEquals("Mismatch in error code", StoreErrorCodes.IOError, e.getErrorCode());
}
assertNull("Disabled partition list should be null as disabling replica didn't succeed", instanceConfig.get().getDisabledPartitions(RESOURCE_NAME));
// 2. mock success case
when(mockHelixAdmin.getInstanceConfig(eq(CLUSTER_NAME), anyString())).then(invocation -> instanceConfig.get());
testStore.start();
assertTrue("Store should start successfully", testStore.isStarted());
try {
testStore.put(corruptedWriteSet);
fail("should throw exception");
} catch (StoreException e) {
assertEquals("Mismatch in error code", StoreErrorCodes.IOError, e.getErrorCode());
}
assertEquals("Disabled partition name is not expected", storeId, instanceConfig.get().getDisabledPartitions(RESOURCE_NAME).get(0));
// verify "DISABLED" list in InstanceConfig has correct partition id.
assertEquals("Disabled replica list is not expected", Collections.singletonList(storeId), getDisabledReplicas(instanceConfig.get()));
// 3. mock the disk-is-replaced case; restart should succeed
testStore.start();
assertNull("Disabled partition list should be null as restart will enable same replica", instanceConfig.get().getDisabledPartitions(RESOURCE_NAME));
assertTrue("Disabled replica list should be empty", getDisabledReplicas(instanceConfig.get()).isEmpty());
testStore.shutdown();
reloadStore();
}
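The assertions above inspect the "DISABLED" list on the InstanceConfig through a getDisabledReplicas(...) helper that is not shown in this excerpt. A minimal sketch of what such a helper could look like, assuming the disabled replica names are kept in a list field named "DISABLED" on the InstanceConfig's underlying ZNRecord (an assumption based on the comment above, not the actual Ambry helper):

// Hypothetical helper (not the one used above): read the "DISABLED" list field from the
// InstanceConfig's ZNRecord and return it, or an empty list when the field is absent.
static List<String> getDisabledReplicas(InstanceConfig instanceConfig) {
  List<String> disabled = instanceConfig.getRecord().getListField("DISABLED");
  return disabled == null ? Collections.emptyList() : disabled;
}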
use of com.github.ambry.clustermap.MockHelixParticipant in project ambry by linkedin.
the class LeaderBasedReplicationTest method setUp.
public void setUp() throws IOException {
properties.setProperty("replication.model.across.datacenters", "LEADER_BASED");
replicationConfig = new ReplicationConfig(new VerifiableProperties(properties));
clusterMap = new MockClusterMap();
clusterMapConfig = new ClusterMapConfig(verifiableProperties);
MockHelixParticipant.metricRegistry = new MetricRegistry();
mockHelixParticipant = new MockHelixParticipant(clusterMapConfig);
/*
Setup:
we have 3 nodes that host replicas belonging to the same partitions:
a) localNode (local node that hosts partitions)
b) remoteNodeInLocalDC (remote node in local data center that shares the partitions)
c) remoteNodeInRemoteDC (remote node in remote data center that shares the partitions)
Each node has a few of its partitions as leaders and the others as standbys. Leadership is randomly assigned when
replicas are created for the mock partitions.
*/
DataNodeId localNode = clusterMap.getDataNodeIds().get(0);
List<DataNodeId> remoteNodes = getRemoteNodesFromLocalAndRemoteDCs(clusterMap, localNode);
remoteNodeInLocalDC = remoteNodes.get(0);
remoteNodeInRemoteDC = remoteNodes.get(1);
// create mock hosts for the local node and the two remote nodes
localHost = new MockHost(localNode, clusterMap);
remoteHostInLocalDC = new MockHost(remoteNodeInLocalDC, clusterMap);
remoteHostInRemoteDC = new MockHost(remoteNodeInRemoteDC, clusterMap);
}
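getRemoteNodesFromLocalAndRemoteDCs(...) is a test helper not shown in this excerpt. An illustrative sketch of what it could do, assuming DataNodeId exposes getDatacenterName(): pick one peer node from the local data center and one from a remote data center, matching the setup comment above (a hypothetical implementation, not Ambry's):

// Illustrative only: returns {remote node in local DC, remote node in remote DC}.
static List<DataNodeId> pickRemoteNodes(MockClusterMap clusterMap, DataNodeId localNode) {
  DataNodeId remoteInLocalDc = null;
  DataNodeId remoteInRemoteDc = null;
  for (DataNodeId node : clusterMap.getDataNodeIds()) {
    if (node.equals(localNode)) {
      continue;
    }
    if (node.getDatacenterName().equals(localNode.getDatacenterName())) {
      remoteInLocalDc = remoteInLocalDc == null ? node : remoteInLocalDc;
    } else {
      remoteInRemoteDc = remoteInRemoteDc == null ? node : remoteInRemoteDc;
    }
  }
  return Arrays.asList(remoteInLocalDc, remoteInRemoteDc);
}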
use of com.github.ambry.clustermap.MockHelixParticipant in project ambry by linkedin.
the class ReplicationTest method replicaFromStandbyToInactiveTest.
/**
* Test the STANDBY -> INACTIVE transition on an existing replica (both success and failure cases).
*/
@Test
public void replicaFromStandbyToInactiveTest() throws Exception {
MockClusterMap clusterMap = new MockClusterMap();
ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
MockHelixParticipant.metricRegistry = new MetricRegistry();
MockHelixParticipant mockHelixParticipant = new MockHelixParticipant(clusterMapConfig);
Pair<StorageManager, ReplicationManager> managers = createStorageManagerAndReplicationManager(clusterMap, clusterMapConfig, mockHelixParticipant);
StorageManager storageManager = managers.getFirst();
MockReplicationManager replicationManager = (MockReplicationManager) managers.getSecond();
// get an existing partition to test both success and failure cases
PartitionId existingPartition = replicationManager.partitionToPartitionInfo.keySet().iterator().next();
storageManager.shutdownBlobStore(existingPartition);
try {
mockHelixParticipant.onPartitionBecomeInactiveFromStandby(existingPartition.toPathString());
fail("should fail because store is not started");
} catch (StateTransitionException e) {
assertEquals("Error code doesn't match", StoreNotStarted, e.getErrorCode());
}
// restart the store and trigger Standby-To-Inactive transition again
storageManager.startBlobStore(existingPartition);
// write a blob with size = 100 into local store (end offset of last PUT = 100 + 18 = 118)
Store localStore = storageManager.getStore(existingPartition);
MockId id = new MockId(TestUtils.getRandomString(10), Utils.getRandomShort(TestUtils.RANDOM), Utils.getRandomShort(TestUtils.RANDOM));
long crc = (new Random()).nextLong();
long blobSize = 100;
MessageInfo info = new MessageInfo(id, blobSize, false, false, Utils.Infinite_Time, crc, id.getAccountId(), id.getContainerId(), Utils.Infinite_Time);
List<MessageInfo> infos = new ArrayList<>();
List<ByteBuffer> buffers = new ArrayList<>();
ByteBuffer buffer = ByteBuffer.wrap(TestUtils.getRandomBytes((int) blobSize));
infos.add(info);
buffers.add(buffer);
localStore.put(new MockMessageWriteSet(infos, buffers));
ReplicaId localReplica = storageManager.getReplica(existingPartition.toPathString());
// override partition state change listener in ReplicationManager to help thread manipulation
mockHelixParticipant.registerPartitionStateChangeListener(StateModelListenerType.ReplicationManagerListener, replicationManager.replicationListener);
CountDownLatch participantLatch = new CountDownLatch(1);
replicationManager.listenerExecutionLatch = new CountDownLatch(1);
// create a new thread and trigger STANDBY -> INACTIVE transition
Utils.newThread(() -> {
mockHelixParticipant.onPartitionBecomeInactiveFromStandby(existingPartition.toPathString());
participantLatch.countDown();
}, false).start();
assertTrue("Partition state change listener didn't get called within 1 sec", replicationManager.listenerExecutionLatch.await(1, TimeUnit.SECONDS));
assertEquals("Local store state should be INACTIVE", ReplicaState.INACTIVE, storageManager.getStore(existingPartition).getCurrentState());
List<RemoteReplicaInfo> remoteReplicaInfos = replicationManager.partitionToPartitionInfo.get(existingPartition).getRemoteReplicaInfos();
ReplicaId peerReplica1 = remoteReplicaInfos.get(0).getReplicaId();
assertFalse("Sync up should not complete because not enough replicas have caught up", mockHelixParticipant.getReplicaSyncUpManager().updateReplicaLagAndCheckSyncStatus(localReplica, peerReplica1, 10L, ReplicaState.INACTIVE));
// pick another remote replica to update the replication lag
ReplicaId peerReplica2 = remoteReplicaInfos.get(1).getReplicaId();
replicationManager.updateTotalBytesReadByRemoteReplica(existingPartition, peerReplica1.getDataNodeId().getHostname(), peerReplica1.getReplicaPath(), 118);
assertFalse("Sync up shouldn't complete because only one replica has caught up with local replica", mockHelixParticipant.getReplicaSyncUpManager().isSyncUpComplete(localReplica));
// make second peer replica catch up with last PUT in local store
replicationManager.updateTotalBytesReadByRemoteReplica(existingPartition, peerReplica2.getDataNodeId().getHostname(), peerReplica2.getReplicaPath(), 118);
assertTrue("Standby-To-Inactive transition didn't complete within 1 sec", participantLatch.await(1, TimeUnit.SECONDS));
// we purposely update the lag against the local replica to verify it is no longer tracked by ReplicaSyncUpManager:
// deactivation is complete, so the local replica should have been removed from the "replicaToLagInfos" map.
assertFalse("Sync up should complete (2 replicas have caught up), hence updated should be false", mockHelixParticipant.getReplicaSyncUpManager().updateReplicaLagAndCheckSyncStatus(localReplica, peerReplica2, 0L, ReplicaState.INACTIVE));
storageManager.shutdown();
}
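The deactivation above completes only once both peer replicas report a read offset that has reached the local store's last PUT (118). A minimal sketch of that catch-up condition, assuming sync-up is considered complete once a required number of peers have caught up to the local end offset (a simplification of ReplicaSyncUpManager's bookkeeping, not its actual code):

// Sketch of the catch-up predicate this test exercises: count peers whose acknowledged
// bytes have reached the local end offset and compare to the number of peers required.
static boolean enoughPeersCaughtUp(long localEndOffset, List<Long> peerBytesRead, int requiredPeers) {
  long caughtUp = peerBytesRead.stream().filter(bytes -> bytes >= localEndOffset).count();
  return caughtUp >= requiredPeers;
}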
use of com.github.ambry.clustermap.MockHelixParticipant in project ambry by linkedin.
the class ReplicationTest method replicaFromBootstrapToStandbyTest.
/**
* Test the BOOTSTRAP -> STANDBY transition on both existing and new replicas. For the new replica, we test both
* failure and success cases.
* @throws Exception
*/
@Test
public void replicaFromBootstrapToStandbyTest() throws Exception {
MockClusterMap clusterMap = new MockClusterMap();
ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
MockHelixParticipant.metricRegistry = new MetricRegistry();
MockHelixParticipant mockHelixParticipant = new MockHelixParticipant(clusterMapConfig);
Pair<StorageManager, ReplicationManager> managers = createStorageManagerAndReplicationManager(clusterMap, clusterMapConfig, mockHelixParticipant);
StorageManager storageManager = managers.getFirst();
MockReplicationManager replicationManager = (MockReplicationManager) managers.getSecond();
// 1. test existing partition through Bootstrap-To-Standby transition; it should be a no-op.
PartitionId existingPartition = replicationManager.partitionToPartitionInfo.keySet().iterator().next();
mockHelixParticipant.onPartitionBecomeStandbyFromBootstrap(existingPartition.toPathString());
assertEquals("Store state doesn't match", ReplicaState.STANDBY, storageManager.getStore(existingPartition).getCurrentState());
// 2. test transition failure due to store not started
storageManager.shutdownBlobStore(existingPartition);
try {
mockHelixParticipant.onPartitionBecomeStandbyFromBootstrap(existingPartition.toPathString());
fail("should fail because store is not started");
} catch (StateTransitionException e) {
assertEquals("Error code doesn't match", StoreNotStarted, e.getErrorCode());
}
// 3. create new replica and add it into storage manager, test replica that needs to initiate bootstrap
ReplicaId newReplicaToAdd = getNewReplicaToAdd(clusterMap);
assertTrue("Adding new replica to Storage Manager should succeed", storageManager.addBlobStore(newReplicaToAdd));
// override partition state change listener in ReplicationManager to help thread manipulation
mockHelixParticipant.registerPartitionStateChangeListener(StateModelListenerType.ReplicationManagerListener, replicationManager.replicationListener);
CountDownLatch participantLatch = new CountDownLatch(1);
replicationManager.listenerExecutionLatch = new CountDownLatch(1);
// create a new thread and trigger BOOTSTRAP -> STANDBY transition
Utils.newThread(() -> {
mockHelixParticipant.onPartitionBecomeStandbyFromBootstrap(newReplicaToAdd.getPartitionId().toPathString());
participantLatch.countDown();
}, false).start();
assertTrue("Partition state change listener in ReplicationManager didn't get called within 1 sec", replicationManager.listenerExecutionLatch.await(1, TimeUnit.SECONDS));
assertEquals("Replica should be in BOOTSTRAP state before bootstrap is complete", ReplicaState.BOOTSTRAP, storageManager.getStore(newReplicaToAdd.getPartitionId()).getCurrentState());
// make bootstrap succeed
mockHelixParticipant.getReplicaSyncUpManager().onBootstrapComplete(newReplicaToAdd);
assertTrue("Bootstrap-To-Standby transition didn't complete within 1 sec", participantLatch.await(1, TimeUnit.SECONDS));
storageManager.shutdown();
}
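The pattern above (run the blocking transition on a helper thread, then release it from the test thread) reduces to a plain CountDownLatch hand-off. A self-contained sketch of that coordination, with the latch standing in for the ReplicaSyncUpManager's bootstrap wait (an assumption about its internals, for illustration only):

// The helper thread blocks the way onPartitionBecomeStandbyFromBootstrap(...) does until
// bootstrap is marked complete; countDown() stands in for onBootstrapComplete(replica).
CountDownLatch bootstrapDone = new CountDownLatch(1);
Thread transition = new Thread(() -> {
  try {
    bootstrapDone.await(); // replica stays in BOOTSTRAP until this returns
  } catch (InterruptedException e) {
    Thread.currentThread().interrupt();
  }
});
transition.start();
bootstrapDone.countDown(); // unblocks the transition so the replica can move to STANDBY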
use of com.github.ambry.clustermap.MockHelixParticipant in project ambry by linkedin.
the class ReplicationTest method replicaResumeDecommissionTest.
/**
* Test that resuming decommission on a given replica behaves correctly.
* @throws Exception
*/
@Test
public void replicaResumeDecommissionTest() throws Exception {
MockClusterMap clusterMap = new MockClusterMap();
ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
MockHelixParticipant.metricRegistry = new MetricRegistry();
MockHelixParticipant mockHelixParticipant = Mockito.spy(new MockHelixParticipant(clusterMapConfig));
doNothing().when(mockHelixParticipant).setPartitionDisabledState(anyString(), anyBoolean());
// choose a replica on the local node and put a decommission file into its directory
ReplicaId localReplica = clusterMap.getReplicaIds(clusterMap.getDataNodeIds().get(0)).get(0);
String partitionName = localReplica.getPartitionId().toPathString();
File decommissionFile = new File(localReplica.getReplicaPath(), "decommission_in_progress");
assertTrue("Can't create decommission file", decommissionFile.createNewFile());
Pair<StorageManager, ReplicationManager> managers = createStorageManagerAndReplicationManager(clusterMap, clusterMapConfig, mockHelixParticipant);
StorageManager storageManager = managers.getFirst();
// failure case 1: store is not started when resuming decommission
storageManager.shutdownBlobStore(localReplica.getPartitionId());
try {
mockHelixParticipant.onPartitionBecomeDroppedFromOffline(partitionName);
fail("should fail");
} catch (StateTransitionException e) {
assertEquals("Mismatch in error code", ReplicaOperationFailure, e.getErrorCode());
}
storageManager.startBlobStore(localReplica.getPartitionId());
// failure case 2: fail to remove replica from InstanceConfig in Helix
AmbryReplicaSyncUpManager replicaSyncUpManager = (AmbryReplicaSyncUpManager) mockHelixParticipant.getReplicaSyncUpManager();
mockHelixParticipant.updateNodeInfoReturnVal = false;
CountDownLatch executionLatch = new CountDownLatch(1);
AtomicBoolean exceptionOccurred = new AtomicBoolean(false);
Utils.newThread(() -> {
try {
mockHelixParticipant.onPartitionBecomeDroppedFromOffline(partitionName);
fail("should fail because updating node info returns false");
} catch (StateTransitionException e) {
exceptionOccurred.getAndSet(true);
assertEquals("Mismatch in error code", ReplicaOperationFailure, e.getErrorCode());
} finally {
executionLatch.countDown();
}
}, false).start();
while (!replicaSyncUpManager.getPartitionToDeactivationLatch().containsKey(partitionName)) {
Thread.sleep(100);
}
replicaSyncUpManager.onDeactivationComplete(localReplica);
while (!replicaSyncUpManager.getPartitionToDisconnectionLatch().containsKey(partitionName)) {
Thread.sleep(100);
}
replicaSyncUpManager.onDisconnectionComplete(localReplica);
assertTrue("Offline-To-Dropped transition didn't complete within 1 sec", executionLatch.await(1, TimeUnit.SECONDS));
assertTrue("State transition exception should be thrown", exceptionOccurred.get());
mockHelixParticipant.updateNodeInfoReturnVal = null;
storageManager.startBlobStore(localReplica.getPartitionId());
// success case
mockHelixParticipant.mockStatsManagerListener = Mockito.mock(PartitionStateChangeListener.class);
doNothing().when(mockHelixParticipant.mockStatsManagerListener).onPartitionBecomeDroppedFromOffline(anyString());
mockHelixParticipant.registerPartitionStateChangeListener(StateModelListenerType.StatsManagerListener, mockHelixParticipant.mockStatsManagerListener);
CountDownLatch participantLatch = new CountDownLatch(1);
Utils.newThread(() -> {
mockHelixParticipant.onPartitionBecomeDroppedFromOffline(partitionName);
participantLatch.countDown();
}, false).start();
while (!replicaSyncUpManager.getPartitionToDeactivationLatch().containsKey(partitionName)) {
Thread.sleep(100);
}
replicaSyncUpManager.onDeactivationComplete(localReplica);
while (!replicaSyncUpManager.getPartitionToDisconnectionLatch().containsKey(partitionName)) {
Thread.sleep(100);
}
replicaSyncUpManager.onDisconnectionComplete(localReplica);
assertTrue("Offline-To-Dropped transition didn't complete within 1 sec", participantLatch.await(1, TimeUnit.SECONDS));
// verify stats manager listener is called
verify(mockHelixParticipant.mockStatsManagerListener).onPartitionBecomeDroppedFromOffline(anyString());
// verify setPartitionDisabledState method is called
verify(mockHelixParticipant).setPartitionDisabledState(partitionName, false);
File storeDir = new File(localReplica.getReplicaPath());
assertFalse("Store dir should not exist", storeDir.exists());
storageManager.shutdown();
}
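The decommission_in_progress marker file created at the start of this test is what makes the Offline-To-Dropped path resume a previously started decommission. A small sketch of such a check, assuming the marker file name used above is the only signal (a hypothetical helper, not Ambry's actual implementation):

// Hypothetical: a replica has an in-flight decommission to resume if the marker file
// exists in its replica directory.
static boolean shouldResumeDecommission(ReplicaId replica) {
  return new File(replica.getReplicaPath(), "decommission_in_progress").exists();
}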