use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin.
the class LeaderBasedReplicationTest method replicaThreadLeaderBasedReplicationStandByCrossColoFetchTest.
/**
* Test leader based replication to verify cross colo gets for standby replicas after they have timed out
* waiting for missing keys.
* @throws Exception
*/
@Test
public void replicaThreadLeaderBasedReplicationStandByCrossColoFetchTest() throws Exception {
Map<DataNodeId, MockHost> hosts = new HashMap<>();
hosts.put(remoteNodeInLocalDC, remoteHostInLocalDC);
hosts.put(remoteNodeInRemoteDC, remoteHostInRemoteDC);
int batchSize = 5;
int numOfMessagesOnRemoteNodeInLocalDC = 3;
int numOfMessagesOnRemoteNodeInRemoteDC = 10;
ConnectionPool mockConnectionPool = new MockConnectionPool(hosts, clusterMap, batchSize);
Pair<StorageManager, ReplicationManager> managers = createStorageManagerAndReplicationManager(clusterMap, clusterMapConfig, mockHelixParticipant, mockConnectionPool);
StorageManager storageManager = managers.getFirst();
MockReplicationManager replicationManager = (MockReplicationManager) managers.getSecond();
// set mock local stores on all remoteReplicaInfos which will be used during replication.
for (PartitionId partitionId : replicationManager.partitionToPartitionInfo.keySet()) {
localHost.addStore(partitionId, null);
Store localStore = localHost.getStore(partitionId);
localStore.start();
List<RemoteReplicaInfo> remoteReplicaInfos = replicationManager.partitionToPartitionInfo.get(partitionId).getRemoteReplicaInfos();
remoteReplicaInfos.forEach(remoteReplicaInfo -> remoteReplicaInfo.setLocalStore(localStore));
}
// get remote replicas and replica thread for remote host on local datacenter
ReplicaThread intraColoReplicaThread = replicationManager.dataNodeIdToReplicaThread.get(remoteNodeInLocalDC);
List<RemoteReplicaInfo> remoteReplicaInfosForLocalDC = intraColoReplicaThread.getRemoteReplicaInfos().get(remoteNodeInLocalDC);
// get remote replicas and replica thread for remote host on remote datacenter
ReplicaThread crossColoReplicaThread = replicationManager.dataNodeIdToReplicaThread.get(remoteNodeInRemoteDC);
List<RemoteReplicaInfo> remoteReplicaInfosForRemoteDC = crossColoReplicaThread.getRemoteReplicaInfos().get(remoteNodeInRemoteDC);
// mock helix transition state from standby to leader for local leader partitions
List<? extends ReplicaId> replicaIds = clusterMap.getReplicaIds(replicationManager.dataNodeId);
for (ReplicaId replicaId : replicaIds) {
MockReplicaId mockReplicaId = (MockReplicaId) replicaId;
if (mockReplicaId.getReplicaState() == ReplicaState.LEADER) {
MockPartitionId mockPartitionId = (MockPartitionId) replicaId.getPartitionId();
mockHelixParticipant.onPartitionBecomeLeaderFromStandby(mockPartitionId.toPathString());
}
}
// Add put messages to all partitions on remoteHostInLocalDC and remoteHostInRemoteDC
List<PartitionId> partitionIds = clusterMap.getWritablePartitionIds(null);
for (PartitionId partitionId : partitionIds) {
// add 3 put messages to the remoteNodeInLocalDC and remoteNodeInRemoteDC from which local host will replicate.
addPutMessagesToReplicasOfPartition(partitionId, Arrays.asList(remoteHostInLocalDC, remoteHostInRemoteDC), numOfMessagesOnRemoteNodeInLocalDC);
// add the remaining put messages to remoteNodeInRemoteDC only. Since these messages are not present in
// remoteNodeInLocalDC, they don't reach the local node via intra-dc replication. We should see remote standby
// replicas time out waiting for these messages and a cross colo fetch happening.
addPutMessagesToReplicasOfPartition(partitionId, Collections.singletonList(remoteHostInRemoteDC), numOfMessagesOnRemoteNodeInRemoteDC - numOfMessagesOnRemoteNodeInLocalDC);
}
// Choose replicas whose partitions are leaders on both the local and remote nodes
Set<ReplicaId> leaderReplicasOnLocalAndRemoteNodes = getRemoteLeaderReplicasWithLeaderPartitionsOnLocalNode(clusterMap, replicationManager.dataNodeId, remoteNodeInRemoteDC);
// replicate with remote node in remote DC
crossColoReplicaThread.replicate();
// verify remote tokens: they advance for leader replicas but stay at 0 for standby replicas since their missing messages are not fetched yet.
for (RemoteReplicaInfo remoteReplicaInfo : remoteReplicaInfosForRemoteDC) {
if (leaderReplicasOnLocalAndRemoteNodes.contains(remoteReplicaInfo.getReplicaId())) {
assertEquals("remote token mismatch for leader replicas", ((MockFindToken) remoteReplicaInfo.getToken()).getIndex(), batchSize - 1);
} else {
assertEquals("remote token should not move forward for standby replicas until missing keys are fetched", ((MockFindToken) remoteReplicaInfo.getToken()).getIndex(), 0);
}
}
// Replicate with remote node in local dc
intraColoReplicaThread.replicate();
// verify that remote token will be moved for all replicas as it is intra-dc replication
for (RemoteReplicaInfo remoteReplicaInfo : remoteReplicaInfosForLocalDC) {
assertEquals("mismatch in remote token set for intra colo replicas", ((MockFindToken) remoteReplicaInfo.getToken()).getIndex(), numOfMessagesOnRemoteNodeInLocalDC - 1);
}
// process missing keys for cross colo standby replicas from the previous metadata exchange; the keys they share with the local DC have now arrived via intra-dc replication
for (RemoteReplicaInfo remoteReplicaInfo : remoteReplicaInfosForRemoteDC) {
crossColoReplicaThread.processMissingKeysFromPreviousMetadataResponse(remoteReplicaInfo);
}
// verify that the remote token remains 0 for standby replicas as some messages in their missing set are not fetched yet.
for (RemoteReplicaInfo remoteReplicaInfo : remoteReplicaInfosForRemoteDC) {
if (!leaderReplicasOnLocalAndRemoteNodes.contains(remoteReplicaInfo.getReplicaId())) {
assertTrue("missing store messages should still exist for standby replicas", crossColoReplicaThread.containsMissingKeysFromPreviousMetadataExchange(remoteReplicaInfo));
assertEquals("remote token should not move forward for standby replicas until missing keys are fetched", ((MockFindToken) remoteReplicaInfo.getToken()).getIndex(), 0);
assertEquals("incorrect number of missing store messages found for standby replicas", remoteReplicaInfo.getExchangeMetadataResponse().missingStoreMessages.size(), batchSize - numOfMessagesOnRemoteNodeInLocalDC);
}
}
// Attempt replication with remoteNodeInRemoteDC, we should not see any replication attempt for standby replicas
// and their remote token stays as 0.
crossColoReplicaThread.replicate();
for (RemoteReplicaInfo remoteReplicaInfo : remoteReplicaInfosForRemoteDC) {
if (!leaderReplicasOnLocalAndRemoteNodes.contains(remoteReplicaInfo.getReplicaId())) {
assertEquals("remote token should not move forward for standby replicas until missing keys are fetched", ((MockFindToken) remoteReplicaInfo.getToken()).getIndex(), 0);
assertTrue("missing store messages should still exist for standby replicas", crossColoReplicaThread.containsMissingKeysFromPreviousMetadataExchange(remoteReplicaInfo));
}
}
// Move time forward by replicationStandbyWaitTimeoutToTriggerCrossColoFetchSeconds+1 seconds and attempt replication.
// We should see cross colo fetch for standby replicas now since missing keys haven't arrived for
// replicationConfig.replicationStandbyWaitTimeoutToTriggerCrossColoFetchSeconds.
time.sleep((replicationConfig.replicationStandbyWaitTimeoutToTriggerCrossColoFetchSeconds + 1) * 1000);
// verify that we get the list of standby replicas that timed out on no progress
Set<RemoteReplicaInfo> allStandbyReplicas = remoteReplicaInfosForRemoteDC.stream().filter(info -> !leaderReplicasOnLocalAndRemoteNodes.contains(info.getReplicaId())).collect(Collectors.toSet());
assertEquals("mismatch in list of standby replicas timed out on no progress", new HashSet<>(crossColoReplicaThread.getRemoteStandbyReplicasTimedOutOnNoProgress(remoteReplicaInfosForRemoteDC)), allStandbyReplicas);
crossColoReplicaThread.replicate();
// token index for all standby replicas will move forward after fetching missing keys themselves
for (RemoteReplicaInfo remoteReplicaInfo : remoteReplicaInfosForRemoteDC) {
if (!leaderReplicasOnLocalAndRemoteNodes.contains(remoteReplicaInfo.getReplicaId())) {
assertEquals("mismatch in remote token set for standby cross colo replicas", ((MockFindToken) remoteReplicaInfo.getToken()).getIndex(), batchSize - 1);
assertFalse("missing store messages should be empty for standby replicas now", crossColoReplicaThread.containsMissingKeysFromPreviousMetadataExchange(remoteReplicaInfo));
}
}
// verify replication metrics to track number of cross colo get requests for standby replicas. If all replicas are
// leaders, we should have 0 cross colo get requests.
String remoteDataCenter = remoteReplicaInfosForRemoteDC.get(0).getReplicaId().getDataNodeId().getDatacenterName();
assertEquals("mismatch in number of cross colo get requests tracked for standby replicas", crossColoReplicaThread.getReplicationMetrics().interColoReplicationGetRequestCountForStandbyReplicas.get(remoteDataCenter).getCount(), leaderReplicasOnLocalAndRemoteNodes.size() != remoteReplicaInfosForRemoteDC.size() ? 1 : 0);
storageManager.shutdown();
}
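All of the tests on this page obtain a ClusterMapConfig the same way: populate a java.util.Properties with "clustermap.*" keys and wrap it in a VerifiableProperties. The sketch below shows that wiring in isolation; the class name and the concrete values are placeholders, not taken from the test above.

import com.github.ambry.config.ClusterMapConfig;
import com.github.ambry.config.VerifiableProperties;
import java.util.Properties;

public class ClusterMapConfigWiringSketch {
  // Builds a ClusterMapConfig the same way the tests on this page do: plain Properties
  // wrapped in a VerifiableProperties. The values below are placeholders.
  public static ClusterMapConfig buildClusterMapConfig() {
    Properties props = new Properties();
    props.setProperty("clustermap.cluster.name", "test");
    props.setProperty("clustermap.datacenter.name", "DC1");
    props.setProperty("clustermap.host.name", "localhost");
    props.setProperty("clustermap.port", "6667");
    return new ClusterMapConfig(new VerifiableProperties(props));
  }
}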
use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin.
the class ServerTestUtil method endToEndReplicationWithMultiNodeMultiPartitionMultiDCTest.
static void endToEndReplicationWithMultiNodeMultiPartitionMultiDCTest(String sourceDatacenter, String sslEnabledDatacenters, PortType portType, MockCluster cluster, MockNotificationSystem notificationSystem, Properties routerProps) throws Exception {
Properties props = new Properties();
props.setProperty("router.hostname", "localhost");
props.setProperty("router.datacenter.name", sourceDatacenter);
props.setProperty("router.put.request.parallelism", "1");
props.setProperty("router.put.success.target", "1");
props.setProperty("clustermap.cluster.name", "test");
props.setProperty("clustermap.datacenter.name", sourceDatacenter);
props.setProperty("clustermap.host.name", "localhost");
props.setProperty("kms.default.container.key", TestUtils.getRandomKey(32));
props.putAll(routerProps);
VerifiableProperties verifiableProperties = new VerifiableProperties(props);
AccountService accountService = new InMemAccountService(false, true);
Router router = new NonBlockingRouterFactory(verifiableProperties, cluster.getClusterMap(), notificationSystem, getSSLFactoryIfRequired(verifiableProperties), accountService).getRouter();
int numberOfRequestsToSend = 15;
int numberOfVerifierThreads = 3;
final LinkedBlockingQueue<Payload> payloadQueue = new LinkedBlockingQueue<Payload>();
final AtomicReference<Exception> exceptionRef = new AtomicReference<>(null);
final CountDownLatch callbackLatch = new CountDownLatch(numberOfRequestsToSend);
List<Future<String>> putFutures = new ArrayList<>(numberOfRequestsToSend);
short accountId = Utils.getRandomShort(TestUtils.RANDOM);
short containerId = Utils.getRandomShort(TestUtils.RANDOM);
for (int i = 0; i < numberOfRequestsToSend; i++) {
int size = new Random().nextInt(5000);
final BlobProperties properties = new BlobProperties(size, "service1", "owner id check", "image/jpeg", false, TestUtils.TTL_SECS, cluster.time.milliseconds(), accountId, containerId, false, null, null, null);
final byte[] metadata = new byte[new Random().nextInt(1000)];
final byte[] blob = new byte[size];
TestUtils.RANDOM.nextBytes(metadata);
TestUtils.RANDOM.nextBytes(blob);
Future<String> future = router.putBlob(properties, metadata, new ByteBufferReadableStreamChannel(ByteBuffer.wrap(blob)), new PutBlobOptionsBuilder().build(), new Callback<String>() {
@Override
public void onCompletion(String result, Exception exception) {
if (exception == null) {
payloadQueue.add(new Payload(properties, metadata, blob, result));
} else {
exceptionRef.set(exception);
}
callbackLatch.countDown();
}
}, QUOTA_CHARGE_EVENT_LISTENER);
putFutures.add(future);
}
for (Future<String> future : putFutures) {
future.get(20, TimeUnit.SECONDS);
}
assertTrue("Did not receive all callbacks in time", callbackLatch.await(1, TimeUnit.SECONDS));
if (exceptionRef.get() != null) {
throw exceptionRef.get();
}
// keep one payload so its blob ID and partition can be used for the direct server requests below
Payload payload1 = payloadQueue.peek();
MockClusterMap clusterMap = cluster.getClusterMap();
BlobId blobId1 = new BlobId(payload1.blobId, clusterMap);
assertEquals("Did not put expected number of blobs", numberOfRequestsToSend, payloadQueue.size());
Properties sslProps = new Properties();
sslProps.putAll(routerProps);
sslProps.setProperty("clustermap.ssl.enabled.datacenters", sslEnabledDatacenters);
sslProps.setProperty("clustermap.cluster.name", "test");
sslProps.setProperty("clustermap.datacenter.name", sourceDatacenter);
sslProps.setProperty("clustermap.host.name", "localhost");
sslProps.setProperty("connectionpool.read.timeout.ms", "15000");
VerifiableProperties vProps = new VerifiableProperties(sslProps);
ConnectionPool connectionPool = new BlockingChannelConnectionPool(new ConnectionPoolConfig(vProps), new SSLConfig(vProps), new ClusterMapConfig(vProps), new MetricRegistry());
CountDownLatch verifierLatch = new CountDownLatch(numberOfVerifierThreads);
AtomicInteger totalRequests = new AtomicInteger(numberOfRequestsToSend);
AtomicInteger verifiedRequests = new AtomicInteger(0);
AtomicBoolean cancelTest = new AtomicBoolean(false);
for (int i = 0; i < numberOfVerifierThreads; i++) {
Thread thread = new Thread(new Verifier(payloadQueue, verifierLatch, totalRequests, verifiedRequests, cluster.getClusterMap(), cancelTest, portType, connectionPool, notificationSystem, cluster.time));
thread.start();
}
assertTrue("Did not verify in 2 minutes", verifierLatch.await(2, TimeUnit.MINUTES));
assertEquals(totalRequests.get(), verifiedRequests.get());
BlobIdFactory blobIdFactory = new BlobIdFactory(clusterMap);
MockDataNodeId dataNodeId = clusterMap.getDataNodes().get(0);
Port port = new Port(portType == PortType.PLAINTEXT ? dataNodeId.getPort() : dataNodeId.getSSLPort(), portType);
ConnectedChannel channel = connectionPool.checkOutConnection("localhost", port, 10000);
PartitionId partitionId = blobId1.getPartition();
// stop the store via AdminRequest
System.out.println("Begin to stop a BlobStore");
AdminRequest adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, partitionId, 1, "clientid2");
BlobStoreControlAdminRequest controlRequest = new BlobStoreControlAdminRequest((short) 0, BlobStoreControlAction.StopStore, adminRequest);
DataInputStream stream = channel.sendAndReceive(controlRequest).getInputStream();
AdminResponse adminResponse = AdminResponse.readFrom(stream);
releaseNettyBufUnderneathStream(stream);
assertEquals("Stop store admin request should succeed", ServerErrorCode.No_Error, adminResponse.getError());
// put a blob on a stopped store, which should fail
byte[] userMetadata = new byte[1000];
byte[] data = new byte[3187];
BlobProperties properties = new BlobProperties(3187, "serviceid1", accountId, containerId, false, cluster.time.milliseconds());
BlobId blobId2 = new BlobId(CommonTestUtils.getCurrentBlobIdVersion(), BlobId.BlobIdType.NATIVE, clusterMap.getLocalDatacenterId(), accountId, containerId, partitionId, false, BlobId.BlobDataType.DATACHUNK);
PutRequest putRequest2 = new PutRequest(1, "clientId2", blobId2, properties, ByteBuffer.wrap(userMetadata), Unpooled.wrappedBuffer(data), properties.getBlobSize(), BlobType.DataBlob, null);
DataInputStream putResponseStream = channel.sendAndReceive(putRequest2).getInputStream();
PutResponse response2 = PutResponse.readFrom(putResponseStream);
releaseNettyBufUnderneathStream(putResponseStream);
assertEquals("Put blob on stopped store should fail", ServerErrorCode.Replica_Unavailable, response2.getError());
// get a blob properties on a stopped store, which should fail
ArrayList<BlobId> ids = new ArrayList<>();
ids.add(blobId1);
ArrayList<PartitionRequestInfo> partitionRequestInfoList = new ArrayList<>();
PartitionRequestInfo partitionRequestInfo = new PartitionRequestInfo(partitionId, ids);
partitionRequestInfoList.add(partitionRequestInfo);
GetRequest getRequest1 = new GetRequest(1, "clientId1", MessageFormatFlags.BlobProperties, partitionRequestInfoList, GetOption.None);
stream = channel.sendAndReceive(getRequest1).getInputStream();
GetResponse resp1 = GetResponse.readFrom(stream, clusterMap);
assertEquals("Get blob properties on stopped store should fail", ServerErrorCode.Replica_Unavailable, resp1.getPartitionResponseInfoList().get(0).getErrorCode());
releaseNettyBufUnderneathStream(stream);
// delete a blob on a stopped store, which should fail
DeleteRequest deleteRequest = new DeleteRequest(1, "clientId1", blobId1, System.currentTimeMillis());
stream = channel.sendAndReceive(deleteRequest).getInputStream();
DeleteResponse deleteResponse = DeleteResponse.readFrom(stream);
releaseNettyBufUnderneathStream(stream);
assertEquals("Delete blob on stopped store should fail", ServerErrorCode.Replica_Unavailable, deleteResponse.getError());
// start the store via AdminRequest
System.out.println("Begin to restart the BlobStore");
adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, partitionId, 1, "clientId");
controlRequest = new BlobStoreControlAdminRequest((short) 0, BlobStoreControlAction.StartStore, adminRequest);
stream = channel.sendAndReceive(controlRequest).getInputStream();
adminResponse = AdminResponse.readFrom(stream);
releaseNettyBufUnderneathStream(stream);
assertEquals("Start store admin request should succeed", ServerErrorCode.No_Error, adminResponse.getError());
List<? extends ReplicaId> replicaIds = partitionId.getReplicaIds();
for (ReplicaId replicaId : replicaIds) {
// forcibly mark replicas and disks as up.
MockReplicaId mockReplicaId = (MockReplicaId) replicaId;
mockReplicaId.markReplicaDownStatus(false);
((MockDiskId) mockReplicaId.getDiskId()).setDiskState(HardwareState.AVAILABLE, false);
}
// put a blob on a restarted store, which should succeed
putRequest2 = new PutRequest(1, "clientId2", blobId2, properties, ByteBuffer.wrap(userMetadata), Unpooled.wrappedBuffer(data), properties.getBlobSize(), BlobType.DataBlob, null);
putResponseStream = channel.sendAndReceive(putRequest2).getInputStream();
response2 = PutResponse.readFrom(putResponseStream);
releaseNettyBufUnderneathStream(putResponseStream);
assertEquals("Put blob on restarted store should succeed", ServerErrorCode.No_Error, response2.getError());
// verify the put blob has been replicated successfully.
notificationSystem.awaitBlobCreations(blobId2.getID());
// get a blob on a restarted store, which should succeed
ids = new ArrayList<BlobId>();
ids.add(blobId2);
partitionRequestInfoList = new ArrayList<PartitionRequestInfo>();
partitionRequestInfo = new PartitionRequestInfo(partitionId, ids);
partitionRequestInfoList.add(partitionRequestInfo);
GetRequest getRequest2 = new GetRequest(1, "clientId2", MessageFormatFlags.All, partitionRequestInfoList, GetOption.None);
stream = channel.sendAndReceive(getRequest2).getInputStream();
GetResponse resp2 = GetResponse.readFrom(stream, clusterMap);
InputStream responseStream = resp2.getInputStream();
BlobAll blobAll = MessageFormatRecord.deserializeBlobAll(responseStream, blobIdFactory);
byte[] actualBlobData = getBlobDataAndRelease(blobAll.getBlobData());
assertArrayEquals("Content mismatch.", data, actualBlobData);
releaseNettyBufUnderneathStream(stream);
// delete a blob on a restarted store, which should succeed
deleteRequest = new DeleteRequest(1, "clientId2", blobId2, System.currentTimeMillis());
stream = channel.sendAndReceive(deleteRequest).getInputStream();
deleteResponse = DeleteResponse.readFrom(stream);
releaseNettyBufUnderneathStream(stream);
assertEquals("Delete blob on restarted store should succeed", ServerErrorCode.No_Error, deleteResponse.getError());
router.close();
connectionPool.shutdown();
}
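The test above also feeds ClusterMapConfig, together with ConnectionPoolConfig and SSLConfig, into a BlockingChannelConnectionPool that the verifier threads and the direct server requests use. Below is a condensed, hedged sketch of that wiring; the start() call and the helper class name are assumptions, since the excerpt above does not show whether the pool is started explicitly.

import com.codahale.metrics.MetricRegistry;
import com.github.ambry.config.ClusterMapConfig;
import com.github.ambry.config.ConnectionPoolConfig;
import com.github.ambry.config.SSLConfig;
import com.github.ambry.config.VerifiableProperties;
import com.github.ambry.network.BlockingChannelConnectionPool;
import com.github.ambry.network.ConnectedChannel;
import com.github.ambry.network.ConnectionPool;
import com.github.ambry.network.Port;
import com.github.ambry.network.PortType;
import java.util.Properties;

public class ConnectionPoolWiringSketch {
  // Checks out a plaintext connection to a server, mirroring the pool construction and the
  // checkOutConnection call in the test above. start() is assumed to be required here.
  public static ConnectedChannel connect(Properties props, String host, int port) throws Exception {
    VerifiableProperties vProps = new VerifiableProperties(props);
    ConnectionPool pool = new BlockingChannelConnectionPool(new ConnectionPoolConfig(vProps),
        new SSLConfig(vProps), new ClusterMapConfig(vProps), new MetricRegistry());
    pool.start();
    // 10 second checkout timeout, as in the test above.
    return pool.checkOutConnection(host, new Port(port, PortType.PLAINTEXT), 10000);
  }
}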
use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin.
the class StatsManagerTest method testReplicaFromOfflineToDropped.
/**
* Test Offline-To-Dropped transition (both failure and success cases)
* @throws Exception
*/
@Test
public void testReplicaFromOfflineToDropped() throws Exception {
ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
ReplicationConfig replicationConfig = new ReplicationConfig(verifiableProperties);
StoreConfig storeConfig = new StoreConfig(verifiableProperties);
MockClusterMap clusterMap = new MockClusterMap();
DataNodeId currentNode = clusterMap.getDataNodeIds().get(0);
List<ReplicaId> localReplicas = clusterMap.getReplicaIds(currentNode);
StorageManager storageManager = new StorageManager(storeConfig, new DiskManagerConfig(verifiableProperties), Utils.newScheduler(1, true), new MetricRegistry(), null, clusterMap, currentNode, null, Collections.singletonList(clusterParticipant), new MockTime(), null, new InMemAccountService(false, false));
storageManager.start();
MockStoreKeyConverterFactory storeKeyConverterFactory = new MockStoreKeyConverterFactory(null, null);
storeKeyConverterFactory.setConversionMap(new HashMap<>());
MockReplicationManager mockReplicationManager = new MockReplicationManager(replicationConfig, clusterMapConfig, storeConfig, storageManager, clusterMap, currentNode, storeKeyConverterFactory, clusterParticipant);
MockStatsManager mockStatsManager = new MockStatsManager(storageManager, localReplicas, new MetricRegistry(), statsManagerConfig, clusterParticipant);
// 1. attempt to remove replica while store is still running (remove store failure case)
ReplicaId replicaToDrop = localReplicas.get(0);
try {
clusterParticipant.onPartitionBecomeDroppedFromOffline(replicaToDrop.getPartitionId().toPathString());
fail("should fail because store is still running");
} catch (StateTransitionException e) {
assertEquals("Error code doesn't match", ReplicaOperationFailure, e.getErrorCode());
}
// 2. shut down the store but introduce a file deletion failure (put an invalid dir in the store dir)
storageManager.shutdownBlobStore(replicaToDrop.getPartitionId());
File invalidDir = new File(replicaToDrop.getReplicaPath(), "invalidDir");
invalidDir.deleteOnExit();
assertTrue("Couldn't create dir within store dir", invalidDir.mkdir());
assertTrue("Could not make unreadable", invalidDir.setReadable(false));
try {
clusterParticipant.onPartitionBecomeDroppedFromOffline(replicaToDrop.getPartitionId().toPathString());
fail("should fail because store deletion fails");
} catch (StateTransitionException e) {
assertEquals("Error code doesn't match", ReplicaOperationFailure, e.getErrorCode());
}
// reset permission to allow deletion to succeed.
assertTrue("Could not make readable", invalidDir.setReadable(true));
assertTrue("Could not delete invalid dir", invalidDir.delete());
// 3. success case (remove another replica because previous replica has been removed from in-mem data structures)
ReplicaId replica = localReplicas.get(1);
storageManager.shutdownBlobStore(replica.getPartitionId());
MockHelixParticipant mockHelixParticipant = Mockito.spy(clusterParticipant);
doNothing().when(mockHelixParticipant).setPartitionDisabledState(anyString(), anyBoolean());
mockHelixParticipant.onPartitionBecomeDroppedFromOffline(replica.getPartitionId().toPathString());
// verify that the replica is no longer present in StorageManager
assertNull("Store of removed replica should not exist", storageManager.getStore(replica.getPartitionId(), true));
// purposely remove the same replica in ReplicationManager again to verify it no longer exists
assertFalse("Should return false because replica no longer exists", mockReplicationManager.removeReplica(replica));
// purposely remove the same replica in StatsManager again to verify it no longer exists
assertFalse("Should return false because replica no longer exists", mockStatsManager.removeReplica(replica));
verify(mockHelixParticipant).setPartitionDisabledState(replica.getPartitionId().toPathString(), false);
storageManager.shutdown();
mockStatsManager.shutdown();
}
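Step 3 of the test above relies on a Mockito spy so that the Offline-To-Dropped transition runs without touching real Helix state while the interaction is still recorded. The minimal sketch below shows that stub-then-verify pattern using the participant methods from the test; the import paths and the visibility of setPartitionDisabledState are assumptions.

import static org.mockito.ArgumentMatchers.anyBoolean;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.doNothing;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.verify;

// MockHelixParticipant is the test helper used above; its package is assumed here.
import com.github.ambry.clustermap.MockHelixParticipant;

public class SpyTransitionSketch {
  // Stub the Helix-facing call, run the transition, then verify it was invoked with the
  // expected arguments (disabled == false re-enables the partition for the node).
  static void runDroppedTransition(MockHelixParticipant realParticipant, String partitionName) {
    MockHelixParticipant participant = spy(realParticipant);
    doNothing().when(participant).setPartitionDisabledState(anyString(), anyBoolean());
    participant.onPartitionBecomeDroppedFromOffline(partitionName);
    verify(participant).setPartitionDisabledState(partitionName, false);
  }
}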
use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin.
the class HelixBootstrapUpgradeToolTest method testDisablePartitionAdminOp.
/**
* Test that when AdminOperation is set to DisablePartition, the Helix bootstrap tool is able to disable a certain
* partition only, without changing IdealState and InstanceConfig. (In practice, this is the first step to decommission a replica.)
* @throws Exception
*/
@Test
public void testDisablePartitionAdminOp() throws Exception {
String clusterName = CLUSTER_NAME_PREFIX + CLUSTER_NAME_IN_STATIC_CLUSTER_MAP;
// Test regular bootstrap. This is to ensure DataNodeConfig and IdealState are in place before testing disabling of
// a certain replica on a specific node.
long expectedResourceCount = (testPartitionLayout.getPartitionLayout().getPartitionCount() - 1) / DEFAULT_MAX_PARTITIONS_PER_RESOURCE + 1;
writeBootstrapOrUpgrade(expectedResourceCount, false);
int totalPartitionCount = testPartitionLayout.getPartitionCount();
// Randomly pick a partition to remove one of its replicas
Partition testPartition = (Partition) testPartitionLayout.getPartitionLayout().getPartitions(null).get(RANDOM.nextInt(totalPartitionCount));
ReplicaId removedReplica = testPartition.getReplicaIds().stream().filter(r -> r.getDataNodeId().getDatacenterName().equals("DC1")).findFirst().get();
testPartition.getReplicas().remove(removedReplica);
ZkInfo zkInfo = dcsToZkInfo.get(removedReplica.getDataNodeId().getDatacenterName());
// create a participant that hosts this removed replica
Properties props = new Properties();
props.setProperty("clustermap.host.name", "localhost");
props.setProperty("clustermap.port", String.valueOf(removedReplica.getDataNodeId().getPort()));
props.setProperty("clustermap.cluster.name", clusterName);
props.setProperty("clustermap.datacenter.name", "DC1");
props.setProperty("clustermap.update.datanode.info", Boolean.toString(true));
props.setProperty("clustermap.dcs.zk.connect.strings", zkJson.toString(2));
props.setProperty("clustermap.retry.disable.partition.completion.backoff.ms", Integer.toString(100));
props.setProperty("clustermap.data.node.config.source.type", dataNodeConfigSourceType.name());
ClusterMapConfig clusterMapConfig = new ClusterMapConfig(new VerifiableProperties(props));
HelixParticipant helixParticipant = new HelixParticipant(clusterMapConfig, new HelixFactory(), new MetricRegistry(), "localhost:" + zkInfo.getPort(), true);
PropertyStoreToDataNodeConfigAdapter propertyStoreAdapter = dataNodeConfigSourceType == DataNodeConfigSourceType.INSTANCE_CONFIG ? null : new PropertyStoreToDataNodeConfigAdapter("localhost:" + zkInfo.getPort(), clusterMapConfig);
InstanceConfigToDataNodeConfigAdapter.Converter instanceConfigConverter = new InstanceConfigToDataNodeConfigAdapter.Converter(clusterMapConfig);
// create HelixAdmin
ZKHelixAdmin admin = new ZKHelixAdmin("localhost:" + zkInfo.getPort());
// Write changes to static files
Utils.writeJsonObjectToFile(zkJson, zkLayoutPath);
Utils.writeJsonObjectToFile(testHardwareLayout.getHardwareLayout().toJSONObject(), hardwareLayoutPath);
Utils.writeJsonObjectToFile(testPartitionLayout.getPartitionLayout().toJSONObject(), partitionLayoutPath);
// make the bootstrap tool block on a countdown latch before removing znodes for disabling partitions
blockRemovingNodeLatch = new CountDownLatch(1);
disablePartitionLatch = new CountDownLatch(activeDcSet.size());
CountDownLatch bootstrapCompletionLatch = new CountDownLatch(1);
Utils.newThread(() -> {
try {
// Upgrade Helix by updating IdealState: AdminOperation = DisablePartition
HelixBootstrapUpgradeUtil.bootstrapOrUpgrade(hardwareLayoutPath, partitionLayoutPath, zkLayoutPath, CLUSTER_NAME_PREFIX, dcStr, DEFAULT_MAX_PARTITIONS_PER_RESOURCE, false, false, new HelixAdminFactory(), false, ClusterMapConfig.DEFAULT_STATE_MODEL_DEF, DisablePartition, dataNodeConfigSourceType, false);
bootstrapCompletionLatch.countDown();
} catch (Exception e) {
// do nothing; if there is any exception, the subsequent assertions should fail.
}
}, false).start();
assertTrue("Disable partition latch didn't come down within 5 seconds", disablePartitionLatch.await(5, TimeUnit.SECONDS));
// Let's attempt to update InstanceConfig/DataNodeConfig via HelixParticipant, which should be blocked
CountDownLatch updateCompletionLatch = new CountDownLatch(1);
Utils.newThread(() -> {
helixParticipant.updateDataNodeInfoInCluster(removedReplica, false);
updateCompletionLatch.countDown();
}, false).start();
// sleep 100 ms to ensure updateDataNodeInfoInCluster is blocked because disabling the partition hasn't completed yet
Thread.sleep(100);
// Ensure the DataNodeConfig still has the replica
String instanceName = getInstanceName(removedReplica.getDataNodeId());
DataNodeConfig currentDataNodeConfig = dataNodeConfigSourceType == DataNodeConfigSourceType.INSTANCE_CONFIG ? instanceConfigConverter.convert(admin.getInstanceConfig(clusterName, instanceName)) : propertyStoreAdapter.get(instanceName);
verifyReplicaInfoInDataNodeConfig(currentDataNodeConfig, removedReplica, true);
// verify the znode is created for the node on which the partition has been disabled.
Properties properties = new Properties();
properties.setProperty("helix.property.store.root.path", "/" + clusterName + "/" + PROPERTYSTORE_STR);
HelixPropertyStoreConfig propertyStoreConfig = new HelixPropertyStoreConfig(new VerifiableProperties(properties));
HelixPropertyStore<ZNRecord> helixPropertyStore = CommonUtils.createHelixPropertyStore("localhost:" + zkInfo.getPort(), propertyStoreConfig, null);
String path = PARTITION_DISABLED_ZNODE_PATH + getInstanceName(removedReplica.getDataNodeId());
assertTrue("ZNode is not found for disabled partition node.", helixPropertyStore.exists(path, AccessOption.PERSISTENT));
helixPropertyStore.stop();
// unblock HelixBootstrapTool
blockRemovingNodeLatch.countDown();
// waiting for bootstrap tool to complete
assertTrue("Helix tool didn't complete within 5 seconds", bootstrapCompletionLatch.await(5, TimeUnit.SECONDS));
verifyResourceCount(testHardwareLayout.getHardwareLayout(), expectedResourceCount);
assertTrue("Helix participant didn't complete update within 5 seconds", updateCompletionLatch.await(5, TimeUnit.SECONDS));
InstanceConfig currentInstanceConfig = admin.getInstanceConfig(clusterName, getInstanceName(removedReplica.getDataNodeId()));
// Verify that replica has been disabled
String resourceName = null;
for (String rs : admin.getResourcesInCluster(clusterName)) {
IdealState is = admin.getResourceIdealState(clusterName, rs);
if (is.getPartitionSet().contains(removedReplica.getPartitionId().toPathString())) {
resourceName = rs;
break;
}
}
List<String> disabledPartition = currentInstanceConfig.getDisabledPartitions(resourceName);
assertEquals("Disabled partition not as expected", Collections.singletonList(removedReplica.getPartitionId().toPathString()), disabledPartition);
// Verify that IdealState has no change
verifyIdealStateForPartition(removedReplica, true, 3, expectedResourceCount);
// Verify the InstanceConfig has changed in its map fields (disabled partitions are added to this field; the replica entry has also been removed)
String disabledPartitionStr = currentInstanceConfig.getRecord().getMapFields().keySet().stream().filter(k -> !k.startsWith("/mnt")).findFirst().get();
// Verify the disabled partition string contains correct partition
Map<String, String> expectedDisabledPartitionMap = new HashMap<>();
expectedDisabledPartitionMap.put(resourceName, removedReplica.getPartitionId().toPathString());
assertEquals("Mismatch in disabled partition string in InstanceConfig", expectedDisabledPartitionMap, currentInstanceConfig.getRecord().getMapField(disabledPartitionStr));
// verify the removed replica is no longer in the DataNodeConfig
currentDataNodeConfig = dataNodeConfigSourceType == DataNodeConfigSourceType.INSTANCE_CONFIG ? instanceConfigConverter.convert(admin.getInstanceConfig(clusterName, instanceName)) : propertyStoreAdapter.get(instanceName);
verifyReplicaInfoInDataNodeConfig(currentDataNodeConfig, removedReplica, false);
if (propertyStoreAdapter != null) {
propertyStoreAdapter.close();
}
}
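The test above coordinates the bootstrap thread and the main test thread through three CountDownLatches (disablePartitionLatch, blockRemovingNodeLatch, bootstrapCompletionLatch). The generic sketch below shows that handshake in isolation; the names and timeouts are illustrative only, not taken from the tool.

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

public class LatchHandshakeSketch {
  // The worker announces it reached a checkpoint, parks until released, then signals completion,
  // mirroring disablePartitionLatch / blockRemovingNodeLatch / bootstrapCompletionLatch above.
  static void run(Runnable criticalSection) throws InterruptedException {
    CountDownLatch reachedCheckpoint = new CountDownLatch(1);
    CountDownLatch releaseWorker = new CountDownLatch(1);
    CountDownLatch workerDone = new CountDownLatch(1);
    Thread worker = new Thread(() -> {
      reachedCheckpoint.countDown();
      try {
        releaseWorker.await();
        criticalSection.run();
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
      }
      workerDone.countDown();
    });
    worker.start();
    // While the worker is parked at the checkpoint, the test can assert on intermediate state.
    reachedCheckpoint.await(5, TimeUnit.SECONDS);
    releaseWorker.countDown();
    workerDone.await(5, TimeUnit.SECONDS);
  }
}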
use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin.
the class HelixBootstrapUpgradeToolTest method testCreateInstanceConfig.
/**
* Tests the method to create instance config and ensures that derived fields are set correctly.
*/
@Test
public void testCreateInstanceConfig() {
DataNode dataNode = (DataNode) ((Partition) testPartitionLayout.getPartitionLayout().getPartitions(DEFAULT_PARTITION_CLASS).get(0)).getReplicas().get(0).getDataNodeId();
Map<String, Set<String>> partitionToInstances = new HashMap<>();
JSONObject jsonObject = dataNode.toJSONObject();
jsonObject.put("xid", ClusterMapUtils.DEFAULT_XID);
ClusterMapConfig clusterMapConfig = testHardwareLayout.clusterMapConfig;
dataNode = new DataNode(dataNode.getDatacenter(), jsonObject, clusterMapConfig);
InstanceConfig referenceInstanceConfig = HelixBootstrapUpgradeUtil.createInstanceConfigFromStaticInfo(dataNode, partitionToInstances, new ConcurrentHashMap<>(), null);
// Assert that xid field does not get set in InstanceConfig when it is the default.
assertNull(referenceInstanceConfig.getRecord().getSimpleField(ClusterMapUtils.XID_STR));
// Assert that xid field does get set if not the default.
jsonObject.put("xid", "10");
dataNode = new DataNode(dataNode.getDatacenter(), jsonObject, clusterMapConfig);
InstanceConfig instanceConfig = HelixBootstrapUpgradeUtil.createInstanceConfigFromStaticInfo(dataNode, partitionToInstances, new ConcurrentHashMap<>(), null);
assertEquals("10", instanceConfig.getRecord().getSimpleField(ClusterMapUtils.XID_STR));
assertThat(referenceInstanceConfig.getRecord(), not(equalTo(instanceConfig.getRecord())));
referenceInstanceConfig = instanceConfig;
// Assert that sealed list being different does not affect equality
List<String> sealedList = Arrays.asList("5", "10");
referenceInstanceConfig.getRecord().setListField(ClusterMapUtils.SEALED_STR, sealedList);
// set the field to null. The created InstanceConfig should not have null fields.
referenceInstanceConfig.getRecord().setListField(ClusterMapUtils.STOPPED_REPLICAS_STR, null);
instanceConfig = HelixBootstrapUpgradeUtil.createInstanceConfigFromStaticInfo(dataNode, partitionToInstances, new ConcurrentHashMap<>(), referenceInstanceConfig);
// Stopped replicas should be an empty list and not null, so set that in referenceInstanceConfig for comparison.
referenceInstanceConfig.getRecord().setListField(ClusterMapUtils.STOPPED_REPLICAS_STR, Collections.emptyList());
assertEquals(instanceConfig.getRecord(), referenceInstanceConfig.getRecord());
// Assert that stopped list being different does not affect equality
List<String> stoppedReplicas = Arrays.asList("11", "15");
referenceInstanceConfig.getRecord().setListField(ClusterMapUtils.STOPPED_REPLICAS_STR, stoppedReplicas);
instanceConfig = HelixBootstrapUpgradeUtil.createInstanceConfigFromStaticInfo(dataNode, partitionToInstances, new ConcurrentHashMap<>(), referenceInstanceConfig);
assertEquals(instanceConfig.getRecord(), referenceInstanceConfig.getRecord());
}