Use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin.
The class HelixBootstrapUpgradeUtil, method verifyDataNodeAndDiskEquivalencyInDc.
/**
 * Verify that the hardware layout information is in sync, which includes the node and disk information. Also verify
 * that the replicas belonging to disks are in sync between the static cluster map and Helix.
 * @param dc the datacenter whose information is to be verified.
 * @param clusterName the cluster to be verified.
 * @param partitionLayout the {@link PartitionLayout} of the static clustermap.
 */
private void verifyDataNodeAndDiskEquivalencyInDc(Datacenter dc, String clusterName,
    PartitionLayout partitionLayout) {
  String dcName = dc.getName();
  // The following properties are immaterial for the tool, but the ClusterMapConfig mandates their presence.
  ClusterMapConfig clusterMapConfig = getClusterMapConfig(clusterName, dcName, null);
  StaticClusterManager staticClusterMap =
      (new StaticClusterAgentsFactory(clusterMapConfig, partitionLayout)).getClusterMap();
  String zkConnectStr = dataCenterToZkAddress.get(dcName).getZkConnectStrs().get(0);
  try (PropertyStoreToDataNodeConfigAdapter propertyStoreAdapter =
      new PropertyStoreToDataNodeConfigAdapter(zkConnectStr, clusterMapConfig)) {
    InstanceConfigToDataNodeConfigAdapter.Converter instanceConfigConverter =
        new InstanceConfigToDataNodeConfigAdapter.Converter(clusterMapConfig);
    Set<String> allInstancesInHelix = new HashSet<>(getInstanceNamesInHelix(dcName, propertyStoreAdapter));
    for (DataNodeId dataNodeId : dc.getDataNodes()) {
      Map<String, Map<String, ReplicaId>> mountPathToReplicas = getMountPathToReplicas(staticClusterMap, dataNodeId);
      DataNode dataNode = (DataNode) dataNodeId;
      String instanceName = getInstanceName(dataNode);
      ensureOrThrow(allInstancesInHelix.remove(instanceName), "Instance not present in Helix " + instanceName);
      DataNodeConfig dataNodeConfig =
          getDataNodeConfigFromHelix(dcName, instanceName, propertyStoreAdapter, instanceConfigConverter);
      Map<String, DataNodeConfig.DiskConfig> diskInfos = new HashMap<>(dataNodeConfig.getDiskConfigs());
      for (Disk disk : dataNode.getDisks()) {
        DataNodeConfig.DiskConfig diskInfoInHelix = diskInfos.remove(disk.getMountPath());
        ensureOrThrow(diskInfoInHelix != null,
            "[" + dcName.toUpperCase() + "] Disk not present for instance " + instanceName + " disk "
                + disk.getMountPath());
        ensureOrThrow(disk.getRawCapacityInBytes() == diskInfoInHelix.getDiskCapacityInBytes(),
            "[" + dcName.toUpperCase() + "] Capacity mismatch for instance " + instanceName + " disk "
                + disk.getMountPath());
        // Replica information is verified only when bootstrapping or validating the cluster; for other admin
        // operations, the replica lists in Helix and in the static clustermap are expected to be different on
        // certain nodes.
        if (EnumSet.of(HelixAdminOperation.BootstrapCluster, HelixAdminOperation.ValidateCluster)
            .contains(helixAdminOperation)) {
          Set<String> replicasInClusterMap = new HashSet<>();
          Map<String, ReplicaId> replicaList = mountPathToReplicas.get(disk.getMountPath());
          if (replicaList != null) {
            replicasInClusterMap.addAll(replicaList.keySet());
          }
          Set<String> replicasInHelix = new HashSet<>();
          Map<String, DataNodeConfig.ReplicaConfig> replicaConfigMap = diskInfoInHelix.getReplicaConfigs();
          for (Map.Entry<String, DataNodeConfig.ReplicaConfig> replicaConfigEntry : replicaConfigMap.entrySet()) {
            String replicaName = replicaConfigEntry.getKey();
            DataNodeConfig.ReplicaConfig replicaConfig = replicaConfigEntry.getValue();
            replicasInHelix.add(replicaName);
            ReplicaId replica = replicaList.get(replicaName);
            ensureOrThrow(replicaConfig.getReplicaCapacityInBytes() == replica.getCapacityInBytes(),
                "[" + dcName.toUpperCase() + "] Replica capacity should be the same.");
            ensureOrThrow(replicaConfig.getPartitionClass().equals(replica.getPartitionId().getPartitionClass()),
                "[" + dcName.toUpperCase() + "] Partition class should be the same.");
          }
          ensureOrThrow(replicasInClusterMap.equals(replicasInHelix),
              "[" + dcName.toUpperCase() + "] Replica information not consistent for instance " + instanceName
                  + " disk " + disk.getMountPath() + "\n in Helix: " + replicasInHelix
                  + "\n in static clustermap: " + replicasInClusterMap);
        }
      }
      for (Map.Entry<String, DataNodeConfig.DiskConfig> entry : diskInfos.entrySet()) {
        String mountPath = entry.getKey();
        if (!mountPath.startsWith("/mnt")) {
          logger.warn("[{}] Instance {} has unidentifiable mount path in Helix: {}", dcName.toUpperCase(),
              instanceName, mountPath);
        } else {
          throw new AssertionError(
              "[" + dcName.toUpperCase() + "] Instance " + instanceName + " has extra disk in Helix: " + entry);
        }
      }
      ensureOrThrow(!dataNode.hasSSLPort() || (dataNode.getSSLPort() == dataNodeConfig.getSslPort()),
          "[" + dcName.toUpperCase() + "] SSL Port mismatch for instance " + instanceName);
      ensureOrThrow(!dataNode.hasHttp2Port() || (dataNode.getHttp2Port() == dataNodeConfig.getHttp2Port()),
          "[" + dcName.toUpperCase() + "] HTTP2 Port mismatch for instance " + instanceName);
      ensureOrThrow(dataNode.getDatacenterName().equals(dataNodeConfig.getDatacenterName()),
          "[" + dcName.toUpperCase() + "] Datacenter mismatch for instance " + instanceName);
      ensureOrThrow(Objects.equals(dataNode.getRackId(), dataNodeConfig.getRackId()),
          "[" + dcName.toUpperCase() + "] Rack Id mismatch for instance " + instanceName);
      // xid is not set in PropertyStore based DataNodeConfig and will be decommissioned eventually,
      // hence we don't check xid equivalence
    }
    if (expectMoreInHelixDuringValidate) {
      ensureOrThrow(
          allInstancesInHelix.equals(instancesNotForceRemovedByDc.getOrDefault(dc.getName(), new HashSet<>())),
          "[" + dcName.toUpperCase() + "] Additional instances in Helix: " + allInstancesInHelix
              + " not what is expected " + instancesNotForceRemovedByDc.get(dc.getName()));
      info("[{}] *** Helix may have more instances than in the given clustermap as removals were not forced.",
          dcName.toUpperCase());
    } else {
      ensureOrThrow(allInstancesInHelix.isEmpty(),
          "[" + dcName.toUpperCase() + "] Following instances in Helix not found in the clustermap "
              + allInstancesInHelix);
    }
  }
  info("[{}] Successfully verified datanode and disk equivalency in dc {}", dcName.toUpperCase(), dc.getName());
}
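The getClusterMapConfig helper used above is not shown in this excerpt. A minimal sketch of how it might assemble the mandated properties (the placeholder values and the elided zkLayoutPath handling are assumptions of this sketch):

static ClusterMapConfig getClusterMapConfig(String clusterName, String dcName, String zkLayoutPath) {
  Properties props = new Properties();
  // Placeholder; as noted above, the value is immaterial to the tool but mandated by ClusterMapConfig.
  props.setProperty("clustermap.host.name", "localhost");
  props.setProperty("clustermap.cluster.name", clusterName);
  props.setProperty("clustermap.datacenter.name", dcName);
  // zkLayoutPath handling elided; the call sites in this excerpt pass null.
  return new ClusterMapConfig(new VerifiableProperties(props));
}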
Use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin.
The class HardwareLayoutTest, method basics.
@Test
public void basics() throws JSONException {
  JSONObject jsonObject = TestUtils.getJsonHardwareLayout("Alpha", getDatacenters());
  HardwareLayout hardwareLayout =
      new HardwareLayout(jsonObject, new ClusterMapConfig(new VerifiableProperties(props)));
  assertEquals(hardwareLayout.getVersion(), TestUtils.defaultHardwareLayoutVersion);
  assertEquals(hardwareLayout.getClusterName(), "Alpha");
  assertEquals(hardwareLayout.getDatacenters().size(), datacenterCount);
  assertEquals(hardwareLayout.getRawCapacityInBytes(), datacenterCount * dataNodeCount * diskCount * diskCapacityInBytes);
  assertEquals(hardwareLayout.toJSONObject().toString(), jsonObject.toString());
  assertEquals(hardwareLayout.getDataNodeInHardStateCount(HardwareState.AVAILABLE), datacenterCount * dataNodeCount);
  assertEquals(hardwareLayout.getDataNodeInHardStateCount(HardwareState.UNAVAILABLE), 0);
  assertEquals(hardwareLayout.calculateUnavailableDataNodeCount(), 0);
  assertEquals(hardwareLayout.getDiskInHardStateCount(HardwareState.AVAILABLE), datacenterCount * dataNodeCount * diskCount);
  assertEquals(hardwareLayout.getDiskInHardStateCount(HardwareState.UNAVAILABLE), 0);
  assertEquals(hardwareLayout.calculateUnavailableDiskCount(), 0);
}
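The props field comes from the test fixture and is not shown here. The minimal setup that satisfies the ClusterMapConfig constructor follows the same property pattern used in the Helix test below (the concrete values are illustrative):

// Illustrative fixture setup; the concrete values don't matter to HardwareLayout.
Properties props = new Properties();
props.setProperty("clustermap.cluster.name", "cluster");
props.setProperty("clustermap.datacenter.name", "dc1");
props.setProperty("clustermap.host.name", "localhost");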
Use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin.
The class HelixClusterManagerTest, method routingTableProviderChangeTest.
/**
 * Test that a routing table change reflects the correct state of each replica and that {@link HelixClusterManager}
 * is able to get replicas in the required state.
 */
@Test
public void routingTableProviderChangeTest() throws Exception {
  assumeTrue(!useComposite && !overrideEnabled && !listenCrossColo);
  // Change zk connect strings to ensure HelixClusterManager sees local DC only
  JSONObject zkJson = constructZkLayoutJSON(Collections.singletonList(dcsToZkInfo.get(localDc)));
  Properties props = new Properties();
  props.setProperty("clustermap.host.name", hostname);
  props.setProperty("clustermap.cluster.name", clusterNamePrefixInHelix + clusterNameStatic);
  props.setProperty("clustermap.datacenter.name", localDc);
  props.setProperty("clustermap.port", Integer.toString(portNum));
  props.setProperty("clustermap.dcs.zk.connect.strings", zkJson.toString(2));
  props.setProperty("clustermap.current.xid", Long.toString(CURRENT_XID));
  ClusterMapConfig clusterMapConfig = new ClusterMapConfig(new VerifiableProperties(props));
  // Use a fresh metricRegistry so this HelixClusterManager instance registers its metrics in isolation
  metricRegistry = new MetricRegistry();
  HelixClusterManager helixClusterManager = new HelixClusterManager(clusterMapConfig, selfInstanceName,
      new MockHelixManagerFactory(helixCluster, null, null), metricRegistry);
  Map<String, RoutingTableSnapshot> snapshotsByDc = helixClusterManager.getRoutingTableSnapshots();
  RoutingTableSnapshot localDcSnapshot = snapshotsByDc.get(localDc);
  Set<InstanceConfig> instanceConfigsInSnapshot = new HashSet<>(localDcSnapshot.getInstanceConfigs());
  Set<InstanceConfig> instanceConfigsInCluster =
      new HashSet<>(helixCluster.getInstanceConfigsFromDcs(new String[]{localDc}));
  assertEquals("Mismatch in instance configs", instanceConfigsInCluster, instanceConfigsInSnapshot);
  // verify leader replica of each partition is correct
  verifyLeaderReplicasInDc(helixClusterManager, localDc);
  // test live-instance-triggered routing table change:
  // purposely bring down one instance and wait until the expected number of live instances is seen or the wait times out.
  int initialLiveCnt = localDcSnapshot.getLiveInstances().size();
  MockHelixAdmin mockHelixAdmin = helixCluster.getHelixAdminFromDc(localDc);
  String instance = instanceConfigsInCluster.stream()
      .filter(insConfig -> !insConfig.getInstanceName().equals(selfInstanceName))
      .findFirst()
      .get()
      .getInstanceName();
  mockHelixAdmin.bringInstanceDown(instance);
  mockHelixAdmin.triggerRoutingTableNotification();
  int sleepCnt = 0;
  while (helixClusterManager.getRoutingTableSnapshots().get(localDc).getLiveInstances().size() != initialLiveCnt - 1) {
    assertTrue("Routing table change (triggered by bringing down node) didn't come within 1 sec", sleepCnt < 5);
    Thread.sleep(200);
    sleepCnt++;
  }
  // then bring up the same instance; the number of live instances should equal the initial count
  mockHelixAdmin.bringInstanceUp(instance);
  mockHelixAdmin.triggerRoutingTableNotification();
  sleepCnt = 0;
  while (helixClusterManager.getRoutingTableSnapshots().get(localDc).getLiveInstances().size() != initialLiveCnt) {
    assertTrue("Routing table change (triggered by bringing up node) didn't come within 1 sec", sleepCnt < 5);
    Thread.sleep(200);
    sleepCnt++;
  }
  // randomly choose a partition and change its leader replica in the cluster
  List<? extends PartitionId> defaultPartitionIds = helixClusterManager.getAllPartitionIds(DEFAULT_PARTITION_CLASS);
  PartitionId partitionToChange = defaultPartitionIds.get((new Random()).nextInt(defaultPartitionIds.size()));
  String currentLeaderInstance = mockHelixAdmin.getPartitionToLeaderReplica().get(partitionToChange.toPathString());
  int currentLeaderPort = Integer.parseInt(currentLeaderInstance.split("_")[1]);
  String newLeaderInstance = mockHelixAdmin.getInstancesForPartition(partitionToChange.toPathString())
      .stream()
      .filter(k -> !k.equals(currentLeaderInstance))
      .findFirst()
      .get();
  mockHelixAdmin.changeLeaderReplicaForPartition(partitionToChange.toPathString(), newLeaderInstance);
  mockHelixAdmin.triggerRoutingTableNotification();
  sleepCnt = 0;
  while (partitionToChange.getReplicaIdsByState(ReplicaState.LEADER, localDc).get(0).getDataNodeId().getPort() == currentLeaderPort) {
    assertTrue("Routing table change (triggered by leadership change) didn't come within 1 sec", sleepCnt < 5);
    Thread.sleep(200);
    sleepCnt++;
  }
  verifyLeaderReplicasInDc(helixClusterManager, localDc);
  helixClusterManager.close();
}
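All three poll-and-sleep loops above share the same shape; a small helper could factor them out. This is a sketch, not part of the original test, and the name awaitOrFail is invented here:

// Polls a condition every 200 ms, failing the test after 5 attempts (~1 second).
private static void awaitOrFail(java.util.function.BooleanSupplier condition, String message)
    throws InterruptedException {
  int sleepCnt = 0;
  while (!condition.getAsBoolean()) {
    assertTrue(message, sleepCnt < 5);
    Thread.sleep(200);
    sleepCnt++;
  }
}

The first wait would then read: awaitOrFail(() -> helixClusterManager.getRoutingTableSnapshots().get(localDc).getLiveInstances().size() == initialLiveCnt - 1, "Routing table change (triggered by bringing down node) didn't come within 1 sec");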
Use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin.
The class HelixBootstrapUpgradeUtil, method getSealedPartitionsInDc.
/**
 * Get sealed partitions from the given datacenter.
 * @param dc the datacenter where the sealed partitions come from.
 * @param dcToSealedPartitions a map to track sealed partitions in each dc. The entry associated with the given dc
 *                             is populated by this method.
 * @param nodeToNonExistentReplicas a map to track any replica that is in the sealed list but not actually on the
 *                                  local node.
 */
private void getSealedPartitionsInDc(Datacenter dc, Map<String, Set<String>> dcToSealedPartitions,
    Map<String, Set<String>> nodeToNonExistentReplicas) {
  String dcName = dc.getName();
  dcToSealedPartitions.put(dcName, new HashSet<>());
  ClusterMapConfig config = getClusterMapConfig(clusterName, dcName, null);
  String zkConnectStr = dataCenterToZkAddress.get(dcName).getZkConnectStrs().get(0);
  try (PropertyStoreToDataNodeConfigAdapter propertyStoreAdapter =
      new PropertyStoreToDataNodeConfigAdapter(zkConnectStr, config)) {
    InstanceConfigToDataNodeConfigAdapter.Converter instanceConfigConverter =
        new InstanceConfigToDataNodeConfigAdapter.Converter(config);
    Set<String> allInstancesInHelix = new HashSet<>(getInstanceNamesInHelix(dcName, propertyStoreAdapter));
    for (DataNodeId dataNodeId : dc.getDataNodes()) {
      DataNode dataNode = (DataNode) dataNodeId;
      Set<String> replicasOnNode = staticClusterMap.getReplicas(dataNode)
          .stream()
          .map(replicaId -> replicaId.getPartitionId().toPathString())
          .collect(Collectors.toSet());
      String instanceName = getInstanceName(dataNode);
      ensureOrThrow(allInstancesInHelix.contains(instanceName), "Instance not present in Helix " + instanceName);
      DataNodeConfig dataNodeConfig =
          getDataNodeConfigFromHelix(dcName, instanceName, propertyStoreAdapter, instanceConfigConverter);
      Set<String> sealedReplicas = dataNodeConfig.getSealedReplicas();
      if (sealedReplicas != null) {
        for (String sealedReplica : sealedReplicas) {
          info("Replica {} is sealed on {}", sealedReplica, instanceName);
          dcToSealedPartitions.get(dcName).add(sealedReplica);
          if (!replicasOnNode.contains(sealedReplica)) {
            logger.warn("Replica {} is in sealed list but not on node {}", sealedReplica, instanceName);
            nodeToNonExistentReplicas.computeIfAbsent(instanceName, key -> new HashSet<>()).add(sealedReplica);
          }
        }
      }
    }
  }
}
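For context, the tool would invoke this once per datacenter and then report on the aggregated maps. A sketch of such a caller (the hardwareLayout field access on the static clustermap is an assumption of this sketch):

Map<String, Set<String>> dcToSealedPartitions = new HashMap<>();
Map<String, Set<String>> nodeToNonExistentReplicas = new HashMap<>();
for (Datacenter datacenter : staticClusterMap.hardwareLayout.getDatacenters()) { // assumed accessor
  getSealedPartitionsInDc(datacenter, dcToSealedPartitions, nodeToNonExistentReplicas);
}
dcToSealedPartitions.forEach((dc, partitions) -> info("{} sealed partitions in {}", partitions.size(), dc));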
Use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin.
The class DatacenterTest, method validation.
@Test
public void validation() throws JSONException {
  JSONObject jsonObject;
  ClusterMapConfig clusterMapConfig = new ClusterMapConfig(new VerifiableProperties(props));
  try {
    // Null HardwareLayout
    jsonObject = TestUtils.getJsonDatacenter("XYZ1", (byte) 1, getDataNodes());
    new Datacenter(null, jsonObject, clusterMapConfig);
    fail("Should have failed validation.");
  } catch (IllegalStateException e) {
    // Expected.
  }
  // Bad datacenter name
  jsonObject = TestUtils.getJsonDatacenter("", (byte) 1, getDataNodes());
  failValidation(jsonObject, clusterMapConfig);
  // Missing rack IDs
  jsonObject = TestUtils.getJsonDatacenter("XYZ1", (byte) 1, getDataNodesPartiallyRackAware());
  failValidation(jsonObject, clusterMapConfig);
}
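The failValidation helper is defined elsewhere in the test class. Given how it is invoked, its likely shape mirrors the inline try/catch above (a reconstruction; the hardwareLayout fixture is assumed):

private void failValidation(JSONObject jsonObject, ClusterMapConfig clusterMapConfig) throws JSONException {
  try {
    // hardwareLayout: a valid fixture assumed to exist in the test class.
    new Datacenter(hardwareLayout, jsonObject, clusterMapConfig);
    fail("Should have failed validation.");
  } catch (IllegalStateException e) {
    // Expected.
  }
}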