Search in sources :

Example 51 with ClusterMapConfig

use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin.

the class HelixBootstrapUpgradeUtil method verifyDataNodeAndDiskEquivalencyInDc.

/**
 * Verify that the hardware layout information is in sync - which includes the node and disk information. Also verify
 * that the replicas belonging to disks are in sync between the static cluster map and Helix.
 * <p>
 * Every mismatch is reported through {@code ensureOrThrow} (defined elsewhere in this class). Replica level
 * comparison is only performed when the current operation is {@code BootstrapCluster} or {@code ValidateCluster}.
 * @param dc the datacenter whose information is to be verified.
 * @param clusterName the cluster to be verified.
 * @param partitionLayout the {@link PartitionLayout} of the static clustermap.
 */
private void verifyDataNodeAndDiskEquivalencyInDc(Datacenter dc, String clusterName, PartitionLayout partitionLayout) {
    String dcName = dc.getName();
    // Hoisted once: the upper-cased dc name is used as a "[DC] " tag in front of nearly every message below.
    String upperDcName = dcName.toUpperCase();
    String dcTag = "[" + upperDcName + "] ";
    // The following properties are immaterial for the tool, but the ClusterMapConfig mandates their presence.
    ClusterMapConfig clusterMapConfig = getClusterMapConfig(clusterName, dcName, null);
    StaticClusterManager staticClusterMap = (new StaticClusterAgentsFactory(clusterMapConfig, partitionLayout)).getClusterMap();
    String zkConnectStr = dataCenterToZkAddress.get(dcName).getZkConnectStrs().get(0);
    try (PropertyStoreToDataNodeConfigAdapter propertyStoreAdapter = new PropertyStoreToDataNodeConfigAdapter(zkConnectStr, clusterMapConfig)) {
        InstanceConfigToDataNodeConfigAdapter.Converter instanceConfigConverter = new InstanceConfigToDataNodeConfigAdapter.Converter(clusterMapConfig);
        // Instances are removed from this set as they are matched; whatever remains exists only in Helix.
        Set<String> allInstancesInHelix = new HashSet<>(getInstanceNamesInHelix(dcName, propertyStoreAdapter));
        for (DataNodeId dataNodeId : dc.getDataNodes()) {
            Map<String, Map<String, ReplicaId>> mountPathToReplicas = getMountPathToReplicas(staticClusterMap, dataNodeId);
            DataNode dataNode = (DataNode) dataNodeId;
            String instanceName = getInstanceName(dataNode);
            ensureOrThrow(allInstancesInHelix.remove(instanceName), "Instance not present in Helix " + instanceName);
            DataNodeConfig dataNodeConfig = getDataNodeConfigFromHelix(dcName, instanceName, propertyStoreAdapter, instanceConfigConverter);
            // Work on a copy so that matched disks can be removed; leftovers are disks known to Helix only.
            Map<String, DataNodeConfig.DiskConfig> diskInfos = new HashMap<>(dataNodeConfig.getDiskConfigs());
            for (Disk disk : dataNode.getDisks()) {
                String mountPath = disk.getMountPath();
                DataNodeConfig.DiskConfig diskInfoInHelix = diskInfos.remove(mountPath);
                ensureOrThrow(diskInfoInHelix != null, dcTag + "Disk not present for instance " + instanceName + " disk " + mountPath);
                ensureOrThrow(disk.getRawCapacityInBytes() == diskInfoInHelix.getDiskCapacityInBytes(), dcTag + "Capacity mismatch for instance " + instanceName + " disk " + mountPath);
                // Replica information is only compared for Bootstrap and Validate operations; for other operations
                // replica infos are expected to be different on certain nodes.
                if (EnumSet.of(HelixAdminOperation.BootstrapCluster, HelixAdminOperation.ValidateCluster).contains(helixAdminOperation)) {
                    Set<String> replicasInClusterMap = new HashSet<>();
                    // May be null when the static clustermap has no replicas on this disk.
                    Map<String, ReplicaId> replicaList = mountPathToReplicas.get(mountPath);
                    if (replicaList != null) {
                        replicasInClusterMap.addAll(replicaList.keySet());
                    }
                    Set<String> replicasInHelix = new HashSet<>();
                    Map<String, DataNodeConfig.ReplicaConfig> replicaConfigMap = diskInfoInHelix.getReplicaConfigs();
                    for (Map.Entry<String, DataNodeConfig.ReplicaConfig> replicaConfigEntry : replicaConfigMap.entrySet()) {
                        String replicaName = replicaConfigEntry.getKey();
                        DataNodeConfig.ReplicaConfig replicaConfig = replicaConfigEntry.getValue();
                        replicasInHelix.add(replicaName);
                        // Guard both the missing-disk case (replicaList == null) and the missing-replica case so
                        // that a replica known only to Helix produces a diagnostic instead of a NullPointerException.
                        ReplicaId replica = replicaList == null ? null : replicaList.get(replicaName);
                        ensureOrThrow(replica != null, dcTag + "Replica " + replicaName + " in Helix not found in static clustermap for instance " + instanceName + " disk " + mountPath);
                        ensureOrThrow(replicaConfig.getReplicaCapacityInBytes() == replica.getCapacityInBytes(), dcTag + "Replica capacity should be the same.");
                        ensureOrThrow(replicaConfig.getPartitionClass().equals(replica.getPartitionId().getPartitionClass()), dcTag + "Partition class should be the same.");
                    }
                    // Report the Helix side with the names actually collected from Helix (not the clustermap map).
                    ensureOrThrow(replicasInClusterMap.equals(replicasInHelix), dcTag + "Replica information not consistent for instance " + instanceName + " disk " + mountPath + "\n in Helix: " + replicasInHelix + "\n in static clustermap: " + replicasInClusterMap);
                }
            }
            // Any disk still present here was found in Helix but not in the static clustermap.
            for (Map.Entry<String, DataNodeConfig.DiskConfig> entry : diskInfos.entrySet()) {
                String mountPath = entry.getKey();
                if (!mountPath.startsWith("/mnt")) {
                    logger.warn("[{}] Instance {} has unidentifiable mount path in Helix: {}", upperDcName, instanceName, mountPath);
                } else {
                    throw new AssertionError(dcTag + "Instance " + instanceName + " has extra disk in Helix: " + entry);
                }
            }
            // Ports are only compared when the static clustermap node declares them.
            ensureOrThrow(!dataNode.hasSSLPort() || (dataNode.getSSLPort() == dataNodeConfig.getSslPort()), dcTag + "SSL Port mismatch for instance " + instanceName);
            ensureOrThrow(!dataNode.hasHttp2Port() || (dataNode.getHttp2Port() == dataNodeConfig.getHttp2Port()), dcTag + "HTTP2 Port mismatch for instance " + instanceName);
            ensureOrThrow(dataNode.getDatacenterName().equals(dataNodeConfig.getDatacenterName()), dcTag + "Datacenter mismatch for instance " + instanceName);
            ensureOrThrow(Objects.equals(dataNode.getRackId(), dataNodeConfig.getRackId()), dcTag + "Rack Id mismatch for instance " + instanceName);
            // xid is not set in PropertyStore based DataNodeConfig and will be decommissioned eventually, hence we
            // don't check xid equivalence
        }
        if (expectMoreInHelixDuringValidate) {
            ensureOrThrow(allInstancesInHelix.equals(instancesNotForceRemovedByDc.getOrDefault(dc.getName(), new HashSet<>())), dcTag + "Additional instances in Helix: " + allInstancesInHelix + " not what is expected " + instancesNotForceRemovedByDc.get(dc.getName()));
            info("[{}] *** Helix may have more instances than in the given clustermap as removals were not forced.", upperDcName);
        } else {
            ensureOrThrow(allInstancesInHelix.isEmpty(), dcTag + "Following instances in Helix not found in the clustermap " + allInstancesInHelix);
        }
    }
    info("[{}] Successfully verified datanode and disk equivalency in dc {}", upperDcName, dc.getName());
}
Also used : ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) HashSet(java.util.HashSet) ClusterMapConfig(com.github.ambry.config.ClusterMapConfig) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) TreeMap(java.util.TreeMap)

Example 52 with ClusterMapConfig

use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin.

the class HardwareLayoutTest method basics.

/**
 * Builds a {@link HardwareLayout} from a generated JSON layout named "Alpha" and checks its basic accessors:
 * version, cluster name, datacenter count, raw capacity, JSON round trip and the node/disk hard-state counters.
 * Assertions follow the JUnit {@code assertEquals(expected, actual)} argument order so that failure messages
 * report the expected and actual values correctly.
 */
@Test
public void basics() throws JSONException {
    JSONObject jsonObject = TestUtils.getJsonHardwareLayout("Alpha", getDatacenters());
    HardwareLayout hardwareLayout = new HardwareLayout(jsonObject, new ClusterMapConfig(new VerifiableProperties(props)));
    assertEquals(TestUtils.defaultHardwareLayoutVersion, hardwareLayout.getVersion());
    assertEquals("Alpha", hardwareLayout.getClusterName());
    assertEquals(datacenterCount, hardwareLayout.getDatacenters().size());
    assertEquals(datacenterCount * dataNodeCount * diskCount * diskCapacityInBytes, hardwareLayout.getRawCapacityInBytes());
    // Serializing the layout back to JSON must reproduce the input it was built from.
    assertEquals(jsonObject.toString(), hardwareLayout.toJSONObject().toString());
    assertEquals(datacenterCount * dataNodeCount, hardwareLayout.getDataNodeInHardStateCount(HardwareState.AVAILABLE));
    assertEquals(0, hardwareLayout.getDataNodeInHardStateCount(HardwareState.UNAVAILABLE));
    assertEquals(0, hardwareLayout.calculateUnavailableDataNodeCount());
    assertEquals(datacenterCount * dataNodeCount * diskCount, hardwareLayout.getDiskInHardStateCount(HardwareState.AVAILABLE));
    assertEquals(0, hardwareLayout.getDiskInHardStateCount(HardwareState.UNAVAILABLE));
    assertEquals(0, hardwareLayout.calculateUnavailableDiskCount());
}
Also used : JSONObject(org.json.JSONObject) VerifiableProperties(com.github.ambry.config.VerifiableProperties) ClusterMapConfig(com.github.ambry.config.ClusterMapConfig) Test(org.junit.Test)

Example 53 with ClusterMapConfig

use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin.

the class HelixClusterManagerTest method routingTableProviderChangeTest.

/**
 * Test that routing table change reflects correct state of each replica and {@link HelixClusterManager} is able to get
 * replica in required state.
 * <p>
 * The test exercises three routing table changes in the local datacenter: an instance going down, the same instance
 * coming back up, and a leadership change of one randomly chosen partition. After each change it polls (up to five
 * times, 200 ms apart) until the change is visible. The {@link HelixClusterManager} is closed in a {@code finally}
 * block so that it is released even when an assertion fails.
 */
@Test
public void routingTableProviderChangeTest() throws Exception {
    assumeTrue(!useComposite && !overrideEnabled && !listenCrossColo);
    // Change zk connect strings to ensure HelixClusterManager sees local DC only
    JSONObject zkJson = constructZkLayoutJSON(Collections.singletonList(dcsToZkInfo.get(localDc)));
    Properties props = new Properties();
    props.setProperty("clustermap.host.name", hostname);
    props.setProperty("clustermap.cluster.name", clusterNamePrefixInHelix + clusterNameStatic);
    props.setProperty("clustermap.datacenter.name", localDc);
    props.setProperty("clustermap.port", Integer.toString(portNum));
    props.setProperty("clustermap.dcs.zk.connect.strings", zkJson.toString(2));
    props.setProperty("clustermap.current.xid", Long.toString(CURRENT_XID));
    ClusterMapConfig clusterMapConfig = new ClusterMapConfig(new VerifiableProperties(props));
    // Use a fresh metric registry for the manager created below.
    // NOTE(review): the previous comment spoke of mocking the registry with a latch based counter, but a plain
    // MetricRegistry is created here - confirm which is intended.
    metricRegistry = new MetricRegistry();
    HelixClusterManager helixClusterManager = new HelixClusterManager(clusterMapConfig, selfInstanceName, new MockHelixManagerFactory(helixCluster, null, null), metricRegistry);
    try {
        Map<String, RoutingTableSnapshot> snapshotsByDc = helixClusterManager.getRoutingTableSnapshots();
        RoutingTableSnapshot localDcSnapshot = snapshotsByDc.get(localDc);
        Set<InstanceConfig> instanceConfigsInSnapshot = new HashSet<>(localDcSnapshot.getInstanceConfigs());
        Set<InstanceConfig> instanceConfigsInCluster = new HashSet<>(helixCluster.getInstanceConfigsFromDcs(new String[] { localDc }));
        assertEquals("Mismatch in instance configs", instanceConfigsInCluster, instanceConfigsInSnapshot);
        // verify leader replica of each partition is correct
        verifyLeaderReplicasInDc(helixClusterManager, localDc);
        // test live instance triggered routing table change
        // we purposely bring down one instance and wait for expected number of live instance unless times out.
        int initialLiveCnt = localDcSnapshot.getLiveInstances().size();
        MockHelixAdmin mockHelixAdmin = helixCluster.getHelixAdminFromDc(localDc);
        // Pick any instance other than the one this cluster manager itself runs as.
        String instance = instanceConfigsInCluster.stream().filter(insConfig -> !insConfig.getInstanceName().equals(selfInstanceName)).findFirst().get().getInstanceName();
        mockHelixAdmin.bringInstanceDown(instance);
        mockHelixAdmin.triggerRoutingTableNotification();
        int sleepCnt = 0;
        // Poll at most 5 x 200 ms (1 sec) for the live instance count to drop by one.
        while (helixClusterManager.getRoutingTableSnapshots().get(localDc).getLiveInstances().size() != initialLiveCnt - 1) {
            assertTrue("Routing table change (triggered by bringing down node) didn't come within 1 sec", sleepCnt < 5);
            Thread.sleep(200);
            sleepCnt++;
        }
        // then bring up the same instance, the number of live instances should equal to initial count
        mockHelixAdmin.bringInstanceUp(instance);
        mockHelixAdmin.triggerRoutingTableNotification();
        sleepCnt = 0;
        while (helixClusterManager.getRoutingTableSnapshots().get(localDc).getLiveInstances().size() != initialLiveCnt) {
            assertTrue("Routing table change (triggered by bringing up node) didn't come within 1 sec", sleepCnt < 5);
            Thread.sleep(200);
            sleepCnt++;
        }
        // randomly choose a partition and change the leader replica of it in cluster
        List<? extends PartitionId> defaultPartitionIds = helixClusterManager.getAllPartitionIds(DEFAULT_PARTITION_CLASS);
        PartitionId partitionToChange = defaultPartitionIds.get((new Random()).nextInt(defaultPartitionIds.size()));
        String currentLeaderInstance = mockHelixAdmin.getPartitionToLeaderReplica().get(partitionToChange.toPathString());
        // The port is taken from the second "_"-separated token of the instance name.
        int currentLeaderPort = Integer.parseInt(currentLeaderInstance.split("_")[1]);
        String newLeaderInstance = mockHelixAdmin.getInstancesForPartition(partitionToChange.toPathString()).stream().filter(k -> !k.equals(currentLeaderInstance)).findFirst().get();
        mockHelixAdmin.changeLeaderReplicaForPartition(partitionToChange.toPathString(), newLeaderInstance);
        mockHelixAdmin.triggerRoutingTableNotification();
        sleepCnt = 0;
        // Wait until the leader replica reported for the partition is no longer on the old leader's port.
        while (partitionToChange.getReplicaIdsByState(ReplicaState.LEADER, localDc).get(0).getDataNodeId().getPort() == currentLeaderPort) {
            assertTrue("Routing table change (triggered by leadership change) didn't come within 1 sec", sleepCnt < 5);
            Thread.sleep(200);
            sleepCnt++;
        }
        verifyLeaderReplicasInDc(helixClusterManager, localDc);
    } finally {
        // Always release the cluster manager, including when an assertion above fails.
        helixClusterManager.close();
    }
}
Also used : CoreMatchers(org.hamcrest.CoreMatchers) Arrays(java.util.Arrays) IdealState(org.apache.helix.model.IdealState) ClusterMapUtils(com.github.ambry.clustermap.ClusterMapUtils) ServerErrorCode(com.github.ambry.server.ServerErrorCode) RunWith(org.junit.runner.RunWith) HashMap(java.util.HashMap) Random(java.util.Random) RoutingTableSnapshot(org.apache.helix.spectator.RoutingTableSnapshot) ByteBuffer(java.nio.ByteBuffer) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) MockitoAnnotations(org.mockito.MockitoAnnotations) TestUtils(com.github.ambry.clustermap.TestUtils) JSONException(org.json.JSONException) JSONObject(org.json.JSONObject) Map(java.util.Map) After(org.junit.After) Counter(com.codahale.metrics.Counter) Assume(org.junit.Assume) Parameterized(org.junit.runners.Parameterized) MetricRegistry(com.codahale.metrics.MetricRegistry) Properties(java.util.Properties) Pair(com.github.ambry.utils.Pair) Files(java.nio.file.Files) VerifiableProperties(com.github.ambry.config.VerifiableProperties) Set(java.util.Set) HelixManager(org.apache.helix.HelixManager) Utils(com.github.ambry.utils.Utils) IOException(java.io.IOException) Test(org.junit.Test) Collectors(java.util.stream.Collectors) InstanceConfig(org.apache.helix.model.InstanceConfig) File(java.io.File) ZNRecord(org.apache.helix.zookeeper.datamodel.ZNRecord) Mockito(org.mockito.Mockito) List(java.util.List) ByteBufferInputStream(com.github.ambry.utils.ByteBufferInputStream) InstanceType(org.apache.helix.InstanceType) ClusterMapConfig(com.github.ambry.config.ClusterMapConfig) Gauge(com.codahale.metrics.Gauge) Assert(org.junit.Assert) Collections(java.util.Collections) ResponseHandler(com.github.ambry.commons.ResponseHandler) InputStream(java.io.InputStream) VerifiableProperties(com.github.ambry.config.VerifiableProperties) MetricRegistry(com.codahale.metrics.MetricRegistry) Properties(java.util.Properties) VerifiableProperties(com.github.ambry.config.VerifiableProperties) 
ClusterMapConfig(com.github.ambry.config.ClusterMapConfig) RoutingTableSnapshot(org.apache.helix.spectator.RoutingTableSnapshot) JSONObject(org.json.JSONObject) InstanceConfig(org.apache.helix.model.InstanceConfig) Random(java.util.Random) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 54 with ClusterMapConfig

use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin.

the class HelixBootstrapUpgradeUtil method getSealedPartitionsInDc.

/**
 * Collect the sealed replicas that Helix reports for every data node of the given datacenter.
 * @param dc the datacenter whose data nodes are inspected.
 * @param dcToSealedPartitions per-dc bookkeeping of sealed partitions. The entry for the given dc is replaced with a
 *                             fresh set by this method and then filled with every sealed replica found.
 * @param nodeToNonExistentReplicas per-instance bookkeeping of replicas that appear in the sealed list of a node but
 *                                  are not among the replicas the static clustermap places on that node.
 */
private void getSealedPartitionsInDc(Datacenter dc, Map<String, Set<String>> dcToSealedPartitions, Map<String, Set<String>> nodeToNonExistentReplicas) {
    final String dcName = dc.getName();
    // Register the (initially empty) result set for this dc first, then keep filling it through the local reference.
    final Set<String> sealedInThisDc = new HashSet<>();
    dcToSealedPartitions.put(dcName, sealedInThisDc);
    ClusterMapConfig clusterMapConfig = getClusterMapConfig(clusterName, dcName, null);
    String zkAddress = dataCenterToZkAddress.get(dcName).getZkConnectStrs().get(0);
    try (PropertyStoreToDataNodeConfigAdapter adapter = new PropertyStoreToDataNodeConfigAdapter(zkAddress, clusterMapConfig)) {
        InstanceConfigToDataNodeConfigAdapter.Converter converter = new InstanceConfigToDataNodeConfigAdapter.Converter(clusterMapConfig);
        Set<String> helixInstanceNames = new HashSet<>(getInstanceNamesInHelix(dcName, adapter));
        for (DataNodeId nodeId : dc.getDataNodes()) {
            DataNode node = (DataNode) nodeId;
            // Partition path strings of all replicas the static clustermap places on this node.
            Set<String> partitionsOnNode = staticClusterMap.getReplicas(node)
                .stream()
                .map(replica -> replica.getPartitionId().toPathString())
                .collect(Collectors.toSet());
            String instanceName = getInstanceName(node);
            ensureOrThrow(helixInstanceNames.contains(instanceName), "Instance not present in Helix " + instanceName);
            DataNodeConfig nodeConfig = getDataNodeConfigFromHelix(dcName, instanceName, adapter, converter);
            Set<String> sealedOnNode = nodeConfig.getSealedReplicas();
            if (sealedOnNode == null) {
                // Nothing recorded as sealed for this node.
                continue;
            }
            for (String sealed : sealedOnNode) {
                info("Replica {} is sealed on {}", sealed, instanceName);
                sealedInThisDc.add(sealed);
                if (partitionsOnNode.contains(sealed)) {
                    continue;
                }
                // Sealed in Helix, yet the static clustermap has no such replica on this node: record it.
                logger.warn("Replica {} is in sealed list but not on node {}", sealed, instanceName);
                nodeToNonExistentReplicas.computeIfAbsent(instanceName, ignored -> new HashSet<>()).add(sealed);
            }
        }
    }
}
Also used : Arrays(java.util.Arrays) ClusterMapUtils(com.github.ambry.clustermap.ClusterMapUtils) SortedSet(java.util.SortedSet) LoggerFactory(org.slf4j.LoggerFactory) JSONException(org.json.JSONException) JSONObject(org.json.JSONObject) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Map(java.util.Map) SharedZkClientFactory(org.apache.helix.zookeeper.impl.factory.SharedZkClientFactory) AccessOption(org.apache.helix.AccessOption) EnumSet(java.util.EnumSet) LeaderStandbySMD(org.apache.helix.model.LeaderStandbySMD) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) ZKUtil(org.apache.helix.manager.zk.ZKUtil) Set(java.util.Set) Utils(com.github.ambry.utils.Utils) HelixPropertyStoreConfig(com.github.ambry.config.HelixPropertyStoreConfig) Collectors(java.util.stream.Collectors) Objects(java.util.Objects) ZNRecord(org.apache.helix.zookeeper.datamodel.ZNRecord) CountDownLatch(java.util.concurrent.CountDownLatch) List(java.util.List) Optional(java.util.Optional) IdealState(org.apache.helix.model.IdealState) CommonUtils(com.github.ambry.commons.CommonUtils) HashMap(java.util.HashMap) HelixZkClient(org.apache.helix.zookeeper.api.client.HelixZkClient) ResourceConfig(org.apache.helix.model.ResourceConfig) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) HelixPropertyStore(org.apache.helix.store.HelixPropertyStore) RealmAwareZkClient(org.apache.helix.zookeeper.api.client.RealmAwareZkClient) DataNodeConfigSourceType(com.github.ambry.clustermap.DataNodeConfigSourceType) StateModelDefinition(org.apache.helix.model.StateModelDefinition) Properties(java.util.Properties) Logger(org.slf4j.Logger) VerifiableProperties(com.github.ambry.config.VerifiableProperties) IOException(java.io.IOException) InstanceConfig(org.apache.helix.model.InstanceConfig) File(java.io.File) TimeUnit(java.util.concurrent.TimeUnit) HelixAdmin(org.apache.helix.HelixAdmin) TreeMap(java.util.TreeMap) 
ZNRecordSerializer(org.apache.helix.manager.zk.ZNRecordSerializer) ClusterMapConfig(com.github.ambry.config.ClusterMapConfig) ZKHelixAdmin(org.apache.helix.manager.zk.ZKHelixAdmin) Comparator(java.util.Comparator) Collections(java.util.Collections) ClusterMapConfig(com.github.ambry.config.ClusterMapConfig) HashSet(java.util.HashSet)

Example 55 with ClusterMapConfig

use of com.github.ambry.config.ClusterMapConfig in project ambry by linkedin.

the class DatacenterTest method validation.

/**
 * Exercises {@link Datacenter} validation with three inputs: a null hardware layout (an
 * {@link IllegalStateException} is expected), an empty datacenter name, and data nodes obtained from
 * {@code getDataNodesPartiallyRackAware()}. The latter two are handed to {@code failValidation}.
 */
@Test
public void validation() throws JSONException {
    ClusterMapConfig config = new ClusterMapConfig(new VerifiableProperties(props));
    JSONObject datacenterJson;

    // Case 1: null HardwareLayout. Construction has to throw; reaching fail() means it did not.
    try {
        datacenterJson = TestUtils.getJsonDatacenter("XYZ1", (byte) 1, getDataNodes());
        new Datacenter(null, datacenterJson, config);
        fail("Should have failed validation.");
    } catch (IllegalStateException expected) {
        // Expected.
    }

    // Case 2: bad (empty) datacenter name.
    datacenterJson = TestUtils.getJsonDatacenter("", (byte) 1, getDataNodes());
    failValidation(datacenterJson, config);

    // Case 3: missing rack IDs.
    datacenterJson = TestUtils.getJsonDatacenter("XYZ1", (byte) 1, getDataNodesPartiallyRackAware());
    failValidation(datacenterJson, config);
}
Also used : JSONObject(org.json.JSONObject) VerifiableProperties(com.github.ambry.config.VerifiableProperties) ClusterMapConfig(com.github.ambry.config.ClusterMapConfig) Test(org.junit.Test)

Aggregations

ClusterMapConfig (com.github.ambry.config.ClusterMapConfig)100 VerifiableProperties (com.github.ambry.config.VerifiableProperties)81 Test (org.junit.Test)56 Properties (java.util.Properties)52 MetricRegistry (com.codahale.metrics.MetricRegistry)47 ArrayList (java.util.ArrayList)31 IOException (java.io.IOException)26 HashSet (java.util.HashSet)25 JSONObject (org.json.JSONObject)25 File (java.io.File)24 ClusterMap (com.github.ambry.clustermap.ClusterMap)23 HashMap (java.util.HashMap)21 MockClusterMap (com.github.ambry.clustermap.MockClusterMap)19 ClusterAgentsFactory (com.github.ambry.clustermap.ClusterAgentsFactory)18 DataNodeId (com.github.ambry.clustermap.DataNodeId)18 StoreConfig (com.github.ambry.config.StoreConfig)18 ReplicaId (com.github.ambry.clustermap.ReplicaId)16 List (java.util.List)16 Map (java.util.Map)16 CountDownLatch (java.util.concurrent.CountDownLatch)16