Example 36 with ZNRecord

Use of org.apache.helix.zookeeper.datamodel.ZNRecord in project ambry by linkedin.

From the class HelixBootstrapUpgradeToolTest, method testDisablePartitionAdminOp.

/**
 * Tests that when AdminOperation is set to DisablePartition, the Helix bootstrap tool can disable a certain
 * partition without changing the IdealState or InstanceConfig. (In practice, this is the first step in
 * decommissioning a replica.)
 * @throws Exception
 */
@Test
public void testDisablePartitionAdminOp() throws Exception {
    String clusterName = CLUSTER_NAME_PREFIX + CLUSTER_NAME_IN_STATIC_CLUSTER_MAP;
    // Perform a regular bootstrap first. This ensures the DataNodeConfig and IdealState exist before testing
    // disabling a certain replica on a specific node.
    long expectedResourceCount = (testPartitionLayout.getPartitionLayout().getPartitionCount() - 1) / DEFAULT_MAX_PARTITIONS_PER_RESOURCE + 1;
    writeBootstrapOrUpgrade(expectedResourceCount, false);
    int totalPartitionCount = testPartitionLayout.getPartitionCount();
    // Randomly pick a partition to remove one of its replicas
    Partition testPartition = (Partition) testPartitionLayout.getPartitionLayout().getPartitions(null).get(RANDOM.nextInt(totalPartitionCount));
    ReplicaId removedReplica = testPartition.getReplicaIds().stream().filter(r -> r.getDataNodeId().getDatacenterName().equals("DC1")).findFirst().get();
    testPartition.getReplicas().remove(removedReplica);
    ZkInfo zkInfo = dcsToZkInfo.get(removedReplica.getDataNodeId().getDatacenterName());
    // create a participant that hosts this removed replica
    Properties props = new Properties();
    props.setProperty("clustermap.host.name", "localhost");
    props.setProperty("clustermap.port", String.valueOf(removedReplica.getDataNodeId().getPort()));
    props.setProperty("clustermap.cluster.name", clusterName);
    props.setProperty("clustermap.datacenter.name", "DC1");
    props.setProperty("clustermap.update.datanode.info", Boolean.toString(true));
    props.setProperty("clustermap.dcs.zk.connect.strings", zkJson.toString(2));
    props.setProperty("clustermap.retry.disable.partition.completion.backoff.ms", Integer.toString(100));
    props.setProperty("clustermap.data.node.config.source.type", dataNodeConfigSourceType.name());
    ClusterMapConfig clusterMapConfig = new ClusterMapConfig(new VerifiableProperties(props));
    HelixParticipant helixParticipant = new HelixParticipant(clusterMapConfig, new HelixFactory(), new MetricRegistry(), "localhost:" + zkInfo.getPort(), true);
    PropertyStoreToDataNodeConfigAdapter propertyStoreAdapter = dataNodeConfigSourceType == DataNodeConfigSourceType.INSTANCE_CONFIG ? null : new PropertyStoreToDataNodeConfigAdapter("localhost:" + zkInfo.getPort(), clusterMapConfig);
    InstanceConfigToDataNodeConfigAdapter.Converter instanceConfigConverter = new InstanceConfigToDataNodeConfigAdapter.Converter(clusterMapConfig);
    // create HelixAdmin
    ZKHelixAdmin admin = new ZKHelixAdmin("localhost:" + zkInfo.getPort());
    // Write changes to static files
    Utils.writeJsonObjectToFile(zkJson, zkLayoutPath);
    Utils.writeJsonObjectToFile(testHardwareLayout.getHardwareLayout().toJSONObject(), hardwareLayoutPath);
    Utils.writeJsonObjectToFile(testPartitionLayout.getPartitionLayout().toJSONObject(), partitionLayoutPath);
    // Block the bootstrap tool on a countdown latch before it removes the znodes used for disabling partitions
    blockRemovingNodeLatch = new CountDownLatch(1);
    disablePartitionLatch = new CountDownLatch(activeDcSet.size());
    CountDownLatch bootstrapCompletionLatch = new CountDownLatch(1);
    Utils.newThread(() -> {
        try {
            // Upgrade Helix by updating IdealState: AdminOperation = DisablePartition
            HelixBootstrapUpgradeUtil.bootstrapOrUpgrade(hardwareLayoutPath, partitionLayoutPath, zkLayoutPath, CLUSTER_NAME_PREFIX, dcStr, DEFAULT_MAX_PARTITIONS_PER_RESOURCE, false, false, new HelixAdminFactory(), false, ClusterMapConfig.DEFAULT_STATE_MODEL_DEF, DisablePartition, dataNodeConfigSourceType, false);
            bootstrapCompletionLatch.countDown();
        } catch (Exception e) {
            // Do nothing; if an exception occurs here, the subsequent assertions will fail.
        }
    }, false).start();
    assertTrue("Disable partition latch didn't come down within 5 seconds", disablePartitionLatch.await(5, TimeUnit.SECONDS));
    // Let's attempt to update InstanceConfig/DataNodeConfig via HelixParticipant, which should be blocked
    CountDownLatch updateCompletionLatch = new CountDownLatch(1);
    Utils.newThread(() -> {
        helixParticipant.updateDataNodeInfoInCluster(removedReplica, false);
        updateCompletionLatch.countDown();
    }, false).start();
    // Sleep 100 ms to ensure updateDataNodeInfoInCluster is blocked because disabling the partition hasn't completed yet
    Thread.sleep(100);
    // Ensure the DataNodeConfig still has the replica
    String instanceName = getInstanceName(removedReplica.getDataNodeId());
    DataNodeConfig currentDataNodeConfig = dataNodeConfigSourceType == DataNodeConfigSourceType.INSTANCE_CONFIG ? instanceConfigConverter.convert(admin.getInstanceConfig(clusterName, instanceName)) : propertyStoreAdapter.get(instanceName);
    verifyReplicaInfoInDataNodeConfig(currentDataNodeConfig, removedReplica, true);
    // verify the znode is created for the node on which partition has been disabled.
    Properties properties = new Properties();
    properties.setProperty("helix.property.store.root.path", "/" + clusterName + "/" + PROPERTYSTORE_STR);
    HelixPropertyStoreConfig propertyStoreConfig = new HelixPropertyStoreConfig(new VerifiableProperties(properties));
    HelixPropertyStore<ZNRecord> helixPropertyStore = CommonUtils.createHelixPropertyStore("localhost:" + zkInfo.getPort(), propertyStoreConfig, null);
    String path = PARTITION_DISABLED_ZNODE_PATH + getInstanceName(removedReplica.getDataNodeId());
    assertTrue("ZNode is not found for disabled partition node.", helixPropertyStore.exists(path, AccessOption.PERSISTENT));
    helixPropertyStore.stop();
    // unblock HelixBootstrapTool
    blockRemovingNodeLatch.countDown();
    // waiting for bootstrap tool to complete
    assertTrue("Helix tool didn't complete within 5 seconds", bootstrapCompletionLatch.await(5, TimeUnit.SECONDS));
    verifyResourceCount(testHardwareLayout.getHardwareLayout(), expectedResourceCount);
    assertTrue("Helix participant didn't complete update within 5 seconds", updateCompletionLatch.await(5, TimeUnit.SECONDS));
    InstanceConfig currentInstanceConfig = admin.getInstanceConfig(clusterName, getInstanceName(removedReplica.getDataNodeId()));
    // Verify that replica has been disabled
    String resourceName = null;
    for (String rs : admin.getResourcesInCluster(clusterName)) {
        IdealState is = admin.getResourceIdealState(clusterName, rs);
        if (is.getPartitionSet().contains(removedReplica.getPartitionId().toPathString())) {
            resourceName = rs;
            break;
        }
    }
    List<String> disabledPartition = currentInstanceConfig.getDisabledPartitions(resourceName);
    assertEquals("Disabled partition not as expected", Collections.singletonList(removedReplica.getPartitionId().toPathString()), disabledPartition);
    // Verify that IdealState has no change
    verifyIdealStateForPartition(removedReplica, true, 3, expectedResourceCount);
    // Verify the InstanceConfig's map fields have changed (disabled partitions are added to this field, and the replica entry has been removed)
    String disabledPartitionStr = currentInstanceConfig.getRecord().getMapFields().keySet().stream().filter(k -> !k.startsWith("/mnt")).findFirst().get();
    // Verify the disabled partition string contains correct partition
    Map<String, String> expectedDisabledPartitionMap = new HashMap<>();
    expectedDisabledPartitionMap.put(resourceName, removedReplica.getPartitionId().toPathString());
    assertEquals("Mismatch in disabled partition string in InstanceConfig", expectedDisabledPartitionMap, currentInstanceConfig.getRecord().getMapField(disabledPartitionStr));
    // verify the removed replica is no longer in InstanceConfig
    currentDataNodeConfig = dataNodeConfigSourceType == DataNodeConfigSourceType.INSTANCE_CONFIG ? instanceConfigConverter.convert(admin.getInstanceConfig(clusterName, instanceName)) : propertyStoreAdapter.get(instanceName);
    verifyReplicaInfoInDataNodeConfig(currentDataNodeConfig, removedReplica, false);
    if (propertyStoreAdapter != null) {
        propertyStoreAdapter.close();
    }
}
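The latch choreography above is the heart of the test and is easy to misread. Below is a minimal, self-contained sketch of the same pattern with illustrative names (the comments map each latch to its counterpart in the test); it demonstrates the coordination idea only and is not Ambry code:

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

public class LatchCoordinationDemo {

    public static void main(String[] args) throws InterruptedException {
        // reachedCriticalPoint plays the role of disablePartitionLatch: the worker
        // signals that it has reached the point we want to observe.
        CountDownLatch reachedCriticalPoint = new CountDownLatch(1);
        // releaseWorker plays the role of blockRemovingNodeLatch: the worker parks
        // on it until the main thread has finished its mid-flight assertions.
        CountDownLatch releaseWorker = new CountDownLatch(1);
        // workerDone plays the role of bootstrapCompletionLatch.
        CountDownLatch workerDone = new CountDownLatch(1);

        new Thread(() -> {
            try {
                reachedCriticalPoint.countDown();
                releaseWorker.await();
                // ... the delayed part of the operation would run here ...
                workerDone.countDown();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }).start();

        // Wait until the worker is parked at the critical point.
        if (!reachedCriticalPoint.await(5, TimeUnit.SECONDS)) {
            throw new IllegalStateException("Worker never reached the critical point");
        }
        // ... inspect intermediate state here while the worker is blocked ...
        releaseWorker.countDown();
        if (!workerDone.await(5, TimeUnit.SECONDS)) {
            throw new IllegalStateException("Worker did not complete");
        }
        System.out.println("Coordinated handoff complete");
    }
}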
Also used: HashMap (java.util.HashMap), ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap), HelixPropertyStoreConfig (com.github.ambry.config.HelixPropertyStoreConfig), Properties (java.util.Properties), VerifiableProperties (com.github.ambry.config.VerifiableProperties), IdealState (org.apache.helix.model.IdealState), ZKHelixAdmin (org.apache.helix.manager.zk.ZKHelixAdmin), InstanceConfig (org.apache.helix.model.InstanceConfig), ZNRecord (org.apache.helix.zookeeper.datamodel.ZNRecord), MetricRegistry (com.codahale.metrics.MetricRegistry), CountDownLatch (java.util.concurrent.CountDownLatch), ClusterMapConfig (com.github.ambry.config.ClusterMapConfig), HelixException (org.apache.helix.HelixException), JSONException (org.json.JSONException), IOException (java.io.IOException), Test (org.junit.Test)
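For readers who want to poke at the same state outside the test, here is a minimal sketch of how the disabled-partition information could be inspected through ZKHelixAdmin and the backing ZNRecord. The ZooKeeper address, cluster, instance, and resource names are placeholders, and it assumes a Helix version where InstanceConfig.getDisabledPartitions(resource) is available (the test itself relies on that accessor):

import java.util.List;
import java.util.Map;

import org.apache.helix.manager.zk.ZKHelixAdmin;
import org.apache.helix.model.InstanceConfig;
import org.apache.helix.zookeeper.datamodel.ZNRecord;

public class DisabledPartitionInspector {

    public static void main(String[] args) {
        // All four values below are placeholders; substitute your own deployment's.
        String zkAddress = "localhost:2181";
        String clusterName = "MyCluster";
        String instanceName = "localhost_15088";
        String resourceName = "0";

        ZKHelixAdmin admin = new ZKHelixAdmin(zkAddress);
        try {
            InstanceConfig instanceConfig = admin.getInstanceConfig(clusterName, instanceName);

            // High-level accessor, the same one the test asserts against.
            List<String> disabled = instanceConfig.getDisabledPartitions(resourceName);
            System.out.println("Disabled partitions for resource " + resourceName + ": " + disabled);

            // Lower-level view: InstanceConfig is backed by a ZNRecord, and the
            // disabled-partition bookkeeping lives in its map fields, which is
            // what the test inspects via getRecord().getMapFields().
            ZNRecord record = instanceConfig.getRecord();
            for (Map.Entry<String, Map<String, String>> entry : record.getMapFields().entrySet()) {
                System.out.println("Map field " + entry.getKey() + " -> " + entry.getValue());
            }
        } finally {
            admin.close();
        }
    }
}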

Example 37 with ZNRecord

Use of org.apache.helix.zookeeper.datamodel.ZNRecord in project ambry by linkedin.

From the class HelixBootstrapUpgradeToolTest, method uploadClusterConfigsAndVerify.

/**
 * Write the layout files out from the constructed in-memory hardware and partition layouts; use the upload cluster
 * config tool to upload the partition seal states to ZooKeeper; verify that the writable partitions are consistent
 * between the two. After verification is done, the partition override config is safely deleted.
 * @throws IOException if a file read error is encountered.
 * @throws JSONException if a JSON parse error is encountered.
 */
private void uploadClusterConfigsAndVerify() throws Exception {
    List<PartitionId> writablePartitions = testPartitionLayout.getPartitionLayout().getWritablePartitions(null);
    Set<String> writableInPartitionLayout = new HashSet<>();
    writablePartitions.forEach(k -> writableInPartitionLayout.add(k.toPathString()));
    Utils.writeJsonObjectToFile(zkJson, zkLayoutPath);
    Utils.writeJsonObjectToFile(testHardwareLayout.getHardwareLayout().toJSONObject(), hardwareLayoutPath);
    Utils.writeJsonObjectToFile(testPartitionLayout.getPartitionLayout().toJSONObject(), partitionLayoutPath);
    HelixBootstrapUpgradeUtil.uploadOrDeleteAdminConfigs(hardwareLayoutPath, partitionLayoutPath, zkLayoutPath, CLUSTER_NAME_PREFIX, dcStr, false, new String[] { ClusterMapUtils.PARTITION_OVERRIDE_STR }, null);
    // Check writable partitions in each datacenter
    for (ZkInfo zkInfo : dcsToZkInfo.values()) {
        HelixPropertyStore<ZNRecord> propertyStore = CommonUtils.createHelixPropertyStore("localhost:" + zkInfo.getPort(), propertyStoreConfig, Collections.singletonList(propertyStoreConfig.rootPath));
        ZNRecord zNRecord = propertyStore.get(ClusterMapUtils.PARTITION_OVERRIDE_ZNODE_PATH, null, AccessOption.PERSISTENT);
        if (!activeDcSet.contains(zkInfo.getDcName())) {
            assertNull(zNRecord);
        } else {
            assertNotNull(zNRecord);
            Map<String, Map<String, String>> overridePartition = zNRecord.getMapFields();
            Set<String> writableInDC = new HashSet<>();
            for (Map.Entry<String, Map<String, String>> entry : overridePartition.entrySet()) {
                if (entry.getValue().get(ClusterMapUtils.PARTITION_STATE).equals(ClusterMapUtils.READ_WRITE_STR)) {
                    writableInDC.add(entry.getKey());
                }
            }
            // Verify writable partitions in DC match writable partitions in Partition Layout
            assertEquals("Mismatch in writable partitions for partitionLayout and propertyStore", writableInPartitionLayout, writableInDC);
        }
    }
    // delete partition override config
    HelixBootstrapUpgradeUtil.uploadOrDeleteAdminConfigs(hardwareLayoutPath, partitionLayoutPath, zkLayoutPath, CLUSTER_NAME_PREFIX, dcStr, true, new String[] { ClusterMapUtils.PARTITION_OVERRIDE_STR }, null);
    // verify that the config is cleaned up
    for (ZkInfo zkInfo : dcsToZkInfo.values()) {
        HelixPropertyStore<ZNRecord> propertyStore = CommonUtils.createHelixPropertyStore("localhost:" + zkInfo.getPort(), propertyStoreConfig, Collections.singletonList(propertyStoreConfig.rootPath));
        ZNRecord zNRecord = propertyStore.get(ClusterMapUtils.PARTITION_OVERRIDE_ZNODE_PATH, null, AccessOption.PERSISTENT);
        assertNull("Partition override config should no longer exist", zNRecord);
    }
}
Also used: HashMap (java.util.HashMap), Map (java.util.Map), ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap), ZNRecord (org.apache.helix.zookeeper.datamodel.ZNRecord), HashSet (java.util.HashSet)
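To make the shape of the override record concrete, here is a minimal, self-contained sketch that builds a ZNRecord the way the partition override ZNode is laid out and derives the writable set exactly as the verification loop above does. The "partitionState" key and the "RW"/"RO" values are illustrative stand-ins for the ClusterMapUtils constants, not necessarily the literal strings Ambry uses:

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.helix.zookeeper.datamodel.ZNRecord;

public class PartitionOverrideDemo {

    public static void main(String[] args) {
        // Stand-ins for ClusterMapUtils.PARTITION_STATE / READ_WRITE_STR / READ_ONLY_STR.
        String partitionStateKey = "partitionState";
        String readWrite = "RW";
        String readOnly = "RO";

        // Each map field maps a partition id to its attributes, including seal state.
        ZNRecord record = new ZNRecord("PartitionOverride");
        Map<String, String> partition0 = new HashMap<>();
        partition0.put(partitionStateKey, readWrite);
        record.setMapField("0", partition0);
        Map<String, String> partition1 = new HashMap<>();
        partition1.put(partitionStateKey, readOnly);
        record.setMapField("1", partition1);

        // Derive the writable set the same way the verification loop does.
        Set<String> writableInDC = new HashSet<>();
        for (Map.Entry<String, Map<String, String>> entry : record.getMapFields().entrySet()) {
            if (readWrite.equals(entry.getValue().get(partitionStateKey))) {
                writableInDC.add(entry.getKey());
            }
        }
        System.out.println("Writable partitions: " + writableInDC); // prints [0]
    }
}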

Aggregations

ZNRecord (org.apache.helix.zookeeper.datamodel.ZNRecord): 37
HashMap (java.util.HashMap): 19
Test (org.junit.Test): 18
Map (java.util.Map): 10
ArrayList (java.util.ArrayList): 8
ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 8
VerifiableProperties (com.github.ambry.config.VerifiableProperties): 6
HashSet (java.util.HashSet): 6
InstanceConfig (org.apache.helix.model.InstanceConfig): 6
MetricRegistry (com.codahale.metrics.MetricRegistry): 5
Properties (java.util.Properties): 5
Stat (org.apache.zookeeper.data.Stat): 5
ClusterMapConfig (com.github.ambry.config.ClusterMapConfig): 4
HelixPropertyStoreConfig (com.github.ambry.config.HelixPropertyStoreConfig): 4
IOException (java.io.IOException): 4
JSONObject (org.json.JSONObject): 4
List (java.util.List): 3
Random (java.util.Random): 3
PropertyKey (org.apache.helix.PropertyKey): 3
IdealState (org.apache.helix.model.IdealState): 3