use of org.apache.helix.manager.zk.ZKHelixAdmin in project ambry by linkedin.
the class HelixBootstrapUpgradeToolTest method testUpdateIdealStateAdminOp.
/**
 * Test that when AdminOperation is specified to UpdateIdealState, Helix bootstrap tool updates IdealState only without
 * changing InstanceConfig.
 * <p>
 * The test bootstraps the cluster, then adds a replica to one partition and removes a replica from the next
 * partition in the static layout, runs the tool with {@code UpdateIdealState}, and finally compares the
 * InstanceConfig of the node that received the new replica before and after the run.
 * @throws Exception
 */
@Test
public void testUpdateIdealStateAdminOp() throws Exception {
  String clusterName = CLUSTER_NAME_PREFIX + CLUSTER_NAME_IN_STATIC_CLUSTER_MAP;
  // Test regular bootstrap. This is to ensure InstanceConfig and IdealState are there before testing changing
  // IdealState (to trigger replica movement)
  long expectedResourceCount =
      (testPartitionLayout.getPartitionLayout().getPartitionCount() - 1) / DEFAULT_MAX_PARTITIONS_PER_RESOURCE + 1;
  writeBootstrapOrUpgrade(expectedResourceCount, false);
  // Now, change the replica count for two partitions (a random one and its successor, wrapping around).
  int totalPartitionCount = testPartitionLayout.getPartitionCount();
  int firstPartitionIndex = RANDOM.nextInt(totalPartitionCount);
  int secondPartitionIndex = (firstPartitionIndex + 1) % totalPartitionCount;
  List<PartitionId> allPartitions = testPartitionLayout.getPartitionLayout().getPartitions(null);
  Partition partition1 = (Partition) allPartitions.get(firstPartitionIndex);
  Partition partition2 = (Partition) allPartitions.get(secondPartitionIndex);
  // Add a new replica for partition1. Find a disk on a data node that does not already have a replica for partition1.
  HashSet<DataNodeId> partition1Nodes = new HashSet<>();
  for (ReplicaId replica : partition1.getReplicas()) {
    partition1Nodes.add(replica.getDataNodeId());
  }
  // Keep drawing random disks until one is found in DC1 on a node that does not host partition1 yet.
  Disk diskForNewReplica;
  do {
    diskForNewReplica = testHardwareLayout.getRandomDisk();
  } while (partition1Nodes.contains(diskForNewReplica.getDataNode()) || !diskForNewReplica.getDataNode()
      .getDatacenterName()
      .equals("DC1"));
  // Add new replica into partition1
  ReplicaId replicaToAdd = new Replica(partition1, diskForNewReplica, testHardwareLayout.clusterMapConfig);
  partition1.addReplica(replicaToAdd);
  // Remove a replica from partition2.
  ReplicaId removedReplica = partition2.getReplicas().remove(0);
  String dcName = replicaToAdd.getDataNodeId().getDatacenterName();
  ZkInfo zkInfo = dcsToZkInfo.get(dcName);
  String instanceName = getInstanceName(replicaToAdd.getDataNodeId());
  ZKHelixAdmin admin = new ZKHelixAdmin("localhost:" + zkInfo.getPort());
  try {
    InstanceConfig instanceConfig = admin.getInstanceConfig(clusterName, instanceName);
    // copy taken before the upgrade, used for the comparison at the end of the test
    InstanceConfig previousInstanceConfig = new InstanceConfig(instanceConfig.getRecord());
    Utils.writeJsonObjectToFile(zkJson, zkLayoutPath);
    Utils.writeJsonObjectToFile(testHardwareLayout.getHardwareLayout().toJSONObject(), hardwareLayoutPath);
    Utils.writeJsonObjectToFile(testPartitionLayout.getPartitionLayout().toJSONObject(), partitionLayoutPath);
    // upgrade Helix by updating IdealState: AdminOperation = UpdateIdealState
    HelixBootstrapUpgradeUtil.bootstrapOrUpgrade(hardwareLayoutPath, partitionLayoutPath, zkLayoutPath,
        CLUSTER_NAME_PREFIX, dcStr, DEFAULT_MAX_PARTITIONS_PER_RESOURCE, false, false, new HelixAdminFactory(), false,
        ClusterMapConfig.DEFAULT_STATE_MODEL_DEF, UpdateIdealState, dataNodeConfigSourceType, false);
    verifyResourceCount(testHardwareLayout.getHardwareLayout(), expectedResourceCount);
    // verify IdealState has been updated
    // 1. new added replica is indeed present in the IdealState
    verifyIdealStateForPartition(replicaToAdd, true, 4, expectedResourceCount);
    // 2. removed old replica is no longer present in the IdealState
    verifyIdealStateForPartition(removedReplica, false, 2, expectedResourceCount);
    // verify the InstanceConfig stays unchanged
    InstanceConfig currentInstanceConfig = admin.getInstanceConfig(clusterName, instanceName);
    assertEquals("InstanceConfig should stay unchanged", previousInstanceConfig.getRecord(),
        currentInstanceConfig.getRecord());
  } finally {
    // release the admin's ZooKeeper connection even when an assertion above fails
    admin.close();
  }
}
use of org.apache.helix.manager.zk.ZKHelixAdmin in project ambry by linkedin.
the class HelixBootstrapUpgradeToolTest method testDisablePartitionAdminOp.
/**
 * Test when AdminOperation is specified to DisablePartition, Helix bootstrap tool is able to disable certain partition
 * only without changing IdealState and InstanceConfig. (In practice, this is first step to decommission a replica)
 * <p>
 * The tool runs on a background thread and is held back by {@code blockRemovingNodeLatch}; while it is held, the test
 * checks that a concurrent {@code updateDataNodeInfoInCluster} call has not yet removed the replica and that the
 * "partition disabled" znode exists. After the latch is released it verifies the disabled-partition bookkeeping.
 * @throws Exception
 */
@Test
public void testDisablePartitionAdminOp() throws Exception {
  String clusterName = CLUSTER_NAME_PREFIX + CLUSTER_NAME_IN_STATIC_CLUSTER_MAP;
  // Test regular bootstrap. This is to ensure DataNodeConfig and IdealState are there before testing disabling
  // certain replica on specific node.
  long expectedResourceCount =
      (testPartitionLayout.getPartitionLayout().getPartitionCount() - 1) / DEFAULT_MAX_PARTITIONS_PER_RESOURCE + 1;
  writeBootstrapOrUpgrade(expectedResourceCount, false);
  int totalPartitionCount = testPartitionLayout.getPartitionCount();
  // Randomly pick a partition to remove one of its replicas
  Partition testPartition = (Partition) testPartitionLayout.getPartitionLayout()
      .getPartitions(null)
      .get(RANDOM.nextInt(totalPartitionCount));
  ReplicaId removedReplica = testPartition.getReplicaIds()
      .stream()
      .filter(r -> r.getDataNodeId().getDatacenterName().equals("DC1"))
      .findFirst()
      .orElseThrow(() -> new IllegalStateException("Test partition has no replica in DC1"));
  testPartition.getReplicas().remove(removedReplica);
  ZkInfo zkInfo = dcsToZkInfo.get(removedReplica.getDataNodeId().getDatacenterName());
  String zkConnectStr = "localhost:" + zkInfo.getPort();
  // create a participant that hosts this removed replica
  Properties props = new Properties();
  props.setProperty("clustermap.host.name", "localhost");
  props.setProperty("clustermap.port", String.valueOf(removedReplica.getDataNodeId().getPort()));
  props.setProperty("clustermap.cluster.name", clusterName);
  props.setProperty("clustermap.datacenter.name", "DC1");
  props.setProperty("clustermap.update.datanode.info", Boolean.toString(true));
  props.setProperty("clustermap.dcs.zk.connect.strings", zkJson.toString(2));
  props.setProperty("clustermap.retry.disable.partition.completion.backoff.ms", Integer.toString(100));
  props.setProperty("clustermap.data.node.config.source.type", dataNodeConfigSourceType.name());
  ClusterMapConfig clusterMapConfig = new ClusterMapConfig(new VerifiableProperties(props));
  HelixParticipant helixParticipant =
      new HelixParticipant(clusterMapConfig, new HelixFactory(), new MetricRegistry(), zkConnectStr, true);
  // Local handle on the gate latch so the finally block can always release the background tool thread.
  CountDownLatch releaseToolLatch = new CountDownLatch(1);
  PropertyStoreToDataNodeConfigAdapter propertyStoreAdapter = null;
  ZKHelixAdmin admin = null;
  try {
    // the property store adapter is only needed when DataNodeConfig does not come from InstanceConfig
    propertyStoreAdapter = dataNodeConfigSourceType == DataNodeConfigSourceType.INSTANCE_CONFIG ? null
        : new PropertyStoreToDataNodeConfigAdapter(zkConnectStr, clusterMapConfig);
    InstanceConfigToDataNodeConfigAdapter.Converter instanceConfigConverter =
        new InstanceConfigToDataNodeConfigAdapter.Converter(clusterMapConfig);
    // create HelixAdmin
    admin = new ZKHelixAdmin(zkConnectStr);
    // Write changes to static files
    Utils.writeJsonObjectToFile(zkJson, zkLayoutPath);
    Utils.writeJsonObjectToFile(testHardwareLayout.getHardwareLayout().toJSONObject(), hardwareLayoutPath);
    Utils.writeJsonObjectToFile(testPartitionLayout.getPartitionLayout().toJSONObject(), partitionLayoutPath);
    // make bootstrap tool blocked by count down latch before removing znodes for disabling partitions
    blockRemovingNodeLatch = releaseToolLatch;
    disablePartitionLatch = new CountDownLatch(activeDcSet.size());
    CountDownLatch bootstrapCompletionLatch = new CountDownLatch(1);
    Utils.newThread(() -> {
      try {
        // Upgrade Helix by updating IdealState: AdminOperation = DisablePartition
        HelixBootstrapUpgradeUtil.bootstrapOrUpgrade(hardwareLayoutPath, partitionLayoutPath, zkLayoutPath,
            CLUSTER_NAME_PREFIX, dcStr, DEFAULT_MAX_PARTITIONS_PER_RESOURCE, false, false, new HelixAdminFactory(),
            false, ClusterMapConfig.DEFAULT_STATE_MODEL_DEF, DisablePartition, dataNodeConfigSourceType, false);
        bootstrapCompletionLatch.countDown();
      } catch (Exception e) {
        // do nothing, if there is any exception subsequent test should fail.
      }
    }, false).start();
    assertTrue("Disable partition latch didn't come down within 5 seconds",
        disablePartitionLatch.await(5, TimeUnit.SECONDS));
    // Let's attempt to update InstanceConfig/DataNodeConfig via HelixParticipant, which should be blocked
    CountDownLatch updateCompletionLatch = new CountDownLatch(1);
    Utils.newThread(() -> {
      helixParticipant.updateDataNodeInfoInCluster(removedReplica, false);
      updateCompletionLatch.countDown();
    }, false).start();
    // sleep 100 ms to ensure updateDataNodeInfoInCluster is blocked due to disabling partition hasn't completed yet
    Thread.sleep(100);
    // Ensure the DataNodeConfig still has the replica
    String instanceName = getInstanceName(removedReplica.getDataNodeId());
    DataNodeConfig currentDataNodeConfig = dataNodeConfigSourceType == DataNodeConfigSourceType.INSTANCE_CONFIG
        ? instanceConfigConverter.convert(admin.getInstanceConfig(clusterName, instanceName))
        : propertyStoreAdapter.get(instanceName);
    verifyReplicaInfoInDataNodeConfig(currentDataNodeConfig, removedReplica, true);
    // verify the znode is created for the node on which partition has been disabled.
    Properties properties = new Properties();
    properties.setProperty("helix.property.store.root.path", "/" + clusterName + "/" + PROPERTYSTORE_STR);
    HelixPropertyStoreConfig propertyStoreConfig = new HelixPropertyStoreConfig(new VerifiableProperties(properties));
    HelixPropertyStore<ZNRecord> helixPropertyStore =
        CommonUtils.createHelixPropertyStore(zkConnectStr, propertyStoreConfig, null);
    try {
      String path = PARTITION_DISABLED_ZNODE_PATH + getInstanceName(removedReplica.getDataNodeId());
      assertTrue("ZNode is not found for disabled partition node.",
          helixPropertyStore.exists(path, AccessOption.PERSISTENT));
    } finally {
      // stop the store even when the znode assertion fails
      helixPropertyStore.stop();
    }
    // unblock HelixBootstrapTool
    blockRemovingNodeLatch.countDown();
    // waiting for bootstrap tool to complete
    assertTrue("Helix tool didn't complete within 5 seconds", bootstrapCompletionLatch.await(5, TimeUnit.SECONDS));
    verifyResourceCount(testHardwareLayout.getHardwareLayout(), expectedResourceCount);
    assertTrue("Helix participant didn't complete update within 5 seconds",
        updateCompletionLatch.await(5, TimeUnit.SECONDS));
    InstanceConfig currentInstanceConfig =
        admin.getInstanceConfig(clusterName, getInstanceName(removedReplica.getDataNodeId()));
    // Verify that replica has been disabled
    // First locate the resource whose IdealState contains the removed replica's partition.
    String resourceName = null;
    for (String rs : admin.getResourcesInCluster(clusterName)) {
      IdealState is = admin.getResourceIdealState(clusterName, rs);
      if (is.getPartitionSet().contains(removedReplica.getPartitionId().toPathString())) {
        resourceName = rs;
        break;
      }
    }
    List<String> disabledPartition = currentInstanceConfig.getDisabledPartitions(resourceName);
    assertEquals("Disabled partition not as expected",
        Collections.singletonList(removedReplica.getPartitionId().toPathString()), disabledPartition);
    // Verify that IdealState has no change
    verifyIdealStateForPartition(removedReplica, true, 3, expectedResourceCount);
    // Verify the InstanceConfig is changed in MapFields (Disabled partitions are added to this field, also the replica
    // entry has been removed)
    // The first map field key that does not start with "/mnt" is taken to be the disabled-partition entry.
    String disabledPartitionStr = currentInstanceConfig.getRecord()
        .getMapFields()
        .keySet()
        .stream()
        .filter(k -> !k.startsWith("/mnt"))
        .findFirst()
        .orElseThrow(() -> new AssertionError("No disabled partition map field found in InstanceConfig"));
    // Verify the disabled partition string contains correct partition
    Map<String, String> expectedDisabledPartitionMap = new HashMap<>();
    expectedDisabledPartitionMap.put(resourceName, removedReplica.getPartitionId().toPathString());
    assertEquals("Mismatch in disabled partition string in InstanceConfig", expectedDisabledPartitionMap,
        currentInstanceConfig.getRecord().getMapField(disabledPartitionStr));
    // verify the removed replica is no longer in InstanceConfig
    currentDataNodeConfig = dataNodeConfigSourceType == DataNodeConfigSourceType.INSTANCE_CONFIG
        ? instanceConfigConverter.convert(admin.getInstanceConfig(clusterName, instanceName))
        : propertyStoreAdapter.get(instanceName);
    verifyReplicaInfoInDataNodeConfig(currentDataNodeConfig, removedReplica, false);
  } finally {
    // Never leave the (non-daemon) bootstrap thread parked on the gate if the test fails early;
    // counting down an already-released latch is a no-op.
    releaseToolLatch.countDown();
    if (admin != null) {
      admin.close();
    }
    if (propertyStoreAdapter != null) {
      propertyStoreAdapter.close();
    }
  }
}
use of org.apache.helix.manager.zk.ZKHelixAdmin in project ambry by linkedin.
the class TestUtils method isSrcDestSync.
/**
 * A method to verify resources and partitions in src cluster and dest cluster are same.
 * <p>
 * Every resource of the source cluster whose name does not contain one of
 * {@code HelixVcrUtil.ignoreResourceKeyWords} must exist in the destination cluster, and every partition in the
 * source resource's IdealState must also be in the destination's. Extra resources or partitions that exist only in
 * the destination cluster are not checked.
 * @param srcZkString ZooKeeper connect string of the source cluster.
 * @param srcClusterName name of the source cluster.
 * @param destZkString ZooKeeper connect string of the destination cluster.
 * @param destClusterName name of the destination cluster.
 * @return {@code true} if the destination cluster covers all non-ignored source resources and their partitions,
 *         {@code false} otherwise.
 */
public static boolean isSrcDestSync(String srcZkString, String srcClusterName, String destZkString,
    String destClusterName) {
  HelixAdmin srcAdmin = new ZKHelixAdmin(srcZkString);
  try {
    Set<String> srcResources = new HashSet<>(srcAdmin.getResourcesInCluster(srcClusterName));
    HelixAdmin destAdmin = new ZKHelixAdmin(destZkString);
    try {
      Set<String> destResources = new HashSet<>(destAdmin.getResourcesInCluster(destClusterName));
      for (String resource : srcResources) {
        if (HelixVcrUtil.ignoreResourceKeyWords.stream().anyMatch(resource::contains)) {
          System.out.println("Resource " + resource + " from src cluster is ignored");
          continue;
        }
        if (!destResources.contains(resource)) {
          return false;
        }
        // check if every partition exist.
        Set<String> srcPartitions = srcAdmin.getResourceIdealState(srcClusterName, resource).getPartitionSet();
        Set<String> destPartitions = destAdmin.getResourceIdealState(destClusterName, resource).getPartitionSet();
        if (!destPartitions.containsAll(srcPartitions)) {
          return false;
        }
      }
      return true;
    } finally {
      // both admins own a ZooKeeper connection; release them on every return path
      destAdmin.close();
    }
  } finally {
    srcAdmin.close();
  }
}
use of org.apache.helix.manager.zk.ZKHelixAdmin in project ambry by linkedin.
the class HelixBootstrapUpgradeToolTest method testAddStateModelDef.
/**
 * Test {@link HelixBootstrapUpgradeUtil} addStateModelDef() method.
 * <p>
 * Covers three cases: adding a state model to a cluster that does not exist yet (expected to throw
 * {@link IllegalStateException}), adding a new state model to a bootstrapped cluster, and re-adding the state model
 * the cluster was bootstrapped with.
 * @throws Exception
 */
@Test
public void testAddStateModelDef() throws Exception {
  Utils.writeJsonObjectToFile(zkJson, zkLayoutPath);
  Utils.writeJsonObjectToFile(testHardwareLayout.getHardwareLayout().toJSONObject(), hardwareLayoutPath);
  Utils.writeJsonObjectToFile(testPartitionLayout.getPartitionLayout().toJSONObject(), partitionLayoutPath);
  // test add state model to non-exist cluster, which should fail
  try {
    HelixBootstrapUpgradeUtil.addStateModelDef(hardwareLayoutPath, partitionLayoutPath, zkLayoutPath,
        CLUSTER_NAME_PREFIX, dcStr, ClusterMapConfig.AMBRY_STATE_MODEL_DEF);
    fail("should fail due to non-exist cluster");
  } catch (IllegalStateException e) {
    // expected
  }
  // bootstrap a cluster
  HelixBootstrapUpgradeUtil.bootstrapOrUpgrade(hardwareLayoutPath, partitionLayoutPath, zkLayoutPath,
      CLUSTER_NAME_PREFIX, dcStr, DEFAULT_MAX_PARTITIONS_PER_RESOURCE, false, false, new HelixAdminFactory(), false,
      ClusterMapConfig.OLD_STATE_MODEL_DEF, BootstrapCluster, dataNodeConfigSourceType, false);
  // add new state model def
  HelixBootstrapUpgradeUtil.addStateModelDef(hardwareLayoutPath, partitionLayoutPath, zkLayoutPath,
      CLUSTER_NAME_PREFIX, dcStr, ClusterMapConfig.AMBRY_STATE_MODEL_DEF);
  // add existing state model def should be no-op
  HelixBootstrapUpgradeUtil.addStateModelDef(hardwareLayoutPath, partitionLayoutPath, zkLayoutPath,
      CLUSTER_NAME_PREFIX, dcStr, ClusterMapConfig.OLD_STATE_MODEL_DEF);
  // ensure that active dcs have new state model def
  String clusterName = CLUSTER_NAME_PREFIX + CLUSTER_NAME_IN_STATIC_CLUSTER_MAP;
  for (Datacenter dc : testHardwareLayout.getHardwareLayout().getDatacenters()) {
    ZkInfo zkInfo = dcsToZkInfo.get(dc.getName());
    ZKHelixAdmin admin = new ZKHelixAdmin("localhost:" + zkInfo.getPort());
    try {
      if (!activeDcSet.contains(dc.getName())) {
        Assert.assertFalse("Cluster should not be present, as dc " + dc.getName() + " is not enabled",
            admin.getClusters().contains(clusterName));
      } else {
        assertEquals("Mismatch in number of state model defs in cluster", 2,
            admin.getStateModelDefs(clusterName).size());
        assertTrue("Missing ambry state model in cluster",
            admin.getStateModelDefs(clusterName).contains(ClusterMapConfig.AMBRY_STATE_MODEL_DEF));
      }
    } finally {
      // one admin (and ZooKeeper connection) is opened per datacenter; release it before moving on
      admin.close();
    }
  }
}
use of org.apache.helix.manager.zk.ZKHelixAdmin in project helix by apache.
the class TestClusterInMaintenanceModeWhenReachingOfflineInstancesLimit method testWithDisabledInstancesLimit.
/**
 * Disables participants one at a time, starting at index 2, and checks the cluster's maintenance signal:
 * it must still be absent after {@code _maxOfflineInstancesAllowed} instances have been disabled, and present
 * (with a reason) once one more instance is disabled. All touched instances are re-enabled and maintenance mode
 * is switched off at the end.
 */
@Test
public void testWithDisabledInstancesLimit() throws Exception {
  MaintenanceSignal signal = _dataAccessor.getProperty(_dataAccessor.keyBuilder().maintenance());
  Assert.assertNull(signal);

  HelixAdmin helixAdmin = new ZKHelixAdmin(_gZkClient);

  // Participants in [firstIndex, overLimitIndex) are disabled first; overLimitIndex is the one that tips the limit.
  final int firstIndex = 2;
  final int overLimitIndex = firstIndex + _maxOfflineInstancesAllowed;

  // disable instance
  for (int idx = firstIndex; idx < overLimitIndex; idx++) {
    helixAdmin.enableInstance(CLUSTER_NAME, _participants.get(idx).getInstanceName(), false);
  }
  Thread.sleep(500);

  // Still within the allowed limit: no maintenance signal expected.
  signal = _dataAccessor.getProperty(_dataAccessor.keyBuilder().maintenance());
  Assert.assertNull(signal);

  // Disable one more instance to go past the limit.
  helixAdmin.enableInstance(CLUSTER_NAME, _participants.get(overLimitIndex).getInstanceName(), false);
  Thread.sleep(500);

  signal = _dataAccessor.getProperty(_dataAccessor.keyBuilder().maintenance());
  Assert.assertNotNull(signal);
  Assert.assertNotNull(signal.getReason());

  // Restore: re-enable every instance disabled above, then leave maintenance mode.
  for (int idx = firstIndex; idx <= overLimitIndex; idx++) {
    helixAdmin.enableInstance(CLUSTER_NAME, _participants.get(idx).getInstanceName(), true);
  }
  helixAdmin.enableMaintenanceMode(CLUSTER_NAME, false);
}
Aggregations