use of voldemort.server.rebalance.VoldemortRebalancingException in project voldemort by voldemort.
the class StealerBasedRebalanceAsyncOperation method operate.
/**
 * Rebalances every store listed in {@code stealInfo}, one executor task per
 * store, and then either clears the rebalancing state (all stores done) or
 * throws a {@link VoldemortRebalancingException} carrying the collected
 * per-store failures.
 *
 * The donor's rebalancing permit is released and the temporary admin client
 * is closed in all cases, including when the admin client cannot be created.
 *
 * @throws Exception if any store could not be rebalanced, or if setup or
 *         waiting for the store tasks fails
 */
@Override
public void operate() throws Exception {
    // Store tasks run concurrently on the executor and each may record a
    // failure, so the list must tolerate concurrent add() calls.
    final List<Exception> failures = java.util.Collections.synchronizedList(new ArrayList<Exception>());
    final ConcurrentLinkedQueue<String> storesRebalancing = new ConcurrentLinkedQueue<String>();
    final AtomicInteger completedStoresCount = new AtomicInteger(0);
    final int totalStoresCount = stealInfo.getPartitionStores().size();
    try {
        // Created inside the try so that a failure here still releases the
        // donor permit in the finally block below.
        adminClient = AdminClient.createTempAdminClient(voldemortConfig,
                                                        metadataStore.getCluster(),
                                                        voldemortConfig.getMaxParallelStoresRebalancing());

        // Iterate over a snapshot: tasks remove stores from stealInfo as they finish.
        for (final String storeName : ImmutableList.copyOf(stealInfo.getPartitionStores())) {
            executors.submit(new Runnable() {

                @Override
                public void run() {
                    try {
                        boolean isReadOnlyStore = metadataStore.getStoreDef(storeName)
                                                               .getType()
                                                               .compareTo(ReadOnlyStorageConfiguration.TYPE_NAME) == 0;

                        // Add the store to the rebalancing list
                        storesRebalancing.add(storeName);
                        updateStatus(getHeader(stealInfo) + "Completed working on " + completedStoresCount.get() + " out of " + totalStoresCount + " stores. Still rebalancing " + storesRebalancing);

                        // Start the rebalance..
                        rebalanceStore(storeName, adminClient, stealInfo, isReadOnlyStore);

                        // We finished the store, delete it
                        stealInfo.removeStore(storeName);
                        storesRebalancing.remove(storeName);

                        // Increment the store count
                        completedStoresCount.getAndIncrement();
                        updateStatus(getHeader(stealInfo) + "Completed working on " + completedStoresCount.get() + " out of " + totalStoresCount + " stores. Still rebalancing " + storesRebalancing);
                    } catch (Exception e) {
                        // Record and continue: a failed store stays in stealInfo
                        // and is reported after all tasks have been waited for.
                        logger.error(getHeader(stealInfo) + "Error while rebalancing for store " + storeName + " - " + e.getMessage(), e);
                        failures.add(e);
                    }
                }
            });
        }

        // NOTE(review): presumably blocks until the submitted store tasks
        // have finished - confirm against waitForShutdown().
        waitForShutdown();

        // If empty, clean state
        List<String> unbalancedStores = Lists.newArrayList(stealInfo.getPartitionStores());
        if (unbalancedStores.isEmpty()) {
            logger.info(getHeader(stealInfo) + "Rebalance of " + stealInfo + " completed successfully for all " + totalStoresCount + " stores");
            updateStatus(getHeader(stealInfo) + "Rebalance of " + partitionStoreCount + " partition-stores completed successfully for all " + totalStoresCount + " stores");
            metadataStore.deleteRebalancingState(stealInfo);
        } else {
            throw new VoldemortRebalancingException(getHeader(stealInfo) + "Failed to rebalance task " + stealInfo + ". Could only complete " + completedStoresCount.get() + " out of " + totalStoresCount + " stores", failures);
        }
    } finally {
        try {
            // free the permit in all cases.
            logger.info(getHeader(stealInfo) + "Releasing permit for donor node " + stealInfo.getDonorId());
            rebalancer.releaseRebalancingPermit(stealInfo.getDonorId());
        } finally {
            // Close the client even if releasing the permit failed; it is
            // null only when creation itself threw.
            if (adminClient != null) {
                adminClient.close();
                adminClient = null;
            }
        }
    }
}
use of voldemort.server.rebalance.VoldemortRebalancingException in project voldemort by voldemort.
the class AdminRebalanceTest method testRebalanceStateChange.
/**
 * Exercises {@code rebalanceStateChange} on a four node read-write cluster in
 * three scenarios: all nodes up, a pre-existing rebalancer state on node 3,
 * and node 3 shut down. The last two are expected to fail with a
 * {@link VoldemortRebalancingException} and leave the other nodes with an
 * empty rebalancer state.
 */
@Test(timeout = 60000)
public void testRebalanceStateChange() throws IOException {
    try {
        startFourNodeRW();

        // Test 1) Normal case where-in all are up
        adminClient.rebalanceOps.rebalanceStateChange(currentCluster,
                                                      finalCluster,
                                                      servers[2].getMetadataStore().getStoreDefList(),
                                                      servers[2].getMetadataStore().getStoreDefList(),
                                                      plans,
                                                      false,
                                                      false,
                                                      true,
                                                      true,
                                                      true);

        // Every stealer should now hold exactly its own plan
        List<Integer> nodesChecked = Lists.newArrayList();
        for (RebalanceTaskInfo plan : plans) {
            nodesChecked.add(plan.getStealerId());
            assertEquals(new RebalancerState(Lists.newArrayList(plan)),
                         servers[plan.getStealerId()].getMetadataStore().getRebalancerState());
        }

        List<Integer> allNodes = Lists.newArrayList(Utils.nodeListToNodeIdList(Lists.newArrayList(currentCluster.getNodes())));
        allNodes.removeAll(nodesChecked);

        // Check all other nodes
        for (int nodeId : allNodes) {
            assertEquals(new RebalancerState(new ArrayList<RebalanceTaskInfo>()),
                         servers[nodeId].getMetadataStore().getRebalancerState());
        }

        // Clean-up everything
        cleanUpAllState();

        // Test 2) Add a plan before hand on one of them which should
        // trigger a rollback
        servers[3].getMetadataStore()
                  .getRebalancerState()
                  .update(new RebalanceTaskInfo(3,
                                                0,
                                                new HashMap<String, List<Integer>>(),
                                                currentCluster));
        try {
            adminClient.rebalanceOps.rebalanceStateChange(currentCluster,
                                                          finalCluster,
                                                          servers[2].getMetadataStore().getStoreDefList(),
                                                          servers[2].getMetadataStore().getStoreDefList(),
                                                          plans,
                                                          false,
                                                          false,
                                                          true,
                                                          true,
                                                          true);
            fail("Should have thrown an exception since we added state before hand");
        } catch (VoldemortRebalancingException expected) {
            // Expected: the pre-existing state on node 3 must make the change fail
        }

        // All nodes should be rolled back to an empty state, except node 3
        for (VoldemortServer server : servers) {
            if (server.getMetadataStore().getNodeId() != 3) {
                assertEquals(new RebalancerState(new ArrayList<RebalanceTaskInfo>()),
                             server.getMetadataStore().getRebalancerState());
            }
        }

        // Clean-up everything
        cleanUpAllState();

        // Test 3) Shut one node down
        ServerTestUtils.stopVoldemortServer(servers[3]);
        servers[3] = null;
        try {
            adminClient.rebalanceOps.rebalanceStateChange(currentCluster,
                                                          finalCluster,
                                                          servers[2].getMetadataStore().getStoreDefList(),
                                                          servers[2].getMetadataStore().getStoreDefList(),
                                                          plans,
                                                          false,
                                                          false,
                                                          true,
                                                          true,
                                                          true);
            fail("Should have thrown an exception since node 3 is shut down");
        } catch (VoldemortRebalancingException expected) {
            // Expected: the state change cannot reach the stopped node
        }

        // All nodes still running (i.e. except node 3) should have an empty state
        for (VoldemortServer server : servers) {
            if (server != null) {
                assertEquals(new RebalancerState(new ArrayList<RebalanceTaskInfo>()),
                             server.getMetadataStore().getRebalancerState());
            }
        }
    } finally {
        shutDown();
    }
}
use of voldemort.server.rebalance.VoldemortRebalancingException in project voldemort by voldemort.
the class AdminRebalanceTest method testRebalanceNodeRORW.
/**
 * Runs the rebalance plans on a four node cluster with read-only and
 * read-write stores, then exercises {@code rebalanceStateChange} in three
 * scenarios: a pre-existing rebalancer state on node 3 (must fail), an extra
 * store definition on node 2 (must fail), and the normal case (must succeed
 * and move every node to the final cluster).
 */
@Test(timeout = 60000)
public void testRebalanceNodeRORW() throws IOException, InterruptedException {
    try {
        startFourNodeRORW();

        int numChunks = 5;
        for (StoreDefinition storeDef : Lists.newArrayList(storeDef1, storeDef2)) {
            buildROStore(storeDef, numChunks);
        }

        // Set into rebalancing state
        for (RebalanceTaskInfo partitionPlan : plans) {
            getServer(partitionPlan.getStealerId()).getMetadataStore()
                                                   .put(MetadataStore.SERVER_STATE_KEY,
                                                        MetadataStore.VoldemortState.REBALANCING_MASTER_SERVER);
            getServer(partitionPlan.getStealerId()).getMetadataStore()
                                                   .put(MetadataStore.REBALANCING_STEAL_INFO,
                                                        new RebalancerState(Lists.newArrayList(RebalanceTaskInfo.create(partitionPlan.toJsonString()))));
            getServer(partitionPlan.getStealerId()).getMetadataStore()
                                                   .put(MetadataStore.REBALANCING_SOURCE_CLUSTER_XML,
                                                        partitionPlan.getInitialCluster());
        }

        // Actually run it
        try {
            for (RebalanceTaskInfo currentPlan : plans) {
                int asyncId = adminClient.rebalanceOps.rebalanceNode(currentPlan);
                // Compare by value: assertNotSame on ints compares boxed
                // Integer references and only works via the Integer cache.
                assertFalse("Expected a valid rebalanceAsyncId but got -1", asyncId == -1);
                getAdminClient().rpcOps.waitForCompletion(currentPlan.getStealerId(),
                                                          asyncId,
                                                          300,
                                                          TimeUnit.SECONDS);

                // Test that plan has been removed from the list
                assertFalse(getServer(currentPlan.getStealerId()).getMetadataStore()
                                                                 .getRebalancerState()
                                                                 .getAll()
                                                                 .contains(currentPlan));
            }
        } catch (Exception e) {
            e.printStackTrace();
            fail("Should not throw any exceptions: " + e);
        }

        // Test 1) Change one of the rebalance partitions info to force a
        // failure
        servers[3].getMetadataStore()
                  .getRebalancerState()
                  .update(new RebalanceTaskInfo(3,
                                                0,
                                                new HashMap<String, List<Integer>>(),
                                                currentCluster));
        try {
            adminClient.rebalanceOps.rebalanceStateChange(currentCluster,
                                                          finalCluster,
                                                          servers[2].getMetadataStore().getStoreDefList(),
                                                          servers[2].getMetadataStore().getStoreDefList(),
                                                          plans,
                                                          true,
                                                          true,
                                                          true,
                                                          true,
                                                          true);
            fail("Should have thrown an exception since we added state before hand");
        } catch (VoldemortRebalancingException expected) {
            // Expected: the pre-existing state on node 3 must make the change fail
        }

        // Rebalancer state and server state are rolled back everywhere except
        // node 3; the cluster must be the current one on every node.
        for (VoldemortServer server : servers) {
            if (server.getMetadataStore().getNodeId() != 3) {
                assertEquals(new RebalancerState(new ArrayList<RebalanceTaskInfo>()),
                             server.getMetadataStore().getRebalancerState());
                assertEquals(MetadataStore.VoldemortState.NORMAL_SERVER,
                             server.getMetadataStore().getServerStateUnlocked());
            }
            assertEquals(currentCluster, server.getMetadataStore().getCluster());
        }
        checkRO(currentCluster);

        // Clean-up everything
        cleanUpAllState();

        // Test 2 ) Add another store to trigger a failure
        servers[2].getMetadataStore()
                  .put(MetadataStore.STORES_KEY,
                       Lists.newArrayList(storeDef1,
                                          storeDef2,
                                          storeDef3,
                                          storeDef4,
                                          new StoreDefinitionBuilder().setName("test5")
                                                                      .setType(ReadOnlyStorageConfiguration.TYPE_NAME)
                                                                      .setKeySerializer(new SerializerDefinition("string"))
                                                                      .setValueSerializer(new SerializerDefinition("string"))
                                                                      .setRoutingPolicy(RoutingTier.CLIENT)
                                                                      .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY)
                                                                      .setReplicationFactor(2)
                                                                      .setPreferredReads(1)
                                                                      .setRequiredReads(1)
                                                                      .setPreferredWrites(1)
                                                                      .setRequiredWrites(1)
                                                                      .build()));
        try {
            adminClient.rebalanceOps.rebalanceStateChange(currentCluster,
                                                          finalCluster,
                                                          servers[2].getMetadataStore().getStoreDefList(),
                                                          servers[2].getMetadataStore().getStoreDefList(),
                                                          plans,
                                                          true,
                                                          true,
                                                          true,
                                                          true,
                                                          true);
            fail("Should have thrown an exception since we added an extra store on node 2");
        } catch (VoldemortRebalancingException expected) {
            // Expected: the extra store definition must make the change fail
        }

        // NOTE(review): presumably gives the rollback time to settle before
        // the assertions below - confirm.
        Thread.sleep(1000);

        for (VoldemortServer server : servers) {
            assertEquals(new RebalancerState(new ArrayList<RebalanceTaskInfo>()),
                         server.getMetadataStore().getRebalancerState());
            assertEquals(MetadataStore.VoldemortState.NORMAL_SERVER,
                         server.getMetadataStore().getServerStateUnlocked());
            assertEquals(currentCluster, server.getMetadataStore().getCluster());
        }
        checkRO(currentCluster);

        // Clean-up everything
        cleanUpAllState();

        // Put back server 2 back to normal state
        servers[2].getMetadataStore().put(MetadataStore.STORES_KEY,
                                          Lists.newArrayList(storeDef1,
                                                             storeDef2,
                                                             storeDef3,
                                                             storeDef4));

        // Test 3) Everything should work
        adminClient.rebalanceOps.rebalanceStateChange(currentCluster,
                                                      finalCluster,
                                                      servers[2].getMetadataStore().getStoreDefList(),
                                                      servers[2].getMetadataStore().getStoreDefList(),
                                                      plans,
                                                      true,
                                                      true,
                                                      true,
                                                      true,
                                                      true);

        // Every stealer holds its plan, is in rebalancing state and sees the final cluster
        List<Integer> nodesChecked = Lists.newArrayList();
        for (RebalanceTaskInfo plan : plans) {
            nodesChecked.add(plan.getStealerId());
            assertEquals(new RebalancerState(Lists.newArrayList(plan)),
                         servers[plan.getStealerId()].getMetadataStore().getRebalancerState());
            assertEquals(MetadataStore.VoldemortState.REBALANCING_MASTER_SERVER,
                         servers[plan.getStealerId()].getMetadataStore().getServerStateUnlocked());
            assertEquals(finalCluster,
                         servers[plan.getStealerId()].getMetadataStore().getCluster());
        }

        List<Integer> allNodes = Lists.newArrayList(Utils.nodeListToNodeIdList(Lists.newArrayList(currentCluster.getNodes())));
        allNodes.removeAll(nodesChecked);

        // Check all other nodes
        for (int nodeId : allNodes) {
            assertEquals(new RebalancerState(new ArrayList<RebalanceTaskInfo>()),
                         servers[nodeId].getMetadataStore().getRebalancerState());
            assertEquals(MetadataStore.VoldemortState.NORMAL_SERVER,
                         servers[nodeId].getMetadataStore().getServerStateUnlocked());
            assertEquals(finalCluster, servers[nodeId].getMetadataStore().getCluster());
        }
        checkRO(finalCluster);
    } finally {
        shutDown();
    }
}
use of voldemort.server.rebalance.VoldemortRebalancingException in project voldemort by voldemort.
the class RebalanceUtils method validateClusterPartitionState.
/**
 * Confirm that all nodes shared between clusters host exact same partition
 * IDs and that nodes only in the super set cluster have no partition IDs.
 *
 * Neither cluster is modified by this check.
 *
 * @param subsetCluster cluster whose node ids must all be present in
 *        {@code supersetCluster}
 * @param supersetCluster cluster that may contain additional, partition-less
 *        nodes
 * @throws VoldemortException if the superset cluster is missing a node of the
 *         subset cluster
 * @throws VoldemortRebalancingException if a shared node hosts different
 *         partition IDs in the two clusters, or if a node present only in
 *         the superset cluster already hosts partitions
 */
public static void validateClusterPartitionState(final Cluster subsetCluster, final Cluster supersetCluster) {
    if (!supersetCluster.getNodeIds().containsAll(subsetCluster.getNodeIds())) {
        throw new VoldemortException("Superset cluster does not contain all nodes from subset cluster[ subset cluster node ids (" + subsetCluster.getNodeIds() + ") are not a subset of superset cluster node ids (" + supersetCluster.getNodeIds() + ") ]");
    }

    // Shared nodes must host identical partition IDs in both clusters
    for (int nodeId : subsetCluster.getNodeIds()) {
        Node supersetNode = supersetCluster.getNodeById(nodeId);
        Node subsetNode = subsetCluster.getNodeById(nodeId);
        if (!supersetNode.getPartitionIds().equals(subsetNode.getPartitionIds())) {
            throw new VoldemortRebalancingException("Partition IDs do not match between clusters for nodes with id " + nodeId + " : subset cluster has " + subsetNode.getPartitionIds() + " and superset cluster has " + supersetNode.getPartitionIds());
        }
    }

    // Nodes only in the superset cluster must be empty. Skip shared nodes
    // with a membership check rather than calling removeAll() on the set
    // returned by getNodeIds(), which could mutate the cluster if that set
    // is a live view.
    Set<Integer> subsetNodeIds = subsetCluster.getNodeIds();
    for (int nodeId : supersetCluster.getNodeIds()) {
        if (subsetNodeIds.contains(nodeId)) {
            continue;
        }
        Node supersetNode = supersetCluster.getNodeById(nodeId);
        if (!supersetNode.getPartitionIds().isEmpty()) {
            throw new VoldemortRebalancingException("New node " + nodeId + " in superset cluster already has partitions: " + supersetNode.getPartitionIds());
        }
    }
}
use of voldemort.server.rebalance.VoldemortRebalancingException in project voldemort by voldemort.
the class RebalanceScheduler method scheduleNextTask.
/**
 * Schedule at most one task.
 *
 * The scheduled task *must* invoke 'doneTask()' upon
 * completion/termination.
 *
 * A task is eligible only if neither its stealer nor its donor node is
 * already in the set of nodes with work.
 *
 * @param executeService flag to control execution of the service, some tests pass
 *        in value 'false'
 * @return The task scheduled or null if not possible to schedule a task at
 *         this time.
 * @throws VoldemortRebalancingException if the executor service rejects the
 *         selected task
 */
protected synchronized StealerBasedRebalanceTask scheduleNextTask(boolean executeService) {
    // Make sure there is work left to do.
    if (doneSignal.getCount() == 0) {
        logger.info("All tasks completion signaled... returning");
        return null;
    }

    // Limit number of tasks outstanding.
    if (this.numTasksExecuting >= maxParallelRebalancing) {
        logger.info("Already executing [" + this.numTasksExecuting + "] tasks; max parallel rebalancing allowed is " + maxParallelRebalancing);
        return null;
    }

    // Shuffle list of stealer IDs each time a new task to schedule needs to
    // be found. Randomizing the order should avoid prioritizing one
    // specific stealer's work ahead of all others.
    List<Integer> stealerIds = new ArrayList<Integer>(tasksByStealer.keySet());
    Collections.shuffle(stealerIds);
    for (int stealerId : stealerIds) {
        if (nodeIdsWithWork.contains(stealerId)) {
            logger.info("Stealer " + stealerId + " is already working... continuing");
            continue;
        }

        for (StealerBasedRebalanceTask sbTask : tasksByStealer.get(stealerId)) {
            // The donor is read from the task's first steal info
            int donorId = sbTask.getStealInfos().get(0).getDonorId();
            if (nodeIdsWithWork.contains(donorId)) {
                logger.info("Stealer " + stealerId + " Donor " + donorId + " is already working... continuing");
                continue;
            }

            // Book keeping
            addNodesToWorkerList(Arrays.asList(stealerId, donorId));
            numTasksExecuting++;

            // Remove this task from list thus destroying list being
            // iterated over. This is safe because returning directly out of
            // this branch.
            tasksByStealer.get(stealerId).remove(sbTask);

            try {
                if (executeService) {
                    logger.info("Stealer " + stealerId + " Donor " + donorId + " going to schedule work");
                    service.execute(sbTask);
                }
            } catch (RejectedExecutionException ree) {
                // NOTE(review): the book keeping above is not rolled back on
                // rejection, and the thrown exception does not chain 'ree'
                // (it is only logged) - confirm whether either is intended.
                String rejectionMessage = "Stealer " + stealerId + " Rebalancing task rejected by executor service.";
                logger.error(rejectionMessage, ree);
                throw new VoldemortRebalancingException(rejectionMessage);
            }
            return sbTask;
        }
    }

    // Nothing could be scheduled this round
    printRemainingTasks(stealerIds);
    return null;
}
Aggregations