use of voldemort.server.rebalance.VoldemortRebalancingException in project voldemort by voldemort.
the class RebalanceController method executeSubBatch.
// TODO: (refactor) Break this state-machine like method into multiple "sub"
// methods. AFAIK, this method either does the RO stores or the RW stores in
// a batch. I.e., there are at most 2 sub-batches for any given batch. And,
// in practice, there is one sub-batch that is either RO or RW.
// TODO: Fix the javadoc comment to be more easily understood.
/**
 * The smallest granularity of rebalancing, in which partitions are moved for
 * a sub-set of stores. All tasks in {@code rebalanceTaskPlanList} are
 * submitted to a freshly created executor; this method then blocks until the
 * executor has shut down and inspects every task for failure or
 * incompleteness.
 *
 * <br>
 *
 * Errors and rollback procedures are handled at this level. When a
 * {@link VoldemortRebalancingException} is raised, the rollback action is
 * chosen from the three boolean arguments as follows:
 *
 * <pre>
 * | Case | hasRO | hasRW | finishedRO | Action |
 * | 0 | t | t | t | rollback cluster change + swap |
 * | 1 | t | t | f | nothing to do since "rebalance state change" should have removed everything |
 * | 2 | t | f | t | won't be triggered since hasRW is false |
 * | 3 | t | f | f | nothing to do since "rebalance state change" should have removed everything |
 * | 4 | f | t | t | rollback cluster change |
 * | 5 | f | t | f | won't be triggered |
 * | 6 | f | f | t | won't be triggered |
 * | 7 | f | f | f | won't be triggered |
 * </pre>
 *
 * @param batchId Rebalance batch id
 * @param progressBar Progress bar for this batch; passed through to
 *        {@code executeTasks}
 * @param batchRollbackCluster Cluster to rollback to if we have a problem;
 *        also the source of the node ids for which donor permits are created
 * @param batchRollbackStoreDefs Store definitions to rollback to if we have a
 *        problem
 * @param rebalanceTaskPlanList The list of rebalance partition plans
 * @param hasReadOnlyStores Are we rebalancing any read-only stores?
 * @param hasReadWriteStores Are we rebalancing any read-write stores?
 * @param finishedReadOnlyStores Have we finished rebalancing of read-only
 *        stores?
 * @throws VoldemortRebalancingException if at least one task finished with an
 *         exception; rethrown after the rollback (cases 0 and 4) was issued
 * @throws VoldemortException if no task failed but at least one task is not
 *         complete; no rollback is issued in this case
 */
private void executeSubBatch(final int batchId,
                             RebalanceBatchPlanProgressBar progressBar,
                             final Cluster batchRollbackCluster,
                             final List<StoreDefinition> batchRollbackStoreDefs,
                             final List<RebalanceTaskInfo> rebalanceTaskPlanList,
                             boolean hasReadOnlyStores,
                             boolean hasReadWriteStores,
                             boolean finishedReadOnlyStores) {
    RebalanceUtils.printBatchLog(batchId, logger, "Submitting rebalance tasks ");

    // Get an ExecutorService in place used for submitting our tasks
    ExecutorService service = RebalanceUtils.createExecutors(maxParallelRebalancing);

    // Sub-lists of the submitted tasks, filled in once the executor is done
    final List<RebalanceTask> failedTasks = Lists.newArrayList();
    final List<RebalanceTask> incompleteTasks = Lists.newArrayList();

    // Semaphores for donor nodes - To avoid multiple disk sweeps.
    // One single-permit semaphore per node of the rollback cluster.
    Map<Integer, Semaphore> donorPermits = new HashMap<Integer, Semaphore>();
    for (Node node : batchRollbackCluster.getNodes()) {
        donorPermits.put(node.getId(), new Semaphore(1));
    }

    try {
        // List of tasks which will run asynchronously
        List<RebalanceTask> allTasks = executeTasks(batchId,
                                                    progressBar,
                                                    service,
                                                    rebalanceTaskPlanList,
                                                    donorPermits);
        RebalanceUtils.printBatchLog(batchId, logger, "All rebalance tasks submitted");

        // Wait and shutdown after (infinite) timeout
        RebalanceUtils.executorShutDown(service, Long.MAX_VALUE);
        RebalanceUtils.printBatchLog(batchId, logger, "Finished waiting for executors");

        // Collects all failures + incomplete tasks from the rebalance
        // tasks. A task with an exception counts as failed only; it is not
        // additionally checked for completeness.
        List<Exception> failures = Lists.newArrayList();
        for (RebalanceTask task : allTasks) {
            if (task.hasException()) {
                failedTasks.add(task);
                failures.add(task.getError());
            } else if (!task.isComplete()) {
                incompleteTasks.add(task);
            }
        }

        if (!failedTasks.isEmpty()) {
            throw new VoldemortRebalancingException("Rebalance task terminated unsuccessfully on tasks " + failedTasks, failures);
        }

        // No task failed, but some did not complete. This throws a
        // VoldemortException rather than a VoldemortRebalancingException,
        // so the catch block below does not handle it and no rollback is
        // issued for this case.
        // NOTE(review): presumably intentional so that an operator can
        // resume the process manually - confirm.
        if (!incompleteTasks.isEmpty()) {
            throw new VoldemortException("Rebalance tasks are still incomplete / running " + incompleteTasks);
        }
    } catch (VoldemortRebalancingException e) {
        // Log the exception itself so the cause is not lost from the log
        logger.error("Failure while migrating partitions for rebalance task " + batchId, e);

        if (hasReadOnlyStores && hasReadWriteStores && finishedReadOnlyStores) {
            // Case 0
            adminClient.rebalanceOps.rebalanceStateChange(null, batchRollbackCluster, null, batchRollbackStoreDefs, null, true, true, false, false, false);
        } else if (hasReadWriteStores && finishedReadOnlyStores) {
            // Case 4 - identical call except that the first boolean flag is
            // false (no swap, per the case table in the javadoc)
            adminClient.rebalanceOps.rebalanceStateChange(null, batchRollbackCluster, null, batchRollbackStoreDefs, null, false, true, false, false, false);
        }
        // All other cases: nothing to roll back here

        throw e;
    } finally {
        // Reached on every path; force a shutdown if the executor is still
        // running (e.g. an exception was thrown before executorShutDown)
        if (!service.isShutdown()) {
            RebalanceUtils.printErrorLog(batchId, logger, "Could not shutdown service cleanly for rebalance task " + batchId, null);
            service.shutdownNow();
        }
    }
}
use of voldemort.server.rebalance.VoldemortRebalancingException in project voldemort by voldemort.
the class AdminRebalanceTest method testClusterAndRebalanceStateChange.
/**
 * Exercises {@code rebalanceStateChange} against a four node read-write
 * cluster in three scenarios: (1) all nodes up, (2) one node already holding
 * rebalancer state, and (3) one node shut down. Scenarios 2 and 3 expect a
 * {@link VoldemortRebalancingException} and verify that the remaining nodes
 * report the original cluster metadata and an empty rebalancer state.
 */
@Test(timeout = 60000)
public void testClusterAndRebalanceStateChange() throws IOException {
    try {
        startFourNodeRW();

        // Test 1) Normal case where-in all are up
        adminClient.rebalanceOps.rebalanceStateChange(currentCluster, finalCluster, servers[2].getMetadataStore().getStoreDefList(), servers[2].getMetadataStore().getStoreDefList(), plans, false, true, true, true, true);

        // Every stealer node must carry exactly its own plan and the final
        // cluster
        List<Integer> nodesChecked = Lists.newArrayList();
        for (RebalanceTaskInfo plan : plans) {
            nodesChecked.add(plan.getStealerId());
            assertEquals(new RebalancerState(Lists.newArrayList(plan)), servers[plan.getStealerId()].getMetadataStore().getRebalancerState());
            assertEquals(finalCluster, servers[plan.getStealerId()].getMetadataStore().getCluster());
        }

        List<Integer> allNodes = Lists.newArrayList(Utils.nodeListToNodeIdList(Lists.newArrayList(currentCluster.getNodes())));
        allNodes.removeAll(nodesChecked);

        // Check all other nodes: empty rebalancer state, final cluster
        for (int nodeId : allNodes) {
            assertEquals(new RebalancerState(new ArrayList<RebalanceTaskInfo>()), servers[nodeId].getMetadataStore().getRebalancerState());
            assertEquals(finalCluster, servers[nodeId].getMetadataStore().getCluster());
        }

        // Clean-up everything
        cleanUpAllState();

        // Test 2) Add a plan before hand on one of them which should
        // trigger a rollback
        servers[3].getMetadataStore().getRebalancerState().update(new RebalanceTaskInfo(3, 0, new HashMap<String, List<Integer>>(), currentCluster));
        try {
            adminClient.rebalanceOps.rebalanceStateChange(currentCluster, finalCluster, servers[2].getMetadataStore().getStoreDefList(), servers[2].getMetadataStore().getStoreDefList(), plans, false, true, true, true, true);
            fail("Should have thrown an exception since we added state before hand");
        } catch (VoldemortRebalancingException expected) {
            // Expected: the state change must be rejected
        }

        // Every node except node 3 (on which a plan was pre-seeded above)
        // should have empty rebalancer state; all of them should have the
        // old cluster metadata
        for (VoldemortServer server : servers) {
            if (server.getMetadataStore().getNodeId() != 3) {
                assertEquals(new RebalancerState(new ArrayList<RebalanceTaskInfo>()), server.getMetadataStore().getRebalancerState());
            }
            assertEquals(currentCluster, server.getMetadataStore().getCluster());
        }

        // Clean-up everything
        cleanUpAllState();

        // Test 3) Shut one node down
        ServerTestUtils.stopVoldemortServer(servers[3]);
        servers[3] = null;
        try {
            adminClient.rebalanceOps.rebalanceStateChange(currentCluster, finalCluster, servers[2].getMetadataStore().getStoreDefList(), servers[2].getMetadataStore().getStoreDefList(), plans, false, true, true, true, true);
            fail("Should have thrown an exception since node 3 is shut down");
        } catch (VoldemortRebalancingException expected) {
            // Expected: the state change cannot succeed with a node down
        }

        // Every node except node 3 (now null) should be back on the old
        // cluster metadata with empty rebalancer state
        for (VoldemortServer server : servers) {
            if (server != null) {
                assertEquals(new RebalancerState(new ArrayList<RebalanceTaskInfo>()), server.getMetadataStore().getRebalancerState());
                assertEquals(currentCluster, server.getMetadataStore().getCluster());
            }
        }
    } finally {
        shutDown();
    }
}
use of voldemort.server.rebalance.VoldemortRebalancingException in project voldemort by voldemort.
the class RebalanceScheduler method run.
/**
* Set up scheduling structures and then start scheduling tasks to execute.
* Blocks until all tasks have been scheduled. (For all tasks to be
* scheduled, most tasks must have completed.)
*
* @param sbTaskList List of all stealer-based rebalancing tasks to be
* scheduled.
*/
public void run(List<StealerBasedRebalanceTask> sbTaskList) {
    // The latch is sized by the number of tasks; awaiting it below is what
    // makes this method block.
    initializeLatch(sbTaskList.size());
    populateTasksByStealer(sbTaskList);

    // Start scheduling tasks to execute!
    scheduleMoreTasks();

    try {
        doneSignal.await();
    } catch (InterruptedException e) {
        // Restore the interrupt flag: the checked InterruptedException is
        // converted to an unchecked exception below, so callers further up
        // the stack could otherwise not observe that this thread was
        // interrupted.
        Thread.currentThread().interrupt();
        logger.error("RebalanceController scheduler interrupted while waiting for rebalance " + "tasks to be scheduled.", e);
        throw new VoldemortRebalancingException("RebalanceController scheduler interrupted " + "while waiting for rebalance tasks to be " + "scheduled.");
    }
}
Aggregations