use of org.apache.asterix.common.config.ReplicationProperties in project asterixdb by apache.
the class AutoFaultToleranceStrategy method prepareFailbackPlan.
/**
 * Builds the failback plan for a node that is rejoining, registers it as pending,
 * and then triggers processing of the pending failback plans.
 *
 * <p>Every partition owned by one of the returning node's replicas contributes its
 * currently active node as a plan participant. Partitions whose original node is the
 * returning node are additionally recorded as partitions to fail back.
 *
 * @param failingBackNodeId id of the node that is failing back
 */
private synchronized void prepareFailbackPlan(String failingBackNodeId) {
    NodeFailbackPlan failbackPlan = NodeFailbackPlan.createPlan(failingBackNodeId);
    pendingProcessingFailbackPlans.add(failbackPlan);
    planId2FailbackPlanMap.put(failbackPlan.getPlanId(), failbackPlan);

    // Look up the replicas of the returning node; their partitions are the ones to resync.
    ICcApplicationContext ccAppCtx = (ICcApplicationContext) serviceCtx.getApplicationContext();
    Set<String> replicaIds = ccAppCtx.getReplicationProperties().getNodeReplicasIds(failingBackNodeId);

    // NOTE(review): the result of this call is discarded. It is kept here in case the call
    // has side effects -- confirm whether it can be removed.
    clusterManager.getClusterPartitons();

    for (String replicaNodeId : replicaIds) {
        ClusterPartition[] replicaPartitions = clusterManager.getNodePartitions(replicaNodeId);
        for (ClusterPartition candidate : replicaPartitions) {
            // The node currently serving the partition takes part in the plan.
            failbackPlan.addParticipant(candidate.getActiveNodeId());

            // Only partitions that originally belonged to the returning node are failed back.
            if (!candidate.getNodeId().equals(failingBackNodeId)) {
                continue;
            }
            failbackPlan.addPartitionToFailback(candidate.getPartitionId(), candidate.getActiveNodeId());
        }
    }

    if (LOGGER.isLoggable(Level.INFO)) {
        LOGGER.info("Prepared Failback plan: " + failbackPlan.toString());
    }
    processPendingFailbackPlans();
}
use of org.apache.asterix.common.config.ReplicationProperties in project asterixdb by apache.
the class AutoFaultToleranceStrategy method requestPartitionsTakeover.
/**
 * Asks active replicas to take over the partitions that were assigned to a failed node.
 *
 * <p>For each lost partition, the replicas of the partition's node are tried in iteration
 * order until {@code addActiveReplica} accepts one. The resulting replica-to-partitions
 * plan is then turned into one takeover request per replica. Each request is recorded in
 * {@code pendingTakeoverRequests} before it is sent.
 *
 * @param failedNodeId id of the node whose partitions must be taken over
 */
private synchronized void requestPartitionsTakeover(String failedNodeId) {
    // replica id -> ids of the partitions that replica should take over
    Map<String, List<Integer>> takeoverPlan = new HashMap<>();
    ICcApplicationContext ccAppCtx = (ICcApplicationContext) serviceCtx.getApplicationContext();
    ReplicationProperties replicationProps = ccAppCtx.getReplicationProperties();

    // Partitions that were assigned to the failed NC.
    List<ClusterPartition> lostPartitions = getNodeAssignedPartitions(failedNodeId);
    if (lostPartitions.isEmpty()) {
        return;
    }

    for (ClusterPartition lost : lostPartitions) {
        // Replicas of the partition's node.
        Set<String> candidateReplicas = replicationProps.getNodeReplicasIds(lost.getNodeId());
        // Stop at the first replica that is accepted; load balancing is not considered yet.
        for (String candidate : candidateReplicas) {
            if (addActiveReplica(candidate, lost, takeoverPlan)) {
                break;
            }
        }
    }

    if (takeoverPlan.isEmpty()) {
        // No active replica was found for any of the failed node's partitions.
        LOGGER.severe("Could not find active replicas for the partitions " + lostPartitions);
        return;
    }
    LOGGER.info("Partitions to recover: " + lostPartitions);

    // Send each chosen replica a request to take over its assigned partitions.
    for (Entry<String, List<Integer>> assignment : takeoverPlan.entrySet()) {
        String targetReplica = assignment.getKey();
        List<Integer> assignedIds = assignment.getValue();
        Integer[] partitionsToTakeover = assignedIds.toArray(new Integer[assignedIds.size()]);
        long requestId = clusterRequestId++;
        TakeoverPartitionsRequestMessage request =
                new TakeoverPartitionsRequestMessage(requestId, targetReplica, partitionsToTakeover);
        pendingTakeoverRequests.put(requestId, request);
        try {
            messageBroker.sendApplicationMessageToNC(request, targetReplica);
        } catch (Exception e) {
            /*
             * A failed send means the target NC itself has failed. The request stays
             * pending; once that NC's failure notification arrives, its pending requests
             * are sent to a different active replica.
             */
            LOGGER.log(Level.WARNING, "Failed to send takeover request: " + request, e);
        }
    }
}
Aggregations