Use of org.apache.asterix.app.replication.message.TakeoverPartitionsRequestMessage in the asterixdb project by apache.
Class AutoFaultToleranceStrategy, method getNodeAssignedPartitions.
/**
 * Collects the partitions that the given node is responsible for.
 * <p>
 * The result contains, in order: every cluster partition whose active node id equals
 * {@code nodeId}, followed by the partitions named in each pending takeover request that
 * was addressed to {@code nodeId}. Those pending requests are removed from
 * {@code pendingTakeoverRequests} before returning, so that they can be re-issued to a
 * different replica.
 *
 * @param nodeId id of the node whose partitions are requested
 * @return a new mutable list of the partitions described above
 */
public synchronized List<ClusterPartition> getNodeAssignedPartitions(String nodeId) {
    final Map<Integer, ClusterPartition> partitionsById = new HashMap<>();
    final List<ClusterPartition> assigned = new ArrayList<>();

    // One pass over the cluster partitions: index each by id and keep those active on the node.
    for (ClusterPartition candidate : clusterManager.getClusterPartitons()) {
        partitionsById.put(candidate.getPartitionId(), candidate);
        if (nodeId.equals(candidate.getActiveNodeId())) {
            assigned.add(candidate);
        }
    }

    /*
     * Any pending takeover request this node was supposed to handle must be sent to a
     * different replica, so its partitions are reported as belonging to this node too.
     */
    final List<Long> staleRequestIds = new ArrayList<>();
    for (TakeoverPartitionsRequestMessage pending : pendingTakeoverRequests.values()) {
        if (!pending.getNodeId().equals(nodeId)) {
            continue;
        }
        for (Integer id : pending.getPartitions()) {
            // NOTE(review): an id absent from the cluster partitions would add null here — confirm
            // that requests only ever reference known partition ids.
            assigned.add(partitionsById.get(id));
        }
        staleRequestIds.add(pending.getRequestId());
    }

    // Drop the collected requests only after the scan, so the map is not modified while iterating.
    for (Long staleId : staleRequestIds) {
        pendingTakeoverRequests.remove(staleId);
    }
    return assigned;
}
Use of org.apache.asterix.app.replication.message.TakeoverPartitionsRequestMessage in the asterixdb project by apache.
Class AutoFaultToleranceStrategy, method requestPartitionsTakeover.
/**
 * Asks replicas to take over the partitions that were assigned to a failed node.
 * <p>
 * For every partition returned by {@link #getNodeAssignedPartitions(String)} the replicas of
 * the partition's node are tried in iteration order until {@code addActiveReplica} returns
 * {@code true}. One {@link TakeoverPartitionsRequestMessage} per replica in the resulting plan
 * is then registered in {@code pendingTakeoverRequests} and sent through the message broker.
 * Nothing is done when the failed node had no partitions; a severe message is logged and no
 * request is sent when the plan ends up empty.
 *
 * @param failedNodeId id of the node whose partitions must be taken over
 */
private synchronized void requestPartitionsTakeover(String failedNodeId) {
    ICcApplicationContext appCtx = (ICcApplicationContext) serviceCtx.getApplicationContext();
    ReplicationProperties replicationProperties = appCtx.getReplicationProperties();

    // Partitions of the failed NC (including those of takeover requests it never completed).
    List<ClusterPartition> lostPartitions = getNodeAssignedPartitions(failedNodeId);
    if (lostPartitions.isEmpty()) {
        return;
    }

    // replica -> ids of the partitions it should take over
    Map<String, List<Integer>> takeoverPlan = new HashMap<>();
    for (ClusterPartition lost : lostPartitions) {
        Set<String> candidates = replicationProperties.getNodeReplicasIds(lost.getNodeId());
        // Stop at the first candidate that addActiveReplica accepts.
        // It needs to be modified to consider load balancing.
        for (String candidate : candidates) {
            if (addActiveReplica(candidate, lost, takeoverPlan)) {
                break;
            }
        }
    }

    if (takeoverPlan.isEmpty()) {
        // No active replicas were found for the failed node.
        LOGGER.severe("Could not find active replicas for the partitions " + lostPartitions);
        return;
    }
    LOGGER.info("Partitions to recover: " + lostPartitions);

    // Send each selected replica a request to take over its share of the partitions.
    for (Entry<String, List<Integer>> assignment : takeoverPlan.entrySet()) {
        String targetReplica = assignment.getKey();
        List<Integer> partitionIds = assignment.getValue();
        Integer[] partitionsToTakeover = partitionIds.toArray(new Integer[partitionIds.size()]);
        long requestId = clusterRequestId++;
        TakeoverPartitionsRequestMessage takeoverRequest =
                new TakeoverPartitionsRequestMessage(requestId, targetReplica, partitionsToTakeover);
        // Register before sending so the request stays pending even if the send fails.
        pendingTakeoverRequests.put(requestId, takeoverRequest);
        try {
            messageBroker.sendApplicationMessageToNC(takeoverRequest, targetReplica);
        } catch (Exception e) {
            /*
             * A failed send means the NC we tried to reach has itself failed. When its
             * failure notification arrives, any pending request that belongs to it will be
             * sent to a different active replica.
             */
            LOGGER.log(Level.WARNING, "Failed to send takeover request: " + takeoverRequest, e);
        }
    }
}
Aggregations