Search in sources :

Example 1 with ReplicationProperties

use of org.apache.asterix.common.config.ReplicationProperties in project asterixdb by apache.

the class AutoFaultToleranceStrategy method prepareFailbackPlan.

/**
 * Creates a failback plan for the node identified by {@code failingBackNodeId},
 * registers it as pending, fills in its participants and the partitions to fail
 * back, and then triggers processing of the pending failback plans.
 *
 * <p>For every replica of the returning node, the active node of each of that
 * replica's partitions is added as a plan participant. Partitions whose
 * {@code getNodeId()} equals the returning node are additionally scheduled to be
 * failed back from their currently active node.
 *
 * @param failingBackNodeId id of the node that is rejoining and needs its partitions back
 */
private synchronized void prepareFailbackPlan(String failingBackNodeId) {
    final NodeFailbackPlan failbackPlan = NodeFailbackPlan.createPlan(failingBackNodeId);
    // Register the plan before populating it, exactly as callers observing these collections expect.
    pendingProcessingFailbackPlans.add(failbackPlan);
    planId2FailbackPlanMap.put(failbackPlan.getPlanId(), failbackPlan);

    // Resolve the replicas of the returning node; their partitions are the ones to resync.
    final ICcApplicationContext ccAppCtx = (ICcApplicationContext) serviceCtx.getApplicationContext();
    final Set<String> replicaIds = ccAppCtx.getReplicationProperties().getNodeReplicasIds(failingBackNodeId);

    // NOTE(review): the result of this call is discarded; kept as-is in case it has
    // side effects -- confirm against the cluster manager and remove if it is dead code.
    clusterManager.getClusterPartitons();

    for (String replicaId : replicaIds) {
        for (ClusterPartition replicaPartition : clusterManager.getNodePartitions(replicaId)) {
            // Whoever currently serves this partition has to take part in the failback.
            failbackPlan.addParticipant(replicaPartition.getActiveNodeId());

            // Only partitions originally owned by the returning node are handed back to it.
            final boolean ownedByReturningNode = replicaPartition.getNodeId().equals(failingBackNodeId);
            if (ownedByReturningNode) {
                failbackPlan.addPartitionToFailback(replicaPartition.getPartitionId(),
                        replicaPartition.getActiveNodeId());
            }
        }
    }

    if (LOGGER.isLoggable(Level.INFO)) {
        LOGGER.info("Prepared Failback plan: " + failbackPlan.toString());
    }
    processPendingFailbackPlans();
}
Also used : ReplicationProperties(org.apache.asterix.common.config.ReplicationProperties) ICcApplicationContext(org.apache.asterix.common.dataflow.ICcApplicationContext) ClusterPartition(org.apache.asterix.common.cluster.ClusterPartition)

Example 2 with ReplicationProperties

use of org.apache.asterix.common.config.ReplicationProperties in project asterixdb by apache.

the class AutoFaultToleranceStrategy method requestPartitionsTakeover.

/**
 * Asks active replicas to take over the partitions that were assigned to a failed node.
 *
 * <p>For each lost partition, the replicas of the partition's node are tried in
 * iteration order and the first one accepted by {@code addActiveReplica} is chosen
 * (no load balancing is attempted). One takeover request per chosen replica is then
 * recorded in {@code pendingTakeoverRequests} and sent through the message broker.
 * If no partition was assigned to the failed node, nothing happens; if no replica
 * could be chosen for any partition, a severe message is logged and no request is sent.
 *
 * @param failedNodeId id of the node whose partitions must be taken over
 */
private synchronized void requestPartitionsTakeover(String failedNodeId) {
    // replica id -> ids of the partitions that replica is asked to take over
    final Map<String, List<Integer>> takeoverPlan = new HashMap<>();
    final ICcApplicationContext ccAppCtx = (ICcApplicationContext) serviceCtx.getApplicationContext();
    final ReplicationProperties replicationProps = ccAppCtx.getReplicationProperties();

    final List<ClusterPartition> lostPartitions = getNodeAssignedPartitions(failedNodeId);
    if (lostPartitions.isEmpty()) {
        // The failed node had no partitions assigned: nothing to recover.
        return;
    }

    for (ClusterPartition lostPartition : lostPartitions) {
        final Set<String> candidateReplicas = replicationProps.getNodeReplicasIds(lostPartition.getNodeId());
        for (String candidate : candidateReplicas) {
            // First replica accepted wins; load balancing is not considered yet.
            if (addActiveReplica(candidate, lostPartition, takeoverPlan)) {
                break;
            }
        }
    }

    if (takeoverPlan.isEmpty()) {
        // no active replicas were found for the failed node
        LOGGER.severe("Could not find active replicas for the partitions " + lostPartitions);
        return;
    }
    LOGGER.info("Partitions to recover: " + lostPartitions);

    // One takeover request per replica, covering all partitions assigned to it.
    for (Entry<String, List<Integer>> assignment : takeoverPlan.entrySet()) {
        final String targetReplica = assignment.getKey();
        final List<Integer> assignedPartitions = assignment.getValue();
        final Integer[] partitionsToTakeover = assignedPartitions.toArray(new Integer[assignedPartitions.size()]);
        final long requestId = clusterRequestId++;
        final TakeoverPartitionsRequestMessage takeoverRequest =
                new TakeoverPartitionsRequestMessage(requestId, targetReplica, partitionsToTakeover);
        // Track the request before sending so it can be re-routed if the target fails.
        pendingTakeoverRequests.put(requestId, takeoverRequest);
        try {
            messageBroker.sendApplicationMessageToNC(takeoverRequest, targetReplica);
        } catch (Exception e) {
            /*
             * A send failure means the target NC itself has failed. Once its failure
             * notification arrives, any pending request that belongs to it is sent
             * to a different active replica, so the error is only logged here.
             */
            LOGGER.log(Level.WARNING, "Failed to send takeover request: " + takeoverRequest, e);
        }
    }
}
Also used : TakeoverPartitionsRequestMessage(org.apache.asterix.app.replication.message.TakeoverPartitionsRequestMessage) ICcApplicationContext(org.apache.asterix.common.dataflow.ICcApplicationContext) HashMap(java.util.HashMap) RuntimeDataException(org.apache.asterix.common.exceptions.RuntimeDataException) HyracksDataException(org.apache.hyracks.api.exceptions.HyracksDataException) ReplicationProperties(org.apache.asterix.common.config.ReplicationProperties) List(java.util.List) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) ClusterPartition(org.apache.asterix.common.cluster.ClusterPartition)

Aggregations

ClusterPartition (org.apache.asterix.common.cluster.ClusterPartition)2 ReplicationProperties (org.apache.asterix.common.config.ReplicationProperties)2 ICcApplicationContext (org.apache.asterix.common.dataflow.ICcApplicationContext)2 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 LinkedList (java.util.LinkedList)1 List (java.util.List)1 TakeoverPartitionsRequestMessage (org.apache.asterix.app.replication.message.TakeoverPartitionsRequestMessage)1 RuntimeDataException (org.apache.asterix.common.exceptions.RuntimeDataException)1 HyracksDataException (org.apache.hyracks.api.exceptions.HyracksDataException)1