Search in sources :

Example 6 with Pipeline

use of org.apache.helix.controller.pipeline.Pipeline in project helix by apache.

the class TestRebalancePipeline method testChangeIdealStateWithPendingMsg.

/**
 * Checks how the rebalance pipeline reacts when a resource is dropped while a
 * state-transition message is still pending for its only partition replica:
 * <ol>
 * <li>round 1: node0 is OFFLINE, so one OFFLINE->SLAVE message is selected;</li>
 * <li>round 2: the resource is dropped while that message is still pending, so no new
 * message may be selected;</li>
 * <li>round 3: the pending message is removed, so OFFLINE->DROPPED is selected.</li>
 * </ol>
 */
@Test
public void testChangeIdealStateWithPendingMsg() {
    String clusterName = "CLUSTER_" + _className + "_pending";
    System.out.println("START " + clusterName + " at " + new Date());
    HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor<ZNRecord>(_gZkClient));
    HelixManager manager = new DummyClusterManager(clusterName, accessor);
    ClusterEvent event = new ClusterEvent(ClusterEventType.Unknown);
    event.addAttribute(AttributeName.helixmanager.name(), manager);
    ClusterDataCache cache = new ClusterDataCache();
    event.addAttribute(AttributeName.ClusterDataCache.name(), cache);
    refreshClusterConfig(clusterName, accessor);
    final String resourceName = "testResource_pending";
    String[] resourceGroups = new String[] { resourceName };
    // ideal state: only node0 takes part in this test
    // (the trailing 1, 1 are presumably partitions=1 and replicas=1 -- confirm against setupIdealState)
    setupIdealState(clusterName, new int[] { 0 }, resourceGroups, 1, 1);
    setupLiveInstances(clusterName, new int[] { 0 });
    setupStateModel(clusterName);
    // cluster data cache refresh pipeline
    Pipeline dataRefresh = new Pipeline();
    dataRefresh.addStage(new ReadClusterDataStage());
    // rebalance pipeline
    Pipeline rebalancePipeline = new Pipeline();
    rebalancePipeline.addStage(new ResourceComputationStage());
    rebalancePipeline.addStage(new CurrentStateComputationStage());
    rebalancePipeline.addStage(new BestPossibleStateCalcStage());
    rebalancePipeline.addStage(new IntermediateStateCalcStage());
    rebalancePipeline.addStage(new MessageGenerationPhase());
    rebalancePipeline.addStage(new MessageSelectionStage());
    rebalancePipeline.addStage(new MessageThrottleStage());
    rebalancePipeline.addStage(new TaskAssignmentStage());
    // round1: set node0 currentState to OFFLINE
    setCurrentState(clusterName, "localhost_0", resourceName, resourceName + "_0", "session_0", "OFFLINE");
    runPipeline(event, dataRefresh);
    runPipeline(event, rebalancePipeline);
    MessageSelectionStageOutput msgSelOutput = event.getAttribute(AttributeName.MESSAGES_SELECTED.name());
    List<Message> messages = msgSelOutput.getMessages(resourceName, new Partition(resourceName + "_0"));
    Assert.assertEquals(messages.size(), 1, "Should output 1 message: OFFLINE-SLAVE for node0");
    Message message = messages.get(0);
    Assert.assertEquals(message.getFromState(), "OFFLINE");
    Assert.assertEquals(message.getToState(), "SLAVE");
    Assert.assertEquals(message.getTgtName(), "localhost_0");
    // round2: drop the resource but keep the pending O->S message;
    // the controller should not send O->DROPPED until O->S is done
    HelixAdmin admin = new ZKHelixAdmin(_gZkClient);
    admin.dropResource(clusterName, resourceName);
    List<IdealState> idealStates = accessor.getChildValues(accessor.keyBuilder().idealStates());
    cache.setIdealStates(idealStates);
    runPipeline(event, dataRefresh);
    cache = event.getAttribute(AttributeName.ClusterDataCache.name());
    cache.setClusterConfig(new ClusterConfig(clusterName));
    runPipeline(event, rebalancePipeline);
    msgSelOutput = event.getAttribute(AttributeName.MESSAGES_SELECTED.name());
    messages = msgSelOutput.getMessages(resourceName, new Partition(resourceName + "_0"));
    Assert.assertEquals(messages.size(), 0, "Should NOT output 1 message: OFFLINE->DROPPED for localhost_0");
    // round3: remove O->S message for localhost_0, localhost_0 still in OFFLINE
    // controller should now send O->DROPPED to localhost_0
    Builder keyBuilder = accessor.keyBuilder();
    List<String> msgIds = accessor.getChildNames(keyBuilder.messages("localhost_0"));
    // Fail with a readable message instead of an IndexOutOfBoundsException below.
    Assert.assertFalse(msgIds.isEmpty(), "Expected a pending OFFLINE-SLAVE message for localhost_0");
    accessor.removeProperty(keyBuilder.message("localhost_0", msgIds.get(0)));
    runPipeline(event, dataRefresh);
    runPipeline(event, rebalancePipeline);
    msgSelOutput = event.getAttribute(AttributeName.MESSAGES_SELECTED.name());
    messages = msgSelOutput.getMessages(resourceName, new Partition(resourceName + "_0"));
    Assert.assertEquals(messages.size(), 1, "Should output 1 message: OFFLINE->DROPPED for localhost_0");
    message = messages.get(0);
    Assert.assertEquals(message.getFromState(), "OFFLINE");
    Assert.assertEquals(message.getToState(), "DROPPED");
    Assert.assertEquals(message.getTgtName(), "localhost_0");
    System.out.println("END " + clusterName + " at " + new Date());
}
Also used : Message(org.apache.helix.model.Message) Builder(org.apache.helix.PropertyKey.Builder) HelixAdmin(org.apache.helix.HelixAdmin) ZKHelixAdmin(org.apache.helix.manager.zk.ZKHelixAdmin) IdealState(org.apache.helix.model.IdealState) ZKHelixAdmin(org.apache.helix.manager.zk.ZKHelixAdmin) ZNRecord(org.apache.helix.ZNRecord) ZKHelixDataAccessor(org.apache.helix.manager.zk.ZKHelixDataAccessor) Partition(org.apache.helix.model.Partition) HelixManager(org.apache.helix.HelixManager) Date(java.util.Date) Pipeline(org.apache.helix.controller.pipeline.Pipeline) ZKHelixDataAccessor(org.apache.helix.manager.zk.ZKHelixDataAccessor) HelixDataAccessor(org.apache.helix.HelixDataAccessor) ClusterConfig(org.apache.helix.model.ClusterConfig) Test(org.testng.annotations.Test)

Example 7 with Pipeline

use of org.apache.helix.controller.pipeline.Pipeline in project helix by apache.

the class TestRebalancePipeline method testMasterXfer.

/**
 * Master-transfer scenario on a single partition with two replicas.
 * Round 1: only node1 is live and SLAVE, so one SLAVE->MASTER message for node1 is selected.
 * Round 2: node0 comes up as SLAVE while that message is still pending; no further
 * message may be selected for the partition.
 */
@Test
public void testMasterXfer() {
    final String cluster = "CLUSTER_" + _className + "_xfer";
    System.out.println("START " + cluster + " at " + new Date());

    ZkBaseDataAccessor<ZNRecord> baseAccessor = new ZkBaseDataAccessor<ZNRecord>(_gZkClient);
    HelixDataAccessor dataAccessor = new ZKHelixDataAccessor(cluster, baseAccessor);
    HelixManager helixManager = new DummyClusterManager(cluster, dataAccessor);
    ClusterEvent clusterEvent = new ClusterEvent(ClusterEventType.Unknown);
    clusterEvent.addAttribute(AttributeName.helixmanager.name(), helixManager);
    refreshClusterConfig(cluster, dataAccessor);

    final String resourceName = "testResource_xfer";
    final String partitionName = resourceName + "_0";
    String[] resources = { resourceName };

    // ideal state: node0 is MASTER, node1 is SLAVE
    // replica=2 means 1 master and 1 slave
    setupIdealState(cluster, new int[] { 0, 1 }, resources, 1, 2);
    setupLiveInstances(cluster, new int[] { 1 });
    setupStateModel(cluster);

    // pipeline that refreshes the cluster data cache
    Pipeline refresh = new Pipeline();
    refresh.addStage(new ReadClusterDataStage());

    // pipeline that computes and assigns state-transition messages
    Pipeline rebalance = new Pipeline();
    rebalance.addStage(new ResourceComputationStage());
    rebalance.addStage(new CurrentStateComputationStage());
    rebalance.addStage(new BestPossibleStateCalcStage());
    rebalance.addStage(new IntermediateStateCalcStage());
    rebalance.addStage(new MessageGenerationPhase());
    rebalance.addStage(new MessageSelectionStage());
    rebalance.addStage(new MessageThrottleStage());
    rebalance.addStage(new TaskAssignmentStage());

    // round1: set node1 currentState to SLAVE
    setCurrentState(cluster, "localhost_1", resourceName, partitionName, "session_1", "SLAVE");
    runPipeline(clusterEvent, refresh);
    runPipeline(clusterEvent, rebalance);

    MessageSelectionStageOutput selected = clusterEvent.getAttribute(AttributeName.MESSAGES_SELECTED.name());
    List<Message> selectedMessages = selected.getMessages(resourceName, new Partition(partitionName));
    Assert.assertEquals(selectedMessages.size(), 1, "Should output 1 message: SLAVE-MASTER for node1");
    Message promote = selectedMessages.get(0);
    Assert.assertEquals(promote.getFromState(), "SLAVE");
    Assert.assertEquals(promote.getToState(), "MASTER");
    Assert.assertEquals(promote.getTgtName(), "localhost_1");

    // round2: updates node0 currentState to SLAVE but keep the
    // message, make sure controller should not send S->M until removal is done
    setupLiveInstances(cluster, new int[] { 0 });
    setCurrentState(cluster, "localhost_0", resourceName, partitionName, "session_0", "SLAVE");
    runPipeline(clusterEvent, refresh);
    runPipeline(clusterEvent, rebalance);

    selected = clusterEvent.getAttribute(AttributeName.MESSAGES_SELECTED.name());
    selectedMessages = selected.getMessages(resourceName, new Partition(partitionName));
    Assert.assertEquals(selectedMessages.size(), 0, "Should NOT output 1 message: SLAVE-MASTER for node0");

    System.out.println("END " + cluster + " at " + new Date());
}
Also used : Partition(org.apache.helix.model.Partition) HelixManager(org.apache.helix.HelixManager) Message(org.apache.helix.model.Message) Date(java.util.Date) Pipeline(org.apache.helix.controller.pipeline.Pipeline) ZKHelixDataAccessor(org.apache.helix.manager.zk.ZKHelixDataAccessor) HelixDataAccessor(org.apache.helix.HelixDataAccessor) ZNRecord(org.apache.helix.ZNRecord) ZKHelixDataAccessor(org.apache.helix.manager.zk.ZKHelixDataAccessor) Test(org.testng.annotations.Test)

Example 8 with Pipeline

use of org.apache.helix.controller.pipeline.Pipeline in project helix by apache.

the class TestRebalancePipeline method testNoDuplicatedMaster.

/**
 * Starts with node0 as SLAVE and node1 as MASTER for the single partition and checks that
 * the pipeline selects exactly one message, MASTER->SLAVE for localhost_1, so that two
 * masters are never requested at the same time.
 */
@Test
public void testNoDuplicatedMaster() {
    final String cluster = "CLUSTER_" + _className + "_no_duplicated_master";
    System.out.println("START " + cluster + " at " + new Date());

    ZkBaseDataAccessor<ZNRecord> baseAccessor = new ZkBaseDataAccessor<ZNRecord>(_gZkClient);
    HelixDataAccessor dataAccessor = new ZKHelixDataAccessor(cluster, baseAccessor);
    HelixManager helixManager = new DummyClusterManager(cluster, dataAccessor);
    ClusterEvent clusterEvent = new ClusterEvent(ClusterEventType.Unknown);
    clusterEvent.addAttribute(AttributeName.helixmanager.name(), helixManager);
    refreshClusterConfig(cluster, dataAccessor);

    final String resourceName = "testResource_no_duplicated_master";
    final String partitionName = resourceName + "_0";
    String[] resources = { resourceName };

    // ideal state: node0 is SLAVE, node1 is MASTER
    // replica=2 means 1 master and 1 slave
    setupIdealState(cluster, new int[] { 0, 1 }, resources, 1, 2);
    setupLiveInstances(cluster, new int[] { 0, 1 });
    setupStateModel(cluster);

    // pipeline that refreshes the cluster data cache
    Pipeline refresh = new Pipeline();
    refresh.addStage(new ReadClusterDataStage());

    // pipeline that computes and assigns state-transition messages
    Pipeline rebalance = new Pipeline();
    rebalance.addStage(new ResourceComputationStage());
    rebalance.addStage(new CurrentStateComputationStage());
    rebalance.addStage(new BestPossibleStateCalcStage());
    rebalance.addStage(new IntermediateStateCalcStage());
    rebalance.addStage(new MessageGenerationPhase());
    rebalance.addStage(new MessageSelectionStage());
    rebalance.addStage(new MessageThrottleStage());
    rebalance.addStage(new TaskAssignmentStage());

    // set node0 currentState to SLAVE, node1 currentState to MASTER
    // Helix will try to switch the state of the two instances, but it should not be two MASTER at the same time
    // so it should first transit M->S, then transit another instance S->M
    setCurrentState(cluster, "localhost_0", resourceName, partitionName, "session_0", "SLAVE");
    setCurrentState(cluster, "localhost_1", resourceName, partitionName, "session_1", "MASTER");
    runPipeline(clusterEvent, refresh);
    runPipeline(clusterEvent, rebalance);

    MessageSelectionStageOutput selected = clusterEvent.getAttribute(AttributeName.MESSAGES_SELECTED.name());
    List<Message> selectedMessages = selected.getMessages(resourceName, new Partition(partitionName));
    Assert.assertEquals(selectedMessages.size(), 1, "Should output 1 message: MASTER-SLAVE for localhost_1");
    Message demote = selectedMessages.get(0);
    Assert.assertEquals(demote.getFromState(), "MASTER");
    Assert.assertEquals(demote.getToState(), "SLAVE");
    Assert.assertEquals(demote.getTgtName(), "localhost_1");

    System.out.println("END " + cluster + " at " + new Date());
}
Also used : Partition(org.apache.helix.model.Partition) HelixManager(org.apache.helix.HelixManager) Message(org.apache.helix.model.Message) Date(java.util.Date) Pipeline(org.apache.helix.controller.pipeline.Pipeline) ZKHelixDataAccessor(org.apache.helix.manager.zk.ZKHelixDataAccessor) HelixDataAccessor(org.apache.helix.HelixDataAccessor) ZNRecord(org.apache.helix.ZNRecord) ZKHelixDataAccessor(org.apache.helix.manager.zk.ZKHelixDataAccessor) Test(org.testng.annotations.Test)

Example 9 with Pipeline

use of org.apache.helix.controller.pipeline.Pipeline in project helix by apache.

the class GenericHelixController method handleEvent.

/**
 * Runs every pipeline registered for the event's type, in order, stopping at the first
 * pipeline that throws.
 * <p>
 * lock-always: caller always needs to obtain an external lock before call, calls to
 * handleEvent() should be serialized.
 * <p>
 * The event is ignored when it carries no manager, when the manager is not the leader,
 * when the controller is paused, when the change context is of type FINALIZE, or when no
 * pipeline is registered for the event type. A {@link HelixMetaDataAccessException} marks
 * the cache for a full refresh and, if the matching event queue is empty, schedules a
 * RetryRebalance event.
 *
 * @param event the cluster event to process; expected to carry the HelixManager under
 *          {@code AttributeName.helixmanager}
 * @param cache the cluster data cache the pipelines run against; it is attached to the
 *          event and also selects between the task registry and the default registry
 */
protected void handleEvent(ClusterEvent event, ClusterDataCache cache) {
    HelixManager manager = event.getAttribute(AttributeName.helixmanager.name());
    if (manager == null) {
        logger.error("No cluster manager in event:" + event.getEventType());
        return;
    }
    if (!manager.isLeader()) {
        logger.error("Cluster manager: " + manager.getInstanceName() + " is not leader for " + manager.getClusterName() + ". Pipeline will not be invoked");
        return;
    }
    // NOTE: the controller may be executing in un-paused mode here, which might not be the
    // config in ZK.
    if (_paused) {
        logger.info("Cluster " + manager.getClusterName() + " is paused. Ignoring the event:" + event.getEventType());
        return;
    }
    // May be null when the event carries no change context.
    NotificationContext context = event.getAttribute(AttributeName.changeContext.name());
    if (context != null) {
        if (context.getType() == Type.FINALIZE) {
            stopRebalancingTimers();
            logger.info("Get FINALIZE notification, skip the pipeline. Event :" + event.getEventType());
            return;
        } else {
            // TODO: should be in the initialization of controller.
            if (_cache != null) {
                checkRebalancingTimer(manager, Collections.EMPTY_LIST, _cache.getClusterConfig());
            }
            if (_isMonitoring) {
                event.addAttribute(AttributeName.clusterStatusMonitor.name(), _clusterStatusMonitor);
            }
        }
    }
    // add the cache
    event.addAttribute(AttributeName.ClusterDataCache.name(), cache);
    List<Pipeline> pipelines = cache.isTaskCache() ? _taskRegistry.getPipelinesForEvent(event.getEventType()) : _registry.getPipelinesForEvent(event.getEventType());
    if (pipelines == null || pipelines.isEmpty()) {
        logger.info("No " + getPipelineType(cache.isTaskCache()) + " pipeline to run for event:" + event.getEventType());
        return;
    }
    // Argument order must follow the placeholders: pipeline type, cluster name, event type.
    logger.info(String.format("START: Invoking %s controller pipeline for cluster %s event: %s", getPipelineType(cache.isTaskCache()), manager.getClusterName(), event.getEventType()));
    long startTime = System.currentTimeMillis();
    boolean rebalanceFail = false;
    for (Pipeline pipeline : pipelines) {
        try {
            pipeline.handle(event);
            pipeline.finish();
        } catch (Exception e) {
            logger.error("Exception while executing " + getPipelineType(cache.isTaskCache()) + " pipeline: " + pipeline + " for cluster " + _clusterName + ". Will not continue to next pipeline", e);
            if (e instanceof HelixMetaDataAccessException) {
                rebalanceFail = true;
                // If pipeline failed due to read/write fails to zookeeper, retry the pipeline.
                cache.requireFullRefresh();
                logger.warn("Rebalance pipeline failed due to read failure from zookeeper, cluster: " + _clusterName);
                // only push a retry event when there is no pending event in the corresponding event queue.
                if (isEventQueueEmpty(cache.isTaskCache())) {
                    _continousRebalanceFailureCount++;
                    long delay = getRetryDelay(_continousRebalanceFailureCount);
                    if (delay == 0) {
                        forceRebalance(manager, ClusterEventType.RetryRebalance);
                    } else {
                        _asyncTasksThreadPool.schedule(new RebalanceTask(manager, ClusterEventType.RetryRebalance), delay, TimeUnit.MILLISECONDS);
                    }
                    logger.info("Retry rebalance pipeline with delay " + delay + "ms for cluster: " + _clusterName);
                }
            }
            // NOTE(review): unlike the other monitor calls in this method, this one is not
            // guarded by _isMonitoring -- confirm that is intended.
            _clusterStatusMonitor.reportRebalanceFailure();
            break;
        }
    }
    // Only metadata-access failures count toward the consecutive-failure retry backoff.
    if (!rebalanceFail) {
        _continousRebalanceFailureCount = 0;
    }
    long endTime = System.currentTimeMillis();
    logger.info(String.format("END: Invoking %s controller pipeline for event: %s for cluster %s, took %d ms", getPipelineType(cache.isTaskCache()), event.getEventType(), manager.getClusterName(), (endTime - startTime)));
    if (!cache.isTaskCache()) {
        // report event process durations
        NotificationContext notificationContext = event.getAttribute(AttributeName.changeContext.name());
        long enqueueTime = event.getCreationTime();
        long zkCallbackTime;
        StringBuilder sb = new StringBuilder();
        if (notificationContext != null) {
            zkCallbackTime = notificationContext.getCreationTime();
            if (_isMonitoring) {
                _clusterStatusMonitor.updateClusterEventDuration(ClusterEventMonitor.PhaseName.Callback.name(), enqueueTime - zkCallbackTime);
            }
            // Values go in as format arguments so runtime text is never parsed as a pattern.
            sb.append(String.format("Callback time for event: %s took: %d ms\n", event.getEventType(), enqueueTime - zkCallbackTime));
        }
        if (_isMonitoring) {
            _clusterStatusMonitor.updateClusterEventDuration(ClusterEventMonitor.PhaseName.InQueue.name(), startTime - enqueueTime);
            _clusterStatusMonitor.updateClusterEventDuration(ClusterEventMonitor.PhaseName.TotalProcessed.name(), endTime - startTime);
        }
        sb.append(String.format("InQueue time for event: %s took: %d ms\n", event.getEventType(), startTime - enqueueTime));
        sb.append(String.format("TotalProcessed time for event: %s took: %d ms", event.getEventType(), endTime - startTime));
        logger.info(sb.toString());
    } else if (_isMonitoring) {
        // report workflow status
        TaskDriver driver = new TaskDriver(manager);
        _clusterStatusMonitor.refreshWorkflowsStatus(driver);
        _clusterStatusMonitor.refreshJobsStatus(driver);
    }
    // If event handling happens before controller deactivate, the process may write unnecessary
    // MBeans to monitoring after the monitor is disabled.
    // So reset ClusterStatusMonitor according to it's status after all event handling.
    // TODO remove this once clusterStatusMonitor blocks any MBean register on isMonitoring = false.
    resetClusterStatusMonitor();
}
Also used : NotificationContext(org.apache.helix.NotificationContext) HelixManager(org.apache.helix.HelixManager) HelixMetaDataAccessException(org.apache.helix.api.exceptions.HelixMetaDataAccessException) TaskDriver(org.apache.helix.task.TaskDriver) ZkInterruptedException(org.I0Itec.zkclient.exception.ZkInterruptedException) HelixMetaDataAccessException(org.apache.helix.api.exceptions.HelixMetaDataAccessException) Pipeline(org.apache.helix.controller.pipeline.Pipeline)

Example 10 with Pipeline

use of org.apache.helix.controller.pipeline.Pipeline in project helix by apache.

the class GenericHelixController method createDefaultRegistry.

/**
 * Builds the default mapping from cluster event types to the pipelines that handle them.
 * Four pipelines are assembled (cache refresh, rebalance, external view, live-instance
 * compatibility check), all created with the same name, and registered per event type.
 *
 * @param pipelineName name passed to every Pipeline created here
 * @return a new registry populated with the default pipelines
 */
private static PipelineRegistry createDefaultRegistry(String pipelineName) {
    logger.info("createDefaultRegistry");
    synchronized (GenericHelixController.class) {
        PipelineRegistry defaults = new PipelineRegistry();

        // refreshes the cluster data cache
        Pipeline refresh = new Pipeline(pipelineName);
        refresh.addStage(new ReadClusterDataStage());

        // computes and assigns state transitions; stages run in the order added
        Pipeline rebalance = new Pipeline(pipelineName);
        rebalance.addStage(new ResourceComputationStage());
        rebalance.addStage(new ResourceValidationStage());
        rebalance.addStage(new CurrentStateComputationStage());
        rebalance.addStage(new BestPossibleStateCalcStage());
        rebalance.addStage(new IntermediateStateCalcStage());
        rebalance.addStage(new MessageGenerationPhase());
        rebalance.addStage(new MessageSelectionStage());
        rebalance.addStage(new MessageThrottleStage());
        rebalance.addStage(new TaskAssignmentStage());
        rebalance.addStage(new PersistAssignmentStage());
        rebalance.addStage(new TargetExteralViewCalcStage());

        // generates the external view
        Pipeline externalView = new Pipeline(pipelineName);
        externalView.addStage(new ExternalViewComputeStage());

        // backward compatibility check for live instances
        Pipeline compatibilityCheck = new Pipeline(pipelineName);
        compatibilityCheck.addStage(new CompatibilityCheckStage());

        defaults.register(ClusterEventType.IdealStateChange, refresh, rebalance);
        defaults.register(ClusterEventType.CurrentStateChange, refresh, rebalance, externalView);

        // config changes only need a refresh followed by a rebalance
        ClusterEventType[] configChanges = {
            ClusterEventType.InstanceConfigChange,
            ClusterEventType.ResourceConfigChange,
            ClusterEventType.ClusterConfigChange
        };
        for (ClusterEventType configChange : configChanges) {
            defaults.register(configChange, refresh, rebalance);
        }

        defaults.register(ClusterEventType.LiveInstanceChange, refresh, compatibilityCheck, rebalance, externalView);
        defaults.register(ClusterEventType.MessageChange, refresh, rebalance);
        defaults.register(ClusterEventType.ExternalViewChange, refresh);

        // resume and periodic rebalance also regenerate the external view
        ClusterEventType[] fullCycleEvents = {
            ClusterEventType.Resume,
            ClusterEventType.PeriodicalRebalance
        };
        for (ClusterEventType fullCycleEvent : fullCycleEvents) {
            defaults.register(fullCycleEvent, refresh, rebalance, externalView);
        }
        return defaults;
    }
}
Also used : PipelineRegistry(org.apache.helix.controller.pipeline.PipelineRegistry) Pipeline(org.apache.helix.controller.pipeline.Pipeline)

Aggregations

Pipeline (org.apache.helix.controller.pipeline.Pipeline)11 HelixManager (org.apache.helix.HelixManager)8 HelixDataAccessor (org.apache.helix.HelixDataAccessor)7 ZNRecord (org.apache.helix.ZNRecord)7 ZKHelixDataAccessor (org.apache.helix.manager.zk.ZKHelixDataAccessor)7 Message (org.apache.helix.model.Message)7 Partition (org.apache.helix.model.Partition)7 Test (org.testng.annotations.Test)7 Date (java.util.Date)6 ArrayList (java.util.ArrayList)2 Builder (org.apache.helix.PropertyKey.Builder)2 ZkInterruptedException (org.I0Itec.zkclient.exception.ZkInterruptedException)1 HelixAdmin (org.apache.helix.HelixAdmin)1 NotificationContext (org.apache.helix.NotificationContext)1 HelixMetaDataAccessException (org.apache.helix.api.exceptions.HelixMetaDataAccessException)1 PipelineRegistry (org.apache.helix.controller.pipeline.PipelineRegistry)1 BestPossibleStateCalcStage (org.apache.helix.controller.stages.BestPossibleStateCalcStage)1 BestPossibleStateOutput (org.apache.helix.controller.stages.BestPossibleStateOutput)1 ClusterDataCache (org.apache.helix.controller.stages.ClusterDataCache)1 CurrentStateOutput (org.apache.helix.controller.stages.CurrentStateOutput)1