Search in sources :

Example 21 with Communication

use of com.alibaba.datax.core.statistics.communication.Communication in project DataX by alibaba.

In the class AbstractTGContainerCommunicator, method collectState:

/**
 * Produces one overall state for the task group.
 *
 * A fresh {@link Communication} is seeded with {@code State.SUCCEEDED}; the state of every
 * task communication held by the collector is then merged into it via
 * {@code mergeStateFrom}, and the resulting state is returned.
 *
 * @return the state of the seeded communication after all task states have been merged in
 */
@Override
public final State collectState() {
    final Communication aggregate = new Communication();
    aggregate.setState(State.SUCCEEDED);

    for (final Communication perTask : super.getCollector().getTaskCommunicationMap().values()) {
        aggregate.mergeStateFrom(perTask);
    }

    return aggregate.getState();
}
Also used : Communication(com.alibaba.datax.core.statistics.communication.Communication)

Example 22 with Communication

use of com.alibaba.datax.core.statistics.communication.Communication in project DataX by alibaba.

In the class TaskGroupContainer, method start:

/**
 * Runs the scheduling loop for every task configured in this task group.
 *
 * Outline of what the method does:
 * <ol>
 *   <li>Reads the loop settings from the configuration: sleep interval (default 100),
 *       report interval (default 10000), channel count, max retry times (default 1),
 *       retry interval (default 10000) and max failover wait (default 60000).</li>
 *   <li>Registers a communication for every task config and builds the pending queue.</li>
 *   <li>Loops: inspects finished tasks (re-queues a failed task when it supports failover
 *       and still has attempts left, otherwise flags the group as failed), starts queued
 *       tasks while fewer than {@code channelNumber} are running, exits once the queue is
 *       empty, all running tasks are done and the collected state is SUCCEEDED, and
 *       reports whenever the report interval has elapsed.</li>
 * </ol>
 *
 * Any {@link Throwable} raised inside the loop is recorded on the collected communication
 * (unless one is already present), the state is set to FAILED and reported, and the error is
 * rethrown through {@code DataXException.asDataXException(FrameworkErrorCode.RUNTIME_ERROR, e)}.
 * The {@code finally} block logs VM and perf-trace summaries when the perf trace is not
 * running as a job.
 */
@Override
public void start() {
    try {
        /**
         * Interval between state checks; kept short so that tasks can be dispatched
         * to their channels promptly.
         */
        int sleepIntervalInMillSec = this.configuration.getInt(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_SLEEPINTERVAL, 100);
        /**
         * Interval between state reports; somewhat longer, to avoid excessive reporting.
         */
        long reportIntervalInMillSec = this.configuration.getLong(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_REPORTINTERVAL, 10000);
        /**
         * Report performance statistics once every 2 minutes.
         * NOTE(review): no setting matching this comment is read in this method;
         * the comment appears to be left over - confirm.
         */
        // Number of channels for this task group (no default: read as-is from the configuration)
        int channelNumber = this.configuration.getInt(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_CHANNEL);
        int taskMaxRetryTimes = this.configuration.getInt(CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_MAXRETRYTIMES, 1);
        long taskRetryIntervalInMsec = this.configuration.getLong(CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_RETRYINTERVALINMSEC, 10000);
        long taskMaxWaitInMsec = this.configuration.getLong(CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_MAXWAITINMSEC, 60000);
        List<Configuration> taskConfigs = this.configuration.getListConfiguration(CoreConstant.DATAX_JOB_CONTENT);
        if (LOG.isDebugEnabled()) {
            LOG.debug("taskGroup[{}]'s task configs[{}]", this.taskGroupId, JSON.toJSONString(taskConfigs));
        }
        int taskCountInThisTaskGroup = taskConfigs.size();
        LOG.info(String.format("taskGroupId=[%d] start [%d] channels for [%d] tasks.", this.taskGroupId, channelNumber, taskCountInThisTaskGroup));
        this.containerCommunicator.registerCommunication(taskConfigs);
        // taskId -> task configuration
        Map<Integer, Configuration> taskConfigMap = buildTaskConfigMap(taskConfigs);
        // Tasks still waiting to be run
        List<Configuration> taskQueue = buildRemainTasks(taskConfigs);
        // taskId -> executor of the most recent failed attempt
        Map<Integer, TaskExecutor> taskFailedExecutorMap = new HashMap<Integer, TaskExecutor>();
        // Tasks currently running
        List<TaskExecutor> runTasks = new ArrayList<TaskExecutor>(channelNumber);
        // taskId -> start time (ms) of the current attempt
        Map<Integer, Long> taskStartTimeMap = new HashMap<Integer, Long>();
        // 0 means the first loop iteration reports as soon as it reaches step 5
        long lastReportTimeStamp = 0;
        Communication lastTaskGroupContainerCommunication = new Communication();
        while (true) {
            // 1. Check the state of every task
            boolean failedOrKilled = false;
            Map<Integer, Communication> communicationMap = containerCommunicator.getCommunicationMap();
            for (Map.Entry<Integer, Communication> entry : communicationMap.entrySet()) {
                Integer taskId = entry.getKey();
                Communication taskCommunication = entry.getValue();
                if (!taskCommunication.isFinished()) {
                    continue;
                }
                TaskExecutor taskExecutor = removeTask(runTasks, taskId);
                // The task was removed from runTasks above, so remove it from the monitor as well
                taskMonitor.removeTask(taskId);
                // Failed: check whether the task supports failover and the retry count is still under the limit
                if (taskCommunication.getState() == State.FAILED) {
                    // NOTE(review): taskExecutor is dereferenced below without a null check -
                    // confirm removeTask always returns an executor for a FAILED task.
                    taskFailedExecutorMap.put(taskId, taskExecutor);
                    if (taskExecutor.supportFailOver() && taskExecutor.getAttemptCount() < taskMaxRetryTimes) {
                        // Shut down the old executor
                        taskExecutor.shutdown();
                        // Reset the task's communication state
                        containerCommunicator.resetCommunication(taskId);
                        Configuration taskConfig = taskConfigMap.get(taskId);
                        // Put the task back into the pending queue
                        taskQueue.add(taskConfig);
                    } else {
                        failedOrKilled = true;
                        break;
                    }
                } else if (taskCommunication.getState() == State.KILLED) {
                    failedOrKilled = true;
                    break;
                } else if (taskCommunication.getState() == State.SUCCEEDED) {
                    Long taskStartTime = taskStartTimeMap.get(taskId);
                    // The start time is removed below, so this branch runs only once per successful task
                    if (taskStartTime != null) {
                        Long usedTime = System.currentTimeMillis() - taskStartTime;
                        LOG.info("taskGroup[{}] taskId[{}] is successed, used[{}]ms", this.taskGroupId, taskId, usedTime);
                        // usedTime*1000*1000 converts ms into the ns recorded by PerfRecord. This is only a simple
                        // registration used for printing the longest-running task, hence the dedicated static method.
                        PerfRecord.addPerfRecord(taskGroupId, taskId, PerfRecord.PHASE.TASK_TOTAL, taskStartTime, usedTime * 1000L * 1000L);
                        taskStartTimeMap.remove(taskId);
                        taskConfigMap.remove(taskId);
                    }
                }
            }
            // 2. If the overall state of the task executors in this task group is failed, report the error
            if (failedOrKilled) {
                lastTaskGroupContainerCommunication = reportTaskGroupCommunication(lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
                throw DataXException.asDataXException(FrameworkErrorCode.PLUGIN_RUNTIME_ERROR, lastTaskGroupContainerCommunication.getThrowable());
            }
            // 3. There are tasks not yet executed and the number of running tasks is below the channel limit
            Iterator<Configuration> iterator = taskQueue.iterator();
            while (iterator.hasNext() && runTasks.size() < channelNumber) {
                Configuration taskConfig = iterator.next();
                Integer taskId = taskConfig.getInt(CoreConstant.TASK_ID);
                int attemptCount = 1;
                TaskExecutor lastExecutor = taskFailedExecutorMap.get(taskId);
                if (lastExecutor != null) {
                    attemptCount = lastExecutor.getAttemptCount() + 1;
                    long now = System.currentTimeMillis();
                    long failedTime = lastExecutor.getTimeStamp();
                    if (now - failedTime < taskRetryIntervalInMsec) {
                        // The retry interval has not elapsed yet; leave the task in the queue
                        continue;
                    }
                    if (!lastExecutor.isShutdown()) {
                        // The previously failed attempt has still not finished
                        if (now - failedTime > taskMaxWaitInMsec) {
                            markCommunicationFailed(taskId);
                            reportTaskGroupCommunication(lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
                            throw DataXException.asDataXException(CommonErrorCode.WAIT_TIME_EXCEED, "task failover等待超时");
                        } else {
                            // Try to shut it down again
                            lastExecutor.shutdown();
                            continue;
                        }
                    } else {
                        LOG.info("taskGroup[{}] taskId[{}] attemptCount[{}] has already shutdown", this.taskGroupId, taskId, lastExecutor.getAttemptCount());
                    }
                }
                // When retries are enabled the executor is given a clone of the config rather than the queued instance
                Configuration taskConfigForRun = taskMaxRetryTimes > 1 ? taskConfig.clone() : taskConfig;
                TaskExecutor taskExecutor = new TaskExecutor(taskConfigForRun, attemptCount);
                taskStartTimeMap.put(taskId, System.currentTimeMillis());
                taskExecutor.doStart();
                iterator.remove();
                runTasks.add(taskExecutor);
                // The task was added to runTasks above, so register it with the monitor
                taskMonitor.registerTask(taskId, this.containerCommunicator.getCommunication(taskId));
                taskFailedExecutorMap.remove(taskId);
                LOG.info("taskGroup[{}] taskId[{}] attemptCount[{}] is started", this.taskGroupId, taskId, attemptCount);
            }
            // 4. Queue is empty, executors have finished and the collected state is success ---> succeeded
            if (taskQueue.isEmpty() && isAllTaskDone(runTasks) && containerCommunicator.collectState() == State.SUCCEEDED) {
                // Report once on success as well; otherwise, when tasks finish very quickly, the collected information would be inaccurate
                lastTaskGroupContainerCommunication = reportTaskGroupCommunication(lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
                LOG.info("taskGroup[{}] completed it's tasks.", this.taskGroupId);
                break;
            }
            // 5. If the report interval has elapsed since the last report, report immediately
            long now = System.currentTimeMillis();
            if (now - lastReportTimeStamp > reportIntervalInMillSec) {
                lastTaskGroupContainerCommunication = reportTaskGroupCommunication(lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
                lastReportTimeStamp = now;
                // taskMonitor checks the running tasks every reportIntervalInMillSec
                for (TaskExecutor taskExecutor : runTasks) {
                    taskMonitor.report(taskExecutor.getTaskId(), this.containerCommunicator.getCommunication(taskExecutor.getTaskId()));
                }
            }
            // NOTE(review): an InterruptedException from sleep lands in the catch (Throwable) below;
            // the thread's interrupt flag is not restored there - confirm that is intended.
            Thread.sleep(sleepIntervalInMillSec);
        }
        // 6. Report one final time
        reportTaskGroupCommunication(lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
    } catch (Throwable e) {
        Communication nowTaskGroupContainerCommunication = this.containerCommunicator.collect();
        // Keep a throwable already present on the collected communication; otherwise record this one
        if (nowTaskGroupContainerCommunication.getThrowable() == null) {
            nowTaskGroupContainerCommunication.setThrowable(e);
        }
        nowTaskGroupContainerCommunication.setState(State.FAILED);
        this.containerCommunicator.report(nowTaskGroupContainerCommunication);
        throw DataXException.asDataXException(FrameworkErrorCode.RUNTIME_ERROR, e);
    } finally {
        if (!PerfTrace.getInstance().isJob()) {
            // Finally, print the average CPU consumption and the GC statistics
            VMInfo vmInfo = VMInfo.getVmInfo();
            if (vmInfo != null) {
                vmInfo.getDelta(false);
                LOG.info(vmInfo.totalString());
            }
            LOG.info(PerfTrace.getInstance().summarizeNoException());
        }
    }
}
Also used : Configuration(com.alibaba.datax.common.util.Configuration) VMInfo(com.alibaba.datax.common.statistics.VMInfo) Communication(com.alibaba.datax.core.statistics.communication.Communication)

Example 23 with Communication

use of com.alibaba.datax.core.statistics.communication.Communication in project DataX by alibaba.

In the class TaskGroupContainer, method reportTaskGroupCommunication:

/**
 * Collects the current task-group communication, timestamps it, derives the communication
 * to report from it and the previous one, and hands that to the container communicator.
 *
 * @param lastTaskGroupContainerCommunication the communication returned by the previous report
 * @param taskCount number of tasks, passed through to
 *                  {@code CommunicationTool.getReportCommunication}
 * @return the communication that was reported
 */
private Communication reportTaskGroupCommunication(Communication lastTaskGroupContainerCommunication, int taskCount) {
    final Communication current = this.containerCommunicator.collect();
    current.setTimestamp(System.currentTimeMillis());

    final Communication toReport =
            CommunicationTool.getReportCommunication(current, lastTaskGroupContainerCommunication, taskCount);
    this.containerCommunicator.report(toReport);
    return toReport;
}
Also used : Communication(com.alibaba.datax.core.statistics.communication.Communication)

Example 24 with Communication

use of com.alibaba.datax.core.statistics.communication.Communication in project DataX by alibaba.

In the class JobContainerTest, method testErrorLimitIgnoreCheck:

/**
 * Calls {@code JobContainer}'s private {@code checkLimit} method through reflection with the
 * error limit configured as -1 and a mocked communicator whose {@code collect()} returns
 * counters of 100 read-succeeded and 100 write-received records. The test passes when the
 * invocation completes without throwing.
 */
@Test
public void testErrorLimitIgnoreCheck() throws Exception {
    this.configuration.set(CoreConstant.DATAX_JOB_SETTING_ERRORLIMIT, -1);
    JobContainer container = new JobContainer(this.configuration);

    Communication stats = new Communication();
    stats.setLongCounter(CommunicationTool.READ_SUCCEED_RECORDS, 100);
    stats.setLongCounter(CommunicationTool.WRITE_RECEIVED_RECORDS, 100);

    // Feed the counters to the container through a mocked communicator.
    AbstractContainerCommunicator mockedCommunicator = PowerMockito.mock(AbstractContainerCommunicator.class);
    container.setContainerCommunicator(mockedCommunicator);
    PowerMockito.when(mockedCommunicator.collect()).thenReturn(stats);

    // checkLimit is private, so it is looked up and invoked reflectively.
    Method checkLimit = container.getClass().getDeclaredMethod("checkLimit");
    checkLimit.setAccessible(true);
    checkLimit.invoke(container);
    checkLimit.setAccessible(false);
}
Also used : JobContainer(com.alibaba.datax.core.job.JobContainer) AbstractContainerCommunicator(com.alibaba.datax.core.statistics.container.communicator.AbstractContainerCommunicator) Method(java.lang.reflect.Method) Communication(com.alibaba.datax.core.statistics.communication.Communication) Test(org.junit.Test)

Example 25 with Communication

use of com.alibaba.datax.core.statistics.communication.Communication in project DataX by alibaba.

In the class JobContainerTest, method testErrorLimitPercentCheck:

/**
 * Calls {@code JobContainer}'s private {@code checkLimit} method through reflection with the
 * record-count limit removed, the percentage limit set to 0.1, and counters of 100
 * read-succeeded, 80 write-received and 20 write-failed records. An exception is expected.
 *
 * The counters are supplied through a mocked communicator, the same way
 * {@code testErrorLimitIgnoreCheck} does it. Previously the {@code Communication} was built
 * but never attached to the container, so the expected exception could not have been caused
 * by these counters.
 * NOTE(review): presumably the 20-in-100 failure rate is what trips the 0.1 percentage limit
 * inside checkLimit - confirm against JobContainer / ErrorRecordChecker.
 */
@Test(expected = Exception.class)
public void testErrorLimitPercentCheck() throws Exception {
    this.configuration.remove(CoreConstant.DATAX_JOB_SETTING_ERRORLIMIT_RECORD);
    this.configuration.set(CoreConstant.DATAX_JOB_SETTING_ERRORLIMIT_PERCENT, 0.1);
    JobContainer jobContainer = new JobContainer(this.configuration);
    Communication communication = new Communication();
    communication.setLongCounter(CommunicationTool.READ_SUCCEED_RECORDS, 100);
    communication.setLongCounter(CommunicationTool.WRITE_RECEIVED_RECORDS, 80);
    communication.setLongCounter(CommunicationTool.WRITE_FAILED_RECORDS, 20);
    // Make the container actually see the counters above.
    AbstractContainerCommunicator communicator = PowerMockito.mock(AbstractContainerCommunicator.class);
    jobContainer.setContainerCommunicator(communicator);
    PowerMockito.when(communicator.collect()).thenReturn(communication);
    // checkLimit is private, so it is looked up and invoked reflectively.
    Method initMethod = jobContainer.getClass().getDeclaredMethod("checkLimit");
    initMethod.setAccessible(true);
    try {
        initMethod.invoke(jobContainer);
    } finally {
        // invoke is expected to throw, so restore accessibility on that path too.
        initMethod.setAccessible(false);
    }
}
Also used : JobContainer(com.alibaba.datax.core.job.JobContainer) Method(java.lang.reflect.Method) Communication(com.alibaba.datax.core.statistics.communication.Communication) Test(org.junit.Test)

Aggregations

Communication (com.alibaba.datax.core.statistics.communication.Communication) 35, Test (org.junit.Test) 20, Configuration (com.alibaba.datax.common.util.Configuration) 13, Method (java.lang.reflect.Method) 6, LongColumn (com.alibaba.datax.common.element.LongColumn) 5, Record (com.alibaba.datax.common.element.Record) 5, Channel (com.alibaba.datax.core.transport.channel.Channel) 5, MemoryChannel (com.alibaba.datax.core.transport.channel.memory.MemoryChannel) 5, DefaultRecord (com.alibaba.datax.core.transport.record.DefaultRecord) 5, ArrayList (java.util.ArrayList) 5, TaskPluginCollector (com.alibaba.datax.common.plugin.TaskPluginCollector) 4, TaskGroupContainer (com.alibaba.datax.core.taskgroup.TaskGroupContainer) 4, ErrorRecordChecker (com.alibaba.datax.core.util.ErrorRecordChecker) 4, JobContainer (com.alibaba.datax.core.job.JobContainer) 3, AbstractContainerCommunicator (com.alibaba.datax.core.statistics.container.communicator.AbstractContainerCommunicator) 3, VMInfo (com.alibaba.datax.common.statistics.VMInfo) 2, LocalTGCommunicationManager (com.alibaba.datax.core.statistics.communication.LocalTGCommunicationManager) 2, StandAloneJobContainerCommunicator (com.alibaba.datax.core.statistics.container.communicator.job.StandAloneJobContainerCommunicator) 2, ProcessInnerReporter (com.alibaba.datax.core.statistics.container.report.ProcessInnerReporter) 2, ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap) 2