Search in sources :

Example 1 with VMInfo

use of com.alibaba.datax.common.statistics.VMInfo in project DataX by alibaba.

the class JobContainer method start.

/**
     * jobContainer主要负责的工作全部在start()里面,包括init、prepare、split、scheduler、
     * post以及destroy和statistics
     */
@Override
public void start() {
    LOG.info("DataX jobContainer starts job.");
    boolean hasException = false;
    boolean isDryRun = false;
    try {
        this.startTimeStamp = System.currentTimeMillis();
        isDryRun = configuration.getBool(CoreConstant.DATAX_JOB_SETTING_DRYRUN, false);
        if (isDryRun) {
            LOG.info("jobContainer starts to do preCheck ...");
            this.preCheck();
        } else {
            userConf = configuration.clone();
            LOG.debug("jobContainer starts to do preHandle ...");
            this.preHandle();
            LOG.debug("jobContainer starts to do init ...");
            this.init();
            LOG.info("jobContainer starts to do prepare ...");
            this.prepare();
            LOG.info("jobContainer starts to do split ...");
            this.totalStage = this.split();
            LOG.info("jobContainer starts to do schedule ...");
            this.schedule();
            LOG.debug("jobContainer starts to do post ...");
            this.post();
            LOG.debug("jobContainer starts to do postHandle ...");
            this.postHandle();
            LOG.info("DataX jobId [{}] completed successfully.", this.jobId);
            this.invokeHooks();
        }
    } catch (Throwable e) {
        LOG.error("Exception when job run", e);
        hasException = true;
        if (e instanceof OutOfMemoryError) {
            this.destroy();
            System.gc();
        }
        if (super.getContainerCommunicator() == null) {
            // 由于 containerCollector 是在 scheduler() 中初始化的,所以当在 scheduler() 之前出现异常时,需要在此处对 containerCollector 进行初始化
            AbstractContainerCommunicator tempContainerCollector;
            // standalone
            tempContainerCollector = new StandAloneJobContainerCommunicator(configuration);
            super.setContainerCommunicator(tempContainerCollector);
        }
        Communication communication = super.getContainerCommunicator().collect();
        // 汇报前的状态,不需要手动进行设置
        // communication.setState(State.FAILED);
        communication.setThrowable(e);
        communication.setTimestamp(this.endTimeStamp);
        Communication tempComm = new Communication();
        tempComm.setTimestamp(this.startTransferTimeStamp);
        Communication reportCommunication = CommunicationTool.getReportCommunication(communication, tempComm, this.totalStage);
        super.getContainerCommunicator().report(reportCommunication);
        throw DataXException.asDataXException(FrameworkErrorCode.RUNTIME_ERROR, e);
    } finally {
        if (!isDryRun) {
            this.destroy();
            this.endTimeStamp = System.currentTimeMillis();
            if (!hasException) {
                //最后打印cpu的平均消耗,GC的统计
                VMInfo vmInfo = VMInfo.getVmInfo();
                if (vmInfo != null) {
                    vmInfo.getDelta(false);
                    LOG.info(vmInfo.totalString());
                }
                LOG.info(PerfTrace.getInstance().summarizeNoException());
                this.logStatistics();
            }
        }
    }
}
Also used : StandAloneJobContainerCommunicator(com.alibaba.datax.core.statistics.container.communicator.job.StandAloneJobContainerCommunicator) VMInfo(com.alibaba.datax.common.statistics.VMInfo) AbstractContainerCommunicator(com.alibaba.datax.core.statistics.container.communicator.AbstractContainerCommunicator) Communication(com.alibaba.datax.core.statistics.communication.Communication)

Example 2 with VMInfo

use of com.alibaba.datax.common.statistics.VMInfo in project DataX by alibaba.

the class Engine method entry.

public static void entry(final String[] args) throws Throwable {
    Options options = new Options();
    options.addOption("job", true, "Job config.");
    options.addOption("jobid", true, "Job unique id.");
    options.addOption("mode", true, "Job runtime mode.");
    BasicParser parser = new BasicParser();
    CommandLine cl = parser.parse(options, args);
    String jobPath = cl.getOptionValue("job");
    // 如果用户没有明确指定jobid, 则 datax.py 会指定 jobid 默认值为-1
    String jobIdString = cl.getOptionValue("jobid");
    RUNTIME_MODE = cl.getOptionValue("mode");
    Configuration configuration = ConfigParser.parse(jobPath);
    long jobId;
    if (!"-1".equalsIgnoreCase(jobIdString)) {
        jobId = Long.parseLong(jobIdString);
    } else {
        // only for dsc & ds & datax 3 update
        String dscJobUrlPatternString = "/instance/(\\d{1,})/config.xml";
        String dsJobUrlPatternString = "/inner/job/(\\d{1,})/config";
        String dsTaskGroupUrlPatternString = "/inner/job/(\\d{1,})/taskGroup/";
        List<String> patternStringList = Arrays.asList(dscJobUrlPatternString, dsJobUrlPatternString, dsTaskGroupUrlPatternString);
        jobId = parseJobIdFromUrl(patternStringList, jobPath);
    }
    boolean isStandAloneMode = "standalone".equalsIgnoreCase(RUNTIME_MODE);
    if (!isStandAloneMode && jobId == -1) {
        // 如果不是 standalone 模式,那么 jobId 一定不能为-1
        throw DataXException.asDataXException(FrameworkErrorCode.CONFIG_ERROR, "非 standalone 模式必须在 URL 中提供有效的 jobId.");
    }
    configuration.set(CoreConstant.DATAX_CORE_CONTAINER_JOB_ID, jobId);
    //打印vmInfo
    VMInfo vmInfo = VMInfo.getVmInfo();
    if (vmInfo != null) {
        LOG.info(vmInfo.toString());
    }
    LOG.info("\n" + Engine.filterJobConfiguration(configuration) + "\n");
    LOG.debug(configuration.toJSON());
    ConfigurationValidate.doValidate(configuration);
    Engine engine = new Engine();
    engine.start(configuration);
}
Also used : Options(org.apache.commons.cli.Options) BasicParser(org.apache.commons.cli.BasicParser) CommandLine(org.apache.commons.cli.CommandLine) Configuration(com.alibaba.datax.common.util.Configuration) VMInfo(com.alibaba.datax.common.statistics.VMInfo)

Example 3 with VMInfo

use of com.alibaba.datax.common.statistics.VMInfo in project DataX by alibaba.

the class TaskGroupContainer method start.

@Override
public void start() {
    try {
        /**
             * 状态check时间间隔,较短,可以把任务及时分发到对应channel中
             */
        int sleepIntervalInMillSec = this.configuration.getInt(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_SLEEPINTERVAL, 100);
        /**
             * 状态汇报时间间隔,稍长,避免大量汇报
             */
        long reportIntervalInMillSec = this.configuration.getLong(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_REPORTINTERVAL, 10000);
        /**
             * 2分钟汇报一次性能统计
             */
        // 获取channel数目
        int channelNumber = this.configuration.getInt(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_CHANNEL);
        int taskMaxRetryTimes = this.configuration.getInt(CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_MAXRETRYTIMES, 1);
        long taskRetryIntervalInMsec = this.configuration.getLong(CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_RETRYINTERVALINMSEC, 10000);
        long taskMaxWaitInMsec = this.configuration.getLong(CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_MAXWAITINMSEC, 60000);
        List<Configuration> taskConfigs = this.configuration.getListConfiguration(CoreConstant.DATAX_JOB_CONTENT);
        if (LOG.isDebugEnabled()) {
            LOG.debug("taskGroup[{}]'s task configs[{}]", this.taskGroupId, JSON.toJSONString(taskConfigs));
        }
        int taskCountInThisTaskGroup = taskConfigs.size();
        LOG.info(String.format("taskGroupId=[%d] start [%d] channels for [%d] tasks.", this.taskGroupId, channelNumber, taskCountInThisTaskGroup));
        this.containerCommunicator.registerCommunication(taskConfigs);
        //taskId与task配置
        Map<Integer, Configuration> taskConfigMap = buildTaskConfigMap(taskConfigs);
        //待运行task列表
        List<Configuration> taskQueue = buildRemainTasks(taskConfigs);
        //taskId与上次失败实例
        Map<Integer, TaskExecutor> taskFailedExecutorMap = new HashMap<Integer, TaskExecutor>();
        //正在运行task
        List<TaskExecutor> runTasks = new ArrayList<TaskExecutor>(channelNumber);
        //任务开始时间
        Map<Integer, Long> taskStartTimeMap = new HashMap<Integer, Long>();
        long lastReportTimeStamp = 0;
        Communication lastTaskGroupContainerCommunication = new Communication();
        while (true) {
            //1.判断task状态
            boolean failedOrKilled = false;
            Map<Integer, Communication> communicationMap = containerCommunicator.getCommunicationMap();
            for (Map.Entry<Integer, Communication> entry : communicationMap.entrySet()) {
                Integer taskId = entry.getKey();
                Communication taskCommunication = entry.getValue();
                if (!taskCommunication.isFinished()) {
                    continue;
                }
                TaskExecutor taskExecutor = removeTask(runTasks, taskId);
                //上面从runTasks里移除了,因此对应在monitor里移除
                taskMonitor.removeTask(taskId);
                //失败,看task是否支持failover,重试次数未超过最大限制
                if (taskCommunication.getState() == State.FAILED) {
                    taskFailedExecutorMap.put(taskId, taskExecutor);
                    if (taskExecutor.supportFailOver() && taskExecutor.getAttemptCount() < taskMaxRetryTimes) {
                        //关闭老的executor
                        taskExecutor.shutdown();
                        //将task的状态重置
                        containerCommunicator.resetCommunication(taskId);
                        Configuration taskConfig = taskConfigMap.get(taskId);
                        //重新加入任务列表
                        taskQueue.add(taskConfig);
                    } else {
                        failedOrKilled = true;
                        break;
                    }
                } else if (taskCommunication.getState() == State.KILLED) {
                    failedOrKilled = true;
                    break;
                } else if (taskCommunication.getState() == State.SUCCEEDED) {
                    Long taskStartTime = taskStartTimeMap.get(taskId);
                    if (taskStartTime != null) {
                        Long usedTime = System.currentTimeMillis() - taskStartTime;
                        LOG.info("taskGroup[{}] taskId[{}] is successed, used[{}]ms", this.taskGroupId, taskId, usedTime);
                        //usedTime*1000*1000 转换成PerfRecord记录的ns,这里主要是简单登记,进行最长任务的打印。因此增加特定静态方法
                        PerfRecord.addPerfRecord(taskGroupId, taskId, PerfRecord.PHASE.TASK_TOTAL, taskStartTime, usedTime * 1000L * 1000L);
                        taskStartTimeMap.remove(taskId);
                        taskConfigMap.remove(taskId);
                    }
                }
            }
            // 2.发现该taskGroup下taskExecutor的总状态失败则汇报错误
            if (failedOrKilled) {
                lastTaskGroupContainerCommunication = reportTaskGroupCommunication(lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
                throw DataXException.asDataXException(FrameworkErrorCode.PLUGIN_RUNTIME_ERROR, lastTaskGroupContainerCommunication.getThrowable());
            }
            //3.有任务未执行,且正在运行的任务数小于最大通道限制
            Iterator<Configuration> iterator = taskQueue.iterator();
            while (iterator.hasNext() && runTasks.size() < channelNumber) {
                Configuration taskConfig = iterator.next();
                Integer taskId = taskConfig.getInt(CoreConstant.TASK_ID);
                int attemptCount = 1;
                TaskExecutor lastExecutor = taskFailedExecutorMap.get(taskId);
                if (lastExecutor != null) {
                    attemptCount = lastExecutor.getAttemptCount() + 1;
                    long now = System.currentTimeMillis();
                    long failedTime = lastExecutor.getTimeStamp();
                    if (now - failedTime < taskRetryIntervalInMsec) {
                        //未到等待时间,继续留在队列
                        continue;
                    }
                    if (!lastExecutor.isShutdown()) {
                        //上次失败的task仍未结束
                        if (now - failedTime > taskMaxWaitInMsec) {
                            markCommunicationFailed(taskId);
                            reportTaskGroupCommunication(lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
                            throw DataXException.asDataXException(CommonErrorCode.WAIT_TIME_EXCEED, "task failover等待超时");
                        } else {
                            //再次尝试关闭
                            lastExecutor.shutdown();
                            continue;
                        }
                    } else {
                        LOG.info("taskGroup[{}] taskId[{}] attemptCount[{}] has already shutdown", this.taskGroupId, taskId, lastExecutor.getAttemptCount());
                    }
                }
                Configuration taskConfigForRun = taskMaxRetryTimes > 1 ? taskConfig.clone() : taskConfig;
                TaskExecutor taskExecutor = new TaskExecutor(taskConfigForRun, attemptCount);
                taskStartTimeMap.put(taskId, System.currentTimeMillis());
                taskExecutor.doStart();
                iterator.remove();
                runTasks.add(taskExecutor);
                //上面,增加task到runTasks列表,因此在monitor里注册。
                taskMonitor.registerTask(taskId, this.containerCommunicator.getCommunication(taskId));
                taskFailedExecutorMap.remove(taskId);
                LOG.info("taskGroup[{}] taskId[{}] attemptCount[{}] is started", this.taskGroupId, taskId, attemptCount);
            }
            //4.任务列表为空,executor已结束, 搜集状态为success--->成功
            if (taskQueue.isEmpty() && isAllTaskDone(runTasks) && containerCommunicator.collectState() == State.SUCCEEDED) {
                // 成功的情况下,也需要汇报一次。否则在任务结束非常快的情况下,采集的信息将会不准确
                lastTaskGroupContainerCommunication = reportTaskGroupCommunication(lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
                LOG.info("taskGroup[{}] completed it's tasks.", this.taskGroupId);
                break;
            }
            // 5.如果当前时间已经超出汇报时间的interval,那么我们需要马上汇报
            long now = System.currentTimeMillis();
            if (now - lastReportTimeStamp > reportIntervalInMillSec) {
                lastTaskGroupContainerCommunication = reportTaskGroupCommunication(lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
                lastReportTimeStamp = now;
                //taskMonitor对于正在运行的task,每reportIntervalInMillSec进行检查
                for (TaskExecutor taskExecutor : runTasks) {
                    taskMonitor.report(taskExecutor.getTaskId(), this.containerCommunicator.getCommunication(taskExecutor.getTaskId()));
                }
            }
            Thread.sleep(sleepIntervalInMillSec);
        }
        //6.最后还要汇报一次
        reportTaskGroupCommunication(lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
    } catch (Throwable e) {
        Communication nowTaskGroupContainerCommunication = this.containerCommunicator.collect();
        if (nowTaskGroupContainerCommunication.getThrowable() == null) {
            nowTaskGroupContainerCommunication.setThrowable(e);
        }
        nowTaskGroupContainerCommunication.setState(State.FAILED);
        this.containerCommunicator.report(nowTaskGroupContainerCommunication);
        throw DataXException.asDataXException(FrameworkErrorCode.RUNTIME_ERROR, e);
    } finally {
        if (!PerfTrace.getInstance().isJob()) {
            //最后打印cpu的平均消耗,GC的统计
            VMInfo vmInfo = VMInfo.getVmInfo();
            if (vmInfo != null) {
                vmInfo.getDelta(false);
                LOG.info(vmInfo.totalString());
            }
            LOG.info(PerfTrace.getInstance().summarizeNoException());
        }
    }
}
Also used : Configuration(com.alibaba.datax.common.util.Configuration) VMInfo(com.alibaba.datax.common.statistics.VMInfo) Communication(com.alibaba.datax.core.statistics.communication.Communication)

Aggregations

VMInfo (com.alibaba.datax.common.statistics.VMInfo)3 Configuration (com.alibaba.datax.common.util.Configuration)2 Communication (com.alibaba.datax.core.statistics.communication.Communication)2 AbstractContainerCommunicator (com.alibaba.datax.core.statistics.container.communicator.AbstractContainerCommunicator)1 StandAloneJobContainerCommunicator (com.alibaba.datax.core.statistics.container.communicator.job.StandAloneJobContainerCommunicator)1 BasicParser (org.apache.commons.cli.BasicParser)1 CommandLine (org.apache.commons.cli.CommandLine)1 Options (org.apache.commons.cli.Options)1