Search in sources :

Example 6 with Configuration

use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.

the class JobContainer method schedule.

/**
 * Packs the reader/writer split results into taskGroup configurations and hands them to the
 * scheduler that matches the execution mode, which then runs all tasks.
 *
 * <p>Only {@code ExecuteMode.STANDALONE} is selected by this method. The transfer start and end
 * timestamps are recorded around the scheduler call; the end timestamp is also recorded when
 * scheduling fails.
 *
 * @throws DataXException with {@code FrameworkErrorCode.RUNTIME_ERROR} wrapping any exception
 *         raised while initialising or running the scheduler
 */
private void schedule() {
    // The global speed and the per-channel speed are both configured in B/s.
    int channelsPerTaskGroup = this.configuration.getInt(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_CHANNEL, 5);
    int taskNumber = this.configuration.getList(CoreConstant.DATAX_JOB_CONTENT).size();
    // Never ask for more channels than there are tasks to run.
    this.needChannelNumber = Math.min(this.needChannelNumber, taskNumber);
    PerfTrace.getInstance().setChannelNumber(needChannelNumber);

    // Work out from the configuration which tasks each taskGroup has to run.
    List<Configuration> taskGroupConfigs = JobAssignUtil.assignFairly(this.configuration, this.needChannelNumber, channelsPerTaskGroup);
    LOG.info("Scheduler starts [{}] taskGroups.", taskGroupConfigs.size());

    // The mode is fixed here, so it is never null when the catch block logs it.
    final ExecuteMode executeMode = ExecuteMode.STANDALONE;
    try {
        AbstractScheduler scheduler = initStandaloneScheduler(this.configuration);
        // Stamp the execute mode on every taskGroup configuration.
        for (Configuration taskGroupConfig : taskGroupConfigs) {
            taskGroupConfig.set(CoreConstant.DATAX_CORE_CONTAINER_JOB_MODE, executeMode.getValue());
        }
        // NOTE(review): with the mode fixed to STANDALONE this guard cannot fire; it is kept so the
        // jobId requirement stays in place should LOCAL/DISTRIBUTE be selectable again.
        if (executeMode == ExecuteMode.LOCAL || executeMode == ExecuteMode.DISTRIBUTE) {
            if (this.jobId <= 0) {
                throw DataXException.asDataXException(FrameworkErrorCode.RUNTIME_ERROR, "在[ local | distribute ]模式下必须设置jobId,并且其值 > 0 .");
            }
        }
        LOG.info("Running by {} Mode.", executeMode);
        this.startTransferTimeStamp = System.currentTimeMillis();
        scheduler.schedule(taskGroupConfigs);
        this.endTransferTimeStamp = System.currentTimeMillis();
    } catch (Exception e) {
        // Pass the exception as the trailing argument so the log entry carries the cause and
        // stack trace (assumes LOG is an SLF4J-style logger — TODO confirm).
        LOG.error("运行scheduler 模式[{}]出错.", executeMode, e);
        this.endTransferTimeStamp = System.currentTimeMillis();
        throw DataXException.asDataXException(FrameworkErrorCode.RUNTIME_ERROR, e);
    }

    // Check how the job ran against its configured limits.
    this.checkLimit();
}
Also used : Configuration(com.alibaba.datax.common.util.Configuration) ExecuteMode(com.alibaba.datax.dataxservice.face.domain.enums.ExecuteMode) AbstractScheduler(com.alibaba.datax.core.job.scheduler.AbstractScheduler) DataXException(com.alibaba.datax.common.exception.DataXException)

Example 7 with Configuration

use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.

the class JobContainer method distributeTasksToTaskGroup.

/**
 * Distributes the job's task configurations over taskGroups.
 * <p/>
 * Conceptually this is a two-step merge (1. tasks into channels, 2. channels into taskGroups).
 * Taken together it simply packs tasks into taskGroups while honouring the computed number of
 * channels and never starting more channels than that.
 * <p/>
 * Example:
 * <p/>
 * Premise: after splitting there are 1024 table shards, the user asks for a total rate of
 * 1000M/s, each channel runs at 3M/s, and every taskGroup runs 7 channels.
 * <p/>
 * Calculation: total channels = 1000M/s / 3M/s = 333. To spread tasks evenly, 308 channels get
 * 3 tasks each and 25 channels get 4 tasks each. taskGroups needed = 333 / 7 = 47 remainder 4,
 * i.e. 48 taskGroups: 47 of them run 7 channels each and 1 runs the remaining 4 channels.
 * <p/>
 * Processing: the taskGroup with the 4 leftover channels is handled first: it receives
 * 4 channels' worth of tasks at the average of 3 tasks per channel and gets taskGroupId 0.
 * The remaining tasks are then dealt round-robin, like dealing cards, to the taskGroups that
 * hold the average number of channels.
 * <p/>
 * TODO delete it
 *
 * @param averTaskPerChannel average number of tasks per channel; must be positive
 * @param channelNumber total number of channels; must be positive
 * @param channelsPerTaskGroup number of channels per taskGroup; must be positive
 * @return one independent, complete configuration per taskGroup
 */
// "serial": the single-taskGroup branch returns an anonymous ArrayList subclass.
@SuppressWarnings("serial")
private List<Configuration> distributeTasksToTaskGroup(int averTaskPerChannel, int channelNumber, int channelsPerTaskGroup) {
    Validate.isTrue(averTaskPerChannel > 0 && channelNumber > 0 && channelsPerTaskGroup > 0, "每个channel的平均task数[averTaskPerChannel],channel数目[channelNumber],每个taskGroup的平均channel数[channelsPerTaskGroup]都应该为正数");
    List<Configuration> taskConfigs = this.configuration.getListConfiguration(CoreConstant.DATAX_JOB_CONTENT);
    // Ceiling division: a non-zero remainder needs one extra (smaller) taskGroup.
    int taskGroupNumber = channelNumber / channelsPerTaskGroup;
    int leftChannelNumber = channelNumber % channelsPerTaskGroup;
    if (leftChannelNumber > 0) {
        taskGroupNumber += 1;
    }
    /**
     * With a single taskGroup, just tag it and return it.
     */
    if (taskGroupNumber == 1) {
        final Configuration taskGroupConfig = this.configuration.clone();
        /**
         * The original note here is truncated ("Configuration's clone cannot clone ...");
         * presumably clone() does not carry the job content over, which is why the content
         * list is set again explicitly — TODO confirm.
         */
        taskGroupConfig.set(CoreConstant.DATAX_JOB_CONTENT, this.configuration.getListConfiguration(CoreConstant.DATAX_JOB_CONTENT));
        taskGroupConfig.set(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_CHANNEL, channelNumber);
        taskGroupConfig.set(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_ID, 0);
        return new ArrayList<Configuration>() {

            {
                add(taskGroupConfig);
            }
        };
    }
    List<Configuration> taskGroupConfigs = new ArrayList<Configuration>();
    /**
     * Create one configuration per taskGroup, each starting with an empty content list.
     */
    for (int i = 0; i < taskGroupNumber; i++) {
        Configuration taskGroupConfig = this.configuration.clone();
        List<Configuration> taskGroupJobContent = taskGroupConfig.getListConfiguration(CoreConstant.DATAX_JOB_CONTENT);
        taskGroupJobContent.clear();
        taskGroupConfig.set(CoreConstant.DATAX_JOB_CONTENT, taskGroupJobContent);
        taskGroupConfigs.add(taskGroupConfig);
    }
    int taskConfigIndex = 0;
    int channelIndex = 0;
    int taskGroupConfigIndex = 0;
    /**
     * First handle the taskGroup whose channel count is not the average: taskGroup 0 receives
     * leftChannelNumber * averTaskPerChannel tasks, taken in order from the task list.
     */
    if (leftChannelNumber > 0) {
        Configuration taskGroupConfig = taskGroupConfigs.get(taskGroupConfigIndex);
        for (; channelIndex < leftChannelNumber; channelIndex++) {
            for (int i = 0; i < averTaskPerChannel; i++) {
                // Read the content list, append the next task, and write the list back.
                List<Configuration> taskGroupJobContent = taskGroupConfig.getListConfiguration(CoreConstant.DATAX_JOB_CONTENT);
                taskGroupJobContent.add(taskConfigs.get(taskConfigIndex++));
                taskGroupConfig.set(CoreConstant.DATAX_JOB_CONTENT, taskGroupJobContent);
            }
        }
        taskGroupConfig.set(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_CHANNEL, leftChannelNumber);
        // Post-increment: this group takes id 0 and the round-robin below starts at index 1.
        taskGroupConfig.set(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_ID, taskGroupConfigIndex++);
    }
    /**
     * Deal the remaining tasks round-robin to the remaining taskGroups, then stamp each of
     * those groups with its channel count and taskGroupId.
     */
    int equalDivisionStartIndex = taskGroupConfigIndex;
    // Outer loop: keep making passes over the groups until every task has been handed out.
    for (; taskConfigIndex < taskConfigs.size() && equalDivisionStartIndex < taskGroupConfigs.size(); ) {
        // Inner loop: one pass, giving each remaining group one task while tasks are left.
        for (taskGroupConfigIndex = equalDivisionStartIndex; taskGroupConfigIndex < taskGroupConfigs.size() && taskConfigIndex < taskConfigs.size(); taskGroupConfigIndex++) {
            Configuration taskGroupConfig = taskGroupConfigs.get(taskGroupConfigIndex);
            List<Configuration> taskGroupJobContent = taskGroupConfig.getListConfiguration(CoreConstant.DATAX_JOB_CONTENT);
            taskGroupJobContent.add(taskConfigs.get(taskConfigIndex++));
            taskGroupConfig.set(CoreConstant.DATAX_JOB_CONTENT, taskGroupJobContent);
        }
    }
    // Tag every evenly-divided group; the index advances via the post-increment in the id assignment.
    for (taskGroupConfigIndex = equalDivisionStartIndex; taskGroupConfigIndex < taskGroupConfigs.size(); ) {
        Configuration taskGroupConfig = taskGroupConfigs.get(taskGroupConfigIndex);
        taskGroupConfig.set(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_CHANNEL, channelsPerTaskGroup);
        taskGroupConfig.set(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_ID, taskGroupConfigIndex++);
    }
    return taskGroupConfigs;
}
Also used : Configuration(com.alibaba.datax.common.util.Configuration) ArrayList(java.util.ArrayList)

Example 8 with Configuration

use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.

the class Engine method entry.

/**
 * Command-line entry point: parses the {@code job}, {@code jobid} and {@code mode} options,
 * loads the job configuration, resolves the job id, and starts an {@code Engine}.
 *
 * <p>The job id is taken from {@code -jobid} when it is present and not {@code -1}; otherwise it
 * is extracted from the job path via {@code parseJobIdFromUrl}. A missing {@code -jobid} option
 * is treated the same as the {@code -1} default instead of failing in {@code Long.parseLong}.
 *
 * @param args raw command-line arguments
 * @throws DataXException with {@code FrameworkErrorCode.CONFIG_ERROR} when the mode is not
 *         standalone and the resolved job id is -1
 * @throws NumberFormatException when {@code -jobid} is given but is not a valid long
 * @throws Throwable anything else raised while parsing, validating or running the job
 */
public static void entry(final String[] args) throws Throwable {
    Options options = new Options();
    options.addOption("job", true, "Job config.");
    options.addOption("jobid", true, "Job unique id.");
    options.addOption("mode", true, "Job runtime mode.");
    BasicParser parser = new BasicParser();
    CommandLine cl = parser.parse(options, args);
    String jobPath = cl.getOptionValue("job");
    // If the user did not specify a jobid, datax.py passes the default value -1.
    // When Engine is launched without datax.py the option may be absent, leaving this null.
    String jobIdString = cl.getOptionValue("jobid");
    RUNTIME_MODE = cl.getOptionValue("mode");
    Configuration configuration = ConfigParser.parse(jobPath);
    long jobId;
    if (jobIdString != null && !"-1".equalsIgnoreCase(jobIdString)) {
        jobId = Long.parseLong(jobIdString);
    } else {
        // only for dsc & ds & datax 3 update
        String dscJobUrlPatternString = "/instance/(\\d{1,})/config.xml";
        String dsJobUrlPatternString = "/inner/job/(\\d{1,})/config";
        String dsTaskGroupUrlPatternString = "/inner/job/(\\d{1,})/taskGroup/";
        List<String> patternStringList = Arrays.asList(dscJobUrlPatternString, dsJobUrlPatternString, dsTaskGroupUrlPatternString);
        jobId = parseJobIdFromUrl(patternStringList, jobPath);
    }
    boolean isStandAloneMode = "standalone".equalsIgnoreCase(RUNTIME_MODE);
    if (!isStandAloneMode && jobId == -1) {
        // Outside standalone mode the job id must not be -1.
        throw DataXException.asDataXException(FrameworkErrorCode.CONFIG_ERROR, "非 standalone 模式必须在 URL 中提供有效的 jobId.");
    }
    configuration.set(CoreConstant.DATAX_CORE_CONTAINER_JOB_ID, jobId);
    // Print the VM info when it is available.
    VMInfo vmInfo = VMInfo.getVmInfo();
    if (vmInfo != null) {
        LOG.info(vmInfo.toString());
    }
    LOG.info("\n" + Engine.filterJobConfiguration(configuration) + "\n");
    LOG.debug(configuration.toJSON());
    ConfigurationValidate.doValidate(configuration);
    Engine engine = new Engine();
    engine.start(configuration);
}
Also used : Options(org.apache.commons.cli.Options) BasicParser(org.apache.commons.cli.BasicParser) CommandLine(org.apache.commons.cli.CommandLine) Configuration(com.alibaba.datax.common.util.Configuration) VMInfo(com.alibaba.datax.common.statistics.VMInfo)

Example 9 with Configuration

use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.

the class JobAssignUtil method parseAndGetResourceMarkAndTaskIdMap.

/**
 * Builds the mapping "resource mark --> list of taskIds" from the task configurations.
 *
 * <p>Two mappings are collected in task order, one keyed by the reader's resource mark and one
 * keyed by the writer's. The one with more distinct marks is returned; on a tie the reader
 * mapping wins.
 *
 * @param contentConfig the task configurations to group
 * @return the reader-keyed or writer-keyed mapping, whichever has more entries
 */
private static LinkedHashMap<String, List<Integer>> parseAndGetResourceMarkAndTaskIdMap(List<Configuration> contentConfig) {
    final String readerMarkPath = CoreConstant.JOB_READER_PARAMETER + "." + CommonConstant.LOAD_BALANCE_RESOURCE_MARK;
    final String writerMarkPath = CoreConstant.JOB_WRITER_PARAMETER + "." + CommonConstant.LOAD_BALANCE_RESOURCE_MARK;

    // key: resourceMark, value: taskIds carrying that mark
    LinkedHashMap<String, List<Integer>> byReaderMark = new LinkedHashMap<String, List<Integer>>();
    LinkedHashMap<String, List<Integer>> byWriterMark = new LinkedHashMap<String, List<Integer>>();

    for (Configuration task : contentConfig) {
        int taskId = task.getInt(CoreConstant.TASK_ID);

        // Register the task under its reader resource mark.
        String readerMark = task.getString(readerMarkPath);
        List<Integer> readerTaskIds = byReaderMark.get(readerMark);
        if (readerTaskIds == null) {
            readerTaskIds = new LinkedList<Integer>();
            byReaderMark.put(readerMark, readerTaskIds);
        }
        readerTaskIds.add(taskId);

        // Register the task under its writer resource mark.
        String writerMark = task.getString(writerMarkPath);
        List<Integer> writerTaskIds = byWriterMark.get(writerMark);
        if (writerTaskIds == null) {
            writerTaskIds = new LinkedList<Integer>();
            byWriterMark.put(writerMark, writerTaskIds);
        }
        writerTaskIds.add(taskId);
    }

    // Shuffle by whichever side marked more distinct resources; the reader side wins ties.
    return byReaderMark.size() >= byWriterMark.size() ? byReaderMark : byWriterMark;
}
Also used : Configuration(com.alibaba.datax.common.util.Configuration)

Example 10 with Configuration

use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.

the class ProcessInnerScheduler method startAllTaskGroup.

/**
 * Starts one runner per taskGroup configuration on a fixed-size thread pool that has exactly
 * one thread per configuration, then shuts the pool down so it accepts no further work.
 *
 * @param configurations the taskGroup configurations to run
 */
@Override
public void startAllTaskGroup(List<Configuration> configurations) {
    final int taskGroupCount = configurations.size();
    this.taskGroupContainerExecutorService = Executors.newFixedThreadPool(taskGroupCount);

    for (Configuration groupConfig : configurations) {
        this.taskGroupContainerExecutorService.execute(newTaskGroupContainerRunner(groupConfig));
    }

    // Already submitted runners keep running; only new submissions are rejected.
    this.taskGroupContainerExecutorService.shutdown();
}
Also used : TaskGroupContainerRunner(com.alibaba.datax.core.taskgroup.runner.TaskGroupContainerRunner) Configuration(com.alibaba.datax.common.util.Configuration)

Aggregations

Configuration (com.alibaba.datax.common.util.Configuration)82 ArrayList (java.util.ArrayList)27 Test (org.junit.Test)19 Communication (com.alibaba.datax.core.statistics.communication.Communication)13 DataXException (com.alibaba.datax.common.exception.DataXException)9 Method (java.lang.reflect.Method)8 Record (com.alibaba.datax.common.element.Record)7 JobContainer (com.alibaba.datax.core.job.JobContainer)6 IOException (java.io.IOException)5 HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration)5 LongColumn (com.alibaba.datax.common.element.LongColumn)4 TaskPluginCollector (com.alibaba.datax.common.plugin.TaskPluginCollector)4 TaskGroupContainer (com.alibaba.datax.core.taskgroup.TaskGroupContainer)4 Channel (com.alibaba.datax.core.transport.channel.Channel)4 MemoryChannel (com.alibaba.datax.core.transport.channel.memory.MemoryChannel)4 DefaultRecord (com.alibaba.datax.core.transport.record.DefaultRecord)4 File (java.io.File)4 HashSet (java.util.HashSet)3 List (java.util.List)3 VMInfo (com.alibaba.datax.common.statistics.VMInfo)2