Use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.
The class JobContainer, method schedule.
/**
 * schedule first merges the reader/writer split results from the previous step into concrete
 * taskGroupContainers; different execute modes then invoke different scheduling strategies
 * to schedule all tasks.
 */
private void schedule() {
    /**
     * The global speed and the per-channel speed here are configured in B/s.
     */
    int channelsPerTaskGroup = this.configuration.getInt(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_CHANNEL, 5);
    int taskNumber = this.configuration.getList(CoreConstant.DATAX_JOB_CONTENT).size();
    this.needChannelNumber = Math.min(this.needChannelNumber, taskNumber);
    PerfTrace.getInstance().setChannelNumber(needChannelNumber);
    /**
     * Derive from the configuration which tasks each taskGroup needs to run.
     */
    List<Configuration> taskGroupConfigs = JobAssignUtil.assignFairly(this.configuration, this.needChannelNumber, channelsPerTaskGroup);
    LOG.info("Scheduler starts [{}] taskGroups.", taskGroupConfigs.size());
    ExecuteMode executeMode = null;
    AbstractScheduler scheduler;
    try {
        executeMode = ExecuteMode.STANDALONE;
        scheduler = initStandaloneScheduler(this.configuration);
        // set executeMode
        for (Configuration taskGroupConfig : taskGroupConfigs) {
            taskGroupConfig.set(CoreConstant.DATAX_CORE_CONTAINER_JOB_MODE, executeMode.getValue());
        }
        if (executeMode == ExecuteMode.LOCAL || executeMode == ExecuteMode.DISTRIBUTE) {
            if (this.jobId <= 0) {
                throw DataXException.asDataXException(FrameworkErrorCode.RUNTIME_ERROR, "In [ local | distribute ] mode, jobId must be set and its value must be > 0.");
            }
        }
        LOG.info("Running by {} Mode.", executeMode);
        this.startTransferTimeStamp = System.currentTimeMillis();
        scheduler.schedule(taskGroupConfigs);
        this.endTransferTimeStamp = System.currentTimeMillis();
    } catch (Exception e) {
        LOG.error("Scheduler failed in mode [{}].", executeMode);
        this.endTransferTimeStamp = System.currentTimeMillis();
        throw DataXException.asDataXException(FrameworkErrorCode.RUNTIME_ERROR, e);
    }
    /**
     * Check how the tasks executed.
     */
    this.checkLimit();
}
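For intuition, here is a minimal standalone sketch of the arithmetic implied above: the channel count is capped by the number of tasks, and the number of taskGroups is the ceiling of channels over channelsPerTaskGroup. The class and method names below are illustrative only, not part of the DataX API.

// Illustrative only: mirrors the capping and grouping arithmetic, not DataX's actual classes.
public final class ScheduleMath {

    /** Channels never exceed the number of tasks (the Math.min in schedule()). */
    static int cappedChannels(int needChannelNumber, int taskNumber) {
        return Math.min(needChannelNumber, taskNumber);
    }

    /** Number of taskGroups = ceil(channelNumber / channelsPerTaskGroup). */
    static int taskGroupCount(int channelNumber, int channelsPerTaskGroup) {
        return (channelNumber + channelsPerTaskGroup - 1) / channelsPerTaskGroup;
    }

    public static void main(String[] args) {
        // e.g. 100 requested channels but only 20 tasks -> 20 channels, 4 taskGroups of 5 channels each
        int channels = cappedChannels(100, 20);
        System.out.println(channels + " channels, " + taskGroupCount(channels, 5) + " taskGroups");
    }
}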
Use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.
The class JobContainer, method distributeTasksToTaskGroup.
/**
 * This part is relatively involved and merges in two steps: 1. tasks into channels, 2. channels into taskGroups.
 * Taken together it simply packs tasks into taskGroups, using exactly the computed number of channels
 * and never starting more channels than that.
 * <p/>
 * example:
 * <p/>
 * Preconditions: splitting produced 1024 table shards; the user asks for a total rate of 1000M/s
 * at 3M/s per channel, and each taskGroup runs 7 channels.
 * <p/>
 * Calculation: total channels = 1000M/s / 3M/s = 333. Distributing evenly, 308 channels get
 * 3 tasks each and 25 channels get 4 tasks each. Required taskGroups = 333 / 7 = 47 remainder 4,
 * i.e. 48 taskGroups: 47 of them run 7 channels each and one runs the remaining 4 channels.
 * <p/>
 * Handling: we first deal with the taskGroup that owns those 4 leftover channels: take 4 channels
 * at the average of 3 tasks each and give them taskGroupId 0; then, like dealing cards, assign
 * the remaining tasks round-robin to the taskGroups that hold the average channel count.
 * <p/>
 * TODO delete it
 *
 * @param averTaskPerChannel
 * @param channelNumber
 * @param channelsPerTaskGroup
 * @return the complete, independent configuration for each taskGroup
 */
@SuppressWarnings("serial")
private List<Configuration> distributeTasksToTaskGroup(int averTaskPerChannel, int channelNumber, int channelsPerTaskGroup) {
    Validate.isTrue(averTaskPerChannel > 0 && channelNumber > 0 && channelsPerTaskGroup > 0,
            "average tasks per channel [averTaskPerChannel], channel count [channelNumber] and average channels per taskGroup [channelsPerTaskGroup] must all be positive");
    List<Configuration> taskConfigs = this.configuration.getListConfiguration(CoreConstant.DATAX_JOB_CONTENT);
    int taskGroupNumber = channelNumber / channelsPerTaskGroup;
    int leftChannelNumber = channelNumber % channelsPerTaskGroup;
    if (leftChannelNumber > 0) {
        taskGroupNumber += 1;
    }
    /**
     * If there is only one taskGroup, tag it and return directly.
     */
    if (taskGroupNumber == 1) {
        final Configuration taskGroupConfig = this.configuration.clone();
        /**
         * Configuration's clone does not carry the job content over, so set it explicitly.
         */
        taskGroupConfig.set(CoreConstant.DATAX_JOB_CONTENT, this.configuration.getListConfiguration(CoreConstant.DATAX_JOB_CONTENT));
        taskGroupConfig.set(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_CHANNEL, channelNumber);
        taskGroupConfig.set(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_ID, 0);
        return new ArrayList<Configuration>() {
            {
                add(taskGroupConfig);
            }
        };
    }
    List<Configuration> taskGroupConfigs = new ArrayList<Configuration>();
    /**
     * Clear the content configuration of every taskGroup.
     */
    for (int i = 0; i < taskGroupNumber; i++) {
        Configuration taskGroupConfig = this.configuration.clone();
        List<Configuration> taskGroupJobContent = taskGroupConfig.getListConfiguration(CoreConstant.DATAX_JOB_CONTENT);
        taskGroupJobContent.clear();
        taskGroupConfig.set(CoreConstant.DATAX_JOB_CONTENT, taskGroupJobContent);
        taskGroupConfigs.add(taskGroupConfig);
    }
    int taskConfigIndex = 0;
    int channelIndex = 0;
    int taskGroupConfigIndex = 0;
    /**
     * First handle the taskGroup whose channel count differs from the average.
     */
    if (leftChannelNumber > 0) {
        Configuration taskGroupConfig = taskGroupConfigs.get(taskGroupConfigIndex);
        for (; channelIndex < leftChannelNumber; channelIndex++) {
            for (int i = 0; i < averTaskPerChannel; i++) {
                List<Configuration> taskGroupJobContent = taskGroupConfig.getListConfiguration(CoreConstant.DATAX_JOB_CONTENT);
                taskGroupJobContent.add(taskConfigs.get(taskConfigIndex++));
                taskGroupConfig.set(CoreConstant.DATAX_JOB_CONTENT, taskGroupJobContent);
            }
        }
        taskGroupConfig.set(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_CHANNEL, leftChannelNumber);
        taskGroupConfig.set(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_ID, taskGroupConfigIndex++);
    }
    /**
     * Assign the rest round-robin, tagging each taskGroup with its channel count and taskGroupId.
     */
    int equalDivisionStartIndex = taskGroupConfigIndex;
    for (; taskConfigIndex < taskConfigs.size() && equalDivisionStartIndex < taskGroupConfigs.size(); ) {
        for (taskGroupConfigIndex = equalDivisionStartIndex; taskGroupConfigIndex < taskGroupConfigs.size() && taskConfigIndex < taskConfigs.size(); taskGroupConfigIndex++) {
            Configuration taskGroupConfig = taskGroupConfigs.get(taskGroupConfigIndex);
            List<Configuration> taskGroupJobContent = taskGroupConfig.getListConfiguration(CoreConstant.DATAX_JOB_CONTENT);
            taskGroupJobContent.add(taskConfigs.get(taskConfigIndex++));
            taskGroupConfig.set(CoreConstant.DATAX_JOB_CONTENT, taskGroupJobContent);
        }
    }
    for (taskGroupConfigIndex = equalDivisionStartIndex; taskGroupConfigIndex < taskGroupConfigs.size(); ) {
        Configuration taskGroupConfig = taskGroupConfigs.get(taskGroupConfigIndex);
        taskGroupConfig.set(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_CHANNEL, channelsPerTaskGroup);
        taskGroupConfig.set(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_ID, taskGroupConfigIndex++);
    }
    return taskGroupConfigs;
}
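As a sanity check on the worked example in the javadoc, the following self-contained sketch reproduces the same arithmetic. The class and variable names are illustrative only, not DataX code.

// Standalone check of the javadoc example: 1024 tasks, 1000M/s total at 3M/s per channel, 7 channels per taskGroup.
public final class DistributionExample {
    public static void main(String[] args) {
        int tasks = 1024;
        int channels = 1000 / 3;                                    // 333 channels
        int channelsWithExtraTask = tasks % channels;               // 25 channels carry 4 tasks
        int channelsWithAvgTask = channels - channelsWithExtraTask; // 308 channels carry 3 tasks
        int channelsPerTaskGroup = 7;
        int fullTaskGroups = channels / channelsPerTaskGroup;       // 47 taskGroups with 7 channels each
        int leftoverChannels = channels % channelsPerTaskGroup;     // 4 channels in one extra taskGroup
        int taskGroups = fullTaskGroups + (leftoverChannels > 0 ? 1 : 0); // 48 in total

        System.out.printf("%d channels: %d x 3 tasks, %d x 4 tasks%n",
                channels, channelsWithAvgTask, channelsWithExtraTask);
        System.out.printf("%d taskGroups: %d with %d channels, 1 with %d channels%n",
                taskGroups, fullTaskGroups, channelsPerTaskGroup, leftoverChannels);
    }
}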
Use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.
The class Engine, method entry.
public static void entry(final String[] args) throws Throwable {
    Options options = new Options();
    options.addOption("job", true, "Job config.");
    options.addOption("jobid", true, "Job unique id.");
    options.addOption("mode", true, "Job runtime mode.");
    BasicParser parser = new BasicParser();
    CommandLine cl = parser.parse(options, args);
    String jobPath = cl.getOptionValue("job");
    // If the user did not specify a jobid explicitly, datax.py passes a default jobid of -1.
    String jobIdString = cl.getOptionValue("jobid");
    RUNTIME_MODE = cl.getOptionValue("mode");
    Configuration configuration = ConfigParser.parse(jobPath);
    long jobId;
    if (!"-1".equalsIgnoreCase(jobIdString)) {
        jobId = Long.parseLong(jobIdString);
    } else {
        // only for dsc & ds & datax 3 update
        String dscJobUrlPatternString = "/instance/(\\d{1,})/config.xml";
        String dsJobUrlPatternString = "/inner/job/(\\d{1,})/config";
        String dsTaskGroupUrlPatternString = "/inner/job/(\\d{1,})/taskGroup/";
        List<String> patternStringList = Arrays.asList(dscJobUrlPatternString, dsJobUrlPatternString, dsTaskGroupUrlPatternString);
        jobId = parseJobIdFromUrl(patternStringList, jobPath);
    }
    boolean isStandAloneMode = "standalone".equalsIgnoreCase(RUNTIME_MODE);
    if (!isStandAloneMode && jobId == -1) {
        // Outside standalone mode the jobId must never be -1.
        throw DataXException.asDataXException(FrameworkErrorCode.CONFIG_ERROR, "A valid jobId must be provided in the URL when not running in standalone mode.");
    }
    configuration.set(CoreConstant.DATAX_CORE_CONTAINER_JOB_ID, jobId);
    // print vmInfo
    VMInfo vmInfo = VMInfo.getVmInfo();
    if (vmInfo != null) {
        LOG.info(vmInfo.toString());
    }
    LOG.info("\n" + Engine.filterJobConfiguration(configuration) + "\n");
    LOG.debug(configuration.toJSON());
    ConfigurationValidate.doValidate(configuration);
    Engine engine = new Engine();
    engine.start(configuration);
}
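parseJobIdFromUrl is not shown in this snippet. Below is a minimal sketch of how such a helper could match the job path against the regex patterns above; the class name, behavior, and -1 fallback here are assumptions for illustration, not necessarily the DataX implementation.

import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Hypothetical stand-in for Engine.parseJobIdFromUrl; returns -1 if no pattern matches.
final class JobIdFromUrl {
    static long parse(List<String> patternStrings, String url) {
        if (url == null) {
            return -1;
        }
        for (String patternString : patternStrings) {
            Matcher matcher = Pattern.compile(patternString).matcher(url);
            if (matcher.find()) {
                return Long.parseLong(matcher.group(1));
            }
        }
        return -1;
    }

    public static void main(String[] args) {
        List<String> patterns = Arrays.asList("/inner/job/(\\d{1,})/config");
        System.out.println(parse(patterns, "http://example.com/inner/job/12345/config")); // prints 12345
    }
}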
Use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.
The class JobAssignUtil, method parseAndGetResourceMarkAndTaskIdMap.
/**
 * From the task configurations, build the mapping
 * resource name --> List of taskIds.
 */
private static LinkedHashMap<String, List<Integer>> parseAndGetResourceMarkAndTaskIdMap(List<Configuration> contentConfig) {
    // key: resourceMark, value: taskId
    LinkedHashMap<String, List<Integer>> readerResourceMarkAndTaskIdMap = new LinkedHashMap<String, List<Integer>>();
    LinkedHashMap<String, List<Integer>> writerResourceMarkAndTaskIdMap = new LinkedHashMap<String, List<Integer>>();
    for (Configuration aTaskConfig : contentConfig) {
        int taskId = aTaskConfig.getInt(CoreConstant.TASK_ID);
        // add readerResourceMark to readerResourceMarkAndTaskIdMap
        String readerResourceMark = aTaskConfig.getString(CoreConstant.JOB_READER_PARAMETER + "." + CommonConstant.LOAD_BALANCE_RESOURCE_MARK);
        if (readerResourceMarkAndTaskIdMap.get(readerResourceMark) == null) {
            readerResourceMarkAndTaskIdMap.put(readerResourceMark, new LinkedList<Integer>());
        }
        readerResourceMarkAndTaskIdMap.get(readerResourceMark).add(taskId);
        // add writerResourceMark to writerResourceMarkAndTaskIdMap
        String writerResourceMark = aTaskConfig.getString(CoreConstant.JOB_WRITER_PARAMETER + "." + CommonConstant.LOAD_BALANCE_RESOURCE_MARK);
        if (writerResourceMarkAndTaskIdMap.get(writerResourceMark) == null) {
            writerResourceMarkAndTaskIdMap.put(writerResourceMark, new LinkedList<Integer>());
        }
        writerResourceMarkAndTaskIdMap.get(writerResourceMark).add(taskId);
    }
    if (readerResourceMarkAndTaskIdMap.size() >= writerResourceMarkAndTaskIdMap.size()) {
        // shuffle using the resource marks made by the reader
        return readerResourceMarkAndTaskIdMap;
    } else {
        // shuffle using the resource marks made by the writer
        return writerResourceMarkAndTaskIdMap;
    }
}
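To illustrate the selection rule, here is a minimal sketch using plain strings in place of Configuration objects (illustrative only): with reader marks {db1, db1, db2, db2} and a single writer mark shared by all four tasks, the reader map has more keys, so the reader marks would drive the shuffle.

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;

// Illustrative only: groups taskIds by resource mark the same way the method above does.
final class ResourceMarkGrouping {
    static LinkedHashMap<String, List<Integer>> group(List<String> marks) {
        LinkedHashMap<String, List<Integer>> map = new LinkedHashMap<String, List<Integer>>();
        for (int taskId = 0; taskId < marks.size(); taskId++) {
            String mark = marks.get(taskId);
            if (map.get(mark) == null) {
                map.put(mark, new LinkedList<Integer>());
            }
            map.get(mark).add(taskId);
        }
        return map;
    }

    public static void main(String[] args) {
        LinkedHashMap<String, List<Integer>> readerMap = group(Arrays.asList("db1", "db1", "db2", "db2"));
        LinkedHashMap<String, List<Integer>> writerMap = group(Arrays.asList("dst", "dst", "dst", "dst"));
        System.out.println(readerMap); // {db1=[0, 1], db2=[2, 3]} -- 2 keys, wins the comparison
        System.out.println(writerMap); // {dst=[0, 1, 2, 3]}       -- 1 key
    }
}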
Use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.
The class ProcessInnerScheduler, method startAllTaskGroup.
@Override
public void startAllTaskGroup(List<Configuration> configurations) {
    this.taskGroupContainerExecutorService = Executors.newFixedThreadPool(configurations.size());
    for (Configuration taskGroupConfiguration : configurations) {
        TaskGroupContainerRunner taskGroupContainerRunner = newTaskGroupContainerRunner(taskGroupConfiguration);
        this.taskGroupContainerExecutorService.execute(taskGroupContainerRunner);
    }
    this.taskGroupContainerExecutorService.shutdown();
}
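The pattern above is one thread per task group: a fixed pool sized to the number of groups, each group submitted as its own Runnable, then shutdown() so the pool stops accepting new work while the already-submitted groups run to completion. Below is a minimal standalone sketch of the same pattern, with TaskGroupContainerRunner replaced by a hypothetical Runnable.

import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

// Illustrative only: one worker thread per "task group", mirroring startAllTaskGroup above.
final class OneThreadPerGroup {
    public static void main(String[] args) {
        List<String> groups = Arrays.asList("taskGroup-0", "taskGroup-1", "taskGroup-2");
        ExecutorService pool = Executors.newFixedThreadPool(groups.size());
        for (String group : groups) {
            pool.execute(() -> System.out.println(Thread.currentThread().getName() + " runs " + group));
        }
        // shutdown() lets already-submitted groups finish but rejects new submissions.
        pool.shutdown();
    }
}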