Use of co.cask.cdap.etl.api.batch.BatchSourceContext in project cdap by caskdata.
The class ETLMapReduce, method initialize(). The method instantiates every source and sink plugin of the pipeline phase, calls prepareRun(...) on each with a MapReduceBatchContext (which implements BatchSourceContext for sources), and records the resulting runtime arguments, input aliases, and sink outputs in the Hadoop Configuration before the MapReduce job is submitted.
@Override
public void initialize() throws Exception {
  MapReduceContext context = getContext();
  Map<String, String> properties = context.getSpecification().getProperties();
  if (Boolean.valueOf(properties.get(Constants.STAGE_LOGGING_ENABLED))) {
    LogStageInjector.start();
  }
  CompositeFinisher.Builder finishers = CompositeFinisher.builder();
  Job job = context.getHadoopJob();
  Configuration hConf = job.getConfiguration();
  // disable speculative execution for map and reduce tasks
  hConf.setBoolean("mapreduce.map.speculative", false);
  hConf.setBoolean("mapreduce.reduce.speculative", false);
  // plugin name -> runtime args for that plugin
  Map<String, Map<String, String>> runtimeArgs = new HashMap<>();
  MacroEvaluator evaluator = new DefaultMacroEvaluator(context.getWorkflowToken(), context.getRuntimeArguments(),
                                                       context.getLogicalStartTime(), context, context.getNamespace());
  BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
  for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
    hConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
  }
  PipelinePhase phase = phaseSpec.getPhase();
  PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(context, mrMetrics, phaseSpec);
  // prepare each source and remember which input alias belongs to which stage
  Map<String, String> inputAliasToStage = new HashMap<>();
  for (String sourceName : phase.getSources()) {
    try {
      BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(sourceName, evaluator);
      StageInfo stageInfo = phase.getStage(sourceName);
      MapReduceBatchContext sourceContext = new MapReduceBatchContext(context, mrMetrics, stageInfo);
      batchSource.prepareRun(sourceContext);
      runtimeArgs.put(sourceName, sourceContext.getRuntimeArguments());
      for (String inputAlias : sourceContext.getInputNames()) {
        inputAliasToStage.put(inputAlias, sourceName);
      }
      finishers.add(batchSource, sourceContext);
    } catch (Exception e) {
      // catch the exception to generate a user error log for the pipeline
      PIPELINE_LOG.error("Failed to initialize batch source '{}' with the error: {}. Please review your pipeline "
                           + "configuration and check the system logs for more details.",
                         sourceName, Throwables.getRootCause(e).getMessage(), Throwables.getRootCause(e));
      throw e;
    }
  }
  hConf.set(INPUT_ALIAS_KEY, GSON.toJson(inputAliasToStage));
  Map<String, SinkOutput> sinkOutputs = new HashMap<>();
  for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Constants.CONNECTOR_TYPE),
                                        phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
    String sinkName = stageInfo.getName();
    // todo: add a better way to get info for all sinks
    if (!phase.getSinks().contains(sinkName)) {
      continue;
    }
    try {
      BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(sinkName, evaluator);
      MapReduceBatchContext sinkContext = new MapReduceBatchContext(context, mrMetrics, stageInfo);
      batchSink.prepareRun(sinkContext);
      runtimeArgs.put(sinkName, sinkContext.getRuntimeArguments());
      finishers.add(batchSink, sinkContext);
      sinkOutputs.put(sinkName, new SinkOutput(sinkContext.getOutputNames(), stageInfo.getErrorDatasetName()));
    } catch (Exception e) {
      // catch the exception to generate a user error log for the pipeline
      PIPELINE_LOG.error("Failed to initialize batch sink '{}' with the error: {}. Please review your pipeline "
                           + "configuration and check the system logs for more details.",
                         sinkName, Throwables.getRootCause(e).getMessage(), Throwables.getRootCause(e));
      throw e;
    }
  }
  hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));
  // setup time partition for each error dataset
  for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Transform.PLUGIN_TYPE),
                                        phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
    if (stageInfo.getErrorDatasetName() != null) {
      Map<String, String> args = new HashMap<>();
      args.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + "avro.schema.output.key",
               Constants.ERROR_SCHEMA.toString());
      TimePartitionedFileSetArguments.setOutputPartitionTime(args, context.getLogicalStartTime());
      context.addOutput(Output.ofDataset(stageInfo.getErrorDatasetName(), args));
    }
  }
  job.setMapperClass(ETLMapper.class);
  Set<StageInfo> reducers = phase.getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
  if (!reducers.isEmpty()) {
    job.setReducerClass(ETLReducer.class);
    String reducerName = reducers.iterator().next().getName();
    StageInfo stageInfo = phase.getStage(reducerName);
    Class<?> outputKeyClass;
    Class<?> outputValClass;
    try {
      if (!phase.getStagesOfType(BatchAggregator.PLUGIN_TYPE).isEmpty()) {
        BatchAggregator aggregator = pluginInstantiator.newPluginInstance(reducerName, evaluator);
        DefaultAggregatorContext aggregatorContext = new DefaultAggregatorContext(context, mrMetrics, stageInfo);
        aggregator.prepareRun(aggregatorContext);
        finishers.add(aggregator, aggregatorContext);
        if (aggregatorContext.getNumPartitions() != null) {
          job.setNumReduceTasks(aggregatorContext.getNumPartitions());
        }
        outputKeyClass = aggregatorContext.getGroupKeyClass();
        outputValClass = aggregatorContext.getGroupValueClass();
        if (outputKeyClass == null) {
          outputKeyClass = TypeChecker.getGroupKeyClass(aggregator);
        }
        if (outputValClass == null) {
          outputValClass = TypeChecker.getGroupValueClass(aggregator);
        }
        hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
        hConf.set(MAP_VAL_CLASS, outputValClass.getName());
        job.setMapOutputKeyClass(getOutputKeyClass(reducerName, outputKeyClass));
        job.setMapOutputValueClass(getOutputValClass(reducerName, outputValClass));
      } else {
        // reducer type is joiner
        BatchJoiner batchJoiner = pluginInstantiator.newPluginInstance(reducerName, evaluator);
        DefaultJoinerContext joinerContext = new DefaultJoinerContext(context, mrMetrics, stageInfo);
        batchJoiner.prepareRun(joinerContext);
        finishers.add(batchJoiner, joinerContext);
        if (joinerContext.getNumPartitions() != null) {
          job.setNumReduceTasks(joinerContext.getNumPartitions());
        }
        outputKeyClass = joinerContext.getJoinKeyClass();
        Class<?> inputRecordClass = joinerContext.getJoinInputRecordClass();
        if (outputKeyClass == null) {
          outputKeyClass = TypeChecker.getJoinKeyClass(batchJoiner);
        }
        if (inputRecordClass == null) {
          inputRecordClass = TypeChecker.getJoinInputRecordClass(batchJoiner);
        }
        hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
        hConf.set(MAP_VAL_CLASS, inputRecordClass.getName());
        job.setMapOutputKeyClass(getOutputKeyClass(reducerName, outputKeyClass));
        getOutputValClass(reducerName, inputRecordClass);
        // for joiner plugin map output is tagged with stageName
        job.setMapOutputValueClass(TaggedWritable.class);
      }
    } catch (Exception e) {
      // catch the exception to generate a user error log for the pipeline
      PIPELINE_LOG.error("Failed to initialize pipeline stage '{}' with the error: {}. Please review your pipeline "
                           + "configuration and check the system logs for more details.",
                         reducerName, Throwables.getRootCause(e).getMessage(), Throwables.getRootCause(e));
      throw e;
    }
  } else {
    job.setNumReduceTasks(0);
  }
  hConf.set(RUNTIME_ARGS_KEY, GSON.toJson(runtimeArgs));
  // build the composite finisher only after every stage, including any reducer,
  // has been added, so that none of the registered finishers is dropped
  finisher = finishers.build();
}
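
For context, a source plugin driven by this method sees the other side of the contract: prepareRun(BatchSourceContext) is where it registers its input, which is what makes sourceContext.getInputNames() above return the aliases recorded in inputAliasToStage. Below is a minimal sketch, not code from the cdap repository: the class name EventsSource, the dataset name "events", and the empty transform are illustrative assumptions written against the CDAP 4.x plugin API.

import co.cask.cdap.api.data.batch.Input;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.api.dataset.table.Row;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.batch.BatchSource;
import co.cask.cdap.etl.api.batch.BatchSourceContext;

// hypothetical source plugin that reads the "events" Table dataset
public class EventsSource extends BatchSource<byte[], Row, StructuredRecord> {

  @Override
  public void prepareRun(BatchSourceContext context) throws Exception {
    // registering the input here is the side effect that initialize() above
    // observes through sourceContext.getInputNames()
    context.setInput(Input.ofDataset("events"));
  }

  @Override
  public void transform(KeyValue<byte[], Row> input, Emitter<StructuredRecord> emitter) throws Exception {
    // conversion of the raw Row into a StructuredRecord would go here
  }
}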
Use of co.cask.cdap.etl.api.batch.BatchSourceContext in project cdap by caskdata.
The class ETLSpark, method initialize(). The Spark counterpart walks every stage of the phase, calls prepareRun(...) on each plugin with a context matching its type (SparkBatchSourceContext implements BatchSourceContext for sources), and serializes the source/sink factories and per-stage partition counts into a config file that is localized to the Spark program.
@Override
public void initialize() throws Exception {
  SparkClientContext context = getContext();
  cleanupFiles = new ArrayList<>();
  CompositeFinisher.Builder finishers = CompositeFinisher.builder();
  SparkConf sparkConf = new SparkConf();
  sparkConf.set("spark.driver.extraJavaOptions", "-XX:MaxPermSize=256m");
  sparkConf.set("spark.executor.extraJavaOptions", "-XX:MaxPermSize=256m");
  sparkConf.set("spark.speculation", "false");
  context.setSparkConf(sparkConf);
  Map<String, String> properties = context.getSpecification().getProperties();
  BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
  for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
    sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
  }
  MacroEvaluator evaluator = new DefaultMacroEvaluator(context.getWorkflowToken(), context.getRuntimeArguments(),
                                                       context.getLogicalStartTime(), context, context.getNamespace());
  SparkBatchSourceFactory sourceFactory = new SparkBatchSourceFactory();
  SparkBatchSinkFactory sinkFactory = new SparkBatchSinkFactory();
  Map<String, Integer> stagePartitions = new HashMap<>();
  PluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(),
                                                               phaseSpec.isStageLoggingEnabled(),
                                                               phaseSpec.isProcessTimingEnabled());
  // call prepareRun on every stage, with a context matching its plugin type
  for (StageInfo stageInfo : phaseSpec.getPhase()) {
    String stageName = stageInfo.getName();
    String pluginType = stageInfo.getPluginType();
    if (BatchSource.PLUGIN_TYPE.equals(pluginType)) {
      BatchConfigurable<BatchSourceContext> batchSource = pluginContext.newPluginInstance(stageName, evaluator);
      BatchSourceContext sourceContext = new SparkBatchSourceContext(sourceFactory, context, stageInfo);
      batchSource.prepareRun(sourceContext);
      finishers.add(batchSource, sourceContext);
    } else if (BatchSink.PLUGIN_TYPE.equals(pluginType)) {
      BatchConfigurable<BatchSinkContext> batchSink = pluginContext.newPluginInstance(stageName, evaluator);
      BatchSinkContext sinkContext = new SparkBatchSinkContext(sinkFactory, context, null, stageInfo);
      batchSink.prepareRun(sinkContext);
      finishers.add(batchSink, sinkContext);
    } else if (SparkSink.PLUGIN_TYPE.equals(pluginType)) {
      BatchConfigurable<SparkPluginContext> sparkSink = pluginContext.newPluginInstance(stageName, evaluator);
      SparkPluginContext sparkPluginContext = new BasicSparkPluginContext(context, stageInfo);
      sparkSink.prepareRun(sparkPluginContext);
      finishers.add(sparkSink, sparkPluginContext);
    } else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
      BatchAggregator aggregator = pluginContext.newPluginInstance(stageName, evaluator);
      DefaultAggregatorContext aggregatorContext = new DefaultAggregatorContext(context, stageInfo);
      aggregator.prepareRun(aggregatorContext);
      finishers.add(aggregator, aggregatorContext);
      stagePartitions.put(stageName, aggregatorContext.getNumPartitions());
    } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
      BatchJoiner joiner = pluginContext.newPluginInstance(stageName, evaluator);
      DefaultJoinerContext sparkJoinerContext = new DefaultJoinerContext(context, stageInfo);
      joiner.prepareRun(sparkJoinerContext);
      finishers.add(joiner, sparkJoinerContext);
      stagePartitions.put(stageName, sparkJoinerContext.getNumPartitions());
    }
  }
  // serialize the factories and partition counts so the Spark program can read them back
  File configFile = File.createTempFile("HydratorSpark", ".config");
  cleanupFiles.add(configFile);
  try (Writer writer = Files.newBufferedWriter(configFile.toPath(), StandardCharsets.UTF_8)) {
    SparkBatchSourceSinkFactoryInfo sourceSinkInfo =
      new SparkBatchSourceSinkFactoryInfo(sourceFactory, sinkFactory, stagePartitions);
    writer.write(GSON.toJson(sourceSinkInfo));
  }
  finisher = finishers.build();
  context.localize("HydratorSpark.config", configFile.toURI());
}
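
The sink side of the same contract, again as a minimal sketch rather than code from the cdap repository: the class name EventsCopySink, the dataset name "eventsCopy", and the empty transform are illustrative assumptions against the CDAP 4.x plugin API.

import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.api.dataset.table.Put;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.batch.BatchSink;
import co.cask.cdap.etl.api.batch.BatchSinkContext;

// hypothetical sink plugin that writes each record as a Put to the "eventsCopy" Table dataset
public class EventsCopySink extends BatchSink<StructuredRecord, byte[], Put> {

  @Override
  public void prepareRun(BatchSinkContext context) throws Exception {
    // registering the output here is what the pipeline drivers above collect,
    // e.g. through sinkContext.getOutputNames() in ETLMapReduce.initialize()
    context.addOutput(Output.ofDataset("eventsCopy"));
  }

  @Override
  public void transform(StructuredRecord input, Emitter<KeyValue<byte[], Put>> emitter) throws Exception {
    // conversion of the StructuredRecord into a (row key, Put) pair would go here
  }
}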