Use of io.cdap.cdap.etl.common.submit.CompositeFinisher in project cdap by caskdata.
From the class ETLSpark, method initialize():
@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
  SparkClientContext context = getContext();

  SparkConf sparkConf = new SparkConf();
  sparkConf.set("spark.speculation", "false");
  // turn off auto-broadcast by default until we better understand the implications and can set this to a
  // value that we are confident is safe.
  sparkConf.set("spark.sql.autoBroadcastJoinThreshold", "-1");
  sparkConf.set("spark.maxRemoteBlockSizeFetchToMem", String.valueOf(Integer.MAX_VALUE - 512));
  sparkConf.set("spark.network.timeout", "600s");
  // Disable yarn app retries since spark already performs retries at a task level.
  sparkConf.set("spark.yarn.maxAppAttempts", "1");
  // to make sure fields that are the same but different casing are treated as different fields in auto-joins
  // see CDAP-17024
  sparkConf.set("spark.sql.caseSensitive", "true");
  context.setSparkConf(sparkConf);

  Map<String, String> properties = context.getSpecification().getProperties();
  // Deserialize the pipeline phase this Spark program is responsible for.
  BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
  // Pipeline-level engine properties are applied on top of the defaults set above.
  for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
    sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
  }

  PipelineRuntime pipelineRuntime = new PipelineRuntime(context);
  MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(),
                                                       context.getLogicalStartTime(), context, context,
                                                       context.getNamespace());
  // Instantiate the phase's plugins, call their prepare methods, and collect their finishers.
  SparkPreparer preparer = new SparkPreparer(context, context.getMetrics(), evaluator, pipelineRuntime);
  List<Finisher> finishers = preparer.prepare(phaseSpec);
  finisher = new CompositeFinisher(finishers);
}
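The finisher assigned above is consumed once the run ends. A minimal sketch of such a completion hook, assuming the Finisher contract is a single onFinish(boolean succeeded) callback; the exact status check below is an assumption for illustration, not a quote from the repository:

@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void destroy() {
  // Assumed check: report success only if the program completed normally.
  boolean succeeded = getContext().getState().getStatus() == ProgramStatus.COMPLETED;
  if (finisher != null) {
    // Fans the completion signal out to every Finisher collected in initialize().
    finisher.onFinish(succeeded);
  }
}

Whatever each stage registered during prepare (typically commit or rollback work for its outputs) then runs exactly once, all with the same success flag.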
Use of io.cdap.cdap.etl.common.submit.CompositeFinisher in project cdap by caskdata.
From the class ETLMapReduce, method initialize():
@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
  MapReduceContext context = getContext();
  Map<String, String> properties = context.getSpecification().getProperties();
  if (Boolean.valueOf(properties.get(Constants.STAGE_LOGGING_ENABLED))) {
    LogStageInjector.start();
  }
  PipelineRuntime pipelineRuntime = new PipelineRuntime(context, mrMetrics);

  Job job = context.getHadoopJob();
  Configuration hConf = job.getConfiguration();
  // Deserialize the pipeline phase this MapReduce program is responsible for.
  BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
  for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
    hConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
  }

  // A phase should never contain more than one reducer if the planner is correct.
  Set<StageSpec> reducers = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE,
                                                                 BatchJoiner.PLUGIN_TYPE);
  if (reducers.size() > 1) {
    Iterator<StageSpec> reducerIter = reducers.iterator();
    StringBuilder reducersStr = new StringBuilder(reducerIter.next().getName());
    while (reducerIter.hasNext()) {
      reducersStr.append(",");
      reducersStr.append(reducerIter.next().getName());
    }
    throw new IllegalStateException("Found multiple reducers (" + reducersStr + ") in the same pipeline phase. "
                                      + "This means there was a bug in planning the pipeline when it was deployed.");
  }

  job.setMapperClass(ETLMapper.class);
  if (reducers.isEmpty()) {
    job.setNumReduceTasks(0);
  } else {
    job.setReducerClass(ETLReducer.class);
  }

  // instantiate plugins and call their prepare methods
  Set<String> connectorDatasets = GSON.fromJson(properties.get(Constants.CONNECTOR_DATASETS),
                                                CONNECTOR_DATASETS_TYPE);
  MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(),
                                                       context.getLogicalStartTime(), context, context,
                                                       context.getNamespace());
  MapReducePreparer preparer = new MapReducePreparer(context, mrMetrics, evaluator, pipelineRuntime,
                                                     connectorDatasets);
  List<Finisher> finishers = preparer.prepare(phaseSpec, job);
  finisher = new CompositeFinisher(finishers);
}
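Both examples collapse the preparer's List<Finisher> into one CompositeFinisher, so each program only has to track a single object. A self-contained sketch of that pattern, assuming the Finisher interface declares only onFinish(boolean succeeded); the interface and class below are reconstructed for illustration, not copied from the CDAP sources:

import java.util.ArrayList;
import java.util.List;

public class CompositeFinisherSketch {

  // Assumed shape of io.cdap.cdap.etl.common.submit.Finisher.
  interface Finisher {
    void onFinish(boolean succeeded);
  }

  // Fans a single completion signal out to every delegate finisher.
  static class CompositeFinisher implements Finisher {
    private final List<Finisher> finishers;

    CompositeFinisher(List<Finisher> finishers) {
      this.finishers = new ArrayList<>(finishers);
    }

    @Override
    public void onFinish(boolean succeeded) {
      // Propagate the same success flag to each delegate in registration order.
      for (Finisher finisher : finishers) {
        finisher.onFinish(succeeded);
      }
    }
  }

  public static void main(String[] args) {
    List<Finisher> finishers = new ArrayList<>();
    finishers.add(ok -> System.out.println("source finisher, succeeded=" + ok));
    finishers.add(ok -> System.out.println("sink finisher, succeeded=" + ok));
    new CompositeFinisher(finishers).onFinish(true);
  }
}

The composite keeps ETLSpark and ETLMapReduce indifferent to how many stages registered cleanup work; one onFinish call reaches them all.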