
Example 1 with Finisher

Use of io.cdap.cdap.etl.common.submit.Finisher in project cdap by caskdata.

From the class ETLSpark, method initialize():

@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
    SparkClientContext context = getContext();
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.speculation", "false");
    // turn off auto-broadcast by default until we better understand the implications and can set this to a
    // value that we are confident is safe.
    sparkConf.set("spark.sql.autoBroadcastJoinThreshold", "-1");
    sparkConf.set("spark.maxRemoteBlockSizeFetchToMem", String.valueOf(Integer.MAX_VALUE - 512));
    sparkConf.set("spark.network.timeout", "600s");
    // Disable yarn app retries since spark already performs retries at a task level.
    sparkConf.set("spark.yarn.maxAppAttempts", "1");
    // to make sure fields that are the same but different casing are treated as different fields in auto-joins
    // see CDAP-17024
    sparkConf.set("spark.sql.caseSensitive", "true");
    context.setSparkConf(sparkConf);
    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    PipelineRuntime pipelineRuntime = new PipelineRuntime(context);
    MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), context.getLogicalStartTime(), context, context, context.getNamespace());
    SparkPreparer preparer = new SparkPreparer(context, context.getMetrics(), evaluator, pipelineRuntime);
    List<Finisher> finishers = preparer.prepare(phaseSpec);
    finisher = new CompositeFinisher(finishers);
}
Also used: PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime), DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator), MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator), SparkClientContext(io.cdap.cdap.api.spark.SparkClientContext), CompositeFinisher(io.cdap.cdap.etl.common.submit.CompositeFinisher), Finisher(io.cdap.cdap.etl.common.submit.Finisher), BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec), SparkConf(org.apache.spark.SparkConf), HashMap(java.util.HashMap), Map(java.util.Map), TransactionPolicy(io.cdap.cdap.api.annotation.TransactionPolicy)
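
In both initialize() methods shown on this page, the per-stage finishers returned by the preparer are wrapped in a single CompositeFinisher. A minimal sketch of the contract this implies, inferred from the usage in these examples rather than copied from the cdap source, could look like this:

import java.util.List;

// Hypothetical sketch, not the cdap source: the Finisher contract as implied by Example 4,
// plus a composite that fans the callback out to every registered finisher.
interface Finisher {
    // Called once when the run completes; succeeded tells the finisher whether the run was successful.
    void onFinish(boolean succeeded);
}

class CompositeFinisher implements Finisher {
    private final List<Finisher> finishers;

    CompositeFinisher(List<Finisher> finishers) {
        this.finishers = finishers;
    }

    @Override
    public void onFinish(boolean succeeded) {
        // Assumption: run every finisher even if an earlier one throws, so cleanup is never skipped.
        for (Finisher finisher : finishers) {
            try {
                finisher.onFinish(succeeded);
            } catch (RuntimeException e) {
                // The real class may log or rethrow; swallowing here keeps the sketch simple.
            }
        }
    }
}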

Example 2 with Finisher

Use of io.cdap.cdap.etl.common.submit.Finisher in project cdap by caskdata.

From the class ETLMapReduce, method initialize():

@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
    MapReduceContext context = getContext();
    Map<String, String> properties = context.getSpecification().getProperties();
    if (Boolean.valueOf(properties.get(Constants.STAGE_LOGGING_ENABLED))) {
        LogStageInjector.start();
    }
    PipelineRuntime pipelineRuntime = new PipelineRuntime(context, mrMetrics);
    Job job = context.getHadoopJob();
    Configuration hConf = job.getConfiguration();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        hConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    // should never happen if planner is correct
    Set<StageSpec> reducers = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
    if (reducers.size() > 1) {
        Iterator<StageSpec> reducerIter = reducers.iterator();
        StringBuilder reducersStr = new StringBuilder(reducerIter.next().getName());
        while (reducerIter.hasNext()) {
            reducersStr.append(",");
            reducersStr.append(reducerIter.next().getName());
        }
        throw new IllegalStateException("Found multiple reducers ( " + reducersStr + " ) in the same pipeline phase. " + "This means there was a bug in planning the pipeline when it was deployed. ");
    }
    job.setMapperClass(ETLMapper.class);
    if (reducers.isEmpty()) {
        job.setNumReduceTasks(0);
    } else {
        job.setReducerClass(ETLReducer.class);
    }
    // instantiate plugins and call their prepare methods
    Set<String> connectorDatasets = GSON.fromJson(properties.get(Constants.CONNECTOR_DATASETS), CONNECTOR_DATASETS_TYPE);
    MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), context.getLogicalStartTime(), context, context, context.getNamespace());
    MapReducePreparer preparer = new MapReducePreparer(context, mrMetrics, evaluator, pipelineRuntime, connectorDatasets);
    List<Finisher> finishers = preparer.prepare(phaseSpec, job);
    finisher = new CompositeFinisher(finishers);
}
Also used: PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime), DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator), MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator), Configuration(org.apache.hadoop.conf.Configuration), CompositeFinisher(io.cdap.cdap.etl.common.submit.CompositeFinisher), MapReduceContext(io.cdap.cdap.api.mapreduce.MapReduceContext), Finisher(io.cdap.cdap.etl.common.submit.Finisher), StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec), BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec), Job(org.apache.hadoop.mapreduce.Job), HashMap(java.util.HashMap), Map(java.util.Map), TransactionPolicy(io.cdap.cdap.api.annotation.TransactionPolicy)
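
Note that initialize() only builds the composite and stores it in the finisher field; it is invoked later, when the program run ends. A hedged sketch of what that later hook might look like (the destroy() name and the program-state lookup are assumptions, not taken from the cdap source):

// Hypothetical destroy() hook; the exact state lookup is an assumption about the cdap API.
@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void destroy() {
    // Whether the run completed successfully; the real classes derive this from the program state.
    boolean succeeded = getContext().getState().getStatus() == ProgramStatus.COMPLETED;
    if (finisher != null) {
        // Fan the result out to every plugin finisher collected during initialize().
        finisher.onFinish(succeeded);
    }
}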

Example 3 with Finisher

Use of io.cdap.cdap.etl.common.submit.Finisher in project cdap by caskdata.

From the class MapReducePreparer, method prepare():

public List<Finisher> prepare(BatchPhaseSpec phaseSpec, Job job) throws TransactionFailureException, InstantiationException, IOException {
    this.job = job;
    this.hConf = job.getConfiguration();
    hConf.setBoolean("mapreduce.map.speculative", false);
    hConf.setBoolean("mapreduce.reduce.speculative", false);
    sinkOutputs = new HashMap<>();
    inputAliasToStage = new HashMap<>();
    // Collect field operations emitted by various stages in this MapReduce program
    stageOperations = new HashMap<>();
    List<Finisher> finishers = prepare(phaseSpec);
    hConf.set(ETLMapReduce.SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));
    hConf.set(ETLMapReduce.INPUT_ALIAS_KEY, GSON.toJson(inputAliasToStage));
    WorkflowToken token = context.getWorkflowToken();
    if (token != null) {
        for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
            token.put(entry.getKey(), entry.getValue());
        }
        // Put the collected field operations in workflow token
        token.put(Constants.FIELD_OPERATION_KEY_IN_WORKFLOW_TOKEN, GSON.toJson(stageOperations));
    }
    // token is null when just the mapreduce job is run but not the entire workflow
    // we still want things to work in that case.
    hConf.set(ETLMapReduce.RUNTIME_ARGS_KEY, GSON.toJson(pipelineRuntime.getArguments().asMap()));
    return finishers;
}
Also used: Finisher(io.cdap.cdap.etl.common.submit.Finisher), WorkflowToken(io.cdap.cdap.api.workflow.WorkflowToken), HashMap(java.util.HashMap), Map(java.util.Map)
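
Because prepare() serializes the sink outputs, input aliases, and runtime arguments into the Hadoop Configuration as JSON, the mapper and reducer tasks can read them back at execution time. A hypothetical helper illustrating that read-back for the runtime arguments (the class and method names here are illustrative, not part of cdap):

import java.lang.reflect.Type;
import java.util.Map;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import org.apache.hadoop.conf.Configuration;

// Hypothetical helper, not part of cdap: reads back the runtime arguments that prepare()
// stored under ETLMapReduce.RUNTIME_ARGS_KEY as a JSON map.
final class RuntimeArgsReader {
    private static final Gson GSON = new Gson();
    private static final Type MAP_TYPE = new TypeToken<Map<String, String>>() { }.getType();

    static Map<String, String> readRuntimeArgs(Configuration hConf) {
        String json = hConf.get(ETLMapReduce.RUNTIME_ARGS_KEY);
        return GSON.fromJson(json, MAP_TYPE);
    }
}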

Example 4 with Finisher

Use of io.cdap.cdap.etl.common.submit.Finisher in project cdap by caskdata.

From the class SparkPreparer, method prepare():

@Override
public List<Finisher> prepare(PhaseSpec phaseSpec) throws TransactionFailureException, InstantiationException, IOException {
    stageOperations = new HashMap<>();
    stagePartitions = new HashMap<>();
    File configFile = File.createTempFile("HydratorSpark", ".config");
    if (!configFile.getParentFile().exists()) {
        configFile.getParentFile().mkdirs();
    }
    if (!configFile.exists()) {
        configFile.createNewFile();
    }
    List<Finisher> finishers = super.prepare(phaseSpec);
    finishers.add(new Finisher() {

        @Override
        public void onFinish(boolean succeeded) {
            if (!configFile.delete()) {
                LOG.warn("Failed to clean up resource {} ", configFile);
            }
        }
    });
    try (Writer writer = Files.newBufferedWriter(configFile.toPath(), StandardCharsets.UTF_8)) {
        SparkBatchSourceSinkFactoryInfo sourceSinkInfo = new SparkBatchSourceSinkFactoryInfo(sourceFactory, sinkFactory, stagePartitions);
        writer.write(GSON.toJson(sourceSinkInfo));
    }
    context.localize("HydratorSpark.config", configFile.toURI());
    WorkflowToken token = context.getWorkflowToken();
    if (token != null) {
        for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
            token.put(entry.getKey(), entry.getValue());
        }
        // Put the collected field operations in workflow token
        token.put(Constants.FIELD_OPERATION_KEY_IN_WORKFLOW_TOKEN, GSON.toJson(stageOperations));
    }
    return finishers;
}
Also used: Finisher(io.cdap.cdap.etl.common.submit.Finisher), WorkflowToken(io.cdap.cdap.api.workflow.WorkflowToken), File(java.io.File), HashMap(java.util.HashMap), Map(java.util.Map), Writer(java.io.Writer)
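
The anonymous Finisher above only overrides onFinish, so the same cleanup registration could also be written as a lambda, assuming Finisher declares a single abstract method as the usage here suggests:

// Equivalent cleanup registration as a lambda; valid only if Finisher has a single abstract method.
finishers.add(succeeded -> {
    if (!configFile.delete()) {
        LOG.warn("Failed to clean up resource {} ", configFile);
    }
});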

Aggregations

Finisher (io.cdap.cdap.etl.common.submit.Finisher): 4
HashMap (java.util.HashMap): 4
Map (java.util.Map): 4
TransactionPolicy (io.cdap.cdap.api.annotation.TransactionPolicy): 2
MacroEvaluator (io.cdap.cdap.api.macro.MacroEvaluator): 2
WorkflowToken (io.cdap.cdap.api.workflow.WorkflowToken): 2
BatchPhaseSpec (io.cdap.cdap.etl.batch.BatchPhaseSpec): 2
DefaultMacroEvaluator (io.cdap.cdap.etl.common.DefaultMacroEvaluator): 2
PipelineRuntime (io.cdap.cdap.etl.common.PipelineRuntime): 2
CompositeFinisher (io.cdap.cdap.etl.common.submit.CompositeFinisher): 2
MapReduceContext (io.cdap.cdap.api.mapreduce.MapReduceContext): 1
SparkClientContext (io.cdap.cdap.api.spark.SparkClientContext): 1
StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec): 1
File (java.io.File): 1
Writer (java.io.Writer): 1
Configuration (org.apache.hadoop.conf.Configuration): 1
Job (org.apache.hadoop.mapreduce.Job): 1
SparkConf (org.apache.spark.SparkConf): 1