
Example 1 with MapReduceContext

Use of io.cdap.cdap.api.mapreduce.MapReduceContext in project cdap by caskdata.

The method setInput of the class PartitionBatchInput.

/**
 * Used from the initialize method of the implementing batch job to configure, as input, the set of
 * {@link Partition}s of a {@link PartitionedFileSet} to be processed by this run of the batch job.
 * It does this by reading back the previous state, determining the new partitions to read, computing the new
 * state, and persisting this new state. It then configures this dataset as input to the mapreduce context that is
 * passed in.
 *
 * @param mapreduceContext MapReduce context used to access the PartitionedFileSet, and on which the input is
 *                         configured
 * @param partitionedFileSetName the name of the {@link PartitionedFileSet} to consume partitions from
 * @param statePersistor a {@link DatasetStatePersistor} responsible for defining how the partition consumer state is
 *                       managed
 * @param consumerConfiguration defines parameters for the partition consumption
 * @return a BatchPartitionCommitter used to persist the state of the partition consumer
 */
public static BatchPartitionCommitter setInput(MapReduceContext mapreduceContext, String partitionedFileSetName, DatasetStatePersistor statePersistor, ConsumerConfiguration consumerConfiguration) {
    PartitionedFileSet partitionedFileSet = mapreduceContext.getDataset(partitionedFileSetName);
    final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(partitionedFileSet, new DelegatingStatePersistor(mapreduceContext, statePersistor), consumerConfiguration);
    final List<PartitionDetail> consumedPartitions = partitionConsumer.consumePartitions().getPartitions();
    Map<String, String> arguments = new HashMap<>();
    PartitionedFileSetArguments.addInputPartitions(arguments, consumedPartitions);
    mapreduceContext.addInput(Input.ofDataset(partitionedFileSetName, arguments));
    return succeeded -> partitionConsumer.onFinish(consumedPartitions, succeeded);
}
Also used : Input(io.cdap.cdap.api.data.batch.Input) DatasetStatePersistor(io.cdap.cdap.api.dataset.lib.DatasetStatePersistor) PartitionDetail(io.cdap.cdap.api.dataset.lib.PartitionDetail) List(java.util.List) PartitionedFileSetArguments(io.cdap.cdap.api.dataset.lib.PartitionedFileSetArguments) Beta(io.cdap.cdap.api.annotation.Beta) Map(java.util.Map) Partition(io.cdap.cdap.api.dataset.lib.Partition) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) HashMap(java.util.HashMap) MapReduceContext(io.cdap.cdap.api.mapreduce.MapReduceContext)
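
For orientation, here is a minimal, hedged sketch of how a MapReduce program might call this method from its own initialize() and commit the consumer state from destroy(). The class name, dataset name, and state-table arguments are illustrative (not taken from the snippet above); only the setInput signature and the returned BatchPartitionCommitter come from the code shown here.

public class PartitionConsumingMapReduce extends AbstractMapReduce {

    private PartitionBatchInput.BatchPartitionCommitter partitionCommitter;

    @Override
    public void initialize() throws Exception {
        MapReduceContext context = getContext();
        // Consume only the partitions of "rawRecords" (hypothetical dataset name) that previous
        // runs have not yet processed; consumer state is kept in a key-value table.
        partitionCommitter = PartitionBatchInput.setInput(
            context, "rawRecords",
            new KVTableStatePersistor("consumingState", "state.key"),
            ConsumerConfiguration.DEFAULT);
        // Mapper/reducer setup on context.getHadoopJob() omitted for brevity.
    }

    @Override
    public void destroy() {
        // Persist the consumer state on success, or release the partitions for a later retry on failure.
        boolean succeeded = getContext().getState().getStatus() == ProgramStatus.COMPLETED;
        partitionCommitter.onFinish(succeeded);
    }
}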

Example 2 with MapReduceContext

Use of io.cdap.cdap.api.mapreduce.MapReduceContext in project cdap by caskdata.

The method initialize of the class ETLMapReduce.

@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
    MapReduceContext context = getContext();
    Map<String, String> properties = context.getSpecification().getProperties();
    if (Boolean.valueOf(properties.get(Constants.STAGE_LOGGING_ENABLED))) {
        LogStageInjector.start();
    }
    PipelineRuntime pipelineRuntime = new PipelineRuntime(context, mrMetrics);
    Job job = context.getHadoopJob();
    Configuration hConf = job.getConfiguration();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        hConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    // should never happen if planner is correct
    Set<StageSpec> reducers = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
    if (reducers.size() > 1) {
        Iterator<StageSpec> reducerIter = reducers.iterator();
        StringBuilder reducersStr = new StringBuilder(reducerIter.next().getName());
        while (reducerIter.hasNext()) {
            reducersStr.append(",");
            reducersStr.append(reducerIter.next().getName());
        }
        throw new IllegalStateException("Found multiple reducers ( " + reducersStr + " ) in the same pipeline phase. " + "This means there was a bug in planning the pipeline when it was deployed. ");
    }
    job.setMapperClass(ETLMapper.class);
    if (reducers.isEmpty()) {
        job.setNumReduceTasks(0);
    } else {
        job.setReducerClass(ETLReducer.class);
    }
    // instantiate plugins and call their prepare methods
    Set<String> connectorDatasets = GSON.fromJson(properties.get(Constants.CONNECTOR_DATASETS), CONNECTOR_DATASETS_TYPE);
    MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), context.getLogicalStartTime(), context, context, context.getNamespace());
    MapReducePreparer preparer = new MapReducePreparer(context, mrMetrics, evaluator, pipelineRuntime, connectorDatasets);
    List<Finisher> finishers = preparer.prepare(phaseSpec, job);
    finisher = new CompositeFinisher(finishers);
}
Also used : PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) Configuration(org.apache.hadoop.conf.Configuration) CompositeFinisher(io.cdap.cdap.etl.common.submit.CompositeFinisher) MapReduceContext(io.cdap.cdap.api.mapreduce.MapReduceContext) Finisher(io.cdap.cdap.etl.common.submit.Finisher) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec) Job(org.apache.hadoop.mapreduce.Job) HashMap(java.util.HashMap) Map(java.util.Map) TransactionPolicy(io.cdap.cdap.api.annotation.TransactionPolicy)
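
The CompositeFinisher built on the last line is not invoked inside initialize(); it is kept so the prepared plugins can be notified once the run ends. Below is a hedged sketch of the corresponding destroy() hook, illustrating the Finisher contract rather than quoting the actual ETLMapReduce.destroy body; it mirrors the commit-on-success pattern of the BatchPartitionCommitter in Example 1.

@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void destroy() {
    // Whether the MapReduce run completed successfully.
    boolean succeeded = getContext().getState().getStatus() == ProgramStatus.COMPLETED;
    if (finisher != null) {
        // Notifies every Finisher collected by MapReducePreparer.prepare(...) of the outcome.
        finisher.onFinish(succeeded);
    }
}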

Aggregations

MapReduceContext (io.cdap.cdap.api.mapreduce.MapReduceContext)2
HashMap (java.util.HashMap)2
Map (java.util.Map)2
Beta (io.cdap.cdap.api.annotation.Beta)1
TransactionPolicy (io.cdap.cdap.api.annotation.TransactionPolicy)1
Input (io.cdap.cdap.api.data.batch.Input)1
DatasetStatePersistor (io.cdap.cdap.api.dataset.lib.DatasetStatePersistor)1
Partition (io.cdap.cdap.api.dataset.lib.Partition)1
PartitionDetail (io.cdap.cdap.api.dataset.lib.PartitionDetail)1
PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet)1
PartitionedFileSetArguments (io.cdap.cdap.api.dataset.lib.PartitionedFileSetArguments)1
MacroEvaluator (io.cdap.cdap.api.macro.MacroEvaluator)1
BatchPhaseSpec (io.cdap.cdap.etl.batch.BatchPhaseSpec)1
DefaultMacroEvaluator (io.cdap.cdap.etl.common.DefaultMacroEvaluator)1
PipelineRuntime (io.cdap.cdap.etl.common.PipelineRuntime)1
CompositeFinisher (io.cdap.cdap.etl.common.submit.CompositeFinisher)1
Finisher (io.cdap.cdap.etl.common.submit.Finisher)1
StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec)1
List (java.util.List)1
Configuration (org.apache.hadoop.conf.Configuration)1