Use of co.cask.cdap.etl.common.PipelinePhase in project cdap by caskdata.
The class ETLWorker, method configure():
@Override
public void configure() {
  setName(NAME);
  setDescription("Worker Driver for Realtime ETL Pipelines");
  int instances = config.getInstances();
  if (instances < 1) {
    throw new IllegalArgumentException("instances must be greater than 0.");
  }
  setInstances(instances);
  if (config.getResources() != null) {
    setResources(config.getResources());
  }
  PipelineSpecGenerator<ETLRealtimeConfig, PipelineSpec> specGenerator =
    new RealtimePipelineSpecGenerator(getConfigurer(),
                                      ImmutableSet.of(RealtimeSource.PLUGIN_TYPE),
                                      ImmutableSet.of(RealtimeSink.PLUGIN_TYPE),
                                      Table.class,
                                      TableProperties.builder().setSchema(ERROR_SCHEMA).build());
  PipelineSpec spec = specGenerator.generateSpec(config);
  int sourceCount = 0;
  for (StageSpec stageSpec : spec.getStages()) {
    if (RealtimeSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
      sourceCount++;
    }
  }
  if (sourceCount != 1) {
    throw new IllegalArgumentException("Invalid pipeline. There must only be one source.");
  }
  PipelinePlanner planner =
    new PipelinePlanner(SUPPORTED_PLUGIN_TYPES, ImmutableSet.<String>of(), ImmutableSet.<String>of(), ImmutableSet.<String>of());
  PipelinePlan plan = planner.plan(spec);
  if (plan.getPhases().size() != 1) {
    // should never happen
    throw new IllegalArgumentException("There was an error planning the pipeline. There should only be one phase.");
  }
  PipelinePhase pipeline = plan.getPhases().values().iterator().next();
  Map<String, String> properties = new HashMap<>();
  properties.put(Constants.PIPELINE_SPEC_KEY, GSON.toJson(spec));
  properties.put(Constants.PIPELINEID, GSON.toJson(pipeline));
  // Generate unique id for this app creation.
  properties.put(UNIQUE_ID, String.valueOf(System.currentTimeMillis()));
  properties.put(Constants.STAGE_LOGGING_ENABLED, String.valueOf(config.isStageLoggingEnabled()));
  setProperties(properties);
}
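The configure() method above stores the planned PipelinePhase and the PipelineSpec in the worker's properties as JSON so they can be rehydrated when the program runs; ETLMapReduce.initialize below reads the same Constants.PIPELINEID property back. A minimal sketch of that read-back side, assuming a runtime context variable, the same GSON instance, and type adapters that can deserialize PipelinePhase (none of which are shown in the original snippet):

  // Sketch only: rehydrating what configure() stored above. The context,
  // GSON instance, and PipelinePhase type adapters are assumed, not shown here.
  Map<String, String> props = context.getSpecification().getProperties();
  PipelineSpec spec = GSON.fromJson(props.get(Constants.PIPELINE_SPEC_KEY), PipelineSpec.class);
  PipelinePhase phase = GSON.fromJson(props.get(Constants.PIPELINEID), PipelinePhase.class);
  boolean stageLoggingEnabled = Boolean.valueOf(props.get(Constants.STAGE_LOGGING_ENABLED));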
Use of co.cask.cdap.etl.common.PipelinePhase in project cdap by caskdata.
The class ETLMapReduce, method initialize():
@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
  final MapReduceContext context = getContext();
  Map<String, String> properties = context.getSpecification().getProperties();
  if (Boolean.valueOf(properties.get(Constants.STAGE_LOGGING_ENABLED))) {
    LogStageInjector.start();
  }
  PipelineRuntime pipelineRuntime = new PipelineRuntime(context, mrMetrics);
  List<Finisher> finishers = new ArrayList<>();
  final Job job = context.getHadoopJob();
  final Configuration hConf = job.getConfiguration();
  hConf.setBoolean("mapreduce.map.speculative", false);
  hConf.setBoolean("mapreduce.reduce.speculative", false);
  // plugin name -> runtime args for that plugin
  MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), context.getLogicalStartTime(),
                                                       context, context.getNamespace());
  BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
  Set<String> connectorDatasets = GSON.fromJson(properties.get(Constants.CONNECTOR_DATASETS), CONNECTOR_DATASETS_TYPE);
  for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
    hConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
  }
  final PipelinePhase phase = phaseSpec.getPhase();
  PipelinePluginInstantiator pluginInstantiator =
    new PipelinePluginInstantiator(context, mrMetrics, phaseSpec, new MultiConnectorFactory());
  // should never happen if planner is correct
  Set<StageSpec> reducers = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
  if (reducers.size() > 1) {
    Iterator<StageSpec> reducerIter = reducers.iterator();
    StringBuilder reducersStr = new StringBuilder(reducerIter.next().getName());
    while (reducerIter.hasNext()) {
      reducersStr.append(",");
      reducersStr.append(reducerIter.next().getName());
    }
    throw new IllegalStateException("Found multiple reducers ( " + reducersStr + " ) in the same pipeline phase. " +
                                      "This means there was a bug in planning the pipeline when it was deployed. ");
  }
  job.setMapperClass(ETLMapper.class);
  if (reducers.isEmpty()) {
    job.setNumReduceTasks(0);
  } else {
    job.setReducerClass(ETLReducer.class);
  }
  final Map<String, SinkOutput> sinkOutputs = new HashMap<>();
  final Map<String, String> inputAliasToStage = new HashMap<>();
  // call prepareRun on each stage in order so that any arguments set by a stage will be visible to subsequent stages
  for (final String stageName : phase.getDag().getTopologicalOrder()) {
    final StageSpec stageSpec = phase.getStage(stageName);
    String pluginType = stageSpec.getPluginType();
    boolean isConnectorSource =
      Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSources().contains(stageName);
    boolean isConnectorSink =
      Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSinks().contains(stageName);
    SubmitterPlugin submitterPlugin = null;
    if (BatchSource.PLUGIN_TYPE.equals(pluginType) || isConnectorSource) {
      BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(stageName, evaluator);
      ContextProvider<MapReduceBatchContext> contextProvider =
        new MapReduceBatchContextProvider(context, pipelineRuntime, stageSpec, connectorDatasets);
      submitterPlugin = new SubmitterPlugin<>(stageName, context, batchSource, contextProvider,
                                              new SubmitterPlugin.PrepareAction<MapReduceBatchContext>() {
        @Override
        public void act(MapReduceBatchContext sourceContext) {
          for (String inputAlias : sourceContext.getInputNames()) {
            inputAliasToStage.put(inputAlias, stageName);
          }
        }
      });
    } else if (BatchSink.PLUGIN_TYPE.equals(pluginType) ||
               AlertPublisher.PLUGIN_TYPE.equals(pluginType) || isConnectorSink) {
      BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
      ContextProvider<MapReduceBatchContext> contextProvider =
        new MapReduceBatchContextProvider(context, pipelineRuntime, stageSpec, connectorDatasets);
      submitterPlugin = new SubmitterPlugin<>(stageName, context, batchSink, contextProvider,
                                              new SubmitterPlugin.PrepareAction<MapReduceBatchContext>() {
        @Override
        public void act(MapReduceBatchContext sinkContext) {
          sinkOutputs.put(stageName, new SinkOutput(sinkContext.getOutputNames()));
        }
      });
    } else if (Transform.PLUGIN_TYPE.equals(pluginType)) {
      Transform<?, ?> transform = pluginInstantiator.newPluginInstance(stageName, evaluator);
      ContextProvider<MapReduceBatchContext> contextProvider =
        new MapReduceBatchContextProvider(context, pipelineRuntime, stageSpec, connectorDatasets);
      submitterPlugin = new SubmitterPlugin<>(stageName, context, transform, contextProvider);
    } else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
      final BatchAggregator<?, ?, ?> aggregator = pluginInstantiator.newPluginInstance(stageName, evaluator);
      ContextProvider<DefaultAggregatorContext> contextProvider =
        new AggregatorContextProvider(pipelineRuntime, stageSpec, context.getAdmin());
      submitterPlugin = new SubmitterPlugin<>(stageName, context, aggregator, contextProvider,
                                              new SubmitterPlugin.PrepareAction<DefaultAggregatorContext>() {
        @Override
        public void act(DefaultAggregatorContext aggregatorContext) {
          if (aggregatorContext.getNumPartitions() != null) {
            job.setNumReduceTasks(aggregatorContext.getNumPartitions());
          }
          Class<?> outputKeyClass = aggregatorContext.getGroupKeyClass();
          Class<?> outputValClass = aggregatorContext.getGroupValueClass();
          if (outputKeyClass == null) {
            outputKeyClass = TypeChecker.getGroupKeyClass(aggregator);
          }
          if (outputValClass == null) {
            outputValClass = TypeChecker.getGroupValueClass(aggregator);
          }
          hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
          hConf.set(MAP_VAL_CLASS, outputValClass.getName());
          job.setMapOutputKeyClass(getOutputKeyClass(stageName, outputKeyClass));
          job.setMapOutputValueClass(getOutputValClass(stageName, outputValClass));
        }
      });
    } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
      final BatchJoiner<?, ?, ?> batchJoiner = pluginInstantiator.newPluginInstance(stageName, evaluator);
      ContextProvider<DefaultJoinerContext> contextProvider =
        new JoinerContextProvider(pipelineRuntime, stageSpec, context.getAdmin());
      submitterPlugin = new SubmitterPlugin<>(stageName, context, batchJoiner, contextProvider,
                                              new SubmitterPlugin.PrepareAction<DefaultJoinerContext>() {
        @Override
        public void act(DefaultJoinerContext joinerContext) {
          if (joinerContext.getNumPartitions() != null) {
            job.setNumReduceTasks(joinerContext.getNumPartitions());
          }
          Class<?> outputKeyClass = joinerContext.getJoinKeyClass();
          Class<?> inputRecordClass = joinerContext.getJoinInputRecordClass();
          if (outputKeyClass == null) {
            outputKeyClass = TypeChecker.getJoinKeyClass(batchJoiner);
          }
          if (inputRecordClass == null) {
            inputRecordClass = TypeChecker.getJoinInputRecordClass(batchJoiner);
          }
          hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
          hConf.set(MAP_VAL_CLASS, inputRecordClass.getName());
          job.setMapOutputKeyClass(getOutputKeyClass(stageName, outputKeyClass));
          getOutputValClass(stageName, inputRecordClass);
          // for joiner plugin map output is tagged with stageName
          job.setMapOutputValueClass(TaggedWritable.class);
        }
      });
    }
    if (submitterPlugin != null) {
      submitterPlugin.prepareRun();
      finishers.add(submitterPlugin);
    }
  }
  hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));
  hConf.set(INPUT_ALIAS_KEY, GSON.toJson(inputAliasToStage));
  finisher = new CompositeFinisher(finishers);
  job.setMapperClass(ETLMapper.class);
  WorkflowToken token = context.getWorkflowToken();
  if (token != null) {
    for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
      token.put(entry.getKey(), entry.getValue());
    }
  }
  // token is null when just the mapreduce job is run but not the entire workflow
  // we still want things to work in that case.
  hConf.set(RUNTIME_ARGS_KEY, GSON.toJson(pipelineRuntime.getArguments().asMap()));
}
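Stripped of the per-plugin-type branches, initialize() follows one pattern: walk the phase's DAG in topological order, call prepareRun for each stage so arguments set upstream are visible downstream, and collect the submitters so they can all be finished together. A condensed sketch of that skeleton, where createSubmitter() is a hypothetical stand-in for the if/else chain above:

  // Condensed sketch of the loop above; createSubmitter() is hypothetical and
  // stands in for the per-plugin-type branches (source, sink, transform, ...).
  List<Finisher> finishers = new ArrayList<>();
  for (String stageName : phase.getDag().getTopologicalOrder()) {
    StageSpec stageSpec = phase.getStage(stageName);
    SubmitterPlugin submitterPlugin = createSubmitter(stageSpec);
    if (submitterPlugin != null) {
      submitterPlugin.prepareRun();
      finishers.add(submitterPlugin);
    }
  }
  finisher = new CompositeFinisher(finishers);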
Use of co.cask.cdap.etl.common.PipelinePhase in project cdap by caskdata.
The class PipelineCondition, method apply():
@Override
public boolean apply(@Nullable WorkflowContext input) {
  if (input == null) {
    // should not happen
    throw new IllegalStateException("WorkflowContext for the Condition cannot be null.");
  }
  Map<String, String> properties = input.getConditionSpecification().getProperties();
  BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
  PipelinePhase phase = phaseSpec.getPhase();
  StageSpec stageSpec = phase.iterator().next();
  PluginContext pluginContext = new PipelinePluginContext(input, metrics, phaseSpec.isStageLoggingEnabled(),
                                                          phaseSpec.isProcessTimingEnabled());
  MacroEvaluator macroEvaluator =
    new DefaultMacroEvaluator(new BasicArguments(input.getToken(), input.getRuntimeArguments()),
                              input.getLogicalStartTime(), input, input.getNamespace());
  try {
    Condition condition = pluginContext.newPluginInstance(stageSpec.getName(), macroEvaluator);
    PipelineRuntime pipelineRuntime = new PipelineRuntime(input, metrics);
    ConditionContext conditionContext = new BasicConditionContext(input, pipelineRuntime, stageSpec);
    boolean result = condition.apply(conditionContext);
    WorkflowToken token = input.getToken();
    if (token == null) {
      throw new IllegalStateException("WorkflowToken cannot be null when Condition is executed through Workflow.");
    }
    for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
      token.put(entry.getKey(), entry.getValue());
    }
    return result;
  } catch (Exception e) {
    String msg = String.format("Error executing condition '%s' in the pipeline.", stageSpec.getName());
    throw new RuntimeException(msg, e);
  }
}
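The phase for a condition node contains a single stage, and the plugin that stage names is what the code above instantiates and applies. A hypothetical condition plugin of the kind it would run, assuming the usual cdap-etl-api Condition contract (boolean apply(ConditionContext)) and a getArguments() accessor on the context:

  // Hypothetical plugin evaluated by PipelineCondition.apply above; the
  // annotations and ConditionContext methods are assumed from cdap-etl-api.
  @Plugin(type = Condition.PLUGIN_TYPE)
  @Name("BranchOnFlag")
  public class BranchOnFlagCondition extends Condition {
    @Override
    public boolean apply(ConditionContext context) throws Exception {
      // Decide which workflow branch to take based on a runtime argument
      // that an upstream stage may have set.
      String flag = context.getArguments().get("branch.flag");
      return Boolean.parseBoolean(flag);
    }
  }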
Use of co.cask.cdap.etl.common.PipelinePhase in project cdap by caskdata.
The class PipelineAction, method run():
@Override
public void run() throws Exception {
  CustomActionContext context = getContext();
  Map<String, String> properties = context.getSpecification().getProperties();
  BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
  PipelinePhase phase = phaseSpec.getPhase();
  StageSpec stageSpec = phase.iterator().next();
  PluginContext pluginContext = new PipelinePluginContext(context, metrics, phaseSpec.isStageLoggingEnabled(),
                                                          phaseSpec.isProcessTimingEnabled());
  PipelineRuntime pipelineRuntime = new PipelineRuntime(context, metrics);
  Action action = pluginContext.newPluginInstance(stageSpec.getName(),
                                                  new DefaultMacroEvaluator(pipelineRuntime.getArguments(),
                                                                            context.getLogicalStartTime(),
                                                                            context, context.getNamespace()));
  ActionContext actionContext = new BasicActionContext(context, pipelineRuntime, stageSpec);
  if (!context.getDataTracer(stageSpec.getName()).isEnabled()) {
    action.run(actionContext);
  }
  WorkflowToken token = context.getWorkflowToken();
  if (token == null) {
    throw new IllegalStateException("WorkflowToken cannot be null when action is executed through Workflow.");
  }
  for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
    token.put(entry.getKey(), entry.getValue());
  }
}
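PipelineAction.run resolves exactly one Action plugin from its phase and invokes it, skipping the call when the stage's data tracer is enabled. A hypothetical action plugin of that shape, assuming the cdap-etl-api Action contract (void run(ActionContext)) and a settable arguments accessor on the context:

  // Hypothetical plugin driven by PipelineAction.run above; the annotations
  // and the ActionContext.getArguments() setter are assumed from cdap-etl-api.
  @Plugin(type = Action.PLUGIN_TYPE)
  @Name("SetMarker")
  public class SetMarkerAction extends Action {
    @Override
    public void run(ActionContext context) throws Exception {
      // Arguments set here are the kind of added arguments that the code
      // above copies into the workflow token for downstream stages.
      context.getArguments().set("marker", "done");
    }
  }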
Use of co.cask.cdap.etl.common.PipelinePhase in project cdap by caskdata.
The class PipelinePlannerTest, method testGeneratePlan():
@Test
public void testGeneratePlan() {
  /*
            |--- n2(r) ----------|
            |                    |                                    |-- n10
       n1 --|--- n3(r) --- n5 ---|--- n6 --- n7(r) --- n8 --- n9(r) --|
            |                    |                                    |-- n11
            |--- n4(r) ----------|
   */
  // create the spec for this pipeline
  Schema schema = Schema.recordOf("stuff", Schema.Field.of("x", Schema.of(Schema.Type.INT)));
  Set<StageSpec> stageSpecs = ImmutableSet.of(
    StageSpec.builder("n1", NODE).addOutputSchema(schema, "n2", "n3", "n4").build(),
    StageSpec.builder("n2", REDUCE).addInputSchema("n1", schema).addOutputSchema(schema, "n6").build(),
    StageSpec.builder("n3", REDUCE).addInputSchema("n1", schema).addOutputSchema(schema, "n5").build(),
    StageSpec.builder("n4", REDUCE).addInputSchema("n1", schema).addOutputSchema(schema, "n6").build(),
    StageSpec.builder("n5", NODE).addInputSchema("n3", schema).addOutputSchema(schema, "n6").build(),
    StageSpec.builder("n6", NODE).addInputSchemas(ImmutableMap.of("n2", schema, "n5", schema, "n4", schema))
      .addOutputSchema(schema, "n7").build(),
    StageSpec.builder("n7", REDUCE).addInputSchema("n6", schema).addOutputSchema(schema, "n8").build(),
    StageSpec.builder("n8", NODE).addInputSchema("n7", schema).addOutputSchema(schema, "n9").build(),
    StageSpec.builder("n9", REDUCE).addInputSchema("n8", schema).addOutputSchema(schema, "n10", "n11").build(),
    StageSpec.builder("n10", NODE).addInputSchema("n9", schema).build(),
    StageSpec.builder("n11", NODE).addInputSchema("n9", schema).build());
  Set<Connection> connections = ImmutableSet.of(
    new Connection("n1", "n2"), new Connection("n1", "n3"), new Connection("n1", "n4"),
    new Connection("n2", "n6"), new Connection("n3", "n5"), new Connection("n4", "n6"),
    new Connection("n5", "n6"), new Connection("n6", "n7"), new Connection("n7", "n8"),
    new Connection("n8", "n9"), new Connection("n9", "n10"), new Connection("n9", "n11"));
  Set<String> pluginTypes = ImmutableSet.of(NODE.getType(), REDUCE.getType(), Constants.Connector.PLUGIN_TYPE);
  Set<String> reduceTypes = ImmutableSet.of(REDUCE.getType());
  Set<String> emptySet = ImmutableSet.of();
  PipelinePlanner planner = new PipelinePlanner(pluginTypes, reduceTypes, emptySet, emptySet, emptySet);
  PipelineSpec pipelineSpec = PipelineSpec.builder().addStages(stageSpecs).addConnections(connections).build();
  Map<String, PipelinePhase> phases = new HashMap<>();
  /*
      n1 --> n1.out.connector
   */
  PipelinePhase phase1 = PipelinePhase.builder(pluginTypes)
    .addStage(StageSpec.builder("n1", NODE).addOutputSchema(schema, "n2", "n3", "n4").build())
    .addStage(StageSpec.builder("n1.out.connector",
                                connectorSpec("n1.out.connector", Constants.Connector.SINK_TYPE)).build())
    .addConnections("n1", ImmutableSet.of("n1.out.connector"))
    .build();
  String phase1Name = PipelinePlanner.getPhaseName(phase1.getDag());
  phases.put(phase1Name, phase1);
  /*
      phase2:
      n1.out.connector --- n2(r) --- n6 --- n7.connector
   */
  PipelinePhase phase2 = PipelinePhase.builder(pluginTypes)
    .addStage(StageSpec.builder("n2", REDUCE).addInputSchema("n1", schema).addOutputSchema(schema, "n6").build())
    .addStage(StageSpec.builder("n6", NODE)
                .addInputSchema("n2", schema)
                .addInputSchema("n4", schema)
                .addInputSchema("n5", schema)
                .addOutputSchema(schema, "n7").build())
    .addStage(StageSpec.builder("n1.out.connector",
                                connectorSpec("n1.out.connector", Constants.Connector.SOURCE_TYPE)).build())
    .addStage(StageSpec.builder("n7.connector", connectorSpec("n7", Constants.Connector.SINK_TYPE)).build())
    .addConnection("n1.out.connector", "n2")
    .addConnection("n2", "n6")
    .addConnection("n6", "n7.connector")
    .build();
  String phase2Name = PipelinePlanner.getPhaseName(phase2.getDag());
  phases.put(phase2Name, phase2);
  /*
      phase3:
      n1.out.connector --- n3(r) --- n5 --- n6 --- n7.connector
   */
  PipelinePhase phase3 = PipelinePhase.builder(pluginTypes)
    .addStage(StageSpec.builder("n5", NODE).addInputSchema("n3", schema).addOutputSchema(schema, "n6").build())
    .addStage(StageSpec.builder("n6", NODE)
                .addInputSchema("n2", schema)
                .addInputSchema("n4", schema)
                .addInputSchema("n5", schema)
                .addOutputSchema(schema, "n7").build())
    .addStage(StageSpec.builder("n3", REDUCE).addInputSchema("n1", schema).addOutputSchema(schema, "n5").build())
    .addStage(StageSpec.builder("n1.out.connector",
                                connectorSpec("n1.out.connector", Constants.Connector.SOURCE_TYPE)).build())
    .addStage(StageSpec.builder("n7.connector", connectorSpec("n7", Constants.Connector.SINK_TYPE)).build())
    .addConnection("n1.out.connector", "n3")
    .addConnection("n3", "n5")
    .addConnection("n5", "n6")
    .addConnection("n6", "n7.connector")
    .build();
  String phase3Name = PipelinePlanner.getPhaseName(phase3.getDag());
  phases.put(phase3Name, phase3);
  /*
      phase4:
      n1.out.connector --- n4(r) --- n6 --- n7.connector
   */
  PipelinePhase phase4 = PipelinePhase.builder(pluginTypes)
    .addStage(StageSpec.builder("n4", REDUCE).addInputSchema("n1", schema).addOutputSchema(schema, "n6").build())
    .addStage(StageSpec.builder("n6", NODE)
                .addInputSchema("n2", schema)
                .addInputSchema("n4", schema)
                .addInputSchema("n5", schema)
                .addOutputSchema(schema, "n7").build())
    .addStage(StageSpec.builder("n1.out.connector",
                                connectorSpec("n1.out.connector", Constants.Connector.SOURCE_TYPE)).build())
    .addStage(StageSpec.builder("n7.connector", connectorSpec("n7", Constants.Connector.SINK_TYPE)).build())
    .addConnection("n1.out.connector", "n4")
    .addConnection("n4", "n6")
    .addConnection("n6", "n7.connector")
    .build();
  String phase4Name = PipelinePlanner.getPhaseName(phase4.getDag());
  phases.put(phase4Name, phase4);
  /*
      phase5:
      n7.connector --- n7(r) --- n8 --- n9.connector
   */
  PipelinePhase phase5 = PipelinePhase.builder(pluginTypes)
    .addStage(StageSpec.builder("n8", NODE).addInputSchema("n7", schema).addOutputSchema(schema, "n9").build())
    .addStage(StageSpec.builder("n7", REDUCE).addInputSchema("n6", schema).addOutputSchema(schema, "n8").build())
    .addStage(StageSpec.builder("n7.connector", connectorSpec("n7", Constants.Connector.SOURCE_TYPE)).build())
    .addStage(StageSpec.builder("n9.connector", connectorSpec("n9", Constants.Connector.SINK_TYPE)).build())
    .addConnection("n7.connector", "n7")
    .addConnection("n7", "n8")
    .addConnection("n8", "n9.connector")
    .build();
  String phase5Name = PipelinePlanner.getPhaseName(phase5.getDag());
  phases.put(phase5Name, phase5);
  /*
      phase6:
                               |-- n10
      n9.connector --- n9(r) --|
                               |-- n11
   */
  PipelinePhase phase6 = PipelinePhase.builder(pluginTypes)
    .addStage(StageSpec.builder("n10", NODE).addInputSchema("n9", schema).build())
    .addStage(StageSpec.builder("n11", NODE).addInputSchema("n9", schema).build())
    .addStage(StageSpec.builder("n9", REDUCE).addInputSchema("n8", schema).addOutputSchema(schema, "n10", "n11").build())
    .addStage(StageSpec.builder("n9.connector", connectorSpec("n9", Constants.Connector.SOURCE_TYPE)).build())
    .addConnection("n9.connector", "n9")
    .addConnection("n9", "n10")
    .addConnection("n9", "n11")
    .build();
  String phase6Name = PipelinePlanner.getPhaseName(phase6.getDag());
  phases.put(phase6Name, phase6);
  Set<Connection> phaseConnections = new HashSet<>();
  phaseConnections.add(new Connection(phase1Name, phase2Name));
  phaseConnections.add(new Connection(phase1Name, phase3Name));
  phaseConnections.add(new Connection(phase1Name, phase4Name));
  phaseConnections.add(new Connection(phase2Name, phase5Name));
  phaseConnections.add(new Connection(phase3Name, phase5Name));
  phaseConnections.add(new Connection(phase4Name, phase5Name));
  phaseConnections.add(new Connection(phase5Name, phase6Name));
  PipelinePlan expected = new PipelinePlan(phases, phaseConnections);
  PipelinePlan actual = planner.plan(pipelineSpec);
  Assert.assertEquals(expected, actual);
}
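Every expected phase in this test is assembled the same way: a PipelinePhase.builder over the allowed plugin types, addStage for each StageSpec, explicit connections, then build(). Isolated as a minimal sketch, reusing the test's own pluginTypes, NODE, and schema, with placeholder stage names:

  // Minimal sketch of the builder pattern used for each expected phase above;
  // "src" and "sink" are placeholder stage names.
  PipelinePhase simplePhase = PipelinePhase.builder(pluginTypes)
    .addStage(StageSpec.builder("src", NODE).addOutputSchema(schema, "sink").build())
    .addStage(StageSpec.builder("sink", NODE).addInputSchema("src", schema).build())
    .addConnection("src", "sink")
    .build();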