use of co.cask.cdap.etl.common.PipelinePhase in project cdap by caskdata.
the class ETLWorker method initialize.
@Override
public void initialize(final WorkerContext context) throws Exception {
if (Boolean.valueOf(context.getSpecification().getProperty(Constants.STAGE_LOGGING_ENABLED))) {
LogStageInjector.start();
}
super.initialize(context);
Map<String, String> properties = context.getSpecification().getProperties();
appName = context.getApplicationSpecification().getName();
Preconditions.checkArgument(properties.containsKey(Constants.PIPELINEID));
Preconditions.checkArgument(properties.containsKey(UNIQUE_ID));
String uniqueId = properties.get(UNIQUE_ID);
// Each worker instance should have its own unique state.
stateStoreKey = String.format("%s%s%s%s%s", appName, SEPARATOR, uniqueId, SEPARATOR, context.getInstanceId());
stateStoreKeyBytes = Bytes.toBytes(stateStoreKey);
Transactionals.execute(getContext(), new TxRunnable() {
@Override
public void run(DatasetContext dsContext) throws Exception {
KeyValueTable stateTable = dsContext.getDataset(ETLRealtimeApplication.STATE_TABLE);
byte[] startKey = Bytes.toBytes(String.format("%s%s", appName, SEPARATOR));
// Scan rows with this app name's prefix and delete any row whose key doesn't match this worker's state store key.
try (CloseableIterator<KeyValue<byte[], byte[]>> rows = stateTable.scan(startKey, Bytes.stopKeyForPrefix(startKey))) {
while (rows.hasNext()) {
KeyValue<byte[], byte[]> row = rows.next();
if (Bytes.compareTo(stateStoreKeyBytes, row.getKey()) != 0) {
stateTable.delete(row.getKey());
}
}
}
}
}, Exception.class);
PipelinePhase pipeline = GSON.fromJson(properties.get(Constants.PIPELINEID), PipelinePhase.class);
Map<String, TransformDetail> transformationMap = new HashMap<>();
initializeSource(context, pipeline);
initializeTransforms(context, transformationMap, pipeline);
initializeSinks(context, transformationMap, pipeline);
Set<String> startStages = new HashSet<>();
startStages.addAll(pipeline.getStageOutputs(sourceStageName));
transformExecutor = new TransformExecutor(transformationMap, startStages);
}
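The cleanup above hinges on the composite state key (appName, uniqueId, instanceId joined by SEPARATOR) and a prefix-bounded scan. Below is a minimal, CDAP-free sketch of the same pattern, assuming a TreeMap stands in for the KeyValueTable and ":" for SEPARATOR; all names here are illustrative.
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

public class StateCleanupSketch {
  private static final String SEPARATOR = ":";

  public static void main(String[] args) {
    // A sorted map stands in for the KeyValueTable; keys sort lexicographically like byte[] row keys.
    TreeMap<String, String> stateTable = new TreeMap<>();
    stateTable.put("app1:old-run:0", "stale");
    stateTable.put("app1:new-run:0", "current");
    stateTable.put("app2:other:0", "unrelated");

    String stateStoreKey = "app1" + SEPARATOR + "new-run" + SEPARATOR + "0";
    String prefix = "app1" + SEPARATOR;

    // "Scan" all rows with this app's prefix and drop every key except our own,
    // mirroring the stopKeyForPrefix-bounded scan in initialize().
    Iterator<Map.Entry<String, String>> rows =
        stateTable.subMap(prefix, prefix + Character.MAX_VALUE).entrySet().iterator();
    while (rows.hasNext()) {
      if (!rows.next().getKey().equals(stateStoreKey)) {
        rows.remove();
      }
    }
    System.out.println(stateTable); // {app1:new-run:0=current, app2:other:0=unrelated}
  }
}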
use of co.cask.cdap.etl.common.PipelinePhase in project cdap by caskdata.
the class PipelinePlannerTest method testSimpleCondition.
@Test
public void testSimpleCondition() throws Exception {
/*
   n1 - n2 - condition - n3
                 |
                 |---- n4
*/
Set<StageSpec> stageSpecs = ImmutableSet.of(
  StageSpec.builder("n1", NODE).build(),
  StageSpec.builder("n2", NODE).build(),
  StageSpec.builder("condition", CONDITION).build(),
  StageSpec.builder("n3", NODE).build(),
  StageSpec.builder("n4", NODE).build());
Set<Connection> connections = ImmutableSet.of(
  new Connection("n1", "n2"),
  new Connection("n2", "condition"),
  new Connection("condition", "n3", true),
  new Connection("condition", "n4", false));
Set<String> pluginTypes = ImmutableSet.of(NODE.getType(), REDUCE.getType(), Constants.Connector.PLUGIN_TYPE, CONDITION.getType());
Set<String> reduceTypes = ImmutableSet.of(REDUCE.getType());
Set<String> emptySet = ImmutableSet.of();
PipelinePlanner planner = new PipelinePlanner(pluginTypes, reduceTypes, emptySet, emptySet, emptySet);
PipelineSpec pipelineSpec = PipelineSpec.builder().addStages(stageSpecs).addConnections(connections).build();
Map<String, PipelinePhase> phases = new HashMap<>();
/*
n1--n2--condition.connector
*/
PipelinePhase phase1 = PipelinePhase.builder(pluginTypes)
  .addStage(StageSpec.builder("n1", NODE).build())
  .addStage(StageSpec.builder("n2", NODE).build())
  .addStage(StageSpec.builder("condition.connector",
                              connectorSpec("condition.connector", Constants.Connector.SINK_TYPE)).build())
  .addConnection("n1", "n2")
  .addConnection("n2", "condition.connector")
  .build();
Dag controlPhaseDag = new Dag(ImmutableSet.of(new Connection("n1", "n2"), new Connection("n2", "condition")));
String phase1Name = PipelinePlanner.getPhaseName(controlPhaseDag);
phases.put(phase1Name, phase1);
/*
condition
*/
PipelinePhase phase2 = PipelinePhase.builder(pluginTypes).addStage(StageSpec.builder("condition", CONDITION).build()).build();
String phase2Name = "condition";
phases.put(phase2Name, phase2);
/*
condition.connector -- n3
*/
PipelinePhase phase3 = PipelinePhase.builder(pluginTypes)
  .addStage(StageSpec.builder("condition.connector",
                              connectorSpec("condition.connector", Constants.Connector.SOURCE_TYPE)).build())
  .addStage(StageSpec.builder("n3", NODE).build())
  .addConnection("condition.connector", "n3")
  .build();
controlPhaseDag = new Dag(ImmutableSet.of(new Connection("condition", "n3")));
String phase3Name = PipelinePlanner.getPhaseName(controlPhaseDag);
phases.put(phase3Name, phase3);
/*
condition.connector -- n4
*/
PipelinePhase phase4 = PipelinePhase.builder(pluginTypes)
  .addStage(StageSpec.builder("condition.connector",
                              connectorSpec("condition.connector", Constants.Connector.SOURCE_TYPE)).build())
  .addStage(StageSpec.builder("n4", NODE).build())
  .addConnection("condition.connector", "n4")
  .build();
controlPhaseDag = new Dag(ImmutableSet.of(new Connection("condition", "n4")));
String phase4Name = PipelinePlanner.getPhaseName(controlPhaseDag);
phases.put(phase4Name, phase4);
Set<Connection> phaseConnections = new HashSet<>();
phaseConnections.add(new Connection(phase1Name, phase2Name));
phaseConnections.add(new Connection(phase2Name, phase3Name, true));
phaseConnections.add(new Connection(phase2Name, phase4Name, false));
PipelinePlan expected = new PipelinePlan(phases, phaseConnections);
PipelinePlan actual = planner.plan(pipelineSpec);
Assert.assertEquals(expected, actual);
}
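The interesting part of the expected plan is that the condition's branches survive as boolean-tagged connections between phases. Here is a small stand-alone sketch of that representation, using a stand-in Connection record rather than the planner's actual class:
import java.util.LinkedHashSet;
import java.util.Set;

public class PhaseConnectionSketch {
  // Stand-in for the planner's Connection: an edge that may carry a condition branch.
  record Connection(String from, String to, Boolean condition) {
    Connection(String from, String to) { this(from, to, null); }
    @Override public String toString() {
      return from + " -> " + to + (condition == null ? "" : " [" + condition + "]");
    }
  }

  public static void main(String[] args) {
    Set<Connection> phaseConnections = new LinkedHashSet<>();
    phaseConnections.add(new Connection("phase1", "condition"));
    phaseConnections.add(new Connection("condition", "phase3", true));   // true branch -> n3
    phaseConnections.add(new Connection("condition", "phase4", false));  // false branch -> n4
    phaseConnections.forEach(System.out::println);
  }
}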
use of co.cask.cdap.etl.common.PipelinePhase in project cdap by caskdata.
the class ETLSpark method initialize.
@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
final SparkClientContext context = getContext();
cleanupFiles = new ArrayList<>();
List<Finisher> finishers = new ArrayList<>();
SparkConf sparkConf = new SparkConf();
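// Extra PermGen head room for plugin classloading; speculation is off, presumably so speculative tasks can't duplicate sink writes.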
sparkConf.set("spark.driver.extraJavaOptions", "-XX:MaxPermSize=256m");
sparkConf.set("spark.executor.extraJavaOptions", "-XX:MaxPermSize=256m");
sparkConf.set("spark.speculation", "false");
context.setSparkConf(sparkConf);
Map<String, String> properties = context.getSpecification().getProperties();
BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
}
MacroEvaluator evaluator = new DefaultMacroEvaluator(new BasicArguments(context), context.getLogicalStartTime(), context, context.getNamespace());
final SparkBatchSourceFactory sourceFactory = new SparkBatchSourceFactory();
final SparkBatchSinkFactory sinkFactory = new SparkBatchSinkFactory();
final Map<String, Integer> stagePartitions = new HashMap<>();
PluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(pluginContext, context.getMetrics(), phaseSpec, new SingleConnectorFactory());
final PipelineRuntime pipelineRuntime = new PipelineRuntime(context);
final Admin admin = context.getAdmin();
PipelinePhase phase = phaseSpec.getPhase();
// go through in topological order so that arguments set by one stage are seen by stages after it
for (final String stageName : phase.getDag().getTopologicalOrder()) {
final StageSpec stageSpec = phase.getStage(stageName);
String pluginType = stageSpec.getPluginType();
boolean isConnectorSource = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSources().contains(stageName);
boolean isConnectorSink = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSinks().contains(stageName);
SubmitterPlugin submitterPlugin = null;
if (BatchSource.PLUGIN_TYPE.equals(pluginType) || isConnectorSource) {
BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<BatchSourceContext> contextProvider = new ContextProvider<BatchSourceContext>() {
@Override
public BatchSourceContext getContext(DatasetContext datasetContext) {
return new SparkBatchSourceContext(sourceFactory, context, pipelineRuntime, datasetContext, stageSpec);
}
};
submitterPlugin = new SubmitterPlugin(stageName, context, batchSource, contextProvider);
} else if (Transform.PLUGIN_TYPE.equals(pluginType)) {
Transform transform = pluginInstantiator.newPluginInstance(stageName, evaluator);
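// SparkBatchSourceContext must also implement StageSubmitterContext, which is why it is reused for transforms below.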
ContextProvider<StageSubmitterContext> contextProvider = new ContextProvider<StageSubmitterContext>() {
@Override
public StageSubmitterContext getContext(DatasetContext datasetContext) {
return new SparkBatchSourceContext(sourceFactory, context, pipelineRuntime, datasetContext, stageSpec);
}
};
submitterPlugin = new SubmitterPlugin(stageName, context, transform, contextProvider);
} else if (BatchSink.PLUGIN_TYPE.equals(pluginType) || isConnectorSink) {
BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<BatchSinkContext> contextProvider = new ContextProvider<BatchSinkContext>() {
@Override
public BatchSinkContext getContext(DatasetContext datasetContext) {
return new SparkBatchSinkContext(sinkFactory, context, pipelineRuntime, datasetContext, stageSpec);
}
};
submitterPlugin = new SubmitterPlugin(stageName, context, batchSink, contextProvider);
} else if (SparkSink.PLUGIN_TYPE.equals(pluginType)) {
BatchConfigurable<SparkPluginContext> sparkSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<SparkPluginContext> contextProvider = new ContextProvider<SparkPluginContext>() {
@Override
public SparkPluginContext getContext(DatasetContext datasetContext) {
return new BasicSparkPluginContext(context, pipelineRuntime, stageSpec, datasetContext, admin);
}
};
submitterPlugin = new SubmitterPlugin(stageName, context, sparkSink, contextProvider);
} else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
BatchAggregator aggregator = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<DefaultAggregatorContext> contextProvider = new AggregatorContextProvider(pipelineRuntime, stageSpec, admin);
submitterPlugin = new SubmitterPlugin(stageName, context, aggregator, contextProvider);
} else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
BatchJoiner joiner = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<DefaultJoinerContext> contextProvider = new JoinerContextProvider(pipelineRuntime, stageSpec, admin);
submitterPlugin = new SubmitterPlugin<>(stageName, context, joiner, contextProvider, new SubmitterPlugin.PrepareAction<DefaultJoinerContext>() {
@Override
public void act(DefaultJoinerContext sparkJoinerContext) {
stagePartitions.put(stageName, sparkJoinerContext.getNumPartitions());
}
});
}
if (submitterPlugin != null) {
submitterPlugin.prepareRun();
finishers.add(submitterPlugin);
}
}
File configFile = File.createTempFile("HydratorSpark", ".config");
cleanupFiles.add(configFile);
try (Writer writer = Files.newBufferedWriter(configFile.toPath(), StandardCharsets.UTF_8)) {
SparkBatchSourceSinkFactoryInfo sourceSinkInfo = new SparkBatchSourceSinkFactoryInfo(sourceFactory, sinkFactory, stagePartitions);
writer.write(GSON.toJson(sourceSinkInfo));
}
finisher = new CompositeFinisher(finishers);
context.localize("HydratorSpark.config", configFile.toURI());
WorkflowToken token = context.getWorkflowToken();
if (token != null) {
for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
token.put(entry.getKey(), entry.getValue());
}
}
}
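Each SubmitterPlugin that ran prepareRun is also registered as a Finisher, and all of them are wrapped into a single CompositeFinisher. A minimal sketch of that composition pattern follows; it illustrates the idea and is not CDAP's actual classes.
import java.util.List;

public class FinisherSketch {
  interface Finisher {
    void onFinish(boolean succeeded);
  }

  // Fans a single onFinish call out to every registered finisher,
  // so the program only has to track one object.
  static final class CompositeFinisher implements Finisher {
    private final List<Finisher> finishers;
    CompositeFinisher(List<Finisher> finishers) { this.finishers = List.copyOf(finishers); }
    @Override public void onFinish(boolean succeeded) {
      for (Finisher f : finishers) {
        f.onFinish(succeeded);
      }
    }
  }

  public static void main(String[] args) {
    Finisher finisher = new CompositeFinisher(List.of(
        ok -> System.out.println("source finished, succeeded=" + ok),
        ok -> System.out.println("sink finished, succeeded=" + ok)));
    finisher.onFinish(true);
  }
}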
use of co.cask.cdap.etl.common.PipelinePhase in project cdap by caskdata.
the class PipelinePlanner method dagToPipeline.
/**
* Converts a Dag into a PipelinePhase, using what we know about the plugin type of each node in the dag.
* The PipelinePhase is what programs will take as input, and keeps track of sources, transforms, sinks, etc.
*
* @param dag the dag to convert
* @param connectors connector nodes across all dags
* @param specs specifications for every stage
* @param conditionConnectors mapping from condition stage names to the connector stages that stand in for them
* @return the converted PipelinePhase
*/
private PipelinePhase dagToPipeline(Dag dag, Map<String, String> connectors, Map<String, StageSpec> specs, Map<String, String> conditionConnectors) {
PipelinePhase.Builder phaseBuilder = PipelinePhase.builder(supportedPluginTypes);
for (String stageName : dag.getTopologicalOrder()) {
Set<String> outputs = dag.getNodeOutputs(stageName);
if (!outputs.isEmpty()) {
phaseBuilder.addConnections(stageName, outputs);
}
// add connectors
String originalName = connectors.get(stageName);
if (originalName != null || conditionConnectors.containsValue(stageName)) {
String connectorType = dag.getSources().contains(stageName) ? Constants.Connector.SOURCE_TYPE : Constants.Connector.SINK_TYPE;
PluginSpec connectorSpec = new PluginSpec(
  Constants.Connector.PLUGIN_TYPE, "connector",
  ImmutableMap.of(Constants.Connector.ORIGINAL_NAME, originalName != null ? originalName : stageName,
                  Constants.Connector.TYPE, connectorType),
  null);
phaseBuilder.addStage(StageSpec.builder(stageName, connectorSpec).build());
continue;
}
// add other plugin types
StageSpec spec = specs.get(stageName);
phaseBuilder.addStage(spec);
}
return phaseBuilder.build();
}
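dagToPipeline depends on dag.getTopologicalOrder() so that a stage's inputs are added before the stage itself. Below is a self-contained sketch of such an ordering using Kahn's algorithm; it is a generic illustration, and CDAP's Dag has its own implementation.
import java.util.*;

public class TopoOrderSketch {
  static List<String> topologicalOrder(Map<String, Set<String>> outputs) {
    // Count incoming edges for every node, including pure sinks.
    Map<String, Integer> inDegree = new HashMap<>();
    outputs.keySet().forEach(n -> inDegree.putIfAbsent(n, 0));
    outputs.values().forEach(outs -> outs.forEach(n -> inDegree.merge(n, 1, Integer::sum)));

    Deque<String> ready = new ArrayDeque<>();
    inDegree.forEach((n, d) -> { if (d == 0) ready.add(n); });

    List<String> order = new ArrayList<>();
    while (!ready.isEmpty()) {
      String node = ready.poll();
      order.add(node);
      for (String out : outputs.getOrDefault(node, Set.of())) {
        if (inDegree.merge(out, -1, Integer::sum) == 0) {
          ready.add(out);
        }
      }
    }
    if (order.size() != inDegree.size()) {
      throw new IllegalStateException("dag contains a cycle");
    }
    return order;
  }

  public static void main(String[] args) {
    Map<String, Set<String>> dag = Map.of(
        "n1", Set.of("n2"),
        "n2", Set.of("condition.connector"));
    System.out.println(topologicalOrder(dag)); // [n1, n2, condition.connector]
  }
}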
use of co.cask.cdap.etl.common.PipelinePhase in project cdap by caskdata.
the class SmartWorkflow method addProgram.
private WorkflowProgramAdder addProgram(String phaseName, WorkflowProgramAdder programAdder) {
PipelinePhase phase = plan.getPhase(phaseName);
// a null phase was artificially added by the control dag flattening process, so there is nothing to add; skip it
if (phase == null) {
return programAdder;
}
// can't use phase name as a program name because it might contain invalid characters
String programName = "phase-" + phaseNum;
phaseNum++;
BatchPhaseSpec batchPhaseSpec = getPhaseSpec(programName, phase);
Set<String> pluginTypes = batchPhaseSpec.getPhase().getPluginTypes();
if (pluginTypes.contains(Action.PLUGIN_TYPE)) {
// actions will be all by themselves in a phase
programAdder.addAction(new PipelineAction(batchPhaseSpec));
} else if (pluginTypes.contains(Condition.PLUGIN_TYPE)) {
// conditions will be all by themselves in a phase
programAdder = programAdder.condition(new PipelineCondition(batchPhaseSpec));
} else if (pluginTypes.contains(Constants.SPARK_PROGRAM_PLUGIN_TYPE)) {
// spark programs will be all by themselves in a phase
String stageName = phase.getStagesOfType(Constants.SPARK_PROGRAM_PLUGIN_TYPE).iterator().next().getName();
StageSpec stageSpec = stageSpecs.get(stageName);
applicationConfigurer.addSpark(new ExternalSparkProgram(batchPhaseSpec, stageSpec));
programAdder.addSpark(programName);
} else if (useSpark) {
applicationConfigurer.addSpark(new ETLSpark(batchPhaseSpec));
programAdder.addSpark(programName);
} else {
applicationConfigurer.addMapReduce(new ETLMapReduce(batchPhaseSpec, new HashSet<>(connectorDatasets.values())));
programAdder.addMapReduce(programName);
}
return programAdder;
}
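addProgram dispatches on the plugin types present in a phase: actions, conditions, and spark programs each get a phase to themselves, and everything else becomes a Spark or MapReduce program. A compact sketch of that dispatch is below; the plugin-type strings are illustrative stand-ins for Action.PLUGIN_TYPE and friends, not guaranteed CDAP constants.
import java.util.Set;

public class ProgramDispatchSketch {
  enum Program { ACTION, CONDITION, EXTERNAL_SPARK, ETL_SPARK, ETL_MAPREDUCE }

  // Mirrors the if/else chain in addProgram: the first matching plugin type wins.
  static Program choose(Set<String> pluginTypes, boolean useSpark) {
    if (pluginTypes.contains("action")) return Program.ACTION;
    if (pluginTypes.contains("condition")) return Program.CONDITION;
    if (pluginTypes.contains("sparkprogram")) return Program.EXTERNAL_SPARK;
    return useSpark ? Program.ETL_SPARK : Program.ETL_MAPREDUCE;
  }

  public static void main(String[] args) {
    System.out.println(choose(Set.of("batchsource", "batchsink"), true));  // ETL_SPARK
    System.out.println(choose(Set.of("condition"), false));               // CONDITION
  }
}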