Use of io.cdap.cdap.etl.proto.v2.spec.PipelineSpec in project cdap by caskdata.
The class PipelineSpecGeneratorTest, method testAutoJoin.
@Test
public void testAutoJoin() {
/*
 *           ---- transformA --------|
 *           |                       |
 * source ---|                       |-- autojoin --- sink
 *           |                       |
 *           ---- transformABC ------|
 */
ETLBatchConfig config = ETLBatchConfig.builder()
  .setTimeSchedule("* * * * *")
  .addStage(new ETLStage("source", MOCK_SOURCE))
  .addStage(new ETLStage("tA", MOCK_TRANSFORM_A))
  .addStage(new ETLStage("tABC", MOCK_TRANSFORM_ABC))
  .addStage(new ETLStage("autojoin", MOCK_AUTO_JOINER))
  .addStage(new ETLStage("sink", MOCK_SINK))
  .addConnection("source", "tA")
  .addConnection("source", "tABC")
  .addConnection("tA", "autojoin")
  .addConnection("tABC", "autojoin")
  .addConnection("autojoin", "sink")
  .setNumOfRecordsPreview(100)
  .build();
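// joinDefinition is a field on the test class (it is assigned, not declared, here);
// MOCK_AUTO_JOINER presumably returns it when the spec generator asks the joiner for its join definition.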
joinDefinition = JoinDefinition.builder()
  .select(new JoinField("tA", "a"), new JoinField("tABC", "b"), new JoinField("tABC", "c"))
  .from(JoinStage.builder("tA", SCHEMA_A).isRequired().build(),
        JoinStage.builder("tABC", SCHEMA_ABC).isOptional().build())
  .on(JoinCondition.onKeys()
        .addKey(new JoinKey("tA", Collections.singletonList("a")))
        .addKey(new JoinKey("tABC", Collections.singletonList("a")))
        .build())
  .setOutputSchemaName("abc.joined")
  .build();
Schema joinSchema = Schema.recordOf(
  "abc.joined",
  Schema.Field.of("a", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("b", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
  Schema.Field.of("c", Schema.nullableOf(Schema.of(Schema.Type.INT))));
Map<String, String> emptyMap = new HashMap<>();
PipelineSpec expected = BatchPipelineSpec.builder().addStage(StageSpec.builder("source", new PluginSpec(BatchSource.PLUGIN_TYPE, "mocksource", emptyMap, ARTIFACT_ID)).addOutput(SCHEMA_A, "tA", "tABC").build()).addStage(StageSpec.builder("tA", new PluginSpec(Transform.PLUGIN_TYPE, "mockA", emptyMap, ARTIFACT_ID)).addInputSchema("source", SCHEMA_A).addOutput(SCHEMA_A, "autojoin").setErrorSchema(SCHEMA_B).build()).addStage(StageSpec.builder("tABC", new PluginSpec(Transform.PLUGIN_TYPE, "mockABC", emptyMap, ARTIFACT_ID)).addInputSchema("source", SCHEMA_A).addOutput(SCHEMA_ABC, "autojoin").setErrorSchema(SCHEMA_A).build()).addStage(StageSpec.builder("autojoin", new PluginSpec(BatchJoiner.PLUGIN_TYPE, "mockautojoiner", emptyMap, ARTIFACT_ID)).addInputSchema("tA", SCHEMA_A).addInputSchema("tABC", SCHEMA_ABC).addOutput(joinSchema, "sink").setErrorSchema(SCHEMA_ABC).build()).addStage(StageSpec.builder("sink", new PluginSpec(BatchSink.PLUGIN_TYPE, "mocksink", emptyMap, ARTIFACT_ID)).addInputSchema("autojoin", joinSchema).setErrorSchema(joinSchema).build()).addConnections(config.getConnections()).setResources(config.getResources()).setDriverResources(config.getDriverResources()).setClientResources(config.getClientResources()).setStageLoggingEnabled(config.isStageLoggingEnabled()).setNumOfRecordsPreview(config.getNumOfRecordsPreview()).build();
PipelineSpec actual = specGenerator.generateSpec(config);
Assert.assertEquals(expected, actual);
}
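Note why b and c are nullable in joinSchema: they are selected from tABC, which joins as an optional stage, so an outer match can leave them absent. A minimal sketch of that rule (outputFieldSchema is a hypothetical helper, not part of the CDAP join API):

// Hypothetical helper: a field selected from an optional (outer) join stage
// becomes nullable in the joined output schema.
static Schema outputFieldSchema(Schema fieldSchema, boolean stageIsRequired) {
  if (stageIsRequired || fieldSchema.isNullable()) {
    return fieldSchema;
  }
  return Schema.nullableOf(fieldSchema);
}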
Use of io.cdap.cdap.etl.proto.v2.spec.PipelineSpec in project cdap by caskdata.
The class PipelineSpecGeneratorTest, method testSQLEngineEnabledButNotConfigured.
@Test
public void testSQLEngineEnabledButNotConfigured() throws ValidationException {
ETLBatchConfig config = ETLBatchConfig.builder()
  .setTimeSchedule("* * * * *")
  .addStage(new ETLStage("action", MOCK_ACTION))
  .setPushdownEnabled(true)
  .setTransformationPushdown(null)
  .build();
PipelineSpec actual = specGenerator.generateSpec(config);
Map<String, String> emptyMap = ImmutableMap.of();
PipelineSpec expected = BatchPipelineSpec.builder().addStage(StageSpec.builder("action", new PluginSpec(Action.PLUGIN_TYPE, "mockaction", emptyMap, ARTIFACT_ID)).build()).setResources(config.getResources()).setDriverResources(config.getDriverResources()).setClientResources(config.getClientResources()).setStageLoggingEnabled(config.isStageLoggingEnabled()).setSqlEngineStageSpec(null).build();
Assert.assertEquals(expected, actual);
}
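Here pushdown is enabled but no transformation pushdown is supplied, so the generator leaves the SQL engine stage spec null rather than failing. For contrast, a configured variant would look roughly like the sketch below, assuming the v2 ETLTransformationPushdown wrapper; the plugin name "mocksqlengine" is hypothetical:

// Sketch of a configured pushdown; "mocksqlengine" is a hypothetical plugin name.
ETLBatchConfig configured = ETLBatchConfig.builder()
  .setTimeSchedule("* * * * *")
  .addStage(new ETLStage("action", MOCK_ACTION))
  .setPushdownEnabled(true)
  .setTransformationPushdown(
    new ETLTransformationPushdown(
      new ETLPlugin("mocksqlengine", BatchSQLEngine.PLUGIN_TYPE, ImmutableMap.of())))
  .build();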
Use of io.cdap.cdap.etl.proto.v2.spec.PipelineSpec in project cdap by caskdata.
The class PipelineSpecGeneratorTest, method testPipelineProperties.
@Test
public void testPipelineProperties() throws ValidationException {
// populate some mock plugins.
MockPluginConfigurer pluginConfigurer = new MockPluginConfigurer();
Set<ArtifactId> artifactIds = ImmutableSet.of(ARTIFACT_ID);
pluginConfigurer.addMockPlugin(Action.PLUGIN_TYPE, "action1", MockPlugin.builder().putPipelineProperty("prop1", "val1").putPipelineProperty("prop2", "val2").build(), artifactIds);
pluginConfigurer.addMockPlugin(Action.PLUGIN_TYPE, "action2", MockPlugin.builder().putPipelineProperty("prop2", "val2").build(), artifactIds);
Map<String, String> empty = ImmutableMap.of();
ETLBatchConfig config = ETLBatchConfig.builder()
  .setProperties(ImmutableMap.of("system.spark.spark.test", "abc", "system.mapreduce.prop3", "val3"))
  .addStage(new ETLStage("a1", new ETLPlugin("action1", Action.PLUGIN_TYPE, empty)))
  .addStage(new ETLStage("a2", new ETLPlugin("action2", Action.PLUGIN_TYPE, empty)))
  .addConnection("a1", "a2")
  .setEngine(Engine.MAPREDUCE)
  .setNumOfRecordsPreview(100)
  .build();
PipelineSpec actual = new BatchPipelineSpecGenerator(
  NamespaceId.DEFAULT.getNamespace(), pluginConfigurer, null,
  ImmutableSet.of(BatchSource.PLUGIN_TYPE), ImmutableSet.of(BatchSink.PLUGIN_TYPE),
  Engine.MAPREDUCE, MOCK_FEATURE_FLAGS_PROVIDER).generateSpec(config);
PipelineSpec expected = BatchPipelineSpec.builder()
  .addConnection("a1", "a2")
  .setProperties(ImmutableMap.of("prop1", "val1", "prop2", "val2", "prop3", "val3"))
  .addStage(StageSpec.builder("a1", new PluginSpec(Action.PLUGIN_TYPE, "action1", empty, ARTIFACT_ID))
              .addOutput(null, "a2")
              .build())
  .addStage(StageSpec.builder("a2", new PluginSpec(Action.PLUGIN_TYPE, "action2", empty, ARTIFACT_ID))
              .addInputSchema("a1", null)
              .build())
  .setResources(new Resources(1024))
  .setDriverResources(new Resources(1024))
  .setClientResources(new Resources(1024))
  .setNumOfRecordsPreview(config.getNumOfRecordsPreview())
  .build();
Assert.assertEquals(expected, actual);
}
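The expected properties illustrate the merge rule: pipeline properties contributed by the plugins ("prop1", "prop2") are unioned with engine-scoped config properties, where the "system.<engine>." prefix is stripped and the property kept only if it matches the selected engine, so "system.spark.spark.test" is dropped under MAPREDUCE while "system.mapreduce.prop3" survives as "prop3". A minimal sketch of that prefix rule, assuming this shape for the resolution logic (resolveEngineProperties is illustrative, not the generator's actual method):

// Sketch: keep engine-scoped properties for the chosen engine, stripping the "system.<engine>." prefix.
// This mirrors the behavior the test asserts; it is not the actual generator code.
static Map<String, String> resolveEngineProperties(Map<String, String> raw, Engine engine) {
  String prefix = "system." + engine.name().toLowerCase() + ".";
  Map<String, String> resolved = new HashMap<>();
  for (Map.Entry<String, String> entry : raw.entrySet()) {
    if (entry.getKey().startsWith(prefix)) {
      resolved.put(entry.getKey().substring(prefix.length()), entry.getValue());
    }
  }
  return resolved;
}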
Use of io.cdap.cdap.etl.proto.v2.spec.PipelineSpec in project cdap by caskdata.
The class PipelineSpecGeneratorTest, method testOutputPorts.
@Test
public void testOutputPorts() throws ValidationException {
/*
 *
 *                    |portA --> sinkA
 *                    |
 * source --> split --|portB --> sinkB
 *                    |
 *                    |portC --> sinkC
 *
 * portA has output schemaA, portB has output schemaB, portC has null output schema
 */
ETLBatchConfig config = ETLBatchConfig.builder()
  .setTimeSchedule("* * * * *")
  .addStage(new ETLStage("source", MOCK_SOURCE))
  .addStage(new ETLStage("split", MOCK_SPLITTER))
  .addStage(new ETLStage("sinkA", MOCK_SINK))
  .addStage(new ETLStage("sinkB", MOCK_SINK))
  .addStage(new ETLStage("sinkC", MOCK_SINK))
  .addConnection("source", "split")
  .addConnection("split", "sinkA", "portA")
  .addConnection("split", "sinkB", "portB")
  .addConnection("split", "sinkC", "portC")
  .setNumOfRecordsPreview(100)
  .build();
Map<String, String> emptyMap = Collections.emptyMap();
PipelineSpec expected = BatchPipelineSpec.builder().addStage(StageSpec.builder("source", new PluginSpec(BatchSource.PLUGIN_TYPE, "mocksource", EMPTY_MAP, ARTIFACT_ID)).addOutput(SCHEMA_A, "split").build()).addStage(StageSpec.builder("split", new PluginSpec(SplitterTransform.PLUGIN_TYPE, "mocksplit", EMPTY_MAP, ARTIFACT_ID)).addInputSchema("source", SCHEMA_A).addOutput("sinkA", "portA", SCHEMA_A).addOutput("sinkB", "portB", SCHEMA_B).addOutput("sinkC", "portC", null).setErrorSchema(SCHEMA_A).build()).addStage(StageSpec.builder("sinkA", new PluginSpec(BatchSink.PLUGIN_TYPE, "mocksink", EMPTY_MAP, ARTIFACT_ID)).addInputSchema("split", SCHEMA_A).setErrorSchema(SCHEMA_A).build()).addStage(StageSpec.builder("sinkB", new PluginSpec(BatchSink.PLUGIN_TYPE, "mocksink", EMPTY_MAP, ARTIFACT_ID)).addInputSchema("split", SCHEMA_B).setErrorSchema(SCHEMA_B).build()).addStage(StageSpec.builder("sinkC", new PluginSpec(BatchSink.PLUGIN_TYPE, "mocksink", EMPTY_MAP, ARTIFACT_ID)).addInputSchema("split", null).build()).addConnections(config.getConnections()).setResources(config.getResources()).setDriverResources(config.getDriverResources()).setClientResources(config.getClientResources()).setStageLoggingEnabled(config.isStageLoggingEnabled()).setNumOfRecordsPreview(config.getNumOfRecordsPreview()).build();
PipelineSpec actual = specGenerator.generateSpec(config);
Assert.assertEquals(expected, actual);
}
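Each splitter port declares its own output schema, and a sink's input schema is simply the schema of the port it is wired to, which is why sinkC ends up with a null input schema and no error schema. A plain-collections sketch of the per-port lookup (illustration only, not the StageSpec internals):

// Illustration only: per-port output schemas, with a downstream input schema looked up by port.
Map<String, Schema> portSchemas = new HashMap<>();
portSchemas.put("portA", SCHEMA_A);
portSchemas.put("portB", SCHEMA_B);
portSchemas.put("portC", null); // unknown until runtime
Schema sinkCInput = portSchemas.get("portC"); // null, so sinkC cannot assume a schema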
Use of io.cdap.cdap.etl.proto.v2.spec.PipelineSpec in project cdap by caskdata.
The class PipelinePlanner, method plan.
/**
 * Create an execution plan for the given logical pipeline. This is used for batch pipelines,
 * though it may eventually be useful to mark windowing points for realtime pipelines.
 *
 * A plan consists of one or more phases, with connections between phases.
 * A connection between phases indicates control flow, not necessarily data flow.
 * This class assumes it receives a valid pipeline spec:
 * the pipeline has no cycles, all its nodes have unique names,
 * sources don't have any input, sinks don't have any output,
 * everything else has both an input and an output, and so on.
 *
 * We start by inserting connector nodes into the logical dag,
 * which mark the boundaries between mapreduce jobs.
 * Each connector represents a node where we will need to write to a local dataset.
 *
 * Next, the logical pipeline is broken up into phases,
 * using each connector as a sink in one phase and as a source in another.
 * After this point, connections between phases indicate control flow, not data flow.
 *
 * @param spec the pipeline spec, representing a logical pipeline
 * @return the execution plan
 */
public PipelinePlan plan(PipelineSpec spec) {
// go through the stages and examine their plugin type to determine which stages are reduce stages
Set<String> reduceNodes = new HashSet<>();
Set<String> isolationNodes = new HashSet<>();
Set<String> actionNodes = new HashSet<>();
Set<String> multiPortNodes = new HashSet<>();
Set<String> allNodes = new HashSet<>();
// Map to hold the connection information from condition nodes to the first stage
// they connect to. Condition information also includes whether the stage is connected
// on the 'true' branch or the 'false' branch
Map<String, ConditionBranches> conditionBranches = new HashMap<>();
Map<String, Set<String>> conditionOutputs = new HashMap<>();
Map<String, Set<String>> conditionInputs = new HashMap<>();
Map<String, StageSpec> specs = new HashMap<>();
for (StageSpec stage : spec.getStages()) {
String pluginType = stage.getPlugin().getType();
allNodes.add(stage.getName());
if (reduceTypes.contains(pluginType)) {
reduceNodes.add(stage.getName());
}
if (isolationTypes.contains(pluginType)) {
isolationNodes.add(stage.getName());
}
if (actionTypes.contains(pluginType)) {
// Collect all Action nodes from spec
actionNodes.add(stage.getName());
}
if (multiPortTypes.contains(pluginType)) {
multiPortNodes.add(stage.getName());
}
if (Condition.PLUGIN_TYPE.equals(pluginType)) {
conditionBranches.put(stage.getName(), new ConditionBranches(null, null));
conditionOutputs.put(stage.getName(), new HashSet<String>());
conditionInputs.put(stage.getName(), new HashSet<String>());
}
specs.put(stage.getName(), stage);
}
// Special case for action nodes when there is no connection between them
if (spec.getConnections().isEmpty()) {
// All nodes should be actions
if (!actionNodes.containsAll(allNodes)) {
throw new IllegalStateException("No connections are specified.");
}
Map<String, PipelinePhase> phases = new HashMap<>();
for (String actionNode : actionNodes) {
PipelinePhase.Builder phaseBuilder = PipelinePhase.builder(supportedPluginTypes);
PipelinePhase actionPhase = phaseBuilder.addStage(specs.get(actionNode)).build();
phases.put(actionNode, actionPhase);
}
return new PipelinePlan(phases, new HashSet<Connection>());
}
// Set representing control nodes (Conditions and Actions)
Set<String> controlNodes = Sets.union(actionNodes, conditionBranches.keySet());
Map<String, String> conditionChildToParent = new HashMap<>();
for (Connection connection : spec.getConnections()) {
if (conditionBranches.containsKey(connection.getFrom())) {
conditionOutputs.get(connection.getFrom()).add(connection.getTo());
}
if (conditionBranches.containsKey(connection.getTo())) {
conditionInputs.get(connection.getTo()).add(connection.getFrom());
}
if (conditionBranches.containsKey(connection.getFrom())) {
if (conditionBranches.containsKey(connection.getTo())) {
// conditions are chained
conditionChildToParent.put(connection.getTo(), connection.getFrom());
}
// Outgoing connection from condition
ConditionBranches branches = conditionBranches.get(connection.getFrom());
String trueOutput;
String falseOutput;
if (connection.getCondition()) {
trueOutput = connection.getTo();
falseOutput = branches.getFalseOutput();
} else {
trueOutput = branches.getTrueOutput();
falseOutput = connection.getTo();
}
conditionBranches.put(connection.getFrom(), new ConditionBranches(trueOutput, falseOutput));
}
}
Map<String, String> connectorNodes = new HashMap<>();
// now split the logical pipeline into pipeline phases, using the connectors as split points
Set<Dag> splittedDag = split(spec.getConnections(), conditionBranches.keySet(), reduceNodes, isolationNodes, actionNodes, multiPortNodes, connectorNodes);
Map<String, String> controlConnectors = getConnectorsAssociatedWithConditions(conditionBranches.keySet(), conditionChildToParent, conditionInputs, conditionOutputs, actionNodes);
Map<String, Dag> subdags = new HashMap<>();
for (Dag subdag : splittedDag) {
subdags.put(getPhaseName(subdag), subdag);
}
// build connections between phases and convert dags to PipelinePhase.
Set<Connection> phaseConnections = new HashSet<>();
Map<String, PipelinePhase> phases = new HashMap<>();
for (Map.Entry<String, Dag> dagEntry1 : subdags.entrySet()) {
String dag1Name = dagEntry1.getKey();
Dag dag1 = dagEntry1.getValue();
// convert the dag to a PipelinePhase
// add a separate pipeline phase for each control node in the subdag
Set<String> dag1ControlNodes = Sets.intersection(controlNodes, dag1.getNodes());
for (String dag1ControlNode : dag1ControlNodes) {
if (!phases.containsKey(dag1ControlNode)) {
phases.put(dag1ControlNode, PipelinePhase.builder(supportedPluginTypes).addStage(specs.get(dag1ControlNode)).build());
}
}
// if there are non-control nodes in the subdag, add a pipeline phase for it
if (!controlNodes.containsAll(dag1.getNodes())) {
// the updated dag replaces conditions with the corresponding connector if applicable.
Dag updatedDag = getUpdatedDag(dag1, controlConnectors);
// Remove any control nodes from this dag
if (!Sets.intersection(updatedDag.getNodes(), controlNodes).isEmpty()) {
Set<String> nodes = Sets.difference(updatedDag.getNodes(), controlNodes);
updatedDag = updatedDag.createSubDag(nodes);
}
phases.put(dag1Name, dagToPipeline(updatedDag, connectorNodes, specs, controlConnectors));
}
for (String controlSource : Sets.intersection(controlNodes, dag1.getSources())) {
ConditionBranches branches = conditionBranches.get(controlSource);
Boolean condition = branches == null ? null : dag1.getNodes().contains(branches.getTrueOutput());
for (String output : dag1.getNodeOutputs(controlSource)) {
if (controlNodes.contains(output)) {
// control source -> control node, add a phase connection between the control phases
phaseConnections.add(new Connection(controlSource, output, condition));
} else {
// control source -> non-control nodes, add a phase connection from the control phase to this dag
phaseConnections.add(new Connection(controlSource, dag1Name, condition));
}
}
}
// from this dag to the control phase
for (String controlSink : Sets.intersection(controlNodes, dag1.getSinks())) {
for (String input : dag1.getNodeInputs(controlSink)) {
if (controlNodes.contains(input)) {
// control node -> control-sink, add a phase connection between the control phases
ConditionBranches branches = conditionBranches.get(input);
Boolean condition = branches == null ? null : dag1.getNodes().contains(branches.getTrueOutput());
phaseConnections.add(new Connection(input, controlSink, condition));
} else {
// non-control node -> control-sink, add a phase connection from this dag to the control phase
phaseConnections.add(new Connection(dag1Name, controlSink));
}
}
}
// find connected subdags (they have a source that is a sink in dag1)
Set<String> nonControlSinks = Sets.difference(dag1.getSinks(), controlNodes);
for (Map.Entry<String, Dag> dagEntry2 : subdags.entrySet()) {
String dag2Name = dagEntry2.getKey();
Dag dag2 = dagEntry2.getValue();
if (dag1Name.equals(dag2Name)) {
continue;
}
if (!Sets.intersection(nonControlSinks, dag2.getSources()).isEmpty()) {
phaseConnections.add(new Connection(dag1Name, dag2Name));
}
}
}
return new PipelinePlan(phases, phaseConnections);
}
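To make the splitting step concrete, here is a standalone sketch of the idea described in the javadoc: cut the dag at connector nodes and treat each remaining weakly-connected component as a phase. It uses plain java.util collections and a precomputed connectors set; it is not the actual Dag/split implementation used above:

// Standalone sketch of the phase-splitting idea (not the real split logic): edges leaving a
// connector are cut, and each remaining weakly-connected component becomes one phase. The
// connector then acts as a sink of the upstream phase (writing a local dataset) and,
// conceptually, as the source of the downstream phase its cut edges pointed into.
static List<Set<String>> splitAtConnectors(Set<Connection> connections, Set<String> connectors) {
  // build an undirected adjacency map, skipping edges whose source is a connector
  Map<String, Set<String>> adjacency = new HashMap<>();
  for (Connection conn : connections) {
    adjacency.computeIfAbsent(conn.getFrom(), k -> new HashSet<>());
    adjacency.computeIfAbsent(conn.getTo(), k -> new HashSet<>());
    if (!connectors.contains(conn.getFrom())) {
      adjacency.get(conn.getFrom()).add(conn.getTo());
      adjacency.get(conn.getTo()).add(conn.getFrom());
    }
  }
  // collect weakly-connected components with a BFS
  List<Set<String>> phases = new ArrayList<>();
  Set<String> seen = new HashSet<>();
  for (String node : adjacency.keySet()) {
    if (!seen.add(node)) {
      continue; // already placed in a phase
    }
    Set<String> phase = new HashSet<>();
    Deque<String> queue = new ArrayDeque<>(Collections.singleton(node));
    while (!queue.isEmpty()) {
      String current = queue.poll();
      phase.add(current);
      for (String neighbor : adjacency.get(current)) {
        if (seen.add(neighbor)) {
          queue.add(neighbor);
        }
      }
    }
    phases.add(phase);
  }
  return phases;
}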