use of io.cdap.cdap.etl.proto.Connection in project cdap by caskdata.
the class PipelinePlannerTest method testGeneratePlan.
@Test
public void testGeneratePlan() {
  /*
   * Full pipeline handed to the planner; "(r)" marks the stages built with the REDUCE plugin:
   *
   *          |--- n2(r) ----------|
   *          |                    |                                    |-- n10
   *     n1 --|--- n3(r) --- n5 ---|--- n6 --- n7(r) --- n8 --- n9(r) --|
   *          |                    |                                    |-- n11
   *          |--- n4(r) ----------|
   */
  Schema schema = Schema.recordOf("stuff", Schema.Field.of("x", Schema.of(Schema.Type.INT)));

  // Stage specs that appear both in the pipeline spec and in the expected phases.
  StageSpec n1Stage = StageSpec.builder("n1", NODE)
    .addOutput(schema, "n2", "n3", "n4")
    .build();
  StageSpec n2Stage = StageSpec.builder("n2", REDUCE)
    .addInputSchema("n1", schema)
    .addOutput(schema, "n6")
    .build();
  StageSpec n3Stage = StageSpec.builder("n3", REDUCE)
    .addInputSchema("n1", schema)
    .addOutput(schema, "n5")
    .build();
  StageSpec n4Stage = StageSpec.builder("n4", REDUCE)
    .addInputSchema("n1", schema)
    .addOutput(schema, "n6")
    .build();
  StageSpec n5Stage = StageSpec.builder("n5", NODE)
    .addInputSchema("n3", schema)
    .addOutput(schema, "n6")
    .build();
  StageSpec n7Stage = StageSpec.builder("n7", REDUCE)
    .addInputSchema("n6", schema)
    .addOutput(schema, "n8")
    .build();
  StageSpec n8Stage = StageSpec.builder("n8", NODE)
    .addInputSchema("n7", schema)
    .addOutput(schema, "n9")
    .build();
  StageSpec n9Stage = StageSpec.builder("n9", REDUCE)
    .addInputSchema("n8", schema)
    .addOutput(schema, "n10", "n11")
    .build();
  StageSpec n10Stage = StageSpec.builder("n10", NODE)
    .addInputSchema("n9", schema)
    .build();
  StageSpec n11Stage = StageSpec.builder("n11", NODE)
    .addInputSchema("n9", schema)
    .build();

  // n6 is described twice: through addInputSchemas for the pipeline spec and through
  // repeated addInputSchema calls for the expected phases.
  StageSpec n6InPipeline = StageSpec.builder("n6", NODE)
    .addInputSchemas(ImmutableMap.of("n2", schema, "n5", schema, "n4", schema))
    .addOutput(schema, "n7")
    .build();
  StageSpec n6InPhase = StageSpec.builder("n6", NODE)
    .addInputSchema("n2", schema)
    .addInputSchema("n4", schema)
    .addInputSchema("n5", schema)
    .addOutput(schema, "n7")
    .build();

  // Connector stages expected at the phase boundaries.
  StageSpec n1OutSink = StageSpec.builder(
    "n1.out.connector", connectorSpec("n1.out.connector", Constants.Connector.SINK_TYPE)).build();
  StageSpec n1OutSource = StageSpec.builder(
    "n1.out.connector", connectorSpec("n1.out.connector", Constants.Connector.SOURCE_TYPE)).build();
  StageSpec n7Sink = StageSpec.builder(
    "n7.connector", connectorSpec("n7", Constants.Connector.SINK_TYPE)).build();
  StageSpec n7Source = StageSpec.builder(
    "n7.connector", connectorSpec("n7", Constants.Connector.SOURCE_TYPE)).build();
  StageSpec n9Sink = StageSpec.builder(
    "n9.connector", connectorSpec("n9", Constants.Connector.SINK_TYPE)).build();
  StageSpec n9Source = StageSpec.builder(
    "n9.connector", connectorSpec("n9", Constants.Connector.SOURCE_TYPE)).build();

  // create the spec for this pipeline
  Set<StageSpec> stageSpecs = ImmutableSet.of(
    n1Stage, n2Stage, n3Stage, n4Stage, n5Stage, n6InPipeline,
    n7Stage, n8Stage, n9Stage, n10Stage, n11Stage);
  Set<Connection> connections = ImmutableSet.of(
    new Connection("n1", "n2"),
    new Connection("n1", "n3"),
    new Connection("n1", "n4"),
    new Connection("n2", "n6"),
    new Connection("n3", "n5"),
    new Connection("n4", "n6"),
    new Connection("n5", "n6"),
    new Connection("n6", "n7"),
    new Connection("n7", "n8"),
    new Connection("n8", "n9"),
    new Connection("n9", "n10"),
    new Connection("n9", "n11"));
  PipelineSpec pipelineSpec = PipelineSpec.builder()
    .addStages(stageSpecs)
    .addConnections(connections)
    .build();

  Set<String> pluginTypes = ImmutableSet.of(NODE.getType(), REDUCE.getType(), Constants.Connector.PLUGIN_TYPE);
  Set<String> reduceTypes = ImmutableSet.of(REDUCE.getType());
  Set<String> none = ImmutableSet.of();
  PipelinePlanner planner = new PipelinePlanner(pluginTypes, reduceTypes, none, none, none);

  /*
   * phase1:
   *   n1 --> n1.out.connector
   */
  PipelinePhase phase1 = PipelinePhase.builder(pluginTypes)
    .addStage(n1Stage)
    .addStage(n1OutSink)
    .addConnections("n1", ImmutableSet.of("n1.out.connector"))
    .build();

  /*
   * phase2:
   *   n1.out.connector --- n2(r) --- n6 --- n7.connector
   */
  PipelinePhase phase2 = PipelinePhase.builder(pluginTypes)
    .addStage(n2Stage)
    .addStage(n6InPhase)
    .addStage(n1OutSource)
    .addStage(n7Sink)
    .addConnection("n1.out.connector", "n2")
    .addConnection("n2", "n6")
    .addConnection("n6", "n7.connector")
    .build();

  /*
   * phase3:
   *   n1.out.connector --- n3(r) --- n5 --- n6 --- n7.connector
   */
  PipelinePhase phase3 = PipelinePhase.builder(pluginTypes)
    .addStage(n5Stage)
    .addStage(n6InPhase)
    .addStage(n3Stage)
    .addStage(n1OutSource)
    .addStage(n7Sink)
    .addConnection("n1.out.connector", "n3")
    .addConnection("n3", "n5")
    .addConnection("n5", "n6")
    .addConnection("n6", "n7.connector")
    .build();

  /*
   * phase4:
   *   n1.out.connector --- n4(r) --- n6 --- n7.connector
   */
  PipelinePhase phase4 = PipelinePhase.builder(pluginTypes)
    .addStage(n4Stage)
    .addStage(n6InPhase)
    .addStage(n1OutSource)
    .addStage(n7Sink)
    .addConnection("n1.out.connector", "n4")
    .addConnection("n4", "n6")
    .addConnection("n6", "n7.connector")
    .build();

  /*
   * phase5:
   *   n7.connector --- n7(r) --- n8 --- n9.connector
   */
  PipelinePhase phase5 = PipelinePhase.builder(pluginTypes)
    .addStage(n8Stage)
    .addStage(n7Stage)
    .addStage(n7Source)
    .addStage(n9Sink)
    .addConnection("n7.connector", "n7")
    .addConnection("n7", "n8")
    .addConnection("n8", "n9.connector")
    .build();

  /*
   * phase6:
   *                            |-- n10
   *   n9.connector --- n9(r) --|
   *                            |-- n11
   */
  PipelinePhase phase6 = PipelinePhase.builder(pluginTypes)
    .addStage(n10Stage)
    .addStage(n11Stage)
    .addStage(n9Stage)
    .addStage(n9Source)
    .addConnection("n9.connector", "n9")
    .addConnection("n9", "n10")
    .addConnection("n9", "n11")
    .build();

  // Phases are keyed by the name the planner derives from each phase's dag.
  String phase1Name = PipelinePlanner.getPhaseName(phase1.getDag());
  String phase2Name = PipelinePlanner.getPhaseName(phase2.getDag());
  String phase3Name = PipelinePlanner.getPhaseName(phase3.getDag());
  String phase4Name = PipelinePlanner.getPhaseName(phase4.getDag());
  String phase5Name = PipelinePlanner.getPhaseName(phase5.getDag());
  String phase6Name = PipelinePlanner.getPhaseName(phase6.getDag());

  Map<String, PipelinePhase> phases = new HashMap<>();
  phases.put(phase1Name, phase1);
  phases.put(phase2Name, phase2);
  phases.put(phase3Name, phase3);
  phases.put(phase4Name, phase4);
  phases.put(phase5Name, phase5);
  phases.put(phase6Name, phase6);

  // phase1 fans out to phases 2-4, which all feed phase5; phase5 then feeds phase6.
  Set<Connection> phaseConnections = new HashSet<>();
  for (String branchPhase : new String[] {phase2Name, phase3Name, phase4Name}) {
    phaseConnections.add(new Connection(phase1Name, branchPhase));
    phaseConnections.add(new Connection(branchPhase, phase5Name));
  }
  phaseConnections.add(new Connection(phase5Name, phase6Name));

  PipelinePlan expected = new PipelinePlan(phases, phaseConnections);
  Assert.assertEquals(expected, planner.plan(pipelineSpec));
}
use of io.cdap.cdap.etl.proto.Connection in project cdap by caskdata.
the class ETLBatchConfigTest method testUpgrade.
@Test
public void testUpgrade() throws Exception {
  final ArtifactSelectorConfig artifact = new ArtifactSelectorConfig("SYSTEM", "universal", "1.0.0");

  // Stages in the old config format.
  ETLStage source = new ETLStage("DataGenerator", ImmutableMap.of("p1", "v1"), null);
  ETLStage transform1 = new ETLStage("Script", ImmutableMap.of("script", "something"), null);
  ETLStage transform2 = new ETLStage("Script", null, null);
  ETLStage transform3 = new ETLStage("Validator", ImmutableMap.of("p1", "v1", "p2", "v2"), "errorDS");
  ETLStage sink1 = new ETLStage("Table", ImmutableMap.of("rowkey", "xyz"), null);
  ETLStage sink2 = new ETLStage("HDFS", ImmutableMap.of("name", "abc"), null);
  ETLStage action = new ETLStage("Email", ImmutableMap.of("email", "slj@example.com"), null);

  // Expected v1 stages: a numbered stage name wrapping a Plugin that carries the old
  // stage's name, properties and the artifact, plus the old error dataset name.
  io.cdap.cdap.etl.proto.v1.ETLStage sourceNew = new io.cdap.cdap.etl.proto.v1.ETLStage(
    "DataGenerator.1",
    new Plugin(source.getName(), source.getProperties(), artifact),
    source.getErrorDatasetName());
  io.cdap.cdap.etl.proto.v1.ETLStage transform1New = new io.cdap.cdap.etl.proto.v1.ETLStage(
    "Script.2",
    new Plugin(transform1.getName(), transform1.getProperties(), artifact),
    transform1.getErrorDatasetName());
  io.cdap.cdap.etl.proto.v1.ETLStage transform2New = new io.cdap.cdap.etl.proto.v1.ETLStage(
    "Script.3",
    new Plugin(transform2.getName(), transform2.getProperties(), artifact),
    transform2.getErrorDatasetName());
  io.cdap.cdap.etl.proto.v1.ETLStage transform3New = new io.cdap.cdap.etl.proto.v1.ETLStage(
    "Validator.4",
    new Plugin(transform3.getName(), transform3.getProperties(), artifact),
    transform3.getErrorDatasetName());
  io.cdap.cdap.etl.proto.v1.ETLStage sink1New = new io.cdap.cdap.etl.proto.v1.ETLStage(
    "Table.5",
    new Plugin(sink1.getName(), sink1.getProperties(), artifact),
    sink1.getErrorDatasetName());
  io.cdap.cdap.etl.proto.v1.ETLStage sink2New = new io.cdap.cdap.etl.proto.v1.ETLStage(
    "HDFS.6",
    new Plugin(sink2.getName(), sink2.getProperties(), artifact),
    sink2.getErrorDatasetName());
  io.cdap.cdap.etl.proto.v1.ETLStage actionNew = new io.cdap.cdap.etl.proto.v1.ETLStage(
    "Email.1",
    new Plugin(action.getName(), action.getProperties(), artifact),
    action.getErrorDatasetName());

  // Expected connections: source -> transform1 -> transform2 -> transform3, then
  // transform3 fans out to both sinks.
  String[] chain = {
    sourceNew.getName(), transform1New.getName(), transform2New.getName(), transform3New.getName()
  };
  List<Connection> connections = new ArrayList<>();
  for (int i = 1; i < chain.length; i++) {
    connections.add(new Connection(chain[i - 1], chain[i]));
  }
  connections.add(new Connection(transform3New.getName(), sink1New.getName()));
  connections.add(new Connection(transform3New.getName(), sink2New.getName()));

  String schedule = "*/5 * * * *";
  Resources resources = new Resources(1024, 1);

  ETLBatchConfig config = new ETLBatchConfig(
    schedule,
    source,
    ImmutableList.of(sink1, sink2),
    ImmutableList.of(transform1, transform2, transform3),
    resources,
    ImmutableList.of(action));

  // The expected config uses the same Resources object for resources and driver resources.
  io.cdap.cdap.etl.proto.v1.ETLBatchConfig configNew = io.cdap.cdap.etl.proto.v1.ETLBatchConfig.builder(schedule)
    .setSource(sourceNew)
    .addSink(sink1New)
    .addSink(sink2New)
    .addTransform(transform1New)
    .addTransform(transform2New)
    .addTransform(transform3New)
    .addConnections(connections)
    .setResources(resources)
    .setDriverResources(resources)
    .addAction(actionNew)
    .build();

  // Upgrade context that resolves every plugin to the same system-scoped artifact.
  UpgradeContext upgradeContext = new UpgradeContext() {
    @Nullable
    @Override
    public ArtifactSelectorConfig getPluginArtifact(String pluginType, String pluginName) {
      return new ArtifactSelectorConfig(ArtifactScope.SYSTEM.name(), "universal", "1.0.0");
    }
  };

  Assert.assertEquals(configNew, config.upgrade(upgradeContext));
}
use of io.cdap.cdap.etl.proto.Connection in project cdap by caskdata.
the class LineageOperationProcessorTest method testSimpleJoinWithAdditionalFields.
@Test
public void testSimpleJoinWithAdditionalFields() {
  // n1: customer -> (id, name) -----------
  //                                       |
  //                                n3: JOIN ---> n4: (id, name, customer_id, item)
  //                                       |
  // n2: purchase -> (customer_id, item) --
  EndPoint customerEndPoint = EndPoint.of("default", "customer");
  EndPoint purchaseEndPoint = EndPoint.of("default", "purchase");
  EndPoint joinedEndPoint = EndPoint.of("default", "customer_purchase");

  Set<Connection> connections = new HashSet<>();
  connections.add(new Connection("n1", "n3"));
  connections.add(new Connection("n2", "n3"));
  connections.add(new Connection("n3", "n4"));

  // The join stage joins on id/customer_id and passes name and item through unchanged.
  List<FieldOperation> joinStageOperations = new ArrayList<>();
  joinStageOperations.add(new FieldTransformOperation(
    "Join", "Join Operation",
    Arrays.asList("n1.id", "n2.customer_id"),
    Arrays.asList("id", "customer_id")));
  joinStageOperations.add(new FieldTransformOperation(
    "Identity name", "Identity Operation",
    Collections.singletonList("n1.name"),
    Collections.singletonList("name")));
  joinStageOperations.add(new FieldTransformOperation(
    "Identity item", "Identity Operation",
    Collections.singletonList("n2.item"),
    Collections.singletonList("item")));

  Map<String, List<FieldOperation>> stageOperations = new HashMap<>();
  stageOperations.put("n1", Collections.singletonList(
    new FieldReadOperation("ReadCustomer", "read description", customerEndPoint, "id", "name")));
  stageOperations.put("n2", Collections.singletonList(
    new FieldReadOperation("ReadPurchase", "read description", purchaseEndPoint, "customer_id", "item")));
  stageOperations.put("n3", joinStageOperations);
  stageOperations.put("n4", Collections.singletonList(
    new FieldWriteOperation("Write", "write description", joinedEndPoint,
                            "id", "name", "customer_id", "item")));

  // NOTE(review): the third argument singles out n3, presumably as a stage that needs no
  // merge handling - confirm against LineageOperationsProcessor.
  LineageOperationsProcessor processor =
    new LineageOperationsProcessor(connections, stageOperations, Collections.singleton("n3"));
  Set<Operation> processedOperations = processor.process();

  // Expected operations carry the stage name as a prefix and reference their inputs by origin.
  InputField customerId = InputField.of("n1.ReadCustomer", "id");
  InputField customerName = InputField.of("n1.ReadCustomer", "name");
  InputField purchaseCustomerId = InputField.of("n2.ReadPurchase", "customer_id");
  InputField purchaseItem = InputField.of("n2.ReadPurchase", "item");

  Set<Operation> expectedOperations = new HashSet<>();
  expectedOperations.add(
    new ReadOperation("n1.ReadCustomer", "read description", customerEndPoint, "id", "name"));
  expectedOperations.add(
    new ReadOperation("n2.ReadPurchase", "read description", purchaseEndPoint, "customer_id", "item"));
  expectedOperations.add(new TransformOperation(
    "n3.Join", "Join Operation",
    Arrays.asList(customerId, purchaseCustomerId),
    "id", "customer_id"));
  expectedOperations.add(new TransformOperation(
    "n3.Identity name", "Identity Operation",
    Collections.singletonList(customerName),
    "name"));
  expectedOperations.add(new TransformOperation(
    "n3.Identity item", "Identity Operation",
    Collections.singletonList(purchaseItem),
    "item"));
  expectedOperations.add(new WriteOperation(
    "n4.Write", "write description", joinedEndPoint,
    Arrays.asList(
      InputField.of("n3.Join", "id"),
      InputField.of("n3.Identity name", "name"),
      InputField.of("n3.Join", "customer_id"),
      InputField.of("n3.Identity item", "item"))));

  Assert.assertEquals(expectedOperations, processedOperations);
}
use of io.cdap.cdap.etl.proto.Connection in project cdap by caskdata.
the class LineageOperationProcessorTest method testMergeOperationsNonRepeat.
@Test
public void testMergeOperationsNonRepeat() {
  // n1 -> n3 ----
  //             |---- n5
  // n2 -> n4 ----
  //
  // operations per stage:
  //   n1: read (id, name)
  //   n2: read (body, offset)
  //   n3: id -> id, name -> name (identity)
  //   n4: body -> (id, name)
  //   n5: write (id, name)
  EndPoint firstSource = EndPoint.of("default", "n1");
  EndPoint secondSource = EndPoint.of("default", "n2");
  EndPoint sink = EndPoint.of("default", "n5");

  Set<Connection> connections = new HashSet<>();
  connections.add(new Connection("n1", "n3"));
  connections.add(new Connection("n3", "n5"));
  connections.add(new Connection("n2", "n4"));
  connections.add(new Connection("n4", "n5"));

  List<FieldOperation> identityOperations = new ArrayList<>();
  identityOperations.add(
    new FieldTransformOperation("identity1", "identity", Collections.singletonList("id"), "id"));
  identityOperations.add(
    new FieldTransformOperation("identity2", "identity", Collections.singletonList("name"), "name"));

  Map<String, List<FieldOperation>> stageOperations = new HashMap<>();
  stageOperations.put("n1", Collections.singletonList(
    new FieldReadOperation("read1", "read description", firstSource, "id", "name")));
  stageOperations.put("n2", Collections.singletonList(
    new FieldReadOperation("read2", "read description", secondSource, "body", "offset")));
  stageOperations.put("n3", identityOperations);
  stageOperations.put("n4", Collections.singletonList(
    new FieldTransformOperation("generate", "generate", Collections.singletonList("body"), "id", "name")));
  stageOperations.put("n5", Collections.singletonList(
    new FieldWriteOperation("write", "write", sink, "id", "name")));

  LineageOperationsProcessor processor =
    new LineageOperationsProcessor(connections, stageOperations, Collections.emptySet());
  Set<Operation> actualOperations = processor.process();

  String mergeDescription = "Merged stages: n3,n4";
  InputField bodyFromRead2 = InputField.of("n2.read2", "body");
  InputField offsetFromRead2 = InputField.of("n2.read2", "offset");

  Set<Operation> expectedOperations = new HashSet<>();
  expectedOperations.add(new ReadOperation("n1.read1", "read description", firstSource, "id", "name"));
  expectedOperations.add(new ReadOperation("n2.read2", "read description", secondSource, "body", "offset"));
  expectedOperations.add(new TransformOperation(
    "n3.identity1", "identity",
    Collections.singletonList(InputField.of("n1.read1", "id")),
    "id"));
  expectedOperations.add(new TransformOperation(
    "n3.identity2", "identity",
    Collections.singletonList(InputField.of("n1.read1", "name")),
    "name"));
  expectedOperations.add(new TransformOperation(
    "n4.generate", "generate",
    Collections.singletonList(bodyFromRead2),
    "id", "name"));

  // n3 and n4 both feed n5: one merge transform is expected per field (id, name, body, offset).
  expectedOperations.add(new TransformOperation(
    "n3,n4.merge.id", mergeDescription,
    Arrays.asList(InputField.of("n3.identity1", "id"), InputField.of("n4.generate", "id")),
    "id"));
  expectedOperations.add(new TransformOperation(
    "n3,n4.merge.name", mergeDescription,
    Arrays.asList(InputField.of("n3.identity2", "name"), InputField.of("n4.generate", "name")),
    "name"));
  expectedOperations.add(new TransformOperation(
    "n3,n4.merge.body", mergeDescription,
    Collections.singletonList(bodyFromRead2),
    "body"));
  expectedOperations.add(new TransformOperation(
    "n3,n4.merge.offset", mergeDescription,
    Collections.singletonList(offsetFromRead2),
    "offset"));

  // The write is expected to read id and name from the merge operations.
  expectedOperations.add(new WriteOperation(
    "n5.write", "write", sink,
    Arrays.asList(
      InputField.of("n3,n4.merge.id", "id"),
      InputField.of("n3,n4.merge.name", "name"))));

  Assert.assertEquals(expectedOperations, actualOperations);
}
use of io.cdap.cdap.etl.proto.Connection in project cdap by caskdata.
the class LineageOperationProcessorTest method testSimpleJoinWithRenameOnAdditionalFields.
@Test
public void testSimpleJoinWithRenameOnAdditionalFields() {
  // n1: customer -> (id, name) -----------
  //                                       |
  //                                n3: JOIN ---> n4: (id_from_customer, customer_id,
  //                                       |           name_from_customer, item_from_purchase)
  // n2: purchase -> (customer_id, item) --
  EndPoint customerEndPoint = EndPoint.of("default", "customer");
  EndPoint purchaseEndPoint = EndPoint.of("default", "purchase");
  EndPoint joinedEndPoint = EndPoint.of("default", "customer_purchase");

  Set<Connection> connections = new HashSet<>();
  connections.add(new Connection("n1", "n3"));
  connections.add(new Connection("n2", "n3"));
  connections.add(new Connection("n3", "n4"));

  // The join stage joins on id/customer_id, renames the joined id, and renames the
  // additional name/item fields straight from their input stages.
  List<FieldOperation> joinStageOperations = new ArrayList<>();
  joinStageOperations.add(new FieldTransformOperation(
    "Join", "Join Operation",
    Arrays.asList("n1.id", "n2.customer_id"),
    Arrays.asList("id", "customer_id")));
  joinStageOperations.add(new FieldTransformOperation(
    "Rename id", "Rename id",
    Collections.singletonList("id"),
    "id_from_customer"));
  joinStageOperations.add(new FieldTransformOperation(
    "Rename name", "Rename name",
    Collections.singletonList("n1.name"),
    "name_from_customer"));
  joinStageOperations.add(new FieldTransformOperation(
    "Rename item", "Rename item",
    Collections.singletonList("n2.item"),
    "item_from_purchase"));

  Map<String, List<FieldOperation>> stageOperations = new HashMap<>();
  stageOperations.put("n1", Collections.singletonList(
    new FieldReadOperation("ReadCustomer", "read description", customerEndPoint, "id", "name")));
  stageOperations.put("n2", Collections.singletonList(
    new FieldReadOperation("ReadPurchase", "read description", purchaseEndPoint, "customer_id", "item")));
  stageOperations.put("n3", joinStageOperations);
  stageOperations.put("n4", Collections.singletonList(
    new FieldWriteOperation("Write", "write description", joinedEndPoint,
                            "id_from_customer", "customer_id", "name_from_customer", "item_from_purchase")));

  // Inputs shared between the expected operations below.
  InputField customerId = InputField.of("n1.ReadCustomer", "id");
  InputField customerName = InputField.of("n1.ReadCustomer", "name");
  InputField purchaseCustomerId = InputField.of("n2.ReadPurchase", "customer_id");
  InputField purchaseItem = InputField.of("n2.ReadPurchase", "item");

  Set<Operation> expectedOperations = new HashSet<>();
  expectedOperations.add(
    new ReadOperation("n1.ReadCustomer", "read description", customerEndPoint, "id", "name"));
  expectedOperations.add(
    new ReadOperation("n2.ReadPurchase", "read description", purchaseEndPoint, "customer_id", "item"));
  expectedOperations.add(new TransformOperation(
    "n3.Join", "Join Operation",
    Arrays.asList(customerId, purchaseCustomerId),
    "id", "customer_id"));
  // "Rename id" reads the unprefixed "id", which is expected to resolve to the join's output.
  expectedOperations.add(new TransformOperation(
    "n3.Rename id", "Rename id",
    Collections.singletonList(InputField.of("n3.Join", "id")),
    "id_from_customer"));
  expectedOperations.add(new TransformOperation(
    "n3.Rename name", "Rename name",
    Collections.singletonList(customerName),
    "name_from_customer"));
  expectedOperations.add(new TransformOperation(
    "n3.Rename item", "Rename item",
    Collections.singletonList(purchaseItem),
    "item_from_purchase"));
  expectedOperations.add(new WriteOperation(
    "n4.Write", "write description", joinedEndPoint,
    Arrays.asList(
      InputField.of("n3.Rename id", "id_from_customer"),
      InputField.of("n3.Join", "customer_id"),
      InputField.of("n3.Rename name", "name_from_customer"),
      InputField.of("n3.Rename item", "item_from_purchase"))));

  // NOTE(review): the third argument singles out n3, presumably as a stage that needs no
  // merge handling - confirm against LineageOperationsProcessor.
  LineageOperationsProcessor processor =
    new LineageOperationsProcessor(connections, stageOperations, Collections.singleton("n3"));

  Assert.assertEquals(expectedOperations, processor.process());
}
Aggregations