use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.
the class DataPipelineTest method testOuterJoin.
public void testOuterJoin(Engine engine) throws Exception {
Schema inputSchema1 = Schema.recordOf("customerRecord",
  Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("customer_name", Schema.of(Schema.Type.STRING)));
Schema inputSchema2 = Schema.recordOf("itemRecord",
  Schema.Field.of("item_id", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("item_price", Schema.of(Schema.Type.LONG)),
  Schema.Field.of("cust_id", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("cust_name", Schema.of(Schema.Type.STRING)));
Schema inputSchema3 = Schema.recordOf("transactionRecord",
  Schema.Field.of("t_id", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("c_id", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("c_name", Schema.of(Schema.Type.STRING)));
String input1Name = "source1OuterJoinInput-" + engine;
String input2Name = "source2OuterJoinInput-" + engine;
String input3Name = "source3OuterJoinInput-" + engine;
String outputName = "outerJoinOutput-" + engine;
String joinerName = "outerJoiner-" + engine;
String sinkName = "outerJoinSink-" + engine;
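// pipeline: three sources feed identity transforms t1, t2 and t3, which are joined and written to the sink.
// only "t1" is listed as a required joiner input, so unmatched customers still appear in the output and the
// fields coming from t2 and t3 are nullable (see outSchema below)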
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
  .addStage(new ETLStage("source1", MockSource.getPlugin(input1Name, inputSchema1)))
  .addStage(new ETLStage("source2", MockSource.getPlugin(input2Name, inputSchema2)))
  .addStage(new ETLStage("source3", MockSource.getPlugin(input3Name, inputSchema3)))
  .addStage(new ETLStage("t1", IdentityTransform.getPlugin()))
  .addStage(new ETLStage("t2", IdentityTransform.getPlugin()))
  .addStage(new ETLStage("t3", IdentityTransform.getPlugin()))
  .addStage(new ETLStage(joinerName, MockJoiner.getPlugin(
    "t1.customer_id=t2.cust_id=t3.c_id&t1.customer_name=t2.cust_name=t3.c_name", "t1", "")))
  .addStage(new ETLStage(sinkName, MockSink.getPlugin(outputName)))
  .addConnection("source1", "t1")
  .addConnection("source2", "t2")
  .addConnection("source3", "t3")
  .addConnection("t1", joinerName)
  .addConnection("t2", joinerName)
  .addConnection("t3", joinerName)
  .addConnection(joinerName, sinkName)
  .setEngine(engine)
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("OuterJoinApp-" + engine);
ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
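// join output schema: fields contributed by the non-required inputs (t2 and t3) are nullable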
Schema outSchema = Schema.recordOf("join.output",
  Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("customer_name", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("item_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
  Schema.Field.of("item_price", Schema.nullableOf(Schema.of(Schema.Type.LONG))),
  Schema.Field.of("cust_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
  Schema.Field.of("cust_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
  Schema.Field.of("t_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
  Schema.Field.of("c_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
  Schema.Field.of("c_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
StructuredRecord recordSamuel = StructuredRecord.builder(inputSchema1).set("customer_id", "1").set("customer_name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(inputSchema1).set("customer_id", "2").set("customer_name", "bob").build();
StructuredRecord recordJane = StructuredRecord.builder(inputSchema1).set("customer_id", "3").set("customer_name", "jane").build();
StructuredRecord recordMartha = StructuredRecord.builder(inputSchema1).set("customer_id", "4").set("customer_name", "martha").build();
StructuredRecord recordCar = StructuredRecord.builder(inputSchema2).set("item_id", "11").set("item_price", 10000L).set("cust_id", "1").set("cust_name", "samuel").build();
StructuredRecord recordBike = StructuredRecord.builder(inputSchema2).set("item_id", "22").set("item_price", 100L).set("cust_id", "3").set("cust_name", "jane").build();
StructuredRecord recordTrasCar = StructuredRecord.builder(inputSchema3).set("t_id", "1").set("c_id", "1").set("c_name", "samuel").build();
StructuredRecord recordTrasPlane = StructuredRecord.builder(inputSchema3).set("t_id", "2").set("c_id", "2").set("c_name", "bob").build();
StructuredRecord recordTrasBike = StructuredRecord.builder(inputSchema3).set("t_id", "3").set("c_id", "3").set("c_name", "jane").build();
// write records to each source: four customers, two items, three transactions
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(input1Name));
MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob, recordJane, recordMartha));
inputManager = getDataset(NamespaceId.DEFAULT.dataset(input2Name));
MockSource.writeInput(inputManager, ImmutableList.of(recordCar, recordBike));
inputManager = getDataset(NamespaceId.DEFAULT.dataset(input3Name));
MockSource.writeInput(inputManager, ImmutableList.of(recordTrasCar, recordTrasPlane, recordTrasBike));
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
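// expected outer-join output: bob has no matching item record and martha has no matching item or
// transaction record, so those fields stay null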
StructuredRecord joinRecordSamuel = StructuredRecord.builder(outSchema)
  .set("customer_id", "1").set("customer_name", "samuel")
  .set("item_id", "11").set("item_price", 10000L).set("cust_id", "1").set("cust_name", "samuel")
  .set("t_id", "1").set("c_id", "1").set("c_name", "samuel").build();
StructuredRecord joinRecordBob = StructuredRecord.builder(outSchema)
  .set("customer_id", "2").set("customer_name", "bob")
  .set("t_id", "2").set("c_id", "2").set("c_name", "bob").build();
StructuredRecord joinRecordJane = StructuredRecord.builder(outSchema)
  .set("customer_id", "3").set("customer_name", "jane")
  .set("item_id", "22").set("item_price", 100L).set("cust_id", "3").set("cust_name", "jane")
  .set("t_id", "3").set("c_id", "3").set("c_name", "jane").build();
StructuredRecord joinRecordMartha = StructuredRecord.builder(outSchema).set("customer_id", "4").set("customer_name", "martha").build();
DataSetManager<Table> sinkManager = getDataset(outputName);
Set<StructuredRecord> expected = ImmutableSet.of(joinRecordSamuel, joinRecordJane, joinRecordBob, joinRecordMartha);
Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
Assert.assertEquals(expected, actual);
validateMetric(4, appId, joinerName + ".records.out");
validateMetric(4, appId, sinkName + ".records.in");
}
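The helper above is parameterized by engine; the snippet does not include its @Test entry point. A minimal driver would look roughly like the following (the wrapper method and the engine values run are assumptions, not shown in the original):
@Test
public void testOuterJoin() throws Exception {
  // assumed wrapper: exercise the parameterized helper once per execution engine
  testOuterJoin(Engine.MAPREDUCE);
  testOuterJoin(Engine.SPARK);
}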
use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.
the class DataPipelineTest method testMacrosMapReducePipeline.
@Test
public void testMacrosMapReducePipeline() throws Exception {
/*
* Trivial MapReduce pipeline from batch source to batch sink.
*
* source --------- sink
*/
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
  .addStage(new ETLStage("source", MockRuntimeDatasetSource.getPlugin("mrinput", "${runtime${source}}")))
  .addStage(new ETLStage("sink", MockRuntimeDatasetSink.getPlugin("mroutput", "${runtime}${sink}")))
  .addConnection("source", "sink")
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("MRApp");
ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
// set runtime arguments for macro substitution
Map<String, String> runtimeArguments = ImmutableMap.of("runtime", "mockRuntime", "sink", "MRSinkDataset", "source", "Source", "runtimeSource", "mockRuntimeMRSourceDataset");
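// the nested macro ${runtime${source}} first expands ${source} to "Source", then ${runtimeSource} to
// "mockRuntimeMRSourceDataset"; ${runtime}${sink} expands to "mockRuntime" + "MRSinkDataset",
// matching the dataset names asserted below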
// make sure the datasets don't exist beforehand
Assert.assertNull(getDataset("mockRuntimeMRSourceDataset").get());
Assert.assertNull(getDataset("mockRuntimeMRSinkDataset").get());
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.setRuntimeArgs(runtimeArguments);
workflowManager.start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
// now the datasets should exist
Assert.assertNotNull(getDataset("mockRuntimeMRSourceDataset").get());
Assert.assertNotNull(getDataset("mockRuntimeMRSinkDataset").get());
}
use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.
the class PipelineSpecGeneratorTest method testDeadEndStage.
@Test(expected = IllegalArgumentException.class)
public void testDeadEndStage() {
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
  .addStage(new ETLStage("source", MOCK_SOURCE))
  .addStage(new ETLStage("sink", MOCK_SINK))
  .addStage(new ETLStage("transform", MOCK_TRANSFORM_A))
  .addConnection("source", "sink")
  .addConnection("source", "transform")
  .build();
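// "transform" receives input from "source" but has no outgoing connection, so spec generation is expected
// to fail with the IllegalArgumentException declared on the test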
specGenerator.generateSpec(etlConfig);
}
use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.
the class ETLBatchConfigTest method testUpgrade.
@Test
public void testUpgrade() throws Exception {
final ArtifactSelectorConfig artifact = new ArtifactSelectorConfig("SYSTEM", "universal", "1.0.0");
ETLStage source = new ETLStage("source", new Plugin("DataGenerator", ImmutableMap.of("p1", "v1"), artifact), "errorDS");
co.cask.cdap.etl.proto.v2.ETLStage sourceNew = from(source, BatchSource.PLUGIN_TYPE);
ETLStage transform1 = new ETLStage("transform1", new Plugin("Script", ImmutableMap.of("script", "something"), null));
co.cask.cdap.etl.proto.v2.ETLStage transform1New = from(transform1, Transform.PLUGIN_TYPE);
ETLStage transform2 = new ETLStage("transform2", new Plugin("Script", null, null));
co.cask.cdap.etl.proto.v2.ETLStage transform2New = from(transform2, Transform.PLUGIN_TYPE);
ETLStage transform3 = new ETLStage("transform3", new Plugin("Validator", ImmutableMap.of("p1", "v1", "p2", "v2")), "errorDS");
co.cask.cdap.etl.proto.v2.ETLStage transform3New = from(transform3, Transform.PLUGIN_TYPE);
ETLStage sink1 = new ETLStage("sink1", new Plugin("Table", ImmutableMap.of("rowkey", "xyz"), artifact), null);
co.cask.cdap.etl.proto.v2.ETLStage sink1New = from(sink1, BatchSink.PLUGIN_TYPE);
ETLStage sink2 = new ETLStage("sink2", new Plugin("HDFS", ImmutableMap.of("name", "abc"), artifact), null);
co.cask.cdap.etl.proto.v2.ETLStage sink2New = from(sink2, BatchSink.PLUGIN_TYPE);
Set<Connection> connections = new HashSet<>();
connections.add(new Connection(sourceNew.getName(), transform1New.getName()));
connections.add(new Connection(transform1New.getName(), transform2New.getName()));
connections.add(new Connection(transform2New.getName(), transform3New.getName()));
connections.add(new Connection(transform3New.getName(), sink1New.getName()));
connections.add(new Connection(transform3New.getName(), sink2New.getName()));
String schedule = "*/5 * * * *";
Resources resources = new Resources(1024, 1);
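// build the same pipeline with the older proto classes and with the v2 classes, then check that
// upgrading the old config yields the v2 config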
ETLBatchConfig config = ETLBatchConfig.builder(schedule)
  .setSource(source)
  .addSink(sink1)
  .addSink(sink2)
  .addTransform(transform1)
  .addTransform(transform2)
  .addTransform(transform3)
  .addConnections(connections)
  .setResources(resources)
  .setDriverResources(resources)
  .build();
co.cask.cdap.etl.proto.v2.ETLBatchConfig configNew = co.cask.cdap.etl.proto.v2.ETLBatchConfig.builder(schedule)
  .addStage(sourceNew)
  .addStage(sink1New)
  .addStage(sink2New)
  .addStage(transform1New)
  .addStage(transform2New)
  .addStage(transform3New)
  .addConnections(connections)
  .setResources(resources)
  .setDriverResources(resources)
  .build();
Assert.assertEquals(configNew, config.upgrade(new UpgradeContext() {
@Nullable
@Override
public ArtifactSelectorConfig getPluginArtifact(String pluginType, String pluginName) {
return null;
}
}));
}
use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.
the class PipelineSpecGeneratorTest method testConflictingInputSchemas.
@Test(expected = IllegalArgumentException.class)
public void testConflictingInputSchemas() {
/*
 *           ---- transformA ----
 *           |                  |
 * source ---|                  |--- sink
 *           |                  |
 *           ---- transformB ----
 *
 * sink gets schema A and schema B as input, should fail
 */
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
  .addStage(new ETLStage("source", MOCK_SOURCE))
  .addStage(new ETLStage("sink", MOCK_SINK))
  .addStage(new ETLStage("tA", MOCK_TRANSFORM_A))
  .addStage(new ETLStage("tB", MOCK_TRANSFORM_B))
  .addConnection("source", "tA")
  .addConnection("source", "tB")
  .addConnection("tA", "sink")
  .addConnection("tB", "sink")
  .build();
specGenerator.generateSpec(etlConfig);
}