Use of io.cdap.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
Example from class DataPipelineConnectionTest, method testConnectionsRegistry.
@Test
public void testConnectionsRegistry() throws Exception {
// source -> sink
ETLBatchConfig conf1 = ETLBatchConfig.builder()
  .addStage(new ETLStage("source", MockSource.getPluginUsingConnection("conn 1")))
  .addStage(new ETLStage("sink", MockSink.getPluginUsingConnection("conn 3")))
  .addConnection("source", "sink")
  .build();
// 3 sources -> identity -> 2 sinks
ETLBatchConfig conf2 = ETLBatchConfig.builder()
  .addStage(new ETLStage("src1", MockSource.getPluginUsingConnection("conn 1")))
  .addStage(new ETLStage("src2", MockSource.getPluginUsingConnection("conn 2")))
  .addStage(new ETLStage("src3", MockSource.getPluginUsingConnection("conn 3")))
  .addStage(new ETLStage("sink1", MockSink.getPluginUsingConnection("conn 4")))
  .addStage(new ETLStage("sink2", MockSink.getPluginUsingConnection("conn 5")))
  .addStage(new ETLStage("identity", IdentityTransform.getPlugin()))
  .addConnection("src1", "identity")
  .addConnection("src2", "identity")
  .addConnection("src3", "identity")
  .addConnection("identity", "sink1")
  .addConnection("identity", "sink2")
  .build();
// deploy apps
AppRequest<ETLBatchConfig> appRequest1 = new AppRequest<>(APP_ARTIFACT, conf1);
ApplicationId appId1 = NamespaceId.DEFAULT.app("app1");
ApplicationManager appManager1 = deployApplication(appId1, appRequest1);
AppRequest<ETLBatchConfig> appRequest2 = new AppRequest<>(APP_ARTIFACT, conf2);
ApplicationId appId2 = NamespaceId.DEFAULT.app("app2");
ApplicationManager appManager2 = deployApplication(appId2, appRequest2);
// Assert metadata
Metadata app1Actual = getMetadataAdmin().getMetadata(appId1.toMetadataEntity(), MetadataScope.SYSTEM);
Set<String> app1ExpectedTags = ImmutableSet.of("_conn_1", "_conn_3");
// assert that the actual tags contain all of the connection-related tags
Assert.assertTrue(app1Actual.getTags(MetadataScope.SYSTEM).containsAll(app1ExpectedTags));
// user metadata should be empty
Assert.assertEquals(Metadata.EMPTY, getMetadataAdmin().getMetadata(appId1.toMetadataEntity(), MetadataScope.USER));
Metadata app2Actual = getMetadataAdmin().getMetadata(appId2.toMetadataEntity(), MetadataScope.SYSTEM);
Set<String> app2ExpectedTags = ImmutableSet.of("_conn_1", "_conn_2", "_conn_3", "_conn_4", "_conn_5");
// assert that the actual tags contain all of the connection-related tags
Assert.assertTrue(app2Actual.getTags(MetadataScope.SYSTEM).containsAll(app2ExpectedTags));
// user metadata should be empty
Assert.assertEquals(Metadata.EMPTY, getMetadataAdmin().getMetadata(appId2.toMetadataEntity(), MetadataScope.USER));
// use a search query to find the related apps
Set<MetadataEntity> appsRelated = ImmutableSet.of(appId1.toMetadataEntity(), appId2.toMetadataEntity());
assertMetadataSearch(appsRelated, "tags:_conn_1");
assertMetadataSearch(Collections.singleton(appId2.toMetadataEntity()), "tags:_conn_2");
assertMetadataSearch(appsRelated, "tags:_conn_3");
assertMetadataSearch(Collections.singleton(appId2.toMetadataEntity()), "tags:_conn_4");
assertMetadataSearch(Collections.singleton(appId2.toMetadataEntity()), "tags:_conn_5");
}
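assertMetadataSearch is a private helper of the test class that is not shown on this page. The sketch below is one plausible shape for it, assuming the io.cdap.cdap.spi.metadata SearchRequest/SearchResponse API and the getMetadataAdmin() accessor already used above; treat the exact calls as assumptions rather than the test's actual code.
// Hypothetical sketch: run a system metadata search and require the result set
// to match the expected entities exactly. Assumes SearchRequest.of(query) and
// SearchResponse.getResults() from io.cdap.cdap.spi.metadata.
private void assertMetadataSearch(Set<MetadataEntity> expected, String query) throws Exception {
  SearchResponse response = getMetadataAdmin().search(SearchRequest.of(query).build());
  Set<MetadataEntity> actual = response.getResults().stream()
      .map(MetadataRecord::getEntity)
      .collect(Collectors.toSet());
  Assert.assertEquals(expected, actual);
}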
Use of io.cdap.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
Example from class DataPipelineConnectionTest, method testUsingConnections.
private void testUsingConnections(Engine engine) throws Exception {
String sourceConnName = "sourceConn " + engine;
String sinkConnName = "sinkConn " + engine;
String srcTableName = "src" + engine;
String sinkTableName = "sink" + engine;
// put a macro in the connection properties; its runtime value will be a JSON object
addConnection(sourceConnName, new ConnectionCreationRequest(
  "", new PluginInfo("test", "dummy", null,
                     ImmutableMap.of("tableName", srcTableName, "key1", "${badval}"),
                     new ArtifactSelectorConfig())));
addConnection(sinkConnName, new ConnectionCreationRequest(
  "", new PluginInfo("test", "dummy", null,
                     ImmutableMap.of("tableName", sinkTableName, "key1", "${badval}"),
                     new ArtifactSelectorConfig())));
// add a JSON string to the runtime arguments to ensure the plugin can be instantiated under that condition
Map<String, String> runtimeArguments = Collections.singletonMap("badval", "{\"a\" : 1}");
// source -> sink
ETLBatchConfig config = ETLBatchConfig.builder()
  .setEngine(engine)
  .addStage(new ETLStage("source", MockSource.getPluginUsingConnection(sourceConnName)))
  .addStage(new ETLStage("sink", MockSink.getPluginUsingConnection(sinkConnName)))
  .addConnection("source", "sink")
  .build();
Schema schema = Schema.recordOf("x", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
StructuredRecord samuel = StructuredRecord.builder(schema).set("name", "samuel").build();
StructuredRecord dwayne = StructuredRecord.builder(schema).set("name", "dwayne").build();
// the test adds the dataset itself; the source won't create it since the table name is macro-enabled
addDatasetInstance(NamespaceId.DEFAULT.dataset(srcTableName), Table.class.getName());
DataSetManager<Table> sourceTable = getDataset(srcTableName);
MockSource.writeInput(sourceTable, ImmutableList.of(samuel, dwayne));
// verify preview can run successfully using connections
PreviewManager previewManager = getPreviewManager();
PreviewConfig previewConfig = new PreviewConfig(SmartWorkflow.NAME, ProgramType.WORKFLOW, runtimeArguments, 10);
// Start the preview and get the corresponding PreviewRunner.
ApplicationId previewId = previewManager.start(NamespaceId.DEFAULT, new AppRequest<>(APP_ARTIFACT, config, previewConfig));
// Wait for the preview status to become COMPLETED.
Tasks.waitFor(PreviewStatus.Status.COMPLETED, () -> {
  PreviewStatus status = previewManager.getStatus(previewId);
  return status == null ? null : status.getStatus();
}, 5, TimeUnit.MINUTES);
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
ApplicationId appId = NamespaceId.DEFAULT.app("testApp" + engine);
ApplicationManager appManager = deployApplication(appId, appRequest);
// start the actual pipeline run
WorkflowManager manager = appManager.getWorkflowManager(SmartWorkflow.NAME);
manager.startAndWaitForGoodRun(runtimeArguments, ProgramRunStatus.COMPLETED, 3, TimeUnit.MINUTES);
DataSetManager<Table> sinkTable = getDataset(sinkTableName);
List<StructuredRecord> outputRecords = MockSink.readOutput(sinkTable);
Assert.assertEquals(ImmutableSet.of(dwayne, samuel), new HashSet<>(outputRecords));
// modify the connection to use a new table name for source and sink
String newSrcTableName = "new" + srcTableName;
String newSinkTableName = "new" + sinkTableName;
addConnection(sourceConnName, new ConnectionCreationRequest("", new PluginInfo("test", "dummy", null, Collections.singletonMap("tableName", newSrcTableName), new ArtifactSelectorConfig())));
addConnection(sinkConnName, new ConnectionCreationRequest("", new PluginInfo("test", "dummy", null, Collections.singletonMap("tableName", newSinkTableName), new ArtifactSelectorConfig())));
addDatasetInstance(NamespaceId.DEFAULT.dataset(newSrcTableName), Table.class.getName());
StructuredRecord newRecord1 = StructuredRecord.builder(schema).set("name", "john").build();
StructuredRecord newRecord2 = StructuredRecord.builder(schema).set("name", "tom").build();
sourceTable = getDataset(newSrcTableName);
MockSource.writeInput(sourceTable, ImmutableList.of(newRecord1, newRecord2));
// run the program again; it should read from and write to the new tables
manager.start(runtimeArguments);
manager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 3, TimeUnit.MINUTES);
sinkTable = getDataset(newSinkTableName);
outputRecords = MockSink.readOutput(sinkTable);
Assert.assertEquals(ImmutableSet.of(newRecord1, newRecord2), new HashSet<>(outputRecords));
deleteConnection(sourceConnName);
deleteConnection(sinkConnName);
deleteDatasetInstance(NamespaceId.DEFAULT.dataset(srcTableName));
deleteDatasetInstance(NamespaceId.DEFAULT.dataset(sinkTableName));
deleteDatasetInstance(NamespaceId.DEFAULT.dataset(newSrcTableName));
deleteDatasetInstance(NamespaceId.DEFAULT.dataset(newSinkTableName));
}
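The ${badval} macro in the connection properties can only be resolved at run time from the runtime arguments, which is why the test pre-creates the source table instead of letting the source do it at deploy time. As a rough, self-contained illustration of that kind of substitution (a hand-rolled sketch, not CDAP's actual macro evaluator):
import java.util.Collections;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Hand-rolled illustration of ${key} substitution from runtime arguments.
// CDAP's real macro evaluation is richer (escaping, nesting, macro functions).
final class MacroSubstitutionSketch {
  private static final Pattern MACRO = Pattern.compile("\\$\\{([^}]+)}");

  static String resolve(String value, Map<String, String> runtimeArgs) {
    Matcher m = MACRO.matcher(value);
    StringBuffer sb = new StringBuffer();
    while (m.find()) {
      // leave the macro untouched if no runtime argument supplies a value
      String replacement = runtimeArgs.getOrDefault(m.group(1), m.group(0));
      m.appendReplacement(sb, Matcher.quoteReplacement(replacement));
    }
    m.appendTail(sb);
    return sb.toString();
  }

  public static void main(String[] args) {
    // prints the JSON string {"a" : 1}, mirroring the runtime argument above
    System.out.println(resolve("${badval}", Collections.singletonMap("badval", "{\"a\" : 1}")));
  }
}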
Use of io.cdap.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
Example from class DataPipelineTest, method testSimpleMultiSource.
private void testSimpleMultiSource(Engine engine) throws Exception {
/*
 * source1 --|
 *           |--> sleep --> sink
 * source2 --|
 */
String source1Name = String.format("simpleMSInput1-%s", engine);
String source2Name = String.format("simpleMSInput2-%s", engine);
String sinkName = String.format("simpleMSOutput-%s", engine);
ETLBatchConfig etlConfig = ETLBatchConfig.builder()
  .addStage(new ETLStage("source1", MockSource.getPlugin(source1Name)))
  .addStage(new ETLStage("source2", MockSource.getPlugin(source2Name)))
  .addStage(new ETLStage("sleep", SleepTransform.getPlugin(2L)))
  .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
  .addConnection("source1", "sleep")
  .addConnection("source2", "sleep")
  .addConnection("sleep", "sink")
  .setEngine(engine)
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("SimpleMultiSourceApp-" + engine);
ApplicationManager appManager = deployApplication(appId, appRequest);
// there should be only two programs: one workflow and one MapReduce/Spark program
Assert.assertEquals(2, appManager.getInfo().getPrograms().size());
Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
StructuredRecord recordVincent = StructuredRecord.builder(schema).set("name", "vincent").build();
// write records to each source: two to source1, one to source2
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(source1Name));
MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordVincent));
inputManager = getDataset(NamespaceId.DEFAULT.dataset(source2Name));
MockSource.writeInput(inputManager, ImmutableList.of(recordBob));
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
// check sink
DataSetManager<Table> sinkManager = getDataset(sinkName);
Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel, recordBob, recordVincent);
Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
Assert.assertEquals(expected, actual);
validateMetric(2, appId, "source1.records.out");
validateMetric(1, appId, "source2.records.out");
validateMetric(3, appId, "sleep.records.in");
validateMetric(3, appId, "sleep.records.out");
validateMetric(3, appId, "sink.records.in");
Assert.assertTrue(getMetric(appId, "sleep." + io.cdap.cdap.etl.common.Constants.Metrics.TOTAL_TIME) > 0L);
try (CloseableIterator<Message> messages = getMessagingContext().getMessageFetcher().fetch(appId.getNamespace(), "sleepTopic", 10, null)) {
Assert.assertTrue(messages.hasNext());
Assert.assertEquals("2", messages.next().getPayloadAsString());
Assert.assertFalse(messages.hasNext());
}
getMessagingAdmin(appId.getNamespace()).deleteTopic("sleepTopic");
}
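validateMetric and getMetric are helpers defined elsewhere in DataPipelineTest. A plausible sketch follows, assuming the test base exposes getMetricsManager(), that pipeline stage metrics carry the "user." prefix, and that io.cdap.cdap.common.conf.Constants supplies the tag names; all three are assumptions.
// Hypothetical sketch of the metric helpers used above.
private void validateMetric(long expected, ApplicationId appId, String metric) throws Exception {
  Map<String, String> tags = ImmutableMap.of(
      Constants.Metrics.Tag.NAMESPACE, appId.getNamespace(),
      Constants.Metrics.Tag.APP, appId.getEntityName(),
      Constants.Metrics.Tag.WORKFLOW, SmartWorkflow.NAME);
  // wait until the workflow-scoped user metric reaches the expected total
  getMetricsManager().waitForTotalMetricCount(tags, "user." + metric, expected, 20, TimeUnit.SECONDS);
}

private long getMetric(ApplicationId appId, String metric) {
  Map<String, String> tags = ImmutableMap.of(
      Constants.Metrics.Tag.NAMESPACE, appId.getNamespace(),
      Constants.Metrics.Tag.APP, appId.getEntityName(),
      Constants.Metrics.Tag.WORKFLOW, SmartWorkflow.NAME);
  // read the current total for the workflow-scoped user metric
  return getMetricsManager().getTotalMetric(tags, "user." + metric);
}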
Use of io.cdap.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
Example from class DataPipelineTest, method testConditionsOnBranches.
@Test
public void testConditionsOnBranches() throws Exception {
/*
 *          |--> condition1 --|-- true  --> sink1
 * source --|                 |-- false --> sink2
 *          |
 *          |--> transform --> condition2 --|-- true  --> sink3
 *                                          |-- false --> sink4
 */
Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
String sourceName = "branchConditionsSource";
String sink1Name = "branchConditionsSink1";
String sink2Name = "branchConditionsSink2";
String sink3Name = "branchConditionsSink3";
String sink4Name = "branchConditionsSink4";
ETLBatchConfig etlConfig = ETLBatchConfig.builder()
  .addStage(new ETLStage("source", MockSource.getPlugin(sourceName, schema)))
  .addStage(new ETLStage("condition1", MockCondition.getPlugin("condition1")))
  .addStage(new ETLStage("transform", IdentityTransform.getPlugin()))
  .addStage(new ETLStage("condition2", MockCondition.getPlugin("condition2")))
  .addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Name)))
  .addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Name)))
  .addStage(new ETLStage("sink3", MockSink.getPlugin(sink3Name)))
  .addStage(new ETLStage("sink4", MockSink.getPlugin(sink4Name)))
  .addConnection("source", "condition1")
  .addConnection("source", "transform")
  .addConnection("condition1", "sink1", true)
  .addConnection("condition1", "sink2", false)
  .addConnection("transform", "condition2")
  .addConnection("condition2", "sink3", true)
  .addConnection("condition2", "sink4", false)
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT_RANGE, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("branchConditions");
ApplicationManager appManager = deployApplication(appId, appRequest);
List<StructuredRecord> records = Collections.singletonList(StructuredRecord.builder(schema).set("name", "samuel").build());
DataSetManager<Table> inputManager = getDataset(sourceName);
MockSource.writeInput(inputManager, records);
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start(ImmutableMap.of("condition1.branch.to.execute", "true", "condition2.branch.to.execute", "false"));
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
DataSetManager<Table> sink1Manager = getDataset(sink1Name);
DataSetManager<Table> sink2Manager = getDataset(sink2Name);
DataSetManager<Table> sink3Manager = getDataset(sink3Name);
DataSetManager<Table> sink4Manager = getDataset(sink4Name);
Assert.assertEquals(records, MockSink.readOutput(sink1Manager));
Assert.assertTrue(MockSink.readOutput(sink2Manager).isEmpty());
Assert.assertTrue(MockSink.readOutput(sink3Manager).isEmpty());
Assert.assertEquals(records, MockSink.readOutput(sink4Manager));
}
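Which branch each condition takes is driven by the "<stage name>.branch.to.execute" runtime argument passed when the workflow starts. The sketch below shows one way a condition plugin like MockCondition could honor that convention; the real mock lives in the cdap test sources, so its exact shape here is an assumption.
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.annotation.Plugin;
import io.cdap.cdap.api.plugin.PluginConfig;
import io.cdap.cdap.etl.api.condition.Condition;
import io.cdap.cdap.etl.api.condition.ConditionContext;

// Hypothetical condition plugin: routes to the true/false branch based on the
// "<stage name>.branch.to.execute" runtime argument exercised in the test above.
@Plugin(type = Condition.PLUGIN_TYPE)
@Name("MockCondition")
public class MockConditionSketch extends Condition {
  private final Conf conf;

  public MockConditionSketch(Conf conf) {
    this.conf = conf;
  }

  @Override
  public boolean apply(ConditionContext context) throws Exception {
    // e.g. "condition1.branch.to.execute" -> "true" sends records to sink1
    String value = context.getArguments().get(conf.name + ".branch.to.execute");
    return Boolean.parseBoolean(value);
  }

  public static class Conf extends PluginConfig {
    private String name;
  }
}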
Use of io.cdap.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
Example from class DataPipelineTest, method testSimpleConditionWithMultipleOutputActions.
@Test
public void testSimpleConditionWithMultipleOutputActions() throws Exception {
Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
/*
 * condition --|-- true  --> action1 --> trueSource  --> trueSink
 *             |
 *             |-- false --> action2 --> falseSource --> falseSink
 */
String appName = "SimpleConditionWithMultipleOutputActions";
String trueSource = "true" + appName + "Source";
String falseSource = "false" + appName + "Source";
String trueSink = "true" + appName + "Sink";
String falseSink = "false" + appName + "Sink";
String actionTable = "actionTable" + appName;
ETLBatchConfig etlConfig = ETLBatchConfig.builder()
  .addStage(new ETLStage("trueSource", MockSource.getPlugin(trueSource, schema)))
  .addStage(new ETLStage("falseSource", MockSource.getPlugin(falseSource, schema)))
  .addStage(new ETLStage("trueSink", MockSink.getPlugin(trueSink)))
  .addStage(new ETLStage("falseSink", MockSink.getPlugin(falseSink)))
  .addStage(new ETLStage("condition", MockCondition.getPlugin("condition")))
  .addStage(new ETLStage("action1", MockAction.getPlugin(actionTable, "row1", "key1", "val1")))
  .addStage(new ETLStage("action2", MockAction.getPlugin(actionTable, "row2", "key2", "val2")))
  .addConnection("condition", "action1", true)
  .addConnection("action1", "trueSource")
  .addConnection("trueSource", "trueSink")
  .addConnection("condition", "action2", false)
  .addConnection("action2", "falseSource")
  .addConnection("falseSource", "falseSink")
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT_RANGE, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app(appName);
ApplicationManager appManager = deployApplication(appId, appRequest);
StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
for (String branch : Arrays.asList("true", "false")) {
String source = branch.equals("true") ? trueSource : falseSource;
String sink = branch.equals("true") ? trueSink : falseSink;
// write records to source
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(source));
MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob));
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start(ImmutableMap.of("condition.branch.to.execute", branch));
if (branch.equals("true")) {
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
} else {
workflowManager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 5, TimeUnit.MINUTES);
}
// check sink
DataSetManager<Table> sinkManager = getDataset(sink);
Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel, recordBob);
Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
Assert.assertEquals(expected, actual);
validateMetric(2, appId, branch + "Source.records.out");
validateMetric(2, appId, branch + "Sink.records.in");
// check that the action on the executed branch (action1 for true, action2 for false) ran correctly
DataSetManager<Table> actionTableDS = getDataset(actionTable);
if (branch.equals("true")) {
Assert.assertEquals("val1", MockAction.readOutput(actionTableDS, "row1", "key1"));
} else {
Assert.assertEquals("val2", MockAction.readOutput(actionTableDS, "row2", "key2"));
}
}
}
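MockAction.getPlugin(actionTable, row, key, value) configures an action that, when its branch runs, writes a single cell into the given table, which the test reads back with MockAction.readOutput. A rough sketch of that action's run method, assuming a PluginConfig with tableName/row/key/value fields (hypothetical names):
// Hypothetical sketch of the action body: record one key/value in a Table row,
// matching the (table, row, key, value) arguments used above.
@Override
public void run(ActionContext context) throws Exception {
  context.execute(datasetContext -> {
    Table table = datasetContext.getDataset(conf.tableName);
    Put put = new Put(conf.row);
    put.add(conf.key, conf.value);
    table.put(put);
  });
}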