Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
The class DataPipelineTest, method testNoConnectorsForSourceCondition.
@Test
public void testNoConnectorsForSourceCondition() throws Exception {
//
// condition1-->condition2-->source-->sink
//
Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
ETLBatchConfig etlConfig = ETLBatchConfig.builder()
  .addStage(new ETLStage("source", MockSource.getPlugin("simpleNoConnectorConditionSource", schema)))
  .addStage(new ETLStage("trueSink", MockSink.getPlugin("trueOutput")))
  .addStage(new ETLStage("condition1", MockCondition.getPlugin("condition1")))
  .addStage(new ETLStage("condition2", MockCondition.getPlugin("condition2")))
  .addConnection("condition1", "condition2", true)
  .addConnection("condition2", "source", true)
  .addConnection("source", "trueSink")
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT_RANGE, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("NoConnectorForSourceConditionApp");
ApplicationManager appManager = deployApplication(appId, appRequest);
StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
// write records to source
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset("simpleNoConnectorConditionSource"));
MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob));
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start(ImmutableMap.of("condition1.branch.to.execute", "true", "condition2.branch.to.execute", "true"));
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
// check sink
DataSetManager<Table> sinkManager = getDataset("trueOutput");
Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel, recordBob);
Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
Assert.assertEquals(expected, actual);
}
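For reference, the Schema and StructuredRecord calls that all of these tests lean on follow the pattern below. This is a minimal standalone sketch using only API methods that already appear in the snippets on this page.

import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.api.data.schema.Schema;

// Define a record schema with a single string field.
Schema schema = Schema.recordOf("testRecord",
  Schema.Field.of("name", Schema.of(Schema.Type.STRING)));

// Build a record against that schema.
StructuredRecord record = StructuredRecord.builder(schema)
  .set("name", "samuel")
  .build();

// Read fields back by name; the return type is inferred at the call site.
String name = record.get("name");
Schema recordSchema = record.getSchema();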
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
The class DataPipelineConnectionTest, method testBrowseSample.
@Test
public void testBrowseSample() throws Exception {
File directory = TEMP_FOLDER.newFolder();
List<BrowseEntity> entities = addFilesInDirectory(directory);
String conn = "BrowseSample";
addConnection(conn, new ConnectionCreationRequest("",
  new PluginInfo(FileConnector.NAME, Connector.PLUGIN_TYPE, null, Collections.emptyMap(),
    // in setup we add "-mocks" as the suffix for the artifact id
    new ArtifactSelectorConfig("system", APP_ARTIFACT_ID.getArtifact() + "-mocks", APP_ARTIFACT_ID.getVersion()))));
// get all 10 results back
BrowseDetail browseDetail = browseConnection(conn, directory.getCanonicalPath(), 10);
BrowseDetail expected = BrowseDetail.builder().setTotalCount(10).setEntities(entities).build();
Assert.assertEquals(expected, browseDetail);
// only retrieve 5 back, count should still be 10
browseDetail = browseConnection(conn, directory.getCanonicalPath(), 5);
expected = BrowseDetail.builder().setTotalCount(10).setEntities(entities.subList(0, 5)).build();
Assert.assertEquals(expected, browseDetail);
// browse the created directory, should give empty result
browseDetail = browseConnection(conn, entities.get(0).getPath(), 10);
expected = BrowseDetail.builder().setTotalCount(0).build();
Assert.assertEquals(expected, browseDetail);
// browse the file, since it is not browsable, it should return itself
browseDetail = browseConnection(conn, entities.get(1).getPath(), 10);
expected = BrowseDetail.builder().setTotalCount(1).addEntity(entities.get(1)).build();
Assert.assertEquals(expected, browseDetail);
List<StructuredRecord> records = new ArrayList<>();
Schema schema = Schema.recordOf("schema", Schema.Field.of("offset", Schema.of(Schema.Type.LONG)), Schema.Field.of("body", Schema.of(Schema.Type.STRING)));
for (int i = 0; i < 100; i++) {
records.add(StructuredRecord.builder(schema).set("offset", i * 2L).set("body", "1").build());
}
ArtifactSelectorConfig artifact = new ArtifactSelectorConfig("SYSTEM", APP_ARTIFACT_ID.getArtifact() + "-mocks", APP_ARTIFACT_ID.getVersion());
Map<String, String> properties = ImmutableMap.of("path", entities.get(1).getPath(), "useConnection", "true", "connection", String.format("${conn(%s)}", conn));
ConnectorDetail detail = new ConnectorDetail(ImmutableSet.of(new PluginDetail("file", "batchsource", properties, artifact, schema), new PluginDetail("file", "streamingsource", properties, artifact, schema)));
SampleResponse expectedSample = new SampleResponse(detail, schema, records);
// sample the file, the file has 100 lines, so 200 should retrieve all lines
SampleResponse sampleResponse = sampleConnection(conn, entities.get(1).getPath(), 200);
Assert.assertEquals(expectedSample, sampleResponse);
// sample 100, should get all
sampleResponse = sampleConnection(conn, entities.get(1).getPath(), 100);
Assert.assertEquals(expectedSample, sampleResponse);
// sample 50, should only get 50
sampleResponse = sampleConnection(conn, entities.get(1).getPath(), 50);
expectedSample = new SampleResponse(detail, schema, records.subList(0, 50));
Assert.assertEquals(expectedSample, sampleResponse);
deleteConnection(conn);
}
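To inspect the offset/body sample records more easily, a small hypothetical helper like the one below can flatten them back into text lines; toLines is not part of the test, it only illustrates reading typed fields out of StructuredRecord.

import io.cdap.cdap.api.data.format.StructuredRecord;

import java.util.ArrayList;
import java.util.List;

// Hypothetical helper: render sampled offset/body records as "<offset>: <body>" strings.
static List<String> toLines(List<StructuredRecord> records) {
  List<String> lines = new ArrayList<>();
  for (StructuredRecord record : records) {
    Long offset = record.get("offset");
    String body = record.get("body");
    lines.add(offset + ": " + body);
  }
  return lines;
}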
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
The class DataPipelineConnectionTest, method testUsingConnections.
private void testUsingConnections(Engine engine) throws Exception {
String sourceConnName = "sourceConn " + engine;
String sinkConnName = "sinkConn " + engine;
String srcTableName = "src" + engine;
String sinkTableName = "sink" + engine;
// give each connection a macro-enabled property, ${badval}, which will resolve to a JSON object at runtime
addConnection(sourceConnName, new ConnectionCreationRequest("", new PluginInfo("test", "dummy", null,
  ImmutableMap.of("tableName", srcTableName, "key1", "${badval}"), new ArtifactSelectorConfig())));
addConnection(sinkConnName, new ConnectionCreationRequest("", new PluginInfo("test", "dummy", null,
  ImmutableMap.of("tableName", sinkTableName, "key1", "${badval}"), new ArtifactSelectorConfig())));
// supply the JSON string through runtime arguments to verify the plugin can still be instantiated under that condition
Map<String, String> runtimeArguments = Collections.singletonMap("badval", "{\"a\" : 1}");
// source -> sink
ETLBatchConfig config = ETLBatchConfig.builder()
  .setEngine(engine)
  .addStage(new ETLStage("source", MockSource.getPluginUsingConnection(sourceConnName)))
  .addStage(new ETLStage("sink", MockSink.getPluginUsingConnection(sinkConnName)))
  .addConnection("source", "sink")
  .build();
Schema schema = Schema.recordOf("x", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
StructuredRecord samuel = StructuredRecord.builder(schema).set("name", "samuel").build();
StructuredRecord dwayne = StructuredRecord.builder(schema).set("name", "dwayne").build();
// create the dataset from the test; the source won't create it since the table name is macro-enabled
addDatasetInstance(NamespaceId.DEFAULT.dataset(srcTableName), Table.class.getName());
DataSetManager<Table> sourceTable = getDataset(srcTableName);
MockSource.writeInput(sourceTable, ImmutableList.of(samuel, dwayne));
// verify preview can run successfully using connections
PreviewManager previewManager = getPreviewManager();
PreviewConfig previewConfig = new PreviewConfig(SmartWorkflow.NAME, ProgramType.WORKFLOW, runtimeArguments, 10);
// Start the preview and get the corresponding PreviewRunner.
ApplicationId previewId = previewManager.start(NamespaceId.DEFAULT, new AppRequest<>(APP_ARTIFACT, config, previewConfig));
// Wait for the preview status to reach COMPLETED.
Tasks.waitFor(PreviewStatus.Status.COMPLETED, new Callable<PreviewStatus.Status>() {
  @Override
  public PreviewStatus.Status call() throws Exception {
    PreviewStatus status = previewManager.getStatus(previewId);
    return status == null ? null : status.getStatus();
  }
}, 5, TimeUnit.MINUTES);
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
ApplicationId appId = NamespaceId.DEFAULT.app("testApp" + engine);
ApplicationManager appManager = deployApplication(appId, appRequest);
// start the actual pipeline run
WorkflowManager manager = appManager.getWorkflowManager(SmartWorkflow.NAME);
manager.startAndWaitForGoodRun(runtimeArguments, ProgramRunStatus.COMPLETED, 3, TimeUnit.MINUTES);
DataSetManager<Table> sinkTable = getDataset(sinkTableName);
List<StructuredRecord> outputRecords = MockSink.readOutput(sinkTable);
Assert.assertEquals(ImmutableSet.of(dwayne, samuel), new HashSet<>(outputRecords));
// modify the connections to point the source and sink at new table names
String newSrcTableName = "new" + srcTableName;
String newSinkTableName = "new" + sinkTableName;
addConnection(sourceConnName, new ConnectionCreationRequest("", new PluginInfo("test", "dummy", null,
  Collections.singletonMap("tableName", newSrcTableName), new ArtifactSelectorConfig())));
addConnection(sinkConnName, new ConnectionCreationRequest("", new PluginInfo("test", "dummy", null,
  Collections.singletonMap("tableName", newSinkTableName), new ArtifactSelectorConfig())));
addDatasetInstance(NamespaceId.DEFAULT.dataset(newSrcTableName), Table.class.getName());
StructuredRecord newRecord1 = StructuredRecord.builder(schema).set("name", "john").build();
StructuredRecord newRecord2 = StructuredRecord.builder(schema).set("name", "tom").build();
sourceTable = getDataset(newSrcTableName);
MockSource.writeInput(sourceTable, ImmutableList.of(newRecord1, newRecord2));
// run the program again; it should read from and write to the new tables
manager.start(runtimeArguments);
manager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 3, TimeUnit.MINUTES);
sinkTable = getDataset(newSinkTableName);
outputRecords = MockSink.readOutput(sinkTable);
Assert.assertEquals(ImmutableSet.of(newRecord1, newRecord2), new HashSet<>(outputRecords));
deleteConnection(sourceConnName);
deleteConnection(sinkConnName);
deleteDatasetInstance(NamespaceId.DEFAULT.dataset(srcTableName));
deleteDatasetInstance(NamespaceId.DEFAULT.dataset(sinkTableName));
deleteDatasetInstance(NamespaceId.DEFAULT.dataset(newSrcTableName));
deleteDatasetInstance(NamespaceId.DEFAULT.dataset(newSinkTableName));
}
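The point of this test is that the connection's tableName and key1 properties are macro-enabled, so the ${badval} placeholder survives deployment and is only resolved from runtime arguments. A config for a plugin like the "dummy" one above might look roughly like the sketch below; the class and field layout are assumptions for illustration, not the actual mock's source.

import io.cdap.cdap.api.annotation.Description;
import io.cdap.cdap.api.annotation.Macro;
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.plugin.PluginConfig;

import javax.annotation.Nullable;

// Illustrative config: @Macro marks a property whose value may contain ${...} macros,
// so it is left unevaluated at configure time and substituted from runtime arguments.
public class DummyConnectionConfig extends PluginConfig {

  @Name("tableName")
  @Description("Name of the table to read from or write to.")
  @Macro
  private String tableName;

  @Name("key1")
  @Description("Arbitrary macro-enabled property; may resolve to a JSON string at runtime.")
  @Macro
  @Nullable
  private String key1;
}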
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
The class DataPipelineTest, method testSimpleMultiSource.
private void testSimpleMultiSource(Engine engine) throws Exception {
/*
 * source1 --|
 *           |--> sleep --> sink
 * source2 --|
 */
String source1Name = String.format("simpleMSInput1-%s", engine);
String source2Name = String.format("simpleMSInput2-%s", engine);
String sinkName = String.format("simpleMSOutput-%s", engine);
ETLBatchConfig etlConfig = ETLBatchConfig.builder()
  .addStage(new ETLStage("source1", MockSource.getPlugin(source1Name)))
  .addStage(new ETLStage("source2", MockSource.getPlugin(source2Name)))
  .addStage(new ETLStage("sleep", SleepTransform.getPlugin(2L)))
  .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
  .addConnection("source1", "sleep")
  .addConnection("source2", "sleep")
  .addConnection("sleep", "sink")
  .setEngine(engine)
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("SimpleMultiSourceApp-" + engine);
ApplicationManager appManager = deployApplication(appId, appRequest);
// there should be only two programs - one workflow and one mapreduce/spark
Assert.assertEquals(2, appManager.getInfo().getPrograms().size());
Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
StructuredRecord recordVincent = StructuredRecord.builder(schema).set("name", "vincent").build();
// write two records to source1 and one record to source2
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(source1Name));
MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordVincent));
inputManager = getDataset(NamespaceId.DEFAULT.dataset(source2Name));
MockSource.writeInput(inputManager, ImmutableList.of(recordBob));
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
// check sink
DataSetManager<Table> sinkManager = getDataset(sinkName);
Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel, recordBob, recordVincent);
Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
Assert.assertEquals(expected, actual);
validateMetric(2, appId, "source1.records.out");
validateMetric(1, appId, "source2.records.out");
validateMetric(3, appId, "sleep.records.in");
validateMetric(3, appId, "sleep.records.out");
validateMetric(3, appId, "sink.records.in");
Assert.assertTrue(getMetric(appId, "sleep." + io.cdap.cdap.etl.common.Constants.Metrics.TOTAL_TIME) > 0L);
try (CloseableIterator<Message> messages = getMessagingContext().getMessageFetcher().fetch(appId.getNamespace(), "sleepTopic", 10, null)) {
Assert.assertTrue(messages.hasNext());
Assert.assertEquals("2", messages.next().getPayloadAsString());
Assert.assertFalse(messages.hasNext());
}
getMessagingAdmin(appId.getNamespace()).deleteTopic("sleepTopic");
}
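SleepTransform is a test mock, but any transform stage in these pipelines implements the same contract from cdap-etl-api. The pass-through sketch below shows the shape of that contract; the class name and the missing sleep logic are a simplification for illustration, not the mock's actual code.

import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.annotation.Plugin;
import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.etl.api.Emitter;
import io.cdap.cdap.etl.api.Transform;

// Minimal pass-through transform: every input record is emitted unchanged.
@Plugin(type = Transform.PLUGIN_TYPE)
@Name("PassThrough")
public class PassThroughTransform extends Transform<StructuredRecord, StructuredRecord> {

  @Override
  public void transform(StructuredRecord input, Emitter<StructuredRecord> emitter) throws Exception {
    // A real transform would typically emit a new record built with StructuredRecord.builder(outputSchema).
    emitter.emit(input);
  }
}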
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
The class DataPipelineTest, method testConditionsOnBranches.
@Test
public void testConditionsOnBranches() throws Exception {
/*
 *                              |-- true --> sink1
 *          |--> condition1 --|
 * source --|                  |-- false --> sink2
 *          |
 *          |                               |-- true --> sink3
 *          |--> transform --> condition2 --|
 *                                           |-- false --> sink4
 */
Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
String sourceName = "branchConditionsSource";
String sink1Name = "branchConditionsSink1";
String sink2Name = "branchConditionsSink2";
String sink3Name = "branchConditionsSink3";
String sink4Name = "branchConditionsSink4";
ETLBatchConfig etlConfig = ETLBatchConfig.builder()
  .addStage(new ETLStage("source", MockSource.getPlugin(sourceName, schema)))
  .addStage(new ETLStage("condition1", MockCondition.getPlugin("condition1")))
  .addStage(new ETLStage("transform", IdentityTransform.getPlugin()))
  .addStage(new ETLStage("condition2", MockCondition.getPlugin("condition2")))
  .addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Name)))
  .addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Name)))
  .addStage(new ETLStage("sink3", MockSink.getPlugin(sink3Name)))
  .addStage(new ETLStage("sink4", MockSink.getPlugin(sink4Name)))
  .addConnection("source", "condition1")
  .addConnection("source", "transform")
  .addConnection("condition1", "sink1", true)
  .addConnection("condition1", "sink2", false)
  .addConnection("transform", "condition2")
  .addConnection("condition2", "sink3", true)
  .addConnection("condition2", "sink4", false)
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT_RANGE, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("branchConditions");
ApplicationManager appManager = deployApplication(appId, appRequest);
List<StructuredRecord> records = Collections.singletonList(StructuredRecord.builder(schema).set("name", "samuel").build());
DataSetManager<Table> inputManager = getDataset(sourceName);
MockSource.writeInput(inputManager, records);
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start(ImmutableMap.of("condition1.branch.to.execute", "true", "condition2.branch.to.execute", "false"));
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
DataSetManager<Table> sink1Manager = getDataset(sink1Name);
DataSetManager<Table> sink2Manager = getDataset(sink2Name);
DataSetManager<Table> sink3Manager = getDataset(sink3Name);
DataSetManager<Table> sink4Manager = getDataset(sink4Name);
Assert.assertEquals(records, MockSink.readOutput(sink1Manager));
Assert.assertTrue(MockSink.readOutput(sink2Manager).isEmpty());
Assert.assertTrue(MockSink.readOutput(sink3Manager).isEmpty());
Assert.assertEquals(records, MockSink.readOutput(sink4Manager));
}
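MockCondition decides its branch by reading a runtime argument named "<stage>.branch.to.execute", which is why the test passes condition1.branch.to.execute=true and condition2.branch.to.execute=false. A condition plugin in that spirit might look like the sketch below; the class name and argument handling are assumptions for illustration, not the mock's actual implementation.

import io.cdap.cdap.etl.api.condition.Condition;
import io.cdap.cdap.etl.api.condition.ConditionContext;

// Illustrative condition: take the true branch only if the runtime argument
// "<stageName>.branch.to.execute" is set to "true".
public class BranchByArgumentCondition extends Condition {

  @Override
  public boolean apply(ConditionContext context) throws Exception {
    String key = context.getStageName() + ".branch.to.execute";
    String value = context.getArguments().get(key);
    return Boolean.parseBoolean(value);
  }
}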