use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
the class DataPipelineTest method deployPipelineWithSchedule.
private WorkflowManager deployPipelineWithSchedule(String pipelineName, Engine engine, String triggeringPipelineName, ArgumentMapping key1Mapping, String expectedKey1Value, PluginPropertyMapping key2Mapping, String expectedKey2Value) throws Exception {
String tableName = "actionScheduleTable" + pipelineName + engine;
String sourceName = "macroActionWithScheduleInput-" + pipelineName + engine;
String sinkName = "macroActionWithScheduleOutput-" + pipelineName + engine;
String key1 = key1Mapping.getTarget();
String key2 = key2Mapping.getTarget();
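/*
 * Pipeline assembled below (derived from the addConnection calls):
 *
 * action1 --> action2 --> source --> filter1 --> filter2 --> sink
 */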
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
  .addStage(new ETLStage("action1", MockAction.getPlugin(tableName, "row1", "column1", String.format("${%s}", key1))))
  .addStage(new ETLStage("action2", MockAction.getPlugin(tableName, "row2", "column2", String.format("${%s}", key2))))
  .addStage(new ETLStage("source", MockSource.getPlugin(sourceName)))
  .addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("name", String.format("${%s}", key1))))
  .addStage(new ETLStage("filter2", StringValueFilterTransform.getPlugin("name", String.format("${%s}", key2))))
  .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
  .addConnection("action1", "action2")
  .addConnection("action2", "source")
  .addConnection("source", "filter1")
  .addConnection("filter1", "filter2")
  .addConnection("filter2", "sink")
  .setEngine(engine)
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app(pipelineName);
ApplicationManager appManager = deployApplication(appId, appRequest);
Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
// Use the expectedKey1Value and expectedKey2Value as values for two of the records, so that only the record "samuel" survives both filters once key1 and key2 resolve to their expected values
StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
StructuredRecord recordKey1Value = StructuredRecord.builder(schema).set("name", expectedKey1Value).build();
StructuredRecord recordKey2Value = StructuredRecord.builder(schema).set("name", expectedKey2Value).build();
// write all three records to the single source
DataSetManager<Table> inputManager = getDataset(sourceName);
MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordKey1Value, recordKey2Value));
String defaultNamespace = NamespaceId.DEFAULT.getNamespace();
// Use properties from the triggering pipeline as values for runtime argument key1, key2
TriggeringPropertyMapping propertyMapping =
  new TriggeringPropertyMapping(ImmutableList.of(key1Mapping), ImmutableList.of(key2Mapping));
ProgramStatusTrigger completeTrigger =
  new ProgramStatusTrigger(new WorkflowId(defaultNamespace, triggeringPipelineName, SmartWorkflow.NAME),
                           ImmutableSet.of(ProgramStatus.COMPLETED));
ScheduleId scheduleId = appId.schedule("completeSchedule");
appManager.addSchedule(
  new ScheduleDetail(scheduleId.getNamespace(), scheduleId.getApplication(), scheduleId.getVersion(),
                     scheduleId.getSchedule(), "",
                     new ScheduleProgramInfo(SchedulableProgramType.WORKFLOW, SmartWorkflow.NAME),
                     ImmutableMap.of(SmartWorkflow.TRIGGERING_PROPERTIES_MAPPING, GSON.toJson(propertyMapping)),
                     completeTrigger, ImmutableList.<Constraint>of(), Schedulers.JOB_QUEUE_TIMEOUT_MILLIS, null));
appManager.enableSchedule(scheduleId);
return appManager.getWorkflowManager(SmartWorkflow.NAME);
}
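A caller would typically deploy and run the pipeline named by triggeringPipelineName, then wait on the returned WorkflowManager for the triggered run; a minimal sketch of such a caller, assuming key1Mapping and key2Mapping are built elsewhere (pipeline names and expected values here are illustrative, not from this snippet):
// hypothetical caller - names and expected values are illustrative only
WorkflowManager triggeredWorkflow = deployPipelineWithSchedule("triggeredPipeline", Engine.MAPREDUCE, "headPipeline", key1Mapping, "samuel", key2Mapping, "bob");
// once the triggering pipeline's workflow COMPLETES, the enabled schedule fires this workflow
triggeredWorkflow.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);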
use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
the class DataPipelineTest method testSimpleMultiSource.
private void testSimpleMultiSource(Engine engine) throws Exception {
/*
* source1 --|
* |--> sleep --> sink
* source2 --|
*/
String source1Name = String.format("simpleMSInput1-%s", engine);
String source2Name = String.format("simpleMSInput2-%s", engine);
String sinkName = String.format("simpleMSOutput-%s", engine);
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
  .addStage(new ETLStage("source1", MockSource.getPlugin(source1Name)))
  .addStage(new ETLStage("source2", MockSource.getPlugin(source2Name)))
  .addStage(new ETLStage("sleep", SleepTransform.getPlugin(2L)))
  .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
  .addConnection("source1", "sleep")
  .addConnection("source2", "sleep")
  .addConnection("sleep", "sink")
  .setEngine(engine)
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("SimpleMultiSourceApp-" + engine);
ApplicationManager appManager = deployApplication(appId, appRequest);
// there should be only two programs - one workflow and one mapreduce/spark
Assert.assertEquals(2, appManager.getInfo().getPrograms().size());
Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
StructuredRecord recordVincent = StructuredRecord.builder(schema).set("name", "vincent").build();
// write one record to each source
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(source1Name));
MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordVincent));
inputManager = getDataset(NamespaceId.DEFAULT.dataset(source2Name));
MockSource.writeInput(inputManager, ImmutableList.of(recordBob));
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
// check sink
DataSetManager<Table> sinkManager = getDataset(sinkName);
Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel, recordBob, recordVincent);
Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
Assert.assertEquals(expected, actual);
validateMetric(2, appId, "source1.records.out");
validateMetric(1, appId, "source2.records.out");
validateMetric(3, appId, "sleep.records.in");
validateMetric(3, appId, "sleep.records.out");
validateMetric(3, appId, "sink.records.in");
Assert.assertTrue(getMetric(appId, "sleep." + co.cask.cdap.etl.common.Constants.Metrics.TOTAL_TIME) > 0L);
try (CloseableIterator<Message> messages = getMessagingContext().getMessageFetcher().fetch(appId.getNamespace(), "sleepTopic", 10, null)) {
Assert.assertTrue(messages.hasNext());
Assert.assertEquals("2", messages.next().getPayloadAsString());
Assert.assertFalse(messages.hasNext());
}
getMessagingAdmin(appId.getNamespace()).deleteTopic("sleepTopic");
}
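In the surrounding test class a private helper like this is normally driven by an engine-parameterized @Test entry point; a minimal sketch, assuming such a wrapper exists:
@Test
public void testSimpleMultiSource() throws Exception {
  testSimpleMultiSource(Engine.MAPREDUCE);
  testSimpleMultiSource(Engine.SPARK);
}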
use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
the class DataPipelineTest method testSecureStorePipeline.
/**
* Tests the secure storage macro function in a pipeline by creating datasets from the secure store data.
*/
private void testSecureStorePipeline(Engine engine, String prefix) throws Exception {
/*
* Trivial pipeline from batch source to batch sink.
*
* source --------- sink
*/
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
  .addStage(new ETLStage("source", MockRuntimeDatasetSource.getPlugin("input", "${secure(" + prefix + "source)}")))
  .addStage(new ETLStage("sink", MockRuntimeDatasetSink.getPlugin("output", "${secure(" + prefix + "sink)}")))
  .addConnection("source", "sink")
  .setEngine(engine)
  .build();
// place dataset names into secure storage
getSecureStoreManager().putSecureData("default", prefix + "source", prefix + "MockSecureSourceDataset", "secure source dataset name", new HashMap<String, String>());
getSecureStoreManager().putSecureData("default", prefix + "sink", prefix + "MockSecureSinkDataset", "secure dataset name", new HashMap<String, String>());
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("App-" + engine);
ApplicationManager appManager = deployApplication(appId, appRequest);
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
// make sure the datasets don't exist beforehand
Assert.assertNull(getDataset(prefix + "MockSecureSourceDataset").get());
Assert.assertNull(getDataset(prefix + "MockSecureSinkDataset").get());
workflowManager.start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
// now the datasets should exist
Assert.assertNotNull(getDataset(prefix + "MockSecureSourceDataset").get());
Assert.assertNotNull(getDataset(prefix + "MockSecureSinkDataset").get());
}
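The prefix argument keeps the secure keys and dataset names of separate runs from colliding, so the helper can be invoked once per engine; a sketch of such an invocation (the prefix values are illustrative):
@Test
public void testSecureStorePipeline() throws Exception {
  testSecureStorePipeline(Engine.MAPREDUCE, "mr");
  testSecureStorePipeline(Engine.SPARK, "spark");
}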
use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
the class DataPipelineTest method testMacrosMapReducePipeline.
@Test
public void testMacrosMapReducePipeline() throws Exception {
/*
* Trivial MapReduce pipeline from batch source to batch sink.
*
* source --------- sink
*/
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
  .addStage(new ETLStage("source", MockRuntimeDatasetSource.getPlugin("mrinput", "${runtime${source}}")))
  .addStage(new ETLStage("sink", MockRuntimeDatasetSink.getPlugin("mroutput", "${runtime}${sink}")))
  .addConnection("source", "sink")
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("MRApp");
ApplicationManager appManager = deployApplication(appId, appRequest);
// set runtime arguments for macro substitution
Map<String, String> runtimeArguments = ImmutableMap.of("runtime", "mockRuntime", "sink", "MRSinkDataset", "source", "Source", "runtimeSource", "mockRuntimeMRSourceDataset");
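// For reference: with these arguments the nested macros above resolve in two passes
// "${runtime${source}}" -> "${runtimeSource}" (inner ${source} = "Source") -> "mockRuntimeMRSourceDataset"
// "${runtime}${sink}"   -> "mockRuntime" + "MRSinkDataset"                 -> "mockRuntimeMRSinkDataset"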
// make sure the datasets don't exist beforehand
Assert.assertNull(getDataset("mockRuntimeMRSourceDataset").get());
Assert.assertNull(getDataset("mockRuntimeMRSinkDataset").get());
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.setRuntimeArgs(runtimeArguments);
workflowManager.start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
// now the datasets should exist
Assert.assertNotNull(getDataset("mockRuntimeMRSourceDataset").get());
Assert.assertNotNull(getDataset("mockRuntimeMRSinkDataset").get());
}
use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
the class PreviewDataPipelineTest method testPreviewFailedRun.
private void testPreviewFailedRun(Engine engine) throws Exception {
PreviewManager previewManager = getPreviewManager();
String sourceTableName = "singleInput";
String sinkTableName = "singleOutput";
Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
/*
* source --> transform --> sink
*/
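// ExceptionTransform is presumably set up to throw on records whose "name" field equals "samuel",
// so one of the two input records fails mid-run and the preview ends in RUN_FAILED (an inference
// from the assertions below, not from the plugin's source)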
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
  .addStage(new ETLStage("source", MockSource.getPlugin(sourceTableName, schema)))
  .addStage(new ETLStage("transform", ExceptionTransform.getPlugin("name", "samuel")))
  .addStage(new ETLStage("sink", MockSink.getPlugin(sinkTableName)))
  .addConnection("source", "transform")
  .addConnection("transform", "sink")
  .setNumOfRecordsPreview(100)
  .setEngine(engine)
  .build();
// Construct the preview config with the program name and program type.
PreviewConfig previewConfig = new PreviewConfig(SmartWorkflow.NAME, ProgramType.WORKFLOW, Collections.<String, String>emptyMap(), 10);
// Create the table for the mock source
addDatasetInstance(Table.class.getName(), sourceTableName, DatasetProperties.of(ImmutableMap.of("schema", schema.toString())));
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(sourceTableName));
StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
MockSource.writeInput(inputManager, "1", recordSamuel);
MockSource.writeInput(inputManager, "2", recordBob);
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig, previewConfig);
// Start the preview and get the corresponding PreviewRunner.
ApplicationId previewId = previewManager.start(NamespaceId.DEFAULT, appRequest);
final PreviewRunner previewRunner = previewManager.getRunner(previewId);
// Wait for the preview status to become RUN_FAILED.
Tasks.waitFor(PreviewStatus.Status.RUN_FAILED, new Callable<PreviewStatus.Status>() {
@Override
public PreviewStatus.Status call() throws Exception {
PreviewStatus status = previewRunner.getStatus();
return status == null ? null : status.getStatus();
}
}, 5, TimeUnit.MINUTES);
// Get the data for stage "source" in the PreviewStore.
checkPreviewStore(previewRunner, "source", 2);
// Get the data for stage "transform" in the PreviewStore, should contain one less record than source.
checkPreviewStore(previewRunner, "transform", 1);
// Get the data for stage "sink" in the PreviewStore, should contain one less record than source.
checkPreviewStore(previewRunner, "sink", 1);
// Validate the metrics for preview
validateMetric(2, previewId, "source.records.in", previewRunner);
validateMetric(2, previewId, "source.records.out", previewRunner);
validateMetric(2, previewId, "transform.records.in", previewRunner);
validateMetric(1, previewId, "transform.records.out", previewRunner);
validateMetric(1, previewId, "sink.records.out", previewRunner);
validateMetric(1, previewId, "sink.records.in", previewRunner);
// Check that the sink table was not created in the real (non-preview) namespace.
DataSetManager<Table> sinkManager = getDataset(sinkTableName);
Assert.assertNull(sinkManager.get());
deleteDatasetInstance(NamespaceId.DEFAULT.dataset(sourceTableName));
}