Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From the class DataStreamsTest, method testSplitterTransform:
@Test
public void testSplitterTransform() throws Exception {
Schema schema = Schema.recordOf("user", Schema.Field.of("id", Schema.of(Schema.Type.LONG)), Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("email", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
StructuredRecord user0 = StructuredRecord.builder(schema).set("id", 0L).build();
StructuredRecord user1 = StructuredRecord.builder(schema).set("id", 1L).set("email", "one@example.com").build();
StructuredRecord user2 = StructuredRecord.builder(schema).set("id", 2L).set("name", "two").build();
StructuredRecord user3 = StructuredRecord.builder(schema).set("id", 3L).set("name", "three").set("email", "three@example.com").build();
String sink1Name = "splitSink1";
String sink2Name = "splitSink2";
/*
 *
 *                                            |null --> sink1
 *                       |null--> splitter2 --|
 * source --> splitter1--|                    |non-null --|
 *                       |                                |--> sink2
 *                       |non-null------------------------|
 */
DataStreamsConfig config = DataStreamsConfig.builder()
  .setBatchInterval("5s")
  .addStage(new ETLStage("source", MockSource.getPlugin(schema, ImmutableList.of(user0, user1, user2, user3))))
  .addStage(new ETLStage("splitter1", NullFieldSplitterTransform.getPlugin("name")))
  .addStage(new ETLStage("splitter2", NullFieldSplitterTransform.getPlugin("email")))
  .addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Name)))
  .addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Name)))
  .addConnection("source", "splitter1")
  .addConnection("splitter1", "splitter2", "null")
  .addConnection("splitter1", "sink2", "non-null")
  .addConnection("splitter2", "sink1", "null")
  .addConnection("splitter2", "sink2", "non-null")
  .build();
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
ApplicationId appId = NamespaceId.DEFAULT.app("SplitterTest");
ApplicationManager appManager = deployApplication(appId, appRequest);
// run pipeline
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start();
sparkManager.waitForStatus(true, 10, 1);
// check output
// sink1 should only have records where both name and email are null (user0)
final DataSetManager<Table> sink1Manager = getDataset(sink1Name);
final Set<StructuredRecord> expected1 = ImmutableSet.of(user0);
Tasks.waitFor(true, new Callable<Boolean>() {
@Override
public Boolean call() throws Exception {
sink1Manager.flush();
Set<StructuredRecord> outputRecords = new HashSet<>();
outputRecords.addAll(MockSink.readOutput(sink1Manager));
return expected1.equals(outputRecords);
}
}, 4, TimeUnit.MINUTES);
// sink2 should have anything with a non-null name or non-null email
final DataSetManager<Table> sink2Manager = getDataset(sink2Name);
final Set<StructuredRecord> expected2 = ImmutableSet.of(user1, user2, user3);
Tasks.waitFor(true, new Callable<Boolean>() {
@Override
public Boolean call() throws Exception {
sink2Manager.flush();
Set<StructuredRecord> outputRecords = new HashSet<>();
outputRecords.addAll(MockSink.readOutput(sink2Manager));
return expected2.equals(outputRecords);
}
}, 4, TimeUnit.MINUTES);
sparkManager.stop();
sparkManager.waitForStatus(false, 10, 1);
validateMetric(appId, "source.records.out", 4);
validateMetric(appId, "splitter1.records.in", 4);
validateMetric(appId, "splitter1.records.out.non-null", 2);
validateMetric(appId, "splitter1.records.out.null", 2);
validateMetric(appId, "splitter2.records.in", 2);
validateMetric(appId, "splitter2.records.out.non-null", 1);
validateMetric(appId, "splitter2.records.out.null", 1);
validateMetric(appId, "sink1.records.in", 1);
validateMetric(appId, "sink2.records.in", 3);
}
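Both sinks are polled with structurally identical Tasks.waitFor blocks. If a test class accumulates several such checks, the polling could be factored into a small helper; the sketch below is a hypothetical refactoring built only from calls already used above (Tasks.waitFor, DataSetManager.flush, MockSink.readOutput) and is not code from the CDAP repository.
// Hypothetical helper (not part of DataStreamsTest): block until the MockSink
// dataset behind sinkManager contains exactly the expected set of records.
private void waitForSinkRecords(final DataSetManager<Table> sinkManager,
                                final Set<StructuredRecord> expected) throws Exception {
  Tasks.waitFor(true, new Callable<Boolean>() {
    @Override
    public Boolean call() throws Exception {
      // flush so reads see the latest writes made by the streaming run
      sinkManager.flush();
      return expected.equals(new HashSet<>(MockSink.readOutput(sinkManager)));
    }
  }, 4, TimeUnit.MINUTES);
}
With such a helper, each check above collapses to a single call, e.g. waitForSinkRecords(sink1Manager, ImmutableSet.of(user0)).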
Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From the class DataStreamsTest, method testTransformComputeWithMacros:
@Test
public void testTransformComputeWithMacros() throws Exception {
Schema schema = Schema.recordOf("test", Schema.Field.of("id", Schema.of(Schema.Type.STRING)), Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
List<StructuredRecord> input = new ArrayList<>();
StructuredRecord samuelRecord = StructuredRecord.builder(schema).set("id", "123").set("name", "samuel").build();
StructuredRecord jacksonRecord = StructuredRecord.builder(schema).set("id", "456").set("name", "jackson").build();
StructuredRecord dwayneRecord = StructuredRecord.builder(schema).set("id", "789").set("name", "dwayne").build();
StructuredRecord johnsonRecord = StructuredRecord.builder(schema).set("id", "0").set("name", "johnson").build();
input.add(samuelRecord);
input.add(jacksonRecord);
input.add(dwayneRecord);
input.add(johnsonRecord);
DataStreamsConfig etlConfig = DataStreamsConfig.builder()
  .addStage(new ETLStage("source", MockSource.getPlugin(schema, input)))
  .addStage(new ETLStage("sink", MockSink.getPlugin("${output}")))
  .addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("${field}", "${val1}")))
  .addStage(new ETLStage("filter2", StringValueFilterCompute.getPlugin("${field}", "${val2}")))
  .addStage(new ETLStage("sleep", SleepTransform.getPlugin(2L)))
  .addConnection("source", "sleep")
  .addConnection("sleep", "filter1")
  .addConnection("filter1", "filter2")
  .addConnection("filter2", "sink")
  .setBatchInterval("1s")
  .build();
ApplicationId appId = NamespaceId.DEFAULT.app("simpleApp");
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationManager appManager = deployApplication(appId, appRequest);
final Set<StructuredRecord> expected = new HashSet<>();
expected.add(samuelRecord);
expected.add(jacksonRecord);
testTransformComputeRun(appManager, expected, "dwayne", "johnson", "macroOutput1");
validateMetric(appId, "source.records.out", 4);
validateMetric(appId, "sleep.records.in", 4);
validateMetric(appId, "sleep.records.out", 4);
validateMetric(appId, "filter1.records.in", 4);
validateMetric(appId, "filter1.records.out", 3);
validateMetric(appId, "filter2.records.in", 3);
validateMetric(appId, "filter2.records.out", 2);
validateMetric(appId, "sink.records.in", 2);
Assert.assertTrue(getMetric(appId, "sleep." + co.cask.cdap.etl.common.Constants.Metrics.TOTAL_TIME) > 0L);
expected.clear();
expected.add(dwayneRecord);
expected.add(johnsonRecord);
testTransformComputeRun(appManager, expected, "samuel", "jackson", "macroOutput2");
}
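testTransformComputeRun is not shown in this snippet. Based on how it is called above, it presumably supplies the macro values as runtime arguments and then verifies the named output dataset. The sketch below is an assumption about its shape, reusing only calls that appear elsewhere on this page; the "name" value for ${field} and the argument map keys (mirroring the ${field}, ${val1}, ${val2} and ${output} macros) are inferences, not the repository's actual helper.
// Hypothetical sketch of testTransformComputeRun (the real helper lives in
// DataStreamsTest and may differ): start the pipeline with macro values as
// runtime arguments, wait for the expected records in the output dataset, stop.
private void testTransformComputeRun(ApplicationManager appManager, final Set<StructuredRecord> expected,
                                     String val1, String val2, final String outputName) throws Exception {
  SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
  sparkManager.start(ImmutableMap.of("field", "name", "val1", val1, "val2", val2, "output", outputName));
  sparkManager.waitForStatus(true, 10, 1);
  final DataSetManager<Table> outputManager = getDataset(outputName);
  Tasks.waitFor(true, new Callable<Boolean>() {
    @Override
    public Boolean call() throws Exception {
      outputManager.flush();
      return expected.equals(new HashSet<>(MockSink.readOutput(outputManager)));
    }
  }, 4, TimeUnit.MINUTES);
  sparkManager.stop();
  sparkManager.waitForStatus(false, 10, 1);
}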
Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From the class PipelineTest, method testWordCount:
public void testWordCount(String pluginType) throws Exception {
String inputName = "wcInput-" + pluginType;
String outputName = "wcOutput-" + pluginType;
// create the pipeline config
ETLStage source = new ETLStage("wcInput", MockSource.getPlugin(inputName));
ETLStage sink = new ETLStage("wcOutput", MockSink.getPlugin(outputName));
Map<String, String> aggProperties = new HashMap<>();
aggProperties.put("field", "text");
ETLStage agg = new ETLStage("middle", new ETLPlugin("WordCount", pluginType, aggProperties, null));
ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
  .addStage(source)
  .addStage(sink)
  .addStage(agg)
  .addConnection(source.getName(), agg.getName())
  .addConnection(agg.getName(), sink.getName())
  .build();
// create the pipeline
ApplicationId pipelineId = NamespaceId.DEFAULT.app("wcTestPipeline-" + pluginType);
ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
// write the input
Schema inputSchema = Schema.recordOf("text", Schema.Field.of("text", Schema.of(Schema.Type.STRING)));
DataSetManager<Table> inputManager = getDataset(inputName);
List<StructuredRecord> inputRecords = new ArrayList<>();
inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello World").build());
inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Hal").build());
inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Sam").build());
MockSource.writeInput(inputManager, inputRecords);
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start();
workflowManager.waitForFinish(4, TimeUnit.MINUTES);
DataSetManager<Table> outputManager = getDataset(outputName);
Set<StructuredRecord> outputRecords = new HashSet<>();
outputRecords.addAll(MockSink.readOutput(outputManager));
Set<StructuredRecord> expected = new HashSet<>();
expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "Hello").set("count", 3L).build());
expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "World").set("count", 1L).build());
expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "my").set("count", 2L).build());
expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "name").set("count", 2L).build());
expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "is").set("count", 2L).build());
expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "Hal").set("count", 1L).build());
expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "Sam").set("count", 1L).build());
Assert.assertEquals(expected, outputRecords);
}
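testWordCount takes the plugin type as a parameter, so the same pipeline can exercise the WordCount plugin however it is packaged. The @Test entry points are not shown here; the sketch below assumes the plugin is registered as both a batch aggregator and a Spark compute plugin and uses the corresponding PLUGIN_TYPE constants from the CDAP ETL API.
// Hypothetical test entry points (the real ones live in PipelineTest and may differ):
@Test
public void testWordCountBatchAggregator() throws Exception {
  testWordCount(BatchAggregator.PLUGIN_TYPE);
}

@Test
public void testWordCountSparkCompute() throws Exception {
  testWordCount(SparkCompute.PLUGIN_TYPE);
}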
Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From the class ETLBatchConfigTest, method testUpgrade:
@Test
public void testUpgrade() throws Exception {
final ArtifactSelectorConfig artifact = new ArtifactSelectorConfig("SYSTEM", "universal", "1.0.0");
ETLStage source = new ETLStage("source", new Plugin("DataGenerator", ImmutableMap.of("p1", "v1"), artifact), null);
co.cask.cdap.etl.proto.v2.ETLStage sourceNew = from(source, BatchSource.PLUGIN_TYPE);
ETLStage transform1 = new ETLStage("transform1", new Plugin("Script", ImmutableMap.of("script", "something"), null));
co.cask.cdap.etl.proto.v2.ETLStage transform1New = from(transform1, Transform.PLUGIN_TYPE);
ETLStage transform2 = new ETLStage("transform2", new Plugin("Script", null, null));
co.cask.cdap.etl.proto.v2.ETLStage transform2New = from(transform2, Transform.PLUGIN_TYPE);
ETLStage transform3 = new ETLStage("transform3", new Plugin("Validator", ImmutableMap.of("p1", "v1", "p2", "v2")), null);
co.cask.cdap.etl.proto.v2.ETLStage transform3New = from(transform3, Transform.PLUGIN_TYPE);
ETLStage sink1 = new ETLStage("sink1", new Plugin("Table", ImmutableMap.of("rowkey", "xyz"), artifact), null);
co.cask.cdap.etl.proto.v2.ETLStage sink1New = from(sink1, BatchSink.PLUGIN_TYPE);
ETLStage sink2 = new ETLStage("sink2", new Plugin("HDFS", ImmutableMap.of("name", "abc"), artifact), null);
co.cask.cdap.etl.proto.v2.ETLStage sink2New = from(sink2, BatchSink.PLUGIN_TYPE);
Set<Connection> connections = new HashSet<>();
connections.add(new Connection(sourceNew.getName(), transform1New.getName()));
connections.add(new Connection(transform1New.getName(), transform2New.getName()));
connections.add(new Connection(transform2New.getName(), transform3New.getName()));
connections.add(new Connection(transform3New.getName(), sink1New.getName()));
connections.add(new Connection(transform3New.getName(), sink2New.getName()));
String schedule = "*/5 * * * *";
Resources resources = new Resources(1024, 1);
ETLBatchConfig config = ETLBatchConfig.builder(schedule)
  .setSource(source)
  .addSink(sink1)
  .addSink(sink2)
  .addTransform(transform1)
  .addTransform(transform2)
  .addTransform(transform3)
  .addConnections(connections)
  .setResources(resources)
  .setDriverResources(resources)
  .build();
co.cask.cdap.etl.proto.v2.ETLBatchConfig configNew = co.cask.cdap.etl.proto.v2.ETLBatchConfig.builder(schedule)
  .addStage(sourceNew)
  .addStage(sink1New)
  .addStage(sink2New)
  .addStage(transform1New)
  .addStage(transform2New)
  .addStage(transform3New)
  .addConnections(connections)
  .setResources(resources)
  .setDriverResources(resources)
  .build();
Assert.assertEquals(configNew, config.upgrade(new UpgradeContext() {
@Nullable
@Override
public ArtifactSelectorConfig getPluginArtifact(String pluginType, String pluginName) {
return null;
}
}));
}
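The from(...) helper that builds each v2 stage is not shown. Conceptually it pairs the old stage's name and plugin with the plugin type that the v1 format left implicit. The sketch below is an assumption about that helper: the accessor names on the v1 ETLStage and Plugin classes are assumed, while the v2 ETLStage and ETLPlugin constructors are the ones seen elsewhere on this page.
// Hypothetical sketch of the from() helper: rebuild a v1 stage as a v2 stage,
// attaching the supplied plugin type. Accessor names here are assumptions.
private co.cask.cdap.etl.proto.v2.ETLStage from(ETLStage v1Stage, String pluginType) {
  Plugin v1Plugin = v1Stage.getPlugin();
  return new co.cask.cdap.etl.proto.v2.ETLStage(
    v1Stage.getName(),
    new ETLPlugin(v1Plugin.getName(), pluginType, v1Plugin.getProperties(), v1Plugin.getArtifact()));
}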
Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From the class Upgrader, method convertStreamsConfig:
private DataStreamsConfig convertStreamsConfig(String configStr) {
DataStreamsConfig config = GSON.fromJson(configStr, DataStreamsConfig.class);
DataStreamsConfig.Builder builder = DataStreamsConfig.builder()
  .addConnections(config.getConnections())
  .setResources(config.getResources())
  .setDriverResources(config.getDriverResources())
  .setClientResources(config.getClientResources())
  .setBatchInterval(config.getBatchInterval())
  .setCheckpointDir(config.getCheckpointDir())
  .setNumOfRecordsPreview(config.getNumOfRecordsPreview());
for (ETLStage stage : config.getStages()) {
builder.addStage(stage.upgradeStage(dataStreamsContext));
}
return builder.build();
}
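After each stage has been upgraded via stage.upgradeStage(dataStreamsContext), the rebuilt DataStreamsConfig can be serialized back to JSON with the same GSON instance so the application can be updated with the new config. A minimal, hypothetical wrapper (not code from the Upgrader itself):
// Hypothetical caller of convertStreamsConfig: convert an old config string and
// serialize the upgraded config back to JSON (GSON is the Upgrader's Gson instance).
private String upgradeStreamsConfig(String oldConfigJson) {
  DataStreamsConfig upgraded = convertStreamsConfig(oldConfigJson);
  return GSON.toJson(upgraded);
}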