Use of co.cask.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.
The class Upgrader, method convertStreamsConfig.
private DataStreamsConfig convertStreamsConfig(String configStr) {
  DataStreamsConfig config = GSON.fromJson(configStr, DataStreamsConfig.class);
  // Carry every non-stage setting over from the old config.
  DataStreamsConfig.Builder builder = DataStreamsConfig.builder()
    .addConnections(config.getConnections())
    .setResources(config.getResources())
    .setDriverResources(config.getDriverResources())
    .setClientResources(config.getClientResources())
    .setBatchInterval(config.getBatchInterval())
    .setCheckpointDir(config.getCheckpointDir())
    .setNumOfRecordsPreview(config.getNumOfRecordsPreview());
  // Upgrade each stage before adding it to the new config.
  for (ETLStage stage : config.getStages()) {
    builder.addStage(stage.upgradeStage(dataStreamsContext));
  }
  return builder.build();
}
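For context, the configStr passed in is simply the JSON form of a DataStreamsConfig. A minimal sketch of that round trip, assuming a plain Gson instance (the Upgrader's GSON may register additional type adapters), the MockSource/MockSink test plugins used in the snippets below, and a hypothetical sink table name "upgradeOut":

Gson gson = new Gson();
// Build a small config the same way the tests below do (schema and records as defined there).
DataStreamsConfig original = DataStreamsConfig.builder()
  .addStage(new ETLStage("source", MockSource.getPlugin(schema, records)))
  .addStage(new ETLStage("sink", MockSink.getPlugin("upgradeOut")))
  .addConnection("source", "sink")
  .setBatchInterval("1s")
  .build();
// Serialize, then parse back; the parse mirrors the first line of convertStreamsConfig.
String configStr = gson.toJson(original);
DataStreamsConfig parsed = gson.fromJson(configStr, DataStreamsConfig.class);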
Use of co.cask.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.
The class PreviewDataStreamsTest, method testDataStreamsPreviewRun.
@Test
public void testDataStreamsPreviewRun() throws Exception {
  PreviewManager previewManager = getPreviewManager();
  String sinkTableName = "singleOutput";
  Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
  List<StructuredRecord> records = new ArrayList<>();
  StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
  StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
  StructuredRecord recordTest = StructuredRecord.builder(schema).set("name", "test").build();
  records.add(recordSamuel);
  records.add(recordBob);
  records.add(recordTest);
  /*
   * source --> transform --> sink
   */
  DataStreamsConfig etlConfig = DataStreamsConfig.builder()
    .addStage(new ETLStage("source", MockSource.getPlugin(schema, records)))
    .addStage(new ETLStage("transform", IdentityTransform.getPlugin()))
    .addStage(new ETLStage("sink", MockSink.getPlugin(sinkTableName)))
    .addConnection("source", "transform")
    .addConnection("transform", "sink")
    .setNumOfRecordsPreview(100)
    .setBatchInterval("1s")
    .build();
  // Construct the preview config with the program name and program type.
  PreviewConfig previewConfig =
    new PreviewConfig(DataStreamsSparkLauncher.NAME, ProgramType.SPARK, Collections.<String, String>emptyMap(), 1);
  AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig, previewConfig);
  // Start the preview and get the corresponding PreviewRunner.
  ApplicationId previewId = previewManager.start(NamespaceId.DEFAULT, appRequest);
  final PreviewRunner previewRunner = previewManager.getRunner(previewId);
  // Wait for the preview to be running and for the records to be processed in the sink.
  Tasks.waitFor(true, new Callable<Boolean>() {
    @Override
    public Boolean call() throws Exception {
      Map<String, List<JsonElement>> data = previewRunner.getData("sink");
      return data != null && data.get(DATA_TRACER_PROPERTY) != null
        && data.get(DATA_TRACER_PROPERTY).size() == 3;
    }
  }, 1, TimeUnit.MINUTES);
  // Check the data captured for the source and transform stages.
  checkPreviewStore(previewRunner, "source", 3);
  checkPreviewStore(previewRunner, "transform", 3);
  // Wait for the pipeline to be shut down by the preview timer.
  TimeUnit.MINUTES.sleep(1);
  Tasks.waitFor(PreviewStatus.Status.KILLED_BY_TIMER, new Callable<PreviewStatus.Status>() {
    @Override
    public PreviewStatus.Status call() throws Exception {
      return previewRunner.getStatus().getStatus();
    }
  }, 1, TimeUnit.MINUTES);
  // Validate the metrics emitted by the preview run.
  validateMetric(3, previewId, "source.records.out", previewRunner);
  validateMetric(3, previewId, "transform.records.in", previewRunner);
  validateMetric(3, previewId, "transform.records.out", previewRunner);
  validateMetric(3, previewId, "sink.records.in", previewRunner);
  validateMetric(3, previewId, "sink.records.out", previewRunner);
  // Check that the sink table is not created in the real (non-preview) space.
  DataSetManager<Table> sinkManager = getDataset(sinkTableName);
  Assert.assertNull(sinkManager.get());
}
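Since Tasks.waitFor accepts any Callable, the anonymous classes in these tests can be written as lambdas on Java 8 or later. A sketch of the first wait above, under that assumption:

// Same polling check as above, expressed as a lambda (Java 8+).
Tasks.waitFor(true, () -> {
  Map<String, List<JsonElement>> data = previewRunner.getData("sink");
  return data != null && data.get(DATA_TRACER_PROPERTY) != null
    && data.get(DATA_TRACER_PROPERTY).size() == 3;
}, 1, TimeUnit.MINUTES);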
Use of co.cask.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.
The class DataStreamsTest, method testWindower.
@Test
public void testWindower() throws Exception {
  /*
   * source --> window(width=30,interval=1) --> aggregator --> filter --> sink
   */
  Schema schema = Schema.recordOf("data", Schema.Field.of("x", Schema.of(Schema.Type.STRING)));
  List<StructuredRecord> input = ImmutableList.of(
    StructuredRecord.builder(schema).set("x", "abc").build(),
    StructuredRecord.builder(schema).set("x", "abc").build(),
    StructuredRecord.builder(schema).set("x", "abc").build());
  String sinkName = "windowOut";
  // The source sleeps 1 second between outputs.
  DataStreamsConfig etlConfig = DataStreamsConfig.builder()
    .addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 1000L)))
    .addStage(new ETLStage("window", Window.getPlugin(30, 1)))
    .addStage(new ETLStage("agg", FieldCountAggregator.getPlugin("x", "string")))
    .addStage(new ETLStage("filter", StringValueFilterTransform.getPlugin("x", "all")))
    .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
    .addConnection("source", "window")
    .addConnection("window", "agg")
    .addConnection("agg", "filter")
    .addConnection("filter", "sink")
    .setBatchInterval("1s")
    .build();
  AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app("WindowerApp");
  ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
  SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
  sparkManager.start();
  sparkManager.waitForStatus(true, 10, 1);
  // The sink should contain at least one record with a count of 3, and no records with a count above 3.
  // A count below 3 means the window doesn't contain all 3 records yet, but eventually there should be
  // a window that contains all 3.
  final DataSetManager<Table> outputManager = getDataset(sinkName);
  Tasks.waitFor(true, new Callable<Boolean>() {
    @Override
    public Boolean call() throws Exception {
      outputManager.flush();
      boolean sawThree = false;
      for (StructuredRecord record : MockSink.readOutput(outputManager)) {
        long count = record.get("ct");
        if (count == 3L) {
          sawThree = true;
        }
        Assert.assertTrue(count <= 3L);
      }
      return sawThree;
    }
  }, 2, TimeUnit.MINUTES);
  sparkManager.stop();
}
Use of co.cask.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.
The class DataStreamsTest, method testErrorTransform.
@Test
public void testErrorTransform() throws Exception {
  String sink1TableName = "errTestOut1";
  String sink2TableName = "errTestOut2";
  Schema inputSchema = Schema.recordOf("user", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
  List<StructuredRecord> input = ImmutableList.of(
    StructuredRecord.builder(inputSchema).set("name", "Leo").build(),
    StructuredRecord.builder(inputSchema).set("name", "Ralph").build(),
    StructuredRecord.builder(inputSchema).set("name", "Don").build(),
    StructuredRecord.builder(inputSchema).set("name", "Mike").build(),
    StructuredRecord.builder(inputSchema).set("name", "April").build());
  /*
   * source --> filter1 --> filter2 --> agg1 --> agg2
   *               |           |          |        |
   *               |-----------|----------|--------|--> flatten errors --> sink1
   *               |
   *               |--> filter errors --> sink2
   *
   * Arrows coming out of the right side represent output records;
   * arrows coming out of the bottom represent error records.
   * This tests multiple stages from multiple phases emitting errors to the same stage,
   * as well as errors from one stage going to multiple stages.
   */
  DataStreamsConfig config = DataStreamsConfig.builder()
    .setBatchInterval("5s")
    .addStage(new ETLStage("source", MockSource.getPlugin(inputSchema, input)))
    .addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("name", "Leo")))
    .addStage(new ETLStage("filter2", StringValueFilterTransform.getPlugin("name", "Ralph")))
    .addStage(new ETLStage("agg1", GroupFilterAggregator.getPlugin("name", "Don")))
    .addStage(new ETLStage("agg2", GroupFilterAggregator.getPlugin("name", "Mike")))
    .addStage(new ETLStage("errorflatten", FlattenErrorTransform.getPlugin()))
    .addStage(new ETLStage("errorfilter", FilterErrorTransform.getPlugin(3)))
    .addStage(new ETLStage("sink1", MockSink.getPlugin(sink1TableName)))
    .addStage(new ETLStage("sink2", MockSink.getPlugin(sink2TableName)))
    .addConnection("source", "filter1")
    .addConnection("filter1", "filter2")
    .addConnection("filter2", "agg1")
    .addConnection("agg1", "agg2")
    .addConnection("filter1", "errorflatten")
    .addConnection("filter1", "errorfilter")
    .addConnection("filter2", "errorflatten")
    .addConnection("filter2", "errorfilter")
    .addConnection("agg1", "errorflatten")
    .addConnection("agg1", "errorfilter")
    .addConnection("agg2", "errorflatten")
    .addConnection("agg2", "errorfilter")
    .addConnection("errorflatten", "sink1")
    .addConnection("errorfilter", "sink2")
    .build();
  AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
  ApplicationId appId = NamespaceId.DEFAULT.app("ErrTransformTest");
  ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
  SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
  sparkManager.start();
  sparkManager.waitForStatus(true, 10, 1);
  Schema flattenSchema = Schema.recordOf("erroruser",
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("errMsg", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("errCode", Schema.nullableOf(Schema.of(Schema.Type.INT))),
    Schema.Field.of("errStage", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
  final Set<StructuredRecord> expected = ImmutableSet.of(
    StructuredRecord.builder(flattenSchema)
      .set("name", "Leo").set("errMsg", "bad string value").set("errCode", 1).set("errStage", "filter1").build(),
    StructuredRecord.builder(flattenSchema)
      .set("name", "Ralph").set("errMsg", "bad string value").set("errCode", 1).set("errStage", "filter2").build(),
    StructuredRecord.builder(flattenSchema)
      .set("name", "Don").set("errMsg", "bad val").set("errCode", 3).set("errStage", "agg1").build(),
    StructuredRecord.builder(flattenSchema)
      .set("name", "Mike").set("errMsg", "bad val").set("errCode", 3).set("errStage", "agg2").build());
  final DataSetManager<Table> sink1Table = getDataset(sink1TableName);
  Tasks.waitFor(true, new Callable<Boolean>() {
    @Override
    public Boolean call() throws Exception {
      sink1Table.flush();
      Set<StructuredRecord> outputRecords = new HashSet<>();
      outputRecords.addAll(MockSink.readOutput(sink1Table));
      return expected.equals(outputRecords);
    }
  }, 4, TimeUnit.MINUTES);
  final Set<StructuredRecord> expected2 = ImmutableSet.of(
    StructuredRecord.builder(inputSchema).set("name", "Leo").build(),
    StructuredRecord.builder(inputSchema).set("name", "Ralph").build());
  final DataSetManager<Table> sink2Table = getDataset(sink2TableName);
  Tasks.waitFor(true, new Callable<Boolean>() {
    @Override
    public Boolean call() throws Exception {
      sink2Table.flush();
      Set<StructuredRecord> outputRecords = new HashSet<>();
      outputRecords.addAll(MockSink.readOutput(sink2Table));
      return expected2.equals(outputRecords);
    }
  }, 4, TimeUnit.MINUTES);
}
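The errMsg, errCode, and errStage values asserted above come from error records emitted by the upstream stages. As a rough sketch of how such an error record is produced through the CDAP ETL Emitter/InvalidEntry API (not the actual StringValueFilterTransform source; the field name, code, and message simply mirror this test's expectations):

@Override
public void transform(StructuredRecord input, Emitter<StructuredRecord> emitter) throws Exception {
  String name = input.get("name");
  if ("Leo".equals(name)) {
    // Error records flow to whichever error-handling stages are connected downstream
    // (errorflatten and errorfilter in the pipeline above).
    emitter.emitError(new InvalidEntry<>(1, "bad string value", input));
  } else {
    emitter.emit(input);
  }
}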
Use of co.cask.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.
The class DataStreamsTest, method testParallelAggregators.
@Test
public void testParallelAggregators() throws Exception {
  String sink1Name = "pAggOutput1";
  String sink2Name = "pAggOutput2";
  Schema inputSchema = Schema.recordOf("testRecord",
    Schema.Field.of("user", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("item", Schema.of(Schema.Type.LONG)));
  List<StructuredRecord> input1 = ImmutableList.of(
    StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 1L).build(),
    StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 2L).build());
  List<StructuredRecord> input2 = ImmutableList.of(
    StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 3L).build(),
    StructuredRecord.builder(inputSchema).set("user", "john").set("item", 4L).build(),
    StructuredRecord.builder(inputSchema).set("user", "john").set("item", 3L).build());
  /*
     source1 --|--> agg1 --> sink1
               |
     source2 --|--> agg2 --> sink2
   */
  DataStreamsConfig pipelineConfig = DataStreamsConfig.builder()
    .setBatchInterval("5s")
    .addStage(new ETLStage("source1", MockSource.getPlugin(inputSchema, input1)))
    .addStage(new ETLStage("source2", MockSource.getPlugin(inputSchema, input2)))
    .addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Name)))
    .addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Name)))
    .addStage(new ETLStage("agg1", FieldCountAggregator.getPlugin("user", "string")))
    .addStage(new ETLStage("agg2", FieldCountAggregator.getPlugin("item", "long")))
    .addConnection("source1", "agg1")
    .addConnection("source1", "agg2")
    .addConnection("source2", "agg1")
    .addConnection("source2", "agg2")
    .addConnection("agg1", "sink1")
    .addConnection("agg2", "sink2")
    .disableCheckpoints()
    .build();
  AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app("ParallelAggApp");
  ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
  SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
  sparkManager.start();
  sparkManager.waitForStatus(true, 10, 1);
  Schema outputSchema1 = Schema.recordOf("user.count",
    Schema.Field.of("user", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
  Schema outputSchema2 = Schema.recordOf("item.count",
    Schema.Field.of("item", Schema.of(Schema.Type.LONG)),
    Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
  // check output
  final DataSetManager<Table> sinkManager1 = getDataset(sink1Name);
  final Set<StructuredRecord> expected1 = ImmutableSet.of(
    StructuredRecord.builder(outputSchema1).set("user", "all").set("ct", 5L).build(),
    StructuredRecord.builder(outputSchema1).set("user", "samuel").set("ct", 3L).build(),
    StructuredRecord.builder(outputSchema1).set("user", "john").set("ct", 2L).build());
  Tasks.waitFor(true, new Callable<Boolean>() {
    @Override
    public Boolean call() throws Exception {
      sinkManager1.flush();
      Set<StructuredRecord> outputRecords = new HashSet<>();
      outputRecords.addAll(MockSink.readOutput(sinkManager1));
      return expected1.equals(outputRecords);
    }
  }, 1, TimeUnit.MINUTES);
  final DataSetManager<Table> sinkManager2 = getDataset(sink2Name);
  final Set<StructuredRecord> expected2 = ImmutableSet.of(
    StructuredRecord.builder(outputSchema2).set("item", 0L).set("ct", 5L).build(),
    StructuredRecord.builder(outputSchema2).set("item", 1L).set("ct", 1L).build(),
    StructuredRecord.builder(outputSchema2).set("item", 2L).set("ct", 1L).build(),
    StructuredRecord.builder(outputSchema2).set("item", 3L).set("ct", 2L).build(),
    StructuredRecord.builder(outputSchema2).set("item", 4L).set("ct", 1L).build());
  Tasks.waitFor(true, new Callable<Boolean>() {
    @Override
    public Boolean call() throws Exception {
      sinkManager2.flush();
      Set<StructuredRecord> outputRecords = new HashSet<>();
      outputRecords.addAll(MockSink.readOutput(sinkManager2));
      return expected2.equals(outputRecords);
    }
  }, 1, TimeUnit.MINUTES);
  sparkManager.stop();
  sparkManager.waitForStatus(false, 10, 1);
  validateMetric(appId, "source1.records.out", 2);
  validateMetric(appId, "source2.records.out", 3);
  validateMetric(appId, "agg1.records.in", 5);
  validateMetric(appId, "agg1.records.out", 3);
  validateMetric(appId, "agg2.records.in", 5);
  validateMetric(appId, "agg2.records.out", 5);
  validateMetric(appId, "sink1.records.in", 3);
  validateMetric(appId, "sink2.records.in", 5);
}
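This pipeline turns Spark Streaming checkpointing off via disableCheckpoints(), while the Upgrader snippet at the top carries a checkpoint directory through setCheckpointDir(...). A minimal, hypothetical sketch of a checkpointed variant of a similar pipeline (the directory path is a placeholder, and the stages reuse the mock plugins and inputs from this test):

DataStreamsConfig checkpointedConfig = DataStreamsConfig.builder()
  .setBatchInterval("5s")
  // Hypothetical checkpoint location; any durable directory the Spark program can write to.
  .setCheckpointDir("/tmp/data-streams-checkpoints")
  .addStage(new ETLStage("source1", MockSource.getPlugin(inputSchema, input1)))
  .addStage(new ETLStage("agg1", FieldCountAggregator.getPlugin("user", "string")))
  .addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Name)))
  .addConnection("source1", "agg1")
  .addConnection("agg1", "sink1")
  .build();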